Exemplo n.º 1
0
    def image_downloaded(self, response, request, info, item=None):
        """Validate, convert and persist one downloaded image.

        The response body is opened as an image, rejected if either
        dimension exceeds ``self.max_size_px``, passed through
        ``self.convert_image`` and handed to ``self.store.persist_file``
        together with the converted image's width and height.

        Args:
            response: Response whose ``body`` holds the raw image bytes.
            request: Request that produced ``response``; used for the
                storage path and in error messages.
            info: Passed through to ``file_path`` and ``persist_file``.
            item: Accepted for interface compatibility; not used here.

        Returns:
            The value of ``md5sum`` computed over the converted buffer.

        Raises:
            ImageException: If the body cannot be identified as an image,
                or if the image is wider or taller than ``self.max_size_px``.
        """
        path = self.file_path(request, response=response, info=info)

        try:
            orig_image = Image.open(BytesIO(response.body))
        except UnidentifiedImageError as exc:
            # Keep the original decoder error attached as the cause.
            raise ImageException(
                f'Image cannot be identified ({request.url})'
            ) from exc

        width, height = orig_image.size
        if width > self.max_size_px or height > self.max_size_px:
            # The comparison sign in the message must match the check above:
            # the image is rejected because it is *larger* than the limit.
            raise ImageException(
                f'Image too large ({width}x{height} > {self.max_size_px}x{self.max_size_px})'
            )

        image, buffer = self.convert_image(orig_image)
        # Rewind so the checksum covers the buffer from its start.
        buffer.seek(0)
        checksum = md5sum(buffer)

        # Report the dimensions of the converted image, not the original.
        width, height = image.size
        self.store.persist_file(
            path,
            buffer,
            info,
            meta={'width': width, 'height': height},
            headers={'Content-Type': 'image/png'},
        )
        return checksum
Exemplo n.º 2
0
    def get_image(self, response, request, info):
        """Return the storage path and converted form of a single image.

        The response body is opened as an image and checked against
        ``self.min_width`` / ``self.min_height`` before conversion.

        Returns:
            A ``(path, image, buf)`` tuple where ``image`` and ``buf`` are
            the two values returned by ``self.convert_image``.

        Raises:
            ImageException: If the image is narrower than ``self.min_width``
                or shorter than ``self.min_height``.
        """
        target_path = self.file_path(request, response=response, info=info)
        source = Image.open(BytesIO(response.body))

        w, h = source.size
        too_narrow = w < self.min_width
        if too_narrow or h < self.min_height:
            details = (w, h, self.min_width, self.min_height)
            raise ImageException("Image too small (%dx%d < %dx%d)" % details)

        converted, out_buf = self.convert_image(source)
        return target_path, converted, out_buf
Exemplo n.º 3
0
    def get_images(self, response, request, info):
        """Yield the untouched original image, then one entry per thumbnail.

        The first item yielded is ``(path, image, buffer)`` for the image
        exactly as opened from the response body (no conversion applied).
        Afterwards the image is converted once and each size configured in
        ``self.thumbs`` is produced from that converted image.

        Raises:
            ImageException: If the image is narrower than ``self.min_width``
                or shorter than ``self.min_height``.
        """
        main_path = self.file_path(request, response=response, info=info)
        raw_buf = BytesIO(response.body)
        source = Image.open(raw_buf)

        w, h = source.size
        too_narrow = w < self.min_width
        if too_narrow or h < self.min_height:
            details = (w, h, self.min_width, self.min_height)
            raise ImageException("Image too small (%dx%d < %dx%d)" % details)

        # The original bytes go out first, before any conversion happens.
        yield main_path, source, raw_buf

        # Only the converted image is needed as the thumbnail base; the
        # buffer returned alongside it is discarded.
        base_image, _ = self.convert_image(source)
        for thumb_id, size in six.iteritems(self.thumbs):
            destination = self.thumb_path(
                request, thumb_id, response=response, info=info)
            scaled, scaled_buf = self.convert_image(base_image, size)
            yield destination, scaled, scaled_buf
 def get_images(self, response, request, info):
     """Open the response body as an image, retrying once with io.BytesIO.

     The first attempt wraps the body in the module-level ``BytesIO``; if
     that fails for any reason the failure is printed and a second attempt
     is made with ``io.BytesIO``. If the second attempt also fails, the
     failure is printed and an ImageException is raised.

     NOTE(review): the visible part of this method ends after the image has
     been opened; ``path`` and ``orig_image`` are not used any further here.

     Raises:
         ImageException: If both attempts to open the body fail.
     """
     path = self.file_path(request, response=response, info=info)
     try:
         orig_image = Image.open(BytesIO(response.body))
     except Exception as first_err:
         # "except ... as" and print(...) with a single pre-formatted string
         # behave the same on Python 2.6+ and Python 3.
         print("url:%s  get_images failed:%s try io.BytersIO" % (
             request.url, first_err))
         try:
             orig_image = Image.open(io.BytesIO(response.body))
         except Exception as second_err:
             print("failed again. url:%s  get_images failed:%s try io.BytersIO" % (
                 request.url, second_err))
             # Message text (in Chinese): image stream processing failed,
             # the returned html-body is not an image stream.
             raise ImageException("图片流处理失败。返回的html-body不是图片流")
Exemplo n.º 5
0
    def get_images(self, response, request, info):
        """Yield the converted image followed by one entry per thumbnail.

        The image is accepted only when its width lies between
        ``self.min_width`` and ``ConstValue['MAX_WIDTH']`` and its height
        lies between ``self.min_height`` and ``ConstValue['MAX_HEIGHT']``
        (bounds inclusive). When it is rejected, the entry keyed by the MD5
        of the request URL is removed from ``self.imgNameDict`` if that
        entry holds a truthy value.

        Raises:
            ImageException: If the image size is outside the accepted range.
        """
        main_path = self.file_path(request, response=response, info=info)
        source = Image.open(BytesIO(response.body))
        w, h = source.size

        # Width is checked first; the height bounds are only looked at when
        # the width is acceptable.
        width_ok = self.min_width <= w <= ConstValue['MAX_WIDTH']
        if not (width_ok
                and self.min_height <= h <= ConstValue['MAX_HEIGHT']):
            url_key = GetMd5(request.url)
            if self.imgNameDict.get(url_key):
                del self.imgNameDict[url_key]
            raise ImageException("Image not is standard size")

        converted, converted_buf = self.convert_image(source)
        yield main_path, converted, converted_buf

        for thumb_id, size in six.iteritems(self.thumbs):
            destination = self.thumb_path(
                request, thumb_id, response=response, info=info)
            scaled, scaled_buf = self.convert_image(converted, size)
            yield destination, scaled, scaled_buf
Exemplo n.º 6
0
    def get_images(self, response, request, info, **kwargs):
        """Yield a screenshot-based image followed by one entry per thumbnail.

        Instead of reading the response body, the image bytes come from the
        PNG screenshot of the first ``img`` element located through the
        driver stored in ``response.meta['driver']``.

        Raises:
            ImageException: If the image is narrower than ``self.min_width``
                or shorter than ``self.min_height``.
        """
        browser = response.meta['driver']
        img_element = browser.find_element_by_tag_name('img')
        main_path = self.file_path(request, response=response, info=info)
        source = Image.open(BytesIO(img_element.screenshot_as_png))

        w, h = source.size
        too_narrow = w < self.min_width
        if too_narrow or h < self.min_height:
            details = (w, h, self.min_width, self.min_height)
            raise ImageException("Image too small (%dx%d < %dx%d)" % details)

        converted, converted_buf = self.convert_image(source)
        yield main_path, converted, converted_buf

        for thumb_id, size in self.thumbs.items():
            # The 'screenshot' meta entry is looked up per thumbnail, so it
            # is never touched when no thumbnails are configured.
            destination = self.thumb_path(
                request,
                thumb_id,
                response=response.meta['screenshot'],
                info=info,
            )
            scaled, scaled_buf = self.convert_image(converted, size)
            yield destination, scaled, scaled_buf
                orig_image = Image.open(io.BytesIO(response.body))
            except Exception, e:
                print "failed again. url:%s  get_images failed:%s try io.BytersIO" % (
                    request.url, e)
                '''
                print "failed again. url:%s  get_images failed:%s try io.BytersIO" % (request.url, e)
                fout = open('/data/spider_cluster_pic_10_dev_test/oreo/log/%s' % path, 'w')
                fout.write(response.body)
                fout.close()
                '''
                raise ImageException("图片流处理失败。返回的html-body不是图片流")

        width, height = orig_image.size
        if width < self.min_width or height < self.min_height:
            raise ImageException(
                "Image too small (%dx%d < %dx%d)" %
                (width, height, self.min_width, self.min_height))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        for thumb_id, size in six.iteritems(self.thumbs):
            thumb_path = self.thumb_path(request,
                                         thumb_id,
                                         response=response,
                                         info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
Exemplo n.º 8
0
 def convert_image(self, image, size=None):
     buf = StringIO()
     try:
         image.save(buf, image.format)
     except Exception, ex:
         raise ImageException("Cannot process image. Error: %s" % ex)