Example #1
File: files.py Project: CPoirot3/scrapy
        def _onsuccess(result):
            if not result:
                return  # returning None forces download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None forces download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.expires:
                return  # returning None forces download

            referer = referer_str(request)
            logger.debug(
                'File (uptodate): Downloaded %(medianame)s from %(request)s '
                'referred in <%(referer)s>',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer},
                extra={'spider': info.spider}
            )
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            return {'url': request.url, 'path': path, 'checksum': checksum}
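Every example in this list uses referer_str from scrapy.utils.request to turn a request's Referer header into a loggable string. As a rough, non-authoritative sketch of the behaviour these examples rely on (the real Scrapy helper may differ in detail):

# Hypothetical sketch of a referer_str-style helper: return the Referer
# header of a request as text for logging, or None when it is absent.
from scrapy.utils.python import to_unicode

def referer_str_sketch(request):
    referrer = request.headers.get('Referer')
    if referrer is None:
        return None
    # Headers are stored as bytes; decode defensively for log output.
    return to_unicode(referrer, errors='replace')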
Example #2
 def crawled(self, request, response, spider):
     flags = " %s" % str(response.flags) if response.flags else ""
     return {
         "level": logging.DEBUG,
         "msg": CRAWLEDMSG,
         "args": {"status": response.status, "request": request, "referer": referer_str(request), "flags": flags},
     }
Example #3
 def crawled(self, request, response, spider):
     flags = ' {0!s}'.format(str(response.flags)) if response.flags else ''
     return {
         'level': logging.DEBUG,
         'msg': CRAWLEDMSG,
         'args': {
             'status': response.status,
             'request': request,
             'referer': referer_str(request),
             'flags': flags,
         }
     }
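Examples #2 and #3 above are crawled() overrides of Scrapy's LogFormatter. A minimal sketch of how such an override is usually wired into a project (the class name and module path below are illustrative; LOG_FORMATTER is the standard setting):

import logging
from scrapy import logformatter

class QuietCrawledFormatter(logformatter.LogFormatter):
    def crawled(self, request, response, spider):
        # Reuse the stock formatter's message and args, but change the level.
        entry = super().crawled(request, response, spider)
        entry['level'] = logging.INFO
        return entry

# settings.py (illustrative module path):
# LOG_FORMATTER = 'myproject.logformatters.QuietCrawledFormatter'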
Example #4
File: files.py Project: CPoirot3/scrapy
    def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = referer_str(request)
            logger.warning(
                'File (unknown-error): Error downloading %(medianame)s from '
                '%(request)s referred in <%(referer)s>: %(exception)s',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer, 'exception': failure.value},
                extra={'spider': info.spider}
            )

        raise FileException
Example #5
File: files.py Project: CPoirot3/scrapy
    def media_downloaded(self, response, request, info):
        referer = referer_str(request)

        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #6
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request) }
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
Example #7
    def spider_error(self, failure, request, response, spider):
        """Logs an error message from a spider.

        .. versionadded:: 2.0
        """
        return {
            'level': logging.ERROR,
            'msg': SPIDERERRORMSG,
            'args': {
                'request': request,
                'referer': referer_str(request),
            }
        }
Example #8
 def media_downloaded(self, response, request, info):
     try:
         return super().media_downloaded(response, request, info)
     except FileException as fe:
         failure = {
             'url': response.url,
             'origin': referer_str(request),
             'reason': str(fe),
             'http-status': response.status
         }
         self.bad_site(failure['url'])
         self.error(failure)
         raise fe
Example #9
 def crawled(self, request, response, spider):
     request_flags = ' %s' % str(request.flags) if request.flags else ''
     response_flags = ' %s' % str(response.flags) if response.flags else ''
     return {
         'level': logging.DEBUG,
         'msg': CRAWLEDMSG,
         'args': {
             'status': response.status,
             'request': request,
             'request_flags': request_flags,
             'referer': referer_str(request),
             'response_flags': response_flags,
         }
     }
Example #10
 def crawled(self, request, response, spider):
     request_flags = ' %s' % str(request.flags) if request.flags else ''
     response_flags = ' %s' % str(response.flags) if response.flags else ''
     return {
         'level': logging.DEBUG,
         'msg': CRAWLEDMSG,
         'args': {
             'status': response.status,
             'request': request,
             'request_flags': request_flags,
             'referer': referer_str(request),
             'response_flags': response_flags,
         }
     }
Example #11
File: files.py Project: yaokaifei/scrapy
    def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = referer_str(request)
            logger.warning(
                'File (unknown-error): Error downloading %(medianame)s from '
                '%(request)s referred in <%(referer)s>: %(exception)s', {
                    'medianame': self.MEDIA_NAME,
                    'request': request,
                    'referer': referer,
                    'exception': failure.value
                },
                extra={'spider': info.spider})

        raise FileException
Example #12
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request},
                              extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
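The dupefilter log() methods in Examples #6 and #12 only log every duplicate when self.debug is true, which is driven by the DUPEFILTER_DEBUG setting. A minimal settings sketch to surface all filtered duplicates:

# settings.py -- show a log line for every filtered duplicate request,
# not just the first one.
DUPEFILTER_DEBUG = True
LOG_LEVEL = 'DEBUG'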
Example #13
 def crawled(self, request, response, spider):
     request_flags = ' %s' % str(request.flags) if request.flags else ''
     response_flags = ' %s' % str(response.flags) if response.flags else ''
     return {
         'level': logging.DEBUG,
         'msg': CRAWLEDMSG,
         'args': {
             'status': response.status,
             'request': request,
             'request_flags': request_flags,
             'referer': referer_str(request),
             'response_flags': response_flags,
             # backward compatibility with Scrapy logformatter versions below 1.4
             'flags': response_flags
         }
     }
Example #14
 def crawled(self, request, response, spider):
     request_flags = ' %s' % str(request.flags) if request.flags else ''
     response_flags = ' %s' % str(response.flags) if response.flags else ''
     return {
         'level': logging.DEBUG,
         'msg': CRAWLEDMSG,
         'args': {
             'status': response.status,
             'request': request,
             'request_flags': request_flags,
             'referer': referer_str(request),
             'response_flags': response_flags,
             # backward compatibility with Scrapy logformatter versions below 1.4
             'flags': response_flags
         }
     }
Example #15
 def crawled(self, request, response, spider):
     """Logs a message when the crawler finds a webpage."""
     request_flags = f' {str(request.flags)}' if request.flags else ''
     response_flags = f' {str(response.flags)}' if response.flags else ''
     return {
         'level': logging.DEBUG,
         'msg': CRAWLEDMSG,
         'args': {
             'status': response.status,
             'request': request,
             'request_flags': request_flags,
             'referer': referer_str(request),
             'response_flags': response_flags,
             # backward compatibility with Scrapy logformatter versions below 1.4
             'flags': response_flags
         }
     }
Example #16
 def crawled(self, request, response, spider):
     """Logs a message when the crawler finds a webpage."""
     request_flags = " %s" % str(request.flags) if request.flags else ""
     response_flags = " %s" % str(response.flags) if response.flags else ""
     return {
         "level": logging.DEBUG,
         "msg": CRAWLEDMSG,
         "args": {
             "status": response.status,
             "request": request,
             "request_flags": request_flags,
             "referer": referer_str(request),
             "response_flags": response_flags,
             # backward compatibility with Scrapy logformatter versions below 1.4
             "flags": response_flags,
         },
     }
Example #17
        def _onsuccess(result):
            if not result:
                return  # returning None forces download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None forces download

            # DO NOT CHECK avatar EXPIRATION
            # age_seconds = time.time() - last_modified
            # age_days = age_seconds / 60 / 60 / 24
            # if age_days > self.expires:
            #     return  # returning None forces download

            try:
                fn = urllib.parse.urlparse(request.url).path.split('/')[-1]
                r_fn = info.spider.redis.hget(REDIS_AVATAR_PUBLISHER_KEY,
                                              request.flags[0])
            except:
                return
            if r_fn is None or fn != r_fn.decode('utf-8'):
                logger.info(
                    'Remote avatar file changed. Updating avatar for %s. %s -> %s',
                    request.flags[0], fn,
                    r_fn if r_fn is None else r_fn.decode('utf-8'))
                request.flags.append(fn)
                return

            referer = referer_str(request)
            logger.debug(
                'File (uptodate): Downloaded %(medianame)s from %(request)s '
                'referred in <%(referer)s>', {
                    'medianame': self.MEDIA_NAME,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            return {
                'url': request.url,
                'path': path,
                'checksum': checksum,
                'updated': False
            }
Example #18
File: scraper.py Project: zjkanjie/scrapy
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     logger.error(
         "Spider error processing %(request)s (referer: %(referer)s)", {
             'request': request,
             'referer': referer_str(request)
         },
         exc_info=failure_to_exc_info(_failure),
         extra={'spider': spider})
     self.signals.send_catch_log(signal=signals.spider_error,
                                 failure=_failure,
                                 response=response,
                                 spider=spider)
     self.crawler.stats.inc_value("spider_exceptions/%s" %
                                  _failure.value.__class__.__name__,
                                  spider=spider)
Example #19
File: scraper.py Project: CPoirot3/scrapy
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     logger.error(
         "Spider error processing %(request)s (referer: %(referer)s)",
         {'request': request, 'referer': referer_str(request)},
         exc_info=failure_to_exc_info(_failure),
         extra={'spider': spider}
     )
     self.signals.send_catch_log(
         signal=signals.spider_error,
         failure=_failure, response=response,
         spider=spider
     )
     self.crawler.stats.inc_value(
         "spider_exceptions/%s" % _failure.value.__class__.__name__,
         spider=spider
     )
Example #20
    def file_downloaded(self, response, request, info):
        expected_csum = request.meta.get('sha256')
        if expected_csum:
            response_csum = sha256sum(BytesIO(response.body))

            logger.debug('Request %s SHA256: expected %s, actual %s',
                         request, expected_csum, response_csum)

            if expected_csum != response_csum:
                logger.warning(
                    'File (checksum-mismatch): Error downloading %s '
                    'from %s referred in <%s>: expected SHA256 digest %s, '
                    'got %s',
                    self.MEDIA_NAME, request, referer_str(request),
                    expected_csum, response_csum,
                    extra={'spider': info.spider})

                raise FileException('checksum-mismatch')

        return super(OpenWrtDownloaderPipeline, self).file_downloaded(
            response, request, info)
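Example #20 reads the expected digest from request.meta['sha256']. One way to supply it, sketched here under the assumption of a FilesPipeline subclass and an illustrative file_sha256 item field, is to attach the digest when the media requests are built:

from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

class ChecksumFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Pair each file URL with its expected SHA-256 digest so that a
        # file_downloaded() override like the one above can verify it.
        # 'file_sha256' is a hypothetical item field, not a Scrapy default.
        for url, digest in zip(item.get('file_urls', []),
                               item.get('file_sha256', [])):
            yield Request(url, meta={'sha256': digest})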
Example #21
    def spider_error(self, failure, request, response, spider):
        spider_name = spider.name
        ticker = response.meta["ticker"]
        report_type = response.meta["ReportType"]
        try:
            page = response.meta["Page"]
        except KeyError:
            page = "1"

        msg_dict = {
            'ticker': ticker,
            'type': "Spider Error",
            'message': "There was an error in the spider...",
        }
        msg = json.dumps(msg_dict)
        return {
            'level': logging.ERROR,
            'msg': msg,
            'args': {
                'request': request,
                'referer': referer_str(request)
            }
        }
Example #22
File: files.py Project: yaokaifei/scrapy
    def media_downloaded(self, response, request, info):
        referer = referer_str(request)

        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>', {
                    'status': response.status,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content', {
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>', {
                'status': status,
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s', {
                    'request': request,
                    'referer': referer,
                    'errormsg': str(exc)
                },
                extra={'spider': info.spider},
                exc_info=True)
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>', {
                    'request': request,
                    'referer': referer
                },
                exc_info=True,
                extra={'spider': info.spider})
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #23
    def media_downloaded(self, response, request, info):
        referer = referer_str(request)

        if response.status not in [200, 201]:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>', {
                    'status': response.status,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            raise FileException('download-error')

        if not response.body:
            if response.status == 201 and 'location' in response.headers:
                logger.debug(
                    'File (code: %(status)s): Status 201 received. Downloading '
                    'resource marked by location parameter in the response headers for '
                    '%(request)s referred in <%(referer)s>', {
                        'status': response.status,
                        'request': request,
                        'referer': referer
                    },
                    extra={'spider': info.spider})

                # Follow the resource named by the Location header of the 201 response
                loc = response.headers['location'].decode()
                redirect_dlist = [self._process_request(Request(loc), info)]
                redirect_dfd = DeferredList(redirect_dlist, consumeErrors=1)
                return redirect_dfd
            else:
                logger.warning(
                    'File (empty-content): Empty file from %(request)s referred '
                    'in <%(referer)s>: no-content', {
                        'request': request,
                        'referer': referer
                    },
                    extra={'spider': info.spider})
                raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>', {
                'status': status,
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s', {
                    'request': request,
                    'referer': referer,
                    'errormsg': str(exc)
                },
                extra={'spider': info.spider},
                exc_info=True)
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>', {
                    'request': request,
                    'referer': referer
                },
                exc_info=True,
                extra={'spider': info.spider})
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #24
    def parse(self, response):

        Text = ''

        #techcrunch
        if 'https://techcrunch.com/2' in response.url:
            Title = response.css('h1.article__title::text').get()

            for node in response.xpath('//div[@class="article-content"]//p'):
                Text += ''.join(node.xpath('string()').extract())

            # Skip items with a missing title, text, or referer
            if Title and Text.strip() and referer_str(response.request):
                yield {
                    'SourceLink': referer_str(response.request),
                    'Link': response.url,
                    'Title': str(Title).encode('utf-8'),
                    'Text':
                    str(Title).encode('utf-8') + str(Text).encode('utf-8'),
                }
        #startupsavant
        elif 'https://startupsavant.com/news' in response.url:
            Title = response.css('h1.headline::text').get()

            for node in response.xpath('//div[@class="row"]//p'):
                Text += ''.join(node.xpath('string()').extract())

            # Skip items with a missing title, text, or referer
            if Title and Text.strip() and referer_str(response.request):
                yield {
                    'SourceLink': referer_str(response.request),
                    'Link': response.url,
                    'Title': str(Title).encode('utf-8'),
                    'Text':
                    str(Title).encode('utf-8') + str(Text).encode('utf-8'),
                }

        #techstartups
        elif 'https://techstartups.com/2' in response.url:
            Title = response.css('div.post_header_title h1::text').get()

            for node in response.xpath(
                    '//div[@class="post_content_wrapper"]//p'):
                Text += ''.join(node.xpath('string()').extract())

            # Skip items with a missing title, text, or referer
            if Title and Text.strip() and referer_str(response.request):
                yield {
                    'SourceLink': referer_str(response.request),
                    'Link': response.url,
                    'Title': str(Title).encode('utf-8'),
                    'Text':
                    str(Title).encode('utf-8') + str(Text).encode('utf-8'),
                }
Example #25
    def media_downloaded(self, response, request, info):

        referer = referer_str(request)

        # Synchronous request inside pipeline. What idiot wrote this? ;)
        if response.status == 301 or response.status == 302:
            logger.info('Following redirect in %s', request)
            redirect_location = response.headers['Location'].decode()
            r = requests.get(redirect_location)
            response = Response(redirect_location, status=r.status_code, body=r.content, request=request)
            logger.info('Followed redirect. Result: %s', str(response))

        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #26
    def media_downloaded(self, response, request, info):
        referer = referer_str(request)
        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')
        status = 'cached' if 'cached' in response.flags else 'downloaded'

        if 'content-disposition' in response.headers:
            # The Pineapple crawler returns the content-disposition header,
            # but it may be of type bytes, so it needs to be decoded as ASCII.
            # decode() can't be called on a str, so check the type first.
            if isinstance(response.headers['content-disposition'], bytes):
                d = response.headers['content-disposition'].decode('ascii')
            else:
                d = response.headers['content-disposition']
            fname = re.findall("filename=(.+)", d)
            self.filename = fname[0]
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )

        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #27
    def media_downloaded(self, response, request, info):
        referer = referer_str(request)

        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>', {
                    'status': response.status,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content', {
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>', {
                'status': status,
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        self.inc_stats(info.spider, status)

        try:
            # path = self.file_path(request, response=response, info=info)
            width, height, url_sha2, phash, checksum = self.file_downloaded(
                response, request, info)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s', {
                    'request': request,
                    'referer': referer,
                    'errormsg': str(exc)
                },
                extra={'spider': info.spider},
                exc_info=True)
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>', {
                    'request': request,
                    'referer': referer
                },
                exc_info=True,
                extra={'spider': info.spider})
            raise FileException(str(exc))
        resultDict = {
            'url': request.url,
            'url_sha2': url_sha2,
            'checksum': checksum,
            'width': width,
            'height': height,
            'phash': phash
        }
        print(json.dumps(resultDict))
        raise DropItem("Printed to console")
Example #28
    def media_downloaded(self, response, request, info):
        """
        从content-dispositon中取文件名
        :param response:
        :param request:
        :param info:
        :return:
        """
        referer = referer_str(request)
        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')
        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')
        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, status)

        try:
            containFileName = response.headers.get('Content-Disposition') or response.headers.get('content-disposition')
            if containFileName is not None:
                pattern_marks = re.compile(r'filename="(.*)"')
                pattern_no_marks = re.compile(r'filename=(.*)')
                try:
                    file_name = pattern_marks.search(containFileName.decode('utf-8')) or pattern_no_marks.search(containFileName.decode('utf-8'))
                except:
                    file_name = pattern_marks.search(str(containFileName).split("'")[1]) or pattern_no_marks.search(str(containFileName).split("'")[1])
                if file_name is not None:
                    file_name = urlparse.unquote(file_name.group(1).strip())
                else:
                    file_name = urlparse.unquote(os.path.basename(urlparse.unquote(response.request.url)))
            else:
                file_name = urlparse.unquote(os.path.basename(urlparse.unquote(response.request.url)))
            media_ext = os.path.splitext(file_name)[1]
            if "." not in media_ext or "\\" in media_ext or "/" in media_ext or ":" in media_ext or "*" in media_ext or "?" in media_ext or '"' in media_ext or "<" in media_ext or ">" in media_ext or "|" in media_ext or ";" in media_ext:
                content_type = response.headers.get('Content-Type') or response.headers.get('content-type')
                file_type = "." + content_type.decode('utf-8').split("/")[-1]
                file_name = str(int(time.time())) + file_type
                media_ext = file_type
            if response.meta.get("data"):
                url = request.url + urlparse.urlencode(response.meta["data"])
            else:
                url = request.url
            media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
            path = 'full/%s%s' % (media_guid, media_ext)
            checksum = self.file_downloaded(response, request, info, path)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))

        return {'url': urlparse.unquote(url), 'path': path, 'checksum': checksum, "name": file_name}