def _onsuccess(result):
    """Return the cached file info when it is still fresh, else None.

    A None return forces the media pipeline to download the file again.
    Freshness is judged by the stored last-modified age against
    ``self.expires`` (days).
    """
    if not result:
        return  # returning None force download
    last_modified = result.get('last_modified')
    if not last_modified:
        return  # returning None force download
    # Convert the age from seconds to days before comparing.
    age_days = (time.time() - last_modified) / (60 * 60 * 24)
    if age_days > self.expires:
        return  # returning None force download
    logger.debug(
        'File (uptodate): Downloaded %(medianame)s from %(request)s '
        'referred in <%(referer)s>',
        {'medianame': self.MEDIA_NAME, 'request': request,
         'referer': referer_str(request)},
        extra={'spider': info.spider}
    )
    self.inc_stats(info.spider, 'uptodate')
    return {'url': request.url, 'path': path,
            'checksum': result.get('checksum')}
def crawled(self, request, response, spider):
    """Build the DEBUG log record emitted for each crawled response."""
    if response.flags:
        flags = " %s" % str(response.flags)
    else:
        flags = ""
    args = {
        "status": response.status,
        "request": request,
        "referer": referer_str(request),
        "flags": flags,
    }
    return {"level": logging.DEBUG, "msg": CRAWLEDMSG, "args": args}
def crawled(self, request, response, spider):
    """Return the logging payload describing a crawled response."""
    flags = ''
    if response.flags:
        flags = ' {0!s}'.format(str(response.flags))
    payload = {
        'status': response.status,
        'request': request,
        'referer': referer_str(request),
        'flags': flags,
    }
    return {'level': logging.DEBUG, 'msg': CRAWLEDMSG, 'args': payload}
def media_failed(self, failure, request, info):
    """Log non-ignored download failures, then signal a file error.

    IgnoreRequest failures are deliberate skips and are not logged;
    every failure ends in FileException so callers see one error type.
    """
    unexpected = not isinstance(failure.value, IgnoreRequest)
    if unexpected:
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer_str(request), 'exception': failure.value},
            extra={'spider': info.spider}
        )
    raise FileException
def media_downloaded(self, response, request, info):
    """Handle a completed file download.

    Validates the HTTP status and body, logs the outcome, then
    delegates storage to ``file_path()`` / ``file_downloaded()``.

    Returns a dict with 'url', 'path' and 'checksum'.
    Raises FileException on HTTP errors, empty bodies, or any failure
    while processing the file.
    """
    referer = referer_str(request)
    # Anything other than a plain 200 is treated as a failed download.
    if response.status != 200:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {'status': response.status, 'request': request,
             'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('download-error')
    if not response.body:
        logger.warning(
            'File (empty-content): Empty file from %(request)s referred '
            'in <%(referer)s>: no-content',
            {'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('empty-content')
    # Distinguish cache hits from fresh downloads via response flags
    # (presumably set by an upstream cache layer — confirm).
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {'status': status, 'request': request, 'referer': referer},
        extra={'spider': info.spider}
    )
    self.inc_stats(info.spider, status)
    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        # Known processing failures: log with the message, re-raise as-is.
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {'request': request, 'referer': referer, 'errormsg': str(exc)},
            extra={'spider': info.spider}, exc_info=True
        )
        raise
    except Exception as exc:
        # Unexpected failures get wrapped so callers only catch one type.
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {'request': request, 'referer': referer},
            exc_info=True, extra={'spider': info.spider}
        )
        raise FileException(str(exc))
    return {'url': request.url, 'path': path, 'checksum': checksum}
def log(self, request, spider):
    """Log a filtered duplicate request and bump the dupefilter counter.

    In debug mode every duplicate is logged with its referer; otherwise
    only the first one is reported.
    """
    if self.debug:
        fmt = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
        self.logger.debug(fmt,
                          {'request': request,
                           'referer': referer_str(request)},
                          extra={'spider': spider})
    elif self.logdupes:
        fmt = ("Filtered duplicate request: %(request)s"
               " - no more duplicates will be shown"
               " (see DUPEFILTER_DEBUG to show all duplicates)")
        self.logger.debug(fmt, {'request': request}, extra={'spider': spider})
        self.logdupes = False
    spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
def spider_error(self, failure, request, response, spider):
    """Logs an error message from a spider.

    .. versionadded:: 2.0
    """
    log_args = {
        'request': request,
        'referer': referer_str(request),
    }
    return {'level': logging.ERROR, 'msg': SPIDERERRORMSG, 'args': log_args}
def media_downloaded(self, response, request, info):
    """Delegate to the parent handler, recording failing sites on error.

    Any FileException is reported via ``bad_site``/``error`` and then
    propagated unchanged.
    """
    try:
        return super().media_downloaded(response, request, info)
    except FileException as fe:
        details = {
            'url': response.url,
            'origin': referer_str(request),
            'reason': str(fe),
            'http-status': response.status,
        }
        self.bad_site(details['url'])
        self.error(details)
        raise fe
def crawled(self, request, response, spider):
    """Assemble the debug log entry for a crawled page."""
    def fmt(flags):
        # Prefix with a space so the flags glue onto the message cleanly.
        return ' %s' % str(flags) if flags else ''

    return {
        'level': logging.DEBUG,
        'msg': CRAWLEDMSG,
        'args': {
            'status': response.status,
            'request': request,
            'request_flags': fmt(request.flags),
            'referer': referer_str(request),
            'response_flags': fmt(response.flags),
        }
    }
def crawled(self, request, response, spider):
    """Describe a crawled response for the log formatter."""
    request_flags = ''
    if request.flags:
        request_flags = ' %s' % str(request.flags)
    response_flags = ''
    if response.flags:
        response_flags = ' %s' % str(response.flags)
    args = {
        'status': response.status,
        'request': request,
        'request_flags': request_flags,
        'referer': referer_str(request),
        'response_flags': response_flags,
    }
    return {'level': logging.DEBUG, 'msg': CRAWLEDMSG, 'args': args}
def media_failed(self, failure, request, info):
    """Warn about unexpected download errors, then raise FileException.

    IgnoreRequest is treated as an intentional skip and not logged.
    """
    if not isinstance(failure.value, IgnoreRequest):
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {
                'medianame': self.MEDIA_NAME,
                'request': request,
                'referer': referer_str(request),
                'exception': failure.value,
            },
            extra={'spider': info.spider},
        )
    raise FileException
def log(self, request, spider):
    """Emit a dupefilter message and count the filtered request."""
    if self.debug:
        self.logger.debug(
            "Filtered duplicate request: %(request)s (referer: %(referer)s)",
            {'request': request, 'referer': referer_str(request)},
            extra={'spider': spider},
        )
    elif self.logdupes:
        # Only the first duplicate is reported in non-debug mode.
        once_msg = (
            "Filtered duplicate request: %(request)s"
            " - no more duplicates will be shown"
            " (see DUPEFILTER_DEBUG to show all duplicates)"
        )
        self.logger.debug(once_msg, {'request': request},
                          extra={'spider': spider})
        self.logdupes = False
    spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
def crawled(self, request, response, spider):
    """Build the crawl log record, keeping the pre-1.4 'flags' key."""
    def _flags(value):
        return ' %s' % str(value) if value else ''

    args = {
        'status': response.status,
        'request': request,
        'request_flags': _flags(request.flags),
        'referer': referer_str(request),
        'response_flags': _flags(response.flags),
        # backward compatibility with Scrapy logformatter below 1.4 version
        'flags': _flags(response.flags),
    }
    return {'level': logging.DEBUG, 'msg': CRAWLEDMSG, 'args': args}
def crawled(self, request, response, spider):
    """Log-formatter hook for crawled pages (with legacy 'flags' key)."""
    if request.flags:
        request_flags = ' %s' % str(request.flags)
    else:
        request_flags = ''
    if response.flags:
        response_flags = ' %s' % str(response.flags)
    else:
        response_flags = ''
    return {
        'level': logging.DEBUG,
        'msg': CRAWLEDMSG,
        'args': {
            'status': response.status,
            'request': request,
            'request_flags': request_flags,
            'referer': referer_str(request),
            'response_flags': response_flags,
            # backward compatibility with Scrapy logformatter below 1.4 version
            'flags': response_flags,
        },
    }
def crawled(self, request, response, spider):
    """Logs a message when the crawler finds a webpage."""
    rendered = []
    for flags in (request.flags, response.flags):
        rendered.append(f' {str(flags)}' if flags else '')
    request_flags, response_flags = rendered
    args = {
        'status': response.status,
        'request': request,
        'request_flags': request_flags,
        'referer': referer_str(request),
        'response_flags': response_flags,
        # backward compatibility with Scrapy logformatter below 1.4 version
        'flags': response_flags,
    }
    return {'level': logging.DEBUG, 'msg': CRAWLEDMSG, 'args': args}
def crawled(self, request, response, spider):
    """Logs a message when the crawler finds a webpage."""
    def stringify(flags):
        return " %s" % str(flags) if flags else ""

    args = {
        "status": response.status,
        "request": request,
        "request_flags": stringify(request.flags),
        "referer": referer_str(request),
        "response_flags": stringify(response.flags),
        # backward compatibility with Scrapy logformatter below 1.4 version
        "flags": stringify(response.flags),
    }
    return {"level": logging.DEBUG, "msg": CRAWLEDMSG, "args": args}
def _onsuccess(result):
    """Return stored avatar info when the remote file is unchanged.

    Returning None forces a re-download. Unlike the stock pipeline,
    age/expiration is deliberately ignored; freshness is decided by
    comparing the remote filename against the one recorded in redis
    under REDIS_AVATAR_PUBLISHER_KEY (keyed by request.flags[0]).
    """
    if not result:
        return  # returning None force download
    last_modified = result.get('last_modified', None)
    if not last_modified:
        return  # returning None force download
    # DO NOT CHECK avatar EXPIRATION
    # age_seconds = time.time() - last_modified
    # age_days = age_seconds / 60 / 60 / 24
    # if age_days > self.expires:
    #     return  # returning None force download
    try:
        fn = urllib.parse.urlparse(request.url).path.split('/')[-1]
        r_fn = info.spider.redis.hget(REDIS_AVATAR_PUBLISHER_KEY,
                                      request.flags[0])
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; narrowed. Any lookup failure forces a download.
        return
    if r_fn is None or fn != r_fn.decode('utf-8'):
        logger.info(
            'Remote avatar file changed. Updating avatar for %s. %s -> %s',
            request.flags[0], fn,
            r_fn if r_fn is None else r_fn.decode('utf-8'))
        # Carry the new filename on the request for downstream handling.
        request.flags.append(fn)
        return
    logger.debug(
        'File (uptodate): Downloaded %(medianame)s from %(request)s '
        'referred in <%(referer)s>',
        {
            'medianame': self.MEDIA_NAME,
            'request': request,
            'referer': referer_str(request)
        },
        extra={'spider': info.spider})
    self.inc_stats(info.spider, 'uptodate')
    return {
        'url': request.url,
        'path': path,
        'checksum': result.get('checksum', None),
        'updated': False
    }
def handle_spider_error(self, _failure, request, response, spider):
    """Close the spider on CloseSpider; otherwise log and record the error."""
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    logger.error(
        "Spider error processing %(request)s (referer: %(referer)s)",
        {'request': request, 'referer': referer_str(request)},
        exc_info=failure_to_exc_info(_failure),
        extra={'spider': spider},
    )
    self.signals.send_catch_log(signal=signals.spider_error,
                                failure=_failure,
                                response=response,
                                spider=spider)
    stat_key = "spider_exceptions/%s" % _failure.value.__class__.__name__
    self.crawler.stats.inc_value(stat_key, spider=spider)
def handle_spider_error(self, _failure, request, response, spider):
    """Handle an exception raised while a spider processed a response."""
    exc = _failure.value
    # CloseSpider is a control-flow signal, not an error to report.
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    logger.error("Spider error processing %(request)s (referer: %(referer)s)",
                 {'request': request, 'referer': referer_str(request)},
                 exc_info=failure_to_exc_info(_failure),
                 extra={'spider': spider})
    self.signals.send_catch_log(signal=signals.spider_error, failure=_failure,
                                response=response, spider=spider)
    self.crawler.stats.inc_value(
        "spider_exceptions/%s" % type(exc).__name__, spider=spider)
def file_downloaded(self, response, request, info):
    """Verify an optional SHA256 checksum before storing the file.

    When ``request.meta['sha256']`` is present, the downloaded body must
    match it; a mismatch raises FileException('checksum-mismatch').
    Otherwise delegates to the parent pipeline unchanged.
    """
    expected_csum = request.meta.get('sha256')
    if expected_csum:
        response_csum = sha256sum(BytesIO(response.body))
        logger.debug('Request %s SHA256: expected %s, actual %s',
                     request, expected_csum, response_csum)
        if response_csum != expected_csum:
            logger.warning(
                'File (checksum-mismatch): Error downloading %s '
                'from %s referred in <%s>: expected SHA256 digest %s, '
                'got %s',
                self.MEDIA_NAME, request, referer_str(request),
                expected_csum, response_csum,
                extra={'spider': info.spider})
            raise FileException('checksum-mismatch')
    return super(OpenWrtDownloaderPipeline, self).file_downloaded(
        response, request, info)
def spider_error(self, failure, request, response, spider):
    """Build the ERROR log record for a spider failure, tagged by ticker.

    Reads ticker/report metadata off the response; a missing "Page" key
    defaults to "1". Returns the level/msg/args dict expected by the
    log formatter.
    """
    # NOTE(review): spider_name / report_type / page are computed but
    # unused below; the meta lookups are kept because they raise
    # KeyError on malformed responses (possibly relied upon upstream).
    spider_name = spider.name
    ticker = response.meta["ticker"]
    report_type = response.meta["ReportType"]
    try:
        page = response.meta["Page"]
    except KeyError:  # was a bare `except:`; only a missing key is expected
        page = "1"
    msg_dict = {
        'ticker': ticker,
        'type': "Spider Error",
        'message': "There was an error in the spider...",
    }
    msg = json.dumps(msg_dict)
    return {
        'level': logging.ERROR,
        'msg': msg,
        'args': {
            'request': request,
            'referer': referer_str(request)
        }
    }
def media_downloaded(self, response, request, info):
    """Validate and record a finished file download.

    Checks the HTTP status and body, logs the outcome, then computes
    the storage path and checksum via ``file_path()`` /
    ``file_downloaded()``.

    Returns a dict with 'url', 'path' and 'checksum'.
    Raises FileException on download errors, empty content, or any
    processing failure.
    """
    referer = referer_str(request)
    # Only a plain 200 counts as a successful download.
    if response.status != 200:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {
                'status': response.status,
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        raise FileException('download-error')
    if not response.body:
        logger.warning(
            'File (empty-content): Empty file from %(request)s referred '
            'in <%(referer)s>: no-content',
            {
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        raise FileException('empty-content')
    # Distinguish cache hits from fresh downloads via response flags
    # (flag presumably set by an upstream cache layer — confirm).
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {
            'status': status,
            'request': request,
            'referer': referer
        },
        extra={'spider': info.spider})
    self.inc_stats(info.spider, status)
    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        # Known processing failures: log the message, re-raise unchanged.
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {
                'request': request,
                'referer': referer,
                'errormsg': str(exc)
            },
            extra={'spider': info.spider},
            exc_info=True)
        raise
    except Exception as exc:
        # Unexpected failures are wrapped in FileException.
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {
                'request': request,
                'referer': referer
            },
            exc_info=True,
            extra={'spider': info.spider})
        raise FileException(str(exc))
    return {'url': request.url, 'path': path, 'checksum': checksum}
def media_downloaded(self, response, request, info):
    """Handle a finished download, following 201 Location responses.

    Accepts 200 and 201 statuses. A bodiless 201 carrying a Location
    header triggers a follow-up request for the referenced resource;
    other empty bodies and error statuses raise FileException.

    Returns {'url', 'path', 'checksum'} (or a DeferredList for the
    201 redirect case).
    """
    referer = referer_str(request)
    if response.status not in [200, 201]:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {
                'status': response.status,
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        raise FileException('download-error')
    if not response.body:
        if response.status == 201 and 'location' in response.headers:
            logger.debug(
                'File (code: %(status)s): Status 201 received. Downloading '
                'resource marked by location parameter in the response headers for '
                '%(request)s referred in <%(referer)s>',
                {
                    'status': response.status,
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            # BUG FIX: `loc` was referenced but never defined — take the
            # redirect target from the Location header (bytes -> str).
            loc = response.headers['location'].decode()
            redirect_dlist = [self._process_request(Request(loc), info)]
            redirect_dfd = DeferredList(redirect_dlist, consumeErrors=1)
            return redirect_dfd
        else:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {
                    'request': request,
                    'referer': referer
                },
                extra={'spider': info.spider})
            raise FileException('empty-content')
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {
            'status': status,
            'request': request,
            'referer': referer
        },
        extra={'spider': info.spider})
    self.inc_stats(info.spider, status)
    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {
                'request': request,
                'referer': referer,
                'errormsg': str(exc)
            },
            extra={'spider': info.spider},
            exc_info=True)
        raise
    except Exception as exc:
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {
                'request': request,
                'referer': referer
            },
            exc_info=True,
            extra={'spider': info.spider})
        raise FileException(str(exc))
    return {'url': request.url, 'path': path, 'checksum': checksum}
def parse(self, response):
    """Extract article title/text from supported startup-news sites.

    Dispatches on the response URL, scrapes the headline and body
    paragraphs, and yields one item per article when the fields are
    actually non-empty.
    """
    # techcrunch
    if 'https://techcrunch.com/2' in response.url:
        title = response.css('h1.article__title::text').get()
        item = self._build_item(response, title,
                                '//div[@class="article-content"]//p')
        if item:
            yield item
    # startupsavant
    elif 'https://startupsavant.com/news' in response.url:
        title = response.css('h1.headline::text').get()
        item = self._build_item(response, title,
                                '//div[@class="row"]//p')
        if item:
            yield item
    # techstartups
    elif 'https://techstartups.com/2' in response.url:
        title = response.css('div.post_header_title h1::text').get()
        item = self._build_item(response, title,
                                '//div[@class="post_content_wrapper"]//p')
        if item:
            yield item

def _build_item(self, response, title, paragraph_xpath):
    """Collect paragraph text and build an article item; None when empty.

    Fixes two defects in the original inline code: paragraphs are now
    accumulated instead of keeping only the last one, and the
    null-value checks use a conjunction (the original `or` chain was
    always true, so empty articles were yielded).
    """
    parts = []
    for node in response.xpath(paragraph_xpath):
        parts.append(''.join(node.xpath('string()').extract()))
    text = ''.join(parts)
    if not title or title == 'None' or not text.strip():
        return None
    return {
        'SourceLink': referer_str(response.request),
        'Link': response.url,
        'Title': str(title).encode('utf-8'),
        'Text': str(title).encode('utf-8') + str(text).encode('utf-8'),
    }
def media_downloaded(self, response, request, info):
    """Handle a finished download, manually following 301/302 redirects.

    Redirects are resolved with a blocking ``requests.get`` call and the
    resulting payload is wrapped back into a Response before the normal
    validation (status, body) and storage steps run.

    Returns {'url', 'path', 'checksum'}; raises FileException on errors.
    """
    referer = referer_str(request)
    # NOTE(review): blocking HTTP call inside an async pipeline — this
    # stalls the Twisted reactor while the redirect is fetched; consider
    # the framework's redirect middleware instead.
    if response.status == 301 or response.status == 302:
        logger.info('Following redirect in %s', request)
        redirect_location = response.headers['Location'].decode()
        r = requests.get(redirect_location)
        # Re-wrap the fetched payload so the rest of the method can treat
        # it like the original download.
        response = Response(redirect_location, status=r.status_code,
                            body=r.content, request=request)
        logger.info('Followed redirect. Result: %s', str(response))
    if response.status != 200:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {'status': response.status, 'request': request,
             'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('download-error')
    if not response.body:
        logger.warning(
            'File (empty-content): Empty file from %(request)s referred '
            'in <%(referer)s>: no-content',
            {'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('empty-content')
    # Distinguish cache hits from fresh downloads via response flags.
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {'status': status, 'request': request, 'referer': referer},
        extra={'spider': info.spider}
    )
    self.inc_stats(info.spider, status)
    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        # Known processing failures: log the message, re-raise unchanged.
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {'request': request, 'referer': referer, 'errormsg': str(exc)},
            extra={'spider': info.spider}, exc_info=True
        )
        raise
    except Exception as exc:
        # Unexpected failures are wrapped in FileException.
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {'request': request, 'referer': referer},
            exc_info=True, extra={'spider': info.spider}
        )
        raise FileException(str(exc))
    return {'url': request.url, 'path': path,
            'checksum': checksum}
def media_downloaded(self, response, request, info):
    """Handle a finished download, capturing the server-sent filename.

    In addition to the standard validation/storage flow, this variant
    records the filename from a Content-Disposition header (when
    present) on ``self.filename`` for later use.

    Returns {'url', 'path', 'checksum'}; raises FileException on errors.
    """
    referer = referer_str(request)
    # Only a plain 200 counts as a successful download.
    if response.status != 200:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {'status': response.status, 'request': request,
             'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('download-error')
    if not response.body:
        logger.warning(
            'File (empty-content): Empty file from %(request)s referred '
            'in <%(referer)s>: no-content',
            {'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('empty-content')
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    if ('content-disposition' in response.headers):
        # Pineapple crawler returns content-disposition header
        # but it is type of "bytes" so it needes to be encoded as ascii.
        # Decode function can't be called on string, so need to check type
        if isinstance(response.headers['content-disposition'], bytes):
            d = response.headers['content-disposition'].decode('ascii')
        else:
            d = response.headers['content-disposition']
        # NOTE(review): fname[0] raises IndexError if the header has no
        # "filename=" parameter — confirm upstream always sets it.
        fname = re.findall("filename=(.+)", d)
        self.filename = fname[0]
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {'status': status, 'request': request, 'referer': referer},
        extra={'spider': info.spider}
    )
    self.inc_stats(info.spider, status)
    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        # Known processing failures: log the message, re-raise unchanged.
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {'request': request, 'referer': referer, 'errormsg': str(exc)},
            extra={'spider': info.spider}, exc_info=True
        )
        raise
    except Exception as exc:
        # Unexpected failures are wrapped in FileException.
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {'request': request, 'referer': referer},
            exc_info=True, extra={'spider': info.spider}
        )
        raise FileException(str(exc))
    return {'url': request.url, 'path': path, 'checksum': checksum}
def media_downloaded(self, response, request, info):
    """Download handler that prints image metadata and drops the item.

    Validates the response, computes image metadata (dimensions,
    url hash, perceptual hash, checksum) via ``file_downloaded``,
    prints the result as JSON to stdout, and raises DropItem so
    nothing is stored.
    """
    referer = referer_str(request)
    if response.status != 200:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {
                'status': response.status,
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        raise FileException('download-error')
    if not response.body:
        logger.warning(
            'File (empty-content): Empty file from %(request)s referred '
            'in <%(referer)s>: no-content',
            {
                'request': request,
                'referer': referer
            },
            extra={'spider': info.spider})
        raise FileException('empty-content')
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {
            'status': status,
            'request': request,
            'referer': referer
        },
        extra={'spider': info.spider})
    self.inc_stats(info.spider, status)
    try:
        # path = self.file_path(request, response=response, info=info)
        width, height, url_sha2, phash, checksum = self.file_downloaded(
            response, request, info)
    except FileException as exc:
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {
                'request': request,
                'referer': referer,
                'errormsg': str(exc)
            },
            extra={'spider': info.spider},
            exc_info=True)
        raise
    except Exception as exc:
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {
                'request': request,
                'referer': referer
            },
            exc_info=True,
            extra={'spider': info.spider})
        raise FileException(str(exc))
    resultDict = {
        'url': request.url,
        'url_sha2': url_sha2,
        'checksum': checksum,
        'width': width,
        # NOTE(review): 'heigth' is misspelled, but downstream consumers
        # may rely on this key — do not rename without checking them.
        'heigth': height,
        'phash': phash
    }
    # BUG FIX: was a Python 2 print statement (`print json.dumps(...)`),
    # a SyntaxError under Python 3; the call form works on both.
    print(json.dumps(resultDict))
    raise DropItem("Printed to console")
def media_downloaded(self, response, request, info):
    """Handle a finished download, deriving the file name from headers.

    The file name is taken from the Content-Disposition header when
    present, otherwise from the URL basename; when the resulting
    extension contains invalid filesystem characters, a timestamped
    name is synthesised from Content-Type.

    Returns {'url', 'path', 'checksum', 'name'}; raises FileException
    on download or processing errors.
    """
    referer = referer_str(request)
    if response.status != 200:
        logger.warning(
            'File (code: %(status)s): Error downloading file from '
            '%(request)s referred in <%(referer)s>',
            {'status': response.status, 'request': request,
             'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('download-error')
    if not response.body:
        logger.warning(
            'File (empty-content): Empty file from %(request)s referred '
            'in <%(referer)s>: no-content',
            {'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        raise FileException('empty-content')
    status = 'cached' if 'cached' in response.flags else 'downloaded'
    logger.debug(
        'File (%(status)s): Downloaded file from %(request)s referred in '
        '<%(referer)s>',
        {'status': status, 'request': request, 'referer': referer},
        extra={'spider': info.spider}
    )
    self.inc_stats(info.spider, status)
    try:
        containFileName = (response.headers.get('Content-Disposition')
                           or response.headers.get('content-disposition'))
        if containFileName is not None:
            pattern_marks = re.compile(r'filename="(.*)"')
            pattern_no_marks = re.compile(r'filename=(.*)')
            try:
                decoded = containFileName.decode('utf-8')
                file_name = (pattern_marks.search(decoded)
                             or pattern_no_marks.search(decoded))
            except (UnicodeDecodeError, AttributeError):
                # Was a bare `except:`; only a failed decode (or a header
                # already delivered as str) is expected here — fall back
                # to splitting the value's repr.
                fallback = str(containFileName).split("'")[1]
                file_name = (pattern_marks.search(fallback)
                             or pattern_no_marks.search(fallback))
            if file_name is not None:
                file_name = urlparse.unquote(file_name.group(1).strip())
            else:
                file_name = urlparse.unquote(
                    os.path.basename(urlparse.unquote(response.request.url)))
        else:
            file_name = urlparse.unquote(
                os.path.basename(urlparse.unquote(response.request.url)))
        media_ext = os.path.splitext(file_name)[1]
        # An extension with path separators / wildcard chars (or no dot
        # at all) is unusable — synthesise a name from the Content-Type.
        if "." not in media_ext or any(
                ch in media_ext for ch in '\\/:*?"<>|;'):
            content_type = (response.headers.get('Content-Type')
                            or response.headers.get('content-type'))
            file_type = "." + content_type.decode('utf-8').split("/")[-1]
            file_name = str(int(time.time())) + file_type
            media_ext = file_type
        # Re-attach the POSTed form data so the stored URL is unique.
        if response.meta.get("data"):
            url = request.url + urlparse.urlencode(response.meta["data"])
        else:
            url = request.url
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        path = 'full/%s%s' % (media_guid, media_ext)
        checksum = self.file_downloaded(response, request, info, path)
    except FileException as exc:
        logger.warning(
            'File (error): Error processing file from %(request)s '
            'referred in <%(referer)s>: %(errormsg)s',
            {'request': request, 'referer': referer, 'errormsg': str(exc)},
            extra={'spider': info.spider}, exc_info=True
        )
        raise
    except Exception as exc:
        logger.error(
            'File (unknown-error): Error processing file from %(request)s '
            'referred in <%(referer)s>',
            {'request': request, 'referer': referer},
            exc_info=True, extra={'spider': info.spider}
        )
        raise FileException(str(exc))
    return {'url': urlparse.unquote(url), 'path': path,
            'checksum': checksum, "name": file_name}