def test_results_are_cached_across_multiple_items(self):
    rsp1 = Response('http://url1')
    req1 = Request('http://url1', meta=dict(response=rsp1))
    item = dict(requests=req1)
    new_item = yield self.pipe.process_item(item, self.spider)
    self.assertTrue(new_item is item)
    self.assertEqual(new_item['results'], [(True, rsp1)])

    # rsp2 is ignored, rsp1 must be in results because request fingerprints are the same
    req2 = Request(req1.url, meta=dict(response=Response('http://donot.download.me')))
    item = dict(requests=req2)
    new_item = yield self.pipe.process_item(item, self.spider)
    self.assertTrue(new_item is item)
    self.assertEqual(request_fingerprint(req1), request_fingerprint(req2))
    self.assertEqual(new_item['results'], [(True, rsp1)])
def parse(self, response):
    cate = response.meta['cate']
    # collect all news links on the current page
    node_list = response.xpath('//div[@class="list_left"]/ul/li')
    for node in node_list:
        link_list = node.xpath('./a/@href').extract_first()
        if link_list:
            item = ScrapyHuodianbjxV101Item()
            item['content_url'] = link_list
            item['issue_time'] = node.xpath('./span/text()').extract_first()
            req = scrapy.Request(url=link_list, callback=self.parse2,
                                 meta={'item': item, 'cate': cate})
            item["id"] = request.request_fingerprint(req)
            yield req
            # print(link_list)
    # link to the next news page
    next_page1 = response.xpath('//div[@class="page"]/a[@title="下一页"]/@href').extract_first()
    next_page2 = response.xpath('//div[@class="tempa page btemp"]/a[@title="下一页"]/@href').extract_first()
    if next_page1:
        next_url = f'http://huodian.bjx.com.cn{next_page1}'
        # print(next_url)
        yield scrapy.Request(url=next_url, callback=self.parse, meta={'cate': cate})
    elif next_page2:
        next_url = f'http://huodian.bjx.com.cn{next_page2}'
        # print(next_url)
        yield scrapy.Request(url=next_url, callback=self.parse, meta={'cate': cate})
def request_seen(self, request):
    fp = request_fingerprint(request)
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        self.file.write(fp + os.linesep)
def request_seen(self, request):
    fp = request_fingerprint(request)
    if self.exists(fp):
        return True
    self.insert(fp)
    return False
def parse(self, response):
    config_list = response.xpath('//div[@class="wonderful"]/ul/li')
    for config in config_list:
        item = ScrapyCcbuildV1Item()
        link = config.xpath('./a/@href').extract_first()
        # print(link)
        req = scrapy.Request(url=link, callback=self.parse_peple,
                             dont_filter=True, meta={'item': item})
        item['content_url'] = link
        item['id'] = request.request_fingerprint(req)
        title_images = config.xpath('./a/img/@src').extract_first()
        if title_images:
            if 'http' in title_images:
                item['title_images'] = title_images
            else:
                item['title_images'] = self.base_url + title_images
        else:
            item['title_images'] = None
        item['issue_time'] = config.xpath(
            './a/article/div/span[2]/text()').extract_first()[:9]
        item['tags'] = config.xpath(
            './a/article/div/span[1]/text()').extract_first()
        yield req
def parse_item(self, response):
    #i = DoubancrawlItem()
    print "ljw: *****************"
    print response.url
    item = qtfyItem()
    #tp = md5.new(response.url)
    #item["id"] = tp.hexdigest()
    item["id"] = request_fingerprint(response.request)
    item["url"] = response.url
    item["name"] = response.xpath("//h2/text()").extract()[0]
    entry = response.xpath('//div[@class="entry"]/p')
    item["desc"] = entry[0].xpath("text()").extract()[0]
    infos = entry[1].xpath("node()").extract()
    if len(infos) < 3:
        infos = entry[2].xpath("node()").extract()
    item["info"] = reduce((lambda x, y: x + y), infos)
    item["img"] = entry.xpath('img/@src').extract()[0]
    lnks = entry.xpath('a')
    lkret = []
    for ln in lnks:
        href = ln.xpath("@href").extract()[0]
        if not self.isDownloadLink(href):
            continue
        title = ln.xpath("text()").extract()[0]
        lkret.append({"title": title, "href": href})
    item["links"] = lkret
    #print item
    #inspect_response(response, self)
    yield item
def request_seen(self, request):
    fingerprint = request_fingerprint(request)
    if fingerprint in self.record:
        print(request.url, 'already visited')
        return True
    else:
        self.record.add(fingerprint)
def process_response(self, request, response, spider):
    if type(response) is HtmlResponse and len(response.body) > 4000 and response.status != 403:
        times = request.meta.get('retry_times2', 0)
        if times > 0:
            log.msg('retry %s time(s) saved %s' % (times, response.url), level=log.ERROR)
    retry = False
    if response.status == 403:
        retry = True
    if type(response) is HtmlResponse:
        if len(response.body) < 4000:
            retry = True
        elif response_is_invalid(response):
            retry = True
    if retry:
        retries = request.meta.get('retry_times2', 0) + 1
        if retries < 10:
            key = request_fingerprint(request)
            rpath = os.path.join(self.cachedir, spider.name, key[0:2], key)
            metapath = os.path.join(rpath, 'pickled_meta')
            if os.path.exists(metapath):
                os.unlink(metapath)
            retryreq = request.copy()
            retryreq.meta['retry_times2'] = retries
            retryreq.dont_filter = True
            log.msg('retry %s time(s) %s' % (retries, request.url), level=log.ERROR)
            return retryreq
        log.msg('give up %s' % request.url, level=log.ERROR)
        raise IgnoreRequest
    return response
def save_response(self, response, spider):
    if isinstance(response, TextResponse):
        fp = request_fingerprint(response.request)
        payload = {
            "_key": fp,
            "_jobid": self.hsref.job.key,
            "_type": "_pageitem",
            "_encoding": response.encoding,
            "url": response.url,
        }
        self._set_cookies(payload, response)
        if response.request.method == 'POST':
            payload["postdata"] = dict(parse_qsl(response.request.body.decode()))
        payload["body"] = response.body_as_unicode()
        if self.trim_html:
            payload['body'] = payload['body'].strip(' \r\n\0')
        if len(payload['body']) > self._writer.maxitemsize:
            spider.logger.warning("Page not saved, body too large: <%s>" % response.url)
            return
        try:
            self._writer.write(payload)
        except ValueTooLarge as exc:
            spider.logger.warning("Page not saved, %s: <%s>" % (exc, response.url))
def parse(self, response):
    self.logger.info("Parse")
    soup = BeautifulSoup(response.body, 'lxml')
    all_img_tags = filter(self.tagfilter, soup.find_all('img'))
    candidate_img_tags = filter(self.maybe_ad_img, all_img_tags)
    for candidate in candidate_img_tags:
        ad_url = self.get_img_ad_url(candidate)
        ad_src = candidate.attrs.get('src', None)
        ad_url = fix_url(ad_url)
        ad_src = fix_url(ad_src)
        if not ad_url or not ad_src:
            continue
        item = AdcrawlerDataTaskItem()
        item['url'] = ad_url
        item['fingerprint'] = request_fingerprint(response.request)
        item['ad_img_urls'] = [ad_src]
        yield item
    all_a_tags = soup.find_all('a')
    if self.server.llen(self.redis_key) > self.settings.get('MAX_URL_TASKS', self.DEFAULT_MAX_URL_TASKS):
        return
    for at in all_a_tags:
        a_url = at.attrs.get('href', None)
        a_url = fix_url(a_url)
        if not a_url:
            continue
        next_task = AdcrawlerUrlTaskItem()
        next_task['url'] = a_url
        next_task['cur_depth'] = response.meta['cur_depth'] + 1
        yield next_task
def process_spider_output(self, response, result, spider):
    f = self.outputs[spider]
    fp = request_fingerprint(response.request)
    tracetime = time.time()
    data = self._objtodict(self.RESPONSE_ATTRS, response)
    data['request'] = self._objtodict(self.REQUEST_ATTRS, response.request)
    self._write(f, fp, tracetime, 'response', data)
    for item in result:
        if isinstance(item, Request):
            data = self._objtodict(self.REQUEST_ATTRS, item)
            data['fp'] = request_fingerprint(item)
            self._write(f, fp, tracetime, 'request', data)
        else:
            self._write(f, fp, tracetime, 'item', dict(item))
        yield item
def parse(self, response):
    cate = response.meta['cate']
    config_list = response.xpath('//div[@class="catlist"]/ul/li')
    m = [6, 12, 18, 24]
    n = 1
    for config in config_list:
        # print(config)
        if n not in m:
            item = ScrapyJc123V101Item()
            # title_img = config.xpath('./a/img/@src').extract_first()
            title = config.xpath('./a/text()').extract_first()
            link = config.xpath('./a/@href').extract_first()
            issue_time = config.xpath('./span/text()').extract_first()[:10]
            item['title'] = title
            item['issue_time'] = issue_time
            item['content_url'] = link
            item['information_categories'] = cate
            item['title_images'] = None
            # print(item)
            req = scrapy.Request(url=link, callback=self.parse2,
                                 meta={'item': item}, dont_filter=True)
            item['id'] = request.request_fingerprint(req)
            # print(time, title, link)
            yield req
        n += 1
def _check_downloading(response):
    fp = request_fingerprint(req1)
    self.assertTrue(fp in self.info.downloading)
    self.assertTrue(fp in self.info.waiting)
    self.assertTrue(fp not in self.info.downloaded)
    self.assertEqual(len(self.info.waiting[fp]), 2)
    return response
def parse(self, response):
    # pass
    cate = response.meta['cate']
    config_list = response.xpath('//ul[@class="pagelist"]/li')
    # print(len(config_list))
    n = [10, 21, 32]
    m = 0
    for config in config_list:
        # print(config)
        if m not in n:
            # print(m)
            item = ScrapyChinabaogaoV101Item()
            title = config.xpath('./h3/a/text()').extract_first()
            link = config.xpath('./h3/a/@href').extract_first()
            issue_time = config.xpath('./span/text()').extract_first()
            item['title'] = title
            item['issue_time'] = issue_time
            item['content_url'] = link
            item['information_categories'] = cate
            item['title_images'] = None
            # print(item)
            req = scrapy.Request(url=link, callback=self.parse2,
                                 meta={'item': item}, dont_filter=True)
            item['id'] = request.request_fingerprint(req)
            yield req
        m += 1
def parse(self, response):
    config_list = response.xpath('//div[@class="catlist"]/ul/li')
    num = [5, 11, 17, 23]
    for i in range(len(config_list)):
        if i not in num:
            item = Scrapy9ToV1Item()
            link = config_list[i].xpath('./a/@href').extract_first()
            title = config_list[i].xpath('./a/text()').extract_first()
            issue_time = config_list[i].xpath('./i/text()').extract_first()
            # print(title, link, issue_time)
            req = scrapy.Request(url=link, callback=self.parse_detail,
                                 meta={'item': item})
            item['id'] = request.request_fingerprint(req)
            item['title'] = title
            item['title_images'] = None
            item['content_url'] = link
            item['issue_time'] = issue_time[:10] if issue_time else None
            item['industry_categories'] = 'E'
            item['industry_Lcategories'] = response.meta['industry_Lcategories'][:2]
            item['industry_Mcategories'] = response.meta['industry_Lcategories']
            item['industry_Scategories'] = None
            item['information_categories'] = response.meta['information_categories']
            yield req
def store_response(self, spider, request, response):
    """Store the given response in MongoDB."""
    key = request_fingerprint(request)
    response_headers = headers_dict_to_raw(response.headers)
    response_body = self._get_body(response.headers, response.body)
    request_headers = headers_dict_to_raw(request.headers)
    request_body = self._get_body(request.headers, request.body)
    stored_data = {
        'metadata': {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        },
        'response_headers': response_headers,
        'response_body': response_body,
        'request_headers': request_headers,
        'request_body': request_body,
    }
    #print stored_data
    collection_index = int(key, 16) % 1000
    collection_name = 'collection' + str(collection_index)
    collection = self.db[collection_name]
    try:
        collection.insert({"_id": key, "value": stored_data})
        print "-----------------Write cache %s------------------" % collection_name
    except Exception, e:
        print e.message
        pass
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)
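# A minimal wiring sketch (assumption, not taken from the snippet above): a
# dupefilter like this one is normally enabled through Scrapy's
# DUPEFILTER_CLASS setting; the dotted path below is a hypothetical project
# layout, not one named in the source.
# settings.py
DUPEFILTER_CLASS = 'myproject.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = True  # log every duplicate request the filter drops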
def splash_request_fingerprint(request, include_headers=None):
    """ Request fingerprint which takes 'splash' meta key into account """
    fp = request_fingerprint(request, include_headers=include_headers)
    if 'splash' not in request.meta:
        return fp
    return dict_hash(request.meta['splash'], fp)
def parse(self, response):
    config_list = response.xpath('//div[@class="morenews"]/ul/li')
    for con in config_list:
        item = ScrapyNewsbuildhrV1Item()
        title = con.xpath('./h1/a/text()').extract_first()
        link = 'http:' + con.xpath('./h1/a/@href').extract_first()
        source = con.xpath('./h1/span/address/text()').extract_first()
        issue_time = con.xpath('./h1/span/b/text()').extract_first().replace(
            '年', '-').replace('月', '-').replace('日', '')
        # print(title, link, issue_time, source)
        req = scrapy.Request(url=link, callback=self.parse_detail,
                             dont_filter=True, meta={'item': item})
        item['id'] = request.request_fingerprint(req)
        item['title'] = title
        item['title_images'] = None
        item['content_url'] = link
        item['issue_time'] = issue_time
        item['source'] = source[3:] if source else '建筑英才网'
        yield req
def _process_request(self, request, info):
    fp = request_fingerprint(request)
    cb = request.callback or (lambda _: _)
    eb = request.errback
    request.callback = None
    request.errback = None

    # Return cached result if request was already seen
    if fp in info.downloaded:
        return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

    # Otherwise, wait for result
    wad = Deferred().addCallbacks(cb, eb)
    info.waiting[fp].append(wad)

    # Check if request is downloading right now to avoid doing it twice
    if fp in info.downloading:
        return wad

    # Download request checking media_to_download hook output first
    info.downloading.add(fp)
    dfd = mustbe_deferred(self.media_to_download, request, info)
    dfd.addCallback(self._check_media_to_download, request, info)
    dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
    dfd.addErrback(log.err, spider=info.spider)
    return dfd.addBoth(lambda _: wad)  # it must return wad at last
def request_reseen(self, response, request, spider):
    if 200 != response.status:
        fp = request_fingerprint(request)
        if self.server.sismember(self.key, fp):
            self.server.srem(self.key, fp)
            print "reseen [sts]%d [url]%s!" % (response.status, request.url)
def test_get_media_requests(self):
    # returns single Request (without callback)
    req = Request('http://url')
    item = dict(requests=req)  # pass a single item
    new_item = yield self.pipe.process_item(item, self.spider)
    assert new_item is item
    assert request_fingerprint(req) in self.info.downloaded

    # returns iterable of Requests
    req1 = Request('http://url1')
    req2 = Request('http://url2')
    item = dict(requests=iter([req1, req2]))
    new_item = yield self.pipe.process_item(item, self.spider)
    assert new_item is item
    assert request_fingerprint(req1) in self.info.downloaded
    assert request_fingerprint(req2) in self.info.downloaded
def test_process_spider_output(self):
    fake_response = mock.Mock()
    fake_response.request = Request('http://source-request')

    def sort_requests_and_items(val):
        return val.__class__.__name__

    fake_result = sorted([
        Request('ftp://req1'),
        Request('https://req2'),
        Response('http://source-request'),
        DictItem(),
        {'field1': 'value1'}
    ], key=sort_requests_and_items)
    results = self.instance.process_spider_output(
        fake_response, fake_result, self.spider)
    assert isinstance(results, types.GeneratorType)
    for r in sorted(results, key=sort_requests_and_items):
        assert isinstance(r, type(fake_result.pop(0)))
        if isinstance(r, (DictItem, dict)):
            self.assertEqual(r["_cached_page_id"],
                             request_fingerprint(fake_response.request))

    bad_fake_request = DictItem()
    bad_fake_request._values = None
    self.instance.process_spider_exception = mock.Mock()
    with self.assertRaises(TypeError):
        for _ in self.instance.process_spider_output(
                fake_response, [bad_fake_request], self.spider):
            pass
    assert self.instance.process_spider_exception.called
def test_get_key(self):
    mw = self.mwcls(self.temp_dir, reset=True)
    test_req1 = Request('http://url1')
    self.assertEqual(mw._get_key(test_req1),
                     to_bytes(request_fingerprint(test_req1)))
    test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
    self.assertEqual(mw._get_key(test_req2), b'dfkey1')
def delete_cached_request(self, request):
    '''
    Delete a cached request from FilesystemCacheStorage.
    Returns True if successful.
    '''
    import os.path
    from shutil import rmtree
    from scrapy.http import Request
    from scrapy.utils.request import request_fingerprint

    if not settings['HTTPCACHE_ENABLED'] or \
            'httpcache.FilesystemCacheStorage' not in settings['HTTPCACHE_STORAGE']:
        self.log('HTTPCACHE is disabled or HTTPCACHE_STORAGE is not FilesystemCacheStorage.',
                 log.ERROR)
        return False
    if not request or not isinstance(request, Request):
        raise TypeError('Invalid argument "request"')
    req_fp = request_fingerprint(request)
    req_dir = os.path.join('.scrapy', settings['HTTPCACHE_DIR'], self.name,
                           req_fp[:2], req_fp)
    if not os.path.exists(req_dir):
        self.log('Path does not exist or permission denied %s' % req_dir, log.ERROR)
        return False
    try:
        rmtree(req_dir)
        self.log('Deleted cached request %s, url %s' % (req_dir, request.url), log.DEBUG)
        return True
    except Exception:
        self.log('Error deleting %s' % req_dir, log.ERROR)
        return False
def request_seen(self, request):
    fp = request_fingerprint(request)
    if self.bf.exists(fp):  # bf is an instance of BloomFilter
        return True
    self.bf.insert(fp)
    return False
def request_seen(self, request):
    fp = request_fingerprint(request)
    if self.bf.isContains(fp):  # already seen
        return True
    else:
        self.bf.insert(fp)
        return False
def parse(self, response):
    cate = response.meta['cate']
    print(response.status)
    new_urls = response.xpath('//div[@class="main_left"]/div')
    a = 0
    for new_url in new_urls:
        if a != 0 and a != 21:
            item = ScrapyPowerofweekV101Item()
            item['content_url'] = new_url.xpath('.//h3/a/@href').extract_first()
            item['tags'] = new_url.xpath(
                './/div[@class="tag"]/span[1]/a/text()').extract_first().strip()
            item['title_images'] = None
            item['industry_categories'] = 'D'
            item['industry_Lcategories'] = '44'
            item['industry_Mcategories'] = '441'
            item['industry_Scategories'] = None
            item['information_categories'] = cate
            req = scrapy.Request(url=item['content_url'], callback=self.parse2,
                                 meta={'item': item})
            item["id"] = request.request_fingerprint(req)
            yield req
        a += 1
def parse_candidate_overview(self, response):
    if response.status == 429:
        lostitem_str = 'lost overview: ' + response.url
        self.lostitem_file.write(lostitem_str)
        self.lostitem_file.close()
        raise CloseSpider(reason='IP got banned, about to switch IP')
    print '-----------start to process: ' + response.url
    headers = response.request.headers
    headers["referer"] = response.url
    item = RGPersonItem()
    featured_researches = response.xpath('//div[contains(@class, "profile-highlights-publications")]').extract()
    address = DataFilter.simple_format(response.xpath('//div[contains(@class, "institution-location")]/text()').extract())
    add_list = address.split(',')
    add_len = len(add_list)
    if add_len == 3:
        city = add_list[0].strip()
        province = add_list[1].strip()
        country = add_list[2].strip()
    elif add_len == 2:
        city = add_list[0].strip()
        province = ''
        country = add_list[1].strip()
    elif add_len == 1:
        city = add_list[0].strip()
        province = ''
        country = ''
    else:
        city = address
        province = ''
        country = ''
    person_key = request_fingerprint(response.request)
    item['person_key'] = person_key
    item['fullname'] = DataFilter.simple_format(response.xpath('//a[@class = "ga-profile-header-name"]/text()').extract())
    item['target_sciences'] = DataFilter.simple_format(response.xpath('//*[@id="target-sciences"]/text()').extract())
    item['title'] = DataFilter.simple_format(response.xpath('//*[contains(@class,"profile-degree")]/div[@class="title"]/text()').extract())
    item['score'] = DataFilter.simple_format(response.xpath('//span[starts-with(@class, "score-link")]').extract())
    top_coauthors = response.xpath('//div[starts-with(@class, "authors-block")]//ul/li//h5[@class="ga-top-coauthor-name"]/a')
    item['co_authors'] = parse_text_by_multi_content(top_coauthors, "|")
    skills_expertise = response.xpath('//div[starts-with(@class, "profile-skills")]/ul/li//a[starts-with(@class, "keyword-list-token-text")]')
    item['skills'] = parse_text_by_multi_content(skills_expertise, "|")
    topics = response.xpath('//ul[@class="keyword-list clearfix"]/li//a[starts-with(@class, "keyword-list-token-text")]')
    item['topics'] = parse_text_by_multi_content(topics, "|")
    item['institution'] = DataFilter.simple_format(response.xpath('//div[starts-with(@class, "institution-name")]').extract())
    item['department'] = DataFilter.simple_format(response.xpath('//div[@class = "institution-dept"]').extract())
    item['city'] = city
    item['province'] = province
    item['country'] = country
    if featured_researches and country != 'China':
        url = response.url + "/publications"
        yield item
        yield Request(url, headers=headers, callback=self.parse_contribution,
                      dont_filter=True, meta={"person_key": person_key})
    else:
        print "--------Nothing to return, it is invalid--------"
def parse(self, response):
    config_list = response.xpath('//div[@class="show2 left"]/div')
    num = [0, 1, 12]
    for i in range(len(config_list)):
        if i not in num:
            item = ScrapyCnbridgeV1Item()
            title = config_list[i].xpath(
                './/div[@class="list5"]/a/text()').extract_first()
            link = self.base_url + config_list[i].xpath(
                './/div[@class="list5"]/a/@href').extract_first()
            title_images = config_list[i].xpath('.//img/@src').extract_first()
            # print(link, title, title_images)
            req = scrapy.Request(url=link, callback=self.parse_detail,
                                 meta={'item': item})
            item['id'] = request.request_fingerprint(req)
            item['title'] = title
            item['title_images'] = title_images
            item['content_url'] = link
            item['industry_categories'] = 'E'
            item['industry_Lcategories'] = '48'
            item['industry_Mcategories'] = '481'
            item['industry_Scategories'] = None
            item['information_categories'] = '行业资讯'
            yield req
def request_seen(self, request):
    fp = request_fingerprint(request)
    if self.bf.isContains(fp):
        return True
    else:
        self.bf.insert(fp)
        return False
def test_get_media_requests(self):
    # returns single Request (without callback)
    info = self.pipe.spiderinfo[self.spider]
    req = Request("http://media.com/2.gif")
    item = dict(requests=req)  # pass a single item
    new_item = yield self.pipe.process_item(self.spider, item)
    assert new_item is item
    assert request_fingerprint(req) in info.downloaded

    # returns iterable of Requests
    req1 = Request("http://media.com/1.gif")
    req2 = Request("http://media.com/1.jpg")
    item = dict(requests=iter([req1, req2]))
    new_item = yield self.pipe.process_item(self.spider, item)
    assert new_item is item
    assert info.downloaded.get(request_fingerprint(req1)) is None
    assert info.downloaded.get(request_fingerprint(req2)) is None
def request_seen(self, request):
    fp = request_fingerprint(request)
    crawlid = request.meta['crawlid']
    key = self.key + ':' + crawlid
    added = self.server.sadd(key, fp)
    self.server.expire(key, self.timeout)
    return not added
def _request_key(self, spider, request):
    rfp = request_fingerprint(request)
    # We could disable the namespacing in sharded mode (old behaviour),
    # but keeping it allows us to merge collections later without
    # worrying about key conflicts.
    #if self.sharded:
    #    return rfp
    return '%s/%s' % (spider.name, rfp)
def request_seen(self, request):
    fp = request_fingerprint(request)
    added = self.server.sadd(self.key, fp)
    if self.ttl > 0:
        self.server.expire(self.key, self.ttl)
    return not added
def request_seen(self, request):
    fp = request_fingerprint(request)
    c_id = request.meta['crawlid']
    added = self.server.sadd(self.key + ":" + c_id, fp)
    self.server.expire(self.key + ":" + c_id, self.timeout)
    return not added
def get_request_finger(url):
    """
    Get the fingerprint of a URL (query-parameter order does not matter).
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req)
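# A minimal usage sketch (assumption, not part of the source snippet above):
# Scrapy's request_fingerprint canonicalizes the URL before hashing, so two
# URLs that differ only in query-parameter order yield the same fingerprint.
# The example URL is hypothetical.
fp_a = get_request_finger('http://example.com/search?a=1&b=2')
fp_b = get_request_finger('http://example.com/search?b=2&a=1')
assert fp_a == fp_b  # same canonical URL -> same fingerprint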
def request_seen(self, request):
    fp = request_fingerprint(request)
    q = 'INSERT INTO crawl_history VALUES (?, ?)'
    args = (fp, request.url)
    try:
        self.conn.execute(q, args)
        self.conn.commit()
    except sqlite3.IntegrityError:
        return True
def request_seen(self, request):
    level = self.determine_level(request)
    if level == 5:
        fp = request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
def request_fingerprint(self, request):
    # Query-parameter order matters to lizard-rc.se, so hash the raw
    # method/URL/body instead of using the canonicalizing default.
    if urlparse.urlparse(request.url).netloc == "lizard-rc.se":
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(request.url)
        fp.update(request.body or b'')
        return fp.hexdigest()
    return request_fingerprint(request)
def request_seen(self, request):
    fp = request_fingerprint(request)
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if 'not_write_file' in request.meta and request.meta['not_write_file']:
        return
    if self.file:
        self.file.write(fp + os.linesep)
def request_seen(self, request):
    fp = request_fingerprint(request)
    q = "INSERT INTO seen VALUES (?)"
    args = (fp,)
    try:
        self.conn.execute(q, args)
        self.conn.commit()
    except sqlite3.IntegrityError:
        return True
def request_seen(self, request): """ use sismember judge whether fp is duplicate. """ fp = request_fingerprint(request) if self.server.sismember(self.key, fp): return True self.server.sadd(self.key, fp) return False
def request_seen(self, request):
    fp = request_fingerprint(request)
    added = self.server.basic_publish(
        exchange='',
        routing_key=self.key,
        body=fp
    )
    return not added
def request_seen(self, request):
    added = True
    fp = request_fingerprint(request)
    result = self.collection.find({'fp': fp}).limit(1)
    if not result.count():
        self.collection.insert({'fp': fp})
        added = False
    return added
def process_spider_input(self, response, spider):
    self.pipe_writer.write_request(
        url=response.url,
        status=response.status,
        method=response.request.method,
        rs=len(response.body),
        duration=response.meta.get('download_latency', 0) * 1000,
        parent=response.meta.get(HS_PARENT_ID_KEY),
        fp=request_fingerprint(response.request),
    )
    self._seen[response] = next(self.request_id_sequence)
def splash_request_fingerprint(request, include_headers=None):
    """ Request fingerprint that takes 'splash' meta key into account """
    fp = request_fingerprint(request, include_headers=include_headers)
    if 'splash' not in request.meta:
        return fp
    h = hashlib.sha1(fp)
    for key, value in sorted(request.meta['splash'].items()):
        h.update(key)
        h.update(str(value))
    return h.hexdigest()
def request_seen(self, request): """ use sismember judge whether fp is duplicate. """ fp = request_fingerprint(request) if self.server.sismember(self.key,fp): return True self.server.sadd(self.key, fp) with open('sinxin_debug', 'a') as f: f.write("[url]%s [url2]%s [mode]%s\n" % (request.url, canonicalize_url(request.url), request.method)) return False
def request_fingerprint(self, request):
    """Returns a fingerprint for a given request.

    Parameters
    ----------
    request : scrapy.http.Request

    Returns
    -------
    str

    """
    return request_fingerprint(request)
def request_seen(self, request): """ use sismember judge whether fp is duplicate. """ thisurl = request.url if(thisurl.find('page') != -1 ): return False fp = request_fingerprint(request) if self.server.sismember(self.key,fp): return True self.server.sadd(self.key, fp) return False
def process_spider_input(self, response, spider):
    parent = response.meta.get('_hsparent')
    riq = hsref.job.requests.add(
        parent=parent,
        url=response.url,
        status=response.status,
        method=response.request.method,
        rs=len(response.body),
        duration=response.meta.get('download_latency', 0) * 1000,
        ts=time.time() * 1000,
        fp=request_fingerprint(response.request),
    )
    self._seen[response] = riq