def __init__(self, path, debug):
    RFPDupeFilter.__init__(self, path, debug)
    # dbconnection, task and getPersistFilter are assumed to be defined at
    # module level in the original project; seen URLs are persisted in a DB.
    self.db = dbconnection.getConnection()
    self.cur = self.db.cursor()
    self.task = task
    self.urls = self.loadFromDB()
    self.filter = getPersistFilter(self.task)
def test_filter(self):
    dupefilter = RFPDupeFilter()
    dupefilter.open()

    r1 = Request('http://scrapytest.org/1')
    r2 = Request('http://scrapytest.org/2')
    r3 = Request('http://scrapytest.org/2')

    assert not dupefilter.request_seen(r1)
    assert dupefilter.request_seen(r1)

    assert not dupefilter.request_seen(r2)
    assert dupefilter.request_seen(r3)

    dupefilter.close('finished')
def __init__(self, path=None, debug=False):
    logging.info("init redis bloomFilter")
    self.key = "url"
    self.redis_client = redis.Redis(host='127.0.0.1', port=6379)
    error_rate = 0.001
    initial_size = 1000
    try:
        # bf.reserve takes three arguments: key, error_rate and initial_size.
        # A lower error rate requires more space; initial_size is the expected
        # number of elements to store -- once the actual count exceeds it, the
        # false positive rate rises. The defaults are error_rate=0.01,
        # initial_size=100.
        self.redis_client.execute_command("bf.reserve", self.key, error_rate, initial_size)
    except ResponseError as e:
        logging.info(e)
    RFPDupeFilter.__init__(self, path)
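A hedged sketch of a matching request_seen for the Redis bloom-filter __init__ above: with RedisBloom, BF.ADD returns 1 when the item was newly inserted and 0 when it was (probably) seen before, so a single round trip both checks and records the fingerprint. The method body is an assumption, not the original project's code.

def request_seen(self, request):
    # RFPDupeFilter.request_fingerprint() returns a stable hex digest for the request.
    fp = self.request_fingerprint(request)
    # BF.ADD -> 1 if newly added, 0 if the filter (probably) already contains it.
    added = self.redis_client.execute_command("bf.add", self.key, fp)
    return added == 0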
def test_request_fingerprint(self):
    """Test if customization of request_fingerprint method will change
    output of request_seen.
    """
    r1 = Request('http://scrapytest.org/index.html')
    r2 = Request('http://scrapytest.org/INDEX.html')

    dupefilter = RFPDupeFilter()
    dupefilter.open()

    assert not dupefilter.request_seen(r1)
    assert not dupefilter.request_seen(r2)

    dupefilter.close('finished')

    class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):

        def request_fingerprint(self, request):
            fp = hashlib.sha1()
            # hashlib requires bytes, so encode the lowercased URL
            fp.update(request.url.lower().encode('utf8'))
            return fp.hexdigest()

    case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
    case_insensitive_dupefilter.open()

    assert not case_insensitive_dupefilter.request_seen(r1)
    assert case_insensitive_dupefilter.request_seen(r2)

    case_insensitive_dupefilter.close('finished')
def configure_request_sharing(self):
    if not hasattr(self._baseclass, '_queue_size'):
        self._baseclass._queue_size = 0
    if not hasattr(self._baseclass, 'shared_dupefilter'):
        self._baseclass.shared_dupefilter = RFPDupeFilter.from_settings(self.settings)
    if not hasattr(self._baseclass, '_request_queue'):
        self._baseclass._request_queue = PriorityQueue()
def test_request_fingerprint(self):
    """Test if customization of request_fingerprint method will change
    output of request_seen.
    """
    r1 = Request('http://scrapytest.org/index.html')
    r2 = Request('http://scrapytest.org/INDEX.html')

    dupefilter = RFPDupeFilter()
    dupefilter.open()

    assert not dupefilter.request_seen(r1)
    assert not dupefilter.request_seen(r2)

    dupefilter.close('finished')

    class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):

        def request_fingerprint(self, request):
            fp = hashlib.sha1()
            fp.update(to_bytes(request.url.lower()))
            return fp.hexdigest()

    case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
    case_insensitive_dupefilter.open()

    assert not case_insensitive_dupefilter.request_seen(r1)
    assert case_insensitive_dupefilter.request_seen(r2)

    case_insensitive_dupefilter.close('finished')
def from_settings(cls, settings):
    """Returns an instance from given settings.

    This uses by default the key ``dupefilter:<timestamp>``. When using the
    ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
    it needs to pass the spider name in the key.
    """
    if not settings.getbool('SCRAPY_REDIS_ENABLED'):
        return RFPDupeFilter.from_settings(settings)

    server = get_redis_from_settings(settings)
    key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
    debug = settings.getbool('DUPEFILTER_DEBUG')
    instance = cls(server, key=key, debug=debug)

    if settings.getbool('BLOOMFILTER_ENABLED'):
        instance.bloomfilter = BloomFilter(server, key)
        instance.request_seen = instance.bloom_request_seen

    return instance
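A minimal settings sketch for the from_settings hook above. The dotted class path is hypothetical; SCRAPY_REDIS_ENABLED, BLOOMFILTER_ENABLED and DUPEFILTER_DEBUG are the settings the snippet itself reads, and REDIS_URL is the standard scrapy_redis connection setting.

# Hedged example settings -- adjust names and values to your project.
custom_settings = {
    # hypothetical dotted path to the dupefilter subclass shown above
    'DUPEFILTER_CLASS': 'myproject.dupefilters.RedisBloomDupeFilter',
    'SCRAPY_REDIS_ENABLED': True,   # False falls back to the plain RFPDupeFilter
    'BLOOMFILTER_ENABLED': True,    # swaps request_seen for bloom_request_seen
    'DUPEFILTER_DEBUG': False,
    'REDIS_URL': 'redis://localhost:6379/0',  # read by get_redis_from_settings()
}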
def test_seenreq_newlines(self):
    """ Checks against adding duplicate \r to
    line endings on Windows platforms. """

    r1 = Request('http://scrapytest.org/1')
    path = tempfile.mkdtemp()
    try:
        df = RFPDupeFilter(path)
        df.open()
        df.request_seen(r1)
        df.close('finished')

        with open(os.path.join(path, 'requests.seen'), 'rb') as seen_file:
            line = next(seen_file).decode()
            assert not line.endswith('\r\r\n')
            if sys.platform == 'win32':
                assert line.endswith('\r\n')
            else:
                assert line.endswith('\n')
    finally:
        shutil.rmtree(path)
def test_dupefilter_path(self):
    r1 = Request('http://scrapytest.org/1')
    r2 = Request('http://scrapytest.org/2')

    path = tempfile.mkdtemp()
    try:
        df = RFPDupeFilter(path)
        df.open()
        assert not df.request_seen(r1)
        assert df.request_seen(r1)
        df.close('finished')

        df2 = RFPDupeFilter(path)
        df2.open()
        assert df2.request_seen(r1)
        assert not df2.request_seen(r2)
        assert df2.request_seen(r2)
        df2.close('finished')
    finally:
        shutil.rmtree(path)
def __init__(self, path=None, debug=None):
    RFPDupeFilter.__init__(self, path, debug)
    self.fingerprints = {}
    print("[***] filter running!")
def __init__(self, path=None):
    RFPDupeFilter.__init__(self, path)
def __init__(self, path=None):
    self.url_seen = set()
    RFPDupeFilter.__init__(self, path)
class CrawlGraphMiddleware(BaseExtension):
    """
    This spider middleware keeps track of crawl graph.
    The graph is accessible from spider as ``spider.G`` attribute;
    node ID of each response is available as ``response.meta['node_id']``.

    Enable this middleware in settings::

        SPIDER_MIDDLEWARES = {
            'deepdeep.spidermiddlewares.CrawlGraphMiddleware': 400,
        }

    By default each node contains the following information::

        {
            'url': <response url>,
            'original_url': <request url (before redirects)>,
            'visited': True/False,  # False for links which are not visited yet
            'ok': True/False,       # True if response is a HTTP 200 HTML response
            'priority': <request.priority>
        }

    Spider can add more information to node in two ways:

    1. set ``request.meta['node_data']`` dict with additional node attributes
       when sending the request;
    2. update ``self.G.node[response.meta['node_id']]`` dict after response
       is received (usually in a ``parse_..`` callback).

    Edge data is empty by default; to attach information to edges send
    requests with non-empty ``request.meta['edge_data']`` dicts.
    """

    def init(self):
        if not self.crawler.settings.getbool('CRAWLGRAPH_ENABLED', True):
            raise NotConfigured()

        # fixme: it should be in spider state
        self.crawler.spider.G = self.G = nx.DiGraph(name='Crawl Graph')
        self.node_ids = itertools.count()
        self.crawler.signals.connect(self.on_spider_closed,
                                     signals.spider_closed)
        self.filename = self.crawler.settings.get('CRAWLGRAPH_FILENAME', None)

        # HACKHACKHACK
        self.dupefilter = RFPDupeFilter()

    def on_spider_closed(self):
        if self.filename:
            nx.write_gpickle(self.G, self.filename)

    def process_spider_input(self, response, spider):
        """
        Assign response.node_id attribute, make sure a node exists
        in a graph and update the node with received information.
        """
        if 'node_id' not in response.meta:
            # seed requests don't have node_id yet
            response.meta['node_id'] = next(self.node_ids)

        node_id = response.meta['node_id']
        data = dict(
            url=response.url,
            visited=True,
            ok=self._response_ok(response),
            priority=response.request.priority,
        )
        spider.G.add_node(node_id, data)
        logger.debug("VISITED NODE %s %s", node_id, data)

        self.crawler.stats.inc_value('graph_nodes/visited')
        if data['ok']:
            self.crawler.stats.inc_value('graph_nodes/visited/ok')
        else:
            self.crawler.stats.inc_value('graph_nodes/visited/err')

    def process_spider_output(self, response, result, spider):
        for request in result:
            if isinstance(request, scrapy.Request):
                ok = self._process_outgoing_request(response, request, spider)
                if not ok:
                    continue
            yield request

    def _process_outgoing_request(self, response, request, spider):
        """
        Create new nodes and edges for outgoing requests.
        Data can be attached to nodes and edges using
        ``request.meta['node_data']`` and ``request.meta['edge_data']``
        dicts; these keys are then removed by this middleware.
        """
        if self.dupefilter.request_seen(request):
            return False

        this_node_id = response.meta.get('node_id')
        new_node_id = next(self.node_ids)
        request.meta['node_id'] = new_node_id

        node_data = request.meta.pop('node_data', {})
        node_data.update(
            url=request.url,
            original_url=request.url,
            priority=request.priority,
            visited=False,
            ok=None,
        )
        edge_data = request.meta.pop('edge_data', {})
        spider.G.add_node(new_node_id, node_data)
        spider.G.add_edge(this_node_id, new_node_id, edge_data)
        logger.debug("Created node %s -> %s %s",
                     this_node_id, new_node_id, node_data)
        self.crawler.stats.set_value('graph_nodes/created', len(spider.G))
        return True

    def _response_ok(self, response):
        return response.status == 200 and hasattr(response, 'text')
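A short usage sketch for the middleware above, assuming a hypothetical spider. Only spider.G, response.meta['node_id'] and the node_data/edge_data meta keys come from the docstring; the spider name, URLs and attribute names are illustrative.

import scrapy


class GraphExampleSpider(scrapy.Spider):
    # Hypothetical spider showing the two ways of attaching data described
    # in the CrawlGraphMiddleware docstring.
    name = 'graph-example'
    start_urls = ['http://example.com/']

    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            'deepdeep.spidermiddlewares.CrawlGraphMiddleware': 400,
        },
    }

    def parse(self, response):
        # Way 2: enrich this response's node after it has been visited.
        self.G.node[response.meta['node_id']]['page_type'] = 'listing'  # illustrative attribute

        for href in response.css('a::attr(href)').extract():
            # Way 1: attach data to the new node and edge via request.meta;
            # the middleware pops these keys itself.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse,
                meta={
                    'node_data': {'discovered_from': response.url},
                    'edge_data': {'link_href': href},
                },
            )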
def __init__(self, path=None, debug=False):
    self.urls_seen = set()
    RFPDupeFilter.__init__(self, path)
def __init__(self, path=None, debug=True):
    self.redis_client = RedisHelper.get_instance()
    RFPDupeFilter.__init__(self, path, debug)
def __init__(self, path=None, debug=False):
    # forward the constructor arguments instead of hard-coding the defaults
    RFPDupeFilter.__init__(self, path=path, debug=debug)
    self.rclient = redis.StrictRedis(host="localhost", port=6379, db=0)
def __init__(self, path=None, debug=False):
    RFPDupeFilter.__init__(self, path=path, debug=debug)
def __init__(self, path=None, debug=False):
    RFPDupeFilter.__init__(self, path)
    self.dupefilter = UrlFilterAndAdd()
def __init__(self, path=None):
    self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    RFPDupeFilter.__init__(self, path)
def __init__(self, path=None, other=None):
    inmem = [it['url'] for it in MongoClient(settings['DBINFO']).nbbs.dsl.find({'out': 1})]
    self.already_seen = set(inmem)
    RFPDupeFilter.__init__(self, path, other)