Example #1
 def __init__(self, path, debug, task):
     # dbconnection and getPersistFilter are helpers from the surrounding
     # project; 'task' is assumed to be supplied by the caller.
     RFPDupeFilter.__init__(self, path, debug)
     self.db = dbconnection.getConnection()
     self.cur = self.db.cursor()
     self.task = task
     self.urls = self.loadFromDB()
     self.filter = getPersistFilter(self.task)
Example #3
    def init(self):
        if not self.crawler.settings.getbool('CRAWLGRAPH_ENABLED', True):
            raise NotConfigured()

        # fixme: it should be in spider state
        self.crawler.spider.G = self.G = nx.DiGraph(name='Crawl Graph')
        self.node_ids = itertools.count()
        self.crawler.signals.connect(self.on_spider_closed,
                                     signals.spider_closed)

        self.filename = self.crawler.settings.get('CRAWLGRAPH_FILENAME', None)

        # HACKHACKHACK
        self.dupefilter = RFPDupeFilter()
Example #4
    def test_filter(self):
        dupefilter = RFPDupeFilter()
        dupefilter.open()

        r1 = Request('http://scrapytest.org/1')
        r2 = Request('http://scrapytest.org/2')
        r3 = Request('http://scrapytest.org/2')

        assert not dupefilter.request_seen(r1)
        assert dupefilter.request_seen(r1)

        assert not dupefilter.request_seen(r2)
        assert dupefilter.request_seen(r3)

        dupefilter.close('finished')
Example #5
    def __init__(self, path=None, debug=False):
        logging.info("init redis bloomFilter")
        self.key = "url"
        self.redis_client = redis.Redis(host='127.0.0.1', port=6379)
        error_rate = 0.001
        initial_size = 1000
        try:
            # bf.reserve takes three arguments: key, error_rate and initial_size. A lower
            # error rate needs more space. initial_size is the expected number of items;
            # once the actual count exceeds it, the false-positive rate rises.
            # The defaults are error_rate=0.01 and initial_size=100.
            self.redis_client.execute_command("bf.reserve", self.key,
                                              error_rate, initial_size)
        except ResponseError as e:
            logging.info(e)

        RFPDupeFilter.__init__(self, path)
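
This snippet only reserves the Redis Bloom filter; the lookup side is not shown. A minimal sketch of a matching request_seen override, assuming RedisBloom's BF.ADD command (which returns 1 when the item was not present before) and the base class request_fingerprint:

    def request_seen(self, request):
        # BF.ADD returns 1 when the fingerprint was newly added and 0 when it
        # was (probably) already present, so 0 means "seen before".
        fp = self.request_fingerprint(request)
        added = self.redis_client.execute_command("bf.add", self.key, fp)
        return added == 0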
Example #6
    def test_request_fingerprint(self):
        """Test if customization of request_fingerprint method will change
        output of request_seen.

        """
        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/INDEX.html')

        dupefilter = RFPDupeFilter()
        dupefilter.open()

        assert not dupefilter.request_seen(r1)
        assert not dupefilter.request_seen(r2)

        dupefilter.close('finished')

        class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):

            def request_fingerprint(self, request):
                fp = hashlib.sha1()
                fp.update(request.url.lower().encode('utf-8'))
                return fp.hexdigest()

        case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
        case_insensitive_dupefilter.open()

        assert not case_insensitive_dupefilter.request_seen(r1)
        assert case_insensitive_dupefilter.request_seen(r2)

        case_insensitive_dupefilter.close('finished')
Example #8
    def configure_request_sharing(self):
        if not hasattr(self._baseclass, '_queue_size'):
            self._baseclass._queue_size = 0

        if not hasattr(self._baseclass, 'shared_dupefilter'):
            self._baseclass.shared_dupefilter = RFPDupeFilter.from_settings(
                self.settings)

        if not hasattr(self._baseclass, '_request_queue'):
            self._baseclass._request_queue = PriorityQueue()
Example #9
    def test_request_fingerprint(self):
        """Test if customization of request_fingerprint method will change
        output of request_seen.

        """
        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/INDEX.html')

        dupefilter = RFPDupeFilter()
        dupefilter.open()

        assert not dupefilter.request_seen(r1)
        assert not dupefilter.request_seen(r2)

        dupefilter.close('finished')

        class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):

            def request_fingerprint(self, request):
                fp = hashlib.sha1()
                fp.update(to_bytes(request.url.lower()))
                return fp.hexdigest()

        case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
        case_insensitive_dupefilter.open()

        assert not case_insensitive_dupefilter.request_seen(r1)
        assert case_insensitive_dupefilter.request_seen(r2)

        case_insensitive_dupefilter.close('finished')
Example #10
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        """
        if not settings.getbool('SCRAPY_REDIS_ENABLED'):
            return RFPDupeFilter.from_settings(settings)

        server = get_redis_from_settings(settings)

        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')

        instance = cls(server, key=key, debug=debug)
        if settings.getbool('BLOOMFILTER_ENABLED'):
            instance.bloomfilter = BloomFilter(server, key)
            instance.request_seen = instance.bloom_request_seen
        return instance
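
The bloom_request_seen method assigned above is not shown in this example. A rough sketch of what it might look like, assuming a BloomFilter object that exposes exists() and insert() methods; the method body here is an illustration, not taken from the source:

    def bloom_request_seen(self, request):
        # Check the Bloom filter first; record the fingerprint if it is new.
        fp = self.request_fingerprint(request)
        if self.bloomfilter.exists(fp):
            return True
        self.bloomfilter.insert(fp)
        return False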
Example #11
    def test_seenreq_newlines(self):
        """ Checks against adding duplicate \r to
        line endings on Windows platforms. """

        r1 = Request('http://scrapytest.org/1')

        path = tempfile.mkdtemp()
        try:
            df = RFPDupeFilter(path)
            df.open()
            df.request_seen(r1)
            df.close('finished')

            with open(os.path.join(path, 'requests.seen'), 'rb') as seen_file:
                line = next(seen_file).decode()
                assert not line.endswith('\r\r\n')
                if sys.platform == 'win32':
                    assert line.endswith('\r\n')
                else:
                    assert line.endswith('\n')

        finally:
            shutil.rmtree(path)
Example #12
    def test_dupefilter_path(self):
        r1 = Request('http://scrapytest.org/1')
        r2 = Request('http://scrapytest.org/2')

        path = tempfile.mkdtemp()
        try:
            df = RFPDupeFilter(path)
            df.open()
            assert not df.request_seen(r1)
            assert df.request_seen(r1)
            df.close('finished')

            df2 = RFPDupeFilter(path)
            df2.open()
            assert df2.request_seen(r1)
            assert not df2.request_seen(r2)
            assert df2.request_seen(r2)
            df2.close('finished')
        finally:
            shutil.rmtree(path)
Example #13
 def __init__(self, path=None, debug=None):
     RFPDupeFilter.__init__(self, path, debug)
     self.fingerprints = set()  # the base class expects a set here
     print("[***]  filter running!")
Example #14
 def __init__(self, path=None):
     RFPDupeFilter.__init__(self, path)
Example #15
 def __init__(self, path=None):
     self.url_seen = set()
     RFPDupeFilter.__init__(self, path)
Example #16
class CrawlGraphMiddleware(BaseExtension):
    """
    This spider middleware keeps track of crawl graph.
    The graph is accessible from spider as ``spider.G`` attribute;
    node ID of each response is available as ``response.meta['node_id']``.

    Enable this middleware in settings::

        SPIDER_MIDDLEWARES = {
            'deepdeep.spidermiddlewares.CrawlGraphMiddleware': 400,
        }

    By default each node contains the following information::

        {
            'url': <response url>,
            'original url': <request url (before redirects)>,
            'visited': True/False,  # this is False for links which are not visited yet
            'ok': True/False,       # True if response is a HTTP 200 HTML response
            'priority': <request.priority>
        }

    Spider can add more information to node in two ways:

    1. set ``request.meta['node_data']`` dict with additional node attributes
       when sending the request;
    2. update ``self.G.node[response.meta['node_id']]`` dict after response
       is received (usually in a ``parse_..`` callback).

    Edge data is empty by default; to attach information to edges send requests
    with non-empty ``request.meta['edge_data']`` dicts.
    """
    def init(self):
        if not self.crawler.settings.getbool('CRAWLGRAPH_ENABLED', True):
            raise NotConfigured()

        # fixme: it should be in spider state
        self.crawler.spider.G = self.G = nx.DiGraph(name='Crawl Graph')
        self.node_ids = itertools.count()
        self.crawler.signals.connect(self.on_spider_closed,
                                     signals.spider_closed)

        self.filename = self.crawler.settings.get('CRAWLGRAPH_FILENAME', None)

        # HACKHACKHACK
        self.dupefilter = RFPDupeFilter()

    def on_spider_closed(self):
        if self.filename:
            nx.write_gpickle(self.G, self.filename)

    def process_spider_input(self, response, spider):
        """
        Assign response.node_id attribute, make sure a node exists
        in a graph and update the node with received information.
        """
        if 'node_id' not in response.meta:
            # seed requests don't have node_id yet
            response.meta['node_id'] = next(self.node_ids)

        node_id = response.meta['node_id']
        data = dict(
            url=response.url,
            visited=True,
            ok=self._response_ok(response),
            priority=response.request.priority,
        )
        spider.G.add_node(node_id, data)
        logger.debug("VISITED NODE %s %s", node_id, data)

        self.crawler.stats.inc_value('graph_nodes/visited')
        if data['ok']:
            self.crawler.stats.inc_value('graph_nodes/visited/ok')
        else:
            self.crawler.stats.inc_value('graph_nodes/visited/err')

    def process_spider_output(self, response, result, spider):
        for request in result:
            if isinstance(request, scrapy.Request):
                ok = self._process_outgoing_request(response, request, spider)
                if not ok:
                    continue
            yield request

    def _process_outgoing_request(self, response, request, spider):
        """
        Create new nodes and edges for outgoing requests.
        Data can be attached to nodes and edges using
        ``request.meta['node_data']`` and ``request.meta['edge_data']``
        dicts; these keys are then removed by this middleware.
        """
        if self.dupefilter.request_seen(request):
            return False

        this_node_id = response.meta.get('node_id')
        new_node_id = next(self.node_ids)
        request.meta['node_id'] = new_node_id

        node_data = request.meta.pop('node_data', {})
        node_data.update(
            url=request.url,
            original_url=request.url,
            priority=request.priority,
            visited=False,
            ok=None,
        )
        edge_data = request.meta.pop('edge_data', {})
        spider.G.add_node(new_node_id, node_data)
        spider.G.add_edge(this_node_id, new_node_id, edge_data)
        logger.debug("Created node %s -> %s %s", this_node_id, new_node_id,
                     node_data)
        self.crawler.stats.set_value('graph_nodes/created', len(spider.G))
        return True

    def _response_ok(self, response):
        return response.status == 200 and hasattr(response, 'text')
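
The docstring above lists two ways a spider can attach extra data to the crawl graph. A minimal usage sketch (the spider, selectors and attribute names are hypothetical and assume a Scrapy version with response.follow):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://scrapytest.org/']

    def parse(self, response):
        # (2) update this response's node after the response is received
        node_id = response.meta['node_id']
        self.G.node[node_id]['title'] = response.css('title::text').extract_first()

        for href in response.css('a::attr(href)').extract():
            # (1) attach extra node/edge attributes when sending the request
            yield response.follow(href, callback=self.parse, meta={
                'node_data': {'anchor': href},
                'edge_data': {'source_url': response.url},
            })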
Example #17
 def __init__(self, path=None, debug=False):
   self.urls_seen = set()
   RFPDupeFilter.__init__(self, path)
Example #18
 def __init__(self, path=None, debug=True):
     self.redis_client = RedisHelper.get_instance()
     RFPDupeFilter.__init__(self, path, debug)
Example #19
 def __init__(self, path=None, debug=False):
     RFPDupeFilter.__init__(self, path=path, debug=debug)
     self.rclient = redis.StrictRedis(host="localhost", port=6379, db=0)
Example #20
 def __init__(self, path=None, debug=False):
     self.urls_seen = set()
     RFPDupeFilter.__init__(self, path)
Example #21
 def __init__(self, path=None, debug=False):
     RFPDupeFilter.__init__(self, path=path, debug=debug)
Example #23
 def __init__(self, path=None, debug=False):
     RFPDupeFilter.__init__(self, path)
     self.dupefilter = UrlFilterAndAdd()
Example #24
 def __init__(self, path=None):
     self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
     RFPDupeFilter.__init__(self, path)
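
The subclass above only creates the scalable Bloom filter; how it is consulted is not shown. A possible sketch, assuming the pybloom ScalableBloomFilter API in which add() returns True when the key was already present:

 def request_seen(self, request):
     # pybloom's add() returns True when the key was already in the filter.
     fp = self.request_fingerprint(request)
     return self.urls_sbf.add(fp)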
Example #25
 def __init__(self, path=None, other=None):
     inmem = [it['url'] for it in MongoClient(settings['DBINFO']).nbbs.dsl.find({'out': 1})]
     self.already_seen = set(inmem)
     RFPDupeFilter.__init__(self, path, other)
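
How already_seen is consulted is not shown in this example. A hypothetical request_seen override that combines the MongoDB-seeded set with the base fingerprint check:

 def request_seen(self, request):
     # Treat URLs pre-loaded from MongoDB as already seen, then fall back to
     # the standard fingerprint check.
     if request.url in self.already_seen:
         return True
     return RFPDupeFilter.request_seen(self, request)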