Example #1
    def __init__(self, crawler):
        StatsCollector.__init__(self, crawler)

        self.redis = redis_from_settings(crawler.settings)
        self.crawler = crawler
        self.key = '%s:stats' % crawler.spidercls.name

        self.encoding = crawler.settings['REDIS_ENCODING']
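
Scrapy only uses a collector like this if it is configured as the project's stats class. A minimal sketch of the settings side, assuming the class above is importable as myproject.stats.RedisStatsCollector (a hypothetical path):

# settings.py
STATS_CLASS = 'myproject.stats.RedisStatsCollector'  # hypothetical import path
REDIS_ENCODING = 'utf-8'  # read by the __init__ above via crawler.settings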
Example #2
    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('scrapytest.org')

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)
Example #3
    def setUp(self):
        self.spider_name = 'df_tests'
        self.spider = Spider(self.spider_name)

        # DeltaFetch creates .db files named after the spider
        self.temp_dir = tempfile.gettempdir()
        self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name)

        crawler = get_crawler(Spider)
        self.stats = StatsCollector(crawler)
Example #4
    def parse(self, response):
        # get the question blocks from the response
        question_blocks = Selector(
            text=json.loads(response.body.decode("utf-8"))['msg'][1]).xpath(
                '//div[contains(@itemtype, "http://schema.org/Question")]')

        for question_block in question_blocks:
            item = ZhihuapiItem()
            item['question_name'] = question_block.xpath(
                './/div/div/h2/a/text()').extract_first()
            item['question_url'] = question_block.xpath(
                './/div/div/h2/a/@href').extract_first()
            item['question_answer'] = question_block.xpath(
                './/div/div/div[1]/div[5]/div/a/@href').extract_first()
            item['question_answer_author_profile'] = question_block.xpath(
                './/div/div/div[1]/div[3]/span/span[1]/a/@href').extract_first(
                )
            item['question_answer_author'] = question_block.xpath(
                './/div/div/div[1]/div[3]/span/span[1]/a/text()'
            ).extract_first()

            self.logger.info(
                'Question info: question name - {}, question answer - {}, question url - {}, question answer author profile - {}, question answer author - {}'
                .format(item['question_name'], item['question_answer'],
                        item['question_url'],
                        item['question_answer_author_profile'],
                        item['question_answer_author']))

            yield item

        if question_blocks:
            last_data_score = question_blocks[-1].xpath(
                '@data-score').extract_first()
        else:
            self.logger.info("No more new questions, waiting to stop...")
            # close stats collection through the crawler's StatsCollector
            # instance; calling the unbound class method with the spider as
            # `self` would not work
            self.crawler.stats.close_spider(
                self, reason="No more questions...")
            # last_data_score is undefined on this branch, so stop here
            return

        self.logger.info('Last Data Score is - {}'.format(last_data_score))
        yield scrapy.http.FormRequest(self.topic_url,
                                      method='POST',
                                      headers=self.headers,
                                      formdata={
                                          'start': '0',
                                          'offset': str(last_data_score)
                                      },
                                      callback=self.parse)
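
Closing the stats collector by hand, as above, does not by itself stop the crawl. A minimal sketch of the more conventional pattern, assuming a hypothetical spider (the class name and XPath are illustrative only), raises CloseSpider from the callback instead:

import scrapy
from scrapy.exceptions import CloseSpider

class StopWhenEmptySpider(scrapy.Spider):
    # hypothetical spider, for illustration only
    name = 'stop_when_empty'

    def parse(self, response):
        question_blocks = response.xpath(
            '//div[contains(@itemtype, "http://schema.org/Question")]')
        if not question_blocks:
            # the reason string ends up in the 'finish_reason' stat
            raise CloseSpider('No more questions...')
        for block in question_blocks:
            yield {'url': block.xpath('.//a/@href').extract_first()}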
Example #5
    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider("scrapytest.org")

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)
Example #6
async def _request_handler(request: pyppeteer.network_manager.Request,
                           scrapy_request: Request,
                           stats: StatsCollector) -> None:
    # set headers, method and body
    if request.url == scrapy_request.url:
        overrides = {
            "method": scrapy_request.method,
            "headers": {
                key.decode("utf-8").lower(): value[0].decode("utf-8")
                for key, value in scrapy_request.headers.items()
            },
        }
        if scrapy_request.body:
            overrides["postData"] = scrapy_request.body.decode(
                scrapy_request.encoding)
    else:
        overrides = {"headers": request.headers.copy()}
        if scrapy_request.headers.get("user-agent"):
            user_agent = scrapy_request.headers["user-agent"].decode("utf-8")
            overrides["headers"]["user-agent"] = user_agent
    await request.continue_(overrides)
    # increment stats
    stats.inc_value("pyppeteer/request_method_count/{}".format(request.method))
    stats.inc_value("pyppeteer/request_count")
    if request.isNavigationRequest():
        stats.inc_value("pyppeteer/request_count/navigation")
Example #7
class TestDepthMiddleware(TestCase):

    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('scrapytest.org')

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        req.meta['depth'] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, '')
Example #8
class TestDepthMiddleware(TestCase):
    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider("scrapytest.org")

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request("http://scrapytest.org")
        resp = Response("http://scrapytest.org")
        resp.request = req
        result = [Request("http://scrapytest.org")]

        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        rdc = self.stats.get_value("request_depth_count/1", spider=self.spider)
        self.assertEqual(rdc, 1)

        req.meta["depth"] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value("request_depth_max", spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, "")
Example #9
 def test_core_stats_default_stats_collector(self, mock_datetime):
     fixed_datetime = datetime(2019, 12, 1, 11, 38)
     mock_datetime.utcnow = mock.Mock(return_value=fixed_datetime)
     self.crawler.stats = StatsCollector(self.crawler)
     ext = CoreStats.from_crawler(self.crawler)
     ext.spider_opened(self.spider)
     ext.item_scraped({}, self.spider)
     ext.response_received(self.spider)
     ext.item_dropped({}, self.spider, ZeroDivisionError())
     ext.spider_closed(self.spider, 'finished')
     self.assertEqual(
         ext.stats._stats, {
             'start_time': fixed_datetime,
             'finish_time': fixed_datetime,
             'item_scraped_count': 1,
             'response_received_count': 1,
             'item_dropped_count': 1,
             'item_dropped_reasons_count/ZeroDivisionError': 1,
             'finish_reason': 'finished',
             'elapsed_time_seconds': 0.0,
         })
Example #10
async def _response_handler(response: pyppeteer.network_manager.Response,
                            stats: StatsCollector):
    stats.inc_value("pyppeteer/response_count")
    stats.inc_value("pyppeteer/response_status_count/{}".format(
        response.status))
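
As with the request handler, the event wiring sits outside the snippet; a minimal sketch, again assuming pyppeteer's event API (the helper name is hypothetical):

import asyncio

def _enable_response_tracking(page, stats):
    # wrap the coroutine because page.on takes a plain callable
    page.on(
        'response',
        lambda response: asyncio.ensure_future(
            _response_handler(response, stats)),
    )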
Example #11
class DeltaFetchTestCase(TestCase):

    mwcls = DeltaFetch

    def setUp(self):
        self.spider_name = 'df_tests'
        self.spider = Spider(self.spider_name)

        # DeltaFetch creates .db files named after the spider
        self.temp_dir = tempfile.gettempdir()
        self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name)

        crawler = get_crawler(Spider)
        self.stats = StatsCollector(crawler)

    def test_init(self):
        # any path format is accepted; the directory is not created
        instance = self.mwcls('/any/dir', True, stats=self.stats)
        assert isinstance(instance, self.mwcls)
        self.assertEqual(instance.dir, '/any/dir')
        self.assertEqual(self.stats.get_stats(), {})
        self.assertEqual(instance.reset, True)

    def test_init_from_crawler(self):
        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
             mock.patch('scrapy.utils.project.inside_project') as in_project:
            data_dir.return_value = self.temp_dir
            in_project.return_value = True

            # simple project_data_dir mock with basic settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
                                         'DELTAFETCH_DIR': 'other',
                                         'DELTAFETCH_RESET': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_spider_opened_new(self):
        """Middleware should create a .db file if not found."""
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert os.path.isdir(self.temp_dir)
        assert os.path.exists(self.db_path)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing(self):
        """Middleware should open and use existing and valid .db files."""
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
                                 (b'test_key_2', b'test_v_2')]
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_corrupt_dbfile(self):
        """Middleware should create a new .db if it cannot open it."""
        # create an invalid .db file
        with open(self.db_path, "wb") as dbfile:
            dbfile.write(b'bad')
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')

        # file corruption is only detected when opening spider
        mw.spider_opened(self.spider)
        assert os.path.isdir(self.temp_dir)
        assert os.path.exists(self.db_path)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))

        # and db should be empty (it was re-created)
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing_spider_reset(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_opened_reset_non_existing_db(self):
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        # there's different logic for different bdb versions:
        # it can fail when opening a non-existing db with truncate flag,
        # then it should be caught and retried with rm & create flag
        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
                mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)

    def test_spider_opened_recreate(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_closed(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        mw.spider_closed(self.spider)
        self.assertRaises(dbmodule.db.DBError, mw.db.fd)

    def test_process_spider_output(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        result = [
            # same URL but with new key --> it should be processed
            Request('http://url', meta={'deltafetch_key': 'key1'}),

            # 'test_key_1' is already in the test db --> it should be skipped
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        # so only the first request should go through
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])

        # the skipped "http://url1" should be counted in stats
        self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})

        # b'key' should not be in the db yet, since no item has been collected
        self.assertEqual(set(mw.db.keys()),
                         set([b'test_key_1',
                              b'test_key_2']))

        # if the spider returns items, the request's key is added in db
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(set(mw.db.keys()),
                         set([b'key',
                              b'test_key_1',
                              b'test_key_2']))
        assert mw.db[b'key']

    def test_process_spider_output_dict(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = [{"somekey": "somevalue"}]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(set(mw.db.keys()),
                         set([b'key',
                              b'test_key_1',
                              b'test_key_2']))
        assert mw.db[b'key']

    def test_process_spider_output_stats(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1)
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), 1)

    def test_init_from_crawler_legacy(self):
        # test with subclass not handling passed stats
        class LegacyDeltaFetchSubClass(self.mwcls):

            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
                self.something = True

        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)

        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
             mock.patch('scrapy.utils.project.inside_project') as in_project:
            data_dir.return_value = self.temp_dir
            in_project.return_value = True

            # simple project_data_dir mock with basic settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
                                         'DELTAFETCH_DIR': 'other',
                                         'DELTAFETCH_RESET': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_process_spider_output_stats_legacy(self):
        # test that a subclass which does not handle stats still works at runtime
        # (i.e. trying to update stats does not trigger an exception)
        class LegacyDeltaFetchSubClass(self.mwcls):

            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
                self.something = True

        self._create_test_db()
        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]

        # stats should not be updated
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)

        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)

    def test_get_key(self):
        mw = self.mwcls(self.temp_dir, reset=True)
        test_req1 = Request('http://url1')
        self.assertEqual(mw._get_key(test_req1),
                         to_bytes(request_fingerprint(test_req1)))
        test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'})
        self.assertEqual(mw._get_key(test_req2), b'dfkey1')

        test_req3 = Request('http://url2', meta={'deltafetch_key': u'dfkey1'})
        # key will be converted to bytes
        self.assertEqual(mw._get_key(test_req3), b'dfkey1')

    def _create_test_db(self):
        db = dbmodule.db.DB()
        # truncate test db if there were failed tests
        db.open(self.db_path, dbmodule.db.DB_HASH,
                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
        db[b'test_key_1'] = b'test_v_1'
        db[b'test_key_2'] = b'test_v_2'
        db.close()
Example #12
class DeltaFetchTestCase(TestCase):

    mwcls = DeltaFetch

    def setUp(self):
        self.spider = Spider('df_tests')
        self.temp_dir = tempfile.gettempdir()
        self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
        crawler = get_crawler(Spider)
        self.stats = StatsCollector(crawler)

    def test_init(self):
        # any path format is accepted; the directory is not created
        instance = self.mwcls('/any/dir', True, stats=self.stats)
        assert isinstance(instance, self.mwcls)
        self.assertEqual(instance.dir, '/any/dir')
        self.assertEqual(self.stats.get_stats(), {})
        self.assertEqual(instance.reset, True)

    def test_init_from_crawler(self):
        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir:
            data_dir.return_value = self.temp_dir

            # simple project_data_dir mock with basic settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(instance.dir,
                             os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({
                'DELTAFETCH_ENABLED': True,
                'DELTAFETCH_DIR': 'other',
                'DELTAFETCH_RESET': True
            })
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(instance.dir,
                             os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_spider_opened_new(self):
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert os.path.isdir(self.temp_dir)
        assert os.path.exists(self.db_path)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == [('test_key_1', 'test_v_1'),
                                 ('test_key_2', 'test_v_2')]
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing_spider_reset(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_opened_reset_non_existing_db(self):
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        # there's different logic for different bdb versions:
        # it can fail when opening a non-existing db with truncate flag,
        # then it should be caught and retried with rm & create flag
        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE
                or mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)

    def test_spider_opened_recreate(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_closed(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        mw.spider_closed(self.spider)
        self.assertRaises(dbmodule.db.DBError, mw.db.fd)

    def test_process_spider_output(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)), [])
        result = [
            Request('http://url', meta={'deltafetch_key': 'key1'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            [result[0]])
        self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})
        result = [BaseItem(), "not a base item"]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            result)
        self.assertEqual(mw.db.keys(), ['test_key_1', 'key', 'test_key_2'])
        assert mw.db['key']

    def test_process_spider_output_stats(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1)
        result = [BaseItem(), "not a base item"]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), 1)

    def test_init_from_crawler_legacy(self):
        # test with subclass not handling passed stats
        class LegacyDeltaFetchSubClass(self.mwcls):
            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir,
                                                               reset=reset)
                self.something = True

        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)

        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir:
            data_dir.return_value = self.temp_dir

            # simple project_data_dir mock with basic settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(instance.dir,
                             os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({
                'DELTAFETCH_ENABLED': True,
                'DELTAFETCH_DIR': 'other',
                'DELTAFETCH_RESET': True
            })
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(instance.dir,
                             os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_process_spider_output_stats_legacy(self):
        # test that a subclass which does not handle stats still works at runtime
        # (i.e. trying to update stats does not trigger an exception)
        class LegacyDeltaFetchSubClass(self.mwcls):
            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir,
                                                               reset=reset)
                self.something = True

        self._create_test_db()
        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]

        # stats should not be updated
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)

        result = [BaseItem(), "not a base item"]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)

    def test_get_key(self):
        mw = self.mwcls(self.temp_dir, reset=True)
        test_req1 = Request('http://url1')
        self.assertEqual(mw._get_key(test_req1),
                         request_fingerprint(test_req1))
        test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
        self.assertEqual(mw._get_key(test_req2), 'dfkey1')

    def _create_test_db(self):
        db = dbmodule.db.DB()
        # truncate test db if there were failed tests
        db.open(self.db_path, dbmodule.db.DB_HASH,
                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
        db['test_key_1'] = 'test_v_1'
        db['test_key_2'] = 'test_v_2'
        db.close()
Example #13
class DeltaFetchTestCase(TestCase):

    mwcls = DeltaFetch

    def setUp(self):
        self.spider = Spider('df_tests')
        self.temp_dir = tempfile.gettempdir()
        self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
        crawler = get_crawler(Spider)
        self.stats = StatsCollector(crawler)

    def test_init(self):
        # any path format is accepted; the directory is not created
        instance = self.mwcls('/any/dir', True, stats=self.stats)
        assert isinstance(instance, self.mwcls)
        self.assertEqual(instance.dir, '/any/dir')
        self.assertEqual(self.stats.get_stats(), {})
        self.assertEqual(instance.reset, True)

    def test_init_from_crawler(self):
        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir:
            data_dir.return_value = self.temp_dir

            # simple project_data_dir mock with basic settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
                                         'DELTAFETCH_DIR': 'other',
                                         'DELTAFETCH_RESET': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_spider_opened_new(self):
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert os.path.isdir(self.temp_dir)
        assert os.path.exists(self.db_path)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
                                 (b'test_key_2', b'test_v_2')]
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing_spider_reset(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_opened_reset_non_existing_db(self):
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        # there's different logic for different bdb versions:
        # it can fail when opening a non-existing db with truncate flag,
        # then it should be caught and retried with rm & create flag
        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
                mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)

    def test_spider_opened_recreate(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_closed(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        mw.spider_closed(self.spider)
        self.assertRaises(dbmodule.db.DBError, mw.db.fd)

    def test_process_spider_output(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        result = [
            Request('http://url', meta={'deltafetch_key': 'key1'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(mw.db.keys(), [b'test_key_1', b'key', b'test_key_2'])
        assert mw.db[b'key']

    def test_process_spider_output_stats(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1)
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), 1)

    def test_init_from_crawler_legacy(self):
        # test with subclass not handling passed stats
        class LegacyDeltaFetchSubClass(self.mwcls):

            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
                self.something = True

        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)

        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir:
            data_dir.return_value = self.temp_dir

            # simple project_data_dir mock with basic settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
                                         'DELTAFETCH_DIR': 'other',
                                         'DELTAFETCH_RESET': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_process_spider_output_stats_legacy(self):
        # test that a subclass which does not handle stats still works at runtime
        # (i.e. trying to update stats does not trigger an exception)
        class LegacyDeltaFetchSubClass(self.mwcls):

            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
                self.something = True

        self._create_test_db()
        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]

        # stats should not be updated
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)

        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)

    def test_get_key(self):
        mw = self.mwcls(self.temp_dir, reset=True)
        test_req1 = Request('http://url1')
        self.assertEqual(mw._get_key(test_req1),
                         to_bytes(request_fingerprint(test_req1)))
        test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
        self.assertEqual(mw._get_key(test_req2), b'dfkey1')

    def _create_test_db(self):
        db = dbmodule.db.DB()
        # truncate test db if there were failed tests
        db.open(self.db_path, dbmodule.db.DB_HASH,
                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
        db.put(b'test_key_1', b'test_v_1')
        db.put(b'test_key_2', b'test_v_2')
        db.close()
Example #14
File: test_stats.py Project: 01-/scrapy
 def test_collector(self):
     stats = StatsCollector(self.crawler)
     self.assertEqual(stats.get_stats(), {})
     self.assertEqual(stats.get_value('anything'), None)
     self.assertEqual(stats.get_value('anything', 'default'), 'default')
     stats.set_value('test', 'value')
     self.assertEqual(stats.get_stats(), {'test': 'value'})
     stats.set_value('test2', 23)
     self.assertEqual(stats.get_stats(), {'test': 'value', 'test2': 23})
     self.assertEqual(stats.get_value('test2'), 23)
     stats.inc_value('test2')
     self.assertEqual(stats.get_value('test2'), 24)
     stats.inc_value('test2', 6)
     self.assertEqual(stats.get_value('test2'), 30)
     stats.max_value('test2', 6)
     self.assertEqual(stats.get_value('test2'), 30)
     stats.max_value('test2', 40)
     self.assertEqual(stats.get_value('test2'), 40)
     stats.max_value('test3', 1)
     self.assertEqual(stats.get_value('test3'), 1)
     stats.min_value('test2', 60)
     self.assertEqual(stats.get_value('test2'), 40)
     stats.min_value('test2', 35)
     self.assertEqual(stats.get_value('test2'), 35)
     stats.min_value('test4', 7)
     self.assertEqual(stats.get_value('test4'), 7)
Example #15
 def _min_value(self, key, value, spider=None):
     StatsCollector.min_value(self, key, value)
     self.max_min_value(min, key, value)
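
A helper like this only makes sense inside a StatsCollector subclass. A minimal sketch of how such an override might be wired, where max_min_value is a hypothetical hook that forwards values to an external backend:

from scrapy.statscollectors import StatsCollector

class ForwardingStatsCollector(StatsCollector):
    # hypothetical subclass name, for illustration only

    def min_value(self, key, value, spider=None):
        # keep Scrapy's normal in-memory bookkeeping ...
        super().min_value(key, value, spider=spider)
        # ... then forward the candidate minimum to the external backend
        self.max_min_value(min, key, value)

    def max_min_value(self, comparator, key, value):
        # placeholder: push (key, value) to e.g. statsd or graphite here
        pass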
Example #16
def assert_stats(stats: StatsCollector, expected: dict):
    for k, expected_val in expected.items():
        actual_val = stats.get_value(k, 0)
        assert actual_val == expected_val, \
            f"key: '{k}', value: {actual_val}, expected: {expected_val}"
Example #17
 def setUp(self):
     self.spider = Spider('df_tests')
     self.temp_dir = tempfile.gettempdir()
     self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
     crawler = get_crawler(Spider)
     self.stats = StatsCollector(crawler)
Example #18
 def setUp(self):
     self.spider = Spider('df_tests')
     self.temp_dir = tempfile.gettempdir()
     self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
     crawler = get_crawler(Spider)
     self.stats = StatsCollector(crawler)
Example #19
 def test_collector(self):
     stats = StatsCollector(self.crawler)
     self.assertEqual(stats.get_stats(), {})
     self.assertEqual(stats.get_value('anything'), None)
     self.assertEqual(stats.get_value('anything', 'default'), 'default')
     stats.set_value('test', 'value')
     self.assertEqual(stats['test'], 'value')
     self.assertEqual(stats.get_stats(), {'test': 'value'})
     stats['test'] = 'value2'
     self.assertIn('test', stats)
     self.assertEqual(stats['test'], 'value2')
     self.assertEqual(next(iter(stats)), 'test')
     self.assertEqual(stats.get_stats(), {'test': 'value2'})
     stats['test'] = 'value'
     stats.set_value('test2', 23)
     self.assertEqual(stats.get_stats(), {'test': 'value', 'test2': 23})
     self.assertEqual(stats.get_value('test2'), 23)
     stats.inc_value('test2')
     self.assertEqual(stats.get_value('test2'), 24)
     stats.inc_value('test2', 6)
     self.assertEqual(stats.get_value('test2'), 30)
     stats.max_value('test2', 6)
     self.assertEqual(stats.get_value('test2'), 30)
     stats.max_value('test2', 40)
     self.assertEqual(stats.get_value('test2'), 40)
     stats.max_value('test3', 1)
     self.assertEqual(stats.get_value('test3'), 1)
     stats.min_value('test2', 60)
     self.assertEqual(stats.get_value('test2'), 40)
     stats.min_value('test2', 35)
     self.assertEqual(stats.get_value('test2'), 35)
     stats.min_value('test4', 7)
     self.assertEqual(stats.get_value('test4'), 7)
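
Outside of tests, the same collector is reachable at crawl time as crawler.stats, so spider code can update it directly. A minimal sketch, with a hypothetical spider and stat keys:

import scrapy

class CountingSpider(scrapy.Spider):
    # hypothetical spider, for illustration only
    name = 'counting'
    start_urls = ['http://scrapytest.org']

    def parse(self, response):
        # self.crawler.stats is the StatsCollector exercised in the tests above
        self.crawler.stats.inc_value('myspider/pages_seen')
        self.crawler.stats.max_value('myspider/longest_body', len(response.body))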