def __init__(self, crawler):
    StatsCollector.__init__(self, crawler)
    self.redis = redis_from_settings(crawler.settings)
    self.crawler = crawler
    self.key = '%s:stats' % crawler.spidercls.name
    self.encoding = crawler.settings['REDIS_ENCODING']
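The snippet above only wires up the connection; a Redis-backed collector like this usually also overrides the write methods so values land in the shared hash instead of process-local memory. The following is a minimal, hedged sketch of that idea: `self.redis` and `self.key` come from the snippet, while the `hset`/`hincrby` persistence strategy is an assumption, not the confirmed implementation of this class.

# Hedged sketch: one way such a collector could persist values to the
# Redis hash keyed by self.key; hset/hincrby usage is assumed, not verified.
def set_value(self, key, value, spider=None):
    self.redis.hset(self.key, key, value)

def inc_value(self, key, count=1, start=0, spider=None):
    self.redis.hincrby(self.key, key, count)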
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = crawler._create_spider('scrapytest.org')
    self.stats = StatsCollector(crawler)
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats, True)
def setUp(self):
    self.spider_name = 'df_tests'
    self.spider = Spider(self.spider_name)

    # DeltaFetch creates .db files named after the spider's name
    self.temp_dir = tempfile.gettempdir()
    self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name)

    crawler = get_crawler(Spider)
    self.stats = StatsCollector(crawler)
def parse(self, response):
    # Extract the question blocks from the JSON-wrapped HTML response.
    question_blocks = Selector(
        text=json.loads(response.body.decode("utf-8"))['msg'][1]).xpath(
        '//div[contains(@itemtype, "http://schema.org/Question")]')
    for question_block in question_blocks:
        item = ZhihuapiItem()
        item['question_name'] = question_block.xpath(
            './/div/div/h2/a/text()').extract_first()
        item['question_url'] = question_block.xpath(
            './/div/div/h2/a/@href').extract_first()
        item['question_answer'] = question_block.xpath(
            './/div/div/div[1]/div[5]/div/a/@href').extract_first()
        item['question_answer_author_profile'] = question_block.xpath(
            './/div/div/div[1]/div[3]/span/span[1]/a/@href').extract_first()
        item['question_answer_author'] = question_block.xpath(
            './/div/div/div[1]/div[3]/span/span[1]/a/text()').extract_first()
        self.logger.info(
            'Question info: question name - {}, question answer - {}, '
            'question url - {}, question answer author profile - {}, '
            'question answer author - {}'.format(
                item['question_name'], item['question_answer'],
                item['question_url'],
                item['question_answer_author_profile'],
                item['question_answer_author']))
        yield item
    if not question_blocks:
        # No new questions: record the close reason on the stats
        # collector and stop paginating instead of falling through to
        # an undefined last_data_score.
        self.logger.info("No more new questions, waiting to stop...")
        self.crawler.stats.close_spider(self, reason="No more questions...")
        return
    last_data_score = question_blocks[-1].xpath('@data-score').extract_first()
    self.logger.info('Last Data Score is - {}'.format(last_data_score))
    yield scrapy.http.FormRequest(
        self.topic_url,
        method='POST',
        headers=self.headers,
        formdata={'start': '0', 'offset': str(last_data_score)},
        callback=self.parse)
async def _request_handler(request: pyppeteer.network_manager.Request,
                           scrapy_request: Request,
                           stats: StatsCollector) -> None:
    # set headers, method and body
    if request.url == scrapy_request.url:
        overrides = {
            "method": scrapy_request.method,
            "headers": {
                key.decode("utf-8").lower(): value[0].decode("utf-8")
                for key, value in scrapy_request.headers.items()
            },
        }
        if scrapy_request.body:
            overrides["postData"] = scrapy_request.body.decode(
                scrapy_request.encoding)
    else:
        overrides = {"headers": request.headers.copy()}
        if scrapy_request.headers.get("user-agent"):
            user_agent = scrapy_request.headers["user-agent"].decode("utf-8")
            overrides["headers"]["user-agent"] = user_agent
    await request.continue_(overrides)
    # increment stats
    stats.inc_value("pyppeteer/request_method_count/{}".format(request.method))
    stats.inc_value("pyppeteer/request_count")
    if request.isNavigationRequest():
        stats.inc_value("pyppeteer/request_count/navigation")
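An interception handler like `_request_handler` only fires if request interception is enabled and the coroutine is registered on the page. The wiring below is a hedged sketch of that setup, assuming it runs inside an async context where `page`, `scrapy_request`, and `stats` already exist in the caller.

# Hedged sketch (inside an async context): enable interception and
# attach the handler; page, scrapy_request and stats are assumed to exist.
import asyncio

await page.setRequestInterception(True)
page.on(
    "request",
    lambda req: asyncio.ensure_future(
        _request_handler(req, scrapy_request=scrapy_request, stats=stats)),
)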
class TestDepthMiddleware(TestCase):

    def setUp(self):
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('scrapytest.org')
        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)
        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        req.meta['depth'] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, '')
def test_core_stats_default_stats_collector(self, mock_datetime):
    fixed_datetime = datetime(2019, 12, 1, 11, 38)
    mock_datetime.utcnow = mock.Mock(return_value=fixed_datetime)
    self.crawler.stats = StatsCollector(self.crawler)
    ext = CoreStats.from_crawler(self.crawler)
    ext.spider_opened(self.spider)
    ext.item_scraped({}, self.spider)
    ext.response_received(self.spider)
    ext.item_dropped({}, self.spider, ZeroDivisionError())
    ext.spider_closed(self.spider, 'finished')
    self.assertEqual(
        ext.stats._stats,
        {
            'start_time': fixed_datetime,
            'finish_time': fixed_datetime,
            'item_scraped_count': 1,
            'response_received_count': 1,
            'item_dropped_count': 1,
            'item_dropped_reasons_count/ZeroDivisionError': 1,
            'finish_reason': 'finished',
            'elapsed_time_seconds': 0.0,
        })
async def _response_handler(response: pyppeteer.network_manager.Response,
                            stats: StatsCollector):
    stats.inc_value("pyppeteer/response_count")
    stats.inc_value(
        "pyppeteer/response_status_count/{}".format(response.status))
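Unlike the request handler, a response listener needs no interception; registering it on the page is enough. Again a hedged sketch, with `page` and `stats` assumed from the surrounding code.

# Hedged sketch: count every browser response via a page listener;
# page and stats are assumed to exist where the page is created.
import asyncio

page.on(
    "response",
    lambda resp: asyncio.ensure_future(_response_handler(resp, stats=stats)),
)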
class DeltaFetchTestCase(TestCase):

    mwcls = DeltaFetch

    def setUp(self):
        self.spider_name = 'df_tests'
        self.spider = Spider(self.spider_name)

        # DeltaFetch creates .db files named after the spider's name
        self.temp_dir = tempfile.gettempdir()
        self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name)

        crawler = get_crawler(Spider)
        self.stats = StatsCollector(crawler)

    def test_init(self):
        # path format is any, the folder is not created
        instance = self.mwcls('/any/dir', True, stats=self.stats)
        assert isinstance(instance, self.mwcls)
        self.assertEqual(instance.dir, '/any/dir')
        self.assertEqual(self.stats.get_stats(), {})
        self.assertEqual(instance.reset, True)

    def test_init_from_crawler(self):
        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
                mock.patch('scrapy.utils.project.inside_project') as in_project:
            data_dir.return_value = self.temp_dir
            in_project.return_value = True

            # simple project_data_dir mock with based settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
                                         'DELTAFETCH_DIR': 'other',
                                         'DELTAFETCH_RESET': True})
            instance = self.mwcls.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_spider_opened_new(self):
        """Middleware should create a .db file if not found."""
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert os.path.isdir(self.temp_dir)
        assert os.path.exists(self.db_path)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing(self):
        """Middleware should open and use existing and valid .db files."""
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
                                 (b'test_key_2', b'test_v_2')]
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_corrupt_dbfile(self):
        """Middleware should create a new .db if it cannot open it."""
        # create an invalid .db file
        with open(self.db_path, "wb") as dbfile:
            dbfile.write(b'bad')
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        # file corruption is only detected when opening spider
        mw.spider_opened(self.spider)
        assert os.path.isdir(self.temp_dir)
        assert os.path.exists(self.db_path)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        # and db should be empty (it was re-created)
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE

    def test_spider_opened_existing_spider_reset(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_opened_reset_non_existing_db(self):
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        self.spider.deltafetch_reset = True
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        # there's different logic for different bdb versions:
        # it can fail when opening a non-existing db with truncate flag,
        # then it should be caught and retried with rm & create flag
        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
                mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)

    def test_spider_opened_recreate(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        assert not hasattr(self.mwcls, 'db')
        mw.spider_opened(self.spider)
        assert hasattr(mw, 'db')
        assert isinstance(mw.db, type(dbmodule.db.DB()))
        assert mw.db.items() == []
        assert mw.db.get_type() == dbmodule.db.DB_HASH
        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE

    def test_spider_closed(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
        mw.spider_opened(self.spider)
        assert mw.db.fd()
        mw.spider_closed(self.spider)
        self.assertRaises(dbmodule.db.DBError, mw.db.fd)

    def test_process_spider_output(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        result = [
            # same URL but with new key --> it should be processed
            Request('http://url', meta={'deltafetch_key': 'key1'}),
            # 'test_key_1' is already in the test db --> it should be skipped
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        # so only the 1st request should go through
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])

        # the skipped "http://url1" should be counted in stats
        self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})

        # b'key' should not be in the db yet as no item was collected yet
        self.assertEqual(set(mw.db.keys()),
                         set([b'test_key_1', b'test_key_2']))

        # if the spider returns items, the request's key is added in db
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(set(mw.db.keys()),
                         set([b'key', b'test_key_1', b'test_key_2']))
        assert mw.db[b'key']

    def test_process_spider_output_dict(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = [{"somekey": "somevalue"}]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(set(mw.db.keys()),
                         set([b'key', b'test_key_1', b'test_key_2']))
        assert mw.db[b'key']

    def test_process_spider_output_stats(self):
        self._create_test_db()
        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1)
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), 1)

    def test_init_from_crawler_legacy(self):
        # test with subclass not handling passed stats
        class LegacyDeltaFetchSubClass(self.mwcls):

            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir,
                                                               reset=reset)
                self.something = True

        crawler = mock.Mock()
        # void settings
        crawler.settings = Settings({})
        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
                mock.patch('scrapy.utils.project.inside_project') as in_project:
            data_dir.return_value = self.temp_dir
            in_project.return_value = True

            # simple project_data_dir mock with based settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
            self.assertEqual(instance.reset, False)

            # project_data_dir mock with advanced settings
            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
                                         'DELTAFETCH_DIR': 'other',
                                         'DELTAFETCH_RESET': True})
            instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
            assert isinstance(instance, self.mwcls)
            self.assertEqual(
                instance.dir, os.path.join(self.temp_dir, 'other'))
            self.assertEqual(instance.reset, True)

    def test_process_spider_output_stats_legacy(self):
        # testing that a subclass not handling stats works at runtime
        # (i.e. that trying to update stats does not trigger an exception)
        class LegacyDeltaFetchSubClass(self.mwcls):

            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir,
                                                               reset=reset)
                self.something = True

        self._create_test_db()
        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]
        # stats should not be updated
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)
        result = [BaseItem(), "not a base item"]
        self.assertEqual(list(mw.process_spider_output(
            response, result, self.spider)), result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)

    def test_get_key(self):
        mw = self.mwcls(self.temp_dir, reset=True)
        test_req1 = Request('http://url1')
        self.assertEqual(mw._get_key(test_req1),
                         to_bytes(request_fingerprint(test_req1)))
        test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'})
        self.assertEqual(mw._get_key(test_req2), b'dfkey1')
        test_req3 = Request('http://url2', meta={'deltafetch_key': u'dfkey1'})
        # key will be converted to bytes
        self.assertEqual(mw._get_key(test_req3), b'dfkey1')

    def _create_test_db(self):
        db = dbmodule.db.DB()
        # truncate test db if there were failed tests
        db.open(self.db_path, dbmodule.db.DB_HASH,
                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
        db[b'test_key_1'] = b'test_v_1'
        db[b'test_key_2'] = b'test_v_2'
        db.close()
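The settings exercised by these tests (DELTAFETCH_ENABLED, DELTAFETCH_DIR, DELTAFETCH_RESET) are the ones a project sets to enable the middleware. The fragment below is a hedged sketch of a typical settings.py; the middleware path and the priority value 100 are common convention for scrapy-deltafetch, not taken from this test file.

# Hedged sketch of a settings.py fragment enabling scrapy-deltafetch;
# the middleware path and priority (100) are conventional assumptions.
SPIDER_MIDDLEWARES = {
    'scrapy_deltafetch.DeltaFetch': 100,
}
DELTAFETCH_ENABLED = True
DELTAFETCH_DIR = 'deltafetch'   # where the per-spider .db files live
DELTAFETCH_RESET = False        # True wipes the db when the spider opens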
def test_collector(self):
    stats = StatsCollector(self.crawler)
    self.assertEqual(stats.get_stats(), {})
    self.assertEqual(stats.get_value('anything'), None)
    self.assertEqual(stats.get_value('anything', 'default'), 'default')
    stats.set_value('test', 'value')
    self.assertEqual(stats.get_stats(), {'test': 'value'})
    stats.set_value('test2', 23)
    self.assertEqual(stats.get_stats(), {'test': 'value', 'test2': 23})
    self.assertEqual(stats.get_value('test2'), 23)
    stats.inc_value('test2')
    self.assertEqual(stats.get_value('test2'), 24)
    stats.inc_value('test2', 6)
    self.assertEqual(stats.get_value('test2'), 30)
    stats.max_value('test2', 6)
    self.assertEqual(stats.get_value('test2'), 30)
    stats.max_value('test2', 40)
    self.assertEqual(stats.get_value('test2'), 40)
    stats.max_value('test3', 1)
    self.assertEqual(stats.get_value('test3'), 1)
    stats.min_value('test2', 60)
    self.assertEqual(stats.get_value('test2'), 40)
    stats.min_value('test2', 35)
    self.assertEqual(stats.get_value('test2'), 35)
    stats.min_value('test4', 7)
    self.assertEqual(stats.get_value('test4'), 7)
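The behaviour exercised here (defaults, inc/max/min against an absent key) boils down to a few operations on a dict. The sketch below mirrors those semantics as a plain in-memory collector, as a reading aid rather than Scrapy's actual class.

# Minimal in-memory sketch mirroring the semantics tested above;
# a reading aid, not Scrapy's actual StatsCollector.
class DictStats:
    def __init__(self):
        self._stats = {}

    def get_value(self, key, default=None):
        return self._stats.get(key, default)

    def set_value(self, key, value):
        self._stats[key] = value

    def inc_value(self, key, count=1, start=0):
        # inc on a missing key starts counting from `start`
        self._stats[key] = self._stats.setdefault(key, start) + count

    def max_value(self, key, value):
        # a missing key simply takes the new value
        self._stats[key] = max(self._stats.setdefault(key, value), value)

    def min_value(self, key, value):
        self._stats[key] = min(self._stats.setdefault(key, value), value)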
def _min_value(self, key, value, spider=None):
    StatsCollector.min_value(self, key, value)
    self.max_min_value(min, key, value)
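`max_min_value` itself is not shown in this snippet; the sketch below is a hypothetical companion inferred from how it is called (the builtin `min` is passed in as the comparator), and it assumes a Redis hash like the one set up in the first snippet. None of it is the confirmed source.

# Hypothetical sketch of max_min_value, inferred from the call above;
# self.redis and self.key are assumed from the Redis-backed collector.
def max_min_value(self, fn, key, value):
    previous = self.redis.hget(self.key, key)
    if previous is not None:
        # keep whichever value fn (the builtin min or max) selects
        value = fn(int(previous), value)
    self.redis.hset(self.key, key, value)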
def assert_stats(stats: StatsCollector, expected: dict):
    for k, expected_val in expected.items():
        actual_val = stats.get_value(k, 0)
        assert actual_val == expected_val, \
            f"key: '{k}', value: {actual_val}, expected: {expected_val}"
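Usage is straightforward: pass the collector and the expected key/value pairs; keys the collector never touched compare as 0 thanks to the default. For example, checking the counters the pyppeteer handlers above would produce:

# Example: verify counters after a run (stats is any StatsCollector).
assert_stats(stats, {
    "pyppeteer/request_count": 3,
    "pyppeteer/response_count": 3,
})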
def test_collector(self):
    stats = StatsCollector(self.crawler)
    self.assertEqual(stats.get_stats(), {})
    self.assertEqual(stats.get_value('anything'), None)
    self.assertEqual(stats.get_value('anything', 'default'), 'default')
    stats.set_value('test', 'value')
    self.assertEqual(stats['test'], 'value')
    self.assertEqual(stats.get_stats(), {'test': 'value'})
    stats['test'] = 'value2'
    self.assertIn('test', stats)
    self.assertEqual(stats['test'], 'value2')
    self.assertEqual(next(iter(stats)), 'test')
    self.assertEqual(stats.get_stats(), {'test': 'value2'})
    stats['test'] = 'value'
    stats.set_value('test2', 23)
    self.assertEqual(stats.get_stats(), {'test': 'value', 'test2': 23})
    self.assertEqual(stats.get_value('test2'), 23)
    stats.inc_value('test2')
    self.assertEqual(stats.get_value('test2'), 24)
    stats.inc_value('test2', 6)
    self.assertEqual(stats.get_value('test2'), 30)
    stats.max_value('test2', 6)
    self.assertEqual(stats.get_value('test2'), 30)
    stats.max_value('test2', 40)
    self.assertEqual(stats.get_value('test2'), 40)
    stats.max_value('test3', 1)
    self.assertEqual(stats.get_value('test3'), 1)
    stats.min_value('test2', 60)
    self.assertEqual(stats.get_value('test2'), 40)
    stats.min_value('test2', 35)
    self.assertEqual(stats.get_value('test2'), 35)
    stats.min_value('test4', 7)
    self.assertEqual(stats.get_value('test4'), 7)