def test_log_debug(self):
    """With DUPEFILTER_DEBUG enabled, every filtered request is logged,
    including its referer (or ``None`` when there is none)."""
    settings = {
        'DUPEFILTER_DEBUG': True,
        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter',
    }
    with LogCapture() as log:
        crawler = get_crawler(SimpleSpider, settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = scheduler.df
        dupefilter.open()

        first = Request('http://scrapytest.org/index.html')
        # Same URL again, but carrying a referer header this time.
        second = Request(
            'http://scrapytest.org/index.html',
            headers={'Referer': 'http://scrapytest.org/INDEX.html'},
        )
        dupefilter.log(first, spider)
        dupefilter.log(second, spider)

        # Both calls count as filtered, and each one produced its own
        # DEBUG record with the matching referer.
        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present((
            'scrapy.dupefilters',
            'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
            ' (referer: None)',
        ))
        log.check_present((
            'scrapy.dupefilters',
            'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
            ' (referer: http://scrapytest.org/INDEX.html)',
        ))
        dupefilter.close('finished')
def test_df_from_settings_scheduler(self):
    """The scheduler builds a dupefilter via ``from_settings`` and passes
    the DUPEFILTER_DEBUG setting through to it."""
    crawler = get_crawler(settings_dict={
        'DUPEFILTER_DEBUG': True,
        'DUPEFILTER_CLASS': __name__ + '.FromSettingsRFPDupeFilter',
    })
    df = Scheduler.from_crawler(crawler).df
    self.assertTrue(df.debug)
    self.assertEqual(df.method, 'from_settings')
def test_log(self):
    """With DUPEFILTER_DEBUG disabled, only the first duplicate is logged,
    with a hint pointing at DUPEFILTER_DEBUG; stats still count both."""
    settings = {
        'DUPEFILTER_DEBUG': False,
        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter',
    }
    with LogCapture() as log:
        crawler = get_crawler(SimpleSpider, settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = scheduler.df
        dupefilter.open()

        # Two identical requests: only the first one should be logged.
        for request in (
            Request('http://scrapytest.org/index.html'),
            Request('http://scrapytest.org/index.html'),
        ):
            dupefilter.log(request, spider)

        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present((
            'scrapy.dupefilters',
            'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
            ' - no more duplicates will be shown'
            ' (see DUPEFILTER_DEBUG to show all duplicates)',
        ))
        dupefilter.close('finished')
def test_seenreq_newlines(self):
    r"""Checks against adding duplicate \r to line endings on Windows platforms.

    NOTE: raw docstring so that ``\r`` reads as the two characters
    backslash-r; in the original non-raw string it was silently turned
    into an actual carriage-return character inside the docstring.
    """
    r1 = Request('http://scrapytest.org/1')
    path = tempfile.mkdtemp()
    crawler = get_crawler(settings_dict={'JOBDIR': path})
    try:
        scheduler = Scheduler.from_crawler(crawler)
        df = scheduler.df
        df.open()
        df.request_seen(r1)
        df.close('finished')

        # Read back the on-disk seen-requests file in binary mode so the
        # platform's newline translation cannot mask a double-\r bug.
        with open(os.path.join(path, 'requests.seen'), 'rb') as seen_file:
            line = next(seen_file).decode()

        assert not line.endswith('\r\r\n')
        if sys.platform == 'win32':
            assert line.endswith('\r\n')
        else:
            assert line.endswith('\n')
    finally:
        # Always remove the temporary JOBDIR, even if an assertion fails.
        shutil.rmtree(path)
def _incompatible(self):
    """Open a scheduler configured with a settings combination that the
    DownloaderAwarePriorityQueue rejects (per-IP concurrency limit)."""
    settings = {
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS_PER_IP': 1,
    }
    crawler = Crawler(Spider, settings)
    scheduler = Scheduler.from_crawler(crawler)
    scheduler.open(Spider(name='spider'))
def test_df_direct_scheduler(self):
    """A dupefilter class passed directly (not as a path) is instantiated
    without going through from_settings/from_crawler."""
    crawler = get_crawler(settings_dict={
        'DUPEFILTER_CLASS': DirectDupeFilter,
        'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
    })
    scheduler = Scheduler.from_crawler(crawler)
    self.assertEqual(scheduler.df.method, 'n/a')
def _get_dupefilter(*, crawler=None, settings=None, open=True):
    """Return the dupefilter of a freshly built scheduler.

    Builds a crawler from *settings* unless one is given explicitly;
    opens the dupefilter unless ``open`` is False.  Keyword-only so call
    sites stay self-describing.
    """
    the_crawler = crawler if crawler is not None else get_crawler(settings_dict=settings)
    dupefilter = Scheduler.from_crawler(the_crawler).df
    if open:
        dupefilter.open()
    return dupefilter
def _incompatible(self):
    """Open a scheduler with the downloader-aware queue plus a per-IP
    concurrency limit — a combination the queue refuses to work with."""
    crawler = Crawler(
        Spider,
        {
            'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
            'CONCURRENT_REQUESTS_PER_IP': 1,
        },
    )
    scheduler = Scheduler.from_crawler(crawler)
    spider = Spider(name='spider')
    scheduler.open(spider)
def test_df_from_settings_scheduler(self):
    """A dupefilter class object exposing ``from_settings`` is built through
    that hook, and DUPEFILTER_DEBUG is propagated to it."""
    crawler = get_crawler(settings_dict={
        'DUPEFILTER_DEBUG': True,
        'DUPEFILTER_CLASS': FromSettingsRFPDupeFilter,
        'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
    })
    df = Scheduler.from_crawler(crawler).df
    self.assertTrue(df.debug)
    self.assertEqual(df.method, 'from_settings')
def test_log(self):
    """With debug off, only one DEBUG record is emitted (with the hint to
    enable DUPEFILTER_DEBUG), while the filtered-count stat reaches 2."""
    with LogCapture() as log:
        crawler = get_crawler(
            SimpleSpider,
            settings_dict={
                'DUPEFILTER_DEBUG': False,
                'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter',
            },
        )
        scheduler = Scheduler.from_crawler(crawler)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = scheduler.df
        dupefilter.open()

        for request in (
            Request('http://scrapytest.org/index.html'),
            Request('http://scrapytest.org/index.html'),
        ):
            dupefilter.log(request, spider)

        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present((
            'scrapy.dupefilters',
            'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
            ' - no more duplicates will be shown'
            ' (see DUPEFILTER_DEBUG to show all duplicates)',
        ))
        dupefilter.close('finished')
def test_df_direct_scheduler(self):
    """A dupefilter referenced by dotted path but lacking the from_*
    constructors is instantiated directly by the scheduler."""
    crawler = get_crawler(
        settings_dict={'DUPEFILTER_CLASS': __name__ + '.DirectDupeFilter'},
    )
    df = Scheduler.from_crawler(crawler).df
    self.assertEqual(df.method, 'n/a')
def create_scheduler(self):
    """Wire up a fresh scheduler: mock crawler -> scheduler -> open spider.

    Stores ``mock_crawler``, ``scheduler`` and ``spider`` on the instance
    for the test methods to use.
    """
    self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
    self.scheduler = Scheduler.from_crawler(self.mock_crawler)
    self.spider = Spider(name='spider')
    self.scheduler.open(self.spider)