def test_task_clone_grab_config_and_url(self):
    g = build_grab()
    g.setup(url='http://foo.com/')
    task = Task('foo', grab=g)
    task2 = task.clone(url='http://bar.com/')
    self.assertEqual(task2.url, 'http://bar.com/')
    self.assertEqual(task2.grab_config['url'], 'http://bar.com/')

def test_task_useragent(self):
    bot = SimpleSpider()
    bot.setup_queue()
    g = Grab()
    g.setup(url=SERVER.BASE_URL)
    g.setup(user_agent='Foo')
    task = Task('baz', grab=g)
    bot.add_task(task.clone())
    bot.run()
    self.assertEqual(SERVER.REQUEST['headers']['User-Agent'], 'Foo')

def test_task_useragent(self):
    bot = build_spider(SimpleSpider)
    bot.setup_queue()
    g = Grab()
    g.setup(url=self.server.get_url())
    g.setup(user_agent='Foo')
    task = Task('baz', grab=g)
    bot.add_task(task.clone())
    bot.run()
    self.assertEqual(self.server.request['headers']['User-Agent'], 'Foo')

def test_update_grab_instance(self):
    class TestSpider(Spider):
        def update_grab_instance(self, grab):
            grab.setup(timeout=77)

        def task_generator(self):
            yield Task('page', url=self.meta['server'].get_url())
            yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                         timeout=1))

        def task_page(self, grab, task):
            self.stat.collect('points', grab.config['timeout'])

    bot = build_spider(TestSpider, meta={'server': self.server})
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(
        Task('page', grab=Grab(url=self.server.get_url(), timeout=1)))
    bot.run()
    self.assertEqual(set([77]), set(bot.stat.collections['points']))

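# A minimal sketch of the hook exercised above, assuming the same Spider
# API: update_grab_instance() is called for every request, including tasks
# constructed with their own Grab object, so its settings override the
# per-task config (hence every task in the test reports timeout=77). The
# spider name and timeout value below are illustrative.

class ForceTimeoutSpider(Spider):
    def update_grab_instance(self, grab):
        # Runs for every request; per-task timeouts are overwritten.
        grab.setup(timeout=30)

    def task_page(self, grab, task):
        pass
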
def test_task_queue_clear(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            self.stop()

    bot = build_spider(TestSpider)
    bot.setup_queue()
    for _ in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())

def test_network_limit(self):
    class CustomSimpleSpider(SimpleSpider):
        def create_grab_instance(self):
            return Grab(connect_timeout=1, timeout=1)

    self.server.response['get.data'] = 'Hello spider!'
    self.server.response['sleep'] = 1.1

    bot = build_spider(CustomSimpleSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.stat.counters['spider:request-network'], 1)

    bot = build_spider(CustomSimpleSpider, network_try_limit=2)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.stat.counters['spider:request-network'], 2)

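# As the two runs above show, network_try_limit caps how many times a
# single task is sent over the network: with the limit set to N, the
# 'spider:request-network' counter for one failing task cannot exceed N.
# A minimal sketch under that assumption (helper name and limit value
# are illustrative):

def example_network_try_limit(server_url):
    class FastTimeoutSpider(SimpleSpider):
        def create_grab_instance(self):
            # Short timeouts so a slow server triggers retries quickly.
            return Grab(connect_timeout=1, timeout=1)

    bot = build_spider(FastTimeoutSpider, network_try_limit=3)
    bot.setup_queue()
    bot.add_task(Task('baz', server_url))
    bot.run()
    # A task that keeps timing out is sent at most 3 times in total.
    return bot.stat.counters['spider:request-network']
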
def test_timeout(self):
    self.server.response['get.data'] = ContentGenerator(self.server)
    bot = build_spider(SimpleSpider, meta={'server': self.server})
    self.setup_cache(bot)
    bot.cache_pipeline.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('one', self.server.get_url()))
    bot.add_task(Task('one', self.server.get_url(), delay=2))
    bot.run()
    self.assertEqual(2, bot.stat.counters['spider:request'])
    self.assertEqual(1, bot.stat.counters['spider:request-cache'])
    self.assertEqual([1, 1], bot.stat.collections['resp_counters'])

    bot = build_spider(SimpleSpider, meta={'server': self.server},
                       parser_pool_size=1)
    self.setup_cache(bot)
    # Do NOT clear the cache: the second run must reuse entries
    # created by the first run.
    bot.setup_queue()
    bot.add_task(Task('one', self.server.get_url(), priority=1))
    bot.add_task(Task('one', self.server.get_url(), priority=2,
                      cache_timeout=0, delay=1))
    bot.add_task(Task('one', self.server.get_url(), priority=3,
                      cache_timeout=10, delay=3))
    bot.add_task(Task('one', self.server.get_url(), priority=4,
                      cache_timeout=0, delay=4))
    bot.run()
    self.assertEqual([1, 2, 2, 3], bot.stat.collections['resp_counters'])

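# The final assertion above pins down the cache_timeout semantics assumed
# here: cache_timeout=0 treats any cached copy as stale, forcing a network
# fetch (which increments the server-side counter), while a sufficiently
# large cache_timeout lets the stored copy be reused. A minimal sketch
# under those assumptions (helper name is illustrative):

def example_cache_timeout(bot, url):
    # Served from cache if the entry is younger than an hour.
    bot.add_task(Task('one', url, cache_timeout=3600))
    # Always fetched from the network, bypassing any cached copy.
    bot.add_task(Task('one', url, cache_timeout=0))
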
def test_timeout(self):
    bot = SimpleSpider()
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('one', SERVER.BASE_URL))
    bot.add_task(Task('one', SERVER.BASE_URL, delay=0.5))
    bot.run()
    self.assertEqual([1, 1], bot.resp_counters)

    bot = SimpleSpider()
    self.setup_cache(bot)
    # Do NOT clear the cache
    bot.setup_queue()
    bot.add_task(Task('one', SERVER.BASE_URL, priority=1))
    bot.add_task(
        Task('one', SERVER.BASE_URL, priority=2, cache_timeout=0, delay=1))
    bot.add_task(
        Task('one', SERVER.BASE_URL, priority=3, cache_timeout=10,
             delay=1.1))
    bot.add_task(
        Task('one', SERVER.BASE_URL, priority=4, cache_timeout=0,
             delay=1.2))
    bot.run()
    self.assertEqual([1, 2, 2, 3], bot.resp_counters)

def test_task_clone(self):
    bot = build_spider(SimpleSpider)
    bot.setup_queue()

    task = Task('baz', url='http://xxx.com')
    bot.add_task(task.clone())

    # Pass grab to clone
    task = Task('baz', url='http://xxx.com')
    grab = Grab()
    grab.setup(url='zzz')
    bot.add_task(task.clone(grab=grab))

    # Pass grab_config to clone
    task = Task('baz', url='http://xxx.com')
    grab = Grab()
    grab.setup(url='zzz')
    bot.add_task(task.clone(grab_config=grab.config))

def test_task_clone(self):
    bot = SimpleSpider()
    bot.setup_queue()

    task = Task('baz', url='xxx')
    bot.add_task(task.clone())

    # Pass grab to clone
    task = Task('baz', url='xxx')
    g = Grab()
    g.setup(url='zzz')
    bot.add_task(task.clone(grab=g))

    # Pass grab_config to clone
    task = Task('baz', url='xxx')
    g = Grab()
    g.setup(url='zzz')
    bot.add_task(task.clone(grab_config=g.config))

def test_task_get_fallback_handler(self):
    class TestSpider(Spider):
        def do_smth(self, task):
            pass

        def task_bar_fallback(self, task):
            pass

    task1 = Task('foo', url='http://foo.com/', fallback_name='do_smth')
    task2 = Task('bar', url='http://foo.com/')
    task3 = Task(url='http://foo.com/')

    bot = build_spider(TestSpider)
    self.assertEqual(task1.get_fallback_handler(bot), bot.do_smth)
    self.assertEqual(task2.get_fallback_handler(bot),
                     bot.task_bar_fallback)
    self.assertEqual(task3.get_fallback_handler(bot), None)

def test_exception_from_data_handler(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            yield Data('foo', num=1)

        def data_foo(self, num):
            # Intentionally raise ZeroDivisionError
            1 / 0

    bot = TestSpider()
    bot.setup_queue()
    bot.add_task(Task('page', url=SERVER.BASE_URL))
    bot.run()
    self.assertTrue('data_foo' in bot.items['fatal'][0])

def test_exception_from_data_handler(self):
    class TestSpider(Spider):
        def task_page(self, dummy_grab, dummy_task):
            yield Data('foo', num=1)

        def data_foo(self, num):  # pylint: disable=unused-argument
            raise Exception('Shit happens!')

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])

def task_level_4(self, grab, task):
    """Finally, the product listing, which also contains links
    to the company profile pages.
    """
    if not chek_loading(grab.response.body):
        yield task.clone(refresh_cache=True, priority=50)
        return
    url_level_5_list = grab.doc.select(
        '//a[@class="supplierTit"]').attr_list('href')
    for url_level_5 in url_level_5_list:
        url_level_5 = grab.make_url_absolute(url_level_5)
        # Check whether this company profile link was already extracted
        if url_level_5 not in self.parsed_url:
            yield Task('level_5', url=url_level_5, priority=45)
    for next_page_url in grab.doc.select(
            '//p[@class="pagination mt5"]/a').attr_list('href'):
        next_page_url = grab.make_url_absolute(next_page_url)
        if next_page_url not in self.parsed_url:
            yield Task('level_4', url=next_page_url, priority=55)
            self.parsed_url.append(next_page_url)

def task_handle_author(self, grab, task):
    try:
        articles_selector = (
            '//div[@class="mw-parser-output"]'
            '//li//a[not(contains(@class,"external text"))]')
        author_articles = grab.doc.select(articles_selector)
        for article in author_articles:
            try:
                href = article.select('@href').text()
                new_url = self.base_url + (
                    '/wiki/' if not href.startswith('/wiki/') else '') + href
                yield Task('handle_article', url=new_url,
                           art_path=task.art_path)
            except IndexError:
                warnings.warn(
                    'Invalid article "{}" from author page: {}'.format(
                        article.text(), task.url))
    except NameError:
        pass

def task_level_1(self, grab, task):
    """Collect links to the categories."""
    if not chek_loading(grab.response.body):
        yield task.clone(refresh_cache=True, priority=90)
        return
    for url_level_2 in grab.doc.select(
            '//div[@class="browse-ttl"]/a').attr_list('href'):
        yield Task('level_2', url=grab.make_url_absolute(url_level_2),
                   priority=85)

def test_task_queue_clear(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.stop()

        def task_keyboard_interrupt_page(self, grab, task):
            raise KeyboardInterrupt

    bot = build_spider(TestSpider)
    bot.setup_queue()
    for _ in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())

    for _ in six.moves.range(5):
        bot.add_task(
            Task('keyboard_interrupt_page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())

def test_exception_from_data_handler(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            yield Data('foo', num=1)

        def data_foo(self, num):
            # Intentionally raise ZeroDivisionError
            1 / 0

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])

def test_handler_result_invalid(self):
    class TestSpider(Spider):
        def prepare(self):
            self.points = []

        def task_page(self, grab, task):
            # Yield a value that is neither a Task nor a Data object
            yield 1

    bot = TestSpider()
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(1, bot.counters['error-spidererror'])

def test_only_cache_task(self):
    self.server.response['get.data'] = ContentGenerator(self.server)

    class TestSpider(Spider):
        def task_page(self, dummy_grab, dummy_task):
            self.stat.collect('points', 1)

    bot = build_spider(TestSpider, only_cache=True)
    self.setup_cache(bot)
    bot.cache_pipeline.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.stat.collections['points'], [])

def test_handler_result_none(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.points = []

        def task_page(self, dummy_grab, dummy_task):
            yield None

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()

def test_check_task_limits_invalid_value(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

        def check_task_limits(self, task):
            return False, 'zz'

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url(),
                      fallback_name='fallback_zz'))
    self.assertRaises(SpiderError, bot.run)

def test_cache_size(self):
    self.server.response['get.data'] = ContentGenerator(self.server)

    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    self.setup_cache(bot)
    bot.cache_pipeline.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.cache_pipeline.cache.size(), 1)

def task_getcategory(self, grab, task):
    print(task.url)
    response = json.loads(grab.response.unicode_body())
    with open('category.json', 'at', encoding='cp1251',
              errors='ignore') as f:
        f.write(
            json.dumps(response, ensure_ascii=False, sort_keys=True) + '\n')
        f.flush()
    # Guard against a missing "children" key in the API response
    for child in response.get('children', []):
        child_url = ('https://supl.biz/api/v1.0/suppliers-catalog/'
                     'categories/' + str(child['id']) + '/menu/')
        yield Task('getcategory', url=child_url)

def task_level_3(self, grab, task):
    """Collect links to the subcategories."""
    if not chek_loading(grab.response.body):
        yield task.clone(refresh_cache=True, priority=70)
        return
    for url_level_4 in grab.doc.select(
            '//div[@class="category-top"]/a').attr_list('href'):
        url_level_4 = grab.make_url_absolute(url_level_4)
        yield Task('level_4', url=url_level_4, priority=65)
        self.parsed_url.append(url_level_4)

def task_table(self, grab, task):
    try:
        term_url = grab.doc.select(
            '//script[contains(text(),"open_terms()")]').text()
        term_url = term_url.split("window.open(")[1].split('"')[1]
    except Exception:
        # Dealing with a common error (terms page formatting)
        try:
            term_url = grab.doc.select(
                '//a[contains(text(), "Terms")]/@href').text()
        except Exception:
            # Dealing with another common error (no direct link to the
            # terms): they can be found through the "Selections" section.
            try:
                selection_url = grab.doc.select(
                    '//a[contains(text(), "Selections")]//@href').text()
                root = urlparse(task.url).hostname
                # Use the task_selection handler (below) for this url:
                yield Task(
                    "selection",
                    url="http://" + root + selection_url,
                    lasturl=task.url,
                )
            except Exception as e:
                self.term_url_error.add(
                    task.url + " " + getattr(e, "message", str(e)))
    # Just a useful part of the auction's url:
    root = urlparse(task.url).hostname
    try:
        # Use the task_term handler for this url:
        yield Task("term", url="http://" + root + term_url,
                   lasturl=task.url)
    except Exception as e:
        self.table_error.add(task.url + " " + getattr(e, "message", str(e)))

def task_initial(self, grab, task):
    self.base_url = self.initial_urls[0]
    if self.args.c == 'm':
        if self.args.t == 'l':
            yield Task('get_ranks', url='http://live-tennis.eu/')
        elif self.args.t == 'o':
            yield Task('get_ranks',
                       url='http://live-tennis.eu/official_atp_ranking')
        else:
            sys.stderr.write('Invalid input of option -t\n')
    elif self.args.c == 'f':
        if self.args.t == 'l':
            yield Task('get_ranks',
                       url='http://live-tennis.eu/wta-live-ranking')
        elif self.args.t == 'o':
            yield Task('get_ranks',
                       url='http://live-tennis.eu/official-wta-ranking')
        else:
            sys.stderr.write('Invalid input of option -t\n')
    else:
        sys.stderr.write('Invalid input of option -c\n')

def test_create_grab_instance(self):
    class TestSpider(Spider):
        def create_grab_instance(self, **kwargs):
            grab = super(TestSpider, self).create_grab_instance(**kwargs)
            grab.setup(timeout=77)
            return grab

        def task_generator(self):
            yield Task('page', url=self.meta['server'].get_url())
            yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                         timeout=76))

        def task_page(self, grab, dummy_task):
            self.stat.collect('points', grab.config['timeout'])

    bot = build_spider(TestSpider, meta={'server': self.server})
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', grab=Grab(url=self.server.get_url(),
                                        timeout=75)))
    bot.run()
    self.assertEqual(set([77, 76, 75]),
                     set(bot.stat.collections['points']))

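# Taken together with test_update_grab_instance above, these two tests
# outline the difference between the hooks: create_grab_instance() is only
# consulted when a task carries no explicit Grab object (so the per-task
# timeouts 76 and 75 survive here), while update_grab_instance() rewrites
# every request. A minimal sketch of the defaults-only hook, assuming the
# same Spider API (spider name is illustrative):

class DefaultsSpider(Spider):
    def create_grab_instance(self, **kwargs):
        # Only used for tasks created without their own Grab object;
        # explicit per-task Grab config is left untouched.
        grab = super(DefaultsSpider, self).create_grab_instance(**kwargs)
        grab.setup(timeout=77)
        return grab

    def task_page(self, grab, task):
        pass
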
def test_basic_priority(self):
    bot = self.SimpleSpider()
    self.setup_queue(bot)
    bot.taskq.clear()
    requested_urls = {}
    for priority in (4, 2, 1, 5):
        url = SERVER.BASE_URL + '?p=%d' % priority
        requested_urls[priority] = url
        bot.add_task(Task('page', url=url, priority=priority))
    bot.run()
    urls = [x[1] for x in sorted(requested_urls.items(),
                                 key=lambda x: x[0])]
    self.assertEqual(urls, bot.url_history)

def task_speciality(self, grab, task):
    # Parse the page listing medical specialities
    for lnk in grab.doc.select(
            '//ul[@class="list-unstyled flat-list"]//a/@href'):
        url = str(task.url) + lnk.text()
        yield Task('schedule', url=url, lpu=task.lpu_obj)
    # For debugging/testing:
    #
    # g = Grab()
    # g.go(url + lnk.text())
    # self.task_schedule(g, org)
    return

def task_subcategory(self, grab, task):
    cat_description = grab.css_list(u'td.txt b')
    page_list = grab.doc.select('//div[@class="list_pages"]//a')
    cur_page = re.search(r'page=\d+', grab.response.url)
    if cur_page:
        cur_page = int(cur_page.group().replace('page=', ''))
    else:
        cur_page = 1
    if page_list.exists():
        for a in page_list:
            a_text = a.text()
            # u'Далее' is the "Next" pagination link on the scraped site
            if a_text == u'Далее':
                break
            if int(a_text) == (cur_page + 1):
                url = self.rebuild_url_to_city(a.attr('href'))
                yield Task('subcategory', url=url)
    if cat_description[1].text:
        for place in grab.css_list('div.orgl.lc a'):
            yield Task('place', url=place.get('href'))
        for place in grab.css_list('div.orgl div.lc a'):
            yield Task('place', url=place.get('href'))

def test_render_stats(self):
    class TestSpider(Spider):
        def prepare(self):
            self.stat.logging_period = 0
            self.stat.inc('foo')

        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    bot.render_stats()

def test_has_item(self):
    self.server.response['get.data'] = ContentGenerator(self.server)

    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    self.setup_cache(bot)
    bot.cache_pipeline.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', url=self.server.get_url('/foo')))
    bot.run()
    self.assertTrue(bot.cache_pipeline.cache
                    .has_item(self.server.get_url()))
    self.assertTrue(bot.cache_pipeline.cache
                    .has_item(self.server.get_url(), timeout=100))
    self.assertFalse(bot.cache_pipeline.cache
                     .has_item(self.server.get_url(), timeout=0))
    self.assertTrue(bot.cache_pipeline.cache
                    .has_item(self.server.get_url('/foo')))
    self.assertFalse(bot.cache_pipeline.cache
                     .has_item(self.server.get_url('/bar')))

def test_task_clone(self):
    bot = build_spider(SimpleSpider)
    bot.setup_queue()

    task = Task('baz', url='xxx')
    bot.add_task(task.clone())

    # Pass grab to clone
    task = Task('baz', url='xxx')
    g = Grab()
    g.setup(url='zzz')
    bot.add_task(task.clone(grab=g))

    # Pass grab_config to clone
    task = Task('baz', url='xxx')
    g = Grab()
    g.setup(url='zzz')
    bot.add_task(task.clone(grab_config=g.config))

def test_task_get_fallback_handler(self):
    class TestSpider(Spider):
        def zz(self, task):
            pass

        def task_bar_fallback(self, task):
            pass

    t1 = Task('foo', url='http://foo.com/', fallback_name='zz')
    t2 = Task('bar', url='http://foo.com/')
    t3 = Task(url='http://foo.com/')

    bot = build_spider(TestSpider)
    self.assertEqual(t1.get_fallback_handler(bot), bot.zz)
    self.assertEqual(t2.get_fallback_handler(bot), bot.task_bar_fallback)
    self.assertEqual(t3.get_fallback_handler(bot), None)

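# Both versions of this test imply the same resolution order for
# Task.get_fallback_handler(): an explicit fallback_name wins, otherwise a
# spider method named task_<name>_fallback is used, otherwise None is
# returned and no fallback runs. A minimal sketch of relying on the naming
# convention (spider and handler names are illustrative):

class FallbackSpider(Spider):
    def task_page(self, grab, task):
        pass

    def task_page_fallback(self, task):
        # Picked up automatically for failed 'page' tasks
        # when no fallback_name is given.
        pass
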
def test_task_clone_with_url_param(self):
    task = Task('baz', url='xxx')
    task.clone(url='http://yandex.ru/')

def test_task_clone_kwargs(self):
    g = build_grab()
    g.setup(url='http://foo.com/')
    task = Task('foo', grab=g, cache_timeout=1)
    task2 = task.clone(cache_timeout=2)
    self.assertEqual(2, task2.cache_timeout)

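# The clone tests above share one contract: Task.clone() copies the task,
# and keyword arguments override the stored attributes, with url= also
# propagated into the attached grab_config. A minimal sketch of reusing a
# completed task for the next page (helper name and URL are illustrative):

def example_requeue(task):
    # Same task name and attributes; new target and a fresh cache policy.
    return task.clone(url='http://foo.com/page/2', cache_timeout=0)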