def task_generator(self):
    for _ in six.moves.range(1111):
        yield Task('page', url=server.get_url())

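# The snippets in this section are detached spider methods. For context, a
# minimal sketch of the Grab Spider subclass such a task_generator() would
# live in. The class name, URL, and handler body are assumptions for
# illustration, not part of the original code.
from grab.spider import Spider, Task


class ExampleSpider(Spider):

    def task_generator(self):
        # Seed the queue; each yielded Task is dispatched by name to a
        # matching task_<name>() handler once its response arrives.
        yield Task('page', url='http://example.com/')

    def task_page(self, grab, task):
        # `grab` carries the fetched response; a real handler would parse it
        # and possibly yield follow-up Task objects from here.
        pass


# Usage sketch: bot = ExampleSpider(thread_number=2); bot.run()
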
def task_generator(self):
    self.done_counter = 0
    yield Task('page', url=server.get_url())

def task_generator(self):
    g = self.create_grab_instance()
    g.setup(url='http://h.wrttn.me/status/503', log_dir='log', debug=True)
    yield Task('initial', grab=g)

def task_page(self, grab, task):
    print('Start parse olx')
    for elem in grab.xpath_list(
            '//a[@class="marginright5 link linkWithHash detailsLink"]'):
        yield Task('olxpost', url=elem.get('href'))

def task_parse_items(self, grab, task):
    self.logger.info('[{}] Start: {}'.format(task.name, task.url))

    if self._check_body_errors(grab, task):
        if task.task_try_count < self.err_limit:
            self.logger.error(
                '[{}] Restart task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
            yield Task('parse_items',
                       url=task.url,
                       priority=105,
                       task_try_count=task.task_try_count + 1,
                       raw=True)
        else:
            self.logger.error(
                '[{}] Skip task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
        return

    try:
        # parse pagination numbers
        if not task.get('d_skip_page_check'):
            items = grab.doc.select('//a[contains(@href, "{}")]'.format(
                Config.get('SITE_PAGE_PARAM')))
            max_page = get_max_page(items, 1)
            self.logger.info('[{}] Found max page: {}'.format(
                task.name, max_page))

            url_gen = UrlGenerator(task.url, Config.get('SITE_PAGE_PARAM'))

            # schedule pages 2..max_page (if any)
            for p in range(2, max_page + 1):
                url = url_gen.get_page(p)
                yield Task('parse_items',
                           url=url,
                           priority=100,
                           d_skip_page_check=True,
                           raw=True)

        # parse items
        items_list = grab.doc.select(
            '//div[@class="cart_table"]/div/div/table/tbody/tr')

        for index, row in enumerate(items_list):
            try:
                # NAME
                item_name = row.select(
                    './td[1]//div[@class="description"]/div/a'
                ).text().strip()

                # UNIT
                unit = row.select('./td[2]').text().strip()
                if unit == '':
                    unit = 'ед.'

                # PRICE
                price_raw = row.select(
                    './td[6]//meta[@itemprop="lowprice"]').attr('content')
                match = Ree.float.match(price_raw)
                # check & fix
                if not match:
                    self.logger.warning(
                        '[{}] Skip item, because price is {} (line: {})'
                        .format(task.name, price_raw, index))
                    continue
                price = match.groupdict()['price'].replace(',', '.')

                # COUNT
                count = row.select('./td[5]')
                count_text = count.text().strip()

                # case 1: 'распродано' (sold out)
                if count_text == 'распродано':
                    item_count = self.const_price_on_request
                    item_place = self.const_default_place
                    # OUTPUT
                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, index, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                # case 2: 'под заказ' (made to order)
                elif count_text == 'под заказ':
                    item_count = self.const_stock_zero
                    item_place = self.const_default_place
                    # OUTPUT
                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, index, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                # case 3: per-warehouse stock table
                else:
                    count_rows = count.select(
                        './/div[@class="layer_info"]/table/tbody/tr')
                    for count_row in count_rows:
                        item_place = count_row.select(
                            './td[1]').text().strip()
                        item_count = 0
                        # add stock
                        place_count_stock = count_row.select(
                            './td[1]').text().strip()
                        if Ree.float.match(place_count_stock):
                            item_count += float(place_count_stock)
                        # add expo
                        place_count_expo = count_row.select(
                            './td[2]').text().strip()
                        if Ree.float.match(place_count_expo):
                            item_count += float(place_count_expo)

                        if item_count > 0:
                            # OUTPUT
                            self.logger.debug(
                                '[{}] Item added, index {} at url {}'
                                .format(task.name, index, task.url))
                            self.result.append({
                                'name': item_name,
                                # 3.140 -> 3.14; 3.0 -> 3
                                'count': '{0:g}'.format(item_count),
                                'unit': unit,
                                'price': price,
                                'place': item_place
                            })
            except IndexError as e:
                self.logger.warning('[{}] Skip item: {}, {}'.format(
                    task.name, type(e).__name__, task.url))
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))

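# The handler above relies on project helpers (UrlGenerator, get_max_page,
# Config, Ree) that are not shown here. A minimal sketch of what UrlGenerator
# plausibly does, rewriting the page-number query parameter of a base URL;
# this is an assumption for illustration, not the project's implementation:
try:
    from urllib.parse import urlencode, urlparse, parse_qs, urlunparse
except ImportError:  # Python 2
    from urlparse import urlparse, parse_qs, urlunparse
    from urllib import urlencode


class UrlGenerator(object):
    """Hypothetical helper: build page URLs by setting a query parameter."""

    def __init__(self, base_url, page_param):
        self.base_url = base_url
        self.page_param = page_param

    def get_page(self, number):
        # Replace (or add) the pagination parameter, keeping the rest intact.
        parts = urlparse(self.base_url)
        query = parse_qs(parts.query)
        query[self.page_param] = [str(number)]
        return urlunparse(parts._replace(query=urlencode(query, doseq=True)))
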
def task_generator(self):
    for x in xrange(2):
        yield Task('page', 'http://dumpz.org/%d/' % x)

def task_foo(self, grab, task):
    grab.setup(url=SERVER.BASE_URL)
    yield Task('bar', grab=grab)

def test_setup_proxylist(self):
    content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
    with open('/tmp/__proxy.txt', 'w') as proxy_file:
        proxy_file.write(content)

    # Simple test, one task
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                       debug=True)))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))

    # By default auto_change is True
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

    # Do the same test with the load_proxylist method
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

    # Disable auto_change
    # By default auto_init is True
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))

    # Disable auto_change
    # Disable auto_init
    # Proxylist will not be used by default
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file',
                       auto_change=False, auto_init=False)
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(self.server.request['headers'].get('host'),
                     '%s:%s' % (ADDRESS, self.server.port))
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))
    self.assertEqual(bot.stat.collections['ports'][0], self.server.port)

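# A note on the fixture above: load_proxylist(path, 'text_file') reads one
# proxy per line. To the best of my knowledge the accepted line format is
# host:port, optionally with credentials (host:port:user:password), so the
# PROXY1..PROXY3 constants are assumed to look roughly like:
#
#   127.0.0.1:8001
#   127.0.0.1:8002
#   127.0.0.1:8003
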
def task_generator(self):
    for category in CATEGORIES:
        url = category % REGION[self.city]
        addition = {'category': category}
        yield Task('collect_adv_data', url, addition=addition)

def task_generator(self):
    yield Task('page', 'http://dumpz.org/100/')
    yield Task('page', 'http://dumpz.org/101/', disable_cache=True)

def task_generator(self):
    print('//////////////task generator///////////////')
    for proxy in self.proxy_list:
        yield Task('check_proxy', url='http://ci.ua', delay=3,
                   network_try_limit=1, raw=True)

def test_setup_proxylist(self):
    content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
    with open('/tmp/__proxy.txt', 'w') as proxy_file:
        proxy_file.write(content)

    # Simple test, one task
    bot = SimpleSpider(thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                       debug=True)))
    bot.run()
    self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
    self.assertTrue(len(bot.ports) == 1)

    # By default auto_change is True
    bot = SimpleSpider(thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in xrange(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
    self.assertTrue(len(bot.ports) > 1)

    # Do the same test with the load_proxylist method
    bot = SimpleSpider(thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in xrange(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
    self.assertTrue(len(bot.ports) > 1)

    # Disable auto_change
    # By default auto_init is True
    bot = SimpleSpider(thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
    bot.setup_queue()
    for x in xrange(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
    self.assertTrue(len(bot.ports) == 1)

    # Disable auto_change
    # Disable auto_init
    # Proxylist will not be used by default
    bot = SimpleSpider(thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file',
                       auto_change=False, auto_init=False)
    bot.setup_queue()
    for x in xrange(10):
        bot.add_task(Task('baz', SERVER.BASE_URL))
    bot.run()
    self.assertEqual(SERVER.REQUEST['headers'].get('host'),
                     '%s:%s' % ('localhost', SERVER.PORT))
    self.assertTrue(len(bot.ports) == 1)
    self.assertEqual(list(bot.ports)[0], SERVER.PORT)

def data_foo(self, count):
    self.data_processed.append(count)
    if count == 1:
        yield Data('foo', count=666)
        yield Task('page', url=server.get_url(), count=count + 1)

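# data_foo() above is a data handler: when any task handler yields
# Data('foo', ...), its keyword arguments are routed here. A sketch of the
# producing side; the handler name and body are assumptions inferred from
# the count-passing convention above:
def task_page(self, grab, task):
    # The first response enters the chain with count=1; data_foo() then
    # emits one follow-up 'page' Task carrying count=2, and since it only
    # yields when count == 1, the chain stops there.
    yield Data('foo', count=task.get('count', 1))
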
def task_level_5(self, grab, task):
    """Parse a company profile card."""
    if not chek_loading(grab.response.body, 'manufacturers'):
        yield task.clone(refresh_cache=True, priority=80)
        return

    company_info = grab.doc.select('//div[@class="companyInfo"]').one()
    company = comp_db.Company()
    try:
        company.name = company_info.select('//*[@class="mt10"]').text()
    except IndexError:
        yield task.clone(refresh_cache=True)
        return

    company.url_card = task.url
    company.site = '; '.join(
        company_info.select(
            'p[contains(text(), "Homepage Address")]'
            '/following-sibling::p[1]').text_list())
    try:
        country_and_index = ''.join(
            company_info.select('text()').text_list())
    except IndexError:
        country_and_index = ''
    try:
        company.country, company.address_index = INDEX_PATTERN.search(
            country_and_index).group(1, 2)
    except AttributeError:
        company.country = country_and_index
    try:
        company.city = company_info.select(
            'p[contains(text(), "Tel")]/preceding-sibling::p[2]').text()
    except IndexError:
        pass
    try:
        company.province = company_info.select(
            'p[contains(text(), "Tel")]/preceding-sibling::p[1]').text()
    except IndexError:
        pass
    try:
        company.address = company_info.select('p[2]').text()
    except IndexError:
        pass
    try:
        company.fax = company_info.select(
            'p[contains(text(), "Fax")]').text().replace('Fax:', '').strip()
    except IndexError:
        pass
    try:
        company.tel = company_info.select(
            'p[contains(text(), "Tel")]').text().replace('Tel:', '').strip()
    except IndexError:
        pass
    try:
        company.about = grab.doc.select(
            '//div[@class="commonBox userContent"]').text().replace(
                '... more >>', '')
    except IndexError:
        pass
    try:
        company.email_img_url = company_info.select(
            'p[contains(text(), "-mail")]/img').attr('src')
        company.email_img_url = grab.make_url_absolute(
            company.email_img_url)
    except IndexError:
        pass
    try:
        company.person = grab.doc.select(
            '//div[@class="companyInfo"][2]/p[2]').text()
    except IndexError:
        pass
    try:
        company.stars = grab.doc.select(
            '//p[@class="supplierInfo_main"]/a').text()
    except IndexError:
        pass

    company.importer = 'Import' in grab.doc.select(
        '//div[@class="CoProfile"]').text(smart=True)
    company.exporter = 'Export' in grab.doc.select(
        '//div[@class="CoProfile"]').text(smart=True)

    if company.email_img_url:
        yield Task('ocr_image', url=company.email_img_url, priority=35)

    comp_db.session.add(company)
    comp_db.session.commit()

def task_initial(self, grab, task):
    yield Task('more', url=server.get_url())

def task_generator(self):
    yield Task('page', url=server.get_url(), delay=1.5, num=3)
    yield Task('page', url=server.get_url(), delay=4.5, num=2)
    yield Task('page', url=server.get_url(), delay=3, num=4)
    yield Task('page', url=server.get_url(), num=1)

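# `delay` postpones task execution by that many seconds, while unknown
# keyword arguments such as `num` simply travel along on the task object.
# A sketch of a matching handler that records completion order; the handler
# body and the done_order list (presumably initialized in prepare()) are
# assumptions:
def task_page(self, grab, task):
    # With the delays above, completion order should be num 1, 3, 4, 2.
    self.done_order.append(task.num)
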
def task_generator(self):
    yield Task('page', url=self.url)

def task_initial(self, grab, task):
    for cat in grab.css_list(u'.index_rubrics a'):
        yield Task('category', url=cat.get('href'))

def task_generator(self):
    for category in CATEGORIES:
        url = '%s%s' % (DOMEN, category)
        addition = {'category': category}
        yield Task('collect_adv_data', url, addition=addition)

def task_category(self, grab, task):
    for cat in grab.css_list(u'ul.list_rubrics li a'):
        if HeroCategory.objects.filter(hero_name=cat.text.lower()).count():
            url = self.rebuild_url_to_city(cat.get('href'))
            yield Task('subcategory', url=url)

def task_foo(self, grab, dummy_task):
    grab.setup(url=server.get_url())
    yield Task('bar', grab=grab)

def task_generator(self):
    for x in xrange(1111):
        yield Task('page', url=SERVER.BASE_URL)

def task_generator(self):
    yield Task('initial', url='')

def task_initial(self, grab, task):
    yield Task('parse', grab=grab)

def task_generator(self):
    yield Task('page', url=INVALID_URL)

def task_generator(self):
    grab = Grab()
    grab.setup(url=server.get_url(), timeout=1)
    yield Task('page', grab=grab, raw=True)

def task_generator(self):
    for query, tag in settings.QUERY_LIST:
        g = Grab()
        g.setup(url=self.build_query_url(query), content_type='xml')
        yield Task('feed', grab=g, query=query, tag=tag)

def task_generator(self):
    # pylint: disable=attribute-defined-outside-init
    self.done_counter = 0
    # pylint: enable=attribute-defined-outside-init
    yield Task('page', url=server.get_url())

def task_generator(self):
    grab = Grab(url=server.get_url(), timeout=1)
    yield Task('page', grab=grab)

def task_initial(self, grab, task):
    items = grab.xpath_list('//h5[@class]')
    for item in items:
        link = item.getparent()
        url = 'http://www.immobilienscout24.de' + link.attrib['href']
        self.add_task(Task(name='get_data', url=url))

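# Unlike the generator-style `yield Task(...)` used by most snippets above,
# self.add_task() pushes a task onto the queue imperatively, which also works
# outside of handlers. Inside a handler the two forms are interchangeable;
# the loop body above could equally be written as:
#
#     yield Task(name='get_data', url=url)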