def parse(item_links: Queue, items: Queue, prox: str):
    # Consume item links, fetch each page through the given proxy and push a
    # parsed record onto the items queue.
    start = True
    while not item_links.empty() or start:
        start = False
        item = inst['_base'] + item_links.get()
        for _ in range(inst['_retry']):
            sleep(2)
            try:
                txt = get(item,
                          headers=inst['_headers'],
                          proxies={'https': prox},
                          timeout=30).text
                page = fs(txt)
                res = {'url': item}
                for k, v in inst['fields'].items():
                    val = page.xpath(v['path'])
                    res[k] = v['type'](val) if val else None
                table = page.xpath(inst['table']['home'])
                for t in table:
                    for k, v in zip(t.xpath(inst['table']['title']),
                                    t.xpath(inst['table']['value'])):
                        res[k] = v
                if res['name'] is None:
                    continue  # page looked empty, retry
                print(datetime.now())
                items.put(res)
                break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                continue
async def crawl(sess, q):
    # Walk the pagination chain, pushing item links onto the queue; DONE tells
    # the consumer to stop.
    nxt = start_url
    url_counter = 0
    while url_counter < CRAWL_LIMIT:
        for try_count in range(RETRY_COUNT):
            try:
                p = choice(proxies)
                print(f'Trying to get pagination {nxt}, attempt {try_count}, proxy: {p}')
                page = await sess.get(nxt, proxy=p)
                if page.status == 200:
                    page = fs(await page.text())
                    url_counter += 1
                    break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
        else:
            # Every attempt failed.
            print('Connection error!')
            await q.put(DONE)
            return
        items = page.xpath(item_path)
        for i in items:
            await q.put(i)
        try:
            nxt = page.xpath(pagination_path)[0]
            print(f'Crawled {nxt}')
        except IndexError:
            print('No next pagination!')
            await q.put(DONE)
            return
def crawl(page_links: Queue, item_links: Queue, prox: str):
    # Pull seed pages from page_links, push item links, and follow pagination;
    # stop only when there is no next page and no seeds are left.
    start = None
    while True:
        if start is None:
            if page_links.empty():
                return
            start = page_links.get()
        if not start.startswith('http'):
            start = inst['_base'] + start
        for tr in range(inst['_retry']):
            try:
                sleep(2)
                print(f'{start} {tr + 1}')
                txt = get(start,
                          proxies={'https': prox},
                          headers=inst['_headers'],
                          timeout=30).text
                page = fs(txt)
                items = page.xpath(inst['_tree']['item'])
                for i in items:
                    item_links.put(i)
                nxt = page.xpath(inst['_tree']['pagination'])
                start = nxt[0] if nxt else None
                break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                continue
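# A minimal wiring sketch for the thread-based crawl/parse pair above. All
# names other than crawl/parse are assumptions: `inst` (the config dict with
# '_base', '_retry', '_headers', '_tree', 'fields', 'table') and the proxy
# list are expected to come from elsewhere in the project.
from queue import Queue
from threading import Thread

def run_threaded(seed_pages, proxies):
    page_links, item_links, items = Queue(), Queue(), Queue()
    for seed in seed_pages:
        page_links.put(seed)
    # Crawl first so item_links is populated before the parsers check it.
    crawlers = [Thread(target=crawl, args=(page_links, item_links, p))
                for p in proxies]
    for t in crawlers:
        t.start()
    for t in crawlers:
        t.join()
    parsers = [Thread(target=parse, args=(item_links, items, p))
               for p in proxies]
    for t in parsers:
        t.start()
    for t in parsers:
        t.join()
    results = []
    while not items.empty():
        results.append(items.get())
    return results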
async def parse(item_links: asyncio.Queue, sess: ClientSession,
                items: asyncio.Queue, prox: str):
    # Async twin of the threaded parser: fetch each item page through the
    # proxy, build a record from the configured fields and the detail table.
    start = True
    while not item_links.empty() or start:
        start = False
        item = inst['_base'] + await item_links.get()
        for _ in range(inst['_retry']):
            await asyncio.sleep(1.5)
            try:
                async with sess.get(item,
                                    headers=inst['_headers'],
                                    proxy=prox,
                                    proxy_auth=auth) as resp:
                    txt = await resp.text()
                page = fs(txt)
                res = {'url': item}
                for k, v in inst['fields'].items():
                    val = page.xpath(v['path'])
                    res[k] = v['type'](val) if val else None
                table = page.xpath(inst['table']['home'])
                for t in table:
                    for k, v in zip(t.xpath(inst['table']['title']),
                                    t.xpath(inst['table']['value'])):
                        res[k] = v
                if res['name'] is None:
                    # Empty page: put the relative link back for another pass.
                    await item_links.put(item[len(inst['_base']):])
                else:
                    print(datetime.now())
                    await items.put(res)
                break
            except Exception:
                # On any error, re-queue the relative link and retry.
                await item_links.put(item[len(inst['_base']):])
                continue
async def crawl(page_links: asyncio.Queue, item_links: asyncio.Queue,
                sess: ClientSession, prox: str):
    # Async twin of the threaded crawler: follow pagination, pushing item
    # links onto the queue; pull a new seed when the chain runs out.
    start = None
    while True:
        if start is None:
            start = await page_links.get()
        if not start.startswith('http'):
            start = inst['_base'] + start
        for tr in range(inst['_retry']):
            try:
                await asyncio.sleep(1.5)
                print(f'{start} {tr}')
                async with sess.get(start,
                                    headers=inst['_headers'],
                                    proxy=prox,
                                    proxy_auth=auth,
                                    ssl=False) as resp:
                    page = fs(await resp.text())
                items = page.xpath(inst['_tree']['item'])
                for i in items:
                    await item_links.put(i)
                nxt = page.xpath(inst['_tree']['pagination'])
                start = nxt[0] if nxt else None
                break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                continue
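# A minimal asyncio wiring sketch for the coroutine pair above (hypothetical
# driver; assumes `inst`, the aiohttp `auth` object and a proxy list are
# defined elsewhere, and gives the crawlers a head start because parse()
# returns as soon as the link queue is momentarily empty).
import asyncio
from aiohttp import ClientSession

async def run_async(seed_pages, proxies):
    page_links, item_links, items = asyncio.Queue(), asyncio.Queue(), asyncio.Queue()
    for seed in seed_pages:
        await page_links.put(seed)
    async with ClientSession() as sess:
        crawlers = [asyncio.ensure_future(crawl(page_links, item_links, sess, p))
                    for p in proxies]
        await asyncio.sleep(10)  # let the crawlers fill item_links first
        await asyncio.gather(*[parse(item_links, sess, items, p) for p in proxies])
        for c in crawlers:
            c.cancel()  # crawl() loops forever, so stop it explicitly
    return [items.get_nowait() for _ in range(items.qsize())]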
def parse(self, response):
    # Follow pagination links (if configured), then schedule a request for
    # every item card found on the listing page.
    if self.pagination_xpath is not None:
        lext = LinkExtractor(restrict_xpaths=self.pagination_xpath)
        meta = {'dont_redirect': True}
        for link in lext.extract_links(response):
            yield Request(link.url, self.parse, meta=meta)
    crawl_date = dt.now()
    links = response.xpath(self.item_xpath).extract()
    csrf_token = self.get_csrf_token(response)
    jsession_id = response.headers.getlist('Set-Cookie')[0].decode(
        'utf-8').split(';')[0]
    for link in links:
        inner = fs(link)
        url = response.urljoin(inner.xpath('//a/@href')[0])
        img = inner.xpath('//img/@src')[0]
        title = inner.xpath('//a/@title')[0]
        meta = {
            'link': url,
            'crawl_date': crawl_date,
            'img': img,
            'title': title,
            'csrf_token': csrf_token,
            'jsession_id': jsession_id,
        }
        yield Request(url, callback=self.parse_item, meta=meta)
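# Sketch of the spider class expected to host the parse() callback above.
# Everything here is illustrative (name, URLs, XPaths, the csrf helper); the
# real project defines its own pagination_xpath, item_xpath, get_csrf_token
# and parse_item.
from scrapy import Spider

class CatalogSpider(Spider):
    name = 'catalog'                                  # hypothetical
    start_urls = ['https://example.com/catalog']      # hypothetical
    pagination_xpath = '//a[@rel="next"]'             # or None to skip pagination
    item_xpath = '//div[@class="product-card"]'       # each match is re-parsed with fs()

    def get_csrf_token(self, response):
        # Placeholder; the original project extracts the token its own way.
        return response.xpath('//input[@name="csrf_token"]/@value').get()

    def parse_item(self, response):
        ...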
def get_proxy():
    # Scrape up to 100 free HTTP proxies (ip:port) from online-proxy.ru.
    page = fs(get('http://online-proxy.ru/index.html?sort=uptime').text)
    ip = page.xpath(
        '//p[text()="Список бесплатных прокси"]'
        '/following-sibling::table//tr/td[2]/text()')
    port = page.xpath(
        '//p[text()="Список бесплатных прокси"]'
        '/following-sibling::table//tr/td[3]/text()')
    return ['http://' + ':'.join(pair) for pair in zip(ip, port)][:100]
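# Example use of get_proxy(): the workers above pick a random proxy per
# attempt, either as an aiohttp `proxy=` string or a requests `proxies=` dict.
from random import choice

proxies = get_proxy()                 # up to 100 'http://ip:port' strings
prox = choice(proxies)
requests_style = {'https': prox}      # shape used by the threaded workers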
async def parse_one_extractum(session, base, url):
    # Parse one availability page into Item records (header row skipped).
    res = []
    async with session.get(base + url) as response:
        page = await response.text()
    page = fs(page)
    table = page.xpath(TABLE)
    for tr in (table or [[]])[0][1:]:
        i = Item((tr.xpath('.//td[1]/text()') or [''])[0].strip(),
                 0,
                 (tr.xpath('./td[@nowrap][2]/text()') or [''])[0].strip(),
                 (tr.xpath('./td/a/text()') or [''])[0].strip())
        res.append(i)
    log.debug('{url} done'.format(url=url))
    return res
async def parse_extractum(loop):
    log.info('Starting {}'.format(__file__))
    lnk = "//ul[@class='alphabet fl']/li/a/@href"
    base = 'http://aptekadoktor.com'
    start = 'http://aptekadoktor.com/availability'
    async with ClientSession() as session:
        async with session.get(start) as response:
            page = fs(await response.text())
        urls = page.xpath(lnk)
        log.info('Collected {} pages'.format(len(urls)))
        futures = [parse_one_extractum(session, base, url) for url in urls]
        # gather(loop=...) targets pre-3.10 asyncio.
        write(reduce(lambda a, x: a + x,
                     await asyncio.gather(*futures, loop=loop),
                     []))
async def parse_vivafarm(loop):
    log.info('Starting vivafarm')
    start = r'http://vivafarm.md/124-katalog-all?id_category=124&n=75&p={}'
    last = r'//li[@id="pagination_next_bottom"]/preceding-sibling::li[1]/a/span/text()'
    items = []
    async with ClientSession() as session:
        async with session.get(start.format(1)) as response:
            page_count = int(fs(await response.text()).xpath(last)[0])
        log.info('Collected {} pages'.format(page_count))
        for i in range(1, page_count + 1):
            items += await parse_page_vivafarm(start.format(i), session)
            log.debug('From {} pages collected {} items'.format(i, len(items)))
        log.info('Collecting items finished. Collected {} items'.format(
            len(items)))
        futures = [parse_one_vivafarm(i, session, len(items)) for i in items]
        log.debug('Futures done')
        write(reduce(lambda a, x: a + x,
                     await asyncio.gather(*futures, loop=loop),
                     []))
async def parse_one_vivafarm(url, session, l):
    # Parse a single product page into Item records (one per pharmacy row).
    global collected
    result = []
    log.debug('Parsing {}'.format(url))
    async with session.get(url) as response:
        page = fs(await response.text())
    name = (page.xpath(NAME) or [None])[0]
    table = page.xpath(TABLE)
    for rec in table:
        i = Item(name,
                 (rec.xpath('.//td[@data-label="Кол-во:"]//text()') or [''])[0].strip(),
                 (rec.xpath('.//td[@data-label="Стоимость:"]//text()') or [''])[0].strip(),
                 (rec.xpath('.//td[@data-label="Город:"]//text()') or [''])[0].strip()
                 + ', '
                 + (rec.xpath('.//td[@data-label="Адрес:"]//text()') or [''])[0].strip())
        result.append(i)
    collected += 1
    log.info('Parsed {}/{}'.format(collected, l))
    return result
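# Hypothetical shapes of the helpers the pharmacy parsers rely on: a four-field
# Item record and a write() sink. The field names and the CSV output are
# assumptions for illustration, not the project's real definitions.
import csv
import logging
from collections import namedtuple

log = logging.getLogger(__name__)
Item = namedtuple('Item', ['name', 'quantity', 'price', 'address'])

def write(items, path='items.csv'):
    with open(path, 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(items)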
async def scrap(sess, q):
    # Consume item links until the DONE sentinel, scrape each item's
    # characteristics and append them to items.json (one JSON array per line).
    while True:
        elem = await q.get()
        if elem == DONE:
            break
        for try_count in range(RETRY_COUNT):
            p = choice(proxies)
            print(f'Trying to get item {elem}, attempt {try_count}, proxy: {p}')
            try:
                page = await sess.get(base + elem, proxy=p)
                if page.status == 200:
                    page = fs(await page.text())
                    break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
        else:
            # Every attempt failed; move on to the next item.
            print('No item!')
            continue
        res = set()
        for v in harks.values():
            res |= {x.strip() for x in page.xpath(v)}
        print(f'Item {elem} scraped!')
        with open('items.json', 'a', encoding='utf-8') as f:
            dump(list(res), f, ensure_ascii=False)
            f.write('\n')
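# Hypothetical driver for the crawl/scrap pair above: crawl() feeds item links
# into the queue and puts DONE when pagination ends, scrap() drains it. The
# module-level names they use (start_url, base, proxies, harks, item_path,
# pagination_path, DONE, CRAWL_LIMIT, RETRY_COUNT) are assumed to exist.
import asyncio
from aiohttp import ClientSession

async def main():
    q = asyncio.Queue()
    async with ClientSession() as sess:
        await asyncio.gather(crawl(sess, q), scrap(sess, q))

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())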
def htmlParser(htmlfile):
    # Accumulate the cleaned page text as a CSV row ("pageN","text").
    global text2save, count
    text = cleaner.clean_html(fs(htmlfile)).text_content()
    textReplaced = text.replace("\n", " ").replace("\t", " ")
    text2save += "page" + str(count) + ',"' + cleanText(textReplaced) + '"\n'
    count += 1
def htmlParser(htmlfile):
    # Variant that appends the cleaned page text straight to pathSave.
    text = cleaner.clean_html(fs(htmlfile)).text_content()
    textReplaced = text.replace("\n", " ").replace("\t", " ")
    text2save = cleanText(textReplaced)
    with open(pathSave, 'a', encoding='utf-8') as file2save:
        file2save.write(text2save)
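# The two htmlParser variants above assume an lxml Cleaner instance and a
# cleanText() helper. A minimal sketch of those assumptions; the whitespace
# collapsing and quote escaping here are guesses, not the original logic.
import re
from lxml.html.clean import Cleaner

cleaner = Cleaner(scripts=True, style=True, comments=True)

def cleanText(text):
    return re.sub(r'\s+', ' ', text).replace('"', '""').strip()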
async def parse_page_vivafarm(url, session):
    # Return the product links found on one catalogue page.
    item = r'//a[@class="product-name"]/@href'
    async with session.get(url) as response:
        page = fs(await response.text())
    return page.xpath(item)
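# Hypothetical entry point for the pharmacy parsers above; both expect an
# explicit event loop because they pass loop= to asyncio.gather (pre-3.10 API).
import asyncio

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(parse_extractum(loop))
    loop.run_until_complete(parse_vivafarm(loop))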