Example #1
def parse(item_links: Queue, items: Queue, prox: str):
    start = True
    # run the body at least once even if the queue has not been filled yet
    while not item_links.empty() or start:
        start = False
        item = inst['_base'] + item_links.get()
        for _ in range(inst['_retry']):
            sleep(2)
            try:
                txt = get(item,
                          headers=inst['_headers'],
                          proxies={
                              'https': prox
                          },
                          timeout=30).text
                page = fs(txt)
                res = {'url': item}
                for k, v in inst['fields'].items():
                    val = page.xpath(v['path'])
                    res[k] = v['type'](val) if val else None
                table = page.xpath(inst['table']['home'])
                for t in table:
                    for k, v in zip(t.xpath(inst['table']['title']),
                                    t.xpath(inst['table']['value'])):
                        res[k] = v

                if res['name'] is None:
                    continue
                else:
                    print(datetime.now())
                    items.put(res)
                    break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                continue
Example #2
async def crawl(sess, q):
    nxt = start_url
    url_counter = 0
    while url_counter < CRAWL_LIMIT:
        for try_count in range(RETRY_COUNT):
            try:
                p = choice(proxies)
                print(
                    f'Trying to get pagination {nxt}, attempt {try_count}, proxy: {p}')
                page = await sess.get(nxt, proxy=p)
                if page.status == 200:
                    page = fs(await page.text())
                    url_counter += 1
                    break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
        else:
            print('Connection error!')
            await q.put(DONE)
            return
        items = page.xpath(item_path)
        for i in items:
            await q.put(i)
        try:
            nxt = page.xpath(pagination_path)[0]
            print(f'Crawled {nxt}')
        except IndexError:
            print('No next pagination!')
            await q.put(DONE)
            return
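
Example #2 (and #12 below) leans on Python's for ... else: the else suite runs only when the for loop finishes without hitting break, which is exactly the "every retry failed" case. A minimal standalone sketch of the same pattern; fetch, the retry count, and the exception type here are illustrative, not taken from the examples:

def fetch_with_retries(fetch, url, retries=3):
    result = None
    for attempt in range(retries):
        try:
            result = fetch(url)
            break                      # success: skips the else suite
        except OSError as e:
            print(f'attempt {attempt} failed: {e}')
    else:
        # reached only if no attempt ever hit `break`
        raise RuntimeError(f'all {retries} attempts for {url} failed')
    return result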
Example #3
def crawl(page_links: Queue, item_links: Queue, prox: str):
    start = None
    while True:
        if start is None:
            # stop only when no pagination link is pending and the queue is drained
            if page_links.empty():
                return
            start = page_links.get()
        if not start.startswith('http'):
            start = inst['_base'] + start
        for tr in range(inst['_retry']):
            try:
                sleep(2)
                print(f'{start}    {tr+1}')
                txt = get(start,
                          proxies={
                              'https': prox
                          },
                          headers=inst['_headers'],
                          timeout=30).text
                page = fs(txt)
                items = page.xpath(inst['_tree']['item'])
                for i in items:
                    item_links.put(i)
                nxt = page.xpath(inst['_tree']['pagination'])
                start = nxt[0] if nxt else None
                break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                continue
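
Examples #1 and #3 look like the two worker halves of a thread-based pipeline: crawl drains page_links into item_links, and parse drains item_links into items. The snippets don't show how they are started, so the wiring below is only a plausible sketch; the seed path and proxy list are placeholders, and inst plus the get/fs/sleep aliases must already be defined as in the originals:

from queue import Queue
from threading import Thread

page_links, item_links, items = Queue(), Queue(), Queue()
page_links.put('/catalog?page=1')                           # hypothetical seed path
proxies = ['http://10.0.0.1:8080', 'http://10.0.0.2:8080']  # placeholder proxies

# one crawl worker and one parse worker per proxy
workers = [Thread(target=crawl, args=(page_links, item_links, p)) for p in proxies]
workers += [Thread(target=parse, args=(item_links, items, p)) for p in proxies]

for t in workers:
    t.start()
for t in workers:
    t.join()

while not items.empty():
    print(items.get())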
Example #4
async def parse(item_links: asyncio.Queue, sess: ClientSession,
                items: asyncio.Queue, prox: str):
    start = True
    while not item_links.empty() or start:
        start = False
        item = inst['_base'] + await item_links.get()
        for _ in range(inst['_retry']):
            await asyncio.sleep(1.5)
            try:
                async with sess.get(item,
                                    headers=inst['_headers'],
                                    proxy=prox,
                                    proxy_auth=auth) as resp:
                    txt = await resp.text()
                    page = fs(txt)
                    res = {'url': item}
                    for k, v in inst['fields'].items():
                        val = page.xpath(v['path'])
                        res[k] = v['type'](val) if val else None
                    table = page.xpath(inst['table']['home'])
                    for t in table:
                        for k, v in zip(t.xpath(inst['table']['title']),
                                        t.xpath(inst['table']['value'])):
                            res[k] = v

                    if res['name'] is None:
                        await item_links.put(item[len(inst['_base']):])
                    else:
                        print(datetime.now())
                        await items.put(res)
                        break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                await item_links.put(item[len(inst['_base']):])
                continue
Example #5
async def crawl(page_links: asyncio.Queue, item_links: asyncio.Queue,
                sess: ClientSession, prox: str):
    start = None
    while True:
        if start is None:
            start = await page_links.get()
        if not start.startswith('http'):
            start = inst['_base'] + start
        for tr in range(inst['_retry']):
            try:
                await asyncio.sleep(1.5)
                print(f'{start}    {tr}')
                async with sess.get(start,
                                    headers=inst['_headers'],
                                    proxy=prox,
                                    proxy_auth=auth,
                                    ssl=False) as resp:
                    page = fs(await resp.text())
                    items = page.xpath(inst['_tree']['item'])
                    for i in items:
                        await item_links.put(i)
                    nxt = page.xpath(inst['_tree']['pagination'])
                    start = nxt[0] if nxt else None
                    break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
                continue
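
Examples #4 and #5 pass proxy= and proxy_auth= to ClientSession.get; the auth object itself never appears in the snippets, but with aiohttp it would normally be an aiohttp.BasicAuth instance built from the proxy credentials. A minimal sketch of one authenticated, proxied request; the proxy host and credentials are placeholders:

import asyncio
from aiohttp import BasicAuth, ClientSession

async def fetch(url: str) -> str:
    auth = BasicAuth('user', 'secret')                      # placeholder credentials
    async with ClientSession() as sess:
        async with sess.get(url,
                            proxy='http://proxy.example:8080',  # placeholder proxy
                            proxy_auth=auth,
                            ssl=False) as resp:
            return await resp.text()

# asyncio.run(fetch('https://httpbin.org/ip'))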
Example #6
def parse(self, response):
    if self.pagination_xpath is not None:
        lext = LinkExtractor(restrict_xpaths=self.pagination_xpath)
        next_links = lext.extract_links(response)
        meta = {'dont_redirect': True}
        for link in next_links:
            yield Request(
                link.url,
                self.parse,
                meta=meta,
            )
    crawl_date = dt.now()
    links = response.xpath(self.item_xpath).extract()
    csrf_token = self.get_csrf_token(response)
    jsession_id = response.headers.getlist('Set-Cookie')[0].decode(
        'utf-8').split(';')[0]
    for link in links:
        inner = fs(link)
        url = response.urljoin(inner.xpath('//a/@href')[0])
        img = inner.xpath('//img/@src')[0]
        title = inner.xpath('//a/@title')[0]
        meta = {
            'link': url,
            'crawl_date': crawl_date,
            'img': img,
            'title': title,
            'csrf_token': csrf_token,
            'jsession_id': jsession_id,
        }
        yield Request(url, callback=self.parse_item, meta=meta)
Example #7
def get_proxy():
    page = fs(get('http://online-proxy.ru/index.html?sort=uptime').text)
    ip = page.xpath(
        '//p[text()="Список бесплатных прокси"]/following-sibling::table//tr/td[2]/text()'
    )
    port = page.xpath(
        '//p[text()="Список бесплатных прокси"]/following-sibling::table//tr/td[3]/text()'
    )
    return list(map(lambda x: 'http://' + ':'.join(x), zip(ip, port)))[:100]
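
Example #7 returns proxies in the 'http://ip:port' form that requests expects in its proxies mapping, which is the shape the threaded workers above consume. A small usage sketch; the target URL is a placeholder:

from random import choice
from requests import get

proxies = get_proxy()                         # up to 100 'http://ip:port' strings
prox = choice(proxies)
resp = get('https://example.com',             # placeholder target URL
           proxies={'http': prox, 'https': prox},
           timeout=30)
print(resp.status_code)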
Example #8
async def parse_one_extractum(session, base, url):
    res = []
    async with session.get(base + url) as response:
        page = await response.text()
        page = fs(page)
        table = page.xpath(TABLE)
        for tr in (table or [[]])[0][1:]:
            i = Item((tr.xpath('.//td[1]/text()') or [''])[0].strip(), 0,
                     (tr.xpath('./td[@nowrap][2]/text()') or [''])[0].strip(),
                     (tr.xpath('./td/a/text()') or [''])[0].strip())
            res.append(i)
        log.debug('{url} done'.format(url=url))
    return res
Example #9
async def parse_extractum(loop):
    log.info('Starting {}'.format(__file__))
    lnk = "//ul[@class='alphabet fl']/li/a/@href"
    base = 'http://aptekadoktor.com'
    start = 'http://aptekadoktor.com/availability'
    async with ClientSession() as session:
        async with session.get(start) as response:
            page = fs(await response.text())
            urls = page.xpath(lnk)
        log.info('Collected {} pages'.format(len(urls)))
        futures = [parse_one_extractum(session, base, url) for url in urls]
        write(
            reduce(lambda a, x: a + x, await asyncio.gather(*futures,
                                                            loop=loop), []))
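
Examples #9 and #10 flatten the per-page lists with functools.reduce and pass loop= to asyncio.gather; that argument was deprecated in Python 3.8 and removed in 3.10, so on a current interpreter the gather-and-flatten step would look roughly like this (a sketch, not a drop-in replacement for the write(...) calls above):

import asyncio
from itertools import chain

async def gather_and_flatten(futures):
    # gather preserves input order; chain flattens the list of per-page lists
    pages = await asyncio.gather(*futures)
    return list(chain.from_iterable(pages))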
Example #10
async def parse_vivafarm(loop):
    log.info('Starting vivafarm')
    start = r'http://vivafarm.md/124-katalog-all?id_category=124&n=75&p={}'
    last = r'//li[@id="pagination_next_bottom"]/preceding-sibling::li[1]/a/span/text()'
    items = []
    async with ClientSession() as session:
        async with session.get(start.format(1)) as response:
            page_count = int(fs(await response.text()).xpath(last)[0])
            log.info('Collected {} pages'.format(page_count))
        for i in range(1, page_count + 1):
            items += await parse_page_vivafarm(start.format(i), session)
            log.debug('Collected {} items from {} pages'.format(len(items), i))
        log.info('Collecting items finished. Collected {} items'.format(
            len(items)))
        futures = [parse_one_vivafarm(i, session, len(items)) for i in items]
        log.debug('Futures done')
        write(
            reduce(lambda a, x: a + x, await asyncio.gather(*futures,
                                                            loop=loop), []))
Example #11
async def parse_one_vivafarm(url, session, l):
    global collected
    result = []
    log.debug('Parsing {}'.format(url))
    async with session.get(url) as response:
        page = fs(await response.text())
        name = (page.xpath(NAME) or [None])[0]
        table = page.xpath(TABLE)
        for rec in table:
            i = Item(name, (rec.xpath('.//td[@data-label="Кол-во:"]//text()')
                            or [''])[0].strip(),
                     (rec.xpath('.//td[@data-label="Стоимость:"]//text()')
                      or [''])[0].strip(),
                     (rec.xpath('.//td[@data-label="Город:"]//text()')
                      or [''])[0].strip() + ', ' +
                     (rec.xpath('.//td[@data-label="Адрес:"]//text()')
                      or [''])[0].strip())
            result.append(i)
    collected += 1
    log.info('Parsed {}/{}'.format(collected, l))
    return result
Example #12
async def scrap(sess, q):
    while True:
        # fetch the next item link; the producer signals completion with DONE
        elem = await q.get()
        if elem == DONE:
            break
        for try_count in range(RETRY_COUNT):
            p = choice(proxies)
            print(f'Trying to get item {elem}, attempt {try_count}, proxy: {p}')
            try:
                page = await sess.get(base + elem, proxy=p)
                if page.status == 200:
                    page = fs(await page.text())
                    break
            except Exception as e:
                print(f'{e.__class__.__name__}: {e}')
        else:
            print('No item!')
            continue
        # collect every matched value across all configured xpaths
        res = set()
        for v in harks.values():
            res |= set(page.xpath(v))
        res = {x.strip() for x in res}
        print(f'Item {elem} scraped!')
        with open('items.json', 'a', encoding='utf-8') as f:
            dump(list(res), f, ensure_ascii=False)
            f.write('\n')
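
Example #12 is the consumer for the queue that Example #2 fills, stopping on the DONE sentinel. A hypothetical driver that runs the two against one aiohttp session; start_url, proxies, base, harks, RETRY_COUNT, CRAWL_LIMIT and DONE must already be defined as in the originals:

import asyncio
from aiohttp import ClientSession

async def main():
    q = asyncio.Queue()
    async with ClientSession() as sess:
        # crawl produces item links and finally DONE; scrap consumes until DONE
        await asyncio.gather(crawl(sess, q), scrap(sess, q))

# asyncio.run(main())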
Example #13
def htmlParser(htmlfile):
    global text2save, count
    text = cleaner.clean_html(fs(htmlfile)).text_content()
    textReplaced = text.replace("\n", " ").replace("\t", " ")
    text2save += "page" + str(count) + ',"' + cleanText(textReplaced) + '"\n'
    count += 1
Example #14
def htmlParser(htmlfile):
    text = cleaner.clean_html(fs(htmlfile)).text_content()
    textReplaced = text.replace("\n", " ").replace("\t", " ")
    text2save = cleanText(textReplaced)
    with open(pathSave, 'a', encoding='utf-8') as file2save:
        file2save.write(text2save)
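
Examples #13 and #14 depend on a module-level cleaner and the fs alias that the snippets don't show; with lxml these would typically be lxml.html.clean.Cleaner and lxml.html.fromstring (in recent lxml releases the cleaner lives in the separate lxml_html_clean package). A sketch of the assumed setup; the input file name is a placeholder and the cleanText helper is not reproduced:

from lxml.html import fromstring as fs
from lxml.html.clean import Cleaner

# drop scripts, styles and comments before extracting plain text
cleaner = Cleaner(scripts=True, style=True, comments=True)

with open('page.html', encoding='utf-8') as f:        # placeholder input file
    text = cleaner.clean_html(fs(f.read())).text_content()
print(text.replace('\n', ' ').replace('\t', ' ')[:200])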
Example #15
async def parse_page_vivafarm(url, session):
    item = r'//a[@class="product-name"]/@href'
    async with session.get(url) as response:
        page = fs(await response.text())
        return page.xpath(item)