示例#1
0
# Category scraping: for every category, request listing pages 1..20, keep a
# local copy of each page at the path given by fpath_namer, and parse the
# product tiles out of that local copy.
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(20):
        # The URL template receives page numbers 1..20 (p is 0-based).
        urlp = url.format(page=p + 1)

        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        print(fpath, p, urlp)
        # Only drive the browser when no saved copy of this page exists yet.
        if not op.exists(fpath):
            driver.get(urlp)
            driver.save_page(fpath, scroll_to_bottom=True)
        # Read through a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(fpath, 'rb') as page_file:
            tree = etree.parse(BytesIO(page_file.read()), parser=parser)

        for li in tree.xpath('//div[@id="venta-main"]/div'):
            produrl = li.xpath('.//a/@href')[0]
            # When the href carries a ``url`` query parameter, its first value
            # is used as the product URL; otherwise the href is kept as is.
            # The query string is parsed once and reused for both steps.
            query_params = parse_qs(urlsplit(produrl).query)
            if 'url' in query_params:
                produrl = query_params['url'][0]
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                ' '.join(''.join(li.xpath('.//h2/a/text()')[:1]).split()),
                'raw_price':
                ' '.join(''.join(
                    li.xpath(
                        './/div[@class="wineblock-leftprice"]//*[@class="uc-price"]//text()'
示例#2
0
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/rum-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
    'tequila':
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/tequila-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
    'liquor':
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/liqueurs---speciality-spirits#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
}

# Categories scraping: one listing page per category (index 0), saved locally
# at the path given by fpath_namer and parsed from that local copy.
for ctg, url in urls_ctgs_dict.items():
    print(ctg, url)
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    # Only drive the browser when no saved copy of this page exists yet.
    if not op.exists(fpath):
        driver.get(url)
        sleep(1)
        driver.save_page(fpath, scroll_to_bottom=True)
    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(fpath, 'rb') as page_file:
        tree = etree.parse(BytesIO(page_file.read()), parser=parser)
    for li in tree.xpath('//ul[@class="productLister gridView"]/li'):
        produrl = li.xpath('.//h3/a/@href')[0]
        # When the href carries a ``url`` query parameter, its first value is
        # used as the product URL; otherwise the href is kept as is. The
        # query string is parsed once and reused for both steps.
        query_params = parse_qs(urlsplit(produrl).query)
        if 'url' in query_params:
            produrl = query_params['url'][0]
        categories[ctg].append(produrl)
        products[produrl] = {
            'pdct_name_on_eretailer':
            " ".join("".join(
                li.xpath(
                    './/div[@class="productNameAndPromotions"]//h3//text()')).
                     split()),
            'raw_price':
            " ".join("".join(
示例#3
0
            try:
                driver.waitclick('//div[@class="mod_product_list__more"]/a',
                                 timeout=5,
                                 silent=True)
            except:
                pass
            # Wait to load page
            sleep(2)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.driver.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        driver.save_page(fpath)

    # Parse the saved page. Reading through a context manager closes the file
    # handle deterministically instead of leaving it to the garbage collector.
    with open(fpath, 'rb') as page_file:
        tree = etree.parse(BytesIO(page_file.read()), parser=parser)

    for li in tree.xpath('//li[@class="mod_product_list__item"]'):
        produrl = li.xpath('.//h3/a/@href')[0]
        # When the href carries a ``url`` query parameter, its first value is
        # used as the product URL; otherwise the href is kept as is. The
        # query string is parsed once and reused for both steps.
        query_params = parse_qs(urlsplit(produrl).query)
        if 'url' in query_params:
            produrl = query_params['url'][0]
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
            'pdct_name_on_eretailer':
            ' '.join(''.join(li.xpath('.//h3/a//text()')).split()),
            'ctg_denom_txt':
            ' '.join(''.join(li.xpath('.//h3/a//text()')).split()),
            'raw_price':