Example #1
# KW search scraping - with selenium - page number hard-coded in the URL - multiple pages per search
search_url = "https://www.nicks.com.au/search?q={kw}&page={page}&limit=48&sort=relevance&mode=grid&type=products"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0

    for p in range(10):
        # Storing and extracting infos
        urlp = search_url.format(kw=kw, page=p + 1)

        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath):
            driver.get(urlp)
            try:
                # Wait for the product tiles to render before caching the page.
                driver.wait_for_xpath('//div[@class="product item"]',
                                      timeout=15)
            except Exception:
                pass
            driver.save_page(fpath)
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)

        # r = requests.get(urlp)
        # with open('/tmp/' + shop_id + " " + kw + '.html', 'wb') as f:
        #     f.write(r.content)
        # tree = etree.parse(BytesIO(r.content), parser=parser)

        for li in tree.xpath('//div[@class="product item"]'):
            produrl = li.xpath(
                './/div[@class="productblock-title"]/a/@href')[0]
            # Unwrap redirect links of the form ...?url=<real-url>.
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
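All of the snippets on this page assume a shared setup that is never shown: lxml parsing, URL helpers, and an fpath_namer() that maps a (shop, page type, name, page) tuple to a local cache file. A minimal sketch of that assumed scaffolding follows; fpath_namer's path scheme here is a guess, not the original.

import os.path as op
from io import BytesIO
from urllib.parse import urlsplit, parse_qs

from lxml import etree

parser = etree.HTMLParser()

def fpath_namer(shop_id, page_type, name, page):
    # Hypothetical cache-path helper; the real naming scheme is not shown.
    return '/tmp/{} {} {} {}.html'.format(shop_id, page_type, name, page)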
Example #2
    'gin': 'https://www.reservebar.com/gin',
    'red_wine': 'https://www.reservebar.com/wine/red-wine',
    'white_wine': 'https://www.reservebar.com/wine/white-wine',
    'tequila': 'https://www.reservebar.com/tequila',
    'rum': 'https://www.reservebar.com/rum',
}

# Category scraping - with selenium - one page per category
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    print(ctg, url)
    if not op.exists(fpath):
        driver.get(url)
        try:
            # Wait for the product grid to render before caching the page.
            driver.wait_for_xpath(
                '//article[contains(@class, "product-item")]', timeout=8)
        except Exception:
            pass
        driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//article[contains(@class, "product-item")]'):
        produrl = li.xpath('.//figure/a/@href')[0]
        produrl = parse_qs(
            urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                urlsplit(produrl).query) else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
            'pdct_name_on_eretailer':
            ''.join(li.xpath('.//h3/a//text()')),
            'raw_price':
            ''.join(
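Every example funnels product URLs through a clean_url(produrl, root_url) helper that is never defined. A plausible minimal version, assuming it only resolves relative links against the shop root and drops fragments so one product page maps to one dictionary key:

from urllib.parse import urljoin, urlsplit, urlunsplit

def clean_url(url, root_url):
    # Hypothetical stand-in for the undefined helper: make the product
    # link absolute and strip the fragment.
    parts = urlsplit(urljoin(root_url, url))
    return urlunsplit((parts.scheme, parts.netloc, parts.path, parts.query, ''))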
Example #3
            number_of_pdcts_in_ctg = len(set(categories[ctg]))
print([(c, len(categories[c])) for c in categories])


# KW search scraping - with selenium - page number hard-coded in the URL - multiple pages per search
search_url = "https://www.plus-de-bulles.com/browse/?q={kw}&search_param=all"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0

    # Storing and extracting infos
    urlp = search_url.format(kw=kw)
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(urlp)
        try:
            # Wait for the product boxes to render before caching the page.
            driver.wait_for_xpath('//*[@class="product-box"]', timeout=15)
        except Exception:
            pass
        driver.save_page(fpath)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)

    for li in tree.xpath('//div[@class="product-box"]'):
        produrl = li.xpath('./a/@href')[0]
        produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
            'pdct_name_on_eretailer': ' '.join(''.join(li.xpath('.//div[@class="group-first"]//text()')).split()).strip(),
            'raw_price': ' '.join(''.join(li.xpath('.//*[contains(@class,"price-box")]//span/text()')).split()).strip().split("€")[0] + '€',
            'raw_promo_price': ' '.join(''.join(li.xpath('.//span[@class="crossed"]//text()')).split()).strip(),
        }
        print(products[produrl], produrl)
        products[produrl]['price'] = getprice(products[produrl]['raw_price'])
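getprice() is another helper the examples call without defining. Since the raw prices in this example are European-style strings such as '24,90€', a rough sketch of what it might do, under that assumption:

import re

def getprice(raw_price):
    # Hypothetical parser for European-style prices such as '24,90€':
    # grab the first number and treat the comma as a decimal separator.
    match = re.search(r'\d+(?:,\d{1,2})?', raw_price)
    if match:
        return float(match.group(0).replace(',', '.'))
    return None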
Example #4
# KW search scraping - with selenium - one page per search
search_url = "https://www.waitrose.com/ecom/shop/search?&searchTerm={kw}"
for kw in keywords:
    searches[kw] = []
    # Storing and extracting infos
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    url = search_url.format(kw=kw)
    if not op.exists(fpath):
        driver.get(url)
        driver.waitclick('//*[@class="closeNoticeSomethingDifferentPopup"]', timeout=4)
        # Scroll and click "load more" until the page height stops growing.
        last_height = driver.driver.execute_script("return document.body.scrollHeight")
        while True:
            sleep(1)
            driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            driver.wait_for_xpath('//*[@data-actiontype="load"]', timeout=4, is_enabled=True)
            driver.waitclick('//*[@data-actiontype="load"]', timeout=2)
            new_height = driver.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        driver.save_page(fpath, scroll_to_bottom=True)

    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//article[@data-test="product-pod"]'):
        produrl = clean_url(li.xpath('.//a[h2]/@href')[0], root_url)
        produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
            urlsplit(produrl).query) else produrl
        products[produrl] = {
            'pdct_name_on_eretailer': ' '.join(li.xpath('.//h2//text()')),
            'raw_price': ''.join(w for t in li.xpath('.//span[@data-test="product-pod-price"]//text()') for w in t.split()).strip(),
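The scroll loop above relies on the custom wrapper's waitclick() and wait_for_xpath(). For reference, the same "scroll until the document height stops growing, clicking any load-more control on the way" pattern in plain Selenium; the XPath is reused from the example, the rest is a sketch with a placeholder URL:

from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

drv = webdriver.Chrome()
drv.get('https://www.example.com/search')  # placeholder URL

last_height = drv.execute_script('return document.body.scrollHeight')
while True:
    sleep(1)
    drv.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    try:
        drv.find_element(By.XPATH, '//*[@data-actiontype="load"]').click()
    except WebDriverException:
        pass  # no "load more" button, or not clickable yet
    new_height = drv.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break  # page stopped growing: all products are loaded
    last_height = new_height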
Example #5
saucey_was_initialised = False

# Category scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(10):
        print(ctg, p)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            if not saucey_was_initialised:
                init_saucey(driver)
                saucey_was_initialised = True
            driver.get(url.format(page=p * 60))
            driver.wait_for_xpath('//*[@itemtype="http://schema.org/Product"]',
                                  timeout=10)
            driver.smooth_scroll(sleep_time=0.3)
            driver.save_page(fpath, scroll_to_bottom=True)
        # Parsing
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'):
            produrl = "".join(li.xpath('.//a[@itemprop="url"]/@href'))
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                " ".join(''.join(
                    li.xpath('.//*[@itemprop="name"]//text()')).split()),
                'raw_price':
Example #6
    'brandy':
    'https://www.wineanthology.com/c-2-wine.aspx?term=brandy&pagenum={page}&sort=0',
    'liquor':
    'https://www.wineanthology.com/c-2-wine.aspx?term=liquor&pagenum={page}&sort=0',
}

# Category scraping - with selenium - multiple pages per category
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(100):
        urlp = url.format(page=p + 1)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.get(urlp)
            driver.wait_for_xpath(
                '//div[@id="productResults"]//div[contains(@class, "product-list-item")]//div[contains(@class, "product-hover")]',
                timeout=5)
            driver.save_page(fpath, scroll_to_bottom=True)
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath(
                '//div[@id="productResults"]//div[contains(@class, "product-list-item")]//div[contains(@class, "product-hover")]'
        ):
            produrl = li.xpath('.//a/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                ''.join(
                    li.xpath(
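None of the snippets define the driver object they use; it is evidently a thin wrapper around a Selenium WebDriver (note the driver.driver access in Example #4). A minimal sketch covering only the two methods used most often here; this is a reconstruction under that assumption, and the real wrapper apparently sometimes returns a falsy value instead of raising on timeout (Example #7 tests the result of wait_for_xpath), so this is only an approximation:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class DriverWrapper:
    # Hypothetical reconstruction of the undefined wrapper.

    def __init__(self):
        self.driver = webdriver.Chrome()

    def get(self, url):
        self.driver.get(url)

    def wait_for_xpath(self, xpath, timeout=10, is_enabled=False):
        # Wait for an element; optionally require it to be clickable.
        condition = (EC.element_to_be_clickable if is_enabled
                     else EC.presence_of_element_located)
        return WebDriverWait(self.driver, timeout).until(
            condition((By.XPATH, xpath)))

    def save_page(self, fpath, scroll_to_bottom=False):
        # Cache the rendered page source to disk for later parsing.
        if scroll_to_bottom:
            self.driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(self.driver.page_source)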
Example #7
    'https://www.davidjones.com/home-and-food/food-and-wine/wine-champagne-and-spirits/white-wine',
}

# Category scraping - with selenium - multiple pages per category (click on next page)
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        click_trials = 0
        while True:
            driver.scroll_to_bottom()
            sleep(2)
            if driver.wait_for_xpath(
                    '//a[@class="btn load-products loading-button externalLink"]'
            ):
                driver.waitclick(
                    '//a[@class="btn load-products loading-button externalLink"]'
                )
                click_trials += 1
                # Safety valve: stop after two "load more" clicks.
                if click_trials > 1:
                    break
            else:
                # No "load more" button left; all products are loaded.
                break
        driver.save_page(fpath)

    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    # for li in tree.xpath('//li[contains(@id,"WC_CatalogSearchResultDisplay")]'):
    for li in tree.xpath('//div[@class="item isUpdated"]'):
        if not li.xpath('.//figure/a/@href'):
Example #8
            number_of_pdcts_in_ctg = len(set(categories[ctg]))
print([(c, len(categories[c])) for c in categories])

# KW search scraping - with selenium - page number hard-coded in the URL - multiple pages per search
search_url = "https://www.langtons.com.au/search?p={page}&query={kw}&IncludeBuyNow=true"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0

    for p in range(10):
        # Storing and extracting infos
        fpath = fpath_namer(shop_id, 'search', kw, p)
        url = search_url.format(kw=kw, page=p + 1)
        if not op.exists(fpath):
            driver.get(url)
            driver.wait_for_xpath('//*[@ng-repeat="prod in List.Products"]')
            driver.save_page(fpath, scroll_to_bottom=True)
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for li in tree.xpath(
                '//*[@id="results"]//li[contains(@ng-repeat, "prod in List.Products")]'
        ):
            produrl = li.xpath('.//a/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                li.xpath('.//b[@class="ng-binding"]//text()')[0],
                'raw_price':
                li.xpath('.//div[@class="current-bid ng-binding"]//text()')[0],
Example #9
shop_id = "leshop"
root_url = "https://www.leshop.ch/"

####
search_box_xpath = '//*[@id="autocompleteSearchInput"]'
items_xpath = '//*[@data-ng-controller="AutocompleteSearchCtrl as controller"]//div[@class="item"]'
####

rows = []
for kw in [
        'vodka', 'champagne', 'whisky', 'sparkling', 'cognac', 'still wine'
]:
    driver.get(root_url)
    driver.text_input(kw, search_box_xpath, timeout=5)
    try:
        driver.wait_for_xpath(items_xpath, timeout=5)
    except Exception:
        # No autocomplete suggestions for this keyword; skip it.
        continue
    fpath = '/tmp/prompted ' + shop_id + ' ' + kw + '.html'
    driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for c, li in enumerate(tree.xpath(items_xpath)):
        txt = ' '.join(''.join(li.xpath('.//text()')).split())
        print(kw, shop_id)
        tmp = {'shop_id': shop_id, 'kw': kw, 'num': c, 'product': txt}
        rows.append(tmp)

df = pd.DataFrame(rows)
df.to_csv(op.join("../data_prompted", shop_id + '.csv'),
          index=False,
          sep=";")