# KW searches Scraping - with selenium - with nb page hard-coded in url - multiple page per search search_url = "https://www.nicks.com.au/search?q={kw}&page={page}&limit=48&sort=relevance&mode=grid&type=products" for kw in keywords: searches[kw] = [] number_of_pdcts_in_kw_search = 0 for p in range(10): # Storing and extracting infos urlp = search_url.format(kw=kw, page=p + 1) fpath = fpath_namer(shop_id, 'search', kw, p) if not op.exists(fpath): driver.get(urlp) try: driver.wait_for_xpath('//div[@class="product item"]', timeout=15) except: pass driver.save_page(fpath) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) # r = requests.get(urlp) # with open('/tmp/' + shop_id + " " + kw + '.html', 'wb') as f: # f.write(r.content) # tree = etree.parse(BytesIO(r.content), parser=parser) for li in tree.xpath('//div[@class="product item"]'): produrl = li.xpath( './/div[@class="productblock-title"]/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
'gin': 'https://www.reservebar.com/gin', 'red_wine': 'https://www.reservebar.com/wine/red-wine', 'white_wine': 'https://www.reservebar.com/wine/white-wine', 'tequila': 'https://www.reservebar.com/tequila', 'rum': 'https://www.reservebar.com/rum', } # Category Scraping - with selenium - one page per category for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) print(ctg, url) if not op.exists(fpath): driver.get(url) try: driver.wait_for_xpath( '//article[contains(@class, "product-item")]', timeout=8) except: pass driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//article[contains(@class, "product-item")]'): produrl = li.xpath('.//figure/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': ''.join(li.xpath('.//h3/a//text()')), 'raw_price': ''.join(
# Summary of the category scrape above: unique-product count per category.
number_of_pdcts_in_ctg = len(set(categories[ctg]))
print([(c, len(categories[c])) for c in categories])

# KW searches Scraping - with selenium - one page per search
search_url = "https://www.plus-de-bulles.com/browse/?q={kw}&search_param=all"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    # Storing and extracting infos
    urlp = search_url.format(kw=kw)
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(urlp)
        try:
            # BUGFIX: the original xpath '//*[class="product-box"]' is missing
            # the '@' — it matched a child <class> element rather than the
            # class attribute, so this wait always timed out. Use the same
            # attribute predicate the parsing loop below relies on.
            driver.wait_for_xpath('//div[@class="product-box"]', timeout=15)
        except:
            pass  # best-effort wait: save whatever has loaded so far
        driver.save_page(fpath)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//div[@class="product-box"]'):
        produrl = li.xpath('./a/@href')[0]
        # Some product links are redirect wrappers carrying the real target
        # in a ?url= query parameter; unwrap when present.
        qs = parse_qs(urlsplit(produrl).query)
        produrl = qs['url'][0] if 'url' in qs else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
            'pdct_name_on_eretailer': ' '.join(''.join(li.xpath('.//div[@class="group-first"]//text()')).split()).strip(),
            'raw_price': ' '.join(''.join(li.xpath('.//*[contains(@class,"price-box")]//span/text()')).split()).strip().split("€")[0] + '€',
            'raw_promo_price': ' '.join(''.join(li.xpath('.//span[@class="crossed"]//text()')).split()).strip(),
        }
        print(products[produrl], produrl)
        products[produrl]['price'] = getprice(products[produrl]['raw_price'])
# KW searches Scraping - with selenium - one page per search search_url = "https://www.waitrose.com/ecom/shop/search?&searchTerm={kw}" for kw in keywords: searches[kw] = [] # Storing and extracting infos fpath = fpath_namer(shop_id, 'search', kw, 0) url = search_url.format(kw=kw, page=0) if not op.exists(fpath): driver.get(url) driver.waitclick('//*[@class="closeNoticeSomethingDifferentPopup"]', timeout=4) last_height = driver.driver.execute_script("return document.body.scrollHeight") while True: sleep(1) driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") driver.wait_for_xpath('//*[@data-actiontype="load"]', timeout=4, is_enabled=True) driver.waitclick('//*[@data-actiontype="load"]', timeout=2) new_height = driver.driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//article[@data-test="product-pod"]'): produrl = clean_url(li.xpath('.//a[h2]/@href')[0], root_url) produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl products[produrl] = { 'pdct_name_on_eretailer': ' '.join(li.xpath('.//h2//text()')), 'raw_price': ''.join(w for t in li.xpath('.//span[@data-test="product-pod-price"]//text()') for w in t.split()).strip(),
saucey_was_initialised = False # Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(10): print(ctg, p) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): if not saucey_was_initialised: init_saucey(driver) saucey_was_initialised = True driver.get(url.format(page=p * 60)) driver.wait_for_xpath('//*[@itemtype="http://schema.org/Product"]', timeout=10) driver.smooth_scroll(sleep_time=0.3) driver.save_page(fpath, scroll_to_bottom=True) # Parsing tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'): produrl = "".join(li.xpath('.//a[@itemprop="url"]/@href')) produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': " ".join(''.join( li.xpath('.//*[@itemprop="name"]//text()')).split()), 'raw_price':
'https://www.wineanthology.com/c-2-wine.aspx?term=brandy&pagenum={page}&sort=0', 'liquor': 'https://www.wineanthology.com/c-2-wine.aspx?term=liquor&pagenum={page}&sort=0', } # Category Scraping - with requests - multiple pages per category for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): urlp = url.format(page=p + 1) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(urlp) driver.wait_for_xpath( '//div[@id="productResults"]//div[contains(@class, "product-list-item")]//div[contains(@class, "product-hover")]', timeout=5) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath( '//div[@id="productResults"]//div[contains(@class, "product-list-item")]//div[contains(@class, "product-hover")]' ): produrl = li.xpath('.//a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': ''.join( li.xpath(
'https://www.davidjones.com/home-and-food/food-and-wine/wine-champagne-and-spirits/white-wine', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): driver.get(url) click_trials = 0 while True: driver.scroll_to_bottom() sleep(2) if driver.wait_for_xpath( '//a[@class="btn load-products loading-button externalLink"]' ): driver.waitclick( '//a[@class="btn load-products loading-button externalLink"]' ) click_trials += 1 if click_trials > 1: break else: break driver.save_page(fpath) tree = etree.parse(open(fpath, 'rb'), parser=parser) # for li in tree.xpath('//li[contains(@id,"WC_CatalogSearchResultDisplay")]'): for li in tree.xpath('//div[@class="item isUpdated"]'): if not li.xpath('.//figure/a/@href'):
number_of_pdcts_in_ctg = len(set(categories[ctg])) print([(c, len(categories[c])) for c in categories]) # KW searches Scraping - with selenium - with nb page hard-coded in url - multiple page per search search_url = "https://www.langtons.com.au/search?p={page}&query={kw}&IncludeBuyNow=true" for kw in keywords: searches[kw] = [] number_of_pdcts_in_kw_search = 0 for p in range(10): # Storing and extracting infos fpath = fpath_namer(shop_id, 'search', kw, p) url = search_url.format(kw=kw, page=p + 1) if not op.exists(fpath): driver.get(url) driver.wait_for_xpath('//*[@ng-repeat="prod in List.Products"]') driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath( '//*[@id="results"]//li[contains(@ng-repeat, "prod in List.Products")]' ): produrl = li.xpath('.//a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': li.xpath('.//b[@class="ng-binding"]//text()')[0], 'raw_price': li.xpath('.//div[@class="current-bid ng-binding"]//text()')[0],
# Scrape leshop.ch autocomplete ("prompted") suggestions for a fixed keyword
# list and dump them to ../data_prompted/leshop.csv (";"-separated).
shop_id = "leshop"
root_url = "https://www.leshop.ch/"

####
search_box_xpath = '//*[@id="autocompleteSearchInput"]'
items_xpath = '//*[@data-ng-controller="AutocompleteSearchCtrl as controller"]//div[@class="item"]'
####

l = []
for kw in ['vodka', 'champagne', 'whisky', 'sparkling', 'cognac', 'still wine']:
    driver.get(root_url)
    # Type the keyword into the search box to trigger the autocomplete widget.
    driver.text_input(kw, search_box_xpath, timeout=5)
    try:
        driver.wait_for_xpath(items_xpath, timeout=5)
    except Exception:
        # BUGFIX: narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). No suggestions appeared — skip kw.
        continue
    fpath = '/tmp/prompted ' + shop_id + ' ' + kw + '.html'
    driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for c, li in enumerate(tree.xpath(items_xpath)):
        # Collapse all whitespace in the suggestion's text content.
        txt = ' '.join(''.join(li.xpath('.//text()')).split())
        print(kw, shop_id)
        tmp = {'shop_id': shop_id, 'kw': kw, 'num': c, 'product': txt}
        l.append(tmp)

# BUGFIX: DataFrame.to_csv returns None, so the original
# `df = pd.DataFrame(l).to_csv(...)` bound None to `df`.
# Build the frame first, then write it out.
df = pd.DataFrame(l)
df.to_csv(op.join("../data_prompted", shop_id + '.csv'), index=None, sep=";")