" ".join("".join( li.xpath('.//p[@class="pricePerUnit"]/text()')[0]).split()), } print(products[produrl]) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(products[produrl]) print([(c, len(categories[c])) for c in categories]) for kw in keywords: searches[kw] = [] number_of_pdcts_in_kw_search = 0 search_input_box_xpath = u'//*[@id="search"]' fpath = fpath_namer(shop_id, 'search', kw, 0) if not op.exists(fpath_namer(shop_id, 'search', kw, 0)): if not driver.check_exists_by_xpath(search_input_box_xpath): # Getting back to root if search input box is not found driver.get( 'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/' ) driver.text_input(kw, search_input_box_xpath, enter=True) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) # Storing and extracting infos tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//ul[@class="productLister gridView"]/li'): produrl = li.xpath('.//h3/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl
# NOTE(review): this line is a whitespace-collapsed fragment. `li`, `produrl`,
# `ctg`, `url` and `p` are defined before the visible span, and the original
# indentation is lost, so loop nesting must be confirmed against the formatted
# source.
#
# Visible statements, in order:
#   1. products[produrl] is built with
#        'pdct_name_on_eretailer': first text node under
#            div[@class="productName"], stripped;
#        'raw_price': all text under div[@class="productCurrentPrice"]
#            concatenated, with the substring 'Now' removed.
#      'price' is then set to getprice(raw_price).
#   2. Pagination: when the file for page p + 1 of this category
#      (fpath_namer(shop_id, 'ctg', ctg, p + 1)) does not exist, the code
#      breaks if no a[@class="resultsNext"] element is found, otherwise it calls
#      driver.waitclick on that element.
#   3. Progress is printed as (ctg, url, p, len(categories[ctg])).
#   4. Keyword search: for each kw, searches[kw] is reset to []; when the search
#      page file does not exist, driver.get(root_url) is called if the
#      SimpleSearchForm_SearchTerm input is not found, and the keyword is then
#      sent to that input with enter=True.
#
# NOTE(review): no categories[ctg].append(produrl) is visible in this span
#   although len(categories[ctg]) is printed -- confirm it happens before the
#   visible part.
# NOTE(review): the visible search branch types the keyword but does not save
#   the page; presumably that follows after this fragment -- TODO confirm.
products[produrl] = { 'pdct_name_on_eretailer': li.xpath('.//div[@class="productName"]//text()')[0].strip(), 'raw_price': "".join( li.xpath('.//div[@class="productCurrentPrice"]//text()')). replace('Now', ''), } # print(products[produrl]) products[produrl]['price'] = getprice( products[produrl]['raw_price']) # print(products[produrl]) # Going to next page if need be next_page_click = '//a[@class="resultsNext"]' if not op.exists(fpath_namer(shop_id, 'ctg', ctg, p + 1)): if not driver.check_exists_by_xpath(next_page_click): break else: driver.waitclick(next_page_click) print(ctg, url, p, len(categories[ctg])) # Difficult case, where you should click a button to get on next page and send the request via the search bar for kw in keywords: searches[kw] = [] number_of_pdcts_in_kw_search = 0 search_input_box_xpath = u'//*[@id="SimpleSearchForm_SearchTerm"]' if not op.exists(fpath_namer(shop_id, 'search', kw, 0)): if not driver.check_exists_by_xpath(search_input_box_xpath): # Getting back to root if search input box is not found driver.get(root_url) driver.text_input(kw, search_input_box_xpath, enter=True)
# NOTE(review): this line is a whitespace-collapsed fragment. `fpath`, `ctg`,
# `url` and `p` are defined before the visible span, and the original
# indentation is lost, so loop nesting must be confirmed against the formatted
# source.
#
# Visible statements, in order:
#   1. The saved page at fpath is parsed and every <li> matched by
#      //div[@class="col-main-content"]//ul/li is visited.
#   2. produrl is the first href under h2[@class="product-name"]/a, replaced by
#      the first value of its `url` query parameter when that parameter exists.
#   3. products[produrl] is built with
#        'pdct_name_on_eretailer': all text under h2[@class="product-name"]
#            concatenated (not stripped);
#        'raw_price': the direct text nodes of span[@class="price"] with all
#            whitespace removed.
#      The entry is printed, 'price' is set via getprice(raw_price), the entry is
#      printed again and produrl is appended to categories[ctg].
#   4. Pagination: when the file for page p+1 does not exist, the code breaks if
#      no a[@class="next i-next"] element is found, otherwise it calls
#      driver.waitclick on it. Progress is then printed.
#   5. Keyword search: for each kw, searches[kw] is reset to [], the cache path
#      and the search URL are built, and driver.get(url) is called when the
#      cache file does not exist.
#
# NOTE(review): the file object created by open(fpath, 'rb') is never closed in
#   the visible code.
# NOTE(review): the trailing .strip() on 'raw_price' is a no-op, because the
#   joined string is built from str.split() pieces and holds no whitespace.
# NOTE(review): search_url has no {page} placeholder, so the page=0 argument to
#   .format() is ignored.
# NOTE(review): kw is inserted into the query string without URL-encoding --
#   confirm keywords never contain spaces or reserved characters.
# NOTE(review): parse_qs(urlsplit(produrl).query) is evaluated twice per
#   product.
tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//div[@class="col-main-content"]//ul/li'): produrl = li.xpath('.//h2[@class="product-name"]/a/@href')[0] produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl products[produrl] = { 'pdct_name_on_eretailer': "".join(li.xpath('.//h2[@class="product-name"]//text()')), 'raw_price': ''.join(w for t in li.xpath('.//span[@class="price"]/text()') for w in t.split()).strip(), } print(products[produrl], produrl) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(products[produrl]) categories[ctg].append(produrl) # Going to next page if need be next_page_click = '//a[@class="next i-next"]' if not op.exists(fpath_namer(shop_id, 'ctg', ctg, p+1)): if not driver.check_exists_by_xpath(next_page_click): break else: driver.waitclick(next_page_click) print(ctg, url, p, len(categories[ctg])) # KW searches Scraping - with selenium - one page per search search_url = "http://twinliquors.com/shop/catalogsearch/result/?q={kw}" for kw in keywords: searches[kw] = [] # Storing and extracting infos fpath = fpath_namer(shop_id, 'search', kw, 0) url = search_url.format(kw=kw, page=0) if not op.exists(fpath): driver.get(url)
# NOTE(review): this line is a whitespace-collapsed fragment. It starts inside a
# dict literal whose opening is not visible and ends in the middle of a
# parse_qs( call; the original indentation is lost, so nesting must be
# confirmed against the formatted source.
#
# Visible statements, in order:
#   1. Tail of a dict mapping category names to hawesko.de listing URLs
#      (presumably urls_ctgs_dict, which the loop below iterates -- TODO
#      confirm).
#   2. For each (ctg, url): categories[ctg] is reset to [] and the cache path is
#      built with fpath_namer(shop_id, 'ctg', ctg, 0).
#   3. When the cache file does not exist: driver.get(url), then up to 20
#      rounds of sleep(1.5) followed by a check for the loader button
#      (//div[@class="article list loader"]//*[@class="button loading
#      loaderbutton"]). If it exists, driver.waitclick is called on it and the
#      code sleeps 1 s; otherwise the loop breaks. driver.save_page(fpath)
#      follows; presumably it runs after the click loop, inside the
#      "file missing" branch -- TODO confirm.
#   4. The cached file is read fully into a BytesIO and parsed; each
#      div[@data-module="article"] is visited and produrl is taken from the
#      first .//div/a/@href.
#
# NOTE(review): 'still_wines' and 'white_wine' map to the same URL
#   (/weisswein) -- confirm this is intended.
# NOTE(review): the product loop uses `break` on the first article without an
#   href, so later articles are not processed; confirm `continue` was not
#   intended.
# NOTE(review): the file object from open(fpath, 'rb') is never closed in the
#   visible code, and the href xpath is evaluated twice per article.
# NOTE(review): the loop variable `k` is not used in the visible code.
'still_wines': 'https://www.hawesko.de/weisswein', 'cognac': 'https://www.hawesko.de/spirituosen', 'red_wine': 'https://www.hawesko.de/rotwein', 'white_wine': 'https://www.hawesko.de/weisswein', } # Category Scraping - with selenium - one page per category for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): driver.get(url) for k in range(20): sleep(1.5) if driver.check_exists_by_xpath( '//div[@class="article list loader"]//*[@class="button loading loaderbutton"]' ): driver.waitclick( '//div[@class="article list loader"]//*[@class="button loading loaderbutton"]' ) sleep(1) else: break driver.save_page(fpath) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//div[@data-module="article"]'): if not li.xpath('.//div/a/@href'): break produrl = li.xpath('.//div/a/@href')[0] produrl = parse_qs(