        # Break or change pages (stop when a page adds no new products)
        if len(set(categories[ctg])) == number_of_pdcts_in_ctg:
            break
        else:
            number_of_pdcts_in_ctg = len(set(categories[ctg]))

print(categories)

# KW searches scraping - with selenium - page number hard-coded in url - multiple pages per search
search_url = "https://www.bodeboca.com/"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(search_url)
        sleep(2)
        driver.text_input(kw, '//*[@id="bodeboca-search-box"]/input')
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=False)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//div[@id="search-results-main"]/div'):
        if not li.xpath('.//a/@href'):
            continue
        if not ' '.join(''.join(li.xpath('.//h2/a/text()')[:1]).split()):
            continue
        produrl = li.xpath('.//a/@href')[0]
        produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
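# The parse_qs/urlsplit unwrapping of redirect hrefs above recurs in several of
# these scrapers; a small helper would keep it in one place. Hypothetical
# refactor sketch, not part of the original scripts:
from urllib.parse import parse_qs, urlsplit

def unwrap_redirect_url(produrl):
    """Return the real target of a redirect href like '/track?url=<real-url>',
    or the href unchanged when it carries no 'url' query parameter."""
    qs = parse_qs(urlsplit(produrl).query)
    return qs['url'][0] if 'url' in qs else produrl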
            print(products[produrl])
    print([(c, len(categories[c])) for c in categories])

for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    search_input_box_xpath = u'//*[@id="search"]'
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        if not driver.check_exists_by_xpath(search_input_box_xpath):
            # Getting back to root if the search input box is not found
            driver.get('https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/')
        driver.text_input(kw, search_input_box_xpath, enter=True)
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=True)
    # Storing and extracting info
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//ul[@class="productLister gridView"]/li'):
        produrl = li.xpath('.//h3/a/@href')[0]
        produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
        searches[kw].append(produrl)
        products[produrl] = {
            'pdct_name_on_eretailer': " ".join("".join(li.xpath(
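# fpath_namer is assumed to build deterministic cache paths, which is what lets
# every block here skip pages that were already saved to disk. A minimal sketch
# consistent with its call sites (shop_id, kind, name, page) - the directory
# layout is an assumption:
import os.path as op

def fpath_namer(shop_id, kind, name, page=0):
    # e.g. /tmp/scraped/sainsburys/search_vodka_0.html
    return op.join('/tmp/scraped', shop_id, '{}_{}_{}.html'.format(kind, name, page))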
# # KW searches scraping #############
######################################
# KW searches scraping - with selenium - one page per search
driver.get('https://www.aeondewine.com')
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    # If files exist, don't scrape
    perform_scraping = not op.exists(fpath_namer(shop_id, 'search', kw, 0))
    for p in range(5):
        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath) and perform_scraping:
            driver.text_input(kw, '//input[@id="keyword"]', enter=True)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        searches, products = kw_parsing(fpath, kw, searches, products)
        # Break or change pages
        if number_of_pdcts_in_kw_search == len(searches[kw]):
            print("Finished, because no more new products")
            break
        else:
            number_of_pdcts_in_kw_search = len(searches[kw])
        if not perform_scraping and not op.exists(fpath_namer(shop_id, 'search', kw, p + 1)):
            print("Finished, because no more new products")
            break
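# kw_parsing is assumed to parse one saved search page and update the two
# accumulators; a rough sketch of that contract (the XPath is a placeholder,
# not the real aeondewine selector):
from lxml import etree

def kw_parsing(fpath, kw, searches, products):
    tree = etree.parse(open(fpath, 'rb'), parser=etree.HTMLParser())
    for produrl in tree.xpath('//div[@class="product"]//a/@href'):  # placeholder XPath
        if produrl not in searches[kw]:
            searches[kw].append(produrl)
        products.setdefault(produrl, {})
    return searches, products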
    'red_wine': 'https://www.belvini.de/rotweine?page={page}',
    'white_wine': 'https://www.belvini.de/weisswein?page={page}',
    'rum': 'https://www.belvini.de/spirituosen/rum?page={page}',
    'brandy': 'https://www.belvini.de/spirituosen/brandy?page={page}',
    'liquor': 'https://www.belvini.de/spirituosen/likoer?page={page}',
}

# Get price list
d = {}
for kw in keywords:
    fpath = fpath_namer(shop_id, 'other', kw)
    url = "https://www.belvini.de/"
    if not op.exists(fpath):
        driver.get(url)
        driver.text_input(kw, '//input[@name="keywords"]')
        sleep(1.5)
        driver.save_page(fpath)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//*[not(contains(@id, "Help")) and @class="suggRow"]/*[@class="suggItem"]'):
        pdct_name_on_eretailer = ' '.join(''.join(li.xpath('.//*[@class="suggProduct"]//text()')).split()).strip()
        d[pdct_name_on_eretailer] = {
            'raw_price': ' '.join(''.join(li.xpath('.//*[@class="suggCat"]//text()')).split()).strip(),
            'raw_promo_price': '',
            'promo_price': '',
        }
        d[pdct_name_on_eretailer]['price'] = getprice(d[pdct_name_on_eretailer]['raw_price'])
print(len(d.keys()))
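# getprice is assumed to turn raw strings such as "12,95 EUR" or "1.299,00 €"
# into floats; a minimal sketch for German-style number formats (an assumption,
# the real helper likely handles more cases):
import re

def getprice(raw_price):
    m = re.search(r'\d+(?:[.,]\d+)*', raw_price)
    if not m:
        return None
    # German convention: '.' thousands separator, ',' decimal separator
    return float(m.group(0).replace('.', '').replace(',', '.'))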
######################################
# # KW searches scraping #############
######################################
# KW searches scraping - with selenium - one page per search
kw_search_url = "https://www.iy-net.jp/nspc/getproducts.do?shopcd=00209&categoryCode=30856&page=1"  # TODO: modify URL
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    if not op.exists(fpath_namer(shop_id, 'search', kw, 0)):
        driver.get(kw_search_url.format(kw=kw))
    for p in range(1):
        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath):
            driver.text_input(kw, '//input[@id="searchtxt"]', enter=True)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        searches, products = kw_parsing(fpath, kw, searches, products)
    print(kw, len(searches[kw]))

######################################
# # Download images ##################
######################################
brm = BrandMatcher()
for url, pdt in products.items():
    if 'pdct_img_main_url' in pdt and pdt['pdct_img_main_url'] and \
            brm.find_brand(pdt['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands:
        print(pdt['pdct_name_on_eretailer'] + "." +
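# The image-download step is truncated above; a minimal sketch of what such a
# step usually looks like with requests (the function name and file naming are
# assumptions, not the original code):
import requests

def download_image(img_url, dest_path):
    # Fetch the product image and write it to disk unchanged
    r = requests.get(img_url, timeout=30)
    r.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(r.content)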
    'red_wine': 'https://www.b-21.com/searchprods.asp?searchstring=red+wine&pagenumber={page}&val=0',
    'white_wine': 'https://www.b-21.com/searchprods.asp?searchstring=white+wine&pagenumber={page}&val=0',
    'tequila': 'https://www.b-21.com/searchprods.asp?searchstring=tequila&pagenumber={page}&val=0',
    'gin': 'https://www.b-21.com/searchprods.asp?searchstring=gin&pagenumber={page}&val=0',
    'rum': 'https://www.b-21.com/searchprods.asp?searchstring=rum&pagenumber={page}&val=0',
    'brandy': 'https://www.b-21.com/searchprods.asp?searchstring=brandy&pagenumber={page}&val=0',
}

for ctg, caturl in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    req_sent = False
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 1)):
        req_sent = True
        driver.get('https://www.b-21.com/')
        driver.text_input(ctg, '//input[@id="code"]', enter=True)
    for page in range(1, 100):
        url = caturl.format(page=page)
        fpath = fpath_namer(shop_id, 'ctg', ctg, page)
        if not op.exists(fpath) and req_sent:
            driver.smooth_scroll()
            driver.save_page(fpath, scroll_to_bottom=True)
        elif not op.exists(fpath) and not req_sent:
            break
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for tr in tree.xpath('//div[contains(@class, "c data2")]/table[3]/tbody/tr'):
            if not tr.xpath('.//*[contains(@class, "prodstitle")]/@href'):
                continue
            produrl = tr.xpath('.//*[contains(@class, "prodstitle")]/@href')[0]
            produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
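# clean_url is assumed to absolutize relative hrefs against the shop root and
# strip query/fragment noise; a plausible sketch (hypothetical implementation):
from urllib.parse import urljoin, urlsplit, urlunsplit

def clean_url(produrl, root_url):
    absolute = urljoin(root_url, produrl)
    scheme, netloc, path, _query, _fragment = urlsplit(absolute)
    # Drop query string and fragment, keep scheme/host/path only
    return urlunsplit((scheme, netloc, path, '', ''))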
driver = CustomDriver(headless=False)
shop_id = "leshop"
root_url = "https://www.leshop.ch/"

####
search_box_xpath = '//*[@id="autocompleteSearchInput"]'
items_xpath = '//*[@data-ng-controller="AutocompleteSearchCtrl as controller"]//div[@class="item"]'
####

l = []
for kw in ['vodka', 'champagne', 'whisky', 'sparkling', 'cognac', 'still wine']:
    driver.get(root_url)
    driver.text_input(kw, search_box_xpath, timeout=5)
    try:
        driver.wait_for_xpath(items_xpath, timeout=5)
    except Exception:
        # No autocomplete suggestions appeared for this keyword
        continue
    fpath = '/tmp/prompted ' + shop_id + ' ' + kw + '.html'
    driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for c, li in enumerate(tree.xpath(items_xpath)):
        txt = ' '.join(''.join(li.xpath('.//text()')).split())
        print(kw, shop_id)
        tmp = {'shop_id': shop_id, 'kw': kw, 'num': c, 'product': txt}
        l.append(tmp)

df = pd.DataFrame(l).to_csv(op.join("../data_prompted", shop_id + '.csv'), index=None,
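# CustomDriver wraps selenium with the convenience methods used throughout
# these scripts (get, text_input, save_page, wait_for_xpath,
# check_exists_by_xpath, smooth_scroll). A minimal sketch of the most-used
# methods, assuming a plain selenium Chrome driver underneath - a hypothetical
# stand-in, not the real CustomDriver:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class MinimalDriver(object):
    def __init__(self, headless=False):
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def get(self, url):
        self.driver.get(url)

    def text_input(self, text, xpath, enter=False, timeout=5):
        # Wait for the input box, type the keyword, optionally submit with ENTER
        el = WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
        el.clear()
        el.send_keys(text + (Keys.ENTER if enter else ''))

    def save_page(self, fpath, scroll_to_bottom=False):
        # Optionally trigger lazy-loaded content, then dump the DOM to disk
        if scroll_to_bottom:
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        with open(fpath, 'w') as f:
            f.write(self.driver.page_source)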