"""Random-walk crawler for www.leparisien.fr.

Loads the front page, then endlessly follows a randomly chosen internal
link, preferring article-style URLs (those containing '.php') whenever
any are present on the current page.  Runs forever; stop it manually.
"""
from lxml import etree
from custom_browser import CustomDriver
import random

# NOTE(review): this HTML parser is never used in this script — confirm
# nothing else relies on it before deleting; kept to preserve the
# original module surface.
parser = etree.HTMLParser()

# Visible Firefox session; images are downloaded to mimic a real reader.
driver = CustomDriver(headless=False, firefox=True, download_images=True)

count = 0  # pages visited so far
driver.get('https://www.leparisien.fr')

while True:
    print('Looping', count)
    # Collect every href on the current page.
    anchors = driver.driver.find_elements_by_xpath("//a[@href]")
    hrefs = [a.get_attribute('href') for a in anchors]
    # Links that stay on the site, and the subset that look like articles.
    internal = [u for u in hrefs if "www.leparisien.fr" in u]
    articles = [u for u in internal if '.php' in u]

    # Prefer article pages; fall back to any internal link.
    candidates = articles or internal
    if candidates:
        url = random.choice(candidates)
        count += 1
        print(count, url)
        driver.get(url)
    else:
        # Dead end (no internal links at all): restart from the front page.
        driver.get('https://www.leparisien.fr')
'rum': 'https://www.argonautliquor.com/search/categories/Rum/result_size/96/page/{page}', 'liquor': 'https://www.argonautliquor.com/search/categories/Liqueur/result_size/96/page/{page}', 'brandy': 'https://www.argonautliquor.com/search/categories/Brandy/result_size/96/page/{page}', 'mezcal': 'https://www.argonautliquor.com/search/categories/Mezcal/result_size/96/page/{page}', } # Category Scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): urlp = url.format(page=p + 1) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(urlp) sleep(2) driver.save_page(fpath) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) # r = requests.get(urlp) # tree = etree.parse(BytesIO(r.content), parser=parser) for li in tree.xpath('//div[@id="product-list"]//div[@class="grid-item"]'): produrl = li.xpath('.//a[@class="product-link"]/@href')[0] produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': ' '.join( ''.join(li.xpath('.//div[@class="product-name"]//text()')).split()).strip(),
'red_wine': 'https://www.seijoishii.com/c/1283?&row_limit=50&page={page}', 'bourbon': 'https://www.seijoishii.com/c/277?&row_limit=50&page={page}', 'brandy': 'https://www.seijoishii.com/c/239?&row_limit=50&page={page}', # 'rum': '', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############ ###################################### # KW searches Scraping - with requests - one page per search
'https://iyec.omni7.jp/basic/42450?sort=recommend&displayCnt=80&startIndex={page}', # 'bourbon': '',#na # 'brandy': '',#na # 'rum': '',#na } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p * 80)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############ ###################################### # KW searches Scraping - with requests - one page per search
return price.named['pound'] * 100 else: return price.named['pound'] * 100 + price.named['pence'] # Category Scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for page in range(10): fpath = fpath_namer(shop_id, 'ctg', ctg, page) if not op.exists(fpath): print(url.format(page=page + 1)) driver.respawn() driver.get(url.format(page=page + 1)) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//li[@class="product-grid_item"]'): produrl = li.xpath( './/a[contains(@class, "product-card_link")]/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) categories[ctg].append(produrl) products[produrl] = { 'pdct_name_on_eretailer': ' '.join([ li.xpath('.//span[@class="product-card_brand"]//text()')
'rum': 'https://www.auchandrive.fr/catalog/boissons-3686969/bieres-alcools-3686338/rhums-R3702929', 'liquor': 'https://www.auchandrive.fr/catalog/boissons-3686969/bieres-alcools-3686338/aperitifs-anises-R3702917', } # Category Scraping for ctg, url in urls_ctgs_dict.items(): print('Beginning,', ctg, url) categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): if not auchan_drive_was_initialised: init_auchan_drive(driver) auchan_drive_was_initialised = True driver.get(url) driver.smooth_scroll() driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) print(ctg, url, len(categories[ctg])) ###################################### # # KW searches scrapping ############ ###################################### # KW searches Scraping - with requests - one page per search kw_search_url = "https://www.auchandrive.fr/recherche/{kw}" # TODO : modify URL for kw in keywords: searches[kw] = [] fpath = fpath_namer(shop_id, 'search', kw, 0) if not op.exists(fpath):
saucey_was_initialised = False # Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(10): print(ctg, p) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): if not saucey_was_initialised: init_saucey(driver) saucey_was_initialised = True driver.get(url.format(page=p * 60)) driver.wait_for_xpath('//*[@itemtype="http://schema.org/Product"]', timeout=10) driver.smooth_scroll(sleep_time=0.3) driver.save_page(fpath, scroll_to_bottom=True) # Parsing tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'): produrl = "".join(li.xpath('.//a[@itemprop="url"]/@href')) produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': " ".join(''.join(
urls_ctgs_dict = { 'champagne': 'https://www.mondovino.ch/catalogue/typedevin/Champagne', 'vodka': 'https://www.mondovino.ch/selections/spiritueux/Cfr', 'cognac': 'https://www.mondovino.ch/selections/spiritueux/Cfr', 'whisky': 'https://www.mondovino.ch/selections/spiritueux/Cfr', 'still_wines': 'https://www.mondovino.ch/catalogue/typedevin/Vin+blanc', 'white_wine': 'https://www.mondovino.ch/catalogue/typedevin/Vin+blanc', 'red_wine': 'https://www.mondovino.ch/catalogue/typedevin/Vin+rouge', } # Category Scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): driver.get(url) # Get scroll height last_height = driver.driver.execute_script( "return document.body.scrollHeight") while True: # Scroll down to bottom driver.driver.execute_script( "window.scrollTo(0, document.body.scrollHeight);") try: driver.waitclick('//div[@class="mod_product_list__more"]/a', timeout=5, silent=True) except: pass # Wait to load page sleep(2)
# 'bourbon': '',#na 'brandy': 'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001014&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', # 'rum': '',#na } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: print("Finishing with :", len(set(categories[ctg])), "products") break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############ ######################################
'white_wine':'https://www.abcfws.com/category/WINE/WHITE/pc/2/16.uts?currentIndex={start}&pageSize=48', 'red_wine':'https://www.abcfws.com/category/WINE/RED/pc/2/3.uts?currentIndex={start}&pageSize=48', 'gin':'https://www.abcfws.com/category/SPIRITS/GIN/pc/46/50.uts?currentIndex={start}&pageSize=48', 'tequila':'https://www.abcfws.com/category/SPIRITS/TEQUILA/pc/46/59.uts?currentIndex={start}&pageSize=48', 'rum':'https://www.abcfws.com/category/SPIRITS/RUM/pc/46/51.uts?currentIndex={start}&pageSize=48', 'scotch':'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/SCOTCH/pc/46/c/67/74.uts?currentIndex={start}&pageSize=48', 'bourbon':'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/BOURBON/pc/46/c/67/69.uts?currentIndex={start}&pageSize=48', } for ctg, url in categories_urls.items(): categories[ctg] = [] for p, start in enumerate(range(0, 1000, 48)): # r = requests.get(url.format(start=start)) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(start = start)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) # tree = etree.parse(BytesIO(r.content), parser=parser) tree = etree.parse(open(fpath, 'rb'), parser=parser) articles = tree.xpath('//section[contains(@class, "productsList")]/div[@class="product"]') aurls = [a.xpath('.//div[@class="name"]/a/@href')[0] for a in articles] if not articles: break categories[ctg] += aurls for a in articles: data = { 'url': a.xpath('.//div[@class="name"]/a/@href')[0], 'pdct_name_on_eretailer': a.xpath('.//div[@class="name"]/a/text()')[0].strip(), 'volume': a.xpath('.//div[@class="volume"]//text()')[0].strip(), 'price': getprice(''.join(a.xpath('.//div[@class="price pl0"]/span/text()')).strip()),
'still_wines': 'https://www.b-21.com/searchprods.asp?searchstring=wine&pagenumber={page}&val=0', 'red_wine': 'https://www.b-21.com/searchprods.asp?searchstring=red+wine&pagenumber={page}&val=0', 'white_wine': 'https://www.b-21.com/searchprods.asp?searchstring=white+wine&pagenumber={page}&val=0', 'tequila': 'https://www.b-21.com/searchprods.asp?searchstring=tequila&pagenumber={page}&val=0', 'gin': 'https://www.b-21.com/searchprods.asp?searchstring=gin&pagenumber={page}&val=0', 'rum': 'https://www.b-21.com/searchprods.asp?searchstring=rum&pagenumber={page}&val=0', 'brandy': 'https://www.b-21.com/searchprods.asp?searchstring=brandy&pagenumber={page}&val=0', } for ctg, caturl in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 req_sent = False if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 1)): req_sent = True driver.get('https://www.b-21.com/') driver.text_input(ctg, '//input[@id="code"]', enter=True) for page in range(1, 100): url = caturl.format(page=page) fpath = fpath_namer(shop_id, 'ctg', ctg, page) if not op.exists(fpath) and req_sent: driver.smooth_scroll() driver.save_page(fpath, scroll_to_bottom=True) elif not op.exists(fpath) and not req_sent: break tree = etree.parse(open(fpath, 'rb'), parser=parser) for tr in tree.xpath('//div[contains(@class, "c data2")]/table[3]/tbody/tr'): if not tr.xpath('.//*[contains(@class, "prodstitle")]/@href'): continue produrl = tr.xpath('.//*[contains(@class, "prodstitle")]/@href')[0] produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
site_was_initialised = False # Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): print(ctg, p) fpath = fpath_namer(shop_id, 'ctg', ctg, p) print(fpath) if not op.exists(fpath): # if not site_was_initialised: # init_site(driver) # site_was_initialised = True driver.get(url.format(page=p + 1)) driver.save_page(fpath, scroll_to_bottom=True) # Parsing tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//*[@class="product-list"]/div'): produrl = "".join(li.xpath('.//a[@class="rebl15"]/@href')) produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': li.xpath('.//a[@class="rebl15"]/text()')[0].strip(), 'volume':
'bourbon': 'https://www.liquorland.com.au/Spirits?facets=spiritproducttype%3dBourbon?show=200&page={page}', 'liquor': 'https://www.liquorland.com.au/Spirits?facets=spiritproducttype%3dImported+Liqueurs?show=200&page={page}', 'tequila': 'https://www.liquorland.com.au/Spirits?facets=spiritproducttype%3dTequila?show=200&page={page}', } # Category Scraping - with selenium - one page per category for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(20): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//ul[@class="productList"]/li'): produrl = li.xpath('.//div/h2/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': "".join(li.xpath('.//div/h2//text()')).strip(), 'raw_price': ''.join( w for t in li.xpath('.//div[@class="valueLarge"]//text()')
# 'tequila': '',#no tequila # 'liquor': '',#no liquor # 'white_wine': 'https://www.aeondewine.com/shop/c/c060102/?l:inkid=aw69_avGM7kHb', # 'red_wine': 'https://www.aeondewine.com/shop/c/c060101/?linkid=aw69_Xl3132nk', # 'bourbon': '',#no bourbon # 'brandy': '',#no brandy # 'rum': '',#no rum } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 print("Beginning ", ctg, url) if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) # If files exist, don't scrap perform_scrapping = not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath) and perform_scrapping: sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) print(fpath, ctg, p, len(categories[ctg])) # Break or change pages if number_of_pdcts_in_ctg == len(categories[ctg]): print("Finished, because no more new products") break
'https://www.goodygoody.com/Products/Products?searchTerm=&category=1AGN&type=0&orderBy=name&minprice=&maxprice=', 'tequila': 'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ATQ&type=0&orderBy=name&minprice=&maxprice=', 'rum': 'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ARM&type=0&orderBy=name&minprice=&maxprice=', 'brandy': 'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ABR&type=0&orderBy=name&minprice=&maxprice=', 'bourbon': 'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ABN&type=0&orderBy=name&minprice=&maxprice=', } # Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) for p in range(100): # Scraping urlp = url.format(page=p + 1) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): if not goodygoody_was_initialised: init_goodygoody(driver) goodygoody_was_initialised = True sleep(2) driver.save_page(fpath) # Parsing tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath( '//div[@class="row productRow"]//div[@class="row"]'):
'red_wine': 'http://www.waitrosecellar.com/all-wines/wine-type/red-wine', 'white_wine': 'http://www.waitrosecellar.com/all-wines/wine-type/white-wine', 'gin': 'http://www.waitrosecellar.com/gin', 'rum': 'http://www.waitrosecellar.com/rum', 'tequila': 'http://www.waitrosecellar.com/tequila', 'liquor': 'http://www.waitrosecellar.com/liqueurs', } # Difficult case, where you should click a button to get on next page and send the request via the search bar for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): # Getting back to root if search input box is not found driver.get(url) for p in range(100): # Storing and extracting infos fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.save_page(fpath) sleep(2) tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//div[@class="productCard"]'): produrl = li.xpath('.//div[@class="productName"]/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl categories[ctg].append(produrl) products[produrl] = { 'pdct_name_on_eretailer':
'http://shop.bevmo.com/search?format=varietal&lbc=bevmo&method=and&p=Q&ts=custom&uid=644456520&view=list&w=rum&af=varietal%3aliqueur&srt={page}', } # Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): print(ctg, p) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): if not bevmo_was_initialised: init_bevmo(driver) bevmo_was_initialised = True print(url.format(page=32 * p)) driver.get(url.format(page=32 * p)) sleep(1) driver.save_page(fpath, scroll_to_bottom=True) # Parsing tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath( '//ul[contains(@class, "products")]//li[@class="item"]'): produrl = "".join( li.xpath('.//h2[contains(@class, "product-name")]/a/@href')) produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer':
if not pricestr: return '' price = parse('${pound:d}.{pence:d}', pricestr) if not price: price = parse('${th:d},{pound:d}.{pence:d}', pricestr) return price.named['th'] * 100000 + price.named['pound'] * 100 + price.named['pence'] return price.named['pound'] * 100 + price.named['pence'] # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): # Getting to ctg url driver.get(url) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.save_page(fpath) sleep(2) tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//div[@class="col-main-content"]//ul/li'): produrl = li.xpath('.//h2[@class="product-name"]/a/@href')[0] produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl products[produrl] = { 'pdct_name_on_eretailer': "".join(li.xpath('.//h2[@class="product-name"]//text()')), 'raw_price': ''.join(w for t in li.xpath('.//span[@class="price"]/text()') for w in t.split()).strip(), } print(products[produrl], produrl) products[produrl]['price'] = getprice(products[produrl]['raw_price'])
'red_wine': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/wine/red_wine', 'white_wine': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/wine/white_wine', 'gin': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/gin', 'tequila': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/tequila', 'rum': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/rum', 'liquor': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/liqueurs_and_aperitifs', 'brandy': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/brandy', } # Category Scraping - with selenium - one page per category for ctg, url in urls_ctgs_dict.items(): count = 1 categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): driver.get(url) driver.waitclick('//*[@class="closeNoticeSomethingDifferentPopup"]', timeout=4) last_height = driver.driver.execute_script("return document.body.scrollHeight") while True: sleep(1) driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") driver.waitclick('//*[@data-actiontype="load"]', timeout=3) driver.waitclick('//*[@data-actiontype="load"]', timeout=0.5) new_height = driver.driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//article[@data-test="product-pod"]'):
"https://www.freshdirect.com/browse.jsp?pageType=browse&id=vin_spirits_liqueurs&pageSize=100&all=true&activePage=1&sortBy=Sort_PopularityUp&orderAsc=true&activeTab=product", } # Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): print(ctg, p) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): # if not freshdirect_was_initialised: # init_freshdirect(driver) # freshdirect_was_initialised = True driver.get(url.format(page=p + 1)) sleep(1) driver.save_page(fpath, scroll_to_bottom=True) # Parsing tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath( '//ul[contains(@class, "products transactional")]/li'): produrl = li.xpath( './/a[@class="portrait-item-image-link"]/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer':
'scotch': 'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/SCOTCH/pc/46/c/67/74.uts?currentIndex={start}&pageSize=48', 'bourbon': 'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/BOURBON/pc/46/c/67/69.uts?currentIndex={start}&pageSize=48', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p, start in enumerate(range(0, 1000, 48)): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(start=start)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############ ###################################### # KW searches Scraping - with requests - one page per search
"brandy": 'https://www.bodeboca.com/destilados-licores/brandy?page={page}', "red_wine": 'https://www.bodeboca.com/vino/tinto?page={page}', "white_wine": 'https://www.bodeboca.com/vino/blanco?page={page}', } # Category Scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(20): urlp = url.format(page=p + 1) fpath = fpath_namer(shop_id, 'ctg', ctg, p) print(fpath, p, urlp) if not op.exists(fpath): driver.get(urlp) # driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.7);") # print('sleeping') # sleep(10) # driver.waitclick('//*[contains(@class, "bb-modal-close-button")]', timeout=1, silent=False) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//div[@id="venta-main"]/div'): produrl = li.xpath('.//a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer':
"brandy": "https://www.nicks.com.au/store/spirits-liqueurs/other-brandy-eau-de-vie?limit=60&mode=grid&p={page}", "liquor": "https://www.nicks.com.au/store/spirits-liqueurs/liqueurs?limit=60&mode=grid&p={page}", } # Category Scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): urlp = url.format(page=p + 1) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(urlp) sleep(2) driver.save_page(fpath) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//div[@class="product item"]'): produrl = li.xpath( './/div[@class="productblock-title"]/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': ' '.join(''.join( li.xpath('.//div[@class="productblock-title"]/a//text()')).
'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/gin-340887-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', 'rum': 'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/rum-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', 'tequila': 'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/tequila-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', 'liquor': 'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/liqueurs---speciality-spirits#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', } # Categories scraping for ctg, url in urls_ctgs_dict.items(): print(ctg, url) categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): driver.get(url) sleep(1) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//ul[@class="productLister gridView"]/li'): produrl = li.xpath('.//h3/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl categories[ctg].append(produrl) products[produrl] = { 'pdct_name_on_eretailer': " ".join("".join( li.xpath( './/div[@class="productNameAndPromotions"]//h3//text()')). split()),
# 'brandy': '',#na # 'rum': '',#na } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): print(url.format(page=p+1)) driver.get(url.format(page=p+1), True) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) break ###################################### # # KW searches scrapping ############ ######################################