# NOTE(review): multi-line code collapsed onto one physical line by whitespace
# loss, and truncated mid-expression at the trailing `'raw_price': ' '.join(` —
# reproduced byte-for-byte below; recover the original line breaks from VCS
# before restructuring.
# Visible logic: for each (category, url) in urls_ctgs_dict, iterate up to 10
# listing pages (page offset = p * 60 in the URL), lazily initialising the
# "saucey" driver session once; each page is cached to disk via fpath_namer and
# only fetched when the cache file is absent. Cached HTML is then parsed with
# lxml, each schema.org/Product node yields a product URL (unwrapping
# redirect-style `?url=` query parameters via parse_qs/urlsplit, then
# clean_url), and a dict of product fields is stored in `products[produrl]`.
# Assumes `driver`, `parser`, `root_url`, `shop_id`, `products`, `categories`
# are defined earlier in the file — TODO confirm.
# Categories scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(10): print(ctg, p) fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): if not saucey_was_initialised: init_saucey(driver) saucey_was_initialised = True driver.get(url.format(page=p * 60)) driver.wait_for_xpath('//*[@itemtype="http://schema.org/Product"]', timeout=10) driver.smooth_scroll(sleep_time=0.3) driver.save_page(fpath, scroll_to_bottom=True) # Parsing tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'): produrl = "".join(li.xpath('.//a[@itemprop="url"]/@href')) produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': " ".join(''.join( li.xpath('.//*[@itemprop="name"]//text()')).split()), 'raw_price': ' '.join(
# NOTE(review): multi-line code collapsed onto one physical line by whitespace
# loss, and truncated at the end — the final `if brm.find_brand(...)['brand']
# in mh_brands:` has no body visible. Reproduced byte-for-byte below; recover
# the original formatting from VCS before editing.
# Visible logic, first section (keyword searches, seijoishii.com): for each
# keyword, fetch the search-results URL with Selenium only if page 0 is not
# already cached, save up to 2 pages to disk via fpath_namer, and delegate
# parsing to kw_parsing(), which updates both `searches` and `products`.
# Carries an original `# TODO : modify URL` on kw_search_url.
# Visible logic, second section (product pages): iterate the deduplicated,
# sorted product URLs and use BrandMatcher.find_brand(..., special_country='JP')
# to filter products whose brand is in mh_brands — the action taken on a match
# is in the truncated part. Assumes `keywords`, `searches`, `products`,
# `driver`, `shop_id`, `mh_brands` are defined earlier in the file — TODO confirm.
# # KW searches scrapping ############ ###################################### # KW searches Scraping - with requests - one page per search kw_search_url = "https://www.seijoishii.com/s?search_word={kw}&x=0&y=0" # TODO : modify URL for kw in keywords: searches[kw] = [] number_of_pdcts_in_kw_search = 0 if not op.exists(fpath_namer(shop_id, 'search', kw, 0)): driver.get(kw_search_url.format(kw=kw)) for p in range(2): fpath = fpath_namer(shop_id, 'search', kw, p) if not op.exists(fpath): sleep(2) driver.smooth_scroll() driver.save_page(fpath, scroll_to_bottom=True) searches, products = kw_parsing(fpath, kw, searches, products) print(kw, len(searches[kw])) ###################################### # # Product pages scraping ########### ###################################### # Download the pages - with selenium brm = BrandMatcher() for url in sorted(list(set(products))): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands:
# NOTE(review): multi-line code collapsed onto one physical line by whitespace
# loss, and truncated at BOTH edges — it opens mid-way through the
# urls_ctgs_dict literal (only the 'rum' and 'liquor' entries are visible) and
# ends mid-way through the products[produrl] dict literal. Reproduced
# byte-for-byte below; recover the original formatting from VCS before editing.
# Visible logic (Morrisons groceries category scraper): for each category URL,
# fetch a single page (range(1)) with Selenium unless already cached to disk
# via fpath_namer, then parse the cached HTML with lxml. Product tiles are
# selected with the XPath '//ul[contains(@class, "fops-shelf")]/li[...]';
# tiles missing a link href or an <h4> name are skipped. Product links are
# unwrapped from redirect-style `?url=` query parameters (parse_qs/urlsplit)
# and normalised with clean_url before being keyed into `products`.
# Assumes `driver`, `parser`, `root_url`, `shop_id`, `categories`, `products`
# are defined earlier in the file — TODO confirm.
'rum': 'https://groceries.morrisons.com/webshop/getCategories.do?tags=%7C105651%7C103120%7C105916%7C151520&Asidebar=1', 'liquor': 'https://groceries.morrisons.com/webshop/getCategories.do?tags=%7C105651%7C103120%7C105916%7C151516&Asidebar=1', } for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(1): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url) sleep(2) driver.smooth_scroll(10) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath( '//ul[contains(@class, "fops-shelf")]/li[@class="fops-item"]'): if not li.xpath('.//a/@href') or not li.xpath('.//h4//text()'): continue produrl = li.xpath('.//a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': li.xpath('.//h4//text()')[0].strip(),