# NOTE(review): truncated fragment — begins mid-dict-literal (fields parsed from
# a search-result `li` element) and ends mid-`tree.xpath(` call; the enclosing
# loop/function and the tail of the expression are not visible here, so the
# code is left byte-identical. Also flagged for the eventual owner:
# `etree.parse(open(fname), ...)` never closes the file handle — prefer a
# `with open(fname) as f:` block once the full context is available.
'pdct_name_on_eretailer': " ".join("".join( li.xpath( './/div[@class="productNameAndPromotions"]//h3//text()')). split()), 'raw_price': " ".join("".join( li.xpath('.//p[@class="pricePerUnit"]/text()')[0]).split()), } print(products[produrl]) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(products[produrl]) print(searches) # Download the pages brm = BrandMatcher() for url in sorted(products): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands: url_mod = clean_url(url, root_url=root_url) fname = fpath_namer(shop_id, 'pdct', d['pdct_name_on_eretailer'], 0) if not op.exists(fname): print(url_mod) driver.get(url_mod) sleep(2) driver.save_page(fname, scroll_to_bottom=True) tree = etree.parse(open(fname), parser=parser) products[url] = { 'pdct_name_on_eretailer': ''.join( tree.xpath(
# --- Search pages: save each results page with selenium, then parse it -----
# NOTE(review): collapsed one-line fragment; indentation reconstructed from
# statement semantics. `kw`, `shop_id`, `driver`, `searches`, `products`,
# `kw_parsing`, `fpath_namer`, `op`, `sleep` come from the enclosing script
# (not visible here) — confirm nesting against the original file.
for p in range(2):
    fpath = fpath_namer(shop_id, 'search', kw, p)
    if not op.exists(fpath):
        # Page not cached yet: give it time to load, scroll to trigger any
        # lazy-loaded content, then persist the rendered HTML.
        sleep(2)
        driver.smooth_scroll()
        driver.save_page(fpath, scroll_to_bottom=True)
    searches, products = kw_parsing(fpath, kw, searches, products)
print(kw, len(searches[kw]))

######################################
#        Product pages scraping      #
######################################

# Download the pages - with selenium.
brm = BrandMatcher()
# Fix: dict keys are already unique, so the original sorted(list(set(products)))
# double-wrapped for nothing — sorted(products) iterates the same keys.
for url in sorted(products):
    d = products[url]
    if brm.find_brand(d['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands:
        print(d['pdct_name_on_eretailer'])
        url_mod = clean_url(url, root_url=root_url)
        fpath = fpath_namer(shop_id, 'pdct', d['pdct_name_on_eretailer'], 0)
        if not op.exists(fpath):
            # Only hit the site when the page is not already cached on disk.
            driver.get(url_mod)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        products = pdct_parsing(fpath, url, products)
        print(products[url])
# --- Scraper setup: shop constants, caching, browser, brand matcher --------
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
import shutil
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = "binnys"
root_url = "http://www.binnys.com/"
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = "USA"
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()


# If necessary
def getprice(pricestr):
    # Strip everything except digits, '.', '$' and ',' (e.g. "$1,234.56").
    pricestr = re.sub("[^0-9.$,]", "", pricestr)
    if not pricestr:
        return None
    # Normalise whole-dollar prices so the dotted patterns below can match.
    if not "." in pricestr:
        pricestr += ".00"
    # Try price patterns from most to least specific.
    price = parse('${dollar:d}.{cent:d}', pricestr)
    if price is None:
        price = parse('${kd:d},{dollar:d}.{cent:d}', pricestr)
    if price is None:
        price = parse('${kd:d},{dollar:d}', pricestr)
    # NOTE(review): fragment truncated here — the original function presumably
    # converts the `parse` result into a numeric price and returns it; that
    # tail is not visible in this chunk.
# --- Keyword search: page through the catalogsearch results ----------------
# NOTE(review): collapsed one-line fragment; indentation reconstructed from
# statement semantics. `keywords`, `session`, `parser`, `getproduct`,
# `products`, `mh_brands`, `driver`, `fpath_namer`, `shop_id`, `op` are
# defined elsewhere in the original script — confirm nesting there.
for kw in keywords:
    searches[kw] = []
    for page in range(1, 10):
        r = session.get(
            'https://www.mybottleshop.com.au/catalogsearch/result/?p={page}&q={kw}'.format(
                page=page, kw=quote_plus(kw)))
        tree = etree.parse(BytesIO(r.content), parser=parser)
        articles = tree.xpath('id("em-grid-mode")/ul[1]/li')
        aurls = [a.xpath('.//h2[@itemprop="name"]/a/@href')[0] for a in articles]
        # Stop paging when the page is empty or only repeats already-seen URLs.
        if not articles or all(a in searches[kw] for a in aurls):
            break
        searches[kw] += aurls
        # Original used a throwaway list comprehension for side effects only.
        for a in articles:
            getproduct(a)
        print(kw, len(articles), len(searches[kw]))

# Fetch product pages for in-scope brands and extract the main image URL.
brm = BrandMatcher()
for url, product in products.items():
    if brm.find_brand(product['pdct_name_on_eretailer'])['brand'] in mh_brands:
        print(url)
        fname = fpath_namer(shop_id, 'pdct', product['pdct_name_on_eretailer'], 0)
        if not op.exists(fname):
            driver.get(url)
            sleep(2)
            driver.save_page(fname, scroll_to_bottom=True)
        # Fix: the original leaked the handle via etree.parse(open(fname), ...);
        # a with-block guarantees it is closed.
        with open(fname) as f:
            tree = etree.parse(f, parser=parser)
        data = {
            'pdct_img_main_url': tree.xpath('//meta[@property="og:image"]/@content')[0],
        }
        product.update(data)

# Download images
# NOTE(review): truncated fragment — begins mid-dict-literal (a '£'-prefixed
# raw price assembled from `span.price_amount/@content`) and ends inside a
# `tree.xpath(` call; the enclosing loop and the tail of the expression are
# not visible here, so the code is left byte-identical. The inline `assert`
# validating product fields would be stripped under `python -O` — consider an
# explicit check once the full context is available.
]), 'raw_price': '£' + ''.join( w for t in li.xpath('.//span[@class="price_amount"]/@content') for w in t.split()).strip(), } print(kw, products[produrl]) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(kw, products[produrl]) assert all(products[produrl][k] for k in products[produrl]) if not r.from_cache: sleep(3) print(kw, len(searches[kw])) # Download the pages brm = BrandMatcher() for url in sorted(products): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands: print(d['pdct_name_on_eretailer']) url_mod = clean_url(url, root_url=root_url) r = session.get(url_mod, cookies=cookies) with open( '/tmp/' + shop_id + ' ' + d['pdct_name_on_eretailer'].replace('/', "-") + '.html', 'wb') as f: f.write(r.content) tree = etree.parse(BytesIO(r.content), parser=parser) products[url] = { 'pdct_name_on_eretailer': ' '.join(w for t in tree.xpath(
# --- Build the per-segment product reference table and export it -----------
# NOTE(review): collapsed one-line fragment; indentation reconstructed from
# statement semantics. `df`, `tdf`, `BASE_DIR`, `BrandMatcher`, `op` come from
# earlier in the original script — confirm there.
# One copy of the product list per distribution segment; `must_have` flags the
# rows marked 'Y' in that segment's column.
segment_frames = []
for segment in ['SPECIAL', "ACCESSIBLE", "EXCLUSIVE"]:
    tmp = df.copy()  # original wrapped this in a redundant pd.DataFrame(...)
    tmp['segment'] = segment
    tmp['must_have'] = 1 * (tmp[segment] == 'Y')
    segment_frames.append(tmp)
# Fix: DataFrame.append was deprecated and removed in pandas 2.0, and calling
# it per-iteration re-copies the accumulated frame each time; concatenate once.
tdf = pd.concat([tdf, *segment_frames])

tdf.drop(columns=[
    'SPECIAL', "ACCESSIBLE", "EXCLUSIVE", '_merge', 'to_delete_flagship_pdct_of_brnd'
], errors='ignore', inplace=True)

# Column order expected by downstream consumers of pdcts_jp.xlsx.
final_cols = [
    'continent', 'country', 'segment', 'ctg', 'brnd', 'brnd_query',
    'pdct_name', 'pdct_quality_name', 'pdct_query', 'pdct_family',
    'pdct_order', 'brnd_order', 'abs_pdct_order', 'ref_pdct_key_viseo',
    'flagship', 'must_have', 'source', 'priority', 'min_price', 'max_price',
    'competitor', 'competitor_query', 'competitor_brnd',
    'competitor_volume_in_ml', 'volume_in_ml', 'box', 'rose', 'vintage',
    'program', 'pdct_names_equivalents', 'words_to_include', 'tolerance05',
    'exclude_terms', 'words_to_include_05', 'pdct_img_ref_path',
    'competitor_min_price', 'competitor_max_price'
]
brm = BrandMatcher()
# NOTE(review): maps from `df` and relies on index alignment to fill the
# duplicated index of `tdf`; `x == x` is the usual NaN guard. Mapping from
# `tdf['competitor']` directly would be more robust — confirm intent.
tdf['competitor_brnd'] = df['competitor'].apply(
    lambda x: brm.find_brand(x)['brand'] if x == x else '')
print("Differences in columns", set(final_cols) ^ set(tdf.columns))
tdf[final_cols].to_excel(op.join(BASE_DIR, "ressources/pdcts_jp.xlsx"), index=None)
print(f"soffice '{op.join(BASE_DIR, 'ressources/pdcts_jp.xlsx')}'")