# NOTE(review): this file appears to be a corrupted concatenation of
# scraped script fragments. This block starts mid-loop — the enclosing
# `for kw ...` / `for p ...` headers are missing — and is not valid
# standalone Python; left byte-identical.
print(products[produrl])

            searches[kw].append(produrl)
        if len(set(searches[kw])) == number_of_pdcts_in_kw_search:
            break
        else:
            number_of_pdcts_in_kw_search = len(set(searches[kw]))
        # if not r.from_cache:
        #     sleep(2)
    print(kw, p, len(searches[kw]))

# Download the pages - with selenium
brm = BrandMatcher()
# Fetch the detail page of every product whose detected brand is in the
# MH brand list; iterate in deterministic (sorted, de-duplicated) order.
for url in sorted(list(set(products))):
    d = products[url]
    if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands:
        print(d['pdct_name_on_eretailer'])
        url_mod = clean_url(url, root_url=root_url)

        # NOTE(review): requests.get(url_mod, headers) passes `headers` as
        # the second positional argument, which is `params`, not HTTP
        # headers — probably meant requests.get(url_mod, headers=headers).
        r = requests.get(url_mod, headers)
        # Keep a local HTML copy for debugging; '/' in product names would
        # otherwise break the file path.
        with open(
                '/tmp/' + d['pdct_name_on_eretailer'].replace('/', "-") +
                '.html', 'wb') as f:
            f.write(r.content)
        tree = etree.parse(BytesIO(r.content), parser=parser)

        # NOTE(review): fragment is truncated mid-dict below — the value
        # for 'pdct_img_main_url' is missing (extraction artifact).
        products[url].update({
            'volume':
            ' '.join(''.join(
                tree.xpath('//*[@class="item-meta txt"]//text()')).split()),
            'pdct_img_main_url':
    # NOTE(review): fragment from a different script — starts mid
    # `for kw ...` loop (the header is missing from this chunk).
    searches[kw] = []
    # Walk paginated search results until a page is empty or only repeats
    # URLs we already collected.
    for page in range(1, 10):
        r = session.get('https://www.mybottleshop.com.au/catalogsearch/result/?p={page}&q={kw}'.format(
            page=page, kw=quote_plus(kw)))
        tree = etree.parse(BytesIO(r.content), parser=parser)
        articles = tree.xpath('id("em-grid-mode")/ul[1]/li')
        aurls = [a.xpath('.//h2[@itemprop="name"]/a/@href')[0] for a in articles]
        # Pagination exhausted: no articles, or nothing new on this page.
        if not articles or all(a in searches[kw] for a in aurls):
            break
        searches[kw] += aurls
        # Side effect only: getproduct() presumably populates `products`
        # elsewhere — confirm against the missing definition.
        [getproduct(a) for a in articles]
        print(kw,  len(articles), len(searches[kw]))

# Save (via selenium, unless cached) and parse each in-brand product's
# detail page, extracting the main image URL from the og:image meta tag.
brm = BrandMatcher()
for url, product in products.items():
    if brm.find_brand(product['pdct_name_on_eretailer'])['brand'] in mh_brands:
        print(url)
        fname = fpath_namer(shop_id, 'pdct', product['pdct_name_on_eretailer'], 0)
        # Only hit the site when no saved copy exists yet.
        if not op.exists(fname):
            driver.get(url)
            sleep(2)
            driver.save_page(fname, scroll_to_bottom=True)
        # Fix: the original leaked the file handle via
        # etree.parse(open(fname), ...); close it deterministically.
        with open(fname) as fh:
            tree = etree.parse(fh, parser=parser)
        data = {
            # og:image is assumed present on every product page — this
            # raises IndexError otherwise (behavior unchanged).
            'pdct_img_main_url': tree.xpath('//meta[@property="og:image"]/@content')[0],
        }
        product.update(data)

# Download images
# NOTE(review): this loop is cut off — its body is missing, and the
# "Exemplo n.º 3" / "0" lines below are scraper artifacts separating
# unrelated fragments. Left byte-identical.
for url, pdt in products.items():
     if 'pdct_img_main_url' in pdt and pdt['pdct_img_main_url'] and brm.find_brand(pdt['pdct_name_on_eretailer'])['brand'] in mh_brands:
Exemplo n.º 3
0
                w for t in li.xpath('.//span[@class="price_amount"]/@content')
                for w in t.split()).strip(),
        }
        print(kw, products[produrl])
        products[produrl]['price'] = getprice(products[produrl]['raw_price'])
        print(kw, products[produrl])
    assert all(products[produrl][k] for k in products[produrl])
    if not r.from_cache:
        sleep(3)
    print(kw, len(searches[kw]))

# Download the pages
brm = BrandMatcher()
# Re-fetch each in-brand product's page over the authenticated session
# and rebuild its record from the parsed HTML.
for url in sorted(products):
    d = products[url]
    if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands:
        print(d['pdct_name_on_eretailer'])
        url_mod = clean_url(url, root_url=root_url)
        r = session.get(url_mod, cookies=cookies)
        # Keep a local HTML copy for debugging.
        with open(
                '/tmp/' + shop_id + ' ' +
                d['pdct_name_on_eretailer'].replace('/', "-") + '.html',
                'wb') as f:
            f.write(r.content)
        tree = etree.parse(BytesIO(r.content), parser=parser)
        # NOTE(review): the dict literal is truncated after 'volume' —
        # fragment cut by the extraction; "Exemplo n.º 4" / "0" below are
        # scraper artifacts, not code.
        products[url] = {
            'pdct_name_on_eretailer':
            ' '.join(w for t in tree.xpath(
                '//h1[@class="buying-controls_title"]//text()')
                     for w in t.split()).strip(),
            'volume':
Exemplo n.º 4
0
        # NOTE(review): fragment starts mid-loop (enclosing `for kw`/`for p`
        # headers missing). Saves each keyword-search results page with
        # selenium unless already cached, then parses it.
        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath):
            sleep(2)
            driver.smooth_scroll()
            driver.save_page(fpath, scroll_to_bottom=True)
        # kw_parsing() returns updated copies of both accumulators.
        searches, products = kw_parsing(fpath, kw, searches, products)

    print(kw, len(searches[kw]))

######################################
# # Download images        ###########
######################################
brm = BrandMatcher()
for url, pdt in products.items():
    # Only download images for MH-brand products that actually carry an
    # image URL; special_country='JP' tweaks brand matching for this shop.
    if 'pdct_img_main_url' in pdt and pdt['pdct_img_main_url'] and \
            brm.find_brand(pdt['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands:
        print(pdt['pdct_name_on_eretailer'] + "." +
              pdt['pdct_img_main_url'].split('.')[-1])
        print(pdt['pdct_img_main_url'])
        # Stream the image; verify=False disables TLS verification —
        # deliberate here, presumably for shops with broken cert chains.
        response = requests.get(pdt['pdct_img_main_url'],
                                stream=True,
                                verify=False,
                                headers=headers)
        # response.raw.decode_content = True
        # Temp name keyed on a hash of the URL so retries reuse one file.
        tmp_file_path = '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format(
            abs(hash(pdt['pdct_img_main_url'])))
        img_path = img_path_namer(shop_id, pdt['pdct_name_on_eretailer'])
        with open(tmp_file_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        # Sniff the real image type and fix the extension accordingly.
        # NOTE(review): fragment is cut off mid-call on the last line.
        if imghdr.what(tmp_file_path) is not None:
            img_path = img_path.split('.')[0] + '.' + imghdr.what(
# Explode the reference frame into one copy per distribution segment,
# flagging rows whose per-segment column is 'Y' as must-haves.
segment_frames = []
for segment in ['SPECIAL', "ACCESSIBLE", "EXCLUSIVE"]:
    tmp = pd.DataFrame(df.copy())
    tmp['segment'] = segment
    # 1 where the segment column says 'Y', else 0.
    tmp['must_have'] = 1 * (tmp[segment] == 'Y')
    segment_frames.append(tmp)
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# a single pd.concat is the supported equivalent and avoids re-copying
# the accumulated frame on every iteration.
tdf = pd.concat([tdf, *segment_frames])
# Drop the per-segment helper columns; errors='ignore' keeps this safe
# when some are already absent.
tdf.drop(columns=[
    'SPECIAL', "ACCESSIBLE", "EXCLUSIVE", '_merge',
    'to_delete_flagship_pdct_of_brnd'
],
         errors='ignore',
         inplace=True)

# Canonical column set/order for the exported product-reference workbook.
final_cols = [
    'continent', 'country', 'segment', 'ctg', 'brnd', 'brnd_query',
    'pdct_name', 'pdct_quality_name', 'pdct_query', 'pdct_family',
    'pdct_order', 'brnd_order', 'abs_pdct_order', 'ref_pdct_key_viseo',
    'flagship', 'must_have', 'source', 'priority', 'min_price', 'max_price',
    'competitor', 'competitor_query', 'competitor_brnd',
    'competitor_volume_in_ml', 'volume_in_ml', 'box', 'rose', 'vintage',
    'program', 'pdct_names_equivalents', 'words_to_include', 'tolerance05',
    'exclude_terms', 'words_to_include_05', 'pdct_img_ref_path',
    'competitor_min_price', 'competitor_max_price'
]

brm = BrandMatcher()
# Map each competitor name to its canonical brand. `x == x` is a NaN
# check (NaN != NaN), so missing competitors map to ''.
# NOTE(review): this reads from `df` but assigns into `tdf`, whose row
# count/index differ after the segment expansion — suspect it should be
# tdf['competitor'].apply(...); confirm intended alignment.
tdf['competitor_brnd'] = df['competitor'].apply(
    lambda x: brm.find_brand(x)['brand'] if x == x else '')
# Symmetric difference surfaces any schema drift before export.
print("Differences in columns", set(final_cols) ^ set(tdf.columns))
tdf[final_cols].to_excel(op.join(BASE_DIR, "ressources/pdcts_jp.xlsx"),
                         index=None)
# Convenience: command to open the exported workbook in LibreOffice.
print(f"soffice '{op.join(BASE_DIR, 'ressources/pdcts_jp.xlsx')}'")