# NOTE(review): truncated fragment — begins mid-dict-literal (fields parsed from
# a search-result `li` element) and ends mid-`tree.xpath(` call; the enclosing
# loop/function and the tail of the expression are not visible here, so the
# code is left byte-identical. Also flagged for the eventual owner:
# `etree.parse(open(fname), ...)` never closes the file handle — prefer a
# `with open(fname) as f:` block once the full context is available.
'pdct_name_on_eretailer': " ".join("".join( li.xpath( './/div[@class="productNameAndPromotions"]//h3//text()')). split()), 'raw_price': " ".join("".join( li.xpath('.//p[@class="pricePerUnit"]/text()')[0]).split()), } print(products[produrl]) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(products[produrl]) print(searches) # Download the pages brm = BrandMatcher() for url in sorted(products): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands: url_mod = clean_url(url, root_url=root_url) fname = fpath_namer(shop_id, 'pdct', d['pdct_name_on_eretailer'], 0) if not op.exists(fname): print(url_mod) driver.get(url_mod) sleep(2) driver.save_page(fname, scroll_to_bottom=True) tree = etree.parse(open(fname), parser=parser) products[url] = { 'pdct_name_on_eretailer': ''.join( tree.xpath(
# --- Search pages: save each results page with selenium, then parse it -----
# NOTE(review): collapsed one-line fragment; indentation reconstructed from
# statement semantics. `kw`, `shop_id`, `driver`, `searches`, `products`,
# `kw_parsing`, `fpath_namer`, `op`, `sleep` come from the enclosing script
# (not visible here) — confirm nesting against the original file.
for p in range(2):
    fpath = fpath_namer(shop_id, 'search', kw, p)
    if not op.exists(fpath):
        # Page not cached yet: give it time to load, scroll to trigger any
        # lazy-loaded content, then persist the rendered HTML.
        sleep(2)
        driver.smooth_scroll()
        driver.save_page(fpath, scroll_to_bottom=True)
    searches, products = kw_parsing(fpath, kw, searches, products)
print(kw, len(searches[kw]))

######################################
#        Product pages scraping      #
######################################

# Download the pages - with selenium.
brm = BrandMatcher()
# Fix: dict keys are already unique, so the original sorted(list(set(products)))
# double-wrapped for nothing — sorted(products) iterates the same keys.
for url in sorted(products):
    d = products[url]
    if brm.find_brand(d['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands:
        print(d['pdct_name_on_eretailer'])
        url_mod = clean_url(url, root_url=root_url)
        fpath = fpath_namer(shop_id, 'pdct', d['pdct_name_on_eretailer'], 0)
        if not op.exists(fpath):
            # Only hit the site when the page is not already cached on disk.
            driver.get(url_mod)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        products = pdct_parsing(fpath, url, products)
        print(products[url])
# --- Scraper setup: shop constants, caching, browser, brand matcher --------
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
import shutil
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = "binnys"
root_url = "http://www.binnys.com/"
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = "USA"
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()


# If necessary
def getprice(pricestr):
    # Strip everything except digits, '.', '$' and ',' (e.g. "$1,234.56").
    pricestr = re.sub("[^0-9.$,]", "", pricestr)
    if not pricestr:
        return None
    # Normalise whole-dollar prices so the dotted patterns below can match.
    if not "." in pricestr:
        pricestr += ".00"
    # Try price patterns from most to least specific.
    price = parse('${dollar:d}.{cent:d}', pricestr)
    if price is None:
        price = parse('${kd:d},{dollar:d}.{cent:d}', pricestr)
    if price is None:
        price = parse('${kd:d},{dollar:d}', pricestr)
    # NOTE(review): fragment truncated here — the original function presumably
    # converts the `parse` result into a numeric price and returns it; that
    # tail is not visible in this chunk.
# --- Keyword search: page through the catalogsearch results ----------------
# NOTE(review): collapsed one-line fragment; indentation reconstructed from
# statement semantics. `keywords`, `session`, `parser`, `getproduct`,
# `products`, `mh_brands`, `driver`, `fpath_namer`, `shop_id`, `op` are
# defined elsewhere in the original script — confirm nesting there.
for kw in keywords:
    searches[kw] = []
    for page in range(1, 10):
        r = session.get(
            'https://www.mybottleshop.com.au/catalogsearch/result/?p={page}&q={kw}'.format(
                page=page, kw=quote_plus(kw)))
        tree = etree.parse(BytesIO(r.content), parser=parser)
        articles = tree.xpath('id("em-grid-mode")/ul[1]/li')
        aurls = [a.xpath('.//h2[@itemprop="name"]/a/@href')[0] for a in articles]
        # Stop paging when the page is empty or only repeats already-seen URLs.
        if not articles or all(a in searches[kw] for a in aurls):
            break
        searches[kw] += aurls
        # Original used a throwaway list comprehension for side effects only.
        for a in articles:
            getproduct(a)
        print(kw, len(articles), len(searches[kw]))

# Fetch product pages for in-scope brands and extract the main image URL.
brm = BrandMatcher()
for url, product in products.items():
    if brm.find_brand(product['pdct_name_on_eretailer'])['brand'] in mh_brands:
        print(url)
        fname = fpath_namer(shop_id, 'pdct', product['pdct_name_on_eretailer'], 0)
        if not op.exists(fname):
            driver.get(url)
            sleep(2)
            driver.save_page(fname, scroll_to_bottom=True)
        # Fix: the original leaked the handle via etree.parse(open(fname), ...);
        # a with-block guarantees it is closed.
        with open(fname) as f:
            tree = etree.parse(f, parser=parser)
        data = {
            'pdct_img_main_url': tree.xpath('//meta[@property="og:image"]/@content')[0],
        }
        product.update(data)

# Download images
# NOTE(review): truncated fragment — begins mid-dict-literal (a '£'-prefixed
# raw price assembled from `span.price_amount/@content`) and ends inside a
# `tree.xpath(` call; the enclosing loop and the tail of the expression are
# not visible here, so the code is left byte-identical. The inline `assert`
# validating product fields would be stripped under `python -O` — consider an
# explicit check once the full context is available.
]), 'raw_price': '£' + ''.join( w for t in li.xpath('.//span[@class="price_amount"]/@content') for w in t.split()).strip(), } print(kw, products[produrl]) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(kw, products[produrl]) assert all(products[produrl][k] for k in products[produrl]) if not r.from_cache: sleep(3) print(kw, len(searches[kw])) # Download the pages brm = BrandMatcher() for url in sorted(products): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands: print(d['pdct_name_on_eretailer']) url_mod = clean_url(url, root_url=root_url) r = session.get(url_mod, cookies=cookies) with open( '/tmp/' + shop_id + ' ' + d['pdct_name_on_eretailer'].replace('/', "-") + '.html', 'wb') as f: f.write(r.content) tree = etree.parse(BytesIO(r.content), parser=parser) products[url] = { 'pdct_name_on_eretailer': ' '.join(w for t in tree.xpath(
# --- Build the per-segment product reference table and export it -----------
# NOTE(review): collapsed one-line fragment; indentation reconstructed from
# statement semantics. `df`, `tdf`, `BASE_DIR`, `BrandMatcher`, `op` come from
# earlier in the original script — confirm there.
# One copy of the product list per distribution segment; `must_have` flags the
# rows marked 'Y' in that segment's column.
segment_frames = []
for segment in ['SPECIAL', "ACCESSIBLE", "EXCLUSIVE"]:
    tmp = df.copy()  # original wrapped this in a redundant pd.DataFrame(...)
    tmp['segment'] = segment
    tmp['must_have'] = 1 * (tmp[segment] == 'Y')
    segment_frames.append(tmp)
# Fix: DataFrame.append was deprecated and removed in pandas 2.0, and calling
# it per-iteration re-copies the accumulated frame each time; concatenate once.
tdf = pd.concat([tdf, *segment_frames])

tdf.drop(columns=[
    'SPECIAL', "ACCESSIBLE", "EXCLUSIVE", '_merge', 'to_delete_flagship_pdct_of_brnd'
], errors='ignore', inplace=True)

# Column order expected by downstream consumers of pdcts_jp.xlsx.
final_cols = [
    'continent', 'country', 'segment', 'ctg', 'brnd', 'brnd_query',
    'pdct_name', 'pdct_quality_name', 'pdct_query', 'pdct_family',
    'pdct_order', 'brnd_order', 'abs_pdct_order', 'ref_pdct_key_viseo',
    'flagship', 'must_have', 'source', 'priority', 'min_price', 'max_price',
    'competitor', 'competitor_query', 'competitor_brnd',
    'competitor_volume_in_ml', 'volume_in_ml', 'box', 'rose', 'vintage',
    'program', 'pdct_names_equivalents', 'words_to_include', 'tolerance05',
    'exclude_terms', 'words_to_include_05', 'pdct_img_ref_path',
    'competitor_min_price', 'competitor_max_price'
]
brm = BrandMatcher()
# NOTE(review): maps from `df` and relies on index alignment to fill the
# duplicated index of `tdf`; `x == x` is the usual NaN guard. Mapping from
# `tdf['competitor']` directly would be more robust — confirm intent.
tdf['competitor_brnd'] = df['competitor'].apply(
    lambda x: brm.find_brand(x)['brand'] if x == x else '')
print("Differences in columns", set(final_cols) ^ set(tdf.columns))
tdf[final_cols].to_excel(op.join(BASE_DIR, "ressources/pdcts_jp.xlsx"), index=None)
print(f"soffice '{op.join(BASE_DIR, 'ressources/pdcts_jp.xlsx')}'")