Example #1
        if len(set(categories[ctg])) == number_of_pdcts_in_ctg:
            break
        else:
            number_of_pdcts_in_ctg = len(set(categories[ctg]))
print(categories)
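
The if/else pattern above, which stops paging once a category page yields no new product URLs, recurs throughout these examples. A minimal standalone sketch of the idea, where fetch_page_urls is a hypothetical callable returning the product URLs found on a given results page:

def paginate_until_no_new(fetch_page_urls, max_pages=100):
    # Keep fetching pages until a page contributes no new product URLs
    seen = set()
    for page in range(1, max_pages + 1):
        before = len(seen)
        seen.update(fetch_page_urls(page))
        if len(seen) == before:
            # Nothing new on this page: the last results page was reached
            break
    return seen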

# KW searches scraping - with selenium - page number hard-coded in URL - multiple pages per search
search_url = "https://www.bodeboca.com/"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(search_url)
        sleep(2)
        driver.text_input(kw, '//*[@id="bodeboca-search-box"]/input')
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=False)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)

    for li in tree.xpath('//div[@id="search-results-main"]/div'):
        if not li.xpath('.//a/@href'):
            continue
        if not ' '.join(''.join(li.xpath('.//h2/a/text()')[:1]).split()):
            continue
        produrl = li.xpath('.//a/@href')[0]
        produrl = parse_qs(
            urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                urlsplit(produrl).query) else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
Example #2
        print(products[produrl])
print([(c, len(categories[c])) for c in categories])
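
The ' '.join(''.join(...).split()) idiom that appears around these XPath extractions joins the matched text nodes and collapses runs of whitespace into single spaces. Factored into a named helper (a sketch, not a function from the original files):

def clean_text(xpath_texts):
    # Join the extracted text nodes, then collapse newlines, tabs and
    # repeated spaces into single spaces
    return ' '.join(''.join(xpath_texts).split())

# e.g. clean_text(li.xpath('.//h2/a/text()'))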

for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    search_input_box_xpath = u'//*[@id="search"]'
    fpath = fpath_namer(shop_id, 'search', kw, 0)

    if not op.exists(fpath):
        if not driver.check_exists_by_xpath(search_input_box_xpath):
            # Getting back to root if search input box is not found
            driver.get(
                'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/'
            )
        driver.text_input(kw, search_input_box_xpath, enter=True)
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=True)

    # Storing and extracting info
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//ul[@class="productLister gridView"]/li'):
        produrl = li.xpath('.//h3/a/@href')[0]
        produrl = parse_qs(
            urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                urlsplit(produrl).query) else produrl
        searches[kw].append(produrl)
        products[produrl] = {
            'pdct_name_on_eretailer':
            " ".join("".join(
                li.xpath(
Example #3
######################################
# # KW searches scraping ############
######################################

# KW searches scraping - with selenium - multiple pages per search
driver.get('https://www.aeondewine.com')
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0

    # If files already exist, don't scrape
    perform_scraping = not op.exists(fpath_namer(shop_id, 'search', kw, 0))

    for p in range(5):
        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath) and perform_scraping:
            driver.text_input(kw, '//input[@id="keyword"]', enter=True)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)

        searches, products = kw_parsing(fpath, kw, searches, products)

        # Break or change pages
        if number_of_pdcts_in_kw_search == len(searches[kw]):
            print("Finished, because no more new products")
            break
        number_of_pdcts_in_kw_search = len(searches[kw])

        if not perform_scraping and not op.exists(
                fpath_namer(shop_id, 'search', kw, p + 1)):
            print("Finished, because no more saved pages")
            break
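
kw_parsing is called above but defined elsewhere in the repository. Judging from the inline parsing loops in Examples 1 and 2, a sketch of what such a function might do (the XPaths here are illustrative assumptions, not the real ones):

from io import BytesIO
from lxml import etree

parser = etree.HTMLParser()

def kw_parsing(fpath, kw, searches, products):
    # Parse the saved search-results page and register every product URL found
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//ul[@class="results"]/li'):  # illustrative XPath
        if not li.xpath('.//a/@href'):
            continue
        produrl = li.xpath('.//a/@href')[0]
        searches[kw].append(produrl)
        products[produrl] = {
            'pdct_name_on_eretailer':
                ' '.join(''.join(li.xpath('.//h3//text()')).split()),
        }
    return searches, products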
Example #4
                  'red_wine': 'https://www.belvini.de/rotweine?page={page}',
                  'white_wine': 'https://www.belvini.de/weisswein?page={page}',
                  'rum': 'https://www.belvini.de/spirituosen/rum?page={page}',
                  'brandy': 'https://www.belvini.de/spirituosen/brandy?page={page}',
                  'liquor': 'https://www.belvini.de/spirituosen/likoer?page={page}',
                  }

#  Get price list

d = {}
for kw in keywords:
    fpath = fpath_namer(shop_id, 'other', kw)
    url = "https://www.belvini.de/"
    if not op.exists(fpath):
        driver.get(url)
        driver.text_input(kw, '//input[@name="keywords"]')
        sleep(1.5)
        driver.save_page(fpath)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//*[not(contains(@id, "Help")) and @class="suggRow"]/*[@class="suggItem"]'):
        pdct_name_on_eretailer = ' '.join(''.join(li.xpath('.//*[@class="suggProduct"]//text()')).split()).strip()
        d[pdct_name_on_eretailer] = {
            'raw_price': ' '.join(''.join(li.xpath('.//*[@class="suggCat"]//text()')).split()).strip(),
            'raw_promo_price' ""
            'promo_price': '',
            }
        d[pdct_name_on_eretailer]['price'] = getprice(d[pdct_name_on_eretailer]['raw_price'])

print(len(d))
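
getprice is another helper that is not shown in these snippets. A minimal sketch that pulls the first decimal number out of a raw price string such as '19,90 EUR':

import re

def getprice(raw_price):
    # Find the first number, accepting ',' or '.' as the decimal separator
    m = re.search(r'\d+(?:[.,]\d+)?', raw_price)
    return float(m.group().replace(',', '.')) if m else None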

Example #5

######################################
# # KW searches scraping ############
######################################

# KW searches scraping - with selenium - one page per search
kw_search_url = "https://www.iy-net.jp/nspc/getproducts.do?shopcd=00209&categoryCode=30856&page=1"  # TODO : modify URL
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    if not op.exists(fpath_namer(shop_id, 'search', kw, 0)):
        driver.get(kw_search_url.format(kw=kw))

    for p in range(1):
        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath):
            driver.text_input(kw, '//input[@id="searchtxt"]', enter=True)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)

        searches, products = kw_parsing(fpath, kw, searches, products)

    print(kw, len(searches[kw]))
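
fpath_namer drives the file-based caching used in all of these examples: a page is scraped only when its saved copy is missing. The function is not defined in these snippets; a sketch consistent with its call sites might be:

import os.path as op

def fpath_namer(shop_id, page_type, name='', page=0):
    # Hypothetical layout: one cached HTML file per shop / page type / query / page
    return op.join('/tmp/scraped',
                   '{} {} {} {}.html'.format(shop_id, page_type, name, page))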

######################################
# # Download images        ###########
######################################
brm = BrandMatcher()
for url, pdt in products.items():
    if 'pdct_img_main_url' in pdt and pdt['pdct_img_main_url'] and \
            brm.find_brand(pdt['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands:
        print(pdt['pdct_name_on_eretailer'] + "." +
Example #6
    'red_wine': 'https://www.b-21.com/searchprods.asp?searchstring=red+wine&pagenumber={page}&val=0',
    'white_wine': 'https://www.b-21.com/searchprods.asp?searchstring=white+wine&pagenumber={page}&val=0',
    'tequila': 'https://www.b-21.com/searchprods.asp?searchstring=tequila&pagenumber={page}&val=0',
    'gin': 'https://www.b-21.com/searchprods.asp?searchstring=gin&pagenumber={page}&val=0',
    'rum': 'https://www.b-21.com/searchprods.asp?searchstring=rum&pagenumber={page}&val=0',
    'brandy': 'https://www.b-21.com/searchprods.asp?searchstring=brandy&pagenumber={page}&val=0',
}

for ctg, caturl in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    req_sent = False
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 1)):
        req_sent = True
        driver.get('https://www.b-21.com/')
        driver.text_input(ctg, '//input[@id="code"]', enter=True)
    for page in range(1, 100):
        url = caturl.format(page=page)
        fpath = fpath_namer(shop_id, 'ctg', ctg, page)
        if not op.exists(fpath) and req_sent:
            driver.smooth_scroll()
            driver.save_page(fpath, scroll_to_bottom=True)
        elif not op.exists(fpath) and not req_sent:
            break
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for tr in tree.xpath('//div[contains(@class, "c data2")]/table[3]/tbody/tr'):
            if not tr.xpath('.//*[contains(@class, "prodstitle")]/@href'):
                continue
            produrl = tr.xpath('.//*[contains(@class, "prodstitle")]/@href')[0]
            produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
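
The parse_qs ternary above also appears in Examples 1 and 2: it unwraps redirect links that carry the real product URL in a ?url= query parameter. Factored out as a sketch:

from urllib.parse import parse_qs, urlsplit

def unwrap_redirect(produrl):
    # If the link is a redirect with the target in its ?url= parameter,
    # return that target; otherwise return the link unchanged.
    qs = parse_qs(urlsplit(produrl).query)
    return qs['url'][0] if 'url' in qs else produrl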
Example #7
driver = CustomDriver(headless=False)

shop_id = "leshop"
root_url = "https://www.leshop.ch/"

####
search_box_xpath = '//*[@id="autocompleteSearchInput"]'
items_xpath = '//*[@data-ng-controller="AutocompleteSearchCtrl as controller"]//div[@class="item"]'
####

rows = []
for kw in [
        'vodka', 'champagne', 'whisky', 'sparkling', 'cognac', 'still wine'
]:
    driver.get(root_url)
    driver.text_input(kw, search_box_xpath, timeout=5)
    try:
        driver.wait_for_xpath(items_xpath, timeout=5)
    except Exception:
        # No autocomplete suggestions appeared for this keyword
        continue
    fpath = '/tmp/prompted ' + shop_id + ' ' + kw + '.html'
    driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for c, li in enumerate(tree.xpath(items_xpath)):
        txt = ' '.join(''.join(li.xpath('.//text()')).split())
        print(kw, shop_id)
        tmp = {'shop_id': shop_id, 'kw': kw, 'num': c, 'product': txt}
        rows.append(tmp)

pd.DataFrame(rows).to_csv(op.join("../data_prompted", shop_id + '.csv'),
                          index=False)
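
CustomDriver itself is assumed by every example but never shown. A minimal sketch of such a wrapper around Selenium, covering only the methods used above; every implementation detail here is an assumption:

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class CustomDriver:
    # Hypothetical reconstruction of the browser wrapper used above
    def __init__(self, headless=True):
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def get(self, url):
        self.driver.get(url)

    def check_exists_by_xpath(self, xpath):
        return bool(self.driver.find_elements(By.XPATH, xpath))

    def wait_for_xpath(self, xpath, timeout=10):
        WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))

    def text_input(self, text, xpath, enter=False, timeout=10):
        # Wait for the input box, type the query, optionally submit with Enter
        self.wait_for_xpath(xpath, timeout=timeout)
        el = self.driver.find_element(By.XPATH, xpath)
        el.clear()
        el.send_keys(text + (Keys.ENTER if enter else ''))

    def smooth_scroll(self, step=800, pause=0.3):
        # Scroll down in increments so lazy-loaded content has time to render
        height = self.driver.execute_script('return document.body.scrollHeight')
        for y in range(0, height, step):
            self.driver.execute_script('window.scrollTo(0, arguments[0]);', y)
            time.sleep(pause)

    def save_page(self, fpath, scroll_to_bottom=False):
        # Persist the rendered page so later runs can parse it offline
        if scroll_to_bottom:
            self.smooth_scroll()
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(self.driver.page_source)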