Example no. 1
# Categories scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(10):
        print(ctg, p)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            if not saucey_was_initialised:
                init_saucey(driver)
                saucey_was_initialised = True
            driver.get(url.format(page=p * 60))
            driver.wait_for_xpath('//*[@itemtype="http://schema.org/Product"]',
                                  timeout=10)
            driver.smooth_scroll(sleep_time=0.3)
            driver.save_page(fpath, scroll_to_bottom=True)
        # Parsing
        with open(fpath, 'rb') as f:
            tree = etree.parse(BytesIO(f.read()), parser=parser)
        for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'):
            produrl = "".join(li.xpath('.//a[@itemprop="url"]/@href'))
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                " ".join(''.join(
                    li.xpath('.//*[@itemprop="name"]//text()')).split()),
                'raw_price':
                ' '.join(''.join(
                    # assumed price selector; the original snippet is truncated here
                    li.xpath('.//*[@itemprop="price"]//text()')).split()),
            }
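This example (and Example no. 3 below) calls etree.parse(..., parser=parser) on pages saved to disk, but the excerpt never shows where parser comes from. A minimal sketch of the likely setup with lxml, assuming the pages were saved as raw HTML (the parse_saved_page helper name is hypothetical):

from io import BytesIO
from urllib.parse import urlsplit, parse_qs
from lxml import etree
import os.path as op

# Lenient HTML parser; tolerates the imperfect markup of saved e-retailer pages
parser = etree.HTMLParser()

def parse_saved_page(fpath):
    # Read a page written by driver.save_page() and return an lxml element tree
    with open(fpath, 'rb') as f:
        return etree.parse(BytesIO(f.read()), parser=parser)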
Example no. 2
######################################
# # KW searches scraping #############
######################################

# KW searches scraping - with Selenium - one page per search
kw_search_url = "https://www.seijoishii.com/s?search_word={kw}&x=0&y=0"  # TODO : modify URL
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    if not op.exists(fpath_namer(shop_id, 'search', kw, 0)):
        driver.get(kw_search_url.format(kw=kw))

    for p in range(2):
        fpath = fpath_namer(shop_id, 'search', kw, p)
        if not op.exists(fpath):
            sleep(2)
            driver.smooth_scroll()
            driver.save_page(fpath, scroll_to_bottom=True)
        searches, products = kw_parsing(fpath, kw, searches, products)

    print(kw, len(searches[kw]))
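
The kw_parsing helper is not shown in this excerpt. Judging from how it is called (it takes a saved page path plus the running searches and products dicts and returns both updated), a sketch could look like the following; the schema.org selectors are borrowed from Example no. 1 and are an assumption for this site:

def kw_parsing(fpath, kw, searches, products):
    # Hypothetical reconstruction: parse a saved search-results page and
    # record every product URL found under the keyword.
    with open(fpath, 'rb') as f:
        tree = etree.parse(BytesIO(f.read()), parser=parser)
    for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'):  # assumed markup
        produrl = clean_url("".join(li.xpath('.//a[@itemprop="url"]/@href')), root_url)
        searches[kw].append(produrl)
        products.setdefault(produrl, {})
    return searches, products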

######################################
# # Product pages scraping ###########
######################################

# Download the pages - with selenium
brm = BrandMatcher()
for url in sorted(set(products)):
    d = products[url]
    if brm.find_brand(d['pdct_name_on_eretailer'],
                      special_country='JP')['brand'] in mh_brands:
        pass  # original snippet truncated here; the product-page download follows
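
Every loop in these examples guards the Selenium work with `if not op.exists(fpath)`, so each page is fetched at most once and reruns are cheap. fpath_namer itself is never shown; a sketch of a compatible implementation (the DATA_DIR layout is an assumption):

import os
import os.path as op

DATA_DIR = 'data'  # assumed cache root, not shown in the original snippets

def fpath_namer(shop_id, page_type, name, page_no):
    # e.g. data/<shop_id>/ctg_rum_0.html
    os.makedirs(op.join(DATA_DIR, shop_id), exist_ok=True)
    return op.join(DATA_DIR, shop_id,
                   '{}_{}_{}.html'.format(page_type, name, page_no))

With this scheme a rerun only re-downloads pages whose files are missing, which is exactly why each loop checks op.exists(fpath) before driving the browser.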
Example no. 3
urls_ctgs_dict = {  # opening line restored; this snippet is truncated at the top
    'rum':
    'https://groceries.morrisons.com/webshop/getCategories.do?tags=%7C105651%7C103120%7C105916%7C151520&Asidebar=1',
    'liquor':
    'https://groceries.morrisons.com/webshop/getCategories.do?tags=%7C105651%7C103120%7C105916%7C151516&Asidebar=1',
}

for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0

    for p in range(1):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.get(url)
            sleep(2)
            driver.smooth_scroll(10)
            driver.save_page(fpath, scroll_to_bottom=True)
        with open(fpath, 'rb') as f:
            tree = etree.parse(BytesIO(f.read()), parser=parser)

        for li in tree.xpath(
                '//ul[contains(@class, "fops-shelf")]/li[@class="fops-item"]'):
            if not li.xpath('.//a/@href') or not li.xpath('.//h4//text()'):
                continue
            produrl = li.xpath('.//a/@href')[0]
            # Unwrap redirect links of the form "...?url=<real-url>"
            query = parse_qs(urlsplit(produrl).query)
            produrl = query['url'][0] if 'url' in query else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                li.xpath('.//h4//text()')[0].strip(),
            }  # original snippet truncated here; further fields likely followed
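
Examples no. 1 and no. 3 both unwrap redirect links whose real target is percent-encoded in a `url` query parameter. Factored out as a standalone helper (the unwrap_redirect name is hypothetical, for illustration), the trick looks like this; note that parse_qs percent-decodes the value for free:

from urllib.parse import urlsplit, parse_qs

def unwrap_redirect(produrl):
    # Return the real target of a '?url=<encoded-target>' redirect link,
    # or the URL unchanged when it is not a redirect.
    query = parse_qs(urlsplit(produrl).query)
    return query['url'][0] if 'url' in query else produrl

# Example:
# unwrap_redirect('https://example.com/r?url=https%3A%2F%2Fexample.com%2Fp%2F42')
# -> 'https://example.com/p/42'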