Пример #1
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per product box in ``products`` (keyed by product URL) and appends
    each URL to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="productbox"]'):
        hrefs = li.xpath('.//div[@class="name"]//a/@href')
        if not hrefs:
            # A box without a product link cannot be keyed; skip it rather
            # than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//div[@class="name"]//text()'),
                              unicodedata_normalize=True),
            'raw_price':
            clean_xpathd_text(li.xpath(
                './/div[@class="pricebox"]//span[@class="total"]//text()'),
                              unicodedata_normalize=True),
            'raw_promo_price':
            clean_xpathd_text(
                li.xpath('.//div[contains(.//text(), "通常価格:")]//text()'),
                unicodedata_normalize=True),
            # The volume is read from the same text as the product name.
            'volume':
            clean_xpathd_text(li.xpath('.//div[@class="name"]//text()'),
                              unicodedata_normalize=True),
        }

        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        print(product)

        searches[kw].append(produrl)

    return searches, products
Пример #2
0
def ctg_parsing(fpath, ctg, categories, products):  # TODO : modify xpaths
    """Collect products from a saved category page (unfinished template).

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per product row in ``products`` (keyed by product URL) and appends
    each URL to ``categories[ctg]``.

    NOTE(review): the guard XPath ``'./zzzzzz'``, the URL XPath ``''`` and the
    promo-price XPath are unfilled placeholders. With the guard as written,
    a row is only processed if it has a ``zzzzzz`` child element.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="row productRow"]//div[@class="row"]'):
        if not li.xpath('./zzzzzz'):
            continue
        # NOTE(review): placeholder expression; presumably rejected by lxml
        # as an invalid XPath -- replace with the real link XPath before use.
        produrl = li.xpath('')
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('./div[2]/p[1]//text()')),
            'volume':
            clean_xpathd_text(
                li.xpath('./div[2]/br[1]/preceding-sibling::text()[1]')),
            'raw_price':
            clean_xpathd_text(
                li.xpath('./div[2]/br[1]/following-sibling::text()[1]')),
            'raw_promo_price':
            clean_xpathd_text(li.xpath('./zzzzzzzzzz')),
            'pdct_img_main_url':
            "".join(li.xpath('.//img[@class="img-thumbnail"]/@src')),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per result item in ``products`` (keyed by product URL) and appends
    each URL to ``categories[ctg]``. Items without a ``thumb-link`` anchor
    are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//*[@id="search-result-items"]/div'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('.//a[@class="thumb-link"]/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(
                li.xpath('.//div[@class="product-name"]//text()')),
            'volume':
            clean_xpathd_text(li.xpath('.//span[@itemprop="weight"]//text()')),
            'raw_price':
            clean_xpathd_text(li.xpath('.//span[@class="box-price"]//text()')),
            'raw_promo_price':
            '',
            'pdct_img_main_url':
            '',
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        # No 'promo_price' key is set here: raw_promo_price is always ''.
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #4
0
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``div.product`` node in ``products`` (keyed by product URL) and
    appends each URL to ``categories[ctg]``. Nodes without a link are
    skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="product"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('(./div/a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//h4/@title')),
            'raw_price':
            clean_xpathd_text(li.xpath('.//div[@class="price"]/p[2]//text()')),
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//div[@class="price"]/p[1]//text()')),
            'volume':
            clean_xpathd_text(
                li.xpath('.//span[@class="total-weight"]//text()')),
            'pdct_img_main_url':
            "".join(li.xpath('.//img[@itemprop="contentUrl"]/@src')[:1]),
        }
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        # Swap the size segment of the image path before cleaning the URL.
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'].replace(
                'w_107/h_125', 'w_305/h_376'), root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #5
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per list item in ``products`` (keyed by product URL) and appends
    each URL to ``searches[kw]``. Items without any link are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//ul[@class="products wpex-row clr"]/li'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('(.//a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//p[@class="bcs_title"]/a//text()')),
            # The volume is read from the same text as the product name.
            'volume':
            clean_xpathd_text(li.xpath('.//p[@class="bcs_title"]/a//text()')),
            'raw_price':
            clean_xpathd_text(li.xpath('.//p[@class="bcs_price"]//text()')),
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//xpath/text()')),
            'pdct_img_main_url':
            "".join(li.xpath('.//p[@class="bcs_image"]//img/@src')[:1]),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        searches[kw].append(produrl)
    return searches, products
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per product-list entry in ``products`` (keyed by product URL) and
    appends each URL to ``categories[ctg]``. Entries without any link are
    skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="product-list"]/div'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//td[@valign="top"]/a/text()')),
            'volume':
            clean_xpathd_text(
                li.xpath('.//span[@class="Srch-bottlesize"]/text()')),
            'raw_price':
            clean_xpathd_text(
                li.xpath('.//span[@class="RegularPrice"]/text()')),
            # TODO: './zzzzzzzz' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('./zzzzzzzz')),
            'pdct_img_main_url':
            "".join(li.xpath('.//td[@valign="middle"]//@src')),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``li[@data-item-id]`` node in ``products`` (keyed by product
    URL) and appends each URL to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//li[@data-item-id]'):
        hrefs = li.xpath('.//a[@class="goodsname"]/@href')
        if not hrefs:
            # An item without a product link cannot be keyed; skip it rather
            # than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//a[@class="goodsname"]//text()'),
                              unicodedata_normalize=True),
            # The volume is read from the same text as the product name.
            'volume':
            clean_xpathd_text(li.xpath('.//a[@class="goodsname"]//text()'),
                              unicodedata_normalize=True),
            'raw_price':
            clean_xpathd_text(
                li.xpath('.//p[contains(@class, "goodsprice ")]/text()'),
                unicodedata_normalize=True),
            # TODO: '//xpath' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('//xpath'), unicodedata_normalize=True),
            # ``[:1]`` tolerates items without an image (empty string)
            # instead of raising IndexError.
            'pdct_img_main_url':
            clean_url("".join(li.xpath('.//img/@src')[:1]), root_url),
        }

        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        # Drop the thumbnail path segment from the image URL.
        product['pdct_img_main_url'] = product['pdct_img_main_url'].replace(
            'thumbnail/pc/', "")
        print(product)

        searches[kw].append(produrl)
    return searches, products
Пример #8
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``div.closeup-frame`` node in ``products`` (keyed by product
    URL) and appends each URL to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="closeup-frame"]'):
        hrefs = li.xpath('.//p[@class="text"]//a/@href')
        if not hrefs:
            # A frame without a product link cannot be keyed; skip it rather
            # than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        img_srcs = li.xpath('.//p[@class="image"]//img/@src')
        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//p[@class="text"]/a/text()'),
                              unicodedata_normalize=True),
            'raw_price':
            clean_xpathd_text(li.xpath('.//span[@class="price"]//text()'),
                              unicodedata_normalize=True),
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//xpath/text()'),
                              unicodedata_normalize=True),
            # The volume is read from the same text as the product name.
            'volume':
            clean_xpathd_text(li.xpath('.//p[@class="text"]/a/text()'),
                              unicodedata_normalize=True),
            # ``[:1]`` tolerates frames without an image (empty string)
            # instead of raising IndexError.
            'pdct_img_main_url':
            clean_url("".join(img_srcs[:1]), root_url),
        }

        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        if img_srcs:
            # Replace any existing query string with the fixed "$VC_LL$" one;
            # skipped when there is no image to avoid a bogus URL.
            product['pdct_img_main_url'] = (
                product['pdct_img_main_url'].split('?')[0] + "?$VC_LL$")
        print(product)

        searches[kw].append(produrl)
    return searches, products
Пример #9
0
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per item in ``products`` (keyed by product URL) and appends each URL
    to ``categories[ctg]``. Items without a ``product-ttl`` link are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//section[@class="item-list"]/div[@class="item"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('.//div[@class="product-ttl"]/a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        # Prefer @data-src and fall back to @src. The fallback has to be
        # decided on the XPath result list: the previous code joined the list
        # into a string first, so its ``== []`` test never fired and ``[0]``
        # kept only the first character of the URL.
        img_srcs = li.xpath('.//div[@class="productimg"]//img/@data-src')
        if not img_srcs:
            img_srcs = li.xpath('.//div[@class="productimg"]//img/@src')

        product = products[produrl] = {
            # The name is read from the title link; it was previously read
            # from the price XPath (copy-paste slip).
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//div[@class="product-ttl"]/a//text()')),
            'volume': clean_xpathd_text(
                li.xpath('.//table//tr[2]//td//text()')),
            'raw_price': clean_xpathd_text(li.xpath(
                './/div[@class="product-price"]'
                '//span[@class="price-original"]//text()')),
            'raw_promo_price': clean_xpathd_text(li.xpath(
                '(.//div[@class="product-price"]'
                '//div[@class="price-sale"]/span/text())[1]')),
            # ``[:1]`` keeps the first source, or '' when there is no image.
            'pdct_img_main_url': clean_url("".join(img_srcs[:1]), root_url),
        }

        if not product['raw_price']:
            # No "price-original" span: fall back to the direct child span.
            product['raw_price'] = clean_xpathd_text(
                li.xpath('.//div[@class="product-price"]/span/text()'))

        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #10
0
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``li.col`` node in ``products`` (keyed by product URL) and
    appends each URL to ``categories[ctg]``. Nodes without any link are
    skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="content"]/ul/li[@class="col"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            # Only the first text node is used as the name; ``[:1]`` avoids
            # an IndexError when the link has no text and passes a list like
            # every other clean_xpathd_text call in this function.
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//p[@class="text"]//a//text()')[:1]),
            'volume':
            clean_xpathd_text(li.xpath('.//p[@class="text"]/a/font//text()')),
            'raw_price':
            clean_xpathd_text(li.xpath('.//p[@class="text"]/a//span//text()')),
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//xpath/text()')),
            # ``[:1]`` keeps the first source, or '' when there is no image.
            'pdct_img_main_url':
            "".join(li.xpath('.//p[@class="image"]//img/@src')[:1]),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #11
0
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``div.item.isUpdated`` node in ``products`` (keyed by product
    URL) and appends each URL to ``categories[ctg]``. Nodes without any link
    are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="item isUpdated"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('(.//a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//h4//text()')),
            'volume':
            clean_xpathd_text(li.xpath('./h4/a/text()')),
            'raw_price':
            clean_xpathd_text(li.xpath('.//*[@class="price was"]//text()')),
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//*[@itemprop="price"]//text()')),
            'pdct_img_main_url':
            "".join(li.xpath('.//figure/a/img/@src')),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #12
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``bcs_listItem`` list entry in ``products`` (keyed by product
    URL) and appends each URL to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="bcs_listItem"]/ul/li'):
        hrefs = li.xpath('.//p[@class="bcs_title"]/a/@href')
        if not hrefs:
            # An entry without a product link cannot be keyed; skip it rather
            # than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        img_srcs = li.xpath('.//p[@class="bcs_image"]//img/@src')
        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//p[@class="bcs_title"]/a//text()'),
                              unicodedata_normalize=True),
            'raw_price':
            clean_xpathd_text(li.xpath('.//p[@class="bcs_price"]//text()'),
                              unicodedata_normalize=True),
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//xpath/text()'),
                              unicodedata_normalize=True),
            # The volume is read from the same text as the raw price.
            'volume':
            clean_xpathd_text(li.xpath('.//p[@class="bcs_price"]//text()'),
                              unicodedata_normalize=True),
            # ``[:1]`` tolerates entries without an image (empty string)
            # instead of raising IndexError.
            'pdct_img_main_url':
            clean_url("".join(img_srcs[:1]), root_url),
        }
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        if img_srcs:
            # Replace any existing query string with fixed rendering
            # parameters; skipped when there is no image.
            product['pdct_img_main_url'] = (
                product['pdct_img_main_url'].split("?")[0] +
                "?sr.dw=320&sr.jqh=60&sr.dh=320&sr.mat=1")
        print(product)

        searches[kw].append(produrl)
    return searches, products
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``div.product-wrapper`` node in ``products`` (keyed by product
    URL) and appends each URL to ``searches[kw]``. Nodes without an ``h3``
    link are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@class="product-wrapper"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('.//h3/a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//h3/a//text()')),
            # TODO: './zzzz' is a placeholder volume XPath.
            'volume':
            clean_xpathd_text(li.xpath('./zzzz')),
            'raw_price':
            clean_xpathd_text(
                li.xpath('.//span[@class="price-per-bottle"]/text()')),
            'raw_promo_price':
            '',
            # ``[:1]`` keeps the first source, or '' when there is no image,
            # instead of raising IndexError.
            'pdct_img_main_url':
            "".join(li.xpath('.//a//img/@src')[:1]),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        searches[kw].append(produrl)
    return searches, products
Пример #14
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``lineupItemList`` entry in ``products`` (keyed by product URL)
    and appends each URL to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//ul[@class="lineupItemList"]/li'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            # An entry without a product link cannot be keyed; skip it rather
            # than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//p[@class="itemName"]//text()'),
                              unicodedata_normalize=True),
            # Only the first <strong> text node is used as the price.
            'raw_price':
            clean_xpathd_text(
                li.xpath('.//p[@class="price"]/strong/text()')[:1],
                unicodedata_normalize=True),
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//xpath/text()'),
                              unicodedata_normalize=True),
            # The volume is read from the same text as the product name.
            'volume':
            clean_xpathd_text(li.xpath('.//p[@class="itemName"]//text()'),
                              unicodedata_normalize=True),
            # ``[:1]`` keeps the first source, or '' when there is no image,
            # instead of raising IndexError.
            'pdct_img_main_url':
            "".join(li.xpath('.//span[@class="imgBox"]//img/@src')[:1]),
        }
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        # Swap the "_L." size suffix for "_3L." before cleaning the URL.
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'].replace('_L.', '_3L.'),
            root_url)
        print(product)

        searches[kw].append(produrl)
    return searches, products
Пример #15
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per item in ``products`` (keyed by product URL) and appends each URL
    to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//section[@class="item-list"]/div[@class="item"]'):
        hrefs = li.xpath('.//div[@class="product-ttl"]/a/@href')
        if not hrefs:
            # An item without a product link cannot be keyed; skip it rather
            # than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        # Prefer @data-src and fall back to @src when it is absent.
        img_srcs = li.xpath('.//div[@class="productimg"]//img/@data-src')
        if not img_srcs:
            img_srcs = li.xpath('.//div[@class="productimg"]//img/@src')

        title = clean_xpathd_text(
            li.xpath('.//div[@class="product-ttl"]/a//text()'),
            unicodedata_normalize=True)
        vintage = clean_xpathd_text(
            li.xpath('.//span[@class="vintage_code"]//text()'),
            unicodedata_normalize=True)

        product = products[produrl] = {
            'pdct_name_on_eretailer': title + " " + vintage,
            'raw_price': clean_xpathd_text(
                li.xpath('.//div[@class="product-price"]//span//text()'),
                unicodedata_normalize=True),
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()'), unicodedata_normalize=True),
            'volume': clean_xpathd_text(
                li.xpath('.//table//tr[2]//td//text()'),
                unicodedata_normalize=True),
            # ``[:1]`` keeps the first source, or '' when the item has no
            # image at all, instead of raising IndexError.
            'pdct_img_main_url': clean_url("".join(img_srcs[:1]), root_url),
        }

        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        print(product)

        searches[kw].append(produrl)
    return searches, products
Пример #16
0
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per "widget item product" node in ``products`` (keyed by product
    URL) and appends each URL to ``categories[ctg]``. Nodes without any link
    are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[contains(@class, "widget item product")]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('(.//a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        # The raw price is the first text node of the price wrapper followed
        # by the value of the first @data-cents attribute.
        price_text = clean_xpathd_text(
            li.xpath('(.//div[@class="_price-wrapper"]//text())[1]'))
        price_cents = clean_xpathd_text(
            li.xpath('(.//div[@class="_price-wrapper"]/div/@data-cents)[1]'))

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//div[@class="_title"]//text()')),
            # The volume is read from the same text as the product name.
            'volume':
            clean_xpathd_text(li.xpath('.//div[@class="_title"]//text()')),
            'raw_price':
            price_text + price_cents,
            'raw_promo_price':
            clean_xpathd_text(
                li.xpath('.//span[contains(@id, "_old-price")]//text()')),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']

        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #17
0
def kw_parsing(fpath, kw, searches, products):
    """Collect products from a saved search-results page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``article.product-item`` node in ``products`` (keyed by product
    URL) and appends each URL to ``searches[kw]``.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        kw: Key into ``searches``; ``searches[kw]`` must support ``append``.
        searches: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(searches, products)``, the same objects that were passed in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//article[@class="product-item"]'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            # An article without a product link cannot be keyed; skip it
            # rather than raising IndexError.
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('./@data-name')),
            # The @data-price value is suffixed with a euro sign.
            'raw_price': clean_xpathd_text(li.xpath('./@data-price')) + "€",
            # TODO: './/xpath/text()' is a placeholder promo-price XPath.
            'raw_promo_price': clean_xpathd_text(li.xpath('.//xpath/text()')),
            # The volume is read from the same attribute as the product name.
            'volume': clean_xpathd_text(li.xpath('./@data-name')),
            # ``[:1]`` keeps the first source, or '' when there is no image,
            # instead of raising IndexError.
            'pdct_img_main_url':
            "".join(li.xpath('.//picture//img/@src')[:1]),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        # Swap the size suffix of the image name before cleaning the URL.
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'].replace(
                '_165x165', '_460x460'), root_url)
        print(product)

        searches[kw].append(produrl)
    return searches, products
Пример #18
0
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``denner-tile`` list item in ``products`` (keyed by product URL)
    and appends each URL to ``categories[ctg]``. Tiles without a product
    link are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath(
            '//li[@class="item product product-item denner-tile"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath(
            './/a[@class="product photo denner-tile__link"]/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//div[@class="denner-tile__title"]/h2//text()')),
            # TODO: './zzzz' is a placeholder volume XPath.
            'volume': clean_xpathd_text(li.xpath('./zzzz')),
            # Scoped to this tile with './/'. The previous '..//' started at
            # the parent node and therefore collected the price text of
            # every sibling tile as well.
            'raw_price': clean_xpathd_text(li.xpath(
                './/div[@class="denner-price__additional"]//text()')),
            'raw_promo_price': clean_xpathd_text(li.xpath(
                './/div[@class="denner-price__additional"]//text()')),
            # TODO: './zzzzzz' is a placeholder image XPath.
            'pdct_img_main_url': "".join(li.xpath('./zzzzzz')[:1]),
        }
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        # Both raw fields start from the same text; the two helpers derive
        # the regular and the promo price string from it.
        product['raw_price'] = get_pricestr(product['raw_price'])
        product['raw_promo_price'] = get_promopricestr(
            product['raw_promo_price'])
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Collect products from a saved category page.

    Parses the file at ``fpath`` with the module-level ``parser``, stores one
    dict per ``div[@data-scope="product"]`` node in ``products`` (keyed by
    product URL) and appends each URL to ``categories[ctg]``. Nodes without
    an ``h3`` link are skipped.

    Args:
        fpath: Path of the saved page; opened in binary mode.
        ctg: Key into ``categories``; ``categories[ctg]`` must support
            ``append``.
        categories: Mapping updated in place.
        products: Mapping updated in place; an existing entry for the same
            URL is overwritten.

    Returns:
        tuple: ``(categories, products)``, the same objects that were passed
        in.
    """
    # Use a context manager so the file handle is closed once parsing is done.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//div[@data-scope="product"]'):
        # Evaluate the link XPath once and reuse the result for the guard.
        hrefs = li.xpath('(.//h3/a/@href)')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # When the link has a ``url`` query parameter, its first value is
        # used as the product URL.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        product = products[produrl] = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//h3/a/@title')),
            # TODO: the './zzz...' expressions below are placeholder XPaths.
            'volume': clean_xpathd_text(li.xpath('./zzzzzzzzz')),
            'raw_price': clean_xpathd_text(li.xpath(
                './/div[contains(@class,"prices-price _curren")]//text()')),
            'raw_promo_price': clean_xpathd_text(li.xpath('./zzzzzzzzzzzz')),
            'pdct_img_main_url': "".join(li.xpath('./zzzzzzzz')),
        }
        if not product['raw_price']:
            # No regular price block: read the "current sale" and "former"
            # spans instead, collapsing all whitespace runs to single spaces.
            sale_text = ''.join(
                li.xpath('.//span[@class="current   sale"]//text()'))
            former_text = ''.join(
                li.xpath('.//span[@class="former stroked"]//text()'))
            product.update({
                'raw_price': ' '.join(sale_text.split()),
                'raw_promo_price': ' '.join(former_text.split()),
            })
        product['brnd'] = brm.find_brand(
            product['pdct_name_on_eretailer'])['brand']
        print(product, produrl)
        product['price'] = getprice(product['raw_price'])
        product['promo_price'] = getprice(product['raw_promo_price'])
        product['pdct_img_main_url'] = clean_url(
            product['pdct_img_main_url'], root_url)
        print(product)

        categories[ctg].append(produrl)
    return categories, products
Пример #20
0
def ctg_parsing(fpath, ctg, categories, products):  # TODO : modify xpaths
    """Extract the products listed on a saved category page.

    Args:
        fpath: Path of the saved HTML page to parse.
        ctg: Category key; every product URL found is appended to
            ``categories[ctg]``, which must support ``append``.
        categories: Mapping of category key to a list of product URLs.
        products: Mapping of product URL to its attribute dict. An entry is
            created (or overwritten) for every product found on the page.

    Returns:
        The ``(categories, products)`` pair; both are mutated in place.
    """
    # Parse inside a context manager so the file handle is closed right away
    # instead of being leaked until garbage collection.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//*[contains(@id, "product_id_")]'):
        hrefs = li.xpath('.//td/a/@href')
        if not hrefs:
            # Row without a product link: nothing to key the entry on.
            continue
        produrl = hrefs[0]
        # Some links are redirects carrying the real target in a ``url``
        # query parameter; unwrap it when present.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        # NOTE(review): the 'zzzz…' xpaths look like unfilled template
        # placeholders — confirm before relying on those fields.
        products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(
                li.xpath('.//a[@class="contentpagetitle"]//text()')),
            'volume':
            clean_xpathd_text(li.xpath('.//zzzzzzzzzzz')),
            'raw_price':
            clean_xpathd_text(li.xpath('.//zzzzzzzzzzzzzz')),
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//zzzzzzzzzzzzzzz')),
            # Keep only the first match so several images do not get glued
            # into one URL.
            'pdct_img_main_url':
            "".join(li.xpath('./zzzzzzzzz')[:1]),
        }
        pdct = products[produrl]

        pdct['brnd'] = brm.find_brand(pdct['pdct_name_on_eretailer'])['brand']
        print(pdct, produrl)
        pdct['price'] = getprice(pdct['raw_price'])
        pdct['promo_price'] = getprice(pdct['raw_promo_price'])
        pdct['pdct_img_main_url'] = clean_url(pdct['pdct_img_main_url'],
                                              root_url)
        print(pdct)

        categories[ctg].append(produrl)
    return categories, products
Пример #21
0
def ctg_parsing(fpath, ctg, categories, products):  # TODO : modify xpaths
    """Extract the products listed on a saved category page.

    Args:
        fpath: Path of the saved HTML page to parse.
        ctg: Category key; every product URL found is appended to
            ``categories[ctg]``, which must support ``append``.
        categories: Mapping of category key to a list of product URLs.
        products: Mapping of product URL to its attribute dict. An entry is
            created (or overwritten) for every product found on the page.

    Returns:
        The ``(categories, products)`` pair; both are mutated in place.

    Raises:
        ValueError: If a ``data-baseprice`` attribute on the promo price
            block is present but is not a valid number.
    """
    # Parse inside a context manager so the file handle is closed right away
    # instead of being leaked until garbage collection.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//ul[contains(@class, "tiles-container")]/li'):
        hrefs = li.xpath('.//a[img]/@href')
        if not hrefs:
            # Tile without an image link: nothing to key the entry on.
            continue
        produrl = hrefs[0]
        # Some links are redirects carrying the real target in a ``url``
        # query parameter; unwrap it when present.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        # NOTE(review): the './zzzz…' xpaths look like unfilled template
        # placeholders — confirm before relying on those fields.
        products[produrl] = {
            'pdct_name_on_eretailer': clean_xpathd_text(li.xpath('.//span[@class="product-name"]/text()')),
            'volume': clean_xpathd_text(li.xpath('.//div[contains(@class, "price-sales")]/span[last()]/text()')),
            'raw_price': clean_xpathd_text(li.xpath('.//div[contains(@class,"price-sales")]/@data-baseprice')),
            'raw_promo_price': clean_xpathd_text(li.xpath('./zzzzzzzzz')),
            'pdct_img_main_url': "".join(li.xpath('./zzzzzzzzz')),
        }
        pdct = products[produrl]

        pdct['brnd'] = brm.find_brand(pdct['pdct_name_on_eretailer'])['brand']
        print(pdct, produrl)
        pdct['price'] = getprice(pdct['raw_price'])

        base_prices = li.xpath('.//div[@class="price"]/@data-baseprice')
        if base_prices:
            # The previous code stored this value as a 1-tuple (stray trailing
            # comma) and then unconditionally overwrote it, so it never took
            # effect. round() avoids float truncation (19.99 * 100 -> 1998).
            # NOTE(review): assumes getprice() also yields hundredths of the
            # currency unit — confirm.
            pdct['promo_price'] = int(round(float(base_prices[0]) * 100))
        else:
            pdct['promo_price'] = getprice(pdct['raw_promo_price'])

        pdct['pdct_img_main_url'] = clean_url(pdct['pdct_img_main_url'], root_url)
        print(pdct)

        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):  # TODO : modify xpaths
    """Extract the products listed on a saved keyword-search results page.

    Args:
        fpath: Path of the saved HTML page to parse.
        kw: Search keyword; every product URL found is appended to
            ``searches[kw]``, which must support ``append``.
        searches: Mapping of keyword to a list of product URLs.
        products: Mapping of product URL to its attribute dict. An entry is
            created (or overwritten) for every product found on the page.

    Returns:
        The ``(searches, products)`` pair; both are mutated in place.
    """
    # Parse inside a context manager so the file handle is closed right away
    # instead of being leaked until garbage collection.
    with open(fpath, 'rb') as page:
        tree = etree.parse(page, parser=parser)

    for li in tree.xpath('//li[@itemtype="http://schema.org/Product"]'):
        hrefs = li.xpath('.//a/@href')
        # Skip items with no link at all (previously an IndexError, because
        # the guard indexed [0] before checking) or with an empty first href.
        if not hrefs or not hrefs[0]:
            continue
        produrl = hrefs[0]
        # Some links are redirects carrying the real target in a ``url``
        # query parameter; unwrap it when present.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]

        products[produrl] = {
            'pdct_name_on_eretailer':
            clean_xpathd_text(li.xpath('.//*[@itemprop="name"]//text()')),
            'volume':
            clean_xpathd_text(li.xpath('.//span[@class="volume"]//text()')),
            'raw_price':
            clean_xpathd_text(
                li.xpath(
                    './/span[contains(@class, "price") and contains(@id, "product-price")]//text()'
                )),
            'raw_promo_price':
            clean_xpathd_text(li.xpath('.//p[@class="old-price"]//text()')),
            'pdct_img_main_url':
            "".join(
                li.xpath(
                    './/a[@class="product-image"]/img[@class="lazy"]/@data-original'
                )),
        }
        pdct = products[produrl]
        print(pdct, produrl)

        # Rewrite the thumbnail path segment to the other image path segment.
        # An empty string is left untouched by replace(), so no separate
        # "missing image" branch is needed.
        pdct['pdct_img_main_url'] = pdct['pdct_img_main_url'].replace(
            "small_image/x300/17f82f742ffe127f42dca9de82fb58b1",
            "image/9df78eab33525d08d6e5fb8d27136e95")

        pdct['brnd'] = brm.find_brand(pdct['pdct_name_on_eretailer'])['brand']
        print(pdct, produrl)
        pdct['price'] = getprice(pdct['raw_price'])
        pdct['promo_price'] = getprice(pdct['raw_promo_price'])
        pdct['pdct_img_main_url'] = clean_url(pdct['pdct_img_main_url'],
                                              root_url)
        print(pdct)

        searches[kw].append(produrl)
    return searches, products
Пример #23
0
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    # The first two text nodes of the price block are skipped.
    volume_nodes = tree.xpath('//div[@class="denner-price__additional"]//text()')[2:]
    # Only the first matching image is used.
    image_srcs = tree.xpath('//img[contains(@class, "fotorama__img")]/@src')[:1]
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(image_srcs), root_url),
        # NOTE(review): '/zzzzzz' looks like an unfilled template placeholder.
        'ctg_denom_txt': ' '.join(tree.xpath('/zzzzzz')),
    })
    return products
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    volume_nodes = tree.xpath('//div[@class="product-abv-vol"]/span//text()')
    # Every matching gallery image src is concatenated, as before.
    image_srcs = tree.xpath('//div[@class="product-image-gallery"]//img/@src')
    breadcrumb_nodes = tree.xpath('//div[@class="breadcrumbs"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes),
        'pdct_img_main_url': clean_url(''.join(image_srcs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb_nodes),
    })
    return products
Пример #25
0
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    # Only the first three text nodes of the description are considered.
    volume_nodes = tree.xpath('//div[@class="cd-ProductDescription"]//text()')[:3]
    # The image name is read from the slider item numbered 0; first match only.
    image_names = tree.xpath('//*[@id="productSlider"]/li[@data-itemnb="0"]/@data-imgname')[:1]
    menu_nodes = tree.xpath('//div[@class="cd-NavSubMenu"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(image_names), root_url),
        'ctg_denom_txt': ' '.join(menu_nodes),
    })
    return products
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    volume_nodes = tree.xpath('//div[contains(@class, "pack_composition")]//text()')
    # Use the last matching image. Slicing with [-1:] (rather than indexing
    # [-1]) yields an empty URL instead of an IndexError when the page has no
    # such image.
    image_srcs = tree.xpath('//img[@id="product-image-placer"]/@src')[-1:]
    breadcrumb_nodes = tree.xpath('//*[@id="breadcrumbs"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes),
        'pdct_img_main_url': clean_url(''.join(image_srcs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb_nodes),
    })
    return products
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    # The volume is taken from the page title; only its first three text
    # nodes are considered.
    title_nodes = tree.xpath('//*[@class="ContentArea"]/h1[@class="title"]//text()')[:3]
    # Only the first matching thumbnail is used.
    image_srcs = tree.xpath('//div[contains(@class, "ProductThumbImage")]//img/@src')[:1]
    breadcrumb_nodes = tree.xpath('//*[@id="ProductBreadcrumb"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(title_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(image_srcs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb_nodes),
    })
    return products
Пример #28
0
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    # The volume text is read from the ingredients list.
    volume_nodes = tree.xpath('//ul[@class="ingredients"]//text()')
    # Every matching zoom link href is concatenated, as before.
    image_hrefs = tree.xpath('//*[@class="product-img-box"]//a[@id="zoom1"]/@href')
    breadcrumb_nodes = tree.xpath('//div[@class="grid-full breadcrumbs"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(image_hrefs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb_nodes),
    })
    return products
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Close the file handle deterministically once the page is parsed.
    with open(fpath) as page:
        tree = etree.parse(page, parser=parser)

    # The volume is the 23rd text node (XPath positions are 1-based) of the
    # matched column; this is tightly coupled to the page layout.
    volume_nodes = tree.xpath('(//div[@class="col-xs-6 col-sm-12 nopadding pull-right"]//text())[23]')
    # Every matching full-size image src is concatenated, as before.
    image_srcs = tree.xpath('//span[@id="view_full_size"]//img/@src')
    breadcrumb_nodes = tree.xpath('//div[@class="breadcrumb clearfix"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes),
        'pdct_img_main_url': clean_url(''.join(image_srcs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb_nodes),
    })
    return products
Пример #30
0
def pdct_parsing(fpath, url, products): # TODO : modify xpaths
    """Enrich ``products[url]`` with details read from a saved product page.

    Args:
        fpath: Path of the saved HTML product page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL to its attribute dict.

    Returns:
        ``products``, with ``volume``, ``pdct_img_main_url`` and
        ``ctg_denom_txt`` set on ``products[url]``.

    Raises:
        KeyError: If ``url`` is not present in ``products``.
    """
    # Read the raw bytes and close the handle before parsing; the parser
    # receives undecoded bytes through an in-memory buffer, as before.
    with open(fpath, 'rb') as page:
        raw_html = page.read()
    tree = etree.parse(BytesIO(raw_html), parser=parser)

    # Only the first three text nodes of the size element are considered.
    size_nodes = tree.xpath('//*[@id="bmg_itemdetail_size"]//text()')[:3]
    # Only the first matching image is used.
    image_srcs = tree.xpath('//*[@id="loadarea"]//img/@src')[:1]
    main_nodes = tree.xpath('//div[@class="layMain"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(size_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(image_srcs), root_url),
        'ctg_denom_txt': ' '.join(main_nodes),
    })
    return products