def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="productbox"]'):
        hrefs = li.xpath('.//div[@class="name"]//a/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the same title node.
        title = clean_xpathd_text(
            li.xpath('.//div[@class="name"]//text()'),
            unicodedata_normalize=True)
        item = {
            'pdct_name_on_eretailer': title,
            'raw_price': clean_xpathd_text(
                li.xpath(
                    './/div[@class="pricebox"]//span[@class="total"]//text()'),
                unicodedata_normalize=True),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//div[contains(.//text(), "通常価格:")]//text()'),
                unicodedata_normalize=True),
            'volume': title,
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        print(item)
        searches[kw].append(produrl)
    return searches, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="row productRow"]//div[@class="row"]'):
        # NOTE(review): './zzzzzz' and the empty xpath below look like unfilled
        # template placeholders; as written, rows without a <zzzzzz> child are
        # always skipped. The real link selector still has to be supplied.
        if not li.xpath('./zzzzzz'):
            continue
        produrl = li.xpath('')
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('./div[2]/p[1]//text()')),
            'volume': clean_xpathd_text(
                li.xpath('./div[2]/br[1]/preceding-sibling::text()[1]')),
            'raw_price': clean_xpathd_text(
                li.xpath('./div[2]/br[1]/following-sibling::text()[1]')),
            'raw_promo_price': clean_xpathd_text(li.xpath('./zzzzzzzzzz')),
            'pdct_img_main_url': "".join(
                li.xpath('.//img[@class="img-thumbnail"]/@src')),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//*[@id="search-result-items"]/div'):
        hrefs = li.xpath('.//a[@class="thumb-link"]/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//div[@class="product-name"]//text()')),
            'volume': clean_xpathd_text(
                li.xpath('.//span[@itemprop="weight"]//text()')),
            'raw_price': clean_xpathd_text(
                li.xpath('.//span[@class="box-price"]//text()')),
            'raw_promo_price': '',
            'pdct_img_main_url': '',
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        # NOTE(review): 'promo_price' is intentionally left unset here (the
        # page exposes no promo price); confirm downstream code tolerates it.
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="product"]'):
        hrefs = li.xpath('(./div/a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//h4/@title')),
            'raw_price': clean_xpathd_text(
                li.xpath('.//div[@class="price"]/p[2]//text()')),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//div[@class="price"]/p[1]//text()')),
            'volume': clean_xpathd_text(
                li.xpath('.//span[@class="total-weight"]//text()')),
            'pdct_img_main_url': "".join(
                li.xpath('.//img[@itemprop="contentUrl"]/@src')[:1]),
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        # Swap the thumbnail size segment for the larger rendition.
        larger_img = item['pdct_img_main_url'].replace(
            'w_107/h_125', 'w_305/h_376')
        item['pdct_img_main_url'] = clean_url(larger_img, root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//ul[@class="products wpex-row clr"]/li'):
        hrefs = li.xpath('(.//a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the same title node.
        title = clean_xpathd_text(
            li.xpath('.//p[@class="bcs_title"]/a//text()'))
        item = {
            'pdct_name_on_eretailer': title,
            'volume': title,
            'raw_price': clean_xpathd_text(
                li.xpath('.//p[@class="bcs_price"]//text()')),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()')),
            'pdct_img_main_url': "".join(
                li.xpath('.//p[@class="bcs_image"]//img/@src')[:1]),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        # The final state is printed once (the accidental duplicate was dropped).
        print(item)
        searches[kw].append(produrl)
    return searches, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="product-list"]/div'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//td[@valign="top"]/a/text()')),
            'volume': clean_xpathd_text(
                li.xpath('.//span[@class="Srch-bottlesize"]/text()')),
            'raw_price': clean_xpathd_text(
                li.xpath('.//span[@class="RegularPrice"]/text()')),
            # NOTE(review): './zzzzzzzz' looks like an unfilled placeholder, so
            # the raw promo price is whatever clean_xpathd_text yields for no
            # matches.
            'raw_promo_price': clean_xpathd_text(li.xpath('./zzzzzzzz')),
            'pdct_img_main_url': "".join(
                li.xpath('.//td[@valign="middle"]//@src')),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//li[@data-item-id]'):
        hrefs = li.xpath('.//a[@class="goodsname"]/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the same title node.
        title = clean_xpathd_text(
            li.xpath('.//a[@class="goodsname"]//text()'),
            unicodedata_normalize=True)
        item = {
            'pdct_name_on_eretailer': title,
            'volume': title,
            'raw_price': clean_xpathd_text(
                li.xpath('.//p[contains(@class, "goodsprice ")]/text()'),
                unicodedata_normalize=True),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('//xpath'), unicodedata_normalize=True),
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': clean_url(
                "".join(li.xpath('.//img/@src')[:1]), root_url),
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        # Drop the thumbnail path segment from the image URL.
        item['pdct_img_main_url'] = item['pdct_img_main_url'].replace(
            'thumbnail/pc/', "")
        print(item)
        searches[kw].append(produrl)
    return searches, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="closeup-frame"]'):
        hrefs = li.xpath('.//p[@class="text"]//a/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the same title node.
        title = clean_xpathd_text(
            li.xpath('.//p[@class="text"]/a/text()'),
            unicodedata_normalize=True)
        img_srcs = li.xpath('.//p[@class="image"]//img/@src')
        item = {
            'pdct_name_on_eretailer': title,
            'raw_price': clean_xpathd_text(
                li.xpath('.//span[@class="price"]//text()'),
                unicodedata_normalize=True),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()'), unicodedata_normalize=True),
            'volume': title,
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': clean_url("".join(img_srcs[:1]), root_url),
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        if img_srcs:
            # Replace the query string with the site's size preset.
            item['pdct_img_main_url'] = (
                item['pdct_img_main_url'].split('?')[0] + "?$VC_LL$")
        print(item)
        searches[kw].append(produrl)
    return searches, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//section[@class="item-list"]/div[@class="item"]'):
        hrefs = li.xpath('.//div[@class="product-ttl"]/a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Prefer the lazy-load attribute and fall back to the plain src.
        # (Previously the fallback compared a joined string with [] and so
        # never ran, and indexing that string kept only its first character.)
        img_srcs = (li.xpath('.//div[@class="productimg"]//img/@data-src')
                    or li.xpath('.//div[@class="productimg"]//img/@src'))
        # The name comes from the title link plus the vintage code; it was
        # previously read from the price node by mistake.
        # NOTE(review): confirm the vintage suffix is wanted on this listing.
        name = (
            clean_xpathd_text(
                li.xpath('.//div[@class="product-ttl"]/a//text()'))
            + " "
            + clean_xpathd_text(
                li.xpath('.//span[@class="vintage_code"]//text()'))
        )
        item = {
            'pdct_name_on_eretailer': name,
            'volume': clean_xpathd_text(
                li.xpath('.//table//tr[2]//td//text()')),
            'raw_price': clean_xpathd_text(li.xpath(
                './/div[@class="product-price"]'
                '//span[@class="price-original"]//text()')),
            'raw_promo_price': clean_xpathd_text(li.xpath(
                '(.//div[@class="product-price"]'
                '//div[@class="price-sale"]/span/text())[1]')),
            'pdct_img_main_url': "".join(img_srcs[:1]),
        }
        products[produrl] = item
        if not item['raw_price']:
            # No "original price" span: fall back to the direct price span.
            item['raw_price'] = clean_xpathd_text(
                li.xpath('.//div[@class="product-price"]/span/text()'))
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="content"]/ul/li[@class="col"]'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Only the first text node of the title link is the product name.
        # When there is none, pass the empty result through instead of
        # raising IndexError.
        name_nodes = li.xpath('.//p[@class="text"]//a//text()')
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                name_nodes[0] if name_nodes else name_nodes),
            'volume': clean_xpathd_text(
                li.xpath('.//p[@class="text"]/a/font//text()')),
            'raw_price': clean_xpathd_text(
                li.xpath('.//p[@class="text"]/a//span//text()')),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()')),
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': "".join(
                li.xpath('.//p[@class="image"]//img/@src')[:1]),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="item isUpdated"]'):
        hrefs = li.xpath('(.//a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//h4//text()')),
            'volume': clean_xpathd_text(li.xpath('./h4/a/text()')),
            'raw_price': clean_xpathd_text(
                li.xpath('.//*[@class="price was"]//text()')),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//*[@itemprop="price"]//text()')),
            'pdct_img_main_url': "".join(li.xpath('.//figure/a/img/@src')),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="bcs_listItem"]/ul/li'):
        hrefs = li.xpath('.//p[@class="bcs_title"]/a/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # NOTE(review): volume is read from the price node, same as raw_price;
        # confirm this is intended rather than a copy-paste slip.
        price_text = clean_xpathd_text(
            li.xpath('.//p[@class="bcs_price"]//text()'),
            unicodedata_normalize=True)
        img_srcs = li.xpath('.//p[@class="bcs_image"]//img/@src')
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//p[@class="bcs_title"]/a//text()'),
                unicodedata_normalize=True),
            'raw_price': price_text,
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()'), unicodedata_normalize=True),
            'volume': price_text,
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': clean_url("".join(img_srcs[:1]), root_url),
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        if img_srcs:
            # Replace the query string with fixed resize parameters.
            item['pdct_img_main_url'] = (
                item['pdct_img_main_url'].split("?")[0]
                + "?sr.dw=320&sr.jqh=60&sr.dh=320&sr.mat=1")
        print(item)
        searches[kw].append(produrl)
    return searches, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@class="product-wrapper"]'):
        hrefs = li.xpath('.//h3/a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//h3/a//text()')),
            # NOTE(review): './zzzz' looks like an unfilled placeholder.
            'volume': clean_xpathd_text(li.xpath('./zzzz')),
            'raw_price': clean_xpathd_text(
                li.xpath('.//span[@class="price-per-bottle"]/text()')),
            'raw_promo_price': '',
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': "".join(li.xpath('.//a//img/@src')[:1]),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        # The final state is printed once (the accidental duplicate was dropped).
        print(item)
        searches[kw].append(produrl)
    return searches, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//ul[@class="lineupItemList"]/li'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the same title node.
        title = clean_xpathd_text(
            li.xpath('.//p[@class="itemName"]//text()'),
            unicodedata_normalize=True)
        item = {
            'pdct_name_on_eretailer': title,
            'raw_price': clean_xpathd_text(
                li.xpath('.//p[@class="price"]/strong/text()')[:1],
                unicodedata_normalize=True),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()'), unicodedata_normalize=True),
            'volume': title,
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': "".join(
                li.xpath('.//span[@class="imgBox"]//img/@src')[:1]),
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        # Swap the "_L" size suffix for the larger "_3L" rendition.
        larger_img = item['pdct_img_main_url'].replace('_L.', '_3L.')
        item['pdct_img_main_url'] = clean_url(larger_img, root_url)
        print(item)
        searches[kw].append(produrl)
    return searches, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//section[@class="item-list"]/div[@class="item"]'):
        hrefs = li.xpath('.//div[@class="product-ttl"]/a/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Prefer the lazy-load attribute and fall back to the plain src;
        # [:1] below tolerates tiles that have neither.
        img_srcs = (li.xpath('.//div[@class="productimg"]//img/@data-src')
                    or li.xpath('.//div[@class="productimg"]//img/@src'))
        # The displayed name is the title link text followed by the vintage.
        name = (
            clean_xpathd_text(
                li.xpath('.//div[@class="product-ttl"]/a//text()'),
                unicodedata_normalize=True)
            + " "
            + clean_xpathd_text(
                li.xpath('.//span[@class="vintage_code"]//text()'),
                unicodedata_normalize=True)
        )
        item = {
            'pdct_name_on_eretailer': name,
            'raw_price': clean_xpathd_text(
                li.xpath('.//div[@class="product-price"]//span//text()'),
                unicodedata_normalize=True),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()'), unicodedata_normalize=True),
            'volume': clean_xpathd_text(
                li.xpath('.//table//tr[2]//td//text()'),
                unicodedata_normalize=True),
            'pdct_img_main_url': clean_url("".join(img_srcs[:1]), root_url),
        }
        products[produrl] = item
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        print(item)
        searches[kw].append(produrl)
    return searches, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[contains(@class, "widget item product")]'):
        hrefs = li.xpath('(.//a/@href)[1]')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the same title node.
        title = clean_xpathd_text(li.xpath('.//div[@class="_title"]//text()'))
        # The price is the first text node of the wrapper followed by the
        # value of the data-cents attribute.
        price_head = clean_xpathd_text(
            li.xpath('(.//div[@class="_price-wrapper"]//text())[1]'))
        price_cents = clean_xpathd_text(
            li.xpath('(.//div[@class="_price-wrapper"]/div/@data-cents)[1]'))
        item = {
            'pdct_name_on_eretailer': title,
            'volume': title,
            'raw_price': price_head + price_cents,
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//span[contains(@id, "_old-price")]//text()')),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//article[@class="product-item"]'):
        hrefs = li.xpath('.//a/@href')
        if not hrefs:
            # A tile without a link cannot be keyed; skip it instead of
            # raising IndexError.
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Name and volume are both read from the data-name attribute.
        title = clean_xpathd_text(li.xpath('./@data-name'))
        item = {
            'pdct_name_on_eretailer': title,
            # data-price holds a bare number; append the currency sign.
            'raw_price': clean_xpathd_text(li.xpath('./@data-price')) + "€",
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//xpath/text()')),
            'volume': title,
            # [:1] keeps the first image but tolerates tiles without one.
            'pdct_img_main_url': "".join(
                li.xpath('.//picture//img/@src')[:1]),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        # Swap the thumbnail size suffix for the larger rendition.
        larger_img = item['pdct_img_main_url'].replace('_165x165', '_460x460')
        item['pdct_img_main_url'] = clean_url(larger_img, root_url)
        print(item)
        searches[kw].append(produrl)
    return searches, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath(
            '//li[@class="item product product-item denner-tile"]'):
        hrefs = li.xpath(
            './/a[@class="product photo denner-tile__link"]/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        # Both raw prices start from this tile's price block and are split
        # apart by get_pricestr / get_promopricestr below. The regular price
        # used to be searched from the parent ('..//'), which mixed in the
        # price text of every sibling tile.
        price_text = clean_xpathd_text(
            li.xpath('.//div[@class="denner-price__additional"]//text()'))
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//div[@class="denner-tile__title"]/h2//text()')),
            # NOTE(review): './zzzz' and './zzzzzz' look like unfilled
            # placeholders.
            'volume': clean_xpathd_text(li.xpath('./zzzz')),
            'raw_price': price_text,
            'raw_promo_price': price_text,
            'pdct_img_main_url': "".join(li.xpath('./zzzzzz')[:1]),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['raw_price'] = get_pricestr(item['raw_price'])
        item['raw_promo_price'] = get_promopricestr(item['raw_promo_price'])
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//div[@data-scope="product"]'):
        hrefs = li.xpath('(.//h3/a/@href)')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//h3/a/@title')),
            # NOTE(review): the './zzz…' xpaths look like unfilled
            # placeholders.
            'volume': clean_xpathd_text(li.xpath('./zzzzzzzzz')),
            'raw_price': clean_xpathd_text(li.xpath(
                './/div[contains(@class,"prices-price _curren")]//text()')),
            'raw_promo_price': clean_xpathd_text(li.xpath('./zzzzzzzzzzzz')),
            'pdct_img_main_url': "".join(li.xpath('./zzzzzzzz')),
        }
        products[produrl] = item
        if not item['raw_price']:
            # No regular price block: read the sale markup instead and
            # collapse its whitespace.
            sale_text = ''.join(
                li.xpath('.//span[@class="current sale"]//text()'))
            former_text = ''.join(
                li.xpath('.//span[@class="former stroked"]//text()'))
            item.update({
                'raw_price': ' '.join(sale_text.split()),
                'raw_promo_price': ' '.join(former_text.split()),
            })
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//*[contains(@id, "product_id_")]'):
        hrefs = li.xpath('.//td/a/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//a[@class="contentpagetitle"]//text()')),
            # NOTE(review): the 'zzz…' xpaths look like unfilled placeholders,
            # so only the name is actually scraped from the listing here.
            'volume': clean_xpathd_text(li.xpath('.//zzzzzzzzzzz')),
            'raw_price': clean_xpathd_text(li.xpath('.//zzzzzzzzzzzzzz')),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//zzzzzzzzzzzzzzz')),
            'pdct_img_main_url': "".join(li.xpath('./zzzzzzzzz')[:1]),
        }
        products[produrl] = item
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def ctg_parsing(fpath, ctg, categories, products):
    """Parse a saved category page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        ctg: Category name; ``categories[ctg]`` is appended to.
        categories: Mapping of category -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(categories, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//ul[contains(@class, "tiles-container")]/li'):
        hrefs = li.xpath('.//a[img]/@href')
        if not hrefs:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//span[@class="product-name"]/text()')),
            'volume': clean_xpathd_text(li.xpath(
                './/div[contains(@class, "price-sales")]'
                '/span[last()]/text()')),
            'raw_price': clean_xpathd_text(li.xpath(
                './/div[contains(@class,"price-sales")]/@data-baseprice')),
            # NOTE(review): './zzzzzzzzz' looks like an unfilled placeholder.
            'raw_promo_price': clean_xpathd_text(li.xpath('./zzzzzzzzz')),
            'pdct_img_main_url': "".join(li.xpath('./zzzzzzzzz')),
        }
        products[produrl] = item
        base_prices = li.xpath('.//div[@class="price"]/@data-baseprice')
        if base_prices:
            # Convert the attribute to an integer number of hundredths.
            # round() avoids float truncation (19.99 * 100 == 1998.99...).
            # Previously a trailing comma stored a 1-tuple here and the value
            # was then overwritten unconditionally further down.
            # NOTE(review): assumes getprice() uses the same unit — confirm.
            item['promo_price'] = int(round(float(base_prices[0]) * 100))
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        if 'promo_price' not in item:
            item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        categories[ctg].append(produrl)
    return categories, products
def kw_parsing(fpath, kw, searches, products):
    """Parse a saved keyword-search page and record the products it lists.

    Args:
        fpath: Path of the saved HTML page.
        kw: Search keyword; ``searches[kw]`` is appended to.
        searches: Mapping of keyword -> list of product URLs.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``(searches, products)`` pair, both mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(fobj, parser=parser)
    for li in tree.xpath('//li[@itemtype="http://schema.org/Product"]'):
        hrefs = li.xpath('.//a/@href')
        # Skip tiles with no link at all (this used to raise IndexError) as
        # well as tiles whose first href is empty.
        if not hrefs or not hrefs[0]:
            continue
        produrl = hrefs[0]
        # Redirect links carry the real target in a "url" query parameter.
        query = parse_qs(urlsplit(produrl).query)
        if 'url' in query:
            produrl = query['url'][0]
        item = {
            'pdct_name_on_eretailer': clean_xpathd_text(
                li.xpath('.//*[@itemprop="name"]//text()')),
            'volume': clean_xpathd_text(
                li.xpath('.//span[@class="volume"]//text()')),
            'raw_price': clean_xpathd_text(
                li.xpath(
                    './/span[contains(@class, "price") and contains(@id, "product-price")]//text()'
                )),
            'raw_promo_price': clean_xpathd_text(
                li.xpath('.//p[@class="old-price"]//text()')),
            'pdct_img_main_url': "".join(
                li.xpath(
                    './/a[@class="product-image"]/img[@class="lazy"]/@data-original'
                )),
        }
        products[produrl] = item
        print(item, produrl)
        # Point the thumbnail cache path at the full-size image; replace()
        # leaves an empty URL empty.
        item['pdct_img_main_url'] = item['pdct_img_main_url'].replace(
            "small_image/x300/17f82f742ffe127f42dca9de82fb58b1",
            "image/9df78eab33525d08d6e5fb8d27136e95")
        item['brnd'] = brm.find_brand(item['pdct_name_on_eretailer'])['brand']
        print(item, produrl)
        item['price'] = getprice(item['raw_price'])
        item['promo_price'] = getprice(item['raw_promo_price'])
        item['pdct_img_main_url'] = clean_url(
            item['pdct_img_main_url'], root_url)
        print(item)
        searches[kw].append(produrl)
    return searches, products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    # The first two text nodes of the price block are skipped for the volume.
    volume_nodes = tree.xpath(
        '//div[@class="denner-price__additional"]//text()')[2:]
    img_srcs = tree.xpath('//img[contains(@class, "fotorama__img")]/@src')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(img_srcs[:1]), root_url),
        # NOTE(review): '/zzzzzz' looks like an unfilled placeholder xpath.
        'ctg_denom_txt': ' '.join(tree.xpath('/zzzzzz')),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    volume_nodes = tree.xpath('//div[@class="product-abv-vol"]/span//text()')
    # All gallery image sources are concatenated into one string.
    img_srcs = tree.xpath('//div[@class="product-image-gallery"]//img/@src')
    breadcrumb = tree.xpath('//div[@class="breadcrumbs"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes),
        'pdct_img_main_url': clean_url(''.join(img_srcs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    # Only the first three text nodes of the description feed the volume.
    volume_nodes = tree.xpath(
        '//div[@class="cd-ProductDescription"]//text()')[:3]
    img_names = tree.xpath(
        '//*[@id="productSlider"]/li[@data-itemnb="0"]/@data-imgname')
    breadcrumb = tree.xpath('//div[@class="cd-NavSubMenu"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(img_names[:1]), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    volume_nodes = tree.xpath(
        '//div[contains(@class, "pack_composition")]//text()')
    img_srcs = tree.xpath('//img[@id="product-image-placer"]/@src')
    breadcrumb = tree.xpath('//*[@id="breadcrumbs"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes),
        # [-1:] keeps the last match but tolerates pages without the image
        # (plain [-1] raised IndexError).
        'pdct_img_main_url': clean_url(''.join(img_srcs[-1:]), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    # Only the first three text nodes of the page title feed the volume.
    volume_nodes = tree.xpath(
        '//*[@class="ContentArea"]/h1[@class="title"]//text()')[:3]
    img_srcs = tree.xpath(
        '//div[contains(@class, "ProductThumbImage")]//img/@src')
    breadcrumb = tree.xpath('//*[@id="ProductBreadcrumb"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(img_srcs[:1]), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    volume_nodes = tree.xpath('//ul[@class="ingredients"]//text()')
    # The zoom link's href is used as the main image URL.
    img_hrefs = tree.xpath(
        '//*[@class="product-img-box"]//a[@id="zoom1"]/@href')
    breadcrumb = tree.xpath('//div[@class="grid-full breadcrumbs"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(img_hrefs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Close the file handle as soon as the document has been parsed.
    with open(fpath) as fobj:
        tree = etree.parse(fobj, parser=parser)
    # The volume is taken from the 23rd text node of the details column.
    # NOTE(review): this positional selector is fragile if the layout shifts.
    volume_nodes = tree.xpath(
        '(//div[@class="col-xs-6 col-sm-12 nopadding pull-right"]'
        '//text())[23]')
    img_srcs = tree.xpath('//span[@id="view_full_size"]//img/@src')
    breadcrumb = tree.xpath('//div[@class="breadcrumb clearfix"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes),
        'pdct_img_main_url': clean_url(''.join(img_srcs), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products
def pdct_parsing(fpath, url, products):
    """Add fields scraped from a saved product page to ``products[url]``.

    Args:
        fpath: Path of the saved HTML page.
        url: Product URL; must already be a key of ``products``.
        products: Mapping of product URL -> scraped fields (updated in place).

    Returns:
        The ``products`` mapping, mutated in place.
    """
    # TODO : modify xpaths
    # Read the raw bytes and close the file before handing them to the parser.
    with open(fpath, 'rb') as fobj:
        tree = etree.parse(BytesIO(fobj.read()), parser=parser)
    # Only the first three text nodes of the size block feed the volume.
    volume_nodes = tree.xpath('//*[@id="bmg_itemdetail_size"]//text()')[:3]
    img_srcs = tree.xpath('//*[@id="loadarea"]//img/@src')
    breadcrumb = tree.xpath('//div[@class="layMain"]//text()')
    products[url].update({
        'volume': clean_xpathd_text(volume_nodes, unicodedata_normalize=True),
        'pdct_img_main_url': clean_url(''.join(img_srcs[:1]), root_url),
        'ctg_denom_txt': ' '.join(breadcrumb),
    })
    return products