def parse_products(self, response):
    """Yield one grocery item built from a product-detail JSON response.

    The body is decoded with ``parse_json_response``. The product is read
    from the first item of the lane whose ``type`` is
    ``'ProductDetailLane'``. Nothing is yielded when that lane is missing,
    has no items, or its first item is empty.

    Args:
        response: Response handed to ``parse_json_response``; its ``url``
            is stored on the yielded item.

    Yields:
        The object returned by ``create_grocery_bot_item``.
    """
    json_res = parse_json_response(response)
    page_title = json_res['title']
    lanes = json_res['_embedded']['lanes']

    # next() needs a default here: inside a generator an escaping
    # StopIteration is converted into a RuntimeError (PEP 479).
    product_lane = next(
        (lane for lane in lanes if lane['type'] == 'ProductDetailLane'),
        None)
    if product_lane is None:
        return

    items = product_lane['_embedded']['items']
    if not items or not items[0]:
        return
    product = items[0]['_embedded']['product']

    product_name = product['description']

    description = product['details']['summary']
    if description is not None:
        # Strip the list markup tokens embedded in the summary text.
        description = description.replace('[list]', '').replace('[*]', '')

    size = None
    weight_q = None
    weight_ind = None
    size_or_weight = product['unitSize']
    if size_or_weight is not None:
        if "stuk" in size_or_weight:
            # Sold per piece: keep the raw text as the size.
            size = size_or_weight
        else:
            weight_q = WeightStandardizer.standardize_quantity(
                size_or_weight)
            weight_ind = WeightStandardizer.standardize_indicator(
                size_or_weight)

    img_src = None
    images = product['images']
    if images and images[0]:
        img_src = images[0]['link']['href']

    price = product['priceLabel']['now']

    yield create_grocery_bot_item(product_name, page_title, description,
                                  'albert heijn ah', response.url, dt.now(),
                                  weight_q, weight_ind, size, '', price,
                                  img_src)
def parse_product(self, response):
    """Yield one grocery item scraped from a Vomar product page.

    Every field is read with CSS/XPath selectors. A selector that matches
    nothing leaves the corresponding field as ``None`` instead of raising.

    Args:
        response: Page response exposing ``css`` and ``url``.

    Yields:
        The object returned by ``create_grocery_bot_item``.
    """
    title = response.css('div.container div.container h1::text').get()

    page_title = response.css('title::text').get()
    if page_title is not None:
        page_title = page_title.replace('/', '').replace('_', '')

    img_src = response.css('div#productImage').css("img").xpath(
        "@src").get()

    # The description paragraph is located by the position of the
    # 'Beschrijving' h5 header inside the first product-info column.
    description = None
    info_columns = response.css(
        'div.container div.container div.productInfo div.col-md-6')
    if info_columns:
        info = info_columns[0]
        headers = info.css('h5::text').getall()
        if 'Beschrijving' in headers:
            desc_index = headers.index('Beschrijving')
            paragraphs = info.css('p::text')
            if desc_index < len(paragraphs):
                description = paragraphs[desc_index].get()

    size = None
    weight_q = None
    weight_ind = None
    weight_or_size = response.css('div.unitQuantity span::text').get()
    # The None check must come before any string method is called.
    if weight_or_size is not None:
        weight_or_size = weight_or_size.strip()
        weight_or_size = weight_or_size.replace("Stuks", "").replace(
            "Stuk", "")
        if "Per" in weight_or_size:
            size = weight_or_size
        else:
            weight_q = WeightStandardizer.standardize_quantity(
                weight_or_size)
            weight_ind = WeightStandardizer.standardize_indicator(
                weight_or_size)

    category = ' '.join(response.css('ol.breadcrumb li ::text').getall())
    price = ''.join(
        response.css('div.priceUnitQuantity div.price span::text').getall())

    # NOTE(review): the prefix has no URL scheme; kept as-is so stored
    # values do not change -- confirm whether 'https://' is wanted.
    image_url = 'www.vomar.nl' + img_src if img_src is not None else None

    yield create_grocery_bot_item(title, page_title, description, 'vomar',
                                  response.url, dt.now(), weight_q,
                                  weight_ind, size, category, price,
                                  image_url)
def save_product(self, response):
    """Yield one grocery item scraped from a Plus product page.

    The product name is the last breadcrumb link text and the category is
    the first two breadcrumb link texts. Fields whose selectors match
    nothing are left as ``None``.

    Args:
        response: Page response exposing ``css`` and ``url``.

    Yields:
        The object returned by ``create_grocery_bot_item``.
    """
    # Query the breadcrumb once; it feeds both the name and the category.
    crumbs = response.css("li.page-header__breadcrumb").css(
        "a::text").getall()
    product_name = crumbs[-1] if crumbs else None

    page_title = response.css("title::text").get()

    relative_src = response.css("img.lazy").xpath("@data-src").get()
    if relative_src is not None:
        img_src = "https://www.plus.nl/" + relative_src
    else:
        img_src = None

    description = None

    size = None
    weight_q = None
    weight_ind = None
    number_of_units = response.css(
        'div.product-detail-packing::text').get()
    # The None check must come before the substring test below.
    if number_of_units is not None:
        if ' \n' in number_of_units:
            number_of_units = number_of_units.strip(' \n')
        if 'stuks' in number_of_units:
            size = number_of_units
        else:
            weight_q = WeightStandardizer.standardize_quantity(
                number_of_units)
            weight_ind = WeightStandardizer.standardize_indicator(
                number_of_units)

    # Preferred price: last span text as euros plus the <sup> text as
    # cents. Fall back to the first span text when either part is missing.
    price_spans = response.css('span.price span::text').getall()
    cents = response.css('span.price sup::text').get()
    if price_spans and cents is not None:
        price = price_spans[-1] + '.' + cents
    else:
        print("COULD NOT GET TRUE PRICE")
        price = price_spans[0] if price_spans else None

    if crumbs:
        category = crumbs[:2]
    else:
        category = None
        print("Could not find category")

    yield create_grocery_bot_item(product_name, page_title, description,
                                  'plus', response.url, dt.now(), weight_q,
                                  weight_ind, size, category, price,
                                  img_src)
def parse_product(self, response):
    """Save a Lidl product page to disk, yield its item, follow tiles.

    The raw response body is written to ``data/lidl/lidl-<title>.html``.
    One item is then yielded via ``create_grocery_bot_item``, followed by
    one follow-up request per product-tile link, each handled by this same
    callback.

    Args:
        response: Page response exposing ``css``, ``body``, ``url`` and
            ``follow``.

    Yields:
        The created item, then the objects returned by
        ``response.follow``.
    """
    headline = response.css('h1.attributebox__headline--h1::text').get()
    dump_path = 'data/lidl/lidl-%s.html' % (headline,)

    # Keep a raw copy of the page next to the scraped item.
    with open(dump_path, 'wb') as dump:
        dump.write(response.body)

    yield create_grocery_bot_item(headline, response.url, dump_path,
                                  dt.now())

    tile_links = response.css(
        'div.product.product--tile a.product__body::attr(href)').getall()
    yield from (
        response.follow(href, callback=self.parse_product)
        for href in tile_links
    )
def parse_products(self, response):
    """Yield one grocery item scraped from a Jumbo product page.

    Pack-size text containing one of the unit markers ``" g"``, ``" l"``,
    ``" kg"`` or ``" ml"`` is passed to ``WeightStandardizer``; any other
    text is stored unchanged as the size.

    Args:
        response: Page response exposing ``css`` and ``url``.

    Yields:
        The object returned by ``create_grocery_bot_item``.
    """
    product_name = response.css("div.jum-column-main").css(
        "h1::text").get()
    page_title = response.css('title::text').get()
    description = response.css(
        'div.jum-summary-description p::text').get()
    image_url = response.css('div.jum-product-image-figure').css(
        "img").xpath("@data-jum-src").get()

    size = None
    weight_q = None
    weight_ind = None
    number_of_units = response.css(
        'div.jum-sale-price-info span.jum-pack-size::text').get()
    if number_of_units is not None:
        unit_markers = (" g", " l", " kg", " ml")
        if any(marker in number_of_units for marker in unit_markers):
            weight_q = WeightStandardizer.standardize_quantity(
                number_of_units)
            weight_ind = WeightStandardizer.standardize_indicator(
                number_of_units)
        else:
            size = number_of_units

    # This spider extracts no category.
    category = None

    price_texts = response.css('span.jum-price-format::text').getall()
    if price_texts:
        price_euro = price_texts[0]
        # NOTE(review): the cents were originally read with .get() on the
        # same selector, which returns this same first text node, so both
        # halves of the price are identical. That looks unintended; the
        # behaviour is preserved here -- confirm the correct selector for
        # the cents against the page markup.
        price_cent = price_texts[0]
        price = price_euro + '.' + price_cent
    else:
        price = None

    yield create_grocery_bot_item(product_name, page_title, description,
                                  'jumbo', response.url, dt.now(), weight_q,
                                  weight_ind, size, category, price,
                                  image_url)
def save_product(self, response):
    """Yield one grocery item scraped from a Coop product page.

    Fields whose selectors match nothing are left as ``None`` instead of
    raising.

    Args:
        response: Page response exposing ``css`` and ``url``.

    Yields:
        The object returned by ``create_grocery_bot_item``.
    """
    product_name = response.css('h1.altHead::text').get()
    page_title = response.css('title::text').get()

    # Only a <p> that is an immediate child of dd.active counts as the
    # description.
    description = response.css(
        'dl.definitionList dd.active>p::text').get()

    # On this site weight and size share the same sub-heading element.
    size = None
    weight_q = None
    weight_ind = None
    weight_or_size = response.css('header.gi h2.subHead::text').get()
    if weight_or_size is not None:
        if "stuk" in weight_or_size:
            size = weight_or_size
        else:
            weight_q = WeightStandardizer.standardize_quantity(
                weight_or_size)
            weight_ind = WeightStandardizer.standardize_indicator(
                weight_or_size)

    # Use the first space-separated token of the data-srcset attribute.
    srcset = response.css('div.b0_12').css('img').xpath(
        '@data-srcset').get()
    img_src = srcset.split(' ')[0] if srcset is not None else None

    # The category is the second-to-last breadcrumb text node.
    crumbs = response.css('ol.cf span::text')
    category = crumbs[-2].get() if len(crumbs) >= 2 else None

    # Join both price parts and swap commas for dots so the value is easy
    # to parse as a float. No price is reported unless both are present.
    price_main = response.css('ins.price::text').get()
    price_sup = response.css('ins.price span.sup::text').get()
    if price_main is not None and price_sup is not None:
        price = (price_main + price_sup).replace(',', '.')
    else:
        price = None

    yield create_grocery_bot_item(product_name, page_title, description,
                                  'coop', response.url, dt.now(), weight_q,
                                  weight_ind, size, category, price,
                                  img_src)