def parse_product(self, response):
    """Yield one item per product combination, or the base item if none.

    The base item comes from ``self.parse_product_base(response)``. When a
    script on the page contains ``var combinations={...};`` the JSON object
    is decoded and every entry produces its own item, built on top of the
    base item's values:

    * ``identifier`` becomes ``<base identifier>-<combination key>``;
    * ``sku`` and ``stock`` are replaced by the entry's ``reference`` and
      ``quantity``;
    * a non-zero entry ``price`` replaces the price with that value times
      1.2, rounded to two decimals;
    * the entry's ``attributes_values`` are added to the ``name`` values in
      sorted key order;
    * the image URL is switched to the combination's own image when it can
      be derived (see the inline comment).

    :param response: response of a product page.
    """
    prod = self.parse_product_base(response)

    default_image = re.search(r'var idDefaultImage=(\d+)', response.body)
    default_image_id = default_image.group(1) if default_image else None

    data = response.xpath('//script/text()').re_first('var combinations=({.+?});')
    if not data:
        yield prod
        return

    for identifier, combination in json.loads(data).items():
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, prod)
        loader.replace_value('identifier', '-'.join((prod['identifier'], identifier)))
        loader.replace_value('sku', combination['reference'])
        loader.replace_value('stock', combination['quantity'])

        option_price = Decimal(combination['price'])
        if option_price != 0:
            price = (option_price * Decimal('1.2')).quantize(Decimal('0.01'))
            loader.replace_value('price', price)

        attr_values = combination['attributes_values']
        for attr in sorted(attr_values):
            loader.add_value('name', attr_values[attr])

        # The variant image URL used to be computed and then discarded, and
        # the computation itself raised TypeError when idDefaultImage was
        # not found on the page. Store it, but only when every piece needed
        # for the substitution is present; otherwise the base image is kept.
        # Only a plain numeric id is substituted, so values such as -1 or a
        # missing id leave the base image untouched.
        base_image = prod.get('image_url')
        image_id = str(combination.get('id_image') or '')
        if default_image_id and base_image and image_id.isdigit():
            loader.replace_value(
                'image_url', base_image.replace(default_image_id, image_id))

        yield loader.load_item()
def string_to_decimal(self, price):
    """Parse a price label and return it through ``self.decimal_to_float``.

    Recognised forms, tried in this order (case-insensitive):

    1. ``Was <euro>12.50``  2. ``Was 90c``  3. ``Was <euro>12``
    4. ``<euro>12.50``      5. ``<euro>12``  6. ``90c``

    where ``<euro>`` is the euro sign (U+20AC); forms 4-6 must start the
    string. Amounts ending in ``c`` are cents and are divided by 100, so
    ``5c`` is 0.05 and ``150c`` is 1.5.

    :param price: text to parse; may be empty or ``None``.
    :returns: ``self.decimal_to_float(amount)`` where ``amount`` is a
        ``Decimal``, or ``None`` when ``price`` is empty or nothing matches.
    """
    amount = None
    if price:
        # Built by concatenation so the euro sign is written as an escape
        # and the patterns do not depend on the source file's encoding.
        euro = u"\u20ac"
        patterns = (
            r"Was\s" + euro + r"(\d+\.\d*)",
            r"Was\s(\d+c)",
            r"Was\s" + euro + r"(\d+)",
            r"^" + euro + r"(\d+\.\d*)",
            r"^" + euro + r"(\d+)",
            r"^(\d+c)",
        )
        for pattern in patterns:
            found = re.search(pattern, price, re.I)
            if not found:
                continue
            text = found.group(1)
            if text[-1:] in ("c", "C"):
                # Cents, e.g. 90c. Dividing by 100 keeps single-digit and
                # three-digit cent values correct, which prefixing the
                # digits with "0." did not.
                amount = Decimal(text[:-1]) / 100
            else:
                amount = Decimal(text)
            break
    return self.decimal_to_float(amount)
def parse_product(self, response):
    """Yield the product, or one item per ``addCombination(...)`` call.

    Dispatch is by URL:

    * URLs containing ``unlimited-telecom.fr`` whose body contains
      ``addCombination...;`` calls produce one item per call. The price is
      ``productPriceTaxExcluded + offset * currencyRate`` with the three
      values read from the page's JavaScript, and the texts of the
      referenced ``<option>`` elements are appended to the product name.
    * ``.fr`` pages without such calls and URLs containing
      ``unlimited-telecom.com`` are delegated to
      ``self.parse_product_base(response, hxs)``.
    * any other URL is logged as an error and yields nothing.

    :param response: response of a product page.
    """
    hxs = HtmlXPathSelector(response)

    if "unlimited-telecom.fr" in response.url:
        has_combinations = re.search('addCombination.*?;', response.body)
    elif "unlimited-telecom.com" in response.url:
        has_combinations = None
    else:
        self.log("ERROR unknown url: " + response.url)
        return

    if not has_combinations:
        prod = self.parse_product_base(response, hxs)
        if prod:
            yield prod
        return

    def js_decimal(var_name, default):
        # Read ``var <name> ... <number>`` from the inline JavaScript.
        found = re.search('var ' + var_name + r'\D+([\d\.]+)', response.body)
        return Decimal(found.group(1)) if found else default

    # The page fields are only needed for combination items, so they are
    # extracted after the dispatch above: a page that is handed to
    # parse_product_base can no longer fail on a missing <h1> or image.
    base_url = get_base_url(response)
    breadcrumb = hxs.select('//div[@class="breadcrumb"]/a[last()]/text()').extract()
    category = breadcrumb[-1].strip() if breadcrumb else 'No category'
    name = hxs.select('//div[@id="primary_block"]/h1//text()').extract().pop().strip()
    images = hxs.select('//div[@id="image-block"]/img/@src').extract()
    product_url = urljoin_rfc(base_url, response.url)
    image_url = urljoin_rfc(base_url, images[-1]) if images else None

    currency_rate = js_decimal('currencyRate', 1)
    price_tax_excluded = js_decimal('productPriceTaxExcluded', 0)
    default_image = re.search(r'var idDefaultImage = (\d+)', response.body)
    default_image_id = default_image.group(1) if default_image else None

    # Lookup table: <option value> -> option text. It is built per element
    # so that an option without a value or without text cannot shift the
    # pairing of the remaining ones.
    options = {}
    for option in hxs.select('//div[@id="attributes"]//select/option'):
        values = option.select('./@value').extract()
        texts = option.select('.//text()').extract()
        if values and texts:
            options[values[0]] = ''.join(texts)

    for call in re.finditer('addCombination.*?;', response.body):
        args = call.group(0).split(',')
        offset = Decimal(args[-6])

        # Everything between the first argument and the last seven is
        # treated as option keys.
        option_texts = []
        opt = ''
        for raw_key in args[1:max(len(args) - 7, 1)]:
            opt = re.sub(r'[^\d]+', '', raw_key)
            # An unknown key is skipped. It used to re-append the previous
            # option's text, or raise NameError on the first key.
            option_text = options.get(opt, '').strip()
            if option_text:
                option_texts.append(option_text)

        price = price_tax_excluded + offset * currency_rate

        image_id = args[-4].strip(" '")
        variant_image = image_url
        if image_url and default_image_id and image_id != default_image_id:
            variant_image = image_url.replace(
                '-' + default_image_id + '-', '-' + image_id + '-')

        sku = args[-3].strip("' ")
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name + ' ' + ' '.join(option_texts))
        loader.add_value('image_url', variant_image)
        loader.add_value('price', price)
        loader.add_value('category', category)
        # ``opt`` is the digits of the last option key of this call.
        loader.add_value('identifier', '%s_%s' % (sku, opt))
        loader.add_value('sku', sku)
        yield loader.load_item()
def parse_product(self, response):
    """Yield items for every product table found on an HTML page.

    A product is a ``<table>`` with a row holding a ``colspan="2"`` cell
    that contains an ``<h2>``; the first text node of that ``<h2>`` is the
    product name.

    The general price is read from ``span.actlarge``: with several spans
    the second one is used as is, with a single span the value is
    multiplied by 1.2. If that yields nothing, a "Price inc UK Mainland
    Carriage" amount is looked up in the table's text instead.

    * A product with ``<option>`` elements and no general price yields one
      item per option that shows a pound-sign price. A ``(Ref: ...)`` in
      the product name becomes the sku prefix, and the sku is also used as
      identifier.
    * Otherwise a single item is yielded, and only if a general price was
      found.

    Non-HTML responses yield nothing.

    :param response: response of a listing/product page.
    """
    if not isinstance(response, HtmlResponse):
        return

    pound = u'\xa3'
    # (pattern, multiplier) pairs tried in order; the first match wins.
    # Only a bare pound amount is multiplied by 1.2.
    option_price_patterns = (
        (pound + r'([\d\.,]+)inc vat', Decimal("1")),
        (pound + r'([\d\.,]+)inc', Decimal("1")),
        (r'\(' + pound + r'([\d\.,]+)\)?', Decimal("1")),
        (pound + r'([\d\.,]+)', Decimal("1.2")),
    )
    carriage_pattern = (r'Price inc UK Mainland Carriage.*?\:.*?'
                        + pound + r'([\d\.,]*)')
    ref_pattern = r'\(Ref:\s*([^\)]+)\)'

    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        u'//table[child::tr[child::td[@colspan="2" and child::h2]]]')
    for product in products:
        multiple_options = product.select(u'.//select/option')

        general_price = None
        span_prices = product.select(
            u'.//span[@class="actlarge"]/text()').extract()
        if span_prices:
            if len(span_prices) > 1:
                multiplier, raw_price = Decimal("1"), span_prices[1]
            else:
                multiplier, raw_price = Decimal("1.2"), span_prices[0]
            raw_price = raw_price.replace(pound, '').replace(",", "")
            general_price = Decimal(raw_price) * multiplier

        if not general_price:
            carriage = product.select(u'.//*/text()').re(carriage_pattern)
            # This fallback is stored as the captured text, not a Decimal.
            general_price = carriage[0] if carriage else None
            log.msg(u'Product with: Price inc UK Mainland Carriage')

        if multiple_options and general_price:
            options_text = u' '.join(
                product.select(u'.//select/option/text()').extract())
            if pound in options_text:
                log.msg(
                    u'Product with both option and general price: [%s]' %
                    response.url)

        names = product.select(u'.//h2/text()').extract()
        if not names:
            # <h2> without a direct text node: skip this table instead of
            # raising IndexError and losing the rest of the page.
            continue
        name = names[0].strip()
        ref = re.search(ref_pattern, name, re.I)

        if multiple_options and not general_price:
            idx = 1
            for option in multiple_options:
                text_nodes = option.select(u'./text()').extract()
                if not text_nodes:
                    # Empty <option> (e.g. a blank placeholder): it has no
                    # price, and indexing its text used to raise
                    # IndexError.
                    continue
                option_text = text_nodes[0]

                price = None
                for pattern, multiplier in option_price_patterns:
                    found = re.search(pattern, option_text, re.I)
                    if found:
                        amount = found.group(1).replace(",", "")
                        price = Decimal(amount) * multiplier
                        break
                if price is None:
                    continue

                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', name + u' %s' % option_text.strip())
                loader.add_value('url', response.url)
                loader.add_value('price', price)
                if ref:
                    optsku = option_text.strip().lower().replace(
                        'code', '').strip('-. ').split('-')[0]
                    if optsku:
                        loader.add_value('sku', ref.group(1) + optsku)
                    else:
                        # No usable option code: number the option instead.
                        loader.add_value(
                            'sku', ref.group(1) + ".inc" + str(idx))
                        idx += 1
                    loader.add_value('identifier',
                                     loader.get_output_value('sku'))
                yield loader.load_item()
        else:
            if not general_price:
                continue
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', general_price)
            if ref:
                loader.add_value('sku', ref.group(1))
                loader.add_value('identifier',
                                 loader.get_output_value('sku'))
            yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per combination of its options.

    The base item is loaded from the page: name from ``<h1>``, identifier
    and sku from the ``product_id`` input, price from ``#myoc-lpu`` (stock
    1 when a price is present, otherwise price 0 and stock 0), image,
    breadcrumb categories without the last entry, brand, and a shipping
    cost of 0.

    Option ``<select>`` boxes are those with class ``form-control`` and an
    id containing ``option``, excluding VAT and delivery selectors. When
    such boxes exist, every combination of one option per box yields an
    item whose identifier is the product id followed by the sorted option
    ids, whose price is the base price plus the options' ``(+<pound>x)``
    surcharges, and whose name is the product name followed by the option
    names.

    Pages without an ``<h1>`` yield nothing.

    :param response: response of a product page.
    """
    hxs = HtmlXPathSelector(response)

    names = hxs.select('//h1/text()').extract()
    if not names:
        return
    name = names[0]

    identifier = hxs.select(
        '//input[@name="product_id"]/@value').extract()[0]
    price_text = hxs.select(
        '//div[@class="price"]/div[@id="myoc-lpu"]/text()').extract()
    if price_text:
        price = extract_price2uk(price_text[0])
        stock = 1
    else:
        price = Decimal(0)
        stock = 0

    loader = ProductLoader(selector=hxs, item=Product())
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('stock', stock)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('image_url', '//a[@class="thumbnail"]/img/@src')
    loader.add_value('url', response.url)
    loader.add_value('shipping_cost', 0)
    for category in hxs.select(
            '//ul[@class="breadcrumb"]/li/a/text()')[:-1].extract():
        loader.add_value('category', category)
    loader.add_xpath('brand', '//li[contains(text(), "Brand")]/a/text()')
    product = loader.load_item()

    option_boxes = hxs.select(
        '//select[@class="form-control" and contains(@id, "option")'
        ' and not(contains(./option/., "V.A.T."))'
        ' and not(contains(./option/., "VAT"))'
        ' and not(contains(./option/., "Delivery"))]')

    surcharge_pattern = r'\(\+' + u'\xa3' + r'(.*)\)'
    options_dict = {}
    option_groups = []
    for option_box in option_boxes:
        option_group = []
        for option in option_box.select(
                './option[@value!="" and not(contains(.,"VAT Exempt"))]'):
            option_id = option.select('./@value')[0].extract()
            option_name = option.select('./text()')[0].extract()
            surcharge = re.search(surcharge_pattern, option_name)
            if surcharge:
                # Drop thousands separators, which Decimal cannot parse.
                option_price = Decimal(surcharge.group(1).replace(',', ''))
            else:
                option_price = Decimal('0.00')
            option_name = re.sub('VAT Payable ?-? ?', '', option_name)
            option_name = re.sub(surcharge_pattern, '', option_name).strip()
            options_dict[option_id] = {
                'name': option_name,
                'price': option_price,
            }
            option_group.append(option_id)
        # A box whose options were all filtered out is ignored: an empty
        # group would make itertools.product yield no combination at all
        # and the product would be lost.
        if option_group:
            option_groups.append(option_group)

    if not option_groups:
        yield product
        return

    for combination in itertools.product(*option_groups):
        option_name = ' '.join(
            options_dict[option_id]['name'] for option_id in combination)
        option_price = sum(
            options_dict[option_id]['price'] for option_id in combination)
        option_identifier = '-'.join(sorted(combination))

        # Each combination gets its own copy. Mutating and re-yielding one
        # shared item made every emitted item alias the same object, so a
        # consumer holding on to an item saw it change afterwards.
        item = Product(product)
        item['identifier'] = '-'.join((identifier, option_identifier))
        item['price'] = price + option_price
        item['name'] = fix_spaces(' '.join((name, option_name)))
        yield item
def parse_product(self, response):
    """Yield the product, or one item per ``addCombination(...)`` call.

    Pages without a ``#primary_block`` element yield nothing.

    The brand is guessed from the first text node of the first paragraph
    of the short description: ``' di '`` and ``' by '`` are normalised to
    ``' da '`` and the text after the last ``' da '`` is kept. A candidate
    that is still longer than 20 characters is replaced by its longest
    common substring with the page title (both title-cased), and dropped
    if that substring is shorter than 7 characters or has no space.

    When the body contains ``addCombination...;`` calls, each call yields
    an item priced as
    ``(productPriceTaxExcluded + offset * currencyRate) * (1 + taxRate/100)``
    minus the percentage and fixed reductions, rounded to two decimals;
    all of those values are read from the page's JavaScript. Otherwise a
    single item is yielded with the price displayed in
    ``#our_price_display``.

    :param response: response of a product page.
    """
    hxs = HtmlXPathSelector(response)
    if not hxs.select('//div[@id="primary_block"]'):
        return

    def js_decimal(var_name, default):
        # Read ``var <name> ... <number>`` from the inline JavaScript.
        found = re.search('var ' + var_name + r'\D+([\d\.]+)', response.body)
        return Decimal(found.group(1)) if found else default

    product_id = hxs.select('//input[@name="id_product"]/@value').extract()[0]
    name = hxs.select('//div[@id="dfCenter"]//h1/text()').extract()[0]
    category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()[1:]
    images = hxs.select('//img[@id="bigpic"]/@src').extract()
    image_url = images[0] if images else None
    product_url = response.url

    # --- brand guess ---------------------------------------------------
    product_brand = None
    description = hxs.select(
        '//div[@id="short_description_content"]//p[1]//text()').extract()
    # A page without a short description simply gets no brand. Indexing
    # the empty result used to raise IndexError and lose the product.
    if description:
        brand = description[0].replace(' di ', ' da ').replace(' by ', ' da ')
        if len(brand) > 20:
            phrase = re.search(' da.+?[,.]', brand)
            if phrase:
                brand = phrase.group(0)
        brand = brand.split(' da ')[-1].strip().strip('.,')
        if len(brand) > 20:
            titles = hxs.select('//title/text()').extract()
            if titles:
                matcher = SequenceMatcher(a=brand.title(), b=titles[0].title())
                # The title is searched from index 1, as before.
                match = matcher.find_longest_match(
                    0, len(matcher.a), 1, len(matcher.b))
                brand = matcher.a[match[0]:match[0] + match[-1]].strip()
                # NOTE(review): this sanity check is read as applying only
                # to the title-matched candidate; confirm against the
                # original indentation.
                if len(brand) < 7 or ' ' not in brand:
                    brand = None
            else:
                # No title to match the long candidate against.
                brand = None
        product_brand = brand

    # --- simple product ------------------------------------------------
    if not re.search('addCombination.*?;', response.body):
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name)
        loader.add_value('image_url', image_url)
        loader.add_xpath('price', '//*[@id="our_price_display"]/text()',
                         lambda x: extract_price_eu(x[0]) if x else Decimal('0'))
        loader.add_value('category', category)
        loader.add_value('identifier', product_id)
        loader.add_xpath('sku', '//*[@id="product_reference"]/span/text()')
        loader.add_value('brand', product_brand)
        yield loader.load_item()
        return

    # --- product with combinations -------------------------------------
    currency_rate = js_decimal('currencyRate', 1)
    tax_rate = js_decimal('taxRate', 0)
    reduction_percent = js_decimal('reduction_percent', 0)
    reduction_price = js_decimal('reduction_price', 0)
    price_tax_excluded = js_decimal('productPriceTaxExcluded', 0)
    default_image = re.search(r'var idDefaultImage = (\d+)', response.body)
    default_image_id = default_image.group(1) if default_image else None

    # Lookup table: <option value> -> option text. It is built per element
    # so that an option without a value or without text cannot shift the
    # pairing of the remaining ones.
    options = {}
    for option in hxs.select('//div[@id="attributes"]//select/option'):
        values = option.select('./@value').extract()
        texts = option.select('.//text()').extract()
        if values and texts:
            options[values[0]] = ''.join(texts)

    # Same for every combination, so computed once.
    tax = (tax_rate / Decimal('100')) + 1

    # addCombination(5631, new Array('259'), 11, 109.99, 0, -1, 'GGT3050', 0.00, 1);
    for call in re.finditer(r'addCombination\((.*?)\);', response.body):
        args = call.group(0).split(',')
        offset = Decimal(args[-6])

        # Everything between the first argument and the last seven is
        # treated as option keys.
        option_texts = []
        for raw_key in args[1:max(len(args) - 7, 1)]:
            opt = re.sub(r'[^\d]+', '', raw_key)
            # An unknown key is skipped. It used to re-append the previous
            # option's text, or raise NameError on the first key.
            option_text = options.get(opt, '').strip()
            if option_text:
                option_texts.append(option_text)

        price = (price_tax_excluded + offset * currency_rate) * tax
        if reduction_price or reduction_percent:
            price -= price * (reduction_percent / Decimal('100')) + reduction_price
        price = round(price, 2)

        image_id = args[-4].strip(" '")
        variant_image = image_url
        if (image_url and default_image_id and image_id != "-1"
                and image_id != default_image_id):
            variant_image = image_url.replace(
                '-' + default_image_id + '-', '-' + image_id + '-')

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name + ' ' + ' '.join(option_texts))
        loader.add_value('image_url', variant_image)
        loader.add_value('brand', product_brand)
        loader.add_value('price', price)
        loader.add_value('category', category)
        # The first call argument is the combination id.
        loader.add_value('identifier', '%s-%s' % (
            product_id, re.search(r'(\d+)', args[0]).group(1)))
        loader.add_value('sku', args[-3].strip("' ").decode('utf8'))
        yield loader.load_item()