def _populate_from_html(self, response, product):
    """Fill product fields from the product page markup.

    Reads the reseller id from the URL, then title, brand, price, image and
    description from the page, and finally hands the product over to
    ``self._get_model_from_title``.

    :param response: page response exposing ``css`` and ``url``.
    :param product: item being populated in place.
    """
    # All "/sku<digits>" matches are passed on as a list of candidates.
    sku_ids = re.findall('\/sku(\d+)', response.url)
    cond_set(product, 'reseller_id', sku_ids)

    text_fields = (
        ('title', '[itemprop=name]::text'),
        ('brand', '#ctl00_content_lnkBrand::text'),
        ('price', '[itemprop=price]::text'),
    )
    for field, selector in text_fields:
        cond_set(product, field, response.css(selector).extract())

    raw_price = product.get('price', '')
    if raw_price and not isinstance(raw_price, Price):
        if 'Rp' in raw_price:
            # Strip the currency marker and the comma separators.
            amount = raw_price.lower().replace(
                'rp', '').replace(',', '').strip()
            product['price'] = Price(price=amount, priceCurrency='IDR')
        else:
            self.log('Unrecognized currency at %s' % response.url)

    cond_replace(product, 'image_url',
                 response.css('#prodMedia img::attr(src)').extract())

    # Append the first specification block (if any) to the description.
    spec_blocks = response.css('.spesifications').extract()
    spec_html = spec_blocks[0] if spec_blocks else ''
    cond_replace_value(product, 'description',
                       product.get('description', '') + spec_html.strip())

    self._get_model_from_title(product)
def _unify_price(self, product): price = product.get('price') if not price: return price_match = re.search('\$ *([, 0-9]+(?:\.[, 0-9]+)?)', price) if price_match: price = price_match.group(1) price = ''.join(re.split('[ ,]+', price)) cond_replace_value(product, 'price', Price('USD', price))
def _price_from_html(self, response, product):
    """Extract the price and its currency from the page markup.

    The currency comes from the ``priceCurrency`` itemprop and falls back to
    'EUR'. The stored price is wrapped in ``Price`` only when, after
    replacing commas with dots, it is a plain number (optionally padded
    with spaces).

    :param response: page response exposing ``css`` and ``xpath``.
    :param product: item whose ``price`` field is updated in place.
    """
    bol_price_css = '.product-price-bol [itemprop=price]::attr(content)'
    cond_replace(product, 'price', response.css(bol_price_css).extract())

    offer_prices = response.xpath(
        "//span[@class='offer_price']/meta[@itemprop='price']/@content"
    ).extract()
    cond_set(product, 'price', offer_prices)

    currency_sel = response.css('[itemprop=priceCurrency]::attr(content)')
    if currency_sel:
        currency = currency_sel.extract()[0]
    else:
        currency = 'EUR'

    # Decimal comma -> decimal dot before validating the number.
    amount = product.get('price', '').replace(',', '.')
    if amount and re.match(' *\d+\.?\d* *\Z', amount):
        cond_replace_value(product, 'price', Price(currency, amount))
def _populate_from_html(self, response, product):
    """Fill product fields from the product page markup.

    Populates image, title, brand, availability flags, model, reseller id,
    GBP price and description, and strips the ``#fragment`` from the
    product URL.

    :param response: page response exposing ``css``, ``xpath`` and ``url``.
    :param product: item being populated in place; must already hold
        ``url``.
    """
    cond_set(product, 'image_url',
             response.css('.largeimage::attr(src)').extract())
    cond_set(product, 'title',
             response.css('.productname::text').extract())
    cond_set(product, 'brand',
             response.css('.productbrand [itemprop=name]::text').extract())

    # One flag per listed delivery option: True when marked ".available".
    delivery_opts = [bool(opt.css('.available'))
                     for opt in response.css('.deliverycallout li')]
    available_count = sum(delivery_opts)
    if available_count:
        # The second option drives the in-store-only flag (presumably the
        # store pick-up entry -- verify against the page). Pages listing
        # fewer than two options used to raise IndexError here.
        in_store_only = (available_count == 1
                         and len(delivery_opts) > 1
                         and delivery_opts[1])
        cond_set_value(product, 'is_in_store_only', in_store_only)
    else:
        # NOTE(review): no available option is recorded as "not out of
        # stock"; kept as in the original -- confirm this is intended.
        cond_set_value(product, 'is_out_of_stock', False)

    cond_set(product, 'price',
             response.css('[itemprop=price]::text').extract(),
             unicode.strip)
    cond_set(product, 'model',
             response.css('[itemprop=model]::text').extract())

    reseller_id = re.findall(r"\/(\d+)", response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    # Pound sign, optional spaces, then digits with separators.
    price_re = u'\xa3 *\\d[\\d, .]*'
    price = product.get("price")
    if not (price and re.findall(price_re, price)):
        # The itemprop price is missing or not a pound amount: fall back to
        # the last entry of the pricing list.
        fallback = response.xpath(
            "//ul[contains(@class, 'pricing')]/li[last()]/span/text()"
        ).extract()
        price = fallback[0].strip() if fallback else None
    # A missing price (None) previously reached re.findall and raised
    # TypeError; now the price is simply left as it is.
    matches = re.findall(price_re, price) if price else []
    if matches:
        amount = re.sub(u'[\xa3, ]+', '', matches[0])
        cond_replace_value(product, 'price',
                           Price(priceCurrency='GBP', price=amount))

    xpath = '//div[@id="pdpTab1"]/node()[normalize-space()]'
    cond_set_value(product, 'description', response.xpath(xpath).extract(),
                   ''.join)

    product['url'] = product['url'].rsplit('#', 1)[0]
def parse_product_old(self, response):
    """Populate the product carried in ``response.meta`` from the page.

    Fills description, title, URL, image, locale, USD sale price and brand.

    :param response: page response; ``response.meta['product']`` is the
        item to populate.
    :returns: the populated product item.
    """
    prod = response.meta['product']
    # populate_from_open_graph is not available here because the page has
    # no type=product metadata, so the raw Open Graph values are used.
    metadata = _extract_open_graph_metadata(response)

    description = response.xpath(
        '//p[@itemprop="description"]//text()').extract()
    if description:
        cond_set_value(prod, 'description', description[0])
    else:
        cond_set_value(prod, 'description', metadata.get('description'))
    cond_set_value(prod, 'title', metadata.get('title'))
    cond_replace_value(prod, 'url', metadata.get('url'))

    # str.rstrip() removes any trailing characters from the given *set*,
    # so it could also eat the end of the real URL; remove the exact
    # suffix instead. A missing image is passed through as None.
    img_url = metadata.get('image')
    thumbnail_suffix = '?$browse_thumbnail$'
    if img_url and img_url.endswith(thumbnail_suffix):
        img_url = img_url[:-len(thumbnail_suffix)]
    cond_set_value(prod, 'image_url', img_url)

    locale = response.xpath(
        '//meta[@name="gwt:property"]/@content'
    ).re(r'locale=\s*(.*)')
    if locale:
        cond_set_value(prod, 'locale', locale[0])

    # Only the sale-price markup is queried; an item showing just its
    # regular price ends up without a price here.
    sale_price = response.xpath(
        '//span[@itemprop="price"]//span[contains(@class,"price-sales")]//text()'
    ).extract()
    amount = None
    if sale_price:
        # Keep thousands separators inside the match so "1,299.99" is not
        # cut down to "1"; the commas are dropped afterwards.
        found = re.search(r'\d[\d,]*(?:\.\d*)?|\.\d+', sale_price[0])
        if found:
            amount = found.group(0).replace(",", "")
    if amount:
        prod['price'] = Price(
            priceCurrency='USD',
            price=amount
        )

    brand = response.xpath(
        '//meta[@itemprop="brand"]/@content'
    ).extract()
    cond_set(prod, 'brand', brand)
    return prod
def _populate_from_html(self, response, product):
    """Fill product fields from the product page markup.

    Populates price (GBP), title, brand, image, out-of-stock flag, reseller
    id and description, then delegates to
    ``self._populate_related_products`` and
    ``self._populate_buyer_reviews``.

    :param response: page response exposing ``css`` and ``url``.
    :param product: item being populated in place.
    """
    cond_set(product, 'price',
             response.css('.price span::text').re(u'\u00a3([\\d, .]+)'))
    cond_set(product, 'title', _itemprop(response, 'model'), unicode.strip)
    cond_set(product, 'brand',
             _itemprop(_itemprop(response, 'brand', False), 'name'),
             unicode.strip)
    cond_set(product, 'image_url',
             _itemprop(response, 'image', False)
             .css('img::attr(src)').extract())

    # A placeholder picture is treated as "no image".
    image = product.get('image_url')
    if image and image.endswith('noImage.gif'):
        del product['image_url']

    cond_set_value(product, 'is_out_of_stock',
                   response.css('.stockMessaging::text').re(
                       'out of stock|Discontinued product'),
                   bool)

    reseller_id = re.findall(r"\/([0-9]+)[\/\?]", response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    # Keep every non-blank child node of the details container except the
    # ones that contain a form.
    details = response.css('.prodDetailsContainer').xpath(
        'node()[normalize-space()]')
    details = [d.extract() for d in details if not d.css('form')]
    if details:
        cond_set_value(product, 'description', details, conv=''.join)

    self._populate_related_products(response, product)
    self._populate_buyer_reviews(response, product)

    price = product.get('price', None)
    if price == 0:
        del product['price']
    elif price:
        # Strip thousands separators and stray spaces. The previous pattern
        # ', ' only removed a comma directly followed by a space, so a
        # value such as "1,299.00" kept its comma.
        price = re.sub('[, ]+', '', price)
        cond_replace_value(product, 'price',
                           Price(priceCurrency='GBP', price=price))
def _unify_price(self, product): price = product['price'] price = unify_price(['USD'], {'$': 'USD'}, unify_decimal(', ', '.'), 'USD')(price) cond_replace_value(product, 'price', price)
def _unify_price(self, product): price = product['price'].encode('utf-8') price = unify_price(valid_currency_codes, CURRENCY_SIGNS, unify_decimal(', ', '.'))(price) cond_replace_value(product, 'price', price)
def _unify_price(self, product): price = product.get('price') if not price: return cond_replace_value(product, 'price', Price('USD', price))
def _populate_from_html(self, response, product):
    """Fill product fields from the page markup and its embedded JSON.

    Populates title, model, price (USD), image, description, brand, related
    products, buyer reviews, URL without query string, reseller id, locale
    and the out-of-stock flag.

    :param response: page response; ``response.meta['brands']`` may list
        brand names that are matched against the start of the title.
    :param product: item being populated in place.
    """
    cond_set(product, 'title', _itemprop(response, 'name'), unicode.strip)
    product['title'] = _strip_non_ascii(product.get('title', ''))
    cond_set(product, 'model', _itemprop(response, 'model'),
             lambda s: s.replace(u'\xa0 Model # ', ''))
    cond_set(product, 'price', _itemprop(response, 'price'))
    cond_set(product, 'image_url',
             response.css('.skuImageSTD::attr(src)').extract(),
             lambda url: urljoin(response.url, url))

    xpath = '//div[@id="divDescription"]/div[@class="qOverflow"]' \
            '/node()[normalize-space()]'
    cond_set_value(product, 'description', response.xpath(xpath).extract(),
                   ''.join)
    if not product.get("description"):
        desc = response.xpath("//div[@id='SkuTabDescription']").extract()
        if desc:
            product["description"] = desc[0]

    # Brand candidates: every known brand the title starts with.
    title = product.get('title', '')
    known_brands = response.meta.get('brands', [])
    cond_set(product, 'brand',
             [brand for brand in known_brands if title.startswith(brand)])

    if product.get('description', '') == '':
        xpath = '//div[@id="divDescription"]/node()[normalize-space()]'
        product['description'] = ''.join(response.xpath(xpath).extract())

    self._populate_related_products(response, product)

    price = product.get('price')
    if price:
        if price.startswith('$'):
            price = re.sub('[$ ,]+', '', price)
            product['price'] = Price(priceCurrency='USD', price=price)
        else:
            self.log('Incorrect price format %s at %s' %
                     (price, response.url))
            product['price'] = None

    self._buyer_reviews_from_html(response, product)

    cond_replace_value(product, 'url', response.url.split('?', 1)[0])

    reseller_id = re.findall(r"\/(\d+)\.", response.url)
    reseller_id = reseller_id[0] if reseller_id else None
    cond_set_value(product, "reseller_id", reseller_id)

    # The page embeds a JSON blob assigned to quillMData. A missing or
    # malformed blob used to abort the whole item (IndexError / ValueError /
    # KeyError); now only the fields derived from it are skipped.
    data_script = re.findall(r'quillMData\s=\s(.*)</script>',
                             response.body_as_unicode())
    page_data = {}
    if data_script:
        try:
            page_data = json.loads(data_script[0])
        except ValueError:
            self.log('Unparsable quillMData at %s' % response.url)
        if not isinstance(page_data, dict):
            page_data = {}
    else:
        self.log('quillMData not found at %s' % response.url)

    brand = page_data.get('brandName')
    if brand:
        # NOTE(review): element 0 is used, which assumes brandName is a
        # list; for a plain string this yields its first character.
        cond_set_value(product, 'brand', brand[0])
    if 'culturecode' in page_data:
        cond_set_value(product, 'locale', page_data['culturecode'])

    if not product.get("is_out_of_stock"):
        red_span = response.xpath('//span[@class="red"]/text()').extract()
        if red_span:
            s = 'Currently out of stock'
            if s in red_span[0]:
                cond_set_value(product, 'is_out_of_stock', True)
            else:
                cond_set_value(product, 'is_out_of_stock', False)
def _unify_price(self, product): price = product['price'].encode('utf-8') price = unify_price(['GBP'], {SYM_GBP: 'GBP'}, unify_decimal(', ', '.'), 'GBP')(price) cond_replace_value(product, 'price', price)
def parse_product_new(self, response):
    """Populate the product carried in ``response.meta`` from the page.

    Starts from the Open Graph data, then sets locale, title, USD price,
    brand, description, the in-store-only flag and the de-duplicated list
    of recommended products.

    :param response: page response; ``response.meta['product']`` is the
        item to populate.
    :returns: the populated product item.
    """
    prod = response.meta['product']
    populate_from_open_graph(response, prod)
    prod['locale'] = 'en_US'

    og_title = response.xpath(
        '//meta[@property="og:title"]/@content'
    ).extract()
    if og_title:
        cond_set_value(prod, 'title', og_title[0].capitalize())

    price_texts = response.xpath(
        '//div[@class="sales-price-container"]'
        '/span[contains(@class, "salesprice")]/text()'
    ).extract()
    if not price_texts:
        # No sale price was found: use the regular price markup.
        price_texts = response.xpath(
            '//div[@class="product-price"]/span/text()'
        ).extract()
    if price_texts and '$' in price_texts[0]:
        amount = price_texts[0].strip()
        amount = amount.replace('$', '').replace(',', '').strip()
        prod['price'] = Price(priceCurrency='USD', price=amount)

    cond_set(prod, 'brand',
             response.xpath('//meta[@itemprop="brand"]/@content').extract())

    # The description is repopulated because the metadata one may be wrong.
    description = response.xpath(
        '//p[@itemprop="description"]/text()'
    ).extract()
    if description:
        cond_replace_value(prod, 'description', description[0].strip())

    product_messages = response.xpath(
        '//li[@class="product-message"]'
    ).extract()
    prod['is_in_store_only'] = bool(product_messages)

    recommendations = []
    seen_names = set()
    related_nodes = response.xpath(
        '//div[@id="relatedProducts"]/div[contains(@class, '
        '"recommendations")]//div[@itemprop="isRelatedTo"]'
    )
    for node in related_nodes:
        link = node.xpath('.//a[@itemprop="url"]/@href').extract()
        name = node.xpath('.//meta[@itemprop="name"]/@content').extract()
        if not (name and link):
            continue
        # The site can recommend the same item more than once.
        name_key = tuple(name)
        if name_key in seen_names:
            continue
        seen_names.add(name_key)
        recommendations.append(
            RelatedProduct(title=name[0].strip().capitalize(),
                           url=link[0].strip()))

    prod['related_products'] = {'recommended': recommendations}
    return prod