def parse_product(self, response):
    """Yield one product item scraped from a product detail page.

    The identifier is read from the text of the ``<dd data-product-sku>``
    element and the name from the ``<h1 itemprop="name">`` heading. If either
    is missing the page is skipped and nothing is yielded.

    :param response: response object exposing ``xpath`` and ``url``.
    :returns: generator yielding at most one loaded item.
    """
    identifier = response.xpath(
        '//dd[@data-product-sku]/text()').extract_first()
    name = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
    if identifier is None or name is None:
        # Only the "element not found" case is skipped silently; any other
        # failure now propagates instead of being hidden by a bare except.
        return

    # The SKU is embedded in the product name; take the first regex hit.
    sku_matches = self._re_sku.findall(name)
    sku = sku_matches[0] if sku_matches else ''

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('name', name)
    loader.add_xpath('brand', '//h2[@itemprop="brand"]//span/text()')
    loader.add_css('category', 'li.breadcrumb:last-child a::text')
    loader.add_value('sku', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    # NOTE(review): presumably an empty fallback so the price field is always
    # populated even when the meta tag is absent -- confirm against the
    # loader's price processors.
    loader.add_value('price', '')
    loader.add_xpath('image_url', '//a[@id="image-zoom"]/@href')
    yield loader.load_item()
def parse_product(self, response):
    """Complete the product carried in ``response.meta`` and request reviews.

    Fills in url, identifier/sku, name, category, image, brand and stock on
    the item passed in ``response.meta['product']``, runs it through
    ``self.add_shipping_cost``, attaches an empty-reviews ``KeterMeta`` and
    finally yields a ``FormRequest`` to the Yotpo batch endpoint whose
    callback is ``self.parse_review``.
    """
    hxs = HtmlXPathSelector(response)

    loader = ProductLoader(item=response.meta['product'], response=response)
    loader.add_value('url', response.url)

    # The same page attribute is used for both identifier and sku.
    product_id = response.xpath('//@data-product-id').extract_first()
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    loader.add_css('name', 'h1.product-title::text')
    loader.add_value(
        'category',
        response.xpath('//script/text()').re_first('category: "(.+?)>'))

    image_src = response.xpath(
        '//meta[@itemprop="image"]/@src').extract_first()
    if image_src:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_src))

    loader.add_value('brand', response.meta.get('brand'))
    # Presence of the add-to-cart box is treated as "in stock".
    has_cart_box = response.css('div.product-add-to-cart')
    loader.add_value('stock', '1' if has_cart_box else '0')

    product = self.add_shipping_cost(loader.load_item())
    review_meta = KeterMeta()
    review_meta['reviews'] = []
    product['metadata'] = review_meta

    # Hand-built JSON payload for the review widget; pieces are appended in
    # the same order as they are evaluated in a single concatenation.
    pid = loader.get_output_value('identifier')
    methods = '[{"method":"main_widget","params":{"pid":"' + pid + '"}},'
    methods += '{"method":"bottomline", "params":{"pid": ' + pid + ','
    methods += '"link":"' + hxs.select('//div/@data-url').extract()[0]
    methods += '", "skip_average_score":false,'
    methods += '"main_widget_pid": ' + pid + '}}]'

    form_fields = {
        'app_key': hxs.select('//div/@data-appkey').extract()[0],
        'is_mobile': 'false',
        'methods': methods,
        'widget_version': '2015-08-30_11-33-24'
    }
    yield FormRequest("http://w2.yotpo.com/batch",
                      formdata=form_fields,
                      callback=self.parse_review,
                      meta={'product': product})
def parse_product(self, response):
    """Yield a product item, re-requesting the page when it has no name.

    When the ``itemprop="name"`` element is missing, the response body is
    dumped to ``/tmp/lookfantastic`` and the same URL is requested again.
    The retry count is tracked in ``response.meta['retries']``; once it
    exceeds 10 the lookup error is re-raised.

    :param response: product page response; ``meta`` may carry ``category``
        and ``retries``.
    :returns: generator yielding either a retry ``Request`` or one item.
    :raises IndexError: when the name is still missing after the retries.
    """
    hxs = HtmlXPathSelector(response)
    try:
        name = hxs.select(
            u'//*[@itemprop="name"]/text()').extract()[0].strip()
    except IndexError:
        # Keep the offending page for debugging; the context manager makes
        # sure the handle is closed instead of leaking it.
        with open('/tmp/lookfantastic', 'w') as dump:
            dump.write(response.body)
        response.meta['retries'] = response.meta.get('retries', 0) + 1
        if response.meta['retries'] > 10:
            self.log('Giving up on url [%s]' % (response.url))
            raise
        yield Request(response.url, meta=response.meta, dont_filter=True)
        return

    # Last URL path segment without its extension is both identifier and sku.
    url_slug = response.url.split('/')[-1].split('.')[0]

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('identifier', url_slug)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', name)
    product_loader.add_xpath('brand',
                             u'(//meta[@itemprop="brand"]/@content)[1]')
    product_loader.add_css('price', '.product-price .price ::text')
    product_loader.add_value('sku', url_slug)
    product_loader.add_value('category', response.meta.get('category'))

    img = hxs.select(u'//a/img[@class="product-img"]/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    if hxs.select(
            '//p[@class="availability" and contains(text(),"In stock")]'):
        product_loader.add_value('stock', '1')

    if hxs.select('//p[@class="free-delivery"]'):
        product_loader.add_value('shipping_cost', '0')

    item = product_loader.load_item()

    metadata = FragranceDirectMeta()
    metadata['promotion'] = normalize_space(' '.join(
        hxs.select(
            '//p[contains(@class, "yousave")]//text()|//h3[@class="offer-buy-x-delivery-discount"]//text()'
        ).extract()))
    if item.get('price'):
        # Price divided by 1.2 is stored as the VAT-exclusive price.
        metadata['price_exc_vat'] = Decimal(item['price']) / Decimal('1.2')
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Follow option links if present, otherwise yield the product itself.

    A page containing ``<a itemprop="url">`` links is treated as a listing of
    options: each link is requested with this same callback and nothing else
    is produced. Otherwise a single item is built and yielded, but only when
    an identifier was found.
    """
    hxs = HtmlXPathSelector(response)

    option_links = hxs.select('//a[@itemprop="url"]/@href').extract()
    if option_links:
        for href in option_links:
            yield Request(response.urljoin(href),
                          callback=self.parse_product)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')

    breadcrumb_texts = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li/a/span/text()').extract()
    # Breadcrumb entries from the seventh onward are appended to the name
    # unless the name built so far already contains them.
    for crumb in breadcrumb_texts[6:]:
        name_so_far = loader.get_output_value('name')
        if crumb in name_so_far:
            continue
        loader.add_value('name', crumb)

    loader.add_xpath('identifier', '//meta[@itemprop="productID"]/@content')
    loader.add_xpath('price', '//span[@itemprop="price"]/text()')
    loader.add_css('price', '.price ::text')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')

    if not loader.get_output_value('identifier'):
        return
    yield loader.load_item()
def parse_product(response):
    """Yield one product per combination of the options listed on the page.

    Option groups are read from the ``perms['...']`` arrays embedded in the
    page source, de-duplicated by option name, and combined via
    ``multiply``. Every combination yields a copy of the base product with
    the option name appended to name and identifier and the option price
    (scaled by the discount ratio) added to the price.

    :param response: product page response; ``meta`` may carry ``category``.
    :returns: generator of product items.
    """
    hxs = HtmlXPathSelector(response)

    # --- collect option groups from the embedded script arrays -------------
    opt_groups = []
    inside = False
    lst = ''
    for line in response.body.split('\n'):
        if line.startswith('perms[\''):
            inside = True
            lst = ''
        elif line.startswith('];'):
            if lst:
                # SECURITY NOTE(review): eval() runs text taken from the
                # downloaded page. Left in place because the array syntax
                # served by the site is not visible here; consider
                # ast.literal_eval once confirmed to be plain literals.
                opts = eval('[' + lst + ']')
                # XXX http://www.thesleepshop.co.uk/acatalog/4ft6_Double_Kyoto_Memphis_Futon.html#a11717
                # second option has "Deluxe Mattress" twice with different additional price
                # however price calculation ignores second addition price (uses first value)
                filtered_opts = []
                for price, name in opts:
                    if not any(pn[1] == name for pn in filtered_opts):
                        filtered_opts.append([price, name])
                opt_groups.append(filtered_opts)
            inside = False
        elif inside:
            lst += line

    # --- base product ------------------------------------------------------
    identifier = hxs.select(
        '//form//input[contains(@name, "Q_")]/@name').re(r'Q_(.*)$')[0]

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h3[@class="product"]/text()')
    product_loader.add_xpath('name', u'//span[@class="product"]/text()')
    product_loader.add_value('sku', identifier)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', response.meta.get('category'))

    product_loader.add_css('price', '.discprice::text')
    price_reg = response.xpath(
        '//div[@id="price_inside"]//span//text()').extract_first(
        ) or response.xpath(
            '//div[@id="price_inside"]//span/@ppraw').extract_first()
    price_reg = extract_price2uk(price_reg)
    product_loader.add_value('price', price_reg)
    product_loader.add_value('price', '')
    # Ratio of the loaded price to the regular price; applied to option
    # surcharges below.
    # NOTE(review): a zero/None regular price would fail here -- confirm
    # what extract_price2uk returns for missing input.
    discount = product_loader.get_output_value('price') / price_reg

    img = hxs.select(
        u'//div[@class="slides_control"]/a/img/@src').extract()
    if not img:
        img = hxs.select(
            u'//div[@class="image_product"]//img/@src').extract()
    if img:
        # Pages without any image no longer abort the whole product.
        product_loader.add_value('image_url',
                                 urljoin_rfc(get_base_url(response), img[0]))

    brand_logo = hxs.select(
        u'//h3[@class="product"]/../img/@src').extract()
    if not brand_logo:
        brand_logo = hxs.select(
            u'//h3[@class="product"]/img/@src').extract()

    # Brand logo file name -> brand name.
    # NOTE(review): 'fainsborough' and 'healt beds' look like typos; kept
    # unchanged because the values are emitted as data.
    brands = {
        '6thsense.jpg': '6th sense',
        'bentley.gif': 'bentley',
        'birlea.gif': 'birlea',
        'blank.gif': '',
        'brand': '',
        'Breasley.gif': 'breasley',
        'buoyant.jpg': 'buoyant',
        'cro.gif': 'cro',
        'cumfilux.gif': 'cumfilux',
        'dt.gif': 'dt',
        'dunlopillo.gif': 'dunlopillo',
        'durabeds.gif': 'durabeds',
        'easycomfort.gif': 'easy comfort',
        'friendship_mill.gif': 'friendship mill',
        'Furmanac.gif': 'furmanac',
        'gainsborough.gif': 'fainsborough',
        'gleneagle.gif': 'gleneagle',
        'harlequin.gif': 'harlequin',
        'harmony.gif': 'harmony',
        'healthbeds.gif': 'healt beds',
        'highgate.gif': 'highgate',
        'hypnos.gif': 'hypnos',
        'jay-be.gif': 'jay be',
        'julianbowenlogo.jpg': 'julian bowen',
        'kaymed.gif': 'kaymed',
        'komfi.gif': 'komfi',
        'kyoto.gif': 'kyoto',
        'limelight.gif': 'limelight',
        'metalbeds.gif': 'metalbeds',
        'millbrook.gif': 'millbrook',
        'myers.gif': 'myers',
        'nd.gif': 'newdesign',
        'nestledown.gif': 'nestledown',
        'obc.gif': 'original bedstead',
        'Protectabed.gif': 'protectabed',
        'rauch.gif': 'rauch',
        'relaxsan.gif': 'relaxsan',
        'relyon.gif': 'relyon',
        'rest_assured.gif': 'rest assured',
        'richman.gif': 'richman',
        'sealy.gif': 'sealy',
        'shakespeare.gif': 'shakespeare',
        'silentnight.gif': 'silentnight',
        'sleepeezee.gif': 'sleepeezee',
        'sleepshaper.gif': 'sleepshaper',
        'sleepyvalley.gif': 'sleepyvalley',
        'slumberland.gif': 'slumberland',
        'staples.gif': 'staples',
        'steens.gif': 'steens',
        'swanglen.gif': 'swanglen',
        'sweetdreams.gif': 'sweetdreams',
        'tss.gif': 'the sleep shop',
        'verona.jpg': 'verona',
        'welcome.gif': 'welcome furniture',
    }
    if brand_logo:
        # Unknown logos fall back to the file name without its extension;
        # pages without a logo simply get no brand instead of crashing.
        product_loader.add_value(
            'brand',
            brands.get(brand_logo[0], remove_extension(brand_logo[0])))

    product = product_loader.load_item()

    # --- one product per option combination --------------------------------
    for opt_price, opt_name in multiply(opt_groups):
        prod = Product(product)
        prod['name'] = (prod['name'] + ' ' + opt_name).strip()
        try:
            prod['price'] = (Decimal(prod['price']) +
                             Decimal(opt_price) * discount).quantize(
                                 Decimal('1.00'))
        except TypeError:
            prod['price'] = Decimal(0)
        prod['identifier'] = prod['identifier'] + ':' + opt_name
        yield prod
def parse_product(self, response):
    """Build a product item and request the first page of its reviews.

    The price text is taken from the first of three page locations that
    yields any text. Pages with no price text produce nothing. Otherwise an
    item is loaded and passed, via request meta, to
    ``self.parse_product_reviews``.
    """
    hxs = HtmlXPathSelector(response)

    # JSON blob embedded in the script that mentions "product/data".
    script_text = response.xpath(
        '//script/text()[contains(., "product/data")]').extract_first()
    data_match = re.search('product/data",[ \n]*({.+})', script_text)
    data = json.loads(data_match.group(1))

    price = ''.join(
        hxs.select(
            '//div[contains(@class, "js-product-offer-summary")]//div[contains(@class, "price-display")]//text()'
        ).extract())
    fallback_price_xpaths = (
        '//div[@itemprop="offers"]//div[@itemprop="price"][1]//text()',
        '//span[contains(@class, "hide-content-m")]/span[@data-tl-id="Price-ProductOffer"]//text()',
    )
    for price_xpath in fallback_price_xpaths:
        if price:
            break
        price = ''.join(response.xpath(price_xpath).extract())

    # Some products are not available online and these have no price
    if not price:
        return

    stock_status = 0 if 'out of stock' in price.lower() else 1

    title_texts = hxs.select(
        '//h1[contains(@itemprop, "name")]//text()').extract()
    product_name = [
        piece for piece in map(unicode.strip, title_texts) if bool(piece)
    ]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    # Identifier is the trailing run of digits in the URL.
    loader.add_value('identifier',
                     re.search(r'/(\d+)$', response.url).group(1))
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('brand', response.meta['brand'])

    breadcrumb_texts = hxs.select(
        '//ol[contains(@class, "breadcrumb-list")]//li//a/span/text()'
    ).extract()
    categories = [crumb.strip() for crumb in breadcrumb_texts]
    loader.add_value('category', categories)

    loader.add_value('url', response.url)
    loader.add_xpath(
        'image_url',
        '//img[contains(@class, "js-product-primary-image")]/@src')

    # Prefer the shipping price from the embedded JSON; fall back to the
    # shipping message on the page when a key is missing.
    try:
        loader.add_value(
            'shipping_cost',
            data['buyingOptions']['shippingPrice']['displayPrice'])
    except KeyError:
        loader.add_css('shipping_cost', 'h2.js-shipping-primary-msg::text')

    loader.add_value('price', price)
    if not stock_status:
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = {}

    yield Request(self._get_reviews_url(item, 1),
                  meta={
                      'product': item,
                      'page': 1
                  },
                  callback=self.parse_product_reviews)