def parse_product(self, response):
    """Complete the item carried in ``response.meta`` with page data.

    Price and availability are read from the structure returned by
    ``SpiderSchema(response).get_product()``; the category is the list of
    breadcrumb link texts without its first entry.

    Yields:
        The loaded item. ``stock`` is set to 0 unless the offer
        availability is exactly ``'inStock'``.
    """
    base_item = response.meta['item']
    schema_product = SpiderSchema(response).get_product()
    breadcrumbs = response.css('a.GTM-breadcumb::text').extract()

    loader = ProductLoaderEU(Product(), response=response)
    # Seed the loader with every field of the item received from the caller.
    loader.add_value(None, base_item)

    offer = schema_product['offers']['properties']
    loader.replace_value('price', offer['price'])
    loader.replace_value('category', breadcrumbs[1:])

    if offer['availability'] != 'inStock':
        loader.replace_value('stock', 0)

    yield loader.load_item()
def parse_products(self, response):
    """Turn one page of the JSON product feed into items and paginate.

    The body is decoded as JSON and the products are read from
    ``[0]['result']``. When that list is empty nothing is yielded.
    Otherwise one item per product is yielded, followed by a POST request
    (same callback) whose offset is 200 higher than the current one.
    """
    payload = json.loads(response.body)
    results = payload[0]['result']
    if not results:
        return

    for entry in results:
        swedish_name = entry['name']['sv']
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', swedish_name)
        images = entry['images']
        if images:
            loader.add_value('image_url', images[0])
        loader.add_value('url', entry['url']['sv'])
        loader.add_value('identifier', entry['uid'])
        # The SKU candidates are extracted from the product name.
        loader.add_value('sku', self.re_sku.findall(swedish_name))
        loader.add_value('price', entry['price']['current']['SEK'])
        if not entry['isBuyable']:
            loader.add_value('stock', 0)
        yield loader.load_item()

    next_offset = response.meta['offset'] + 200
    next_body = self.post_data.replace('{}', str(next_offset))
    yield scrapy.Request(self.post_url,
                         method='POST',
                         body=next_body,
                         callback=self.parse_products,
                         meta={'offset': next_offset},
                         dont_filter=True)
def parse_product(self, response):
    """Build a single product item from a product page.

    The SKU is the longest match of ``self.re_sku`` in the loaded name.
    A price of 0 is always added after the scraped price value, and
    ``stock`` is set to 0 unless the availability link contains
    ``instock`` (case-insensitive).
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_css('identifier', 'input.qs-cart-pid::attr(value)')
    loader.add_xpath('identifier', '//script/text()', re='product_id=(.+)"')
    loader.add_value('url', response.url)
    loader.add_css('name', 'h1.product-description-header::text')
    loader.add_css('price', 'input.qs-cart-price::attr(value)')
    loader.add_value('price', 0)

    sku_candidates = self.re_sku.findall(loader.get_output_value('name'))
    if sku_candidates:
        loader.add_value('sku', max(sku_candidates, key=len))

    loader.add_css('image_url', 'div.product-images ::attr(src)')

    availability = response.xpath(
        '//link[@itemprop="availability"]/@href').extract_first()
    in_stock = bool(availability) and 'instock' in availability.lower()
    if not in_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(response):
    """Build a product item from a product page.

    Args:
        response: product page response. ``response.meta['category']`` must
            be present; its last three entries become the item category.

    Yields:
        The loaded ``Product`` item.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    image_url = response.css('.picture').xpath('img/@src').extract_first()
    product_identifier = response.xpath('//@data-productid').extract_first()
    product_name = response.xpath(
        '//h1[@itemprop="name"]/text()').extract_first()

    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url))

    # Raw strings: '\S' and '\w' are regex classes, not string escapes, and
    # non-raw spellings trigger invalid-escape warnings on newer Pythons.
    price = ''.join(
        response.xpath('//span[@itemprop="price"]/text()').re(r'\S+'))
    sku = ''.join(
        response.xpath('//span[@itemprop="sku"]/text()').re(r'\w+'))

    product_loader.add_value('sku', sku)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', response.meta['category'][-3:])

    yield product_loader.load_item()
def parse_item(self, response):
    """Parse a product page into one item, or one item per option row.

    Pages without a product name or without the hidden product id are
    logged and skipped. If a single price is found, one item is yielded.
    Otherwise the rows of the ``super-product-table`` are each turned into
    an item that inherits the page-level fields; without that table the
    page is logged and nothing is yielded.
    """
    hxs = HtmlXPathSelector(response)
    name = hxs.select("//div[@class='product-name']/h1/text()").extract()
    if not name:
        self.log('No name on %s' %response.url)
        return

    identifiers = response.xpath(
        "//input[@type='hidden'][@name='product']/@value").extract()
    if not identifiers:
        # Previously an IndexError; skip the page explicitly instead.
        self.log('No identifier on %s' % response.url)
        return
    identifier = identifiers[0]

    product_image = hxs.select('//*[@id="ma-zoom1"]/img/@src').extract()
    if product_image:
        product_image = urljoin_rfc(get_base_url(response), product_image[0])
    category = ''.join(
        hxs.select('//div[@class="breadcrumbs"]/ul/li[2]/a/text()').extract())

    # Value cell of the specs-table row whose label contains the given text.
    spec_cell = ('//table[@id="product-attribute-specs-table"]'
                 '//th[@class="label" and contains(text(), "%s")]'
                 '/following-sibling::td')

    shipping = hxs.select((spec_cell % 'Spese Spedizione') + '/text()').extract()
    if not shipping:
        shipping = hxs.select((spec_cell % 'Shipping Cost') + '/text()').extract()

    shipping_cost = None
    if shipping:
        if shipping[0].strip() == 'Gratis':
            shipping_cost = '0.0'
        else:
            shipping_cost = extract_price_eu(shipping[0])
            # NOTE(review): a result of 1000 or more is re-parsed with
            # extract_price; presumably it guards against a misread decimal
            # separator - confirm against the helpers.
            if shipping_cost >= Decimal(1000):
                shipping_cost = extract_price(shipping[0])

    brand = hxs.select((spec_cell % 'Marca') + '/a/@title').extract()
    if not brand:
        brand = hxs.select((spec_cell % 'Marca') + '/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    price = response.xpath(
        '//div[@class="product-shop"]//span[@itemprop="price"]/text()').extract()
    loader.add_xpath(
        'sku',
        'normalize-space(substring-after(//li[contains(text(),"Codice:")]/text(), ":"))')
    loader.add_value('url', response.url)
    loader.add_value('image_url', product_image)
    loader.add_value('category', category)
    if brand:
        loader.add_value('brand', brand[0].strip())

    availability = response.xpath('//p[contains(@class, "availability")]')
    # './/' keeps the lookup inside the availability paragraph(s); a leading
    # '//' would search the whole document even on a nested selector.
    if availability.xpath(
            './/@class[contains(., "instock") or contains(., "in-stock")]'):
        loader.add_value('stock', 1)
    else:
        loader.add_value('stock', 0)

    if shipping_cost is not None:
        loader.add_value('shipping_cost', shipping_cost)

    if price:
        single_price = len(price) == 1
    else:
        # Fall back to the price block addressed by the product id.
        price = response.xpath(
            '//*[@id="product-price-{}"]//text()'.format(identifier)
        ).re(r'[\d,.]+')
        single_price = bool(price)

    if single_price:
        loader.add_value('identifier', identifier)
        loader.add_value('name', name)
        loader.add_value('price', price[0])
        yield loader.load_item()
        return

    table = response.xpath('//table[@id="super-product-table"]')
    if not table:
        self.log('No correct price found on %s' %response.url)
        self.log('Price is %s' %price)
        return

    base_item = loader.load_item()
    for row in table.xpath('tbody/tr[td/input]'):
        # Always start from the page-level item so that values of one row
        # never leak into the next one.
        row_loader = ProductLoader(item=Product(base_item), selector=row)
        row_loader.replace_xpath('name', 'td[1]/text()')
        row_loader.replace_xpath('identifier', 'td/div/span/@id', re=r'\d+')
        row_loader.replace_xpath(
            'price', './/span[contains(@id, "product-price")]//text()',
            re=r'\S+')
        yield row_loader.load_item()
def parse_product(self, response):
    """Parse a product page and yield the product or its bundle variants.

    Nothing is yielded when no price can be loaded. Products that are not
    flagged as available get ``stock`` 0 and ``price`` 0. When the page has
    ``bundle-option`` selects, one item is yielded per combination of their
    non-empty options; otherwise the base item is yielded.
    """
    loader = ProductLoaderEU(item=Product(), response=response)
    product_id = response.xpath('//@data-id').extract_first()
    loader.add_value('identifier', product_id)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', '(//h1/text())[1]')
    loader.add_css('price', '.price-including-tax .price ::text')
    if not loader.get_output_value('price'):
        return

    loader.add_value('sku', product_id)
    loader.add_value('category', response.meta.get('category'))
    image_src = response.xpath('//img[@id="image"]/@src').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))
    loader.add_xpath('brand', '//strong[text()="Brand:"]/following-sibling::a/text()')
    loader.add_xpath('brand', '//img[contains(@src, "/brands/")]/@title')

    available = response.css('.in-stock').xpath(
        'div[@itemprop="availability"][not (contains(., "Ikke"))]').extract()
    if not available:
        loader.add_value('stock', 0)
        loader.replace_value('price', 0)
    base_item = loader.load_item()

    selects = response.xpath('//select[@id="bundle-option"]')
    if not selects:
        yield base_item
        return

    choices = [select.xpath('.//option[@value!=""]') for select in selects]
    for combination in itertools.product(*choices):
        variant_loader = ProductLoader(item=Product(), response=response)
        variant_loader.add_value(None, base_item)
        # The variant name is rebuilt from the option labels only.
        variant_loader.replace_value('name', '')
        variant_id = ''
        variant_price = base_item['price']
        for choice in combination:
            variant_id += '-' + choice.xpath('@value').extract_first()
            variant_loader.add_value('name', choice.xpath('text()').extract_first())
            if choice.xpath('@disabled'):
                variant_loader.replace_value('stock', 0)
            surcharge = choice.xpath('@data-extra-cost').extract_first()
            if surcharge:
                variant_price += Decimal(surcharge)
        variant_id = variant_id.strip('-')
        variant_loader.replace_value('price', variant_price)
        variant_loader.replace_value('identifier', variant_id)
        variant_loader.replace_value('sku', variant_id)
        yield variant_loader.load_item()
def parse_product(self, response):
    """Parse a product page: the main product plus offers and accessories.

    Pages that show a page-size selector or a level-1 category title are
    ignored. For other pages the main product is yielded when a name is
    found. Each mini product form in the offer and accessory blocks then
    yields either a request to its own page (when it links to one) or an
    item built from the form itself.
    """
    hxs = HtmlXPathSelector(response)
    pages = hxs.select('//select[@name="nbPagesPerPage"]')
    cat_text = hxs.select('//h2[@class="titre_image titre_image_niv1"]')
    if pages or cat_text:
        return

    base_url = get_base_url(response)

    # Explicit emptiness check instead of a bare ``except:`` around [-1].
    crumbs = hxs.select(
        '//div[@id="chemin_os"]//a/span[@itemprop="title"]/text()').extract()
    category = crumbs[-1] if crumbs else None

    main_ref = hxs.select(
        '//div[@id="ficheProduitPied"]//span[@class="reference"]/text()'
    ).re(r'R\xe9f. (.*)')
    main_name = response.xpath(
        '//div[@id="ficheProduitPied"]/div[@id="fichetitre"]/text()'
    ).extract()
    if not main_name or not main_name[0].strip():
        main_name = response.xpath('//span[@itemprop="name"]/text()').extract()
    main_price = ''.join(
        response.xpath(
            '//div[@id="ficheProduitPied"]//*[@class="prix"]/text()'
        ).re(r'\S+'))

    if main_name:
        identifier = remove_punctuation_and_spaces(main_name[0]).lower()
        image_url = response.xpath(
            '//div[@id="ficheProduitPied"]//img/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
        l = ProductLoader(
            item=Product(),
            selector=response.xpath('//div[@id="ficheProduitPied"]'))
        l.add_value('identifier', identifier)
        l.add_value('name', main_name)
        if category:
            l.add_value('category', category)
        l.add_xpath('sku', '//div[@id="ligne_achat"]//text()', re=':(.+)')
        l.add_value('stock', 1)
        l.add_value('url', response.url)
        l.add_value('price', main_price)
        l.add_value('image_url', image_url)
        yield l.load_item()

    products = hxs.select(
        '//div[@id="bloc_offre"]/div/div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]'
    )
    products += hxs.select(
        '//div[@id="bloc_accessoire"]/div/div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]'
    )
    for p in products:
        p_url = p.select('.//div[@class="ligne_titre"]/a/@href').extract()
        if p_url:
            # The entry has its own page: parse it there instead.
            yield Request(urljoin_rfc(base_url, p_url[0]),
                          callback=self.parse_product)
            continue

        name_nodes = p.select(
            './/div[@class="colonne_1"]/div[@class="ligne_titre"]/span[@class="titre_descriptif"]/strong/text()'
        )
        if not name_nodes:
            name_nodes = p.select(
                './/div[@class="colonne_1"]/div[@class="ligne_titre"]/a/span[@class="titre_descriptif"]/strong/text()'
            )
        if not name_nodes:
            # Previously an IndexError that aborted the remaining entries.
            self.log('No name for listed product on %s' % response.url)
            continue
        name = name_nodes[0].extract().strip()
        name = name.replace('- OFFRE SPECIALE !', '').strip()

        price = "".join(
            p.select(
                './/div[@class="lignebeige"]/div[@class="wrapperPrix"]/div/div/div/b/text()'
            ).re(r'([0-9\,\. ]+)')).strip()
        identifier = remove_punctuation_and_spaces(name).lower()
        image_url = p.select('.//div/img/@src').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])

        sku = ''
        p_ref = p.select('.//span[@class="reference"]//text()').re(r'(\d+)')
        # Only the entry with the same reference as the main product takes
        # its SKU from the page-level purchase line.
        if main_ref and p_ref and p_ref[0] == main_ref[0]:
            p_sku = p.select(
                '//div[@id="ligne_achat"]/table/tr/td/text()').extract()
            if p_sku:
                try:
                    sku = p_sku[0].strip().split(': ')[1]
                except IndexError:
                    sku = p.select(
                        '//div[@id="ligne_achat"]/table/tr/td/text()'
                    ).re(r'\S+')[2]

        l = ProductLoader(item=Product(), response=response)
        l.add_value('identifier', identifier)
        l.add_value('name', name)
        if category:
            l.add_value('category', category)
        l.add_value('sku', sku)
        l.add_value('stock', 1)
        l.add_value('url', response.url)
        l.add_value('price', price)
        l.add_value('image_url', image_url)
        yield l.load_item()
def parse_product(self, response):
    """Parse a product page and yield the product or one item per option.

    The price comes from the ``content`` attribute of the price span (with
    '.' replaced by ','), or else from the first non-empty text fallback,
    or else 0. When option labels exist, each yields a copy of the base
    item with the option value appended to the identifier, the option name
    appended to the name, and any price found in parentheses added.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    loader.add_value(
        'identifier',
        response.xpath('//input[@id="products-id"]/@value').extract_first())
    loader.add_value(
        'sku',
        response.xpath('//span[@itemprop="model"]/text()').extract_first())

    name = response.xpath('//h2/span[@itemprop="name"]/text()').extract_first()
    if not name:
        name = response.xpath('//h1/text()').extract_first()
    loader.add_value('name', name)
    loader.add_value('url', response.url)

    price = response.xpath('//span[@itemprop="price"]/@content').extract_first()
    if price:
        price = price.replace('.', ',')
    else:
        # Try each textual fallback in turn; 0 when none yields anything.
        price = response.xpath('//span[@itemprop="price"]/text()').extract_first()
        if not price:
            price = response.css('div.current-price-container').xpath(
                'br/following::text()').extract_first()
        if not price:
            price = response.css(
                'div.current-price-container ::text').extract_first()
        if not price:
            price = 0
    loader.add_value('price', price)

    crumbs = hxs.select(
        '//div[@id="breadcrumb_navi"]/span/a/span/text()').extract()
    if len(crumbs) > 2:
        category = crumbs[1:-1]
    else:
        category = ''
    loader.add_value('category', category)

    image_src = response.xpath('//img[@itemprop="image"]/@src').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    item = loader.load_item()

    option_labels = response.css('fieldset.attributes div div label')
    if not option_labels:
        yield item
        return

    for label in option_labels:
        variant = deepcopy(item)
        variant['identifier'] += '-' + label.xpath(
            './/input/@value').extract_first()
        label_text = ' '.join(label.xpath('text()').extract()).strip()
        if '(' in label_text:
            # Text after the last '(' is parsed as a price and added.
            surcharge = extract_price(label_text.split('(')[-1])
            label_text = label_text.split('(')[0].strip()
            variant['price'] += surcharge
        variant['name'] += ' ' + label_text
        yield variant
def parse_product(self, response):
    """Yield one item per entry of the ``valStaffelSelection`` list.

    Shared fields (identifier, cleaned URL, name, category, image, brand)
    are loaded once from the page; the brand is the last breadcrumb text.
    Each list entry then extends identifier, URL and name and sets its own
    price and SKU.
    """
    base_loader = ProductLoader(Product(), response=response)
    base_loader.add_value(
        'identifier',
        response.css('input.productId::attr(value)').extract_first())
    base_loader.add_value('url', url_query_cleaner(response.url))
    base_loader.add_css('name', '.title h1::text')

    crumbs = response.css('.breadcrumbs a::text').extract()
    base_loader.add_value('category', crumbs[2:])

    image_src = response.css(
        '.productDetail1 .image img::attr(src)').extract_first()
    if image_src:
        base_loader.add_value('image_url', response.urljoin(image_src))
    base_loader.add_value('brand', crumbs[-1])
    item = base_loader.load_item()

    for entry in response.xpath('//div[@id="valStaffelSelection"]//li'):
        option_id = item['identifier'] + '-' + entry.xpath(
            'input/@value').extract_first()
        option_url = item['url'] + '?' + entry.xpath('@class').extract_first()
        prices = entry.css('div.price::text').extract()

        loader = ProductLoader(Product(), selector=entry)
        loader.add_value(None, item)
        loader.replace_value('identifier', option_id)
        loader.replace_value('url', option_url)
        loader.add_css('name', 'span.label::text')
        # The last price text of the entry is the one that is used.
        loader.replace_value('price', prices[-1])
        loader.replace_value('sku', option_id)
        yield loader.load_item()
def parse(self, response):
    """Follow pagination links and yield one item per listed product.

    Every link in the pagination list is requested (default callback).
    Each ``li`` of the products list yields an item built from its
    ``data-*`` attributes; the category comes from the page breadcrumbs.
    List entries without any link are logged and skipped.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for page_url in hxs.select('//ul[@class="pagination"]//a/@href').extract():
        yield Request(urljoin(base_url, page_url))

    for product in hxs.select('//ul[@id="products"]/li'):
        links = product.select('.//a/@href').extract()
        if not links:
            # Previously an IndexError that aborted the rest of the page.
            self.log('Product without link on %s' % response.url)
            continue

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('identifier', './/@data-id')
        # Drop the query string from the product link.
        loader.add_value('url', urljoin(base_url, links[0].split('?')[0]))
        loader.add_xpath('name', './/@data-name')
        # Raw string: '\S' is a regex class, not a string escape.
        loader.add_value(
            'price', ''.join(product.select('.//@data-price').re(r'\S')))
        loader.add_xpath('sku', './/@data-id')
        loader.add_xpath(
            'category',
            '//ol[@id="breadcrumbs"]/li[position()>1]/a/span/text()')
        loader.add_xpath('image_url', './/@src')
        loader.add_xpath('brand', './/@data-brand')
        yield loader.load_item()
def parse_product(self, response):
    """Build a product item, skipping pages without a product id.

    The brand is read from the ``brand`` meta tag; when that is missing it
    falls back to the first ``"manufacturer"`` value found in the page
    scripts, decoded with ``unicode-escape``.
    """
    product_id = response.xpath('//input[@id="prodid"]/@value').extract_first()
    if not product_id:
        self.logger.warning('No identifier for %s' % response.url)
        return

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', product_id)
    loader.add_value('url', response.url)
    loader.add_css('name', 'div.infotitle h1::text')
    loader.add_css('price', '.inline.price::text')
    loader.add_value('sku', product_id)

    image_src = response.css('.photo::attr(src)').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    brand = response.xpath('//meta[@itemprop="brand"]/@content').extract_first()
    if not brand:
        manufacturers = response.xpath('//script/text()').re(
            '"manufacturer":"(.*?)"')
        if manufacturers:
            brand = manufacturers[0].decode('unicode-escape')
    loader.add_value('brand', brand)

    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page and yield the product or one item per option.

    The category is the third breadcrumb, or the fourth when the third is
    'All Categories'. A recommended retail price is stored in the metadata
    only when it is higher than the item price. When the page has a
    ``product_id`` select, each option with a value yields its own item
    sharing that metadata; otherwise the page-level item is yielded.
    """
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_css('name', '.heading-title-text::text')

    categories = response.css('.breadcrumb a::text').extract()[2:]
    category = categories.pop(0).strip() if categories else ''
    if category == 'All Categories':
        # Skip the generic crumb; it may also be the last one available.
        category = categories.pop(0) if categories else ''
    product_loader.add_value('category', category)

    product_loader.add_xpath(
        'brand', '//*[@id="product-header-order-brand"]//img/@alt')
    product_loader.add_xpath('image_url',
                             '//meta[@property="og:image"]/@content')
    product_loader.add_xpath('identifier',
                             '//input[@name="product_id"]/@value')

    price = response.css('.price::text').extract_first()
    if price:
        product_loader.add_value('price', price.replace(' ', ''))
    # A trailing 0 is always added after the scraped price value.
    product_loader.add_value('price', 0)

    stock = response.xpath('//script/text()').re('availability.+')
    if stock and 'InStock' not in stock[0]:
        product_loader.add_value('stock', 0)
    product_loader.add_xpath('sku', '//input[@name="product_id"]/@value')
    item = product_loader.load_item()

    metadata = CRCMeta()
    rrp = response.css('.price-public::text').extract_first()
    if rrp:
        rrp = extract_price(rrp)
        metadata['rrp'] = rrp if float(rrp) > float(item['price']) else ''
    item['metadata'] = metadata

    options = response.xpath('//select[@name="product_id"]/option')
    if not options:
        yield item
        return

    for opt in options:
        identifier = opt.xpath('@value').extract_first()
        if not identifier:
            # Options without a value produce no item.
            continue

        option_loader = ProductLoader(item=Product(), selector=opt)
        option_loader.add_value(None, item)
        option_loader.replace_value('identifier', identifier)
        option_loader.replace_value('sku', identifier)
        option_loader.add_xpath('name', 'text()')

        option_price = response.xpath(
            '//div[@data-value="%s"]' % identifier
        ).css('.alltricks-ChildSelector-customOptionPrice::text'
              ).extract_first()
        if option_price:
            option_loader.replace_value('price',
                                        option_price.replace(' ', ''))
        # Otherwise the price copied from the page-level item is kept
        # (previously a missing option price raised AttributeError).

        stock_label = opt.xpath('@data-stock-label').extract_first()
        if stock_label == 'Out of stock':
            option_loader.replace_value('stock', 0)

        option_item = option_loader.load_item()
        option_item['metadata'] = metadata
        yield option_item
def parse_product(self, response):
    """Yield the product shown on the page and request its variations.

    The item is always yielded. A shipping cost of 4.99 is added when the
    loaded price is non-zero and at most 49.99. Unless the response was
    itself produced by a variation request
    (``meta['options_crawled']``), one form request is issued per primary
    option, or per primary/secondary option pair when a second variation
    select exists.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    # Raw string so that '\.' reaches the regex engine unchanged.
    identifier = hxs.select(
        '//span[@class="ProductNo DisplayBlock SmallTopMargin"]/text()'
    ).re(r'Artikel-Nr\.: (.*)')
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)

    name = hxs.select('//h1[@itemprop="name"]/text()').extract()
    loader.add_value('name', name[0])
    loader.add_value('url', response.url)

    price = response.xpath('//span[@itemprop="price"]/text()').extract()
    price = price[0] if price else '0.00'
    loader.add_value('price', price)
    price = loader.get_output_value('price')
    # Compare Decimal with Decimal: a float literal is only approximately
    # 49.99 and makes the boundary case depend on rounding.
    if price and Decimal(price) <= Decimal('49.99'):
        loader.add_value('shipping_cost', '4.99')

    category = hxs.select(
        '//a[@class="BreadcrumbItem"]/span/text()').extract()
    category = ' > '.join(category[1:-1] if len(category) > 2 else '')
    loader.add_value('category', category)

    image_url = hxs.select('//div[@class="ProductImage"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    yield loader.load_item()

    if response.meta.get('options_crawled', False):
        # Variation pages are not expanded again.
        log.msg('Option found: ' + response.url)
        return

    primary_options = hxs.select(
        '//select[@id="SelectedVariation0"]/option/@value').extract()
    # The secondary options do not depend on the primary one: query once.
    secondary_options = hxs.select(
        '//select[@id="SelectedVariation1"]/option/@value').extract()

    for primary_option in primary_options:
        if secondary_options:
            selections = [[primary_option, secondary_option]
                          for secondary_option in secondary_options]
        else:
            selections = [primary_option]
        for selection in selections:
            formdata = {
                'ChangeAction': 'SelectSubProduct',
                'SelectedVariation': selection
            }
            yield FormRequest(response.url,
                              dont_filter=True,
                              formdata=formdata,
                              meta={'options_crawled': True},
                              callback=self.parse_product)
def parse_product(self, response):
    """Build a product item with promotion and timestamp metadata.

    Responses that are not product pages (no ``body`` with id ``product``,
    neither parsed nor in the raw body) are ignored. Promotion dates are
    stored only when exactly two dates in ``dd-mm-YYYY`` format are found.
    """
    is_product_page = (response.xpath('//body[@id="product"]')
                       or 'body id="product"' in response.body)
    if not is_product_page:
        return

    promo_texts = response.xpath(
        '//div[@class="pl_promoinfo_product_promo"]/span[@class="date"]/text()'
    ).extract()
    promo_start = promo_end = None
    try:
        parsed_dates = [
            datetime.datetime.strptime(text, '%d-%m-%Y')
            for text in promo_texts
        ]
        # Unpacking also raises ValueError unless there are exactly two
        # dates, in which case both stay None.
        promo_start, promo_end = parsed_dates
    except ValueError:
        pass

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier',
                     '//input[@id="product_page_product_id"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price_text = response.xpath(
        '//span[@id="our_price_display"]/text()').extract_first()
    loader.add_value('price', price_text.replace(' ', ''))
    loader.add_xpath('sku', '//span[@itemprop="sku"]/text()')
    loader.add_xpath('sku', '//script/text()', re="productReference='(.+?)'")

    crumbs = response.css('.navigation_page ::attr(title)').extract()
    main_category = response.meta.get('category')
    starts_with_main = bool(crumbs) and crumbs[0].strip() == main_category
    if not starts_with_main:
        # Make sure the category passed by the caller leads the path.
        crumbs = [main_category] + crumbs
    loader.add_value('category', crumbs)

    loader.add_xpath('image_url', '//img[@id="bigpic"]/@src')
    loader.add_xpath('brand', '//a[@itemprop="brand"]/span/text()')
    if not response.css('.primary_block .avail3'):
        loader.add_value('stock', 0)

    metadata = SonaeMeta()
    if promo_start and promo_end:
        metadata['promo_start'] = promo_start.strftime('%Y-%m-%d')
        metadata['promo_end'] = promo_end.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M')

    item = loader.load_item()
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield one item per offer row, or a single item when there are none.

    Shared fields (URL, category, image, brand) are loaded once. Every row
    of the offers table yields an item whose name and identifier are the
    base values extended with the row label and its first number. Without
    offer rows, a single item is yielded provided the page exposes a
    product id.
    """
    loader = ProductLoader(response=response, item=Product())
    base_identifier = response.xpath(
        '//input[@id="productID"]/@value').extract_first()
    loader.add_value('url', response.url)
    base_name = response.css('.puu-led h1::text').extract_first()
    category = response.css('.puu-rbn span::text').extract()
    loader.add_value('category', category[2:])

    image_url = response.css('.puu-vsl img::attr(src)').extract_first()
    if image_url:
        # Without this guard a missing image was joined as ``None``
        # instead of being left out.
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_css('brand', '.puu-brand ::text')
    base_product = loader.load_item()

    options = response.css('.puu-ofrs tr')
    for option in options:
        label = option.xpath('td[1]/text()')
        option_loader = ProductLoader(selector=option,
                                      item=Product(base_product))
        option_loader.replace_value(
            'name', base_name + ' ' + label.extract_first())
        # Raw string: '\d' is a regex class, not a string escape.
        identifier = base_identifier + '-' + label.re(r'\d+')[0]
        option_loader.replace_value('identifier', identifier)
        option_loader.replace_value('sku', identifier)
        option_loader.replace_css('price', '.puu-prc::text')
        yield option_loader.load_item()

    if options or not base_identifier:
        return

    # No offer rows: complete the page-level loader and yield it once.
    loader.add_css('name', '.puu-prd h1::text')
    loader.add_value('identifier', base_identifier)
    loader.add_value('sku', base_identifier)
    loader.add_css('price', '.puu-prc::text')
    yield loader.load_item()
def parse_product(self, response):
    """Build a product item mostly from values embedded in page scripts.

    Identifier, name, price, SKU and brand are extracted with regular
    expressions from the text of the ``script`` elements; the SKU is the
    last ``ArtNbr`` match. The category keeps at most the last three
    breadcrumbs after dropping the first and last link.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    scripts = hxs.select('//script/text()')

    images = hxs.select('//img[@property="image"]/@src').extract()
    art_numbers = scripts.re('"ArtNbr":"(.+?)"')
    crumbs = hxs.select(
        '//div[@class="breadcrumb gridle_container"]/a/text()'
    ).extract()[1:-1]

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', scripts.re("'productId': *(.+),"))
    product_loader.add_value('name', scripts.re("'name': *\"(.+)\""))
    if images:
        product_loader.add_value('image_url', urljoin_rfc(base_url, images[0]))
    product_loader.add_value('sku', art_numbers[-1])
    product_loader.add_value('price', scripts.re("'price': *\"(.+)\""))
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', crumbs[-3:])
    product_loader.add_value('brand', scripts.re("'brand': *\"(.+)\""))

    yield product_loader.load_item()
def parse_product(self, response):
    """Build a product item from a product page.

    The name is made of all text inside the first two children of the
    ``.product_display_padding`` element; the first of these texts is also
    used as the brand. ``stock`` is set to 0 when the ``og:availability``
    meta tag matches ``out of stock``.
    """
    loader = ProductLoader(item=Product(), response=response)
    # Raw string: '\d' is a regex class, not a string escape.
    identifier = response.xpath('//script/text()').re(r'product_id: (\d+)')
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)

    name = response.css('.product_display_padding').xpath(
        '*[position()<3]//text()').extract()
    loader.add_value('name', name)
    loader.add_xpath('price', '//span[@id="p_price"]/text()')
    loader.add_value('category', response.meta.get('category'))
    loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
    if name:
        # Previously ``name[0]`` raised IndexError when no text was found.
        loader.add_value('brand', name[0])

    if response.xpath('//meta[@property="og:availability"]/@content').re(
            'out *of *stock'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per row of the ``product-option-packages`` table.

    Shared fields (identifier, URL, name, image, brand) are loaded once
    from the page. Each row starts from those shared fields, then sets its
    own identifier, SKU and price, adds its label to the name and gets a
    metadata object carrying the page promotion text.
    """
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath('//input[@name="product"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    image_url = response.xpath('//img[@id="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath('brand', '//*[@itemprop="brand"]/text()')
    base_item = loader.load_item()

    promotion = response.xpath(
        '//div[@id="advantages-of-registering-popup"]//p[contains(text(), "korting")]/text()'
    ).extract()
    promotion = promotion[0].strip() if promotion else ''

    for option in response.xpath('//table[@id="product-option-packages"]/tbody/tr'):
        option_loader = ProductLoader(Product(), selector=option)
        # Always seed from the page-level item. Reusing the name ``item``
        # for the result made every row inherit the previous row's name
        # and metadata.
        option_loader.add_value(None, base_item)
        option_id = option.xpath('.//@data-id').extract_first()
        option_loader.replace_value('identifier', option_id)
        option_loader.add_xpath('name', './/label/text()')
        price = option.css('.price::text').extract()
        # The last price text of the row is the one that is used.
        option_loader.replace_value('price', price.pop())
        option_loader.replace_value('sku', option_id)

        metadata = SpecSaversMeta()
        metadata['promotion'] = promotion
        option_item = option_loader.load_item()
        option_item['metadata'] = metadata
        yield option_item
def parse_product(self, response):
    """Yield the product of the page and requests for its sibling pages.

    When the page carries no product id, it is re-requested with
    ``self.parse_cat`` as callback and nothing else happens. Otherwise the
    item is yielded, followed by one request per link found in the
    collection slider and in the siblings blocks.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    product_ids = hxs.select(
        '//div[@id="product_description"]/@data-product_id').extract()
    if not product_ids:
        yield Request(response.url, dont_filter=True, callback=self.parse_cat)
        return
    identifier = product_ids[0]

    loader.add_value('identifier', identifier)
    loader.add_xpath('sku', '//script/text()', re='"prdref","(.+)"')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1//text()', re='.+')
    loader.add_xpath('name', '//div/text()', re='Couleur.*:(.+)')
    loader.add_xpath('category', '//nav[@id="breadcrumb"]//a[position()>1]/span/text()')

    price_parts = hxs.select('//div[@class="product_container"]//div[@class="product-price"]/span[@data-product_id="%s"]//text()' %identifier).extract()
    price_text = ''.join(price_parts)
    # Remove every whitespace character from the concatenated price text.
    loader.add_value('price', ''.join(price_text.split()))
    loader.add_xpath('image_url', '//script/text()', re='"prdparam-image_url","(.+)"')

    can_add_to_cart = hxs.select('//input[contains(@id, "addToCart")]')
    if not can_add_to_cart:
        loader.add_value('stock', '0')
    yield loader.load_item()

    sibling_links = hxs.select('//div[@id="slider_collection-container"]//a/@href').extract()
    sibling_links += hxs.select('//div[contains(@class, "siblings")]//a/@href').extract()
    for link in sibling_links:
        yield Request(urljoin(base_url, link), callback=self.parse_product)
def parse_product(self, response):
    """Build a product item; the identifier is taken from the page URL.

    The identifier (also used as SKU) is the number between the last
    ``-`` and ``.html`` in ``response.url``. URLs without that pattern are
    logged and produce no item.
    """
    # Raw string: '\d' and '\.' are regex syntax, not string escapes.
    match = re.search(r'-(\d+)\.html', response.url)
    if match is None:
        # Previously ``None.group`` raised AttributeError here.
        self.log('No identifier in URL %s' % response.url)
        return
    identifier = match.group(1)

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_css('name', 'div.titles h1 ::text')
    loader.add_css('price', '.rprice .value::text')
    loader.add_value('sku', identifier)
    loader.add_xpath('category', '//div[@id="path"]//a[position()>1]/text()')
    loader.add_css('image_url', 'div#image img::attr(src)')
    loader.add_css('brand', 'h1 .brand-name::text')
    yield loader.load_item()
def parse_product(self, response):
    """Build a product item from a product page.

    Pages containing an ``h5`` with the text "under varemerket" are
    ignored. The name is the last breadcrumb (after dropping the first
    one) and the category the up-to-three breadcrumbs before it. A
    shipping cost of 49 is added when the loaded price is below 1000.
    """
    if response.xpath('//h5[contains(., "under varemerket")]'):
        return

    loader = ProductLoader(Product(), response=response)

    product_id = response.xpath(
        '//input[@id="articleId"]/@value').extract_first()
    if not product_id:
        product_id = response.xpath(
            '//input[@id="skuId"]/@value').extract_first()
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    loader.add_value('url', response.url)

    crumbs = response.css('.breadcrumbs a::text').extract()[1:]
    product_name = crumbs.pop()
    loader.add_value('name', product_name)
    loader.add_value('category', crumbs[-3:])

    loader.add_xpath('price', '//h3[@itemprop="price"]/@content')
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_css('brand', '.product-hero-brand img::attr(alt)')

    loaded_price = loader.get_output_value('price')
    if loaded_price < 1000:
        loader.add_value('shipping_cost', 49)
    yield loader.load_item()
def parse_products(self, response):
    """Yield one item per ``.listing_item`` found on a listing page.

    The category is the list of breadcrumb link texts without the first
    one. Identifier and SKU are the last path segment of the cleaned
    product URL. Each item carries the merchant name (image ``alt``) in
    its metadata under ``'Dealer'``.
    """
    category = response.css('.breadcrumbs').xpath(
        './/a/text()').extract()[1:]

    for product in response.css('.listing_item'):
        loader = ProductLoader(item=Product(), selector=product)

        image_url = product.css('.listing_item_image').xpath(
            'img/@src').extract_first()
        # ``extract_first()`` may return None; testing ``in None`` raised
        # TypeError and aborted the whole page.
        if image_url and 'noimage' not in image_url:
            loader.add_value('image_url', image_url)

        url = product.css('.listing_item_name').xpath(
            '@href').extract_first()
        url = url_query_cleaner(response.urljoin(url))
        sku = url.split('/')[-1]
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', url)
        loader.add_xpath('name', './/a[@class="listing_item_name"]/text()')
        loader.add_xpath(
            'price', './/span[@class="listing_item_basic_price"]/text()')
        loader.add_value('category', category)

        shipping_cost = product.css('.listing_item_delivery_costs').xpath(
            'text()').extract_first()
        loader.add_value('shipping_cost', extract_price_eu(shipping_cost))

        availability = product.css('.listing_item_availability').xpath(
            'text()').extract_first()
        # A missing availability text leaves the stock value untouched.
        if availability and 'Non disponibile' in availability:
            loader.add_value('stock', 0)

        item = loader.load_item()
        dealer = product.css('.listing_item_merchant_name').xpath(
            'img/@alt').extract_first()
        item['metadata'] = {'Dealer': dealer}
        yield item