def parse_product(self, response):
    """Decorate the parent spider's output with SKU and ToyMonitor metadata.

    Reads the ``sFUPID`` input value (used as the SKU), the first
    ``data-flix-ean`` attribute found in the raw body and the first red
    price-reduction label, then re-yields every object produced by the
    parent class' ``parse_product``:

    * ``Product`` objects get their identifier cut at the first ``-`` and
      stripped, a SKU (falling back to that identifier) and a copy of the
      metadata.
    * ``Request`` objects carrying a ``product`` in ``meta`` get the same
      SKU/metadata attached to that product (identifier left untouched).

    Objects of any other kind are passed through unchanged.
    """
    page_meta = ToyMonitorMeta()

    ean_matches = re.findall(u'data-flix-ean="(.*)"', response.body)
    if ean_matches:
        page_meta['ean'] = ean_matches[0]

    promo_texts = response.xpath(
        '//div[@class="currentPrice"]/span[@class="block-reduc-red"]/text()'
    ).extract()
    if promo_texts:
        page_meta['promotions'] = promo_texts[0]

    sku_values = response.xpath('//input[@name="sFUPID"]/@value').extract()

    for result in super(PixmaniaSpider, self).parse_product(response):
        if isinstance(result, Product):
            # Trim the identifier first: the SKU fallback uses the trimmed form.
            result['identifier'] = result['identifier'].split('-')[0].strip()
            if sku_values:
                result['sku'] = sku_values[0]
            else:
                result['sku'] = result['identifier']
            result['metadata'] = page_meta.copy()
        elif isinstance(result, Request) and 'product' in result.meta:
            pending = result.meta['product']
            pending['metadata'] = page_meta.copy()
            if sku_values:
                pending['sku'] = sku_values[0]
            else:
                pending['sku'] = pending['identifier']
        yield result
def parse_product(self, response):
    """Build a product from a product page and request its Reevoo reviews.

    The identifier is read from the ``pid`` input. When it is missing the
    page is logged and nothing is yielded. Otherwise a ``Product`` is loaded
    (name, url, brand, category, sku, identifier, image, price, stock) and a
    request for the first Reevoo review page is yielded with the product in
    ``meta['product']``; ``parse_review_page`` is the callback.
    """
    # Check the identifier first so pages without one cost no loader work.
    identifier = response.xpath('//input[@name="pid"]/@value').extract()
    if not identifier:
        log.msg('PRODUCT WHIOUT IDENTIFIER: ' + response.url)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//input[@id="product-name"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('brand', 'span.b-brand_title::text')
    # The first two breadcrumb links are dropped from the category path.
    categories = response.css('div.b-breadcrumbs a::text').extract()[2:]
    loader.add_value('category', categories)
    loader.add_xpath('sku', '//meta[@itemprop="model"]/@content')
    loader.add_value('identifier', identifier[0])

    image_url = (
        response.xpath('//link[@rel="image_src"]/@href').extract()
        or response.xpath('//meta[@itemprop="image"]/@content').extract()
    )
    if image_url:
        loader.add_value('image_url', image_url[0])

    price = response.xpath('//meta[@itemprop="price"]/@content').extract()
    loader.add_value('price', price)

    out_of_stock = response.css('div.b-availability').xpath(
        './/span[@data-availability="NOT_AVAILABLE"]')
    if out_of_stock:
        loader.add_value('stock', '0')

    product = loader.load_item()

    promo = response.xpath(
        '//div[@class="b-product_promo"]/div/span/text()').extract()
    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    if promo:
        metadata['promotions'] = promo[0].strip()
    product['metadata'] = metadata

    reviews_url = 'http://mark.reevoo.com/reevoomark/en-GB/product.html?page=1&sku=%s&tab=reviews&trkref=MOT'
    yield Request(reviews_url % identifier[0],
                  callback=self.parse_review_page,
                  meta={'product': product})
def parse_products(self, response):
    """Load every product tile of a listing page and request its reviews.

    For each ``.productList .product`` node a ``Product`` is loaded
    (identifier/sku from the node ``id``, url, name, price, category,
    image, brand, fixed 3.99 shipping cost, stock) and a Bazaarvoice
    request is yielded with ``meta={'product': ..., 'offset': 0}`` and
    ``parse_reviews`` as callback.

    Tiles whose URL carries no numeric product id are logged and skipped
    instead of aborting the whole page.
    """
    # The first two breadcrumb entries are dropped from the category path.
    category = response.xpath(
        '//div[@id="breadcrumb"]//span[@itemprop="name"]/text()'
    ).extract()[2:]

    for tile in response.css('.productList .product'):
        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_xpath('identifier', '@id', re='product-(.+)')
        loader.add_xpath('url', './/@href')

        brand = tile.xpath('.//h3/em/text()').extract_first()
        name = tile.xpath('.//h3/span/text()').extract_first()
        # A name starting in lower case gets the brand prepended.
        # Guard against a missing/empty name, which previously raised here.
        if name and name[0].islower():
            loader.add_value('name', brand)
        loader.add_value('name', name)

        loader.add_css('price', '.productPrice dd:last-child::text')
        loader.add_xpath('sku', '@id', re='product-(.+)')
        loader.add_value('category', category)
        loader.add_css('image_url', '.productMainImage img::attr(src)')

        image_url = loader.get_output_value('image_url')
        promotion = None
        if image_url and '3for2' in image_url:
            promotion = '3 for 2'

        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', '3.99')

        # A missing stock node is treated like any other non-"in stock" text.
        stock = (tile.css('.productStock dd').extract_first() or '').title()
        if 'In Stock' not in stock and 'Low Stock' not in stock:
            loader.add_value('stock', 0)

        product = loader.load_item()

        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        if promotion:
            metadata['promotions'] = promotion
        product['metadata'] = metadata

        prod_ids = re.findall(r"/(\d+).prd", product.get('url') or '')
        if not prod_ids:
            self.log('No product id found in url %r on %s' %
                     (product.get('url'), response.url))
            continue
        prod_id = prod_ids[0]

        reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=35w0b6mavcfmefkhv3fccjwcc&apiversion=5.5&displaycode=17045-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + prod_id + "&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_57043"
        yield Request(reviews_url,
                      meta={'product': product, 'offset': 0},
                      callback=self.parse_reviews)
def parse_product(self, response):
    """Build a product from a product page and request its Reevoo reviews.

    The ``pid`` element value is used as both identifier and SKU. Stock is
    set to 0 when no in-stock message is found, and a 3.95 shipping cost is
    added when the loaded price is below 40. The review request is yielded
    with ``parse_review_page`` as callback, retries disabled and 404/302
    responses passed through to the callback.
    """
    identifier = response.xpath('//*[@id="pid"]/@value').extract_first()
    name = response.xpath('//*[@id="product-name"]/@value').extract_first()
    price = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    # The first breadcrumb link is dropped from the category path.
    category = response.xpath(
        '//ul[@class="b-breadcrumbs-list"]//a/text()').extract()[1:]
    image_url = response.xpath(
        '//div[@class="b-product_details-print_image"]/img/@src'
    ).extract_first()
    brand = response.xpath(
        '//span[@class="b-brand_title"]/text()').extract_first()
    stock = response.xpath(
        '//span[@class="m-in_stock js-availability_msg"]/text()'
    ).extract_first()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('brand', brand)
    loader.add_value('category', category)
    loader.add_value('url', response.url)
    # Only join when an image was found: urljoin() with an empty value
    # returns the page URL, which would be stored as a bogus image URL.
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_value('price', price)
    if not stock:
        loader.add_value('stock', 0)
    if loader.get_output_value('price') < 40:
        loader.add_value('shipping_cost', '3.95')

    product = loader.load_item()
    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    product['metadata'] = metadata

    reviews_url = 'http://mark.reevoo.com/reevoomark/en-GB/product.html?page=1&sku=%s&tab=reviews&trkref=MLC'
    meta = {
        'dont_retry': True,
        'handle_httpstatus_list': [404, 302],
        'product': product,
    }
    yield scrapy.Request(reviews_url % identifier,
                         callback=self.parse_review_page,
                         meta=meta)
def parse_product(self, response):
    """Yield a single product scraped from a product detail page.

    Name, price and availability are built by concatenating every text
    node matched by their XPath. The catalogue number is used as both SKU
    and identifier, the brand comes from ``response.meta`` and the category
    from a ``category: "..."`` line in the raw body. Stock is set to ``'0'``
    unless the availability meta tag contains "IN STOCK" (case-insensitive).
    The EAN, when present, is stored in the item's metadata.
    """
    hxs = HtmlXPathSelector(response)

    def joined_text(xpath):
        # Concatenate all nodes matched by ``xpath`` into one string.
        return ''.join(hxs.select(xpath).extract())

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name',
                     joined_text('//h1[@class="productHeading"]//text()'))
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand', ''))

    category_matches = re.findall(u',\\ncategory: "(.*)",', response.body)
    if category_matches:
        loader.add_value('category', category_matches[0])
    else:
        loader.add_value('category', '')

    for field in ('sku', 'identifier'):
        loader.add_xpath(field, '//span[@id="catalogueNumber"]/text()')

    images = hxs.select('//div[@id="amp-originalImage"]/img/@src').extract()
    if images:
        loader.add_value('image_url', images[0])

    loader.add_value('price', joined_text('//div[@class="priceNow"]//text()'))

    availability = joined_text(
        '//meta[@property="product:availability"]/@content')
    if 'IN STOCK' not in availability.upper():
        loader.add_value('stock', '0')

    item = loader.load_item()

    metadata = ToyMonitorMeta()
    ean = joined_text('//span[@id="productEAN"]/text()').strip()
    if ean:
        metadata['ean'] = ean
    item['metadata'] = metadata

    yield item
def parse_product(self, response):
    """Build a product from a product page and request its reviews.

    Pages without a name or without a ``skuIdVal`` identifier are logged
    and produce nothing. A missing price is logged but the product is still
    loaded with an empty price. The identifier doubles as SKU; the
    ``itemprop="sku"`` text, when present, is added as a further SKU value.
    A Bazaarvoice request is yielded with ``meta={'product': ...,
    'offset': 0}`` and ``parse_reviews`` as callback.
    """
    url = response.url
    l = ProductLoader(item=Product(), response=response)

    name = response.xpath('//h1[@class="page-title"]/text()').extract()
    if not name:
        logging.error("ERROR! NO NAME! %s" % url)
        return
    name = name[0].strip()
    l.add_value('name', name)

    price = response.xpath('//*[@itemprop="price"]/text()').extract()
    if not price:
        logging.error("ERROR! NO PRICE! %s %s" % (url, name))
        price = ''
    l.add_value('price', price)

    # Index only after checking the list: indexing first raised IndexError
    # and made the "no identifier" branch unreachable for a missing input.
    identifiers = response.xpath('//input[@name="skuIdVal"]/@value').extract()
    identifier = identifiers[0] if identifiers else ''
    if not identifier:
        logging.error("ERROR! IDENTIFIER! %s %s" % (url, name))
        return

    # First and last breadcrumb entries are excluded from the category path.
    categories = response.xpath(
        '//div[@id="breadcrumb"]//span[@itemprop="title"]/text()'
    )[1:-1].extract()
    for category in categories:
        l.add_value('category', category.strip())

    l.add_value('identifier', identifier)
    l.add_value('sku', identifier)
    l.add_value('url', url)

    sku = response.xpath('//p[@itemprop="sku"]/text()').extract()
    if sku:
        l.add_value('sku', sku[0])

    image_url = response.xpath(
        '//div[contains(@class, "static-product-image")]/img/@src'
    ).extract()
    if image_url:
        l.add_value('image_url', image_url[0])

    promo = response.xpath(
        '//div[@id="bbSeller1"]//div[@class="savings"]//span[@class="saving"]/text()'
    ).extract()

    product = l.load_item()
    metadata = ToyMonitorMeta()
    if promo:
        metadata['promotions'] = promo[0]
    metadata['reviews'] = []
    product['metadata'] = metadata

    reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=asiwwvlu4jk00qyffn49sr7tb&apiversion=5.5&displaycode=1235-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + identifier + "&filter.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&sort.q0=submissiontime%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&limit.q0=30&offset.q0=0&limit_comments.q0=3&callback=bv_1111_57408"
    yield Request(reviews_url,
                  meta={'product': product, 'offset': 0},
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Build a product from a product page and fetch its reviews if any.

    The name is taken from the first of three candidate XPaths that matches,
    the category is the last breadcrumb link, the price falls back to the
    ``itemprop="price"`` meta tag and finally to ``'0.00'``. Stock is set to
    0 unless the ``item_available`` input reports ``InStock``. Promotions
    are copied from ``response.meta``.

    When the body embeds an ``app.reviewsQnAModel`` JSON blob, a request to
    its ``bvAPICallURL`` (limit forced to 100) is yielded with the item in
    ``meta['item']``; otherwise the item itself is yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    brand = hxs.select(
        '//td[normalize-space(text())="Brand"]/../td[2]/text()'
    ).extract_first()
    product_url = urljoin(base_url, response.url)

    # Try each name location in turn; keep the first non-empty result.
    name = []
    for name_xpath in ('//h1[@id="productName"]/text()',
                       '//div[@class="product-name"]/text()',
                       '//h1[@class="product-name"]/text()'):
        name = hxs.select(name_xpath).extract()
        if name:
            break

    images = (hxs.select('//img[@id="prodImg"]/@src').extract()
              or hxs.select('//img[@class="singleImage"]/@src').extract())
    image_url = urljoin(base_url, images[0]) if images else ''

    breadcrumb = hxs.select(
        '//ol[@id="navBreadcrumbs"]/li/h2//a/text()').extract()
    category = breadcrumb[-1].strip() if breadcrumb else None

    in_stock = bool(hxs.select(
        '//input[@id="item_available"][@value="InStock"]').extract())

    prices = hxs.select(
        '//div[@class="producttop"]//span[@class="productPrice"]/span[@class="pounds" or @class="newPrice"]/text()[last()]'
    ).extract() or hxs.select('//meta[@itemprop="price"]/@content').extract()
    price = prices[-1] if prices else '0.00'

    model_numbers = hxs.select(
        '//td[contains(text(), "Model Number")]/../td[@class="value"]/text()'
    ).extract()
    sku = model_numbers[0].strip() if model_numbers else ''

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', product_url)
    loader.add_value('name', name)
    loader.add_value('image_url', image_url)
    loader.add_value('price', price.replace(' ', '').replace(',', '.'))
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('brand', brand)
    loader.add_xpath('identifier', '//input[@id="product_sku_string"]/@value')
    if not in_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()
    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    item['metadata'] = metadata

    promotions = response.meta.get('promotions', '')
    if promotions:
        item['metadata']['promotions'] = promotions

    review_blobs = re.findall('app.reviewsQnAModel = (.*);', response.body)
    if not review_blobs:
        yield item
        return

    review_data = json.loads(review_blobs[0])
    review_url = add_or_replace_parameter(
        'http:' + review_data['bvAPICallURL'], 'limit', '100')
    yield Request(review_url,
                  meta={'item': item, 'offset': '0'},
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Yield a product, with its on-page reviews, from a product page.

    When the page has no ``itemprop="name"`` text, a request for the
    ``detail.jsp?pName=`` form of the URL is yielded instead and parsing
    stops. Otherwise the product is loaded, promotions from
    ``response.meta`` are mapped through ``self.promos``, and, if a
    write-review link mentioning the product id exists, every
    ``reviewbody`` block is loaded as a ``Review`` and appended to the
    item's metadata before the item is yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    names = hxs.select('//*[@itemprop="name"]/text()').extract()
    if not names:
        retry_url = response.url.replace(
            'hamleys.com/', 'hamleys.com/detail.jsp?pName=').replace('.ir', '')
        yield Request(retry_url, callback=self.parse_product)
        return
    name = names[-1].strip()

    stock_text = ''.join(
        hxs.select('//li[@class="stockStatus"]/span/text()').extract())
    out_of_stock = 'OUT OF STOCK' in stock_text.upper()

    price_parts = (
        hxs.select('//div[@class="productprice "]/text()').re("([.0-9]+)")
        or hxs.select(
            '//div[@class="productprice "]/span[@class="detailOurPrice"]/text()'
        ).re("([.0-9]+)")
    )
    price = "".join(price_parts)

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@class="productMain"]/@src',
                     TakeFirst())
    loader.add_value('price', price)

    crumbs = hxs.select(
        '//div[@class="pagetopnav"]/ul[contains(@class, "crumb")]/li/a/text()'
    ).extract()
    # The second-to-last breadcrumb link is used as the category.
    loader.add_value('category', crumbs[-2])

    # SKU: a trailing run of three or more digits in the product name.
    loader.add_value('sku', name, re=r' (\d\d\d+)\s*$')
    loader.add_value('brand', response.meta.get('brand', ''))

    product_code = hxs.select('//*[@itemprop="productID"]/text()').extract()[0]
    loader.add_value('identifier', product_code.replace('Code: ', ''))

    if out_of_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()

    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    item['metadata'] = metadata
    promotions = response.meta.get('promotions', '')
    if promotions:
        item['metadata']['promotions'] = self.promos.get(
            promotions, promotions)

    review_blocks = hxs.select('//div[@class="reviewbody"]')
    prod_id = response.xpath('//input[@name="id"]/@value').extract()[0]
    has_reviews = response.xpath(
        '//a[@class="writeReviewLink" and contains(@onclick, "' + prod_id +
        '")]').extract()

    if has_reviews:
        for block in review_blocks:
            review_loader = ReviewLoader(item=Review(),
                                         response=response,
                                         date_format="%B %d, %Y")
            title = ''.join(
                block.select('.//div[@class="reviewTagLine"]/text()').extract())
            text = ''.join(
                block.select('.//div[@class="reviewText"]/text()').extract())

            # Title (when present) and body, UTF-8 encoded, one per line.
            pieces = [title, text] if title else [text]
            full_text = '\n'.join(piece.encode('utf-8') for piece in pieces)
            review_loader.add_value('full_text',
                                    unicode(full_text, errors='ignore'))

            # The star width found in the style attribute is divided by 20.
            star_width = block.select(
                './/div[@class="reviewStarsInner"]/@style').re(r'\d+.\d+')[0]
            review_loader.add_value('rating', float(star_width) / 20)
            review_loader.add_value('url', item['url'])
            item['metadata']['reviews'].append(review_loader.load_item())

    yield item
def parse(self, response):
    """Crawl a category/listing page.

    Yields, in order:

    * a request for every sub-category link whose URL contains ``/toys/``;
    * for every product tile, a request to the product page with the
      partially loaded item in ``meta['product']`` and ``parse_product`` as
      callback;
    * a request for every pagination link.

    The breadcrumb category and the base URL are page-level values, so they
    are computed once rather than once per product tile.
    """
    base_url = get_base_url(response)

    category_links = response.xpath(
        '//div[@id="subCategorycategories"]/ul/li/a/@href').extract()
    category_links += response.xpath(
        '//li[@id="categories"]/ul/li/a/@href').extract()
    category_links += response.xpath(
        '//div[@class="cat_detail"]/div/a/@href').extract()
    for link in category_links:
        url = urljoin_rfc(base_url, link)
        if '/toys/' in url:
            yield Request(url)

    # Breadcrumb links (minus the first) plus the trailing breadcrumb span.
    category = response.xpath(
        '//div[@class="breadcrumb_links"]//a/text()').extract()[1:]
    category += response.xpath(
        '//div[@class="breadcrumb_links"]//span/text()').extract()[-1:]
    category = [part.strip() for part in category]

    # products new parse method
    products = response.xpath('//div[contains(@id, "PSPProductList")]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)

        name = "".join(
            product.xpath(
                ".//div[contains(@class, 'product_name')]//text()"
            ).extract()).strip()
        brand = product.xpath(
            'div/a/div[@class="brand_name"]/text()').extract()[0].strip()
        url = product.xpath(".//a/@href").extract()
        url = urljoin_rfc(base_url, url[0])
        sku = product.xpath(".//div[contains(@id, 'psp')]/@id").re(
            "psp_(.+)")[0]

        price = product.xpath(".//span[@class='price_now']/text()").re(
            u'Now\xa0\xa3(.*)')
        if not price:
            price = product.xpath(
                ".//span[@class='price-actual' and @itemprop='price']/text()"
            ).extract()
        if price:
            price = price[0]
        else:
            # NOTE(review): in the collapsed source the nesting of this
            # stock=0 call is ambiguous; it is read here as "no price means
            # out of stock" - confirm against the original file.
            price = ''
            loader.add_value('stock', 0)

        name = brand + ' ' + name
        loader.add_value('name', name.strip())
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_xpath('image_url', 'div//img[@class="proImg"]/@src')
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('price', price)

        item = loader.load_item()
        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        item['metadata'] = metadata

        yield Request(item['url'],
                      meta={'product': item},
                      callback=self.parse_product)

    for page in response.xpath('//div[@id="pagination"]/a/@href').extract():
        yield Request(urljoin_rfc(base_url, page))
def parse_brand(self, response):
    """Crawl a brand listing page and request reviews for each product.

    When the page lists products, every non-selected pagination link is
    followed with the same ``meta``. Each product tile with a name is
    loaded (brand from ``response.meta``, identifier from the ``/p-<id>/``
    part of its link, SKU from the ``data-event`` id and from the last word
    of the name) and a Bazaarvoice request is yielded with
    ``meta={'item': ..., 'offset': 0}`` and ``parse_reviews`` as callback.
    Tiles without a name are skipped.
    """
    products = response.xpath(
        '//div[contains(@class, "listing-item") and contains(@class, "product")]'
    )
    pages = response.xpath(
        '//ul[contains(@class, "pagination") and contains(@class, "pages") and contains(@class, "pagination-sm")]'
        '//a[contains(@class, "ajax-link") and not(contains(@class, "selected"))]/@href'
    ).extract()

    if products:
        for page in pages:
            yield Request(response.urljoin(page),
                          callback=self.parse_brand,
                          meta=response.meta)

    # Heading text with its trailing " (<count>)" removed.
    category_name = response.xpath(
        '//h1[contains(@class, "category-name")]/text()').re(
            r'^(.*) \(\d+\)')

    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        try:
            product_name = product.xpath(
                './/div[@class="product-description"]/a[contains(@class, "product-name")]/text()'
            ).extract()[0].strip()
        except IndexError:
            # Only a missing name node is expected here; a bare ``except``
            # would also hide unrelated errors (and KeyboardInterrupt).
            continue
        else:
            loader.add_value('name', product_name)

        loader.add_value('brand', response.meta.get('brand', ''))
        loader.add_xpath(
            'url',
            './/div[@class="product-description"]/a[contains(@class, "product-name")]/@href',
            lambda u: response.urljoin(u[0]))
        loader.add_xpath(
            'identifier',
            './/div[@class="product-description"]/a[contains(@class, "product-name")]/@href',
            re=r'/p-(\d+)/')
        loader.add_xpath('image_url', './/div[@class="image"]//img/@src')

        sku = product.xpath('./div/a/@data-event').re('"id": "([0-9]+)"')
        if sku:
            loader.add_value('sku', sku[0])

        # Prefer the "price now" span; fall back to the whole container.
        price = product.xpath(
            './/div[@class="pricing-container"]//span[@class="price now"]'
        )
        if not price:
            price = product.xpath('.//div[@class="pricing-container"]')
        price = price.re(r'([\d,.]+)')[-1]
        loader.add_value('price', price)

        # The last word of the product name is added as a further SKU value.
        sku = product_name.split(' ')[-1]
        if not sku:
            self.log('ERROR: no SKU found! URL:{}'.format(response.url))
        else:
            loader.add_value('sku', sku)

        loader.add_value('category', category_name)

        item = loader.load_item()
        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        item['metadata'] = metadata

        product_id = product.xpath(
            './/a[@class="ega-prodclick"]/@data-event').re(
                '"id": "(.*)"')[0]
        reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=lquk3xwywjr5jl6ty8h5wc2kx&apiversion=5.5&displaycode=17935-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A' + product_id + '&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=submissiontime%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_55059'
        yield Request(reviews_url,
                      meta={'item': item, 'offset': 0},
                      callback=self.parse_reviews)
def parse_product(self, response):
    """Build a product from a product page and request its review script.

    Non-HTML responses are ignored. A missing name, image or SKU is only
    logged; a missing price is logged and ends parsing. The price falls
    back to the ``product_base_price`` value embedded in a script, and the
    promotion falls back to the ``was-2 block`` span. The review request is
    yielded with ``parse_review`` as callback; its ``meta`` carries the
    product, the current page (1), a URL template for further pages, and
    disables retries and redirects while letting 404/302 through.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    names = hxs.select('//div[@class="product-title"]/h1/text()').extract()
    if names:
        loader.add_value('name', names[0].strip())
    else:
        self.log('ERROR: no product NAME found! URL:{}'.format(response.url))

    prod_id = hxs.select('//input[@id="productId"]/@value').extract()[0]
    loader.add_value('identifier', prod_id)
    loader.add_value('url', response.url)

    prices = hxs.select(
        '//div[@class="price clearfix"]/div[@class="floatleft block"]/span/text()'
    ).extract() or hxs.select(
        '//script[contains(text(), "product_base_price")]').re(
            'product_base_price:\["(.*)"\]')
    if not prices:
        self.log('ERROR: no product PRICE found! URL:{}'.format(response.url))
        return
    loader.add_value('price', prices[0])

    images = hxs.select('//a[@id="mainImage"]/img/@src').extract()
    if images:
        loader.add_value(
            'image_url',
            urljoin_rfc(get_base_url(response), images[0].strip()))
    else:
        self.log('ERROR: no product Image found!')

    loader.add_value('category', response.meta.get('category', ''))

    skus = hxs.select('//input[@name="skuId"]/@value').extract()
    if skus:
        loader.add_value('sku', skus[0].strip())
    else:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))

    brand_match = re.search('product_brand:\[\"(.*)\"\],', response.body)
    if brand_match:
        loader.add_value('brand', brand_match.group(1).strip())

    promo = response.xpath(
        '//div[contains(@class,"pdp_add-cart")]/div[@class="truuk-offer-box"]'
        '//span[@class="truuk-special-offer-body"]/text()').extract()
    if not promo:
        promo = response.xpath(
            '//div[contains(@class,"pdp_add-cart")]//span[@class="was-2 block"]/text()'
        ).extract()

    product = loader.load_item()

    first_page_url = (
        u'http://www.toysrus.co.uk/assets/pwr/content/%s/%s-en_GB-1-reviews.js'
        % (self.calculate_url(prod_id), prod_id))

    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    if promo:
        metadata['promotions'] = promo[0]
    product['metadata'] = metadata

    # Template for later pages: the remaining ``%s`` is the page number.
    page_template = (u'http://www.toysrus.co.uk/assets/pwr/content/' +
                     u'%s/%s' % (self.calculate_url(prod_id), prod_id) +
                     u'-en_GB-%s-reviews.js')

    request_meta = {
        'dont_retry': True,
        'handle_httpstatus_list': [404, 302],
        'cur_page': 1,
        'product': product,
        'dont_redirect': True,
        'reviews_url': page_template,
    }
    yield Request(first_page_url, meta=request_meta,
                  callback=self.parse_review)
def parse_product(self, response):
    """Build a product from schema data and request its reviews if rated.

    Name, SKU and price come from ``SpiderSchema(response).get_product()``;
    the brand from ``response.meta`` (falling back to the page's brand
    text). Pages without a ``productId`` form input are logged and produce
    nothing. All promotion texts are stripped and joined with ``' * '``
    (the key is always set, possibly to an empty string).

    When the schema data has an ``aggregateRating``, a Bazaarvoice request
    filtered by the form's ``partNumber`` is yielded with
    ``meta={'item': ..., 'offset': 0}``; otherwise the item is yielded.
    """
    pdata = SpiderSchema(response).get_product()
    hxs = HtmlXPathSelector(response)
    page_url = response.url

    l = ProductLoader(item=Product(), response=response)
    l.add_value('name', pdata['name'])
    l.add_value('sku', pdata['sku'])
    l.add_value('category', SpiderSchema(response).get_category())

    image_src = response.css('li.active a img::attr(src)').extract_first()
    if image_src:
        l.add_value('image_url', response.urljoin(image_src))

    page_brand = response.css('.pdp-view-brand-main ::text').extract_first()
    l.add_value('url', page_url)
    l.add_value('price', pdata['offers']['properties']['price'])
    l.add_value('brand', response.meta.get('brand', page_brand))

    identifier = response.xpath(
        '//form/input[@name="productId"]/@value').extract_first()
    if not identifier:
        self.log('No identifier found on %s' % response.url)
        return
    l.add_value('identifier', identifier)

    item = l.load_item()

    promo_texts = response.xpath('//li[@class="pricesale"]/text()').extract()
    promo_texts += response.xpath(
        '//div[@class="special-offers"]/p/text()').extract()
    promotions = u' * '.join([text.strip() for text in promo_texts])

    metadata = ToyMonitorMeta()
    ean = hxs.select('//li[contains(text(), "EAN")]/text()').re(
        "EAN: ([0-9]+)")
    if ean:
        metadata['ean'] = ean[0]
    metadata['reviews'] = []
    item['metadata'] = metadata
    item['metadata']['promotions'] = promotions

    part_number = response.xpath(
        '//form/input[@name="partNumber"]/@value').extract_first()

    if not pdata.get('aggregateRating'):
        yield item
        return

    review_url = (
        "http://api.bazaarvoice.com/data/reviews.json?Callback=jQuery111206106209812916942_1465931826753"
        "&apiversion=5.4&passkey=q3mz09yipfffc2yhguids3abz&locale=en_GB&Filter=ProductId:%s"
        "&Filter=IsRatingsOnly:false&Include=Products&Stats=Reviews&Limit=100&Offset=0&Sort=SubmissionTime:Desc"
        "&_=1465931826756") % (part_number)
    yield Request(review_url,
                  meta={'item': item, 'offset': 0},
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Build the product (and its option variants) and request reviews.

    A base item is loaded from the page (catalogue number as SKU and
    identifier, "3 for 2" promotion when the image URL contains ``3for2``,
    stock ``'0'`` unless the availability meta tag contains "IN STOCK").

    The ``stockMatrix`` JavaScript array embedded in the body is then
    parsed. When it holds more than one row, one copy of the base item is
    produced per row, with the option names appended to the name, the
    option id appended to the identifier, the row's price, and stock 0 if
    any cell mentions "out of stock". Otherwise the base item is used.

    A single Bazaarvoice request is yielded with the items in
    ``meta['items']`` and ``parse_reviews`` as callback.
    """
    # Local import: ``ast`` is only needed for parsing the option rows.
    import ast

    loader = ProductLoader(item=Product(), response=response)
    name = ''.join(
        response.xpath('//h1[@class="productHeading"]//text()').extract())
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand', ''))

    category = re.findall(u',\\ncategory: "(.*)",', response.body)
    category = category[0] if category else ''
    loader.add_value('category', category)

    loader.add_xpath('sku', '//span[@id="catalogueNumber"]/text()')
    loader.add_xpath('identifier', '//span[@id="catalogueNumber"]/text()')

    image_url = response.xpath(
        '//div[@id="amp-originalImage"]/img/@src').extract()
    promotion = None
    if image_url:
        loader.add_value('image_url', image_url[0])
        if '3for2' in image_url[0]:
            promotion = '3 for 2'

    price = ''.join(
        response.xpath('//div[@class="priceNow"]//text()').extract())
    loader.add_value('price', price)

    out_of_stock = 'IN STOCK' not in ''.join(
        response.xpath(
            '//meta[@property="product:availability"]/@content'
        ).extract()).upper()
    if out_of_stock:
        loader.add_value('stock', '0')

    item = loader.load_item()

    metadata = ToyMonitorMeta()
    ean = ''.join(
        response.xpath('//span[@id="productEAN"]/text()').extract()).strip()
    if ean:
        metadata['ean'] = ean
    metadata['reviews'] = []
    if promotion is not None:
        metadata['promotions'] = promotion
    item['metadata'] = metadata

    items = []
    amount_options = len(response.xpath('//ul[@class="customerSelection"]'))
    options = []

    # Extract option arrays from the whitespace-normalised body.
    options_text = re.findall(r'stockMatrix = \[(.*) \]; sdg.productOptions',
                              ' '.join(response.body.split()))
    if options_text:
        options_text = re.findall('(.*)]; sdg.productOptions',
                                  options_text[0])
        for line in options_text[0].split(' , '):
            if '"sku' not in line:
                continue
            row_match = re.search(r'\[(.*)\]', line)
            if not row_match:
                continue
            row_literal = row_match.group(0).replace('null', 'None')
            # SECURITY: this text comes from the remote page. It used to be
            # passed to eval(), which would execute arbitrary expressions;
            # literal_eval only accepts plain literals (lists, strings,
            # numbers, None) and raises on anything else.
            options.append(ast.literal_eval(row_literal))

    if len(options) > 1:
        for option in options:
            option_item = deepcopy(item)
            # Row layout: option names first, then the option id; the price
            # sits five cells from the end.
            option_name = ' '.join(option[:amount_options])
            option_id = option[amount_options]
            option_price = option[-5]
            option_item['name'] += ' ' + option_name
            option_item['identifier'] += '-' + option_id
            option_item['price'] = extract_price(option_price)
            sold_out = [value for value in option
                        if value and 'out of stock' in value.lower()]
            if sold_out:
                option_item['stock'] = 0
            items.append(option_item)
    else:
        items.append(item)

    product_id = re.findall('productId: "(.*)"', response.body)[0]
    reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=2x4wql4zeys4t8mu5x3x4rb1a&apiversion=5.5&displaycode=10628-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A'+product_id+'&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_18822'
    yield Request(reviews_url,
                  meta={'items': items, 'offset': 0, 'url': response.url},
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Build one item per product variant (or one for the page) and
    request the Bazaarvoice review page.

    When the page has a ``prod-multi-product-types`` block, one item is
    built per ``product-type`` node, named ``"<name> <option name>"``.
    Otherwise a single item is built from the page and kept only if it has
    an identifier. If any item was collected, a review request is yielded
    with ``meta={'items': ..., 'url': response.url}`` and
    ``parse_review_page`` as callback.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select(
        'normalize-space(//*[@itemprop="name"]/text())').extract()[0]
    brand = hxs.select(
        'normalize-space(//*[@itemprop="brand"]/span/text())').extract()

    try:
        image_url = urljoin_rfc(
            base_url,
            hxs.select('//div[@id="prod-media-player"]'
                       '//img/@src').extract()[0].strip())
    except IndexError:
        image_url = ''

    options = hxs.select('//div[@id="prod-multi-product-types"]')
    items = []

    if options:
        # The SKU is read from the page-level info tab, so it is the same
        # for every variant: look it up once instead of once per variant.
        sku = hxs.select(
            u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()'
        ).extract()
        if not sku:
            sku = hxs.select(
                u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()'
            ).extract()

        products = options.select('.//div[@class="product-type"]')
        for product in products:
            opt_name = product.select('.//h3/text()').extract()[0].strip()
            try:
                # NOTE(review): this XPath starts with ``//`` and is
                # therefore evaluated against the whole document, not this
                # variant - confirm whether ``.//`` was intended.
                stock = product.select(
                    '//div[contains(@class, "mod-stock-availability")]'
                    '//p/strong/text()').re(r'\d+')[0]
            except IndexError:
                stock = 0

            loader = ProductLoader(item=Product(), selector=product)
            if sku:
                loader.add_value('sku', sku[0].strip())
            loader.add_xpath(
                'identifier',
                './/div[contains(@class, "mod-product-code")]/p/text()')
            loader.add_value('name', '%s %s' % (name, opt_name))
            loader.add_xpath(
                'category',
                '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_xpath('price', './/p[@class="price"]/strong/text()')
            loader.add_value('stock', stock)

            item = loader.load_item()
            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            item['metadata'] = metadata
            items.append(item)
    else:
        # Price: three candidate locations, first non-empty wins. The
        # fallbacks use split(), so they hand the loader a list of tokens.
        price = ''.join(
            hxs.select('//ul/li/strong[@class="price"]/text()').extract()
        ).strip()
        if not price:
            price = ''.join(
                hxs.select('//span[@class="now-price"]/text()').extract()
            ).split()
        if not price:
            price = ''.join(
                hxs.select('//div[@id="prod-price"]//strong/text()').extract()
            ).split()

        try:
            stock = hxs.select(
                '//div[contains(@class, "mod-stock-availability")]'
                '//p/strong/text()').re(r'\d+')[0]
        except IndexError:
            stock = 0

        loader = ProductLoader(item=Product(), response=response)
        sku = hxs.select(
            u'//div[@id="prod-product-code"]//h2[contains(text(),"Product code")]/following-sibling::p/text()'
        ).extract()
        if sku:
            loader.add_value('sku', sku[0].strip())
        loader.add_xpath('identifier',
                         '//div[@id="prod-product-code"]/p/text()')
        loader.add_value('name', name)
        loader.add_xpath(
            'category',
            '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('stock', stock)

        item = loader.load_item()
        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        item['metadata'] = metadata
        if item.get('identifier'):
            items.append(item)

    if items:
        product_id = response.xpath('//div/@data-product-id').extract()[0]
        reviews_url = 'http://johnlewis.ugc.bazaarvoice.com/7051redes-en_gb/%s/reviews.djs?format=embeddedhtml&page=1&scrollToTop=true'
        yield Request(reviews_url % product_id,
                      callback=self.parse_review_page,
                      meta={'items': items, 'url': response.url})
def parse_product(self, response):
    """Build a product from a product page and request its review page.

    Non-HTML responses are ignored. A missing name or price is logged and
    ends parsing; a missing image, category or SKU is only logged. The
    brand comes from ``response.meta``; the "Manufacturer Number" text is
    stored as EAN and the badge image ``alt`` as promotion. The review
    request is yielded with ``meta={'item': ...}`` and
    ``parse_review_page`` as callback.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    name = hxs.select(u'//div[@class="prod_title"]/h1/text()').extract()
    if not name:
        self.log('ERROR: no product NAME found! URL:{}'.format(response.url))
        return
    name = name[0].strip()
    loader.add_value('name', name)

    prod_id = hxs.select('//input[@name="productCode"]/@value').extract()
    loader.add_value('identifier', prod_id[0])
    loader.add_value('url', response.url)

    # Index only after checking the list: indexing first raised IndexError
    # on a missing price node, so the guard below was never reached.
    prices = hxs.select(u'//h3[@class="prod_price"]/text()').extract()
    price = prices[0].strip() if prices else ''
    if not price:
        self.log('ERROR: no product PRICE found! URL:{}'.format(response.url))
        return
    loader.add_value('price', price)

    product_image = hxs.select(u'//a[@id="imageLink"]/img/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response), product_image[0].strip())
        loader.add_value('image_url', image)

    # First and last breadcrumb links are excluded from the category path.
    categories = hxs.select(
        u'//nav[@id="breadcrumb"]/ol/li/a/text()').extract()[1:-1]
    if not categories:
        self.log('ERROR: category not found! URL:{}'.format(response.url))
    else:
        for category in categories:
            loader.add_value('category', category.strip())

    sku = hxs.select(
        '//dl[dt/text()="Our Product Number"]/dd/text()').extract()
    if not sku:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))
    else:
        loader.add_value('sku', sku[0].strip())

    loader.add_value('brand', response.meta.get('brand', ''))

    item = loader.load_item()

    metadata = ToyMonitorMeta()
    ean = ''.join(
        hxs.select('//dl[dt/text()="Manufacturer Number"]/dd/text()').
        extract()).strip()
    if ean:
        metadata['ean'] = ean
    promo = response.xpath(
        '//div[@class="prod_details_main"]/span[@class="badge"]/img/@alt'
    ).extract()
    if promo:
        metadata['promotions'] = promo[0]
    metadata['reviews'] = []
    item['metadata'] = metadata

    reviews_url = 'http://theentertainer.ugc.bazaarvoice.com/6038-en_gb/%s/reviews.djs?format=embeddedhtml&page=1&scrollToTop=true'
    yield Request(reviews_url % item['identifier'],
                  callback=self.parse_review_page,
                  meta={'item': item})
def parse_product(self, response):
    """Scrape a product page, following its variations when it has any.

    When the page exposes a variation ``<select>`` (and the request was not
    itself issued for a variation), one request per option after the first
    is yielded back into this callback, flagged with
    ``meta['no_variations']`` and carrying the option label in
    ``meta['name']``. Otherwise the product is loaded and a request for the
    first page of its reviews is yielded.

    Stock is set to 0 when no in-stock marker is found, when the variation
    label contains ``(Out of stock)``, or when the page shows no price.
    A shipping cost of ``3.95`` is added when the loaded price is below 50.

    :param response: product page response; reads ``meta['no_variations']``
        and ``meta['name']``.
    :return: generator of ``scrapy.Request`` objects (variation pages, or a
        single reviews request with the product under ``meta['item']``).
    """
    variations = None
    if not response.meta.get('no_variations', False):
        variations = response.xpath(
            '//select[@class="variation-select"]/option')
        name_add = ''
    else:
        # The forwarded label is None when the option had no text node;
        # treat that the same as "no suffix".
        name_add = response.meta.get('name') or ''

    if variations:
        h = HTMLParser()
        # NOTE(review): the first option is skipped - presumably a
        # placeholder entry; confirm against the site markup.
        for option in variations[1:]:
            name = option.xpath('./text()').extract_first()
            url = option.xpath('./@data-url').extract_first()
            if not url:
                # Without a target URL there is nothing to request.
                self.log('ERROR: variation without data-url! URL:{}'.format(
                    response.url))
                continue
            self.log(url)
            url = h.unescape(url)
            meta = {'name': name, 'no_variations': True}
            yield scrapy.Request(url, meta=meta, callback=self.parse_product)
        return

    identifier = response.xpath('//*[@id="pid"]/@value').extract_first()
    if not identifier:
        # The identifier is needed for the item and for the reviews URL.
        self.log('ERROR: no product IDENTIFIER found! URL:{}'.format(
            response.url))
        return
    identifier2 = response.xpath(
        '//*[@id="pid"]/@data-variant-id').extract_first()
    # Append the variant id only when the attribute exists and differs.
    if identifier2 is not None and identifier != identifier2:
        identifier += '_' + identifier2

    stock = response.xpath(
        '//span[@class="in-stock-msg information"]').extract_first()
    name = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
    if name_add != '':
        if '(Out of stock)' in name_add:
            name_add = name_add.replace('(Out of stock)', '')
            stock = None
        # Fall back to the variation label alone when the page has no title.
        name = name + ' ' + name_add if name is not None else name_add

    price = response.xpath(
        '//span[@itemprop="price"]/text()').extract_first()
    if not price:
        price = 0
        stock = None

    category = response.xpath(
        '//div[@class="breadcrumbs"]/a/text()').extract()
    image_url = response.xpath(
        '//img[@itemprop="image"]/@src').extract_first()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('brand', 'Disney')
    loader.add_value('category', category)
    loader.add_value('url', response.url)
    # Only record an image when the page actually provides one.
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_value('price', price)
    if not stock:
        loader.add_value('stock', 0)
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', '3.95')
    product = loader.load_item()

    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    product['metadata'] = metadata

    reviews_url = (
        'http://disneystore.ugc.bazaarvoice.com/4848-en_gb/%s/reviews.djs'
        '?format=embeddedhtml&page=1&scrollToTop=true')
    yield scrapy.Request(reviews_url % product['identifier'],
                         callback=self.parse_review_page,
                         meta={'item': product})
def parse_product(self, response):
    """Scrape a product page into one item per variant.

    Variants are read from the inline ``var variant_details = ...;`` script
    when the page has one. Otherwise a single variant is built from the
    ``item_id`` input and the price ``<meta>`` tag. Each variant dict is
    expected to provide ``itemVariantId``, ``sku``,
    ``defaultPricing['price']`` and optionally ``variantValues``.

    When the product name, the variant data, or (on the fallback path) the
    identifier or price cannot be found, the problem is logged and nothing
    is yielded.

    :param response: product page response; ``response.meta['brand']`` is
        used when the page itself has no brand element.
    :return: generator yielding either one ``Request`` for the "See All
        Reviews" link (items under ``meta['items']``, page URL under
        ``meta['url']``, callback ``self.parse_reviews``), or the items
        themselves when no such link exists.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//h1[@class="b-ttl-main"]/text()').extract()
    if not name:
        self.log('ERROR: no product NAME found! URL:{}'.format(response.url))
        return
    name = name[0]

    mpn = hxs.select('//span[@class="b-item"]').re("MPN: ([0-9]+)")
    ean = hxs.select('//span[@class="b-item"]').re("EAN: ([0-9]+)")
    sku = hxs.select('//input[@name="sku"]/@value').extract()
    dealer_name = "".join(
        hxs.select('//h2[@id="auto_shop_info_name"]//text()').extract()
    ).strip()

    brand = hxs.select('.//span[@itemprop="brand"]/text()').extract()
    if brand:
        brand = brand[0].strip()
    else:
        brand = response.meta.get('brand')

    categories = hxs.select(
        '//ul[@class="b-breadcrumb"]/li/a/text()').extract()
    image_url = hxs.select(
        '//img[@itemprop="image"]/@data-frz-src').extract()

    options = hxs.select(
        '//script[contains(text(), "var variant_details")]/text()').extract()
    if options:
        # NOTE(review): every double quote is replaced with a single quote
        # before the payload is handed to json.loads; confirm this matches
        # what the page really serves (JSON strings need double quotes).
        options = options[0].replace('"', "'")
        options = re.findall('var variant_details = (.*);\n', options)
        if not options:
            self.log('ERROR: no variant details found! URL:{}'.format(
                response.url))
            return
        variants = json.loads(options[0])
    else:
        identifier = hxs.select('//input[@name="item_id"]/@value').extract()
        price = hxs.select(
            '//div[@class="b-product-main"]'
            '//meta[@itemprop="price"]/@content').extract()
        if not identifier or not price:
            self.log(
                'ERROR: no product IDENTIFIER or PRICE found! URL:{}'.format(
                    response.url))
            return
        # Mirror the shape of a script-provided variant so that the loop
        # below handles both cases the same way.
        variants = [{'itemVariantId': identifier[0],
                     'sku': sku,
                     'variantValues': [],
                     'defaultPricing': {'price': price[0]}}]

    items = []
    for variant in variants:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', variant['itemVariantId'])
        loader.add_value(
            'name', " ".join([name] + variant.get('variantValues', [])))
        loader.add_value('sku', variant['sku'])
        loader.add_value('url', response.url)
        loader.add_value('price', variant['defaultPricing']['price'])
        loader.add_value('dealer', dealer_name)
        loader.add_value('category', categories)
        if brand:
            loader.add_value('brand', brand)
        if image_url:
            loader.add_value('image_url', image_url[0])
        product = loader.load_item()

        # Fill the metadata completely before attaching it to the item.
        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        if mpn:
            metadata['mpn'] = mpn[0]
        if ean:
            metadata['ean'] = ean[0]
        product['metadata'] = metadata
        items.append(product)

    reviews_url = response.xpath(
        '//a[contains(text(), "See All Reviews")]/@href').extract()
    if reviews_url:
        yield Request(reviews_url[0],
                      callback=self.parse_reviews,
                      meta={'items': items, 'url': response.url})
    else:
        for item in items:
            yield item