def parse_car(self, response):
    """Scrape one car page and yield a single ``Product`` item.

    The price is read from the cell that follows the "Cash Price" label and,
    failing that, from an ``<h2>`` matching "Manager's Special Price ...".
    Pages with neither price yield nothing.

    The identifier is the second-to-last "/"-separated piece of the URL.
    """
    hxs = HtmlXPathSelector(response)

    price = hxs.select(
        '//td[contains(text(), "Cash Price")]/following-sibling::td/text()'
    ).extract()
    if not price:
        price = hxs.select('//h2/text()').re(
            'Manager\'s Special Price (.*)')
    if not price:
        # Neither price variant is present: nothing to emit for this page.
        return

    name = hxs.select(
        '//div[@class="textInner"][./h2]/*//strong/text()').extract()
    if name:
        # Only the first <strong> text is used as the name.
        name = name[0]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', response.url.split('/')[-2])
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    yield loader.load_item()
def parse_product(self, response):
    """Follow unseen product tiles and yield the product(s) of this page.

    Tiles are de-duplicated through ``self.parsed_products`` using the last
    "_"-separated chunk of each link. A page without an ``<h1>`` name is
    handed to ``self.retry``. When the page lists more than one option, one
    item per option is yielded; otherwise a single item is built from the
    page-level SKU and price.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    tile_links = hxs.select(
        '//div[@class="product-tile"]//a/@href').extract()
    for link in tile_links:
        tile_id = link.split('_')[-1]
        if tile_id in self.parsed_products:
            continue
        self.parsed_products.append(tile_id)
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_product)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)

    name = hxs.select('//h1/text()').extract()
    if not name:
        retry_request = self.retry(
            response, "No name for product: " + response.url)
        if retry_request:
            yield retry_request
        return
    product_loader.add_value('name', name)

    # The first breadcrumb link is dropped.
    breadcrumbs = hxs.select(
        '//ol[@class="breadcrumbs"]//a/text()').extract()
    product_loader.add_value('category', breadcrumbs[1:])

    images = hxs.select('//div[@class="item"]//img/@src').extract()
    if images:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, images[0]))

    base_item = product_loader.load_item()

    options = hxs.select(u'//div[contains(@class, "MainProds")]/ol/li')
    if not options:
        options = hxs.select(
            u'//div[@class="SingColl"]/div[contains(@class, "Prod")]')

    if options and len(options) != 1:
        # Several options: each one carries its own name, code and price.
        for opt in options:
            variant = Product(base_item)
            variant['name'] = opt.select(
                u'normalize-space(.//h2/text())').extract()[0]
            variant['sku'] = opt.select(
                u'normalize-space(substring-after(.//div[@class="code"]/text(), ":"))'
            ).extract()[0]
            variant['identifier'] = variant['sku']
            variant['price'] = extract_price(
                opt.select(u'.//span[@class="Price"]/text()').extract()[0])
            yield variant
        return

    # Zero or one option: use the page-level product code and price.
    single = Product(base_item)
    codes = hxs.select('//div[@class="product-sku"]/text()').re(
        r'Product code: (\w+)')
    single['sku'] = codes[-1]
    single['identifier'] = single['sku']
    prices = hxs.select('//div[@class="price-current"]/text()').extract()
    single['price'] = extract_price(prices[-1])
    if single['identifier']:
        yield single
def parse_product(self, response):
    """Scrape a product page and yield one item.

    Field values are first collected into a plain dict and then fed to a
    ``ProductLoaderWithoutSpaces``. ``stock`` is only set (to 0) when the
    page has no ``span.product-in-store`` element.

    Raises ``IndexError`` if the breadcrumbs contain fewer than two
    ``<span>`` texts, because the category is the second-to-last one.
    """
    hxs = HtmlXPathSelector(response)
    # Keep only "scheme://host" so image paths are resolved from the root.
    site_root = '/'.join(get_base_url(response).split('/')[:3])

    product = {}
    product['identifier'] = response.xpath(
        '//input[@name="elementID"]/@value').extract_first()
    if not response.css('span.product-in-store'):
        product['stock'] = 0
    product['name'] = response.xpath(
        '//h1[@itemprop="name"]/text()').extract_first()
    product['price'] = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    product['url'] = response.url

    # Brand: the value next to the "Производитель" label, else the
    # "Другие товары бренда ..." text.
    product['brand'] = hxs.select(
        u'//dt[contains(., "Производитель")]/following-sibling::dd/span/text()'
    ).extract_first()
    if not product['brand']:
        product['brand'] = response.xpath('//span/text()').re_first(
            u'Другие товары бренда (.+)')

    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_url:
        product['image_url'] = urljoin_rfc(site_root, image_url[0].strip())

    product['sku'] = response.xpath(
        u'//span[contains(., "Артикул:")]/following-sibling::span/text()'
    ).extract_first()
    product['category'] = hxs.select(
        '//div[contains(@class, "breadcrumbs")]//span/text()').extract()[-2]

    product_loader = ProductLoaderWithoutSpaces(item=Product(), selector=hxs)
    # items() behaves the same here on Python 2 and also works on Python 3.
    for field, value in product.items():
        product_loader.add_value(field, value)
    yield product_loader.load_item()
def parse_price_from_cart(self, response):
    """Replace the price of the product in ``response.meta['product']``.

    The price is re-read from the cart page and the shipping cost is then
    derived from it: 9.9 when the loaded price is below 200, otherwise 0.
    """
    price_xpath = (
        '//td[@class="right"]/div[@class="prodetail-price"][1]/text()')

    loader = ProductLoader(item=response.meta['product'], response=response)
    loader.replace_xpath('price', price_xpath)

    if loader.get_output_value('price') < 200:
        shipping_cost = 9.9
    else:
        shipping_cost = 0
    loader.replace_value('shipping_cost', shipping_cost)

    yield loader.load_item()
def start_requests(self):
    """Yield one request per row of ``data.csv`` located under ``here``.

    Columns are read positionally: brand, name, url, lenses, lens type.
    The request meta carries a pre-filled loader under ``'loader'`` and a
    ``SpecMeta`` with the lens data under ``'m'``.
    """
    csv_path = os.path.join(here, 'data.csv')
    with open(csv_path) as f:
        for row in csv.reader(f):
            brand = row[0].decode('utf-8')
            # Dashes in the name column are turned into spaces.
            name = row[1].replace('-', ' ').decode('utf-8')
            url = row[2].decode('utf-8')

            spec = SpecMeta()
            spec['Lenses'] = row[3]
            spec['Lens_type'] = row[4]

            loader = ProductLoader(item=Product(),
                                   selector=HtmlXPathSelector())
            for field, value in (('name', name),
                                 ('identifier', url),
                                 ('url', url),
                                 ('brand', brand)):
                loader.add_value(field, value)

            self.log('product url: %s' % url)
            yield Request(url,
                          meta={'m': spec, 'loader': loader},
                          dont_filter=True)
def parse_product(self, response):
    """Scrape a product page; yield the item only when a price was loaded.

    The identifier is the last "/"-separated piece of the page URL. The
    image ``src`` is prefixed with ``http:``.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    name_parts = hxs.select(
        '//h1[@class="product-info-head"]/div[1]/text()').extract()
    loader.add_value('name', ''.join(name_parts).strip())

    loader.add_xpath(
        'price',
        ".//span[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" inline price bold productInfo-orgPrice product-info-price-current \")]/text()")

    images = hxs.select(
        ".//div[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" productPage_image_default \")]/img[1][not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" photo \")]/@src"
    ).extract()
    if images:
        loader.add_value('image_url', 'http:' + images[0])

    loader.add_xpath(
        'brand',
        ".//dl[not(@id)][not(@class)][not(@style)]/dd[1][not(@id)][not(@class)][not(@style)]/text()")

    crumb_texts = hxs.select(
        ".//nav[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" breadcrumbs module small \")]/div[2][not(@id)][not(@class)][not(@style)]/a[1][not(@id)][not(@class)][not(@style)]//text()"
    ).extract()
    if crumb_texts:
        loader.add_value('category', ''.join(crumb_texts).strip())

    loader.add_value('url', response.url)
    loader.add_value('identifier', response.url.split('/')[-1])

    if not loader.get_output_value('price'):
        return
    yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page and yield one item.

    The product code is used both as identifier and SKU. A page without a
    product code is logged and skipped. A page without any price yields the
    item with price 0.

    Raises ``KeyError`` if the request meta has no ``'category'`` and
    ``IndexError`` if the page has no ``h1[@itemprop="name"]`` text.
    """
    pd = Selector(response)
    url = response.url
    category = response.meta['category']
    image_url = pd.select('//a[@id="zoom1"]/@href').extract()

    product_identifier = response.xpath(
        '//span[@id="product-code"]/span/strong/text()').extract()
    if not product_identifier:
        product_identifier = response.xpath(
            '//span[@id="product-code"]/text()').extract()
    if not product_identifier:
        # Without a code there is no identifier; stop here rather than
        # failing on the empty list below.
        log.msg(url + " no Code/ID")
        return
    product_identifier = product_identifier[0].strip()

    product_name = pd.select(
        '//h1[@itemprop="name"]/text()').extract()[0].strip()

    # Brand: the breadcrumb link pointing at "/producer/", else the first
    # entry of the model container.
    brands = response.css('ul.breadcrumbsList li').xpath(
        './/a[contains(@href, "/producer/")]/text()').extract()
    if not brands:
        brands = pd.select(
            '//div[@class="modelContainer"]//li[@class="first"]/a/text()'
        ).extract()
    brand = ''
    if brands:
        brand = brands[0].strip()
    else:
        log.msg(url + " no BRND")

    # Price: the value embedded in a script, else the per-metre price span.
    price = response.xpath('//script/text()').re('product_price":(.+?),')
    if not price:
        price = response.xpath(
            '//span[@id="price_per_m"]/text()').extract()
    # Clean the text only when one was found; the 0 fallback is not a string.
    price = price[0].strip().replace(" ", "") if price else 0

    product_loader = ProductLoader(item=Product(), selector=pd)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    product_loader.add_value('sku', product_identifier)
    if image_url:
        product_loader.add_value('image_url', image_url[0])
    product_loader.add_value('price', price)
    product_loader.add_value('url', url)
    product_loader.add_value('brand', brand)
    product_loader.add_value('category', category)
    yield product_loader.load_item()
def parse_product(self, response):
    """Scrape a product page and yield one item.

    Identifier, SKU and price come from values embedded in the page scripts.
    ``stock`` is only set (to 0) when the basket tick link is absent. The
    image falls back to the second thumbnail, with "XSmall" replaced by
    "Large" in its URL; a page with neither image yields an item without
    ``image_url``.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@class="detailstitle"]/text()')
    loader.add_xpath('identifier', '//script/text()',
                     re=r"'productID':'(\w+?)'")
    loader.add_xpath('sku', '//script/text()', re=r"'productID':'(\w+?)'")
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//script/text()',
                     re=r"'productValue':'([\d\.]+?)'")
    loader.add_xpath('category',
                     '//div[@class="breadcrumb"]/a[position()>1]/text()')

    image_url = response.xpath(
        '//div[@class="mainProductImage"]//img/@src').extract()
    if not image_url:
        thumbnails = response.xpath(
            '(//div[@class="thumbnail"])[2]//input[@type="image"]/@src'
        ).extract()
        # Slicing keeps this safe when the thumbnail is missing as well.
        image_url = [src.replace('XSmall', 'Large') for src in thumbnails[:1]]
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))

    loader.add_xpath(
        'brand',
        '(//td[contains(h5/text(), "Brand")])[1]/following-sibling::td[1]/span/text()'
    )
    if not response.xpath(
            '//div[@id="availDelTick"]//a[@class="BasketTickOn"]'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per size variation, plus printing/badge variants.

    Product data is read from JSON objects assigned to ``productNNNNNNN``
    style variables in the page scripts. Early exits:

    * an error URL (contains ``aspxerrorpath``) re-requests the first URL of
      the redirect chain;
    * a page without base product data is delegated to ``self.parse``;
    * a page offering a ``?cur=GBP`` link is re-requested through that link.

    NOTE(review): the original indentation was lost; the "Custom printings"
    and "Badges" sections are assumed to sit inside the variation loop
    because they read ``item``, ``size`` and ``variation`` -- confirm.
    """
    if 'aspxerrorpath' in response.url:
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product,
                      dont_filter=True)
        return
    # base_product stays True only when `data` is the base product itself.
    base_product = True
    add_custom_personalization = False
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('category', 'Kits')
    # Variables named "product" + 7 digits; "hero" naming comes from the
    # select.heroShirts element used further down.
    heros_data = response.xpath('//script/text()').re(
        'product\d{7} =(.+?});var')
    # Variables named "product" + exactly 6 word characters.
    base_product_data = response.xpath('//script/text()').re(
        'product\w{6} =(.+?});var')
    if not base_product_data:
        for p in self.parse(response):
            yield p
        return
    if not heros_data:
        data = json.loads(base_product_data[0])
    elif len(heros_data) == 1:
        data = json.loads(heros_data[0])
        base_product = False
    else:
        # Several candidates: index them by ProductID and pick the one
        # selected in the drop-down, falling back to the base product.
        data = [json.loads(x) for x in heros_data]
        data = {x['ProductID']: x for x in data}
        heros = response.css('select.heroShirts')
        hero = heros.xpath('option[@selected]')
        if not hero:
            data = json.loads(base_product_data[0])
        else:
            data = data[int(hero.xpath('@value').extract_first())]
            base_product = False
    base_product_data = json.loads(base_product_data[0])
    gbp_url = response.xpath(
        '//a[contains(@href, "?cur=GBP")]/@href').extract_first()
    if gbp_url:
        yield Request(response.urljoin(gbp_url),
                      self.parse_product,
                      dont_filter=True)
        return
    # Checking custom personalization: printing type 1 on the base product.
    printings = {
        p['PrintingTypeID']: p
        for p in base_product_data['printingitems']
    }
    custom_printings = printings.get(1)
    if custom_printings and base_product:
        add_custom_personalization = True
    loader.add_value('name', data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if data['Brand']:
        loader.add_value('brand', data['Brand']['Name'])
    loader.add_value('image_url', response.urljoin(data['ImageURL']))
    product = loader.load_item()
    # Player name and number are parsed from "with <name> <number> printing".
    player_from_name = re.search('with *([\w.\- ]+?) *(\d*|TBC) *printing',
                                 data['Description'], re.UNICODE)
    if player_from_name:
        player_name, number = player_from_name.groups()
    # Sizes: one item per variation.
    for variation in data['Variations']:
        size = variation['Description']
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(None, product)
        loader.replace_value('identifier', variation['VariationId'])
        loader.add_value('name', size)
        loader.replace_value('price', variation['PriceActual'])
        if variation['PriceActual'] < 75:
            loader.replace_value('shipping_cost', '4.95')
        loader.replace_value('stock', int(variation['IsInStock']))
        item = loader.load_item()
        if player_from_name:
            item['metadata'] = {
                'player': player_name,
                'number': number,
                'size': size
            }
        else:
            item['metadata'] = {'size': size}
        yield item
        # Custom printings: the size item plus a fixed name and number.
        if add_custom_personalization:
            team_player_name = 'WILLIAMS'
            team_player_number = '10'
            team_player_id = 'WILLIAMS'
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            loader.add_value('name', team_player_name)
            loader.add_value('name', team_player_number)
            price = Decimal(item['price']) + Decimal(
                str(custom_printings['PriceActual']))
            loader.replace_value('price', price)
            if price >= 75:
                loader.replace_value('shipping_cost', 0)
            identifier = '-'.join(
                (item['identifier'], str(custom_printings['PrintingID']),
                 team_player_id))
            loader.replace_value('identifier', identifier)
            custom_item = loader.load_item()
            custom_item['metadata'] = {
                'player': team_player_name,
                'number': team_player_number,
                'size': size
            }
            yield custom_item
        # Badges: printing type 3, priced on top of the variation price.
        printings = {
            p['PrintingTypeID']: p
            for p in base_product_data['printingitems']
        }
        printing = printings.get(3)
        if printing:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            loader.add_value('name', printing['PrintingDescription'])
            price = variation['PriceActual'] + printing['PriceActual']
            loader.replace_value('price', price)
            if price >= 75:
                loader.replace_value('shipping_cost', 0)
            identifier = str(variation['VariationId']) + '-' + str(
                printing['PrintingID'])
            loader.replace_value('identifier', identifier)
            # Rebinds `item` to the badge variant.
            item = loader.load_item()
            if player_from_name:
                item['metadata'] = {
                    'player': player_name,
                    'number': number,
                    'size': size
                }
            else:
                item['metadata'] = {'size': size}
            yield item
def parse(self, response):
    """Walk category and listing pages, yielding items and follow-ups.

    * Every category link is re-queued to this callback, with
      "/prl/results" appended unless the URL already contains it or
      contains "webapp".
    * For each listing row, a product found in ``self.cache_data`` is
      emitted directly with the live price and stock; any other product is
      requested through ``self.parse_product``. Either way its URL is
      removed from ``self.missing_urls``.
    * Pagination links are followed.
    * A page with neither categories nor listing rows is re-requested as a
      product page.
    """
    base_url = get_base_url(response)
    meta = response.meta.copy()

    categories_urls = response.xpath('//ul[@class="categoryList"]/li//a')
    for category in categories_urls:
        url = category.select('@href').extract()[0]
        name = category.select('text()').extract()[0].strip()
        if "/prl/results" not in url and 'webapp' not in url:
            url += "/prl/results"
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse,
                      meta={'category': name})

    # Fields copied verbatim (UTF-8 decoded) from the cache entry.
    cached_fields = ('name', 'url', 'sku', 'category', 'image_url', 'brand')

    products = response.xpath(
        '//table[@id="sProdList"]/tbody/tr[td[@class="productImage"]]')
    for product in products:
        try:
            identifier = product.select(
                './/a[@class="sku"]/text()').extract()[0].strip()
            stock = int(
                product.select(
                    './/td[@class="availability"]/input[@class="hVal"]/@value'
                ).extract()[0])
            price = round(
                Decimal(
                    product.css(
                        '.price input.hVal::attr(value)').extract()[0]), 2)
        except IndexError:
            # Rows lacking the sku link, availability or price are skipped.
            continue

        if identifier in self.cache_data:
            product_cached = self.cache_data[identifier]
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', identifier)
            for field in cached_fields:
                loader.add_value(field,
                                 product_cached[field].decode('utf-8'))
            loader.add_value('price', price)
            loader.add_value('stock', stock)
            item = loader.load_item()
            try:
                self.missing_urls.remove(item['url'])
            except ValueError:
                # The URL was not pending; nothing to clear.
                pass
            yield item
        else:
            url = product.select(
                './/a[@class="sku"]/@href').extract()[0].strip()
            url = url_query_cleaner(url)
            if url in self.missing_urls:
                self.missing_urls.remove(url)
            yield Request(url, callback=self.parse_product, meta=meta)

    pages = response.css('.pages .pageIt a::attr(href)').extract()
    for url in pages:
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse,
                      meta=meta)

    if not products and not categories_urls:
        yield Request(url_query_cleaner(response.url),
                      dont_filter=True,
                      callback=self.parse_product,
                      meta=meta)
def parse_product(self, response):
    """Queue variation pages and request the reviews for this product.

    The product item is not yielded here: it travels in the meta of the
    reviews request and is handled by ``self.parse_review_page``. A page
    without a ``productID`` is handed to ``self.retry``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    options = hxs.select(
        '//div[@class="variation-group"]//a/@href').extract()
    for option in options:
        option_url = urljoin_rfc(base_url, option)
        log.msg('INFO >>> OPTION FOUND: ' + option_url)
        yield Request(option_url, callback=self.parse_product)

    identifier = hxs.select(
        '//span[@itemprop="productID"]/text()').extract()
    if not identifier:
        request = self.retry(
            response,
            "ERROR >>> No identifier for product URL: " + response.url)
        if request:
            yield request
        return
    identifier = identifier[0]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    price = ''.join(
        hxs.select('//div[@class="item-price"]/text()').extract()).strip()
    loader.add_value('price', price)
    loader.add_xpath('name', '//div[@itemprop="name"]/h1/text()')

    image_url = hxs.select(
        '//meta[@property="og:image"]/@content').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])

    # Only the last three breadcrumb links are kept.
    categories = response.xpath(
        '//ol[@id="breadcrumb-list"]/li/a/text()').extract()[-3:]
    loader.add_value('category', categories)

    # NOTE(review): the brand is a fixed value; the page's
    # itemprop="brand" meta is not used -- confirm this is intended.
    loader.add_value('brand', 'Surfboard')

    sku = hxs.select('//span[@itemprop="model"]/text()').extract()
    sku = sku[0] if sku else ''
    loader.add_value('sku', sku)

    # NOTE(review): no stock value is set here, even for pages whose cart
    # button is in the SOLD_OUT_ONLINE state -- confirm this is intended.
    item = loader.load_item()
    item['metadata'] = {'reviews': []}

    reviews_url = 'http://bestbuy.ugc.bazaarvoice.com/3545w/%s/reviews.djs?format=embeddedhtml'
    yield Request(reviews_url % identifier,
                  meta={'product': item},
                  callback=self.parse_review_page)
def parse_products(self, response):
    """Request the brand page for every row of the product listing table.

    Each request carries a loader pre-filled with url, price, name, image,
    category and identifier under meta key ``'loader'``. Rows without any
    link are skipped. The category is the text after the first "=" of the
    listing URL; the identifier is the ``products_id`` number of the link.
    """
    base_url = get_base_url(response)
    rows = response.xpath('//table[@class="productListing"]/tr')

    for row in rows:
        loader = ProductLoader(item=Product(), selector=row)

        links = row.select('.//a/@href').extract()
        if not links:
            continue
        url = links[0]

        name = row.select('.//a[@class="boxtitle"]//text()').extract()[0]
        price = row.select('.//span[@class="boxprice"]/text()').extract()[0]
        image_src = row.select('.//img/@src').extract()[0]
        identifier = re.search(r'products_id=(\d+)', url).groups()[0]

        loader.add_value('url', url)
        loader.add_value('price', price)
        loader.add_value('name', name)
        loader.add_value('image_url', urljoin(base_url, image_src))
        loader.add_value('category', response.url.split('=')[1])
        loader.add_value('identifier', identifier)

        yield Request(url, meta={'loader': loader}, callback=self.parse_brand)
def parse_product(self, response):
    """Scrape a product page and yield one item with promotion metadata.

    The hidden ``productIdAnalytics`` input is used as both SKU and
    identifier. The brand is taken from the brand image ``alt`` text,
    falling back to the ``<h1>`` text, and is skipped when its first entry
    is made of digits only. The promotion comes from
    ``response.meta['promotional_data']``.
    """
    loader = ProductLoader(item=Product(), response=response)

    name_parts = response.xpath('//div[@itemprop="name"]/*//text()').extract()
    loader.add_value('name', ' '.join(name_parts))
    loader.add_value('url', response.url)

    images = response.xpath('//img[@class="left-image"]/@src').extract()
    if images:
        loader.add_value('image_url', response.urljoin(images[0]))

    price = response.xpath(
        '//div[@itemprop="offers"]/p[@class="box-price"]/b/text()').extract()
    if not price:
        price = response.xpath(
            '//div[@itemprop="offers"]/span[@itemprop="price"]/text()'
        ).extract()
    loader.add_value('price', price)

    brand = response.xpath('//img[@class="brand"]/@alt').extract()
    if not brand:
        brand = response.xpath('//div[@itemprop="name"]/h1/text()').extract()
    if brand and not brand[0].isdigit():
        loader.add_value('brand', brand)

    product_id = response.xpath(
        '//input[@type="hidden" and @name="productIdAnalytics"]/@value'
    ).extract()
    loader.add_value('sku', product_id)
    loader.add_value('identifier', product_id)

    item = loader.load_item()
    promo_meta = SpecSaversMeta()
    promo_meta['promotion'] = response.meta['promotional_data']
    item['metadata'] = promo_meta
    yield item
def parse_options(self, response):
    """Yield one SKU-level item built from a JSON option response.

    The item starts as a copy of ``response.meta['item']``; identifier,
    SKU, name, image, price and stock are then replaced from the JSON body
    and the ``productId`` / ``skuId`` query parameters. The size is appended
    to the name unless the SKU name already ends with it (with or without
    spaces). A shipping cost of 2.99 is added when the price is below 20.
    """
    data = json.loads(response.body)
    product_id = url_query_parameter(response.url, 'productId')
    sku = url_query_parameter(response.url, 'skuId')

    loader = ProductLoader(Product(), response=response)
    loader.add_value(None, response.meta['item'])
    loader.replace_value('identifier', '.'.join((product_id, sku)))
    loader.replace_value('sku', sku)
    loader.replace_value('name', data['skuName'])

    sku_name = data['skuName']
    size = data['size']
    already_suffixed = (sku_name.endswith(size)
                        or sku_name.endswith(size.replace(' ', '')))
    if not already_suffixed:
        loader.add_value('name', size)

    loader.replace_value('image_url', response.urljoin(data['thumbnail_url']))
    loader.replace_value('price', str(data['unit_sale_price']))
    loader.replace_value('stock', data['stock'])

    if Decimal(data['unit_sale_price']) < 20:
        loader.add_value('shipping_cost', '2.99')

    yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page and yield one item.

    Identifier and SKU both come from ``response.meta['id']``. Stock is 1
    when a span with the exact text "In Stock" exists, otherwise 0. A
    shipping cost of 4.95 is set when the loaded price is below 40.
    """
    hxs = HtmlXPathSelector(response)
    in_stock = hxs.select('//span[text()="In Stock"]')
    product_id = response.meta['id']

    loader = ProductLoader(selector=hxs, item=Product())
    loader.add_value('url', response.url)
    loader.add_xpath('name',
                     '//div[@id="BuyBoxArea"]//h1[@itemprop="name"]/text()')
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    loader.add_xpath('price', '//span[@itemprop="price"]/text()')
    if in_stock:
        loader.add_value('stock', 1)
    else:
        loader.add_value('stock', 0)
    loader.add_xpath('category',
                     '//div[@class="breadcrumb"]/a[position()>1]/text()')
    loader.add_xpath('brand', '//td[text()="Brand"]/../td[2]/text()')
    loader.add_xpath('image_url', '//img[@class="js-main-image"]/@src')

    product = loader.load_item()
    if product['price'] < 40:
        product['shipping_cost'] = 4.95
    yield product
def parse_product(self, response):
    """Follow linked products and yield this product or its SKU requests.

    Product data is the JSON assigned to ``window.universal_variable`` in a
    page script; pages without it only produce the link requests. When the
    page exposes more than one ``data-sku-id``, one request per SKU is sent
    to ``self.parse_options`` with a copy of the item, and the item itself
    is not yielded. Otherwise the item is yielded, with a shipping cost of
    2.99 when the price is below 20.
    """
    for link in self.products.extract_links(response):
        yield Request(link.url, self.parse_product)

    script = response.xpath(
        '//script/text()[contains(., "window.universal_variable")]'
    ).extract_first()
    if not script:
        return

    # Everything after the first "=" is treated as the JSON document.
    json_text = re.search('.+?=(.+)', script, re.DOTALL).group(1)
    pdata = json.loads(json_text)['product']

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', pdata['id'])
    loader.add_value('url', response.urljoin(pdata['url']))
    loader.add_value('name', pdata['name'])
    loader.add_value('name', pdata.get('size'))
    loader.add_value('price', str(pdata['unit_sale_price']))
    loader.add_value('sku', pdata['sku_code'])

    # Drop the first and last crumb, then keep at most the last three.
    crumbs = response.css('div.crumb').xpath(
        './/span[@itemprop="name"]/text()').extract()
    loader.add_value('category', crumbs[1:-1][-3:])

    loader.add_value('image_url', response.urljoin(pdata['thumbnail_url']))
    loader.add_value('stock', pdata['stock'])
    item = loader.load_item()

    options_url = 'http://www.hollandandbarrett.com/browse/json/selectSkuForPDP.jsp?skuId=%s&productId=%s'
    sku_ids = response.xpath('//@data-sku-id').extract()
    if len(sku_ids) > 1:
        for sku_id in sku_ids:
            yield Request(options_url % (sku_id, pdata['id']),
                          self.parse_options,
                          meta={'item': Product(item)})
        return

    if pdata['unit_sale_price'] < 20:
        item['shipping_cost'] = '2.99'
    yield item
def parse_product(self, response):
    """Yield one item per option row, or a zero-priced item without rows.

    The page-level identifier/SKU is the first run of four digits in the
    URL. Each option row overrides identifier and SKU with the ``name`` of
    its input, and takes its own price and title.
    """
    page_id = re.search(r'\d\d\d\d', response.url).group(0)

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', page_id)
    loader.add_value('sku', page_id)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')

    # Breadcrumb links after the first one, plus the trailing text crumb.
    crumbs = response.css('.bread li a::text').extract()[1:]
    crumbs += response.css('.bread li:last-child::text').extract()
    loader.add_value('category', crumbs)

    image_href = response.css('.detimg a::attr(href)').extract_first()
    if image_href:
        loader.add_value('image_url', response.urljoin(image_href))

    item = loader.load_item()

    option_rows = response.css('.tbl').xpath('.//*[@class="tr"]')
    if not option_rows:
        item['price'] = 0
        yield item
        return

    for row in option_rows:
        row_loader = ProductLoader(Product(), selector=row)
        row_loader.add_value(None, item)
        row_id = row.xpath('.//input/@name').extract_first()
        row_loader.replace_value('identifier', row_id)
        row_loader.replace_value('sku', row_id)
        # The "now" price goes first; the plain cell text is appended after.
        row_loader.replace_css('price', '.tc-price .pr-now::text')
        row_loader.add_css('price', '.tc-price::text')
        row_loader.replace_css('name', '.tc-title::text')
        yield row_loader.load_item()
def parse_product(self, response):
    """Yield the product of this page and requests for related pages.

    Product data is the ``utag_data`` JSON object found in a page script.
    Pages without it, or whose data has no ``product_name``, yield nothing.
    After the item, every option link is re-queued to this callback and the
    children of the parent product are requested through ``self.parse_php``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    try:
        data = json.loads(
            hxs.select('//script[@type="text/javascript"]/text()').re(
                'var utag_data = ({.+})')[0])
    except IndexError:
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    try:
        loader.add_value('name', data['product_name'])
    except KeyError:
        return
    loader.add_value('identifier', data['product_id'])
    loader.add_value('sku', data['product_id'])
    loader.add_value('brand', data['product_attribute_trademark'])
    loader.add_value('url', urljoin(base_url, data['internal_url']))
    # NOTE(review): assumes both values are numeric; if they are strings
    # this concatenates instead of adding -- confirm against the page data.
    loader.add_value('price',
                     data['product_price'][0] + data['product_taxes'][0])
    categories = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()[1:]
    loader.add_value('category', categories)
    loader.add_xpath('image_url', '//img[@id="image"]/@src')

    item = loader.load_item()
    if item['price'] < 75:
        item['shipping_cost'] = 7.50
    yield item

    for url in hxs.select(
            '//div[@id="slice_options"]//a/@href[.!="#"]').extract():
        yield Request(urljoin(base_url, url), callback=self.parse_product)

    # The query string needs literal "&params[...]" pairs; "&para" had been
    # collapsed into a pilcrow character, which broke both parameters.
    yield Request(
        'http://www.fashionforhome.de/static/s.php?channel=child&limit=199&single_item_type=k&chunk_type=big&params[product_id]=%s&params[lazy]=1'
        % data.get("parent_product_id", [''])[0],
        callback=self.parse_php)
def parse_product(self, response):
    """Scrape a product page; yield the item only when its price is > 0.

    The price is built from the non-blank text nodes of the regular-price
    span, skipping the first one. Shipping cost is always 0. The image is
    the first gallery image whose ``src`` is shorter than 1024 characters.
    """
    hxs = HtmlXPathSelector(response)

    price_texts = hxs.select(
        "//span[@class='regular-price']//text()").extract()
    # A list comprehension can be sliced on both Python 2 and Python 3,
    # unlike the result of filter() on Python 3.
    price = [text for text in price_texts if text.strip()][1:]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
    loader.add_xpath(
        'category',
        "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()"
    )
    brand = hxs.select(
        "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
    ).extract()
    loader.add_value('brand', brand)
    loader.add_value('shipping_cost', 0)
    loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
    loader.add_xpath(
        'identifier',
        "//div[@class='product-view']//input[@name='product']/@value")

    image_urls = hxs.select(
        '//img[contains(@class, "gallery-image")]/@src').extract()
    for image_url in image_urls:
        if len(image_url) < 1024:
            loader.add_value('image_url', image_url)
            break

    product = loader.load_item()
    if product['price'] > 0:
        yield product
def parse_products(self, response):
    """Walk a listing page, yielding items, follow-ups and detail requests.

    Manufacturer filter links and the next page are re-queued to this
    callback. Identifiers come from the ``ecomm_prodid`` list in a page
    script and are matched to the ``.grid`` tiles by position; a page
    without that list yields no products. A tile with a price is yielded
    directly; a tile without one is requested through ``self.parse_product``
    with a copy of the partial item in the meta.
    """
    manufacturer_links = response.css(
        '.leftoption :contains("Filter by Manufacturers")').xpath(
            'following-sibling::*//a/@href').extract()
    for link in manufacturer_links:
        yield Request(response.urljoin(link), callback=self.parse_products)

    # The body is patched before parsing: "Estimated" followed by spaces
    # and "<" is normalised to a single space.
    patched_body = re.sub('Estimated *<', 'Estimated <', response.body)
    selector = Selector(text=patched_body)

    category = selector.css('.crumword').xpath(
        './/*[@itemprop="title"]/text()').extract()

    id_lists = selector.xpath('//script/text()').re(
        r'ecomm_prodid: *\[(.+)\]')
    if not id_lists:
        return
    identifiers = id_lists[0].replace("'", '').split(',')

    next_page_url = response.xpath(
        '//div[@class="pagination"]/a[@class="next"]/@href').extract()
    if next_page_url:
        yield Request(response.urljoin(next_page_url[0]),
                      callback=self.parse_products)

    for position, tile in enumerate(selector.css('.grid')):
        identifier = identifiers[position]
        tile_url = tile.xpath('@href').extract_first()

        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.urljoin(tile_url))
        loader.add_value('name',
                         tile.css('.gridname').xpath('text()').extract())

        price = tile.css('.gridPriceVat').xpath('text()').extract()
        if not price:
            price = 0
        loader.add_value('price', price)
        loader.add_value('sku', identifier)
        loader.add_value('category', category)
        loader.add_value('image_url',
                         tile.css('.gridimage').xpath('.//@src').extract())

        if price and loader.get_output_value('price') < 200:
            loader.add_value('shipping_cost', '4.99')

        stock_text = tile.css('.pItemStock').xpath(
            'text()').extract_first().strip().lower()
        if 'in stock' not in stock_text:
            loader.add_value('stock', 0)

        item = loader.load_item()
        if not price:
            yield Request(response.urljoin(tile_url),
                          self.parse_product,
                          meta={'product': Product(item)})
        else:
            yield item
def parse_product(self, response):
    """Yield the product, or one item per variation when variations exist.

    Variations are read from the JSON in the form's
    ``data-product_variations`` attribute. Each variation keeps the base
    item's fields, appends the option text of every attribute value to the
    name, and replaces price and identifier. The brand comes from
    ``response.meta['brand']``.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('category',
                     response.css('.breadcrumb a::text').extract()[1:])
    loader.add_value('brand', response.meta['brand'])
    loader.add_xpath('image_url', '//div/@data-original-img')
    loader.add_value(
        'identifier',
        response.xpath('//div[@itemscope]/@id').re('product-(.+)'))
    product = loader.load_item()

    if not response.css('.variations'):
        yield product
        return

    raw_variations = response.xpath(
        '//form/@data-product_variations').extract_first()
    for variation in json.loads(raw_variations):
        variation_loader = ProductLoader(item=Product(product),
                                         response=response)
        attribute_values = variation['attributes'].values()
        variation_loader.replace_value('name', product['name'])
        for value in attribute_values:
            variation_loader.add_xpath(
                'name', '//option[@value="%s"]/text()' % value)
        variation_loader.replace_value('price', variation['display_price'])
        variation_loader.replace_value('identifier',
                                       variation['variation_id'])
        yield variation_loader.load_item()
def parse(self, response):
    """Yield one product per ``<item>`` of a Google Base (``g:``) feed.

    ``g:id`` is used as both identifier and SKU. The category is every
    ">"-separated part of ``g:product_type`` after the first one. ``stock``
    is only set (to 0) when availability is exactly "out of stock".
    """
    response.selector.register_namespace("g",
                                         "http://base.google.com/ns/1.0")

    for entry in response.xpath('//item'):
        images = entry.xpath('g:image_link/text()').extract()
        if images:
            image_url = images[0]
        else:
            image_url = ''

        product_types = entry.xpath('g:product_type/text()').extract()
        if product_types:
            category = product_types[0].split('>')[1:]
        else:
            category = ''

        brand = entry.xpath('g:brand/text()').extract()
        identifier = entry.xpath('g:id/text()').extract()

        name = entry.xpath('title/text()').extract_first()
        if name:
            name = name.replace('...', '').strip()

        prices = entry.xpath('g:price/text()').extract()
        if prices:
            price = extract_price(prices[0])
        else:
            price = 0

        url = entry.xpath('link/text()').extract()[0]
        availability = entry.xpath('g:availability/text()').extract()[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        product_loader.add_value('name', name)
        product_loader.add_value('image_url', image_url)
        product_loader.add_value('price', price)
        product_loader.add_value('url', url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        if availability == 'out of stock':
            product_loader.add_value('stock', 0)

        yield product_loader.load_item()
def parse_product(self, response):
    """Scrape a product page and yield one item.

    Pages without an ``itemsArray`` input yield nothing. The category comes
    from ``response.meta['category']`` when present, otherwise from the
    second-to-last breadcrumb link. Price and stock default to 0 when they
    cannot be read from the page.

    Raises ``IndexError`` if the page has no ``itemprop="mpn"`` text.
    """
    hxs = HtmlXPathSelector(response)

    brand = response.xpath(
        '//span[@itemprop="http://schema.org/manufacturer"]/text()'
    ).extract_first() or response.xpath(
        '//span[@itemprop="http://schema.org/brand"]/text()'
    ).extract_first()

    identifier = hxs.select('//input[@id="itemsArray"]/@value').extract()
    if not identifier:
        return
    sku = response.xpath(
        '//*[@itemprop="mpn"]/text()').extract()[0].strip()

    product_loader = ProductLoader(item=Product(), selector=hxs)

    image_url = response.css(
        'img#productMainImage::attr(src)').extract_first()
    if image_url:
        product_loader.add_value('image_url', response.urljoin(image_url))

    category = response.meta.get('category', '')
    if not category:
        category = hxs.select(
            '//div[@id="breadcrumb"]/ul/li/a/text()').extract()[-2].strip()
    product_loader.add_value('category', category)

    # The name is the list of whitespace-separated words of the heading.
    product_name = response.xpath('//div[@id="product"]//h1//text()').re(
        r'\S+')
    product_loader.add_value('name', product_name)

    # NOTE(review): this XPath is relative ("link[...]", no leading "//"),
    # so it likely never matches and the URL always falls back to
    # response.url -- confirm before changing, as it would alter item URLs.
    product_loader.add_xpath('url', 'link[@rel="canonical"]/@href')
    product_loader.add_value('url', response.url)
    product_loader.add_value('identifier', identifier.pop())
    product_loader.add_value('brand', brand)
    product_loader.add_value('sku', sku)

    # Price text: first pricing cell, cut at "(" and stripped of the pound
    # sign.
    price = ''.join(
        hxs.select(
            '//table[contains(@class, "pricing")]//td[@class="threeColTd"][1]/text()'
        ).extract()).strip().split('(')[0].strip().replace(u'\xa3', '')
    if price:
        price = extract_price(price)
        price = price.quantize(Decimal('.01'))
        product_loader.add_value('price', price)
    else:
        product_loader.add_value('price', 0)

    # Stock is the first number found in the availability text.
    stock = response.css('span.availability::text').re(r'\d+')
    if stock:
        product_loader.add_value('stock', stock[0])
    else:
        product_loader.add_value('stock', 0)

    yield product_loader.load_item()
def parse_products(self, response):
    """Extract door products from a listing page, trying two page layouts.

    Pass 1 walks table rows containing "Code:" and the size rows that follow
    each of them.  If any priced product is produced, the method stops there.
    Otherwise it follows ``<a>`` links that wrap an image and end in "html",
    then tries pass 2: the ``Pricegridlabel`` grid, one column per option and
    one row per size.  When the grid, or any SKU inside it, is missing, the
    work is delegated to ``self.parse_frames``.

    Changes compared with the earlier implementation:

    * Identifier de-duplication in pass 1 now appends ``-d`` in a ``while``
      loop, as pass 2 does.  A single ``if`` produced the same ``...-d``
      identifier again on a third collision.
    * A pass-1 item with no image found no longer raises ``IndexError``; the
      ``image_url`` field is left unset, matching the guard in pass 2.
    """
    try:
        base_url = get_base_url(response)
    except AttributeError:
        return

    treatment_marker = response.xpath(
        '//font[contains(text(), "Recommended Door Treatment")]')
    if treatment_marker and not self.treatment:
        for treatment in self.parse_treatment(response):
            yield treatment

    # Last URL path segment, cut at the first "_" and "."; used as the
    # identifier suffix in both passes.
    page_key = response.url.split('/')[-1].split('_')[0].split('.')[0]

    # ---- Pass 1: "Code:" header rows followed by size rows ----
    identifiers = []
    price_found = False
    code_rows = response.xpath(
        '//td[@bgcolor="#E5E5E5"]//table/tr[contains(., "Code:")]'
    ) or response.xpath(
        '//td[@bgcolor="#FFFFFF"]//table/tr[contains(., "Code:")]')
    for product in code_rows:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', response.url)
        product_name = ' '.join(product.xpath('.//text()').re(r'\S+'))
        sku = re.findall('Code: *([^:]+)$', product_name)
        if not sku:
            self.log('No SKU found for %s on %s' %
                     (product_name, response.url))
            continue
        sku = sku[0].replace(' ', '')
        loader.add_value('sku', sku)
        item = loader.load_item()

        # The price is carried from row to row: a row without its own price
        # reuses the most recent one seen under this header.
        price = ''
        for size in product.xpath('./following-sibling::tr'):
            if size.xpath('.//*[contains(.//text(), "Code:")]'):
                # Next product header reached.
                break
            if not size.xpath('./td[contains(.//text(), " x")]'):
                # Not a size row; it may still carry a price.
                try:
                    price = size.xpath('td[3]//td/text()').extract()[-1]
                except IndexError:
                    pass
                continue
            size_name = size.xpath('td[1]//text()').extract()
            if not size_name:
                continue

            loader = ProductLoader(item=Product(item), selector=size)
            loader.add_value('name', product_name)
            loader.add_value('name', size_name)
            if size.xpath('td[3]'):
                try:
                    price = size.xpath('td[3]//td/text()').extract()[-1]
                except IndexError:
                    pass
            if not price:
                continue
            price_found = True
            loader.add_value('price', price)

            identifier = sku + '-' + '-'.join(
                re.findall(r'\d+', size_name[0]))
            identifier += '-' + page_key
            # Avoid duplicated identifiers; loop until the value is unique.
            while identifier in identifiers or identifier in self.ids_seen:
                identifier += '-d'
            identifiers.append(identifier)
            self.ids_seen.append(identifier)
            loader.add_value('identifier', identifier)
            final_item = loader.load_item()

            # Page-level image first; replaced by the image next to a
            # "doorname" element whose text occurs in the item name.
            image_url = response.xpath(
                '//*[contains(text(), "Click on")]/../../..//img/@src'
            ).extract() or response.xpath(
                '//td[@bgcolor="#E5E5E5"]//img/@src').extract()
            for image in response.xpath('//*[@class="doorname"]'):
                image_name = image.xpath('font/text()').extract()
                if image_name and image_name[0].strip() in final_item['name']:
                    image_url = image.xpath(
                        './../p[2]//img/@src'
                    ).extract() or image.xpath(
                        './../../p[2]//img/@src').extract()
                    if image_url:
                        break
            if image_url:
                final_item['image_url'] = urljoin(base_url, image_url[0])
            # NOTE(review): assumes load_item() returns the loader's own item
            # (stock Scrapy behaviour), so this is what the second
            # load_item() call used to yield -- confirm for ProductLoader.
            yield final_item

    if price_found:
        return

    for url in response.xpath('//a[img]/@href').extract():
        if url.endswith('html'):
            yield Request(urljoin(base_url, url),
                          callback=self.parse_products)

    # ---- Pass 2: price grid, one column per option ----
    try:
        product = response.xpath('//td[@class="Pricegridlabel"]')[0]
    except IndexError:
        for product in self.parse_frames(response):
            yield product
        return

    identifiers = []
    name = ' '.join(
        product.xpath('./following-sibling::td[1]//text()').extract())
    image_url = response.xpath(
        '//*[contains(text(), "Click on")]/../preceding-sibling::*[1]//img/@src'
    ).extract() or response.xpath(
        '//img[contains(@alt, "door")]/@src').extract()
    found_sku = False
    for i, option in enumerate(
            product.xpath('./../following-sibling::tr[1]/td')):
        option_name = ' '.join(option.xpath('.//text()').extract())
        code = ''.join(
            option.xpath('./../following-sibling::tr[1]/td[%d]//text()' %
                         (i + 1)).extract())
        sku = ''.join(re.findall('CODE: *([^: ]+)$', code))
        if not sku:
            continue
        found_sku = True
        for size in option.xpath('./../following-sibling::tr'):
            if not size.xpath('./td[1][contains(.//text(), " x")]'):
                continue
            size_name = size.xpath('td[1]//text()').extract()
            loader = ProductLoader(item=Product(), selector=size)
            loader.add_value('name', (name, option_name))
            loader.add_value('name', size_name)
            loader.add_value('sku', sku)
            identifier = sku + '-' + '-'.join(
                re.findall(r'\d+', size_name[0]))
            identifier += '-' + page_key
            # Avoid duplicated identifiers.
            while identifier in identifiers or identifier in self.ids_seen:
                identifier += '-d'
            identifiers.append(identifier)
            self.ids_seen.append(identifier)
            loader.add_value('identifier', identifier)
            # Option column i has its price in cell i + 2 of the size row.
            loader.add_xpath('price', 'td[%d]//text()' % (i + 2))
            if image_url:
                loader.add_value('image_url',
                                 urljoin(base_url, image_url[0]))
            loader.add_value('url', response.url)
            yield loader.load_item()

    if not found_sku:
        for product in self.parse_frames(response):
            yield product
def parse_product(self, response):
    """Yield the product on this page, or one product per selectable option.

    A base item is loaded from the page-level inputs.  When the
    ``js-simple-selector`` dropdown has options, each option yields a copy of
    the base item with identifier, price, image and stock replaced and the
    option text appended to the name.  Without options, the base item is
    yielded only if it has an identifier.
    """
    breadcrumb_texts = response.xpath(
        '//li[@class="blockBreadcrumb__item"]/a/text()').extract()
    categories = breadcrumb_texts[-3:]

    base_loader = ProductLoader(item=Product(), response=response)
    base_loader.add_xpath('identifier', '//input[@name="simpleSku"]/@value')
    base_loader.add_xpath('sku', '//input[@id="configSku"]/@value')
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//h1[contains(@class, "__heading")]/text()')
    base_loader.add_xpath('name', '//input[@name="simpleSku"]/../span/text()')
    base_loader.add_xpath('image_url', '//div[@class="layoutImage"]//img/@src')
    base_loader.add_xpath('price', '//input[@id="price"]/@value')
    base_loader.add_xpath('brand', '//input[@id="brand"]/@value')
    base_loader.add_value('category', categories)
    base_loader.add_xpath('stock', '//@data-instock')
    base_item = base_loader.load_item()

    option_nodes = response.xpath('//select[@id="js-simple-selector"]/option')
    if option_nodes:
        for node in option_nodes:
            option_loader = ProductLoader(item=Product(base_item),
                                          selector=node)
            option_loader.replace_xpath('identifier', './@value')
            option_loader.add_xpath('name', './text()')
            option_id = option_loader.get_output_value('identifier')

            # Per-option price and image are looked up by the option's id.
            option_price = response.xpath(
                '//div[@data-simple-sku="%s"]//span[contains(@class, "actualPrice")]/text()'
                % option_id).extract()
            option_loader.replace_value('price', option_price)

            option_image = response.xpath(
                '//div[@data-simple-sku="%s"]/a[contains(@class, "link_selected")]/@data-product-image'
                % option_id).extract()
            option_loader.replace_value('image_url', option_image)

            option_loader.replace_xpath('stock', './@data-instock')
            yield option_loader.load_item()
    elif base_loader.get_output_value('identifier'):
        yield base_item
def parse(self, response):
    """Yield one product per row of the CSV document in ``response.body``.

    Columns read: ``ID`` (optional), ``Supplier``, ``lens-name`` and the
    column named by ``self.price_field``.  The supplier is used as both brand
    and category, and the URL is always the empty string.
    """
    for row in csv.DictReader(StringIO(response.body)):
        product_id = row.get('ID')
        supplier = row['Supplier'].decode('utf-8')
        lens_name = row['lens-name'].decode('utf-8')

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', lens_name)
        loader.add_value('identifier', product_id)
        loader.add_value('url', '')
        loader.add_value('brand', supplier)
        loader.add_value('category', supplier)
        loader.add_value('price', row[self.price_field])
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product on this page, expanded per configurable option.

    A base item is built from the page (identifier suffixed with ``-new``).
    If the body contains a ``var spConfig = new Product.Config(...)`` call,
    its JSON argument is used to yield one deep copy of the base item per
    product id listed there: the option labels are appended to the name and
    the option prices are summed onto ``basePrice``.  Otherwise the base item
    itself is yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)

    name = response.css('.product-name').xpath('h1/text()').extract_first()
    loader.add_value('name', name)
    loader.add_value('url', response.url)

    # Brand is the first entry of self.brands that the lowercased name
    # starts with.
    lowered_name = name.lower()
    for known_brand in self.brands:
        if lowered_name.startswith(known_brand):
            loader.add_value('brand', known_brand.title())
            break

    breadcrumb = response.css('.breadcrumbs').xpath(
        './/a/span/text()').extract()
    loader.add_value('category', breadcrumb[1:])

    sku_texts = hxs.select(
        '//*[@id="product_addtocart_form"]//div[@class="expert-notes "]//span[contains(text(), "SKU: ")]/text()'
    ).extract()
    loader.add_value(
        'sku', sku_texts[0].replace("SKU: ", '') if sku_texts else '')

    product_ids = hxs.select('//input[@name="product"]/@value').extract()
    loader.add_value('identifier', product_ids[0] + '-new')

    images = hxs.select('//img[@id="image-main"]/@src').extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

    script_prices = response.xpath('//script/text()').re('price":"(.+?)"')
    loader.add_value(
        'price', extract_price(script_prices[0]) if script_prices else 0)

    # Either "In stock" or "Back Order" counts as available.
    available = hxs.select(
        '//div[@class="availability in-stock"]//div[@class="value" and contains(text(), "In stock")]'
    ) or hxs.select(
        '//p[@class="availability back-order"]//span[@class="value" and contains(text(), "Back Order")]'
    )
    if not available:
        loader.add_value('stock', 0)

    if loader.get_output_value('price') < 100:
        loader.add_value('shipping_cost', 6.50)

    item = loader.load_item()

    config_match = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if not config_match:
        yield item
        return

    product_data = json.loads(config_match.group(1))
    name_suffixes = {}
    price_extras = {}
    for attribute in product_data['attributes'].itervalues():
        for choice in attribute['options']:
            for product_id in choice['products']:
                # Every label is prefixed with ' - ', including the first.
                name_suffixes[product_id] = ' - '.join(
                    (name_suffixes.get(product_id, ''), choice['label']))
                price_extras[product_id] = (
                    price_extras.get(product_id, 0) +
                    extract_price(choice['price']))

    base_price = extract_price(product_data['basePrice'])
    for product_id, suffix in name_suffixes.iteritems():
        variant = deepcopy(item)
        variant['identifier'] += '-' + product_id
        variant['name'] += suffix
        variant['price'] = base_price + price_extras[product_id]
        yield variant
def parse_product(self, response):
    """Build one product per attribute combination and request stock data.

    The item number (with every "a"/"A" removed) is the base identifier; the
    unmodified item number is the SKU.  The price is multiplied by
    ``self.exchange_rate``.  Each combination from the cartesian product of
    the page's attribute values becomes an entry in ``items``.  Nothing is
    yielded directly except a single ``InventoryCheck.json`` request that
    carries ``items`` in ``meta`` and is handled by ``self.parse_stock``.

    The former ``if not options:`` fallback was removed because it could
    never run: ``options`` was an ``itertools.product`` object, which is
    always truthy.  A page with no attributes is already covered by the main
    loop, since ``itertools.product()`` with no arguments yields exactly one
    empty tuple, giving one product with the bare name and identifier and
    empty ``attributes``.
    """
    identifier = response.xpath(
        "//div[@class='item-number']/text()").extract_first()
    sku = identifier
    identifier = re.sub(u'a', u'', identifier, flags=re.IGNORECASE)
    name = response.xpath(
        "//div[@class='product-title']/h1/text()").extract_first()

    # Discounted price first, regular price as fallback, else 0.00.
    price = response.xpath(
        "//div[@class='price']//span[@class='disc-price']/text()").extract()
    if not price:
        price = response.xpath(
            "//div[@class='price']/div[@class='regular-price']/span[@class]/text()"
        ).extract()
    if price:
        price = price[0].strip('$').replace(",", "")
    else:
        price = '0.00'
    # convert using xe.com
    price = Decimal(price) * self.exchange_rate

    image_url = response.xpath(
        "//a[@id='mainImage']/img/@src").extract_first()
    categories = response.xpath(
        '//div[@id="breadcrumbs-"]/ul/li/a//text()')[1:-1].extract()
    try:
        brand = response.xpath(
            '//b[contains(., "BRAND:")]/following-sibling::text()[1]'
        ).extract_first().title()
    except AttributeError:
        # extract_first() returned None: no brand on the page.
        brand = ''

    # Map option value -> display name for the first attribute dropdown.
    option_names = {}
    for option in response.xpath(
            '//select[@name="attrValue_1"]/option[@value!=""]'):
        opt_val = option.xpath('./@value').extract()
        opt_name = option.xpath('./span/text()').extract()
        if opt_val and opt_name:
            option_names[opt_val[0]] = opt_name[0]

    # One list of (attribute name, value) pairs per attribute.
    options = []
    for attr in response.xpath('//fieldset[@class="attributes"]//li'):
        attr_name = attr.xpath(
            './/input[@name="attrName_1"]/@value').extract()
        if not attr_name:
            continue
        attr_name = attr_name[0]
        attr_values = attr.xpath(
            './/select/option[@value!=""]/@value').extract()
        attr_options = [(attr_name, value) for value in attr_values]
        if not attr_values:
            # No dropdown: fall back to the attribute's single input value.
            attr_value = attr.xpath(
                './/input[@name="attrValue_1"]/@value')[0].extract()
            attr_options.append((attr_name, attr_value))
        if attr_options:
            options.append(attr_options)

    items = []
    for option in itertools.product(*options):
        opt = [option_names.get(v, '') for _, v in option]
        opt = [o for o in opt if o]
        option_name = ' '.join(opt).strip()
        opt = [SIZES_DICT.get(o.lower(), o) for o in opt]
        option_id = ':'.join(opt).strip()
        option_name = re.sub(
            'size', '', option_name, flags=re.IGNORECASE).strip()
        # Size is the display name of the last attribute's value.
        size = (option_names.get(option[-1][-1], '')
                if option and option[-1] else '')
        size = re.sub('size', '', size, flags=re.IGNORECASE).strip()

        if option_name:
            product_name = name + ' (' + option_name + ')'
        else:
            product_name = name
        if option_id:
            product_identifier = (
                identifier + u':' + option_id.strip().lower())
        else:
            product_identifier = identifier

        # NOTE(review): the option tuple is passed as the loader's second
        # positional argument, unchanged from before -- confirm intent.
        loader = ProductLoader(Product(), option)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', sku)
        loader.add_value('price', price)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        for category in categories:
            loader.add_value('category', category)
        product = loader.load_item()

        product['metadata'] = {'size': size}
        player = [p for p in self.players
                  if p[1].lower() in product_name.lower()]
        if player:
            product['metadata']['player'] = player[0][1].title()
            product['metadata']['number'] = player[0][2]

        if len(self.shipping_requests) < 5:
            self.make_shipping_request(response)

        # option is already a tuple of (name, value) pairs.
        items.append({'item': product, 'attributes': tuple(option)})

    product_id = response.xpath(
        '//input[@name="productId"]/@value')[0].extract()
    yield Request(
        'http://www.foxsoccershop.com/InventoryCheck.json?productId={}'.format(
            product_id),
        meta={'items': items},
        callback=self.parse_stock)
def parse_product(self, response):
    """Yield one product per row of the page's ``product_price_list`` table.

    A base item (url, name, brand, identifier/sku, category, image) is loaded
    first; every non-heading table row then yields a copy with its own name,
    identifier, sku, price, stock and shipping cost.  When the table has no
    such rows, an error is logged and nothing is yielded.

    Changes compared with the earlier implementation:

    * A page with neither a ``products_id`` input nor a
      ``custom_product_id=<digits>`` match in the body is logged and skipped
      instead of raising ``IndexError``.
    * The unused ``base_url`` local, the commented-out shipping-cost code and
      the unterminated triple-quote that trailed the function were removed.
    """
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)

    name = ''.join(
        response.xpath(
            '//h1[@class="PrpdocutName"]//text()').extract()).strip()
    product_loader.add_value('name', name)

    brand = response.xpath(
        '//span[@class="parent_product_manufacture_logo"]/img/@alt'
    ).extract()
    brand = brand[0].strip() if brand else ''
    product_loader.add_value('brand', brand)

    identifier = response.xpath(
        '//input[@name="products_id"]/@value').extract()
    if not identifier:
        identifier = re.findall(r'custom_product_id=(\d+)', response.body)
    if not identifier:
        log.msg(' >>>>> ERROR: NO IDENTIFIER ' + response.url)
        return
    product_loader.add_value('identifier', identifier[0])
    product_loader.add_value('sku', identifier[0])

    category = response.xpath(
        '//div[@class="breadcrumb"]//span[@itemprop="title"]/text()'
    ).extract()[1:-1]
    product_loader.add_value('category', category)

    image_url = response.xpath(
        '//span[@class="image_container"]/img/@src').extract()
    if image_url:
        image_url = response.urljoin(image_url[0])
        product_loader.add_value('image_url', image_url)

    product = product_loader.load_item()

    options = response.xpath(
        '//table[@id="product_price_list"]//tr[not(contains(@class, "HeadingRow"))]'
    )
    if not options:
        log.msg(' >>>>> ERROR: NO OPTIONS' + response.url)
        return

    for option in options:
        prod = Product(product)
        product_loader = ProductLoader(item=prod, response=response)

        option_name = option.xpath(
            'td/div[@class="subproduct_name"]/text()').extract()
        if option_name:
            option_name = name + ' ' + option_name[0].strip()
        product_loader.add_value('name', option_name)

        # Try the three inputs that may hold the row's product id, in order.
        identifier = option.xpath(
            './/input[@name="sub_products_id[]"]/@value').extract()
        if not identifier:
            identifier = option.xpath(
                './/input[@name="email_me_products_id"]/@value').extract()
        if not identifier:
            identifier = option.xpath(
                './/input[@name="products_id"]/@value').extract()
        if identifier:
            product_loader.add_value(
                'identifier', product['identifier'] + '-' + identifier[0])
        else:
            log.msg(' >>>>>> Possible wrong identifier: ' + response.url)

        # The sku mirrors whatever identifier the loader currently outputs.
        sku = product_loader.get_output_value('identifier')
        product_loader.add_value('sku', sku)

        # Special price first, listing price as fallback, else 0.
        price = option.xpath(
            './/span[@class="productSpecialPrice"]/text()').extract()
        if not price:
            price = option.xpath(
                './/span[@class="listing-price"]/text()').extract()
        price = price[0] if price else 0
        product_loader.add_value('price', price)

        in_stock = option.xpath(
            './/span[@class="instock" and text()="In Stock"]').extract()
        if not in_stock or not product_loader.get_output_value('price'):
            product_loader.add_value('stock', 0)

        if product_loader.get_output_value('price') < 70:
            product_loader.add_value('shipping_cost', Decimal('9.90'))

        yield product_loader.load_item()
def parse_product(self, response):
    """Load the product on this page and store it in ``self.products``.

    Nothing is yielded or returned: the loaded item is appended to
    ``self.products[<sku>]``.  Pages with neither an ``<h1>`` nor an
    ``<h2 itemprop="name">`` text are ignored.  A shipping cost found in
    ``self.shipping_costs`` for the sku is added before the fixed ``19.99``
    value.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)  # not used below; call kept as-is
    loader = ProductLoader(item=Product(), response=response)

    loader.add_xpath('sku', '//script/@data-flix-sku')
    known_cost = self.shipping_costs.get(loader.get_output_value('sku'))
    if known_cost:
        loader.add_value('shipping_cost', extract_price(known_cost))

    loader.add_xpath('identifier', '//input[contains(@id, "SKUID")]/@value')

    titles = response.xpath('//h1/text()').extract()
    if not titles:
        titles = response.xpath('//h2[@itemprop="name"]/text()').extract()
    if not titles:
        return
    name = titles[0]
    loader.add_value('name', name)

    loader.add_xpath('price', '//span[@class="TotalPrice"]/text()')

    breadcrumb = response.xpath(
        '//a[@class="CMSBreadCrumbsLink"]/text()').extract()
    loader.add_value('category', breadcrumb or '')

    # Brand is the first listed brand that the title-cased name starts with,
    # or '' when none matches.
    titled_name = name.title()
    listed_brands = hxs.select(
        '//div[@title="Brand"]/following-sibling::div//span/@title'
    ).extract()
    brand = next(
        (candidate for candidate in listed_brands
         if titled_name.startswith(candidate.title())),
        '')
    loader.add_value('brand', brand)

    loader.add_value('shipping_cost', 19.99)

    stock_texts = hxs.select(
        '//span[@class="stock available"]/text()').extract()
    if 'In stock' not in stock_texts:
        loader.add_value('stock', 0)

    product = loader.load_item()
    self.products[product['sku']].append(product)