def parse_product(self, response):
    """Build one product item from a product page.

    Values are taken from the structured data returned by ``SpiderSchema``
    when it has them, with XPath fallbacks for price, availability, name and
    image. Nothing is yielded when the page has no ``product`` form input.

    Shipping is ``'0'`` for gift cards (detected through the gift-card image)
    and for prices at or above ``self.free_shipping_over``; otherwise it is
    ``'49.99'``.
    """
    # A page without structured product data is treated as an empty mapping
    # so that the lookups below fall through to the HTML fallbacks instead of
    # raising on ``None``.
    pdata = SpiderSchema(response).get_product() or {}
    is_giftcard = False

    pid = response.xpath('//input[@name="product"]/@value').extract()
    if not pid:
        return
    pid = pid[0]

    # Only lookup failures trigger a fallback; anything else should surface.
    lookup_errors = (KeyError, TypeError, IndexError)

    try:
        price = pdata['offers']['properties']['price']
    except lookup_errors:
        price = extract_price_eu(response.xpath(
            '//*[contains(@id, "product-price-")]/text()').re(r'[\d\.,]+')[0])

    try:
        out_of_stock = ('Varen er ikke'
                        in pdata['offers']['properties']['availability'])
    except lookup_errors:
        out_of_stock = bool(response.xpath(
            '//*[contains(@class, "availability") '
            'and contains(@class, "out-of-stock")]'))

    if pdata:
        pname = pdata['name']
    else:
        pname = ''.join(response.xpath(
            '//*[@class="product-name"]//text()').extract()).strip()

    if 'image' in pdata:
        pimage = pdata['image']
    else:
        pimage = response.xpath('//img[@id="image"]/@src').extract()
        if not pimage:
            pimage = response.xpath(
                '//img[contains(@class, "giftcard-img")]/@src').extract()
            if pimage:
                is_giftcard = True
        pimage = response.urljoin(pimage[0]) if pimage else None

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', pid)
    loader.add_value('sku', pid)
    loader.add_value('url', response.url)
    loader.add_value('name', pname)
    loader.add_value('price', price)
    loader.add_value('brand', response.meta.get('brand'))
    # The category is filled from the same meta key as the brand.
    loader.add_value('category', response.meta.get('brand'))
    if pimage:
        loader.add_value('image_url', pimage)

    free_shipping = is_giftcard or (
        Decimal(price) >= Decimal(self.free_shipping_over))
    loader.add_value('shipping_cost', '0' if free_shipping else '49.99')

    if out_of_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Yield the page's main product, then whatever ``_parse_options`` yields.

    Name, SKU, price, brand and image come from the structured data exposed
    by ``SpiderSchema``; the category is the first name of the item list held
    in the second schema item. The "sold as" unit is cut out of the product
    details text and defaults to ``'each'``.
    """
    schema = SpiderSchema(response)
    pdata = schema.get_product()

    identifier = pdata['sku']
    title = pdata['name']
    price = extract_price(
        pdata['offers']['properties']['price'].replace(' ', ''))
    brand_name = pdata['brand']
    image = pdata['image']
    list_elements = schema.data['items'][1]['properties']['itemListElement']
    category_names = [element['properties']['name']
                      for element in list_elements]
    top_category = category_names[0]

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (('name', title),
                         ('identifier', identifier),
                         ('price', price),
                         ('sku', identifier),
                         ('brand', brand_name),
                         ('category', top_category),
                         ('url', response.url),
                         ('image_url', image)):
        loader.add_value(field, value)

    details = ' '.join(
        response.xpath('//*[@class="product-details"]//text()').extract())

    # When several markers are present the last one listed here wins.
    unit = ''
    for marker in ('Priced per', 'Priced by', 'Price per'):
        if marker in details:
            unit = details.split(marker)[1]

    # Keep only the text before the first ';', then '.', then ','.
    for separator in (';', '.', ','):
        if separator in unit:
            unit = unit.split(separator)[0]

    unit = unit or 'each'

    product = loader.load_item()
    meta = TigerChefMeta()
    meta['sold_as'] = unit
    product['metadata'] = meta
    yield product

    for option in self._parse_options(response, product):
        yield option
def parse_product(self, response):
    """Yield one item per option row of the summary table.

    Every table row without a header cell is treated as an option. The item
    name is the schema product name joined with the option label, the
    identifier is the first token of the row's ``class`` attribute, and the
    price is the row's ``data-price`` attribute. Rows whose class does not
    match ``instock`` get a stock of 0.
    """
    data = SpiderSchema(response).get_product()
    rows = response.xpath(
        '//div[@class="summary-container"]/table//tr[not(th)]')

    for row in rows:
        loader = ProductLoader(item=Product(), response=response)

        label = row.xpath(
            './/td[contains(@class,"optionscol")]/text()')[0].extract()
        full_name = u'{} - {}'.format(data['name'], label)
        row_class = row.xpath('@class')
        identifier = row_class[0].extract().split(' ')[0]
        price = row.xpath('@data-price').extract()

        loader.add_value('name', full_name)
        loader.add_value('url', response.url)
        loader.add_value('sku', data['sku'])
        loader.add_value('identifier', identifier)

        if 'image' not in data:
            loader.add_xpath('image_url',
                             '//meta[@itemprop="og:image"]/@content')
        else:
            loader.add_value('image_url', data['image'])

        if not row_class.re('instock'):
            loader.add_value('stock', 0)

        loader.add_value('price', price)
        loader.add_css('category',
                       'div.product_meta span.posted_in a::text')
        yield loader.load_item()
def parse(self, response):
    """Yield one item combining schema data with the feed row in the meta.

    Identifier, SKU, name, price and image come from the structured product
    data; category, brand and the ``mpn`` metadata entry come from
    ``response.meta['row']``.
    """
    pdata = SpiderSchema(response).get_product()
    row = response.meta['row']

    loader = ProductLoader(Product(), response=response)
    field_values = (
        ('identifier', pdata['sku']),
        ('url', response.url),
        ('name', pdata['name']),
        ('price', pdata['offers']['properties']['price']),
        ('sku', pdata['sku']),
        ('category', row['Category']),
        ('image_url', pdata['image']),
        ('brand', row['Manufacturer name']),
    )
    for field, value in field_values:
        loader.add_value(field, value)

    item = loader.load_item()
    item['metadata'] = {'mpn': row['Manufacturer part code']}
    yield item
def parse_product(self, response):
    """Yield the product on this page and follow links to grouped products.

    Stock is 1 only when the page shows no out-of-stock marker and the schema
    availability contains ``InStock``. The price is the first non-empty match
    among the minimal, regular and special price boxes, tried in that order.
    Shipping is ``'0.0'`` from a price of 45.0 upwards, ``'4.95'`` below.
    """
    pdata = SpiderSchema(response).get_product()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', pdata['productID'])
    loader.add_value('sku', pdata['productID'])
    loader.add_value('name', pdata['name'])

    # Short-circuits: availability is only read when no marker is present.
    in_stock = (not response.css('.product-shop .out-of-stock')
                and 'InStock' in pdata['offers']['properties']['availability'])
    loader.add_value('stock', 1 if in_stock else 0)

    breadcrumb_links = response.css('.breadcrumbs').xpath(
        './/li/a/text()').extract()
    loader.add_value('category', breadcrumb_links[1:])
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//th[@class="label" and contains(text(), '
        '"Brand")]/following-sibling::td/text()')

    price_selectors = (
        '.product-shop .price-box .minimal-price .price',
        '.product-shop .price-box .regular-price .price',
        '.product-shop .price-box .special-price .price',
    )
    price = None
    for selector in price_selectors:
        price = response.css(selector).xpath('text()').re_first(r'[\d\.,]+')
        if price:
            break
    loader.add_value('price', price)

    if loader.get_output_value('price') < Decimal('45.0'):
        loader.add_value('shipping_cost', '4.95')
    else:
        loader.add_value('shipping_cost', '0.0')

    yield loader.load_item()

    grouped_links = response.css(
        '.grouped-items-table-wrapper .name-wrapper').xpath(
        'a/@href').extract()
    for link in grouped_links:
        yield Request(link, callback=self.parse_product)
def parse_product(self, response):
    """Refresh the item carried in ``response.meta`` with product-page data.

    The price and category of the incoming item are replaced with the values
    found on the product page, and stock is forced to 0 unless the schema
    availability is exactly ``'inStock'``.
    """
    base_item = response.meta['item']
    pdata = SpiderSchema(response).get_product()
    breadcrumbs = response.css('a.GTM-breadcumb::text').extract()[1:]

    loader = ProductLoaderEU(Product(), response=response)
    loader.add_value(None, base_item)

    offer = pdata['offers']['properties']
    loader.replace_value('price', offer['price'])
    loader.replace_value('category', breadcrumbs)

    in_stock = offer['availability'] == 'inStock'
    if not in_stock:
        loader.replace_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Build one product item from a product page.

    The identifier is the numeric tail of the URL after
    ``url_query_cleaner`` has processed it. Pages without structured product
    data, or whose URL has no numeric tail, yield nothing. A shipping cost of
    ``'5.25'`` is added when the loaded price is below 90.
    """
    pdata = SpiderSchema(response).get_product()
    if not pdata:
        return

    # Without this guard a URL with no trailing digits raised AttributeError
    # on ``None.group``.
    id_match = re.search(r'/(\d+)$', url_query_cleaner(response.url))
    if not id_match:
        self.log('Product without identifier: ' + response.url)
        return

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', id_match.group(1))
    loader.add_value('url', response.url)
    loader.add_value('name', pdata['name'])
    loader.add_xpath('price', '//span[@id="product_priceExVAT"]/text()')
    loader.add_value('sku', pdata['productID'])

    # Only the last three breadcrumb links are used as the category path.
    category = response.css('p.breadcrumb a::text').extract()[-3:]
    loader.add_value('category', category)
    loader.add_value('image_url', pdata.get('image'))

    # The brand is optional and is only usable when it is a nested schema
    # item carrying its own properties.
    brand = pdata.get('brand')
    if isinstance(brand, dict) and brand.get('properties'):
        loader.add_value('brand', brand['properties']['name'])

    if loader.get_output_value('price') < 90:
        loader.add_value('shipping_cost', '5.25')

    yield loader.load_item()
def parse_product(self, response):
    """Build one product item from a product page.

    The identifier is the part after the first ``-`` of the reference shown
    in the feature-detail block. The category is the second-to-last entry of
    the first breadcrumb item's ``title``. The brand is always ``'Lego'``.
    Stock is set to 0 when the schema availability is
    ``http://schema.org/OutOfStock``.
    """
    schema = SpiderSchema(response)
    product_data = schema.get_product()

    # A comprehension rather than ``filter(...)[0]``: it is subscriptable
    # whether or not ``filter`` returns a list, and it still raises
    # IndexError when the page has no breadcrumb item.
    breadcrumb_items = [
        entry for entry in schema.data['items']
        if entry.get('type') == 'http://data-vocabulary.org/Breadcrumb'
    ]
    breadcumbs_data = breadcrumb_items[0]

    reference = response.xpath(
        u'//div[contains(@class, "feature-detail")]//strong[contains(text(), "R\xe9f\xe9rence")]/following-sibling::span/text()'
    ).extract()[0]
    identifier = reference.partition('-')[-1]

    offer = product_data['offers']['properties']

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    # Collapse runs of whitespace in the name into single spaces.
    loader.add_value('name', ' '.join(product_data['name'].split()))
    loader.add_value('brand', 'Lego')
    loader.add_value('category', breadcumbs_data['properties']['title'][-2])
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_value('price', offer['price'])
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    if offer['availability'] == 'http://schema.org/OutOfStock':
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_category(self, response):
    """Request every product listed on a category page, then the next page.

    Each schema product that has a ``sku`` is turned into a partially filled
    item and handed to ``parse_product`` through the request meta. Pagination
    stops when a page contains no such product.

    If extracting the schema products raises, the page is skipped silently.
    """
    try:
        data = SpiderSchema(response).get_products()
    except Exception:
        # Best effort: a page the schema parser cannot handle is skipped.
        return

    # The category is the same for every product on the page, so resolve it
    # once; it is also forwarded to the next listing page.
    category = (response.css('a.GTM-breadcumb::text').extract()[1:]
                or response.meta.get('category'))

    found_products = False
    for product in data:
        if not product.get('sku'):
            continue
        found_products = True

        loader = ProductLoader(Product(), response=response)
        loader.add_value('identifier', product['sku'])
        loader.add_value('url', product['url'][0])
        loader.add_value('name', product['name'])
        loader.add_value('sku', product['sku'])
        loader.add_value('category', category)
        loader.add_value('image_url', product['image'])
        loader.add_value('brand', product['brand'])
        if product['offers']['properties']['availability'] != 'in stock':
            loader.add_value('stock', 0)

        yield Request(loader.get_output_value('url'),
                      self.parse_product,
                      meta={'item': Product(loader.load_item())})

    if not found_products:
        return

    page = url_query_parameter(response.url, 'page')
    if page:
        url = add_or_replace_parameter(response.url, 'page',
                                       str(int(page) + 1))
    else:
        # First page of a listing: switch to the paginated ajax endpoint.
        id_families = response.xpath(
            '//input[@data-key="idFamilies"]/@value').extract_first()
        if id_families:
            url = add_or_replace_parameter(
                'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                'idFamilies[]', id_families)
        elif response.url.endswith('/novedades/'):
            return
        elif response.url.endswith('/'):
            url = response.url + 'ajax?page=0&order=price-desc'
        else:
            return

    yield Request(url, self.parse_category, meta={'category': category})
def parse_product(self, response):
    """Build one product item from the page's structured product data.

    Pages without structured product data yield nothing. A shipping cost of
    ``'9.95'`` is added when the loaded price is below 50, and stock is set
    to 0 unless the availability is ``http://schema.org/InStock``.
    """
    pdata = SpiderSchema(response).get_product()
    if not pdata:
        return

    loader = ProductLoader(Product(), response=response)
    identifier = pdata['sku']
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_value('name', pdata['name'])
    loader.add_value('price', pdata['offers']['properties']['price'])
    loader.add_value('sku', pdata['sku'])
    loader.add_xpath('category', '//a[@id="breadCrumbDetails"]/text()')
    loader.add_value('image_url', pdata['image'])

    needs_shipping = loader.get_output_value('price') < 50
    if needs_shipping:
        loader.add_value('shipping_cost', '9.95')

    availability = pdata['offers']['properties']['availability']
    if not availability == 'http://schema.org/InStock':
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Yield the product shown on the page and follow its other variations.

    Unselected entries of the variation ``<select>`` are requested with this
    same callback, and a variations popup link (if any) is requested with
    ``parse_variations``. The item name is the schema product name extended
    with the selected option and the selected variation value.

    The category comes from the comma-separated ``category`` meta value when
    present, otherwise from the schema breadcrumb (first and last entries
    dropped). Its last element is then rewritten according to seat-count and
    recliner keywords found in the name. An item is only yielded the first
    time its identifier is seen (tracked in ``self.items``).
    """
    # Normal options
    option_nodes = response.xpath(
        '//select[@class="variation-select"]/option[not(@selected)]')
    option_labels = [text.strip()
                     for text in option_nodes.xpath('text()').extract()]
    option_urls = option_nodes.xpath('@value').extract()
    for desc, url in zip(option_labels, option_urls):
        yield Request(url,
                      meta={
                          'category': response.meta.get('category'),
                          'option': desc
                      },
                      callback=self.parse_product)

    # Variations popup
    variations_url = response.xpath(
        '//div[@class="variations"]//a/@data-href').extract()
    if variations_url:
        url = response.urljoin(variations_url[0])
        yield Request(url, callback=self.parse_variations,
                      meta=response.meta)

    schema = SpiderSchema(response)
    product = schema.get_product()
    name = product['name']

    # Normal option selected
    selected_options = [
        text.strip() for text in response.xpath(
            '//select[@class="variation-select"]/option[@selected]/text()'
        ).extract()
    ]
    if selected_options:
        name += ' - ' + selected_options[0]

    # Variation selected
    currently_selected = response.xpath(
        '//div[@class="variations"]'
        '//div[contains(@class, "variation-attribute-selected-value")]/text()'
    ).extract()
    if currently_selected:
        # Append the whole value. Indexing the stripped string with [0]
        # would keep only its first character.
        name += ' - ' + currently_selected[-1].strip()

    identifier = product['productID']
    price = product['offers']['properties']['price']
    image_url = product['image']

    category = response.meta.get('category')
    if not category:
        breadcrumb = schema.data['items'][0]['properties']['breadcrumb']
        category = [
            element['properties']['name']
            for element in breadcrumb['properties']['itemListElement'][1:-1]
        ]
    else:
        category = category.split(',')

    lowered_name = name.lower()

    # The first matching seat count replaces the last category element.
    seat_labels = (
        ('2 seater', '2 Seater'),
        ('2.5 seater', '2.5 Seater'),
        ('3 seater', '3 Seater'),
        ('3.5 seater', '3.5 Seater'),
        ('4 seater', '4 Seater'),
    )
    for keyword, label in seat_labels:
        if keyword in lowered_name:
            category[-1] = label
            break

    if 'recliner' in lowered_name:
        if '2 Seater' in category:
            category[-1] = '2 seater recliner'
        elif '3 Seater' in category:
            category[-1] = '3 seater recliner'
        elif 'armchair' in category:
            category[-1] = 'Recliner armchairs'

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('image_url', image_url)
    loader.add_value('price', price)
    for cat in category:
        loader.add_value('category', cat)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)

    if identifier not in self.items:
        self.items.append(identifier)
        yield loader.load_item()
def parse_product(self, response):
    """Build a product item, going through the cart when the price is hidden.

    The price is read from the product form's ``price`` input, falling back
    to the schema offer price and finally to ``'0.0'``. The SKU is the
    manufacturer number shown on the page, or the schema ``productID`` with
    ``#`` removed. Items with a blank identifier are not yielded.

    When the page shows a struck-out price, the product form is posted to the
    cart URL and the item is passed to ``parse_price`` in the request meta;
    otherwise the item is yielded directly.
    """
    schema = SpiderSchema(response)
    data = schema.get_product()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', data['Name'])
    loader.add_xpath('category',
                     u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')

    price = response.xpath(
        '//form[@id="productform"]/input[@name="price"]/@value').extract()
    if price:
        loader.add_value('price', price[0])
    else:
        loader.add_value(
            'price',
            data.get('offers', {}).get('properties', {}).get('price', '0.0'))

    # A real list, so the emptiness test below holds regardless of what
    # ``map`` returns.
    sku = [
        text.strip() for text in response.xpath(
            '//span[contains(@class, "mfr-number")]/text()').extract()
    ]
    loader.add_value('identifier', data['productID'])
    if sku:
        loader.add_value('sku', sku)
    else:
        loader.add_value('sku', data['productID'].replace('#', ''))

    image_url = data.get('image', '').replace('www.example.com',
                                              'www.webstaurantstore.com')
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    brand = data.get('brand', '')
    if not brand:
        brand = response.xpath(
            '//tr[@class="highlight" and .//b[contains(text(), "Manufacturer Name")]]/td[not(b)]/text()'
        ).extract()
        brand = brand[0].strip() if brand else ''
    if brand:
        loader.add_value('brand', brand)

    sold_as = response.xpath(
        '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
    ).extract()

    product = loader.load_item()
    if product.get('identifier', '').strip() != '':
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as[0].replace('/', '') if sold_as else ''
        product['metadata'] = metadata

        # Add to cart to see the price
        if response.xpath(
                '//*[@itemprop="price" and contains(@class, "strikeOutPrice")][1]'
        ):
            cart_url = 'http://www.webstaurantstore.com/viewcart.html'
            inputs = response.xpath('//form[@id="productform"]/input')
            # ``xpath`` replaces the deprecated ``select`` alias used before.
            formdata = dict(
                zip(inputs.xpath('./@name').extract(),
                    inputs.xpath('./@value').extract()))
            # quantity
            formdata[u'qty'] = '1'
            f_request = FormRequest(url=cart_url,
                                    method='POST',
                                    formdata=formdata,
                                    callback=self.parse_price,
                                    meta={
                                        'product': product,
                                        'dont_merge_cookies': True
                                    },
                                    dont_filter=True)
            yield f_request
        else:
            yield product
def parse_product(self, response):
    """Build one product item for the feed row carried in ``response.meta``.

    SKU, brand and categories come from ``response.meta['row']``; name,
    identifier, image, stock and MPN come from the page; the price comes from
    the structured data returned by ``SpiderSchema``.

    When the schema price is a list, its first element is stored as the
    ``strike`` metadata value and its last element is used as the price.
    Otherwise the same URL is re-requested (tracked with the ``retries`` meta
    key) until two retries have been made; after that, if
    ``self.prev_strikes[identifier]`` is truthy, the response body is dumped
    to a log file.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-mangled source — confirm the branch structure against the
    original file.
    """
    data = SpiderSchema(response).get_product()
    row = response.meta['row']
    name = response.xpath('//h1[@itemprop="name"]/text()').extract()[0]
    identifier = response.xpath(
        '//div[@class="product-view"]//input[@name="product"]/@value'
    ).extract()
    if not identifier:
        # Fall back to the numeric id embedded in the "Email" link.
        identifier = response.xpath('//a[@title="Email"]/@href').re(
            'id\/(\d+)')
    identifier = identifier[0]
    sku = row['Transcat SKU']
    strike = None
    # NOTE(review): this page-scraped price is overwritten unconditionally by
    # the schema lookup below, so it looks like leftover code — confirm.
    price = response.css('div.product-type-data p.special-price').xpath(
        'span[@itemprop="price"]/text()').extract()
    if not price:
        price = response.css('div.product-type-data span.nobr').xpath(
            'span[@itemprop="price"]/text()').extract()
    #price = extract_price(price[0]) if price else '0'
    if type(data.get('offers')) == list:
        # Several offers: use the first one.
        price = data['offers'][0]['properties']['price']
    else:
        try:
            price = data['offers']['properties']['price']
        except KeyError:
            price = 0
    if type(price) == list:
        # First element is kept as the strike value, last one as the price.
        strike = price[0]
        price = price[-1]
        if response.meta.get('retries'):
            self.logger.debug('Helped!')
    else:
        # No list of prices: retry the page before giving up.
        retries = response.meta.get('retries', 0)
        if retries < 2:
            meta = response.meta.copy()
            meta['retries'] = retries + 1
            yield Request(response.url,
                          self.parse_product,
                          dont_filter=True,
                          meta=meta)
            return
        if self.prev_strikes[identifier]:
            # Keep a copy of the page for later inspection.
            self.logger.debug('No strike price more on %s' % response.url)
            fname = os.path.join(
                DATA_DIR, '../logs/default/transcat-transcat.com/%s.html' %
                response.url.split('/')[-1])
            with open(fname, 'w') as f:
                f.write(response.body)
    product_image = response.xpath('//img[@id="image"]/@src').extract()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    # NOTE(review): a second price of 0 is always added; presumably a
    # fallback for when the first value yields nothing — confirm how
    # ProductLoader combines multiple values.
    loader.add_value('price', 0)
    if product_image:
        loader.add_value('image_url', product_image[0])
    loader.add_value('brand', row['Brand'].decode('utf'))
    categories = row['Category'].split(',')
    categories = map(lambda x: x.decode('utf'), categories)
    loader.add_value('category', categories)
    stock = response.xpath('//p[@class="availability in-stock"]')
    if stock:
        # Quantity is the first run of digits in the product-qty span.
        stock = stock.select('span[@class="product-qty"]').re('\d+')
        loader.add_value('stock', int(stock[0]))
    out_of_stock = response.xpath(
        '//p[@class="special-text-msg" and contains(text(), "not available")]'
    )
    if out_of_stock:
        loader.add_value('stock', 0)
    item = loader.load_item()
    metadata = TranscatMeta()
    mpn = response.xpath('//p[@itemprop="mpn"]/text()').re(
        'Mfg Part #: (.*)')
    metadata['mpn'] = mpn[0].strip() if mpn else ''
    #strike = response.xpath('//div[@class="product-type-data"]//p[@class="old-price"]/span[@itemprop="price"]/text()').extract_first()
    metadata['strike'] = strike.strip() if strike else ''
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield one item per size, plus its personalised variants.

    For every size option a base item is yielded. When the page offers player
    personalisation, one extra item per player is yielded at the base price
    plus the price parsed from the player option label. When it also offers
    player + tournament personalisation, one further item per player and
    tournament is yielded at the base price plus the price parsed from that
    label.

    Each yielded item carries its own ``metadata`` dict with ``size`` and,
    when the player text looks like ``"<number> <name>"``, ``player`` and
    ``number``.
    """

    def personalise(base_item, id_suffix, name_suffix, surcharge,
                    player_text):
        # ``Product(base_item)`` is a shallow copy, so the metadata dict has
        # to be copied explicitly; otherwise every variant (and the base
        # item) would share and overwrite the same player data.
        variant = Product(base_item)
        variant['identifier'] += id_suffix
        variant['name'] += name_suffix
        variant['price'] = Decimal(variant['price']) + surcharge
        variant_meta = dict(base_item['metadata'])
        match = re.search(r'(\d+)\s(.*)', player_text)
        if match:
            player_number, player_name = match.groups()
            variant_meta['player'] = player_name.strip()
            variant_meta['number'] = player_number
        variant['metadata'] = variant_meta
        return variant

    schema = SpiderSchema(response)
    pdata = schema.get_product()

    sku = pdata.get('mpn', '')
    image = pdata['image'].replace('example.com', 'prodirectsoccer.com')
    main_id = response.xpath(
        '//div[@id="define-profile"]/@data-quickref').extract()[0]
    main_name = pdata['name']
    main_price = extract_price(pdata['offers']['properties']['price'])
    main_brand = response.meta.get('brand')
    shipping = '3.95'

    sizes = response.xpath('//select[@id="size"]/option[@value!=""]')
    player_sel_label = response.xpath(
        '//label[@for="pers-opt1"]/text()').extract()
    player_tourn_sel_label = response.xpath(
        '//label[@for="pers-opt2"]/text()').extract()

    for size_opt in sizes:
        size_desc = size_opt.xpath('text()').extract()[0].strip()
        size_value = size_opt.xpath('@value').extract()[0].strip()

        # The option text may carry a stock note after the size itself.
        in_stock = True
        if ' ' in size_desc:
            size_desc, stock = size_desc.split(' ', 1)
            if 'OUT OF STOCK' in stock.upper():
                in_stock = False

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', main_id + 'x' + size_value)
        loader.add_value('name', main_name + ' - ' + size_desc)
        loader.add_value('sku', sku)
        loader.add_value('price', main_price)
        loader.add_value('shipping_cost', shipping)
        loader.add_value('url', response.url)
        loader.add_value('image_url', image)
        if main_brand:
            loader.add_value('brand', main_brand)
        loader.add_value('category', 'Replicas')
        if not in_stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        item['metadata'] = {'size': size_desc}
        yield item

        if not player_sel_label:
            continue

        player_sel_price = extract_price(player_sel_label[0])
        players = response.xpath(
            '//select[@id="pers-player"]/option[@value!=""]')
        for player_opt in players:
            player_desc = player_opt.xpath('text()').extract()[0].strip()
            player_value = player_opt.xpath('@value').extract()[0].strip()

            yield personalise(item, 'x' + player_value,
                              ' - ' + player_desc, player_sel_price,
                              player_desc)

            if not player_tourn_sel_label:
                continue

            player_tourn_price = extract_price(player_tourn_sel_label[0])
            tournaments = response.xpath(
                '//select[@id="pers-tournament"]/option[@value!=""]')
            for tourn_opt in tournaments:
                tourn_desc = tourn_opt.xpath('text()').extract()[0].strip()
                tourn_value = tourn_opt.xpath(
                    '@value').extract()[0].strip()

                yield personalise(
                    item, 'x' + player_value + 'x' + tourn_value,
                    ' - ' + player_desc + ' - ' + tourn_desc,
                    player_tourn_price, player_desc)
def parse_product(self, response):
    """Build a product item, or request the details of each of its variants.

    Pages without a ``productId`` meta tag are logged and skipped. The schema
    price is reduced by the percentage shown in the voucher banner, when
    there is one. "Was" price texts are stored in the ``Promotions`` metadata
    entry.

    When the page has no variant selector the item is yielded directly.
    Otherwise one POST request per variant is sent to the variant details
    endpoint with ``parse_option`` as callback and a copy of the item in the
    request meta.
    """
    schema = SpiderSchema(response)
    pdata = schema.get_product()

    sku = response.xpath(
        '//meta[@itemprop="productId"]/@content').extract()
    if not sku:
        self.log('Product without identifier: ' + response.url)
        return
    sku = sku[0]

    name = pdata['name']
    price = extract_price(pdata['offers']['properties']['price'])

    brand = response.xpath(
        '//tr[th[contains(text(), "Range Name")]]/td/text()').extract()
    brand = brand[0].strip() if brand else ''

    categories = response.xpath(
        '//a[@class="breadcrumb__link"]/span/text()').extract()[1:]

    image_url = response.xpath(
        '//div[contains(@class, "product-slider__element")]//img[@itemprop="image"]/@src'
    ).extract()
    if not image_url:
        image_url = response.xpath(
            '//div[contains(@class, "product-slider__carousel")]//img[@itemprop="image"]/@src'
        ).extract()
    image_url = image_url[0] if image_url else ''

    discount_percentage = response.xpath(
        '//span[@class="voucher-banner__title"]/text()').re(r'(\d+)%')
    if discount_percentage:
        price = price - (
            (int(discount_percentage[0]) * price) / Decimal(100))

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('image_url', image_url)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('brand', brand)
    loader.add_value('category', categories)
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)

    out_of_stock = response.xpath(
        '//i[contains(@class, "stock-indicator__status--inactive")]')
    if out_of_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()

    promotions = response.xpath(
        '//span[contains(@class, "price--type-was")]//text()').extract()
    if not promotions:
        promotions = response.xpath(
            '//div[contains(@class, "price--type-was")]//span[@class="text--strikethrough"]//text()'
        ).extract()
    metadata = MetaData()
    # Joining an empty list gives '', so no separate empty case is needed.
    metadata['Promotions'] = ' '.join(
        text.strip() for text in promotions).strip()
    item['metadata'] = metadata

    options = response.css('.variant__selector select option')
    if not options:
        yield item
        return

    # The token is page-wide, so look it up once rather than per option.
    token = response.xpath('//@data-token').extract_first()
    headers = {
        'Content-Length': 0,
        'X-CSRF-Token': token,
        'X-Requested-With': 'XMLHttpRequest'
    }
    for option in options:
        variant_sku = option.xpath('@data-sku').extract_first()
        if not variant_sku:
            # An option without a SKU would otherwise produce a request to
            # ".../variant/None/details".
            continue
        yield Request(
            'https://victoriaplum.com/api/v1/variant/%s/details' % variant_sku,
            self.parse_option,
            method='POST',
            headers=headers,
            meta={'item': Product(item)})