def parse_options(self, response):
    """Complete the option item carried in ``response.meta`` and yield it.

    The response body is split on ``'@@@@'``; its second part is used as
    the image URL.  The ``<span>`` texts of the page hold either
    ``sku, price`` or ``name, sku, price``.  The item is yielded only when
    the name slot or the sku is a key of ``self.rows``; otherwise nothing
    is yielded.
    """
    item = response.meta['item']
    image_url = response.body.split('@@@@')[1]

    # Extract the span texts once and reuse them for both layouts.
    product_data = response.xpath('//span/text()').extract()
    if len(product_data) < 3:
        sku, price = product_data
        name = ''
    else:
        name, sku, price = product_data[:3]

    # Some products don't show a name, so the sku ends up in the name slot.
    product_found = self.rows.get(name)
    if product_found:
        sku = name
        name = ''
    else:
        product_found = self.rows.get(sku)

    if not product_found:
        return

    item['identifier'] = sku
    item['sku'] = sku
    item['metadata']['mpn'] = sku[3:]
    if image_url.endswith('.jpg'):
        item['image_url'] = response.urljoin(image_url)
    if name:
        item['name'] += ' ' + name
    item['price'] = extract_price(price)

    # A sku without a category entry yields an empty category instead of
    # failing while iterating over None.
    categories = self.categories.get(sku.upper()) or []
    item['category'] = ' > '.join([s for s in categories if s])
    yield item
def parse_category(self, response):
    """Yield one product for every ``boxit_new_cover`` box on the page.

    Each loaded item is passed through ``self.load_item_with_metadata``
    before it is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_base = get_base_url(response)

    for box in selector.select('//div[@class="boxit_new_cover"]'):
        item_loader = ProductLoader(selector=box, item=Product())

        href = box.select('div/a[@class="nadpisa"]/@href').extract()[0]
        # The identifier is the raw link with slashes and dots stripped.
        item_loader.add_value(
            'identifier', href.replace('/', '').replace('.', ''))
        absolute_url = urljoin_rfc(page_base, href)

        title = box.select('div/a[@class="nadpisa"]/text()').extract()[0]
        item_loader.add_value('name', title)
        item_loader.add_value('url', absolute_url)
        item_loader.add_xpath(
            'image_url', 'div/div[@class="boximages_new"]/div/a/img/@src')

        price_text = box.select(
            'div/div/div[@class="cenaa"]/text()').extract()[0]
        amount = extract_price(price_text)
        item_loader.add_value('price', amount)
        item_loader.add_xpath('category', '//div/h1/text()')
        item_loader.add_value('sku', self.re_sku.findall(title))
        item_loader.add_value('brand', 'LEGO')

        if int(amount) < 4000:
            item_loader.add_value('shipping_cost', 99)
        if amount <= 0:
            item_loader.add_value('stock', 0)

        yield self.load_item_with_metadata(item_loader.load_item())
def parse(self, response):
    """Load the single product shown on a detail page and yield it.

    The sku comes from ``response.meta['sku']``.  An empty price text is
    replaced by ``'0'`` and an ``outOfStock`` div sets the stock to 0.
    """
    selector = HtmlXPathSelector(response)
    # Not used below; the call is retained so behaviour stays unchanged.
    base_url = get_base_url(response)
    request_meta = response.meta
    item_loader = ProductLoader(response=response, item=Product())

    product_ids = selector.select(
        '//ul[contains(@class, "mint")]//input[@name="productId_1"]/@value'
    ).extract()
    item_loader.add_value('identifier', product_ids[0])

    titles = selector.select(
        '//div[@class="productHeader"]/h1/text()').extract()
    item_loader.add_value('name', titles[0].strip())
    item_loader.add_value('sku', request_meta['sku'])

    price_parts = selector.select(
        '//ul[contains(@class, "mint")]/li[contains(@class, "price")]//text()'
    ).extract()
    price_text = ''.join(price_parts).strip()
    if not price_text:
        price_text = '0'
    item_loader.add_value('price', extract_price(price_text))
    item_loader.add_value('url', response.url)

    if selector.select('//div[@class="outOfStock"]'):
        item_loader.add_value('stock', 0)

    item_loader.add_xpath('image_url', '//img[@class="mainImage"]/@src')
    yield item_loader.load_item()
def parse_product(self, response):
    """Load the product shown on a detail page and yield it.

    Prices are written with ``.`` as thousands separator and ``,`` as
    decimal separator, so they are normalised before ``extract_price``.
    A missing price defaults to ``'0.00'``.  The brand is read from the
    ``Trademark=`` value found in the raw body and is empty when absent.
    """
    product_name = response.xpath(
        '//h1[@class="product__title"]/text()').extract()[0].strip()

    brand_match = re.search('Trademark=(.*)', response.body)
    brand = brand_match.group(1).strip() if brand_match else ''

    sku = response.xpath(
        '//div[@class="product__vnr"]/text()').re('VNR: (.*)')

    price_texts = response.xpath(
        '//div[@class="product__price"]/text()').extract()
    product_price = price_texts[0] if price_texts else '0.00'

    product_code = response.xpath(
        '//input[@name="productId"]/@value').extract()[0]
    image_url = response.xpath(
        '//img[@class="img-responsive"]/@src').extract()

    breadcrumbs = response.xpath(
        '//ol[@class="breadcrumbs"]//a/text()').extract()
    category = breadcrumbs[-1] if breadcrumbs else ''

    # Build the loader once, after all raw values are known.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', sku)
    loader.add_value('identifier', product_code)
    # The brand used to be extracted and then dropped; load it.
    loader.add_value('brand', brand)
    if image_url:
        # Image sources are protocol-relative, so prefix a scheme.
        loader.add_value('image_url', 'http:' + image_url[0])
    loader.add_value('category', category)
    loader.add_value(
        'price',
        extract_price(product_price.replace('.', '').replace(',', '.')))
    yield loader.load_item()
def parse_price(self, response):
    """Merge the JSON option data in ``response.body`` into the product.

    The product comes from ``response.meta['product']``.  Its identifier
    is suffixed with ``'_'`` plus ``response.meta['options']``.  ``price``
    and ``sku`` are overwritten when present, and the image URL is taken
    from ``image`` or, failing that, ``thumb`` (backslashes removed).

    When the body is not valid JSON the error is logged and nothing is
    yielded.
    """
    import json

    product = response.meta['product']
    try:
        data = json.loads(response.body)
    except ValueError:
        # Only a JSON decoding failure is expected here; anything else
        # should surface instead of being silently swallowed.
        self.log("ERROR cant load json, response.body=" + response.body)
        return

    if 'price' in data:
        product['price'] = extract_price(data['price'])
    if 'sku' in data and data['sku']:
        product['sku'] = data['sku']

    product['identifier'] = (
        product['identifier'] + '_' + response.meta['options'])

    if 'image' in data and data['image']:
        product['image_url'] = data['image'].replace('\\', '')
    elif 'thumb' in data and data['thumb']:
        product['image_url'] = data['thumb'].replace('\\', '')

    yield product
def parse(self, response):
    """Yield one product per ``<item>`` element of the feed.

    The ``g`` prefix is registered for the Google Base namespace.  The
    ``g:id`` value is used both as identifier and as sku, and an
    availability of ``'out of stock'`` sets the stock to 0.
    """
    response.selector.register_namespace("g", "http://base.google.com/ns/1.0")

    for entry in response.xpath('//item'):
        images = entry.xpath('g:image_link/text()').extract()
        image_url = images[0] if images else ''

        product_types = entry.xpath('g:product_type/text()').extract()
        if product_types:
            # Drop the first level of the '>'-separated product type.
            category = product_types[0].split('>')[1:]
        else:
            category = ''

        brand = entry.xpath('g:brand/text()').extract()
        identifier = entry.xpath('g:id/text()').extract()

        name = entry.xpath('title/text()').extract_first()
        if name:
            name = name.replace('...', '').strip()

        prices = entry.xpath('g:price/text()').extract()
        price = extract_price(prices[0]) if prices else 0

        url = entry.xpath('link/text()').extract()[0]
        availability = entry.xpath('g:availability/text()').extract()[0]

        product_loader = ProductLoader(item=Product(), response=response)
        for field, value in (
                ('identifier', identifier),
                ('sku', identifier),
                ('name', name),
                ('image_url', image_url),
                ('price', price),
                ('url', url),
                ('brand', brand),
                ('category', category)):
            product_loader.add_value(field, value)
        if availability == 'out of stock':
            product_loader.add_value('stock', 0)

        yield product_loader.load_item()
def parse(self, response):
    """Read the exchange rate and request the shop home page.

    The first numeric match of the last cell of the ``uccRes`` row is
    converted with ``extract_price`` and passed on as
    ``meta['exchange_rate']`` to ``self.parse_real``.
    """
    selector = HtmlXPathSelector(response)
    rate_cell = selector.select('//tr[@class="uccRes"]/td[last()]/text()')
    rate_matches = rate_cell.re('[\d\.]+')
    request_meta = {'exchange_rate': extract_price(rate_matches[0])}
    yield Request(
        'http://www.bluesuntree.co.uk/',
        meta=request_meta,
        callback=self.parse_real,
    )
def parse_product(self, response):
    """Load the product on a detail page and yield it.

    Identifier and sku both come from ``response.meta['row']['SKU']``.
    The current colour, when shown, is appended to the name, and only
    the last three breadcrumb links are kept as categories.
    """
    row = response.meta['row']

    title_parts = response.xpath(
        '//div[@class="product-title"]//text()').extract()
    name = ' '.join(title_parts).strip()

    colour = response.xpath(
        '//div[@class="product-colors__header"]//span[@class="current"]/text()'
    ).extract()
    if colour:
        name = name + ' ' + colour[0].strip()

    og_images = response.xpath(
        '//meta[@property="og:image"]/@content').extract()
    image_url = og_images[0] if og_images else ''

    price_texts = response.xpath(
        '//p[@class="product-price__now"]/span[@class="value"]/text()'
    ).extract()
    if price_texts:
        price = extract_price(price_texts[0])
    else:
        price = ''

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('identifier', row['SKU'])
    loader.add_value('sku', row['SKU'])
    loader.add_value('url', response.url)
    loader.add_value('image_url', image_url)
    loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')

    breadcrumb_links = response.xpath(
        '//ul[@class="breadcrumbs"]//a/text()').extract()
    loader.add_value('category', breadcrumb_links[-3:])
    loader.add_value('name', name)
    loader.add_value('price', price)
    yield loader.load_item()
def parse_product(self, response):
    """Load the product on a detail page and yield it.

    The sku comes from ``response.meta['row']['SKU']`` and the identifier
    from the ``data-product-id`` attribute of the ``pid`` div.  The
    variant text, when present, is appended to the name.
    """
    row = response.meta['row']

    headings = response.xpath('//h2[@itemprop="name"]/text()').extract()
    name = headings[0].strip()

    variant = response.xpath(
        '//p[@class="common-option variant-ctrl"]/text()').extract()
    if variant:
        name = name + ' ' + variant[0].strip()

    image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_sources:
        # Sources are protocol-relative, so prefix a scheme.
        image_url = 'http:' + image_sources[0]
    else:
        image_url = ''

    price_text = ''.join(response.xpath(
        '//div[contains(@class, "product-price")]/span[contains(@class, "current")]//text()'
    ).extract())
    price = extract_price(price_text) if price_text else ''

    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath('identifier', '//div[@id="pid"]/@data-product-id')
    loader.add_value('sku', row['SKU'])
    loader.add_value('url', response.url)
    loader.add_value('image_url', image_url)
    loader.add_xpath('brand', '//h2[@itemprop="brand"]/a/text()')
    loader.add_value(
        'category',
        response.xpath('//ul[@id="breadcrumbs"]//span/text()').extract())
    loader.add_value('name', name)
    loader.add_value('price', price)
    yield loader.load_item()
def parse_product(self, response):
    """Follow colour variants and yield the product shown on the page.

    Every link in the colour palette is requested with this same
    callback.  The brand is the first entry of ``self.brands`` contained
    (case-insensitively) in the product name.  Shipping is 0 for prices
    of 10 or more and 3 otherwise.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    colour_options = hxs.select(
        '//ul[contains(@class, "colour-palette")]//a/@href').extract()
    for colour_option in colour_options:
        yield Request(urljoin_rfc(base_url, colour_option),
                      callback=self.parse_product)

    product_name = hxs.select(
        '//div[contains(@class, "prod-details")]//h2/text()').extract()[0]

    product_brand = ''
    upper_name = product_name.upper()
    for brand in self.brands:
        if brand.upper() in upper_name:
            product_brand = brand
            break

    price_texts = hxs.select(
        '//p[contains(@class, "pricing")]/span/text()').extract()
    # Always run the text through extract_price so the value compared
    # below is numeric even when the page shows no price.
    product_price = extract_price(price_texts[0] if price_texts else '0')

    product_code = hxs.select(
        '//div[contains(@class, "code")]/strong/text()').extract()[0]

    image_url = hxs.select('//a[@class="main-thumb"]/img/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

    categories = hxs.select(
        '//div[contains(@class, "breadcrumb")]/a[not(@href="/") and not(@class="active")]/text()'
    ).extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', response.meta.get('row').get('PRODUCT_NUMBER'))
    loader.add_value('identifier', product_code)
    loader.add_value('brand', product_brand)
    loader.add_value('image_url', image_url)
    loader.add_value('category', categories)
    loader.add_value('price', product_price)

    if hxs.select('//form[@class="add_to_notification"]'):
        loader.add_value('stock', 0)

    # The shipping rule used to read the loader's price before the price
    # had been added, so the threshold never saw the real value.
    if product_price >= 10:
        loader.add_value('shipping_cost', 0)
    else:
        loader.add_value('shipping_cost', 3)

    yield loader.load_item()
def start_requests(self):
    """Yield one search per row of ``lego.csv`` located in ``HERE``.

    Column 2 is used as sku (and, prefixed with ``'LEGO '``, as search
    text), column 3 as name and column 4 as price.
    """
    csv_path = os.path.join(HERE, 'lego.csv')
    with open(csv_path) as handle:
        records = csv.reader(cStringIO.StringIO(handle.read()))
        for record in records:
            query = 'LEGO ' + record[2]
            expected = {
                'sku': record[2],
                'name': record[3],
                'price': extract_price(record[4]),
            }
            yield self.search(query, expected)
def parse(self, response):
    """Yield one product per row of the CSV held in ``response.body``.

    The name is the ``Product Name`` column followed by a space and the
    ``Pack size`` column.
    """
    for record in csv.DictReader(StringIO(response.body)):
        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_value('identifier', record['identifier'])
        item_loader.add_value('url', record['URL'])
        full_name = record['Product Name'] + ' ' + record['Pack size']
        item_loader.add_value('name', full_name)
        item_loader.add_value('price', extract_price(record['Price']))
        yield item_loader.load_item()
def parse(self, response):
    """Yield one product per row of the CSV held in ``response.body``.

    The lower-cased ``ID`` column is the identifier and the unchanged
    ``ID`` the sku.  Brand and category are loaded as empty strings and
    the name is decoded from UTF-8.
    """
    for record in csv.DictReader(StringIO(response.body)):
        product_id = record['ID']
        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_value('identifier', product_id.lower())
        item_loader.add_value('sku', product_id)
        item_loader.add_value('brand', '')
        item_loader.add_value('category', '')
        item_loader.add_value('name', record['Name'].decode('utf8'))
        item_loader.add_value('price', extract_price(record['Price']))
        yield item_loader.load_item()
def parse_product(self, response):
    """Yield the product on the page, or one product per option.

    The base item is loaded from the page (the special price is preferred
    over the regular price).  When the body contains a
    ``spConfig = new Product.Config(...)`` JSON blob, one copy of the
    base item is yielded per simple product referenced by the options:
    the identifier is suffixed with the option product id, the option
    labels are appended to the name and the option prices are added to
    the base price.  Without such a blob the base item is yielded as is.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']

    name = hxs.select('//span[@itemprop="name"]/text()').extract()[0].strip()

    price = hxs.select(
        '//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price:
        price = hxs.select(
            '//span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
    price = price[0] if price else 0

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('sku', row['SKU'])
    loader.add_value('price', price)

    identifier = hxs.select('//input[@name="productId"]/@value').extract()
    if not identifier:
        identifier = hxs.select('//input[@name="product"]/@value').extract()
    loader.add_value('identifier', identifier)

    loader.add_xpath('brand', '//tr[th/text()="Brand"]/td/text()')
    loader.add_xpath('image_url', '//a[@id="shoe-spin"]/img/@src')
    loader.add_value(
        'category',
        hxs.select('//li[@typeof="v:Breadcrumb"]/a/text()').extract())

    in_stock = hxs.select(
        '//div[@class="offer"]//p[@class="availability in-stock"]')
    if not in_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if not options_config:
        yield item
        return

    product_data = json.loads(options_config.groups()[0])

    # Per simple-product id: the joined option labels and summed surcharge.
    products = {}
    prices = {}
    for attr in product_data['attributes'].itervalues():
        for option in attr['options']:
            for product in option['products']:
                products[product] = ' - '.join(
                    (products.get(product, ''), option['label']))
                prices[product] = (
                    prices.get(product, 0) + extract_price(option['price']))

    for option_id, option_name in products.iteritems():
        # Copy per option so the base item is never mutated.
        option_item = deepcopy(item)
        option_item['identifier'] = option_item['identifier'] + '-' + option_id
        # Only the label text in front of the first ' (' is appended.
        option_item['name'] = (
            option_item['name'] + re.findall(r'(.*) \(', option_name)[0])
        option_item['price'] = option_item['price'] + prices[option_id]
        if 'IN STOCK' not in option_name.upper():
            option_item['stock'] = 0
        yield option_item
def parse_product(self, response):
    """Yield the table rows that match the searched code and brand.

    A row matches when its MPN text equals ``search_item['code']`` and
    the page brand equals ``search_item['brand']`` (both compared
    case-insensitively).  Rows whose MPN link does not yield exactly a
    name and a sku are skipped.  Each yielded product carries a
    ``NavicoMeta`` with the searched screen size.
    """
    hxs = HtmlXPathSelector(response)
    search_item = response.meta['search_item']

    brand = ''.join(
        hxs.select('//tr[contains(th/h2/text(), "Brand")]/td/a/span/text()'
                   ).extract())

    for product in hxs.select('//tr[@class="magazinProductTableRowData"]'):
        try:
            name, sku = product.select(
                'td/a[contains(b/text(), "MPN")]/text()').extract()
        except ValueError:
            # Not exactly two text nodes: this row is not a product line.
            continue

        if not (sku.upper() == search_item['code'].upper()
                and search_item['brand'].upper() == brand.upper()):
            continue

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('sku', search_item['code'])
        loader.add_xpath('identifier', '@id')
        loader.add_value('brand', search_item['brand'])

        image_url = hxs.select(
            '//div[@class="product-img-box"]/a/img/@src').extract()
        loader.add_value('image_url', image_url[0] if image_url else '')

        category = search_item['category']
        if not category:
            # Fall back to the last breadcrumb link of the page.
            category = hxs.select(
                '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()
            category = category[-1].strip() if category else ''
        loader.add_value('category', search_item['brand'])
        loader.add_value('category', category)

        price = product.select(
            'td[contains(text(), "$")]/a/text()').extract()
        loader.add_value('price', extract_price(price[0]) if price else 0)

        in_stock = product.select(
            'td[contains(a/text(), "In Stock")]/a/text()').extract()
        if not in_stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        metadata = NavicoMeta()
        metadata['screen_size'] = search_item['screen size']
        item['metadata'] = metadata
        yield item
def parse(self, response):
    """Download the feed over SFTP and yield one product per valid line.

    The tab-separated file is fetched into ``HERE`` and decoded as cp865.
    A line must have as many values as one of the two known column
    layouts (the second has an extra ``Temp1`` column).  Any other line
    is logged, recorded in ``self.errors`` and skipped.
    """
    filename = 'IntelligentEye.txt'
    local_path = os.path.join(HERE, filename)
    password = "******"
    username = "******"

    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.get(filename, local_path)
    finally:
        # Release the SSH connection whether or not the download worked.
        transport.close()

    fields = [
        'UniqueProductCode', 'isbn', 'ean', 'upc', 'ProductName',
        'PriceGBP', 'ProductPageURL', 'Brand', 'Category', 'ImageURL',
        'Stock', 'ShippingCost', 'NetRetailPrice', 'CostPrice'
    ]
    fields2 = [
        'UniqueProductCode', 'isbn', 'ean', 'upc', 'ProductName', 'Temp1',
        'PriceGBP', 'ProductPageURL', 'Brand', 'Category', 'ImageURL',
        'Stock', 'ShippingCost', 'NetRetailPrice', 'CostPrice'
    ]
    layouts = {len(fields): fields, len(fields2): fields2}

    with open(local_path) as f:
        for i, line in enumerate(f, 1):
            line = line.decode('cp865', 'ignore')
            values = line.split('\t')

            columns = layouts.get(len(values))
            if columns is None:
                msg = "Incorrect number of fields on line: %d" % i
                self.log("[ERROR] %s" % msg)
                self.errors.append(msg)
                # Skip the line: there is no data to build a product from
                # (previously the prior line's data was reused).
                continue
            data = dict(zip(columns, values))

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier', data['UniqueProductCode'])
            loader.add_value('sku', data['UniqueProductCode'])
            loader.add_value('name', data['ProductName'])
            loader.add_value('price', extract_price(data['PriceGBP']))
            loader.add_value('url', data['ProductPageURL'])
            loader.add_value('image_url', data['ImageURL'])
            loader.add_value('brand', data['Brand'])
            loader.add_value('category', data['Category'])
            loader.add_value('shipping_cost', data['ShippingCost'])
            loader.add_value('stock', data['Stock'])

            item = loader.load_item()
            item['sku'] = item['sku'].upper()

            metadata = MusicroomMeta()
            metadata['cost_price'] = data['CostPrice'].strip()
            metadata['net_retail_price'] = data['NetRetailPrice'].strip()
            item['metadata'] = metadata
            yield item
def parse_product(self, response):
    """Yield the product on the page, retrying the page once if incomplete.

    A page without a name or without any price is requested again once
    (``meta['retried']``).  On the second attempt a page without a name
    yields nothing, and a page without a price is loaded with price
    ``'0'``.  An item that ends up without sku or name triggers the same
    single retry.  Items without a price get stock 0.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    retried = response.meta.get('retried', False)
    name_xpath = '//div[@class="boxbody"]/h1/text()[normalize-space()]'

    product_name = hxs.select(name_xpath).extract()

    price_texts = hxs.select('//div[@class="price"]/ins/b/text()').extract()
    product_price = price_texts[0] if price_texts else None
    if not product_price:
        # Fall back to the "Price=" value embedded in the raw body.
        price_match = re.search('Price=(.*)', response.body)
        product_price = price_match.group(1) if price_match else None

    if not product_name or product_price is None:
        if not retried:
            # One retry is enough; stop here so the page is not queued
            # several times and no half-empty item is built.
            yield Request(response.url, dont_filter=True,
                          meta={'retried': True},
                          callback=self.parse_product)
            return
        if not product_name:
            return
        # Already retried and still no price: load it as zero instead of
        # failing on a missing value.
        product_price = '0'

    image_url = hxs.select('//a[@class="img"]/@href').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', name_xpath)
    loader.add_value('url', response.url)
    loader.add_xpath('sku', '//*', re=r'ProductNo=(.*)')
    loader.add_xpath('identifier', '//*', re=r'ProductID=(.*)')
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_xpath('category', '//li[@class="current"]/a/text()',
                     lambda e: e[0] if e else '')
    # '.' is the thousands separator and ',' the decimal separator.
    loader.add_value(
        'price',
        extract_price(product_price.replace('.', '').replace(',', '.')))
    loader.add_xpath('brand', '//*', lambda e: e[0] if e else '',
                     re=r'Trademark=(.*)')

    item = loader.load_item()
    if not item.get('sku') or not item.get('name'):
        if not retried:
            yield Request(response.url, dont_filter=True,
                          meta={'retried': True},
                          callback=self.parse_product)
        return

    if not item.get('price'):
        item['stock'] = 0
    yield item
def parse(self, response):
    """Yield matching products for the page, then follow product links.

    A product is yielded for every entry of ``self.rows`` whose code
    equals the page's manufacturer number and whose brand occurs in the
    page title (both case-insensitive).  Afterwards every
    ``productName`` link is requested with the same ``meta``.
    """
    selector = HtmlXPathSelector(response)
    page_base = get_base_url(response)
    search_item = response.meta['search_item']

    sku = ''.join(selector.select(
        '//span[@class="product-manufno"]/text()').extract()).strip()
    name = ''.join(selector.select(
        '//h1[@id="productDetailsPageTitle"]/text()').extract())

    for row in self.rows:
        code_matches = sku.upper() == row['code'].upper().strip()
        if not (code_matches
                and row['brand'].upper() in name.upper().strip()):
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('sku', sku)
        loader.add_xpath('identifier',
                         '//input[@name="productCodePost"]/@value')
        loader.add_value('brand', get_brand(name) or search_item['brand'])

        images = selector.select(
            '//div[@id="primary_image"]/a/img/@src').extract()
        if images:
            # Sources are protocol-relative, so prefix a scheme.
            loader.add_value('image_url', 'http:' + images[0])
        else:
            loader.add_value('image_url', '')

        category = row['category']
        if not category:
            crumbs = selector.select(
                '//div[@id="breadcrumb"]/ul/li/a/text()').extract()
            category = crumbs[-1] if crumbs else ''
        loader.add_value('category', search_item['brand'])
        loader.add_value('category', category)

        price_texts = selector.select(
            '//p[contains(@class, "promo price")]/text()').extract()
        if not price_texts:
            price_texts = selector.select(
                '//p[contains(@class, "regularPrice")]/text()').extract()
        price = extract_price(price_texts[0]) if price_texts else 0
        loader.add_value('price', price)
        if not price:
            loader.add_value('stock', 0)

        item = loader.load_item()
        metadata = NavicoMeta()
        metadata['screen_size'] = row['screen size']
        item['metadata'] = metadata
        yield item

    for href in selector.select(
            '//div[@class="productName"]/a/@href').extract():
        yield Request(urljoin_rfc(page_base, href), meta=response.meta)
def parse_special_price(self, response):
    """Apply the cart's price to the product and request its reviews.

    The JSON body's ``items`` are scanned for the entry whose
    ``product_id`` equals the product identifier (compared as strings);
    its ``subMapPrice`` becomes the product price.  The first reviews
    page is then requested with ``self.parse_product_reviews``.
    """
    payload = json.loads(response.body)
    product = Product(response.meta['product'])

    for entry in payload['items']:
        same_product = (
            str(entry['product_id']) == str(product['identifier']))
        if same_product:
            product['price'] = extract_price(entry['subMapPrice'])

    reviews_url = self._get_reviews_url(product, 1)
    yield Request(
        reviews_url,
        meta={'product': product, 'page': 1},
        callback=self.parse_product_reviews,
    )
def parse_products(self, base_url, response, hxs):
    """Yield one product per priced entry of the ``ListView`` container.

    Entries without a ``PricesalesPrice`` span are skipped.  Prices use
    ``.`` as thousands and ``,`` as decimal separator.  The category is
    looked up in ``CATEGORIES`` by the fourth ``/``-separated part of the
    absolute URL, and prices below 50 get a shipping cost of ``'7.00'``.
    """
    for r in hxs.select('//div[@id="ListView"]/div'):
        try:
            price = r.select(
                './/span[@class="PricesalesPrice"]/text()').extract()[0]
        except IndexError:
            # No price => not a sellable product entry.
            continue

        loader = ProductLoader(item=Product(), selector=r)
        loader.add_xpath(
            'name',
            './/div[@class="FlexibleListBrowseV1ProductName"]/a/text()')

        url = r.select(
            './/div[@class="FlexibleListBrowseV1ProductName"]/a/@href'
        ).extract()[0]
        url = urljoin_rfc(base_url, url)
        loader.add_value('url', url)

        price = price.replace('.', '').replace(',', '.')
        loader.add_value('price', price)

        sku = r.select(
            './/div[@class="FlexibleCategoryProductSKUListView"]/text()'
        ).extract()[0]
        loader.add_value('sku', sku.replace('SKU: ', ''))

        # Unknown URL sections map to an empty category.
        loader.add_value('category', CATEGORIES.get(url.split('/')[3], ''))

        brand = ''.join(
            r.select('.//div[@class="FlexibleListViewMiddle"]/text()')
            .extract()).strip()
        loader.add_value('brand', brand)

        img_url = r.select(
            './/img[@class="browseProductImage"]/@src').extract()[0]
        loader.add_value('image_url', urljoin_rfc(base_url, img_url))
        loader.add_xpath(
            'identifier',
            './/input[@name="virtuemart_product_id[]"]/@value')

        if extract_price(price) < Decimal(50):
            loader.add_value('shipping_cost', '7.00')
        yield loader.load_item()
def parse_shipping(self, response):
    """Set the product's shipping cost, yield it and clean the basket.

    The cost is built from the ``digits,digits`` matches in the cells of
    the row mentioning "Envoyer".  After the product, a request for
    ``meta['clean']`` is yielded with ``self.parse_sync_basket``.
    """
    selector = HtmlXPathSelector(response)
    cost_cells = selector.select(
        '//tr[td[contains(text(), "Envoyer")]]/td/text()')
    cost_text = ''.join(cost_cells.re('(\d+,\d+)'))
    shipping_cost = extract_price(cost_text)

    product = response.meta['product']
    product['shipping_cost'] = shipping_cost
    yield product

    yield Request(
        response.meta['clean'],
        callback=self.parse_sync_basket,
        dont_filter=True,
        meta={'collect_products': response.meta['collect_products']},
    )
def parse_product(self, response):
    """Load the product on the page and yield it with a tiered shipping cost.

    Prices use ``.`` as thousands and ``,`` as decimal separator.  The
    identifier is the part of the URL after ``id=``.  Shipping is
    15/30/40/60 for prices below 100/251/751/1000 and 100 otherwise.
    """
    page_base = get_base_url(response)
    selector = HtmlXPathSelector(response)
    item_loader = ProductLoader(item=Product(), selector=selector)

    raw_name = selector.select('//span[@class="ArTit"]//text()').extract()[0]
    # Collapse every run of whitespace into a single space.
    item_loader.add_value('name', " ".join(raw_name.split()))
    item_loader.add_xpath(
        'sku', '//span[@id="MainContent_ngpArticolo_lblARCd_AR"]/text()')

    price_text = selector.select(
        '//span[@id="MainContent_ngpArticolo_lblPrezzoScontato"]/text()'
    )[0].extract()
    price_text = price_text.replace('.', '').replace(',', '.')
    item_loader.add_value('price', price_text)
    item_loader.add_xpath(
        'brand',
        '//span[@id="MainContent_ngpArticolo_lblARMarcaDescrizione"]/text()')
    item_loader.add_xpath(
        'category',
        '//span[@id="MainContent_ngpArticolo_lblCd_ARGruppo2"]/text()')

    image_nodes = selector.select('//div[@id="gallery"]/img/@src')
    if not image_nodes:
        image_nodes = selector.select('//div[@id="gallery"]/input/@src')
    image_src = image_nodes[0].extract()
    if not image_src.strip().endswith('noimage.png'):
        item_loader.add_value('image_url', urljoin_rfc(page_base, image_src))

    if selector.select('//div[@class="art-light-red"]'):
        item_loader.add_value('stock', 0)

    item_loader.add_value('url', response.url)
    item_loader.add_value('identifier', response.url.split('id=')[1])

    amount = extract_price(price_text)
    # (exclusive upper bound, shipping cost), checked in ascending order.
    tiers = (
        (Decimal(100), '15.00'),
        (Decimal(251), '30.00'),
        (Decimal(751), '40.00'),
        (Decimal(1000), '60.00'),
    )
    shipping_cost = '100.00'
    for upper_bound, cost in tiers:
        if amount < upper_bound:
            shipping_cost = cost
            break
    item_loader.add_value('shipping_cost', shipping_cost)

    yield item_loader.load_item()
def parse_product(self, response):
    """Yield one product per option row of every variant on the page.

    A base product is loaded from the page; each ``variant`` block adds
    its heading to the name and supplies the price, and each table row
    adds its first cell to the name.  The identifier is derived from the
    full option name.  Rows whose second cell mentions "out of stock"
    get stock 0.
    """
    selector = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=selector)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name', u'//div[@id="product-details-main"]//h1/text()')
    product_loader.add_xpath('category',
                             u'//div[@class="crumbs"]/a[2]/text()')
    product_loader.add_xpath('price',
                             u'//span[@class="blu-price"]/span/text()')
    product_loader.add_xpath('sku', '//meta[@name="bc:sku"]/@content')

    images = selector.select(u'//img[@id="product-image-main"]/@src').extract()
    if images:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), images[0]))

    brand_names = selector.select(
        u'//ul[@id="nav-top-list"]/li[contains(@class,"brands")]//a/text()'
    ).extract()
    # A brand matches when its first word equals the name's first word.
    first_word = product_loader.get_output_value('name').split()[0].lower()
    for brand_name in brand_names:
        if brand_name.split()[0].lower() == first_word:
            product_loader.add_value('brand', brand_name)

    base_product = product_loader.load_item()

    for variant in selector.select('//div[@class="variant"]'):
        heading = variant.select('.//h4/text()').extract()[0].strip()
        variant_name = base_product['name'] + ' ' + heading
        variant_price = variant.select(
            './/p[contains(@class, "price")]/span/text()').extract()[-1]

        for option_row in variant.select('.//table/tr'):
            label = option_row.select('td[1]/text()').extract()[0].strip()
            option_name = variant_name + ' ' + label
            availability = option_row.select(
                'td[2]/text()').extract()[0].strip().lower()

            # NOTE(review): spaces are stripped before 'on sale' is
            # removed, so that last replace looks like it cannot match;
            # left untouched because it defines existing identifiers.
            identifier = self.normalizename(option_name)
            for unwanted in (' ', '/', '-', '+'):
                identifier = identifier.replace(unwanted, '')
            identifier = identifier.lower().replace('on sale', '').strip()

            option_product = Product(base_product)
            option_product['price'] = extract_price(variant_price)
            option_product['name'] = option_name
            option_product['identifier'] = identifier
            if 'out of stock' in availability:
                option_product['stock'] = 0
            yield option_product
def parse(self, response):
    """Load the product on a detail page and yield it.

    The identifier is the second-to-last ``/``-separated part of the URL
    and the sku comes from ``response.meta['sku']``.  A missing price is
    loaded as ``'0'``.
    """
    selector = HtmlXPathSelector(response)
    # Not used below; the call is retained so behaviour stays unchanged.
    base_url = get_base_url(response)
    request_meta = response.meta

    item_loader = ProductLoader(response=response, item=Product())
    url_parts = response.url.split('/')
    item_loader.add_value('identifier', url_parts[-2])

    titles = selector.select('//h1[@class="prod_det"]/text()').extract()
    item_loader.add_value('name', titles[0])
    item_loader.add_value('sku', request_meta['sku'])

    price_texts = selector.select(
        '//strong[@class="curPrice"]/text()').extract()
    if price_texts:
        price_text = price_texts[0]
    else:
        price_text = '0'
    item_loader.add_value('price', extract_price(price_text))
    item_loader.add_value('url', response.url)
    yield item_loader.load_item()
def parse_product(self, response):
    """Load the product on a detail page and yield it with its metadata.

    The special price is preferred over the regular price; with neither
    present the price is 0 and the stock is set to 0.  The product code
    is used as both sku and identifier, and the brand passed in
    ``response.meta`` doubles as category.  The screen size metadata is
    looked up in ``self.force4_products`` by the upper-cased code.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_name = hxs.select(
        '//div[@class="product-name"]/h1/text()')[0].extract()

    price_texts = hxs.select(
        '//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price_texts:
        price_texts = hxs.select(
            '//span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
    # Never hand the empty result list itself to extract_price.
    product_price = extract_price(price_texts[0]) if price_texts else 0

    product_code = hxs.select('//div[@class="product-code"]/text()').re(
        'Product code: (.*)')[0]
    image_url = hxs.select(
        '//img[@class="product-img-img"]/@src').extract()
    brand = response.meta.get('brand', '')

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', product_code)
    loader.add_value('identifier', product_code)
    loader.add_value('brand', brand)
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('category', brand)
    loader.add_value('price', product_price)
    if not product_price:
        loader.add_value('stock', 0)

    product = loader.load_item()
    metadata = NavicoMeta()
    metadata['screen_size'] = self.force4_products.get(
        product_code.strip().upper(), '')
    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Load the product on a detail page and yield it.

    The identifier is the "Codice" value of the product box.  The sku is
    the identifier when it is in ``self.ean_codes``; otherwise it is the
    value of the first ``self.model_codes`` key (longer than three
    characters) found in the lower-cased name, or empty.  Prices below
    100 get a shipping cost of ``'11.00'``.
    """
    page_base = get_base_url(response)
    selector = HtmlXPathSelector(response)

    crumb_texts = selector.select(
        u'//div[@class="breadcrumbs"]/ul/li//text()').extract()
    category = u' > '.join(
        [text.strip() for text in crumb_texts if len(text.strip()) > 1])

    item_loader = ProductLoader(item=Product(), response=response)
    item_loader.add_value('url', response.url)

    name = selector.select(
        u'//div[contains(@class,"product-name")]/h1/text()'
    )[0].extract().strip()
    item_loader.add_value('name', name)
    item_loader.add_value('category', category)

    identifier = selector.select(u'//div[@class="product-shop"]').re(
        u'Codice: (.*?)<')[0].strip()
    item_loader.add_value('identifier', identifier)

    if identifier in self.ean_codes:
        sku = identifier
    else:
        sku = ''
        for model in self.model_codes.keys():
            if len(model) > 3 and model in name.lower():
                sku = self.model_codes[model]
                break
    item_loader.add_value('sku', sku)

    euro_amount = selector.select(u'//span[@class="price"]/text()').re(
        u'\u20ac(.*)')[0].strip()
    # '.' is the thousands separator and ',' the decimal separator.
    price_text = euro_amount.replace(u'.', u'').replace(u',', u'.')
    item_loader.add_value('price', price_text)

    image_sources = selector.select(
        u'//a[@class="MagicZoomPlus"]/img/@src').extract()
    if image_sources:
        item_loader.add_value(
            'image_url', urljoin_rfc(page_base, image_sources[0]))

    if extract_price(price_text) < Decimal(100):
        item_loader.add_value('shipping_cost', '11.00')
    yield item_loader.load_item()
def parse_product(self, response):
    """Load the book on a detail page and yield it.

    The category is always ``'Carti'`` and the identifier is the URL
    segment between ``p/`` and the last ``/``.  The red price is
    preferred over the "Pret:" row.  Stock is 0 unless the availability
    row contains "IN STOC", and prices below 150 get a shipping cost of
    11.99.
    """
    selector = HtmlXPathSelector(response)
    item_loader = ProductLoader(item=Product(), response=response)
    item_loader.add_xpath('name', '//div[@class="css_carte_titlu"]/h1/b/text()')
    item_loader.add_value('url', response.url)

    publisher = selector.select(
        '//div[@class="produs_campuri" and b/text()="Editura:"]/a/text()'
    ).extract()
    item_loader.add_value('brand', publisher)
    item_loader.add_value('category', 'Carti')

    isbn_parts = selector.select(
        '//div[@class="produs_campuri" and b/text()="ISBN:"]/text()'
    ).extract()
    item_loader.add_value('sku', ''.join(isbn_parts).strip())
    item_loader.add_value('identifier',
                          re.findall('p/(.*)/', response.url)[0])

    thumbnails = selector.select('//a[@rel="thumbnail"]/img/@src').extract()
    if thumbnails:
        item_loader.add_value('image_url', thumbnails[0])

    price_text = ''.join(selector.select(
        '//tr[contains(td/b/text(), "ul nostru:")]/td/b[@class="red"]/text()'
    ).extract()).strip()
    if not price_text:
        price_text = ''.join(selector.select(
            '//tr[td/b/text()="Pret:"]/td/text()').extract()).strip()
    item_loader.add_value('price', extract_price(price_text))

    availability = ''.join(selector.select(
        '//tr[td/b/text()="Disponibilitate:"]/td/text()'
    ).extract()).strip().upper()
    if 'IN STOC' not in availability:
        item_loader.add_value('stock', 0)

    if item_loader.get_output_value('price') < 150:
        item_loader.add_value('shipping_cost', 11.99)
    yield item_loader.load_item()
def parse_addcart(self, response):
    """Fill identifier and price from the first cart row matching the sku.

    Only table rows holding an ``ItemId`` input are considered.  The
    first row whose cell text contains the item's sku
    (case-insensitively) supplies the identifier and the sell price; the
    item is then yielded and the scan stops.  Without a match nothing is
    yielded.
    """
    selector = HtmlXPathSelector(response)
    item = response.meta['item']

    cart_rows = selector.select(
        '//table/tr[td/input[contains(@name, "ItemId")]]')
    for cart_row in cart_rows:
        cell_text = ''.join(cart_row.select(
            'td[input[contains(@name, "ItemId")]]/text()'
        ).extract()).strip().upper()
        if item['sku'].upper() not in cell_text:
            continue

        item['identifier'] = cart_row.select(
            'td[input[contains(@name, "ItemId")]]/input/@value'
        ).extract()[-1]

        sell_prices = cart_row.select(
            'td[@class="sellprice"]/text()').extract()
        if sell_prices:
            item['price'] = extract_price(sell_prices[-1])
        else:
            item['price'] = '0'

        yield item
        break
def parse(self, response):
    """Yield one product per CSV row, skipping placeholder manufacturers.

    Rows whose stripped, lower-cased ``Manufacturer`` is ``wse``,
    ``unknown`` or ``unknowns`` are ignored.  The manufacturer is used as
    both brand and category, the price is rounded to two decimals and
    the ``GTIN`` column is stored in an ``ErfMeta``.
    """
    ignored_brands = ('wse', 'unknown', 'unknowns')

    for record in csv.DictReader(StringIO(response.body)):
        manufacturer = record['Manufacturer']
        if manufacturer.strip().lower() in ignored_brands:
            continue

        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_value('identifier', record['SKU'].lower())
        item_loader.add_value('sku', record['SKU'])
        item_loader.add_value('brand', manufacturer)
        item_loader.add_value('category', manufacturer)
        item_loader.add_value('name', record['Name'].decode('utf-8'))
        item_loader.add_value(
            'price', round(extract_price(record['Price']), 2))

        product = item_loader.load_item()
        gtin_meta = ErfMeta()
        gtin_meta['gtin'] = record['GTIN']
        product['metadata'] = gtin_meta
        yield product
def parse_product(self, response):
    """Load the product on a detail page and yield it.

    The brand is the first "Brand" filter entry contained
    (case-insensitively) in the product name, or empty.  The ``product``
    input value is used as both sku and identifier.  Stock is 0 when the
    page shows an out-of-stock notice or the loaded price is falsy.
    """
    selector = HtmlXPathSelector(response)
    item_loader = ProductLoader(item=Product(), response=response)
    item_loader.add_xpath('name', '//meta[@property="og:title"]/@content')
    item_loader.add_value('url', response.url)

    matched_brand = ''
    brand_names = selector.select(
        '//dl[dt/text()="Brand"]//li/@data-text').extract()
    for candidate in brand_names:
        if candidate.upper() in item_loader.get_output_value('name').upper():
            matched_brand = candidate
            break
    item_loader.add_value('brand', matched_brand)

    item_loader.add_value('category', selector.select(
        '//div[@class="breadcrumbs"]//li[not(@class="home")]/a/text()'
    ).extract())

    product_id = selector.select('//input[@name="product"]/@value').extract()
    item_loader.add_value('sku', product_id)
    item_loader.add_value('identifier', product_id)

    big_images = selector.select('//img[@class="big"]/@src').extract()
    if big_images:
        item_loader.add_value('image_url', big_images[0])

    price_texts = selector.select(
        '//div[@class="product-shop"]//span[@class="price-including-tax"]//span[@class="price"]/text()'
    ).extract()
    if price_texts:
        price = extract_price(price_texts[0])
    else:
        price = 0
    item_loader.add_value('price', price)

    sold_out = selector.select('//p[@class="availability out-of-stock"]')
    if sold_out or not item_loader.get_output_value('price'):
        item_loader.add_value('stock', 0)
    yield item_loader.load_item()