def yield_product(self, product):
    """Attach Sigma Sport metadata to ``product`` and add low-price shipping.

    A fresh ``SigmaSportMeta`` holding the value returned by
    ``extract_exc_vat_price(product)`` is stored under
    ``product['metadata']``.  When ``product['price']`` is below 15 a
    ``shipping_cost`` of 1.99 is set as well.

    Returns the same ``product`` object, mutated in place.
    """
    meta = SigmaSportMeta()
    meta['price_exc_vat'] = extract_exc_vat_price(product)
    product['metadata'] = meta

    is_below_threshold = product['price'] < 15
    if is_below_threshold:
        product['shipping_cost'] = 1.99

    return product
def preprocess_product(self, item):
    """Decorate ``item`` with ex-VAT metadata and a shipping cost for cheap items.

    ``item['metadata']`` receives a new ``SigmaSportMeta`` whose
    ``price_exc_vat`` comes from ``extract_exc_vat_price(item)``.  Items whose
    price, converted with ``Decimal``, is below 9 get ``shipping_cost = 2``.

    Returns the mutated ``item``.
    """
    meta = SigmaSportMeta()
    meta['price_exc_vat'] = extract_exc_vat_price(item)
    item['metadata'] = meta

    price = Decimal(item['price'])
    if price < 9:
        item['shipping_cost'] = 2

    return item
def parse_secondary(self, response):
    """Re-yield the parent spider's results, tagging products with metadata.

    Every object produced by the parent ``parse_secondary`` is passed on.
    Objects that are ``Product`` instances first receive a
    ``SigmaSportMeta`` carrying ``price_exc_vat``; anything else is yielded
    untouched.
    """
    parent_results = super(EvansCyclesComSpider, self).parse_secondary(response)
    for result in parent_results:
        if not isinstance(result, Product):
            yield result
            continue

        meta = SigmaSportMeta()
        meta['price_exc_vat'] = extract_exc_vat_price(result)
        result['metadata'] = meta
        yield result
def preprocess_product(self, item):
    """Normalise a missing price, add shipping for cheap items, attach metadata.

    * A falsy ``item['price']`` is replaced by the string ``'0.00'``.
    * Otherwise, when ``extract_price`` of the price is below 9, a
      ``shipping_cost`` of 1.99 is set.

    The ex-VAT price is computed after that adjustment and stored in a new
    ``SigmaSportMeta`` under ``item['metadata']``.  Returns the mutated item.
    """
    meta = SigmaSportMeta()

    if item['price']:
        if extract_price(item['price']) < 9:
            item['shipping_cost'] = 1.99
    else:
        item['price'] = '0.00'

    # Computed only now so that it sees the defaulted price.
    meta['price_exc_vat'] = extract_exc_vat_price(item)
    item['metadata'] = meta
    return item
def parse_item(self, response):
    '''Yield one product per ``skuArray.push({...})`` block in the page source.

    The page embeds each SKU as an inline JavaScript object, e.g.:

        skuArray.push({
            productexternalid: 72833,
            colour: 'Light Grey/Grey',
            size: '49',
            skuNopId: 91684,
            skuId: 227272,
            price: '£90.00',
            priceAsDecimal: 90.0000,
            stockquantity: 0,
            preorder: true,
            outofstock: true,
            issubscribed: false,
            availableDate: 'Due in 02/07/2015'
        });

    The attribute lines are parsed one by one into a dict.  Name, category,
    brand and image are taken from the HTML and shared by every SKU.
    '''
    # Function-scope import keeps this block self-contained.
    import ast

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products_data = []
    current_product = None  # a dict while inside a skuArray.push({...}) block
    for line in response.body.split('\n'):
        if 'skuArray.push({' in line:
            current_product = {}
            continue
        if current_product is None:
            continue
        if '});' in line:
            products_data.append(current_product)
            current_product = None
            continue

        # Split on the first colon only, so values that themselves contain
        # a colon are kept whole.
        key, sep, raw_value = line.partition(':')
        if not sep:
            # Not a "key: value" line (e.g. blank) - nothing to record.
            continue
        raw_value = raw_value.strip().rstrip(',').strip()

        # SECURITY: this text comes straight from a remote page, so it is
        # parsed as a literal rather than executed with eval().
        if raw_value == 'true':
            value = True
        elif raw_value == 'false':
            value = False
        else:
            value = ast.literal_eval(raw_value)
        current_product[key.strip()] = value

    main_name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0].strip()
    categories = hxs.select('//div[@id="breadcrumb"]//span[@itemprop="title"]/text()').extract()[1:]

    for p in products_data:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src',
                         lambda a: urljoin_rfc(base_url, a[0]) if a else '')
        loader.add_value('identifier', p['skuId'])
        loader.add_value('sku', p['productexternalid'])
        loader.add_value('price', p['priceAsDecimal'])
        loader.add_value('stock', p['stockquantity'])
        loader.add_value('category', categories)
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('url', response.url)
        loader.add_value('name', main_name + ' - ' + p['colour'] + ' - ' + p['size'])
        product = loader.load_item()

        metadata = SigmaSportMeta()
        metadata['price_exc_vat'] = extract_exc_vat_price(product)
        product['metadata'] = metadata
        yield product
def parse_product_data(self, response):
    """Build a product from the JSON ``productItemDetails`` payload.

    If the body is not valid JSON the same URL is requested again, with the
    attempt counter kept in ``response.meta['retry']``.  Once the counter
    reaches 10 the failure is logged and nothing is yielded.

    A parsed product is yielded only once per identifier (tracked in
    ``self._identifiers_viewed``).  When ``self.simple_run`` is set it is
    yielded only if its identifier is in ``self.matched_identifiers``.
    """
    s = response.body
    try:
        content = unicode(s, 'utf-8', errors='replace')
    except (LookupError, TypeError):
        content = unicode(s, errors='replace')

    try:
        data = json.loads(content)
    except ValueError:
        meta = response.meta
        retry = meta.get('retry', 1) + 1
        if retry < 10:
            meta['retry'] = retry
            self.log('WARNING - Retry #{} {}'.format(retry, response.meta.get('url')))
            yield Request(response.url,
                          meta=meta,
                          callback=self.parse_product_data,
                          dont_filter=True)
        else:
            # Only log here: a callback may yield requests or items, and a
            # bare list is neither.
            self.log('ERROR - Maximum retry count reached! {} {}'.format(
                response.meta.get('url'), response.body))
        return

    item = data.get('productItemDetails')
    if not item:
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('name', item.get('name'))
    price = extract_price(item.get('nowPriceRaw'))
    if price:
        # Shipping is charged only below 30; kept under the price guard so
        # `shipping` is never referenced while unbound.
        shipping = 2.99 if price < 30 else ''
        product_loader.add_value('price', price)
        product_loader.add_value('shipping_cost', shipping)
    product_loader.add_value('category', response.meta.get('category'))
    product_loader.add_value('url', response.meta.get('url'))
    # product_loader.add_value('image_url', image_url)
    product_loader.add_value('brand', response.meta.get('brand'))
    product_loader.add_value('sku', item.get('itemCode'))
    product_loader.add_value('identifier', item.get('itemCode'))
    product = product_loader.load_item()

    metadata = SigmaSportMeta()
    metadata['price_exc_vat'] = extract_exc_vat_price(product)
    product['metadata'] = metadata

    identifier = product['identifier']
    if identifier in self._identifiers_viewed:
        return
    if self.simple_run and identifier not in self.matched_identifiers:
        return
    self._identifiers_viewed.add(identifier)
    yield product
def parse_options(self, response):
    """Complete the partially-built item in ``response.meta['item']`` for one variant.

    Name, identifier, sku and price come from the JSON
    ``productItemDetails`` object; stock is set to 0 when ``inStock`` is
    falsy.  Products priced below 30 get a ``shipping_cost`` of 2.99.

    The product is yielded at most once per identifier.  When
    ``self.simple_run`` is set it is yielded only if the identifier is in
    ``self.matched_identifiers``.
    """
    details = json.loads(response.body_as_unicode())['productItemDetails']

    base_item = Product(response.meta['item'])
    loader = ProductLoader(item=base_item, response=response)
    loader.add_value('name', details['name'])
    for field in ('identifier', 'sku'):
        loader.add_value(field, details['itemCode'])
    loader.add_value('price', details['nowPriceRaw'])
    if not details['inStock']:
        loader.add_value('stock', 0)
    product = loader.load_item()

    if product['price'] < 30:
        product['shipping_cost'] = 2.99

    meta = SigmaSportMeta()
    meta['price_exc_vat'] = extract_exc_vat_price(product)
    product['metadata'] = meta

    identifier = product['identifier']
    if identifier in self._identifiers_viewed:
        return
    if self.simple_run and identifier not in self.matched_identifiers:
        return
    self._identifiers_viewed.add(identifier)
    yield product
def preprocess_product(self, item):
    """Store the ex-VAT price of ``item`` in a new ``SigmaSportMeta``.

    The metadata object is assigned to ``item['metadata']`` and the same,
    mutated ``item`` is returned.
    """
    meta = SigmaSportMeta()
    meta['price_exc_vat'] = extract_exc_vat_price(item)
    item['metadata'] = meta
    return item
def parse_product(self, response):
    """Parse a product page, fanning out to ``parse_options`` for variants.

    If the page source contains a parseable ``multiVariantArray`` and the
    variant selection widget is present, one request per variant is issued
    to the ``GetProductItemDetails`` servlet and this callback yields no
    item itself.

    Otherwise a single product is built from the page.  Pages without a
    ``productId`` input or without any price are logged and skipped.  The
    product is yielded at most once per identifier
    (``self._identifiers_viewed``).
    """
    try:
        category = response.xpath(
            '//nav[@id="breadcrumb"]//ul/li[@class="penultimateStep"]/a/text()'
        ).extract()[0].strip()
    except IndexError:
        category = ''

    image_url = response.xpath('//meta[@property="og:image"]/@content').extract()
    if image_url:
        image_url = image_url[0].replace('merchzone', 'main')
    brand = response.xpath('//div[@class="hproduct"]/span[@class="brand"]/text()').extract()

    # `variants` stays None when the array is missing or is not valid JSON.
    variants = None
    options = re.findall('multiVariantArray:(.*),', response.body)
    try:
        variants = json.loads(options[0].strip())
    except (IndexError, ValueError):
        pass

    if variants is not None and response.xpath(
            '//div[@class="productOptions"]//div[contains(@id, "itemVariantSelectionWidget")]'):
        parameters = {
            'action': 'getProductItemDetails',
            'langId': '-1',
            'storeId': '10001'
        }
        msg = {
            'productId': response.xpath('//input[@name="productId"]/@value').extract()[0].encode(),
            'catalogId': response.xpath('//input[@name="catalogId"]/@value').extract()[0].encode(),
            'categoryId': response.xpath('//input[@name="categoryId"]/@value').extract()[0].encode()
        }
        option_url = 'http://www.halfords.com/webapp/wcs/stores/servlet/GetProductItemDetails'

        # The fields shared by every variant do not depend on the loop
        # variable, so the base item is built once; each request gets its
        # own copy.
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('image_url', image_url)
        product = product_loader.load_item()

        for variant in variants:
            msg['catEntryId'] = variant['itemId']
            parameters['msg'] = msg
            url = option_url
            for par in parameters:
                url = add_or_replace_parameter(url, par, parameters[par])
            yield Request(url, meta={'item': Product(product)}, callback=self.parse_options)
        return

    identifier = response.xpath('//input[@name="productId"]/@value').extract()
    if not identifier:
        self.log('No identifier found for %s' % response.url)
        return
    identifier = identifier.pop()

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('sku', identifier)
    product_loader.add_xpath('name', '//h1[@class="productDisplayTitle"]/text()')

    price = response.xpath('//div[@class="productDisplayPricing"]'
                           '//div[@class="pricewrapper"]/div[@class="total"]'
                           '/span[@class="totalPrice"]/text()').extract()
    if not price:
        # Fall back to the price embedded in the page's inline script.
        price = re.findall(r"price:\s?'£(.+?)'", response.body)
    if not price:
        self.log('WARNING: No price can be found, ignoring product %s' % response.url)
        return

    price = extract_price(price[0])
    if price:
        # Shipping is charged only below 30; kept under the price guard so
        # `shipping` is never referenced while unbound.
        shipping = 2.99 if price < 30 else ''
        product_loader.add_value('price', price)
        product_loader.add_value('shipping_cost', shipping)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', category)
    product_loader.add_value('image_url', image_url)
    product_loader.add_value('brand', brand)
    if response.xpath('//div[@id="productBuyable"][@class="hidden"]'):
        product_loader.add_value('stock', 0)
    product = product_loader.load_item()

    metadata = SigmaSportMeta()
    metadata['price_exc_vat'] = extract_exc_vat_price(product)
    product['metadata'] = metadata

    if product['identifier'] not in self._identifiers_viewed:
        #if self.simple_run and (product['identifier'] not in self.matched_identifiers):
            #return
        self._identifiers_viewed.add(product['identifier'])
        yield product
def parse(self, response):
    """Build products from the client's SFTP feed, limited to a listed id set.

    Steps:

    1. Download the latest ``SigmaFirst1000Products*.xlsx`` and
       ``feedspark*.tsv`` files (picked by ``get_last_file``) over SFTP.
    2. Read the product ids from the third column of ``Sheet1`` (first row
       skipped, ``-GB`` removed).
    3. For every feed row whose normalised id is in that set, build a
       product with ``SigmaSportMeta`` metadata.
    4. Per id, keep a product only if its price differs from the prices
       already collected for that id, then yield everything kept.
    """
    # NOTE(review): credentials are hard-coded here - consider moving them
    # to configuration.
    password = "******"
    username = "******"
    xlsx_path = HERE + '/SigmaFirst1000Products.xlsx'
    feed_path = HERE + '/feedspark.tsv'

    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        files = sftp.listdir_attr()

        last = get_last_file("SigmaFirst1000Products", "xlsx", files)
        sftp.get(last.filename, xlsx_path)

        last = get_last_file("feedspark", "tsv", files)
        sftp.get(last.filename, feed_path)
    finally:
        # Closing the transport also closes the SFTP channel opened on it,
        # so the connection is released even if a download fails.
        transport.close()

    wb = xlrd.open_workbook(xlsx_path)
    sh = wb.sheet_by_name('Sheet1')

    product_ids = {}
    for rownum in xrange(1, sh.nrows):  # row 0 is skipped
        cell = sh.row_slice(rownum)[2]
        product_id = cell.value
        if cell.ctype == 2:
            # Presumably xlrd's numeric cell type: drop the float part.
            product_id = str(int(cell.value))
        product_ids[product_id.replace('-GB', '')] = []

    with open(feed_path) as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            product_id = row['id'].replace('-GB', '').upper().strip()
            if product_id not in product_ids:
                continue

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('sku', row['code'].replace('-gb', '').replace('-GB', ''))
            for category in row['mapped_category'].split('>'):
                loader.add_value('category', category.strip().encode('utf-8'))
            loader.add_value('brand', row['brand'].encode('utf-8'))

            name = [row['title']]
            if row['colour']:
                name.append(row['colour'])
            if row['size']:
                name.append(row['size'])
            try:
                loader.add_value('name', " ".join(name).encode('utf-8'))
            except UnicodeError:
                loader.add_value('name', " ".join(name).decode('utf-8'))

            loader.add_value('price', row['price'])
            loader.add_value('image_url', row['image_link'])
            loader.add_value('url', row['link'])
            loader.add_value('identifier', row['id'])
            if row['availability'].lower() == 'out of stock':
                loader.add_value('stock', 0)
            if loader.get_output_value('price') < 10:
                loader.add_value('shipping_cost', 1.99)
            product = loader.load_item()

            metadata = SigmaSportMeta()
            metadata['mpn'] = row['mpn']
            metadata['item_group_number'] = row['item_group_id']
            metadata['cost_price'] = row.get('cost_price', '0.00').replace(' GBP', '')
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            metadata['sku_gb'] = str(product['sku']) + '-GB' if product.get('sku', None) else ''
            product['metadata'] = metadata

            # Keep this product only if no product already collected for
            # the same id has the same price.  The comparison must use the
            # prices of the collected products, not the new product's own.
            collected_products = product_ids[product_id]
            prices = [collected['price'] for collected in collected_products]
            if product['price'] not in prices:
                collected_products.append(product)

    # Emit everything that was kept, grouped by id.
    for products in product_ids.itervalues():
        for product in products:
            yield product
def parse_product(self, response):
    """Yield one product per option on the page, or a single product if none.

    Pages without a ``data-product-id`` on the product container are
    skipped.  Each option's identifier is ``<product id>-<option id>``; a
    page without options yields one product identified as
    ``<product id>-0``.  Every product carries a ``SigmaSportMeta`` with
    ``price_exc_vat``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_id = response.xpath(
        '//div[contains(@class, "productContainer")]/@data-product-id'
    ).extract_first()
    if not product_id:
        return

    image_url = hxs.select('//div[@id="thumbnails"]/div/a/img/@src').extract()
    category = response.css('.breadcrumb span::text').extract()[-2]
    title_parts = hxs.select('//h1[@class="product-title"]/text()').extract()
    main_name = ''.join(title_parts)
    brand = hxs.select('//li[@class="brand"]/a/span/text()').re(r'About (\w+)')
    sku_parts = hxs.select('//div[@class="stockCode"]/text()').extract()
    sku = ''.join(sku_parts).strip()

    options = response.xpath(
        '//div[contains(@id, "productOption")]/ul[@role="menu"]/li')

    if options:
        for option in options:
            loader = ProductLoader(item=Product(), selector=option)
            option_id = option.select('@data-id').extract()[0]
            loader.add_value('brand', brand)
            loader.add_value('category', category)

            option_title = ''.join(
                option.select('a/span[@class="title"]/text()').extract())
            if option_title == main_name:
                full_name = main_name
            else:
                # Prefix the option with its group heading when one exists.
                group_name = option.xpath(
                    'preceding-sibling::div[1]/strong/text()').extract_first()
                if group_name:
                    option_title = group_name + ' ' + option_title
                full_name = ' '.join((main_name, option_title))
            loader.add_value('name', full_name)
            loader.add_value('url', response.url)
            loader.add_value('identifier', product_id + '-' + option_id)
            # product_loader.add_value('brand', brand)
            loader.add_value('sku', sku)

            stock = option.select('./@data-stock').extract()
            # The RRP is extracted but not used further in this method.
            rrp = option.select('./@data-rrp').extract()
            rrp = str(extract_price(rrp[0])) if rrp else ''

            price = option.select('./@data-merlin-price').extract()
            if price:
                price = '{0:.2f}'.format(float(price[0]))
            loader.add_value('price', price)

            if not (stock and stock[0] == 'inStock'):
                loader.add_value('stock', 0)
            loader.add_value('image_url', image_url)

            product = loader.load_item()
            meta = SigmaSportMeta()
            meta['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = meta
            yield product
        return

    # No options on the page: emit the base product on its own.
    identifier = product_id + '-0'
    stock = response.css('.productContainer .inStock')
    # The RRP is extracted but not used further in this method.
    rpp = hxs.select(
        '//div[@class="productContainer"]//span[@class="rrp"]/span/span[@class="price"]/text()'
    ).extract()
    price = hxs.select('//meta[@itemprop="price"]/@content').extract()

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('identifier', identifier)
    loader.add_value('category', category)
    loader.add_value('name', main_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', sku)
    loader.add_value('price', price)
    loader.add_value('brand', brand)
    loader.add_value('image_url', image_url)
    if not stock:
        loader.add_value('stock', 0)

    product = loader.load_item()
    meta = SigmaSportMeta()
    meta['price_exc_vat'] = extract_exc_vat_price(product)
    product['metadata'] = meta
    yield product
def parse_product(self, response):
    """Yield one product per combination of the page's option groups.

    Option groups are read from the radio-style
    (``productOptionViewRadio``) and select-style
    (``productOptionViewSelect``) blocks.  Each combination appends the
    option names to the product name and the option values, joined with
    ``_``, to the identifier.  A page without option groups yields the base
    product once.

    Pages without a ``product_id`` form input are skipped silently.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    image_url = hxs.select('//a[@itemprop="image"]/@href').extract()
    try:
        product_identifier = hxs.select(
            '//*[@id="productDetailsAddToCartForm"]/input[@name="product_id"]/@value'
        ).extract()[0].strip()
    except IndexError:
        # No add-to-cart form: not a product page.
        return

    product_name = hxs.select(
        '//*[@id="ProductDetails"]/h1/text()').extract()[0].strip()
    category = hxs.select(
        '//ul[@class="breadcrumbs"]//a/text()').extract()[1:]
    brand = hxs.select(
        '//*[@id="ProductDetails"]/div[@itemprop="brand"]//span/text()'
    ).extract()
    brand = brand[0].strip() if brand else ''
    product_price = hxs.select(
        '//span[@class="ProductPrice VariationProductPrice"]/text()'
    ).extract()[0]
    product_price = extract_price(product_price)

    options = []
    for group in hxs.select('//div[@class="productOptionViewRadio"]'):
        values = group.select('.//li/label/input/@value').extract()
        titles = group.select('.//li/label/span/text()').extract()
        opts = [{'identifier': value, 'name': title}
                for value, title in zip(values, titles)]
        if opts:
            options.append(opts)

    for group in hxs.select('//div[@class="productOptionViewSelect"]'):
        values = group.select('./select/option/@value').extract()
        titles = group.select('./select/option/text()').extract()
        # Options with an empty value are skipped.
        opts = [{'identifier': value, 'name': title}
                for value, title in zip(values, titles) if value]
        if opts:
            options.append(opts)

    # With no option groups, itertools.product() yields exactly one empty
    # combination, so the base product goes through the same code path.
    for combination in itertools.product(*options):
        name = product_name
        identifier = product_identifier
        for option in combination:
            name += ' ' + option['name']
            identifier += '_' + option['identifier']

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value(
                'image_url', urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('price', product_price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product = product_loader.load_item()

        metadata = SigmaSportMeta()
        metadata['price_exc_vat'] = extract_exc_vat_price(product)
        product['metadata'] = metadata
        yield product
def parse_product(self, response):
    """Parse a product page; follow its variation options when there are any.

    When the ``productId`` element is missing, the page is requested again
    until ``response.meta['retry']`` reaches 10, with
    ``dont_merge_cookies`` set on the retry.

    When the first variation form has options, one ``variations.json``
    request per non-empty option is issued, handled by
    ``parse_product_option``, with the base product passed in ``meta``.
    Otherwise the base product is yielded directly with its
    ``SigmaSportMeta``.

    Shipping is 1.99 for prices below 10 and 0 otherwise.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    image_url = hxs.select(
        '//div[@class="product-image main-product-image"]//img[@class="product-img"]/@src'
    ).extract()

    try:
        product_identifier = hxs.select(
            '//*[@id="productId"]/@value').extract()[0].strip()
    except IndexError:
        self.log('Error! No product ID on the page! {}'.format(response.url))
        retry = response.meta.get('retry', 0)
        if retry < 10:
            meta = response.meta.copy()
            meta['retry'] = retry + 1
            meta['dont_merge_cookies'] = True
            yield Request(response.url,
                          meta=meta,
                          callback=self.parse_product,
                          dont_filter=True)
        return

    product_name = hxs.select(
        '//div[@class="product-title-wrap"]/h1/text()').extract()[0].strip()
    # The category is derived from the first path segment of the URL.
    category = response.url.split('/')[3].replace('-', ' ').title()
    brand = response.xpath(
        '//th[contains(text(),"Brand:")]/../td//text()[normalize-space(.)!=""]'
    ).extract()
    brand = brand[0].strip() if brand else ''
    product_price = response.css('span.price::text').extract_first()
    product_price = extract_price(product_price)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('sku', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
    product_loader.add_value('price', product_price)
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', brand)
    product_loader.add_value('category', category)
    if product_price < 10:
        product_loader.add_value('shipping_cost', 1.99)
    else:
        product_loader.add_value('shipping_cost', 0)
    product = product_loader.load_item()

    variations = hxs.select(
        '//div[@class="variation-dropdowns fl"]/form//input[@name="variation"]/@value'
    ).extract()
    product_options = hxs.select(
        '//div[@class="variation-dropdowns fl"]/form[1]//select/option/@value'
    ).extract()

    if product_options:
        for option_id in product_options:
            if not option_id:
                # Options with an empty value are skipped.
                continue
            url = ('http://www.probikekit.co.uk/variations.json?productId=' +
                   product_identifier + '&selected=1&variation1=' +
                   variations[0] + '&option1=' + option_id +
                   '&switchcurrency=GBP')
            yield Request(url,
                          meta={'product': product, 'cur_variation': 1},
                          callback=self.parse_product_option)
    else:
        metadata = SigmaSportMeta()
        metadata['price_exc_vat'] = extract_exc_vat_price(product)
        product['metadata'] = metadata
        yield product
def parse_product_option(self, response):
    """Walk the ``variations.json`` tree one level and emit leaf products.

    ``response.meta`` carries the base ``product`` and ``cur_variation``,
    the number of variation levels already fixed by the request.

    * No variations in the payload: log it, retry the same URL until
      ``meta['retry']`` reaches 10, and stop.
    * ``cur_variation`` equals the number of variations: every level is
      selected, so a copy of the base product is yielded with the option
      names appended, the selected product id as identifier, the returned
      price, shipping (1.99 below 10, else 0) and metadata.
    * Otherwise: issue one request per option of the next level, keeping
      the levels chosen so far.
    """
    product_data = json.loads(response.body)

    if 'variations' not in product_data or not product_data['variations']:
        self.log('Error! No options on the page! {}'.format(response.url))
        retry = response.meta.get('retry', 0)
        if retry < 10:
            meta = response.meta.copy()
            meta['retry'] = retry + 1
            yield Request(response.url,
                          meta=meta,
                          callback=self.parse_product_option,
                          dont_filter=True)
        # Nothing below can work without variations, so stop here instead
        # of falling through.
        return

    product = response.meta['product']
    cur_variation = response.meta['cur_variation']
    variations = product_data['variations']

    if cur_variation == len(variations):
        name = ' '.join(
            variation['options'][0]['name'] for variation in variations)
        # Placeholder option names add nothing to the product name.
        name = name.replace('One Colour', '').replace('One Option', '')
        name = ' '.join(name.split())

        new_item = copy.deepcopy(product)
        new_item['name'] += ' ' + name
        new_item['identifier'] = str(product_data['selected-product-id'])
        new_item['price'] = extract_price(
            product_data['price'].split(';')[1])
        if new_item['price'] < 10:
            new_item['shipping_cost'] = 1.99
        else:
            new_item['shipping_cost'] = 0

        images = product_data['images']
        # The third image entry is the one used; keep the base image when
        # fewer entries are returned.
        if images and len(images) > 2:
            new_item['image_url'] = urljoin_rfc('http://s1.thcdn.com/',
                                                images[2]['name'])

        metadata = SigmaSportMeta()
        metadata['price_exc_vat'] = extract_exc_vat_price(new_item)
        new_item['metadata'] = metadata
        yield new_item
        return

    url_prefix = 'http://www.probikekit.co.uk/variations.json?productId='
    url_prefix += str(product['identifier']) + '&selected=' + str(
        cur_variation + 1) + '&switchcurrency=GBP'

    # Re-send the levels that are already fixed.
    fixed_levels = variations[:cur_variation]
    for index, variation in enumerate(fixed_levels, 1):
        url_prefix += '&variation' + str(index) + '=' + str(variation['id'])
        url_prefix += '&option' + str(index) + '=' + str(
            variation['options'][0]['id'])

    next_index = len(fixed_levels) + 1
    next_level = variations[cur_variation]
    for option in next_level['options']:
        url = url_prefix + '&variation' + str(next_index) + '=' + str(
            next_level['id'])
        url += '&option' + str(next_index) + '=' + str(option['id'])
        yield Request(url,
                      meta={
                          'product': product,
                          'cur_variation': cur_variation + 1
                      },
                      callback=self.parse_product_option)