def parse_product(self, response):
    """Scrape a product page and yield one item per sellable product.

    When the page embeds a ``spConfig`` JSON configuration, the base fields
    are read from it and one item is yielded for every entry of its
    ``childProducts`` mapping. Otherwise a single item is built from the
    HTML. The first name seen for an identifier is stored in
    ``self.products_ids`` and reused for later items with that identifier.
    """
    hxs = HtmlXPathSelector(response)
    category = response.meta.get('category') or ''

    def with_remembered_name(item):
        # Reuse the name first recorded for this identifier, else record it.
        key = item['identifier']
        if key in self.products_ids:
            item['name'] = self.products_ids[key]
        else:
            self.products_ids[key] = item['name']
        return item

    brand_texts = hxs.select(
        '//span[@class="title-designer-info"]/a/text()').extract()
    if brand_texts:
        brand = brand_texts[0]
    else:
        brand = ''

    config_match = re.search(
        'var spConfig = new Product.Config\((.*})\);', response.body)
    options = None
    if config_match:
        options = json.loads(config_match.group(1))

    if options:
        product_name = options['productName']
        price = options['basePrice']
        image_url = options['imageUrl']
        identifier = options['productId']
    else:
        product_name = hxs.select(
            '//span[@itemprop="name"]/text()')[0].extract()
        price_texts = hxs.select(
            '//form//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not price_texts:
            price_texts = hxs.select(
                '//form//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        # Drop "." and turn "," into "." before the numeric part is extracted.
        price = price_texts[0].replace('.', '').replace(',', '.')
        image_url = hxs.select('//img[@id="image-main"]/@src')[0].extract()
        identifier = hxs.select(
            '//input[@name="product"]/@value')[0].extract()

    base_loader = ProductLoader(item=Product(), selector=hxs)
    base_loader.add_value('url', response.url)
    base_loader.add_value('name', product_name)
    base_loader.add_value('brand', brand)
    base_loader.add_value('image_url', image_url)
    base_loader.add_value('identifier', identifier)
    base_loader.add_value('category', category)
    base_loader.add_value('sku', identifier)
    base_loader.add_value('price', re.search('([\d\.]+)', price).group(1))
    base_price = float(base_loader.get_output_value('price'))
    base_loader.add_value('shipping_cost', self.get_shipping_cost(base_price))

    if not options:
        yield with_remembered_name(base_loader.load_item())
        return

    # Collect, per child product id, the labels of every option it belongs to.
    labels_by_child = {}
    for attribute in options['attributes'].values():
        for choice in attribute['options']:
            for child_id in choice['products']:
                labels_by_child.setdefault(child_id, []).append(
                    choice['label'])
    option_names = dict(
        (child_id, ' '.join(labels))
        for child_id, labels in labels_by_child.items())

    for child_id, child in options.get('childProducts').iteritems():
        variant_loader = ProductLoader(item=Product(), selector=hxs)
        variant_loader.add_value('url', response.url)
        variant_loader.add_value(
            'name', '%s %s' % (product_name, option_names[child_id]))
        variant_loader.add_value('image_url', child['imageUrl'])
        variant_loader.add_value('identifier', child_id)
        variant_loader.add_value('sku', identifier)
        variant_loader.add_value('brand', brand)
        variant_loader.add_value('category', category)
        variant_loader.add_value('price', child['finalPrice'])
        variant_price = float(variant_loader.get_output_value('price'))
        variant_loader.add_value(
            'shipping_cost', self.get_shipping_cost(variant_price))
        yield with_remembered_name(variant_loader.load_item())
def parse_product(self, response):
    """Parse a product page and yield one item per variant.

    Base fields (url, name, brand, identifier, category, image) are loaded
    from the page. When both the ``allVariants`` list and the
    ``variantsAray`` key list are found in the inline script, one item is
    yielded for every variant that has a ``skuId``. Otherwise the base item
    is yielded only if it carries a ``sku``.
    """
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    name = ''.join(response.xpath('//h1//text()').extract()).strip()
    product_loader.add_value('name', name)
    product_loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    product_loader.add_xpath('identifier',
                             '//input[@id="defaultSKUID"]/@value')
    # The first breadcrumb entry is skipped.
    category = response.xpath(
        '//div[@class="breadcrumb"]/ul/li/a/text()').extract()[1:]
    product_loader.add_value('category', category)

    image_url = response.xpath(
        '//meta[@property="og:image"]/@content').extract()
    if image_url:
        # Keep only the part of the URL before the "?$" suffix.
        product_loader.add_value('image_url', image_url[0].split('?$')[0])

    # SECURITY NOTE(review): the two eval() calls below execute text scraped
    # from the remote page. They are kept because the captured list ends with
    # a trailing comma, which json.loads would reject; consider
    # ast.literal_eval if the payload only ever contains literals.
    options_values = response.xpath(
        '//script[contains(text(), "var allVariants={")]/text()').re(
            r'var variantsAray=(\[.*\]);')
    if options_values:
        options_values = eval(options_values[0])
    options = response.xpath(
        '//script[contains(text(), "var allVariants={")]/text()').re(
            r'allVariants={"variants":(\[.*\,])\}\;')
    if options:
        options = eval(options[0])

    # Map each variant sku to its swatch image URL ('' when none is found).
    option_images = {}
    media_json = re.findall("var mediaJSON='(.*)';if", response.body)
    if media_json and media_json[0]:
        images = json.loads(media_json[0])
        for image in images["imageList"]:
            sku = image.get('skuId', None)
            if not sku:
                continue
            swatch = response.xpath('//div[@data-value="' + image['colour'] +
                                    '"]/img/@src').extract()
            if swatch:
                sized_url = add_or_replace_parameter(swatch[0], 'wid', '500')
                sized_url = add_or_replace_parameter(sized_url, 'hei', '500')
                option_images[sku] = sized_url
            else:
                option_images[sku] = ''
        initial_image = images['initialImage']['imageURL']
        product_loader.add_value('image_url', initial_image)

    product = product_loader.load_item()

    if options and options_values:
        for option in options:
            prod = Product(product)
            sku = option['skuId']
            if not sku:
                log.msg(' >>>>> ERROR: NO IDENTIFIER' + response.url)
                continue
            prod['identifier'] = sku
            prod['sku'] = sku
            # Compare by value: the original `is not 'null'` relied on string
            # identity, which only holds when the interpreter happens to
            # intern both strings.
            labels = ' '.join(
                option[k] for k in options_values if option[k] != 'null')
            prod['name'] = prod['name'].strip() + ' ' + labels.decode('utf-8')
            prod['price'] = extract_price(option['RP'])
            if option['isInStock'] != 'true':
                prod['stock'] = 0
            variant_image = option_images.get(sku, '')
            if variant_image:
                prod['image_url'] = variant_image
            yield prod
    else:
        # NOTE(review): nothing in this method adds 'sku' to the loader, so
        # unless ProductLoader fills it in this branch always logs the error
        # and never yields; 'identifier' may have been intended - confirm.
        if not product.get('sku'):
            log.msg(' >>>>> ERROR: NO IDENTIFIER' + response.url)
        else:
            yield product
def parse_product(self, response):
    """Yield one item per size option found on a product page.

    Pages with neither a title nor a product code are handed to
    ``self.parse_product_list``. Sizes are read from the basket ``<select>``
    when it has options, otherwise from the stock table rows; when neither
    exists a single item is yielded for the page itself.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Numeric id from the URL: "/<id>-" form first, then a trailing "/<id>".
    id_candidates = re.findall('\/(\d+)-', response.url)
    if not id_candidates:
        id_candidates = re.findall('\/(\d+)$', response.url)
    product_id = id_candidates[-1]

    name = hxs.select('//div[@class="pm_inner"]/h1/text()').extract_first()
    code_texts = hxs.select(
        '//span[contains(@class, "product_code")]/text()').extract()
    sku = code_texts[0] if code_texts else ''
    name = name or sku
    if not name:
        for request in self.parse_product_list(response):
            yield request
        return

    category = hxs.select(
        '//ul[contains(@class, "ancestors")]/li/a/text()').extract()
    if category:
        category = category[-1]

    image_url = hxs.select(
        '//div[@class="mlens-image"]//img/@src').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])

    brand_matches = hxs.select(
        '//a[@class="more" and contains(@href, "brands")]/h2/text()').re(
            'More (.*)')
    brand = brand_matches[0].strip() if brand_matches else ''

    price = extract_price(
        ''.join(hxs.select('//span[@class="pm_price"]//text()').extract()))

    def branded(label):
        # Prefix the brand unless it is a placeholder, already contained in
        # the label, or shares an alphabetic word with the label.
        shares_word = any(
            word.upper() in brand.upper()
            for word in re.findall('([a-zA-Z]+)', label))
        if (brand.upper() in label.upper()
                or brand.upper() in ('OTHER', 'UNASSIGNED')
                or shares_word):
            return label
        return brand + ' ' + label

    def finish(loader, label, identifier, amount):
        # Fill the fields common to every variant and build the item.
        loader.add_value('category', category)
        loader.add_value('name', branded(label))
        loader.add_value('url', response.url)
        loader.add_value('identifier', identifier)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        if amount < 150:
            loader.add_value('shipping_cost', 6)
        loader.add_value('price', amount)
        loader.add_value('image_url', image_url)
        return loader.load_item()

    select_options = hxs.select(
        '//select[@id="basket_line_product_id"]/option[@value!=""]')
    row_options = hxs.select(
        '//tr[@class="no_stock" or @class="has_stock"]')

    if select_options:
        for entry in select_options:
            size = entry.select('text()').extract()[0]
            loader = ProductLoader(item=Product(), selector=entry)
            option_id = entry.select('@value').extract()[0]
            yield finish(loader, name + ' - ' + size,
                         product_id + '-' + option_id, price)
    elif row_options:
        for row in row_options:
            size = row.select('./td[1]/text()').extract()[0]
            loader = ProductLoader(item=Product(), selector=row)
            option_id = row.select('./td[3]/input[1]/@value').extract()[0]
            # Each row has its own price: the "nowprice" cell when present,
            # otherwise all text of the second column.
            row_price = ''.join(
                row.select(
                    './td[2]/div[not(@class="oldprice")]/div[@class="nowprice"]/text()'
                ).extract())
            if not row_price:
                row_price = ''.join(row.select('./td[2]//text()').extract())
            yield finish(loader, name + ' - ' + size,
                         product_id + '-' + option_id,
                         extract_price(row_price))
    else:
        loader = ProductLoader(item=Product(), response=response)
        yield finish(loader, name, product_id, price)
def parse_products(self, response):
    """Parse one page of tyre results delivered as HTML inside JSON.

    The HTML is taken from the ``display_tyres`` key of the JSON body. One
    item with ``MicheldeverMeta`` metadata is yielded per accepted tyre, and
    while a page still contains tyre containers the next page is requested.
    """
    payload = json.loads(response.body)
    hxs = HtmlXPathSelector(text=payload['display_tyres'])
    search_params = response.meta['search_params']

    products = hxs.select('//div[contains(@class, "tyre_container") and @id]')
    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        brand_texts = product_el.select(
            './/span[@class="tyre_brand_text"]/text()').extract()
        if brand_texts:
            brand = brand_texts[0]
        else:
            brand = ''

        # Entries flagged with the winter icon are not processed.
        if product_el.select('.//i[@class="icon-select_tyres-winter"]').extract():
            continue

        # Use the spelling from self.brands on a case-insensitive match.
        for known_brand in self.brands:
            if known_brand.upper() == brand.strip().upper():
                brand = known_brand

        full_name = product_el.select(
            './/p[@class="tyre_details"]/span/text()').extract()[0]
        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('identifier', product_el.select('@id').extract())
        loader.add_value('url', 'http://www.tyresavings.com')

        image_srcs = product_el.select(
            './/img[contains(@class, "tyre_image")]/@src').extract()
        if image_srcs:
            loader.add_value(
                'image_url', urljoin(get_base_url(response), image_srcs[0]))

        price = ''.join(
            product_el.select('.//*[@class="tyre_price"]//text()').re(r'[\d,.]+'))
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        # The first " <digits><word chars>" token of the details text holds
        # the load rating followed by a one-character speed rating.
        tyre_details = product_el.select(
            './/*[@class="tyre_details"]/text()').extract()[0].strip()
        rating = re.search('(\s\d+\w+)', tyre_details)
        if rating:
            rating_text = rating.group().strip()
            load_rating = rating_text[:-1]
            speed_rating = rating_text[-1]
        else:
            load_rating = ''
            speed_rating = ''

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = search_params['aspect_ratio']
        metadata['rim'] = search_params['rim']
        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = search_params['width']
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''

        has_xl_icon = product_el.select(
            './/i[@class="icon-select_tyres-xl"]').extract()
        metadata['xl'] = 'Yes' if has_xl_icon else 'No'

        run_flat_in_name = is_run_flat(full_name)
        has_run_flat_icon = product_el.select(
            './/i[@class="icon-select_tyres-runflat"]').extract()
        if has_run_flat_icon or run_flat_in_name:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)
        metadata['full_tyre_size'] = '/'.join((
            search_params['width'],
            search_params['aspect_ratio'],
            search_params['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        # Exactly three non-empty label texts are expected, in this order.
        label_texts = product_el.select(
            './/div[@class="label_ratings"]//span[contains(@class, "label_rating_")]/text()|'
            './/div[@class="label_ratings"]//p[span[contains(@class, "decibels")]]/text()'
        ).extract()
        fuel, grip, noise = [
            text for text in map(unicode.strip, label_texts) if text]
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product

    if products:
        meta = response.meta
        next_page = meta['page'] + 1
        next_url = 'http://www.tyresavings.com/update-tyres/%s' % str(next_page)
        meta['page'] = next_page
        yield Request(next_url, dont_filter=True,
                      callback=self.parse_products, meta=meta)
def parse_product(self, response):
    """Parse a listing page and yield one item per variation.

    The price is looked up in ``#prcIsum``, then ``#mm-saleDscPrc``, then the
    ``binPrice`` value in the raw body. When the page embeds a variations map
    (``menuItemMap`` / ``itemVariationsMap``), one item is yielded for each
    variation that has trait values, using that variation's own price;
    otherwise a single item is yielded for the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = ' '.join(
        hxs.select('//*[@id="itemTitle"]/text()').extract()).strip()
    identifier = response.url.split('?')[0].split('/')[-1]

    # sku is the longest run of digits/commas/dots in the title; the first
    # one wins when several share the maximum length.
    sku = ''
    for match in re.finditer(r"([\d,\.]+)", name):
        if len(match.group()) > len(sku):
            sku = match.group()

    brand = 'Lego'
    category = 'Lego'

    price = None
    try:
        price = hxs.select('//*[@id="prcIsum"]/text()').extract()[0].strip()
    except IndexError:
        try:
            price = hxs.select(
                '//*[@id="mm-saleDscPrc"]/text()').extract()[0].strip()
        except IndexError:
            try:
                price = re.search(r'"binPrice":".*[\$\xA3]([\d\.,]+)",',
                                  response.body).groups()[0]
            except AttributeError:
                self.log("Price not found for " + response.url)

    image_url = hxs.select('//img[@id="icImg"]/@src').extract()

    # Shipping cost: 0 when the text mentions "free", else the parsed amount.
    shipping_cost = None
    try:
        shipping_cost = hxs.select(
            '//*[@id="shippingSection"]//td/div/text()').extract()[0]
        if shipping_cost:
            if 'free' in shipping_cost.lower():
                shipping_cost = 0
            else:
                shipping_cost = extract_price(shipping_cost)
    except IndexError:
        pass

    options_variations = []
    try:
        json_var_map = unicode(hxs.select('//*/text()')
                               .re(r'("menuItemMap":{.*}.*),'
                                   '"unavailableVariationIds"')[0])
    except IndexError:
        # No match means the page has no variations. A bare `except:` here
        # would also have hidden unrelated failures; IndexError matches the
        # other lookups in this method.
        self.log('No item variations map...')
    else:
        json_var_map = re.sub(r',"watchCountMessage":".*?}', '}',
                              json_var_map)
        variations = json.loads(
            '{' + re.sub(r',"unavailableVariationIds".*', '', json_var_map)
            + '}')
        menu_map = variations['menuItemMap']
        for key, variation in variations['itemVariationsMap'].items():
            if not variation['traitValuesMap']:
                continue
            trait_names = {}
            for option, value in variation['traitValuesMap'].items():
                trait_names[option] = menu_map[str(value)]['displayName']
            # Stored per variation so the page-level `price` is not clobbered.
            options_variations.append({'price': variation['price'],
                                       'values': trait_names,
                                       'identifier': key})

    def build_item(selector, item_identifier, item_name, raw_price):
        # Fill the fields shared by the single-item and per-variation paths.
        loader = ProductLoader(item=Product(), selector=selector)
        item_price = extract_price(raw_price)
        loader.add_value('identifier', item_identifier)
        loader.add_value('name', item_name)
        loader.add_value('sku', sku)
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('price', item_price)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        if shipping_cost is not None:
            loader.add_value('shipping_cost', shipping_cost)
        return loader.load_item()

    if options_variations:
        for variation in options_variations:
            variation_name = name + ' ' + ' '.join(
                label.strip().lower()
                for label in variation['values'].values())
            # NOTE(review): the original passes the variation dict itself as
            # `selector`; kept unchanged, but `hxs` may have been intended.
            yield build_item(variation,
                             identifier + '_' + variation['identifier'],
                             variation_name,
                             variation['price'])
    else:
        yield build_item(hxs, identifier, name, price)
def parse(self, response):
    """Parse a tyre detail page into a product with tyre metadata.

    The page title is split around the "<width>/<aspect>R<rim>" size string
    to obtain brand, load rating and pattern name. If the title looks wrong
    (the site address leaks into the name, or the size is missing), the same
    URL is requested again until ``meta['retry']`` reaches 10.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    search_params = meta['search_params']
    formdata = meta['formdata']
    loader = ProductLoader(item=Product(), selector=hxs)

    raw_title = hxs.select(
        '//div[@class="rightpanel"]//h1/text()').extract()[0]
    title = ' '.join(raw_title.split())

    tyre_params = "{}/{}R{}".format(search_params['width'],
                                    search_params['aspect_ratio'],
                                    search_params['rim'])
    before_size, _, after_size = title.partition(tyre_params)
    brand = before_size.strip()
    load_rating = after_size.strip().split(formdata['speed'])[0].strip()

    name = title.partition('Fuel Effic')[0].replace('~', '').strip()
    size_prefix = '{} {} {}{} '.format(brand, tyre_params, load_rating,
                                       formdata['speed'])
    name = name.replace(size_prefix, '')

    brand = brand.title()
    if 'goodrich' in brand.lower():
        brand = 'BFG'
    loader.add_value('brand', unify_brand(brand))

    if 'www.tyretraders.com' in name or tyre_params not in title:
        meta['retry'] += 1
        if meta['retry'] < 10:
            yield Request(response.url, callback=self.parse, meta=meta,
                          dont_filter=True)
        else:
            self.log('Giving up retrying to reload the product: {}'.format(
                response.url))
        return

    loader.add_value('price', meta.get('price'))

    # The identifier is the last "|"-separated URL part, minus its extension.
    identifier = url_unquote(response.url.split("|")[-1].split(".")[0])
    loader.add_value('identifier', identifier)
    loader.add_value(
        'category', find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('url', response.url)

    image_srcs = hxs.select(
        '//div[@class="rightpanel"]//img[@style=" max-width:450px;"]/@src'
    ).extract()
    if image_srcs:
        loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), image_srcs[0]))

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = search_params['aspect_ratio']
    metadata['rim'] = search_params['rim']
    metadata['speed_rating'] = search_params['speed_rating']
    metadata['width'] = search_params['width']
    metadata['fitting_method'] = 'Fitted'
    metadata['load_rating'] = load_rating

    # Each marker word is removed from the name as it is detected.
    found_xl, name = remove_whole_word('XL', name)
    found_rf, name = remove_whole_word('RF', name)
    if found_xl or found_rf:
        metadata['xl'] = 'Yes'
    else:
        metadata['xl'] = 'No'

    found_runflat, name = remove_whole_word('runflat', name)
    metadata['run_flat'] = 'Yes' if found_runflat else 'No'

    man_code = ''
    for code, man_mark in self.all_man_marks.iteritems():
        found_code, name = remove_whole_word(code, name)
        if found_code:
            man_code = man_mark
            break
    metadata['manufacturer_mark'] = man_code

    loader.add_value('name', name)
    metadata['full_tyre_size'] = '/'.join((
        metadata['width'],
        metadata['aspect_ratio'],
        metadata['rim'],
        load_rating,
        metadata['speed_rating'],
    ))

    product = loader.load_item()
    product['metadata'] = metadata
    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    # Alternative rating: the helper's value if any, else the scraped rating
    # when it differs from the recomputed one, else empty.
    if new_alt_speed:
        alternative = new_alt_speed
    elif product['metadata']['speed_rating'] != new_speed_rating:
        alternative = product['metadata']['speed_rating']
    else:
        alternative = ''
    product['metadata']['alternative_speed_rating'] = alternative
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def parse_product(self, response):
    """Parse a product page and chain a request for its reviews.

    The price is read from the ``tb-djs-wml-base`` JSON script, falling back
    to two HTML price blocks. Pages without a price are logged and skipped.
    Pages without a model number are re-requested up to 5 times. On success
    the loaded item is passed in ``meta`` to ``self.parse_product_reviews``.
    """
    try:
        pjs_data = json.loads(
            response.xpath('//script[@id="tb-djs-wml-base"]/text()').
            extract()[0].strip())
        price = str(pjs_data['adContextJSON']['price'])
    except (IndexError, KeyError, TypeError, ValueError):
        # Missing script, malformed JSON or unexpected structure: fall back
        # to the HTML price below. The original bare `except:` hid every
        # other error too.
        price = None

    if not price:
        price = ''.join(
            response.xpath(
                '//div[contains(@class, "js-product-offer-summary")]'
                '//div[contains(@class, "price-display")]//text()').
            extract())
    if not price:
        price = ''.join(
            response.xpath(
                '//div[contains(@class, "PricingInfo clearfix")]'
                '//span[contains(@class, "clearfix camelPrice")]//text()').
            extract())

    # Some products are not available online and these have no price
    if not price:
        self.log('No price found {}'.format(response.url))
        return

    in_stock = 'out of stock' not in price.lower()

    sku = response.xpath(
        '//td[contains(text(), "Model No")]/following-sibling::td/text()'
    ).extract()
    if not sku:
        sku = response.xpath(
            '//td[contains(text(), "Model:")]/following-sibling::td/text()'
        ).extract()
    if not sku:
        # Retry the same URL a limited number of times.
        retry = int(response.meta.get('retry', 0))
        if retry < 5:
            meta = response.meta.copy()
            meta['retry'] = retry + 1
            yield Request(response.url, callback=self.parse_product,
                          meta=meta, dont_filter=True)
        else:
            self.log('NO SKU => %s' % response.url)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop = "name"]//text()')
    # NOTE(review): this pattern parses as "/(\d+)\[?" OR "$]", so it picks
    # the first "/<digits>" in the URL; "[?|$]" may have been intended. Left
    # untouched because changing it would change existing identifiers.
    loader.add_value('identifier',
                     re.search(r'/(\d+)\[?|$]', response.url).group(1))
    loader.add_value('sku', sku[0].strip())
    loader.add_value('brand', response.meta.get('brand') or 'LEGO')
    loader.add_xpath(
        'category',
        '//ol[contains(@class, "breadcrumb-list")]//li[last()]//a/span/text()'
    )
    loader.add_value('url', response.url)
    loader.add_xpath(
        'image_url',
        '//img[contains(@id, "mainImage") or contains(@class, "product-primary-image")]/@src'
    )
    loader.add_value('price', price)
    if not in_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = {}

    if self.enable_map:
        self._save_html_response(response, item['identifier'])

    yield Request(self._get_reviews_url(item, 1),
                  meta={
                      'product': item,
                      'page': 1
                  },
                  callback=self.parse_product_reviews)
def parse_product(self, response):
    """Parse a product page into an item, or defer to the options endpoint.

    URLs already present in ``self.products_parsed`` are skipped. When the
    page has a ``product_options`` block, the loaded item is passed to
    ``self.parse_options`` through a request to the AJAX options URL;
    otherwise the item is yielded directly.
    """
    hxs = HtmlXPathSelector(response)
    if response.url in self.products_parsed:
        return

    loader = ProductLoader(response=response, item=Product())

    price_texts = hxs.select(
        '//div[@id="product_price"]//span[@class="price"]//span[@class="GBP"]/text()'
    ).extract()
    # The first character of the price text is dropped.
    price = price_texts[0][1:] if price_texts else None

    name = hxs.select(
        '//div[@id="product_page_right_title"]//span[@id="product_title"]//text()'
    ).extract()[0]

    # Breadcrumb entries override the category passed in meta when present.
    category = response.meta.get('category')
    crumbs = hxs.select(
        '//div[@id="breadcrumb_container"]//span//a/@title').extract()
    if len(crumbs) > 1:
        category = crumbs[1]
    brand = crumbs[2] if len(crumbs) > 2 else None

    img_url = hxs.select(
        '//img[@id="product_medium_image"]/@src').extract()[0]

    if name:
        loader.add_value('name', name)
    if price:
        loader.add_value('price', price)
    loader.add_value('url', response.url)

    identifier = hxs.select(
        u'//input[@type="hidden" and @name="parent_product_id"]/@value'
    ).extract()[0]
    loader.add_value('identifier', identifier)
    loader.add_value('image_url', img_url)
    if category:
        loader.add_value('category', category)
    if brand:
        loader.add_value('brand', brand)
    loader.add_value('shipping_cost', 'N/A')
    item = loader.load_item()

    if not hxs.select('//div[@id="product_options"]'):
        yield item
        return

    options_url = (
        'http://www.thecosmeticpractice.co.uk/ajax/get_product_options/%s?cmd=addtobasket&parent_product_id=%s&product_id=0&image_product_id=0&image_id=0&image_index=0&'
        % (identifier, identifier))
    yield Request(options_url,
                  callback=self.parse_options,
                  meta={
                      'item': item,
                      'identifier': identifier
                  })
def parse_product(self, product, fitted, search_params):
    """Build a tyre item from one search-result element.

    ``product`` is the selector of a result entry, ``fitted`` chooses between
    the fitted and delivered price (and the "-F"/"-D" identifier suffix), and
    ``search_params`` supplies the tyre dimensions. Returns the loaded item,
    or ``None`` when no price is found or ``is_product_correct`` rejects it.
    """
    url = product.select(
        './/div[@class="mod-item-body"]/h3//a/@href').extract()[0]
    suffix = '-F' if fitted else '-D'
    p_id = url.split('/')[-1] + suffix
    image_url = product.select(
        './/div[@class="mod-item-img"]//img/@src').extract()[0]
    brand = product.select(
        './/div[@class="mod-item-body"]/h3/text()').extract()[0].strip()

    if fitted:
        price_xpath = './/div[@class="mod-fitted"]/a/text()'
    else:
        price_xpath = './/div[@class="mod-delivered"]/a/text()'
    try:
        price = product.select(price_xpath).extract()[0]
    except IndexError:
        message = "Price not found: %s" % str(product)
        self.log(message)
        self.errors.append(message)
        return

    name = product.select(
        './/div[@class="mod-item-body"]/h3/span/a/text()').extract()[0]
    speed_rating = search_params['speed_rating'].upper()

    # Strip the size token and the load/speed token; what remains is the
    # tread pattern. Fall back to the whole name if nothing remains.
    pattern = re.sub('\d+[^\s]+R\d+', '', name)
    pattern = re.sub('[\d/]+%s' % speed_rating, '', pattern).strip()
    if not pattern:
        pattern = name.strip()

    loader = ProductLoader(item=Product(), selector=product)
    loader.add_value('url', url)
    loader.add_value('identifier', p_id)
    loader.add_value('image_url', image_url)
    loader.add_value('brand', unify_brand(brand))
    loader.add_value(
        'category', find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('price', price)
    cleaned = pattern.upper()
    for marker in ('XL', 'RFLAT', 'RUNFLAT'):
        cleaned = cleaned.replace(marker, '')
    loader.add_value('name', cleaned)

    upper_name = name.upper()
    m = MicheldeverMeta()
    m['aspect_ratio'] = search_params['aspect_ratio']
    m['rim'] = search_params['rim']
    m['width'] = search_params['width']
    m['speed_rating'] = speed_rating

    load_match = re.search('([\d/]+)%s' % speed_rating, name)
    if load_match:
        m['load_rating'] = load_match.groups()[0]
    else:
        self.log('ERROR: not load rating: %s' % url)
        m['load_rating'] = ''

    is_run_flat_name = 'RFLAT' in upper_name or 'RUNFLAT' in upper_name
    m['run_flat'] = 'Yes' if is_run_flat_name else 'No'
    m['xl'] = 'Yes' if 'XL' in upper_name else 'No'
    m['full_tyre_size'] = '/'.join((
        m['width'],
        m['aspect_ratio'],
        m['rim'],
        m['load_rating'],
        m['speed_rating'],
    ))
    m['fitting_method'] = 'Fitted' if fitted else 'Delivered'
    m['manufacturer_mark'] = self._get_manufacturer_code(name)

    item = loader.load_item()
    item['metadata'] = m
    if not is_product_correct(item):
        return

    item['metadata']['mts_stock_code'] = find_mts_stock_code(
        item, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(item)
    new_alt_speed = get_alt_speed(item)
    # Alternative rating: the helper's value if any, else the scraped rating
    # when it differs from the recomputed one, else empty.
    if new_alt_speed:
        alternative = new_alt_speed
    elif item['metadata']['speed_rating'] != new_speed_rating:
        alternative = item['metadata']['speed_rating']
    else:
        alternative = ''
    item['metadata']['alternative_speed_rating'] = alternative
    item['metadata']['speed_rating'] = new_speed_rating
    return item
def parse_product(self, response):
    """Parse a kit page and yield one item per size and printing combination.

    For every size variation a base item is yielded. When the base product
    (no hero shirt selected) offers printing type 1, a personalised item
    with a fixed player name and number is added. When printing type 3
    exists, a badge variant is yielded for each of those items.
    """
    # NOTE(review): there is no `return` after this retry request, so the
    # error page itself is still parsed below - confirm this is intended.
    if 'aspxerrorpath' in response.url:
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product,
                      dont_filter=True)

    aud_url = response.xpath(
        '//a[contains(@href, "?cur=AUD")]/@href').extract_first()
    if aud_url:
        # Re-request the page through its "?cur=AUD" link.
        yield Request(response.urljoin(aud_url),
                      self.parse_product,
                      dont_filter=True)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('category', 'Kits')

    script_texts = response.xpath('//script/text()')
    heros_data = script_texts.re('product\d{7} =(.+?});var')
    base_product_data = script_texts.re('product\w{6} =(.+?});var')
    if not base_product_data:
        for p in self.parse(response):
            yield p
        return

    # Pick the JSON record describing what this page sells.
    base_product = True
    if not heros_data:
        data = json.loads(base_product_data[0])
    elif len(heros_data) == 1:
        data = json.loads(heros_data[0])
        base_product = False
    else:
        heroes_by_id = {}
        for raw_hero in heros_data:
            hero_record = json.loads(raw_hero)
            heroes_by_id[hero_record['ProductID']] = hero_record
        selected_hero = response.css('select.heroShirts').xpath(
            'option[@selected]')
        if selected_hero:
            hero_id = int(selected_hero.xpath('@value').extract_first())
            data = heroes_by_id[hero_id]
            base_product = False
        else:
            data = json.loads(base_product_data[0])

    base_product_data = json.loads(base_product_data[0])

    # Printing options keyed by type id: 1 is used for custom
    # personalisation and 3 for badges below.
    printings = {}
    for printing_item in base_product_data['printingitems']:
        printings[printing_item['PrintingTypeID']] = printing_item
    custom_printings = printings.get(1)
    add_custom_personalization = bool(custom_printings and base_product)

    loader.add_value('name', data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if data['Brand']:
        loader.add_value('brand', data['Brand']['Name'])
    loader.add_value('image_url', response.urljoin(data['ImageURL']))
    loader.add_value('shipping_cost', self.shipping_cost)
    product = loader.load_item()

    player_from_name = re.search('(?!Sponsor).*with *([\w\ \.\-]+?) (\d+)',
                                 data.get('Description', ''), re.UNICODE)
    if player_from_name:
        player_name, number = player_from_name.groups()

    for variation in data['Variations']:
        size = variation['Description']

        size_loader = ProductLoader(item=Product(), response=response)
        size_loader.add_value(None, product)
        size_loader.replace_value('identifier', variation['VariationId'])
        size_loader.add_value('name', size)
        size_loader.replace_value('price', variation['PriceActual'])
        size_loader.replace_value('stock', int(variation['IsInStock']))
        item = size_loader.load_item()

        size_metadata = {'size': size}
        if player_from_name:
            size_metadata['player'] = player_name
            size_metadata['number'] = number
        item['metadata'] = size_metadata
        yield item
        base_size_items = [item]

        if add_custom_personalization:
            team_player_name = 'WILLIAMS'
            team_player_number = '10'
            team_player_id = 'WILLIAMS'

            custom_loader = ProductLoader(item=Product(), response=response)
            custom_loader.add_value(None, item)
            custom_loader.add_value('name', team_player_name)
            custom_loader.add_value('name', team_player_number)
            custom_price = Decimal(item['price']) + Decimal(
                str(custom_printings['PriceActual']))
            custom_loader.replace_value('price', custom_price)
            custom_identifier = '-'.join((
                item['identifier'],
                str(custom_printings['PrintingID']),
                team_player_id,
            ))
            custom_loader.replace_value('identifier', custom_identifier)
            custom_item = custom_loader.load_item()
            custom_item['metadata'] = {
                'player': team_player_name,
                'number': team_player_number,
                'size': size
            }
            yield custom_item
            base_size_items.append(custom_item)

        badge_printing = printings.get(3)
        if not badge_printing:
            continue
        for base_item in base_size_items:
            badge_loader = ProductLoader(item=Product(), response=response)
            badge_loader.add_value(None, base_item)
            badge_loader.add_value('name',
                                   badge_printing['PrintingDescription'])
            badge_price = Decimal(base_item['price']) + Decimal(
                str(badge_printing['PriceActual']))
            badge_loader.replace_value('price', badge_price)
            badge_identifier = base_item['identifier'] + '-' + str(
                badge_printing['PrintingID'])
            badge_loader.replace_value('identifier', badge_identifier)
            badge_item = badge_loader.load_item()
            badge_item['metadata'] = base_item['metadata'].copy()
            yield badge_item
def parse(self, response):
    """Follow category links and scrape the products of a listing page.

    Yields, in order:
      * one proxied request per sub-category link found in the two menu
        groups matched by the XPaths below,
      * one loaded item per product tile that has both a name and a price,
      * one proxied request for the "Suivant" (next) page, when present.

    Non-HTML responses are logged and ignored.
    """
    if not isinstance(response, HtmlResponse):
        self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Sub-categories of the "Jardin / Outillage" menu group.
    categories = hxs.select(
        '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    # Sub-categories of the "Entretien ... maison" menu group.
    categories += hxs.select(
        '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()

    for url in categories:
        yield self._proxyRequest(urljoin_rfc(base_url, url))

    # Product tiles of the listing.
    products = hxs.select(
        u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
    for product in products:
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath(
            'url', './/a[contains(@class,"plPrName")]/@href')
        product_loader.add_xpath(
            'name', './/a[contains(@class,"plPrName")]/text()')
        # Deliberately absolute: the category title belongs to the page,
        # not to the individual tile.
        product_loader.add_xpath(
            'category', '//div[@class="productListTitle"]/h1/text()')
        product_loader.add_xpath(
            'image_url',
            './/div[contains(@class, "plProductImg")]//img/@data-src')
        product_loader.add_xpath('sku', './@data-sku')
        product_loader.add_xpath(
            'identifier',
            './/input[contains(@name, "ProductPostedForm.ProductId")]/@value'
        )

        price = product.select(
            u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
        ).extract()
        if price:
            # FIX: this XPath used to start with '//', which searches the
            # whole document even when called on the tile selector, so every
            # product received the decimals of the first tile on the page.
            decimals = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
            ).re(u'(\d+)')
            if decimals:
                price = price[0] + '.' + decimals[0]
        product_loader.add_value('price', price)

        if (product_loader.get_output_value('name')
                and product_loader.get_output_value('price')):
            yield product_loader.load_item()

    # Pagination.
    next_page = hxs.select(
        u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
    ).extract()
    if next_page:
        yield self._proxyRequest(urljoin_rfc(base_url, next_page[0]))
def parse_list(self, response):
    """Walk a category listing and emit its products.

    Yields:
      * a request for the "limit=all" variant of the page when the page
        offers one and it is not the currently selected limiter,
      * a request per sub-category link (handled by this same callback),
      * when the page has no sub-categories: one item per product tile,
        or, if no numeric product id can be read from the tile, a request
        to ``self.parse_options`` carrying the partially loaded item in
        ``meta['item']``.
    """
    # Ask for the full listing if it is not already the selected one.
    limiter_selected = response.xpath(
        '//div[@class="limiter"]/select/option[@selected]/@value').extract()
    limiter_all = response.xpath(
        '//div[@class="limiter"]/select/option[contains(@value, "limit=all")]/@value'
    ).extract()
    if (limiter_all and limiter_selected
            and limiter_selected[0] != limiter_all[0]):
        yield Request(response.urljoin(limiter_all[0]),
                      callback=self.parse_list,
                      meta=response.meta)

    sub_category_urls = response.xpath(
        '//div[@class="category-item-center"]'
        '//span[@class="product-name"]/a/@href').extract()
    for url in sub_category_urls:
        yield Request(response.urljoin(url),
                      callback=self.parse_list,
                      meta=response.meta)
    if sub_category_urls:
        # Products are only scraped from leaf categories.
        return

    # Page-level values: identical for every tile, so computed once.
    product_brand = response.meta.get('brand', '')
    # The first breadcrumb entry is dropped.
    product_category = [
        text.strip() for text in response.xpath(
            '//div[contains(@class, "breadcrumbs")]//li[contains(@class, '
            '"category")]/a/text()').extract()
    ][1:]

    products = response.xpath(
        '//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]'
    )
    for product_xs in products:
        product_name = ''.join(
            product_xs.xpath(
                './/*[contains(@class, "product-name")]//text()'
            ).extract()).strip()
        product_url = response.urljoin(
            product_xs.xpath(
                './/*[contains(@class, "product-name")]//a/@href'
            ).extract()[0])
        # The last number in the price box is used as the price.
        product_price = extract_price_eu(
            product_xs.xpath('.//*[@class="price-box"]//text()').re(
                r'[\d\.,]+')[-1])
        product_image_url = [
            response.urljoin(src) for src in product_xs.xpath(
                './/*[contains(@class, "product-image")]//img/@src'
            ).extract()
        ]
        product_out_of_stock = bool(
            product_xs.xpath(
                './/*[contains(@class, "availability") and contains(@class, "out-of-stock")]'
            ))
        product_shipping_cost = (
            '0.00' if product_price >= self.free_shipping_over else '5.00')

        # Only a missing id is expected here, so catch exactly that instead
        # of the previous bare ``except:``.
        try:
            product_identifier = product_xs.xpath(
                './/*[contains(@id, "product-price-")]/@id').re(
                    r'(\d+)')[0]
        except IndexError:
            product_identifier = None

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', product_url)
        loader.add_value('price', product_price)
        loader.add_value('shipping_cost', product_shipping_cost)
        loader.add_value('image_url', product_image_url)
        loader.add_value('brand', product_brand)
        # The brand, when known, takes precedence over the breadcrumbs.
        loader.add_value('category', product_brand or product_category)
        if product_out_of_stock:
            loader.add_value('stock', 0)

        if product_identifier is not None:
            loader.add_value('identifier', product_identifier)
            loader.add_value('sku', product_identifier)
            yield loader.load_item()
        else:
            # No id on the tile: hand the partial item to parse_options.
            item = loader.load_item()
            yield Request(item['url'],
                          meta={'item': item},
                          callback=self.parse_options)
def parse_item(self, response):
    """Scrape a product page, expand its options and request stock data.

    ``response.meta['client_product']`` must hold the client row for this
    page (keys read here: ``'Item Number'``, ``'Wayfair'`` and optionally
    ``'Brand'``).

    When the product title is missing the same URL is re-requested, with
    the attempt counter kept in ``meta['retry']``.  Otherwise one product
    is built per option combination (or a single product when the page
    has no options) and a single request to ``self.ajax_stock_url`` is
    yielded, carrying the collected products to ``self.parse_stock``.
    """
    meta = response.meta
    categories = response.css(
        '.ProductDetailBreadcrumbs-item::text').extract()
    sku = meta['client_product']['Item Number']

    image_url = response.xpath(
        '//div[contains(@class, "main-carousel")]//a/@data-original-src'
    ).extract()
    if not image_url:
        image_url = response.xpath(
            '//img[contains(@class, "ProductDetailImagesBlock-carousel-image")]/@src'
        ).extract()

    prod_id = response.xpath('//input[@name="sku"]/@value').extract()
    prod_id = prod_id[0] if prod_id else ''

    try:
        name = response.xpath(
            '//h1/span[contains(@class, "ProductDetailInfoBlock-header-title")]/text()'
        ).extract()[0]
    except IndexError:
        # No title on the page: retry the same URL a bounded number of times.
        retry = meta.get('retry', 0)
        if retry <= 10:
            meta['retry'] = retry + 1
            self.log('ERROR >>> No name found, retry URL: ' + response.url)
            yield Request(response.url,
                          dont_filter=True,
                          callback=self.parse_item,
                          meta=meta)
        else:
            self.log('ERROR >>> Gave up retrying URL: ' + response.url)
        return

    name += response.xpath('//h1/text()').extract()[-1].strip()
    brand = meta['client_product'].get('Brand', '')

    products_collected = []
    sku_list = []

    def collect(loader):
        # Load the item, attach empty review metadata and remember it for
        # the stock request.
        product = loader.load_item()
        metadata = HouseholdEssentialsMeta()
        metadata['reviews'] = []
        product['metadata'] = metadata
        products_collected.append(product)
        sku_list.append(product['identifier'])

    # ---- gather option groups (each group is a list of option dicts) ----
    options = []
    option_elements = []
    dropdown_options = response.xpath(
        '//select[contains(@class, "stdselect")]/option[@value!="XXXXXXXXXX"]'
    )
    if dropdown_options:
        for dropdown_option in dropdown_options:
            option = {}
            option['identifier'] = dropdown_option.xpath(
                '@value').extract()[0]
            option['sku'] = ''
            option['desc'] = dropdown_option.xpath(
                './/text()').extract()[0]
            # Surcharge: the @cost attribute, else a "+$N" in the label.
            cost = dropdown_option.xpath('@cost').extract() or re.findall(
                '\+\$([\d.]+)', option['desc'])
            option['cost'] = cost[0] if cost else '0'
            options.append(option)
        option_elements.append(options)
    else:
        dropdown_elements = response.xpath(
            '//div[@class="pdinfoblock"]/div[@class="fl"]//select')
        for dropdown_options in dropdown_elements:
            options = []
            for dropdown_option in dropdown_options.xpath(
                    'option[@value!="XXXXXXXXXX"]'):
                option = {}
                option['identifier'] = dropdown_option.xpath(
                    '@value').extract()[0]
                option['sku'] = ''
                option['desc'] = dropdown_option.xpath(
                    './/text()').extract()[0].split('-')[0]
                option['cost'] = dropdown_option.xpath(
                    '@cost').extract()[0]
                options.append(option)
            option_elements.append(options)

    image_options = response.css('.option_select_wrap .visual_option_wrap')
    if image_options:
        options = []
        for image_option in image_options:
            option = {}
            option['identifier'] = image_option.xpath(
                '@data-pi-id').extract()[0]
            option['sku'] = ''
            option['desc'] = image_option.xpath('@data-name').extract()[0]
            option['cost'] = image_option.xpath('@data-cost').extract()[0]
            options.append(option)
        option_elements.append(options)

    if option_elements:
        if len(option_elements) > 1:
            # Several option groups: one product per combination, with
            # descriptions/identifiers joined and surcharges summed.
            options = []
            for combined_option in itertools.product(*option_elements):
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + ' - ' + option['desc']
                    final_option['cost'] = final_option.get(
                        'cost', 0) + float(option['cost'])
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + ' - ' + option['identifier']
                options.append(final_option)
        else:
            options = option_elements[0]

        products_matched = self.hhe_df[self.hhe_df['Wayfair'] ==
                                       meta['client_product']['Wayfair']]
        for option in options:
            price = response.xpath(
                '//*[@class="dynamic_sku_price"]/span/text()').extract()[0]
            option_price_value = self.option_price(price,
                                                   str(option['cost']))

            # SKU not unique: pick the client row whose price is closest.
            if (not products_matched.empty
                    and products_matched.count()['Wayfair'] > 1):
                # FIX: Decimal(0) used to double as the "nothing chosen yet"
                # marker, so an exact match (difference 0) was overwritten
                # by the following row.  None is now the sentinel.
                best_diff = None
                best_sku = sku
                for _, row in products_matched.iterrows():
                    wf_price = Decimal(row['Wayfair Cost'].replace(
                        '$', '').strip())
                    price_diff = abs(option_price_value - wf_price)
                    if best_diff is None or price_diff < best_diff:
                        best_sku = str(row['Item Number'])
                        best_diff = price_diff
                sku = best_sku

            product_loader = ProductLoader(item=Product(),
                                           response=response)
            product_loader.add_value('name', name + ' ' + option['desc'])
            product_loader.add_value('sku', sku)
            identifier = response.xpath(
                '//input[@name="sku"]/@value').extract()[0]
            product_loader.add_value(
                'identifier', identifier + '-' + option['identifier'])
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', categories)
            if image_url:
                product_loader.add_value('image_url', image_url[0])
            product_loader.add_value('url', response.url)
            product_loader.add_value('price', option_price_value)
            collect(product_loader)
    else:
        # No options: a single product for the page.
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('sku', sku)
        product_loader.add_xpath('identifier',
                                 '//input[@name="sku"]/@value')
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', categories)
        if image_url:
            product_loader.add_value('image_url', image_url[0])
        price = response.xpath(
            '//span[@data-id="dynamic-sku-price"]/text()').extract_first()
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        collect(product_loader)

    # ---- stock lookup for everything collected above ----
    transaction_id = re.findall(r'"transactionID":"(.*)",',
                                response.body)[0]
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': response.url,
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = urlencode({
        'bpss': 'yes',
        'skulist': '~^~'.join(sku_list),
        'kitmode': '0',
        'postalcode': '67346',
        '_txid': transaction_id
    })
    yield Request(self.ajax_stock_url + '?' + params,
                  headers=headers,
                  dont_filter=True,
                  meta={
                      'product': products_collected,
                      'prod_id': prod_id,
                      'prod_url': response.url
                  },
                  callback=self.parse_stock)
def parse_product(self, response):
    """Scrape every product table of a page, one item per priced option.

    Yields a request for each link in the page's product list, then for
    every product table:
      * one item per ``<option>`` that contains a pound price, when the
        product has options and no general price, or
      * a single item carrying the general price otherwise.

    SKU and identifier come from the "(Ref: ...)" part of the heading.
    Items whose SKU is listed in ``INVALID_PRODUCTS`` are dropped.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for url in hxs.select(
            '//div[@class="product_list"]/div/h3/a/@href').extract():
        yield Request(urljoin_rfc(base_url, url))

    products = hxs.select(
        u'//table[child::tr[child::td[@colspan="2" and child::h2]]]')
    if not products:
        return

    # Category: page heading, else the "frag" block, else the last
    # breadcrumb text (explicit fallbacks instead of nested bare excepts).
    category = None
    for category_xpath in ('//div[@class="page-heading"]/h1/text()',
                           '//div[@id="frag"]//text()'):
        texts = hxs.select(category_xpath).extract()
        if texts:
            category = texts[0].strip()
            break
    if category is None:
        category = hxs.select(
            '//p[@class="text_breadcrumbs"]//text()').extract().pop()

    # Prices such as "12,34" are treated as using a comma separator.
    comma_price_re = r'[\d]{1,2},[\d]{2}'

    for product in products:
        images = product.select('.//img/@src').extract()
        image_url = urljoin_rfc(base_url, images[0]) if images else ''

        multiple_options = product.select(u'.//select/option')
        general_price = product.select(
            u'.//span[@class="actlarge"]/text()').extract()
        general_price = general_price[0] if general_price else None
        if not general_price:
            carriage = product.select(u'.//*/text()').re(
                u'Price inc UK Mainland Carriage.*?\:.*?\xa3([\d\.,]*)')
            if carriage:
                # FIX: the capture may be empty or contain thousands
                # separators ("1,234.00"); float() used to raise on both
                # and abort the whole page.
                try:
                    general_price = str(
                        round(float(carriage[0].replace(',', '')) / 1.2, 2))
                except ValueError:
                    general_price = None
            log.msg(u'Product with: Price inc UK Mainland Carriage')

        if multiple_options and general_price:
            options_text = u' '.join(
                product.select(u'.//select/option/text()').extract())
            if u'\xa3' in options_text:
                log.msg(
                    u'Product with both option and general price: [%s]' %
                    response.url)

        name = product.select(u'.//h2/text()')[0].extract().strip()
        name_complete = ''.join(product.select(u'.//h2//text()').extract())
        if 'special offer' in name.lower():
            # Cut the "special offer" suffix, but only when the reference
            # is still part of what remains.
            special_offer_starts_at = name.lower().index('special offer')
            new_name = name[:special_offer_starts_at].strip()
            if 'ref:' in new_name.lower():
                self.log("Found special offer")
                self.log("Before: '%s'" % name)
                self.log("After: '%s'" % new_name)
                name = new_name.replace(u' (Ref', u' \xa0(Ref')

        ref_match = re.search(r'\(Ref:\s*([^\)]+)\)', name_complete, re.I)

        if multiple_options and not general_price:
            idx = 1
            for option in multiple_options:
                option_text = option.select(u'./text()')[0].extract()
                price = re.search(u'\xa3([\d\.,]+)', option_text)
                if not price:
                    # Options without a pound price are not products.
                    continue
                price = price.group(1)
                if re.search(comma_price_re, price):
                    price = price.replace(',', '.')

                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name',
                                 name + u' %s' % option_text.strip())
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                loader.add_value('url', response.url)
                loader.add_value('price', price)
                if ref_match:
                    # SKU suffix: the option's leading code, else a
                    # running ".incN" counter.
                    optsku = option_text.strip().lower().replace(
                        'code', '').strip('-. ').split('-')[0]
                    if optsku:
                        loader.add_value('sku',
                                         ref_match.group(1) + optsku)
                    else:
                        loader.add_value(
                            'sku',
                            ref_match.group(1) + ".inc" + str(idx))
                        idx += 1
                loader.add_value('identifier',
                                 loader.get_output_value('sku'))
                if loader.get_output_value('sku') not in INVALID_PRODUCTS:
                    yield loader.load_item()
        else:
            if not general_price:
                continue
            if re.search(comma_price_re, general_price):
                general_price = general_price.replace(',', '')

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('price', general_price)
            if ref_match:
                loader.add_value('sku', ref_match.group(1))
            loader.add_value('identifier', loader.get_output_value('sku'))
            if loader.get_output_value('sku') not in INVALID_PRODUCTS:
                yield loader.load_item()
def parse_product(self, response):
    """Scrape a product page in either of the two layouts handled here.

    * HTTP 405 responses are treated as an anti-bot page: the original URL
      (``meta['redirect_urls'][0]``) is re-requested, with the attempt
      count kept in ``meta['retries']``.
    * Pages exposing a JSON ``data`` attribute on the
      ``v2-product-subproducts`` block are read from that JSON.
    * Other pages are read from the HTML: ``sub-products`` blocks, then
      ``<option data-name>`` elements, then the page as a single product.

    Every yielded product gets a ``FragranceDirectMeta`` with the
    promotion text (when present) and the price divided by 1.2.
    """
    if response.status == 405:
        url = response.meta['redirect_urls'][0]
        retries = response.meta.get('retries', 0)
        if retries >= 9:
            self.logger.error(
                'Gave up retrying avoid antibot captcha for %s' % url)
            return
        self.logger.debug('DistilNetworks antibot captcha. Retrying %s' %
                          url)
        yield response.request.replace(dont_filter=True,
                                       url=url,
                                       meta={
                                           'retries': retries + 1,
                                           'dont_merge_cookies': True
                                       })
        return

    if response.url in self.old_urls:
        self.old_urls.remove(response.url)

    category_xpath = "//div[@id='breadcrumb']//li[position() > 1 and position() < last()]//text()"

    def shipping_for(price):
        # Orders under 10 carry a 2.95 delivery charge.
        return extract_price('2.95') if price < 10 else 0

    def finish(loader):
        # Load the item and attach promotion / ex-VAT metadata.
        product = loader.load_item()
        promotion = response.xpath(
            "//div[@id='product-offer-tab']//h3//text()").extract()
        metadata = FragranceDirectMeta()
        if promotion:
            metadata['promotion'] = promotion[0]
        if product.get('price'):
            metadata['price_exc_vat'] = Decimal(
                product['price']) / Decimal('1.2')
        product['metadata'] = metadata
        return product

    options_data = response.xpath(
        "//div[@class='v2-product-subproducts']//@data").extract()
    if options_data:
        # ---- JSON layout ----
        options_data = json.loads(options_data[0])
        product_name = options_data['name']
        if not options_data.get('sku', 0):
            return

        sub_products = options_data['sub_products']
        if sub_products:
            # FIX: the loop used to iterate ``options_data`` itself (its
            # string keys) instead of the sub-products.
            # NOTE(review): accepts a list or a dict of sub-products, as
            # the JSON shape is not visible here - confirm.
            if isinstance(sub_products, dict):
                sub_products = sub_products.values()
            for sub_option in sub_products:
                loader = ProductLoader(item=Product(), response=response)
                price = extract_price(
                    sub_option['prices']['price']['amount'])
                loader.add_value('url', response.url)
                option_name = sub_option['option1']
                loader.add_value(
                    'name',
                    "{product} {option}".format(product=product_name,
                                                option=option_name))
                loader.add_value('stock',
                                 sub_option['stock']['is_in_stock'])
                loader.add_xpath('category', category_xpath)
                loader.add_xpath(
                    'brand', "//div[@class='v2-gallery-block']//img/@alt")
                shipping_cost = shipping_for(price)
                # Add shipping cost to product price
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('price', price + shipping_cost)
                loader.add_value('sku', sub_option['sku'])
                loader.add_value('identifier', sub_option['sku'])
                # FIX: the image path is a value, not an XPath expression.
                loader.add_value('image_url',
                                 sub_option['main_image']['large_path'])
                yield finish(loader)
        else:
            loader = ProductLoader(item=Product(), response=response)
            price = extract_price(
                options_data['prices']['price']['amount'])
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            loader.add_value('name', product_name)
            loader.add_value('stock',
                             options_data['stock']['is_in_stock'])
            loader.add_xpath('category', category_xpath)
            loader.add_xpath(
                'brand', "//div[@class='v2-gallery-block']//img/@alt")
            shipping_cost = shipping_for(price)
            # Add shipping cost to product price
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('price', price + shipping_cost)
            loader.add_value('sku', options_data['sku'])
            loader.add_value('identifier', options_data['sku'])
            loader.add_value('image_url',
                             options_data['main_image']['large_path'])
            yield finish(loader)
        return

    # ---- HTML layout ----
    product_name = response.xpath(
        "//h1[@class='fn']//text()").extract()[0]
    options = response.xpath(
        "//div[contains(@class, 'sub-products')]/div")
    sku = ''.join(
        response.xpath(
            "//form[@name='notifications']//input[@name='p']/@value"
        ).extract())

    if options:
        for sub_option_2 in options:
            sku_option = ''.join(
                sub_option_2.xpath("./label/@data-sub-sku").extract())
            loader = ProductLoader(item=Product(), response=response)
            price = extract_price(
                sub_option_2.xpath("./label/@data-subprice").extract()[0])
            if not price:
                # Fall back to the page-level price.
                price = extract_price(''.join(
                    response.xpath(
                        '//p[@class="price-info"]//span[@class="Price"]/text()'
                    ).extract()).strip())
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            option_name = sub_option_2.xpath(
                "./label/@data-option").extract()[0]
            loader.add_value(
                'name',
                u"{product} {option}".format(product=product_name,
                                             option=option_name))
            stock = ''.join(
                sub_option_2.xpath(
                    "./label/@data-stock").extract()).strip().lower()
            stock = '1' if stock in ['limited', 'in stock'] else '0'
            loader.add_value('stock', stock)
            loader.add_xpath('category', category_xpath)
            loader.add_xpath('brand',
                             "//a[@class='product-brand']//img/@alt")
            shipping_cost = shipping_for(price)
            # Add shipping cost to product price
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('price', price + shipping_cost)
            loader.add_value('sku', sku_option)
            loader.add_value('identifier',
                             '{}_{}'.format(sku, sku_option))
            # NOTE(review): this selects a child *element* named
            # "data-image-large"; an attribute may have been intended.
            img = ''.join(
                sub_option_2.xpath("./data-image-large").extract())
            if not img:
                img = ''.join(
                    response.xpath(
                        "//img/@data-original-large").extract())
            loader.add_value('image_url', 'http:' + img)
            yield finish(loader)
        return

    options = response.xpath('//option[@data-name]')
    if options:
        for opt in options:
            loader = ProductLoader(item=Product(), response=response)
            product_image_json = opt.xpath('@data-image').extract()
            if product_image_json:
                product_image_data = json.loads(product_image_json[0])
                loader.add_value('image_url',
                                 product_image_data['default'])
            product_stock = opt.xpath('@data-stock').extract()[0]
            if product_stock == 'Out of Stock':
                loader.add_value('stock', 0)
            option_name = opt.xpath('@data-name').extract()[0]
            loader.add_value('name', product_name + ' ' + option_name)
            price_data = json.loads(
                opt.xpath('@data-price').extract()[0])
            loader.add_value('price', price_data['price'])
            option_sku = opt.xpath('@value').extract()[0]
            loader.add_value('sku', option_sku)
            loader.add_value('identifier', sku + '_' + option_sku)
            loader.add_xpath('category', category_xpath)
            loader.add_xpath('brand',
                             "//a[@class='product-brand']//img/@alt")
            loader.add_value('url', response.url)
            price = loader.get_output_value('price')
            shipping_cost = shipping_for(price)
            # Add shipping cost to product price
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('price', price + shipping_cost)
            yield finish(loader)
        return

    if not sku:
        return

    # Single product without options.
    loader = ProductLoader(item=Product(), response=response)
    price = ''.join(
        response.xpath(
            '//p[@class="price-info"]//span[@class="Price"]/text()'
        ).extract()).strip()
    if price == '':
        price = ''.join(
            response.xpath(
                "//span[@class='Price ']//span[@class='Price-integer' or @class='Price-decimal']//text()"
            ).extract())
        if price == '':
            self.log("Error! No price! URL: {}".format(response.url))
            return
    price = extract_price(price)
    loader.add_value('url', response.url)
    loader.add_value('name', product_name)
    stock = ''.join(
        response.xpath(
            "//span[@class='stock-level']//text()").extract()).strip()
    stock = '1' if stock.lower() in ['limited', 'in stock'] else '0'
    loader.add_value('stock', stock)
    loader.add_xpath('category', category_xpath)
    loader.add_xpath('brand', "//a[@class='product-brand']//img/@alt")
    shipping_cost = shipping_for(price)
    # Add shipping cost to product price
    loader.add_value('shipping_cost', shipping_cost)
    loader.add_value('price', price + shipping_cost)
    loader.add_xpath(
        'sku', "//form[@name='notifications']//input[@name='p']/@value")
    loader.add_xpath(
        'identifier',
        "//form[@name='notifications']//input[@name='p']/@value")
    loader.add_xpath('image_url', "//img/@data-original-large")
    yield finish(loader)
def parse_product(self, response):
    """Emit one item per selectable option of a product page.

    Each ``<option>`` of the attribute select may carry a price delta in
    parentheses; the delta is added to the page price and the option text
    is appended to the product name and identifier.  A page without
    options produces a single item.  Finally, the sibling products linked
    from the first ``product_section_sub`` block are requested, excluding
    the link to the current page.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)

    brand_texts = hxs.select(
        "//*[contains(text(),'Dise') and contains(text(),'ador:')]/text()"
    ).extract()
    brand = brand_texts[0].split(':')[1].strip() if brand_texts else ''

    # Extract product options and price
    option_specs = []
    product_options = hxs.select(
        u'//select[@class="form" and contains(@onchange, "actualiza_atributos")]/option/text()'
    ).extract()
    for option_text in product_options:
        # "<name> (<signed amount>) ..." -> name and price delta.
        parts = re.split(r'[()]', option_text, 2)
        if len(parts) == 1:
            delta = 0
        else:
            price_spec = parts[1]
            sign = -1 if price_spec.startswith('-') else 1
            amount = price_spec.replace('+', '').replace('-', '')
            delta = Decimal(spanishDecimal(amount)) * sign
        option_specs.append({'extra_name': parts[0], 'price_diff': delta})
    if not product_options:
        option_specs.append({'extra_name': '', 'price_diff': 0})

    # Page-level values shared by every option.
    category = hxs.select(
        u'//td[@class="cont_heading_td"]/span[@class="sub_cont_heading_td"]/text()'
    ).extract()
    category = category[0] if category else ''
    image_url = hxs.select(
        u'(//a[@rel="fotografias"])[1]/@href').extract()
    if image_url:
        image_url = urljoin_rfc(get_base_url(response), image_url[0])
    name = hxs.select(
        u'//td[@class="cont_heading_td"]/h1[last()]/text()').extract()[0]
    price_text = hxs.select('//td[@class="preu"]/text()[1]').extract()[0]

    for spec in option_specs:
        extra_name = spec['extra_name']
        price_diff = spec['price_diff']

        product_loader = ProductLoader(item=Product(), response=response)
        if extra_name:
            full_name = "%s - %s" % (name.strip(), extra_name.strip())
        else:
            full_name = name.strip()
        product_loader.add_value('name', full_name)
        product_loader.add_value('url', response.url,
                                 Compose(stripSessionId))
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('image_url', image_url,
                                 Compose(stripSessionId))

        if extra_name:
            # Option items: "<page id>-<option text>".
            identifier = product_loader.get_value(response.url,
                                                  TakeFirst(),
                                                  re='p-([0-9]+)\.html')
            product_loader.add_value('identifier',
                                     "%s-%s" % (identifier, extra_name))
        else:
            product_loader.add_value('identifier',
                                     response.url,
                                     TakeFirst(),
                                     re='p-([0-9]+)\.html')

        product_loader.add_xpath('sku',
                                 '//td[contains(text(), "Ref:")]/text()',
                                 TakeFirst(),
                                 re='Ref: (.+)')

        price = Decimal(spanishDecimal(price_text))
        if price_diff:
            price = price + price_diff
        product_loader.add_value('price', price)
        product_loader.add_value('stock', 1)
        yield product_loader.load_item()

    # parse product options
    more_products = hxs.select(
        u'//div[@class="product_section_sub"][1]/a[@title]/@href'
    ).extract()
    urlpath = response.url.partition('/product-pol')[2]
    url_to_remove = "/product-pol%s" % urlpath
    final_more_products = list(
        set(more_products) - set([url_to_remove]))

    # parse product
    for href in final_more_products:
        yield Request(urljoin_rfc(get_base_url(response), href),
                      callback=self.parse_product)
def parse_product(self, response):
    """Scrape a listing page into one item, or one item per variation.

    Result-list pages (those with a ``ResultSetItems`` block) are handed
    to ``self.parse``.  Pages without a title yield nothing.  When the
    page embeds a variation map, one product per variation is yielded,
    named after the lower-cased option values and identified as
    ``<item id>:<variation key>``; otherwise the base product is yielded.
    """
    hxs = HtmlXPathSelector(response)

    if hxs.select('//div[@id="ResultSetItems"]'):
        for x in self.parse(response):
            yield x
        return

    first_name = ' '.join(
        hxs.select('//*[@id="itemTitle"]/text()').extract()).strip()
    if not first_name:
        return

    # The item id is the last path segment of the URL.
    identifier = response.url.split('?')[0].split('/')[-1]

    breadcrumbs = hxs.select(
        '//*[@id="vi-VR-brumb-lnkLst"]//a/text()').extract()
    category = breadcrumbs[-1] if breadcrumbs else ''

    seller_id = ''.join(
        hxs.select('.//*[@class="si-content"]'
                   '//a/*[@class="mbg-nw"]/text()').extract())

    brand_texts = hxs.select(
        '//*[@class="attrLabels" and contains(text(), "Brand")]'
        '/following-sibling::*/text()').extract()
    brand = brand_texts[0].strip() if brand_texts else ''

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('name', first_name)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', category)
    product_loader.add_value('dealer', 'eBay - ' + seller_id)
    product_loader.add_value('brand', brand)
    product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
    product_loader.add_value('url', response.url)

    # Price: visible price, then sale price, then the embedded
    # binPrice / bidPrice values.
    price = None
    for price_xpath in ('//*[@id="prcIsum"]/text()',
                        '//*[@id="mm-saleDscPrc"]/text()'):
        found = hxs.select(price_xpath).extract()
        if found:
            price = found[0].strip()
            break
    if price is None:
        # FIX: the pattern used a greedy ``.*`` before the group, which
        # left a single character for the capture and could match a later
        # field on the same line.  The prefix is now lazy and confined to
        # the same quoted value.
        match = (re.search(r'"binPrice":"[^"]*?([\d\.,]+)",',
                           response.body)
                 or re.search(r'"bidPrice":"[^"]*?([\d\.,]+)",',
                              response.body))
        # As before, a page with no price at all raises AttributeError.
        price = match.group(1)
    product_loader.add_value('price', extract_price_eu(price))

    # shipping cost (best effort: a page without one is still scraped)
    shipping_texts = hxs.select(
        '//*[@id="shippingSection"]//td/div/text()').extract()
    if shipping_texts and shipping_texts[0]:
        shipping_cost = shipping_texts[0]
        if 'free' in shipping_cost.lower():
            product_loader.add_value('shipping_cost', 0)
        else:
            try:
                product_loader.add_value('shipping_cost',
                                         extract_price(shipping_cost))
            except Exception:
                self.log('Could not parse shipping cost on %s' %
                         response.url)

    product_ = product_loader.load_item()

    # The body is re-parsed with its literal double quotes removed before
    # the variation map is searched for in the text nodes.
    options_variations = []
    sel = HtmlXPathSelector(text=response.body.replace('"', ''))
    var_maps = sel.select('//*/text()').re(r'("menuItemMap":{.*}.*),'
                                           '"unavailableVariationIds"')
    if var_maps:
        json_var_map = unicode(var_maps[0])
        variations = json.loads(
            '{' +
            re.sub(r',"unavailableVariationIds".*', '', json_var_map) +
            '}')
        menu_map = variations['menuItemMap']
        for key, variation in variations['itemVariationsMap'].items():
            if variation['traitValuesMap']:
                new_variation = {}
                for option, value in variation['traitValuesMap'].items():
                    new_variation[option] = menu_map[str(
                        value)]['displayName']
                options_variations.append({
                    'price': variation['price'],
                    'values': new_variation,
                    'identifier': '%s:%s' % (identifier, key)
                })

    if not options_variations:
        yield product_
        return

    for model in options_variations:
        model_name = first_name + ' ' + ' '.join(
            opt_name.strip().lower()
            for opt_name in model['values'].values())
        new_product = Product(product_)
        new_product['name'] = model_name
        new_product['identifier'] = model['identifier']
        new_product['price'] = extract_price_eu(model['price'])
        yield new_product
def parse_product(self, response):
    """Scrape a product page, expanding a grouped-product table if present.

    The spider is closed (``CloseSpider``) when the page shows no standard
    delivery price or no SKU.  Pages without a name yield nothing.  The
    category comes from ``self.made_products`` (keyed by upper-cased SKU),
    else from the breadcrumbs, else from the URL path.  Items are emitted
    through ``self.yield_product``: one per row of the
    ``super-product-table`` when it exists, otherwise the page product.
    Stock is not set here.
    """
    hxs = HtmlXPathSelector(response)

    def close_spider(reason):
        # Log the reason and the full body, then stop the crawl.
        self.log(reason)
        self.log(response.body)
        self.log('Closing spider')
        raise CloseSpider

    page_loader = ProductLoader(item=Product(), selector=hxs)
    page_loader.add_xpath('identifier', '//input[@name="product"]/@value')
    if not page_loader.get_output_value('identifier'):
        page_loader.add_xpath(
            'identifier',
            'substring-after(//span[starts-with(@id,"product-price-")]/@id, "product-price-")'
        )
    page_loader.add_xpath('sku',
                          '//*[contains(text(),"UGK")]/../*[2]/text()')
    page_loader.add_value('url', response.url)
    page_loader.add_xpath('name', '//div[@itemprop="name"]//text()')
    page_loader.add_xpath('image_url',
                          '//meta[@itemprop="image"]/@content')
    page_loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

    shipping = hxs.select(
        u'//*[contains(text(), "livraison standard")]//following-sibling::*/span[@class="price"]/text()'
    ).extract()
    if not shipping:
        close_spider('No shipping cost on %s' % response.url)
    page_loader.add_value('shipping_cost', shipping[0].replace(',', '.'))

    product_name = page_loader.get_output_value('name')
    if not product_name:
        return

    # Brand: names starting with the token "2" are mapped to "Flynn";
    # otherwise the text before the first comma is used.
    if product_name.split()[0] == '2':
        page_loader.add_value('brand', 'Flynn')
    else:
        page_loader.add_value('brand', product_name.split(',')[0])

    raw_sku = page_loader.get_output_value('sku')
    sku = raw_sku.upper().strip() if raw_sku else ''
    if not sku:
        close_spider('No SKU on %s' % response.url)

    no_category = False
    made_product = self.made_products.get(sku, None)
    if made_product:
        page_loader.add_value('category', made_product['Category'])
    else:
        page_loader.add_xpath(
            'category',
            '//div[@class="breadcrumbs"]/ul/li[position()>1]/a/span/text()'
        )
        if not page_loader.get_output_value('category'):
            # Last resort: derive the category from the URL path segments.
            page_loader.add_value('category',
                                  (x.replace('-', ' ')
                                   for x in response.url.split('/')[3:-1]))
            no_category = True

    product = page_loader.load_item()

    catmap = {
        "bedding and bath": "Bed & Bath",
        "beds": "Beds",
        "chairs": "Chairs",
        "homewares accessories": "Home Accessories",
        "lighting": "Lighting",
        "sofas and armchairs": "Sofas",
        "storage": "Storage",
        "tables": "Tables",
    }
    product['category'] = catmap.get(product['category'],
                                     product['category'])

    rows = hxs.select(
        '//table[@id="super-product-table"]//tr/td[@class="price"]/..')
    if not rows:
        for x in self.yield_product(product, no_category):
            yield x
        return

    for row in rows:
        # Each row starts from a copy of the page product.
        row_loader = ProductLoader(item=Product(product), selector=row)
        row_loader.add_xpath(
            'identifier',
            'substring-after(.//span[starts-with(@id,"product-price-")]/@id, "product-price-")'
        )
        row_loader.add_value('name', product['name'])
        row_loader.add_xpath('name', './/td[1]/text()')
        row_loader.add_xpath('price',
                             './/span[@property="price"]/@content')
        for x in self.yield_product(row_loader.load_item(), no_category):
            yield x
def parse(self, response):
    """Parse a category listing: force list view, paginate and emit items.

    Steps, in order:
      1. If the page offers an inactive "list view" link, request that
         view and stop.
      2. Read the total and current page numbers and queue the next page.
      3. Yield one item per product row whose identifier has not been seen
         in ``self.new_ids``.
      4. If there is no next page and either no products or this is not the
         last page, treat the response as blocked/incomplete and retry it
         (up to 10 times).

    Expects ``category`` and ``cookiejar`` in ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    def url_processing(u):
        # Receives a list of matches: keep the last one, made absolute and
        # stripped of fragment and query string.
        if not u:
            return ''
        return urljoin_rfc(base_url, u[-1]).split('#')[0].split('?')[0]

    def price_processing(p):
        # A row without a price node yields an empty list; indexing it
        # raised IndexError and aborted the whole page.  Returning None is
        # presumably skipped by the loader - verify against ProductLoader.
        return extract_price_eu(p[0]) if p else None

    list_view_mode = hxs.select(
        '//a[contains(@class, "enable-view") '
        'and contains(@class, "enable-list-view") '
        'and not(contains(@class, "active"))]/@href').extract()
    if list_view_mode:
        yield Request(url_processing(list_view_mode),
                      meta={
                          'category': response.meta['category'],
                          'cookiejar': response.meta['cookiejar']
                      })
        return

    # The page count is carried across requests so a degraded page that
    # lacks the counter can still be recognised as "not the last page".
    last_page_no = int(response.meta.get('last_page_no', 0))
    if not last_page_no:
        last_page_no = hxs.select(
            '//input[@id="page-counter"]/@data-pagecount').extract()
        last_page_no = int(last_page_no[0] if last_page_no else 0)

    current_page_no = hxs.select(
        '//input[@id="page-counter"]/@data-currentpage').extract()
    current_page_no = int(current_page_no[0] if current_page_no else 0)
    is_last_page = (current_page_no == last_page_no)

    next_page = hxs.select(
        '//li[contains(@class, "page-arrow") and contains(@class, "arrow-next")]//a/@href'
    ).extract()
    if next_page:
        yield Request(url_processing(next_page),
                      meta={
                          'category': response.meta['category'],
                          'cookiejar': response.meta['cookiejar'],
                          'last_page_no': last_page_no
                      })

    products = hxs.select(
        '//div[contains(@class, "category-list-body")]'
        '/div[@data-pid and contains(@class, "cat-prod-row")]')
    for product_xs in products:
        loader = ProductLoader(item=Product(), selector=product_xs)
        loader.add_xpath(
            'name',
            './/strong[contains(@class, "cat-prod-row-name")]//a/text()')
        loader.add_xpath('identifier', '@data-pid')
        loader.add_xpath('sku', '@data-pid')
        loader.add_xpath(
            'url',
            './/strong[contains(@class, "cat-prod-row-name")]//a/@href',
            url_processing)
        loader.add_xpath('price',
                         './/strong[contains(@class, "price")]/text()',
                         price_processing)
        loader.add_value('category', response.meta['category'].split(','))
        loader.add_xpath(
            'image_url',
            './/div[contains(@class, "cat-prod-row-foto")]//img[@data-original]'
            '/@data-original|.//div[contains(@class, "cat-prod-row-foto")]//img/@src',
            url_processing)
        item = loader.load_item()
        if item['identifier'] not in self.new_ids:
            self.new_ids.append(item['identifier'])
            yield item

    # Same condition as the original
    # (no products and no next) or (not last page and no next).
    if not next_page and (not products or not is_last_page):
        blocked_url = url_query_parameter(response.url, 'returnUrl')
        if blocked_url:
            blocked_url = urljoin_rfc(base_url, blocked_url)
            self.log('ERROR: Blocked URL => %s' % blocked_url)
        else:
            self.log('ERROR: No products or no next page in => %s' %
                     response.url)
        retry_no = int(response.meta.get('retry_no', 0))
        if retry_no < 10:
            retry_no += 1
            self.log('DEBUG: Retrying page - Retry No: %s' % retry_no)
            yield Request(blocked_url or response.url,
                          meta={
                              'category': response.meta['category'],
                              'cookiejar': response.meta['cookiejar'],
                              'retry_no': retry_no,
                              'last_page_no': last_page_no
                          },
                          dont_filter=True)
def parse_product(self, response):
    """Yield one item per option row of every variant block on the page.

    Shared fields (name, url, brand, category, sku, image, shipping) are
    loaded once; each ``div.variant`` / ``tr`` combination then gets a deep
    copy with its own identifier, name, price, stock and image.  Items
    priced below 30.00 get a shipping cost of '1.99' instead of '0.00'.
    Pages without variants are only logged.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand'))

    categories = hxs.select(
        '//div[@id="breadcrumbs"]/div[@class="crumbs"]/span/a/span/text()'
    ).extract()
    # The first two breadcrumb entries are skipped.
    for category in categories[2:]:
        loader.add_value('category', category)

    sku = hxs.select('//meta[@itemprop="sku"]/@content').extract()
    loader.add_value('sku', sku)

    image_url = hxs.select(
        '//div[@id="product-image"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))

    # The product name doubles as the identifier prefix.
    identifier = loader.get_output_value('name')
    loader.add_value('shipping_cost', '0.00')
    item = loader.load_item()

    variants = response.xpath('//div[@class="variant"]')
    if not variants:
        self.log('PRODUCT WITHOUT OPTIONS: ' + response.url)
        return

    for variant in variants:
        variant_name = variant.select(
            './/div[@class="title"]/h4/text()')[0].extract().strip()
        # The price is stated once per variant block, not per option row.
        price_text = variant.xpath(
            './/span[@class="now"]/text()').extract_first(
            ) or variant.css('p.price span::text').extract_first()

        for option in variant.select('.//tr'):
            option_name = option.select(
                './/td[@class="name"]/text()')[0].extract().strip()
            option_item = deepcopy(item)

            # Format in unicode throughout.  The previous byte-string
            # format() implicitly ASCII-encoded the product and variant
            # names and raised UnicodeEncodeError on non-ASCII text.
            option_item['identifier'] = u'{}-{}-{}'.format(
                identifier, variant_name, option_name)
            # Do not repeat the option name when it equals the variant name.
            if option_name.lower() != variant_name.lower():
                name_suffix = option_name
            else:
                name_suffix = u''
            option_item['name'] += u' {} {}'.format(variant_name, name_suffix)
            option_item['name'] = option_item['name'].strip()

            option_item['price'] = extract_price(
                price_text) if price_text else Decimal('0.00')
            if Decimal(option_item['price']) < Decimal('30.00'):
                option_item['shipping_cost'] = '1.99'

            if not option.select('.//td[@class="stock instock"]'):
                option_item['stock'] = 0

            option_item['image_url'] = variant.select(
                './/img/@src')[0].extract()
            yield option_item
def parse(self, response):
    """Crawl category, sub-category and pagination links and emit products.

    Category and sub-category links are requested with
    ``self.page_query + '1'`` appended.  Products are read from the
    ``h2.product-name`` layout first; only if that yields nothing is the
    legacy table layout of the search-results form tried.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    def build_item(url, name, price):
        # The name is wrapped in whitespace/CRLF exactly as before; any
        # cleanup is left to the loader.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', u' \r\n' + name + u'\r\n')
        loader.add_value('price', price)
        loader.add_value('sku', '')
        return loader.load_item()

    # categories, then sub-categories
    for links_xpath in ('//ul[@id="nav"]/li//a/@href',
                        "//a[@class='subcategory_link']/@href"):
        for href in hxs.select(links_xpath).extract():
            yield Request(
                urljoin_rfc(base_url, href) + self.page_query + str(1))

    # pages
    for page_url in hxs.select("//div[@class='pages']//a/@href").extract():
        yield Request(page_url)

    # products list
    products_count = 0
    products = hxs.select('//h2[@class="product-name"]/..')
    if not products:
        print("ERROR!! NO PRODUCTS!! %s " % response.url)
    for product_el in products:
        name = product_el.select(".//h2/a/text()").extract()
        if not name:
            continue
        url = product_el.select(".//h2/a/@href").extract()
        if not url:
            print("ERROR!! NO URL!! %s" % response.url)
            continue
        price = product_el.select(
            './/span[@class="price" and starts-with(@id, "product-price")]/text()'
        ).extract()
        if not price:
            price = product_el.select(
                './/span[@class="price"]/text()').extract()
        if not price:
            print("ERROR!! NO PRICE!! %s" % response.url)
            price = '0'
        else:
            price = price[0]
        products_count += 1
        yield build_item(url[0], name[0], price)

    if products_count:
        return

    # products list 2 (table layout)
    products = hxs.select(
        "//form[@class='search_results_section']/table[2]/tr/td/table/tr/td/table/tr"
    )
    if not products:
        print("ERROR!! NO PRODUCTS!! %s " % response.url)
    for product_el in products:
        name = product_el.select(
            "td/a[@class='productnamecolor colors_productname']/text()"
        ).extract()
        if not name:
            continue
        url = product_el.select(
            "td/a[@class='productnamecolor colors_productname']/@href"
        ).extract()
        if not url:
            print("ERROR!! NO URL!! %s" % response.url)
            continue
        price = product_el.select(
            './/font[@class="pricecolor colors_productprice"]/text()'
        ).extract()
        if not price:
            print("ERROR!! NO PRICE!! %s" % response.url)
            continue
        # Use the first text node.  The list itself used to be concatenated
        # with the unicode padding, which raised TypeError for every row.
        yield build_item(url[0], name[0], price[0])
def parse_product(self, response):
    """Yield the product, or one item per configurable option combination.

    URLs listed in ``self._ignore_urls`` are skipped.  When the page embeds
    a ``Product.Config`` JSON blob, every simple product referenced by its
    attributes is emitted with the option labels appended to the name and
    the option price deltas added to the base price.  Otherwise a single
    item is emitted, unless its name is in ``self._ignore_names``.

    Shipping is free when the "Free ... Shipping" label is present on the
    product image, 11.99 otherwise.
    """
    if response.url in self._ignore_urls:
        return

    hxs = HtmlXPathSelector(response)
    url = response.url
    name = hxs.select(
        "//*[@class='product-name']/*[@itemprop='name']/text()").extract()
    price = hxs.select(
        "//span[@itemprop='offers']/span[@itemprop='price']/@content"
    ).extract()
    category = hxs.select(
        "//li[contains(@class, 'category')]/a/text()").extract()
    image_url = hxs.select('//img[@id="image-main"]/@src').extract()

    free_shipping = hxs.select(
        '//div[@class="product-img-box"]//div[@class="onsale-product-label-image"]/table/tr/td[text()[contains(.,"Shipping")] and text()[contains(.,"Free")]]'
    ).extract()
    shipping_cost = Decimal(0) if free_shipping else 11.99

    identifier = hxs.select('//input[@name="product"]/@value').extract()
    if not identifier:
        self.log("ERROR identifier not found")
    else:
        identifier = identifier[0]

    brand = hxs.select('//span[@itemprop="brand"]/@content').extract()
    if not brand:
        self.log("ERROR brand not found")
    else:
        brand = brand[0]

    # stock: 1 = in stock, 0 = out of stock, None = unknown (field omitted).
    stock = 0
    try:
        p_stock = hxs.select('//meta[@itemprop="availability"]/@content'
                             ).extract()[0].lower()
    except IndexError:
        stock = None
        self.log("ERROR stock not found")
    else:
        if 'in_stock' in p_stock:
            stock = 1

    def build_item(item_name, item_price, item_identifier):
        # Fields common to the simple and the configurable case.
        l = ProductLoader(response=response, item=Product())
        l.add_value('name', item_name)
        l.add_value('price', item_price)
        l.add_value("identifier", item_identifier)
        l.add_value("brand", brand)
        l.add_value("shipping_cost", shipping_cost)
        l.add_value('category', category)
        l.add_value('image_url', image_url)
        l.add_value('url', url)
        if stock is not None:
            l.add_value("stock", stock)
        return l.load_item()

    # The config call appears with and without spaces around "=".
    options_config = re.search(r'var spConfig=new Product.Config\((.*)\)',
                               response.body)
    if not options_config:
        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)

    if options_config:
        product_data = json.loads(options_config.groups()[0])
        products = {}
        prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    # Accumulate " - <label>" per attribute and sum the
                    # price deltas of every option the product belongs to.
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
                    prices[product] = prices.get(product, 0) + float(
                        option['price'])

        for option_identifier, option_name in products.iteritems():
            yield build_item(name[0] + ' ' + option_name,
                             float(price[0]) + prices[option_identifier],
                             identifier + '-' + option_identifier)
    else:
        # `name` is a list of text nodes; testing the list itself against
        # the ignore names could never match, so compare the name string.
        if name and name[0].strip() in self._ignore_names:
            return
        yield build_item(name, price, identifier)
def parse_product(self, response):
    """Yield a replica-kit item per size, player and player+tournament.

    For every size option a base item is emitted.  If the page offers
    player personalisation, one extra item per player is emitted with the
    personalisation price added; if tournament personalisation is offered
    as well, one more item per player/tournament pair is emitted.

    Item metadata carries ``size`` and, for personalised items, ``player``
    and ``number`` parsed from a "<number> <name>" option text.
    """
    schema = SpiderSchema(response)
    pdata = schema.get_product()

    sku = pdata.get('mpn', '')
    image = pdata['image'].replace('example.com', 'prodirectsoccer.com')
    main_id = response.xpath(
        '//div[@id="define-profile"]/@data-quickref').extract()[0]
    main_name = pdata['name']
    main_price = extract_price(pdata['offers']['properties']['price'])
    main_brand = response.meta.get('brand')
    shipping = '9.93'

    sizes = response.xpath('//select[@id="size"]/option[@value!=""]')
    player_sel_label = response.xpath(
        '//label[@for="pers-opt1"]/text()').extract()
    player_tourn_sel_label = response.xpath(
        '//label[@for="pers-opt2"]/text()').extract()

    # Personalisation options do not depend on the size, so read them once.
    players = []
    player_sel_price = None
    if player_sel_label:
        player_sel_price = extract_price(player_sel_label[0])
        for player_opt in response.xpath(
                '//select[@id="pers-player"]/option[@value!=""]'):
            players.append(
                (player_opt.xpath('text()').extract()[0].strip(),
                 player_opt.xpath('@value').extract()[0].strip()))

    tournaments = []
    player_tourn_price = None
    if player_tourn_sel_label:
        player_tourn_price = extract_price(player_tourn_sel_label[0])
        for tourn_opt in response.xpath(
                '//select[@id="pers-tournament"]/option[@value!=""]'):
            tournaments.append(
                (tourn_opt.xpath('text()').extract()[0].strip(),
                 tourn_opt.xpath('@value').extract()[0].strip()))

    def personalised_metadata(base_metadata, player_desc):
        # Product(item) copies fields shallowly, so each variant needs its
        # own dict.  All variants of a size used to share and mutate one
        # dict, leaking player data into the base item and siblings.
        metadata = dict(base_metadata)
        match = re.search(r'(\d+)\s(.*)', player_desc)
        if match:
            player_number, player_name = match.groups()
            metadata['player'] = player_name.strip()
            metadata['number'] = player_number
        return metadata

    for size_opt in sizes:
        size_desc = size_opt.xpath('text()').extract()[0].strip()
        size_value = size_opt.xpath('@value').extract()[0].strip()

        # Option text looks like "<size>" or "<size> <stock note>".
        in_stock = True
        if ' ' in size_desc:
            size_desc, stock = size_desc.split(' ', 1)
            if 'OUT OF STOCK' in stock.upper():
                in_stock = False

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', main_id + 'x' + size_value)
        loader.add_value('name', main_name + ' - ' + size_desc)
        loader.add_value('sku', sku)
        loader.add_value('price', main_price)
        loader.add_value('shipping_cost', shipping)
        loader.add_value('url', response.url)
        loader.add_value('image_url', image)
        if main_brand:
            loader.add_value('brand', main_brand)
        loader.add_value('category', 'Replicas')
        if not in_stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        item['metadata'] = {'size': size_desc}
        yield item

        for player_desc, player_value in players:
            new_item = Product(item)
            new_item['identifier'] += 'x' + player_value
            new_item['name'] += ' - ' + player_desc
            new_item['price'] = Decimal(
                new_item['price']) + player_sel_price
            new_item['metadata'] = personalised_metadata(
                item['metadata'], player_desc)
            yield new_item

            for tourn_desc, tourn_value in tournaments:
                new_item = Product(item)
                new_item[
                    'identifier'] += 'x' + player_value + 'x' + tourn_value
                new_item[
                    'name'] += ' - ' + player_desc + ' - ' + tourn_desc
                new_item['price'] = Decimal(
                    new_item['price']) + player_tourn_price
                new_item['metadata'] = personalised_metadata(
                    item['metadata'], player_desc)
                yield new_item
def parse_product(self, response):
    """Build the product item and request its first page of reviews.

    The price text is looked up in three alternative page layouts; pages
    without a price (products not sold online) yield nothing.  The
    shipping cost comes from the embedded "product/data" JSON when
    available and from the shipping headline otherwise.

    Expects ``sku`` and ``brand`` in ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)

    # The embedded JSON is only used for the shipping cost, which already
    # has a CSS fallback, so a missing or unparsable blob must not abort
    # the whole product (it used to raise TypeError/AttributeError here).
    data = {}
    script = response.xpath(
        '//script/text()[contains(., "product/data")]').extract_first()
    match = re.search('product/data",[ \n]*({.+})',
                      script) if script else None
    if match:
        try:
            data = json.loads(match.group(1))
        except ValueError:
            self.log('Could not parse product data JSON on %s' %
                     response.url)

    price = ''.join(
        hxs.select(
            '//div[contains(@class, "js-product-offer-summary")]//div[contains(@class, "price-display")]//text()'
        ).extract())
    if not price:
        price = ''.join(
            response.xpath(
                '//div[@itemprop="offers"]//div[@itemprop="price"][1]//text()'
            ).extract())
    if not price:
        price = ''.join(
            response.xpath(
                '//span[contains(@class, "hide-content-m")]/span[@data-tl-id="Price-ProductOffer"]//text()'
            ).extract())

    # Some products are not available online and these have no price
    if not price:
        return

    in_stock = 'out of stock' not in price.lower()

    name_parts = hxs.select(
        '//h1[contains(@itemprop, "name")]//text()').extract()
    product_name = [part.strip() for part in name_parts if part.strip()]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    # The identifier is the numeric tail of the product URL.
    loader.add_value('identifier',
                     re.search(r'/(\d+)$', response.url).group(1))
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('brand', response.meta['brand'])

    categories = hxs.select(
        '//ol[contains(@class, "breadcrumb-list")]//li//a/span/text()'
    ).extract()
    loader.add_value('category', [c.strip() for c in categories])
    loader.add_value('url', response.url)
    loader.add_xpath(
        'image_url',
        '//img[contains(@class, "js-product-primary-image")]/@src')

    try:
        shipping = data['buyingOptions']['shippingPrice']['displayPrice']
    except (KeyError, TypeError):
        # TypeError covers an intermediate value that is null in the JSON.
        loader.add_css('shipping_cost', 'h2.js-shipping-primary-msg::text')
    else:
        loader.add_value('shipping_cost', shipping)

    loader.add_value('price', price)
    if not in_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = {}
    yield Request(self._get_reviews_url(item, 1),
                  meta={
                      'product': item,
                      'page': 1
                  },
                  callback=self.parse_product_reviews)
def parse(self, response):
    """Build one book item, with ``BookpeopleMeta`` metadata, from a page.

    Pages without a ``#product_title`` heading are logged and skipped.
    A missing price is recorded as 0; ``stock`` is only set (to 0) when
    the stock status text contains "OUT OF STOCK".
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    titles = hxs.select('//h1[@id="product_title"]/text()').extract()
    if not titles:
        # product not found. Just continue
        self.log('WARNING: Product not found => %s' % response.url)
        return
    name = titles[0].strip()

    # Only the last stock-status text node is inspected.
    stock_texts = hxs.select('//p[@id="stock_status"]/text()').extract()
    out_of_stock = bool(stock_texts) and \
        'OUT OF STOCK' in stock_texts[-1].upper()

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@id="main_image"]/@src',
                     TakeFirst(), Compose(lambda v: urljoin(base_url, v)))
    loader.add_xpath(
        'price',
        '//div[@class="product_price"]/span[@class="price"]/text()',
        TakeFirst(),
        re="([.0-9]+)")
    if not loader.get_output_value('price'):
        loader.add_value('price', 0)

    category = hxs.select(
        '//ul[@id="crumbs"]/li[@class="last"]/a/text()').extract()
    if category:
        loader.add_value('category', category[0].strip())

    isbn = hxs.select('//li[@itemprop="ISBN13"]/text()').extract()
    loader.add_value('sku', isbn[-1].strip() if isbn else '')

    brand = hxs.select(
        '//div[@id="product_title_container"]/span[@class="secondary"]/text()'
    ).extract()
    if brand:
        loader.add_value('brand', brand[0].strip())

    # Prefer the form's ProductID, fall back to the listed id.
    identifier = hxs.select('//input[@name="ProductID"]/@value').extract() \
        or hxs.select('//li[@itemprop="id"]/text()').extract()
    loader.add_value('identifier', identifier[0])

    if out_of_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()

    def last_stripped(values):
        # Last text node, stripped; '' when nothing matched.
        return values[-1].strip() if values else ''

    metadata = BookpeopleMeta()

    pre_order = hxs.select(
        '//button[contains(@class, "submit") and text()="Pre order"]')
    metadata['pre_order'] = 'Yes' if pre_order else ''

    author = hxs.select(
        '//span[contains(em/text(), "author")]/a/text()').extract()
    metadata['author'] = author[0] if author else ''

    metadata['format'] = last_stripped(
        hxs.select('//li[@itemprop="Format"]/text()').extract())

    publisher = hxs.select('//span[@itemprop="publisher"]/a/text()').re(
        ': (.*)')
    metadata['publisher'] = publisher[0] if publisher else ''

    metadata['published'] = last_stripped(
        hxs.select('//li[@itemprop="publication date"]/text()').extract())

    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Yield kit items per size, plus a badge-printing variant of each.

    Two kinds of product JSON are read from the page scripts: the base
    product (``product`` + 6 word characters) and optional "hero" shirts
    (``product`` + 7 digits).  The hero shirt is the selected option of the
    ``heroShirts`` select, or the only one present.  For every size
    variation of the hero and base product an item is emitted; when the
    base product offers printing type 3, a second item including that
    printing is emitted too.

    Items whose identifier is already in ``self.extracted_identifiers``
    are not emitted again.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('category', 'Kits')

    scripts = response.xpath('//script/text()')
    base_data = scripts.re(r'product\w{6} =(.+?});var')
    hero_data = scripts.re(r'product\d{7} =(.+?});var')

    if base_data:
        base_data = json.loads(base_data[0])

    if hero_data:
        hero_data = [json.loads(elem) for elem in hero_data]
        selected_hero = response.xpath(
            '//select[contains(@class,"heroShirts")]/option[@selected]/@value'
        ).extract_first()
        if selected_hero:
            hero_data = {elem['ProductID']: elem
                         for elem in hero_data}[int(selected_hero)]
        elif len(hero_data) == 1:
            hero_data = hero_data[0]
        else:
            hero_data = {}
    else:
        hero_data = {}

    # Everything below reads base_data unconditionally.  The old guard
    # (`not base_data and not hero_data`) let an empty base_data list
    # through whenever hero data existed, which raised TypeError.
    if not base_data:
        return

    loader.add_value('name', base_data['Description'])
    loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
    if base_data['Brand']:
        loader.add_value('brand', base_data['Brand']['Name'].title())
    loader.add_value('image_url', response.urljoin(base_data['ImageURL']))
    loader.add_value('shipping_cost', self.shipping_cost)
    product = loader.load_item()

    # Player names: hero descriptions look like "... with <player> <number>".
    player = number = None
    player_from_name = re.search(r'with *([\w\ \.\-]+?) (\d+)',
                                 hero_data.get('Description', ''),
                                 re.UNICODE)
    if player_from_name:
        player, number = player_from_name.groups()

    def build_metadata(data, size):
        # Player details only apply to items built from the hero shirt.
        if player_from_name and data == hero_data:
            return {'player': player, 'number': number, 'size': size}
        return {'size': size}

    def finalise(item, metadata):
        # Apply the free-delivery threshold, then de-duplicate.  Returns
        # True when the item is new and should be yielded.
        if self.free_delivery_over is not None and \
                self.free_delivery_over <= item['price']:
            item['shipping_cost'] = '0.00'
        if item['identifier'] in self.extracted_identifiers:
            return False
        self.extracted_identifiers.append(item['identifier'])
        item['metadata'] = metadata
        return True

    for data in [hero_data, base_data]:
        for variation in data.get('Variations', []):
            size = variation['Description']

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, product)
            loader.replace_value('identifier', variation['VariationId'])
            loader.replace_value('name', data['Description'] + u' ' + size)
            loader.replace_value(
                'price',
                Decimal(str(variation['PriceActual'])) * self.exchange_rate)
            if data.get('ImageURL'):
                # NOTE(review): checks data's image but sets the base
                # product's - possibly data['ImageURL'] was intended.
                loader.replace_value(
                    'image_url', response.urljoin(base_data['ImageURL']))
            if not variation['IsInStock']:
                loader.replace_value('stock', 0)
            identifier = str(variation['VariationId'])

            item = loader.load_item()
            if finalise(item, build_metadata(data, size)):
                yield item

            # Badges: printing type 3 of the base product, if offered.
            printings = {elem['PrintingTypeID']: elem
                         for elem in base_data['printingitems']}
            printing = printings.get(3)
            if not printing:
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, item)
            option_name = loader.get_output_value('name') + u' ' + \
                printing['PrintingDescription']
            loader.replace_value('name', option_name)
            price = Decimal(str(variation['PriceActual'])) + \
                Decimal(str(printing['PriceActual']))
            loader.replace_value(
                'price', format_price(price * self.exchange_rate))
            identifier += '-' + str(printing['PrintingID'])
            loader.replace_value('identifier', identifier)

            item = loader.load_item()
            if finalise(item, build_metadata(data, size)):
                yield item
def parse_product(self, response):
    """Build the product item and request its Bazaarvoice reviews.

    The identifier is the lower-cased text after the last ``skuid=`` in the
    URL.  Pages without a name, without a ``skuid=`` value, or whose
    breadcrumbs contain the word "books" are logged and skipped.  The item
    is not yielded here; it travels in the meta of the reviews request,
    whose callback is ``self.parse_reviews``.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    brand = response.meta.get('brand', '')

    name = hxs.select(
        "//div[@class='primary-content']//div[@id='product-title']/h1/text()"
    ).extract()
    if not name:
        logging.error("ERROR! NO NAME! %s" % url)
        return
    name = name[0]

    price = hxs.select(
        "//div[@class='secondary-content']//ul[@class='pricing']/li[@class='current-price']/span/text()"
    ).extract()
    if not price:
        logging.error("ERROR! NO PRICE! %s %s" % (url, name))
        price = ''
    else:
        # The price is joined from the first two text nodes.
        price = "".join(price[:2])

    # str.split always returns at least one element, so the previous
    # `len(...) > 0` test never failed and a URL without "skuid=" used the
    # whole URL as SKU.  rpartition reports whether the marker was found.
    _, marker, sku = url.lower().rpartition('skuid=')
    if not marker or not sku:
        logging.error("ERROR! SKU! %s %s" % (url, name))
        return

    categories = " ".join(
        hxs.select("//div[@id='breadcrumbs']//li//a/text()").extract()
    ).lower().replace('\n', ' ').split(' ')
    if 'books' in categories:
        logging.error("ERROR! Product not valid %s %s" % (url, name))
        return

    l = ProductLoader(item=Product(), response=response)
    l.add_value('identifier', sku)
    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    l.add_value('brand', brand.strip().lower())
    product = l.load_item()

    metadata = KeterMeta()
    metadata['brand'] = brand.strip().lower()
    metadata['reviews'] = []
    product['metadata'] = metadata

    # First batch of up to 100 reviews (offset 0) for this product id.
    review_url = (
        'http://api.bazaarvoice.com/data/batch.json?passkey=asiwwvlu4jk00qyffn49sr7tb&apiversion=5.4&displaycode=1235-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A'
        + sku +
        '&filter.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&sort.q0=rating%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv182_28795'
    )
    yield Request(review_url,
                  meta={
                      'product': product,
                      'offset': 0,
                      'sku': sku
                  },
                  callback=self.parse_reviews)
def parse(self, response):
    """Parse a brand/category listing and request every product page.

    Non-HTML responses are re-requested (at most 100 times per URL,
    tracked in ``self.retry_urls``).  Pagination links are followed with
    this same callback.  The page heading is used as both category and
    brand.  The partially loaded item travels to ``self.parse_product`` in
    the request meta.
    """
    if not isinstance(response, HtmlResponse):
        retry_count = self.retry_urls.get(response.url, 0) + 1
        if retry_count > 100:
            self.log("ERROR MAX retry count reached (100), giving up...")
        else:
            self.log(
                "ERROR - got response that is not HTML, adding to retry queue (#{})"
                .format(retry_count))
            self.retry_urls[response.url] = retry_count
            yield Request(url=response.url,
                          callback=self.parse,
                          dont_filter=True)
        # Nothing can be parsed from a non-HTML response.  Execution used
        # to fall through to the selector code after queueing the retry.
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    pages = hxs.select(
        '//div[@class="contents"]/div//h2//div[@class="pagination"]//a/@href'
    ).extract()
    for page in pages:
        yield Request(url=urljoin_rfc(base_url, page), callback=self.parse)

    category = hxs.select('//div[@class="contents"]/h1/text()').extract()
    if not category:
        self.log('ERROR - No category name found!')
        category = brand = ''
    else:
        category = brand = category[0]

    products = hxs.select(
        '//div[@class="contents"]/table//tr[td[@valign="middle"]]')
    if not products:
        self.log('ERROR - empty products list, needs investigation!')
        return

    for product in products:
        # Rows without an add-to-cart link are skipped.
        product_id = product.select('.//a[@class="buttonBig"]/@href').re(
            r'add_to_cart/(\d+)')
        if not product_id:
            continue

        product_url = product.select('.//td//font//b//a/@href').extract()
        if not product_url:
            # Without a URL there is no product page to request.  The
            # item['url'] lookup used to raise KeyError here and dropped
            # the remaining rows of the page.
            self.log('ERROR - no URL for product %s, skipping' %
                     product_id[0])
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url',
                                 urljoin_rfc(base_url, product_url[0]))

        product_image = product.select(
            './/img[@class="product_image"]/@src').extract()
        if product_image:
            product_loader.add_value(
                'image_url', urljoin_rfc(base_url, product_image[0]))

        product_loader.add_value('identifier', product_id)
        product_loader.add_xpath('name', './/td//font//b//a/text()')
        product_loader.add_xpath('price',
                                 './/td//font[@class="price"]//b/text()')
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)

        item = product_loader.load_item()
        yield Request(item['url'],
                      callback=self.parse_product,
                      meta={'item': item})
def parse_product(self, response):
    """Yield the product, or one item per configurable option combination.

    When the page embeds a ``Product.Config`` blob, every simple product
    referenced by its attributes is emitted with identifier
    ``<product id>_<simple id>``, the option labels appended to the name
    and the config's base price.  Otherwise a single item is emitted with
    the ``itemprop="price"`` value, falling back to the "old price" element.

    The brand is the first entry of ``self.brands`` that the product name
    starts with.  The text after "SKU# " is stored as ``ean`` metadata.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//*[@id="main-image"]/@href').extract()

    try:
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0].strip()
    except IndexError:
        # No hidden input: take the id from the add-to-cart form action.
        product_identifier = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')[0]

    product_name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0].strip()
    category = response.meta.get('category')

    sku = hxs.select('//div[@class="sku-package"]/text()').extract()
    sku = sku[0].strip().replace('SKU# ', '') if sku else ''

    brand = ''
    for b in self.brands:
        if product_name.startswith(b):
            brand = b
            break

    # Accept both "spConfig=new" and "spConfig = new" in a single pattern
    # instead of copying the whole body just to normalise the spacing.
    options_config = re.search(
        r'var spConfig(?: = |=)new Product.Config\((.*)\)', response.body)

    ean = hxs.select(
        '//div[@class="sku-package" and contains(text(), "SKU# ")]/text()'
    ).extract()
    ean_code = ean[0].split("SKU# ")[-1].strip() if ean else None

    def build_product(identifier, name, price):
        # Fields shared by the simple and the configurable case.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product_loader.add_value('sku', sku)
        product = product_loader.load_item()
        if ean:
            product['metadata'] = {"ean": ean_code}
        return product

    if options_config:
        # demjson is used rather than json; with return_errors=True the
        # decoded object is the first element of the result.
        product_data = demjson.decode(options_config.groups()[0],
                                      return_errors=True)[0]
        products = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    # Accumulates " - <label>" for each attribute.
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))

        for identifier, option_name in products.iteritems():
            price = float(product_data['basePrice'])
            yield build_product(product_identifier + '_' + identifier,
                                product_name + option_name,
                                round(price, 2))
    else:
        price = ''.join(
            hxs.select('//meta[@itemprop="price"]/@content').extract()
        ).strip()
        if price == '':
            price = ''.join(
                hxs.select('//*[@id="old-price-{}"]//text()'.format(
                    product_identifier)).extract()).strip()
        yield build_product(product_identifier, product_name,
                            extract_price(price))
def parse_product(self, response):
    """Build one item per variant and hand them to the reviews crawl.

    Variants come from the ``variant_details`` JavaScript array when the
    page has one; otherwise a single pseudo-variant is built from the
    ``item_id`` input and the ``itemprop="price"`` meta tag.  Every item
    gets ``ToyMonitorMeta`` metadata with an empty review list and, when
    present on the page, MPN and EAN.

    If a "See All Reviews" link exists the items are passed to
    ``self.parse_reviews`` through the request meta; otherwise they are
    yielded directly.
    """
    hxs = HtmlXPathSelector(response)

    mpn = hxs.select('//span[@class="b-item"]').re("MPN: ([0-9]+)")
    ean = hxs.select('//span[@class="b-item"]').re("EAN: ([0-9]+)")
    sku = hxs.select('//input[@name="sku"]/@value').extract()
    name = hxs.select('//h1[@class="b-ttl-main"]/text()').extract()[0]
    dealer_name = "".join(
        hxs.select('//h2[@id="auto_shop_info_name"]//text()').extract()
    ).strip()

    brand = hxs.select('.//span[@itemprop="brand"]/text()').extract()
    if brand:
        brand = brand[0].strip()
    else:
        brand = response.meta.get('brand')

    categories = hxs.select(
        '//ul[@class="b-breadcrumb"]/li/a/text()').extract()
    image_url = hxs.select(
        '//img[@itemprop="image"]/@data-frz-src').extract()

    options = hxs.select(
        '//script[contains(text(), "var variant_details")]/text()'
    ).extract()
    if options:
        # Turn HTML-escaped quotes inside values into apostrophes.
        # NOTE(review): this used to replace every '"', which strips the
        # JSON's own quotes so json.loads could never succeed; replacing
        # the &quot; entity is presumably what was meant - confirm.
        options = options[0].replace('&quot;', "'")
        options = re.findall('var variant_details = (.*);\n', options)
        variants = json.loads(options[0])
    else:
        identifier = hxs.select(
            '//input[@name="item_id"]/@value').extract()[0]
        price = hxs.select(
            '//div[@class="b-product-main"]//meta[@itemprop="price"]/@content'
        ).extract()[0]
        # Same shape as an entry of variant_details.
        variants = [{
            'itemVariantId': identifier,
            'sku': sku,
            'variantValues': [],
            'defaultPricing': {'price': price}
        }]

    items = []
    for variant in variants:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', variant['itemVariantId'])
        loader.add_value(
            'name', " ".join([name] + variant.get('variantValues', [])))
        loader.add_value('sku', variant['sku'])
        loader.add_value('url', response.url)
        loader.add_value('price', variant['defaultPricing']['price'])
        loader.add_value('dealer', dealer_name)
        loader.add_value('category', categories)
        if brand:
            loader.add_value('brand', brand)
        if image_url:
            loader.add_value('image_url', image_url[0])
        product = loader.load_item()

        metadata = ToyMonitorMeta()
        metadata['reviews'] = []
        if mpn:
            metadata['mpn'] = mpn[0]
        if ean:
            metadata['ean'] = ean[0]
        product['metadata'] = metadata
        items.append(product)

    reviews_url = response.xpath(
        '//a[contains(text(), "See All Reviews")]/@href').extract()
    if reviews_url:
        yield Request(reviews_url[0],
                      callback=self.parse_reviews,
                      meta={'items': items, 'url': response.url})
    else:
        for item in items:
            yield item