def parse(self, response):
    """Yield a product item for every non-winter tyre listed on the page.

    Each ``div`` whose id contains "Tyre" and whose class contains
    "tyre-list-tyre" is loaded into a ``Product``; the tyre attributes are
    collected in a ``MicheldeverMeta`` stored under ``product['metadata']``.

    A tile is skipped when it carries a "Winter Tyre" image, has no
    identifier input, has no parseable "tyre size" text, or is rejected by
    ``is_product_correct``.

    NOTE(review): the original indentation of this method was lost; the
    body below assumes everything after the winter check was nested under
    ``if not winter_tyre`` - confirm against version control.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    tiles = hxs.select('//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')

    for tile in tiles:
        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_xpath('name', 'div//div[@class="manufacturerText"]/p/strong/text()')
        brand = ''.join(tile.select('div//div[@class="manufacturerImage"]/img/@alt').extract()).split(' - ')[0]

        # Winter tyres are not collected.
        if tile.select('div//img[@alt="Winter Tyre"]'):
            continue

        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))

        # A tile without an identifier used to raise IndexError and abort
        # the whole page; skip just that tile instead.
        identifier = tile.select('div//div[@class="pricingAddToOrder clearfix"]/input/@value').extract()
        if not identifier:
            self.log('Skipping tyre without identifier on %s' % response.url)
            continue

        loader.add_value('url', '')
        image_url = tile.select('div[@class="image"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))
        loader.add_value('identifier', identifier[0])

        # NOTE(review): the "." in this pattern is unescaped, so it matches
        # any character; left untouched to keep the extracted prices stable.
        price = tile.select('div//div[contains(@class, "pricingSelection")]//a/strong/text()').extract()
        price = re.findall(r"\d+.\d+", price[0]) if price else '0.0'
        loader.add_value('price', price)

        # re.search() returns None when the size text has an unexpected
        # format; skip the tile rather than crash on ``.groups()``.
        tyresize_text = tile.select('.//div[contains(@class, "manufacturerText")]/p/span/text()').extract()
        size_match = None
        if tyresize_text:
            size_match = re.search(r'tyre size (\d+)\/(\d+)(\w{1})(\d+)', tyresize_text[0].strip(), re.I)
        if size_match is None:
            self.log('Skipping tyre with unparseable size %r on %s' % (tyresize_text, response.url))
            continue
        width, aspect, speed_rating, rim = size_match.groups()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'

        load_rating = tile.select('div//li/a[@rel="load-index-description"]/text()').extract()
        metadata['load_rating'] = load_rating[0].split(': ')[-1] if load_rating else ''
        metadata['alternative_speed_rating'] = ''

        xl = tile.select('div//img[@title="Reinforced"]/@title').extract()
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = tile.select('div//img[@title="Run Flat"]').extract()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # Reduce the tooltip text to what sits between the fixed prefix and
        # the " cars." suffix before looking up the mark.
        manufacturer_mark = tile.select('div//img[contains(@title, "Homologated for fitment to certai")]/@title').extract()
        manufacturer_mark = manufacturer_mark[0].replace('Homologated for fitment to certain ', '').replace(' cars.', '') if manufacturer_mark else ''
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        item['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            item['metadata']['speed_rating'] if item['metadata']['speed_rating'] != new_speed_rating else ''
        item['metadata']['speed_rating'] = new_speed_rating

        yield item
def parse(self, response):
    """Yield products (or image requests) for the non-winter tyre rows.

    Every ``tr`` whose class contains "tyre-search-row" is processed. Rows
    are skipped when the link text contains "winter", when no identifier
    can be found, when there is no ``<i>`` pattern text, when the pattern
    text does not match the size regex (logged), or when
    ``is_product_correct`` rejects the item (logged with a running count).

    If the product URL is already in ``self.images`` the cached image URL
    is attached and the item is yielded; otherwise a ``Request`` for the
    product page is yielded with ``self.parse_image`` as callback.

    NOTE(review): the original indentation was lost; this assumes the whole
    row handling was nested under ``if not winter_tyre`` and that
    ``stock = 0`` belonged to the no-price branch - confirm.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//tr[contains(@class,"tyre-search-row")]')

    # The original carried a pagination branch guarded by a hard-coded empty
    # ``next_page`` list; it could never run and has been removed.

    not_found_count = 0

    for row in rows:
        url = row.select('.//td/b/a/@href')[0].extract()
        title = row.select('.//td/b/a/text()')[0].extract()
        if 'winter' in title.lower():
            continue

        # The brand is taken from the logo file name.
        brand = row.select('.//a/img/@src')[0].extract()
        brand = re.search(r'/public/brands/(.*?)(-tyres)?\.', brand).group(1).replace('-', ' ').title()
        # Escape the brand so regex metacharacters in it are matched
        # literally instead of being interpreted as a pattern.
        product_name = re.sub(re.escape(brand), '', title).strip()
        fitting_method = 'Delivered'

        identifier = row.select('.//input[@name="item_id"]/@value').extract()
        if not identifier:
            identifier = row.select('.//a/@href').re('email_me_stock/(.*)')
        if not identifier:
            continue

        # Exactly three label values are expected; anything else leaves the
        # label fields empty.
        try:
            fuel, grip, noise = map(
                unicode.strip,
                row.select('.//img[contains(@alt, "Tyre Label")]/following-sibling::text()').extract())
        except (ValueError, TypeError):
            fuel = grip = noise = ''

        price = row.select("td[3]/b/text()").extract()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier[0])
        loader.add_value('name', product_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', url)
        if price:
            loader.add_value('price', price[0])
        else:
            loader.add_value('price', '0.00')
            loader.add_value('stock', 0)

        pattern_name = row.select('.//i/text()').extract()
        if not pattern_name:
            continue
        pattern_name = pattern_name[0]

        data = re.search(
            r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
            pattern_name)
        if not data:
            self.log('ERROR parsing "{}" [{}]'.format(pattern_name, response.url))
            continue
        data = data.groupdict()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating'].upper()
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

        lowered = pattern_name.lower()
        run_flat_found = is_run_flat(pattern_name)
        run_flat = 'run flat' in lowered or 'runflat' in lowered or run_flat_found
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # First known mark that appears as a space-separated token.
        tokens = pattern_name.split(' ')
        manufacturer_mark = [mark for mark in self.all_man_marks.keys() if mark in tokens]
        manufacturer_mark = manufacturer_mark[0].strip() if manufacturer_mark else []
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            not_found_count += 1
            self.log('%s - PRODUCT IS NOT CORRECT: %r' % (not_found_count, item))
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

        if item['url'] in self.images:
            item['image_url'] = self.images[item['url']]
            yield item
        else:
            yield Request(item['url'],
                          callback=self.parse_image,
                          meta={'product': item},
                          dont_filter=True)
def parse_product(self, response):
    """Yield one product per purchasable option on a tyre detail page.

    The brand comes from ``response.meta['brand']`` and is stripped from the
    page heading to form the product name. Each option block yields a
    ``Product`` with a ``MicheldeverMeta`` under ``product['metadata']``.

    An option is skipped when it has no identifier, no pattern text, a
    pattern text that does not match the size regex (logged and appended to
    ``self.errors``), or when ``is_product_correct`` rejects it.

    NOTE(review): the original indentation was lost. ``stock = 0`` is placed
    in the branch where neither the page nor ``response.meta`` provides a
    price; it may instead have applied whenever the page had no price -
    confirm against version control.
    """
    hxs = HtmlXPathSelector(response)

    # The full tyre text (pattern_name) is used to extract metadata such as
    # run flat / XL; the heading without the brand is the product name.
    brand = response.meta.get('brand') or ''
    product_name = hxs.select('//h2[@class="heading black"]/text()')[0].extract().strip()
    # Escape the brand so regex metacharacters in it are matched literally.
    product_name = re.sub(re.escape(brand), '', product_name).strip()
    fitting_method = 'Delivered'

    base_url = get_base_url(response)
    image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
    options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')

    for option in options:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('name', product_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', response.url)
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        identifier = option.select('../input[@type="hidden" and @name="item_id"]/@value').extract()
        if not identifier:
            identifier = option.select('./a/@href').re('email_me_stock/(.*)')
        if not identifier:
            continue
        loader.add_value('identifier', identifier[0])

        price = option.select('./strong[@class="price" and not(contains(text(),"On Backorder"))]/text()').extract()
        if price:
            loader.add_value('price', price[0])
        elif response.meta.get('price'):
            loader.add_value('price', response.meta['price'])
        else:
            loader.add_value('price', '0.00')
            loader.add_value('stock', 0)

        pattern_name = option.select('./p/strong/text()').extract()
        if not pattern_name:
            pattern_name = option.select('./strong/text()').extract()
        if not pattern_name:
            # Previously an IndexError here aborted the remaining options.
            continue
        pattern_name = pattern_name[0]

        data = re.search(
            r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
            pattern_name)
        if not data:
            msg = 'ERROR parsing "{}" [{}]'.format(pattern_name, response.url)
            log.msg(msg)
            self.errors.append(msg)
            continue
        data = data.groupdict()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating'].upper()
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

        lowered = pattern_name.lower()
        run_flat = 'run flat' in lowered or 'runflat' in lowered
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # First known mark that appears as a space-separated token.
        tokens = pattern_name.split(' ')
        manufacturer_mark = [mark for mark in self.all_man_marks.keys() if mark in tokens]
        manufacturer_mark = manufacturer_mark[0].strip() if manufacturer_mark else []
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse(self, response):
    """Yield a product for every non-winter tyre in the grid results.

    Tiles are the ``div`` elements whose class contains
    "tyres_search_results_tyre" and whose ``data-viewtype`` is "grid". Most
    attributes are read from ``data-*`` attributes on the tile itself.

    A tile is skipped when ``data-filter-season`` equals "Winter", when its
    size text cannot be parsed (logged), or when ``is_product_correct``
    rejects the item.

    NOTE(review): the original indentation was lost; this assumes the whole
    tile handling was nested under ``if not winter_tyre`` - confirm.
    """
    tiles = response.xpath(
        '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]')
    base_url = get_base_url(response)

    for tile in tiles:
        if tile.xpath('@data-filter-season').extract()[0] == 'Winter':
            continue

        name = tile.xpath('.//div[contains(@class, "tyre-model text-center")]/text()').extract()[0]
        brand = tile.xpath('@data-filter-brand').extract()[0]

        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_value('name', brand + ' ' + name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        identifier = tile.xpath('@data-tyreid').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)

        image_url = tile.xpath('.//div[contains(@class, "tyre-image")]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        price = tile.xpath(
            './/div[contains(@class, "tyre-pricing-information")]/div/text()').re(r'[\d,.]+')
        loader.add_value('price', price[0] if price else '0.00')

        tyresize_text = tile.xpath(
            './/div[contains(@class, "tyre-size")]/text()').extract()[0].strip()

        # Try the form with a "(load)" suffix first, then the bare size.
        # Explicit match checks replace a bare ``except:`` that also hid
        # unrelated errors, and a second miss no longer aborts the page.
        size_match = re.search(r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)', tyresize_text, re.I)
        if size_match:
            width, aspect, speed_rating, rim, load_rating = size_match.groups()
        else:
            size_match = re.search(r'(\d+)\/(\d+)(\w{1})(\d+)', tyresize_text, re.I)
            if not size_match:
                self.log('Skipping tyre with unparseable size %r on %s' % (tyresize_text, response.url))
                continue
            width, aspect, speed_rating, rim = size_match.groups()
            load_rating = ''

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['alternative_speed_rating'] = ''

        xl = tile.xpath('@data-filter-reinforced').extract()[0] == 'Y'
        metadata['xl'] = 'Yes' if xl else 'No'

        run_flat_found = is_run_flat(loader.get_output_value('name'))
        run_flat = tile.xpath('@data-filter-runflat').extract()[0] == 'Y'
        metadata['run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'

        manufacturer_mark = tile.xpath(
            './/span[contains(@title, "Homologated for fitment to certai")]/@title'
        ).re(r'Homologated for fitment to certain (.*) cars\.')
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark[0]) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        # The original unpacked an XPath union of the three attributes. A
        # union yields nodes in document order (not in the order written)
        # and fails to unpack when an attribute is missing, so each value is
        # read on its own. NOTE(review): the r/g/d -> fuel/grip/noise mapping
        # follows the original unpacking order - confirm against the site.
        label_attributes = (('fuel', '@data-filter-tyreefficiencyr'),
                            ('grip', '@data-filter-tyreefficiencyg'),
                            ('noise', '@data-filter-tyreefficiencyd'))
        for field, attribute in label_attributes:
            value = tile.xpath(attribute).extract()
            metadata[field] = value[0] if value else ''

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        yield product
def parse_products(self, response):
    """Yield products decoded from the JSON search response.

    The response body is JSON whose ``d`` member is itself a JSON-encoded
    list of tyre dicts. Entries flagged ``IsWinter`` and entries rejected by
    ``is_product_correct`` are skipped. Brand names are normalised to the
    spelling in ``self.brands`` and tread names are remapped through
    ``self.new_old_names``.
    """
    json_data = json.loads(response.body)
    products = json.loads(json_data.get('d'))

    for product_el in products:
        attributes = product_el[u'ProductAttributes']

        # Skip winter tyres.
        if attributes[u'IsWinter']:
            continue

        loader = ProductLoader(item=Product(), selector=product_el)

        # A missing key or a null sub-object leaves the value empty; other
        # errors are no longer swallowed by a bare ``except:``.
        try:
            brand = product_el[u'ProductManufacturer'][u'TyreManufacturerName']
        except (KeyError, TypeError):
            brand = ''

        # Use the spelling from self.brands when it matches case-insensitively.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        try:
            full_name = product_el[u'ProductTreadPattern'][u'TreadName']
        except (KeyError, TypeError):
            full_name = ''

        # Fix name changes.
        if full_name in self.new_old_names:
            full_name = self.new_old_names[full_name]

        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        identifier = product_el.get('TyreID')
        loader.add_value('url', 'http://www.tyresonthedrive.com')
        image_url = ('http://www.tyresonthedrive.com/img/treads/' +
                     product_el[u'ProductTreadPattern'][u'TreadPatternImage'] + '.jpg')
        loader.add_value('image_url', image_url)
        loader.add_value('identifier', identifier)

        price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = str(attributes[u'Profile'])
        metadata['rim'] = str(attributes[u'Rim'])
        metadata['speed_rating'] = str(attributes[u'Speed'])
        metadata['load_rating'] = str(attributes[u'Load'])
        metadata['width'] = str(attributes[u'Section'])
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if attributes[u'IsExLoad'] else 'No'
        metadata['run_flat'] = 'Yes' if attributes[u'IsRunFlat'] else 'No'

        man_mark = attributes[u'OEMFitment']
        metadata['manufacturer_mark'] = find_man_mark(man_mark) if man_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse(self, response):
    """Yield the cheapest "Fitted" product per distinct tyre on the page.

    The page embeds its results as a ``JsonObject = {...};`` line; the
    ``Rest`` and ``Deals`` lists of that object are processed together.
    Width, aspect ratio and rim come from the search row in
    ``response.meta['row']``; the speed rating is the last word of the
    entry's ``SpecificationName``.

    Winter tyres, identifiers already in ``self.seen_ids`` and items
    rejected by ``is_product_correct`` are skipped (rejected identifiers are
    also added to ``self.seen_ids``). The surviving items are grouped by
    brand, name, fitting method, full size, XL, run-flat and manufacturer
    mark, and only the lowest-priced item of each group is yielded.
    """
    row = response.meta['row']

    # The last line containing the marker wins, as in the original loop.
    json_data = None
    for line in response.body.split('\n'):
        if "JsonObject = " in line:
            json_data = json.loads(
                line.replace('JsonObject = ', '').replace('; \r', ''))

    # Without the marker line the original failed with a TypeError when
    # subscripting None; report the page and stop instead.
    if json_data is None:
        self.log('No JsonObject found in response: {}'.format(response.url))
        return

    products = json_data['Rest'] + json_data['Deals']
    collected_products = []

    self.log('Results found {} {}'.format(len(products), response.meta))

    fitting_method = 'Fitted'

    for product_info in products:
        # Skip winter tyres.
        if product_info['WinterTyre']:
            continue

        loader = ProductLoader(item=Product(), selector=product_info)
        loader.add_value('name', product_info['ModelName'])
        brand = product_info['Manufacturer']
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))

        identifier = product_info['PrimaryId']
        product_id = str(identifier) + '-' + fitting_method
        if product_id in self.seen_ids:
            continue

        url = '/catalogue' + product_info['CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
        loader.add_value('url', response.urljoin(url))

        # The image fields hold markup; pull the URL out of the src attribute.
        image_url = product_info.get('ModelImageLarge')
        if not image_url:
            image_url = product_info.get('ModelImage')
        if image_url:
            image_url = image_url.split('src="')[-1].split('"')[0]
            loader.add_value('image_url', response.urljoin(image_url))

        spec = product_info['SpecificationName']

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = spec.split()[-1]
        metadata['width'] = row['Width']
        metadata['load_rating'] = product_info['LoadRatingName']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if product_info['Reinforced'] else 'No'

        run_flat_found = is_run_flat(product_info['ModelName'])
        run_flat = product_info['RunFlat']
        metadata['run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'

        # Only the first word of the variant is used for the mark lookup.
        manufacturer_mark = product_info['Variant']
        if manufacturer_mark:
            manufacturer_mark = manufacturer_mark.split()[0].strip()

        full_tyre_size = '/'.join((row['Width'],
                                   row['Aspect Ratio'],
                                   row['Rim'],
                                   metadata['load_rating'],
                                   metadata['speed_rating']))

        # MOE exception for this product.
        if manufacturer_mark and 'MO EXTENDED' in product_info['Variant'].upper() \
                and product_info['ModelName'] == 'Potenza S001' and full_tyre_size == '245/40/18/97/Y':
            metadata['manufacturer_mark'] = 'MOE'
        else:
            metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = full_tyre_size

        # Label data is optional; any lookup failure leaves the field empty.
        try:
            metadata['fuel'] = product_info['TyreLabelFuel']['Score']
        except Exception:
            metadata['fuel'] = ''
        try:
            metadata['grip'] = product_info['TyreLabelWet']['Score']
        except Exception:
            metadata['grip'] = ''
        try:
            metadata['noise'] = product_info['TyreLabelNoise']['NoiseLevel']
        except Exception:
            metadata['noise'] = ''

        product = loader.load_item()
        product['metadata'] = metadata
        product['price'] = product_info['FullyFittedPrice']
        product['identifier'] = product_id
        product['metadata']['fitting_method'] = fitting_method

        t1 = time.time()
        if not is_product_correct(product):
            self.log('Search: {}'.format(str(response.meta)))
            self.seen_ids.add(product_id)
            self.log('PRODUCT IS NOT CORRECT => %r' % product)
            continue
        t2 = time.time()
        self.log('Time taken by product correct: {}'.format(t2 - t1))

        t1 = time.time()
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        t2 = time.time()
        self.log('Time taken by mts stock: {}'.format(t2 - t1))

        collected_products.append(product)

    # Keep only the cheapest item among those describing the same tyre.
    min_price_products = {}
    for product in collected_products:
        key = "%s-%s-%s-%s-%s-%s-%s" % (
            product['brand'],
            product['name'],
            product['metadata']['fitting_method'],
            product['metadata']['full_tyre_size'],
            product['metadata']['xl'],
            product['metadata']['run_flat'],
            product['metadata']['manufacturer_mark'])
        if key not in min_price_products or product['price'] < min_price_products[key]['price']:
            min_price_products[key] = product

    for product in min_price_products.values():
        self.seen_ids.add(product['identifier'])
        yield product
def parse(self, response):
    """Yield pagination requests, products and IP-code lookups for a page.

    A ``Request`` is yielded for every link found in the paragraph that
    contains "Page". Each ``div.results`` block is then turned into a
    ``Product`` with a ``MicheldeverMeta`` under ``product['metadata']``.

    A result is skipped when it shows a winter-tyre image, when
    ``parse_pattern`` returns nothing for its text, or when
    ``is_product_correct`` rejects it. If the identifier is present in
    ``self.ip_codes`` the code is stored as ``sku`` and the item is yielded;
    otherwise the product page is requested with ``self.parse_ipcode`` as
    callback.
    """
    products = response.xpath('//div[@class="results"]')

    pages = response.xpath('//p[contains(text(),"Page")]//a/@href').extract()
    for page in pages:
        yield Request(response.urljoin(page), meta=response.meta)

    for result in products:
        # Skip winter tyres (either alt text or the "hiver" icon).
        if result.select('.//img[contains(@alt,"Winter / cold weather tyres")]') or \
                result.select('.//img[contains(@alt,"Wi") or contains(@src,"/simg/hiver.png")]'):
            continue

        loader = ProductLoader(item=Product(), selector=result)

        # The full text of the tyre (``name``) is used to extract metadata
        # (run flat, XL, size); the pattern alone is the product's name.
        name = ' '.join(map(
            unicode.strip,
            result.select('.//div[@class="resultsLeft"]/div//text()[normalize-space()]').extract()))
        # The original used ``name += name + ...``, which doubled the
        # description before appending the size text.
        # NOTE(review): confirm ``parse_pattern`` did not rely on that.
        name += ' %s' % ' '.join(map(
            unicode.strip,
            result.select('.//div[@class="t_size"]//text()[normalize-space()]').extract()))

        loader.add_xpath('name', './/div[@class="resultsLeft"]/div//a/i/b/text()[normalize-space()]')
        brand = result.select(
            './/div[@class="resultsLeft"]/div/b//text()[normalize-space()]').extract()[0].strip()

        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        fitting_method = 'Fitted'

        # Drop the cart_id parameter from the product URL.
        url = result.select('.//a[i[b]]/@href')[0].extract()
        url = response.urljoin(url)
        url = re.sub('cart_id=[^&]*', '', url)
        loader.add_value('url', url)

        image_url = result.select('.//a/img[@align="left"]/@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))

        # The "typ" query parameter is the product identifier.
        identifier = urlparse.parse_qs(urlparse.urlparse(url).query)['typ'][0]
        loader.add_value('identifier', identifier)

        # NOTE(review): the "." in this pattern is unescaped, so it matches
        # any character; left untouched to keep the extracted prices stable.
        price = ''.join(result.select(
            './/div[@class="price"]/font/b//text()[normalize-space()]').extract())
        price = re.findall(r"\d+.\d+", price) if price else '0.0'
        loader.add_value('price', price)

        data = parse_pattern(name)
        if not data:
            # Unparseable descriptions are skipped without logging.
            continue

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in name else 'No'

        lowered = name.lower()
        run_flat_found = is_run_flat(name)
        run_flat = 'run flat' in lowered or 'runflat' in lowered or run_flat_found
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        manufacturer_mark = result.select(
            './/div[@class="t_size"]/b/a[contains(@onmouseover,"Original") or '
            'contains(@onmouseover,"BMW") or contains(@onmouseover,"Porsche")]'
            '/@name[normalize-space()]').extract()
        manufacturer_mark = manufacturer_mark[0].strip() if manufacturer_mark else []
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        # Exactly three label values are expected; anything else leaves the
        # label fields empty.
        try:
            fuel, grip, noise = map(
                unicode.strip,
                result.select('.//div[@class="tyre_label_short"]//text()').extract())
        except (ValueError, TypeError):
            metadata['fuel'] = ''
            metadata['grip'] = ''
            metadata['noise'] = ''
        else:
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise.replace('dB', '').strip()

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        if product['identifier'] in self.ip_codes:
            ip_code = self.ip_codes[product['identifier']]
            product['sku'] = ip_code
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log, ip_code=ip_code)
            yield product
        else:
            # The IP code is not available on the listing, so it has to be
            # extracted from the product page.
            yield Request(product['url'],
                          meta={'product': product},
                          callback=self.parse_ipcode)
def parse(self, response):
    """Yield the cheapest "Fitted" product per distinct tyre on the page.

    The page embeds its results as a ``JsonObject = {...};`` line; the
    ``Rest`` and ``Deals`` lists of that object are processed together.
    Width, aspect ratio and rim come from the search row in
    ``response.meta['row']``.

    Each item is first built and validated as a "Delivered" product (that
    variant is not yielded), then its price, identifier and fitting method
    are switched to the "Fitted" values. Items are grouped by brand, name,
    fitting method, full size, XL, run-flat and manufacturer mark, and only
    the lowest-priced item of each group is yielded.

    Failures to build a selector or to find the JSON line are logged and
    appended to ``self.errors``.
    """
    try:
        hxs = HtmlXPathSelector(response)
    except AttributeError:
        msg = 'Error getting selector on page for row: %s' % response.meta['row']
        self.log('[ERROR] %s' % msg)
        self.errors.append(msg)
        return

    row = response.meta['row']
    base_url = get_base_url(response)

    # The last line containing the marker wins, as in the original loop.
    json_data = None
    for line in hxs.extract().split('\n'):
        if "JsonObject = " in line:
            json_data = json.loads(
                line.replace('JsonObject = ', '').replace('; \r', ''))

    # Without the marker line the original failed with a TypeError when
    # subscripting None; report the page and stop instead.
    if json_data is None:
        msg = 'No JsonObject found on page for row: %s' % row
        self.log('[ERROR] %s' % msg)
        self.errors.append(msg)
        return

    products = json_data['Rest'] + json_data['Deals']
    collected_products = []

    for product_info in products:
        # Skip winter tyres.
        if product_info['WinterTyre']:
            continue

        loader = ProductLoader(item=Product(), selector=product_info)
        loader.add_value('name', product_info['ModelName'])
        brand = product_info['Manufacturer']
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))

        identifier = product_info['PrimaryId']
        fitting_method = 'Delivered'

        url = '/catalogue' + product_info['CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
        loader.add_value('url', urljoin(base_url, url))

        # The image fields hold markup; pull the URL out of the src attribute.
        image_url = product_info.get('ModelImageLarge')
        if not image_url:
            image_url = product_info.get('ModelImage')
        if image_url:
            image_url = image_url.split('src="')[-1].split('"')[0]
            loader.add_value('image_url', urljoin(base_url, image_url))

        loader.add_value('identifier', str(identifier) + '-' + fitting_method)
        loader.add_value('price', product_info['SellingPrice'])

        spec = product_info['SpecificationName']

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = spec.split()[-1]
        metadata['width'] = row['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = product_info['LoadRatingName']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if product_info['Reinforced'] else 'No'
        metadata['run_flat'] = 'Yes' if product_info['RunFlat'] else 'No'

        # Only the first word of the variant is used for the mark lookup.
        manufacturer_mark = product_info['Variant']
        if manufacturer_mark:
            manufacturer_mark = manufacturer_mark.split()[0].strip()
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((row['Width'],
                                               row['Aspect Ratio'],
                                               row['Rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating

        # "Delivered" tyres are not collected; convert the item to its
        # "Fitted" variant before storing it.
        fitting_method = 'Fitted'
        product['price'] = product_info['FullyFittedPrice']
        product['identifier'] = str(identifier) + '-' + fitting_method
        product['metadata']['fitting_method'] = fitting_method

        collected_products.append(product)

    # Keep only the cheapest item among those describing the same tyre.
    min_price_products = {}
    for product in collected_products:
        key = "%s-%s-%s-%s-%s-%s-%s" % (
            product['brand'],
            product['name'],
            product['metadata']['fitting_method'],
            product['metadata']['full_tyre_size'],
            product['metadata']['xl'],
            product['metadata']['run_flat'],
            product['metadata']['manufacturer_mark'])
        if key not in min_price_products or product['price'] < min_price_products[key]['price']:
            min_price_products[key] = product

    for product in min_price_products.values():
        yield product