def parse_product(self, response):
    """Yield one product per purchasable option listed on a tyre page.

    The brand and an optional fallback price are read from
    ``response.meta`` (keys ``'brand'`` and ``'price'``).  Options without
    an identifier, or whose size text cannot be parsed, are skipped; parse
    failures are logged and recorded in ``self.errors``.
    """
    hxs = HtmlXPathSelector(response)

    brand = response.meta.get('brand') or ''
    heading = hxs.select('//h2[@class="heading black"]/text()').extract()
    if not heading:
        # Previously an IndexError; report it like the other parse errors.
        msg = 'No product name found on page: %s' % response.url
        log.msg(msg)
        self.errors.append(msg)
        return

    # The brand is literal text, not a regular expression: escape it so that
    # characters such as '.', '+' or '(' cannot change what gets removed.
    product_name = re.sub(re.escape(brand), '', heading[0].strip()).strip()

    fitting_method = 'Delivered'
    base_url = get_base_url(response)
    image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
    size_re = re.compile(
        r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) '
        r'(?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)')

    options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
    for option in options:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('name', product_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', response.url)
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        identifier = option.select(
            '../input[@type="hidden" and @name="item_id"]/@value').extract()
        if not identifier:
            identifier = option.select('./a/@href').re('email_me_stock/(.*)')
        if not identifier:
            continue
        loader.add_value('identifier', identifier[0])

        price = option.select(
            './strong[@class="price" and not(contains(text(),"On Backorder"))]/text()'
        ).extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # No usable price on the page: fall back to the price passed in
            # meta (or 0.00) and mark the option as out of stock.
            # NOTE(review): the stock flag is applied to the whole "no price"
            # branch - confirm against the original layout.
            if response.meta.get('price'):
                loader.add_value('price', response.meta['price'])
            else:
                loader.add_value('price', '0.00')
            loader.add_value('stock', 0)

        # The full option text (pattern_name) carries the size and the
        # run flat / XL / manufacturer mark flags.
        pattern_name = option.select('./p/strong/text()').extract()
        if not pattern_name:
            pattern_name = option.select('./strong/text()').extract()
        if not pattern_name:
            # Previously an IndexError that aborted the remaining options.
            msg = 'ERROR parsing "{}" [{}]'.format('', response.url)
            log.msg(msg)
            self.errors.append(msg)
            continue
        pattern_name = pattern_name[0]

        match = size_re.search(pattern_name)
        if not match:
            msg = 'ERROR parsing "{}" [{}]'.format(pattern_name, response.url)
            log.msg(msg)
            self.errors.append(msg)
            continue
        data = match.groupdict()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating'].upper()
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'
        lowered = pattern_name.lower()
        run_flat = 'run flat' in lowered or 'runflat' in lowered
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        words = pattern_name.split(' ')
        marks = [mark for mark in self.all_man_marks.keys() if mark in words]
        manufacturer_mark = marks[0].strip() if marks else ''
        metadata['manufacturer_mark'] = (
            find_man_mark(manufacturer_mark) if manufacturer_mark else '')

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        # Keep the scraped rating as the alternative one when the helper
        # replaced it and no explicit alternative was returned.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def extract_products(self, hxs, url):
    """Yield a product for every non-winter tyre container found in ``hxs``.

    ``url`` is only used in error messages.  Containers whose details text
    is empty or cannot be parsed are logged, recorded in ``self.errors`` and
    skipped; containers without a price are skipped silently.
    """
    containers = hxs.select(
        '//div[starts-with(@class,"tyre_container round")]')
    for container in containers:
        details = container.select(
            './/p[@class="tyre_details"]//text()').extract()
        tyre_options = fix_spaces("".join(details)).strip()
        if not tyre_options:
            msg = 'Could not extract tyre options from element from %s' % url
            self.log('ERROR: %s' % msg)
            self.errors.append(msg)
            continue

        parsed = parse_pattern(tyre_options)
        if not parsed:
            msg = "ERROR parsing: %s on %s" % (tyre_options, url)
            self.log(msg)
            self.errors.append(msg)
            continue
        width, ratio, rim, load_rating, speed_rating, name = parsed

        # skip winter tyres
        if container.select(".//div[@class='tyre_winter']"):
            continue

        name = name.strip()
        identifier = container.select("./@id").extract()[0]

        price_parts = container.select(
            ".//p[@class='tyre_price']//text()").extract()
        price = "".join(price_parts).strip()
        if not price:
            continue

        brand = container.select(
            ".//span[@class='tyre_brand_text']/text()").extract()[0]
        image_src = container.select("img/@src").extract()[0]
        image_url = urljoin_rfc('http://asdatyres.co.uk', image_src)

        # The flags are signalled by the presence of marker <div>s.
        has_run_flat = bool(
            container.select(".//div[@class='tyre_rf']").extract())
        has_xl = bool(
            container.select(".//div[@class='tyre_xl']").extract())
        run_flat = 'Yes' if has_run_flat else 'No'
        xl = 'Yes' if has_xl else 'No'
        if has_xl:
            name = name.replace("XL", "").strip()

        loader = ProductLoader(Product(), selector=hxs)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_value('url', 'http://www.asdatyres.co.uk/')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        # First manufacturer mark whose code occurs in the name, else ''.
        man_code = next(
            (mark for code, mark in self.all_man_marks.iteritems()
             if code in name),
            '')

        metadata = MicheldeverMeta()
        metadata['width'] = width
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['load_rating'] = load_rating
        metadata['speed_rating'] = speed_rating
        metadata['fitting_method'] = 'Fitted'
        metadata['run_flat'] = run_flat
        metadata['xl'] = xl
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (width, ratio, rim, load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_search(self, response):
    """Follow pagination links and yield products from a search result table.

    Every pagination link is re-requested with this method as callback.
    Winter tyres and rows whose size pattern cannot be parsed are skipped.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # pagination
    page_links = hxs.select(
        '//div[@class="pagination pagination-centered"]//a/@href').extract()
    for page_link in page_links:
        yield Request(urljoin(base_url, page_link),
                      callback=self.parse_search)

    # parse products list
    for row_sel in hxs.select('//*[@id="searchRes"]/tbody//tr'):
        season = row_sel.select('.//td[4]/i/@class').extract()
        # skip winter tyres
        if season and 'ico-type ico-W' in season[0]:
            continue

        loader = ProductLoader(item=Product(), selector=row_sel)
        # The link holds exactly two bold texts: the brand and the name.
        brand, name = row_sel.select('./td[2]/a/b/text()').extract()
        loader.add_value('name', name)

        pattern = row_sel.select('./td[2]/a/small/text()').extract()[0]
        data = extract_data(pattern)
        if not data:
            self.log("ERROR. Unable to parse pattern: %s" % pattern)
            continue
        width, aspect_ratio, rim, load_rating, speed_rating = data

        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        # The price is split between the span text and a <sup> with decimals.
        price_main = row_sel.select(
            './/span[@class="pr"]/text()').extract()[0]
        price_sup = row_sel.select(
            './/span[@class="pr"]/sup/text()').extract()[0]
        price_decimals = price_sup.replace(u'\xa3', '')
        loader.add_value('price', extract_price(price_main + price_decimals))

        loader.add_value('identifier',
                         row_sel.select('@data-id').extract()[0])

        href = row_sel.select('./td[2]/a/@href').extract()[0]
        loader.add_value('url', urljoin(base_url, href))

        image_url = row_sel.select('./td[1]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        specif = [
            text.lower() for text in
            row_sel.select('.//span[@class="specif"]/text()').extract()
        ]

        # Standard marks take precedence; custom marks are the fallback.
        man_code = next(
            (mark for code, mark in self.all_man_marks.iteritems()
             if code.lower() in specif),
            '')
        if man_code == '':
            man_code = next(
                (mark for code, mark in self.custom_man_marks.iteritems()
                 if code.lower() in specif),
                '')

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = load_rating
        metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
        metadata['run_flat'] = 'Yes' if 'runflat' in specif else 'No'
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse(self, response):
    """Yield products from the result blocks of a size search page.

    The searched size comes from ``response.meta['row']``.  Winter tyres and
    results without a price are skipped.  The first result without an
    identifier is logged together with the search parameters and stops the
    processing of the page.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']

    results = hxs.select('//div[@id="formcontent"]/div[@class="result"]')
    for product_el in results:
        loader = ProductLoader(item=Product(), selector=product_el)

        brand_texts = product_el.select(
            'p/span[@class="brand_text"]/text()').extract()
        brand = brand_texts[0] if brand_texts else ''

        # skip winter tyres
        if product_el.select('div/img[@title="Winter Tyre"]').extract():
            continue

        # Use the spelling from self.brands when it matches ignoring case.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        name_texts = product_el.select('p[@class="the_tyre"]/text()').extract()
        full_name = ''.join(name_texts).strip()
        # The first two whitespace-separated words are left out of the name.
        loader.add_value('name', ' '.join(full_name.split()[2:]))
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))

        select_names = product_el.select('p/span/select/@name').extract()
        if not select_names:
            log.msg('Product without identifier')
            search_params = '/'.join([row['Aspect Ratio'], row['Rim'],
                                      row['Width'], row['Alt Speed']])
            log.msg('Search parameters: ' + search_params)
            return
        identifier = select_names[0].replace('number[', '').replace(']', '')

        loader.add_value('url', 'http://www.tyresavings.com')
        loader.add_xpath('image_url', 'img[@class="tyre_image"]/@src')
        loader.add_value('identifier', identifier)

        price_texts = product_el.select('div[@class="price"]/text()').extract()
        price = ''.join(price_texts).strip()
        if not price:
            continue
        loader.add_value('price', price)

        # A whitespace-delimited token such as "91V": digits are the load
        # rating, the final character is the speed rating.
        speed = re.search('(\s\d+\w+\s)', full_name)
        load_and_speed = speed.group().strip() if speed else ''
        speed_rating = load_and_speed[-1:]
        load_rating = load_and_speed[:-1]

        reinforced = product_el.select(
            'div/img[@title="Reinforced Tyre"]').extract()
        run_flat = product_el.select(
            'div/img[@title="Run Flat Tyre"]').extract()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if reinforced else 'No'
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def extract_products(self, response):
    """Yield a product for every non-winter tyre box on a listing page.

    Patterns that cannot be parsed are logged and recorded in
    ``self.errors`` unless they contain one of a few known-unparseable
    names, in which case they are skipped silently.  Every manufacturer
    logo title seen is added to ``self.man_marks``.
    """
    hxs = HtmlXPathSelector(response)
    boxes = hxs.select(
        '//div[@class="listcontPART"]//div[@class="conprcbx"]')
    for box in boxes:
        brand = box.select(
            './div[@class="dec_tyrebnt"]/p/b/text()').extract().pop().strip()
        pattern_parts = box.select(
            './div[@class="dec_tyrebnt"]/p/text()').extract()
        pattern = "".join(pattern_parts).strip()

        # skip winter tyres
        if 'winter' in pattern.lower():
            continue

        xl, pattern = extract_reinforced(pattern)
        run_flat, pattern = extract_run_flat(pattern)

        parsed = parse_pattern(pattern)
        if not parsed:
            excludes = [
                'sport contact', 'advantage sport', 'expedia s02',
                'zero rosso'
            ]
            lowered = pattern.lower()
            if not any(excluded in lowered for excluded in excludes):
                msg = 'Could not parse pattern: %s' % fix_spaces(
                    pattern).encode('utf-8')
                self.log('[CARTYRES] %s' % msg)
                self.errors.append(msg)
            continue
        width, ratio, rim, load_rating, speed_rating, name = parsed

        identifier = box.select(".//p/@onclick").re(
            "AddCarToShortList\('([^']*)',")
        url = self.start_urls[0]

        raw_price = box.select(
            './/div[@class="dec_fittdbnt"]/h1/text()').extract().pop()
        price = fix_spaces(raw_price)

        image_url = box.select(
            '../..//div[@class="uptyre_prt"]/img/@src').extract()[0]

        logo_titles = box.select(
            './/div[@class="bndLGO1"]/img/@title').extract()
        if logo_titles:
            man_mark = logo_titles[0]
            if man_mark not in self.man_marks:
                self.man_marks.add(man_mark)
        else:
            man_mark = ''

        loader = ProductLoader(Product(), selector=hxs)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier.pop())
        loader.add_value('price', price)
        loader.add_value('url', url)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        # Only logo titles known to the mapping produce a manufacturer code.
        known_mark = man_mark and man_mark in man_mark_mapping
        man_code = man_mark_mapping[man_mark] if known_mark else ''

        metadata = MicheldeverMeta()
        metadata['width'] = width
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['load_rating'] = load_rating
        metadata['speed_rating'] = speed_rating
        metadata['fitting_method'] = 'Fitted'
        metadata['run_flat'] = run_flat
        metadata['xl'] = xl
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (width, ratio, rim, load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_product(self, response):
    """Yield the single product described by a tyre detail page.

    Pages without a tread name, or whose size pattern cannot be parsed, are
    logged, recorded in ``self.errors`` and produce nothing.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    table = '//table[@class="single searchresults"]'

    # The tread text is the product's name; the full tyre pattern (read
    # further down) is what the metadata (i.e. run flat, xl) is taken from.
    tread = hxs.select('//td[@class="tread"]/text()').extract()
    if not tread:
        msg = "No name found on page: %s" % response.url
        self.errors.append(msg)
        self.log("[ERROR] %s" % msg)
        return
    loader.add_value('name', tread[0])

    brand = hxs.select(
        table + '//td[@class="tyreinfo"]/b/text()').extract()[0].strip()
    loader.add_value('brand', unify_brand(brand))
    # Derive the category from the unified brand stored in the loader, not
    # from the raw scraped spelling.
    loader.add_value(
        'category', find_brand_segment(loader.get_output_value('brand')))

    fitting_method = 'Delivered'
    loader.add_value('url', response.url)

    if hxs.select(table + '//span[@class="outofstock"]'):
        loader.add_value('stock', 0)

    image_url = hxs.select(
        table + '//td[@class="logo-pic"]/img/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin(get_base_url(response), image_url[0]))

    identifier = hxs.select(
        table + '//form/input[@name="pid"]/@value')[0].extract()
    loader.add_value('identifier', identifier)

    price = hxs.select(
        table + '//td[@class="netprice"]/text()')[0].extract()
    loader.add_value('price', price)

    pattern = hxs.select(
        table + '//td[@class="tyreinfo"]/span/text()')[0].extract()
    data = parse_pattern(pattern)
    if not data:
        msg = 'ERROR parsing "{}" [{}]'.format(pattern, response.url)
        log.msg(msg)
        self.errors.append(msg)
        return

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = fitting_method
    # Fall back to '' so the '/'.join below cannot fail on a missing value.
    metadata['load_rating'] = data['Load_Rating'] or ''
    metadata['alternative_speed_rating'] = ''
    metadata['xl'] = 'Yes' if 'XL' in pattern else 'No'
    metadata['run_flat'] = 'Yes' if 'rflat' in pattern.lower() else 'No'

    if '*' in pattern:
        manufacturer_mark = '*'
    else:
        words = pattern.split(' ')
        marks = [mark for mark in self.all_man_marks.keys() if mark in words]
        manufacturer_mark = marks[0].strip() if marks else ''
    metadata['manufacturer_mark'] = (
        self.all_man_marks.get(manufacturer_mark, '')
        if manufacturer_mark else '')

    metadata['mts_stock_code'] = ''
    metadata['full_tyre_size'] = '/'.join(
        (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
         metadata['load_rating'], metadata['speed_rating']))

    product = loader.load_item()
    product['metadata'] = metadata

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    scraped_speed_rating = product['metadata']['speed_rating']
    # Keep the scraped rating as the alternative one when the helper
    # replaced it and no explicit alternative was returned.
    if new_alt_speed:
        alt_speed = new_alt_speed
    elif scraped_speed_rating != new_speed_rating:
        alt_speed = scraped_speed_rating
    else:
        alt_speed = ''
    product['metadata']['alternative_speed_rating'] = alt_speed
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def parse(self, response):
    """Yield products from a JSON list of tyres returned for one size search.

    The searched size comes from ``response.meta['row']``; everything else
    is read from each JSON entry.  Winter tyres are skipped.
    """
    base_url = get_base_url(response)
    row = response.meta['row']
    products = json.loads(response.body_as_unicode())

    for product_el in products:
        # skip winter tyres
        if product_el['winter'] != '0':
            continue

        loader = ProductLoader(item=Product(), selector=product_el)

        brand = product_el['tyreMake'].title()
        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        load_rating = product_el['loadrating']
        speed_rating = product_el['tyreSpeed']

        loader.add_value('price', product_el['priceVat'])
        loader.add_value('identifier', product_el['id'])
        loader.add_value(
            'url',
            urljoin('http://www.etyres.co.uk/tyre-detail/',
                    product_el['URLString']))

        # Build the image URL from this entry only, so an entry without an
        # image can never reuse the previous entry's URL or hit an unbound
        # local on the first iteration.
        image_name = product_el['tyreModelImage2']
        if image_name:
            loader.add_value('image_url',
                             urljoin(base_url, 'images/' + image_name))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['xl'] = (
            'Yes' if product_el['tyreReinforced'] == 'T' else 'No')
        metadata['run_flat'] = (
            'Yes' if product_el['runflat'] == '1' else 'No')

        name = product_el['tyreModel']
        man_code = ''
        # cut_name returns a flag plus the (possibly shortened) name.
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                man_code = man_mark
                break
        if not man_code:
            for code, man_mark in self.custom_man_marks.iteritems():
                if name.endswith(code):
                    # Remove the trailing code only; partition() would cut
                    # at an earlier occurrence of the same text.
                    name = name.rpartition(code)[0]
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code

        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
             speed_rating))

        name = name.replace(' EXTRA LOAD', '')
        name = name.replace(' RUNFLAT', '')
        loader.add_value('name', name.strip())

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        # Keep the scraped rating as the alternative one when the helper
        # replaced it and no explicit alternative was returned.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_product(self, response):
    """Yield the single product described by a tyre detail page.

    Winter tyres produce nothing.  A title or size that cannot be parsed is
    logged, recorded in ``self.errors`` and produces nothing.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    title = hxs.select('//h1/text()')[0].extract()
    if 'winter' in title.lower():
        return

    # Keep everything before the last dash-separated part of the title.
    title_match = re.search('(.*)-[^-]+', title)
    if not title_match:
        # Previously an AttributeError on titles without a dash.
        msg = "Error parsing title '%s' on page %s" % (title, response.url)
        self.log(msg)
        self.errors.append(msg)
        return
    title = title_match.group(1)

    brand = title.split(' ')[0]
    price = hxs.select('//td[@class="price"]/text()')[0].extract()

    # fix wrong product: this title starts with the model instead of a brand
    if brand.strip() == 'R27':
        loader.add_value('name', title.replace('XL', '').replace('RF', ''))
        brand = 'Toyo'
    else:
        loader.add_value(
            'name',
            title.replace(brand, '').replace('XL', '').replace('RF', ''))

    loader.add_value('brand', unify_brand(brand))
    loader.add_value(
        'category', find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('identifier', '//input[@id="product_reference"]/@value')

    image_url = hxs.select('//img[@class="productImg"]/@src')[0].extract()
    loader.add_value('image_url', urljoin(get_base_url(response), image_url))

    speed_rating = hxs.select(
        "//tr[td/strong[text()='Speed:']]/td[2]/text()").extract()[0]
    # The load cell also contains the speed symbol; strip it.
    load_rating = hxs.select(
        "//tr[td/strong[text()='Load:']]/td[2]/text()"
    ).extract()[0].replace(speed_rating, "")
    size = hxs.select(
        "//tr[td/strong[text()='Size:']]/td[2]/text()").extract()[0]

    width, aspect_ratio, _, rim = parse_tyre_size(size)
    if not width:
        msg = "Error parsing '%s' on page %s" % (size, response.url)
        self.log(msg)
        self.errors.append(msg)
        return

    # NOTE(review): these are plain substring tests, so a title containing
    # e.g. "PERFORMANCE" also counts as 'RF' - confirm that is acceptable.
    upper_title = title.upper()

    m = MicheldeverMeta()
    m['aspect_ratio'] = aspect_ratio
    m['rim'] = rim
    m['width'] = width
    m['speed_rating'] = speed_rating.upper()
    m['load_rating'] = load_rating
    m['run_flat'] = 'Yes' if 'RF' in upper_title else 'No'
    m['xl'] = 'Yes' if 'XL' in upper_title else 'No'
    m['full_tyre_size'] = '/'.join(
        (m['width'], m['aspect_ratio'], m['rim'], m['load_rating'],
         m['speed_rating']))
    m['fitting_method'] = 'Fitted'
    m['manufacturer_mark'] = self._get_manufacturer_code(title)

    product = loader.load_item()
    product['metadata'] = m

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    scraped_speed_rating = product['metadata']['speed_rating']
    # Keep the scraped rating as the alternative one when the helper
    # replaced it and no explicit alternative was returned.
    if new_alt_speed:
        alt_speed = new_alt_speed
    elif scraped_speed_rating != new_speed_rating:
        alt_speed = scraped_speed_rating
    else:
        alt_speed = ''
    product['metadata']['alternative_speed_rating'] = alt_speed
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def parse(self, response):
    """Request the next result page and yield products from the current one.

    Pagination re-submits the form data kept in ``response.meta`` with the
    ``page`` field replaced.  Winter tyres are skipped; size strings that
    cannot be parsed are logged and recorded in ``self.errors``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products = hxs.select(
        '//ul[@class="c-list-classic c-list-classic-liste m-produit-res"]/li')
    next_page = hxs.select(
        '//li[a[span[text()="Next"]]]/@data-page').extract()

    # pagination
    if next_page:
        # The dict stored in meta is updated in place and travels with the
        # new request.
        formdata = response.meta.get('formdata')
        formdata['page'] = next_page[0]
        yield FormRequest(response.url,
                          formdata=formdata,
                          dont_filter=True,
                          meta=response.meta)

    for product_el in products:
        url = product_el.select(
            './/a[@class="u-semi-link"]/@href')[0].extract()
        winter_tyre = product_el.select(
            './/div[@class="m-produit-bloc-res-lst__gamme-saison"]/text()'
        ).re('Winter')
        if winter_tyre:
            continue

        loader = ProductLoader(item=Product(), selector=product_el)
        # The short description is the product's name; the size text and the
        # characteristics list (read below) provide the metadata.
        loader.add_xpath(
            'name', './/span[@class="m-produit-bloc-res-lst__dcp"]/text()')

        brand = product_el.select(
            './/span[@class="m-produit-bloc-res-lst__fab"]/text()').extract()
        if brand:
            # Only a brand that was actually found is unified and stored;
            # an empty selector result is never handed to unify_brand.
            # NOTE(review): confirm this matches the intended handling of
            # products without a brand.
            loader.add_value('brand', unify_brand(brand[0].strip()))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        fitting_method = 'Delivered'
        loader.add_value('url', urljoin(base_url, url))

        image_url = product_el.select(
            './/div[@class="m-produit-bloc-res-lst__image"]//img/@src'
        ).extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        identifier = product_el.select('.//button/@data-id')[0].extract()
        loader.add_value('identifier', identifier)

        price = product_el.select(
            './/div[@class="c-qte-prix__prix m-produit-bloc-res-lst__prix"]/text()'
        )[0].extract()
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        size_text = product_el.select(
            './/div[@class="m-produit-bloc-res-lst__dim"]/text()'
        )[0].extract().strip().replace(u'\xa0', u' ')
        data = parse_pattern(size_text)
        if not data:
            msg = 'ERROR parsing "{}" [{}]'.format(size_text, response.url)
            log.msg(msg)
            self.errors.append(msg)
            continue

        additional_data = ' '.join(
            product_el.select(
                './/ul[@class="m-produit__carac c-list-horizontale"]/li/text()'
            ).extract())

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in additional_data else 'No'
        run_flat = 'runflat' in additional_data.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # Marks are literal text: escape every regex metacharacter, not
        # just '*', before searching for the optionally parenthesised mark.
        marks = [
            mark for mark in self.all_man_marks.keys()
            if re.search(r'\(?{}\)?'.format(re.escape(mark)),
                         additional_data)
        ]
        manufacturer_mark = marks[0].strip() if marks else ''
        metadata['manufacturer_mark'] = (
            self.all_man_marks.get(manufacturer_mark, '')
            if manufacturer_mark else '')

        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        # Keep the scraped rating as the alternative one when the helper
        # replaced it and no explicit alternative was returned.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse(self, response):
    """Yield the cheapest fitted product per tyre from an embedded JSON blob.

    The page carries a ``JsonObject = {...}`` line whose ``Rest`` and
    ``Deals`` lists describe the tyres.  Each non-winter tyre is built with
    its selling price, then re-priced with ``FullyFittedPrice`` and marked
    as 'Fitted'.  Products sharing brand, name, size and flags are reduced
    to the one with the lowest price.  Pages without a selector or without
    the JSON line are logged and recorded in ``self.errors``.
    """
    try:
        hxs = HtmlXPathSelector(response)
    except AttributeError:
        msg = 'Error getting selector on page for row: %s' % response.meta[
            'row']
        self.log('[ERROR] %s' % msg)
        self.errors.append(msg)
        return

    row = response.meta['row']
    base_url = get_base_url(response)

    json_data = None
    for line in hxs.extract().split('\n'):
        if "JsonObject = " in line:
            json_data = json.loads(
                line.replace('JsonObject = ', '').replace('; \r', ''))
    if json_data is None:
        # Previously a TypeError on json_data['Rest'].
        msg = 'No JsonObject data found on page for row: %s' % row
        self.log('[ERROR] %s' % msg)
        self.errors.append(msg)
        return

    products = json_data['Rest'] + json_data['Deals']

    collected_products = []
    for product_info in products:
        # skip winter tyres
        if product_info['WinterTyre']:
            continue

        loader = ProductLoader(item=Product(), selector=product_info)
        loader.add_value('name', product_info['ModelName'])

        brand = product_info['Manufacturer']
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        identifier = product_info['PrimaryId']
        fitting_method = 'Delivered'

        url = '/catalogue' + product_info[
            'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
        loader.add_value('url', urljoin(base_url, url))

        image_url = product_info.get('ModelImageLarge')
        if not image_url:
            image_url = product_info.get('ModelImage')
        if image_url:
            # The value may be an <img> tag; keep only its src attribute.
            image_url = image_url.split('src="')[-1].split('"')[0]
            loader.add_value('image_url', urljoin(base_url, image_url))

        loader.add_value('identifier',
                         str(identifier) + '-' + fitting_method)
        loader.add_value('price', product_info['SellingPrice'])

        spec = product_info['SpecificationName']

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        # The last word of the specification is used as the speed rating.
        metadata['speed_rating'] = spec.split()[-1]
        metadata['width'] = row['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = product_info['LoadRatingName']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if product_info['Reinforced'] else 'No'
        metadata['run_flat'] = 'Yes' if product_info['RunFlat'] else 'No'

        manufacturer_mark = product_info['Variant']
        if manufacturer_mark:
            manufacturer_mark = manufacturer_mark.split()[0].strip()
        metadata['manufacturer_mark'] = (
            find_man_mark(manufacturer_mark) if manufacturer_mark else '')

        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        # Keep the scraped rating as the alternative one when the helper
        # replaced it and no explicit alternative was returned.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alt_speed = scraped_speed_rating
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating

        # Do not collect "Delivered" tyres: the item is switched to the
        # fully fitted price and identifier before being collected.
        fitting_method = 'Fitted'
        product['price'] = product_info['FullyFittedPrice']
        product['identifier'] = str(identifier) + '-' + fitting_method
        product['metadata']['fitting_method'] = fitting_method

        collected_products.append(product)

    # Keep only the cheapest product for each distinct tyre description.
    min_price_products = {}
    for product in collected_products:
        key = "%s-%s-%s-%s-%s-%s-%s" % (
            product['brand'], product['name'],
            product['metadata']['fitting_method'],
            product['metadata']['full_tyre_size'],
            product['metadata']['xl'], product['metadata']['run_flat'],
            product['metadata']['manufacturer_mark'])
        if key not in min_price_products:
            min_price_products[key] = product
        elif product['price'] < min_price_products[key]['price']:
            min_price_products[key] = product

    for product in min_price_products.values():
        yield product
def parse(self, response):
    """Parse a results page made of ``tyreResult`` blocks into products.

    Each block yields one ``Product`` with ``MicheldeverMeta`` metadata.
    Width, aspect ratio and rim come from ``response.meta['row']``; load and
    speed rating are parsed out of the on-page title when possible.  Blocks
    carrying an ``li`` with class ``cw`` are treated as winter tyres and
    skipped.
    """
    hxs = HtmlXPathSelector(response)

    row = response.meta['row']

    products = hxs.select('//div[contains(@class, "tyreResult")]')

    for product in products:
        winter = product.select('.//li[@class="cw"]')
        # skip winter tyres
        if winter:
            continue
        loader = ProductLoader(item=Product(), selector=product)
        title = product.select(
            './/div[@class="tyreName"]/h4/text()').extract()[0].strip()
        brand = product.select('./@data-brand').extract()[0]
        brand = brand.title()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        # Drop as many leading characters as the brand is long; this assumes
        # the title starts with the brand -- TODO confirm.
        title = title[len(brand):].strip()
        # Group 1: load rating (optionally several values joined by "/");
        # group 2: speed rating (one or two capitals plus an optional digit).
        results = re.search(
            r"\b((?:\d{2,3}/)*(?:\d{2,3}))\s?([A-Z]{1,2}\d?)\b", title)
        if results:
            load_rating = results.group(1)
            speed_rating = results.group(2)
            # Text before the ratings is the pattern name; text after them
            # is kept in ``title`` for the manufacturer-mark search below.
            name = title[:results.start(1)]
            title = title[results.end(2):]
        else:
            # No ratings in the title: fall back to the searched speed rating.
            load_rating = ''
            speed_rating = row['Speed rating']
            name = title
            title = ''
        # The price is split on the page into a main part and a <sup> part.
        price = product.select(
            './/div[@class="tyreBuy"]//h5/text()').extract()[0]
        price_dec = product.select(
            './/div[@class="tyreBuy"]//h5/sup/text()').extract()[0]
        loader.add_value('price', extract_price(price + price_dec))

        identifier = product.select(
            './/input[@name="id"]/@value').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('url', '')
        image_url = product.select(
            './/div[@class="tyreImg"]/img[@class="tyre"]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating

        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        # metadata['alternative_speed_rating'] = ''
        # CSS class names of the feature icons shown for this tyre.
        specif = product.select(
            './/ul[@class="fixed"]//li/@class').extract()

        metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
        metadata['run_flat'] = 'Yes' if 'rf' in specif else 'No'

        # Manufacturer mark: first from the feature icons ...
        man_code = ''
        if 'bmw' in specif:
            man_code = '*'
        elif 'mer' in specif:
            man_code = 'MO'
        elif 'aud' in specif:
            man_code = 'AO'
        elif 'por' in specif:
            man_code = 'NO'

        # ... then from the name; a mark found there is cut out of the name
        # and only used when the icons did not already provide one.
        # NOTE(review): SOURCE formatting was collapsed; the ``break`` is
        # reconstructed at the ``if result`` level -- confirm against the
        # original file.
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                if man_code == '':
                    man_code = man_mark
                break
        # ... and finally from the text that followed the ratings.
        if man_code == '':
            for code, man_mark in self.all_man_marks.iteritems():
                result, title = cut_name(code, title)
                if result:
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code

        # Remove an "XL" marker from the name; the XL flag itself was taken
        # from the icons above.
        result, name = cut_name('XL', name)
        loader.add_value('name', name)

        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
             speed_rating))
        # metadata['alternative_speed_rating']))
        prod = loader.load_item()
        prod['metadata'] = metadata

        if not is_product_correct(prod):
            continue

        prod['metadata']['mts_stock_code'] = find_mts_stock_code(
            prod, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(prod)
        new_alt_speed = get_alt_speed(prod)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        prod['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            prod['metadata']['speed_rating'] if prod['metadata']['speed_rating'] != new_speed_rating else ''
        prod['metadata']['speed_rating'] = new_speed_rating

        yield prod
def parse_products(self, response):
    """Turn every non-gutter cell of the ``tyreResults`` table into a product.

    The cell's sub-title is parsed with ``parse_title_new``; the brand is
    normalised through ``self.brand_fixes`` and cells with an unknown brand
    are skipped.  Manufacturer mark, XL and run-flat markers are filtered
    out of the name and stored in the metadata.  Size values come from
    ``response.meta['row']``.
    """
    selector = HtmlXPathSelector(response)
    search_row = response.meta['row']

    cells = selector.select('//*[@id="tyreResults"]//tr[contains(@class, "tyre")]//td[@class != "gutter"]')
    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)

        title_nodes = cell.select('.//p[@class="subTitle"]/text()').extract()
        if not title_nodes:
            continue
        # Collapse all runs of whitespace in the on-page title.
        title = ' '.join(title_nodes[0].split())

        parsed_title = parse_title_new(title)
        brand = parsed_title['brand']
        load_rating = parsed_title['load_rating']
        speed_rating = parsed_title['speed_rating']
        name = parsed_title['name']
        if not (name and brand):
            self.log("++++++++++++++++++++++++++++{}==================".format(title))

        # Map known alternative spellings onto the canonical brand.
        for canonical_brand, spellings in self.brand_fixes.iteritems():
            if brand.lower() in spellings:
                brand = canonical_brand
                break
        brand = brand.title()
        if brand not in self.brand_fixes:
            self.log('Wrong brand %s' % brand)
            continue

        loader.add_value('brand', unify_brand(brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)

        # The price is split between the <h6> text and its <sup> child.
        main_part = cell.select('.//h6[@class="price"]/text()').extract()[0]
        sup_part = cell.select('.//h6[@class="price"]/sup/text()').extract()[0]
        loader.add_value('price', extract_price(main_part + sup_part))

        buy_href = cell.select('./a[@class="btnBuy png_bg"]/@href').extract()[0]
        loader.add_value('identifier', buy_href.split('/')[-1])
        loader.add_value('url', '')

        image_srcs = cell.select('.//img[@class="tyreImg"]/@src').extract()
        if image_srcs:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), image_srcs[0]))

        metadata = MicheldeverMeta()
        metadata['onsite_name'] = title
        metadata['aspect_ratio'] = search_row['Aspect Ratio']
        metadata['rim'] = search_row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = search_row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating

        self.log("===============matching================")
        self.log(str(name))

        man_mark, name = filter_man_code(name, self.all_man_marks, self.custom_man_marks)
        metadata['manufacturer_mark'] = man_mark
        self.log(str((metadata['manufacturer_mark'], name)))

        has_xl, name = filter_xl(name)
        if has_xl:
            metadata['xl'] = "Yes"
        else:
            metadata['xl'] = "No"
        self.log(str((metadata['xl'], name)))

        has_run_flat, name = filter_run_flat(name)
        if has_run_flat:
            metadata['run_flat'] = "Yes"
        else:
            metadata['run_flat'] = "No"
        self.log(str((metadata['run_flat'], name)))

        self.log("===============/matching===============")

        # Drop a dangling opening bracket left behind by the filters.
        if name.endswith('('):
            name = name[:-1]
        loader.add_value('name', name.strip())

        size_parts = (search_row['Width'], search_row['Aspect Ratio'], search_row['Rim'],
                      load_rating, speed_rating)
        metadata['full_tyre_size'] = '/'.join(size_parts)

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif item['metadata']['speed_rating'] != new_speed_rating:
            alt_speed = item['metadata']['speed_rating']
        else:
            alt_speed = ''
        item['metadata']['alternative_speed_rating'] = alt_speed
        item['metadata']['speed_rating'] = new_speed_rating

        yield item
def parse(self, response):
    """Parse a single product page into one ``Product``.

    Brand, load rating and name are cut out of the page heading around the
    ``<width>/<aspect>R<rim>`` size string built from
    ``response.meta['search_params']``.  When the heading does not contain
    that size string (or the name contains ``www.tyretraders.com``) the same
    URL is requested again, up to the retry limit tracked in
    ``response.meta['retry']``.  The price is taken from ``response.meta``.
    """
    selector = HtmlXPathSelector(response)
    search_params = response.meta['search_params']
    formdata = response.meta['formdata']

    loader = ProductLoader(item=Product(), selector=selector)

    heading = selector.select(
        '//div[@class="rightpanel"]//h1/text()').extract()[0]
    title = ' '.join(heading.split())

    tyre_params = "{}/{}R{}".format(search_params['width'],
                                    search_params['aspect_ratio'],
                                    search_params['rim'])
    # Text before the size string is the brand; text after it starts with
    # the load rating followed by the speed symbol.
    before_size, _, after_size = title.partition(tyre_params)
    brand = before_size.strip()
    load_rating = after_size.strip().split(formdata['speed'])[0].strip()

    name = title.partition('Fuel Effic')[0].replace('~', '').strip()
    size_prefix = '{} {} {}{} '.format(brand, tyre_params, load_rating,
                                       formdata['speed'])
    name = name.replace(size_prefix, '')

    brand = brand.title()
    if 'goodrich' in brand.lower():
        brand = 'BFG'
    loader.add_value('brand', unify_brand(brand))

    if 'www.tyretraders.com' in name or tyre_params not in title:
        # The page did not load the expected product: retry or give up.
        meta = response.meta
        meta['retry'] += 1
        if meta['retry'] < 10:
            yield Request(response.url,
                          callback=self.parse,
                          meta=meta,
                          dont_filter=True)
        else:
            self.log('Giving up retrying to reload the product: {}'.format(
                response.url))
        return

    loader.add_value('price', response.meta.get('price'))

    # The identifier is the URL-quoted text between the last "|" and the
    # following ".".
    raw_identifier = response.url.split("|")[-1].split(".")[0]
    loader.add_value('identifier', url_unquote(raw_identifier))
    loader.add_value(
        'category', find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('url', response.url)

    image_srcs = selector.select(
        '//div[@class="rightpanel"]//img[@style=" max-width:450px;"]/@src'
    ).extract()
    if image_srcs:
        loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), image_srcs[0]))

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = search_params['aspect_ratio']
    metadata['rim'] = search_params['rim']
    metadata['speed_rating'] = search_params['speed_rating']
    metadata['width'] = search_params['width']
    metadata['fitting_method'] = 'Fitted'
    metadata['load_rating'] = load_rating

    # Both "XL" and "RF" in the name set the XL flag.
    found_xl, name = remove_whole_word('XL', name)
    found_rf, name = remove_whole_word('RF', name)
    if found_xl or found_rf:
        metadata['xl'] = 'Yes'
    else:
        metadata['xl'] = 'No'

    found_runflat, name = remove_whole_word('runflat', name)
    if found_runflat:
        metadata['run_flat'] = 'Yes'
    else:
        metadata['run_flat'] = 'No'

    man_code = ''
    for code, man_mark in self.all_man_marks.iteritems():
        found_code, name = remove_whole_word(code, name)
        if found_code:
            man_code = man_mark
            break
    metadata['manufacturer_mark'] = man_code

    loader.add_value('name', name)

    size_parts = (metadata['width'], metadata['aspect_ratio'],
                  metadata['rim'], load_rating, metadata['speed_rating'])
    metadata['full_tyre_size'] = '/'.join(size_parts)

    item = loader.load_item()
    item['metadata'] = metadata

    if not is_product_correct(item):
        return

    item['metadata']['mts_stock_code'] = find_mts_stock_code(
        item, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(item)
    new_alt_speed = get_alt_speed(item)
    # Prefer the looked-up alternative rating; otherwise keep the scraped
    # rating as the alternative when the lookup replaced it.
    if new_alt_speed:
        alt_speed = new_alt_speed
    elif item['metadata']['speed_rating'] != new_speed_rating:
        alt_speed = item['metadata']['speed_rating']
    else:
        alt_speed = ''
    item['metadata']['alternative_speed_rating'] = alt_speed
    item['metadata']['speed_rating'] = new_speed_rating

    yield item
def parse_product(self, product, fitted, search_params):
    """Build one ``Product`` item from a search-result element.

    :param product: selector of a single result element.
    :param fitted: when true the fitted price is used and the identifier
        gets the ``-F`` suffix; otherwise the delivered price and ``-D``.
    :param search_params: mapping with ``width``, ``aspect_ratio``, ``rim``
        and ``speed_rating`` of the current search.
    :returns: the loaded item, or ``None`` when the price is missing or
        ``is_product_correct`` rejects the item.
    """
    url = product.select('.//div[@class="mod-item-body"]/h3//a/@href').extract()[0]
    p_id = url.split('/')[-1]
    p_id += '-F' if fitted else '-D'

    # The image is optional: a result without one must not abort the page.
    image_url = product.select('.//div[@class="mod-item-img"]//img/@src').extract()
    brand = product.select('.//div[@class="mod-item-body"]/h3/text()').extract()[0].strip()

    try:
        if not fitted:
            price = product.select('.//div[@class="mod-delivered"]/a/text()').extract()[0]
        else:
            price = product.select('.//div[@class="mod-fitted"]/a/text()').extract()[0]
    except IndexError:
        msg = "Price not found: %s" % str(product)
        self.log(msg)
        self.errors.append(msg)
        return

    name = product.select('.//div[@class="mod-item-body"]/h3/span/a/text()').extract()[0]
    upper_name = name.upper()
    speed_rating = search_params['speed_rating'].upper()
    # Escaped so the rating is always matched literally inside the regexes.
    speed_rating_re = re.escape(speed_rating)

    # The pattern name is what remains once the size and the
    # load/speed-rating token are removed from the on-page name.
    pattern = re.sub(r'\d+[^\s]+R\d+', '', name)
    pattern = re.sub(r'[\d/]+%s' % speed_rating_re, '', pattern)
    pattern = pattern.strip()
    if not pattern:
        pattern = name.strip()

    loader = ProductLoader(item=Product(), selector=product)
    loader.add_value('url', url)
    loader.add_value('identifier', p_id)
    if image_url:
        loader.add_value('image_url', image_url[0])
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('price', price)

    pattern = pattern.upper()
    pattern = pattern.replace('XL', '').replace('RFLAT', '').replace('RUNFLAT', '')
    loader.add_value('name', pattern)

    m = MicheldeverMeta()
    m['aspect_ratio'] = search_params['aspect_ratio']
    m['rim'] = search_params['rim']
    m['width'] = search_params['width']
    m['speed_rating'] = speed_rating

    # The load rating is the run of digits and slashes right before the
    # speed rating.
    res = re.search(r'([\d/]+)%s' % speed_rating_re, name)
    if res:
        m['load_rating'] = res.group(1)
    else:
        self.log('ERROR: not load rating: %s' % url)
        m['load_rating'] = ''

    if 'RFLAT' in upper_name or 'RUNFLAT' in upper_name:
        m['run_flat'] = 'Yes'
    else:
        m['run_flat'] = 'No'
    m['xl'] = 'Yes' if 'XL' in upper_name else 'No'

    m['full_tyre_size'] = '/'.join((m['width'],
                                    m['aspect_ratio'],
                                    m['rim'],
                                    m['load_rating'],
                                    m['speed_rating']))
    m['fitting_method'] = 'Fitted' if fitted else 'Delivered'
    m['manufacturer_mark'] = self._get_manufacturer_code(name)

    # A separate name keeps the ``product`` selector argument intact.
    item = loader.load_item()
    item['metadata'] = m

    if not is_product_correct(item):
        return

    item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(item)
    new_alt_speed = get_alt_speed(item)
    # Prefer the looked-up alternative rating; otherwise keep the scraped
    # rating as the alternative when the lookup replaced it.
    item['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
        item['metadata']['speed_rating'] if item['metadata']['speed_rating'] != new_speed_rating else ''
    item['metadata']['speed_rating'] = new_speed_rating

    return item
def parse_product_cache(self, identifier, price, out_of_stock, product):
    """Rebuild a product from cached data with a fresh price and stock flag.

    The name, identifier, sku, url, image_url and brand are copied from
    ``self.products_data[identifier]``; metadata is attached when
    ``self.products_metadata`` has an entry for the identifier.  Returns
    ``None`` (after recording ``product['identifier']`` in
    ``self.incorrect_identifiers``) when ``is_product_correct`` rejects the
    rebuilt item.

    >>> spider = CamSkillSpider()
    >>> product = {\
            "brand": "Pirelli", \
            "category": 'R16" - 205/55/16, 205/55R16', \
            "identifier": "113764", \
            "image_url": "http://www.camskill.co.uk/smsimg/1943/113764--main--1943.jpg", \
            "metadata": {\
                "alternative_speed_rating": "", \
                "aspect_ratio": "55", \
                "fitting_method": "Delivered", \
                "full_tyre_size": "205/55/16/91/V", \
                "load_rating": "91", \
                "manufacturer_mark": "", \
                "mts_stock_code": "2055516VPIP7", \
                "rim": "16", \
                "run_flat": "No", \
                "speed_rating": "V", \
                "width": "205", \
                "xl": "No"\
            }, \
            "name": "Cinturato P7", \
            "price": "64.40", \
            "sku": None, \
            "stock": "0", \
            "url": "http://www.camskill.co.uk/m62b0s291p113764/Pirelli_Tyres_Car_Pirelli_P7_Cinturato_Pirelli_P_7_-_205_55_R16_91V_TL_Fuel_Eff_%3A_E_Wet_Grip%3A_A_NoiseClass%3A_2_Noise%3A_70dB"\
        }
    >>> spider.products_data['113764'] = product
    >>> product_ = spider.parse_product_cache("113764", 123, False, product)
    >>> product_['metadata']['mts_stock_code']
    '2055516VPIP7CINT'
    """
    loader = ProductLoader(item=Product(), selector=product)

    cached_fields = self.products_data[identifier]
    for field_name in ('name', 'identifier', 'sku', 'url', 'image_url', 'brand'):
        loader.add_value(field_name, cached_fields[field_name])

    brand_segment = find_brand_segment(loader.get_output_value('brand'))
    loader.add_value('category', brand_segment)
    loader.add_value('price', price)
    if out_of_stock:
        loader.add_value('stock', 0)

    rebuilt = loader.load_item()
    if identifier in self.products_metadata:
        rebuilt['metadata'] = self.products_metadata[identifier]

    if not is_product_correct(rebuilt):
        self.incorrect_identifiers.append(product['identifier'])
        return

    rebuilt['metadata']['mts_stock_code'] = find_mts_stock_code(
        rebuilt, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(rebuilt)
    new_alt_speed = get_alt_speed(rebuilt)
    # Prefer the looked-up alternative rating; otherwise keep the cached
    # rating as the alternative when the lookup replaced it.
    if new_alt_speed:
        alt_speed = new_alt_speed
    elif rebuilt['metadata']['speed_rating'] != new_speed_rating:
        alt_speed = rebuilt['metadata']['speed_rating']
    else:
        alt_speed = ''
    rebuilt['metadata']['alternative_speed_rating'] = alt_speed
    rebuilt['metadata']['speed_rating'] = new_speed_rating

    return rebuilt
def parse_products(self, response):
    """Parse one page of tyres delivered as HTML inside a JSON response.

    The ``display_tyres`` value of the JSON body is parsed as HTML and every
    ``tyre_container`` element becomes a ``Product``.  Size values come from
    ``response.meta['search_params']``; load and speed rating are read from
    the ``tyre_details`` text.  While a page contains any products, the next
    page (``response.meta['page'] + 1``) is requested.
    """
    tyres_html = json.loads(response.body)['display_tyres']
    hxs = HtmlXPathSelector(text=tyres_html)

    search_params = response.meta['search_params']

    products = hxs.select('//div[contains(@class, "tyre_container")]')

    for tyre_el in products:
        loader = ProductLoader(item=Product(), selector=tyre_el)

        brand_texts = tyre_el.select(
            './/form/span[@class="tyre_brand_text"]/text()').extract()
        if brand_texts:
            brand = brand_texts[0]
        else:
            brand = ''

        # NOTE(review): this absolute query runs on the document selector,
        # not on the current tyre element -- confirm it can ever match.
        winter_tyre = hxs.select(
            '/div/div/div[@class="winter_img"]').extract()
        if winter_tyre:
            continue

        # Use the spelling from self.brands when it matches ignoring case.
        for known_brand in self.brands:
            if known_brand.upper() == brand.strip().upper():
                brand = known_brand

        # The last text node of the brand span is used as the product name.
        full_name = tyre_el.select(
            './/form/span[@class="tyre_brand_text"]/text()').extract()[-1]

        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)

        identifier = tyre_el.select('.//input[@name="tyre"]/@value').extract()
        loader.add_value('identifier', identifier)
        loader.add_value('url', 'http://www.tyregiant.com')

        image_srcs = tyre_el.select(
            './/img[@class="tyre_image"]/@src').extract()
        if image_srcs:
            loader.add_value(
                'image_url', urljoin(get_base_url(response), image_srcs[0]))

        price = tyre_el.select(
            './/*[@class="tyre_price"]/span/text()').extract()
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = search_params['aspect_ratio']
        metadata['rim'] = search_params['rim']

        tyre_details = tyre_el.select(
            './/form/p[@class="tyre_details"]/text()').extract()[0]
        # First whitespace-delimited token that starts with digits: its last
        # character is the speed rating, everything before it the load rating.
        speed = re.search('(\s\d+\w+\s)', tyre_details)
        if speed:
            load_rating = speed.group().strip()[:-1]
            speed_rating = speed.group().strip()[-1]
        else:
            load_rating = ''
            speed_rating = ''

        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = search_params['width']
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''

        xl_icon = tyre_el.select('.//img[@class="xl_img"]/@src').extract()
        if xl_icon:
            metadata['xl'] = 'Yes'
        else:
            metadata['xl'] = 'No'

        run_flat_icon = tyre_el.select(
            './/img[@class="rf_img"]/@src').extract()
        if run_flat_icon:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)

        size_parts = (search_params['width'], search_params['aspect_ratio'],
                      search_params['rim'], metadata['load_rating'],
                      metadata['speed_rating'])
        metadata['full_tyre_size'] = '/'.join(size_parts)

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif item['metadata']['speed_rating'] != new_speed_rating:
            alt_speed = item['metadata']['speed_rating']
        else:
            alt_speed = ''
        item['metadata']['alternative_speed_rating'] = alt_speed
        item['metadata']['speed_rating'] = new_speed_rating

        yield item

    if products:
        # A non-empty page means there may be more: ask for the next one.
        meta = response.meta
        next_page = meta['page'] + 1
        next_url = 'http://www.tyregiant.com/update-tyres/%s' % str(next_page)
        meta['page'] = next_page
        yield Request(next_url,
                      dont_filter=True,
                      callback=self.parse_products,
                      meta=meta)
def parse_list(self, response):
    """Parse a listing page and then continue with the next search.

    The attribute named by ``response.meta['thread']`` is set to ``True`` on
    the spider.  Product identifiers are decoded from the page's
    ``__VIEWSTATE`` value with ``parse_identifiers`` and consumed in page
    order, one per listed product (including skipped winter tyres).  After
    all products, everything produced by ``self.next_search()`` is yielded.
    """
    setattr(self, response.meta.get('thread'), True)

    hxs = HtmlXPathSelector(response)

    viewstate = hxs.select(
        '//input[@name="__VIEWSTATE"]/@value').extract()[0]
    identifiers = parse_identifiers(viewstate)

    boxes = hxs.select(
        '//div[@class="main-list"]//div[@class="group conti-box"]')

    for box in boxes:
        # Always consume an identifier so the list stays aligned with the
        # boxes, even when this box is skipped below.
        identifier = identifiers.pop(0)

        specif = box.select('.//span[@class="blue"]//div/text()').extract()
        # skip winter tyres
        if 'WINTER' in specif:
            continue

        loader = ProductLoader(item=Product(), selector=box)

        raw_title = box.select(
            './/div[@class="conti-gray"]/text()').extract()[0]
        # Title lines: "<width>/<ratio>R", rim, "<load><speed>", ..., name.
        title_lines = raw_title.strip().split('\r\n')
        name = title_lines[-1].strip()
        width = title_lines[0].split("/")[0].strip()
        ratio = title_lines[0].split("/")[1].replace("R", "").strip()
        rim = title_lines[1].strip()
        rating = title_lines[2].strip()

        rating_match = re.search(
            r"((?:\d{1,3}/)*(?:\d{1,3}))([A-Z]{1,2}\d?)", rating)
        if rating_match:
            load_rating, speed_rating = rating_match.group(1), rating_match.group(2)
        else:
            load_rating = ''
            speed_rating = ''

        brand = box.select(
            './/div[@class="black-conti"]/text()').extract()[0].strip()
        brand = brand.title()
        if 'bfg' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)

        price_text = box.select('.//h4[@class="prc"]/text()').extract()[0]
        loader.add_value('price', extract_price(price_text))

        loader.add_value('identifier', identifier)
        loader.add_value('url', '')

        image_srcs = box.select('.//div[@class="sec-img"]/img/@src').extract()
        if image_srcs:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_srcs[0]))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating

        if 'REINFORCED' in specif:
            metadata['xl'] = 'Yes'
        else:
            metadata['xl'] = 'No'
        if 'RUN FLAT' in specif:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        # Look for a standard manufacturer mark first; a match is cut out of
        # the name.
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            found, name = cut_name(code, name)
            if found:
                man_code = man_mark
                break
        # Otherwise try the custom marks, which must end the name.
        if not man_code:
            for code, man_mark in self.custom_man_marks.iteritems():
                if not name.endswith(code):
                    continue
                name = name.partition(code)[0]
                man_code = man_mark
                break
        metadata['manufacturer_mark'] = man_code

        loader.add_value('name', name)

        size_parts = (metadata['width'], metadata['aspect_ratio'],
                      metadata['rim'], load_rating, speed_rating)
        metadata['full_tyre_size'] = '/'.join(size_parts)

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif item['metadata']['speed_rating'] != new_speed_rating:
            alt_speed = item['metadata']['speed_rating']
        else:
            alt_speed = ''
        item['metadata']['alternative_speed_rating'] = alt_speed
        item['metadata']['speed_rating'] = new_speed_rating

        yield item

    for next_item in self.next_search():
        yield next_item
def parse_products(self, response):
    """Parse a product-listing page and follow its "next" link.

    Search values (``Width``, ``Aspect Ratio``, ``Rim``, ``Speed rating``,
    ``Alt Speed``) come from ``response.meta['product_data']``.  Each listed
    product's name is split into brand, load rating and pattern name, either
    with regular expressions built from the search values or, when the name
    contains a bracketed part, by taking the first word as brand and the
    text before the bracket as pattern.  Products whose link or name is
    missing, or whose name cannot be parsed, are skipped.
    """
    hxs = HtmlXPathSelector(response)
    product_data = response.meta['product_data']
    width = product_data['Width']
    aspect_ratio = product_data['Aspect Ratio']
    rim = product_data['Rim']
    speed_rating = product_data['Speed rating']
    alt_speed = product_data['Alt Speed']

    # Groups: (brand, load rating, pattern); the third expression has no
    # rating and only yields (brand, rest of the name).
    name_reg = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, speed_rating.upper())
    name_reg2 = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, alt_speed.upper())
    name_reg3 = r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim)

    def log_parse_failure(name):
        # Report a product name that none of the strategies could parse.
        self.log('Failed parsing ' + name)
        self.log('URL: ' + response.url)
        self.log('Params: ' + ", ".join(map(str, [width, rim, speed_rating.upper()])))

    products = hxs.select('//div[@id="product-listing"]//div[@class="product"]/..')

    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        # Skip elements without a product link (explicit check instead of
        # a bare ``except`` that would also hide unrelated errors).
        urls = product_el.select('.//div[@class="title"]/a/@href').extract()
        if not urls:
            continue
        url = urls[0]

        loader.add_value('url', url)
        loader.add_value('identifier', product_el.select(".//span[@class='addcompare']/input/@id").extract()[0].split(":")[1])
        loader.add_xpath('price', './/span[@class="prodPirce"]/text()')

        names = product_el.select('.//div[@class="title"]/a/text()').extract()
        if not names:
            continue
        name = names[0]

        if not re.search(r'(\(.*\))', name):
            # Try the main speed rating, then the alternative one, then the
            # rating-less fallback.
            match = (re.search(name_reg, name)
                     or re.search(name_reg2, name)
                     or re.search(name_reg3, name))
            if not match:
                log_parse_failure(name)
                continue
            name_parts = match.groups()
        else:
            # Names with a bracketed part: first word is the brand, the load
            # rating precedes the (alternative) speed rating, and the
            # pattern is the text between the brand and the bracket.
            load_match = (re.search(r'(\d+)%s' % speed_rating.upper(), name)
                          or re.search(r'(\d+)%s' % alt_speed.upper(), name))
            if not load_match:
                log_parse_failure(name)
                continue
            name_parts = [
                name.split()[0],
                load_match.groups()[0],
                ' '.join(name.split()[1:]).split('(')[0],
            ]

        loader.add_value('name', name_parts[-1].replace('XL', '').replace('ROF', '').replace('RFT', ''))
        brand = name_parts[0]
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_xpath('image_url', './/a[contains(@class, "tyre")]/img/@src')

        upper_name = name.upper()
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['width'] = width
        metadata['speed_rating'] = speed_rating.upper()
        # NOTE(review): with the rating-less fallback this is the rest of
        # the name rather than a load rating -- confirm that is intended.
        metadata['load_rating'] = name_parts[1]
        if 'ROF' in upper_name or 'RFT' in upper_name:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'
        metadata['xl'] = 'Yes' if 'XL' in upper_name else 'No'

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))
        metadata['fitting_method'] = 'Fitted'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(name_parts[-1])

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            self.log('The product is not correct: %r' % product)
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating

        yield product

    next_page = hxs.select('//span[@class="nextlink"]/a/@href')
    if next_page:
        yield Request(next_page.extract()[0], callback=self.parse_products, meta=response.meta)
def parse(self, response):
    """Parse a tyre-list page into ``Product`` items.

    Every ``tyre-list-tyre`` element becomes one product unless it carries a
    "Winter Tyre" image.  Width, aspect ratio, speed rating and rim are
    parsed from the element's "tyre size" text; an element whose size text
    is missing or cannot be parsed is logged and skipped so that the rest of
    the page is still processed.
    """
    hxs = HtmlXPathSelector(response)

    products = hxs.select('//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')

    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)
        loader.add_xpath('name', 'div//div[@class="manufacturerText"]/p/strong/text()')
        # The image alt text is "<brand> - ..."; keep only the brand.
        brand = ''.join(product_el.select('div//div[@class="manufacturerImage"]/img/@alt').extract()).split(' - ')[0]

        # skip winter tyres
        if product_el.select('div//img[@alt="Winter Tyre"]'):
            continue

        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        identifier = product_el.select('div//div[@class="pricingAddToOrder clearfix"]/input/@value').extract()[0]

        loader.add_value('url', '')

        image_url = product_el.select('div[@class="image"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))

        loader.add_value('identifier', identifier)
        price = product_el.select('div//div[contains(@class, "pricingSelection")]//a/strong/text()').extract()
        price = re.findall(r"\d+.\d+", price[0]) if price else '0.0'
        loader.add_value('price', price)

        size_text = product_el.select('.//div[contains(@class, "manufacturerText")]/p/span/text()').extract()
        size_match = None
        if size_text:
            size_match = re.search(r'tyre size (\d+)\/(\d+)(\w{1})(\d+)', size_text[0].strip(), re.I)
        if not size_match:
            # Previously this raised and aborted every remaining product.
            self.log('ERROR parsing tyre size for product "%s" [%s]' % (identifier, response.url))
            continue
        width, aspect, speed_rating, rim = size_match.groups()

        fitting_method = 'Fitted'

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = fitting_method

        load_rating = product_el.select('div//li/a[@rel="load-index-description"]/text()').extract()
        # The link text is "<label>: <value>"; keep the value.
        metadata['load_rating'] = load_rating[0].split(': ')[-1] if load_rating else ''
        metadata['alternative_speed_rating'] = ''

        xl = product_el.select('div//img[@title="Reinforced"]/@title').extract()
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = product_el.select('div//img[@title="Run Flat"]').extract()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        manufacturer_mark = product_el.select('div//img[contains(@title, "Homologated for fitment to certai")]/@title').extract()
        # Reduce "Homologated for fitment to certain <make> cars." to <make>.
        manufacturer_mark = manufacturer_mark[0].replace('Homologated for fitment to certain ', '').replace(' cars.', '') if manufacturer_mark else ''
        metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse_products(self, response):
    """Parse the JSON product list returned by the tyre search service.

    The response body is a JSON object whose ``d`` member is itself a JSON
    string holding the list of products.  Winter tyres are skipped.  All
    size and rating values are read from each product's
    ``ProductAttributes``; a missing brand or tread name falls back to an
    empty string.
    """
    json_data = json.loads(response.body)
    products = json.loads(json_data.get('d'))

    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        # Only missing or null members are tolerated here; anything else is
        # a real error and must not be swallowed.
        try:
            brand = product_el[u'ProductManufacturer'][u'TyreManufacturerName']
        except (KeyError, TypeError):
            brand = ''

        attributes = product_el[u'ProductAttributes']

        # skip winter tyres
        if attributes[u'IsWinter']:
            continue

        # Use the spelling from self.brands when it matches ignoring case.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        try:
            full_name = product_el[u'ProductTreadPattern'][u'TreadName']
        except (KeyError, TypeError):
            full_name = ''

        # Fix name changes
        if full_name in self.new_old_names:
            full_name = self.new_old_names[full_name]

        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        identifier = product_el.get('TyreID')
        loader.add_value('url', 'http://www.tyresonthedrive.com')
        image_url = ('http://www.tyresonthedrive.com/img/treads/'
                     + product_el[u'ProductTreadPattern'][u'TreadPatternImage']
                     + '.jpg')
        loader.add_value('image_url', image_url)
        loader.add_value('identifier', identifier)

        price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = str(attributes[u'Profile'])
        metadata['rim'] = str(attributes[u'Rim'])
        metadata['speed_rating'] = str(attributes[u'Speed'])
        metadata['load_rating'] = str(attributes[u'Load'])
        metadata['width'] = str(attributes[u'Section'])
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if attributes[u'IsExLoad'] else 'No'
        metadata['run_flat'] = 'Yes' if attributes[u'IsRunFlat'] else 'No'

        man_mark = attributes[u'OEMFitment']
        metadata['manufacturer_mark'] = find_man_mark(
            man_mark) if man_mark else ''

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the lookup replaced it.
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse_product(self, response):
    """Build a single product item from a tyre detail page.

    When the ``form1`` action (used as the identifier) or the hidden
    ``producerName`` input is missing, the result of
    ``self.retry_request(response)`` is yielded and nothing else. When the
    product name cannot be parsed by ``parse_pattern`` the error is logged,
    appended to ``self.errors`` and nothing is yielded.

    :param response: the product page response.
    :return: generator yielding at most one object.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)

    image_srcs = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_srcs:
        loader.add_value('image_url',
                         urljoin(get_base_url(response), image_srcs[0]))

    form_actions = hxs.select('//form[@name="form1"]/@action').extract()
    if not form_actions:
        yield self.retry_request(response)
        return
    loader.add_value('identifier', form_actions[0])

    price_nodes = hxs.select(
        '//*[@class="price"]/*[@class="mainPrice"]/text()')
    loader.add_value('price', price_nodes[0].extract())
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)

    producer_names = hxs.select(
        '//div[@class="hidden"]/input[@class="producerName"]/@value'
    ).extract()
    if not producer_names:
        yield self.retry_request(response)
        return
    brand = producer_names[0].strip()
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category',
                     find_brand_segment(loader.get_output_value('brand')))

    # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
    # the pattern should be set as the product's name
    plain_brand = re.sub(u'\u0119', u'e', brand)
    heading = hxs.select('//h1[@itemprop="name"]/text()')[0].extract().strip()
    product_name = re.sub(u'[:\u2122]', u'', heading)
    product_name = product_name.replace(plain_brand, '').strip()

    data = parse_pattern(product_name)
    if not data:
        message = 'ERROR parsing "{}" [{}]'.format(product_name, response.url)
        log.msg(message)
        self.errors.append(message)
        return

    loader.add_value('name', data['Name'])

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = 'Delivered'
    metadata['load_rating'] = data['Load_Rating'] or ''
    metadata['alternative_speed_rating'] = ''

    if 'XL' in product_name:
        metadata['xl'] = 'Yes'
    else:
        metadata['xl'] = 'No'

    lowered_name = product_name.lower()
    if 'run on flat' in lowered_name or 'run flat' in lowered_name:
        metadata['run_flat'] = 'Yes'
    else:
        metadata['run_flat'] = 'No'

    # a manufacturer mark counts only when it is a whole space-separated token
    name_tokens = product_name.split(' ')
    found_marks = [mark for mark in self.all_man_marks if mark in name_tokens]
    mark_key = found_marks[0].strip() if found_marks else []
    if mark_key:
        metadata['manufacturer_mark'] = self.all_man_marks.get(mark_key, '')
    else:
        metadata['manufacturer_mark'] = ''

    size_parts = (metadata['width'], metadata['aspect_ratio'],
                  metadata['rim'], metadata['load_rating'],
                  metadata['speed_rating'])
    metadata['full_tyre_size'] = '/'.join(size_parts)

    product = loader.load_item()
    product['metadata'] = metadata

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    # Both helpers are called before either metadata field is changed.
    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    scraped_speed = product['metadata']['speed_rating']
    if new_alt_speed:
        alt_speed = new_alt_speed
    elif scraped_speed != new_speed_rating:
        alt_speed = scraped_speed
    else:
        alt_speed = ''
    product['metadata']['alternative_speed_rating'] = alt_speed
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def parse(self, response):
    """Follow pagination links and yield a product for each listed tyre.

    A ``Request`` carrying ``response.meta`` is yielded for every link
    found in the "Page" paragraph. Each ``div.results`` element is then
    turned into a product item. Results showing a winter-tyre icon, results
    whose text ``parse_pattern`` cannot parse and results rejected by
    ``is_product_correct`` are skipped.

    :param response: a search results page.
    :return: generator of ``Request`` objects and product items.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    pages = hxs.select('//p[contains(text(),"Page")]//a/@href').extract()
    for page in pages:
        yield Request(urljoin(base_url, page), meta=response.meta)

    for result in hxs.select('//div[@class="results"]'):
        loader = ProductLoader(item=Product(), selector=result)

        # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
        # the pattern should be set as the product's name
        title_parts = result.select(
            './/div[@class="resultsLeft"]/div'
            '//text()[normalize-space()]').extract()
        size_parts = result.select(
            './/div[@class="t_size"]//text()[normalize-space()]').extract()
        name = ' '.join(map(unicode.strip, title_parts))
        # Append the size text once; the previous ``name += name + ...``
        # repeated the title text before the size.
        name += ' %s' % ' '.join(map(unicode.strip, size_parts))

        loader.add_xpath(
            'name',
            './/div[@class="resultsLeft"]/div//a/i/b/text()[normalize-space()]'
        )
        brand = result.select(
            './/div[@class="resultsLeft"]/div/b//text()[normalize-space()]'
        ).extract()[0].strip()

        # skip winter tyres
        if result.select(
                './/img[contains(@alt,"Winter / cold weather tyres")]'):
            continue
        if result.select(
                './/img[contains(@alt,"Wi") or contains(@src,"/simg/hiver.png")]'
        ):
            continue

        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        fitting_method = 'Fitted'

        url = result.select('.//a[i[b]]/@href')[0].extract()
        url = urljoin(base_url, url)
        # remove the cart_id value from the stored URL
        url = re.sub('cart_id=[^&]*', '', url)
        loader.add_value('url', url)

        image_url = result.select('.//a/img[@align="left"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        # the "typ" query parameter of the product link is the identifier
        identifier = urlparse.parse_qs(
            urlparse.urlparse(url).query)['typ'][0]
        loader.add_value('identifier', identifier)

        price = ''.join(
            result.select(
                './/div[@class="price"]/font/b//text()[normalize-space()]'
            ).extract())
        price = re.findall(r"\d+.\d+", price) if price else '0.0'
        loader.add_value('price', price)

        data = parse_pattern(name)
        if not data:
            # unparseable results are skipped without logging
            continue

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''

        xl = 'XL' in name
        metadata['xl'] = 'Yes' if xl else 'No'

        lowered_name = name.lower()
        run_flat = 'run flat' in lowered_name or 'runflat' in lowered_name
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        manufacturer_mark = result.select(
            './/div[@class="t_size"]/b/a[contains(@onmouseover,"Original") or '
            'contains(@onmouseover,"BMW") or contains(@onmouseover,"Porsche")]'
            '/@name[normalize-space()]').extract()
        manufacturer_mark = (manufacturer_mark[0].strip()
                             if manufacturer_mark else [])
        metadata['manufacturer_mark'] = find_man_mark(
            manufacturer_mark) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        # kept under its own name so the selector loop variable is not rebound
        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)

        # Both helpers are called before either metadata field is changed.
        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif item['metadata']['speed_rating'] != new_speed_rating:
            # keep the scraped rating as the alternative when it is replaced
            alt_speed = item['metadata']['speed_rating']
        else:
            alt_speed = ''
        item['metadata']['alternative_speed_rating'] = alt_speed
        item['metadata']['speed_rating'] = new_speed_rating
        yield item