def __init__(self, *args, **kwargs):
    """Set up the name matcher and load the MTS stock-code rows."""
    super(KwikFitSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    # Every row of the stock-code CSV becomes one tyre-size search entry.
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
def __init__(self, *args, **kwargs):
    """Set up the matcher, load stock codes and reset crawl bookkeeping."""
    super(MyTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    # Every row of the stock-code CSV becomes one tyre-size search entry.
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    self.errors = []
    self.search_history = set()
def __init__(self, *args, **kwargs):
    """Load the stock-code rows and the manufacturer-mark lookup table."""
    super(TyreDriveSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    # Map each manufacturer code to its canonical mark.
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (row['code'], row['manufacturer_mark'])
            for row in csv.DictReader(marks_file))
    self.errors = []
def __init__(self, *args, **kwargs):
    """Load stock codes, manufacturer marks and custom fitment marks.

    The custom marks map fitment suffixes seen in product names to the
    canonical manufacturer mark (e.g. 'J' for Jaguar, 'AO' for Audi).
    """
    super(EtyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.tyre_sizes.append(row)
    with open(os.path.join(HERE, 'manmarks.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.all_man_marks[row['code']] = row['manufacturer_mark']
    # One dict literal instead of sixteen separate assignments.
    # NOTE: leading spaces/dashes in the keys are significant -- they
    # match the raw suffix text as it appears in product names.
    self.custom_man_marks.update({
        ' JAGUAR FITMENT': 'J',
        ' RAV4 FITMENT': '',
        ' NISSAN JUKE FITMENT': '',
        ' (PORSCHE FITMENT)': 'N0',
        ' LEXUS FITMENT': '',
        ' PRIUS FITMENT': '',
        ' TOYOTA AURIS FITMENT': '',
        ' - TOYOTA RAV4 FITMENT': '',
        ' BMW MINI FITMENT': '*',
        ' AUDI FITMENT': 'AO',
        ' JAG FITMENT': 'J',
        ' FERRARI MASERATI FITMENT': '',
        ' MASERATI FITMENT': '',
        ' - BMW FITMENT': '*',
        ' ASTON MARTIN FITMENT': '',
        ' MERCEDES & RENAULT FITMENT': 'MO',
    })
def __init__(self, *args, **kwargs):
    """Load stock codes, manufacturer marks and brand-spelling fixes."""
    super(BestBuyTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.tyre_sizes.append(row)
    with open(os.path.join(HERE, 'manmarks.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.all_man_marks[row['code']] = row['manufacturer_mark']
    # Misspellings/abbreviations of brand names, one update call instead
    # of one assignment per brand.
    self.brand_fixes.update({
        'Bridgestone': ["b'stone", 'b/stone', 'bridestone', 'bridgestohne',
                        'brridgestone'],
        'Continental': ['conti', 'contiental', 'continenal', 'continntal',
                        'contintenal'],
        'Dunlop': ['dlp'],
        'Goodyear': ['g’year', 'g’yr', 'g/year', 'goodyea', 'gy', 'gyr'],
        'Michelin': ['mich'],
        'Pirelli': ['pir', 'pire', 'pireelli'],
        # 'Uniroyal': ['uni'],
    })
    self.custom_man_marks = {
        '(LEXUS FITMENT)': '',
        '()': '',
        '(BMW FITMENT)': '*',
        '(RAV 4)': '',
        '(BMW)': '*'
    }
    self.errors = []
def __init__(self, *args, **kwargs):
    """Load search data: tyre sizes, manufacturer marks and brand list."""
    super(PointSSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (entry['code'], entry['manufacturer_mark'])
            for entry in csv.DictReader(marks_file))
    # One brand entry per stock-code row, in file order.
    self.brands = [entry['Brand'] for entry in self.tyre_sizes]
    self.processed_rows = {}
def __init__(self, *args, **kwargs):
    """Load the lego catalogue and back up the previous crawl output."""
    super(EcraterSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    # Map CSV column 2 to the UTF-8 decoded column 3 for every record.
    with open(os.path.join(HERE, 'lego.csv')) as lego_file:
        self.products = {
            record[2]: record[3].decode('utf8')
            for record in csv.reader(lego_file)
        }
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    # Keep a .bak copy of the last run's output, if one exists.
    previous_output = os.path.join(HERE, 'ecrater_products.csv')
    if os.path.exists(previous_output):
        shutil.copy(previous_output, previous_output + '.bak')
    # Errors
    self.errors = []
def __init__(self, *args, **kwargs):
    """Configure eBay.com matching options for the Lego USA feed."""
    super(LegoUsaEbaySpider, self).__init__()
    # Input feed and target site.
    self._csv_file = os.path.join(self.HERE, 'lego.csv')
    self._ebay_url = 'http://www.ebay.com'
    # Feed column indices used for searching and for item metadata.
    self._search_fields = [3, 2]
    self._meta_fields = [('sku', 2), ('name', 3), ('price', 4),
                         ('category', 1)]
    self._match_fields = ('sku', 'identifier')
    # Matching behaviour flags.
    self._converted_price = True
    self._all_vendors = True
    self._look_related = False
    self._check_valid_item = self._valid_item_
    self._re_sku = re.compile(r'(\d{3,})')
    self._check_diff_ratio = True
    # self._ratio_accuracy = 60
    self.matcher = Matcher(self.log)
def __init__(self, *args, **kwargs):
    """Load tyre sizes and manufacturer marks from the bundled CSVs."""
    super(EvenTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    self.manually_matched = []
    # Keep an independent copy of every stock-code row.
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes = [row.copy() for row in csv.DictReader(stock_file)]
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks = {
            row['code']: row['manufacturer_mark']
            for row in csv.DictReader(marks_file)
        }
    self.errors = []
def __init__(self, *args, **kwargs):
    """Load search data and register the idle handler that feeds searches."""
    super(TyreGiantSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (row['code'], row['manufacturer_mark'])
            for row in csv.DictReader(marks_file))
    self.brands = [row['Brand'] for row in self.tyre_sizes]
    self.search_history = set()
    self.finished = False
    # New searches are dispatched from the spider_idle signal handler.
    dispatcher.connect(self.spider_idle, signals.spider_idle)
def __init__(self, *args, **kwargs):
    """Load stock codes plus the IP codes cached by earlier runs."""
    super(MyTyresSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    # IP codes collected previously, keyed by product identifier.
    self.ip_codes = {}
    self.ip_codes_filename = os.path.join(HERE, 'mytyres_ip_codes.csv')
    if os.path.exists(self.ip_codes_filename):
        with open(self.ip_codes_filename) as cache_file:
            self.ip_codes = {
                row['identifier']: row['ip_code']
                for row in csv.DictReader(cache_file)
            }
    self.errors = []
    self.search_history = set()
def __init__(self, *args, **kwargs):
    """Load manufacturer marks and the cached product-image mapping."""
    super(LoveTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks = {
            row['code']: row['manufacturer_mark']
            for row in csv.DictReader(marks_file)
        }
    # Image URLs cached from a previous run, keyed by product URL.
    self.images = {}
    if os.path.exists(self.images_filename):
        with open(self.images_filename) as images_file:
            self.images = {
                row['product_url']: row['image_url']
                for row in csv.DictReader(images_file)
            }
    self.errors = []
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, *args, **kwargs):
    """Load stock codes, manufacturer marks, cached IP codes and the
    custom fitment-suffix marks used by etyres product names."""
    super(EtyresSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.matcher = Matcher(self.log)
    self.all_man_marks = {}
    self.custom_man_marks = {}
    self.tyre_sizes = []
    self.tyre_widths = {}
    self.tyre_profiles = {}
    self.tyre_rims = {}
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.tyre_sizes.append(row)
    with open(os.path.join(HERE, 'manmarks.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.all_man_marks[row['code']] = row['manufacturer_mark']
    # IP codes collected on previous runs, keyed by product identifier.
    self.ip_codes = {}
    self.ip_codes_filename = os.path.join(HERE, 'etyres_ip_codes.csv')
    if os.path.exists(self.ip_codes_filename):
        with open(self.ip_codes_filename) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.ip_codes[row['identifier']] = row['ip_code']
    # One dict literal instead of sixteen separate assignments.
    # NOTE: leading spaces/dashes in the keys are significant -- they
    # match the raw suffix text as it appears in product names.
    self.custom_man_marks.update({
        ' JAGUAR FITMENT': 'J',
        ' RAV4 FITMENT': '',
        ' NISSAN JUKE FITMENT': '',
        ' (PORSCHE FITMENT)': 'N0',
        ' LEXUS FITMENT': '',
        ' PRIUS FITMENT': '',
        ' TOYOTA AURIS FITMENT': '',
        ' - TOYOTA RAV4 FITMENT': '',
        ' BMW MINI FITMENT': '*',
        ' AUDI FITMENT': 'AO',
        ' JAG FITMENT': 'J',
        ' FERRARI MASERATI FITMENT': '',
        ' MASERATI FITMENT': '',
        ' - BMW FITMENT': '*',
        ' ASTON MARTIN FITMENT': '',
        ' MERCEDES & RENAULT FITMENT': 'MO',
    })
def __init__(self, *args, **kwargs):
    """Configure eBay.de matching options for the Husqvarna feed."""
    super(HusqvarnaDEEbaySpider, self).__init__()
    self._ebay_url = 'http://www.ebay.de'
    self._search_fields = ['brand', 'sku']
    self._all_vendors = True
    self._meta_fields = [('name', 'name'), ('price', 'price'),
                         ('brand', 'brand'), ('category', 'category')]
    self._match_fields = ('sku', )
    self._check_valid_item = self.__valid_item_
    self._converted_price = False
    # Was assigned twice in the original code; once is enough.
    self._check_diff_ratio = True
    self._re_sku = re.compile(r'(\d{3,})')
    self._look_related = False
    self.__collected_items = set()
    self.matcher = Matcher(self.log)
def __init__(self, *args, **kwargs):
    """Load stock codes (processed in reverse file order) and marks."""
    super(TyrebookersSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    # Work through the rows in reverse order.
    self.tyre_sizes = list(reversed(self.tyre_sizes))
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (row['code'], row['manufacturer_mark'])
            for row in csv.DictReader(marks_file))
    self.already_processed = []
    # NOTE: leading spaces in the keys are significant.
    self.custom_man_marks.update({
        ' Merc': 'MO',
        ' BMW': '*',
        ' Audi': 'AO',
    })
class OponeoSpider(BaseSpider):
    """Scrapes tyre offers from oponeo.co.uk, one search per stock-code row."""

    name = 'oponeo.co.uk_test'
    allowed_domains = ['oponeo.co.uk']
    start_urls = ('http://www.oponeo.co.uk', )
    tyre_sizes = []
    all_man_marks = {}
    download_delay = 1

    def __init__(self, *args, **kwargs):
        """Load the stock-code rows and the manufacturer-mark table."""
        super(OponeoSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.errors = []

    def start_requests(self):
        """Issue one tyre-finder search per stock-code row.

        Each request gets its own cookiejar so the searches do not share
        session state.  (The unused ``requests`` list from the original
        code has been removed.)
        """
        cookie = 1
        self.log("[OPONEO] Row to process: %d" % len(self.tyre_sizes))
        for i, row in enumerate(self.tyre_sizes, 1):
            self.log("[OPONEO] Searching for tyre %d: %s, MTS code: %s" %
                     (i, row['Full Tyre Size'], row['MTS Stockcode']))
            search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                str(row['Speed rating']) + str(row['Rim'])
            meta = {'row': row, 'search': search, 'cookiejar': cookie}
            cookie += 1
            search_url = 'http://www.oponeo.co.uk/tyre-finder/s=2/summer,all-season/t=1/car/r=1/{Width}-{Aspect Ratio}-r{Rim}'.format(**row)
            yield Request(
                search_url,
                meta=meta,
                headers={
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0'
                })

    def parse(self, response):
        """Follow pagination (ASP.NET postback) and each product page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select('//*[@id="productList"]//div[@itemprop="offers"]')
        next_page = hxs.select(
            '//li[contains(@class, "next") and contains(@class, "nextItem")]/a/@id'
        ).extract()
        if next_page:
            next_page_id = next_page[0]
            # Pagination is an ASP.NET async postback, hence the form
            # submission with __EVENTTARGET instead of a plain GET.
            req = FormRequest.from_response(
                response,
                formname='form1',
                formdata={
                    '__ASYNCPOST': 'true',
                    '__EVENTTARGET': next_page_id,
                    '__EVENTARGUMENT': ''
                },
                headers={
                    'X-MicrosoftAjax': 'Delta=true',
                    'X-Requested-With': 'XMLHttpRequest',
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0'
                },
                meta=response.meta,
                callback=self.parse_next_page,
                dont_filter=True)
            yield req
        for product in products:
            product_url = product.select(
                './/div[@class="productName"]//a/@href')[0].extract()
            yield Request(urljoin(base_url, product_url),
                          callback=self.parse_product,
                          meta=response.meta)

    def parse_next_page(self, response):
        yield Request(response.url, dont_filter=True, meta=response.meta)

    def retry_request(self, response):
        """Re-request a product page, giving up after max_retry_count tries."""
        try_no = response.meta.get('try', 1)
        if try_no < self.max_retry_count:
            meta = {'try': try_no + 1}
            meta['recache'] = True
            self.log("[WARNING] Retrying. Failed to scrape product page: %s" %
                     response.url)
            yield Request(response.url,
                          meta=meta,
                          callback=self.parse_product,
                          dont_filter=True)
        else:
            self.log("[WARNING] Gave up. Failed to scrape product page: %s" %
                     response.url)
            self.errors.append("Failed to scrape product page: %s" %
                               response.url)

    def parse_product(self, response):
        """Extract one product plus its Micheldever metadata."""
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        # The full name of the tyre (product_name) is used to extract
        # metadata (i.e. run flat, xl); the pattern should be set as the
        # product's name.
        fitting_method = 'Delivered'
        loader.add_value('url', response.url)
        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        identifier = hxs.select('//form[@name="form1"]/@action').extract()
        if not identifier:
            # BUGFIX: retry_request is a generator -- re-yield the requests
            # it produces instead of yielding the generator object itself.
            for req in self.retry_request(response):
                yield req
            return
        identifier = identifier[0]
        loader.add_value('identifier', identifier)
        price = hxs.select(
            '//*[@class="price"]/*[@class="mainPrice"]/text()')[0].extract()
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        brand = hxs.select(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
        if not brand:
            # BUGFIX: same generator re-yield as above.
            for req in self.retry_request(response):
                yield req
            return
        brand = brand[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        # Normalise the Polish 'e with ogonek' before stripping the brand
        # out of the product name.
        brand = re.sub(u'\u0119', u'e', brand)
        product_name = hxs.select(
            '//h1[@itemprop="name"]/text()')[0].extract().strip()
        product_name = re.sub(u'[:\u2122]', u'', product_name)
        product_name = product_name.replace(brand, '').strip()
        data = parse_pattern(product_name)
        if not data:
            log.msg('ERROR parsing "{}" [{}]'.format(product_name,
                                                     response.url))
            self.errors.append('ERROR parsing "{}" [{}]'.format(
                product_name, response.url))
            return
        loader.add_value('name', data['Name'])
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        xl = 'XL' in product_name
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = 'run on flat' in product_name.lower(
        ) or 'run flat' in product_name.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        manufacturer_mark = [
            mark for mark in self.all_man_marks.keys()
            if mark in product_name.split(' ')
        ]
        # Use '' (not []) as the 'no mark' sentinel; both are falsy, so
        # the behaviour of the checks below is unchanged.
        manufacturer_mark = manufacturer_mark[0].strip(
        ) if manufacturer_mark else ''
        metadata['manufacturer_mark'] = self.all_man_marks.get(
            manufacturer_mark, '') if manufacturer_mark else ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))
        # metadata['alternative_speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            return
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        yield product

    def match_name(self, search_name, new_item, match_threshold=80,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class EvenTyresSpider(Spider):
    """Scrapes fitted tyre prices from event-tyres.co.uk."""

    name = 'event-tyres.co.uk'
    allowed_domains = ['event-tyres.co.uk']
    website_url = 'http://www.event-tyres.co.uk/'
    postal_code = 'WA5 7ZB'
    price_discount = False  # extract multiple tyre discount price?

    def __init__(self, *args, **kwargs):
        """Load tyre sizes and manufacturer marks from the bundled CSVs."""
        super(EvenTyresSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        self.tyre_sizes = []
        self.all_man_marks = {}
        self.manually_matched = []
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                new_row = row.copy()
                self.tyre_sizes.append(new_row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.errors = []

    def start_requests(self):
        """Open the site once per tyre size, each in its own cookiejar."""
        for i, row in enumerate(self.tyre_sizes):
            yield Request(self.website_url,
                          callback=self.next_search,
                          meta={
                              'row': row,
                              'cookiejar': str(i)
                          },
                          dont_filter=True)

    def next_search(self, response):
        """Submit the search form with the CSRF token from the page."""
        form_token = response.xpath(
            '//input[@id="search_form__token"]/@value').extract()[0]
        row = response.meta['row']
        params = {
            'search_form[width]': row['Width'],
            'search_form[profile]': row['Aspect Ratio'],
            'search_form[size]': row['Rim'],
            'search_form[postcode]': self.postal_code,
            'search_form[_token]': form_token,
            'search_form[search]': '',
        }
        r = FormRequest(url=self.website_url,
                        meta={'cookiejar': response.meta['cookiejar']},
                        formdata=params)
        yield r

    def parse(self, response):
        """Walk result pages and yield one product per listed tyre."""
        pages = set(
            response.xpath(
                '//*[contains(@class, "pagination__item")]/a[not(contains(@class, "pagination__current"))]/@href'
            ).extract())
        for page_url in pages:
            yield Request(response.urljoin(page_url), meta=response.meta)
        products = response.xpath(
            '//article[@itemtype="http://schema.org/Product"]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el.xpath(
                './/*[@itemprop="brand"]//*[@itemprop="name"]/text()').extract(
                )[0].strip()
            if brand.upper() in brands_substitute:
                brand = brands_substitute[brand.upper()]
            full_name = product_el.xpath(
                './/*[contains(@class, "product__title") and @itemprop="name"]/text()'
            ).extract()[0]
            try:
                # The title is "<size> <brand> <pattern>"; splitting on the
                # brand separates size from pattern name.
                tyre_size, name = re.split(brand, full_name, flags=re.I)
            except ValueError:
                self.log(
                    "[[TESTING]] Can not split tyre '%s' with brand '%s'" %
                    (full_name, brand))
                continue
            # tyre_size, name = full_name.split(brand)
            loader.add_value('name', name)
            winter_tyre = product_el.xpath(
                './/*[@class="product__info"]//*[@data-icon="S" and contains(text(), "Winter")]'
            )
            if not winter_tyre:
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                identifier = self.get_identifier(product_el)
                out_of_stock = product_el.xpath(
                    './/*[@itemprop="availability" and contains(@content, "Out")]'
                )
                if out_of_stock:
                    loader.add_value('stock', 0)
                loader.add_value('url', response.url)
                image_url = product_el.xpath(
                    './/img[@itemprop="image"]/@src').extract()
                if image_url:
                    loader.add_value('image_url',
                                     response.urljoin(image_url[0]))
                loader.add_value('identifier', identifier)
                price = product_el.xpath('@data-price').extract()[0]
                loader.add_value('price', price)
                metadata = MicheldeverMeta()
                res = parse_pattern(tyre_size)
                if not res:
                    continue
                width, ratio, rim, load_rating, speed_rating = res
                metadata['aspect_ratio'] = ratio
                metadata['rim'] = rim
                metadata['speed_rating'] = speed_rating
                metadata['load_rating'] = load_rating
                metadata['width'] = width
                metadata['fitting_method'] = 'Fitted'
                metadata['alternative_speed_rating'] = ''
                xl = bool(
                    product_el.xpath(
                        './/*[@class="product__info"]//*[@data-icon="XL"]'))
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat_found = is_run_flat(full_name)
                run_flat = bool(
                    product_el.xpath(
                        './/*[@class="product__info"]//*[@data-icon="RF"]'))
                if not run_flat:
                    run_flat = ' RFT' in name
                metadata[
                    'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
                man_code = self._get_manufacturer_code(full_name)
                metadata['manufacturer_mark'] = man_code
                metadata['full_tyre_size'] = '/'.join(
                    (metadata['width'], metadata['aspect_ratio'],
                     metadata['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                try:
                    fuel, grip, noise = product_el.xpath(
                        './/li[contains(@class, "product__meta-item--")]/text()'
                    ).extract()
                except ValueError:
                    # BUGFIX: was a bare `except:`; only the 3-way unpack
                    # is expected to fail here (wrong number of items).
                    fuel, grip, noise = ('', '', '')
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise
                product = loader.load_item()
                # The website is defaulting to 2 tyres with a discount of £10
                if product.get('price') and (not self.price_discount):
                    product['price'] += Decimal('10')
                product['metadata'] = metadata
                if not is_product_correct(product):
                    continue
                product['metadata'][
                    'mts_stock_code'] = self.find_mts_stock_code(product)
                yield product

    # Please don't remove this method. This method is overridden by the children.
    def find_mts_stock_code(self, product):
        return find_mts_stock_code(product, spider_name=self.name,
                                   log=self.log)

    # Please don't remove this method. This method is overridden by the children.
    def get_identifier(self, selector):
        return selector.xpath('@data-product').extract()[0]

    def _get_manufacturer_code(self, name):
        """Return the mark whose code appears as a word in the name."""
        name = name.upper()
        for code, manufacturer_mark in self.all_man_marks.items():
            if code not in name:
                continue
            if code in name.split(' ') or code == '*':
                return manufacturer_mark
        return ''

    def match_name(self, search_name, new_item, match_threshold=90,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class BlackcirclesSpider(Spider):
    """Scrapes fitted tyre prices from blackcircles.com search results."""

    name = 'blackcircles.com'
    allowed_domains = ['blackcircles.com']
    start_urls = ('http://www.blackcircles.com', )
    tyre_sizes = []
    errors = []
    seen_ids = set()

    def __init__(self, *args, **kwargs):
        """Load the MTS stock-code rows used to drive the searches."""
        super(BlackcirclesSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)

    def start_requests(self):
        """Search once per distinct (profile, rim, width) combination."""
        search_seen = set()
        for row in self.tyre_sizes:
            formdata = {
                'profile': row['Aspect Ratio'],
                'rim': row['Rim'],
                'speed': 'Any',
                'width': row['Width'],
                'displayall': '999',
                'delivery': '0',
            }
            search_key = '{}:{}:{}'.format(row['Aspect Ratio'], row['Rim'],
                                           row['Width'])
            if search_key not in search_seen:
                yield FormRequest(
                    'http://www.blackcircles.com/order/tyres/search',
                    dont_filter=True,
                    formdata=formdata,
                    meta={'row': row},
                    callback=self.parse)
                search_seen.add(search_key)
            else:
                self.log('Duplicate search: {}'.format(search_key))

    def parse(self, response):
        """Parse the embedded JsonObject payload; yield cheapest variants."""
        row = response.meta['row']
        json_data = None
        # The result set is embedded in the page as "JsonObject = {...};".
        for line in response.body.split('\n'):
            if "JsonObject = " in line:
                json_data = json.loads(
                    line.replace('JsonObject = ', '').replace('; \r', ''))
        if json_data is None:
            # BUGFIX: previously fell through and crashed with a TypeError
            # on json_data['Rest'] when the payload was missing.
            self.log('No JsonObject found in search response: {}'.format(
                response.url))
            self.errors.append('No JsonObject found in search response: {}'.format(
                response.url))
            return
        products = json_data['Rest'] + json_data['Deals']
        collected_products = []
        self.log('Results found {} {}'.format(len(products), response.meta))
        for product_info in products:
            # skip winter tyres
            if product_info['WinterTyre']:
                continue
            loader = ProductLoader(item=Product(), selector=product_info)
            loader.add_value('name', product_info['ModelName'])
            brand = product_info['Manufacturer']
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_info['PrimaryId']
            fitting_method = 'Fitted'
            if str(identifier) + '-' + fitting_method in self.seen_ids:
                continue
            url = '/catalogue' + product_info[
                'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
            loader.add_value('url', response.urljoin(url))
            image_url = product_info.get('ModelImageLarge')
            if not image_url:
                image_url = product_info.get('ModelImage')
            if image_url:
                # The field holds an <img> snippet; pull out the src value.
                image_url = image_url.split('src="')[-1].split('"')[0]
                loader.add_value('image_url', response.urljoin(image_url))
            spec = product_info['SpecificationName']
            metadata = MicheldeverMeta()
            # metadata['mts_stock_code'] = row['MTS Stockcode']
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = spec.split()[-1]
            metadata['width'] = row['Width']
            load_rating = product_info['LoadRatingName']
            metadata['load_rating'] = load_rating
            metadata['alternative_speed_rating'] = ''
            xl = product_info['Reinforced']
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat_found = is_run_flat(product_info['ModelName'])
            run_flat = product_info['RunFlat']
            metadata[
                'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
            manufacturer_mark = product_info['Variant']
            if manufacturer_mark:
                manufacturer_mark = manufacturer_mark.split()[0].strip()
            full_tyre_size = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            # MOE Exception for this product
            if manufacturer_mark and 'MO EXTENDED' in product_info['Variant'].upper()\
                    and product_info['ModelName'] == 'Potenza S001' and full_tyre_size == '245/40/18/97/Y':
                metadata['manufacturer_mark'] = 'MOE'
            else:
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark) if manufacturer_mark else ''
            metadata['full_tyre_size'] = full_tyre_size
            try:
                metadata['fuel'] = product_info['TyreLabelFuel']['Score']
            except Exception:
                metadata['fuel'] = ''
            try:
                metadata['grip'] = product_info['TyreLabelWet']['Score']
            except Exception:
                metadata['grip'] = ''
            try:
                metadata['noise'] = product_info['TyreLabelNoise'][
                    'NoiseLevel']
            except Exception:
                metadata['noise'] = ''
            product = loader.load_item()
            product['metadata'] = metadata
            product['price'] = product_info['FullyFittedPrice']
            fitting_method = 'Fitted'
            product['identifier'] = str(identifier) + '-' + fitting_method
            product['metadata']['fitting_method'] = fitting_method
            t1 = time.time()
            if not is_product_correct(product):
                self.log('Search: {}'.format(str(response.meta)))
                self.seen_ids.add(str(identifier) + '-' + fitting_method)
                self.log('PRODUCT IS NOT CORRECT => %r' % product)
                continue
            t2 = time.time()
            self.log('Time taken by product correct: {}'.format(t2 - t1))
            t1 = time.time()
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            t2 = time.time()
            self.log('Time taken by mts stock: {}'.format(t2 - t1))
            collected_products.append(product)
        # Keep only the cheapest product per (brand, name, fitting, size,
        # xl, run-flat, mark) combination.
        min_price_products = {}
        for product in collected_products:
            key = "%s-%s-%s-%s-%s-%s-%s" % (
                product['brand'], product['name'],
                product['metadata']['fitting_method'],
                product['metadata']['full_tyre_size'],
                product['metadata']['xl'], product['metadata']['run_flat'],
                product['metadata']['manufacturer_mark'])
            if key in min_price_products:
                if product['price'] < min_price_products[key]['price']:
                    min_price_products[key] = product
            else:
                min_price_products[key] = product
        for product in min_price_products.values():
            self.seen_ids.add(product['identifier'])
            yield product

    def match_name(self, search_name, new_item, match_threshold=90,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class KwikFitSpider(BaseSpider):
    """Scrapes fitted tyre prices from kwik-fit.com searches."""

    name = 'kwik-fit.com_test'
    allowed_domains = ['kwik-fit.com']
    start_urls = ('http://www.kwik-fit.com',)
    tyre_sizes = []
    download_delay = 0.1

    def __init__(self, *args, **kwargs):
        """Load the MTS stock-code rows used to drive the searches."""
        super(KwikFitSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)

    @staticmethod
    def _search_formdata(search):
        """Build the tyre-search form payload for one size string.

        Extracted helper: the original built this same 11-key dict twice.
        """
        return {
            'dts': search,
            'sop': 'TyreSize',
            'ssq': '3',
            'tsf': search,
            'tsr': search,
            'MobileQuote': 'false',
            'ShowSummerTyres': 'true',
            'ShowTyresForBookOnline': 'true',
            'ShowTyresForQuotation': 'true',
            'ShowWinterTyres': 'true',
            'Stage': '2',
        }

    def start_requests(self):
        """Search every size; repeat with the alternative speed rating."""
        for row in self.tyre_sizes:
            search = row['Width'] + '/' + row['Aspect Ratio'] + \
                row['Speed rating'] + row['Rim']
            yield FormRequest('http://www.kwik-fit.com/tyre-search.asp',
                              dont_filter=True,
                              formdata=self._search_formdata(search),
                              meta={'row': row, 'search': search},
                              callback=self.parse)
            if row['Alt Speed']:
                search = row['Width'] + '/' + row['Aspect Ratio'] + \
                    row['Alt Speed'] + row['Rim']
                yield FormRequest('http://www.kwik-fit.com/tyre-search.asp',
                                  dont_filter=True,
                                  formdata=self._search_formdata(search),
                                  meta={'row': row, 'search': search},
                                  callback=self.parse)

    def parse(self, response):
        """Yield one product per non-winter tyre in the result list."""
        hxs = HtmlXPathSelector(response)
        products = hxs.select('//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', 'div//div[@class="manufacturerText"]/p/strong/text()')
            brand = ''.join(product.select('div//div[@class="manufacturerImage"]/img/@alt').extract()).split(' - ')[0]
            winter_tyre = product.select('div//img[@alt="Winter Tyre"]')
            if not winter_tyre:
                loader.add_value('brand', unify_brand(brand))
                loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
                identifier = product.select('div//div[@class="pricingAddToOrder clearfix"]/input/@value').extract()[0]
                loader.add_value('url', '')
                image_url = product.select('div[@class="image"]/img/@src').extract()
                if image_url:
                    loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))
                loader.add_value('identifier', identifier)
                price = product.select('div//div[contains(@class, "pricingSelection")]//a/strong/text()').extract()
                # BUGFIX: the decimal point must be escaped; the original
                # r"\d+.\d+" let '.' match any character.
                price = re.findall(r"\d+\.\d+", price[0]) if price else '0.0'
                loader.add_value('price', price)
                tyresize_text = product.select('.//div[contains(@class, "manufacturerText")]/p/span/text()').extract()[0].strip()
                width, aspect, speed_rating, rim = re.search(r'tyre size (\d+)\/(\d+)(\w{1})(\d+)', tyresize_text, re.I).groups()
                fitting_method = 'Fitted'
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = aspect
                metadata['rim'] = rim
                metadata['speed_rating'] = speed_rating
                metadata['width'] = width
                metadata['fitting_method'] = fitting_method
                load_rating = product.select('div//li/a[@rel="load-index-description"]/text()').extract()
                metadata['load_rating'] = load_rating[0].split(': ')[-1] if load_rating else ''
                metadata['alternative_speed_rating'] = ''
                xl = product.select('div//img[@title="Reinforced"]/@title').extract()
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat = product.select('div//img[@title="Run Flat"]').extract()
                metadata['run_flat'] = 'Yes' if run_flat else 'No'
                manufacturer_mark = product.select('div//img[contains(@title, "Homologated for fitment to certai")]/@title').extract()
                manufacturer_mark = manufacturer_mark[0].replace('Homologated for fitment to certain ', '').replace(' cars.', '') if manufacturer_mark else ''
                metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''
                metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                       metadata['aspect_ratio'],
                                                       metadata['rim'],
                                                       metadata['load_rating'],
                                                       metadata['speed_rating']))
                # metadata['alternative_speed_rating']))
                # Renamed from 'product' to avoid shadowing the loop variable.
                item = loader.load_item()
                item['metadata'] = metadata
                if not is_product_correct(item):
                    continue
                item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)
                new_speed_rating = get_speed_rating(item)
                new_alt_speed = get_alt_speed(item)
                item['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                    item['metadata']['speed_rating'] if item['metadata']['speed_rating'] != new_speed_rating else ''
                item['metadata']['speed_rating'] = new_speed_rating
                yield item

    def match_name(self, search_name, new_item, match_threshold=80,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class TyreGiantSpider(BaseSpider):
    """Scrapes fitted tyre prices from tyregiant.com.

    Searches are issued one at a time: each search is driven from the
    mtsstockcodes.csv size rows, and the spider_idle signal is used to
    restart parsing until every row has been searched (sequential crawl).
    """
    name = 'tyregiant.com_test'
    allowed_domains = ['tyregiant.com']
    start_urls = ('http://www.tyregiant.com/', )
    # NOTE(review): these are class-level mutable attributes, shared across
    # instances; populated per-instance in __init__.
    tyre_sizes = []
    brands = []
    manually_matched = []
    all_man_marks = {}
    download_delay = 0.1

    def __init__(self, *args, **kwargs):
        """Load tyre size rows and manufacturer-mark codes from CSV fixtures."""
        super(TyreGiantSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.brands = [row['Brand'] for row in self.tyre_sizes]
        # Keys of searches already issued, to avoid repeating a size/speed combo.
        self.search_history = set()
        self.finished = False
        # Re-enter parse() on idle until next_search() exhausts all rows.
        dispatcher.connect(self.spider_idle, signals.spider_idle)

    def _get_history_key(self, search_params):
        """Build the dedup key for a search-parameter dict."""
        key = "%(width)s-%(rim)s-%(aspect_ratio)s-%(speed_rating)s" % search_params
        return key

    def check_in_history(self, search_params):
        """Return True when this width/rim/aspect/speed combo was already searched."""
        if self._get_history_key(search_params) in self.search_history:
            return True
        return False

    def add_to_history(self, search_params):
        """Record a search-parameter combo as done."""
        self.search_history.add(self._get_history_key(search_params))

    def spider_idle(self, spider):
        """Keep the spider alive, scheduling the next search, until finished."""
        if not self.finished:
            request = Request(self.start_urls[0], dont_filter=True, callback=self.parse)
            self._crawler.engine.crawl(request, self)
            raise DontCloseSpider

    def parse(self, response):
        # Entry point simply delegates to the search generator.
        for r in self.next_search():
            yield r

    def next_search(self):
        """Yield at most one search Request for the first not-yet-searched combo.

        Tries the primary speed rating then the alternative one for each CSV
        row; when every combo has been searched, flags the crawl as finished.
        """
        request_sent = False
        for i, row in enumerate(self.tyre_sizes, 1):
            for speed_rating in [row['Speed rating'], row['Alt Speed']]:
                if not speed_rating:
                    continue
                search_params = {
                    'width': row['Width'],
                    'aspect_ratio': row['Aspect Ratio'],
                    'speed_rating': speed_rating,
                    'rim': row['Rim']
                }
                if self.check_in_history(search_params):
                    continue
                self.log("Checking row: %s" % str({
                    'width': row['Width'],
                    'aspect_ratio': row['Aspect Ratio'],
                    'speed_rating': row['Speed rating'],
                    'rim': row['Rim']
                }))
                self.add_to_history(search_params)
                url = 'http://www.tyregiant.com/%(width)s-%(aspect_ratio)s-%(rim)s?speed=%(speed_rating)s' % \
                    search_params
                yield Request(url,
                              dont_filter=True,
                              meta={'search_params': search_params},
                              callback=self.parse_search)
                request_sent = True
                break
            if request_sent:
                break
        else:
            # for/else: no request was sent -> every combo is in history.
            self.finished = True
            return

    def parse_search(self, response):
        """After a size search, fetch the first AJAX page of tyre results."""
        meta = response.meta
        url = 'http://www.tyregiant.com/update-tyres/1'
        meta['page'] = 1
        yield Request(url, dont_filter=True, callback=self.parse_products, meta=meta)

    def parse_products(self, response):
        """Parse one JSON-wrapped HTML page of tyres; paginate while non-empty."""
        html_response = json.loads(response.body)['display_tyres']
        hxs = HtmlXPathSelector(text=html_response)
        search_params = response.meta['search_params']
        products = hxs.select('//div[contains(@class, "tyre_container")]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el.select(
                './/form/span[@class="tyre_brand_text"]/text()').extract()
            brand = brand[0] if brand else ''
            # NOTE(review): winter check is made against the whole page (hxs),
            # not product_el — presumably winter searches render differently;
            # confirm against live markup.
            winter_tyre = hxs.select(
                '/div/div/div[@class="winter_img"]').extract()
            if not winter_tyre:
                # Normalise brand capitalisation against the known brand list.
                for tyre_brand in self.brands:
                    if tyre_brand.upper() == brand.strip().upper():
                        brand = tyre_brand
                full_name = product_el.select(
                    './/form/span[@class="tyre_brand_text"]/text()').extract()[-1]
                loader.add_value('name', full_name)
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                identifier = product_el.select(
                    './/input[@name="tyre"]/@value').extract()
                loader.add_value('identifier', identifier)
                loader.add_value('url', 'http://www.tyregiant.com')
                image_url = product_el.select(
                    './/img[@class="tyre_image"]/@src').extract()
                if image_url:
                    loader.add_value(
                        'image_url', urljoin(get_base_url(response), image_url[0]))
                price = product_el.select(
                    './/*[@class="tyre_price"]/span/text()').extract()
                if not price:
                    # Missing price means out of stock.
                    loader.add_value('stock', 0)
                loader.add_value('price', price)
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = search_params['aspect_ratio']
                metadata['rim'] = search_params['rim']
                tyre_details = product_el.select(
                    './/form/p[@class="tyre_details"]/text()').extract()[0]
                # e.g. " 91V " -> load rating "91", speed rating "V".
                speed = re.search('(\s\d+\w+\s)', tyre_details)
                load_rating = speed.group().strip()[:-1] if speed else ''
                speed_rating = speed.group().strip()[-1] if speed else ''
                metadata['speed_rating'] = speed_rating
                metadata['load_rating'] = load_rating
                metadata['width'] = search_params['width']
                metadata['fitting_method'] = 'Fitted'
                metadata['alternative_speed_rating'] = ''
                xl = product_el.select(
                    './/img[@class="xl_img"]/@src').extract()
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat = product_el.select(
                    './/img[@class="rf_img"]/@src').extract()
                metadata['run_flat'] = 'Yes' if run_flat else 'No'
                metadata['manufacturer_mark'] = self._get_manufacturer_code(
                    full_name)
                metadata['full_tyre_size'] = '/'.join(
                    (search_params['width'], search_params['aspect_ratio'],
                     search_params['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                # metadata['alternative_speed_rating']))
                product = loader.load_item()
                product['metadata'] = metadata
                if not is_product_correct(product):
                    continue
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product, spider_name=self.name, log=self.log)
                # Reconcile speed rating with the canonical stock-code data;
                # keep the scraped rating as the alternative when they differ.
                new_speed_rating = get_speed_rating(product)
                new_alt_speed = get_alt_speed(product)
                product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                    product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
                product['metadata']['speed_rating'] = new_speed_rating
                yield product
        if products:
            # Non-empty page: request the next AJAX page.
            meta = response.meta
            next_page = meta['page'] + 1
            next_url = 'http://www.tyregiant.com/update-tyres/%s' % str(
                next_page)
            meta['page'] = next_page
            yield Request(next_url,
                          dont_filter=True,
                          callback=self.parse_products,
                          meta=meta)

    def _get_manufacturer_code(self, name):
        """Return the manufacturer mark whose code appears as a word in name.

        '*' is special-cased because it never survives a whitespace split.
        """
        name = name.upper()
        for code, manufacturer_mark in self.all_man_marks.items():
            if code not in name:
                continue
            if code in name.split(' ') or code == '*':
                return manufacturer_mark
        return ''

    def match_name(self, search_name, new_item, match_threshold=90, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class TyreDriveSpider(BaseSpider):
    """Scrapes delivered tyre prices from tyredrive.co.uk.

    One search request per mtsstockcodes.csv row; result pages link to
    individual product pages which are parsed for price and tyre metadata.
    """
    name = 'micheldever-tyredrive.co.uk_test'
    allowed_domains = ['tyredrive.co.uk']
    start_urls = ('http://www.tyredrive.co.uk', )
    # Class-level mutable attributes, populated per-instance in __init__.
    tyre_sizes = []
    all_man_marks = {}

    def __init__(self, *args, **kwargs):
        """Load tyre size rows and manufacturer-mark codes from CSV fixtures."""
        super(TyreDriveSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        # Collected extraction errors (also logged as they occur).
        self.errors = []

    def start_requests(self):
        """Issue one site search per CSV size row (speed/brand left unfiltered)."""
        for row in self.tyre_sizes:
            search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                str(row['Speed rating']) + str(row['Rim'])
            parameters = {
                'section': row['Width'],
                'profile': row['Aspect Ratio'],
                'rim': row['Rim'],
                'speed': '0',
                'tyre_brand': '0',
                'submit': 'SEARCH'
            }
            yield Request('http://www.tyredrive.co.uk/search.php?' +
                          urllib.urlencode(parameters),
                          meta={
                              'row': row,
                              'search': search
                          },
                          callback=self.parse)

    def parse(self, response):
        """Follow pagination links and each product link on a result page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        row = response.meta['row']
        products = hxs.select('//td[@class="tyreinfo"]/a/@href').extract()
        log.msg('Products found: {!s} items [{}]'.format(
            len(products), response.url))
        if not products:
            log.msg('No products: [{}]'.format(response.url))
        pages = hxs.select('//a[contains(@href,"pagpage")]/@href').extract()
        for page in pages:
            yield Request(urljoin(base_url, page), meta=response.meta)
        for url in products:
            yield Request(urljoin(base_url, url),
                          callback=self.parse_product,
                          meta=response.meta)

    def parse_product(self, response):
        """Extract one product and its tyre metadata from a product page."""
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        # The full name of the tyre (name variable) is used to extract metadata
        # (i.e. run flat, xl); the pattern should be set as the product's name.
        name = hxs.select('//td[@class="tread"]/text()').extract()
        if not name:
            msg = "No name found on page: %s" % response.url
            self.errors.append(msg)
            self.log("[ERROR] %s" % msg)
            return
        loader.add_value('name', name[0])
        brand = hxs.select(
            '//table[@class="single searchresults"]//td[@class="tyreinfo"]/b/text()'
        ).extract()[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(brand))
        fitting_method = 'Delivered'
        loader.add_value('url', response.url)
        out_of_stock = hxs.select(
            '//table[@class="single searchresults"]//span[@class="outofstock"]'
        )
        if out_of_stock:
            loader.add_value('stock', 0)
        image_url = hxs.select(
            '//table[@class="single searchresults"]//td[@class="logo-pic"]/img/@src'
        ).extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        # NOTE(review): [0] on these selectors raises IndexError when the
        # element is missing — page-format changes fail loudly here.
        identifier = hxs.select(
            '//table[@class="single searchresults"]//form/input[@name="pid"]/@value'
        )[0].extract()
        loader.add_value('identifier', identifier)
        price = hxs.select(
            '//table[@class="single searchresults"]//td[@class="netprice"]/text()'
        )[0].extract()
        loader.add_value('price', price)
        # Re-bind name to the tyre-size pattern string (e.g. "205/55 R16 91V").
        name = hxs.select(
            '//table[@class="single searchresults"]//td[@class="tyreinfo"]/span/text()'
        )[0].extract()
        data = parse_pattern(name)
        if not data:
            log.msg('ERROR parsing "{}" [{}]'.format(name, response.url))
            self.errors.append('ERROR parsing "{}" [{}]'.format(
                name, response.url))
            return
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''
        xl = 'XL' in name
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = 'rflat' in name.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        if '*' in name:
            # '*' (BMW mark) never survives a whitespace split, so special-case it.
            manufacturer_mark = '*'
        else:
            manufacturer_mark = [
                mark for mark in self.all_man_marks.keys()
                if mark in name.split(' ')
            ]
            # NOTE: falsy placeholder is [] rather than '' — only ever used
            # through the truthiness guard below.
            manufacturer_mark = manufacturer_mark[0].strip(
            ) if manufacturer_mark else []
        metadata['manufacturer_mark'] = self.all_man_marks.get(manufacturer_mark, '') if manufacturer_mark \
            else ''
        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))
        # metadata['alternative_speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            return
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        # Reconcile speed rating with canonical data; keep the scraped rating
        # as the alternative when they differ.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        yield product

    def match_name(self, search_name, new_item, match_threshold=80, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class LoveTyresSpider(BaseSpider):
    """Scrapes delivered tyre prices from lovetyres.com.

    Product image URLs are cached in lovetyres_images.csv between runs so a
    product page only has to be fetched when its image is not yet known;
    the cache is rewritten on spider close.
    """
    name = 'lovetyres.com'
    allowed_domains = ['lovetyres.com']
    start_urls = ['http://www.lovetyres.com']
    images_filename = os.path.join(HERE, 'lovetyres_images.csv')

    def __init__(self, *args, **kwargs):
        """Load manufacturer marks and the persisted image-URL cache."""
        super(LoveTyresSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        self.images = {}  # product_url -> image_url cache
        self.all_man_marks = {}
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        if os.path.exists(self.images_filename):
            with open(self.images_filename) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    self.images[row['product_url']] = row['image_url']
        self.errors = []
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Persist the image-URL cache back to CSV."""
        with open(self.images_filename, 'w') as f:
            writer = csv.DictWriter(f, ['product_url', 'image_url'])
            writer.writeheader()
            for product_url, image_url in self.images.items():
                writer.writerow({
                    'product_url': product_url,
                    'image_url': image_url
                })

    def start_requests(self):
        """Build one deduplicated search request per width/aspect/rim combo."""
        requests = []
        urls = set()
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                    str(row['Speed rating']) + str(row['Rim'])
                search_url = 'http://www.lovetyres.com/search/tyres/%(Width)s-%(Aspect Ratio)s-%(Rim)s' % row
                if search_url not in urls:
                    self.log(search_url)
                    urls.add(search_url)
                    requests.append(
                        Request(search_url,
                                meta={'search': search},
                                callback=self.parse))
        self.log('TOTAL SEARCH REQUESTS: %s' % len(requests))
        return requests

    def parse(self, response):
        """Parse a search-results page, yielding products (or image requests)."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select('//tr[contains(@class,"tyre-search-row")]')
        # NOTE(review): next_page is hard-coded empty, so this branch is dead —
        # pagination appears intentionally disabled.
        next_page = []
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]),
                          meta=response.meta)
        not_found_count = 0
        for product in products:
            url = product.select('.//td/b/a/@href')[0].extract()
            winter_tyre = product.select('.//td/b/a/text()')[0].extract()
            winter_tyre = 'winter' in winter_tyre.lower()
            if not winter_tyre:
                # Brand is derived from the brand-logo image path.
                brand = product.select('.//a/img/@src')[0].extract()
                brand = re.search('/public/brands/(.*?)(-tyres)?\.',
                                  brand).group(1).replace('-', ' ').title()
                product_name = product.select('.//td/b/a/text()')[0].extract()
                product_name = re.sub(brand, '', product_name).strip()
                fitting_method = 'Delivered'
                identifier = product.select(
                    './/input[@name="item_id"]/@value').extract()
                if not identifier:
                    # Out-of-stock rows expose the id via the stock-alert link.
                    identifier = product.select('.//a/@href').re(
                        'email_me_stock/(.*)')
                    if not identifier:
                        continue
                try:
                    # EU tyre-label values; any shape mismatch leaves them blank.
                    fuel, grip, noise = map(
                        unicode.strip,
                        product.select(
                            './/img[contains(@alt, "Tyre Label")]/following-sibling::text()'
                        ).extract())
                except:
                    fuel = ''
                    grip = ''
                    noise = ''
                price = product.select("td[3]/b/text()").extract()
                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('identifier', identifier[0])
                loader.add_value('name', product_name)
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                loader.add_value('url', url)
                if price:
                    loader.add_value('price', price[0])
                else:
                    loader.add_value('price', '0.00')
                    loader.add_value('stock', 0)
                pattern_name = product.select('.//i/text()').extract()
                if not pattern_name:
                    continue
                pattern_name = pattern_name[0]
                # e.g. "205/55 R16 V (91)" -> width/aspect/rim/speed/load.
                data = re.search(
                    '(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                    pattern_name)
                if data:
                    data = data.groupdict()
                else:
                    msg = 'ERROR parsing "{}" [{}]'.format(
                        pattern_name, response.url)
                    self.log(msg)
                    continue
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = data['Aspect_Ratio']
                metadata['rim'] = data['Rim']
                metadata['speed_rating'] = data['Speed_Rating'].upper()
                metadata['width'] = data['Width']
                metadata['fitting_method'] = fitting_method
                metadata['load_rating'] = data['Load_Rating'] or ''
                metadata['alternative_speed_rating'] = ''
                xl = 'XL' in pattern_name
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat_found = is_run_flat(pattern_name)
                run_flat = 'run flat' in pattern_name.lower(
                ) or 'runflat' in pattern_name.lower() or run_flat_found
                metadata['run_flat'] = 'Yes' if run_flat else 'No'
                manufacturer_mark = [
                    mark for mark in self.all_man_marks.keys()
                    if mark in pattern_name.split(' ')
                ]
                # Falsy placeholder is [] rather than ''; guarded below.
                manufacturer_mark = manufacturer_mark[0].strip(
                ) if manufacturer_mark else []
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark) if manufacturer_mark else ''
                metadata['full_tyre_size'] = '/'.join(
                    (metadata['width'], metadata['aspect_ratio'],
                     metadata['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise
                product = loader.load_item()
                product['metadata'] = metadata
                if not is_product_correct(product):
                    not_found_count += 1
                    self.log('%s - PRODUCT IS NOT CORRECT: %r' %
                             (not_found_count, product))
                    continue
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product, spider_name=self.name, log=self.log)
                if product['url'] in self.images:
                    # Image known from the cache: no extra page fetch needed.
                    product['image_url'] = self.images[product['url']]
                    yield product
                else:
                    yield Request(product['url'],
                                  callback=self.parse_image,
                                  meta={'product': product},
                                  dont_filter=True)

    def parse_image(self, response):
        """Fetch a product page only to pick up its image URL, then yield."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        loader = ProductLoader(item=response.meta['product'],
                               selector=response)
        image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        product = loader.load_item()
        if 'image_url' in product and product['image_url'].strip():
            self.images[product['url']] = product['image_url']
        yield product

    def match_name(self, search_name, new_item, match_threshold=80, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class PointSSpider(Spider):
    """Scrapes fitted tyre prices from point-s.co.uk.

    One (deduplicated) search URL per mtsstockcodes.csv row, optionally a
    second one for the alternative speed rating; result pages are paginated
    via a data-url "next" link.
    """
    name = 'micheldever-point-s.co.uk'
    allowed_domains = ['point-s.co.uk']
    start_urls = ('http://www.point-s.co.uk/', )
    # Class-level mutable attributes, populated per-instance in __init__.
    tyre_sizes = []
    brands = []
    all_man_marks = {}

    def __init__(self, *args, **kwargs):
        """Load tyre size rows and manufacturer-mark codes from CSV fixtures."""
        super(PointSSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.brands = [row['Brand'] for row in self.tyre_sizes]
        # Row keys already searched (deduplication across CSV rows).
        self.processed_rows = {}

    def start_requests(self):
        """Yield search requests per unique size row (plus Alt Speed variant)."""
        for row in self.tyre_sizes:
            if self.check_row_is_processed(row):
                continue
            self.add_row_to_history(row)
            meta = {'row': row}
            xl = ''
            if row['XL'] == 'XL':
                xl = 'Y'
                meta['xl'] = True
            run_flat = ''
            if row['Run Flat'] == 'RF':
                run_flat = 'Y'
                meta['run_flat'] = True
            url = 'http://www.point-s.co.uk/tyres?s=&width=' + row[
                'Width'] + '&profile=' + row['Aspect Ratio'] + '&size=' + row[
                    'Rim'] + '&speed=' + row[
                        'Speed rating'] + '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl
            yield Request(url, dont_filter=True, meta=meta)
            if row['Alt Speed']:
                url = 'http://www.point-s.co.uk/tyres?s=&width=' + row[
                    'Width'] + '&profile=' + row[
                        'Aspect Ratio'] + '&size=' + row['Rim'] + '&speed=' + row[
                            'Alt Speed'] + '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl
                yield Request(url, dont_filter=True, meta=meta)

    def get_row_key(self, row):
        """Tuple of the row fields that identify a unique search."""
        fields_to_save = [
            'Width', 'Rim', 'Aspect Ratio', 'Speed rating', 'Alt Speed', 'XL',
            'Run Flat'
        ]
        return tuple([row[x] for x in fields_to_save])

    def check_row_is_processed(self, row):
        """Return True if an identical row was already searched."""
        key = self.get_row_key(row)
        if self.processed_rows.get(key):
            return True
        return False

    def add_row_to_history(self, row):
        """Mark a row as searched."""
        key = self.get_row_key(row)
        self.processed_rows[key] = True

    def parse(self, response):
        """Parse a results page (recommended + regular products) and paginate."""
        row = response.meta['row']
        products = response.xpath(
            '//div[contains(@class, "product-recommended")]')
        products += response.xpath(
            '//div[@class="product-section"]/div[contains(@class, "product")]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el.xpath(
                './/input[@name="brand"]/@value').extract()
            brand = brand[0] if brand else ''
            # Normalise brand capitalisation against the known brand list.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand
            full_name = ''.join(product_el.xpath('.//h2/text()').extract())
            if not full_name:
                continue
            # Heading is "<tyre code> <brand> <pattern name>"; split on brand.
            full_name_splt = re.split(brand, full_name, flags=re.I)
            tyre_code = full_name_splt[0]
            name = ' '.join(full_name_splt[1:]).strip()
            tyre_code = tyre_code.strip()
            name = name.strip()
            loader.add_value('name', name)
            # loader.add_value('name', full_name.split(brand)[-1])
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.xpath(
                './/input[@name="prodCode"]/@value').extract()
            if identifier:
                identifier = identifier[0]
            else:
                # NOTE(review): `return` (not continue) — a single product
                # without identifier abandons the whole page. Confirm intended.
                self.log('Product without identifier')
                search_params = '/'.join([
                    row['Aspect Ratio'], row['Rim'], row['Width'],
                    row['Alt Speed']
                ])
                self.log('Search parameters: ' + search_params)
                return
            loader.add_value('url', response.url)
            image_url = product_el.xpath(
                './/div[contains(@class, "product-im")]/img/@src').extract()
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url[0]))
            loader.add_value('identifier', identifier)
            price = ''.join(
                product_el.xpath('.//*[@class="price"]//text()').re(
                    r'[\d\.,]+'))
            if not price:
                continue
            loader.add_value('price', price)
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            # e.g. " 91V " in the heading -> load "91", speed "V".
            speed = re.search('(\s\d+\w+\s)', full_name)
            speed_rating = speed.group().strip()[-1] if speed else ''
            load_rating = speed.group().strip()[:-1] if speed else ''
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in full_name.upper() else 'No'
            run_flat_found = is_run_flat(full_name)
            metadata['run_flat'] = 'Yes' if 'RUNFLAT' in full_name.upper(
            ) or run_flat_found else 'No'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)
            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            try:
                # EU tyre-label values; shape mismatch leaves them blank.
                fuel, grip, noise = map(
                    unicode.strip,
                    product_el.xpath(
                        './/div[contains(@class, "feature-image") or contains(@class, "feature-block")]'
                        '//span[@class="icon-text"]/text()').extract())
            except:
                fuel = ''
                grip = ''
                noise = ''
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise
            product = loader.load_item()
            product['metadata'] = metadata
            if not is_product_correct(product):
                continue
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            yield product
        next_page = response.xpath(
            u'//ul[@class="pagination"]//a[contains(text(), ">")]/@data-url'
        ).extract()
        if next_page:
            yield Request(next_page[0], dont_filter=True, meta=response.meta)

    def _get_manufacturer_code(self, name):
        """Return the manufacturer mark whose code appears as a word in name."""
        name = name.upper().strip()
        for code, manufacturer_mark in self.all_man_marks.items():
            if code not in name:
                continue
            if code in map(unicode.strip, name.split(' ')) or code == '*':
                return manufacturer_mark
        return ''

    def match_name(self, search_name, new_item, match_threshold=90, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class KwikFitSpider(BaseSpider):
    """Scrapes fitted tyre prices from kwik-fit.com.

    Searches by URL path segments (width/aspect/rim/speed) per CSV row, with
    a second request for the alternative speed rating where present. Product
    attributes come mostly from data-* attributes on the result tiles.
    """
    name = 'kwik-fit.com'
    allowed_domains = ['kwik-fit.com']
    start_urls = ('http://www.kwik-fit.com', )
    # Class-level mutable attribute, populated per-instance in __init__.
    tyre_sizes = []
    download_delay = 0.1

    def __init__(self, *args, **kwargs):
        """Load tyre size rows from the shared CSV fixture."""
        super(KwikFitSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)

    def start_requests(self):
        """Yield one search per row, plus one for the Alt Speed when present."""
        for row in self.tyre_sizes:
            search = row['Width'] + '/' + row['Aspect Ratio'] + row[
                'Speed rating'] + row['Rim']
            yield Request(
                'https://www.kwik-fit.com/tyres/search/results/%(Width)s/%(Aspect Ratio)s/%(Rim)s/%(Speed rating)s' % row,
                dont_filter=True,
                meta={
                    'row': row,
                    'search': search
                },
                callback=self.parse)
            if row['Alt Speed']:
                search = row['Width'] + '/' + row['Aspect Ratio'] + row[
                    'Alt Speed'] + row['Rim']
                yield Request(
                    'https://www.kwik-fit.com/tyres/search/results/%(Width)s/%(Aspect Ratio)s/%(Rim)s/%(Alt Speed)s' % row,
                    dont_filter=True,
                    meta={
                        'row': row,
                        'search': search
                    },
                    callback=self.parse)

    def parse(self, response):
        """Extract products from the grid-view result tiles (winter excluded)."""
        products = response.xpath(
            '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]'
        )
        for product in products:
            winter_tyre = product.xpath(
                '@data-filter-season').extract()[0] == 'Winter'
            if not winter_tyre:
                name = product.xpath(
                    './/div[contains(@class, "tyre-model text-center")]/text()'
                ).extract()[0]
                brand = product.xpath('@data-filter-brand').extract()[0]
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', brand + ' ' + name)
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                identifier = product.xpath('@data-tyreid').extract()[0]
                loader.add_value('identifier', identifier)
                loader.add_value('url', response.url)
                image_url = product.xpath(
                    './/div[contains(@class, "tyre-image")]//img/@src'
                ).extract()
                if image_url:
                    loader.add_value(
                        'image_url', urljoin(get_base_url(response),
                                             image_url[0]))
                price = product.xpath(
                    './/div[contains(@class, "tyre-pricing-information")]/div/text()'
                ).re(r'[\d,.]+')
                price = price[0] if price else '0.00'
                loader.add_value('price', price)
                tyresize_text = product.xpath(
                    './/div[contains(@class, "tyre-size")]/text()').extract(
                    )[0].strip()
                # Prefer the form with a load index, e.g. "205/55V16 (91)";
                # fall back to the size-only form without a load rating.
                try:
                    width, aspect, speed_rating, rim, load_rating = re.search(
                        r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)', tyresize_text,
                        re.I).groups()
                except:
                    width, aspect, speed_rating, rim = re.search(
                        r'(\d+)\/(\d+)(\w{1})(\d+)', tyresize_text,
                        re.I).groups()
                    load_rating = ''
                fitting_method = 'Fitted'
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = aspect
                metadata['rim'] = rim
                metadata['speed_rating'] = speed_rating
                metadata['width'] = width
                metadata['fitting_method'] = fitting_method
                metadata['load_rating'] = load_rating
                metadata['alternative_speed_rating'] = ''
                xl = product.xpath(
                    '@data-filter-reinforced').extract()[0] == 'Y'
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat_found = is_run_flat(loader.get_output_value('name'))
                run_flat = product.xpath(
                    '@data-filter-runflat').extract()[0] == 'Y'
                metadata[
                    'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
                manufacturer_mark = product.xpath('.//span[contains(@title, "Homologated for fitment to certai")]/@title')\
                    .re(r'Homologated for fitment to certain (.*) cars\.')
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark[0]) if manufacturer_mark else ''
                metadata['full_tyre_size'] = '/'.join(
                    (metadata['width'], metadata['aspect_ratio'],
                     metadata['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                # EU label values; the xpath union preserves document order —
                # presumably r/g/d attribute order matches fuel/grip/noise.
                fuel, grip, noise = product.xpath('@data-filter-tyreefficiencyr'
                                                  '|@data-filter-tyreefficiencyg'
                                                  '|@data-filter-tyreefficiencyd')\
                    .extract()
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise
                product = loader.load_item()
                product['metadata'] = metadata
                if not is_product_correct(product):
                    continue
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product, spider_name=self.name, log=self.log)
                yield product

    def match_name(self, search_name, new_item, match_threshold=80, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class TrovaprezziSpider(BaseSpider):
    """Scrapes trovaprezzi.it price-comparison listings for a product list.

    Matching items are accumulated in self.items during the crawl; when the
    spider goes idle, closing_parse() runs once more and yields only the
    cheapest item per SKU.
    """
    name = u'trovaprezzi.it'
    allowed_domains = [u'trovaprezzi.it']
    start_urls = [
        u'http://www.trovaprezzi.it/prezzi_elettronica-elettricita.aspx'
    ]
    # Class-level mutable accumulator of matched candidate items.
    items = []

    def __init__(self, *args, **kwargs):
        super(TrovaprezziSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        # Trigger the final cheapest-per-SKU pass once the crawl drains.
        dispatcher.connect(self.spider_idle, signals.spider_idle)

    def start_requests(self):
        """Search by brand+model when a model exists, else by product name."""
        with open(os.path.join(HERE, 'product_list.csv')) as f:
            reader = csv.DictReader(cStringIO.StringIO(f.read()))
            for row in reader:
                meta = {
                    'sku': row['model'],
                    'identifier': row['ean'],
                    'brand': row['brand'],
                    'name': row['name']
                }
                if row['model']:
                    search = row['brand'] + '+' + row['model']
                    # Allows parse() to retry by name if the model search is empty.
                    meta['model_search'] = True
                    url = 'http://www.trovaprezzi.it/categoria.aspx?libera=' + search + '&id=-1&prezzomin=&prezzomax='
                else:
                    url = 'http://www.trovaprezzi.it/categoria.aspx?libera=' + row[
                        'name'].replace(' ', '+') + '&id=-1&prezzomin=&prezzomax='
                yield Request(url, meta=meta)

    def spider_idle(self, spider):
        """Schedule the final processing request if any items were collected."""
        if self.items:
            request = Request(self.start_urls[0],
                              dont_filter=True,
                              callback=self.closing_parse)
            self._crawler.engine.crawl(request, self)

    def closing_parse(self, response):
        """Yield the lowest-priced collected item for each SKU."""
        self.log("Processing items after finish")
        items_dict = {}
        items = sorted(self.items, key=lambda x: x['sku'])
        for item in items:
            if item['sku'] in items_dict:
                old_item = items_dict[item['sku']]
                if item['price'] < old_item['price']:
                    items_dict[item['sku']] = item
            else:
                items_dict[item['sku']] = item
        self.items = []
        for sku, item in items_dict.items():
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', item['name'])
            loader.add_value('url', item['url'])
            loader.add_value('price', item['price'])
            loader.add_value('sku', item['sku'])
            loader.add_value('category', item['category'])
            loader.add_value('brand', item['brand'])
            loader.add_value('identifier', item['identifier'])
            loader.add_value('dealer', item['dealer'])
            loader.add_value('image_url', item['image_url'])
            product = loader.load_item()
            yield product

    def parse(self, response):
        """Collect the cheapest fuzzy-matching listing on a category page.

        Also recurses into related category links; falls back to a name
        search when a brand+model search returned no product rows.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta
        relevant_categories = hxs.select(
            '//div[@class="catsMI"]/div/a/@href').extract()
        for category in relevant_categories:
            yield Request(urljoin_rfc(base_url, category), meta=meta)
        products = hxs.select('//table[@id="productlist-table"]/tbody/tr')
        if not products and meta.get('model_search', False):
            # Model search found nothing: retry once with the free-text name.
            url = 'http://www.trovaprezzi.it/categoria.aspx?libera=' + meta[
                'name'].replace(' ', '+') + '&id=-1&prezzomin=&prezzomax='
            meta['model_search'] = False
            yield Request(url, meta=meta)
        else:
            category = hxs.select(
                '//div[@id="divTitle"]/h1/text()').extract()[0]
            # pr tracks the loader with the lowest price among name matches.
            pr = None
            for product in products:
                name = product.select(
                    'td[@class="descCol"]/a/b/text()').extract()[0]
                if self.match_name(meta['name'], name, match_threshold=70):
                    loader = ProductLoader(item=Product(), selector=product)
                    image_url = product.select(
                        'td[@class="imgCol"]/a/img/@src').extract()
                    if image_url:
                        image_url = urljoin_rfc(base_url, image_url[0])
                    else:
                        image_url = ''
                    loader.add_value('image_url', image_url)
                    loader.add_xpath('dealer', 'td[@class="mercCol"]/a/img/@alt')
                    loader.add_xpath('name', 'td[@class="descCol"]/a/b/text()')
                    loader.add_value('category', category)
                    loader.add_value('sku', response.meta.get('sku'))
                    url = product.select(
                        'td[@class="descCol"]/a/@href').extract()[0]
                    loader.add_value('url', urljoin_rfc(base_url, url))
                    # Italian number format: strip thousands '.' then ',' -> '.'.
                    price = product.select('td[@class="prodListPrezzo"]/text()'
                                           ).extract()[0].strip().replace(
                                               '.', '').replace(',', '.')
                    loader.add_value('price', price)
                    shipping_cost = product.select(
                        'td[@class="prodListPrezzo"]/' +
                        'span[@class="deliveryCost nobr"]/' +
                        'text()').extract()[0].strip().replace('.', '').replace(
                            ',', '.')
                    loader.add_value('shipping_cost', shipping_cost)
                    loader.add_value('identifier',
                                     response.meta.get('identifier'))
                    if loader.get_output_value('price') and (
                            pr is None or pr.get_output_value('price') >
                            loader.get_output_value('price')):
                        pr = loader
            if pr:
                item = pr.load_item()
                if not item in self.items:
                    self.items.append(item)

    def match_name(self, search_name, new_item, match_threshold=90, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class LegoUsaEbaySpider(BaseeBaySpider):
    """eBay spider for LEGO USA products, driven by lego.csv.

    Item validity is decided by _valid_item_: name/category filters, fuzzy
    name match against the search row, SKU extraction from the listing title
    and a price-ceiling check for multi-SKU (lot) listings.
    """
    HERE = os.path.abspath(os.path.dirname(__file__))
    name = 'legousa-ebay.com'
    # Map deviation screenshot feature
    map_deviation_detection = True
    map_deviation_csv = os.path.join(HERE, 'ebay_map_deviation.csv')
    map_screenshot_method = 'scrapy_response'
    map_screenshot_html_files = {}  # identifier -> saved HTML path

    def __init__(self, *args, **kwargs):
        """Configure the base eBay spider for the LEGO search run.

        NOTE(review): *args/**kwargs are accepted but not forwarded to the
        base __init__ — presumably intentional; confirm against BaseeBaySpider.
        """
        super(LegoUsaEbaySpider, self).__init__()
        self._csv_file = os.path.join(self.HERE, 'lego.csv')
        self._converted_price = True
        self._ebay_url = 'http://www.ebay.com'
        # CSV column indices used to build the search string.
        self._search_fields = [3, 2]
        self._all_vendors = True
        self._look_related = False
        # (meta key, CSV column index) pairs copied into request meta.
        self._meta_fields = [('sku', 2), ('name', 3), ('price', 4),
                             ('category', 1)]
        self._match_fields = ('sku', 'identifier')
        self._check_valid_item = self._valid_item_
        # SKUs are runs of 3+ digits found in listing titles.
        self._re_sku = re.compile(r'(\d{3,})')
        self._check_diff_ratio = True
        # self._ratio_accuracy = 60
        self.matcher = Matcher(self.log)

    def match_text(self, text, item_field, match_threshold=90, important_words=None):
        """Fuzzy-match text against a field, logging the computed ratio."""
        r = self.matcher.match_ratio(text, item_field, important_words)
        self.log('Searching for %s in %s: %s' % (text, item_field, r))
        return r >= match_threshold

    def start_requests(self):
        """Yield two eBay searches per CSV row: name+model, then LEGO+sku."""
        with open(self._csv_file) as f:
            reader = csv.reader(cStringIO.StringIO(f.read()))
            number = 0
            for row in reader:
                number += 1
                meta = dict(
                    dict((m_k, row[m_f]) for m_k, m_f in self._meta_fields))
                search = ' '.join(row[field].strip()
                                  for field in self._search_fields)
                # Always prefix searches with the brand keyword.
                if not 'lego' in search.lower():
                    search = 'LEGO ' + search
                meta.update({'search': search})
                # Get URL
                search = self._clean_search(search)  # Clean search
                url = self._get_url_search(search)
                self.log('Item %s | SKU: %s | Search by: %s' %
                         (number, meta.get('sku', None), search))
                yield self._search(url, meta)
                # Second, narrower search by SKU column only.
                search = 'LEGO ' + row[2]
                meta.update({'search': search})
                # Get URL
                search = self._clean_search(search)  # Clean search
                url = self._get_url_search(search)
                self.log('Item %s | SKU: %s | Search by: %s' %
                         (number, meta.get('sku', None), search))
                yield self._search(url, meta)

    def load_item(self, *args, **kwargs):
        """Force brand to LEGO and save the response HTML for screenshots."""
        product_loader = super(LegoUsaEbaySpider,
                               self).load_item(*args, **kwargs)
        product_loader.replace_value('brand', 'LEGO')
        identifier = product_loader.get_output_value('identifier')
        # Base class passes the response as the last positional argument.
        response = args[-1]
        html_path = os.path.join('/tmp', 'ebay_%s.html' % identifier)
        with open(html_path, 'w') as f_html:
            f_html.write(response.body)
        self.map_screenshot_html_files[identifier] = html_path
        return product_loader

    def _valid_item_(self, item_loader, response):
        """Decide whether an eBay listing matches the searched LEGO product."""
        item_name = item_loader.get_output_value('name').lower()
        if not self._check_exclude_terms(item_name):
            return False
        name = item_loader.get_output_value('name')
        search_sku = item_loader.get_output_value('sku')
        # Collect digit runs both with and without spaces removed, so SKUs
        # split across words are still found.
        sku = self._re_sku.findall(name.replace(' ', ''))
        sku.extend(self._re_sku.findall(name))
        category = item_loader.get_output_value('category')
        if not self._check_name_valid(name):
            return False
        if not self._check_category_valid(category):
            return False
        sku = set(sku)
        search_name = response.meta['item_meta']['name'].decode('utf-8')
        if not self.match_text(search_name, name, match_threshold=70):
            return False
        if sku:
            search_price = response.meta['item_meta'].get('price')
            price = item_loader.get_output_value('price')
            # Multiple SKUs in a title suggests a lot listing; only accept it
            # when the price stays within the allowed ceiling.
            if not len(sku) > 1 or self._check_max_price(search_price, price):
                match_sku = search_sku in sku
                self.log('SKU %s in %s ? %s' % (search_sku, sku, match_sku))
                return match_sku
            else:
                self.log('Reject lot of products => %s' %
                         item_loader.get_output_value('url'))
                return False
        return True

    def _check_name_valid(self, name):
        """Reject listings for loose sets or figures rather than boxed sets.

        >>> spider = LegoUsaEbaySpider()
        >>> spider._check_name_valid("Lego 123")
        True
        >>> spider._check_name_valid("Lego 123 figure")
        False
        """
        if (self.match_text('mini figures from', name)
                or self.match_text('mini figures only', name)
                or self.match_text('mini figures', name)
                or self.match_text('mini figure', name)
                or self.match_text('minifigures', name)
                or self.match_text('minifigure', name)
                or self.match_text('figure', name)
                or self.match_text('loose', name)
                or self.match_text('no box', name)
                or self.match_text('nobox', name)):
            return False
        return True

    def _check_category_valid(self, category):
        """Reject figure categories.

        >>> spider = LegoUsaEbaySpider()
        >>> spider._check_category_valid('asd')
        True
        >>> spider._check_category_valid("figures")
        False
        >>> spider._check_category_valid("figure")
        False
        """
        if category and (self.match_text('figure', category)):
            return False
        return True

    def _check_valid_price(self, search_price, price):
        ''' Checks price variation '''
        # Accept prices down to 50% below the searched price.
        price_diff = 0.5
        search_price = Decimal(search_price)
        diff = Decimal(search_price) * Decimal(price_diff)
        return search_price - diff <= Decimal(price)

    def _check_max_price(self, search_price, price):
        ''' Checks price variation '''
        # Accept prices up to 50% above the searched price.
        price_diff = 0.5
        search_price = Decimal(search_price)
        diff = Decimal(search_price) * Decimal(price_diff)
        return Decimal(price) <= search_price + diff

    def _check_exclude_terms(self, item_name):
        ''' [([<list terms to exclude>], [<list exceptions>]),
             ([...], [...]),
             ([...], [...])] '''
        # Reject names containing an excluded term unless an exception term
        # is also present (item_name is expected lowercased by the caller).
        exclude_ = [(['NO MINIFIG'], []),
                    (['MINIFIG', 'MINIFG'], ['MINIFIGURES'])]
        for values, exceptions in exclude_:
            for w in values:
                if w.lower() in item_name:
                    itsvalid = False
                    for e in exceptions:
                        if e.lower() in item_name:
                            itsvalid = True
                            break
                    if not itsvalid:
                        return False
        return True
class EcraterSpider(BaseSpider):
    """ecrater.com spider for LEGO products: re-crawls previously found
    product URLs, then crawls the category/search listings."""

    name = 'legousa-ecrater.com'
    allowed_domains = ['ecrater.com']
    start_urls = (
        'http://www.ecrater.com/filter.php?cid=542133&keywords=lego&slocation=d&new=1',
        'http://www.ecrater.com/filter.php?cid=542133&slocation=d&new=1')
    # LEGO set numbers: 4 or 5 digits (raw string added; was a plain string).
    _re_sku = re.compile(r'(\d\d\d\d\d?)')

    # Map deviation screenshot feature
    map_deviation_detection = True
    map_deviation_csv = os.path.join(HERE, 'ecrater_map_deviation.csv')

    def __init__(self, *args, **kwargs):
        super(EcraterSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        # sku -> official product name, used to confirm fuzzy matches.
        with open(os.path.join(HERE, 'lego.csv')) as f:
            reader = csv.reader(f)
            self.products = {
                prod[2]: prod[3].decode('utf8')
                for prod in reader
            }
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Back up the previous crawl's product list before it is replaced.
        if os.path.exists(os.path.join(HERE, 'ecrater_products.csv')):
            shutil.copy(os.path.join(HERE, 'ecrater_products.csv'),
                        os.path.join(HERE, 'ecrater_products.csv.bak'))
        # Errors
        self.errors = []

    def spider_closed(self, spider):
        """Persist this crawl's products so ``parse_default`` can re-crawl
        them next run.

        NOTE(review): the original copied to 'toysrus_products.csv', which
        looks like a copy/paste slip from another spider — ``__init__``
        backs up and ``parse_default`` reads 'ecrater_products.csv'.
        """
        shutil.copy('data/%s_products.csv' % spider.crawl_id,
                    os.path.join(HERE, 'ecrater_products.csv'))

    def start_requests(self):
        # Parse default items and then start_urls
        yield Request('http://www.ecrater.com', self.parse_default)

    def parse_default(self, response):
        """Re-crawl product URLs discovered in the previous run, then the
        regular start URLs."""
        with open(os.path.join(HERE, 'ecrater_products.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                yield Request(row['url'], self.parse_product)
        # Scrape start urls
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        """Follow pagination and yield product-page requests."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # was: `next` -- renamed to avoid shadowing the builtin
        next_page = hxs.select(
            '//ul[@class="pagination-controls nav"]/li/a[@title="Next Page"]/@href'
        ).extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[-1]),
                          callback=self.parse)
        products = hxs.select(
            '//div[@class="product-details"]/h2/a/@href').extract()
        for product in products:
            # Non-keyword listings are filtered by 'lego' in the URL slug.
            if 'keywords=lego' in response.url or 'lego' in product:
                yield Request(urljoin_rfc(base_url, product),
                              callback=self.parse_product)
        if not products:
            self.errors.append('WARNING: No products in %s' % response.url)

    def parse_product(self, response):
        """Extract one Product from an ecrater product page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        seller = hxs.select(
            '//a[@class="seller-username"]/text()').extract()[0]
        cart_url = hxs.select(
            '//div[@id="product-title-actions"]/a/@href').extract()[0]
        # Prefer the cart 'pid' parameter; fall back to the URL path.
        identifier = url_query_parameter(urljoin_rfc(base_url, cart_url),
                                         'pid', None)
        if not identifier:
            identifier_regex = re.search(r'p/(\d+)/', response.url)
            if not identifier_regex:
                self.errors.append('WARNING: No identifier in %s'
                                   % response.url)
                return
            else:
                identifier = identifier_regex.groups()[0]
        name = hxs.select('//div[@id="product-title"]/h1/text()').extract()[0]
        sku = self._re_sku.findall(name)
        sku = sku[0] if sku else ''
        loader = ProductLoader(item=Product(), response=response)
        # Same product can be sold by several sellers; key on both.
        loader.add_value('identifier', identifier + '-' + seller)
        loader.add_value('name', name)
        loader.add_value('brand', 'LEGO')
        loader.add_xpath(
            'category',
            '//ul[@class="breadcrumb"]/li/a[@class="active"]/text()')
        loader.add_value('url', response.url)
        price = hxs.select(
            '//div[@id="product-title-actions"]/span/text()').extract()[0]
        loader.add_value('price', price)
        image_url = hxs.select(
            '//img[@id="product-image-display"]/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])
        stock = hxs.select('//p[@id="product-quantity"]/text()').extract()
        if stock:
            stock = re.findall(r"\d+", stock[0])
            stock = stock[0] if stock else 0
            loader.add_value('stock', stock)
        shipping = hxs.select(
            '//p[a[@href="#shipping-rates"]]/text()').extract()
        if shipping:
            # was: "\d+.\d+" -- the unescaped dot matched any character;
            # escape it to match a decimal amount only
            shipping = re.findall(r"\d+\.\d+", shipping[0])
            shipping = shipping[0] if shipping else 0
            loader.add_value('shipping_cost', shipping)
        loader.add_value('dealer', seller)
        # was: `sku in self.products.keys()` -- O(n) list scan in Python 2
        if sku in self.products:
            if self.match_name(self.products[sku], name):
                loader.add_value('sku', sku)
            else:
                log.msg('###########################')
                log.msg(response.url)
                log.msg('###########################')
        else:
            loader.add_value('sku', sku)
        yield loader.load_item()

    def match_name(self, search_name, new_item, match_threshold=90,
                   important_words=None):
        """Return True when the fuzzy match ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class LoveTyresSpider(BaseSpider):
    """lovetyres.com spider: searches every tyre size from mtsstockcodes.csv
    and extracts tyre products with Micheldever metadata."""

    name = 'lovetyres.com_test'
    allowed_domains = ['lovetyres.com']
    start_urls = ('http://www.lovetyres.com',)
    tyre_sizes = []
    all_man_marks = {}

    def __init__(self, *args, **kwargs):
        super(LoveTyresSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        # Tyre sizes to search for.
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        # code -> manufacturer mark (e.g. '*' for BMW).
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.errors = []

    def start_requests(self):
        """Yield one search request per tyre size."""
        for row in self.tyre_sizes:
            search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                str(row['Speed rating']) + str(row['Rim'])
            yield Request(
                'http://www.lovetyres.com/search/tyres/{Width}-{Aspect Ratio}-{Rim}'.format(**row),
                meta={'search': search}, callback=self.parse)

    def parse(self, response):
        """Yield a product-page request for every non-winter tyre row."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select('//tr[contains(@class,"tyre-search-row")]')
        # NOTE(review): a stubbed pagination branch (`next_page = []` then
        # `if next_page:`) was dead code and has been removed.
        for product in products:
            url = product.select('.//td/b/a/@href')[0].extract()
            winter_tyre = product.select('.//td/b/a/text()')[0].extract()
            winter_tyre = 'winter' in winter_tyre.lower()
            if not winter_tyre:
                # Brand is only available from the logo image filename.
                brand = product.select('.//a/img/@src')[0].extract()
                brand = re.search(r'/public/brands/(.*?)(-tyres)?\.',
                                  brand).group(1).replace('-', ' ').title()
                meta = response.meta
                meta['brand'] = brand
                price = product.select("td[3]/b/text()").extract()
                if price:
                    meta['price'] = price[0]
                yield Request(urljoin(base_url, url),
                              callback=self.parse_product, meta=meta)

    def parse_product(self, response):
        """Extract one Product per purchase option on a tyre page."""
        hxs = HtmlXPathSelector(response)
        base_loader = ProductLoader(item=Product(), selector=hxs)
        # the full name of the tyre (name variable) is used to extract
        # metadata (i.e. run flat, xl), the pattern should be set as the
        # product's name
        brand = response.meta.get('brand') or ''
        product_name = hxs.select(
            '//h2[@class="heading black"]/text()')[0].extract().strip()
        # was: re.sub(brand, ...) -- brand was used as a regex pattern, so a
        # brand containing a metacharacter would misbehave; escape it
        product_name = re.sub(re.escape(brand), '', product_name).strip()
        fitting_method = 'Delivered'
        base_loader.add_value('url', response.url)
        image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
        options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
        for option in options:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('name', product_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category',
                             find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', response.url)
            if image_url:
                loader.add_value('image_url',
                                 urljoin(get_base_url(response), image_url[0]))
            identifier = option.select(
                '../input[@type="hidden" and @name="item_id"]/@value').extract()
            if not identifier:
                # Out-of-stock options only expose an "email me" link.
                identifier = option.select('./a/@href').re('email_me_stock/(.*)')
            if not identifier:
                continue
            loader.add_value('identifier', identifier[0])
            price = option.select(
                './strong[@class="price" and not(contains(text(),"On Backorder"))]/text()'
            ).extract()
            if price:
                loader.add_value('price', price[0])
            else:
                # No buyable price => backorder: fall back to the listing
                # price (or 0.00) and mark as out of stock.
                if response.meta.get('price'):
                    loader.add_value('price', response.meta['price'])
                else:
                    loader.add_value('price', '0.00')
                loader.add_value('stock', 0)
            pattern_name = option.select('./p/strong/text()').extract()
            if not pattern_name:
                pattern_name = option.select('./strong/text()').extract()
            pattern_name = pattern_name[0]
            data = re.search(
                r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) '
                r'(?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                pattern_name)
            if data:
                data = data.groupdict()
            else:
                msg = 'ERROR parsing "{}" [{}]'.format(pattern_name,
                                                       response.url)
                log.msg(msg)
                self.errors.append(msg)
                continue
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating'].upper()
            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            xl = 'XL' in pattern_name
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat = ('run flat' in pattern_name.lower()
                        or 'runflat' in pattern_name.lower())
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            manufacturer_mark = [mark for mark in self.all_man_marks.keys()
                                 if mark in pattern_name.split(' ')]
            # was: `else []` -- '' is the natural "no mark" sentinel (both
            # are falsy, behavior unchanged)
            manufacturer_mark = (manufacturer_mark[0].strip()
                                 if manufacturer_mark else '')
            metadata['manufacturer_mark'] = (find_man_mark(manufacturer_mark)
                                             if manufacturer_mark else '')
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            # metadata['alternative_speed_rating']))
            product = loader.load_item()
            product['metadata'] = metadata
            if not is_product_correct(product):
                continue
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            product['metadata']['alternative_speed_rating'] = \
                new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] \
                if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating
            yield product

    def match_name(self, search_name, new_item, match_threshold=80,
                   important_words=None):
        """Return True when the fuzzy match ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
def __init__(self, *args, **kwargs):
    """Set up the spider: hook the idle signal and create the name matcher."""
    super(TrovaprezziSpider, self).__init__(*args, **kwargs)
    # Order of the next two statements is immaterial: spider_idle cannot
    # fire while __init__ is still running.
    dispatcher.connect(self.spider_idle, signals.spider_idle)
    self.matcher = Matcher(self.log)
""" >>> find_man_mark('bmw') '*' >>> find_man_mark('Mercedes') 'MO' >>> find_man_mark('por') 'N0' """ man_marks = load_manufacturers_marks() for code, manufacturer_mark in man_marks.items(): if mark.lower() in code.lower(): return manufacturer_mark return '' matcher = Matcher(logging.error) def match_name(search_name, new_item, match_threshold=90, important_words=None): r = matcher.match_ratio(search_name, new_item, important_words) return r >= match_threshold def match_pattern(pattern, name, match_threshold=70): """ >>> match_pattern('B 250 ECOPIA', 'B250ECO') True >>> match_pattern('DSPORT', 'Dueler Sport')