Пример #1
0
    def parse_product(self, response):
        """Yield one product per tyre option listed on a product page.

        The brand is read from ``response.meta['brand']`` and removed from the
        page heading to form the product name. Each option block supplies the
        identifier, the price and the full tyre description, from which the
        size, load/speed rating, XL, run-flat and manufacturer-mark metadata
        are parsed.

        Options are skipped when they have no identifier, when their
        description is missing or cannot be parsed (both recorded in
        ``self.errors``), or when ``is_product_correct`` rejects them.
        """
        hxs = HtmlXPathSelector(response)
        brand = response.meta.get('brand') or ''
        product_name = hxs.select('//h2[@class="heading black"]/text()')[0].extract().strip()
        # The brand is plain text, not a pattern: escape it so characters such
        # as "+", "(" or "." cannot change the match or raise re.error.
        product_name = re.sub(re.escape(brand), '', product_name).strip()
        fitting_method = 'Delivered'

        # Compiled once; the same expression is applied to every option.
        size_re = re.compile(
            r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) '
            r'(?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)')

        image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
        options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
        for option in options:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('name', product_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', response.url)
            if image_url:
                loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))

            # Prefer the hidden form field; fall back to the "email me" link.
            identifier = option.select('../input[@type="hidden" and @name="item_id"]/@value').extract()
            if not identifier:
                identifier = option.select('./a/@href').re('email_me_stock/(.*)')
            if not identifier:
                continue
            loader.add_value('identifier', identifier[0])

            price = option.select(
                './strong[@class="price" and not(contains(text(),"On Backorder"))]/text()').extract()
            if price:
                loader.add_value('price', price[0])
            else:
                # No usable price on the page: use the price passed in the
                # request meta (or 0.00) and mark the option as out of stock.
                loader.add_value('price', response.meta.get('price') or '0.00')
                loader.add_value('stock', 0)

            # The full description of the tyre is only used to extract
            # metadata (run flat, xl, ...); the product name set above is the
            # heading without the brand.
            pattern_name = option.select('./p/strong/text()').extract()
            if not pattern_name:
                pattern_name = option.select('./strong/text()').extract()
            if not pattern_name:
                msg = 'ERROR no tyre description found [{}]'.format(response.url)
                log.msg(msg)
                self.errors.append(msg)
                continue
            pattern_name = pattern_name[0]

            match = size_re.search(pattern_name)
            if not match:
                msg = 'ERROR parsing "{}" [{}]'.format(pattern_name, response.url)
                log.msg(msg)
                self.errors.append(msg)
                continue
            data = match.groupdict()

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating'].upper()
            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

            lowered = pattern_name.lower()
            run_flat = 'run flat' in lowered or 'runflat' in lowered
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # A manufacturer mark counts only when it is a whole
            # space-separated token of the description.
            tokens = pattern_name.split(' ')
            marks = [mark for mark in self.all_man_marks.keys() if mark in tokens]
            mark = marks[0].strip() if marks else ''
            metadata['manufacturer_mark'] = find_man_mark(mark) if mark else ''

            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   metadata['load_rating'],
                                                   metadata['speed_rating']))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Keep the scraped speed rating as the alternative one when the
            # helpers return a different primary rating and no alternative.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #2
0
    def extract_products(self, hxs, url):
        """Yield a product for every non-winter tyre container found in *hxs*.

        :param hxs: selector for the results page.
        :param url: page URL, used only in error messages.

        Containers are skipped when their tyre details are missing or cannot
        be parsed by ``parse_pattern``, or when the element has no ``id``
        attribute (all three recorded in ``self.errors``). They are also
        skipped when they are winter tyres, have no price, or are rejected by
        ``is_product_correct``.
        """
        for el in hxs.select(
                '//div[starts-with(@class,"tyre_container round")]'):
            tyre_options = fix_spaces("".join(
                el.select(
                    './/p[@class="tyre_details"]//text()').extract())).strip()
            if not tyre_options:
                msg = 'Could not extract tyre options from element from %s' % url
                self.log('ERROR: %s' % msg)
                self.errors.append(msg)
                continue
            res = parse_pattern(tyre_options)
            if not res:
                msg = "ERROR parsing: %s on %s" % (tyre_options, url)
                self.log(msg)
                self.errors.append(msg)
                continue
            width, ratio, rim, load_rating, speed_rating, name = res

            # skip winter tyres
            if el.select(".//div[@class='tyre_winter']"):
                continue

            name = name.strip()

            # A container without an id cannot be identified; report it and
            # carry on instead of aborting the whole page with an IndexError.
            identifier = el.select("./@id").extract()
            if not identifier:
                msg = 'Could not extract identifier for "%s" on %s' % (
                    tyre_options, url)
                self.log('ERROR: %s' % msg)
                self.errors.append(msg)
                continue
            identifier = identifier[0]

            price = "".join(
                el.select(
                    ".//p[@class='tyre_price']//text()").extract()).strip()
            if not price:
                continue

            brand = el.select(
                ".//span[@class='tyre_brand_text']/text()").extract()
            brand = brand[0] if brand else ''

            image_url = el.select("img/@src").extract()

            run_flat = 'Yes' if el.select(
                ".//div[@class='tyre_rf']").extract() else 'No'
            xl = 'Yes' if el.select(
                ".//div[@class='tyre_xl']").extract() else 'No'

            # The XL marker is carried in the metadata, not in the name.
            if xl == 'Yes':
                name = name.replace("XL", "").strip()

            loader = ProductLoader(Product(), selector=hxs)
            loader.add_value('name', name)
            loader.add_value('identifier', identifier)
            loader.add_value('price', price)
            loader.add_value('url', 'http://www.asdatyres.co.uk/')
            if image_url:
                # The image is optional; a missing one must not drop the item.
                loader.add_value(
                    'image_url',
                    urljoin_rfc('http://asdatyres.co.uk', image_url[0]))
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            metadata = MicheldeverMeta()
            metadata['width'] = width
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['load_rating'] = load_rating
            metadata['speed_rating'] = speed_rating
            metadata['fitting_method'] = 'Fitted'
            metadata['run_flat'] = run_flat
            metadata['xl'] = xl

            # First manufacturer code that occurs anywhere in the name wins.
            man_code = ''
            for code, man_mark in self.all_man_marks.iteritems():
                if code in name:
                    man_code = man_mark
                    break
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join(
                (width, ratio, rim, load_rating, speed_rating))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Keep the scraped speed rating as the alternative one when the
            # helpers return a different primary rating and no alternative.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #3
0
    def parse_search(self, response):
        """Follow pagination links and yield a product per search-result row.

        Every pagination link is requested with this same callback. Each row
        of the ``searchRes`` table is then turned into a product. Rows are
        skipped when they are flagged as winter tyres, when ``extract_data``
        cannot parse their size pattern (logged), or when
        ``is_product_correct`` rejects them.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Pagination.
        page_links = hxs.select(
            '//div[@class="pagination pagination-centered"]//a/@href'
        ).extract()
        for link in page_links:
            yield Request(urljoin(base_url, link), callback=self.parse_search)

        # Product list.
        for row in hxs.select('//*[@id="searchRes"]/tbody//tr'):
            # Skip winter tyres.
            season = row.select('.//td[4]/i/@class').extract()
            if season and 'ico-type ico-W' in season[0]:
                continue

            loader = ProductLoader(item=Product(), selector=row)
            brand, name = row.select('./td[2]/a/b/text()').extract()
            loader.add_value('name', name)

            pattern = row.select('./td[2]/a/small/text()').extract()[0]
            data = extract_data(pattern)
            if not data:
                self.log("ERROR. Unable to parse pattern: %s" % pattern)
                continue
            width, aspect_ratio, rim, load_rating, speed_rating = data

            if 'goodrich' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            # The price is split between the span text and its <sup> child.
            price_main = row.select('.//span[@class="pr"]/text()').extract()[0]
            price_sup = row.select(
                './/span[@class="pr"]/sup/text()').extract()[0]
            price_sup = price_sup.replace(u'\xa3', '')
            loader.add_value('price', extract_price(price_main + price_sup))

            loader.add_value('identifier', row.select('@data-id').extract()[0])
            href = row.select('./td[2]/a/@href').extract()[0]
            loader.add_value('url', urljoin(base_url, href))
            image_src = row.select('./td[1]/img/@src').extract()
            if image_src:
                loader.add_value('image_url', urljoin(base_url, image_src[0]))

            # Lower-cased specification badges drive the XL, run-flat and
            # manufacturer-mark flags.
            specif = [
                text.lower()
                for text in row.select(
                    './/span[@class="specif"]/text()').extract()
            ]

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Delivered'
            metadata['load_rating'] = load_rating
            metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
            metadata['run_flat'] = 'Yes' if 'runflat' in specif else 'No'

            # Standard marks first; the custom ones are only consulted when
            # the standard lookup produced an empty string.
            man_code = next(
                (man_mark
                 for code, man_mark in self.all_man_marks.iteritems()
                 if code.lower() in specif),
                '')
            if man_code == '':
                man_code = next(
                    (man_mark
                     for code, man_mark in self.custom_man_marks.iteritems()
                     if code.lower() in specif),
                    '')
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join((
                metadata['width'],
                metadata['aspect_ratio'],
                metadata['rim'],
                load_rating,
                speed_rating,
            ))

            item = loader.load_item()
            item['metadata'] = metadata

            if not is_product_correct(item):
                continue

            item['metadata']['mts_stock_code'] = find_mts_stock_code(
                item, spider_name=self.name, log=self.log)

            # Keep the scraped speed rating as the alternative one when the
            # helpers return a different primary rating and no alternative.
            new_speed_rating = get_speed_rating(item)
            new_alt_speed = get_alt_speed(item)
            scraped_speed = item['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            item['metadata']['alternative_speed_rating'] = alt_speed
            item['metadata']['speed_rating'] = new_speed_rating

            yield item
Пример #4
0
    def parse(self, response):
        """Yield a product for every tyre result on a search-results page.

        The searched size (width, aspect ratio, rim) comes from
        ``response.meta['row']``. The load and speed ratings are parsed from
        the result's full name. Results are skipped when they are winter
        tyres, have no identifier (logged) or no price, or are rejected by
        ``is_product_correct``.
        """
        hxs = HtmlXPathSelector(response)

        row = response.meta['row']

        products = hxs.select('//div[@id="formcontent"]/div[@class="result"]')

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            brand = product_el.select('p/span[@class="brand_text"]/text()').extract()
            brand = brand[0] if brand else ''

            winter_tyre = product_el.select('div/img[@title="Winter Tyre"]').extract()
            # skip winter tyres
            if winter_tyre:
                continue

            # Use the spelling from self.brands when the scraped brand
            # matches one of them case-insensitively.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand

            full_name = ''.join(product_el.select('p[@class="the_tyre"]/text()').extract()).strip()

            # The product name is the full name without its first two words.
            loader.add_value('name', ' '.join(full_name.split()[2:]))
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.select('p/span/select/@name').extract()
            if identifier:
                identifier = identifier[0].replace('number[', '').replace(']', '')
            else:
                log.msg('Product without identifier')
                search_params = '/'.join([row['Aspect Ratio'], row['Rim'], row['Width'], row['Alt Speed']])
                log.msg('Search parameters: ' + search_params)
                # Skip only this result; returning here would silently drop
                # every remaining product on the page.
                continue

            loader.add_value('url', 'http://www.tyresavings.com')
            loader.add_xpath('image_url', 'img[@class="tyre_image"]/@src')
            loader.add_value('identifier', identifier)

            price = ''.join(product_el.select('div[@class="price"]/text()').extract()).strip()

            if not price:
                continue

            loader.add_value('price', price)

            metadata = MicheldeverMeta()

            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']

            # First whitespace-delimited token that starts with digits: its
            # last character is taken as the speed rating and the rest as the
            # load rating.
            speed = re.search(r'(\s\d+\w+\s)', full_name)
            load_speed = speed.group().strip() if speed else ''
            metadata['speed_rating'] = load_speed[-1] if load_speed else ''
            metadata['load_rating'] = load_speed[:-1]

            metadata['width'] = row['Width']

            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            xl = product_el.select('div/img[@title="Reinforced Tyre"]').extract()
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat = product_el.select('div/img[@title="Run Flat Tyre"]').extract()
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)

            metadata['full_tyre_size'] = '/'.join((row['Width'],
                                                   row['Aspect Ratio'],
                                                   row['Rim'],
                                                   metadata['load_rating'],
                                                   metadata['speed_rating']))
            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Keep the scraped speed rating as the alternative one when the
            # helpers return a different primary rating and no alternative.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #5
0
    def extract_products(self, response):
        """Yield a product for every non-winter tyre box on a listing page.

        Boxes are skipped when they are winter tyres or are rejected by
        ``is_product_correct``. They are also skipped when ``parse_pattern``
        cannot parse the description (recorded in ``self.errors`` unless the
        description matches a known exclusion), or when no identifier can be
        read from the ``onclick`` handler (recorded in ``self.errors``).
        Every manufacturer-mark title seen is added to ``self.man_marks``.
        """
        hxs = HtmlXPathSelector(response)
        # Patterns known not to parse; they are skipped without an error.
        excludes = (
            'sport contact', 'advantage sport', 'expedia s02', 'zero rosso',
        )
        products = hxs.select(
            '//div[@class="listcontPART"]//div[@class="conprcbx"]')
        for el in products:
            brand = el.select('./div[@class="dec_tyrebnt"]/p/b/text()'
                              ).extract().pop().strip()

            pattern = "".join(
                el.select(
                    './div[@class="dec_tyrebnt"]/p/text()').extract()).strip()

            # skip winter tyres
            if 'winter' in pattern.lower():
                continue

            xl, pattern = extract_reinforced(pattern)
            run_flat, pattern = extract_run_flat(pattern)
            res = parse_pattern(pattern)
            if not res:
                lowered = pattern.lower()
                if not any(x in lowered for x in excludes):
                    msg = 'Could not parse pattern: %s' % fix_spaces(
                        pattern).encode('utf-8')
                    self.log('[CARTYRES] %s' % msg)
                    self.errors.append(msg)
                continue
            width, ratio, rim, load_rating, speed_rating, name = res

            identifier = el.select(".//p/@onclick").re(
                r"AddCarToShortList\('([^']*)',")
            if not identifier:
                # Without an identifier the item is unusable; report it and
                # move on instead of failing later with an IndexError.
                msg = 'Could not extract identifier on %s' % response.url
                self.log('[CARTYRES] %s' % msg)
                self.errors.append(msg)
                continue

            url = self.start_urls[0]

            price = el.select(
                './/div[@class="dec_fittdbnt"]/h1/text()').extract().pop()
            price = fix_spaces(price)

            image_url = el.select(
                '../..//div[@class="uptyre_prt"]/img/@src').extract()[0]

            man_mark = el.select(
                './/div[@class="bndLGO1"]/img/@title').extract()
            if man_mark:
                man_mark = man_mark[0]
                if man_mark not in self.man_marks:
                    self.man_marks.add(man_mark)
            else:
                man_mark = ''

            loader = ProductLoader(Product(), selector=hxs)
            loader.add_value('name', name)
            loader.add_value('identifier', identifier.pop())
            loader.add_value('price', price)
            loader.add_value('url', url)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            metadata = MicheldeverMeta()
            metadata['width'] = width
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['load_rating'] = load_rating
            metadata['speed_rating'] = speed_rating
            metadata['fitting_method'] = 'Fitted'
            metadata['run_flat'] = run_flat
            metadata['xl'] = xl

            # Titles without an entry in man_mark_mapping give an empty mark.
            if man_mark and man_mark in man_mark_mapping:
                man_code = man_mark_mapping[man_mark]
            else:
                man_code = ''
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join(
                (width, ratio, rim, load_rating, speed_rating))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Keep the scraped speed rating as the alternative one when the
            # helpers return a different primary rating and no alternative.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #6
0
    def parse_product(self, response):
        """Yield the single product described on a product page.

        The ``tread`` cell supplies the product name. The ``tyreinfo`` span
        holds the full tyre description, which ``parse_pattern`` turns into
        the size and rating metadata and which is also searched for the XL,
        run-flat and manufacturer-mark markers.

        Nothing is yielded when the name is missing or the description cannot
        be parsed (both recorded in ``self.errors``), or when
        ``is_product_correct`` rejects the product.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        result_table = '//table[@class="single searchresults"]'

        name = hxs.select('//td[@class="tread"]/text()').extract()
        if not name:
            msg = "No name found on page: %s" % response.url
            self.errors.append(msg)
            self.log("[ERROR] %s" % msg)
            return
        loader.add_value('name', name[0])

        brand = hxs.select(
            result_table + '//td[@class="tyreinfo"]/b/text()'
        ).extract()[0].strip()
        loader.add_value('brand', unify_brand(brand))
        # Derive the segment from the loader's brand output, as the other
        # spiders do, instead of from the raw scraped text.
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        fitting_method = 'Delivered'

        loader.add_value('url', response.url)

        out_of_stock = hxs.select(
            result_table + '//span[@class="outofstock"]')
        if out_of_stock:
            loader.add_value('stock', 0)

        image_url = hxs.select(
            result_table + '//td[@class="logo-pic"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))

        identifier = hxs.select(
            result_table + '//form/input[@name="pid"]/@value')[0].extract()
        loader.add_value('identifier', identifier)
        price = hxs.select(
            result_table + '//td[@class="netprice"]/text()')[0].extract()
        loader.add_value('price', price)

        # The full description is used only to extract metadata; the product
        # name was already set from the tread cell above.
        full_name = hxs.select(
            result_table + '//td[@class="tyreinfo"]/span/text()')[0].extract()
        data = parse_pattern(full_name)
        if not data:
            msg = 'ERROR parsing "{}" [{}]'.format(full_name, response.url)
            log.msg(msg)
            self.errors.append(msg)
            return

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in full_name else 'No'
        metadata['run_flat'] = 'Yes' if 'rflat' in full_name.lower() else 'No'

        # "*" may appear anywhere in the description; any other mark must be
        # a whole space-separated token.
        if '*' in full_name:
            mark = '*'
        else:
            tokens = full_name.split(' ')
            found = [m for m in self.all_man_marks.keys() if m in tokens]
            mark = found[0].strip() if found else ''
        metadata['manufacturer_mark'] = (
            self.all_man_marks.get(mark, '') if mark else '')

        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        # Keep the scraped speed rating as the alternative one when the
        # helpers return a different primary rating and no alternative.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alt_speed = scraped_speed
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
Пример #7
0
    def parse(self, response):
        """Yield a product for every non-winter entry of a JSON result list.

        The response body is decoded as JSON and iterated entry by entry. The
        searched size (width, aspect ratio, rim) comes from
        ``response.meta['row']``; everything else is read from the entry.
        Entries are skipped when their ``winter`` value is not ``'0'`` or when
        ``is_product_correct`` rejects them.
        """
        base_url = get_base_url(response)
        row = response.meta['row']
        products = json.loads(response.body_as_unicode())
        for product_el in products:
            # skip winter tyres
            if product_el['winter'] != '0':
                continue
            # NOTE(review): the JSON entry is passed as the loader's selector,
            # as in the original code; only add_value is used below.
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el['tyreMake'].title()
            if 'goodrich' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            load_rating = product_el['loadrating']
            speed_rating = product_el['tyreSpeed']
            loader.add_value('price', product_el['priceVat'])
            loader.add_value('identifier', product_el['id'])
            loader.add_value(
                'url',
                urljoin('http://www.etyres.co.uk/tyre-detail/',
                        product_el['URLString']))
            if product_el['tyreModelImage2']:
                image_url = 'images/' + product_el['tyreModelImage2']
                loader.add_value('image_url', urljoin(base_url, image_url))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = speed_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            metadata['xl'] = (
                'Yes' if product_el['tyreReinforced'] == 'T' else 'No')
            metadata['run_flat'] = (
                'Yes' if product_el['runflat'] == '1' else 'No')

            # Find the manufacturer mark: standard codes are handled by
            # cut_name; custom codes are matched only as a name suffix.
            name = product_el['tyreModel']
            man_code = ''
            for code, man_mark in self.all_man_marks.iteritems():
                result, name = cut_name(code, name)
                if result:
                    man_code = man_mark
                    break
            if not man_code:
                for code, man_mark in self.custom_man_marks.iteritems():
                    if name.endswith(code):
                        # rpartition removes only the trailing occurrence;
                        # partition would cut at the first one and truncate
                        # names that contain the code earlier as well.
                        name = name.rpartition(code)[0]
                        man_code = man_mark
                        break
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
                 speed_rating))
            name = name.replace(' EXTRA LOAD', '')
            name = name.replace(' RUNFLAT', '')

            loader.add_value('name', name.strip())

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Keep the scraped speed rating as the alternative one when the
            # helpers return a different primary rating and no alternative.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #8
0
    def parse_product(self, response):
        """Yield the single product described on a product page.

        The ``<h1>`` title, minus its last hyphen-separated part, supplies the
        brand (its first word) and the product name. The speed rating, load
        rating and size are read from the labelled rows of the details table.

        Nothing is yielded when the title mentions "winter" or when
        ``is_product_correct`` rejects the product. Nothing is yielded either
        when the title has no hyphen-separated suffix or when
        ``parse_tyre_size`` returns no width; those two cases are recorded in
        ``self.errors``.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        title = hxs.select('//h1/text()')[0].extract()
        if 'winter' in title.lower():
            return

        # Drop the last "-"-separated part of the title. A title without a
        # hyphen used to raise AttributeError; report it like a size-parse
        # failure instead.
        title_match = re.search(r'(.*)-[^-]+', title)
        if not title_match:
            msg = "Error parsing title '%s' on page %s" % (title, response.url)
            self.log(msg)
            self.errors.append(msg)
            return
        title = title_match.group(1)

        brand = title.split(' ')[0]
        price = hxs.select('//td[@class="price"]/text()')[0].extract()
        # fix wrong product
        if brand.strip() == 'R27':
            loader.add_value('name', title.replace('XL', '').replace('RF', ''))
            brand = 'Toyo'
        else:
            loader.add_value(
                'name',
                title.replace(brand, '').replace('XL', '').replace('RF', ''))
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))

        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath('identifier',
                         '//input[@id="product_reference"]/@value')
        image_url = hxs.select('//img[@class="productImg"]/@src')[0].extract()
        loader.add_value('image_url', urljoin(get_base_url(response),
                                              image_url))

        speed_rating = hxs.select(
            "//tr[td/strong[text()='Speed:']]/td[2]/text()").extract()[0]
        # The speed rating text is removed from the "Load:" cell.
        load_rating = hxs.select("//tr[td/strong[text()='Load:']]/td[2]/text()"
                                 ).extract()[0].replace(speed_rating, "")

        size = hxs.select(
            "//tr[td/strong[text()='Size:']]/td[2]/text()").extract()[0]

        width, aspect_ratio, _, rim = parse_tyre_size(size)
        if not width:
            msg = "Error parsing '%s' on page %s" % (size, response.url)
            self.log(msg)
            self.errors.append(msg)
            return

        upper_title = title.upper()
        m = MicheldeverMeta()
        m['aspect_ratio'] = aspect_ratio
        m['rim'] = rim
        m['width'] = width
        m['speed_rating'] = speed_rating.upper()
        m['load_rating'] = load_rating
        m['run_flat'] = 'Yes' if 'RF' in upper_title else 'No'
        m['xl'] = 'Yes' if 'XL' in upper_title else 'No'

        m['full_tyre_size'] = '/'.join(
            (m['width'], m['aspect_ratio'], m['rim'], m['load_rating'],
             m['speed_rating']))

        m['fitting_method'] = 'Fitted'
        m['manufacturer_mark'] = self._get_manufacturer_code(title)

        product = loader.load_item()
        product['metadata'] = m

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        # Keep the scraped speed rating as the alternative one when the
        # helpers return a different primary rating and no alternative.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_speed = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alt_speed = scraped_speed
        else:
            alt_speed = ''
        product['metadata']['alternative_speed_rating'] = alt_speed
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
Пример #9
0
    def parse(self, response):
        """Parse a search-results page: follow pagination and yield tyre items.

        If the page exposes a "Next" pagination entry, a ``FormRequest`` for
        that page is yielded first; it re-uses the form data found in
        ``response.meta['formdata']`` with its ``page`` field updated.
        Afterwards every listed product that is not marked as a winter tyre
        is converted into a ``Product`` carrying ``MicheldeverMeta`` metadata
        and yielded, provided ``is_product_correct`` accepts it.

        Products whose dimension text cannot be parsed by ``parse_pattern``
        are logged, recorded in ``self.errors`` and skipped.

        :param response: the results page; ``response.meta`` is read for the
            ``formdata`` of the search that produced it.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select(
            '//ul[@class="c-list-classic c-list-classic-liste m-produit-res"]/li'
        )

        next_page = hxs.select(
            '//li[a[span[text()="Next"]]]/@data-page').extract()  # pagination
        if next_page:
            formdata = response.meta.get('formdata')
            formdata['page'] = next_page[0]
            yield FormRequest(response.url,
                              formdata=formdata,
                              dont_filter=True,
                              meta=response.meta)

        # Build one pattern per manufacturer mark up front instead of once per
        # product. re.escape() keeps every character of the mark literal (not
        # only '*'), so a mark containing a regex metacharacter can neither
        # break the pattern nor match unrelated text.
        mark_patterns = [
            (mark, re.compile(r'\(?{}\)?'.format(re.escape(mark))))
            for mark in self.all_man_marks.keys()
        ]

        for product_el in products:
            url = product_el.select(
                './/a[@class="u-semi-link"]/@href')[0].extract()
            winter_tyre = product_el.select(
                './/div[@class="m-produit-bloc-res-lst__gamme-saison"]/text()'
            ).re('Winter')
            if winter_tyre:
                # winter tyres are not collected
                continue

            loader = ProductLoader(item=Product(), selector=product_el)
            # the full name of the tyre (name variable) is used to extract
            # metadata (i.e. run flat, xl), the pattern should be set as the
            # product's name
            loader.add_xpath(
                'name',
                './/span[@class="m-produit-bloc-res-lst__dcp"]/text()')
            brand = product_el.select(
                './/span[@class="m-produit-bloc-res-lst__fab"]/text()'
            ).extract()
            if brand:
                brand = brand[0].strip()
                loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            fitting_method = 'Delivered'

            loader.add_value('url', urljoin(base_url, url))

            image_url = product_el.select(
                './/div[@class="m-produit-bloc-res-lst__image"]//img/@src'
            ).extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_url[0]))

            identifier = product_el.select(
                './/button/@data-id')[0].extract()
            loader.add_value('identifier', identifier)
            price = product_el.select(
                './/div[@class="c-qte-prix__prix m-produit-bloc-res-lst__prix"]/text()'
            )[0].extract()
            loader.add_value('price', price)
            if not loader.get_output_value('price'):
                # no usable price: flag the item as out of stock
                loader.add_value('stock', 0)

            # The dimension text uses non-breaking spaces; normalise them so
            # parse_pattern sees plain spaces.
            name = product_el.select(
                './/div[@class="m-produit-bloc-res-lst__dim"]/text()'
            )[0].extract().strip().replace(u'\xa0', u' ')
            data = parse_pattern(name)
            if not data:
                error_msg = 'ERROR parsing "{}" [{}]'.format(
                    name, response.url)
                log.msg(error_msg)
                self.errors.append(error_msg)
                continue

            additional_data = ' '.join(
                product_el.select(
                    './/ul[@class="m-produit__carac c-list-horizontale"]/li/text()'
                ).extract())
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating']

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            xl = 'XL' in additional_data
            metadata['xl'] = 'Yes' if xl else 'No'

            run_flat = 'runflat' in additional_data.lower()
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # The first mark found in the characteristics text wins.
            matched_marks = [
                mark for mark, pattern in mark_patterns
                if pattern.search(additional_data)
            ]
            first_mark = matched_marks[0].strip() if matched_marks else ''
            metadata['manufacturer_mark'] = \
                self.all_man_marks.get(first_mark, '') if first_mark else ''
            metadata['mts_stock_code'] = ''
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Prefer the alternative rating reported by get_alt_speed; failing
            # that, keep the scraped rating as the alternative when
            # get_speed_rating replaces it with a different one.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #10
0
    def parse(self, response):
        """Parse the JSON product list embedded in a results page.

        The page source is scanned for a line containing ``JsonObject = ``
        and the JSON on that line is decoded; its ``Rest`` and ``Deals`` lists
        hold the products. Winter tyres are skipped. Every other product is
        built and validated with its "Delivered" data, then switched to the
        "Fitted" price and identifier. Among products sharing the same brand,
        name, fitting method, full tyre size, XL, run-flat and manufacturer
        mark values, only the cheapest one is yielded.

        A failure to build the selector, or a page without the embedded JSON,
        is logged, appended to ``self.errors`` and ends the parse without
        yielding anything.

        :param response: the results page; ``response.meta['row']`` supplies
            the width, aspect ratio and rim of the searched tyre size.
        """
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError:
            msg = 'Error getting selector on page for row: %s' % response.meta[
                'row']
            self.log('[ERROR] %s' % msg)
            self.errors.append(msg)
            return

        row = response.meta['row']

        json_data = None
        for line in hxs.extract().split('\n'):
            if "JsonObject = " in line:
                json_data = json.loads(
                    line.replace('JsonObject = ', '').replace('; \r', ''))

        if json_data is None:
            # Without this guard a page lacking the embedded JSON would fail
            # below with a TypeError when subscripting None.
            msg = 'Error getting JsonObject on page for row: %s' % row
            self.log('[ERROR] %s' % msg)
            self.errors.append(msg)
            return

        products = json_data['Rest'] + json_data['Deals']

        collected_products = []

        for product_info in products:
            # skip winter tyres
            if product_info['WinterTyre']:
                continue

            loader = ProductLoader(item=Product(), selector=product_info)
            loader.add_value('name', product_info['ModelName'])
            brand = product_info['Manufacturer']

            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_info['PrimaryId']
            fitting_method = 'Delivered'

            url = '/catalogue' + product_info[
                'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
            loader.add_value('url', urljoin(get_base_url(response), url))

            # Prefer the large image and fall back to the regular one.
            image_url = product_info.get('ModelImageLarge')
            if not image_url:
                image_url = product_info.get('ModelImage')

            if image_url:
                # Take the text between 'src="' and the next double quote; a
                # value without 'src="' is used up to its first double quote.
                image_url = image_url.split('src="')[-1].split('"')[0]
                loader.add_value('image_url',
                                 urljoin(get_base_url(response), image_url))

            loader.add_value('identifier',
                             str(identifier) + '-' + fitting_method)
            price = product_info['SellingPrice']
            loader.add_value('price', price)

            spec = product_info['SpecificationName']

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            # the speed rating is the last whitespace-separated token
            metadata['speed_rating'] = spec.split()[-1]
            metadata['width'] = row['Width']

            metadata['fitting_method'] = fitting_method
            load_rating = product_info['LoadRatingName']
            metadata['load_rating'] = load_rating
            metadata['alternative_speed_rating'] = ''
            xl = product_info['Reinforced']
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat = product_info['RunFlat']
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            manufacturer_mark = product_info['Variant']
            if manufacturer_mark:
                manufacturer_mark = manufacturer_mark.split()[0].strip()

            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark) if manufacturer_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Prefer the alternative rating reported by get_alt_speed; failing
            # that, keep the scraped rating as the alternative when
            # get_speed_rating replaces it with a different one.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating

            # "Delivered" tyres are not collected: the item is converted into
            # its "Fitted" variant before being stored.
            product['price'] = product_info['FullyFittedPrice']
            fitting_method = 'Fitted'
            product['identifier'] = str(identifier) + '-' + fitting_method
            product['metadata']['fitting_method'] = fitting_method
            collected_products.append(product)

        # Keep only the cheapest product for each distinct tyre description.
        min_price_products = {}
        for product in collected_products:
            key = "%s-%s-%s-%s-%s-%s-%s" % (
                product['brand'], product['name'],
                product['metadata']['fitting_method'],
                product['metadata']['full_tyre_size'],
                product['metadata']['xl'], product['metadata']['run_flat'],
                product['metadata']['manufacturer_mark'])
            cheapest = min_price_products.get(key)
            if cheapest is None or product['price'] < cheapest['price']:
                min_price_products[key] = product

        for product in min_price_products.values():
            yield product
Пример #11
0
    def parse(self, response):
        """Yield the fitted, non-winter tyres listed on a results page.

        Each ``tyreResult`` element becomes a ``Product`` with
        ``MicheldeverMeta`` metadata. Width, aspect ratio and rim come from
        ``response.meta['row']``; load and speed rating are taken from the
        tyre title when it contains them, otherwise the speed rating falls
        back to the row's ``Speed rating`` and the load rating is empty.
        Items rejected by ``is_product_correct`` are dropped.
        """
        selector = HtmlXPathSelector(response)
        row = response.meta['row']
        for tyre_el in selector.select('//div[contains(@class, "tyreResult")]'):
            # skip winter tyres
            if tyre_el.select('.//li[@class="cw"]'):
                continue

            loader = ProductLoader(item=Product(), selector=tyre_el)
            title = tyre_el.select(
                './/div[@class="tyreName"]/h4/text()').extract()[0].strip()
            brand = tyre_el.select('./@data-brand').extract()[0].title()
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            # Cut len(brand) characters off the front of the title
            # (presumably the title starts with the brand -- verify).
            title = title[len(brand):].strip()
            rating_match = re.search(
                r"\b((?:\d{2,3}/)*(?:\d{2,3}))\s?([A-Z]{1,2}\d?)\b", title)
            if rating_match is None:
                load_rating = ''
                speed_rating = row['Speed rating']
                name = title
                title = ''
            else:
                load_rating, speed_rating = rating_match.group(1, 2)
                # text before the ratings is the name, text after is kept
                # in ``title`` for the manufacturer-mark search below
                name = title[:rating_match.start(1)]
                title = title[rating_match.end(2):]

            # The price is split between the <h5> text and its <sup> child.
            price_main = tyre_el.select(
                './/div[@class="tyreBuy"]//h5/text()').extract()[0]
            price_sup = tyre_el.select(
                './/div[@class="tyreBuy"]//h5/sup/text()').extract()[0]
            loader.add_value('price', extract_price(price_main + price_sup))

            loader.add_value(
                'identifier',
                tyre_el.select('.//input[@name="id"]/@value').extract()[0])
            loader.add_value('url', '')
            image_srcs = tyre_el.select(
                './/div[@class="tyreImg"]/img[@class="tyre"]/@src').extract()
            if image_srcs:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_srcs[0]))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = speed_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating

            # class attribute values of the feature icons
            specif = tyre_el.select(
                './/ul[@class="fixed"]//li/@class').extract()
            metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
            metadata['run_flat'] = 'Yes' if 'rf' in specif else 'No'

            # Icon class -> manufacturer mark; the first entry present wins.
            man_code = ''
            for icon_class, icon_mark in (('bmw', '*'), ('mer', 'MO'),
                                          ('aud', 'AO'), ('por', 'NO')):
                if icon_class in specif:
                    man_code = icon_mark
                    break

            # Cut a known mark out of the name; it only becomes the
            # manufacturer mark when no icon supplied one already.
            for code, man_mark in self.all_man_marks.iteritems():
                found, name = cut_name(code, name)
                if found:
                    if man_code == '':
                        man_code = man_mark
                    break
            # Still nothing: look in the text that followed the ratings.
            if man_code == '':
                for code, man_mark in self.all_man_marks.iteritems():
                    found, title = cut_name(code, title)
                    if found:
                        man_code = man_mark
                        break
            metadata['manufacturer_mark'] = man_code

            _, name = cut_name('XL', name)
            loader.add_value('name', name)

            size_parts = (row['Width'], row['Aspect Ratio'], row['Rim'],
                          load_rating, speed_rating)
            metadata['full_tyre_size'] = '/'.join(size_parts)

            prod = loader.load_item()
            prod['metadata'] = metadata

            if not is_product_correct(prod):
                continue

            prod['metadata']['mts_stock_code'] = find_mts_stock_code(
                prod, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(prod)
            new_alt_speed = get_alt_speed(prod)
            scraped_speed = prod['metadata']['speed_rating']
            if new_alt_speed:
                alternative = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alternative = scraped_speed
            else:
                alternative = ''
            prod['metadata']['alternative_speed_rating'] = alternative
            prod['metadata']['speed_rating'] = new_speed_rating

            yield prod
Пример #12
0
    def parse_products(self, response):
        """Yield fitted tyre products from the ``tyreResults`` table.

        Every non-gutter cell with a ``subTitle`` paragraph is parsed with
        ``parse_title_new``. The brand is normalised through
        ``self.brand_fixes`` and cells whose brand is not one of its keys are
        skipped. Manufacturer mark, XL and run-flat markers are filtered out
        of the name, and the tyre size comes from ``response.meta['row']``.
        Items rejected by ``is_product_correct`` are dropped.
        """
        selector = HtmlXPathSelector(response)
        row = response.meta['row']

        cells = selector.select('//*[@id="tyreResults"]//tr[contains(@class, "tyre")]//td[@class != "gutter"]')
        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)
            subtitle = cell.select('.//p[@class="subTitle"]/text()').extract()
            if not subtitle:
                continue
            # collapse every run of whitespace into a single space
            title = ' '.join(subtitle[0].split())

            title_parts = parse_title_new(title)
            brand = title_parts['brand']
            load_rating = title_parts['load_rating']
            speed_rating = title_parts['speed_rating']
            name = title_parts['name']
            if not (name and brand):
                # an incomplete title is only logged, processing continues
                self.log("++++++++++++++++++++++++++++{}==================".format(title))

            # Map a known spelling of the brand onto its canonical key.
            for canonical, spellings in self.brand_fixes.iteritems():
                if brand.lower() in spellings:
                    brand = canonical
                    break
            brand = brand.title()
            if brand not in self.brand_fixes:
                self.log('Wrong brand %s' % brand)
                continue
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))

            # The price is split between the <h6> text and its <sup> child.
            price_text = product_price = cell.select('.//h6[@class="price"]/text()').extract()[0]
            price_text = product_price + cell.select('.//h6[@class="price"]/sup/text()').extract()[0]
            loader.add_value('price', extract_price(price_text))

            # the identifier is the last path segment of the buy link
            buy_href = cell.select('./a[@class="btnBuy png_bg"]/@href').extract()[0]
            loader.add_value('identifier', buy_href.split('/')[-1])
            loader.add_value('url', '')
            image_srcs = cell.select('.//img[@class="tyreImg"]/@src').extract()
            if image_srcs:
                loader.add_value('image_url', urljoin_rfc(get_base_url(response), image_srcs[0]))

            metadata = MicheldeverMeta()
            metadata['onsite_name'] = title
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = speed_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating

            self.log("===============matching================")
            self.log(str(name))

            # Each filter returns its finding plus the name to carry forward.
            man_mark, name = filter_man_code(name, self.all_man_marks, self.custom_man_marks)
            metadata['manufacturer_mark'] = man_mark
            self.log(str((metadata['manufacturer_mark'], name)))

            has_xl, name = filter_xl(name)
            metadata['xl'] = "Yes" if has_xl else "No"
            self.log(str((metadata['xl'], name)))

            has_run_flat, name = filter_run_flat(name)
            metadata['run_flat'] = "Yes" if has_run_flat else "No"
            self.log(str((metadata['run_flat'], name)))

            self.log("===============/matching===============")

            # drop a trailing opening parenthesis left in the name
            if name.endswith('('):
                name = name[:-1]
            loader.add_value('name', name.strip())

            size_parts = (row['Width'], row['Aspect Ratio'], row['Rim'],
                          load_rating, speed_rating)
            metadata['full_tyre_size'] = '/'.join(size_parts)

            prod = loader.load_item()
            prod['metadata'] = metadata

            if not is_product_correct(prod):
                continue

            prod['metadata']['mts_stock_code'] = find_mts_stock_code(prod, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(prod)
            new_alt_speed = get_alt_speed(prod)
            scraped_speed = prod['metadata']['speed_rating']
            if new_alt_speed:
                alternative = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alternative = scraped_speed
            else:
                alternative = ''
            prod['metadata']['alternative_speed_rating'] = alternative
            prod['metadata']['speed_rating'] = new_speed_rating

            yield prod
Пример #13
0
    def parse(self, response):
        """Parse a single product page into a fitted tyre item.

        Brand, load rating and name are carved out of the page's ``<h1>``
        title around the searched size (``width/aspect_ratioRrim``). When the
        name contains ``www.tyretraders.com`` or the title lacks the searched
        size, the same URL is requested again until ``meta['retry']`` reaches
        10, after which the page is given up on. Otherwise one ``Product``
        with ``MicheldeverMeta`` metadata is yielded, unless
        ``is_product_correct`` rejects it.

        ``response.meta`` must provide ``search_params``, ``formdata`` and
        ``retry``; ``price`` is read from it when present.
        """
        selector = HtmlXPathSelector(response)
        meta = response.meta
        search_params = meta['search_params']
        formdata = meta['formdata']
        loader = ProductLoader(item=Product(), selector=selector)

        raw_title = selector.select(
            '//div[@class="rightpanel"]//h1/text()').extract()[0]
        # collapse every run of whitespace into a single space
        title = ' '.join(raw_title.split())
        tyre_params = "{}/{}R{}".format(search_params['width'],
                                        search_params['aspect_ratio'],
                                        search_params['rim'])

        # text before the size is the brand, text after it starts with the
        # load rating followed by the speed symbol
        before_size, _, after_size = title.partition(tyre_params)
        brand = before_size.strip()
        speed = formdata['speed']
        load_rating = after_size.strip().split(speed)[0].strip()

        name = title.partition('Fuel Effic')[0].replace('~', '').strip()
        size_prefix = '{} {} {}{} '.format(brand, tyre_params, load_rating,
                                           speed)
        name = name.replace(size_prefix, '')

        brand = brand.title()
        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))

        if 'www.tyretraders.com' in name or tyre_params not in title:
            # the page did not carry the expected product: reload it
            meta['retry'] += 1
            if meta['retry'] < 10:
                yield Request(response.url,
                              callback=self.parse,
                              meta=meta,
                              dont_filter=True)
            else:
                self.log('Giving up retrying to reload the product: {}'.format(
                    response.url))
            return

        loader.add_value('price', response.meta.get('price'))
        # the identifier is the part of the URL after the last "|" and
        # before the first "." that follows it, URL-unquoted
        identifier = url_unquote(response.url.split("|")[-1].split(".")[0])
        loader.add_value('identifier', identifier)
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', response.url)
        image_srcs = selector.select(
            '//div[@class="rightpanel"]//img[@style=" max-width:450px;"]/@src'
        ).extract()
        if image_srcs:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_srcs[0]))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = search_params['aspect_ratio']
        metadata['rim'] = search_params['rim']
        metadata['speed_rating'] = search_params['speed_rating']
        metadata['width'] = search_params['width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating

        # both "XL" and "RF" in the name mark the tyre as XL
        found_xl, name = remove_whole_word('XL', name)
        found_rf, name = remove_whole_word('RF', name)
        metadata['xl'] = 'Yes' if (found_xl or found_rf) else 'No'
        found_runflat, name = remove_whole_word('runflat', name)
        metadata['run_flat'] = 'Yes' if found_runflat else 'No'

        # the first known mark found in the name wins
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            found, name = remove_whole_word(code, name)
            if found:
                man_code = man_mark
                break
        metadata['manufacturer_mark'] = man_code

        loader.add_value('name', name)

        size_parts = (metadata['width'], metadata['aspect_ratio'],
                      metadata['rim'], load_rating, metadata['speed_rating'])
        metadata['full_tyre_size'] = '/'.join(size_parts)

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
Пример #14
0
    def parse_product(self, product, fitted, search_params):
        """Build a tyre item from one product element of a listing.

        :param product: selector of the product element.
        :param fitted: truthy to read the "fitted" price (identifier suffix
            ``-F``), falsy to read the "delivered" price (suffix ``-D``).
        :param search_params: dict providing ``width``, ``aspect_ratio``,
            ``rim`` and ``speed_rating`` of the searched size.
        :return: the loaded item with ``MicheldeverMeta`` metadata, or
            ``None`` when the price is missing (logged and appended to
            ``self.errors``) or ``is_product_correct`` rejects the item.
        """
        body_xpath = './/div[@class="mod-item-body"]/h3'
        url = product.select(body_xpath + '//a/@href').extract()[0]
        # the identifier is the last URL segment plus a fitting suffix
        p_id = url.split('/')[-1] + ('-F' if fitted else '-D')
        image_url = product.select('.//div[@class="mod-item-img"]//img/@src').extract()[0]
        brand = product.select(body_xpath + '/text()').extract()[0].strip()

        if fitted:
            price_xpath = './/div[@class="mod-fitted"]/a/text()'
        else:
            price_xpath = './/div[@class="mod-delivered"]/a/text()'
        try:
            price = product.select(price_xpath).extract()[0]
        except IndexError:
            price_error = "Price not found: %s" % str(product)
            self.log(price_error)
            self.errors.append(price_error)
            return

        name = product.select(body_xpath + '/span/a/text()').extract()[0]
        speed_rating = search_params['speed_rating'].upper()
        upper_name = name.upper()

        # Strip the size and the load/speed rating out of the name; if
        # nothing is left, use the whole name instead.
        pattern = re.sub('\d+[^\s]+R\d+', '', name)
        pattern = re.sub('[\d/]+%s' % speed_rating, '', pattern).strip()
        if not pattern:
            pattern = name.strip()
        pattern = pattern.upper()
        for marker in ('XL', 'RFLAT', 'RUNFLAT'):
            pattern = pattern.replace(marker, '')

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', url)
        loader.add_value('identifier', p_id)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('price', price)
        loader.add_value('name', pattern)

        meta = MicheldeverMeta()
        meta['aspect_ratio'] = search_params['aspect_ratio']
        meta['rim'] = search_params['rim']
        meta['width'] = search_params['width']
        meta['speed_rating'] = speed_rating

        # the load rating is the run of digits/slashes right before the
        # speed symbol
        load_match = re.search('([\d/]+)%s' % speed_rating, name)
        if load_match:
            meta['load_rating'] = load_match.group(1)
        else:
            self.log('ERROR: not load rating: %s' % url)
            meta['load_rating'] = ''

        is_run_flat = 'RFLAT' in upper_name or 'RUNFLAT' in upper_name
        meta['run_flat'] = 'Yes' if is_run_flat else 'No'
        meta['xl'] = 'Yes' if 'XL' in upper_name else 'No'

        size_parts = (meta['width'], meta['aspect_ratio'], meta['rim'],
                      meta['load_rating'], meta['speed_rating'])
        meta['full_tyre_size'] = '/'.join(size_parts)

        meta['fitting_method'] = 'Fitted' if fitted else 'Delivered'
        meta['manufacturer_mark'] = self._get_manufacturer_code(name)

        item = loader.load_item()
        item['metadata'] = meta

        if not is_product_correct(item):
            return

        item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        scraped_speed = item['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        item['metadata']['alternative_speed_rating'] = alternative
        item['metadata']['speed_rating'] = new_speed_rating

        return item
Пример #15
0
    def parse_product_cache(self, identifier, price, out_of_stock, product):
        """
        Build a product item from cached data for ``identifier``.

        Name, identifier, sku, url, image_url and brand are copied from
        ``self.products_data[identifier]``; the category is derived from the
        brand and the price is the one passed in. When cached metadata exists
        in ``self.products_metadata`` it is attached, validated and completed
        with the MTS stock code and the speed ratings.

        :param identifier: key into ``self.products_data`` and
            ``self.products_metadata``.
        :param price: price to set on the item.
        :param out_of_stock: when truthy the item's stock is set to 0.
        :param product: passed to ``ProductLoader`` as its selector.
        :return: the loaded item, or ``None`` when the cached metadata makes
            ``is_product_correct`` reject it (the identifier is then recorded
            in ``self.incorrect_identifiers``).

        >>> spider = CamSkillSpider()
        >>> product = {\
                "brand": "Pirelli", \
                "category": 'R16" -  205/55/16, 205/55R16', \
                "identifier": "113764", \
                "image_url": "http://www.camskill.co.uk/smsimg/1943/113764--main--1943.jpg", \
                "metadata": {\
                    "alternative_speed_rating": "", \
                    "aspect_ratio": "55", \
                    "fitting_method": "Delivered", \
                    "full_tyre_size": "205/55/16/91/V", \
                    "load_rating": "91", \
                    "manufacturer_mark": "", \
                    "mts_stock_code": "2055516VPIP7", \
                    "rim": "16", \
                    "run_flat": "No", \
                    "speed_rating": "V", \
                    "width": "205", \
                    "xl": "No"\
                }, \
                "name": "Cinturato P7", \
                "price": "64.40", \
                "sku": None, \
                "stock": "0", \
                "url": "http://www.camskill.co.uk/m62b0s291p113764/Pirelli_Tyres_Car_Pirelli_P7_Cinturato_Pirelli_P_7_-_205_55_R16_91V_TL_Fuel_Eff_%3A_E_Wet_Grip%3A_A_NoiseClass%3A_2_Noise%3A_70dB"\
            }
        >>> spider.products_data['113764'] = product
        >>> product_ = spider.parse_product_cache("113764", 123, False, product)
        >>> product_['metadata']['mts_stock_code']
        '2055516VPIP7CINT'
        """
        loader = ProductLoader(item=Product(), selector=product)
        cached = self.products_data[identifier]
        for col in ('name', 'identifier', 'sku', 'url', 'image_url', 'brand'):
            loader.add_value(col, cached[col])

        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))

        loader.add_value('price', price)
        if out_of_stock:
            loader.add_value('stock', 0)

        product_ = loader.load_item()
        if identifier in self.products_metadata:
            product_['metadata'] = self.products_metadata[identifier]

            if not is_product_correct(product_):
                # Record the cache key itself: ``product`` is handed to the
                # loader as a selector and need not support item access.
                self.incorrect_identifiers.append(identifier)
                return

            product_['metadata']['mts_stock_code'] = find_mts_stock_code(
                product_, spider_name=self.name, log=self.log)

            # Prefer the alternative rating reported by get_alt_speed; failing
            # that, keep the cached rating as the alternative when
            # get_speed_rating replaces it with a different one.
            new_speed_rating = get_speed_rating(product_)
            new_alt_speed = get_alt_speed(product_)
            product_['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                product_['metadata']['speed_rating'] if product_['metadata']['speed_rating'] != new_speed_rating else ''
            product_['metadata']['speed_rating'] = new_speed_rating

        return product_
Пример #16
0
    def parse_products(self, response):
        """Parse one page of tyre results and yield the products found.

        The response body is JSON; its ``display_tyres`` entry holds an HTML
        fragment in which every ``tyre_container`` div describes one tyre.
        Width, aspect ratio and rim are taken from
        ``response.meta['search_params']``; load and speed rating are parsed
        from the tyre details paragraph.

        Containers without the brand/name span or without the details
        paragraph are logged and skipped, so one malformed entry no longer
        aborts the whole page (and the pagination request) with an
        ``IndexError``.

        Yields:
            ``Product`` items carrying a ``MicheldeverMeta`` under
            ``'metadata'`` and, while the page contained at least one tyre
            container, a ``Request`` for page ``response.meta['page'] + 1``.
        """
        html_response = json.loads(response.body)['display_tyres']
        hxs = HtmlXPathSelector(text=html_response)

        search_params = response.meta['search_params']

        products = hxs.select('//div[contains(@class, "tyre_container")]')

        for product_el in products:
            # NOTE(review): this query runs against the whole fragment with an
            # absolute path rather than against product_el, so its result is
            # the same for every container - confirm it really detects
            # winter tyres. Left unchanged on purpose.
            winter_tyre = hxs.select(
                '/div/div/div[@class="winter_img"]').extract()
            if winter_tyre:
                continue

            # One query serves both values: the first text node is the brand,
            # the last one is the tyre name.
            brand_texts = product_el.select(
                './/form/span[@class="tyre_brand_text"]/text()').extract()
            if not brand_texts:
                self.log('Skipping tyre without brand/name text on %s'
                         % response.url)
                continue
            brand = brand_texts[0]
            full_name = brand_texts[-1]

            tyre_details = product_el.select(
                './/form/p[@class="tyre_details"]/text()').extract()
            if not tyre_details:
                self.log('Skipping tyre without details text on %s'
                         % response.url)
                continue

            # Replace the scraped brand with the spelling used in self.brands
            # when they only differ by case / surrounding whitespace.
            brand_key = brand.strip().upper()
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand_key:
                    brand = tyre_brand

            loader = ProductLoader(item=Product(), selector=product_el)
            loader.add_value('name', full_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.select(
                './/input[@name="tyre"]/@value').extract()
            loader.add_value('identifier', identifier)

            loader.add_value('url', 'http://www.tyregiant.com')

            image_url = product_el.select(
                './/img[@class="tyre_image"]/@src').extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_url[0]))

            price = product_el.select(
                './/*[@class="tyre_price"]/span/text()').extract()
            if not price:
                loader.add_value('stock', 0)
            loader.add_value('price', price)

            # The first whitespace-delimited "<digits><word chars>" token of
            # the details text is split into load rating (all but the last
            # character) and speed rating (the last character).
            rating_match = re.search(r'(\s\d+\w+\s)', tyre_details[0])
            rating = rating_match.group().strip() if rating_match else ''
            load_rating = rating[:-1]
            speed_rating = rating[-1:]

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = search_params['aspect_ratio']
            metadata['rim'] = search_params['rim']
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating
            metadata['width'] = search_params['width']
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            xl = product_el.select(
                './/img[@class="xl_img"]/@src').extract()
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat = product_el.select(
                './/img[@class="rf_img"]/@src').extract()
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)
            metadata['full_tyre_size'] = '/'.join(
                (search_params['width'], search_params['aspect_ratio'],
                 search_params['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating as the alternative when the lookup replaced it.
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating
            yield product

        # Keep paging for as long as a page still contains tyre containers.
        if products:
            meta = response.meta
            next_page = meta['page'] + 1
            next_url = 'http://www.tyregiant.com/update-tyres/%s' % str(
                next_page)
            meta['page'] = next_page
            yield Request(next_url,
                          dont_filter=True,
                          callback=self.parse_products,
                          meta=meta)
Пример #17
0
    def parse_list(self, response):
        """Yield a product for every non-winter tyre tile on a results page.

        Identifiers come from ``parse_identifiers`` applied to the page's
        ``__VIEWSTATE`` value and are consumed one per tile, in page order.
        Size, rating and name are read from the lines of the tile title.
        After the tiles are processed, everything produced by
        ``self.next_search()`` is yielded as well.
        """
        # Set the spider attribute named by meta['thread'] to True.
        setattr(self, response.meta.get('thread'), True)
        hxs = HtmlXPathSelector(response)
        viewstate = hxs.select(
            '//input[@name="__VIEWSTATE"]/@value').extract()[0]
        identifiers = parse_identifiers(viewstate)

        tiles = hxs.select(
            '//div[@class="main-list"]//div[@class="group conti-box"]')
        for tile in tiles:
            # An identifier is consumed for every tile, including the ones
            # skipped below.
            identifier = identifiers.pop(0)
            flags = tile.select(
                './/span[@class="blue"]//div/text()').extract()
            # skip winter tyres
            if 'WINTER' in flags:
                continue

            loader = ProductLoader(item=Product(), selector=tile)

            # Title lines: "<width>/<ratio>R", rim, load+speed rating, ...,
            # with the tyre name on the last line.
            raw_title = tile.select(
                './/div[@class="conti-gray"]/text()').extract()[0]
            title_lines = raw_title.strip().split('\r\n')
            name = title_lines[-1].strip()
            size_parts = title_lines[0].split("/")
            width = size_parts[0].strip()
            ratio = size_parts[1].replace("R", "").strip()
            rim = title_lines[1].strip()
            rating = title_lines[2].strip()

            rating_match = re.search(
                r"((?:\d{1,3}/)*(?:\d{1,3}))([A-Z]{1,2}\d?)", rating)
            load_rating = rating_match.group(1) if rating_match else ''
            speed_rating = rating_match.group(2) if rating_match else ''

            maker = tile.select(
                './/div[@class="black-conti"]/text()').extract()[0]
            maker = maker.strip().title()
            maker = 'BFG' if 'bfg' in maker.lower() else maker
            loader.add_value('brand', unify_brand(maker))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            price_text = tile.select(
                './/h4[@class="prc"]/text()').extract()[0]
            loader.add_value('price', extract_price(price_text))
            loader.add_value('identifier', identifier)
            loader.add_value('url', '')

            image_srcs = tile.select(
                './/div[@class="sec-img"]/img/@src').extract()
            if image_srcs:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_srcs[0]))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            metadata['xl'] = 'Yes' if 'REINFORCED' in flags else 'No'
            metadata['run_flat'] = 'Yes' if 'RUN FLAT' in flags else 'No'

            # Look for a manufacturer mark in the name: first through
            # cut_name() with the regular marks, then as a plain suffix using
            # the custom marks. The name is rebound on every cut_name() call.
            man_code = ''
            for code, mark in self.all_man_marks.iteritems():
                was_cut, name = cut_name(code, name)
                if was_cut:
                    man_code = mark
                    break
            if not man_code:
                for code, mark in self.custom_man_marks.iteritems():
                    if name.endswith(code):
                        name = name.partition(code)[0]
                        man_code = mark
                        break
            metadata['manufacturer_mark'] = man_code

            loader.add_value('name', name)
            size_fields = (metadata['width'], metadata['aspect_ratio'],
                           metadata['rim'], load_rating, speed_rating)
            metadata['full_tyre_size'] = '/'.join(size_fields)

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Alternative rating: the looked-up one if any, else the scraped
            # rating when it differs from the looked-up speed rating.
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_rating = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_rating = scraped_speed
            else:
                alt_rating = ''
            product['metadata']['alternative_speed_rating'] = alt_rating
            product['metadata']['speed_rating'] = new_speed_rating

            yield product

        for follow_up in self.next_search():
            yield follow_up
Пример #18
0
    def parse_products(self, response):
        """Parse a product listing page and yield the tyres found on it.

        The searched size (``Width``, ``Aspect Ratio``, ``Rim``,
        ``Speed rating``, ``Alt Speed``) is read from
        ``response.meta['product_data']`` and used to build the regular
        expressions that split a product title into brand, load rating and
        pattern name. Titles containing a parenthesised part are split by
        words instead. Titles that cannot be parsed are logged and skipped,
        as are entries without a title link or title text.

        Yields:
            ``Product`` items carrying a ``MicheldeverMeta`` under
            ``'metadata'`` and, when a "next" link exists, a ``Request`` for
            the following page with the same meta.
        """
        hxs = HtmlXPathSelector(response)

        product_data = response.meta['product_data']
        width = product_data['Width']
        aspect_ratio = product_data['Aspect Ratio']
        rim = product_data['Rim']
        speed_rating = product_data['Speed rating']
        alt_speed = product_data['Alt Speed']
        speed_upper = speed_rating.upper()
        alt_upper = alt_speed.upper()

        # Tried in order: main speed rating, alternative speed rating, and
        # finally a pattern without a rating (brand + remainder only).
        name_patterns = (
            r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, speed_upper),
            r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, alt_upper),
            r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim),
        )

        def log_failure(tyre_name):
            # Report an unparsable title together with the search context.
            self.log('Failed parsing ' + tyre_name)
            self.log('URL: ' + response.url)
            self.log('Params: ' + ", ".join(map(str, [width, rim, speed_upper])))

        products = hxs.select('//div[@id="product-listing"]//div[@class="product"]/..')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            # Entries without a title link are not products; skip them
            # explicitly instead of relying on a catch-all except clause.
            urls = product_el.select('.//div[@class="title"]/a/@href').extract()
            if not urls:
                continue
            loader.add_value('url', urls[0])
            loader.add_value('identifier',
                             product_el.select(".//span[@class='addcompare']/input/@id").extract()[0].split(":")[1])
            loader.add_xpath('price', './/span[@class="prodPirce"]/text()')

            names = product_el.select('.//div[@class="title"]/a/text()').extract()
            if not names:
                continue
            name = names[0]

            if not re.search(r'(\(.*\))', name):
                match = None
                for pattern in name_patterns:
                    match = re.search(pattern, name)
                    if match:
                        break
                if not match:
                    log_failure(name)
                    continue
                name_parts = match.groups()
            else:
                # Titles with a parenthesised part: brand is the first word,
                # the load rating is the number in front of the speed rating
                # and the pattern name is everything between the brand and
                # the opening parenthesis.
                load_match = (re.search(r'(\d+)%s' % speed_upper, name) or
                              re.search(r'(\d+)%s' % alt_upper, name))
                if not load_match:
                    log_failure(name)
                    continue
                words = name.split()
                name_parts = [words[0],
                              load_match.group(1),
                              ' '.join(words[1:]).split('(')[0]]

            loader.add_value('name', name_parts[-1].replace('XL', '').replace('ROF', '').replace('RFT', ''))
            brand = name_parts[0]
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_xpath('image_url', './/a[contains(@class, "tyre")]/img/@src')

            name_upper = name.upper()
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['width'] = width
            metadata['speed_rating'] = speed_upper
            metadata['load_rating'] = name_parts[1]
            metadata['run_flat'] = 'Yes' if 'ROF' in name_upper or 'RFT' in name_upper else 'No'
            metadata['xl'] = 'Yes' if 'XL' in name_upper else 'No'
            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   metadata['load_rating'],
                                                   metadata['speed_rating']))
            metadata['fitting_method'] = 'Fitted'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(name_parts[-1])

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                self.log('The product is not correct: %r' % product)
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # searched rating as the alternative when the lookup replaced it.
            searched_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_rating = new_alt_speed
            elif searched_speed != new_speed_rating:
                alt_rating = searched_speed
            else:
                alt_rating = ''
            product['metadata']['alternative_speed_rating'] = alt_rating
            product['metadata']['speed_rating'] = new_speed_rating

            yield product

        next_page = hxs.select('//span[@class="nextlink"]/a/@href')
        if next_page:
            yield Request(next_page.extract()[0], callback=self.parse_products, meta=response.meta)
Пример #19
0
    def parse(self, response):
        """Parse a tyre listing page and yield every non-winter tyre on it.

        Each ``tyre-list-tyre`` div becomes one ``Product`` with a
        ``MicheldeverMeta`` under ``'metadata'``. Width, aspect ratio, speed
        rating and rim are parsed from the "tyre size ..." text of the tile.

        Tiles without an order identifier, or whose size text is missing or
        does not match the expected format, are logged and skipped so that a
        single malformed tile no longer aborts the whole page with an
        ``IndexError`` / ``AttributeError``.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')

        for product_el in products:
            # skip winter tyres
            if product_el.select('div//img[@alt="Winter Tyre"]'):
                continue

            identifiers = product_el.select('div//div[@class="pricingAddToOrder clearfix"]/input/@value').extract()
            if not identifiers:
                self.log('Skipping tyre without identifier on %s' % response.url)
                continue

            size_texts = product_el.select('.//div[contains(@class, "manufacturerText")]/p/span/text()').extract()
            size_match = None
            if size_texts:
                size_match = re.search(r'tyre size (\d+)\/(\d+)(\w{1})(\d+)', size_texts[0].strip(), re.I)
            if not size_match:
                self.log('Skipping tyre with missing or unrecognised size text on %s' % response.url)
                continue
            width, aspect, speed_rating, rim = size_match.groups()

            loader = ProductLoader(item=Product(), selector=product_el)
            loader.add_xpath('name', 'div//div[@class="manufacturerText"]/p/strong/text()')
            # The brand is the part of the logo's alt text before " - ".
            brand = ''.join(product_el.select('div//div[@class="manufacturerImage"]/img/@alt').extract()).split(' - ')[0]
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', '')

            image_url = product_el.select('div[@class="image"]/img/@src').extract()
            if image_url:
                loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))

            loader.add_value('identifier', identifiers[0])
            price = product_el.select('div//div[contains(@class, "pricingSelection")]//a/strong/text()').extract()
            price = re.findall(r"\d+.\d+", price[0]) if price else '0.0'
            loader.add_value('price', price)

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            load_rating = product_el.select('div//li/a[@rel="load-index-description"]/text()').extract()
            # The load index is whatever follows the last ": " of the text.
            metadata['load_rating'] = load_rating[0].split(': ')[-1] if load_rating else ''
            metadata['alternative_speed_rating'] = ''
            xl = product_el.select('div//img[@title="Reinforced"]/@title').extract()
            metadata['xl'] = 'Yes' if xl else 'No'

            run_flat = product_el.select('div//img[@title="Run Flat"]').extract()
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # Reduce "Homologated for fitment to certain <make> cars." to the
            # make before resolving it with find_man_mark().
            manufacturer_mark = product_el.select('div//img[contains(@title, "Homologated for fitment to certai")]/@title').extract()
            manufacturer_mark = manufacturer_mark[0].replace('Homologated for fitment to certain ', '').replace(' cars.', '') if manufacturer_mark else ''
            metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   metadata['load_rating'],
                                                   metadata['speed_rating']))

            # A separate name keeps the loaded item apart from the selector
            # that the loop iterates over.
            item = loader.load_item()
            item['metadata'] = metadata

            if not is_product_correct(item):
                continue

            item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(item)
            new_alt_speed = get_alt_speed(item)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating as the alternative when the lookup replaced it.
            scraped_speed = item['metadata']['speed_rating']
            if new_alt_speed:
                alt_rating = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_rating = scraped_speed
            else:
                alt_rating = ''
            item['metadata']['alternative_speed_rating'] = alt_rating
            item['metadata']['speed_rating'] = new_speed_rating

            yield item
Пример #20
0
    def parse_products(self, response):
        """Parse the JSON product list and yield every non-winter tyre.

        The response body is a JSON object whose ``d`` entry is itself a JSON
        document containing the list of tyre dicts. Tyre attributes are read
        from each dict's ``ProductAttributes``; the brand and tread name come
        from ``ProductManufacturer`` / ``ProductTreadPattern`` and fall back
        to ``''`` when those entries are missing or not subscriptable.

        Yields:
            ``Product`` items carrying a ``MicheldeverMeta`` under
            ``'metadata'``.
        """
        json_data = json.loads(response.body)
        products = json.loads(json_data.get('d'))

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            attributes = product_el[u'ProductAttributes']

            # Only a missing key or a non-dict value is treated as "no
            # brand"; anything else is a real error and must surface.
            try:
                brand = product_el[u'ProductManufacturer'][
                    u'TyreManufacturerName']
            except (KeyError, TypeError):
                brand = ''

            # skip winter tyres
            if attributes[u'IsWinter']:
                continue

            # Replace the reported brand with the spelling used in
            # self.brands when they only differ by case / whitespace.
            brand_key = brand.strip().upper()
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand_key:
                    brand = tyre_brand

            try:
                full_name = product_el[u'ProductTreadPattern'][u'TreadName']
            except (KeyError, TypeError):
                full_name = ''
            # Fix name changes
            if full_name in self.new_old_names:
                full_name = self.new_old_names[full_name]

            loader.add_value('name', full_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.get('TyreID')
            loader.add_value('url', 'http://www.tyresonthedrive.com')
            image_url = 'http://www.tyresonthedrive.com/img/treads/' + product_el[
                u'ProductTreadPattern'][u'TreadPatternImage'] + '.jpg'
            loader.add_value('image_url', image_url)
            loader.add_value('identifier', identifier)

            price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
            if not price:
                loader.add_value('stock', 0)
            loader.add_value('price', price)

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = str(attributes[u'Profile'])
            metadata['rim'] = str(attributes[u'Rim'])
            metadata['speed_rating'] = str(attributes[u'Speed'])
            metadata['load_rating'] = str(attributes[u'Load'])
            metadata['width'] = str(attributes[u'Section'])
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if attributes[u'IsExLoad'] else 'No'
            metadata['run_flat'] = 'Yes' if attributes[u'IsRunFlat'] else 'No'

            man_mark = attributes[u'OEMFitment']
            metadata['manufacturer_mark'] = find_man_mark(
                man_mark) if man_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # reported rating as the alternative when the lookup replaced it.
            reported_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_rating = new_alt_speed
            elif reported_speed != new_speed_rating:
                alt_rating = reported_speed
            else:
                alt_rating = ''
            product['metadata']['alternative_speed_rating'] = alt_rating
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #21
0
    def parse_product(self, response):
        """Build a single product from a product detail page.

        When the form action (used as identifier) or the producer name is
        missing, the result of ``self.retry_request(response)`` is yielded
        and parsing stops. The page title, with the brand removed, is handed
        to ``parse_pattern``; if that fails the error is logged, recorded in
        ``self.errors`` and nothing is yielded. Otherwise one ``Product``
        with a ``MicheldeverMeta`` under ``'metadata'`` is yielded, provided
        it passes ``is_product_correct``.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        # The full tyre title (product_name) is used to derive metadata such
        # as run flat / XL; the parsed pattern becomes the product's name.

        loader.add_value('url', response.url)

        image_srcs = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_srcs:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_srcs[0]))

        form_actions = hxs.select('//form[@name="form1"]/@action').extract()
        if not form_actions:
            yield self.retry_request(response)
            return
        loader.add_value('identifier', form_actions[0])

        price_text = hxs.select(
            '//*[@class="price"]/*[@class="mainPrice"]/text()').extract()[0]
        loader.add_value('price', price_text)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        producer_names = hxs.select(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
        if not producer_names:
            yield self.retry_request(response)
            return
        brand = producer_names[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        # U+0119 is replaced by a plain "e" before the brand is removed from
        # the title below.
        brand = re.sub(u'\u0119', u'e', brand)

        title = hxs.select(
            '//h1[@itemprop="name"]/text()').extract()[0].strip()
        title = re.sub(u'[:\u2122]', u'', title)
        product_name = title.replace(brand, '').strip()

        data = parse_pattern(product_name)
        if not data:
            message = 'ERROR parsing "{}" [{}]'.format(product_name,
                                                       response.url)
            log.msg(message)
            self.errors.append(message)
            return

        loader.add_value('name', data['Name'])

        lowered_name = product_name.lower()
        is_run_flat = ('run on flat' in lowered_name
                       or 'run flat' in lowered_name)

        # The first known mark that appears as a space-separated token of
        # the title wins; it is stripped before being looked up again.
        name_tokens = product_name.split(' ')
        found_marks = [
            mark for mark in self.all_man_marks.keys()
            if mark in name_tokens
        ]
        man_mark = ''
        if found_marks:
            mark_key = found_marks[0].strip()
            if mark_key:
                man_mark = self.all_man_marks.get(mark_key, '')

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in product_name else 'No'
        metadata['run_flat'] = 'Yes' if is_run_flat else 'No'
        metadata['manufacturer_mark'] = man_mark
        size_fields = (metadata['width'], metadata['aspect_ratio'],
                       metadata['rim'], metadata['load_rating'],
                       metadata['speed_rating'])
        metadata['full_tyre_size'] = '/'.join(size_fields)

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        # Alternative rating: the looked-up one if any, else the parsed
        # rating when it differs from the looked-up speed rating.
        parsed_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alt_rating = new_alt_speed
        elif parsed_speed != new_speed_rating:
            alt_rating = parsed_speed
        else:
            alt_rating = ''
        product['metadata']['alternative_speed_rating'] = alt_rating
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
Пример #22
0
    def parse(self, response):
        """Parse a results page: follow pagination links and yield tyres.

        Every link found in a paragraph containing "Page" is requested with
        the current ``response.meta`` (no explicit callback is passed). Each
        ``results`` div is then turned into a ``Product`` with a
        ``MicheldeverMeta`` under ``'metadata'``; winter tyres, entries whose
        text ``parse_pattern`` cannot parse and products rejected by
        ``is_product_correct`` are skipped.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[@class="results"]')

        pages = hxs.select('//p[contains(text(),"Page")]//a/@href').extract()
        for page in pages:
            yield Request(urljoin(get_base_url(response), page),
                          meta=response.meta)

        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
            # the pattern should be set as the product's name
            name = ' '.join(
                map(
                    unicode.strip,
                    product.select('.//div[@class="resultsLeft"]/div'
                                   '//text()[normalize-space()]').extract()))
            # NOTE(review): `name += name + ...` appends the text gathered
            # above to itself before adding the t_size text, so that text
            # ends up in `name` twice - confirm this is intended.
            name += name + ' %s' % ' '.join(
                map(
                    unicode.strip,
                    product.select(
                        './/div[@class="t_size"]//text()[normalize-space()]').
                    extract()))
            loader.add_xpath(
                'name',
                './/div[@class="resultsLeft"]/div//a/i/b/text()[normalize-space()]'
            )
            brand = product.select(
                './/div[@class="resultsLeft"]/div/b//text()[normalize-space()]'
            ).extract()[0].strip()

            # skip winter tyres
            if product.select(
                    './/img[contains(@alt,"Winter / cold weather tyres")]'):
                continue
            # Also skipped: any image whose alt text contains "Wi" or whose
            # src contains "/simg/hiver.png".
            if product.select(
                    './/img[contains(@alt,"Wi") or contains(@src,"/simg/hiver.png")]'
            ):
                continue
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            fitting_method = 'Fitted'

            # Product link: made absolute, with any cart_id parameter value
            # removed from the URL.
            url = product.select('.//a[i[b]]/@href')[0].extract()
            url = urljoin(get_base_url(response), url)
            url = re.sub('cart_id=[^&]*', '', url)
            loader.add_value('url', url)

            image_url = product.select(
                './/a/img[@align="left"]/@src').extract()
            if image_url:
                loader.add_value('image_url',
                                 urljoin(get_base_url(response), image_url[0]))

            # The identifier is the first value of the `typ` query parameter
            # of the product URL.
            identifier = urlparse.parse_qs(
                urlparse.urlparse(url).query)['typ'][0]
            loader.add_value('identifier', identifier)
            price = ''.join(
                product.select(
                    './/div[@class="price"]/font/b//text()[normalize-space()]'
                ).extract())
            # The price is passed on as the list of regex matches found in
            # the price text, or as the string '0.0' when the text is empty.
            price = re.findall(r"\d+.\d+", price) if price else '0.0'
            loader.add_value('price', price)

            data = parse_pattern(name)
            if not data:
                #log.msg("ERROR %s [%s]" % (name, response.url))
                #self.errors.append("Error parsing: %s. URL: %s" % (name, response.url))
                continue

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating']

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating']
            metadata['alternative_speed_rating'] = ''
            xl = 'XL' in name
            metadata['xl'] = 'Yes' if xl else 'No'

            run_flat = 'run flat' in name.lower() or 'runflat' in name.lower()
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            # The manufacturer mark is the @name of a t_size link whose
            # onmouseover text mentions "Original", "BMW" or "Porsche".
            manufacturer_mark = product.select(
                './/div[@class="t_size"]/b/a[contains(@onmouseover,"Original") or '
                'contains(@onmouseover,"BMW") or contains(@onmouseover,"Porsche")]'
                '/@name[normalize-space()]').extract()
            manufacturer_mark = manufacturer_mark[0].strip(
            ) if manufacturer_mark else []
            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark) if manufacturer_mark else ''
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))

            # NOTE(review): from here on the loop variable `product` is
            # rebound from the selector to the loaded item.
            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Alternative rating: the looked-up one if any, else the parsed
            # rating when it differs from the looked-up speed rating.
            product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating

            yield product