Пример #1
0
    def parse_product(self, response):
        """Scrape one tyre product page and yield it as a product item.

        Pages whose ``<h1>`` heading contains "winter" are ignored.  Brand and
        name come from the heading, ratings and size from the specification
        table.  When the size cannot be parsed the problem is logged, recorded
        in ``self.errors`` and nothing is yielded.  Products rejected by
        ``is_product_correct`` are dropped as well.

        :param response: the downloaded product page.
        :returns: a generator yielding at most one product.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        title = hxs.select('//h1/text()')[0].extract()
        if 'winter' in title.lower():
            return

        # Keep only the part of the heading captured before its final
        # "-"-separated segment.
        title = re.search('(.*)-[^-]+', title).group(1)
        brand = title.split(' ')[0]
        price = hxs.select('//td[@class="price"]/text()')[0].extract()

        # fix wrong product: headings starting with "R27" are Toyo tyres and
        # keep the full heading as their name.
        if brand.strip() == 'R27':
            name = title
            brand = 'Toyo'
        else:
            name = title.replace(brand, '')
        loader.add_value('name', name.replace('XL', '').replace('RF', ''))

        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath(
            'identifier', '//input[@id="product_reference"]/@value')
        image_src = hxs.select('//img[@class="productImg"]/@src')[0].extract()
        loader.add_value(
            'image_url', urljoin(get_base_url(response), image_src))

        def spec_value(label):
            # Second cell of the specification row whose first cell is label.
            xpath = "//tr[td/strong[text()='%s']]/td[2]/text()" % label
            return hxs.select(xpath).extract()[0]

        speed_rating = spec_value('Speed:')
        # The load cell may repeat the speed symbol; strip it out.
        load_rating = spec_value('Load:').replace(speed_rating, "")
        size = spec_value('Size:')

        width, aspect_ratio, _, rim = parse_tyre_size(size)
        if not width:
            msg = "Error parsing '%s' on page %s" % (size, response.url)
            self.log(msg)
            self.errors.append(msg)
            return

        upper_title = title.upper()
        m = MicheldeverMeta()
        m['aspect_ratio'] = aspect_ratio
        m['rim'] = rim
        m['width'] = width
        m['speed_rating'] = speed_rating.upper()
        m['load_rating'] = load_rating
        # NOTE(review): these are plain substring tests, so "RF"/"XL" inside a
        # longer word (e.g. "PERFORMANCE") also counts - confirm intended.
        m['run_flat'] = 'Yes' if 'RF' in upper_title else 'No'
        m['xl'] = 'Yes' if 'XL' in upper_title else 'No'
        m['full_tyre_size'] = '/'.join((
            m['width'],
            m['aspect_ratio'],
            m['rim'],
            m['load_rating'],
            m['speed_rating'],
        ))
        m['fitting_method'] = 'Fitted'
        m['manufacturer_mark'] = self._get_manufacturer_code(title)

        product = loader.load_item()
        product['metadata'] = m

        if not is_product_correct(product):
            return

        meta = product['metadata']
        meta['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        # Prefer the looked-up alternative rating; otherwise keep the scraped
        # rating as the alternative when the looked-up main rating differs.
        if new_alt_speed:
            alternative = new_alt_speed
        elif meta['speed_rating'] != new_speed_rating:
            alternative = meta['speed_rating']
        else:
            alternative = ''
        meta['alternative_speed_rating'] = alternative
        meta['speed_rating'] = new_speed_rating

        yield product
Пример #2
0
    def parse(self, response):
        """Yield the non-winter tyre products found on a listing page.

        Every matching tyre ``<div>`` is turned into a product with its
        metadata.  Entries carrying a "Winter Tyre" image are skipped, as are
        products rejected by ``is_product_correct``.

        :param response: the downloaded listing page.
        :returns: a generator of products.
        """
        hxs = HtmlXPathSelector(response)

        tyre_nodes = hxs.select('//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')

        for node in tyre_nodes:
            loader = ProductLoader(item=Product(), selector=node)
            loader.add_xpath('name', 'div//div[@class="manufacturerText"]/p/strong/text()')
            logo_alt = ''.join(node.select('div//div[@class="manufacturerImage"]/img/@alt').extract())
            brand = logo_alt.split(' - ')[0]

            # skip winter tyres
            if node.select('div//img[@alt="Winter Tyre"]'):
                continue

            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            identifier = node.select('div//div[@class="pricingAddToOrder clearfix"]/input/@value').extract()[0]

            # The listing has no per-product page, so the URL is left empty.
            loader.add_value('url', '')

            image_url = node.select('div[@class="image"]/img/@src').extract()
            if image_url:
                loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))

            loader.add_value('identifier', identifier)

            price_text = node.select('div//div[contains(@class, "pricingSelection")]//a/strong/text()').extract()
            if price_text:
                price = re.findall(r"\d+.\d+", price_text[0])
            else:
                price = '0.0'
            loader.add_value('price', price)

            tyresize_text = node.select('.//div[contains(@class, "manufacturerText")]/p/span/text()').extract()[0].strip()
            size_match = re.search(r'tyre size (\d+)\/(\d+)(\w{1})(\d+)', tyresize_text, re.I)
            width, aspect, speed_rating, rim = size_match.groups()

            load_rating = node.select('div//li/a[@rel="load-index-description"]/text()').extract()
            reinforced = node.select('div//img[@title="Reinforced"]/@title').extract()
            run_flat = node.select('div//img[@title="Run Flat"]').extract()

            homologation = node.select('div//img[contains(@title, "Homologated for fitment to certai")]/@title').extract()
            if homologation:
                # Reduce the image title to the car make(s) it names.
                mark_text = homologation[0].replace('Homologated for fitment to certain ', '').replace(' cars.', '')
            else:
                mark_text = ''

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating[0].split(': ')[-1] if load_rating else ''
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if reinforced else 'No'
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            metadata['manufacturer_mark'] = find_man_mark(mark_text) if mark_text else ''
            metadata['full_tyre_size'] = '/'.join((
                metadata['width'],
                metadata['aspect_ratio'],
                metadata['rim'],
                metadata['load_rating'],
                metadata['speed_rating'],
            ))

            item = loader.load_item()
            item['metadata'] = metadata

            if not is_product_correct(item):
                continue

            item_meta = item['metadata']
            item_meta['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(item)
            new_alt_speed = get_alt_speed(item)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating when the looked-up main rating differs from it.
            if new_alt_speed:
                alternative = new_alt_speed
            elif item_meta['speed_rating'] != new_speed_rating:
                alternative = item_meta['speed_rating']
            else:
                alternative = ''
            item_meta['alternative_speed_rating'] = alternative
            item_meta['speed_rating'] = new_speed_rating

            yield item
Пример #3
0
    def parse(self, response):
        """Yield the non-winter tyre products of a JSON search response.

        The response body is decoded as a JSON list of tyre records.  Width,
        aspect ratio and rim are taken from ``response.meta['row']`` rather
        than from the records.  Records flagged as winter tyres and products
        rejected by ``is_product_correct`` are skipped.

        :param response: JSON response; ``response.meta['row']`` must provide
            the ``'Width'``, ``'Aspect Ratio'`` and ``'Rim'`` values.
        :returns: a generator of products.
        """
        base_url = get_base_url(response)
        row = response.meta['row']
        products = json.loads(response.body_as_unicode())
        for product_el in products:
            # skip winter tyres
            if product_el['winter'] != '0':
                continue
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el['tyreMake'].title()
            if 'goodrich' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            load_rating = product_el['loadrating']
            speed_rating = product_el['tyreSpeed']
            loader.add_value('price', product_el['priceVat'])
            loader.add_value('identifier', product_el['id'])
            loader.add_value(
                'url',
                urljoin('http://www.etyres.co.uk/tyre-detail/',
                        product_el['URLString']))
            if product_el['tyreModelImage2']:
                # The prefixed path is never empty, so no second check needed.
                image_url = 'images/' + product_el['tyreModelImage2']
                loader.add_value('image_url', urljoin(base_url, image_url))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = speed_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            metadata['xl'] = (
                'Yes' if product_el['tyreReinforced'] == 'T' else 'No')
            metadata['run_flat'] = (
                'Yes' if product_el['runflat'] == '1' else 'No')

            # Find the manufacturer mark embedded in the model name and cut it
            # out: first via the general marks, then via the custom suffixes.
            name = product_el['tyreModel']
            man_code = ''
            for code, man_mark in self.all_man_marks.iteritems():
                result, name = cut_name(code, name)
                if result:
                    man_code = man_mark
                    break
            if not man_code:
                for code, man_mark in self.custom_man_marks.iteritems():
                    if name.endswith(code):
                        # Drop only the trailing code: rpartition splits at the
                        # LAST occurrence, so an earlier occurrence of the same
                        # text inside the name no longer truncates the name.
                        name = name.rpartition(code)[0]
                        man_code = man_mark
                        break
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
                 speed_rating))
            name = name.replace(' EXTRA LOAD', '')
            name = name.replace(' RUNFLAT', '')

            loader.add_value('name', name.strip())

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating when the looked-up main rating differs from it.
            if new_alt_speed:
                alternative = new_alt_speed
            elif product['metadata']['speed_rating'] != new_speed_rating:
                alternative = product['metadata']['speed_rating']
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #4
0
    def parse_list(self, response):
        """Yield the non-winter tyres of a result list, then further searches.

        Product identifiers are not present in the visible markup; they are
        decoded from the page's ``__VIEWSTATE`` field and matched to the
        product elements by position.  After the products, everything produced
        by ``self.next_search()`` is yielded.

        :param response: the downloaded list page; ``response.meta['thread']``
            names an attribute of the spider that is set to ``True``.
        :returns: a generator of products followed by the ``next_search()``
            results.
        """
        # presumably signals that this search thread has finished - confirm
        setattr(self, response.meta.get('thread'), True)
        hxs = HtmlXPathSelector(response)
        vs_data = hxs.select(
            '//input[@name="__VIEWSTATE"]/@value').extract()[0]
        identifiers = parse_identifiers(vs_data)

        products = hxs.select(
            '//div[@class="main-list"]//div[@class="group conti-box"]')
        for product_el in products:
            # Consume one identifier per element BEFORE any skip, so that the
            # identifiers stay aligned with the remaining elements.
            identifier = identifiers.pop(0)
            specif = product_el.select(
                './/span[@class="blue"]//div/text()').extract()
            # skip winter tyres
            if 'WINTER' in specif:
                continue
            loader = ProductLoader(item=Product(), selector=product_el)
            title = product_el.select(
                './/div[@class="conti-gray"]/text()').extract()[0]
            # The text block holds "<width>/<ratio>R", rim, ratings and the
            # model name on separate lines.
            title = title.strip().split('\r\n')
            name = title[-1].strip()
            width = title[0].split("/")[0].strip()
            ratio = title[0].split("/")[1].replace("R", "").strip()
            rim = title[1].strip()
            rating = title[2].strip()
            results = re.search(r"((?:\d{1,3}/)*(?:\d{1,3}))([A-Z]{1,2}\d?)",
                                rating)
            if results:
                load_rating = results.group(1)
                speed_rating = results.group(2)
            else:
                load_rating = speed_rating = ''
            brand = product_el.select(
                './/div[@class="black-conti"]/text()').extract()[0].strip()
            brand = brand.title()
            if 'bfg' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            price = product_el.select(
                './/h4[@class="prc"]/text()').extract()[0]
            loader.add_value('price', extract_price(price))
            loader.add_value('identifier', identifier)
            loader.add_value('url', '')
            image_url = product_el.select(
                './/div[@class="sec-img"]/img/@src').extract()
            if image_url:
                loader.add_value('image_url',
                                 urljoin(get_base_url(response), image_url[0]))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating

            metadata['xl'] = 'Yes' if 'REINFORCED' in specif else 'No'
            metadata['run_flat'] = 'Yes' if 'RUN FLAT' in specif else 'No'

            # Find the manufacturer mark embedded in the model name and cut it
            # out: first via the general marks, then via the custom suffixes.
            man_code = ''
            for code, man_mark in self.all_man_marks.iteritems():
                result, name = cut_name(code, name)
                if result:
                    man_code = man_mark
                    break
            if not man_code:
                for code, man_mark in self.custom_man_marks.iteritems():
                    if name.endswith(code):
                        # Drop only the trailing code: rpartition splits at the
                        # LAST occurrence, so an earlier occurrence of the same
                        # text inside the name no longer truncates the name.
                        name = name.rpartition(code)[0]
                        man_code = man_mark
                        break
            metadata['manufacturer_mark'] = man_code

            loader.add_value('name', name)
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 load_rating, speed_rating))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating when the looked-up main rating differs from it.
            if new_alt_speed:
                alternative = new_alt_speed
            elif product['metadata']['speed_rating'] != new_speed_rating:
                alternative = product['metadata']['speed_rating']
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
        for x in self.next_search():
            yield x
Пример #5
0
    def extract_products(self, response):
        """Yield the non-winter tyre products found on a listing page.

        Each product box provides a brand, a free-text "pattern" (size,
        ratings and model name), a price, an image and the EU label values.
        Winter tyres, boxes whose pattern cannot be parsed and products
        rejected by ``is_product_correct`` are skipped.  Every manufacturer
        mark title seen is collected in ``self.man_marks``.

        :param response: the downloaded listing page.
        :returns: a generator of products.
        """
        products = response.xpath(
            '//div[@class="listcontPART"]//div[contains(@class, "conprcbx")]')
        for el in products:
            # The brand is given as text, as a logo alt text or in a <b> tag;
            # the first one found wins.
            brand = [
                text.strip() for text in el.xpath(
                    './/div[@class="imgBrandLogo"]/span/text()'
                    '|.//div[@class="imgBrandLogo"]/img/@alt'
                    '|.//b[@class="Brantrin"]/text()').extract()
            ][0]

            pattern = ''.join(
                el.xpath('.//div[@class="dec_tyrerates"]//text()').extract()
            ).strip()

            # skip winter tyres
            if 'winter' in pattern.lower():
                continue

            xl, pattern = extract_reinforced(pattern)
            run_flat, pattern = extract_run_flat(pattern)
            res = parse_pattern(pattern)
            if not res:
                # Unparseable patterns are skipped without logging.  Known
                # offenders contain 'sport contact', 'advantage sport',
                # 'expedia s02' or 'zero rosso'.
                continue
            width, ratio, rim, load_rating, speed_rating, name = res

            identifier = el.css('.hndSTCODE').xpath('text()').extract_first()

            # No per-product page is used; the first start URL stands in.
            url = self.start_urls[0]

            price = ''.join(
                el.xpath('.//div[@class="dec_fittdbnt"]//h2//text()').re(
                    r'[\d\.,]+'))

            image_url = el.xpath(
                './/div[@class="uptyre_prt"]/img[@class="trIMG"]/@src'
            ).extract()[0]

            man_mark = el.xpath(
                './/div[@class="bndLGO1"]/img/@title').extract()
            if man_mark:
                man_mark = man_mark[0]
                if man_mark not in self.man_marks:
                    self.man_marks.add(man_mark)
            else:
                man_mark = ''

            loader = ProductLoader(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('identifier', identifier)
            loader.add_value('price', price)
            loader.add_value('url', url)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            metadata = MicheldeverMeta()
            metadata['width'] = width
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['load_rating'] = load_rating
            metadata['speed_rating'] = speed_rating
            metadata['fitting_method'] = 'Fitted'
            metadata['run_flat'] = run_flat
            metadata['xl'] = xl

            if man_mark and man_mark in man_mark_mapping:
                man_code = man_mark_mapping[man_mark]
            else:
                man_code = ''
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join(
                (width, ratio, rim, load_rating, speed_rating))

            # Exactly three label values are expected, in this order; any
            # other count raises ValueError on unpacking.
            fuel, grip, noise = [
                text.strip() for text in el.xpath(
                    './/div[@class="dec_labelbnt"]/div[@class="decsec1"]/p/b/text()'
                ).extract()
            ]

            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise.replace('dB', '')

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            yield product
def preprocess_products(products, new_products):
    """Normalise product records and drop duplicates and inconsistent rows.

    ``new_products`` are processed before ``products``, so when two records
    share the same key the one from ``new_products`` is kept.  Each record is
    modified in place: all values are stripped, the website is mapped through
    ``websites_mapping``, the brand is title-cased, and the ``XL``,
    ``Run Flat`` and ``Segment`` fields are normalised.  Records whose width,
    aspect ratio or rim disagree with their MTS stock code are reported on
    standard output and dropped.

    :param products: iterable of existing product dicts with string values.
    :param new_products: iterable of new product dicts with string values.
    :returns: list of the accepted (normalised) product dicts.
    :raises WebsiteNotFound: if a record's website is neither a key nor a
        value of ``websites_mapping``.
    """
    segment_names = {
        'premiumbrands': 'Premium Brands',
        'housebrands': 'House Brands',
    }
    all_products = []
    # key -> MTS stock code of the first record seen with that key
    processed = {}
    for product in chain(new_products, products):
        # Snapshot the items so the dict can be updated while looping.
        for field, value in list(product.items()):
            product[field] = value.strip()

        website = product['Website']
        # A website that is already a mapped (target) value is left as is.
        if website not in websites_mapping.values():
            if website not in websites_mapping:
                raise WebsiteNotFound("Website '%s' not found in mapping" %
                                      website)
            product['Website'] = websites_mapping[website]

        product['Brand'] = product['Brand'].title()

        if not product['XL']:
            product['XL'] = 'No'
        elif product['XL'] == 'XL':
            product['XL'] = 'Yes'

        if not product['Run Flat']:
            product['Run Flat'] = 'No'
        elif product['Run Flat'] == 'RFT':
            product['Run Flat'] = 'Yes'

        product['Segment'] = segment_names.get(product['Segment'],
                                               product['Segment'])
        if not product['Segment']:
            product['Segment'] = find_brand_segment(product['Brand'])

        key = get_key(product)
        if key in processed:
            if processed[key] != product['MTS Stock Code']:
                print("Duplicate product with different MTS Code: %s. Code1: %s, code2: %s" % (
                    key, processed[key], product['MTS Stock Code']))
            continue
        processed[key] = product['MTS Stock Code']

        # check MTS code is correct for current tyre size
        width, aspect_ratio, rim = get_tyre_size_from_mts_code(
            product['MTS Stock Code'])
        if width and width != product['Width']:
            print("Record has incorrect width for MTS code '%s', width: %s, code width: %s" % (
                product['MTS Stock Code'], product['Width'], width))
            continue
        if aspect_ratio and aspect_ratio != product['Aspect Ratio']:
            print("Record has incorrect Aspect Ratio for MTS code '%s', Aspect Ratio: %s, code Aspect Ratio: %s" % (
                product['MTS Stock Code'], product['Aspect Ratio'],
                aspect_ratio))
            continue
        if rim and rim != product['Rim']:
            print("Record has incorrect rim for MTS code '%s', rim: %s, code rim: %s" % (
                product['MTS Stock Code'], product['Rim'], rim))
            continue
        all_products.append(product)

    return all_products
Пример #7
0
    def parse_product_cache(self, identifier, price, out_of_stock, product):
        """Build a product from cached data, applying a fresh price and stock.

        The descriptive fields come from ``self.products_data[identifier]``;
        the metadata, when cached in ``self.products_metadata``, is attached
        and its MTS stock code and speed ratings are recomputed.

        :param identifier: key into ``self.products_data`` and
            ``self.products_metadata``.
        :param price: price to store on the product.
        :param out_of_stock: when true, the product's stock is set to 0.
        :param product: cached product; used as the loader's selector and, if
            the rebuilt product is rejected, for the identifier that is
            appended to ``self.incorrect_identifiers``.
        :returns: the product, or ``None`` when it has cached metadata and is
            rejected by ``is_product_correct``.

        >>> spider = CamSkillSpider()
        >>> product = {\
                "brand": "Pirelli", \
                "category": 'R16" -  205/55/16, 205/55R16', \
                "identifier": "113764", \
                "image_url": "http://www.camskill.co.uk/smsimg/1943/113764--main--1943.jpg", \
                "metadata": {\
                    "alternative_speed_rating": "", \
                    "aspect_ratio": "55", \
                    "fitting_method": "Delivered", \
                    "full_tyre_size": "205/55/16/91/V", \
                    "load_rating": "91", \
                    "manufacturer_mark": "", \
                    "mts_stock_code": "2055516VPIP7", \
                    "rim": "16", \
                    "run_flat": "No", \
                    "speed_rating": "V", \
                    "width": "205", \
                    "xl": "No"\
                }, \
                "name": "Cinturato P7", \
                "price": "64.40", \
                "sku": None, \
                "stock": "0", \
                "url": "http://www.camskill.co.uk/m62b0s291p113764/Pirelli_Tyres_Car_Pirelli_P7_Cinturato_Pirelli_P_7_-_205_55_R16_91V_TL_Fuel_Eff_%3A_E_Wet_Grip%3A_A_NoiseClass%3A_2_Noise%3A_70dB"\
            }
        >>> spider.products_data['113764'] = product
        >>> product_ = spider.parse_product_cache("113764", 123, False, product)
        >>> product_['metadata']['mts_stock_code']
        '2055516VPIP7CINT'
        """
        loader = ProductLoader(item=Product(), selector=product)
        cached = self.products_data[identifier]
        for col in ('name', 'identifier', 'sku', 'url', 'image_url', 'brand'):
            loader.add_value(col, cached[col])

        # The category is derived from the brand, not taken from the cache.
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))

        loader.add_value('price', price)
        if out_of_stock:
            loader.add_value('stock', 0)

        product_ = loader.load_item()
        if identifier not in self.products_metadata:
            # No cached metadata: return the bare product unchecked.
            return product_

        product_['metadata'] = self.products_metadata[identifier]

        if not is_product_correct(product_):
            self.incorrect_identifiers.append(product['identifier'])
            return

        product_['metadata']['mts_stock_code'] = find_mts_stock_code(
            product_, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product_)
        new_alt_speed = get_alt_speed(product_)
        # Prefer the looked-up alternative rating; otherwise keep the cached
        # rating when the looked-up main rating differs from it.
        if new_alt_speed:
            alternative = new_alt_speed
        elif product_['metadata']['speed_rating'] != new_speed_rating:
            alternative = product_['metadata']['speed_rating']
        else:
            alternative = ''
        product_['metadata']['alternative_speed_rating'] = alternative
        product_['metadata']['speed_rating'] = new_speed_rating

        return product_
Пример #8
0
    def parse_products(self, response):
        """Yield the tyres of one result page, then request the next page.

        The response is JSON whose ``display_tyres`` entry holds an HTML
        fragment with the tyre containers.  Width, aspect ratio and rim come
        from ``response.meta['search_params']``.  As long as a page contains
        any tyre container, the following page is requested with the same
        meta and an incremented ``page`` counter.

        :param response: JSON response; ``response.meta`` must provide
            ``'search_params'`` and ``'page'``.
        :returns: a generator of products and, possibly, one ``Request``.
        """
        fragment = json.loads(response.body)['display_tyres']
        hxs = HtmlXPathSelector(text=fragment)

        search_params = response.meta['search_params']

        products = hxs.select('//div[contains(@class, "tyre_container")]')

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            brand_texts = product_el.select(
                './/form/span[@class="tyre_brand_text"]/text()').extract()
            brand = brand_texts[0] if brand_texts else ''

            # NOTE(review): this query runs on the whole fragment (hxs) with
            # an absolute path, not on product_el - confirm it is intended.
            winter_tyre = hxs.select(
                '/div/div/div[@class="winter_img"]').extract()
            if winter_tyre:
                continue

            # Use the spider's own spelling of a known brand.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand

            # The last text node of the brand element holds the model name.
            full_name = brand_texts[-1]

            loader.add_value('name', full_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.select(
                './/input[@name="tyre"]/@value').extract()
            loader.add_value('identifier', identifier)

            loader.add_value('url', 'http://www.tyregiant.com')

            image_url = product_el.select(
                './/img[@class="tyre_image"]/@src').extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_url[0]))

            price = product_el.select(
                './/*[@class="tyre_price"]/span/text()').extract()
            # A tyre without a price is treated as out of stock.
            if not price:
                loader.add_value('stock', 0)
            loader.add_value('price', price)

            tyre_details = product_el.select(
                './/form/p[@class="tyre_details"]/text()').extract()[0]
            # A whitespace-delimited token such as " 91V ": everything but
            # the last character is the load rating, the last is the speed.
            rating_match = re.search(r'(\s\d+\w+\s)', tyre_details)
            if rating_match:
                rating = rating_match.group().strip()
                load_rating, speed_rating = rating[:-1], rating[-1]
            else:
                load_rating = speed_rating = ''

            xl = product_el.select(
                './/img[@class="xl_img"]/@src').extract()
            run_flat = product_el.select(
                './/img[@class="rf_img"]/@src').extract()

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = search_params['aspect_ratio']
            metadata['rim'] = search_params['rim']
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating
            metadata['width'] = search_params['width']
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if xl else 'No'
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)
            metadata['full_tyre_size'] = '/'.join((
                search_params['width'],
                search_params['aspect_ratio'],
                search_params['rim'],
                metadata['load_rating'],
                metadata['speed_rating'],
            ))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product_meta = product['metadata']
            product_meta['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating when the looked-up main rating differs from it.
            if new_alt_speed:
                alternative = new_alt_speed
            elif product_meta['speed_rating'] != new_speed_rating:
                alternative = product_meta['speed_rating']
            else:
                alternative = ''
            product_meta['alternative_speed_rating'] = alternative
            product_meta['speed_rating'] = new_speed_rating
            yield product

        # Keep paging until a page comes back without tyre containers.
        if products:
            meta = response.meta
            meta['page'] = meta['page'] + 1
            next_url = 'http://www.tyregiant.com/update-tyres/%s' % str(
                meta['page'])
            yield Request(next_url,
                          dont_filter=True,
                          callback=self.parse_products,
                          meta=meta)
Пример #9
0
    def parse_products(self, response):
        """Yield the non-winter tyre products of a JSON search response.

        The response body is a JSON object whose ``d`` entry is itself a
        JSON-encoded list of tyre records.  Records flagged as winter tyres
        and products rejected by ``is_product_correct`` are skipped.

        :param response: response whose body is the JSON document.
        :returns: a generator of products.
        """
        json_data = json.loads(response.body)
        products = json.loads(json_data.get('d'))

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            # A missing or non-dict manufacturer entry means "no brand".
            # Only lookup failures are caught so other errors stay visible.
            try:
                brand = product_el[u'ProductManufacturer'][
                    u'TyreManufacturerName']
            except (KeyError, TypeError):
                brand = ''

            winter_tyre = product_el[u'ProductAttributes'][u'IsWinter']
            # skip winter tyres
            if winter_tyre:
                continue
            # Use the spider's own spelling of a known brand.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand

            try:
                full_name = product_el[u'ProductTreadPattern'][u'TreadName']
            except (KeyError, TypeError):
                full_name = ''
            # Fix name changes
            if full_name in self.new_old_names:
                full_name = self.new_old_names[full_name]

            loader.add_value('name', full_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.get('TyreID')
            loader.add_value('url', 'http://www.tyresonthedrive.com')
            # NOTE(review): unlike the name lookup above, a missing tread
            # pattern or image entry raises here - confirm this is intended.
            image_url = 'http://www.tyresonthedrive.com/img/treads/' + product_el[
                u'ProductTreadPattern'][u'TreadPatternImage'] + '.jpg'
            loader.add_value('image_url', image_url)
            loader.add_value('identifier', identifier)

            price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
            # A tyre without a price is treated as out of stock.
            if not price:
                loader.add_value('stock', 0)
            loader.add_value('price', price)

            attributes = product_el[u'ProductAttributes']
            metadata = MicheldeverMeta()

            metadata['aspect_ratio'] = str(attributes[u'Profile'])
            metadata['rim'] = str(attributes[u'Rim'])
            metadata['speed_rating'] = str(attributes[u'Speed'])
            metadata['load_rating'] = str(attributes[u'Load'])
            metadata['width'] = str(attributes[u'Section'])
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if attributes[u'IsExLoad'] else 'No'
            metadata['run_flat'] = 'Yes' if attributes[u'IsRunFlat'] else 'No'

            man_mark = attributes[u'OEMFitment']
            metadata['manufacturer_mark'] = find_man_mark(
                man_mark) if man_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the
            # scraped rating when the looked-up main rating differs from it.
            if new_alt_speed:
                alternative = new_alt_speed
            elif product['metadata']['speed_rating'] != new_speed_rating:
                alternative = product['metadata']['speed_rating']
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #10
0
    def parse_products(self, response):
        """Parse a tyre results table and yield one product per result cell.

        Width, aspect ratio and rim are taken from the search row stored in
        ``response.meta['row']``; brand, name, load rating and speed rating
        come from running ``parse_title_new`` over each cell's sub-title.
        A cell is skipped when it has no sub-title, when its brand (after
        applying ``self.brand_fixes``) is not a key of ``self.brand_fixes``,
        or when ``is_product_correct`` rejects the assembled product.
        """
        hxs = HtmlXPathSelector(response)
        row = response.meta['row']

        # (metadata field, XPath of the label icon, regex applied to the match)
        label_specs = (
            ('fuel',
             './/div[@class="tyreLabel"]/span/img[contains(@src, "icon=fuel")]',
             r'rr=(\w)'),
            ('grip',
             './/div[@class="tyreLabel"]/span/img[contains(@src, "icon=wet")]',
             r'wg=(\w)'),
            ('noise',
             './/div[@class="tyreLabel"]/span/img[contains(@src, "icon=noise")]',
             r'db=(\d+)'),
        )

        cells = hxs.select(
            '//*[@id="tyreResults"]//tr[contains(@class, "tyre")]//td[@class != "gutter"]'
        )
        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)
            title_parts = cell.select(
                './/p[@class="subTitle"]/text()').extract()
            if not title_parts:
                continue
            # Collapse every run of whitespace in the on-site title.
            title = ' '.join(title_parts[0].split())

            parsed_title = parse_title_new(title)
            brand = parsed_title['brand']
            load_rating = parsed_title['load_rating']
            speed_rating = parsed_title['speed_rating']
            name = parsed_title['name']
            if not (name and brand):
                # Only logged; processing of the cell carries on.
                self.log(
                    "++++++++++++++++++++++++++++{}==================".format(
                        title))
                # self.errors.append("Error parsing title: %s" % title)

            # Replace a known spelling with the first matching canonical
            # brand; keep the parsed brand when nothing matches.
            brand = next(
                (canonical
                 for canonical, spellings in self.brand_fixes.iteritems()
                 if brand.lower() in spellings),
                brand)
            brand = brand.title()
            if brand not in self.brand_fixes:
                self.log('Wrong brand %s' % brand)
                continue

            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            # The price text is split between the <h6> and its <sup> child.
            price_main = cell.select(
                './/h6[@class="price"]/text()').extract()[0]
            price_sup = cell.select(
                './/h6[@class="price"]/sup/text()').extract()[0]
            loader.add_value('price', extract_price(price_main + price_sup))

            buy_href = cell.select(
                './a[@class="btnBuy png_bg"]/@href').extract()[0]
            loader.add_value('identifier', buy_href.split('/')[-1])
            loader.add_value('url', '')

            image_srcs = cell.select(
                './/img[@class="tyreImg"]/@src').extract()
            if image_srcs:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(get_base_url(response), image_srcs[0]))

            metadata = MicheldeverMeta()
            metadata['onsite_name'] = title
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = speed_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating

            self.log("===============matching================")
            self.log(str(name))

            # Each filter returns what it found plus the remaining name.
            man_mark, name = filter_man_code(
                name, self.all_man_marks, self.custom_man_marks)
            metadata['manufacturer_mark'] = man_mark
            self.log(str((metadata['manufacturer_mark'], name)))

            xl_found, name = filter_xl(name)
            metadata['xl'] = "Yes" if xl_found else "No"
            self.log(str((metadata['xl'], name)))

            # is_run_flat must see the name before filter_run_flat strips it.
            run_flat_in_name = is_run_flat(name)
            run_flat_filtered, name = filter_run_flat(name)
            if run_flat_filtered or run_flat_in_name:
                metadata['run_flat'] = "Yes"
            else:
                metadata['run_flat'] = "No"
            self.log(str((metadata['run_flat'], name)))

            self.log("===============/matching===============")

            if name.endswith('('):
                name = name[:-1]
            loader.add_value('name', name.strip())

            size_parts = (row['Width'], row['Aspect Ratio'], row['Rim'],
                          load_rating, speed_rating)
            metadata['full_tyre_size'] = '/'.join(size_parts)

            for field, icon_xpath, value_re in label_specs:
                values = cell.select(icon_xpath).re(value_re)
                metadata[field] = values[0] if values else ''

            prod = loader.load_item()
            prod['metadata'] = metadata

            if not is_product_correct(prod):
                continue

            prod['metadata']['mts_stock_code'] = find_mts_stock_code(
                prod, spider_name=self.name, log=self.log)

            yield prod
Пример #11
0
    def parse(self, response):
        """Parse a results list page: follow pagination and yield products.

        When a "Next" pagination entry exists, the same form is re-posted to
        ``response.url`` with ``formdata['page']`` set to the next page.
        Every non-winter list entry is turned into a product whose tyre
        metadata comes from ``parse_pattern`` applied to the dimension text.
        Entries whose dimensions cannot be parsed are logged, recorded in
        ``self.errors`` and skipped; entries rejected by
        ``is_product_correct`` are skipped silently.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select(
            '//ul[@class="c-list-classic c-list-classic-liste m-produit-res"]/li'
        )

        next_page = hxs.select(
            '//li[a[span[text()="Next"]]]/@data-page').extract()  # pagination
        if next_page:
            # NOTE: this updates the formdata dict stored in response.meta in
            # place, and that same meta is handed to the follow-up request.
            formdata = response.meta.get('formdata')
            formdata['page'] = next_page[0]
            yield FormRequest(response.url,
                              formdata=formdata,
                              dont_filter=True,
                              meta=response.meta)

        for product_el in products:
            url = product_el.select(
                './/a[@class="u-semi-link"]/@href')[0].extract()
            winter_tyre = product_el.select(
                './/div[@class="m-produit-bloc-res-lst__gamme-saison"]/text()'
            ).re('Winter')
            if winter_tyre:
                # skip winter tyres
                continue

            loader = ProductLoader(item=Product(), selector=product_el)
            # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
            # the pattern should be set as the product's name
            loader.add_xpath(
                'name',
                './/span[@class="m-produit-bloc-res-lst__dcp"]/text()')
            brand = product_el.select(
                './/span[@class="m-produit-bloc-res-lst__fab"]/text()'
            ).extract()
            if brand:
                brand = brand[0].strip()
                loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            fitting_method = 'Delivered'

            loader.add_value('url', urljoin(base_url, url))

            image_url = product_el.select(
                './/div[@class="m-produit-bloc-res-lst__image"]//img/@src'
            ).extract()
            if image_url:
                loader.add_value('image_url', urljoin(base_url, image_url[0]))

            identifier = product_el.select(
                './/button/@data-id')[0].extract()
            loader.add_value('identifier', identifier)
            price = product_el.select(
                './/div[@class="c-qte-prix__prix m-produit-bloc-res-lst__prix"]/text()'
            )[0].extract()
            loader.add_value('price', price)
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)

            # Dimension text; non-breaking spaces become plain spaces before
            # it is handed to parse_pattern.
            name = product_el.select(
                './/div[@class="m-produit-bloc-res-lst__dim"]/text()'
            )[0].extract().strip().replace(u'\xa0', u' ')
            data = parse_pattern(name)
            if not data:
                msg = 'ERROR parsing "{}" [{}]'.format(name, response.url)
                log.msg(msg)
                self.errors.append(msg)
                continue

            additional_data = ' '.join(
                product_el.select(
                    './/ul[@class="m-produit__carac c-list-horizontale"]/li/text()'
                ).extract())
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating']

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in additional_data else 'No'

            run_flat = 'runflat' in additional_data.lower()
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # re.escape makes every character of the mark literal, so marks
            # containing regex metacharacters other than '*' also match
            # verbatim.
            matching_marks = [
                mark for mark in self.all_man_marks
                if re.search(r'\(?{}\)?'.format(re.escape(mark)),
                             additional_data)
            ]
            mark_key = matching_marks[0].strip() if matching_marks else ''
            if mark_key:
                metadata['manufacturer_mark'] = self.all_man_marks.get(
                    mark_key, '')
            else:
                metadata['manufacturer_mark'] = ''
            metadata['mts_stock_code'] = ''
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Prefer the alternative speed rating reported by get_alt_speed;
            # otherwise keep the scraped rating as the alternative when
            # get_speed_rating replaced it with a different one.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alt_speed = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alt_speed = scraped_speed
            else:
                alt_speed = ''
            product['metadata']['alternative_speed_rating'] = alt_speed
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #12
0
    def parse(self, response):
        """Parse the ``JsonObject`` payload embedded in a search page.

        The page body is scanned line by line for ``JsonObject = ...``; the
        JSON found there supplies the ``Rest`` and ``Deals`` product lists.
        Winter tyres and identifiers already in ``self.seen_ids`` are skipped.
        Among products sharing brand, name, fitting method, tyre size, XL,
        run-flat and manufacturer mark, only the cheapest one is yielded, and
        its identifier is added to ``self.seen_ids``.

        If no ``JsonObject`` line is present, this is logged and nothing is
        yielded.
        """
        row = response.meta['row']

        json_data = None
        for line in response.body.split('\n'):
            if "JsonObject = " in line:
                json_data = json.loads(
                    line.replace('JsonObject = ', '').replace('; \r', ''))

        if json_data is None:
            # Without this guard the subscript below fails with a TypeError
            # that says nothing about the missing payload.
            self.log('JsonObject not found in response: {}'.format(
                response.url))
            return

        products = json_data['Rest'] + json_data['Deals']

        collected_products = []
        fitting_method = 'Fitted'
        # (metadata field, label key in the product JSON, value key inside it)
        label_specs = (
            ('fuel', 'TyreLabelFuel', 'Score'),
            ('grip', 'TyreLabelWet', 'Score'),
            ('noise', 'TyreLabelNoise', 'NoiseLevel'),
        )

        self.log('Results found {} {}'.format(len(products), response.meta))
        for product_info in products:
            # skip winter tyres
            if product_info['WinterTyre']:
                continue

            loader = ProductLoader(item=Product(), selector=product_info)
            loader.add_value('name', product_info['ModelName'])
            brand = product_info['Manufacturer']

            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_info['PrimaryId']
            # Identifier actually stored on the product and in self.seen_ids.
            product_id = str(identifier) + '-' + fitting_method
            if product_id in self.seen_ids:
                continue

            url = '/catalogue' + product_info[
                'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
            loader.add_value('url', response.urljoin(url))

            image_url = product_info.get('ModelImageLarge')
            if not image_url:
                image_url = product_info.get('ModelImage')

            if image_url:
                # Keep only what follows the last 'src="' up to the next
                # quote; a value without 'src="' is used up to its first quote.
                image_url = image_url.split('src="')[-1].split('"')[0]
                loader.add_value('image_url', response.urljoin(image_url))

            spec = product_info['SpecificationName']
            metadata = MicheldeverMeta()
            # metadata['mts_stock_code'] = row['MTS Stockcode']
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            # The speed rating is the last whitespace-separated token.
            metadata['speed_rating'] = spec.split()[-1]
            metadata['width'] = row['Width']

            metadata['load_rating'] = product_info['LoadRatingName']
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if product_info['Reinforced'] else 'No'
            run_flat_found = is_run_flat(product_info['ModelName'])
            run_flat = product_info['RunFlat']
            metadata[
                'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
            manufacturer_mark = product_info['Variant']
            if manufacturer_mark:
                manufacturer_mark = manufacturer_mark.split()[0].strip()

            full_tyre_size = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            # MOE Exception for this product
            if manufacturer_mark and 'MO EXTENDED' in product_info['Variant'].upper()\
               and product_info['ModelName'] == 'Potenza S001' and full_tyre_size == '245/40/18/97/Y':
                metadata['manufacturer_mark'] = 'MOE'
            else:
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark) if manufacturer_mark else ''

            metadata['full_tyre_size'] = full_tyre_size

            # A label may be missing (KeyError) or not a mapping, e.g. null
            # (TypeError); either way the field is left empty.
            for field, label_key, value_key in label_specs:
                try:
                    metadata[field] = product_info[label_key][value_key]
                except (KeyError, TypeError):
                    metadata[field] = ''

            product = loader.load_item()
            product['metadata'] = metadata

            product['price'] = product_info['FullyFittedPrice']
            product['identifier'] = product_id
            product['metadata']['fitting_method'] = fitting_method

            t1 = time.time()
            if not is_product_correct(product):
                self.log('Search: {}'.format(str(response.meta)))
                self.seen_ids.add(product_id)
                self.log('PRODUCT IS NOT CORRECT => %r' % product)
                continue
            t2 = time.time()
            self.log('Time taken by product correct: {}'.format(t2 - t1))

            t1 = time.time()
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            t2 = time.time()
            self.log('Time taken by mts stock: {}'.format(t2 - t1))

            collected_products.append(product)

        # Keep the cheapest product per key; on a price tie the first wins.
        min_price_products = {}
        for product in collected_products:
            key = "%s-%s-%s-%s-%s-%s-%s" % (
                product['brand'], product['name'],
                product['metadata']['fitting_method'],
                product['metadata']['full_tyre_size'],
                product['metadata']['xl'], product['metadata']['run_flat'],
                product['metadata']['manufacturer_mark'])
            cheapest = min_price_products.get(key)
            if cheapest is None or product['price'] < cheapest['price']:
                min_price_products[key] = product

        for product in min_price_products.values():
            self.seen_ids.add(product['identifier'])
            yield product
Пример #13
0
    def parse_search(self, response):
        """Parse a tyre search-results page and yield one product per result.

        If ``response.meta`` carries ``next_url``, a form request for it is
        issued first, built from the search row in ``response.meta['row']``.
        Winter tyres, results whose size cannot be parsed, results without an
        id or without a price, and products rejected by
        ``is_product_correct`` are skipped. Only manually matched MTS stock
        codes are attached.
        """
        # Next URL found?
        if 'next_url' in response.meta:
            row = response.meta['row']
            yield FormRequest(response.meta['next_url'],
                              formdata={'loadCriteriaFromSession': 'false',
                                        'order': 'PRICE',
                                        'searchByRF': 'false',
                                        'searchByXL': 'false',
                                        'season': '',
                                        'selectedAspectRatioValue': row['Aspect Ratio'],
                                        'selectedLoadIndexValue': '',
                                        'selectedRimDiameterValue': row['Rim'],
                                        'selectedSectionValue': row['Width'],
                                        'selectedSpeedIndexValue': ''},
                              callback=self.parse_search,
                              dont_filter=True)

        # width/aspect R rim load speed, e.g. matched against the size text
        size_re = re.compile(r'(\d+)/(\d+)\sR(\d+)\s(\d+)(.)')

        # Parse results
        results = response.xpath('//div[@id="tyresrch-res"]/div')
        for result_xs in results:
            is_winter = bool(result_xs.xpath('.//*[@alt="WINTER"]').extract())
            if is_winter:
                continue

            # Brand and name are optional: fall back to '' when absent.
            try:
                brand = result_xs.xpath('.//span[@class="nom-marque"]/text()').extract()[0]
            except IndexError:
                brand = ''
            try:
                name = result_xs.xpath('.//span[@class="title"]/strong/text()').extract()[0]
            except IndexError:
                name = ''

            try:
                size = result_xs.xpath('normalize-space(.//span[@class="size"]/text())').extract()[0]
                if len(results) > 1:
                    width, aspect_ratio, rim, load_rating, speed_rating = size_re.search(size).groups()
                else:
                    # With a single result the size is first split into six
                    # word tokens; any other token count falls back to the
                    # regular pattern.
                    try:
                        width, aspect_ratio, _, rim, load_rating, speed_rating = re.findall(r'[\d\w]+', size)
                    except ValueError:
                        width, aspect_ratio, rim, load_rating, speed_rating = size_re.search(size).groups()
            except (AttributeError, IndexError, ValueError):
                # AttributeError: the size pattern did not match (search() returned None).
                self.log("ERROR - Unable to parse pattern for name %s in %s" % (name, response.url))
                continue

            run_flat = bool(result_xs.xpath('.//*[contains(text(), "Run Flat")]').extract())
            xl = bool(result_xs.xpath('.//*[contains(text(), "Extraload")]').extract())

            product_id = result_xs.xpath('.//input[@name="chkCompare"]/@value').extract()
            if not product_id:
                product_id = result_xs.re(r'productDetail/id/(.*?)/mode')
            if not product_id:
                continue

            price = result_xs.xpath('.//span[@class="price"]/text()').extract()
            if not price:
                price = result_xs.xpath('.//*[@itemprop="price"]/text()').extract()
            if not price:
                # Skip just this result instead of letting price[0] raise and
                # abort the rest of the page.
                self.log("ERROR - Price not found for name %s in %s" % (name, response.url))
                continue

            product_url = [
                response.urljoin(href)
                for href in result_xs.xpath('.//a[@class="moreinfo"]/@href').extract()
            ]
            product_img = [
                response.urljoin(src)
                for src in result_xs.xpath('.//div[@class="tyre-image"]/img[1]/@src').extract()
            ]

            # The labels are used only when exactly three values are found.
            try:
                fuel, grip, noise = result_xs.xpath('.//div[@class="tyre-labelling-content"]//span[contains(@class, '
                                                    '"tyre-labelling-letter-")]/text()').re(r'[\w\d]+')
            except ValueError:
                fuel = ''
                grip = ''
                noise = ''

            loader = ProductLoader(item=Product(), selector=result_xs)
            loader.add_value('identifier', product_id[0])
            loader.add_value('name', name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('price', extract_price(price[0]))
            if product_url:
                loader.add_value('url', product_url[0])
            else:
                loader.add_value('url', response.url)
            if product_img:
                loader.add_value('image_url', product_img[0])

            product = loader.load_item()

            # Fourth text node of the info block, searched below for
            # manufacturer-mark codes.
            try:
                manuf = result_xs.xpath('.//div[@class="info"]//text()').extract()[3].strip().lower()
            except IndexError:
                manuf = ''

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            metadata['xl'] = 'Yes' if xl else 'No'
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # First manufacturer code contained in the info text wins.
            man_code = ''
            for code, man_mark in self.all_man_marks.iteritems():
                if code.lower() in manuf:
                    man_code = man_mark
                    break

            metadata['manufacturer_mark'] = man_code
            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   load_rating,
                                                   speed_rating))
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise
            product['metadata'] = metadata
            if not is_product_correct(product):
                self.log('The product is not correct => %r' % product)
                continue

            # Only manual MTS Stock codes for now
            mts_stock_code = find_manually_matched_mts_stock_code(product, spider_name=self.name)
            if mts_stock_code:
                self.log('MTS Manually matched: %s' % mts_stock_code)
            product['metadata']['mts_stock_code'] = mts_stock_code

            yield product
Пример #14
0
    def parse(self, response):
        """Parse a results page: follow page links and yield tyre products.

        Every pagination link is requested with the current meta. For each
        result block, the left-column text plus the size text form the full
        name that ``parse_pattern`` and the XL / run-flat checks work on,
        while the product's own name comes from the bold link text. Winter
        tyres, results whose full name cannot be parsed and products
        rejected by ``is_product_correct`` are skipped.

        A product whose identifier is in ``self.ip_codes`` is yielded
        directly; otherwise a request for its page is yielded with
        ``self.parse_ipcode`` as the callback.
        """
        products = response.xpath('//div[@class="results"]')

        pages = response.xpath(
            '//p[contains(text(),"Page")]//a/@href').extract()
        for page in pages:
            yield Request(response.urljoin(page), meta=response.meta)

        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
            # the pattern should be set as the product's name
            left_texts = product.select(
                './/div[@class="resultsLeft"]/div'
                '//text()[normalize-space()]').extract()
            size_texts = product.select(
                './/div[@class="t_size"]//text()[normalize-space()]'
            ).extract()
            name = ' '.join([text.strip() for text in left_texts])
            # Append the size once; the previous `name += name + ...`
            # repeated the left-column text in the full name.
            name = name + ' %s' % ' '.join(
                [text.strip() for text in size_texts])
            loader.add_xpath(
                'name',
                './/div[@class="resultsLeft"]/div//a/i/b/text()[normalize-space()]'
            )
            brand = product.select(
                './/div[@class="resultsLeft"]/div/b//text()[normalize-space()]'
            ).extract()[0].strip()

            # skip winter tyres
            if product.select(
                    './/img[contains(@alt,"Winter / cold weather tyres")]'):
                continue
            if product.select(
                    './/img[contains(@alt,"Wi") or contains(@src,"/simg/hiver.png")]'
            ):
                continue
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            fitting_method = 'Fitted'

            url = product.select('.//a[i[b]]/@href')[0].extract()
            url = response.urljoin(url)
            # Remove the cart_id parameter from the product URL.
            url = re.sub('cart_id=[^&]*', '', url)
            loader.add_value('url', url)

            image_url = product.select(
                './/a/img[@align="left"]/@src').extract()
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url[0]))

            # The "typ" query parameter of the product URL is the identifier.
            identifier = urlparse.parse_qs(
                urlparse.urlparse(url).query)['typ'][0]
            loader.add_value('identifier', identifier)
            price = ''.join(
                product.select(
                    './/div[@class="price"]/font/b//text()[normalize-space()]'
                ).extract())
            # NOTE(review): the '.' in this pattern is unescaped, so it
            # matches any character; left unchanged because escaping it would
            # stop matching prices without a decimal point.
            price = re.findall(r"\d+.\d+", price) if price else '0.0'
            loader.add_value('price', price)

            data = parse_pattern(name)
            if not data:
                # log.msg("ERROR %s [%s]" % (name, response.url))
                # self.errors.append("Error parsing: %s. URL: %s" % (name, response.url))
                continue

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating']

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating']
            metadata['alternative_speed_rating'] = ''
            xl = 'XL' in name
            metadata['xl'] = 'Yes' if xl else 'No'

            run_flat_found = is_run_flat(name)
            lowered_name = name.lower()
            run_flat = ('run flat' in lowered_name or
                        'runflat' in lowered_name or run_flat_found)
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            manufacturer_mark = product.select(
                './/div[@class="t_size"]/b/a[contains(@onmouseover,"Original") or '
                'contains(@onmouseover,"BMW") or contains(@onmouseover,"Porsche")]'
                '/@name[normalize-space()]').extract()
            manufacturer_mark = manufacturer_mark[0].strip(
            ) if manufacturer_mark else []
            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark) if manufacturer_mark else ''
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))

            # The label block is used only when it holds exactly three text
            # nodes (fuel, grip, noise); otherwise all three stay empty.
            try:
                fuel, grip, noise = [
                    text.strip() for text in product.select(
                        './/div[@class="tyre_label_short"]//text()').extract()
                ]
            except ValueError:
                metadata['fuel'] = ''
                metadata['grip'] = ''
                metadata['noise'] = ''
            else:
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise.replace('dB', '').strip()

            item = loader.load_item()
            item['metadata'] = metadata

            if not is_product_correct(item):
                continue

            if item['identifier'] in self.ip_codes:
                ip_code = self.ip_codes[item['identifier']]
                item['sku'] = ip_code
                item['metadata']['mts_stock_code'] = find_mts_stock_code(
                    item,
                    spider_name=self.name,
                    log=self.log,
                    ip_code=ip_code)
                yield item
            else:
                # The IP code is not available on the products list, so it
                # has to be extracted from the product page.
                yield Request(item['url'],
                              meta={'product': item},
                              callback=self.parse_ipcode)
Пример #15
0
    def parse_products_data(self, response):
        meta = {
            'cookiejar': response.meta['cookiejar'],
            'speed_rating': response.meta['speed_rating'],
            'search_params': response.meta['search_params'],
            'proxy_service_disabled': True,
            'proxy': response.meta.get('proxy', ''),
        }

        # Pages
        for url in response.xpath(
                '//*[@id="InnerPH_InnerPH_pageList"]//a/@href').extract():
            url = response.urljoin(url)
            if url not in self.urls_history:
                self.urls_history.add(url)
                yield Request(url,
                              meta=meta,
                              callback=self.parse_products_data,
                              dont_filter=True)

        products = response.xpath('//ul[@id="results_tbl"]/li')
        if not products:
            products = response.xpath('//div[@class="product_item"]')
        if not products:
            self.log('No products found => %r' % response.meta)
        brand_list = response.xpath(
            '//ul[@id="InnerPH_InnerPH_brand_list"]//a/text()').extract()
        if not brand_list:
            self.log('No brand list found => %r' % response.meta)
            return
        for product in products:
            desc = product.xpath(
                './/div[@class="tyre_desc"]/text()').extract()[0]
            if 'snow' in desc or 'winter' in desc:
                continue

            search_params = response.meta['search_params']

            name = product.xpath(
                './/a[@class="tyre_name"]/text()').extract()[0]
            url = product.xpath('.//a[@class="tyre_name"]/@href').extract()[0]
            p_id = product.xpath('.//a[@class="tyre_name"]/@href').re(
                r'/t(\d+)/')[0]
            image_url = product.xpath(
                './/*[contains(@class, "tyre_img")]//img/@src').extract()[0]
            try:
                brand = filter(lambda b: b in name, brand_list)[0]
            except:
                self.log('Can\'t detect brand for: %s' % name)
                continue
            try:
                price = product.xpath(
                    './/*[@class="tyre_price_text"]/text()').extract()[0]
            except IndexError:
                self.log("Price not found: %s" % str(product))
                continue

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', response.urljoin(url))
            loader.add_value('identifier', p_id)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('price', price)

            pattern = name.strip()
            pattern = pattern.upper()
            pattern = pattern.replace('XL', '').replace('RFT', '').replace(
                'RFLAT', '').replace('RUNFLAT', '').strip()

            loader.add_value('name', pattern)

            m = MicheldeverMeta()
            m['aspect_ratio'] = search_params['aspect_ratio']
            m['rim'] = search_params['rim']
            m['width'] = search_params['width']
            m['speed_rating'] = search_params['speed_rating'].upper()
            res = re.search(
                '([\d/]+)%s' % search_params['speed_rating'].upper(), desc)
            if res:
                m['load_rating'] = res.groups()[0]
            else:
                self.log('ERROR: not load rating: %s' % url)
                m['load_rating'] = ''
            run_flat_found = is_run_flat(desc)
            if 'ZPS' in desc.upper() or 'RFT' in desc.upper() or 'RFLAT' in desc.upper() or \
               'RUNFLAT' in desc.upper() or 'RUN FLAT' in desc.upper() or run_flat_found:
                m['run_flat'] = 'Yes'
            else:
                m['run_flat'] = 'No'

            if 'XL' in desc.upper():
                m['xl'] = 'Yes'
            else:
                m['xl'] = 'No'

            m['full_tyre_size'] = '/'.join(
                (m['width'], m['aspect_ratio'], m['rim'], m['load_rating'],
                 m['speed_rating']))

            m['fitting_method'] = 'Fitted' if 'FITTED' in product.xpath(
                './/*[@class="tyre_price_type"]/text()').extract(
                ) else 'Delivered'
            m['manufacturer_mark'] = self._get_manufacturer_code(desc)

            fuel = product.xpath(
                './/*[@class="fuel-img"]/@data-grade').extract()
            m['fuel'] = fuel[0] if fuel else ''
            grip = product.xpath(
                './/*[@class="wetgrip-img"]/@data-grade').extract()
            m['grip'] = grip[0] if grip else ''
            noise = product.xpath(
                './/*[@class="noise-img"]/@data-grade').extract()
            m['noise'] = noise[0] if noise else ''

            product = loader.load_item()
            product['metadata'] = m

            if not is_product_correct(product):
                self.log('Product is not correct => %s' % desc)
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            if product['identifier'] in self.ip_codes:
                ip_code = self.ip_codes[product['identifier']]
                product['sku'] = ip_code
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product,
                    spider_name=self.name,
                    log=self.log,
                    ip_code=ip_code)
                yield product
            else:
                # The IP code is not available on the product list, so unfortunately we must extract it from the product page
                yield Request(product['url'],
                              meta={'product': product},
                              callback=self.parse_ipcode)
Пример #16
0
    def parse_search(self, response):
        """Parse a search-results page and yield its non-winter tyres.

        Every ``li`` of the results list becomes a ``Product`` carrying a
        ``MicheldeverMeta`` record; items rejected by ``is_product_correct``
        are dropped.  After the products, one request per paginator entry
        whose ``data-page`` is not "1" is yielded back to this callback.

        :param response: search response; ``response.meta['row']`` is
            interpolated into ``self.search_url`` to build the page URLs.
        """
        base_url = get_base_url(response)

        products = response.xpath('//ul[contains(@class, "c-list-classic") and contains(@class, "m-produit-res")]/li')
        pages = response.xpath('//ul[contains(@class, "paginator")]/li[not(@data-page="1")]/@data-page').extract()

        for product_el in products:
            url = product_el.xpath('.//a[contains(@class, "u-semi-link")]/@href')[0].extract()
            winter_tyre = product_el.xpath('.//div[@class="m-produit-bloc-res-lst__gamme-saison"]/text()').re('Winter')
            if winter_tyre:
                continue

            loader = ProductLoader(item=Product(), selector=product_el)
            # The full name of the tyre (``name`` below) is only used to extract
            # metadata (i.e. run flat, xl); the pattern is set as the product's name.
            loader.add_xpath('name', './/span[@class="m-produit-bloc-res-lst__dcp"]/text()')
            brand = product_el.xpath('.//span[@class="m-produit-bloc-res-lst__fab"]/text()').extract()
            if brand:
                brand = brand[0].strip()
                loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            fitting_method = 'Delivered'

            loader.add_value('url', urljoin(base_url, url))

            image_url = product_el.xpath('.//div[@class="m-produit-bloc-res-lst__image"]//img/@src').extract()
            if image_url:
                loader.add_value('image_url', urljoin(base_url, image_url[0]))

            identifier = product_el.xpath('.//button/@data-id')[0].extract()
            loader.add_value('identifier', identifier)
            price = product_el.xpath('.//div[@class="c-qte-prix__prix m-produit-bloc-res-lst__prix"]/text()')[0].extract()
            loader.add_value('price', price)
            # A price that yields no output value is recorded as out of stock.
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)

            name = product_el.xpath('.//div[@class="m-produit-bloc-res-lst__dim"]/text()')[0].extract().strip().replace(u'\xa0', u' ')
            data = parse_pattern(name)
            if not data:
                self.log('ERROR parsing "{}" [{}]'.format(name, response.url))
                continue

            additional_data = ' '.join(product_el.xpath('.//ul[@class="m-produit__carac c-list-horizontale"]/li/text()').extract())
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating']

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in additional_data else 'No'

            run_flat_found = is_run_flat('%s %s %s' % (loader.get_output_value('name'), name, additional_data))
            run_flat = 'runflat' in additional_data.lower() or run_flat_found
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # Marks are matched literally (optionally wrapped in parentheses);
            # re.escape keeps any regex metacharacter in a mark from being
            # interpreted as pattern syntax.
            matched_marks = [mark for mark in self.all_man_marks
                             if re.search(r'\(?{}\)?'.format(re.escape(mark)), additional_data)]
            manufacturer_mark = matched_marks[0].strip() if matched_marks else ''
            metadata['manufacturer_mark'] = self.all_man_marks.get(manufacturer_mark, '') if manufacturer_mark else ''
            metadata['mts_stock_code'] = ''
            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   metadata['load_rating'],
                                                   metadata['speed_rating']))

            # The label block must provide exactly three notes (fuel, grip, noise);
            # any other count fails the unpacking and the label is left empty.
            try:
                fuel, grip, noise = map(unicode.strip, product_el.xpath('.//div[@class="m-produit-bloc-res-lst__etiq hide-for-small"]'
                    '/ul[@class="m-etiq-light"]/li/div[contains(@class, "m-etiq-light__note")]/text()').extract())
            except (ValueError, TypeError):
                fuel, grip, noise = ('', '', '')
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise.replace('dB', '')

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

            yield product

        for page_no in pages:
            meta = response.meta.copy()
            yield Request(add_or_replace_parameter(self.search_url % meta['row'], 'page', page_no),
                          meta=meta, callback=self.parse_search)
Пример #17
0
    def parse(self, response):
        """Parse a tyre search-results table and yield its non-winter products.

        Each ``tyre-search-row`` is turned into a ``Product`` with
        ``MicheldeverMeta`` metadata.  Rows without an identifier, without a
        size/pattern text, or whose size text cannot be parsed are skipped.
        A product whose URL is already present in ``self.images`` is yielded
        directly; otherwise a request for the product page is yielded so that
        ``parse_image`` can complete the item.

        This callback does not follow pagination.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select('//tr[contains(@class,"tyre-search-row")]')

        # Running count of products rejected by is_product_correct (logging only).
        not_found_count = 0

        for product in products:
            url = product.select('.//td/b/a/@href')[0].extract()
            title = product.select('.//td/b/a/text()')[0].extract()
            if 'winter' in title.lower():
                continue

            # The brand is derived from the file name of the brand logo.
            brand = product.select('.//a/img/@src')[0].extract()
            brand = re.search(r'/public/brands/(.*?)(-tyres)?\.',
                              brand).group(1).replace('-', ' ').title()
            # Escape the brand so it is removed as literal text, not as a regex.
            product_name = re.sub(re.escape(brand), '', title).strip()
            fitting_method = 'Delivered'
            identifier = product.select(
                './/input[@name="item_id"]/@value').extract()
            if not identifier:
                identifier = product.select('.//a/@href').re(
                    'email_me_stock/(.*)')
            if not identifier:
                continue

            # Exactly three label texts (fuel, grip, noise) are expected; any
            # other count fails the unpacking and the label is left empty.
            try:
                fuel, grip, noise = map(
                    unicode.strip,
                    product.select(
                        './/img[contains(@alt, "Tyre Label")]/following-sibling::text()'
                    ).extract())
            except (ValueError, TypeError):
                fuel = ''
                grip = ''
                noise = ''

            price = product.select("td[3]/b/text()").extract()
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', identifier[0])
            loader.add_value('name', product_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', url)
            if price:
                loader.add_value('price', price[0])
            else:
                # No price shown: record a zero price and mark as out of stock.
                loader.add_value('price', '0.00')
                loader.add_value('stock', 0)

            pattern_name = product.select('.//i/text()').extract()
            if not pattern_name:
                continue
            pattern_name = pattern_name[0]

            data = re.search(
                r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                pattern_name)
            if not data:
                msg = 'ERROR parsing "{}" [{}]'.format(
                    pattern_name, response.url)
                self.log(msg)
                continue
            data = data.groupdict()

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating'].upper()

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

            run_flat_found = is_run_flat(pattern_name)
            lowered_pattern = pattern_name.lower()
            run_flat = ('run flat' in lowered_pattern
                        or 'runflat' in lowered_pattern or run_flat_found)
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # A manufacturer mark must appear as a whole space-separated token.
            tokens = pattern_name.split(' ')
            matched_marks = [
                mark for mark in self.all_man_marks if mark in tokens
            ]
            manufacturer_mark = matched_marks[0].strip() if matched_marks else ''
            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark) if manufacturer_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                not_found_count += 1
                self.log('%s - PRODUCT IS NOT CORRECT: %r' %
                         (not_found_count, product))
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            if product['url'] in self.images:
                product['image_url'] = self.images[product['url']]
                yield product
            else:
                # Image not known yet: fetch the product page to obtain it.
                yield Request(product['url'],
                              callback=self.parse_image,
                              meta={'product': product},
                              dont_filter=True)
Пример #18
0
    def extract_products(self, response):
        """Yield one product for every price box found in the tyre listing.

        Winter tyres are ignored.  The brand/pattern text of each box is run
        through ``extract_reinforced``, ``extract_run_flat`` and
        ``parse_pattern``; boxes whose pattern cannot be parsed are skipped and,
        unless the pattern is on the silent exclusion list, reported through
        ``self.log`` and ``self.errors``.  Products rejected by
        ``is_product_correct`` are dropped.
        """
        selector = HtmlXPathSelector(response)
        # Patterns whose parse failure is skipped without being reported.
        silent_excludes = ('sport contact', 'advantage sport', 'expedia s02',
                           'zero rosso')

        boxes = selector.select(
            '//div[@class="listcontPART"]//div[@class="conprcbx"]')
        for box in boxes:
            brand_texts = box.select(
                './div[@class="dec_tyrebnt"]/p/b/text()').extract()
            brand = brand_texts.pop().strip()

            pattern_parts = box.select(
                './div[@class="dec_tyrebnt"]/p/text()').extract()
            pattern = "".join(pattern_parts).strip()

            # skip winter tyres
            if 'winter' in pattern.lower():
                continue

            xl, pattern = extract_reinforced(pattern)
            run_flat, pattern = extract_run_flat(pattern)
            parsed = parse_pattern(pattern)
            if not parsed:
                lowered = pattern.lower()
                if not any(text in lowered for text in silent_excludes):
                    msg = 'Could not parse pattern: %s' % fix_spaces(
                        pattern).encode('utf-8')
                    self.log('[CARTYRES] %s' % msg)
                    self.errors.append(msg)
                continue
            width, ratio, rim, load_rating, speed_rating, name = parsed

            identifier = box.select(".//p/@onclick").re(
                "AddCarToShortList\('([^']*)',")

            # Every product is reported under the spider's first start URL.
            url = self.start_urls[0]

            raw_price = box.select(
                './/div[@class="dec_fittdbnt"]/h1/text()').extract().pop()
            price = fix_spaces(raw_price)

            image_url = box.select(
                '../..//div[@class="uptyre_prt"]/img/@src').extract()[0]

            mark_titles = box.select(
                './/div[@class="bndLGO1"]/img/@title').extract()
            man_mark = mark_titles[0] if mark_titles else ''
            # Remember every manufacturer mark title seen during the crawl.
            if mark_titles and man_mark not in self.man_marks:
                self.man_marks.add(man_mark)

            loader = ProductLoader(Product(), selector=selector)
            loader.add_value('name', name)
            loader.add_value('identifier', identifier.pop())
            loader.add_value('price', price)
            loader.add_value('url', url)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', unify_brand(brand))
            segment = find_brand_segment(loader.get_output_value('brand'))
            loader.add_value('category', segment)

            metadata = MicheldeverMeta()
            metadata['width'] = width
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['load_rating'] = load_rating
            metadata['speed_rating'] = speed_rating
            metadata['fitting_method'] = 'Fitted'
            metadata['run_flat'] = run_flat
            metadata['xl'] = xl

            # Translate the mark title into its code; unknown marks give ''.
            man_code = ''
            if man_mark and man_mark in man_mark_mapping:
                man_code = man_mark_mapping[man_mark]
            metadata['manufacturer_mark'] = man_code

            size_parts = (width, ratio, rim, load_rating, speed_rating)
            metadata['full_tyre_size'] = '/'.join(size_parts)

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed_rating = product['metadata']['speed_rating']
            # Prefer the looked-up alternative rating; otherwise keep the scraped
            # rating as the alternative when the looked-up rating replaces it.
            if new_alt_speed:
                alternative = new_alt_speed
            elif scraped_speed_rating != new_speed_rating:
                alternative = scraped_speed_rating
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #19
0
    def parse_product(self, response):
        """Parse a product page and yield one product per size option.

        The brand comes from ``response.meta['brand']`` and is removed from the
        page heading to obtain the product name.  Each option block supplies
        its own identifier, price and size text; options without an identifier
        or without a size text are skipped, and size texts that cannot be
        parsed are logged and appended to ``self.errors``.

        :param response: product page; ``response.meta`` may carry ``brand``
            and a fallback ``price`` used when the option shows none.
        """
        hxs = HtmlXPathSelector(response)
        # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
        # the pattern should be set as the product's name
        brand = response.meta.get('brand') or ''
        product_name = hxs.select('//h2[@class="heading black"]/text()')[0].extract().strip()
        # Escape the brand so it is removed as literal text, not as a regex.
        product_name = re.sub(re.escape(brand), '', product_name).strip()
        fitting_method = 'Delivered'

        image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
        options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
        for option in options:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('name', product_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', response.url)
            if image_url:
                loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))
            identifier = option.select('../input[@type="hidden" and @name="item_id"]/@value').extract()
            if not identifier:
                identifier = option.select('./a/@href').re('email_me_stock/(.*)')
            if not identifier:
                continue
            loader.add_value('identifier', identifier[0])
            price = option.select('./strong[@class="price" and not(contains(text(),"On Backorder"))]/text()').extract()
            if price:
                loader.add_value('price', price[0])
            else:
                # No usable price on the option: fall back to the price passed
                # in the request meta (or zero) and mark it as out of stock.
                if response.meta.get('price'):
                    loader.add_value('price', response.meta['price'])
                else:
                    loader.add_value('price', '0.00')
                loader.add_value('stock', 0)

            pattern_name = option.select('./p/strong/text()').extract()
            if not pattern_name:
                pattern_name = option.select('./strong/text()').extract()
            if not pattern_name:
                # Without the size text no metadata can be derived; skip this
                # option instead of failing the remaining ones with IndexError.
                log.msg('ERROR: size text not found [{}]'.format(response.url))
                continue
            pattern_name = pattern_name[0]
            data = re.search(r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                             pattern_name)
            if not data:
                msg = 'ERROR parsing "{}" [{}]'.format(pattern_name, response.url)
                log.msg(msg)
                self.errors.append(msg)
                continue
            data = data.groupdict()

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating'].upper()

            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

            lowered_pattern = pattern_name.lower()
            run_flat = 'run flat' in lowered_pattern or 'runflat' in lowered_pattern
            metadata['run_flat'] = 'Yes' if run_flat else 'No'

            # A manufacturer mark must appear as a whole space-separated token.
            tokens = pattern_name.split(' ')
            matched_marks = [mark for mark in self.all_man_marks if mark in tokens]
            manufacturer_mark = matched_marks[0].strip() if matched_marks else ''
            metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''

            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   metadata['load_rating'],
                                                   metadata['speed_rating']))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Prefer the looked-up alternative rating; otherwise keep the scraped
            # rating as the alternative when the looked-up rating replaces it.
            product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #20
0
    def parse_search(self, response):
        """Parse a search-results table, following pagination links.

        Requests for every pagination link are yielded first.  Each table row
        that is not flagged as a winter tyre is then turned into a ``Product``
        with ``MicheldeverMeta`` metadata.  When ``is_product_correct`` rejects
        the freshly scraped metadata, the metadata stored for the same
        identifier in ``self.old_meta_df`` (if any) is tried instead; the
        product is dropped if that is rejected as well.
        """
        base_url = get_base_url(response)

        urls = response.xpath('//div[@class="pagination tCenter"]//a/@href').extract()
        for url in urls:
            yield Request(urljoin(base_url, url), callback=self.parse_search)

        products = response.xpath('//*[@class="table search-results vCenter"]/tbody//tr')
        for product in products:
            season = product.xpath('.//i[contains(@class, "season")]/@class').extract()

            if season and 'winter' in season[0]:
                continue
            loader = ProductLoader(item=Product(), selector=product)
            brand = product.xpath('./td/a[@class="item-ref"]/span[1]/text()').extract()[0]
            name = product.xpath('./td/a[@class="item-ref"]/span[2]/text()').extract()[0]
            loader.add_value('name', name)

            pattern = product.xpath('./td/a[@class="item-ref"]/small/text()').extract()[0]

            data = extract_data(pattern)
            if not data:
                self.log("ERROR. Unable to parse pattern: %s" % pattern)
                continue
            width, aspect_ratio, rim, load_rating, speed_rating = data

            if 'goodrich' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            # The price text uses '.' for thousands and ',' for decimals.
            price = ''.join(product.xpath('.//div[@class="hidden-xs"]/span[@class="prix"]/text()').re(r'[\d\.,]+'))\
                      .replace('.', '').replace(",", ".")
            loader.add_value('price', extract_price(price))
            identifier = product.xpath('@data-id').extract()[0]
            loader.add_value('identifier', identifier)
            url = product.xpath('./td[2]/a/@href').extract()[0]
            loader.add_value('url', urljoin(base_url, url))
            image_url = product.xpath('./td[@class="img"]//img/@src').extract()
            # The length limit applies to the URL string itself, not to the list
            # of matches.  NOTE(review): presumably meant to drop inline/over-long
            # image sources - confirm the intended limit.
            if image_url and len(image_url[0]) < 250:
                loader.add_value('image_url', urljoin(base_url, image_url[0]))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Delivered'
            metadata['load_rating'] = load_rating
            specif = product.xpath('.//span[@class="specif"]/text()').extract()
            specif = [x.lower() for x in specif]
            metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
            run_flat_found = is_run_flat('%s %s' % (name, ' '.join(specif)))
            metadata['run_flat'] = 'Yes' if ('runflat' in specif) \
                                            or ('run flat' in ' '.join(specif)) or run_flat_found else 'No'
            # Look the mark up in the shared table first, then in the
            # spider-specific one; the first code found among the specs wins.
            man_code = ''
            for code, man_mark in self.all_man_marks.iteritems():
                if code.lower() in specif:
                    man_code = man_mark
                    break
            if man_code == '':
                for code, man_mark in self.custom_man_marks.iteritems():
                    if code.lower() in specif:
                        man_code = man_mark
                        break
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                   metadata['aspect_ratio'],
                                                   metadata['rim'],
                                                   load_rating,
                                                   speed_rating))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                # Fall back to the metadata stored for this identifier.  The
                # lookup is done only here so accepted products skip it.
                if self.old_meta_df is None:
                    continue
                old_meta = self.old_meta_df[self.old_meta_df['identifier'] == identifier]
                if old_meta.empty:
                    continue
                product['metadata'] = dict(old_meta.iloc[0].metadata)
                try:
                    product_correct = is_product_correct(product)
                except Exception as e:
                    self.log('%r' % e)
                    continue
                if not product_correct:
                    continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

            yield product
Пример #21
0
    def parse(self, response):
        """Parse a results page for one search row and yield its products.

        Width, aspect ratio and rim are taken from ``response.meta['row']``;
        load and speed rating are read from the product heading.  Products
        without a heading or without a price are skipped.  A product without
        an identifier is logged and ends processing of the page, so neither
        the remaining products nor the next page are handled.  After the
        products, the ">" pagination link (if present) is requested with the
        same meta.

        :param response: results page; ``response.meta['row']`` must provide
            the keys 'Width', 'Aspect Ratio', 'Rim' and 'Alt Speed'.
        """
        row = response.meta['row']

        products = response.xpath(
            '//div[contains(@class, "product-recommended")]')
        products += response.xpath(
            '//div[@class="product-section"]/div[contains(@class, "product")]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            brand = product_el.xpath(
                './/input[@name="brand"]/@value').extract()
            brand = brand[0] if brand else ''

            # Use the spelling from self.brands when the brand matches
            # case-insensitively.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand

            full_name = ''.join(product_el.xpath('.//h2/text()').extract())
            if not full_name:
                continue

            # Everything after the brand in the heading is the product name.
            # The brand is escaped so it is matched as literal text.
            full_name_splt = re.split(re.escape(brand), full_name, flags=re.I)
            name = ' '.join(full_name_splt[1:]).strip()
            loader.add_value('name', name)

            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.xpath(
                './/input[@name="prodCode"]/@value').extract()
            if identifier:
                identifier = identifier[0]
            else:
                self.log('Product without identifier')
                search_params = '/'.join([
                    row['Aspect Ratio'], row['Rim'], row['Width'],
                    row['Alt Speed']
                ])
                self.log('Search parameters: ' + search_params)
                return

            loader.add_value('url', response.url)
            image_url = product_el.xpath(
                './/div[contains(@class, "product-im")]/img/@src').extract()
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url[0]))
            loader.add_value('identifier', identifier)

            price = ''.join(
                product_el.xpath('.//*[@class="price"]//text()').re(
                    r'[\d\.,]+'))

            if not price:
                continue

            loader.add_value('price', price)

            metadata = MicheldeverMeta()

            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']

            # The first whitespace-delimited token starting with digits is
            # taken as load index + speed symbol: its last character is the
            # speed rating, the rest the load rating.
            load_speed_match = re.search(r'(\s\d+\w+\s)', full_name)
            load_and_speed = (load_speed_match.group().strip()
                              if load_speed_match else '')
            metadata['speed_rating'] = load_and_speed[-1:]
            metadata['load_rating'] = load_and_speed[:-1]

            metadata['width'] = row['Width']

            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in full_name.upper() else 'No'
            run_flat_found = is_run_flat(full_name)
            metadata['run_flat'] = 'Yes' if 'RUNFLAT' in full_name.upper(
            ) or run_flat_found else 'No'

            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)

            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))

            # Exactly three label texts (fuel, grip, noise) are expected; any
            # other count fails the unpacking and the label is left empty.
            try:
                fuel, grip, noise = map(
                    unicode.strip,
                    product_el.xpath(
                        './/div[contains(@class, "feature-image") or contains(@class, "feature-block")]'
                        '//span[@class="icon-text"]/text()').extract())
            except (ValueError, TypeError):
                fuel = ''
                grip = ''
                noise = ''

            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            yield product

        next_page = response.xpath(
            u'//ul[@class="pagination"]//a[contains(text(), ">")]/@data-url'
        ).extract()
        if next_page:
            yield Request(next_page[0], dont_filter=True, meta=response.meta)
Пример #22
0
    def parse(self, response):
        """Yield one product for every non-winter tyre tile in the grid view.

        All fields are read from the result tile itself (its ``data-*``
        attributes and child nodes).  A tile whose size text matches neither
        size pattern is logged and skipped, and a tile that does not expose
        exactly three EU-label attributes gets empty fuel/grip/noise values,
        so one malformed tile does not abort the rest of the page.
        """
        tiles = response.xpath(
            '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]'
        )

        for tile in tiles:
            # Winter tyres are not collected.
            if tile.xpath('@data-filter-season').extract()[0] == 'Winter':
                continue

            name = tile.xpath(
                './/div[contains(@class, "tyre-model text-center")]/text()'
            ).extract()[0]
            brand = tile.xpath('@data-filter-brand').extract()[0]

            loader = ProductLoader(item=Product(), selector=tile)
            loader.add_value('name', brand + ' ' + name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('identifier',
                             tile.xpath('@data-tyreid').extract()[0])
            # The listing page itself is recorded as the product URL.
            loader.add_value('url', response.url)
            image_url = tile.xpath(
                './/div[contains(@class, "tyre-image")]//img/@src'
            ).extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_url[0]))
            price = tile.xpath(
                './/div[contains(@class, "tyre-pricing-information")]/div/text()'
            ).re(r'[\d,.]+')
            # A tile without a readable price is recorded with a zero price.
            loader.add_value('price', price[0] if price else '0.00')

            tyresize_text = tile.xpath(
                './/div[contains(@class, "tyre-size")]/text()').extract(
                )[0].strip()
            # The preferred form carries a number in parentheses after the
            # size (used as the load rating); the fallback form has none.
            size = re.search(r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)',
                             tyresize_text, re.I)
            if size:
                width, aspect, speed_rating, rim, load_rating = size.groups()
            else:
                size = re.search(r'(\d+)\/(\d+)(\w{1})(\d+)', tyresize_text,
                                 re.I)
                if not size:
                    self.log("Unable to parse tyre size '%s' on %s" %
                             (tyresize_text, response.url))
                    continue
                width, aspect, speed_rating, rim = size.groups()
                load_rating = ''

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            metadata['alternative_speed_rating'] = ''
            xl = tile.xpath('@data-filter-reinforced').extract()[0] == 'Y'
            metadata['xl'] = 'Yes' if xl else 'No'

            # Run-flat is flagged either by the tile attribute or by the name.
            run_flat_found = is_run_flat(loader.get_output_value('name'))
            run_flat = tile.xpath('@data-filter-runflat').extract()[0] == 'Y'
            metadata['run_flat'] = (
                'Yes' if run_flat or run_flat_found else 'No')

            manufacturer_mark = tile.xpath(
                './/span[contains(@title, "Homologated for fitment to certai")]/@title'
            ).re(r'Homologated for fitment to certain (.*) cars\.')
            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark[0]) if manufacturer_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            # The three EU-label attributes are assigned positionally, in the
            # order the selector returns them.
            eu_label = tile.xpath('@data-filter-tyreefficiencyr'
                                  '|@data-filter-tyreefficiencyg'
                                  '|@data-filter-tyreefficiencyd').extract()
            if len(eu_label) == 3:
                fuel, grip, noise = eu_label
            else:
                fuel = grip = noise = ''
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            yield product
Пример #23
0
    def parse_product(self, response):
        """Build a single product, with tyre metadata, from a product page.

        Nothing is yielded for winter tyres, for pages whose size heading
        cannot be parsed (the error is logged and appended to
        ``self.errors``), or for products rejected by ``is_product_correct``.
        A missing or incomplete EU label yields empty fuel/grip/noise values.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        title = hxs.select('//h2/text()')[0].extract()
        if 'winter' in title.lower():
            return
        title_upper = title.upper()

        brand = title.split(' ')[0]
        price = hxs.select('//td[contains(text(), "1 Tyre")]/following-sibling::td[@class="align-right"]/strong/text()')[0].extract()
        if brand.strip() == 'R27':
            # Fix wrong product: when the title starts with "R27" the brand
            # is forced to Toyo and the title is kept whole as the name.
            product_name = title
            brand = 'Toyo'
        else:
            product_name = title.replace(brand, '')
        loader.add_value('name',
                         product_name.replace('XL', '').replace('RF', ''))
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))

        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath('identifier', '//div[@class="hiddenFields"]/input[@name="sku"]/@value')
        image_url = hxs.select('//div[contains(@class, "sidebar")]/div[@class="align-center"]/img/@src')[0].extract()
        loader.add_value('image_url', image_url)

        speed_rating = hxs.select('//table[@class="blank-table"]//strong[contains(text(), "Speed Rating:")]/parent::td/following-sibling::td/text()').extract()[0]
        # The speed rating text is removed from the load index cell text.
        load_rating = hxs.select('//table[@class="blank-table"]//strong[contains(text(), "Load Index:")]/parent::td/following-sibling::td/text()').extract()[0].replace(speed_rating, "")

        size = hxs.select('//h3/text()')[0].extract()

        width, aspect_ratio, _, rim = parse_tyre_size(size)
        if not width:
            msg = "Error parsing '%s' on page %s" % (size, response.url)
            self.log(msg)
            self.errors.append(msg)
            return

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['width'] = width
        metadata['speed_rating'] = speed_rating.upper()
        metadata['load_rating'] = load_rating

        run_flat_found = is_run_flat(title)
        is_run_flat_tyre = ('RUNFLAT' in title_upper or 'RF' in title_upper
                            or run_flat_found)
        metadata['run_flat'] = 'Yes' if is_run_flat_tyre else 'No'
        metadata['xl'] = 'Yes' if 'XL' in title_upper else 'No'

        metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                               metadata['aspect_ratio'],
                                               metadata['rim'],
                                               metadata['load_rating'],
                                               metadata['speed_rating']))

        metadata['fitting_method'] = 'Fitted'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(title)

        try:
            fuel, grip, noise = hxs.select('//div[@class="eu-label"]//span/text()').extract()
        except ValueError:
            # The label did not provide exactly three values.
            fuel, grip, noise = ('', '', '')
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise.replace('dB', '')

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

        yield product
Пример #24
0
    def parse(self, response):
        """Follow pagination links and yield the non-winter tyres on the page.

        Each ``schema.org/Product`` article is turned into one product.  The
        article title is split around the brand name: the text before the
        brand is parsed as the tyre size, the text after it becomes the
        product name.  Articles whose title cannot be split that way, whose
        size cannot be parsed, or that are marked as winter tyres are skipped.
        """
        pages = set(
            response.xpath(
                '//*[contains(@class, "pagination__item")]/a[not(contains(@class, "pagination__current"))]/@href'
            ).extract())
        for page_url in pages:
            yield Request(response.urljoin(page_url), meta=response.meta)

        products = response.xpath(
            '//article[@itemtype="http://schema.org/Product"]')

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            brand = product_el.xpath(
                './/*[@itemprop="brand"]//*[@itemprop="name"]/text()').extract(
                )[0].strip()
            if brand.upper() in brands_substitute:
                brand = brands_substitute[brand.upper()]
            full_name = product_el.xpath(
                './/*[contains(@class, "product__title") and @itemprop="name"]/text()'
            ).extract()[0]
            try:
                # The brand is escaped so that it is matched literally even
                # when it contains regex metacharacters.
                tyre_size, name = re.split(re.escape(brand), full_name,
                                           flags=re.I)
            except ValueError:
                # The brand is absent from the title or occurs more than once.
                self.log(
                    "[[TESTING]] Can not split tyre '%s' with brand '%s'" %
                    (full_name, brand))
                continue
            loader.add_value('name', name)

            winter_tyre = product_el.xpath(
                './/*[@class="product__info"]//*[@data-icon="S" and contains(text(), "Winter")]'
            )
            if winter_tyre:
                continue

            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = self.get_identifier(product_el)

            out_of_stock = product_el.xpath(
                './/*[@itemprop="availability" and contains(@content, "Out")]'
            )
            if out_of_stock:
                loader.add_value('stock', 0)

            # The listing page itself is recorded as the product URL.
            loader.add_value('url', response.url)

            image_url = product_el.xpath(
                './/img[@itemprop="image"]/@src').extract()
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url[0]))

            loader.add_value('identifier', identifier)
            price = product_el.xpath('@data-price').extract()[0]
            loader.add_value('price', price)

            metadata = MicheldeverMeta()
            res = parse_pattern(tyre_size)
            if not res:
                continue
            width, ratio, rim, load_rating, speed_rating = res
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating
            metadata['width'] = width

            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            xl = bool(
                product_el.xpath(
                    './/*[@class="product__info"]//*[@data-icon="XL"]'))
            metadata['xl'] = 'Yes' if xl else 'No'

            # Run-flat is flagged by the RF icon, by " RFT" in the name, or
            # by is_run_flat() on the full title.
            run_flat_found = is_run_flat(full_name)
            run_flat = bool(
                product_el.xpath(
                    './/*[@class="product__info"]//*[@data-icon="RF"]'))
            if not run_flat:
                run_flat = ' RFT' in name
            metadata['run_flat'] = (
                'Yes' if run_flat or run_flat_found else 'No')

            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            try:
                fuel, grip, noise = product_el.xpath(
                    './/li[contains(@class, "product__meta-item--")]/text()'
                ).extract()
            except ValueError:
                # The article did not provide exactly three label values.
                fuel, grip, noise = ('', '', '')

            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise

            product = loader.load_item()
            # The website is defaulting to 2 tyres with a discount of £10
            if product.get('price') and (not self.price_discount):
                product['price'] += Decimal('10')
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata'][
                'mts_stock_code'] = self.find_mts_stock_code(product)

            yield product
Пример #25
0
    def parse_product(self, response):
        """Build a single 'Delivered' product from a search-result page.

        The tread text is stored as the product name, while the full tyre
        description is what gets parsed for size, XL, run-flat and
        manufacturer-mark metadata.  Pages without a tread name or with an
        unparsable description are recorded in ``self.errors`` and produce
        nothing.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        tread = hxs.select('//td[@class="tread"]/text()').extract()
        if not tread:
            msg = "No name found on page: %s" % response.url
            self.errors.append(msg)
            self.log("[ERROR] %s" % msg)
            return
        loader.add_value('name', tread[0])

        brand_nodes = hxs.select(
            '//table[@class="single searchresults"]//td[@class="tyreinfo"]/b/text()'
        )
        brand = brand_nodes.extract()[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(brand))
        fitting_method = 'Delivered'

        loader.add_value('url', response.url)

        if hxs.select(
                '//table[@class="single searchresults"]//span[@class="outofstock"]'
        ):
            loader.add_value('stock', 0)

        image_src = hxs.select(
            '//table[@class="single searchresults"]//td[@class="logo-pic"]/img/@src'
        ).extract()
        if image_src:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_src[0]))

        identifier = hxs.select(
            '//table[@class="single searchresults"]//form/input[@name="pid"]/@value'
        )[0].extract()
        loader.add_value('identifier', identifier)
        price = hxs.select(
            '//table[@class="single searchresults"]//td[@class="netprice"]/text()'
        )[0].extract()
        loader.add_value('price', price)

        # Full tyre description; everything below is derived from it.
        full_name = hxs.select(
            '//table[@class="single searchresults"]//td[@class="tyreinfo"]/span/text()'
        )[0].extract()
        data = parse_pattern(full_name)
        if not data:
            log.msg('ERROR parsing "{}" [{}]'.format(full_name, response.url))
            self.errors.append('ERROR parsing "{}" [{}]'.format(
                full_name, response.url))
            return

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']

        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in full_name else 'No'
        metadata['run_flat'] = 'Yes' if 'rflat' in full_name.lower() else 'No'

        # A '*' anywhere in the description takes precedence; otherwise the
        # first known mark that appears as a space-separated token is used.
        if '*' in full_name:
            mark_key = '*'
        else:
            tokens = full_name.split(' ')
            known = [
                mark for mark in self.all_man_marks.keys() if mark in tokens
            ]
            mark_key = known[0].strip() if known else ''
        if mark_key:
            metadata['manufacturer_mark'] = self.all_man_marks.get(
                mark_key, '')
        else:
            metadata['manufacturer_mark'] = ''

        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        # Replace the scraped speed rating with the looked-up one; when no
        # alternative is returned and the two differ, the scraped value is
        # kept as the alternative speed rating.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
Пример #26
0
    def extract_products(self, hxs, url):
        """Yield a product for every usable tyre container found in ``hxs``.

        ``url`` is only used in error messages; every product is stored with
        the fixed site URL.  Containers with no tyre details or with details
        that cannot be parsed are logged and appended to ``self.errors``;
        winter tyres and containers without a price are skipped silently.
        """
        for tyre_el in hxs.select(
                '//div[starts-with(@class,"tyre_container round")]'):
            details_text = "".join(
                tyre_el.select(
                    './/p[@class="tyre_details"]//text()').extract())
            tyre_options = fix_spaces(details_text).strip()
            if not tyre_options:
                msg = 'Could not extract tyre options from element from %s' % url
                self.log('ERROR: %s' % msg)
                self.errors.append(msg)
                continue

            parsed = parse_pattern(tyre_options)
            if not parsed:
                msg = "ERROR parsing: %s on %s" % (tyre_options, url)
                self.log(msg)
                self.errors.append(msg)
                continue
            width, ratio, rim, load_rating, speed_rating, name = parsed

            # skip winter tyres
            if tyre_el.select(".//div[@class='tyre_winter']"):
                continue

            name = name.strip()
            identifier = tyre_el.select("./@id").extract()[0]
            price_text = "".join(
                tyre_el.select(
                    ".//p[@class='tyre_price']//text()").extract()).strip()
            if not price_text:
                continue
            brand = tyre_el.select(
                ".//span[@class='tyre_brand_text']/text()").extract()[0]
            image_src = tyre_el.select("img/@src").extract()[0]
            image_url = urljoin_rfc('http://asdatyres.co.uk', image_src)

            # Presence of the marker divs decides the run-flat / XL flags.
            rf_markers = tyre_el.select(".//div[@class='tyre_rf']").extract()
            run_flat = 'Yes' if rf_markers else 'No'
            xl_markers = tyre_el.select(".//div[@class='tyre_xl']").extract()
            xl = 'Yes' if xl_markers else 'No'

            if xl == 'Yes':
                name = name.replace("XL", "").strip()

            loader = ProductLoader(Product(), selector=hxs)
            loader.add_value('name', name)
            loader.add_value('identifier', identifier)
            loader.add_value('price', price_text)
            loader.add_value('url', 'http://www.asdatyres.co.uk/')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            metadata = MicheldeverMeta()
            metadata['width'] = width
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['load_rating'] = load_rating
            metadata['speed_rating'] = speed_rating
            metadata['run_flat'] = run_flat
            metadata['xl'] = xl
            metadata['fitting_method'] = 'Fitted'

            # First known code contained in the name wins; '' when none does.
            metadata['manufacturer_mark'] = next(
                (man_mark
                 for code, man_mark in self.all_man_marks.iteritems()
                 if code in name), '')

            metadata['full_tyre_size'] = '/'.join(
                (width, ratio, rim, load_rating, speed_rating))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Replace the scraped speed rating with the looked-up one; when
            # no alternative is returned and the two differ, the scraped
            # value is kept as the alternative speed rating.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alternative = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alternative = scraped_speed
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #27
0
    def parse_products(self, response):
        """Yield the tyres on a listing page, then follow the next-page link.

        The searched size is taken from ``response.meta['product_data']``
        (keys ``Width``, ``Aspect Ratio``, ``Rim``, ``Speed rating`` and
        ``Alt Speed``) and is used both to build the title regexes and to
        fill the size metadata.  Entries without a title link, or whose
        title cannot be parsed, are skipped.
        """
        hxs = HtmlXPathSelector(response)

        product_data = response.meta['product_data']
        width = product_data['Width']
        aspect_ratio = product_data['Aspect Ratio']
        rim = product_data['Rim']
        speed_rating = product_data['Speed rating']
        alt_speed = product_data['Alt Speed']

        # Title patterns, tried in order: brand / load rating / name around
        # the searched speed rating, the same with the alternative speed
        # rating, and finally brand / remainder with no speed rating at all.
        name_reg = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (
            width, rim, speed_rating.upper())
        name_reg2 = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (
            width, rim, alt_speed.upper())
        name_reg3 = r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim)
        products = hxs.select(
            '//div[@id="product-listing"]//div[@class="product"]/..')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            try:
                url = product_el.select(
                    './/div[@class="title"]/a/@href')[0].extract()
            except IndexError:
                # No title link in this element.
                continue
            loader.add_value('url', url)
            loader.add_value(
                'identifier',
                product_el.select(".//span[@class='addcompare']/input/@id").
                extract()[0].split(":")[1])
            loader.add_xpath('price', './/span[@class="prodPirce"]/text()')
            try:
                name = product_el.select(
                    './/div[@class="title"]/a/text()')[0].extract()
            except IndexError:
                # The title link has no text.
                continue
            run_flat_found = is_run_flat(name)
            if not re.search(r'(\(.*\))', name):
                match = (re.search(name_reg, name)
                         or re.search(name_reg2, name)
                         or re.search(name_reg3, name))
                if not match:
                    self.log('Failed parsing ' + name)
                    self.log('URL: ' + response.url)
                    self.log('Params: ' + ", ".join(
                        map(str, [width, rim, speed_rating.upper()])))
                    continue
                # NOTE(review): name_reg3 has only two groups, so for that
                # fallback name_parts[1] (used as the load rating below) is
                # the same text as the name - confirm this is intended.
                name_parts = match.groups()
            else:
                # Titles with a parenthesised part: the brand is the first
                # word, the load rating is the number directly before the
                # speed rating, the name is the text up to the first '('.
                load_rating_reg = re.search(r'(\d+)%s' % speed_rating.upper(),
                                            name)
                if not load_rating_reg:
                    load_rating_reg = re.search(r'(\d+)%s' % alt_speed.upper(),
                                                name)
                if not load_rating_reg:
                    self.log('Failed parsing ' + name)
                    self.log('URL: ' + response.url)
                    self.log('Params: ' + ", ".join(
                        map(str, [width, rim, speed_rating.upper()])))
                    continue
                name_parts = [
                    name.split()[0],
                    load_rating_reg.groups()[0],
                    ' '.join(name.split()[1:]).split('(')[0],
                ]

            loader.add_value(
                'name',
                name_parts[-1].replace('XL',
                                       '').replace('ROF',
                                                   '').replace('RFT', ''))
            brand = name_parts[0]
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_xpath('image_url',
                             './/a[contains(@class, "tyre")]/img/@src')

            name_upper = name.upper()
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['width'] = width
            metadata['speed_rating'] = speed_rating.upper()
            metadata['load_rating'] = name_parts[1]
            if 'ROF' in name_upper or 'RFT' in name_upper or run_flat_found:
                metadata['run_flat'] = 'Yes'
            else:
                metadata['run_flat'] = 'No'
            metadata['xl'] = 'Yes' if 'XL' in name_upper else 'No'

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            metadata['fitting_method'] = 'Fitted'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                name_parts[-1])

            # EU label values are encoded in the CSS class names of the
            # legislation links.
            fuel = product_el.select(
                './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a[contains(@class, "fuel_")]/@class'
            ).re(r'fuel_(\w)')
            metadata['fuel'] = fuel[0] if fuel else ''
            grip = product_el.select(
                './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a[contains(@class, "grip_")]/@class'
            ).re(r'grip_(\w)')
            metadata['grip'] = grip[0] if grip else ''
            noise = product_el.select(
                './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a[contains(@class, "noise_")]/@class'
            ).re(r'_(\d+)')
            metadata['noise'] = noise[-1] if noise else ''

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                self.log('The product is not correct: %r' % product)
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            yield product

        next_page = hxs.select('//span[@class="nextlink"]/a/@href')
        if next_page:
            yield Request(next_page.extract()[0],
                          callback=self.parse_products,
                          meta=response.meta)
Пример #28
0
    def parse_search(self, response):
        """Follow pagination links and yield 'Delivered' products per row.

        Rows flagged with the winter icon class and rows whose size pattern
        cannot be parsed are skipped.  XL, run-flat and manufacturer-mark
        flags are read from the lower-cased "specif" labels of the row.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # pagination
        page_links = hxs.select(
            '//div[@class="pagination pagination-centered"]//a/@href').extract(
            )
        for href in page_links:
            yield Request(urljoin(base_url, href), callback=self.parse_search)

        # parse products list
        for row in hxs.select('//*[@id="searchRes"]/tbody//tr'):
            season = row.select('.//td[4]/i/@class').extract()
            # skip winter tyres
            if season and 'ico-type ico-W' in season[0]:
                continue

            loader = ProductLoader(item=Product(), selector=row)
            brand, name = row.select('./td[2]/a/b/text()').extract()
            loader.add_value('name', name)

            pattern = row.select('./td[2]/a/small/text()').extract()[0]
            data = extract_data(pattern)
            if not data:
                self.log("ERROR. Unable to parse pattern: %s" % pattern)
                continue
            width, aspect_ratio, rim, load_rating, speed_rating = data

            if 'goodrich' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            # The price is split between the span text and a <sup> child;
            # the pound sign is stripped from the latter before joining.
            price_main = row.select(
                './/span[@class="pr"]/text()').extract()[0]
            price_decimals = row.select(
                './/span[@class="pr"]/sup/text()').extract()[0]
            price_decimals = price_decimals.replace(u'\xa3', '')
            loader.add_value('price',
                             extract_price(price_main + price_decimals))

            loader.add_value('identifier',
                             row.select('@data-id').extract()[0])
            product_href = row.select('./td[2]/a/@href').extract()[0]
            loader.add_value('url', urljoin(base_url, product_href))
            image_src = row.select('./td[1]/img/@src').extract()
            if image_src:
                loader.add_value('image_url', urljoin(base_url, image_src[0]))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Delivered'
            metadata['load_rating'] = load_rating

            specif = [
                label.lower()
                for label in row.select(
                    './/span[@class="specif"]/text()').extract()
            ]
            metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
            metadata['run_flat'] = 'Yes' if 'runflat' in specif else 'No'

            # Standard marks are tried first; the custom marks are consulted
            # only when that lookup produced an empty code.
            man_code = next(
                (man_mark
                 for code, man_mark in self.all_man_marks.iteritems()
                 if code.lower() in specif), '')
            if man_code == '':
                man_code = next(
                    (man_mark
                     for code, man_mark in self.custom_man_marks.iteritems()
                     if code.lower() in specif), '')
            metadata['manufacturer_mark'] = man_code

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 load_rating, speed_rating))

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            # Replace the scraped speed rating with the looked-up one; when
            # no alternative is returned and the two differ, the scraped
            # value is kept as the alternative speed rating.
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed = product['metadata']['speed_rating']
            if new_alt_speed:
                alternative = new_alt_speed
            elif scraped_speed != new_speed_rating:
                alternative = scraped_speed
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
Пример #29
0
    def parse_product(self, response):
        """Build one tyre product, with its metadata, from a product page.

        Yields ``self.retry_request(response)`` when the identifier form or
        the hidden producer-name input is missing.  Yields nothing when the
        product name cannot be parsed by ``parse_pattern`` (the failure is
        logged and appended to ``self.errors``) or when the loaded item is
        rejected by ``is_product_correct``.  Otherwise yields the item.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        loader.add_value('url', response.url)

        image_srcs = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_srcs:
            absolute_image = urljoin(get_base_url(response), image_srcs[0])
            loader.add_value('image_url', absolute_image)

        # The action of form1 is used as the product identifier.
        form_actions = hxs.select('//form[@name="form1"]/@action').extract()
        if not form_actions:
            yield self.retry_request(response)
            return
        loader.add_value('identifier', form_actions[0])

        price_nodes = hxs.select(
            '//*[@class="price"]/*[@class="mainPrice"]/text()')
        loader.add_value('price', price_nodes[0].extract())
        # No usable price after loader processing: record zero stock.
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        producer_names = hxs.select(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
        if not producer_names:
            yield self.retry_request(response)
            return
        raw_brand = producer_names[0].strip()
        loader.add_value('brand', unify_brand(raw_brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)
        # U+0119 is swapped for a plain "e" before the brand is removed from
        # the page heading below.
        heading_brand = re.sub(u'\u0119', u'e', raw_brand)

        heading = hxs.select('//h1[@itemprop="name"]/text()')[0].extract()
        product_name = re.sub(u'[:\u2122]', u'', heading.strip())
        product_name = product_name.replace(heading_brand, '').strip()

        data = parse_pattern(product_name)
        if not data:
            error_text = 'ERROR parsing "{}" [{}]'.format(product_name,
                                                          response.url)
            log.msg(error_text)
            self.errors.append(error_text)
            return

        # The parsed pattern becomes the product name; the full heading text
        # is still used below to detect XL / run flat / manufacturer marks.
        loader.add_value('name', data['Name'])

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''

        if 'XL' in product_name:
            metadata['xl'] = 'Yes'
        else:
            metadata['xl'] = 'No'

        lowered_name = product_name.lower()
        if 'run on flat' in lowered_name or 'run flat' in lowered_name:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        # First known mark that appears as a space-separated token of the name.
        name_tokens = product_name.split(' ')
        matching_marks = [
            candidate for candidate in self.all_man_marks.keys()
            if candidate in name_tokens
        ]
        mark_key = matching_marks[0].strip() if matching_marks else ''
        if mark_key:
            metadata['manufacturer_mark'] = self.all_man_marks.get(
                mark_key, '')
        else:
            metadata['manufacturer_mark'] = ''

        size_fields = ('width', 'aspect_ratio', 'rim', 'load_rating',
                       'speed_rating')
        metadata['full_tyre_size'] = '/'.join(
            [metadata[field] for field in size_fields])

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            return

        stock_code = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)
        item['metadata']['mts_stock_code'] = stock_code

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        if new_alt_speed:
            alternative = new_alt_speed
        else:
            # Keep the scraped rating as the alternative only when the
            # helper returned a different speed rating.
            scraped_speed = item['metadata']['speed_rating']
            if scraped_speed != new_speed_rating:
                alternative = scraped_speed
            else:
                alternative = ''
        item['metadata']['alternative_speed_rating'] = alternative
        item['metadata']['speed_rating'] = new_speed_rating

        yield item
Пример #30
0
    def parse(self, response):
        """Scrape the tyres listed on a search-results page.

        ``response.meta['row']`` must hold the searched size; its
        ``'Width'``, ``'Aspect Ratio'`` and ``'Rim'`` values are copied into
        each product's metadata (``'Alt Speed'`` is only used for logging).

        Yields one product per ``tyre_container`` element that is not a
        winter tyre, has name text and a price, and passes
        ``is_product_correct``.  Afterwards yields one request per brand
        filter found on the page, handled by this same callback.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        row = response.meta['row']

        products = hxs.select('//div[contains(@class, "tyre_container")]')

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            brand = product_el.select(
                'form/span[@class="tyre_brand_text"]/text()').extract()
            brand = brand[0] if brand else ''

            winter_tyre = product_el.select(
                'div[@class="tyre_type"]/div[@class="tyre_winter"]').extract()
            # skip winter tyres
            if winter_tyre:
                continue

            # Use the spelling from self.brands when the scraped brand
            # matches one of them case-insensitively.
            scraped_brand = brand.strip().upper()
            for tyre_brand in self.brands:
                if tyre_brand.upper() == scraped_brand:
                    brand = tyre_brand

            full_name = ' '.join(
                text.strip() for text in product_el.select(
                    'form/p[@class="tyre_details"]//text()').extract())
            if not full_name:
                continue

            # The first two whitespace-separated tokens are dropped from the
            # name (presumably the size part of the description -- confirm).
            loader.add_value('name', ' '.join(full_name.split()[2:]))
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.select('@id').extract()
            if identifier:
                identifier = identifier[0]
            else:
                log.msg('Product without identifier')
                search_params = '/'.join([
                    row['Aspect Ratio'], row['Rim'], row['Width'],
                    row['Alt Speed']
                ])
                log.msg('Search parameters: ' + search_params)
                # NOTE(review): this abandons the whole page, including the
                # remaining products and the brand-filter requests below;
                # confirm that `continue` is not what was intended.
                return

            loader.add_value('url', response.url)
            image_url = product_el.select(
                'img[contains(@class, "tyre_image")]/@src').extract()
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            loader.add_value('identifier', identifier)

            price = ''.join(
                product_el.select(
                    'div/p[@class="tyre_price"]//text()').extract())

            if not price:
                continue

            loader.add_value('price', price)

            metadata = MicheldeverMeta()

            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']

            # First whitespace-delimited token that starts with digits, e.g.
            # "91V": the last character is taken as the speed rating and
            # everything before it as the load rating.  Raw string so the
            # regex escapes are not treated as (invalid) string escapes.
            speed = re.search(r'(\s\d+\w+\s)', full_name)
            load_speed = speed.group().strip() if speed else ''
            speed_rating = load_speed[-1:]
            load_rating = load_speed[:-1]

            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating

            metadata['width'] = row['Width']

            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            xl = product_el.select(
                'div[@class="tyre_type"]/div[@class="tyre_xl"]').extract()
            metadata['xl'] = 'Yes' if xl else 'No'
            # Run flat is flagged either by the name text or by the
            # dedicated "tyre_rf" element.
            run_flat_found = is_run_flat(full_name)
            run_flat = product_el.select(
                'div[@class="tyre_type"]/div[@class="tyre_rf"]').extract()
            metadata[
                'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'

            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)

            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))

            fuel = product_el.select(
                './/div[@class="label_ratings"]/div[@class="fuel_rating"]//span[contains(@class, "label_rating_")]/text()'
            ).extract()
            grip = product_el.select(
                './/div[@class="label_ratings"]/div[@class="wet_rating"]//span[contains(@class, "label_rating_")]/text()'
            ).extract()
            noise = product_el.select(
                './/div[@class="label_ratings"]/div[contains(@class, "noise_rating")]/@data-decibels'
            ).extract()
            metadata['fuel'] = fuel[0] if fuel else ''
            metadata['grip'] = grip[0] if grip else ''
            metadata['noise'] = noise[0] if noise else ''

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            yield product

        brand_filters = hxs.select(
            '//div[@class="filter-wrapper"]/div[div/input[@name="brand_filter"]]/p/text()'
        ).extract()
        # Re-request the search once per brand filter; everything after the
        # first '&' of the current URL is replaced by the brand parameter.
        search_url = response.url.split('&')[0]
        for brand_filter in brand_filters:
            url = search_url + '&brand=' + brand_filter.lower()
            yield Request(url, meta=response.meta, callback=self.parse)