示例#1
0
    def parse_product(self, response):
        """Scrape a product page and yield one ``Product`` per sellable item.

        If the page embeds a ``new Product.Config(...)`` JSON blob, the base
        fields are taken from it and one item is yielded for every entry of
        its ``childProducts`` map; otherwise a single item is built from the
        HTML markup.  Item names are remembered in ``self.products_ids`` so an
        identifier that was seen before keeps the name stored for it.
        """
        page = HtmlXPathSelector(response)
        category = response.meta.get('category') or ''

        designer = page.select(
            '//span[@class="title-designer-info"]/a/text()').extract()
        brand = designer[0] if designer else ''

        config_match = re.search('var spConfig = new Product.Config\((.*})\);',
                                 response.body)
        options = None
        if config_match:
            options = json.loads(config_match.group(1))

        if options:
            product_name = options['productName']
            price = options['basePrice']
            image_url = options['imageUrl']
            identifier = options['productId']
        else:
            # No usable config object: read everything from the markup.
            product_name = page.select(
                '//span[@itemprop="name"]/text()')[0].extract()
            # The special price is preferred; the regular one is the fallback.
            price_texts = page.select(
                '//form//p[@class="special-price"]/span[@class="price"]/text()'
            ).extract() or page.select(
                '//form//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
            # Strip '.' and swap ',' for '.' before the numeric part is
            # pulled out further down.
            price = price_texts[0].replace('.', '').replace(',', '.')
            image_url = page.select('//img[@id="image-main"]/@src')[0].extract()
            identifier = page.select(
                '//input[@name="product"]/@value')[0].extract()

        def keep_known_name(item):
            """Reuse the stored name for a known identifier, else store it."""
            known = self.products_ids
            key = item['identifier']
            if key in known:
                item['name'] = known[key]
            else:
                known[key] = item['name']
            return item

        # The base loader is always populated, even when only the variants
        # end up being yielded.
        base_loader = ProductLoader(item=Product(), selector=page)
        base_fields = (
            ('url', response.url),
            ('name', product_name),
            ('brand', brand),
            ('image_url', image_url),
            ('identifier', identifier),
            ('category', category),
            ('sku', identifier),
        )
        for field, value in base_fields:
            base_loader.add_value(field, value)
        price = re.search('([\d\.]+)', price).group(1)
        base_loader.add_value('price', price)
        base_price = float(base_loader.get_output_value('price'))
        base_loader.add_value('shipping_cost',
                              self.get_shipping_cost(base_price))

        if not options:
            yield keep_known_name(base_loader.load_item())
            return

        # Collect, per child product id, the labels of every option it
        # belongs to, then join them into one display suffix.
        labels_by_child = {}
        for attribute in options['attributes'].values():
            for choice in attribute['options']:
                for child_id in choice['products']:
                    labels_by_child.setdefault(child_id, []).append(
                        choice['label'])
        option_names = {
            child_id: ' '.join(labels)
            for child_id, labels in labels_by_child.items()
        }

        for child_id, child in options.get('childProducts').iteritems():
            variant_loader = ProductLoader(item=Product(), selector=page)
            variant_loader.add_value('url', response.url)
            variant_loader.add_value(
                'name', '%s %s' % (product_name, option_names[child_id]))
            variant_loader.add_value('image_url', child['imageUrl'])
            variant_loader.add_value('identifier', child_id)
            # Variants share the parent's product id as their SKU.
            variant_loader.add_value('sku', identifier)
            variant_loader.add_value('brand', brand)
            variant_loader.add_value('category', category)
            variant_loader.add_value('price', child['finalPrice'])
            variant_price = float(variant_loader.get_output_value('price'))
            variant_loader.add_value('shipping_cost',
                                     self.get_shipping_cost(variant_price))
            yield keep_known_name(variant_loader.load_item())
示例#2
0
    def parse_product(self, response):
        """Yield one ``Product`` per variant listed in the page's scripts.

        Common fields (url, name, brand, identifier, category, image) are
        loaded once from the page.  When both the ``variantsAray`` and the
        ``allVariants`` script values are present, the common item is copied
        for every variant and given that variant's SKU, name suffix, price,
        stock flag and (if known) image.  Otherwise the common item is
        yielded only if it has a ``sku``.
        """
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        name = ''.join(response.xpath('//h1//text()').extract()).strip()
        product_loader.add_value('name', name)
        product_loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        product_loader.add_xpath('identifier',
                                 '//input[@id="defaultSKUID"]/@value')
        # The first breadcrumb entry is dropped.
        category = response.xpath(
            '//div[@class="breadcrumb"]/ul/li/a/text()').extract()[1:]
        product_loader.add_value('category', category)

        image_url = response.xpath(
            '//meta[@property="og:image"]/@content').extract()
        if image_url:
            image_url = image_url[0].split('?$')[0]
            product_loader.add_value('image_url', image_url)

        variants_script = response.xpath(
            '//script[contains(text(), "var allVariants={")]/text()')

        # SECURITY NOTE(review): eval() runs text taken from the remote page.
        # It is kept because the captured literal ends in ",]" and is not
        # valid JSON, but a hostile or compromised page could execute code
        # here.  Consider ast.literal_eval after confirming the payload is
        # always a plain literal.
        options_values = variants_script.re(r'var variantsAray=(\[.*\]);')
        if options_values:
            options_values = eval(options_values[0])
        options = variants_script.re(r'allVariants={"variants":(\[.*\,])\}\;')
        if options:
            options = eval(options[0])

        # Map variant SKU -> 500x500 swatch image URL ('' when none found).
        option_images = {}
        media_json = re.findall("var mediaJSON='(.*)';if", response.body)
        if media_json and media_json[0]:
            images = json.loads(media_json[0])
            for image in images["imageList"]:
                sku = image.get('skuId', None)
                if not sku:
                    continue
                option_image = response.xpath('//div[@data-value="' +
                                              image['colour'] +
                                              '"]/img/@src').extract()
                if option_image:
                    sized_url = add_or_replace_parameter(
                        option_image[0], 'wid', '500')
                    sized_url = add_or_replace_parameter(
                        sized_url, 'hei', '500')
                    option_images[sku] = sized_url
                else:
                    option_images[sku] = ''

            initial_image = images['initialImage']['imageURL']
            product_loader.add_value('image_url', initial_image)

        product = product_loader.load_item()

        if options and options_values:
            for option in options:
                prod = Product(product)
                sku = option['skuId']
                if not sku:
                    log.msg(' >>>>> ERROR: NO IDENTIFIER' + response.url)
                    continue
                prod['identifier'] = sku
                prod['sku'] = sku
                # Compare by value: the original identity test (`is not`)
                # only worked when the interpreter happened to intern both
                # 'null' strings.
                prod['name'] = prod['name'].strip() + ' ' + ' '.join(
                    option[k] for k in options_values
                    if option[k] != 'null').decode('utf-8')
                prod['price'] = extract_price(option['RP'])
                if option['isInStock'] != 'true':
                    prod['stock'] = 0
                variant_image = option_images.get(sku, '')
                if variant_image:
                    prod['image_url'] = variant_image
                yield prod
        else:
            # NOTE(review): this method never sets 'sku' on the loader, so
            # unless ProductLoader fills it in, this branch always logs and
            # never yields.  The message suggests 'identifier' was meant --
            # confirm before changing.
            if not product.get('sku'):
                log.msg(' >>>>> ERROR: NO IDENTIFIER' + response.url)
            else:
                yield product
示例#3
0
    def parse_product(self, response):
        """Yield ``Product`` items for a product page, one per size option.

        Sizes are read from the ``basket_line_product_id`` drop-down when it
        has options, otherwise from ``no_stock`` / ``has_stock`` table rows,
        otherwise the page is treated as a single product.  If neither a
        title nor a product code is found, the response is handed to
        ``self.parse_product_list`` instead.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Product id: the last "/<digits>-" in the URL, or failing that the
        # digits that end the URL.
        dashed_ids = re.findall('\/(\d+)-', response.url)
        if dashed_ids:
            product_id = dashed_ids[-1]
        else:
            product_id = re.findall('\/(\d+)$', response.url)[-1]

        name = hxs.select('//div[@class="pm_inner"]/h1/text()').extract_first()
        sku_texts = hxs.select(
            '//span[contains(@class, "product_code")]/text()').extract()
        sku = sku_texts[0] if sku_texts else ''
        # The product code stands in for a missing title.
        name = name or sku
        if not name:
            for request in self.parse_product_list(response):
                yield request
            return

        ancestors = hxs.select(
            '//ul[contains(@class, "ancestors")]/li/a/text()').extract()
        category = ancestors[-1] if ancestors else ancestors
        image_srcs = hxs.select(
            '//div[@class="mlens-image"]//img/@src').extract()
        image_url = urljoin_rfc(base_url,
                                image_srcs[0]) if image_srcs else image_srcs
        brand_texts = hxs.select(
            '//a[@class="more" and contains(@href, "brands")]/h2/text()').re(
                'More (.*)')
        brand = brand_texts[0].strip() if brand_texts else ''

        price = extract_price(''.join(
            hxs.select('//span[@class="pm_price"]//text()').extract()))

        def with_brand(title):
            """Prefix ``title`` with the brand unless it is already implied.

            The brand is left out when it occurs in the title, when it is
            'OTHER' / 'UNASSIGNED', or when any alphabetic word of the title
            is a substring of the brand (all compared upper-cased).
            """
            brand_upper = brand.upper()
            word_in_brand = any(
                word.upper() in brand_upper
                for word in re.findall('([a-zA-Z]+)', title))
            if (brand_upper in title.upper()
                    or brand_upper in ('OTHER', 'UNASSIGNED')
                    or word_in_brand):
                return title
            return brand + ' ' + title

        select_options = hxs.select(
            '//select[@id="basket_line_product_id"]/option[@value!=""]')
        row_options = hxs.select(
            '//tr[@class="no_stock" or @class="has_stock"]')

        if select_options:
            # Drop-down variants all share the page-level price.
            for option in select_options:
                size = option.select('text()').extract()[0]
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('category', category)
                loader.add_value('name', with_brand(name + ' - ' + size))
                loader.add_value('url', response.url)
                option_id = option.select('@value').extract()[0]
                loader.add_value('identifier', product_id + '-' + option_id)
                loader.add_value('brand', brand)
                loader.add_value('sku', sku)
                if price < 150:
                    loader.add_value('shipping_cost', 6)
                loader.add_value('price', price)
                loader.add_value('image_url', image_url)
                yield loader.load_item()
        elif row_options:
            # Table rows carry their own price in the second cell.
            for row in row_options:
                size = row.select('./td[1]/text()').extract()[0]
                loader = ProductLoader(item=Product(), selector=row)
                loader.add_value('category', category)
                loader.add_value('name', with_brand(name + ' - ' + size))
                loader.add_value('url', response.url)
                option_id = row.select(
                    './td[3]/input[1]/@value').extract()[0]
                loader.add_value('identifier', product_id + '-' + option_id)
                loader.add_value('brand', brand)
                loader.add_value('sku', sku)

                price_text = ''.join(
                    row.select(
                        './td[2]/div[not(@class="oldprice")]/div[@class="nowprice"]/text()'
                    ).extract())
                if not price_text:
                    price_text = ''.join(
                        row.select('./td[2]//text()').extract())
                price = extract_price(price_text)
                if price < 150:
                    loader.add_value('shipping_cost', 6)
                loader.add_value('price', price)
                loader.add_value('image_url', image_url)
                yield loader.load_item()
        else:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('category', category)
            loader.add_value('name', with_brand(name))
            loader.add_value('url', response.url)
            loader.add_value('identifier', product_id)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            if price < 150:
                loader.add_value('shipping_cost', 6)
            loader.add_value('price', price)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
示例#4
0
    def parse_products(self, response):
        """Parse a JSON tyre listing and yield products plus the next page.

        The response body is JSON whose ``display_tyres`` entry holds an HTML
        fragment.  Every ``tyre_container`` element in it that is not marked
        with the winter icon becomes a ``Product`` carrying a
        ``MicheldeverMeta`` record.  Whenever the page contained any tyre
        elements, a request for the following page number is yielded as well.
        """
        fragment = json.loads(response.body)['display_tyres']
        hxs = HtmlXPathSelector(text=fragment)

        search_params = response.meta['search_params']

        products = hxs.select('//div[contains(@class, "tyre_container") and @id]')

        for tyre in products:
            loader = ProductLoader(item=Product(), selector=tyre)

            brand_texts = tyre.select('.//span[@class="tyre_brand_text"]/text()').extract()
            brand = brand_texts[0] if brand_texts else ''

            # Elements carrying the winter icon are skipped.
            if tyre.select('.//i[@class="icon-select_tyres-winter"]').extract():
                continue

            # Use the spelling from self.brands on a case-insensitive match.
            for known_brand in self.brands:
                if known_brand.upper() == brand.strip().upper():
                    brand = known_brand
            full_name = tyre.select('.//p[@class="tyre_details"]/span/text()').extract()[0]

            loader.add_value('name', full_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('identifier', tyre.select('@id').extract())

            loader.add_value('url', 'http://www.tyresavings.com')

            image_srcs = tyre.select('.//img[contains(@class, "tyre_image")]/@src').extract()
            if image_srcs:
                loader.add_value('image_url', urljoin(get_base_url(response), image_srcs[0]))

            price = ''.join(tyre.select('.//*[@class="tyre_price"]//text()').re(r'[\d,.]+'))
            # An element without a price is recorded with zero stock.
            if not price:
                loader.add_value('stock', 0)
            loader.add_value('price', price)

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = search_params['aspect_ratio']
            metadata['rim'] = search_params['rim']

            # The first " <digits><word chars>" token of the details text is
            # split into load rating (all but the last character) and speed
            # rating (the last character).
            tyre_details = tyre.select('.//*[@class="tyre_details"]/text()').extract()[0].strip()
            load_rating = speed_rating = ''
            rating_match = re.search('(\s\d+\w+)', tyre_details)
            if rating_match:
                rating = rating_match.group().strip()
                load_rating, speed_rating = rating[:-1], rating[-1]

            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating

            metadata['width'] = search_params['width']

            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            has_xl_icon = tyre.select('.//i[@class="icon-select_tyres-xl"]').extract()
            metadata['xl'] = 'Yes' if has_xl_icon else 'No'
            run_flat_in_name = is_run_flat(full_name)
            has_run_flat_icon = tyre.select('.//i[@class="icon-select_tyres-runflat"]').extract()
            if has_run_flat_icon or run_flat_in_name:
                metadata['run_flat'] = 'Yes'
            else:
                metadata['run_flat'] = 'No'

            metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)

            size_parts = (search_params['width'],
                          search_params['aspect_ratio'],
                          search_params['rim'],
                          metadata['load_rating'],
                          metadata['speed_rating'])
            metadata['full_tyre_size'] = '/'.join(size_parts)

            # Exactly three non-empty label texts are expected here; any
            # other count makes the unpacking raise.
            label_texts = tyre.select(
                './/div[@class="label_ratings"]//span[contains(@class, "label_rating_")]/text()|'
                './/div[@class="label_ratings"]//p[span[contains(@class, "decibels")]]/text()'
            ).extract()
            fuel, grip, noise = [text for text in map(unicode.strip, label_texts) if text]

            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

            yield product

        # Keep paging for as long as the current page had tyre elements.
        if products:
            meta = response.meta
            next_page = meta['page'] + 1
            meta['page'] = next_page
            next_url = 'http://www.tyresavings.com/update-tyres/%s' % str(next_page)
            yield Request(next_url, dont_filter=True, callback=self.parse_products, meta=meta)
示例#5
0
    def parse_product(self, response):
        """Yield ``Product`` items for a listing page, one per variation.

        The title, a SKU guess, the price, the image and the shipping cost
        are read from the page.  If an embedded ``menuItemMap`` /
        ``itemVariationsMap`` JSON fragment can be extracted, one item is
        yielded per variation that has trait values, with the identifier
        ``<page id>_<variation key>`` and the variation's own price;
        otherwise a single item is yielded for the page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = ' '.join(hxs.select('//*[@id="itemTitle"]/text()').extract()).strip()
        # Identifier: the last path segment of the URL, query string removed.
        identifier = response.url.split('?')[0].split('/')[-1]
        # SKU guess: the longest run of digits/commas/dots in the title
        # (the first such run wins a tie).
        sku = ''
        for match in re.finditer(r"([\d,\.]+)", name):
            if len(match.group()) > len(sku):
                sku = match.group()
        brand = 'Lego'

        # Price: try two page elements, then the "binPrice" script value.
        price = None
        for price_xpath in ('//*[@id="prcIsum"]/text()',
                            '//*[@id="mm-saleDscPrc"]/text()'):
            price_texts = hxs.select(price_xpath).extract()
            if price_texts:
                price = price_texts[0].strip()
                break
        else:
            bin_price = re.search(r'"binPrice":".*[\$\xA3]([\d\.,]+)",', response.body)
            if bin_price:
                price = bin_price.groups()[0]
            else:
                self.log("Price not found for " + response.url)

        image_url = hxs.select('//img[@id="icImg"]/@src').extract()
        category = 'Lego'

        # Shipping cost: 0 when the text mentions "free", else the parsed
        # amount; stays None when the element is missing.
        shipping_cost = None
        try:
            shipping_cost = hxs.select('//*[@id="shippingSection"]//td/div/text()').extract()[0]
            if shipping_cost:
                if 'free' in shipping_cost.lower():
                    shipping_cost = 0
                else:
                    shipping_cost = extract_price(shipping_cost)
        except IndexError:
            pass

        options_variations = []

        try:
            json_var_map = unicode(hxs.select('//*/text()')
                                   .re(r'("menuItemMap":{.*}.*),'
                                       '"unavailableVariationIds"')[0])
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit.  Pages without variations land here.
            self.log('No item variations map...')
        else:
            json_var_map = re.sub(r',"watchCountMessage":".*?}', '}', json_var_map)
            variations = json.loads('{' + re.sub(r',"unavailableVariationIds".*', '', json_var_map) + '}')

            menu_map = variations['menuItemMap']

            for key, variation in variations['itemVariationsMap'].items():
                if variation['traitValuesMap']:
                    # Resolve each trait's menu id to its display name.
                    trait_names = {}
                    for option, value in variation['traitValuesMap'].items():
                        trait_names[option] = menu_map[str(value)]['displayName']
                    options_variations.append({'price': variation['price'],
                                               'values': trait_names,
                                               'identifier': key})

        def fill_and_load(loader, item_identifier, item_name, item_price):
            """Populate ``loader`` with the shared fields and build the item."""
            loader.add_value('identifier', item_identifier)
            loader.add_value('name', item_name)
            loader.add_value('sku', sku)
            if image_url:
                loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            loader.add_value('price', item_price)
            loader.add_value('category', category)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            if shipping_cost is not None:
                loader.add_value('shipping_cost', shipping_cost)
            return loader.load_item()

        if options_variations:
            for variation in options_variations:
                # NOTE(review): the variation dict is passed as ``selector``,
                # as in the original; confirm ProductLoader tolerates that.
                product_loader = ProductLoader(item=Product(), selector=variation)
                variation_name = name + ' ' + \
                    ' '.join(trait_name.strip().lower()
                             for trait_name in variation['values'].values())
                variation_price = extract_price(variation['price'])
                yield fill_and_load(product_loader,
                                    identifier + '_' + variation['identifier'],
                                    variation_name,
                                    variation_price)
        else:
            product_loader = ProductLoader(item=Product(), selector=hxs)
            # NOTE(review): ``price`` is still None when no price was found
            # above; confirm extract_price accepts None.
            yield fill_and_load(product_loader, identifier, name,
                                extract_price(price))
示例#6
0
    def parse(self, response):
        """Build one tyre ``Product`` from a detail page, retrying bad loads.

        The page heading is split around the ``<width>/<aspect>R<rim>`` size
        string into brand, load rating and product name.  If the name
        contains 'www.tyretraders.com' or the heading lacks the size string,
        the same URL is re-requested (until ``meta['retry']`` reaches 10).
        Otherwise a product with ``MicheldeverMeta`` metadata is yielded,
        unless ``is_product_correct`` rejects it.
        """
        hxs = HtmlXPathSelector(response)
        search_params = response.meta['search_params']
        formdata = response.meta['formdata']
        loader = ProductLoader(item=Product(), selector=hxs)

        heading = hxs.select(
            '//div[@class="rightpanel"]//h1/text()').extract()[0]
        # Collapse every run of whitespace into a single space.
        title = ' '.join(heading.split())
        tyre_params = "{}/{}R{}".format(search_params['width'],
                                        search_params['aspect_ratio'],
                                        search_params['rim'])
        before_size, _, after_size = title.partition(tyre_params)
        brand = before_size.strip()
        load_rating = after_size.strip().split(formdata['speed'])[0].strip()

        name = title.partition('Fuel Effic')[0].replace('~', '').strip()
        # Remove the "<brand> <size> <load><speed> " lead-in from the name.
        lead_in = '{} {} {}{} '.format(brand, tyre_params, load_rating,
                                       formdata['speed'])
        name = name.replace(lead_in, '')

        brand = brand.title()
        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))

        page_looks_wrong = ('www.tyretraders.com' in name
                            or tyre_params not in title)
        if page_looks_wrong:
            meta = response.meta
            meta['retry'] += 1
            if meta['retry'] >= 10:
                self.log('Giving up retrying to reload the product: {}'.format(
                    response.url))
            else:
                yield Request(response.url,
                              callback=self.parse,
                              meta=meta,
                              dont_filter=True)
            return

        loader.add_value('price', response.meta.get('price'))
        # Identifier: the text after the last '|' up to the first '.', then
        # URL-unquoted.
        raw_identifier = response.url.split("|")[-1].split(".")[0]
        loader.add_value('identifier', url_unquote(raw_identifier))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', response.url)
        image_srcs = hxs.select(
            '//div[@class="rightpanel"]//img[@style=" max-width:450px;"]/@src'
        ).extract()
        if image_srcs:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_srcs[0]))

        metadata = MicheldeverMeta()
        for key in ('aspect_ratio', 'rim', 'speed_rating', 'width'):
            metadata[key] = search_params[key]
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating

        # Each marker word is removed from the name as it is detected.
        had_xl, name = remove_whole_word('XL', name)
        had_rf, name = remove_whole_word('RF', name)
        metadata['xl'] = 'Yes' if had_xl or had_rf else 'No'
        had_runflat, name = remove_whole_word('runflat', name)
        metadata['run_flat'] = 'Yes' if had_runflat else 'No'

        # The first manufacturer code found in the name decides the mark.
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            found, name = remove_whole_word(code, name)
            if found:
                man_code = man_mark
                break
        metadata['manufacturer_mark'] = man_code

        loader.add_value('name', name)

        size_parts = (metadata['width'], metadata['aspect_ratio'],
                      metadata['rim'], load_rating, metadata['speed_rating'])
        metadata['full_tyre_size'] = '/'.join(size_parts)

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        # The alternative rating is the helper's value if it gives one;
        # otherwise the scraped rating when it differs from the new one.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
示例#7
0
    def parse_product(self, response):
        """Build a product item and request its first page of reviews.

        The price is taken from the ``tb-djs-wml-base`` script JSON, falling
        back to two on-page price blocks.  Pages with no price are only
        logged.  Pages with no model number are re-requested up to five
        times.  Otherwise the loaded item is passed in ``meta['product']``
        to ``self.parse_product_reviews``; nothing is yielded as an item
        from here.
        """
        try:
            pjs_data = json.loads(
                response.xpath('//script[@id="tb-djs-wml-base"]/text()').
                extract()[0].strip())
            price = str(pjs_data['adContextJSON']['price'])
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit.  Any parsing problem means "try the markup".
            price = None

        if not price:
            price = ''.join(
                response.xpath(
                    '//div[contains(@class, "js-product-offer-summary")]'
                    '//div[contains(@class, "price-display")]//text()').
                extract())
        if not price:
            price = ''.join(
                response.xpath(
                    '//div[contains(@class, "PricingInfo clearfix")]'
                    '//span[contains(@class, "clearfix camelPrice")]//text()').
                extract())

        # Some products are not available online and these have no price
        if not price:
            self.log('No price found {}'.format(response.url))
            return

        in_stock = 'out of stock' not in price.lower()

        sku = response.xpath(
            '//td[contains(text(), "Model No")]/following-sibling::td/text()'
        ).extract()
        if not sku:
            sku = response.xpath(
                '//td[contains(text(), "Model:")]/following-sibling::td/text()'
            ).extract()
        if not sku:
            # Re-request the same page, counting attempts in meta['retry'].
            retry = int(response.meta.get('retry', 0))
            if retry < 5:
                meta = response.meta.copy()
                meta['retry'] = retry + 1
                yield Request(response.url,
                              callback=self.parse_product,
                              meta=meta,
                              dont_filter=True)
            else:
                self.log('NO SKU => %s' % response.url)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop = "name"]//text()')
        # NOTE(review): this pattern parses as "/<digits>" followed by an
        # optional literal "[", OR "$]" -- so it effectively takes the first
        # "/<digits>" in the URL.  It looks like r'/(\d+)(?:\?|$)' was
        # intended, but it is left untouched so existing identifiers do not
        # change.
        loader.add_value('identifier',
                         re.search(r'/(\d+)\[?|$]', response.url).group(1))
        loader.add_value('sku', sku[0].strip())
        # The brand comes from the request meta when given, else 'LEGO'.
        loader.add_value('brand', response.meta.get('brand') or 'LEGO')
        loader.add_xpath(
            'category',
            '//ol[contains(@class, "breadcrumb-list")]//li[last()]//a/span/text()'
        )
        loader.add_value('url', response.url)
        loader.add_xpath(
            'image_url',
            '//img[contains(@id, "mainImage") or contains(@class, "product-primary-image")]/@src'
        )

        loader.add_value('price', price)
        if not in_stock:
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = {}

        if self.enable_map:
            self._save_html_response(response, item['identifier'])

        yield Request(self._get_reviews_url(item, 1),
                      meta={
                          'product': item,
                          'page': 1
                      },
                      callback=self.parse_product_reviews)
示例#8
0
    def parse_product(self, response):
        """Load a product page and yield the item or an options request.

        Pages whose URL is already in ``self.products_parsed`` are ignored.
        When the page has a ``product_options`` block, the loaded item is
        forwarded in the meta of a request to the ``get_product_options``
        endpoint (handled by ``self.parse_options``); otherwise the item is
        yielded directly.
        """
        hxs = HtmlXPathSelector(response)

        if response.url in self.products_parsed:
            return
        loader = ProductLoader(response=response, item=Product())

        # The price text has its first character removed before use.
        price_texts = hxs.select(
            '//div[@id="product_price"]//span[@class="price"]//span[@class="GBP"]/text()'
        ).extract()
        price = price_texts[0][1:] if price_texts else None

        name = hxs.select(
            '//div[@id="product_page_right_title"]//span[@id="product_title"]//text()'
        ).extract()[0]

        # Breadcrumb entry 1 overrides the category passed in the meta;
        # entry 2, when present, is used as the brand.
        category = response.meta.get('category')
        crumbs = hxs.select(
            '//div[@id="breadcrumb_container"]//span//a/@title').extract()
        if len(crumbs) > 1:
            category = crumbs[1]
        brand = crumbs[2] if len(crumbs) > 2 else None

        img_url = hxs.select(
            '//img[@id="product_medium_image"]/@src').extract()[0]

        if name:
            loader.add_value('name', name)
        if price:
            loader.add_value('price', price)
        loader.add_value('url', response.url)
        identifier = hxs.select(
            u'//input[@type="hidden" and @name="parent_product_id"]/@value'
        ).extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', img_url)
        if category:
            loader.add_value('category', category)
        if brand:
            loader.add_value('brand', brand)
        loader.add_value('shipping_cost', 'N/A')

        item = loader.load_item()
        if not hxs.select('//div[@id="product_options"]'):
            yield item
            return

        options_url = (
            'http://www.thecosmeticpractice.co.uk/ajax/get_product_options/%s?cmd=addtobasket&parent_product_id=%s&product_id=0&image_product_id=0&image_id=0&image_index=0&'
            % (identifier, identifier))
        yield Request(options_url,
                      callback=self.parse_options,
                      meta={
                          'item': item,
                          'identifier': identifier
                      })
示例#9
0
    def parse_product(self, product, fitted, search_params):
        """Build a tyre product item from one search-result node.

        :param product: selector for a single result entry.
        :param fitted: truthy to read the "fitted" price, falsy for the
            "delivered" price; also selects the identifier suffix
            (``-F`` / ``-D``) and the ``fitting_method`` metadata value.
        :param search_params: mapping providing ``width``, ``aspect_ratio``,
            ``rim`` and ``speed_rating`` of the search that produced the node.
        :returns: the loaded item with its metadata, or ``None`` when the
            price is missing or ``is_product_correct`` rejects the item.
        """
        url = product.select('.//div[@class="mod-item-body"]/h3//a/@href').extract()[0]
        p_id = url.split('/')[-1] + ('-F' if fitted else '-D')
        image_url = product.select('.//div[@class="mod-item-img"]//img/@src').extract()[0]
        brand = product.select('.//div[@class="mod-item-body"]/h3/text()').extract()[0].strip()

        price_xpath = ('.//div[@class="mod-fitted"]/a/text()' if fitted
                       else './/div[@class="mod-delivered"]/a/text()')
        prices = product.select(price_xpath).extract()
        if not prices:
            message = "Price not found: %s" % str(product)
            self.log(message)
            self.errors.append(message)
            return
        price = prices[0]

        name = product.select('.//div[@class="mod-item-body"]/h3/span/a/text()').extract()[0]
        name_upper = name.upper()
        speed_rating = search_params['speed_rating'].upper()

        # Strip the size token (e.g. "205/55R16") and the load/speed token
        # from the title; what is left is used as the product name. Fall
        # back to the full title when nothing remains.
        pattern = re.sub(r'\d+[^\s]+R\d+', '', name)
        pattern = re.sub(r'[\d/]+%s' % speed_rating, '', pattern)
        pattern = pattern.strip()
        if not pattern:
            pattern = name.strip()

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', url)
        loader.add_value('identifier', p_id)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('price', price)

        pattern = pattern.upper()
        pattern = pattern.replace('XL', '').replace('RFLAT', '').replace('RUNFLAT', '')

        loader.add_value('name', pattern)

        m = MicheldeverMeta()
        m['aspect_ratio'] = search_params['aspect_ratio']
        m['rim'] = search_params['rim']
        m['width'] = search_params['width']
        m['speed_rating'] = speed_rating

        # The load rating is the digits (and slashes) directly preceding the
        # speed rating in the title.
        res = re.search(r'([\d/]+)%s' % speed_rating, name)
        if res:
            m['load_rating'] = res.groups()[0]
        else:
            self.log('ERROR: not load rating: %s' % url)
            m['load_rating'] = ''

        is_run_flat = 'RFLAT' in name_upper or 'RUNFLAT' in name_upper
        m['run_flat'] = 'Yes' if is_run_flat else 'No'
        m['xl'] = 'Yes' if 'XL' in name_upper else 'No'

        m['full_tyre_size'] = '/'.join((m['width'],
                                        m['aspect_ratio'],
                                        m['rim'],
                                        m['load_rating'],
                                        m['speed_rating']))

        m['fitting_method'] = 'Fitted' if fitted else 'Delivered'
        m['manufacturer_mark'] = self._get_manufacturer_code(name)

        item = loader.load_item()
        item['metadata'] = m

        if not is_product_correct(item):
            return

        item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        # Alternative rating: the helper's value when it returns one;
        # otherwise the searched rating if it differs from the new one;
        # otherwise empty.
        if new_alt_speed:
            alternative = new_alt_speed
        elif item['metadata']['speed_rating'] != new_speed_rating:
            alternative = item['metadata']['speed_rating']
        else:
            alternative = ''
        item['metadata']['alternative_speed_rating'] = alternative
        item['metadata']['speed_rating'] = new_speed_rating

        return item
示例#10
0
    def parse_product(self, response):
        """Parse a kit product page and yield one item per size and printing.

        Flow:
          * an ``aspxerrorpath`` URL re-queues the first URL of the redirect
            chain and stops (the error page itself is not parsed);
          * a page offering a ``?cur=AUD`` link is re-requested through that
            link;
          * a page without embedded base product JSON is handed to
            ``self.parse``;
          * otherwise one item is yielded per size variation, plus an item
            with custom personalisation (printing type 1, base product only)
            and an item with the badge (printing type 3) for each of those.
        """
        if 'aspxerrorpath' in response.url:
            yield Request(response.request.meta['redirect_urls'][0],
                          self.parse_product,
                          dont_filter=True)
            # Nothing useful can be scraped from the error page itself.
            return
        aud_url = response.xpath(
            '//a[contains(@href, "?cur=AUD")]/@href').extract_first()
        if aud_url:
            yield Request(response.urljoin(aud_url),
                          self.parse_product,
                          dont_filter=True)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_value('category', 'Kits')
        heros_data = response.xpath('//script/text()').re(
            r'product\d{7} =(.+?});var')
        base_product_raw = response.xpath('//script/text()').re(
            r'product\w{6} =(.+?});var')
        if not base_product_raw:
            for p in self.parse(response):
                yield p
            return
        base_product_data = json.loads(base_product_raw[0])

        # Pick the JSON describing the product actually shown: the base
        # product, the single "hero" variant, or the hero selected in the
        # heroShirts drop-down.
        base_product = True
        if not heros_data:
            data = base_product_data
        elif len(heros_data) == 1:
            data = json.loads(heros_data[0])
            base_product = False
        else:
            heros_by_id = {}
            for raw in heros_data:
                hero_data = json.loads(raw)
                heros_by_id[hero_data['ProductID']] = hero_data
            hero = response.css('select.heroShirts').xpath('option[@selected]')
            if not hero:
                data = base_product_data
            else:
                data = heros_by_id[int(hero.xpath('@value').extract_first())]
                base_product = False

        # Printing options are always read from the base product data.
        printings = {
            p['PrintingTypeID']: p
            for p in base_product_data['printingitems']
        }
        custom_printings = printings.get(1)
        badge_printing = printings.get(3)
        add_custom_personalization = bool(custom_printings and base_product)

        loader.add_value('name', data['Description'])
        loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
        if data['Brand']:
            loader.add_value('brand', data['Brand']['Name'])
        loader.add_value('image_url', response.urljoin(data['ImageURL']))
        loader.add_value('shipping_cost', self.shipping_cost)
        product = loader.load_item()
        player_from_name = re.search(r'(?!Sponsor).*with *([\w\ \.\-]+?) (\d+)',
                                     data.get('Description', ''), re.UNICODE)
        if player_from_name:
            player_name, number = player_from_name.groups()

        # One item per size variation.
        for variation in data['Variations']:
            size = variation['Description']
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, product)
            loader.replace_value('identifier', variation['VariationId'])
            loader.add_value('name', size)
            loader.replace_value('price', variation['PriceActual'])
            loader.replace_value('stock', int(variation['IsInStock']))
            item = loader.load_item()
            if player_from_name:
                item['metadata'] = {
                    'player': player_name,
                    'number': number,
                    'size': size
                }
            else:
                item['metadata'] = {'size': size}
            yield item
            base_size_items = [item]

            # Custom printing: a fixed sample name/number is used for the
            # personalised variant.
            if add_custom_personalization:
                team_player_name = 'WILLIAMS'
                team_player_number = '10'
                team_player_id = 'WILLIAMS'
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                loader.add_value('name', team_player_name)
                loader.add_value('name', team_player_number)
                price = Decimal(item['price']) + Decimal(
                    str(custom_printings['PriceActual']))
                loader.replace_value('price', price)
                identifier = '-'.join(
                    (item['identifier'], str(custom_printings['PrintingID']),
                     team_player_id))
                loader.replace_value('identifier', identifier)
                custom_item = loader.load_item()
                custom_item['metadata'] = {
                    'player': team_player_name,
                    'number': team_player_number,
                    'size': size
                }
                yield custom_item
                base_size_items.append(custom_item)

            # Badges: one extra item for every item yielded for this size.
            if not badge_printing:
                continue
            for base_item in base_size_items:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, base_item)
                loader.add_value('name', badge_printing['PrintingDescription'])
                price = Decimal(base_item['price']) + Decimal(
                    str(badge_printing['PriceActual']))
                loader.replace_value('price', price)
                identifier = base_item['identifier'] + '-' + str(
                    badge_printing['PrintingID'])
                loader.replace_value('identifier', identifier)
                badge_item = loader.load_item()
                badge_item['metadata'] = base_item['metadata'].copy()
                yield badge_item
    def parse(self, response):
        """Crawl a listing page: categories, listed products and pagination.

        Yields requests (built by ``self._proxyRequest``) for the sub-category
        links of the two menu groups of interest and for the "Suivant" page,
        and a product item for every listed product that has both a name and
        a price. Non-HTML responses are logged and ignored.
        """
        if not isinstance(response, HtmlResponse):
            self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # logic to find categories
        # find subcats for Outilage Jardin
        categories = hxs.select(
            '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
        ).extract()
        # find subcats for Aspirateurs
        categories += hxs.select(
            '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
        ).extract()

        for url in categories:
            yield self._proxyRequest(urljoin_rfc(base_url, url))

        # products new logic
        products = hxs.select(
            u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
        for product in products:
            product_loader = ProductLoader(item=Product(),
                                           selector=product)
            product_loader.add_xpath(
                'url', './/a[contains(@class,"plPrName")]/@href')
            product_loader.add_xpath(
                'name', './/a[contains(@class,"plPrName")]/text()')
            # The category is the page title, hence the absolute path.
            product_loader.add_xpath(
                'category', '//div[@class="productListTitle"]/h1/text()')
            product_loader.add_xpath(
                'image_url',
                './/div[contains(@class, "plProductImg")]//img/@data-src')
            product_loader.add_xpath('sku', './@data-sku')
            product_loader.add_xpath(
                'identifier',
                './/input[contains(@name, "ProductPostedForm.ProductId")]/@value'
            )
            price = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
            ).extract()
            if price:
                # The decimals live in a <sup> inside the price node. The
                # path must be relative (".//"): an absolute "//" would
                # return the decimals of the first product on the page for
                # every product.
                decimals = product.select(
                    u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
                ).re(u'(\d+)')
                if decimals:
                    price = price[0] + '.' + decimals[0]
            product_loader.add_value('price', price)
            if product_loader.get_output_value(
                    'name') and product_loader.get_output_value('price'):
                yield product_loader.load_item()

        # pagination
        next_page = hxs.select(
            u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
        ).extract()
        if next_page:
            yield self._proxyRequest(urljoin_rfc(base_url, next_page[0]))
示例#12
0
    def parse_list(self, response):
        """Parse a category listing page.

        Yields:
          * a request for the "limit=all" variant of the page when the
            limiter offers one and it is not the selected option;
          * a request for every sub-category link;
          * when there are no sub-categories, one item per listed product.
            Products whose identifier cannot be read from the listing are
            sent to ``self.parse_options`` with the partial item in ``meta``.
        """
        # To list all products if they are not all already listed
        limiter_selected = response.xpath(
            '//div[@class="limiter"]/select/option[@selected]/@value').extract(
            )
        limiter_all = response.xpath(
            '//div[@class="limiter"]/select/option[contains(@value, "limit=all")]/@value'
        ).extract()
        if limiter_all and limiter_selected:
            if limiter_selected[0] != limiter_all[0]:
                yield Request(response.urljoin(limiter_all[0]),
                              callback=self.parse_list,
                              meta=response.meta)

        sub_category_urls = response.xpath(
            '//div[@class="category-item-center"]'
            '//span[@class="product-name"]/a/@href').extract()
        for url in sub_category_urls:
            yield Request(response.urljoin(url),
                          callback=self.parse_list,
                          meta=response.meta)

        if sub_category_urls:
            return

        # Page-level values: identical for every product on the page.
        product_brand = response.meta.get('brand', '')
        breadcrumb_texts = response.xpath(
            '//div[contains(@class, "breadcrumbs")]//li[contains(@class, '
            '"category")]/a/text()').extract()
        # The first breadcrumb category is dropped.
        product_category = [text.strip() for text in breadcrumb_texts][1:]

        products = response.xpath(
            '//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]'
        )
        for product_xs in products:
            product_name = ''.join(
                product_xs.xpath(
                    './/*[contains(@class, "product-name")]//text()').
                extract()).strip()
            product_url = response.urljoin(
                product_xs.xpath(
                    './/*[contains(@class, "product-name")]//a/@href').
                extract()[0])
            # The last number in the price box is taken as the price.
            product_price = extract_price_eu(
                product_xs.xpath('.//*[@class="price-box"]//text()').re(
                    r'[\d\.,]+')[-1])
            product_image_url = [
                response.urljoin(src) for src in product_xs.xpath(
                    './/*[contains(@class, "product-image")]//img/@src').
                extract()
            ]
            product_out_of_stock = bool(
                product_xs.xpath(
                    './/*[contains(@class, "availability") and contains(@class, "out-of-stock")]'
                ))
            product_shipping_cost = '0.00' if product_price >= self.free_shipping_over else '5.00'

            try:
                product_identifier = product_xs.xpath(
                    './/*[contains(@id, "product-price-")]/@id').re(
                        r'(\d+)')[0]
            except IndexError:
                # No "product-price-<id>" element in the listing entry.
                product_identifier = None

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', product_name)
            loader.add_value('url', product_url)
            loader.add_value('price', product_price)
            loader.add_value('shipping_cost', product_shipping_cost)
            loader.add_value('image_url', product_image_url)
            loader.add_value('brand', product_brand)
            loader.add_value('category', product_brand or product_category)
            if product_out_of_stock:
                loader.add_value('stock', 0)
            if product_identifier is not None:
                loader.add_value('identifier', product_identifier)
                loader.add_value('sku', product_identifier)
                yield loader.load_item()
            else:
                item = loader.load_item()
                yield Request(item['url'],
                              meta={'item': item},
                              callback=self.parse_options)
示例#13
0
    def parse_item(self, response):
        """Parse a product page and request stock data for all its options.

        Builds one product per option (or a single product when the page has
        no options), collects them, and yields a single request to
        ``self.ajax_stock_url`` carrying the collected products in ``meta``
        for ``self.parse_stock``.

        When the product name cannot be found the same URL is re-requested,
        with the attempt count kept in ``meta['retry']``; after that the URL
        is given up on.

        ``response.meta['client_product']`` must provide ``'Item Number'``
        and ``'Wayfair'`` (the latter only when the page has options);
        ``'Brand'`` is optional.
        """
        meta = response.meta

        categories = response.css(
            '.ProductDetailBreadcrumbs-item::text').extract()
        sku = meta['client_product']['Item Number']

        image_url = response.xpath(
            '//div[contains(@class, "main-carousel")]//a/@data-original-src'
        ).extract()
        if not image_url:
            image_url = response.xpath(
                '//img[contains(@class, "ProductDetailImagesBlock-carousel-image")]/@src'
            ).extract()

        prod_id = response.xpath('//input[@name="sku"]/@value').extract()
        prod_id = prod_id[0] if prod_id else ''

        try:
            name = response.xpath(
                '//h1/span[contains(@class, "ProductDetailInfoBlock-header-title")]/text()'
            ).extract()[0]
        except Exception:
            retry = meta.get('retry', 0)
            if retry <= 10:
                retry += 1
                meta['retry'] = retry
                self.log('ERROR >>> No name found, retry URL: ' + response.url)
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_item,
                              meta=meta)
                return
            else:
                self.log('ERROR >>> Gave up retrying URL: ' + response.url)
                return

        name += response.xpath('//h1/text()').extract()[-1].strip()
        brand = meta['client_product'].get('Brand', '')

        products_collected = []
        sku_list = []

        # option_elements holds one list of option dicts per selector
        # (drop-down or visual picker) found on the page.
        options = []
        dropdown_options = response.xpath(
            '//select[contains(@class, "stdselect")]/option[@value!="XXXXXXXXXX"]'
        )
        option_elements = []
        if dropdown_options:
            for dropdown_option in dropdown_options:
                option = {}
                option['identifier'] = dropdown_option.xpath(
                    '@value').extract()[0]
                option['sku'] = ''
                option['desc'] = dropdown_option.xpath(
                    './/text()').extract()[0]
                # Surcharge: the @cost attribute, else a "+$x" in the label.
                cost = dropdown_option.xpath('@cost').extract() or re.findall(
                    r'\+\$([\d.]+)', option['desc'])
                option['cost'] = cost[0] if cost else '0'
                options.append(option)
            option_elements.append(options)
        else:
            dropdown_elements = response.xpath(
                '//div[@class="pdinfoblock"]/div[@class="fl"]//select')
            for dropdown_element in dropdown_elements:
                options = []
                for dropdown_option in dropdown_element.xpath(
                        'option[@value!="XXXXXXXXXX"]'):
                    option = {}
                    option['identifier'] = dropdown_option.xpath(
                        '@value').extract()[0]
                    option['sku'] = ''
                    option['desc'] = dropdown_option.xpath(
                        './/text()').extract()[0].split('-')[0]
                    option['cost'] = dropdown_option.xpath(
                        '@cost').extract()[0]
                    options.append(option)
                option_elements.append(options)

        image_options = response.css('.option_select_wrap .visual_option_wrap')
        if image_options:
            options = []
            for image_option in image_options:
                option = {}
                option['identifier'] = image_option.xpath(
                    '@data-pi-id').extract()[0]
                option['sku'] = ''
                option['desc'] = image_option.xpath('@data-name').extract()[0]
                option['cost'] = image_option.xpath('@data-cost').extract()[0]
                options.append(option)
            option_elements.append(options)

        if option_elements:
            if len(option_elements) > 1:
                # Several selectors: every combination is a product. The
                # joined desc/identifier strings keep their leading ' - '.
                combined_options = list(itertools.product(*option_elements))
                options = []
                for combined_option in combined_options:
                    final_option = {}
                    for option in combined_option:
                        final_option['desc'] = final_option.get(
                            'desc', '') + ' - ' + option['desc']
                        final_option['cost'] = final_option.get(
                            'cost', 0) + float(option['cost'])
                        final_option['identifier'] = final_option.get(
                            'identifier', '') + ' - ' + option['identifier']
                    options.append(final_option)
            else:
                options = option_elements[0]

            products_matched = self.hhe_df[self.hhe_df['Wayfair'] ==
                                           meta['client_product']['Wayfair']]

            for option in options:

                price = response.xpath(
                    '//*[@class="dynamic_sku_price"]/span/text()').extract()[0]
                option_price_value = self.option_price(price,
                                                       str(option['cost']))

                # SKU not unique: match the correct client product sku by
                # picking the client row whose cost is closest to this
                # option's price.
                if not products_matched.empty and products_matched.count(
                )['Wayfair'] > 1:
                    # None marks "no candidate yet"; using 0 as the marker
                    # would let a later row overwrite an exact price match.
                    best_diff = None
                    current_sku = sku
                    for i, row in products_matched.iterrows():
                        wf_price = Decimal(row['Wayfair Cost'].replace(
                            '$', '').strip())
                        price_diff = abs(option_price_value - wf_price)
                        if best_diff is None or price_diff < best_diff:
                            current_sku = str(row['Item Number'])
                            best_diff = price_diff

                    sku = current_sku

                product_loader = ProductLoader(item=Product(),
                                               response=response)
                product_loader.add_value('name', name + ' ' + option['desc'])
                product_loader.add_value('sku', sku)
                identifier = response.xpath(
                    '//input[@name="sku"]/@value').extract()[0]
                product_loader.add_value(
                    'identifier', identifier + '-' + option['identifier'])
                product_loader.add_value('brand', brand)
                product_loader.add_value('category', categories)
                if image_url:
                    product_loader.add_value('image_url', image_url[0])
                product_loader.add_value('url', response.url)

                product_loader.add_value('price', option_price_value)
                product = product_loader.load_item()

                metadata = HouseholdEssentialsMeta()
                metadata['reviews'] = []
                product['metadata'] = metadata

                products_collected.append(product)
                sku_list.append(product['identifier'])

        else:
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name)
            product_loader.add_value('sku', sku)
            product_loader.add_xpath('identifier',
                                     '//input[@name="sku"]/@value')
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', categories)
            if image_url:
                product_loader.add_value('image_url', image_url[0])

            price = response.xpath(
                '//span[@data-id="dynamic-sku-price"]/text()').extract_first()

            product_loader.add_value('price', price)

            product_loader.add_value('url', response.url)

            product = product_loader.load_item()

            metadata = HouseholdEssentialsMeta()
            metadata['reviews'] = []
            product['metadata'] = metadata

            products_collected.append(product)
            sku_list.append(product['identifier'])

        # The stock endpoint is called with the transaction id embedded in
        # the page and the identifiers of all collected products.
        transaction_id = re.findall(r'"transactionID":"(.*)",',
                                    response.body)[0]
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': response.url,
            'X-Requested-With': 'XMLHttpRequest'
        }

        params = urlencode({
            'bpss': 'yes',
            'skulist': '~^~'.join(sku_list),
            'kitmode': '0',
            'postalcode': '67346',
            '_txid': transaction_id
        })

        yield Request(self.ajax_stock_url + '?' + params,
                      headers=headers,
                      dont_filter=True,
                      meta={
                          'product': products_collected,
                          'prod_id': prod_id,
                          'prod_url': response.url
                      },
                      callback=self.parse_stock)
示例#14
0
    def parse_product(self, response):
        """Parse a page that lists products as tables with an ``<h2>`` title.

        Yields a request for every link of a nested product list, and one
        item per product, or one per priced option when the product has a
        drop-down with prices and no general price. Items whose SKU is in
        ``INVALID_PRODUCTS`` are not yielded. The SKU (also used as
        identifier) comes from the ``(Ref: ...)`` marker in the title.
        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        prod_lists = hxs.select(
            '//div[@class="product_list"]/div/h3/a/@href').extract()
        for url in prod_lists:
            yield Request(urljoin_rfc(base_url, url))

        products = hxs.select(
            u'//table[child::tr[child::td[@colspan="2" and child::h2]]]')
        if products:
            # Category: page heading, else the "frag" block, else the last
            # breadcrumb text.
            try:
                category = hxs.select('//div[@class="page-heading"]/h1/text()'
                                      ).extract()[0].strip()
            except IndexError:
                try:
                    category = hxs.select(
                        '//div[@id="frag"]//text()').extract()[0].strip()
                except IndexError:
                    category = hxs.select(
                        '//p[@class="text_breadcrumbs"]//text()').extract(
                        ).pop()
        for product in products:
            try:
                image_url = urljoin_rfc(
                    base_url,
                    product.select('.//img/@src').extract()[0])
            except Exception:
                # Best effort: a missing or unusable image is not fatal.
                image_url = ''
            multiple_options = product.select(u'.//select/option')
            general_price = product.select(
                u'.//span[@class="actlarge"]/text()').extract()
            general_price = general_price[0] if general_price else None
            if not general_price:
                # Fall back to the carriage-inclusive price, divided by 1.2.
                general_price = product.select(u'.//*/text()').re(
                    u'Price inc UK Mainland Carriage.*?\:.*?\xa3([\d\.,]*)')
                general_price = str(round(float(general_price[0]) /
                                          1.2, 2)) if general_price else None
                log.msg(u'Product with: Price inc UK Mainland Carriage')
            if multiple_options and general_price:
                options_text = u' '.join(
                    product.select(u'.//select/option/text()').extract())
                if u'\xa3' in options_text:
                    log.msg(
                        u'Product with both option and general price: [%s]' %
                        response.url)
            name = product.select(u'.//h2/text()')[0].extract().strip()
            name_complete = ''.join(product.select(u'.//h2//text()').extract())
            if 'special offer' in name.lower():
                special_offer_starts_at = name.lower().index('special offer')
                new_name = name[:special_offer_starts_at].strip()
                if 'ref:' in new_name.lower():
                    self.log("Found special offer")
                    self.log("Before: '%s'" % name)
                    self.log("After: '%s'" % new_name)
                    name = new_name.replace(u'  (Ref', u' \xa0(Ref')

            # The reference in the full title is the same for every option
            # of this product, so look it up once.
            ref_match = re.search(r'\(Ref:\s*([^\)]+)\)', name_complete, re.I)
            comma_price_regex = r'[\d]{1,2},[\d]{2}'

            if multiple_options and not general_price:
                idx = 1
                for option in multiple_options:
                    option_text = option.select(u'./text()')[0].extract()

                    price = re.search(u'\xa3([\d\.,]+)', option_text)
                    if not price:
                        # Options without a price are not products.
                        continue
                    price = price.group(1)
                    if re.search(comma_price_regex, price):
                        price = price.replace(',', '.')

                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name',
                                     name + u' %s' % option_text.strip())
                    loader.add_value('category', category)
                    loader.add_value('image_url', image_url)
                    loader.add_value('url', response.url)
                    loader.add_value('price', price)
                    if ref_match:
                        # SKU suffix: the option code when the text has one,
                        # otherwise a running ".inc<N>" counter.
                        optsku = option_text.strip().lower().replace(
                            'code', '').strip('-. ').split('-')[0]
                        if optsku:
                            loader.add_value('sku',
                                             ref_match.group(1) + optsku)
                        else:
                            loader.add_value(
                                'sku',
                                ref_match.group(1) + ".inc" + str(idx))
                            idx += 1
                        loader.add_value('identifier',
                                         loader.get_output_value('sku'))

                    if loader.get_output_value('sku') not in INVALID_PRODUCTS:
                        yield loader.load_item()
            else:
                if not general_price:
                    continue
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('url', response.url)
                loader.add_value('name', name)
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                if re.search(comma_price_regex, general_price):
                    general_price = general_price.replace(',', '')
                loader.add_value('price', general_price)
                if ref_match:
                    loader.add_value('sku', ref_match.group(1))
                    loader.add_value('identifier',
                                     loader.get_output_value('sku'))

                if loader.get_output_value('sku') not in INVALID_PRODUCTS:
                    yield loader.load_item()
示例#15
0
    def parse_product(self, response):
        """Parse a product page and yield one item per purchasable variant.

        A 405 response is treated as an antibot captcha page: the original URL
        is re-requested (without merging cookies) until 9 retries have been
        made, after which the page is given up on.

        Otherwise three page layouts are handled, in this order:

        1. A JSON blob in a ``data`` attribute under
           ``div.v2-product-subproducts``; one item per entry of its
           ``sub_products``, or a single item when there are none.
        2. HTML ``sub-products`` blocks whose ``<label>`` carries ``data-*``
           attributes; one item per block.
        3. ``<option data-name>`` elements (one item each) or, failing that,
           a single item built from the page itself.

        For every item a shipping cost of 2.95 is added when the price is
        below 10, and a ``FragranceDirectMeta`` with the promotion text and
        the price divided by 1.2 is attached as ``metadata``.
        """
        if response.status == 405:
            # Prefer the URL that was originally requested; fall back to the
            # request URL when the response did not come through a redirect
            # (``redirect_urls`` is absent from meta in that case).
            redirect_urls = response.meta.get('redirect_urls')
            url = redirect_urls[0] if redirect_urls else response.request.url
            retries = response.meta.get('retries', 0)
            if retries >= 9:
                self.logger.error(
                    'Gave up retrying avoid antibot captcha for %s' % url)
                return
            self.logger.debug('DistilNetworks antibot captcha. Retrying %s' %
                              url)
            yield response.request.replace(dont_filter=True,
                                           url=url,
                                           meta={
                                               'retries': retries + 1,
                                               'dont_merge_cookies': True
                                           })
            return

        if response.url in self.old_urls:
            self.old_urls.remove(response.url)

        category_xpath = (
            "//div[@id='breadcrumb']"
            "//li[position() > 1 and position() < last()]//text()")

        def add_shipping_and_price(loader, price):
            # Orders under 10 carry a 2.95 shipping charge, which is also
            # folded into the price value added here.
            shipping_cost = extract_price('2.95') if price < 10 else 0
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('price', price + shipping_cost)

        def build_product(loader):
            # Load the item and attach promotion / ex-VAT price metadata.
            product = loader.load_item()
            promotion = response.xpath(
                "//div[@id='product-offer-tab']//h3//text()").extract()
            metadata = FragranceDirectMeta()
            if promotion:
                metadata['promotion'] = promotion[0]
            if product.get('price'):
                metadata['price_exc_vat'] = Decimal(
                    product['price']) / Decimal('1.2')
            product['metadata'] = metadata
            return product

        # ------------------------------------------------------------------
        # Layout 1: product described by an embedded JSON blob.
        # ------------------------------------------------------------------
        options_data = response.xpath(
            "//div[@class='v2-product-subproducts']//@data").extract()
        if options_data:
            options_data = json.loads(options_data[0])

            product_name = options_data['name']
            if not options_data.get('sku', 0):
                # No SKU in the blob: nothing is emitted for this page.
                return

            sub_products = options_data['sub_products']
            if sub_products:
                # Iterate the sub-products themselves (iterating the blob
                # would only walk its top-level keys).
                # NOTE(review): accepts either a list or a mapping of
                # sub-products, since the JSON schema is not visible here.
                if isinstance(sub_products, dict):
                    sub_products = sub_products.values()

                for sub_option in sub_products:
                    loader = ProductLoader(item=Product(), response=response)
                    price = extract_price(
                        sub_option['prices']['price']['amount'])

                    loader.add_value('url', response.url)

                    option_name = sub_option['option1']
                    # Unicode template: the JSON values are unicode and may
                    # contain non-ASCII characters.
                    loader.add_value(
                        'name',
                        u"{product} {option}".format(product=product_name,
                                                     option=option_name))
                    loader.add_value('stock',
                                     sub_option['stock']['is_in_stock'])

                    loader.add_xpath('category', category_xpath)
                    loader.add_xpath(
                        'brand', "//div[@class='v2-gallery-block']//img/@alt")

                    add_shipping_and_price(loader, price)

                    loader.add_value('sku', sub_option['sku'])
                    loader.add_value('identifier', sub_option['sku'])

                    # The image path is a literal value, not an XPath.
                    loader.add_value('image_url',
                                     sub_option['main_image']['large_path'])

                    yield build_product(loader)
            else:
                loader = ProductLoader(item=Product(), response=response)
                price = extract_price(
                    options_data['prices']['price']['amount'])

                # NOTE(review): 'price' is added here and again (with
                # shipping) below; which one wins depends on ProductLoader's
                # output processor -- confirm the intended value.
                loader.add_value('price', price)
                loader.add_value('url', response.url)

                loader.add_value('name', product_name)
                loader.add_value('stock',
                                 options_data['stock']['is_in_stock'])

                loader.add_xpath('category', category_xpath)
                loader.add_xpath(
                    'brand', "//div[@class='v2-gallery-block']//img/@alt")

                add_shipping_and_price(loader, price)

                loader.add_value('sku', options_data['sku'])
                loader.add_value('identifier', options_data['sku'])

                loader.add_value('image_url',
                                 options_data['main_image']['large_path'])

                yield build_product(loader)
            return

        # ------------------------------------------------------------------
        # Layouts 2 and 3: product described by the HTML itself.
        # ------------------------------------------------------------------
        product_name = response.xpath(
            "//h1[@class='fn']//text()").extract()[0]
        options = response.xpath(
            "//div[contains(@class, 'sub-products')]/div")
        sku = ''.join(
            response.xpath(
                "//form[@name='notifications']//input[@name='p']/@value").
            extract())

        if options:
            for sub_option_2 in options:
                sku_option = ''.join(
                    sub_option_2.xpath("./label/@data-sub-sku").extract())

                loader = ProductLoader(item=Product(), response=response)
                price = extract_price(
                    sub_option_2.xpath("./label/@data-subprice").extract()[0])
                if not price:
                    # Fall back to the page-level price.
                    price = extract_price(''.join(
                        response.xpath(
                            '//p[@class="price-info"]//span[@class="Price"]/text()'
                        ).extract()).strip())

                # NOTE(review): 'price' is added twice (see the JSON layout).
                loader.add_value('price', price)
                loader.add_value('url', response.url)

                option_name = sub_option_2.xpath(
                    "./label/@data-option").extract()[0]
                loader.add_value(
                    'name',
                    u"{product} {option}".format(product=product_name,
                                                 option=option_name))

                stock = ''.join(
                    sub_option_2.xpath(
                        "./label/@data-stock").extract()).strip().lower()
                loader.add_value(
                    'stock', '1' if stock in ['limited', 'in stock'] else '0')

                loader.add_xpath('category', category_xpath)
                loader.add_xpath('brand',
                                 "//a[@class='product-brand']//img/@alt")

                add_shipping_and_price(loader, price)

                loader.add_value('sku', sku_option)
                loader.add_value('identifier',
                                 '{}_{}'.format(sku, sku_option))

                # NOTE(review): this selects a child *element* named
                # data-image-large, not an attribute -- possibly meant to be
                # an @data-image-large lookup; left unchanged, confirm.
                img = ''.join(
                    sub_option_2.xpath("./data-image-large").extract())
                if not img:
                    img = ''.join(
                        response.xpath(
                            "//img/@data-original-large").extract())
                loader.add_value('image_url', 'http:' + img)

                yield build_product(loader)
            return

        options = response.xpath('//option[@data-name]')
        if options:
            for opt in options:
                loader = ProductLoader(item=Product(), response=response)
                product_image_json = opt.xpath('@data-image').extract()
                if product_image_json:
                    product_image_data = json.loads(product_image_json[0])
                    loader.add_value('image_url',
                                     product_image_data['default'])

                product_stock = opt.xpath('@data-stock').extract()[0]
                if product_stock == 'Out of Stock':
                    loader.add_value('stock', 0)

                option_name = opt.xpath('@data-name').extract()[0]
                loader.add_value('name', product_name + ' ' + option_name)

                price_data = json.loads(
                    opt.xpath('@data-price').extract()[0])
                loader.add_value('price', price_data['price'])

                option_sku = opt.xpath('@value').extract()[0]
                loader.add_value('sku', option_sku)
                loader.add_value('identifier', sku + '_' + option_sku)

                loader.add_xpath('category', category_xpath)
                loader.add_xpath('brand',
                                 "//a[@class='product-brand']//img/@alt")

                loader.add_value('url', response.url)

                # Read the price back through the loader so it has been
                # run through the loader's own processors.
                price = loader.get_output_value('price')
                add_shipping_and_price(loader, price)

                yield build_product(loader)
            return

        if not sku:
            # Neither options nor a SKU: nothing to emit.
            return

        loader = ProductLoader(item=Product(), response=response)
        price = ''.join(
            response.xpath(
                '//p[@class="price-info"]//span[@class="Price"]/text()'
            ).extract()).strip()
        if price == '':
            price = ''.join(
                response.xpath(
                    "//span[@class='Price ']//span[@class='Price-integer' or @class='Price-decimal']//text()"
                ).extract())
        if price == '':
            self.log("Error! No price! URL: {}".format(response.url))
            return
        price = extract_price(price)
        loader.add_value('url', response.url)

        loader.add_value('name', product_name)

        stock = ''.join(
            response.xpath("//span[@class='stock-level']//text()").
            extract()).strip()
        loader.add_value(
            'stock', '1' if stock.lower() in ['limited', 'in stock'] else '0')

        loader.add_xpath('category', category_xpath)
        loader.add_xpath('brand', "//a[@class='product-brand']//img/@alt")

        add_shipping_and_price(loader, price)

        loader.add_xpath(
            'sku', "//form[@name='notifications']//input[@name='p']/@value")
        loader.add_xpath(
            'identifier',
            "//form[@name='notifications']//input[@name='p']/@value")

        loader.add_xpath('image_url', "//img/@data-original-large")

        yield build_product(loader)
示例#16
0
    def parse_product(self, response):
        """Parse a product page, yielding one item per selectable option.

        Non-HTML responses are ignored. Each ``<option>`` text of the
        attribute ``<select>`` is split on parentheses: the first part is the
        option name and the second, when present, a signed price difference
        that is added to the page's base price. When the page has no such
        options a single item is produced.

        After the items, one ``Request`` is yielded for every related product
        link found on the page (excluding the link to the page itself), with
        this method as callback.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        brand = hxs.select(
            "//*[contains(text(),'Dise') and contains(text(),'ador:')]/text()"
        ).extract()
        brand = brand[0].split(':')[1].strip() if brand else ''

        # Extract product options and their price differences.
        option_specs = []
        product_options = hxs.select(
            u'//select[@class="form" and contains(@onchange, "actualiza_atributos")]/option/text()'
        ).extract()
        for option_text in product_options:
            parts = re.split(r'[()]', option_text, 2)
            option_name = parts[0]
            if len(parts) == 1:
                # No "(...)" part: the option does not change the price.
                price_diff = 0
            else:
                price_spec = parts[1]
                modifier = -1 if price_spec.startswith('-') else 1
                price_diff = price_spec.replace('+', '').replace('-', '')
                price_diff = Decimal(spanishDecimal(price_diff)) * modifier

            option_specs.append({
                'extra_name': option_name,
                'price_diff': price_diff
            })

        if not option_specs:
            # No options on the page: emit the plain product once.
            option_specs.append({'extra_name': '', 'price_diff': 0})

        # Page-level values are the same for every option, so they are
        # extracted once instead of once per option.
        category = hxs.select(
            u'//td[@class="cont_heading_td"]/span[@class="sub_cont_heading_td"]/text()'
        ).extract()
        category = category[0] if category else ''

        image_url = hxs.select(
            u'(//a[@rel="fotografias"])[1]/@href').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])

        name = hxs.select(
            u'//td[@class="cont_heading_td"]/h1[last()]/text()').extract()[0]
        name = name.strip()

        base_price = hxs.select('//td[@class="preu"]/text()[1]').extract()[0]
        base_price = Decimal(spanishDecimal(base_price))

        for option_spec in option_specs:
            extra_name = option_spec['extra_name']
            price_diff = option_spec['price_diff']

            product_loader = ProductLoader(item=Product(), response=response)
            if extra_name:
                product_loader.add_value(
                    'name', "%s - %s" % (name, extra_name.strip()))
            else:
                product_loader.add_value('name', name)

            product_loader.add_value('url', response.url,
                                     Compose(stripSessionId))
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            product_loader.add_value('image_url', image_url,
                                     Compose(stripSessionId))

            if extra_name:
                # Option items get the numeric page id suffixed with the
                # option name so that identifiers stay distinct per option.
                identifier = product_loader.get_value(response.url,
                                                      TakeFirst(),
                                                      re=r'p-([0-9]+)\.html')
                product_loader.add_value(
                    'identifier', "%s-%s" % (identifier, extra_name))
            else:
                product_loader.add_value('identifier',
                                         response.url,
                                         TakeFirst(),
                                         re=r'p-([0-9]+)\.html')

            product_loader.add_xpath('sku',
                                     '//td[contains(text(), "Ref:")]/text()',
                                     TakeFirst(),
                                     re='Ref: (.+)')

            price = base_price + price_diff if price_diff else base_price
            product_loader.add_value('price', price)

            product_loader.add_value('stock', 1)
            yield product_loader.load_item()

        # Follow related product links once per page (not once per option),
        # skipping the link that points back to this page.
        more_products = hxs.select(
            u'//div[@class="product_section_sub"][1]/a[@title]/@href'
        ).extract()
        _, _, urlpath = response.url.partition('/product-pol')
        url_to_remove = "/product-pol%s" % urlpath

        for product_url in set(more_products) - set([url_to_remove]):
            yield Request(urljoin_rfc(base_url, product_url),
                          callback=self.parse_product)
示例#17
0
    def parse_product(self, response):
        """Parse an eBay item page and yield its product item(s).

        If the response is actually a result listing (it contains
        ``#ResultSetItems``) it is delegated to ``self.parse``. Pages without
        an item title yield nothing.

        The price is taken from ``#prcIsum``, then ``#mm-saleDscPrc``, then
        the ``binPrice`` / ``bidPrice`` values embedded in the page body.
        When the page embeds a variations map, one item is yielded per
        variation (name extended with the lower-cased trait values, identifier
        ``<item id>:<variation key>``, variation price); otherwise the single
        base item is yielded.

        Raises:
            AttributeError: if no price can be found anywhere on the page
                (same failure mode as before).
        """
        hxs = HtmlXPathSelector(response)
        if hxs.select('//div[@id="ResultSetItems"]'):
            for x in self.parse(response):
                yield x
            return

        first_name = ' '.join(
            hxs.select('//*[@id="itemTitle"]/text()').extract()).strip()
        if not first_name:
            return

        identifier = response.url.split('?')[0].split('/')[-1]

        # Last breadcrumb link is used as the category.
        breadcrumbs = hxs.select(
            '//*[@id="vi-VR-brumb-lnkLst"]//a/text()').extract()
        category = breadcrumbs[-1] if breadcrumbs else ''

        seller_id = ''.join(
            hxs.select('.//*[@class="si-content"]'
                       '//a/*[@class="mbg-nw"]/text()').extract())

        brand_values = hxs.select(
            '//*[@class="attrLabels" and contains(text(), "Brand")]'
            '/following-sibling::*/text()').extract()
        brand = brand_values[0].strip() if brand_values else ''

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('name', first_name)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('category', category)
        product_loader.add_value('dealer', 'eBay - ' + seller_id)
        product_loader.add_value('brand', brand)
        product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
        product_loader.add_value('url', response.url)

        # Price: visible price elements first, embedded JSON values last.
        price = None
        for price_xpath in ('//*[@id="prcIsum"]/text()',
                            '//*[@id="mm-saleDscPrc"]/text()'):
            price_values = hxs.select(price_xpath).extract()
            if price_values:
                price = price_values[0].strip()
                break
        if price is None:
            # The prefix before the number is matched lazily and kept inside
            # the quoted value; a greedy ".*" would swallow all but the last
            # character of the price (and could run past the value's quote).
            match = (re.search(r'"binPrice":"[^"]*?([\d\.,]+)",',
                               response.body) or
                     re.search(r'"bidPrice":"[^"]*?([\d\.,]+)",',
                               response.body))
            # Deliberately unguarded: a page without any price is an error.
            price = match.group(1)
        product_loader.add_value('price', extract_price_eu(price))

        # Shipping cost (optional).
        shipping_values = hxs.select(
            '//*[@id="shippingSection"]//td/div/text()').extract()
        shipping_cost = shipping_values[0] if shipping_values else ''
        if shipping_cost:
            if 'free' in shipping_cost.lower():
                product_loader.add_value('shipping_cost', 0)
            else:
                try:
                    product_loader.add_value('shipping_cost',
                                             extract_price(shipping_cost))
                except Exception:
                    # Shipping cost is best-effort: an unparseable value
                    # must not prevent the item from being emitted.
                    pass

        product_ = product_loader.load_item()

        options_variations = []

        # The variations map is embedded in script text; "&quot;" entities
        # are dropped before the text is searched.
        sel = HtmlXPathSelector(text=response.body.replace('&quot;', ''))
        var_map_matches = sel.select('//*/text()').re(
            r'("menuItemMap":{.*}.*),'
            '"unavailableVariationIds"')
        if var_map_matches:
            json_var_map = unicode(var_map_matches[0])
            variations = json.loads(
                '{' +
                re.sub(r',"unavailableVariationIds".*', '', json_var_map) +
                '}')

            menu_map = variations['menuItemMap']

            for key, variation in variations['itemVariationsMap'].items():
                if not variation['traitValuesMap']:
                    continue
                # Map each trait to the display name of its selected value.
                new_variation = {}
                for option, value in variation['traitValuesMap'].items():
                    new_variation[option] = menu_map[str(value)]['displayName']
                options_variations.append({
                    'price': variation['price'],
                    'values': new_variation,
                    'identifier': '%s:%s' % (identifier, key)
                })

        if not options_variations:
            yield product_
            return

        for model in options_variations:
            model_name = first_name + ' ' + ' '.join(
                opt_name.strip().lower()
                for opt_name in model['values'].values())
            new_product = Product(product_)
            new_product['name'] = model_name
            new_product['identifier'] = model['identifier']
            new_product['price'] = extract_price_eu(model['price'])

            yield new_product
示例#18
0
    def parse_product(self, response):
        """Build the product item for a page and emit it via ``yield_product``.

        The spider is closed (``CloseSpider``) when the page has no standard
        shipping cost or no SKU; pages without a name yield nothing. The
        category comes from ``self.made_products`` (keyed by upper-cased SKU)
        when available, otherwise from the breadcrumbs, otherwise from the
        URL path segments. When the page contains a grouped-product table,
        one item per priced row is emitted instead of the base item.
        """
        hxs = HtmlXPathSelector(response)
        base_loader = ProductLoader(item=Product(), selector=hxs)

        base_loader.add_xpath('identifier', '//input[@name="product"]/@value')
        if not base_loader.get_output_value('identifier'):
            # Fall back to the id embedded in the price element's id.
            base_loader.add_xpath(
                'identifier',
                'substring-after(//span[starts-with(@id,"product-price-")]/@id, "product-price-")'
            )
        base_loader.add_xpath('sku',
                              '//*[contains(text(),"UGK")]/../*[2]/text()')
        base_loader.add_value('url', response.url)
        base_loader.add_xpath('name', '//div[@itemprop="name"]//text()')
        base_loader.add_xpath('image_url',
                              '//meta[@itemprop="image"]/@content')
        base_loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

        shipping_texts = hxs.select(
            u'//*[contains(text(), "livraison standard")]//following-sibling::*/span[@class="price"]/text()'
        ).extract()
        if not shipping_texts:
            self.log('No shipping cost on %s' % response.url)
            self.log(response.body)
            self.log('Closing spider')
            raise CloseSpider
        base_loader.add_value('shipping_cost',
                              shipping_texts[0].replace(',', '.'))

        product_name = base_loader.get_output_value('name')
        if not product_name:
            return

        # Names whose first word is "2" get a fixed brand; otherwise the
        # brand is whatever precedes the first comma of the name.
        if product_name.split()[0] == '2':
            brand = 'Flynn'
        else:
            brand = product_name.split(',')[0]
        base_loader.add_value('brand', brand)

        # Stock is not populated here (the availability-based logic was
        # commented out in the previous version of this method).

        raw_sku = base_loader.get_output_value('sku')
        sku = raw_sku.upper().strip() if raw_sku else ''
        if not sku:
            self.log('No SKU on %s' % response.url)
            self.log(response.body)
            self.log('Closing spider')
            raise CloseSpider

        known_product = self.made_products.get(sku, None)
        no_category = False
        if not known_product:
            base_loader.add_xpath(
                'category',
                '//div[@class="breadcrumbs"]/ul/li[position()>1]/a/span/text()'
            )
            if not base_loader.get_output_value('category'):
                # Last resort: derive the category from the URL path.
                base_loader.add_value(
                    'category',
                    (segment.replace('-', ' ')
                     for segment in response.url.split('/')[3:-1]))
                no_category = True
        else:
            base_loader.add_value('category', known_product['Category'])

        product = base_loader.load_item()

        # Normalise known category spellings; unknown ones pass through.
        category_aliases = {
            "bedding and bath": "Bed & Bath",
            "beds": "Beds",
            "chairs": "Chairs",
            "homewares accessories": "Home Accessories",
            "lighting": "Lighting",
            "sofas and armchairs": "Sofas",
            "storage": "Storage",
            "tables": "Tables",
        }
        current_category = product['category']
        product['category'] = category_aliases.get(current_category,
                                                   current_category)

        rows = hxs.select(
            '//table[@id="super-product-table"]//tr/td[@class="price"]/..')
        if rows:
            for row in rows:
                row_loader = ProductLoader(item=Product(product),
                                           selector=row)
                row_loader.add_xpath(
                    'identifier',
                    'substring-after(.//span[starts-with(@id,"product-price-")]/@id, "product-price-")'
                )
                row_loader.add_value('name', product['name'])
                row_loader.add_xpath('name', './/td[1]/text()')
                row_loader.add_xpath('price',
                                     './/span[@property="price"]/@content')
                for item in self.yield_product(row_loader.load_item(),
                                               no_category):
                    yield item
        else:
            for item in self.yield_product(product, no_category):
                yield item
示例#19
0
    def parse(self, response):
        """Parse a category listing page.

        If the page offers an inactive "list view" link, only a request for
        that view is yielded. Otherwise the next page (if any) is requested,
        one item is yielded per product row whose identifier has not been
        seen before (tracked in ``self.new_ids``), and the page is retried
        (up to 10 times) when it looks blocked or incomplete: no next-page
        link together with either no products or not being the last page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def url_processing(urls):
            # Absolute form of the last URL, without fragment and query.
            if not urls:
                return ''
            absolute = urljoin_rfc(base_url, urls[-1])
            return absolute.split('#')[0].split('?')[0]

        def price_processing(prices):
            return extract_price_eu(prices[0])

        def carried_meta(**extra):
            # Fresh meta dict per request, always carrying category/cookiejar.
            meta = {
                'category': response.meta['category'],
                'cookiejar': response.meta['cookiejar']
            }
            meta.update(extra)
            return meta

        list_view_mode = hxs.select(
            '//a[contains(@class, "enable-view") '
            'and contains(@class, "enable-list-view") '
            'and not(contains(@class, "active"))]/@href').extract()
        if list_view_mode:
            yield Request(url_processing(list_view_mode),
                          meta=carried_meta())
            return

        # Page count: carried over from a previous request when available.
        last_page_no = int(response.meta.get('last_page_no', 0))
        if not last_page_no:
            page_count = hxs.select(
                '//input[@id="page-counter"]/@data-pagecount').extract()
            last_page_no = int(page_count[0] if page_count else 0)

        current_page = hxs.select(
            '//input[@id="page-counter"]/@data-currentpage').extract()
        current_page_no = int(current_page[0] if current_page else 0)
        is_last_page = (current_page_no == last_page_no)

        next_page = hxs.select(
            '//li[contains(@class, "page-arrow") and contains(@class, "arrow-next")]//a/@href'
        ).extract()
        if next_page:
            yield Request(url_processing(next_page),
                          meta=carried_meta(last_page_no=last_page_no))

        products = hxs.select(
            '//div[contains(@class, "category-list-body")]'
            '/div[@data-pid and contains(@class, "cat-prod-row")]')
        name_link = './/strong[contains(@class, "cat-prod-row-name")]//a'
        for row in products:
            loader = ProductLoader(item=Product(), selector=row)
            loader.add_xpath('name', name_link + '/text()')
            loader.add_xpath('identifier', '@data-pid')
            loader.add_xpath('sku', '@data-pid')
            loader.add_xpath('url', name_link + '/@href', url_processing)
            loader.add_xpath('price',
                             './/strong[contains(@class, "price")]/text()',
                             price_processing)
            loader.add_value('category', response.meta['category'].split(','))
            loader.add_xpath(
                'image_url',
                './/div[contains(@class, "cat-prod-row-foto")]//img[@data-original]'
                '/@data-original|.//div[contains(@class, "cat-prod-row-foto")]//img/@src',
                url_processing)
            item = loader.load_item()
            if item['identifier'] in self.new_ids:
                continue
            self.new_ids.append(item['identifier'])
            yield item

        # A page with a next link is fine; so is a last page with products.
        needs_retry = (not next_page) and ((not products) or
                                           (not is_last_page))
        if not needs_retry:
            return

        blocked_url = url_query_parameter(response.url, 'returnUrl')
        if blocked_url:
            blocked_url = urljoin_rfc(base_url, blocked_url)
            self.log('ERROR: Blocked URL => %s' % blocked_url)
        else:
            self.log('ERROR: No products or no next page in => %s' %
                     response.url)

        retry_no = int(response.meta.get('retry_no', 0))
        if retry_no >= 10:
            return
        retry_no += 1
        self.log('DEBUG: Retrying page - Retry No: %s' % retry_no)
        yield Request(blocked_url or response.url,
                      meta=carried_meta(retry_no=retry_no,
                                        last_page_no=last_page_no),
                      dont_filter=True)
示例#20
0
    def parse_product(self, response):
        """Yield one item per option row of every variant block on the page.

        The name, URL, brand (taken from ``response.meta``), categories, SKU
        and main image are loaded once into a base item.  Every option row of
        every ``div.variant`` block then gets a deep copy of that item with
        its own identifier, name, price, shipping cost, stock flag and image.
        Pages without any variant block are only logged and yield nothing.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand'))
        categories = hxs.select(
            '//div[@id="breadcrumbs"]/div[@class="crumbs"]/span/a/span/text()'
        ).extract()
        # The first two breadcrumb entries are not used as categories.
        for category in categories[2:]:
            loader.add_value('category', category)

        sku = hxs.select('//meta[@itemprop="sku"]/@content').extract()
        loader.add_value('sku', sku)

        image_url = hxs.select(
            '//div[@id="product-image"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_url[0]))

        # The loaded product name is the prefix of every option identifier.
        identifier = loader.get_output_value('name')

        loader.add_value('shipping_cost', '0.00')

        item = loader.load_item()

        variants = response.xpath('//div[@class="variant"]')
        if not variants:
            self.log('PRODUCT WITHOUT OPTIONS: ' + response.url)
            return

        for variant in variants:
            title = variant.xpath('.//div[@class="title"]/h4/text()').extract()
            if not title:
                # Without a title no identifier can be built; skip this
                # variant instead of aborting the whole page.
                self.log('VARIANT WITHOUT TITLE: ' + response.url)
                continue
            variant_name = title[0].strip()

            # Price and image belong to the variant, not to the individual
            # option rows, so they are looked up once per variant.
            price = (variant.xpath('.//span[@class="now"]/text()').extract_first()
                     or variant.css('p.price span::text').extract_first())
            variant_price = extract_price(price) if price else Decimal('0.00')
            variant_image = variant.xpath('.//img/@src').extract()

            for option in variant.xpath('.//tr'):
                name_cell = option.xpath('.//td[@class="name"]/text()').extract()
                if not name_cell:
                    # Rows without a name cell carry no option.
                    continue
                option_name = name_cell[0].strip()

                option_item = deepcopy(item)
                # Unicode templates keep the whole computation in unicode, so
                # non-ASCII names neither raise nor need an encode/decode
                # round trip.
                option_item['identifier'] = u'{}-{}-{}'.format(
                    identifier, variant_name, option_name)
                # Do not repeat the option name when it equals the variant's.
                suffix = (option_name
                          if option_name.lower() != variant_name.lower()
                          else u'')
                option_item['name'] += u' {} {}'.format(variant_name, suffix)
                option_item['name'] = option_item['name'].strip()

                option_item['price'] = variant_price
                if Decimal(option_item['price']) < Decimal('30.00'):
                    option_item['shipping_cost'] = '1.99'
                if not option.xpath('.//td[@class="stock instock"]'):
                    option_item['stock'] = 0
                if variant_image:
                    # Otherwise the product-level image of the base item stays.
                    option_item['image_url'] = variant_image[0]
                yield option_item
示例#21
0
    def parse(self, response):
        """Follow listing links and yield the products shown on the page.

        Requests are issued for every category and sub-category link (with
        ``self.page_query`` and page number 1 appended) and for every
        pagination link.  Products are read from the ``h2.product-name``
        layout first; only when that yields no item is the table-based
        ``search_results_section`` layout tried.
        """
        URL_BASE = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # categories first, then sub-categories
        for links_xpath in ('//ul[@id="nav"]/li//a/@href',
                            "//a[@class='subcategory_link']/@href"):
            for url in hxs.select(links_xpath).extract():
                url = urljoin_rfc(URL_BASE, url)
                url += self.page_query + str(1)
                yield Request(url)

        # pages; joining leaves absolute links untouched and makes relative
        # ones valid request URLs
        for url in hxs.select("//div[@class='pages']//a/@href").extract():
            yield Request(urljoin_rfc(URL_BASE, url))

        # products list
        products_count = 0
        products = hxs.select('//h2[@class="product-name"]/..')
        if not products:
            self.log("ERROR!! NO PRODUCTS!! %s " % response.url)
        for product_el in products:
            name = product_el.select(".//h2/a/text()").extract()
            if not name:
                continue

            url = product_el.select(".//h2/a/@href").extract()
            if not url:
                self.log("ERROR!! NO URL!! %s" % response.url)
                continue

            price = (
                product_el.select(
                    './/span[@class="price" and starts-with(@id, "product-price")]/text()'
                ).extract()
                or product_el.select('.//span[@class="price"]/text()').extract())
            if price:
                price = price[0]
            else:
                # In this layout a missing price is reported but the product
                # is still emitted with a zero price.
                self.log("ERROR!! NO PRICE!! %s" % response.url)
                price = '0'

            products_count += 1
            yield self._load_listing_item(response, name[0], url[0], price)

        if products_count:
            return

        # products list 2 (table layout), used only when the first one
        # produced nothing
        products = hxs.select(
            "//form[@class='search_results_section']/table[2]/tr/td/table/tr/td/table/tr"
        )
        if not products:
            self.log("ERROR!! NO PRODUCTS!! %s " % response.url)
        for product_el in products:
            name = product_el.select(
                "td/a[@class='productnamecolor colors_productname']/text()"
            ).extract()
            if not name:
                continue

            url = product_el.select(
                "td/a[@class='productnamecolor colors_productname']/@href"
            ).extract()
            if not url:
                self.log("ERROR!! NO URL!! %s" % response.url)
                continue

            price = product_el.select(
                './/font[@class="pricecolor colors_productprice"]/text()'
            ).extract()
            if not price:
                # Unlike the first layout, rows without a price are skipped.
                self.log("ERROR!! NO PRICE!! %s" % response.url)
                continue

            # name[0]: the extracted value is a list; concatenating the list
            # itself with the padding strings raises TypeError.
            yield self._load_listing_item(response, name[0], url[0], price[0])

    def _load_listing_item(self, response, name, url, price):
        """Build a product item from the values scraped off one listing row."""
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', u' \r\n' + name + u'\r\n')
        loader.add_value('price', price)
        loader.add_value('sku', '')
        return loader.load_item()
示例#22
0
    def parse_product(self, response):
        """Scrape a product page, yielding one item per configurable option.

        URLs listed in ``self._ignore_urls`` are skipped.  When the page
        embeds a ``spConfig`` product configuration, one item is yielded per
        simple product referenced by its options, with the option labels
        appended to the name and the option prices added to the base price.
        Otherwise a single item is yielded, unless its name is listed in
        ``self._ignore_names``.
        """
        if response.url in self._ignore_urls:
            return

        hxs = HtmlXPathSelector(response)

        url = response.url
        name = hxs.select(
            "//*[@class='product-name']/*[@itemprop='name']/text()").extract()
        price = hxs.select(
            "//span[@itemprop='offers']/span[@itemprop='price']/@content"
        ).extract()
        category = hxs.select(
            "//li[contains(@class, 'category')]/a/text()").extract()
        image_url = hxs.select('//img[@id="image-main"]/@src').extract()

        free_shipping = hxs.select(
            '//div[@class="product-img-box"]//div[@class="onsale-product-label-image"]/table/tr/td[text()[contains(.,"Shipping")] and text()[contains(.,"Free")]]'
        ).extract()
        shipping_cost = Decimal(0) if free_shipping else 11.99

        identifier = hxs.select('//input[@name="product"]/@value').extract()
        if identifier:
            identifier = identifier[0]
        else:
            self.log("ERROR identifier not found")

        brand = hxs.select('//span[@itemprop="brand"]/@content').extract()
        if brand:
            brand = brand[0]
        else:
            self.log("ERROR brand not found")

        # stock stays None (and is not reported) when the page carries no
        # availability information
        availability = hxs.select(
            '//meta[@itemprop="availability"]/@content').extract()
        if availability:
            stock = 1 if 'in_stock' in availability[0].lower() else 0
        else:
            stock = None
            self.log("ERROR stock not found")

        # The configuration call is spelled with and without spaces around
        # the equals sign.
        options_config = None
        for pattern in (r'var spConfig=new Product.Config\((.*)\)',
                        r'var spConfig = new Product.Config\((.*)\)'):
            options_config = re.search(pattern, response.body)
            if options_config:
                break

        if not options_config:
            # `name` is the list of extracted text nodes; the ignore list is
            # matched against the name string itself.
            if name and name[0] in self._ignore_names:
                return
            l = ProductLoader(response=response, item=Product())
            l.add_value('name', name)
            l.add_value('price', price)
            l.add_value("identifier", identifier)
            l.add_value("brand", brand)
            l.add_value("shipping_cost", shipping_cost)
            l.add_value('category', category)
            l.add_value('image_url', image_url)
            l.add_value('url', url)
            if stock is not None:
                l.add_value("stock", stock)
            yield l.load_item()
            return

        if not (name and price and identifier):
            # Option items are derived from all three values; without them
            # the computations below would raise.
            self.log("ERROR name, price or identifier not found, "
                     "skipping options of %s" % url)
            return

        product_data = json.loads(options_config.group(1))
        option_names = {}
        option_prices = {}
        for attr in product_data['attributes'].values():
            for option in attr['options']:
                for product in option['products']:
                    # A simple product referenced by several attributes
                    # accumulates every label and every price delta.
                    option_names[product] = ' - '.join(
                        (option_names.get(product, ''), option['label']))
                    option_prices[product] = option_prices.get(
                        product, 0) + float(option['price'])

        for option_identifier, option_name in option_names.items():
            l = ProductLoader(response=response, item=Product())
            l.add_value('name', name[0] + ' ' + option_name)
            l.add_value('price',
                        float(price[0]) + option_prices[option_identifier])
            l.add_value("identifier", identifier + '-' + option_identifier)
            l.add_value("brand", brand)
            l.add_value("shipping_cost", shipping_cost)
            l.add_value('category', category)
            l.add_value('image_url', image_url)
            l.add_value('url', url)
            if stock is not None:
                l.add_value("stock", stock)
            yield l.load_item()
示例#23
0
    def parse_product(self, response):
        """Yield an item per size, per size+player and per size+player+tournament.

        Base product data comes from the page's schema data (via
        ``SpiderSchema``).  For every size option a base item is yielded;
        when the page offers player personalisation, one further item is
        yielded per player, and when it also offers tournament
        personalisation, one per player/tournament pair.  Each personalised
        price is the base price plus the price parsed from the matching
        option label.

        Every yielded item gets its own ``metadata`` dict: ``Product(item)``
        copies the item shallowly, so reusing the base item's dict would let
        the player data of one item leak into all the others.
        """
        schema = SpiderSchema(response)
        pdata = schema.get_product()

        sku = pdata.get('mpn', '')
        image = pdata['image'].replace('example.com', 'prodirectsoccer.com')
        main_id = response.xpath(
            '//div[@id="define-profile"]/@data-quickref').extract()[0]
        main_name = pdata['name']
        main_price = extract_price(pdata['offers']['properties']['price'])
        main_brand = response.meta.get('brand')
        shipping = '9.93'

        sizes = response.xpath('//select[@id="size"]/option[@value!=""]')
        if not sizes:
            return

        player_sel_label = response.xpath(
            '//label[@for="pers-opt1"]/text()').extract()
        player_tourn_sel_label = response.xpath(
            '//label[@for="pers-opt2"]/text()').extract()

        # The personalisation options are the same for every size, so they
        # are read once instead of once per size.
        players = []
        tournaments = []
        player_sel_price = player_tourn_price = None
        if player_sel_label:
            player_sel_price = extract_price(player_sel_label[0])
            for player_opt in response.xpath(
                    '//select[@id="pers-player"]/option[@value!=""]'):
                player_desc = player_opt.xpath('text()').extract()[0].strip()
                player_value = player_opt.xpath('@value').extract()[0].strip()
                # Descriptions of the form "<number> <name>" provide the
                # player metadata; others leave it empty.
                player_meta = {}
                match = re.search(r'(\d+)\s(.*)', player_desc)
                if match:
                    player_number, player_name = match.groups()
                    player_meta = {'player': player_name.strip(),
                                   'number': player_number}
                players.append((player_value, player_desc, player_meta))

            if player_tourn_sel_label:
                player_tourn_price = extract_price(player_tourn_sel_label[0])
                for tourn_opt in response.xpath(
                        '//select[@id="pers-tournament"]/option[@value!=""]'):
                    tourn_desc = tourn_opt.xpath('text()').extract()[0].strip()
                    tourn_value = tourn_opt.xpath('@value').extract()[0].strip()
                    tournaments.append((tourn_value, tourn_desc))

        for size_opt in sizes:
            size_desc = size_opt.xpath('text()').extract()[0].strip()
            size_value = size_opt.xpath('@value').extract()[0].strip()
            in_stock = True
            if ' ' in size_desc:
                # Anything after the first space is a stock remark.
                size_desc, stock = size_desc.split(' ', 1)
                if 'OUT OF STOCK' in stock.upper():
                    in_stock = False

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', main_id + 'x' + size_value)
            loader.add_value('name', main_name + ' - ' + size_desc)
            loader.add_value('sku', sku)
            loader.add_value('price', main_price)
            loader.add_value('shipping_cost', shipping)
            loader.add_value('url', response.url)
            loader.add_value('image_url', image)
            if main_brand:
                loader.add_value('brand', main_brand)
            loader.add_value('category', 'Replicas')
            if not in_stock:
                loader.add_value('stock', 0)
            item = loader.load_item()
            item['metadata'] = {'size': size_desc}
            yield item

            for player_value, player_desc, player_meta in players:
                personalised_meta = {'size': size_desc}
                personalised_meta.update(player_meta)

                new_item = Product(item)
                new_item['identifier'] += 'x' + player_value
                new_item['name'] += ' - ' + player_desc
                new_item['price'] = Decimal(
                    new_item['price']) + player_sel_price
                new_item['metadata'] = dict(personalised_meta)
                yield new_item

                for tourn_value, tourn_desc in tournaments:
                    new_item = Product(item)
                    new_item['identifier'] += (
                        'x' + player_value + 'x' + tourn_value)
                    new_item['name'] += (
                        ' - ' + player_desc + ' - ' + tourn_desc)
                    # The tournament label price is added to the base price,
                    # not to the player price.
                    new_item['price'] = Decimal(
                        new_item['price']) + player_tourn_price
                    new_item['metadata'] = dict(personalised_meta)
                    yield new_item
示例#24
0
    def parse_product(self, response):
        """Build the product item and request the first page of its reviews.

        The price text is searched for in three places in turn; pages
        without any price text yield nothing.  SKU and brand come from
        ``response.meta``, the identifier is the trailing number of the URL.
        The shipping cost is read from the embedded ``product/data`` JSON
        and falls back to the shipping message on the page when the JSON is
        missing, unparsable or lacks the value.
        """
        hxs = HtmlXPathSelector(response)

        # The embedded JSON only feeds the shipping cost, so its absence must
        # not abort the whole page.
        data = {}
        script = response.xpath(
            '//script/text()[contains(., "product/data")]').extract_first()
        match = re.search('product/data",[ \n]*({.+})',
                          script) if script else None
        if match:
            try:
                data = json.loads(match.group(1))
            except ValueError:
                data = {}

        price = ''
        for price_xpath in (
                '//div[contains(@class, "js-product-offer-summary")]//div[contains(@class, "price-display")]//text()',
                '//div[@itemprop="offers"]//div[@itemprop="price"][1]//text()',
                '//span[contains(@class, "hide-content-m")]/span[@data-tl-id="Price-ProductOffer"]//text()'):
            price = ''.join(response.xpath(price_xpath).extract())
            if price:
                break

        # Some products are not available online and these have no price
        if not price:
            return

        in_stock = 'out of stock' not in price.lower()

        # Non-empty, stripped text fragments of the title.
        product_name = [
            text.strip()
            for text in hxs.select(
                '//h1[contains(@itemprop, "name")]//text()').extract()
            if text.strip()
        ]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('identifier',
                         re.search(r'/(\d+)$', response.url).group(1))
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('brand', response.meta['brand'])
        categories = [
            crumb.strip()
            for crumb in hxs.select(
                '//ol[contains(@class, "breadcrumb-list")]//li//a/span/text()'
            ).extract()
        ]
        loader.add_value('category', categories)
        loader.add_value('url', response.url)
        loader.add_xpath(
            'image_url',
            '//img[contains(@class, "js-product-primary-image")]/@src')
        try:
            loader.add_value(
                'shipping_cost',
                data['buyingOptions']['shippingPrice']['displayPrice'])
        except (KeyError, TypeError):
            # TypeError covers JSON nodes that are present but not objects.
            loader.add_css('shipping_cost',
                           'h2.js-shipping-primary-msg::text')

        loader.add_value('price', price)
        if not in_stock:
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = {}

        yield Request(self._get_reviews_url(item, 1),
                      meta={
                          'product': item,
                          'page': 1
                      },
                      callback=self.parse_product_reviews)
示例#25
0
    def parse(self, response):
        """Scrape a single book page and yield it as one product item.

        Pages without a product title or without any product identifier are
        logged and skipped.  Stock is only reported (as 0) when the stock
        status text contains "OUT OF STOCK".  The item's metadata carries the
        pre-order flag, author, format, publisher and publication date.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        names = hxs.select('//h1[@id="product_title"]/text()').extract()
        if not names:
            # product not found. Just continue
            self.log('WARNING: Product not found => %s' % response.url)
            return
        name = names[0].strip()

        stock_status = hxs.select('//p[@id="stock_status"]/text()').extract()
        # Only the last status text node is inspected.
        out_of_stock = bool(stock_status
                            and 'OUT OF STOCK' in stock_status[-1].upper())

        category = hxs.select(
            '//ul[@id="crumbs"]/li[@class="last"]/a/text()').extract()

        brand = hxs.select(
            '//div[@id="product_title_container"]/span[@class="secondary"]/text()'
        ).extract()

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url', '//img[@id="main_image"]/@src',
                         TakeFirst(), Compose(lambda v: urljoin(base_url, v)))
        loader.add_xpath(
            'price',
            '//div[@class="product_price"]/span[@class="price"]/text()',
            TakeFirst(),
            re="([.0-9]+)")
        if not loader.get_output_value('price'):
            loader.add_value('price', 0)

        if category:
            loader.add_value('category', category[0].strip())

        sku = hxs.select('//li[@itemprop="ISBN13"]/text()').extract()
        loader.add_value('sku', sku[-1].strip() if sku else '')

        if brand:
            loader.add_value('brand', brand[0].strip())

        identifier = (
            hxs.select('//input[@name="ProductID"]/@value').extract()
            or hxs.select('//li[@itemprop="id"]/text()').extract())
        if not identifier:
            # Neither identifier source is present; skip the page rather
            # than fail on the empty list.
            self.log('WARNING: Identifier not found => %s' % response.url)
            return
        loader.add_value('identifier', identifier[0])

        if out_of_stock:
            loader.add_value('stock', 0)

        item = loader.load_item()

        metadata = BookpeopleMeta()
        pre_order = hxs.select(
            '//button[contains(@class, "submit") and text()="Pre order"]')
        metadata['pre_order'] = 'Yes' if pre_order else ''
        author = hxs.select(
            '//span[contains(em/text(), "author")]/a/text()').extract()
        metadata['author'] = author[0] if author else ''
        book_format = hxs.select('//li[@itemprop="Format"]/text()').extract()
        metadata['format'] = book_format[-1].strip() if book_format else ''
        publisher = hxs.select('//span[@itemprop="publisher"]/a/text()').re(
            ': (.*)')
        metadata['publisher'] = publisher[0] if publisher else ''
        published = hxs.select(
            '//li[@itemprop="publication date"]/text()').extract()
        metadata['published'] = published[-1].strip() if published else ''
        item['metadata'] = metadata
        yield item
示例#26
0
    def parse_product(self, response):
        """Yield one item per size variation, plus a badge variant of each.

        Product data is read from JSON objects embedded in the page's
        scripts: a base product and, optionally, "hero" products, of which
        the selected one (or the only one) is used.  Variations of the hero
        product are processed before those of the base product.  Identifiers
        already present in ``self.extracted_identifiers`` are not yielded
        again.  When the base product lists a printing item with
        ``PrintingTypeID`` 3, every variation additionally produces an item
        with that printing's description, price and id appended.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('category', 'Kits')
        scripts = response.xpath('//script/text()')
        base_data = scripts.re(r'product\w{6} =(.+?});var')
        hero_data = scripts.re(r'product\d{7} =(.+?});var')
        if base_data:
            base_data = json.loads(base_data[0])
        if hero_data:
            heroes = [json.loads(elem) for elem in hero_data]
            selected_hero = response.xpath('//select[contains(@class,"heroShirts")]/option[@selected]/@value').extract_first()
            if selected_hero:
                hero_data = {elem['ProductID']: elem for elem in heroes}[int(selected_hero)]
            elif len(heroes) == 1:
                hero_data = heroes[0]
            else:
                hero_data = {}
        else:
            hero_data = {}

        # Everything below reads the base product, so it is required even
        # when hero data was found.
        if not base_data:
            return

        loader.add_value('name', base_data['Description'])
        loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
        if base_data['Brand']:
            loader.add_value('brand', base_data['Brand']['Name'].title())
        loader.add_value('image_url', response.urljoin(base_data['ImageURL']))
        loader.add_value('shipping_cost', self.shipping_cost)
        product = loader.load_item()

        # Player names: hero descriptions look like "... with <player> <number>"
        player = number = None
        player_from_name = re.search(r'with *([\w\ \.\-]+?) (\d+)', hero_data.get('Description', ''), re.UNICODE)
        if player_from_name:
            player, number = player_from_name.groups()

        # Badges: the printing of type 3 is the same for every variation, so
        # it is looked up once (the last entry of that type wins).
        printing = {
            elem.get('PrintingTypeID'): elem
            for elem in base_data.get('printingitems') or []
        }.get(3)

        for data in [hero_data, base_data]:
            with_player = bool(player_from_name) and data == hero_data
            for variation in data.get('Variations', []):
                size = variation['Description']
                metadata = {'size': size}
                if with_player:
                    metadata.update({'player': player, 'number': number})

                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, product)
                loader.replace_value('identifier', variation['VariationId'])
                loader.replace_value('name', data['Description'] + u' ' + size)
                loader.replace_value('price', Decimal(str(variation['PriceActual']))*self.exchange_rate)
                if data.get('ImageURL'):
                    # Use the image of the product being processed; the base
                    # image is already part of `product`.
                    loader.replace_value('image_url', response.urljoin(data['ImageURL']))
                if not variation['IsInStock']:
                    loader.replace_value('stock', 0)
                item = loader.load_item()
                if self._finish_kit_item(item, dict(metadata)):
                    yield item

                if not printing:
                    continue
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                option_name = loader.get_output_value('name') + u' ' + printing['PrintingDescription']
                loader.replace_value('name', option_name)
                price = Decimal(str(variation['PriceActual'])) + Decimal(str(printing['PriceActual']))
                loader.replace_value('price', format_price(price*self.exchange_rate))
                loader.replace_value(
                    'identifier',
                    str(variation['VariationId']) + '-' + str(printing['PrintingID']))
                item = loader.load_item()
                if self._finish_kit_item(item, dict(metadata)):
                    yield item

    def _finish_kit_item(self, item, metadata):
        """Apply the free-delivery rule and claim the item's identifier.

        Returns True, after attaching ``metadata``, when the identifier had
        not been extracted before; returns False for a duplicate.
        """
        if self.free_delivery_over is not None and self.free_delivery_over <= item['price']:
            item['shipping_cost'] = '0.00'
        if item['identifier'] in self.extracted_identifiers:
            return False
        self.extracted_identifiers.append(item['identifier'])
        item['metadata'] = metadata
        return True
示例#27
0
    def parse_product(self, response):
        """Build the product item and request its reviews.

        The SKU is the lower-cased text after the last ``skuid=`` in the
        URL and doubles as the item identifier.  Pages without a name,
        without a SKU, or whose breadcrumbs contain the word "books" are
        logged and skipped.  The item is not yielded here: it travels in the
        request meta to ``self.parse_reviews``.
        """
        hxs = HtmlXPathSelector(response)

        url = response.url
        brand = response.meta.get('brand', '').strip().lower()

        name = hxs.select(
            "//div[@class='primary-content']//div[@id='product-title']/h1/text()"
        ).extract()
        if not name:
            logging.error("ERROR! NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select(
            "//div[@class='secondary-content']//ul[@class='pricing']/li[@class='current-price']/span/text()"
        ).extract()
        if price:
            # Only the first two text fragments are joined into the price.
            price = "".join(price[:2])
        else:
            logging.error("ERROR! NO PRICE! %s %s" % (url, name))
            price = ''

        # rpartition reports whether the marker was found at all; splitting
        # on it would return the whole URL when it is absent.
        _, marker, sku = url.lower().rpartition('skuid=')
        if not marker or not sku:
            logging.error("ERROR! SKU! %s %s" % (url, name))
            return

        categories = " ".join(
            hxs.select("//div[@id='breadcrumbs']//li//a/text()").extract()
        ).lower().replace('\n', ' ').split(' ')
        if 'books' in categories:
            logging.error("ERROR! Product not valid  %s %s" % (url, name))
            return

        l = ProductLoader(item=Product(), response=response)
        l.add_value('identifier', sku)
        l.add_value('name', name)
        l.add_value('url', url)
        l.add_value('price', price)
        l.add_value('brand', brand)
        product = l.load_item()
        metadata = KeterMeta()
        metadata['brand'] = brand
        metadata['reviews'] = []
        product['metadata'] = metadata

        review_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=asiwwvlu4jk00qyffn49sr7tb&apiversion=5.4&displaycode=1235-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A' + sku + '&filter.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&sort.q0=rating%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv182_28795'
        yield Request(review_url,
                      meta={
                          'product': product,
                          'offset': 0,
                          'sku': sku
                      },
                      callback=self.parse_reviews)
    def parse(self, response):
        """Parse a listing page: follow pagination and schedule product pages.

        A response that is not an ``HtmlResponse`` is re-requested, with the
        number of attempts per URL tracked in ``self.retry_urls`` (at most
        100 attempts). For HTML responses, every pagination link is requested
        with this same callback, and every product row that exposes an
        add-to-cart id is turned into a partially filled ``Product`` which is
        handed to ``self.parse_product`` through ``meta['item']``.

        The page heading is used as both the category and the brand.
        """
        if not isinstance(response, HtmlResponse):
            retry_count = self.retry_urls.get(response.url, 0) + 1
            if retry_count > 100:
                self.log("ERROR MAX retry count reached (100), giving up...")
                return
            self.log(
                "ERROR - got response that is not HTML, adding to retry queue (#{})"
                .format(retry_count))
            self.retry_urls[response.url] = retry_count
            yield Request(url=response.url,
                          callback=self.parse,
                          dont_filter=True)
            # Stop here: this response is not HTML, so the selector code
            # below must not run on it. The retry will be parsed instead.
            return

        hxs = HtmlXPathSelector(response)
        # The base URL is the same for every link on the page; resolve it once.
        base_url = get_base_url(response)

        pages = hxs.select(
            '//div[@class="contents"]/div//h2//div[@class="pagination"]//a/@href'
        ).extract()
        for page in pages:
            yield Request(url=urljoin_rfc(base_url, page),
                          callback=self.parse)

        category = hxs.select('//div[@class="contents"]/h1/text()').extract()
        if not category:
            self.log('ERROR - No category name found!')
            category = brand = ''
        else:
            category = brand = category[0]

        products = hxs.select(
            '//div[@class="contents"]/table//tr[td[@valign="middle"]]')
        if not products:
            self.log('ERROR - empty products list, needs investigation!')
            return
        for product in products:
            # Rows without an add-to-cart link carry no usable identifier.
            product_id = product.select('.//a[@class="buttonBig"]/@href').re(
                r'add_to_cart/(\d+)')
            if not product_id:
                continue

            product_url = product.select('.//td//font//b//a/@href').extract()
            if not product_url:
                # Without a link there is no product page to request; skip
                # this row instead of failing on item['url'] and losing the
                # remaining rows of the page.
                self.log('ERROR - no product URL found for product {}, '
                         'skipping'.format(product_id[0]))
                continue

            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('url',
                                     urljoin_rfc(base_url, product_url[0]))
            product_image = product.select(
                './/img[@class="product_image"]/@src').extract()
            if product_image:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, product_image[0]))
            product_loader.add_value('identifier', product_id)
            product_loader.add_xpath('name', './/td//font//b//a/text()')
            product_loader.add_xpath('price',
                                     './/td//font[@class="price"]//b/text()')
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            item = product_loader.load_item()
            yield Request(item['url'],
                          callback=self.parse_product,
                          meta={'item': item})
示例#29
0
    def parse_product(self, response):
        """Yield the product(s) described by a product page.

        When the page embeds a ``spConfig`` ``Product.Config`` object, one
        item is yielded per option product listed in it: the identifier is
        ``<page product id>_<option product id>``, the name is the page name
        followed by `` - <label>`` for every option the product appears
        under, and the price is the config's ``basePrice``. Otherwise a
        single item is yielded using the price found in the page markup.

        The category comes from ``response.meta['category']`` and the brand
        is the first entry of ``self.brands`` that the product name starts
        with (empty string when none matches).
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        image_url = hxs.select('//*[@id="main-image"]/@href').extract()
        try:
            product_identifier = hxs.select(
                '//input[@name="product"]/@value').extract()[0].strip()
        except IndexError:
            # No "product" input on the page: fall back to the id embedded
            # in the add-to-cart form action. Only the missing-node case is
            # handled here; any other error should surface.
            product_identifier = hxs.select(
                '//form[@id="product_addtocart_form"]/@action').re(
                    r'/product/(\d+)')[0]
        product_name = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()[0].strip()
        category = response.meta.get('category')
        sku = hxs.select('//div[@class="sku-package"]/text()').extract()
        if sku:
            sku = sku[0].strip()
            sku = sku.replace('SKU# ', '')
        else:
            sku = ''

        brand = ''
        for b in self.brands:
            if product_name.startswith(b):
                brand = b
                break
        # Normalise the spacing around "=" so one pattern matches both forms.
        options_config = re.search(
            r'var spConfig=new Product.Config\((.*)\)',
            response.body.replace('var spConfig = new', 'var spConfig=new'))
        ean = hxs.select(
            '//div[@class="sku-package" and contains(text(), "SKU# ")]/text()'
        ).extract()
        # The EAN is identical for every item built from this page; None
        # means "no EAN node found" (an empty string is still stored).
        ean_value = ean[0].split("SKU# ")[-1].strip() if ean else None

        def build_product(identifier, name, price):
            # Build one item from the page-level data plus per-item values.
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier', identifier)
            product_loader.add_value('name', name)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product_loader.add_value('sku', sku)
            item = product_loader.load_item()
            if ean_value is not None:
                # Fresh dict per item so items never share metadata.
                item['metadata'] = {"ean": ean_value}
            return item

        if options_config:
            product_data = demjson.decode(options_config.groups()[0],
                                          return_errors=True)[0]
            # Map each option product id to its name suffix. Joining with the
            # (initially empty) previous value gives every label a leading
            # ' - ' separator.
            option_names = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    for option_product in option['products']:
                        option_names[option_product] = ' - '.join(
                            (option_names.get(option_product, ''),
                             option['label']))

            for identifier, option_name in option_names.iteritems():
                price = float(product_data['basePrice'])
                yield build_product(product_identifier + '_' + identifier,
                                    product_name + option_name,
                                    round(price, 2))
        else:
            price = hxs.select('//meta[@itemprop="price"]/@content').extract()
            price = ''.join(price).strip()
            if price == '':
                # No price meta tag: use the text of the old-price element.
                price = hxs.select('//*[@id="old-price-{}"]//text()'.format(
                    product_identifier)).extract()
                price = ''.join(price).strip()
            yield build_product(product_identifier, product_name,
                                extract_price(price))
示例#30
0
    def parse_product(self, response):
        """Build one item per product variant and route them onwards.

        Variants are read from the ``var variant_details`` script when the
        page has one; otherwise a single variant is assembled from the
        page's ``item_id`` input, ``sku`` input and price meta tag. Every
        item gets a ``ToyMonitorMeta`` with an empty ``reviews`` list plus
        the MPN / EAN when they are found on the page.

        If the page links to "See All Reviews", the items are passed to
        ``self.parse_reviews`` through ``meta['items']``; otherwise they are
        yielded directly.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # MPN and EAN are both read from the same "b-item" spans.
        item_spans = hxs.select('//span[@class="b-item"]')
        mpn = item_spans.re("MPN: ([0-9]+)")
        ean = item_spans.re("EAN: ([0-9]+)")

        sku = hxs.select('//input[@name="sku"]/@value').extract()
        name = hxs.select('//h1[@class="b-ttl-main"]/text()').extract()[0]
        dealer_text = hxs.select(
            '//h2[@id="auto_shop_info_name"]//text()').extract()
        dealer_name = "".join(dealer_text).strip()

        # Prefer the brand shown on the page, else the one from the request.
        page_brand = hxs.select('.//span[@itemprop="brand"]/text()').extract()
        brand = page_brand[0].strip() if page_brand else response.meta.get('brand')

        categories = hxs.select(
            '//ul[@class="b-breadcrumb"]/li/a/text()').extract()
        image_url = hxs.select(
            '//img[@itemprop="image"]/@data-frz-src').extract()

        variant_script = hxs.select(
            '//script[contains(text(), "var variant_details")]/text()'
        ).extract()
        if variant_script:
            script_text = variant_script[0].replace('&quot;', "'")
            found = re.findall('var variant_details = (.*);\n', script_text)
            variants = json.loads(found[0])
        else:
            # No variant script: describe the page as a single variant.
            identifier = hxs.select(
                '//input[@name="item_id"]/@value').extract()[0]
            price = hxs.select(
                '//div[@class="b-product-main"]//meta[@itemprop="price"]/@content'
            ).extract()[0]
            variants = [{
                'itemVariantId': identifier,
                'sku': sku,
                'variantValues': [],
                'defaultPricing': {'price': price},
            }]

        items = []
        for variant in variants:
            name_parts = [name] + variant.get('variantValues', [])

            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', variant['itemVariantId'])
            loader.add_value('name', " ".join(name_parts))
            loader.add_value('sku', variant['sku'])
            loader.add_value('url', response.url)
            loader.add_value('price', variant['defaultPricing']['price'])
            loader.add_value('dealer', dealer_name)
            loader.add_value('category', categories)
            if brand:
                loader.add_value('brand', brand)
            if image_url:
                loader.add_value('image_url', image_url[0])
            product = loader.load_item()

            # Fill the metadata completely, then attach it once.
            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            if mpn:
                metadata['mpn'] = mpn[0]
            if ean:
                metadata['ean'] = ean[0]
            product['metadata'] = metadata

            items.append(product)

        reviews_url = response.xpath(
            '//a[contains(text(), "See All Reviews")]/@href').extract()
        if reviews_url:
            review_meta = {'items': items, 'url': response.url}
            yield Request(reviews_url[0],
                          callback=self.parse_reviews,
                          meta=review_meta)
        else:
            for item in items:
                yield item