Example #1
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')
        loader.add_value('category', 'Le Creuset')
        loader.add_xpath('sku', '//input[@name="product"]/@value')
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        image_url = hxs.select(
            '//div[@class="product-img-box"]/a/@href').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])
        loader.add_xpath('price', '//div[@class="prodPriceWrap"]/h2/text()')
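        # Flat 4.75 delivery charge for items priced under 50, free delivery otherwise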
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '4.75')
        else:
            loader.add_value('shipping_cost', '0')

        loader.add_value('stock', '1')

        item = loader.load_item()
        metadata = LeCreusetMeta()
        item['metadata'] = metadata

        yield item
Example #2
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        meta = response.meta

        products = hxs.select(
            '//div[contains(@class, "item-details")]/div/header/h3/a/@href'
        ).extract()
        products += hxs.select(
            '//div[contains(@class, "setProduct")]/div/h5/a/@href').extract()
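        # Listing page: requeue every product link through this same callback and stop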
        if products:
            for product in products:
                url = urljoin_rfc(get_base_url(response), product)
                yield Request(url, callback=self.parse_product, meta=meta)
            return

        category = hxs.select(
            '//ol[contains(@class, "hof-breadcrumbs")]/li/a[@itemprop="breadcrumb"]/text()'
        ).extract()[-1]

        sku = hxs.select('//div[@class="product-code"]/text()').re(
            r'Product code:(.*)')[0].strip()
        name = hxs.select(
            '//span[@itemprop="name"]/text()').extract()[0].strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta['brand'])
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        image_url = hxs.select(
            '//img[contains(@class, " featuredProductImage")]/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])

        price = hxs.select(
            '//div[@id="productDetailsRefinementBlock"]/div/span/p[@class="priceNow"]/span[@class="value"]/text()'
        ).extract()
        if not price:
            price = hxs.select(
                '//span[@id="productPriceContainer"]/p[@class="price"]/text()'
            ).extract()

        loader.add_value('price', price[0])

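        # Whitespace-normalised "was" price is stored as promotion text in the metadata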
        price_was = ' '.join(
            map(
                lambda x: x.strip(),
                hxs.select(
                    '//div[@id="productDetailsRefinementBlock"]//p[@class="priceWas"]/span//text()'
                ).extract())).strip()
        item = loader.load_item()
        metadata = LeCreusetMeta()
        metadata['promotion'] = price_was
        item['metadata'] = metadata

        yield item
Example #3
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
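        # parse() serves both listing and product pages: follow any product links first,
        # then continue only if this page actually shows a product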
        for url in hxs.select(
                '//div[@class="product_list"]//a/@href').extract():
            yield Request(urljoin_rfc(get_base_url(response), url))

        if not hxs.select('//span[@class="product"]/h1/text()'):
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//span[@class="product"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')
        loader.add_xpath(
            'category',
            '//div[@class="text_breadcrumbs"]/a[position()>1]//text()')
        loader.add_xpath(
            'sku',
            'substring-after(//font[@size="1" and contains(text(), "Ref:")]/text(), ": ")'
        )
        loader.add_xpath(
            'identifier',
            'substring-after(//font[@size="1" and contains(text(), "Ref:")]/text(), ": ")'
        )
        image_url = hxs.select('//img[@class="fullimage1"]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_url[0]))
        loader.add_xpath('price',
                         '//h3[@class="product_price"]/prices/span[2]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', '//h3[@class="product_price"]//text()')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '4.95')
        else:
            loader.add_value('shipping_cost', '0')

        if hxs.select(
                '//div[@class="stock-message"]/span[contains(.//text(), "In stock") or contains(.//text(), "plenty of stock in")]'
        ):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')

        item = loader.load_item()
        metadata = LeCreusetMeta()
        metadata['promotion'] = ''.join(
            hxs.select(
                '//div[@class="special-offer-message"]/span/text()').extract())
        item['metadata'] = metadata

        yield item
Example #4
    def parse(self, response):
        reader = csv.DictReader(StringIO(response.body))
        for row in reader:
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier', row['Barcode'].lower())
            loader.add_value('sku', row['SKU'])
            loader.add_value('name', row['Descritpion'].decode('utf'))
            loader.add_value('price', row['Price'])
            loader.add_value('brand', 'Le Creuset')
            loader.add_value('image_url', row['Product images'])
            product = loader.load_item()
            metadata = LeCreusetMeta()
            metadata['asin'] = row['ASIN']
            product['metadata'] = metadata

            yield product
Example #5
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1/span/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')
        loader.add_xpath(
            'category',
            '//ul[@class="breadcrumbs"]/li[position()>1]//span[@itemprop="title"]/text()'
        )
        loader.add_xpath('sku', '//input[@name="productID"]/@value')
        loader.add_xpath('identifier', '//input[@name="productID"]/@value')
        image_url = hxs.select('//div[@id="thumbnails"]/a/img/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])
        loader.add_xpath('price', '//div[@id="price"]/text()')
        if loader.get_output_value('price') < 25:
            loader.add_value('shipping_cost', '2.99')
        else:
            loader.add_value('shipping_cost', '0')

        if hxs.select('//a[@id="addcartlink"]'):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')

        item = loader.load_item()
        metadata = LeCreusetMeta()
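        # The first number in the sale badge image URL is taken as the discount percentage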
        promotion = hxs.select(
            '//img[contains(@src,"percent/sale_")]/@src').re(r'\d+')
        if promotion:
            metadata['promotion'] = promotion[0] + '% off retail price'
        item['metadata'] = metadata

        yield item
Example #6
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)

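        # Variant data is embedded in the page's JavaScript; extract each object with a
        # regex and decode it as JSON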
        childMap = json.loads(
            re.search('\'childMap\': (.*),', response.body).group(1))
        prices = json.loads(
            re.search('\'prices\': (.*),', response.body).group(1))
        skus = json.loads(re.search('\'skus\': (.*),', response.body).group(1))
        stockStatuses = json.loads(
            re.search('\'stockStatuses\': (.*),', response.body).group(1))

        selects = []
        for sel in hxs.select('//div[@class="product-options"]//select'):
            s = []
            for opt in sel.select('.//option'):
                if opt.select('./@value').extract()[0]:
                    s.append((
                        opt.select('./@value').extract()[0],
                        opt.select('./text()').extract()[0],
                    ))
            if s:
                selects.append(s)

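        # No option dropdowns found: use a dummy option so the variant loop below still runs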
        if not selects:
            selects = [[('', ''), ('%', '')]]

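        # Add aliases for childMap keys containing the '_%' placeholder, with the placeholder removed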
        for k, v in list(childMap.items()):
            if '_%' in k:
                childMap[k.replace('_%', '')] = v

        found = False
        for c in itertools.product(*selects):
            key = [x[0] for x in c]
            name = [x[1] for x in c]
            code = childMap.get('_'.join(key))
            if not code: continue

            code = str(code)
            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
            loader.add_value('name', name)
            loader.add_value('sku', skus[code])
            loader.add_value('identifier', skus[code])
            loader.add_value('price', prices[code][0]['purchase'])
            loader.add_value('url', response.url)
            loader.add_value('brand', 'Le Creuset')
            if 'In stock' in stockStatuses.get(code, ''):
                loader.add_value('stock', '1')
            else:
                loader.add_value('stock', '0')

            if loader.get_output_value('price') < 45:
                loader.add_value('shipping_cost', '4.95')
            else:
                loader.add_value('shipping_cost', '0')

            loader.add_xpath('category',
                             '//div[@class="crumbs"]/a[position()>2]/text()')
            image_url = hxs.select(
                '//div[@id="product-image"]//img/@src').extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(get_base_url(response), image_url[0]))

            item = loader.load_item()
            metadata = LeCreusetMeta()
            item['metadata'] = metadata

            found = True
            yield item

        if not found:
            self.log('No products on %s' % response.url)
Example #7
    def _process_product_info_product_details(self, response, product_info):
        """
        This needs to be in a separate function because it is used by two methods: parse_product_details and parse_ajax_price
        """
        hxs = HtmlXPathSelector(response)
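        # The "regular price / savings" row, whitespace-normalised, becomes the promotion text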
        promotion = ' '.join(''.join(
            hxs.select(
                '//tr[@id="regularprice_savings"]//text()').extract()).split())
        metadata = LeCreusetMeta()
        metadata['promotion'] = promotion

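        # Fall back to the seller identifier passed in via request meta if the page didn't provide one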
        if response.meta.get(
                'seller_identifier',
                None) and not product_info.get('seller_identifier', None):
            product_info['seller_identifier'] = response.meta[
                'seller_identifier']

        check_match = response.meta.get('check_match', True)

        match = self.match(response.meta, self.current_search_item,
                           product_info)

        if check_match and not match:
            self.log("[AMAZON] WARNING: product does not match: %s" %
                     response.url)
            return

        if self.parse_options:
            if product_info['options'] and response.meta.get(
                    'parse_options', True):
                self.log('[AMAZON] OPTIONS FOUND => %s' % response.url)

                for option in product_info['options']:
                    new_meta = response.meta.copy()
                    new_meta.update({
                        'parse_options': False,
                        'search_string': self.current_search,
                        'search_item': self.current_search_item,
                        'check_match': check_match
                    })
                    yield Request(option['url'],
                                  self.parse_product,
                                  meta=new_meta,
                                  dont_filter=True)
                return
            else:
                if product_info['name_with_options']:
                    product_info['name'] = product_info['name_with_options']
                elif product_info['option_texts']:
                    product_info['name'] += ' [' + ', '.join(
                        product_info['option_texts']) + ']'

        if self.type == 'asins':
            url_asin = AmazonUrlCreator.get_product_asin_from_url(
                product_info['url'])
            if product_info['asin'].lower() != url_asin.lower():
                self.log(
                    "[AMAZON] product ASIN '%s' does not match url ASIN '%s'. Page: %s"
                    % (product_info['asin'], url_asin, response.url))
                return

        # Amazon Direct
        if self.amazon_direct:
            if self.collect_reviews and product_info.get(
                    'reviews_url') and response.meta.get(
                        'collect_reviews', True):
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string':
                        response.meta['search_string'],
                        'search_item':
                        self.current_search_item,
                    })
                yield Request(product_info['reviews_url'],
                              callback=self.parse_reviews,
                              meta=new_meta)
            else:
                product = self.construct_product(product_info,
                                                 meta=response.meta)
                self.log("[AMAZON] collect parse product: %s" %
                         product['identifier'])
                if self.type == 'category':
                    yield product
                else:
                    self._collect_amazon_direct(product, response.meta)
        # Buy Box
        elif self.only_buybox:
            if (product_info['vendor'] and self._seller_ok(product_info['vendor'])) or \
                    self.collect_products_with_no_dealer:
                if self.collect_reviews and product_info.get(
                        'reviews_url') and response.meta.get(
                            'collect_reviews', True):
                    new_meta = response.meta.copy()
                    new_meta['found_item'] = product_info
                    if self.type == 'search':
                        new_meta.update({
                            'search_string':
                            response.meta['search_string'],
                            'search_item':
                            self.current_search_item,
                        })
                    yield Request(product_info['reviews_url'],
                                  callback=self.parse_reviews,
                                  meta=new_meta)
                else:
                    product = self.construct_product(product_info,
                                                     meta=response.meta)
                    product['metadata'] = metadata
                    self.log("[AMAZON] collect parse product: %s" %
                             product['identifier'])
                    if self.type == 'category':
                        yield product
                    else:
                        self._collect_buybox(product, response.meta)
            elif not product_info['vendor']:
                # TODO: collect vendor from vendor details page
                self.log("[AMAZON] WARNING: product with no vendor: %s" %
                         response.url)
            else:
                self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                         response.url)
        # all sellers / lowest price
        elif self.all_sellers or self.lowest_product_and_seller:
            # Go to MBC lists to get dealers prices
            collect_mbc = response.meta.get('collect_mbc', True)
            if collect_mbc and product_info.get(
                    'mbc_list_url_new') and self.collect_new_products:
                # yield mbc parse
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string':
                        response.meta['search_string'],
                        'search_item':
                        self.current_search_item,
                    })
                yield Request(product_info['mbc_list_url_new'],
                              callback=self.parse_mbc_list,
                              meta=new_meta)
            elif collect_mbc and product_info.get(
                    'mbc_list_url_used') and self.collect_used_products:
                # yield mbc parse
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string':
                        response.meta['search_string'],
                        'search_item':
                        self.current_search_item,
                    })
                yield Request(product_info['mbc_list_url_used'],
                              callback=self.parse_mbc_list,
                              meta=new_meta)
            else:
                if (product_info['vendor'] and self._seller_ok(product_info['vendor'])) or \
                        self.collect_products_with_no_dealer:
                    if self.collect_reviews and product_info.get(
                            'reviews_url') and response.meta.get(
                                'collect_reviews', True):
                        new_meta = response.meta.copy()
                        new_meta['found_item'] = product_info
                        if self.type == 'search':
                            new_meta.update({
                                'search_string':
                                response.meta['search_string'],
                                'search_item':
                                self.current_search_item,
                            })
                        yield Request(product_info['reviews_url'],
                                      callback=self.parse_reviews,
                                      meta=new_meta)
                    else:
                        use_seller_id_in_identifier = not (
                            self.lowest_product_and_seller
                            and not self.lowest_seller_collect_dealer_identifier)
                        product = self.construct_product(
                            product_info,
                            meta=response.meta,
                            use_seller_id_in_identifier=
                            use_seller_id_in_identifier)
                        self.log("[AMAZON] collect parse product: %s" %
                                 product['identifier'])
                        if self.type == 'category':
                            yield product
                        else:
                            self._collect(product)
                elif not product_info['vendor']:
                    # TODO: collect vendor from vendor details page
                    self.log(
                        "[AMAZON] WARNING: Could not scrape vendor from product details: %s"
                        % response.url)
                    self.errors.append(
                        "Could not scrape vendor from product details: %s" %
                        response.url)
                else:
                    self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                             response.url)