示例#1
0
    def parse_product(self, response):
        """Build a single ``Product`` item from a product detail page.

        The identifier is the first text node of a ``<dd data-product-sku>``
        element and the name is the first text node of the
        ``<h1 itemprop="name">`` heading. When either one is missing the page
        is skipped and nothing is yielded.

        The SKU is the first match of ``self._re_sku`` inside the name, or an
        empty string when the pattern does not match.

        :param response: the downloaded product page.
        :return: a generator yielding at most one loaded item.
        """
        identifiers = response.xpath(
            '//dd[@data-product-sku]/text()').extract()
        names = response.xpath('//h1[@itemprop="name"]/text()').extract()
        if not identifiers or not names:
            # Not a usable product page: both fields are mandatory. Checking
            # explicitly (rather than a bare ``except``) keeps unrelated
            # errors visible instead of silently dropping the page.
            return
        identifier = identifiers[0]
        name = names[0]

        loader = ProductLoader(item=Product(), response=response)

        sku_matches = self._re_sku.findall(name)
        sku = sku_matches[0] if sku_matches else ''

        loader.add_value('identifier', identifier)
        loader.add_value('name', name)
        loader.add_xpath('brand', '//h2[@itemprop="brand"]//span/text()')
        loader.add_css('category', 'li.breadcrumb:last-child a::text')
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
        # Presumably a fallback so the price field is populated even when the
        # meta tag is absent -- TODO confirm against ProductLoader's processors.
        loader.add_value('price', '')
        loader.add_xpath('image_url', '//a[@id="image-zoom"]/@href')

        yield loader.load_item()
示例#2
0
    def parse_product(self, response):
        """Complete the product passed in ``response.meta`` and request its reviews.

        The partially-filled item in ``response.meta['product']`` is extended
        with url, identifier/sku (both taken from the first
        ``data-product-id`` attribute on the page), name, category, image,
        brand and stock, then passed through ``self.add_shipping_cost``. An
        empty ``KeterMeta`` review list is attached and a ``FormRequest`` to
        the Yotpo batch endpoint is yielded, with ``self.parse_review`` as
        callback and the product carried in ``meta``.

        :param response: the downloaded product page.
        :return: a generator yielding the single reviews request.
        :raises IndexError: when the page has no ``div`` carrying a
            ``data-url`` or ``data-appkey`` attribute.
        """
        loader = ProductLoader(item=response.meta['product'],
                               response=response)

        loader.add_value('url', response.url)
        identifier = response.xpath('//@data-product-id').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_css('name', 'h1.product-title::text')
        category = response.xpath('//script/text()').re_first(
            'category: "(.+?)>')
        loader.add_value('category', category)
        img = response.xpath('//meta[@itemprop="image"]/@src').extract_first()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img))

        loader.add_value('brand', response.meta.get('brand'))

        # The add-to-cart block is used as the in-stock marker.
        in_stock = bool(response.css('div.product-add-to-cart'))
        loader.add_value('stock', '1' if in_stock else '0')
        product = self.add_shipping_cost(loader.load_item())

        metadata = KeterMeta()
        metadata['reviews'] = []
        product['metadata'] = metadata

        # Use the loader's processed identifier for the widget request.
        identifier = loader.get_output_value('identifier')
        widget_url = response.xpath('//div/@data-url').extract()[0]
        app_key = response.xpath('//div/@data-appkey').extract()[0]

        # The payload is assembled by hand so that it stays byte-for-byte
        # what was sent before: note that "pid" is quoted in the first call
        # but unquoted in the second one.
        methods = (
            '[{"method":"main_widget","params":{"pid":"' + identifier + '"}},'
            + '{"method":"bottomline", "params":{"pid": ' + identifier + ','
            + '"link":"' + widget_url + '", "skip_average_score":false,'
            + '"main_widget_pid": ' + identifier + '}}]')

        formdata = {
            'app_key': app_key,
            'is_mobile': 'false',
            'methods': methods,
            'widget_version': '2015-08-30_11-33-24',
        }

        yield FormRequest("http://w2.yotpo.com/batch",
                          formdata=formdata,
                          callback=self.parse_review,
                          meta={'product': product})
示例#3
0
    def parse_product(self, response):
        """Extract one product item from a product page, retrying nameless pages.

        When no ``itemprop="name"`` text is found, the page body is dumped to
        ``/tmp/lookfantastic`` and the same URL is requested again, counting
        attempts in ``response.meta['retries']``. After more than 10 retries
        the failure is logged and the ``IndexError`` is re-raised.

        Otherwise the item is loaded (identifier and sku are both the last URL
        path segment up to its first dot) and a ``FragranceDirectMeta`` is
        attached with the promotion text and, when a price was loaded, the
        price divided by 1.2.

        :param response: the downloaded product page.
        :return: a generator yielding either a retry request or one item.
        :raises IndexError: once the retry limit is exceeded.
        """
        hxs = HtmlXPathSelector(response)
        try:
            name = hxs.select(
                u'//*[@itemprop="name"]/text()').extract()[0].strip()
        except IndexError:
            # Keep a copy of the offending page for debugging. The response
            # body is raw bytes, so write it in binary mode, and use ``with``
            # so the file handle is closed instead of leaked.
            with open('/tmp/lookfantastic', 'wb') as dump_file:
                dump_file.write(response.body)
            response.meta['retries'] = response.meta.get('retries', 0) + 1
            if response.meta['retries'] > 10:
                self.log('Giving up on url [%s]' % (response.url))
                raise
            # NOTE(review): no callback is passed, so the retry is handled by
            # the spider's default callback rather than explicitly by this
            # method -- confirm that this is intended.
            yield Request(response.url, meta=response.meta, dont_filter=True)
            return

        # Last path segment of the URL without anything after its first dot;
        # used for both the identifier and the sku.
        product_id = response.url.split('/')[-1].split('.')[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', product_id)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', name)
        product_loader.add_xpath('brand',
                                 u'(//meta[@itemprop="brand"]/@content)[1]')
        product_loader.add_css('price', '.product-price .price ::text')
        product_loader.add_value('sku', product_id)
        product_loader.add_value('category', response.meta.get('category'))
        img = hxs.select(u'//a/img[@class="product-img"]/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))
        # Stock is only set when the page says "In stock"; nothing is added
        # otherwise.
        if hxs.select(
                '//p[@class="availability" and contains(text(),"In stock")]'):
            product_loader.add_value('stock', '1')
        if hxs.select('//p[@class="free-delivery"]'):
            product_loader.add_value('shipping_cost', '0')

        item = product_loader.load_item()
        metadata = FragranceDirectMeta()
        metadata['promotion'] = normalize_space(' '.join(
            hxs.select(
                '//p[contains(@class, "yousave")]//text()|//h3[@class="offer-buy-x-delivery-discount"]//text()'
            ).extract()))
        if item.get('price'):
            # Price divided by 1.2, i.e. assuming a 20% VAT rate.
            metadata['price_exc_vat'] = Decimal(item['price']) / Decimal('1.2')
        item['metadata'] = metadata

        yield item
示例#4
0
    def parse_product(self, response):
        """Follow linked product variants, or load the product shown on the page.

        If the page contains ``<a itemprop="url">`` links, each one is
        requested with this same callback and nothing else is produced.
        Otherwise a ``Product`` is loaded from the page; breadcrumb labels from
        the seventh onwards are appended to the name when they are not already
        part of it. The item is only yielded when an identifier was found.
        """
        selector = HtmlXPathSelector(response)

        variant_links = selector.select('//a[@itemprop="url"]/@href').extract()
        for href in variant_links:
            yield Request(response.urljoin(href),
                          callback=self.parse_product)
        if variant_links:
            # This page only lists variants; the items come from the requests
            # issued above.
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]//text()')

        breadcrumb_labels = selector.select(
            '//div[@class="breadcrumbs"]/ul/li/a/span/text()').extract()
        for label in breadcrumb_labels[6:]:
            if label in loader.get_output_value('name'):
                continue
            loader.add_value('name', label)

        loader.add_xpath('identifier',
                         '//meta[@itemprop="productID"]/@content')
        loader.add_xpath('price', '//span[@itemprop="price"]/text()')
        loader.add_css('price', '.price ::text')
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')

        if not loader.get_output_value('identifier'):
            return
        yield loader.load_item()
示例#5
0
    def parse_product(response):
        """Yield one ``Product`` per option combination found on a product page.

        Option groups are read from the raw page body: the lines between a
        line starting with ``perms['`` and the next line starting with ``];``
        are joined and evaluated as a list of ``(price, name)`` pairs. A base
        product is loaded from the page and, for every ``(opt_price,
        opt_name)`` produced by ``multiply(opt_groups)``, a copy is yielded
        with the option name appended to the name and identifier and the
        option price (scaled by the discount ratio) added to the price.

        The base product itself is never yielded; only the per-option copies
        are.

        NOTE(review): the signature takes no ``self`` although the function is
        indented like a method -- presumably a static method or a nested
        helper whose context is outside this view; confirm.
        """
        hxs = HtmlXPathSelector(response)

        # Collect the option groups embedded in the page source.
        opt_groups = []
        inside = False  # True while between "perms['" and the closing "];"
        lst = ''  # accumulated source text of the current group
        for line in response.body.split('\n'):
            if line.startswith('perms[\''):
                inside = True
                lst = ''
            elif line.startswith('];'):
                if lst:
                    # SECURITY NOTE(review): eval() runs text taken from the
                    # downloaded page. Left as is because the exact syntax of
                    # the embedded data is not visible here; consider a safe
                    # parser once the format is confirmed.
                    opts = eval('[' + lst + ']')
                    # XXX http://www.thesleepshop.co.uk/acatalog/4ft6_Double_Kyoto_Memphis_Futon.html#a11717
                    # second option has "Deluxe Mattress" twice with different additional price
                    # however price calculation ignores second addition price (uses first value)
                    filtered_opts = []
                    # Keep only the first occurrence of each option name.
                    for price, name in opts:
                        if not [name for pn in filtered_opts if pn[1] == name]:
                            filtered_opts.append([price, name])
                    opt_groups.append(filtered_opts)
                inside = False
            elif inside:
                lst += line

        # The identifier is what follows "Q_" in the name of the first matching
        # form input; the [0] raises IndexError when there is no such input.
        identifier = hxs.select(
            '//form//input[contains(@name, "Q_")]/@name').re(r'Q_(.*)$')[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h3[@class="product"]/text()')
        product_loader.add_xpath('name', u'//span[@class="product"]/text()')
        product_loader.add_value('sku', identifier)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('category', response.meta.get('category'))

        # Price candidates are added in this order: discounted price, regular
        # price, then an empty string. Which one ends up as the output value
        # depends on ProductLoader's processors (not visible here).
        product_loader.add_css('price', '.discprice::text')
        price_reg = response.xpath(
            '//div[@id="price_inside"]//span//text()').extract_first(
            ) or response.xpath(
                '//div[@id="price_inside"]//span/@ppraw').extract_first()
        price_reg = extract_price2uk(price_reg)
        product_loader.add_value('price', price_reg)
        product_loader.add_value('price', '')

        # Ratio of the loaded price to the regular price, applied to option
        # surcharges below.
        # NOTE(review): nothing guards against price_reg being zero or a
        # non-numeric value here -- confirm what extract_price2uk can return.
        discount = product_loader.get_output_value('price') / price_reg

        img = hxs.select(
            u'//div[@class="slides_control"]/a/img/@src').extract()
        if not img:
            img = hxs.select(
                u'//div[@class="image_product"]//img/@src').extract()
        # img[0] raises IndexError when neither XPath matched.
        product_loader.add_value('image_url',
                                 urljoin_rfc(get_base_url(response), img[0]))

        brand_logo = hxs.select(
            u'//h3[@class="product"]/../img/@src').extract()
        if not brand_logo:
            brand_logo = hxs.select(
                u'//h3[@class="product"]/img/@src').extract()

        # Maps a brand logo image reference to the brand name to emit.
        # NOTE(review): 'fainsborough' (for gainsborough.gif) and 'healt beds'
        # (for healthbeds.gif) look like typos; left unchanged because these
        # values are emitted data.
        brands = {
            '6thsense.jpg': '6th sense',
            'bentley.gif': 'bentley',
            'birlea.gif': 'birlea',
            'blank.gif': '',
            'brand': '',
            'Breasley.gif': 'breasley',
            'buoyant.jpg': 'buoyant',
            'cro.gif': 'cro',
            'cumfilux.gif': 'cumfilux',
            'dt.gif': 'dt',
            'dunlopillo.gif': 'dunlopillo',
            'durabeds.gif': 'durabeds',
            'easycomfort.gif': 'easy comfort',
            'friendship_mill.gif': 'friendship mill',
            'Furmanac.gif': 'furmanac',
            'gainsborough.gif': 'fainsborough',
            'gleneagle.gif': 'gleneagle',
            'harlequin.gif': 'harlequin',
            'harmony.gif': 'harmony',
            'healthbeds.gif': 'healt beds',
            'highgate.gif': 'highgate',
            'hypnos.gif': 'hypnos',
            'jay-be.gif': 'jay be',
            'julianbowenlogo.jpg': 'julian bowen',
            'kaymed.gif': 'kaymed',
            'komfi.gif': 'komfi',
            'kyoto.gif': 'kyoto',
            'limelight.gif': 'limelight',
            'metalbeds.gif': 'metalbeds',
            'millbrook.gif': 'millbrook',
            'myers.gif': 'myers',
            'nd.gif': 'newdesign',
            'nestledown.gif': 'nestledown',
            'obc.gif': 'original bedstead',
            'Protectabed.gif': 'protectabed',
            'rauch.gif': 'rauch',
            'relaxsan.gif': 'relaxsan',
            'relyon.gif': 'relyon',
            'rest_assured.gif': 'rest assured',
            'richman.gif': 'richman',
            'sealy.gif': 'sealy',
            'shakespeare.gif': 'shakespeare',
            'silentnight.gif': 'silentnight',
            'sleepeezee.gif': 'sleepeezee',
            'sleepshaper.gif': 'sleepshaper',
            'sleepyvalley.gif': 'sleepyvalley',
            'slumberland.gif': 'slumberland',
            'staples.gif': 'staples',
            'steens.gif': 'steens',
            'swanglen.gif': 'swanglen',
            'sweetdreams.gif': 'sweetdreams',
            'tss.gif': 'the sleep shop',
            'verona.jpg': 'verona',
            'welcome.gif': 'welcome furniture',
        }
        # Unknown logos fall back to remove_extension() of the logo reference.
        # brand_logo[0] raises IndexError when no logo image was found.
        product_loader.add_value(
            'brand', brands.get(brand_logo[0],
                                remove_extension(brand_logo[0])))
        product = product_loader.load_item()
        # presumably multiply() combines the option groups into
        # (price, name) pairs -- verify against its definition.
        for opt_price, opt_name in multiply(opt_groups):
            prod = Product(product)
            prod['name'] = (prod['name'] + ' ' + opt_name).strip()
            try:
                # Base price plus the discounted option surcharge, rounded to
                # two decimal places.
                prod['price'] = (Decimal(prod['price']) +
                                 Decimal(opt_price) * discount).quantize(
                                     Decimal('1.00'))
            except TypeError:
                # Fall back to a zero price when the arithmetic is not
                # possible with the values at hand.
                prod['price'] = Decimal(0)
            prod['identifier'] = prod['identifier'] + ':' + opt_name
            yield prod
示例#6
0
    def parse_product(self, response):
        """Load a priced product and request the first page of its reviews.

        The JSON object following ``product/data",`` in an inline script is
        decoded first. The price text is then looked up with three XPath
        expressions in turn, keeping the first non-empty result; pages without
        any price text produce nothing. For priced pages a ``Product`` is
        loaded (``sku`` and ``brand`` come from ``response.meta``), stock is
        set to 0 when the price text mentions "out of stock", an empty
        metadata dict is attached and a request for review page 1 is yielded
        with ``self.parse_product_reviews`` as callback.
        """
        page = HtmlXPathSelector(response)
        script_text = response.xpath(
            '//script/text()[contains(., "product/data")]').extract_first()
        payload = re.search('product/data",[ \n]*({.+})', script_text)
        data = json.loads(payload.group(1))

        # Price locations, tried in order until one yields some text.
        price_lookups = (
            (page.select,
             '//div[contains(@class, "js-product-offer-summary")]//div[contains(@class, "price-display")]//text()'),
            (response.xpath,
             '//div[@itemprop="offers"]//div[@itemprop="price"][1]//text()'),
            (response.xpath,
             '//span[contains(@class, "hide-content-m")]/span[@data-tl-id="Price-ProductOffer"]//text()'),
        )
        price = ''
        for run_query, expression in price_lookups:
            price = ''.join(run_query(expression).extract())
            if price:
                break

        # Some products are not available online and these have no price
        if not price:
            return

        out_of_stock = 'out of stock' in price.lower()

        heading_texts = page.select(
            '//h1[contains(@itemprop, "name")]//text()').extract()
        product_name = [
            piece for piece in (unicode.strip(raw) for raw in heading_texts)
            if piece
        ]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        product_id = re.search(r'/(\d+)$', response.url).group(1)
        loader.add_value('identifier', product_id)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('brand', response.meta['brand'])

        breadcrumb_texts = page.select(
            '//ol[contains(@class, "breadcrumb-list")]//li//a/span/text()'
        ).extract()
        categories = [crumb.strip() for crumb in breadcrumb_texts]
        loader.add_value('category', categories)

        loader.add_value('url', response.url)
        loader.add_xpath(
            'image_url',
            '//img[contains(@class, "js-product-primary-image")]/@src')

        # Prefer the shipping price from the decoded page data; fall back to
        # the on-page shipping message when a key is missing.
        try:
            loader.add_value(
                'shipping_cost',
                data['buyingOptions']['shippingPrice']['displayPrice'])
        except KeyError:
            loader.add_css('shipping_cost',
                           'h2.js-shipping-primary-msg::text')

        loader.add_value('price', price)
        if out_of_stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        item['metadata'] = {}

        reviews_request = Request(self._get_reviews_url(item, 1),
                                  meta={'product': item, 'page': 1},
                                  callback=self.parse_product_reviews)
        yield reviews_request