示例#1
0
    def parse_product(self, response):
        """Scrape a single product page and yield one ``Product`` item.

        Reads name, SKU, image URL, price, shipping cost, identifier, URL and
        stock flag from ``response``. When no price can be read from the page
        the problem is logged and nothing is yielded.
        """
        hxs = HtmlXPathSelector(response)
        product_loader = ProductLoader(item=Product(), selector=hxs)

        name = hxs.select('//*[@itemprop="name"]/text()').extract()
        name = name[0].strip() if name else ''
        product_loader.add_value('name', name)

        # The SKU is the longest run of digits/commas/dots found in the name;
        # max() returns the earliest run when several share that length.
        sku_runs = [m.group() for m in re.finditer(r"([\d,\.]+)", name)]
        sku = max(sku_runs, key=len) if sku_runs else ''
        product_loader.add_value('sku', sku)

        image_url = hxs.select(
            '//img[contains(@class, "productimage") and contains(@class, "main")]/@src'
        ).extract()
        product_loader.add_value('image_url', image_url)

        # The character class includes the no-break space (\xa0) so that a
        # price using it as a thousands separator is matched as one run
        # instead of being cut off at the separator.
        price_runs = hxs.select(
            '//div[@itemprop="offers"]//*[@itemprop="price"]/text()'
        ).re(r'[\d,. \xa0]+')
        price = None
        for run in price_runs:
            cleaned = run.replace(u'\xa0', '').replace(' ', '').replace(',', '.')
            # Skip runs that are only separators/whitespace.
            if re.search(r'\d', cleaned):
                price = cleaned
                break
        if price is None:
            # Previously this case surfaced as a bare IndexError.
            self.log('No price for %s' % response.url)
            return
        product_loader.add_value('price', extract_price(price))

        collected_price = product_loader.get_collected_values('price')
        if collected_price and collected_price[0] < 1000:
            product_loader.add_value('shipping_cost', '49')

        identifier = hxs.select('//div[@id="description-extra"]/text()').re(
            r'\d+')
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('url', response.url)

        out_stock = hxs.select(
            '//*[@itemprop="availability" and contains(@href, "OutOfStock")]')
        if out_stock:
            product_loader.add_value('stock', 0)

        yield product_loader.load_item()
示例#2
0
    def parse_product_list(self, response):
        """Scrape every product tile on a listing page and follow pagination.

        Yields one ``Product`` item per ``div.product-wrapper`` element and
        one ``Request`` (handled by this same method) per pagination link.
        A tile that lacks a name, price, URL or identifier is logged and
        skipped, so a single malformed tile no longer aborts the remaining
        tiles and the pagination requests.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for product in hxs.select('//div[@class="product-wrapper"]'):
            # Extract all mandatory fields first so an incomplete tile can be
            # skipped before anything is loaded.
            name = product.select('.//h3//text()').extract()
            price = product.select(
                './div[@class="product-price"]'
                '//span[@class="price-amount"]/text()').extract()
            url = product.select(
                './div[@class="product-buttons"]/a[@class="button-info"]/@href'
            ).extract()
            identifier = product.select(
                './div[@class="product-name"]//@data-productid').extract()
            if not (name and price and url and identifier):
                self.log('Skipping incomplete product on %s' % response.url)
                continue

            name = name[0]
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('name', name)

            # The SKU is the longest run of digits/commas/dots in the name;
            # max() returns the earliest run when several share that length.
            sku_runs = [m.group() for m in re.finditer(r"([\d,\.]+)", name)]
            sku = max(sku_runs, key=len) if sku_runs else ''
            product_loader.add_value('sku', sku)

            image_url = product.select(
                './div[@class="product-image"]//img/@data-original').extract()
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))

            # strip(' Kr') removes any leading/trailing ' ', 'K' and 'r'
            # characters (it is a character set, not a suffix); dots are then
            # dropped from the remaining text.
            price_text = price[0].strip().strip(' Kr').replace('.', '')
            product_loader.add_value('price', extract_price(price_text))

            collected_price = product_loader.get_collected_values('price')
            if collected_price and collected_price[0] < 1500:
                product_loader.add_value('shipping_cost', '49')

            # A tile without a buy button is recorded with stock 0.
            buy_button = product.select(
                './div[@class="product-buttons"]/a[@class="buy-button"]')
            if not buy_button:
                product_loader.add_value('stock', 0)

            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))
            product_loader.add_value('identifier', identifier[0])

            # Use a separate name so the loop variable is not rebound.
            item = product_loader.load_item()
            yield item

        pages = hxs.select('//a[@class="paging-link-box"]/@href').extract()
        for page_url in pages:
            yield Request(urljoin_rfc(base_url, page_url),
                          callback=self.parse_product_list)
示例#3
0
    def parse_product(self, response):
        """Scrape a single product page and yield one ``Product`` item.

        The product name and the SKU text are required; if either is missing
        the page is logged and nothing is yielded. The image is optional.
        A missing price is loaded as ``'0.00'``. The brand is always
        ``'Lego'``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        product_name = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()
        identifier = hxs.select('//span[@class="sku"]/text()').extract()
        if not product_name or not identifier:
            # Previously this case surfaced as a bare IndexError.
            self.log('Missing name or identifier on %s' % response.url)
            return
        identifier = identifier[0].strip()

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', product_name[0])

        # A product without an image is still loaded, just without
        # 'image_url'.
        image_url = hxs.select(
            '//div[contains(@class, "img-box")]//img/@src').extract()
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))

        # The SKU text is used both as the identifier and as the sku.
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)

        # When several price spans are present the last one is used.
        price = hxs.select(
            '//div[@class="product-type-data"]/div[@class="price-box"]//span[@class="price"]/text()'
        ).extract()
        price = price[-1].strip() if price else '0.00'
        product_loader.add_value(
            'price',
            price.replace(',', '.').replace(' ', '').replace(u'\xa0', ''))

        collected_price = product_loader.get_collected_values('price')
        if collected_price and collected_price[0] < 1000:
            product_loader.add_value('shipping_cost', '49')

        product_loader.add_value('brand', 'Lego')

        yield product_loader.load_item()
示例#4
0
    def parse_product(self, response):
        """Scrape a product page, or follow its option links if it has no price.

        Yields the loaded ``Product`` when it carries a price. Otherwise
        yields one ``Request`` per link under ``div.cont``, handled by this
        same method and carrying the current ``response.meta``.
        """
        import re
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        loader.add_xpath('identifier', '//input[@id="product_id"]/@value')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')

        # SKU: first run of three or more digits in the heading, falling back
        # to the hidden product_productID input. An explicit match check
        # replaces the former bare ``except:``, which also hid unrelated
        # errors.
        heading = ''.join(hxs.select('//h1/text()').extract())
        sku_match = re.search(r'(\d{3}\d*)', heading)
        if sku_match:
            loader.add_value('sku', sku_match.group(1))
        else:
            fallback_sku = response.xpath(
                '//input[@id="product_productID"]/@value').extract()
            if fallback_sku:
                loader.add_value('sku', fallback_sku[0].strip())
            else:
                self.log('No SKU for %s' % (response.url))

        loader.add_xpath('price', '//*[@itemprop="price"]/text()')
        loader.add_xpath(
            'category',
            '//ul[@id="mnu_main"]/li[contains(@class, "selected")]//a/text()')

        img = hxs.select('//div[@class="img"]//a/@href').extract()
        if img:
            loader.add_value('image_url', urljoin_rfc(base_url, img[0]))

        loader.add_value('brand', 'lego')

        collected_price = loader.get_collected_values('price')
        if collected_price and collected_price[0] < 600:
            loader.add_value('shipping_cost', '29')

        prod = loader.load_item()
        if prod.get('price'):
            yield prod
        else:
            # No price on this page: follow each link under div.cont instead.
            for opt in hxs.select('//div[@class="cont"]//a/@href').extract():
                yield Request(urljoin_rfc(base_url, opt),
                              callback=self.parse_product,
                              meta=response.meta)
示例#5
0
    def parse_product(self, response):
        """Scrape a single product page and yield one ``Product`` item.

        The identifier is taken from the ``wl.addProductItem(<id>, ...)``
        onclick handler in the raw response body; if that handler is not
        found the page is logged and nothing is yielded. Category is always
        ``'Lego'`` and brand ``'lego'``.
        """
        import re
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        id_match = re.search(r'onclick="wl\.addProductItem\((\d+),',
                             response.body)
        if not id_match:
            # Previously this case surfaced as an AttributeError on None.
            self.log('No identifier for %s' % (response.url))
            return
        loader.add_value('identifier', id_match.group(1))
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1//text()')

        # SKU: first run of three or more digits in the heading text. An
        # explicit match check replaces the former bare ``except:``.
        heading = ''.join(hxs.select('//h1//text()').extract())
        sku_match = re.search(r'(\d{3}\d*)', heading)
        if sku_match:
            loader.add_value('sku', sku_match.group(1))
        else:
            self.log('No SKU for %s' % (response.url))

        price_text = ''.join(
            hxs.select('//div[@class="price"]/text()').extract())
        loader.add_value('price', extract_price(price_text.replace(' ', '')))

        collected_price = loader.get_collected_values('price')
        if collected_price and collected_price[0] < 400:
            loader.add_value('shipping_cost', '49')
        loader.add_value('category', 'Lego')

        img = hxs.select('//div[@class="image"]//img/@src').extract()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))

        loader.add_value('brand', 'lego')

        # Stock is '1' when the body has an "ItemData ... Finns i lager"
        # line, otherwise '0'.
        if re.search(r'ItemData .*Finns i lager.*', response.body):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')

        yield loader.load_item()
示例#6
0
    def parse_product(self, response):
        """Scrape a single product page and yield one ``Product`` item.

        Name, brand, category, image URL and a fallback price are read from
        ``response.meta``; identifier, price, shipping cost and stock are
        read from the page. Nothing is yielded when the add-to-cart form has
        no product id or when no price could be collected.
        """
        hxs = HtmlXPathSelector(response)
        meta = response.meta.copy()

        identifiers = hxs.select(
            '//form[@id="product_addtocart_form"]'
            '//input[@name="product"]/@value').extract()
        if not identifiers:
            # Previously this case surfaced as an IndexError from pop().
            self.log('No product identifier on %s' % response.url)
            return
        # As before, the last matching input value is used.
        identifier = identifiers[-1].strip()

        shipping_cost = hxs.select(
            '//li[contains(string(), "Shipping cost:")]/span[@class="price"]/text()'
        ).extract()
        price = "".join(
            hxs.select('//span[@id="product-price-%s"]//text()' %
                       identifier).extract()).strip()

        loader = ProductLoader(item=Product(), selector=response)
        loader.add_value('sku', identifier)
        loader.add_value('identifier', identifier)
        loader.add_value('name', meta.get('name'))
        # Prefer the on-page price; fall back to the one passed in meta.
        loader.add_value('price', price if price else meta.get('price'))

        # The collected list may be empty when neither source gave a usable
        # price, so check it before indexing.
        collected_price = loader.get_collected_values('price')
        if not collected_price or not collected_price[0]:
            return

        loader.add_value('url', response.url)
        loader.add_value('brand', meta.get('brand', ''))
        loader.add_value('category', meta.get('category', ''))
        if shipping_cost:
            loader.add_value('shipping_cost', shipping_cost[-1])
        loader.add_value('image_url', meta.get('image_url', ''))

        # Any of these button labels counts as in stock; any() stops at the
        # first label that matches, as the original ``or`` chain did.
        in_stock_labels = ("Add to Cart", "Coming Soon",
                           "Very Limited Stock!", "Back Order Only")
        in_stock = any(
            hxs.select('//form//div[@class="add-to-cart-btn"]'
                       '/button[contains(string(), "%s")]' % label)
            for label in in_stock_labels)
        loader.add_value('stock', 1 if in_stock else 0)

        yield loader.load_item()