def parse_shipping_cost(self, response):
    """Set the standard-delivery shipping cost on the pending item and yield it.

    The item is read from ``response.meta['product']['item']``. Its
    ``shipping_cost`` is taken from the "Consegna standard" shipment method
    on the page, or set to 0 when that amount is not present.

    After the item is yielded, the page body is searched for a
    ``removeEntryResourceURL``. If one is found, the item's cart entry is
    posted to it and the response is handled by ``parse_sync_shipping``,
    carrying along ``collect_products`` minus its first element.

    :param response: cart page response; ``response.meta`` must contain
        ``product`` and may contain ``collect_products``.
    :returns: generator yielding the item and, optionally, one FormRequest.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    item = meta['product']['item']

    shipping_cost = hxs.select(
        '//div[@class="shipment-method" and '
        'div/div/label/text()="Consegna standard"]'
        '//span[@class="amount"]/text()').extract()
    item['shipping_cost'] = (
        extract_price_eu(shipping_cost[0]) if shipping_cost else 0)
    yield item

    remove_regex = ',removeEntryResourceURL:"(.*)",updateItemQuantityResourceURL:"'
    match = re.search(remove_regex, response.body)
    if match is None:
        # No removal URL embedded in the page: nothing more to request.
        return
    # The greedy group can span several entries; keep only the text after
    # the last 'removeEntryResourceURL:"' marker.
    remove_item = match.group(1).split('removeEntryResourceURL:"')[-1]

    cart_entry = hxs.select('//div[@data-product="' + item['identifier'] +
                            '"]/@data-cartentry').extract()

    # 'collect_products' may be missing from meta; slicing None would raise.
    pending = meta.get('collect_products')
    remaining = pending[1:] if pending is not None else []

    yield FormRequest(
        remove_item,
        formdata={'cartentry': cart_entry},
        callback=self.parse_sync_shipping,
        dont_filter=True,
        meta={'collect_products': remaining})
示例#2
0
    def parse(self, response):
        """Build one product item from a product detail page.

        Fields populated: identifier, name, sku (only when supplied through
        ``response.meta['sku']``), price, url and image_url (only when an
        image is found on the page).

        :param response: product page response.
        :returns: generator yielding a single loaded item.
        :raises IndexError: when neither title XPath matches.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(response=response, item=Product())

        # Prefer the SKU exposed in the markup; otherwise fall back to the
        # last path segment of the page URL.
        identifier = hxs.select('//dd[@itemprop="sku"]/text()').extract()
        if identifier:
            identifier = identifier[0]
        else:
            identifier = response.url.split('/')[-1]
        loader.add_value('identifier', identifier)

        name = hxs.select('//h1[@class="detail__title"]/text()').extract()
        if not name:
            name = hxs.select('//h1[@itemprop="name"]/text()').extract()
        loader.add_value('name', name[0].strip())

        price = hxs.select('//img[@class="buybox__pricetag"]/@alt|//*[@itemprop="price"]/text()').extract()
        price = price[0] if price else '0'

        sku = response.meta.get('sku')
        if sku:
            loader.add_value('sku', sku)
        loader.add_value('price', extract_price_eu(price))
        loader.add_value('url', response.url)

        # Extract the image explicitly: a processor that indexes the first
        # match would be called with an empty list on pages without an image.
        image = hxs.select(
            '//div[contains(@class, "product-images")]/img/@src|//img[@itemprop="image"]/@src'
        ).extract()
        if image:
            loader.add_value('image_url', urljoin_rfc(base_url, image[0]))
        yield loader.load_item()
示例#3
0
    def parse_products(self, response):
        """Yield one item per ``.listing_item`` element of a listing page.

        The category is taken from the breadcrumb link texts with the first
        entry dropped. The last path segment of the cleaned product URL is
        used as both identifier and sku. The merchant logo's ``alt`` text is
        stored under ``item['metadata']['Dealer']``.

        Optional parts of a listing (image, delivery cost, availability text)
        are skipped when absent, because ``extract_first()`` returns None in
        that case.

        :param response: listing page response.
        :returns: generator of loaded items.
        """
        category = response.css('.breadcrumbs').xpath(
            './/a/text()').extract()[1:]
        for product in response.css('.listing_item'):
            loader = ProductLoader(item=Product(), selector=product)

            image_url = product.css('.listing_item_image').xpath(
                'img/@src').extract_first()
            # Skip missing images and "noimage" placeholder URLs.
            if image_url is not None and 'noimage' not in image_url:
                loader.add_value('image_url', image_url)

            url = product.css('.listing_item_name').xpath(
                '@href').extract_first()
            url = url_query_cleaner(response.urljoin(url))
            sku = url.split('/')[-1]
            loader.add_value('identifier', sku)
            loader.add_value('sku', sku)

            loader.add_value('url', url)
            loader.add_xpath('name', './/a[@class="listing_item_name"]/text()')
            loader.add_xpath(
                'price', './/span[@class="listing_item_basic_price"]/text()')
            loader.add_value('category', category)

            shipping_cost = product.css('.listing_item_delivery_costs').xpath(
                'text()').extract_first()
            if shipping_cost is not None:
                loader.add_value(
                    'shipping_cost', extract_price_eu(shipping_cost))

            availability = product.css('.listing_item_availability').xpath(
                'text()').extract_first() or ''
            if 'Non disponibile' in availability:
                loader.add_value('stock', 0)

            item = loader.load_item()
            dealer = product.css('.listing_item_merchant_name').xpath(
                'img/@alt').extract_first()
            item['metadata'] = {'Dealer': dealer}
            yield item
示例#4
0
    def parse_product(self, response):
        """Load a single product from its detail page and yield it.

        Populates name, url, price (0 when no price node exists),
        shipping_cost (always 0), image_url, category (breadcrumb texts
        joined with ' > ', one 'Home' entry dropped), brand, stock, sku and
        identifier.

        :raises IndexError: when the main image or the availability
            attribute is not found on the page.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(selector=hxs, item=Product())

        loader.add_xpath('name',
                         '//div[@id="sheetBoxTopDetails"]//h1/span/text()')
        loader.add_value('url', response.url)

        price_texts = hxs.select('//h3/span[@id="md_price"]/text()').extract()
        loader.add_value(
            'price', extract_price_eu(price_texts[0]) if price_texts else 0)
        loader.add_value('shipping_cost', 0)

        main_image = hxs.select('//img[@id="sheetMainImage"]/@src').extract()[0]
        loader.add_value('image_url', urljoin(base_url, main_image))

        crumbs = hxs.select(
            '//div[@id="breadcrumbs"]/span[@id="md_category"]/a/text()'
        ).extract()
        # Only the first 'Home' entry is dropped, as list.remove() does.
        if 'Home' in crumbs:
            crumbs.remove('Home')
        loader.add_value('category', ' > '.join(crumbs))

        loader.add_xpath('brand', '//td[@id="md_brand"]/text()')

        availability = hxs.select(
            '//span[@id="md_availability"]/@content').extract()[0]
        loader.add_value('stock', 0 if availability == 'out_of_stock' else 1)

        loader.add_xpath('sku', '//td[@id="md_mpn"]/text()')
        loader.add_xpath(
            'identifier',
            '//div[@id="sheetBoxTopDetails"]//tr[@class="code"]/td/text()')

        yield loader.load_item()
示例#5
0
    def parse_page(self, response):
        """Follow pagination links and yield the products listed on the page.

        Each node matched by ``self.products_xpath`` becomes one item with
        identifier, name, url and price. A node without a price or without a
        product link is skipped and a warning is appended to ``self.errors``.

        :param response: listing page response.
        :returns: generator of pagination Requests followed by loaded items.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # Following sub-category links
        # (//div[@class="child_cat"]/@onclick) is disabled in this callback.

        pages = hxs.select(
            '//a[contains(@class, "page_page")]/@href').extract()
        for url in pages:
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_page)

        for node in hxs.select(self.products_xpath):
            # The first XPath is tried first; the second is the fallback
            # price location.
            price = node.select(
                './div[@class="prix_sans_promo"]/div[@class="prix_vente_sans_promo"]/text()'
            ).extract()
            if not price:
                price = node.select(
                    './div[@class="prix"]/div[@class="prix_vente"]/text()'
                ).extract()
            if not price:
                self.errors.append('WARNING: No price in %s' % response.url)
                continue

            product_url = node.select(
                './div[@class="title"]/h2/a/@href').extract()
            if not product_url:
                self.errors.append('WARNING: No url in %s' % response.url)
                continue

            loader = ProductLoader(selector=node, item=Product())
            loader.add_xpath('identifier',
                             './/div[contains(@id, "im_prod_")]/@id',
                             re=r'im_prod_(\d+)')
            loader.add_xpath('name', './div[@class="title"]/h2/a/text()')
            loader.add_value('url', urljoin_rfc(base_url, product_url[0]))
            # Spaces are stripped from the price text before parsing.
            loader.add_value(
                'price', extract_price_eu(price[0].replace(' ', '')))

            yield loader.load_item()
示例#6
0
    def parse(self, response):
        """Load one product from a detail page and yield it.

        The identifier is the last path segment of the "btn-large" link, the
        sku comes from ``response.meta['sku']`` and the price defaults to '0'
        when no ``actual-price`` paragraph is present.

        :raises IndexError: when the "btn-large" link or the title is missing.
        :raises KeyError: when ``response.meta`` has no ``sku`` entry.
        """
        hxs = HtmlXPathSelector(response)
        # The value is not used below; the call is kept as in the original.
        base_url = get_base_url(response)
        request_meta = response.meta

        loader = ProductLoader(response=response, item=Product())

        button_href = hxs.select(
            '//a[contains(@class, "btn-large")]/@href').extract()[0]
        loader.add_value('identifier', button_href.split('/')[-1])

        title = hxs.select('//div[@class="span24"]/h1/text()').extract()[0]
        loader.add_value('name', title.strip())

        loader.add_value('sku', request_meta['sku'])

        price_texts = hxs.select('//p[@class="actual-price"]/text()').extract()
        if price_texts:
            raw_price = price_texts[0]
        else:
            raw_price = '0'
        loader.add_value('price', extract_price_eu(raw_price))

        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@class="main-packart"]/@src')
        yield loader.load_item()
示例#7
0
    def parse_product(self, response):
        """Load one product from a detail page and yield it.

        The name is the whitespace-stripped text nodes of the product-name
        block joined by single spaces. The sku is the numeric token of more
        than two digits found in the name, or '' unless exactly one distinct
        such token exists. The category is the last breadcrumb link text.

        :param response: product page response.
        :returns: generator yielding a single loaded item.
        :raises IndexError: when the page has no ``span.price`` text.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)

        identifier = hxs.select('//meta[@itemprop="productID"]/@content').re(
            r'sku: *(\d+)')

        name_parts = (
            text.strip() for text in hxs.select(
                u'//div[@class="product-name"]//text()').extract())
        name = u' '.join(part for part in name_parts if part != u'')

        numeric_tokens = [
            token for token in name.split(' ')
            if token.isdigit() and len(token) > 2
        ]
        # Ambiguous (several different numbers) or absent: leave the sku empty.
        sku = numeric_tokens[0] if len(set(numeric_tokens)) == 1 else ''

        category = hxs.select(
            u'//div[@class="breadcrumbs"]//li/a/text()').extract()
        category = category[-1].strip() if category else ''

        brand = hxs.select('//meta[@itemprop="brand"]/@content').extract()
        brand = brand[0].strip() if brand else ''

        price = hxs.select(u'//span[@class="price"]/text()').extract()
        price = extract_price_eu(price[0])

        image = hxs.select(
            u'//div[contains(@class, "img-box")]//img/@src').extract()
        image = image[0] if image else ''

        # The identifier is added once; it was previously added twice.
        loader.add_value('identifier', identifier)
        loader.add_value('name', name)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('image_url', image)
        yield loader.load_item()
示例#8
0
 def extract_price(self, price):
     """Parse *price* with ``extract_price_eu``.

     Overrides the default price extraction because the French site uses
     the ``#.###,##`` number format.
     """
     parsed_price = extract_price_eu(price)
     return parsed_price