示例#1
0
    def parse_product(self, response):
        """Parse a product page, first expanding any unselected option lists.

        When a ``<select>`` inside the ``pdSelections`` block still has its
        ``value="0"`` option selected, one form post per remaining option of
        that select is scheduled (handled by this same callback) and no item
        is emitted for the current response.  Otherwise a single product item
        is yielded, carrying a ``Promotions`` metadata entry when promotion
        text is found on the page.

        :param response: the product page response.
        :returns: generator of ``FormRequest`` objects or one product item.
        :raises IndexError: if the canonical link (or, when options are
            posted, one of the ``__VIEWSTATE`` fields) is missing.
        """
        hxs = HtmlXPathSelector(response)
        url = hxs.select('//link[@rel="canonical"]/@href').extract()[0]

        options = hxs.select(
            '//div[@class="BBFLW100 pdSelections"]/select/option[@selected="selected"][@value="0"]'
        )
        if options:
            # These hidden fields are identical for every post built from this
            # page, so they are read once instead of once per option.
            viewstate = hxs.select(
                "//input[@id='__VIEWSTATE']/@value").extract()[0]
            viewstate_generator = hxs.select(
                "//input[@id='__VIEWSTATEGENERATOR']/@value").extract()[0]
            for option in options.select('../option[@value!="0"]'):
                # The name of the owning <select> is both the event target
                # and the form field that carries the chosen value.
                event = option.select('../@name').extract()[0]
                formdata = {
                    '__VIEWSTATE': viewstate,
                    '__VIEWSTATEGENERATOR': viewstate_generator,
                    '__EVENTTARGET': event,
                    event: option.select('@value').extract()[0],
                }
                yield FormRequest(url,
                                  formdata=formdata,
                                  callback=self.parse_product,
                                  dont_filter=True,
                                  meta={'event': event})
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_xpath('name', '//h1[@id="h1ProdName"]/text()')
        loader.add_xpath(
            'category',
            '//div[@id="Breadcrumb"]//span[@itemprop="title"]/text()[.!="Home" and .!="Offers"]'
        )
        loader.add_xpath('image_url', '//img[@id="imgProdMainImg"]/@src')
        loader.add_xpath(
            'brand', '//div[@id="pnlManufacturer"]/meta[@itemprop]/@content')
        loader.add_xpath(
            'shipping_cost',
            '//div[@id="pdEstmtdDlvrDesc"]/ul[1]/li[@class="charges"]/text()')
        # Stock is only set (to 0) when the "In Stock" marker is absent.
        if not hxs.select(
                '//div[@id="pdStock"]/span[text()="In Stock"]').extract():
            loader.add_value('stock', 0)
        loader.add_xpath('identifier', '//span[@id="lblProdCode"]/text()')
        loader.add_xpath(
            'price',
            '//div[@id="pnlProdPriceNStock"]//span[@itemprop="price"]/text()')
        loader.add_xpath('sku', '//span[@id="lblProdCode"]/text()')

        item = loader.load_item()
        # The promotion text is taken from a CSS ``content`` rule inside an
        # inline <style> element.
        promotions = hxs.select('//div[@class="was-saveprice FL"]/style/text()'
                                ).re('{content:"(.+)"}')
        if promotions:
            metadata = MetaData()
            metadata['Promotions'] = promotions[0]
            item['metadata'] = metadata
        yield item
示例#2
0
    def parse_product(self, response):
        """Build one product item from a product detail page.

        The identifier, also used as the SKU, is read from
        ``response.meta['id']``; every other field is scraped from the page.
        ``stock`` is set to 0 only when the in-stock availability marker is
        missing.

        :param response: product page response; its meta must contain ``id``.
        :returns: generator yielding a single loaded item.
        """
        selector = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//meta[@property="og:title"][1]/@content')

        product_id = response.meta['id']
        for field in ('identifier', 'sku'):
            loader.add_value(field, product_id)
        loader.add_value('url', response.url)

        scraped_fields = (
            ('price',
             '//span[@class="price-including-tax"]/span[@class="price"]/text()'),
            ('image_url', '//div[@class="product-img-box"]//img/@src'),
            ('category',
             '//div[@class="breadcrumbs"]/ul/li[position()>1]/a/text()'),
        )
        for field, query in scraped_fields:
            loader.add_xpath(field, query)

        manufacturer = selector.select(
            '//th[text()="Manufacturer"]/../td/text()').extract()
        if manufacturer:
            loader.add_value('brand', manufacturer[0])

        in_stock = selector.select('//p[@class="availability in-stock"]/span')
        if not in_stock:
            loader.add_value('stock', 0)

        yield loader.load_item()
示例#3
0
    def parse_product(self, response):
        """Scrape a product from its ``prod-box`` block and breadcrumb trail.

        If no image is found under ``section#one`` the same URL is
        re-requested with ``parse_category`` as the callback and nothing else
        is yielded.  Otherwise one product is yielded; a shipping cost of 2 is
        set for prices below 20 and 4.99 for prices below 40.

        :param response: product page response.
        :returns: generator yielding either a ``Request`` or one product.
        :raises IndexError: if the page has no ``ul.breadcrumbs`` element.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        product_box = selector.select('//div[@class="prod-box"]')
        breadcrumb = selector.select('//ul[@class="breadcrumbs"]')[0]
        loader = ProductLoader(selector=product_box, item=Product())
        loader.add_value('url', response.url)

        # The brand is the crumb that directly follows the "Brands" link.
        loader.add_value(
            'brand',
            breadcrumb.select(
                './/a[contains(text(), "Brands")]/../following-sibling::li[1]/a/text()'
            ).extract())
        loader.add_value('category', [
            label for label in breadcrumb.select('.//a/text()').extract()
            if "Brand" not in label
        ])

        images = selector.select('//section[@id="one"]//@src').extract()
        if not images:
            yield Request(response.url, callback=self.parse_category, dont_filter=True)
            return
        loader.add_value('image_url', urljoin(page_base, images[0]))

        loader.add_xpath('name', './h1/text()')
        for field in ('identifier', 'sku'):
            loader.add_xpath(field, '//*/@prodref')

        stock_marker = product_box.select(
            '//*[text()="In Stock" or text()="Low Stock"]')
        if not stock_marker:
            loader.add_value('stock', 0)

        loader.add_xpath('price', './/span[@class="product-price"]/text()')
        product = loader.load_item()

        price = product['price']
        if price < 20:
            product['shipping_cost'] = 2
        elif price < 40:
            product['shipping_cost'] = 4.99
        yield product
示例#4
0
    def parse_product(self, response):
        """Scrape a product page and yield the item when its price is positive.

        The price is built from the non-blank text nodes inside the
        ``regular-price`` span, skipping the first one.  The first image URL
        shorter than 1024 characters is used as ``image_url``.

        :param response: product page response.
        :returns: generator yielding at most one product item.
        """
        hxs = HtmlXPathSelector(response)

        # A list comprehension instead of ``filter(...)[1:]``: slicing the
        # result of ``filter`` only works where it returns a list.
        # NOTE(review): the first non-blank text node is dropped; presumably
        # it is a label rather than part of the amount -- confirm.
        price = [
            text for text in
            hxs.select("//span[@class='regular-price']//text()").extract()
            if text.strip()
        ][1:]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
        loader.add_xpath(
            'category',
            "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()"
        )
        brand = hxs.select(
            "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
        ).extract()
        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', 0)
        loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
        loader.add_xpath(
            'identifier',
            "//div[@class='product-view']//input[@name='product']/@value")
        image_urls = hxs.select(
            '//img[contains(@class, "gallery-image")]/@src').extract()
        for image_url in image_urls:
            # Only the first reasonably short URL is kept; longer values are
            # skipped.
            if len(image_url) < 1024:
                loader.add_value('image_url', image_url)
                break
        product = loader.load_item()
        if product['price'] > 0:
            yield product
示例#5
0
    def parse_node(self, response, node):
        """Turn one feed node into a product item.

        The name is the node's ``parent_title`` (falling back to ``title``)
        followed by the material, colour and size values, in that order, when
        present.  ``stock`` is set to 0 only when availability reads
        ``out of stock``.

        :param response: the feed response (not read here).
        :param node: selector for a single feed entry.
        :returns: generator yielding one loaded item.
        :raises IndexError: if neither ``parent_title`` nor ``title`` exists.
        """
        loader = ProductLoader(item=Product(), selector=node)

        size = node.xpath('./*[local-name()="size"]/text()').extract()
        color = node.xpath('./*[local-name()="color"]/text()').extract()
        material = node.xpath('./*[local-name()="material"]/text()').extract()

        titles = (node.xpath('./*[local-name()="parent_title"]/text()').extract()
                  or node.xpath('./title/text()').extract())
        full_name = titles[0]
        for variant in (material, color, size):
            if variant:
                full_name += u' {}'.format(variant[0])
        loader.add_value('name', full_name)

        loader.add_xpath('url', './link/text()')
        feed_fields = (
            ('image_url', './*[local-name()="image_link"]/text()'),
            ('identifier', './*[local-name()="id"]/text()'),
            ('price', './*[local-name()="price"]/text()'),
            ('shipping_cost',
             './*[local-name()="shipping"]/*[local-name()="price"]/text()'),
            ('brand', './*[local-name()="brand"]/text()'),
            ('category', './*[local-name()="google_product_category"]/text()'),
            ('sku', './*[local-name()="mpn"]/text()'),
        )
        for field, query in feed_fields:
            loader.add_xpath(field, query)

        availability = node.xpath(
            './*[local-name()="availability"]/text()').extract()
        if availability and availability[0] == 'out of stock':
            loader.add_value('stock', 0)

        yield loader.load_item()
示例#6
0
    def parse_product(self, response):
        """Scrape a product page; pages without a product name yield nothing.

        The identifier and SKU are the part of the URL after its last
        underscore.  The brand comes from ``response.meta['brand']`` when set,
        otherwise from the brand link's href with slashes stripped and dashes
        replaced by spaces.

        :param response: product page response.
        :returns: generator yielding at most one loaded item.
        :raises IndexError: if no brand is given in meta and the page has no
            brand link.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        if not loader.get_collected_values('name'):
            return
        loader.add_xpath('price', '//span[@class="full-price"]/text()')
        # Both the low-stock and the no-stock markers set stock to 0.
        stock = response.xpath(
            '//div[contains(@class, "low-stock")]') or response.xpath(
                '//div[contains(@class, "no-stock")]')
        if stock:
            loader.add_value('stock', 0)
        categories = response.xpath(
            '//ul[@class="the-breadcrumb-list"]//span[@itemprop="title"]/text()'
        ).extract()
        for category in categories:
            if category.title() not in ('Home', 'Search Results'):
                loader.add_value('category', category)
        #loader.add_xpath('category', '//li[@class="terain-type"]/text()')
        brand = response.meta.get('brand')
        if not brand:
            brand = response.xpath(
                '//div[@class="product-brand"]/a/@href').extract()[0]
        loader.add_value('brand', brand.strip('/').replace('-', ' '))
        # The code is a literal value, not an XPath expression, so it must be
        # added with add_value (add_xpath would try to evaluate it as a query).
        product_code = response.url.rpartition('_')[-1]
        loader.add_value('identifier', product_code)
        loader.add_value('sku', product_code)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')

        yield loader.load_item()
示例#7
0
 def parse_product(self, response):
     """Scrape a product page, yielding one item per variant when it has options.

     When the page lists SKU attribute options, the ``stockMatrix`` JavaScript
     array is parsed and one product is yielded per row, identified as
     ``<PRODUCT_NUMBER>-<row index>``.  Otherwise a single product identified
     by ``PRODUCT_NUMBER`` is yielded.

     :param response: product page response; ``response.meta['row']`` must
         provide ``PRODUCT_NUMBER``.
     :returns: generator of product items.
     """
     hxs = HtmlXPathSelector(response)
     # Read once: every identifier produced below is derived from it.
     product_number = response.meta.get('row').get('PRODUCT_NUMBER')

     loader = ProductLoader(item=Product(), selector=hxs)
     loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
     brand = hxs.select('//script[@type="text/javascript"]/text()').re('brand: *\"(.+)\"')
     loader.add_value('brand', brand)
     loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
     loader.add_value('url', url_query_cleaner(response.url))
     loader.add_xpath('name', '//input[@name="speedtrapProductDisplayName"]/@value')
     item = loader.load_item()

     if hxs.select('//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]'):
         data = hxs.select('//script[contains(text(),"stockMatrix =")]/text()')[0].extract()
         # Bare ``null`` entries are quoted so every element in a row is a
         # string before the array is handed to the JSON parser.
         data = data.replace('\n', '').replace('null', '"null"')
         match = re.search('stockMatrix = (.*?);', data, re.DOTALL)
         variants = json.loads(match.group(1)) if match else []
         for i, variant in enumerate(variants):
             # Row layout: option labels..., the "sku..." entry, availability
             # text, price.
             sku = [elem for elem in variant if elem.startswith('sku')][0]
             sku_idx = variant.index(sku)
             product = Product(item)
             product['name'] = item['name'] + ' - ' + ' '.join(variant[:sku_idx]).title()
             product['identifier'] = '{}-{}'.format(product_number, i)
             product['sku'] = product['identifier']
             product['price'] = variant[sku_idx + 2]
             product['stock'] = 1 if 'Available#Delivery' in variant[sku_idx + 1] else 0
             yield product
         return

     loader.add_value('identifier', product_number)
     loader.add_value('sku', product_number)
     loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
     stock = 1 if hxs.select('//meta[@property="product:availability"]/@content[.="In Stock"]') else 0
     loader.add_value('stock', stock)
     yield loader.load_item()
示例#8
0
    def parse_product(self, response):
        """Follow facet links and scrape the product described by the page meta tags.

        Every ``.facet-nav`` link is scheduled with this same callback.  The
        item is only yielded when an identifier was found.  Prices below 50
        get a shipping cost of 3.99.

        :param response: page response.
        :returns: generator of ``Request`` objects and at most one item.
        """
        for url in response.css('.facet-nav a::attr(href)').extract():
            yield Request(response.urljoin(url), self.parse_product)

        xpath = '//meta[@property="%s"]/@content'
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('identifier', xpath % 'product:retailer_part_no')
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_xpath('name', xpath % 'og:title')
        #loader.add_xpath('name', xpath %'product:color')
        loader.add_xpath('price', xpath % 'product:price:amount')
        loader.add_xpath('sku', xpath % 'product:retailer_part_no')

        category = response.xpath(
            '//ul[@itemprop="breadcrumb"]//a/text()').extract()
        # Guarded removals: an unconditional remove()/pop() raises when the
        # breadcrumb lacks these entries, which would abort the callback
        # before the identifier check below could discard the page.
        for label in ('Home', 'Products'):
            if label in category:
                category.remove(label)
        if category:
            # The last remaining crumb is dropped.
            category.pop()
        loader.add_value('category', category[-3:])

        loader.add_xpath('image_url', xpath % 'og:image')
        loader.add_xpath('brand', xpath % 'product:brand')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '3.99')

        item = loader.load_item()
        if item.get('identifier'):
            yield item
示例#9
0
    def parse_product(self, response):
        """Scrape a product page and request its reviews.

        When the ``productId`` form input is missing, ``stock`` is set to 0
        and the identifier is taken from ``productId=...&`` occurrences in the
        page text instead.  The loaded item is not yielded here: it travels in
        the meta of the reviews request handled by ``parse_review_page``.

        :param response: product page response.
        :returns: generator yielding one reviews ``Request``.
        """
        loader = ProductLoader(Product(), response=response)

        product_id = response.xpath(
            '//input[@name="productId"]/@value').extract_first()
        if not product_id:
            loader.add_value('stock', 0)
            product_id = response.xpath('//text()').re('productId=(.+?)&')
        loader.add_value('identifier', product_id)

        loader.add_value('url', url_query_cleaner(response.url))
        loader.add_css('name', 'div.productTitleDescriptionContainer h1::text')
        loader.add_css('price', 'p.pricePerUnit::text')
        loader.add_css('sku', 'p.itemCode::text', re='Item code:(.+)')

        breadcrumb = response.xpath(
            '//ul[@id="breadcrumbNavList"]//a/span/text()').extract()
        try:
            # Drops the first "Home" crumb only, when there is one.
            breadcrumb.remove('Home')
        except ValueError:
            pass
        loader.add_value('category', breadcrumb)

        image_src = response.css(
            'img#productImageID::attr(src)').extract_first()
        if image_src:
            loader.add_value('image_url', response.urljoin(image_src))

        item = loader.load_item()
        item['metadata'] = {'reviews': []}

        review_id = response.xpath('//text()').re_first("productId: '(.+?)'")
        reviews_template = 'http://sainsburysgrocery.ugc.bazaarvoice.com/8076-en_gb/%s/reviews.djs?format=embeddedhtml'
        yield Request(reviews_template % review_id,
                      callback=self.parse_review_page,
                      meta={'item': item})
示例#10
0
 def parse_product(self, response):
     """Scrape a product page, retrying the request while the stock code is missing.

     When ``span#thisstkcode`` is absent, the same request is re-issued with
     an incremented ``retries`` counter in its meta.  Once the counter exceeds
     9 a warning is logged and the page is given up on.  No item is emitted
     for a response without an identifier.

     :param response: product page response.
     :returns: generator yielding a retry ``Request`` or one loaded item.
     """
     identifier = response.css('span#thisstkcode::text').extract_first()
     if not identifier:
         retries = response.meta.get('retries', 0)
         if retries > 9:
             self.logger.warning('No identifier found on %s' % response.url)
             return
         self.logger.debug('Retry %s to get identifier' % response.url)
         # Work on a copy so the meta of the current request is not mutated.
         meta = dict(response.meta)
         meta['retries'] = retries + 1
         # dont_filter must be a keyword argument; the retry targets a URL
         # that has already been seen.
         yield response.request.replace(dont_filter=True, meta=meta)
         return

     loader = ProductLoader(Product(), response=response)
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//h1/text()')
     price = response.css('span.prodPrice').xpath(
         './/span[@itemprop="price"]/text()').extract_first()
     loader.add_value('price', price)
     # The first breadcrumb entry is skipped.
     category = response.css('.breadcrumbs span::text').extract()[1:]
     loader.add_value('category', category)
     loader.add_css('image_url', '.main-product-photo::attr(href)')
     loader.add_css('brand', 'span#thisbrand::text')
     loader.add_css('stock', 'input#data-stock-qty::attr(value)')
     yield loader.load_item()
示例#11
0
 def parse_product(self, response):
     """Scrape a product page and infer a SKU from its name and keywords.

     The SKU starts as the product id and is replaced, in priority order, by:
     the last word of the name when it equals the shortest meta keyword
     (case-insensitively); the shortest keyword when it has no lowercase
     letters; the last word of the name when it has no lowercase letters; or
     the longest digit-bearing token of the name (unless it contains ``(``).
     When the page has no usable name the SKU stays equal to the identifier.

     :param response: product page response.
     :returns: generator yielding one loaded item.
     """
     loader = ProductLoader(Product(), response=response)
     identifier = response.xpath('//input[@name="product_id"]/@value').extract_first()
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
     category = response.xpath('//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
     loader.add_value('category', category)
     loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
     loader.add_xpath('brand', '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
     if not response.xpath('//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
         loader.add_value('stock', 0)

     name = loader.get_output_value('name')
     name_match = re.search(r'\S+$', name) if name else None
     if name_match is None:
         # No name to infer from: keep the identifier as the SKU.
         yield loader.load_item()
         return
     name_end = name_match.group(0).strip(' ()')

     # A missing keywords tag is treated as an empty keyword list.
     raw_keywords = response.xpath('//meta[@name="keywords"]/@content').extract_first() or ''
     # Strip before filtering so whitespace-only entries cannot become an
     # empty "shortest keyword" and produce an empty SKU.
     keywords = [word.strip() for word in raw_keywords.split(',')]
     keywords = [word for word in keywords if word]
     shortest_keyword = min(keywords, key=len) if keywords else 'none'
     from_name = re.findall(r'\S*\d+\S*', name)

     sku = identifier
     if shortest_keyword.lower() == name_end.lower():
         sku = name_end
     elif shortest_keyword.upper() == shortest_keyword:
         sku = shortest_keyword
     elif name_end and name_end.upper() == name_end:
         # ``name_end`` can be empty after stripping brackets; an empty
         # string must not win as an "all upper-case" SKU.
         sku = name_end
     elif from_name:
         sku = max(from_name, key=len)
         if '(' in sku:
             sku = identifier
     loader.replace_value('sku', sku)
     yield loader.load_item()
示例#12
0
 def parse_product(response):
     """Yield one product per model block found on the page.

     A model block is a ``row-fluid`` div that contains a ``data`` table and a
     ``media span`` child div.  The name is the last text node under
     ``p/strong`` (falling back to the ``strong`` elements mentioning "Ford"),
     the price is the lowest "Cash" figure in the table, and the identifier
     is the name with whitespace replaced by underscores.  Blocks with no name
     or no cash price are skipped so they do not abort the remaining models.

     :param response: page response.
     :returns: generator of loaded items.
     """
     models = response.xpath(
         '//div[contains(@class, "row-fluid") and .//table[@class="data"] and div[contains(@class, "media span")]]'
     )
     for model in models:
         strong_texts = model.xpath('.//p/strong//text()').extract()
         name = strong_texts[-1].strip() if strong_texts else ''
         if not name:
             ford_texts = model.xpath(
                 './/p/strong[contains(text(), "Ford")]//text()').extract()
             name = ford_texts[-1].strip() if ford_texts else ''
         if not name:
             continue

         # Cells of the rows labelled "Cash", excluding the label cell itself.
         prices = [
             extract_price(amount) for amount in model.xpath(
                 './/tr[td[contains(text(), "Cash")]]/td[not(contains(text(), "Cash"))]/text()'
             ).re(r'\d+,\d+')
         ]
         if not prices:
             continue

         loader = ProductLoader(item=Product(), selector=model)
         loader.add_value('name', name)
         loader.add_value('price', min(prices))
         image_url = model.xpath(
             './/picture/source/@data-placeholder').extract()
         # The placeholder URL is prefixed with the scheme it lacks.
         image_url = 'http:' + image_url[0] if image_url else ''
         loader.add_value('image_url', image_url)
         loader.add_value('identifier', '_'.join(name.split()))
         loader.add_value('url', response.url)
         yield loader.load_item()
示例#13
0
    def parse_product(self, response):
        """Scrape a product page for the feed row carried in the request meta.

        Pages without a ``productDetail`` block yield nothing.  SKU and
        identifier both come from the row's ``PRODUCT_NUMBER``.  ``stock`` is
        set to ``'1'`` only when a purchase button is present, and prices
        below 25 get a shipping cost of 1.95.

        :param response: product page response; meta must contain ``row``.
        :returns: generator yielding at most one item.
        """
        hxs = HtmlXPathSelector(response)

        row = response.meta['row']

        detail_block = hxs.select('//div[@class="productDetail"]')
        if not detail_block:
            return

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        for field in ('sku', 'identifier'):
            loader.add_value(field, row['PRODUCT_NUMBER'])
        loader.add_xpath(
            'name',
            u'//div[@class="productDescription"]/h3/text()|//div[@class="productDescription"]/h4/text()'
        )
        loader.add_xpath(
            'brand', u'//div[@class="productDescription"]/h2/text()')

        purchase_button = hxs.select(
            '//input[contains(@class, "purchaseButton")]')
        if purchase_button:
            loader.add_value('stock', '1')

        loader.add_xpath(
            'category', '//p[@id="breadCrumbs"]/a[position() > 1]/text()')

        image_sources = hxs.select(
            u'//img[@class="productImage"]/@src').extract()
        if image_sources:
            absolute_image = urljoin_rfc(
                get_base_url(response), image_sources[0])
            loader.add_value('image_url', absolute_image)

        loader.add_xpath('price', './/span[@class="ourPrice"]/text()')
        item = loader.load_item()

        if item['price'] < 25:
            item['shipping_cost'] = Decimal('1.95')

        yield item
示例#14
0
 def parse_product(self, response):
     """Scrape a product page, then submit its order form to get shipping.

     SKU and brand come from the request meta.  The stock quantity is read
     from the ``product_avail`` JavaScript variable.  An item whose stock is
     ``'0'`` is yielded immediately; otherwise the ``orderform`` form is
     submitted in a cookie jar named after the identifier and the item
     travels in the request meta to ``parse_shipping``.

     :param response: product page response; meta must contain ``sku`` and
         ``brand``.
     :returns: generator yielding one item or one ``FormRequest``.
     :raises IndexError: if ``product_avail`` is not found in the page.
     """
     loader = ProductLoader(Product(), response=response)
     loader.add_xpath('identifier', '//input[@name="productid"]/@value')
     loader.add_value('url', response.url)
     loader.add_css('name', '.descr::text')
     loader.add_css('price', 'span.currency::text')
     loader.add_value('sku', response.meta['sku'])

     thumbnail = response.css(
         'img#product_thumbnail::attr(src)').extract_first()
     if thumbnail:
         loader.add_value('image_url', response.urljoin(thumbnail))

     loader.add_value('brand', response.meta['brand'])

     availability = response.css('.quantity script::text').re(
         r'product_avail = (\d+);')
     stock = availability[0]
     loader.add_value('stock', stock)

     item = loader.load_item()
     if stock != '0':
         shipping_meta = {
             'cookiejar': item['identifier'],
             'item': Product(item),
         }
         yield FormRequest.from_response(response,
                                         formname='orderform',
                                         meta=shipping_meta,
                                         cookies=self.cookies,
                                         callback=self.parse_shipping,
                                         dont_filter=True)
     else:
         yield item
示例#15
0
    def parse_product(self, response):
        """Scrape a product page into a single item.

        Identifier and SKU are both the ``product_id`` form value.  ``stock``
        is set to 0 only when an availability text is present and does not
        contain ``in magazzino``.

        :param response: product page response.
        :returns: generator yielding one loaded item.
        :raises IndexError: if the page has no price element.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', '//h1//text()')
        product_loader.add_xpath('identifier',
                                 '//input[@name="product_id"]/@value')
        # NOTE(review): the SKU is the product_id value, same as the
        # identifier; the "Codice" text shown on the page is not used --
        # confirm that is intended.
        product_loader.add_xpath('sku', '//input[@name="product_id"]/@value')
        product_loader.add_xpath('image_url', '//img[@id="image"]/@src')
        brand = response.css('.description').xpath(
            './/a/span/text()').extract_first()
        product_loader.add_value('brand', brand)
        category = response.css('.breadcrumb').xpath(
            'li[2]/a/span/text()').extract()
        product_loader.add_value('category', category)
        price = extract_price_eu(
            hxs.select('//div[@class="price"]/span/text()').extract()[0])
        product_loader.add_value('price', price)
        # Text following the availability label, normalised for matching.
        stock = ''.join(
            hxs.select(
                '//div[@class="description"]/span/strong[contains(text(), '
                '"Disponibilit")]/../following-sibling::text()').extract()
        ).strip().lower()
        if stock and 'in magazzino' not in stock:
            product_loader.add_value('stock', 0)

        yield product_loader.load_item()
示例#16
0
    def parse_product(self, response):
        """Yield the items produced by ``parse_options`` for every product on the page.

        A page may hold a main product (``article#product``) and any number of
        bundled products (``article.bdp-item``).  The same loader, preloaded
        with url, brand and category, is reused for all of them; only its
        ``image_url`` is replaced before each ``parse_options`` call.

        :param response: product page response.
        :returns: generator of whatever ``self.parse_options`` yields.
        :raises IndexError: if a product block has no image.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(selector=hxs, item=Product())
        loader.add_value('url', response.url)
        shared_fields = (
            ('brand', './/dt[text()="Brand"]/following-sibling::dd[1]/text()'),
            ('category', './/div[contains(@class, "breadcrumbs")]//a/text()'),
        )
        for field, query in shared_fields:
            loader.add_xpath(field, query)

        main_product = hxs.select('//article[@id="product"]')
        if main_product:
            main_images = hxs.select(
                './/div[@id="amplienceContent"]//img/@src').extract()
            loader.replace_value('image_url', urljoin(base_url, main_images[0]))
            main_scripts = hxs.select(
                '//script[@type="text/javascript"]/text()[contains(., "productData")]'
            ).extract()
            for item in self.parse_options(hxs, base_url, loader, main_scripts):
                yield item

        for bundled in hxs.select('//article[@class="bdp-item"]'):
            bundled_image = bundled.select(
                './/a[contains(@id, "mainImage")]/img/@src').extract()[0]
            loader.replace_value('image_url', urljoin(base_url, bundled_image))
            bundled_scripts = bundled.select(
                './div/div[1]//script[@type="text/javascript"]/text()'
            ).extract()
            for item in self.parse_options(bundled, base_url, loader,
                                           bundled_scripts):
                yield item
示例#17
0
 def parse_row(self, response, row):
     """Preload a loader from a feed row and request the row's product page.

     :param response: the feed response the loader is bound to.
     :param row: mapping with the ``Rapid Code``, ``Description``,
         ``Manufactures Code``, ``Brand`` and ``URL`` columns.
     :returns: generator yielding one ``Request`` for ``parse_product`` that
         carries the loader in its meta.
     """
     loader = ProductLoader(Product(), response=response)
     column_for_field = (
         ('identifier', 'Rapid Code'),
         ('name', 'Description'),
         ('sku', 'Manufactures Code'),
         ('brand', 'Brand'),
         ('url', 'URL'),
     )
     for field, column in column_for_field:
         loader.add_value(field, row[column])
     yield Request(row['URL'], self.parse_product, meta={'loader': loader})
示例#18
0
 def parse_products(self, response):
     """Yield one item per entry of the ``items`` list in a JSON response.

     Each entry's ``id`` is used for both identifier and SKU; ``nm`` is the
     name, ``p`` the price, and ``l`` / ``img`` are joined against the
     response URL.

     :param response: response whose body is a JSON document.
     :returns: generator of loaded items.
     """
     payload = json.loads(response.body)
     for entry in payload['items']:
         loader = ProductLoader(item=Product(), response=response)
         for field in ('identifier', 'sku'):
             loader.add_value(field, entry['id'])
         loader.add_value('name', entry['nm'])
         loader.add_value('price', entry['p'])
         for field, key in (('url', 'l'), ('image_url', 'img')):
             loader.add_value(field, response.urljoin(entry[key]))
         yield loader.load_item()
示例#19
0
 def parse_simple_product(self, response):
     """Scrape a single product page.

     Identifier and SKU are both the ``product`` form value; the category
     is the breadcrumb trail without its first link.

     :param response: product page response.
     :returns: generator yielding one loaded item.
     """
     product_id_query = '//input[@name="product"]/@value'

     loader = ProductLoader(Product(), response=response)
     loader.add_xpath('identifier', product_id_query)
     loader.add_value('url', response.url)
     loader.add_css('name', 'div.product-name h1::text')
     loader.add_css('price', 'li.bigPrice span.price::text')
     loader.add_xpath('sku', product_id_query)
     breadcrumb = response.css('div.breadcrumbs a::text').extract()
     loader.add_value('category', breadcrumb[1:])
     loader.add_css('image_url', 'img#image::attr(src)')
     yield loader.load_item()
示例#20
0
 def parse_frames(self, response):
     """Yield one product per (option column, size row) cell of the price tables.

     Each table is located through its header cell: either a ``td`` whose
     text is "Code" or a ``td`` containing a span with text "CODE".  The row
     just above the header names the options; the rows below it, up to the
     next header row, are sizes.  For option ``idx`` the SKU is read from
     column ``idx * 2 + margin`` and the price from the column after it.

     Identifiers are ``<sku>-<digits of the size>-<page name>`` and get a
     ``-d`` suffix appended until they are unique within this response and
     within ``self.ids_seen``.

     :param response: page response.
     :returns: generator of loaded items.
     """
     base_url = get_base_url(response)
     # ``margin`` is the column offset of the first SKU cell; it depends on
     # which of the two header layouts the page uses.
     products = response.xpath('//tr/td[text()="Code"][1]')
     if products:
         margin = 3
     else:
         products = response.xpath('//tr/td[span/text()="CODE"][1]')
         if products:
             margin = 2
     if not products:
         # ``margin`` is unset here, but the loop below has nothing to iterate.
         self.log('No products found on %s' % response.url)
     identifiers = []
     image_url = response.xpath(
         '//img[not (contains(@alt, "Doors"))]/@src[contains(., "images-thumb")]'
     ).extract()
     for product in products:
         # Option names are the cells (after the first) of the row right
         # above the header row.
         for idx, option in enumerate(
                 product.xpath(
                     './../preceding-sibling::tr[1]/td[position()>1]')):
             name = option.xpath('.//text()').extract()
             for size in product.xpath('./../following-sibling::tr'):
                 # Another header row marks the start of the next table.
                 if size.xpath(
                         'td[(text()="Code") or (span/text()="CODE")]'):
                     break
                 # Only rows whose first cell contains " x" are processed.
                 if not size.xpath('./td[1][contains(.//text(), " x")]'):
                     continue
                 loader = ProductLoader(item=Product(), selector=size)
                 # The name collects the option text first, then the size text.
                 loader.add_value('name', name)
                 size_name = size.xpath('td[1]/text()').extract()
                 loader.add_value('name', size_name)
                 loader.add_xpath('sku',
                                  'td[%d]/text()' % (idx * 2 + margin))
                 loader.add_xpath('price',
                                  'td[%d]/text()' % (idx * 2 + margin + 1))
                 # Cells without a SKU produce no product.
                 if not loader.get_output_value('sku'):
                     continue
                 identifier = loader.get_output_value(
                     'sku') + '-' + '-'.join(re.findall(
                         '\d+', size_name[0]))
                 # Last URL segment, cut at the first "_" and then at the
                 # first ".".
                 identifier += '-' + response.url.split('/')[-1].split(
                     '_')[0].split('.')[0]
                 while identifier in identifiers or identifier in self.ids_seen:
                     identifier += '-d'
                 identifiers.append(identifier)
                 self.ids_seen.append(identifier)
                 loader.add_value('identifier', identifier)
                 loader.add_value('url', response.url)
                 if image_url:
                     loader.add_value('image_url',
                                      urljoin(base_url, image_url[0]))
                 yield loader.load_item()
示例#21
0
 def parse_category(self, response):
     """Yield the products listed on a category page, then follow its filters.

     Every ``div[typeof="Product"]`` becomes one item; prices below 50 get a
     shipping cost of 9.95 and an out-of-stock button sets ``stock`` to 0.
     Filter pages (URL + ``/`` + a five-character filter id) are only
     requested when the URL has no ``pn`` query parameter and does not match
     ``/cat_.+/.``.

     :param response: category page response.
     :returns: generator of items followed by ``Request`` objects.
     """
     last_crumb = response.css('li.last::text').extract()
     link_query = './/*[@property="url"]'

     for product in response.xpath('//div[@typeof="Product"]'):
         loader = ProductLoader(Product(), selector=product)
         loader.add_xpath('identifier', link_query + '/@sku')
         href = product.xpath(link_query + '/@href').extract_first()
         loader.add_value('url', response.urljoin(href))
         loader.add_xpath('name', link_query + '/text()')
         loader.add_xpath('price', './/*[@property="price"]/text()')
         loader.add_xpath('sku', link_query + '/@sku')
         # Category is the breadcrumb links plus the text of the last crumb.
         loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a/text()')
         loader.add_value('category', last_crumb)
         loader.add_xpath('image_url', './/*[@property="image"]/@content')
         if loader.get_output_value('price') < 50:
             loader.add_value('shipping_cost', '9.95')
         out_of_stock = product.xpath(
             './/button[starts-with(@id, "outOfStock")]')
         if out_of_stock:
             loader.add_value('stock', 0)
         yield loader.load_item()

     expand_filters = not (url_query_parameter(response.url, 'pn')
                           or re.search('/cat_.+/.', response.url))
     if expand_filters:
         filter_ids = response.css('ul.filters input::attr(id)').re(r'^\S{5}$')
         for filter_id in filter_ids:
             yield Request(response.url + '/' + filter_id, self.parse_category)
示例#22
0
    def parse_car(self, response):
        """Scrape a car page; pages without any price yield nothing.

        The identifier is the second-to-last segment of the URL path.  The
        price is taken from the cell following "Cash Price", falling back to
        the text after "Manager's Special Price" in an ``h2``.

        :param response: car page response.
        :returns: generator yielding at most one loaded item.
        """
        hxs = HtmlXPathSelector(response)

        identifier = response.url.split('/')[-2]

        price = hxs.select(
            '//td[contains(text(), "Cash Price")]/following-sibling::td/text()'
        ).extract()
        if not price:
            price = hxs.select('//h2/text()').re(
                'Manager\'s Special Price (.*)')
        if not price:
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)

        name = hxs.select(
            '//div[@class="textInner"][./h2]/*//strong/text()').extract()
        if name:
            # Only the first matching text is used; an empty list is passed
            # through unchanged.
            name = name[0]
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('price', price)

        yield loader.load_item()
示例#23
0
    def parse_product(self, response):
        """Scrape a product from the page's ``nosto_product`` data block.

        Pages without that block are handed to ``parse_category`` and its
        results are re-yielded.  The category is the alphabetically last
        category path split on ``/``.  Shipping is a flat 29.99 and ``stock``
        is set to 0 unless an availability value equals ``InStock``.  Items
        whose name contains "Ex Display" get a matching metadata entry.

        :param response: page response.
        :returns: generator of items.
        """
        hxs = HtmlXPathSelector(response)
        span_query = '//div[@class="nosto_product"]/span[@class="%s"]/text()'

        has_product_data = response.xpath('//div[@class="nosto_product"]')
        if not has_product_data:
            for product in self.parse_category(response):
                yield product
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        # These fields share their name with the span class that holds them.
        for field in ('name', 'price', 'image_url', 'brand'):
            loader.add_xpath(field, span_query % field)
        loader.add_xpath('identifier', span_query % 'product_id')
        loader.add_xpath('sku', '//h6[@class="product-model"]/text()')

        category_paths = hxs.select(span_query % 'category').extract()
        if category_paths:
            last_path = max(category_paths)
            loader.add_value('category', last_path.strip('/').split('/'))

        loader.add_value('shipping_cost', 29.99)

        availability = hxs.select(span_query % 'availability').extract()
        if 'InStock' not in availability:
            loader.add_value('stock', 0)

        item = loader.load_item()
        if 'Ex Display' in item['name']:
            item['metadata'] = {'Ex Display': 'Ex Display'}
        yield item
示例#24
0
 def parse_product(self, response):
     """Yield a product, or follow-up requests for a product-range page.

     A page containing a link to ``#product-range`` is treated as a range
     page: every link in its product-range section is requested with this
     same callback and no item is produced.  Otherwise one product is
     loaded; a shipping cost of 2.99 is set when its price is below 30.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     if hxs.select('//a[@href="#product-range"]'):
         range_links = hxs.select(
             '//section[contains(@class, "product-range")]//div/a/@href'
         ).extract()
         for link in range_links:
             yield Request(urljoin(base_url, link),
                           callback=self.parse_product)
         return

     loader = ProductLoader(item=Product(), selector=hxs)
     loader.add_xpath('name', '//h1[@class="fn c-both"]/text()')
     # NOTE(review): the second entry, '0', is handed to the loader as an
     # extra XPath expression -- presumably a fallback price; confirm
     # against ProductLoader.
     price_xpaths = ('//span[@class="cta now-price"]/text()', '0')
     loader.add_xpath('price', price_xpaths)

     has_quantity_select = hxs.select('//select[@id="quantity"]')
     if not has_quantity_select:
         loader.add_value('stock', 0)

     # Drop the first two breadcrumb entries and the last one.
     categories = hxs.select(
         '//section[@class="breadcrumbs"]//a/text()').extract()[2:-1]
     for unwanted in ('in the kitchen', 'baking'):
         if unwanted in categories:
             categories.remove(unwanted)
     loader.add_value('category', categories)

     loader.add_value('brand', "Lakeland")
     for field in ('identifier', 'sku'):
         loader.add_xpath(field, '//meta[@name="productcode"]/@content')
     loader.add_xpath('image_url', '//img[@class="main-image"]/@src')
     loader.add_value('url', response.url)

     item = loader.load_item()
     # A missing price defaults to 30, so no shipping cost is set for it.
     if item.get('price', 30) < 30:
         item['shipping_cost'] = 2.99
     yield item
示例#25
0
    def parse_product(self, response):
        """Yield one product scraped from a product detail page.

        Field values are collected into a plain dict first and then fed to
        a ``ProductLoaderWithoutSpaces``.  ``stock`` is only set (to 0) when
        the page has no ``span.product-in-store`` element, and
        ``image_url`` only when an image is found.
        """
        hxs = HtmlXPathSelector(response)
        # Keep only the first three '/'-separated pieces of the base URL
        # (e.g. 'http://host').
        site_root = '/'.join(get_base_url(response).split('/')[:3])

        def first(xpath):
            return response.xpath(xpath).extract_first()

        values = {
            'identifier': first('//input[@name="elementID"]/@value'),
            'name': first('//h1[@itemprop="name"]/text()'),
            'price': first('//meta[@itemprop="price"]/@content'),
            'url': response.url,
        }

        if not response.css('span.product-in-store'):
            values['stock'] = 0

        brand = hxs.select(
            u'//dt[contains(., "Производитель")]/following-sibling::dd/span/text()'
        ).extract_first()
        if not brand:
            # Fall back to the brand name captured from a span's text.
            brand = response.xpath('//span/text()').re_first(
                u'Другие товары бренда (.+)')
        values['brand'] = brand

        image_sources = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_sources:
            values['image_url'] = urljoin_rfc(site_root,
                                              image_sources[0].strip())

        values['sku'] = first(
            u'//span[contains(., "Артикул:")]/following-sibling::span/text()')

        # The category is the second-to-last breadcrumb span.
        breadcrumb_texts = hxs.select(
            '//div[contains(@class, "breadcrumbs")]//span/text()').extract()
        values['category'] = breadcrumb_texts[-2]

        loader = ProductLoaderWithoutSpaces(item=Product(), selector=hxs)
        for field, value in values.iteritems():
            loader.add_value(field, value)

        yield loader.load_item()
示例#26
0
    def parse_node(self, response, node):
        """Build a product from one feed node.

        Nodes whose id is not a key of ``self.id_code_map`` are skipped.
        The name is the parent title (or the plain title) followed by the
        material, color and size when present.  When the description
        carries a "Pack Size m:" figure, the price is multiplied by it and
        a request for the product URL is yielded (handled by
        ``self.parse_pack_price`` with the item in ``meta``) instead of
        the item itself.

        :param response: the feed response (not used in this method).
        :param node: selector for a single feed entry.
        """
        identifier = node.select('./*[local-name()="id"]/text()')[0].extract()
        if identifier not in self.id_code_map:
            return
        product_code = self.id_code_map[identifier]
        loader = ProductLoader(item=Product(), selector=node)
        size = node.xpath('./*[local-name()="size"]/text()').extract()
        color = node.xpath('./*[local-name()="color"]/text()').extract()
        material = node.xpath('./*[local-name()="material"]/text()').extract()
        name = node.xpath('./*[local-name()="parent_title"]/text()').extract()
        if not name:
            name = node.xpath('./title/text()').extract()
        name = name[0]
        # Append the variant attributes in a fixed order.
        for attribute in (material, color, size):
            if attribute:
                name += u' {}'.format(attribute[0])

        price = node.xpath('./*[local-name()="price"]/text()').extract_first()
        pack_size = node.xpath('./description/text()').re(
            r'Pack Size m: *([\d.]+)')
        if pack_size:
            price = extract_price(price) * extract_price(pack_size[0])

        loader.add_value('name', name)
        loader.add_xpath('url', './link/text()')
        loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_xpath(
            'shipping_cost',
            './*[local-name()="shipping"]/*[local-name()="price"]/text()')
        loader.add_xpath('brand', './*[local-name()="brand"]/text()')
        loader.add_xpath('category',
                         './*[local-name()="google_product_category"]/text()')
        loader.add_xpath('sku', './*[local-name()="mpn"]/text()')
        stock = node.xpath('./*[local-name()="availability"]/text()').extract()
        if stock and stock[0] == 'out of stock':
            loader.add_value('stock', 0)

        item = loader.load_item()

        if product_code in self.cost_prices:
            try:
                cost_price = Decimal(self.cost_prices[product_code])
            except (ArithmeticError, TypeError, ValueError):
                # decimal.InvalidOperation is an ArithmeticError; only
                # conversion failures are tolerated here, so interrupts
                # and unrelated errors still propagate.
                self.log('ERROR: unable to set cost price for item %r' % item)
            else:
                item['metadata'] = {'cost_price': str(cost_price)}

        if pack_size:
            yield Request(loader.get_output_value('url'),
                          self.parse_pack_price,
                          meta={'item': item})
        else:
            yield item
示例#27
0
 def parse_treatment(self, response):
     """Yield one product per size row that follows the "Code" header cell.

     Up to four table rows after the row holding the first "Code" cell
     are examined; rows without a SKU in their second cell are skipped.
     Once the rows have been processed ``self.treatment`` is set to True.

     Raises ``IndexError`` when the page has no "Code" cell.
     """
     code_cell = response.xpath('//tr/td[(text()="Code")][1]')[0]
     rows = code_cell.xpath('./../following-sibling::tr[position()<5]')
     for row in rows:
         loader = ProductLoader(item=Product(), selector=row)
         loader.add_value('name', row.xpath('td[1]/text()').extract())
         loader.add_xpath('sku', 'td[2]/text()')
         loader.add_xpath('price', 'td[3]/text()')
         if not loader.get_output_value('sku'):
             continue
         # The SKU cell doubles as the identifier.
         loader.add_xpath('identifier', 'td[2]/text()')
         loader.add_value('url', response.url)
         yield loader.load_item()
     # The loop never breaks, so this always runs once the rows are
     # exhausted; a plain statement says so more clearly than for/else.
     self.treatment = True
    def parse_product(self, response):
        """Yield follow-up requests and the product(s) found on a page.

        Every product-tile link whose id (the text after the last ``_``)
        is not yet in ``self.parsed_products`` is recorded and requested.
        A page without an ``h1`` name is retried via ``self.retry``.
        Pages listing more than one option yield one product per option;
        otherwise a single product is read from the page itself.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        tile_links = hxs.select(
            '//div[@class="product-tile"]//a/@href').extract()
        for href in tile_links:
            tile_id = href.split('_')[-1]
            if tile_id in self.parsed_products:
                continue
            self.parsed_products.append(tile_id)
            yield Request(urljoin_rfc(base_url, href),
                          callback=self.parse_product)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)

        name = hxs.select('//h1/text()').extract()
        if not name:
            request = self.retry(response,
                                 "No name for product: " + response.url)
            if request:
                yield request
            return
        product_loader.add_value('name', name)

        # The first breadcrumb link is left out of the category.
        breadcrumbs = hxs.select(
            '//ol[@class="breadcrumbs"]//a/text()').extract()
        product_loader.add_value('category', breadcrumbs[1:])

        images = hxs.select('//div[@class="item"]//img/@src').extract()
        if images:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), images[0]))

        base_product = product_loader.load_item()

        options = hxs.select(
            u'//div[contains(@class, "MainProds")]/ol/li') or hxs.select(
            u'//div[@class="SingColl"]/div[contains(@class, "Prod")]')

        if len(options) > 1:
            # One product per listed option, each copied from the base item.
            for opt in options:
                variant = Product(base_product)
                variant['name'] = opt.select(
                    u'normalize-space(.//h2/text())').extract()[0]
                variant['sku'] = opt.select(
                    u'normalize-space(substring-after(.//div[@class="code"]/text(), ":"))'
                ).extract()[0]
                variant['identifier'] = variant['sku']
                price_text = opt.select(
                    u'.//span[@class="Price"]/text()').extract()[0]
                variant['price'] = extract_price(price_text)
                yield variant
            return

        # Zero or one option: read SKU and price from the page itself,
        # taking the last match of each.
        single = Product(base_product)
        sku_matches = hxs.select(
            '//div[@class="product-sku"]/text()').re('Product code: (\w+)')
        single['sku'] = sku_matches.pop()
        single['identifier'] = single['sku']
        price_texts = hxs.select(
            '//div[@class="price-current"]/text()').extract()
        single['price'] = extract_price(price_texts.pop())
        if single['identifier']:
            yield single
示例#29
0
    def parse_product(self, response):
        """Yield products, expanded into one item per option combination.

        Pages without required option controls are handled entirely by
        ``self.parse_simple_product``.  Otherwise the base items come from
        ``self.parse_product_options_config`` when a ``Product.Config``
        script is found, else from ``self.parse_simple_product``.  Each
        base item is combined with every combination of option choices:
        the option values extend the identifier and SKU, the option
        labels are added to the name, and the option prices are added to
        the base price.
        """
        required_controls = response.css('label.required').xpath(
            '../following-sibling::dd[1]').css('div.input-box').xpath('*[1]')
        options_config = response.xpath('//script/text()').re_first(
            'Product.Config.*?({.+})')

        if not required_controls:
            for simple_item in self.parse_simple_product(response):
                yield simple_item
            return

        # One entry per control: the non-empty <option>s of a select, or
        # the <li> children of any other control.
        option_groups = []
        for control in required_controls:
            if control.extract().startswith('<select'):
                choices = control.xpath('option[@value!=""]')
                if choices:
                    option_groups.append(choices)
            else:
                option_groups.append(control.xpath('li'))

        parse_items = (self.parse_product_options_config
                       if options_config else self.parse_simple_product)

        for item in parse_items(response):
            if not option_groups:
                yield item
                continue

            for variant in itertools.product(*option_groups):
                loader = ProductLoader(Product(), response=response)
                loader.add_value(None, item)

                base_id = item['identifier']
                identifier = base_id + '-' + '-'.join(
                    [choice.xpath('.//@value').extract_first()
                     for choice in variant])
                loader.replace_value('identifier', identifier)
                loader.replace_value('sku', identifier)

                price = item['price']
                for choice in variant:
                    label = (choice.xpath('text()').extract_first()
                             or choice.xpath('.//label/text()').extract_first())
                    # Keep only the text before any "+£" surcharge marker.
                    label = label.split(u'+£')[0]
                    loader.add_value('name', label)
                    price += Decimal(choice.xpath('.//@price').extract_first())
                loader.replace_value('price', price)
                yield loader.load_item()
示例#30
0
 def parse_product(self, response):
     """Yield one product loaded from a product page.

     Stock is 1 when a span whose text is exactly "In stock" exists and
     0 otherwise; every other field comes from a fixed XPath or from the
     response URL.
     """
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(selector=hxs, item=Product())

     in_stock = hxs.select('//span[text() = "In stock"]')
     if in_stock:
         stock = 1
     else:
         stock = 0

     # (loader method, field, XPath or value), applied in this order.
     steps = (
         (loader.add_xpath, 'name', '//h1/text()'),
         (loader.add_xpath, 'price',
          '//span[contains(@id, "price-including-tax")]/text()'),
         (loader.add_value, 'stock', stock),
         (loader.add_xpath, 'category',
          '//div[@class="breadcrumbs"]//li[@class!="home"]/a//text()'),
         (loader.add_xpath, 'brand', '//meta[@itemprop="brand"]/@content'),
         (loader.add_xpath, 'identifier', '//input[@name="product"]/@value'),
         (loader.add_xpath, 'sku', '//meta[@itemprop="sku"]/@content'),
         (loader.add_value, 'url', response.url),
         (loader.add_xpath, 'image_url', '//img[@id="image-main"]/@src'),
     )
     for add, field, source in steps:
         add(field, source)

     yield loader.load_item()