Python extract_price示例，utils.extract_price Python示例

示例#1

0

显示文件

文件： uberkids.py 项目： oceancloud82/scraping

    def parse_options(self, response):
        """Fill in the option-specific fields of the item passed via ``meta``.

        The response body is split on ``'@@@@'`` and its second field is used
        as the image URL. The text of the ``<span>`` elements supplies either
        ``sku, price`` (fewer than three values) or ``name, sku, price`` (the
        first three values). The item is yielded only when the name (acting
        as SKU) or the SKU maps to a truthy entry in ``self.rows``.

        Raises:
            IndexError: if the body contains no ``'@@@@'`` separator.
            ValueError: if fewer than two ``<span>`` texts are present.
        """
        item = response.meta['item']

        # The first '@@@@'-separated field (an identifier) is not needed here.
        image_url = response.body.split('@@@@')[1]

        # Evaluate the XPath once and reuse the result.
        product_data = response.xpath('//span/text()').extract()
        if len(product_data) < 3:
            sku, price = product_data
            name = ''
        else:
            name, sku, price = product_data[:3]

        # Some products don't show a name, so the SKU ends up in the name slot.
        product_found = self.rows.get(name)
        if product_found:
            sku, name = name, ''
        else:
            product_found = self.rows.get(sku)

        if not product_found:
            return

        item['identifier'] = sku
        item['sku'] = sku
        item['metadata']['mpn'] = sku[3:]
        if image_url.endswith('.jpg'):
            item['image_url'] = response.urljoin(image_url)
        if name:
            item['name'] += ' ' + name
        item['price'] = extract_price(price)

        # A SKU without a category entry gives an empty category instead of
        # a TypeError when iterating None.
        categories = self.categories.get(sku.upper()) or []
        item['category'] = ' > '.join([s for s in categories if s])
        yield item

示例#2

0

显示文件

    def parse_category(self, response):
        """Yield one product per ``boxit_new_cover`` box of a category page.

        The identifier is the product's relative URL with '/' and '.'
        removed and the brand is always 'LEGO'. A shipping cost of 99 is
        added when ``int(price)`` is below 4000, and stock is set to 0 when
        the price is not positive.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        for box in selector.select('//div[@class="boxit_new_cover"]'):
            product_loader = ProductLoader(selector=box, item=Product())

            href = box.select('div/a[@class="nadpisa"]/@href').extract()[0]
            product_loader.add_value(
                'identifier', href.replace('/', '').replace('.', ''))
            absolute_url = urljoin_rfc(page_base, href)

            title = box.select('div/a[@class="nadpisa"]/text()').extract()[0]
            product_loader.add_value('name', title)
            product_loader.add_value('url', absolute_url)
            product_loader.add_xpath(
                'image_url', 'div/div[@class="boximages_new"]/div/a/img/@src')

            price_text = box.select(
                'div/div/div[@class="cenaa"]/text()').extract()[0]
            amount = extract_price(price_text)
            product_loader.add_value('price', amount)
            product_loader.add_xpath('category', '//div/h1/text()')
            product_loader.add_value('sku', self.re_sku.findall(title))
            product_loader.add_value('brand', 'LEGO')

            if int(amount) < 4000:
                product_loader.add_value('shipping_cost', 99)
            if amount <= 0:
                product_loader.add_value('stock', 0)

            yield self.load_item_with_metadata(product_loader.load_item())

示例#3

0

显示文件

    def parse(self, response):
        """Build a single product from a product page and yield it.

        The SKU comes from ``response.meta['sku']``. An empty price text is
        replaced by '0' before parsing, and stock is set to 0 when an
        ``outOfStock`` div is present.
        """
        selector = HtmlXPathSelector(response)
        # NOTE: the base URL is computed but not used further in this callback.
        base_url = get_base_url(response)

        request_meta = response.meta
        product_loader = ProductLoader(response=response, item=Product())

        id_values = selector.select(
            '//ul[contains(@class, "mint")]//input[@name="productId_1"]/@value'
        ).extract()
        product_loader.add_value('identifier', id_values[0])

        headings = selector.select(
            '//div[@class="productHeader"]/h1/text()').extract()
        product_loader.add_value('name', headings[0].strip())
        product_loader.add_value('sku', request_meta['sku'])

        price_parts = selector.select(
            '//ul[contains(@class, "mint")]/li[contains(@class, "price")]//text()'
        ).extract()
        price_text = ''.join(price_parts).strip() or '0'
        product_loader.add_value('price', extract_price(price_text))
        product_loader.add_value('url', response.url)

        if selector.select('//div[@class="outOfStock"]'):
            product_loader.add_value('stock', 0)
        product_loader.add_xpath('image_url', '//img[@class="mainImage"]/@src')
        yield product_loader.load_item()

示例#4

0

显示文件

    def parse_product(self, response):
        """Extract a single product from a product page and yield it.

        Name, SKU ("VNR: ..."), price, product id, image and the last
        breadcrumb entry (used as category) are read via XPath; the brand is
        read from a ``Trademark=<brand>`` fragment in the raw body. A missing
        price is treated as '0.00'.

        Raises:
            IndexError: if the title or the ``productId`` input is missing.
        """
        product_name = response.xpath('//h1[@class="product__title"]/text()').extract()[0].strip()

        brand_match = re.search('Trademark=(.*)', response.body)
        brand = brand_match.group(1).strip() if brand_match else ''

        sku = response.xpath('//div[@class="product__vnr"]/text()').re('VNR: (.*)')

        price_texts = response.xpath('//div[@class="product__price"]/text()').extract()
        product_price = price_texts[0] if price_texts else '0.00'

        product_code = response.xpath('//input[@name="productId"]/@value').extract()[0]
        image_url = response.xpath('//img[@class="img-responsive"]/@src').extract()
        category = response.xpath('//ol[@class="breadcrumbs"]//a/text()').extract()
        category = category[-1] if category else ''

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('sku', sku)
        loader.add_value('identifier', product_code)
        # The brand was previously extracted but never stored on the item.
        loader.add_value('brand', brand)
        if image_url:
            # presumably the src is protocol-relative ("//host/...") -- TODO confirm
            loader.add_value('image_url', 'http:' + image_url[0])
        loader.add_value('category', category)
        # Drop '.' and turn ',' into '.' before parsing (presumably a price
        # written with '.' as thousands separator and ',' as decimal mark).
        product_price = extract_price(product_price.replace('.', '').replace(',', '.'))
        loader.add_value('price', product_price)
        yield loader.load_item()

示例#5

0

显示文件

文件： midwestunlimited_com.py 项目： oceancloud82/scraping

    def parse_price(self, response):
        """Merge the JSON option data in *response* into the pending product.

        The product comes from ``response.meta['product']``. Its identifier
        is suffixed with ``'_' + response.meta['options']``; ``price``,
        ``sku`` and ``image_url`` are set when the JSON provides them (the
        image falls back to ``thumb``). Nothing is yielded when the body is
        not valid JSON; the failure is logged instead.
        """
        import json

        product = response.meta['product']

        try:
            data = json.loads(response.body)
        except ValueError:
            # json signals malformed input with ValueError (or a subclass);
            # anything else is a programming error and should propagate.
            self.log("ERROR cant load json, response.body=" + response.body)
            return

        if 'price' in data:
            product['price'] = extract_price(data['price'])

        if 'sku' in data and data['sku']:
            product['sku'] = data['sku']

        product['identifier'] = product['identifier'] + '_' + response.meta['options']

        # Prefer the full-size image and fall back to the thumbnail.
        for key in ('image', 'thumb'):
            if key in data and data[key]:
                product['image_url'] = data[key].replace('\\', '')
                break

        yield product

示例#6

0

显示文件

    def parse(self, response):
        """Yield one product per ``<item>`` element of the feed.

        Fields prefixed with ``g:`` are read through the
        ``http://base.google.com/ns/1.0`` namespace. The ``g:id`` value is
        used for both identifier and SKU, and stock is set to 0 when
        ``g:availability`` equals 'out of stock'.
        """
        response.selector.register_namespace("g", "http://base.google.com/ns/1.0")

        for entry in response.xpath('//item'):
            image_links = entry.xpath('g:image_link/text()').extract()
            image_link = image_links[0] if image_links else ''

            product_types = entry.xpath('g:product_type/text()').extract()
            if product_types:
                # Drop the first '>'-separated level of the category path.
                category_path = product_types[0].split('>')[1:]
            else:
                category_path = ''

            brand_values = entry.xpath('g:brand/text()').extract()
            id_values = entry.xpath('g:id/text()').extract()

            title = entry.xpath('title/text()').extract_first()
            if title:
                title = title.replace('...', '').strip()

            price_values = entry.xpath('g:price/text()').extract()
            if price_values:
                amount = extract_price(price_values[0])
            else:
                amount = 0

            link = entry.xpath('link/text()').extract()[0]
            availability = entry.xpath('g:availability/text()').extract()[0]
            sold_out = availability == 'out of stock'

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', id_values)
            loader.add_value('sku', id_values)
            loader.add_value('name', title)
            loader.add_value('image_url', image_link)
            loader.add_value('price', amount)
            loader.add_value('url', link)
            loader.add_value('brand', brand_values)
            loader.add_value('category', category_path)
            if sold_out:
                loader.add_value('stock', 0)

            yield loader.load_item()

示例#7

0

显示文件

文件： bluesuntree_spider.py 项目： oceancloud82/scraping

 def parse(self, response):
     """Read the exchange rate and request http://www.bluesuntree.co.uk/.

     The first number found in the last cell of the ``uccRes`` row is parsed
     with ``extract_price`` and passed to ``parse_real`` through the request
     meta under ``'exchange_rate'``.
     """
     selector = HtmlXPathSelector(response)
     rate_cell = selector.select('//tr[@class="uccRes"]/td[last()]/text()')
     rate_matches = rate_cell.re(r'[\d\.]+')
     request_meta = {'exchange_rate': extract_price(rate_matches[0])}
     yield Request('http://www.bluesuntree.co.uk/',
                   meta=request_meta,
                   callback=self.parse_real)

示例#8

0

显示文件

    def parse_product(self, response):
        """Build a product from a product page and yield it.

        Identifier and SKU both come from ``response.meta['row']['SKU']``.
        The name is the joined product-title text, followed by the currently
        selected colour when one is shown. The category is the last three
        breadcrumb links; the price is '' when no price element is found.
        """
        row = response.meta['row']

        title_parts = response.xpath(
            '//div[@class="product-title"]//text()').extract()
        name = ' '.join(title_parts).strip()

        current_colour = response.xpath(
            '//div[@class="product-colors__header"]//span[@class="current"]/text()'
        ).extract()
        if current_colour:
            name = name + ' ' + current_colour[0].strip()

        og_images = response.xpath(
            '//meta[@property="og:image"]/@content').extract()
        if og_images:
            image_url = og_images[0]
        else:
            image_url = ''

        price_values = response.xpath(
            '//p[@class="product-price__now"]/span[@class="value"]/text()'
        ).extract()
        if price_values:
            price = extract_price(price_values[0])
        else:
            price = ''

        product_loader = ProductLoader(response=response, item=Product())
        product_loader.add_value('identifier', row['SKU'])
        product_loader.add_value('sku', row['SKU'])
        product_loader.add_value('url', response.url)
        product_loader.add_value('image_url', image_url)
        product_loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')

        breadcrumb_links = response.xpath(
            '//ul[@class="breadcrumbs"]//a/text()').extract()
        product_loader.add_value('category', breadcrumb_links[-3:])
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        yield product_loader.load_item()

示例#9

0

显示文件

文件： elcorteingles.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Build a product from a product page and yield it.

        The SKU comes from ``response.meta['row']['SKU']`` and the
        identifier from the ``data-product-id`` attribute of ``div#pid``.
        The name is the ``itemprop="name"`` heading followed by the variant
        text when present. The image URL is prefixed with 'http:'; the price
        is '' when no current-price text is found.
        """
        row = response.meta['row']

        headings = response.xpath('//h2[@itemprop="name"]/text()').extract()
        name = headings[0].strip()

        variant_texts = response.xpath(
            '//p[@class="common-option variant-ctrl"]/text()').extract()
        if variant_texts:
            name = name + ' ' + variant_texts[0].strip()

        image_sources = response.xpath(
            '//img[@itemprop="image"]/@src').extract()
        if image_sources:
            image_url = 'http:' + image_sources[0]
        else:
            image_url = ''

        price_text = ''.join(
            response.xpath(
                '//div[contains(@class, "product-price")]/span[contains(@class, "current")]//text()'
            ).extract())
        if price_text:
            price = extract_price(price_text)
        else:
            price = ''

        product_loader = ProductLoader(response=response, item=Product())
        product_loader.add_xpath('identifier', '//div[@id="pid"]/@data-product-id')
        product_loader.add_value('sku', row['SKU'])
        product_loader.add_value('url', response.url)
        product_loader.add_value('image_url', image_url)
        product_loader.add_xpath('brand', '//h2[@itemprop="brand"]/a/text()')

        breadcrumb_texts = response.xpath(
            '//ul[@id="breadcrumbs"]//span/text()').extract()
        product_loader.add_value('category', breadcrumb_texts)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        yield product_loader.load_item()

示例#10

0

显示文件

文件： superdrug.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Request every colour variant of the page, then yield its product.

        The brand is the first entry of ``self.brands`` whose upper-cased
        form occurs in the upper-cased product name. Stock is set to 0 when
        the "add_to_notification" form is present. Shipping cost is 0 for a
        loaded price of at least 10 and 3 otherwise.

        Raises:
            IndexError: if the product name or product code is missing.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        colour_options = hxs.select(
            '//ul[contains(@class, "colour-palette")]//a/@href').extract()
        for colour_option in colour_options:
            yield Request(urljoin_rfc(base_url, colour_option),
                          callback=self.parse_product)

        product_name = hxs.select(
            '//div[contains(@class, "prod-details")]//h2/text()').extract()
        product_name = product_name[0]

        product_brand = ''
        for brand in self.brands:
            if brand.upper() in product_name.upper():
                product_brand = brand
                break

        product_price = hxs.select(
            '//p[contains(@class, "pricing")]/span/text()').extract()
        product_price = extract_price(
            product_price[0]) if product_price else '0'

        product_code = hxs.select(
            '//div[contains(@class, "code")]/strong/text()').extract()[0]

        image_url = hxs.select('//a[@class="main-thumb"]/img/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

        categories = hxs.select(
            '//div[contains(@class, "breadcrumb")]/a[not(@href="/") and not(@class="active")]/text()'
        ).extract()

        # The colour-variant requests above carry no 'row' in their meta, so
        # a missing row must not crash the callback.
        row = response.meta.get('row') or {}

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('sku', row.get('PRODUCT_NUMBER'))
        loader.add_value('identifier', product_code)
        loader.add_value('brand', product_brand)
        loader.add_value('image_url', image_url)
        loader.add_value('category', categories)
        out_of_stock = hxs.select('//form[@class="add_to_notification"]')
        if out_of_stock:
            loader.add_value('stock', 0)

        # The price has to be loaded before the shipping cost is derived
        # from it; reading it earlier always saw an empty value.
        loader.add_value('price', product_price)
        loaded_price = loader.get_output_value('price')
        if loaded_price and loaded_price >= 10:
            loader.add_value('shipping_cost', 0)
        else:
            loader.add_value('shipping_cost', 3)

        yield loader.load_item()

示例#11

0

显示文件

 def start_requests(self):
     """Yield one ``self.search`` result per row of ``lego.csv`` in ``HERE``.

     Row index 2 is used as the SKU and, prefixed with 'LEGO ', as the
     search term; index 3 is the name and index 4 the price.
     """
     csv_path = os.path.join(HERE, 'lego.csv')
     with open(csv_path) as f:
         rows = csv.reader(cStringIO.StringIO(f.read()))
         for row in rows:
             sku = row[2]
             product_info = {
                 'sku': sku,
                 'name': row[3],
                 'price': extract_price(row[4]),
             }
             yield self.search('LEGO ' + sku, product_info)

示例#12

0

显示文件

 def parse(self, response):
     """Yield a product for every row of the CSV document in the body.

     The name is the 'Product Name' column followed by a space and the
     'Pack size' column.
     """
     for record in csv.DictReader(StringIO(response.body)):
         product_loader = ProductLoader(item=Product(), response=response)
         product_loader.add_value('identifier', record['identifier'])
         product_loader.add_value('url', record['URL'])
         full_name = ' '.join((record['Product Name'], record['Pack size']))
         product_loader.add_value('name', full_name)
         product_loader.add_value('price', extract_price(record['Price']))
         yield product_loader.load_item()

示例#13

0

显示文件

文件： aquacadabra.py 项目： oceancloud82/scraping

 def parse(self, response):
     """Yield a product for every row of the CSV document in the body.

     The 'ID' column is used as SKU and, lower-cased, as identifier; brand
     and category are added as empty strings.
     """
     feed = csv.DictReader(StringIO(response.body))
     for record in feed:
         item_loader = ProductLoader(item=Product(), response=response)
         item_loader.add_value('identifier', record['ID'].lower())
         item_loader.add_value('sku', record['ID'])
         item_loader.add_value('brand', '')
         item_loader.add_value('category', '')
         item_loader.add_value('name', record['Name'].decode('utf8'))
         item_loader.add_value('price', extract_price(record['Price']))
         yield item_loader.load_item()

示例#14

0

显示文件

文件： pretavoir.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield the page's product, or one product per configurable option.

        A base item is loaded from the page (special price preferred over
        the regular one, stock 0 unless an in-stock availability paragraph
        is found). When the body contains a ``spConfig`` JSON blob, one copy
        of the base item is yielded per product id listed in it, with the
        option labels appended to the name and the option prices added to
        the price; otherwise the base item itself is yielded.
        """
        hxs = HtmlXPathSelector(response)
        row = response.meta['row']

        name = hxs.select('//span[@itemprop="name"]/text()').extract()[0].strip()
        url = response.url

        price_texts = (
            hxs.select('//p[@class="special-price"]/span[@class="price"]/text()').extract()
            or hxs.select('//span[@class="regular-price"]/span[@class="price"]/text()').extract()
        )
        price = price_texts[0] if price_texts else 0

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('sku', row['SKU'])
        loader.add_value('price', price)

        identifier = (
            hxs.select('//input[@name="productId"]/@value').extract()
            or hxs.select('//input[@name="product"]/@value').extract()
        )
        loader.add_value('identifier', identifier)
        loader.add_xpath('brand', '//tr[th/text()="Brand"]/td/text()')
        loader.add_xpath('image_url', '//a[@id="shoe-spin"]/img/@src')
        loader.add_value(
            'category',
            hxs.select('//li[@typeof="v:Breadcrumb"]/a/text()').extract())
        if not hxs.select('//div[@class="offer"]//p[@class="availability in-stock"]'):
            loader.add_value('stock', 0)
        item = loader.load_item()

        config_match = re.search(r'var spConfig = new Product.Config\((.*)\)', response.body)
        if not config_match:
            yield item
            return

        config = json.loads(config_match.groups()[0])
        option_names = {}
        option_prices = {}
        for attribute in config['attributes'].itervalues():
            for option in attribute['options']:
                for product_id in option['products']:
                    # Labels accumulate as " - <label>" per attribute, so the
                    # result always starts with the separator.
                    option_names[product_id] = ' - '.join(
                        (option_names.get(product_id, ''), option['label']))
                    option_prices[product_id] = (
                        option_prices.get(product_id, 0) + extract_price(option['price']))

        for option_id, option_name in option_names.iteritems():
            variant = deepcopy(item)
            variant['identifier'] = variant['identifier'] + '-' + option_id
            # Only the part of the label before the last " (" goes in the name.
            variant['name'] = variant['name'] + re.findall(r'(.*) \(', option_name)[0]
            variant['price'] = variant['price'] + option_prices[option_id]
            if 'IN STOCK' not in option_name.upper():
                variant['stock'] = 0
            yield variant

示例#15

0

显示文件

文件： chsmith_spider.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield the table rows whose MPN and brand match the search item.

        ``response.meta['search_item']`` supplies the wanted ``code``,
        ``brand``, ``category`` and ``screen size``. Rows whose MPN link does
        not contain exactly a name text and an MPN text are skipped. The
        comparison of code and brand is case-insensitive.
        """
        hxs = HtmlXPathSelector(response)
        search_item = response.meta['search_item']

        brand = ''.join(
            hxs.select('//tr[contains(th/h2/text(), "Brand")]/td/a/span/text()'
                       ).extract())

        for row in hxs.select('//tr[@class="magazinProductTableRowData"]'):
            mpn_texts = row.select(
                'td/a[contains(b/text(), "MPN")]/text()').extract()
            if len(mpn_texts) != 2:
                # Explicit check instead of a bare ``except`` around the
                # unpacking, which also hid unrelated errors.
                continue
            name, sku = mpn_texts

            if sku.upper() != search_item['code'].upper():
                continue
            if search_item['brand'].upper() != brand.upper():
                continue

            loader = ProductLoader(item=Product(), selector=row)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('sku', search_item['code'])
            loader.add_xpath('identifier', '@id')
            loader.add_value('brand', search_item['brand'])
            image_url = hxs.select(
                '//div[@class="product-img-box"]/a/img/@src').extract()
            image_url = image_url[0] if image_url else ''
            loader.add_value('image_url', image_url)

            category = search_item['category']
            if not category:
                # Fall back to the last breadcrumb link of the page.
                category = hxs.select(
                    '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()
                category = category[-1].strip() if category else ''

            loader.add_value('category', search_item['brand'])
            loader.add_value('category', category)

            price = row.select(
                'td[contains(text(), "$")]/a/text()').extract()
            price = extract_price(price[0]) if price else 0
            loader.add_value('price', price)
            in_stock = row.select(
                'td[contains(a/text(), "In Stock")]/a/text()').extract()
            if not in_stock:
                loader.add_value('stock', 0)

            # Separate name so the row selector being iterated is not rebound.
            item = loader.load_item()
            metadata = NavicoMeta()
            metadata['screen_size'] = search_item['screen size']
            item['metadata'] = metadata
            yield item

示例#16

0

显示文件

文件： musicroom_feed.py 项目： oceancloud82/scraping

    def parse(self, response):
        """Download ``IntelligentEye.txt`` over SFTP and yield its products.

        The file is fetched into ``HERE`` and read as tab-separated lines
        decoded from cp865. A line must have either 14 fields (``fields``)
        or 15 fields (``fields2``, with an extra ``Temp1`` column). Any other
        line is logged, recorded in ``self.errors`` and skipped.
        """
        password = "******"
        username = "******"
        filename = 'IntelligentEye.txt'
        local_path = os.path.join(HERE, filename)

        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        try:
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            try:
                sftp.get(filename, local_path)
            finally:
                sftp.close()
        finally:
            # Always release the connection, even when the download fails.
            transport.close()

        fields = [
            'UniqueProductCode', 'isbn', 'ean', 'upc', 'ProductName',
            'PriceGBP', 'ProductPageURL', 'Brand', 'Category', 'ImageURL',
            'Stock', 'ShippingCost', 'NetRetailPrice', 'CostPrice'
        ]
        fields2 = [
            'UniqueProductCode', 'isbn', 'ean', 'upc', 'ProductName', 'Temp1',
            'PriceGBP', 'ProductPageURL', 'Brand', 'Category', 'ImageURL',
            'Stock', 'ShippingCost', 'NetRetailPrice', 'CostPrice'
        ]
        with open(local_path) as f:
            for i, line in enumerate(f, 1):
                line = line.decode('cp865', 'ignore')
                values = line.split('\t')
                if len(fields) == len(values):
                    data = dict(zip(fields, values))
                elif len(fields2) == len(values):
                    data = dict(zip(fields2, values))
                else:
                    msg = "Incorrect number of fields on line: %d" % i
                    self.log("[ERROR] %s" % msg)
                    self.errors.append(msg)
                    # Skip the malformed line; without this the previous
                    # line's data would be emitted again (or ``data`` would
                    # be undefined on the first line).
                    continue

                loader = ProductLoader(response=response, item=Product())
                loader.add_value('identifier', data['UniqueProductCode'])
                loader.add_value('sku', data['UniqueProductCode'])
                loader.add_value('name', data['ProductName'])
                loader.add_value('price', extract_price(data['PriceGBP']))
                loader.add_value('url', data['ProductPageURL'])
                loader.add_value('image_url', data['ImageURL'])
                loader.add_value('brand', data['Brand'])
                loader.add_value('category', data['Category'])
                loader.add_value('shipping_cost', data['ShippingCost'])
                loader.add_value('stock', data['Stock'])
                item = loader.load_item()
                item['sku'] = item['sku'].upper()

                metadata = MusicroomMeta()
                # CostPrice is the last column, so it carries the newline.
                metadata['cost_price'] = data['CostPrice'].strip()
                metadata['net_retail_price'] = data['NetRetailPrice'].strip()

                item['metadata'] = metadata

                yield item

示例#17

0

显示文件

    def parse_product(self, response):
        """Yield the product of the page, retrying the page once if needed.

        When the name, the price, the SKU or the loaded name is missing and
        the request has not been retried yet (``meta['retried']``), a single
        retry request is yielded and nothing else. On the retried response
        the item is yielded with whatever could be extracted; a missing
        price is treated as '0', and stock is set to 0 when the loaded item
        has no price.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        retried = response.meta.get('retried', False)

        product_name = hxs.select('//div[@class="boxbody"]/h1/text()[normalize-space()]').extract()

        product_price = hxs.select('//div[@class="price"]/ins/b/text()').extract()
        product_price = product_price[0] if product_price else None
        if not product_price:
            # Fall back to the "Price=<value>" fragment in the raw body.
            price_match = re.search('Price=(.*)', response.body)
            product_price = price_match.group(1).replace('.', '') if price_match else None

        if (not product_name or product_price is None) and not retried:
            # One retry is enough; previously several identical unfiltered
            # requests could be scheduled for the same page.
            yield Request(response.url, dont_filter=True, meta={'retried': True}, callback=self.parse_product)
            return

        if product_price is None:
            # Still no price after the retry: avoid calling .replace on None.
            product_price = '0'

        image_url = hxs.select('//a[@class="img"]/@href').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="boxbody"]/h1/text()[normalize-space()]')
        loader.add_value('url', response.url)
        loader.add_xpath('sku', '//*', re=r'ProductNo=(.*)')
        loader.add_xpath('identifier', '//*', re=r'ProductID=(.*)')
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_xpath('category', '//li[@class="current"]/a/text()', lambda e: e[0] if e else '')
        product_price = extract_price(product_price.replace('.', '').replace(',', '.'))
        loader.add_value('price', product_price)
        loader.add_xpath('brand', '//*', lambda e: e[0] if e else '', re=r'Trademark=(.*)')

        item = loader.load_item()

        if (not item.get('sku') or not item.get('name')) and not retried:
            yield Request(response.url, dont_filter=True, meta={'retried': True}, callback=self.parse_product)
            return

        if not item.get('price'):
            item['stock'] = 0

        yield item

示例#18

0

显示文件

文件： westmarine_spider.py 项目： oceancloud82/scraping

    def parse(self, response):
        """Yield matching products of the page and follow product links.

        For every entry of ``self.rows`` whose code equals the page's
        manufacturer number (case-insensitive) and whose brand occurs in the
        page title, a product is yielded. Afterwards every ``productName``
        link on the page is requested with the same meta.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        search_item = response.meta['search_item']

        sku = ''.join(hxs.select('//span[@class="product-manufno"]/text()').extract()).strip()
        name = ''.join(hxs.select('//h1[@id="productDetailsPageTitle"]/text()').extract())

        for row in self.rows:
            code_matches = sku.upper() == row['code'].upper().strip()
            if not code_matches or row['brand'].upper() not in name.upper().strip():
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('sku', sku)
            loader.add_xpath('identifier', '//input[@name="productCodePost"]/@value')
            loader.add_value('brand', get_brand(name) or search_item['brand'])

            image_sources = hxs.select('//div[@id="primary_image"]/a/img/@src').extract()
            if image_sources:
                loader.add_value('image_url', 'http:' + image_sources[0])
            else:
                loader.add_value('image_url', '')

            category = row['category']
            if not category:
                # Fall back to the last breadcrumb link of the page.
                breadcrumb = hxs.select('//div[@id="breadcrumb"]/ul/li/a/text()').extract()
                category = breadcrumb[-1] if breadcrumb else ''

            loader.add_value('category', search_item['brand'])
            loader.add_value('category', category)

            # Promo price takes precedence over the regular price.
            price_texts = (
                hxs.select('//p[contains(@class, "promo price")]/text()').extract()
                or hxs.select('//p[contains(@class, "regularPrice")]/text()').extract()
            )
            price = extract_price(price_texts[0]) if price_texts else 0
            loader.add_value('price', price)
            if not price:
                loader.add_value('stock', 0)

            item = loader.load_item()
            metadata = NavicoMeta()
            metadata['screen_size'] = row['screen size']
            item['metadata'] = metadata
            yield item

        for href in hxs.select('//div[@class="productName"]/a/@href').extract():
            yield Request(urljoin_rfc(base_url, href), meta=response.meta)

示例#19

0

显示文件

文件： walmart.py 项目： oceancloud82/scraping

    def parse_special_price(self, response):
        """Apply the cart's ``subMapPrice`` to the product, then fetch reviews.

        The JSON body's ``items`` are scanned for entries whose
        ``product_id`` equals the product identifier (compared as strings);
        each match overwrites the product price. A request for review page 1
        is then yielded with the product in its meta.
        """
        cart = json.loads(response.body)
        product = Product(response.meta['product'])

        for entry in cart['items']:
            if str(entry['product_id']) != str(product['identifier']):
                continue
            product['price'] = extract_price(entry['subMapPrice'])

        reviews_url = self._get_reviews_url(product, 1)
        request_meta = {'product': product, 'page': 1}
        yield Request(reviews_url,
                      meta=request_meta,
                      callback=self.parse_product_reviews)

示例#20

0

显示文件

    def parse_products(self, base_url, response, hxs):
        """Yield one product per priced entry of the ``ListView`` container.

        Entries without a ``PricesalesPrice`` span are skipped. The category
        is looked up in ``CATEGORIES`` by the fourth '/'-separated part of
        the absolute product URL ('' when unknown). A shipping cost of
        '7.00' is added when the parsed price is below 50.

        Raises:
            IndexError: if a priced entry lacks a link, SKU or image.
        """
        for r in hxs.select('//div[@id="ListView"]/div'):
            price_texts = r.select(
                './/span[@class="PricesalesPrice"]/text()').extract()
            if not price_texts:
                # No price => skip the entry. An explicit check replaces the
                # former bare ``except``, which could hide unrelated errors.
                continue
            # Drop '.' and turn ',' into '.' before the price is parsed.
            price = price_texts[0].replace('.', '').replace(',', '.')

            loader = ProductLoader(item=Product(), selector=r)
            loader.add_xpath(
                'name',
                './/div[@class="FlexibleListBrowseV1ProductName"]/a/text()')
            url = r.select(
                './/div[@class="FlexibleListBrowseV1ProductName"]/a/@href'
            ).extract()[0]
            url = urljoin_rfc(base_url, url)
            loader.add_value('url', url)
            loader.add_value('price', price)
            sku = r.select(
                './/div[@class="FlexibleCategoryProductSKUListView"]/text()'
            ).extract()[0]
            loader.add_value('sku', sku.replace('SKU: ', ''))
            loader.add_value('category', CATEGORIES.get(url.split('/')[3], ''))
            brand = ''.join(
                r.select('.//div[@class="FlexibleListViewMiddle"]/text()').
                extract()).strip()
            loader.add_value('brand', brand)
            img_url = r.select(
                './/img[@class="browseProductImage"]/@src').extract()[0]
            loader.add_value('image_url', urljoin_rfc(base_url, img_url))
            loader.add_xpath(
                'identifier',
                './/input[@name="virtuemart_product_id[]"]/@value')

            if extract_price(price) < Decimal(50):
                loader.add_value('shipping_cost', '7.00')

            yield loader.load_item()

示例#21

0

显示文件

文件： meublesconcept_spider.py 项目： oceancloud82/scraping

    def parse_shipping(self, response):
        """Attach the shipping cost to the pending product and yield it.

        The cost is built from the "digits,digits" matches found in the
        cells of the table row containing "Envoyer". After the product, a
        request to ``response.meta['clean']`` is yielded for
        ``parse_sync_basket``, carrying ``collect_products`` along.
        """
        selector = HtmlXPathSelector(response)
        envoyer_cells = selector.select(
            '//tr[td[contains(text(), "Envoyer")]]/td/text()')
        amount_text = ''.join(envoyer_cells.re(r'(\d+,\d+)'))
        shipping_cost = extract_price(amount_text)

        product = response.meta['product']
        product['shipping_cost'] = shipping_cost
        yield product

        next_meta = {'collect_products': response.meta['collect_products']}
        yield Request(
            response.meta['clean'],
            callback=self.parse_sync_basket,
            dont_filter=True,
            meta=next_meta)

示例#22

0

显示文件

    def parse_product(self, response):
        """Build a product from a product page and yield it.

        The identifier is the part of the URL after 'id='. Images ending in
        'noimage.png' are ignored and stock is set to 0 when an
        ``art-light-red`` div is present. Shipping cost depends on the
        parsed price: below 100 -> 15.00, below 251 -> 30.00, below 751 ->
        40.00, below 1000 -> 60.00, otherwise 100.00.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        raw_name = hxs.select('//span[@class="ArTit"]//text()').extract()[0]
        # Collapse any run of whitespace into a single space.
        loader.add_value('name', " ".join(raw_name.split()))
        loader.add_xpath(
            'sku', '//span[@id="MainContent_ngpArticolo_lblARCd_AR"]/text()')

        price_nodes = hxs.select(
            '//span[@id="MainContent_ngpArticolo_lblPrezzoScontato"]/text()'
        )
        price_text = price_nodes[0].extract().replace('.', '').replace(',', '.')
        loader.add_value('price', price_text)
        loader.add_xpath(
            'brand',
            '//span[@id="MainContent_ngpArticolo_lblARMarcaDescrizione"]/text()'
        )
        loader.add_xpath(
            'category',
            '//span[@id="MainContent_ngpArticolo_lblCd_ARGruppo2"]/text()')

        # The gallery image is either an <img> or an <input> element.
        gallery_sources = (
            hxs.select('//div[@id="gallery"]/img/@src')
            or hxs.select('//div[@id="gallery"]/input/@src')
        )
        image_src = gallery_sources[0].extract()
        if not image_src.strip().endswith('noimage.png'):
            loader.add_value('image_url', urljoin_rfc(base_url, image_src))

        if hxs.select('//div[@class="art-light-red"]'):
            loader.add_value('stock', 0)
        loader.add_value('url', response.url)
        loader.add_value('identifier', response.url.split('id=')[1])

        amount = extract_price(price_text)

        # (exclusive upper price bound, shipping cost), checked in order.
        shipping_tiers = (
            (100, '15.00'),
            (251, '30.00'),
            (751, '40.00'),
            (1000, '60.00'),
        )
        shipping_cost = '100.00'
        for upper_bound, cost in shipping_tiers:
            if amount < Decimal(upper_bound):
                shipping_cost = cost
                break
        loader.add_value('shipping_cost', shipping_cost)

        yield loader.load_item()

示例#23

0

显示文件

文件： tropicanafitnesss_com.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield one product per option row of every variant on the page.

        A base product (url, name, category, price, sku, image, brand) is
        loaded first. Each variant block contributes its heading and price,
        and each table row inside it becomes a copy of the base product with
        its own name, identifier, price and stock flag.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath(
            'name', u'//div[@id="product-details-main"]//h1/text()')
        product_loader.add_xpath(
            'category', u'//div[@class="crumbs"]/a[2]/text()')
        product_loader.add_xpath(
            'price', u'//span[@class="blu-price"]/span/text()')
        product_loader.add_xpath('sku', '//meta[@name="bc:sku"]/@content')

        main_images = hxs.select(
            u'//img[@id="product-image-main"]/@src').extract()
        if main_images:
            base_url = get_base_url(response)
            product_loader.add_value(
                'image_url', urljoin_rfc(base_url, main_images[0]))

        known_brands = hxs.select(
            u'//ul[@id="nav-top-list"]/li[contains(@class,"brands")]//a/text()'
        ).extract()
        # A brand matches when its first word equals the first word of the
        # product name, compared case-insensitively.
        leading_word = product_loader.get_output_value(
            'name').split()[0].lower()
        for candidate in known_brands:
            if candidate.split()[0].lower() != leading_word:
                continue
            product_loader.add_value('brand', candidate)

        product = product_loader.load_item()

        for variant in hxs.select('//div[@class="variant"]'):
            heading = variant.select('.//h4/text()').extract()[0].strip()
            var_name = product['name'] + ' ' + heading
            # The last matching span text is used as the variant price.
            price_texts = variant.select(
                './/p[contains(@class, "price")]/span/text()').extract()
            price = price_texts[-1]
            for opt in variant.select('.//table/tr'):
                label = opt.select('td[1]/text()').extract()[0].strip()
                opt_name = var_name + ' ' + label
                stock = opt.select('td[2]/text()').extract()[0].strip().lower()

                identifier = self.normalizename(opt_name)
                for unwanted in (' ', '/', '-', '+'):
                    identifier = identifier.replace(unwanted, '')
                # NOTE(review): spaces were already removed above, so
                # 'on sale' (with a space) cannot occur here; kept unchanged
                # so existing identifiers stay stable.
                identifier = identifier.lower().replace('on sale', '').strip()

                opt_product = Product(product)
                opt_product['price'] = extract_price(price)
                opt_product['name'] = opt_name
                opt_product['identifier'] = identifier
                if 'out of stock' in stock:
                    opt_product['stock'] = 0
                yield opt_product

示例#24

0

显示文件

文件： greenmangaming_box_spider.py 项目： oceancloud82/scraping

    def parse(self, response):
        """Build a single product item from a product page.

        The identifier is the second-to-last ``/``-separated segment of the
        response URL, and the SKU is taken from ``response.meta['sku']``.
        When no current-price element is found the price text defaults to
        ``'0'`` before being parsed.

        Yields:
            The loaded product item.

        Raises:
            IndexError: if the product heading is missing from the page.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', response.url.split('/')[-2])
        name = hxs.select('//h1[@class="prod_det"]/text()').extract()[0]
        loader.add_value('name', name)
        loader.add_value('sku', response.meta['sku'])

        price_texts = hxs.select('//strong[@class="curPrice"]/text()').extract()
        price = price_texts[0] if price_texts else '0'
        loader.add_value('price', extract_price(price))
        loader.add_value('url', response.url)
        yield loader.load_item()

示例#25

0

显示文件

    def parse_product(self, response):
        """Build a product item with screen-size metadata from a product page.

        The special price is preferred and the regular price is the fallback.
        The product code is used as both SKU and identifier, and the brand
        passed in ``response.meta`` is also used as the category. A falsy
        parsed price marks the product as out of stock.

        Yields:
            The loaded product with a ``NavicoMeta`` stored under
            ``metadata``.

        Raises:
            IndexError: if the product name or product code is missing.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        product_name = hxs.select(
            '//div[@class="product-name"]/h1/text()')[0].extract()

        price_texts = hxs.select(
            '//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not price_texts:
            price_texts = hxs.select(
                '//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        # NOTE(review): when neither price element exists the empty list
        # itself is passed to extract_price, exactly as before -- confirm
        # that extract_price accepts it.
        product_price = price_texts[0] if price_texts else price_texts

        product_code = hxs.select('//div[@class="product-code"]/text()').re(
            'Product code: (.*)')[0]
        image_url = hxs.select(
            '//img[@class="product-img-img"]/@src').extract()
        brand = response.meta.get('brand', '')
        category = brand

        # One loader is enough; the earlier duplicate was built and discarded.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('sku', product_code)
        loader.add_value('identifier', product_code)
        loader.add_value('brand', brand)
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('category', category)
        product_price = extract_price(product_price)
        loader.add_value('price', product_price)
        if not product_price:
            loader.add_value('stock', 0)

        product = loader.load_item()
        metadata = NavicoMeta()
        # The lookup key is the stripped, upper-cased product code; '' when
        # the code is not present in self.force4_products.
        metadata['screen_size'] = self.force4_products.get(
            product_code.strip().upper(), '')
        product['metadata'] = metadata

        yield product

示例#26

0

显示文件

    def parse_product(self, response):
        """Build a product item from a product page and resolve its SKU.

        The SKU is the page identifier when it appears in ``self.ean_codes``.
        Otherwise it is the value of the first ``self.model_codes`` key
        (longer than three characters) found in the lower-cased name, or an
        empty string when nothing matches. Products priced under 100 get a
        shipping cost of '11.00'.

        Yields:
            The loaded product item.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # Breadcrumb texts, keeping only those longer than one character
        # after stripping.
        crumbs = (
            text.strip() for text in hxs.select(
                u'//div[@class="breadcrumbs"]/ul/li//text()').extract())
        category = u' > '.join([crumb for crumb in crumbs if len(crumb) > 1])

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        name_node = hxs.select(
            u'//div[contains(@class,"product-name")]/h1/text()')[0]
        name = name_node.extract().strip()
        loader.add_value('name', name)
        loader.add_value('category', category)

        code_matches = hxs.select(u'//div[@class="product-shop"]').re(
            u'Codice: (.*?)<')
        identifier = code_matches[0].strip()
        loader.add_value('identifier', identifier)

        sku = ''
        if identifier in self.ean_codes:
            sku = identifier  # self.ean_codes[identifier])
        else:
            lowered_name = name.lower()
            for model in self.model_codes.keys():
                if len(model) <= 3 or model not in lowered_name:
                    continue
                sku = self.model_codes[model]
                break
        loader.add_value('sku', sku)

        # Text after the euro sign, with '.' dropped and ',' turned into '.'.
        raw_price = hxs.select(u'//span[@class="price"]/text()').re(
            u'\u20ac(.*)')[0]
        price = raw_price.strip().replace(u'.', u'').replace(u',', u'.')
        loader.add_value('price', price)

        zoom_images = hxs.select(
            u'//a[@class="MagicZoomPlus"]/img/@src').extract()
        if zoom_images:
            loader.add_value(
                'image_url', urljoin_rfc(base_url, zoom_images[0]))

        if extract_price(price) < Decimal(100):
            loader.add_value('shipping_cost', '11.00')

        yield loader.load_item()

示例#27

0

显示文件

    def parse_product(self, response):
        """Build a product item (fixed category 'Carti') from a product page.

        The price comes from the red "ul nostru:" cell, falling back to the
        "Pret:" row. The product is flagged out of stock unless the
        availability row contains "IN STOC". A shipping cost of 11.99 is
        added when the loaded price is below 150.

        Yields:
            The loaded product item.
        """
        hxs = HtmlXPathSelector(response)

        def joined_text(xpath):
            # Concatenate every extracted string for the XPath, then strip.
            return ''.join(hxs.select(xpath).extract()).strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="css_carte_titlu"]/h1/b/text()')
        loader.add_value('url', response.url)
        publisher = hxs.select(
            '//div[@class="produs_campuri" and b/text()="Editura:"]/a/text()'
        ).extract()
        loader.add_value('brand', publisher)
        loader.add_value('category', 'Carti')
        loader.add_value(
            'sku',
            joined_text(
                '//div[@class="produs_campuri" and b/text()="ISBN:"]/text()'))
        loader.add_value('identifier', re.findall('p/(.*)/', response.url)[0])

        thumbnails = hxs.select('//a[@rel="thumbnail"]/img/@src').extract()
        if thumbnails:
            loader.add_value('image_url', thumbnails[0])

        # The second XPath is only evaluated when the first yields nothing.
        price = joined_text(
            '//tr[contains(td/b/text(), "ul nostru:")]/td/b[@class="red"]/text()'
        ) or joined_text('//tr[td/b/text()="Pret:"]/td/text()')
        loader.add_value('price', extract_price(price))

        availability = joined_text(
            '//tr[td/b/text()="Disponibilitate:"]/td/text()').upper()
        if 'IN STOC' not in availability:
            loader.add_value('stock', 0)

        if loader.get_output_value('price') < 150:
            loader.add_value('shipping_cost', 11.99)

        yield loader.load_item()

示例#28

0

显示文件

    def parse_addcart(self, response):
        """Complete the pending item from the matching row of the cart table.

        Table rows holding an input whose name contains "ItemId" are scanned.
        The first row whose cell text contains the item's SKU
        (case-insensitive) supplies the identifier and price, and the item
        is yielded once. Nothing is yielded when no row matches.

        Yields:
            The item from ``response.meta['item']`` with ``identifier`` and
            ``price`` set.
        """
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']
        rows = hxs.select('//table/tr[td/input[contains(@name, "ItemId")]]')
        for row in rows:
            row_text = ''.join(
                row.select('td[input[contains(@name, "ItemId")]]/text()').
                extract()).strip().upper()
            if item['sku'].upper() not in row_text:
                continue

            # The last ItemId input value in the row is the identifier.
            item['identifier'] = row.select(
                'td[input[contains(@name, "ItemId")]]/input/@value'
            ).extract()[-1]

            price_texts = row.select('td[@class="sellprice"]/text()').extract()
            # Parse the '0' fallback too, so item['price'] always has the
            # type extract_price returns rather than sometimes a raw string.
            item['price'] = extract_price(
                price_texts[-1] if price_texts else '0')
            yield item
            break

示例#29

0

显示文件

    def parse(self, response):
        """Yield one product per row of the CSV feed in the response body.

        Rows whose stripped, lower-cased manufacturer is 'wse', 'unknown' or
        'unknowns' are skipped. The manufacturer is used as both brand and
        category, and the row's GTIN is stored in an ``ErfMeta`` attached
        under ``metadata``.
        """
        ignored_brands = ('wse', 'unknown', 'unknowns')
        for row in csv.DictReader(StringIO(response.body)):
            if row['Manufacturer'].strip().lower() in ignored_brands:
                continue

            sku = row['SKU']
            manufacturer = row['Manufacturer']
            # Price is parsed, then rounded to two decimal places.
            price = round(extract_price(row['Price']), 2)

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier', sku.lower())
            loader.add_value('sku', sku)
            loader.add_value('brand', manufacturer)
            loader.add_value('category', manufacturer)
            loader.add_value('name', row['Name'].decode('utf-8'))
            loader.add_value('price', price)
            item = loader.load_item()

            gtin_meta = ErfMeta()
            gtin_meta['gtin'] = row['GTIN']
            item['metadata'] = gtin_meta

            yield item

示例#30

0

显示文件

文件： howetools_spider.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Build a product item from a product page.

        The brand is the first listed brand whose upper-cased text occurs in
        the upper-cased product name, or '' when none does. The value of the
        "product" input is used as both SKU and identifier. The product is
        marked out of stock when the out-of-stock marker is present or the
        loaded price is falsy.

        Yields:
            The loaded product item.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//meta[@property="og:title"]/@content')
        loader.add_value('url', response.url)

        listed_brands = hxs.select(
            '//dl[dt/text()="Brand"]//li/@data-text').extract()
        # Lazy generator: the name is only looked up while candidates remain.
        product_brand = next(
            (candidate for candidate in listed_brands
             if candidate.upper() in loader.get_output_value('name').upper()),
            '')
        loader.add_value('brand', product_brand)

        loader.add_value(
            'category',
            hxs.select(
                '//div[@class="breadcrumbs"]//li[not(@class="home")]/a/text()'
            ).extract())

        product_id = hxs.select('//input[@name="product"]/@value').extract()
        loader.add_value('sku', product_id)
        loader.add_value('identifier', product_id)

        big_images = hxs.select('//img[@class="big"]/@src').extract()
        if big_images:
            loader.add_value('image_url', big_images[0])

        price_texts = hxs.select(
            '//div[@class="product-shop"]//span[@class="price-including-tax"]//span[@class="price"]/text()'
        ).extract()
        if price_texts:
            price = extract_price(price_texts[0])
        else:
            price = 0
        loader.add_value('price', price)

        sold_out = hxs.select('//p[@class="availability out-of-stock"]')
        if sold_out or not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        yield loader.load_item()