Python ProductLoader.get_output_value示例，product_spiders.items.ProductLoader.get_output_value Python示例

示例#1

0

显示文件

文件： americanrv_crawler.py 项目： ontiyonke/lib

    def parse_product(self, response):
        """Build one product item from a product detail page.

        The SKU is looked up in ``self.skus_dict`` under the MD5 hex digest of
        the stripped product name.  Nothing is yielded unless both a name and
        a price were extracted from the page.
        """
        if not isinstance(response, HtmlResponse):
            return

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//td[@class="page_headers"]/text()')
        product_loader.add_xpath(
            'price',
            u'//td[@class="price-info"]//div[@id="price" and @class="price"]/text()',
            re=u'\$(.*)')

        # Validate the mandatory fields *before* using the name: calling
        # .strip() on a missing name would raise instead of skipping the page.
        name = product_loader.get_output_value('name')
        if not name or not product_loader.get_output_value('price'):
            return

        hashed_name = hashlib.md5(name.strip()).hexdigest()
        # NOTE(review): the lookup fails loudly when the hash is unknown;
        # presumably every crawled product is expected to be in skus_dict.
        sku = self.skus_dict[hashed_name]
        product_loader.add_value('sku', sku)
        product_loader.add_xpath('sku', u'//span[@id="product_id"]/text()')
        product_loader.add_value(
            'identifier',
            product_loader.get_output_value('sku').lower())

        yield product_loader.load_item()

示例#2

0

显示文件

文件： streetsideauto.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product when the page's part number matches the request.

        ``response.meta`` must carry ``sku`` and ``mfrgid``.  The item is
        yielded only if the "Part No." shown on the page equals
        ``meta['mfrgid']`` and a price was extracted.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), response=response)

        product_loader.add_xpath(
            'price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
        if not product_loader.get_output_value('price'):
            # Fall back to the alternative price markup.
            product_loader.add_xpath(
                'price',
                u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())

        names = hxs.select(
            u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()'
        ).extract()
        part_numbers = hxs.select(
            u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
        if not names or not part_numbers:
            # Without a title or a part number the page cannot be matched;
            # skip it instead of failing with an IndexError.
            return
        product_loader.add_value('name', names[0].strip())

        mfrgid = response.meta['mfrgid']
        if part_numbers[0] == mfrgid and product_loader.get_output_value('price'):
            yield product_loader.load_item()

示例#3

0

显示文件

文件： drugstore.py 项目： 0--key/lib

    def parse_product(self, response):
        """Follow pagination and yield one item per product tile of a listing.

        A tile is emitted only when both its name and its price could be
        loaded.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)

        # Pagination: the "next" control is an image wrapped in a link.
        next_links = selector.select(u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href').extract()
        if next_links:
            yield Request(urljoin_rfc(get_base_url(response), next_links[0]),
                          callback=self.parse_product)

        for tile in selector.select(u'//div[contains(@class,"itemGrid")]'):
            loader = ProductLoader(item=Product(), selector=tile)

            href = tile.select(u'.//a[@class="oesLink"]/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            # The visible name is split between a <span> and the link text.
            leading = tile.select(u'.//a[@class="oesLink"]/span/text()').extract()[0]
            trailing = tile.select(u'.//a[@class="oesLink"]/text()').extract()[0]
            loader.add_value('name', leading + ' ' + trailing)

            for price_xpath in (u'.//span[@class="PlistOfferPrice"]/text()',
                                u'.//div[@class="pricing"]/span/div/span/text()'):
                loader.add_xpath('price', price_xpath, re=u'\$(.*)')

            if loader.get_output_value('name') and loader.get_output_value('price'):
                yield loader.load_item()

示例#4

0

显示文件

文件： campingworld_americanrv.py 项目： oceancloud82/scraping

 def parse_product(self, response):
     """Return the product item when the page's manufacturer part matches.

     ``response.meta`` must carry ``sku``, ``mfrgid`` and ``name``.  The item
     is returned only if a price and a name were extracted and the "Mfg Part"
     value on the page either equals ``meta['mfrgid']`` or is one of the
     space-separated words of ``meta['name']``.  Otherwise ``None`` is
     returned.
     """
     if not isinstance(response, HtmlResponse):
         return
     hxs = HtmlXPathSelector(response)
     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     product_loader.add_xpath(
         'price',
         '//div[@class="club"]/span[@itemprop="Price"]/text()',
         re='.*\$(.*[0-9])')
     product_loader.add_value('url', response.url)
     product_loader.add_value('sku', response.meta['sku'])
     product_loader.add_value('identifier', response.meta['sku'].lower())
     if not product_loader.get_output_value('price'):
         return
     mfrgid = response.meta['mfrgid']
     if not product_loader.get_output_value('name'):
         return

     spec_texts = hxs.select(
         u'//p[@class="specs" and child::span[contains(text(),"Mfg Part")]]/text()'
     ).extract()
     if len(spec_texts) < 2:
         return
     # Strip once so that BOTH comparisons below use the clean value; the
     # word-membership test used to see the raw, whitespace-padded text.
     site_mfrgid = spec_texts[1].strip()
     name_words = response.meta['name'].split(' ')
     if site_mfrgid and (mfrgid == site_mfrgid or site_mfrgid in name_words):
         return product_loader.load_item()

示例#5

0

显示文件

文件： streetsideauto.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield the product when the page's part number matches the request.

        ``response.meta`` must carry ``sku`` and ``mfrgid``.  The item is
        yielded only if the "Part No." shown on the page equals
        ``meta['mfrgid']`` and a price was extracted.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), response=response)

        product_loader.add_xpath('price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
        if not product_loader.get_output_value('price'):
            # Fall back to the alternative price markup.
            product_loader.add_xpath('price', u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier', response.meta['sku'].lower())

        names = hxs.select(u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()').extract()
        part_numbers = hxs.select(u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
        if not names or not part_numbers:
            # Without a title or a part number the page cannot be matched;
            # skip it instead of failing with an IndexError.
            return
        product_loader.add_value('name', names[0].strip())

        mfrgid = response.meta['mfrgid']
        if part_numbers[0] == mfrgid and product_loader.get_output_value('price'):
            yield product_loader.load_item()

示例#6

0

显示文件

文件： lockhart.py 项目： oceancloud82/scraping

    def parse_brand_list(self, response):
        """Yield the products shown on a brand listing page.

        Each grid entry is loaded into a ``Product``.  When no brand could be
        loaded for an entry, a request for its detail page is yielded instead
        (handled by ``self.parse_brand``) with the partially filled item in
        ``meta['item']``.
        """
        hxs = HtmlXPathSelector(response)

        # products
        product_items = hxs.select('//div[@class="productGrid"]/ul/li/div[@class="item"]')
        category_items = hxs.select('//h1[@class="categoryLandingPageTitle_heading"]/a/text()').extract()
        category = category_items[0] if category_items else ''
        brand_name = get_brand_from_url(response.url)

        def get_full_image_url(url):
            # Bound to the current response so it can be used as a processor.
            return get_full_url(response, url)

        for product_item in product_items:

            image_url = product_item.select(u'div[@class="prodimg"]/a/img/@src').extract()
            if image_url:
                image_url = get_full_url(response, image_url[0])

            ploadr = ProductLoader(item=Product(), selector=product_item, response=response)

            ploadr.add_xpath('name',
                             'div[@class="prodname"]/a/text()',
                             TakeFirst(), Compose(unicode.strip))
            ploadr.add_xpath('url', 'div[@class="prodname"]/a/@href',
                             TakeFirst(), Compose(unicode.strip), Compose(get_full_image_url))
            ploadr.add_value('category', category)
            ploadr.add_value('image_url', image_url)

            price = ploadr.get_xpath('div[@class="proddetails"]//div[@class="prodnowprice"]/span/text()',
                                     TakeFirst(), Compose(extract_price))
            price_excl_vat = Decimal(price)

            ploadr.add_value('price', price_excl_vat)

            # Orders below 50 carry a flat 5.00 shipping charge.
            ploadr.add_value('shipping_cost', Decimal('5.00') if price_excl_vat < 50 else Decimal('0.0'))
            ploadr.add_xpath('sku',
                             'div[@class="proddetails"]//div[@class="proditemcode"]/a/span/following-sibling::text()',
                             TakeFirst(), Compose(unicode.strip))

            ploadr.add_value('identifier', ploadr.get_output_value('sku'))
            stock_info = product_item.select(u'div[@class="proddetails"]/div/div/span[contains(@class, "instock")]/@class').extract()
            buy_button = product_item.select(u'div[@class="proddetails"]/div[@class="prodquickbuy"]/a[@class="primaryBtn"]').extract()

            ploadr.add_value('brand', brand_name)

            ploadr.add_value('stock', 1 if stock_info or buy_button else 0)

            item = ploadr.load_item()

            # The leading "." keeps the query inside this product's node.  A
            # bare "//" is evaluated against the whole document even on a
            # sub-selector, which gave every item the page's last code.
            tmp = ''.join(product_item.select(".//div[@class='proditemcode']//text()").extract())
            item['metadata'] = {'product_code': tmp.split(':')[-1].strip()}

            if not ploadr.get_output_value('brand'):
                yield Request(item['url'], meta={'item': item}, callback=self.parse_brand)
            else:
                yield item

示例#7

0

显示文件

    def parse_product(self, response):
        """Yield the product if it matches the requested SKU and part number.

        Two page layouts are handled: a search-result list (only the first
        result is considered) and, when there are no results, a single
        product page.  In both cases the item is yielded only if
        ``meta['mfrgid']`` occurs in the part number shown on the page and at
        least one word of the SKU occurs in the product name (both compared
        case-insensitively).
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[@class="summaryboxsearch"]')
        for product in products[0:1]:  # extract only the first product
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath(
                'price', u'.//span[@class="floatl sli_price"]/text()')
            product_loader.add_xpath('url',
                                     u'.//p[@class="mtext nobreak"]/a/@title')
            product_loader.add_value('sku', response.meta['sku'])
            product_loader.add_value('identifier',
                                     response.meta['sku'].lower())
            product_loader.add_xpath('name',
                                     u'.//p[@class="mtext nobreak"]/a/text()')
            # A missing name is treated as empty so that nothing matches,
            # instead of raising on ``None.lower()``.
            name = (product_loader.get_output_value('name') or u'').lower()
            sku_words = product_loader.get_output_value('sku').lower().split(' ')
            # Build a real list: on Python 3 ``filter()`` returns an iterator
            # that is always truthy, which would defeat the test below.
            sku = [word for word in sku_words if word != '' and word in name]
            site_mfrgid = product.select(
                './/span[@class="floatl sli_grid_code"]/text()').extract()
            if site_mfrgid:
                mfrgid = response.meta['mfrgid'].lower()
                site_mfrgid = site_mfrgid[0].strip().lower()
                if mfrgid in site_mfrgid and sku:
                    yield product_loader.load_item()

        if not products:
            # No result list: the response is a product detail page.
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_xpath('price',
                                     u'//p[@class="strong"]/span/text()')
            product_loader.add_value('url', response.url)
            product_loader.add_value('sku', response.meta['sku'])
            product_loader.add_value('identifier',
                                     response.meta['sku'].lower())
            product_loader.add_xpath(
                'name', u'//div[@class="indentl orderbox"]//h1/text()')
            name = (product_loader.get_output_value('name') or u'').lower()
            sku_words = product_loader.get_output_value('sku').lower().split(' ')
            sku = [word for word in sku_words if word != '' and word in name]
            site_mfrgid = hxs.select(
                '//div[@class="indentl orderbox"]/div[@class="floatl"]/p/strong/text()'
            ).extract()
            if site_mfrgid:
                site_mfrgid = site_mfrgid[0].strip().lower()
                mfrgid = response.meta['mfrgid'].lower()
                if mfrgid in site_mfrgid and sku:
                    yield product_loader.load_item()

示例#8

0

显示文件

文件： oponeo.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Load a product from its detail page.

        If the page has no ``form1`` form (the source of the identifier), a
        retry request from ``self.retry_request`` is yielded instead.  The
        item is yielded only when ``self.exclude_word`` does not occur in the
        product name.
        """
        loader = ProductLoader(item=Product(), selector=response)
        loader.add_value('url', response.url)

        images = response.xpath('//img[@itemprop="image"]/@src').extract()
        if images:
            loader.add_value('image_url', response.urljoin(images[0]))

        form_actions = response.xpath('//form[@name="form1"]/@action').extract()
        if not form_actions:
            yield self.retry_request(response)
            return
        loader.add_value('identifier', form_actions[0])

        main_price = response.xpath(
            '//*[@class="price"]/*[@class="mainPrice"]/text()')[0].extract()
        loader.add_value('price', main_price)

        stock_levels = response.xpath(
            '//div[@class="stockLevel"]//text()').re(r'(\d+)')
        if stock_levels:
            loader.add_value('stock', stock_levels[0])

        # Prefer the microdata brand; fall back to the hidden producer input.
        brands = (
            response.xpath('//*[@itemprop="brand"]/@content').extract()
            or response.xpath(
                '//div[@class="hidden"]/input[@class="producerName"]/@value'
            ).extract())
        if brands:
            loader.add_value('brand', brands[0].strip())

        if 'category' not in response.meta:
            # No category supplied by the caller: reuse the brand.
            loader.add_value('category', loader.get_output_value('brand'))
        elif response.meta['category'] != 'Car tyres':
            loader.add_value('category', response.meta['category'])
        else:
            # For car tyres the category is the "Type:" shown on the page.
            tyre_types = response.xpath(
                '//dt[contains(text(), "Type:")]/following-sibling::dd/text()'
            ).extract()
            if tyre_types:
                loader.add_value('category', tyre_types[0].strip())

        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('sku', '//*[@itemprop="sku"]/@content')

        if self.exclude_word not in loader.get_output_value('name'):
            yield loader.load_item()

示例#9

0

显示文件

文件： otelo.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield variant-page requests and, for single products, the item.

        Every link in the variant table is requested.  The page's own item is
        yielded only when the page has a title, a price and no variant links.
        The name is composed as ``[brand ]title[ (sku)]``.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        variant_urls = hxs.select(u'//table[@id="ListeSkuGroupTableGauche"]//a/@href').extract()
        for variant_url in variant_urls:
            yield Request(variant_url)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)

        titles = hxs.select(u'//h1[@id="sku_Title"]/text()').extract()
        if not titles:
            return

        full_name = titles[0].strip()
        brands = hxs.select(u'//h1[@id="sku_Title"]/span[@id="sku_Brand"]/text()').extract()
        if brands:
            full_name = brands[0] + ' ' + full_name
        references = hxs.select(u'//div[@class="sku_TP_TD"]/div[@class="sku_TP_SKU"]/text()').extract()
        if references:
            full_name += ' (' + references[0].strip() + ')'
        product_loader.add_value('name', full_name)

        prices = hxs.select(u'//div[@id="sku_ZonePriceNormal"]//div[@id="sku_ZPN_HT"]/text()').re(u'([\d\.,]+)')
        if not prices:
            return
        # Decimal comma -> decimal point.
        product_loader.add_value('price', re.sub(',', '.', prices[0]))

        if variant_urls or not product_loader.get_output_value('name'):
            return
        yield product_loader.load_item()

示例#10

0

显示文件

    def parse_product(self, response):
        """Yield one item per product on a listing page, retrying empty pages.

        When the page contains no product blocks the same URL is requested
        once more (tracked through ``meta['retries']``).
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        products = hxs.select(
            u'//form/div[contains(@class,"highlightProduits hproduct")]')
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            url = product.select(u'.//h3/a[@class="item url"]/@href').extract()
            url = urljoin_rfc(get_base_url(response), url[0])
            product_loader.add_value('url', url)
            product_loader.add_xpath('name',
                                     u'.//h3/a[@class="item url"]/text()')
            product_loader.add_xpath('price',
                                     u'.//p[@class="price"]/text()',
                                     re=u'([0-9\.]+)')
            if not product_loader.get_output_value('price'):
                # Fall back to any element whose class merely contains "price".
                product_loader.add_xpath(
                    'price',
                    u'.//p[contains(@class,"price")]/text()',
                    re=u'([0-9\.]+)')
            yield product_loader.load_item()

        # This check must live OUTSIDE the loop: nested in the loop body it
        # could only run when ``products`` was non-empty, i.e. never.
        if not products:
            log.msg('Retrying url: %s' % response.url, level=log.WARNING)
            retries = response.meta.get('retries', 0)
            if retries < 1:
                # NOTE(review): no callback is given, so the retry goes to the
                # spider's default callback - confirm that is intended.
                yield Request(response.url,
                              dont_filter=True,
                              meta={'retries': retries + 1})

示例#11

0

显示文件

文件： outillage2000_spider.py 项目： 0--key/lib

	def parse_subproduct(self, subprod):
		"""Yield a product built from one row of a sub-product table.

		The URL and name come from the link in the first cell and the price
		from the third cell (or from its <ins> child when the cell's own text
		is empty).  Nothing is yielded when no name could be loaded.
		"""
		if not isinstance(subprod, HtmlXPathSelector):
			return

		def cell_text(xpath):
			# Joined text of every node matched by *xpath* within the row.
			return join(subprod.select(xpath).extract())

		def tidy_price(raw):
			# Use '.' as decimal separator and drop the u'\xe2' character
			# (the original author's comment calls this the euro sign).
			return raw.replace(u',',u'.').replace(u'\xe2',u"").strip()

		link = cell_text(u'td[1]/a/@href')
		title = cell_text(u'td[1]/a/@title')
		amount = tidy_price(cell_text(u'td[3]/text()'))

		# A discounted price is rendered in a nested element instead.
		if not amount:
			amount = tidy_price(cell_text(u'td[3]/ins/text()'))

		# The title attribute may contain markup; keep only the text.
		title = re.sub('<[^<]+?>', '', title)

		product_loader = ProductLoader(item=Product(), selector=subprod)
		product_loader.add_value('name', title)
		product_loader.add_value('url', link)
		product_loader.add_value('price', amount)
		if not product_loader.get_output_value('name'):
			return
		yield product_loader.load_item()

示例#12

0

显示文件

文件： stephensons_com.py 项目： oceancloud82/scraping

    def parse_category(self, response):
        """Crawl a category page: sub-categories, products and empty-page retry.

        * Every sub-category link is followed with this same callback.
        * Each listed product is loaded; if its identifier is present in
          ``self.products_cache`` the cached ``sku`` and ``image_url`` are
          copied in and the item is yielded, otherwise the product page is
          requested (handled by ``self.parse_product``).
        * A page without products is re-requested up to three times, counted
          in ``meta['retry']``.
        """
        # more categories
        categories = response.xpath(
            u'//div[@id="subcategories"]/ul/li//a[1]/@href').extract()
        for category in categories:
            url = response.urljoin(category)
            yield Request(url, callback=self.parse_category)

        # products
        products = response.xpath(u'//ul[@id="product_list"]/li')
        # Breadcrumb links (except the first) plus the last breadcrumb text
        # node, de-duplicated through a set.
        products_category = list(set(
            response.xpath(u'//div[@class="breadcrumb" and position()=1]/a[not(position()=1)]/text()').extract() +
            [response.xpath(u'//div[@class="breadcrumb" and position()=1]//text()').extract()[-1]]))
        for product_xs in products:
            pack_price = product_xs.xpath(
                './/span[@class="price-pack"]//text()').re(r'[\d\,.]+')
            price = product_xs.xpath('.//span[@class="price"]/text()').re(
                r'[\d\,.]+')
            loader = ProductLoader(item=Product(), selector=product_xs)
            loader.add_xpath('identifier', './/h3/a/@href', re=r'/(\d+)-')
            loader.add_xpath('name', './/h3/a/text()')
            loader.add_xpath('url', './/h3/a/@href')
            loader.add_value('category', products_category)
            # The pack price, when shown, takes precedence over the unit price.
            loader.add_value('price',
                             pack_price[-1] if pack_price else price[-1])
            price = loader.get_output_value('price')
            if price:
                loader.add_value(
                    'shipping_cost',
                    Decimal('4.95')
                    if price < Decimal('50') else Decimal('0.0'))
            in_stock = bool(
                product_xs.xpath(
                    './/img[@class="ticky-tick" and (@alt="product in stock" or contains(@alt, "days"))]'
                ))
            loader.add_value('stock', 1 if in_stock else 0)

            # Load once, then attach the metadata; a second load_item() after
            # this point was redundant at best.
            item = loader.load_item()
            item['metadata'] = {'product_code': item['identifier']}

            if item['identifier'] in self.products_cache:
                self.log('Product found in cache => %s' % item['identifier'])
                cached = self.products_cache[item['identifier']]
                item['sku'] = cached['sku']
                item['image_url'] = cached['image_url']
                yield item
            else:
                self.log('Product NOT found in cache => %s' %
                         item['identifier'])
                yield Request(item['url'], callback=self.parse_product)

        if not products:
            meta = response.meta.copy()
            meta['retry'] = meta.get('retry', 0)
            if meta['retry'] < 3:
                meta['retry'] += 1
                self.log('>>> RETRY %d => %s' %
                         (meta['retry'], response.request.url))
                # dont_filter is required: this URL has already been seen, so
                # the duplicate filter would otherwise drop the retry.  The
                # callback routes the retried page back to this method.
                yield Request(response.request.url,
                              meta=meta,
                              callback=self.parse_category,
                              dont_filter=True)

示例#13

0

显示文件

文件： lockhart.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Load a single product from its detail page and yield it.

        The price defaults to ``'0.00'`` when no price node is found.  The
        product code shown on the page is stored under
        ``item['metadata']['product_code']``.
        """
        selector = HtmlXPathSelector(response)

        title = selector.select('//div[@class="productDetail_name_and_description"]/h1/text()')[0].extract().strip()
        product_code = selector.select('//input[@name="productCode"]/@value')[0].extract()
        images = selector.select('//img[@id="zoom"]/@src').extract()
        breadcrumbs = selector.select('//div[@id="breadcrumbs"]/a[not(@class)]/text()').extract()
        brands = selector.select('//div[@class="productDetail_tab_content"]//p/text()').re('Brand: (.*)')

        # The current price is rendered either in a <span> or in a <div>.
        price_nodes = (
            selector.select('//div[@class="productDetail_main_pricelist"]/span[@id="now_price"]/text()')
            or selector.select('//div[@class="productDetail_main_pricelist"]/div[@id="now_price"]/text()'))
        if price_nodes:
            price_text = price_nodes.re('[\.\d,]+')[0].strip().replace(',', '')
        else:
            price_text = '0.00'
        basket_buttons = selector.select('//input[@class="primaryBasket"]').extract()
        price_excl_vat = Decimal(price_text)

        # Orders below 50 carry a flat 5.00 shipping charge.
        if price_excl_vat < 50:
            shipping_cost = Decimal('5.00')
        else:
            shipping_cost = Decimal('0.0')

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', title)
        loader.add_value('url', response.url)
        if images:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), images[0]))
        loader.add_value('sku', product_code)
        loader.add_value('identifier', loader.get_output_value('sku'))
        loader.add_value('price', price_excl_vat)
        if breadcrumbs:
            loader.add_value('category', breadcrumbs[-1])
        if brands:
            loader.add_value('brand', brands[0].strip())
        loader.add_value('shipping_cost', shipping_cost)
        loader.add_value('stock', 1 if basket_buttons else 0)
        item = loader.load_item()

        code_texts = selector.select("//div[@class='productDetail_item_code']/text()").extract()
        item['metadata'] = {'product_code': code_texts[0].split(':')[-1].strip()}

        yield item

示例#14

0

显示文件

文件： vitacost.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield listed products, deferring to the cart when no price is shown.

        For a row whose price cannot be read from the listing, a request for
        the row's cart link is yielded instead (handled by
        ``self.parse_cart``) with the half-filled loader in
        ``meta['product_loader']``.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)

        for row in selector.select(u'//div[@class="pt9P cf clear"]'):
            loader = ProductLoader(item=Product(), selector=row)

            href = row.select(u'.//a[@class="pNameM cf"]/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))
            loader.add_xpath('name', u'.//a[@class="pNameM cf"]/text()')
            loader.add_xpath('price',
                             u'.//div[contains(@class,"pOurPrice")]/text()',
                             re=u'\$(.*)')

            if loader.get_output_value('price'):
                yield loader.load_item()
                continue

            # Price hidden on the listing: fetch it through the cart link,
            # using a cookie-less request.
            cart_links = row.select(
                u'.//div[@class="pt0PBtns"]/a[child::img]/@href').extract()
            cart_url = urljoin_rfc(get_base_url(response), cart_links[0])
            yield Request(cart_url,
                          callback=self.parse_cart,
                          cookies={},
                          meta={'dont_merge_cookies': True,
                                'product_loader': loader})

示例#15

0

显示文件

文件： whsmithcouk.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Load a single product from its detail page and yield it.

        Pages without a product title or without a product identifier are
        logged and skipped.  ``stock`` is set (to 0) only when the page does
        not report the product as "In Stock".
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        names = hxs.select('//h1[@id="product_title"]/text()').extract()
        if not names:
            # product not found. Just continue
            self.log('WARNING: Product not found => %s' % response.url)
            return
        name = names[0].strip()

        # Only the last stock-status text node is inspected.
        stock_status = hxs.select('//p[@id="stock_status"]/text()').extract()
        in_stock = bool(stock_status) and "In Stock" in stock_status[-1]

        category = hxs.select(
            '//ul[@id="crumbs"]/li[@class="last"]/a/text()').extract()

        brand = hxs.select(
            '//div[@id="product_title_container"]/span[@class="secondary"]/text()'
        ).extract()

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url', '//img[@id="main_image"]/@src',
                         TakeFirst(), Compose(lambda v: urljoin(base_url, v)))
        loader.add_xpath(
            'price',
            '//div[@class="product_price"]/span[@class="price"]/text()',
            TakeFirst(),
            re="([.0-9]+)")
        if not loader.get_output_value('price'):
            loader.add_value('price', 0)

        if category:
            loader.add_value('category', category[0].strip())

        # The SKU is the run of three or more digits ending the name, if any.
        loader.add_value('sku', name, TakeFirst(), re='(\d\d\d+)\s*$')

        if brand:
            loader.add_value('brand', brand[0].strip())

        identifier = hxs.select('//input[@name="ProductID"]/@value').extract()
        if not identifier:
            identifier = hxs.select('//li[@itemprop="id"]/text()').extract()
        if not identifier:
            # Neither source provided an id: skip with a log line rather than
            # failing with an IndexError below.
            self.log('WARNING: Product identifier not found => %s' % response.url)
            return

        loader.add_value('identifier', identifier[0])

        if not in_stock:
            loader.add_value('stock', 0)

        yield loader.load_item()

示例#16

0

显示文件

    def parse_product(self, response):
        """Complete a product from its detail page and yield it.

        A loader passed in ``meta['loader']`` is reused; otherwise a new one
        is filled from the page.  The identifier comes from the hidden
        ``Product_Code`` input or, failing that, from the URL.  Shipping is
        ``'0.00'`` for prices of 75.00 and above and ``'5.00'`` below; when
        the price cannot be converted to a number nothing is yielded.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        if 'loader' in response.meta:
            loader = response.meta['loader']
        else:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_xpath('name', '//*[@itemprop="name"]/text()')
            loader.add_xpath('price', '//*[@itemprop="price"]/@content')
            loader.add_value('category', response.meta.get('category', ''))

        try:
            identifier = hxs.select(
                '//input[@type="hidden" and @name="Product_Code"]/@value'
            )[0].extract()
        except IndexError:
            # Only "input not present" is expected here; anything else is a
            # real error and should surface.
            identifier = ''

        if not identifier:
            identifier = re.search(r'product/(.*).html$',
                                   response.url).group(1)

        loader.add_value('identifier', identifier)

        # The image URL sits two lines below the '"image_data":' marker in
        # the raw page source.  Split the body once and reuse the lines.
        image_url = ''
        body_lines = response.body.split('\n')
        for line_no, line in enumerate(body_lines):
            if '"image_data":' in line:
                image_url = body_lines[line_no + 2].replace('\\', '')[1:-2]
                break

        if image_url:
            image_url = urljoin_rfc(base_url, image_url)
            loader.add_value('image_url', image_url)

        out_of_stock = hxs.select(
            '//p[@class="notifications"]//strong[contains(text(),"On backorder")]'
        )
        if out_of_stock:
            loader.add_value('stock', 0)

        try:
            price = float(loader.get_output_value('price'))
        except (TypeError, ValueError):
            # Missing (None) or non-numeric price: skip the product.
            return
        loader.add_value('shipping_cost', '0.00' if price >= 75.00 else '5.00')

        yield loader.load_item()

示例#17

0

显示文件

文件： campingworld_americanrv.py 项目： 0--key/lib

 def parse_product(self, response):
     """Return the product item when the page's manufacturer part matches.

     ``response.meta`` must carry ``sku``, ``mfrgid`` and ``name``.  The item
     is returned only if a price and a name were extracted and the "Mfg Part"
     value on the page either equals ``meta['mfrgid']`` or is one of the
     space-separated words of ``meta['name']``.  Otherwise ``None`` is
     returned.
     """
     if not isinstance(response, HtmlResponse):
         return
     hxs = HtmlXPathSelector(response)
     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     product_loader.add_xpath('price', '//div[@class="club"]/span[@itemprop="Price"]/text()',
                              re='.*\$(.*[0-9])')
     product_loader.add_value('url', response.url)
     product_loader.add_value('sku', response.meta['sku'])
     product_loader.add_value('identifier', response.meta['sku'].lower())
     if not product_loader.get_output_value('price'):
         return
     mfrgid = response.meta['mfrgid']
     if not product_loader.get_output_value('name'):
         return

     spec_texts = hxs.select(u'//p[@class="specs" and child::span[contains(text(),"Mfg Part")]]/text()').extract()
     if len(spec_texts) < 2:
         return
     # Strip once so that BOTH comparisons below use the clean value; the
     # word-membership test used to see the raw, whitespace-padded text.
     site_mfrgid = spec_texts[1].strip()
     name_words = response.meta['name'].split(' ')
     if site_mfrgid and (mfrgid == site_mfrgid or site_mfrgid in name_words):
         return product_loader.load_item()

示例#18

0

显示文件

文件： chapmansangling_spider.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield one item per option row, or a single item when there are none.

        Category, image and brand are read once from the page and shared by
        all yielded items.  With an options table, each row after the header
        becomes its own product whose name is
        ``"<page name> <identifier> <option text>"``.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        category = hxs.select('//div[@id="crumblinks"]//a/text()').extract()
        category = category[-1] if category else ''
        image_url = hxs.select('//img[@id="product-big"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

        # The brand is taken from the "search" query parameter of the brand
        # image's link.
        product_brand = ''
        brand_url = hxs.select(
            '//div[@class="description"]//img[@alt="Brand Image"]/parent::a/@href'
        ).extract()
        if brand_url:
            brand_url = urljoin_rfc(base_url, brand_url[0])
            product_brand = url_query_parameter(brand_url, 'search')

        name = hxs.select("//h1[@class='coarse']/text()")[0].extract().strip()
        # Skip the first table row (header).
        options = hxs.select('//div[@class="generated"]/table/tr')[1:]
        if options:
            # options
            for option in options:
                name2 = option.select('./td[position()=4]/text()')
                name2 = name2[0].extract().strip() if name2 else ''
                # The price is the second-to-last text node of the row.
                price = option.select('.//td/text()').extract()[-2].strip()
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_xpath('identifier', './td[position()=2]/text()')
                loader.add_xpath('sku', './td[position()=3]/text()')
                loader.add_value('url', response.url)
                loader.add_value(
                    'name', name + ' %s %s' %
                    (loader.get_output_value('identifier'), name2))
                loader.add_value('price', price)
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                loader.add_value('brand', product_brand)
                yield loader.load_item()
        else:
            price = "".join(
                hxs.select(".//span[@class='bigprice']/text()").re(
                    r'([0-9\,\. ]+)')).strip()
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('identifier', response.url)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            # NOTE(review): this row-relative XPath is evaluated against the
            # whole page here and looks unlikely to match - confirm.
            loader.add_xpath('sku', './td[position()=2]/text()')
            loader.add_value('brand', product_brand)
            yield loader.load_item()

示例#19

0

显示文件

文件： etrailer.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield the product when it matches the requested SKU and maker code.

        Only the first ``summaryboxsearch`` result is considered; when the
        page has no such result the page itself is treated as the product.
        In both cases the item is yielded only if ``response.meta['mfrgid']``
        (lower-cased) occurs in the code shown on the site and at least one
        non-empty word of the SKU occurs in the lower-cased product name.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[@class="summaryboxsearch"]')
        for product in products[0:1]:  # extract only the first product
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath('price', u'.//span[@class="floatl sli_price"]/text()')
            product_loader.add_xpath('url', u'.//p[@class="mtext nobreak"]/a/@title')
            product_loader.add_value('sku', response.meta['sku'])
            product_loader.add_value('identifier', response.meta['sku'].lower())
            product_loader.add_xpath('name', u'.//p[@class="mtext nobreak"]/a/text()')
            name = product_loader.get_output_value('name').lower()
            sku_words = product_loader.get_output_value('sku').lower().split(' ')
            # Build a real list: a lazy filter object would always be truthy
            # on Python 3 and defeat the "and sku" check below.
            sku = [word for word in sku_words if word != '' and word in name]
            site_mfrgid = product.select('.//span[@class="floatl sli_grid_code"]/text()').extract()
            if site_mfrgid:
                mfrgid = response.meta['mfrgid'].lower()
                site_mfrgid = site_mfrgid[0].strip().lower()
                if mfrgid in site_mfrgid and sku:
                    yield product_loader.load_item()

        if not products:
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_xpath('price', u'//p[@class="strong"]/span/text()')
            product_loader.add_value('url', response.url)
            product_loader.add_value('sku', response.meta['sku'])
            product_loader.add_value('identifier', response.meta['sku'].lower())
            product_loader.add_xpath('name', u'//div[@class="indentl orderbox"]//h1/text()')
            name = product_loader.get_output_value('name').lower()
            sku_words = product_loader.get_output_value('sku').lower().split(' ')
            # Same list-based matching as for search results above.
            sku = [word for word in sku_words if word != '' and word in name]
            site_mfrgid = hxs.select('//div[@class="indentl orderbox"]/div[@class="floatl"]/p/strong/text()').extract()
            if site_mfrgid:
                site_mfrgid = site_mfrgid[0].strip().lower()
                mfrgid = response.meta['mfrgid'].lower()
                if mfrgid in site_mfrgid and sku:
                    yield product_loader.load_item()

示例#20

0

显示文件

文件： drugstore.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Follow the pager link and yield every listed product.

        A product is yielded only when the loader ends up with both a name
        and a price. Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        # Pager: an anchor that wraps an image inside the floated div.
        pager_links = hxs.select(
            u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href'
        ).extract()
        if pager_links:
            following = urljoin_rfc(get_base_url(response), pager_links[0])
            yield Request(following, callback=self.parse_product)

        for entry in hxs.select(u'//div[contains(@class,"itemGrid")]'):
            product_loader = ProductLoader(item=Product(), selector=entry)

            href = entry.select(u'.//a[@class="oesLink"]/@href').extract()[0]
            product_loader.add_value(
                'url', urljoin_rfc(get_base_url(response), href))

            # The displayed name is the link's <span> text followed by the
            # link's own text.
            lead = entry.select(
                u'.//a[@class="oesLink"]/span/text()').extract()[0]
            tail = entry.select(
                u'.//a[@class="oesLink"]/text()').extract()[0]
            product_loader.add_value('name', lead + ' ' + tail)

            # Two candidate price locations, both added to the same field.
            product_loader.add_xpath(
                'price',
                u'.//span[@class="PlistOfferPrice"]/text()',
                re=u'\$(.*)')
            product_loader.add_xpath(
                'price',
                u'.//div[@class="pricing"]/span/div/span/text()',
                re=u'\$(.*)')

            if (product_loader.get_output_value('name')
                    and product_loader.get_output_value('price')):
                yield product_loader.load_item()

示例#21

0

显示文件

文件： americanrv_crawler.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield the product on this page when it has a name and a price.

        The SKU is looked up in ``self.skus_dict`` under the MD5 hex digest
        of the stripped product name; the on-page product id is added to the
        same field afterwards. The identifier is the lower-cased SKU output.
        """
        if not isinstance(response, HtmlResponse):
            return

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//td[@class="page_headers"]/text()')
        product_loader.add_xpath(
            'price',
            u'//td[@class="price-info"]//div[@id="price" and @class="price"]/text()',
            re=u'\$(.*)')

        stripped_name = product_loader.get_output_value('name').strip()
        name_digest = hashlib.md5(stripped_name).hexdigest()
        product_loader.add_value('sku', self.skus_dict[name_digest])
        product_loader.add_xpath('sku', u'//span[@id="product_id"]/text()')

        identifier = product_loader.get_output_value('sku').lower()
        product_loader.add_value('identifier', identifier)

        has_name = product_loader.get_output_value('name')
        if has_name and product_loader.get_output_value('price'):
            yield product_loader.load_item()

示例#22

0

显示文件

文件： walgreens.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield every product container on the page that yields a price.

        The name is the link text plus the first info-box paragraph, unless
        the product id found in the URL has an entry in ``self.names``, which
        then replaces it.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for product in hxs.select(u'//div[contains(@class,"product-container")]'):
            product_loader = ProductLoader(item=Product(), selector=product)

            href = product.select(
                u'.//a[@class="SearchLinkBold"]/@href').extract()[0]
            url = urljoin_rfc(get_base_url(response), href)
            product_loader.add_value('url', url)

            name = product.select(
                u'.//a[@class="SearchLinkBold"]/text()').extract()[0]
            extras = product.select(
                u'.//div[contains(@class,"prod-info-box")]/p/text()').extract()
            if extras:
                name = name + ' ' + extras[0]

            id_match = re.search('ID=prod(\d+)', url)
            if id_match:
                prod_id = id_match.groups()[0]
                log.msg('Found ' + prod_id)
                name = self.names.get(prod_id, name)
            product_loader.add_value('name', name)

            # Price candidates, added in this order. The FSprice and Rprice
            # lookups are each issued twice, exactly as before.
            product_loader.add_xpath(
                'price', './/p[@class="empPrc"]/span[@class="FSprice"]/text()')
            product_loader.add_xpath(
                'price', './/p[@class="FSprice"]/text()', re=u'.*?or 1/\$(.*)')
            product_loader.add_xpath(
                'price', './/p[@class="FSprice"]/text()', re=u'.*?or 1/\$(.*)')
            product_loader.add_xpath('price', './/p[@class="Rprice"]/text()')
            product_loader.add_xpath('price', './/p[@class="Rprice"]/text()')

            if product_loader.get_output_value('price'):
                yield product_loader.load_item()

示例#23

0

显示文件

    def parse_product(self, response):
        """Build and return the product item for this page.

        The product code, when present, is used as the name; otherwise the
        name is read from the large product-name font element. Returns None
        for non-HTML responses.
        """
        if not isinstance(response, HtmlResponse):
            return

        product_loader = ProductLoader(item=Product(), response=response)

        product_loader.add_xpath(
            'price',
            '//font[@class="pricecolor colors_productprice"]/text()',
            re='.*\$(.*[0-9])')
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('sku', '//span[@class="product_code"]/text()')

        code = product_loader.get_output_value('sku')
        if not code:
            product_loader.add_xpath(
                'name',
                '//font[@class="productnamecolorLARGE colors_productname"]/text()')
        else:
            product_loader.add_value('name', code)

        return product_loader.load_item()

示例#24

0

显示文件

    def parse_product(self, response):
        """Yield the product described by the page's itemprop markup.

        SKU, brand and category come from ``response.meta``; the stock value
        is captured from the ``"stock": ...,`` fragment of the raw body.
        Shipping is '2.99' when the loaded price is below 10, else '0'.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('identifier', '//*[@itemprop="sku"]/text()')
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('price', '//*[@itemprop="price"]/@content')
        loader.add_xpath('image_url', '//*[@itemprop="image"]/@src')

        for field in ('sku', 'brand', 'category'):
            loader.add_value(field, response.meta[field])

        # NOTE(review): a body without the "stock" fragment makes this
        # search return None and the .group call fail -- confirm intended.
        stock_match = re.search('"stock": (.+),', response.body)
        loader.add_value('stock', stock_match.group(1))

        cheap = loader.get_output_value('price') < 10
        loader.add_value('shipping_cost', '2.99' if cheap else '0')
        yield loader.load_item()

示例#25

0

显示文件

    def parse_product(self, response):
        """Yield the main product and one item per associated product.

        The product id is taken from the ``productId=`` part of the cart (or
        notification) form action; pages without it are logged and skipped.
        Associated products listed in the form's ``data-model`` JSON are
        yielded as copies of the main item with their own identifier, name
        and price.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Explicit emptiness check instead of a bare ``except:`` so that
        # unrelated errors are no longer swallowed.
        product_ids = hxs.select(
            '//form[@class="add-to-cart" or @id="add-notification"]/@action'
        ).re('productId=(.*)')
        if not product_ids:
            self.log('No product_id found on %s' % response.url)
            return
        product_id = product_ids[0]

        image_url = hxs.select('//section[@id="product-image-viewer"]/div[@id="slider"]/ul[@class="slides"]//a[@class="fancy-box"]/@href').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', product_id)
        loader.add_xpath('sku', '//p[contains(text(), "Brand Code")]/text()', re=r': (.*)$')
        loader.add_xpath('name', '//h1/text()')
        loader.add_xpath('price', '//span[@class="product-price"]/span/text()', re=r'[\d,.]+')
        if not loader.get_output_value('price'):
            # Fall back to the red (text-red) price span.
            loader.add_xpath('price', '//span[@class="product-price"]/span[@class="text-red"]/text()', re=r'[\d,.]+')
        loader.add_value('url', response.url)
        # Only the last breadcrumb entry is used as the category.
        loader.add_xpath('category', '//nav[@id="breadcrumb"]/span/a/span/text()', lambda elms: elms[-1])
        loader.add_xpath('brand', '//a[@class="pull-right" and contains(@href, "Brands")]/img/@alt')
        out_of_stock = hxs.select('//div[@class="stock-status"]/span[@class="stock-status-circle out-of-stock"]')
        if out_of_stock:
            loader.add_value('stock', 0)
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        item = loader.load_item()

        yield item

        data_model = hxs.select('//form[@class="add-to-cart"]/@data-model').extract()
        if data_model:
            data = json.loads(data_model[0])
            if 'associatedProducts' in data:
                for option in data['associatedProducts']:
                    # Start from a copy of the main item and override the
                    # fields that differ per option.
                    loader = ProductLoader(item=Product(item), response=response)
                    loader.replace_value('identifier', option['id'])
                    # Drop the "(£..." suffix from each option value.
                    loader.replace_value('name', item['name'] + ' ' + ' '.join([o['value'].split(u'(\xa3')[0].strip() for o in option['fieldValues']]))
                    loader.replace_value('price', round(option['price'], 2))

                    yield loader.load_item()

示例#26

0

显示文件

文件： steamer.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product for this page, then requests for grouped items.

        Identifier, SKU, name and availability come from the page's schema
        data; the price is the first one found among the minimal, regular
        and special price boxes. Shipping is '0.0' from a price of 45.0
        upwards and '4.95' below that.
        """
        product_data = SpiderSchema(response).get_product()
        loader = ProductLoader(item=Product(), response=response)

        product_id = product_data['productID']
        loader.add_value('identifier', product_id)
        loader.add_value('sku', product_id)
        loader.add_value('name', product_data['name'])

        sold_out = bool(response.css('.product-shop .out-of-stock'))
        available = (not sold_out) and (
            'InStock' in product_data['offers']['properties']['availability'])
        loader.add_value('stock', 1 if available else 0)

        # The first breadcrumb entry is dropped.
        breadcrumb = response.css('.breadcrumbs').xpath('.//li/a/text()')
        loader.add_value('category', breadcrumb.extract()[1:])
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_xpath(
            'brand',
            '//th[@class="label" and contains(text(), "Brand")]/following-sibling::td/text()')

        # Try each price box in turn and keep the first non-empty match.
        price = None
        for price_css in ('.product-shop .price-box .minimal-price .price',
                          '.product-shop .price-box .regular-price .price',
                          '.product-shop .price-box .special-price .price'):
            price = response.css(price_css).xpath('text()').re_first(
                r'[\d\.,]+')
            if price:
                break
        loader.add_value('price', price)

        free_shipping = loader.get_output_value('price') >= Decimal('45.0')
        loader.add_value('shipping_cost', '0.0' if free_shipping else '4.95')

        yield loader.load_item()

        grouped_links = response.css(
            '.grouped-items-table-wrapper .name-wrapper').xpath('a/@href')
        for link in grouped_links.extract():
            yield Request(link, callback=self.parse_product)

示例#27

0

显示文件

    def parse_product(self, response):
        """Build and return the product item for this page.

        The special price and the plain ``h3`` price are both added to the
        price field. The SKU, when present, doubles as the name; otherwise
        the name is read from the page heading. Returns None for non-HTML
        responses.
        """
        if not isinstance(response, HtmlResponse):
            return

        product_loader = ProductLoader(item=Product(), response=response)

        product_loader.add_xpath(
            'price',
            '//div[@class="h3"]/span[@class="productSpecialPrice"]/text()',
            re='.*\$(.*)')
        product_loader.add_xpath(
            'price', '//div[@class="h3"]/text()', re='.*\$(.*[0-9])')
        product_loader.add_value('url', response.url)
        product_loader.add_xpath(
            'sku',
            '//div[@id="content"]/div[@id="right-column"]/span[@class="right"]/text()',
            re='-(.*)\]')

        code = product_loader.get_output_value('sku')
        if not code:
            product_loader.add_xpath(
                'name',
                '//div[@id="content"]/div[@id="right-column"]/h1[@class="bottom-border"]/text()')
        else:
            product_loader.add_value('name', code)

        return product_loader.load_item()

示例#28

0

显示文件

    def parse_product(self, response):
        """Request every grouped product and yield this page's product.

        The item is yielded only when the page has a title, a SKU and a
        price, the loader produced a name, and the page links to no grouped
        products.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        multiple_products = hxs.select(u'//table[@id="ListeSkuGroupTableGauche"]//a/@href').extract()
        for link in multiple_products:
            yield Request(link)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)

        title = hxs.select(u'//h1[@id="sku_Title"]/text()').extract()
        if not title:
            return

        brand = hxs.select(u'//h1[@id="sku_Title"]/span[@id="sku_Brand"]/text()').extract()
        if brand:
            name = brand[0] + ' ' + title[0].strip()
            product_loader.add_value('brand', brand[0].strip())
        else:
            name = title[0].strip()

        sku = hxs.select(u'//div[@class="sku_TP_TD"]/div[@class="sku_TP_SKU"]/text()').extract()
        if not sku:
            # Nothing below runs without a SKU.
            return

        sku_text = sku[0].strip()
        name += ' (' + sku_text + ')'
        product_loader.add_value('sku', sku_text)
        product_loader.add_value('name', name)
        # The identifier receives the raw extracted list, not the stripped text.
        product_loader.add_value('identifier', sku)
        # Category is the second-to-last breadcrumb link.
        product_loader.add_value('category', hxs.select('//a[@class="BreadCrumbLink"]/text()')[-2].extract())

        image = hxs.select('//img[@id="ChangePhoto"]/@src').extract()
        if image:
            product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), image[0]))

        shipping = hxs.select('//div[@id="sku_ZoneFLtxtFix"]//strong/text()').extract()
        if shipping:
            product_loader.add_value('shipping_cost', shipping[0].replace(',', '.'))

        price = hxs.select(u'//div[@id="sku_ZonePriceNormal"]//div[@id="sku_ZPN_HT"]/text()').re(u'([\d\.,]+)')
        if not price:
            return

        # Commas are replaced with dots before loading the price.
        product_loader.add_value('price', price[0].replace(',', '.'))
        if product_loader.get_output_value('name') and not multiple_products:
            yield product_loader.load_item()

示例#29

0

显示文件

文件： customform_pl.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product described by this page.

        The identifier is the part of the request URL's basename before the
        first '-'. Shipping is 25 when the loaded price is below 500 and 0
        otherwise; stock is 0 when the "Dostawa" shipping block has a
        feature value and 1 when it does not.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        breadcrumb = hxs.select(u'//div[@class="breadcrumb"]/a/text()').extract()
        category = breadcrumb[-1] if breadcrumb else ''

        # Stays an empty list when no big photo is present.
        image_url = hxs.select(
            u'//ul[@id="product_images"]/li/a//img[@class="big_photo"]/@src'
        ).extract()
        if image_url:
            image_url = urljoin_rfc(get_base_url(response), image_url[0])

        name = hxs.select(u'//h1/text()').extract()[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name.strip())
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('image_url', image_url)

        basename = os.path.basename(response.request.url)
        product_loader.add_value('identifier', basename.partition('-')[0])

        # The price is embedded in the second script of the centre column.
        product_loader.add_xpath('price',
                                 '//div[@id="center_column"]/script[2]/text()',
                                 re="var productPrice='([0-9.]+)'")
        if product_loader.get_output_value('price') < 500:
            shipping_cost = 25
        else:
            shipping_cost = 0
        product_loader.add_value('shipping_cost', shipping_cost)

        stock_option = hxs.select(
            u'//div[@class="shipping" and ./h2/text()="Dostawa"]/div[@class="feature_value"]/text()'
        )
        if stock_option:
            product_loader.add_value('stock', 0)
        else:
            product_loader.add_value('stock', 1)

        yield product_loader.load_item()

示例#30

0

显示文件

文件： walgreens.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield every product container on the page that yields a price.

        The name is the link text plus the first info-box paragraph, unless
        the product id found in the URL has an entry in ``self.names``, which
        then replaces it.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for product in hxs.select(u'//div[contains(@class,"product-container")]'):
            product_loader = ProductLoader(item=Product(), selector=product)

            href = product.select(u'.//a[@class="SearchLinkBold"]/@href').extract()[0]
            url = urljoin_rfc(get_base_url(response), href)
            product_loader.add_value('url', url)

            name = product.select(u'.//a[@class="SearchLinkBold"]/text()').extract()[0]
            extras = product.select(u'.//div[contains(@class,"prod-info-box")]/p/text()').extract()
            if extras:
                name = name + ' ' + extras[0]

            id_match = re.search('ID=prod(\d+)', url)
            if id_match:
                prod_id = id_match.groups()[0]
                log.msg('Found ' + prod_id)
                name = self.names.get(prod_id, name)
            product_loader.add_value('name', name)

            # Price candidates, added in this order. Each lookup is issued
            # twice, exactly as before.
            product_loader.add_xpath('price', './/p[@class="FSprice"]/text()', re=u'.*?or 1/\$(.*)')
            product_loader.add_xpath('price', './/p[@class="FSprice"]/text()', re=u'.*?or 1/\$(.*)')
            product_loader.add_xpath('price', './/p[@class="Rprice"]/text()')
            product_loader.add_xpath('price', './/p[@class="Rprice"]/text()')

            if product_loader.get_output_value('price'):
                yield product_loader.load_item()

示例#31

0

显示文件

    def parse_product(self, response):
        """Yield the product on this page, or record an error without a price.

        Name, category, SKU, product id and price are each taken from the
        last match of their XPath. Shipping of 99 is added when the loaded
        price is below 1990, and stock 0 when the "Skladem" marker is
        absent. Problems are appended to ``self.errors``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        page_url = urljoin(base_url, response.url)

        name = hxs.select('//div[@id="produktDET"]/div/div/h1[@class="or"]/text()')[-1].extract().strip()
        category = hxs.select('//div[@id="link"]/a/@title')[-1].extract().strip()
        sku = hxs.select('//span[@class="code"]/text()').extract()[-1].strip()
        pid = self.get_id(hxs.select('//div[@class="buy"]/a/@href')[-1].extract())
        price = hxs.select('//div[@class="pricebox"]/div/div/p[@class="prodCena"]/span/span[@class="actual_price"]/text()')[-1].extract()
        stock = hxs.select('//div[@class="prodRight"]/div/div/p[@class="makeGreen"][contains(text(), "Skladem")]')

        if not price:
            self.errors.append("No price set for url: '%s'" % page_url)
            return

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', page_url)
        loader.add_value('name', name)
        try:
            # The Compose step indexes the first match, so a page without a
            # main image raises IndexError here.
            loader.add_xpath('image_url', '//div[@class="mainImgCont"]/a/img/@src', Compose(lambda v: urljoin(base_url, v[0])))
        except IndexError:
            self.errors.append("No image set for url: '%s'" % page_url)
        loader.add_value('price', price.replace(' ', ''))
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', 'LEGO')

        if int(loader.get_output_value('price')) < 1990:
            loader.add_value('shipping_cost', 99)
        if not stock:
            loader.add_value('stock', 0)
        yield self.load_item_with_metadata(loader.load_item())

示例#32

0

显示文件

文件： outillage2000_spider.py 项目： 0--key/lib

	def parse_product(self, response):
		"""Yield the product on this page when a name could be loaded.

		The price is read from the ``productPrice`` div; when that yields
		nothing, the nested ``ins`` element is used instead. Commas in the
		price are replaced with dots.
		"""
		if not isinstance(response, HtmlResponse):
			return

		hxs = HtmlXPathSelector(response)
		url = response.url
		name = join(hxs.select(u'//h1[@id="titre_produit"]/text()').extract())
		price = join(hxs.select(u'//div[@id="productPrice"]/text()').extract())
		# Use '.' as decimal separator and drop the u'\xe2' character.
		# NOTE(review): presumably meant to remove the euro sign -- confirm.
		price = price.replace(u',', u'.').replace(u'\xe2', u"").strip()
		# If there is a discount the price is in another element.
		if not price:
			price = join(hxs.select(u'//div[@id="productPrice"]/ins/text()').extract())
			price = price.replace(u',', u'.').replace(u'\xe2', u"").strip()

		# Strip html tags from the name.
		name = re.sub('<[^<]+?>', '', name)
		# Hand the loader the page selector; the name string was passed here
		# before, which is not a selector.
		product_loader = ProductLoader(item=Product(), selector=hxs)
		product_loader.add_value('name', name)
		product_loader.add_value('url', url)
		product_loader.add_value('price', price)
		if product_loader.get_output_value('name'):
			yield product_loader.load_item()

示例#33

0

显示文件

文件： vitacost.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield listed products, or a cart request when the price is hidden.

        For a product without a visible price, a request to its cart link is
        issued with the partially filled loader stored in the request meta
        for ``self.parse_cart``.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for product in hxs.select(u'//div[@class="pt9P cf clear"]'):
            product_loader = ProductLoader(item=Product(), selector=product)
            href = product.select(u'.//a[@class="pNameM cf"]/@href').extract()[0]
            product_loader.add_value('url', urljoin_rfc(get_base_url(response), href))
            product_loader.add_xpath('name', u'.//a[@class="pNameM cf"]/text()')
            product_loader.add_xpath('price', u'.//div[contains(@class,"pOurPrice")]/text()',
                                     re=u'\$(.*)')

            if product_loader.get_output_value('price'):
                yield product_loader.load_item()
                continue

            cart_href = product.select(u'.//div[@class="pt0PBtns"]/a[child::img]/@href').extract()[0]
            cart_url = urljoin_rfc(get_base_url(response), cart_href)
            # Empty cookies plus dont_merge_cookies are set on the request.
            request = Request(cart_url, callback=self.parse_cart, cookies={},
                              meta={'dont_merge_cookies': True,
                                    'product_loader': product_loader})
            yield request

示例#34

0

显示文件

文件： leroymerlin.py 项目： 0--key/lib

    def parse_product(self, response):
        """Yield every highlighted product, retrying once on an empty page.

        When the page contains no product block, the same URL is requested
        again (bypassing the duplicate filter) unless ``response.meta``
        already records a retry.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        products = hxs.select(u'//form/div[contains(@class,"highlightProduits hproduct")]')
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            url = product.select(u'.//h3/a[@class="item url"]/@href').extract()
            url = urljoin_rfc(get_base_url(response), url[0])
            product_loader.add_value('url', url)
            product_loader.add_xpath('name', u'.//h3/a[@class="item url"]/text()')
            product_loader.add_xpath('price', u'.//p[@class="price"]/text()',
                                     re=u'([0-9\.]+)')
            if not product_loader.get_output_value('price'):
                # Fall back to any paragraph whose class contains "price".
                product_loader.add_xpath('price', u'.//p[contains(@class,"price")]/text()',
                                         re=u'([0-9\.]+)')
            yield product_loader.load_item()

        # This check used to sit inside the loop above, where ``products``
        # is never empty, so the retry could not trigger.
        if not products:
            retries = response.meta.get('retries', 0)
            if retries < 1:
                log.msg('Retrying url: %s' % response.url, level=log.WARNING)
                yield Request(response.url, dont_filter=True, meta={'retries': retries + 1})

示例#35

0

显示文件

文件： otelo.py 项目： ontiyonke/lib

    def parse_product(self, response):
        """Request every grouped product and yield this page's product.

        The name is "<brand> <title> (<sku>)", with brand and SKU included
        only when present. The item is yielded only when a price was found,
        the loader produced a name, and the page links to no grouped
        products.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        multiple_products = hxs.select(
            u'//table[@id="ListeSkuGroupTableGauche"]//a/@href').extract()
        for link in multiple_products:
            yield Request(link)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)

        title = hxs.select(u'//h1[@id="sku_Title"]/text()').extract()
        if not title:
            return

        name = title[0].strip()
        brand = hxs.select(
            u'//h1[@id="sku_Title"]/span[@id="sku_Brand"]/text()').extract()
        if brand:
            name = brand[0] + ' ' + name

        sku = hxs.select(
            u'//div[@class="sku_TP_TD"]/div[@class="sku_TP_SKU"]/text()'
        ).extract()
        if sku:
            name += ' (' + sku[0].strip() + ')'
        product_loader.add_value('name', name)

        price = hxs.select(
            u'//div[@id="sku_ZonePriceNormal"]//div[@id="sku_ZPN_HT"]/text()'
        ).re(u'([\d\.,]+)')
        if not price:
            return

        # Commas are replaced with dots before loading the price.
        product_loader.add_value('price', price[0].replace(',', '.'))
        if product_loader.get_output_value('name') and not multiple_products:
            yield product_loader.load_item()

示例#36

0

显示文件

文件： djkit.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product on this page unless it is B-stock or discontinued.

        Name, price, SKU, product id, image and category are read from the
        page; missing values are logged. A page without a product id yields
        nothing. Shipping is '5.50' when the loaded price is below 50.00 and
        '0.00' otherwise.
        """
        URL_BASE = 'http://www.djkit.com'
        hxs = HtmlXPathSelector(response)

        name = hxs.select('//*[@itemprop="name"]/text()').extract()
        if not name:
            self.log("ERROR name not found")
            name = ""
        else:
            name = name[0]

        name = name.strip()
        # Products whose name mentions B-STOCK (any case) are skipped.
        if 'B-STOCK' in name.upper():
            return

        # Prefer the discounted price; fall back to any itemprop price.
        price = hxs.select(
            '//span[@class="product-variation-value discount-value"]//*[@itemprop="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select('//*[@itemprop="price"]/text()').extract()
        if not price:
            # NOTE(review): with an empty price the float() conversion below
            # depends on what ProductLoader outputs for '' -- confirm.
            self.log("ERROR price not found")
            price = ""
        else:
            price = extract_price(price[0].strip())

        sku = hxs.select('//*[@itemprop="sku"]/strong/text()').extract()
        if not sku:
            sku = hxs.select('//*[@itemprop="sku"]/text()').extract()
        if not sku:
            # sku stays an empty list in this case.
            self.log("ERROR sku not found")
        else:
            sku = sku[0]

        product_id = hxs.select(
            '//*[@id="sub"]/input[@name="product"]/@value').extract()
        if not product_id:
            self.log("ERROR ID not found")
            return
        else:
            product_id = product_id[0]

        img_url = hxs.select('//img[@itemprop="image"]/@src').extract()
        if not img_url:
            # img_url stays an empty list in this case.
            self.log("ERROR img not found")
        else:
            img_url = urljoin_rfc(URL_BASE, img_url[0])

        # Only the last breadcrumb link is used as the category.
        category = hxs.select(
            '//div[@id="breadcrumbs"]/a[@class="breadlink"]/text()').extract()
        category = category[-1] if category else ''

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('sku', sku)
        loader.add_value('image_url', img_url)
        loader.add_value('identifier', product_id.strip())
        loader.add_value('category', category)

        shipping_cost = '5.50' if float(
            loader.get_output_value('price')) < 50.00 else '0.00'
        loader.add_value('shipping_cost', shipping_cost)

        # The checks below compare against whole extracted text nodes, not
        # substrings of them.
        stock = hxs.select(
            '//div[@class="delivery-availability"]//text()[normalize-space()]'
        ).extract()
        if 'DISCONTINUED' in stock:
            return
        if not ('In Stock' in stock or 'In stock' in stock):
            loader.add_value('stock', 0)
        yield loader.load_item()

示例#37

0

显示文件

文件： windowcleaningsupply.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Follow product links on a listing page or scrape one product page.

        The same callback handles both page kinds.  Product links found in
        the category grid are re-queued with this callback and nothing else
        is done for that page.  A page without such links but with an
        ``<h1>`` is treated as a product page: links inside its description
        are queued as sub products and the page itself is loaded as an item.

        :param response: the downloaded page; anything that is not an
            ``HtmlResponse`` is ignored.
        :returns: generator yielding ``Request`` objects and at most one
            loaded product item.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Listing page: queue every product shown in the category grid.
        listing_hrefs = hxs.select(
            '//div[@id="CategoryContent"]//div[@class="ProductDetails"]/strong/a/@href'
        ).extract()
        for href in listing_hrefs:
            yield Request(urljoin_rfc(base_url, href),
                          callback=self.parse_product,
                          meta=response.meta)
        if listing_hrefs:
            return

        headings = hxs.select('//h1/text()')
        if not headings:
            # Neither a listing nor a product page.
            return

        # Sub products are linked from the product description.
        description_hrefs = hxs.select(
            '//div[@class="ProductDescriptionContainer"]//a/@href').extract()
        for href in description_hrefs:
            yield Request(urljoin_rfc(base_url, href),
                          callback=self.parse_product,
                          meta=response.meta)

        name = headings[0].extract()
        upper_name = name.upper()
        if 'MSDS' in upper_name or 'ABC' in upper_name:
            return

        brand_texts = hxs.select(
            '//div[@class="DetailRow" and div[text()="Brand:"]]/div[@class="Value"]//text()[normalize-space()]'
        ).extract()
        image_srcs = hxs.select(
            '//meta[@property="og:image"]/@content').extract()
        price_texts = hxs.select(
            '//em[@class="ProductPrice VariationProductPrice"]/text()'
        ).extract()
        if price_texts:
            price = price_texts[0]
        else:
            price = '0.00'

        id_nodes = hxs.select(
            '//form[@id="productDetailsAddToCartForm"]//input[@type="hidden" and @name="product_id"]/@value'
        )
        if not id_nodes:
            log.msg('Product without identifier: ' + response.url)
            return
        identifier = id_nodes[0].extract()

        sku_texts = hxs.select('//div[@id="sku"]/text()').extract()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('identifier', identifier)
        # The SKU is optional: only a non-empty first text node is stored.
        if sku_texts and sku_texts[0]:
            loader.add_value('sku', sku_texts[0])
        loader.add_value('category', response.meta.get('category', ''))
        if brand_texts:
            loader.add_value('brand', brand_texts[0].strip())
        if image_srcs:
            loader.add_value('image_url', urljoin_rfc(base_url, image_srcs[0]))
        # A product whose price output is falsy is recorded with stock 0.
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        yield loader.load_item()

示例#38

0

显示文件

    def parse(self, response):
        """Collect the cheapest matching offer from a price-listing page.

        Category links are always followed.  When the listing table has no
        rows and ``meta['model_search']`` is set, a single fallback free-text
        search on ``meta['name']`` is issued instead.  Otherwise each row
        whose name matches ``meta['name']`` is loaded, and the offer with the
        lowest price is appended to ``self.items`` unless an equal item is
        already stored there.

        :param response: the listing page; ``response.meta`` must contain
            ``name`` and may contain ``sku``, ``identifier`` and
            ``model_search``.
        :returns: generator of follow-up ``Request`` objects; items are
            collected on ``self.items`` rather than yielded.
        """
        def normalise_amount(text):
            # Drop "." and turn "," into ".", e.g. "1.234,56" -> "1234.56".
            return text.strip().replace('.', '').replace(',', '.')

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta

        category_hrefs = hxs.select(
            '//div[@class="catsMI"]/div/a/@href').extract()
        for href in category_hrefs:
            yield Request(urljoin_rfc(base_url, href), meta=meta)

        rows = hxs.select('//table[@id="productlist-table"]/tbody/tr')
        if not rows and meta.get('model_search', False):
            # Nothing listed for the model search: retry once by name.
            query = meta['name'].replace(' ', '+')
            search_url = ('http://www.trovaprezzi.it/categoria.aspx?libera=' +
                          query + '&id=-1&prezzomin=&prezzomax=')
            meta['model_search'] = False
            yield Request(search_url, meta=meta)
            return

        category = hxs.select('//div[@id="divTitle"]/h1/text()').extract()[0]
        cheapest = None
        for row in rows:
            row_name = row.select(
                'td[@class="descCol"]/a/b/text()').extract()[0]
            if not self.match_name(meta['name'], row_name, match_threshold=70):
                continue

            loader = ProductLoader(item=Product(), selector=row)
            image_srcs = row.select(
                'td[@class="imgCol"]/a/img/@src').extract()
            if image_srcs:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_srcs[0]))
            else:
                loader.add_value('image_url', '')
            loader.add_xpath('dealer', 'td[@class="mercCol"]/a/img/@alt')
            loader.add_xpath('name', 'td[@class="descCol"]/a/b/text()')
            loader.add_value('category', category)
            loader.add_value('sku', meta.get('sku'))

            href = row.select('td[@class="descCol"]/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(base_url, href))

            price_text = row.select(
                'td[@class="prodListPrezzo"]/text()').extract()[0]
            loader.add_value('price', normalise_amount(price_text))

            delivery_text = row.select(
                'td[@class="prodListPrezzo"]/span[@class="deliveryCost nobr"]/text()'
            ).extract()[0]
            loader.add_value('shipping_cost', normalise_amount(delivery_text))
            loader.add_value('identifier', meta.get('identifier'))

            offer_price = loader.get_output_value('price')
            if not offer_price:
                continue
            # Keep whichever loader carries the lowest price seen so far.
            if cheapest is None or \
                    cheapest.get_output_value('price') > offer_price:
                cheapest = loader

        if cheapest:
            item = cheapest.load_item()
            if item not in self.items:
                self.items.append(item)

示例#39

0

显示文件

文件： oponeo.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Scrape one tyre page into a product item with tyre metadata.

        The page is re-requested through ``self.retry_request`` when the
        identifier form or the producer name is missing.  The page title,
        with the brand removed, is handed to ``parse_pattern``; the pattern
        name it returns becomes the product name while the cleaned title is
        also searched for the XL, run-flat and manufacturer-mark markers.

        :param response: the product page.
        :returns: generator yielding either a retry request or one product
            item; nothing is yielded when the title cannot be parsed or the
            product fails ``is_product_correct``.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)

        image_srcs = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_srcs:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_srcs[0]))

        form_actions = hxs.select('//form[@name="form1"]/@action').extract()
        if not form_actions:
            yield self.retry_request(response)
            return
        loader.add_value('identifier', form_actions[0])

        price_nodes = hxs.select(
            '//*[@class="price"]/*[@class="mainPrice"]/text()')
        loader.add_value('price', price_nodes[0].extract())
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        producer_names = hxs.select(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
        if not producer_names:
            yield self.retry_request(response)
            return
        brand = producer_names[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        # The brand is removed from the title spelled with a plain "e"
        # in place of U+0119.
        brand = re.sub(u'\u0119', u'e', brand)

        title = hxs.select(
            '//h1[@itemprop="name"]/text()')[0].extract().strip()
        title = re.sub(u'[:\u2122]', u'', title)
        product_name = title.replace(brand, '').strip()

        data = parse_pattern(product_name)
        if not data:
            message = 'ERROR parsing "{}" [{}]'.format(product_name,
                                                       response.url)
            log.msg(message)
            self.errors.append(message)
            return

        loader.add_value('name', data['Name'])

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''

        if 'XL' in product_name:
            metadata['xl'] = 'Yes'
        else:
            metadata['xl'] = 'No'

        lowered_name = product_name.lower()
        if 'run on flat' in lowered_name or 'run flat' in lowered_name:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        # The first known manufacturer mark that appears as a whole
        # space-separated word of the name wins.
        name_words = product_name.split(' ')
        found_marks = [
            mark for mark in self.all_man_marks.keys() if mark in name_words
        ]
        mark_key = found_marks[0].strip() if found_marks else ''
        if mark_key:
            metadata['manufacturer_mark'] = self.all_man_marks.get(
                mark_key, '')
        else:
            metadata['manufacturer_mark'] = ''

        size_parts = (metadata['width'], metadata['aspect_ratio'],
                      metadata['rim'], metadata['load_rating'],
                      metadata['speed_rating'])
        metadata['full_tyre_size'] = '/'.join(size_parts)

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            return

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed_rating = product['metadata']['speed_rating']
        # Prefer the looked-up alternative; otherwise keep the scraped rating
        # as the alternative only when it differs from the new one.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed_rating != new_speed_rating:
            alternative = scraped_speed_rating
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product

示例#40

0

显示文件

文件： petertyson.py 项目： 0--key/lib

    def _get_prices(self, price_response):
        unpriced = re.search('eBunpriced="(.*)"', price_response.body)
        if unpriced:
            unpriced = [prod_id for prod_id in unpriced.groups()[0].split(',') if prod_id.strip()]

        eBzp = [None] * 200
        eBzpp = [None] * 200
        eBzp_assignments = re.findall('(eBzp\[\d+\]=.*);', price_response.body)
        for assignment in eBzp_assignments:
            exec assignment.replace('eBop', "''").replace('eBspl', "'&'").replace('eBsp', "'&'")

        eBzpp_assignments = re.findall('(eBzpp\[\d+\]=.*);', price_response.body)
        for assignment in eBzpp_assignments:
            exec assignment.replace('eBop', "'&'").replace('eBspl', "'&'").replace('eBsp', "'&'")

        prices = {}

        for i, prod in enumerate(eBzp):
            if prod:
                prices[prod] = eBzpp[i]

        hxs = price_response.meta['hxs']
        main_name = hxs.select('//h1/text()').extract()[0].strip()

        products = hxs.select('//form[@id="eBvariant1"]//option')
        subprods = hxs.select('//div[@id="TabbedPanels1"]//em/strong[contains(text(), "//")]/text()').extract()

        if not products and subprods:
            subprods = subprods[0].split('//')
            for prod in subprods:
                r = prod.split(':')
                if len(r) == 2:
                    p = Product()
                    loader = ProductLoader(response=price_response.meta['main_response'], item=p)
                    loader.add_value('name', main_name + ' ' + r[0].strip())
                    loader.add_value('price', r[1])
                    loader.add_value('url', price_response.meta['main_response'].url)
                    yield loader.load_item()

            return

        if not products and prices:
            product_id = hxs.select('//span[@class="eBprice"]/@id').re('pP(.*)')
            if product_id:
                price = prices.get(product_id[0]) or eBzpp[0]
            else:
                price = eBzpp[0]

            p = Product()
            loader = ProductLoader(response=price_response.meta['main_response'], item=p)
            loader.add_value('name', main_name)
            loader.add_value('price', price)
            loader.add_value('url', price_response.meta['main_response'].url)
            yield loader.load_item()

        for product in products:
            subprods = product.select('./@value').extract()[0].split(',')
            if len(subprods) == 1 and subprods[0] in prices and subprods[0] not in unpriced:
                p = Product()
                loader = ProductLoader(response=price_response.meta['main_response'], item=p)
                subname = product.select('./text()').extract()[0].strip()
                loader.add_value('name', main_name + ' ' + subname)
                loader.add_value('price', prices[subprods[0]])
                loader.add_value('url', price_response.meta['main_response'].url)
                yield loader.load_item()

            elif len(subprods) > 1:
                subprods = subprods[1:]
                for i, subprod in enumerate(subprods):
                    if subprod in prices and subprod not in unpriced:
                        p = Product()
                        loader = ProductLoader(response=price_response.meta['main_response'], item=p)
                        loader.add_value('url', price_response.meta['main_response'].url)
                        first_subname = product.select('./text()').extract()[0].strip()
                        subname = subprods[i - 1].strip()
                        loader.add_value('name', unquote(main_name + ' ' + first_subname + ' ' + subname))
                        loader.add_value('price', prices[subprod])
                        yield loader.load_item()

        alternate_prices = hxs.select('//a[@class="green2"]')
        for alt in alternate_prices:
            subprods = alt.select('./following-sibling::em//text()').extract()
            for subprod in subprods:
                prod_data = subprod.split(':')
                if len(prod_data) > 1:
                    loader = ProductLoader(selector=alt, item=Product())
                    loader.add_value('url', price_response.meta['main_response'].url)
                    loader.add_value('name', main_name)
                    loader.add_value('name', prod_data[0])
                    loader.add_value('price', prod_data[1])
                    if not loader.get_output_value('price'):
                        continue

                    yield loader.load_item()

示例#41

0

显示文件

    def parse(self, response):
        """Convert a JSON tyre listing into product items with metadata.

        The response body is a JSON list of tyre records.  Width, aspect
        ratio and rim come from the ``row`` stored in ``response.meta``;
        everything else comes from the record itself.  Records whose
        ``winter`` flag is not ``'0'`` are skipped.

        :param response: JSON response; ``response.meta['row']`` must hold
            the keys ``Width``, ``Aspect Ratio`` and ``Rim``.
        :returns: generator of product items that pass
            ``is_product_correct``.
        """
        base_url = get_base_url(response)
        row = response.meta['row']
        for tyre in json.loads(response.body_as_unicode()):
            # skip winter tyres
            if tyre['winter'] != '0':
                continue

            loader = ProductLoader(item=Product(), selector=tyre)

            brand = tyre['tyreMake'].title()
            if 'goodrich' in brand.lower():
                brand = 'BFG'
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))

            load_rating = tyre['loadrating']
            speed_rating = tyre['tyreSpeed']
            loader.add_value('price', tyre['priceVat'])
            loader.add_value('identifier', tyre['id'])
            detail_url = urljoin('http://www.etyres.co.uk/tyre-detail/',
                                 tyre['URLString'])
            loader.add_value('url', detail_url)
            if tyre['tyreModelImage2']:
                image_path = 'images/' + tyre['tyreModelImage2']
                loader.add_value('image_url', urljoin(base_url, image_path))

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = speed_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            if tyre['tyreReinforced'] == 'T':
                metadata['xl'] = 'Yes'
            else:
                metadata['xl'] = 'No'
            if tyre['runflat'] == '1':
                metadata['run_flat'] = 'Yes'
            else:
                metadata['run_flat'] = 'No'

            # Look for a manufacturer mark in the model name.  cut_name
            # hands back the name on every attempt, so the name is rebound
            # on each pass of the loop.
            name = tyre['tyreModel']
            mark = ''
            for code, known_mark in self.all_man_marks.iteritems():
                found, name = cut_name(code, name)
                if found:
                    mark = known_mark
                    break
            if not mark:
                # Fall back to the custom marks, matched as a name suffix.
                for code, custom_mark in self.custom_man_marks.iteritems():
                    if name.endswith(code):
                        name = name.partition(code)[0]
                        mark = custom_mark
                        break
            metadata['manufacturer_mark'] = mark

            size_parts = (row['Width'], row['Aspect Ratio'], row['Rim'],
                          load_rating, speed_rating)
            metadata['full_tyre_size'] = '/'.join(size_parts)

            name = name.replace(' EXTRA LOAD', '').replace(' RUNFLAT', '')
            loader.add_value('name', name.strip())

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            scraped_speed_rating = product['metadata']['speed_rating']
            # Prefer the looked-up alternative; otherwise keep the scraped
            # rating as the alternative only when it differs from the new one.
            if new_alt_speed:
                alternative = new_alt_speed
            elif scraped_speed_rating != new_speed_rating:
                alternative = scraped_speed_rating
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product

示例#42

0

显示文件

文件： halfords.py 项目： oceancloud82/scraping

    def parse_products(self, response):
        """Scrape every tyre on a search-results page and follow pagination.

        The tyre size that was searched for is read from
        ``response.meta['product_data']`` (keys ``Width``, ``Aspect Ratio``,
        ``Rim``, ``Speed rating`` and ``Alt Speed``).  Each product title is
        split into brand, load rating and pattern name: titles without a
        parenthesised part go through a cascade of three regular expressions,
        titles with one are split on whitespace and searched for
        ``<digits><speed rating>``.

        Products without a link or a title are skipped.  Only the
        ``IndexError`` raised by indexing an empty selector list counts as
        "missing"; other errors are no longer swallowed by a bare ``except``.

        :param response: results page whose ``meta`` carries
            ``product_data``.
        :returns: generator of product items, followed by a ``Request`` for
            the next results page when one is linked.
        """
        hxs = HtmlXPathSelector(response)

        product_data = response.meta['product_data']
        width = product_data['Width']
        aspect_ratio = product_data['Aspect Ratio']
        rim = product_data['Rim']
        # Both ratings are only ever used upper-cased.
        speed_rating = product_data['Speed rating'].upper()
        alt_speed = product_data['Alt Speed'].upper()

        # Tried in order: with the speed rating, with the alternative speed
        # rating, and finally without any rating.
        title_patterns = (
            r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim,
                                                           speed_rating),
            r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim,
                                                           alt_speed),
            r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim),
        )
        label_xpath = ('.//div[@class="legislationContainer"]'
                       '/ul[@class="legislation"]'
                       '/li/a[contains(@class, "%s")]/@class')

        def log_parse_failure(title):
            # Same three log lines for every title that cannot be parsed.
            self.log('Failed parsing ' + title)
            self.log('URL: ' + response.url)
            self.log('Params: ' + ", ".join(
                map(str, [width, rim, speed_rating])))

        products = hxs.select(
            '//div[@id="product-listing"]//div[@class="product"]/..')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            try:
                url = product_el.select(
                    './/div[@class="title"]/a/@href')[0].extract()
            except IndexError:
                # No product link in this node.
                continue
            loader.add_value('url', url)
            loader.add_value(
                'identifier',
                product_el.select(".//span[@class='addcompare']/input/@id").
                extract()[0].split(":")[1])
            loader.add_xpath('price', './/span[@class="prodPirce"]/text()')
            try:
                name = product_el.select(
                    './/div[@class="title"]/a/text()')[0].extract()
            except IndexError:
                # No title text in this node.
                continue
            run_flat_found = is_run_flat(name)

            if not re.search(r'(\(.*\))', name):
                match = None
                for pattern in title_patterns:
                    match = re.search(pattern, name)
                    if match:
                        break
                if not match:
                    log_parse_failure(name)
                    continue
                # (brand, load rating, pattern name); the rating-less pattern
                # yields only (brand, rest), so "rest" then serves as both
                # the load rating and the name.
                name_parts = match.groups()
            else:
                load_match = (re.search(r'(\d+)%s' % speed_rating, name) or
                              re.search(r'(\d+)%s' % alt_speed, name))
                if not load_match:
                    log_parse_failure(name)
                    continue
                words = name.split()
                name_parts = [
                    words[0],
                    load_match.groups()[0],
                    ' '.join(words[1:]).split('(')[0],
                ]

            pattern_name = name_parts[-1]
            loader.add_value(
                'name',
                pattern_name.replace('XL', '').replace('ROF',
                                                       '').replace('RFT', ''))
            loader.add_value('brand', unify_brand(name_parts[0]))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_xpath('image_url',
                             './/a[contains(@class, "tyre")]/img/@src')

            upper_name = name.upper()
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect_ratio
            metadata['rim'] = rim
            metadata['width'] = width
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = name_parts[1]
            if 'ROF' in upper_name or 'RFT' in upper_name or run_flat_found:
                metadata['run_flat'] = 'Yes'
            else:
                metadata['run_flat'] = 'No'
            metadata['xl'] = 'Yes' if 'XL' in upper_name else 'No'

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))

            metadata['fitting_method'] = 'Fitted'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                pattern_name)

            # The label values are encoded in the CSS classes of the
            # legislation links.
            fuel = product_el.select(label_xpath % 'fuel_').re(r'fuel_(\w)')
            metadata['fuel'] = fuel[0] if fuel else ''
            grip = product_el.select(label_xpath % 'grip_').re(r'grip_(\w)')
            metadata['grip'] = grip[0] if grip else ''
            noise = product_el.select(label_xpath % 'noise_').re(r'_(\d+)')
            metadata['noise'] = noise[-1] if noise else ''

            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                self.log('The product is not correct: %r' % product)
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            yield product

        next_page = hxs.select('//span[@class="nextlink"]/a/@href')
        if next_page:
            yield Request(next_page.extract()[0],
                          callback=self.parse_products,
                          meta=response.meta)

示例#43

0

显示文件

文件： creativeaudio.py 项目： 0--key/lib

    def parse_product(self, response):
        """Follow links found on a page and scrape the products it shows.

        URLs containing ``TERMS`` or ``ABOUTUS`` and non-HTML responses are
        ignored.  Every link collected from the page is re-queued with this
        same callback.  Products are then read from three places: the page
        itself (title plus the price xpaths), the ``td.DefaultFont`` cells
        and the ``td#LIGHTSTRIP`` cells.

        :param response: the downloaded page.
        :returns: generator of ``Request`` objects and loaded product items.
        """
        if 'TERMS' in response.url or 'ABOUTUS' in response.url:
            return
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        # Link targets come from onclick handlers (the argument of
        # assign('...')) and from plain hrefs.
        products = hxs.select(u'//td[@class="linkcell"]/div[@onclick]/@onclick | //div[@id="DARKSTRIP"]//td[@onclick]/@onclick').re('assign\(\'(.*)\'')
        products += hxs.select(u'//div[@id="DARKSTRIP"]//a/@href').extract()
        # NOTE(review): the predicate ["HDlistTitlefont"] is a bare string
        # literal, which looks like it selects every <a>; presumably
        # @class="HDlistTitlefont" was intended -- confirm.
        products += hxs.select(u'//a["HDlistTitlefont"]/@href').extract()
        # set() removes duplicate targets before they are queued.
        for url in set(products):
            url = urljoin_rfc(get_base_url(response), '/' + url)
            if ('javascript' not in url) and ('Javascript' not in url):
                yield Request(url, callback=self.parse_product)

        # The page itself as a product: named after <title>, with price
        # candidates added from three different locations.
        product_loader = ProductLoader(item=Product(), response=response)

        name = hxs.select(u'//title/text()').extract()[0]
        name = re.sub('\n', ' ', name)
        product_loader.add_value('name', name)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('price', u'//div[@class="HDPriceHD"]//span/text()', re='\xa3(.*)')
        product_loader.add_xpath('price', u'//div[@id="MASTER"]//td[@valign]/text()', re='\xa3(.*)')
        product_loader.add_xpath('price', u'//div[@class="HDPriceRRP"]//text()', re='.*?\xa3(.*)')
        # Only emitted when the loader ends up with a price.
        if product_loader.get_output_value('price'):
            yield product_loader.load_item()

        # Products listed in td.DefaultFont cells; cells without a price are
        # skipped, and the page URL is used when the cell has no own link.
        products = hxs.select(u'//td[@class="DefaultFont"]')
        for product in products:
            name = product.select(u'.//p/strong/text()').extract()
            price = product.select(u'.//p/text()').re('\xa3(.*)\)')
            url = product.select(u'.//a[child::u]/@href').extract()
            if url:
                url = urljoin_rfc(get_base_url(response), '/' + url[0])
            if not price:
                continue
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('name', name)
            product_loader.add_value('price', price)
            product_loader.add_value('url', url if url else response.url)
            yield product_loader.load_item()

        # Products listed in td#LIGHTSTRIP cells.
        products = hxs.select(u'//td[@id="LIGHTSTRIP"]')
        for product in products:
            name = product.select(u'.//a[@class="DefaultFont" and contains(@style,"#000000")]/text()').extract()
            if len(name) > 1:
                # Several text nodes: strip each and join them with spaces.
                name = map(lambda x: x.strip(), name)
                name = ' '.join(name)
            else:
                # NOTE(review): raises IndexError when no name node matched.
                name = name[0].strip()
            # The "Hot Deal" price is preferred; the RRP text is the fallback.
            price = product.select(u'.//span[contains(text(),"Hot Deal - only")]/span/text()').re('\xa3(.*)')
            if not price:
                price = product.select(u'.//span[@class="DefaultFont" and contains(text(),"RRP")]/text()').re('\xa3(.*)')
            if not price:
                continue
            url = product.select(u'.//a[@class="DefaultFont"]/@href').extract()
            if url:
                url = urljoin_rfc(get_base_url(response), '/' + url[0])
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('name', name)
            product_loader.add_value('price', price)
            product_loader.add_value('url', url if url else response.url)
            yield product_loader.load_item()