Python ProductLoader.add_value示例，axemusic_item.ProductLoader.add_value Python示例

示例#1

0

显示文件

文件： avshop_spider.py 项目： oceancloud82/scraping

    def parse_products(self, response):
        """Parse a product listing page.

        For every listed product a ``Request`` to the product URL is yielded
        with a partially filled ``ProductLoader`` (name, url and, when found,
        price) stored under ``meta['loader']`` for ``self.parse_product``.
        When a " Next Page " link is present, a request for that page is
        yielded as well, handled by this same method.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select(
            '//div[@id="productListing"]//h5[a[contains(@class, "product-name")]]/..'
        )
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', './/a/text()')
            url = product.select('.//a/@href').extract()[0]
            loader.add_value('url', url)
            price = product.select(
                '..//div[@class="product-buttons"]//span[@class="sellPrice"]/text()'
            ).extract()
            if not price:
                # Regular price markup is absent: try the special-price markup.
                price = product.select(
                    '..//div[@class="product-buttons"]//div[@class="productSpecialPrice"]/span/text()'
                ).extract()
            if price:
                loader.add_value('price', price[0])
            # When neither markup yields a price the field is left unset
            # instead of raising IndexError, which used to abort the remaining
            # products and the pagination request of this page.
            yield Request(url,
                          callback=self.parse_product,
                          meta={'loader': loader})
        # Named next_page so the builtin next() is not shadowed.
        next_page = hxs.select('//a[@title=" Next Page "]/@href').extract()
        if next_page:
            url = urljoin_rfc(get_base_url(response), next_page[0])
            yield Request(url, callback=self.parse_products)

示例#2

0

显示文件

文件： acclaimmusic_spider.py 项目： oceancloud82/scraping

    def parse(self, response):
        """Walk a search-results page.

        Each result row produces a ``Request`` for its product URL carrying a
        pre-filled ``ProductLoader`` (name, url, price) in ``meta['loader']``;
        the price falls back to ``0`` when the row shows none.  Afterwards the
        last link of the first "nav-pages" block is requested with this method
        as callback.
        """
        selector = HtmlXPathSelector(response)
        rows = selector.select(
            '//ul[@id="search-results"]/li/span[@class="wrapper"]')

        for row in rows:
            item_loader = ProductLoader(item=Product(), response=response)

            title = row.select(
                './/span[@class="product-title"]/a/text()').extract()[0]
            item_loader.add_value('name', title)

            product_url = row.select(
                './/span[@class="product-title"]/a/@href').extract()[0]
            item_loader.add_value('url', product_url)

            try:
                price_text = row.select(
                    './/span[@class="product-ourprice"]/text()').extract()[0]
                item_loader.add_value('price', price_text)
            except IndexError:
                # No "our price" element in this row.
                item_loader.add_value('price', 0)

            yield Request(product_url,
                          callback=self.parse_product,
                          meta={'loader': item_loader})

        page_links = selector.select(
            '//div[contains(@class, "nav-pages")][1]//a/@href').extract()
        if not page_links:
            return
        following = urljoin_rfc(get_base_url(response), page_links[-1])
        yield Request(following, callback=self.parse)

示例#3

0

显示文件

    def parse_product(self, response):
        """Build one product item from a product detail page.

        Nothing is yielded when the page has no price element.  Otherwise the
        loader collects url, name, price, sku, category, optional image URL,
        identifier (taken from the ``productID`` meta tag) and a brand derived
        from the name via ``self._get_brand_from_name``.
        """
        selector = HtmlXPathSelector(response)

        loader = AxeMusicProductLoader(item=Product(), selector=selector)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@class="product-name"]/h1/text()')

        price_texts = selector.select(
            u'//div[@class="price-box"]//span[@class="price"]/text()'
        ).extract()
        if not price_texts:
            # Pages without a price are skipped entirely.
            return
        loader.add_value('price', price_texts[0].strip())

        loader.add_xpath(
            'sku', u'//div[@class="sku"]/span[@class="value"]/text()')
        loader.add_xpath(
            'category', u'//div[@class="breadcrumbs"]/ul/li[2]/a/span/text()')

        image_sources = selector.select('//img[@id="image-main"]/@src').extract()
        if image_sources:
            absolute_image = urljoin_rfc(get_base_url(response),
                                         image_sources[0])
            loader.add_value('image_url', absolute_image)

        sku_matches = selector.select(
            '//meta[@itemprop="productID"]/@content').re('sku:(.*)')
        loader.add_value('identifier', sku_matches[0])

        product_name = loader.get_output_value('name')
        loader.add_value('brand', self._get_brand_from_name(product_name))

        # Stock detection (the "availability in-stock" paragraph) is disabled
        # here; no 'stock' value is added.

        yield loader.load_item()

示例#4

0

显示文件

文件： lamusic_ca.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Parse a product page, yielding one item per option row if present.

        A base product is loaded from the page (url, name, price, sku,
        identifier, category, image, brand and a zero shipping cost when the
        free-shipping icon is shown).  When the page lists
        "Multi-Child_Background" rows, each row yields a copy of the base
        product with its own sku/identifier, name and price; rows with no
        price at all are skipped.  Otherwise the base product itself is
        yielded when it has an identifier and a positive price.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//span[@itemprop="name"]/text()')
        price = hxs.select(u'//form[@id="vCSS_mainform"]//span[@itemprop="price"]/text()').extract()
        price = price[0] if price else u'0'
        product_loader.add_value('price', price)
        product_loader.add_xpath('sku', u'//span[@class="product_code"]/text()')
        product_loader.add_xpath('identifier', u'//span[@class="product_code"]/text()')
        product_loader.add_xpath('category', u'//td[@class="vCSS_breadcrumb_td"]//a[position()=2]/@title')
        product_loader.add_xpath('image_url', u'concat("http://lamusic.ca",//img[@id="product_photo"]/@src)')
        product_loader.add_xpath('brand', u'//meta[@itemprop="manufacturer"]/@content')
        if hxs.select(u'//img[@class="vCSS_img_icon_free_shipping"]'):
            product_loader.add_value('shipping_cost', '0')

        product = product_loader.load_item()
        # Query the option rows once and reuse the result for test and loop.
        options = hxs.select(u'//tr[@class="Multi-Child_Background"]')
        if options:
            for opt in options:
                p = Product(product)
                # The first cell serves as both sku and identifier.
                code = opt.select(u'./td[1]/text()').extract()[0].strip()
                p['sku'] = code
                p['identifier'] = code
                p['name'] = opt.select(u'./td[2]/text()').extract()[0].strip()

                # Try the price locations from most to least specific.
                price = opt.select(u'./td[4]//span[@itemprop="price"]/text()').extract()
                if not price:
                    price = opt.select(u'./td[4]//span/text()').extract()
                if not price:
                    price = opt.select(u'./td[3]//span[contains(text(), "$")]/text()').extract()
                if not price:
                    # No price anywhere in the row: skip it rather than raise
                    # IndexError and lose the remaining options.
                    continue
                p['price'] = price[0].strip().replace('$', '').replace(',', '')

                # NOTE(review): p['price'] is assigned a string above; confirm
                # that comparing it with 0 behaves as intended for Product.
                if p.get('identifier') and p.get('price') > 0:
                    yield p
        elif product.get('identifier') and product.get('price') > 0:
            yield product

示例#5

0

显示文件

    def parse_full(self, response):
        """Crawl a listing page: follow links, yield products, retry if empty.

        Yields, in order: requests for brand links, requests for category
        links (with an errback routed to ``self.bsm_retry_download``), product
        items found on the page, requests for pagination links and, when the
        page showed neither products nor categories, a retry of the same URL
        (at most 15 attempts, tracked in ``meta['try']``).

        Products are yielded only when their URL has not been recorded in
        ``self.product_pages`` and the loaded price is positive.
        """
        meta = response.meta.copy()
        meta['dont_redirect'] = True
        meta['dont_merge_cookies'] = True

        items_number = response.xpath(
            '//div[contains(@class, "pagination")]//span[contains(@class, "bold")]/text()'
        ).re(r'\d+')

        # .re() returns strings, so compare numerically: as text "9" > "10".
        # Fewer than two numbers means there is nothing to compare.
        # NOTE(review): presumably these are a position and a total from the
        # pagination widget -- confirm against the site markup.
        if (len(items_number) >= 2
                and int(items_number[0]) > int(items_number[1])):
            return

        need_retry = False

        brands = response.xpath('//dl[@class="brandsList"]//a/@href').extract()
        for brand in brands:
            yield (Request(brand, callback=self.parse_full))

        cats = response.xpath(
            '//li[@data-selenium="category"]//@href').extract()
        if cats:
            for cat in cats:
                meta['try'] = 0
                # url/metadata are bound as lambda defaults so each errback
                # keeps the values of its own iteration.
                yield Request(url=canonicalize_url(cat),
                              callback=self.parse_full,
                              meta=meta,
                              errback=lambda failure, url=canonicalize_url(
                                  cat), metadata=meta: self.bsm_retry_download(
                                      failure, url, metadata, self.parse_full))

        products = response.xpath(
            '//div[contains(@class, "item") and contains(@class, "clearfix")]')
        if products:
            for product in products:
                try:
                    brand = product.xpath(
                        './/span[@itemprop="brand"]/text()').extract()[0]
                except IndexError:
                    brand = ''
                try:
                    title = product.xpath(
                        './/span[@itemprop="name"]/text()').extract()[0]
                except IndexError:
                    # Entries without a name are not products; skip them.
                    continue
                name = ' '.join((brand, title))

                url = product.xpath('.//a[@itemprop="url"]/@href').extract()[0]

                price = ''.join(
                    product.xpath('.//*[contains(@class, "price")]/text()').
                    extract()).strip()

                # Identifier is "<sku>" or "<sku>-<is>" when the "is" input
                # is present.
                identifier = product.xpath(
                    './/input[@name="sku"]/@value').extract()
                if identifier:
                    identifier = identifier[0]
                    id_part = product.xpath(
                        './/input[@name="is"]/@value').extract()
                    if id_part:
                        identifier = identifier + '-' + id_part[0]
                else:
                    self.log('No identifier found for %s on %s' %
                             (name, response.url))
                    continue

                if not price:
                    # Fall back to the JSON blobs in data-itemdata attributes,
                    # matching on any dash-separated part of the identifier.
                    for data in response.xpath(
                            '//div/@data-itemdata').extract():
                        json_data = json.loads(data)
                        if json_data['sku'] in identifier.split('-'):
                            price = json_data['price']
                            break

                sku = product.xpath(
                    './/p[contains(@class, "skus")]//span[@class="sku"]/text()'
                ).extract()
                if sku:
                    sku = sku[-1]
                else:
                    sku = ''

                # Image URL: try three markups in turn, then make it absolute.
                image_url = product.xpath(
                    'div/a[@name="image"]/img/@src').extract()
                if not image_url:
                    image_url = product.xpath(
                        'div[@class="img-zone zone"]//img/@data-src').extract(
                        )
                if not image_url:
                    image_url = product.xpath(
                        'div[@class="img-zone zone"]//img/@src').extract()
                if image_url:
                    image_url = response.urljoin(image_url[0])
                else:
                    image_url = ''
                # Category is the last breadcrumb link, or the "last" list
                # item when that link is just "home".
                category = response.xpath('//ul[@id="breadcrumbs"]/li/a/text()'
                                          ).extract()[-1].strip()
                if category.lower() == "home":
                    category = response.xpath(
                        '//ul[@id="breadcrumbs"]/li[@class="last"]/text()'
                    ).extract()[-1].strip()

                if identifier:
                    if not price:
                        price = '0.0'

                    loader = AxeMusicProductLoader(item=Product(),
                                                   selector=product)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', sku)
                    loader.add_value('image_url', image_url)
                    if brand:
                        loader.add_value('brand', brand)
                    loader.add_value('category', category)
                    loader.add_value('name', name)
                    loader.add_value('price', price)

                    if url not in self.product_pages and loader.get_output_value(
                            'price') > 0:
                        item = loader.load_item()
                        if item['identifier'].endswith('-REG'):
                            item['identifier'] = item['identifier'].replace(
                                '-REG', '')
                        yield item
                    # Recorded even when nothing was yielded for this URL.
                    self.product_pages.add(url)
        elif not cats:
            # Neither products nor categories: treat the page as a failed
            # download and retry below.
            need_retry = True

        pages = response.xpath(
            '//div[contains(@class, "pagination-zone")]//a/@href').extract()
        for page_url in pages:
            meta['try'] = 0
            yield Request(callback=self.parse_full,
                          url=canonicalize_url(page_url),
                          meta=meta)

        if need_retry:
            retry = response.meta.get('try', 0)
            if retry < 15:
                meta = response.meta.copy()
                meta['try'] = retry + 1
                self.log("Try %d. retrying to download %s" %
                         (meta['try'], response.url))
                yield Request(url=response.url,
                              callback=self.parse_full,
                              dont_filter=True,
                              meta=meta)

示例#6

0

显示文件

    def parse_product(self, response):
        """Parse a product detail page and yield the product if it is priced.

        Values already present in ``response.meta`` (identifier, image_url,
        brand, category, sku, name) take precedence; anything missing is
        extracted from the page.  The method returns without yielding when no
        identifier can be found, and yields the loaded product only when its
        price is greater than zero.  A trailing "-REG" is removed from the
        identifier of yielded products.
        """
        meta = response.meta
        url = response.url
        price = ''
        # Scan the raw body for the marker line; the last matching line wins
        # because the loop does not break.
        for line in response.body.split('\n'):
            if "MAIN:No^Refrnce" in line:
                price = line.split('");')[0].split(', "')[-1]

        if not price:
            # Fall back to the itemprop="price" span, dropping thousands
            # separators.
            try:
                price = response.xpath(
                    '//span[@itemprop="price"]/text()').extract()[0].replace(
                        ',', '')
            except IndexError:
                pass

        # NOTE(review): the page lookups below produce lists, and
        # ``identifier[0]`` further down takes the first element.  If meta
        # ever carries a string identifier, ``identifier[0]`` would be its
        # first character -- confirm what callers put in meta.
        identifier = meta.get('identifier')
        if not identifier:
            identifier = response.xpath(
                '//form[@name="addItemToCart"]//input[@name="sku"]/@value'
            ).extract()
        if not identifier:
            identifier = response.xpath(
                '//input[@name="useMainItemSku"]/@value').extract()

        id_part = response.xpath('//form/input[@name="is"]/@value').extract()

        if identifier:
            identifier = identifier[0]
            if id_part:
                # Final identifier has the form "<sku>-<is>".
                identifier = identifier + '-' + id_part[0]

        else:
            self.log('Product without identifier: ' + response.url)
            return

        if not price:
            # Last resort: JSON blobs in data-itemdata attributes, matched on
            # any dash-separated part of the identifier.
            for data in response.xpath('//div/@data-itemdata').extract():
                json_data = json.loads(data)
                if json_data['sku'] in identifier.split('-'):
                    price = json_data['price']
                    break

        # For the fields below, page fallbacks are passed to the loader as
        # extracted lists rather than single strings.
        image_url = meta.get('image_url')
        if not image_url:
            image_url = response.xpath('//img[@id="mainImage"]/@src').extract()
        brand = meta.get('brand')
        if not brand:
            brand = response.xpath(
                '//div[@id="tMain"]//div[@class="mfrLogo"]//img[1]/@alt'
            ).extract()
        category = meta.get('category')
        if not category:
            try:
                category = response.xpath('//ul[@id="breadcrumbs"]/li/a/text()'
                                          ).extract()[-1].strip()
            except IndexError:
                pass
        sku = meta.get('sku')
        if not sku:
            # Take the "mpn:" values from the productID meta tag, with spaces
            # removed and lower-cased.
            sku = map(
                lambda s: s.replace(' ', '').lower(),
                response.xpath(
                    '//meta[@itemprop="productID" and contains(@content, "mpn:")]/@content'
                ).re(r'mpn:([\w\s\.-]+)'))
        name = meta.get('name')
        if not name:
            name = ''.join(
                response.xpath(
                    '//*[@itemprop="name"]//text()').extract()).strip()

        # identifier is always truthy here: the falsy case returned above.
        if identifier:
            loader = AxeMusicProductLoader(item=Product(), response=response)
            loader.add_value('identifier', identifier)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            loader.add_value('url', url)
            loader.add_value('sku', sku)
            loader.add_value('name', name)
            loader.add_value('price', price)

            product = loader.load_item()

            # BSM simple run duplicates fix
            if isinstance(self, BigSiteMethodSpider) and self.simple_run and (
                    product['identifier'] not in self.matched_identifiers):
                self.matched_identifiers.add(product['identifier'])

            if product['price'] > 0:
                if product['identifier'].endswith('-REG'):
                    product['identifier'] = product['identifier'].replace(
                        '-REG', '')
                yield product

示例#7

0

显示文件

    def parse_product(self, response):
        """Parse a product page whose URL has the form ``...p-<id>.html``.

        Redirected responses are skipped.  Pages whose URL carries no numeric
        id, or that lack the product heading, are logged and skipped instead
        of raising.  Otherwise one item is yielded with url, identifier, name,
        price (regular or special), sku, category (from ``response.meta``),
        optional image URL and any brand from the manufacturers drop-down
        whose text occurs in the product name.
        """
        hxs = HtmlXPathSelector(response)

        redirected_urls = response.meta.get('redirect_urls', None)
        if redirected_urls:
            log.msg('Skips product, redirected url: ' +
                    str(redirected_urls[0]))
            return

        # Guard the URL pattern: re.search returns None when it is absent.
        id_match = re.search(r'p-(\d+)\.html', response.url)
        if not id_match:
            log.msg('Product without identifier: ' + response.url)
            return

        names = hxs.select(
            u'//td[@class="pageHeading" and @valign="top" and not(@align)]/text()'
        ).extract()
        if not names:
            log.msg('Product without name: ' + response.url)
            return
        name = names[0]

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_value('identifier', id_match.group(1))
        product_loader.add_value('name', name)
        price = ''.join(
            hxs.select(
                u'//td[@class="pageHeading" and @valign="top" and @align="right"]/text()'
            ).extract()).strip()
        if not price:
            # Discounted products carry the price in a nested span.
            price = ''.join(
                hxs.select(
                    u'//td[@class="pageHeading" and @valign="top" and @align="right"]/span[@class="productSpecialPrice"]/text()'
                ).extract())

        product_loader.add_value('price', price)
        product_loader.add_xpath(
            'sku',
            u'//td[@class="pageHeading" and @valign="top" and not(@align)]/span[@class="smallText"]/text()',
            re=r'\[(.*)\]')
        product_loader.add_value('category', response.meta.get('category'))
        image_url = hxs.select(
            u'//a[contains(@href,"images") and child::img]/@href').extract()
        if image_url:
            image_url = urljoin_rfc(get_base_url(response), image_url[0])
            product_loader.add_value('image_url', image_url)

        brands = hxs.select(
            '//form[@name="manufacturers"]/select/option/text()').extract()
        for brand in brands:
            if '..' in brand:
                # Option text containing "..": match on everything but its
                # last whitespace-separated token.  This branch does not stop
                # the loop, so later options may add further brand values.
                incomplete_brand = ' '.join(brand.split()[:-1])
                if incomplete_brand.lower() in name.lower():
                    product_loader.add_value('brand', brand.replace('..', ''))
            else:
                if brand.lower() in name.lower():
                    product_loader.add_value('brand', brand.replace('..', ''))
                    break

        yield product_loader.load_item()

示例#8

0

显示文件

文件： fleetsound_spider.py 项目： oceancloud82/scraping

    def parse_products(self, response):
        """Parse a category listing: follow pager links and yield products.

        Every link in the "pages" block is requested with this method as
        callback and the current ``response.meta``.  Each list item is then
        loaded into a product (name plus model, identifier, url without query
        string, brand, sku, image, price, category); it is yielded when an
        identifier was collected and the price is positive, otherwise the
        missing identifier is logged.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        category = response.meta.get('category', '')

        pager_links = hxs.select('//div[@class="pages"]//a/@href').extract()
        for pager_link in pager_links:
            yield Request(pager_link,
                          callback=self.parse_products,
                          meta=response.meta)

        for entry in hxs.select('//li[contains(@class, "item")]'):
            loader = ProductLoader(item=Product(), selector=entry)

            # The bare excepts are kept on purpose so behaviour is unchanged.
            try:
                model_matches = entry.select(
                    './/p[contains(text(), "model: ")]/text()').re(
                        r'model: (.*)')
                model = map(unicode.strip, model_matches)[0]
            except:
                model = ''

            name_texts = entry.select(
                './/h2[@class="product-name"]/a/text()').extract()
            name = name_texts[0].strip() if name_texts else ''
            loader.add_value('name', ' '.join((name, model)))

            product_url = entry.select(
                './/h2[@class="product-name"]/a/@href').extract()[0].strip()

            # Identifier: the price span id, else the compare/wishlist link.
            id_matches = entry.select(
                './/span[contains(@id, "product-price-")]/@id').re(
                    r'product-price-(\d+)')
            if not id_matches:
                id_matches = entry.select(
                    './/ul[@class="add-to-links"]/li/a[@class="link-compare" or @class="link-wishlist"]/@href'
                ).re('product/(.*?)/')
            if id_matches:
                loader.add_value('identifier', id_matches[0])

            loader.add_value('url', product_url.split('?')[0])

            try:
                brand_matches = entry.select(
                    './/p[contains(text(), "manufacturer: ")]/text()').re(
                        r'manufacturer: (.*)')
                brand = map(unicode.strip, brand_matches)[0]
            except:
                brand = entry.select('td[3]//text()').extract()
            loader.add_value('brand', brand)

            if model:
                loader.add_value('sku', model)

            image_sources = entry.select(
                './/a[@class="product-image"]/img/@src').extract()
            if image_sources:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_sources[0]))

            # Price: nested "price" span first, then the span itself, else 0.0.
            try:
                price = entry.select(
                    './/span[contains(@id, "product-price-")]/span[@class="price"]/text()'
                ).extract()[0].strip()
            except:
                try:
                    price = entry.select(
                        './/span[contains(@id, "product-price-") and contains(@class, "price")]/text()'
                    ).extract()[0].strip()
                except:
                    price = '0.0'
            loader.add_value('price', price)

            loader.add_value('category', category)

            collected_ids = loader.get_collected_values('identifier')
            if collected_ids and collected_ids[0]:
                item = loader.load_item()
                if item['price'] > 0:
                    yield item
            else:
                self.log('IDENTIFIER NOT FOUND!!! {}'.format(
                    loader.get_output_value('url')))
示例#9

0

显示文件

    def parse(self, response):
        """Parse a listing page: follow pagination and emit products.

        Pagination links are requested with this method as callback.  For each
        product on the page, a price is looked up in the page's
        ``data-itemdata`` JSON and then in the price markup.  Priced products
        are loaded and passed through ``self._get_reviews_url``; unpriced ones
        are sent to ``self.parse_product`` with the values gathered so far in
        ``meta``.  A URL is handled only if it is not yet in
        ``self.urls_list``.  When the page has neither products nor category
        links, the same URL is retried (up to 15 attempts via ``meta['try']``).
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        meta = response.meta.copy()

        # Only used below to decide whether an empty page warrants a retry.
        cats = hxs.select('//*[@id="tContent"]/div/div/div[@class="column"]'
                          '/ul/li/a/@href').extract()

        pages = hxs.select(
            '//div[contains(@class, "pagination-zone")]//a/@href').extract()
        for page_url in pages:
            # url/metadata are bound as lambda defaults so each errback keeps
            # the values of its own iteration.  No meta is passed to the
            # Request itself.
            yield Request(callback=self.parse,
                          url=canonicalize_url(page_url),
                          errback=lambda failure, url=canonicalize_url(
                              page_url), metadata=meta: self.retry_download(
                                  failure, url, metadata, self.parse))

        products = hxs.select(
            '//div[contains(@class, "item") and contains(@class, "clearfix")]')
        if products:
            for product in products:
                try:
                    brand = product.select(
                        './/span[@itemprop="brand"]/text()').extract()[0]
                except IndexError:
                    brand = ''
                title = product.select(
                    './/span[@itemprop="name"]/text()').extract()[0]
                name = ' '.join((brand, title))

                url = product.select(
                    './/a[@itemprop="url"]/@href').extract()[0]

                # .pop() takes the last extracted sku value and raises
                # IndexError when there is none.
                identifier = product.select(
                    './/input[@name="sku"]/@value').extract().pop()

                # First choice: the price from the matching data-itemdata JSON.
                price = 0
                for data in hxs.select('//div/@data-itemdata').extract():
                    json_data = json.loads(data)
                    if json_data['sku'] == identifier:
                        price = json_data['price']
                        break

                if not price:
                    # Fallback: price markup; note this is the extracted list,
                    # not a single string.
                    price = product.select(
                        './/div[@class="price-zone"]/div[@class="atc-price"]'
                        '//strong[contains(@class, "price")]/text()').extract(
                        )

                try:
                    sku = product.select(
                        './/p[contains(@data-selenium, "skus")]//span[@class="sku"]/text()'
                    ).extract()[-1]
                except:
                    sku = ''
                # Prefer the data-src attribute over src.
                image_url = product.select(
                    './/a[@class="itemImg"]/img/@data-src').extract(
                    ) or product.select(
                        './/a[@class="itemImg"]/img/@src').extract()
                if image_url:
                    image_url = urljoin_rfc(base_url, image_url[0])
                else:
                    image_url = ''

                # Category is the last breadcrumb link, or the "last" list
                # item when that link is just "home".
                category = hxs.select('//ul[@id="breadcrumbs"]/li/a/text()'
                                      ).extract()[-1].strip()
                if category.lower() == "home":
                    category = hxs.select(
                        '//ul[@id="breadcrumbs"]/li[@class="last"]/text()'
                    ).extract()[-1].strip()

                # A match in self.bushnell_products (keyed by upper-cased sku)
                # overrides the breadcrumb category with its 'Class' value.
                bushnell_product = self.bushnell_products.get(
                    sku.upper().strip(), None)
                if bushnell_product:
                    category = bushnell_product['Class']
                    log.msg(
                        'Extracts category "%s" from bushnell file, URL: %s' %
                        (category, response.url))

                if url not in self.urls_list:
                    if price:
                        self.urls_list.append(url)
                        loader = ProductLoader(item=Product(),
                                               selector=product)
                        loader.add_value('url', url)
                        loader.add_value('identifier', identifier)
                        loader.add_value('sku', sku)
                        loader.add_value('image_url', image_url)
                        loader.add_value('brand', brand)
                        loader.add_value('category', category)
                        loader.add_value('name', name)
                        loader.add_value('price', price)
                        product = loader.load_item()
                        yield self._get_reviews_url(product)
                    else:
                        # parse product page if price not found
                        # This rebinds the local ``meta`` to a fresh dict; the
                        # pagination loop above has already finished by now.
                        meta = {
                            'name': name,
                            'brand': brand,
                            'category': category,
                            'identifier': identifier,
                            'image_url': image_url,
                            'sku': sku
                        }
                        yield Request(
                            url=url,
                            callback=self.parse_product,
                            meta=meta,
                            errback=lambda failure, url=url,
                            metadata=meta: self.retry_download(
                                failure, url, metadata, self.parse_product))
        elif not cats:
            # Neither products nor categories: retry the same URL, counting
            # attempts in meta['try'].
            retry = response.meta.get('try', 0)
            if retry < 15:
                meta = response.meta.copy()
                meta['try'] = retry + 1
                yield Request(
                    url=response.url,
                    dont_filter=True,
                    callback=self.parse,
                    errback=lambda failure, url=response.url, metadata=meta:
                    self.retry_download(failure, url, metadata, self.parse))

示例#10

0

显示文件

    def parse_product(self, response):
        """Parse a product detail page reached from the listing parser.

        Values already present in ``response.meta`` (identifier, image_url,
        brand, category, sku, name) are reused; missing ones are extracted
        from the page.  When the sku is taken from the page and matches an
        entry in ``self.bushnell_products``, that entry's 'Class' replaces the
        category.  Pages without any identifier are logged and skipped.  The
        product is loaded and passed through ``self._get_reviews_url`` only
        if its URL is not yet in ``self.urls_list``.
        """
        hxs = HtmlXPathSelector(response)

        meta = response.meta
        url = response.url
        price = ''
        # Scan the page source for the marker line; the last match wins.
        for line in hxs.extract().split('\n'):
            if "MAIN:No^Refrnce" in line:
                price = line.split('");')[0].split(', "')[-1]

        if not price:
            try:
                price = hxs.select(
                    '//span[@itemprop="price"]/text()').extract()[0].replace(
                        ',', '')
            except IndexError:
                # No price element on the page; keep the empty price.
                pass

        identifier = meta.get('identifier')
        if not identifier:
            identifiers = hxs.select(
                '//form[@name="addItemToCart"]//input[@name="sku"]/@value'
            ).extract()
            if not identifiers:
                # Skip the page instead of raising IndexError.
                log.msg('Product without identifier: ' + response.url)
                return
            identifier = identifiers[0]
        # For the fields below, page fallbacks are passed to the loader as
        # extracted lists rather than single strings.
        image_url = meta.get('image_url')
        if not image_url:
            image_url = hxs.select('//img[@id="mainImage"]/@src').extract()
        brand = meta.get('brand')
        if not brand:
            brand = hxs.select(
                '//div[@id="tMain"]//div[@class="mfrLogo"]//img[1]/@alt'
            ).extract()
        category = meta.get('category')
        if not category:
            try:
                category = hxs.select('//ul[@id="breadcrumbs"]/li/a/text()'
                                      ).extract()[-1].strip()
            except IndexError:
                # No breadcrumb links; category stays as returned by meta.
                pass
        sku = meta.get('sku')
        if not sku:
            sku = hxs.select(
                '//meta[@itemprop="productID" and contains(@content, "mpn:")]/@content'
            ).re(r'mpn:(\w+)')
            if sku:
                bushnell_product = self.bushnell_products.get(
                    sku[0].upper().strip(), None)
                if bushnell_product:
                    category = bushnell_product['Class']
                    log.msg(
                        'Extracts category "%s" from bushnell file, URL: %s' %
                        (category, response.url))

        name = meta.get('name')
        if not name:
            name = ''.join(
                hxs.select(
                    '//h1[@itemprop="name"]//text()').extract()).strip()

        if url not in self.urls_list:
            self.urls_list.append(url)
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', identifier)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            loader.add_value('url', url)
            loader.add_value('sku', sku)
            loader.add_value('name', name)
            loader.add_value('price', price)
            product = loader.load_item()
            yield self._get_reviews_url(product)

示例#11

0

显示文件

    def parse_product(self, response):
        """Load one product from a detail page and yield it once.

        Fields are read from id-tagged elements (brand, name, regular or sale
        price, model as sku, sku as identifier), the second breadcrumb link as
        category and the product image.  The item is yielded only when its
        identifier is not already in ``self.identifiers`` and its price is
        positive; the identifier is then recorded.
        """
        base_url = get_base_url(response)
        # NOTE(review): this selector is never used below.
        hxs = HtmlXPathSelector(response)

        def joined_text(xpath):
            # Concatenate all text nodes matched by xpath and trim the result.
            return ''.join(response.xpath(xpath).extract()).strip()

        url = response.url
        brand = joined_text('//span[@id="product-brand"]/text()')
        name = joined_text('//span[@id="product-header-name"]/text()')
        # NOTE(review): full_name is computed but the plain name is what gets
        # loaded -- confirm which one was intended.
        full_name = brand + ' - ' + name

        # Regular price first, then sale price, else 0 ("call for pricing").
        price = (
            response.xpath('//span[@id="product-regular-price"]/text()').extract()
            or response.xpath('//span[@id="product-sale-price"]/text()').extract()
            or 0)

        sku = response.xpath('//h2[@id="product-model"]/text()').extract()
        identifier = response.xpath(
            '//span[@id="product-sku"]/text()').extract()

        breadcrumbs = response.xpath(
            '//div[@class="products-bredcrumbs"]/a/text()').extract()
        category = breadcrumbs[1] if len(breadcrumbs) > 1 else ""

        image_url = response.xpath('//img[@id="product-image"]/@src').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[-1])

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        loader.add_value('category', category)
        if image_url:
            loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)

        item = loader.load_item()
        if item['identifier'] in self.identifiers or not (item['price'] > 0):
            return
        self.identifiers.add(item['identifier'])
        yield item

示例#12

0

显示文件

    def parse_product(self, response):
        """Extract a single product from its page and yield it if priced.

        The identifier is looked up in three places, in order: the hidden
        ``products_id`` form input, the "tell a friend" link, and finally the
        raw response body. When none of them provides one, the page is logged
        and nothing is yielded. The product is yielded only when its loaded
        price is greater than zero.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//div[@itemprop="name"]/text()')

        # A missing price is loaded as '0', so the final check drops the item.
        price = hxs.select(u'//span[@itemprop="price"]/text()').extract()
        product_loader.add_value('price', price[0] if price else '0')

        # Identifier lookup: each step runs only if the previous found nothing.
        product_id = hxs.select(
            u'//form//input[@type="hidden" and @name="products_id"]/@value'
        ).extract()
        if not product_id:
            product_id = hxs.select(
                '//div[@id="productTellFriendLink"]/a/@href').re(
                    'products_id=(.*)')
        if not product_id:
            # NOTE(review): the capture group is greedy, so a line containing
            # several '" class' sequences would over-capture -- verify
            # against real pages before tightening the pattern.
            product_id = re.findall(r'products_id=(.*)" class',
                                    response.body)
        if not product_id:
            log.msg('Product without identifier: ' + response.url)
            return

        product_loader.add_value('identifier', product_id[0])

        sku = hxs.select(u'//span[@itemprop="identifier"]/text()').extract()
        if sku:
            product_loader.add_value('sku', sku[0])

        product_loader.add_xpath('category',
                                 u'//div[@id="navBreadCrumb"]/a[2]/text()')

        img = hxs.select(u'//div[@id="productMainImage"]//img/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))

        brand = hxs.select('//li[@itemprop="brand"]/text()').extract()
        if brand:
            # Strip the label that precedes the brand name on the page.
            product_loader.add_value(
                'brand', brand[0].replace('Manufactured by: ', ''))

        product = product_loader.load_item()
        if product['price'] > 0:
            yield product

示例#13

0

显示文件

    def parse(self, response):
        """Follow category/pagination links and yield the products listed.

        Behaviour, in order:

        * If the response URL contains ``temporarilyUnavailable`` and the
          request was redirected, re-request the first URL of the redirect
          chain (unfiltered) and stop.
        * Request every link in the category navigation and every
          "class-type-more" link.
        * Only when the page has no "class-type-more" links: request the
          "next" pagination links, yield one item per product box that has a
          cart block, and, if no such product boxes exist, request every
          product-name link instead.

        The first name seen for an identifier is remembered in
        ``self._identifier_name`` and reused for later items that share the
        identifier.
        """
        if ('temporarilyUnavailable' in response.url
                and response.meta.get('redirect_urls')):
            yield Request(response.meta['redirect_urls'][0], dont_filter=True)
            return

        for href in response.xpath(
                '//nav[@id="category-navigation"]//a/@href').extract():
            yield Request(response.urljoin(href))

        urls_all = response.xpath(
            '//span[@class="class-type-more"]/a/@href').extract()
        for href in urls_all:
            yield Request(response.urljoin(href))

        if urls_all:
            return

        # A set removes duplicated "next" links on the same page.
        pages = set(
            response.xpath(
                '//div[@class="pagination-container"]//li[@class="next"]/a/@href'
            ).extract())
        for href in pages:
            yield Request(response.urljoin(href))

        products = response.xpath(
            '//div[@class="products"]/div[@class="product" and div[contains(@class, "cart")]]'
        )
        # The breadcrumb is queried on the whole page, so it is the same for
        # every product; evaluate it once instead of once per product.
        product_category = response.xpath(
            '//div[@class="breadcrumb-inner"]//li//span[@itemprop="title"]/text()'
        ).extract()

        for product in products:
            name_parts = product.xpath(
                './/h3[contains(@class, "name")]//text()').extract()
            product_name = ' '.join(name_parts)
            # Indexing the extracted list directly (rather than the result of
            # map()) keeps this working on both Python 2 and Python 3.
            product_url = response.urljoin(
                product.xpath(
                    './/h3[contains(@class, "name")]/a/@href').extract()[0])
            product_price = product.xpath(
                './/div[@class="product-price"]/span[@class="price"]/text()'
            ).extract()[0]
            product_image = response.urljoin(
                product.xpath(
                    './/div[@class="product-image"]//img/@data-echo'
                ).extract()[0])
            # The SKU is the last non-blank path segment of the product URL.
            product_sku = [
                segment for segment in product_url.split('/')
                if segment.strip()
            ][-1]
            # The first text node of the name heading is used as the brand.
            product_brand = name_parts[0]
            product_identifier = '%s-%s' % (product_brand.strip(),
                                            product_sku.strip())

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('name', product_name)
            loader.add_value('url', product_url)
            loader.add_value('price', product_price)
            loader.add_value('image_url', product_image)
            loader.add_value('sku', product_sku)
            loader.add_value('brand', product_brand)
            loader.add_value('identifier', product_identifier)
            loader.add_value('category', product_category)

            item = loader.load_item()
            if item['identifier'] not in self._identifier_name:
                self._identifier_name[item['identifier']] = item['name']
            else:
                item['name'] = self._identifier_name[item['identifier']]

            yield item

        if not products:
            for href in response.xpath(
                    '//h3[@class="name"]/a/@href').extract():
                yield Request(response.urljoin(href))

示例#14

0

显示文件

文件： musiciansfriend_spider.py 项目： oceancloud82/scraping

    def parse_products(self, response):
        """Parse a product grid page and request each product's detail page.

        For every product box a partially filled item is built (name, url,
        sku, category, optional image, matching brands, identifier, price)
        and passed to ``self.parse_product`` through the request ``meta``.
        Products whose price text contains 'Email for Price' get price 0 and
        an ``AxeMeta`` metadata entry recording that text. Finally the "next"
        link, if present, is followed with this same callback.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Brand names come from the "Search Brands" facet, with the trailing
        # "(count)" removed by the regex. Built as a real list because it is
        # scanned once per product below.
        brands = [
            strip(label) for label in hxs.select(
                '//*[@id="facets"]//div[contains(label/text(), "Search Brands")]/ul/li/a/text()'
            ).re(r'(.*) \(\d+\)')
        ]

        # Page-level breadcrumb: queried once, indexed per product so that a
        # too-short breadcrumb still only fails when a product is processed.
        breadcrumbs = hxs.select(
            '//ol[@class="breadcrumbs"]/li/a/text()').extract()

        products = hxs.select(
            '//div[@class="productGrid"]//div[@class="product"]')
        for product in products:
            try:
                name = product.select(
                    './/div/strong/a/text()').extract()[0].strip()
            except IndexError:
                # Product box without a name link: nothing usable to scrape.
                continue
            image_url = product.select(
                './/div[@class="thumb "]/span/img/@data-original').extract()
            category = breadcrumbs[-2]
            # Every known brand that occurs in the product name.
            brand = [b for b in brands if b in name]
            url = urljoin_rfc(
                base_url,
                product.select('.//div/strong/a/@href').extract()[0].strip())

            # Collapse all whitespace in the price text; fall back to the
            # used-price block when there is no regular price.
            price = ' '.join(''.join(
                product.select('div/span[@class="productPrice"]/text()').
                extract()).split())
            if not price:
                price = ' '.join(''.join(
                    product.select(
                        'div/dl[@class="productUsedPrice"]//dd/text()').
                    extract()).split())

            sku = product.select(
                'var[contains(@class, "productId")]/text()').extract()[0]

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('name', name)
            loader.add_value('url', url)
            loader.add_value('sku', sku)
            loader.add_value('category', category)
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            loader.add_value('brand', brand)
            loader.add_value('identifier', sku)
            if 'Email for Price' in price:
                log.msg('Email for price')
                loader.add_value('price', 0)
                metadata = AxeMeta()
                metadata['price'] = 'Email for Price'
                prod = loader.load_item()
                prod['metadata'] = metadata
            else:
                loader.add_value('price', price)
                prod = loader.load_item()

            yield Request(url,
                          callback=self.parse_product,
                          meta={'product': prod})

        next_page = hxs.select('//a[@class="next_link"]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]),
                          callback=self.parse_products)

示例#15

0

显示文件

文件： italmelodie.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Load one product from its page; fan out to variants when unpriced.

        The identifier is the ``itemID`` query parameter of the page's
        ``og:url`` meta tag. When the page shows no price, a request is
        yielded for every option value of the variant selector (the
        ``itemID`` in the current URL is swapped for the option value), and
        the current page is still emitted with a price of ``$0.0``.
        """
        hxs = HtmlXPathSelector(response)

        meta_url = hxs.select(
            '//meta[@property="og:url"]/@content').extract()[0]
        params = urlparse.parse_qs(urlparse.urlparse(meta_url).query)

        name = hxs.select('//td[@class="text11 bold"]//h1/text()').extract()
        sku = hxs.select('//div[@class="grey text12"]/text()').re(
            r'Model: ([\w-]+)')
        price = hxs.select(
            '//table[@class="bold text11"]//tr[@class="bold darkBlue"]/td[2]/text()'
        ).extract()
        category = hxs.select('//div[@id="breadcrums"]/a[1]/text()').extract()
        img_url = urljoin_rfc(
            get_base_url(response),
            hxs.select('//img[@id="itemImage"]/@src').extract()[0])

        # The brand is taken from the image that follows the model block: the
        # first run of word characters directly before a dot in its src.
        brand = hxs.select(
            '//div[@class="grey text12"]/following-sibling::img[1]/@src'
        ).extract()
        if brand:
            brand_match = re.search(r'([\w]+)\.+', brand[0])
            # A src without a "word." part yields no brand instead of raising.
            brand = brand_match.group(1) if brand_match else None

        if not price:
            # If product has sub-products
            prod_list = hxs.select(
                '//div[@class="grey text12"]/following-sibling::table[1]//select/option/@value'
            ).extract()
            for prod in prod_list:
                item_id = str('itemID=' + prod)
                url = re.sub(r'itemID=([\d]+)', item_id, response.url)
                yield Request(url, callback=self.parse_product)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        if price:
            loader.add_value('price', price)
        else:
            loader.add_value('price', [u'$0.0'])

        loader.add_value('identifier', params['itemID'])
        loader.add_value('url', response.url)
        loader.add_value('sku', sku)
        loader.add_value('category', category)
        loader.add_value('image_url', img_url)
        if brand:
            loader.add_value('brand', brand)

        # Not Found - Shipping cost
        yield loader.load_item()