Python TigerChefLoader.add_xpath示例，tigerchefloader.TigerChefLoader.add_xpath Python示例

示例#1

0

显示文件

    def parse(self, response):
        """Follow category/listing links and schedule one request per product.

        Yields ``Request`` objects for:

        * every link in the ``main-nav`` list;
        * sub-category, category, "shop all" and sub-category-panel links,
          each with the ``sort`` query parameter set to ``lowest``;
        * the first "next" pagination link, when present;
        * every listed product that has a link; these requests carry the
          partially loaded item in ``meta['product']`` and are handled by
          ``self.parse_product``.
        """
        # Main categories
        for cat_url in response.xpath(
                '//ul[@id="main-nav"]/li/a/@href').extract():
            yield Request(response.urljoin(cat_url))

        sub_categories = response.xpath(
            '//div[contains(@class, "sub-categories")]'
            '/div/div//p/a/@href').extract()
        for sub_cat in sub_categories:
            yield Request(
                add_or_replace_parameter(response.urljoin(sub_cat), 'sort',
                                         'lowest'))

        categories = response.xpath(
            '//ul[@class="category"]/li/a/@href').extract()
        categories += response.xpath(
            '//a[contains(@class, "shop-all-button")]/@href').extract()
        categories += response.css('.subcat-panel ::attr(href)').extract()
        for url in categories:
            yield Request(
                add_or_replace_parameter(response.urljoin(url), 'sort',
                                         'lowest'))

        next_page = response.xpath(
            '//ul[@class="pagination"]/li/a[@class="next"]/@href').extract()
        if next_page:
            yield Request(url=response.urljoin(next_page[0]))

        products = response.xpath('//div[contains(@class, "product")]')
        for product_xs in products:
            url = product_xs.xpath('a/@href').extract()
            if not url:
                # Nodes without a direct link are not product tiles.
                continue
            product_loader = ProductLoader(item=Product(), selector=product_xs)
            product_loader.add_value('url', url)

            # An explicit emptiness check replaces the former bare
            # ``except:``, which hid every error, not only a missing SKU.
            sku_matches = product_xs.xpath(
                'p[@class="product-sku"]/text()').re('KaTom #: (.*)')
            sku = sku_matches[0] if sku_matches else None
            product_loader.add_value('sku', sku)
            product_loader.add_value('identifier', sku)
            product_loader.add_xpath('name', 'a/@title')
            product_loader.add_css('image_url', '.img ::attr(src)')
            product_loader.add_xpath('category', '//h1[@class="title"]/text()')

            product = product_loader.load_item()
            # Drop the leading dash-separated segment of the SKU when there
            # is more than one segment.
            sku_parts = product.get('sku', '').split('-')
            if len(sku_parts) > 1:
                product['sku'] = '-'.join(sku_parts[1:])

            yield Request(url=product_loader.get_output_value('url'),
                          meta={"product": product},
                          callback=self.parse_product)

示例#2

0

显示文件

文件： tigerchefspider.py 项目： oceancloud82/scraping

    def parse_product(self, response):
        """Build one ``Product`` item, with ``TigerChefMeta``, from a product page.

        The page is skipped (nothing is yielded) when no item number can be
        read from the wishlist link id. A missing price falls back to
        ``'0.00'``; a missing SKU or "Sold As" value is logged and treated as
        an empty string instead of raising.
        """
        itemno = response.xpath(
            '//div[@id="product-main-info"]//a[contains(@id, '
            '"wishlist_link_")]/@id').re(r'(\d+)')
        if not itemno:
            self.log('ERROR: itemno not found => %s' % response.url)
            return
        itemno = itemno[0]

        # Only the last two numeric fragments of the price node are joined.
        price = ''.join(
            response.xpath('//span[@id="the-price"]//text()').re(r'[\d\.,]+')
            [-2:])
        if not price:
            self.log('WARNING: price not found => %s' % response.url)
            price = '0.00'

        sku = response.xpath('//li[@itemprop="sku"]/text()').extract()
        if sku:
            sku = sku[0].replace('Model #:', '').strip()
        else:
            self.log('WARNING: SKU not found => %s' % response.url)
            # Keep ``sku`` a string: it is concatenated into the identifier
            # below, and the empty list left here before raised TypeError.
            sku = ''

        brand = response.xpath('//li[@itemprop="name"]/text()').extract()
        image_url = response.xpath(
            '//div[@id="zoom-div"]//img[@itemprop="image"]/@src').extract()
        category = response.xpath('//span[@class="breadcrumb-element"]'
                                  '//*[@itemprop="name"]/text()').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
        loader.add_value('price', price)
        if sku:
            loader.add_value('sku', sku)
        if image_url:
            loader.add_value('image_url', image_url)
        if brand:
            loader.add_value('brand', brand)
        loader.add_value('identifier', itemno + ' ' + sku)
        if category:
            loader.add_value('category', category[0].strip())

        product = loader.load_item()

        sold_as = response.xpath(
            '//li[contains(text(),"Sold As:")]/../li[2]/text()').extract()
        if sold_as:
            sold_as = sold_as[0].strip()
        else:
            # Previously an IndexError here discarded the whole item.
            self.log('WARNING: sold_as not found => %s' % response.url)
            sold_as = ''
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as
        product['metadata'] = metadata

        yield product

示例#3

0

显示文件

    def parse_product(self, response):
        """Complete the item passed in ``response.meta['product']`` and yield it.

        Price, image, category and brand are read from the product page and
        loaded on top of the incoming item. The name is overwritten when the
        page exposes one, and a ``TigerChefMeta`` with ``sold_as`` is attached
        (``'1 ea'`` when the price span is absent).
        """
        loader = ProductLoader(Product(response.meta['product']),
                               response=response)
        loader.add_xpath('price',
                         '//meta[@property="og:price:amount"]/@content')
        # A literal 0 is added after the XPath value.
        loader.add_value('price', 0)

        names = response.xpath(
            '//div[@class="product-info"]/p[@class="h1"]/text()').extract()

        images = response.xpath('//img[@class="mainImgFix"]/@src').extract()
        if images:
            loader.add_value('image_url', images[0])
        else:
            self.log("ERROR img not found")

        breadcrumbs = response.xpath(
            '//ol[contains(@class, "breadcrumb")]/li/a/text()').extract()
        if breadcrumbs:
            # The last breadcrumb link is used as the category.
            loader.add_value('category', breadcrumbs[-1])
        else:
            self.log("ERROR category not found")

        # Prefer the logo title; fall back to the "Manufacturer" table cell.
        brands = (
            response.xpath('//div[@class="logo-area"]/a/@title').extract()
            or response.xpath(
                '//td[contains(text(), "Manufacturer")]/following-sibling::td/text()'
            ).extract())
        if brands:
            loader.add_value('brand', brands[0])
        else:
            self.log("ERROR brand not found")

        item = loader.load_item()
        if names:
            item['name'] = names[0].strip()

        price_spans = response.xpath(
            '//strong[@class="price"]/span/text()').extract()
        if price_spans:
            sold_as = price_spans[0].split('/ ')[-1]
        else:
            sold_as = '1 ea'

        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as
        item['metadata'] = metadata

        yield item

示例#4

0

显示文件

    def parse_products(self, response):
        """Schedule a ``parse_product`` request for every product on a listing.

        Two layouts are handled: ``productResult`` blocks (name, price and
        identifier are passed in ``meta``) and links in the nested
        ``ageContent_pnlContent`` table (identifier only). When neither layout
        yields anything and there is no ``featuredProductsTable`` cell, the
        same URL is re-requested up to 10 times, counted in ``meta['retry']``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select('//div[contains(@class, "productResult")]')
        log.msg(">>>>>>>> FOUND %s ITEMS >>>" % len(products))
        for product in products:
            names = product.select(
                    './/h2[@class="productResultName"]/a/text()').extract()
            if names:
                name = names[0]
            else:
                self.log('Cannot find name %s' % response.url)
                # Reset per product: the name used to be left unbound on the
                # first iteration or stale from the previous product.
                name = ''
            url = product.select(
                    './/h2[@class="productResultName"]/a/@href'
                    ).extract()[0]
            url = canonicalize_url(urljoin_rfc(base_url, url))
            price = ' '.join(product.select(
                    './/span[@class="variantprice"]//text()').extract())
            identifier = identifier_regex.search(url).group(1)
            yield Request(url, callback=self.parse_product, meta={'name': name,
                                                                  'price': price,
                                                                  'identifier': identifier})

        products2 = hxs.select('//div[contains(@id, "ageContent_pnlContent")]/table/tr/td/table/tr[2]/td/a/@href').extract()
        for url in products2:
            identifier = identifier_regex.search(url).group(1)
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_product, meta={'identifier': identifier})

        if not products and not products2 and not hxs.select('//td[@id="featuredProductsTable"]'):
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                self.log('WARNING: No products and no subcategories, Retry => %s' % response.url)
                retry += 1
                new_meta = response.meta.copy()
                new_meta['retry'] = retry
                # dont_filter lets the identical URL be requested again.
                yield Request(
                    response.url,
                    meta=new_meta,
                    cookies={'pagesize': 10000},
                    callback=self.parse_products,
                    dont_filter=True)
            else:
                self.log('ERROR - NO PRODUCTS FOUND, retry limit reached, giving up, url: {}'.format(response.url))

示例#5

0

显示文件

 def parse(self, response):
     """Yield one item per entry of the JSON product list embedded in a script.

     The list is located with a regex over the page's ``<script>`` text and
     decoded with ``json.loads``. Nothing is yielded when the script data is
     missing. URL, image and category are read from the page itself and are
     therefore the same for every entry.
     """
     # Raw string: same pattern as before, without invalid escape sequences.
     data = response.xpath('//script/text()').re(r"products', (\[{.+}\])")
     if not data:
         return
     # Loop-invariant, so read the breadcrumb once.
     category = response.css('.breadcrumb a::text').extract()
     for entry in json.loads(data[0]):
         loader = ProductLoader(item=Product(), response=response, spider_name=self.name)
         loader.add_xpath('url', '//link[@rel="canonical"]/@href')
         loader.add_value('sku', entry['sku'])
         loader.add_value('identifier', str(entry['sqlProductID']) + '_' + entry['sku'])
         loader.add_value('name', entry['name'])
         loader.add_value('price', entry['price'])
         if category:
             # A page without breadcrumbs used to raise IndexError here.
             loader.add_value('category', category[-1])
         loader.add_value('brand', entry['manufacturer'])
         loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
         # stock is 1 unless inventoryStatus equals 3.
         loader.add_value('stock', int(entry['inventoryStatus'] != 3))
         yield loader.load_item()

示例#6

0

显示文件

    def parse_products(self, response):
        """Yield an item per listed product, then follow the "next" pager link.

        Category and brand are taken from ``response.meta``, and the same
        meta is forwarded to the next-page request. Each item gets a
        ``TigerChefMeta`` whose ``sold_as`` defaults to ``'1 ea'`` when no
        "Sold As" span is found.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta

        for node in hxs.select('//li[contains(@itemtype, "Product")]'):
            loader = ProductLoader(Product(), node, spider_name=self.name)
            loader.add_xpath('name', './/a[@itemprop="name"]/text()')
            loader.add_xpath('url', './/a[@itemprop="name"]/@href')
            loader.add_xpath('price', './/span[@itemprop="price"]/text()')
            loader.add_xpath('image_url', 'div/a/img/@src')

            # The identifier is whatever follows "product_" in the node id.
            node_id = node.select('@id').extract()[0]
            loader.add_value('identifier', node_id.split('product_')[-1])
            loader.add_value('category', meta.get('category'))
            loader.add_value('brand', meta.get('brand'))

            model = node.select('.//span[@itemprop="model"]/text()')
            if model:
                loader.add_value('sku', model.extract()[0])

            sold_as_texts = node.select(
                'div/div/div/div/span[contains(text(), "Sold As")]/text()'
            ).extract()
            item = loader.load_item()

            if sold_as_texts:
                sold_as = sold_as_texts[0].split('Sold As: ')[-1].strip()
            else:
                sold_as = '1 ea'
            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as
            item['metadata'] = metadata

            yield item

        next_links = hxs.select(
            '//td[@class="next"]/a[@class="pagerlink"]/@href').extract()
        if next_links:
            yield Request(urljoin_rfc(base_url, next_links[0]),
                          callback=self.parse_products,
                          meta=meta)

示例#7

0

显示文件

    def parse(self, response):
        """Parse a search-results page.

        Follows the "Next" link when there is one; otherwise sets
        ``self._search_done``. Products with a visible price are yielded as
        items. Products that only show the "see price" marker are appended to
        ``self._add_to_cart_products`` as ``(item_id, url, product)`` tuples.
        Products with neither are skipped.
        """
        next_page = response.xpath(
            '//*[@class="pagelinks"]/following-sibling::td//a[contains(text(), "Next")]/@href'
        ).extract()
        if next_page:
            yield Request(response.urljoin(next_page[0]),
                          meta={'dont_merge_cookies': True},
                          dont_filter=True)
        else:
            self._search_done = True

        products_xs = response.xpath('//td[contains(@class, "search-prod")]')
        for product_xs in products_xs:
            # Third dot-separated part of the last path segment of the link.
            product_id = product_xs.xpath(
                './/*[@class="search-item-title"]/a/@href').extract()[0].split(
                    '/')[-1].split('.')[2]

            # Exactly two link texts mean (brand, sku); any other count keeps
            # only the first text as brand. Checked explicitly instead of via
            # ValueError plus a bare ``except:``.
            title_links = product_xs.xpath(
                './/*[@class="search-item-title"]/following-sibling::div/a/text()'
            ).extract()
            if len(title_links) == 2:
                brand, sku = title_links
            else:
                brand = title_links[0] if title_links else None
                sku = None

            # A real list: it is truth-tested and indexed below, which a
            # lazy ``map`` object does not support.
            image_url = [response.urljoin(src)
                         for src in product_xs.xpath('.//img/@src').extract()]
            price = product_xs.xpath('.//*[@class="search-item-price"]').re(
                r'[\d\.,]+')
            add_to_cart = bool(
                product_xs.xpath(
                    './/*[@class="search-item-price"]/span[@class="see-price-sprite"]'
                ))
            loader = ProductLoader(item=Product(), selector=product_xs)
            identifier = product_id
            if sku:
                identifier = identifier + ' ' + sku.lower()

            loader.add_value('identifier', identifier)
            if sku:
                loader.add_value('sku', sku)
            loader.add_xpath('url', './/*[@class="search-item-title"]/a/@href')
            if image_url:
                # Swap the small-image path and prefix for the medium ones.
                loader.add_value(
                    'image_url',
                    image_url[0].replace('/pics/sm/',
                                         '/pics/md/').replace('sm_', 'md_'))
            if brand:
                loader.add_value('brand', brand)
            loader.add_xpath(
                'name', './/*[@class="search-item-title"]/a/strong/text()')
            if price:
                loader.add_value('price', price[0])
                yield loader.load_item()
            elif add_to_cart:
                product = loader.load_item()
                url = response.urljoin(
                    product_xs.xpath(
                        './/a[@class="atc-primary"]/@href').extract()[0])
                item_id = url_query_parameter(url, 'ItemID')
                self._add_to_cart_products.append((item_id, url, product))

示例#8

0

显示文件

    def parse_product(self, response):
        """Build a product item from the page's schema data and markup.

        Nothing is yielded when the loaded item has no non-blank identifier.
        When the page shows a struck-out price, the product form is posted to
        the cart URL and the item travels in ``meta['product']`` to
        ``self.parse_price``; otherwise the item is yielded directly.
        """
        schema = SpiderSchema(response)
        data = schema.get_product()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', data['Name'])
        loader.add_xpath('category',
                         u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')
        price = response.xpath(
            '//form[@id="productform"]/input[@name="price"]/@value').extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # Fall back to the schema offer price, then to '0.0'.
            loader.add_value(
                'price',
                data.get('offers', {}).get('properties',
                                           {}).get('price', '0.0'))

        # A list comprehension instead of ``map(unicode.strip, ...)`` keeps a
        # real list and does not depend on the Python 2 ``unicode`` type.
        sku = [
            text.strip() for text in response.xpath(
                '//span[contains(@class, "mfr-number")]/text()').extract()
        ]
        loader.add_value('identifier', data['productID'])
        if sku:
            loader.add_value('sku', sku)
        else:
            loader.add_value('sku', data['productID'].replace('#', ''))

        image_url = data.get('image', '').replace('www.example.com',
                                                  'www.webstaurantstore.com')
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))

        brand = data.get('brand', '')
        if not brand:
            brand = response.xpath(
                '//tr[@class="highlight" and .//b[contains(text(), "Manufacturer Name")]]/td[not(b)]/text()'
            ).extract()
            brand = brand[0].strip() if brand else ''

        if brand:
            loader.add_value('brand', brand)

        sold_as = response.xpath(
            '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
        ).extract()

        product = loader.load_item()
        if product.get('identifier', '').strip() != '':
            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as[0].replace('/',
                                                     '') if sold_as else ''
            product['metadata'] = metadata

            # Add to cart to see the price
            if response.xpath(
                    '//*[@itemprop="price" and contains(@class, "strikeOutPrice")][1]'
            ):
                cart_url = 'http://www.webstaurantstore.com/viewcart.html'
                # Pair name and value per input. Zipping two separately
                # extracted lists shifted values whenever an input lacked a
                # name or a value attribute.
                formdata = {}
                for form_input in response.xpath(
                        '//form[@id="productform"]/input'):
                    input_name = form_input.xpath('./@name').extract()
                    if not input_name:
                        continue
                    input_value = form_input.xpath('./@value').extract()
                    formdata[input_name[0]] = (input_value[0]
                                               if input_value else '')
                # quantity
                formdata[u'qty'] = '1'
                f_request = FormRequest(url=cart_url,
                                        method='POST',
                                        formdata=formdata,
                                        callback=self.parse_price,
                                        meta={
                                            'product': product,
                                            'dont_merge_cookies': True
                                        },
                                        dont_filter=True)

                yield f_request
            else:
                yield product

示例#9

0

显示文件

    def parse_product(self, response):
        """Extract a single product item from a product-detail page.

        Name, brand, image and category are each logged as an error when
        missing, but the item is still yielded. The whitespace-normalised
        quantity text is added as an extra ``name`` value unless it is
        "each" (case-insensitive), and is stored as ``sold_as`` in the item's
        ``TigerChefMeta`` (``'1 ea'`` when absent).
        """
        hxs = HtmlXPathSelector(response)

        names = hxs.select('//h1[@id="partNameId"]/text()').extract()

        quantity = hxs.select(
            '//label[@class="productdetail-qtytxt"]/../text()[last()]'
        ).extract()
        if quantity:
            # Turn newlines, carriage returns and tabs into spaces, trim,
            # then collapse runs of spaces.
            spaced = re.sub('[\n\r\t]', ' ', quantity[0]).strip()
            quantity = re.sub(' +', ' ', spaced)

        loader = ProductLoader(response=response,
                               item=Product(),
                               spider_name=self.name)

        if names:
            loader.add_value('name', names[0].strip())
        else:
            self.log("ERROR name not found")

        brands = hxs.select(
            '//div[@class="productdetail-contentarea-wrapper"]/table/tr/td[.//b[contains(text(),"Manufacturer:")]]/a/text()'
        ).extract()
        if brands:
            loader.add_value("brand", brands[0].strip())
        else:
            self.log("ERROR brand not found")

        images = hxs.select(
            '//div[@class="productdetail-productimage"]/a/img/@src').extract()
        if images:
            loader.add_value("image_url", images[0])
        else:
            self.log("ERROR img_url not found")

        categories = hxs.select(
            '(//div[@id="productdetail-crumbcategory"]/ul/li/a)[last()]/text()'
        ).extract()
        if categories:
            loader.add_value("category", categories[0].strip())
        else:
            self.log("ERROR category not found")

        if quantity and quantity.lower() != 'each':
            loader.add_value('name', quantity)

        loader.add_value('url', response.url)
        loader.add_xpath('price',
                         '//font[@class="txt-purchaseprice20blue"]/text()')

        model_text = ''.join(
            hxs.select('//b[contains(text(), "Model #:")]/../text()').extract()
        ).strip()
        tokens = model_text.split()
        # The same token appearing twice is reduced to one.
        if len(tokens) == 2 and tokens[0] == tokens[1]:
            model_text = tokens[0]
        loader.add_value('sku', model_text)
        loader.add_xpath('identifier',
                         '//form//input[@name="productId"]/@value')

        item = loader.load_item()

        metadata = TigerChefMeta()
        metadata['sold_as'] = quantity if quantity else '1 ea'
        item['metadata'] = metadata

        yield item