Python extract_helper示例，Visions.utils.xgroup.extract_helper Python示例

示例#1

0

显示文件

    def parse_category(self, response):
        """Follow a category page towards its product listings.

        Products are listed under a particular brand or subcategory; either
        one can be used to reach the product.

        If ``extract_helper`` finds nothing for ``self.EMPTY_PAGE_CHECK`` the
        same URL is re-requested with ``parse_product`` as callback.
        Otherwise the first sub-navigation link found via
        ``self.SUB_NAV_PATHS`` / ``self.ALT_SUB_NAV_PATHS`` is followed and
        parsed as a category again.

        :param response: the response for the category page.
        :returns: generator yielding at most one ``Request``.
        """
        # For debug; bundles and open box pages are not supported as those
        # pages have no breadcrumbs.
        crumbs = self.get_breadcrumbs(response)
        log.msg('IN ' + str(crumbs))

        if not extract_helper(response, self.EMPTY_PAGE_CHECK):
            # dont_filter: this URL has already been fetched once, so the
            # duplicate filter would otherwise drop the request.
            yield Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True)
            return

        subcat_link = extract_helper(
            response, [self.SUB_NAV_PATHS, self.ALT_SUB_NAV_PATHS])
        if not subcat_link:
            return

        if 'bundle' not in subcat_link.lower():
            # The dot is escaped so only the literal file name
            # 'Default.aspx' is rewritten (an unescaped '.' matches any
            # character).
            subcat_link = re.sub(r'Default\.aspx',
                                 '/Catalogue/Category/ProductResults.aspx',
                                 subcat_link)

        yield Request(add_schema(response.url, subcat_link),
                      callback=self.parse_category)

示例#2

0

显示文件

    def get_product_details(self, response):
        """Build a ``VisionsProduct`` item from a product page.

        Breadcrumbs and the page URL are always loaded. When the breadcrumbs
        come back as a single string they are also stored as the category.
        The remaining fields are either extracted through ``detailsRunner``
        or, when the empty-page check matches, filled with ``'N/A'``.

        :param response: the response for the product page.
        :returns: generator yielding exactly one loaded item.
        """
        breadcrumbs = self.get_breadcrumbs(response)
        item_loader = ItemLoader(item=VisionsProduct())

        item_loader.add_value('breadcrumbs', breadcrumbs)
        item_loader.add_value('url', response.url)

        if isinstance(breadcrumbs, basestring):
            item_loader.add_value('category', breadcrumbs)

        # Ensure we aren't wasting time extracting from an empty page
        page_is_empty = extract_helper(response, self.EMPTY_PAGE_CHECK)

        if not page_is_empty:
            details = detailsRunner(self.PRODUCT_DETAILS, response=response)

            # Fall back to the 'price_gif' value when no price was found,
            # then drop the helper key so it is not loaded into the item.
            if not details['price']:
                details['price'] = details['price_gif']
            details.pop('price_gif')

            # Fix truncated image urls
            if details['image']:
                details['image'] = add_schema(response.url, details['image'])

            for field_name in details:
                item_loader.add_value(field_name, details[field_name])
        else:
            # Placeholder values; fields whose name contains an underscore
            # (such as 'price_gif') are skipped.
            for detail in self.PRODUCT_DETAILS:
                if '_' not in detail.name:
                    item_loader.add_value(detail.name, 'N/A')

        yield item_loader.load_item()

示例#3

0

显示文件

    def testPage1(self):
        """Extract with a single XPath given as a plain string."""
        page_url = ('http://www.visions.ca/catalogue/category/Details.aspx'
                    '?categoryId=162&productId=4644&sku=KUBE2')
        # Parentheses only; this is one string, not a tuple.
        shipping_xpath = '//div/span[contains(@id, "Shipping")]/text()'

        # Response built by get_response from the named test page and URL.
        page = get_response('test_extract_helper_page1', url=page_url)

        self.assertEqual(extract_helper(page, shipping_xpath),
                         'Free Shipping!')

示例#4

0

显示文件

文件： test_xgroup.py 项目： ecotg/Visions

	def testPage1(self):
		"""Extract with a single XPath given as a plain string."""
		page_url = (
			'http://www.visions.ca/catalogue/category/Details.aspx'
			'?categoryId=162&productId=4644&sku=KUBE2'
		)
		# Parentheses only; this is one string, not a tuple.
		shipping_xpath = '//div/span[contains(@id, "Shipping")]/text()'

		# Response built by get_response from the named test page and URL.
		page = get_response('test_extract_helper_page1', url=page_url)

		self.assertEqual(
			extract_helper(page, shipping_xpath),
			'Free Shipping!'
		)

示例#5

0

显示文件

    def testPage2(self):
        """Extract the price when several candidate XPaths are supplied."""
        url = ('http://www.visions.ca/catalogue/category/Details'
               '.aspx?categoryId=162&productId=4644&sku=KUBE2')
        # Three separate candidate paths. Each entry needs its own trailing
        # comma: without it Python silently concatenates adjacent string
        # literals, fusing two XPaths into one expression.
        paths = (
            '//div/span[contains(@id, "price")]/a/text()',
            '//div/span[contains(@id, "price")]/text()',
            '//div[@class="productdetail-pricing"]/div/span[@id]/text()',
        )
        response = get_response('test_extract_helper_page1', url=url)

        price = extract_helper(response, paths)

        self.assertEqual(price, '$398.00')

示例#6

0

显示文件

文件： test_xgroup.py 项目： ecotg/Visions

	def testPage2(self):
		"""Extract the price when several candidate XPaths are supplied."""
		url = (
			'http://www.visions.ca/catalogue/category/Details'
			'.aspx?categoryId=162&productId=4644&sku=KUBE2'
		)
		# Three separate candidate paths. Each entry needs its own trailing
		# comma: without it Python silently concatenates adjacent string
		# literals, fusing two XPaths into one expression.
		paths = (
			'//div/span[contains(@id, "price")]/a/text()',
			'//div/span[contains(@id, "price")]/text()',
			'//div[@class="productdetail-pricing"]/div/span[@id]/text()',
		)
		response = get_response('test_extract_helper_page1', url=url)

		price = extract_helper(response, paths)

		self.assertEqual(price, '$398.00')

示例#7

0

显示文件

    def parse_product(self, response):
        """Follow a product link, or parse the page itself as a product.

        The page is searched for a link using ``self.PRODUCT_PAGE_PATHS``
        followed by ``self.BUNDLE_PAGE_PATHS``. When a link is found it is
        normalised and requested with ``get_product_details`` as callback.
        When none is found the current page is treated as a product page
        and its details are extracted directly.

        :param response: the response for a listing or product page.
        :returns: generator yielding either one ``Request`` or the item(s)
            produced by ``get_product_details``.
        """
        all_links = list(self.PRODUCT_PAGE_PATHS)
        all_links.extend(self.BUNDLE_PAGE_PATHS)

        product_links = extract_helper(response, all_links)

        if not product_links:
            # We have landed on a product page, parse the product details.
            # get_product_details is itself a generator, so its results are
            # re-yielded one by one; yielding the call directly would hand
            # the caller a generator object instead of the items.
            for item in self.get_product_details(response):
                yield item
            return

        # Bare 'Details.aspx...' links need a directory prefix, which
        # depends on whether the current page is a bundle page.
        if product_links.startswith('Details.aspx'):
            if 'bundle' not in response.url.lower():
                prefix = '/Catalogue/Category/'
            else:
                prefix = '/Catalogue/Bundles/'
            product_links = prefix + product_links

        # Parse out urls: '../../' refers to Catalogue/Category, so the
        # dot runs (e.g. '../..') are stripped before the schema is added.
        url = add_schema(response.url,
                         re.sub(r'\.+/*\.+', '', product_links))
        yield Request(url, callback=self.get_product_details)