def parse_category(self, response):
    '''
    Follow a category page down to its product listing.

    Products are listed under a particular brand or subcategory; either
    one can be used to reach the products.

    If ``extract_helper`` finds nothing for ``EMPTY_PAGE_CHECK``, the same
    URL is re-requested with ``parse_product`` as the callback. Otherwise
    the first sub-navigation link found (primary paths, then the alternate
    ones) is followed and handled by this method again.

    Yields at most one ``Request``.
    '''
    # For debug, bundles and open box page not supported as the pages
    # have no breadcrumbs.
    crumbs = self.get_breadcrumbs(response)
    log.msg('IN ' + str(crumbs))

    if not extract_helper(response, self.EMPTY_PAGE_CHECK):
        # Same URL as the response being handled, so dont_filter=True is
        # passed to keep the duplicate filter from discarding it.
        yield Request(response.url, callback=self.parse_product,
                      dont_filter=True)
        return

    paths = [self.SUB_NAV_PATHS, self.ALT_SUB_NAV_PATHS]
    subcat_link = extract_helper(response, paths)
    if not subcat_link:
        return

    # Bundle links are followed untouched; every other link is pointed at
    # the product results page. The dot is escaped so that only a literal
    # "Default.aspx" is rewritten (an unescaped '.' matches any character).
    if 'bundle' not in subcat_link.lower():
        subcat_link = re.sub(r'Default\.aspx',
                             '/Catalogue/Category/ProductResults.aspx',
                             subcat_link)

    url = add_schema(response.url, subcat_link)
    yield Request(url, callback=self.parse_category)
def get_product_details(self, response):
    '''
    Build and yield one VisionsProduct item for the given response.

    The breadcrumbs and the response URL are always loaded; when the
    breadcrumbs value is a single string it is also loaded as the
    category. The remaining fields come from ``PRODUCT_DETAILS``:
    placeholders when ``EMPTY_PAGE_CHECK`` matches, extracted values
    otherwise.
    '''
    breadcrumbs = self.get_breadcrumbs(response)

    item_loader = ItemLoader(item=VisionsProduct())
    item_loader.add_value('breadcrumbs', breadcrumbs)
    item_loader.add_value('url', response.url)
    if isinstance(breadcrumbs, basestring):
        item_loader.add_value('category', breadcrumbs)

    # Ensure we aren't wasting time extracting from an empty page
    page_is_empty = extract_helper(response, self.EMPTY_PAGE_CHECK)
    if not page_is_empty:
        details = detailsRunner(self.PRODUCT_DETAILS, response=response)

        # price_gif only serves as a fallback source for price and is
        # removed before the fields are handed to the loader.
        if not details['price']:
            details['price'] = details['price_gif']
        details.pop('price_gif')

        # Fix truncated image urls
        if details['image']:
            details['image'] = add_schema(response.url, details['image'])

        for field_name, field_value in details.items():
            item_loader.add_value(field_name, field_value)
    else:
        # Fill every detail with a placeholder, skipping names that
        # contain an underscore (such as price_gif).
        for detail in self.PRODUCT_DETAILS:
            if '_' in detail.name:
                continue
            item_loader.add_value(detail.name, 'N/A')

    yield item_loader.load_item()
def testPage1(self):
    '''
    extract_helper with one XPath passed as a plain string (the original
    note describes this case as "a single path and index==one").
    '''
    page_url = ('http://www.visions.ca/catalogue/category/Details.aspx'
                '?categoryId=162&productId=4644&sku=KUBE2')
    shipping_xpath = '//div/span[contains(@id, "Shipping")]/text()'

    page = get_response('test_extract_helper_page1', url=page_url)

    self.assertEqual(extract_helper(page, shipping_xpath), 'Free Shipping!')
def testPage1(self):
    '''
    extract_helper with one XPath passed as a plain string (the original
    note describes this case as "a single path and index==one").
    '''
    base = 'http://www.visions.ca/catalogue/category/Details.aspx'
    query = '?categoryId=162&productId=4644&sku=KUBE2'
    shipping_xpath = '//div/span[contains(@id, "Shipping")]/text()'

    page = get_response('test_extract_helper_page1', url=base + query)
    extracted = extract_helper(page, shipping_xpath)

    self.assertEqual(extracted, 'Free Shipping!')
def testPage2(self):
    '''
    Test extraction using multiple paths and index==one

    Three alternative XPaths for the price are handed to extract_helper
    as a tuple, and the extracted value must equal the expected price.
    '''
    url = ('http://www.visions.ca/catalogue/category/Details'
           '.aspx?categoryId=162&productId=4644&sku=KUBE2')
    # Each path needs its own trailing comma: without one, adjacent
    # string literals are silently concatenated into a single XPath,
    # which is how the first two entries used to be merged.
    paths = (
        '//div/span[contains(@id, "price")]/a/text()',
        '//div/span[contains(@id, "price")]/text()',
        '//div[@class="productdetail-pricing"]/div/span[@id]/text()',
    )
    response = get_response('test_extract_helper_page1', url=url)
    price = extract_helper(response, paths)
    self.assertEqual(price, '$398.00')
def testPage2(self):
    '''
    Test extraction using multiple paths and index==one

    Three alternative XPaths for the price are handed to extract_helper
    as a tuple, and the extracted value must equal the expected price.
    '''
    url = (
        'http://www.visions.ca/catalogue/category/Details'
        '.aspx?categoryId=162&productId=4644&sku=KUBE2'
    )
    # Each path needs its own trailing comma: without one, adjacent
    # string literals are silently concatenated into a single XPath,
    # which is how the first two entries used to be merged.
    paths = (
        '//div/span[contains(@id, "price")]/a/text()',
        '//div/span[contains(@id, "price")]/text()',
        '//div[@class="productdetail-pricing"]/div/span[@id]/text()',
    )
    response = get_response('test_extract_helper_page1', url=url)
    price = extract_helper(response, paths)
    self.assertEqual(price, '$398.00')
def parse_product(self, response):
    '''
    Follow a product link if the page has one, else scrape the page itself.

    The product-page paths and the bundle-page paths are tried together.
    When a link is found, a ``Request`` for it is yielded with
    ``get_product_details`` as the callback. When none is found, the
    response is treated as a product page and the items produced by
    ``get_product_details`` are yielded directly.
    '''
    all_links = list(self.PRODUCT_PAGE_PATHS)
    all_links.extend(self.BUNDLE_PAGE_PATHS)
    product_links = extract_helper(response, all_links)

    if not product_links:
        # We have landed on a product page, parse the product details.
        # get_product_details is a generator, so its items are re-yielded
        # one by one; yielding the call result itself would emit a
        # generator object rather than the items.
        for item in self.get_product_details(response):
            yield item
        return

    # A bare "Details.aspx..." link is relative to the current section,
    # which is the bundles folder when the current URL mentions a bundle
    # and the category folder otherwise.
    if product_links.startswith('Details.aspx'):
        if 'bundle' in response.url.lower():
            prefix = '/Catalogue/Bundles/'
        else:
            prefix = '/Catalogue/Category/'
        product_links = prefix + product_links

    # Parse out urls ../../ refers to Catalogue/Category
    cleaned_link = re.sub(r'\.+/*\.+', '', product_links)
    url = add_schema(response.url, cleaned_link)
    yield Request(url, callback=self.get_product_details)