示例#1
0
    def parse_category(self, response):
        '''
        Walk a category page towards its product listing.

        Products are listed under a particular brand or subcategory; either
        one can be used to grab the product.

        When the EMPTY_PAGE_CHECK extraction gives a falsy result, the same
        url is requested again with parse_product as the callback.
        Otherwise a sub-navigation link is looked up through SUB_NAV_PATHS
        and ALT_SUB_NAV_PATHS and, if one is found, followed with
        parse_category as the callback.
        '''
        # For debug, bundles and open box page not supported as the pages
        # have no breadcrumbs.
        crumbs = self.get_breadcrumbs(response)
        log.msg('IN ' + str(crumbs))

        if not extract_helper(response, self.EMPTY_PAGE_CHECK):
            # Same url as the current response, hence dont_filter=True
            # (presumably to get past the scheduler's duplicate filter).
            yield Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True)
            return

        paths = [self.SUB_NAV_PATHS, self.ALT_SUB_NAV_PATHS]
        subcat_link = extract_helper(response, paths)

        if subcat_link:
            # Non-bundle links point at Default.aspx; swap in the product
            # results page. A literal replace is used on purpose: the old
            # regex 'Default.aspx' let '.' match any character.
            if 'bundle' not in subcat_link.lower():
                subcat_link = subcat_link.replace(
                    'Default.aspx',
                    '/Catalogue/Category/ProductResults.aspx')

            url = add_schema(response.url, subcat_link)
            yield Request(url, callback=self.parse_category)
示例#2
0
    def get_product_details(self, response):
        '''
        Build a VisionsProduct item for the given response and yield it.

        The breadcrumbs and url are always loaded; when the breadcrumbs are
        a plain string they are loaded as the category too. If the
        EMPTY_PAGE_CHECK extraction is truthy, every PRODUCT_DETAILS entry
        whose name has no underscore is filled with 'N/A'. Otherwise the
        values returned by detailsRunner are loaded, after folding
        'price_gif' into 'price' and completing the image url.
        '''
        breadcrumbs = self.get_breadcrumbs(response)
        item_loader = ItemLoader(item=VisionsProduct())

        item_loader.add_value('breadcrumbs', breadcrumbs)
        item_loader.add_value('url', response.url)

        if isinstance(breadcrumbs, basestring):
            item_loader.add_value('category', breadcrumbs)

        # Ensure we aren't wasting time extracting from an empty page
        page_is_empty = extract_helper(response, self.EMPTY_PAGE_CHECK)

        if not page_is_empty:
            details = detailsRunner(self.PRODUCT_DETAILS, response=response)

            # Fall back to the gif-based price when no price was found,
            # then drop the helper key so it is never loaded into the item.
            if not details['price']:
                details['price'] = details['price_gif']
            details.pop('price_gif')

            # Fix truncated image urls
            image_url = details['image']
            if image_url:
                details['image'] = add_schema(response.url, image_url)

            for field_name, field_value in details.items():
                item_loader.add_value(field_name, field_value)
        else:
            for detail in self.PRODUCT_DETAILS:
                if '_' in detail.name:  # Don't load price
                    continue
                item_loader.add_value(detail.name, 'N/A')

        yield item_loader.load_item()
示例#3
0
    def testUrl2(self):
        '''
        Test that the helper adds only the schema when the url already
        carries its host.
        '''
        host_only_url = 'www.visions.ca/Catalogue'
        fixed_url = helpers.add_schema(self.parent_url, host_only_url)

        self.assertEqual(fixed_url, 'http://www.visions.ca/Catalogue')
示例#4
0
    def testUrl1(self):
        '''
        Test that the helper adds both the proper host and the schema to a
        url that only has a path.
        '''
        path_only_url = '/Catalogue/Category/Details.aspx?categoryId=2'
        fixed_url = helpers.add_schema(self.parent_url, path_only_url)

        self.assertEqual(
            fixed_url,
            'http://www.visions.ca/Catalogue/Category/Details.aspx?categoryId=2'
        )
示例#5
0
    def parse_product(self, response):
        '''
        Follow a product link from a listing page, or parse the product
        details when no such link is found.

        A link is looked up through PRODUCT_PAGE_PATHS and
        BUNDLE_PAGE_PATHS. When one is found it is turned into a full url
        and requested with get_product_details as the callback. When none
        is found the response is handed to get_product_details directly and
        every item it produces is yielded.
        '''
        all_links = list(self.PRODUCT_PAGE_PATHS)
        all_links.extend(self.BUNDLE_PAGE_PATHS)

        product_links = extract_helper(response, all_links)

        if not product_links:
            # We have landed on a product page, parse the product details.
            # get_product_details is a generator, so its items have to be
            # re-yielded one by one; yielding the call itself would emit a
            # generator object instead of the item.
            for item in self.get_product_details(response):
                yield item
            return

        # Parse out urls ../../ refers to Catalogue/Category
        if product_links.startswith('Details.aspx'):
            # Bare 'Details.aspx...' links need their directory put back;
            # which one depends on whether this is a bundle page.
            if 'bundle' not in response.url.lower():
                product_links = '/Catalogue/Category/' + product_links
            else:
                product_links = '/Catalogue/Bundles/' + product_links

        # Raw string: same pattern as before, without the invalid '\.'
        # escape in a normal string literal.
        url = add_schema(response.url,
                         re.sub(r'\.+/*\.+', '', product_links))
        yield Request(url, callback=self.get_product_details)
示例#6
0
 def parse(self, response):
     '''
     Yield one parse_category request for every link extracted from the
     response with the MAIN_NAV_PATHS xpath.
     '''
     nav_links = response.xpath(self.MAIN_NAV_PATHS).extract()
     for nav_link in nav_links:
         category_url = add_schema(response.url, nav_link)
         yield Request(category_url, callback=self.parse_category)