Python Utils.clean_url示例，spiders_utils.Utils.clean_url Python示例

示例#1

0

显示文件

    def extract_result_products(self, response):
        """Build one SearchItem per product link found in a results page.

        For every matched ``<a>`` node this fills in ``product_name`` (when the
        link has text), ``product_url``, any ``origin_*`` values present in
        ``response.meta``, and, when the corresponding nodes exist,
        ``product_model`` and ``product_target_price``.

        Args:
            response: the search results response; ``response.meta`` may carry
                ``origin_url``, ``origin_name`` and ``origin_model``.

        Returns:
            A list of SearchItem objects, one per matched product link. Items
            with no product name are still returned (an error is logged).
        """
        hxs = HtmlXPathSelector(response)

        items = []
        results = hxs.select(
            "//div[@class='list-item-info']/div[@class='sku-title']/h4/a")

        for result in results:
            item = SearchItem()

            product_name_holder = result.select("text()").extract()
            if product_name_holder:
                item['product_name'] = product_name_holder[0].strip()
            else:
                # The origin URL only exists in response.meta (and only
                # sometimes); there is no local variable of that name, so read
                # it defensively instead of raising NameError while logging.
                self.log("Error: No product name: " + str(response.url) +
                         " from product: " +
                         str(response.meta.get('origin_url', '')),
                         level=log.ERROR)

            item['product_url'] = Utils.clean_url(
                Utils.add_domain(
                    result.select("@href").extract()[0],
                    "http://www.bestbuy.com"))

            # copy over whichever origin fields the caller provided
            for origin_key in ('origin_url', 'origin_name', 'origin_model'):
                if origin_key in response.meta:
                    item[origin_key] = response.meta[origin_key]

            model_holder = result.select(
                "../../../div[@class='sku-model']/ul/li[@class='model-number']/span[@id='model-value']/text()"
            ).extract()
            if model_holder:
                item['product_model'] = model_holder[0]

            price_holder = result.select(
                "../../../../div[@class='list-item-price']//div[@class='price-block']//div[@class='medium-item-price']/text()[normalize-space()]"
            ).extract()
            if price_holder:
                # drop thousands separators before converting to a number
                price = price_holder[0].strip().replace(",", "")
                item['product_target_price'] = float(price)

            items.append(item)

        return items

示例#2

0

显示文件

文件： ocado_spider.py 项目： lifelonglearner127/tmtext

    def parseBrand(self, response):
        """Yield a ProductItem for each product on a brand page, then paginate.

        The running and total product counts are carried between pages through
        ``response.meta``; on the first page (no ``total_product_count`` in
        meta) the total is read from the page itself and the running count
        starts at zero.

        Args:
            response: the brand page response; ``response.meta['category']``
                must be set, and ``total_product_count`` /
                ``current_product_count`` are set on follow-up pages.

        Yields:
            ProductItem objects with ``product_url`` and ``category`` set,
            followed by a Request for the next page when
            ``self.build_next_page_url`` returns one.
        """
        hxs = HtmlXPathSelector(response)

        # category of items on current page
        category = response.meta['category']

        # the counts in meta are used as the stop criterion for pagination
        is_first_page = 'total_product_count' not in response.meta
        if is_first_page:
            # extract number of products for this brand
            product_count = int(
                hxs.select("//h2[@id='productCount']//text()").re("[0-9]+")[0])
            cur_product_count = 0
        else:
            product_count = response.meta['total_product_count']
            cur_product_count = response.meta['current_product_count']

        # extract products from this page
        product_links = hxs.select(
            "//h3[@class='productTitle']/a/@href").extract()
        # Build a real list (not a lazy map object) so that len() below works
        # on Python 3 as well as Python 2.
        product_urls = [Utils.add_domain(link, self.base_url)
                        for link in product_links]

        for product_url in product_urls:
            item = ProductItem()
            # remove parameters in url
            item['product_url'] = Utils.clean_url(product_url)
            item['category'] = category

            yield item

        # add nr of extracted products to current product count
        cur_product_count += len(product_urls)

        # get next page if any
        next_page = self.build_next_page_url(response.url,
                                             product_count,
                                             cur_product_count,
                                             first=is_first_page)

        if next_page:
            yield Request(url=next_page,
                          callback=self.parseBrand,
                          meta={
                              'total_product_count': product_count,
                              'current_product_count': cur_product_count,
                              'category': category
                          })

示例#3

0

显示文件

文件： bestbuy_spider.py 项目： Blas-P12/scrapy-project

    def parseResults(self, response):
        """Collect SearchItems from a results page and hand off to reduceResults.

        Items already accumulated by a previous request (``response.meta['items']``)
        are extended with one SearchItem per product link on this page. The
        updated set is stored back in ``response.meta`` under both ``items``
        and ``parsed`` before delegating to ``self.reduceResults``.

        Args:
            response: the search results response; ``response.meta`` must
                contain ``origin_name`` and ``origin_model`` and may contain
                ``items``, ``origin_url`` and ``origin_id``.

        Returns:
            Whatever ``self.reduceResults(response)`` returns.
        """
        hxs = HtmlXPathSelector(response)

        # Kept as lookups so a missing key still fails early with KeyError,
        # exactly as before, even though the values are not used below.
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get last request's items and
        # add to them the results
        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//div[@class='hproduct']/div[@class='info-main']/h3/a")

        for result in results:
            item = SearchItem()
            item['product_name'] = result.select("text()").extract()[0].strip()
            item['product_url'] = Utils.clean_url(
                Utils.add_domain(
                    result.select("@href").extract()[0],
                    "http://www.bestbuy.com"))

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            # NOTE: an earlier version copied response.meta['origin_id'] into
            # `request.meta` here, but no `request` exists in this scope, so
            # that line raised NameError. Nothing needs copying: 'origin_id',
            # when present, is already in response.meta, which is what gets
            # passed on to reduceResults below.

            model_holder = result.select(
                "parent::node()/parent::node()//strong[@itemprop='model']/text()"
            ).extract()
            if model_holder:
                item['product_model'] = model_holder[0]

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)

示例#4

0

显示文件

    def parseResults(self, response):
        """Collect SearchItems from a results page and hand off to reduceResults.

        For each product link, the full product name (the ``title`` of an
        ``<abbr>`` when present, otherwise the ``<strong>`` text) is combined
        with the quantity text found directly inside the link. Results lacking
        a name or URL are logged and skipped. The accumulated set is stored in
        ``response.meta['items']`` and ``response.meta['parsed']`` is set to
        True before delegating to ``self.reduceResults``.

        Args:
            response: the search results response; ``response.meta`` must
                contain ``origin_url`` and ``origin_name`` and may contain
                ``items`` and ``origin_model``.

        Returns:
            Whatever ``self.reduceResults(response)`` returns.
        """
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select("//h3[@class='productTitle']/a")
        for result in results:
            item = SearchItem()

            # Guard every extraction: indexing [0] on an empty result would
            # raise IndexError and bypass the log-and-skip branch below.
            url_holder = result.select("@href").extract()
            product_url = url_holder[0] if url_holder else ""

            # node containing full product name if the displayed one is
            # abbreviated. use this one if exists, and displayed one if not
            name_holder = result.select("strong/abbr/@title").extract()
            if not name_holder:
                name_holder = result.select("strong/text()").extract()
            product_name = name_holder[0] if name_holder else ""

            # assert name is not abbreviated
            assert '...' not in product_name

            # add product quantity (text directly inside <a>, outside <strong>)
            quantity_holder = result.select(
                "text()[normalize-space()!='']").extract()
            product_quantity = quantity_holder[0].strip() if quantity_holder else ""
            product_name_full = product_name + " " + product_quantity

            # skip this result if there is no product name or url
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(
                    Utils.clean_url(product_url), self.base_url)

                item['product_name'] = product_name_full
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # Pass the updated 'items' on through meta; reduceResults receives this
        # same response, so no new request is made here.
        response.meta['items'] = items

        # field 'parsed' indicates that the call was received from this method
        # (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        return self.reduceResults(response)

示例#5

0

显示文件

 def build_url(self, url):
     """Return ``url`` passed through Utils.add_domain and Utils.clean_url.

     ``Utils.add_domain`` is called with ``self.BASE_URL`` first; its result
     is then given to ``Utils.clean_url`` together with ``['#']``.
     """
     with_domain = Utils.add_domain(url, self.BASE_URL)
     return Utils.clean_url(with_domain, ['#'])