示例#1
0
 def parse_book(self, response):
     """Load the book title from the product page into a BooksCrawlerItem.

     The title is the text of the <h1> inside the element whose class is
     exactly "col-sm-6 product_main"; it is stored under the 'Title' field.
     Returns the item produced by the loader.
     """
     loader = ItemLoader(item=BooksCrawlerItem(), response=response)

     title_xpath = '//*[@class="col-sm-6 product_main"]/h1/text()'
     book_title = response.xpath(title_xpath).extract_first()

     loader.add_value('Title', book_title)
     return loader.load_item()
示例#2
0
	def parse_book(self, response):
		"""Yield one BooksCrawlerItem holding the page's <h1> text and URL."""
		book = BooksCrawlerItem()

		# First <h1> text on the page, and the URL the request was made for.
		heading = response.css('h1::text').extract_first()
		requested_url = response.request.url

		book['title'] = heading
		book['url'] = requested_url
		yield book
		
示例#3
0
文件: books.py 项目: mkubasz/other
    def parse_book(self, response):
        """Load the title, price and cover-image URL of a book page.

        Args:
            response: The Scrapy response for a single book page.

        Returns:
            The BooksCrawlerItem produced by the item loader, with the
            'title', 'price' and 'image_urls' values that were found.
        """
        loader = ItemLoader(item=BooksCrawlerItem(), response=response)
        title = response.css('h1::text').extract_first()
        price = response.xpath(
            '//*[@class="price_color"]/text()').extract_first()

        image_urls = response.xpath('//img/@src').extract_first()
        if image_urls is not None:
            # extract_first() returns None when no <img> matches; calling
            # .replace() on it unconditionally raised AttributeError.
            # The src is site-relative ('../../...'), so anchor it at the
            # site root.
            image_urls = image_urls.replace(
                '../..', 'http://books.toscrape.com')

        loader.add_value('title', title)
        loader.add_value('price', price)
        loader.add_value('image_urls', image_urls)

        return loader.load_item()
示例#4
0
    def parse_book(self, response):
        """Yield a dict with the book's <h1> title and the requested URL.

        After the dict has been yielded, the same two values are also fed
        through an ItemLoader for BooksCrawlerItem.
        """
        loader = ItemLoader(item=BooksCrawlerItem(), response=response)
        book_title = response.xpath('//h1/text()').extract_first()
        page_url = response.request.url

        # Keep the pairs in an immutable tuple so the loader below sees the
        # extracted values even if the consumer mutates the yielded dict.
        field_pairs = (('Title', book_title), ('Url', page_url))
        yield dict(field_pairs)

        for field_name, field_value in field_pairs:
            loader.add_value(field_name, field_value)

        # NOTE(review): the loaded item is neither yielded nor returned, so
        # only the plain dict above ever reaches the caller - confirm whether
        # the item was meant to be emitted instead.
        loader.load_item()
    def parse_book(self, response):
        """Collect a book's title, price and image URL into an item.

        Args:
            response: The Scrapy response for a single book page.

        Returns:
            The BooksCrawlerItem built by the item loader from the 'title',
            'price' and 'image_urls' values found on the page.
        """
        loader = ItemLoader(item=BooksCrawlerItem(), response=response)
        title = response.css('h1::text').extract_first()
        # The price is the <p class="price_color"> sibling that follows <h1>.
        price_color = response.xpath(
            '//h1/following-sibling::p[@class="price_color"]/text()'
        ).extract_first()

        image_urls = response.xpath('//img/@src').extract_first()
        # Guard against a page without any <img>: extract_first() then
        # yields None and the unguarded .replace() used to crash.
        if image_urls is not None:
            image_urls = image_urls.replace(
                '../..', 'http://books.toscrape.com')

        loader.add_value('title', title)
        loader.add_value('price', price_color)
        loader.add_value('image_urls', image_urls)

        return loader.load_item()
示例#6
0
    def parse_book(self, response):
        """Scrape one book page.

        Args:
            response: The Scrapy response for a single book page.

        Yields:
            First the BooksCrawlerItem built by the item loader ('title',
            'price', 'image_urls'), then a plain dict holding every data
            point extracted here.
        """
        loader = ItemLoader(item=BooksCrawlerItem(), response=response)

        title = response.css("h1::text").extract_first()
        price = response.xpath(
            "//*[@class='price_color']/text()").extract_first()

        image_url = response.xpath('//img/@src').extract_first()
        if image_url is not None:
            # Resolve the relative src against the page URL. The previous
            # string replacement used a base ending in '/', which produced
            # 'http://books.toscrape.com//media/...' (double slash).
            image_url = response.urljoin(image_url)

        rating = response.xpath(
            "//*[contains(@class,'star-rating')]/@class").extract_first()
        if rating is not None:
            # The class attribute looks like 'star-rating Three'; strip the
            # separator space that removing the prefix leaves behind.
            rating = rating.replace("star-rating", '').strip()

        description = response.xpath(
            "//*[@id='product_description']/following-sibling::p/text()"
        ).extract_first()

        # product_info is defined outside this block; presumably it looks up
        # the row with the given label in the page's product table - verify.
        upc = product_info(response, 'UPC')
        product_type = product_info(response, 'Product Type')
        price_without_tax = product_info(response, 'Price (excl. tax)')
        price_with_tax = product_info(response, 'Price (incl. tax)')
        tax = product_info(response, 'Tax')
        availability = product_info(response, 'Availability')
        number_of_reviews = product_info(response, 'Number of reviews')

        url = response.request.url

        loader.add_value('title', title)
        loader.add_value('price', price)
        loader.add_value('image_urls', image_url)
        yield loader.load_item()
        yield {
            'title': title,
            'price': price,
            'rating': rating,
            'image_url': image_url,
            'description': description,
            'upc': upc,
            'product_type': product_type,
            'price_without_tax': price_without_tax,
            'price_with_tax': price_with_tax,
            'tax': tax,
            'availability': availability,
            'number_of_reviews': number_of_reviews,
            'url': url,
        }
示例#7
0
    def parse_book(self, response):
        """Scrape one book page.

        Args:
            response: The Scrapy response for a single book page.

        Yields:
            First the BooksCrawlerItem built by the item loader (only
            'image_urls' is loaded into it), then a plain dict with the
            remaining data points.
        """
        loader = ItemLoader(item=BooksCrawlerItem(), response=response)
        title = response.css('h1::text').extract_first()
        price = response.xpath(
            '//*[@class="price_color"]/text()').extract_first()

        image_urls = response.xpath('//img/@src').extract_first()
        if image_urls is not None:
            # Resolve the relative src against the page URL. The previous
            # string replacement used a base ending in '/', which produced
            # 'http://books.toscrape.com//media/...' (double slash).
            image_urls = response.urljoin(image_urls)

        loader.add_value('image_urls', image_urls)

        yield loader.load_item()

        rating = response.xpath(
            '//*[contains(@class, "star-rating")]/@class').extract_first()
        if rating is not None:
            # The class attribute looks like 'star-rating Three'; strip the
            # separator space that removing the prefix leaves behind.
            rating = rating.replace('star-rating', '').strip()

        description = response.xpath(
            '//*[@id="product_description"]/following-sibling::p/text()'
        ).extract_first()

        # Product information data points. product_info is defined outside
        # this block; presumably it looks up the table row with the given
        # label - verify.
        upc = product_info(response, 'UPC')
        product_type = product_info(response, 'Product Type')
        # These two labels were previously swapped, so each key carried the
        # other row's value.
        price_including_tax = product_info(response, 'Price (incl. tax)')
        price_excluding_tax = product_info(response, 'Price (excl. tax)')
        tax = product_info(response, 'Tax')
        availability = product_info(response, 'Availability')
        number_of_reviews = product_info(response, 'Number of reviews')

        yield {
            'title': title,
            'price': price,
            'rating': rating,
            'description': description,
            'upc': upc,
            'product_type': product_type,
            'price_including_tax': price_including_tax,
            'price_excluding_tax': price_excluding_tax,
            'tax': tax,
            'availability': availability,
            'number_of_reviews': number_of_reviews,
        }
示例#8
0
    def parse_book(self, response):
        """Build a BooksCrawlerItem from one book page.

        Args:
            response: The Scrapy response for a single book page.

        Returns:
            A BooksCrawlerItem with the page URL, title, price, image URL,
            rating and the product-information data points. The product
            description is not stored on the item, so it is not extracted.
        """
        page_url = response.url
        title = response.css('h1::text').extract_first()
        price = response.xpath(
            '//*[@class="price_color"]/text()').extract_first()

        image_url = response.xpath('//img/@src').extract_first()
        if image_url is not None:
            # Resolve the relative src against the page URL. The previous
            # string replacement used a base ending in '/', which produced
            # 'http://books.toscrape.com//media/...' (double slash).
            image_url = response.urljoin(image_url)

        rating = response.xpath(
            '//*[contains(@class, "star-rating")]/@class').extract_first()
        if rating is not None:
            # Guarded because extract_first() returns None on no match.
            rating = rating.replace('star-rating ', '')

        # Product information data points. product_info is defined outside
        # this block; presumably it looks up the table row with the given
        # label - verify.
        upc = product_info(response, 'UPC')
        product_type = product_info(response, 'Product Type')
        price_without_tax = product_info(response, 'Price (excl. tax)')
        price_with_tax = product_info(response, 'Price (incl. tax)')
        tax = product_info(response, 'Tax')
        availability = product_info(response, 'Availability')
        number_of_reviews = product_info(response, 'Number of reviews')

        item = BooksCrawlerItem()
        item['page_url'] = page_url
        item['title'] = title
        item['price'] = price
        item['image_url'] = image_url
        item['rating'] = rating
        item['upc'] = upc
        item['product_type'] = product_type
        item['price_without_tax'] = price_without_tax
        item['price_with_tax'] = price_with_tax
        item['tax'] = tax
        item['availability'] = availability
        item['number_of_reviews'] = number_of_reviews

        return item
示例#9
0
    def parse_book(self, response):
        """Extract a book page's data points and load them into an item.

        Args:
            response: The Scrapy response for a single book page.

        Returns:
            The BooksCrawlerItem produced by the item loader, holding the
            title, price, image URL, rating, description and the values
            read from the product information table.
        """
        loader = ItemLoader(item=BooksCrawlerItem(), response=response)

        title = response.xpath('//h1/text()').extract_first()
        price = response.xpath(
            '//*[@class="price_color"]/text()').extract_first()

        # extract_first() returns None when an expression matches nothing,
        # so each string clean-up below is guarded; the unguarded .replace()
        # calls used to raise AttributeError on such pages.
        image_urls = response.xpath('//img/@src').extract_first()
        if image_urls is not None:
            image_urls = image_urls.replace(
                '../..', 'http://books.toscrape.com')

        rating = response.xpath(
            '//*[contains(@class, "star-rating")]/@class').extract_first()
        if rating is not None:
            rating = rating.replace('star-rating ', '')

        description = response.xpath(
            '//*[@id="product_description"]/following-sibling::p/text()'
        ).extract_first()
        if description is not None:
            description = description.replace(' ...more', '')

        # Product information table. product_table is defined outside this
        # block; presumably it looks up the row with the given label -
        # verify.
        upc = product_table(response, 'UPC')
        price_excl = product_table(response, 'Price (excl. tax)')
        price_incl = product_table(response, 'Price (incl. tax)')
        tax = product_table(response, 'Tax')
        p_type = product_table(response, 'Product Type')
        stock = product_table(response, 'Availability')
        reviews = product_table(response, 'Number of reviews')

        loader.add_value('title', title)
        loader.add_value('price', price)
        loader.add_value('image_urls', image_urls)
        loader.add_value('rating', rating)
        loader.add_value('description', description)
        loader.add_value('upc', upc)
        loader.add_value('price_excl', price_excl)
        loader.add_value('price_incl', price_incl)
        loader.add_value('tax', tax)
        loader.add_value('p_type', p_type)
        loader.add_value('stock', stock)
        loader.add_value('reviews', reviews)

        return loader.load_item()