def parse_author(self, response, text, author, tags):
    """Parse an author detail page and yield one fully-populated QuoteItem.

    Args:
        response: the author "about" page response.
        text: quote text carried over from the listing page.
        author: author name carried over from the listing page.
        tags: list of tag strings carried over from the listing page.
    """
    author_details = response.css('.container .author-details')
    item = QuoteItem()
    item['text'] = text
    item['author'] = author
    item['tags'] = ','.join(tags)
    # Split e.g. "November 04, 1952" into date and year.  partition() never
    # raises when the comma is missing (split() would), and stripping removes
    # the leading space after the comma that the old code kept.
    born_dates = author_details.css(
        '.author-born-date::text').extract_first() or ''
    date_part, _, year_part = born_dates.partition(',')
    item['born_date'] = date_part.strip()
    item['born_year'] = year_part.strip()
    # The site renders the location as "in City, Country"; remove only the
    # leading "in " prefix (a blanket replace could corrupt the location text)
    # and split into city/country.  City is None when no comma is present.
    born_location = author_details.css(
        '.author-born-location::text').extract_first() or ''
    if born_location.startswith('in '):
        born_location = born_location[len('in '):]
    parts = [part.strip() for part in born_location.split(',')]
    item['born_country'] = parts[-1]
    item['born_city'] = parts[0] if len(parts) > 1 else None
    item['description'] = author_details.css(
        '.author-description::text').extract_first()
    yield item
def parse(self, response):
    """Scrape every quote on the listing page, then follow the pager.

    Each yielded QuoteItem is collected by Scrapy's item pipeline and can be
    exported with ``scrapy crawl quotes -o quotes.json`` (or .csv / .xml).
    """
    for quote_sel in response.css(".quote"):
        item = QuoteItem()
        # Items require subscript assignment; attribute access is unsupported.
        item["text"] = quote_sel.css(".text::text").extract_first()
        item["author"] = quote_sel.css(".author::text").extract_first()
        item["tags"] = quote_sel.css(".tags .tag::text").extract()
        # Yielding an Item (or Request) hands it to the framework for export.
        yield item
    # The pager href is relative (e.g. /page/2/); urljoin() makes it absolute,
    # and the recursive callback walks every subsequent page.
    next_href = response.css(".pager .next a::attr(href)").extract_first()
    yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)
def parse(self, response):
    """Yield a QuoteItem per quote block, then request the next page."""
    for selector in response.css('.quote'):
        item = QuoteItem()
        item['text'] = selector.css('.text::text').extract_first()
        item['author'] = selector.css('.author::text').extract_first()
        item['tags'] = selector.css('.tags .tag::text').extract()
        yield item
    # Resolve the relative pager link and recurse into the following page.
    pager_href = response.css('.pager .next a::attr(href)').extract_first()
    next_url = response.urljoin(pager_href)
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Extract all quotes from this page and schedule the next one."""
    quote_selectors = response.css(".quote")
    for sel in quote_selectors:
        item = QuoteItem()
        item["text"] = sel.css(".text::text").extract_first()
        item["author"] = sel.css(".author::text").extract_first()
        item["tags"] = sel.css(".tags .tag::text").extract()
        yield item
    # Follow the pagination link; urljoin() turns the relative href absolute.
    href = response.css(".pager .next a::attr(href)").extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)
def parse(self, response):
    """Scrape quotes from the page and follow the pager recursively.

    Fixes the tag selector: the original used ``'.tags .tags::text'``, which
    matches nothing on quotes.toscrape.com (individual tags carry the class
    ``tag``), so every item had an empty tag list.
    """
    for quote in response.css(".quote"):
        item = QuoteItem()
        # extract_first() for single values, extract() for the full list.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
    # Guard against the last page: extract_first() returns None there, and
    # response.urljoin(None) would raise a TypeError.
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        # urljoin() resolves the relative href against the current page URL;
        # the callback recurses into this same method for the next page.
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Emit a QuoteItem for each quote, then follow the next-page link.

    Stops cleanly on the last page: no Request is yielded when the pager
    has no "next" anchor.
    """
    for quote_block in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote_block.css('.text::text').extract_first()
        item['author'] = quote_block.css('.author::text').extract_first()
        item['tags'] = quote_block.css('.tags .tag::text').extract()
        yield item
    href = response.css("li.next > a::attr(href)").extract_first()
    if href is not None:
        # No explicit callback: Scrapy routes the response back to parse().
        yield scrapy.Request(response.urljoin(href))
def parse(self, response):
    """Scrape the quote listing page and recurse through the pager."""
    # Debug print left in place: shows the status code after the downloader
    # middleware has processed the response.
    print('添加中间件之后, 输出response.status',response.status)
    for quote in response.css('.quote'):
        item = QuoteItem()
        # '::text' selects the node's text; extract_first() takes one match.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
    relative = response.css('.pager .next a::attr(href)').extract_first()
    absolute = response.urljoin(relative)
    yield scrapy.Request(url=absolute, callback = self.parse)
def parse(self, response):
    """Collect every quote via CSS selectors, then paginate recursively."""
    for block in response.css('.quote'):
        # First match for single-valued fields, full list for tags.
        quote_text = block.css('.text::text').extract_first()
        quote_author = block.css('.author::text').extract_first()
        quote_tags = block.css('.tags .tag::text').extract()
        item = QuoteItem()
        item['text'] = quote_text
        item['author'] = quote_author
        item['tags'] = quote_tags
        yield item
    # Pagination: resolve the relative href and hand it back to parse().
    href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)
def parse(self, response):
    """Scrape quotes and follow the pager.

    Normalizes the tag selector to ``'.tags .tag::text'`` (the original
    ``'.tag.tag'`` double-class selector is redundant and inconsistent with
    the descendant selector used everywhere else in this project) and stops
    on the last page instead of calling urljoin(None).
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        # A quote has several tags, so extract() returns all of them.
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        # urljoin() completes the relative href into an absolute URL.
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Parse one listing page of quotes and queue the next page."""
    for entry in response.css('.quote'):
        item = QuoteItem()
        item['text'] = entry.css('.text::text').extract_first()
        item['author'] = entry.css('.author::text').extract_first()
        item['tags'] = entry.css('.tags .tag::text').extract()
        yield item
    # Locate the next-page href, join it to the base URL, and loop back into
    # this callback so the whole site is crawled page by page.
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
def parse(self, response):
    """Fill a QuoteItem (declared in items.py) per quote, then paginate."""
    for quote_node in response.css('.quote'):
        # CSS selectors pick out the text, author, and tag list.
        extracted_text = quote_node.css('.text::text').extract_first()
        extracted_author = quote_node.css('.author::text').extract_first()
        extracted_tags = quote_node.css('.tags .tag::text').extract()
        item = QuoteItem()
        item['text'] = extracted_text
        item['author'] = extracted_author
        item['tags'] = extracted_tags
        yield item
    # Pagination: the callback points back to parse() itself.
    href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)
def parse(self, response):
    """Extract every quote on the page; recurse through the pagination."""
    for q in response.css('.quote'):
        item = QuoteItem()
        # ::text yields node text; extract_first() takes the first match,
        # extract() returns all matches as a list.
        item['text'] = q.css('.text::text').extract_first()
        item['author'] = q.css('.author::text').extract_first()
        item['tags'] = q.css('.tags .tag::text').extract()
        yield item
    relative_href = response.css('.pager .next a::attr(href)').extract_first()
    # Recursive Request drives the loop over successive pages.
    yield scrapy.Request(url=response.urljoin(relative_href),
                         callback=self.parse)
def parse(self, response):
    """Iterate the class="quote" blocks, yield items, follow the pager."""
    for quote in response.css(".quote"):
        item = QuoteItem()
        # ::text is Scrapy-specific CSS syntax for a node's text content.
        item["text"] = quote.css(".text::text").extract_first()
        item["author"] = quote.css(".author::text").extract_first()
        # Tags are nested, hence the descendant selector; extract() gets all.
        item["tags"] = quote.css(".tags .tag::text").extract()
        yield item
    # ::attr(href) pulls the link target; urljoin() makes it absolute.
    pager_link = response.css(".pager .next a::attr(href)").extract_first()
    yield scrapy.Request(url=response.urljoin(pager_link),
                         callback=self.parse)
def parse(self, response):
    """Build one QuoteItem per quote block and keep crawling the pager."""
    for node in response.css('.quote'):
        item = QuoteItem()
        # Single values via extract_first(), multi-valued tags via extract().
        item['text'] = node.css('.text::text').extract_first()
        item['author'] = node.css('.author::text').extract_first()
        item['tags'] = node.css('.tags .tag::text').extract()
        yield item
    # Grab the next-page href, splice it onto the base URL, and re-enter
    # this callback for the following page.
    href = response.css('.pager .next a::attr(href)').extract_first()
    next_url = response.urljoin(href)
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Scrape the quotes on this page, then hand the next page to parse().

    The recursive callback implements the page-turning loop; ``callback``
    names the method that will process the new response.
    """
    for quote_sel in response.css('.quote'):
        item = QuoteItem()
        # '::text' (Scrapy CSS extension) selects the element's text.
        item['text'] = quote_sel.css('.text::text').extract_first()
        item['author'] = quote_sel.css('.author::text').extract_first()
        item['tags'] = quote_sel.css('.tags .tag::text').extract()
        yield item
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)
def parse(self, response):
    """Populate every declared item field from the page, then paginate.

    Replaces the original ``eval(field)`` trick — which executed field names
    as code and was both fragile and unsafe — with an explicit field→value
    mapping. Also fixes the missing separator in the debug log message and
    stops cleanly on the last page instead of urljoin()-ing None.
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        # Explicit mapping of item fields to extracted values; any field
        # declared on QuoteItem but absent here is logged, mirroring the
        # old NameError branch.
        values = {
            'text': quote.css('.text::text').extract_first(),
            'author': quote.css('.author::text').extract_first(),
            'tags': quote.css('.tags .tag::text').extract(),
        }
        for field in item.fields:
            if field in values:
                item[field] = values[field]
            else:
                self.logger.debug('Field is not defined: %s', field)
        yield item
    next_page = response.css('.next a::attr(href)').extract_first()
    if next_page is not None:
        yield scrapy.Request(url=response.urljoin(next_page),
                             callback=self.parse)
def parse(self, response):
    """Parse a single listing page and recurse through the pagination.

    Fixes the shadowed loop variable (the original wrote
    ``for quotes in quotes:``, rebinding the iterable's name to each
    element) and guards the last page, where the pager has no "next" link
    and ``response.urljoin(None)`` would raise.
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        # extract_first() when one element is expected; extract() returns
        # every match as a list (a quote carries several tags).
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        # Yield the populated item for Scrapy's export/pipeline machinery.
        yield item
    # Pagination loop: urljoin() builds the absolute URL and the callback
    # re-enters this same index-page handler.
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Yield the quotes on this page, then request the next page."""
    for quote_block in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote_block.css('.text::text').extract_first()
        item['author'] = quote_block.css('.author::text').extract_first()
        item['tags'] = quote_block.css('.tags .tag::text').extract()
        yield item
    # href comes back relative (e.g. /page/2/); urljoin() resolves it to an
    # absolute URL, and the response is routed back into parse() — a
    # recursive crawl over the whole pager.
    relative = response.css('.next a::attr("href")').extract_first()
    absolute = response.urljoin(relative)
    yield scrapy.Request(url=absolute, callback=self.parse)
def parse(self, response):
    """Yield a joined-tags item per quote, plus two follow-up requests.

    For each quote: (1) the item itself (tags comma-joined, so the pipeline
    can split output into two CSVs) and (2) a request for the author's
    about page, handled by parse_author. Finally the next listing page is
    requested.
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        # Tags are flattened to a single comma-separated string here.
        item['tags'] = ','.join(quote.css('.tags .tag::text').extract())
        yield item
        # The first anchor inside the quote block is the author link.
        about_href = quote.css('a::attr(href)').extract_first()
        yield scrapy.Request(url=response.urljoin(about_href),
                             callback=self.parse_author)
    # Continue with the next listing page.
    pager_href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(pager_href),
                         callback=self.parse)
def parse(self, response):
    """Extract all quotes from the page and follow the next-page link."""
    for sel in response.css('.quote'):
        # QuoteItem is declared in items.py; ::text narrows the selection to
        # text nodes, extract_first()/extract() behave like find()/findall().
        item = QuoteItem()
        item['text'] = sel.css('.text::text').extract_first()
        item['author'] = sel.css('.author::text').extract_first()
        item['tags'] = sel.css('.tags .tag::text').extract()
        yield item
    # Next-page extraction: urljoin() produces the site-absolute URL.
    href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)