def parse_author(self, response, text, author, tags):
    """Parse an author detail page and yield one fully-populated QuoteItem.

    Args:
        response: the author "about" page response.
        text: quote text carried over from the listing page.
        author: author name carried over from the listing page.
        tags: list of tag strings carried over from the listing page.
    """
    author_details = response.css('.container .author-details')
    item = QuoteItem()
    item['text'] = text
    item['author'] = author
    item['tags'] = ','.join(tags)
    # Split e.g. "November 04, 1952" into date and year.  partition() never
    # raises when the comma is missing (split() would), and stripping removes
    # the leading space after the comma that the old code kept.
    born_dates = author_details.css(
        '.author-born-date::text').extract_first() or ''
    date_part, _, year_part = born_dates.partition(',')
    item['born_date'] = date_part.strip()
    item['born_year'] = year_part.strip()
    # The site renders the location as "in City, Country"; remove only the
    # leading "in " prefix (a blanket replace could corrupt the location text)
    # and split into city/country.  City is None when no comma is present.
    born_location = author_details.css(
        '.author-born-location::text').extract_first() or ''
    if born_location.startswith('in '):
        born_location = born_location[len('in '):]
    parts = [part.strip() for part in born_location.split(',')]
    item['born_country'] = parts[-1]
    item['born_city'] = parts[0] if len(parts) > 1 else None
    item['description'] = author_details.css(
        '.author-description::text').extract_first()
    yield item
def parse(self, response):
    """Scrape every quote on the listing page, then follow the pager.

    Each yielded QuoteItem is collected by Scrapy's item pipeline and can be
    exported with ``scrapy crawl quotes -o quotes.json`` (or .csv / .xml).
    """
    for quote_sel in response.css(".quote"):
        item = QuoteItem()
        # Items require subscript assignment; attribute access is unsupported.
        item["text"] = quote_sel.css(".text::text").extract_first()
        item["author"] = quote_sel.css(".author::text").extract_first()
        item["tags"] = quote_sel.css(".tags .tag::text").extract()
        # Yielding an Item (or Request) hands it to the framework for export.
        yield item
    # The pager href is relative (e.g. /page/2/); urljoin() makes it absolute,
    # and the recursive callback walks every subsequent page.
    next_href = response.css(".pager .next a::attr(href)").extract_first()
    yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)
def parse(self, response):
    """Yield a QuoteItem per quote block, then request the next page."""
    for selector in response.css('.quote'):
        item = QuoteItem()
        item['text'] = selector.css('.text::text').extract_first()
        item['author'] = selector.css('.author::text').extract_first()
        item['tags'] = selector.css('.tags .tag::text').extract()
        yield item
    # Resolve the relative pager link and recurse into the following page.
    pager_href = response.css('.pager .next a::attr(href)').extract_first()
    next_url = response.urljoin(pager_href)
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Extract all quotes from this page and schedule the next one."""
    quote_selectors = response.css(".quote")
    for sel in quote_selectors:
        item = QuoteItem()
        item["text"] = sel.css(".text::text").extract_first()
        item["author"] = sel.css(".author::text").extract_first()
        item["tags"] = sel.css(".tags .tag::text").extract()
        yield item
    # Follow the pagination link; urljoin() turns the relative href absolute.
    href = response.css(".pager .next a::attr(href)").extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)
def parse(self, response):
    """Scrape quotes from the page and follow the pager recursively.

    Fixes the tag selector: the original used ``'.tags .tags::text'``, which
    matches nothing on quotes.toscrape.com (individual tags carry the class
    ``tag``), so every item had an empty tag list.
    """
    for quote in response.css(".quote"):
        item = QuoteItem()
        # extract_first() for single values, extract() for the full list.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
    # Guard against the last page: extract_first() returns None there, and
    # response.urljoin(None) would raise a TypeError.
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        # urljoin() resolves the relative href against the current page URL;
        # the callback recurses into this same method for the next page.
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Emit a QuoteItem for each quote, then follow the next-page link.

    Stops cleanly on the last page: no Request is yielded when the pager
    has no "next" anchor.
    """
    for quote_block in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote_block.css('.text::text').extract_first()
        item['author'] = quote_block.css('.author::text').extract_first()
        item['tags'] = quote_block.css('.tags .tag::text').extract()
        yield item
    href = response.css("li.next > a::attr(href)").extract_first()
    if href is not None:
        # No explicit callback: Scrapy routes the response back to parse().
        yield scrapy.Request(response.urljoin(href))
def parse(self, response):
    """Scrape the quote listing page and recurse through the pager."""
    # Debug print left in place: shows the status code after the downloader
    # middleware has processed the response.
    print('添加中间件之后, 输出response.status',response.status)
    for quote in response.css('.quote'):
        item = QuoteItem()
        # '::text' selects the node's text; extract_first() takes one match.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
    relative = response.css('.pager .next a::attr(href)').extract_first()
    absolute = response.urljoin(relative)
    yield scrapy.Request(url=absolute, callback = self.parse)
def parse(self, response):
    """Collect every quote via CSS selectors, then paginate recursively."""
    for block in response.css('.quote'):
        # First match for single-valued fields, full list for tags.
        quote_text = block.css('.text::text').extract_first()
        quote_author = block.css('.author::text').extract_first()
        quote_tags = block.css('.tags .tag::text').extract()
        item = QuoteItem()
        item['text'] = quote_text
        item['author'] = quote_author
        item['tags'] = quote_tags
        yield item
    # Pagination: resolve the relative href and hand it back to parse().
    href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)
def parse(self, response):
    """Scrape quotes and follow the pager.

    Normalizes the tag selector to ``'.tags .tag::text'`` (the original
    ``'.tag.tag'`` double-class selector is redundant and inconsistent with
    the descendant selector used everywhere else in this project) and stops
    on the last page instead of calling urljoin(None).
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        # A quote has several tags, so extract() returns all of them.
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        # urljoin() completes the relative href into an absolute URL.
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Parse one listing page of quotes and queue the next page."""
    for entry in response.css('.quote'):
        item = QuoteItem()
        item['text'] = entry.css('.text::text').extract_first()
        item['author'] = entry.css('.author::text').extract_first()
        item['tags'] = entry.css('.tags .tag::text').extract()
        yield item
    # Locate the next-page href, join it to the base URL, and loop back into
    # this callback so the whole site is crawled page by page.
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
def parse(self, response):
    """Fill a QuoteItem (declared in items.py) per quote, then paginate."""
    for quote_node in response.css('.quote'):
        # CSS selectors pick out the text, author, and tag list.
        extracted_text = quote_node.css('.text::text').extract_first()
        extracted_author = quote_node.css('.author::text').extract_first()
        extracted_tags = quote_node.css('.tags .tag::text').extract()
        item = QuoteItem()
        item['text'] = extracted_text
        item['author'] = extracted_author
        item['tags'] = extracted_tags
        yield item
    # Pagination: the callback points back to parse() itself.
    href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)
def parse(self, response):
    """Extract every quote on the page; recurse through the pagination."""
    for q in response.css('.quote'):
        item = QuoteItem()
        # ::text yields node text; extract_first() takes the first match,
        # extract() returns all matches as a list.
        item['text'] = q.css('.text::text').extract_first()
        item['author'] = q.css('.author::text').extract_first()
        item['tags'] = q.css('.tags .tag::text').extract()
        yield item
    relative_href = response.css('.pager .next a::attr(href)').extract_first()
    # Recursive Request drives the loop over successive pages.
    yield scrapy.Request(url=response.urljoin(relative_href),
                         callback=self.parse)
def parse(self, response):
    """Iterate the class="quote" blocks, yield items, follow the pager."""
    for quote in response.css(".quote"):
        item = QuoteItem()
        # ::text is Scrapy-specific CSS syntax for a node's text content.
        item["text"] = quote.css(".text::text").extract_first()
        item["author"] = quote.css(".author::text").extract_first()
        # Tags are nested, hence the descendant selector; extract() gets all.
        item["tags"] = quote.css(".tags .tag::text").extract()
        yield item
    # ::attr(href) pulls the link target; urljoin() makes it absolute.
    pager_link = response.css(".pager .next a::attr(href)").extract_first()
    yield scrapy.Request(url=response.urljoin(pager_link),
                         callback=self.parse)
def parse(self, response):
    """Build one QuoteItem per quote block and keep crawling the pager."""
    for node in response.css('.quote'):
        item = QuoteItem()
        # Single values via extract_first(), multi-valued tags via extract().
        item['text'] = node.css('.text::text').extract_first()
        item['author'] = node.css('.author::text').extract_first()
        item['tags'] = node.css('.tags .tag::text').extract()
        yield item
    # Grab the next-page href, splice it onto the base URL, and re-enter
    # this callback for the following page.
    href = response.css('.pager .next a::attr(href)').extract_first()
    next_url = response.urljoin(href)
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Scrape the quotes on this page, then hand the next page to parse().

    The recursive callback implements the page-turning loop; ``callback``
    names the method that will process the new response.
    """
    for quote_sel in response.css('.quote'):
        item = QuoteItem()
        # '::text' (Scrapy CSS extension) selects the element's text.
        item['text'] = quote_sel.css('.text::text').extract_first()
        item['author'] = quote_sel.css('.author::text').extract_first()
        item['tags'] = quote_sel.css('.tags .tag::text').extract()
        yield item
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)
def parse(self, response):
    """Populate every declared item field from the page, then paginate.

    Replaces the original ``eval(field)`` trick — which executed field names
    as code and was both fragile and unsafe — with an explicit field→value
    mapping. Also fixes the missing separator in the debug log message and
    stops cleanly on the last page instead of urljoin()-ing None.
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        # Explicit mapping of item fields to extracted values; any field
        # declared on QuoteItem but absent here is logged, mirroring the
        # old NameError branch.
        values = {
            'text': quote.css('.text::text').extract_first(),
            'author': quote.css('.author::text').extract_first(),
            'tags': quote.css('.tags .tag::text').extract(),
        }
        for field in item.fields:
            if field in values:
                item[field] = values[field]
            else:
                self.logger.debug('Field is not defined: %s', field)
        yield item
    next_page = response.css('.next a::attr(href)').extract_first()
    if next_page is not None:
        yield scrapy.Request(url=response.urljoin(next_page),
                             callback=self.parse)
def parse(self, response):
    """Parse a single listing page and recurse through the pagination.

    Fixes the shadowed loop variable (the original wrote
    ``for quotes in quotes:``, rebinding the iterable's name to each
    element) and guards the last page, where the pager has no "next" link
    and ``response.urljoin(None)`` would raise.
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        # extract_first() when one element is expected; extract() returns
        # every match as a list (a quote carries several tags).
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        # Yield the populated item for Scrapy's export/pipeline machinery.
        yield item
    # Pagination loop: urljoin() builds the absolute URL and the callback
    # re-enters this same index-page handler.
    next_href = response.css('.pager .next a::attr(href)').extract_first()
    if next_href is not None:
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Yield the quotes on this page, then request the next page."""
    for quote_block in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote_block.css('.text::text').extract_first()
        item['author'] = quote_block.css('.author::text').extract_first()
        item['tags'] = quote_block.css('.tags .tag::text').extract()
        yield item
    # href comes back relative (e.g. /page/2/); urljoin() resolves it to an
    # absolute URL, and the response is routed back into parse() — a
    # recursive crawl over the whole pager.
    relative = response.css('.next a::attr("href")').extract_first()
    absolute = response.urljoin(relative)
    yield scrapy.Request(url=absolute, callback=self.parse)
def parse(self, response):
    """Yield a joined-tags item per quote, plus two follow-up requests.

    For each quote: (1) the item itself (tags comma-joined, so the pipeline
    can split output into two CSVs) and (2) a request for the author's
    about page, handled by parse_author. Finally the next listing page is
    requested.
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        # Tags are flattened to a single comma-separated string here.
        item['tags'] = ','.join(quote.css('.tags .tag::text').extract())
        yield item
        # The first anchor inside the quote block is the author link.
        about_href = quote.css('a::attr(href)').extract_first()
        yield scrapy.Request(url=response.urljoin(about_href),
                             callback=self.parse_author)
    # Continue with the next listing page.
    pager_href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(pager_href),
                         callback=self.parse)
def parse(self, response):
    """Extract all quotes from the page and follow the next-page link."""
    for sel in response.css('.quote'):
        # QuoteItem is declared in items.py; ::text narrows the selection to
        # text nodes, extract_first()/extract() behave like find()/findall().
        item = QuoteItem()
        item['text'] = sel.css('.text::text').extract_first()
        item['author'] = sel.css('.author::text').extract_first()
        item['tags'] = sel.css('.tags .tag::text').extract()
        yield item
    # Next-page extraction: urljoin() produces the site-absolute URL.
    href = response.css('.pager .next a::attr(href)').extract_first()
    yield scrapy.Request(url=response.urljoin(href), callback=self.parse)