def parse(self, response):
    """Handle one page of the mobile Douyu JSON API and queue the next page.

    The mobile endpoint returns JSON whose payload lives under the 'data'
    key; an empty payload means every page has been crawled.
    """
    rooms = json.loads(response.body)['data']
    if not rooms:
        return
    # item field -> source key in each room record
    field_map = {
        'image_link': 'vertical_src',
        'nick_name': 'nickname',
        'room_id': 'room_id',
        'city': 'anchor_city',
    }
    for room in rooms:
        item = DouyuItem()
        for item_key, src_key in field_map.items():
            item[item_key] = room[src_key]
        yield item
    # Slide the paging window forward and fetch the next 20 entries.
    self.offset += 20
    yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Convert the JSON response body into DouyuItem objects.

    Schedules the next page only when the current page was non-empty.
    """
    records = json.loads(response.body)['data']
    for record in records:
        item = DouyuItem()
        item['name'] = record['nickname']
        item['uid'] = record['owner_uid']
        item['image_link'] = record['vertical_src']
        item['city'] = record['anchor_city']
        yield item
    # A non-empty page suggests more data may follow; request the next slice.
    if records:
        self.offset += 100
        yield scrapy.Request(self.host + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield a DouyuItem (cover image + nickname) for each room entry."""
    payload = json.loads(response.body)
    for entry in payload['data']:
        item = DouyuItem()
        for key in ('vertical_src', 'nickname'):
            item[key] = entry[key]
        yield item
def parse(self, response): # print "###"*20 # print type(response) # print response.body # print "###" * 20 data_list = json.loads(response.body)['data'] if not len(data_list): return for data in data_list: item = DouyuItem() # 房间名字 item['room_name'] = data["room_name"] # 主播名字 item['nick_name'] = data["nickname"] # 主播房间封面 item['image_src'] = data['vertical_src'] # 所在城市 item['city'] = data['anchor_city'] self.unm += 1 print "写入第%s个" % self.unm yield item self.offset +=100 yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Walk the room listing in two directions, steered by self.UpOver.

    While self.UpOver is falsy the spider pages via self.offsetUp (incrementing);
    while truthy it pages via self.offsetDo (decrementing).
    NOTE(review): when 'data' is empty and UpOver is True, the method does NOT
    return — it clears the flag and still falls through to schedule the next
    request below; confirm this hand-over between directions is intended.
    """
    data_list = json.loads(response.body)["data"]
    # Debug trace of the page size.
    print("len(data_list)" + str(len(data_list)))
    if len(data_list) == 0 and self.UpOver == False:
        # Empty page in the second direction: the crawl is finished.
        return
    elif len(data_list) == 0 and self.UpOver == True:
        # Empty page in the first direction: flip to the other direction.
        self.UpOver = False
    for data in data_list:
        item = DouyuItem()
        item["nickname"] = data["nickname"]
        item["room_id"] = data["room_id"]
        item["room_src"] = data["room_src"]
        item["vertical_src"] = data["vertical_src"]
        item["city"] = data["anchor_city"]
        print(item)
        yield item
    # print("*"*40)
    if not self.UpOver:
        # Paging with the incrementing counter.
        self.offsetUp += 1
        yield scrapy.Request(self.base_urls + str(self.offsetUp), callback=self.parse)
    else:
        # Paging with the decrementing counter.
        self.offsetDo -= 1
        yield scrapy.Request(self.base_urls + str(self.offsetDo), callback=self.parse)
def parse(self, response):
    """Yield nickname plus a single-element image-link list per room entry."""
    entries = json.loads(response.body)['data']
    # Nothing left to parse once the API hands back an empty list.
    if not entries:
        return
    for entry in entries:
        item = DouyuItem()
        item['nickname'] = entry['nickname']
        # Wrapped in a list (e.g. an ImagesPipeline-style URL field).
        item['imagelink'] = [entry['vertical_src']]
        yield item
def parse(self, response):
    """Deserialize the JSON body and emit one DouyuItem per room."""
    records = json.loads(response.body)['data']
    # Empty page -> done.
    if not records:
        return
    for record in records:
        item = DouyuItem()
        item['nickname'] = record['nickname']
        item['imagelink'] = record['vertical_src']
        yield item
def parse(self, response):
    """Emit items for every entry on the page, then request the next page.

    NOTE(review): the parsed body is iterated directly, so this assumes the
    top-level JSON document is itself a list of room dicts — confirm, since
    sibling spiders index into a 'data' key first.
    """
    entries = json.loads(response.body)
    if not entries:
        return
    for entry in entries:
        item = DouyuItem()
        item["nickName"] = entry["nickname"]
        item["imageLink"] = entry["vertical_src"]
        yield item
    # Page forward in steps of 20.
    self.offset += 20
    yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse a page of the Douyu API and schedule the next one.

    The response text has two parts: an 'error' code and the 'data' payload;
    only 'data' is consumed here. Yields one DouyuItem per entry.
    """
    data = json.loads(response.text)['data']
    for each in data:
        item = DouyuItem()
        item['nickname'] = each['nickname']
        item['imagelink'] = each['vertical_src']
        yield item
    # BUG FIX: the original read 'self.offest' (typo) when building the next
    # URL, so the freshly incremented 'self.offset' was never used and the
    # pagination request failed with AttributeError.
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Map the fields of each room under data.list into a DouyuItem."""
    rooms = json.loads(response.body.decode())['data']['list']
    # Item field names match the source keys one-to-one.
    keys = ('rid', 'roomName', 'roomSrc', 'nickname', 'hn')
    for room in rooms:
        item = DouyuItem()
        for key in keys:
            item[key] = room[key]
        yield item
def parse(self, response):
    """Yield nickname/image items and keep paging until 'data' comes back empty."""
    records = json.loads(response.body)['data']
    if not records:
        return
    for record in records:
        item = DouyuItem()
        item['nickname'] = record['nickname']
        item['imagelink'] = record['vertical_src']
        yield item
    # Queue the next 20-entry page.
    self.offset += 20
    yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse one page of Douyu room data and schedule the next page.

    The payload lives under the 'data' key; an empty list ends the crawl.
    Yields DouyuItem objects, then a Request for the next page.
    """
    data_list = json.loads(response.body)['data']
    if len(data_list) == 0:
        return
    for data in data_list:
        item = DouyuItem()
        item['nickname'] = data["nickname"]
        item['vertical_src'] = data["vertical_src"]
        yield item
    # BUG FIX: the original built the pagination Request but never yielded it,
    # so the scheduler never received it and the crawl stopped after page 1.
    self.offset += 20
    yield scrapy.Request(self.baseUrl + str(self.offset), callback=self.parse)
def parse(self, response):
    """Convert the JSON payload into items, then follow to the next page."""
    for entry in json.loads(response.text)['data']:
        item = DouyuItem()
        item['nickname'] = entry['nickname']
        item['imagelink'] = entry['vertical_src']
        yield item
    # Advance by one page of 20 and re-enter this callback.
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Emit items for the current page; stop paging on a short page."""
    records = json.loads(response.text)['data']
    for record in records:
        item = DouyuItem()
        item['nickname'] = record['nickname']
        item['image_link'] = record['vertical_src']
        yield item
    # A full page of 20 entries means more may follow; anything shorter
    # is the last page, so no further request is issued.
    if len(records) == 20:
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield nickname/image items from data.list, then request the next page."""
    rooms = json.loads(response.body)["data"]["list"]
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        # Empty strings are normalized to None for both fields.
        item["nickname"] = room["nickname"] or None
        item["imagelink"] = room["verticalSrc"] or None
        yield item
    # Pages are numbered (not offset-based): bump by one and format into the URL.
    self.offset += 1
    yield scrapy.Request(self.baseURL.format(str(self.offset)), callback=self.parse)
def parse(self, response):
    """Parse one JSON listing page and schedule the following one."""
    rooms = json.loads(response.body.decode())['data']
    # Exhausted listing -> empty 'data' -> stop recursing.
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        item['image_link'], item['nick_name'] = room['vertical_src'], room['nickname']
        item['room_id'], item['city'] = room['room_id'], room['anchor_city']
        yield item
    self.offset += 20
    yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Pull the 'data' array from the JSON text and emit name/image items."""
    for entry in json.loads(response.text)["data"]:
        item = DouyuItem()
        item["name"] = entry["nickname"]
        item["imagesUrls"] = entry["vertical_src"]
        yield item
    # Continue with the next 20-entry slice.
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield nickname/image items; recurse onto the next page while data remains."""
    payload = json.loads(response.body)
    rooms = payload['data']
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        item['nickname'] = room['nickname']
        item['image_url'] = room['vertical_src']
        yield item
    self.offset += 20
    yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def channel_parse(self, response):
    """Callback for a channel listing: build one item per room, then follow it.

    The channel name travels in via response.meta; each built DouyuItem is
    forwarded to room_parse through the request meta for further filling.
    """
    listing = json.loads(response.text)
    channel = response.meta['channel']
    for room in listing['data']['rl']:
        items = DouyuItem()
        items['channel'] = channel
        # Viewer count cast to int so downstream sorting works numerically.
        items['watch_num'] = int(room['ol'])
        items['anchor_roomname'] = room['rn']
        items['anchor_url'] = 'http://www.douyu.com' + room['url']
        items['anchor_name'] = room['nn']
        # NOTE(review): if the room page redirects repeatedly, consider
        # adding dont_filter=True to this request.
        yield Request(url=items['anchor_url'], meta={'items': items}, callback=self.room_parse)
def parse(self, response):
    """Emit image/name/city items and walk the listing 20 entries at a time."""
    anchors = json.loads(response.text)['data']
    if not anchors:
        return
    for anchor in anchors:
        item = DouyuItem()
        item['imagLink'] = anchor['vertical_src']
        item['nickName'] = anchor['nickname']
        item['nickCity'] = anchor['anchor_city']
        yield item
    self.offset += 20
    yield scrapy.Request(self.baseUrl + str(self.offset), callback=self.parse)
def parse(self, response):
    """Decode the JSON body and yield one item per room, then queue the next page.

    (json.loads parses a string; json.load reads from a file object.)
    """
    records = json.loads(response.body)['data']
    if not records:
        return
    for record in records:
        item = DouyuItem()
        item['nickname'] = record["nickname"]
        item['imagelink'] = record["vertical_src"]
        yield item
    self.offset += 20
    yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
def parse(self, response):
    """Build room items (link, cover, name, city) and page forward by 100."""
    rooms = json.loads(response.body)["data"]
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        item["room_link"] = "http://www.douyu.com/" + room["room_id"]
        item["image_link"] = room["vertical_src"]
        item["nick_name"] = room["nickname"]
        item["anchor_city"] = room["anchor_city"]
        yield item
    # Hand the next-page request to the scheduler.
    # NOTE(review): 'base_url' is a bare name, unlike the 'self.*' attributes
    # used above — confirm a module-level base_url exists, otherwise this
    # raises NameError on the first pagination attempt.
    self.offset += 100
    yield scrapy.Request(base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse a listing page; an empty 'data' array ends the crawl."""
    records = json.loads(response.body)['data']
    if not records:
        return
    for record in records:
        item = DouyuItem()
        item['nickname'] = record["nickname"]
        item['imagelink'] = record["vertical_src"]
        yield item
    # Request the next 20-entry page.
    self.offset += 20
    yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
def parse(self, response):
    """First-stage callback: emit room items and advance the listing by 60."""
    print('我是第一步')
    rooms = json.loads(response.body.decode())['data']
    if not rooms:
        print('没有数据呀!')
        return
    for room in rooms:
        item = DouyuItem()
        # Item field names mirror the source keys exactly.
        for key in ('room_id', 'vertical_src', 'nickname', 'anchor_city'):
            item[key] = room[key]
        yield item
    self.offset += 60
    yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def parse(self, response): data_list = json.loads(response.body)['data'] if len(data_list) == 0: print '=' * 30 return for data in data_list: item = DouyuItem() item['nickname'] = data['nickname'] item['imagelink'] = data['vertical_src'] yield item # 发送API的请求 self.offset += 20 next_url = self.baseURL + str(self.offset) print "--------------------", next_url yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Handle the JSON listing response: one item per room, then next page."""
    rooms = json.loads(response.body)['data']
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        item['nickname'] = room['nickname']
        item['imagelink'] = room['vertical_src']
        yield item
    self.offset += 20
    yield scrapy.Request(self.baseurl + str(self.offset), callback=self.parse)
def parse(self, response):
    """Emit room items from the page and, while data remains, fetch the next page."""
    rooms = json.loads(response.body)['data']
    if rooms:
        for room in rooms:
            item = DouyuItem()
            item['room_id'] = room['room_id']
            item['image_src'] = room['vertical_src']
            item['nick_name'] = room['nickname']
            item['city'] = room['anchor_city']
            # (An image-download request per item was left commented out
            # in the original.)
            yield item
        # Next page — only while the current page was non-empty.
        self.offset += 20
        yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield nickname/image items and paginate until 'data' is empty.

    Any error while building items or the next request is caught and printed
    (preserving the original best-effort behaviour).
    """
    # FIX: response.body_as_unicode() is deprecated in modern Scrapy;
    # response.text is the documented equivalent and matches the sibling
    # spiders in this project.
    data_list = json.loads(response.text)['data']
    if not len(data_list):
        return
    try:
        for data in data_list:
            item = DouyuItem()
            item['nickname'] = data['nickname']
            item['imagelink'] = data['vertical_src']
            yield item
        self.offset += 20
        yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
    except Exception as e:
        print(e)
def parse(self, response):
    """Build full room items; an empty 'data' payload marks the end of the crawl."""
    rooms = json.loads(response.body)['data']
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        item["room_link"] = "http://www.douyu.com/" + room['room_id']
        item["image_link"] = room['vertical_src']
        item["nick_name"] = room['nickname']
        item["city"] = room['anchor_city']
        yield item
    # Schedule the following 20-entry page.
    self.offset += 20
    yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse the JSON listing and paginate until the API returns no data.

    Yields one DouyuItem (img/name/city) per entry, then a Request for the
    next 20-entry page.
    """
    data_list = json.loads(response.text)["data"]
    # FIX: the original stopped at a hard-coded debug limit
    # ('if self.offset == 60') and named the local 'list', shadowing the
    # builtin; the commented-out intent was to stop once 'data' is empty.
    if len(data_list) == 0:
        return
    for info in data_list:
        item = DouyuItem()
        item['img'] = info['vertical_src']
        item['name'] = info['nickname']
        item['city'] = info['anchor_city']
        yield item
    # Next page.
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)