def parse(self, response):
    """Yield a single item whose image links come from the response payload.

    NOTE(review): the original assigned an undefined name ``urls`` to
    ``item['imageLink']``, which raises NameError on the first call.
    The links are now collected from the JSON ``data`` list — the same
    ``vertical_src`` field the sibling parse variants read.
    """
    item = DouyuspiderItem()
    item['nickName'] = 'shake'
    # Build the link list from the API payload instead of the
    # undefined `urls`; an empty/missing `data` yields an empty list.
    data = json.loads(response.text).get('data') or []
    item['imageLink'] = [each['vertical_src'] for each in data]
    yield item
def parse(self, response):
    """Parse one page of anchors, then schedule the next offset.

    Fix: the original scheduled the next page unconditionally, so an
    empty ``data`` list (end of the listing) still produced an endless
    chain of follow-up requests. We now stop when the API is exhausted,
    matching the sibling parser that checks for an empty list.
    """
    data = json.loads(response.text)['data']
    # End of listing: no entries -> do not request further pages.
    if not data:
        return
    for each in data:
        item = DouyuspiderItem()
        item['nickName'] = each['nickname']
        item['imageLink'] = each['vertical_src']
        yield item
    # Advance pagination and request the next page (callback is the
    # function object itself, not a call).
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Extract nickname/image URL for every anchor on this page and
    keep paginating until the API returns an empty list."""
    records = json.loads(response.body)['data']
    # An empty payload means the crawl is complete.
    if not records:
        return
    for record in records:
        entry = DouyuspiderItem()
        entry['nickname'] = record['nickname']
        entry['imageurl'] = record['vertical_src']
        yield entry
    # Bump the offset and schedule the next page through the scheduler.
    self.offset += 20
    yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse one page of the room list; stop when the list is empty.

    Fix: the original called ``data['nickname'].encode('utf-8')`` — a
    Python 2 habit.  Under Python 3 that stores ``bytes`` in the item,
    which feed exporters render as ``b'...'``; ``json.loads`` already
    returns proper ``str`` values, so no encoding step is needed.
    """
    data_list = json.loads(response.body)['data']
    if len(data_list['list']) == 0:
        return  # no more rooms: stop scheduling further pages
    for data in data_list['list']:
        item = DouyuspiderItem()
        item['nickname'] = data['nickname']
        item['roomSrc'] = data['roomSrc']
        item['hn'] = data['hn']
        yield item
    self.page += 1
    url = self.base_url.format(self.page)
    # Pass the callback itself — do not call it here.
    yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Parse one JSON page of anchors; yield items and request the
    next offset until the ``data`` field is empty.

    Fix: removed the dead trailing assignment ``pxd = 1`` (never read)
    and the commented-out duplicate request.
    """
    json_str = response.body.decode()
    my_dict = json.loads(json_str).get("data")
    if my_dict:
        for entry in my_dict:
            douyumm = DouyuspiderItem()
            douyumm["nickname"] = entry["nickname"]
            douyumm["vertical_src"] = entry["vertical_src"]
            yield douyumm
        self.offset += 20  # advance the pagination offset by one page
        # dont_filter=True keeps the follow-up request from being
        # dropped by the duplicate/offsite filter (allowed_domains).
        yield scrapy.Request(self.url + str(self.offset),
                             callback=self.parse, dont_filter=True)
def parse(self, response):
    """Dump the raw and re-serialized JSON to the spider's open file
    handles, yield one item per anchor, and schedule the next page.

    Fixes:
    - ``loads(..., encoding="utf-8")``: the ``encoding`` keyword was
      removed from ``json.loads`` in Python 3.9 (TypeError there).
    - The body was decoded/parsed several times; decode and parse once.
    - The ``len(str(i[key]))`` "crash protection" raised KeyError on a
      missing key *before* the check ran, so it guarded nothing; use
      ``dict.get(key, "")`` instead.
    - Removed the unused local ``body_bytes`` and commented-out code.
    """
    body_str = response.body.decode("utf-8")

    # Raw response text, one JSON document per write (self.f1 is an
    # already-open file handle owned by the spider).
    self.f1.write(body_str)
    self.f1.write(", \n")

    payload = loads(body_str)
    # Re-serialize with ensure_ascii=False so CJK text stays readable.
    self.f2.write(dumps(payload, ensure_ascii=False))
    self.f2.write(", \n")

    data_list = payload["data"]
    if not data_list:
        return  # end of listing: stop paginating

    for record in data_list:
        item = DouyuspiderItem()
        # .get(..., "") keeps the spider alive when a field is absent.
        item["vertical_src"] = record.get("vertical_src", "")
        item["nickname"] = record.get("nickname", "")
        item["anchor_city"] = record.get("anchor_city", "")
        yield item

    # Return the next-page Request to the scheduler.
    self.offset += 20
    yield scrapy.Request(self.baseURL.format(self.offset), callback=self.parse)