def parse(self, response):
    """List-page callback: for every gallery brick on the page, record its
    name and detail-page URL, schedule a request for every remaining picture
    page of the set, yield the item, then follow the next list page
    (up to page 359)."""
    textcontent = BeautifulSoup(response.text, 'lxml')
    textList = textcontent.find_all(
        'div', {'class': 'item masonry_brick masonry-brick'})
    for text in textList:
        item = DouyuItem()
        #item['imageUrl'] = text.find('img').get('src')
        item['imageName'] = text.find('img').get('alt')
        a = text.find('div', {'class': 'items_likes'}).get_text()
        # Extract the picture count (how many images this set contains).
        # NOTE(review): assumes the count is inside the third
        # whitespace-separated token of the likes text — confirm against
        # the live page markup.
        nums = re.findall(r'\d+', a.split()[2])[0]
        # Initial URL of the detail (big-picture) page.
        item['bigPageUrl'] = text.find('a', {
            'target': '_blank'
        }).get('href')
        # Fetch the first picture of the set separately.
        # Example detail URL: http://www.mmonly.cc/ktmh/dmmn/13544.html
        self.firstUrl(item['bigPageUrl'])
        # Strip the trailing ".html" to get the URL stem.
        # NOTE(review): the '.' is unescaped, so r'(.*?).html' matches any
        # character before "html" — works for these URLs but is fragile.
        b = re.findall(r'(.*?).html', item['bigPageUrl'])[0]
        for i in range(2, int(nums) + 1):
            # Build the URL of every big picture except the first one,
            # i.e. <stem>_2.html, <stem>_3.html, ...
            bigPage = b + '_' + str(i) + '.html'
            # Throttle before scheduling each detail-page request.
            time.sleep(0.5)
            #yield scrapy.Request(bigPage, callback=lambda response,startUrl=item['bigPageUrl']: self.xianImage(response,startUrl))
            yield scrapy.Request(bigPage, callback=self.xianImage)
        yield item
    # Follow the next list page until the hard page limit.
    if self.page <= 359:
        self.page += 1
        yield scrapy.Request(self.url + str(self.page) + '.html',
                             callback=self.parse)
def parse_details(self, response):
    """Extract room metadata (title, anchor name, level, section, category,
    tag) from a Douyu live-room detail page and yield it as one DouyuItem."""
    print(response.url)
    item = DouyuItem()
    item['title'] = response.xpath(
        '//h3[@class="Title-headlineH2"]/text()').extract_first()
    item['zhubo'] = response.xpath(
        '//a[@class="Title-anchorName"]/@title').extract_first()
    item['url'] = response.url
    # The anchor level is embedded in a CSS class name; pull the number out.
    item['dengji'] = response.xpath(
        '//div[@class="Title-AnchorLevel"]/div/@class').re(r'\d+\.?\d*')[0]
    # The first three links of the category list: section, category, tag.
    item['bankuai'] = response.xpath(
        '//div[@class="Title-categoryList clearFix"]/a[1]/text()'
    ).extract_first()
    item['fenlei'] = response.xpath(
        '//div[@class="Title-categoryList clearFix"]/a[2]/text()'
    ).extract_first()
    item['biaoqian'] = response.xpath(
        '//div[@class="Title-categoryList clearFix"]/a[3]/text()'
    ).extract_first()
    yield item
def parse(self, response):
    """Parse one page of the room-list JSON API, yield one DouyuItem per
    room, then request the next page (offset advances by 20).

    Fixes: the original used a bare ``except: pass`` (which hides every
    error, including programming mistakes) and shadowed the builtin
    ``list``. The except is now narrowed to the key/type errors that
    actually occur when 'list' or 'anchor_label' is absent.
    """
    data = json.loads(response.text)['data']
    try:
        room_list = data['list']  # TypeError if 'data' is a list, KeyError if absent
        for each in room_list:
            item = DouyuItem()
            item['nickname'] = each['nickname']    # streamer name
            item['room_name'] = each['room_name']  # room title
            item['hot'] = each['hn']               # popularity count
            item['room_id'] = each['room_id']
            # A room may carry several labels; the last one wins
            # (same behaviour as the original loop).
            for label in each['anchor_label']:
                item['tag'] = label['tag']
            yield item
    except (KeyError, TypeError):
        # Some responses lack 'list', and some rooms have no
        # 'anchor_label'; skip the remainder of this page, as before.
        pass
    self.page += 20
    yield scrapy.Request(self.url_first + str(self.page) + self.url_last,
                         callback=self.parse)
def parse(self, response):
    """Yield one DouyuItem (nickname + portrait URL) per entry in 'data'."""
    for entry in json.loads(response.body)['data']:
        item = DouyuItem()
        item['nickname'] = entry['nickname']
        item['imagelink'] = entry['vertical_src']
        yield item
def parse(self, response):
    """Scrape title/type/name/link for every live room listed on the page."""
    rooms = response.xpath('//ul[@id="live-new-show-content-box"]/li')
    for room in rooms:
        item = DouyuItem()
        item['douyu_title'] = room.xpath('a/@title').extract()
        item['douyu_type'] = room.xpath('a/div/div/span/text()').extract()
        item['douyu_name'] = room.xpath('a/div/p/span/text()').extract()
        item['douyu_link'] = room.xpath('a/@href').extract()
        yield item
def parse(self, response):
    """Yield anchor name + portrait URL per record, then page forward."""
    payload = json.loads(response.text)
    for anchor in payload['data']:
        item = DouyuItem()
        item['name'] = anchor['nickname']
        item['imglink'] = anchor['vertical_src']
        yield item
    # Advance the offset and request the next page of results.
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Extract cover image, nickname and fan count for each listed room."""
    cells = response.xpath(
        "//div[@class='DyListCover HeaderCell is-href']/a")
    for cell in cells:
        item = DouyuItem()
        # extract()[0] mirrors the original: raises IndexError when the
        # node is missing rather than yielding a partial item.
        item['image_link'] = cell.xpath("./div/div/img/@src").extract()[0]
        item['nick_name'] = cell.xpath("./div/div/h2/text()").extract()[0]
        item['fans_num'] = cell.xpath(
            "./div/div/span[@class ='DyListCover-hot']/text()").extract()[0]
        yield item
def parse(self, response):
    """Yield nickname/image items; keep paging (step 20) while offset < 200."""
    for record in json.loads(response.text)["data"]:
        item = DouyuItem()
        item['nickname'] = record['nickname']
        item['imgLink'] = record['vertical_src']
        yield item
    if self.offset < 200:
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield one DouyuItem (nickname + medium avatar) per entry in 'data'."""
    for record in json.loads(response.body)['data']:
        item = DouyuItem()
        item['nickname'] = record['nickname']
        item['avatar_mid'] = record['avatar_mid']
        yield item
def parse(self, response):
    """Walk the room list under data.rl and yield name/image/desc/uid items."""
    rooms = json.loads(response.text)['data']['rl']
    for room in rooms:
        entry = DouyuItem()
        entry['name'] = room['nn']
        entry['image_urls'] = room['rs1']
        entry['desc'] = room['rn']
        entry['uid'] = room['uid']
        yield entry
def parse(self, response):
    """Emit name/image items from the JSON payload, then queue the next page."""
    payload = json.loads(response.text)
    for anchor in payload["data"]:
        item = DouyuItem()
        item["name"] = anchor["nickname"]
        item["image_link"] = anchor["vertical_src"]
        yield item
    # Each page holds 20 records; bump the offset and loop back here.
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield nickname/image items and chain a request for the next offset."""
    for record in json.loads(response.text)['data']:
        item = DouyuItem()
        item["nickname"] = record["nickname"]
        item["imagelink"] = record["vertical_src"]
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield an item per record with its portrait URL wrapped in a list."""
    records = json.loads(response.text)['data']
    for record in records:
        item = DouyuItem()
        # The images pipeline requires image_urls to be an iterable.
        item['image_urls'] = [record['vertical_src']]
        yield item
def parse(self, response):
    """Yield portrait, nickname, city, room URL and room name per node."""
    decoded = json.loads(response.body.decode())
    for node in decoded["data"]:
        item = DouyuItem()
        item["vertical_src"] = node["vertical_src"]
        item["nickname"] = node["nickname"]
        item["anchor_city"] = node["anchor_city"]
        # Build an absolute room link from the room id.
        item["room_url"] = "https://www.douyu.com/" + node["room_id"]
        item["room_name"] = node["room_name"]
        yield item
def parse(self, response):
    """Yield nickname/image items from the decoded JSON, then page forward."""
    for info in json.loads(response.text)['data']:
        item = DouyuItem()
        item['nickname'] = info['nickname']
        item['imagelink'] = info['vertical_src']
        yield item
    self.offset += 20
    next_page = self.url + str(self.offset)
    yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Turn the JSON 'data' list into DouyuItems, then request the next page."""
    anchors = json.loads(response.text)["data"]
    for anchor in anchors:
        item = DouyuItem()
        item["nickname"] = anchor["nickname"]
        item["imagelink"] = anchor["vertical_src"]
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield nickname/image items; stop paging when 'data' comes back empty.

    Bug fix: the original had a bare ``yield`` (no value), which emitted
    ``None`` instead of the populated item, so no scraped data ever
    reached the pipeline.
    """
    data_list = json.loads(response.text)['data']
    if not data_list:
        # An empty page means we've walked past the last record.
        return
    for data in data_list:
        item = DouyuItem()
        item['nickname'] = data['nickname']
        item['imageLink'] = data['vertical_src']
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Log the batch size, yield name/image items, then fetch the next page."""
    records = json.loads(response.text)['data']
    self.log('len' + str(len(records)))
    for record in records:
        item = DouyuItem()
        item['name'] = record['nickname']
        item['image_url'] = record['vertical_src']
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield name/image items from the 'data' array, stepping offset by 100."""
    for entry in json.loads(response.text)["data"]:
        item = DouyuItem()
        item["name"] = entry["nickname"]
        item["imagesUrls"] = entry["vertical_src"]
        yield item
    self.offset += 100
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def xianImage(self, response):
    """Detail-page callback: pull the full-size image URL and caption out
    of the 'big-pic' container and hand them to self.saveImg for download."""
    # Locate the container that holds the full-size image.
    html = BeautifulSoup(response.text, 'lxml')
    c = html.find('div', {'class': 'big-pic'})
    item = DouyuItem()
    # The real full-size image address and its alt-text caption.
    item['imageUrl'] = c.find('img').get('src')
    item['imageName1'] = c.find('img').get('alt')
    # Last 6 characters of the URL — presumably "NN.jpg", used as the
    # file name; TODO confirm URLs always end with a 2-digit index.
    name = item['imageUrl'][-6:]
    #item['num'] = c.find('a').get('href').split('.')[0]
    self.saveImg(item['imageUrl'], item['imageName1'], name)
def parse(self, response):
    """Yield nickname/image/city items, then request the next 20 records."""
    for anchor in json.loads(response.text)['data']:
        item = DouyuItem()
        item['nickname'] = anchor['nickname']
        item['imagelink'] = anchor['vertical_src']
        item['anchorcity'] = anchor['anchor_city']
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield nickname/image items from data.list, then page the iOS API."""
    records = json.loads(response.text)["data"]["list"]
    for record in records:
        item = DouyuItem()
        item["nickname"] = record["nickname"]
        item["imagelink"] = record["vertical_src"]
        yield item
    self.offset += 20
    yield scrapy.Request(
        self.url + str(self.offset) + "/20/ios?client_sys=ios",
        callback=self.parse)
def parse(self, response):
    """Yield nickname/image items; keep paging while offset < 1000."""
    for anchor in json.loads(response.text)['data']:
        item = DouyuItem()
        item['nickname'] = anchor['nickname']
        item['imagelink'] = anchor['vertical_src']
        # Hand the populated item to the pipeline.
        yield item
    if self.offset < 1000:
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Work-in-progress callback over data.rl.

    NOTE(review): as written, the anchor item (zbItems) is populated but
    its yield is commented out, while the image item (zbImgItems) is
    yielded once per record with NO fields set (its assignments are
    commented out). This looks unfinished — confirm intent before use.
    """
    # print(response.text)
    # Anchor (streamer) item.
    zbItems = DouyuItem()
    data = json.loads(response.text)['data']['rl']
    # Image item.
    zbImgItems = DouyuItem()
    print(data)
    '''
    image_urls = scrapy.Field() # 图片url
    images = scrapy.Field() # 图片名
    c2name = scrapy.Field() # 板块
    nn = scrapy.Field() # 主播
    rn = scrapy.Field() # 标签
    '''
    for mz in data:
        # for mz in data:
        # Anchor information: section, nickname, tag.
        c2name = mz['c2name']
        nn = mz['nn']
        rn = mz['rn']
        zbItems['c2name'] = c2name
        zbItems['nn'] = nn
        zbItems['rn'] = rn
        # Image URL and name (currently unused — see NOTE above).
        image_urls = mz['rs1']
        images = mz['nn']
        # zbImgItems['image_urls'] = [image_urls]  # pipeline expects a list
        # zbImgItems['images'] = images
        # yield zbItems
        yield zbImgItems
def parse(self, response):
    """Yield name/image items from data.rl; stop on an empty page, else
    request the next page number (dont_filter allows re-fetching)."""
    payload = json.loads(response.body)['data']
    rooms = payload['rl']
    print(rooms)
    if not rooms:
        return
    for room in rooms:
        item = DouyuItem()
        item['name_img'] = room['nn']
        item['url_img'] = room['rs1']
        yield item
    self.num += 1
    next_url = self.base_url + str(self.num)
    yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Yield name/room_id/image items; stop when the API returns no data."""
    records = json.loads(response.text)["data"]
    if not records:
        return
    for record in records:
        item = DouyuItem()
        item["name"] = record["nickname"]
        item["room_id"] = record["room_id"]
        item["image_urls"] = record["vertical_src"]
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield room details from the mobile list API; stop after page 122."""
    rooms = json.loads(response.body.decode())['data']['list']
    for room in rooms:
        item = DouyuItem()
        item['room_rid'] = room['rid']
        item['room_name'] = room['roomName']
        item['room_src'] = room['roomSrc']
        item['nick_name'] = room['nickname']
        item['room_hn'] = room['hn']
        yield item
    # Page 122 is the hard limit; bail out instead of requesting further.
    if self.offset == 122:
        return
    self.offset += 1
    next_page = 'https://m.douyu.com/api/room/list?page=' + str(self.offset) + '&type=yz'
    yield scrapy.Request(url=next_page, callback=self.parse)
def parse(self, response):
    """Collect parallel room_id/room_name/room_src/nickname columns with
    jsonpath and zip them into one DouyuItem per room."""
    body = json.loads(response.body.decode('utf-8'))
    fields = ('room_id', 'room_name', 'room_src', 'nickname')
    columns = [jsonpath.jsonpath(body, '$..' + field) for field in fields]
    for values in zip(*columns):
        item = DouyuItem()
        for field, value in zip(fields, values):
            item[field] = value
        yield item
def parse(self, response):
    """Yield nickname/image items; an empty 'data' list ends the crawl."""
    records = json.loads(response.body)['data']
    if not records:
        return
    for record in records:
        item = DouyuItem()
        item['nickname'] = record['nickname']
        item['img_src'] = record['vertical_src']
        yield item
    self.offset += 20
    next_url = self.base_url + str(self.offset)
    yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Decode the JSON body, emit one item per anchor, then page forward."""
    anchors = json.loads(response.text)['data']
    for anchor in anchors:
        item = DouyuItem()
        item['nickname'] = anchor['nickname']
        item['imageLink'] = anchor['vertical_src']
        yield item
    self.offset += 20
    yield scrapy.Request(self.url + str(self.offset), callback=self.parse)