예제 #1
0
    def parse(self, response):
        """Parse a gallery listing page: yield image items and follow pages.

        Fixes: the regex used to strip the ``.html`` suffix did not escape
        the dot (``r'(.*?).html'``), which can mis-split a URL containing
        'html' preceded by any character earlier in the path; and once the
        last page (359) was passed the spider kept re-requesting the same
        URL forever (relying on the dupe filter) — it now stops explicitly.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        bricks = soup.find_all(
            'div', {'class': 'item masonry_brick masonry-brick'})
        for brick in bricks:
            item = DouyuItem()
            item['imageName'] = brick.find('img').get('alt')
            # The "likes" line embeds the number of pictures in the set.
            likes_text = brick.find('div', {'class': 'items_likes'}).get_text()
            nums = re.findall(r'\d+', likes_text.split()[2])[0]
            # Entry URL of the detail gallery,
            # e.g. http://www.mmonly.cc/ktmh/dmmn/13544.html
            item['bigPageUrl'] = brick.find('a', {
                'target': '_blank'
            }).get('href')
            # Fetch the first picture of the set separately.
            self.firstUrl(item['bigPageUrl'])
            # Base URL without the '.html' suffix (dot escaped — fix).
            base = re.findall(r'(.*?)\.html', item['bigPageUrl'])[0]

            for idx in range(2, int(nums) + 1):
                # Detail pages for pictures 2..N follow the _<n>.html scheme.
                big_page = base + '_' + str(idx) + '.html'
                # NOTE(review): time.sleep blocks Scrapy's event loop;
                # prefer the DOWNLOAD_DELAY setting. Kept for behaviour.
                time.sleep(0.5)
                yield scrapy.Request(big_page, callback=self.xianImage)
            yield item

        if self.page > 359:
            return  # last listing page processed — stop paginating
        self.page += 1
        yield scrapy.Request(self.url + str(self.page) + '.html',
                             callback=self.parse)
예제 #2
0
    def parse_details(self, response):
        """Parse a single live-room detail page into a DouyuItem."""
        print(response.url)

        item = DouyuItem()
        item['title'] = response.xpath(
            '//h3[@class="Title-headlineH2"]/text()').extract_first()
        item['zhubo'] = response.xpath(
            '//a[@class="Title-anchorName"]/@title').extract_first()
        item['url'] = response.url
        # The anchor level is embedded in a CSS class name; pull the number out.
        item['dengji'] = response.xpath(
            '//div[@class="Title-AnchorLevel"]/div/@class').re(r'\d+\.?\d*')[0]
        # The three category links in order: section, category, tag.
        category_xpath = '//div[@class="Title-categoryList clearFix"]/a[%d]/text()'
        item['bankuai'] = response.xpath(category_xpath % 1).extract_first()
        item['fenlei'] = response.xpath(category_xpath % 2).extract_first()
        item['biaoqian'] = response.xpath(category_xpath % 3).extract_first()

        # Popularity ("renqi") extraction was left disabled in the original.
        yield item
예제 #3
0
파일: dy.py 프로젝트: cheney-github/spider
    def parse(self, response):
        """Yield one item per anchor in data.list, then request the next page.

        Fixes: the bare ``except: pass`` silenced every exception
        (including real bugs); it now catches only the KeyError/TypeError
        cases the original comment describes (some responses lack 'list'
        or anchor labels). The builtin name ``list`` is no longer shadowed.
        """
        data = json.loads(response.text)['data']
        try:
            anchors = data['list']
            for each in anchors:
                item = DouyuItem()
                # Anchor name
                item['nickname'] = each['nickname']
                # Room title
                item['room_name'] = each['room_name']
                # Popularity
                item['hot'] = each['hn']
                # Room id
                item['room_id'] = each['room_id']
                # Anchor labels — NOTE: only the last tag is kept, matching
                # the original behaviour.
                for i in each['anchor_label']:
                    item['tag'] = i['tag']
                yield item
        except (KeyError, TypeError):
            # Some pages/anchors lack these fields; skip this batch.
            pass

        self.page += 20
        yield scrapy.Request(self.url_first + str(self.page) + self.url_last,
                             callback=self.parse)
예제 #4
0
파일: Douyu.py 프로젝트: kak0na/pachong
 def parse(self, response):
     """Yield one DouyuItem (nickname + vertical cover) per JSON 'data' entry."""
     for entry in json.loads(response.body)['data']:
         item = DouyuItem()
         item['nickname'] = entry['nickname']
         item['imagelink'] = entry['vertical_src']
         yield item
예제 #5
0
 def parse(self, response):
     """Extract title/type/name/link for every room card in the live list."""
     rooms = response.xpath('//ul[@id="live-new-show-content-box"]/li')
     for room in rooms:
         item = DouyuItem()
         # .extract() keeps the list-of-strings form the pipeline expects.
         item['douyu_title'] = room.xpath('a/@title').extract()
         item['douyu_type'] = room.xpath('a/div/div/span/text()').extract()
         item['douyu_name'] = room.xpath('a/div/p/span/text()').extract()
         item['douyu_link'] = room.xpath('a/@href').extract()
         yield item
예제 #6
0
    def parse(self, response):
        """Yield name/cover items from the JSON feed, then request the next page."""
        for anchor in json.loads(response.text)['data']:
            item = DouyuItem()
            item['name'] = anchor['nickname']
            item['imglink'] = anchor['vertical_src']
            yield item

        # Advance pagination by one page (20 entries) and re-enter parse().
        self.offset += 20
        next_page = self.url + str(self.offset)
        yield scrapy.Request(next_page, callback=self.parse)
예제 #7
0
 def parse(self, response):
     """Scrape cover image, nickname and heat count from each list-cover card."""
     cards = response.xpath(
         "//div[@class='DyListCover HeaderCell is-href']/a")
     for card in cards:
         item = DouyuItem()
         # Indexing with [0] preserves the original fail-fast behaviour
         # when a node is missing.
         item['image_link'] = card.xpath("./div/div/img/@src").extract()[0]
         item['nick_name'] = card.xpath("./div/div/h2/text()").extract()[0]
         item['fans_num'] = card.xpath(
             "./div/div/span[@class ='DyListCover-hot']/text()").extract()[0]
         yield item
예제 #8
0
 def parse(self, response):
     """Yield nickname/image items from the JSON feed and page forward.

     Fix: once the offset cap (200) was reached, the original kept
     yielding a request for the same URL on every call, relying on
     Scrapy's dupe filter to end the crawl. The spider now stops
     scheduling new pages explicitly at the cap.
     """
     for entry in json.loads(response.text)["data"]:
         item = DouyuItem()
         item['nickname'] = entry['nickname']
         item['imgLink'] = entry['vertical_src']
         yield item

     if self.offset >= 200:
         return  # pagination cap reached — stop scheduling new pages
     self.offset += 20
     yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #9
0
    def parse(self, response):
        """Build a DouyuItem (nickname + medium avatar) for each JSON entry."""
        anchors = json.loads(response.body)['data']
        for anchor in anchors:
            item = DouyuItem()
            item['nickname'] = anchor['nickname']
            item['avatar_mid'] = anchor['avatar_mid']
            yield item
예제 #10
0
    def parse(self, response):
        """Map each room record in data.rl to a DouyuItem."""
        rooms = json.loads(response.text)['data']['rl']

        for room in rooms:
            entry = DouyuItem()
            entry['name'] = room['nn']          # anchor nickname
            entry['image_urls'] = room['rs1']   # room cover image
            entry['desc'] = room['rn']          # room title
            entry['uid'] = room['uid']
            yield entry
예제 #11
0
파일: douyupic.py 프로젝트: LQ6H/spider
    def parse(self, response):
        """Yield name/image items, then request the next page exactly once.

        Fix: the original incremented ``self.offset`` and yielded the
        next-page request inside the item loop, so every item on a page
        bumped the offset again and scheduled its own request — skipping
        pages and flooding the scheduler. Pagination now happens once,
        after all items on the current page are emitted.
        """
        data = json.loads(response.text)["data"]
        for each in data:
            item = DouyuItem()
            item["name"] = each["nickname"]
            item["image_link"] = each["vertical_src"]
            yield item

        # Advance to the next page of 20 results.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #12
0
    def parse(self, response):
        """Yield nickname/cover items from the JSON response, then paginate."""
        payload = json.loads(response.text)
        for anchor in payload['data']:
            item = DouyuItem()
            item["nickname"] = anchor["nickname"]
            item["imagelink"] = anchor["vertical_src"]
            yield item

        self.offset += 20
        next_url = self.url + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)
예제 #13
0
    def parse(self, response):
        """Emit one image item per entry in the JSON 'data' list."""
        records = json.loads(response.text)['data']

        for record in records:
            item = DouyuItem()
            # The images pipeline requires an iterable, so wrap the single
            # URL in a list.
            item['image_urls'] = [record['vertical_src']]
            yield item
예제 #14
0
    def parse(self, response):
        """Translate each JSON room record into a DouyuItem."""
        rooms = json.loads(response.body.decode())["data"]
        for room in rooms:
            item = DouyuItem()
            item["vertical_src"] = room["vertical_src"]
            item["nickname"] = room["nickname"]
            item["anchor_city"] = room["anchor_city"]
            # Build the public room URL from its id.
            item["room_url"] = "https://www.douyu.com/" + room["room_id"]
            item["room_name"] = room["room_name"]
            yield item
예제 #15
0
    def parse(self, response):
        """Yield nickname/image items and schedule the next page."""
        payload = json.loads(response.text)
        for info in payload['data']:
            item = DouyuItem()
            item['nickname'] = info['nickname']
            item['imagelink'] = info['vertical_src']
            yield item

        # Offsets advance in steps of 20 per page.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #16
0
    def parse(self, response):
        """Parse the JSON 'data' list into items, then follow the next page."""
        for entry in json.loads(response.text)["data"]:
            item = DouyuItem()
            item["nickname"] = entry["nickname"]
            item["imagelink"] = entry["vertical_src"]
            yield item

        self.offset += 20
        next_url = self.url + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)
예제 #17
0
    def parse(self, response):
        """Yield one item per anchor and paginate until the feed is empty.

        Fix: the item loop ended with a bare ``yield`` (yielding ``None``)
        instead of ``yield item``, so every populated item was discarded
        and the pipeline only ever received None values.
        """
        data_list = json.loads(response.text)['data']
        # An empty data list marks the end of the feed.
        if len(data_list) == 0:
            return
        for data in data_list:
            item = DouyuItem()
            item['nickname'] = data['nickname']
            item['imageLink'] = data['vertical_src']
            yield item

        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #18
0
 def parse(self, response):
     """Log the batch size, yield name/image items, then request the next page."""
     payload = json.loads(response.text)
     anchors = payload['data']
     self.log('len' + str(len(anchors)))
     for anchor in anchors:
         item = DouyuItem()
         item['name'] = anchor['nickname']
         item['image_url'] = anchor['vertical_src']
         yield item
     self.offset += 20
     yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #19
0
    def parse(self, response):
        """Yield items from the JSON 'data' section; pages advance by 100."""
        anchors = json.loads(response.text)["data"]

        for anchor in anchors:
            item = DouyuItem()
            item["name"] = anchor["nickname"]
            item["imagesUrls"] = anchor["vertical_src"]
            yield item

        # This endpoint is paged in steps of 100.
        self.offset += 100
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #20
0
    def xianImage(self, response):
        """Extract the full-size image URL/name from a detail page and save it."""
        page = BeautifulSoup(response.text, 'lxml')
        big_pic = page.find('div', {'class': 'big-pic'})
        img = big_pic.find('img')

        item = DouyuItem()
        # The real full-resolution image address and its caption.
        item['imageUrl'] = img.get('src')
        item['imageName1'] = img.get('alt')
        # The last six characters of the URL serve as the file-name suffix.
        name_suffix = item['imageUrl'][-6:]

        self.saveImg(item['imageUrl'], item['imageName1'], name_suffix)
예제 #21
0
    def parse(self, response):
        """Yield nickname/image/city items, then fetch the next page."""
        for record in json.loads(response.text)['data']:
            item = DouyuItem()
            item['nickname'] = record['nickname']
            item['imagelink'] = record['vertical_src']
            item['anchorcity'] = record['anchor_city']
            yield item

        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #22
0
    def parse(self, response):
        """Parse data.list entries into items and request the next iOS-API page."""
        entries = json.loads(response.text)["data"]["list"]
        for entry in entries:
            item = DouyuItem()
            item["nickname"] = entry["nickname"]
            item["imagelink"] = entry["vertical_src"]
            yield item

        self.offset += 20
        next_url = self.url + str(self.offset) + "/20/ios?client_sys=ios"
        yield scrapy.Request(next_url, callback=self.parse)
예제 #23
0
    def parse(self, response):
        """Yield nickname/image items and paginate up to offset 1000.

        Fix: once the offset cap was reached, the original kept yielding
        a request for the same URL on every call, depending on Scrapy's
        dupe filter to terminate the crawl; it now stops scheduling
        explicitly at the cap.
        """
        data = json.loads(response.text)['data']
        for each in data:
            item = DouyuItem()
            item['nickname'] = each['nickname']
            item['imagelink'] = each['vertical_src']
            # Hand the item to the pipeline.
            yield item

        if self.offset >= 1000:
            return  # pagination cap reached — stop scheduling new pages
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
예제 #24
0
    def parse(self, response):
        """Yield one populated image item per room record in data.rl.

        Fixes: the original created single shared items before the loop
        and yielded ``zbImgItems`` with no fields ever assigned (the
        population lines were commented out), so the pipeline received the
        same empty object over and over. Each iteration now builds and
        fills its own items, and the yielded image item carries the
        ``image_urls``/``images`` fields the commented-out code intended.
        """
        data = json.loads(response.text)['data']['rl']

        print(data)
        # Relevant DouyuItem fields (from the original note):
        #   image_urls - list of image URLs (images pipeline needs a list)
        #   images     - image name
        #   c2name     - section, nn - anchor, rn - room title
        for mz in data:
            # Anchor-info item (fields the original recorded but never
            # yielded; kept populated for parity).
            zb_item = DouyuItem()
            zb_item['c2name'] = mz['c2name']
            zb_item['nn'] = mz['nn']
            zb_item['rn'] = mz['rn']

            # Image item — populate it before yielding (the original
            # yielded it empty).
            img_item = DouyuItem()
            img_item['image_urls'] = [mz['rs1']]  # must be a list
            img_item['images'] = mz['nn']

            yield img_item
예제 #25
0
 def parse(self, response):
     """Yield name/url image items from data.rl, then request the next page."""
     payload = json.loads(response.body)['data']
     rooms = payload['rl']
     print(rooms)
     # An empty room list means the feed is exhausted.
     if len(rooms) == 0:
         return
     for room in rooms:
         item = DouyuItem()
         item['name_img'] = room['nn']
         item['url_img'] = room['rs1']
         yield item
     self.num += 1
     next_url = self.base_url + str(self.num)
     # dont_filter=True preserved from the original (bypasses the dupe filter).
     yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
예제 #26
0
파일: doyu.py 프로젝트: foxyiy/gfox
    def parse(self, response):
        """Yield room items and paginate once per response.

        Fix: the offset increment and next-page request lived inside the
        item loop, so every item on a page advanced the offset by 20 and
        scheduled another request — skipping pages and flooding the
        scheduler. Pagination now runs once, after the items.
        """
        data = json.loads(response.text)["data"]
        # An empty batch ends the crawl.
        if len(data) == 0:
            return
        for i in data:
            item = DouyuItem()
            item["name"] = i["nickname"]
            item["room_id"] = i["room_id"]
            item["image_urls"] = i["vertical_src"]
            yield item

        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset),
                             callback=self.parse)
예제 #27
0
파일: DouYu.py 프로젝트: ArvinZX/douyu_girl
 def parse(self, response):
     """Yield room items from the mobile list API and walk pages up to 122."""
     rooms = json.loads(response.body.decode())['data']['list']
     for room in rooms:
         item = DouyuItem()
         item['room_rid'] = room['rid']
         item['room_name'] = room['roomName']
         item['room_src'] = room['roomSrc']
         item['nick_name'] = room['nickname']
         item['room_hn'] = room['hn']
         yield item
     # Stop once the final page (122) has been processed.
     if self.offset == 122:
         return
     self.offset += 1
     next_url = 'https://m.douyu.com/api/room/list?page=' + str(self.offset) + '&type=yz'
     yield scrapy.Request(url=next_url, callback=self.parse)
예제 #28
0
 def parse(self, response):
     """Collect room fields via JSONPath and yield one item per room.

     Fix: ``jsonpath.jsonpath`` returns ``False`` (not an empty list)
     when a path matches nothing, and ``zip`` over ``False`` raises
     TypeError. Guard against a no-match result before iterating.
     """
     res = json.loads(response.body.decode('utf-8'))
     room_id_list = jsonpath.jsonpath(res, '$..room_id')
     room_name_list = jsonpath.jsonpath(res, '$..room_name')
     room_src_list = jsonpath.jsonpath(res, '$..room_src')
     nickname_list = jsonpath.jsonpath(res, '$..nickname')
     # jsonpath returns False when nothing matched; skip this response.
     if not (room_id_list and room_name_list and room_src_list
             and nickname_list):
         return
     for room_id, room_name, room_src, nickname in zip(
             room_id_list, room_name_list, room_src_list, nickname_list):
         item = DouyuItem()
         item['room_id'] = room_id
         item['room_name'] = room_name
         item['room_src'] = room_src
         item['nickname'] = nickname
         yield item
예제 #29
0
    def parse(self, response):
        """Yield nickname/image items; stop when the feed returns no data."""
        records = json.loads(response.body)['data']
        # An empty batch signals the last page.
        if len(records) == 0:
            return

        for record in records:
            item = DouyuItem()
            item['nickname'] = record['nickname']
            item['img_src'] = record['vertical_src']
            yield item

        self.offset += 20
        next_url = self.base_url + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)
예제 #30
0
    def parse(self, response):
        """Convert each JSON 'data' entry to an item, then follow the next page."""
        payload = json.loads(response.text)
        for anchor in payload['data']:
            item = DouyuItem()
            item['nickname'] = anchor['nickname']
            item['imageLink'] = anchor['vertical_src']
            yield item

        # Pages advance by 20 entries.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)