Example #1
    def parse(self, response):
        XiaohuaSpider.count += 1
        pagenum = XiaohuaSpider.count
        # if a maximum was supplied, stop once the current page exceeds it
        if XiaohuaSpider.maxpage and pagenum > int(XiaohuaSpider.maxpage):
            print('Page limit exceeded, stopping')
            return
        print('performing page %d' % pagenum)
        page_item = dict()  # records the page number
        page_item['page_num'] = pagenum
        yield page_item  # hand the current page number to the pipeline
        # strip the HTML comment markers and swap the cleaned body (bytes) into the response
        newbody = bytes(response.text.replace("<!--", "").replace("-->", ""),
                        encoding='utf-8')
        # extract the data
        newresponse = response.replace(body=newbody)
        li_list = newresponse.xpath('//li[@class=" j_thread_list clearfix"]')
        #print(len(li_list))

        for el in li_list:
            item = TiebaItem()  # a fresh item per thread
            item['title'] = el.xpath(".//a/text()").get()  # each xpath result is a list; .get() takes the first
            item['link'] = newresponse.urljoin(el.xpath(".//a/@href").get())
            yield item
        # next page
        next_part_url = newresponse.xpath(
            '//*[@id="frs_list_pager"]/a[contains(text(),"下一页>")]/@href').get()
        # keep requesting as long as a next-page link exists
        if next_part_url is not None:
            next_url = 'https:' + next_part_url
            yield scrapy.Request(url=next_url, callback=self.parse)
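None of these snippets show the TiebaItem the spiders import, so a minimal sketch is worth having. The field names below match Example #1; the real items.py is not shown, so this is an assumption, and each later example would declare whatever fields it assigns:

# hypothetical items.py matching Example #1 -- not the original definition
import scrapy

class TiebaItem(scrapy.Item):
    title = scrapy.Field()  # thread title
    link = scrapy.Field()   # absolute thread URL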
Example #2
    def the_tiezi(self, one_thread):
        '''Takes the raw selector for one thread title on the forum's front page
            and returns a dict with the title, author, and related fields.'''
        tiezi = TiebaItem()
        data = json.loads(one_thread.xpath("@data-field").extract_first())  # data-field carries a JSON summary of the thread
        title = one_thread.xpath('.//a[@class="j_th_tit "]/text()').extract_first().strip()  # title
        author = data['author_name']                                                         # author
        tid = data['id']                                                                     # thread tid
        reply_num = int(data['reply_num'])                                                   # reply count

        last_reply_time = one_thread.xpath(
            './/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()').extract_first()
        if last_reply_time is not None:                                                 # last reply time (pinned threads have none)
            last_reply_time = last_reply_time.strip()
            if re.match(r'\d+:\d+', last_reply_time):  # older threads show only a date; today's show only HH:MM, so prepend today's date
                last_reply_time = time.strftime("%Y-%m-%d ", time.localtime()) + last_reply_time
        # last replier (pinned threads apparently lack this)
        last_reply_author = one_thread.xpath('.//span[@class="tb_icon_author_rely j_replyer"]/@title').re_first(r'最后回复人: \s*(.*)')

        tiezi['title'] = title
        tiezi['author'] = author
        tiezi['tid'] = tid
        tiezi['pages'] = None
        tiezi['reply_num'] = reply_num
        tiezi['last_reply_author'] = last_reply_author
        tiezi['last_reply_time'] = last_reply_time
        tiezi['post_list'] = []   # list that will hold every floor (post) of the thread
        return dict(tiezi)
Example #3
    def parse_post_note(self, response):
        json_obj = json.loads(response.body)
        no = int(json_obj['no'])
        if no == 0:
            utils.debug('Post succeeded:', json.dumps(json_obj['data']))
            tid = json_obj['data']['tid']

            meta = response.meta['datas']

            data = dict()
            data['fid'] = meta['fid']
            data['id'] = tid
            data['kw'] = meta['kw']
            data['tbs'] = meta['tbs']
            data['title'] = meta['title']
            data['content'] = meta['content']
            data['timestamp'] = utils.timestamp
            item = TiebaItem(type=1)
            item['note'] = data
            yield item

            time.sleep(2)
            yield self.post_reply(tid)
        else:
            err_code = int(json_obj['err_code'])
            utils.debug('Post failed:', get_post_err_msg(no, err_code, response.body))
        if no == 40:
            vcode_obj = json_obj['data']['vcode']
            input_captcha = utils.show_captcha(vcode_obj['captcha_vcode_str'])
            captcha_type = vcode_obj['captcha_code_type']
            yield self.__check_captcha(captcha=input_captcha,
                                       captcha_type=captcha_type)

            yield self.post_note(input_captcha)
Example #4
    def parse(self, response):
        self.i = 1
        li_list = response.xpath('//*[@id="thread_list"]/li')  # renamed to avoid shadowing the list builtin
        for li in li_list:
            item = TiebaItem()  # fresh item per thread so fields never leak across iterations
            item["tittle"] = li.xpath(
                './/a[@rel="noreferrer"]/@title').extract_first()
            item["url"] = li.xpath(
                './/a[@rel="noreferrer"]/@href').extract_first()
            item["content"] = li.xpath(
                './/div[contains(@class,"threadlist_abs threadlist_abs_onlyline ")]/text()'
            ).extract_first()

            # compare by value: the original "is not" tested object identity
            if item["url"] is not None and item["url"] != 'javascript:;':

                item["url"] = "https://tieba.baidu.com" + item["url"]

                yield scrapy.Request(item["url"],
                                     callback=self.parse_detail,
                                     meta={"item": copy.deepcopy(item)})
                print(item["url"])
        # guard against a missing link before prepending the scheme
        next_href = response.xpath(
            "//a[contains(@class,'next pagination-item')]/@href"
        ).extract_first()
        print(type(next_href))
        print(next_href)
        if next_href is not None:
            yield scrapy.Request("https:" + next_href, callback=self.parse)
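Example #4 originally compared strings with "is not", which tests object identity rather than equality; whether that happens to work depends on CPython string interning, and Python 3.8+ flags it with a SyntaxWarning. A quick illustration with a made-up value:

url = ''.join(['javascript', ':;'])    # equal content, but a distinct object
print(url != 'javascript:;')           # False -- value comparison, the intended check
print(url is not 'javascript:;')       # True on CPython -- identity comparison misleads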
Example #5
    def parse_my_forums(self, response):
        text = re.search(r'<table>(.*?)</table>', response.text).group(1)
        selector = Selector(text=text)
        trs = selector.xpath('//tr')
        if trs is not None and len(trs) > 1:
            trs = trs[1:]
            for tr in trs:
                if len(tr.xpath('td')) == 0:
                    continue
                forum = dict()
                forum['title'] = tr.xpath('td[1]/a/@title').extract()[0]
                forum['href'] = tr.xpath('td[1]/a/@href').extract()[0]
                forum['exper'] = tr.xpath('td[2]/a/text()').extract()[0]
                forum['fid'] = tr.xpath('td[4]/span/@balvid').extract()[0]
                forum['tbs'] = tr.xpath('td[4]/span/@tbs').extract()[0]
                self.forums.append(forum)

                item = TiebaItem(type=0)
                item['forum'] = forum
                yield item
            self.pn += 1
            yield self.__get_my_forums(pn=self.pn)
        else:
            index = 0
            for forum in self.forums:
                if index > 0:
                    sleep(2)
                index += 1
                yield self.__single_signin(forum['tbs'], forum['title'])
Example #6
    def parse(self, response):
        selector = Selector(response)
        infos = selector.xpath('//div[@class="zu-top-feed-list"]/div')
        for info in infos:
            item = TiebaItem()  # fresh item per entry, so a partial IndexError leaves no stale fields
            try:
                question = info.xpath(
                    'div/div/h2/a/text()').extract()[0].strip()
                favour = info.xpath(
                    'div/div/div[1]/div[1]/a/text()').extract()[0]
                user = info.xpath(
                    'div/div/div[1]/div[3]/span/span[1]/a/text()').extract()[0]
                user_info = info.xpath(
                    'div/div/div[1]/div[3]/span/span[2]/text()').extract(
                    )[0].strip()
                content = info.xpath(
                    'div/div/div[1]/div[5]/div/text()').extract()[0].strip()

                item['question'] = question
                item['favour'] = favour
                item['user'] = user
                item['user_info'] = user_info
                item['content'] = content

                yield item
            except IndexError:
                pass

        urls = [
            'https://www.zhihu.com/topic/19552832/top-answers?page={}'.format(
                str(i)) for i in range(2, 50)
        ]
        for url in urls:
            yield Request(url, callback=self.parse)
Example #7
    def parse(self, response):  # the default parse() callback
        selector = Selector(response)

        infos = selector.xpath('//*[@id="TopicMain"]/div[2]/div/div')
        print(infos)
        for info in infos:
            item = TiebaItem()  # instantiate a fresh item for each entry
            try:
                question = info.xpath(
                    'div/div/h2/div/a/text()').extract()[0].strip()
                # favour = info.xpath('div/div/div[1]/div[1]/a/text()').extract()[0]
                # user = info.xpath('div/div/div[1]/div[3]/span/span[1]/a/text()').extract()[0]
                # user_info = info.xpath('div/div/div[1]/div[3]/span/span[2]/text()').extract()[0].strip()
                # content = info.xpath('div/div/div[1]/div[5]/div/text()').extract()[0].strip()
                item['question'] = question
                # item['favour'] = favour
                # item['user'] = user
                # item['content']= content
                yield item  # emit the scraped item
            except IndexError:
                pass  # skip entries that are missing expected fields
        urls = [
            'https://www.zhihu.com/topic/19552832/top-answers?page={}'.format(
                str(i)) for i in range(2, 50)
        ]
        for url in urls:
            yield Request(url, callback=self.parse)  # callback
Example #8
 def getInfo(self, response):
     print("storing items")
     li_list = response.xpath(
         '//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div'
     )
     for li in li_list:
         item = TiebaItem()
         # the original predicates were bare strings like div["col2_left ..."],
         # which XPath treats as always-true; rewritten here as @class tests
         item['reply_num'] = li.xpath(
             'div[@class="col2_left j_threadlist_li_left"]/span[@title="回复"]/text()'
         ).extract_first()
         item['theme'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@title'
         ).extract_first()
         item['theme_site'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
         ).extract_first()
         item['theme_author'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/@title'
         ).extract_first()
         item['create_time'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()'
         ).extract_first()
         item['content'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()'
         ).extract_first()
         item['replyer'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author_rely j_replyer"]/@title'
         ).extract_first()
         item['reply_date'] = li.xpath(
             'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'
         ).extract_first()
         yield item
Example #9
    def parse(self, response):

        for items in response.css(
                'div.l_post.l_post_bright.j_l_post.clearfix'):
            var = TiebaItem()
            x = items.css(
                'div.d_author>ul.p_author>li.d_name>a::text').extract()
            var['author'] = " ".join(x)
            var['image_urls'] = items.css(
                'div.d_post_content_main>div.p_content>cc>div>img::attr(src)'
            ).extract()
            var['images_name'] = []
            count = 0
            for i in var['image_urls']:
                count = count + 1
                var['images_name'].append(var['author'] + '_20FebI' +
                                          str(count))
            if not var['image_urls']:
                continue
            yield var
        next_url = response.css(
            ' div.l_thread_info > ul > li.l_pager.pager_theme_5.pb_list_pager > a:nth-child(7)::attr(href)'
        ).extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
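Example #9 fills image_urls, the field Scrapy's built-in ImagesPipeline reads by default. For the downloads to actually happen, the project settings must enable that pipeline and point it at a storage directory; a minimal sketch (the path is a placeholder):

# settings.py -- assumed configuration, not shown with the example
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './tieba_images'  # placeholder download directory

Note that the custom images_name list is ignored by the stock pipeline, which names files by a hash of the URL; honoring it would require overriding file_path() in an ImagesPipeline subclass.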
Example #10
 def parse(self, response):
     item = TiebaItem()
     for box in response.xpath(
             '//div[@class="threadlist_title pull_left j_th_tit "]/a[@class="j_th_tit "]'
     ):
         item['url'] = 'http://tieba.baidu.com' + box.xpath(
             './/@href').extract()[0]
         item['title'] = box.xpath('.//@title').extract()[0].strip()
         yield item
Example #11
 def parse(self, response):
     # Baidu Tieba ships its list markup inside HTML comments, so neither CSS
     # selectors on the raw response nor CrawlSpider work directly
     # extract every commented-out HTML fragment and join them into one document
     html_str_list = re.findall(r'<!--(.*?)-->', response.text, re.S)
     html_str = ''.join(html_str_list)
     bs = BeautifulSoup(html_str, 'html.parser')
     all_article = bs.select('[class="t_con cleafix"]')
     for article in all_article:
         item = TiebaItem()
         bs_article = BeautifulSoup(str(article), 'html.parser')
         title = re.findall(
             r'<a.*?>(.*?)</a>',
             str(bs_article.select('[class="j_th_tit"]')[0]))[0]
         detail_url = 'https://tieba.baidu.com' + re.findall(
             r'href="(.*?)"', str(
                 bs_article.select('[class="j_th_tit"]')[0]))[0]
         # VIP users carry a different author class
         try:
             auther = re.findall(
                 r'title="主题作者: (.*?)"',
                 str(bs_article.select('[class="tb_icon_author"]')[0]))[0]
         except IndexError:  # fall back to the no-icon author class
             auther = re.findall(
                 r'title="主题作者: (.*?)"',
                 str(
                     bs_article.select(
                         '[class="tb_icon_author no_icon_author"]')[0]))[0]
         item['title'] = title
         item['auther'] = auther
         item['detail_url'] = detail_url
         # thread id; can serve as a foreign key linking comments to threads
         article_id = int(detail_url.split('/')[-1])
         item['article_id'] = article_id
         # whether the thread is pinned
         if bs_article.select('[class="icon-top"]') != []:
             item['is_top'] = 1
         else:
             item['is_top'] = 0
         item_copy = copy.deepcopy(item)
         yield scrapy.Request(item_copy['detail_url'],
                              callback=self.parse_detail,
                              meta={'item': item_copy})
     next_page_url = 'https:' + re.findall(
         r'href="(.*?)"', str(
             bs.select('[class="next pagination-item"]')[0]))[0].replace(
                 'amp;', '')
     try:
         # if the "last page" link cannot be found, this is the final page
         last_page_url = 'https:' + re.findall(
             r'href="(.*?)"',
             str(bs.select('[class="last pagination-item"]')
                 [0]))[0].replace('amp;', '')
         yield scrapy.Request(next_page_url, callback=self.parse)
     except IndexError:
         return
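The comment-stripping trick that Examples #1, #11, and #16 all rely on is easy to verify outside Scrapy; a self-contained sketch over a made-up fragment:

import re

# Tieba hides the thread list inside an HTML comment; a toy stand-in:
raw = '<code style="display:none;"><!--<ul id="thread_list"><li>post</li></ul>--></code>'

# Example #11's approach: extract every commented fragment and join them
fragments = re.findall(r'<!--(.*?)-->', raw, re.S)
print(''.join(fragments))  # <ul id="thread_list"><li>post</li></ul>

# Example #1's approach: drop the markers and keep the page in one piece
print(raw.replace('<!--', '').replace('-->', ''))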
Example #12
 def parse(self, response):
     item = TiebaItem()
     item['title'] = response.xpath(
         '//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a/text()'
     ).extract()
     item['author'] = response.xpath(
         '//*[@id="thread_list"]/li/div/div[2]/div[1]/div[2]/span[1]/span[1]/a/text()'
     ).extract()
     item['reply'] = response.xpath(
         '//*[@id="thread_list"]/li/div/div[1]/span/text()').extract(
         )  # TODO: the first two entries' reply counts cannot be output
     yield item
Example #13
 def parse(self, response):
     for i in response.xpath('//li[@class=" j_thread_list clearfix"]'):
         item = TiebaItem()  # note: the item must be instantiated inside the for loop, i.e. a fresh item after every yield
         item['title'] = i.xpath(
             './/div[@class="threadlist_title pull_left j_th_tit "]/a/text()'
         ).extract_first()
         item['author'] = i.xpath(
             './/a[@class="frs-author-name j_user_card "]/text()'
         ).extract_first()
         item['describ'] = i.xpath(
             './/div[@class="threadlist_abs threadlist_abs_onlyline "]/text()'
         ).extract_first().strip()
         item['comment_num'] = i.xpath(
             './/span[@class="threadlist_rep_num center_text"]/text()'
         ).extract_first()
         link = i.xpath(
             './/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
         ).extract_first()
         item['_id'] = response.urljoin(link)
         if link:
             yield Request(response.urljoin(link),
                           callback=self.comment_parse,
                           meta={'item': item})
             link = item['_id'].split('/')[-1]
             yield Request(
                 'https://tieba.baidu.com/p/totalComment?t=1506043640283&tid=%s&fid=13785031&pn=1&see_lz=0'
                 % link,
                 callback=self.sub_comment_parse,
                 meta={'item': item})
             # yield SplashRequest(response.urljoin(link), callback=self.comment_parse, meta={'item': item},args={'wait': '0.2'})
         # item['_id']=item['title']
         print(item['title'])
         for key in item:
             item[key] = item[key] if item[key] else "nothing"
         # yield item
     self.index += 1
     a = response.xpath(
         '//div[@class="thread_list_bottom clearfix"]//a[@class="next pagination-item "]/text()'
     ).extract_first()
     if a:
         print('>>>\n', a, "page %s" % self.index, '\n>>>')
         # yield {'glap': '>>>' + a + "this is %s page" % self.index}
     next_page_url = response.xpath(
         '//div[@class="thread_list_bottom clearfix"]//a[@class="next pagination-item "]/@href'
     ).extract_first()
     if next_page_url is not None:
         yield Request(response.urljoin(next_page_url))
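The warning in Example #13 about instantiating the item inside the loop deserves a framework-free illustration: Scrapy schedules requests asynchronously, so meta={'item': item} stores a reference, and mutating one shared object corrupts requests that are still queued. Plain dicts stand in for items here:

shared = {}
queue = []
for title in ['thread-1', 'thread-2']:
    shared['title'] = title
    queue.append(shared)               # the same object is queued twice
print([m['title'] for m in queue])     # ['thread-2', 'thread-2'] -- corrupted

queue = []
for title in ['thread-1', 'thread-2']:
    fresh = {'title': title}           # a new object per iteration...
    queue.append(fresh)                # ...or copy.deepcopy(item), as in Example #4
print([m['title'] for m in queue])     # ['thread-1', 'thread-2']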
Example #14
 def parse(self, response):
     div_list = response.xpath('//div[contains(@class,"i")]')
     for div in div_list:
         item = TiebaItem()  # fresh item per thread; meta passes this object by reference
         item['title'] = div.xpath('./a/text()').extract_first()
         item['href'] = self.base_url + div.xpath(
             './a/@href').extract_first()
         item['img_list'] = []
         yield scrapy.Request(url=item['href'],
                              callback=self.parse_detail,
                              meta=item)
     next_page = response.xpath('//a[text()="下一页"]/@href').extract_first()
     if next_page is not None:
         next_page = self.base_url + next_page
         yield scrapy.Request(url=next_page, callback=self.parse)
Example #15
 def parse(self, response):
     li_list = response.xpath(
         "//div[@id='pagelet_frs-list/pagelet/thread_list']/ul/li")
     for li in li_list:
         item = TiebaItem()
         item["标题"] = li.xpath(
             "./div/div[2]/div/div/a/text()").extract_first()  # title
         item["回复数"] = li.xpath("./div/div[1]/span/text()").extract_first()  # reply count
         item["详情页链接"] = li.xpath(
             "./div/div[2]/div/div/a/@href").extract_first()  # detail-page link
         # prepend the host to the relative href (the original list
         # comprehension iterated over the string character by character)
         item["详情页链接"] = "https://tieba.baidu.com" + item["详情页链接"]
         yield scrapy.Request(item["详情页链接"],
                              callback=self.get_next_page,
                              meta={"item": item})  # keyword was misspelled "mate"
     # fetch the next listing page once, outside the per-thread loop
     next_url = response.xpath(
         "//div[@class='thread_list_bottom clearfix']/div/a[10]/@href"
     ).extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #16
File: tb.py, Project: dacharry/baidutieba
 def parse(self, response):
     html = re.findall(
         r'pagelet_html_frs-list/pagelet/thread_list" style="display:none;"><!--(.*?)--></code>',
         response.body.decode(), re.S)[0]
     html = etree.HTML(html)
     li_list = html.xpath(
         '//ul[@id="thread_list"]//li[@class=" j_thread_list clearfix"]')
     for li in li_list:
         item = TiebaItem()
         item['title'] = li.xpath('.//a/text()')[0]
         item['li_url'] = li.xpath('.//a/@href')[0]
         # print(item)
         item['img_list'] = []
         if item['li_url']:
             yield response.follow(item['li_url'],
                                   callback=self.parse_detail,
                                   meta={'item': item})
     next_page = html.xpath(
         '//div[@id="frs_list_pager"]/a[contains(@class,"next")]/@href')
     if next_page and self.page < self.max_page:  # xpath returns a list; empty on the last page
         self.page += 1
         yield response.follow(next_page[0], callback=self.parse)
Example #17
File: tieba_stl.py, Project: adreame/Spider
    def parse(self, response):
        # group the page into one node per thread
        tz_list = response.xpath("//div[contains(@class, 'i')]")
        print("-"*100 , tz_list)
        # 遍历取到的li标签, 取出相应信息
        for div in tz_list:
            item = TiebaItem()
            item['tz_href'] = 'https://tieba.baidu.com' + div.xpath(".//a/@href").extract_first()
            item['tz_title'] = div.xpath(".//a/text()").extract()
            # print("*"*100, item)
            yield scrapy.Request(
                item['tz_href'],  # the link was stored under 'tz_href', not 'href'
                callback=self.parse_detail,
                meta={"item": item}
            )

        # next page
        next_url = response.xpath("//a[text()='下一页']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                'http://tieba.baidu.com/mo/q---596A8CA33D57134A7383E14E264D8288%3AFG%3D1--1-3-0--2--wapp_1516511596154_469' + next_url,
                callback=self.parse
            )
Example #18
 def third_parse(self, response):
     '''
     Get every post's content
     '''
     global tmpItems
     item_1 = response.meta['item_1']
     response = response.body
     response = self.clean_data(response)
     html = Selector(text=response)
     page = html.xpath(
         '//div[@class="d_post_content j_d_post_content  clearfix"]/text()'
     ).extract()
     items = []
     item = TiebaItem()
     item['title'] = item_1['title'].encode('utf8')
     item['url'] = item_1['url'].encode('utf8')
     item['pageUrl'] = item_1['pageUrl'].encode('utf8')
     page = [p.strip() for p in page]
     item['text'] = "##".join(page)
     # print item
     items.append(item)
     tmpItems = []
     return items
Example #19
    def parse(self, response):
        # skeleton: the XPath expressions and the URL template below were left
        # blank in the original and must be filled in before this spider runs
        item = TiebaItem()
        selector = Selector(response)
        infos = selector.xpath('')
        for info in infos:
            try:
                question = info.xpath()
                favour = info.xpath()
                user = info.xpath()
                user_info = info.xpath()
                content = info.xpath()
                item['question'] = question
                item['favour'] = favour
                item['user'] = user
                item['user_info'] = user_info
                item['content'] = content
                yield item
            except IndexError:
                pass

        urls = [''.format(str(i)) for i in range(2, 50)]
        for url in urls:
            yield Request(url, callback=self.parse)  # callback
Example #20
    def parse_post_reply(self, response):
        json_obj = json.loads(response.body)
        no = int(json_obj['no'])
        data_obj = json_obj['data']
        tid = int(data_obj['tid']) if 'tid' in data_obj else 0
        if no == 0 and tid != 0:
            utils.debug('Reply succeeded:', json.dumps(json_obj['data']))

            meta = response.meta['datas']

            data = dict()
            data['fid'] = meta['fid']
            data['kw'] = meta['kw']
            data['tbs'] = meta['tbs']
            data['tid'] = meta['tid']
            data['content'] = meta['content']
            data['timestamp'] = utils.timestamp
            item = TiebaItem(type=2)
            item['reply'] = data
            yield item

            time.sleep(60)
            yield self.post_reply(tid)
        else:
            err_code = int(json_obj['err_code'])
            utils.debug('Reply failed:', get_post_err_msg(no, err_code, response.body))
        if no == 220034 and tid != 0:
            time.sleep(300)
            yield self.post_reply(tid)
        if no == 40 and tid != 0:
            vcode_obj = json_obj['data']['vcode']
            input_captcha = utils.show_captcha(vcode_obj['captcha_vcode_str'])
            captcha_type = vcode_obj['captcha_code_type']
            yield self.__check_captcha(captcha=input_captcha,
                                       captcha_type=captcha_type)

            yield self.post_reply(tid, input_captcha)
Example #21
    def parsePage(self, response):

        selector = Selector(response)
        # content_list3 = selector.xpath("/html/body//div[@class='s_post_list']/div[@class='s_post']")
        content_list = selector.xpath(
            "/html/body//div[@class='s_post_list']/div[@class='s_post']/span[@class='p_title']/a"
        )
        content_list_2 = selector.xpath(
            "/html/body//div[@class='s_post_list']/div[@class='s_post']/div[@class='p_content']"
        )
        i = 0
        for content in content_list:
            item = TiebaItem()
            title = content.xpath("string(.)").extract_first()
            url = content.xpath('@href').extract_first()
            content2 = content_list_2[i].xpath("string(.)").extract_first()
            i = i + 1
            url = str(self.host + url)
            item['url'] = url
            item['title'] = title
            item['content'] = content2
            print(url)
            print(title)
            yield item
Example #22
    def parse(self, response):

        li_list = response.xpath(
            '//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div')
        for li in li_list:
            item = TiebaItem()
            # the original predicates were bare strings like div["col2_left ..."],
            # which XPath treats as always-true; rewritten here as @class tests
            item['reply_num'] = li.xpath(
                'div[@class="col2_left j_threadlist_li_left"]/span[@title="回复"]/text()'
            ).extract_first()
            item['theme'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@title'
            ).extract_first()
            item['theme_site'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
            ).extract_first()
            item['theme_author'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/@title'
            ).extract_first()
            item['create_time'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()'
            ).extract_first()
            item['content'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()'
            ).extract_first()
            item['replyer'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author_rely j_replyer"]/@title'
            ).extract_first()
            item['reply_date'] = li.xpath(
                'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'
            ).extract_first()
            yield item

        # next page (this spider stops after page 10)
        num = int(
            response.xpath(
                '//*[@id="frs_list_pager"]/span[@class="pagination-current pagination-item "]/text()'
            ).extract_first())
        if num <= 10:
            next_page = response.xpath(
                '//*[@id="frs_list_pager"]/a[@class="next pagination-item "]/@href'
            ).extract_first()
            print("next_page: %s" % next_page)
            print(num)
            if next_page is not None:
                yield response.follow(next_page, self.parse)

        # reply count
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_left j_threadlist_li_left"]/span[@title="回复"]/text()')
        # thread title
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit  member_thread_title_frs "]/a/@href
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit  member_thread_title_frs "]/a/@title
        # author
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author no_icon_author"]/@title
        # creation time
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()
        # content
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_detail clearfix"]/div["@class=threadlist_text pull_left"]/div/text()
        # last replier
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author_rely j_replyer"]/@title
        # last reply time
        # //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div["col2_right j_threadlist_li_right "]/div["threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'