예제 #1
0
    def parse_post(self, response):
        """Parse one page of a tieba thread.

        Yields, per non-ad floor: an image item for every image in the
        floor, a Request for the author's profile page, and the floor's
        PostItem. After the loop: a Request to the comment API (if any
        floor has comments) and a Request for the next page, recursing
        into this callback.
        """
        meta = response.meta
        has_comment = False
        for floor in response.xpath("//div[contains(@class, 'l_post')]"):
            if not helper.is_ad(floor):
                data = json.loads(floor.xpath("@data-field").extract_first())
                item = PostItem()
                item['post_id'] = data['content']['post_id']
                item['author'] = data['author']['user_name']
                item['comment_num'] = data['content']['comment_num']
                if item['comment_num'] > 0:
                    has_comment = True
                content = floor.xpath(
                    ".//div[contains(@class,'j_d_post_content')]"
                ).extract_first()
                # Older posts have no content in data-field, so parse the HTML.
                item['content'] = helper.parse_content(content, True)

                # BUGFIX: was `images.index(image)` inside the loop, which
                # always returns the FIRST occurrence — duplicate image URLs
                # all got the same index — and costs O(n) per lookup.
                # enumerate() yields the true position in O(1). An empty
                # `images` list simply yields nothing, so the old
                # `if len(images) > 0` guard is unnecessary.
                images = helper.get_images(content, True)
                for image_index, image in enumerate(images):
                    yield self.parse_image(image_url=image,
                                           post_id=item['post_id'],
                                           image_index=image_index)

                # Older posts have no thread_id in data-field.
                item['thread_id'] = meta['thread_id']
                item['floor'] = data['content']['post_no']
                if 'user_id' in data['author'].keys():
                    item['user_id'] = data['author']['user_id']
                else:
                    item['user_id'] = None
                # Only older posts carry a date in data-field; for newer
                # posts read the timestamp out of the tail-info span.
                if 'date' in data['content'].keys():
                    item['time'] = data['content']['date']
                else:
                    item['time'] = floor.xpath(".//span[@class='tail-info']")\
                    .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
                user_uri = floor.xpath(
                    ".//a[@class='p_author_name j_user_card']/@href"
                ).extract_first()
                if user_uri:
                    url = 'http://tieba.baidu.com%s' % user_uri
                    yield scrapy.Request(url, callback=self.parse_user)
                yield item
        if has_comment:
            url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
                meta['thread_id'], meta['page'])
            if self.see_lz:
                url += '&see_lz=1'
            yield scrapy.Request(url, callback=self.parse_comment, meta=meta)
        next_page = response.xpath(
            u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
        if next_page:
            meta['page'] += 1
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, callback=self.parse_post, meta=meta)
예제 #2
0
 def parse_post(self, response):
     """Parse one page of a tieba thread.

     Yields a PostItem per non-ad floor, then (if any floor has
     comments) a Request to the comment API, and a Request for the
     next page that recurses into this callback.
     """
     # The desktop User-Agent literal was duplicated verbatim in both
     # Request calls below; keep a single copy so they cannot drift.
     ua_headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
     }
     meta = response.meta
     has_comment = False
     for floor in response.xpath("//div[contains(@class, 'l_post')]"):
         if not helper.is_ad(floor):
             data = json.loads(floor.xpath("@data-field").extract_first())
             item = PostItem()
             item['id'] = data['content']['post_id']
             item['author'] = data['author']['user_name']
             item['comment_num'] = data['content']['comment_num']
             if item['comment_num'] > 0:
                 has_comment = True
             content = floor.xpath(
                 ".//div[contains(@class,'j_d_post_content')]"
             ).extract_first()
             # Older posts have no content in data-field, so parse the HTML.
             item['content'] = helper.parse_content(content)
             # Older posts have no thread_id in data-field.
             item['thread_id'] = meta['thread_id']
             item['floor'] = data['content']['post_no']
             # Only older posts carry a date in data-field; for newer
             # posts read the timestamp out of the tail-info span.
             if 'date' in data['content'].keys():
                 item['time'] = data['content']['date']
             else:
                 item['time'] = floor.xpath(".//span[@class='tail-info']")\
                 .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
             yield item
     if has_comment:
         url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
             meta['thread_id'], meta['page'])
         if self.see_lz:
             url += '&see_lz=1'
         yield scrapy.Request(url,
                              callback=self.parse_comment,
                              meta=meta,
                              headers=ua_headers)
     next_page = response.xpath(
         u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
     if next_page:
         meta['page'] += 1
         url = response.urljoin(next_page.extract_first())
         yield scrapy.Request(url,
                              callback=self.parse_post,
                              meta=meta,
                              headers=ua_headers)
예제 #3
0
    def parse_post(self, response):
        """Parse one page of a tieba thread.

        Yields a PostItem per non-ad floor, then a Request to the
        comment-count API when any floor has comments, and a Request
        for the next page that recurses into this callback.
        """
        meta = response.meta
        found_comments = False
        for node in response.xpath("//div[contains(@class, 'l_post')]"):
            # Skip advertisement floors entirely.
            if helper.is_ad(node):
                continue
            field = json.loads(node.xpath("@data-field").extract_first())
            post = PostItem()
            post['id'] = field['content']['post_id']
            post['author'] = field['author']['user_name']
            post['comment_num'] = field['content']['comment_num']
            if post['comment_num'] > 0:
                found_comments = True
            raw_html = node.xpath(
                ".//div[contains(@class,'j_d_post_content')]"
            ).extract_first()
            # Older posts have no content in data-field, so parse the HTML.
            post['content'] = helper.parse_content(raw_html)
            # Older posts have no thread_id in data-field either.
            post['thread_id'] = meta['thread_id']
            post['floor'] = field['content']['post_no']
            # Only older posts carry a date in data-field; newer posts
            # expose the timestamp in the tail-info span instead.
            if 'date' in field['content']:
                post['time'] = field['content']['date']
            else:
                post['time'] = node.xpath(
                    ".//span[@class='tail-info']"
                ).re_first(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
            yield post

        if found_comments:
            url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
                meta['thread_id'], meta['page'])
            if self.see_lz:
                url += '&see_lz=1'
            yield scrapy.Request(url,
                                 callback=self.parse_totalComment,
                                 meta=meta,
                                 headers=self.my_headers)
        next_page = response.xpath(
            u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
        if next_page:
            meta['page'] += 1
            yield scrapy.Request(response.urljoin(next_page.extract_first()),
                                 callback=self.parse_post,
                                 meta=meta,
                                 headers=self.my_headers)
예제 #4
0
 def parse(self, response):
     """Collect title/date/url/author for each thread on a list page,
     accumulate them on self.data, and paginate in steps of 50 until
     index 1000, at which point the collected rows are persisted."""
     for row in response.xpath("//li[contains(@class,'j_thread_list')]"):
         entry = PostItem()
         try:
             entry['title'] = self.extractInfo(
                 row.xpath(".//div[contains(@class,'threadlist_abs')]/text()").extract())
             entry['date'] = self.extractInfo(
                 row.xpath(".//span[contains(@class,'is_show_create_time')]/text()").extract())
             title_div = row.xpath(
                 ".//div[contains(@class,'threadlist_title pull_left j_th_tit')]")
             entry['url'] = self.extractInfo(title_div.xpath("a/@href").extract())
             entry['author'] = self.extractInfo(
                 row.xpath(".//span[contains(@class,'tb_icon_author')]/@title").extract())
             print(entry['title'], entry['date'], entry['url'], entry['author'])
             self.data.append(entry)
         except IndexError:
             # presumably extractInfo indexes into an empty xpath match;
             # skip rows that are missing a field.
             continue
     self.currentIndex = self.currentIndex + 50
     if self.currentIndex > 1000:
         self.saveToDb(self.data)
         return
     # NOTE(review): make_requests_from_url is deprecated/removed in newer
     # Scrapy releases — consider scrapy.Request(nexturl) when upgrading.
     nexturl = 'https://tieba.baidu.com/f?kw=clannad&ie=utf-8&pn=' + str(self.currentIndex)
     yield self.make_requests_from_url(nexturl)
예제 #5
0
    def parse_post(self, response):
        """Parse one page of a tieba thread.

        Per non-ad floor: yields a PostItem and a Request for the
        author's profile page. After the loop: a Request to the comment
        API (when any floor has comments) and a Request for the next
        page, recursing into this callback.
        """
        meta = response.meta
        has_comment = False
        # total_commont_floor_num = 0

        # Accumulate the floor total across recursive calls; the actual
        # statistics are computed in a separate service.
        # if meta.has_key('total_commont_floor_num'):
        #     print("已经有了统计的总数:{}".format(meta['total_commont_floor_num']))
        #     total_commont_floor_num = meta['total_commont_floor_num']

        floor_list = response.xpath("//div[contains(@class, 'l_post')]")
        # VIP users have vip_red in the class attribute, hence contains().
        # post_floor_userinfo_list = response.xpath('//a[contains(@class, "p_author_name")]//@href')

        # print("楼层用户的集合的长度:{}".format(len(post_floor_userinfo_list)))

        for index, floor in enumerate(floor_list):
            if not helper.is_ad(floor):
                # total_commont_floor_num  +=1
                data = json.loads(floor.xpath("@data-field").extract_first())
                item = PostItem()

                # Floor author's user name.
                # item['author'] = data['author']['user_name']
                # Floor author's user id.
                item['tiebaAccountId'] = data['author']['user_id']
                # Per-floor comment count is used later to handle
                # sub-comments ("floor in floor").
                content = floor.xpath(
                    ".//div[contains(@class,'j_d_post_content')]"
                ).extract_first()
                # Older posts have no content in data-field, so parse the HTML.
                item['content'] = helper.parse_content(content)
                # Floor (post) id.
                item['outContentId'] = data['content']['post_id']
                # Tieba (forum) id, carried through the request meta.
                item['tiebaInfoId'] = meta['tiebaInfoId']
                # TODO: the user's profile link could be scraped directly.
                # print("获取:{}".format(user_detail_href))
                # item['user_detail_href'] = user_detail_href
                # Older posts have no threadId in data-field.
                item['threadId'] = meta['threadId']
                # Floor number.
                # item['floor'] = data['content']['post_no']

                post_time = None
                # Only older posts carry a date in data-field.
                if 'date' in data['content'].keys():
                    post_time = data['content']['date']
                    item['publishTime'] = post_time
                    # (date came straight from data-field)
                else:
                    # Newer posts: read the timestamp from the tail-info span.
                    post_time = floor.xpath(".//span[@class='tail-info']")\
                    .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
                    item['publishTime'] = post_time

                # Normalize the timestamp format before parsing.
                post_time = self.dealTime(post_time)

                created_at = datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
                since_date = datetime.strptime(self.since_date,
                                               "%Y-%m-%d %H:%M:%S")

                # Flag posts older than since_date — isSend presumably
                # gates downstream forwarding; verify against consumer.
                if created_at < since_date:
                    item['isSend'] = False
                else:
                    item['isSend'] = True
                # Handle comments: remember whether any floor has them.
                item['comment_num'] = data['content']['comment_num']
                if item['comment_num'] > 0:
                    has_comment = True
                yield item

                user_detail_href = "https://tieba.baidu.com/home/main?id=" + data[
                    'author']['portrait']
                # Request the author's profile page; the item's fields are
                # forwarded as keyword arguments via cb_kwargs.
                yield scrapy.Request(user_detail_href,
                                     callback=self.parse_user_detail,
                                     meta=meta,
                                     cb_kwargs=dict(item))

        # meta['total_commont_floor_num'] = total_commont_floor_num
        if has_comment:
            url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
                meta['threadId'], meta['page'])
            if self.see_lz:
                url += '&see_lz=1'
            yield scrapy.Request(url, callback=self.parse_comment, meta=meta)
        next_page = response.xpath(
            u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
        if next_page:
            meta['page'] += 1
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, callback=self.parse_post, meta=meta)