Exemplo n.º 1
0
class NewsComment(object):
    """Download Sohu news comments through the JSONP topic API and store them.

    Every comment returned by the API is flattened into its own dict and
    handed to ``self.mongo.put_comment``.
    """

    # How many times a request is attempted before giving up.
    MAX_ATTEMPTS = 5

    def __init__(self):
        self.mongo = MongoDB()

    def _load_jsonp(self, url):
        """Fetch *url* and decode the JSON wrapped inside its JSONP call.

        Returns the decoded object, or None when every attempt failed.
        """
        for _ in range(self.MAX_ATTEMPTS):
            try:
                body = requests.get(url, timeout=30).content
                # Take everything between the first "(" and the last ")".
                # Unlike a non-greedy match up to the first ");", this is
                # not cut short by a comment whose text contains ");".
                start = body.index(b'(') + 1
                end = body.rindex(b')')
                return json.loads(body[start:end])
            except Exception as e:
                print("获取评论错误: %s" % e)
        return None

    def run(self, news_url, _id, page):
        """Store one page of comments for the article at *news_url*.

        :param news_url: article URL; its last path segment is expected to
            look like ``<source_id>_<media_id>``.
        :param _id: value sent as ``topic_source_id`` in the topic lookup.
        :param page: comment page number to download.
        :return: always None; failures are reported on stdout.
        """
        if news_url.endswith('shtml'):
            # URLs ending in "shtml" are not handled by this crawler.
            return

        ids = news_url.split('/')[-1].split('_')
        if len(ids) < 2:
            # Without both ids the API URLs cannot be built.
            print("Cannot extract source/media id from url: %s" % news_url)
            return
        source_id, media_id = ids[0], ids[1]

        # First request: resolve the topic id that belongs to the article.
        topic_url = (
            'http://apiv2.sohu.com/api/topic/load?callback=jQuery1124008187733188312629_1539945526218&page_size=10'
            '&topic_source_id=%s&page_no=1&media_id=%s&source_id=mp_%s'
            % (_id, media_id, source_id)
        )
        topic_payload = self._load_jsonp(topic_url)
        if topic_payload is None:
            return
        topic_info = topic_payload.get(u'jsonObject') or {}
        if u'topic_id' not in topic_info:
            print("暂时无法获取topic_id")
            return
        topic_id = topic_info[u'topic_id']

        # Second request: the requested page of comments for that topic.
        page_url = (
            'http://apiv2.sohu.com/api/topic/load?callback=jQuery1124008187733188312629_1539945526218&page_size=10'
            '&topic_id=%s&page_no=%s&media_id=%s&source_id=mp_%s'
            % (topic_id, page, media_id, source_id)
        )
        page_payload = self._load_jsonp(page_url)
        if page_payload is None:
            return

        comments = (page_payload.get(u'jsonObject') or {}).get(u'comments') or []
        for comment in comments:
            comment_id = comment["comment_id"]
            passport = comment.get(u'passport') or {}
            # A fresh dict per comment, so a record already handed to the
            # store is never mutated by the next iteration.
            record = {
                # comment text
                'ping_lun_nei_rong': comment["content"],
                # comment creation time
                'ping_lun_shi_jian': comment["create_time"],
                # number of replies
                'hui_fu_shu': comment["reply_count"],
                # number of likes
                'dian_zan_shu': comment["support_count"],
                # comment id
                'ping_lun_id': comment_id,
                # user nickname (None when the API omits it)
                'yong_hu_ming': passport.get(u'nickname'),
                # gender: not provided by this API
                'xing_bie': None,
                # user level: not provided by this API
                'yong_hu_deng_ji': None,
                # user location as reported by the API
                'yong_hu_sheng_fen': comment["ip_location"],
                # crawl timestamp
                'do_time': time.time(),
                # source site
                'zhan_dian': u'搜狐网',
                # primary key: comment id + article url
                '_id': str(comment_id) + '|_|' + news_url,
            }
            self.mongo.put_comment(record)
Exemplo n.º 2
0
class MyCommentThread(threading.Thread):
    """Daemon worker that pulls articles from a queue and stores their
    NetEase (163.com) comments.

    The thread starts itself at the end of ``__init__``.
    """

    # Comment-list endpoint; filled with (channel, thread id, offset).
    _COMMENT_API = ('http://comment.%s.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s'
                    '/comments/newList?offset=%d&limit=30&showLevelThreshold=72&headLimit=1&'
                    'tailLimit=2')
    # Number of comments requested per page (matches ``limit=30`` above).
    _PAGE_SIZE = 30
    # How many times a request is attempted before giving up.
    _MAX_ATTEMPTS = 5

    def __init__(self, workqueue):
        """Create the worker and start it immediately.

        :param workqueue: queue of ``(info, article_url)`` tuples; ``info``
            must provide ``group(1)`` and ``group(2)`` (presumably a regex
            match holding channel and thread id -- verify against caller).
        """
        threading.Thread.__init__(self)
        self.workQueue = workqueue
        # The storage handles must exist BEFORE start(): run() begins
        # executing as soon as the thread starts, and working() reads them.
        self.mongodb = MongoDB()
        self.checkMongoDB = TempMongoDB()
        self.daemon = True
        self.start()

    def _comment_url(self, info, offset):
        """Build the comment-list URL for *info* starting at *offset*."""
        return self._COMMENT_API % (info.group(1), info.group(2), offset)

    def run(self):
        """Process queue entries until the queue reports empty."""
        # NOTE(review): empty() followed by a blocking get() can block forever
        # if another worker takes the last entry in between -- confirm whether
        # several workers share this queue.
        while not self.workQueue.empty():
            try:
                info, wenzhang_Url = self.workQueue.get()
                # The first page also tells us how many more pages to fetch.
                pages = self.working(wenzhang_Url, self._comment_url(info, 0), info)
                # working() returns None on failure; treat that as "no pages".
                if pages and pages > 0:
                    for page_no in range(1, pages + 1):
                        self.working(wenzhang_Url,
                                     self._comment_url(info, page_no * self._PAGE_SIZE),
                                     info)
            except Exception as e:
                print("ERROR: Locate in the CommentThread's run method 'while not Queue empty', exception: %s" % e)
                continue

    def working(self, content_url, the_comment_url, info):
        """Download one page of comments and store every comment on it.

        :param content_url: article URL; used in the stored primary keys.
        :param the_comment_url: comment-list URL to download.
        :param info: object whose ``group(1)`` names the comment sub-domain.
        :return: ``newListSize // 30`` taken from the response (0 when the
            page has no comments), or None when downloading or parsing
            failed.
        """
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'comment.%s.163.com' % (info.group(1)),
            'Referer': the_comment_url,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/59.0.3071.115 Safari/537.36'
        }
        json_data = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                json_data = json.loads(requests.get(url=the_comment_url, headers=header, timeout=30).content)
                break
            except Exception as e:
                print("ERROR: Failed to get the comment's json, exception: %s" % e)
        else:
            # Every attempt failed.
            return None

        pages = 0
        try:
            for comment_id, comment in json_data['comments'].items():
                # Nickname and location are optional: a comment without
                # them must not abort the rest of the page.
                user = comment.get('user') or {}
                do_time = time.time()
                comment_dict = {
                    # comment text
                    'ping_lun_nei_rong': comment['content'],
                    # comment creation time
                    'ping_lun_shi_jian': comment['createTime'],
                    # number of replies: not collected
                    'hui_fu_shu': None,
                    # number of likes
                    'dian_zan_shu': comment['vote'],
                    # comment id
                    'ping_lun_id': comment_id,
                    # user nickname
                    'yong_hu_ming': user.get('nickname'),
                    # gender: not collected
                    'xing_bie': None,
                    # user level: not collected
                    'yong_hu_deng_ji': None,
                    # user location
                    'yong_hu_sheng_fen': user.get('location'),
                    # crawl timestamp
                    'do_time': do_time,
                    # source site
                    'zhan_dian': u'网易新闻',
                    # primary key: comment id + article url
                    '_id': comment_id + content_url,
                }

                # Total comment count, used to derive the number of pages.
                ping_lun_shu = json_data['newListSize']
                pages = ping_lun_shu // self._PAGE_SIZE

                self.mongodb.put_comment(comment_dict)
                # Record the article's comment count in the check collection.
                check_dict = {
                    '_id': content_url,
                    'do_time': do_time,
                    'ping_lun_shu': ping_lun_shu,
                }
                self.checkMongoDB.put(check_dict)
            return pages
        except Exception as e:
            print("ERROR: Locate in the CommentThread's working method for parsing json data, exception: %s,"
                  "and json data is %s" % (e, json_data))
            return None
Exemplo n.º 3
0
class NewsComment(object):
    """Download Xinhua (news.cn) comments for an article and store them.

    Every comment returned by the API is flattened into its own dict and
    handed to ``self.mongo.put_comment``.
    """

    # How many times the request is attempted before giving up.
    MAX_ATTEMPTS = 5
    # The endpoint answers with a JavaScript assignment, not plain JSON.
    _JS_PREFIX = b'var commentJsonVarStr___='

    def __init__(self):
        self.mongo = MongoDB()

    def run(self, news_url, page):
        """Store one page of comments for the article at *news_url*.

        :param news_url: article URL containing ``c_<news id>.htm``.
        :param page: comment page number (integer).
        :return: always None; failures are reported on stdout.
        """
        # The news id sits between "c_" and ".htm" in the last path segment.
        # The dot is escaped and slashes are excluded, so an earlier "c_" or
        # "?htm" elsewhere in the URL cannot produce a wrong id.
        found = re.search(r'c_([^/]*?)\.htm', news_url)
        if found is None:
            print("Cannot extract news id from url: %s" % news_url)
            return
        comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?&newsId=1-%s&pid=%d' % (
            found.group(1), page)

        json_object = None
        for _ in range(self.MAX_ATTEMPTS):
            try:
                raw = requests.get(comment_url, timeout=30).content
                # Drop the "var ...=" prefix and the trailing ";" (plus any
                # surrounding whitespace) to leave bare JSON.
                json_object = json.loads(
                    raw.replace(self._JS_PREFIX, b'').strip().rstrip(b';'))
                break
            except Exception as e:
                print("获取评论错误: %s" % e)
        else:
            # Every attempt failed.
            return

        for item in json_object['contentAll']:
            # NOTE(review): the API's "userId" is stored as the comment id --
            # confirm this is unique per comment.
            ping_lun_id = item["userId"]
            # A fresh dict per comment, so a record already handed to the
            # store is never mutated by the next iteration.
            comment_dict = {
                # comment text
                'ping_lun_nei_rong': item["content"],
                # comment time
                'ping_lun_shi_jian': item["commentTime"],
                # number of replies: not collected
                'hui_fu_shu': None,
                # number of likes
                'dian_zan_shu': item["upAmount"],
                # comment id
                'ping_lun_id': ping_lun_id,
                # user nickname
                'yong_hu_ming': item["nickName"],
                # gender: not collected
                'xing_bie': None,
                # user level: not collected
                'yong_hu_deng_ji': None,
                # user location as reported by the API
                'yong_hu_sheng_fen': item["ipInfo"],
                # crawl timestamp
                'do_time': time.time(),
                # source site
                'zhan_dian': u'新华网',
                # primary key: comment id + article url
                '_id': str(ping_lun_id) + news_url,
            }
            self.mongo.put_comment(comment_dict)
Exemplo n.º 4
0
class NewsComment(object):
    """Download ifeng.com comments for an article and store them.

    Every comment returned by the API is flattened into its own dict and
    handed to ``self.mongo.put_comment``.
    """

    # How many times the request is attempted before giving up.
    MAX_ATTEMPTS = 3
    _API = 'http://comment.ifeng.com/get.php'
    # With format=js the endpoint answers with a JavaScript assignment.
    _JS_PREFIX = b'var commentJsonVarStr___='

    def __init__(self):
        self.mongo = MongoDB()

    def run(self, news_url, page):
        """Store one page of comments for the article at *news_url*.

        :param news_url: article URL, sent to the API as ``docUrl``.
        :param page: comment page number.
        :return: always None; failures are reported on stdout.
        """
        # Let requests build the query string so that "&", "?" or "#" inside
        # the article URL are percent-encoded instead of corrupting it.
        query = {
            'docUrl': news_url,
            'format': 'js',
            'job': 1,
            'p': page,
            'pageSize': 20,
        }

        json_object = None
        for _ in range(self.MAX_ATTEMPTS):
            try:
                raw = requests.get(self._API, params=query, timeout=30).content
                # Drop the "var ...=" prefix and the trailing ";" (plus any
                # surrounding whitespace) to leave bare JSON.
                json_object = json.loads(
                    raw.replace(self._JS_PREFIX, b'').strip().rstrip(b';'))
                break
            except Exception as e:
                print("获取评论错误: %s" % e)
        else:
            # Every attempt failed.
            return

        for item in json_object['comments']:
            ping_lun_id = item["comment_id"]
            # A fresh dict per comment, so a record already handed to the
            # store is never mutated by the next iteration.
            comment_dict = {
                # comment text
                'ping_lun_nei_rong': item["comment_contents"],
                # comment creation time
                'ping_lun_shi_jian': item["create_time"],
                # number of replies: not collected
                'hui_fu_shu': None,
                # number of likes: not collected
                'dian_zan_shu': None,
                # comment id
                'ping_lun_id': ping_lun_id,
                # user name
                'yong_hu_ming': item["uname"],
                # gender: not collected
                'xing_bie': None,
                # user level: not collected
                'yong_hu_deng_ji': None,
                # user location as reported by the API
                'yong_hu_sheng_fen': item["ip_from"],
                # crawl timestamp
                'do_time': time.time(),
                # source site
                'zhan_dian': u'凤凰网',
                # primary key: comment id + article url; %-formatting also
                # copes with a numeric comment id.
                '_id': '%s%s' % (ping_lun_id, news_url),
            }
            self.mongo.put_comment(comment_dict)