Example #1
def text_kmeans_clustering():
    # Use the TOPK_FREQ_WORD most frequent words when evaluating clusters
    TOPK_FREQ_WORD = 50

    # Minimum cluster size kept during cluster evaluation
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # k-means clustering and cluster evaluation
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, \
                top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)

            for item in items:
                news = News(item['news_id'])

                if label == 'other':
                    label = news.otherClusterId

                comment = Comment(item['_id'])
                comment.update_comment_label(label)

            eventcomment.save_cluster(label, news_id, int(time.time()))

        # Compute the feature words of each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # Compute the text weight of each comment
        for input in inputs:
            weight = text_weight_cal(input, cluster_feature[input['label']])
            comment = Comment(input['_id'])
            comment.update_comment_weight(weight)
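The record shape this example feeds into the pipeline can be summarized in a short sketch; the field names mirror the code above, the sample id and text are made up, and ad_filter and triple_classifier are assumed to come from the surrounding project.

# Hypothetical input record: only '_id' and 'content168' (the raw comment text) are assumed here.
r = {'_id': 'comment-1', 'content168': u'这个结果真让人高兴'}
r['title'] = ''
r['content'] = r['content168'].encode('utf-8')
r['text'] = r['content168']
item = ad_filter(r)                      # adds an 'ad_label' field (0 = not an ad)
if item['ad_label'] == 0:
    sentiment = triple_classifier(item)  # integer emotion code, 0 meaning no emotion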
def classify_without_sentiment(uid_weibo, uid_list, start_date, end_date):
    '''
    Main classification routine for posts without sentiment labels.
    Input:  list object  [[uid, text, time], [uid, text, time], ...]
    Output: dict object  {uid1: str1, uid2: str2, ...}
    '''

    uid_sentiment = dict()
    new_uid = []
    min_ts = int(time.mktime(time.strptime(start_date, '%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date, '%Y-%m-%d')))
    time_index, time_list = sta_time_list(min_ts, max_ts)
    for uid, text, ts in uid_weibo:
        if uid not in new_uid:
            new_uid.append(uid)
        sentiment = triple_classifier({'text': text})
        date_str = time.strftime('%Y-%m-%d', time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            index = time_index[date_str]
            if sentiment == 2 or sentiment == 5:
                item[index][0] = item[index][0] + 1
            if sentiment == 3 or sentiment == 4:
                item[index][1] = item[index][1] + 1
            item[index][2] = item[index][2] + 1
            uid_sentiment[uid] = item
        else:
            item = time_list
            index = time_index[date_str]
            if sentiment == 2 or sentiment == 5:
                item[index][0] = item[index][0] + 1
            if sentiment == 3 or sentiment == 4:
                item[index][1] = item[index][1] + 1
            item[index][2] = item[index][2] + 1
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment, min_ts, max_ts)

    com_result = dict()
    if len(uid_list):
        for uid in uid_list:
            if s_result.has_key(uid):
                com_result[uid] = SEN_DICT[s_result[uid]]
            else:
                com_result[uid] = SEN_DICT[0]
    else:
        for uid in new_uid:
            if s_result.has_key(uid):
                com_result[uid] = SEN_DICT[s_result[uid]]
            else:
                com_result[uid] = SEN_DICT[0]

    return com_result
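A minimal call sketch for the function above; the uids, texts and timestamps are hypothetical, and sta_time_list, sentiment_classify and SEN_DICT are assumed to be provided by the surrounding module.

# Hypothetical posts from two users inside a two-day window.
uid_weibo = [
    ['u001', u'今天心情特别好', 1397001600],
    ['u001', u'气死我了', 1397050000],
    ['u002', u'最近有点难过', 1397088000],
]
result = classify_without_sentiment(uid_weibo, ['u001', 'u002'],
                                    '2014-04-09', '2014-04-11')
# result maps each uid to a SEN_DICT label, e.g. {'u001': ..., 'u002': ...}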
Example #4
def diamond_classifier(item):
    # Default: the 'other' category
    sentiment = 0

    if '【' in item['text'].encode('utf-8') and '】' in item['text'].encode('utf-8'):
        # Simple rule: text wrapped in 【】 brackets is treated as news
        sentiment = 4
    else:
        # 3-class sentiment classifier: happy, angry, sad
        sentiment = triple_classifier(item)

    return sentiment
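The bracket rule above can be illustrated with two toy items; the texts are hypothetical and triple_classifier is assumed to come from the surrounding module.

news_item = {'text': u'【快讯】某地发布最新通告'}   # contains 【 and 】
plain_item = {'text': u'今天真开心'}                # no brackets

print diamond_classifier(news_item)    # 4: classified as news by the rule
print diamond_classifier(plain_item)   # whatever triple_classifier returns (0-3)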
def classify_without_sentiment(uid_weibo,uid_list,start_date,end_date):
    '''
    Main classification routine for posts without sentiment labels.
    Input:  list object  [[uid, text, time], [uid, text, time], ...]
    Output: dict object  {uid1: str1, uid2: str2, ...}
    '''

    uid_sentiment = dict()
    new_uid = []
    min_ts = int(time.mktime(time.strptime(start_date,'%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date,'%Y-%m-%d')))
    time_index,time_list = sta_time_list(min_ts,max_ts)
    for uid,text,ts in uid_weibo:
        if uid not in new_uid:
            new_uid.append(uid)
        sentiment = triple_classifier({'text':text})
        date_str = time.strftime('%Y-%m-%d',time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            index = time_index[date_str]
            if sentiment == 2 or sentiment == 5:
                item[index][0] = item[index][0] + 1
            if sentiment == 3 or sentiment == 4:
                item[index][1] = item[index][1] + 1
            item[index][2] = item[index][2] + 1
            uid_sentiment[uid] = item
        else:
            item = time_list
            index = time_index[date_str]
            if sentiment == 2 or sentiment == 5:
                item[index][0] = item[index][0] + 1
            if sentiment == 3 or sentiment == 4:
                item[index][1] = item[index][1] + 1
            item[index][2] = item[index][2] + 1
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment,min_ts,max_ts)

    com_result = dict()
    if len(uid_list):
        for uid in uid_list:
            if s_result.has_key(uid):
                com_result[uid] = SEN_DICT[s_result[uid]]
            else:
                com_result[uid] = SEN_DICT[0]
    else:
        for uid in new_uid:
            if s_result.has_key(uid):
                com_result[uid] = SEN_DICT[s_result[uid]]
            else:
                com_result[uid] = SEN_DICT[0]

    return com_result
Example #6
def _diamond_classifier(text):
    # Default: the 'other' category
    sentiment = 0

    text_utf8 = text.encode('utf-8')
    if '【' in text_utf8 and '】' in text_utf8:
        # Simple rule: text wrapped in 【】 brackets is treated as news
        sentiment = 4
    else:
        # 3-class sentiment classifier: happy, angry, sad
        sentiment = triple_classifier(text)

    return sentiment
def get_sentiment(uid_weibo,name):  # learn the sentiment-related parameters

    uid_sentiment = dict()
    uid_list = []
    min_ts = MIN_TS
    max_ts = MAX_TS
    for item in uid_weibo:
        uid = item[0]
        text = item[1]
        ts = item[2]
        if int(ts) <= min_ts:
            min_ts = int(ts)
        if int(ts) >= max_ts:
            max_ts = int(ts)
        if uid not in uid_list:
            uid_list.append(uid)
        sentiment = triple_classifier({'text':text})
        date_str = time.strftime('%Y-%m-%d',time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            if item.has_key(date_str):
                row = item[date_str]
                row.append(sentiment)
                item[date_str] = row
            else:
                row = []
                row.append(sentiment)
                item[date_str] = row
            uid_sentiment[uid] = item
        else:
            item = dict()
            row = []
            row.append(sentiment)
            item[date_str] = row
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment,min_ts,max_ts)

    write_e_result(s_result,name)    
Example #9
def classify_without_sentiment(uid_weibo,uid_list,start_date,end_date):
    '''
    Main classification routine for posts without sentiment labels.
    Input:  list object  [[uid, text, time], [uid, text, time], ...]
    Output: dict object  {uid1: str1, uid2: str2, ...}
    '''

    uid_sentiment = dict()
    min_ts = int(time.mktime(time.strptime(start_date,'%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date,'%Y-%m-%d')))
    for uid,text,s,ts in uid_weibo:
        sentiment = triple_classifier({'text':text})
        date_str = time.strftime('%Y-%m-%d',time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            if item.has_key(date_str):
                row = item[date_str]
                row.append(sentiment)
                item[date_str] = row
            else:
                row = []
                row.append(sentiment)
                item[date_str] = row
            uid_sentiment[uid] = item
        else:
            item = dict()
            row = []
            row.append(sentiment)
            item[date_str] = row
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment,min_ts,max_ts)

    com_result = dict()
    for uid in uid_list:
        com_result[uid] = SEN_DICT[s_result[uid]]

    return com_result
Example #10
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may return empty
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=facebook_flow_text_index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=facebook_flow_text_index_type,
                        timeout=600)
        except Exception, e:  # the ES index or document may not exist
            print e
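For reference, the bulk update calls above pair each action header with its partial document in the usual elasticsearch-py style; the ids and field values below are hypothetical, and es, index_name and index_type are the same objects used in the example.

bulk_action = []
bulk_action.extend([{'update': {'_id': 'post-1'}},
                    {'doc': {'sentiment': '1', 'hashtag': 'news'}}])
bulk_action.extend([{'update': {'_id': 'post-2'}},
                    {'doc': {'sentiment': '0', 'hashtag': ''}}])
es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=600)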
Example #11
def comments_sentiment_rubbish_calculation(comments, logger):
    """输入为一堆comments, 字段包括title、content168
       输出:
           item_infos:单条信息列表, 数据字段:sentiment、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, spam from the SVM
    # classifier, and news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'

    # list of single items; each one stores clusterid, weight and sentiment fields
    items_infos = []

    # Strip any stale sentiment / label / clusterid / ad_label / subob_label / rub_label fields
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

        inputs.append(r)
    comments = inputs

    # Field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except:
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']

        inputs.append(r)

    # Neutral classifier first, then the 3-class sentiment classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)

        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # Sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral

        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # News classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # subjective/objective text classification
            r['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # Spam removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # SVM spam filter
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'

        items_infos.append(item)

    # Deduplicate texts within each sentiment category
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
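A minimal call sketch for the function above, assuming two made-up raw comments (content168 holds the text) and that the helper classifiers used inside the function are importable; the logger argument is not used in the body shown, so None is passed here.

comments = [
    {'title': '', 'content168': u'这个决定太让人愤怒了'},
    {'title': '', 'content168': u'转发微博'},
]
result = comments_sentiment_rubbish_calculation(comments, None)
for item in result['item_infos']:
    print item['sentiment']   # an emotion code or a 'nonsense_*' marker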
Example #12
def one_topic_calculation_comments_v2(topicid):
    """对评论进行聚类
    """
    from comment_clustering_tfidf_v2 import kmeans, tfidf_v2, text_classify, cluster_evaluation, global_text_weight

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        tfidf_word = tfidf_v2(inputs)

        # Number of clusters = ceil(number of filtered texts / 5), clamped to the range [5, 10]
        kmeans_cluster_number = int(math.ceil(float(len(inputs)) / 5.0))
        if kmeans_cluster_number > 10:
            kmeans_cluster_number = 10
        if kmeans_cluster_number < 5:
            kmeans_cluster_number = 5

        # k-means clustering of comment words
        word_label = kmeans(tfidf_word, inputs, k=kmeans_cluster_number)

        # Compute the global text weight
        for r in inputs:
            gweight = global_text_weight(r['content'], tfidf_word)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_global_weight(gweight)

        # Assign comment texts to clusters
        results = text_classify(inputs, word_label, tfidf_word)

        # Cluster evaluation
        reserved_num = int(math.ceil(float(kmeans_cluster_number) / 2.0))
        LEAST_CLUSTER_SIZE = 3  # minimum cluster size
        TOPK_FREQ = 10
        TOPK_WEIGHT = 5
        LEAST_FREQ = 0
        final_cluster_results = cluster_evaluation(results, top_num=reserved_num, topk_freq=TOPK_FREQ, \
                least_freq=LEAST_FREQ, least_size=LEAST_CLUSTER_SIZE, topk_weight=TOPK_WEIGHT)
        for label, items in final_cluster_results.iteritems():
            if label == 'other':
                label = news.otherClusterId

            if len(items):
                eventcomment.save_cluster(label, news_id, int(time.time()))

            if label != news.otherClusterId:
                fwords = word_label[label]
                eventcomment.update_feature_words(label, fwords)

            for item in items:
                comment = Comment(item['_id'], topicid)
                comment.update_comment_label(label)
                comment.update_comment_weight(item['weight'])
Example #13
def one_topic_calculation_comments_v7(topicid):
    """对评论进行聚类
    """
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
            cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Remove spam and news texts
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            if not subob_rub_neu_label in [1, 0]:
                # 1 = spam text, 0 = news text
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10
        # TF-IDF terms, automatic choice of cluster count, and the VSM features should also be configurable parameters
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # Cluster the comment texts
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k,v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId

                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))

                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)

                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])

        # Sentiment calculation
        for r in inputs:
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0  # 0 = neutral
            elif r['subob_rub_neu_label'] == -1:
                sentiment = triple_classifier(r)  # 1 happy, 2 angry, 3 sad, 0 no sentiment
                if sentiment == 0:
                    sentiment = mid_sentiment_classify(r['text'])

                if sentiment == -1:
                    sentiment = 0  # neutral

            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
Example #14
def classify_without_sentiment(uid_weibo,uid_list,start_date,end_date):
    '''
    Main classification routine for posts without sentiment labels.
    Input:  list object  [[uid, text, time], [uid, text, time], ...]
    Output: dict object  {uid1: str1, uid2: str2, ...}
    '''

    uid_sentiment = dict()
    new_uid = []
    min_ts = int(time.mktime(time.strptime(start_date,'%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date,'%Y-%m-%d')))
    time_index,time_list = sta_time_list(min_ts,max_ts)
    n = len(time_list)
    for uid,text,ts in uid_weibo:
        if uid not in new_uid:
            new_uid.append(uid)
        if isinstance(text, unicode):  # check whether the text is already unicode
            sentiment = triple_classifier({'text':text})
        else:
            sentiment = triple_classifier({'text':text.decode('utf-8')})
        date_str = time.strftime('%Y-%m-%d',time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            index = time_index[date_str]
            if sentiment == 0:  # neutral
                item[index][0] = item[index][0] + 1
            elif sentiment == 2 or sentiment == 5:  # impulsive
                item[index][1] = item[index][1] + 1
            elif sentiment == 3 or sentiment == 4:  # depressed
                item[index][2] = item[index][2] + 1
            elif sentiment == 1:  # positive
                item[index][3] = item[index][3] + 1
            else:
                item[index][4] = item[index][4] + 1
            uid_sentiment[uid] = item
        else:
            item = list(np.zeros((n, 5)))
            index = time_index[date_str]
            if sentiment == 0:  # neutral
                item[index][0] = item[index][0] + 1
            elif sentiment == 2 or sentiment == 5:  # impulsive
                item[index][1] = item[index][1] + 1
            elif sentiment == 3 or sentiment == 4:  # depressed
                item[index][2] = item[index][2] + 1
            elif sentiment == 1:  # positive
                item[index][3] = item[index][3] + 1
            else:
                item[index][4] = item[index][4] + 1
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment,min_ts,max_ts)

    com_result = dict()
    if len(uid_list):
        for uid in uid_list:
            if s_result.has_key(uid):
                com_result[uid] = s_result[uid]
            else:
                com_result[uid] = {'impulse':0,'depressed':0}
    else:
        for uid in new_uid:
            if s_result.has_key(uid):
                com_result[uid] = s_result[uid]
            else:
                com_result[uid] = {'impulse':0,'depressed':0}

    return com_result
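The per-user structure built above is a small counting matrix; the sketch below, with a hypothetical three-day window, shows the column layout implied by the comments (neutral, impulsive, depressed, positive, other).

import numpy as np

n = 3                          # hypothetical number of days in the window
item = list(np.zeros((n, 5)))  # one row per day, five emotion counters
item[0][1] += 1                # an impulsive post (sentiment 2 or 5) on day 0
item[2][3] += 1                # a positive post (sentiment 1) on day 2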
Example #15
def one_topic_calculation_comments_v4(topicid):
    """对评论进行聚类
    """
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
            cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # for k, v in results.iteritems():
            #     print k, len(v)

            # Cluster the comment texts
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k,v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId

                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))

                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)

                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])
    action = []
    xdata = []
    class_ts = time.time()
    while 1:

        item = receiver.recv_json()
        if not item:
            continue 

        if int(item['sp_type']) == 1:
            read_count += 1
            text = item['text']
            uid = item['uid']

            #add sentiment field to weibo
            sentiment, keywords_list  = triple_classifier(item)
            item['sentiment'] = str(sentiment)
            #add key words to weibo
            keywords_dict, keywords_string = get_weibo_keywords(keywords_list)
            item['keywords_dict'] = json.dumps(keywords_dict) # use to compute
            item['keywords_string'] = keywords_string         # use to search

            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
            if sensitive_words_dict:
                item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            timestamp = item['timestamp']
Example #17
def comments_sentiment_rubbish_calculation(comments, logger):
    """输入为一堆comments, 字段包括title、content168
       输出:
           item_infos:单条信息列表, 数据字段:sentiment、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, spam from the SVM
    # classifier, and news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'

    # list of single items; each one stores clusterid, weight and sentiment fields
    items_infos = []

    # Strip any stale sentiment / label / clusterid / ad_label / subob_label / rub_label fields
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

        inputs.append(r)
    comments = inputs

    # Field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']

        inputs.append(r)

    # Neutral classifier first, then the 3-class sentiment classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)

        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # Sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral

        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # News classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # subjective/objective text classification
            r['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # Spam removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # SVM spam filter
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'

        items_infos.append(item)

    # Deduplicate texts within each sentiment category
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
Example #22
                item = scan_data['_source']

                text = item['text']
                uid = item['uid']

                try:
                    text_ch = trans([text])

                    if text_ch:
                        text = text_ch[0]
                        item['text'] = text_ch[0]
                except:
                    pass

                #add sentiment field to weibo
                sentiment, keywords_list = triple_classifier(item)
                item['sentiment'] = str(sentiment)
                #add key words to weibo
                keywords_dict, keywords_string = get_weibo_keywords(
                    keywords_list)
                item['keywords_dict'] = json.dumps(
                    keywords_dict)  # use to compute
                item['keywords_string'] = keywords_string  # use to search

                sensitive_words_dict = searchWord(
                    text.encode('utf-8', 'ignore'), DFA)
                if sensitive_words_dict:
                    item['sensitive_words_string'] = "&".join(
                        sensitive_words_dict.keys())
                    item['sensitive_words_dict'] = json.dumps(
                        sensitive_words_dict)
Example #23
def fill_sentiment(item):
    sentiment = triple_classifier(item)
    item[XAPIAN_EXTRA_FIELD] = sentiment
    return item
Example #25
def sentimentCronTopic(topic, weibos_list, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT,
                       k_limit=TOP_KEYWORDS_LIMIT):
    import sys

    sys.path.append('../triple_classifier/')
    from triple_sentiment_classifier import triple_classifier

    start_ts = int(start_ts)
    over_ts = int(over_ts)

    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    logFile.write('start_ts: ' + str(start_ts) + '\r\n')
    logFile.write('over_ts: ' + str(over_ts) + '\r\n')
    logFile.write('during: ' + str(during) + '\r\n')
    logFile.write('interval: ' + str(interval) + '\r\n')

    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during

        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        emotions_rcount = {}
        weiboIDs = {}

        for k, v in emotions_kv.iteritems():
            zero = 0
            emotions_count[v] = [end_ts, 0]
            emotions_kcount[v] = [end_ts, '']
            emotions_weibo[v] = [end_ts, []]
            weiboIDs[v] = [end_ts, []]
            # print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
        slide = get_weibos_slide(weibos_list, begin_ts, end_ts)
        string = ['', '', '', '']

        emo0 = 0
        emo1 = 1

        for weibo in slide:
            sentiment, emo = triple_classifier(weibo)

            if sentiment != 0 and emo == 0:
                emo0 += 1
            elif sentiment != 0 and emo == 1:
                emo1 += 1
            # words = jieba.cut(weibo['text'], cut_all=False)
            weibo['sentiment'] = sentiment
            string[sentiment] = string[sentiment] + weibo['text']

            if sentiment != 0:
                emotions_count[sentiment][1] += 1
                #                kcount = emotions_kcount[sentiment][1]
                emotions_weibo[sentiment][1].append(weibo)
            else:
                zero += 1

        for k, v in emotions_kv.iteritems():
            #            sorted_kcount = sorted(emotions_kcount[v][1].iteritems(), key=lambda(k, v):v, reverse=False)
            #            sorted_kcount = { k: v for k, v in sorted_kcount[len(sorted_kcount)-k_limit:]}
            #            emotions_kcount[v][1] = sorted_kcount
            sorted_weibos = sorted(emotions_weibo[v][1], key=lambda i: i[sort_field], reverse=False)
            emotions_weibo[v][1] = sorted_weibos[len(sorted_weibos) - w_limit:]

            for item in emotions_weibo[v][1]:
                weiboIDs[v][1].append(item['key'])

            wordd = {}

            if string[v] != '':
                words = GetKeyWords(string[v].encode('utf-8'), 5, True)

            word_list = words.split('#')
            for word in word_list:
                token = word.split(r'/')
                if (len(token) == 3 and not (token[0] in STOPWORDS)):
                    #                    wordd.append({token[0]:token[2]})
                    wordd[token[0]] = token[2]
            emotions_kcount[v][1] = wordd

        print emo0, emo1
        print zero, emotions_count[1][1], emotions_count[2][1], emotions_count[3][1]
        save_rt_results('count', topic, emotions_count, during)
        save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
        save_rt_results('weibos', topic, weiboIDs, during, k_limit, w_limit)

        j = interval - i
        logFile.write('finish ' + str(j) + ' slide' + '\r\n')