def step1_cal():
    """Step 1: fetch each sub-event's feature words and classify new
    texts by matching them against those feature words.
    """
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
        ' %s start step1' % ts2datetime(timestamp)

    if initializing:
        # The topic still needs its initial clustering: take the texts
        # published before the topic's start time.
        results = event.getInitialInfos()
    else:
        # Initial clustering is done: take the topic's latest hour of texts.
        results = event.getInfos(timestamp - 3600, timestamp)
        if now_hour == 0:
            # When not initializing, at 24:00 always re-match the texts of
            # every cluster created today (00:00 <= t < 24:00, excluding the
            # "other" cluster) and remove those clusters.
            temp_subeventids = event.getTodayCreatSubeventIds()
            temp_infos = event.getTodayCreatSubeventInfos()
            event.remove_subevents(temp_subeventids)
            results.extend(temp_infos)
    print eventid, ' before classify: ', len(results)

    # Collect the sub-events and each one's newest feature words.
    subevents = event.getSubEvents()
    labels_list = []
    feature_words_list = []
    for subevent in subevents:
        subeventid = subevent["_id"]
        feature = Feature(subeventid)
        fwords = feature.get_newest()
        feature_words_list.append(fwords)
        labels_list.append(subeventid)

    for r in results:
        text = (r['title'] + r['content168']).encode('utf-8')
        feature_words_inputs = []
        for fwords in feature_words_list:
            wcdict = dict()
            for w, c in fwords.iteritems():
                if isinstance(w, unicode):
                    w = w.encode('utf-8')
                wcdict[w] = c
            feature_words_inputs.append(wcdict)
        # Match the single text against every sub-event's feature words
        # to obtain its cluster label.
        label = subevent_classifier(text, labels_list, feature_words_inputs)
        if label == "other":
            label = event.getOtherSubEventID()
        news = News(r["_id"], event.id)
        news.update_news_subeventid(label)

    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
        ' %s end step1' % ts2datetime(timestamp)
def step2_cal():
    """Step 2: decide whether the "other" cluster should split; if so,
    run k-means on its texts and evaluate the resulting clusters.
    """
    # Number of top-frequency words used in cluster evaluation.
    TOPK_FREQ_WORD = 50
    # Minimum cluster size kept by cluster evaluation.
    LEAST_SIZE = 8

    # Should the "other" cluster split?
    ifsplit = event.check_ifsplit(initializing)
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
        ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp)

    if ifsplit:
        inputs, kmeans_cluster_num, reserve_num = \
            event.getOtherSubEventInfos(initializing)
        print eventid, ' after classify before split: ', \
            len(inputs), kmeans_cluster_num, reserve_num
        if len(inputs) > 2:
            items = []
            for r in inputs:
                r["title"] = r["title"].encode("utf-8")
                r["content"] = r["content168"].encode("utf-8")
                items.append(r)

            # k-means clustering.
            kmeans_results = kmeans(items, k=kmeans_cluster_num)

            # Cluster evaluation.
            if initializing or now_hour == 0:
                min_tfidf = event.get_min_tfidf()
                final_cluster_results, tfidf_dict = cluster_evaluation(
                    kmeans_results, top_num=reserve_num,
                    topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE,
                    min_tfidf=min_tfidf)
            else:
                # Hourly clustering does not compare against the minimum
                # tf-idf of the existing clusters.
                final_cluster_results, tfidf_dict = cluster_evaluation(
                    kmeans_results, top_num=reserve_num,
                    topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE)

            # Update the news cluster labels and the sub-event table.
            for label, items in final_cluster_results.iteritems():
                if label == "other":
                    label = event.getOtherSubEventID()
                event.save_subevent(label, timestamp)
                if label != event.getOtherSubEventID():
                    # Update this cluster's tf-idf.
                    event.update_subevent_tfidf(label, tfidf_dict[label])
                for r in items:
                    news = News(r["_id"], event.id)
                    news.update_news_subeventid(label)
        else:
            print 'inputs less than 2, kmeans aborted'

    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
        ' %s end step2' % ts2datetime(timestamp)
def text_kmeans_clustering():
    # Number of top-frequency words used in cluster evaluation.
    TOPK_FREQ_WORD = 50
    # Minimum cluster size kept by cluster evaluation.
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        # Filter advertisements.
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment computation.
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # k-means clustering and cluster evaluation.
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(
            kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD,
            least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)
            for item in items:
                news = News(item['news_id'])
                # Map the generic "other" label to this news item's own
                # "other" cluster id without clobbering the loop variable.
                item_label = news.otherClusterId if label == 'other' else label
                comment = Comment(item['_id'])
                comment.update_comment_label(item_label)
                eventcomment.save_cluster(item_label, news_id, int(time.time()))

        # Feature words of each cluster.
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # Text weights.
        for item in inputs:
            weight = text_weight_cal(item, cluster_feature[item['label']])
            comment = Comment(item['_id'])
            comment.update_comment_weight(weight)
def step3_cal():
    """Step 3: compute each cluster's feature words and representative
    texts, deduplicate, and update cluster size and growth.
    """
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
        ' %s start step3' % ts2datetime(timestamp)

    inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        inputs.extend(event.getSubeventInfos(subeventid))

    for r in inputs:
        r["title"] = r["title"].encode("utf-8")
        r["content"] = r["content168"].encode("utf-8")
        r["label"] = r["subeventid"]

    # Cumulative feature words of each cluster.
    cluster_feature = extract_feature(inputs)
    for label, fwords in cluster_feature.iteritems():
        feature = Feature(label)
        feature.upsert_newest(fwords)

    # Text weights.
    for r in inputs:
        weight = text_weight_cal(r, cluster_feature[r['label']])
        news = News(r["_id"], event.id)
        news.update_news_weight(weight)

    # Deduplicate texts per cluster.
    items_dict = {}
    for r in inputs:
        try:
            items_dict[r["label"]].append(r)
        except KeyError:
            items_dict[r["label"]] = [r]

    for label, items in items_dict.iteritems():
        results = duplicate(items)
        for r in results:
            news = News(r["_id"], event.id)
            news.update_news_duplicate(r["duplicate"], r["same_from"])

        # Update this cluster's size and growth.
        before_size = event.get_subevent_size(label)
        event.update_subevent_size(label, len(items))
        event.update_subevent_addsize(label, len(items) - before_size)

    if initializing:
        # Flip the event state from initializing to active.
        event.activate()

    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
        ' %s end step3' % ts2datetime(timestamp)
def step4_cal():
    """Step 4: at 24:00, merge similar existing clusters."""
    if not initializing and now_hour == 0:
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
            ' %s start step4' % ts2datetime(timestamp)

        # Newest feature words of each sub-event.
        subevents = event.getSubEvents()
        subevent_fwords = dict()
        for subevent in subevents:
            subeventid = subevent["_id"]
            feature = Feature(subeventid)
            fwords = feature.get_newest()
            subevent_fwords[subeventid] = fwords

        subeventids_sort_timestamp = event.get_sorted_subeventids()
        cids, mids = merge_subevents(subevent_fwords,
                                     subeventids_sort_timestamp,
                                     top_tfidf_para=10, top_percent=0.3)

        for res_id, mer_id in mids:
            # Move the texts under mer_id into the res_id cluster, then
            # remove the mer_id cluster.
            temp_infos = event.get_subevent_infos(mer_id)
            for r in temp_infos:
                news = News(r["_id"], event.id)
                news.update_news_subeventid(res_id)
            event.remove_subevents([mer_id])

        # Recompute each cluster's feature words, text weights, and
        # duplicates after merging.
        if len(mids):
            step3_cal()

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, \
            ' %s end step4' % ts2datetime(timestamp)
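# Hedged usage sketch (not part of the original module): the four step
# functions above read module-level state -- event, eventid, timestamp,
# now_hour, initializing -- so a driver would bind those globals and run
# the steps in order, e.g. once per hour. run_pipeline_once is an assumed
# name introduced here for illustration only.
def run_pipeline_once(run_timestamp):
    global timestamp, now_hour
    timestamp = run_timestamp
    now_hour = int(time.strftime('%H', time.localtime(run_timestamp)))
    step1_cal()  # match new texts against existing sub-events
    step2_cal()  # split the "other" cluster if warranted
    step3_cal()  # refresh feature words, weights, duplicates, sizes
    step4_cal()  # at 24:00, merge similar clusters (no-op otherwise)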
def test_subevent_classifier():
    labels_list = []
    feature_words_inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        if subeventid != "575612b6-a26f-4df9-a2de-01c85cae56a2":
            labels_list.append(subeventid)
            feature = Feature(subeventid)
            feature_words = feature.get_newest()
            new_feature_words = dict()
            for k, v in feature_words.iteritems():
                new_feature_words[k.encode('utf-8')] = v
            feature_words_inputs.append(new_feature_words)

    news_id = "http://news.xinhuanet.com/comments/2014-11/03/c_1113084515.htm"
    news = News(news_id, event.id)
    ns = news.get_news_info()
    text = ns['title'].encode('utf-8') + ns['content168'].encode('utf-8')
    label = subevent_classifier(text, labels_list, feature_words_inputs)
    print label
def add(self, id):
    parser = NewsParser(id)
    if not parser.success:
        return False
    try:
        date = parser.get_date()
        header = parser.get_header()
        text = parser.get_text()
    except Exception:
        # Any extraction failure means the article cannot be stored.
        return False
    news_entry = News(id=id, date=date, header=header, text=text)
    self.session.add(news_entry)
    self.session.commit()
    return True
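# Hedged sketch (an assumption, not original code): add() above reads
# self.session, so it presumably lives on a collector class that owns a
# SQLAlchemy-style session. NewsStore is a hypothetical name introduced
# here only to show how the method would be bound and called.
class NewsStore(object):
    def __init__(self, session):
        self.session = session  # assumed SQLAlchemy session

    add = add  # bind the function defined above as a method

# store = NewsStore(session)
# store.add("http://example.com/some-article.html")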
def one_topic_calculation_comments_v2(topicid):
    """Cluster the comments of one topic (v2)."""
    from comment_clustering_tfidf_v2 import kmeans, tfidf_v2, text_classify, \
        cluster_evaluation, global_text_weight

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Filter advertisements.
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment computation.
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        tfidf_word = tfidf_v2(inputs)

        # Cluster count = ceil(len(inputs) / 5), clamped into [5, 10].
        kmeans_cluster_number = int(math.ceil(float(len(inputs)) / 5.0))
        if kmeans_cluster_number > 10:
            kmeans_cluster_number = 10
        if kmeans_cluster_number < 5:
            kmeans_cluster_number = 5

        # Cluster the comment words.
        word_label = kmeans(tfidf_word, inputs, k=kmeans_cluster_number)

        # Global text weights.
        for r in inputs:
            gweight = global_text_weight(r['content'], tfidf_word)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_global_weight(gweight)

        # Classify the comment texts against the word clusters.
        results = text_classify(inputs, word_label, tfidf_word)

        # Cluster evaluation.
        reserved_num = int(math.ceil(float(kmeans_cluster_number) / 2.0))
        LEAST_CLUSTER_SIZE = 3  # minimum cluster size
        TOPK_FREQ = 10
        TOPK_WEIGHT = 5
        LEAST_FREQ = 0
        final_cluster_results = cluster_evaluation(
            results, top_num=reserved_num, topk_freq=TOPK_FREQ,
            least_freq=LEAST_FREQ, least_size=LEAST_CLUSTER_SIZE,
            topk_weight=TOPK_WEIGHT)

        for label, items in final_cluster_results.iteritems():
            if label == 'other':
                label = news.otherClusterId
            if len(items):
                eventcomment.save_cluster(label, news_id, int(time.time()))
            if label != news.otherClusterId:
                fwords = word_label[label]
                eventcomment.update_feature_words(label, fwords)
            for item in items:
                comment = Comment(item['_id'], topicid)
                comment.update_comment_label(label)
                comment.update_comment_weight(item['weight'])
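# The cluster-count heuristic used above, pulled out as a small pure
# function for clarity. This is a sketch: choose_k is not a name from the
# original code, only a restatement of the inline arithmetic.
import math

def choose_k(n_docs, lo=5, hi=10, docs_per_cluster=5):
    """k = ceil(n_docs / docs_per_cluster), clamped into [lo, hi]."""
    k = int(math.ceil(float(n_docs) / docs_per_cluster))
    return max(lo, min(hi, k))

# choose_k(7) == 5, choose_k(33) == 7, choose_k(80) == 10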
def one_topic_calculation_comments_v7(topicid):
    """Cluster the comments of one topic (v7)."""
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
        cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing and advertisement filtering.
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Drop rubbish and news-like texts.
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            # 1 marks rubbish text, 0 marks news text.
            if subob_rub_neu_label not in [1, 0]:
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10

        # TF-IDF words and automatic cluster-count selection; the VSM
        # attributes should also be made configurable.
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs,
                                     MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # Classify the comment texts.
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation.
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)
                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])

        # Sentiment computation.
        for r in inputs:
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0  # 0 neutral
            elif r['subob_rub_neu_label'] == -1:
                # 1 happy, 2 angry, 3 sad, 0 no sentiment
                sentiment = triple_classifier(r)
                if sentiment == 0:
                    sentiment = mid_sentiment_classify(r['text'])
                if sentiment == -1:
                    sentiment = 0  # neutral
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
def one_topic_calculation_comments_v4(topicid):
    """Cluster the comments of one topic (v4)."""
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
        cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Filter advertisements.
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment computation.
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15

        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs,
                                     MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # Classify the comment texts.
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation.
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)
                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])