def summary_main(weibo_data):  # Main function for automatic summary generation
    '''
    Input:
        weibo_data: list of weibo texts, [weibo1, weibo2, ...]
    Output:
        result: list of the top-ranked summary sentences
    '''
    word_result, word_weight = word_net(weibo_data, 5)          # extract keyword pairs
    text_list = text_net(word_result, word_weight, weibo_data)  # extract representative texts
    # Join the representative texts into one string, making sure each ends with '。'
    text_str = ''
    for text in text_list:
        re_t = re_cut(text)
        if not len(re_t):
            continue
        if re_t[-1] != '。':
            text_str = text_str + re_t + '。'
        else:
            text_str = text_str + re_t
    # Rank sentences with TextRank and keep the top 10
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text_str, lower=True, source='all_filters')
    result = []
    for item in tr4s.get_key_sentences(num=10):
        result.append(item.sentence)
    return result
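# Hedged usage sketch (not from the repo): assumes word_net/text_net/re_cut are the
# project helpers imported alongside this function and that TextRank4Sentence comes
# from the textrank4zh package; the sample posts are made up.
def _demo_summary_main():
    weibo_data = [
        u'今天北京暴雨,多条地铁线路停运。',
        u'暴雨导致部分航班延误,请注意出行安排。',
    ]
    for sentence in summary_main(weibo_data):  # up to 10 key sentences
        print(sentence)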
def opinion_main(weibo_data, k_cluster):
    '''
    Main function for opinion mining.
    Input:
        weibo_data: list of weibo texts, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics
    Output:
        opinion_name: dict of sub-topic names, {topic1: name1, topic2: name2, ...}
        word_result: keyword pairs per sub-topic, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    # Filter out spam-like posts (too many @mentions) and bare reposts
    # ('转发微博' is the default repost text)
    weibo_new = []
    for text in weibo_data:
        if str(text).count('@') >= 5:
            continue
        value = cut_filter(text)
        if len(value) > 0 and text != '转发微博':
            weibo_new.append(value)
    word_result, word_weight = word_net(weibo_new, k_cluster)                # extract keyword pairs
    text_list, opinion_name = text_net(word_result, word_weight, weibo_new)  # extract representative texts
    return opinion_name, word_result, text_list
def opinion_main(weibo_data, k_cluster):
    '''
    Main function for opinion mining.
    Input:
        weibo_data: list of weibo texts, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics
    Output:
        opinion_name: dict of sub-topic names, {topic1: name1, topic2: name2, ...}
        word_result: keyword pairs per sub-topic, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
        word_main: main keywords per sub-topic
    '''
    print('\t\tGetting keywords...')
    limit_num = 30000
    weibo_data = weibo_num_limit(weibo_data, limit_num)  # cap the number of weibos to process
    # Retry loop kept for reference: the Cluto clustering step occasionally fails,
    # and word_net used to be retried here.
    # while True:
    word_result, word_weight, word_main = word_net(weibo_data, k_cluster)  # extract keyword pairs
    #     if len(word_result):
    #         break
    #     else:
    #         print('Cluto wrong!!! Trying again... If you want to stop it, just kill it...')
    print('\t\tGetting present text...')
    # Extract representative texts; guarantees each cluster holds an equal number of weibos
    text_list, opinion_name = text_net(word_result, word_weight, weibo_data)
    return opinion_name, word_result, text_list, word_main
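# Hedged usage sketch for this variant (not from the repo): shows how its four return
# values might be consumed; the cluster count and the weibo_data argument are hypothetical.
def _demo_opinion_main(weibo_data):
    opinion_name, word_result, text_list, word_main = opinion_main(weibo_data, k_cluster=5)
    for topic, name in opinion_name.items():
        print('sub-topic %s: %s' % (topic, name))
        print('  keywords:', word_result[topic])
        print('  texts:', len(text_list[topic]))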
def search_weibo_from_word(uidlist, keywords):
    # Strategy 4: first retrieve texts with BM25, then filter by keyword-set overlap
    '''
    Input:
        uidlist: list of uids
        keywords: list of keywords, the segmentation result of a trending news item
    Output:
        result: summaries of the filtered weibo texts
    '''
    # Fetch the texts and their segmented words for the given uids
    text_list, word_set = get_text_word_by_id(uidlist)
    text_set, word_dict = get_text_by_BM(text_list, word_set, keywords)
    # Keep the top half of the retrieved texts (at least one)
    n = max(int(0.5 * len(text_set)), 1)
    result_list = TopkHeap(n)
    # A text qualifies if it shares at least half of the keywords
    w_n = max(int(0.5 * len(keywords)), 1)
    for i in range(len(word_dict)):
        words = word_dict[i]
        len_n = len(set(words) & set(keywords))
        if len_n >= w_n:
            result_list.Push((len_n, text_set[i]))
    result = result_list.TopK()
    # Deduplicate while preserving ranking order
    text_list = []
    for item in result:
        if item[1] not in text_list:
            text_list.append(item[1])
    if len(text_list) >= 10:
        # Enough texts: cluster into sub-topics, summarize each representative text,
        # and skip summaries too similar (similarity >= 0.5) to ones already kept
        word_result, word_weight = word_net(text_list, OPINION_CLUSTER)
        text_list = text_net(word_result, word_weight, text_list)
        result = []
        for text in text_list:
            s = summary_text(text)
            max_r, _ = get_s(result, s)
            if max_r >= 0.5:
                continue
            result.append(s)
    else:
        result = [summary_text(text_list)]
    return result
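# TopkHeap is a project helper not shown in this section. The sketch below matches the
# interface used by search_weibo_from_word (constructor arg k, Push, TopK); that it keeps
# the k largest items and returns them largest-first is an assumption inferred from the
# call sites, not confirmed by the repo.
import heapq

class TopkHeap(object):
    def __init__(self, k):
        self.k = k
        self.data = []                 # min-heap holding at most k items

    def Push(self, elem):
        if len(self.data) < self.k:
            heapq.heappush(self.data, elem)
        elif elem > self.data[0]:      # bigger than the smallest kept item: replace it
            heapq.heapreplace(self.data, elem)

    def TopK(self):
        return sorted(self.data, reverse=True)  # largest first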
def opinion_main(weibo_data, k_cluster):
    '''
    Main function for opinion mining.
    Input:
        weibo_data: list of weibos,
            [[mid, text, uid, timestamp, uname, forwarding_count, comment_count], ...]
        k_cluster: number of sub-topics
    Output:
        opinion_name: dict of sub-topic names, {topic1: name1, topic2: name2, ...}
        word_result: keyword pairs per sub-topic, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    # Filter out spam-like posts (too many @mentions) and bare reposts,
    # carrying the weibo metadata through with each kept text
    weibo_new = []
    for mid, text, uid, timestamp, uname, forwarding_count, comment_count in weibo_data:
        text = text.encode('utf-8')  # normalize to a utf-8 byte string (Python 2 style input)
        if text.count('@') >= 5:
            continue
        value = cut_filter(text)
        if len(value) > 0 and text != '转发微博':
            weibo_new.append((value, mid, uid, timestamp, uname, forwarding_count, comment_count))
    word_result, word_weight = word_net(weibo_new, k_cluster)                # extract keyword pairs
    text_list, opinion_name = text_net(word_result, word_weight, weibo_new)  # extract representative texts
    return opinion_name, word_result, text_list
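# Hedged sketch of the row layout this variant expects (all values entirely made up):
# each row is [mid, text, uid, timestamp, uname, forwarding_count, comment_count].
def _demo_opinion_main_rows():
    weibo_data = [
        ('3924572903216108', u'今天北京暴雨,注意出行安全。', '1234567890',
         1468827610, u'示例用户', 12, 3),
    ]
    opinion_name, word_result, text_list = opinion_main(weibo_data, k_cluster=2)
    print(opinion_name)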
def main(flag, k_cluster):
    weibo = []
    weibo_dict = dict()
    # Read (mid, text) pairs from the test CSV for this topic
    reader = csv.reader(open('./test/weibo%s.csv' % flag, 'rb'))
    for mid, text in reader:
        if str(text).count('@') >= 5:  # skip spam-like posts with too many @mentions
            continue
        value = cut_filter(text)
        if len(value) > 0 and text != '转发微博':
            weibo.append(str(mid))
            weibo_dict[str(mid)] = str(text)
    test(weibo, weibo_dict, flag)                                    # generate test data
    label = choose_ad(flag)                                          # advertisement filtering
    ind, word = word_net(weibo, weibo_dict, label, flag, k_cluster)  # extract keyword pairs
    write(ind, word, flag)                                           # write keyword pairs
    text_net(weibo, weibo_dict, label, ind, word, flag)              # extract representative texts