def triple_classifier(tweet):
    '''Output labels:
       0 neutral
       1 positive
       2 angry
       3 anxious
       4 sad
       5 disgusted
       6 other negative
    '''
    sentiment = MIDDLE
    text = tweet['text']
    keywords_list = []

    # Stage 1: emoticon-based polarity.
    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != MIDDLE:
        entries = cut(fc, text)
        entry = [e for e in entries]
        keywords_list = entries
        if emoticon_sentiment == POSITIVE:
            sentiment = emoticon_sentiment
            text = u''
        else:
            # Negative emoticon: refine into a fine-grained negative label.
            sentiment = flow_psychology_classfiy(text)
            if sentiment == 0:
                sentiment = 6
            text = u''

    # Stage 2: bag-of-words scoring when emoticons were inconclusive.
    if text != u'':
        entries = fc.get_text_fc(text)
        entry = [e for e in entries]
        keywords_list = entry

        # Step 2a: if class 0 wins, the tweet is labelled MIDDLE.
        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] < s[1]:
            # Step 2b: class 0 -> POSITIVE, class 1 -> negative (refined below).
            bow = dictionary_2.doc2bow(entry)
            s2 = [1, 1]
            for pair in bow:
                s2[0] = s2[0] * (step2_score[pair[0]][0] ** pair[1])
                s2[1] = s2[1] * (step2_score[pair[0]][1] ** pair[1])
            if s2[0] > s2[1]:
                sentiment = POSITIVE
            elif s2[0] == s2[1]:
                sentiment = MIDDLE
            else:
                sentiment = flow_psychology_classfiy(text)
                if sentiment == 0:
                    sentiment = 6
        else:
            sentiment = MIDDLE

    return sentiment
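# Hedged illustration (not part of the original pipeline): triple_classifier relies on
# module-level globals defined elsewhere in this repo -- fc, emoticon,
# flow_psychology_classfiy, MIDDLE, POSITIVE, dictionary_1/dictionary_2 and
# step1_score/step2_score. Assuming dictionary_1 is a gensim Dictionary and
# step1_score maps each word id to a pair of per-class word scores, the
# multiplicative bag-of-words comparison above can be reproduced standalone as in
# this sketch (two_class_scores and the toy data are illustrative names only):
def two_class_scores(entry, dictionary, word_scores):
    '''entry: token list; dictionary: gensim Dictionary; word_scores: {word_id: (score_0, score_1)}.'''
    s = [1.0, 1.0]
    for word_id, count in dictionary.doc2bow(entry):
        s[0] *= word_scores[word_id][0] ** count
        s[1] *= word_scores[word_id][1] ** count
    return s

# Example with toy data:
#   from gensim import corpora
#   toy_dictionary = corpora.Dictionary([[u'开心', u'今天'], [u'难过']])
#   toy_scores = dict((wid, (0.6, 0.4)) for wid in toy_dictionary.token2id.values())
#   two_class_scores([u'开心', u'开心'], toy_dictionary, toy_scores)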
def cut_words_noun(text):
    '''Word segmentation: keep nouns only and filter out blacklisted words
    (the blacklist is also used to drop unwanted single-character words).

    input:
        text: utf-8 string
    output:
        terms: list of keyword terms
    '''
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")
    cx_terms = fc.get_text_fc(text, cx=True)

    return [term for term, cx in cx_terms if cx in cx_dict_noun_utils and term not in black_words]
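# Hedged illustration (not part of the original module): cut_words_noun depends on the
# project segmenter `fc`, the POS whitelist `cx_dict_noun_utils` and the blacklist
# `black_words`, all defined elsewhere. A self-contained sketch of the same
# noun-filtering idea, using jieba.posseg in place of fc (an assumption, not the
# repo's segmenter), would look like this:
def cut_words_noun_sketch(text, noun_tags=('n', 'nr', 'ns', 'nt', 'nz'), black_words=()):
    import jieba.posseg as pseg

    # pseg.cut yields objects with .word and .flag (the POS tag);
    # keep noun-tagged words that are not blacklisted.
    return [p.word for p in pseg.cut(text)
            if p.flag in noun_tags and p.word not in black_words]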
def test_data(weibo, flag):
    # Load the feature vocabulary: word -> feature index.
    word_dict = dict()
    with open(ABS_PATH + '/svm/new_feature.csv', 'r') as f:
        reader = csv.reader(f)
        for w, c in reader:
            word_dict[str(w)] = c

    # Count word occurrences per weibo.
    items = []
    for i in range(0, len(weibo)):
        words = fc.get_text_fc(weibo[i]['content168'])
        row = dict()
        for word in words:
            if str(word[0]) in row:
                row[str(word[0])] = row[str(word[0])] + 1
            else:
                row[str(word[0])] = 1
        items.append(row)

    # Build sparse "label index:count" lines (the label is fixed to 1 for test data).
    f_items = []
    for i in range(0, len(items)):
        row = items[i]
        f_row = str(1)
        for k, v in word_dict.items():
            if k in row:
                item = str(word_dict[k]) + ':' + str(row[k])
                f_row = f_row + ' ' + str(item)
        f_items.append(f_row)

    # Write one line per weibo to the test file.
    with open(ABS_PATH + '/svm_test/test%s.txt' % flag, 'w') as f:
        writer = csv.writer(f)
        for i in range(0, len(f_items)):
            writer.writerow([f_items[i]])
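# Hedged note (illustrative): test_data emits one line per weibo in a sparse
# "label index:count" layout (the label is fixed to 1, and new_feature.csv is assumed
# to map each feature word to its feature index), i.e. the input format typically fed
# to libsvm-style tools. A minimal standalone sketch of that formatting step, with
# illustrative names only:
def to_sparse_line_sketch(word_counts, feature_index, label=1):
    '''word_counts: {word: count} for one document; feature_index: {word: index}.'''
    parts = [str(label)]
    for word, idx in feature_index.items():
        if word in word_counts:
            parts.append('%s:%s' % (idx, word_counts[word]))
    return ' '.join(parts)

# e.g. to_sparse_line_sketch({u'开心': 2}, {u'开心': 17}) returns '1 17:2'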
def word_net(weibo, k_cluster):  # word-frequency co-occurrence network
    single_word_whitelist = load_single_word_whitelist()
    black = load_black_words()
    cx_dict = set(['Ag', 'a', 'an', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 'Vg', 'v', 'vd', 'vn', '@', 'j'])
    n = 0
    ts = time.time()
    f_dict = dict()  # word -> count
    total = 0  # total number of kept words
    weibo_word = []
    for i in range(0, len(weibo)):
        text = weibo[i]['content168']
        words = fc.get_text_fc(text, cx=True)
        row = []
        for word in words:
            # Keep nouns/verbs/adjectives from the segmentation result, drop blacklisted
            # words, and drop single-character words unless they are whitelisted.
            if (word[1] in cx_dict) and (1 < len(word[0]) < 10 or word[0] in single_word_whitelist) and (word[0] not in black):
                total = total + 1
                if str(word[0]) in f_dict:
                    f_dict[str(word[0])] = f_dict[str(word[0])] + 1
                else:
                    f_dict[str(word[0])] = 1
                row.append(word[0])
        weibo_word.append(row)

    # Rank words by count; drop words that appear fewer than 2 times or that account
    # for more than 80% of all kept words.
    keyword = TopkHeap(300)
    for k, v in f_dict.items():
        if v >= 2 and (float(v) / float(total)) <= 0.8:
            p = v
            keyword.Push((p, k))
    keyword_data = keyword.TopK()  # top high-frequency words, used as network vertices

    ts = time.time()
    keyword = []
    k_value = dict()  # word -> relative frequency
    for i in range(0, len(keyword_data)):
        keyword.append(keyword_data[i][1])
        k_value[str(keyword_data[i][1])] = float(keyword_data[i][0]) / float(total)

    # Count co-occurrences of each keyword with its adjacent words (the network edges).
    word_net = dict()
    for i in range(0, len(weibo_word)):
        row = weibo_word[i]
        for j in range(0, len(row)):
            if row[j] in keyword:
                if j - 1 >= 0 and row[j] != row[j - 1]:
                    if str(row[j] + '_' + row[j - 1]) in word_net:
                        word_net[str(row[j] + '_' + row[j - 1])] = word_net[str(row[j] + '_' + row[j - 1])] + 1
                    elif str(row[j - 1] + '_' + row[j]) in word_net:
                        word_net[str(row[j - 1] + '_' + row[j])] = word_net[str(row[j - 1] + '_' + row[j])] + 1
                    else:
                        word_net[str(row[j - 1] + '_' + row[j])] = 1
                if j + 1 < len(row) and row[j] != row[j + 1]:
                    if str(row[j] + '_' + row[j + 1]) in word_net:
                        word_net[str(row[j] + '_' + row[j + 1])] = word_net[str(row[j] + '_' + row[j + 1])] + 1
                    elif str(row[j + 1] + '_' + row[j]) in word_net:
                        word_net[str(row[j + 1] + '_' + row[j])] = word_net[str(row[j + 1] + '_' + row[j])] + 1
                    else:
                        word_net[str(row[j] + '_' + row[j + 1])] = 1

    # Weight each edge: co-occurrence count times the larger relative frequency of its
    # two endpoints. The heap size chosen here bounds both the input and the output
    # of the clustering step below.
    weight = TopkHeap(500)
    for k, v in word_net.items():
        k1, k2 = k.split('_')
        if k1 not in k_value:
            k_value[k1] = 0
        if k2 not in k_value:
            k_value[k2] = 0
        if k_value[k1] > k_value[k2]:
            p = v * k_value[k1]
        else:
            p = v * k_value[k2]
        weight.Push((p, k))
    data = weight.TopK()

    word = []
    word_weight = dict()
    for i in range(0, len(data)):
        if data[i][1] not in word:
            word.append(data[i][1])
            word_weight[data[i][1]] = data[i][0]

    # Clustering: one feature vector per word pair, counting the occurrences of its
    # two endpoints in every weibo.
    feature = []
    for w in word:
        k1, k2 = w.split('_')
        c = []
        for i in range(0, len(weibo)):
            n1 = weibo[i]['content168'].count(str(k1))
            n2 = weibo[i]['content168'].count(str(k2))
            n = n1 + n2
            c.append(n)
        feature.append(c)
    features = np.array(feature)
    result = kmeans(features, k_cluster, 'summary')

    # Group the word pairs by cluster label and pick the top-weighted pairs per cluster.
    word_result_before = dict()
    for i in range(0, len(result)):
        label = result[i][0]
        w = (word[i], word_weight[word[i]])
        try:
            word_result_before[label].append(w)
        except KeyError:
            word_result_before[label] = [w]

    word_result = dict()
    word_main = dict()
    for label in word_result_before:
        main_words = sorted(word_result_before[label], key=lambda x: x[1], reverse=True)
        word_result[label] = [i[0] for i in word_result_before[label]]
        word_main[label] = [i[0] for i in main_words[:k_cluster]]

    return word_result, word_weight, word_main
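# Hedged illustration (not part of the original module): word_net delegates clustering
# to the project-level kmeans() helper. Assuming that helper behaves like standard
# k-means over the rows of `features`, an equivalent grouping of the word pairs with
# scikit-learn (an assumption, not what this repo ships) would be:
def cluster_word_pairs_sketch(features, k_cluster):
    import numpy as np
    from sklearn.cluster import KMeans

    labels = KMeans(n_clusters=k_cluster).fit_predict(np.asarray(features))
    groups = {}
    for row_index, label in enumerate(labels):
        # row_index indexes into the `word` list built above; group indices by cluster.
        groups.setdefault(int(label), []).append(row_index)
    return groups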