from snownlp import SnowNLP


def analyze(sentence):
    # Map a sentence to a signed sentiment score: positive sentences get a
    # 0..10 score, negative ones a doubled negative score. The original's
    # double round() was redundant and has been collapsed.
    s = SnowNLP(sentence)
    score = round(s.sentiments * 10)
    if s.sentiments >= 0.5:
        return score
    return score * -2
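# A minimal usage sketch for analyze(); the sample sentences are illustrative.
print(analyze(u'这个东西真赞'))      # positive review -> score in 0..10
print(analyze(u'质量很差,很失望'))  # negative review -> doubled negative score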
import json

from snownlp import SnowNLP

# Accumulators for the parsed reading-comprehension records; `lines` is
# assumed to hold the raw JSON lines read earlier.
query_id = []
passage = []
query = []
answer = []
alternatives = []
passage_len = []
query_in_char = []
query_in_word = []
query_in_char_set = []
query_in_word_set = []
ques_mark = []
for i in range(len(lines)):
    ge = json.loads(lines[i])
    # Normalize query and passage to simplified Chinese characters.
    que = SnowNLP(ge.get('query', '')).han
    query.append(que)
    query_id.append(ge.get('query_id', ''))
    pas = SnowNLP(ge.get('passage', '')).han
    # Keep only the text after the last question mark, then strip the
    # query itself out of the passage. (str.split cannot raise here, so
    # the original bare try/except was dropped.)
    pas = pas.split('?')[-1].strip()
    pas = pas.replace(que, '')
    # Fall back to the full passage if almost nothing is left.
    if len(pas) < 2:
        pas = SnowNLP(ge.get('passage', '')).han
        print(ge.get('passage', ''))
        print(ge.get('query', ''))
        print(ge.get('answer', ''))
# Python 2 code: relies on xrange/unicode; read_dict, make_dict and
# array_2_str are helpers defined elsewhere in this module.
import numpy as np
from snownlp import SnowNLP


def split_all():
    input_list_1 = 'list/data_location/mycode/china_loc_list_full.txt'
    input_list = 'list/data_location/mycode/china_prov_list_simple.txt'
    f_in = open(input_list, 'r')
    provs = f_in.readlines()
    f_in.close()
    provs = [prov.strip() for prov in provs]
    save_dir = './data/select_prov/'
    f = open('./list/stop_words.txt', 'r')
    stop_words = f.readlines()
    f.close()
    stop_words = [i.strip() for i in stop_words]

    # First pass: total counts per word across all provinces.
    # Slot 0 holds the grand total; slots 1..16 hold the years 2002-2017.
    count_all = {}
    for prov in provs:
        for year in xrange(2002, 2018):
            year = str(year)
            label = save_dir + year + '/' + ''.join(
                [str(i) for i in SnowNLP(unicode(prov)).pinyin])
            # print label
            year_dict = read_dict(label + '_all_count.txt')
            for key in year_dict:
                if key not in count_all:
                    count_all[key] = np.zeros(17, 'float')
                count_all[key][0] += year_dict[key]
                count_all[key][int(year) - 2001] += year_dict[key]

    # Second pass: per-province counts, normalized by the totals above.
    for prov in provs:
        loc_counts = make_dict(input_list_1)
        word_counts = {}
        for year in xrange(2002, 2018):
            year = str(year)
            label = save_dir + year + '/' + ''.join(
                [str(i) for i in SnowNLP(unicode(prov)).pinyin])
            # print label
            year_dict = read_dict(label + '_all_count.txt')
            for key in loc_counts.keys():
                if key in year_dict:
                    loc_counts[key][0] += year_dict[key]
                    loc_counts[key][int(year) - 2001] = year_dict[key]
            for key in year_dict:
                if key in word_counts:
                    word_counts[key] += year_dict[key]
                else:
                    word_counts[key] = year_dict[key]
        # NOTE: only slots 0..12 are normalized even though the arrays have
        # 17 slots; this looks like a leftover from an earlier year range.
        for key in loc_counts.keys():
            for i in xrange(0, 13):
                if loc_counts[key][i] != 0:
                    loc_counts[key][i] /= count_all[key][i]
        # for key in word_counts.keys():
        #     word_counts[key] /= count_all[key][0]
        print(prov)
        word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        loc_counts = sorted(loc_counts.items(), key=lambda x: x[1][0], reverse=True)

        # _all_stop: all_count - stop_words
        f = open(
            save_dir + 'all/' +
            ''.join([str(i) for i in SnowNLP(unicode(prov)).pinyin]) +
            '_all_stop.txt', 'w')
        for k, v in word_counts:
            if k not in stop_words and len(k.decode()) > 1:
                f.write(k + ' ' + str(v) + '\n')
        f.close()
        ## _all_count: all_count(word, count)
        # f = open(save_dir+'all/'+''.join([str(i) for i in SnowNLP(unicode(prov)).pinyin])+'_all_count.txt', 'w')
        # for k, v in word_counts:
        #     if k not in stop_words:
        #         f.write(k+' '+str(v)+'\n')
        # f.close()
        ## _all_count_list: [word_in_list, all_count, 16*year_count]
        # f_1 = open(save_dir+'all/'+''.join([str(i) for i in SnowNLP(unicode(prov)).pinyin])+'_all_count_list.txt', 'w')
        # for k, v in loc_counts:
        #     f_1.write(k+' '+array_2_str(v)+'\n')
        # f_1.close()

    count_all = sorted(count_all.items(), key=lambda x: x[1][0], reverse=True)
    f_1 = open(save_dir + 'all_count.txt', 'w')
    for k, v in count_all:
        if k not in stop_words:
            f_1.write(k + ' ' + array_2_str(v.astype('int')) + '\n')
    f_1.close()
def _sentiment(text):
    s = SnowNLP(text)
    return s.sentiments
# -*- coding: utf-8 -*-
import codecs

from snownlp import SnowNLP


def load_sentences(path):
    with codecs.open(path, encoding='utf-8-sig') as f1:
        text = f1.read()
    return text.replace("\r", "").split("\n")


# `s` is assumed to be a SnowNLP object built from the loaded text.
for sent in s.sentences:
    print(sent, SnowNLP(sent).sentiments)


def test_new_word():
    # Path1 (defined elsewhere) points at the directory holding new_word.txt.
    new_word = {"positive_word": [], "negative_word": []}
    with codecs.open(Path1 + "new_word.txt", encoding="utf-8-sig") as f1:
        for line in f1.readlines():
            direction, word = line.replace("\r\n", "").split(" ")[:2]
            if direction == u"positive":
                new_word["positive_word"].append(word)
            if direction == u"negative":
                new_word["negative_word"].append(word)
    return new_word
def sentiment_analysis(text):
    snownlp = SnowNLP(text)
    return Comment.rank_sentiments(snownlp.sentiments)
from snownlp import SnowNLP

# Sentence-level vs. whole-text sentiment: punctuation changes how the text
# splits into sentences, and word order changes the score.
text = "皱皱的颜色。也不鲜艳。很喜欢。但总归。还是嫌弃。"
s = SnowNLP(text)
for i in s.sentences:
    # print(i)
    s1 = SnowNLP(i)
    # print(s1.sentiments)

text = "皱皱的颜色也不鲜艳很喜欢但总归还是嫌弃"
a = SnowNLP(text)
print(text)
print(a.sentiments)

text = "皱皱的颜色也不鲜艳很嫌弃但总归还是喜欢"
a = SnowNLP(text)
print(text)
print(a.sentiments)
def NewsProcess(data):
    s = SnowNLP(data)
    print(s.words)
    print("##" + str(s.sentiments))
import numpy as np
import xlrd
import matplotlib.pyplot as plt
from snownlp import SnowNLP

# TextRank summary/keyword extraction (disabled):
# sents = normal.get_sentences(t)
# doc = []
# for sent in sents:
#     words = seg.seg(sent)
#     words = normal.filter_stop(words)
#     doc.append(words)
# rank = textrank.TextRank(doc)
# rank.solve()
# for index in rank.top_index(5):
#     print(sents[index])
# keyword_rank = textrank.KeywordTextRank(doc)
# keyword_rank.solve()
# for w in keyword_rank.top_index(5):
#     print(w)

sentimentslist = []
data = xlrd.open_workbook('MOBICK.xlsx')
table = data.sheets()[0]
nrows = table.nrows
for i in range(nrows):
    # Join the row's cells into one string; renamed from `str` to avoid
    # shadowing the builtin.
    row_text = "".join('%s' % cell for cell in table.row_values(i))
    s = SnowNLP(row_text)
    print(table.row_values(i), s.sentiments)
    sentimentslist.append(s.sentiments)

fileObject = open('MOBICK.txt', 'w+')
for ip in sentimentslist:
    fileObject.write(str(ip))
    fileObject.write('\n')
fileObject.close()

fig1 = plt.figure("sentiment")
plt.hist(sentimentslist, bins=np.arange(0, 1, 0.02))
plt.show()
import re

import pandas as pd
import pynlpir
from googletrans import Translator  # assumed source of Translator
from langdetect import detect       # assumed source of detect
from snownlp import SnowNLP


def comment_analysis(pagedata):
    jsondata = str(pagedata)
    df = pd.DataFrame(eval(jsondata))
    df = df.drop_duplicates()

    # Construct features.
    ud = []
    ff = []
    fo = []
    for i in range(0, len(df)):
        try:
            # Trim the surrounding quotes off the comment.
            s = df.loc[i, "Comments"]
            df.loc[i, "Comments"] = s[1:len(s) - 1]
            # Drop rows that are pure reposts.
            if df.loc[i, "Comments"] == "转发微博":
                df = df.drop(i)
            else:
                if "回复@" in df.loc[i, "Comments"]:
                    s = df.loc[i, "Comments"]
                    df.loc[i, "Comments"] = s[s.index(':') + 1:len(s)]
                # Label the user-description attribute.
                if str(df.loc[i, "User_description"]) == "nan":
                    ud.append(0)
                else:
                    ud.append(1)
                # Label the verified-user attribute.
                if df.loc[i, "Verified"] == 0:
                    df.loc[i, "Verified"] = 1
                elif df.loc[i, "Verified"] == -1:
                    df.loc[i, "Verified"] = 0
                # Follower/following and follower/original-post ratios.
                if df.loc[i, "Follower"] == 0:
                    df.loc[i, "Follower"] = 1
                ff.append(df.loc[i, "Following"] / df.loc[i, "Follower"])
                fo.append(df.loc[i, "Original_post"] / df.loc[i, "Follower"])
        except KeyError:
            ud.append(-1)
            ff.append(-1)
            fo.append(-1)
    df = df.reset_index(drop=True)

    # Add features from post/user information.
    df['description'] = ud
    df['ffRatio'] = ff
    df['foRatio'] = fo
    for i in range(0, len(df)):
        if df.loc[i, "description"] == -1:
            df = df.drop(i)
            ud.remove(-1)
            ff.remove(-1)
            fo.remove(-1)
    df = df.reset_index(drop=True)

    # Append sentiment scores.
    pynlpir.open()
    segPosts = []
    sentiScore = []
    translator = Translator()
    r = '[’!?:;【】,《》!"#$%&\'()()“”…*+,-./:;<=>?@[\\]^_`{|}~]+'
    # Score every comment under the original post.
    for i in range(0, len(df)):
        df.loc[i, "Comments"] = re.sub(r, '', df.loc[i, "Comments"])
        try:
            lang = str(detect(df.loc[i, "Comments"]))
            if "en" in lang:
                transText = str(
                    translator.translate(df.loc[i, "Comments"],
                                         src='en', dest='zh-cn').text)
                line = transText.strip()
                s = SnowNLP(line)
                senti = (s.sentiments +
                         SnowNLP(re.sub("[0-9]", "", line)).sentiments) / 2
                sentiScore.append(senti)
                seg = pynlpir.segment(line, pos_tagging=False)
                segPosts.append(seg)
            elif lang in ("zh-cn", "zh-tw", "ko"):
                # The original condition `"zh-cn" or "zh-tw" or "ko" in ...`
                # was always true; this is the intended membership test.
                line = df.loc[i, "Comments"].strip()
                s = SnowNLP(line)
                senti = (s.sentiments +
                         SnowNLP(re.sub("[0-9]", "", line)).sentiments) / 2
                sentiScore.append(senti)
                seg = pynlpir.segment(line, pos_tagging=False)
                segPosts.append(seg)
            else:
                # Drop rows without a valid sentiment score.
                print("error1")
                df = df.drop(i)
        except Exception:
            print("error2")
            df = df.drop(i)
    df = df.reset_index(drop=True)
    df['Sentiment'] = sentiScore
    df = (df.drop('Follower', axis=1).drop('Following', axis=1)
            .drop('User_description', axis=1).drop('Original_post', axis=1)
            .drop("Comments", axis=1).drop("Username", axis=1)
            .drop("UID", axis=1))
    pd.set_option('display.max_rows', 400)
    df = df.round({'Sentiment': 5})
    return df
def senti(x):
    # Map a (key, text) pair to (key, [sentiment score, count]) so scores
    # can be summed and averaged downstream.
    return (x[0], [SnowNLP(x[1]).sentiments, 1])
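# A hedged sketch of how senti() might be used in a PySpark-style
# aggregation, assuming an RDD of (key, text) pairs; the keys and texts
# below are illustrative.
from pyspark import SparkContext

sc = SparkContext(appName='senti-demo')
pairs = sc.parallelize([('p1', u'这个东西真赞'),
                        ('p1', u'质量很差'),
                        ('p2', u'手机质量非常好')])
avg = (pairs.map(senti)
            .reduceByKey(lambda a, b: [a[0] + b[0], a[1] + b[1]])  # sum scores, counts
            .mapValues(lambda v: v[0] / v[1]))                     # mean per key
print(avg.collect())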
plt.xlabel('Epochs')
plt.ylabel('Loss and Acc')
plt.legend()
# plt.show()

pre_y = model.predict(pre_x, batch_size=32)
pre_y_class = model.predict_classes(pre_x, batch_size=32)
pre_y = [round(x[0], 3) for x in pre_y]
pre_y_class = [x[0] for x in pre_y_class]
print(pre_y[:10])
print(pre_y_class[:10])

# Keyword sentiment classification with SnowNLP as a baseline.
star = []
for it in x_pre:
    s = SnowNLP(it)
    t = s.sentiments
    star.append(round(t, 3))
print(star[:10])

# Aggregate: threshold SnowNLP scores at 0.5.
star_class = np.array(star)
star_class = np.where(star_class >= 0.5, 1, 0).tolist()
x_pre = x_pre.tolist()
y_label = y_label.tolist()
# pre_y = pre_y.tolist()
# pre_y_class = pre_y_class.tolist()

# Compare the model's accuracy against the SnowNLP baseline.
acc1 = accuracy_score(y_label, pre_y_class)
acc2 = accuracy_score(y_label, star_class)
def fenci(string):
    s = SnowNLP(string)
    # result = s.words  # .words gives the segmented tokens
    # print(len(result), '/'.join(result))
    print('Sentiments:', s.sentiments)  # .sentiments gives the sentiment score
from snownlp import SnowNLP

text = '我来到河北保定河北大学上学'
s = SnowNLP(text)
print(s.words)
def snownlp_segment(self, sentence):
    # Segment with SnowNLP.
    # unicode_sentence = sentence.decode('gbk')
    sentence = SnowNLP(sentence).words
    return ' '.join(sentence)
(7) Keyword and summary extraction (TextRank algorithm)
(8) Term frequency (TF) and inverse document frequency (IDF)
(9) Tokenization (splitting text into sentences)
(10) Text similarity (BM25)
SnowNLP's biggest strength is how easy it is to pick up, and it yields quite a few interesting results on Chinese text; many of its features are fairly basic, though, and still need further polish.

# In[10]:
from snownlp import SnowNLP

s = SnowNLP(u'杭州西湖风景很好,是旅游胜地,每年吸引大量前来游玩的游客!')
# Word segmentation
print(s.words)

# In[11]:
# Sentiment polarity
print("Probability that this text is positive: " + str(s.sentiments))

# In[12]:
_s = SnowNLP(u'今天又是下雨又是刮风,真是糟糕透了!')
print("Probability that this text is positive: " + str(_s.sentiments))
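# In[13]:
# A minimal sketch of features (7)-(10) listed above; the sample text is
# the one already used in this notebook.
doc = SnowNLP(u'杭州西湖风景很好,是旅游胜地,每年吸引大量前来游玩的游客!')
print(doc.keywords(3))     # (7) TextRank keywords
print(doc.summary(1))      # (7) TextRank summary
print(doc.tf)              # (8) term frequency per sentence
print(doc.idf)             # (8) inverse document frequency
print(doc.sentences)       # (9) sentence tokenization
print(doc.sim([u'游客']))  # (10) BM25 similarity against a tokenized query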
# -*- coding: utf-8 -*-
import pandas as pd
from snownlp import SnowNLP

if __name__ == '__main__':
    test = pd.read_csv(r"TestModel.csv")
    review_list = [review for review in test['review']]
    label_list = [label for label in test['label']]
    # Drop rows whose review is NaN (pandas reads missing text as float).
    list_test = [(label, review)
                 for label, review in list(zip(label_list, review_list))
                 if type(review) != float]
    for j in list_test:
        print(j[1], j[0], SnowNLP(j[1]).sentiments)
    senti = [SnowNLP(review).sentiments for label, review in list_test]
    newSenti = []
    for i in senti:
        # Predicted probability of a positive review; treat >= 0.6 as positive.
        if i >= 0.6:
            newSenti.append(1)
        else:
            newSenti.append(0)
    counts = 0
    for i in range(len(list_test)):
        if newSenti[i] == list_test[i][0]:
            counts += 1
    accuracy = float(counts) / float(len(list_test))
    print("Accuracy: %.2f" % accuracy)
def qingxu_number(text):
    s = SnowNLP(text)
    print('Sentiment score for this sentence: %f' % s.sentiments)
""" 中文文本的情感分析 @Date 2020.04.28 pip3 install snownlp """ from snownlp import SnowNLP text = '人们日常所犯最大的错误,是对陌生人太客气,而对亲密的人太苛刻了,而努力改掉这个习惯。' s = SnowNLP(text) # 分词 print(s.words) # 词性标注 tags = [x for x in s.tags] print(tags) # 断句 print(s.sentences) # 拼音 print(s.pinyin) # 情绪判断,返回值为正面情绪的概率,越接近1表示正面情绪,越接近0表示负面情绪 text1 = '这部电影太棒了' text2 = '这部电影简直是烂到爆' s1 = SnowNLP(text1) s2 = SnowNLP(text2) # 这部电影太棒了 0.9829468270441747 print(text1, s1.sentiments) # 这部电影简直是烂到爆 0.2296519477554504 print(text2, s2.sentiments) # 关键字抽取
# coding=utf8
from snownlp import sentiment
from snownlp import SnowNLP

s = SnowNLP(u"这个东西真赞")
print(s.sentiments)
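# The `sentiment` import above is otherwise unused here; a hedged sketch of
# what that module is typically for -- retraining the sentiment classifier
# on your own corpus. The file names are illustrative.
sentiment.train('neg.txt', 'pos.txt')  # one sentence per line in each file
sentiment.save('sentiment.marshal')    # saved model for snownlp to load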
def Txtmine(indata):
    # dresult is assumed to be a dict defined at module level.
    data = indata[6]
    title = indata[2]
    dresult["title"] = title
    dresult["article"] = data
    mylen = len(data)
    line_num = data.count("\n")
    """Character and line counts"""
    dresult["wordnums"] = mylen
    dresult["linenums"] = line_num
    """Word frequency"""
    words = jieba.lcut(data)
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        elif word.isdigit():
            continue
        else:
            rword = word
            counts[rword] = counts.get(rword, 0) + 1
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    dresult["words"] = items[0:3]
    """Sentiment"""
    s = SnowNLP(data)
    dresult["sentiments"] = s.sentiments
    """Summary"""
    # Strip the negation character "不" before extracting keywords/summary.
    s = SnowNLP(data.replace("不", ""))
    # print(s.keywords())
    swords = list()
    for word in s.keywords():
        if len(word) < 2:
            continue
        else:
            swords.append(word)
    dresult["keywords"] = swords
    summary = list(set(s.summary()))[:3:1]
    # print(summary)
    lines = ""
    for line in summary:
        # print(line)
        lines = lines + line + " //// "
    dresult["summary"] = lines
    """Feature values"""
    values = [i for i in indata[7:]]
    dresult["val"] = values
    """Hyperlink count"""
    results = re.findall("(?isu)(https\://[a-zA-Z0-9\.\?/&\=\:]+)", data)
    dresult["urlnums"] = len(results)
    """Typo detection"""
    r = requests.post("http://www.cuobiezi.net/api/v1/zh_spellcheck/json",
                      data={
                          'content': '我最喜欢的就是元啊节吃汤圆。 ',
                          'check_mode': 'advanced',
                          'action': 'show'
                      })
    """Named entities"""
    loc = 0
    org = 0
    peo = 0
    words, ners = fool.analysis(data)
    for ner in ners[0]:
        if ner[2] == "location":
            loc += 1
        elif ner[2] == "org":
            org += 1
        elif ner[2] == "person":
            peo += 1
        else:
            continue
    dresult["loc"] = loc
    dresult["org"] = org
    dresult["peo"] = peo
    # print(dresult)
    return dresult
# coding:utf-8
# SnowNLP is a Python library for processing Chinese text: word
# segmentation, tagging, sentiment analysis, and more. Its sentiment
# analysis simply classifies text as positive or negative, returning the
# probability of positive sentiment -- closer to 1 is positive, closer to
# 0 is negative. Code below:
import numpy as np
from snownlp import SnowNLP
import matplotlib.pyplot as plt

f = open('avengers.txt', 'r')
lines = f.readlines()  # renamed from `list` to avoid shadowing the builtin
f.close()

sentimentslist = []
for i in lines:
    s = SnowNLP(i)
    # print(s.sentiments)
    sentimentslist.append(s.sentiments)

plt.hist(sentimentslist, bins=np.arange(0, 1, 0.01), facecolor='g')
plt.xlabel('Sentiments Probability')
plt.ylabel('Quantity')
plt.title('Analysis of Sentiments')
plt.show()
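# A minimal follow-up sketch: turning the scores above into a
# positive/negative tally with the usual 0.5 cutoff.
positive = sum(1 for p in sentimentslist if p >= 0.5)
print('positive:', positive, 'negative:', len(sentimentslist) - positive)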
import json

from snownlp import SnowNLP


def process_data_8(text_name, gt_name, save_name, simple):
    data = {}
    with open(text_name, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip()
            id = line.split()[0][5:-1]
            text = ''.join(line.split()[1:])
            if simple:
                # Converting to simplified Chinese can change the character
                # count; rewrite known multi-character conversions so the
                # lengths stay aligned with the original text.
                if len(text) != len(SnowNLP(text).han):
                    print(text)
                han = list(SnowNLP(text).han)
                new_han = []
                k = 0
                for i in range(len(han)):
                    if i < k:
                        continue
                    if i + 3 < len(han) and han[i:i + 4] == ['公', '共', '汽', '车']:
                        new_han += ['公', '车']
                        k = i + 4
                    else:
                        new_han += han[i]
                new_text = ''.join(new_han)
                if len(text) != len(new_text):
                    han = list(new_text)
                    new_han = []
                    k = 0
                    for i in range(len(han)):
                        if i < k:
                            continue
                        if i + 2 < len(han) and han[i:i + 3] == ['出', '租', '车']:
                            new_han += ['的', '士']
                            k = i + 3
                        else:
                            new_han += han[i]
                    new_text = ''.join(new_han)
                    print(new_text)
                if len(text) != len(new_text):
                    han = list(new_text)
                    new_han = []
                    k = 0
                    for i in range(len(han)):
                        if i < k:
                            continue
                        if i + 2 < len(han) and han[i:i + 3] == ['因', '特', '网']:
                            new_han += ['网', '际', '网', '络']
                            k = i + 3
                        else:
                            new_han += han[i]
                    new_text = ''.join(new_han)
                    print(new_text)
                assert len(text) == len(new_text)
                text = new_text
            else:
                text = SnowNLP(text).han
            data[id] = {'text': text}
    with open(gt_name, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip().split(', ')
            answer = []
            # Extract the wrong-word id and its replacement word.
            if line[1] == '0':
                data[line[0]]['answer'] = []
            else:
                for i in range(1, len(line), 2):
                    if simple:
                        line[i + 1] = SnowNLP(line[i + 1]).han
                    answer.append([int(line[i]), line[i + 1]])
                data[line[0]]['answer'] = answer
    with open(save_name, 'w', encoding='utf-8') as f:
        json.dump(data, f)
from snownlp import SnowNLP

text1 = '垃圾东西,很难吃'


def rank_sentiments(sentiments):
    # Map a sentiment probability to a 1-5 star rating.
    if sentiments >= 0.8:
        return 5
    elif sentiments >= 0.6:
        return 4
    elif sentiments >= 0.4:
        return 3
    elif sentiments >= 0.2:
        return 2
    else:
        return 1


s1 = SnowNLP(text1)
print(rank_sentiments(s1.sentiments))
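# For contrast, the same mapping on a clearly positive review
# (illustrative text).
text2 = '手机质量非常好,非常漂亮!'
print(rank_sentiments(SnowNLP(text2).sentiments))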
# Python 2 code: relies on xrange/unicode; read_dict, make_dict and
# array_2_str are helpers defined elsewhere in this module.
from snownlp import SnowNLP


def split_all():
    input_list_1 = 'list/bag.txt'
    input_list = 'list/data_location/mycode/china_prov_list_simple.txt'
    f_in = open(input_list, 'r')
    provs = f_in.readlines()
    f_in.close()
    provs = [prov.strip() for prov in provs]
    save_dir = './data/select_prov/'
    f = open('./list/stop_words.txt', 'r')
    stop_words = f.readlines()
    f.close()
    stop_words = [i.strip() for i in stop_words]
    for prov in provs:
        loc_counts = make_dict(input_list_1)
        word_counts = {}
        for year in xrange(2002, 2018):
            year = str(year)
            label = save_dir + year + '/' + ''.join(
                [str(i) for i in SnowNLP(unicode(prov)).pinyin])
            # print label
            year_dict = read_dict(label + '_count_search.txt')
            for key in loc_counts.keys():
                for key_split in key.split():
                    if key_split in year_dict:
                        loc_counts[key][0] += year_dict[key_split]
                        loc_counts[key][int(year) - 2001] = year_dict[key_split]
            for key in year_dict:
                for key_split in key.split():
                    if key in word_counts:
                        word_counts[key] += year_dict[key_split]
                    else:
                        word_counts[key] = year_dict[key_split]
        print(prov)
        word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        loc_counts = sorted(loc_counts.items(), key=lambda x: x[1][0], reverse=True)
        # _all_stop: all_count - stop_words
        # f = open(save_dir+'all/'+''.join([str(i) for i in SnowNLP(unicode(prov)).pinyin])+'_all_stop.txt', 'w')
        # for k, v in word_counts:
        #     if k not in stop_words:
        #         f.write(k+' '+str(v)+'\n')
        # f.close()
        ## _all_count: all_count(word, count)
        # f = open(save_dir+'all/'+''.join([str(i) for i in SnowNLP(unicode(prov)).pinyin])+'_all_count.txt', 'w')
        # for k, v in word_counts:
        #     if k not in stop_words:
        #         f.write(k+' '+str(v)+'\n')
        # f.close()
        # _all_count_list: [word_in_list, all_count, 16*year_count]
        f_1 = open(
            save_dir + 'all/' +
            ''.join([str(i) for i in SnowNLP(unicode(prov)).pinyin]) +
            '_count_search_bag.txt', 'w')
        for k, v in loc_counts:
            f_1.write(k + ' ' + array_2_str(v) + '\n')
        f_1.close()
def get_sentiment(word):
    text = u'{}'.format(word)
    s = SnowNLP(text)
    print(s.sentiments)
import pymysql
import numpy as np
import matplotlib.pyplot as plt
from snownlp import SnowNLP

conn = pymysql.connect(host='127.0.0.1', user='******', password='',
                       charset='utf8')
with conn:
    cur = conn.cursor()
    cur.execute("SELECT * FROM sinaweibo.macomment")
    rows = cur.fetchall()
    commentlist = []
    for row in rows:
        row = list(row)
        del row[0]
        # NOTE: the membership test checks full rows while the list stores
        # row[2] values, so this dedup never triggers; kept as in the original.
        if row not in commentlist:
            commentlist.append(row[2])
conn.close()
print("Finish fetching the comment data...")
# print(commentlist)

# SnowNLP analysis
print("Start SnowNLP data")
sentimentslist = []
for com in commentlist:
    s = SnowNLP(com)
    sentimentslist.append(s.sentiments)
# print(sentimentslist)
plt.hist(sentimentslist, bins=np.arange(0, 1, 0.02))
plt.show()
from snownlp import SnowNLP

text = '手机质量非常好,非常漂亮!'
s = SnowNLP(text)
# print(s.words)
# tags = [x for x in s.tags]
# print(tags)
# print(s.sentiments)
# print(s.keywords(limit=20))
print(s.sentences)
for sen in s.sentences:
    t = SnowNLP(sen)
    tags = [x for x in t.tags]
    # Print the nouns in each sentence.
    for k, v in tags:
        if v == 'n':
            print(k)
    print(tags)
# encoding=utf-8
from snownlp import SnowNLP

"""Chinese characters to pinyin"""
s = SnowNLP(u'这个东西真心很赞')
print(s.pinyin)
# [u'zhe', u'ge', u'dong', u'xi',
#  u'zhen', u'xin', u'hen', u'zan']
import json
import math

import numpy as np
from bson.objectid import ObjectId
from pymongo import MongoClient
from snownlp import SnowNLP
from urllib import request


def saveComments():
    # loadUrls and getCUrl are helpers defined elsewhere in this module.
    index = 0
    mongoClient = MongoClient('172.24.177.30', 27017)
    db = mongoClient['futures_data']
    posts = db['comments']
    sinanews = db['sinanews']
    for data in loadUrls():
        try:
            index += 1
            tempList = []
            url = data['url']
            id = data['id']
            print(url)
            url = getCUrl(url)
            request.urlopen(url)
            response = request.urlopen(url + '&page=1&page_size=200')
            jsonStr = response.read()
            print(jsonStr[9:])
            # The response is JSONP; strip the 9-byte callback prefix.
            jsonObj = json.loads(jsonStr[9:])
            tempList.extend(jsonObj['result']['cmntlist'])
            count = int(jsonObj['result']['count']['show'])
            pageNum = math.ceil(count / 200)
            if pageNum > 1:
                for page in range(2, pageNum + 1):
                    response = request.urlopen(
                        url + ('&page=%s&page_size=200' % page))
                    jsonStr = response.read()
                    jsonObj = json.loads(jsonStr[9:])
                    tempList.extend(jsonObj['result']['cmntlist'])
                    print('page: %s' % page)
            print(len(tempList))
            # break
            sentiments = []
            positive = 0
            negative = 0
            for temp in tempList:
                s = SnowNLP(temp['content'])
                sentiments.append(s.sentiments)
                if s.sentiments > 0.5:
                    positive += 1
                else:
                    negative += 1
            if len(sentiments) != 0:
                sentiment = np.mean(sentiments)
            else:
                sentiment = -1
            # saveObj = {'id': id, 'url': url, 'comment': tempList,
            #            'sentiment': sentiment, 'positive': positive,
            #            'negative': negative}
            print('Article %d' % index, sentiment, data['id'])
            # posts.insert_one(saveObj)
            sinanews.update(
                {'_id': ObjectId(data['id'])},
                {'$set': {
                    'positive': positive,
                    'negative': negative
                }})
        except Exception as e:
            print(e)