def main():
    """Script entry point: segment a sample document plus a batch of rows
    pulled from MySQL, then print the extracted keywords and their count.
    """
    # Sample document (kept verbatim — it is a runtime value, not a comment).
    doc = '''杰森我爱你!加油你是最棒的!'''
    # NOTE(review): the original assigned start_time = time.time() but never
    # reported an elapsed time; the unused local has been removed.
    datalist = Seg().get_data_from_mysql(5, 0)
    npl = SimpleNLP(1, doc, datalist)
    print(npl.seg_doc())
    print(npl.seg_datalist())
    keyword = npl.get_keyword_datalist()
    print(keyword)
    print(len(keyword))
def Analysis(lyric, mod=True):
    """Run sentiment analysis over song lyrics and render a ThemeRiver chart.

    Parameters
    ----------
    lyric : list
        One entry per song; each entry is scored by the sentiment model.
    mod : bool
        True (default): load a previously trained model from disk.
        False: (re)train the SVM sentiment model from the Douban corpus first.

    Side effects: may train and save a model; always writes "ThemeRiver.html".
    """
    if not mod:
        pos = []
        neg = []
        corpus_path = "D:\\Academic_work\\01_ERG3010\\Project\\corpus\\doubandata.txt"
        with open(corpus_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                # BUG FIX: the original also called f.readline() inside this
                # loop, which silently discarded every other corpus line.
                parts = line.split("##")
                try:
                    star = int(parts[1])
                except (IndexError, ValueError):
                    # BUG FIX: the original bare `except: pass` reused a stale
                    # (or unbound) star rating; skip malformed lines instead.
                    continue
                if star in (1, 2):
                    neg.append(parts[2].strip('\n'))
                elif star in (4, 5):
                    pos.append(parts[2].strip('\n'))
        # segment both corpora
        seg_pos = Seg().seg_from_datalist(pos)
        seg_neg = Seg().seg_from_datalist(neg)
        # build training data: up to 500 positive + 500 negative samples
        word_list = []
        label_list = []
        train_data = []
        shuffle(seg_pos)
        shuffle(seg_neg)
        for sample in seg_pos[:500]:
            train_data.append(('pos', sample))
            word_list.append(sample)
            label_list.append('pos')
        for sample in seg_neg[:500]:
            train_data.append(('neg', sample))
            word_list.append(sample)
            label_list.append('neg')
        # feature selection, then train and persist the SVM model
        fe = FeatureExtraction(word_list, label_list)
        best_words = fe.best_words(3000, False)
        # NOTE(review): the computed best_words is immediately overwritten by a
        # file path, exactly as in the original — confirm whether the selection
        # result or the cached file is the intended input to Sentiment().
        # BUG FIX: raw string — the original non-raw literal turned "\01" into
        # the escape byte \x01, corrupting the path.
        best_words = r"D:\Academic_work\01_ERG3010\Project\lyricsAnalysis2\svmmodel-bestwords.dat"
        model = Sentiment(best_words)
        model.train_model(train_data)
        model.save_model(root_path + "\\lyricsAnalysis2\\svmmodel")
    else:
        model = Sentiment()
        model.load_model(root_path + "\\lyricsAnalysis2\\svmmodel")
    # lyric is a list with one entry per song; result holds one probability each
    result = model.predict_datalist(lyric)
    data = []
    # BUG FIX: removed the unused local `time = "{}/{}".format(...)`, which
    # shadowed the `time` module, and the duplicate `data = []` assignment.
    for count, prob in enumerate(result, start=1):
        data.append([count, prob, "Pos"])
        data.append([count, 1 - prob, "Neg"])
    # text visualization
    tr = ThemeRiver("Sentiment", title_color="#274C77", title_text_size=20)
    tr.add(['Pos', 'Neg'], data,
           is_label_show=True,
           is_datazoom_show=True,
           legend_text_color="#274C77",
           legend_text_size=15)
    tr.render("ThemeRiver.html")
def __init__(self, c, best_words):
    """Initialize the SVM sentiment classifier.

    c: soft-margin penalty forwarded to sklearn's SVC.
    best_words: feature vocabulary used when vectorising samples.
    """
    self.best_words = best_words
    self.train_data = []
    self.train_label = []
    # probability=True so the trained classifier can report class probabilities
    self.clf = SVC(C=c, probability=True)
    self.seg = Seg()
def get_seg(self, fname='seg.pickle'):
    """Return a Seg instance restored from the pickled state in *fname*."""
    segmenter = Seg()
    segmenter.load(fname)
    return segmenter
def __init__(self):
    """Build the Bayes-backed sentiment classifier with a pre-trained segmenter."""
    self.seg = Seg()
    # Restore the segmenter's previously trained state from disk.
    self.seg.load('seg.pickle')
    self.classifier = Bayes()
def __init__(self, best_words=None):
    """Wire up the SVM-based sentiment model and its segmenter.

    best_words: feature vocabulary; may be None when a saved model
    will be loaded later.
    """
    self.seg = Seg()
    # 50 is the soft-margin penalty (C) handed to the underlying SVM wrapper.
    self.svm = SVM(50, best_words)
def __init__(self, method=1, doc=None, datalist=None):
    """Hold the inputs and helpers for a simple NLP pipeline.

    method: sentiment backend selector, forwarded to Sentiment().
    doc: a single document string, or None.
    datalist: a list of documents, or None.
    """
    self.method = method
    self.doc = doc
    self.datalist = datalist
    self.seg = Seg()
    self.sentiment = Sentiment(method)
from seg import Seg

# 1. Estimate the HMM parameters by direct (maximum-likelihood) counting.
# 2. Use the Viterbi algorithm to pick the most probable segmentation sequence.
#
# Example tagged line: 澳/b 门/e 的/s 回/b 归/e 一/b 定/e 能/b 够/e 顺/b 利/e 实/b 现/e 。/s
segger = Seg()


def train(fname):
    """Train the module-level segmenter on up to 10,000 tagged lines of *fname*.

    Each non-empty line holds whitespace-separated "char/tag" tokens; blank
    lines are skipped and do not count toward the 10,000-line cap.
    """
    samples = []
    with open(fname, 'r', encoding='utf-8') as f:
        for raw in f:
            if len(samples) == 10000:
                break
            stripped = raw.rstrip()
            if not stripped:
                continue
            samples.append([token.split('/') for token in stripped.split()])
    segger.train(samples)