# Example #1
# 0
def main():
    """Segment a sample document plus rows fetched from MySQL and print keywords.

    Relies on the project-level Seg and SimpleNLP classes; prints the
    segmentation of the sample doc, the segmentation of the fetched rows,
    and the extracted keyword list with its length.
    """
    doc = '''杰森我爱你!加油你是最棒的!'''
    # Fetch 5 rows starting at offset 0 from the project database.
    datalist = Seg().get_data_from_mysql(5, 0)
    npl = SimpleNLP(1, doc, datalist)
    print(npl.seg_doc())
    print(npl.seg_datalist())

    keyword = npl.get_keyword_datalist()
    print(keyword)
    print(len(keyword))
    '''
def Analysis(lyric, mod=True):
    """Run sentiment analysis over a list of lyrics and render a ThemeRiver chart.

    :param lyric: list of documents (one entry per song) fed to the model.
    :param mod:   True  -> load a previously trained model from disk;
                  False -> (re)train the SVM sentiment model from the Douban
                           corpus and save it.
    Side effects: may read/write model files and always writes ThemeRiver.html.
    """
    if mod == False:
        pos = []
        neg = []
        with open(
                "D:\\Academic_work\\01_ERG3010\\Project\\corpus\\doubandata.txt",
                'r',
                encoding='utf-8-sig') as f:
            # Each corpus line looks like "id##star##comment".
            # BUGFIX: the original called f.readline() inside "for line in f",
            # which silently skipped every other line of the corpus.
            for line in f:
                fields = line.split("##")
                try:
                    star = int(fields[1])
                except (IndexError, ValueError):
                    # Malformed line: skip it instead of reusing a stale
                    # (or unbound) star value from a previous iteration.
                    continue
                if star == 1 or star == 2:
                    neg.append(fields[2].strip('\n'))
                elif star == 4 or star == 5:
                    pos.append(fields[2].strip('\n'))
        ''' segment '''
        seg_pos = Seg().seg_from_datalist(pos)
        seg_neg = Seg().seg_from_datalist(neg)
        ''' training & test  '''
        word_list = []
        lable_list = []
        train_data = []
        # Shuffle so the 500-sample caps below draw a random subset.
        shuffle(seg_pos)
        shuffle(seg_neg)
        for k in seg_pos[:500]:
            train_data.append(('pos', k))
            word_list.append(k)
            lable_list.append('pos')
        for k in seg_neg[:500]:
            train_data.append(('neg', k))
            word_list.append(k)
            lable_list.append('neg')
        ''' train, test'''
        fe = FeatureExtraction(word_list, lable_list)
        best_words = fe.best_words(3000, False)
        # NOTE(review): the computed best_words is immediately overwritten by a
        # path string — presumably Sentiment loads the word list from that file.
        # Confirm; otherwise the FeatureExtraction step above is dead work.
        best_words = "D:\Academic_work\01_ERG3010\Project\lyricsAnalysis2\svmmodel-bestwords.dat"
        model = Sentiment(best_words)
        model.train_model(train_data)
        model.save_model(root_path + "\\lyricsAnalysis2\\svmmodel")
    else:
        model = Sentiment()
        model.load_model(root_path + "\\lyricsAnalysis2\\svmmodel")

    result = model.predict_datalist(lyric)  # lyric is a list, one entry per song
    data = []
    count = 1
    for prob in result:
        # One positive-probability row and one complementary negative row
        # per song, keyed by the song's 1-based index.
        data.append([count, prob, "Pos"])
        data.append([count, 1 - prob, "Neg"])
        count += 1
    ''' text visualization '''
    tr = ThemeRiver("Sentiment", title_color="#274C77", title_text_size=20)
    tr.add(['Pos', 'Neg'],
           data,
           is_label_show=True,
           is_datazoom_show=True,
           legend_text_color="#274C77",
           legend_text_size=15)
    tr.render("ThemeRiver.html")
# Example #3
# 0
 def __init__(self, c, best_words):
     """Set up an SVC-backed classifier with its training buffers.

     :param c: penalty parameter C forwarded to the underlying SVC.
     :param best_words: feature vocabulary used during extraction.
     """
     self.best_words = best_words
     self.train_data = []
     self.train_label = []
     self.seg = Seg()
     self.clf = SVC(probability=True, C=c)
# Example #4
# 0
# File: main.py — Project: AotY/Play_Interview
 def get_seg(self, fname='seg.pickle'):
     """Return a Seg instance restored from the pickle file *fname*."""
     segmenter = Seg()
     segmenter.load(fname)
     return segmenter
# Example #5
# 0
 def __init__(self):
     """Build a Bayes classifier and a segmenter restored from seg.pickle."""
     self.seg = Seg()
     self.seg.load('seg.pickle')
     self.classifier = Bayes()
 def __init__(self, best_words=None):
     """Wrap an SVM (C=50) built over the optional *best_words* vocabulary."""
     self.seg = Seg()
     self.svm = SVM(50, best_words)
# Example #7
# 0
 def __init__(self, method=1, doc=None, datalist=None):
     """Bundle input text with a segmenter and a sentiment model.

     :param method: backend selector forwarded to Sentiment.
     :param doc: optional single document.
     :param datalist: optional list of documents.
     """
     self.method = method
     self.doc = doc
     self.datalist = datalist
     self.seg = Seg()
     self.sentiment = Sentiment(method)
# Example #8
# 0
# File: main.py — Project: AotY/Play_Interview
from seg import Seg

# Translation of the note below:
#   1. Estimate the HMM parameters by direct (maximum-likelihood) estimation.
#   2. Use the Viterbi algorithm to choose the most probable segmentation
#      sequence.
'''
1, 使用直接估计法估计HMM的参数
2,使用 viterbi 算法选择概率最大的分词序列

'''

# Module-level segmenter shared by train() below.
segger = Seg()


# Training line example (one "char/tag" token per character):
# 澳/b 门/e 的/s 回/b 归/e 一/b 定/e 能/b 够/e 顺/b 利/e 实/b 现/e 。/s
# (presumably b=begin, e=end, s=single-character word — TODO confirm tag set)
def train(fname, limit=10000):
    """Train the module-level `segger` on up to *limit* tagged lines of *fname*.

    Each non-empty line holds whitespace-separated "char/tag" tokens; every
    line is parsed into a list of [char, tag] pairs before being handed to
    segger.train().

    :param fname: path to the UTF-8 training corpus.
    :param limit: maximum number of non-empty lines to consume (default
                  10000, matching the original hard-coded cap).
    """
    datas = []
    used = 0
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            if used == limit:
                break

            line = line.rstrip()
            if not line:
                continue

            datas.append([token.split('/') for token in line.split()])
            used += 1

    segger.train(datas)