def __init__(self):
    print('Loading model data...')
    print('Loading first-layer neural network model...')
    self.sentencerec_model = SentenceRecModel().load_trained_model()
    print('First-layer neural network model loaded.')
    print('Loading second-layer neural network model...')
    self.wordrec_model = WordRecModel().load_trained_model()
    print('Second-layer neural network model loaded.')
    print('Loading word-vector model...')
    self.wordvec_model = WordVecModel().load_trained_wordvec_model()
    print('Word-vector model loaded.')
    print('Model data loaded.')
    print('Loading word-segmentation dictionary...')
    jieba.load_userdict(config.CORPUS_DIC + '/name.txt')
    print('Word-segmentation dictionary loaded.')
    # Keep a handle to the default TensorFlow graph so later predictions
    # (possibly issued from other threads) run against the same graph.
    self.graph = tf.get_default_graph()
def __init__(self):
    super().__init__()
    print('Loading model data...')
    print('Loading first-layer neural network model...')
    self.sentencerec_model = SentenceRecModel().load_trained_model()
    print('First-layer neural network model loaded.')
    print('Loading second-layer neural network model...')
    self.wordrec_model = WordRecModel().load_trained_model()
    print('Second-layer neural network model loaded.')
    print('Loading word-vector model...')
    self.wordvec_model = WordVecModel().load_trained_wordvec_model()
    print('Word-vector model loaded.')
    print('Model data loaded.')
    print('Loading word-segmentation dictionary...')
    jieba.load_userdict(config.CORPUS_DIC + '/name.txt')
    print('Word-segmentation dictionary loaded.')
    self.init_server_socket()
    print('Listening for client chat connections...')
    # threading.Thread(target=self.accept_client).start()
    self.accept_client()
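# The constructor above calls self.init_server_socket() and self.accept_client(),
# whose bodies are not shown here. Below is a minimal sketch of what those two
# methods might look like, assuming a plain blocking TCP server. The host/port
# values and the handle_client helper are hypothetical, not part of the
# original code; the real implementation may differ.
import socket
import threading

def init_server_socket(self):
    # Create, bind, and start listening on a TCP socket (host/port assumed).
    self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.server_socket.bind(('0.0.0.0', 8888))
    self.server_socket.listen(5)

def accept_client(self):
    # Accept clients in a loop; each connection could be served on its own
    # thread, mirroring the commented-out threading.Thread call in __init__.
    while True:
        conn, addr = self.server_socket.accept()
        threading.Thread(target=self.handle_client, args=(conn, addr)).start()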
def _data2vec(self, sentences: list, labels: list):
    print('Converting data to vectors...')
    wordvec_model = WordVecModel().load_trained_wordvec_model()  # load the trained word-vector model
    sentencevec_list = []  # list of sentence vectors
    labelvec_list = []     # list of label vectors
    for sentence in sentences:
        sentencevec = []
        sent_len = 0
        for word in sentence.split():
            if sent_len < self.sentence_len:
                try:
                    sentencevec.append(wordvec_model[word])
                except KeyError:  # out-of-vocabulary word: fall back to the '。' vector
                    sentencevec.append(wordvec_model['。'])
            else:
                break  # truncate sentences longer than sentence_len
            sent_len += 1
        while sent_len < self.sentence_len:  # pad short sentences with the '。' vector
            sentencevec.append(wordvec_model['。'])
            sent_len += 1
        sentencevec_list.append(sentencevec)
    srmt = SentenceRecModelTool()
    for label in labels:
        labelvec = srmt.get_one_hot_vec(label)
        print(label, labelvec)
        labelvec_list.append(labelvec)
    print('Finished converting data to vectors.')
    return sentencevec_list, labelvec_list
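# For reference, a minimal sketch of how the two lists returned by _data2vec
# would typically feed a Keras-style training call. The model handle, batch
# size, and epoch count are assumptions for illustration, not values taken
# from the original code.
import numpy as np

train_x = np.array(sentencevec_list)  # shape: (n_samples, sentence_len, wordvec_size)
train_y = np.array(labelvec_list)     # shape: (n_samples, n_classes) — one one-hot vector per sentence
# model.fit(train_x, train_y, batch_size=32, epochs=10)  # hypothetical training call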
def test_wordrec_model():
    # Full BIOE tag set used by the second-layer (entity word) model.
    labels = ['B-company', 'I-company', 'E-company', 'B-time', 'I-time', 'E-time',
              'B-edu', 'I-edu', 'E-edu', 'B-name', 'I-name', 'E-name',
              'B-job', 'I-job', 'E-job', 'B-nationality', 'I-nationality', 'E-nationality',
              'B-sex', 'I-sex', 'E-sex', 'B-school', 'I-school', 'E-school',
              'B-pborn', 'I-pborn', 'E-pborn', 'O']
    count = 0
    right = 0
    wordvec_model = WordVecModel().load_trained_wordvec_model()
    wordrec_model = WordRecModel().load_trained_model()
    jieba.load_userdict(config.CORPUS_DIC + '/Chinese_Names_Corpus.txt')
    wrmt = WordRecModelTool()
    reader1 = bigfile.get_file_content(config.PREDATA_DIC + '/second_model_test_sentences.txt')
    reader2 = bigfile.get_file_content(config.PREDATA_DIC + '/second_model_test_labels.txt')
    # Use next(..., None) instead of __next__() so the loop ends cleanly at
    # end of file rather than raising StopIteration.
    sentence_line = next(reader1, None)
    gold_line = next(reader2, None)
    while sentence_line and gold_line:
        words = sentence_line.split()
        gold_labels = gold_line.split()
        if len(words) == len(gold_labels):
            sentencevec = []
            label_list = []  # gold labels padded/truncated to the fixed sentence length
            sen_len = 0
            for word in words:
                if sen_len < config.SENTENCE_LEN:
                    try:
                        sentencevec.append(wordvec_model[word])
                    except KeyError:  # out-of-vocabulary word: fall back to the '。' vector
                        sentencevec.append(wordvec_model['。'])
                    sen_len += 1
                else:
                    break
            while sen_len < config.SENTENCE_LEN:
                sentencevec.append(wordvec_model['。'])
                sen_len += 1
            sen_len = 0
            for label in gold_labels:
                if sen_len < config.SENTENCE_LEN:
                    label_list.append(label)
                    sen_len += 1
                else:
                    break
            while sen_len < config.SENTENCE_LEN:
                label_list.append('O')
                sen_len += 1
            # prepare the input tensor
            test_x = np.array(sentencevec).reshape(1, config.SENTENCE_LEN, config.WORDVEC_SIZE)
            # predict
            forecast = wordrec_model.predict(test_x)
            # convert the predicted vectors back to label strings
            word_label_list = wrmt.labelvecs2strs(forecast[0])
            print('Sentence:', words)
            print('Gold labels:', label_list)
            print('Predicted labels:', word_label_list)
            count += 1
            # A sentence counts as correct only if every tag matches.
            if all(gold == pred for gold, pred in zip(label_list, word_label_list)):
                right += 1
                print('right')
            else:
                print('wrong')
        sentence_line = next(reader1, None)
        gold_line = next(reader2, None)
    print('Total sentences:', count)
    print('Correctly predicted:', right)
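# The test above reports sentence-level exact-match accuracy: a sentence is
# correct only when every predicted tag matches. Token-level accuracy is often
# reported alongside it; a sketch under the same variable names is shown below.
# The token_count/token_right counters are additions, not part of the original
# script, and would be updated inside the per-sentence loop.
token_count = 0
token_right = 0
for gold, pred in zip(label_list, word_label_list):
    token_count += 1
    if gold == pred:
        token_right += 1
# After the loop over all sentences:
# print('Token-level accuracy:', token_right / token_count)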
def test_sentencerec_model():
    labels = ['ctime', 'ptime', 'peri', 'unc', 'noinfo']
    counts = [0, 0, 0, 0, 0]
    rights = [0, 0, 0, 0, 0]
    percents = []
    wordvec_model = WordVecModel().load_trained_wordvec_model()
    sentencerec_model = SentenceRecModel().load_trained_model()
    srt = SentenceRecModelTool()
    # wordrec_model = WordRecModel().load_trained_model()
    jieba.load_userdict(config.CORPUS_DIC + '/Chinese_Names_Corpus.txt')
    # Note: this reads the first model's *training* sentences file.
    reader = bigfile.get_file_content(config.PREDATA_DIC + '/first_model_train_sentences.txt')
    line = next(reader, None)
    while line:
        sentence_label = line.strip()  # one résumé record: 'sentence;;;label'
        if sentence_label:
            temp = sentence_label.split(';;;')
            if len(temp) == 2:
                sentence, label = temp
                if sentence.strip() != '':
                    sent_len = 0
                    sentencevec = []
                    for word in jieba.lcut(sentence):
                        if word not in (' ', '', '\n'):
                            if sent_len < config.SENTENCE_LEN:
                                try:
                                    sentencevec.append(wordvec_model[word])
                                except KeyError:  # out-of-vocabulary word
                                    sentencevec.append(wordvec_model['。'])
                            else:
                                break
                            sent_len += 1
                    while sent_len < config.SENTENCE_LEN:  # pad with the '。' vector
                        sentencevec.append(wordvec_model['。'])
                        sent_len += 1
                    test_x = np.array(sentencevec).reshape(1, config.SENTENCE_LEN, config.WORDVEC_SIZE)
                    forecast = sentencerec_model.predict(test_x)
                    forecast_label = srt.labelvec2str(forecast)
                    label_index = srt.get_index(label)
                    print('test', label, label_index)
                    counts[label_index] += 1
                    if label == forecast_label:
                        rights[label_index] += 1
                    print('Sentence: ' + sentence, 'Gold class: ' + label, 'Predicted class: ' + forecast_label)
        line = next(reader, None)
    for right, count in zip(rights, counts):
        # Guard against classes that never occur in the file.
        percents.append(right / count if count else 0.0)
    print('Labels:', labels)
    print('Label totals:', counts)
    print('Correct per label:', rights)
    print('Accuracy per label:', percents)
def _data2vec(self, rnndata_list: list, rnnlabel_list: list):
    print('Converting data to vectors...')
    wordvec_model = WordVecModel().load_trained_wordvec_model()  # load the trained word-vector model
    sentencevec_list = []
    labelvec_list = []
    for sentence in rnndata_list:
        sentencevec = []
        sent_len = 0
        for word in sentence.split():  # one line of words
            # debug ----------------
            print(sentence)
            if sent_len < self.sentence_len:  # still within the fixed sentence length
                try:
                    sentencevec.append(wordvec_model[word])
                except KeyError:  # out-of-vocabulary word: fall back to the '。' vector
                    sentencevec.append(wordvec_model['。'])
            else:
                break  # truncate sentences longer than sentence_len
            sent_len += 1
        while sent_len < self.sentence_len:  # pad short sentences with the '。' vector
            sentencevec.append(wordvec_model['。'])
            sent_len += 1
        sentencevec_list.append(sentencevec)
    wrmt = WordRecModelTool()
    for labels in rnnlabel_list:
        labelvec = []
        # debug ----------------
        print('labels start')
        sent_len = 0
        for label in labels.split():
            if sent_len < self.sentence_len:
                try:
                    # encode the label as a one-hot vector
                    print(label, wrmt.get_one_hot_vec(str(label)))
                    labelvec.append(wrmt.get_one_hot_vec(str(label)))
                except Exception:  # unknown label: fall back to 'O'
                    print('O', wrmt.get_one_hot_vec('O'))
                    labelvec.append(wrmt.get_one_hot_vec('O'))
            else:
                break
            sent_len += 1
        while sent_len < self.sentence_len:  # pad the label sequence with 'O'
            print('O', wrmt.get_one_hot_vec('O'))
            labelvec.append(wrmt.get_one_hot_vec('O'))
            sent_len += 1
        # debug ----------------
        print('labels end')
        labelvec_list.append(labelvec)
    print('Finished converting data to vectors.')
    return sentencevec_list, labelvec_list
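# WordRecModelTool.get_one_hot_vec is used above but its body is not shown.
# A plausible minimal implementation, assuming the 28-tag BIOE label set
# listed in test_wordrec_model, is sketched here; the actual tool may differ.
LABELS = ['B-company', 'I-company', 'E-company', 'B-time', 'I-time', 'E-time',
          'B-edu', 'I-edu', 'E-edu', 'B-name', 'I-name', 'E-name',
          'B-job', 'I-job', 'E-job', 'B-nationality', 'I-nationality', 'E-nationality',
          'B-sex', 'I-sex', 'E-sex', 'B-school', 'I-school', 'E-school',
          'B-pborn', 'I-pborn', 'E-pborn', 'O']

def get_one_hot_vec(label: str) -> list:
    # Return a one-hot list with a 1 at the label's index; an unknown label
    # raises ValueError, which the caller above catches and maps to 'O'.
    vec = [0] * len(LABELS)
    vec[LABELS.index(label)] = 1  # ValueError if label is not in LABELS
    return vec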
#!/usr/bin/python
# -*- coding: utf-8 -*-
from script.tool import splitword
from script.model.wordvec_model import WordVecModel
from script.model.sentencerec_model import SentenceRecModel
from script.model.wordrec_model import WordRecModel

# Overall pipeline for training all models
if __name__ == '__main__':
    # word segmentation
    splitword.split2word()
    # train the word-vector model
    WordVecModel().train_and_save_wordvec_model()
    # train the sentence-recognition model
    srm = SentenceRecModel()
    srm.train()
    srm.test()
    # train the entity-word-recognition model
    wrm = WordRecModel()
    wrm.train()
    wrm.test()
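# The scripts above all read paths and dimensions from a shared config module.
# A minimal sketch of what that module might contain is shown below; only the
# constant names are taken from the code above, and every value here is a
# placeholder assumption.
# config.py
CORPUS_DIC = './data/corpus'    # jieba dictionaries (name.txt, Chinese_Names_Corpus.txt)
PREDATA_DIC = './data/predata'  # preprocessed train/test text files
SENTENCE_LEN = 100              # fixed sentence length (truncate/pad target)
WORDVEC_SIZE = 60               # dimensionality of the word vectors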