def __init__(self):
    """Set up the chatbot for inference: load the vocabulary, build the
    seq2seq graph, and restore trained weights from the checkpoint on disk."""
    self.size = 8                  # LSTM cell size
    self.GO_ID = 1                 # start-of-output-sequence marker
    self.EOS_ID = 2                # end-of-sequence marker
    self.PAD_ID = 0                # padding value (0)
    self.min_freq = 1              # words must exceed this sample frequency to enter the vocabulary
    self.epochs = 20000            # training epoch count (also selects the checkpoint directory)
    self.batch_num = 1000          # number of Q/A pairs used in training
    self.input_seq_len = 25        # encoder (input) sequence length
    self.output_seq_len = 50       # decoder (output) sequence length
    self.init_learning_rate = 0.5  # initial learning rate

    # Kept on the instance so num_encoder_symbols / num_decoder_symbols can be
    # derived dynamically from the loaded vocabulary.
    self.wordToken = word_token.WordToken()
    self.max_token_id = self.wordToken.load_file_list(
        ['./dialog/question', './dialog/answer'], self.min_freq)
    # +5 reserves room for the special ids (PAD / GO / EOS, etc.).
    self.num_encoder_symbols = self.max_token_id + 5
    self.num_decoder_symbols = self.max_token_id + 5

    self.sess = tf.Session()
    # Build the graph in feed-previous (inference) mode and keep only the
    # tensors needed to run predictions; the training ops are discarded.
    (self.encoder_inputs,
     self.decoder_inputs,
     self.target_weights,
     self.outputs,
     _loss,
     _update,
     ckpt_saver,
     _lr_decay_op,
     _lr) = self.get_model(feed_previous=True)
    ckpt_saver.restore(self.sess, './model/' + str(self.epochs) + '/demo_')
PAD_ID = 0 # 输出序列起始标记 GO_ID = 1 # 结尾标记 EOS_ID = 2 # LSTM神经元size size = 8 # 初始学习率 init_learning_rate = 1 # 在样本中出现频率超过这个值才会进入词表 min_freq = 10 # 训练的轮数 train_round = 10000 wordToken = word_token.WordToken() # 放在全局的位置,为了动态算出num_encoder_symbols和num_decoder_symbols max_token_id = wordToken.load_file_list( ['samples/question.txt', 'samples/answer.txt'], min_freq) num_encoder_symbols = max_token_id + 5 #表示encoder_inputs中的整数词id的数目 num_decoder_symbols = max_token_id + 5 def get_id_list_from(sentence): """ 获取输入句子的分词对应id列表 """ sentence_id_list = [] seg_list = jieba.cut(sentence) for str in seg_list:
# NOTE(review): tensorflow.contrib.legacy_seq2seq only exists in TF 1.x;
# this module is pinned to a TensorFlow 1.x environment.
from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq
import word_token
import jieba
import random
import bm25_fitness_data

size = 8  # LSTM cell size
GO_ID = 1  # start-of-output-sequence marker
EOS_ID = 2  # end-of-sequence marker
PAD_ID = 0  # padding value (0)
min_freq = 1  # words must exceed this sample frequency to enter the vocabulary
epochs = 20000  # number of training rounds
batch_num = 1000  # number of Q/A pairs used in training
input_seq_len = 25  # encoder (input) sequence length
output_seq_len = 50  # decoder (output) sequence length
init_learning_rate = 0.5  # initial learning rate
wordToken = word_token.WordToken()  # bag-of-words vocabulary model
# +5 accounts for the special ids: padding, end marker, output marker.
max_token_id = wordToken.load_file_list(['./samples/question', './samples/answer'], min_freq)
num_encoder_symbols = max_token_id + 5
num_decoder_symbols = max_token_id + 5
# NOTE(review): hardcoded API credentials — these should be moved to
# configuration / environment variables, not committed in source.
APP_ID = '21290378'
API_KEY = 'ZoKi9QNvTdPseK1jOSVWvGZK'
SECRET_KEY = 'jjHqdb1SDwQBecELrc7SaWIHXnYpg8HB'
# Baidu word/sentence-similarity API client.
# NOTE(review): the AipNlp import (from aip) is not visible in this chunk — confirm it exists above.
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
options = {}
options["model"] = "bert"
q_list = []  # questions
a_list = []  # answers
type_list = []  # illness types
with open('question.txt', encoding="utf-8") as f: