def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)

    with codecs.open(FLAGS.input_file_vali, encoding='utf-8') as f_v:
        text_v = f_v.read()
    # converter_v = TextConverter(text_v, FLAGS.max_vocab)
    # converter_v.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr_v = converter.text_to_arr(text_v)
    g_v = batch_generator(arr_v, FLAGS.num_seqs, FLAGS.num_steps)
    # print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                g_v)
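# The training mains in this listing rely on command-line flags defined elsewhere in the
# repository (typically via tf.app.flags in TF 1.x). A minimal sketch of the definitions
# they assume is shown below; the flag names come straight from the usage above, but the
# default values and help strings are illustrative assumptions, not the originals.
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('name', 'default', 'model name, used as the save directory')
tf.app.flags.DEFINE_string('input_file', '', 'utf-8 encoded training text')
tf.app.flags.DEFINE_integer('max_vocab', 3500, 'maximum vocabulary size kept by TextConverter')
tf.app.flags.DEFINE_integer('num_seqs', 100, 'number of sequences per batch')
tf.app.flags.DEFINE_integer('num_steps', 100, 'number of characters per sequence')
tf.app.flags.DEFINE_integer('lstm_size', 128, 'number of units in each LSTM cell')
tf.app.flags.DEFINE_integer('num_layers', 2, 'number of LSTM layers')
tf.app.flags.DEFINE_float('learning_rate', 0.001, 'learning rate')
tf.app.flags.DEFINE_float('train_keep_prob', 0.5, 'dropout keep probability during training')
tf.app.flags.DEFINE_boolean('use_embedding', False, 'whether to use an embedding layer')
tf.app.flags.DEFINE_integer('embedding_size', 128, 'embedding dimension')
tf.app.flags.DEFINE_integer('max_steps', 100000, 'total number of training steps')
tf.app.flags.DEFINE_integer('save_every_n', 1000, 'save a checkpoint every n steps')
tf.app.flags.DEFINE_integer('log_every_n', 10, 'log every n steps')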
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # directory where the model is saved
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # codecs.open lets us specify the file encoding; the contents are decoded to unicode on read
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the training text
    converter = TextConverter(text, FLAGS.max_vocab)  # build the vocabulary/converter for the text
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)  # convert the text into an array of ids
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # batch generation
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # build the model
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,  # train
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def main(_):
    script_path = os.path.abspath(os.path.dirname(__file__))
    model_path = os.path.join(script_path, 'model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    print("corpus size " + str(len(text)))
    if os.path.exists(FLAGS.whitelist_file):
        with codecs.open(FLAGS.whitelist_file, encoding='utf-8') as f:
            whitelist = f.read()
        text = remove_non_matching_chars(text, whitelist)
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, None, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    # start = converter.text_to_arr(FLAGS.seed_for_generating)
    seeds = [
        'var a = fun',
        'function a(',
        'this.',
        'document.',
        'window.',
        'var a = document.g',
        'var a;',
        'jQuery'
    ]

    for seed in seeds:
        start = converter.text_to_arr(seed)
        for i in range(0, FLAGS.num_to_generate):
            print('Generating: ' + seed + ' -> ' + str(i))
            file_name = str(uuid.uuid1())
            file_path = '../../BrowserFuzzingData/generated/' + FLAGS.file_type + '/' + file_name + '.' + FLAGS.file_type
            arr = model.sample(FLAGS.max_length_of_generated, start,
                               converter.vocab_size, converter.word_to_int)
            with open(file_path, "wb") as f:
                f.write(converter.arr_to_text(arr).encode('utf-8'))
def main(_):
    FLAGS.start_string = FLAGS.start_string  # .decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start_string = FLAGS.start_string
    sys.stdout.write("> ")
    sys.stdout.flush()
    start_string = sys.stdin.readline()
    while start_string:
        start = converter.text_to_arr(start_string)
        arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
        print(converter.arr_to_text(arr))
        sys.stdout.write("> ")
        sys.stdout.flush()
        # read the next prompt into the loop variable (the original assigned it to an
        # unused `sentence` variable, which made the loop repeat the first input forever)
        start_string = sys.stdin.readline()
def main(_):
    model_path = os.path.join('models', Config.file_name)

    et = TextConverter(text=None, save_dir='models/en_vocab.pkl',
                       max_vocab=Config.en_vocab_size,
                       seq_length=Config.seq_length)
    # +1 because the decoder sequence is split into input=[:-1] and label=[1:]
    zt = TextConverter(text=None, save_dir='models/zh_vocab.pkl',
                       max_vocab=Config.zh_vocab_size,
                       seq_length=Config.seq_length + 1)
    print('english vocab lens:', et.vocab_size)
    print('chinese vocab lens:', zt.vocab_size)

    # load the most recently saved model
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    while True:
        # english_speek = 'what can i help you ?'
        # print('english:', english_speek)
        english_speek = input("english:")
        english_speek = english_speek.split()
        en_arr, arr_len = et.text_to_arr(english_speek)

        test_g = [np.array([en_arr, ]), np.array([arr_len, ])]
        output_ids = model.test(test_g, model_path, zt)
        strs = zt.arr_to_text(output_ids)
        print('chinese:', strs)
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    # print(model_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seq, FLAGS.num_step)
    print(converter.vocab_size)
    model = CharModel(converter.vocab_size,
                      num_seq=FLAGS.num_seq,
                      num_step=FLAGS.num_step,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      # learning_rate=FLAGS.learning_rate,
                      train_keep_prob=FLAGS.train_keep_prob,
                      # use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size,
                      is_Training=True)
    # model.add_placeholder()
    # model.build_lstm()
    # model.build_loss()
    # model.build_optimizer()
    model.train(g, FLAGS.max_steps, model_path)
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)  # create the model save directory
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the text
    converter = TextConverter(text, FLAGS.max_vocab)  # build the vocabulary mapping
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))  # save the mapping under model_path
    arr = converter.text_to_arr(text)  # convert the text to ids
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # create the batch generator
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # create the model instance
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,  # train the model
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # build the path string
    if not os.path.exists(model_path):              # create the directory if needed
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()                             # read the whole file as a string
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)               # convert the text to ids
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # 100, 100
    print(converter.vocab_size)
    # build the model; num_classes is set to the vocabulary size because the next char is predicted
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,                                  # train the model
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    model = charRNN(converter.vocab_size, train=False)
    model.load(tf.train.latest_checkpoint(FLAGS.checkpoint_path))

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.generate(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
def test_batch_generator(self):
    with codecs.open('data/shakespeare.txt', encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, 35000)
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, 32, 50)
    count = 0
    for x, y in g:
        count += 1
    print(count)
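# batch_generator itself is not shown in these excerpts. For a character-level language
# model it conventionally splits the id array into num_seqs rows and yields (inputs,
# targets) windows of num_steps characters, where targets are the inputs shifted left by
# one. The sketch below follows that convention and matches the (x, y) unpacking used in
# the test above; it is an assumption about the helper, not the repository's exact code.
import numpy as np

def batch_generator(arr, num_seqs, num_steps):
    """Yield (inputs, targets) batches; targets are inputs shifted one character left."""
    arr = np.array(arr)
    batch_size = num_seqs * num_steps
    n_batches = len(arr) // batch_size
    arr = arr[:batch_size * n_batches].reshape((num_seqs, -1))
    for n in range(0, arr.shape[1], num_steps):
        x = arr[:, n:n + num_steps]
        y = np.zeros_like(x)
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]  # wrap the last target around
        yield x, y
# Some implementations wrap the loop in `while True` to cycle over the data indefinitely;
# the single-pass form is used here so the batch-counting test above terminates.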
def main(_):
    model_path = os.path.join('models', Config.file_name)
    input_file = 'data/去除2和null.xlsx'
    vocab_file = os.path.join(model_path, 'vocab_label.pkl')

    # data preprocessing
    converter = TextConverter(None, vocab_file,
                              max_vocab=Config.vocab_max_size,
                              seq_length=Config.seq_length)
    print('vocab size:', converter.vocab_size)

    # load the most recently saved model
    model = Model(Config, converter.vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    # build the test-set response library
    # test_libs = get_excel_libs('data/tianlong_libs.xlsx')  # use the full library of 30k+ responses
    QAs = get_excel_QAs(input_file)
    thres = int(0.8 * len(QAs))
    test_QAs = QAs[thres:]
    test_libs = [r for q, r, y in test_QAs]  # use the QA pairs

    test_libs_arrs = converter.libs_to_arrs(test_libs)

    # pre-compute the response matching matrix for the library
    save_file = checkpoint_path + '_matul_state_QAs.pkl'
    if not os.path.exists(save_file):
        response_matul_state = model.test_to_matul(test_libs_arrs)
        with open(save_file, 'wb') as f:
            pickle.dump(response_matul_state, f)
    else:
        with open(save_file, 'rb') as f:
            response_matul_state = pickle.load(f)

    # testing
    print('start to testing...')
    QAY = []
    k, n = 0, 0
    for query, y_response, label in test_QAs:
        input_arr, input_len = converter.text_to_arr(query)
        indexs = model.test(input_arr, input_len, response_matul_state)
        responses = converter.index_to_response(indexs, test_libs)

        QAY.append((query, y_response, responses))
        if responses[0] == y_response:
            k += 1
            print(k, '/', n)
        n += 1
    print('accuracy:', k / float(n))

    result_xls = checkpoint_path + '_Q_for_QAs.xls'
    converter.save_to_excel(QAY, result_xls)
def test_vocab_size(self):
    testConverter = TextConverter(text=[
        "We", "are", "accounted", "poor", "citizens,", "the", "patricians",
        "goodare", "accounted", "poor", "citizens,", "the", "patricians",
        "good"
    ], max_vocab=10)
    print(testConverter.vocab_size)
    print(testConverter.int_to_word(4))
    print(testConverter.text_to_arr(['the']))
    print(testConverter.arr_to_text([3, 4]))
def initialize_converter(model_path):
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter_path = os.path.join(model_path, 'converter.pkl')
    if os.path.exists(converter_path):
        converter = TextConverter(filename=converter_path)
    else:
        converter = TextConverter(text, FLAGS.max_vocab)
        converter.save_to_file(converter_path)
    arr = converter.text_to_arr(text)
    return arr, converter
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
def main(_):
    model_path = os.path.join('model', 'en')
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with open("data/shakespeare.txt") as f:
        text = f.read()
    print("=====>", len(text))
    converter = TextConverter(text)
    converter.save(os.path.join(model_path, "converter.pkl"))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, batch_size, seq_len, converter=None)
    model = charRNN(converter.vocab_size)
    model.train(g, model_path)
def sample():
    with tf.Session() as sess:
        model_path = os.path.join(FLAGS.train_dir, FLAGS.model_name)
        converter = TextConverter(None, FLAGS.max_vocab_size,
                                  os.path.join(model_path, 'converter.pkl'))
        model = create_model(sess, converter.vocab_size, True, model_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        start_str = sys.stdin.readline().decode('utf-8')
        while start_str:
            start = converter.text_to_arr(start_str)
            samples = [c for c in start]
            initial_state = sess.run(model.initial_state)
            x = np.zeros((1, 1))
            # feed the seed string through the network one character at a time to warm up the state
            for c in start:
                x[0, 0] = c
                feed = {model.inputs: x, model.initial_state: initial_state}
                preds, final_state = sess.run(
                    [model.proba_prediction, model.final_state],
                    feed_dict=feed)
                initial_state = final_state

            c = pick_top_n(preds, converter.vocab_size)
            while c == converter.vocab_size - 1:  # re-sample while the last id (likely the out-of-vocabulary token) is drawn
                c = pick_top_n(preds, converter.vocab_size)
            samples.append(c)

            # generate new characters one at a time
            for i in range(FLAGS.sample_length):
                x[0, 0] = c
                feed = {model.inputs: x, model.initial_state: initial_state}
                preds, final_state = sess.run(
                    [model.proba_prediction, model.final_state],
                    feed_dict=feed)
                initial_state = final_state
                c = pick_top_n(preds, converter.vocab_size)
                while c == converter.vocab_size - 1:
                    c = pick_top_n(preds, converter.vocab_size)
                samples.append(c)

            print(converter.arr_to_text(np.array(samples)))
            sys.stdout.write("> ")
            sys.stdout.flush()
            start_str = sys.stdin.readline().decode('utf-8')
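# pick_top_n is called above but not defined in the excerpt. The usual char-RNN helper
# zeroes out everything except the top_n most probable characters and samples from the
# renormalised distribution; the version below is a sketch of that convention, not
# necessarily this repository's exact implementation.
import numpy as np

def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)          # (1, vocab_size) -> (vocab_size,)
    p[np.argsort(p)[:-top_n]] = 0  # keep only the top_n probabilities
    p = p / np.sum(p)              # renormalise so the kept probabilities sum to 1
    return np.random.choice(vocab_size, 1, p=p)[0]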
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    print(model_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        path_exist = False
    else:
        path_exist = True
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model_file_path = tf.train.latest_checkpoint(model_path)
    if path_exist:
        # resume from the latest checkpoint and recover the step counter from the
        # *.index files (checkpoints assumed to be named like model-12345.index)
        model.load(model_file_path)
        indexes = []
        for dirpath, dirnames, filenames in os.walk(model_path):
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                if filepath.endswith(".index"):
                    indexes.append(int(name[6:-6]))
        indexes.sort()
        last_index = indexes[-1]
        model.step = last_index
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)  # create the text converter
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_path)  # pick up the latest checkpoint

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)  # load the model

    start = converter.text_to_arr(FLAGS.start_string)  # convert the input text to ids
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)  # the output is the generated sequence
    print(converter.arr_to_text(arr))
def generate():
    tf.compat.v1.disable_eager_execution()
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    return converter.arr_to_text(arr)
class Dianpin(Singleton):
    def __init__(self):
        self.text = ''
        self.tfmodel = None
        self.converter = None

    def model_built(self):  # ,vocab_size,sampling,lstm_size,num_layers,use_embedding,embedding_size):
        FLAGS.start_string = FLAGS.start_string.decode('utf-8')
        self.converter = TextConverter(filename=FLAGS.converter_path)
        if os.path.isdir(FLAGS.checkpoint_path):
            FLAGS.checkpoint_path = \
                tf.train.latest_checkpoint(FLAGS.checkpoint_path)

        self.tfmodel = CharRNN(self.converter.vocab_size, sampling=True,
                               lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                               use_embedding=FLAGS.use_embedding,
                               embedding_size=FLAGS.embedding_size)
        self.tfmodel.load(FLAGS.checkpoint_path)

    def final_predict(self):
        start = self.converter.text_to_arr(FLAGS.start_string)
        arr = self.tfmodel.sample(FLAGS.max_length, start, self.converter.vocab_size)
        return self.converter.arr_to_text(arr)
def main(_):
    FLAGS.start_string = FLAGS.start_string
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.predict(FLAGS.max_length, start, converter.vocab_size, 10)
    for c, p in arr:
        prediction = converter.arr_to_text(c)
        prediction = remove_return(prediction)
        # If Chinese characters are generated, change {1:^14} to {1:{4}^14} to fix the alignment.
        # The width 14 in {1:^14} depends on how many characters are generated;
        # character count + 4 is usually a reasonable choice.
        print("{0} -> {1:^14} {2} {3}".format(FLAGS.start_string, prediction,
                                              "probability:", p, chr(12288)))
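# A worked example of the alignment fix described in the comment above: passing chr(12288)
# (the full-width ideographic space) as the fill character via {1:{4}^14} keeps the columns
# aligned when the centred field contains Chinese characters. The seed, prediction, and
# probability values here are illustrative placeholders, not output from the model.
print("{0} -> {1:{4}^14} {2} {3}".format("天", "天下第一", "probability:", 0.42, chr(12288)))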
def main(_):
    ## Preprocess the data: the TextConverter class from the read_utils.py module keeps the
    ## most frequent characters and maps each one to an index; batch_generator then turns
    ## the resulting id array into a batch generator.
    model_path = os.path.join('model', FLAGS.name)  # join the save path
    print("model save location: ", model_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)  # create the directory recursively

    # To read Chinese text in Python, open the file with codecs and encoding='utf-8'
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        print("training data source:", FLAGS.input_file)
        text = f.read()  # read the training corpus
    converter = TextConverter(text, FLAGS.max_vocab)
    # serialize and save the frequency-filtered vocabulary
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)  # map each character to its index
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # obtain a batch generator
    print(converter.vocab_size)  # print the vocabulary size

    ## With the data prepared, build the recurrent network with the CharRNN class from
    ## model.py and train it with train().
    model = CharRNN(converter.vocab_size,              # number of character classes
                    num_seqs=FLAGS.num_seqs,           # sequences per batch
                    num_steps=FLAGS.num_steps,         # characters per sequence
                    lstm_size=FLAGS.lstm_size,         # units in each LSTM cell
                    num_layers=FLAGS.num_layers,       # number of RNN layers
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
def poem_genetate(poem_start=u'君'):
    # FLAGS.start_string = FLAGS.start_string
    # FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    print(FLAGS.checkpoint_path)
    """
    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    """
    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=lstm_size, num_layers=num_layers,
                    use_embedding=use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)

    # start = converter.text_to_arr(start_string)
    start1 = converter.text_to_arr(poem_start)
    arr = model.sample(max_length, start1, converter.vocab_size)
    # pl = model.poemline(max_length, start, converter.vocab_size)
    # sp = model.sample_hide_poetry(start, converter.vocab_size)
    poem = converter.arr_to_text(arr)
    # print(converter.arr_to_text(sp))
    print('---------')
    print(poem)
    print('---------')
    # print(converter.arr_to_text(pl))
    print('---------')

    # vocab ids: 0 = ',', 1 = '。', 2 = '\n'; each line is 12 characters and must not
    # contain more than one ',' or '。'
    lines = poem.split('\n')
    r_poem = []
    for i in range(len(lines)):
        if len(lines[i]) == 12:
            count = 0
            print(lines[i][5])
            if lines[i][5] == ',':
                print("true")
            if lines[i][5] == u',':
                print("u true")
            if lines[i][5] == u',' and lines[i][11] == u'。':
                for j in range(len(lines[i])):
                    if lines[i][j] == u',' or lines[i][j] == u'。':
                        count += 1
                if count == 2:
                    r_poem.append(lines[i])
                    if len(r_poem) == 2:
                        break
    """
    lines = poem.split('\n')
    r_poem = []
    for i in range(len(lines)):
        if len(lines[i]) == 12:
            count = 0
            if lines[i][5] == 0 and lines[i][11] == 1:
                for j in range(len(lines[i])):
                    if lines[i][j] == 0 or lines[i][j] == 1:
                        count += 1
                if count == 2:
                    r_poem.append(lines[i])
                    if len(r_poem) == 2:
                        break
    """
    with codecs.open("app/poem.txt", "w", 'utf-8') as f:
        words = "".join(r_poem)
        print(lines)
        print(r_poem)
        print(words)
        # words = words.decode('utf-8')
        f.write(words)
def train():
    with tf.Session() as sess:
        model_path = os.path.join(FLAGS.train_dir, FLAGS.model_name)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        checkpoint_path = os.path.join(model_path, "generate.ckpt")

        with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
            text = f.read()  # .replace("\n", "")

        converter_path = os.path.join(model_path, 'converter.pkl')
        if not os.path.exists(converter_path):
            print("construct converter.")
            converter = TextConverter(text, FLAGS.max_vocab_size)
            converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
        else:
            print("load converter")
            converter = TextConverter(None, FLAGS.max_vocab_size, converter_path)
        print("actual vocabulary size is: " + str(converter.vocab_size))

        arr = converter.text_to_arr(text)
        sent_len_p = [1.0 / len(train_sentence_length) for l in train_sentence_length]
        max_time = np.random.choice(train_sentence_length, 1, p=sent_len_p)[0]
        batch_cnt = get_batch_cnt(arr, FLAGS.batch_size, max_time)
        current_step_batch = 0

        # create model
        print("Creating %d layers of %d units for max time %d." %
              (FLAGS.num_layers, FLAGS.lstm_size, max_time))
        model = create_model(sess, converter.vocab_size, False, model_path)
        if FLAGS.set_learning_rate > 0:
            model.set_learning_rate(sess, FLAGS.set_learning_rate)

        loss_per_checkpoint = 0.0
        current_step = 0
        previous_losses = []
        initial_state = sess.run(model.initial_state)
        while True:
            g = batch_generator(arr, FLAGS.batch_size, max_time)
            for inputs, targets in g:
                start_time = time.time()
                batch_loss, final_state = model.train_step(
                    sess, inputs, targets, initial_state)
                step_time = time.time() - start_time
                loss_per_checkpoint += batch_loss / FLAGS.steps_per_checkpoint
                current_step += 1
                current_step_batch += 1

                if current_step % FLAGS.steps_per_log == 0:
                    perplexity = math.exp(float(batch_loss)) if batch_loss < 300 else float("inf")
                    print("global step %d learning rate %.4f step-time %.2f perplexity "
                          "%.2f" % (model.global_step.eval(),
                                    model.learning_rate.eval(),
                                    step_time, perplexity))

                if current_step % FLAGS.steps_per_checkpoint == 0:
                    if len(previous_losses) > 2 and loss_per_checkpoint > max(
                            previous_losses[-3:]) and sess.run(model.learning_rate) >= 0.0002:
                        sess.run(model.learning_rate_decay_op)
                    previous_losses.append(loss_per_checkpoint)
                    loss_per_checkpoint = 0.0
                    model.saver.save(sess, checkpoint_path, global_step=model.global_step)

                if current_step_batch % batch_cnt == 0:
                    print("reset initial state")
                    initial_state = sess.run(model.initial_state)
                    current_step_batch = 0
                else:
                    initial_state = final_state

                if current_step % FLAGS.steps_per_sentence_length == 0:
                    max_time = np.random.choice(train_sentence_length, 1, p=sent_len_p)[0]
                    print("change max time: %d" % (max_time))
                    batch_cnt = get_batch_cnt(arr, FLAGS.batch_size, max_time)
                    current_step_batch = 0
                    initial_state = sess.run(model.initial_state)
                    break

                if current_step >= FLAGS.max_train_steps:
                    break
            if current_step >= FLAGS.max_train_steps:
                break
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)