# Excerpt of a training script for a `toy_uda` model: load the pre-processed
# feature sets, build the model, open a session and start the epoch loop.
# Only the beginning of the epoch loop body is present in this excerpt.
# NOTE(review): pickle.load runs arbitrary code from the file — only load
# trusted inputs. The three file handles opened here are never closed.
print("extract training data~~~~")
sup_features = pickle.load(open(hp.sup_prepro, 'rb'))
dev_features = pickle.load(open(hp.dev_prepro, 'rb'))
unsup_features = pickle.load(open(hp.unsup_prepro, 'rb'))

print("# Load model")
m = toy_uda(hp)

print("# Session")
saver = tf.train.Saver(max_to_keep=hp.num_epochs)
with tf.Session() as sess:
    # Resume from the newest checkpoint in hp.modeldir when one exists,
    # otherwise initialize all variables and dump the variable specs.
    ckpt = tf.train.latest_checkpoint(hp.modeldir)
    if ckpt is None:
        print("Initializing from scratch")
        sess.run(tf.global_variables_initializer())
        save_variable_specs(os.path.join(hp.modeldir, "specs"))
    else:
        saver.restore(sess, ckpt)

    # NOTE(review): the original indentation was lost; this assignment is
    # assumed to run on both branches (after the if/else) — confirm.
    m.model = True
    _gs = sess.run(m.global_step)

    tolerant = 0
    for epoch in range(hp.num_epochs):
        total_loss = 0.0
        total_acc = 0.0
        print("<<<<<<<<<<<<<<<< epoch {} >>>>>>>>>>>>>>>>".format(epoch))
        sup_len = len(sup_features)
        sup_batch_num = calc_num_batches(sup_len, hp.sup_batch_size)
        unsup_len = len(unsup_features)
        # The unsupervised batch size is the supervised batch size scaled by
        # hp.unsup_ratio.
        unsup_batch_num = calc_num_batches(unsup_len, hp.sup_batch_size * hp.unsup_ratio)
# Excerpt of a Transformer training script. `iter`, `xs`, `ys`,
# `eval_batches`, `train_init_op` and `num_train_batches` are defined before
# this excerpt; the epoch-end branch continues after it.
eval_init_op = iter.make_initializer(eval_batches)

logging.info("# Load model")
m = Transformer(hp)
# Both the training ops and the evaluation ops read from the same iterator
# outputs (xs, ys).
loss, train_op, global_step, train_summaries = m.train(xs, ys)
y_hat, eval_summaries = m.eval(xs, ys)
# y_hat = m.infer(xs, ys)

logging.info("# Session")
saver = tf.train.Saver(max_to_keep=hp.num_epochs)
with tf.Session() as sess:
    # Resume from the newest checkpoint in hp.logdir when one exists,
    # otherwise initialize all variables and dump the variable specs.
    ckpt = tf.train.latest_checkpoint(hp.logdir)
    if ckpt is None:
        logging.info("Initializing from scratch")
        sess.run(tf.global_variables_initializer())
        save_variable_specs(os.path.join(hp.logdir, "specs"))
    else:
        saver.restore(sess, ckpt)

    summary_writer = tf.summary.FileWriter(hp.logdir, sess.graph)

    # Point the shared iterator at the training dataset.
    sess.run(train_init_op)
    total_steps = hp.num_epochs * num_train_batches
    # Start the step range at the (possibly restored) global step.
    _gs = sess.run(global_step)
    for i in tqdm(range(_gs, total_steps + 1)):
        _, _gs, _summary = sess.run([train_op, global_step, train_summaries])
        epoch = math.ceil(_gs / num_train_batches)
        summary_writer.add_summary(_summary, _gs)

        # True whenever the global step is a non-zero multiple of the number
        # of training batches, i.e. at the end of a full pass.
        if _gs and _gs % num_train_batches == 0:
            logging.info("epoch {} is done".format(epoch))
def train_(self, epochs):
    """Train ``self.model`` and evaluate it after every pass over the training data.

    Resumes from the latest checkpoint in ``self.model_dir`` when there is
    one; otherwise all variables are initialized and the variable specs are
    written. Every time ``global_step`` reaches a non-zero multiple of the
    number of training batches, the method logs the evaluation summaries,
    writes the hypotheses to a file under ``../data/eval/1``, calls
    ``calc_bleu`` on that file and saves a checkpoint.

    Args:
        epochs: number of passes over the training set; the step budget is
            ``epochs * num_train_batches``.
    """
    # Single source of truth for the output directory. The original checked
    # 'data/eval/1' but created '../data/eval/1', so the second epoch failed
    # with FileExistsError.
    eval_dir = '../data/eval/1'

    train_batches, num_train_batches, num_train_samples = get_batch(
        '../data/iwslt2016/segmented/train.de.bpe',
        '../data/iwslt2016/segmented/train.en.bpe',
        self.sequence_length,
        self.sequence_length,
        self.vocab_file,
        self.batch_size,
        shuffle=True)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        '../data/iwslt2016/segmented/eval.de.bpe',
        '../data/iwslt2016/segmented/eval.en.bpe',
        100000,
        100000,
        self.vocab_file,
        self.batch_size,
        shuffle=False)

    # One re-initializable iterator feeds both training and evaluation; the
    # init ops switch which dataset it reads from.
    batch_iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                                 train_batches.output_shapes)
    xs, ys = batch_iter.get_next()
    train_init_op = batch_iter.make_initializer(train_batches)
    eval_init_op = batch_iter.make_initializer(eval_batches)

    loss, train_op, global_step, train_summaries = self.model.train(xs, ys)
    y_hat, eval_summaries = self.model.eval(xs, ys)

    logging.info("# Session")
    with tf.Session() as sess:
        ckpt = tf.train.latest_checkpoint(self.model_dir)
        if ckpt is None:
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
            # NOTE(review): specs go to a hard-coded directory rather than
            # self.model_dir — confirm this is intended.
            save_variable_specs(os.path.join('../data/log/1', "specs"))
        else:
            self.saver.restore(sess, ckpt)

        summary_writer = tf.summary.FileWriter(self.model_dir, sess.graph)
        try:
            sess.run(train_init_op)
            total_steps = epochs * num_train_batches
            # Start the step range at the (possibly restored) global step.
            _gs = sess.run(global_step)
            for i in tqdm(range(_gs, total_steps + 1)):
                _, _gs, _summary = sess.run(
                    [train_op, global_step, train_summaries])
                epoch = math.ceil(_gs / num_train_batches)
                summary_writer.add_summary(_summary, _gs)

                # Only act at the end of a full pass over the training data.
                if not _gs or _gs % num_train_batches != 0:
                    continue

                logging.info("epoch {} is done".format(epoch))
                _loss = sess.run(loss)  # train loss

                logging.info("# test evaluation")
                _, _eval_summaries = sess.run(
                    [eval_init_op, eval_summaries])
                summary_writer.add_summary(_eval_summaries, _gs)

                logging.info("# get hypotheses")
                hypotheses = get_hypotheses(num_eval_batches,
                                            num_eval_samples, sess, y_hat,
                                            self.model.index_char)

                logging.info("# write results")
                model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss)
                os.makedirs(eval_dir, exist_ok=True)
                translation = os.path.join(eval_dir, model_output)
                with open(translation, 'w') as fout:
                    fout.write("\n".join(hypotheses))

                logging.info(
                    "# calc bleu score and append it to translation")
                calc_bleu('../data/iwslt2016/prepro/eval.en', translation)

                logging.info("# save models")
                self.saver.save(sess,
                                os.path.join(self.model_dir,
                                             'transformer.dat'),
                                global_step=_gs)

                # Switch the iterator back to the training dataset.
                sess.run(train_init_op)
        finally:
            # Flush and close the event file even when training aborts.
            summary_writer.close()

    logging.info("Done")
# Excerpt of a training routine: session start-up, checkpoint handling and
# the header of the training loop. Names such as `logdir`, `run_type`, `hp`,
# `saver`, `global_step`, `train_init_op`, `num_epochs` and
# `num_train_batches` are defined before this excerpt.
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    time_sess = time.time()
    ckpt = tf.train.latest_checkpoint(logdir)
    if ckpt is None or "new" == run_type:  # new run
        save_hparams(hp, logdir)
        logging.info("Initializing from scratch")
        sess.run(tf.global_variables_initializer())
    else:  # continue OR finetune
        saver.restore(sess, ckpt)
        if "finetune" == hp.run_type:  # finetune
            save_hparams(hp, logdir)
        else:  # continue
            batch_size = hp.batch_size

    # NOTE(review): the original indentation was lost; the statements below
    # are assumed to run on every branch — confirm.
    save_variable_specs(os.path.join(logdir, "var_specs"))
    save_operation_specs(os.path.join(logdir, "op_specs"))
    # NOTE(review): this handle is not closed within the visible excerpt.
    f_debug = open(os.path.join(logdir, "debug.txt"), "a")
    summary_writer = tf.summary.FileWriter(logdir, sess.graph)

    # Optionally reset the global step before training.
    if hp.zero_step:
        sess.run(global_step.assign(0))

    sess.run(train_init_op)
    total_steps = num_epochs * num_train_batches
    logging.info("total_steps:%s, num_epochs:%s, num_train_batches:%s", total_steps, num_epochs, num_train_batches)
    _gs = sess.run(global_step)
    logging.info("global_step is stated at %s", _gs)
    t_epoch = time.time()
    model_output = 'default'
    for i in tqdm(range(_gs, total_steps + 1)):
        # The loop body lies outside this excerpt.
def train_template(class_model, shuffle=True, save_model=True):
    """Parse hyper-parameters, build ``class_model`` and run its training loop.

    The run mode comes from ``hp.run_type``: ``"new"`` starts from scratch,
    ``"continue"`` reloads the saved hyper-parameters from ``hp.logdir`` and
    restores the latest checkpoint, ``"finetune"`` restores the latest
    checkpoint and saves the current hyper-parameters. Each time the global
    step reaches a non-zero multiple of the number of training batches the
    evaluation summaries are logged and, if ``save_model`` is set, a
    checkpoint is written. After the loop the graph is optionally frozen
    to ``<logdir>/<model_output>.pb``.

    Args:
        class_model: callable invoked as ``class_model(context)``; the result
            must provide ``train``, ``eval`` and ``get_inference_op_name``.
        shuffle: forwarded to ``get_batch`` for the training set. Shuffling
            is slow on large datasets, so turn it off there.
        save_model: when true, save a checkpoint per epoch and a frozen
            ``.pb`` graph at the end. Turn it off while tuning
            hyper-parameters.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    logging.info("# hparams")
    hparams = Hparams()
    parser = hparams.parser
    hp = parser.parse_args()
    run_type = hp.run_type
    logdir = hp.logdir
    batch_size = hp.batch_size
    num_epochs = hp.num_epochs
    task_type = hp.task_type
    assert hp.run_type in ("new", "continue", "finetune")
    if "continue" == hp.run_type:
        # The stored hyper-parameters override the command line, but an
        # explicitly given task type must agree with the stored one.
        load_hparams(hp, logdir)
        batch_size = hp.batch_size
        if task_type is not None:
            assert task_type == hp.task_type
        task_type = hp.task_type
    assert task_type is not None

    context = Context(hp)

    logging.info("# Prepare train/eval batches")
    logging.info("Use %s for training set", hp.train_data)
    logging.info("Use %s for evaluation set", hp.eval_data)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        fpath=hp.eval_data,
        task_type=task_type,
        input_indices=context.input_indices,
        vocabs=context.vocabs,
        context=context,
        batch_size=batch_size,
        shuffle=False)
    train_batches, num_train_batches, num_train_samples = get_batch(
        fpath=hp.train_data,
        task_type=task_type,
        input_indices=context.input_indices,
        vocabs=context.vocabs,
        context=context,
        batch_size=batch_size,
        shuffle=shuffle)

    # One re-initializable iterator of the training set's structure feeds
    # both training and evaluation; the init ops switch the dataset.
    iterr = tf.data.Iterator.from_structure(train_batches.output_types,
                                            train_batches.output_shapes)
    inputs_and_target = iterr.get_next()
    train_init_op = iterr.make_initializer(train_batches)
    eval_init_op = iterr.make_initializer(eval_batches)

    model = class_model(context)
    # The last element of each batch is the target, the rest are inputs.
    loss, train_op, global_step, train_summaries = model.train(
        inputs=inputs_and_target[:-1], targets=inputs_and_target[-1])
    _eval_outputs, eval_summaries = model.eval(
        inputs=inputs_and_target[:-1], targets=inputs_and_target[-1])
    inference_name = model.get_inference_op_name()
    logging.info("inference_node_name:%s", inference_name)

    logging.info("# Session")
    saver = tf.train.Saver(max_to_keep=num_epochs)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        time_sess = time.time()
        ckpt = tf.train.latest_checkpoint(logdir)
        if ckpt is None or "new" == run_type:  # new run
            save_hparams(hp, logdir)
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
        else:  # continue OR finetune
            saver.restore(sess, ckpt)
            if "finetune" == hp.run_type:  # finetune
                save_hparams(hp, logdir)

        # NOTE(review): the original indentation was lost; the specs are
        # assumed to be written on every run type — confirm.
        save_variable_specs(os.path.join(logdir, "var_specs"))
        save_operation_specs(os.path.join(logdir, "op_specs"))

        # The debug log is closed by the `with` even if training raises.
        with open(os.path.join(logdir, "debug.txt"), "a") as f_debug:
            summary_writer = tf.summary.FileWriter(logdir, sess.graph)
            try:
                if hp.zero_step:
                    sess.run(global_step.assign(0))

                sess.run(train_init_op)
                total_steps = num_epochs * num_train_batches
                logging.info(
                    "total_steps:%s, num_epochs:%s, num_train_batches:%s",
                    total_steps, num_epochs, num_train_batches)
                _gs = sess.run(global_step)
                logging.info("global_step is stated at %s", _gs)
                t_epoch = time.time()
                # Name used for the frozen graph when no checkpoint is
                # saved during this run.
                model_output = 'default'
                for i in tqdm(range(_gs, total_steps + 1)):
                    ts = time.time()
                    _, _gs, _summary = sess.run(
                        [train_op, global_step, train_summaries])
                    epoch = math.ceil(_gs / num_train_batches)
                    f_debug.write("train: epoch %s takes %s\n" %
                                  (epoch, time.time() - ts))
                    summary_writer.add_summary(_summary, _gs)

                    # Only act at the end of a full pass over the data.
                    if not _gs or _gs % num_train_batches != 0:
                        continue

                    logging.info("epoch {} is done".format(epoch))

                    # train loss
                    _loss = sess.run(loss)

                    # eval
                    logging.info("# eval evaluation")
                    _, _eval_summaries = sess.run(
                        [eval_init_op, eval_summaries])
                    summary_writer.add_summary(_eval_summaries, _gs)

                    if save_model:
                        # save checkpoint
                        logging.info("# save models")
                        model_output = "model%02dL%.2f" % (epoch, _loss)
                        ckpt_name = os.path.join(logdir, model_output)
                        saver.save(sess, ckpt_name, global_step=_gs)
                        logging.info(
                            "after training of {} epochs, {} has been saved.".
                            format(epoch, ckpt_name))

                    # proceed to next epoch
                    logging.info("# fall back to train mode")
                    ts = time.time()
                    sess.run(train_init_op)
                    logging.info("fallback_train: %s\t%s\t%s takes %s",
                                 i, _gs, epoch, time.time() - ts)
                    logging.info("epoch %s takes %s", epoch,
                                 time.time() - t_epoch)
                    t_epoch = time.time()
            finally:
                # Flush and close the event file even when training aborts.
                summary_writer.close()

            logging.info("Session runs for %s", time.time() - time_sess)

            if save_model:
                # save to pb
                # Strip the output index (":0"); split() leaves a name that
                # has no colon intact, whereas slicing at find() == -1
                # dropped its last character.
                inference_node_name = inference_name.split(":", 1)[0]
                graph_def = tf.graph_util.convert_variables_to_constants(
                    sess,
                    sess.graph_def,
                    output_node_names=[inference_node_name])
                tf.train.write_graph(graph_def,
                                     logdir,
                                     '%s.pb' % model_output,
                                     as_text=False)

    logging.info("Done")
logging.info("# Load model") # 这里即是加载模型,然后调用模型里的train()和eval()方法来进行训练和做评估。 m = Transformer(hp) loss, train_op, global_step, train_summaries = m.train(xs, ys) y_hat, eval_summaries = m.eval(xs, ys) # y_hat = m.infer(xs, ys) logging.info("# Session") saver = tf.train.Saver(max_to_keep=hp.num_epochs) with tf.Session() as sess: ckpt = tf.train.latest_checkpoint(hp.logdir) # tf.train.latest_checkpoint: 用来查找到最近的检查点文件。 if ckpt is None: logging.info("Initializing from scratch") sess.run(tf.global_variables_initializer()) # 初始化模型参数 save_variable_specs(os.path.join(hp.logdir, "specs")) # 保存训练过程中的一些参数变量。 else: saver.restore(sess, ckpt) summary_writer = tf.summary.FileWriter(hp.logdir, sess.graph) # 利用TensorBoard来进行数据可视化展示 sess.run(train_init_op) # 运行一次数据集生成器,即生成一次数据集 total_steps = hp.num_epochs * num_train_batches # 每epoch次要对num_train_batches个batch进行训练, # 也就是每次epoch都要对所有的batch进行一次训练,以此来计算总的计算次数。 _gs = sess.run(global_step) for i in tqdm(range(_gs, total_steps+1)): # 进度条模块 _, _gs, _summary = sess.run([train_op, global_step, train_summaries]) epoch = math.ceil(_gs / num_train_batches) summary_writer.add_summary(_summary, _gs)
def train(hp):
    """Train a ``Transformer`` with per-epoch evaluation and early stopping.

    Saves the hyper-parameters to ``hp.checkpoints_dir``, restores the latest
    checkpoint from there when one exists, and then trains by fetching each
    batch from the iterator and feeding it to the model's placeholders. Every
    time the global step reaches a non-zero multiple of the number of
    training batches it evaluates the model, writes the hypotheses to
    ``hp.evaldir``, calls ``calc_bleu_nltk``, saves a checkpoint and stops
    early once the value ``gl`` returned by ``calculate_earlystop_baseline``
    exceeds 20.0.

    Args:
        hp: hyper-parameter object; the attributes read here are
            ``checkpoints_dir``, ``train1``, ``train2``, ``eval1``, ``eval2``,
            ``maxlen1``, ``maxlen2``, ``vocab``, ``batch_size``,
            ``num_epochs`` and ``evaldir``.
    """
    save_hparams(hp, hp.checkpoints_dir)

    # Input pipelines for training and evaluation.
    logging.info("Prepare Train/Eval batches...")
    train_batches, num_train_batches, num_train_samples = get_batch(
        hp.train1, hp.train2,
        hp.maxlen1, hp.maxlen2,
        hp.vocab, hp.batch_size,
        shuffle=True)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        hp.eval1, hp.eval2,
        10000, 10000,
        hp.vocab, hp.batch_size,
        shuffle=False)

    # A single re-initializable iterator serves both datasets.
    batch_iterator = tf.data.Iterator.from_structure(
        train_batches.output_types, train_batches.output_shapes)
    xs, ys = batch_iterator.get_next()
    train_init_op = batch_iterator.make_initializer(train_batches)
    eval_init_op = batch_iterator.make_initializer(eval_batches)

    logging.info("Build model...")
    model = Transformer(hp)
    logging.info("Model is built!")

    def make_feed(input_x, decoder_input, target, training):
        """Map one fetched batch onto the model's placeholders."""
        return {
            model.input_x: input_x,
            model.decoder_input: decoder_input,
            model.target: target,
            model.is_training: training,
        }

    logging.info("Session initialize")
    saver = tf.train.Saver(max_to_keep=5)
    with tf.Session() as sess:
        # Restore the newest checkpoint, or start from fresh variables.
        ckpt = tf.train.latest_checkpoint(hp.checkpoints_dir)
        if ckpt is not None:
            saver.restore(sess, ckpt)
        else:
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
            save_variable_specs(os.path.join(hp.checkpoints_dir, "specs"))

        summary_writer = tf.summary.FileWriter(hp.checkpoints_dir,
                                               sess.graph)

        sess.run(train_init_op)
        total_steps = hp.num_epochs * num_train_batches
        _gs = sess.run(model.global_step)

        # Early-stopping state: window size, stop threshold, best dev loss
        # so far and the most recent dev losses.
        window = 5
        min_dev_loss = 0
        stop_alpha = 20.0
        eval_losses = []

        for i in tqdm(range(_gs, total_steps + 1)):
            # Pull one batch out of the iterator, then feed it back in.
            _input_x, _decoder_input, _target = sess.run(
                [xs[0], ys[0], ys[1]])
            _, _gs, _summary = sess.run(
                [model.train_op, model.global_step, model.summaries],
                feed_dict=make_feed(_input_x, _decoder_input, _target, True))
            epoch = math.ceil(_gs / num_train_batches)
            summary_writer.add_summary(_summary, _gs)

            # Everything below runs only at the end of a full pass.
            if not _gs or _gs % num_train_batches != 0:
                continue

            logging.info("Epoch {} is done".format(epoch))
            # Loss on the last training batch, with is_training fed as False.
            _loss = sess.run(
                model.loss,
                feed_dict=make_feed(_input_x, _decoder_input, _target, False))

            # Run the evaluation set through the model.
            y_hat, mean_loss = model.eval(sess, eval_init_op, xs, ys,
                                          num_eval_batches)

            # Convert ids back to tokens.
            logging.info("# Get hypotheses")
            hypotheses = get_hypotheses(num_eval_samples, y_hat,
                                        model.idx2token)

            # Persist the translations.
            if not os.path.exists(hp.evaldir):
                os.makedirs(hp.evaldir)
            logging.info("# Write results")
            model_output = "translation_E{:02d}L{:.2f}EL{:.2f}".format(
                epoch, _loss, mean_loss)
            translation = os.path.join(hp.evaldir, model_output)
            with open(translation, 'w', encoding="utf-8") as fout:
                fout.write("\n".join(hypotheses))

            logging.info(
                "# Calculate bleu score and append it to translation")
            calc_bleu_nltk(hp.eval2, translation)

            # Checkpoint the model under the same name as the translation.
            logging.info("# Save models")
            ckpt_name = os.path.join(hp.checkpoints_dir, model_output)
            saver.save(sess, ckpt_name, global_step=_gs)
            logging.info(
                "After training of {} epochs, {} has been saved.".format(
                    epoch, ckpt_name))

            # Early-stopping bookkeeping: the first evaluation seeds the
            # best dev loss, and only the last `window` losses are kept.
            if not eval_losses:
                min_dev_loss = mean_loss
            eval_losses.append(mean_loss)
            gl, p_k, pq_alpha = calculate_earlystop_baseline(
                mean_loss, min_dev_loss, eval_losses, window)
            if mean_loss < min_dev_loss:
                min_dev_loss = mean_loss
            eval_losses = eval_losses[-window:]
            logging.info(
                "GL(t): {:.4f}, P_k: {:.4f}, PQ_alpha: {:.4f}".format(
                    gl, p_k, pq_alpha))
            if gl > stop_alpha:
                logging.info(
                    "No optimization for a long time, auto-stopping...")
                break

            # Switch the iterator back to the training dataset.
            sess.run(train_init_op)

        summary_writer.close()

    logging.info("Done")