def separate_shuffle(data_files, suffix):
    """Shuffle each monolingual training file independently, in parallel.

    Args:
      data_files: list of training-data file paths, one per language.
      suffix: suffix passed through to ``shuffle_single_train`` for the
        shuffled output files.
    """
    assert isinstance(data_files, list)
    tic = time.time()
    # One (file, suffix) job per corpus; two workers, one per language.
    jobs = [(path, suffix) for path in data_files]
    with Pool(2) as pool:
        pool.starmap(shuffle_single_train, jobs)
    utils.print_out(
        " Shuffled monolingual training datasets separately, time %.2fs"
        % (time.time() - tic))
def run_external_eval(infer_model, infer_sess, model_dir, params, summary_writer):
    """Decode both translation directions and report external (Moses) BLEU.

    Loads the latest checkpoint into the inference graph, decodes the
    validation data into per-direction prediction files, scores them against
    the reference files, and logs one BLEU summary per direction.

    Returns:
      (scores, global_step) where scores maps direction tag -> BLEU float.
    """
    with infer_model.graph.as_default():
        loaded_infer_model, global_step = trainer_utils.create_or_load_model(
            infer_model.model, model_dir, infer_sess, "infer")
    misc_utils.print_out("# External BLEU evaluation, global step %d" % global_step)
    infer_sess.run(infer_model.iterator.initializer)
    lang1, lang2 = params["lang1"], params["lang2"]
    # One tag per direction, e.g. "en2fr" and "fr2en".
    tags = ["%s2%s" % (lang1, lang2), "%s2%s" % (lang2, lang1)]
    prefix = os.path.join(params["model_dir"], "output_eval")
    pred_files = ["%s_%s" % (prefix, tag) for tag in tags]
    ref_files = [params["lang1to2_ref"], params["lang2to1_ref"]]
    scores = trainer_utils.decode_and_evaluate(
        tags, loaded_infer_model, infer_sess, pred_files, ref_files,
        bleu_script_path=params["moses_bleu_script"])
    for tag, bleu in scores.items():
        add_summary(summary_writer, global_step, "%s_BLEU" % tag, bleu)
    return scores, global_step
def dual_inference(params):
    """Inference-only entry point: decode the validation data in both
    directions from the latest checkpoint in ``params["model_dir"]``.

    No BLEU is computed here because no reference files are supplied.
    """
    misc_utils.print_out("# lang1_valid_data and lang2_valid_data are used for inference.")
    infer_model = trainer_utils.create_infer_model(TrainerMT, params)
    config_proto = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    # Grow GPU memory on demand instead of grabbing it all up front.
    config_proto.gpu_options.allow_growth = True
    ckpt_path = tf.train.latest_checkpoint(params["model_dir"])
    with tf.Session(graph=infer_model.graph, config=config_proto) as sess:
        loaded_infer_model = trainer_utils.load_model(
            infer_model.model, ckpt_path, sess, "infer")
        with infer_model.graph.as_default():
            sess.run(infer_model.iterator.initializer)
        # Predictions go to "<model_dir>/output_pred_<tag>", one file per direction.
        output = os.path.join(params["model_dir"], "output_pred")
        tags = ["%s2%s" % (params["lang1"], params["lang2"]),
                "%s2%s" % (params["lang2"], params["lang1"])]
        pred_files = ["%s_%s" % (output, tag) for tag in tags]
        ref_files = []
        trainer_utils.decode_and_evaluate(
            tags, loaded_infer_model, sess, pred_files,
            ref_files)  # unused since it is empty.
def eval_moses_bleu(ref, hyp, bleu_script_path):
    """Score a hypothesis file against a reference file with Moses BLEU.

    Args:
      ref: path to the reference file (one sentence per line).
      hyp: path to the hypothesis file to score.
      bleu_script_path: path to multi-bleu.perl (may use a command prefix
        such as "perl /path/multi-bleu.perl"; a leading "~" is expanded).

    Returns:
      The aggregate BLEU score as a float, or -1 when the script output
      could not be parsed.
    """
    import shlex  # local import: only used here, keeps file-level imports untouched
    assert os.path.isfile(ref) and os.path.isfile(hyp)
    # Build an argv list instead of interpolating paths into a shell string
    # (shell=True): file names with shell metacharacters can no longer be
    # executed as commands. The original "script ref < hyp" redirection
    # becomes an explicit stdin handle.
    argv = shlex.split(os.path.expanduser(bleu_script_path)) + [ref]
    with open(hyp, "rb") as hyp_file:
        p = subprocess.Popen(argv, stdin=hyp_file, stdout=subprocess.PIPE)
        result = p.communicate()[0].decode("utf-8")
    if result.startswith('BLEU'):
        # multi-bleu.perl prints e.g. "BLEU = 26.52, 58.3/... (BP=...)";
        # characters 7 .. <first comma> hold the aggregate score.
        return float(result[7:result.index(',')])
    else:
        utils.print_out('Impossible to parse BLEU score! "%s"' % result)
        return -1
def process_stats(stats, info, global_step, steps_per_stats, log_f):
    """Turn accumulated stats into per-step averages and detect overflow.

    Writes the averages into ``info`` in place. Returns True (and logs a
    message) when either average loss is NaN, infinite, or absurdly large.
    """
    # Convert window totals to per-step averages.
    for src_key, avg_key in (("step_time", "avg_step_time"),
                             ("ae_loss", "avg_train_ae_loss"),
                             ("bt_loss", "avg_train_bt_loss")):
        info[avg_key] = stats[src_key] / steps_per_stats
    for loss in (info["avg_train_ae_loss"], info["avg_train_bt_loss"]):
        if math.isnan(loss) or math.isinf(loss) or loss > 1e20:
            misc_utils.print_out(" step %d overflow loss, stop early" % global_step, log_f)
            return True
    return False
def before_train(loaded_train_model, train_model, train_sess, global_step, log_f):
    """Set up bookkeeping immediately before entering the training loop.

    Returns:
      (stats, info, start_train_time): fresh stat accumulators, the info
      dict of running averages, and the wall-clock start time.
    """
    stats = init_stats()
    current_lr = loaded_train_model.learning_rate.eval(session=train_sess)
    info = {
        "avg_step_time": 0.0,
        "avg_train_ae_loss": 0.0,
        "avg_train_bt_loss": 0.0,
        "learning_rate": current_lr,
    }
    start_train_time = time.time()
    misc_utils.print_out(
        "# Start step %d, lr %g, %s" % (global_step, current_lr, time.ctime()),
        log_f)
    # Prime the training iterator so the first update can pull batches.
    train_sess.run(train_model.iterator.initializer)
    return stats, info, start_train_time
def create_or_load_model(model, model_dir, session, name):
    """Create translation model and initialize or load parameters in session.

    If ``model_dir`` holds a checkpoint, restore from it; otherwise run the
    variable/table initializers and immediately save a step-0 checkpoint so
    eval/infer graphs have something to load.

    Returns:
      (model, global_step) with global_step read from the (possibly restored)
      graph.
    """
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if latest_ckpt:
        model = load_model(model, latest_ckpt, session, name)
    else:
        start_time = time.time()
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        utils.print_out(" created %s model with fresh parameters, time %.2fs" %
                        (name, time.time() - start_time))
        # Save right away: other graphs (eval/infer) load from this directory.
        model.saver.save(
            session, os.path.join(model_dir, "model.ckpt"), global_step=0)
        utils.print_out("# Save model at global step 0, for initial eval/infer.")
    global_step = model.global_step.eval(session=session)
    return model, global_step
def decode_and_evaluate(tags, model, sess, pred_files, ref_files, tgt_eos="</s>",
                        bleu_script_path="~/mosesdecoder/scripts/generic/multi-bleu.perl"):
    """Decode both directions to ``pred_files`` and (optionally) score BLEU.

    Args:
      tags: direction labels, e.g. ["en2fr", "fr2en"], aligned with
        ``pred_files`` and ``ref_files``.
      model: inference model exposing ``infer(sess)`` which yields one batch
        of sampled word arrays per direction.
      sess: session for the inference graph.
      pred_files: two output paths, one per direction.
      ref_files: reference paths; when its length differs from
        ``pred_files`` (e.g. empty), BLEU scoring is skipped.
      tgt_eos: end-of-sentence token; output is truncated at its first
        occurrence.
      bleu_script_path: Moses multi-bleu script used for scoring.

    Returns:
      dict mapping tag -> BLEU score (empty when scoring was skipped).
    """
    start_time = time.time()
    num_sentences = 0
    if tgt_eos:
        # Decoded tokens are bytes; compare EOS in bytes too.
        tgt_eos = tgt_eos.encode("utf-8")
    pred_file_1to2, pred_file_2to1 = pred_files
    with codecs.getwriter("utf-8")(tf.gfile.GFile(pred_file_1to2, mode="w")) as pred_f_1to2:
        with codecs.getwriter("utf-8")(tf.gfile.GFile(pred_file_2to1, mode="w")) as pred_f_2to1:
            # Touch both files so they exist even if decoding yields nothing.
            pred_f_1to2.write("")
            pred_f_2to1.write("")
            while True:
                try:
                    # One result array per direction; same batch size for both.
                    sample_results = model.infer(sess)
                    batch_size = sample_results[0].shape[0]
                    for sample_words, pred_f in zip(sample_results, (pred_f_1to2, pred_f_2to1)):
                        for sent_id in range(batch_size):
                            output = sample_words[sent_id].tolist()
                            if tgt_eos and tgt_eos in output:
                                # Truncate at the first EOS token.
                                output = output[:output.index(tgt_eos)]
                            # pred_f.write((b" ".join(output) + b"\n").decode("utf-8"))
                            # Strip BPE continuation markers ("@@ " mid-word,
                            # "@@" at line end) to recover plain words.
                            pred_f.write(
                                (b" ".join(output).replace(b"@@ ", b"").replace(b"@@", b"") + b"\n").decode("utf-8"))
                    num_sentences += batch_size
                except tf.errors.OutOfRangeError:
                    # Iterator exhausted: decoding of the whole set is done.
                    utils.print_out(" done, num sentences 2 * %d, time %ds" %
                                    (num_sentences, time.time() - start_time))
                    break
    # Evaluation
    scores = {}
    if len(ref_files) == len(pred_files):
        for ref_file, pred_file, tag in zip(ref_files, pred_files, tags):
            bleu = eval_moses_bleu(ref_file, pred_file, bleu_script_path)
            scores[tag] = bleu
            utils.print_out(" %s BLEU: %.2f" % (tag, bleu))
    return scores
def _create_pretrained_emb_from_txt(vocab_file, embed_file, num_notpretrained_tokens=4, dtype=tf.float32):
    """Load pretrain embeding from embed_file, and return an embedding matrix.

    Args:
      embed_file: Path to a Glove formated embedding txt file.
      num_notpretrained_tokens: Make the first n tokens in the vocab file as
        not pretrained variables. Default is 4, which is "</s>, <s>, <unk>, <mask>".

    Returns:
      A numpy array of shape [vocab_size, emb_size] with one row per vocab
      token, in vocab order.
    """
    vocab, vocab_size = misc_utils.load_vocab(vocab_file)
    # notpretrained_tokens = vocab[:num_notpretrained_tokens]
    # TODO: hparam to control
    # Skip vocab[1] on purpose: that token's vector comes from the embed file.
    notpretrained_tokens = vocab[:1] + vocab[
        2:num_notpretrained_tokens]  # id=1 of </s> has been pretrained.
    misc_utils.print_out("# Using pre-trained embedding: %s." % embed_file)
    misc_utils.print_out(" Analyzing not pre-trained tokens: ")
    emb_dict, emb_size = misc_utils.load_embed_txt(embed_file)
    # The embed file covers the whole vocab except the not-pretrained tokens,
    # plus the one pretrained special token (vocab[1]) — hence the "+ 1".
    assert len(emb_dict) == vocab_size - num_notpretrained_tokens + 1
    for token in notpretrained_tokens:
        misc_utils.print_out(" %s" % token)
        if token == notpretrained_tokens[0]:
            # First special token (padding/EOS slot) gets a zero vector.
            emb_dict[token] = [0.0] * emb_size
        elif token not in emb_dict:
            # Random init scaled by 1/sqrt(emb_size) for the rest.
            emb_dict[token] = emb_size**-0.5 * np.random.randn(emb_size)
    emb_np = np.array([emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype())
    return emb_np
def _set_train_or_infer(self, res, reverse_vocab_tables, params):
    """Wire the graph outputs in ``res`` into train ops or inference outputs.

    In TRAIN mode, builds the optimizer, two train ops (auto-encoding and
    back-translation), and their summaries. In PREDICT mode, converts sampled
    ids back to words via the reverse vocabulary tables.

    Args:
      res: (ae_loss, bt_loss, sample_ids) from the model build.
      reverse_vocab_tables: dict lang -> id-to-word lookup table.
      params: hyperparameter dict.
    """
    if self.mode == tf.estimator.ModeKeys.TRAIN:
        self.ae_loss, self.bt_loss, _ = res
    else:
        _, _, sample_ids = res
        self.sample_ids_1to2, self.sample_ids_2to1 = sample_ids
    if self.mode == tf.estimator.ModeKeys.PREDICT:
        # Map sampled ids back to target-language words.
        self.sample_words_1to2 = reverse_vocab_tables[params["lang2"]].lookup(tf.to_int64(self.sample_ids_1to2))
        self.sample_words_2to1 = reverse_vocab_tables[params["lang1"]].lookup(tf.to_int64(self.sample_ids_2to1))
    # start to optimize
    tvars = tf.trainable_variables()
    if self.mode == tf.estimator.ModeKeys.TRAIN:
        self.learning_rate = trainer_utils.get_learning_rate(
            learning_rate=params["learning_rate"],
            step=self.global_step,
            hidden_size=params["hidden_size"],
            learning_rate_warmup_steps=params["learning_rate_warmup_steps"],
            noam_decay=params["noam_decay"])
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            self.learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"])
        # Both train ops share one optimizer and global step; only the
        # back-translation op increments the step, so one AE update plus one
        # BT update together count as a single training step.
        self.ae_train_op = tf.contrib.layers.optimize_loss(
            self.lambda_xe * self.ae_loss,
            self.global_step,
            learning_rate=None,
            optimizer=optimizer,
            variables=tvars,
            clip_gradients=params["clip_grad_norm"],
            colocate_gradients_with_ops=True,
            increment_global_step=False)
        self.bt_train_op = tf.contrib.layers.optimize_loss(
            self.lambda_xe * self.bt_loss,
            self.global_step,
            learning_rate=None,
            optimizer=optimizer,
            variables=tvars,
            clip_gradients=params["clip_grad_norm"],
            colocate_gradients_with_ops=True,
            increment_global_step=True)
        self.train_ae_summary = tf.summary.merge([tf.summary.scalar("lr", self.learning_rate),
                                                  tf.summary.scalar("ae_loss", self.ae_loss)])
        self.train_bt_summary = tf.summary.merge([tf.summary.scalar("lr", self.learning_rate),
                                                  tf.summary.scalar("bt_loss", self.bt_loss)])
    misc_utils.print_out("# Trainable variables")
    misc_utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
    for tvar in tvars:
        misc_utils.print_out(" %s, %s, %s" % (tvar.name,
                                              str(tvar.get_shape()), tvar.op.device))
def load_model(model, ckpt_path, session, name):
    """Restore the model's variables from ``ckpt_path`` into ``session``."""
    restore_start = time.time()
    try:
        model.saver.restore(session, ckpt_path)
    except tf.errors.NotFoundError as err:
        # A missing/partial checkpoint is reported but deliberately not fatal.
        utils.print_out("Can't load checkpoint")
        utils.print_out("%s" % str(err))
    session.run(tf.tables_initializer())
    elapsed = time.time() - restore_start
    utils.print_out(
        " loaded %s model parameters from %s, time %.2fs" % (name, ckpt_path, elapsed))
    return model
default=False, help="Only inference from saved model dir.") if __name__ == "__main__": parser = argparse.ArgumentParser() add_arguments(parser) params = vars(parser.parse_args()) params["lang1_vocab_size"] = count_lines(params["lang1_vocab_file"]) if params["lang1_vocab_file"] == params["lang2_vocab_file"]: params["lang2_vocab_size"] = params["lang1_vocab_size"] else: params["lang2_vocab_size"] = count_lines(params["lang2_vocab_file"]) misc_utils.print_out("# All hyperparameters:") for key in params: misc_utils.print_out("%s=%s" % (key, str(params[key]))) if params["batch_size"] >= 1024: misc_utils.print_out( "# batch_size >= 1024 indicates token level batch size for training." ) if params["only_infer"]: if not tf.gfile.Exists(params["model_dir"]): raise ValueError("No checkpoint saved in %s" % params["model_dir"]) dual_inference(params) else: if params["model_dir"] and not tf.gfile.Exists(params["model_dir"]): misc_utils.print_out("# Creating saved model directory %s ..." %
def train_and_eval(params, target_session=""):
    """Main training loop: alternating auto-encoding and back-translation
    updates, with periodic stat logging, checkpointing, and external BLEU
    evaluation.

    Args:
      params: hyperparameter dict (model_dir, data paths, schedule, ...).
      target_session: TF session target (empty for in-process).

    Returns:
      The final global step.
    """
    out_dir = params["model_dir"]
    steps_per_stats = params["steps_per_stats"]
    steps_per_eval = 10 * steps_per_stats  # evaluate every 10 stat windows
    # Log and output files
    log_file = os.path.join(out_dir, "log_%d" % time.time())
    log_f = tf.gfile.GFile(log_file, mode="a")
    misc_utils.print_out("# log_file=%s" % log_file, log_f)
    # create models — three separate graphs: train, eval (on-the-fly
    # back-translation), and infer (external BLEU).
    model_creator = TrainerMT
    train_model = trainer_utils.create_train_model(model_creator, params)
    eval_model = trainer_utils.create_eval_model(model_creator, params)
    infer_model = trainer_utils.create_infer_model(model_creator, params)
    # TensorFlow models
    config_proto = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config_proto.gpu_options.allow_growth = True
    train_sess = tf.Session(target=target_session, config=config_proto, graph=train_model.graph)
    eval_sess = tf.Session(target=target_session, config=config_proto, graph=eval_model.graph)
    infer_sess = tf.Session(target=target_session, config=config_proto, graph=infer_model.graph)
    with train_model.graph.as_default():
        loaded_train_model, global_step = trainer_utils.create_or_load_model(
            train_model.model, params["model_dir"], train_sess, "train")
    # Summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(out_dir, "train_log"), train_model.graph)
    # First evaluation without training yet
    # run_external_eval(infer_model, infer_sess, params["model_dir"], params, summary_writer)
    last_stats_step = global_step
    last_eval_step = global_step
    # This is the train loop.
    # Shuffle each language's monolingual corpus before the first epoch.
    trainer_utils.separate_shuffle(
        [params["lang1_train_data"], params["lang2_train_data"]],
        params["train_data_suffix"])
    stats, info, start_train_time = before_train(
        loaded_train_model, train_model, train_sess, global_step, log_f)
    lambda_xe_mono_config = trainer_utils.parse_lambda_config(params["lambda_xe_mono"])
    loaded_eval_model = sync_eval_model(eval_model, eval_sess, params["model_dir"])
    while global_step < params["num_train_steps"]:
        # Run a step
        start_time = time.time()
        # Scheduled weight for the denoising auto-encoding loss at this step.
        lambda_xe_mono = trainer_utils.get_lambda_xe_mono(lambda_xe_mono_config, global_step)
        try:
            # 1) Auto-encoding update; its last output is the clean input batch.
            ae_step_result = loaded_train_model.ae_updates(train_sess, lambda_xe_mono)
            new_clean_inputs = ae_step_result[-1]
            ii1, ii2 = new_clean_inputs[params["lang1"]], new_clean_inputs[params["lang2"]]
            # 2) On-the-fly back-translate with the (periodically synced) eval model.
            ids1to2, ids2to1 = loaded_eval_model.otfb(eval_sess, ii1, ii2)
            # 3) Back-translation update using the synthetic parallel pairs.
            bt_step_result = loaded_train_model.bt_updates(
                train_sess, params["lambda_xe_otfb"], ids2to1, ids1to2, ii1, ii2)
            step_result = [ae_step_result[:-1], bt_step_result]
        except tf.errors.OutOfRangeError:
            # Epoch finished: reshuffle the corpora and restart the iterator.
            misc_utils.print_out("# Finished Training of One Epochs.")
            trainer_utils.separate_shuffle(
                [params["lang1_train_data"], params["lang2_train_data"]],
                params["train_data_suffix"])
            train_sess.run(train_model.iterator.initializer)
            continue
        global_step, info["learning_rate"], step_summary = update_stats(stats, start_time, step_result)
        summary_writer.add_summary(step_summary, global_step)
        if global_step - last_stats_step >= steps_per_stats:
            last_stats_step = global_step
            is_overflow = process_stats(stats, info, global_step, steps_per_stats, log_f)
            print_step_info(" ", global_step, info, log_f)
            if is_overflow:
                break
            # Reset statistics
            stats = init_stats()
        if global_step - last_eval_step >= steps_per_eval:
            last_eval_step = global_step
            misc_utils.print_out("# Save eval, global step %d" % global_step)
            loaded_train_model.saver.save(
                train_sess, os.path.join(params["model_dir"], "model.ckpt"), global_step=global_step)
            # Refresh the back-translation model from the new checkpoint,
            # then run external BLEU.
            loaded_eval_model = sync_eval_model(eval_model,
                                                eval_sess, params["model_dir"])
            run_external_eval(infer_model, infer_sess, params["model_dir"], params, summary_writer)
    # Done training
    loaded_train_model.saver.save(
        train_sess, os.path.join(params["model_dir"], "model.ckpt"), global_step=global_step)
    misc_utils.print_out("# Done training, time %ds!" % (time.time() - start_train_time))
    summary_writer.close()
    return global_step
def print_step_info(prefix, global_step, info, log_f):
    """Log one status line (lr, step time, losses) for the current step."""
    fields = (prefix, global_step, info["learning_rate"], info["avg_step_time"],
              info["avg_train_ae_loss"], info["avg_train_bt_loss"], time.ctime())
    message = "%sstep %d lr %g step-time %.2fs ae_loss %.4f bt_loss %.4f, %s" % fields
    misc_utils.print_out(message, log_f)
def get_all_embeddings(params, dtype=tf.float32, scope=None):
    """Build encoder/decoder embeddings and output-projection layers for
    both languages, honoring the various sharing flags.

    Sharing flags (from ``params``):
      share_lang_emb:   one embedding matrix for both languages.
      share_encdec_emb: encoder and decoder share embeddings per language.
      share_decpro_emb: decoder embedding reused as projection (only a bias
                        variable is created here).
      share_output_emb: one shared projection layer for both languages.

    Returns:
      (encoder_embeddings, decoder_embeddings, pro_embs), each a dict keyed
      by language name.
    """
    if params["lang1_partitions"] <= 1:
        lang1_partitioner = None
    else:
        lang1_partitioner = tf.fixed_size_partitioner(
            params["lang1_partitions"])
    if params["lang2_partitions"] <= 1:
        lang2_partitioner = None
    else:
        lang2_partitioner = tf.fixed_size_partitioner(
            params["lang2_partitions"])
    encoder_embeddings = {}
    decoder_embeddings = {}
    lang1_emb_np, lang2_emb_np = None, None
    # Optionally load pretrained embedding matrices (shared when the two
    # languages point at the same embed file).
    if params["lang1_embed_file"] and params["lang2_embed_file"]:
        lang1_emb_np = _create_pretrained_emb_from_txt(
            params["lang1_vocab_file"], params["lang1_embed_file"])
        if params["lang1_embed_file"] == params["lang2_embed_file"]:
            lang2_emb_np = lang1_emb_np
        else:
            lang2_emb_np = _create_pretrained_emb_from_txt(
                params["lang2_vocab_file"], params["lang2_embed_file"])
    if params["share_decpro_emb"]:
        # Projection weights come from the decoder embedding elsewhere;
        # only the projection bias variable(s) are created here.
        if params["share_lang_emb"]:
            assert params["share_output_emb"]
            share_bias = tf.get_variable('share_projection/bias', [
                params["lang1_vocab_size"],
            ], initializer=tf.zeros_initializer())
            pro_embs = {
                params["lang1"]: share_bias,
                params["lang2"]: share_bias
            }
        else:
            pro_embs = {
                params["lang1"]: tf.get_variable('bias', [
                    params["lang1_vocab_size"],
                ], initializer=tf.zeros_initializer()),
                params["lang2"]: tf.get_variable('bias', [
                    params["lang2_vocab_size"],
                ], initializer=tf.zeros_initializer())
            }
    else:
        if params["share_output_emb"]:
            assert params["share_lang_emb"]
            if params["pretrained_out"]:
                assert params["lang1_embed_file"] == params["lang2_embed_file"]
                misc_utils.print_out(
                    "# Using pre-trained embedding to initialize shared projection kernel."
                )
                # Dense kernel initialized from the (transposed) embedding,
                # i.e. tied to the pretrained vectors at init time only.
                share_proj_layer = tf.layers.Dense(
                    params["lang1_vocab_size"],
                    use_bias=True,
                    kernel_initializer=tf.constant_initializer(
                        lang1_emb_np.transpose()),
                    name="share_projection")
            else:
                share_proj_layer = tf.layers.Dense(params["lang1_vocab_size"],
                                                   use_bias=True,
                                                   name="share_projection")
            pro_embs = {
                params["lang1"]: share_proj_layer,
                params["lang2"]: share_proj_layer
            }
        else:
            if params["pretrained_out"]:
                misc_utils.print_out(
                    "# Using pre-trained embedding to initialize two projection kernels."
                )
                pro_embs = {
                    params["lang1"]: tf.layers.Dense(
                        params["lang1_vocab_size"],
                        use_bias=True,
                        kernel_initializer=tf.constant_initializer(
                            lang1_emb_np.transpose()),
                        name="%s_projection" % params["lang1"]),
                    params["lang2"]: tf.layers.Dense(
                        params["lang2_vocab_size"],
                        use_bias=True,
                        kernel_initializer=tf.constant_initializer(
                            lang2_emb_np.transpose()),
                        name="%s_projection" % params["lang2"])
                }
            else:
                pro_embs = {
                    params["lang1"]: tf.layers.Dense(params["lang1_vocab_size"],
                                                     use_bias=True,
                                                     name="%s_projection" % params["lang1"]),
                    params["lang2"]: tf.layers.Dense(params["lang2_vocab_size"],
                                                     use_bias=True,
                                                     name="%s_projection" % params["lang2"])
                }
    with tf.variable_scope(scope or "all_embeddings", dtype=dtype) as scope:
        # encoder embeddings
        with tf.variable_scope("encoder", partitioner=lang1_partitioner):
            lang = "share" if params["share_lang_emb"] else params["lang1"]
            lang1_enc_embedding = _create_embed("%s_embedding" % lang,
                                                params["lang1_vocab_size"],
                                                params["hidden_size"], dtype,
                                                lang1_emb_np)
        if params["share_lang_emb"]:
            if params["lang1_vocab_size"] != params["lang2_vocab_size"]:
                raise ValueError(
                    "Share embedding but different vocab sizes"
                    " %d vs. %d" % (params["lang1_vocab_size"],
                                    params["lang2_vocab_size"]))
            # NOTE(review): redundant with the ValueError check above.
            assert params["lang1_vocab_size"] == params["lang2_vocab_size"]
            misc_utils.print_out(
                "# Use the same encoder embedding for both languages.")
            lang2_enc_embedding = lang1_enc_embedding
        else:
            with tf.variable_scope("encoder", partitioner=lang2_partitioner):
                lang2_enc_embedding = _create_embed(
                    "%s_embedding" % params["lang2"],
                    params["lang2_vocab_size"], params["hidden_size"], dtype,
                    lang2_emb_np)
        encoder_embeddings[params["lang1"]] = lang1_enc_embedding
        encoder_embeddings[params["lang2"]] = lang2_enc_embedding
        # decoder embeddings
        if params["share_encdec_emb"]:
            misc_utils.print_out(
                "# Use the same embedding for encoder and decoder of each language."
            )
            decoder_embeddings = encoder_embeddings
        else:
            with tf.variable_scope("decoder", partitioner=lang1_partitioner):
                lang = "share" if params["share_lang_emb"] else params["lang1"]
                lang1_dec_embedding = _create_embed("%s_embedding" % lang,
                                                    params["lang1_vocab_size"],
                                                    params["hidden_size"], dtype,
                                                    lang1_emb_np)
            if params["share_lang_emb"]:
                misc_utils.print_out(
                    "# Use the same decoder embedding for both languages.")
                lang2_dec_embedding = lang1_dec_embedding
            else:
                lang2_dec_embedding = _create_embed(
                    "%s_embedding" % params["lang2"],
                    params["lang2_vocab_size"], params["hidden_size"], dtype,
                    lang2_emb_np)
            decoder_embeddings[params["lang1"]] = lang1_dec_embedding
            decoder_embeddings[params["lang2"]] = lang2_dec_embedding
    return encoder_embeddings, decoder_embeddings, pro_embs