def eval(flags, default_hparams, target_session=""):
    """Load the latest checkpoint in ``flags.out_dir`` and run prediction on it.

    Args:
      flags: Parsed flags; reads ``jobid``, ``num_workers``, ``random_seed``,
        ``out_dir`` and ``hparams_path``.
      default_hparams: Default hparams handed to ``create_or_load_hparams``.
      target_session: Not used by this function; kept for signature
        compatibility.

    Raises:
      IOError: If ``flags.out_dir`` does not exist, or holds no checkpoint.
    """
    # Job
    jobid = flags.jobid
    num_workers = flags.num_workers
    utils.print_out("# Job id %d" % jobid)

    # Random: offset the seed by the job id so workers do not share a stream.
    random_seed = flags.random_seed
    if random_seed is not None and random_seed > 0:
        utils.print_out("# Set random seed to %d" % random_seed)
        random.seed(random_seed + jobid)
        np.random.seed(random_seed + jobid)

    # Decode
    out_dir = flags.out_dir
    if not tf.gfile.Exists(out_dir):
        raise IOError("%s path not exist." % out_dir)

    # Load hparams.
    hparams = create_or_load_hparams(
        out_dir, default_hparams, flags.hparams_path, False)

    # Inference indices
    hparams.inference_indices = None

    ckpt = tf.train.latest_checkpoint(out_dir)
    if not ckpt:
        # Report the directory that was searched: `ckpt` itself is empty here,
        # so interpolating it (as before) produced a useless message.
        raise IOError("No checkpoint found in %s." % out_dir)

    inference.predicate(ckpt, hparams, num_workers, jobid)
def print_step_info(prefix, global_step, info, result_summary, log_f):
    """Emit a one-line training status report for ``global_step``.

    Args:
      prefix: Text placed in front of the report.
      global_step: Current global step.
      info: Mapping providing "learning_rate", "avg_step_time", "speed",
        "train_ppl" and "avg_grad_norm".
      result_summary: Pre-formatted evaluation summary appended to the line.
      log_f: Log handle forwarded to ``utils.print_out``.
    """
    fields = (
        prefix,
        global_step,
        info["learning_rate"],
        info["avg_step_time"],
        info["speed"],
        info["train_ppl"],
        info["avg_grad_norm"],
        result_summary,
        time.ctime(),
    )
    line = ("%sstep %d lr %g step-time %.2fs wps %.2fK ppl %.2f gN %.2f %s, %s"
            % fields)
    utils.print_out(line, log_f)
def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None,
                eos=None, unk=None):
    """Validate ``vocab_file`` and make sure it starts with the special tokens.

    When ``check_special_token`` is set and the first three entries are not
    ``unk``, ``sos``, ``eos`` (in that order), those tokens are prepended and
    the result is written to a file of the same base name under ``out_dir``.

    Args:
      vocab_file: Path of the vocabulary file; must already exist.
      out_dir: Directory that receives the rewritten vocabulary, if any.
      check_special_token: Whether to verify / prepend the special tokens.
      sos: Start-of-sentence token; falls back to ``SOS`` when falsy.
      eos: End-of-sentence token; falls back to ``EOS`` when falsy.
      unk: Unknown-word token; falls back to ``UNK`` when falsy.

    Returns:
      A ``(vocab_size, vocab_file)`` pair, where ``vocab_file`` is the path of
      the rewritten file if one was produced.

    Raises:
      ValueError: If ``vocab_file`` does not exist.
    """
    if not tf.gfile.Exists(vocab_file):
        raise ValueError("vocab_file '%s' does not exist." % vocab_file)

    utils.print_out("# Vocab file %s exists" % vocab_file)
    # The size reported by load_vocab is not needed: it is recomputed below.
    vocab, _ = load_vocab(vocab_file)

    if check_special_token:
        unk = unk or UNK
        sos = sos or SOS
        eos = eos or EOS
        assert len(vocab) >= 3
        if (vocab[0], vocab[1], vocab[2]) != (unk, sos, eos):
            utils.print_out("The first 3 vocab words [%s, %s, %s]"
                            " are not [%s, %s, %s]" %
                            (vocab[0], vocab[1], vocab[2], unk, sos, eos))
            vocab = [unk, sos, eos] + vocab
            rewritten_path = os.path.join(out_dir, os.path.basename(vocab_file))
            with codecs.getwriter("utf-8")(
                    tf.gfile.GFile(rewritten_path, "wb")) as out_f:
                for token in vocab:
                    out_f.write("%s\n" % token)
            vocab_file = rewritten_path

    return len(vocab), vocab_file
def _cell_list(unit_type, num_units, num_layers, num_residual_layers,
               forget_bias, dropout, mode, num_gpus, base_gpu=0,
               single_cell_fn=None, residual_fn=None):
    """Build one RNN cell per layer and return them as a list.

    Layers with index ``>= num_layers - num_residual_layers`` are requested
    with a residual connection. Layer ``i`` is given the device string
    ``get_device_str(i + base_gpu, num_gpus)``.

    Args:
      single_cell_fn: Optional cell factory; ``_single_cell`` is used when it
        is falsy.
      residual_fn: Forwarded unchanged to the cell factory.
      (Remaining arguments are forwarded to the cell factory.)

    Returns:
      A list of ``num_layers`` cells.
    """
    make_cell = single_cell_fn if single_cell_fn else _single_cell
    first_residual_layer = num_layers - num_residual_layers

    # Multi-GPU
    cells = []
    for layer_idx in range(num_layers):
        utils.print_out(" cell %d" % layer_idx, new_line=False)
        cell = make_cell(
            unit_type=unit_type,
            num_units=num_units,
            forget_bias=forget_bias,
            dropout=dropout,
            mode=mode,
            residual_connection=(layer_idx >= first_residual_layer),
            device_str=get_device_str(layer_idx + base_gpu, num_gpus),
            residual_fn=residual_fn,
        )
        utils.print_out("")
        cells.append(cell)
    return cells
def translate_and_return(hparams, infer_model, input_data, loaded_infer_model,
                         sess):
    """Feed ``input_data`` to the inference iterator and decode it.

    Args:
      hparams: Hyperparameters; supplies the batch size and decoding options.
      infer_model: Inference model bundle exposing ``iterator``,
        ``src_placeholder`` and ``batch_size_placeholder``.
      input_data: Source data fed to ``src_placeholder``.
      loaded_infer_model: Model object handed to the decoder.
      sess: Session in which everything runs.

    Returns:
      Whatever ``nmt_utils.decode_and_return`` returns.
    """
    # Encode Data
    iterator_feed = {
        infer_model.src_placeholder: input_data,
        infer_model.batch_size_placeholder: hparams.infer_batch_size,
    }
    sess.run(infer_model.iterator.initializer, feed_dict=iterator_feed)

    # Decode
    utils.print_out("# Start decoding")
    return nmt_utils.decode_and_return(
        "infer",
        loaded_infer_model,
        sess,
        None,
        ref_file=None,
        metrics=hparams.metrics,
        subword_option=hparams.subword_option,
        beam_width=hparams.beam_width,
        tgt_eos=hparams.eos,
        num_translations_per_input=hparams.num_translations_per_input)
def _get_learning_rate_decay(self, hparams):
    """Return the learning rate with the configured decay scheme applied.

    Supported ``hparams.decay_scheme`` values are "luong5", "luong10" and
    "luong234" (factor 0.5, starting at 1/2, 1/2 and 2/3 of
    ``num_train_steps`` and decaying 5, 10 and 4 times respectively), or a
    falsy value for no decay.

    Raises:
      ValueError: For any other decay scheme.
    """
    scheme = hparams.decay_scheme
    total_steps = hparams.num_train_steps

    if scheme in ("luong5", "luong10", "luong234"):
        decay_factor = 0.5
        if scheme == "luong234":
            start_decay_step = int(total_steps * 2 / 3)
            decay_times = 4
        else:
            start_decay_step = int(total_steps / 2)
            decay_times = 5 if scheme == "luong5" else 10
        remaining = total_steps - start_decay_step
        decay_steps = int(remaining / decay_times)
    elif not scheme:
        # no decay
        start_decay_step = total_steps
        decay_steps = 0
        decay_factor = 1.0
    else:
        raise ValueError("Unknown decay scheme %s" % scheme)

    utils.print_out(" decay_scheme=%s, start_decay_step=%d, decay_steps %d, "
                    "decay_factor %g" % (scheme,
                                         start_decay_step,
                                         decay_steps,
                                         decay_factor))

    def _decayed():
        # Staircase exponential decay, counted from start_decay_step.
        return tf.train.exponential_decay(
            self.learning_rate,
            (self.global_step - start_decay_step),
            decay_steps, decay_factor, staircase=True)

    return tf.cond(
        self.global_step < start_decay_step,
        lambda: self.learning_rate,
        _decayed,
        name="learning_rate_decay_cond")
def load_model(model, ckpt, session, name):
    """Restore ``model`` from ``ckpt`` in ``session`` and initialize tables.

    Args:
      model: Object exposing a ``saver``.
      ckpt: Checkpoint path to restore from.
      session: Session to restore into.
      name: Label used only in the log message.

    Returns:
      The same ``model`` object.
    """
    began = time.time()
    model.saver.restore(session, ckpt)
    session.run(tf.tables_initializer())
    elapsed = time.time() - began
    utils.print_out(" loaded %s model parameters from %s, time %.2fs"
                    % (name, ckpt, elapsed))
    return model
def run_main(flags, default_hparams, train_fn, inference_fn, target_session=""):
    """Seed the RNGs, prepare ``flags.out_dir``, load hparams and start training.

    Args:
      flags: Parsed flags; reads ``jobid``, ``num_workers``, ``random_seed``,
        ``out_dir`` and ``hparams_path``.
      default_hparams: Defaults handed to ``create_or_load_hparams``.
      train_fn: Callable invoked as ``train_fn(hparams, target_session=...)``.
      inference_fn: Not called by this function.
      target_session: Forwarded to ``train_fn``.
    """
    # Job
    jobid = flags.jobid
    num_workers = flags.num_workers
    utils.print_out("# Job id %d" % jobid)

    # Random
    seed = flags.random_seed
    if seed is not None and seed > 0:
        utils.print_out("# Set random seed to %d" % seed)
        job_seed = seed + jobid
        random.seed(job_seed)
        np.random.seed(job_seed)

    # Train / Decode
    out_dir = flags.out_dir
    if not tf.gfile.Exists(out_dir):
        tf.gfile.MakeDirs(out_dir)

    # Load hparams; only job 0 saves them.
    is_chief = (jobid == 0)
    hparams = create_or_load_hparams(
        out_dir, default_hparams, flags.hparams_path, save_hparams=is_chief)

    # Train
    train_fn(hparams, target_session=target_session)
def ensure_compatible_hparams(hparams, default_hparams, hparams_path=""):
    """Bring loaded ``hparams`` in line with the current defaults.

    Steps, in order:
      1. Derive ``num_encoder_layers`` / ``num_decoder_layers`` from a legacy
         ``num_layers`` field when they are missing.
      2. Add every key that exists only in the defaults.
      3. Overwrite keys whose values differ from the defaults: all default
         keys when ``default_hparams.override_loaded_hparams`` is truthy,
         otherwise only ``INFERENCE_KEYS``.

    Returns:
      The (mutated) ``hparams`` object.
    """
    default_hparams = utils.maybe_parse_standard_hparams(
        default_hparams, hparams_path)

    # Set num encoder/decoder layers (for old checkpoints)
    if hasattr(hparams, "num_layers"):
        for layer_key in ("num_encoder_layers", "num_decoder_layers"):
            if not hasattr(hparams, layer_key):
                hparams.add_hparam(layer_key, hparams.num_layers)

    # For compatibility, fields that are new in default_hparams are added to
    # the current hparams.
    default_config = default_hparams.values()
    loaded_config = hparams.values()
    for key in default_config:
        if key not in loaded_config:
            hparams.add_hparam(key, default_config[key])

    # Update all hparams' keys if override_loaded_hparams=True
    if getattr(default_hparams, "override_loaded_hparams", None):
        keys_to_sync = default_config.keys()
    else:
        # For inference
        keys_to_sync = INFERENCE_KEYS

    for key in keys_to_sync:
        current = getattr(hparams, key)
        wanted = default_config[key]
        if current != wanted:
            utils.print_out("# Updating hparams.%s: %s -> %s" %
                            (key, str(current), str(wanted)))
            setattr(hparams, key, wanted)
    return hparams
def _build_encoder(self, hparams):
    """Build the encoder sub-graph.

    Args:
      hparams: Hyperparameters; ``encoder_type`` must be "uni" or "bi".

    Returns:
      An ``(encoder_outputs, encoder_state)`` pair.

    Raises:
      ValueError: If ``hparams.encoder_type`` is neither "uni" nor "bi".
    """
    num_layers = self.num_encoder_layers
    num_residual_layers = self.num_encoder_residual_layers
    iterator = self.iterator

    source = iterator.source
    if self.time_major:
        source = tf.transpose(source)

    with tf.variable_scope("encoder") as scope:
        dtype = scope.dtype
        # Look up embedding, emp_inp: [max_time, batch_size, num_units]
        encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, source)

        encoder_type = hparams.encoder_type
        # Encoder_outputs: [max_time, batch_size, num_units]
        if encoder_type == "uni":
            utils.print_out(" num_layers = %d, num_residual_layers=%d" %
                            (num_layers, num_residual_layers))
            cell = self._build_encoder_cell(
                hparams, num_layers, num_residual_layers)
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                cell,
                encoder_emb_inp,
                dtype=dtype,
                sequence_length=iterator.source_sequence_length,
                time_major=self.time_major,
                swap_memory=True)
        elif encoder_type == "bi":
            # Each direction gets half of the requested layers.
            num_bi_layers = int(num_layers / 2)
            num_bi_residual_layers = int(num_residual_layers / 2)
            utils.print_out(" num_bi_layers = %d, num_bi_residual_layers=%d" %
                            (num_bi_layers, num_bi_residual_layers))

            encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn(
                inputs=encoder_emb_inp,
                sequence_length=iterator.source_sequence_length,
                dtype=dtype,
                hparams=hparams,
                num_bi_layers=num_bi_layers,
                num_bi_residual_layers=num_bi_residual_layers)

            if num_bi_layers == 1:
                encoder_state = bi_encoder_state
            else:
                # alternatively concat forward and backward states
                interleaved = []
                for layer_id in range(num_bi_layers):
                    interleaved.extend((
                        bi_encoder_state[0][layer_id],  # forward
                        bi_encoder_state[1][layer_id],  # backward
                    ))
                encoder_state = tuple(interleaved)
        else:
            raise ValueError("Unknown encoder_type %s" % encoder_type)

    return encoder_outputs, encoder_state
def _external_eval(model, global_step, sess, hparams, iterator,
                   iterator_feed_dict, tgt_file, label, summary_writer,
                   save_on_best, avg_ckpts=False):
    """Run external evaluation (e.g. BLEU, ROUGE) and track the best scores.

    Decoding only happens when ``global_step > 0``. In that case, each metric
    in ``hparams.metrics`` is written as a summary, and when ``save_on_best``
    is set and the score beats the stored best, the best value is updated on
    ``hparams`` and a checkpoint is saved in the matching ``*_dir``. The
    hparams are saved afterwards.

    Returns:
      The scores returned by ``nmt_utils.decode_and_evaluate``.
    """
    out_dir = hparams.out_dir
    decode = global_step > 0

    if avg_ckpts:
        label = "avg_" + label

    if decode:
        utils.print_out("# External evaluation, global step %d" % global_step)

    sess.run(iterator.initializer, feed_dict=iterator_feed_dict)

    output = os.path.join(out_dir, "output_%s" % label)
    scores = nmt_utils.decode_and_evaluate(
        label,
        model,
        sess,
        output,
        ref_file=tgt_file,
        metrics=hparams.metrics,
        subword_option=hparams.subword_option,
        beam_width=hparams.beam_width,
        tgt_eos=hparams.eos,
        decode=decode)

    if not decode:
        return scores

    # Save on best metrics
    best_prefix = "avg_best_" if avg_ckpts else "best_"
    for metric in hparams.metrics:
        best_metric_label = best_prefix + metric
        score = scores[metric]

        utils.add_summary(summary_writer, global_step,
                          "%s_%s" % (label, metric), score)

        # metric: larger is better
        if save_on_best and score > getattr(hparams, best_metric_label):
            setattr(hparams, best_metric_label, score)
            ckpt_prefix = os.path.join(
                getattr(hparams, best_metric_label + "_dir"), "translate.ckpt")
            model.saver.save(sess, ckpt_prefix, global_step=model.global_step)
    utils.save_hparams(out_dir, hparams)
    return scores
def _get_infer_maximum_iterations(self, hparams, source_sequence_length):
    """Return the cap on decoding steps used at inference time.

    Uses ``hparams.tgt_max_len_infer`` when it is truthy; otherwise twice the
    longest source length in the batch, rounded and cast to int32.
    """
    if hparams.tgt_max_len_infer:
        maximum_iterations = hparams.tgt_max_len_infer
        utils.print_out(" decoding maximum_iterations %d" % maximum_iterations)
        return maximum_iterations

    # TODO(thangluong): add decoding_length_factor flag
    decoding_length_factor = 2.0
    longest_source = tf.to_float(tf.reduce_max(source_sequence_length))
    scaled = tf.round(longest_source * decoding_length_factor)
    return tf.to_int32(scaled)
def build_graph(self, hparams, scope=None):
    """Build the latent-variable model: z inference, language model, translation model.

    Args:
      hparams: Hyperparameter configuration.
      scope: Variable scope for the sub-graph; "dynamic_seq2seq" when falsy.

    Returns:
      A tuple ``(tm_logits, loss, final_context_state, sample_id)``; ``loss``
      is ``None`` in INFER mode.
    """
    utils.print_out("# creating %s graph ..." % self.mode)
    dtype = tf.float32
    mode_keys = tf.contrib.learn.ModeKeys

    with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
        z_sample, Z = self.infer_z(hparams)

        with tf.variable_scope("generative_model", dtype=dtype):
            # P(x_1^m) language model
            lm_logits = self._build_language_model(hparams, z_sample=z_sample)

            # P(y_1^n|x_1^m) encoder
            encoder_outputs, encoder_state = self._build_encoder(
                hparams, z_sample=z_sample)

            # P(y_1^n|x_1^m) decoder
            tm_logits, sample_id, final_context_state = self._build_decoder(
                encoder_outputs, encoder_state, hparams, z_sample=z_sample)

        # Loss
        # NOTE(review): the flattened source does not show whether the loss
        # was built inside the "generative_model" scope -- confirm.
        if self.mode == mode_keys.INFER:
            loss = None
        else:
            loss_device = model_helper.get_device_str(
                self.num_encoder_layers - 1, self.num_gpus)
            with tf.device(loss_device):
                loss, components = self._compute_loss(
                    tm_logits, lm_logits, Z,
                    Z_source_target=(
                        hparams.z_inference_from == "source_target"),
                    r_train_mode=hparams.r_train_mode)

    # Save for summaries.
    if self.mode == mode_keys.TRAIN:
        (self._tm_loss, self._lm_loss, self._KL_Z, self._entropy,
         self._Z_networks_loss) = (components[0], components[1], components[2],
                                   components[3], components[4])
        self._elbo = -loss
        self._lm_accuracy = self._compute_accuracy(
            lm_logits,
            tf.argmax(self.source_output, axis=-1, output_type=tf.int32),
            self.source_sequence_length)

    return tm_logits, loss, final_context_state, sample_id
def language_model(embeddings, sequence_length, hparams, mode, single_cell_fn,
                   time_major, batch_size, z_sample=None):
    """Run an RNN language model over ``embeddings`` with teacher forcing.

    The initial state is ``tanh(dense(z_sample))`` when ``z_sample`` is given
    and the cell's zero state otherwise. In TRAIN mode with
    ``hparams.word_dropout > 0``, dropout is applied to the embeddings with a
    noise shape whose last dimension is 1.

    Returns:
      The first element returned by ``tf.contrib.seq2seq.dynamic_decode``.
    """
    with tf.variable_scope("language_model") as scope:
        # Use decoder cell options.
        cell = model_helper.create_rnn_cell(
            unit_type=hparams.unit_type,
            num_units=hparams.num_units,
            num_layers=hparams.num_lm_layers,
            num_residual_layers=hparams.num_decoder_residual_layers,
            forget_bias=hparams.forget_bias,
            dropout=hparams.dropout,
            num_gpus=hparams.num_gpus,
            mode=mode,
            single_cell_fn=single_cell_fn)

        # Use a zero initial state or tanh(Wz) if provided (VAEJointModel).
        if z_sample is None:
            utils.print_out(" initializing generative LM with zeros.")
            init_state = cell.zero_state(batch_size, scope.dtype)
        else:
            utils.print_out(" initializing generative LM with tanh(Wz)")
            projected_z = tf.layers.dense(z_sample, hparams.num_units)
            init_state = make_initial_state(
                tf.tanh(projected_z), hparams.unit_type)

        # Apply word dropout if set.
        is_training = (mode == tf.contrib.learn.ModeKeys.TRAIN)
        if hparams.word_dropout > 0 and is_training:
            # Drop random words.
            noise_shape = [tf.shape(embeddings)[0],
                           tf.shape(embeddings)[1],
                           1]
            keep_prob = (1.0 - hparams.word_dropout)
            embeddings = tf.nn.dropout(
                embeddings, keep_prob, noise_shape=noise_shape)

        # Run the RNN language model.
        helper = tf.contrib.seq2seq.TrainingHelper(
            embeddings, sequence_length, time_major=time_major)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, helper, initial_state=init_state)
        decoded = tf.contrib.seq2seq.dynamic_decode(
            decoder,
            output_time_major=time_major,
            impute_finished=True,
            scope=scope)
        lm_outputs, _, _ = decoded
        return lm_outputs
def _sample_decode(model, global_step, sess, hparams, iterator, src_data,
                   tgt_data, iterator_src_placeholder,
                   iterator_batch_size_placeholder, summary_writer):
    """Decode one randomly chosen source sentence and print it with its reference.

    If the model returns an attention summary, it is written to
    ``summary_writer`` at ``global_step``.
    """
    decode_id = random.randint(0, len(src_data) - 1)
    utils.print_out(" # %d" % decode_id)

    chosen_src = src_data[decode_id]
    sess.run(
        iterator.initializer,
        feed_dict={
            iterator_src_placeholder: [chosen_src],
            iterator_batch_size_placeholder: 1,
        })

    nmt_outputs, attention_summary = model.decode(sess)

    if hparams.beam_width > 0:
        # get the top translation.
        nmt_outputs = nmt_outputs[0]

    translation = nmt_utils.get_translation(
        nmt_outputs,
        sent_id=0,
        tgt_eos=hparams.eos,
        subword_option=hparams.subword_option)

    utils.print_out(" src: %s" % chosen_src)
    utils.print_out(" ref: %s" % tgt_data[decode_id])
    utils.print_out(b" nmt: " + translation)

    # Summary
    if attention_summary is not None:
        summary_writer.add_summary(attention_summary, global_step)
def single_worker_inference(infer_model, ckpt, inference_input_file,
                            inference_output_file, hparams):
    """Interactively translate sentences read from stdin with a single worker.

    The checkpoint is restored once. The function then repeatedly prompts for
    a source sentence, feeds it to the inference iterator and decodes it. The
    loop ends cleanly when stdin reaches end-of-file.

    Args:
      infer_model: Inference model bundle exposing ``graph``, ``model``,
        ``iterator``, ``src_placeholder`` and ``batch_size_placeholder``.
      ckpt: Checkpoint path to restore.
      inference_input_file: Not read: input comes from stdin. Kept for
        signature compatibility.
      inference_output_file: Output path handed to the decoding helpers.
      hparams: Hyperparameters controlling decoding.
    """
    output_infer = inference_output_file

    with tf.Session(
            graph=infer_model.graph, config=utils.get_config_proto()) as sess:
        # Restoring inside the prompt loop re-read the checkpoint for every
        # sentence; the weights do not change, so restore exactly once.
        loaded_infer_model = model_helper.load_model(
            infer_model.model, ckpt, sess, "infer")

        while True:
            try:
                sentence = input("Input Vi Src: ")
            except EOFError:
                # stdin closed (Ctrl-D / end of piped input): stop prompting.
                break
            infer_data = [sentence]

            sess.run(
                infer_model.iterator.initializer,
                feed_dict={
                    infer_model.src_placeholder: infer_data,
                    infer_model.batch_size_placeholder:
                        hparams.infer_batch_size,
                })

            # Decode
            utils.print_out("# Start decoding")
            if hparams.inference_indices:
                _decode_inference_indices(
                    loaded_infer_model,
                    sess,
                    output_infer=output_infer,
                    output_infer_summary_prefix=output_infer,
                    inference_indices=hparams.inference_indices,
                    tgt_eos=hparams.eos,
                    subword_option=hparams.subword_option)
            else:
                nmt_utils.decode_and_evaluate(
                    "infer",
                    loaded_infer_model,
                    sess,
                    output_infer,
                    ref_file=None,
                    metrics=hparams.metrics,
                    subword_option=hparams.subword_option,
                    beam_width=hparams.beam_width,
                    tgt_eos=hparams.eos,
                    num_translations_per_input=(
                        hparams.num_translations_per_input))
def create_or_load_model(model, model_dir, session, name):
    """Restore ``model`` from the latest checkpoint in ``model_dir``, or initialize it.

    When no checkpoint is found, global variables and tables are initialized
    in ``session`` instead.

    Returns:
      A ``(model, global_step)`` pair, with ``global_step`` evaluated in
      ``session``.
    """
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if not latest_ckpt:
        began = time.time()
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        utils.print_out(
            " created %s model with fresh parameters, time %.2fs"
            % (name, time.time() - began))
    else:
        model = load_model(model, latest_ckpt, session, name)

    global_step = model.global_step.eval(session=session)
    return model, global_step
def translate(ckpt, infer_data, inference_output_file, hparams, num_workers=1,
              jobid=0, scope=None):
    """Build an inference model, restore ``ckpt`` and translate ``infer_data``.

    The model class is chosen from ``hparams.attention`` and
    ``hparams.attention_architecture``.

    Returns:
      Whatever ``nmt_utils.decode_and_return`` returns.

    Raises:
      ValueError: If the attention architecture is not recognised.
    """
    output_infer = inference_output_file

    if hparams.inference_indices:
        assert num_workers == 1

    # Pick the model class.
    architecture = hparams.attention_architecture
    if not hparams.attention:
        model_creator = nmt_model.Model
    elif architecture == "standard":
        model_creator = attention_model.AttentionModel
    elif architecture in ("gnmt", "gnmt_v2"):
        model_creator = gnmt_model.GNMTModel
    else:
        raise ValueError("Unknown model architecture")

    infer_model = model_helper.create_infer_model(model_creator, hparams, scope)

    session_config = utils.get_config_proto()
    with tf.Session(graph=infer_model.graph, config=session_config) as sess:
        loaded_infer_model = model_helper.load_model(
            infer_model.model, ckpt, sess, "infer")

        # Encode Data
        sess.run(
            infer_model.iterator.initializer,
            feed_dict={
                infer_model.src_placeholder: infer_data,
                infer_model.batch_size_placeholder: hparams.infer_batch_size,
            })

        # Decode
        utils.print_out("# Start decoding")
        result = nmt_utils.decode_and_return(
            "infer",
            loaded_infer_model,
            sess,
            output_infer,
            ref_file=None,
            metrics=hparams.metrics,
            subword_option=hparams.subword_option,
            beam_width=hparams.beam_width,
            tgt_eos=hparams.eos,
            num_translations_per_input=hparams.num_translations_per_input)
        return result
def process_stats(stats, info, global_step, steps_per_stats, log_f):
    """Refresh the averaged entries of ``info`` and report perplexity overflow.

    Args:
      stats: Mapping providing "step_time", "grad_norm", "loss",
        "predict_count" and "total_count".
      info: Mapping updated in place with "avg_step_time", "avg_grad_norm",
        "train_ppl" and "speed".
      global_step: Step number used in the overflow message.
      steps_per_stats: Number of steps the accumulators cover.
      log_f: Log handle forwarded to ``utils.print_out``.

    Returns:
      True if the training perplexity is NaN, infinite or above 1e20,
      otherwise False.
    """
    # Update info
    step_time = stats["step_time"]
    info["avg_step_time"] = step_time / steps_per_stats
    info["avg_grad_norm"] = stats["grad_norm"] / steps_per_stats
    info["train_ppl"] = utils.safe_exp(stats["loss"] / stats["predict_count"])
    info["speed"] = stats["total_count"] / (1000 * stats["step_time"])

    # Check for overflow
    ppl = info["train_ppl"]
    if not (math.isnan(ppl) or math.isinf(ppl) or ppl > 1e20):
        return False

    utils.print_out(" step %d overflow, stop early" % global_step, log_f)
    return True
def setup(self, flags):
    """Load hparams next to ``flags.ckpt``, build the inference graph and restore it.

    On success, ``self.hparams``, ``self.infer_model``, ``self.sess`` and
    ``self.loaded_infer_model`` are populated.

    Args:
      flags: Parsed flags; reads ``out_dir``, ``ckpt`` and ``hparams_path``.

    Raises:
      AssertionError: If no hparams could be loaded, i.e. ``flags.ckpt`` is
        unset, or neither an "hparams" file beside the checkpoint nor
        ``flags.hparams_path`` is available.
    """
    # Model output directory
    out_dir = flags.out_dir
    if out_dir and not tf.gfile.Exists(out_dir):
        tf.gfile.MakeDirs(out_dir)

    # Load hparams.
    default_hparams = create_hparams(flags)
    hparams = None
    if flags.ckpt:
        # Try to load hparams from the same directory as ckpt
        ckpt_dir = os.path.dirname(flags.ckpt)
        ckpt_hparams_file = os.path.join(ckpt_dir, "hparams")
        if tf.gfile.Exists(ckpt_hparams_file) or flags.hparams_path:
            # Note: for some reason this will create an empty "best_bleu"
            # directory and copy vocab files
            hparams = create_or_load_hparams(
                ckpt_dir, default_hparams, flags.hparams_path,
                save_hparams=False)

    if hparams is None:
        # Raised explicitly (same exception type as the former `assert`) so
        # the check survives `python -O` instead of turning into a NameError
        # on `hparams` further down.
        raise AssertionError(
            "Could not load hparams: set flags.ckpt and provide an 'hparams' "
            "file beside it or flags.hparams_path.")

    # GPU device
    config_proto = utils.get_config_proto(
        allow_soft_placement=True,
        num_intra_threads=hparams.num_intra_threads,
        num_inter_threads=hparams.num_inter_threads)
    # Use a short-lived session for the device listing and close it; it was
    # previously created inline and never released.
    with tf.Session(config=config_proto) as probe_sess:
        visible_devices = probe_sess.list_devices()
    utils.print_out(
        "# Devices visible to TensorFlow: %s" % repr(visible_devices))

    # Inference indices (inference_indices is broken, but without setting it
    # to None we'll crash)
    hparams.inference_indices = None

    # Create the graph
    model_creator = get_model_creator(hparams)
    infer_model = model_helper.create_infer_model(
        model_creator, hparams, scope=None)
    sess, loaded_infer_model = start_sess_and_load_model(
        infer_model, flags.ckpt, hparams)

    # Parameters needed by TF GNMT
    self.hparams = hparams
    self.infer_model = infer_model
    self.sess = sess
    self.loaded_infer_model = loaded_infer_model
def build_graph(self, hparams, scope=None):
    """Build the conditional-VAE graph: z inference, language model, translation model.

    Args:
      hparams: Hyperparameter configuration.
      scope: Variable scope for the sub-graph; "dynamic_seq2seq" when falsy.

    Returns:
      A tuple ``(tm_logits, loss, final_context_state, sample_id)``; ``loss``
      is ``None`` in INFER mode.

    Raises:
      NotImplementedError: If ``hparams.z_inference_from`` is
        "source_target".
    """
    utils.print_out("# creating %s graph ..." % self.mode)
    dtype = tf.float32
    mode_keys = tf.contrib.learn.ModeKeys

    with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
        z_sample, Z = self.infer_z(hparams)
        if hparams.z_inference_from == "source_target":
            raise NotImplementedError(
                "source_target option not yet implemented for cvae")

        with tf.variable_scope("generative_model", dtype=dtype):
            # P(x_1^m) language model
            gauss_observations = self._build_language_model(
                hparams, z_sample=z_sample)

            # P(y_1^n|x_1^m) encoder
            encoder_outputs, encoder_state = self._build_encoder(
                hparams, z_sample=z_sample)

            # P(y_1^n|x_1^m) decoder
            tm_logits, sample_id, final_context_state = self._build_decoder(
                encoder_outputs, encoder_state, hparams, z_sample=z_sample)

        # Loss
        # NOTE(review): the flattened source does not show whether the loss
        # was built inside the "generative_model" scope -- confirm.
        if self.mode == mode_keys.INFER:
            loss = None
        else:
            loss_device = model_helper.get_device_str(
                self.num_encoder_layers - 1, self.num_gpus)
            with tf.device(loss_device):
                loss, components = self._compute_loss(
                    tm_logits, gauss_observations, Z)

    # Save for summaries.
    if self.mode == mode_keys.TRAIN:
        (self._tm_loss, self._lm_loss, self._KL_Z,
         self._entropy) = (components[0], components[1], components[2],
                           components[3])
        self._elbo = -loss

    return tm_logits, loss, final_context_state, sample_id
def decode_and_return(name, model, sess, trans_file, ref_file, metrics,
                      subword_option, beam_width, tgt_eos,
                      num_translations_per_input=1):
    """Decode everything left in the model's iterator and return the last translation.

    Args:
      name: Not used by this function.
      model: Object whose ``decode(sess)`` returns ``(nmt_outputs, _)`` and
        raises ``tf.errors.OutOfRangeError`` once the input is exhausted.
      sess: Session used for decoding.
      trans_file: Only referenced in a log message; nothing is written to it.
      ref_file: Not used by this function.
      metrics: Not used by this function.
      subword_option: Forwarded to ``get_translation``.
      beam_width: Beam width; 0 means the outputs carry no beam dimension.
      tgt_eos: Forwarded to ``get_translation``.
      num_translations_per_input: Translations extracted per sentence;
        clamped to the range ``[1, max(beam_width, 1)]``.

    Returns:
      The last translation produced, or ``None`` when the iterator yielded no
      sentences (previously this case raised ``UnboundLocalError``).
    """
    # Decode
    # NOTE(review): this logs only when `trans_file` is falsy, which looks
    # inverted; left unchanged because the intent cannot be confirmed here.
    if not trans_file:
        utils.print_out(" decoding to output %s." % trans_file)

    start_time = time.time()
    num_sentences = 0
    num_translations_per_input = max(
        min(num_translations_per_input, beam_width), 1)

    # NOTE(review): every translation is collected here, but callers receive
    # only the last one, as before. Returning the whole list may have been
    # intended, but would change the return type.
    res = []
    while True:
        # Only the decode call signals end-of-input; keep the try minimal.
        try:
            nmt_outputs, _ = model.decode(sess)
        except tf.errors.OutOfRangeError:
            utils.print_time(
                " done, num sentences %d, num translations per input %d" %
                (num_sentences, num_translations_per_input), start_time)
            break

        if beam_width == 0:
            # Add a leading beam axis so indexing below is uniform.
            nmt_outputs = np.expand_dims(nmt_outputs, 0)

        batch_size = nmt_outputs.shape[1]
        num_sentences += batch_size

        for sent_id in range(batch_size):
            for beam_id in range(num_translations_per_input):
                res.append(get_translation(
                    nmt_outputs[beam_id],
                    sent_id,
                    tgt_eos=tgt_eos,
                    subword_option=subword_option))

    return res[-1] if res else None
def infer_z(self, hparams):
    """Build the inference network(s) for the latent variable z.

    With ``z_inference_from == "source_only"``, a single network over the
    source is built; z is a sample outside INFER mode and the mean in INFER
    mode. With ``"source_target"``, two networks are built (source+target and
    source only); z is a sample from the former in TRAIN mode, otherwise the
    source-only output (used directly when ``r_train_mode == "l2"``, else its
    mean).

    Returns:
      ``(z_sample, Z_x)`` for "source_only", or ``(z_sample, (Z_x, Z_xy))``
      for "source_target".

    Raises:
      ValueError: For any other ``z_inference_from`` value.
    """
    mode_keys = tf.contrib.learn.ModeKeys
    inference_source = hparams.z_inference_from

    # Infer z from the embeddings
    if inference_source == "source_only":
        utils.print_out(" Inferring z from source only")
        Z_x = self._infer_z_from_embeddings(hparams, use_target=False)

        # Either use a sample or the mean.
        z_sample = Z_x.mean() if self.mode == mode_keys.INFER else Z_x.sample()
        return z_sample, Z_x

    if inference_source == "source_target":
        utils.print_out(" Inferring z from both source and target")
        Z_xy = self._infer_z_from_embeddings(
            hparams, scope_name="z_inference_model_xy", use_target=True)

        deterministic_Z_x = (hparams.r_train_mode == "l2")
        Z_x = self._infer_z_from_embeddings(
            hparams,
            scope_name="z_inference_model_x",
            use_target=False,
            deterministic=deterministic_Z_x)

        # Either use a sample or the mean.
        if self.mode == mode_keys.TRAIN:
            z_sample = Z_xy.sample()
        elif deterministic_Z_x:
            z_sample = Z_x
        else:
            z_sample = Z_x.mean()
        return z_sample, (Z_x, Z_xy)

    raise ValueError("Unknown z inference from option:"
                     " %s" % hparams.z_inference_from)
def build_graph(self, hparams, scope=None):
    """Create a sequence-to-sequence model with the dynamic RNN decoder API.

    Builds the encoder and the decoder and, outside INFER mode, the loss.

    Args:
      hparams: Hyperparameter configurations.
      scope: VariableScope for the created subgraph; default
        "dynamic_seq2seq".

    Returns:
      A tuple ``(logits, loss, final_context_state, sample_id)`` as produced
      by ``_build_decoder`` and ``_compute_loss``; ``loss`` is ``None`` in
      INFER mode.
    """
    utils.print_out("# creating %s graph ..." % self.mode)
    dtype = tf.float32

    with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
        # Encoder
        encoder_outputs, encoder_state = self._build_encoder(hparams)

        # Decoder
        decoder_result = self._build_decoder(
            encoder_outputs, encoder_state, hparams)
        logits, sample_id, final_context_state = decoder_result

        # Loss
        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            loss = None
        else:
            loss_device = model_helper.get_device_str(
                self.num_encoder_layers - 1, self.num_gpus)
            with tf.device(loss_device):
                loss = self._compute_loss(logits)

        return logits, loss, final_context_state, sample_id
def _decode_inference_indices(model, sess, output_infer,
                              output_infer_summary_prefix, inference_indices,
                              tgt_eos, subword_option):
    """Decode one sentence per entry of ``inference_indices`` and write them out.

    Each translation is written as a UTF-8 line to ``output_infer``. When the
    model returns a summary, its first image is saved to
    ``output_infer_summary_prefix + str(decode_id) + ".png"``.

    Args:
      model: Object whose ``decode(sess)`` returns
        ``(nmt_outputs, infer_summary)``.
      sess: Session used for decoding.
      output_infer: Path of the translation output file.
      output_infer_summary_prefix: Path prefix for attention images.
      inference_indices: Ids to decode; used for the count and image names.
      tgt_eos: Forwarded to ``nmt_utils.get_translation``.
      subword_option: Forwarded to ``nmt_utils.get_translation``.
    """
    utils.print_out(" decoding to output %s , num sents %d." %
                    (output_infer, len(inference_indices)))
    start_time = time.time()

    with codecs.getwriter("utf-8")(
            tf.gfile.GFile(output_infer, mode="wb")) as trans_f:
        trans_f.write("")  # Write empty string to ensure file is created.

        for decode_id in inference_indices:
            nmt_outputs, infer_summary = model.decode(sess)

            # get text translation
            assert nmt_outputs.shape[0] == 1
            translation = nmt_utils.get_translation(
                nmt_outputs,
                sent_id=0,
                tgt_eos=tgt_eos,
                subword_option=subword_option)

            if infer_summary is not None:  # Attention models
                image_file = output_infer_summary_prefix + str(decode_id) + ".png"
                utils.print_out(" save attention image to %s*" % image_file)
                image_summ = tf.Summary()
                image_summ.ParseFromString(infer_summary)
                with tf.gfile.GFile(image_file, mode="w") as img_f:
                    img_f.write(image_summ.value[0].image.encoded_image_string)

            # `translation` is bytes (it is concatenated with b"\n" below).
            # "%s\n" % translation would write the b'...' repr on Python 3,
            # so decode to text before handing it to the UTF-8 writer.
            line = translation + b"\n"
            trans_f.write(line.decode("utf-8"))
            utils.print_out(line)

    utils.print_time(" done", start_time)
def _create_pretrained_emb_from_txt(vocab_file, embed_file,
                                    num_trainable_tokens=3, dtype=tf.float32,
                                    scope=None):
    """Load pretrained embeddings from ``embed_file`` and return an embedding matrix.

    The rows for the first ``num_trainable_tokens`` vocab entries come from a
    variable ("emb_mat_var"); the remaining rows are a constant slice of the
    loaded matrix.

    Args:
      vocab_file: Path to the vocabulary file.
      embed_file: Path to a Glove formatted embedding txt file.
      num_trainable_tokens: Make the first n tokens in the vocab file
        trainable variables. Default is 3, which is "<unk>", "<s>" and "</s>".
      dtype: TensorFlow dtype of the embeddings.
      scope: Variable scope; "pretrain_embeddings" when falsy.
    """
    vocab, _ = vocab_utils.load_vocab(vocab_file)
    trainable_tokens = vocab[:num_trainable_tokens]

    utils.print_out("# Using pretrained embedding: %s." % embed_file)
    utils.print_out(" with trainable tokens: ")

    emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file)
    for token in trainable_tokens:
        utils.print_out(" %s" % token)
        if token not in emb_dict:
            # Tokens absent from the embedding file get a zero row.
            emb_dict[token] = [0.0] * emb_size

    rows = [emb_dict[token] for token in vocab]
    emb_mat = tf.constant(np.array(rows, dtype=dtype.as_numpy_dtype()))
    emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])

    with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope:
        with tf.device(_get_embed_device(num_trainable_tokens)):
            emb_mat_var = tf.get_variable(
                "emb_mat_var", [num_trainable_tokens, emb_size])
    return tf.concat([emb_mat_var, emb_mat_const], 0)
def ensure_compatible_hparams(hparams, default_hparams, hparams_path):
    """Bring loaded ``hparams`` in line with the current defaults.

    Keys present only in the defaults are added. When
    ``default_hparams.override_loaded_hparams`` is truthy, every key whose
    value differs from the default is overwritten with the default.

    Returns:
      The (mutated) ``hparams`` object.
    """
    default_hparams = utils.maybe_parse_standard_hparams(
        default_hparams, hparams_path)

    # For compatibility, fields that are new in default_hparams are added to
    # the current hparams.
    default_config = default_hparams.values()
    loaded_config = hparams.values()
    for key in default_config:
        if key not in loaded_config:
            hparams.add_hparam(key, default_config[key])

    # Update all hparams' keys if override_loaded_hparams=True
    if not default_hparams.override_loaded_hparams:
        return hparams

    for key in default_config:
        current = getattr(hparams, key)
        wanted = default_config[key]
        if current != wanted:
            utils.print_out("# Updating hparams.%s: %s -> %s" %
                            (key, str(current), str(wanted)))
            setattr(hparams, key, wanted)
    return hparams
def before_train(loaded_train_model, train_model, train_sess, global_step,
                 hparams, log_f):
    """Prepare statistics and the training iterator before training starts.

    The iterator is initialized so that it skips
    ``hparams.batch_size * hparams.epoch_step`` elements.

    Returns:
      A ``(stats, info, start_train_time)`` triple.
    """
    stats = init_stats()
    current_lr = loaded_train_model.learning_rate.eval(session=train_sess)
    info = {
        "train_ppl": 0.0,
        "speed": 0.0,
        "avg_step_time": 0.0,
        "avg_grad_norm": 0.0,
        "learning_rate": current_lr,
    }
    start_train_time = time.time()
    utils.print_out(
        "# Start step %d, lr %g, %s"
        % (global_step, info["learning_rate"], time.ctime()),
        log_f)

    # Initialize all of the iterators
    skip_count = hparams.batch_size * hparams.epoch_step
    utils.print_out("# Init train iterator, skipping %d elements" % skip_count)
    skip_feed = {train_model.skip_count_placeholder: skip_count}
    train_sess.run(train_model.iterator.initializer, feed_dict=skip_feed)

    return stats, info, start_train_time
def _get_learning_rate_warmup(self, hparams):
    """Return the learning rate with warmup applied.

    Only the "t2t" scheme is supported: while ``global_step`` is below
    ``warmup_steps``, the learning rate is multiplied by
    ``warmup_factor ** (warmup_steps - global_step)`` with
    ``warmup_factor = 0.01 ** (1 / warmup_steps)``.

    Raises:
      ValueError: If ``hparams.warmup_scheme`` is not "t2t".
    """
    warmup_steps = hparams.warmup_steps
    warmup_scheme = hparams.warmup_scheme
    utils.print_out(" learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
                    (hparams.learning_rate, warmup_steps, warmup_scheme))

    # Apply inverse decay if global steps less than warmup steps.
    # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
    # When step < warmup_steps,
    #   learning_rate *= warmup_factor ** (warmup_steps - step)
    if warmup_scheme != "t2t":
        raise ValueError("Unknown warmup scheme %s" % warmup_scheme)

    # 0.01^(1/warmup_steps): we start with a lr, 100 times smaller
    warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
    steps_left = tf.to_float(warmup_steps - self.global_step)
    inv_decay = warmup_factor ** steps_left

    return tf.cond(
        self.global_step < hparams.warmup_steps,
        lambda: inv_decay * self.learning_rate,
        lambda: self.learning_rate,
        name="learning_rate_warump_cond")
def _compute_loss(self, tm_logits, lm_logits, Z, Z_source_target=False,
                  r_train_mode="KLq"):
    """Assemble the training loss and its individual components.

    Args:
      tm_logits: Translation-model logits.
      lm_logits: Language-model logits.
      Z: The variational distribution, or a ``(Z_x, Z_xy)`` pair when
        ``Z_source_target`` is set.
      Z_source_target: Whether z was inferred from both source and target.
      r_train_mode: One of "l2", "KLq", "KLr", "JS"; only consulted in TRAIN
        mode with ``Z_source_target`` set.

    Returns:
      ``(loss, (tm_loss, lm_loss, KL_Z, entropy, Z_networks_loss))`` where
      ``loss = tm_loss + lm_loss + complexity_factor * KL_Z - entropy
      + Z_networks_loss``. ``self.KL`` is also set to the mean KL.

    Raises:
      ValueError: For an unknown ``r_train_mode``.
    """
    # The cross-entropy under a reparameterizable sample of the latent
    # variable(s).
    tm_loss = self._compute_categorical_loss(
        tm_logits, self.target_output, self.target_sequence_length)

    # The cross-entropy for the language model also under a sample of the
    # latent variable(s). Not correct mathematically, if we use the
    # relaxation.
    lm_loss = self._compute_dense_categorical_loss(
        lm_logits, self.source_output, self.source_sequence_length)

    # We use a heuristic as an unjustified approximation for monolingual
    # batches.
    max_source_time = self.get_max_time(lm_logits)
    source_weights = tf.sequence_mask(
        self.source_sequence_length, max_source_time, dtype=lm_logits.dtype)

    def _mono_entropy():
        return self._compute_categorical_entropy(self.source, source_weights)

    def _no_entropy():
        return tf.constant(0.)

    entropy = tf.cond(self.mono_batch,
                      true_fn=_mono_entropy,
                      false_fn=_no_entropy)

    # We compute an analytical KL between the Gaussian variational
    # approximation and its Gaussian prior.
    normal_diag = tf.contrib.distributions.MultivariateNormalDiag
    if Z_source_target:
        Z_x, Z_xy = Z
        if self.mode != tf.contrib.learn.ModeKeys.TRAIN:
            Z_networks_loss = tf.constant(0.)
        else:
            if r_train_mode == "l2":
                utils.print_out("Using l2 train mode for r.")
                Z_networks_loss = tf.nn.l2_loss(Z_x - Z_xy.mean())
            elif r_train_mode == "KLq":
                utils.print_out("Using KLq train mode for r.")
                Z_networks_loss = tf.reduce_mean(Z_xy.kl_divergence(Z_x))
            elif r_train_mode == "KLr":
                utils.print_out("Using KLr train mode for r.")
                Z_networks_loss = tf.reduce_mean(Z_x.kl_divergence(Z_xy))
            elif r_train_mode == "JS":
                utils.print_out("Using JS train mode for r.")
                symmetric_kl = Z_xy.kl_divergence(
                    Z_x) + Z_x.kl_divergence(Z_xy)
                Z_networks_loss = tf.reduce_mean(symmetric_kl)
            else:
                raise ValueError("Unknown value for r_train_mode: %s" %
                                 r_train_mode)
            # NOTE(review): the flattened source does not show whether this
            # scaling sat inside the TRAIN branch; outside it, it would only
            # rescale the constant zero above -- confirm.
            Z_networks_loss *= self.complexity_factor

        standard_normal = normal_diag(
            tf.zeros_like(Z_xy.mean()), tf.ones_like(Z_xy.stddev()))
        KL_Z = Z_xy.kl_divergence(standard_normal)
    else:
        standard_normal = normal_diag(
            tf.zeros_like(Z.mean()), tf.ones_like(Z.stddev()))
        KL_Z = Z.kl_divergence(standard_normal)
        Z_networks_loss = tf.constant(0.)

    KL_Z = tf.reduce_mean(KL_Z)
    self.KL = KL_Z

    total_loss = (tm_loss + lm_loss + self.complexity_factor * KL_Z
                  - entropy + Z_networks_loss)
    components = (tm_loss, lm_loss, KL_Z, entropy, Z_networks_loss)
    return total_loss, components