def _set_params(self, hparams, data_wrapper, reverse_cate_table, scope=None):
  """Set hyperparameters, input tensors, initializers and regularizers.

  Args:
    hparams: hyperparameter object; fields used include mode, tunable,
      anchor_ratios/anchor_scales, init_op/ran_seed/init_weight,
      bbox_init_op/bbox_ran_seed/bbox_init_weight, weight_decay_factor,
      bias_decay.
    data_wrapper: iterator_wrapper.DataWrapper providing image/bbox tensors.
    reverse_cate_table: lookup table mapping category ids back to names.
    scope: optional variable scope name, stored on self.
  """
  # Put anything that you want to print into 'print_pool'; it will be
  # printed when the session runs.
  self.print_pool = dict()
  # Tensors that will be fed into histogram summaries.
  self.histogram = dict()
  # Collected summaries.
  self.summaries = list()
  # Specific layers' activations, kept for visualization.
  self.activations = list()
  self.restore_op = tf.no_op()
  self.reverse_cate_table = reverse_cate_table
  self.scope = scope
  self.feat_stride = [16]
  self.hparams = hparams
  # Number of anchors per spatial position.
  self.ori_count = len(hparams.anchor_ratios) * len(hparams.anchor_scales)
  # BUG FIX: compare strings with '==', not 'is'. 'is' tests object
  # identity and only works here by the accident of CPython interning.
  self.trainable = hparams.mode == "train"
  self.tunable = hparams.tunable
  self.predicable = hparams.mode == "infer"
  # Set data.
  assert isinstance(data_wrapper, iterator_wrapper.DataWrapper)
  # !!!Make sure dataset batch size is 1.
  self.im_info = data_wrapper.images_size
  self.images_data = data_wrapper.images_data
  self.bbox_labels = data_wrapper.bbox_locations
  # Initializers for the backbone and the bbox-regression head.
  self.initializer = helper.get_initializer(
      hparams.init_op, hparams.ran_seed, hparams.init_weight)
  self.bbox_initializer = helper.get_initializer(
      hparams.bbox_init_op, hparams.bbox_ran_seed, hparams.bbox_init_weight)
  # L2 regularization; biases reuse the weight regularizer only when
  # bias_decay is enabled.
  self.weights_regularizer = tf.contrib.layers.l2_regularizer(
      hparams.weight_decay_factor)
  if hparams.bias_decay:
    self.biases_regularizer = self.weights_regularizer
  else:
    self.biases_regularizer = None
  # Set up global step.
  # NOTE(review): method name has a typo ("gloabal"); it is defined
  # elsewhere in this class, so the call is kept as-is.
  self._setup_gloabal_step()
def _set_params_initializer(self, hparams, mode, iterator, source_vocab_table, target_vocab_table, label_vocab_table, scope, extra_args=None):
  """Record hparams, vocab tables and mode on self, then initialize.

  Installs the default variable initializer for the current scope and
  creates the embedding tables.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  # Vocabulary lookup tables (word -> id) and their sizes.
  self.src_vocab_table = source_vocab_table
  self.tgt_vocab_table = target_vocab_table
  self.lbl_vocab_table = label_vocab_table
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.lbl_vocab_size = hparams.lbl_vocab_size
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major
  self.dtype = tf.float32
  self.num_sampled_softmax = hparams.num_sampled_softmax
  # Optional hook so external code can supply a custom RNN-cell factory.
  self.single_cell_fn = extra_args.single_cell_fn if extra_args else None
  # Network dimensions.
  self.num_units = hparams.num_units
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers
  # Batch size is dynamic, derived from the current input batch.
  self.batch_size = tf.size(self.iterator.source_sequence_length)
  # Training-progress counter.
  self.global_step = tf.Variable(0, trainable=False)
  # Default weight initializer for every variable under this scope.
  self.random_seed = hparams.random_seed
  tf.get_variable_scope().set_initializer(
      model_helper.get_initializer(hparams.init_op, self.random_seed,
                                   hparams.init_weight))
  # Embeddings; the encoder uses the stock lookup function.
  self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
  self.init_embeddings(hparams, scope)
def set_hparams_init(self, flags, mode):
  """Copy run configuration from `flags` onto self and build input tensors."""
  # TRAIN | PREDICT(infer).
  self.mode = mode
  # Gym environment the agent interacts with.
  self.env = gym.make(flags.env)
  # Input frames: [batch_size, image_height, image_width, num_channels].
  self.state = tf.placeholder(
      tf.float32, [None, flags.img_height, flags.img_width, 1])
  # Discrete action space: [stop, left, right].
  self.action_size = 3
  self.img_height = flags.img_height
  self.img_width = flags.img_width
  # Training-progress counter.
  self.global_step = tf.Variable(0, trainable=False)
  self.num_gpus = flags.num_gpus
  # Weight / bias initializers share one random seed.
  self.random_seed = flags.random_seed
  self.w_init = model_helper.get_initializer(
      flags.w_init_op, self.random_seed, flags.mean, flags.stddev)
  self.b_init = model_helper.get_initializer(
      flags.b_init_op, self.random_seed, bias_start=flags.bias_start)
  # Convolution settings (f_height/f_width are the filter dims) and the
  # recurrent-stack settings, all copied verbatim from flags.
  for attr in ("cv_num_outputs", "f_height", "f_width", "stride", "padding",
               "rnn_num_layers", "cell_type", "num_units", "dropout",
               "residual_connect"):
    setattr(self, attr, getattr(flags, attr))
def _set_params_initializer(self, hparams, mode, features):
  """Store hparams/features on self and install the variable initializer."""
  self.mode = mode
  self.features = features
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  # Activation dtype is configurable via hparams.
  self.dtype = tf.as_dtype(hparams.activation_dtype)
  # No custom cell factory by default.
  self.single_cell_fn = None
  self.num_units = hparams.num_units
  self.eos_id = hparams.tgt_eos_id
  self.label_smoothing = hparams.label_smoothing
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers
  # Batch size follows the current feed.
  self.batch_size = tf.size(self.features["source_sequence_length"])
  # Use the framework-managed global step instead of a user-defined one.
  # Otherwise num_train_steps in TPUEstimator.train has no effect (it would
  # train forever): TPUEstimator only checks
  # tf.train.get_global_step() < num_train_steps.
  self.global_step = tf.train.get_or_create_global_step()
  # Default initializer for every variable in the current scope.
  self.random_seed = hparams.random_seed
  tf.get_variable_scope().set_initializer(
      model_helper.get_initializer(hparams.init_op, self.random_seed,
                                   hparams.init_weight))
  # Training uses the custom embedding lookup; other modes use the stock one.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.encoder_emb_lookup_fn = self._emb_lookup
  else:
    self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
def __init__(self, hparams, mode, iterator, source_vocab_table, target_vocab_table, reverse_target_vocab_table=None, scope=None, extra_args=None):
  """Create the model.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    source_vocab_table: Lookup table mapping source words to ids.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words.
      Only required in INFER mode. Defaults to None.
    scope: scope of the model.
    extra_args: model_helper.ExtraArgs, for passing customizable functions.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.src_vocab_table = source_vocab_table
  self.tgt_vocab_table = target_vocab_table
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major
  # extra_args: to make it flexible for adding external customizable code.
  self.single_cell_fn = None
  if extra_args:
    self.single_cell_fn = extra_args.single_cell_fn
  # Set num layers.
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers
  # Set num residual layers; older hparams carry a single shared count
  # (kept for compatibility with common_test_utils).
  if hasattr(hparams, "num_residual_layers"):
    self.num_encoder_residual_layers = hparams.num_residual_layers
    self.num_decoder_residual_layers = hparams.num_residual_layers
  else:
    self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
    self.num_decoder_residual_layers = hparams.num_decoder_residual_layers
  # Initializer: default for every variable created under this scope.
  initializer = model_helper.get_initializer(hparams.init_op,
                                             hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)
  # Embeddings.
  self.init_embeddings(hparams, scope)
  # Dynamic batch size derived from the current input batch.
  self.batch_size = tf.size(self.iterator.source_sequence_length)
  # Projection layer mapping decoder output to target-vocab logits.
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")
  ## Train graph
  res = self.build_graph(hparams, scope=scope)
  # res layout depends on mode: loss at index 1 for TRAIN/EVAL; a 4-tuple
  # (logits, _, final_context_state, sample_id) for INFER.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(
        self.iterator.source_sequence_length) + tf.reduce_sum(
            self.iterator.target_sequence_length)
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))
  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for computing perplexity.
    self.predict_count = tf.reduce_sum(
        self.iterator.target_sequence_length)
  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()
  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)
    # Optimizer
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(self.learning_rate)
    # Gradients, clipped to max_gradient_norm.
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
    clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm = grad_norm
    self.update = opt.apply_gradients(zip(clipped_grads, params),
                                      global_step=self.global_step)
    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + grad_norm_summary)
  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)
  # Saver
  self.saver = tf.train.Saver(tf.global_variables(),
                              max_to_keep=hparams.num_keep_ckpts)
  # Print trainable variables.
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(
        " %s, %s, %s" % (param.name, str(param.get_shape()),
                         param.op.device))
def __init__(self, hparams, mode, iterator, source_vocab_table, target_vocab_table, reverse_target_vocab_table=None):
  """Create the seq2seq model graph.

  Args:
    hparams: Hyperparameter configurations.
    mode: a tf.estimator.ModeKeys value (TRAIN | EVAL | PREDICT).
    iterator: iterator_utils.BatchedInput that feeds data.
    source_vocab_table: Lookup table mapping source words to ids.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words.
      Only required in PREDICT mode. Defaults to None.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.src_vocab_table = source_vocab_table
  self.tgt_vocab_table = target_vocab_table
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.time_major = hparams.time_major
  self.single_cell_fn = None
  # Set num layers.
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers
  # Initializer: default for every variable created under this scope.
  initializer = model_helper.get_initializer(hparams.init_op,
                                             hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)
  # Embeddings.
  self.init_embeddings(hparams)
  self.batch_size = tf.size(self.iterator.source_sequence_length)
  # Projection layer mapping decoder output to target-vocab logits.
  # NOTE(review): scope name "build_netword" is a typo for "build_network",
  # but renaming would change variable names and break existing
  # checkpoints, so it is intentionally left as-is.
  with tf.variable_scope("build_netword"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")
  ## Train graph
  res = self.build_graph(hparams)
  if self.mode == tf.estimator.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(
        self.iterator.source_sequence_length) + tf.reduce_sum(
            self.iterator.target_sequence_length)
  elif self.mode == tf.estimator.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.estimator.ModeKeys.PREDICT:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))
  if self.mode != tf.estimator.ModeKeys.PREDICT:
    ## Count the number of predicted words for computing perplexity.
    self.predict_count = tf.reduce_sum(
        self.iterator.target_sequence_length)
  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()
  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.estimator.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)
    # Optimizer
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    # BUG FIX: the original read `elif optimizer == "adam":` — a bare
    # `optimizer` name that is never defined, raising NameError whenever
    # the Adam optimizer was selected.
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(self.learning_rate)
    # Gradients, clipped to max_gradient_norm.
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
    clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm = grad_norm
    self.update = opt.apply_gradients(zip(clipped_grads, params),
                                      global_step=self.global_step)
    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + grad_norm_summary)
  if self.mode == tf.estimator.ModeKeys.PREDICT:
    self.infer_summary = self._get_infer_summary(hparams)
  # Saver
  self.saver = tf.train.Saver(tf.global_variables())
  # Print trainable variables.
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(
        " %s, %s, %s" % (param.name, str(param.get_shape()),
                         param.op.device))
def _set_params_initializer(self, hparams, mode, features, scope, extra_args=None):
  """Record hparams/features on self, create the global step, initialize."""
  self.mode = mode
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.features = features
  self.time_major = hparams.time_major
  # Char-level inputs are only supported in batch-major layout.
  if hparams.use_char_encode:
    assert not self.time_major, ("Can't use time major for"
                                 " char-level inputs.")
  # Compute dtype: optionally half precision.
  self.dtype = tf.float16 if hparams.use_fp16 else tf.float32
  # Optional hook for an externally supplied RNN-cell factory.
  self.single_cell_fn = extra_args.single_cell_fn if extra_args else None
  # Network dimensions.
  self.num_units = hparams.num_units
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers
  # Residual-layer counts; old-style hparams carry one shared value
  # (kept for compatibility with common_test_utils).
  if hasattr(hparams, "num_residual_layers"):
    self.num_encoder_residual_layers = hparams.num_residual_layers
    self.num_decoder_residual_layers = hparams.num_residual_layers
  else:
    self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
    self.num_decoder_residual_layers = hparams.num_decoder_residual_layers
  # Dynamic batch size from the current feed.
  self.batch_size = tf.size(self.features["source_sequence_length"])
  # Global step: warn if one already exists, then fetch-or-create it.
  if tf.train.get_global_step() is not None:
    utils.print_out("global_step already created!")
  self.global_step = tf.train.get_or_create_global_step()
  utils.print_out("model.global_step.name: %s" % self.global_step.name)
  # Default weight initializer for this scope.
  self.random_seed = hparams.random_seed
  tf.get_variable_scope().set_initializer(
      model_helper.get_initializer(hparams.init_op, self.random_seed,
                                   hparams.init_weight))
  # Embeddings; the encoder uses the stock lookup function.
  self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
  self.init_embeddings(hparams, scope)
def __init__(self, hparams, mode, iterator, target_vocab_table, reverse_target_vocab_table=None, scope=None, single_cell_fn=None):
  """Create the model.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words.
      Only required in INFER mode. Defaults to None.
    scope: scope of the model.
    single_cell_fn: allow for adding customized cell. When not specified,
      we default to model_helper._single_cell
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.tgt_vocab_table = target_vocab_table
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.num_layers = hparams.num_layers
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major
  # CNN feature extractor over the source images; dropout keep-prob is
  # (1 - dropout) during training and 1 otherwise.
  self.cnn_input = self.iterator.source
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.cnn = AlexNet(self.cnn_input, (1 - hparams.dropout),
                       model_helper.get_device_str(hparams.base_gpu))
  else:
    self.cnn = AlexNet(self.cnn_input, 1,
                       model_helper.get_device_str(hparams.base_gpu))
  # Initializer: default for every variable created under this scope.
  initializer = model_helper.get_initializer(hparams.init_op,
                                             hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)
  # Embeddings.
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.size(self.iterator.source_sequence_length)
  # Projection layer mapping decoder output to target-vocab logits.
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")
  # To make it flexible for external code to add other cell types.
  # If not specified, we will later use model_helper._single_cell.
  self.single_cell_fn = single_cell_fn
  ## Train graph
  res = self.build_graph(hparams, scope=scope)
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(self.iterator.target_sequence_length)
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))
  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for computing perplexity.
    self.predict_count = tf.reduce_sum(
        self.iterator.target_sequence_length)
  ## Learning rate
  print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
        "decay_factor %g" % (hparams.start_decay_step, hparams.learning_rate,
                             hparams.decay_steps, hparams.decay_factor))
  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()
  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    if hparams.optimizer == "sgd":
      # Constant LR until start_decay_step, then staircase exponential decay.
      self.learning_rate = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: tf.constant(hparams.learning_rate),
          lambda: tf.train.exponential_decay(
              hparams.learning_rate,
              (self.global_step - hparams.start_decay_step),
              hparams.decay_steps,
              hparams.decay_factor,
              staircase=True),
          name="learning_rate")
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      # Adam diverges easily with large LRs here; guard against it.
      assert float(hparams.learning_rate) <= 0.001, (
          "! High Adam learning rate %g" % hparams.learning_rate)
      self.learning_rate = tf.constant(hparams.learning_rate)
      opt = tf.train.AdamOptimizer(self.learning_rate)
    # Gradients, clipped to max_gradient_norm.
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
    clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                      global_step=self.global_step)
    # Summary
    self.train_summary = tf.summary.merge(
        [tf.summary.scalar("lr", self.learning_rate),
         tf.summary.scalar("train_loss", self.train_loss)]
        + gradient_norm_summary)
  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)
  # Saver: keep every checkpoint unless evaluating on the fly.
  if hparams.eval_on_fly:
    self.saver = tf.train.Saver(tf.global_variables(),
                                save_relative_paths=True)
  else:
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None,
                                save_relative_paths=True)
  # Print trainable variables.
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def __init__(self, hparams, mode, iterator, handle, vocab_table, reverse_vocab_table=None, scope=None, extra_args=None):
  """Build the dialogue model graph for supervised, inference and self-play modes.

  Args:
    hparams: hyperparameter object; fields used here include optimizer,
      learning_rate / learning_rate2 / learning_rate3, warmup and decay
      settings, vocab_size, unit_value_network, self_play_variable_method
      and max_gradient_norm / max_gradient_norm2 / max_gradient_norm3.
    mode: a tf.contrib.learn.ModeKeys value or one of
      dialogue_utils.self_play_modes.
    iterator: iterator_utils.BatchedInput feeding the dialogue tensors.
    handle: dataset handle, stored on self.
    vocab_table: lookup table mapping words to ids.
    reverse_vocab_table: lookup table mapping ids back to words; used to
      decode sampled ids for inspection/inference.
    scope: variable scope for network construction.
    extra_args: optional; may carry a customized single_cell_fn.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.handle = handle
  self.mode = mode
  self.vocab_table = vocab_table
  self.vocab_size = hparams.vocab_size
  self.num_layers = hparams.num_layers
  self.num_gpus = hparams.num_gpus
  self.hparams = hparams
  self.single_cell_fn = None
  self.global_gpu_num = 0
  if extra_args:
    self.single_cell_fn = extra_args.single_cell_fn
  # Initializer: default for every variable created under this scope.
  initializer = model_helper.get_initializer(hparams.init_op,
                                             hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)
  # Embeddings.
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.shape(self.iterator.source)[0]
  # Projections: two utterance decoders, one action decoder, and four
  # projections feeding the two value networks.
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer1 = layers_core.Dense(
          hparams.vocab_size, use_bias=False, name="output_projection_1")
      self.output_layer2 = layers_core.Dense(
          hparams.vocab_size, use_bias=False, name="output_projection_2")
      self.output_layer_action = layers_core.Dense(
          hparams.vocab_size, use_bias=False, name="output_projection_action")
      self.vn_project11 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_11")
      self.vn_project12 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_12")
      self.vn_project21 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_21")
      self.vn_project22 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_22")
  ## Train graph: supervised loss, per-component losses, RL losses, and
  ## sampled ids for both train and infer decoding.
  sl_loss, sl_loss_arr, rl_loss_arr, sample_id_arr_train, sample_id_arr_infer = build_graph(
      self, hparams, scope=scope)
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = sl_loss
    self.all_train_loss = sl_loss_arr
    self.word_count = tf.reduce_sum(self.iterator.dialogue_len)
    self.sample_ids_arr = sample_id_arr_train
    self.sample_words_arr1 = []
    self.sample_words_arr2 = []
    source = self.iterator.source
    # Decode the first sampled sequence of each of the two agents for
    # human inspection.
    for i in range(len(self.sample_ids_arr)):
      element_infer = self.sample_ids_arr[i]
      element_src = source[0]  # element_src=0
      src = reverse_vocab_table.lookup(tf.to_int64(element_src))
      infer = reverse_vocab_table.lookup(
          tf.to_int64(element_infer)
      )[0]  # src can only get the first one so I only get the first inference
      if i == 0:
        self.sample_words_arr1.append((tf.constant(i), src, infer))
      elif i == 1:
        self.sample_words_arr2.append((tf.constant(i), src, infer))
    self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = sl_loss
    self.all_eval_loss = sl_loss_arr
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.sample_ids_arr = sample_id_arr_infer
    self.sample_words_arr = []
    self.source = reverse_vocab_table.lookup(
        tf.to_int64(iterator.source))
    for element in self.sample_ids_arr:
      self.sample_words_arr.append(
          reverse_vocab_table.lookup(tf.to_int64(element)))
  elif self.mode in dialogue_utils.self_play_modes:
    #### self play: decode both agents' utterances plus the action.
    self.train_loss = sl_loss
    self.all_train_loss = sl_loss_arr
    self.selfplay_agent_1_utt = reverse_vocab_table.lookup(
        tf.to_int64(sample_id_arr_infer[0]))
    self.selfplay_agent_2_utt = reverse_vocab_table.lookup(
        tf.to_int64(sample_id_arr_infer[1]))
    self.selfplay_action = reverse_vocab_table.lookup(
        tf.to_int64(sample_id_arr_infer[2]))
    if self.mode == dialogue_utils.mode_self_play_mutable:
      self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates
  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for computing perplexity.
    self.predict_count = tf.reduce_sum(self.iterator.dialogue_len)
  ## Learning rate
  warmup_steps = hparams.learning_rate_warmup_steps
  warmup_factor = hparams.learning_rate_warmup_factor
  print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
        "decay_factor %g, learning_rate_warmup_steps=%d, "
        "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
        (hparams.start_decay_step, hparams.learning_rate, hparams.decay_steps,
         hparams.decay_factor, warmup_steps, warmup_factor,
         (hparams.learning_rate * warmup_factor**warmup_steps)))
  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()
  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  # Supervised optimizer: used in TRAIN and in mutable self-play.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # Inverse-decay warmup: ramps the LR up until warmup_steps is reached.
    inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
    self.learning_rate = tf.cond(
        self.global_step < hparams.learning_rate_warmup_steps,
        lambda: inv_decay * self.learning_rate,
        lambda: self.learning_rate,
        name="learning_rate_decay_warump_cond")
    if hparams.optimizer == "sgd":
      # Constant LR until start_decay_step, then staircase exponential decay.
      self.learning_rate = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate,
          lambda: tf.train.exponential_decay(self.learning_rate, (
              self.global_step - hparams.start_decay_step),
              hparams.decay_steps, hparams.decay_factor, staircase=True),
          name="sgd_learning_rate_supervised")
      opt = tf.train.GradientDescentOptimizer(self.learning_rate,
                                              name="SGD_supervised")
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      # Adam diverges easily with large LRs here; guard against it.
      assert float(
          hparams.learning_rate
      ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
      opt = tf.train.AdamOptimizer(self.learning_rate,
                                   name="Adam_supervised")
    # Gradients, clipped to max_gradient_norm.
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops,
        name="gradients_adam")
    clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                      global_step=self.global_step,
                                      name="adam_apply_gradients")
    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + gradient_norm_summary)
  # Second part of the learning rate: separate schedules for the value
  # networks (learning_rate2) and the policy updates (learning_rate3).
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
    self.learning_rate2 = tf.constant(hparams.learning_rate2)
    self.learning_rate3 = tf.constant(hparams.learning_rate3)
    if hparams.optimizer == "sgd":
      self.learning_rate2 = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate2,
          lambda: tf.train.exponential_decay(self.learning_rate2, (
              self.global_step - hparams.start_decay_step),
              hparams.decay_steps, hparams.decay_factor, staircase=True),
          name="sgd_learning_rate_supervised2")
      self.learning_rate3 = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate3,
          lambda: tf.train.exponential_decay(self.learning_rate3, (
              self.global_step - hparams.start_decay_step),
              hparams.decay_steps, hparams.decay_factor, staircase=True),
          name="sgd_learning_rate_supervised3")
      tf.summary.scalar("self_play_lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      assert float(
          hparams.learning_rate2
      ) <= 0.001, "! High Adam learning rate2 %g" % hparams.learning_rate2
      assert float(
          hparams.learning_rate3
      ) <= 0.001, "! High Adam learning rate3 %g" % hparams.learning_rate3
    # Dump all trainable variable names for debugging.
    print("params=")
    for element in params:
      print(element.name)
    # Partition the variables into functional groups by name prefix.
    # NOTE(review): `patial_params` looks like a typo for `partial_params`,
    # but the method is defined elsewhere in this class, so the calls are
    # kept as-is.
    val1_params = self.patial_params(
        params, ["dynamic_seq2seq/value_network1"])
    val2_params = self.patial_params(
        params, ["dynamic_seq2seq/value_network2"])
    embedding_params = self.patial_params(params, ["embeddings"])
    main_dec_enc_params1 = self.patial_params(
        params, ["dynamic_seq2seq/encoder1/", "dynamic_seq2seq/decoder1/"])
    main_dec_enc_params2 = self.patial_params(
        params, ["dynamic_seq2seq/encoder2/", "dynamic_seq2seq/decoder2/"])
    action_params = self.patial_params(
        params, ["dynamic_seq2seq/decoder_action"])
    encoder_kb_params = self.patial_params(
        params, ["dynamic_seq2seq/encoder2_kb"])
    encoder_intent_params = self.patial_params(
        params, ["dynamic_seq2seq/encoder1_intent"])
    print("val1_params", "\n".join(map(lambda a: a.name, val1_params)))
    print("val2_params", "\n".join(map(lambda a: a.name, val2_params)))
    print("embedding_params",
          "\n".join(map(lambda a: a.name, embedding_params)))
    print("main_dec_enc_params1",
          "\n".join(map(lambda a: a.name, main_dec_enc_params1)))
    print("main_dec_enc_params2",
          "\n".join(map(lambda a: a.name, main_dec_enc_params2)))
    print("action_params", "\n".join(map(lambda a: a.name, action_params)))
    print("encoder_kb_params",
          "\n".join(map(lambda a: a.name, encoder_kb_params)))
    print("encoder_intent_params",
          "\n".join(map(lambda a: a.name, encoder_intent_params)))
    # Value-network optimizers (learning_rate2 / max_gradient_norm2).
    self.optimizer_vl1, self.v1_sum = self.generate_optimizer(
        self.vl1, params, "vl1", self.learning_rate2,
        self.hparams.max_gradient_norm2)
    self.optimizer_vl2, self.v2_sum = self.generate_optimizer(
        self.vl2, params, "vl2", self.learning_rate2,
        self.hparams.max_gradient_norm2)
    # Choose which variable groups the self-play policy updates touch.
    if hparams.self_play_variable_method == 0:
      rl_param1, rl_param2 = encoder_intent_params, encoder_kb_params + action_params
    elif hparams.self_play_variable_method == 1:
      rl_param1, rl_param2 = main_dec_enc_params1, main_dec_enc_params2
    elif hparams.self_play_variable_method == 2:
      rl_param1, rl_param2 = main_dec_enc_params1 + encoder_intent_params, main_dec_enc_params2 + encoder_kb_params + action_params
    elif hparams.self_play_variable_method == 3:
      rl_param1, rl_param2 = [main_dec_enc_params1[0]
                             ] + encoder_intent_params, [
                                 main_dec_enc_params2[0]
                             ] + encoder_kb_params
    elif hparams.self_play_variable_method == 4:
      rl_param1, rl_param2 = [main_dec_enc_params1[0]
                             ], [main_dec_enc_params2[0]]
    elif hparams.self_play_variable_method == 5:
      rl_param1, rl_param2 = params, params
    # Policy optimizers (learning_rate3 / max_gradient_norm3).
    self.optimizer_pl1, self.p1_sum = self.generate_optimizer(
        self.pl1, params, "pl1", self.learning_rate3,
        self.hparams.max_gradient_norm3)
    self.optimizer_pl2, self.p2_sum = self.generate_optimizer(
        self.pl2, params, "pl2", self.learning_rate3,
        self.hparams.max_gradient_norm3)
    print("self.learning", self.learning_rate, self.learning_rate2,
          self.learning_rate3)
  # Saver
  self.saver = tf.train.Saver(tf.global_variables())
  # Print trainable variables.
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(
        " %s, %s, %s" % (param.name, str(param.get_shape()),
                         param.op.device))
def _set_params_initializer(self, hparams, mode, features, scope,
                            extra_args=None):
  """Set various params for self and initialize."""
  self.mode = mode
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.features = features
  self.time_major = hparams.time_major

  # Record the chosen input ordering for MLPerf compliance logging.
  input_order = "time_major" if self.time_major else "batch_major"
  mlperf_log.gnmt_print(key=mlperf_log.INPUT_ORDER, value=input_order)

  if hparams.use_char_encode:
    assert (not self.time_major), ("Can't use time major for"
                                   " char-level inputs.")

  self.dtype = tf.as_dtype(hparams.activation_dtype)

  # extra_args: to make it flexible for adding external customizable code
  self.single_cell_fn = extra_args.single_cell_fn if extra_args else None

  # Set num units
  mlperf_log.gnmt_print(key=mlperf_log.MODEL_HP_HIDDEN_SIZE,
                        value=hparams.num_units)
  self.num_units = hparams.num_units
  self.eos_id = hparams.tgt_eos_id
  self.label_smoothing = hparams.label_smoothing

  # Set num layers
  mlperf_log.gnmt_print(
      key=mlperf_log.MODEL_HP_NUM_LAYERS,
      value={"encoder": hparams.num_encoder_layers,
             "decoder": hparams.num_decoder_layers})
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers

  # Set num residual layers; hparams may carry a single legacy field
  # (compatible common_test_utils) or separate encoder/decoder counts.
  if hasattr(hparams, "num_residual_layers"):
    self.num_encoder_residual_layers = hparams.num_residual_layers
    self.num_decoder_residual_layers = hparams.num_residual_layers
  else:
    self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
    self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

  # Batch size
  self.batch_size = tf.size(self.features["source_sequence_length"])

  # Global step: use get_or_create_global_step instead of a user-defined
  # variable. Otherwise the num_train_steps in TPUEstimator.train has no
  # effect (will train forever) -- TPUEstimator only checks
  # tf.train.get_global_step() < num_train_steps.
  self.global_step = tf.train.get_or_create_global_step()

  # Initializer
  mlperf_log.gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                        value=hparams.random_seed)
  self.random_seed = hparams.random_seed
  init = model_helper.get_initializer(
      hparams.init_op, self.random_seed, hparams.init_weight)
  tf.get_variable_scope().set_initializer(init)

  # Embeddings: use the custom lookup only while training.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.encoder_emb_lookup_fn = self._emb_lookup
  else:
    self.encoder_emb_lookup_fn = tf.nn.embedding_lookup
  self.init_embeddings(hparams, scope, self.dtype)
def __init__(self,
             hparams,
             mode,
             iterator,
             vocab_table,
             scope=None,
             extra_args=None):
  """Create the model.

  Builds the graph for this classifier-style model: embeddings, a 2-unit
  sigmoid output projection, the train/eval losses, and (in TRAIN mode)
  the optimizer, gradient clipping, update op and summaries.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    vocab_table: Lookup table mapping source words to ids.
    scope: scope of the model.
    extra_args: model_helper.ExtraArgs, for passing customizable functions.

  Raises:
    ValueError: if hparams.optimizer is neither "sgd" nor "adam".
  """
  self.iterator = iterator
  self.mode = mode
  self.vocab_table = vocab_table
  self.time_major = hparams.time_major
  # Placeholder for an externally supplied RNN-cell factory; none by default.
  self.single_cell_fn = None

  # Initializer applied to every variable in the current variable scope.
  initializer = model_helper.get_initializer(hparams.init_op,
                                             hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.size(self.iterator.source_sequence_length)

  # Projection: fixed 2-unit sigmoid head (binary classification output).
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(2,
                                            use_bias=False,
                                            activation=tf.nn.sigmoid,
                                            name="output_projection")

  ## Train graph
  loss, accuracy = self.build_graph(hparams, scope=scope)

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = loss
    self.train_accuracy = accuracy
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = loss

  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)

    # Optimizer
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(self.learning_rate)
    else:
      # Previously an unknown optimizer fell through and crashed later with
      # UnboundLocalError on `opt`; fail fast with a clear message instead.
      raise ValueError("Unknown optimizer %s" % hparams.optimizer)

    # Gradients
    gradients = tf.gradients(self.train_loss,
                             params,
                             colocate_gradients_with_ops=True)
    clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm = grad_norm

    self.update = opt.apply_gradients(zip(clipped_grads, params),
                                      global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + grad_norm_summary)

  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)

  # Saver
  self.saver = tf.train.Saver(tf.global_variables(),
                              max_to_keep=hparams.num_keep_ckpts)

  # Print trainable variables
  print("# Trainable variables")
  for param in params:
    print(" %s, %s, %s" % (param.name, str(param.get_shape()),
                           param.op.device))
def __init__(self, iterator, hparams, mode, scope=None):
  """Create the model and build its graph.

  Args:
    iterator: input pipeline object exposing `src_seq` / `tar_seq` tensors.
    hparams: hyperparameter configuration.
    mode: one of the module-level TRAIN | EVAL | PREDICT constants.
    scope: optional variable scope name for the model.
  """
  self.iterator = iterator
  self.hparams = hparams
  self.mode = mode
  self.scope = scope

  # Initializer (random seed deliberately left unset -> non-deterministic).
  initializer = model_helper.get_initializer(self.hparams.init_op, None,
                                             self.hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  with tf.variable_scope(scope or 'embedding'):
    self.embedding = tf.get_variable(
        'embedding', [self.hparams.vocab_size, self.hparams.num_units],
        dtype=tf.float32)

  # Output Layer
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope('decoder/output_projection'):
      self.output_layer = tf.layers.Dense(self.hparams.vocab_size,
                                          use_bias=False)

  # Batch Size
  self.batch_size = tf.size(self.iterator.src_seq)

  # Build Graph
  print("# Building graph for the model ...")
  res = self.build_graph(self.scope)
  if self.mode == TRAIN:
    self.train_loss = res[1]
    # NOTE(review): this sums the *values* in src_seq/tar_seq, which only
    # counts words if those tensors hold sequence lengths -- verify against
    # the iterator. A redundant outer tf.reduce_sum around this already-
    # scalar sum was a no-op and has been removed.
    self.word_count = (tf.reduce_sum(self.iterator.src_seq) +
                       tf.reduce_sum(self.iterator.tar_seq))
  elif self.mode == EVAL:
    self.eval_loss = res[1]
  elif self.mode == PREDICT:
    self.infer_logits, _, self.final_state, self.sample_id = res

  if self.mode != PREDICT:
    # Count the number of predicted words for compute perplexity.
    self.predict_count = tf.reduce_sum(self.iterator.tar_seq)

  # Define variables
  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()

  # Optimizer (Adam; the learning rate is fed per-step via a placeholder).
  if self.mode == TRAIN:
    self.learning_rate = tf.placeholder(tf.float32, shape=[],
                                        name='learning_rate')
    opt = tf.train.AdamOptimizer(self.learning_rate)

    # Gradient
    gradients = tf.gradients(
        self.train_loss, params,
        colocate_gradients_with_ops=self.hparams.colocate_gradients_with_ops)
    clipped_gradients, gradient_norm_summary, _ = model_helper.gradient_clip(
        gradients, self.hparams.max_gradient_norm)
    # Pass global_step by keyword so the step counter is unmistakable.
    self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                      global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar('train_loss', self.train_loss),
        tf.summary.scalar('learning_rate', self.learning_rate)
    ] + gradient_norm_summary)
  else:
    self.infer_summary = tf.no_op()

  # Saver
  self.saver = tf.train.Saver(tf.global_variables(),
                              max_to_keep=self.hparams.max_to_keep)
def __init__(self,
             hparams,
             mode,
             iterator,
             source_vocab_table,
             target_vocab_table,
             reverse_target_vocab_table=None,
             scope=None,
             extra_args=None):
  """Create the model.

  Builds the full NMT graph: embeddings, output projection, the
  train/eval/infer sub-graph, and (in TRAIN mode) the warmup/decay
  learning-rate schedule, optimizer, clipped-gradient update op and
  summaries.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    source_vocab_table: Lookup table mapping source words to ids.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words. Only
      required in INFER mode. Defaults to None.
    scope: scope of the model.
    extra_args: model_helper.ExtraArgs, for passing customizable functions.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.src_vocab_table = source_vocab_table
  self.tgt_vocab_table = target_vocab_table

  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.num_layers = hparams.num_layers
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major

  # extra_args: to make it flexible for adding external customizable code
  self.single_cell_fn = None
  if extra_args:
    self.single_cell_fn = extra_args.single_cell_fn

  # Initializer for all variables created under the current variable scope.
  initializer = model_helper.get_initializer(hparams.init_op,
                                             hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  # TODO(ebrevdo): Only do this if the mode is TRAIN?
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.size(self.iterator.source_sequence_length)

  # Projection: maps decoder outputs to target-vocabulary logits.
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")

  ## Train graph.  res[1] is the loss; in INFER mode res unpacks to
  ## (logits, _, final_context_state, sample_id).
  res = self.build_graph(hparams, scope=scope)

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(
        self.iterator.source_sequence_length) + tf.reduce_sum(
            self.iterator.target_sequence_length)
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))

  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for compute ppl.
    self.predict_count = tf.reduce_sum(
        self.iterator.target_sequence_length)

  ## Learning rate
  warmup_steps = hparams.learning_rate_warmup_steps
  warmup_factor = hparams.learning_rate_warmup_factor
  print(" start_decay_step=%d, learning_rate=%g, decay_steps %d, "
        "decay_factor %g, learning_rate_warmup_steps=%d, "
        "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
        (hparams.start_decay_step, hparams.learning_rate, hparams.decay_steps,
         hparams.decay_factor, warmup_steps, warmup_factor,
         (hparams.learning_rate * warmup_factor**warmup_steps)))
  self.global_step = tf.Variable(0, trainable=False)

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # Apply inverse decay if global steps less than warmup steps.
    # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
    # When step < warmup_steps,
    #   learning_rate *= warmup_factor ** (warmup_steps - step)
    inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
    # NOTE: "warump" typo in the op name is preserved -- it is a graph node
    # name and renaming it would break checkpoint/graph compatibility.
    self.learning_rate = tf.cond(
        self.global_step < hparams.learning_rate_warmup_steps,
        lambda: inv_decay * self.learning_rate,
        lambda: self.learning_rate,
        name="learning_rate_decay_warump_cond")

    if hparams.optimizer == "sgd":
      # Piecewise schedule: constant until start_decay_step, then staircase
      # exponential decay every decay_steps by decay_factor.
      self.learning_rate = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate,
          lambda: tf.train.exponential_decay(self.learning_rate, (
              self.global_step - hparams.start_decay_step),
                                             hparams.decay_steps,
                                             hparams.decay_factor,
                                             staircase=True),
          name="learning_rate")
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      # Guard against Adam diverging with a large base learning rate.
      assert float(
          hparams.learning_rate
      ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
      opt = tf.train.AdamOptimizer(self.learning_rate)

    gradients = tf.gradients(self.train_loss,
                             params,
                             colocate_gradients_with_ops=hparams.
                             colocate_gradients_with_ops)

    # NOTE(review): this unpacks TWO values from model_helper.gradient_clip,
    # while the other model variants in this file unpack THREE
    # (clipped, norm_summary, norm) -- confirm which signature this file's
    # model_helper actually has; if it returns three values this line raises
    # ValueError at graph-build time.
    clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)

    self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                      global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + gradient_norm_summary)

  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)

  # Saver
  self.saver = tf.train.Saver(tf.global_variables())

  # Print trainable variables
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))