def define_ppo_step(data_points, hparams, action_space, lr):
  """Build the graph for one PPO update and return its loss terms.

  Args:
    data_points: 5-tuple of (observation, action, discounted_reward,
      norm_advantage, old_pdf).
    hparams: hyperparameters; reads `clipping_coef`, `value_loss_coef`,
      `entropy_loss_coef` and `policy_network`.
    action_space: object whose `n` attribute gives the size of the last
      logits dimension.
    lr: learning rate handed to `optimize.optimize`.

  Returns:
    A list [policy_loss, value_loss, entropy_loss]; every element is wrapped
    in `tf.identity` under a control dependency on the training op.
  """
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points

  # Merge the two leading observation dimensions for the policy network and
  # restore them on its outputs afterwards.
  full_shape = common_layers.shape_list(observation)
  leading_dims = full_shape[:2]
  flat_observation = tf.reshape(
      observation, [full_shape[0] * full_shape[1]] + full_shape[2:])
  logits, new_value = get_policy(flat_observation, hparams, action_space)
  logits = tf.reshape(logits, leading_dims + [action_space.n])
  new_value = tf.reshape(new_value, leading_dims)

  # Clipped surrogate objective on the new/old probability ratio.
  policy_dist = tf.distributions.Categorical(logits=logits)
  ratio = policy_dist.prob(action) / old_pdf
  clip = hparams.clipping_coef
  clipped_ratio = tf.clip_by_value(ratio, 1 - clip, 1 + clip)
  surrogate = tf.minimum(clipped_ratio * norm_advantage,
                         ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate)

  # Mean squared error between predicted value and discounted reward.
  squared_error = (new_value - discounted_reward)**2
  value_loss = hparams.value_loss_coef * tf.reduce_mean(squared_error)

  # Entropy term, scaled by the negated coefficient.
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(
      policy_dist.entropy())

  losses = [policy_loss, value_loss, entropy_loss]
  policy_variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(
      sum(losses), lr, hparams, variables=policy_variables)

  # Fetching any returned tensor forces the training op to run first.
  with tf.control_dependencies([train_op]):
    return [tf.identity(term) for term in losses]
def optimize(self, loss, num_async_replicas=1):
  """Create the training op that minimizes `loss`.

  The learning rate is `hparams.learning_rate` times the decay returned by
  `optimize.learning_rate_decay`, divided by sqrt(num_async_replicas).

  Args:
    loss: loss to minimize.
    num_async_replicas: number of asynchronous replicas.

  Returns:
    The op returned by `optimize.optimize`.
  """
  hparams = self.hparams
  decayed_lr = hparams.learning_rate * optimize.learning_rate_decay(hparams)
  scaled_lr = decayed_lr / math.sqrt(float(num_async_replicas))
  return optimize.optimize(loss, scaled_lr, hparams, use_tpu=hparams.use_tpu)
def define_ppo_step(data_points, hparams, action_space, lr):
  """Build the graph for one PPO update and return its loss terms.

  Args:
    data_points: 5-tuple of (observation, action, discounted_reward,
      norm_advantage, old_pdf).
    hparams: hyperparameters; reads `clipping_coef`, `value_loss_coef` and
      `entropy_loss_coef`.
    action_space: forwarded to `get_policy`.
    lr: learning rate handed to `optimize.optimize`.

  Returns:
    A list [policy_loss, value_loss, entropy_loss]; every element is wrapped
    in `tf.identity` under a control dependency on the training op.
  """
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points
  logits, new_value = get_policy(observation, hparams, action_space)
  policy_dist = tfp.distributions.Categorical(logits=logits)

  # Clipped surrogate objective on the new/old probability ratio.
  ratio = policy_dist.prob(action) / old_pdf
  lower = 1 - hparams.clipping_coef
  upper = 1 + hparams.clipping_coef
  clipped_ratio = tf.clip_by_value(ratio, lower, upper)
  policy_loss = -tf.reduce_mean(
      tf.minimum(clipped_ratio * norm_advantage, ratio * norm_advantage))

  # Mean squared error between predicted value and discounted reward.
  value_loss = hparams.value_loss_coef * tf.reduce_mean(
      (new_value - discounted_reward)**2)

  # Entropy term, scaled by the negated coefficient.
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(
      policy_dist.entropy())

  losses = [policy_loss, value_loss, entropy_loss]
  train_op = optimize.optimize(sum(losses), lr, hparams)

  # Fetching any returned tensor forces the training op to run first.
  with tf.control_dependencies([train_op]):
    return [tf.identity(term) for term in losses]
def define_ppo_step(data_points, hparams, action_space, lr,
                    distributional_size=1, distributional_subscale=0.04):
  """Build the graph for one PPO update, optionally with a binned value head.

  Args:
    data_points: 5-tuple of (observation, action, discounted_reward,
      norm_advantage, old_pdf).
    hparams: hyperparameters; reads `clipping_coef`, `value_loss_coef`,
      `entropy_loss_coef` and `policy_network`.
    action_space: object whose `n` attribute gives the size of the last
      logits dimension.
    lr: learning rate handed to `optimize.optimize`.
    distributional_size: number of value bins; values above 1 select the
      cross-entropy value loss, otherwise a squared error is used.
    distributional_subscale: width of one value bin.

  Returns:
    A list [policy_loss, value_loss, entropy_loss]; every element is wrapped
    in `tf.identity` under a control dependency on the training op.
  """
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points

  # Merge the two leading observation dimensions for the policy network.
  full_shape = common_layers.shape_list(observation)
  leading_dims = full_shape[:2]
  flat_observation = tf.reshape(
      observation, [full_shape[0] * full_shape[1]] + full_shape[2:])
  logits, new_value = get_policy(
      flat_observation, hparams, action_space,
      distributional_size=distributional_size)
  logits = tf.reshape(logits, leading_dims + [action_space.n])

  # Clipped surrogate objective on the new/old probability ratio.
  policy_dist = tfp.distributions.Categorical(logits=logits)
  ratio = policy_dist.prob(action) / old_pdf
  clip = hparams.clipping_coef
  clipped_ratio = tf.clip_by_value(ratio, 1 - clip, 1 + clip)
  surrogate = tf.minimum(clipped_ratio * norm_advantage,
                         ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate)

  if distributional_size > 1:
    # Per-bin log-probabilities of the value distribution.
    value_log_probs = tf.nn.log_softmax(
        tf.reshape(new_value, leading_dims + [distributional_size]), axis=-1)
    # Values are assumed to lie in (-half, half); shifting by `half` moves
    # them to (0, 2 * half), and dividing by the subscale followed by a floor
    # turns them into integer bin indices.
    half = (distributional_size // 2) * distributional_subscale
    bin_index = tf.floor((discounted_reward + half) / distributional_subscale)
    target_one_hot = tf.one_hot(
        tf.cast(bin_index, tf.int32), distributional_size)
    # Cross-entropy against the one-hot target bin.
    cross_entropy = -tf.reduce_sum(value_log_probs * target_one_hot, axis=-1)
    value_loss = hparams.value_loss_coef * tf.reduce_mean(cross_entropy)
  else:
    scalar_value = tf.reshape(new_value, leading_dims)
    squared_error = (scalar_value - discounted_reward)**2
    value_loss = hparams.value_loss_coef * tf.reduce_mean(squared_error)

  # Entropy term, scaled by the negated coefficient.
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(
      policy_dist.entropy())

  losses = [policy_loss, value_loss, entropy_loss]
  policy_variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(
      sum(losses), lr, hparams, variables=policy_variables)

  # Fetching any returned tensor forces the training op to run first.
  with tf.control_dependencies([train_op]):
    return [tf.identity(term) for term in losses]
def optimize(self, loss, num_async_replicas=1, use_tpu=False, variables=None):
  """Create the training op that minimizes `loss`.

  Args:
    loss: loss to minimize.
    num_async_replicas: number of asynchronous replicas; the scheduled
      learning rate is divided by its square root.
    use_tpu: forwarded to `optimize.optimize`.
    variables: forwarded to `optimize.optimize`.

  Returns:
    The op returned by `optimize.optimize`.
  """
  hparams = self.hparams
  scheduled_lr = ops.learning_rate_schedule(hparams)
  if num_async_replicas > 1:
    t2t_model.log_info("Dividing learning rate by num_async_replicas: %d",
                       num_async_replicas)
  # The division is applied regardless of the replica count; only the log
  # message above is conditional.
  scaled_lr = scheduled_lr / math.sqrt(float(num_async_replicas))
  return optimize.optimize(
      loss, scaled_lr, hparams, use_tpu=use_tpu, variables=variables)
def model_fn(features, labels, mode):
  """Model function for an Estimator covering PREDICT, EVAL and TRAIN.

  Args:
    features: dict of input tensors; must contain "input_refs".
    labels: ignored.
    mode: one of the `tf.estimator.ModeKeys` values.

  Returns:
    A `tf.estimator.EstimatorSpec` for the requested mode.
  """
  del labels

  # Summarize how many references have an end (index 1) greater than their
  # start (index 0).
  refs = features["input_refs"]
  ref_mask = tf.greater(refs[:, :, 1], refs[:, :, 0])
  input_count = tf.reduce_sum(tf.to_int32(ref_mask))
  tf.summary.scalar("input_count", input_count)

  loss_dict, pred_dict, areas = seq2act_model.core_graph(
      features, hparams, mode, compute_additional_loss_fn)

  if mode == tf.estimator.ModeKeys.PREDICT:
    pred_dict["sequences"] = decode_sequence(
        features, areas, hparams, decode_length,
        post_processing=FLAGS.post_processing)
    return tf.estimator.EstimatorSpec(mode, predictions=pred_dict)

  if mode == tf.estimator.ModeKeys.EVAL:
    metrics = {}
    _eval(metrics, pred_dict, loss_dict, features, areas,
          compute_seq_accuracy, hparams,
          metric_types=FLAGS.metric_types.split(","),
          decode_length=decode_length)
    if compute_additional_metric_fn:
      compute_additional_metric_fn(metrics, pred_dict, features)
    return tf.estimator.EstimatorSpec(
        mode, loss=loss_dict["total_loss"], eval_metric_ops=metrics)

  assert mode == tf.estimator.ModeKeys.TRAIN
  loss = loss_dict["total_loss"]

  # Emit a scalar summary for every entry except the total and any entry
  # whose name ends in "losses".
  for key in loss_dict:
    if key != "total_loss" and not key.endswith("losses"):
      tf.summary.scalar(key, loss_dict[key])

  # The schedule string is a "*"-separated product of named factors.
  step_num = tf.to_float(tf.train.get_global_step())
  stripped = [part.strip() for part in hparams.learning_rate_schedule.split("*")]
  factor_names = [part for part in stripped if part]
  lr = tf.constant(1.0)
  for factor_name in factor_names:
    lr *= learning_rate.learning_rate_factor(factor_name, step_num, hparams)

  train_op = optimize.optimize(loss, lr, hparams)
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
def optimize(self, loss, num_async_replicas=1):
  """Create the training op that minimizes `loss`.

  Args:
    loss: loss to minimize.
    num_async_replicas: number of asynchronous replicas; the scheduled
      learning rate is divided by its square root.

  Returns:
    The op returned by `optimize.optimize`.
  """
  hparams = self.hparams
  log_info("Base learning rate: %f", hparams.learning_rate)
  scheduled_lr = learning_rate.learning_rate_schedule(hparams)
  if num_async_replicas > 1:
    log_info("Dividing learning rate by num_async_replicas: %d",
             num_async_replicas)
  # The division is applied regardless of the replica count; only the log
  # message above is conditional.
  scaled_lr = scheduled_lr / math.sqrt(float(num_async_replicas))
  return optimize.optimize(
      loss, scaled_lr, hparams, use_tpu=common_layers.is_on_tpu())
def estimator_spec_train(self, loss, use_tpu=False):
  """Build the TRAIN-mode estimator spec for `loss`.

  Args:
    loss: loss to minimize and report in the spec.
    use_tpu: when true, summaries are removed and a `TPUEstimatorSpec` is
      returned instead of a plain `EstimatorSpec`.

  Returns:
    A `TPUEstimatorSpec` or `EstimatorSpec` carrying the loss and train op.
  """
  hparams = self.hparams
  lr = hparams.learning_rate * optimize.learning_rate_decay(hparams)
  train_op = optimize.optimize(loss, lr, hparams, use_tpu=use_tpu)

  if not use_tpu:
    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)

  _remove_summaries()  # summaries not currently working on TPU
  return tf.contrib.tpu.TPUEstimatorSpec(
      tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
def estimator_spec_train(self, loss, use_tpu=False):
  """Build the TRAIN-mode estimator spec for `loss`.

  Args:
    loss: loss to minimize and report in the spec.
    use_tpu: when true, summaries are removed and a `TPUEstimatorSpec` is
      returned instead of a plain `EstimatorSpec`.

  Returns:
    A `TPUEstimatorSpec` or `EstimatorSpec` carrying the loss and train op.
  """
  decay = optimize.learning_rate_decay(self.hparams)
  lr = self.hparams.learning_rate * decay
  train_op = optimize.optimize(loss, lr, self.hparams, use_tpu=use_tpu)

  if use_tpu:
    _remove_summaries()  # summaries not currently working on TPU
    spec_cls = tf.contrib.tpu.TPUEstimatorSpec
  else:
    spec_cls = tf.estimator.EstimatorSpec
  return spec_cls(tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
def _test_resnet(self, img_size, output_size):
  """Build a ShakeShake model on random data and print its float-op count.

  The method prints the static shape of the logits, evaluates a softmax
  cross-entropy loss once, and then prints the total float operations
  reported by the TensorFlow profiler for the default graph.

  Args:
    img_size: height and width of the random input image.
    output_size: not used by this method.
  """
  vocab_size = 1
  batch_size = 1
  # np.random.random_integers is deprecated; randint excludes its upper
  # bound, hence the `+ 1` to keep the same inclusive ranges.
  x = np.random.randint(
      0, 255 + 1, size=(batch_size, img_size, img_size, 3))
  y = np.random.randint(1, vocab_size + 1, size=(batch_size, 1, 1, 1))

  hparams = resnet_32()
  p_hparams = problem_hparams.test_problem_hparams(
      vocab_size, vocab_size, hparams)
  p_hparams.input_modality["inputs"] = modalities.ImageModality(hparams)
  p_hparams.target_modality = modalities.ClassLabelModality(
      hparams, vocab_size)
  run_meta = tf.RunMetadata()

  with self.test_session() as session:
    features = {
        "inputs": tf.constant(x, dtype=tf.int32),
        "targets": tf.constant(y, dtype=tf.int32),
    }
    model = shake_shake.ShakeShake(
        hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)
    logits, _ = model(features)
    print(logits.get_shape())

    session.run(tf.global_variables_initializer())
    tf.get_variable_scope().set_initializer(
        optimize.get_variable_initializer(hparams))
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=tf.constant(0, dtype=tf.int32, shape=[1, 1, 1, 1, 1]),
        logits=logits)
    # The returned op is never run, but the call is kept because the ops it
    # adds belong to the default graph that is profiled below.
    optimize.optimize(loss, 0.1, hparams)
    session.run(loss)

    opts = tf.profiler.ProfileOptionBuilder.float_operation()
    flops = tf.profiler.profile(
        tf.get_default_graph(), run_meta=run_meta, options=opts)
    print(flops.total_float_ops)
def optimize(self, loss, num_async_replicas=1):
  """Return a training op minimizing loss.

  The learning rate is `hparams.learning_rate` times the schedule factor from
  `optimize.learning_rate_schedule`, optionally floored at
  `hparams.learning_rate_minimum`, then divided by sqrt(num_async_replicas).

  Args:
    loss: loss to minimize.
    num_async_replicas: number of asynchronous replicas.

  Returns:
    The op returned by `optimize.optimize`.
  """
  tf.logging.info("Base learning rate: %f", self.hparams.learning_rate)
  lr = self.hparams.learning_rate
  decay_rate = optimize.learning_rate_schedule(self.hparams)
  lr *= decay_rate
  if self.hparams.learning_rate_minimum:
    lr_min = float(self.hparams.learning_rate_minimum)
    tf.logging.info("Applying learning rate minimum: %f", lr_min)
    # Element-wise maximum; TensorFlow has no `tf.max`, so the previous call
    # raised AttributeError whenever a minimum was configured.
    lr = tf.maximum(lr, tf.to_float(lr_min))
  if num_async_replicas > 1:
    tf.logging.info("Dividing learning rate by num_async_replicas: %d",
                    num_async_replicas)
  # The division is applied regardless of the replica count; only the log
  # message above is conditional.
  lr /= math.sqrt(float(num_async_replicas))
  train_op = optimize.optimize(
      loss, lr, self.hparams, use_tpu=common_layers.is_on_tpu())
  return train_op
def optimize(self, loss, use_tpu=False):
  """Create the training op that minimizes `loss`.

  Args:
    loss: loss to minimize.
    use_tpu: forwarded to `optimize.optimize`.

  Returns:
    The op returned by `optimize.optimize`, using `hparams.learning_rate`
    multiplied by the decay from `optimize.learning_rate_decay`.
  """
  hparams = self.hparams
  decayed_lr = hparams.learning_rate * optimize.learning_rate_decay(hparams)
  return optimize.optimize(loss, decayed_lr, hparams, use_tpu=use_tpu)
def optimize(self, loss, use_tpu=False):
  """Build and return the op that trains the model on `loss`.

  Args:
    loss: loss to minimize.
    use_tpu: forwarded to `optimize.optimize`.

  Returns:
    The op returned by `optimize.optimize`.
  """
  base_lr = self.hparams.learning_rate
  decay_factor = optimize.learning_rate_decay(self.hparams)
  return optimize.optimize(
      loss, base_lr * decay_factor, self.hparams, use_tpu=use_tpu)
def define_ppo_step(data_points, hparams, action_space, lr, epoch=-1,
                    distributional_size=1, distributional_subscale=0.04):
  """Define ppo step.

  Args:
    data_points: 6-tuple of (observation, action, discounted_reward,
      discounted_reward_probs, norm_advantage, old_pdf).
    hparams: hyperparameters; reads `clipping_coef`, `value_loss_coef`,
      `entropy_loss_coef` and `policy_network`.
    action_space: object whose `n` attribute gives the size of the last
      logits dimension.
    lr: learning rate handed to `optimize.optimize`.
    epoch: forwarded to `get_policy`.
    distributional_size: number of value bins; values above 1 select the
      distributional value loss, otherwise a squared error is used.
    distributional_subscale: accepted for interface compatibility and
      deleted immediately; it has no effect here.

  Returns:
    A list [policy_loss, value_loss, entropy_loss]; every element is wrapped
    in `tf.identity` under a control dependency on the training op.
  """
  del distributional_subscale
  (observation, action, discounted_reward, discounted_reward_probs,
   norm_advantage, old_pdf) = data_points

  # Merge the two leading observation dimensions for the policy network; the
  # logits (and later the values) are reshaped back to those two dimensions.
  obs_shape = common_layers.shape_list(observation)
  observation = tf.reshape(
      observation, [obs_shape[0] * obs_shape[1]] + obs_shape[2:]
  )
  (logits, new_value) = get_policy(observation, hparams, action_space,
                                   epoch=epoch,
                                   distributional_size=distributional_size)
  logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
  new_policy_dist = tfp.distributions.Categorical(logits=logits)

  # Clipped surrogate objective on the new/old probability ratio.
  new_pdf = new_policy_dist.prob(action)
  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  if distributional_size > 1:
    # Per-bin log-probabilities of the predicted value distribution.
    new_value = tf.reshape(new_value, obs_shape[:2] + [distributional_size])
    new_value = tf.nn.log_softmax(new_value, axis=-1)
    value_shape = common_layers.shape_list(new_value)
    # The above is the new value distribution. We are also given as discounted
    # reward the value distribution and the corresponding probabilities.
    # The given discounted reward is already rounded to integers but in range
    # increased by 2x for greater fidelity. Increase range of new_values here.
    # NOTE(review): `new_value[1:]` and `axis=0` shift along the FIRST axis,
    # not the bin axis; if the intent is to average neighbouring bins this
    # may need to operate on the last axis -- confirm.
    new_value_shifted = tf.concat([new_value[1:], new_value[-1:]], axis=0)
    new_value_mean = (new_value + new_value_shifted) / 2
    # Stack original and averaged entries on a new trailing axis and fold it
    # into the bin axis, doubling the size of the last dimension.
    new_value = tf.concat([tf.expand_dims(new_value, axis=-1),
                           tf.expand_dims(new_value_mean, axis=-1)], -1)
    new_value = tf.reshape(new_value, value_shape[:-1] + [2 * value_shape[-1]])
    # Cast discounted reward to integers and gather the new log-probs for them.
    discounted_reward = tf.cast(discounted_reward, tf.int32)
    value_loss = tf.batch_gather(new_value, discounted_reward)
    # Weight the gathered (new) log-probs by the old probabilities.
    discounted_reward_probs = tf.expand_dims(discounted_reward_probs, axis=1)
    value_loss = - tf.reduce_sum(value_loss * discounted_reward_probs, axis=-1)
    # Take the mean over batch and time as final loss, multiply by coefficient.
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_loss)
  else:
    # Scalar value head: mean squared error against the discounted reward.
    new_value = tf.reshape(new_value, obs_shape[:2])
    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  # Entropy term, scaled by the negated coefficient.
  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)
  # Only global variables matching the policy-network scope are optimized.
  variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(loss, lr, hparams, variables=variables)

  # Fetching any returned tensor forces the training op to run first.
  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
def build_model(self):
  """Build the query/reply matching graph and its training op.

  Sets input placeholders, encoded representations, logits, labels,
  probabilities, predictions, `self.loss`, `self.train_step` and
  `self.accuracy` as attributes on `self`. Returns nothing.
  """
  # build index table
  index_table = tf.contrib.lookup.index_table_from_file(
      vocabulary_file=self.config.vocab_list,
      num_oov_buckets=0,
      default_value=0)

  # get data iterator
  self.data_iterator = self.data.get_data_iterator(index_table, mode=self.mode)

  # get inputs
  with tf.variable_scope("inputs"):
    # get next batch if there is no feeded data
    next_batch = self.data_iterator.get_next()
    self.input_queries = tf.placeholder_with_default(
        next_batch["input_queries"], [None, self.config.max_length],
        name="input_queries")
    self.input_replies = tf.placeholder_with_default(
        next_batch["input_replies"], [None, self.config.max_length],
        name="input_replies")
    self.query_lengths = tf.placeholder_with_default(
        tf.squeeze(next_batch["query_lengths"]), [None],
        name="query_lengths")
    self.reply_lengths = tf.placeholder_with_default(
        tf.squeeze(next_batch["reply_lengths"]), [None],
        name="reply_lengths")

    # get hyperparams
    # NOTE(review): this placeholder is float64 while the other two keep-prob
    # placeholders are float32 -- presumably to match the embedding dtype;
    # confirm.
    self.embed_dropout_keep_prob = tf.placeholder(
        tf.float64, name="embed_dropout_keep_prob")
    self.lstm_dropout_keep_prob = tf.placeholder(
        tf.float32, name="lstm_dropout_keep_prob")
    self.dense_dropout_keep_prob = tf.placeholder(
        tf.float32, name="dense_dropout_keep_prob")
    self.num_negative_samples = tf.placeholder(
        tf.int32, name="num_negative_samples")

  with tf.variable_scope("properties"):
    # length properties
    cur_batch_length = tf.shape(self.input_queries)[0]

    # get hparms from tensor2tensor.models.transformer
    hparams = transformer.transformer_small()
    hparams.batch_size = self.config.batch_size

    # learning rate
    # The schedule is derived from this first hparams object; `hparams` is
    # rebound to a fresh object in the "transformer" scope below.
    lr = learning_rate.learning_rate_schedule(hparams)

  # embedding layer
  with tf.variable_scope("embedding"):
    embeddings = tf.Variable(get_embeddings(
        self.config.vocab_list,
        self.config.pretrained_embed_dir,
        self.config.vocab_size,
        self.config.embed_dim),
                             trainable=True,
                             name="embeddings")
    # The noise shape [rows, 1] drops whole embedding rows at once.
    embeddings = tf.nn.dropout(
        embeddings,
        keep_prob=self.embed_dropout_keep_prob,
        noise_shape=[tf.shape(embeddings)[0], 1])
    queries_embedded = tf.to_float(
        tf.nn.embedding_lookup(embeddings, self.input_queries,
                               name="queries_embedded"))
    replies_embedded = tf.to_float(
        tf.nn.embedding_lookup(embeddings, self.input_replies,
                               name="replies_embedded"))
    self.queries_embedded = queries_embedded
    self.replies_embedded = replies_embedded

  # transformer layer
  with tf.variable_scope("transformer"):
    # Insert a singleton axis at position 2 before feeding the encoder.
    queries_expanded = tf.expand_dims(queries_embedded, axis=2,
                                      name="queries_expanded")
    replies_expanded = tf.expand_dims(replies_embedded, axis=2,
                                      name="replies_expanded")

    hparams = transformer.transformer_small()
    hparams.set_hparam("batch_size", self.config.batch_size)
    hparams.set_hparam("hidden_size", self.config.embed_dim)
    # The same encoder object is applied to both queries and replies.
    encoder = transformer.TransformerEncoder(hparams, mode=self.mode)

    self.queries_encoded = encoder({
        "inputs": queries_expanded,
        "targets": queries_expanded
    })[0]
    self.replies_encoded = encoder({
        "inputs": replies_expanded,
        "targets": replies_expanded
    })[0]

    # Max-pool with a window of `max_length` along axis 1.
    self.queries_pooled = tf.nn.max_pool(
        self.queries_encoded,
        ksize=[1, self.config.max_length, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',
        name="queries_pooled")
    self.replies_pooled = tf.nn.max_pool(
        self.replies_encoded,
        ksize=[1, self.config.max_length, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',
        name="replies_pooled")

    self.queries_flattened = tf.reshape(self.queries_pooled,
                                        [cur_batch_length, -1])
    self.replies_flattened = tf.reshape(self.replies_pooled,
                                        [cur_batch_length, -1])

  # build dense layer
  with tf.variable_scope("dense_layer"):
    M = tf.get_variable(
        "M",
        shape=[self.config.embed_dim, self.config.embed_dim],
        initializer=tf.initializers.truncated_normal())
    M = tf.nn.dropout(M, self.dense_dropout_keep_prob)
    self.queries_transformed = tf.matmul(self.queries_flattened, M)

  with tf.variable_scope("sampling"):
    # Score of every query against every reply in the batch.
    self.distances = tf.matmul(self.queries_transformed,
                               self.replies_flattened,
                               transpose_b=True)
    # The identity matrix marks the diagonal (query i with reply i) as the
    # positive pairs.
    positive_mask = tf.reshape(tf.eye(cur_batch_length), [-1])
    negative_mask = tf.reshape(
        make_negative_mask(
            self.distances,
            method=self.config.negative_sampling,
            num_negative_samples=self.num_negative_samples), [-1])

  with tf.variable_scope("prediction"):
    distances_flattened = tf.reshape(self.distances, [-1])
    self.positive_logits = tf.gather(distances_flattened,
                                     tf.where(positive_mask), 1)
    self.negative_logits = tf.gather(distances_flattened,
                                     tf.where(negative_mask), 1)

    # Positives are labelled 1 and negatives 0.
    self.logits = tf.concat(
        [self.positive_logits, self.negative_logits], axis=0)
    self.labels = tf.concat([
        tf.ones_like(self.positive_logits),
        tf.zeros_like(self.negative_logits)
    ], axis=0)

    self.positive_probs = tf.sigmoid(self.positive_logits)

    self.probs = tf.sigmoid(self.logits)
    self.predictions = tf.cast(self.probs > 0.5, dtype=tf.int32)

  with tf.variable_scope("loss"):
    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels,
                                                logits=self.logits))
    self.train_step = optimize.optimize(self.loss, lr, hparams,
                                        use_tpu=False)

  with tf.variable_scope("score"):
    correct_predictions = tf.equal(self.predictions,
                                   tf.to_int32(self.labels))
    self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                   name="accuracy")
def __init__(self, gpu, checkpoints, config=None):
  """Build the training and inference graphs and open a session.

  Args:
    gpu: GPU identifier, or None to place the graph on the CPU. When set it
      is also used as the session's visible device list.
    checkpoints: object providing `restorer()` and `hparams`.
    config: optional object with a `settings` attribute; when None, a
      default `TransformerDecoder.Settings()` is used.
  """
  self._logger = logging.getLogger('TransformerDecoder')
  self._settings = config.settings if config is not None else TransformerDecoder.Settings()

  self._checkpoints = checkpoints
  self._checkpoint = None

  self._nn_needs_reset = True

  with tf.device('/device:GPU:0' if gpu is not None else '/cpu:0'):
    self._restorer = checkpoints.restorer()

    # Prepare features for feeding into the model.
    self._ph_decode_length = tf.placeholder(dtype=tf.int32)
    self._ph_infer_inputs = tf.placeholder(dtype=tf.int32)
    # NOTE(review): the reshape target contains two -1 entries; presumably
    # these tensors are fed directly so the reshape op itself never runs --
    # confirm against the training code.
    self._ph_train_inputs = tf.reshape(tf.placeholder(dtype=tf.int32), shape=[-1, -1, 1, 1])
    self._ph_train_targets = tf.reshape(tf.placeholder(dtype=tf.int32), shape=[-1, -1, 1, 1])
    self._ph_learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')

    # Prepare the model for training
    self._model = registry.model('transformer')(
        self._checkpoints.hparams, tf.estimator.ModeKeys.TRAIN)

    _, losses = self._model({
        "inputs": self._ph_train_inputs,
        "targets": self._ph_train_targets
    })

    self._loss = losses['training']
    self._train_op = optimize.optimize(
        self._loss, self._ph_learning_rate, self._model.hparams,
        use_tpu=common_layers.is_on_tpu())

    # From here on, variable lookups in the current scope reuse the variables
    # created for the training graph above.
    tf.get_variable_scope().reuse_variables()

    # Prepare the model for infer
    # One encoder-decoder attention tensor per hidden layer.
    self._attention_mats_op = [
        self._model.attention_weights[
            'transformer/body/decoder/layer_%i/encdec_attention/multihead_attention/dot_product_attention' % i]
        for i in xrange(self._model.hparams.num_hidden_layers)
    ]

    self._predictions_ops = []
    infer_inputs = tf.reshape(self._ph_infer_inputs, [1, -1, 1, 1])  # Make it 4D.
    infer_out = self._model.infer({"inputs": infer_inputs},
                                  beam_size=4,
                                  top_beams=1,
                                  alpha=0.6,
                                  decode_length=self._ph_decode_length)

    self._predictions_op = {
        "outputs": infer_out["outputs"],
        "inputs": infer_inputs,
    }

  session_config = tf.ConfigProto(allow_soft_placement=True)
  session_config.gpu_options.allow_growth = True
  if gpu is not None:
    session_config.gpu_options.force_gpu_compatible = True
    session_config.gpu_options.visible_device_list = str(gpu)

  self._session = tf.Session(config=session_config)

  # Init model
  self._warmup()
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
  """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec
  """
  assert len(problem_names) == len(hparams.problem_instances)
  decode_hp = decode_hparams

  # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
  dp = devices.data_parallelism()

  tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  # Add input statistics for incoming features.
  # NOTE(review): `targets_nonpadding_tokens` is bound only when `features`
  # holds a "targets" tensor of rank > 1; the TRAIN path below reads it
  # unconditionally.
  with tf.name_scope("input_stats"):
    for (k, v) in six.iteritems(features):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
        tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
        tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
        # Entries equal to 0 are counted as padding.
        nonpadding = tf.to_float(tf.not_equal(v, 0))
        nonpadding_tokens = tf.reduce_sum(nonpadding)
        if k == "targets":
          targets_nonpadding_tokens = nonpadding_tokens
        tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
        tf.summary.scalar("%s_nonpadding_fraction" % k,
                          tf.reduce_mean(nonpadding))

  # Get multi-problem logits and loss based on features["problem_choice"].
  # Filled in by nth_model and read again when building training summaries.
  loss_variable_names = []

  def nth_model(n):
    """Build the model for the n-th problem, plus some added variables."""
    model_class = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)
    if mode == tf.estimator.ModeKeys.PREDICT:
      return model_class.infer(features,
                               beam_size=decode_hp.beam_size,
                               top_beams=(decode_hp.beam_size
                                          if decode_hp.return_beams else 1),
                               alpha=decode_hp.alpha,
                               decode_length=decode_hp.extra_length)
    # In distributed mode, we build graph for problem=0 and problem=worker_id.
    skipping_is_on = hparams.problem_choice == "distributed" and is_training
    problem_worker_id = worker_id % len(hparams.problems)
    skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
    # On worker 0 also build graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
      sharded_logits, losses_dict = model_class.eval_autoregressive(
          features)
    else:
      sharded_logits, losses_dict = model_class.model_fn(
          features, skip=(skipping_is_on and skip_this_one))
    with tf.variable_scope("losses_avg"):
      total_loss, ops = 0.0, []
      for loss_key, loss_value in six.iteritems(losses_dict):
        loss_name = "problem_%d/%s_loss" % (n, loss_key)
        # Exponential moving average (0.9 / 0.1) of each loss, starting at
        # 100.0.
        loss_moving_avg = tf.get_variable(loss_name,
                                          initializer=100.0,
                                          trainable=False)
        loss_variable_names.append(loss_name)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
        total_loss += loss_value
      try:  # Total loss avg might be reused or not, we try both.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          # Total loss was already constructed on input.
          loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
      except ValueError:
        loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                          initializer=100.0,
                                          trainable=False)
      ops.append(
          loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
    with tf.variable_scope("train_stats"):  # Count steps for this problem.
      problem_steps = tf.get_variable("problem_%d_steps" % n,
                                      initializer=0,
                                      trainable=False)
      ops.append(problem_steps.assign_add(1))
    with tf.control_dependencies(ops):  # Make sure the ops run.
      # Ensure the loss is a scalar here.
      total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
    return [total_loss, tf.concat(sharded_logits, 0)]

  model_output = input_fn_builder.cond_on_index(
      nth_model,
      index_tensor=features["problem_choice"],
      max_idx=len(hparams.problems) - 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # If beam searching, model_output will be a dict with keys "outputs" and
    # "scores".
    if isinstance(model_output, dict):
      outputs = model_output["outputs"]
      scores = model_output["scores"]
    else:
      outputs = model_output
      scores = None

    # Repeat the problem choice once per example in the "inputs" batch.
    batched_problem_choice = (features["problem_choice"] * tf.ones(
        (tf.shape(features["inputs"])[0], ), dtype=tf.int32))
    predictions = {
        "outputs": outputs,
        "scores": scores,
        "inputs": features.get("inputs", None),
        "firstP": features.get("firstP", None),
        "targets": features.get("infer_targets", None),
        "problem_choice": batched_problem_choice,
    }
    _del_dict_nones(predictions)

    export_out = {"outputs": predictions["outputs"]}
    if "scores" in predictions:
      export_out["scores"] = predictions["scores"]

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs={
            "output": tf.estimator.export.PredictOutput(export_out)
        })

  total_loss, logits = model_output

  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics_fns = metrics.create_evaluation_metrics(
        hparams.problem_instances, hparams)

    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(mode,
                                      predictions={"predictions": logits},
                                      eval_metric_ops=eval_metrics,
                                      loss=total_loss)

  assert mode == tf.estimator.ModeKeys.TRAIN

  # Set learning rate
  learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
      hparams,
      num_worker_replicas=worker_replicas,
      num_train_steps=train_steps)
  learning_rate /= math.sqrt(float(worker_replicas))

  # Get global step
  global_step = tf.train.get_or_create_global_step()

  # Some training statistics.
  with tf.name_scope("training_stats"):
    tf.summary.scalar("learning_rate", learning_rate)
    for n in xrange(len(hparams.problems)):
      names_and_vars = []
      with tf.variable_scope("losses_avg", reuse=True):
        total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
        names_and_vars.append(("total_loss", total_loss_var))
      with tf.variable_scope("losses_avg", reuse=True):
        for loss_name in loss_variable_names:
          if loss_name.startswith("problem_%d/" % n):
            loss_var = tf.get_variable(loss_name)
            loss_suffix = loss_name[loss_name.index("/") + 1:]
            names_and_vars.append((loss_suffix, loss_var))
      for (loss_name, loss_var) in names_and_vars:
        tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
      with tf.variable_scope("train_stats", reuse=True):
        nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
      tf.summary.scalar(
          "problem_%d_frequency" % n,
          tf.to_float(nth_steps) / (tf.to_float(global_step) + 1.0))

  # Add weight decay and noise.
  total_size, weight_decay_loss = 0, 0.0
  # When `update_delib_only` is set, only variables whose name contains
  # "delib" or "softmax" are considered here and passed on to the optimizer.
  delib_params = None
  if hparams.update_delib_only:
    delib_params = [
        v for v in tf.trainable_variables()
        if "delib" in v.name or "softmax" in v.name
    ]
    all_weights = {v.name: v for v in delib_params}
    print("Delib parameters")
    for v in delib_params:
      print("\t\t>>\t\t{}".format(v.name))
  else:
    all_weights = {v.name: v for v in tf.trainable_variables()}
  for v_name in sorted(list(all_weights)):
    v = all_weights[v_name]
    v_size = int(np.prod(np.array(v.shape.as_list())))
    total_size += v_size
    if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
      # Add weight regularization if set and the weight is not a bias (dim>1).
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        v_loss = tf.nn.l2_loss(v) / v_size
      weight_decay_loss += v_loss
    is_body = len(v_name) > 5 and v_name[:5] == "body/"
    if hparams.weight_noise > 0.0 and is_body:
      # Add weight noise if set in hparams.
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        scale = learning_rate * 0.001
        noise = tf.truncated_normal(
            v.shape) * hparams.weight_noise * scale
        noise_op = v.assign_add(noise)
      # Tie the noise update to the loss so it runs whenever the loss does.
      with tf.control_dependencies([noise_op]):
        total_loss = tf.identity(total_loss)
  if hparams.weight_decay > 0.0:
    total_loss += weight_decay_loss * hparams.weight_decay

  # The new data reader occasionally emits very small batches, which
  # cause the examples in those batches to be grossly overweighted.
  # We decrease the loss proportionally to the ratio of the size of this
  # batch to the size of the largest training batch ever.
  # TODO(noam): to be more sophisticated, we could keep separate
  # maxima based on problem choice.
  max_nonpadding_var = tf.get_variable("max_nonpadding",
                                       shape=[],
                                       initializer=tf.ones_initializer(),
                                       trainable=False)
  max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
  with tf.control_dependencies(
      [tf.assign(max_nonpadding_var, max_nonpadding)]):
    small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
  tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
  total_loss *= small_batch_multiplier

  # Log variable sizes
  _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
  diet_vars = [
      v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
  ]
  _log_variable_sizes(diet_vars, "Diet Variables")

  # Optimize
  # `delib_params` is passed positionally as the fourth argument; it is None
  # unless `update_delib_only` is set.
  train_op = optimize.optimize(total_loss, learning_rate, hparams,
                               delib_params)

  # Remove summaries that will fail to run because they are in conditionals.
  # TODO(cwhipkey): Test with this code removed, later in 2017.
  summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
  # Iterate in reverse so deleting an entry does not shift pending indices.
  for i in reversed(range(len(summaries))):
    if summaries[i].name.startswith("cond_"):
      del summaries[i]

  tf.logging.info("Global model_fn finished.")
  return tf.estimator.EstimatorSpec(
      mode,
      predictions={"problem_choice": features["problem_choice"]},
      loss=total_loss,
      train_op=train_op)
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
  """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  In TRAIN mode the function additionally registers training summaries,
  optionally adds weight decay and weight noise, rescales the loss for small
  batches, and removes summaries whose names start with "cond_" from the
  summaries collection.

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec

  Raises:
    AssertionError: if `problem_names` and `hparams.problem_instances` differ
      in length, or if `mode` is not one of PREDICT, EVAL or TRAIN.
  """
  assert len(problem_names) == len(hparams.problem_instances)
  # Short alias used inside nth_model for the PREDICT branch.
  decode_hp = decode_hparams

  # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
  dp = devices.data_parallelism(hparams)

  tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  # Add input statistics for incoming features.
  # NOTE(review): `targets_nonpadding_tokens` is only bound when `features`
  # holds a "targets" Tensor of rank > 1; the TRAIN path below reads it
  # unconditionally -- confirm that is always the case when training.
  with tf.name_scope("input_stats"):
    for (k, v) in six.iteritems(features):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
        tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
        tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
        # Every element that is not equal to 0 counts as non-padding.
        nonpadding = tf.to_float(tf.not_equal(v, 0))
        nonpadding_tokens = tf.reduce_sum(nonpadding)
        if k == "targets":
          targets_nonpadding_tokens = nonpadding_tokens
        tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
        tf.summary.scalar("%s_nonpadding_fraction" % k,
                          tf.reduce_mean(nonpadding))

  # Get multi-problem logits and loss based on features["problem_choice"].
  # Filled by nth_model as a side effect; read again when building the
  # per-problem loss summaries in the TRAIN path.
  loss_variable_names = []

  def nth_model(n):
    """Build the model for the n-th problem, plus some added variables.

    Args:
      n: int, index into `hparams.problems`.

    Returns:
      In PREDICT mode, whatever `model_class.infer` returns. Otherwise a list
      `[total_loss, logits]` where `total_loss` is reshaped to a scalar.
    """
    model_class = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)
    if mode == tf.estimator.ModeKeys.PREDICT:
      return model_class.infer(
          features,
          beam_size=decode_hp.beam_size,
          top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
          alpha=decode_hp.alpha,
          decode_length=decode_hp.extra_length)
    # In distributed mode, we build graph for problem=0 and problem=worker_id.
    skipping_is_on = hparams.problem_choice == "distributed" and is_training
    problem_worker_id = worker_id % len(hparams.problems)
    skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
    # On worker 0 also build graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
      logits, losses_dict = model_class.eval_autoregressive(features)
    else:
      logits, losses_dict = model_class(
          features, skip=(skipping_is_on and skip_this_one))
    with tf.variable_scope("losses_avg"):
      total_loss, ops = 0.0, []
      # One non-trainable moving average (starting at 100.0, updated as
      # 0.9 * old + 0.1 * new) per entry of losses_dict.
      for loss_key, loss_value in six.iteritems(losses_dict):
        loss_name = "problem_%d/%s_loss" % (n, loss_key)
        loss_moving_avg = tf.get_variable(
            loss_name, initializer=100.0, trainable=False)
        loss_variable_names.append(loss_name)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
        total_loss += loss_value
      try:  # Total loss avg might be reused or not, we try both.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          # Total loss was already constructed on input.
          loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
      except ValueError:
        loss_moving_avg = tf.get_variable(
            "problem_%d/total_loss" % n, initializer=100.0, trainable=False)
      ops.append(
          loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
    with tf.variable_scope("train_stats"):  # Count steps for this problem.
      problem_steps = tf.get_variable(
          "problem_%d_steps" % n, initializer=0, trainable=False)
      ops.append(problem_steps.assign_add(1))
    with tf.control_dependencies(ops):  # Make sure the ops run.
      # Ensure the loss is a scalar here.
      total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
    return [total_loss, logits]

  model_output = input_fn_builder.cond_on_index(
      nth_model,
      index_tensor=features["problem_choice"],
      max_idx=len(hparams.problems) - 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # If beam searching, model_output will be a dict with keys "outputs" and
    # "scores".
    if isinstance(model_output, dict):
      outputs = model_output["outputs"]
      scores = model_output["scores"]
    else:
      outputs = model_output
      scores = None

    # Repeat the problem choice once per row of features["inputs"].
    batched_problem_choice = (
        features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0],), dtype=tf.int32))
    predictions = {
        "outputs": outputs,
        "scores": scores,
        "inputs": features.get("inputs", None),
        "targets": features.get("infer_targets", None),
        "problem_choice": batched_problem_choice,
    }
    _del_dict_nones(predictions)

    # "scores" is exported only if it is still a key of `predictions` after
    # the _del_dict_nones call above.
    export_out = {"outputs": predictions["outputs"]}
    if "scores" in predictions:
      export_out["scores"] = predictions["scores"]

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs={
            "output": tf.estimator.export.PredictOutput(export_out)
        })

  total_loss, logits = model_output

  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics_fns = metrics.create_evaluation_metrics(
        hparams.problem_instances, hparams)

    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"predictions": logits},
        eval_metric_ops=eval_metrics,
        loss=total_loss)

  assert mode == tf.estimator.ModeKeys.TRAIN

  # Set learning rate
  learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
      hparams, num_worker_replicas=worker_replicas, num_train_steps=train_steps)
  # Scale the rate down by the square root of the number of workers.
  learning_rate /= math.sqrt(float(worker_replicas))

  # Get global step
  global_step = tf.train.get_or_create_global_step()

  # Some training statistics.
  with tf.name_scope("training_stats"):
    tf.summary.scalar("learning_rate", learning_rate)
    for n in xrange(len(hparams.problems)):
      names_and_vars = []
      # Look up (reuse=True) the moving averages that nth_model created.
      with tf.variable_scope("losses_avg", reuse=True):
        total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
        names_and_vars.append(("total_loss", total_loss_var))
      with tf.variable_scope("losses_avg", reuse=True):
        for loss_name in loss_variable_names:
          if loss_name.startswith("problem_%d/" % n):
            loss_var = tf.get_variable(loss_name)
            # Keep only the part after the "problem_<n>/" prefix.
            loss_suffix = loss_name[loss_name.index("/") + 1:]
            names_and_vars.append((loss_suffix, loss_var))
      for (loss_name, loss_var) in names_and_vars:
        tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
      with tf.variable_scope("train_stats", reuse=True):
        nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
      # Steps counted for this problem relative to (global_step + 1).
      tf.summary.scalar("problem_%d_frequency" % n,
                        tf.to_float(nth_steps) /
                        (tf.to_float(global_step) + 1.0))

  # Add weight decay and noise.
  # NOTE(review): total_size is accumulated but not read again in this
  # function.
  total_size, weight_decay_loss = 0, 0.0
  all_weights = {v.name: v for v in tf.trainable_variables()}
  # Visit the variables in sorted-name order.
  for v_name in sorted(list(all_weights)):
    v = all_weights[v_name]
    v_size = int(np.prod(np.array(v.shape.as_list())))
    total_size += v_size
    if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
      # Add weight regularization if set and the weight is not a bias (dim>1).
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        v_loss = tf.nn.l2_loss(v) / v_size
      weight_decay_loss += v_loss
    is_body = len(v_name) > 5 and v_name[:5] == "body/"
    if hparams.weight_noise > 0.0 and is_body:
      # Add weight noise if set in hparams.
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        scale = learning_rate * 0.001
        noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale
        noise_op = v.assign_add(noise)
      # Tie the noise update to the loss so it runs whenever the loss does.
      with tf.control_dependencies([noise_op]):
        total_loss = tf.identity(total_loss)
  if hparams.weight_decay > 0.0:
    total_loss += weight_decay_loss * hparams.weight_decay

  # The new data reader occasionally emits very small batches, which
  # cause the examples in those batches to be grossly overweighted.
  # We decrease the loss proportionally to the ratio of the size of this
  # batch to the size of the largest training batch ever.
  # TODO(noam): to be more sophisticated, we could keep separate
  # maxima based on problem choice.
  max_nonpadding_var = tf.get_variable(
      "max_nonpadding",
      shape=[],
      initializer=tf.ones_initializer(),
      trainable=False)
  max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
  # The running maximum is stored before the multiplier is computed.
  with tf.control_dependencies([tf.assign(max_nonpadding_var, max_nonpadding)]):
    small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
  tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
  total_loss *= small_batch_multiplier

  # Log variable sizes
  _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
  # "Diet" variables are the global variables whose dtype is float16_ref.
  diet_vars = [
      v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
  ]
  _log_variable_sizes(diet_vars, "Diet Variables")

  # Optimize
  train_op = optimize.optimize(total_loss, learning_rate, hparams)

  # Remove summaries that will fail to run because they are in conditionals.
  # TODO(cwhipkey): Test with this code removed, later in 2017.
  summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
  # Walk backwards so deleting an entry does not shift the indices still to
  # be visited.
  for i in reversed(range(len(summaries))):
    if summaries[i].name.startswith("cond_"):
      del summaries[i]

  tf.logging.info("Global model_fn finished.")
  return tf.estimator.EstimatorSpec(
      mode,
      predictions={"problem_choice": features["problem_choice"]},
      loss=total_loss,
      train_op=train_op)
def build_model(self):
    """Build the query/reply matching graph and attach its tensors to `self`.

    The graph embeds queries and replies with a shared embedding matrix,
    encodes both with one `transformer.TransformerEncoder`, sums the encoder
    output over axis 1, and scores (query, reply) pairs with a two-layer
    dense classifier. Pairs on the diagonal of the batch form the positive
    examples; negative pairs are selected by `make_negative_mask`.

    Reads `self.config`, `self.data` and `self.mode`. Sets, among others,
    `self.data_iterator`, the input placeholders, `self.learning_rate`,
    `self.logits`, `self.labels`, `self.probs`, `self.predictions`,
    `self.loss`, `self.train_step` and `self.accuracy`.

    NOTE(review): confirm which statements belong inside each
    `tf.variable_scope` below; op and variable names depend on it.
    """
    # Build the index table; default_value=0 is the id returned for entries
    # that are not in the vocabulary file.
    index_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=self.config.vocab_list,
        num_oov_buckets=0,
        default_value=0)

    # Get the data iterator.
    self.data_iterator = self.data.get_data_iterator(index_table,
                                                     mode=self.mode)

    # Inputs.
    with tf.variable_scope("inputs"):
        # The iterator's next batch is the default value; feeding a
        # placeholder overrides it.
        next_batch = self.data_iterator.get_next()
        self.input_queries = tf.placeholder_with_default(
            next_batch["input_queries"], [None, self.config.max_length],
            name="input_queries")
        self.input_replies = tf.placeholder_with_default(
            next_batch["input_replies"], [None, self.config.max_length],
            name="input_replies")
        # NOTE(review): tf.squeeze without an axis drops every size-1
        # dimension, which would include a batch dimension of size one --
        # confirm that cannot occur here.
        self.query_lengths = tf.placeholder_with_default(
            tf.squeeze(next_batch["query_lengths"]), [None],
            name="query_lengths")
        self.reply_lengths = tf.placeholder_with_default(
            tf.squeeze(next_batch["reply_lengths"]), [None],
            name="reply_lengths")

        # Hyperparameters fed at run time. Only embed_dropout_keep_prob and
        # num_negative_samples are read again in this method.
        # embed_dropout_keep_prob is float64 while the other keep-probs are
        # float32 -- presumably to match the embedding dtype; verify.
        self.embed_dropout_keep_prob = tf.placeholder(
            tf.float64, name="embed_dropout_keep_prob")
        self.lstm_dropout_keep_prob = tf.placeholder(
            tf.float32, name="lstm_dropout_keep_prob")
        self.dense_dropout_keep_prob = tf.placeholder(
            tf.float32, name="dense_dropout_keep_prob")
        self.num_negative_samples = tf.placeholder(
            tf.int32, name="num_negative_samples")

    with tf.variable_scope("properties"):
        # Dynamic size of the first dimension of the query batch.
        cur_batch_length = tf.shape(self.input_queries)[0]

        # Get hparams from tensor2tensor.models.transformer.
        # This hparams object only feeds the learning-rate schedule; the
        # name is rebound to a fresh object in the "transformer" scope.
        hparams = transformer.transformer_small()
        hparams.batch_size = self.config.batch_size
        hparams.learning_rate_decay_steps = 10000
        hparams.learning_rate_minimum = 3e-5

        # Learning rate.
        lr = learning_rate.learning_rate_schedule(hparams)
        self.learning_rate = lr

    # Embedding layer.
    with tf.variable_scope("embedding"):
        embeddings = tf.Variable(get_embeddings(
            self.config.vocab_list, self.config.pretrained_embed_dir,
            self.config.vocab_size, self.config.embed_dim),
                                 trainable=True,
                                 name="embeddings")
        # noise_shape [rows, 1] makes dropout keep or drop whole rows of
        # the embedding matrix.
        embeddings = tf.nn.dropout(
            embeddings,
            keep_prob=self.embed_dropout_keep_prob,
            noise_shape=[tf.shape(embeddings)[0], 1])
        queries_embedded = tf.to_float(
            tf.nn.embedding_lookup(embeddings,
                                   self.input_queries,
                                   name="queries_embedded"))
        replies_embedded = tf.to_float(
            tf.nn.embedding_lookup(embeddings,
                                   self.input_replies,
                                   name="replies_embedded"))

        self.queries_embedded = queries_embedded
        self.replies_embedded = replies_embedded

    # Transformer layer.
    with tf.variable_scope("transformer"):
        # Insert a size-1 axis at position 2 before calling the encoder.
        queries_expanded = tf.expand_dims(queries_embedded,
                                          axis=2,
                                          name="queries_expanded")
        replies_expanded = tf.expand_dims(replies_embedded,
                                          axis=2,
                                          name="replies_expanded")

        # Fresh hparams for the encoder; this is also the object later
        # passed to optimize.optimize.
        hparams = transformer.transformer_small()
        hparams.set_hparam("batch_size", self.config.batch_size)
        hparams.set_hparam("hidden_size", self.config.embed_dim)

        # The same encoder object is called for queries and replies; the
        # first element of each call's result is kept.
        encoder = transformer.TransformerEncoder(hparams, mode=self.mode)
        self.queries_encoded = encoder({
            "inputs": queries_expanded,
            "targets": queries_expanded
        })[0]
        self.replies_encoded = encoder({
            "inputs": replies_expanded,
            "targets": replies_expanded
        })[0]

        # Sum over axis 1, then drop all size-1 dimensions.
        # NOTE(review): the axis-less squeeze would also drop a batch
        # dimension of size one -- confirm.
        self.queries_encoded = tf.squeeze(
            tf.reduce_sum(self.queries_encoded, axis=1, keep_dims=True))
        self.replies_encoded = tf.squeeze(
            tf.reduce_sum(self.replies_encoded, axis=1, keep_dims=True))

    with tf.variable_scope("sampling"):
        # Identity matrix: pair (i, i) is the positive example for row i.
        positive_mask = tf.eye(cur_batch_length)
        negative_mask = make_negative_mask(
            tf.zeros([cur_batch_length, cur_batch_length]),
            method=self.config.negative_sampling,
            num_negative_samples=self.num_negative_samples)
        # Coordinates of the non-zero mask entries, split into a column of
        # query indices and a column of reply indices.
        negative_queries_indices, negative_replies_indices = tf.split(
            tf.where(tf.not_equal(negative_mask, 0)), [1, 1], 1)

        # Dot product between every encoded query and every encoded reply.
        self.distances = tf.matmul(self.queries_encoded,
                                   self.replies_encoded,
                                   transpose_b=True)
        self.distances_flattened = tf.reshape(self.distances, [-1])

        # Pick the entries selected by each (flattened) mask.
        self.positive_distances = tf.gather(
            self.distances_flattened,
            tf.where(tf.reshape(positive_mask, [-1])))
        self.negative_distances = tf.gather(
            self.distances_flattened,
            tf.where(tf.reshape(negative_mask, [-1])))

        self.negative_queries_indices = tf.squeeze(
            negative_queries_indices)
        self.negative_replies_indices = tf.squeeze(
            negative_replies_indices)

        # Classifier input per pair: [query encoding, score, reply encoding].
        self.positive_inputs = tf.concat([
            self.queries_encoded, self.positive_distances,
            self.replies_encoded
        ], 1)
        # Reshaped to [number of negative pairs, 2 * embed_dim + 1].
        self.negative_inputs = tf.reshape(
            tf.concat([
                tf.nn.embedding_lookup(self.queries_encoded,
                                       self.negative_queries_indices),
                self.negative_distances,
                tf.nn.embedding_lookup(self.replies_encoded,
                                       self.negative_replies_indices)
            ], 1), [
                tf.shape(negative_queries_indices)[0],
                self.config.embed_dim * 2 + 1
            ])

    with tf.variable_scope("prediction"):
        # Positive rows come first, followed by the negative rows.
        self.hidden_outputs = tf.layers.dense(tf.concat(
            [self.positive_inputs, self.negative_inputs], 0),
                                              256,
                                              tf.nn.relu,
                                              name="hidden_layer")
        # NOTE(review): the output layer applies ReLU to the logits, and
        # `probs` below uses sigmoid while the loss uses softmax
        # cross-entropy -- confirm this is intended.
        self.logits = tf.layers.dense(self.hidden_outputs,
                                      2,
                                      tf.nn.relu,
                                      name="output_layer")
        # Label 1 for each positive row and 0 for each negative row,
        # one-hot encoded with depth 2.
        labels = tf.concat([
            tf.ones([tf.shape(self.positive_inputs)[0]], tf.float64),
            tf.zeros([tf.shape(self.negative_inputs)[0]], tf.float64)
        ], 0)

        self.labels = tf.one_hot(tf.to_int32(labels), 2)

        self.probs = tf.sigmoid(self.logits)
        self.predictions = tf.argmax(self.probs, 1)

    with tf.variable_scope("loss"):
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.labels,
                                                       logits=self.logits))
        self.train_step = optimize.optimize(self.loss,
                                            lr,
                                            hparams,
                                            use_tpu=False)

    with tf.variable_scope("score"):
        # Mean of the element-wise match between predicted and true class.
        correct_predictions = tf.equal(self.predictions,
                                       tf.argmax(self.labels, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                       name="accuracy")
def optimize(self, loss, num_async_replicas=1, use_tpu=False):
    """Create the op that applies one optimization step to `loss`.

    The learning rate is `hparams.learning_rate` multiplied by the decay
    factor from `optimize.learning_rate_decay`, then divided by the square
    root of `num_async_replicas`.

    Args:
        loss: the loss to minimize.
        num_async_replicas: number of asynchronous replicas; the learning
            rate is divided by its square root.
        use_tpu: forwarded unchanged to `optimize.optimize`.

    Returns:
        The training op returned by `optimize.optimize`.
    """
    base_rate = self.hparams.learning_rate
    decay_factor = optimize.learning_rate_decay(self.hparams)
    replica_scale = math.sqrt(float(num_async_replicas))
    scaled_rate = (base_rate * decay_factor) / replica_scale
    return optimize.optimize(loss, scaled_rate, self.hparams, use_tpu=use_tpu)
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
    """Builds the model for all modes.

    * TRAIN: Constructs loss and train_op
    * EVAL: Constructs the loss and eval metrics
    * PREDICT: Constructs the predictions

    When `hparams.rl` is set, the non-PREDICT path that does not evaluate
    autoregressively first decodes samples from the model, scores them with
    the APPROX_BLEU_TRAIN metric, and stores both in `features` (keys
    "samples" and "values") before calling `model_class.model_fn`. The
    individual losses are then multiplied by weights read from `hparams`.

    Args:
      model: str, name of model.
      features: dict<feature name, Tensor>. Expected to have keys
        {inputs, targets, problem_choice}. May be modified in place when
        `hparams.rl` is set.
      mode: tf.estimator.ModeKeys.
      hparams: model HParams.
      problem_names: list of str, names of the problems.
      train_steps: int, total number of training steps. Used to compute
        learning rate decay.
      worker_id: int, id of this worker.
      worker_replicas: int, number of workers.
      eval_run_autoregressive: bool, whether to run evaluation
        autoregressively.
      decode_hparams: HParams for decode settings. Used when
        mode == PREDICT.

    Returns:
      tf.estimator.EstimatorSpec

    Raises:
      AssertionError: if `problem_names` and `hparams.problem_instances`
        differ in length, or if `mode` is not one of PREDICT, EVAL or TRAIN.
    """
    assert len(problem_names) == len(hparams.problem_instances)
    # Short alias used inside nth_model for the PREDICT branch.
    decode_hp = decode_hparams

    # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
    dp = devices.data_parallelism()

    # Set the default initializer of the current variable scope.
    tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    # NOTE(review): `targets_nonpadding_tokens` is only bound when `features`
    # holds a "targets" Tensor of rank > 1; the TRAIN path below reads it
    # unconditionally -- confirm that is always the case when training.
    with tf.name_scope("input_stats"):
        for (k, v) in six.iteritems(features):
            if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                # Every element that is not equal to 0 counts as non-padding.
                nonpadding = tf.to_float(tf.not_equal(v, 0))
                nonpadding_tokens = tf.reduce_sum(
                    nonpadding)  # number of non-zero tokens
                if k == "targets":
                    targets_nonpadding_tokens = nonpadding_tokens
                tf.summary.scalar("%s_nonpadding_tokens" % k,
                                  nonpadding_tokens)
                tf.summary.scalar("%s_nonpadding_fraction" % k,
                                  tf.reduce_mean(nonpadding))

    # Get multi-problem logits and loss based on features["problem_choice"].
    # Filled by nth_model as a side effect; read again when building the
    # per-problem loss summaries in the TRAIN path.
    loss_variable_names = []

    def nth_model(n):
        """Build the model for the n-th problem, plus some added variables.

        Args:
          n: int, index into `hparams.problems`.

        Returns:
          In PREDICT mode, whatever `model_class.infer` returns. Otherwise a
          list `[total_loss, logits]` where `total_loss` is reshaped to a
          scalar and `logits` is the concatenation of the sharded logits
          along axis 0.
        """
        model_class = registry.model(model)(
            hparams,
            mode,
            hparams.problems[n],
            n,
            dp,
            devices.ps_devices(all_workers=True),
            decode_hparams=decode_hparams
        )  # instantiate the registered model class for problem n
        if mode == tf.estimator.ModeKeys.PREDICT:
            return model_class.infer(features,
                                     beam_size=decode_hp.beam_size,
                                     top_beams=(decode_hp.beam_size if
                                                decode_hp.return_beams else 1),
                                     alpha=decode_hp.alpha,
                                     decode_length=decode_hp.extra_length)
        # In distributed mode, we build graph for problem=0 and
        # problem=worker_id.
        skipping_is_on = hparams.problem_choice == "distributed" and is_training
        problem_worker_id = worker_id % len(hparams.problems)
        skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
        # On worker 0 also build graph for problems <= 1.
        # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
        skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
        # Number of samples per example for the MRT branch; None when the
        # hparam is absent.
        mrt_samples = getattr(hparams, 'mrt_samples', None)
        if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
            # Autoregressive evaluation.
            sharded_logits, losses_dict = model_class.eval_autoregressive(
                features)
        else:
            # Training, or evaluation that is not autoregressive.
            # NOTE(review): the nesting of the RL branch below is inferred
            # from data flow (`samples` must be bound on every path) --
            # confirm against the intended layout.
            if hparams.rl:
                # Generate sample data; it is sharded automatically. Samples
                # are expected to end up with shape [batch, time, 1, 1].
                if model_class._num_datashards == 1:
                    # Single data shard: use the fast decode path.
                    print("###Work on Single GPU card, Use Fast Decode.###")
                    train_beam = getattr(hparams, 'train_beam', None)
                    if mrt_samples:
                        samples, _ = model_class._fast_decode(
                            features,
                            decode_length=50,
                            beam_size=mrt_samples,
                            top_beams=mrt_samples)
                        # Repeat every input/target row mrt_samples times:
                        # drop the two trailing axes, tile along axis 1,
                        # reshape to batch_size * mrt_samples rows, then put
                        # the two trailing axes back.
                        inputs = tf.squeeze(tf.squeeze(features["inputs"],
                                                       axis=-1),
                                            axis=-1)
                        targets = tf.squeeze(tf.squeeze(features["targets"],
                                                        axis=-1),
                                             axis=-1)
                        batch_size = tf.shape(inputs)[0]
                        inputs_len = tf.shape(inputs)[1]
                        targets_len = tf.shape(targets)[1]
                        inputs_tile = tf.tile(inputs, [1, mrt_samples])
                        targets_tile = tf.tile(targets, [1, mrt_samples])
                        inputs_reshape = tf.reshape(
                            inputs_tile,
                            [batch_size * mrt_samples, inputs_len])
                        targets_reshape = tf.reshape(
                            targets_tile,
                            [batch_size * mrt_samples, targets_len])
                        inputs_feed = tf.expand_dims(tf.expand_dims(
                            inputs_reshape, axis=-1),
                                                     axis=-1)
                        targets_feed = tf.expand_dims(tf.expand_dims(
                            targets_reshape, axis=-1),
                                                      axis=-1)
                        # NOTE(review): this mutates the caller's `features`
                        # dict in place.
                        features["inputs"] = inputs_feed
                        features["targets"] = targets_feed
                    elif train_beam and train_beam != 1:
                        # Beam search with hparams.train_beam as the beam
                        # size; the top sample is returned.
                        samples, _ = model_class._fast_decode(
                            features,
                            decode_length=50,
                            beam_size=hparams.train_beam)
                    else:
                        targets_beam = getattr(hparams, 'targets_beam', None)
                        if targets_beam:
                            # Replace the targets with a decoded sequence
                            # reshaped to [dim0, dim1, 1, 1].
                            targets_samples, _ = model_class._fast_decode(
                                features,
                                decode_length=50,
                                beam_size=4,
                                sampling_method='argmax')
                            targets_samples = tf.reshape(
                                targets_samples, [
                                    tf.shape(targets_samples)[0],
                                    tf.shape(targets_samples)[1], 1, 1
                                ])
                            features["targets"] = targets_samples
                        samples, _ = model_class._fast_decode(
                            features, decode_length=50)
                    samples = tf.expand_dims(samples, axis=-1)
                    samples = tf.expand_dims(
                        samples, axis=-1
                    )  # add two additional dimensions to make it compatible.
                else:
                    # More than one data shard: only the slow decode path is
                    # supported.
                    print("###Work on Multi GPU cards, Use Slow Decode.###")
                    samples, _, _ = model_class._slow_greedy_infer(
                        features, decode_length=50)  # default decode_length = 50

                # No gradient flows back through the sampled sequences.
                samples = tf.stop_gradient(samples)
                # Calculate the BLEU score using the training metric function.
                # train_metric_fn = "approx_bleu_train_score"
                train_metric_fn = metrics.METRICS_FNS[
                    metrics.Metrics.APPROX_BLEU_TRAIN]
                labels = features.get("targets", None)
                samples.set_shape([None, None, 1, 1])
                # hparams.delta_reward = True for delta reward; False for
                # total reward.
                # NOTE(review): the keyword is spelled `delat_reward`; it has
                # to match the metric function's signature -- confirm.
                metric_value = train_metric_fn(
                    samples, labels, delat_reward=hparams.delta_reward)
                metric_value = tf.stop_gradient(
                    metric_value)  # to be more strict of the gradient
                metric_value.set_shape([None, None, 1, 1])
                """Accodring to the metrics.py: The tf.metrics.mean function assures correct aggregation."""
                # Hand the samples and their rewards to the model through
                # the features dict.
                features["samples"] = samples
                features["values"] = metric_value
                # del samples
                # del labels
            sharded_logits, losses_dict = model_class.model_fn(
                features,
                skip=(skipping_is_on and skip_this_one),
                mrt=mrt_samples)
            # if hparams.rl:
            #     training_loss = losses_dict["training"] * metric_value  # losses_dict["training"]: [batch, timesteps]
            #     training_loss_sum = tf.reduce_sum(training_loss)  # sum the training_loss
            #     losses_dict["training"] = training_loss_sum  # log_prob * r (current r is total_reward)

        with tf.variable_scope("losses_avg"):
            total_loss, ops = 0.0, []
            for loss_key, loss_value in six.iteritems(losses_dict):
                if hparams.rl:
                    # Per-loss weights; each falls back to the default given
                    # here when the hparam is absent.
                    baseline_loss_weight = getattr(hparams,
                                                   'baseline_loss_weight', 1.0)
                    training_loss_weight = getattr(hparams,
                                                   'training_loss_weight', 1.0)
                    mle_training_loss_weight = getattr(
                        hparams, 'mle_training_loss_weight', 0.3)
                    if loss_key == "training":
                        loss_value = loss_value * training_loss_weight
                    elif loss_key == "training_baseline":
                        loss_value = loss_value * baseline_loss_weight
                    elif loss_key == "mle_training":
                        loss_value = loss_value * mle_training_loss_weight
                # One non-trainable moving average (starting at 100.0,
                # updated as 0.9 * old + 0.1 * new) per loss.
                loss_name = "problem_%d/%s_loss" % (n, loss_key)
                loss_moving_avg = tf.get_variable(loss_name,
                                                  initializer=100.0,
                                                  trainable=False)
                loss_variable_names.append(loss_name)
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                           loss_value * 0.1))
                total_loss += loss_value
            try:  # Total loss avg might be reused or not, we try both.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    # Total loss was already constructed on input.
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                      n)
            except ValueError:
                loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                                  initializer=100.0,
                                                  trainable=False)
            ops.append(
                loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                       total_loss * 0.1))
        with tf.variable_scope("train_stats"):  # Count steps for this problem.
            problem_steps = tf.get_variable("problem_%d_steps" % n,
                                            initializer=0,
                                            trainable=False)
            ops.append(problem_steps.assign_add(1))
        with tf.control_dependencies(ops):  # Make sure the ops run.
            # Ensure the loss is a scalar here.
            total_loss = tf.reshape(total_loss, [],
                                    name="total_loss_control_id")
        return [total_loss, tf.concat(sharded_logits, 0)]

    # Yields [total_loss, logits] outside PREDICT mode, and the result of
    # `infer` in PREDICT mode.
    model_output = input_fn_builder.cond_on_index(
        nth_model,
        index_tensor=features["problem_choice"],
        max_idx=len(hparams.problems) - 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # If beam searching, model_output will be a dict with keys "outputs"
        # and "scores".
        if isinstance(model_output, dict):  # beam search
            outputs = model_output["outputs"]
            scores = model_output["scores"]
        else:
            outputs = model_output
            scores = None

        # Repeat the problem choice once per row of features["inputs"].
        batched_problem_choice = (features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0], ), dtype=tf.int32))
        predictions = {
            "outputs": outputs,
            "scores": scores,
            "inputs": features.get("inputs", None),
            "targets": features.get("infer_targets", None),
            "problem_choice": batched_problem_choice,
        }
        _del_dict_nones(predictions)  # delete the empty ones in predictions

        # "scores" is exported only if it is still a key of `predictions`
        # after the _del_dict_nones call above.
        export_out = {"outputs": predictions["outputs"]}
        if "scores" in predictions:
            export_out["scores"] = predictions["scores"]

        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                "output": tf.estimator.export.PredictOutput(export_out)
            })

    total_loss, logits = model_output

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_fns = metrics.create_evaluation_metrics(
            hparams.problem_instances, hparams)

        eval_metrics = {}
        for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
            eval_metrics[metric_name] = metric_fn(logits, features)

        return tf.estimator.EstimatorSpec(mode,
                                          predictions={"predictions": logits},
                                          eval_metric_ops=eval_metrics,
                                          loss=total_loss)

    assert mode == tf.estimator.ModeKeys.TRAIN

    # Set learning rate
    learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
        hparams,
        num_worker_replicas=worker_replicas,
        num_train_steps=train_steps)
    # Scale the rate down by the square root of the number of workers.
    learning_rate /= math.sqrt(float(worker_replicas))

    # Get global step
    global_step = tf.train.get_or_create_global_step()

    # Some training statistics.
    with tf.name_scope("training_stats"):
        tf.summary.scalar("learning_rate", learning_rate)
        for n in xrange(len(hparams.problems)):
            names_and_vars = []
            # Look up (reuse=True) the moving averages that nth_model created.
            with tf.variable_scope("losses_avg", reuse=True):
                total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
                names_and_vars.append(("total_loss", total_loss_var))
            with tf.variable_scope("losses_avg", reuse=True):
                for loss_name in loss_variable_names:
                    if loss_name.startswith("problem_%d/" % n):
                        loss_var = tf.get_variable(loss_name)
                        # Keep only the part after the "problem_<n>/" prefix.
                        loss_suffix = loss_name[loss_name.index("/") + 1:]
                        names_and_vars.append((loss_suffix, loss_var))
            for (loss_name, loss_var) in names_and_vars:
                tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
            with tf.variable_scope("train_stats", reuse=True):
                nth_steps = tf.get_variable("problem_%d_steps" % n,
                                            dtype=tf.int32)
            # Steps counted for this problem relative to (global_step + 1).
            tf.summary.scalar(
                "problem_%d_frequency" % n,
                tf.to_float(nth_steps) / (tf.to_float(global_step) + 1.0))

    # Add weight decay and noise.
    # NOTE(review): total_size is accumulated but not read again in this
    # function.
    total_size, weight_decay_loss = 0, 0.0
    all_weights = {v.name: v for v in tf.trainable_variables()}
    # Visit the variables in sorted-name order.
    for v_name in sorted(list(all_weights)):
        v = all_weights[v_name]
        v_size = int(np.prod(np.array(v.shape.as_list())))
        total_size += v_size
        if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
            # Add weight regularization if set and the weight is not a bias
            # (dim>1).
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                v_loss = tf.nn.l2_loss(v) / v_size
            weight_decay_loss += v_loss
        is_body = len(v_name) > 5 and v_name[:5] == "body/"
        if hparams.weight_noise > 0.0 and is_body:
            # Add weight noise if set in hparams.
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                scale = learning_rate * 0.001
                noise = tf.truncated_normal(
                    v.shape) * hparams.weight_noise * scale
                noise_op = v.assign_add(noise)
            # Tie the noise update to the loss so it runs whenever the loss
            # does.
            with tf.control_dependencies([noise_op]):
                total_loss = tf.identity(total_loss)
    if hparams.weight_decay > 0.0:
        total_loss += weight_decay_loss * hparams.weight_decay

    # The new data reader occasionally emits very small batches, which
    # cause the examples in those batches to be grossly overweighted.
    # We decrease the loss proportionally to the ratio of the size of this
    # batch to the size of the largest training batch ever.
    # TODO(noam): to be more sophisticated, we could keep separate
    # maxima based on problem choice.
    max_nonpadding_var = tf.get_variable("max_nonpadding",
                                         shape=[],
                                         initializer=tf.ones_initializer(),
                                         trainable=False)
    max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
    # The running maximum is stored before the multiplier is computed.
    with tf.control_dependencies(
        [tf.assign(max_nonpadding_var, max_nonpadding)]):
        small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
    tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
    total_loss *= small_batch_multiplier

    # Log variable sizes
    _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    # "Diet" variables are the global variables whose dtype is float16_ref.
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    _log_variable_sizes(diet_vars, "Diet Variables")

    # Optimize
    train_op = optimize.optimize(total_loss, learning_rate, hparams)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    # Walk backwards so deleting an entry does not shift the indices still
    # to be visited.
    for i in reversed(range(len(summaries))):
        if summaries[i].name.startswith("cond_"):
            del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"problem_choice": features["problem_choice"]},
        loss=total_loss,
        train_op=train_op)