Example #1
    def build_training_op(self, loss):
        """Build the training operation.

    Args:
      loss: the loss tensor to minimize.

    Defines the optimization operation and performs the gradient computation
    for both TPU and non-TPU training.

    Returns:
      The op that applies the computed (and clipped) gradients.
    """
        adam_optimizer = tf.train.AdamOptimizer(
            learning_rate=self._decayed_learning_rate, epsilon=1e-5)
        if self._use_tpu:
            # Notes from: learning/brain/research/dune/examples/v2018_09/train.py
            # If we use TPUs, reduce_mean runs on each chip separately and by default
            # only the loss of the first chip is reported.
            #
            # You can either:
            # - keep this branch, which synchronizes the losses
            #   across the chips to obtain the full loss on all samples.
            # - or remove this section, gaining some performance and getting the
            #   loss only from the first chip.
            # The CrossShardOptimizer aggregates the loss across shards when
            # computing gradients.
            adam_optimizer = tf.tpu.CrossShardOptimizer(adam_optimizer)

            tpu_sum_loss = contrib_tpu.cross_replica_sum(loss /
                                                         self._tpu_num_shards)

            grads_and_vars = adam_optimizer.compute_gradients(
                tpu_sum_loss, self.total_params)
            sum_grads = []
            sum_vars = []
            for (grad, var) in grads_and_vars:
                if grad is None:
                    sum_grads.append(grad)
                    sum_vars.append(var)
                else:
                    sum_grads.append(
                        contrib_tpu.cross_replica_sum(grad) /
                        self._tpu_num_shards)
                    sum_vars.append(var)
            # Clip the aggregated gradients by global norm.
            norm_grads, _ = tf.clip_by_global_norm(sum_grads, 0.5)
            grads_and_vars = list(zip(norm_grads, sum_vars))
        else:
            grads_and_vars = adam_optimizer.compute_gradients(
                loss, self.total_params)
            grads, var = zip(*grads_and_vars)
            norm_grads, _ = tf.clip_by_global_norm(grads, 0.5)
            grads_and_vars = list(zip(norm_grads, var))

        return adam_optimizer.apply_gradients(
            grads_and_vars, global_step=tf.train.get_global_step())
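A quick aside before the next example: the scaling rule behind tf.clip_by_global_norm is easy to verify by hand. The sketch below (plain NumPy, my own illustration rather than part of any example here) rescales every tensor in the list by clip_norm / max(global_norm, clip_norm), which is what makes global-norm clipping preserve the relative direction of the gradient, unlike per-element clipping.

import numpy as np

def clip_by_global_norm(tensors, clip_norm):
    # Global norm over the whole list: sqrt of the sum of all squared entries.
    global_norm = np.sqrt(sum(np.sum(np.square(t)) for t in tensors))
    # The scale factor is at most 1.0, so small gradients pass through unchanged.
    scale = clip_norm / max(global_norm, clip_norm)
    return [t * scale for t in tensors], global_norm

grads = [np.array([3.0, 4.0]), np.array([12.0])]      # global norm = 13.0
clipped, norm = clip_by_global_norm(grads, 0.5)
print(norm)                                           # 13.0
print(np.sqrt(sum(np.sum(c ** 2) for c in clipped)))  # 0.5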
Example #2
    def __init__(self, sess, state_dim, action_dim, learning_rate,
                 global_critic):
        self.sess = sess
        self.global_critic = global_critic
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate

        # Create the critic network
        self.model, self.phi, self.states = build_network(self.state_dim)

        # Placeholder for the TD (temporal-difference) targets
        self.td_targets = tf.placeholder(tf.float32, [None, 1])

        # Worker's loss function and gradients
        v_values = self.model.output  #abstract tensor
        loss = tf.reduce_sum(tf.square(self.td_targets - v_values))
        dj_dphi = tf.gradients(loss, self.phi)

        # Gradient clipping
        dj_dphi, _ = tf.clip_by_global_norm(dj_dphi, 40)

        # Update the global network with the worker's gradients
        grads = zip(dj_dphi, self.global_critic.phi)

        self.critic_optimizer = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)
Example #3
    def __init__(self,
                 mdp,
                 n_input,
                 lr,
                 n_h1=400,
                 n_h2=300,
                 l2=10,
                 name='deep_irl_fc'):
        super(DeepIRLFC, self).__init__(mdp, lr)

        self.n_input = n_input
        self.lr = lr
        self.n_h1 = n_h1
        self.n_h2 = n_h2
        self.name = name

        self.sess = tf.compat.v1.Session()
        self.input_s, self.reward, self.theta = self._build_network(self.name)
        self.optimizer = tf.train.AdamOptimizer(lr)

        self.grad_r = tf.placeholder(tf.float32, [None, 1])
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.theta])
        self.grad_l2 = tf.gradients(self.l2_loss, self.theta)

        self.grad_theta = tf.gradients(self.reward, self.theta, -self.grad_r)
        self.grad_theta = [
            tf.add(l2 * self.grad_l2[i], self.grad_theta[i])
            for i in range(len(self.grad_l2))
        ]
        self.grad_theta, _ = tf.clip_by_global_norm(self.grad_theta, 100.0)

        self.grad_norms = tf.global_norm(self.grad_theta)
        self.optimize = self.optimizer.apply_gradients(
            zip(self.grad_theta, self.theta))
        self.sess.run(tf.compat.v1.global_variables_initializer())
Example #4
def output(image, labels, optimize, loss, out, reshaped_labels):
    """
        Handles output of a model
        input
            image
            labels
            optimize bool
                construct graph with optimizer or not
            loss
                objective of model
            out
                logits
            reshaped_labels
                labels as a (x,2) tensor
        output
            if optimize:
                image
                labels
                optmizer
                    used to train
            else:
                image
                labels
                prob_of_cell shape=shape=(b,s,s,1)
                    probability of pixel being a cone
                correct_prediction
                    number of correct pixel classifications

    """
    if optimize:
        optimizer = tf.train.RMSPropOptimizer(1e-3)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

        # occasionally gradients would explode
        # tells us if there are NaN values in any tensors
        grad_checks = [
            tf.check_numerics(grad, 'Gradients exploding')
            for grad in gradients if grad is not None
        ]
        with tf.control_dependencies(grad_checks):
            optimize = optimizer.apply_gradients(zip(gradients, variables))
            return image, labels, optimize
    else:
        # convert scores to probabilities
        probs = tf.nn.softmax(out)

        # gives classification
        prediction = tf.argmax(probs, 1)

        # count how many classifications are correct
        correct_prediction = tf.equal(tf.argmax(reshaped_labels, 1),
                                      prediction)

        # if you give images of shape (b,s,s,1) then
        # probs is (b,s,s,1) and each value of probs
        # is the probability that the corresponding
        # pixel belongs to a cone
        prob_of_cell = probs[:, 1]
        return image, labels, prob_of_cell, correct_prediction
Example #5
    def train(self, X, Y):
        # train_log = train_log.reshape(-1, self._batch_size, 21)

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32, name='global_step')
        starter_learning_rate = 0.01

        optimizer = tf.train.AdadeltaOptimizer(starter_learning_rate, epsilon=1e-06)

        # Compute the gradients for each variable
        grads_and_vars = optimizer.compute_gradients(-self._loss)

        # gradient clipping
        grads, variables = zip(*grads_and_vars)
        grads_clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1)
        apply_gradients_op = optimizer.apply_gradients(zip(grads_clipped, variables), global_step=global_step)

        # start the Session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())


        # for batch_session in train_log:
        # inputs, targets = self._get_batch_train_sample(batch_session)

        # get the loss and the probabilities that the model outputs
        for step in range(1000):
            _, loss_, pred, probs = self.sess.run([apply_gradients_op, self._loss, self._predictions, self._probabilities],
                                             feed_dict={self._inputs: X,
                                                        self._targets: Y,
                                                        self._state_placeholder: np.zeros((self._lstm_num_layers, 2,
                                                                                            self._batch_size,
                                                                                            self._lstm_num_hidden)),
                                                        self.keep_prob: 0.9})
            print(step, loss_)
Example #6
    def _build_train(self):
        """Build training ops."""
        print('-' * 80)
        print('Building train graph')
        reg_loss, loss = self._forward(self.x_train,
                                       self.y_train,
                                       self.train_params,
                                       self.batch_init_states,
                                       is_training=True)

        tf_vars = tf.trainable_variables()
        global_step = tf.train.get_or_create_global_step()
        lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) /
                    tf.cast(self.params.bptt_steps, dtype=tf.float32))
        learning_rate = utils.get_lr(global_step, self.params) * lr_scale
        # learning_rate = tf.Print(
        #     learning_rate,
        #     [learning_rate, lr_scale, self.base_bptt, tf.shape(self.y_train)],
        #     message='lr: ', summarize=3)
        grads = tf.gradients(reg_loss, tf_vars)
        clipped_grads, grad_norm = tf.clip_by_global_norm(
            grads, self.params.grad_bound)

        (self.update_moving_avg_ops, self.use_moving_avg_vars,
         self.restore_normal_vars) = self._create_average_ops()
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars),
                                             global_step=global_step)

        self.train_loss = loss
        self.train_op = train_op
        self.grad_norm = grad_norm
        self.learning_rate = learning_rate
Example #7
    def _create_optimizer(self, args):
        """Create optimizer to minimize loss
        Args:
            args: Various arguments and specifications
        """
        # First extract mean and std for prior dists, dist over g, and dist over x
        g_prior_mean, g_prior_logstd = tf.split(
            self.g_prior, [args.latent_dim, args.latent_dim], axis=1)
        g_prior_std = tf.exp(g_prior_logstd) + 1e-6
        g_mean, g_logstd = tf.split(self.g_dists,
                                    [args.latent_dim, args.latent_dim],
                                    axis=1)
        g_std = tf.exp(g_logstd) + 1e-6

        # Get predictions for x and reconstructions
        self.x_pred_norm = self._get_decoder_output(args, self.z_vals)
        self.x_pred = self.x_pred_norm * self.scale + self.shift

        # First component of loss: NLL of observed states
        x_reshape = tf.reshape(
            self.x, [args.batch_size, 2 * args.seq_length, args.state_dim])
        x_pred_reshape = tf.reshape(
            self.x_pred_norm,
            [args.batch_size, args.seq_length, args.state_dim])
        self.x_pred_init = x_pred_reshape * self.scale + self.shift  # needed for ilqr

        # Add in predictions for how system will evolve
        self.x_pred_reshape = tf.concat([x_pred_reshape, self.x_future_norm],
                                        axis=1)
        self.x_pred_reshape_unnorm = self.x_pred_reshape * self.scale + self.shift

        # Prediction loss
        self.pred_loss = tf.reduce_sum(
            tf.square(x_reshape - self.x_pred_reshape))

        # Weight loss at t = T more heavily
        self.pred_loss += 20.0 * tf.reduce_sum(
            tf.square(x_reshape[:, args.seq_length - 1] -
                      x_pred_reshape[:, args.seq_length - 1]))

        # Define reconstructed state needed for ilqr
        self.rec_state = self._get_decoder_output(
            args, self.z1) * self.scale + self.shift

        # Second component of loss: KLD between approximate posterior and prior
        g_prior_dist = tf.distributions.Normal(loc=g_prior_mean,
                                               scale=g_prior_std)
        g_dist = tf.distributions.Normal(loc=g_mean, scale=g_std)
        self.kl_loss = tf.reduce_sum(
            tf.distributions.kl_divergence(g_dist, g_prior_dist))

        # Sum with regularization losses to form total cost
        self.cost = self.pred_loss + self.kl_weight * self.kl_loss + tf.reduce_sum(
            tf.losses.get_regularization_losses())

        # Perform parameter update
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        tvars = tf.trainable_variables()
        self.grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                               args.grad_clip)
        self.train = optimizer.apply_gradients(zip(self.grads, tvars))
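Side note: the kl_loss term above relies on the closed-form KL divergence between Gaussians. A tiny NumPy check of that formula (my own sketch, not part of the example) makes the term concrete; summing it over dimensions gives the diagonal-Gaussian KL that tf.distributions.kl_divergence computes here.

import numpy as np

def kl_normal(m1, s1, m2, s2):
    # KL(N(m1, s1^2) || N(m2, s2^2)) for univariate Gaussians.
    return np.log(s2 / s1) + (s1 ** 2 + (m1 - m2) ** 2) / (2 * s2 ** 2) - 0.5

print(kl_normal(0.0, 1.0, 0.0, 1.0))  # 0.0: identical distributions
print(kl_normal(1.0, 1.0, 0.0, 1.0))  # 0.5: unit shift in the mean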
Example #8
    def init_optimizer(self):
        """最適化アルゴリズムの設定"""
        # 論文だとSGDを利用、lrは1.0→0.1に変化するようになっている
        # Gradients and SGD update operation for training the model
        trainable_params = tf.trainable_variables()
        if self.config['optimizer'] == 'adadelta':
            self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr)
        elif self.config['optimizer'] == 'adam':
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr)
        elif self.config['optimizer'] == 'rmsprop':
            self.opt = tf.train.RMSPropOptimizer(learning_rate=self.lr)
        else:
            self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)

        # Compute gradients of loss w.r.t. all trainable variables
        gradients = tf.gradients(self.loss, trainable_params)

        # Clip gradients by a given maximum_gradient_norm
        clip_gradients, _ = tf.clip_by_global_norm(
            gradients, self.config['max_gradient_norm'])

        # Update the model
        self.train_op = self.opt.apply_gradients(zip(clip_gradients,
                                                     trainable_params),
                                                 global_step=self.global_step)
Example #9
	def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, entropy_beta, global_actor):
		self.sess = sess
		self.global_actor = global_actor

		self.state_dim = state_dim
		self.action_dim = action_dim
		self.action_bound = action_bound
		self.learning_rate = learning_rate

		# Set the minimum and maximum values for the standard deviation
		self.std_bound = [1e-2, 1]
		
		# Create the worker's actor network
		self.model, self.theta, self.states = build_network(self.state_dim, self.action_dim, self.action_bound)

		# Placeholders for the actions and advantages
		self.actions = tf.placeholder(tf.float32, [None, self.action_dim])
		self.advantages = tf.placeholder(tf.float32, [None, 1])

		# Policy probability density function and entropy
		mu_a, std_a = self.model.output
		log_policy_pdf, entropy = self.log_pdf(mu_a, std_a, self.actions)

		# Worker's loss function and gradients
		loss_policy = log_policy_pdf * self.advantages
		loss = tf.reduce_sum(-loss_policy-entropy_beta*entropy)
		dj_dtheta = tf.gradients(loss, self.theta)

		# Gradient clipping
		dj_dtheta, _ = tf.clip_by_global_norm(dj_dtheta, 40)

		# Update the global network with the worker's gradients
		grads = zip(dj_dtheta, self.global_actor.theta)
		self.actor_optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)
Example #10
    def init_optimizer(self):
        print("setting optimizer..")

        # add L2 loss to main loss, do backpropagation
        self.l2_loss = tf.losses.get_regularization_loss()
        tf.summary.scalar("l2_loss", self.l2_loss)

        self.total_loss = tf.add(self.loss, self.l2_loss)
        tf.summary.scalar('final_loss', self.total_loss)

        # we need to define a dependency before calculating the total_loss
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if update_ops:
            updates = tf.group(*update_ops)
            self.final_loss = control_flow_ops.with_dependencies([updates], self.total_loss)
        else:
            # without update ops, final_loss would otherwise be undefined below
            self.final_loss = self.total_loss

        with tf.control_dependencies(update_ops):
            trainable_params = tf.trainable_variables()

            opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            # Compute gradients of loss w.r.t. all trainable variables
            gradients = tf.gradients(self.final_loss, trainable_params)

            # Clip gradients by a given maximum_gradient_norm
            clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)

            # Update the model
            self.update = opt.apply_gradients(zip(clip_gradients, trainable_params),
                                              global_step=self.global_step)
Example #11
def create_optimizer(loss,
                     learning_rate,
                     num_train_steps,
                     weight_decay_rate=0.0,
                     use_tpu=False,
                     warmup_steps=0,
                     warmup_proportion=0,
                     lr_decay_power=1.0,
                     layerwise_lr_decay_power=-1,
                     n_transformer_layers=None,
                     name="adamw",
                     var_map=None):
    """Creates an optimizer and training op."""
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=lr_decay_power,
                                              cycle=False)
    warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
    learning_rate *= tf.minimum(
        1.0,
        tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32))

    if layerwise_lr_decay_power > 0:
        learning_rate = _get_layer_lrs(learning_rate, layerwise_lr_decay_power,
                                       n_transformer_layers)
    if name == "recadam":
        optimizer = RecAdamOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=weight_decay_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            anneal_k=0.5,
            anneal_t0=500,
            anneal_w=1.0,
            pretrain_cof=5000.0,
            pretrain_params=var_map)
    else:
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=weight_decay_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if use_tpu:
        optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
Example #12
    def _make_training_step(self, loss: tf.Tensor) -> tf.Tensor:
        """
        Constructs a training step from the loss parameter and hyperparameters.
        """
        optimizer_name = self.hyperparameters["optimizer"].lower()
        if optimizer_name == "sgd":
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.hyperparameters["learning_rate"])
        elif optimizer_name == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self.hyperparameters["learning_rate"],
                decay=self.hyperparameters["learning_rate_decay"],
                momentum=self.hyperparameters["momentum"],
            )
        elif optimizer_name == "adam":
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.hyperparameters["learning_rate"])
        else:
            raise Exception('Unknown optimizer "%s".' %
                            (self.hyperparameters["optimizer"]))

        # Calculate and clip gradients
        trainable_vars = self._sess.graph.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES)
        gradients = tf.gradients(loss, trainable_vars)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients, self.hyperparameters["gradient_clip_value"])
        pruned_clipped_gradients = []
        for (gradient, trainable_var) in zip(clipped_gradients,
                                             trainable_vars):
            if gradient is None:
                continue
            pruned_clipped_gradients.append((gradient, trainable_var))
        return optimizer.apply_gradients(pruned_clipped_gradients)
Example #13
def nll_gnp_step_bandits(model, data, optimizer_config):
    """Applies gradient updates and returns appropriate metrics.

  Args:
    model: An instance of SNP Regressor.
    data: A 6-tuple consisting of context_x, context_y, target_x, target_y,
      unseen_target_y, and unseen_target_a (the targets not seen in the
      context, i.e., target_x - context_x).
    optimizer_config: A dictionary with two keys: an 'optimizer' object and
      a 'max_grad_norm' for clipping gradients.

  Returns:
    nll_term: Negative log-likelihood of model for unseen targets.
    local_kl: KL loss for latent variables of unseen targets.
    global_kl: KL loss for global latent variable.
  """
    (context_x, context_y, target_x, target_y, unseen_target_y,
     unseen_target_a) = data
    num_context = tf.shape(context_x)[1]
    with tf.GradientTape() as tape:
        prediction = model(context_x, context_y, target_x, target_y)
        unseen_predictions = prediction[:, num_context:]
        nll_term = nll(unseen_target_y, unseen_predictions, unseen_target_a)
        local_kl = tf.reduce_mean(
            tf.reduce_sum(model.losses[-1][:, num_context:], axis=[1, 2]))
        global_kl = tf.reduce_mean(tf.reduce_sum(model.losses[-2], axis=-1))
        loss = nll_term + local_kl + global_kl
    gradients = tape.gradient(loss, model.trainable_variables)
    max_grad_norm = optimizer_config['max_grad_norm']
    optimizer = optimizer_config['optimizer']
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm)
    optimizer.apply_gradients(zip(clipped_gradients,
                                  model.trainable_variables))
    return nll_term, local_kl, global_kl
Example #14
    def _add_train_op(self):
        """Sets self._train_op, the op to run for training."""
        # Take gradients of the trainable variables w.r.t. the loss function to minimize
        loss_to_minimize = self._total_loss if self._hps.coverage else self._loss
        tvars = tf.trainable_variables()
        gradients = tf.gradients(
            loss_to_minimize,
            tvars,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

        # Clip the gradients
        with tf.device("/gpu:0"):
            grads, global_norm = tf.clip_by_global_norm(
                gradients, self._hps.max_grad_norm)

        # Add a summary
        tf.summary.scalar('global_norm', global_norm)

        # Apply adagrad optimizer
        optimizer = tf.train.AdagradOptimizer(
            self._hps.lr, initial_accumulator_value=self._hps.adagrad_init_acc)
        with tf.device("/gpu:0"):
            self._train_op = optimizer.apply_gradients(
                zip(grads, tvars),
                global_step=self.global_step,
                name='train_step')
Example #15
    def build_optimizer(self):
        # Use gradient clipping
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.optimizer = optimizer.apply_gradients(zip(grads, tvars))
Example #16
    def _build_train(self):
        """Build training ops."""
        print('-' * 80)
        print('Building train graph')
        reg_loss, loss = self._forward(self.x_train,
                                       self.y_train,
                                       self.train_params,
                                       self.batch_init_states,
                                       is_training=True)

        tf_vars = [
            v for v in tf.trainable_variables() if v.name.startswith(self.name)
        ]
        global_step = tf.train.get_or_create_global_step()
        lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) /
                    tf.cast(self.params.bptt_steps, dtype=tf.float32))
        learning_rate = utils.get_lr(global_step, self.params) * lr_scale
        grads = tf.gradients(reg_loss, tf_vars)
        if self.params.grad_bound:
            clipped_grads, grad_norm = tf.clip_by_global_norm(
                grads, self.params.grad_bound)
        else:
            # without a bound, pass gradients through but still report the norm
            clipped_grads, grad_norm = grads, tf.global_norm(grads)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars),
                                             global_step=global_step)

        self.train_loss = loss
        self.train_op = train_op
        self.grad_norm = grad_norm
        self.learning_rate = learning_rate
Example #17
    def build_train(self, initial_lr):
        """
        """
        #count_number_trainable_params(verbose=True) # TODO remove

        # Decay learning rate by manually incrementing decay_step
        decay_step = tf.Variable(0.0, name='decay_step', trainable=False)
        learning_rate = tf.train.exponential_decay(initial_lr,
                                                   decay_step,
                                                   1,
                                                   0.8,
                                                   staircase=True,
                                                   name="learning_rate")

        trainable_variables = tf.trainable_variables()

        optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9)
        # clip gradients
        grads = tf.gradients(self.loss, trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads,
                                          1.0,
                                          use_norm=tf.global_norm(grads))

        train_op = optimizer.apply_gradients(zip(grads, trainable_variables))

        self.decay_step = decay_step
        self.learning_rate = learning_rate
        self.train_op = train_op
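Because decay_step is incremented manually here, the schedule is easy to trace in plain Python: with decay_steps=1, decay_rate=0.8 and staircase=True, tf.train.exponential_decay multiplies the rate by 0.8 each time decay_step is bumped. A sketch with a hypothetical initial rate:

initial_lr = 0.001  # hypothetical; the example takes initial_lr as an argument
for decay_step in range(5):
    # exponential_decay(initial_lr, decay_step, 1, 0.8, staircase=True)
    # reduces to initial_lr * 0.8 ** decay_step when decay_steps == 1.
    print(decay_step, initial_lr * 0.8 ** decay_step)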
Example #18
def eager_train_step(detection_model,
                     features,
                     labels,
                     unpad_groundtruth_tensors,
                     optimizer,
                     learning_rate,
                     add_regularization_loss=True,
                     clip_gradients_value=None,
                     global_step=None,
                     num_replicas=1.0):
    is_training = True
    detection_model._is_training = is_training
    tf.keras.backend.set_learning_phase(is_training)
    labels = model_lib.unstack_batch(
        labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
    with tf.GradientTape() as tape:
        losses_dict, _ = _compute_losses_and_predictions_dicts(
            detection_model, features, labels, add_regularization_loss)
        total_loss = losses_dict["Loss/total_loss"]
        total_loss = tf.math.divide(
            total_loss, tf.constant(num_replicas, dtype=tf.float32))
        losses_dict["Loss/normalized_total_loss"] = total_loss
    for loss_type in losses_dict:
        tf.compat.v2.summary.scalar(loss_type,
                                    losses_dict[loss_type],
                                    step=global_step)
    trainable_variables = detection_model.trainable_variables
    gradients = tape.gradient(total_loss, trainable_variables)
    if clip_gradients_value:
        gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients_value)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return total_loss
Example #19
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps,
                     use_tpu):
    """Creates an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # Normally the global step update is done inside of `apply_gradients`.
    # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
    # a different optimizer, you should probably take this line out.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
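The warmup-then-decay schedule built above can be traced in plain Python. A small sketch with hypothetical numbers (my own, mirroring the comments in the example: linear warmup to init_lr over num_warmup_steps, then linear decay to zero, matching power=1.0):

def bert_lr(step, init_lr=5e-5, num_train_steps=1000, num_warmup_steps=100):
    if step < num_warmup_steps:
        # Linear warmup: global_step / num_warmup_steps * init_lr.
        return init_lr * step / num_warmup_steps
    # polynomial_decay with power=1.0 and end_learning_rate=0.0.
    return init_lr * max(0.0, 1.0 - step / num_train_steps)

for s in (0, 50, 99, 100, 500, 1000):
    print(s, bert_lr(s))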
Example #20
def clip_gradients(gvs, value_clip=0, norm_clip=0):
    """Clips gradients."""

    grads, vs = zip(*gvs)
    grads = list(grads)

    if value_clip > 0:
        for i, g in enumerate(grads):
            if g is not None:
                grads[i] = tf.clip_by_value(g, -value_clip, value_clip)

    if norm_clip > 0:
        n_params = sum(np.prod(g.shape) for g in grads if g is not None)
        # n_params is most likely tf.Dimension and cannot be converted
        # to float directly
        norm_clip *= np.sqrt(float(int(n_params)))

        grads_to_clip = [(i, g) for i, g in enumerate(grads) if g is not None]
        idx, grads_to_clip = zip(*grads_to_clip)
        clipped_grads = tf.clip_by_global_norm(grads_to_clip, norm_clip)[0]

        for i, g in zip(idx, clipped_grads):
            grads[i] = g

    return list(zip(grads, vs))
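The function above combines two different clipping modes, and the difference is worth spelling out. A small NumPy illustration (mine, not from the example): elementwise value clipping bounds each entry independently and can change the gradient's direction, while global-norm clipping applies one shared scale factor and preserves direction.

import numpy as np

g = np.array([0.1, 5.0, -3.0])

# Value clipping: each entry is clamped to [-1, 1]; relative magnitudes change.
value_clipped = np.clip(g, -1.0, 1.0)

# Global-norm clipping to 1.0: one shared scale factor; the direction is kept.
norm = np.linalg.norm(g)
norm_clipped = g * (1.0 / max(norm, 1.0))

print(value_clipped)  # [ 0.1  1.  -1. ]
print(norm_clipped)   # g / ||g||, since ||g|| > 1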
Example #21
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Applying gradients and tune hyperparams with YellowFin.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name:  Optional name for the returned operation. Default to the
        name passed to the Optimizer constructor.

    Returns:
      A grouped operation consisting of:
        the momentum-based variable update ops,
        the YellowFin measurement ops (curvature, variance, distance),
        the SingleStep and lr/mu tuning ops,
        and the step increment op.
    """
        self._grad, self._vars = zip(*[(g, t) for g, t in grads_and_vars
                                       if g is not None])

        # Var update with Momentum.
        with tf.variable_scope("apply_updates"):
            # Gradient Clipping?
            if self._clip_thresh_var is not None:
                self._grad, _ = tf.clip_by_global_norm(self._grad,
                                                       self._clip_thresh_var)

                apply_grad_op = self._momentum_optimizer.apply_gradients(
                    zip(self._grad, self._vars),
                    global_step=global_step,
                    name=name)
            else:
                apply_grad_op = self._momentum_optimizer.apply_gradients(
                    zip(self._grad, self._vars),
                    global_step=global_step,
                    name=name)

        # Begin lr and mu tuning.
        with tf.variable_scope("prepare_yellowFin_variables"):
            # The dependencies ideally only need to come after clipping is
            # done, i.e. depend on self._grad. However, control_dependencies
            # does not support indexed slices for sparse gradients.
            # The alternative dependency here might be slightly slower due
            # to less parallelization.
            with tf.control_dependencies([
                    apply_grad_op,
            ]):
                prepare_variables_op = self._prepare_variables()

        with tf.variable_scope("yellowfin"):
            with tf.control_dependencies([prepare_variables_op]):
                yellowfin_op = self._yellowfin()

        # Update YellowFin step variable.
        with tf.control_dependencies([yellowfin_op]):
            self._increment_step_op = tf.assign_add(self._step, 1).op

        return tf.group(apply_grad_op, prepare_variables_op, yellowfin_op,
                        self._increment_step_op)
Example #22
    def _update_actor(self, obs, mask):
        """Updates parameters of critic given samples from the batch.

    Args:
       obs: A tfe.Variable with a batch of observations.
       mask: A tfe.Variable with a batch of masks.
    """
        with tf.GradientTape() as tape:
            if self.use_td3:
                q_pred, _ = self.critic(obs, self.actor(obs))
            else:
                q_pred = self.critic(obs, self.actor(obs))
            if self.use_absorbing_state:
                # Don't update the actor for absorbing states.
                # And skip update if all states are absorbing.
                a_mask = 1.0 - tf.maximum(0, -mask)
                if tf.reduce_sum(a_mask) < 1e-8:
                    return
                actor_loss = -tf.reduce_sum(
                    q_pred * a_mask) / tf.reduce_sum(a_mask)
            else:
                actor_loss = -tf.reduce_mean(q_pred)

        grads = tape.gradient(actor_loss, self.actor.variables)
        # Clipping makes training more stable.
        grads, _ = tf.clip_by_global_norm(grads, 40.0)
        self.actor_optimizer.apply_gradients(zip(grads, self.actor.variables),
                                             global_step=self.actor_step)

        with contrib_summary.record_summaries_every_n_global_steps(
                100, self.actor_step):
            contrib_summary.scalar('actor/loss',
                                   actor_loss,
                                   step=self.actor_step)
Example #23
def ProcessGradients(grads_and_vars,
                     global_gradient_clip=0.0,
                     sanitize_gradients=False,
                     normalize_gradients=False):
    tf.logging.info("Prcessing gradients")
    grads, vars_ = list(zip(*grads_and_vars))
    if sanitize_gradients:
        new_grads = []
        for g in grads:
            if g is not None:
                g = tf.where(tf.is_finite(g), g, tf.zeros_like(g))
            new_grads.append(g)
        grads = new_grads
    if normalize_gradients:
        new_grads = []
        for g in grads:
            if g is not None:
                g *= tf.rsqrt(tf.maximum(1e-12, tf.reduce_sum(tf.square(g))))
            new_grads.append(g)
        grads = new_grads
    if global_gradient_clip > 0:
        grads, grad_norm = tf.clip_by_global_norm(grads, global_gradient_clip)
        grads_and_vars = list(zip(grads, vars_))
    else:
        grad_norm = tf.global_norm(grads)
    tf.summary.scalar("global_grad_norm", grad_norm)
    return grads_and_vars
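The sanitize and normalize branches above are simple elementwise transforms. A NumPy sketch (my own, assuming dense float gradients) shows what each one does:

import numpy as np

g = np.array([1.0, np.nan, 2.0, np.inf])

# Sanitizing: replace non-finite entries with zeros, mirroring
# tf.where(tf.is_finite(g), g, tf.zeros_like(g)).
sanitized = np.where(np.isfinite(g), g, 0.0)

# Normalizing: rescale to unit L2 norm, mirroring
# g * tf.rsqrt(tf.maximum(1e-12, tf.reduce_sum(tf.square(g)))).
normalized = sanitized / np.sqrt(max(1e-12, np.sum(sanitized ** 2)))

print(sanitized)                   # [1. 0. 2. 0.]
print(np.linalg.norm(normalized))  # ~1.0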
Example #24
    def add_train_op(self, lr_method, lr, loss, clip=-1):
        """Defines self.train_op that performs an update on a batch

        Args:
            lr_method: (string) sgd method, for example "adam"
            lr: (tf.placeholder) tf.float32, learning rate
            loss: (tensor) tf.float32 loss to minimize
            clip: (python float) clipping of gradient. If < 0, no clipping

        """
        _lr_m = lr_method.lower()  # normalize case before matching

        with tf.variable_scope("train_step"):
            if _lr_m == 'adam': # sgd method
                optimizer = tf.train.AdamOptimizer(lr)
            elif _lr_m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr)
            elif _lr_m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr)
            elif _lr_m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr)
            else:
                raise NotImplementedError("Unknown method {}".format(_lr_m))

            if clip > 0: # gradient clipping if clip is positive
                grads, vs     = zip(*optimizer.compute_gradients(loss))
                grads, gnorm  = tf.clip_by_global_norm(grads, clip)
                self.train_op = optimizer.apply_gradients(zip(grads, vs))
            else:
                self.train_op = optimizer.minimize(loss)
Example #25
    def _finish(self, caches):
        """ """

        if self.clip > 0:
            S_t = [cache['s_t'] for cache in caches]
            S_t, _ = tf.clip_by_global_norm(S_t, self.clip)
            for cache, s_t in zip(caches, S_t):
                cache['s_t'] = s_t

        for cache in caches:
            x_tm1 = cache['x_tm1']
            s_t = cache['s_t']
            updates = cache['updates']
            with tf.name_scope('update_' + x_tm1.op.name), tf.device(
                    x_tm1.device):
                if 'idxs' in cache:
                    idxs = cache['idxs']
                    x_t = tf.scatter_sub(x_tm1, idxs, s_t)
                    if self.chi > 0:
                        x_t_ = tf.gather(x_t, idxs)
                        x_bar_t, t_x_bar = self._sparse_moving_average(
                            x_tm1, idxs, x_t_, 'x', beta=self.chi)
                else:
                    x_t = tf.assign_sub(x_tm1, s_t)
                    if self.chi > 0:
                        x_bar_t, t_x_bar = self._dense_moving_average(
                            x_tm1, x_t, 'x', beta=self.chi)
            updates.append(x_t)
            if self.chi > 0:
                updates.extend([x_bar_t, t_x_bar])

        update_ops = [tf.group(*cache['updates']) for cache in caches]
        return tf.group(*update_ops, name='update')
Example #26
    def apply_gradients(self, grads_and_vars, *args, **kwargs):
        if self._clip_norm == np.inf:
            return self._opt.apply_gradients(grads_and_vars, *args, **kwargs)
        grads, vars_ = list(zip(*grads_and_vars))
        clipped_grads, _ = tf.clip_by_global_norm(grads, self._clip_norm)
        return self._opt.apply_gradients(zip(clipped_grads, vars_), *args,
                                         **kwargs)
Example #27
def create_optimizer(
    loss,
    learning_rate,
    num_train_steps,
    weight_decay_rate=0.0,
    use_tpu=False,
    warmup_steps=0,
    warmup_proportion=0,
    lr_decay_power=1.0,
    layerwise_lr_decay_power=-1,
    n_transformer_layers=None,
    decoder_layers=None,
):
    """Creates an optimizer and training op."""
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=lr_decay_power,
        cycle=False,
    )
    warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
    learning_rate *= tf.minimum(
        1.0,
        tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32),
    )
    cp_learning_rate = learning_rate

    if layerwise_lr_decay_power > 0:
        learning_rate = _get_layer_lrs(
            learning_rate,
            layerwise_lr_decay_power,
            n_transformer_layers,
            decoder_layers,
        )
        learning_rate['embedding_shared_weights/'] = cp_learning_rate
        learning_rate['decoder_stack/layer_normalization/'] = cp_learning_rate
        print(learning_rate)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
    )
    if use_tpu:
        optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
Example #28
    def step_fn(self, params, model):
        """A single step for supervised learning."""
        images, labels = tf.raw_ops.InfeedDequeueTuple(
            dtypes=params.train_dtypes, shapes=params.train_shapes)

        if labels.dtype == tf.int32:
            labels = tf.one_hot(labels,
                                depth=params.num_classes,
                                dtype=tf.float32)
        global_step = tf.train.get_or_create_global_step()

        train_batch_size = tf.cast(params.train_batch_size, tf.float32)
        num_replicas = tf.cast(params.num_replicas, tf.float32)

        with tf.variable_scope(MODEL_SCOPE):
            logits = model(images, training=True)

        cross_entropy = tf.losses.softmax_cross_entropy(
            onehot_labels=labels,
            logits=logits,
            label_smoothing=params.label_smoothing,
            reduction=tf.losses.Reduction.SUM) / train_batch_size

        l2_reg_rate = tf.cast(params.weight_decay / params.num_replicas,
                              tf.float32)
        weight_dec = common_utils.get_l2_loss()
        total_loss = cross_entropy + weight_dec * l2_reg_rate

        variables = tf.trainable_variables()
        gradients = tf.gradients(total_loss, variables)
        gradients = [tf.tpu.cross_replica_sum(g) for g in gradients]
        gradients, grad_norm = tf.clip_by_global_norm(gradients,
                                                      params.grad_bound)

        learning_rate, optimizer = common_utils.get_optimizer(params)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.cond(
            tf.math.is_finite(grad_norm), lambda: optimizer.apply_gradients(
                zip(gradients, variables), global_step=global_step), tf.no_op)
        with tf.control_dependencies(update_ops + [train_op]):
            ema_train_op = common_utils.setup_ema(
                params, f'{MODEL_SCOPE}/{model.name}')

        with tf.control_dependencies([ema_train_op]):
            logs = collections.OrderedDict()
            logs['global_step'] = tf.cast(global_step, tf.float32)
            logs['loss/total'] = total_loss
            logs['loss/weight_decay'] = weight_dec / num_replicas
            logs['loss/cross_entropy'] = cross_entropy
            logs['loss/lr'] = tf.identity(learning_rate) / num_replicas
            logs['loss/grad_norm'] = grad_norm / num_replicas

            tensors = [tf.expand_dims(t, axis=0) for t in logs.values()]
            self.step_info = {k: [tf.float32, [1]] for k in logs.keys()}
            outfeed_enqueue_op = tf.cond(
                common_utils.should_log(params),
                lambda: tf.raw_ops.OutfeedEnqueueTuple(inputs=tensors),
                tf.no_op)
        return outfeed_enqueue_op
Example #29
    def __init__(self):
        # placeholder
        self.sph_user = tf.sparse_placeholder(tf.int32, name='sph_user')
        self.sph_doc = tf.sparse_placeholder(tf.int32, name='sph_doc')
        self.sph_con = tf.sparse_placeholder(tf.int32, name='sph_con')
        self.ph_reward = tf.placeholder(tf.float32, name='ph_reward')
        self.ph_nq = tf.placeholder(
            tf.float32,
            shape=[pd['batch_size'], pd['rnn_max_len']],
            name='ph_nq')
        # main networks
        self.dst_embed, self.mq = self.build_net('main')
        # target networks
        _, self.tq = self.build_net('target')
        diff = tf.reshape(self.ph_reward, [-1]) + tf.scalar_mul(
            tf.constant(pd['gamma']), tf.reshape(
                self.ph_nq, [-1])) - tf.reshape(self.mq, [-1])
        self.loss = tf.reduce_mean(tf.square(diff))
        self.a_grads = tf.clip_by_global_norm(
            tf.gradients(self.mq, self.dst_embed), pd['grad_clip'])[0]
        vs = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope='main/value')
        vs.extend(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope='main/feat_embedding'))
        self.grads = tf.clip_by_global_norm(tf.gradients(self.loss, vs),
                                            pd['grad_clip'])[0]
        with tf.variable_scope('train_value'):
            optimizer = tf.train.AdamOptimizer(pd['lr'])
            self.opt = optimizer.apply_gradients(zip(self.grads, vs))
        self.m_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope="main/value")
        self.m_params.extend(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope='main/feat_embedding'))
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope="target/value")
        self.t_params.extend(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope='target/feat_embedding'))
        alpha = pd['double_networks_sync_step']
        self.sync_op = [
            tf.assign(t, (1.0 - alpha) * t + alpha * m)
            for t, m in zip(self.t_params, self.m_params)
        ]
        self.total_loss, self.batch_counter = 0.0, 0
Example #30
    def minimize_with_clipping(optimizer, loss):
      grads_and_vars = optimizer.compute_gradients(loss)
      if max_global_gradient_norm is not None:
        grads, variables = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, max_global_gradient_norm)
        grads_and_vars = list(zip(grads, variables))

      return optimizer.apply_gradients(grads_and_vars)
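Most of the examples above follow the same TF1 recipe: compute gradients, tf.clip_by_global_norm, apply_gradients. For completeness, here is a short TF2 sketch (assuming TF >= 2.4, where Keras optimizers accept a global_clipnorm argument) that folds the clipping into the optimizer itself:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
# global_clipnorm tells the optimizer to clip the global norm of all
# gradients before applying them, replacing the manual clipping step.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, global_clipnorm=1.0)

x = tf.random.normal([8, 4])
y = tf.random.normal([8, 1])
with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))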