def _build_training(self):
    trainable_variables = tf.trainable_variables()
    if self.run_opt.is_distrib:
        if self.scale_lr_coef > 1.:
            log.info('Scale learning rate by coef: %f', self.scale_lr_coef)
            optimizer = tf.train.AdamOptimizer(self.learning_rate * self.scale_lr_coef)
        else:
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
        optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer)
    else:
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    if self.mixed_prec is not None:
        _TF_VERSION = Version(TF_VERSION)
        # check the TF version: the mixed precision graph rewrite requires TF >= 1.14
        if _TF_VERSION < Version('1.14.0'):
            raise RuntimeError(
                "TensorFlow version %s is not compatible with the mixed precision setting. "
                "Please consider upgrading your TF version!" % TF_VERSION)
        elif _TF_VERSION < Version('2.4.0'):
            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
        else:
            optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer)
    apply_op = optimizer.minimize(loss=self.l2_l,
                                  global_step=self.global_step,
                                  var_list=trainable_variables,
                                  name='train_step')
    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)
    log.info("built training")
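# A minimal sketch (not the actual run-option code of this project) of how the
# Horovod pieces referenced above -- run_opt.is_distrib, run_opt._HVD, and
# scale_lr_coef -- are typically wired up for TF 1.x graph-mode training.
# All names below are illustrative assumptions.
import tensorflow as tf            # TF 1.x API
import horovod.tensorflow as hvd

hvd.init()
is_distrib = hvd.size() > 1
scale_lr_coef = float(hvd.size())  # assumption: scale LR linearly with worker count

base_lr = 1.0e-3
lr = base_lr * scale_lr_coef if is_distrib else base_lr
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
if is_distrib:
    # Averages gradients across all workers before each parameter update.
    optimizer = hvd.DistributedOptimizer(optimizer)

# Rank 0 broadcasts its initial variables so every worker starts from the same state.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]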
def _build_training(self):
    trainable_variables = tf.trainable_variables()
    if self.run_opt.is_distrib:
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate * self.run_opt.world_size)
        optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer)
    else:
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    apply_op = optimizer.minimize(loss=self.l2_l,
                                  global_step=self.global_step,
                                  var_list=trainable_variables,
                                  name='train_step')
    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)
    log.info("built training")
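# Design note: this variant multiplies the base learning rate by
# run_opt.world_size, i.e. the common linear-scaling heuristic for synchronous
# data-parallel training (N workers see roughly N times the effective batch).
# The first variant above exposes the same factor as a configurable
# scale_lr_coef instead of hard-coding the Horovod world size.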
def _build_training(self):
    trainable_variables = tf.trainable_variables()
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    if self.run_opt.is_distrib:
        optimizer = tf.train.SyncReplicasOptimizer(
            optimizer,
            replicas_to_aggregate=self.run_opt.cluster_spec.num_tasks("worker"),
            total_num_replicas=self.run_opt.cluster_spec.num_tasks("worker"),
            name="sync_replicas")
        self.sync_replicas_hook = optimizer.make_session_run_hook(self.run_opt.is_chief)
    grads = tf.gradients(self.l2_l, trainable_variables)
    apply_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                         global_step=self.global_step,
                                         name='train_step')
    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)
    self._message("built training")
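# A minimal usage sketch (assumed, not taken from the trainer above): the hook
# created by SyncReplicasOptimizer.make_session_run_hook must be passed to the
# training session, otherwise cross-replica gradient aggregation never runs.
# `server`, `is_chief`, `checkpoint_dir`, and `train_op` stand in for objects
# built elsewhere in a between-graph replicated training script, and
# `sync_replicas_hook` corresponds to self.sync_replicas_hook above.
with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=is_chief,
                                       checkpoint_dir=checkpoint_dir,
                                       hooks=[sync_replicas_hook]) as sess:
    while not sess.should_stop():
        sess.run(train_op)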