def build(self):
    """Build the trainer by assembling the necessary components."""
    super().build()
    self.optimizer = Optimizer()(model=self.model)
    if hasattr(self.model, 'add_loss'):
        loss_cls = Loss()()
        self.model.add_loss(loss_cls)
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    self.metric_name = self.config.metric.type
    # Some trainers use a train batch size that differs from the valid batch size,
    # so train metrics are left unset here.
    self.train_metrics = None
    self.valid_metrics = self._init_metrics()
def model_fn(self, features, labels, mode):
    """Define cars model_fn used by TensorFlow Estimator."""
    logging.info('Cars model function action')
    self.trainer.loss = Loss()()
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.compat.v1.train.get_global_step()
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.trainer.train_loader), tf.float32)
        self.trainer.optimizer = Optimizer()(
            distributed=self.trainer.distributed)
        self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
        self.trainer.lr_scheduler.step(epoch)
        self.trainer.model.training = True
        alphas = tf.convert_to_tensor(self.alphas)
        for j in range(self.alg_policy.num_individual_per_iter):
            i = np.random.randint(0, self.alg_policy.num_individual, 1)[0]
            if self.epoch < self.alg_policy.warmup:
                alpha = tf.convert_to_tensor(
                    self.search_alg.random_sample_path())
            else:
                alpha = alphas[i]
            logits = self.trainer.model(features, alpha=alpha)
            logits = tf.cast(logits, tf.float32)
            loss = self.trainer.loss(logits=logits, labels=labels)
            loss = self.trainer.optimizer.regularize_loss(loss)
            grads, vars = zip(
                *self.trainer.optimizer.compute_gradients(loss))
            if j == 0:
                accum_grads = [
                    tf.Variable(tf.zeros_like(grad), trainable=False)
                    for grad in grads]
            accum_grads = [
                accum_grads[k] + grads[k] for k in range(len(grads))]
            if self.epoch < self.alg_policy.warmup:
                break
        clipped_grads, _ = tf.clip_by_global_norm(
            accum_grads, self.trainer.config.grad_clip)
        minimize_op = self.trainer.optimizer.apply_gradients(
            list(zip(clipped_grads, vars)), global_step)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        alpha = tf.convert_to_tensor(self.trainer.valid_alpha)
        self.trainer.model.training = False
        logits = self.trainer.model(features, alpha=alpha)
        logits = tf.cast(logits, tf.float32)
        loss = self.trainer.loss(logits=logits, labels=labels)
        eval_metric_ops = self.trainer.valid_metrics(logits, labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)
def _build_multigpu_train_op(self, num_gpus):
    """Build a multi-GPU training op by averaging per-tower gradients."""
    with self.graph.as_default(), tf.device('/gpu:0'):
        tower_grads = []
        self.inputs = []
        self.labels = []
        opt = Optimizer()()
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as scope:
                    # tf.get_variable_scope().reuse_variables()
                    inputs = self._create_tensor(self.loss_input['inputs'])
                    labels = self._create_tensor(self.loss_input['labels'])
                    input = inputs[0]
                    model_output = self.model(input)
                    loss = Loss()()
                    loss = loss(model_output, labels)
                    # Calculate the gradients for the batch of data on this tower.
                    varlist = [x for x in tf.trainable_variables()
                               if x.name.startswith('tower_%d' % i)]
                    grads = opt.compute_gradients(loss, varlist)
                    tower_grads.append(grads)
                    if i == 0:
                        self.actor_var = TFVariables(model_output, self.sess)
                        self.input = input
                        self.logits = model_output
                        self.loss = loss
                    self.inputs.append(inputs)
                    self.labels.append(labels)
        grads = self._average_gradients(tower_grads)
        self.train_op = opt.apply_gradients(grads)
        self.sess.run(tf.initialize_all_variables())
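The `_average_gradients` helper called above is not shown in this listing. The sketch below is a minimal version of the conventional tower-gradient averaging it presumably performs, assuming every tower yields its `(gradient, variable)` pairs in the same order; it is not the repository's actual implementation.

def _average_gradients(self, tower_grads):
    # Sketch (assumed, not from the source): average the k-th gradient across
    # towers, as in the classic TensorFlow multi-GPU pattern. Assumes no
    # gradient entry is None.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars holds the k-th (grad, var) pair from every tower.
        grads = tf.stack([g for g, _ in grad_and_vars], axis=0)
        mean_grad = tf.reduce_mean(grads, axis=0)
        # Pair the averaged gradient with the first tower's variable; whether
        # towers share variables is an assumption here.
        average_grads.append((mean_grad, grad_and_vars[0][1]))
    return average_grads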
def _default_model_fn(self, features, labels, mode):
    """Define the default model_fn used by the TensorFlow Estimator.

    :param features: input features
    :type features: tensorflow tensors
    :param labels: label data
    :type labels: tensorflow tensors
    :param mode: mode of estimator
    :type mode: tf.estimator.ModeKeys
    :return: tensorflow EstimatorSpec
    :rtype: tf.estimator.EstimatorSpec
    """
    logging.info('model function action')
    self.model.training = mode == tf.estimator.ModeKeys.TRAIN
    logits = self.model(features)
    logits = tf.cast(logits, tf.float32)
    if hasattr(self.model, 'add_loss'):
        loss_cls = Loss()()
        self.model.add_loss(loss_cls)
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    loss = self.loss(logits, labels)
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.compat.v1.train.get_or_create_global_step()
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.train_loader), tf.float32)
        self.optimizer = Optimizer()(distributed=self.distributed)
        self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
        self.lr_scheduler.step(epoch)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        loss_scale = self.config.loss_scale if self.use_amp else 1
        minimize_op = self.optimizer.step(loss, loss_scale, global_step)
        train_op = tf.group(minimize_op, update_ops)
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = self.valid_metrics(logits, labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)
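For context, a model_fn such as `_default_model_fn` is ordinarily handed to `tf.estimator.Estimator`, which calls it once per mode and consumes the returned `EstimatorSpec`. The wiring below is only an illustrative sketch; the estimator construction, `model_dir`, and input functions are assumptions, not code from this repository.

# Illustrative sketch (assumed, not from the source): wiring a model_fn into an
# Estimator. The names trainer, train_input_fn and eval_input_fn are hypothetical.
estimator = tf.estimator.Estimator(
    model_fn=trainer._default_model_fn,   # bound method: (features, labels, mode)
    model_dir='/tmp/trainer_output',      # hypothetical output directory
    config=tf.estimator.RunConfig(save_summary_steps=100))
# estimator.train(input_fn=train_input_fn, max_steps=1000)
# estimator.evaluate(input_fn=eval_input_fn)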
def build(self):
    """Build the trainer by assembling the necessary components."""
    super().build()
    self.optimizer = Optimizer()(model=self.model,
                                 distributed=self.distributed)
    if hasattr(self.model, 'add_loss'):
        loss_cls = Loss()()
        self.model.add_loss(loss_cls)
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    self.lr_scheduler = LrScheduler()(self.optimizer)
    # Some trainers use a train batch size that differs from the valid batch size,
    # so train and valid metrics are initialized separately.
    self.train_metrics = self._init_metrics()
    self.valid_metrics = self._init_metrics()
    self._init_horovod_setting()
    if self.use_amp:
        self.model, self.optimizer = amp.initialize(self.model,
                                                    self.optimizer,
                                                    opt_level='O1')
def build(self):
    """Build the trainer by assembling the necessary components."""
    self._init_hps(self.hps)
    logging.debug("Trainer Config: {}".format(self.config))
    self.do_validation = self.config.with_valid
    self.use_syncbn = self.config.syncbn
    if self.use_syncbn and zeus.is_torch_backend():
        self.model = apex.parallel.convert_syncbn_model(self.model)
    self.train_loader = self._init_dataloader(mode='train')
    self.valid_loader = self._init_dataloader(mode='val')
    self.batch_num_train = self.train_loader.get_dataset_size() \
        if zeus.is_ms_backend() else len(self.train_loader)
    self.batch_num_valid = self.valid_loader.get_dataset_size() \
        if zeus.is_ms_backend() else len(self.valid_loader)
    if zeus.is_torch_backend():
        self.optimizer = Optimizer()(model=self.model,
                                     distributed=self.distributed)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.lr_scheduler = LrScheduler()(self.optimizer)
    elif zeus.is_ms_backend():
        self.optimizer = Optimizer()(model=self.model)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.metric_name = self.config.metric().type
    # Some trainers use a train batch size that differs from the valid batch size.
    self.train_metrics = (self._init_metrics()
                          if zeus.is_torch_backend() else None)
    self.valid_metrics = self._init_metrics()
    self._init_horovod_setting()
    if self.use_amp and zeus.is_torch_backend():
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, opt_level='O1')
def _init_train_op(self):
    """Initialize the training op: build inputs, loss, and optimizer updates."""
    self.inputs = self._create_tensor(self.loss_input['inputs'])
    self.labels = self._create_tensor(self.loss_input['labels'])
    self.input = self.inputs[0]
    logits = self.model(self.input)
    self.logits = logits
    self.actor_var = TFVariables(logits, self.sess)
    loss = Loss()()
    self.loss = loss(logits, self.labels)
    self.optimizer = Optimizer()(distributed=self.distributed)
    grads_and_var = self.optimizer.compute_gradients(self.loss)
    grads, var = zip(*grads_and_var)
    grads_and_var = list(zip(grads, var))
    self.train_op = self.optimizer.apply_gradients(grads_and_var)
    self.sess.run(tf.initialize_all_variables())
def model_fn(self, features, labels, mode):
    """Define the DARTS model_fn used by the TensorFlow Estimator."""
    logging.info('Darts model function action')
    global_step = tf.compat.v1.train.get_global_step()
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        features, valid_features = features['train'], features['valid']
        labels, valid_labels = labels['train'], labels['valid']
        # Update the architecture parameters on the validation split.
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.trainer.train_loader), tf.float32)
        self.trainer.optimizer = Optimizer()(
            distributed=self.trainer.distributed)
        self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
        self.trainer.lr_scheduler.step(epoch)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        arch_minimize_op = self.search_alg.step(
            valid_x=valid_features,
            valid_y=valid_labels,
            lr=self.trainer.lr_scheduler.get_lr()[0])
        train_op = tf.group(arch_minimize_op, update_ops)
    self.model.training = mode == tf.estimator.ModeKeys.TRAIN
    logits = self.model(features)
    logits = tf.cast(logits, tf.float32)
    self.trainer.loss = Loss()()
    loss = self.trainer.loss(logits=logits, labels=labels)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Update the network weights after the architecture step completes.
        with tf.control_dependencies([train_op]):
            weight_ops = self.model.get_weight_ops()
            loss_scale = (self.trainer.config.loss_scale
                          if self.trainer.use_amp else 1)
            train_op = self.trainer.optimizer.step(
                loss, loss_scale, global_step, weight_ops)
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = self.trainer.valid_metrics(logits, labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)