def __call__(self, model=None, distributed=False):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param distributed: use distributed
    :return: optimizer
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
    optimizer = None
    try:
        if zeus.is_torch_backend():
            learnable_params = [param for param in model.parameters() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=model.named_parameters(),
                    compression=hvd.Compression.none)
        elif zeus.is_tf_backend():
            optimizer = dynamic_optimizer(self.optim_cls, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                    NPUDistributedOptimizer(optimizer)
        elif zeus.is_ms_backend():
            learnable_params = [param for param in model.trainable_params() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
        return optimizer
    except Exception as ex:
        logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
        raise ex
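# --- Illustration only (not part of the snippet above) ---
# A minimal sketch of the Horovod-for-PyTorch setup that the torch branch assumes.
# The model, learning rate, and device selection are placeholders; only the
# hvd.DistributedOptimizer wrapping mirrors the call used above.
import horovod.torch as hvd
import torch

hvd.init()
torch.cuda.set_device(hvd.local_rank())

model = torch.nn.Linear(10, 2).cuda()              # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    compression=hvd.Compression.none)

# Keep all workers in sync before training starts.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)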
def cnn_model_fn(features, labels, mode, params):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    logits = create_model(features, mode)

    # PREDICT
    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        distributed_optimizer = NPUDistributedOptimizer(optimizer)
        train_op = distributed_optimizer.minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
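# --- Illustration only (not part of the snippet above) ---
# A hedged sketch of how a model_fn such as cnn_model_fn is typically wired into
# the TF1 Estimator API. `train_input_fn` and the model_dir are placeholders, and
# on Ascend hardware an NPU-specific estimator/run config is normally used instead
# of the stock tf.estimator.Estimator shown here.
import tensorflow as tf

mnist_classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir="/tmp/mnist_model")          # placeholder checkpoint directory

# train_input_fn must yield (features, labels) batches; its definition is omitted.
mnist_classifier.train(input_fn=train_input_fn, steps=1000)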
def __call__(self, model=None, lr_scheduler=None, epoch=None, distributed=False):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param lr_scheduler: learning rate scheduler, used in tf case
    :param epoch: epoch of training, used in tf case
    :param distributed: use distributed
    :return: optimizer
    """
    params = obj2config(self.config).get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(
        self.optim_cls.__name__, params))
    optimizer = None
    try:
        if vega.is_torch_backend():
            learnable_params = [
                param for param in model.parameters() if param.requires_grad
            ]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=model.named_parameters(),
                    compression=hvd.Compression.none)
        elif vega.is_tf_backend():
            lr_scheduler.step(epoch)
            params['learning_rate'] = lr_scheduler.get_lr()[0]
            optimizer = self.optim_cls(**params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer) if vega.is_gpu_device() else \
                    NPUDistributedOptimizer(optimizer)
        return optimizer
    except Exception as ex:
        logging.error("Failed to call Optimizer name={}, params={}".format(
            self.optim_cls.__name__, params))
        raise ex
def npu_tf_optimizer(opt): """Set NPU Tensorflow optimizer""" npu_opt = NPUDistributedOptimizer(opt) return npu_opt
def model_func(self, images, labels, is_training=True, train_steps=None):
    model_inference_func = self.get_model_func()
    with tf.name_scope('resnet') as name_scope:
        with tf.device('/gpu:0'):
            labels = tf.reshape(labels, (-1,))
            image = tf.cast(images, self.params['dtype'])
            if self.params['data_format'] == 'channels_first':
                image = tf.transpose(image, [0, 3, 1, 2])

            logits = model_inference_func(
                image,
                self.params['data_format'],
                training=is_training,
                conv_initializer=tf.variance_scaling_initializer(
                    scale=1.0, mode='fan_in', distribution='uniform', seed=1),
                bn_init_mode='conv_bn_init',
                bn_gamma_initial_value=1.4)
            logits = tf.cast(logits, tf.float32)

            one_hot_labels = tf.one_hot(labels, self.params['num_classes'])
            base_loss = tf.losses.softmax_cross_entropy(
                one_hot_labels, logits=logits, label_smoothing=0.1)
            predicted_label = tf.math.argmax(logits, 1, output_type=tf.int32)

            # Eval branch
            if not is_training:
                return tf.no_op(name='eval_op'), predicted_label, base_loss, base_loss, train_steps, labels

            def exclude_batch_norm(name):
                return ('BatchNorm' not in name and 'batchnorm' not in name
                        and 'batch_norm' not in name and 'Batch_Norm' not in name)

            if self.params['use_lars']:
                total_loss = base_loss
            else:
                # L2 regularization over all trainable variables except batch norm.
                l2_loss = self.params['weight_decay'] * tf.add_n(
                    [tf.nn.l2_loss(tf.cast(v, tf.float32))
                     for v in tf.trainable_variables() if exclude_batch_norm(v.name)])
                total_loss = base_loss + l2_loss

            lr = learning_rate.get_lr(self.params, train_steps)
            opt = tf.train.MomentumOptimizer(lr, self.params['momentum'])
            opt = NPUDistributedOptimizer(opt)

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
            with tf.control_dependencies(update_ops):
                gate_gradients = tf.train.Optimizer.GATE_NONE
                # Static loss scaling: scale the loss before computing gradients,
                # then unscale the gradients before applying them.
                scaled_grads = opt.compute_gradients(total_loss * 512.0)
                unscaled_grads = [(g / 512.0, v) for g, v in scaled_grads]

                if self.params['use_lars']:
                    g_list_bn_bias = []
                    var_list_bn_bias = []
                    g_list_else = []
                    var_list_else = []
                    g_list_else_lars = []
                    grad_var_list = []
                    for g, var in unscaled_grads:
                        # Apply LARS only to non-BatchNorm, non-bias variables.
                        if 'BatchNorm' not in var.name and 'bias' not in var.name:
                            g_list_else.append(g)
                            var_list_else.append(var)
                            g_new = npu_ops.LARSV2(
                                input_weight=var,
                                input_grad=g,
                                weight_decay=self.params['weight_decay'],
                                learning_rate=1.0,
                                use_clip=False)
                            g_list_else_lars.append(g_new)
                        else:
                            g_list_bn_bias.append(g)
                            var_list_bn_bias.append(var)

                    g_list_lars = g_list_bn_bias + g_list_else_lars
                    var_list = var_list_bn_bias + var_list_else
                    for (g, var) in zip(g_list_lars, var_list):
                        grad_var_list.append((g, var))

                    train_op = opt.apply_gradients(grad_var_list)
                else:
                    train_op = opt.apply_gradients(unscaled_grads)

    return train_op, predicted_label, base_loss, lr, train_steps, labels
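# --- Illustration only (not part of the snippet above) ---
# The fixed factor of 512.0 above is static loss scaling: the loss is multiplied
# before gradients are computed so small low-precision gradients do not underflow,
# and every gradient is divided by the same factor before apply_gradients. A
# standalone sketch of the same pattern, with placeholder loss/optimizer tensors:
LOSS_SCALE = 512.0
scaled_grads_and_vars = optimizer.compute_gradients(loss * LOSS_SCALE)
grads_and_vars = [(g / LOSS_SCALE, v) for g, v in scaled_grads_and_vars if g is not None]
train_op = optimizer.apply_gradients(grads_and_vars)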
def get_estimator_model_func(self, features, labels, mode, params=None):
    labels = tf.reshape(labels, (-1,))
    inputs = features
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    inputs = tf.cast(inputs, self.args.dtype)

    if is_training:
        if self.args.network == "inception_v1":
            with slim.arg_scope(
                    inception_v1.inception_v1_arg_scope(
                        weight_decay=self.args.weight_decay)):
                top_layer, end_points = inception_v1.inception_v1(
                    inputs=features, num_classes=2, dropout_keep_prob=0.7, is_training=True)
        if self.args.network == "inception_v4":
            with slim.arg_scope(
                    inception_v4.inception_v4_arg_scope(
                        weight_decay=self.args.weight_decay)):
                top_layer, end_points = inception_v4.inception_v4(
                    inputs=features, num_classes=2, dropout_keep_prob=0.8, is_training=True)
    else:
        if self.args.network == "inception_v1":
            with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
                top_layer, end_points = inception_v1.inception_v1(
                    inputs=features, num_classes=2, dropout_keep_prob=1.0, is_training=False)
        if self.args.network == "inception_v4":
            with slim.arg_scope(inception_v4.inception_v4_arg_scope()):
                top_layer, end_points = inception_v4.inception_v4(
                    inputs=features, num_classes=2, dropout_keep_prob=1.0, is_training=False)

    logits = top_layer
    predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
    logits = tf.cast(logits, tf.float32)

    labels_one_hot = tf.one_hot(labels, depth=2)
    loss = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels_one_hot,
        label_smoothing=self.args.label_smoothing)
    base_loss = tf.identity(loss, name='loss')

    l2_loss = tf.add_n([
        tf.nn.l2_loss(tf.cast(v, tf.float32))
        for v in tf.trainable_variables()
    ])
    l2_loss = tf.multiply(l2_loss, self.args.weight_decay)
    total_loss = base_loss + l2_loss

    # loss = tf.losses.softmax_cross_entropy(logits, labels_one_hot, label_smoothing=self.args.label_smoothing)
    # loss = tf.identity(loss, name='loss')
    # total_loss = tf.losses.get_total_loss(add_regularization_losses = True)

    total_loss = tf.identity(total_loss, name='total_loss')

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.device(None):
            metrics = self.layers.get_accuracy(labels, predicted_classes, logits, self.args)
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    assert (mode == tf.estimator.ModeKeys.TRAIN)

    batch_size = tf.shape(inputs)[0]
    global_step = tf.train.get_global_step()
    learning_rate = self.hyper_param.get_learning_rate()
    momentum = self.args.momentum

    opt = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=self.args.use_nesterov)
    from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
    opt = NPUDistributedOptimizer(opt)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
    with tf.control_dependencies(update_ops):
        gate_gradients = tf.train.Optimizer.GATE_NONE
        grads_and_vars = opt.compute_gradients(total_loss, gate_gradients=gate_gradients)
        train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

    train_op = tf.group(train_op)
    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)