Example #1
    def __call__(self, model=None, distributed=False):
        """Call Optimizer class.

        :param model: model, used in torch case
        :param distributed: use distributed
        :return: optimizer
        """
        params = self.map_config.get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
        optimizer = None
        try:
            if zeus.is_torch_backend():
                learnable_params = [param for param in model.parameters() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer,
                                                         named_parameters=model.named_parameters(),
                                                         compression=hvd.Compression.none)
            elif zeus.is_tf_backend():
                optimizer = dynamic_optimizer(self.optim_cls, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                        NPUDistributedOptimizer(optimizer)
            elif zeus.is_ms_backend():
                learnable_params = [param for param in model.trainable_params() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
            raise ex
Example #2
def cnn_model_fn(features, labels, mode, params):
    """Model function for CNN."""
    # Input layer: MNIST images are 28x28 pixels with one color channel;
    # create_model builds the network (including any reshaping to a 4-D
    # [batch_size, width, height, channels] tensor) from the raw features.
    logits = create_model(features, mode)
    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        distributed_optimizer = NPUDistributedOptimizer(optimizer)
        train_op = distributed_optimizer.minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy":
        tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metric_ops)
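A usage note: the model_fn above plugs into a standard TF 1.x Estimator. Below is a minimal sketch, assuming `create_model` and `FLAGS.learning_rate` are defined elsewhere in the project; the input pipeline, model_dir path, batch size, and step count are illustrative only, and the NPUDistributedOptimizer inside the model_fn additionally requires the npu_bridge runtime on an Ascend device.

import tensorflow as tf

def train_input_fn():
    # Hypothetical MNIST input pipeline: 28x28 grayscale images, integer labels.
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    dataset = tf.data.Dataset.from_tensor_slices(
        (x_train.astype("float32") / 255.0, y_train.astype("int32")))
    return dataset.shuffle(1024).batch(32).repeat()

estimator = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                   model_dir="/tmp/mnist_model",  # illustrative path
                                   params={})
estimator.train(input_fn=train_input_fn, steps=1000)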
Example #3
    def __call__(self,
                 model=None,
                 lr_scheduler=None,
                 epoch=None,
                 distributed=False):
        """Call Optimizer class.

        :param model: model, used in torch case
        :param lr_scheduler: learning rate scheduler, used in tf case
        :param epoch: epoch of training, used in tf case
        :param distributed: use distributed
        :return: optimizer
        """
        params = obj2config(self.config).get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(
            self.optim_cls.__name__, params))
        optimizer = None
        try:
            if vega.is_torch_backend():
                learnable_params = [
                    param for param in model.parameters()
                    if param.requires_grad
                ]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(
                        optimizer,
                        named_parameters=model.named_parameters(),
                        compression=hvd.Compression.none)
            elif vega.is_tf_backend():
                lr_scheduler.step(epoch)
                params['learning_rate'] = lr_scheduler.get_lr()[0]
                optimizer = self.optim_cls(**params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer) if vega.is_gpu_device() else \
                        NPUDistributedOptimizer(optimizer)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(
                self.optim_cls.__name__, params))
            raise ex
Example #4
def npu_tf_optimizer(opt):
    """Set NPU Tensorflow optimizer"""
    npu_opt = NPUDistributedOptimizer(opt)
    return npu_opt
Example #5
def npu_tf_optimizer(opt):
    """Wrap a TensorFlow optimizer with NPUDistributedOptimizer."""
    npu_opt = NPUDistributedOptimizer(opt)
    return npu_opt
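A usage note: `npu_tf_optimizer` is a thin wrapper, so it drops into an ordinary TF 1.x training graph. Below is a minimal sketch, assuming the npu_bridge runtime (and therefore NPUDistributedOptimizer) is available; the toy loss and hyperparameters are illustrative only.

import tensorflow as tf

# Toy scalar loss so the snippet is self-contained; a real model's loss goes here.
w = tf.Variable(1.0)
loss = tf.square(w - 3.0)

base_opt = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
train_op = npu_tf_optimizer(base_opt).minimize(
    loss, global_step=tf.train.get_or_create_global_step())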
Example #6
  def model_func(self, images, labels, is_training=True, train_steps=None):
    model_inference_func = self.get_model_func()
    with tf.name_scope('resnet'):
        with tf.device('/gpu:0'):
          labels = tf.reshape(labels, (-1,))
          image = tf.cast(images, self.params['dtype'])

          if self.params['data_format'] == 'channels_first':
            image = tf.transpose(image, [0, 3, 1, 2])

          logits = model_inference_func(image, self.params['data_format'], training=is_training,
                                        conv_initializer=tf.variance_scaling_initializer(
                                            scale=1.0, mode='fan_in', distribution='uniform', seed=1),
                                        bn_init_mode='conv_bn_init', bn_gamma_initial_value=1.4)

          logits = tf.cast(logits, tf.float32)
          one_hot_labels = tf.one_hot(labels, self.params['num_classes'])
          base_loss = tf.losses.softmax_cross_entropy(one_hot_labels, logits=logits, label_smoothing=0.1)

          predicted_label = tf.math.argmax(logits, 1, output_type=tf.int32)

          # Eval branch
          if not is_training:
              return tf.no_op(name='eval_op'), predicted_label, base_loss, base_loss, train_steps, labels
  
          def exclude_batch_norm(name):
            return all(s not in name for s in ('BatchNorm', 'batchnorm', 'batch_norm', 'Batch_Norm'))

          if self.params['use_lars']:
              # With LARS, weight decay is applied per-layer inside LARSV2 below,
              # so no explicit L2 term is added here.
              total_loss = base_loss
          else:
              l2_loss = self.params['weight_decay'] * tf.add_n(
                  [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()
                   if exclude_batch_norm(v.name)])
              total_loss = base_loss + l2_loss

          lr = learning_rate.get_lr(self.params, train_steps)
          opt = tf.train.MomentumOptimizer(lr, self.params['momentum'])
          opt = NPUDistributedOptimizer(opt)
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []

          with tf.control_dependencies(update_ops):
            gate_gradients = tf.train.Optimizer.GATE_NONE
            scaled_grads = opt.compute_gradients(total_loss * 512.0, gate_gradients=gate_gradients)
            unscaled_grads = [(g / 512.0, v) for g, v in scaled_grads]

            if self.params['use_lars']:
                # LARS is applied to conv/FC weights only; BatchNorm and bias
                # variables keep their plain (unscaled) gradients.
                g_list_bn_bias = []
                var_list_bn_bias = []
                g_list_else = []
                var_list_else = []
                g_list_else_lars = []
                for g, var in unscaled_grads:
                    if 'BatchNorm' not in var.name and 'bias' not in var.name:
                        g_list_else.append(g)
                        var_list_else.append(var)

                        g_new = npu_ops.LARSV2(input_weight=var,
                                               input_grad=g,
                                               weight_decay=self.params['weight_decay'],
                                               learning_rate=1.0, use_clip=False)
                        g_list_else_lars.append(g_new)
                    else:
                        g_list_bn_bias.append(g)
                        var_list_bn_bias.append(var)

                g_list_lars = g_list_bn_bias + g_list_else_lars
                var_list = var_list_bn_bias + var_list_else
                grad_var_list = list(zip(g_list_lars, var_list))

                train_op = opt.apply_gradients(grad_var_list)
            else:
                train_op = opt.apply_gradients(unscaled_grads)

    return train_op, predicted_label, base_loss, lr, train_steps, labels
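A note on the gradient handling above: the loss is multiplied by a fixed factor of 512 before compute_gradients and the gradients are divided by 512 before apply_gradients, a static loss-scaling pattern that keeps small float16 gradients from underflowing. The pattern in isolation, as a minimal sketch assuming an existing optimizer `opt` and scalar loss `total_loss`:

LOSS_SCALE = 512.0  # same constant the example hard-codes

scaled_grads = opt.compute_gradients(total_loss * LOSS_SCALE)
unscaled_grads = [(g / LOSS_SCALE, v) for g, v in scaled_grads
                  if g is not None]  # skip variables that receive no gradient
train_op = opt.apply_gradients(unscaled_grads)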
Example #7
    def get_estimator_model_func(self, features, labels, mode, params=None):
        labels = tf.reshape(labels, (-1, ))

        inputs = features
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        inputs = tf.cast(inputs, self.args.dtype)

        if is_training:
            if self.args.network == "inception_v1":
                with slim.arg_scope(
                        inception_v1.inception_v1_arg_scope(
                            weight_decay=self.args.weight_decay)):
                    top_layer, end_points = inception_v1.inception_v1(
                        inputs=features,
                        num_classes=2,
                        dropout_keep_prob=0.7,
                        is_training=True)
            if self.args.network == "inception_v4":
                with slim.arg_scope(
                        inception_v4.inception_v4_arg_scope(
                            weight_decay=self.args.weight_decay)):
                    top_layer, end_points = inception_v4.inception_v4(
                        inputs=features,
                        num_classes=2,
                        dropout_keep_prob=0.8,
                        is_training=True)
        else:
            if self.args.network == "inception_v1":
                with slim.arg_scope(inception_v1.inception_v1_arg_scope()):
                    top_layer, end_points = inception_v1.inception_v1(
                        inputs=features,
                        num_classes=2,
                        dropout_keep_prob=1.0,
                        is_training=False)
            if self.args.network == "inception_v4":
                with slim.arg_scope(inception_v4.inception_v4_arg_scope()):
                    top_layer, end_points = inception_v4.inception_v4(
                        inputs=features,
                        num_classes=2,
                        dropout_keep_prob=1.0,
                        is_training=False)

        logits = top_layer
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        logits = tf.cast(logits, tf.float32)

        labels_one_hot = tf.one_hot(labels, depth=2)

        loss = tf.losses.softmax_cross_entropy(
            logits=logits,
            onehot_labels=labels_one_hot,
            label_smoothing=self.args.label_smoothing)

        base_loss = tf.identity(loss, name='loss')

        l2_loss = tf.add_n([
            tf.nn.l2_loss(tf.cast(v, tf.float32))
            for v in tf.trainable_variables()
        ])
        l2_loss = tf.multiply(l2_loss, self.args.weight_decay)
        total_loss = base_loss + l2_loss

        # loss = tf.losses.softmax_cross_entropy(logits, labels_one_hot, label_smoothing=self.args.label_smoothing)
        # loss = tf.identity(loss, name='loss')
        # total_loss = tf.losses.get_total_loss(add_regularization_losses = True)

        total_loss = tf.identity(total_loss, name='total_loss')

        if mode == tf.estimator.ModeKeys.EVAL:
            with tf.device(None):
                metrics = self.layers.get_accuracy(labels, predicted_classes,
                                                   logits, self.args)

            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              eval_metric_ops=metrics)

        assert (mode == tf.estimator.ModeKeys.TRAIN)

        batch_size = tf.shape(inputs)[0]

        global_step = tf.train.get_global_step()
        learning_rate = self.hyper_param.get_learning_rate()

        momentum = self.args.momentum

        opt = tf.train.MomentumOptimizer(learning_rate,
                                         momentum,
                                         use_nesterov=self.args.use_nesterov)

        from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
        opt = NPUDistributedOptimizer(opt)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []

        with tf.control_dependencies(update_ops):
            gate_gradients = tf.train.Optimizer.GATE_NONE
            grads_and_vars = opt.compute_gradients(
                total_loss, gate_gradients=gate_gradients)
            train_op = opt.apply_gradients(grads_and_vars,
                                           global_step=global_step)

        train_op = tf.group(train_op)

        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=train_op)