Example #1
def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
              tower_gradvars, tower_preds, is_cpu):
  """Build computation tower for each device (CPU or GPU).

  Args:
    is_training: True if this is the training graph.
    weight_decay: weight regularization strength, a float.
    feature: a Tensor.
    label: a Tensor.
    tower_losses: a list to be appended with current tower's loss.
    tower_gradvars: a list to be appended with current tower's gradients.
    tower_preds: a list to be appended with current tower's predictions.
    is_cpu: True if the tower is built on the CPU.
  """
  data_format = 'channels_last' if is_cpu else 'channels_first'
  model = cifar10_model.ResNetCifar10(
      FLAGS.num_layers, is_training=is_training, data_format=data_format)
  logits = model.forward_pass(feature, input_data_format='channels_last')
  tower_pred = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits)
  }
  tower_preds.append(tower_pred)

  tower_loss = tf.losses.sparse_softmax_cross_entropy(
      logits=logits, labels=label)
  tower_loss = tf.reduce_mean(tower_loss)

  model_params = tf.trainable_variables()
  tower_loss += weight_decay * tf.add_n(
      [tf.nn.l2_loss(v) for v in model_params])
  tower_losses.append(tower_loss)

  tower_grad = tf.gradients(tower_loss, model_params)
  tower_gradvars.append(zip(tower_grad, model_params))
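The lists this tower function fills in (tower_losses, tower_gradvars, tower_preds) are reduced by the caller. Below is a minimal, self-contained sketch, not the original cifar10_main code, of how per-tower losses and gradient/variable pairs collected this way can be averaged into one training op; the toy dense model, the device list, and the optimizer settings are assumptions for illustration.

import tensorflow as tf  # TF 1.x API, as in the examples on this page

def _toy_tower(x, y, tower_losses, tower_gradvars):
    """Hypothetical stand-in for _tower_fn: one small linear model per device."""
    logits = tf.layers.dense(x, 10, name='fc', reuse=tf.AUTO_REUSE)
    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=y))
    params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    tower_losses.append(loss)
    tower_gradvars.append(list(zip(grads, params)))

tower_losses, tower_gradvars = [], []
x = tf.random_uniform((8, 32))
y = tf.zeros((8,), dtype=tf.int32)
for device in ['/cpu:0', '/cpu:0']:  # GPUs in the real multi-tower setup
    with tf.device(device):
        _toy_tower(x, y, tower_losses, tower_gradvars)

# Average gradients variable-by-variable across towers before applying them.
avg_gradvars = []
for pairs in zip(*tower_gradvars):
    grads = [g for g, _ in pairs]
    avg_gradvars.append((tf.reduce_mean(tf.stack(grads), axis=0), pairs[0][1]))

loss = tf.reduce_mean(tf.stack(tower_losses))
train_op = tf.train.MomentumOptimizer(0.1, 0.9).apply_gradients(avg_gradvars)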
Example #2
def _tower_fn(is_training, weight_decay, feature, label, data_format,
              num_layers, batch_norm_decay, batch_norm_epsilon, optimizer,
              gradient_scale):
    """Build computation tower (Resnet).

  Args:
    is_training: true if is training graph.
    weight_decay: weight regularization strength, a float.
    feature: a Tensor.
    label: a Tensor.
    data_format: channels_last (NHWC) or channels_first (NCHW).
    num_layers: number of layers, an int.
    batch_norm_decay: decay for batch normalization, a float.
    batch_norm_epsilon: epsilon for batch normalization, a float.

  Returns:
    A tuple with the loss for the tower, the gradients and parameters, and
    predictions.

  """
    model = cifar10_model.ResNetCifar10(num_layers,
                                        batch_norm_decay=batch_norm_decay,
                                        batch_norm_epsilon=batch_norm_epsilon,
                                        is_training=is_training,
                                        data_format=data_format)
    logits = model.forward_pass(feature, input_data_format='channels_last')
    tower_pred = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits)
    }

    tower_loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                        labels=label)
    tower_loss = tf.reduce_mean(tower_loss)

    model_params = tf.trainable_variables()
    tower_loss += weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in model_params])

    compgrad_tower = [optimizer.compute_gradients(tower_loss, model_params)]

    for g, v in compgrad_tower[0]:
        tf.summary.histogram(v.name + '_ORG', g)
        tf.logging.info('@sahiltyagi4 shape of new_grads ' + str(g.shape))
        tf.logging.info('@sahiltyagi4 shape of vars ' + str(v.shape))

    new_grads = [(grad[0] * gradient_scale) for grad in compgrad_tower[0]]

    for new_g, new_v in zip(new_grads, model_params):
        tf.summary.histogram(new_v.name + '_MODIFIED',
                             new_g)  #last_gradient = new_g

    return tower_loss, zip(new_grads, model_params), tower_pred, compgrad_tower
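For reference, the gradient_scale step above simply multiplies every gradient from optimizer.compute_gradients before it is applied. A self-contained sketch of that pattern on a toy variable (the names and the 0.5 factor are illustrative, not taken from the original code):

import tensorflow as tf  # TF 1.x API

w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
opt = tf.train.GradientDescentOptimizer(0.1)

gradvars = opt.compute_gradients(loss, [w])   # list of (gradient, variable)
scaled = [(g * 0.5, v) for g, v in gradvars]  # gradient_scale = 0.5
train_op = opt.apply_gradients(scaled)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)  # applies 0.5 * d(loss)/dw, i.e. w -= 0.1 * 0.5 * 2w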
Example #3
def _tower_fn(
    is_training,
    weight_decay,
    feature,
    label,
    data_format,
    num_layers,
    batch_norm_decay,
    batch_norm_epsilon,
):
    """Build computation tower (Resnet).

    Args:
        is_training: True if this is the training graph.
        weight_decay: weight regularization strength, a float.
        feature: a Tensor.
        label: a Tensor.
        data_format: channels_last (NHWC) or channels_first (NCHW).
        num_layers: number of layers, an int.
        batch_norm_decay: decay for batch normalization, a float.
        batch_norm_epsilon: epsilon for batch normalization, a float.

    Returns:
        A tuple with the loss for the tower, the gradients and parameters, and
        predictions.

    """
    model = cifar10_model.ResNetCifar10(
        num_layers,
        batch_norm_decay=batch_norm_decay,
        batch_norm_epsilon=batch_norm_epsilon,
        is_training=is_training,
        data_format=data_format,
    )
    logits = model.forward_pass(feature, input_data_format="channels_last")
    tower_pred = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits),
    }

    tower_loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                        labels=label)
    tower_loss = tf.reduce_mean(tower_loss)

    model_params = tf.trainable_variables()
    tower_loss += weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in model_params])

    tower_grad = tf.gradients(tower_loss, model_params)

    return tower_loss, zip(tower_grad, model_params), tower_pred
Example #4
    def __init__(self, dim):
        self.dim = dim

        # param values from cifar10_main.py
        if not tf.test.is_gpu_available():
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

        is_training = True
        weight_decay = 2e-4
        num_layers = 8
        batch_size = 32
        batch_norm_decay = 0.997
        batch_norm_epsilon = 1e-5
        image_batch = tf.random_uniform((batch_size, 32, 32, 3))
        label_batch = tf.ones((batch_size, ), dtype=tf.int32)

        self.model = cifar10_model.ResNetCifar10(
            num_layers,
            batch_norm_decay=batch_norm_decay,
            batch_norm_epsilon=batch_norm_epsilon,
            is_training=is_training,
            data_format=data_format)
        self.logits = self.model.forward_pass(
            image_batch, input_data_format='channels_last')

        # make size of parameters multiple of 8 (75360)
        dummy_var = tf.Variable(tf.ones((5, )))
        self.pred = {
            'classes': tf.argmax(input=self.logits, axis=1),
            'probabilities': tf.nn.softmax(self.logits)
        }

        self.loss = tf.losses.sparse_softmax_cross_entropy(logits=self.logits,
                                                           labels=label_batch)
        self.model_params = tf.trainable_variables()
        self.loss += weight_decay * tf.add_n(
            [tf.nn.l2_loss(v) for v in self.model_params])

        grads = tf.gradients(self.loss, self.model_params)
        self.grad = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
        self.weights = np.zeros(self.grad.shape, dtype=np.float32)

        # TODO: make this into an op that accepts actual values
        self.set_weights_op = tf.global_variables_initializer()

        # todo(y): pad things so that it's divisible by num_ps?

        self.sess = tf.Session()
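The TODO above asks for an op that accepts actual weight values. One common way to do that is sketched below, under the assumption that a flat float32 vector should be scattered back into the trainable variables; the helper name and placeholder are hypothetical and not part of the original class.

import numpy as np
import tensorflow as tf  # TF 1.x API

def build_set_weights_op(model_params):
    """Hypothetical helper: assign a flat float32 vector to `model_params`."""
    sizes = [int(np.prod(v.shape.as_list())) for v in model_params]
    flat_ph = tf.placeholder(tf.float32, shape=(sum(sizes),), name='flat_weights')
    assigns, offset = [], 0
    for v, size in zip(model_params, sizes):
        chunk = tf.reshape(flat_ph[offset:offset + size], v.shape.as_list())
        assigns.append(tf.assign(v, chunk))
        offset += size
    return flat_ph, tf.group(*assigns)

# Possible usage inside the constructor above (attribute names are assumptions):
# self.flat_ph, self.set_weights_op = build_set_weights_op(self.model_params)
# self.sess.run(self.set_weights_op, feed_dict={self.flat_ph: self.weights})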
Example #5
def main(_):
    num_train_examples = 45000
    melt.apps.train.init()

    batch_size = melt.batch_size()
    num_gpus = melt.num_gpus()

    batch_size_per_gpu = FLAGS.batch_size

    # The global batch size is unchanged, but FLAGS.batch_size becomes
    # batch_size / num_gpus.
    #print('--------------batch_size, FLAGS.batch_size, num_steps_per_epoch', batch_size, FLAGS.batch_size, num_train_examples // batch_size)

    global_scope = FLAGS.algo
    with tf.variable_scope(global_scope) as global_scope:
        data_format = 'channels_first'
        num_layers = 44
        batch_norm_decay = 0.997
        batch_norm_epsilon = 1e-05
        data_dir = './mount/data/cifar10/'
        with tf.variable_scope('main') as scope:
            model = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=True,
                data_format=data_format)

            dataset = cifar10.Cifar10DataSet(data_dir,
                                             subset='train',
                                             use_distortion=True)

            ## This is wrong: every GPU would read the same data, which slows
            ## convergence but gives a slightly better test result.
            #_, image_batch, label_batch = dataset.make_batch(FLAGS.batch_size)
            def loss_function():
                # With this, 2 GPUs give results similar to 1 GPU: validation is
                # a bit better and test a bit worse, probably due to randomness.
                _, image_batch, label_batch = dataset.make_batch(
                    batch_size_per_gpu)
                return tower_loss(model, image_batch, label_batch)

            #loss_function = lambda: tower_loss(model, image_batch, label_batch)
            loss = melt.tower_losses(loss_function, num_gpus)
            pred = model.predict()
            pred = pred['classes']
            label_batch = dataset.label_batch
            acc = tf.reduce_mean(tf.to_float(tf.equal(pred, label_batch)))

            #tf.summary.image('train/image', dataset.image_batch)
            # # Compute confusion matrix
            # matrix = tf.confusion_matrix(label_batch, pred, num_classes=10)
            # # Get a image tensor for summary usage
            # image_tensor = draw_confusion_matrix(matrix)
            # tf.summary.image('train/confusion_matrix', image_tensor)

            scope.reuse_variables()
            ops = [loss, acc]

            # TODO multiple gpu validation and inference

            validator = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=False,
                data_format=data_format)

            valid_dataset = cifar10.Cifar10DataSet(data_dir,
                                                   subset='valid',
                                                   use_distortion=False)
            valid_iterator = valid_dataset.make_batch(batch_size)
            valid_id_batch, valid_image_batch, valid_label_batch = valid_iterator.get_next(
            )

            valid_loss = tower_loss(validator, valid_image_batch,
                                    valid_label_batch)
            valid_pred = validator.predict()
            valid_pred = valid_pred['classes']

            ## does not seem to work in non-repeat mode.
            #tf.summary.image('valid/image', valid_image_batch)
            ## Compute confusion matrix
            #matrix = tf.confusion_matrix(valid_label_batch, valid_pred, num_classes=10)
            ## Get a image tensor for summary usage
            #image_tensor = draw_confusion_matrix(matrix)
            #tf.summary.image('valid/confusion_matrix', image_tensor)

            #loss_function = lambda: tower_loss(validator, val_image_batch, val_label_batch)
            #val_loss = melt.tower_losses(loss_function, FLAGS.num_gpus, is_training=False)
            #eval_ops = [val_loss]

            metric_eval_fn = lambda model_path=None: \
                                evaluator.evaluate([valid_id_batch, valid_loss, valid_pred, valid_label_batch, valid_image_batch],
                                                   valid_iterator,
                                                   model_path=model_path)

            predictor = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=False,
                data_format=data_format)

            predictor.init_predict()

            test_dataset = cifar10.Cifar10DataSet(data_dir,
                                                  subset='test',
                                                  use_distortion=False)
            test_iterator = test_dataset.make_batch(batch_size)
            test_id_batch, test_image_batch, test_label_batch = test_iterator.get_next(
            )

            test_pred = predictor.predict(test_image_batch,
                                          input_data_format='channels_last')
            test_pred = test_pred['classes']

            inference_fn = lambda model_path=None: \
                                evaluator.inference([test_id_batch, test_pred],
                                                    test_iterator,
                                                    model_path=model_path)

            global eval_names
            names = ['loss', 'acc']

        melt.apps.train_flow(ops,
                             names=names,
                             metric_eval_fn=metric_eval_fn,
                             inference_fn=inference_fn,
                             model_dir=FLAGS.model_dir,
                             num_steps_per_epoch=num_train_examples //
                             batch_size)
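This example (and Example #7 below) relies on a tower_loss helper that is not shown on this page. A minimal sketch of what such a helper might look like, assuming only that the model exposes forward_pass as in the other examples here:

import tensorflow as tf  # TF 1.x API

def tower_loss(model, image_batch, label_batch):
    """Hypothetical helper: mean cross-entropy loss for one tower of `model`."""
    logits = model.forward_pass(image_batch, input_data_format='channels_last')
    loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                  labels=label_batch)
    return tf.reduce_mean(loss)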
Example #6
def _tower_fn(is_training, weight_decay, feature, label, data_format,
              num_layers, batch_norm_decay, batch_norm_epsilon, optimizer,
              gradient_scale, w_name):
    """Build computation tower (Resnet).

  Args:
    is_training: true if is training graph.
    weight_decay: weight regularization strength, a float.
    feature: a Tensor.
    label: a Tensor.
    data_format: channels_last (NHWC) or channels_first (NCHW).
    num_layers: number of layers, an int.
    batch_norm_decay: decay for batch normalization, a float.
    batch_norm_epsilon: epsilon for batch normalization, a float.

  Returns:
    A tuple with the loss for the tower, the gradients and parameters, and
    predictions.

  """
    tf_config = json.loads(os.environ['TF_CONFIG'])
    tasktype = tf_config['task']['type']
    index = str(tf_config['task']['index'])
    data_format = 'channels_first'

    model = cifar10_model.ResNetCifar10(num_layers,
                                        batch_norm_decay=batch_norm_decay,
                                        batch_norm_epsilon=batch_norm_epsilon,
                                        is_training=is_training,
                                        data_format=data_format)
    logits = model.forward_pass(feature, input_data_format='channels_last')
    tower_pred = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits)
    }

    tower_loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                        labels=label)
    tower_loss = tf.reduce_mean(tower_loss)

    ### LOGGING START TIME HERE
    # get_time_module = tf.load_op_library('/home/tensorflow/bazel-bin/tensorflow/core/user_ops/get_time.so')
    # start_time_operation = get_time_module.get_time(tower_loss)
    # start_time_operation = tf.reshape(start_time_operation, [-1])
    # start_time_operation = tf.convert_to_tensor(start_time_operation, name='sahil_grad_start_time_operationresult')

    # loss_op = tf.reshape(tower_loss, [-1])
    # start_time_op = tf.compat.v1.py_func(func=return_time, inp=[loss_op], Tout=tf.float32)
    # start_time_op = tf.reshape(start_time_op, [-1])
    # start_time_op = tf.reduce_sum(start_time_op, name='START_SAHIL_TIME_GRADIENT')

    model_params = tf.trainable_variables()
    tower_loss += weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in model_params])

    #start_ts = tf.timestamp(name='TS_START_SAHIL')
    # compgrad_tower = [optimizer.compute_gradients(tower_loss, model_params, worker_name=str(start_time_operation[0]))]
    compgrad_tower = [optimizer.compute_gradients(tower_loss, model_params)]
    #compgrad_tower = [optimizer.compute_gradients(tower_loss, model_params, worker_name=str(start_ts))]

    for g, v in compgrad_tower[0]:
        tf.summary.histogram(v.name + '_ORG', g)

    #last_gradient=0
    new_grads = [(grad[0] * gradient_scale) for grad in compgrad_tower[0]]
    for new_g, new_v in zip(new_grads, model_params):
        tf.summary.histogram(new_v.name + '_MODIFIED',
                             new_g)  #last_gradient = new_g

    return tower_loss, zip(new_grads, model_params), tower_pred, compgrad_tower
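The commented-out block in the middle of this example sketches timing the start of the gradient computation with a py_func whose input depends on the loss. A minimal working variant of that idea is shown below; the return_time helper is an assumption, since it is not included in the original snippet.

import time
import numpy as np
import tensorflow as tf  # TF 1.x API

def return_time(_loss_value):
    # Runs on the host once its input is available; returns wall-clock time.
    return np.float32(time.time())

loss = tf.constant(1.23)  # stand-in for tower_loss
start_time_op = tf.py_func(return_time, [loss], tf.float32,
                           name='grad_start_time')
start_time_op.set_shape(())  # py_func drops static shape information

with tf.Session() as sess:
    print(sess.run(start_time_op))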
Example #7
def main(_):
    num_train_examples = 45000
    melt.apps.init()

    batch_size = melt.batch_size()
    num_gpus = melt.num_gpus()

    batch_size_per_gpu = FLAGS.batch_size

    # The global batch size is unchanged, but FLAGS.batch_size becomes
    # batch_size / num_gpus.
    #print('--------------batch_size, FLAGS.batch_size, num_steps_per_epoch', batch_size, FLAGS.batch_size, num_train_examples // batch_size)

    global_scope = FLAGS.algo
    with tf.variable_scope(global_scope) as global_scope:
        data_format = 'channels_first'
        num_layers = 44
        batch_norm_decay = 0.997
        batch_norm_epsilon = 1e-05
        data_dir = './mount/data/cifar10/'
        with tf.variable_scope('main') as scope:
            model = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                training=True,
                data_format=data_format)

            dataset = cifar10.Cifar10DataSet(data_dir,
                                             subset='train',
                                             use_distortion=True)
            # this is faster than the method above
            iterator = dataset.make_batch(batch_size)
            batch = iterator.get_next()

            ## The commented-out version below also works:
            # x = {'id': batch[0], 'image': batch[1]}
            # y = batch[2]
            # batch = (x, y)
            # x, y = melt.split_batch(batch, batch_size, num_gpus)
            # image_batches, label_batches = [item['image'] for item in x], y

            _, image_batches, label_batches = melt.split_batch(
                batch, batch_size, num_gpus)

            def loss_function(i):
                return tower_loss(model, image_batches[i], label_batches[i])

            label_batch = label_batches[-1]

            #loss_function = lambda: tower_loss(model, image_batch, label_batch)
            loss = melt.tower(loss_function, num_gpus)
            pred = model.predict()
            pred = pred['classes']
            #label_batch = dataset.label_batch
            acc = tf.reduce_mean(tf.to_float(tf.equal(pred, label_batch)))

            #tf.summary.image('train/image', dataset.image_batch)
            # # Compute confusion matrix
            # matrix = tf.confusion_matrix(label_batch, pred, num_classes=10)
            # # Get a image tensor for summary usage
            # image_tensor = draw_confusion_matrix(matrix)
            # tf.summary.image('train/confusion_matrix', image_tensor)

            scope.reuse_variables()
            ops = [loss, acc]

            validator = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                training=False,
                data_format=data_format)

            valid_dataset = cifar10.Cifar10DataSet(data_dir,
                                                   subset='valid',
                                                   use_distortion=False)
            valid_iterator = valid_dataset.make_batch(batch_size)
            valid_batch = valid_iterator.get_next()
            valid_id_batches, valid_image_batches, valid_label_batches = melt.split_batch(
                valid_batch, batch_size, num_gpus, training=False)

            def valid_loss_fn(i):
                valid_loss = tower_loss(validator, valid_image_batches[i],
                                        valid_label_batches[i])
                valid_pred = validator.predict()
                return valid_id_batches[i], valid_loss, valid_pred[
                    'classes'], valid_label_batches[i], valid_image_batches[i]

            num_valid_examples = dataset.num_examples_per_epoch(subset='valid')
            valid_ops = melt.tower(valid_loss_fn, num_gpus, training=False)

            ## does not seem to work in non-repeat mode.
            #tf.summary.image('valid/image', valid_image_batch)
            ## Compute confusion matrix
            #matrix = tf.confusion_matrix(valid_label_batch, valid_pred, num_classes=10)
            ## Get a image tensor for summary usage
            #image_tensor = draw_confusion_matrix(matrix)
            #tf.summary.image('valid/confusion_matrix', image_tensor)

            #loss_function = lambda: tower_loss(validator, val_image_batch, val_label_batch)
            #val_loss = melt.tower_losses(loss_function, FLAGS.num_gpus, training=False)
            #eval_ops = [val_loss]

            metric_eval_fn = lambda model_path=None: \
                                evaluator.evaluate(valid_ops,
                                                   valid_iterator,
                                                   num_steps=-(-num_valid_examples // batch_size),
                                                   num_examples=num_valid_examples,
                                                   model_path=model_path,
                                                   num_gpus=num_gpus)

            predictor = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                training=False,
                data_format=data_format)

            predictor.init_predict()

            test_dataset = cifar10.Cifar10DataSet(data_dir,
                                                  subset='test',
                                                  use_distortion=False)
            test_iterator = test_dataset.make_batch(batch_size)

            test_batch = test_iterator.get_next()
            test_id_batches, test_image_batches, test_label_batches = melt.split_batch(
                test_batch, batch_size, num_gpus, training=False)

            def test_fn(i):
                test_pred = predictor.predict(test_image_batches[i])
                test_pred = test_pred['classes']
                return test_id_batches[i], test_pred

            num_test_examples = dataset.num_examples_per_epoch(subset='test')
            test_ops = melt.tower(test_fn, num_gpus, training=False)
            inference_fn = lambda model_path=None: \
                                evaluator.inference(test_ops,
                                                    test_iterator,
                                                    num_steps=-(-num_test_examples // batch_size),
                                                    num_examples=num_test_examples,
                                                    model_path=model_path,
                                                    num_gpus=num_gpus)

            global eval_names
            names = ['loss', 'acc']

        melt.apps.train_flow(ops,
                             names=names,
                             metric_eval_fn=metric_eval_fn,
                             inference_fn=inference_fn,
                             model_dir=FLAGS.model_dir,
                             num_steps_per_epoch=num_train_examples //
                             batch_size)
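A side note on the num_steps expressions used in the evaluation lambdas above: -(-n // b) is integer ceiling division, so the evaluator covers every example even when the last batch is partial. For instance:

num_examples, batch_size = 5000, 32
num_steps = -(-num_examples // batch_size)  # 157 == ceil(5000 / 32)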