Exemplo n.º 1
0
def add_gradients_summary(grads):
    """Add per-variable gradient summaries plus one global-norm summary.

    For every pair whose gradient is not `None`, a histogram summary of the
    gradient values and a scalar summary of that gradient's norm are created.
    A single `ClippedGradientNorm` scalar with the global norm over all
    gradients is added once, after the per-variable summaries.

    Args:
        grads: A list of `(gradient, variable)` pairs. A gradient may be a
            `Tensor`, an `IndexedSlices` or `None`.

    Returns:
        The list of created gradient summaries (empty when `grads` is empty).

    """
    grads = list(grads)
    summary = []
    for gradient, var in grads:
        # Sparse gradients carry their data in `.values`.
        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        if grad_values is None:
            continue

        summary_name = var.op.name + '/Gradients'
        summary.append(
            get_summary(SummaryTypes.HISTOGRAM, summary_name, grad_values))

        summary_norm_name = var.op.name + '/GradientsNorm'
        summary.append(
            get_summary(SummaryTypes.SCALAR, summary_norm_name,
                        clip_ops.global_norm([grad_values])))

    # The global norm does not depend on the loop variable, so it is emitted
    # exactly once instead of once per variable. Skipped for empty input,
    # where there is nothing to take a norm of.
    if grads:
        all_gradients = tuple(gradient for gradient, _ in grads)
        summary.append(
            get_summary(SummaryTypes.SCALAR, 'ClippedGradientNorm',
                        clip_ops.global_norm(all_gradients)))
    return summary
Exemplo n.º 2
0
def add_gradients_summaries(grads_and_vars):
  """Create a histogram and a norm scalar summary for each gradient.

  Pairs whose gradient is `None` produce no summary; an info message naming
  the variable is logged instead.

  Args:
    grads_and_vars: A list of (gradient, variable) tuples.

  Returns:
    A list with the summaries that were created.
  """
  created = []
  for grad, var in grads_and_vars:
    if grad is None:
      logging.info('Var %s has no gradient', var.op.name)
      continue

    # For sparse gradients only the stored values are summarized.
    values = grad.values if isinstance(grad, ops.IndexedSlices) else grad

    histogram_summary = summary.histogram(var.op.name + '_gradient', values)
    created.append(histogram_summary)

    norm_summary = summary.scalar(var.op.name + '_gradient_norm',
                                  clip_ops.global_norm([values]))
    created.append(norm_summary)

  return created
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
  """Clip the gradients of a multitask loss by their global norm.

  Gradients reported as all-zero by `_is_all_zeros` are swapped for a scalar
  zero when the global norm is computed. `None` gradients are passed along
  unchanged.

  Args:
    gradients_variables: a list of (gradient, variable) pairs.
    clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

  Returns:
    list: A list of pairs of the same type as gradients_variables.
    fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
  """
  def _scalar_zero_if_all_zeros(grad):
    # Missing gradients stay missing.
    if grad is None:
      return None

    def _zero():
      return array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype))

    def _unchanged():
      return grad

    is_zero = _is_all_zeros(grad)
    return control_flow_ops.cond(is_zero, _zero, _unchanged)

  gradients, variables = six.moves.zip(*gradients_variables)

  norm_inputs = [_scalar_zero_if_all_zeros(grad) for grad in gradients]
  fixed_global_norm = clip_ops.global_norm(norm_inputs)

  # Clip the original gradients, but against the norm computed above.
  clipped, _ = clip_ops.clip_by_global_norm(
      gradients, clip_norm, use_norm=fixed_global_norm)
  pairs = list(six.moves.zip(clipped, variables))
  return pairs, fixed_global_norm
Exemplo n.º 4
0
    def gradient_clipping(grads_and_vars):
        """Scale all gradients by a factor derived from the adaptive max norm.

        The factor is 1 when the global norm is below the adaptive maximum
        and `exp(log_mean) / norm` otherwise; when `static_max_norm` is set
        it is additionally capped at `static_max_norm / norm`.
        """
        grads, variables = zip(*grads_and_vars)

        norm = clip_ops.global_norm(grads)
        max_norm, log_mean = _adaptive_max_norm(
            norm, std_factor, decay, global_step, epsilon, name)

        # Expose the adaptive maximum for debugging.
        if report_summary:
            summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm)

        below_max = norm < max_norm
        keep_as_is = array_ops.ones_like(norm)
        shrink = math_ops.exp(log_mean) / norm
        factor = array_ops.where(below_max, keep_as_is, shrink)

        if static_max_norm is not None:
            factor = math_ops.minimum(static_max_norm / norm, factor)

        def _scaled(grad):
            # Missing gradients stay missing; sparse gradients are rebuilt
            # around their scaled values.
            if grad is None:
                return None
            if isinstance(grad, ops.IndexedSlices):
                return ops.IndexedSlices(grad.values * factor, grad.indices,
                                         grad.dense_shape)
            return grad * factor

        clipped_grads = [_scaled(grad) for grad in grads]
        return list(zip(clipped_grads, variables))
Exemplo n.º 5
0
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
    """Clip the gradients of a multitask loss by their global norm.

    Gradients that `_is_all_zeros` reports as entirely zero are replaced by a
    scalar zero for the global norm computation; `None` gradients are left as
    they are.

    Args:
        gradients_variables: a list of pairs (gradient, variable).
        clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

    Returns:
        list: A list of pairs of the same type as gradients_variables.
        fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
    """
    gradients, variables = six.moves.zip(*gradients_variables)

    def _norm_input(grad):
        if grad is not None:
            is_zero = _is_all_zeros(grad)
            return control_flow_ops.cond(
                is_zero,
                lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
                lambda: grad)
        return grad

    norm_inputs = list(map(_norm_input, gradients))
    fixed_global_norm = clip_ops.global_norm(norm_inputs)

    # The untouched gradients are clipped, using the norm computed above.
    clipped, _ = clip_ops.clip_by_global_norm(
        gradients, clip_norm, use_norm=fixed_global_norm)
    return list(six.moves.zip(clipped, variables)), fixed_global_norm
Exemplo n.º 6
0
  def gradient_clipping(grads_and_vars):
    """Rescale gradients with a factor based on the adaptive max norm.

    Gradients are multiplied by 1 while the global norm stays below the
    adaptive maximum, and by `exp(log_mean) / norm` otherwise. If
    `static_max_norm` is given, the factor never exceeds
    `static_max_norm / norm`.
    """
    grads, variables = zip(*grads_and_vars)

    norm = clip_ops.global_norm(grads)
    max_norm, log_mean = _adaptive_max_norm(
        norm, std_factor, decay, global_step, epsilon, name)

    # Report the adaptive maximum for debugging.
    if report_summary:
      summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm)

    within_limit = norm < max_norm
    unit_factor = array_ops.ones_like(norm)
    reduced_factor = math_ops.exp(log_mean) / norm
    factor = array_ops.where(within_limit, unit_factor, reduced_factor)

    if static_max_norm is not None:
      factor = math_ops.minimum(static_max_norm / norm, factor)

    def _apply_factor(grad):
      # `None` is kept; sparse gradients keep their indices and dense shape.
      if grad is None:
        return None
      if isinstance(grad, ops.IndexedSlices):
        return ops.IndexedSlices(grad.values * factor, grad.indices,
                                 grad.dense_shape)
      return grad * factor

    clipped_grads = [_apply_factor(grad) for grad in grads]
    return list(zip(clipped_grads, variables))
Exemplo n.º 7
0
def gradients(opt, loss, vars, step, max_gradient_norm=None, dont_clip=[]):
    '''
    Compute gradients, optionally clip them, add summaries and apply them.

    Args:
        opt: optimizer providing `compute_gradients` and `apply_gradients`.
        loss: loss passed to `opt.compute_gradients`.
        vars: variables passed to `opt.compute_gradients`.
        step: passed to `opt.apply_gradients` as `global_step`.
        max_gradient_norm: when not None, gradients are clipped by global
            norm to this value.
        dont_clip: `variable.name` values excluded from clipping. Only read,
            never modified.

    Returns:
        The result of `opt.apply_gradients`.
    '''
    grads_and_vars = opt.compute_gradients(loss, vars)
    if max_gradient_norm is not None:
        to_clip = [(g, v) for g, v in grads_and_vars
                   if v.name not in dont_clip]
        not_clipped = [(g, v) for g, v in grads_and_vars
                       if v.name in dont_clip]
        if to_clip:
            clippable, variables = zip(*to_clip)
            clipped_gradients, _ = clip_ops.clip_by_global_norm(
                clippable, max_gradient_norm)
            grads_and_vars = (list(zip(clipped_gradients, variables)) +
                              not_clipped)
        else:
            # Unpacking zip(*[]) would raise ValueError; with nothing to
            # clip, the pairs are passed on as they are.
            grads_and_vars = not_clipped

    # Add histograms for variables, gradients and gradient norms
    for gradient, variable in grads_and_vars:
        if gradient is None:
            print('warning: missing gradient: {}'.format(variable.name))
            continue

        # Sparse gradients carry their data in `.values`.
        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        tf.summary.histogram(variable.name, variable)
        tf.summary.histogram(variable.name + '/gradients', grad_values)
        tf.summary.histogram(variable.name + '/gradient_norm',
                             clip_ops.global_norm([grad_values]))

    return opt.apply_gradients(grads_and_vars, global_step=step)
Exemplo n.º 8
0
def add_gradients_summaries(grads_and_vars):
    """Build gradient summaries for a list of (gradient, variable) pairs.

    Each non-`None` gradient contributes a histogram of its values and a
    scalar with its norm. For a `None` gradient an info message with the
    variable's op name is logged and nothing is added.

    Args:
        grads_and_vars: A list of gradient to variable pairs (tuples).

    Returns:
        The list of created summaries.
    """
    collected = []
    for grad, var in grads_and_vars:
        if grad is None:
            logging.info('Var %s has no gradient', var.op.name)
            continue

        # Sparse gradients are summarized through their stored values.
        if isinstance(grad, ops.IndexedSlices):
            values = grad.values
        else:
            values = grad

        tag = var.op.name + '/gradient'
        collected.append(summary.histogram(tag, values))
        collected.append(
            summary.scalar(tag + '_norm', clip_ops.global_norm([values])))

    return collected
Exemplo n.º 9
0
  def test_stable_global_norm_avoids_overflow(self):
    """Plain global norm must be inf for 1e19 entries; the stable one not."""
    ones_vector = array_ops.ones([4])
    huge_matrix = array_ops.ones([4, 4]) * 1e19
    tensors = [ones_vector, huge_matrix, None]

    plain_norm = clip_ops.global_norm(tensors)
    gnorm_is_inf = math_ops.is_inf(plain_norm)
    stable_norm = tfgan_losses._numerically_stable_global_norm(tensors)
    stable_gnorm_is_inf = math_ops.is_inf(stable_norm)

    with self.test_session(use_gpu=True):
      self.assertTrue(gnorm_is_inf.eval())
      self.assertFalse(stable_gnorm_is_inf.eval())
Exemplo n.º 10
0
    def test_stable_global_norm_avoids_overflow(self):
        """Expect inf from `clip_ops.global_norm` but not from the stable norm."""
        small = array_ops.ones([4])
        large = array_ops.ones([4, 4]) * 1e19
        tensors = [small, large, None]

        regular_norm = clip_ops.global_norm(tensors)
        regular_is_inf = math_ops.is_inf(regular_norm)
        stable_norm = tfgan_losses._numerically_stable_global_norm(tensors)
        stable_is_inf = math_ops.is_inf(stable_norm)

        with self.test_session(use_gpu=True):
            self.assertTrue(regular_is_inf.eval())
            self.assertFalse(stable_is_inf.eval())
Exemplo n.º 11
0
    def _optimizer(self, loss, variables, global_step, name):
        """Build the Adam update op for `variables`, with gradient summaries.

        The learning rate equals `self.learning_rate` until `global_step`
        reaches `self.decay_from`; from then on a polynomial decay
        (power 1.0, end value 0) over `self.steps - self.decay_from` steps
        is used.

        Args:
            loss: loss handed to `self.compute_adam_gradients`.
            variables: variables handed to `self.compute_adam_gradients`.
            global_step: step used to select and drive the learning rate.
            name: name of the variable scope and of the Adam optimizer.

        Returns:
            The op returned by `apply_gradients` of the Adam optimizer.
        """
        with tf.variable_scope(name):
            decay_active = tf.greater_equal(global_step, self.decay_from)
            decayed_rate = tf.train.polynomial_decay(
                self.learning_rate,
                global_step - self.decay_from,
                decay_steps=self.steps - self.decay_from,
                end_learning_rate=0,
                power=1.0)
            learning_rate = tf.where(decay_active, decayed_rate,
                                     self.learning_rate)

            if self.tb_verbose:
                tf.summary.scalar('learning_rate', learning_rate)

            adam = tf.train.AdamOptimizer(
                learning_rate, beta1=self.beta1, name=name)

            # Gradients are computed separately from applying them (as in
            # tensorflow.python.training.optimizer.Optimizer#minimize) so
            # that they can be summarized.
            grads_and_vars = self.compute_adam_gradients(adam, loss, variables)
            all_grads = list(zip(*grads_and_vars))[0]
            tf.summary.scalar("gradient_norm/global",
                              clip_ops.global_norm(all_grads))

            # Histograms and norms per gradient, following
            # tensorflow.contrib.layers.python.layers.optimizers.optimize_loss
            for grad, var in grads_and_vars:
                values = (grad.values
                          if isinstance(grad, ops.IndexedSlices) else grad)
                if values is None:
                    continue

                var_name = var.name.replace(":", "_")
                tf.summary.histogram("gradients/%s" % var_name, values)
                tf.summary.scalar("gradient_norm/%s" % var_name,
                                  clip_ops.global_norm([values]))

            # Author's note: the global step is incremented elsewhere because
            # applying gradients is done in multiple session runs. Passing it
            # here incremented it once per optimizer, i.e. 5 times faster.
            return adam.apply_gradients(grads_and_vars)
Exemplo n.º 12
0
 def _gradient_summaries(self, gradsandvar):
     for gradient, variable in gradsandvar:
         if isinstance(gradient, ops.IndexedSlices):
             grad_values = gradient.values
         else:
             grad_values = gradient
         tf.summary.histogram(variable.name, variable)
         tf.summary.histogram(variable.name + './gradients', grad_values)
         tf.summary.histogram(variable.name + '/gradient_norms',
                              clip_ops.global_norm([grad_values]))
Exemplo n.º 13
0
def grads_dict(gradients, histogram_dict):
    """Record gradient values and gradient norms in `histogram_dict`.

    For every (gradient, variable) pair two entries are written, keyed by
    the variable name followed by "/gradients" and "/gradients_norm".

    Args:
        gradients: iterable of (gradient, variable) pairs.
        histogram_dict: dictionary that is updated in place.

    Returns:
        The same `histogram_dict` object.
    """
    for grad, var in gradients:
        # Sparse gradients are represented by their stored values.
        values = grad.values if isinstance(grad, ops.IndexedSlices) else grad

        values_key = var.name + "/gradients"
        histogram_dict[values_key] = values
        histogram_dict[var.name + "/gradients_norm"] = clip_ops.global_norm(
            [values])
    return histogram_dict
Exemplo n.º 14
0
  def test_stable_global_norm_unchanged(self):
    """Check that the stable global norm matches `clip_ops.global_norm`."""
    random_seed.set_random_seed(1234)
    shapes = [[3] * rank for rank in range(6)]
    tensors = [random_ops.random_uniform(shape, -10.0, 10.0)
               for shape in shapes]
    reference = clip_ops.global_norm(tensors)
    stable = tfgan_losses._numerically_stable_global_norm(tensors)

    with self.test_session(use_gpu=True) as sess:
      # Spot-check closeness on more than one sample.
      for _ in range(10):
        reference_np, stable_np = sess.run([reference, stable])
        self.assertNear(reference_np, stable_np, 1e-5)
Exemplo n.º 15
0
  def test_stable_global_norm_unchanged(self):
    """The preconditioned norm must agree with the plain global norm."""
    random_seed.set_random_seed(1234)
    tensors = []
    for rank in range(6):
      tensors.append(random_ops.random_uniform([3] * rank, -10.0, 10.0))

    expected = clip_ops.global_norm(tensors)
    actual = tfgan_losses._numerically_stable_global_norm(tensors)

    with self.test_session(use_gpu=True) as sess:
      # Compare several evaluations rather than a single one.
      for _ in range(10):
        expected_np, actual_np = sess.run([expected, actual])
        self.assertNear(expected_np, actual_np, 1e-5)
Exemplo n.º 16
0
def optimize(gradients,
             optim,
             global_step,
             summaries,
             global_norm=None,
             global_norm_clipped=None,
             appendix=''):
    """Add the requested gradient summaries and apply the gradients.

    Modified from sugartensor.

    Args:
        gradients: list of (gradient, variable) pairs.
        optim: optimizer providing `apply_gradients`.
        global_step: passed to `optim.apply_gradients`.
        summaries: collection of summary names; `None` selects
            ["loss", "learning_rate"]. Only "gradient_norm" and "gradients"
            are acted upon here.
        global_norm: precomputed global norm; computed from `gradients` when
            `None` and "gradient_norm" is requested.
        global_norm_clipped: optional clipped global norm to report.
        appendix: suffix appended to the global-norm summary names.

    Returns:
        The result of `optim.apply_gradients`.
    """
    if summaries is None:
        summaries = ["loss", "learning_rate"]

    with_norms = "gradient_norm" in summaries
    with_histograms = "gradients" in summaries

    # Global norm summaries.
    if with_norms:
        total_norm = global_norm
        if total_norm is None:
            total_norm = clip_ops.global_norm(list(zip(*gradients))[0])
        tf.summary.scalar("global_norm/gradient_norm" + appendix, total_norm)

        if global_norm_clipped is not None:
            tf.summary.scalar("global_norm/gradient_norm_clipped" + appendix,
                              global_norm_clipped)

    # Per-variable gradient histograms and norms.
    for grad, var in gradients:
        values = grad.values if isinstance(grad, ops.IndexedSlices) else grad
        if values is None:
            continue

        var_name = var.name.replace(":", "_")
        if with_histograms:
            tf.summary.histogram("gradients/%s" % var_name, values)
        if with_norms:
            tf.summary.scalar("gradient_norm/%s" % var_name,
                              clip_ops.global_norm([values]))

    # Gradient update op.
    return optim.apply_gradients(gradients, global_step=global_step)
def optimize(loss, learning_rate, optimizer, variables, global_step,
             summaries):
    """Create an optimizer, summarize loss and gradients, apply gradients.

    Modified from sugartensor.

    Args:
        loss: loss to differentiate and, if requested, to summarize.
        learning_rate: passed to `optimizer` as `learning_rate`.
        optimizer: callable returning an optimizer instance.
        variables: passed to `compute_gradients` as `var_list`.
        global_step: passed to `apply_gradients`.
        summaries: collection of summary names; `None` selects
            ["loss", "learning_rate"]. "gradient_norm", "loss" and
            "gradients" are acted upon here.

    Returns:
        The result of `apply_gradients` on the created optimizer.
    """
    optim = optimizer(learning_rate=learning_rate)
    grads_and_vars = optim.compute_gradients(loss, var_list=variables)

    if summaries is None:
        summaries = ["loss", "learning_rate"]

    with_norms = "gradient_norm" in summaries
    with_histograms = "gradients" in summaries

    if with_norms:
        all_grads = list(zip(*grads_and_vars))[0]
        tf.summary.scalar("global_norm/gradient_norm",
                          clip_ops.global_norm(all_grads))

    # Scalar summary for the loss.
    if "loss" in summaries:
        tf.summary.scalar("loss", loss)

    # Per-variable gradient histograms and norms.
    for grad, var in grads_and_vars:
        values = grad.values if isinstance(grad, ops.IndexedSlices) else grad
        if values is None:
            continue

        var_name = var.name.replace(":", "_")
        if with_histograms:
            tf.summary.histogram("gradients/%s" % var_name, values)
        if with_norms:
            tf.summary.scalar("gradient_norm/%s" % var_name,
                              clip_ops.global_norm([values]))

    # Gradient update op.
    return optim.apply_gradients(grads_and_vars, global_step=global_step)
Exemplo n.º 18
0
def gradients_a(opt, loss, vars, step, max_gradient_norm=None, dont_clip=[]):
    """Compute gradients, add TensorBoard histograms and apply the gradients.

    `max_gradient_norm` and `dont_clip` are accepted but not used by this
    function.

    Args:
        opt: optimizer providing `compute_gradients` and `apply_gradients`.
        loss: loss passed to `opt.compute_gradients`.
        vars: variables passed to `opt.compute_gradients`.
        step: passed to `opt.apply_gradients` as `global_step`.

    Returns:
        The result of `opt.apply_gradients`.
    """
    grads_and_vars = opt.compute_gradients(loss, vars)

    # Histograms for variables, gradients and gradient norms in Tensorboard
    for grad, var in grads_and_vars:
        values = grad.values if isinstance(grad, ops.IndexedSlices) else grad
        if values is None:
            print('warning: missing gradient: {}'.format(var.name))
            continue

        tf.summary.histogram(var.name, var)
        tf.summary.histogram(var.name + '/gradients', values)
        norm = clip_ops.global_norm([values])
        tf.summary.histogram(var.name + '/gradient_norm', norm)

    return opt.apply_gradients(grads_and_vars, global_step=step)
Exemplo n.º 19
0
    def model_fn(features, labels, mode, params):
        """Model function defining an inpainting estimator.

        Creates the input variable `z` (kept within +/- `input_clip` through
        a variable constraint), builds the GAN model and the loss, and
        trains only `z`.

        Args:
            features: passed to `loss_fn`.
            labels: used as the GAN's real data and passed to `loss_fn`.
            mode: forwarded to the generator and discriminator functions and
                to the returned spec.
            params: dict providing 'batch_size', 'z_shape', 'add_summaries',
                'input_clip', 'learning_rate' and 'opt_kwargs'.

        Returns:
            An `EstimatorSpec` with predictions, loss and train op.
        """
        z_shape = [params['batch_size']] + params['z_shape']
        add_summaries = params['add_summaries']
        input_clip = params['input_clip']

        def _clip_to_input_range(x):
            return clip_ops.clip_by_value(x, -input_clip, input_clip)

        z = variable_scope.get_variable(
            name=INPUT_NAME,
            initializer=random_ops.truncated_normal(z_shape),
            constraint=_clip_to_input_range)

        gan_model = tfgan_train.gan_model(
            generator_fn=functools.partial(generator_fn, mode=mode),
            discriminator_fn=functools.partial(discriminator_fn, mode=mode),
            real_data=labels,
            generator_inputs=z,
            check_shapes=False)

        loss = loss_fn(gan_model, features, labels, add_summaries)

        # The optimizer lives in its own variable scope so that estimator
        # variables dont cause save/load problems when restoring from ckpts.
        with variable_scope.variable_scope(OPTIMIZER_NAME):
            opt = optimizer(learning_rate=params['learning_rate'],
                            **params['opt_kwargs'])
            step = training_util.get_or_create_global_step()
            train_op = opt.minimize(loss=loss, global_step=step, var_list=[z])

        if add_summaries:
            z_grads = gradients_impl.gradients(loss, z)
            summary.scalar('z_loss/z_grads', clip_ops.global_norm(z_grads))
            summary.scalar('z_loss/loss', loss)

        return model_fn_lib.EstimatorSpec(
            mode=mode,
            predictions=gan_model.generated_data,
            loss=loss,
            train_op=train_op)
Exemplo n.º 20
0
def _numerically_stable_global_norm(tensor_list):
  """Compute the global norm of a list of Tensors, with improved stability.

  The global norm computation sometimes overflows due to the intermediate L2
  step. To avoid this, we divide by a cheap-to-compute max over the
  matrix elements.

  Args:
    tensor_list: A list of tensors, or `None`.

  Returns:
    A scalar tensor with the global norm.
  """
  if all(x is None for x in tensor_list):
    return 0.0

  list_max = math_ops.reduce_max([math_ops.reduce_max(math_ops.abs(x)) for x in
                                  tensor_list if x is not None])
  return list_max * clip_ops.global_norm([x / list_max for x in tensor_list
                                          if x is not None])
Exemplo n.º 21
0
def _numerically_stable_global_norm(tensor_list):
  """Compute the global norm of a list of Tensors, with improved stability.

  The global norm computation sometimes overflows due to the intermediate L2
  step. To avoid this, we divide by a cheap-to-compute max over the
  matrix elements.

  Args:
    tensor_list: A list of tensors, or `None`.

  Returns:
    A scalar tensor with the global norm.
  """
  if np.all([x is None for x in tensor_list]):
    return 0.0

  list_max = math_ops.reduce_max([math_ops.reduce_max(math_ops.abs(x)) for x in
                                  tensor_list if x is not None])
  return list_max * clip_ops.global_norm([x / list_max for x in tensor_list
                                          if x is not None])
Exemplo n.º 22
0
def minimize(loss_batch, learningrate=0.0005):
    """Build an Adam training step and add histograms per trainable variable.

    The following is plotted for every trainable variable:
      - Histogram of the entries of the Tensor
      - Histogram of the gradient over the Tensor
      - Histogram of the gradient-norm over the Tensor

    The two gradient histograms are skipped for variables whose gradient is
    `None`.

    Args:
        loss_batch: loss that is differentiated with respect to all trainable
            variables.
        learningrate: learning rate given to the Adam optimizer.

    Returns:
        The op returned by `apply_gradients`.
    """
    tvars = tf.trainable_variables()
    grads = tf.gradients(loss_batch, tvars)
    step = tf.train.AdamOptimizer(learningrate).apply_gradients(
        zip(grads, tvars))

    for gradient, variable in zip(grads, tvars):
        tf.summary.histogram(variable.name, variable)

        # No gradient means there is nothing to summarize for it.
        if gradient is None:
            continue

        # Sparse gradients carry their data in `.values`.
        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        tf.summary.histogram(variable.name + "/gradients", grad_values)
        tf.summary.histogram(variable.name + "/gradient_norm",
                             clip_ops.global_norm([grad_values]))
    return step
  def model_fn(features, labels, mode, params):
    """Model function defining an inpainting estimator.

    Builds the trainable input `z` (constrained to +/- `input_clip`), the GAN
    model and its loss, and a train op that updates only `z`.

    Args:
      features: passed to `loss_fn`.
      labels: used as the GAN's real data and passed to `loss_fn`.
      mode: forwarded to the generator and discriminator functions and to
        the returned spec.
      params: dict providing 'batch_size', 'z_shape', 'add_summaries',
        'input_clip', 'learning_rate' and 'opt_kwargs'.

    Returns:
      An `EstimatorSpec` with predictions, loss and train op.
    """
    z_shape = [params['batch_size']] + params['z_shape']
    add_summaries = params['add_summaries']
    input_clip = params['input_clip']

    def _keep_within_clip(x):
      return clip_ops.clip_by_value(x, -input_clip, input_clip)

    z = variable_scope.get_variable(
        name=INPUT_NAME,
        initializer=random_ops.truncated_normal(z_shape),
        constraint=_keep_within_clip)

    gan_model = tfgan_train.gan_model(
        generator_fn=functools.partial(generator_fn, mode=mode),
        discriminator_fn=functools.partial(discriminator_fn, mode=mode),
        real_data=labels,
        generator_inputs=z,
        check_shapes=False)

    loss = loss_fn(gan_model, features, labels, add_summaries)

    # A dedicated variable scope makes sure that estimator variables dont
    # cause save/load problems when restoring from ckpts.
    with variable_scope.variable_scope(OPTIMIZER_NAME):
      opt = optimizer(learning_rate=params['learning_rate'],
                      **params['opt_kwargs'])
      step = training_util.get_or_create_global_step()
      train_op = opt.minimize(loss=loss, global_step=step, var_list=[z])

    if add_summaries:
      z_grads = gradients_impl.gradients(loss, z)
      summary.scalar('z_loss/z_grads', clip_ops.global_norm(z_grads))
      summary.scalar('z_loss/loss', loss)

    return model_fn_lib.EstimatorSpec(
        mode=mode,
        predictions=gan_model.generated_data,
        loss=loss,
        train_op=train_op)
Exemplo n.º 24
0
def test_model(dataset, pool_pctg, layer_size_1):
    tf.reset_default_graph()
    if dataset in UCR_DATASETS:
        ucr_dataset = UCRDataset("../ucr_data/" + dataset)
        X_train = ucr_dataset.Xtrain
        y_train = ucr_dataset.Ytrain

        X_val = ucr_dataset.Xtest[:2]
        y_val = ucr_dataset.Ytest[:2]
        X_test = ucr_dataset.Xtest[2:]
        y_test = ucr_dataset.Ytest[2:]
        N = X_train.shape[0]
        Ntest = X_test.shape[0]
        D = 1  # Number of varialbes represented in time series
        D_ts = X_train.shape[1]
        X_train = np.expand_dims(X_train, 1)
        X_test = np.expand_dims(X_test, 1)
    elif dataset in MV_DATASETS:
        dataset = cv_splits_for_dataset(dataset)
        dataset_idx = min(4, len(dataset) - 1)
        X_train = dataset[dataset_idx].X_train
        y_train = dataset[dataset_idx].y_train
        X_test = dataset[dataset_idx].X_test
        y_test = dataset[dataset_idx].y_test

        n = max([
            np.max([v.shape[0] for v in X_train]),
            np.max([v.shape[0] for v in X_test])
        ])
        X_train = standardize_ts_lengths(X_train, n)
        X_test = standardize_ts_lengths(X_test, n)

        N = X_train.shape[0]
        Ntest = X_test.shape[0]
        D = X_train.shape[1]
        D_ts = X_train.shape[2]
    else:

        X_train, y_train, X_test, y_test = loadEEG()
        X_val = X_test[:2]
        y_val = y_test[:2]
        X_test = X_test[2:]
        y_test = y_test[2:]

        n = max([
            np.max([v.shape[0] for v in X_train]),
            np.max([v.shape[0] for v in X_test])
        ])
        X_train = standardize_ts_lengths(X_train, n)
        X_test = standardize_ts_lengths(X_test, n)

        N = X_train.shape[0]
        Ntest = X_test.shape[0]
        D = X_train.shape[1]
        D_ts = X_train.shape[2]

    X_val = X_test[:2]
    y_val = y_test[:2]
    X_test = X_test[2:]
    y_test = y_test[2:]
    pool_width = max(int(POOL_PCTG * D), 2)
    stride_width = 1
    base = np.min(y_train)  #Check if data is 0-based
    if base != 0:
        y_train -= base
        y_test -= base
    y_val = y_test[:2]

    num_classes = len(np.unique(y_train))
    num_fc_1 = layer_size_1
    epochs = np.floor(batch_size * max_iterations / N)
    print('Train with approximately %d epochs' % (epochs))

    x = tf.placeholder("float", shape=[None, D, D_ts], name='Input_data')
    y_ = tf.placeholder(tf.int64, shape=[None], name='Ground_truth')
    keep_prob = tf.placeholder("float")
    bn_train = tf.placeholder(tf.bool)

    with tf.name_scope("Reshaping_data") as scope:
        x_image = tf.reshape(x, [-1, D, D_ts, 1])

        initializer = tf.contrib.layers.xavier_initializer()
        """Build the graph"""
        # ewma is the decay for which we update the moving average of the
        # mean and variance in the batch-norm layers
    with tf.name_scope("Conv1") as scope:
        W_conv1 = tf.get_variable("Conv_Layer_1",
                                  shape=[1, 5, 1, num_filt_1],
                                  initializer=initializer)
        b_conv1 = bias_variable([num_filt_1], 'bias_for_Conv_Layer_1')
        a_conv1 = conv2d(x_image, W_conv1) + b_conv1

        h_relu = tf.nn.relu(a_conv1)
        h_conv1 = max_pool_2x2(h_relu, pool_width)

    with tf.name_scope("Fully_Connected1") as scope:
        h_conv3_flat = tf.contrib.layers.flatten(h_conv1)
        W_fc1 = tf.get_variable(
            "Fully_Connected_layer_1",
            shape=[D * num_filt_1 * D_ts * (1. / stride_width), num_fc_1],
            initializer=initializer)
        b_fc1 = bias_variable([num_fc_1], 'bias_for_Fully_Connected_Layer_1')
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

    with tf.name_scope("Fully_Connected2") as scope:
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
        W_fc2 = tf.get_variable("W_fc2",
                                shape=[num_fc_1, num_classes],
                                initializer=initializer)
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[num_classes]),
                            name='b_fc2')
        h_fc2 = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

    with tf.name_scope("SoftMax") as scope:
        regularization = .001
        regularizers = (tf.nn.l2_loss(W_conv1) + tf.nn.l2_loss(b_conv1) +
                        tf.nn.l2_loss(W_fc2) + tf.nn.l2_loss(b_fc2))
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h_fc2,
                                                              labels=y_)

        cost = tf.reduce_sum(loss) / batch_size
        cost += regularization * regularizers
        loss_summ = tf.summary.scalar("cross entropy_loss", cost)
    with tf.name_scope("train") as scope:
        tvars = tf.trainable_variables()
        #We clip the gradients to prevent explosion
        grads = tf.gradients(cost, tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = list(zip(grads, tvars))
        train_step = optimizer.apply_gradients(gradients)
        # The following block plots for every trainable variable
        #  - Histogram of the entries of the Tensor
        #  - Histogram of the gradient over the Tensor
        #  - Histogram of the grradient-norm over the Tensor
        numel = tf.constant([[0]])
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            numel += tf.reduce_sum(tf.size(variable))

            h1 = tf.summary.histogram(variable.name, variable)
            h2 = tf.summary.histogram(variable.name + "/gradients",
                                      grad_values)
            h3 = tf.summary.histogram(variable.name + "/gradient_norm",
                                      clip_ops.global_norm([grad_values]))
    with tf.name_scope("Evaluating_accuracy") as scope:

        correct_prediction = tf.equal(tf.argmax(h_fc2, 1), y_)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        accuracy_summary = tf.summary.scalar("accuracy", accuracy)

    #Define one op to call all summaries
    merged = tf.summary.merge_all()

    # For now, we collect performances in a Numpy array.
    # In future releases, I hope TensorBoard allows for more
    # flexibility in plotting
    #perf_collect = np.zeros((3,int(np.floor(max_iterations /100))))
    cost_ma = 0.0
    acc_ma = 0.0

    patience_window = 20
    last_10_val = [0 for i in range(patience_window)]
    with tf.Session() as sess:
        writer = tf.summary.FileWriter("./log_tb", sess.graph)

        sess.run(tf.global_variables_initializer())

        step = 0  # Step is a counter for filling the numpy array perf_collect
        for i in range(int(max_iterations)):
            batch_ind = np.random.choice(N, batch_size, replace=False)
            #batch_ind = np.arange(N)
            #batch_ind = batch_ind[(i*batch_size)%N:((i+1)*batch_size)%N]

            if i == 0:
                # Use this line to check before-and-after test accuracy
                result = sess.run(accuracy,
                                  feed_dict={
                                      x: X_test,
                                      y_: y_test,
                                      keep_prob: 1.0,
                                      bn_train: False
                                  })
                acc_test_before = result

            if i % 50 == 0:
                #Check training performance

                result = sess.run([cost, accuracy],
                                  feed_dict={
                                      x: X_train,
                                      y_: y_train,
                                      keep_prob: 1.0,
                                      bn_train: False
                                  })
                #perf_collect[1,step] = acc_train = result[1]
                acc_train = result[1]
                cost_train = result[0]

                #Check validation performance

                #result = sess.run([accuracy,cost,merged], feed_dict={ x: X_val, y_: y_val, keep_prob: 1.0, bn_train : False})
                #perf_collect[0,step] = acc_val = result[0]
                #cost_val = result[1]
                cost_val = 10
                acc_val = 10
                if i == 0: cost_ma = cost_train
                if i == 0: acc_ma = acc_train
                cost_ma = 0.8 * cost_ma + 0.2 * cost_train
                acc_ma = 0.8 * acc_ma + 0.2 * acc_train
                train_embedding = h_fc1.eval(feed_dict={
                    x: X_train,
                    y_: y_train,
                    keep_prob: 1.0,
                    bn_train: False
                })
                test_embedding = h_fc1.eval(feed_dict={
                    x: X_test,
                    y_: y_train,
                    keep_prob: 1.0,
                    bn_train: False
                })
                gg = evaluate_test_embedding(train_embedding, y_train,
                                             test_embedding, y_test)
                print('Accuracy given NN approach %0.2f' % (100 * gg))
                last_10_val[(i / 200) % patience_window] = acc_val
                #if last_10_val.count(last_10_val[0]) == len(last_10_val) and i > 3000:
                #  print 'Stopping early!'
                #  break

                writer.flush(
                )  #Don't forget this command! It makes sure Python writes the summaries to the log-file
                #   print("At %5.0f/%5.0f Cost: train%5.3f val%5.3f(%5.3f) Acc: train%5.3f val%5.3f(%5.3f) " % (i,max_iterations, cost_train,cost_val,cost_ma,acc_train,acc_val,acc_ma))
                step += 1
            gg = h_relu.eval(
                feed_dict={
                    x: X_train[batch_ind],
                    y_: y_train[batch_ind],
                    keep_prob: dropout,
                    bn_train: False
                })
            sess.run(train_step,
                     feed_dict={
                         x: X_train[batch_ind],
                         y_: y_train[batch_ind],
                         keep_prob: dropout,
                         bn_train: False
                     })
        result = sess.run([accuracy, numel],
                          feed_dict={
                              x: X_test,
                              y_: y_test,
                              keep_prob: 1.0,
                              bn_train: False
                          })
        acc_test = result[0]
        print('The network has %s trainable parameters' % (result[1]))

        train_embedding = h_fc1.eval(feed_dict={
            x: X_train,
            y_: y_train,
            keep_prob: 1.0,
            bn_train: False
        })
        test_embedding = h_fc1.eval(feed_dict={
            x: X_test,
            y_: y_train,
            keep_prob: 1.0,
            bn_train: False
        })
        gg = evaluate_test_embedding(train_embedding, y_train, test_embedding,
                                     y_test)
        print('Accuracy given NN approach %0.2f' % (100 * gg))
        return gg
Exemplo n.º 25
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True):
  """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers, include:

  - string, name of the optimizer like 'SGD', 'Adam', see OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - function, takes learning rate `Tensor` as argument and must return
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - class, subclass of `Optimizer` that takes only one required argument -
      learning rate, such as AdamOptimizer, AdagradOptimizer.
      E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
  - object, instance of subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it's
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If float, is provided, a global
      clipping is applied to prevent the norm of the gradient to exceed this
      value. Alternatively, a callable can be provided e.g.: adaptive_clipping.
      This callable takes a `list` of `(gradients, variables)` `tuple`s and
      returns the same thing with the gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.

  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` is wrong type.
        * `clip_gradients` is not float or callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  if global_step is None:
    global_step = contrib_framework.get_global_step()
  else:
    contrib_framework.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are ran before computing loss.
    if update_ops:
      # `update_ops` may be a set here; hand over a list.
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Learning rate variable, with possible decay.
    lr = None
    if learning_rate is not None:
      if (isinstance(learning_rate, ops.Tensor) and
          learning_rate.get_shape().ndims == 0):
        lr = learning_rate
      elif isinstance(learning_rate, float):
        if learning_rate < 0.0:
          # Interpolate the value into the message (the format argument used
          # to be passed as a second exception argument and never applied).
          raise ValueError("Invalid learning_rate %s." % learning_rate)
        lr = vs.get_variable(
            "learning_rate", [],
            trainable=False,
            initializer=init_ops.constant_initializer(learning_rate))
      else:
        raise ValueError("Learning rate should be 0d Tensor or float. "
                         "Got %s of type %s" % (str(learning_rate),
                                                str(type(learning_rate))))
    if summaries is None:
      summaries = ["loss", "learning_rate"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    # The learning-rate summary is only emitted when a decay function is
    # applied, i.e. when the rate actually varies over time.
    if learning_rate is not None and learning_rate_decay_fn is not None:
      if global_step is None:
        raise ValueError("global_step is required for learning_rate_decay_fn.")
      lr = learning_rate_decay_fn(lr, global_step)
      if "learning_rate" in summaries:
        summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      # Factory function: called with the learning rate when one was given,
      # otherwise with no arguments.
      if learning_rate is not None:
        opt = optimizer(lr)
      else:
        opt = optimizer()
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))

    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    # Compute gradients.
    gradients = opt.compute_gradients(
        loss,
        variables,
        colocate_gradients_with_ops=colocate_gradients_with_ops)

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    # Global norm of the gradients before any clipping is applied.
    if "gradient_norm" in summaries:
      summary.scalar("global_norm/gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Optionally clip gradients by global norm.
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add scalar summary for loss.
    if "loss" in summaries:
      summary.scalar("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if grad_values is not None:
        # ":" is replaced so the variable name can be used in a summary tag.
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, grad_values)
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))

    # Global norm after clipping, reported only when clipping was requested.
    if clip_gradients is not None and "gradient_norm" in summaries:
      summary.scalar("global_norm/clipped_gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
Exemplo n.º 26
0
  def __init__(self, config):
    """Build the computation graph of a variational auto-encoder.

    The graph consists of a two-layer ReLU encoder followed by linear layers
    for the mean and log-variance of the latent code, a sampled latent code,
    a two-layer ReLU decoder with a sigmoid reconstruction layer, the cost,
    an Adam training op and histogram summaries for every trainable variable.

    Args:
      config: dict providing the keys 'batch_size', 'learning_rate',
        'num_enc1', 'num_enc2' (hidden layer sizes), 'num_l' (latent size)
        and 'D' (input size).

    Sets the attributes `x`, `z_mu`, `z`, `rec`, `cost`, `train_step`,
    `numel` and `merged`.

    NOTE(review): `tf.mul`, `tf.histogram_summary` and
    `tf.merge_all_summaries` are pre-1.0 TensorFlow names; confirm the
    TensorFlow version this class targets before upgrading.
    """
    # Read for parity with the expected config keys; not used by the graph.
    batch_size = config['batch_size']
    learning_rate = config['learning_rate']
    num_enc1 = config['num_enc1']
    num_enc2 = config['num_enc2']
    num_l = config['num_l']
    D = config['D']

    def xv_init(arg_in, arg_out, shape=None):
      """Return a uniform initial value in +/- sqrt(6 / (arg_in + arg_out)).

      The tensor has shape `(arg_in, arg_out)` unless `shape` is given.
      """
      bound = np.sqrt(6.0/(arg_in + arg_out))
      # Previously an explicit `shape` left `tensor_shape` unbound.
      tensor_shape = (arg_in, arg_out) if shape is None else shape
      return tf.random_uniform(tensor_shape, minval=-bound, maxval=bound, dtype=tf.float32)

    with tf.name_scope("Placeholders") as scope:
      self.x = tf.placeholder("float", shape=[None, D], name = 'Input_data')

    with tf.name_scope("Encoding_network") as scope:
      #Layer 1
      W1e = tf.Variable(xv_init(D,num_enc1))
      b1e = tf.Variable(tf.constant(0.1,shape=[num_enc1],dtype=tf.float32))
      h1e = tf.nn.relu(tf.nn.xw_plus_b(self.x,W1e,b1e))

      #Layer 2
      W2e = tf.Variable(xv_init(num_enc1,num_enc2))
      b2e = tf.Variable(tf.constant(0.1,shape=[num_enc2],dtype=tf.float32))
      h2e = tf.nn.relu(tf.nn.xw_plus_b(h1e,W2e,b2e))

      #layer for mean of z
      W_mu = tf.Variable(xv_init(num_enc2,num_l))
      b_mu = tf.Variable(tf.constant(0.1,shape=[num_l],dtype=tf.float32))
      self.z_mu = tf.nn.xw_plus_b(h2e,W_mu,b_mu)  #mu, mean, of latent space

      #layer for sigma of z
      W_sig = tf.Variable(xv_init(num_enc2,num_l))
      b_sig = tf.Variable(tf.constant(0.1,shape=[num_l],dtype=tf.float32))
      z_sig_log_sq = tf.nn.xw_plus_b(h2e,W_sig,b_sig)  #sigma of latent space, in log-scale and squared.
      # This log_sq will save computation later on. log(sig^2) is a real number, so no sigmoid is necessary

    with tf.name_scope("Latent_space") as scope:
      # Sample z = mu + sigma * eps with eps drawn from a standard normal.
      eps = tf.random_normal(tf.shape(self.z_mu),0,1,dtype=tf.float32)
      self.z = self.z_mu + tf.mul(tf.sqrt(tf.exp(z_sig_log_sq)),eps)

    with tf.name_scope("Decoding_network") as scope:
      #Layer 1
      W1d = tf.Variable(xv_init(num_l,num_enc2))
      b1d = tf.Variable(tf.constant(0.1,shape=[num_enc2],dtype=tf.float32))
      h1d = tf.nn.relu(tf.nn.xw_plus_b(self.z,W1d,b1d))

      #Layer 2
      W2d = tf.Variable(xv_init(num_enc2,num_enc1))
      b2d = tf.Variable(tf.constant(0.01,shape=[num_enc1],dtype=tf.float32))
      h2d = tf.nn.relu(tf.nn.xw_plus_b(h1d,W2d,b2d))

      #Layer for reconstruction
      W_rec = tf.Variable(xv_init(num_enc1,D))
      b_rec = tf.Variable(tf.constant(0.5,shape=[D],dtype=tf.float32))
      self.rec = tf.nn.sigmoid(tf.nn.xw_plus_b(h2d,W_rec,b_rec))  #Reconstruction. For now only mean

    with tf.name_scope("Loss_calculation") as scope:
      #See equation (10) of https://arxiv.org/abs/1312.6114
      loss_rec = tf.reduce_sum(self.x * tf.log(1e-10 + self.rec) + (1-self.x) * tf.log(1-self.rec+1e-10),1)  #Add 1e-10 to avoid numeric instability
      loss_kld = 0.5*tf.reduce_sum((1+z_sig_log_sq-tf.square(self.z_mu)-tf.exp(z_sig_log_sq)),1)   #KL divergence

      # Both terms are to be maximised, so the cost is their negated mean.
      self.cost = -1*tf.reduce_mean(loss_rec + loss_kld)

    with tf.name_scope("Optimization") as scope:
      tvars = tf.trainable_variables()
      # NOTE: the gradients are applied as computed; no clipping is done here.
      grads = tf.gradients(self.cost, tvars)
      optimizer = tf.train.AdamOptimizer(learning_rate)
      # Materialise the pairs: on Python 3 `zip` is a one-shot iterator, and
      # `apply_gradients` would exhaust it before the summary loop below.
      gradients = list(zip(grads, tvars))
      self.train_step = optimizer.apply_gradients(gradients)
      # The following block plots for every trainable variable
      #  - Histogram of the entries of the Tensor
      #  - Histogram of the gradient over the Tensor
      #  - Histogram of the gradient-norm over the Tensor
      # and accumulates the total number of trainable parameters in `numel`.
      numel = tf.constant([[0]])
      for gradient, variable in gradients:
        numel += tf.reduce_sum(tf.size(variable))
        tf.histogram_summary(variable.name, variable)

        if isinstance(gradient, ops.IndexedSlices):
          grad_values = gradient.values
        else:
          grad_values = gradient

        # `tf.gradients` yields None for variables the cost does not depend
        # on; there is nothing to summarise for those.
        if grad_values is None:
          continue

        tf.histogram_summary(variable.name + "/gradients", grad_values)
        tf.histogram_summary(variable.name + "/gradient_norm", clip_ops.global_norm([grad_values]))
      # Expose the parameter count so callers can evaluate it in a session.
      self.numel = numel

    #Define one op to call all summaries
    self.merged = tf.merge_all_summaries()
    print('Finished computation graph')
Exemplo n.º 27
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  moving_average_decay=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None):
    """Build a training op that optimizes `loss` with the given optimizer.

    Args:
      loss: Tensor, 0 dimensional.
      global_step: Tensor, step counter incremented on each update.
      learning_rate: float or 0-d Tensor, size of each training step.
      optimizer: one of
        * a string naming an entry of OPTIMIZER_CLS_NAMES (e.g. 'SGD',
          'Adam', 'Adagrad');
        * a sub-class of tf.Optimizer, instantiated here with the learning
          rate;
        * an instance of a tf.Optimizer sub-class, used as is.
      gradient_noise_scale: float or None. When set, the gradients are passed
        through `_add_scaled_noise_to_gradients` with this scale.
      gradient_multipliers: dict of variables or variable names to floats, or
        None. When set, the gradients are passed through
        `_multiply_gradients`.
      clip_gradients: float or None. When set, the gradients are passed
        through `_clip_gradients_by_norm` with this value.
      moving_average_decay: Deprecated. float or None; when set, a moving
        average of the loss is maintained and reported as "loss/mean".
      learning_rate_decay_fn: function taking the learning rate and
        `global_step` Tensors and returning the decayed rate Tensor, for
        example tf.train.exponential_decay.
      update_ops: update `Operation`s to run at each step. If None, the
        UPDATE_OPS collection is used. The order of execution between
        `update_ops` and `loss` is non-deterministic.
      variables: list of variables to optimize, or None for all trainable
        variables.
      name: name used to scope the created operations and summaries.
      summaries: list of quantities to report to tensorboard. Defaults to
        the loss and the learning rate; the complete list is in
        OPTIMIZER_SUMMARIES.

    Returns:
      Training op.

    Raises:
      ValueError: if `learning_rate` or `optimizer` has an unsupported type,
        or `optimizer` is an unknown name.
    """
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Default to the graph-wide UPDATE_OPS and run them ahead of the loss.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        # Deprecated moving average of the loss.
        # TODO(b/30439864): moving_average_decay should be removed.
        if moving_average_decay is not None:
            logging.warn("'moving_average_decay' is deprecated. Please use "
                         "tensorboard's builtin averaging instead.")
            ema = train.ExponentialMovingAverage(moving_average_decay,
                                                 name="avg")
            ema_update = ema.apply([loss])
            logging_ops.scalar_summary("loss/mean", ema.average(loss))
            loss = control_flow_ops.with_dependencies([ema_update], loss)

        # Resolve the learning rate: a 0-d tensor is used directly, a float
        # is wrapped in a non-trainable variable, anything else is rejected.
        is_scalar_tensor = (isinstance(learning_rate, ops.Tensor)
                            and learning_rate.get_shape().ndims == 0)
        if is_scalar_tensor:
            rate = learning_rate
        else:
            if not isinstance(learning_rate, float):
                raise ValueError("Learning rate should be 0d Tensor or float. "
                                 "Got %s of type %s" %
                                 (str(learning_rate), str(type(learning_rate))))
            rate = vs.get_variable(
                "learning_rate", [],
                trainable=False,
                initializer=init_ops.constant_initializer(learning_rate))

        summaries = ["loss", "learning_rate"] if summaries is None else summaries

        # Apply the decay schedule, reporting the resulting rate on request.
        if learning_rate_decay_fn is not None:
            rate = learning_rate_decay_fn(rate, global_step)
            if "learning_rate" in summaries:
                logging_ops.scalar_summary("learning_rate", rate)

        # Turn the `optimizer` argument into an optimizer instance.
        given_by_name = isinstance(optimizer, six.string_types)
        if given_by_name and optimizer not in OPTIMIZER_CLS_NAMES:
            raise ValueError(
                "Optimizer name should be one of [%s], you provided %s." %
                (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
        if given_by_name:
            trainer = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=rate)
        elif isinstance(optimizer, type) and issubclass(
                optimizer, optimizer_.Optimizer):
            trainer = optimizer(learning_rate=rate)
        elif isinstance(optimizer, optimizer_.Optimizer):
            trainer = optimizer
        else:
            raise ValueError("Unrecognized optimizer: should be string, "
                             "subclass of Optimizer or instance of "
                             "subclass of Optimizer. Got %s." % str(optimizer))

        # Optimize every trainable variable unless told otherwise.
        if variables is None:
            variables = vars_.trainable_variables()

        # Gradient pipeline: compute, then optional noise, scaling, clipping.
        grads_and_vars = trainer.compute_gradients(loss, variables)
        if gradient_noise_scale is not None:
            grads_and_vars = _add_scaled_noise_to_gradients(
                grads_and_vars, gradient_noise_scale)
        if gradient_multipliers is not None:
            grads_and_vars = _multiply_gradients(grads_and_vars,
                                                 gradient_multipliers)
        if clip_gradients is not None:
            grads_and_vars = _clip_gradients_by_norm(grads_and_vars,
                                                     clip_gradients)

        if "loss" in summaries:
            logging_ops.scalar_summary("loss", loss)

        # Per-variable histograms of the gradients and of their norms.
        for grad, var in grads_and_vars:
            values = grad.values if isinstance(grad, ops.IndexedSlices) else grad
            if values is None:
                continue
            if "gradients" in summaries:
                logging_ops.histogram_summary(var.name + "/gradients", values)
            if "gradient_norm" in summaries:
                logging_ops.histogram_summary(var.name + "/gradient_norm",
                                              clip_ops.global_norm([values]))

        apply_op = trainer.apply_gradients(grads_and_vars,
                                           global_step=global_step,
                                           name="train")

        # Evaluating the returned tensor triggers the gradient update.
        return control_flow_ops.with_dependencies([apply_op], loss)
Exemplo n.º 28
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  moving_average_decay=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training step.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of tf.Optimizer that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of tf.Optimizer
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float or `None`, clips gradients by this value.
    moving_average_decay: Deprecated. float or None, takes into account previous
                          loss to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: tf.train.exponential_decay.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.

  Returns:
    Training op.

  Raises:
    ValueError: if `learning_rate` is neither a 0d Tensor nor a float, or if
      `optimizer` is of the wrong type or an unknown optimizer name.
  """
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are ran before computing loss.
    if update_ops:
      # `update_ops` is a set when taken from the collection above; pass a
      # list so `with_dependencies` receives an ordinary sequence.
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Moving average of the loss with decay.
    # TODO(b/30439864): moving_average_decay should be removed.
    if moving_average_decay is not None:
      logging.warn("'moving_average_decay' is deprecated. Please use "
                   "tensorboard's builtin averaging instead.")
      # Generate moving averages of the loss.
      loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                     name="avg")
      loss_averages_op = loss_averages.apply([loss])
      logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
      loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

    # Learning rate variable, with possible decay.
    if (isinstance(learning_rate, ops.Tensor)
        and learning_rate.get_shape().ndims == 0):
      lr = learning_rate
    elif isinstance(learning_rate, float):
      lr = vs.get_variable(
          "learning_rate", [], trainable=False,
          initializer=init_ops.constant_initializer(learning_rate))
    else:
      raise ValueError("Learning rate should be 0d Tensor or float. "
                       "Got %s of type %s" % (
                           str(learning_rate), str(type(learning_rate))))
    if summaries is None:
      summaries = ["loss", "learning_rate"]
    # The learning-rate summary is only emitted when a decay function is
    # applied.
    if learning_rate_decay_fn is not None:
      lr = learning_rate_decay_fn(lr, global_step)
      if "learning_rate" in summaries:
        logging_ops.scalar_summary("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s."
            % (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif isinstance(optimizer, type) and issubclass(optimizer,
                                                    optimizer_.Optimizer):
      opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer or instance of "
                       "subclass of Optimizer. Got %s." % str(optimizer))

    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    # Compute gradients.
    gradients = opt.compute_gradients(loss, variables)

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(
          gradients, gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)

    # Add scalar summary for loss.
    if "loss" in summaries:
      logging_ops.scalar_summary("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      # Pairs without a gradient are skipped.
      if grad_values is not None:
        if "gradients" in summaries:
          logging_ops.histogram_summary(variable.name + "/gradients",
                                        grad_values)
        if "gradient_norm" in summaries:
          logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                        clip_ops.global_norm([grad_values]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(gradients,
                                       global_step=global_step,
                                       name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
def build_multi_tower_graph(images,
                            sketches,
                            images_d,
                            image_paired_class_ids,
                            image_paired_class_ids_d,
                            text_vocab_indiceses,
                            LSTM_hybrid,
                            vocab_size,
                            batch_size,
                            num_gpu,
                            batch_portion,
                            training,
                            learning_rates,
                            counter,
                            max_iter_step,
                            ld=10,
                            data_format='NCHW',
                            distance_map=True,
                            optimizer='Adam',
                            block_type='MRU'):
    """Build one graph tower per GPU and the generator/discriminator train ops.

    Every input is sliced into ``num_gpu`` chunks with ``split_inputs``, one
    tower is built per chunk with ``build_single_graph``, the per-tower
    gradients are combined with ``average_gradients`` (skipped when there is
    a single tower) and handed to ``optimize``. When ``Config.sn`` is false
    the combined gradients are first clipped by global norm and then each
    tensor is clipped by its own norm.

    :param images: [batch_size, 3, H, W]
    :param sketches:  [batch_size, 3, H, W]
    :param images_d:  [batch_size, 3, H, W]
    :param image_paired_class_ids: [batch_size, ], class_number
    :param image_paired_class_ids_d: [batch_size, ]
    :param text_vocab_indiceses: [batch_size, 15]
    :param LSTM_hybrid: forwarded unchanged to ``build_single_graph``.
    :param vocab_size: forwarded unchanged to ``build_single_graph``.
    :param batch_size: total batch size; tower ``i`` is built with
        ``batch_size * batch_portion[i]``.
    :param num_gpu: number of towers to build.
    :param batch_portion: per-tower multiplier of ``batch_size``, indexed by
        tower number (also passed to ``split_inputs``).
    :param training: forwarded unchanged to ``build_single_graph``.
    :param learning_rates: mapping with the keys ``'generator'`` and
        ``'discriminator'`` holding the base learning rates.
    :param counter: step counter driving the learning-rate decay
        ``max(0.2, 1 - counter / max_iter_step * 0.9)``.
    :param max_iter_step: step count used to normalise ``counter``.
    :param ld: forwarded unchanged to ``build_single_graph``.
    :param data_format: passed to ``models.set_param`` and to every tower.
    :param distance_map: forwarded unchanged to ``build_single_graph``.
    :param optimizer: optimizer identifier resolved through
        ``get_optimizer``.
    :param block_type: forwarded unchanged to ``build_single_graph``.
    :return: tuple ``(opt_g, opt_d, loss_g, loss_d, summaries)`` where the
        two ops come from ``optimize``, the losses from ``gather_losses()``
        and the summaries from ``gather_summaries()``.
    """

    def _split(tensor):
        """Slice `tensor` into one chunk per tower."""
        return split_inputs(tensor, batch_size, batch_portion, num_gpu)

    def _clip_grads_and_vars(grads_and_vars, max_global_norm, hard_clip_norm):
        """Clip by global norm, then hard-clip each tensor by its own norm.

        Returns the clipped (gradient, variable) pairs together with the
        global norm measured before any clipping.
        """
        grad_tensors, grad_vars = list(zip(*grads_and_vars))
        global_norm = clip_ops.global_norm(grad_tensors)
        grad_tensors, _ = clip_ops.clip_by_global_norm(
            grad_tensors, max_global_norm, global_norm)
        # global_norm / clip_by_global_norm skip `None` gradients, whereas
        # clip_by_norm cannot convert `None` to a tensor, so pass them through.
        grad_tensors = [
            t if t is None else clip_ops.clip_by_norm(t, hard_clip_norm)
            for t in grad_tensors
        ]
        return list(zip(grad_tensors, grad_vars)), global_norm

    models.set_param(data_format=data_format)

    # Split all inputs on the CPU; each list holds one entry per tower.
    with tf.device('/cpu:0'):
        images_list = _split(images)  # [num_gpu, [N, C, H, W]]
        images_d_list = _split(images_d)
        sketches_list = _split(sketches)
        image_paired_class_ids_list = _split(image_paired_class_ids)
        image_paired_class_ids_d_list = _split(image_paired_class_ids_d)
        text_vocab_indiceses_list = _split(text_vocab_indiceses)

    # Both optimizers share the same decay schedule, floored at 0.2.
    lr_g = learning_rates['generator']
    lr_d = learning_rates['discriminator']
    optimizer = get_optimizer(optimizer)
    decay = tf.maximum(
        0.2, 1. - (tf.cast(counter, tf.float32) / max_iter_step * 0.9))
    tf.summary.scalar('learning_rate_g', lr_g * decay)
    optim_g = optimizer(learning_rate=lr_g * decay)
    optim_d = optimizer(learning_rate=lr_d * decay)

    tower_grads_g = []
    tower_grads_d = []
    for i in range(num_gpu):
        with tf.name_scope('%s_%d' % ('GPU', i)):
            # The per-tower losses are discarded here; the losses returned to
            # the caller are collected by gather_losses() further down.
            _, _, grad_g, grad_d \
                = build_single_graph(images_list[i],
                                     sketches_list[i],
                                     images_d_list[i],
                                     image_paired_class_ids_list[i],
                                     image_paired_class_ids_d_list[i],
                                     text_vocab_indiceses_list[i],
                                     batch_size * batch_portion[i],
                                     training,
                                     LSTM_hybrid=LSTM_hybrid,
                                     vocab_size=vocab_size,
                                     ld=ld, data_format=data_format,
                                     distance_map=distance_map,
                                     optim_g=optim_g,
                                     optim_d=optim_d,
                                     block_type=block_type)

            tower_grads_g.append(grad_g)
            tower_grads_d.append(grad_d)

    assert len(tower_grads_g) == len(tower_grads_d)
    if len(tower_grads_d) == 1:
        # A single tower needs no averaging.
        ave_grad_g = tower_grads_g[0]
        ave_grad_d = tower_grads_d[0]
    else:
        ave_grad_g, ave_grad_d = average_gradients(
            (tower_grads_g, tower_grads_d))

    # Apply gradients
    tf.get_variable_scope(
    )._reuse = False  # Hack to force initialization of optimizer variables

    if Config.sn:
        # Get the update ops
        spectral_norm_update_ops = tf.get_collection(
            Config.SPECTRAL_NORM_UPDATE_OPS)
    else:
        spectral_norm_update_ops = [tf.no_op()]

    # Norms stay None unless clipping runs; the post-clip norms are never
    # computed in this function and are always passed on as None.
    global_grad_norm_G = None
    global_grad_norm_G_clipped = None
    global_grad_norm_D = None
    global_grad_norm_D_clipped = None

    # Clip gradients if using WGAN/DRAGAN
    if not Config.sn:
        max_grad_norm_G = 50.
        max_grad_norm_D = 100.
        hard_clip_norm_G = 5.
        hard_clip_norm_D = 10.

        ave_grad_g, global_grad_norm_G = _clip_grads_and_vars(
            ave_grad_g, max_grad_norm_G, hard_clip_norm_G)
        ave_grad_d, global_grad_norm_D = _clip_grads_and_vars(
            ave_grad_d, max_grad_norm_D, hard_clip_norm_D)

    # Only the generator step is gated on the spectral-norm update ops.
    with tf.control_dependencies(spectral_norm_update_ops):
        opt_g = optimize(ave_grad_g,
                         optim_g,
                         None,
                         'gradient_norm',
                         global_norm=global_grad_norm_G,
                         global_norm_clipped=global_grad_norm_G_clipped,
                         appendix='_G')
    opt_d = optimize(ave_grad_d,
                     optim_d,
                     None,
                     'gradient_norm',
                     global_norm=global_grad_norm_D,
                     global_norm_clipped=global_grad_norm_D_clipped,
                     appendix='_D')

    summaries = gather_summaries()
    loss_g, loss_d = gather_losses()

    return opt_g, opt_d, loss_g, loss_d, summaries
Exemplo n.º 30
0
def create_train_op(
    total_loss,
    optimizer,
    global_step=None,
    update_ops=None,
    variables_to_train=None,
    clip_gradient_norm=0,
    summarize_gradients=False,
    gate_gradients=tf_optimizer.Optimizer.GATE_OP,
    aggregation_method=None,
    colocate_gradients_with_ops=False):
  """Creates an `Operation` that evaluates the gradients and returns the loss.

  Args:
    total_loss: A `Tensor` representing the total loss.
    optimizer: A tf.Optimizer to use for computing the gradients.
    global_step: A `Tensor` representing the global step variable. If left as
      `None`, then `variables.get_or_create_global_step()` is used.
    update_ops: An optional iterable of updates to execute before the loss is
      evaluated. Only the ops passed in here are used (duplicates are
      dropped); if it is `None` or empty, no update barrier is added.
    variables_to_train: an optional list of variables to train. If None, it will
      default to all tf.trainable_variables(). Every variable given must be
      one of the trainable variables.
    clip_gradient_norm: If greater than 0 then the gradients would be clipped
      by it.
    summarize_gradients: Whether or not add summaries for each gradient.
    gate_gradients: How to gate the computation of gradients. See tf.Optimizer.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: Whether or not to try colocating the gradients
      with the ops that generated them.

  Returns:
    A `Tensor` that when evaluated, computes the gradients and returns the total
      loss value.

  Raises:
    AssertionError: If an entry of `variables_to_train` is not a trainable
      variable, or if there are no variables to train.
  """
  if global_step is None:
    global_step = variables.get_or_create_global_step()

  update_ops = set(update_ops or [])

  # Make sure update_ops are computed before total_loss.
  if update_ops:
    with control_flow_ops.control_dependencies(update_ops):
      barrier = control_flow_ops.no_op(name='update_barrier')
    total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

  # Fetch the trainable variables once instead of once per checked variable.
  trainable_vars = tf_variables.trainable_variables()
  if variables_to_train is None:
    # Default to tf.trainable_variables()
    variables_to_train = trainable_vars
  else:
    # Make sure that variables_to_train are in tf.trainable_variables()
    for v in variables_to_train:
      assert v in trainable_vars, 'Variable %s is not trainable.' % v

  assert variables_to_train, 'No variables to train.'

  # Create the gradients. Note that compute_gradients adds the gradient
  # computation to the current graph.
  grads = optimizer.compute_gradients(
      total_loss, variables_to_train, gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)

  # Clip gradients.
  if clip_gradient_norm > 0:
    grads = clip_gradient_norms(grads, clip_gradient_norm)

  # Summarize gradients.
  if summarize_gradients:
    for grad, var in grads:
      if grad is not None:
        # Sparse gradients carry their data in `.values`.
        if isinstance(grad, ops.IndexedSlices):
          grad_values = grad.values
        else:
          grad_values = grad
        logging_ops.histogram_summary(var.op.name + ':gradient', grad_values)
        # NOTE(review): the norm is logged through a histogram summary as
        # well; kept as-is so existing summary tags/types do not change.
        logging_ops.histogram_summary(var.op.name + ':gradient_norm',
                                      clip_ops.global_norm([grad_values]))
      else:
        logging.info('Var %s has no gradient', var.op.name)

  # Create gradient updates.
  grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

  # Make sure total_loss is valid.
  total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')

  # Ensure the train_tensor computes grad_updates.
  return control_flow_ops.with_dependencies([grad_updates], total_loss)
Exemplo n.º 31
0
    def __init__(self, parameters, hyper_search=False):
        #feed_future_data, train, num_observation_steps, num_prediction_steps, batch_size,
        #         rnn_size, num_layers, learning_rate, learning_rate_decay_factor, input_size, max_gradient_norm,
        #        dropout_prob,random_bias,subsample,random_rotate,num_mixtures,model_type):

        # feed_future_data: whether or not to feed the true data into the decoder instead of using a loopback
        #                function. If false, a loopback function is used, feeding the last generated output as the next
        #                decoder input.
        # train: train the model (or test)
        # Subsample: amount of subsampling. IMPORTANT If this is non-one, the input array must be n times longer than usual, as it only subsamples down
        # This is so that the track is not subsampled the same way each track sample.

        #######################################
        # The LSTM Model consists of:
        # Input Linear layer
        # N LSTM layers
        # a linear output layer to convert the LSTM output to MDN format
        #
        # MDN Format:
        # pi mu1 mu2 sigma1 sigma2 rho
        # (repeat for n mixtures)
        #
        # This MDN format is then either used for the loss, or is sampled to get a real value

        self.parameters = parameters
        self.max_gradient_norm = parameters['max_gradient_norm']
        self.rnn_size = parameters['rnn_size']
        self.num_layers = parameters['num_layers']
        dtype = tf.float32

        self.batch_size = parameters['batch_size']
        self.input_size = parameters['input_size']
        self.embedding_size = parameters['embedding_size']
        self.observation_steps = parameters['observation_steps']
        self.prediction_steps = parameters['prediction_steps']
        self.dropout_prob = parameters['dropout_prob']
        self.random_bias = parameters['random_bias']
        self.subsample = parameters['subsample']
        self.random_rotate = parameters['random_rotate']
        self.num_mixtures = parameters['num_mixtures']
        self.model_type = parameters['model_type']
        self.num_classes = parameters['num_classes']
        self.first_loss_only = parameters['first_loss_only']
        self.global_step = tf.Variable(0, trainable=False, name="Global_step")

        self.learning_rate = tf.Variable(float(parameters['learning_rate']),
                                         trainable=False,
                                         name="Learning_rate")
        min_rate = parameters['learning_rate_min']
        self.learning_rate_decay_op = self.learning_rate.assign((
            (parameters['learning_rate'] - min_rate) *
            (parameters['learning_rate_decay_factor']**tf.cast(
                self.global_step, tf.float32))) + min_rate)
        self.network_summaries = []
        keep_prob = 1 - self.dropout_prob

        if parameters['model_type'] == 'classifier':
            raise Exception("Error")

        # The output of the multiRNN is the size of rnn_size, and it needs to match the input size, or loopback makes
        #  no sense. Here a single layer without activation function is used, but it can be any number of
        #  non RNN layers / functions
        if self.model_type == 'MDN':
            n_MDN_nodes_out = 6 * self.num_mixtures
        if self.parameters['track_padding']:
            with tf.variable_scope('output_network_padding'):
                o_pad_w = tf.get_variable(
                    "o_pad_w", [self.rnn_size, 2],
                    initializer=tf.truncated_normal_initializer(
                        stddev=1.0 / np.sqrt(self.embedding_size)))
                o_pad_b = tf.get_variable(
                    "o_pad_b", [2], initializer=tf.constant_initializer(0.1))
                pad_output_projection = (o_pad_w, o_pad_b)

        ############## LAYERS ###################################

        # Layer is linear, just to re-scale the LSTM outputs [-1,1] to [-9999,9999]
        # If there is a regularizer, these weights should be excluded?

        with tf.variable_scope('output_MDN_proj'):
            MDN_o_w = tf.get_variable(
                "proj_MDN_w", [self.rnn_size, n_MDN_nodes_out],
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(self.embedding_size)))
            MDN_o_b = tf.get_variable("proj_MDN_b", [n_MDN_nodes_out],
                                      initializer=tf.constant_initializer(0.1))
            MDN_output_projection = (MDN_o_w, MDN_o_b)

        with tf.variable_scope('input_scaling'):
            i_s_m = tf.get_variable('in_scale_mean',
                                    shape=[self.input_size],
                                    trainable=False,
                                    initializer=tf.zeros_initializer())
            i_s_s = tf.get_variable('in_scale_stddev',
                                    shape=[self.input_size],
                                    trainable=False,
                                    initializer=tf.ones_initializer())
            scaling_layer = (i_s_m, i_s_s)
            self.scaling_layer = scaling_layer

        with tf.variable_scope('input_embedding_layer'):
            i_w = tf.get_variable(
                "in_w",
                [self.input_size, self.embedding_size
                 ],  # Remember, batch_size is automatic
                initializer=tf.truncated_normal_initializer(
                    stddev=1.0 / np.sqrt(self.embedding_size)))
            i_b = tf.get_variable("in_b", [self.embedding_size],
                                  initializer=tf.constant_initializer(0.1))
            input_layer = (i_w, i_b)

        def _generate_rnn_layer():
            if parameters['RNN_cell'] == "LSTMCell":
                return tf.contrib.rnn.DropoutWrapper(
                    tf.contrib.rnn.LSTMCell(
                        self.rnn_size,
                        state_is_tuple=True,
                        use_peepholes=parameters['peephole_connections']),
                    output_keep_prob=keep_prob)
            if parameters['RNN_cell'] == "BN_LSTMCell":
                return tf.contrib.rnn.DropoutWrapper(
                    BN_LSTMCell(
                        self.rnn_size,
                        is_training=True,
                        use_peepholes=parameters['peephole_connections']),
                    output_keep_prob=keep_prob)
            if parameters['RNN_cell'] == "sketch_LSTM":
                return sketch_rnn.LSTMCell(self.rnn_size,
                                           use_recurrent_dropout=True,
                                           dropout_keep_prob=keep_prob)
            if parameters['RNN_cell'] == "sketch_layer_norm":
                return sketch_rnn.LayerNormLSTMCell(
                    self.rnn_size,
                    use_recurrent_dropout=True,
                    dropout_keep_prob=keep_prob)
            if parameters['RNN_cell'] == "sketch_hyper":
                return sketch_rnn.HyperLSTMCell(self.rnn_size,
                                                use_recurrent_dropout=True,
                                                dropout_keep_prob=keep_prob)

        if self.num_layers > 1:
            self._RNN_layers = tf.contrib.rnn.MultiRNNCell(
                [_generate_rnn_layer() for _ in range(self.num_layers)],
                state_is_tuple=True)
        else:
            self._RNN_layers = _generate_rnn_layer()

        # Don't double dropout
        #self._RNN_layers = tensorflow.contrib.rnn.DropoutWrapper(self._RNN_layers,output_keep_prob=keep_prob)

        def MDN_output_function(output):
            return nn_ops.xw_plus_b(output,
                                    MDN_output_projection[0],
                                    MDN_output_projection[1],
                                    name="MDN_output_projection")

        def pad_output_function(output):
            return nn_ops.xw_plus_b(output,
                                    pad_output_projection[0],
                                    pad_output_projection[1],
                                    name="pad_output_projection")

        def _pad_missing_output_with_zeros(MDN_samples):
            # Simple hack for now as I cannot get t-1 data for t_0 derivatives easily due to scoping problems.
            # sampled has shape 256,2 - it needs 256,4
            # No longer used unless is set input mask as 1100
            if MDN_samples.shape[1] < scaling_layer[0].shape[0]:
                resized = tf.concat([
                    MDN_samples,
                    tf.zeros([
                        MDN_samples.shape[0],
                        scaling_layer[0].shape[0] - MDN_samples.shape[1]
                    ],
                             dtype=tf.float32)
                ], 1)
            else:
                resized = MDN_samples
            return resized

        def _upscale_sampled_output(sample):
            return tf.add(tf.multiply(sample, scaling_layer[1][0:2]),
                          scaling_layer[0][0:2])

        def _scale_vel_thresh(velocity_thresh):
            _, _, _, speed_sub = tf.split(self.scaling_layer[0], 4, axis=0)
            _, _, _, speed_div = tf.split(self.scaling_layer[1], 4, axis=0)
            return tf.divide(
                tf.subtract(tf.constant(velocity_thresh, dtype=tf.float32),
                            speed_sub), speed_div)

        def _apply_scaling_and_input_layer(input_data):
            return tf.nn.dropout(
                tf.nn.relu(
                    nn_ops.xw_plus_b(
                        tf.divide(tf.subtract(input_data, scaling_layer[0]),
                                  scaling_layer[1]),  # Input scaling
                        input_layer[0],
                        input_layer[1])),
                1 - parameters['embedding_dropout'])

        def _padding_bool_to_logits(padding_bool):
            return tf.one_hot(tf.to_int32(padding_bool), depth=2)

        """This was always the biggest side-loading hack.  Because I cannot give an initial state to the decoder
        raw_rnn, its done in the loop function by defining the function here, and pulling variables traditionally 
         outside of functional scope into the function.
         IMPORTANT - the first call to this function is BEFORE the first node, s.t. the cell_output is None check 
         then sets the initial params.

         Loss function - How can I implement this? It needs to go into the loopback function.
         This is because the sequence length is undefined (even though it isn't) and so my standard loss functions are 
         not working."""
        """ Its better to read the simple implementation of dyn_rnn in 
        https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn. I can just declare some TensorArrays and fill
         them in the middle of the loop."""

        output_ta = (
            tf.TensorArray(size=self.prediction_steps,
                           dtype=tf.float32),  # Sampled output
            tf.TensorArray(size=self.prediction_steps,
                           dtype=tf.float32),  # loss
            tf.TensorArray(size=self.prediction_steps + 1,
                           dtype=tf.float32),  # time-1 for derivative loopback
            tf.TensorArray(
                size=self.prediction_steps, dtype=tf.float32
            ),  # MDN outputs # Its either real or generated # depending on feed_future_data # (always false for now)
            tf.TensorArray(size=self.prediction_steps, dtype=tf.float32)
        )  # dual logits for padding or not padding

        def seq2seq_f(encoder_inputs,
                      decoder_inputs,
                      targets,
                      last_input,
                      track_padding_vec=None):
            # returns (self.LSTM_output, self.internal_states)
            target_input_ta = tf.TensorArray(dtype=tf.float32,
                                             size=len(targets))
            for j in range(len(decoder_inputs)):
                target_input_ta = target_input_ta.write(j, targets[j])
            if track_padding_vec is not None:
                track_padding_ta = tf.TensorArray(dtype=tf.bool,
                                                  size=len(track_padding_vec))
                for j in range(len(decoder_inputs)):
                    track_padding_ta = track_padding_ta.write(
                        j, track_padding_vec[j])
            """ First this runs the encoder, then it saves the last internal RNN c state, and passes that into the
            loop parameter as the initial condition. Then it runs the decoder."""

            with tf.variable_scope('seq2seq_encoder'):
                # So I have a list of len(time) of Tensors of shape (batch, RNN dim)
                reordered_encoder_inputs = tf.stack(encoder_inputs, axis=1)
                encoder_outputs, last_enc_state = tf.nn.dynamic_rnn(
                    self._RNN_layers,
                    inputs=reordered_encoder_inputs,
                    dtype=tf.float32)
            """RNN loop function, the heart of this network. """
            def loop_fn(time, cell_output, cell_state, loop_state):
                emit_output = cell_output

                if cell_output is None:
                    # Set initial params
                    next_cell_state = last_enc_state
                    # I have defined last 'encoder input' as actually the first decoder input. It is data for time T_0
                    next_input = decoder_inputs[
                        0]  # Encoder inputs already have input layer applied
                    next_loop_state = (output_ta[0], output_ta[1],
                                       output_ta[2].write(time, last_input),
                                       output_ta[3], output_ta[4])
                else:
                    next_cell_state = cell_state
                    projected_output = MDN_output_function(cell_output)

                    # Take a single sample of the MDN. This may be ignored later, depending on the use-case.
                    sampled = MDN.sample(
                        projected_output,
                        temperature=self.parameters['sample_temperature'])
                    upscale_sampled = _upscale_sampled_output(sampled)

                    # If the no feedforward flag, just give the next time-step of the network zeros.
                    # This is the equivalent of the RNN-ZF (zero feed) network in the paper.
                    if self.parameters['no_feedforward']:
                        next_sampled_input = tf.zeros(
                            [
                                upscale_sampled.shape[0],
                                scaling_layer[0].shape[0]
                            ],
                            dtype=tf.float32)  # Size batch, input width

                    elif self.parameters['input_mask'][2:4] == [0, 0]:
                        next_sampled_input = _pad_missing_output_with_zeros(
                            upscale_sampled)

                    # Else take a sample, and feed this as the next input for the next sequence.
                    # All of this is done within tensorflow, as it allows it to run INSIDE the GPU.
                    # This section is often done sampled once outside of tensorflow using Numpy to resolve the MDN
                    # and performing it this way does not allow
                    else:
                        next_sampled_input = MDN.compute_derivates(
                            loop_state[2].read(time - 1),
                            upscale_sampled,
                            self.parameters['input_columns'],
                            self.parameters['velocity_threshold'],
                            subsample_rate=self.parameters['subsample'])
                    target_ta = target_input_ta.read(
                        time -
                        1)  # Only allowed to call read() once. Dunno why.
                    next_datapoint = next_sampled_input  # tf.cond(feed_forward, lambda: target_ta, lambda: next_sampled_input)
                    next_input = _apply_scaling_and_input_layer(next_datapoint)
                    # That dotted loopy line in the diagram

                    loss = MDN.lossfunc_wrapper(target_ta, projected_output)
                    timewise_track_padding = track_padding_ta.read(time - 1)
                    timewise_track_padding_logits = _padding_bool_to_logits(
                        timewise_track_padding)
                    if track_padding_vec is not None:  # If we have declared padding is being used.
                        # use padding as binary mask for mixture based loss
                        # i.e. if the ground truth says this timestep is padding data, set that timestep's loss to zero
                        loss = tf.multiply(
                            loss,
                            tf.minimum(
                                tf.to_float(
                                    parameters['padding_loss_mixture_weight']),
                                tf.expand_dims(
                                    tf.to_float(
                                        tf.logical_not(timewise_track_padding)
                                    ),  # Hyperparam search sometimes makes this a float64
                                    axis=-1),
                                name='mixture_loss'))
                        padding_output = pad_output_function(
                            cell_output
                        )  # compute what the network thinks about padding
                        # Normalize the softmax loss w.r.t. number of prediction steps
                        # If weight is zero, don't bother computing
                        if abs(parameters['padding_loss_logit_weight']
                               ) > 1e-12:
                            loss = tf.add(
                                loss,
                                tf.expand_dims(
                                    tf.multiply(
                                        tf.
                                        divide(  # Normalize by prediction_steps
                                            tf.nn.
                                            softmax_cross_entropy_with_logits(
                                                logits=padding_output,
                                                labels=
                                                timewise_track_padding_logits),
                                            self.prediction_steps),
                                        tf.to_float(parameters[
                                            'padding_loss_logit_weight'])),
                                    axis=-1,
                                    name="padding_logit_loss"
                                )  # Without this tf.add( shape(100,), shape(100,1)) becomes (100, 100) for some reason
                            )  # compare to GT
                    else:
                        padding_output = None  # loop_state write needs something at least

                    next_loop_state = (loop_state[0].write(
                        time - 1, next_sampled_input), loop_state[1].write(
                            time - 1, loss), loop_state[2].write(
                                time, next_datapoint), loop_state[3].write(
                                    time - 1,
                                    MDN.upscale_and_resolve_mixtures(
                                        projected_output,
                                        scaling_layer)), loop_state[4].write(
                                            time - 1, padding_output))
                    #Its an off by one error I'd rather solve with a new array for readability

                elements_finished = (
                    time >= self.prediction_steps
                )  # whether or not this RNN in the batch has declared itself done

                return (elements_finished, next_input, next_cell_state,
                        emit_output, next_loop_state)

            with tf.variable_scope('seq2seq_decoder'):
                from tensorflow.python.ops.rnn import _transpose_batch_time
                emit_ta, final_state, loop_state_ta = tf.nn.raw_rnn(
                    self._RNN_layers, loop_fn)
                # Here emit_ta should contain all the MDN's for each timestep. To confirm.
                output_sampled = _transpose_batch_time(
                    loop_state_ta[0].stack())
                losses = _transpose_batch_time(loop_state_ta[1].stack())
                MDN_output = _transpose_batch_time(loop_state_ta[3].stack())
                track_padding_output = _transpose_batch_time(
                    loop_state_ta[4].stack())

            return (
                output_sampled,
                losses,  # tf.reduce_sum(losses,axis=1)/len(self.decoder_inputs),\
                final_state,
                MDN_output,
                track_padding_output)

        ################# FEEDS SECTION #######################
        # Feeds for inputs.
        self.observation_inputs = []
        self.future_inputs = []
        self.target_weights = []
        self.trackwise_padding_input = []
        targets = []

        # TODO REFACTOR the new RNN may not need this unrolling, check the input space
        for i in xrange(
                self.observation_steps):  # Last bucket is the biggest one.
            self.observation_inputs.append(
                tf.placeholder(tf.float32,
                               shape=[self.batch_size, self.input_size],
                               name="observation{0}".format(i)))

        if self.model_type == 'MDN':
            for i in xrange(self.prediction_steps):
                self.future_inputs.append(
                    tf.placeholder(tf.float32,
                                   shape=[self.batch_size, self.input_size],
                                   name="prediction{0}".format(i)))
            for i in xrange(self.prediction_steps):
                self.target_weights.append(
                    tf.placeholder(dtype,
                                   shape=[self.batch_size],
                                   name="weight{0}".format(i)))
        if self.parameters['track_padding']:
            for i in xrange(self.prediction_steps):
                self.trackwise_padding_input.append(
                    tf.placeholder(tf.bool,
                                   shape=[self.batch_size],
                                   name="trackwise_padding{0}".format(i)))
            # targets are just the future data
            # Rescale gt data x1 and x2 such that the MDN is judged in smaller unit scale dimensions
            # This is because I do not expect the network to figure out the scaling, and so the Mixture is in unit size scale
            # So the GT must be brought down to meet it.
            targets\
                = [tf.divide(tf.subtract(self.future_inputs[i], scaling_layer[0]), scaling_layer[1])
                   for i in xrange(len(self.future_inputs))]

        #Hook for the input_feed
        self.target_inputs = targets
        ############## IO and LAYER ASSIGNMENT ##############################

        #Leave the last observation as the first input to the decoder
        #self.encoder_inputs = self.observation_inputs[0:-1]
        # TODO REFACTOR the new RNN may not need this unrolling, check the input space
        with tf.variable_scope('encoder_inputs'):
            self.encoder_inputs = [
                _apply_scaling_and_input_layer(input_timestep)
                for input_timestep in self.observation_inputs[0:-1]
            ]

        #decoder inputs are the last observation and all but the last future
        with tf.variable_scope('decoder_inputs'):
            # Last observation
            self.decoder_inputs = [
                _apply_scaling_and_input_layer(self.observation_inputs[-1])
            ]
            # TODO I don't think I use this anymore (after the first input). i.e. feed_future is hardcoded to False.
            # The length is checked, though.
            self.decoder_inputs.extend([
                _apply_scaling_and_input_layer(self.future_inputs[i])
                for i in xrange(len(self.future_inputs) - 1)
            ])

        #### SEQ2SEQ function HERE

        with tf.variable_scope('seq_rnn'):
            self.MDN_sampled_output,\
            self.losses, \
            self.internal_states,\
            self.MDN_mixture_output,\
            self.trackwise_padding_output=\
                seq2seq_f(self.encoder_inputs, self.decoder_inputs, self.target_inputs, self.observation_inputs[-1],
                          self.trackwise_padding_input)

        ########### EVALUATOR / LOSS SECTION ###################
        # TODO There are several types of cost functions to compare tracks. Implement many
        # Mainly, average MSE over the whole track, or just at a horizon time (t+10 or something)
        # There's this corner alg that Social LSTM refernces, but I haven't looked into it.
        # NOTE - there is a good cost function for the MDN (MLE), this is different to the track accuracy metric (above)
        if self.model_type == 'MDN':
            # tf.reduce_sum(losses,axis=1)/len(self.decoder_inputs)
            self.full_losses = tf.reduce_sum(
                self.losses) / (self.batch_size * len(self.decoder_inputs))
            self.first_loss_losses = tf.reduce_sum(
                self.losses[:,
                            0])  # reduce_sum over batch dim, only take step 1
            self.full_accuracy = -self.full_losses  #TODO placeholder, use MSE or something visually intuitive
            self.first_loss_accuracy = -self.first_loss_losses
        if self.model_type == 'classifier':
            raise Exception  # This model is MDN only

        ############# OPTIMIZER SECTION ########################
        # Gradients and SGD update operation for training the model.
        # Here we split the model into training and validation/testing (inference)
        # This allows for slightly different loss functions.
        # The original Alex Graves 2014 paper and the sketch-rnn architectures only do loss on t+1.
        # and they do inference for all time, so I'd like to try that.
        # Because training only occurs in the training section, there is no inference graidents, etc. ONly inference
        # loss and `accuracy'

        tvars = tf.trainable_variables()
        self.first_loss_gradient_norms = []
        self.first_loss_updates = []
        self.full_gradient_norms = []
        self.full_updates = []
        #opt = tf.train.AdadeltaOptimizer(self.learning_rate)
        opt = tf.train.AdamOptimizer(self.learning_rate)
        #opt = tf.train.RMSPropOptimizer(self.learning_rate)
        #opt = tf.train.GradientDescentOptimizer(self.learning_rate)

        first_loss_gradients = tf.gradients(self.first_loss_losses, tvars)
        clipped_first_loss_gradients, first_loss_norm = tf.clip_by_global_norm(
            first_loss_gradients, self.max_gradient_norm)
        self.first_loss_gradient_norms.append(first_loss_norm)
        first_loss_gradients = zip(clipped_first_loss_gradients, tvars)
        self.first_loss_updates.append(
            opt.apply_gradients(first_loss_gradients,
                                global_step=self.global_step))

        full_gradients = tf.gradients(self.full_losses, tvars)
        clipped_full_gradients, full_norm = tf.clip_by_global_norm(
            full_gradients, self.max_gradient_norm)
        self.full_gradient_norms.append(full_norm)
        full_gradients = zip(clipped_full_gradients, tvars)
        self.full_updates.append(
            opt.apply_gradients(full_gradients, global_step=self.global_step))

        ############# LOGGING SECTION ###########################
        for gradient, variable in full_gradients:  #plot the gradient of each trainable variable
            if variable.name.find(
                    "seq_rnn/combined_tied_rnn_seq2seq/tied_rnn_seq2seq/MultiRNNCell"
            ) == 0:
                var_log_name = variable.name[
                    64:]  #Make the thing readable in Tensorboard
            else:
                var_log_name = variable.name
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient
            if not hyper_search:
                self.network_summaries.append(
                    tf.summary.histogram(var_log_name, variable))
                self.network_summaries.append(
                    tf.summary.histogram(var_log_name + "/gradients",
                                         grad_values))
                self.network_summaries.append(
                    tf.summary.histogram(var_log_name + "/gradient_norm",
                                         clip_ops.global_norm([grad_values])))

        if self.first_loss_only:
            self.network_summaries.append(
                tf.summary.scalar('Training_Loss', self.first_loss_losses))
        self.network_summaries.append(
            tf.summary.scalar('Learning_Rate', self.learning_rate))
        self.network_summaries.append(
            tf.summary.scalar('Loss', self.full_losses))

        self.summary_op = tf.summary.merge(self.network_summaries)

        self.saver = tf.train.Saver(max_to_keep=99999)

        return
Exemplo n.º 32
0
    def _build_train_ops(self, grad_bound=1.25, dont_repeat_ff=False):
        """Build per-sample gradient ops and a placeholder-fed training op.

        For every index in ``range(self.bs)`` the gradients of
        ``self.log_prob_loss[i]`` and ``self.pl_ent_loss[i]`` with respect to
        the trainable variables are computed. One placeholder is then created
        per variable that received a gradient, and the returned training op
        applies the (optionally clipped) placeholder values.

        Side effects: sets ``self.logprob_grad_outs``, ``self.ent_grad_outs``,
        ``self.reinforce_grad_norm``, ``self.entropy_grad_norm``,
        ``self.grad_phs``, ``self.grad_outs``, ``self.grad_norm`` and
        ``self.gradphs_and_vars``.

        Args:
            grad_bound: bound forwarded to ``self._clip_grads_and_vars`` when
                ``self.no_grad_clip`` is false.
            dont_repeat_ff: not read by this method; kept so existing callers
                keep working.

        Returns:
            A tuple ``(train_op, grad_outs, logprob_grad_outs,
            ent_grad_outs)``. ``grad_outs`` is always ``None``; the other two
            are lists with one entry per batch index, each holding the
            non-``None`` gradient tensors for that index.
        """
        # This must be the plain list of variables: a trailing comma after the
        # call would silently wrap it in a one-element tuple.
        tf_variables = tf_ops.get_collection(
            tf_ops.GraphKeys.TRAINABLE_VARIABLES)

        opt = self._get_optimizer()

        pl_ent_loss = self.pl_ent_loss

        # Per-sample gradients of the log-probability loss.
        all_grads = []
        b_grads = []
        for i in range(self.bs):
            with tf.variable_scope('log_prob_grads'):
                grads_and_vars = opt.compute_gradients(self.log_prob_loss[i],
                                                       tf_variables)
            b_grads.append(grads_and_vars)
            all_grads.extend(grads_and_vars)

        grad_norm = clip_ops.global_norm(
            [tf.cast(g, tf.float32) for g, _ in all_grads if g is not None])
        self.logprob_grad_outs = [[g for g, _ in b_grads[i] if g is not None]
                                  for i in range(self.bs)]

        # Per-sample gradients of the placement entropy loss.
        all_grads2 = []
        b_grads2 = []
        for i in range(self.bs):
            with tf.variable_scope('placement_ent_grads'):
                grads_and_vars2 = opt.compute_gradients(
                    pl_ent_loss[i], tf_variables)
            b_grads2.append(grads_and_vars2)
            all_grads2.extend(grads_and_vars2)

        grad_norm2 = clip_ops.global_norm(
            [tf.cast(g, tf.float32) for g, _ in all_grads2 if g is not None])
        self.ent_grad_outs = [[g for g, _ in b_grads2[i] if g is not None]
                              for i in range(self.bs)]

        self.reinforce_grad_norm = tf.reduce_mean(grad_norm)
        self.entropy_grad_norm = tf.reduce_mean(grad_norm2)
        self.grad_phs = []
        gradphs_and_vars = []

        # Direct loss gradients are not produced by this method.
        self.grad_outs = None

        # `grads_and_vars` still holds the log-prob gradients of the last
        # batch index; it is only used here to decide which variables get a
        # placeholder and of which dtype.
        for i, [g, v] in enumerate(grads_and_vars):
            if g is not None:
                grad_vtype = tf.float32
                if v.dtype == tf.as_dtype('float16_ref'):
                    grad_vtype = tf.float16
                p = tf.placeholder(grad_vtype, name='grad_phs_%d' % i)
                self.grad_phs.append(p)
                gradphs_and_vars.append((p, v))

        self.grad_norm = tf.global_norm(
            [tf.cast(g, tf.float32) for g in self.grad_phs])

        clipped_grads = gradphs_and_vars
        self.gradphs_and_vars = gradphs_and_vars

        if not self.no_grad_clip:
            clipped_grads = self._clip_grads_and_vars(gradphs_and_vars,
                                                      self.grad_norm,
                                                      grad_bound)

        train_op = opt.apply_gradients(clipped_grads, self.global_step)

        return train_op, self.grad_outs, self.logprob_grad_outs, self.ent_grad_outs
Exemplo n.º 33
0
    def _get_train_ops(self,
                       loss,
                       tf_variables,
                       global_step,
                       grad_bound=1.25,
                       lr_init=1e-3,
                       lr_dec=0.9,
                       start_decay_step=10000,
                       decay_steps=100,
                       optimizer_type="adam"):
        """Loss optimizer.

    Args:
      loss: scalar tf tensor
      tf_variables: list of training variables, typically
        tf.trainable_variables()
      global_step: global_step
      grad_bound: max gradient norm
      lr_init: initial learning rate
      lr_dec: leaning rate decay coefficient
      start_decay_step: start decaying learning rate after this many steps
      decay_steps: apply decay rate factor at this step intervals
      optimizer_type: optimizer type should be either adam or sgd

    Returns:
      train_op: training op
      learning_rate: scalar learning rate tensor
      grad_norm: l2 norm of the gradient vector
      all_grad_norms: l2 norm of each component
    """
        lr_gstep = global_step - start_decay_step

        def f1():
            return constant_op.constant(lr_init)

        def f2():
            return learning_rate_decay.exponential_decay(
                lr_init, lr_gstep, decay_steps, lr_dec, True)

        learning_rate = control_flow_ops.cond(math_ops.less(
            global_step, start_decay_step),
                                              f1,
                                              f2,
                                              name="learning_rate")

        if optimizer_type == "adam":
            opt = adam.AdamOptimizer(learning_rate)
        elif optimizer_type == "sgd":
            opt = gradient_descent.GradientDescentOptimizer(learning_rate)
        grads_and_vars = opt.compute_gradients(loss, tf_variables)
        grad_norm = clip_ops.global_norm([g for g, v in grads_and_vars])
        all_grad_norms = {}
        clipped_grads = []
        clipped_rate = math_ops.maximum(grad_norm / grad_bound, 1.0)
        for g, v in grads_and_vars:
            if g is not None:
                if isinstance(g, tf_ops.IndexedSlices):
                    clipped = g.values / clipped_rate
                    norm_square = math_ops.reduce_sum(clipped * clipped)
                    clipped = tf_ops.IndexedSlices(clipped, g.indices)
                else:
                    clipped = g / clipped_rate
                    norm_square = math_ops.reduce_sum(clipped * clipped)
                all_grad_norms[v.name] = math_ops.sqrt(norm_square)
                clipped_grads.append((clipped, v))

        train_op = opt.apply_gradients(clipped_grads, global_step)
        return train_op, learning_rate, grad_norm, all_grad_norms
Exemplo n.º 34
0
  def __init__(self,config):
    """Build the TensorFlow graph of a recurrent variational auto-encoder.

    An LSTM encoder reads the input sequence; its last output is mapped to
    the mean and log-variance of a latent vector. A sample of that vector
    initialises an LSTM decoder whose per-step outputs are converted into
    distribution parameters. The total cost is the sum of the sequence
    reconstruction loss, the KL divergence of the latent posterior and the
    reconstruction loss of the sequence end point.

    Args:
      config: dict of hyperparameters with the keys 'num_layers',
        'hidden_size', 'max_grad_norm', 'batch_size', 'sl', 'mixtures',
        'crd', 'learning_rate' and 'num_l'. The input placeholder has shape
        [batch_size, crd, sl].
    """
    num_layers = config['num_layers']
    hidden_size = config['hidden_size']
    max_grad_norm = config['max_grad_norm']
    batch_size = config['batch_size']
    sl = config['sl']
    mixtures = config['mixtures']
    crd = config['crd']
    learning_rate = config['learning_rate']
    num_l = config['num_l']
    self.sl = sl
    self.crd = crd
    self.batch_size = batch_size


    #Function for initialization
    def xv_init(arg_in, arg_out,shape=None):
      """Return a uniform sample in +/- sqrt(6 / (arg_in + arg_out)).

      The tensor has shape (arg_in, arg_out) unless `shape` is given.
      """
      low = -np.sqrt(6.0/(arg_in + arg_out))
      high = np.sqrt(6.0/(arg_in + arg_out))
      # Honour an explicit shape instead of leaving tensor_shape unbound.
      tensor_shape = (arg_in, arg_out) if shape is None else shape
      return tf.random_uniform(tensor_shape, minval=low, maxval=high, dtype=tf.float32)

    # Nodes for the input variables
    self.x = tf.placeholder("float", shape=[batch_size, crd,sl], name = 'Input_data')
    # Step-to-step differences of the first three coordinates.
    x_next = tf.sub(self.x[:,:3,1:],  self.x[:,:3,:sl-1])
    xn1,xn2,xn3 = tf.split(1,3,x_next)   #Now tensors in [batch_size,1,seq_len-1]
    # Reverse along the last axis only.
    rev_dims = [False, False, True]
    xn1 = tf.reverse(xn1,rev_dims)
    xn2 = tf.reverse(xn2,rev_dims)
    xn3 = tf.reverse(xn3,rev_dims)


    with tf.variable_scope("Enc") as scope:
      cell_enc = tf.nn.rnn_cell.LSTMCell(hidden_size)
      cell_enc = tf.nn.rnn_cell.MultiRNNCell([cell_enc] * num_layers)

      #Initial state
      initial_state_enc = cell_enc.zero_state(batch_size, tf.float32)


      outputs_enc = []
      self.states_enc = []
      state = initial_state_enc
      for time_step in range(sl):
        # Share the cell weights across all unrolled time steps.
        if time_step > 0: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell_enc(self.x[:, :, time_step], state)
        outputs_enc.append(cell_output)

    # From here on `cell_output` is the encoder output of the last time step.
    with tf.name_scope("Enc_2_lat") as scope:
      #m_enc,h_enc = tf.split(1,2,self.final_state)
      #layer for mean of z
      W_mu = tf.Variable(xv_init(hidden_size,num_l))
      b_mu = tf.Variable(tf.constant(0.1,shape=[num_l],dtype=tf.float32))
      self.z_mu = tf.nn.xw_plus_b(cell_output,W_mu,b_mu)  #mu, mean, of latent space

      #layer for sigma of z
      W_sig = tf.Variable(xv_init(hidden_size,num_l))
      b_sig = tf.Variable(tf.constant(0.1,shape=[num_l],dtype=tf.float32))
      z_sig_log_sq = tf.nn.xw_plus_b(cell_output,W_sig,b_sig)  #sigma of latent space, in log-scale and squared.
      # This log_sq will save computation later on. log(sig^2) is a real number, so no sigmoid is necessary

    with tf.name_scope("Latent_space") as scope:
      # z = mu + sigma * eps with eps drawn from a standard normal.
      self.eps = tf.random_normal(tf.shape(self.z_mu),0,1,dtype=tf.float32)
      self.z = self.z_mu + tf.mul(tf.sqrt(tf.exp(z_sig_log_sq)),self.eps)   #Z is the vector in latent space

    with tf.variable_scope("Lat_2_dec") as scope:
      #Create initial vector
      params_xend = 3 + 4    # 3 (X,Y,Z) plus 4 (sx,sy,sz,rho)
      W_xend = tf.Variable(xv_init(num_l,params_xend))
      self.b_xend = tf.Variable(tf.constant(0.1,shape=[params_xend],dtype=tf.float32))
      self.parameters_xend = tf.nn.xw_plus_b(self.z,W_xend,self.b_xend)

      mu1x,mu2x,mu3x,s1x,s2x,s3x,rhox = tf.split(1,params_xend,self.parameters_xend)  #Individual vectors in [batch_size,1]
      # exp keeps the scales positive, tanh keeps the correlation in (-1, 1).
      s1x = tf.exp(s1x)
      s2x = tf.exp(s2x)
      s3x = tf.exp(s3x)
      rhox = tf.tanh(rhox)
      x_end = tf.concat(1,[mu1x,mu2x,mu3x])


      #Reconstruction loss for x_end
      xs1,xs2,xs3 = tf.split(1,3,self.x[:,:3,sl-1])
      pxend12 = tf_2d_normal(xs1, xs2, mu1x, mu2x, s1x, s2x, rhox)   #probability in x1x2 plane
      pxend3 = tf_1d_normal(xs3,mu3x,s3x)
      pxend = tf.mul(pxend12,pxend3)
      loss_xend = -tf.log(tf.maximum(pxend, 1e-20)) # at the beginning, some errors are exactly zero.
      self.cost_xstart = tf.reduce_mean(loss_xend)###tf.constant(0.0)#
      #Create initial hidden state and memory state
      W_hstart = tf.Variable(xv_init(num_l,hidden_size))
      b_hstart = tf.Variable(tf.constant(0.01,shape=[hidden_size],dtype=tf.float32))
      h_start = tf.nn.xw_plus_b(self.z,W_hstart,b_hstart)

    with tf.variable_scope("Out_layer") as scope:
      params = 7 # x,y,z,sx,sy,sz,rho
      output_units = mixtures*params  #params distribution parameters per mixture component
      W_o = tf.Variable(tf.random_normal([hidden_size,output_units], stddev=0.1))
      b_o = tf.Variable(tf.constant(0.5, shape=[output_units]))


    with tf.variable_scope("Dec") as scope:
      cell_dec = tf.nn.rnn_cell.LSTMCell(hidden_size)
      cell_dec = tf.nn.rnn_cell.MultiRNNCell([cell_dec] * num_layers)

      #Initial state
      initial_state_dec = tf.tile(h_start,[1,2*num_layers])
      PARAMS = []
      self.states = []
      state = initial_state_dec
      x_in = x_end
      x_collect = []
      x_collect.append(x_in)
      for time_step in range(sl):
        if time_step > 0: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell_dec(x_in, state)
        self.states.append(state)
        #Convert hidden state to offset for the next
        params_MDN = tf.nn.xw_plus_b(cell_output,W_o,b_o) # Now in [batch_size,output_units]
        PARAMS.append(params_MDN)
        x_in = x_in - params_MDN[:,:3]   #First three columns are the new x_in
        x_collect.append(x_in)

    #Prepare x_collect for extraction
    self.x_col = tf.pack(x_collect)   #in [seq_len, batch_size,crd]


    with tf.variable_scope("Loss_calc") as scope:
      ### Reconstruction loss
      PARAMS = tf.pack(PARAMS[:-1])
      PARAMS = tf.transpose(PARAMS,[1,2,0])  # Now in [batch_size, output_units,seq_len-1]
      mu1,mu2,mu3,s1,s2,s3,rho = tf.split(1,7,PARAMS)  #Each Tensor in [batch_size,seq_len-1]
      s1 = tf.exp(s1)
      s2 = tf.exp(s2)
      s3 = tf.exp(s3)
      rho = tf.tanh(rho)
      px1x2 = tf_2d_normal(xn1, xn2, mu1, mu2, s1, s2, rho)   #probability in x1x2 plane
      px3 = tf_1d_normal(xn3,mu3,s3)
      px1x2x3 = tf.mul(px1x2,px3)  #Now in [batch_size,1,seq_len-1]
      loss_seq = -tf.log(tf.maximum(px1x2x3, 1e-20)) # at the beginning, some errors are exactly zero.
      self.cost_seq = tf.reduce_mean(loss_seq)

      ### KL divergence between posterior on encoder and prior on z
      self.cost_kld = tf.reduce_mean(-0.5*tf.reduce_sum((1+z_sig_log_sq-tf.square(self.z_mu)-tf.exp(z_sig_log_sq)),1))   #KL divergence

      self.cost = self.cost_seq + self.cost_kld + self.cost_xstart

    with tf.name_scope("train") as scope:
      tvars = tf.trainable_variables()
      #We clip the gradients to prevent explosion
      grads = tf.gradients(self.cost, tvars)
      grads, _ = tf.clip_by_global_norm(grads,max_grad_norm)

      #Some decay on the learning rate
      global_step = tf.Variable(0,trainable=False)
      lr = tf.train.exponential_decay(learning_rate,global_step,1000,0.90,staircase=False)
      optimizer = tf.train.AdamOptimizer(lr)
      # Materialise the pairs: they are consumed twice (apply_gradients and
      # the summary loop below), which a one-shot zip iterator cannot serve.
      gradients = list(zip(grads, tvars))
      self.train_step = optimizer.apply_gradients(gradients,global_step=global_step)
      # The following block plots for every trainable variable
      #  - Histogram of the entries of the Tensor
      #  - Histogram of the gradient over the Tensor
      #  - Histogram of the gradient-norm over the Tensor
      self.numel = tf.constant([[0]])
      for gradient, variable in gradients:
        if isinstance(gradient, ops.IndexedSlices):
          grad_values = gradient.values
        else:
          grad_values = gradient

        self.numel +=tf.reduce_sum(tf.size(variable))

        tf.histogram_summary(variable.name, variable)
        # A variable the cost does not depend on has no gradient to plot.
        if grad_values is not None:
          tf.histogram_summary(variable.name + "/gradients", grad_values)
          tf.histogram_summary(variable.name + "/gradient_norm", clip_ops.global_norm([grad_values]))
    #Define one op to call all summaries
    self.merged = tf.merge_all_summaries()
Exemplo n.º 35
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Besides building the update, this adds scalar summaries for the loss (and
  its moving average when enabled) and histogram summaries for every
  variable, its gradient and its gradient norm.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training step.
    optimizer: string or function, used as optimizer for training.
    clip_gradients: float or None, clips gradients by this global norm.
    moving_average_decay: float or None, takes into account previous loss
                          to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
                            Tensors, returns Tensor. Can be used to implement
                            any learning rate decay functions.
                            For example: tf.train.exponential_decay.
    variables: list of variables to optimize or None, in which case all
               trainable variables are used.

  Returns:
    Training op: the loss tensor, checked for inf/nan, with a control
    dependency on the gradient update.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Moving average of the loss with decay.
  if moving_average_decay is not None:
    # Generate moving averages of the loss.
    loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                   name="avg")
    loss_averages_op = loss_averages.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
    loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

  # Convert optimizer into the optimizer class.
  if isinstance(optimizer, str):
    opt_cls = OPTIMIZER_CLS_NAMES[optimizer]
  elif callable(optimizer):
    opt_cls = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string or function.")

  # Learning rate variable, with possible decay.
  lr = vs.get_variable("learning_rate",
                       [],
                       trainable=False,
                       initializer=init_ops.constant_initializer(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Create optimizer.
  opt = opt_cls(learning_rate=lr)

  # All trainable variables, if specific variables are not specified.
  if variables is None:
    variables = vars_.trainable_variables()

  # Compute gradients and clip them if provided.
  # A list (not a one-shot iterator) because the pairs are walked twice:
  # once for the summaries and once by apply_gradients.
  gradients = list(opt.compute_gradients(loss, variables))
  if clip_gradients is not None and gradients:
    # clip_by_global_norm expects the gradients alone, not (grad, var) pairs.
    raw_gradients, grad_variables = zip(*gradients)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(list(raw_gradients),
                                                        clip_gradients)
    gradients = list(zip(clipped_gradients, grad_variables))

  # Add scalar summary for loss.
  logging_ops.scalar_summary("loss", loss)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in gradients:
    if isinstance(gradient, ops.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient
    logging_ops.histogram_summary(variable.name, variable)
    # Variables the loss does not depend on have no gradient to summarize.
    if grad_values is not None:
      logging_ops.histogram_summary(variable.name + "/gradients", grad_values)
      logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                    clip_ops.global_norm([grad_values]))

  # Create gradient updates.
  grad_updates = opt.apply_gradients(gradients,
                                     global_step=global_step,
                                     name="train")
  # Make sure total_loss is valid.
  final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

  # Ensure the train_tensor computes grad_updates.
  train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss)

  return train_tensor
Exemplo n.º 36
0
    # For every (gradient, variable) pair three histograms are recorded:
    #  - the entries of the variable
    #  - the gradient for the variable
    #  - the gradient norm for the variable
    # `numel` accumulates the summed size of all variables.
    numel = tf.constant([[0]])
    for gradient, variable in gradients:
      # For IndexedSlices gradients, use their `.values` tensor.
      grad_values = gradient.values if isinstance(gradient, ops.IndexedSlices) else gradient

      numel = numel + tf.reduce_sum(tf.size(variable))

      h1 = tf.histogram_summary(variable.name, variable)
      h2 = tf.histogram_summary(variable.name + "/gradients", grad_values)
      h3 = tf.histogram_summary(
          variable.name + "/gradient_norm",
          clip_ops.global_norm([grad_values]))
# Accuracy: mean over the batch of whether the arg-max of h_fc2 along axis 1
# equals the label tensor y_. Also exported as a scalar summary.
with tf.name_scope("Evaluating_accuracy") as scope:
    correct_prediction = tf.equal(tf.argmax(h_fc2,1), y_)
    # Cast the boolean matches to float so their mean is the hit rate.
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accuracy_summary = tf.scalar_summary("accuracy", accuracy)


#Define one op to call all summaries
merged = tf.merge_all_summaries()

def print_tvars():
  """Print the name of each trainable variable, one per line."""
  for tvar in tf.trainable_variables():
    print(tvar.name)
print_tvars()
Exemplo n.º 37
0
def _summarize_per_variable_gradients(gradients, summaries):
    """Add the per-variable gradient summaries selected by `summaries`.

    Args:
      gradients: iterable of (gradient, variable) pairs. For an
        `ops.IndexedSlices` gradient its `.values` are summarized; pairs whose
        gradient is `None` are skipped.
      summaries: collection of summary names. "gradients" enables one
        histogram per variable, "gradient_norm" one scalar norm per variable.
    """
    for gradient, variable in gradients:
        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        if grad_values is None:
            continue

        # ':' is replaced before the variable name is used in a summary tag.
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
            summary.histogram("gradients/%s" % var_name, grad_values)
        if "gradient_norm" in summaries:
            summary.scalar("gradient_norm/%s" % var_name,
                           clip_ops.global_norm([grad_values]))


def optimize_loss(losses,
                  global_step,
                  learning_rate,
                  optimizer,
                  num_gpus=1,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=("global_gradient_norm",),
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  use_tpu=False,
                  use_horovod=False):
    """Given losses and parameters for optimizer, returns a training op.

    Adapted from tf.contrib.layers.optimize_loss, see
    https://github.com/tensorflow/tensorflow/blob/28c3c5dd38e3b397c2cf0acdaa6388dcbf0349f7/tensorflow/contrib/layers/python/layers/optimizers.py

    Args:
      losses: list of loss Tensors. With `num_gpus > 1`, `losses[i]` is the
        loss of tower `i`; otherwise only `losses[0]` is optimized.
      global_step: Tensor, step counter for each update. Required when
        `learning_rate_decay_fn` is given.
      learning_rate: float, Tensor, `tf.Variable` or `None`, magnitude of
        update per each training step.
      optimizer: string, class, optimizer instance or callable, used as
        trainer.
          string should be a key of the OPTIMIZER_CLS_NAMES constant.
          class should be a sub-class of `optimizer_.Optimizer`; it is
            instantiated as `optimizer(learning_rate=lr)`.
          an optimizer instance is used as is.
          any other callable is called as `optimizer(lr)`, or as
            `optimizer()` when `learning_rate` is `None`; its result is not
            type-checked.
      num_gpus: when greater than 1, gradients are computed per tower under
        `/gpu:<i>` and combined with `average_gradients`.
      gradient_noise_scale: float or None; when set, gradients are passed
        through `_add_scaled_noise_to_gradients` with this scale.
      gradient_multipliers: dict or None; when set, gradients are passed
        through `_multiply_gradients` with this mapping.
      clip_gradients: `None`, a float (gradients go through
        `_clip_gradients_by_norm`) or, on the single-device path only, a
        callable that receives and returns the (gradient, variable) list. On
        the multi-GPU path any non-`None` value is forwarded to
        `_clip_gradients_by_norm`.
      learning_rate_decay_fn: function, takes `learning_rate` and
        `global_step` `Tensor`s, returns `Tensor`. Only applied when
        `learning_rate` is not `None`.
      update_ops: iterable of update `Operation`s grouped with the gradient
        update. If `None`, uses elements of the UPDATE_OPS collection.
      variables: list of variables to optimize or `None` to use all trainable
        variables.
      name: The name for this operation is used to scope operations and
        summaries.
      summaries: collection of summary names to emit. Names read here are
        "global_gradient_norm", "gradient_norm" and "gradients". `None` is
        treated like the default.
      colocate_gradients_with_ops: forwarded to `opt.compute_gradients`.
      increment_global_step: when False, `apply_gradients` is called with
        `global_step=None`.
      use_tpu: wrap the optimizer in `tf.contrib.tpu.CrossShardOptimizer`.
      use_horovod: wrap the optimizer in `hvd.DistributedOptimizer`. Must not
        be combined with `use_tpu`.

    Returns:
      The op returned by `opt.apply_gradients`, grouped with `update_ops` when
      there are any.

    Raises:
      ValueError: if the learning rate is negative or of an unsupported type,
        if `global_step` is missing while a decay function is given, if the
        optimizer is of an unsupported type or unknown name, if
        `gradient_multipliers` leaves no gradients (single-device path), or if
        `clip_gradients` has an unsupported type (single-device path).
    """
    if summaries is None:
        summaries = ("global_gradient_norm",)

    with vs.variable_scope(name, "OptimizeLoss", losses + [global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))

        # Learning rate variable, with possible decay.
        lr = None
        if learning_rate is not None:
            # NOTE(review): `and` binds tighter than `or`, so the 0-d check
            # only applies to tf.Variable; a Tensor of any rank is accepted.
            # The parentheses make the existing behaviour explicit. Confirm
            # whether Tensors were meant to be restricted to 0-d as well.
            if (isinstance(learning_rate, ops.Tensor)
                    or (isinstance(learning_rate, tf.Variable)
                        and learning_rate.get_shape().ndims == 0)):
                lr = learning_rate
            elif isinstance(learning_rate, float):
                if learning_rate < 0.0:
                    raise ValueError(
                        "Invalid learning_rate %s." % learning_rate)
                lr = vs.get_variable(
                    "learning_rate", [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate))
            else:
                raise ValueError(
                    "Learning rate should be 0d Tensor or float. "
                    "Got %s of type %s" %
                    (str(learning_rate), str(type(learning_rate))))

        if learning_rate is not None and learning_rate_decay_fn is not None:
            if global_step is None:
                raise ValueError(
                    "global_step is required for learning_rate_decay_fn.")
            lr = learning_rate_decay_fn(lr, global_step)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
        elif (isinstance(optimizer, type)
              and issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if learning_rate is not None:
                opt = optimizer(lr)
            else:
                opt = optimizer()
            # TODO all tf.keras.optimizers: the result is deliberately not
            # required to be an optimizer_.Optimizer instance.
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))

        if use_tpu:
            opt = tf.contrib.tpu.CrossShardOptimizer(opt)
        assert not (use_tpu and use_horovod)
        if use_horovod:
            # See https://blog.csdn.net/qq_16234613/article/details/96186398
            # Compression (fp16) and sparse_as_dense are left at their
            # defaults here.
            opt = hvd.DistributedOptimizer(opt)
            # Experiment note kept from the original author: plain
            # opt.minimize gave auc 0.717 (0.1 epoch, 64 * 8 gpu) versus
            # 0.7186 with the explicit gradient path below.

        if num_gpus > 1:
            # Calculate the gradients for each model tower.
            # TODO check below is all ok, right now single gpu using below
            # code will be slower then tf.contrib.optimize_loss
            # 4.5 batch/s -> 3 batch/s
            # All trainable variables, if specific variables are not
            # specified. Resolved once; every tower uses the same list.
            if variables is None:
                variables = vars_.trainable_variables()

            tower_grads = []
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('tower', i)):
                        # Compute gradients.
                        loss = losses[i]
                        try:
                            gradients = opt.compute_gradients(
                                loss,
                                variables,
                                colocate_gradients_with_ops=
                                colocate_gradients_with_ops)
                        except Exception:
                            # Fallback for optimizers that do not offer a
                            # compatible compute_gradients.
                            gradients = opt.get_updates(loss, params=variables)

                        # TODO FIXME gradients may contain None entries, for
                        # example after adding another predictor to the
                        # graph; drop those pairs for now.
                        gradients = [x for x in gradients if x[0] is not None]
                        # Optionally add gradient noise.
                        if gradient_noise_scale is not None:
                            gradients = _add_scaled_noise_to_gradients(
                                gradients, gradient_noise_scale)
                        # Multiply some gradients.
                        if gradient_multipliers is not None:
                            gradients = _multiply_gradients(
                                gradients, gradient_multipliers)
                        # Optionally clip gradients by global norm.
                        if clip_gradients is not None:
                            gradients = _clip_gradients_by_norm(
                                gradients, clip_gradients)

                        tower_grads.append(gradients)

            gradients = average_gradients(tower_grads)
            if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
                summary.scalar("global_norm/gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))

            # Add histograms for gradients and gradient norms.
            _summarize_per_variable_gradients(gradients, summaries)

            # Clipping already happened per tower, so this is computed from
            # the same averaged gradients as "global_norm/gradient_norm".
            if clip_gradients is not None and (
                    "global_gradient_norm" in summaries
                    or "gradient_norm" in summaries):
                summary.scalar("global_norm/clipped_gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))
        else:
            loss = losses[0]

            # Experiment note kept from the original author: plain
            # opt.minimize gave auc 0.72739 versus 0.72634 with the explicit
            # path below (which also clips gradients and adds summaries).

            # All trainable variables, if specific variables are not specified.
            if variables is None:
                variables = vars_.trainable_variables()

            # Compute gradients.
            try:
                gradients = opt.compute_gradients(
                    loss,
                    variables,
                    colocate_gradients_with_ops=colocate_gradients_with_ops)
            except Exception:
                # TODO not work for keras
                gradients = opt.get_updates(loss=loss, params=variables)

            # Optionally add gradient noise.
            if gradient_noise_scale is not None:
                gradients = _add_scaled_noise_to_gradients(
                    gradients, gradient_noise_scale)

            # Multiply some gradients.
            if gradient_multipliers is not None:
                gradients = _multiply_gradients(gradients,
                                                gradient_multipliers)
                if not gradients:
                    raise ValueError(
                        "Empty list of (gradient, var) pairs encountered. This is most "
                        "likely to be caused by an improper value of gradient_multipliers."
                    )

            if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
                summary.scalar("global_norm/gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))

            # Optionally clip gradients by global norm.
            if isinstance(clip_gradients, float):
                gradients = _clip_gradients_by_norm(gradients, clip_gradients)
            elif callable(clip_gradients):
                gradients = clip_gradients(gradients)
            elif clip_gradients is not None:
                raise ValueError("Unknown type %s for clip_gradients" %
                                 type(clip_gradients))

            # Add histograms for gradients and gradient norms.
            _summarize_per_variable_gradients(gradients, summaries)

            if clip_gradients is not None and (
                    "global_gradient_norm" in summaries
                    or "gradient_norm" in summaries):
                summary.scalar("global_norm/clipped_gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # IMPORTANT this is needed for momentum!
        if update_ops:
            grad_updates = [grad_updates]
            grad_updates.extend(update_ops)
            grad_updates = tf.group(*grad_updates)

        # Unlike the contrib original, the loss is neither passed through
        # check_numerics nor returned; callers get the update op itself.
        return grad_updates
Exemplo n.º 38
0
def basic_CNN(lr_rate, num_filt_1, num_filt_2, num_fc_1, num_fc_2,
              roi_dir='/home/siddhu/FBIRN/original_res/ROI_files/masked',
              mat_dir='/home/siddhu/FBIRN/original_res/mat_format',
              log_dir='/home/siddhu/FBIRN/cnn/log/'):
    """Build, train and evaluate a small CNN; return its validation accuracy.

    The graph is conv -> batch norm -> relu, twice, followed by two fully
    connected relu layers and a single sigmoid output unit trained with a
    squared-error cost and Adam. Data is loaded through `rf.read_data`,
    optionally standardized and split into train/validation parts with
    `rf.random_split`. The default graph is reset before returning.

    Args:
        lr_rate: learning rate handed to `tf.train.AdamOptimizer`.
        num_filt_1: number of filters in the first conv layer.
        num_filt_2: number of filters in the second conv layer.
        num_fc_1: number of neurons in the first fully connected layer.
        num_fc_2: number of neurons in the second fully connected layer.
        roi_dir: first path argument forwarded to `rf.read_data`.
        mat_dir: second path argument forwarded to `rf.read_data`.
        log_dir: directory the TensorBoard summaries are written to.

    Returns:
        The accuracy on the validation split after the last training
        iteration.
    """
    # --- Hyperparameters ---
    filt_1 = [num_filt_1, 5]    # [number of filters, filter width], conv 1
    filt_2 = [num_filt_2, 5]    # [number of filters, filter width], conv 2
    max_iterations = 4000
    batch_size = 10
    dropout = 0.5       # Value fed to keep_prob during training steps
    learning_rate = lr_rate
    input_cent = True   # Standardize the data with rf.standardize?
    sl = 137            # Sequence length
    ratio = 0.8         # Ratio for train-val split
    crd = 264           # How many coordinates you feed
    sl_pad = 2
    # Width left after the VALID convolution over the padded sequence. The
    # "// 1" is the stride, kept for future experiments with other strides.
    # Floor division keeps D an int (it is used as a shape) on Python 3 too.
    D = (sl + sl_pad - filt_1[1]) // 1 + 1
    plot_every = 100    # How often performance is measured and logged

    # --- Load the data ---
    data, labels, _ = rf.read_data(roi_dir, mat_dir, [3])
    print('We have %s observations with a sequence length of %s ' % (data.shape[0], sl))

    # Standardize the data conditionally
    if input_cent:
        data = rf.standardize(data)

    # Shuffle and split the data
    (X_train, X_val, y_train, y_val) = rf.random_split(data, labels, ratio=ratio)

    N = X_train.shape[0]
    data = None  # we don't need to store this big matrix anymore

    # Make the labels 0-based
    base = np.min(y_train)
    if base != 0:
        y_train -= base
        y_val -= base

    # Nodes for the input variables
    x = tf.placeholder("float", shape=[None, crd, sl], name='Input_data')
    y_ = tf.placeholder(tf.int64, shape=[None], name='Ground_truth')
    keep_prob = tf.placeholder("float")
    bn_train = tf.placeholder(tf.bool)          # Boolean value to guide batchnorm

    # Helpers for initializing variables and standard layers
    def weight_variable(shape, name):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(shape, name):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def conv2d(x, W):
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    with tf.name_scope("Reshaping_data") as scope:
        x_feed = tf.expand_dims(x, dim=3, name='x_feed')
        # Pad only the end of the sequence axis by sl_pad steps.
        x_pad = tf.pad(x_feed, [[0, 0], [0, 0], [0, sl_pad], [0, 0]])

    # --- Build the graph ---
    with tf.name_scope("Conv1") as scope:
        W_conv1 = weight_variable([crd, filt_1[1], 1, filt_1[0]], 'Conv_Layer_1')
        b_conv1 = bias_variable([filt_1[0]], 'bias_for_Conv_Layer_1')
        a_conv1 = tf.add(tf.nn.conv2d(x_pad, W_conv1, strides=[1, 1, 1, 1], padding='VALID'), b_conv1)

    with tf.name_scope('Batch_norm_conv1') as scope:
        a_conv1_bn = batch_norm(a_conv1, filt_1[0], bn_train, 'bn1')
        h_conv1 = tf.nn.relu(a_conv1_bn)
        tf.histogram_summary('a_conv1_bn', a_conv1_bn)
        tf.histogram_summary('a_conv1', a_conv1)

    with tf.name_scope("Conv2") as scope:
        W_conv2 = weight_variable([1, filt_2[1], filt_1[0], filt_2[0]], 'Conv_Layer_2')
        b_conv2 = bias_variable([filt_2[0]], 'bias_for_Conv_Layer_2')
        a_conv2 = conv2d(h_conv1, W_conv2) + b_conv2

    with tf.name_scope('Batch_norm_conv2') as scope:
        a_conv2 = batch_norm(a_conv2, filt_2[0], bn_train, 'bn2')
        h_conv2 = tf.nn.relu(a_conv2)

    with tf.name_scope("Fully_Connected1") as scope:
        W_fc1 = weight_variable([D * filt_2[0], num_fc_1], 'Fully_Connected_layer_1')
        b_fc1 = bias_variable([num_fc_1], 'bias_for_Fully_Connected_Layer_1')
        h_conv2_flat = tf.reshape(h_conv2, [-1, D * filt_2[0]])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)

    with tf.name_scope("Fully_Connected2") as scope:
        W_fc2 = weight_variable([num_fc_1, num_fc_2], 'Fully_Connected_layer_2')
        b_fc2 = bias_variable([num_fc_2], 'bias_for_Fully_Connected_Layer_2')
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

    with tf.name_scope("Output") as scope:
        # postfix _o represents variables for the output layer
        h_o_drop = tf.nn.dropout(h_fc2, keep_prob)
        W_o = tf.Variable(tf.truncated_normal([num_fc_2, 1], stddev=0.1), name='W_o')
        b_o = tf.Variable(tf.constant(0.1, shape=[1]), name='b_o')
        h_o = tf.matmul(h_o_drop, W_o) + b_o
        sm_o = tf.sigmoid(h_o)

    with tf.name_scope("Sigmoid") as scope:
        # sm_o is [batch, 1] while y_ is [batch]. Subtracting them directly
        # broadcasts to [batch, batch] and compares every prediction with
        # every label, so flatten the predictions first.
        sm_flat = tf.reshape(sm_o, [-1])
        loss = tf.square(sm_flat - tf.to_float(y_))
        cost = tf.reduce_mean(loss)
        # The tag says cross entropy but the cost is a mean squared error;
        # the tag is kept so existing TensorBoard logs stay comparable.
        tf.scalar_summary("cross entropy_loss", cost)

    with tf.name_scope("train") as scope:
        tvars = tf.trainable_variables()
        # Gradients are applied as computed; no clipping is performed.
        grads = tf.gradients(cost, tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        # Materialize the pairs: on Python 3 zip() is a one-shot iterator
        # that apply_gradients would exhaust before the summary loop below.
        gradients = list(zip(grads, tvars))
        train_step = optimizer.apply_gradients(gradients)
        # The following block plots for every trainable variable
        #  - Histogram of the entries of the Tensor
        #  - Histogram of the gradient over the Tensor
        #  - Histogram of the gradient-norm over the Tensor
        numel = tf.constant([[0]])   # running count of trainable elements
        for gradient, variable in gradients:
            numel += tf.reduce_sum(tf.size(variable))
            tf.histogram_summary(variable.name, variable)

            if gradient is None:
                # Variable does not influence the cost; nothing to plot.
                continue
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient
            tf.histogram_summary(variable.name + "/gradients", grad_values)
            tf.histogram_summary(variable.name + "/gradient_norm", clip_ops.global_norm([grad_values]))
        # tf.gradients returns a list. We cannot fetch a list, therefore we
        # take the tensor that is the 0-th element of the list.
        vis = tf.gradients(loss, x_feed)[0]

    with tf.name_scope("Evaluating_accuracy") as scope:
        # The network has one sigmoid output, so an arg-max over axis 1 is
        # always 0. Threshold the probability instead.
        # NOTE(review): assumes labels are 0/1 after the base shift above.
        predicted = tf.cast(tf.greater(sm_flat, 0.5), tf.int64)
        correct_prediction = tf.equal(predicted, y_)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        tf.scalar_summary("accuracy", accuracy)

    # Define one op to call all summaries
    merged = tf.merge_all_summaries()

    # For now, we collect performances in a Numpy array: rows are
    # train accuracy, train cost, validation accuracy, validation cost.
    n_reports = (max_iterations + plot_every - 1) // plot_every
    perf_collect = np.zeros((4, n_reports))

    val_feed = {x: X_val, y_: y_val, keep_prob: 1.0, bn_train: False}
    train_eval_feed = {x: X_train, y_: y_train, keep_prob: 1.0, bn_train: False}

    with tf.Session() as sess:
        writer = tf.train.SummaryWriter(log_dir, sess.graph)
        try:
            sess.run(tf.initialize_all_variables())

            step = 0      # Column of perf_collect to fill next
            for i in range(max_iterations):
                batch_ind = np.random.choice(N, batch_size, replace=False)

                if i == 0:
                    # Use this value to check before-and-after test accuracy
                    acc_test_before = sess.run(accuracy, feed_dict=val_feed)
                if i % plot_every == 0:
                    # Check training performance
                    result = sess.run([accuracy, cost], feed_dict=train_eval_feed)
                    perf_collect[0, step] = result[0]
                    perf_collect[1, step] = result[1]

                    # Check validation performance
                    result = sess.run([accuracy, cost, merged], feed_dict=val_feed)
                    perf_collect[2, step] = result[0]
                    perf_collect[3, step] = result[1]

                    # Write information to TensorBoard
                    writer.add_summary(result[2], i)
                    writer.flush()  # make sure the summaries reach the log file
                    step += 1
                sess.run(train_step, feed_dict={x: X_train[batch_ind], y_: y_train[batch_ind],
                                                keep_prob: dropout, bn_train: True})
            # Besides the accuracy we also fetch the element count, the
            # sigmoid outputs and the padded input.
            result = sess.run([accuracy, numel, sm_o, x_pad], feed_dict=val_feed)
            acc_test = result[0]
        finally:
            # Release the event file even when training fails.
            writer.close()
    tf.reset_default_graph()
    return acc_test
Exemplo n.º 39
0
    def __init__(self, config):
        """Build the LSTM classification graph described by `config`.

        Creates the placeholders, a stacked LSTM with output dropout, a
        softmax layer on the last time step, the cost and accuracy nodes, an
        Adam training op with globally clipped gradients, and TensorBoard
        histograms for every trainable variable and its gradient.

        Args:
            config: dict providing 'num_layers', 'hidden_size',
                'max_grad_norm', 'batch_size', 'sl' (width of the input
                placeholder), 'learning_rate' and 'num_classes'.
        """
        num_layers = config['num_layers']
        hidden_size = config['hidden_size']
        max_grad_norm = config['max_grad_norm']
        self.batch_size = config['batch_size']
        sl = config['sl']
        learning_rate = config['learning_rate']
        num_classes = config['num_classes']

        # Place holders
        self.input = tf.placeholder(tf.float32, [None, sl], name='input')
        self.labels = tf.placeholder(tf.int64, [None], name='labels')
        self.keep_prob = tf.placeholder("float", name='Drop_out_keep_prob')

        with tf.name_scope("LSTM_setup"):
            cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell, output_keep_prob=self.keep_prob)
            cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers)
            initial_state = cell.zero_state(self.batch_size, tf.float32)

        # One [batch, 1] tensor per time step.
        input_list = tf.unstack(tf.expand_dims(self.input, axis=2), axis=1)
        outputs, _ = tf.nn.seq2seq.rnn_decoder(input_list, initial_state, cell)

        output = outputs[-1]

        # Generate a classification from the last cell_output.
        # Note, this is where timeseries classification differs from sequence
        # to sequence modelling. We only output to Softmax at last time step.
        with tf.name_scope("Softmax"):
            with tf.variable_scope("Softmax_params"):
                softmax_w = tf.get_variable("softmax_w",
                                            [hidden_size, num_classes])
                softmax_b = tf.get_variable("softmax_b", [num_classes])
            logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
            # Use sparse Softmax because we have mutually exclusive classes
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, self.labels, name='softmax')
            self.cost = tf.reduce_sum(loss) / self.batch_size
        with tf.name_scope("Evaluating_accuracy"):
            correct_prediction = tf.equal(tf.argmax(logits, 1), self.labels)
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                                   "float"))

        # Optimizer
        with tf.name_scope("Optimizer"):
            tvars = tf.trainable_variables()
            # We clip the gradients to prevent explosion
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.cost, tvars),
                max_grad_norm)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # Materialize the pairs: on Python 3 zip() is a one-shot iterator
            # that apply_gradients would exhaust, leaving nothing for the
            # summary loop below.
            gradients = list(zip(grads, tvars))
            self.train_op = optimizer.apply_gradients(gradients)
            # Add histograms for variables, gradients and gradient norms,
            # one set per trainable variable.
            for gradient, variable in gradients:
                tf.summary.histogram(variable.name, variable)

                if gradient is None:
                    # Variable does not influence the cost; nothing to plot.
                    continue
                if isinstance(gradient, ops.IndexedSlices):
                    grad_values = gradient.values
                else:
                    grad_values = gradient

                tf.summary.histogram(variable.name + "/gradients", grad_values)
                tf.summary.histogram(variable.name + "/gradient_norm",
                                     clip_ops.global_norm([grad_values]))

        # Final code for the TensorBoard
        self.merged = tf.summary.merge_all()
        self.init_op = tf.global_variables_initializer()
Exemplo n.º 40
0
  def __init__(self,config):
    """Build the LSTM classification graph described by `config`.

    Creates the placeholders, a stacked LSTM with output dropout run through
    `core_rnn.static_rnn`, a softmax layer on the last time step, cost and
    accuracy nodes with scalar summaries, an Adam training op with globally
    clipped gradients, and histograms for every trainable variable and its
    gradient.

    Args:
      config: dict providing 'num_layers', 'hidden_size', 'max_grad_norm',
        'batch_size', 'sl' (width of the input placeholder), 'learning_rate'
        and 'num_classes'.
    """
    num_layers = config['num_layers']
    hidden_size = config['hidden_size']
    max_grad_norm = config['max_grad_norm']
    self.batch_size = config['batch_size']
    sl = config['sl']
    learning_rate = config['learning_rate']
    num_classes = config['num_classes']

    # Place holders
    self.input = tf.placeholder(tf.float32, [None, sl], name = 'input')
    self.labels = tf.placeholder(tf.int64, [None], name='labels')
    self.keep_prob = tf.placeholder("float", name = 'Drop_out_keep_prob')

    with tf.name_scope("LSTM_setup"):
      def single_cell():
        return tf.contrib.rnn.DropoutWrapper(LSTMCell(hidden_size),output_keep_prob=self.keep_prob)

      # A fresh cell per layer; static_rnn below builds its own initial
      # state from dtype, so no explicit zero state is created here.
      cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])

    # One [batch, 1] tensor per time step.
    input_list = tf.unstack(tf.expand_dims(self.input,axis=2),axis=1)
    outputs,_ = core_rnn.static_rnn(cell, input_list, dtype=tf.float32)

    output = outputs[-1]

    # Generate a classification from the last cell_output
    with tf.name_scope("Softmax"):
      with tf.variable_scope("Softmax_params"):
        softmax_w = tf.get_variable("softmax_w", [hidden_size, num_classes])
        softmax_b = tf.get_variable("softmax_b", [num_classes])
      logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=self.labels,name = 'softmax')
      self.cost = tf.reduce_sum(loss) / self.batch_size
    with tf.name_scope("Evaluating_accuracy"):
      correct_prediction = tf.equal(tf.argmax(logits,1),self.labels)
      self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
      tf.summary.scalar('accuracy',self.accuracy)
      tf.summary.scalar('cost', self.cost)

    # Optimizer
    with tf.name_scope("Optimizer"):
      tvars = tf.trainable_variables()
      grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),max_grad_norm)
      optimizer = tf.train.AdamOptimizer(learning_rate)
      # Materialize the pairs: on Python 3 zip() is a one-shot iterator that
      # apply_gradients would exhaust, leaving nothing for the loop below.
      gradients = list(zip(grads, tvars))
      self.train_op = optimizer.apply_gradients(gradients)

      # Plot each trainable variable, its gradient and the gradient norm.
      for gradient, variable in gradients:
        tf.summary.histogram(variable.name, variable)

        if gradient is None:
          # Variable does not influence the cost; nothing to plot.
          continue
        if isinstance(gradient, ops.IndexedSlices):
          grad_values = gradient.values
        else:
          grad_values = gradient

        tf.summary.histogram(variable.name + "/gradients", grad_values)
        tf.summary.histogram(variable.name + "/gradient_norm", clip_ops.global_norm([grad_values]))

    # Final code for the TensorBoard. Merging the summaries is disabled;
    # a constant placeholder op is exposed instead.
    #self.merged = tf.summary.merge_all()
    self.merged = tf.constant(1)
    self.init_op = tf.global_variables_initializer()
    print('Finished computation graph')
Exemplo n.º 41
0
def optimize_loss(
    loss,
    global_step,
    learning_rate,
    optimizer,
    gradient_noise_scale=None,
    gradient_multipliers=None,
    clip_gradients=None,
    learning_rate_decay_fn=None,
    update_ops=None,
    variables=None,
    name=None,
    summaries=None,
):
    """Build a training tensor that minimizes `loss` with the given optimizer.

    The `optimizer` argument is accepted in four forms:

    - A string that is a key of OPTIMIZER_CLS_NAMES, e.g.
      `optimize_loss(..., optimizer='Adam')`. Requires a learning rate.
    - A subclass of `Optimizer` whose constructor accepts `learning_rate`,
      e.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
      Requires a learning rate.
    - An already constructed `Optimizer` instance, e.g.
      `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.
    - Any other callable. It is called with the learning rate tensor when
      `learning_rate` is not `None`, and with no arguments otherwise, and
      must return an `Optimizer` instance, e.g.
      `optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5)`.

    Args:
        loss: Tensor to minimize.
        global_step: Tensor handed to `learning_rate_decay_fn` and to the
            optimizer's `apply_gradients`.
        learning_rate: `None`, a Python float, or a 0-d `Tensor`. A float is
            stored in a non-trainable variable named "learning_rate".
        optimizer: String, `Optimizer` subclass, `Optimizer` instance or
            callable, as described above.
        gradient_noise_scale: If not `None`, forwarded together with the
            gradients to `_add_scaled_noise_to_gradients`.
        gradient_multipliers: If not `None`, forwarded together with the
            gradients to `_multiply_gradients`.
        clip_gradients: If not `None`, forwarded together with the gradients
            to `_clip_gradients_by_norm`.
        learning_rate_decay_fn: Optional callable taking the learning rate
            and `global_step` and returning the decayed learning rate. Only
            used when `learning_rate` is not `None`.
        update_ops: Ops that must run before the loss is computed. When
            `None`, the UPDATE_OPS graph collection is used instead.
        variables: Variables to optimize. When `None`, all trainable
            variables are used.
        name: Name for the variable scope wrapping everything created here;
            the default scope name is "OptimizeLoss".
        summaries: Names of the quantities to summarize. Defaults to
            `["loss", "learning_rate"]`; "gradients" and "gradient_norm"
            are recognized as well.

    Returns:
        The loss tensor with a control dependency on the gradient update op.

    Raises:
        ValueError: If `learning_rate` is neither a float nor a 0-d Tensor,
            if a string or class optimizer is given without a learning rate,
            if the optimizer name is unknown, if a callable optimizer does
            not return an `Optimizer`, or if `optimizer` has an unsupported
            type.
    """
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Without explicit update ops, fall back to the graph collection.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Tie the update ops to the loss so they run before it is evaluated.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        # Resolve the learning rate into a tensor/variable (or leave as None).
        lr_tensor = None
        if learning_rate is not None:
            is_scalar_tensor = (
                isinstance(learning_rate, ops.Tensor)
                and learning_rate.get_shape().ndims == 0
            )
            if is_scalar_tensor:
                lr_tensor = learning_rate
            elif isinstance(learning_rate, float):
                lr_tensor = vs.get_variable(
                    "learning_rate",
                    [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate),
                )
            else:
                raise ValueError(
                    "Learning rate should be 0d Tensor or float. Got %s of type %s"
                    % (str(learning_rate), str(type(learning_rate)))
                )

        if summaries is None:
            summaries = ["loss", "learning_rate"]

        # The learning rate is only summarized on the decay path.
        apply_decay = learning_rate is not None and learning_rate_decay_fn is not None
        if apply_decay:
            lr_tensor = learning_rate_decay_fn(lr_tensor, global_step)
            if "learning_rate" in summaries:
                logging_ops.scalar_summary("learning_rate", lr_tensor)

        # Turn whatever the caller passed as `optimizer` into an instance.
        if isinstance(optimizer, six.string_types):
            if lr_tensor is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if optimizer is string (%s)."
                    % optimizer
                )
            if optimizer not in OPTIMIZER_CLS_NAMES:
                known_names = ", ".join(OPTIMIZER_CLS_NAMES)
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s."
                    % (known_names, optimizer)
                )
            optimizer_cls = OPTIMIZER_CLS_NAMES[optimizer]
            opt_instance = optimizer_cls(learning_rate=lr_tensor)
        elif isinstance(optimizer, type) and issubclass(optimizer, optimizer_.Optimizer):
            if lr_tensor is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if optimizer is class (%s)."
                    % optimizer
                )
            opt_instance = optimizer(learning_rate=lr_tensor)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt_instance = optimizer
        elif callable(optimizer):
            # Factory function: pass the learning rate only if one was given.
            opt_instance = optimizer(lr_tensor) if learning_rate is not None else optimizer()
            if not isinstance(opt_instance, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return subclass of Optimizer. Got %s."
                    % str(opt_instance)
                )
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, subclass of Optimizer, "
                "instance of subclass of Optimizer or function with one argument. Got %s."
                % str(optimizer)
            )

        # Default to every trainable variable in the graph.
        if variables is None:
            variables = vars_.trainable_variables()

        # Gradient pipeline: compute -> noise -> multipliers -> clipping.
        grads_and_vars = opt_instance.compute_gradients(loss, variables)
        if gradient_noise_scale is not None:
            grads_and_vars = _add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale)
        if gradient_multipliers is not None:
            grads_and_vars = _multiply_gradients(grads_and_vars, gradient_multipliers)
        if clip_gradients is not None:
            grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients)

        if "loss" in summaries:
            logging_ops.scalar_summary("loss", loss)

        # Per-variable histograms of the gradients and of their norms.
        for grad, var in grads_and_vars:
            # Sparse gradients carry their dense payload in `.values`.
            dense_grad = grad.values if isinstance(grad, ops.IndexedSlices) else grad
            if dense_grad is None:
                continue
            if "gradients" in summaries:
                logging_ops.histogram_summary(var.name + "/gradients", dense_grad)
            if "gradient_norm" in summaries:
                logging_ops.histogram_summary(
                    var.name + "/gradient_norm", clip_ops.global_norm([dense_grad])
                )

        # Apply the gradients, then hand back the loss gated on that update.
        apply_op = opt_instance.apply_gradients(
            grads_and_vars, global_step=global_step, name="train"
        )
        return control_flow_ops.with_dependencies([apply_op], loss)
  def _get_train_ops(self,
                     loss,
                     tf_variables,
                     global_step,
                     grad_bound=1.25,
                     lr_init=1e-3,
                     lr_dec=0.9,
                     start_decay_step=10000,
                     decay_steps=100,
                     optimizer_type="adam"):
    """Loss optimizer.

    Args:
      loss: scalar tf tensor
      tf_variables: list of training variables, typically
        tf.trainable_variables()
      global_step: global_step
      grad_bound: max gradient norm
      lr_init: initial learning rate
      lr_dec: leaning rate decay coefficient
      start_decay_step: start decaying learning rate after this many steps
      decay_steps: apply decay rate factor at this step intervals
      optimizer_type: optimizer type should be either adam or sgd

    Returns:
      train_op: training op
      learning_rate: scalar learning rate tensor
      grad_norm: l2 norm of the gradient vector
      all_grad_norms: l2 norm of each component
    """
    lr_gstep = global_step - start_decay_step

    def f1():
      return constant_op.constant(lr_init)

    def f2():
      return learning_rate_decay.exponential_decay(lr_init, lr_gstep,
                                                   decay_steps, lr_dec, True)

    learning_rate = control_flow_ops.cond(
        math_ops.less(global_step, start_decay_step),
        f1,
        f2,
        name="learning_rate")

    if optimizer_type == "adam":
      opt = adam.AdamOptimizer(learning_rate)
    elif optimizer_type == "sgd":
      opt = gradient_descent.GradientDescentOptimizer(learning_rate)
    grads_and_vars = opt.compute_gradients(loss, tf_variables)
    grad_norm = clip_ops.global_norm([g for g, v in grads_and_vars])
    all_grad_norms = {}
    clipped_grads = []
    clipped_rate = math_ops.maximum(grad_norm / grad_bound, 1.0)
    for g, v in grads_and_vars:
      if g is not None:
        if isinstance(g, tf_ops.IndexedSlices):
          clipped = g.values / clipped_rate
          norm_square = math_ops.reduce_sum(clipped * clipped)
          clipped = tf_ops.IndexedSlices(clipped, g.indices)
        else:
          clipped = g / clipped_rate
          norm_square = math_ops.reduce_sum(clipped * clipped)
        all_grad_norms[v.name] = math_ops.sqrt(norm_square)
        clipped_grads.append((clipped, v))

    train_op = opt.apply_gradients(clipped_grads, global_step)
    return train_op, learning_rate, grad_norm, all_grad_norms
Exemplo n.º 43
0
    #  - Histogram of the gradient-norm over the Tensor
    if True:
        numel = tf.constant([[0]])
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            numel += tf.reduce_sum(tf.size(variable))

            h1 = tf.histogram_summary(variable.name, variable)
            h2 = tf.histogram_summary(variable.name + "/gradients",
                                      grad_values)
            h3 = tf.histogram_summary(variable.name + "/gradient_norm",
                                      clip_ops.global_norm([grad_values]))
with tf.name_scope("Evaluating") as scope:
    correct_prediction = tf.equal(tf.argmax(h_fc2, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accuracy_summary = tf.scalar_summary("accuracy", accuracy)

merged = tf.merge_all_summaries()

# For now, we collect performances in a Numpy array.
# In future releases, I hope TensorBoard allows for more
# flexibility in plotting
perf_collect = np.zeros((4, int(np.floor(max_iterations / plot_every))))

with tf.Session() as sess:
    print('Start session')
    writer = tf.train.SummaryWriter("/home/rob/Dropbox/ml_projects/FCN/log_tb",
Exemplo n.º 44
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True):
    """Given loss and parameters for optimizer, returns a training op.

    Various ways of passing optimizers include:

    - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
        for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
    - by function taking learning rate `Tensor` as argument and returning an
        `Optimizer` instance. E.g. `optimize_loss(...,
        optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
      Alternatively, if `learning_rate` is `None`, the function takes no
      arguments. E.g. `optimize_loss(..., learning_rate=None,
        optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
    - by a subclass of `Optimizer` having a single-argument constructor
        (the argument is the learning rate), such as AdamOptimizer or
        AdagradOptimizer. E.g. `optimize_loss(...,
        optimizer=tf.train.AdagradOptimizer)`.
    - by an instance of a subclass of `Optimizer`.
        E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

    Args:
      loss: Scalar `Tensor`.
      global_step: Scalar int `Tensor`, step counter to update on each step
                   unless `increment_global_step` is `False`. If not supplied,
                   it will be fetched from the default graph (see
                   `tf.train.get_global_step` for details). If it has
                   not been created, no step will be incremented with each
                   weight update. `learning_rate_decay_fn` requires
                   `global_step`.
      learning_rate: float or `Tensor`, magnitude of update per each training
                     step. Can be `None`.
      optimizer: string, class or optimizer instance, used as trainer.
                 string should be name of optimizer, like 'SGD',
                   'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES
                   constant.
                 class should be sub-class of `tf.Optimizer` that implements
                   `compute_gradients` and `apply_gradients` functions.
                 optimizer instance should be instantiation of `tf.Optimizer`
                   sub-class and have `compute_gradients` and
                   `apply_gradients` functions.
      gradient_noise_scale: float or None, adds 0-mean normal noise scaled by
                            this value.
      gradient_multipliers: dict of variables or variable names to floats.
                            If present, gradients for specified
                            variables will be multiplied by given constant.
      clip_gradients: float, callable or `None`. If a float is provided, a
        global clipping is applied to prevent the norm of the gradient from
        exceeding this value. Alternatively, a callable can be provided,
        e.g., `adaptive_clipping_fn()`. This callable takes a list of
        `(gradients, variables)` tuples and returns the same thing with the
        gradients modified.
      learning_rate_decay_fn: function, takes `learning_rate` and
                              `global_step` `Tensor`s, returns `Tensor`.
                              Can be used to implement any learning rate decay
                              functions.
                              For example: `tf.train.exponential_decay`.
                              Ignored if `learning_rate` is not supplied.
      update_ops: list of update `Operation`s to execute at each step. If
                  `None`, uses elements of UPDATE_OPS collection. The order of
                  execution between `update_ops` and `loss` is
                  non-deterministic.
      variables: list of variables to optimize or
                 `None` to use all trainable variables.
      name: The name for this operation is used to scope operations and
            summaries.
      summaries: List of internal quantities to visualize on tensorboard. If
                 not set, the loss, the learning rate, and the global norm of
                 the gradients will be reported. The complete list of possible
                 values is in OPTIMIZER_SUMMARIES.
      colocate_gradients_with_ops: If True, try colocating gradients with the
                                   corresponding op.
      increment_global_step: Whether to increment `global_step`. If your model
        calls `optimize_loss` multiple times per training step (e.g. to
        optimize different parts of the model), use this arg to avoid
        incrementing `global_step` more times than necessary.

    Returns:
      Training op.

    Raises:
      ValueError: if:
          * `loss` is an invalid type or shape.
          * `global_step` is an invalid type or shape.
          * `learning_rate` is an invalid type or value.
          * `optimizer` has the wrong type.
          * `clip_gradients` is neither float nor callable.
          * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
            `global_step` is available.
          * `gradients` is empty.
    """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = train.get_global_step()
    else:
        train.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are ran before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        # Learning rate variable, with possible decay.
        lr = None
        if learning_rate is not None:
            if (isinstance(learning_rate, ops.Tensor)
                    and learning_rate.get_shape().ndims == 0):
                lr = learning_rate
            elif isinstance(learning_rate, float):
                if learning_rate < 0.0:
                    # Format the value into the message; passing it as a
                    # second exception argument would leave a raw "%s" in
                    # the error text.
                    raise ValueError(
                        "Invalid learning_rate %s." % learning_rate)
                lr = vs.get_variable(
                    "learning_rate", [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate))
            else:
                raise ValueError(
                    "Learning rate should be 0d Tensor or float. "
                    "Got %s of type %s" %
                    (str(learning_rate), str(type(learning_rate))))
        if summaries is None:
            summaries = ["loss", "learning_rate", "global_gradient_norm"]
        else:
            # Reject unknown summary names early.
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError(
                        "Summaries should be one of [%s], you provided %s." %
                        (", ".join(OPTIMIZER_SUMMARIES), summ))
        if learning_rate is not None and learning_rate_decay_fn is not None:
            if global_step is None:
                raise ValueError(
                    "global_step is required for learning_rate_decay_fn.")
            lr = learning_rate_decay_fn(lr, global_step)
            # The learning rate is only summarized on the decay path.
            if "learning_rate" in summaries:
                summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
        elif (isinstance(optimizer, type)
              and issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            # Factory function: pass the learning rate only if one was given.
            if learning_rate is not None:
                opt = optimizer(lr)
            else:
                opt = optimizer()
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return "
                    "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))

        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops)

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers."
                )

        # Global norm of the gradients before any clipping is applied.
        if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
            summary.scalar("global_norm/gradient_norm",
                           clip_ops.global_norm(list(zip(*gradients))[0]))

        # Optionally clip gradients by global norm.
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError("Unknown type %s for clip_gradients" %
                             type(clip_gradients))

        # Add scalar summary for loss.
        if "loss" in summaries:
            summary.scalar("loss", loss)

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            # Sparse gradients carry their dense payload in `.values`.
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if grad_values is not None:
                # Variable names contain ":", which is replaced before the
                # name is used in a summary tag.
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name, grad_values)
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))

        # Global norm after clipping, reported only when clipping was requested.
        if clip_gradients is not None and ("global_gradient_norm" in summaries
                                           or "gradient_norm" in summaries):
            summary.scalar("global_norm/clipped_gradient_norm",
                           clip_ops.global_norm(list(zip(*gradients))[0]))

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

        return train_tensor
Exemplo n.º 45
0
def test_model(dataset, pool_pctg, layer_size_1):

    X_train, y_train, X_test, y_test = loadECG()

    X_train = np.swapaxes(X_train, 1, 2)
    X_test = np.swapaxes(X_test, 1, 2)

    n = max([
        np.max([v.shape[0] for v in X_train]),
        np.max([v.shape[0] for v in X_test])
    ])
    if n % STRIDE_WIDTH != 0:
        n = n + (STRIDE_WIDTH - (n % STRIDE_WIDTH))

    X_train = standardize_ts_lengths(X_train, n)
    X_test = standardize_ts_lengths(X_test, n)

    X_train = np.swapaxes(X_train, 1, 2)
    X_test = np.swapaxes(X_test, 1, 2)
    N = X_train.shape[0]
    Ntest = X_test.shape[0]
    D = X_train.shape[1]
    D_ts = X_train.shape[2]

    print "X shape: ", X_train.shape[0], X_train.shape[1], X_train.shape[2]
    X_val = X_test[:2]
    y_val = y_test[:2]
    X_test = X_test[2:]
    y_test = y_test[2:]

    num_classes = len(np.unique(y_train))
    num_fc_1 = layer_size_1
    epochs = np.floor(batch_size * max_iterations / N)
    pool_width = max(int(POOL_PCTG * D), 2)
    print('Train with approximately %d epochs' % (epochs))

    x_tensor = tf.placeholder("float",
                              shape=[None, D, D_ts],
                              name='Input_data')
    y_ = tf.placeholder(tf.int64, shape=[None], name='Ground_truth')
    keep_prob = tf.placeholder("float")

    with tf.name_scope("Reshaping_data") as scope:
        x_image = tf.reshape(x_tensor, [-1, D, D_ts, 1])

        x_image_fc_input = tf.reshape
        initializer = tf.contrib.layers.xavier_initializer()
        """Build the graph"""
        # ewma is the decay for which we update the moving average of the
        # mean and variance in the batch-norm layers
    with tf.name_scope("Conv1") as scope:
        W_conv1 = tf.get_variable("Conv_Layer_1",
                                  shape=[1, FILTER_SIZE, 1, num_filt_1],
                                  initializer=initializer)
        b_conv1 = bias_variable([num_filt_1], 'bias_for_Conv_Layer_1')
        a_conv1 = conv2d(x_image, W_conv1) + b_conv1

        h_relu = tf.nn.relu(a_conv1)
        h_conv1 = max_pool_2x2(h_relu, pool_width)

    with tf.name_scope("Globally_Informed") as scope:
        W_fc0 = tf.get_variable("Fully_Connected_0",
                                shape=[PADDED_LENGTH, num_fc_0],
                                initializer=initializer)
        b_fc0 = bias_variable([num_fc_0], 'bias_for_Fully_Connected_Layer_0')
        h_fc0 = tf.nn.relu(
            tf.matmul(tf.contrib.layers.flatten(x_image), W_fc0) + b_fc0)

    # Output of convolutional layer and the fully informed one go into
    with tf.name_scope("Fully_Connected1") as scope:
        # Code for network without fully-connected inputs
        #h_conv3_flat = tf.contrib.layers.flatten(h_conv1)
        #W_fc1 = tf.get_variable("Fully_Connected_layer_1", shape=[D*num_filt_1*D_ts*(1./STRIDE_WIDTH), num_fc_1],initializer=initializer)

        h_conv3_flat = tf.concat([tf.contrib.layers.flatten(h_conv1), h_fc0],
                                 1)
        W_fc1 = tf.get_variable("Fully_Connected_layer_1",
                                shape=[
                                    D * num_filt_1 * D_ts *
                                    (1. / STRIDE_WIDTH) + num_fc_0, num_fc_1
                                ],
                                initializer=initializer)

        b_fc1 = bias_variable([num_fc_1], 'bias_for_Fully_Connected_Layer_1')
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

    with tf.name_scope("Fully_Connected2") as scope:
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
        W_fc2 = tf.get_variable("W_fc2",
                                shape=[num_fc_1, num_classes],
                                initializer=initializer)
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[num_classes]),
                            name='b_fc2')
        h_fc2 = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

    with tf.name_scope("SoftMax") as scope:
        regularization = .001
        regularizers = (tf.nn.l2_loss(W_conv1) + tf.nn.l2_loss(b_conv1) +
                        tf.nn.l2_loss(W_fc0) + tf.nn.l2_loss(b_fc0) +
                        tf.nn.l2_loss(W_fc2) + tf.nn.l2_loss(b_fc2))
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h_fc2,
                                                              labels=y_)

        cost = tf.reduce_sum(loss) / batch_size
        cost += regularization * regularizers
        loss_summ = tf.summary.scalar("cross entropy_loss", cost)
    with tf.name_scope("train") as scope:
        tvars = tf.trainable_variables()
        #We clip the gradients to prevent explosion
        grads = tf.gradients(cost, tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = list(zip(grads, tvars))
        train_step = optimizer.apply_gradients(gradients)
        # The following block plots for every trainable variable
        #  - Histogram of the entries of the Tensor
        #  - Histogram of the gradient over the Tensor
        #  - Histogram of the gradient-norm over the Tensor
        numel = tf.constant([[0]])
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            numel += tf.reduce_sum(tf.size(variable))

            h1 = tf.summary.histogram(variable.name, variable)
            h2 = tf.summary.histogram(variable.name + "/gradients",
                                      grad_values)
            h3 = tf.summary.histogram(variable.name + "/gradient_norm",
                                      clip_ops.global_norm([grad_values]))
    with tf.name_scope("Evaluating_accuracy") as scope:

        correct_prediction = tf.equal(tf.argmax(h_fc2, 1), y_)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        accuracy_summary = tf.summary.scalar("accuracy", accuracy)

    test_normal_ids = np.array([
        1071, 1072, 1073, 1075, 1076, 108, 1086, 1088, 1089, 109, 1090, 1091,
        1092, 1093, 1094, 1095, 11, 110, 1104, 1105, 1106, 1107, 1109, 111,
        1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 112, 1120,
        1121, 1123, 1124, 1126, 1127, 1128, 1129, 113, 1130, 1131, 1133, 1134,
        1135, 1138, 1139, 114, 1140, 1141, 1142, 1143, 1146, 1147, 1148, 1149,
        115, 116, 117, 1171, 1176, 1179, 118, 1182, 1188, 119, 12, 120, 121,
        123, 1232, 124, 125, 126, 127, 128, 129, 13, 130, 131, 132, 133, 1332,
        134, 135, 136, 137, 138, 139, 140, 1417, 144, 1463, 15, 1547, 1554, 16,
        1616, 1644, 1687, 17, 1723, 1725, 1727, 1731, 1733, 1746, 18, 1804, 19,
        1902, 1936, 1992, 20, 2004, 21, 2152, 22, 2234, 2267, 2290, 23, 2390,
        24, 25, 2507, 2555, 2556, 2592, 26, 27, 2705, 2779, 28, 2812, 2831,
        2832, 2833, 2841, 2844, 2865, 29, 2965, 2966, 2987, 3, 30, 3010, 3014,
        3016, 3018, 3040, 3054, 3063, 3077, 31, 3181, 32, 3212, 3217, 3224,
        3239, 3249, 3255, 3266, 3268, 33, 34, 3445, 3495, 3522, 3561, 3566,
        3575, 36, 3621, 3671, 3679, 368, 3680, 369, 37, 370, 3704, 371, 3717,
        372, 3722, 3723, 3727, 373, 374, 3748, 375, 3751, 376, 377, 3779, 3787,
        379, 38, 3801, 381, 382, 3861, 39, 3938, 3950, 3957, 4, 40, 4002, 4010,
        4018, 4073, 4096, 4132, 4169, 4171, 4174, 4193, 42, 4236, 4240, 4245,
        4248, 4257, 4263, 43, 437, 439, 4391, 44, 4407, 441, 4419, 442, 4442,
        4450, 4468, 45, 4517, 4523, 4585, 46, 47, 4741, 4747, 4755, 4756, 4766,
        4840, 4841, 4869, 4882, 49, 4917, 494, 495, 4954, 497, 4975, 4976, 498,
        4980, 4982, 499, 4992, 5, 50, 500, 5004, 502, 503, 5065, 5072, 5083,
        5088, 5098, 51, 5106, 5120, 5123, 5144, 5157, 5171
    ])
    test_death_ids = np.array(
        [5616, 5687, 5908, 5964, 6160, 6311, 760, 832, 854, 961])
    merged = tf.summary.merge_all()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        #saver.restore(sess, "./models/model.ckpt")
        writer = tf.summary.FileWriter("./log_tb", sess.graph)

        sess.run(tf.global_variables_initializer())

        step = 0  # Step is a counter for filling the numpy array perf_collect

        i = 0

        def model1deathpred(pid, train_embedding, threshold=None):
            aa = get_all_adjacent_beats(pid)
            aa = standardize_ts_lengths(aa, PADDED_LENGTH)
            aa = np.swapaxes(aa, 1, 2)
            embedded_signal = h_fc1.eval(feed_dict={
                x_tensor: aa,
                y_: y_train,
                keep_prob: 1.0
            })
            return percentage_death(train_embedding, y_train, embedded_signal,
                                    threshold)

        stop = False
        while stop == False:

            batch_ind = np.random.choice(N, batch_size, replace=False)
            #batch_ind = np.arange(N)
            #batch_ind = batch_ind[(i*batch_size)%N:((i+1)*batch_size)%N]
            gg = h_relu.eval(
                feed_dict={
                    x_tensor: X_train[batch_ind],
                    y_: y_train[batch_ind],
                    keep_prob: dropout
                })
            sess.run(train_step,
                     feed_dict={
                         x_tensor: X_train[batch_ind],
                         y_: y_train[batch_ind],
                         keep_prob: dropout
                     })

            if i == 0:
                # Use this line to check before-and-after test accuracy
                result = sess.run(accuracy,
                                  feed_dict={
                                      x_tensor: X_test,
                                      y_: y_test,
                                      keep_prob: 1.0
                                  })
                acc_test_before = result

            if i % 750 == 0 and i != 0:
                #Check training performance

                result = sess.run([cost, accuracy],
                                  feed_dict={
                                      x_tensor: X_train,
                                      y_: y_train,
                                      keep_prob: 1.0
                                  })
                acc_train = result[1]
                cost_train = result[0]

                if i % 750 == 0 and i != 0:
                    #print "Running odds ratio calculation"
                    #train_embedding = h_fc1.eval(feed_dict = {x_tensor: X_train, y_: y_train, keep_prob: 1.0})
                    #get_upper_quartile_odds_ratio(test_death_ids, test_normal_ids, x, y_, y_train, h_fc1, keep_prob, train_embedding)
                    train_embedding = h_fc1.eval(feed_dict={
                        x_tensor: X_train,
                        y_: y_train,
                        keep_prob: 1.0
                    })
                    test_embedding = h_fc1.eval(feed_dict={
                        x_tensor: X_test,
                        y_: y_train,
                        keep_prob: 1.0
                    })
                    test_acc = evaluate_test_embedding(train_embedding,
                                                       y_train, test_embedding,
                                                       y_test)
                    print "test acc: ", test_acc
                    if test_acc > .67:
                        pdb.set_trace()

            # writer.add_summary(result[1], i)
                writer.flush(
                )  #Don't forget this command! It makes sure Python writes the summaries to the log-file
                if PRINT == True:
                    print("At %5.0f/%5.0f Cost: train%5.3f Acc: train%5.3f " %
                          (i, max_iterations, cost_train, acc_train))
                step += 1

            i += 1

        result = sess.run([accuracy, numel],
                          feed_dict={
                              x_tensor: X_test,
                              y_: y_test,
                              keep_prob: 1.0
                          })
        acc_test = result[0]
        print('The network has %s trainable parameters' % (result[1]))

        train_embedding = h_fc1.eval(feed_dict={
            x_tensor: X_train,
            y_: y_train,
            keep_prob: 1.0
        })
        test_embedding = h_fc1.eval(feed_dict={
            x_tensor: X_test,
            y_: y_train,
            keep_prob: 1.0
        })
        pdb.set_trace()

        #print('Accuracy given NN approach %0.2f \n' %(100*test_acc))

        death_test_pids = [102.0, 1050.0, 1107.0, 1115.0, 1171.0]
        for pid in test_death_pids:
            model1pred(pid, x, y_, y_train, h_fc1, keep_prob, train_embedding)

        return None
Exemplo n.º 46
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Build a training op that minimizes `loss` with the requested optimizer.

  Besides the parameter update itself, the graph gets a scalar summary of the
  loss (plus its moving average when enabled) and histogram summaries for
  every variable that has a gradient, its gradient and its gradient norm.

  Args:
    loss: Tensor, 0 dimensional. The quantity to minimize.
    global_step: Tensor, step counter. It is handed to
                 `learning_rate_decay_fn` and to `apply_gradients`.
    learning_rate: float or 0-d Tensor, magnitude of each update. A float is
                   stored in a non-trainable variable named "learning_rate".
    optimizer: string, class or optimizer instance, used as trainer.
               A string must be a key of OPTIMIZER_CLS_NAMES
                 (e.g. 'SGD', 'Adam', 'Adagrad').
               A class must be a sub-class of tf.Optimizer; it is
                 instantiated with `learning_rate=lr`.
               An instance of a tf.Optimizer sub-class is used as is.
    clip_gradients: float or None. When given, gradients are clipped by
                    their global norm to this value.
    moving_average_decay: float or None. When given, an exponential moving
                          average of the loss is maintained with this decay
                          and reported under "loss/mean".
    learning_rate_decay_fn: function or None. Called with the learning rate
                            and `global_step` Tensors; its result replaces
                            the learning rate.
                            For example: tf.train.exponential_decay.
    variables: list of variables to optimize, or None to use all trainable
               variables.

  Returns:
    Training op: the numerics-checked loss Tensor with a control dependency
    on the gradient update.

  Raises:
    ValueError: if `learning_rate` is neither a 0-d Tensor nor a float, or
                if `optimizer` is not a recognized name, class or instance.
  """
  # Optionally track an exponential moving average of the loss; tying the
  # update op to `loss` makes the average advance on every evaluation.
  if moving_average_decay is not None:
    ema = train.ExponentialMovingAverage(moving_average_decay, name="avg")
    ema_update = ema.apply([loss])
    logging_ops.scalar_summary("loss/mean", ema.average(loss))
    loss = control_flow_ops.with_dependencies([ema_update], loss)

  # Resolve the learning rate into a Tensor/variable, rejecting anything else.
  is_scalar_tensor = (isinstance(learning_rate, ops.Tensor) and
                      len(learning_rate.get_shape()) == 0)
  if not is_scalar_tensor and not isinstance(learning_rate, float):
    raise ValueError("Learning rate should be 0d Tensor or float. Got %s" %
        str(learning_rate))
  if is_scalar_tensor:
    lr = learning_rate
  else:
    lr = vs.get_variable(
        "learning_rate",
        [],
        trainable=False,
        initializer=init_ops.constant_initializer(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Turn the `optimizer` argument into an Optimizer instance.
  if isinstance(optimizer, six.string_types):
    if optimizer not in OPTIMIZER_CLS_NAMES:
      raise ValueError("Optimizer name should be one of [%s], you provided %s."
                       % (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
    optimizer_cls = OPTIMIZER_CLS_NAMES[optimizer]
    opt = optimizer_cls(learning_rate=lr)
  elif isinstance(optimizer, type) and issubclass(optimizer,
                                                  optimizer_.Optimizer):
    opt = optimizer(learning_rate=lr)
  elif isinstance(optimizer, optimizer_.Optimizer):
    opt = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string, "
                     "subclass of Optimizer or instance of "
                     "subclass of Optimizer. Got %s." % str(optimizer))

  # Fall back to every trainable variable when none were requested.
  if variables is None:
    variables = vars_.trainable_variables()

  # Gradients as (gradient, variable) pairs, clipped by global norm on request.
  grads_and_vars = opt.compute_gradients(loss, variables)
  if clip_gradients is not None:
    raw_grads, grad_vars = zip(*grads_and_vars)
    clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, clip_gradients)
    grads_and_vars = list(zip(clipped_grads, grad_vars))

  logging_ops.scalar_summary("loss", loss)

  # Histogram summaries for each variable that actually received a gradient.
  for grad, var in grads_and_vars:
    # Sparse gradients arrive as IndexedSlices; summarize their values only.
    values = grad.values if isinstance(grad, ops.IndexedSlices) else grad
    if values is None:
      continue
    logging_ops.histogram_summary(var.name, var)
    logging_ops.histogram_summary(var.name + "/gradients", values)
    logging_ops.histogram_summary(var.name + "/gradient_norm",
                                  clip_ops.global_norm([values]))

  apply_op = opt.apply_gradients(grads_and_vars,
                                 global_step=global_step,
                                 name="train")

  # Fail loudly on a non-finite loss, and make fetching the returned tensor
  # run the parameter update.
  checked_loss = array_ops.check_numerics(loss, "Loss is inf or nan")
  return control_flow_ops.with_dependencies([apply_op], checked_loss)
Exemplo n.º 47
0
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  larc_params=None,
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
  """Given loss and parameters for optimizer, returns a training op.

  The learning rate is always obtained as
  `learning_rate_decay_fn(global_step)`.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by a subclass of `Optimizer`; it is instantiated as
      `optimizer(learning_rate=lr, **optimizer_params)`.
      E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`, which is used as is
      (`optimizer_params` is ignored in that case).
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.
  - by any other callable returning an `Optimizer` instance; it is called as
      `optimizer(lr, **optimizer_params)`, or as
      `optimizer(**optimizer_params)` when the learning rate is `None`.

  Args:
    loss: Scalar `Tensor`.
    optimizer: string, class, optimizer instance or callable, see above.
    optimizer_params: dict of extra keyword arguments forwarded to the
                      optimizer constructor / factory.
    learning_rate_decay_fn: function, takes the `global_step` `Tensor` and
                            returns the learning rate.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 `tf.train.get_or_create_global_step()` is used.
    dtype: when equal to the string "mixed", the optimizer is wrapped in
           `MixedPrecisionOptimizerWrapper`. No other value is inspected
           by this function.
    gradient_noise_scale: float or None. If not None, the gradients are
                          passed through `_add_scaled_noise_to_gradients`
                          with this scale.
    gradient_multipliers: dict or None. If not None, the gradients are
                          passed through `_multiply_gradients` with it.
    clip_gradients: float, callable or `None`. A float is forwarded to
      `_clip_gradients_by_norm`. A callable takes the `list` of
      `(gradient, variable)` `tuple`s and returns the same thing with the
      gradients modified. Cannot be combined with `larc_params`.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. They are attached to
                `loss` as control dependencies.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set, "learning_rate" and "global_gradient_norm" are reported.
               Every entry must be in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: Forwarded to `compute_gradients`.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    larc_params: dict or None. If not None, LARC re-scaling
      (https://arxiv.org/pdf/1708.03888.pdf) is applied to the gradients.
      Required key: 'larc_eta' (float). Optional keys: 'larc_mode'
      ('clip' or 'scale', default 'clip'), 'min_update' (float, default
      1e-7) and 'epsilon' (float, default 1e-7).
    loss_scale: loss scale handed to `MixedPrecisionOptimizerWrapper` when
                `dtype` is "mixed". Replaced by an `AutomaticLossScaler` when
                `automatic_loss_scaling` is set.
    automatic_loss_scaling: if not None, use the corresponding automatic
                            loss scaling algorithm. Must be one of
                            `AutomaticLossScaler.SUPPORTED_ALGOS`, and
                            `dtype` must be "mixed".
    on_horovod: if True, the optimizer is wrapped in `DistributedOptimizer`.

  Returns:
    Training op: `loss` with a control dependency on the gradient update.

  Raises:
    ValueError: if:
        * an entry of `summaries` is not in OPTIMIZER_SUMMARIES.
        * `optimizer` has the wrong type or is an unknown name.
        * the learning rate is `None` while `optimizer` is a string or class.
        * `automatic_loss_scaling` is unknown or used without dtype "mixed".
        * `clip_gradients` is neither float nor callable.
        * `gradient_multipliers` is given and leaves no gradients.
      `contrib_framework.assert_scalar` and `tf.train.assert_global_step`
      are also called on `loss` / `global_step` and presumably raise on
      invalid input.
    AttributeError: if both `clip_gradients` and `larc_params` are given.
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  # After this point `global_step` is never None.
  if global_step is None:
    global_step = tf.train.get_or_create_global_step()
  else:
    tf.train.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are run before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    if summaries is None:
      summaries = ["learning_rate", "global_gradient_norm"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    lr = learning_rate_decay_fn(global_step)

    if "learning_rate" in summaries:
      summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr, **optimizer_params)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      if lr is not None:
        opt = optimizer(lr, **optimizer_params)
      else:
        opt = optimizer(**optimizer_params)
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))
    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    if automatic_loss_scaling is not None:
      if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
        raise ValueError("Unknown automatic loss scaling algorithm: %s."
                         % automatic_loss_scaling)
      if dtype != "mixed":
        raise ValueError("Automatic loss scaling can be used only with "
                         "dtype=mixed.")
      loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling)

    if dtype == 'mixed':
      opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale)
    if on_horovod:
      opt = DistributedOptimizer(opt)

    # Compute gradients.
    gradients = opt.compute_gradients(
      loss, variables,
      colocate_gradients_with_ops=colocate_gradients_with_ops,
    )

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
      # Variables that do not influence the loss have a `None` gradient,
      # which cannot be cast; leave them out of the global norm.
      summary.scalar(
        "global_norm/gradient_norm",
        clip_ops.global_norm(
          [tf.cast(g, tf.float32) for g, _ in gradients if g is not None]
        ),
      )

    # Optionally clip gradients by global norm.
    if clip_gradients is not None and larc_params is not None:
      raise AttributeError(
        "LARC and gradient norm clipping should not be used together"
      )
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if isinstance(variable, ops.IndexedSlices):
        var_values = variable.values
      else:
        var_values = variable

      if grad_values is not None:
        # ":" is replaced so the variable name can be used in a summary tag.
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, mask_nans(grad_values))
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))
        if "variables" in summaries:
          summary.histogram("variables/%s" % var_name, var_values)
        if "variable_norm" in summaries:
          summary.scalar("variable_norm/%s" % var_name,
                         clip_ops.global_norm([var_values]))

    if clip_gradients is not None and ("global_gradient_norm" in summaries or
                                       "gradient_norm" in summaries):
      # Same `None` filtering as for the unclipped global norm.
      summary.scalar(
        "global_norm/clipped_gradient_norm",
        clip_ops.global_norm(
          [tf.cast(g, tf.float32) for g, _ in gradients if g is not None]
        ),
      )

    # LARC gradient re-scaling
    if larc_params is not None:
      check_params(
        config=larc_params,
        required_dict={'larc_eta': float},
        optional_dict={
          'larc_mode': ['clip', 'scale'],
          'min_update': float,
          'epsilon': float
        },
      )
      larc_eta = larc_params['larc_eta']
      larc_mode = larc_params.get('larc_mode', 'clip')
      min_update = larc_params.get('min_update', 1e-7)
      eps = larc_params.get('epsilon', 1e-7)

      for idx, (g, v) in enumerate(gradients):
        # Nothing to rescale for a variable without a gradient.
        if g is None:
          continue
        var_dtype = v.dtype
        v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
        g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

        if larc_mode == 'clip':
          # Ratio of the LARC local rate to the global rate, capped at 1.0
          # below so the effective rate never exceeds `lr`.
          larc_grad_update = tf.maximum(
            larc_eta * v_norm / (lr * (g_norm + eps)),
            min_update,
          )
          if "larc_summaries" in summaries:
            summary.scalar('larc_clip_on/{}'.format(v.name),
                           tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
          larc_grad_update = tf.minimum(larc_grad_update, 1.0)
        else:
          larc_grad_update = tf.maximum(
            larc_eta * v_norm / (g_norm + eps),
            min_update,
          )
        larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
        gradients[idx] = (larc_grad_update * g, v)

        # adding additional summary
        if "larc_summaries" in summaries:
          summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update)
          summary.scalar("larc_final_lr/{}".format(v.name),
                         tf.cast(lr, var_dtype) * larc_grad_update)

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor