Example #1
def weighted_resample(inputs, weights, overall_rate, scope=None,
                      mean_decay=0.999, warmup=10, seed=None):
  """Performs an approximate weighted resampling of `inputs`.

  This method chooses elements from `inputs` where each item's rate of
  selection is proportional to its value in `weights`, and the average
  rate of selection across all inputs (and many invocations!) is
  `overall_rate`.

  Args:
    inputs: A list of tensors whose first dimension is `batch_size`.
    weights: A `[batch_size]`-shaped tensor with each batch member's weight.
    overall_rate: Desired overall rate of resampling.
    scope: Scope to use for the op.
    mean_decay: How quickly to decay the running estimate of the mean weight.
    warmup: Until the resulting tensor has been evaluated `warmup`
      times, the resampling method uses the true mean over all calls
      as its weight estimate, rather than a decayed mean.
    seed: Random seed.

  Returns:
    A list of tensors exactly like `inputs`, but with an unknown (and
      possibly zero) first dimension.
    A tensor containing the effective resampling rate used for each output.

  """
  # Algorithm: Just compute rates as weights/mean_weight *
  # overall_rate. This way the average weight corresponds to the
  # overall rate, and a weight twice the average has twice the rate,
  # etc.
  with ops.name_scope(scope, 'weighted_resample', inputs) as opscope:
    # First: Maintain a running estimated mean weight, with decay
    # adjusted (by also maintaining an invocation count) during the
    # warmup period so that at the beginning, there aren't too many
    # zeros mixed in, throwing the average off.

    with variable_scope.variable_scope(scope, 'estimate_mean', inputs):
      count_so_far = variable_scope.get_local_variable(
          'resample_count', initializer=0)

      estimated_mean = variable_scope.get_local_variable(
          'estimated_mean', initializer=0.0)

      count = count_so_far.assign_add(1)
      real_decay = math_ops.minimum(
          math_ops.truediv((count - 1), math_ops.minimum(count, warmup)),
          mean_decay)

      batch_mean = math_ops.reduce_mean(weights)
      mean = moving_averages.assign_moving_average(
          estimated_mean, batch_mean, real_decay, zero_debias=False)

    # Then, normalize the weights into rates using the mean weight and
    # overall target rate:
    rates = weights * overall_rate / mean

    results = resample_at_rate([rates] + inputs, rates,
                               scope=opscope, seed=seed, back_prop=False)

    return (results[1:], results[0])
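Note: the rate computation described in the algorithm comment can be sketched standalone in NumPy; `resampling_rates` below is a hypothetical helper, not part of the function above.

```python
import numpy as np

def resampling_rates(weights, overall_rate):
  # Each element's rate is weight / mean_weight * overall_rate, so the
  # average rate over the batch equals overall_rate.
  weights = np.asarray(weights, dtype=np.float64)
  return weights / weights.mean() * overall_rate

print(resampling_rates([1.0, 2.0, 3.0], overall_rate=0.5))  # 0.25, 0.5, 0.75
```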
Example #2
  def _renorm_correction_and_moments(self, mean, variance, training):
    """Returns the correction and update values for renorm."""
    stddev = math_ops.sqrt(variance + self.epsilon)
    # Compute the average mean and standard deviation, as if they were
    # initialized with this batch's moments.
    mixed_renorm_mean = (self.renorm_mean +
                         (1. - self.renorm_mean_weight) * mean)
    mixed_renorm_stddev = (self.renorm_stddev +
                           (1. - self.renorm_stddev_weight) * stddev)
    # Compute the corrections for batch renorm.
    r = stddev / mixed_renorm_stddev
    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
    # Ensure the corrections use pre-update moving averages.
    with ops.control_dependencies([r, d]):
      mean = array_ops.identity(mean)
      stddev = array_ops.identity(stddev)
    rmin, rmax, dmax = [self.renorm_clipping.get(key)
                        for key in ['rmin', 'rmax', 'dmax']]
    if rmin is not None:
      r = math_ops.maximum(r, rmin)
    if rmax is not None:
      r = math_ops.minimum(r, rmax)
    if dmax is not None:
      d = math_ops.maximum(d, -dmax)
      d = math_ops.minimum(d, dmax)
    # When not training, use r=1, d=0, and decay=1 meaning no updates.
    r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
    d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
    decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.)

    def _update_renorm_variable(var, weight, value):
      """Updates a moving average and weight, returns the unbiased value."""
      # Update the variables without zero debiasing. The debiasing will be
      # accomplished by dividing the exponential moving average by the weight.
      # For example, after a single update, the moving average would be
      # (1-decay) * value, and the weight will be 1-decay, with their ratio
      # giving value.
      # Make sure the weight is not updated until r and d have been computed.
      value = array_ops.identity(value)
      with ops.control_dependencies([value]):
        weight_value = array_ops.constant(1., dtype=weight.dtype)
      new_var = moving_averages.assign_moving_average(
          var, value, decay, zero_debias=False)
      new_weight = moving_averages.assign_moving_average(
          weight, weight_value, decay, zero_debias=False)
      return new_var / new_weight

    with ops.colocate_with(self.moving_mean):
      new_mean = _update_renorm_variable(self.renorm_mean,
                                         self.renorm_mean_weight,
                                         mean)
    with ops.colocate_with(self.moving_variance):
      new_stddev = _update_renorm_variable(self.renorm_stddev,
                                           self.renorm_stddev_weight,
                                           stddev)
      # Make sqrt(moving_variance + epsilon) = new_stddev.
      new_variance = math_ops.square(new_stddev) - self.epsilon

    return (r, d, new_mean, new_variance)
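Note: the standard batch-renormalization corrections can be illustrated with plain NumPy; this is a simplified sketch that uses the moving statistics directly rather than the mixed moments computed above, and `renorm_corrections` is a hypothetical name.

```python
import numpy as np

def renorm_corrections(batch_mean, batch_stddev, moving_mean, moving_stddev,
                       rmin=1.0 / 3, rmax=3.0, dmax=5.0):
  # r rescales and d shifts the batch-normalized output toward the moving
  # statistics; both are clipped, as with the renorm_clipping dict above.
  r = np.clip(batch_stddev / moving_stddev, rmin, rmax)
  d = np.clip((batch_mean - moving_mean) / moving_stddev, -dmax, dmax)
  return r, d

print(renorm_corrections(batch_mean=0.2, batch_stddev=1.5,
                         moving_mean=0.0, moving_stddev=1.0))  # (1.5, 0.2)
```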
Example #3
def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.WEIGHTED_SUM_BY_NONZERO_WEIGHTS):
  """Adds a Huber Loss term to the training procedure.

  For each value x in `error=labels-predictions`, the following is calculated:

  ```
    0.5 * x^2                  if |x| <= d
    0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```

  where d is `delta`.

  See: https://en.wikipedia.org/wiki/Huber_loss

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  [batch_size], then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as `predictions`.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    delta: `float`, the point where the Huber loss function
      changes from quadratic to linear.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    A scalar `Tensor` that returns the weighted loss.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.
  """
  with ops.name_scope(scope, "huber_loss",
                      (predictions, labels, weights)) as scope:
    predictions = math_ops.to_float(predictions)
    labels = math_ops.to_float(labels)
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    error = math_ops.subtract(predictions, labels)
    abs_error = math_ops.abs(error)
    quadratic = math_ops.minimum(abs_error, delta)
    # The following expression is the same in value as
    # tf.maximum(abs_error - delta, 0), but importantly the gradient for the
    # expression when abs_error == delta is 0 (for tf.maximum it would be 1).
    # This is necessary to avoid doubling the gradient, since there is already a
    # nonzero contribution to the gradient from the quadratic term.
    linear = (abs_error - quadratic)
    losses = 0.5 * quadratic**2 + delta * linear
    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
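Note: the piecewise formula in the docstring can be checked with a short NumPy sketch (hypothetical `huber` helper, scalar `delta` assumed):

```python
import numpy as np

def huber(labels, predictions, delta=1.0):
  # 0.5 * x^2 for |x| <= delta, 0.5 * delta^2 + delta * (|x| - delta) otherwise.
  error = np.asarray(labels, dtype=np.float64) - np.asarray(predictions, dtype=np.float64)
  abs_error = np.abs(error)
  quadratic = np.minimum(abs_error, delta)
  linear = abs_error - quadratic
  return 0.5 * quadratic ** 2 + delta * linear

print(huber([0.0, 0.0], [0.5, 3.0]))  # 0.125 (quadratic region), 2.5 (linear region)
```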
Example #4
def saturate_cast(image, dtype):
  """Performs a safe cast of image data to `dtype`.

  This function casts the data in image to `dtype`, without applying any
  scaling. If there is a danger that image data would over or underflow in the
  cast, this op applies the appropriate clamping before the cast.

  Args:
    image: An image to cast to a different data type.
    dtype: A `DType` to cast `image` to.

  Returns:
    `image`, safely cast to `dtype`.
  """
  clamped = image

  # When casting to a type with smaller representable range, clamp.
  # Note that this covers casting to unsigned types as well.
  if image.dtype.min < dtype.min and image.dtype.max > dtype.max:
    clamped = clip_ops.clip_by_value(clamped,
                                     math_ops.cast(dtype.min, image.dtype),
                                     math_ops.cast(dtype.max, image.dtype))
  elif image.dtype.min < dtype.min:
    clamped = math_ops.maximum(clamped, math_ops.cast(dtype.min, image.dtype))
  elif image.dtype.max > dtype.max:
    clamped = math_ops.minimum(clamped, math_ops.cast(dtype.max, image.dtype))

  return math_ops.cast(clamped, dtype)
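Note: the clamp-then-cast behaviour can be mimicked with NumPy type limits; `saturate_cast_np` is an illustrative stand-in, not the TensorFlow op.

```python
import numpy as np

def saturate_cast_np(x, dtype):
  # Clamp to the representable range of the target dtype, then cast.
  info = np.iinfo(dtype) if np.issubdtype(dtype, np.integer) else np.finfo(dtype)
  return np.clip(x, info.min, info.max).astype(dtype)

print(saturate_cast_np(np.array([-5.0, 300.0]), np.uint8))  # [0, 255]
```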
Example #5
  def __call__(self, shape, dtype=None, partition_info=None):
    if dtype is None:
      dtype = self.dtype
    # Check the shape
    if len(shape) < 2:
      raise ValueError("The tensor to initialize must be "
                       "at least two-dimensional")
    # Flatten the input shape with the last dimension remaining
    # its original shape so it works for conv2d
    num_rows = 1
    for dim in shape[:-1]:
      num_rows *= dim
    num_cols = shape[-1]
    flat_shape = (num_rows, num_cols)

    # Generate a random matrix
    a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
    # Compute the qr factorization
    q, r = linalg_ops.qr(a, full_matrices=False)
    # Make Q uniform
    square_len = math_ops.minimum(num_rows, num_cols)
    d = array_ops.diag_part(r[:square_len, :square_len])
    ph = d / math_ops.abs(d)
    q *= ph
    # Pad zeros to Q (if rows smaller than cols)
    if num_rows < num_cols:
      padding = array_ops.zeros([num_rows, num_cols - num_rows], dtype=dtype)
      q = array_ops.concat([q, padding], 1)
    return self.gain * array_ops.reshape(q, shape)
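Note: the QR trick used here (including the `d / |d|` sign correction that makes Q uniformly distributed) can be reproduced with NumPy for the `num_rows >= num_cols` case; `orthogonal_matrix` is a hypothetical helper.

```python
import numpy as np

def orthogonal_matrix(num_rows, num_cols, seed=None):
  # QR of a Gaussian matrix; multiplying each column by sign(diag(R))
  # removes the sign ambiguity of the factorization.
  rng = np.random.default_rng(seed)
  q, r = np.linalg.qr(rng.standard_normal((num_rows, num_cols)))
  d = np.diagonal(r)
  return q * (d / np.abs(d))

q = orthogonal_matrix(4, 3, seed=0)
print(np.allclose(q.T @ q, np.eye(3)))  # True: columns are orthonormal
```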
Example #6
def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name):
  """Find max_norm given norm and previous average."""
  with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]):
    log_norm = math_ops.log(norm + epsilon)

    def moving_average(name, value, decay):
      moving_average_variable = vs.get_variable(
          name,
          shape=value.get_shape(),
          dtype=value.dtype,
          initializer=init_ops.zeros_initializer(),
          trainable=False)
      return moving_averages.assign_moving_average(
          moving_average_variable, value, decay, zero_debias=False)

    # quicker adaptation at the beginning
    if global_step is not None:
      n = math_ops.to_float(global_step)
      decay = math_ops.minimum(decay, n / (n + 1.))

    # update averages
    mean = moving_average("mean", log_norm, decay)
    sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay)

    variance = sq_mean - math_ops.square(mean)
    std = math_ops.sqrt(math_ops.maximum(epsilon, variance))
    max_norms = math_ops.exp(mean + std_factor * std)
    return max_norms, mean
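Note: ignoring the moving-average machinery, the adaptive threshold is exp(mean(log norm) + std_factor * std(log norm)); a minimal sketch over a history of norms (hypothetical helper name):

```python
import numpy as np

def adaptive_max_norm(norm_history, std_factor=2.0, epsilon=1e-8):
  # Plain averages stand in for the moving averages kept by the function above.
  log_norm = np.log(np.asarray(norm_history) + epsilon)
  mean = log_norm.mean()
  variance = np.mean(np.square(log_norm)) - mean ** 2
  std = np.sqrt(max(epsilon, variance))
  return np.exp(mean + std_factor * std)

print(adaptive_max_norm([1.0, 2.0, 4.0]))  # roughly 6.2
```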
Example #7
def clip_by_norm(t, clip_norm, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`.
  Specifically, if the L2-norm is already less than or equal to `clip_norm`,
  then `t` is not modified. If the L2-norm is greater than `clip_norm`, then
  this operation returns a tensor of the same type and shape as `t` with its
  values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.op_scope([t, clip_norm], name, "clip_by_norm") as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
    tclip = array_ops.identity(t * clip_norm * math_ops.minimum(
        l2norm_inv, constant_op.constant(1.0 / clip_norm)), name=name)

  return tclip
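Note: the rescaling `t * clip_norm / l2norm(t)`, applied only when the norm exceeds `clip_norm`, in NumPy terms (illustrative helper name):

```python
import numpy as np

def clip_by_norm_np(t, clip_norm):
  # Scale t down only when its L2-norm exceeds clip_norm.
  l2norm = np.sqrt(np.sum(t * t))
  return t * clip_norm / max(l2norm, clip_norm)

v = np.array([3.0, 4.0])          # L2-norm 5
print(clip_by_norm_np(v, 2.5))    # [1.5, 2.0], new norm 2.5
print(clip_by_norm_np(v, 10.0))   # unchanged: [3.0, 4.0]
```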
Example #8
  def _update_clip_coeff(self, grads_and_vars, precon_grads_and_vars):
    """Computes the scale factor for the update to satisfy the norm constraint.

    Defined as min(1, sqrt(c / r^T F r)), where c is the norm constraint,
    F is the approximate Fisher matrix, and r is the update vector, i.e.
    -alpha * v, where alpha is the learning rate, and v is the preconditioned
    gradient.

    This is based on Section 5 of Ba et al., Distributed Second-Order
    Optimization using Kronecker-Factored Approximations. Note that they
    absorb the learning rate alpha (which they denote eta_max) into the formula
    for the coefficient, while in our implementation, the rescaling is done
    before multiplying by alpha. Hence, our formula differs from theirs by a
    factor of alpha.

    Args:
      grads_and_vars: List of (gradient, variable) pairs.
      precon_grads_and_vars: List of (preconditioned gradient, variable) pairs.
        Must be the result of calling `self._fisher_est.multiply_inverse`
        on `grads_and_vars`.

    Returns:
      Scalar representing the coefficient which should be applied to the
      preconditioned gradients to satisfy the norm constraint.
    """
    sq_norm_grad = self._squared_fisher_norm(grads_and_vars,
                                             precon_grads_and_vars)
    sq_norm_up = sq_norm_grad * self._learning_rate**2
    return math_ops.minimum(1.,
                            math_ops.sqrt(self._norm_constraint / sq_norm_up))
Example #9
  def gradient_clipping(grads_and_vars):
    """Internal function for adaptive clipping."""
    grads, variables = zip(*grads_and_vars)

    norm = clip_ops.global_norm(grads)

    max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay,
                                            global_step, epsilon, name)

    # reports the max gradient norm for debugging
    if report_summary:
      summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm)

    # factor will be 1. if norm is smaller than max_norm
    factor = array_ops.where(norm < max_norm,
                             array_ops.ones_like(norm),
                             math_ops.exp(log_mean) / norm)

    if static_max_norm is not None:
      factor = math_ops.minimum(static_max_norm / norm, factor)

    # apply factor
    clipped_grads = []
    for grad in grads:
      if grad is None:
        clipped_grads.append(None)
      elif isinstance(grad, ops.IndexedSlices):
        clipped_grads.append(
            ops.IndexedSlices(grad.values * factor, grad.indices,
                              grad.dense_shape))
      else:
        clipped_grads.append(grad * factor)

    return list(zip(clipped_grads, variables))
Example #10
    def _testConfMatrixOnTensors(self, tf_dtype, np_dtype):
        with self.test_session() as sess:
            m_neg = array_ops.placeholder(dtype=dtypes.float32)
            m_pos = array_ops.placeholder(dtype=dtypes.float32)
            s = array_ops.placeholder(dtype=dtypes.float32)

            neg = random_ops.random_normal([20], mean=m_neg, stddev=s, dtype=dtypes.float32)
            pos = random_ops.random_normal([20], mean=m_pos, stddev=s, dtype=dtypes.float32)

            data = array_ops.concat([neg, pos], 0)
            data = math_ops.cast(math_ops.round(data), tf_dtype)
            data = math_ops.minimum(math_ops.maximum(data, 0), 1)
            lab = array_ops.concat([array_ops.zeros([20], dtype=tf_dtype), array_ops.ones([20], dtype=tf_dtype)], 0)

            cm = confusion_matrix.confusion_matrix(lab, data, dtype=tf_dtype, num_classes=2)

            d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, s: 1.0})

            truth = np.zeros([2, 2], dtype=np_dtype)
            try:
                range_builder = xrange
            except NameError:  # In Python 3.
                range_builder = range
            for i in range_builder(len(d)):
                truth[l[i], d[i]] += 1

            self.assertEqual(cm_out.dtype, np_dtype)
            self.assertAllClose(cm_out, truth, atol=1e-10)
Example #11
  def decayed_lr(learning_rate, global_step, decay_steps, initial_variance,
                 variance_decay, num_periods, alpha, beta, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(name, "NoisyLinearCosineDecay",
                        [learning_rate, global_step]) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      decay_steps = math_ops.cast(decay_steps, dtype)
      initial_variance = math_ops.cast(initial_variance, dtype)
      variance_decay = math_ops.cast(variance_decay, dtype)
      num_periods = math_ops.cast(num_periods, dtype)
      alpha = math_ops.cast(alpha, dtype)
      beta = math_ops.cast(beta, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      variance = initial_variance / (
          math_ops.pow(1.0 + global_step_recomp, variance_decay))
      std = math_ops.sqrt(variance)
      noisy_linear_decayed = (
          linear_decayed + random_ops.random_normal(
              linear_decayed.shape, stddev=std))

      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
      noisy_linear_cosine_decayed = (
          (alpha + noisy_linear_decayed) * cosine_decayed + beta)

      return math_ops.multiply(
          learning_rate, noisy_linear_cosine_decayed, name=name)
Example #12
  def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
                 power, cycle, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(
        name, "PolynomialDecay",
        [learning_rate, global_step, decay_steps, end_learning_rate, power]
    ) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      end_learning_rate = math_ops.cast(end_learning_rate, dtype)
      power = math_ops.cast(power, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      decay_steps_recomp = math_ops.cast(decay_steps, dtype)
      if cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero, set the multiplier to 1.
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
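Note: for the non-cycling branch, the decayed value is `(lr - end_lr) * (1 - step/decay_steps)^power + end_lr` with the step capped at `decay_steps`; a small worked sketch (hypothetical helper):

```python
def polynomial_decay(lr, step, decay_steps, end_lr=0.0001, power=1.0):
  # Step is capped so the rate never decays past end_lr.
  step = min(step, decay_steps)
  p = step / decay_steps
  return (lr - end_lr) * (1 - p) ** power + end_lr

# Steps 0, 5000, 10000, 20000 -> 0.1, ~0.05, 0.0001, 0.0001 (capped).
for step in (0, 5000, 10000, 20000):
  print(step, polynomial_decay(0.1, step, decay_steps=10000))
```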
Example #13
def huber_loss(y_true, y_pred, delta=1.0):
  """Computes Huber loss value.

  For each value x in `error=y_true-y_pred`, the following is calculated:

  ```
  0.5 * x^2                  if |x| <= d
  0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```
  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss

  Args:
    y_true: tensor of true targets.
    y_pred: tensor of predicted targets.
    delta: A float, the point where the Huber loss function changes from
      quadratic to linear.

  Returns:
    Tensor with one scalar loss entry per sample.
  """
  y_pred = math_ops.cast(y_pred, dtype=K.floatx())
  y_true = math_ops.cast(y_true, dtype=K.floatx())
  error = math_ops.subtract(y_pred, y_true)
  abs_error = math_ops.abs(error)
  quadratic = math_ops.minimum(abs_error, delta)
  linear = math_ops.subtract(abs_error, quadratic)
  return math_ops.add(
      math_ops.multiply(
          ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
          math_ops.multiply(quadratic, quadratic)),
      math_ops.multiply(delta, linear))
Example #14
  def get_best(self, n):
    """Return the indices and values of the n highest scores in the TopN."""

    def refresh_shortlist():
      """Update the shortlist with the highest scores in id_to_score."""
      new_scores, new_ids = nn_ops.top_k(self.id_to_score, self.shortlist_size)
      smallest_new_score = math_ops.reduce_min(new_scores)
      new_length = math_ops.reduce_sum(
          math_ops.to_int32(math_ops.greater(new_scores, dtypes.float32.min)))
      u1 = self.sl_ids.assign(
          math_ops.to_int64(array_ops.concat([[new_length], new_ids], 0)))
      u2 = self.sl_scores.assign(
          array_ops.concat([[smallest_new_score], new_scores], 0))
      self.last_ops = [u1, u2]
      return control_flow_ops.group(u1, u2)

    # We only need to refresh the shortlist if n is greater than the
    # current shortlist size (which is stored in sl_ids[0]).
    with ops.control_dependencies(self.last_ops):
      cond_op = control_flow_ops.cond(n > self.sl_ids[0], refresh_shortlist,
                                      control_flow_ops.no_op)
      with ops.control_dependencies([cond_op]):
        topk_values, topk_indices = nn_ops.top_k(
            self.sl_scores,
            math_ops.minimum(n, math_ops.to_int32(self.sl_ids[0])))
        # topk_indices are the indices into the shortlist, we want to return
        # the indices into id_to_score
        gathered_indices = array_ops.gather(self.sl_ids, topk_indices)
        return gathered_indices, topk_values
Example #15
  def __call__(self, step):
    with ops.name_scope(
        self.name, "PolynomialDecay",
        [self.initial_learning_rate, step, self.decay_steps,
         self.end_learning_rate, self.power]
    ) as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
      power = math_ops.cast(self.power, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
      if self.cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero, set the multiplier to 1.
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / self.decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        global_step_recomp = math_ops.minimum(global_step_recomp,
                                              self.decay_steps)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(initial_learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
Example #16
def clip_by_value(t, clip_value_min, clip_value_max,
                  name=None):
  """Clips tensor values to a specified min and max.

  Given a tensor `t`, this operation returns a tensor of the same type and
  shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
  Any values less than `clip_value_min` are set to `clip_value_min`. Any values
  greater than `clip_value_max` are set to `clip_value_max`.

  Args:
    t: A `Tensor`.
    clip_value_min: A 0-D (scalar) `Tensor`. The minimum value to clip by.
    clip_value_max: A 0-D (scalar) `Tensor`. The maximum value to clip by.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_value",
                      [t, clip_value_min, clip_value_max]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Go through list of tensors, for each value in each tensor clip
    t_min = math_ops.minimum(t, clip_value_max)
    t_max = math_ops.maximum(t_min, clip_value_min, name=name)

  return t_max
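Note: the same min-then-max composition, shown on a small NumPy array:

```python
import numpy as np

t = np.array([-2.0, 0.5, 3.0])
# First cap at the maximum, then floor at the minimum, as in the op above.
clipped = np.maximum(np.minimum(t, 1.0), -1.0)
print(clipped)  # [-1.0, 0.5, 1.0]
```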
Example #17
def _get_scores(log_probs, sequence_lengths, length_penalty_weight,
                coverage_penalty_weight, finished, accumulated_attention_probs):
  """Calculates scores for beam search hypotheses.

  Args:
    log_probs: The log probabilities with shape
      `[batch_size, beam_width, vocab_size]`.
    sequence_lengths: The array of sequence lengths.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
    coverage_penalty_weight: Float weight to penalize the coverage of source
      sentence. Disabled with 0.0.
    finished: A boolean tensor of shape `[batch_size, beam_width]` that
      specifies which elements in the beam are finished already.
    accumulated_attention_probs: Accumulated attention probabilities up to the
      current time step, with shape `[batch_size, beam_width, max_time]` if
      coverage_penalty_weight is not 0.0.

  Returns:
    The scores normalized by the length_penalty and coverage_penalty.

  Raises:
    ValueError: accumulated_attention_probs is None when coverage penalty is
      enabled.
  """
  length_penalty_ = _length_penalty(
      sequence_lengths=sequence_lengths, penalty_factor=length_penalty_weight)
  length_penalty_ = math_ops.cast(length_penalty_, dtype=log_probs.dtype)
  scores = log_probs / length_penalty_

  coverage_penalty_weight = ops.convert_to_tensor(
      coverage_penalty_weight, name="coverage_penalty_weight")
  if coverage_penalty_weight.shape.ndims != 0:
    raise ValueError("coverage_penalty_weight should be a scalar, "
                     "but saw shape: %s" % coverage_penalty_weight.shape)

  if tensor_util.constant_value(coverage_penalty_weight) == 0.0:
    return scores

  if accumulated_attention_probs is None:
    raise ValueError(
        "accumulated_attention_probs can be None only if coverage penalty is "
        "disabled.")

  # Add source sequence length mask before computing coverage penalty.
  accumulated_attention_probs = array_ops.where(
      math_ops.equal(accumulated_attention_probs, 0.0),
      array_ops.ones_like(accumulated_attention_probs),
      accumulated_attention_probs)

  # coverage penalty =
  #     sum over `max_time` {log(min(accumulated_attention_probs, 1.0))}
  coverage_penalty = math_ops.reduce_sum(
      math_ops.log(math_ops.minimum(accumulated_attention_probs, 1.0)), 2)
  # Apply coverage penalty to finished predictions.
  coverage_penalty *= math_ops.to_float(finished)
  weighted_coverage_penalty = coverage_penalty * coverage_penalty_weight
  # Reshape from [batch_size, beam_width] to [batch_size, beam_width, 1]
  weighted_coverage_penalty = array_ops.expand_dims(
      weighted_coverage_penalty, 2)
  return scores + weighted_coverage_penalty
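Note: the coverage penalty commented above is `weight * sum_t log(min(p_t, 1.0))`; a minimal sketch that skips the zero-masking step applied to `accumulated_attention_probs` in the code (illustrative helper name):

```python
import numpy as np

def coverage_penalty(accumulated_attention_probs, weight):
  # Sum over source positions of log(min(p, 1.0)), scaled by the penalty weight.
  capped = np.minimum(accumulated_attention_probs, 1.0)
  return weight * np.sum(np.log(capped), axis=-1)

probs = np.array([[0.9, 1.2, 0.4]])   # one beam, three source positions
print(coverage_penalty(probs, weight=0.2))  # about -0.204
```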
Example #18
  def _setup_sparsity(self):
    begin_step = self._spec.sparsity_function_begin_step
    end_step = self._spec.sparsity_function_end_step
    initial_sparsity = self._spec.initial_sparsity
    target_sparsity = self._spec.target_sparsity
    exponent = self._spec.sparsity_function_exponent

    if begin_step >= end_step:
      raise ValueError(
          'Pruning must begin before it can end. begin_step=%d, end_step=%d' %
          (begin_step, end_step))

    with ops.name_scope(self._spec.name):
      p = math_ops.minimum(1.0,
                           math_ops.maximum(
                               0.0,
                               math_ops.div(
                                   math_ops.cast(self._global_step - begin_step,
                                                 np.float32),
                                   end_step - begin_step)))
      sparsity = math_ops.add(
          math_ops.multiply(initial_sparsity - target_sparsity,
                            math_ops.pow(1 - p, exponent)),
          target_sparsity,
          name='sparsity')

    return sparsity
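Note: the pruning schedule interpolates from `initial_sparsity` to `target_sparsity` with a polynomial ramp in `p`; a worked sketch with hypothetical parameter values:

```python
def sparsity_at(step, begin_step, end_step,
                initial_sparsity, target_sparsity, exponent=3):
  # p ramps from 0 to 1 between begin_step and end_step, clipped at both ends.
  p = min(1.0, max(0.0, (step - begin_step) / (end_step - begin_step)))
  return (initial_sparsity - target_sparsity) * (1 - p) ** exponent + target_sparsity

# Steps 0, 1000, 2000 -> 0.0, 0.7875, 0.9.
for step in (0, 1000, 2000):
  print(step, sparsity_at(step, begin_step=0, end_step=2000,
                          initial_sparsity=0.0, target_sparsity=0.9))
```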
Example #19
  def __call__(self, step):
    with ops.name_scope(self.name, "NoisyLinearCosineDecay",
                        [self.initial_learning_rate, step]) as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      decay_steps = math_ops.cast(self.decay_steps, dtype)
      initial_variance = math_ops.cast(self.initial_variance, dtype)
      variance_decay = math_ops.cast(self.variance_decay, dtype)
      num_periods = math_ops.cast(self.num_periods, dtype)
      alpha = math_ops.cast(self.alpha, dtype)
      beta = math_ops.cast(self.beta, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      variance = initial_variance / (
          math_ops.pow(1.0 + global_step_recomp, variance_decay))
      std = math_ops.sqrt(variance)
      noisy_linear_decayed = (
          linear_decayed + random_ops.random_normal(
              linear_decayed.shape, stddev=std))

      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
      noisy_linear_cosine_decayed = (
          (alpha + noisy_linear_decayed) * cosine_decayed + beta)

      return math_ops.multiply(
          initial_learning_rate, noisy_linear_cosine_decayed, name=name)
Example #20
  def compress(self, inputs):
    """Compress inputs and store their binary representations into strings.

    Args:
      inputs: `Tensor` with values to be compressed.

    Returns:
      String `Tensor` vector containing the compressed representation of each
      batch element of `inputs`.
    """
    with ops.name_scope(self._name_scope()):
      inputs = ops.convert_to_tensor(inputs)
      if not self.built:
        # Check input assumptions set before layer building, e.g. input rank.
        self._assert_input_compatibility(inputs)
        if self.dtype is None:
          self._dtype = inputs.dtype.base_dtype.name
        self.build(inputs.shape)

      # Check input assumptions set after layer building, e.g. input shape.
      if not context.executing_eagerly():
        self._assert_input_compatibility(inputs)

      ndim = self.input_spec.ndim
      channel_axis = self._channel_axis(ndim)
      # Tuple of slices for expanding dimensions of tensors below.
      slices = ndim * [None] + [slice(None)]
      slices[channel_axis] = slice(None)
      slices = tuple(slices)

      # Expand dimensions of CDF to input dimensions, keeping the channels along
      # the right dimension.
      cdf = self._quantized_cdf[slices[1:]]
      num_levels = array_ops.shape(cdf)[-1] - 1

      # Bring inputs to the right range by centering the range on the medians.
      half = constant_op.constant(.5, dtype=self.dtype)
      medians = array_ops.squeeze(self._medians, [1, 2])
      offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians
      # Expand offsets to input dimensions and add to inputs.
      values = inputs + offsets[slices[:-1]]

      # Clip to range and cast to integers. Because we have added .5 above, and
      # all values are positive, the cast effectively implements rounding.
      values = math_ops.maximum(values, half)
      values = math_ops.minimum(
          values, math_ops.cast(num_levels, self.dtype) - half)
      values = math_ops.cast(values, dtypes.int16)

      def loop_body(tensor):
        return coder_ops.range_encode(
            tensor, cdf, precision=self.range_coder_precision)
      strings = functional_ops.map_fn(
          loop_body, values, dtype=dtypes.string, back_prop=False)

      if not context.executing_eagerly():
        strings.set_shape(inputs.shape[:1])

      return strings
Example #21
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
                 name=None):
  """Applies cosine decay to the learning rate.

  See [Loshchilov & Hutter, ICLR 2017], SGDR: Stochastic Gradient Descent
  with Warm Restarts. https://arxiv.org/abs/1608.03983

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies a cosine decay function
  to a provided initial learning rate.  It requires a `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:
  ```python
  global_step = min(global_step, decay_steps)
  cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
  decayed = (1 - alpha) * cosine_decay + alpha
  decayed_learning_rate = learning_rate * decayed
  ```

  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed = cosine_decay(learning_rate, global_step, decay_steps)
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
      The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Number of steps to decay over.
    alpha: A scalar `float32` or `float64` Tensor or a Python number.
      Minimum learning rate value as a fraction of learning_rate.
    name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("cosine decay requires global_step")
  with ops.name_scope(name, "CosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    completed_fraction = global_step / decay_steps
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))

    decayed = (1 - alpha) * cosine_decayed + alpha
    return math_ops.multiply(learning_rate, decayed)
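Note: the decay formula in the docstring can be evaluated by hand at a few steps (plain-Python sketch, hypothetical helper name):

```python
import math

def cosine_decay_value(learning_rate, global_step, decay_steps, alpha=0.0):
  # learning_rate * ((1 - alpha) * 0.5 * (1 + cos(pi * step / decay_steps)) + alpha)
  global_step = min(global_step, decay_steps)
  cosine = 0.5 * (1.0 + math.cos(math.pi * global_step / decay_steps))
  return learning_rate * ((1 - alpha) * cosine + alpha)

print(cosine_decay_value(0.1, 0, 1000))     # 0.1
print(cosine_decay_value(0.1, 500, 1000))   # ~0.05
print(cosine_decay_value(0.1, 1000, 1000))  # ~0.0 (alpha sets the floor when nonzero)
```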
Example #22
 def _compare(self, x, y, use_gpu):
   np_min, np_max = np.minimum(x, y), np.maximum(x, y)
   with test_util.device(use_gpu=use_gpu):
     inx = ops.convert_to_tensor(x)
     iny = ops.convert_to_tensor(y)
     omin, omax = math_ops.minimum(inx, iny), math_ops.maximum(inx, iny)
     tf_min, tf_max = self.evaluate([omin, omax])
   self.assertAllEqual(np_min, tf_min)
   self.assertAllEqual(np_max, tf_max)
Example #23
 def linear_decay_fn(global_step):
   if global_step is None:
     raise ValueError("global_step is required for linear_decay.")
   global_step = math_ops.minimum(global_step, decay_steps)
   remaining_steps = math_ops.to_int32(decay_steps) - math_ops.to_int32(
       global_step)
   decayed = math_ops.to_float(remaining_steps) / math_ops.to_float(
       decay_steps)
   return math_ops.maximum(0.0, decayed)
Example #24
def eye(
    num_rows,
    num_columns=None,
    batch_shape=None,
    dtype=dtypes.float32,
    name=None):
  """Construct an identity matrix, or a batch of matrices.

  ```python
  # Construct one identity matrix.
  tf.eye(2)
  ==> [[1., 0.],
       [0., 1.]]

  # Construct a batch of 3 identity matrices, each 2 x 2.
  # batch_identity[i, :, :] is a 2 x 2 identity matrix, i = 0, 1, 2.
  batch_identity = tf.eye(2, batch_shape=[3])

  # Construct one 2 x 3 "identity" matrix
  tf.eye(2, num_columns=3)
  ==> [[ 1.,  0.,  0.],
       [ 0.,  1.,  0.]]
  ```

  Args:
    num_rows: Non-negative `int32` scalar `Tensor` giving the number of rows
      in each batch matrix.
    num_columns: Optional non-negative `int32` scalar `Tensor` giving the number
      of columns in each batch matrix.  Defaults to `num_rows`.
    batch_shape:  `int32` `Tensor`.  If provided, returned `Tensor` will have
      leading batch dimensions of this shape.
    dtype:  The type of an element in the resulting `Tensor`
    name:  A name for this `Op`.  Defaults to "eye".

  Returns:
    A `Tensor` of shape `batch_shape + [num_rows, num_columns]`
  """
  with ops.name_scope(
      name, default_name="eye", values=[num_rows, num_columns, batch_shape]):

    batch_shape = [] if batch_shape is None else batch_shape
    batch_shape = ops.convert_to_tensor(
        batch_shape, name="shape", dtype=dtypes.int32)

    if num_columns is None:
      diag_size = num_rows
    else:
      diag_size = math_ops.minimum(num_rows, num_columns)
    diag_shape = array_ops.concat_v2((batch_shape, [diag_size]), 0)
    diag_ones = array_ops.ones(diag_shape, dtype=dtype)

    if num_columns is None:
      return array_ops.matrix_diag(diag_ones)
    else:
      shape = array_ops.concat_v2((batch_shape, [num_rows, num_columns]), 0)
      zero_matrix = array_ops.zeros(shape, dtype=dtype)
      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
Example #25
  def apply(self, var_list=None):
    """Maintains moving averages of variables.

    `var_list` must be a list of `Variable` or `Tensor` objects.  This method
    creates shadow variables for all elements of `var_list`.  Shadow variables
    for `Variable` objects are initialized to the variable's initial value.
    For `Tensor` objects, the shadow variables are initialized to 0.

    Shadow variables are created with `trainable=False` and added to the
    `GraphKeys.ALL_VARIABLES` collection.  They will be returned by calls to
    `tf.all_variables()`.

    Returns an op that updates all shadow variables as described above.

    Note that `apply()` can be called multiple times with different lists of
    variables.

    Args:
      var_list: A list of Variable or Tensor objects. The variables
        and Tensors must be of types float32 or float64.

    Returns:
      An Operation that updates the moving averages.

    Raises:
      TypeError: If the arguments are not all float32 or float64.
      ValueError: If the moving average of one of the variables is already
        being computed.
    """
    # TODO(touts): op_scope
    if var_list is None:
      var_list = variables.trainable_variables()
    for var in var_list:
      if var.dtype.base_dtype not in [types.float32, types.float64]:
        raise TypeError("The variables must be float or double: %s" % var)
      if var in self._averages:
        raise ValueError("Moving average already computed for: %s" % var)
      with ops.name_scope(var.op.name + "/" + self._name) as scope:
        with ops.device(var.device):
          if isinstance(var, variables.Variable):
            initial_value = var.initialized_value()
          else:
            initial_value = array_ops.zeros(var.get_shape().as_list())
          avg = variables.Variable(initial_value, name=scope, trainable=False)
          self._averages[var] = avg
    with ops.name_scope(self._name) as scope:
      decay = ops.convert_to_tensor(self._decay, name="decay")
      if self._num_updates is not None:
        num_updates = math_ops.cast(self._num_updates, types.float32,
                                    name="num_updates")
        decay = math_ops.minimum(decay,
                                 (1.0 + num_updates) / (10.0 + num_updates))
      updates = []
      for var in var_list:
        updates.append(assign_moving_average(self._averages[var], var, decay))
      return control_flow_ops.group(*updates, name=scope)
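Note: with `num_updates` set, the effective decay is `min(decay, (1 + n) / (10 + n))`, so the average adapts quickly early in training; a quick numeric check (illustrative helper):

```python
def effective_decay(decay, num_updates):
  # Early on the (1 + n) / (10 + n) term dominates; later the configured decay wins.
  return min(decay, (1.0 + num_updates) / (10.0 + num_updates))

for n in (0, 10, 100, 10000):
  print(n, effective_decay(0.999, n))  # 0.1, 0.55, ~0.918, 0.999
```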
Example #26
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      completed_fraction = global_step_recomp / decay_steps
      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
          constant_op.constant(math.pi) * completed_fraction))

      decayed = (1 - alpha) * cosine_decayed + alpha
      return math_ops.multiply(learning_rate, decayed)
Example #27
 def _compare(self, x, y, use_gpu):
   np_min, np_max = np.minimum(x, y), np.maximum(x, y)
   with self.test_session(
       use_gpu=use_gpu,
       force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
     inx = ops.convert_to_tensor(x)
     iny = ops.convert_to_tensor(y)
     omin, omax = math_ops.minimum(inx, iny), math_ops.maximum(inx, iny)
     tf_min, tf_max = sess.run([omin, omax])
   self.assertAllEqual(np_min, tf_min)
   self.assertAllEqual(np_max, tf_max)
Example #28
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode."""
      global_step_recomp = math_ops.cast(global_step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))

      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
      return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
Example #29
 def cosine_decay_fn(global_step):
   if global_step is None:
     raise ValueError("global_step is required for cosine_decay.")
   global_step = math_ops.minimum(global_step, decay_steps)
   completed_fraction = math_ops.to_float(global_step) / math_ops.to_float(
       decay_steps)
   fraction = 2.0 * num_periods * completed_fraction
   decayed = 0.5 * (
       1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
   if zero_after is not None:
     decayed = array_ops.where(
         math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed)
   return decayed
Example #30
 def restart_decay_fn(global_step):
   if global_step is None:
     raise ValueError("global_step is required for cosine_decay.")
   global_step = math_ops.minimum(global_step, decay_steps)
   num = math_ops.mod(num_periods * math_ops.to_float(global_step),
                      decay_steps)
   fraction = num / math_ops.to_float(decay_steps)
   decayed = 0.5 * (
       1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
   if zero_after is not None:
     tmp = math_ops.to_float(
         num_periods * global_step) / math_ops.to_float(decay_steps)
     decayed = array_ops.where(
         math_ops.greater_equal(tmp, zero_after), 0.0, decayed)
   return decayed
Example #31
    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        if self._initial_total_steps > 0:
            total_steps = math_ops.cast(self._total_steps_t,
                                        var.dtype.base_dtype)
            warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                              var.dtype.base_dtype)
            min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
            warmup_steps = total_steps * warmup_proportion
            decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                step <= warmup_steps,
                lr_t * (step / warmup_steps),
                lr_t + decay_rate *
                math_ops.minimum(step - warmup_steps, decay_steps),
            )

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = scatter_add(m, indices, m_scaled_g_values)
        m_corr_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = scatter_add(v, indices, v_scaled_g_values)
        if self._amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power)) + epsilon_t
        else:
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) /
                            (sma_inf - 2.0) * sma_inf / sma_t)

        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / v_corr_t, m_corr_t)

        if var in self.decay_vars:
            if self._initial_weight_decay > 0.0:
                var_t += math_ops.cast(self._weight_decay_t,
                                       var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self._amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)
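Note: the learning-rate adjustment at the top of this method is a linear warmup followed by a linear decay toward `min_lr`; a standalone sketch with assumed parameter values (helper name is illustrative):

```python
def warmup_then_linear_decay(lr, step, total_steps, warmup_proportion, min_lr):
  # Linear warmup to lr over warmup_steps, then linear decay toward min_lr,
  # mirroring the lr_t computation in the sparse-apply path above.
  warmup_steps = total_steps * warmup_proportion
  decay_steps = max(total_steps - warmup_steps, 1)
  decay_rate = (min_lr - lr) / decay_steps
  if step <= warmup_steps:
    return lr * step / warmup_steps
  return lr + decay_rate * min(step - warmup_steps, decay_steps)

for step in (10, 100, 500, 2000):
  print(step, warmup_then_linear_decay(1e-3, step, total_steps=1000,
                                       warmup_proportion=0.1, min_lr=1e-5))
```

Past `total_steps`, the decay argument is capped at `decay_steps`, so the rate settles at `min_lr`.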
Example #32
    def apply(self, var_list=None):
        """Maintains moving averages of variables.

    `var_list` must be a list of `Variable` objects.  This method
    creates shadow variables (holding the moving averages)
    for all elements of `var_list`, and
    updates the moving averages using the current `var_list` values. Shadow
    variables for `Variable` objects are initialized to the variable's initial
    value.

    Shadow variables are created with `trainable=False`. To access them you
    can use the EMA object's `average` method. Note that `EMA` objects are
    not trackable by checkpoints, so if you want to checkpoint or restore the
    moving variables you will need to manually grab the shadow
    variables via `average()` and assign them as `tf.Module` properties or
    directly pass them to your `tf.train.Checkpoint`.

    Note that `apply()` can be called multiple times. When eager execution is
    enabled each call to apply will update the variables once, so this needs to
    be called in a loop.

    In legacy TF 1.x graphs, this method returns an op that updates all
    shadow variables from the current value of their associated variables. In
    TF 1.x graphs without automatic control dependencies, this op needs to be
    run manually.

    Args:
      var_list: A list of Variable objects. The variables
        must be of types bfloat16, float16, float32, or float64.
        (In legacy TF 1.x graphs these may be tensors, but this is unsupported
        when eager execution is enabled.)

    Returns:
      An Operation that updates the moving averages.

    Raises:
      TypeError: If the arguments are not an allowed type.
    """
        # TODO(touts): op_scope
        if var_list is None:
            var_list = variables.trainable_variables()
        for v in var_list:
            if (isinstance(v, ops.Tensor)
                    and ops.executing_eagerly_outside_functions()):
                raise TypeError(
                    "tf.train.ExponentialMovingAverage does not support non-Variable"
                    " tensors when eager execution is enabled.")
        zero_debias_true = set()  # set of vars to set `zero_debias=True`
        for var in var_list:
            if var.dtype.base_dtype not in [
                    dtypes.bfloat16, dtypes.float16, dtypes.float32,
                    dtypes.float64
            ]:
                raise TypeError(
                    "The variables must be half, float, or double: %s" %
                    var.name)

            if var.ref() not in self._averages:
                # For variables: to lower communication bandwidth across devices we keep
                # the moving averages on the same device as the variables. For other
                # tensors, we rely on the existing device allocation mechanism.
                with ops.init_scope():
                    if isinstance(var, variables.Variable):
                        with ops.device(var.device):
                            initialized_value = var.initialized_value()
                        avg = slot_creator.create_slot(
                            var,
                            initialized_value,
                            self.name,
                            colocate_with_primary=True,
                            copy_xla_sharding=True)
                        # NOTE(mrry): We only add `tf.Variable` objects to the
                        # `MOVING_AVERAGE_VARIABLES` collection.
                        ops.add_to_collection(
                            ops.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
                    else:
                        avg = slot_creator.create_zeros_slot(
                            var,
                            self.name,
                            colocate_with_primary=(var.op.type in [
                                "Variable", "VariableV2", "VarHandleOp"
                            ]),
                            copy_xla_sharding=True)
                        if self._zero_debias:
                            zero_debias_true.add(avg.ref())
                self._averages[var.ref()] = avg

        with ops.name_scope(self.name) as scope:
            decay = ops.convert_to_tensor(self._decay,
                                          dtype=dtypes.float32,
                                          name="decay")
            if self._num_updates is not None:
                num_updates = math_ops.cast(self._num_updates,
                                            dtypes.float32,
                                            name="num_updates")
                decay = math_ops.minimum(decay, (1.0 + num_updates) /
                                         (10.0 + num_updates))
            updates = []
            for var in var_list:
                avg = self._averages[var.ref()]
                zero_debias = avg.ref() in zero_debias_true
                updates.append(
                    assign_moving_average(avg, var, decay, zero_debias))
            return control_flow_ops.group(*updates, name=scope)
Example #33
def eye(num_rows,
        num_columns=None,
        batch_shape=None,
        dtype=dtypes.float32,
        name=None):
  """Construct an identity matrix, or a batch of matrices.

  ```python
  # Construct one identity matrix.
  tf.eye(2)
  ==> [[1., 0.],
       [0., 1.]]

  # Construct a batch of 3 identity matrices, each 2 x 2.
  # batch_identity[i, :, :] is a 2 x 2 identity matrix, i = 0, 1, 2.
  batch_identity = tf.eye(2, batch_shape=[3])

  # Construct one 2 x 3 "identity" matrix
  tf.eye(2, num_columns=3)
  ==> [[ 1.,  0.,  0.],
       [ 0.,  1.,  0.]]
  ```

  Args:
    num_rows: Non-negative `int32` scalar `Tensor` giving the number of rows
      in each batch matrix.
    num_columns: Optional non-negative `int32` scalar `Tensor` giving the number
      of columns in each batch matrix.  Defaults to `num_rows`.
    batch_shape:  A list or tuple of Python integers or a 1-D `int32` `Tensor`.
      If provided, the returned `Tensor` will have leading batch dimensions of
      this shape.
    dtype:  The type of an element in the resulting `Tensor`
    name:  A name for this `Op`.  Defaults to "eye".

  Returns:
    A `Tensor` of shape `batch_shape + [num_rows, num_columns]`
  """
  with ops.name_scope(
      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
    is_square = num_columns is None
    batch_shape = [] if batch_shape is None else batch_shape
    num_columns = num_rows if num_columns is None else num_columns
    if isinstance(num_rows, ops.Tensor) or isinstance(
        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
      batch_shape = ops.convert_to_tensor(
          batch_shape, name='shape', dtype=dtypes.int32)
      diag_size = math_ops.minimum(num_rows, num_columns)
      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
      if not is_square:
        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
    else:
      if not isinstance(num_rows, compat.integral_types) or not isinstance(
          num_columns, compat.integral_types):
        raise TypeError(
            'num_rows and num_columns must be positive integer values.')
      batch_shape = [dim for dim in batch_shape]
      is_square = num_rows == num_columns
      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
      if not is_square:
        shape = batch_shape + [num_rows, num_columns]

    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
    if is_square:
      return array_ops.matrix_diag(diag_ones)
    else:
      zero_matrix = array_ops.zeros(shape, dtype=dtype)
      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
Example #34
    def _finish(self, state):
        var_dtype = self._variables[0].dtype.base_dtype
        # Update global step.
        global_step = self._get_global_step(state)
        update_global_step = state_ops.assign_add(global_step, 1.)

        # Update the first moment estimate.
        beta1 = state.get_hyper("beta1", dtype=var_dtype)
        moment1 = self._get_moment1(state)
        flat_grad = self._get_flat_grad(state)
        # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t
        update_moment1 = moment1.assign(beta1 * moment1 +
                                        (1. - beta1) * flat_grad)

        # Update the gradient buffer.
        window = state.get_hyper("window")
        grad_buffer = self._get_grad_buffer(state)
        next_grad_index = math_ops.floormod(
            math_ops.to_int32(update_global_step - 1.), window)
        # grad_buffer[(t-1) % window] := moment1_t
        update_grad_buffer = state_ops.scatter_update(grad_buffer,
                                                      next_grad_index,
                                                      update_moment1)

        # Compute the update step.
        eps = state.get_hyper("eps", dtype=var_dtype)
        svd_eps = state.get_hyper("svd_eps", dtype=var_dtype)
        sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype)
        lr = state.get_hyper("lr", dtype=var_dtype)
        denom = math_ops.sqrt(
            math_ops.minimum(
                ops.convert_to_tensor(update_global_step),
                ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype))))
        moment1_2d = array_ops.expand_dims(update_moment1, -1)

        # m = grad_buffer^T / sqrt(min(t, window))
        # m has shape [model dimension, window], where model dimension is the sum
        # of the dimensions of the flattened variables.
        m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom))

        # sigma, u, _ = SVD(m^Tm + I * svd_eps)
        mm = math_ops.matmul(m, m, transpose_a=True)
        damping = math_ops.cast(linalg_ops.eye(window),
                                dtype=var_dtype) * svd_eps
        sigma, u, _ = linalg_ops.svd(mm + damping)
        sigma_sqrt = math_ops.sqrt(sigma)
        sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt)

        # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3
        # We add sigma_eps to alleviate numerical instability.
        # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T.
        sigma_sqrt_inv = math_ops.divide(
            math_ops.cast(1.0, dtype=var_dtype),
            math_ops.pow(sigma_sqrt + sigma_eps, 3))

        # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the
        # inversion of a model dimension by model dimension matrix is needed. To
        # speed up this computation we calculate the following instead:
        # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1.
        new_step = array_ops.expand_dims(
            array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1)
        head = math_ops.matmul(
            m,
            math_ops.matmul(
                u,
                math_ops.matmul(
                    array_ops.diag(sigma_sqrt_inv),
                    math_ops.matmul(u,
                                    math_ops.matmul(m,
                                                    moment1_2d,
                                                    transpose_a=True),
                                    transpose_a=True))))

        # When inverting (mm^T)^(1/2), we also add epsilon * I regularization for
        # degenerate cases. We expand ((mm^T)^(1/2) + epsilon * I)^(-1) using
        # Woodbury's identity.
        # For full derivation please see paper at
        # https://arxiv.org/pdf/1806.02958.pdf
        tail = moment1_2d - math_ops.matmul(
            m,
            math_ops.matmul(
                u,
                math_ops.matmul(
                    array_ops.diag(
                        math_ops.divide(math_ops.cast(1.0, dtype=var_dtype),
                                        sigma)),
                    math_ops.matmul(u,
                                    math_ops.matmul(
                                        m, moment1_2d, transpose_a=True),
                                    transpose_a=True))))
        scaled_tail = math_ops.divide(tail, sigma_sqrt_min)

        update_new_step = control_flow_ops.cond(
            sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail),
            lambda: math_ops.add(new_step, head))

        # Update each variable.
        update_step = []
        for var in self._variables:
            dim = self.shape_dict[var.name]
            start_index = self.index_dict[var.name]
            end_index = start_index + dim
            var_update_correct_shape = array_ops.reshape(
                update_new_step[start_index:end_index], var.get_shape())
            var_updated = state_ops.assign_sub(var,
                                               lr * var_update_correct_shape)
            update_step.append(var_updated)

        return control_flow_ops.group(update_step)
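A minimal NumPy check (illustrative only, not part of the optimizer code) of the identity the comments above rely on: for a tall matrix `m` with full column rank and any `g` in its column space, `m (m^T m)^(-3/2) m^T g` equals the pseudo-inverse square root of `m m^T` applied to `g`, so only a `window x window` eigenproblem is needed instead of a model-dimension one.

```python
import numpy as np

rng = np.random.default_rng(0)
d, w = 50, 5                              # model dimension >> window
m = rng.standard_normal((d, w))
g = m @ rng.standard_normal(w)            # a vector in the column space of m

# Cheap route (as in the code above): eigendecompose the w x w Gram matrix.
sigma, u = np.linalg.eigh(m.T @ m)
fast = m @ (u * sigma ** -1.5) @ u.T @ (m.T @ g)

# Expensive reference: pseudo-inverse square root of the d x d matrix m m^T.
lam, v = np.linalg.eigh(m @ m.T)
inv_sqrt = np.where(lam > 1e-8, 1.0 / np.sqrt(np.maximum(lam, 1e-12)), 0.0)
slow = (v * inv_sqrt) @ v.T @ g

print(np.allclose(fast, slow))            # True
```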
Example #35
0
def _ragged_getitem_inner_dimensions(rt_input, key_list):
  """Retrieve inner dimensions, keeping outermost dimension unchanged.

  Args:
    rt_input: The `RaggedTensor` or `Tensor` from which a piece should be
      extracted.
    key_list: The __getitem__ keys for slicing the inner dimensions.

  Returns:
    A `RaggedTensor`.

  Raises:
    ValueError: If key_list is not supported.
  """
  if not key_list:
    return rt_input

  if isinstance(rt_input, ops.Tensor):
    return rt_input.__getitem__([slice(None, None, None)] + key_list)

  column_key = key_list[0]
  if column_key is Ellipsis:
    expanded_key_list = _expand_ellipsis(key_list, rt_input.values.shape.ndims)
    return _ragged_getitem_inner_dimensions(rt_input, expanded_key_list)

  # Adding a new axis to a ragged inner dimension: recursively get the inner
  # dimensions of rt_input with key_list[1:], and then wrap the result in a
  # RaggedTensor that puts each value in its own row.
  if column_key is array_ops.newaxis:
    inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:])
    nsplits = array_ops.shape(inner_rt.row_splits,
                              out_type=inner_rt.row_splits.dtype)[0]
    return ragged_tensor.RaggedTensor.from_row_splits(inner_rt,
                                                      math_ops.range(nsplits),
                                                      validate=False)

  # Slicing a range of columns in a ragged inner dimension.  We use a
  # recursive call to process the values, and then assemble a RaggedTensor
  # with those values.
  if isinstance(column_key, slice):
    if (column_key.start is None and column_key.stop is None and
        column_key.step is None):
      # Trivial slice: recursively process all values, & splits is unchanged.
      return rt_input.with_values(
          _ragged_getitem_inner_dimensions(rt_input.values, key_list[1:]))
    else:
      if not (isinstance(column_key.start, (ops.Tensor, int, type(None))) and
              isinstance(column_key.stop, (ops.Tensor, int, type(None)))):
        raise TypeError("slice offsets must be integers or None")

      # Nontrivial slice: use ragged_gather to extract the indicated slice as
      # a new RaggedTensor (inner_rt), and then recursively process its values.
      starts = rt_input.row_splits[:-1]
      limits = rt_input.row_splits[1:]
      step = 1 if column_key.step is None else column_key.step
      lower_bound = _if_ge_zero(step, lambda: starts, lambda: starts - 1)
      upper_bound = _if_ge_zero(step, lambda: limits, lambda: limits - 1)
      # inner_rt_starts[i] = index to start gathering for row i.
      if column_key.start is None:
        inner_rt_starts = _if_ge_zero(step, lambda: starts, lambda: limits - 1)
      else:
        start_offset = math_ops.cast(column_key.start, starts.dtype)
        inner_rt_starts = _if_ge_zero(
            column_key.start,
            lambda: math_ops.minimum(starts + start_offset, upper_bound),
            lambda: math_ops.maximum(limits + start_offset, lower_bound))
      # inner_rt_limits[i] = index to stop gathering for row i.
      if column_key.stop is None:
        inner_rt_limits = _if_ge_zero(step, lambda: limits, lambda: starts - 1)
      else:
        stop_offset = math_ops.cast(column_key.stop, starts.dtype)
        inner_rt_limits = _if_ge_zero(
            column_key.stop,
            lambda: math_ops.minimum(starts + stop_offset, upper_bound),
            lambda: math_ops.maximum(limits + stop_offset, lower_bound))
      inner_rt = _build_ragged_tensor_from_value_ranges(
          inner_rt_starts, inner_rt_limits, column_key.step, rt_input.values)
      return inner_rt.with_values(
          _ragged_getitem_inner_dimensions(inner_rt.values, key_list[1:]))

  # Indexing a single column in a ragged inner dimension: raise an Exception.
  # See RaggedTensor.__getitem__.__doc__ for an explanation of why indexing
  # into a ragged inner dimension is problematic.
  else:
    raise ValueError("Cannot index into an inner ragged dimension.")
Example #36
0
def log_ndtr(x, series_order=3, name="log_ndtr"):
  """Log Normal distribution function.

  For details of the Normal distribution function see `ndtr`.

  This function calculates `(log o ndtr)(x)` by either calling `log(ndtr(x))` or
  using an asymptotic series. Specifically:
  - For `x > upper_segment`, use the approximation `-ndtr(-x)` based on
    `log(1-x) ~= -x, x << 1`.
  - For `lower_segment < x <= upper_segment`, use the existing `ndtr` technique
    and take a log.
  - For `x <= lower_segment`, we use the series approximation of erf to compute
    the log CDF directly.

  The `lower_segment` is set based on the precision of the input:

  ```
  lower_segment = { -20,  x.dtype=float64
                  { -10,  x.dtype=float32
  upper_segment = {   8,  x.dtype=float64
                  {   5,  x.dtype=float32
  ```

  When `x < lower_segment`, the `ndtr` asymptotic series approximation is:

  ```
     ndtr(x) = scale * (1 + sum) + R_N
     scale   = exp(-0.5 x**2) / (-x sqrt(2 pi))
     sum     = Sum{(-1)^n (2n-1)!! / (x**2)^n, n=1:N}
     R_N     = O(exp(-0.5 x**2) (2N+1)!! / |x|^{2N+3})
  ```

  where `(2n-1)!! = (2n-1) (2n-3) (2n-5) ...  (3) (1)` is a
  [double-factorial](https://en.wikipedia.org/wiki/Double_factorial).


  Args:
    x: `Tensor` of type `float32`, `float64`.
    series_order: Non-negative Python `integer`. Maximum depth to
      evaluate the asymptotic expansion. This is the `N` above.
    name: Python string. A name for the operation (default="log_ndtr").

  Returns:
    log_ndtr: `Tensor` with `dtype=x.dtype`.

  Raises:
    TypeError: if `x.dtype` is not handled.
    TypeError: if `series_order` is not a Python `integer`.
    ValueError:  if `series_order` is not in `[0, 30]`.
  """
  if not isinstance(series_order, int):
    raise TypeError("series_order must be a Python integer.")
  if series_order < 0:
    raise ValueError("series_order must be non-negative.")
  if series_order > 30:
    raise ValueError("series_order must be <= 30.")

  with ops.name_scope(name, values=[x]):
    x = ops.convert_to_tensor(x, name="x")

    if x.dtype.as_numpy_dtype == np.float64:
      lower_segment = LOGNDTR_FLOAT64_LOWER
      upper_segment = LOGNDTR_FLOAT64_UPPER
    elif x.dtype.as_numpy_dtype == np.float32:
      lower_segment = LOGNDTR_FLOAT32_LOWER
      upper_segment = LOGNDTR_FLOAT32_UPPER
    else:
      raise TypeError("x.dtype=%s is not supported." % x.dtype)

    # The basic idea here was ported from py/scipy/special/cephes/ndtr.c.
    # We copy the main idea, with a few changes
    # * For x >> 1, and X ~ Normal(0, 1),
    #     Log[P[X < x]] = Log[1 - P[X < -x]] approx -P[X < -x],
    #     which extends the range of validity of this function.
    # * We use one fixed series_order for all of 'x', rather than adaptive.
    # * Our docstring properly reflects that this is an asymptotic series, not a
    #   Taylor series. We also provided a correct bound on the remainder.
    # * We need to use the max/min in the _log_ndtr_lower arg to avoid nan when
    #   x=0. This happens even though the branch is unchosen because when x=0
    #   the gradient of a select involves the calculation 1*dy+0*(-inf)=nan
    #   regardless of whether dy is finite. Note that the minimum is a NOP if
    #   the branch is chosen.
    return array_ops.where(
        math_ops.greater(x, upper_segment),
        -_ndtr(-x),  # log(1-x) ~= -x, x << 1
        array_ops.where(math_ops.greater(x, lower_segment),
                        math_ops.log(_ndtr(math_ops.maximum(x, lower_segment))),
                        _log_ndtr_lower(math_ops.minimum(x, lower_segment),
                                        series_order)))
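A standalone NumPy sketch (an illustration, with SciPy used only for the reference value; the helper name is hypothetical) of the lower-tail asymptotic series quoted in the docstring, `ndtr(x) ~= scale * (1 + sum)`:

```python
import numpy as np
from scipy.special import log_ndtr  # reference implementation

def log_ndtr_lower_series(x, series_order=3):
    """Asymptotic expansion of log(ndtr(x)) for x far below zero."""
    x2 = x * x
    log_scale = -0.5 * x2 - np.log(-x) - 0.5 * np.log(2.0 * np.pi)
    total, double_fact, x2n = 0.0, 1.0, x2
    for n in range(1, series_order + 1):
        double_fact *= 2 * n - 1                  # (2n-1)!!
        total += (-1.0) ** n * double_fact / x2n
        x2n *= x2
    return log_scale + np.log1p(total)

x = -25.0   # well below the float64 lower_segment of -20
print(np.allclose(log_ndtr_lower_series(x), log_ndtr(x)))  # True
```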
Example #37
0
def renyi_alpha(step,
                decay_time,
                alpha_min,
                alpha_max=0.99999,
                name='renyi_alpha'):
    r"""Exponentially decaying `Tensor` appropriate for Renyi ratios.

  When minimizing the Renyi divergence for `0 <= alpha < 1` (or maximizing the
  Renyi equivalent of elbo) in high dimensions, it is not uncommon to experience
  `NaN` and `inf` values when `alpha` is far from `1`.

  For that reason, it is often desirable to start the optimization with `alpha`
  very close to 1, and reduce it to a final `alpha_min` according to some
  schedule.  The user may even want to optimize using `elbo_ratio` for
  some fixed time before switching to Renyi based methods.

  This `Op` returns an `alpha` decaying exponentially with step:

  ```
  s(step) = (exp{step / decay_time} - 1) / (e - 1)
  t(s) = max(0, min(s, 1)),  (smooth growth from 0 to 1)
  alpha(t) = (1 - t) alpha_max + t alpha_min
  ```

  Args:
    step:  Non-negative scalar `Tensor`.  Typically the global step or an
      offset version thereof.
    decay_time:  Positive scalar `Tensor`.
    alpha_min:  `float` or `double` `Tensor`.
      The minimal, final value of `alpha`, achieved when `step >= decay_time`
    alpha_max:  `Tensor` of same `dtype` as `alpha_min`.
      The maximal, beginning value of `alpha`, achieved when `step == 0`
    name:  A name to give this `Op`.

  Returns:
    alpha:  A `Tensor` of same `dtype` as `alpha_min`.
  """
    with ops.name_scope(name, values=[step, decay_time, alpha_min, alpha_max]):
        alpha_min = ops.convert_to_tensor(alpha_min, name='alpha_min')
        dtype = alpha_min.dtype

        alpha_max = ops.convert_to_tensor(alpha_max,
                                          dtype=dtype,
                                          name='alpha_max')
        decay_time = math_ops.cast(decay_time, dtype)
        step = math_ops.cast(step, dtype)

        check_scalars = [
            check_ops.assert_rank(step, 0, message='step must be scalar'),
            check_ops.assert_rank(decay_time,
                                  0,
                                  message='decay_time must be scalar'),
            check_ops.assert_rank(alpha_min,
                                  0,
                                  message='alpha_min must be scalar'),
            check_ops.assert_rank(alpha_max,
                                  0,
                                  message='alpha_max must be scalar'),
        ]
        check_sign = [
            check_ops.assert_non_negative(step,
                                          message='step must be non-negative'),
            check_ops.assert_positive(decay_time,
                                      message='decay_time must be positive'),
        ]

        with ops.control_dependencies(check_scalars + check_sign):
            theta = (math_ops.exp(step / decay_time) - 1.) / (math.e - 1.)
            theta = math_ops.minimum(math_ops.maximum(theta, 0.), 1.)
            return alpha_max * (1. - theta) + alpha_min * theta
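A plain-NumPy restatement of the schedule above (illustrative, without the shape and sign assertions; the helper name is just for the sketch), showing that the decay starts at `alpha_max` and reaches `alpha_min` at `step == decay_time`:

```python
import numpy as np

def renyi_alpha_np(step, decay_time, alpha_min, alpha_max=0.99999):
    theta = (np.exp(step / decay_time) - 1.0) / (np.e - 1.0)
    theta = np.clip(theta, 0.0, 1.0)
    return alpha_max * (1.0 - theta) + alpha_min * theta

print(renyi_alpha_np(0, 1000, 0.25))      # 0.99999 (= alpha_max at step 0)
print(renyi_alpha_np(1000, 1000, 0.25))   # 0.25    (= alpha_min once step >= decay_time)
```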
Example #38
0
def huber_loss(labels,
               predictions,
               weights=1.0,
               delta=1.0,
               scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Adds a Huber Loss term to the training procedure.

  For each value x in `error=labels-predictions`, the following is calculated:

  ```
    0.5 * x^2                  if |x| <= d
    0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```

  where d is `delta`.

  See: https://en.wikipedia.org/wiki/Huber_loss

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    delta: `float`, the point where the huber loss function
      changes from a quadratic to linear.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or
     `predictions` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")
    with ops.name_scope(scope, "huber_loss",
                        (predictions, labels, weights)) as scope:
        predictions = math_ops.cast(predictions, dtype=dtypes.float32)
        labels = math_ops.cast(labels, dtype=dtypes.float32)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        error = math_ops.subtract(predictions, labels)
        abs_error = math_ops.abs(error)
        quadratic = math_ops.minimum(abs_error, delta)
        # The following expression is the same in value as
        # tf.maximum(abs_error - delta, 0), but importantly the gradient for the
        # expression when abs_error == delta is 0 (for tf.maximum it would be 1).
        # This is necessary to avoid doubling the gradient, since there is already a
        # nonzero contribution to the gradient from the quadratic term.
        linear = math_ops.subtract(abs_error, quadratic)
        losses = math_ops.add(
            math_ops.multiply(
                ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
                math_ops.multiply(quadratic, quadratic)),
            math_ops.multiply(delta, linear))
        return compute_weighted_loss(losses,
                                     weights,
                                     scope,
                                     loss_collection,
                                     reduction=reduction)
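A quick numeric check (standalone NumPy, not the loss op itself) of the piecewise definition in the docstring with `delta = 1.0`:

```python
import numpy as np

def huber_np(error, delta=1.0):
    abs_error = np.abs(error)
    quadratic = np.minimum(abs_error, delta)
    linear = abs_error - quadratic          # zero inside the quadratic region
    return 0.5 * quadratic ** 2 + delta * linear

print(huber_np(np.array([0.5, 2.0])))       # [0.125 1.5]: 0.5*0.5^2 and 0.5*1^2 + 1*(2-1)
```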
Example #39
0
def polynomial_decay(exploration_rate,
                     timestep,
                     decay_steps,
                     end_exploration_rate=0.0001,
                     power=1.0,
                     cycle=False,
                     name=None):
    """Applies a polynomial decay to the exploration rate.

    It is commonly observed that a monotonically decreasing exploration rate, whose
    degree of change is carefully chosen, results in a better performing model.
    This function applies a polynomial decay function to a provided initial
    `exploration_rate` to reach an `end_exploration_rate` in the given `decay_steps`.

    It requires a `timestep` value to compute the decayed exploration rate.  You
    can just pass a TensorFlow variable that you increment at each training step.

    The function returns the decayed exploration rate.  It is computed as:

    ```python
    >>> timestep = min(timestep, decay_steps)
    >>> decayed_exploration_rate = (exploration_rate - end_exploration_rate) *
    ...                            (1 - timestep / decay_steps) ^ (power) + end_exploration_rate
    ```

    If `cycle` is True then a multiple of `decay_steps` is used, the first one
    that is bigger than `timestep`.

    ```python
    >>> decay_steps = decay_steps * ceil(timestep / decay_steps)
    >>> decayed_exploration_rate = (exploration_rate - end_exploration_rate) *
    ...                            (1 - timestep / decay_steps) ^ (power) +
    ...                            end_exploration_rate

    ```

    Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

    ```python
    >>> timestep = tf.Variable(0, trainable=False)
    >>> starter_exploration_rate = 0.1
    >>> end_exploration_rate = 0.01
    >>> decay_steps = 10000
    >>> exploration_rate = tf.train.polynomial_decay(starter_exploration_rate, timestep,
    ...                                              decay_steps, end_exploration_rate, power=0.5)
    >>> # Passing timestep to minimize() will increment it at each step.
    >>> learning_step = (
    ...     tf.train.GradientDescentOptimizer(exploration_rate)
    ...     .minimize(...my loss..., timestep=timestep)
    ... )
    ```

    Args:
        exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The initial exploration rate.
        timestep: A scalar `int32` or `int64` `Tensor` or a Python number.
            Global step to use for the decay computation.  Must not be negative.
        decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Must be positive.  See the decay computation above.
        end_exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The minimal end exploration rate.
        power: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The power of the polynomial. Defaults to linear, 1.0.
        cycle: A boolean, whether or not it should cycle beyond decay_steps.
        name: String.  Optional name of the operation. Defaults to
            'PolynomialDecay'.

    Returns:
        A scalar `Tensor` of the same type as `exploration_rate`.  The decayed exploration rate.

    Raises:
        ValueError: if `timestep` is not supplied.
    """
    if timestep is None:
        raise ValueError("timestep is required for polynomial_decay.")
    with get_name_scope(name=name,
                        scope="PolynomialDecay",
                        values=[
                            exploration_rate, timestep, decay_steps,
                            end_exploration_rate, power
                        ]) as name:
        exploration_rate = ops.convert_to_tensor(exploration_rate,
                                                 name="exploration_rate")
        dtype = exploration_rate.dtype
        timestep = math_ops.cast(timestep, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        end_exploration_rate = math_ops.cast(end_exploration_rate, dtype)
        power = math_ops.cast(power, dtype)
        if cycle:
            # Find the first multiple of decay_steps that is bigger than timestep.
            decay_steps = math_ops.multiply(
                decay_steps, math_ops.ceil(timestep / decay_steps))
        else:
            # Make sure that the timestep used is not bigger than decay_steps.
            timestep = math_ops.minimum(timestep, decay_steps)

        p = math_ops.div(timestep, decay_steps)
        return math_ops.add(math_ops.multiply(
            exploration_rate - end_exploration_rate,
            math_ops.pow(1 - p, power)),
                            end_exploration_rate,
                            name=name)
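An illustrative plain-Python sketch of the `cycle=True` branch described above: `decay_steps` is rounded up to the next multiple that covers the current `timestep`, so the schedule restarts instead of staying at `end_exploration_rate`.

```python
import math

def cycled_poly_decay(rate, timestep, decay_steps, end_rate=0.0001, power=1.0):
    decay_steps = decay_steps * max(1.0, math.ceil(timestep / decay_steps))
    p = timestep / decay_steps
    return (rate - end_rate) * (1 - p) ** power + end_rate

print(cycled_poly_decay(0.1, 500, 1000))    # 0.05005  (halfway through cycle 1)
print(cycled_poly_decay(0.1, 1500, 1000))   # 0.025075 (now measured against 2000 steps)
```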
Example #40
0
def _interpolate_bilinear(grid,
                          query_points,
                          name='interpolate_bilinear',
                          indexing='ij'):
  """Similar to Matlab's interp2 function.
  Finds values for query points on a grid using bilinear interpolation.
  Args:
    grid: a 4-D float `Tensor` of shape `[batch, height, width, channels]`.
    query_points: a 3-D float `Tensor` of N points with shape `[batch, N, 2]`.
    name: a name for the operation (optional).
    indexing: whether the query points are specified as row and column (ij),
      or Cartesian coordinates (xy).
  Returns:
    values: a 3-D `Tensor` with shape `[batch, N, channels]`
  Raises:
    ValueError: if the indexing mode is invalid, or if the shape of the inputs
      invalid.
  """
  if indexing != 'ij' and indexing != 'xy':
    raise ValueError('Indexing mode must be \'ij\' or \'xy\'')

  with ops.name_scope(name):
    grid = ops.convert_to_tensor(grid)
    query_points = ops.convert_to_tensor(query_points)
    shape = grid.get_shape().as_list()
    if len(shape) != 4:
      msg = 'Grid must be 4 dimensional. Received size: '
      raise ValueError(msg + str(grid.get_shape()))

    batch_size, height, width, channels = (array_ops.shape(grid)[0],
                                           array_ops.shape(grid)[1],
                                           array_ops.shape(grid)[2],
                                           array_ops.shape(grid)[3])

    shape = [batch_size, height, width, channels]
    query_type = query_points.dtype
    grid_type = grid.dtype

    with ops.control_dependencies([
        check_ops.assert_equal(
            len(query_points.get_shape()),
            3,
            message='Query points must be 3 dimensional.'),
        check_ops.assert_equal(
            array_ops.shape(query_points)[2],
            2,
            message='Query points must be size 2 in dim 2.')
    ]):
      num_queries = array_ops.shape(query_points)[1]

    with ops.control_dependencies([
        check_ops.assert_greater_equal(
            height, 2, message='Grid height must be at least 2.'),
        check_ops.assert_greater_equal(
            width, 2, message='Grid width must be at least 2.')
    ]):
      alphas = []
      floors = []
      ceils = []
      index_order = [0, 1] if indexing == 'ij' else [1, 0]
      unstacked_query_points = array_ops.unstack(query_points, axis=2)

    for dim in index_order:
      with ops.name_scope('dim-' + str(dim)):
        queries = unstacked_query_points[dim]

        size_in_indexing_dimension = shape[dim + 1]

        # max_floor is size_in_indexing_dimension - 2 so that max_floor + 1
        # is still a valid index into the grid.
        max_floor = math_ops.cast(size_in_indexing_dimension - 2, query_type)
        min_floor = constant_op.constant(0.0, dtype=query_type)
        floor = math_ops.minimum(
            math_ops.maximum(min_floor, math_ops.floor(queries)), max_floor)
        int_floor = math_ops.cast(floor, dtypes.int32)
        floors.append(int_floor)
        ceil = int_floor + 1
        ceils.append(ceil)

        # alpha has the same type as the grid, as we will directly use alpha
        # when taking linear combinations of pixel values from the image.
        alpha = math_ops.cast(queries - floor, grid_type)
        min_alpha = constant_op.constant(0.0, dtype=grid_type)
        max_alpha = constant_op.constant(1.0, dtype=grid_type)
        alpha = math_ops.minimum(math_ops.maximum(min_alpha, alpha), max_alpha)

        # Expand alpha to [b, n, 1] so we can use broadcasting
        # (since the alpha values don't depend on the channel).
        alpha = array_ops.expand_dims(alpha, 2)
        alphas.append(alpha)

    with ops.control_dependencies([
        check_ops.assert_less_equal(
            math_ops.cast(batch_size * height * width, dtype=dtypes.float32),
            np.iinfo(np.int32).max / 8,
            message="""The image size or batch size is sufficiently large
                       that the linearized addresses used by array_ops.gather
                       may exceed the int32 limit.""")
    ]):
      flattened_grid = array_ops.reshape(
          grid, [batch_size * height * width, channels])
      batch_offsets = array_ops.reshape(
          math_ops.range(batch_size) * height * width, [batch_size, 1])

    # This wraps array_ops.gather. We reshape the image data such that the
    # batch, y, and x coordinates are pulled into the first dimension.
    # Then we gather. Finally, we reshape the output back. It's possible this
    # code would be made simpler by using array_ops.gather_nd.
    def gather(y_coords, x_coords, name):
      with ops.name_scope('gather-' + name):
        linear_coordinates = batch_offsets + y_coords * width + x_coords
        gathered_values = array_ops.gather(flattened_grid, linear_coordinates)
        return array_ops.reshape(gathered_values,
                                 [batch_size, num_queries, channels])

    # grab the pixel values in the 4 corners around each query point
    top_left = gather(floors[0], floors[1], 'top_left')
    top_right = gather(floors[0], ceils[1], 'top_right')
    bottom_left = gather(ceils[0], floors[1], 'bottom_left')
    bottom_right = gather(ceils[0], ceils[1], 'bottom_right')

    # now, do the actual interpolation
    with ops.name_scope('interpolate'):
      interp_top = alphas[1] * (top_right - top_left) + top_left
      interp_bottom = alphas[1] * (bottom_right - bottom_left) + bottom_left
      interp = alphas[0] * (interp_bottom - interp_top) + interp_top

    return interp
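A minimal single-point NumPy sketch (illustrative, not the TF op) of the interpolation step above: blend the four neighbouring pixels using the fractional offsets `alpha`.

```python
import numpy as np

grid = np.arange(16, dtype=np.float64).reshape(4, 4)   # one-channel image, value = 4*y + x
y, x = 1.25, 2.5                                       # a query point in 'ij' indexing
y0, x0 = int(np.floor(y)), int(np.floor(x))
alpha_y, alpha_x = y - y0, x - x0

interp_top = grid[y0, x0] + alpha_x * (grid[y0, x0 + 1] - grid[y0, x0])
interp_bottom = grid[y0 + 1, x0] + alpha_x * (grid[y0 + 1, x0 + 1] - grid[y0 + 1, x0])
print(interp_top + alpha_y * (interp_bottom - interp_top))   # 7.5 == 4*1.25 + 2.5
```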
Example #41
0
    def _kmc2_multiple_centers(self):
        """Adds new initial cluster centers using the k-MC2 algorithm.

    In each call to the op, the provided batch is split into subsets based on
    the specified `kmc2_chain_length`. On each subset, a single Markov chain of
    the k-MC2 algorithm is used to add *one* new cluster center. If there
    are fewer than `kmc2_chain_length` points in the subset, a single center is
    added using one Markov chain on the full input. It is assumed that the
    provided batch has previously been randomly permuted. Otherwise, k-MC2 may
    return suboptimal centers.

    Returns:
      An op that adds new cluster centers.
    """
        # The op only operates on the first shard of data.
        first_shard = self._inputs[0]
        # Number of points in the input that can be used.
        batch_size = array_ops.shape(first_shard)[0]
        # Maximum number of subsets such that the size of each subset is at least
        # `kmc2_chain_length`. Final subsets may be larger.
        max_to_sample = math_ops.cast(batch_size / self._kmc2_chain_length,
                                      dtype=dtypes.int32)
        # We sample at least one new center and at most all remaining centers.
        num_to_sample = math_ops.maximum(
            math_ops.minimum(self._num_remaining, max_to_sample), 1)

        def _cond(i, _):
            """Stopping condition for the while loop."""
            return math_ops.less(i, num_to_sample)

        def _body(i, _):
            """Body that adds a single new center based on a subset."""
            def _sample_random():
                """Returns a random point as a cluster center."""
                # By assumption the batch is reshuffled and _sample_random is always
                # called for i=0. Hence, we simply return the first point.
                new_center = array_ops.reshape(first_shard[0], [1, -1])
                if self._distance_metric == COSINE_DISTANCE:
                    new_center = nn_impl.l2_normalize(new_center, dim=1)
                return new_center

            def _sample_kmc2_chain():
                """Returns previous centers as well as a new center sampled using k-MC2.
        """
                # Extract the subset from the underlying batch.
                start = i * self._kmc2_chain_length
                end = start + self._kmc2_chain_length
                subset = first_shard[start:end]
                # Compute the distances from points in the subset to previous centers.
                _, distances = gen_clustering_ops.nearest_neighbors(
                    subset, self._cluster_centers, 1)
                # Sample index of new center using k-MC2 Markov chain.
                new_center_index = gen_clustering_ops.kmc2_chain_initialization(
                    array_ops.squeeze(distances), self._random_seed)
                # Extract actual new center.
                newly_sampled_center = array_ops.reshape(
                    subset[new_center_index], [1, -1])
                # Return concatenation with previously sampled centers.
                if self._distance_metric == COSINE_DISTANCE:
                    newly_sampled_center = nn_impl.l2_normalize(
                        newly_sampled_center, dim=1)
                return array_ops.concat(
                    [self._cluster_centers, newly_sampled_center], 0)

            # Obtain a random point if there are no previously sampled centers.
            # Otherwise, construct a k-MC2 Markov chain.
            new_centers = control_flow_ops.cond(
                math_ops.equal(self._num_selected, 0), _sample_random,
                _sample_kmc2_chain)
            # Assign new cluster centers to underlying variable.
            assigned_centers = state_ops.assign(self._cluster_centers,
                                                new_centers,
                                                validate_shape=False)
            if self._cluster_centers_updated is not self._cluster_centers:
                assigned_centers = state_ops.assign(
                    self._cluster_centers_updated,
                    assigned_centers,
                    validate_shape=False)
            return i + 1, self._num_clusters - array_ops.shape(
                assigned_centers)[0]

        # Add num_to_sample new data points.
        _, num_remaining = control_flow_ops.while_loop(_cond, _body, [0, 0])
        return num_remaining
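A hedged NumPy sketch of one k-MC2 Markov chain as described by Bachem et al. (2016), which is what `kmc2_chain_initialization` is used for above (assumption: uniform proposals over the subset with acceptance ratio `d(candidate) / d(current)` on the squared distances; the helper name is hypothetical):

```python
import numpy as np

def kmc2_chain_index(distances, rng):
    """Walks one chain over a subset and returns the index of the new center."""
    current = 0
    for candidate in range(1, len(distances)):
        ratio = distances[candidate] / max(distances[current], 1e-30)
        if rng.uniform() < ratio:
            current = candidate
    return current

rng = np.random.default_rng(0)
distances = rng.uniform(size=200)   # squared distances to the nearest existing center
print(kmc2_chain_index(distances, rng))
```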
Example #42
0
def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
                              initial_variance=1.0, variance_decay=0.55,
                              num_periods=0.5, alpha=0.0, beta=0.001,
                              name=None):
  """Applies noisy linear cosine decay to the learning rate.

  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
  https://arxiv.org/abs/1709.07417

  For the idea of warm starts here controlled by `num_periods`,
  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
  with Warm Restarts. https://arxiv.org/abs/1608.03983

  Note that linear cosine decay is more aggressive than cosine decay and
  larger initial learning rates can typically be used.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies a noisy linear
  cosine decay function to a provided initial learning rate.
  It requires a `global_step` value to compute the decayed learning rate.
  You can just pass a TensorFlow variable that you increment at each
  training step.

  The function returns the decayed learning rate.  It is computed as:
  ```python
  global_step = min(global_step, decay_steps)
  linear_decay = (decay_steps - global_step) / decay_steps
  cosine_decay = 0.5 * (
      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
  decayed_learning_rate = learning_rate * decayed
  ```
  where eps_t is 0-centered gaussian noise with variance
  initial_variance / (1 + global_step) ** variance_decay

  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed = noisy_linear_cosine_decay(
    learning_rate, global_step, decay_steps)
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
      The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Number of steps to decay over.
    initial_variance: initial variance for the noise. See computation above.
    variance_decay: decay for the noise's variance. See computation above.
    num_periods: Number of periods in the cosine part of the decay.
      See computation above.
    alpha: See computation above.
    beta: See computation above.
    name: String.  Optional name of the operation.  Defaults to
      'NoisyLinearCosineDecay'.
  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("noisy linear cosine decay requires global_step")
  with ops.name_scope(name, "NoisyLinearCosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    initial_variance = math_ops.cast(initial_variance, dtype)
    variance_decay = math_ops.cast(variance_decay, dtype)
    num_periods = math_ops.cast(num_periods, dtype)
    alpha = math_ops.cast(alpha, dtype)
    beta = math_ops.cast(beta, dtype)

    linear_decayed = (decay_steps - global_step) / decay_steps
    variance = initial_variance / (
        math_ops.pow(1.0 + global_step, variance_decay))
    std = math_ops.sqrt(variance)
    noisy_linear_decayed = (
        linear_decayed + random_ops.random_normal(
            linear_decayed.shape, stddev=std))

    completed_fraction = global_step / decay_steps
    fraction = 2.0 * num_periods * completed_fraction
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
    noisy_linear_cosine_decayed = (
        (alpha + noisy_linear_decayed) * cosine_decayed + beta)

    return math_ops.multiply(
        learning_rate, noisy_linear_cosine_decayed, name=name)
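A standalone sketch (illustrative, with the noise term `eps_t` set to zero for clarity; the helper name is just for the sketch) of the schedule documented above:

```python
import numpy as np

def linear_cosine_decay_np(lr, step, decay_steps, num_periods=0.5,
                           alpha=0.0, beta=0.001):
    step = min(step, decay_steps)
    linear = (decay_steps - step) / decay_steps
    cosine = 0.5 * (1 + np.cos(np.pi * 2 * num_periods * step / decay_steps))
    return lr * ((alpha + linear) * cosine + beta)

print(linear_cosine_decay_np(0.01, 0, 1000))     # 0.01001 = lr * (1 + beta)
print(linear_cosine_decay_np(0.01, 1000, 1000))  # 1e-05   = lr * beta
```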
Example #43
0
def polynomial_decay(learning_rate, global_step, decay_steps,
                     end_learning_rate=0.0001, power=1.0,
                     cycle=False, name=None):
  """Applies a polynomial decay to the learning rate.

  It is commonly observed that a monotonically decreasing learning rate, whose
  degree of change is carefully chosen, results in a better performing model.
  This function applies a polynomial decay function to a provided initial
  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.

  It requires a `global_step` value to compute the decayed learning rate.  You
  can just pass a TensorFlow variable that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  global_step = min(global_step, decay_steps)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  If `cycle` is True then a multiple of `decay_steps` is used, the first one
  that is bigger than `global_step`.

  ```python
  decay_steps = decay_steps * ceil(global_step / decay_steps)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  end_learning_rate = 0.01
  decay_steps = 10000
  learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
                                            decay_steps, end_learning_rate,
                                            power=0.5)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The minimal end learning rate.
    power: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
    name: String.  Optional name of the operation. Defaults to
      'PolynomialDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("global_step is required for polynomial_decay.")
  with ops.name_scope(name, "PolynomialDecay",
                      [learning_rate, global_step,
                       decay_steps, end_learning_rate, power]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    end_learning_rate = math_ops.cast(end_learning_rate, dtype)
    power = math_ops.cast(power, dtype)
    if cycle:
      # Find the first multiple of decay_steps that is bigger than global_step.
      # If global_step is zero set the multiplier to 1
      multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0),
                                         lambda: 1.0,
                                         lambda: math_ops.ceil(
                                             global_step / decay_steps))
      decay_steps = math_ops.multiply(decay_steps, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step = math_ops.minimum(global_step, decay_steps)

    p = math_ops.div(global_step, decay_steps)
    return math_ops.add(math_ops.multiply(learning_rate - end_learning_rate,
                                          math_ops.pow(1 - p, power)),
                        end_learning_rate, name=name)
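A numeric sketch (plain Python, not TF) of the non-cycling formula above, using the sqrt schedule from the docstring example (`power=0.5`, decaying from 0.1 to 0.01 over 10000 steps):

```python
def poly_decay(lr, step, decay_steps, end_lr=0.01, power=0.5):
    step = min(step, decay_steps)
    return (lr - end_lr) * (1 - step / decay_steps) ** power + end_lr

print(poly_decay(0.1, 0, 10000))       # 0.1
print(poly_decay(0.1, 2500, 10000))    # ~0.0879 = 0.09 * sqrt(0.75) + 0.01
print(poly_decay(0.1, 10000, 10000))   # 0.01
```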
Example #44
0
    def make_inverse_update_ops(self):
        """Create and return update ops corresponding to registered computations."""
        # TODO(b/69918258): Add correctness tests for this method.
        # pylint: disable=invalid-name

        ops = super(FullyConnectedMultiKF, self).make_inverse_update_ops()

        if (len(self._option1quants_by_damping) +
                len(self._option2quants_by_damping)):

            # Note that C0 and C1 are stand-ins for A0 and A1, or G0 and G1, from
            # the pseudo-code in the original paper.  Because the computations for
            # the A and G case are essentially the same they can both be performed by
            # the same class (this one).

            C1 = self.get_cov_dt1()

            # Get the eigendecomposition of C0  (= self.get_cov())
            eigen_e, eigen_V = self.get_eigendecomp()

            # TODO(b/69678661): Note, there is an implicit assumption here that C1
            # and C0 (as represented here by its eigen-decomp) are consistent.  This
            # could fail to be the case if self._cov and self._cov_dt1 are not updated
            # consistently, or are somehow read between or during the cov updates.
            # Can this possibly happen?  Is there a way to prevent it?

            for damping, (Lmat_var,
                          psi_var) in self._option1quants_by_damping.items():

                invsqrtC0 = math_ops.matmul(eigen_V *
                                            (eigen_e + damping)**(-0.5),
                                            eigen_V,
                                            transpose_b=True)

                # Might need to enforce symmetry lost due to numerical issues.
                invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0

                # The following line imposes the symmetry assumed by "Option 1" on C1.
                # Strangely the code can work okay with this line commented out,
                # depending on how psd_eig is defined.  I'm not sure why.
                C1 = (C1 + array_ops.transpose(C1)) / 2.0

                # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means \hat{Psi})
                hPsi = math_ops.matmul(math_ops.matmul(invsqrtC0, C1),
                                       invsqrtC0)

                # Compute the decomposition U*diag(psi)*U^T = hPsi
                psi, U = utils.psd_eig(hPsi)

                # L = C0^(-1/2) * U
                Lmat = math_ops.matmul(invsqrtC0, U)

                ops.append(Lmat_var.assign(Lmat))
                ops.append(psi_var.assign(psi))

            for damping, (Pmat_var, Kmat_var,
                          mu_var) in self._option2quants_by_damping.items():

                # compute C0^(-1/2)
                invsqrtC0 = math_ops.matmul(eigen_V *
                                            (eigen_e + damping)**(-0.5),
                                            eigen_V,
                                            transpose_b=True)

                # Might need to enforce symmetry lost due to numerical issues.
                invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0

                # Compute the product C0^(-1/2) * C1
                invsqrtC0C1 = math_ops.matmul(invsqrtC0, C1)

                # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means \hat{Psi})
                hPsi = math_ops.matmul(invsqrtC0C1, invsqrtC0)

                # Compute the decomposition E*diag(mu)*E^T = hPsi^T * hPsi
                # Note that we use the notation mu instead of "m" for the eigenvalues.
                # Instead of computing the product hPsi^T * hPsi and then doing an
                # eigen-decomposition of this we just compute the SVD of hPsi and then
                # square the singular values to get the eigenvalues. For a justification
                # of this approach, see:
                # https://en.wikipedia.org/wiki/Singular-value_decomposition#Relation_to_eigenvalue_decomposition
                sqrtmu, _, E = linalg_ops.svd(hPsi)
                mu = math_ops.square(sqrtmu)

                # Mathematically, the eigenvalues should not exceed 1.0, but
                # due to numerical issues, or possible inconsistencies between the
                # values of C1 and (the eigen-decomposition of) C0, they might. So
                # we enforce this condition.
                mu = math_ops.minimum(mu, 1.0)

                # P = (C0^(-1/2) * C1)^T * C0^(-1/2) = C_1^T * C_0^(-1)
                Pmat = math_ops.matmul(invsqrtC0C1,
                                       invsqrtC0,
                                       transpose_a=True)

                # K = C_0^(-1/2) * E
                Kmat = math_ops.matmul(invsqrtC0, E)

                ops.append(Pmat_var.assign(Pmat))
                ops.append(Kmat_var.assign(Kmat))
                ops.append(mu_var.assign(mu))

        return [control_flow_ops.group(ops)]
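A small NumPy check (illustrative) of the SVD shortcut noted in the comments above: if `hPsi = U diag(s) V^T`, then `hPsi^T hPsi = V diag(s**2) V^T`, so squaring the singular values yields the eigenvalues without forming the product.

```python
import numpy as np

rng = np.random.default_rng(0)
h_psi = rng.standard_normal((6, 6))
_, s, vt = np.linalg.svd(h_psi)
print(np.allclose(vt.T @ np.diag(s ** 2) @ vt, h_psi.T @ h_psi))   # True
```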
Example #45
0
 def min_(x, y):
     if _is_tensor(x) or _is_tensor(y):
         return math_ops.minimum(x, y)
     else:
         return min(x, y)
Example #46
0
    def compress(self, inputs):
        """Compress inputs and store their binary representations into strings.

    Args:
      inputs: `Tensor` with values to be compressed.

    Returns:
      String `Tensor` vector containing the compressed representation of each
      batch element of `inputs`.
    """
        with ops.name_scope(self._name_scope()):
            inputs = ops.convert_to_tensor(inputs)
            if not self.built:
                # Check input assumptions set before layer building, e.g. input rank.
                self._assert_input_compatibility(inputs)
                if self.dtype is None:
                    self._dtype = inputs.dtype.base_dtype.name
                self.build(inputs.shape)

            # Check input assumptions set after layer building, e.g. input shape.
            if not context.executing_eagerly():
                self._assert_input_compatibility(inputs)

            ndim = self.input_spec.ndim
            channel_axis = self._channel_axis(ndim)
            # Tuple of slices for expanding dimensions of tensors below.
            slices = ndim * [None] + [slice(None)]
            slices[channel_axis] = slice(None)
            slices = tuple(slices)

            # Expand dimensions of CDF to input dimensions, keeping the channels along
            # the right dimension.
            cdf = self._quantized_cdf[slices[1:]]
            num_levels = array_ops.shape(cdf)[-1] - 1

            # Bring inputs to the right range by centering the range on the medians.
            half = constant_op.constant(.5, dtype=self.dtype)
            medians = array_ops.squeeze(self._medians, [1, 2])
            offsets = (math_ops.cast(num_levels // 2, self.dtype) +
                       half) - medians
            # Expand offsets to input dimensions and add to inputs.
            values = inputs + offsets[slices[:-1]]

            # Clip to range and cast to integers. Because we have added .5 above, and
            # all values are positive, the cast effectively implements rounding.
            values = math_ops.maximum(values, half)
            values = math_ops.minimum(
                values,
                math_ops.cast(num_levels, self.dtype) - half)
            values = math_ops.cast(values, dtypes.int16)

            def loop_body(tensor):
                return coder_ops.range_encode(
                    tensor, cdf, precision=self.range_coder_precision)

            strings = functional_ops.map_fn(loop_body,
                                            values,
                                            dtype=dtypes.string,
                                            back_prop=False)

            if not context.executing_eagerly():
                strings.set_shape(inputs.shape[:1])

            return strings
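A quick NumPy illustration of the rounding trick described in the comments above: for non-negative values, adding `.5` and then truncating via an integer cast is ordinary round-to-nearest.

```python
import numpy as np

values = np.array([0.2, 1.7, 3.4])
print((values + 0.5).astype(np.int16))   # [0 2 3], i.e. each value rounded to the nearest integer
```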
Example #47
0
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
    """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int.  The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """

    static_batch_size = tensor_util.constant_value(batch_size)

    # Calculate the current lengths of the predictions
    prediction_lengths = beam_state.lengths
    previously_finished = beam_state.finished

    # Calculate the total log probs for the new hypotheses
    # Final Shape: [batch_size, beam_width, vocab_size]
    step_log_probs = nn_ops.log_softmax(logits)
    # step_log_probs: Tensor, shape=(?, 10, 56136)
    step_log_probs = _mask_probs(step_log_probs, end_token,
                                 previously_finished)
    #step_log_probs_masked (?, 10, 56136)
    total_probs = array_ops.expand_dims(beam_state.log_probs,
                                        2) + step_log_probs
    #total_probs (?, 10, 56136)
    # Calculate the continuation lengths by adding to all continuing beams.
    vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
    lengths_to_add = array_ops.one_hot(
        indices=array_ops.tile(array_ops.reshape(end_token, [1, 1]),
                               [batch_size, beam_width]),
        depth=vocab_size,
        on_value=constant_op.constant(0, dtype=dtypes.int64),
        off_value=constant_op.constant(1, dtype=dtypes.int64),
        dtype=dtypes.int64)
    #lengths_to_add shape=(?, 10, 56136)
    add_mask = (1 - math_ops.to_int64(previously_finished))
    #add_mask shape=(?, 10), dtype=int64
    lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
    #lengths_to_add shape=(?, 10, 56136)
    new_prediction_lengths = (lengths_to_add +
                              array_ops.expand_dims(prediction_lengths, 2))
    #new_prediction_lengths shape=(?, 10, 56136)
    # Calculate the scores for each beam
    scores = _get_scores(log_probs=total_probs,
                         sequence_lengths=new_prediction_lengths,
                         length_penalty_weight=length_penalty_weight)
    scores_mask = tf.constant([step_log_probs.dtype.min, 0],
                              dtype=dtypes.float32,
                              shape=[vocab_size],
                              name='mask')
    scores_masked = tf.add(scores, scores_mask)
    scores_mask2 = tf.constant([0, 0, 0, 0, 0, step_log_probs.dtype.min, 0],
                               dtype=dtypes.float32,
                               shape=[vocab_size],
                               name='mask2')
    scores_masked = tf.add(scores_mask2, scores_masked)

    def new_scores(scores_masked):
        scores_no_stop = tf.constant([0, 0, step_log_probs.dtype.min, 0],
                                     dtype=dtypes.float32,
                                     shape=[vocab_size],
                                     name='no_stop')
        scores = tf.add(scores_masked, scores_no_stop)
        return scores

    #constrain the length
    scores = control_flow_ops.cond(
        #time <9 ,
        time < 0,
        lambda: new_scores(scores_masked),
        lambda: scores_masked)

    #scores shape=(?, 10, 56136)
    #[batch_size, beam_width, vocab_size]
    time = ops.convert_to_tensor(time, name="time")
    # During the first time step we only consider the initial beam
    scores_shape = array_ops.shape(scores)
    # scores_shape: shape=(3,)
    scores_to_flat_1 = array_ops.reshape(scores, [batch_size, 2, -1])
    print("scores_to_flat_1", scores_to_flat_1)
    scores_to_0 = scores[:, 0]
    scores_to_1 = scores[:, -1]
    scores_to_flat_2 = tf.concat([scores_to_0, scores_to_1], 1)
    scores_flat = control_flow_ops.cond(
        time > 0, lambda: scores_to_flat_1,
        lambda: array_ops.reshape(scores_to_flat_2, [batch_size, 2, -1]))
    num_available_beam = control_flow_ops.cond(
        time > 0, lambda: math_ops.reduce_prod(scores_shape[1:]),
        lambda: math_ops.reduce_prod(scores_shape[2:]))
    # scores_flat: shape=(?, ?)
    # num_available_beam: shape=()
    # Pick the next beams according to the specified successors function
    next_beam_size = math_ops.minimum(
        ops.convert_to_tensor(beam_width,
                              dtype=dtypes.int32,
                              name="beam_width"), num_available_beam)
    #scores_t = tf.reshape(scores_flat,[batch_size,2,-1])
    ############################
    #input_words=['entrencheds01', 'entrencheds02', 'forgev01', 'forgev04', \
    #             'hitn02', 'hitn03', 'vaultn02', 'vaultn04', 'deepa03', \
    #             'deeps02', 'admitv01', 'admitv02', 'plantn01', 'plantn02',\
    #             'squaren01', 'squaren05', 'drawv05', 'drawv06', 'spellv03', \
    #             'spellv02', 'shotn02', 'shotn04', 'coachv01', 'coachv02', 'casen05',\
    #             'casen09', 'focusn01', 'focusn02', 'tasten01', 'tasten04', 'footn01', \
    #             'footv01']
    input_words = get_words()
    return_list = prior_scores(input_words)
    return_array = np.array(return_list)
    return_tensor = tf.convert_to_tensor(return_array)
    tiling = [1, 5, 1]
    prior_mask = tf.tile(tf.expand_dims(return_tensor, 1), tiling)
    prior_mask = tf.cast(prior_mask, tf.float32)
    prior_mask = array_ops.reshape(prior_mask, [batch_size, -1])
    #print ("prior_mask",prior_mask)
    scores_sum = tf.reduce_sum(scores_to_flat_1, 1)

    #print ("scores_sum_1",scores_sum)
    #def cal_scores_sum(scores_sum,prior_mask):
    #    return tf.add(scores_sum,prior_mask)
    #scores_sum = control_flow_ops.cond(
    #    time > 0,
    #    lambda: cal_scores_sum(scores_sum,prior_mask),
    #    lambda: scores_sum)
    #scores_sum=tf.add(scores_sum,prior_mask)
    #print ("scores_sum_2",scores_sum)
    ############################

    #scores_final=tf.concat([scores_sum, scores_sum],1)
    def cal_scores_indices(scores_to_0, scores_to_1):
        next_beam_scores_1, word_indices_1 = nn_ops.top_k(scores_to_0, k=5)
        print("ori next_beam_scores_1,word_indices_1", next_beam_scores_1)
        print("ori word_indices_1", word_indices_1)
        next_beam_scores_2, word_indices_2 = nn_ops.top_k(scores_to_1, k=5)
        next_beam_scores = tf.concat([next_beam_scores_1, next_beam_scores_2],
                                     1)
        word_indices = tf.concat(
            [word_indices_1, word_indices_2 + 9 * vocab_size], 1)
        return next_beam_scores, word_indices

    def cal_scores_indices_t1(scores_final, next_beam_size):
        next_beam_scores_1, word_indices_1 = nn_ops.top_k(scores_final, k=5)
        #next_beam_scores_1, word_indices_1=sample(next_beam_scores_1,word_indices_1)
        print("next_beam_scores_1", next_beam_scores_1)
        print("word_indices_1", word_indices_1)
        next_beam_scores = tf.concat([next_beam_scores_1, next_beam_scores_1],
                                     1)
        word_indices = tf.concat(
            [word_indices_1, word_indices_1 + 5 * vocab_size], 1)
        return next_beam_scores, word_indices

    next_beam_scores, word_indices = control_flow_ops.cond(
        time > 0, lambda: cal_scores_indices_t1(scores_sum, next_beam_size),
        lambda: cal_scores_indices(scores_to_0, scores_to_1))

    next_beam_scores.set_shape([static_batch_size, beam_width])
    word_indices.set_shape([static_batch_size, beam_width])
    #shape=(?, ?)
    # Pick out the probs, beam_ids, and states according to the chosen predictions

    next_beam_probs = _tensor_gather_helper(gather_indices=word_indices,
                                            gather_from=total_probs,
                                            batch_size=batch_size,
                                            range_size=beam_width * vocab_size,
                                            gather_shape=[-1],
                                            name="next_beam_probs")
    # Note: just doing the following
    #   math_ops.to_int32(word_indices % vocab_size,
    #       name="next_beam_word_ids")
    # would be a lot cleaner but for reasons unclear, that hides the results of
    # the op which prevents capturing it with tfdbg debug ops.
    raw_next_word_ids = math_ops.mod(word_indices,
                                     vocab_size,
                                     name="next_beam_word_ids")
    #raw_next_word_ids shape=(?, 10)
    next_word_ids = math_ops.to_int32(raw_next_word_ids)
    next_beam_ids = math_ops.to_int32(word_indices / vocab_size,
                                      name="next_beam_parent_ids")

    # Append new ids to current predictions
    previously_finished = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=previously_finished,
        batch_size=batch_size,
        range_size=beam_width,
        gather_shape=[-1])
    next_finished = math_ops.logical_or(previously_finished,
                                        math_ops.equal(next_word_ids,
                                                       end_token),
                                        name="next_beam_finished")

    # Calculate the length of the next predictions.
    # 1. Finished beams remain unchanged
    # 2. Beams that are now finished (EOS predicted) remain unchanged
    # 3. Beams that are not yet finished have their length increased by 1
    lengths_to_add = math_ops.to_int64(
        math_ops.not_equal(next_word_ids, end_token))
    lengths_to_add = (1 - math_ops.to_int64(next_finished)) * lengths_to_add
    next_prediction_len = _tensor_gather_helper(gather_indices=next_beam_ids,
                                                gather_from=beam_state.lengths,
                                                batch_size=batch_size,
                                                range_size=beam_width,
                                                gather_shape=[-1])
    next_prediction_len += lengths_to_add

    # Pick out the cell_states according to the next_beam_ids. We use a
    # different gather_shape here because the cell_state tensors, i.e.
    # the tensors that would be gathered from, all have dimension
    # greater than two and we need to preserve those dimensions.
    # pylint: disable=g-long-lambda
    next_cell_state = nest.map_structure(
        lambda gather_from: _maybe_tensor_gather_helper(
            gather_indices=next_beam_ids,
            gather_from=gather_from,
            batch_size=batch_size,
            range_size=beam_width,
            gather_shape=[batch_size * beam_width, -1]), next_cell_state)
    # pylint: enable=g-long-lambda

    next_state = BeamSearchDecoderState(cell_state=next_cell_state,
                                        log_probs=next_beam_probs,
                                        lengths=next_prediction_len,
                                        finished=next_finished)
    print('next_beam_probs', next_beam_probs)
    output = BeamSearchDecoderOutput(scores=next_beam_scores,
                                     predicted_ids=next_word_ids,
                                     parent_ids=next_beam_ids)

    return output, next_state
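
The index arithmetic near the end of this example (the `math_ops.mod` call and the division by `vocab_size`) simply decomposes a flat index over `beam_width * vocab_size` entries back into a parent beam id and a word id. A minimal NumPy sketch of that decomposition, with made-up sizes (the real example uses beam_width=10 and a ~56k vocabulary):

import numpy as np

# Made-up sizes for illustration only.
beam_width, vocab_size = 3, 5
flat_scores = np.random.rand(beam_width * vocab_size)

# Pick the top `beam_width` entries of the flattened [beam * vocab] scores.
top_flat = np.argsort(flat_scores)[::-1][:beam_width]

# Recover the parent beam and word for each flat index, mirroring the
# mod / integer-division step in the decoder above.
parent_ids = top_flat // vocab_size
word_ids = top_flat % vocab_size
print(parent_ids, word_ids)
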
Example #48
0
 def map_positive_offset(offset):
     return math_ops.minimum(starts + offset, limits)
Example #49
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0.
        r = utils.smart_cond(training, lambda: r,
                             lambda: array_ops.ones_like(r))
        d = utils.smart_cond(training, lambda: d,
                             lambda: array_ops.zeros_like(d))

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            value = array_ops.identity(value)

            def _do_update():
                # Update the variables without zero debiasing. The debiasing will be
                # accomplished by dividing the exponential moving average by the weight.
                # For example, after a single update, the moving average would be
                # (1-decay) * value and the weight will be 1-decay, with their ratio
                # giving the value.
                # Make sure the weight is not updated until r and d have been computed.
                with ops.control_dependencies([value]):
                    weight_value = array_ops.constant(1., dtype=weight.dtype)
                new_var = moving_averages.assign_moving_average(
                    var, value, self.renorm_momentum, zero_debias=False)
                new_weight = moving_averages.assign_moving_average(
                    weight,
                    weight_value,
                    self.renorm_momentum,
                    zero_debias=False)
                return new_var / new_weight

            def _fake_update():
                return array_ops.identity(var)

            return utils.smart_cond(training, _do_update, _fake_update)

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
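
A tiny NumPy sketch of the correction terms computed above, using made-up moments and clipping bounds, and ignoring the mixing with this batch's moments via renorm_mean_weight / renorm_stddev_weight for brevity:

import numpy as np

# Toy values; in the layer above these come from the current batch and the
# renorm moving statistics.
batch_mean, batch_stddev = 0.3, 1.2
renorm_mean, renorm_stddev = 0.0, 1.0
rmin, rmax, dmax = 1.0 / 3.0, 3.0, 5.0

r = batch_stddev / renorm_stddev
d = (batch_mean - renorm_mean) / renorm_stddev

# Clip the corrections, as the renorm_clipping step above does.
r = np.clip(r, rmin, rmax)
d = np.clip(d, -dmax, dmax)
print(r, d)  # 1.2 0.3
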
Example #50
0
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=8000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0,
                                dtype=dtypes.float32,
                                name=None):
  """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel].

  Returns a weight matrix that can be used to re-weight a `Tensor` containing
  `num_spectrogram_bins` linearly sampled frequency information from
  `[0, sample_rate / 2]` into `num_mel_bins` frequency information from
  `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel].

  For example, the returned matrix `A` can be used to right-multiply a
  spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear
  scale spectrum values (e.g. STFT magnitudes) to generate a "mel spectrogram"
  `M` of shape `[frames, num_mel_bins]`.

      # `S` has shape [frames, num_spectrogram_bins]
      # `M` has shape [frames, num_mel_bins]
      M = tf.matmul(S, A)

  The matrix can be used with `tf.tensordot` to convert an arbitrary rank
  `Tensor` of linear-scale spectral bins into the mel scale.

      # S has shape [..., num_spectrogram_bins].
      # M has shape [..., num_mel_bins].
      M = tf.tensordot(S, A, 1)
      # tf.tensordot does not support shape inference for this case yet.
      M.set_shape(S.shape[:-1].concatenate(A.shape[-1:]))

  Args:
    num_mel_bins: Python int. How many bands in the resulting mel spectrum.
    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
      i.e. the spectrogram only contains the nonredundant FFT bins.
    sample_rate: Python float. Samples per second of the input signal used to
      create the spectrogram. We need this to figure out the actual frequencies
      for each spectrogram bin, which dictates how they are mapped into the mel
      scale.
    lower_edge_hertz: Python float. Lower bound on the frequencies to be
      included in the mel spectrum. This corresponds to the lower edge of the
      lowest triangular band.
    upper_edge_hertz: Python float. The desired top edge of the highest
      frequency band.
    dtype: The `DType` of the result matrix. Must be a floating point type.
    name: An optional name for the operation.

  Returns:
    A `Tensor` of shape `[num_spectrogram_bins, num_mel_bins]`.

  Raises:
    ValueError: If `num_mel_bins`/`num_spectrogram_bins`/`sample_rate` are not
      positive, `lower_edge_hertz` is negative, frequency edges are incorrectly
      ordered, or `upper_edge_hertz` is larger than the Nyquist frequency.

  [mel]: https://en.wikipedia.org/wiki/Mel_scale
  """
  with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
    # and the validation is already done in linspace (both in shape function
    # and in kernel), there is no need to validate num_spectrogram_bins here.
    _validate_arguments(num_mel_bins, sample_rate,
                        lower_edge_hertz, upper_edge_hertz, dtype)

    # This function can be constant folded by graph optimization since there are
    # no Tensor inputs.
    sample_rate = ops.convert_to_tensor(
        sample_rate, dtype, name='sample_rate')
    lower_edge_hertz = ops.convert_to_tensor(
        lower_edge_hertz, dtype, name='lower_edge_hertz')
    upper_edge_hertz = ops.convert_to_tensor(
        upper_edge_hertz, dtype, name='upper_edge_hertz')
    zero = ops.convert_to_tensor(0.0, dtype)

    # HTK excludes the spectrogram DC bin.
    bands_to_zero = 1
    nyquist_hertz = sample_rate / 2.0
    linear_frequencies = math_ops.linspace(
        zero, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
    spectrogram_bins_mel = array_ops.expand_dims(
        _hertz_to_mel(linear_frequencies), 1)

    # Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
    # center of each band is the lower and upper edge of the adjacent bands.
    # Accordingly, we divide [lower_edge_hertz, upper_edge_hertz] into
    # num_mel_bins + 2 pieces.
    band_edges_mel = shape_ops.frame(
        math_ops.linspace(_hertz_to_mel(lower_edge_hertz),
                          _hertz_to_mel(upper_edge_hertz),
                          num_mel_bins + 2), frame_length=3, frame_step=1)

    # Split the triples up and reshape them into [1, num_mel_bins] tensors.
    lower_edge_mel, center_mel, upper_edge_mel = tuple(array_ops.reshape(
        t, [1, num_mel_bins]) for t in array_ops.split(
            band_edges_mel, 3, axis=1))

    # Calculate lower and upper slopes for every spectrogram bin.
    # Line segments are linear in the mel domain, not Hertz.
    lower_slopes = (spectrogram_bins_mel - lower_edge_mel) / (
        center_mel - lower_edge_mel)
    upper_slopes = (upper_edge_mel - spectrogram_bins_mel) / (
        upper_edge_mel - center_mel)

    # Intersect the line segments with each other and zero.
    mel_weights_matrix = math_ops.maximum(
        zero, math_ops.minimum(lower_slopes, upper_slopes))

    # Re-add the zeroed lower bins we sliced out above.
    return array_ops.pad(
        mel_weights_matrix, [[bands_to_zero, 0], [0, 0]], name=name)
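
A short usage sketch assuming the public `tf.signal` API (which exposes this function as `tf.signal.linear_to_mel_weight_matrix`) and TF2 eager execution; the sample rate and band edges below are illustrative only:

import tensorflow as tf

# Hypothetical mono waveform batch: [batch, samples] at 16 kHz.
waveform = tf.random.normal([2, 16000])
stft = tf.signal.stft(waveform, frame_length=400, frame_step=160, fft_length=512)
magnitudes = tf.abs(stft)  # [batch, frames, 257]

mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=80,
    num_spectrogram_bins=magnitudes.shape[-1],
    sample_rate=16000,
    lower_edge_hertz=80.0,
    upper_edge_hertz=7600.0)

# tensordot handles arbitrary leading dimensions, as described in the docstring.
mel_spectrogram = tf.tensordot(magnitudes, mel_matrix, 1)
print(mel_spectrogram.shape)  # (2, frames, 80)
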
Example #51
0
 def min_or_and(x1, x2):
     if x1.dtype == dtypes.bool:
         assert x2.dtype == dtypes.bool
         return math_ops.logical_and(x1, x2)
     return math_ops.minimum(x1, x2)
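
A quick TF2 eager sketch of why the boolean branch uses `logical_and`: it is the elementwise minimum under the usual True=1, False=0 ordering.

import tensorflow as tf

a = tf.constant([True, True, False])
b = tf.constant([True, False, False])

# For booleans, logical_and plays the role of an elementwise minimum.
print(tf.math.logical_and(a, b).numpy())                               # [ True False False]
print(tf.minimum(tf.cast(a, tf.int32), tf.cast(b, tf.int32)).numpy())  # [1 0 0]
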
Example #52
0
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
    """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))).

  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `list_t`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
    if (not isinstance(t_list, collections.Sequence)
            or isinstance(t_list, six.string_types)):
        raise TypeError("t_list should be a sequence")
    t_list = list(t_list)
    if use_norm is None:
        use_norm = global_norm(t_list, name)

    with ops.op_scope(t_list + [clip_norm], name,
                      "clip_by_global_norm") as name:
        # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
        scale = clip_norm * math_ops.minimum(
            1.0 / use_norm,
            constant_op.constant(1.0 / clip_norm, dtype=use_norm.dtype))

        values = [
            ops.convert_to_tensor(
                t.values if isinstance(t, ops.IndexedSlices) else t,
                name="t_%d" % i) if t is not None else t
            for i, t in enumerate(t_list)
        ]

        values_clipped = [
            array_ops.identity(v * scale, name="%s_%d" %
                               (name, i)) if v is not None else None
            for i, v in enumerate(values)
        ]

        list_clipped = [
            ops.IndexedSlices(c_v, t.indices) if isinstance(
                t, ops.IndexedSlices) else c_v
            for (c_v, t) in zip(values_clipped, t_list)
        ]

    return list_clipped, use_norm
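
A NumPy illustration (with made-up numbers) of the formula in the docstring, `t_list[i] * clip_norm / max(global_norm, clip_norm)`:

import numpy as np

# Two toy "gradients" whose combined L2 norm exceeds the clip threshold.
grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]  # norms 5 and 12
clip_norm = 6.5

global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))  # sqrt(25 + 144) = 13
scale = clip_norm / max(global_norm, clip_norm)            # 6.5 / 13 = 0.5
clipped = [g * scale for g in grads]

# The clipped list now has a global norm of exactly clip_norm (6.5).
print(global_norm, [float(np.linalg.norm(g)) for g in clipped])
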
Example #53
0
        def pad_or_truncate_map(replay):
            """Truncate or pad replays."""
            with_values = 'value' in replay

            if with_values:
                replay = experience.ReplayWithValues(**replay)
            else:
                replay = experience.Replay(**replay)

            sequence_length = math_ops.minimum(max_sequence_length,
                                               replay.sequence_length)
            sequence_length.set_shape([1])

            state = sequence_utils.pad_or_truncate(replay.state,
                                                   max_sequence_length,
                                                   axis=0,
                                                   pad_value=0)
            state.set_shape([max_sequence_length] + state_shape)

            next_state = sequence_utils.pad_or_truncate(replay.next_state,
                                                        max_sequence_length,
                                                        axis=0,
                                                        pad_value=0)
            next_state.set_shape([max_sequence_length] + state_shape)

            action = sequence_utils.pad_or_truncate(replay.action,
                                                    max_sequence_length,
                                                    axis=0,
                                                    pad_value=0)
            action.set_shape([max_sequence_length] + action_shape)

            action_value = sequence_utils.pad_or_truncate(replay.action_value,
                                                          max_sequence_length,
                                                          axis=0,
                                                          pad_value=0)
            action_value.set_shape([max_sequence_length] + action_value_shape)

            reward = sequence_utils.pad_or_truncate(replay.reward,
                                                    max_sequence_length,
                                                    axis=0,
                                                    pad_value=0)
            reward.set_shape([max_sequence_length] + reward_shape)

            terminal = sequence_utils.pad_or_truncate(
                replay.terminal,
                max_sequence_length,
                axis=0,
                pad_value=ops.convert_to_tensor(False))
            terminal.set_shape([max_sequence_length])

            if with_values:
                value = sequence_utils.pad_or_truncate(replay.value,
                                                       max_sequence_length,
                                                       axis=0,
                                                       pad_value=0)
                value.set_shape([max_sequence_length] + reward_shape)

                return experience.ReplayWithValues(
                    state=state,
                    next_state=next_state,
                    action=action,
                    action_value=action_value,
                    value=value,
                    reward=reward,
                    terminal=terminal,
                    sequence_length=sequence_length)

            return experience.Replay(state=state,
                                     next_state=next_state,
                                     action=action,
                                     action_value=action_value,
                                     reward=reward,
                                     terminal=terminal,
                                     sequence_length=sequence_length)
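
The `sequence_utils.pad_or_truncate` helper is project-specific; a hypothetical TF2 stand-in with the same apparent semantics (truncate along an axis, then pad with a constant up to the target length) might look like this:

import tensorflow as tf

def pad_or_truncate(x, max_len, axis=0, pad_value=0):
    # Hypothetical stand-in for sequence_utils.pad_or_truncate: keep at most
    # `max_len` elements along `axis`, then pad with `pad_value` up to `max_len`.
    length = tf.shape(x)[axis]
    x = tf.gather(x, tf.range(tf.minimum(length, max_len)), axis=axis)
    paddings = [[0, 0]] * x.shape.rank
    paddings[axis] = [0, tf.maximum(max_len - length, 0)]
    return tf.pad(x, paddings, constant_values=pad_value)

print(pad_or_truncate(tf.constant([1, 2, 3]), 5).numpy())           # [1 2 3 0 0]
print(pad_or_truncate(tf.constant([1, 2, 3, 4, 5, 6]), 5).numpy())  # [1 2 3 4 5]
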
Example #54
0
    def apply(self, var_list=None):
        """Maintains moving averages of variables.

    `var_list` must be a list of `Variable` or `Tensor` objects.  This method
    creates shadow variables for all elements of `var_list`.  Shadow variables
    for `Variable` objects are initialized to the variable's initial value.
    They will be added to the `GraphKeys.MOVING_AVERAGE_VARIABLES` collection.
    For `Tensor` objects, the shadow variables are initialized to 0 and zero
    debiased (see docstring in `assign_moving_average` for more details).

    Shadow variables are created with `trainable=False` and added to the
    `GraphKeys.ALL_VARIABLES` collection.  They will be returned by calls to
    `tf.global_variables()`.

    Returns an op that updates all shadow variables from the current value of
    their associated variables.

    Note that `apply()` can be called multiple times. When eager execution is
    enabled, each call to `apply()` will update the variables once, so it needs
    to be called in a loop.

    Args:
      var_list: A list of Variable or Tensor objects. The variables
        and Tensors must be of types bfloat16, float16, float32, or float64.

    Returns:
      An Operation that updates the moving averages.

    Raises:
      TypeError: If the arguments are not an allowed type.
    """
        # TODO(touts): op_scope
        if var_list is None:
            var_list = variables.trainable_variables()
        zero_debias_true = set()  # set of vars to set `zero_debias=True`
        for var in var_list:
            if var.dtype.base_dtype not in [
                    dtypes.bfloat16, dtypes.float16, dtypes.float32,
                    dtypes.float64
            ]:
                raise TypeError(
                    "The variables must be half, float, or double: %s" %
                    var.name)

            if var not in self._averages:
                # For variables: to lower communication bandwidth across devices we keep
                # the moving averages on the same device as the variables. For other
                # tensors, we rely on the existing device allocation mechanism.
                with ops.init_scope():
                    if isinstance(var, variables.Variable):
                        avg = slot_creator.create_slot(
                            var,
                            var.initialized_value(),
                            self.name,
                            colocate_with_primary=True)
                        # NOTE(mrry): We only add `tf.Variable` objects to the
                        # `MOVING_AVERAGE_VARIABLES` collection.
                        ops.add_to_collection(
                            ops.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
                    else:
                        avg = slot_creator.create_zeros_slot(
                            var,
                            self.name,
                            colocate_with_primary=(var.op.type in [
                                "Variable", "VariableV2", "VarHandleOp"
                            ]))
                        if self._zero_debias:
                            zero_debias_true.add(avg)
                self._averages[var] = avg

        with ops.name_scope(self.name) as scope:
            decay = ops.convert_to_tensor(self._decay, name="decay")
            if self._num_updates is not None:
                num_updates = math_ops.cast(self._num_updates,
                                            dtypes.float32,
                                            name="num_updates")
                decay = math_ops.minimum(decay, (1.0 + num_updates) /
                                         (10.0 + num_updates))
            updates = []
            for var in var_list:
                zero_debias = self._averages[var] in zero_debias_true
                updates.append(
                    assign_moving_average(self._averages[var],
                                          var,
                                          decay,
                                          zero_debias=zero_debias))
            return control_flow_ops.group(*updates, name=scope)
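
A minimal graph-mode usage sketch (TF1-style, via `tf.compat.v1`) of `apply()` together with `average()`; the decay value and number of steps are arbitrary:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

w = tf.Variable(0.0, name="w")
train_op = tf.assign_add(w, 1.0)

ema = tf.train.ExponentialMovingAverage(decay=0.9)
with tf.control_dependencies([train_op]):
    # Run the shadow-variable update after every training step.
    train_with_ema = ema.apply([w])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(3):
        sess.run(train_with_ema)
    print(sess.run([w, ema.average(w)]))
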
Example #55
0
 def _calculate_t1_cond_values(self, alpha, beta, eta, lambd,
                               scaled_s_dg_db, scaled_s_dg_dl, graph, t):
     """
     Calculates values for the t > 1 condition branch.
     :return: Assignments made and values that are later needed in calculations.
     """
     assignments = []
     cond_t1 = math_ops.greater(t, 1.0)
     s_dt_db_norm = math_ops.sqrt(
         math_ops.add_n([
             math_ops.reduce_sum(self.get_slot(v, "s_dt_db")**2.0)
             for v in self._vars
         ]))
     s_dt_dl_norm = math_ops.sqrt(
         math_ops.add_n([
             math_ops.reduce_sum(self.get_slot(v, "s_dt_dl")**2.0)
             for v in self._vars
         ]))
     alpha0 = control_flow_ops.cond(
         cond_t1, lambda: math_ops.log(0.5 * math_ops.minimum(
             s_dt_db_norm / math_ops.sqrt(
                 math_ops.add_n([
                     math_ops.reduce_sum(sgb**2.0) for sgb in scaled_s_dg_db
                 ])), s_dt_dl_norm / math_ops.sqrt(
                     math_ops.add_n([
                         math_ops.reduce_sum(sgl**2.0)
                         for sgl in scaled_s_dg_dl
                     ])))), lambda: float('Inf'))
     cond_a0 = math_ops.greater(alpha, alpha0)
     alpha = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_a0),
                                   lambda: alpha - 2.0 * self._delta_t,
                                   lambda: alpha)
     beta = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_a0),
                                  lambda: math_ops.exp(alpha0),
                                  lambda: beta)
     eg2 = self._get_non_slot_variable("e_g2", graph)
     em2 = self._get_non_slot_variable("e_m2", graph)
     gamma = control_flow_ops.cond(
         cond_t1, lambda: math_ops.minimum(
             1.0,
             math_ops.minimum(self._C_t * eg2 / s_dt_db_norm**2.0, self._C_t
                              * em2 / s_dt_dl_norm**2.0)),
         lambda: self._get_non_slot_variable("gamma", graph))
     cond_gl = math_ops.greater(lambd, gamma)
     eta = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_gl),
                                 lambda: eta - 2.0 * self._delta_t,
                                 lambda: eta)
     lambd = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_gl),
                                   lambda: gamma, lambda: lambd)
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("alpha", graph),
                          alpha))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("beta", graph), beta))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("eta", graph), eta))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("lambda", graph),
                          lambd))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("gamma", graph),
                          gamma))
     return assignments, beta, lambd, gamma
Example #56
0
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  If `global_norm == infinity` then the entries in `t_list` are all set to `NaN`
  to signal that an error occurred.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (Pascanu et al., 2012).

  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `list_t`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.

  References:
    On the difficulty of training Recurrent Neural Networks:
      [Pascanu et al., 2012](http://proceedings.mlr.press/v28/pascanu13.html)
      ([pdf](http://proceedings.mlr.press/v28/pascanu13.pdf))
  """
  if (not isinstance(t_list, collections_abc.Sequence) or
      isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    scale_for_finite = clip_norm * math_ops.minimum(
        1.0 / use_norm,
        constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)
    # If use_norm is any finite number, this is a no-op. For inf/-inf/NaN,
    # this will make scale NaN.
    scale = scale_for_finite + (use_norm - use_norm)

    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i)
        if t is not None else t
        for i, t in enumerate(t_list)]

    values_clipped = []
    for i, v in enumerate(values):
      if v is None:
        values_clipped.append(None)
      else:
        with ops.colocate_with(v):
          values_clipped.append(
              array_ops.identity(v * scale, name="%s_%d" % (name, i)))

    list_clipped = [
        ops.IndexedSlices(c_v, t.indices, t.dense_shape)
        if isinstance(t, ops.IndexedSlices)
        else c_v
        for (c_v, t) in zip(values_clipped, t_list)]

  return list_clipped, use_norm
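
Typical TF2 usage of the public `tf.clip_by_global_norm` wrapper inside a custom training step (toy model and data, illustrative clip_norm):

import tensorflow as tf

# Toy model and data; the interesting part is the clip step between gradient
# computation and the optimizer update.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
x = tf.random.normal([8, 4])
y = tf.random.normal([8, 1])

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
grads = tape.gradient(loss, model.trainable_variables)

# Shrink all gradients by the same ratio when their combined norm is too large.
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)
optimizer.apply_gradients(zip(clipped, model.trainable_variables))
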
Example #57
0
def clip_by_value(t, clip_value_min, clip_value_max,
                  name=None):
  """Clips tensor values to a specified min and max.

  Given a tensor `t`, this operation returns a tensor of the same type and
  shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
  Any values less than `clip_value_min` are set to `clip_value_min`. Any values
  greater than `clip_value_max` are set to `clip_value_max`.

  Note: `clip_value_min` needs to be smaller than or equal to `clip_value_max`
  for correct results.

  For example:

  Basic usage passes a scalar as the min and max value.

  >>> t = tf.constant([[-10., -1., 0.], [0., 2., 10.]])
  >>> t2 = tf.clip_by_value(t, clip_value_min=-1, clip_value_max=1)
  >>> t2.numpy()
  array([[-1., -1.,  0.],
         [ 0.,  1.,  1.]], dtype=float32)

  The min and max can be the same size as `t`, or broadcastable to that size.

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[2],[1]]
  >>> t3 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  >>> t3.numpy()
  array([[ 2.,  2., 10.],
         [ 1.,  1., 10.]], dtype=float32)

  Broadcasting fails, intentionally, if you would expand the dimensions of `t`

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[[2, 1]]] # Has a third axis
  >>> t4 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  Traceback (most recent call last):
  ...
  InvalidArgumentError: Incompatible shapes: [2,3] vs. [1,1,2]

  It throws a `TypeError` if you try to clip an `int` to a `float` value
  (`tf.cast` the input to `float` first).

  >>> t = tf.constant([[1, 2], [3, 4]], dtype=tf.int32)
  >>> t5 = tf.clip_by_value(t, clip_value_min=-3.1, clip_value_max=3.1)
  Traceback (most recent call last):
  ...
  TypeError: Cannot convert ...


  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_value_min: The minimum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    clip_value_max: The maximum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    `tf.errors.InvalidArgumentError`: If the clip tensors would trigger array
      broadcasting that would make the returned tensor larger than the input.
    TypeError: If dtype of the input is `int32` and dtype of
      the `clip_value_min` or `clip_value_max` is `float32`
  """
  with ops.name_scope(name, "clip_by_value",
                      [t, clip_value_min, clip_value_max]) as name:
    values = ops.convert_to_tensor(
        t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

    # Go through list of tensors, for each value in each tensor clip
    t_min = math_ops.minimum(values, clip_value_max)
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    _ = values.shape.merge_with(t_min.shape)

    t_max = math_ops.maximum(t_min, clip_value_min, name=name)
    _ = values.shape.merge_with(t_max.shape)

    if isinstance(t, ops.IndexedSlices):
      t_max = ops.IndexedSlices(t_max, t.indices, t.dense_shape)

  return t_max
Example #58
0
def smart_resize(x, size, interpolation='bilinear'):
  """Resize images to a target size without aspect ratio distortion.

  TensorFlow image datasets typically yield images that each have a different
  size. However, these images need to be batched before they can be
  processed by Keras layers. To be batched, images need to share the same height
  and width.

  You could simply do:

  ```python
  size = (200, 200)
  ds = ds.map(lambda img: tf.image.resize(img, size))
  ```

  However, if you do this, you distort the aspect ratio of your images, since
  in general they do not all have the same aspect ratio as `size`. This is
  fine in many cases, but not always (e.g. for GANs this can be a problem).

  Note that passing the argument `preserve_aspect_ratio=True` to `resize`
  will preserve the aspect ratio, but at the cost of no longer respecting the
  provided target size. Because `tf.image.resize` doesn't crop images,
  your output images will still have different sizes.

  This calls for:

  ```python
  size = (200, 200)
  ds = ds.map(lambda img: smart_resize(img, size))
  ```

  Your output images will actually be `(200, 200)`, and will not be distorted.
  Instead, the parts of the image that do not fit within the target size
  get cropped out.

  The resizing process is:

  1. Take the largest centered crop of the image that has the same aspect ratio
  as the target size. For instance, if `size=(200, 200)` and the input image has
  size `(340, 500)`, we take a crop of `(340, 340)` centered along the width.
  2. Resize the cropped image to the target size. In the example above,
  we resize the `(340, 340)` crop to `(200, 200)`.

  Args:
    x: Input image (as a tensor or NumPy array). Must be in format
      `(height, width, channels)`.
    size: Tuple of `(height, width)` integers. Target size.
    interpolation: String, interpolation to use for resizing.
      Defaults to `'bilinear'`. Supports `bilinear`, `nearest`, `bicubic`,
      `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`.

  Returns:
    Array with shape `(size[0], size[1], channels)`. If the input image was a
    NumPy array, the output is a NumPy array, and if it was a TF tensor,
    the output is a TF tensor.
  """
  if len(size) != 2:
    raise ValueError('Expected `size` to be a tuple of 2 integers, '
                     'but got: %s' % (size,))
  img = ops.convert_to_tensor_v2_with_dispatch(x)
  if img.shape.rank is not None:
    if img.shape.rank != 3:
      raise ValueError(
          'Expected an image array with shape `(height, width, channels)`, but '
          'got input with incorrect rank, of shape %s' % (img.shape,))
  shape = array_ops.shape(img)
  height, width = shape[0], shape[1]
  target_height, target_width = size

  crop_height = math_ops.cast(
      math_ops.cast(width * target_height, 'float32') / target_width, 'int32')
  crop_width = math_ops.cast(
      math_ops.cast(height * target_width, 'float32') / target_height, 'int32')

  # Set back to input height / width if crop_height / crop_width is not smaller.
  crop_height = math_ops.minimum(height, crop_height)
  crop_width = math_ops.minimum(width, crop_width)

  crop_box_hstart = math_ops.cast(
      math_ops.cast(height - crop_height, 'float32') / 2, 'int32')
  crop_box_wstart = math_ops.cast(
      math_ops.cast(width - crop_width, 'float32') / 2, 'int32')

  crop_box_start = array_ops.stack([crop_box_hstart, crop_box_wstart, 0])
  crop_box_size = array_ops.stack([crop_height, crop_width, -1])

  img = array_ops.slice(img, crop_box_start, crop_box_size)
  img = image_ops.resize_images_v2(
      images=img,
      size=size,
      method=interpolation)
  if isinstance(x, np.ndarray):
    return img.numpy()
  return img
Example #59
0
 def _merge_function(self, inputs):
   output = inputs[0]
   for i in range(1, len(inputs)):
     output = math_ops.minimum(output, inputs[i])
   return output
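
This `_merge_function` is the kind of loop used by the Keras `Minimum` merge layer; a TF2 eager sketch of the equivalent public-API call (assuming that layer is indeed what is being implemented here):

import tensorflow as tf

a = tf.constant([[1.0, 5.0, 3.0]])
b = tf.constant([[4.0, 2.0, 6.0]])
c = tf.constant([[0.5, 9.0, 3.5]])

# Elementwise minimum across all inputs, as the merge loop above computes.
print(tf.keras.layers.Minimum()([a, b, c]).numpy())  # [[0.5 2.  3. ]]
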
Example #60
0
 def testMinimumGradient(self):
   inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
   outputs = math_ops.minimum(inputs, 2.0)
   with self.test_session():
     error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
     self.assertLess(error, 1e-4)