def _broadcast_uniform_partitioned_dimension(self, axis, lengths):
    """Broadcasts the partitioned dimension `axis` to match `lengths`."""
    axis_dim_size = self.dimension_size(axis)
    partitioned_sizes = list(self._partitioned_dim_sizes[:axis])

    if lengths.shape.ndims == 0:
      lengths = array_ops.where(
          math_ops.equal(axis_dim_size, 1), lengths, axis_dim_size)
      repeats = array_ops.where(math_ops.equal(axis_dim_size, 1), lengths, 1)
      splits = array_ops.stack([0, self.num_slices_in_dimension(axis)])
    else:
      splits = math_ops.range(
          array_ops.size(lengths, out_type=self.dim_size_dtype) + 1)
      repeats = lengths

    partitioned_sizes.append(lengths)

    for dim_size in self._partitioned_dim_sizes[axis + 1:]:
      if dim_size.shape.ndims == 0:
        partitioned_sizes.append(dim_size)
        splits *= dim_size
      else:
        partitioned_sizes.append(
            ragged_util.repeat_ranges(dim_size, splits, repeats))
        splits = array_ops.gather(
            ragged_util.lengths_to_splits(dim_size), splits)
    inner_sizes = self._inner_dim_sizes
    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
  def _survival_function(self, y):
    low = self._low
    high = self._high

    # Recall the promise:
    # survival_function(y) := P[Y > y]
    #                       = 0, if y >= high,
    #                       = 1, if y < low,
    #                       = P[X > y], otherwise.

    # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in
    # between.
    j = math_ops.ceil(y)

    # P[X > j], used when low < X < high.
    result_so_far = self.distribution.survival_function(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if low is not None:
      result_so_far = array_ops.where(j < low,
                                      array_ops.ones_like(result_so_far),
                                      result_so_far)
    if high is not None:
      result_so_far = array_ops.where(j >= high,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)

    return result_so_far
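
For reference, the three cutoff cases spelled out in the comments above can be reproduced with a small NumPy sketch; the helper name, the toy base distribution, and the cutoff values below are illustrative only, not part of TensorFlow.

import numpy as np

def quantized_survival_function(base_sf, y, low=None, high=None):
    # base_sf(j) returns P[X > j] for the underlying distribution.
    j = np.ceil(y)
    result = base_sf(j)
    if low is not None:
        result = np.where(j < low, np.ones_like(result), result)
    if high is not None:
        result = np.where(j >= high, np.zeros_like(result), result)
    return result

# Toy base distribution with P[X > j] = 0.5 ** max(j, 0).
y = np.array([-1.0, 0.3, 2.0, 5.0])
quantized_survival_function(lambda j: 0.5 ** np.maximum(j, 0.0), y, low=0.0, high=4.0)
# -> [1.  , 0.5 , 0.25, 0.  ]
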
Example #3
  def _variance(self):
    var = (self._ones() * math_ops.square(self.sigma) * self.df / (self.df - 2))
    # When 1 < df <= 2, variance is infinite.
    inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype())
    result_where_defined = array_ops.where(
        math_ops.greater(self.df, array_ops.fill(self.batch_shape(), 2.)),
        var,
        array_ops.fill(
            self.batch_shape(), inf, name="inf"))

    if self.allow_nan_stats:
      nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
      return array_ops.where(
          math_ops.greater(self.df, self._ones()),
          result_where_defined,
          array_ops.fill(
              self.batch_shape(), nan, name="nan"))
    else:
      return control_flow_ops.with_dependencies(
          [
              check_ops.assert_less(
                  array_ops.ones(
                      (), dtype=self.dtype),
                  self.df,
                  message="variance not defined for components of df <= 1"),
          ],
          result_where_defined)
Example #4
def _safe_div(numerator, denominator, name="value"):
  """Computes a safe divide which returns 0 if the denominator is zero.

  Note that the function contains an additional conditional check that is
  necessary for avoiding situations where the loss is zero causing NaNs to
  creep into the gradient computation.

  Args:
    numerator: An arbitrary `Tensor`.
    denominator: A `Tensor` whose shape matches `numerator` and whose values are
      assumed to be non-negative.
    name: An optional name for the returned op.

  Returns:
    The element-wise value of the numerator divided by the denominator.
  """
  if compat.forward_compatible(2018, 11, 1):
    return math_ops.div_no_nan(numerator, denominator, name=name)
  return array_ops.where(
      math_ops.greater(denominator, 0),
      math_ops.div(numerator,
                   array_ops.where(
                       math_ops.equal(denominator, 0),
                       array_ops.ones_like(denominator), denominator)),
      array_ops.zeros_like(numerator),
      name=name)
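
A minimal NumPy sketch of the same two-level where guard, with made-up values, showing why the inner where keeps the unselected division finite:

import numpy as np

numerator = np.array([1.0, 2.0, 3.0])
denominator = np.array([2.0, 0.0, 4.0])

safe = np.where(
    denominator > 0,
    numerator / np.where(denominator == 0, np.ones_like(denominator), denominator),
    np.zeros_like(numerator))
# safe -> [0.5, 0.0, 0.75]; a bare numerator / denominator would produce inf at index 1.
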
def focal_loss(prediction_tensor, target_tensor, weights=None, alpha=0.25, gamma=2):
    r"""Compute focal loss for predictions.

        Multi-label focal loss formula:
            FL = -alpha * (z-p)^gamma * log(p) - (1-alpha) * p^gamma * log(1-p),
            where alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.

    Args:
     prediction_tensor: A float tensor of shape [batch_size, num_anchors,
        num_classes] representing the predicted logits for each class
     target_tensor: A float tensor of shape [batch_size, num_anchors,
        num_classes] representing one-hot encoded classification targets
     weights: A float tensor of shape [batch_size, num_anchors]
     alpha: A scalar tensor for focal loss alpha hyper-parameter
     gamma: A scalar tensor for focal loss gamma hyper-parameter
    Returns:
        loss: A (scalar) tensor representing the value of the loss function
    """
    sigmoid_p = tf.nn.sigmoid(prediction_tensor)
    zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
    pos_p_sub = array_ops.where(target_tensor >= sigmoid_p, target_tensor - sigmoid_p, zeros)
    neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
    per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
                          - (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
    return tf.reduce_mean(per_entry_cross_ent)
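
A rough NumPy restatement of the focal loss formula from the docstring, using the same alpha/gamma defaults; the function name and the sample logits/targets are illustrative only:

import numpy as np

def focal_loss_np(logits, targets, alpha=0.25, gamma=2.0):
    p = 1.0 / (1.0 + np.exp(-logits))                      # sigmoid(x)
    pos_p_sub = np.where(targets >= p, targets - p, 0.0)   # (z - p) term
    neg_p_sub = np.where(targets > 0.0, 0.0, p)            # p term for negatives
    per_entry = (-alpha * pos_p_sub ** gamma * np.log(np.clip(p, 1e-8, 1.0))
                 - (1 - alpha) * neg_p_sub ** gamma * np.log(np.clip(1.0 - p, 1e-8, 1.0)))
    return per_entry.mean()

focal_loss_np(np.array([2.0, -1.0]), np.array([1.0, 0.0]))
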
Example #6
  def _variance(self):
    # We need to put the tf.where inside the outer tf.where to ensure we never
    # hit a NaN in the gradient.
    denom = array_ops.where(math_ops.greater(self.df, 2.),
                            self.df - 2.,
                            array_ops.ones_like(self.df))
    # Abs(scale) superfluous.
    var = (array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype) *
           math_ops.square(self.scale) * self.df / denom)
    # When 1 < df <= 2, variance is infinite.
    inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype())
    result_where_defined = array_ops.where(
        self.df > array_ops.fill(self.batch_shape_tensor(), 2.),
        var,
        array_ops.fill(self.batch_shape_tensor(), inf, name="inf"))

    if self.allow_nan_stats:
      nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
      return array_ops.where(
          math_ops.greater(
              self.df,
              array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)),
          result_where_defined,
          array_ops.fill(self.batch_shape_tensor(), nan, name="nan"))
    else:
      return control_flow_ops.with_dependencies(
          [
              check_ops.assert_less(
                  array_ops.ones([], dtype=self.dtype),
                  self.df,
                  message="variance not defined for components of df <= 1"),
          ],
          result_where_defined)
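
The same three-way piecewise variance (finite, infinite, undefined) can be sketched in NumPy with made-up df and scale values; this mirrors only the allow_nan_stats branch above:

import numpy as np

df = np.array([0.5, 1.5, 3.0])
scale = np.array([1.0, 2.0, 2.0])

denom = np.where(df > 2.0, df - 2.0, np.ones_like(df))   # keep the unused branch finite
var = scale ** 2 * df / denom
var = np.where(df > 2.0, var, np.inf)                    # 1 < df <= 2: infinite variance
var = np.where(df > 1.0, var, np.nan)                    # df <= 1: undefined
# var -> [nan, inf, 12.]
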
Example #7
def _safe_div(numerator, denominator, name="value"):
  """Computes a safe divide which returns 0 if the denominator is zero.

  Note that the function contains an additional conditional check that is
  necessary for avoiding situations where the loss is zero causing NaNs to
  creep into the gradient computation.

  Args:
    numerator: An arbitrary `Tensor`.
    denominator: A `Tensor` whose shape matches `numerator` and whose values are
      assumed to be non-negative.
    name: An optional name for the returned op.

  Returns:
    The element-wise value of the numerator divided by the denominator.
  """
  if isinstance(denominator, float):
    if math_ops.equal(denominator, 0.0):
      return ops.convert_to_tensor(0.0, dtype=numerator.dtype)
    return math_ops.div(numerator, denominator)
  if context.in_eager_mode() and denominator._rank() == 0:  # pylint: disable=protected-access
    if math_ops.equal(denominator, 0.0):
      return ops.convert_to_tensor(0.0, dtype=numerator.dtype)
    return math_ops.div(numerator, denominator)
  return array_ops.where(
      math_ops.greater(denominator, 0),
      math_ops.div(numerator, array_ops.where(
          math_ops.equal(denominator, 0),
          array_ops.ones_like(denominator), denominator)),
      array_ops.zeros_like(numerator),
      name=name)
Example #8
    def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
      """Internal while_loop body.

      Args:
        time: scalar int32 tensor.
        outputs_ta: structure of TensorArray.
        state: (structure of) state tensors and TensorArrays.
        inputs: (structure of) input tensors.
        finished: bool tensor (keeping track of what's finished).
        sequence_lengths: int32 tensor (keeping track of time of finish).

      Returns:
        `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
          next_sequence_lengths)`.
      """
      (next_outputs, decoder_state, next_inputs,
       decoder_finished) = decoder.step(time, inputs, state)
      next_finished = math_ops.logical_or(decoder_finished, finished)
      if maximum_iterations is not None:
        next_finished = math_ops.logical_or(
            next_finished, time + 1 >= maximum_iterations)
      next_sequence_lengths = array_ops.where(
          math_ops.logical_and(math_ops.logical_not(finished), next_finished),
          array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
          sequence_lengths)

      nest.assert_same_structure(state, decoder_state)
      nest.assert_same_structure(outputs_ta, next_outputs)
      nest.assert_same_structure(inputs, next_inputs)

      # Zero out output values past finish
      if impute_finished:
        emit = nest.map_structure(
            lambda out, zero: array_ops.where(finished, zero, out),
            next_outputs,
            zero_outputs)
      else:
        emit = next_outputs

      # Copy through states past finish
      def _maybe_copy_state(new, cur):
        # TensorArrays and scalar states get passed through.
        if isinstance(cur, tensor_array_ops.TensorArray):
          pass_through = True
        else:
          new.set_shape(cur.shape)
          pass_through = (new.shape.ndims == 0)
        return new if pass_through else array_ops.where(finished, cur, new)

      if impute_finished:
        next_state = nest.map_structure(
            _maybe_copy_state, decoder_state, state)
      else:
        next_state = decoder_state

      outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
                                      outputs_ta, emit)
      return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
              next_sequence_lengths)
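
A toy NumPy sketch of how next_sequence_lengths records the finishing step only for sequences that finish at this iteration; all values below are made up:

import numpy as np

time = 4
finished = np.array([False, True, False])
decoder_finished = np.array([True, True, False])
sequence_lengths = np.array([0, 2, 0])

next_finished = np.logical_or(decoder_finished, finished)
next_sequence_lengths = np.where(
    np.logical_and(np.logical_not(finished), next_finished),
    np.full_like(sequence_lengths, time + 1),
    sequence_lengths)
# next_sequence_lengths -> [5, 2, 0]
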
Example #9
 def _cdf(self, x):
   broadcasted_x = x * array_ops.ones(self.batch_shape())
   zeros = array_ops.zeros_like(x + self.a + self.b, dtype=self.dtype)
   ones = array_ops.ones_like(x + self.a + self.b, dtype=self.dtype)
   result_if_not_big = array_ops.where(
       x < self.a, zeros, (broadcasted_x - self.a) / self.range())
   return array_ops.where(x >= self.b, ones, result_if_not_big)
Example #10
def per_example_quantile_regression_loss(labels, weights, predictions,
                                         quantile):
  """Smoothed loss for quantile regression.

  The standard quantile regression loss is quantile*(y-y') when y>y' and
  (quantile-1)*(y-y') otherwise, where y' is a prediction and y is a label. The
  implementation below uses this loss, but squared in the region where the loss
  value is < 1.

  Args:
    labels: Rank 2 (N, D) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights.
    predictions: Rank 2 (N, D) tensor of per-example predictions.
    quantile: The quantile to use.

  Returns:
    loss: A Rank 2 (N, 1) tensor of per-example quantile loss.
    update_op: An update operation to update the loss's internal state.
  """
  labels = math_ops.to_float(labels)
  error = labels - predictions
  square_loss_right = array_ops.where(error * quantile < 1.0,
                                      math_ops.square(quantile * error),
                                      quantile * error)
  square_loss_left = array_ops.where(error * (quantile - 1) < 1,
                                     math_ops.square((quantile - 1) * error),
                                     (quantile - 1) * error)

  unweighted_loss = array_ops.where(error > 0, square_loss_right,
                                    square_loss_left)
  if weights is None:
    return unweighted_loss, control_flow_ops.no_op()
  else:
    return unweighted_loss * weights, control_flow_ops.no_op()
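
A NumPy sketch of the smoothed quantile loss described in the docstring (per-example weights omitted); the helper name and the inputs are illustrative:

import numpy as np

def quantile_loss_np(labels, predictions, quantile):
    error = labels - predictions
    right = np.where(error * quantile < 1.0,
                     (quantile * error) ** 2, quantile * error)
    left = np.where(error * (quantile - 1) < 1.0,
                    ((quantile - 1) * error) ** 2, (quantile - 1) * error)
    return np.where(error > 0, right, left)

quantile_loss_np(np.array([1.0, 0.0]), np.array([0.0, 3.0]), quantile=0.9)
# -> [0.81, 0.09]
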
  def _log_cdf(self, y):
    low = self._low
    high = self._high

    # Recall the promise:
    # cdf(y) := P[Y <= y]
    #         = 1, if y >= high,
    #         = 0, if y < low,
    #         = P[X <= y], otherwise.

    # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
    # between.
    j = math_ops.floor(y)

    result_so_far = self.distribution.log_cdf(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if low is not None:
      neg_inf = -np.inf * array_ops.ones_like(result_so_far)
      result_so_far = array_ops.where(j < low, neg_inf, result_so_far)
    if high is not None:
      result_so_far = array_ops.where(j >= high,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)

    return result_so_far
  def _cdf(self, y):
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff

    # Recall the promise:
    # cdf(y) := P[Y <= y]
    #         = 1, if y >= upper_cutoff,
    #         = 0, if y < lower_cutoff,
    #         = P[X <= y], otherwise.

    # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
    # between.
    j = math_ops.floor(y)

    # P[X <= j], used when lower_cutoff < X < upper_cutoff.
    result_so_far = self.distribution.cdf(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if lower_cutoff is not None:
      result_so_far = array_ops.where(j < lower_cutoff,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)
    if upper_cutoff is not None:
      result_so_far = array_ops.where(j >= upper_cutoff,
                                      array_ops.ones_like(result_so_far),
                                      result_so_far)

    return result_so_far
Example #13
def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
  """Maximum entropy loss for multiclass problems.

  Maximum entropy is a generalization of logistic loss for the case when more
  than 2 classes are present.

  Args:
    labels: Rank 2 (N, 1) or Rank 1 (N) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights.
    logits: Rank 2 (N, K) tensor of per-example predictions, where K is the
    number of classes.
    num_classes: number of classes in classification task. Used to expand label
    indices into one-hot encodings.
    eps: tolerance, used as a minimum possible value.

  Returns:
    loss: A Rank 2 (N, 1) tensor of per-example maxent loss
    update_op: An update operation to update the loss's internal state.
  """
  labels = math_ops.to_int64(labels)
  # If labels are of rank 1, make them rank 2.
  labels_shape = labels.get_shape()
  if len(labels_shape) != 2:
    labels = array_ops.expand_dims(labels, 1)
  # Labels are indices of classes, convert them to one hot encodings.
  target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
  labels = math_ops.reduce_sum(
      input_tensor=target_one_hot, reduction_indices=[1])
  labels = math_ops.to_float(labels)

  # Calculate softmax probabilities for each class.
  unnormalized_probs = math_ops.exp(logits)
  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
  softmax_predictions = math_ops.divide(unnormalized_probs,
                                        math_ops.add(normalizers, eps))

  # Pull out the probabilities for real label.
  probs_for_real_class = math_ops.reduce_sum(labels * softmax_predictions, 1)

  # Add handling for values near 0 and 1.
  zeros = array_ops.zeros_like(probs_for_real_class, dtype=logits.dtype) + eps
  one_minus_eps = array_ops.ones_like(
      probs_for_real_class, dtype=logits.dtype) - eps

  # Take maximum(eps, pred)
  cond = (probs_for_real_class >= eps)
  probs_for_real_class = array_ops.where(cond, probs_for_real_class, zeros)

  # Take minimum(1-eps, pred)
  cond = (probs_for_real_class <= 1 - eps)
  probs_for_real_class = array_ops.where(cond, probs_for_real_class,
                                         one_minus_eps)

  unweighted_loss = array_ops.expand_dims(-math_ops.log(probs_for_real_class),
                                          1)
  if weights is None:
    return unweighted_loss, control_flow_ops.no_op()
  else:
    return unweighted_loss * weights, control_flow_ops.no_op()
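
The eps clamping at the end can be seen in isolation with a small NumPy sketch using made-up probabilities:

import numpy as np

eps = 1e-15
probs_for_real_class = np.array([0.0, 0.3, 1.0])
probs = np.where(probs_for_real_class >= eps, probs_for_real_class, eps)   # max(eps, p)
probs = np.where(probs <= 1 - eps, probs, 1 - eps)                         # min(1 - eps, p)
loss = -np.log(probs)
# loss is finite everywhere; the raw -log(0.0) would have been inf.
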
Example #14
 def _cdf(self, x):
   broadcast_shape = array_ops.broadcast_dynamic_shape(
       array_ops.shape(x), self.batch_shape_tensor())
   zeros = array_ops.zeros(broadcast_shape, dtype=self.dtype)
   ones = array_ops.ones(broadcast_shape, dtype=self.dtype)
   broadcasted_x = x * ones
   result_if_not_big = array_ops.where(
       x < self.low, zeros, (broadcasted_x - self.low) / self.range())
   return array_ops.where(x >= self.high, ones, result_if_not_big)
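
A NumPy sketch of the same piecewise uniform CDF with illustrative low/high values:

import numpy as np

low, high = 2.0, 6.0
x = np.array([1.0, 3.0, 7.0])

cdf = np.where(x < low, 0.0, (x - low) / (high - low))
cdf = np.where(x >= high, 1.0, cdf)
# cdf -> [0.  , 0.25, 1.  ]
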
Example #15
 def _nest_where(vals, cases):
   assert len(vals) == len(cases) - 1
   if len(vals) == 1:
     return array_ops.where(
         math_ops.less(l1_norm, const(vals[0])), cases[0], cases[1])
   else:
     return array_ops.where(
         math_ops.less(l1_norm, const(vals[0])), cases[0],
         _nest_where(vals[1:], cases[1:]))
  def _get_coordinatewise_learning_rate(self, grad, var):
    # Compute the learning rate using a moving average for the diagonal of BB^T
    avg_first = self.get_slot(var, 'first_moment')
    avg_second = self.get_slot(var, 'second_moment')
    decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
    batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

    # Create an estimator for the moving average of gradient mean and variance
    # via Welford's algorithm
    if isinstance(grad, ops.Tensor):
      delta = grad - avg_first
      first_moment_update = avg_first.assign_add(
          array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
                          1. - decay_tensor) * delta)

      with ops.control_dependencies([first_moment_update]):
        second_moment_update = avg_second.assign_add(
            math_ops.cast(self._counter < 1, var.dtype) *
            -(1. - decay_tensor) * (
                avg_second - decay_tensor  * math_ops.square(delta)))
      diag_preconditioner = control_flow_ops.with_dependencies(
          [second_moment_update],
          clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
    elif isinstance(grad, ops.IndexedSlices):
      delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
      first_moment_update = state_ops.scatter_add(
          avg_first,
          grad.indices,
          array_ops.where(self._counter < 1,
                          math_ops.cast(1., var.dtype),
                          1. - decay_tensor) * delta)

      with ops.control_dependencies([first_moment_update]):
        avg_second = state_ops.scatter_add(
            avg_second,
            grad.indices,
            math_ops.cast(self._counter < 1, var.dtype) *
            -(1. - decay_tensor) * (
                array_ops.gather_nd(avg_second, grad.indices) - decay_tensor *
                math_ops.square(delta)))
        avg_second = array_ops.gather_nd(avg_second, grad.indices)
        # TODO(b/70783772)
        diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
    else:
      raise errors.InvalidArgumentError(
          None, None, 'grad must be of type Tensor or IndexedSlices')

    diag_preconditioner *= batch_size

    if self._use_single_learning_rate:
      diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

    # From Theorem 2 Corollary 1 of Mandt et al. 2017
    return 2. * batch_size / (
        math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
        diag_preconditioner)
Example #17
 def _prob(self, x):
   broadcasted_x = x * array_ops.ones(self.batch_shape_tensor())
   return array_ops.where(
       math_ops.is_nan(broadcasted_x),
       broadcasted_x,
       array_ops.where(
           math_ops.logical_or(broadcasted_x < self.low,
                               broadcasted_x >= self.high),
           array_ops.zeros_like(broadcasted_x),
           array_ops.ones_like(broadcasted_x) / self.range()))
Example #18
 def _prob(self, x):
   broadcasted_x = x * array_ops.ones(self.batch_shape())
   return array_ops.where(
       math_ops.is_nan(broadcasted_x),
       broadcasted_x,
       array_ops.where(
           math_ops.logical_or(broadcasted_x < self.a,
                               broadcasted_x > self.b),
           array_ops.zeros_like(broadcasted_x),
           (1. / self.range()) * array_ops.ones_like(broadcasted_x)))
 def _loop_body(iter_, total, to_skip):
   total = array_ops.where(
       step <= to_skip,
       total,
       array_ops.where(
           to_skip > 0.,
           total + (step - to_skip) * samples[..., iter_],
           total + step * samples[..., iter_]))
   to_skip = array_ops.where(step <= to_skip, to_skip - step, 0.)
   return [iter_ + 1, total, to_skip]
Example #20
  def exp_with_logits(name, eps, labels=None, logits=None):
    """Computes exponential loss given `logits`.

    The loss returned is exp(-targets*modified_predictions), where
    modified_predictions is 1 if sigmoid >= 0.5+eps (i.e. we predict the
    positive class), -1 if sigmoid < 0.5-eps (i.e. we predict the negative
    class), and ax+b on the interval (0.5-eps, 0.5+eps), where a = 1/eps and
    b = -1/(2*eps).

    Args:
      name: A name for the operation (optional).
      eps: For the range (0.5-eps, 0.5+eps) we set the predictions to be ax+b.
      labels: A `Tensor` of the same type and shape as `logits`.
      logits: A `Tensor` of type `float32` or `float64`.

    Returns:
      A `Tensor` of the same shape as `logits` with the componentwise
      exponential losses.

    Raises:
      ValueError: If `logits` and `labels` do not have the same shape.
    """
    with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
      logits = ops.convert_to_tensor(logits, name="logits")
      labels = ops.convert_to_tensor(labels, name="labels")
      try:
        labels.get_shape().merge_with(logits.get_shape())
      except ValueError:
        raise ValueError("logits and labels must have the same shape (%s vs %s)"
                         % (logits.get_shape(), labels.get_shape()))

    # Default threshold to switch between classes
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    ones = array_ops.ones_like(logits, dtype=logits.dtype)
    neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype)

    # Convert labels to 1 and -1
    cond_labels = (labels > zeros)
    labels_converted = array_ops.where(cond_labels, ones, neg_ones)

    # Convert predictions to 1 and -1
    # The loss we build is min(1, max(-1,ax+b))
    # where a=1/eps, b=-1/2eps.

    a = 1.0 / eps
    b = -1.0 / 2 / eps
    probs = math_ops.sigmoid(logits)
    y = a * probs + b
    # Build max(-1, ax+b)
    cond = (y < -1)
    max_res = array_ops.where(cond, neg_ones, y)
    # Build min part
    cond = (max_res > 1)
    min_res = array_ops.where(cond, ones, max_res)
    preds_converted = min_res
    return math_ops.exp(-preds_converted * labels_converted)
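
A NumPy sketch of the modified predictions min(1, max(-1, a*p + b)) and the resulting exponential loss; eps, logits and labels below are illustrative only:

import numpy as np

eps = 0.1
logits = np.array([-3.0, 0.2, 3.0])
labels = np.array([0.0, 1.0, 1.0])

labels_pm1 = np.where(labels > 0, 1.0, -1.0)
probs = 1.0 / (1.0 + np.exp(-logits))
y = probs / eps - 1.0 / (2.0 * eps)                  # a*p + b, a = 1/eps, b = -1/(2*eps)
preds_pm1 = np.minimum(1.0, np.maximum(-1.0, y))     # min(1, max(-1, a*p + b))
loss = np.exp(-preds_pm1 * labels_pm1)
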
def pick_vector(cond,
                true_vector,
                false_vector,
                name="pick_vector"):
  """Picks possibly different length row `Tensor`s based on condition.

  Value `Tensor`s should have exactly one dimension.

  If `cond` is a python Boolean or `tf.constant` then either `true_vector` or
  `false_vector` is immediately returned. I.e., no graph nodes are created and
  no validation happens.

  Args:
    cond: `Tensor`. Must have `dtype=tf.bool` and be scalar.
    true_vector: `Tensor` of one dimension. Returned when cond is `True`.
    false_vector: `Tensor` of one dimension. Returned when cond is `False`.
    name: `String`. The name to give this op.

  Example:

  ```python
  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
  # result is tensor: [10, 11].
  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
  # result is tensor: [15, 16, 17].
  ```

  Returns:
    true_or_false_vector: `Tensor`.

  Raises:
    TypeError: if `cond.dtype != tf.bool`
    TypeError: if `cond` is not a constant and
      `true_vector.dtype != false_vector.dtype`
  """
  with ops.name_scope(name, values=(cond, true_vector, false_vector)):
    cond = ops.convert_to_tensor(cond, name="cond")
    if cond.dtype != dtypes.bool:
      raise TypeError("%s.dtype=%s which is not %s" %
                      (cond.name, cond.dtype, dtypes.bool))
    cond_value_static = tensor_util.constant_value(cond)
    if cond_value_static is not None:
      return true_vector if cond_value_static else false_vector
    true_vector = ops.convert_to_tensor(true_vector, name="true_vector")
    false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
    if true_vector.dtype != false_vector.dtype:
      raise TypeError(
          "%s.dtype=%s does not match %s.dtype=%s"
          % (true_vector.name, true_vector.dtype,
             false_vector.name, false_vector.dtype))
    n = array_ops.shape(true_vector)[0]
    return array_ops.slice(
        array_ops.concat((true_vector, false_vector), 0),
        [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
Example #22
 def _forward_log_det_jacobian(self, x):
   if self._is_only_identity_multiplier:
     # TODO(jvdillon): We don't pad in this case and instead let the fldj be
     # applied via broadcast.
     d = math_ops.cast(array_ops.shape(x)[-1], dtype=self._scale.dtype)
     return math_ops.log(math_ops.abs(self._scale)) * array_ops.where(
         math_ops.equal(self._shaper.event_ndims, 0), 1., d)
   fldj = self._scale.sqrt_log_abs_det()
   # We need to squeeze off the padded dimension.
   start = array_ops.where(self._rank_two_event_ndims_one, 1, 0)
   return array_ops.reshape(fldj, array_ops.shape(fldj)[start:])
Example #23
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm equal to `clip_norm`. If `axes == [0]`
  instead, each column of the output will be clipped.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    values = ops.convert_to_tensor(
        t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    pred = l2sum > 0
    # Two-tap tf.where trick to bypass NaN gradients
    l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum))
    l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum)
    intermediate = values * clip_norm
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    _ = values.shape.merge_with(intermediate.shape)
    values_clip = array_ops.identity(
        intermediate / math_ops.maximum(l2norm, clip_norm), name=name)

    if isinstance(t, ops.IndexedSlices):
      return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)

    return values_clip
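
The clipping arithmetic reduces to a one-liner; a NumPy sketch for the default all-dimensions case with made-up values:

import numpy as np

t = np.array([3.0, 4.0])     # L2 norm is 5
clip_norm = 2.5

l2norm = np.sqrt(np.sum(t * t))
clipped = t * clip_norm / np.maximum(l2norm, clip_norm)
# clipped -> [1.5, 2.0], whose L2 norm is exactly clip_norm; if l2norm <= clip_norm,
# the denominator is clip_norm and t is returned unchanged.
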
Example #24
def _ndtr(x):
  """Implements ndtr core logic."""
  half_sqrt_2 = constant_op.constant(
      0.5 * math.sqrt(2.), dtype=x.dtype, name="half_sqrt_2")
  w = x * half_sqrt_2
  z = math_ops.abs(w)
  y = array_ops.where(math_ops.less(z, half_sqrt_2),
                      1. + math_ops.erf(w),
                      array_ops.where(math_ops.greater(w, 0.),
                                      2. - math_ops.erfc(z),
                                      math_ops.erfc(z)))
  return 0.5 * y
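
A scalar Python sketch of the same erf/erfc branching, using the standard library's math.erf and math.erfc; it illustrates only the selection logic, not the vectorized op:

import math

def ndtr(x):
    half_sqrt_2 = 0.5 * math.sqrt(2.0)
    w = x * half_sqrt_2
    z = abs(w)
    if z < half_sqrt_2:
        y = 1.0 + math.erf(w)
    elif w > 0.0:
        y = 2.0 - math.erfc(z)
    else:
        y = math.erfc(z)
    return 0.5 * y

ndtr(0.0)    # 0.5
ndtr(1.96)   # ~0.975
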
Example #25
 def testShapeMismatch(self):
   c = np.random.randint(0, 2, 8).astype(np.bool)
   x = np.random.rand(16, 3, 2) * 100
   y = np.random.rand(16, 3, 2) * 100
   for t in [
       np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64,
       np.complex128
   ]:
     xt = x.astype(t)
     yt = y.astype(t)
     with self.assertRaises(ValueError):
       array_ops.where(c, xt, yt)
def softplus_inverse(x, name=None):
  """Computes the inverse softplus, i.e., x = softplus_inverse(softplus(x)).

  Mathematically this op is equivalent to:

  ```none
  softplus_inverse = log(exp(x) - 1.)
  ```

  Args:
    x: `Tensor`. Non-negative (not enforced), floating-point.
    name: A name for the operation (optional).

  Returns:
    `Tensor`. Has the same type/shape as input `x`.
  """
  with ops.name_scope(name, "softplus_inverse", values=[x]):
    x = ops.convert_to_tensor(x, name="x")
    # We begin by deriving a more numerically stable softplus_inverse:
    # x = softplus(y) = Log[1 + exp{y}], (which means x > 0).
    # ==> exp{x} = 1 + exp{y}                                (1)
    # ==> y = Log[exp{x} - 1]                                (2)
    #       = Log[(exp{x} - 1) / exp{x}] + Log[exp{x}]
    #       = Log[(1 - exp{-x}) / 1] + Log[exp{x}]
    #       = Log[1 - exp{-x}] + x                           (3)
    # (2) is the "obvious" inverse, but (3) is more stable than (2) for large x.
    # For small x (e.g. x = 1e-10), (3) will become -inf since 1 - exp{-x} will
    # be zero.  To fix this, we use 1 - exp{-x} approx x for small x > 0.
    #
    # In addition to the numerically stable derivation above, we clamp
    # small/large values to be congruent with the logic in:
    # tensorflow/core/kernels/softplus_op.h
    #
    # Finally, we set the input to one whenever the input is too large or too
    # small. This ensures that no unchosen codepath is +/- inf. This is
    # necessary to ensure the gradient doesn't get NaNs. Recall that the
    # gradient of `where` behaves like `pred*pred_true + (1-pred)*pred_false`
    # thus an `inf` in an unselected path results in `0*inf=nan`. We are careful
    # to overwrite `x` with ones only when we will never actually use this
    # value.  Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
    threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
    is_too_small = math_ops.less(x, np.exp(threshold))
    is_too_large = math_ops.greater(x, -threshold)
    too_small_value = math_ops.log(x)
    too_large_value = x
    # This `where` will ultimately be a NOP because we won't select this
    # codepath whenever we used the surrogate `ones_like`.
    x = array_ops.where(math_ops.logical_or(is_too_small, is_too_large),
                        array_ops.ones_like(x), x)
    y = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
    return array_ops.where(is_too_small, too_small_value,
                           array_ops.where(is_too_large, too_large_value, y))
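
A NumPy sketch of the stable identity log(expm1(x)) = x + log(1 - exp(-x)) derived above; the small/large clamping done in the TensorFlow version is omitted here:

import numpy as np

def softplus(y):
    return np.log1p(np.exp(y))

def softplus_inverse_np(x):
    # log(expm1(x)) rewritten as x + log(1 - exp(-x)), stable for large x.
    return x + np.log(-np.expm1(-x))

x = np.array([0.1, 1.0, 30.0])
np.allclose(softplus(softplus_inverse_np(x)), x)   # True
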
Example #27
def sparsemax_loss(logits, sparsemax, labels, name=None):
  """Computes sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`.
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`.
  """

  with ops.name_scope(name, "sparsemax_loss",
                      [logits, sparsemax, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
    labels = ops.convert_to_tensor(labels, name="labels")

    # In the paper, they call the logits z.
    # A constant can be subtracted from the logits to make the algorithm
    # more numerically stable in theory. However, there is really no major
    # source of numerical instability in this algorithm.
    z = logits

    # sum over support
    # Use a conditional where instead of a multiplication to support z = -inf.
    # If z = -inf, and there is no support (sparsemax = 0), a multiplication
    # would cause 0 * -inf = nan, which is not correct in this case.
    sum_s = array_ops.where(
        math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
        sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))

    # - z_k + ||q||^2
    q_part = labels * (0.5 * labels - z)
    # Fix the case where labels = 0 and z = -inf, where q_part would
    # otherwise be 0 * -inf = nan. But since labels = 0, no cost for
    # z = -inf should be considered.
    # The code below also covers the case where z = inf. However, in this
    # case the sparsemax will be nan, which means sum_s will also be nan,
    # therefore this case doesn't need additional special treatment.
    q_part_safe = array_ops.where(
        math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
        array_ops.zeros_like(z), q_part)

    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
Example #28
 def compute_lr(self, grad, var):
   scaled_lr = self._learning_rate
   if self._skip_list is None or not any(v in var.name
                                         for v in self._skip_list):
     w_norm = linalg_ops.norm(var, ord=2)
     g_norm = linalg_ops.norm(grad, ord=2)
     trust_ratio = array_ops.where(
         math_ops.greater(w_norm, 0),
         array_ops.where(
             math_ops.greater(g_norm, 0),
             (self._eeta * w_norm /
              (g_norm + self._weight_decay * w_norm + self._epsilon)), 1.0),
         1.0)
     scaled_lr = self._learning_rate * trust_ratio
   return scaled_lr
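
A NumPy sketch of the trust-ratio computation for a single variable; the hyper-parameter defaults below are illustrative, not the optimizer's actual defaults:

import numpy as np

def lars_trust_ratio(w, g, eeta=0.001, weight_decay=1e-4, epsilon=1e-8):
    w_norm = np.linalg.norm(w)
    g_norm = np.linalg.norm(g)
    if w_norm > 0 and g_norm > 0:
        return eeta * w_norm / (g_norm + weight_decay * w_norm + epsilon)
    return 1.0

scaled_lr = 0.1 * lars_trust_ratio(np.ones(4), np.full(4, 0.01))
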
 def body_fn(t, state, ta):
   inputs_t = array_ops.expand_dims(
       array_ops.gather(inputs_ta.read(t), i), 0)
   output, new_state = cell(inputs_t, state)
   output = array_ops.reshape(output, [-1])
   # TODO(agarwal): one optimization that dynamic_rnn uses is to avoid the
   # array_ops.where when t < min(sequence_length). Doing that requires
   # supporting tf.cond pfor conversion.
   done = t >= sequence_length_i
   output = array_ops.where(done, zeros, output)
   ta = ta.write(t, output)
   new_state = [array_ops.where(done, s, ns) for s, ns in
                zip(nest.flatten(state), nest.flatten(new_state))]
   new_state = nest.pack_sequence_as(state, new_state)
   return t + 1, new_state, ta
Example #30
def _MaximumMinimumGrad(op, grad, selector_op):
  """Factor out the code for the gradient of Maximum or Minimum."""
  x = op.inputs[0]
  y = op.inputs[1]
  gdtype = grad.dtype
  sx = array_ops.shape(x)
  sy = array_ops.shape(y)
  gradshape = array_ops.shape(grad)
  zeros = array_ops.zeros(gradshape, gdtype)
  xmask = selector_op(x, y)
  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
  xgrad = array_ops.where(xmask, grad, zeros)
  ygrad = array_ops.where(xmask, zeros, grad)
  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
  return (gx, gy)
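
A NumPy sketch of how the gradient is routed between the two inputs of Maximum; the broadcast reduction over rx/ry is omitted and equal shapes are assumed:

import numpy as np

x = np.array([1.0, 5.0, 2.0])
y = np.array([3.0, 4.0, 2.0])
grad = np.array([0.1, 0.2, 0.3])

xmask = x >= y                        # selector for Maximum
gx = np.where(xmask, grad, 0.0)       # portion of the gradient routed to x
gy = np.where(xmask, 0.0, grad)       # the rest goes to y
# gx -> [0. , 0.2, 0.3], gy -> [0.1, 0. , 0. ]
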
Example #31
 def _loss(logits):
     """The loss of pairwise logits with l_i > l_j."""
     return array_ops.where(math_ops.greater(logits, 0),
                            1. - math_ops.sigmoid(logits),
                            math_ops.sigmoid(-logits))
Example #32
    def training_graph(self,
                       input_data,
                       input_labels,
                       random_seed,
                       data_spec,
                       epoch=None,
                       input_weights=None):
        """Constructs a TF graph for training a random tree.

    Args:
      input_data: A tensor or SparseTensor or placeholder for input data.
      input_labels: A tensor or placeholder for labels associated with
        input_data.
      random_seed: The random number generator seed to use for this tree.  0
        means use the current time as the seed.
      data_spec: A list of tf.dtype values specifying the original types of
        each column.
      epoch: A tensor or placeholder for the epoch the training data comes from.
      input_weights: A float tensor or placeholder holding per-input weights,
        or None if all inputs are to be weighted equally.

    Returns:
      The last op in the random tree training graph.
    """
        epoch = [0] if epoch is None else epoch

        if input_weights is None:
            input_weights = []

        sparse_indices = []
        sparse_values = []
        sparse_shape = []
        if isinstance(input_data, ops.SparseTensor):
            sparse_indices = input_data.indices
            sparse_values = input_data.values
            sparse_shape = input_data.shape
            input_data = []

        # Count extremely random stats.
        (node_sums, node_squares, splits_indices, splits_sums, splits_squares,
         totals_indices, totals_sums, totals_squares,
         input_leaves) = (self.training_ops.count_extremely_random_stats(
             input_data,
             sparse_indices,
             sparse_values,
             sparse_shape,
             data_spec,
             input_labels,
             input_weights,
             self.variables.tree,
             self.variables.tree_thresholds,
             self.variables.node_to_accumulator_map,
             self.variables.candidate_split_features,
             self.variables.candidate_split_thresholds,
             self.variables.start_epoch,
             epoch,
             num_classes=self.params.num_output_columns,
             regression=self.params.regression))
        node_update_ops = []
        node_update_ops.append(
            state_ops.assign_add(self.variables.node_sums, node_sums))

        splits_update_ops = []
        splits_update_ops.append(
            self.training_ops.scatter_add_ndim(
                self.variables.candidate_split_sums, splits_indices,
                splits_sums))
        splits_update_ops.append(
            self.training_ops.scatter_add_ndim(self.variables.accumulator_sums,
                                               totals_indices, totals_sums))

        if self.params.regression:
            node_update_ops.append(
                state_ops.assign_add(self.variables.node_squares,
                                     node_squares))
            splits_update_ops.append(
                self.training_ops.scatter_add_ndim(
                    self.variables.candidate_split_squares, splits_indices,
                    splits_squares))
            splits_update_ops.append(
                self.training_ops.scatter_add_ndim(
                    self.variables.accumulator_squares, totals_indices,
                    totals_squares))

        # Sample inputs.
        update_indices, feature_updates, threshold_updates = (
            self.training_ops.sample_inputs(
                input_data,
                sparse_indices,
                sparse_values,
                sparse_shape,
                input_weights,
                self.variables.node_to_accumulator_map,
                input_leaves,
                self.variables.candidate_split_features,
                self.variables.candidate_split_thresholds,
                split_initializations_per_input=(
                    self.params.split_initializations_per_input),
                split_sampling_random_seed=random_seed))
        update_features_op = state_ops.scatter_update(
            self.variables.candidate_split_features, update_indices,
            feature_updates)
        update_thresholds_op = state_ops.scatter_update(
            self.variables.candidate_split_thresholds, update_indices,
            threshold_updates)

        # Calculate finished nodes.
        with ops.control_dependencies(splits_update_ops):
            finished, stale = self.training_ops.finished_nodes(
                self.variables.accumulator_to_node_map,
                self.variables.node_to_accumulator_map,
                self.variables.candidate_split_sums,
                self.variables.candidate_split_squares,
                self.variables.accumulator_sums,
                self.variables.accumulator_squares,
                self.variables.start_epoch,
                epoch,
                num_split_after_samples=self.params.split_after_samples,
                min_split_samples=self.params.min_split_samples)

        # Update leaf scores.
        # TODO(thomaswc): Store the leaf scores in a TopN and only update the
        # scores of the leaves that were touched by this batch of input.
        children = array_ops.squeeze(array_ops.slice(self.variables.tree,
                                                     [0, 0], [-1, 1]),
                                     squeeze_dims=[1])
        is_leaf = math_ops.equal(constants.LEAF_NODE, children)
        leaves = math_ops.to_int32(
            array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1]))
        non_fertile_leaves = array_ops.boolean_mask(
            leaves,
            math_ops.less(
                array_ops.gather(self.variables.node_to_accumulator_map,
                                 leaves), 0))

        # TODO(gilberth): It should be possible to limit the number of non
        # fertile leaves we calculate scores for, especially since we can only take
        # at most array_ops.shape(finished)[0] of them.
        with ops.control_dependencies(node_update_ops):
            sums = array_ops.gather(self.variables.node_sums,
                                    non_fertile_leaves)
            if self.params.regression:
                squares = array_ops.gather(self.variables.node_squares,
                                           non_fertile_leaves)
                non_fertile_leaf_scores = self._variance(sums, squares)
            else:
                non_fertile_leaf_scores = self._weighted_gini(sums)

        # Calculate best splits.
        with ops.control_dependencies(splits_update_ops):
            split_indices = self.training_ops.best_splits(
                finished,
                self.variables.node_to_accumulator_map,
                self.variables.candidate_split_sums,
                self.variables.candidate_split_squares,
                self.variables.accumulator_sums,
                self.variables.accumulator_squares,
                regression=self.params.regression)

        # Grow tree.
        with ops.control_dependencies(
            [update_features_op, update_thresholds_op]):
            (tree_update_indices, tree_children_updates,
             tree_threshold_updates, new_eot) = (self.training_ops.grow_tree(
                 self.variables.end_of_tree,
                 self.variables.node_to_accumulator_map, finished,
                 split_indices, self.variables.candidate_split_features,
                 self.variables.candidate_split_thresholds))
            tree_update_op = state_ops.scatter_update(self.variables.tree,
                                                      tree_update_indices,
                                                      tree_children_updates)
            thresholds_update_op = state_ops.scatter_update(
                self.variables.tree_thresholds, tree_update_indices,
                tree_threshold_updates)
            # TODO(thomaswc): Only update the epoch on the new leaves.
            new_epoch_updates = epoch * array_ops.ones_like(
                tree_threshold_updates, dtype=dtypes.int32)
            epoch_update_op = state_ops.scatter_update(
                self.variables.start_epoch, tree_update_indices,
                new_epoch_updates)

        # Update fertile slots.
        with ops.control_dependencies([tree_update_op]):
            (n2a_map_updates, a2n_map_updates, accumulators_cleared,
             accumulators_allocated) = (self.training_ops.update_fertile_slots(
                 finished,
                 non_fertile_leaves,
                 non_fertile_leaf_scores,
                 self.variables.end_of_tree,
                 self.variables.accumulator_sums,
                 self.variables.node_to_accumulator_map,
                 stale,
                 regression=self.params.regression))

        # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has
        # used it to calculate new leaves.
        gated_new_eot, = control_flow_ops.tuple(
            [new_eot], control_inputs=[n2a_map_updates])
        eot_update_op = state_ops.assign(self.variables.end_of_tree,
                                         gated_new_eot)

        updates = []
        updates.append(eot_update_op)
        updates.append(tree_update_op)
        updates.append(thresholds_update_op)
        updates.append(epoch_update_op)

        updates.append(
            state_ops.scatter_update(self.variables.node_to_accumulator_map,
                                     n2a_map_updates[0], n2a_map_updates[1]))

        updates.append(
            state_ops.scatter_update(self.variables.accumulator_to_node_map,
                                     a2n_map_updates[0], a2n_map_updates[1]))

        cleared_and_allocated_accumulators = array_ops.concat(
            0, [accumulators_cleared, accumulators_allocated])

        # Calculate values to put into scatter update for candidate counts.
        # Candidate split counts are always reset back to 0 for both cleared
        # and allocated accumulators. This means some accumulators might be doubly
        # reset to 0 if the were released and not allocated, then later allocated.
        split_values = array_ops.tile(
            array_ops.expand_dims(
                array_ops.expand_dims(
                    array_ops.zeros_like(cleared_and_allocated_accumulators,
                                         dtype=dtypes.float32), 1), 2),
            [
                1, self.params.num_splits_to_consider,
                self.params.num_output_columns
            ])
        updates.append(
            state_ops.scatter_update(self.variables.candidate_split_sums,
                                     cleared_and_allocated_accumulators,
                                     split_values))
        if self.params.regression:
            updates.append(
                state_ops.scatter_update(
                    self.variables.candidate_split_squares,
                    cleared_and_allocated_accumulators, split_values))

        # Calculate values to put into scatter update for total counts.
        total_cleared = array_ops.tile(
            array_ops.expand_dims(
                math_ops.neg(
                    array_ops.ones_like(accumulators_cleared,
                                        dtype=dtypes.float32)), 1),
            [1, self.params.num_output_columns])
        total_reset = array_ops.tile(
            array_ops.expand_dims(
                array_ops.zeros_like(accumulators_allocated,
                                     dtype=dtypes.float32), 1),
            [1, self.params.num_output_columns])
        accumulator_updates = array_ops.concat(0, [total_cleared, total_reset])
        updates.append(
            state_ops.scatter_update(self.variables.accumulator_sums,
                                     cleared_and_allocated_accumulators,
                                     accumulator_updates))
        if self.params.regression:
            updates.append(
                state_ops.scatter_update(self.variables.accumulator_squares,
                                         cleared_and_allocated_accumulators,
                                         accumulator_updates))

        # Calculate values to put into scatter update for candidate splits.
        split_features_updates = array_ops.tile(
            array_ops.expand_dims(
                math_ops.neg(
                    array_ops.ones_like(cleared_and_allocated_accumulators)),
                1), [1, self.params.num_splits_to_consider])
        updates.append(
            state_ops.scatter_update(self.variables.candidate_split_features,
                                     cleared_and_allocated_accumulators,
                                     split_features_updates))

        updates += self.finish_iteration()

        return control_flow_ops.group(*updates)
Example #33
def _SelectGrad(op, grad):
    c = op.inputs[0]
    x = op.inputs[1]
    zeros = array_ops.zeros_like(x)
    return (None, array_ops.where(c, grad,
                                  zeros), array_ops.where(c, zeros, grad))
Example #34
def reduce_weighted_logsumexp(logx,
                              w=None,
                              axis=None,
                              keep_dims=False,
                              return_sign=False,
                              name=None):
    """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  If all weights `w` are known to be positive, it is more efficient to directly
  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.log(w))` is more
  efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  Reduces `input_tensor` along the dimensions given in `axis`.
  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
  entry in `axis`. If `keep_dims` is true, the reduced dimensions
  are retained with length 1.

  If `axis` has no entries, all dimensions are reduced, and a
  tensor with a single element is returned.

  This function is more numerically stable than log(sum(w * exp(input))). It
  avoids overflows caused by taking the exp of large inputs and underflows
  caused by taking the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default),
      reduces all dimensions. Must be in the range
      `[-rank(input_tensor), rank(input_tensor))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
    with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
        logx = ops.convert_to_tensor(logx, name="logx")
        if w is None:
            lswe = math_ops.reduce_logsumexp(logx,
                                             axis=axis,
                                             keep_dims=keep_dims)
            if return_sign:
                sgn = array_ops.ones_like(lswe)
                return lswe, sgn
            return lswe
        w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
        log_absw_x = logx + math_ops.log(math_ops.abs(w))
        max_log_absw_x = math_ops.reduce_max(log_absw_x,
                                             axis=axis,
                                             keep_dims=True)
        # If the largest element is `-inf` or `inf` then we don't bother subtracting
        # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
        # this is ok follows from the fact that we're actually free to subtract any
        # value we like, so long as we add it back after taking the `log(sum(...))`.
        max_log_absw_x = array_ops.where(math_ops.is_inf(max_log_absw_x),
                                         array_ops.zeros_like(max_log_absw_x),
                                         max_log_absw_x)
        wx_over_max_absw_x = (math_ops.sign(w) *
                              math_ops.exp(log_absw_x - max_log_absw_x))
        sum_wx_over_max_absw_x = math_ops.reduce_sum(wx_over_max_absw_x,
                                                     axis=axis,
                                                     keep_dims=keep_dims)
        if not keep_dims:
            max_log_absw_x = array_ops.squeeze(max_log_absw_x, axis)
        sgn = math_ops.sign(sum_wx_over_max_absw_x)
        lswe = max_log_absw_x + math_ops.log(sgn * sum_wx_over_max_absw_x)
        if return_sign:
            return lswe, sgn
        return lswe
Example #35
 def loop_fn(i):
     a_i = array_ops.gather(a, i)
     b_i = array_ops.gather(b, i)
     cond_i = array_ops.gather(cond, i)
     return array_ops.where(cond_i, a_i, b_i)
Example #36
    def get_losses(self,
                   logits,
                   localisations,
                   gclasses,
                   glocalisations,
                   gscores,
                   match_threshold=0.5,
                   negative_ratio=2.5,
                   alpha=1.,
                   label_smoothing=0.,
                   scope=None):
        """Loss functions for training the SSD 300 VGG network.
    
        This function defines the different loss components of the SSD, and
        adds them to the TF loss collection.
    
        Arguments:
          logits: (list of) predictions logits Tensors;
          localisations: (list of) localisations Tensors;
          gclasses: (list of) groundtruth labels Tensors;
          glocalisations: (list of) groundtruth localisations Tensors;
          gscores: (list of) groundtruth score Tensors;
        """
        with tf.name_scope(scope, 'ssd_losses'):
            lshape = tfe.get_shape(logits[0], 5)
            num_classes = lshape[-1]
            #             batch_size = lshape[0]

            # Flatten out all vectors!
            flogits = []
            fgclasses = []
            fgscores = []
            flocalisations = []
            fglocalisations = []
            for i in range(len(logits)):
                flogits.append(tf.reshape(logits[i], [-1, num_classes]))
                fgclasses.append(tf.reshape(gclasses[i], [-1]))
                fgscores.append(tf.reshape(gscores[i], [-1]))
                flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
                fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
            # And concat the crap!
            logits = tf.concat(flogits, axis=0)
            gclasses = tf.concat(fgclasses, axis=0)
            gscores = tf.concat(fgscores, axis=0)
            localisations = tf.concat(flocalisations, axis=0)
            glocalisations = tf.concat(fglocalisations, axis=0)
            dtype = logits.dtype

            # Compute positive matching mask...
            pmask = gclasses > 0
            fpmask = tf.cast(pmask, dtype)
            n_positives = tf.reduce_sum(fpmask)

            # Hard negative mining...
            # For no_classes we only care that a false positive's label is 0,
            # which is why pmask suffices for our needs.
            no_classes = tf.cast(pmask, tf.int32)
            predictions = slim.softmax(logits)
            nmask = tf.logical_not(pmask)
            fnmask = tf.cast(nmask, dtype)

            nvalues = tf.where(nmask, predictions[:, 0], 1. - fnmask)

            nvalues_flat = tf.reshape(nvalues, [-1])
            # Number of negative entries to select.
            max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)

            n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
            n_neg = tf.minimum(n_neg, max_neg_entries)
            # Avoid n_neg being zero, which would make top_k below fail.
            n_neg = tf.maximum(n_neg, 1)

            val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
            max_hard_pred = -val[-1]
            # Final negative mask, hard negative mining
            nmask = tf.logical_and(nmask, nvalues <= max_hard_pred)
            fnmask = tf.cast(nmask, dtype)

            # Add cross-entropy loss.
            with tf.name_scope('cross_entropy_pos'):
                total_cross_pos = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=gclasses)
                total_cross_pos = tf.reduce_sum(total_cross_pos * fpmask,
                                                name="cross_entropy_pos")
                tf.losses.add_loss(total_cross_pos)

            with tf.name_scope('cross_entropy_neg'):
                total_cross_neg = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=no_classes)
                total_cross_neg = tf.reduce_sum(total_cross_neg * fnmask,
                                                name="cross_entropy_neg")
                tf.losses.add_loss(total_cross_neg)

            # Add localization loss: smooth L1, L2, ...
            with tf.name_scope('localization'):
                # Weights Tensor: positive mask + random negative.
                weights = tf.expand_dims(alpha * fpmask, axis=-1)
                total_loc = custom_layers.abs_smooth_2(localisations -
                                                       glocalisations)
                total_loc = tf.reduce_sum(total_loc * weights,
                                          name="localization")
                tf.losses.add_loss(total_loc)

            total_cross = tf.add(total_cross_pos, total_cross_neg,
                                 'cross_entropy')

            # Add to the EXTRA_LOSSES collection.
            tf.add_to_collection('EXTRA_LOSSES', total_cross_pos)
            tf.add_to_collection('EXTRA_LOSSES', total_cross_neg)
            tf.add_to_collection('EXTRA_LOSSES', total_cross)
            tf.add_to_collection('EXTRA_LOSSES', total_loc)

            # Stick with the original paper's definition of the model loss.
            model_loss = tf.get_collection(tf.GraphKeys.LOSSES)
            model_loss = tf.add_n(model_loss)
            model_loss = array_ops.where(tf.equal(n_positives, 0),
                                         array_ops.zeros_like(model_loss),
                                         tf.div(1.0, n_positives) * model_loss)
            # Add regularization loss.
            regularization_losses = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            regularization_loss = tf.add_n(regularization_losses,
                                           name='regularization_loss')

            # If the model loss is zero, there is no need to do a gradient update on this batch.
            total_loss = array_ops.where(
                tf.equal(n_positives, 0), array_ops.zeros_like(model_loss),
                tf.add(model_loss, regularization_loss))

            #debugging info
            tf.summary.scalar("postive_num", n_positives)
            tf.summary.scalar("negative_num", n_neg)
            tf.summary.scalar("regularization_loss", regularization_loss)
            #             with tf.name_scope('variables_loc'):
            #                 selected_p = tf.boolean_mask(glocalisations, pmask)
            #                 p_mean, p_variance = tf.nn.moments(selected_p, [0])
            #                 tf.summary.scalar("mean_cx", p_mean[0])
            #                 tf.summary.scalar("mean_cy", p_mean[1])
            #                 tf.summary.scalar("mean_w", p_mean[2])
            #                 tf.summary.scalar("mean_h", p_mean[3])
            #
            #                 tf.summary.scalar("var_cx", p_variance[0])
            #                 tf.summary.scalar("var_cy", p_variance[1])
            #                 tf.summary.scalar("var_w", p_variance[2])
            #                 tf.summary.scalar("var_h", p_variance[3])

            return total_loss
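A minimal NumPy sketch (toy masks and scores, not the SSD code path) of the hard negative mining step above: keep all positives, then keep only the `n_neg` negatives whose background probability is lowest, i.e. the negatives the network is most confidently wrong about.

import numpy as np

pmask = np.array([True, False, False, False, True, False])   # ground-truth positives
bg_prob = np.array([0.1, 0.9, 0.2, 0.6, 0.05, 0.3])          # predictions[:, 0]

negative_ratio = 1.0
n_neg = max(min(int(negative_ratio * pmask.sum()), (~pmask).sum()), 1)

neg_scores = np.where(~pmask, bg_prob, 1.0)   # positives can never be selected
hardest = np.argsort(neg_scores)[:n_neg]      # lowest background prob = hardest
nmask = np.zeros_like(pmask)
nmask[hardest] = True
print(nmask)                                  # [False False  True False False  True]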
Example #37
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
  """Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under the
  assumption that the target has a Poisson distribution.
  Caveat: By default, this is not the exact loss, but the loss minus a
    constant term [log(z!)]. That has no effect for optimization, but
    does not play well with relative loss comparisons. To compute an
    approximation of the log factorial term, specify
    compute_full_loss=True to enable Stirling's Approximation.

  For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
  loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
          [ Note the second term is the Stirling's Approximation for log(z!).
            It is invariant to x and does not affect optimization, though
            important for correct relative loss comparisons. It is only
            computed when compute_full_loss == True. ]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    log Poisson losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name:
    log_input = ops.convert_to_tensor(log_input, name="log_input")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().merge_with(log_input.get_shape())
    except ValueError:
      raise ValueError(
          "log_input and targets must have the same shape (%s vs %s)" %
          (log_input.get_shape(), targets.get_shape()))

    result = math_ops.exp(log_input) - log_input * targets
    if compute_full_loss:
      # need to create constant tensors here so that their dtypes can be matched
      # to that of the targets.
      point_five = constant_op.constant(0.5, dtype=targets.dtype)
      two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)

      stirling_approx = (targets * math_ops.log(targets)) - targets + (
          point_five * math_ops.log(two_pi * targets))
      zeros = array_ops.zeros_like(targets, dtype=targets.dtype)
      ones = array_ops.ones_like(targets, dtype=targets.dtype)
      cond = math_ops.logical_and(targets >= zeros, targets <= ones)
      result += array_ops.where(cond, zeros, stirling_approx)
    return result
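A small NumPy sketch (made-up values) of the expressions in the docstring: the base loss is `exp(c) - z*c`, and `compute_full_loss=True` adds the Stirling term `z*log(z) - z + 0.5*log(2*pi*z)` except where `0 <= z <= 1`.

import numpy as np

log_input = np.array([0.1, 1.2, -0.3])   # c = log(x)
targets = np.array([0.0, 3.0, 1.0])      # z

loss = np.exp(log_input) - log_input * targets
safe_z = np.where(targets > 0, targets, 1.0)   # avoid log(0) in the masked-out branch
stirling = targets * np.log(safe_z) - targets + 0.5 * np.log(2 * np.pi * safe_z)
full_loss = loss + np.where((targets >= 0) & (targets <= 1), 0.0, stirling)
print(loss, full_loss)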
Example #38
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  Code example:

  >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32)
  >>> tf.clip_by_norm(some_nums, 2.0).numpy()
  array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]],
        dtype=float32)

  This operation is typically used to clip gradients before applying them with
  an optimizer.  Most gradient data is a collection of different shaped tensors
  for different parts of the model.  Thus, this is a common usage:

  ```
  # Get your gradients after training
  loss_value, grads = grad(model, features, labels)

  # Apply some clipping
  grads = [tf.clip_by_norm(g, norm)
               for g in grads]

  # Continue on with training
  optimizer.apply_gradients(grads)
  ```

  Args:
    t: A `Tensor` or `IndexedSlices`.  This must be a floating point type.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also
      floating point
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    ValueError: If the clip_norm tensor is not a 0-D scalar tensor.
    TypeError: If dtype of the input is not a floating point or
      complex type.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    values = ops.convert_to_tensor(
        t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    pred = l2sum > 0
    # Two-tap tf.where trick to bypass NaN gradients
    l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum))
    l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum)
    intermediate = values * clip_norm
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    _ = values.shape.merge_with(intermediate.shape)
    values_clip = array_ops.identity(
        intermediate / math_ops.maximum(l2norm, clip_norm), name=name)

    if isinstance(t, ops.IndexedSlices):
      return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)

    return values_clip
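A short sketch (assuming TF 2.x eager execution, not the graph-mode code above) of why the two-tap `where` trick matters: differentiating `sqrt(reduce_sum(x*x))` at `x == 0` produces a NaN gradient, while routing the zero case through `where` keeps the gradient finite.

import tensorflow as tf

x = tf.Variable([0.0, 0.0, 0.0])

with tf.GradientTape() as tape:
    naive_norm = tf.sqrt(tf.reduce_sum(x * x))           # sqrt at 0 -> NaN gradient
print(tape.gradient(naive_norm, x).numpy())              # [nan nan nan]

with tf.GradientTape() as tape:
    l2sum = tf.reduce_sum(x * x)
    pred = l2sum > 0
    l2sum_safe = tf.where(pred, l2sum, tf.ones_like(l2sum))
    l2norm = tf.where(pred, tf.sqrt(l2sum_safe), l2sum)  # mirrors the two taps above
print(tape.gradient(l2norm, x).numpy())                  # [0. 0. 0.]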
Example #39
 def _copy_one_through(output, new_output):
     copy_cond = (time >= sequence_length)
     with ops.colocate_with(new_output):
         return array_ops.where(copy_cond, output, new_output)
def safe_embedding_lookup_sparse(
    embedding_weights,
    sparse_ids,
    sparse_weights=None,
    combiner="mean",
    default_id=None,
    name="safe_embedding_lookup_sparse",
    partition_strategy=None,  # not used
    max_norm=None,
    return_trainable=False,
):
  """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`.

    Lookup embedding results, accounting for empty features and invalid weights.

    All IDs are treated as valid, including non-positive IDs.
    Invalid weights (<= 0) are pruned from input weights, as well as any IDs
    with non-positive weight. For an entry with no features, the embedding vector
    for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

    The ids and weights may be multi-dimensional. Embeddings are always aggregated
    along the last dimension.

    Args:
      embedding_weights: A single `dynamic_embedding.Variable` instance
        representing the complete embedding tensor.
      sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
      sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights are
        assumed to be 1.0.
      combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
        default.
      default_id: The id to use for an entry with no features.
      name: A name for this operation. Name is optional in graph mode and required
        in eager mode.
      partition_strategy: A string specifying the partitioning strategy. Currently
        `"div"` and `"mod"` are supported. Default is `"div"`.
      max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
        combining.

    Returns:
      combined_embeddings:
        A dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizers `var_list`
          Only provided if `return_trainable` is True.

    Raises:
      ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  if embedding_weights.key_dtype != sparse_ids.dtype:
    raise TypeError(
        "embedding_weights.key_dtype should be same with sparse_ids.dtype: "
        "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype))

  weights_dtype = sparse_weights.dtype if sparse_weights is not None else None
  if weights_dtype and embedding_weights.value_dtype != weights_dtype:
    raise TypeError(
        "embedding_weights.value_dtype should be same with sparse_weights.dtype"
        ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype))

  scope = variable_scope.get_variable_scope()
  full_name = scope.name + "/" + name if scope.name else name
  with ops.name_scope(full_name + "/"):
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.dense_shape
    original_rank_dim = tensor_shape.dimension_value(
        sparse_ids.dense_shape.get_shape()[0])
    original_rank = (array_ops.size(original_shape)
                     if original_rank_dim is None else original_rank_dim)
    sparse_ids = de.math.sparse_reshape(
        sparse_ids,
        [
            math_ops.reduce_prod(
                array_ops.slice(original_shape, [0], [original_rank - 1])),
            array_ops.gather(original_shape, original_rank - 1),
        ],
    )
    if sparse_weights is not None:
      sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices,
                                                  sparse_weights.values,
                                                  sparse_ids.dense_shape)

    # Prune invalid weights.
    if combiner != "sum":
      sparse_ids, sparse_weights = _prune_invalid_weights(
          sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = de.math.sparse_fill_empty_rows(
        sparse_ids, default_id or 0)
    if sparse_weights is not None:
      sparse_weights, _ = de.math.sparse_fill_empty_rows(sparse_weights, 1.0)

    result, trainable_ = embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=name + "/embedding_lookup_sparse",
        max_norm=max_norm,
        return_trainable=True,
    )

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]),
      )

      result = array_ops.where(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name="where")

    # Reshape back from linear ids back into higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat(
            [
                array_ops.slice(
                    math_ops.cast(original_shape, dtypes.int32),
                    [0],
                    [original_rank - 1],
                ),
                array_ops.slice(array_ops.shape(result), [1], [-1]),
            ],
            0,
        ),
    )
    final_result.set_shape(
        tensor_shape.unknown_shape(
            (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
                result.get_shape()[1:]))
    return (final_result, trainable_) if return_trainable else final_result
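A minimal TF 2.x sketch (using the standard `tf.sparse` / `tf.nn` API rather than the dynamic_embedding variant above) of the empty-row handling: empty rows are filled with a dummy id, looked up, then zeroed back out when no `default_id` is given.

import tensorflow as tf

emb = tf.constant([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
ids = tf.sparse.SparseTensor(indices=[[0, 0], [2, 0]],
                             values=tf.constant([1, 2], dtype=tf.int64),
                             dense_shape=[3, 1])          # row 1 has no features

filled, is_row_empty = tf.sparse.fill_empty_rows(ids, 0)
looked_up = tf.nn.embedding_lookup_sparse(emb, filled, None, combiner="mean")
result = tf.where(is_row_empty[:, None], tf.zeros_like(looked_up), looked_up)
print(result.numpy())   # [[2. 2.] [0. 0.] [3. 3.]]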
Example #41
File: crf.py  Project: ljos/navnkjenner
 def _single_seq_fn():
     log_norm = math_ops.reduce_logsumexp(first_input, [1])
     # Mask `log_norm` of the sequences with length <= zero.
     log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                                array_ops.zeros_like(log_norm), log_norm)
     return log_norm
Example #42
def _embedding_lookup_and_transform(params,
                                    ids,
                                    partition_strategy="mod",
                                    name=None,
                                    max_norm=None,
                                    transform_fn=None):
    """Helper function for embedding_lookup and _compute_sampled_logits.

  This function is a generalization of embedding_lookup that optionally
  applies a caller-specified transformation to each embedding. This is
  done through the `transform_fn` argument. If provided, the function is
  applied to each partitioned tensor of retrieved embeddings, colocated
  with the embeddings. This function will be called with a single `Tensor`
  argument of the same type as the `params` tensor and should return a
  `Tensor`. The shape of the argument will be the same as `params` except
  for the size of the first dimension. The first dimension of the result's
  shape must be the same size as the argument's.

  Args:
    params: See embedding_lookup.
    ids: See embedding_lookup.
    partition_strategy: See embedding_lookup.
    name: See embedding_lookup.
    max_norm: See embedding_lookup.
    transform_fn: An optional function to apply to each retrieved embedding.
      If max_norm is provided, transform_fn is applied to the norm-limited
      embeddings.

  Returns:
    See embedding_lookup for details.
  Raises:
    ValueError: If `params` is empty.
  """
    if params is None or params in ((), []):
        raise ValueError("Need at least one param")
    if isinstance(params, variables.PartitionedVariable):
        params = list(params)  # Iterate to get the underlying Variables.
    if not isinstance(params, list):
        params = [params]

    with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
        np = len(params)  # Number of partitions
        # Preserve the resource variable status to avoid accidental dense reads.
        if not any(
                isinstance(p, resource_variable_ops.ResourceVariable)
                for p in params):
            params = ops.convert_n_to_tensor_or_indexed_slices(params,
                                                               name="params")
        ids = ops.convert_to_tensor(ids, name="ids")
        if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
            with ops.colocate_with(params[0]):
                result = _clip(array_ops.gather(params[0], ids, name=name),
                               ids, max_norm)
                if transform_fn:
                    result = transform_fn(result)
                return result
        else:
            # Flatten the ids. There are two cases where we need to do this.
            # - There is more than one params tensor.
            # - There is a transform_fn and ids is not statically known to be 1-D.
            #   We must flatten in this case because transform_fn expects a flat
            #   tensor of embeddings.
            flat_ids = array_ops.reshape(ids, [-1])
            original_indices = math_ops.range(array_ops.size(flat_ids))

            # Create p_assignments and set new_ids depending on the strategy.
            if partition_strategy == "mod":
                p_assignments = flat_ids % np
                new_ids = flat_ids // np
            elif partition_strategy == "div":
                # Compute num_total_ids as the sum of dim-0 of params, then assign to
                # partitions based on a constant number of ids per partition. Optimize
                # if we already know the full shape statically.
                dim_0_size = params[0].get_shape()[0]
                for p in xrange(1, np):
                    dim_0_size += params[p].get_shape()[0]
                if dim_0_size.value:
                    num_total_ids = constant_op.constant(
                        dim_0_size.value, flat_ids.dtype)
                else:
                    dim_0_sizes = []
                    for p in xrange(np):
                        if params[p].get_shape()[0].value is not None:
                            dim_0_sizes.append(params[p].get_shape()[0].value)
                        else:
                            with ops.colocate_with(params[p]):
                                dim_0_sizes.append(
                                    array_ops.shape(params[p])[0])
                    num_total_ids = math_ops.reduce_sum(
                        math_ops.cast(array_ops.stack(dim_0_sizes),
                                      flat_ids.dtype))
                ids_per_partition = num_total_ids // np
                extras = num_total_ids % np

                p_assignments = math_ops.maximum(
                    flat_ids // (ids_per_partition + 1),
                    (flat_ids - extras) // ids_per_partition)

                # Emulate a conditional using a boolean indicator tensor
                new_ids = array_ops.where(
                    p_assignments < extras, flat_ids % (ids_per_partition + 1),
                    (flat_ids - extras) % ids_per_partition)
            else:
                raise ValueError("Unrecognized partition strategy: " +
                                 partition_strategy)

            # Cast partition assignments to int32 for use in dynamic_partition.
            # There really should not be more than 2^32 partitions.
            p_assignments = math_ops.cast(p_assignments, dtypes.int32)
            # Partition list of ids based on assignments into np separate lists
            gather_ids = data_flow_ops.dynamic_partition(
                new_ids, p_assignments, np)
            # Similarly, partition the original indices.
            pindices = data_flow_ops.dynamic_partition(original_indices,
                                                       p_assignments, np)
            # Do np separate lookups, finding embeddings for plist[p] in params[p]
            partitioned_result = []
            for p in xrange(np):
                pids = gather_ids[p]
                with ops.colocate_with(params[p]):
                    result = array_ops.gather(params[p], pids)
                    if transform_fn:
                        # If transform_fn is provided, the clip_by_norm precedes
                        # the transform and hence must be co-located. See below
                        # for the counterpart if transform_fn is not provided.
                        result = transform_fn(_clip(result, pids, max_norm))
                partitioned_result.append(result)
            # Stitch these back together
            ret = data_flow_ops.parallel_dynamic_stitch(pindices,
                                                        partitioned_result,
                                                        name=name)

            # Determine the static element shape.
            if transform_fn is None:
                element_shape_s = params[0].get_shape()[1:]
                for p in params[1:]:
                    element_shape_s = element_shape_s.merge_with(
                        p.get_shape()[1:])
            else:
                element_shape_s = ret.get_shape()[1:]

            # Compute the dynamic element shape.
            if element_shape_s.is_fully_defined():
                element_shape_d = element_shape_s
            elif transform_fn is None:
                # It's important that we compute params[0].shape on the right device
                # to avoid data motion.
                with ops.colocate_with(params[0]):
                    params_shape = array_ops.shape(params[0])
                element_shape_d = params_shape[1:]
            else:
                element_shape_d = array_ops.shape(ret)[1:]

            # Reshape to reverse the flattening of ids.
            ret = array_ops.reshape(
                ret,
                array_ops.concat([array_ops.shape(ids), element_shape_d], 0))

            # Normally the reshape is sufficient, but setting shape explicitly
            # teaches shape inference that params[1:].get_shape() matters
            # (in the case that transform_fn is None).
            ret.set_shape(ids.get_shape().concatenate(element_shape_s))
            if not transform_fn:
                # If transform_fn was provided, the clip_by_norm was done above.
                ret = _clip(ret, ids, max_norm)
            return ret
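A plain-Python sketch (hypothetical sizes) of how the two partition strategies above route a flat id to a shard and a within-shard id: `"mod"` interleaves ids round-robin, while `"div"` keeps contiguous ranges and gives the first `extras` shards one extra id.

def mod_strategy(flat_id, num_partitions):
    return flat_id % num_partitions, flat_id // num_partitions   # (partition, local id)

def div_strategy(flat_id, num_total_ids, num_partitions):
    ids_per_partition, extras = divmod(num_total_ids, num_partitions)
    p = max(flat_id // (ids_per_partition + 1),
            (flat_id - extras) // ids_per_partition)
    local = (flat_id % (ids_per_partition + 1) if p < extras
             else (flat_id - extras) % ids_per_partition)
    return p, local

# 10 ids over 3 partitions: "div" assigns ids 0-3, 4-6 and 7-9 to shards 0, 1, 2.
print([div_strategy(i, 10, 3)[0] for i in range(10)])   # [0, 0, 0, 0, 1, 1, 1, 2, 2, 2]
print([mod_strategy(i, 3)[0] for i in range(10)])       # [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]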
Example #43
def connected_components(images):
  """Labels the connected components in a batch of images.

  A component is a set of pixels in a single input image, which are all adjacent
  and all have the same non-zero value. The components are computed using a squared
  connectivity of one (all True entries are joined with their neighbors above,
  below, left, and right). Components across all images have consecutive ids 1
  through n. Components are labeled according to the first pixel of the
  component appearing in row-major order (lexicographic order by
  image_index_in_batch, row, col). Zero entries all have an output id of 0.

  This op is equivalent to `scipy.ndimage.measurements.label` on a 2D array
  with the default structuring element (which is the connectivity used here).

  Args:
    images: A 2D (H, W) or 3D (N, H, W) Tensor of boolean image(s).

  Returns:
    Components with the same shape as `images`. False entries in `images` have
    value 0, and all True entries map to a component id > 0.

  Raises:
    TypeError: if `images` is not 2D or 3D.
  """
  with ops.name_scope("connected_components"):
    image_or_images = ops.convert_to_tensor(images, name="images")
    if len(image_or_images.get_shape()) == 2:
      images = image_or_images[None, :, :]
    elif len(image_or_images.get_shape()) == 3:
      images = image_or_images
    else:
      raise TypeError(
          "images should have rank 2 (HW) or 3 (NHW). Static shape is %s" %
          image_or_images.get_shape())
    components = gen_image_ops.image_connected_components(images)

    # TODO(ringwalt): Component id renaming should be done in the op, to avoid
    # constructing multiple additional large tensors.
    components_flat = array_ops.reshape(components, [-1])
    unique_ids, id_index = array_ops.unique(components_flat)
    id_is_zero = array_ops.where(math_ops.equal(unique_ids, 0))[:, 0]
    # Map each nonzero id to consecutive values.
    nonzero_consecutive_ids = math_ops.range(
        array_ops.shape(unique_ids)[0] - array_ops.shape(id_is_zero)[0]) + 1

    def no_zero():
      # No need to insert a zero into the ids.
      return nonzero_consecutive_ids

    def has_zero():
      # Insert a zero in the consecutive ids where zero appears in unique_ids.
      # id_is_zero has length 1.
      zero_id_ind = math_ops.cast(id_is_zero[0], dtypes.int32)
      ids_before = nonzero_consecutive_ids[:zero_id_ind]
      ids_after = nonzero_consecutive_ids[zero_id_ind:]
      return array_ops.concat([ids_before, [0], ids_after], axis=0)

    new_ids = control_flow_ops.cond(
        math_ops.equal(array_ops.shape(id_is_zero)[0], 0), no_zero, has_zero)
    components = array_ops.reshape(
        array_ops.gather(new_ids, id_index), array_ops.shape(components))
    if len(image_or_images.get_shape()) == 2:
      return components[0, :, :]
    else:
      return components
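A hedged NumPy sketch of the id-compaction step above (note that `np.unique` sorts ids, whereas `array_ops.unique` keeps first-occurrence order, so only the zero-preserving remapping is illustrated): raw component ids are mapped to consecutive integers while background stays 0.

import numpy as np

raw = np.array([0, 7, 7, 0, 42, 9])                   # raw component ids per pixel
unique_ids, index = np.unique(raw, return_inverse=True)
new_ids = np.where(unique_ids == 0, 0,
                   np.cumsum(unique_ids != 0))        # 0 stays 0, others -> 1, 2, ...
print(new_ids[index])                                 # [0 1 1 0 3 2]
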
def _list_mle_loss(labels,
                   logits,
                   weights=None,
                   lambda_weight=None,
                   reduction=core_losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
                   name=None,
                   seed=None):
    """Computes the ListMLE loss [Xia et al. 2008] for a list.

  Given the labels of graded relevance l_i and the logits s_i, we calculate
  the ListMLE loss for the given list.

  The `lambda_weight` re-weights examples based on l_i and r_i.
  The recommended weighting scheme is the formulation presented in the
  "Position-Aware ListMLE" paper (Lan et al.) and available using
  create_p_list_mle_lambda_weight() factory function above.

  Args:
    labels: A `Tensor` of the same shape as `logits` representing graded
      relevance.
    logits: A `Tensor` with shape [batch_size, list_size]. Each value is the
      ranking score of the corresponding item.
    weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise
      weights, or a `Tensor` with shape [batch_size, list_size] for item-wise
      weights.
    lambda_weight: A `DCGLambdaWeight` instance.
    reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
      reduce training loss over batch.
    name: A string used as the name for this loss.
    seed: A randomization seed used when shuffling ground truth permutations.

  Returns:
    An op for the ListMLE loss.
  """
    with ops.name_scope(name, 'list_mle_loss', (labels, logits, weights)):
        is_label_valid = utils.is_label_valid(labels)
        # Reset the invalid labels to 0 and reset the invalid logits to a logit with
        # ~= 0 contribution.
        labels = array_ops.where(is_label_valid, labels,
                                 array_ops.zeros_like(labels))
        logits = array_ops.where(
            is_label_valid, logits,
            math_ops.log(_EPSILON) * array_ops.ones_like(logits))
        weights = 1.0 if weights is None else ops.convert_to_tensor(weights)
        weights = array_ops.squeeze(weights)

        # Shuffle labels and logits to add randomness to sort.
        shuffled_indices = utils.shuffle_valid_indices(is_label_valid, seed)
        shuffled_labels = array_ops.gather_nd(labels, shuffled_indices)
        shuffled_logits = array_ops.gather_nd(logits, shuffled_indices)

        sorted_labels, sorted_logits = utils.sort_by_scores(
            shuffled_labels, [shuffled_labels, shuffled_logits])

        raw_max = math_ops.reduce_max(sorted_logits, axis=1, keepdims=True)
        sorted_logits = sorted_logits - raw_max
        sums = math_ops.cumsum(math_ops.exp(sorted_logits),
                               axis=1,
                               reverse=True)
        sums = math_ops.log(sums) - sorted_logits

        if lambda_weight is not None and isinstance(lambda_weight,
                                                    ListMLELambdaWeight):
            sums *= lambda_weight.individual_weights(sorted_labels)

        negative_log_likelihood = math_ops.reduce_sum(sums, 1)

        return core_losses.compute_weighted_loss(negative_log_likelihood,
                                                 weights=weights,
                                                 reduction=reduction)
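A toy NumPy sketch (made-up scores; the random tie-breaking shuffle above is omitted) of the ListMLE negative log-likelihood computed by the code: sort items by relevance, then at each rank accumulate the log of the softmax denominator over the remaining suffix.

import numpy as np

labels = np.array([2.0, 0.0, 1.0])      # graded relevance
logits = np.array([0.5, 1.5, 0.2])      # ranking scores

order = np.argsort(-labels)             # ideal permutation, most relevant first
s = logits[order]
s = s - s.max()                         # same max subtraction as raw_max above
suffix_sums = np.cumsum(np.exp(s)[::-1])[::-1]
nll = np.sum(np.log(suffix_sums) - s)
print(nll)                              # approx. 3.04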
Example #45
def mean_pairwise_squared_error(predictions,
                                labels=None,
                                weights=1.0,
                                scope=None):
    """Adds a pairwise-errors-squared loss to the training procedure.

    Unlike `mean_squared_error`, which is a measure of the differences between
    corresponding elements of `predictions` and `labels`,
    `mean_pairwise_squared_error` is a measure of the differences between pairs of
    corresponding elements of `predictions` and `labels`.

    For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], then
    three pairs of differences are summed to compute the loss:
      loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

    Note that since the inputs are of size [batch_size, d0, ... dN], the
    corresponding pairs are computed within each batch sample but not across
    samples within a batch. For example, if `predictions` represents a batch of
    16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
    is drawn from each image, but not across images.

    `weights` acts as a coefficient for the loss. If a scalar is provided, then
    the loss is simply scaled by the given value. If `weights` is a tensor of size
    [batch_size], then the total loss for each sample of the batch is rescaled
    by the corresponding element in the `weights` vector.

    Args:
      predictions: The predicted outputs, a tensor of size [batch_size, d0, .. dN]
        where N+1 is the total number of dimensions in `predictions`.
      labels: The ground truth output tensor, whose shape must match the shape of
        the `predictions` tensor.
      weights: Coefficients for the loss: a scalar, a tensor of shape [batch_size],
        or a tensor whose shape matches `predictions`.
      scope: The scope for the operations performed in computing the loss.

    Returns:
      A scalar `Tensor` representing the loss value.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of `labels` or
        if the shape of `weights` is invalid.
    """
    with ops.name_scope(scope, "mean_pairwise_squared_error",
                        [predictions, labels, weights]) as scope:
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        predictions = math_ops.cast(predictions, dtypes.float32)
        labels = math_ops.cast(labels, dtypes.float32)
        weights = math_ops.cast(ops.convert_to_tensor(weights), dtypes.float32)

        diffs = math_ops.subtract(predictions, labels)

        # Need to verify here since the function doesn't use compute_weighted_loss
        if diffs.get_shape().ndims is None:
            raise ValueError("diffs.get_shape().ndims cannot be None")
        if weights.get_shape().ndims is None:
            raise ValueError("weights.get_shape().ndims cannot be None")

        axis = list(range(1, diffs.get_shape().ndims))

        sum_squares_diff_per_batch = math_ops.reduce_sum(
            math_ops.square(diffs), axis=axis)
        num_present_per_batch = _num_present(diffs, weights, per_batch=True)

        term1 = 2.0 * math_ops.div_no_nan(
            sum_squares_diff_per_batch, num_present_per_batch, name="value")

        sum_diff = math_ops.reduce_sum(diffs, axis=axis)
        term2 = 2.0 * math_ops.div_no_nan(
            math_ops.square(sum_diff),
            math_ops.square(num_present_per_batch),
            name="value")

        loss = _scale_losses(term1 - term2, weights)

        mean_loss = array_ops.where(
            math_ops.reduce_sum(num_present_per_batch) > 0,
            loss,
            array_ops.zeros_like(loss),
            name="value")
        add_loss(mean_loss)
        return mean_loss
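A quick NumPy check (toy errors) that the closed form used above, `term1 - term2 = 2*mean(d^2) - 2*mean(d)^2` with `d = predictions - labels`, equals the average squared difference of `d` over all ordered pairs within a sample.

import numpy as np

d = np.array([0.3, -1.2, 0.7, 0.1])                  # per-element errors
pairwise = np.mean((d[:, None] - d[None, :]) ** 2)   # all ordered pairs, incl. i == j
closed_form = 2 * np.mean(d ** 2) - 2 * np.mean(d) ** 2
print(np.isclose(pairwise, closed_form))             # True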
Example #46
 def copy_fn(cur_i, cand_i):
     with ops.colocate_with(cand_i):
         return array_ops.where(elements_finished, cur_i,
                                cand_i)
Example #47
    def interpolate_pr_auc(self):
        """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.

        https://www.biostat.wisc.edu/~page/rocpr.pdf

        Note here we derive & use a closed formula not present in the paper
        as follows:

          Precision = TP / (TP + FP) = TP / P

        Modeling all of TP (true positive), FP (false positive) and their sum
        P = TP + FP (predicted positive) as varying linearly within each interval
        [A, B] between successive thresholds, we get

          Precision slope = dTP / dP
                          = (TP_B - TP_A) / (P_B - P_A)
                          = (TP - TP_A) / (P - P_A)
          Precision = (TP_A + slope * (P - P_A)) / P

        The area within the interval is (slope / total_pos_weight) times

          int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
          int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}

        where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in

          int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)

        Bringing back the factor (slope / total_pos_weight) we'd put aside, we get

          slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight

        where dTP == TP_B - TP_A.

        Note that when P_A == 0 the above calculation simplifies into

          int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)

        which is really equivalent to imputing constant precision throughout the
        first bucket having >0 true positives.

        Returns:
          pr_auc: an approximation of the area under the P-R curve.
        """
        dtp = self.true_positives[:self.num_thresholds -
                                  1] - self.true_positives[1:]
        p = self.true_positives + self.false_positives
        dp = p[:self.num_thresholds - 1] - p[1:]

        prec_slope = math_ops.div_no_nan(
            dtp, math_ops.maximum(dp, 0), name='prec_slope')
        intercept = self.true_positives[1:] - \
            math_ops.multiply(prec_slope, p[1:])

        safe_p_ratio = array_ops.where(
            math_ops.logical_and(p[:self.num_thresholds - 1] > 0, p[1:] > 0),
            math_ops.div_no_nan(
                p[:self.num_thresholds - 1],
                math_ops.maximum(p[1:], 0),
                name='recall_relative_ratio'),
            array_ops.ones_like(p[1:]))

        return math_ops.reduce_sum(
            math_ops.div_no_nan(
                prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
                math_ops.maximum(self.true_positives[1:] + self.false_negatives[1:],
                                 0),
                name='pr_auc_increment'),
            name='interpolate_pr_auc')
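A toy NumPy sketch (made-up confusion counts, with a small epsilon standing in for `div_no_nan`) of the increment derived in the docstring: for each pair of adjacent thresholds it adds `slope * (dTP + intercept * log(P_B / P_A)) / total_positives`.

import numpy as np

tp = np.array([10.0, 6.0, 2.0, 0.0])    # true positives at increasing thresholds
fp = np.array([30.0, 8.0, 1.0, 0.0])    # false positives at increasing thresholds
fn = tp[0] - tp                         # false negatives at each threshold
p = tp + fp                             # predicted positives

dtp = tp[:-1] - tp[1:]
dp = p[:-1] - p[1:]
slope = dtp / np.maximum(dp, 1e-12)
intercept = tp[1:] - slope * p[1:]
safe_ratio = np.where((p[:-1] > 0) & (p[1:] > 0),
                      p[:-1] / np.maximum(p[1:], 1e-12), 1.0)
increments = slope * (dtp + intercept * np.log(safe_ratio)) / np.maximum(tp[1:] + fn[1:], 1e-12)
print(increments.sum())                 # approx. 0.45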
Example #48
 def dense_to_sparse_non_scalar(tensor):
     indices = array_ops.where(
         array_ops.ones_like(tensor, dtype=dtypes.bool))
     values = array_ops.gather_nd(tensor, indices)
     shape = array_ops.shape(tensor, out_type=dtypes.int64)
     return sparse_tensor.SparseTensorValue(indices, values, shape)
Example #49
def sigmoid_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
                                      labels=None, logits=None,
                                      name=None):
  """Computes sigmoid cross entropy given `logits`.

  Measures the probability error in discrete classification tasks in which each
  class is independent and not mutually exclusive.  For instance, one could
  perform multilabel classification where a picture can contain both an elephant
  and a dog at the same time.

  For brevity, let `x = logits`, `z = labels`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `labels` must have the same type and shape.

  Args:
    _sentinel: Used to prevent positional parameters. Internal, do not use.
    labels: A `Tensor` of the same type and shape as `logits`.
    logits: A `Tensor` of type `float32` or `float64`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  # pylint: disable=protected-access
  nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits",
                           _sentinel, labels, logits)
  # pylint: enable=protected-access

  with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().merge_with(logits.get_shape())
    except ValueError:
      raise ValueError("logits and labels must have the same shape (%s vs %s)"
                       % (logits.get_shape(), labels.get_shape()))

    # The logistic loss formula from above is
    #   x - x * z + log(1 + exp(-x))
    # For x < 0, a more numerically stable formula is
    #   -x * z + log(1 + exp(x))
    # Note that these two expressions can be combined into the following:
    #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
    # To allow computing gradients at zero, we define custom versions of max and
    # abs functions.
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    cond = (logits >= zeros)
    relu_logits = array_ops.where(cond, logits, zeros)
    neg_abs_logits = array_ops.where(cond, -logits, logits)
    return math_ops.add(relu_logits - logits * labels,
                        math_ops.log1p(math_ops.exp(neg_abs_logits)),
                        name=name)
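A NumPy check (toy logits and labels) that the stable form used above, `max(x, 0) - x*z + log(1 + exp(-|x|))`, matches the naive logistic loss from the first line of the derivation.

import numpy as np

x = np.array([-3.0, -0.5, 0.0, 2.0])   # logits
z = np.array([1.0, 0.0, 1.0, 0.0])     # labels

sigmoid = 1.0 / (1.0 + np.exp(-x))
naive = z * -np.log(sigmoid) + (1 - z) * -np.log(1 - sigmoid)
stable = np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))
print(np.allclose(naive, stable))      # True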
Example #50
def triplet_semihard_loss(labels, embeddings, margin=1.0):
    """Computes the triplet loss with semi-hard negative mining.

  The loss encourages the positive distances (between a pair of embeddings with
  the same labels) to be smaller than the minimum negative distance among
  negatives that are at least greater than the positive distance plus the
  margin constant (the semi-hard negatives) within the mini-batch. If no such
  negative exists, the largest negative distance is used instead.
  See: https://arxiv.org/abs/1503.03832.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    triplet_loss: tf.float32 scalar.
  """
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    batch_size = array_ops.size(labels)

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
        math_ops.greater(
            pdist_matrix_tile,
            array_ops.reshape(array_ops.transpose(pdist_matrix), [-1, 1])))
    mask_final = array_ops.reshape(
        math_ops.greater(
            math_ops.reduce_sum(math_ops.cast(mask, dtype=dtypes.float32),
                                1,
                                keepdims=True), 0.0), [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    semi_hard_negatives = array_ops.where(mask_final, negatives_outside,
                                          negatives_inside)

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(
                                       array_ops.ones([batch_size]))

    # In lifted-struct, the authors multiply by 0.5 for the upper triangular part;
    #   in semihard, all positive pairs except the diagonal are used.
    num_positives = math_ops.reduce_sum(mask_positives)

    triplet_loss = math_ops.truediv(math_ops.reduce_sum(
        math_ops.maximum(math_ops.multiply(loss_mat, mask_positives), 0.0)),
                                    num_positives,
                                    name='triplet_semihard_loss')

    return triplet_loss
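A tiny NumPy sketch (toy distances for one anchor-positive pair) of the semi-hard selection the masks above implement: among negatives farther away than the positive, take the closest one; if none exists, fall back to the farthest negative.

import numpy as np

d_ap = 1.0                                     # anchor-positive distance
d_an = np.array([0.4, 1.3, 2.5])               # anchor-negative distances

semi_hard = d_an[d_an > d_ap]
chosen = semi_hard.min() if semi_hard.size else d_an.max()
print(chosen)                                  # 1.3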
Example #51
def focal_loss(target_tensor,
               prediction_tensor,
               classes_num,
               gamma=2.,
               alpha=.25,
               e=0.1):
    # classes_num contains sample number of each classes
    '''
    prediction_tensor is the output tensor with shape [None, 100], where 100 is the number of classes
    target_tensor is the label tensor, same shape as prediction_tensor
    '''
    import tensorflow as tf
    from tensorflow.python.ops import array_ops
    from keras import backend as K

    #1# Get the focal loss without the balancing weight, as presented in function (4) of the paper.
    zeros = array_ops.zeros_like(prediction_tensor,
                                 dtype=prediction_tensor.dtype)
    one_minus_p = array_ops.where(tf.greater(target_tensor, zeros),
                                  target_tensor - prediction_tensor, zeros)
    FT = -1 * (one_minus_p**gamma) * tf.log(
        tf.clip_by_value(prediction_tensor, 1e-8, 1.0))

    #2# Get the balancing weight alpha.
    classes_weight = array_ops.zeros_like(prediction_tensor,
                                          dtype=prediction_tensor.dtype)

    total_num = float(sum(classes_num))
    classes_w_t1 = [total_num / ff for ff in classes_num]
    sum_ = sum(classes_w_t1)
    classes_w_t2 = [ff / sum_ for ff in classes_w_t1]  #scale
    classes_w_tensor = tf.convert_to_tensor(classes_w_t2,
                                            dtype=prediction_tensor.dtype)
    classes_weight += classes_w_tensor

    alpha = array_ops.where(tf.greater(target_tensor, zeros), classes_weight,
                            zeros)

    #3# get balanced focal loss
    balanced_fl = alpha * FT
    balanced_fl = tf.reduce_mean(balanced_fl)

    #4# Add another term to prevent overfitting.
    # reference : https://spaces.ac.cn/archives/4493
    nb_classes = len(classes_num)
    final_loss = (1 - e) * balanced_fl + e * K.categorical_crossentropy(
        K.ones_like(prediction_tensor) / nb_classes, prediction_tensor)
    # gp_loss=tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(prediction_tensor)/nb_classes, logits=prediction_tensor)
    # gp_loss=tf.reduce_mean(tf.reduce_sum(gp_loss, axis=1), name='loss')
    # final_loss = (1-e) * balanced_fl + e * gp_loss

    return final_loss


# def focal_loss(prediction_tensor, target_tensor, weights=None, alpha=0.25, gamma=2):
#     r"""Compute focal loss for predictions.
#         Multi-labels Focal loss formula:
#             FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
#                  where alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
#     Args:
#      prediction_tensor: A float tensor of shape [batch_size, num_anchors,
#         num_classes] representing the predicted logits for each class
#      target_tensor: A float tensor of shape [batch_size, num_anchors,
#         num_classes] representing one-hot encoded classification targets
#      weights: A float tensor of shape [batch_size, num_anchors]
#      alpha: A scalar tensor for focal loss alpha hyper-parameter
#      gamma: A scalar tensor for focal loss gamma hyper-parameter
#     Returns:
#         loss: A (scalar) tensor representing the value of the loss function
#     """
#     sigmoid_p = tf.nn.sigmoid(prediction_tensor)
#     zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
#
#     # For positive prediction, only need consider front part loss, back part is 0;
#     # target_tensor > zeros <=> z=1, so positive coefficient = z - p.
#     pos_p_sub = array_ops.where(target_tensor > zeros, target_tensor - sigmoid_p, zeros)
#
#     # For negative prediction, only need consider back part loss, front part is 0;
#     # target_tensor > zeros <=> z=1, so negative coefficient = 0.
#     neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
#     per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
#                           - (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
#     return tf.reduce_sum(per_entry_cross_ent)
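A tiny NumPy illustration (toy probabilities) of the modulating factor `(1 - p_t)**gamma` behind both variants above: well-classified examples are down-weighted far more aggressively than hard ones.

import numpy as np

p_t = np.array([0.95, 0.6, 0.1])       # predicted probability of the true class
gamma = 2.0
ce = -np.log(p_t)                      # plain cross-entropy
fl = (1.0 - p_t) ** gamma * ce         # unbalanced focal loss, function (4) of the paper
print(np.round(ce, 4), np.round(fl, 4))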
Example #52
def streaming_covariance(predictions,
                         labels,
                         weights=None,
                         metrics_collections=None,
                         updates_collections=None,
                         name=None):
    """Computes the unbiased sample covariance between `predictions` and `labels`.
  The `streaming_covariance` function creates four local variables,
  `comoment`, `mean_prediction`, `mean_label`, and `count`, which are used to
  compute the sample covariance between predictions and labels across multiple
  batches of data. The covariance is ultimately returned as an idempotent
  operation that simply divides `comoment` by `count` - 1. We use `count` - 1
  in order to get an unbiased estimate.
  The algorithm used for this online computation is described in
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
  Specifically, the formula used to combine two sample comoments is
  `C_AB = C_A + C_B + (E[x_A] - E[x_B]) * (E[y_A] - E[y_B]) * n_A * n_B / n_AB`
  The comoment for a single batch of data is simply
  `sum((x - E[x]) * (y - E[y]))`, optionally weighted.
  If `weights` is not None, then it is used to compute weighted comoments,
  means, and count. NOTE: these weights are treated as "frequency weights", as
  opposed to "reliability weights". See discussion of the difference on
  https://wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance
  To facilitate the computation of covariance across multiple batches of data,
  the function creates an `update_op` operation, which updates underlying
  variables and returns the updated covariance.
  Args:
    predictions: A `Tensor` of arbitrary size.
    labels: A `Tensor` of the same size as `predictions`.
    weights: Optional `Tensor` indicating the frequency with which an example is
      sampled. Rank must be 0, or the same rank as `labels`, and must be
      broadcastable to `labels` (i.e., all dimensions must be either `1`, or
      the same as the corresponding `labels` dimension).
    metrics_collections: An optional list of collections that the metric
      value variable should be added to.
    updates_collections: An optional list of collections that the metric update
      ops should be added to.
    name: An optional variable_scope name.
  Returns:
    covariance: A `Tensor` representing the current unbiased sample covariance,
      `comoment` / (`count` - 1).
    update_op: An operation that updates the local variables appropriately.
  Raises:
    ValueError: If labels and predictions are of different sizes or if either
      `metrics_collections` or `updates_collections` are not a list or tuple.
  """
    with variable_scope.variable_scope(name, 'covariance',
                                       (predictions, labels, weights)):
        predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
            predictions, labels, weights)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        count_ = metric_variable([], dtypes.float32, name='count')
        mean_prediction = metric_variable([],
                                          dtypes.float32,
                                          name='mean_prediction')
        mean_label = metric_variable([], dtypes.float32, name='mean_label')
        comoment = metric_variable(  # C_A in update equation
            [], dtypes.float32, name='comoment')

        if weights is None:
            batch_count = math_ops.to_float(
                array_ops.size(labels))  # n_B in eqn
            weighted_predictions = predictions
            weighted_labels = labels
        else:
            weights = weights_broadcast_ops.broadcast_weights(weights, labels)
            batch_count = math_ops.reduce_sum(weights)  # n_B in eqn
            weighted_predictions = math_ops.multiply(predictions, weights)
            weighted_labels = math_ops.multiply(labels, weights)

        update_count = state_ops.assign_add(count_, batch_count)  # n_AB in eqn
        prev_count = update_count - batch_count  # n_A in update equation

        # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount)
        # batch_mean_prediction is E[x_B] in the update equation
        batch_mean_prediction = _safe_div(
            math_ops.reduce_sum(weighted_predictions), batch_count,
            'batch_mean_prediction')
        delta_mean_prediction = _safe_div(
            (batch_mean_prediction - mean_prediction) * batch_count,
            update_count, 'delta_mean_prediction')
        update_mean_prediction = state_ops.assign_add(mean_prediction,
                                                      delta_mean_prediction)
        # prev_mean_prediction is E[x_A] in the update equation
        prev_mean_prediction = update_mean_prediction - delta_mean_prediction

        # batch_mean_label is E[y_B] in the update equation
        batch_mean_label = _safe_div(math_ops.reduce_sum(weighted_labels),
                                     batch_count, 'batch_mean_label')
        delta_mean_label = _safe_div(
            (batch_mean_label - mean_label) * batch_count, update_count,
            'delta_mean_label')
        update_mean_label = state_ops.assign_add(mean_label, delta_mean_label)
        # prev_mean_label is E[y_A] in the update equation
        prev_mean_label = update_mean_label - delta_mean_label

        unweighted_batch_coresiduals = ((predictions - batch_mean_prediction) *
                                        (labels - batch_mean_label))
        # batch_comoment is C_B in the update equation
        if weights is None:
            batch_comoment = math_ops.reduce_sum(unweighted_batch_coresiduals)
        else:
            batch_comoment = math_ops.reduce_sum(unweighted_batch_coresiduals *
                                                 weights)

        # View delta_comoment as = C_AB - C_A in the update equation above.
        # Since C_A is stored in a var, by how much do we need to increment that var
        # to make the var = C_AB?
        delta_comoment = (batch_comoment +
                          (prev_mean_prediction - batch_mean_prediction) *
                          (prev_mean_label - batch_mean_label) *
                          (prev_count * batch_count / update_count))
        update_comoment = state_ops.assign_add(comoment, delta_comoment)

        covariance = array_ops.where(math_ops.less_equal(count_, 1.),
                                     float('nan'),
                                     math_ops.truediv(comoment, count_ - 1),
                                     name='covariance')
        with ops.control_dependencies([update_comoment]):
            update_op = array_ops.where(math_ops.less_equal(count_, 1.),
                                        float('nan'),
                                        math_ops.truediv(comoment, count_ - 1),
                                        name='update_op')

    if metrics_collections:
        ops.add_to_collections(metrics_collections, covariance)

    if updates_collections:
        ops.add_to_collections(updates_collections, update_op)

    return covariance, update_op
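
# The assign_add updates above implement the standard chunk-merge identity for
# running co-moments: C_AB = C_A + C_B + (E[x_A]-E[x_B])*(E[y_A]-E[y_B])*n_A*n_B/n_AB.
# Below is a minimal NumPy sketch of that identity, for intuition only; the
# names (merge_comoment, c_a, ...) are illustrative and not part of the code above.
import numpy as np

def merge_comoment(c_a, mean_x_a, mean_y_a, n_a, x_b, y_b):
    """Merges batch B into the running co-moment statistics of chunk A."""
    n_b = x_b.size
    n_ab = n_a + n_b
    mean_x_b, mean_y_b = x_b.mean(), y_b.mean()
    c_b = np.sum((x_b - mean_x_b) * (y_b - mean_y_b))
    c_ab = c_a + c_b + (mean_x_a - mean_x_b) * (mean_y_a - mean_y_b) * n_a * n_b / n_ab
    mean_x_ab = mean_x_a + (mean_x_b - mean_x_a) * n_b / n_ab
    mean_y_ab = mean_y_a + (mean_y_b - mean_y_a) * n_b / n_ab
    return c_ab, mean_x_ab, mean_y_ab, n_ab

rng = np.random.default_rng(0)
x, y = rng.normal(size=100), rng.normal(size=100)
mx, my, n = x[:50].mean(), y[:50].mean(), 50
c = np.sum((x[:50] - mx) * (y[:50] - my))
c, mx, my, n = merge_comoment(c, mx, my, n, x[50:], y[50:])
# Two-chunk result matches the one-shot sample covariance (comoment / (n - 1)).
assert np.isclose(c / (n - 1), np.cov(x, y, ddof=1)[0, 1])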
Example #53
        def body(time, outputs_ta, state, inputs, finished, sequence_lengths,
                 bit_num, cur_interval):
            """Internal while_loop body.

      Args:
        time: scalar int32 tensor.
        outputs_ta: structure of TensorArray.
        state: (structure of) state tensors and TensorArrays.
        inputs: (structure of) input tensors.
        finished: bool tensor (keeping track of what's finished).
        sequence_lengths: int32 tensor (keeping track of time of finish).
        bit_num: int32 tensor (number of bits encoded at this step).
        cur_interval: float32 tensor of shape `(2,)` (the current interval
          maintained by the arithmetic-coding (AC) algorithm).

      Returns:
        `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
          next_sequence_lengths, bit_num, next_cur_interval)`.
      """
            (next_outputs, decoder_state, next_inputs,
             decoder_finished, num_bits_encoded, next_cur_interval) = \
                decoder.step(time, inputs, state, bit_num, cur_interval)
            next_finished = math_ops.logical_or(decoder_finished, finished)
            bit_num += num_bits_encoded  # if no_hidden_sign==True: no hide
            if maximum_iterations is not None:
                next_finished = math_ops.logical_or(
                    next_finished, time + 1 >= maximum_iterations)

            # Record the sequence length for entries that finish on this step.
            next_sequence_lengths = array_ops.where(
                math_ops.logical_and(math_ops.logical_not(finished),
                                     next_finished),
                array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
                sequence_lengths)

            nest.assert_same_structure(state, decoder_state)
            nest.assert_same_structure(outputs_ta, next_outputs)
            nest.assert_same_structure(inputs, next_inputs)
            nest.assert_same_structure(cur_interval, next_cur_interval)

            # Zero out output values past finish
            if impute_finished:
                emit = nest.map_structure(
                    lambda out, zero: array_ops.where(finished, zero, out),
                    next_outputs, zero_outputs)
            else:
                emit = next_outputs

            # Copy through states past finish
            def _maybe_copy_state(new, cur):
                # TensorArrays and scalar states get passed through.
                if isinstance(cur, tensor_array_ops.TensorArray):
                    pass_through = True
                else:
                    new.set_shape(cur.shape)
                    pass_through = (new.shape.ndims == 0)
                return new if pass_through else array_ops.where(
                    finished, cur, new)

            if impute_finished:
                next_state = nest.map_structure(_maybe_copy_state,
                                                decoder_state, state)
            else:
                next_state = decoder_state

            outputs_ta = nest.map_structure(
                lambda ta, out: ta.write(time, out), outputs_ta, emit)
            return (time + 1, outputs_ta, next_state, next_inputs,
                    next_finished, next_sequence_lengths, bit_num,
                    next_cur_interval)
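
# A minimal NumPy sketch of the masking pattern used above: finished sequences
# emit zeros (`impute_finished`) and record the step at which they first
# finished. Shapes and values here are illustrative only.
import numpy as np

batch, units, time = 4, 3, 5
finished = np.array([False, True, False, True])
next_finished = np.array([True, True, False, True])
next_outputs = np.arange(batch * units, dtype=np.float32).reshape(batch, units)
zero_outputs = np.zeros_like(next_outputs)

# emit = where(finished, zero, out): rows that already finished contribute zeros.
emit = np.where(finished[:, None], zero_outputs, next_outputs)

# Record time + 1 only for sequences that finished on this step.
sequence_lengths = np.array([0, 2, 0, 0])
newly_finished = np.logical_and(~finished, next_finished)
sequence_lengths = np.where(newly_finished, time + 1, sequence_lengths)  # [6, 2, 0, 0]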
Example #54
def batch_matrix_pow(matrices, powers):
  """Compute powers of matrices, e.g. A^3 = matmul(matmul(A, A), A).

  Uses exponentiation by squaring, with O(log(p)) matrix multiplications to
  compute A^p.

  Args:
    matrices: [batch size x N x N]
    powers: Which integer power to raise each matrix to [batch size]
  Returns:
    The matrices raised to their respective powers, same dimensions as the
    "matrices" argument.
  """

  def terminate_when_all_zero(current_argument, residual_powers, accumulator):
    del current_argument, accumulator  # not used for condition
    do_exit = math_ops.reduce_any(
        math_ops.greater(residual_powers, array_ops.ones_like(residual_powers)))
    return do_exit

  def do_iteration(current_argument, residual_powers, accumulator):
    """Compute one step of iterative exponentiation by squaring.

    The recursive form is:
      power(A, p) = { power(matmul(A, A), p / 2) for even p
                    { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p
      power(A, 0) = I

    The power(A, 0) = I case is handled by starting with accumulator set to the
    identity matrix; matrices with zero residual powers are passed through
    unchanged.

    Args:
      current_argument: On this step, what is the first argument (A^2..^2) to
          the (unrolled) recursive function? [batch size x N x N]
      residual_powers: On this step, what is the second argument (residual p)?
          [batch_size]
      accumulator: Accumulates the exterior multiplications from the odd
          powers (initially the identity matrix). [batch_size x N x N]
    Returns:
      Updated versions of each argument for one step of the unrolled
      computation. Does not change parts of the batch which have a residual
      power of zero.
    """
    is_even = math_ops.equal(residual_powers % 2,
                             array_ops.zeros(
                                 array_ops.shape(residual_powers),
                                 dtype=dtypes.int32))
    new_accumulator = array_ops.where(is_even, accumulator,
                                      math_ops.matmul(accumulator,
                                                      current_argument))
    new_argument = math_ops.matmul(current_argument, current_argument)
    do_update = math_ops.greater(residual_powers, 1)
    new_residual_powers = residual_powers - residual_powers % 2
    new_residual_powers //= 2
    # Stop updating if we've reached our base case; some batch elements may
    # finish sooner than others
    accumulator = array_ops.where(do_update, new_accumulator, accumulator)
    current_argument = array_ops.where(do_update, new_argument,
                                       current_argument)
    residual_powers = array_ops.where(do_update, new_residual_powers,
                                      residual_powers)
    return (current_argument, residual_powers, accumulator)

  matrices = ops.convert_to_tensor(matrices)
  powers = math_ops.cast(powers, dtype=dtypes.int32)
  ident = array_ops.expand_dims(
      array_ops.diag(
          array_ops.ones([array_ops.shape(matrices)[1]], dtype=matrices.dtype)),
      0)
  ident_tiled = array_ops.tile(ident, [array_ops.shape(matrices)[0], 1, 1])
  (final_argument,
   final_residual_power, final_accumulator) = control_flow_ops.while_loop(
       terminate_when_all_zero, do_iteration, [matrices, powers, ident_tiled])
  return array_ops.where(
      math_ops.equal(final_residual_power,
                     array_ops.zeros_like(
                         final_residual_power, dtype=dtypes.int32)),
      ident_tiled, math_ops.matmul(final_argument, final_accumulator))
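
# For reference, the same square-and-multiply recursion in plain NumPy, without
# the batched while_loop bookkeeping above. This is a sketch for a single
# matrix, not the library routine.
import numpy as np

def matrix_pow(a, p):
    """Computes a**p with O(log p) matrix multiplications."""
    result = np.eye(a.shape[0], dtype=a.dtype)  # power(A, 0) = I
    argument = a
    while p > 0:
        if p % 2 == 1:                  # odd residual power: fold one factor in
            result = result @ argument
        argument = argument @ argument  # square the argument
        p //= 2
    return result

a = np.array([[1., 1.], [0., 1.]])
assert np.allclose(matrix_pow(a, 5), np.linalg.matrix_power(a, 5))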
Example #55
def b(i, r):
  return i + 1, array_ops.where(math_ops.less(i, squarings),
                                math_ops.matmul(r, r), r)
Example #56
def kernel(target_log_prob_fn,
           current_state,
           step_size,
           num_leapfrog_steps,
           seed=None,
           current_target_log_prob=None,
           current_grads_target_log_prob=None,
           name=None):
    """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes that all
  leftmost dimensions of `current_state` index independent chain states (and are
  therefore updated independently). The output of `target_log_prob_fn()` should
  sum log-probabilities across all event dimensions. Slices along the rightmost
  dimensions may have different target distributions; for example,
  `current_state[0, :]` could have a different target distribution from
  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)

  #### Examples:

  ##### Simple chain with warm-up.

  ```python
  tfd = tf.contrib.distributions

  # Tuning acceptance rates:
  dtype = np.float32
  target_accept_rate = 0.631
  num_warmup_iter = 500
  num_chain_iter = 500

  x = tf.get_variable(name="x", initializer=dtype(1))
  step_size = tf.get_variable(name="step_size", initializer=dtype(1))

  target = tfd.Normal(loc=dtype(0), scale=dtype(1))

  next_x, other_results = hmc.kernel(
      target_log_prob_fn=target.log_prob,
      current_state=x,
      step_size=step_size,
      num_leapfrog_steps=3)[:4]

  x_update = x.assign(next_x)

  step_size_update = step_size.assign_add(
      step_size * tf.where(
          tf.exp(tf.minimum(other_results.log_accept_ratio, 0.)) >
              target_accept_rate,
          0.01, -0.01))

  warmup = tf.group([x_update, step_size_update])

  tf.global_variables_initializer().run()

  sess.graph.finalize()  # No more graph building.

  # Warm up the sampler and adapt the step size
  for _ in xrange(num_warmup_iter):
    sess.run(warmup)

  # Collect samples without adapting step size
  samples = np.zeros([num_chain_iter])
  for i in xrange(num_chain_iter):
    _, x_, target_log_prob_, grad_ = sess.run([
        x_update,
        x,
        other_results.target_log_prob,
        other_results.grads_target_log_prob])
    samples[i] = x_

  print(samples.mean(), samples.std())
  ```

  ##### Sample from more complicated posterior.

  I.e.,

  ```none
    W ~ MVN(loc=0, scale=sigma * eye(dims))
    for i=1...num_samples:
      X[i] ~ MVN(loc=0, scale=eye(dims))
      eps[i] ~ Normal(loc=0, scale=1)
      Y[i] = X[i].T * W + eps[i]
  ```

  ```python
  tfd = tf.contrib.distributions

  def make_training_data(num_samples, dims, sigma):
    dt = np.asarray(sigma).dtype
    zeros = tf.zeros(dims, dtype=dt)
    x = tfd.MultivariateNormalDiag(
        loc=zeros).sample(num_samples, seed=1)
    w = tfd.MultivariateNormalDiag(
        loc=zeros,
        scale_identity_multiplier=sigma).sample(seed=2)
    noise = tfd.Normal(
        loc=dt(0),
        scale=dt(1)).sample(num_samples, seed=3)
    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
    return y, x, w

  def make_prior(sigma, dims):
    # p(w | sigma)
    return tfd.MultivariateNormalDiag(
        loc=tf.zeros([dims], dtype=sigma.dtype),
        scale_identity_multiplier=sigma)

  def make_likelihood(x, w):
    # p(y | x, w)
    return tfd.MultivariateNormalDiag(
        loc=tf.tensordot(x, w, axes=[[1], [0]]))

  # Setup assumptions.
  dtype = np.float32
  num_samples = 150
  dims = 10
  num_iters = int(5e3)

  true_sigma = dtype(0.5)
  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)

  # Estimate of `log(true_sigma)`.
  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
  sigma = tf.exp(log_sigma)

  # State of the Markov chain.
  weights = tf.get_variable(
      name="weights",
      initializer=np.random.randn(dims).astype(dtype))

  prior = make_prior(sigma, dims)

  def joint_log_prob_fn(w):
    # f(w) = log p(w, y | x)
    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)

  weights_update = weights.assign(
      hmc.kernel(target_log_prob_fn=joint_log_prob_fn,
                 current_state=weights,
                 step_size=0.1,
                 num_leapfrog_steps=5)[0])

  with tf.control_dependencies([weights_update]):
    loss = -prior.log_prob(weights)

  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])

  tf.global_variables_initializer().run()

  sess.graph.finalize()  # No more graph building.

  sigma_history = np.zeros(num_iters, dtype)
  weights_history = np.zeros([num_iters, dims], dtype)

  for i in xrange(num_iters):
    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
    weights_history[i, :] = weights_
    sigma_history[i] = sigma_

  true_weights_ = sess.run(true_weights)

  # Should converge to something close to true_sigma.
  plt.plot(sigma_history);
  plt.ylabel("sigma");
  plt.xlabel("iteration");
  ```

  Args:
    target_log_prob_fn: Python callable which takes an argument like
      `current_state` (or `*current_state` if it's a list) and returns its
      (possibly unnormalized) log-density under the target distribution.
    current_state: `Tensor` or Python `list` of `Tensor`s representing the
      current state(s) of the Markov chain(s). The first `r` dimensions index
      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
      for the leapfrog integrator. Must broadcast with the shape of
      `current_state`. Larger step sizes lead to faster progress, but too-large
      step sizes make rejection exponentially more likely. When possible, it's
      often helpful to match per-variable step sizes to the standard deviations
      of the target distribution in each variable.
    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
      for. Total progress per HMC step is roughly proportional to `step_size *
      num_leapfrog_steps`.
    seed: Python integer to seed the random number generator.
    current_target_log_prob: (Optional) `Tensor` representing the value of
      `target_log_prob_fn` at the `current_state`. The only reason to
      specify this argument is to reduce TF graph size.
      Default value: `None` (i.e., compute as needed).
    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
      representing gradient of `current_target_log_prob` at the `current_state`
      and wrt the `current_state`. Must have same shape as `current_state`. The
      only reason to specify this argument is to reduce TF graph size.
      Default value: `None` (i.e., compute as needed).
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., "hmc_kernel").

  Returns:
    next_state: Tensor or Python list of `Tensor`s representing the state(s)
      of the Markov chain(s) at each result step. Has same shape as
      `current_state`.
    kernel_results: `collections.namedtuple` of internal calculations used to
      advance the chain.

  Raises:
    ValueError: if there isn't one `step_size` or a list with same length as
      `current_state`.
  """
    with ops.name_scope(name, "hmc_kernel", [
            current_state, step_size, num_leapfrog_steps, seed,
            current_target_log_prob, current_grads_target_log_prob
    ]):
        with ops.name_scope("initialize"):
            [
                current_state_parts, step_sizes, current_target_log_prob,
                current_grads_target_log_prob
            ] = _prepare_args(target_log_prob_fn,
                              current_state,
                              step_size,
                              current_target_log_prob,
                              current_grads_target_log_prob,
                              maybe_expand=True)
            independent_chain_ndims = distributions_util.prefer_static_rank(
                current_target_log_prob)
            current_momentums = []
            for s in current_state_parts:
                current_momentums.append(
                    random_ops.random_normal(shape=array_ops.shape(s),
                                             dtype=s.dtype.base_dtype,
                                             seed=seed))
                seed = distributions_util.gen_new_seed(
                    seed, salt="hmc_kernel_momentums")

            num_leapfrog_steps = ops.convert_to_tensor(
                num_leapfrog_steps,
                dtype=dtypes.int32,
                name="num_leapfrog_steps")
        [
            proposed_momentums,
            proposed_state_parts,
            proposed_target_log_prob,
            proposed_grads_target_log_prob,
        ] = _leapfrog_integrator(current_momentums, target_log_prob_fn,
                                 current_state_parts, step_sizes,
                                 num_leapfrog_steps, current_target_log_prob,
                                 current_grads_target_log_prob)

        energy_change = _compute_energy_change(current_target_log_prob,
                                               current_momentums,
                                               proposed_target_log_prob,
                                               proposed_momentums,
                                               independent_chain_ndims)
        log_accept_ratio = -energy_change

        # u < exp(log_accept_ratio),  where u~Uniform[0,1)
        # ==> log(u) < log_accept_ratio
        random_value = random_ops.random_uniform(
            shape=array_ops.shape(energy_change),
            dtype=energy_change.dtype,
            seed=seed)
        random_negative = math_ops.log(random_value)
        is_accepted = random_negative < log_accept_ratio

        accepted_target_log_prob = array_ops.where(is_accepted,
                                                   proposed_target_log_prob,
                                                   current_target_log_prob)

        next_state_parts = [
            _choose(is_accepted, proposed_state_part, current_state_part,
                    independent_chain_ndims)
            for current_state_part, proposed_state_part in zip(
                current_state_parts, proposed_state_parts)
        ]

        accepted_grads_target_log_prob = [
            _choose(is_accepted, proposed_grad, grad, independent_chain_ndims)
            for proposed_grad, grad in zip(proposed_grads_target_log_prob,
                                           current_grads_target_log_prob)
        ]

        maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
        return [
            maybe_flatten(next_state_parts),
            KernelResults(
                log_accept_ratio=log_accept_ratio,
                current_grads_target_log_prob=accepted_grads_target_log_prob,
                current_target_log_prob=accepted_target_log_prob,
                is_accepted=is_accepted,
                proposed_grads_target_log_prob=proposed_grads_target_log_prob,
                proposed_state=maybe_flatten(proposed_state_parts),
                proposed_target_log_prob=proposed_target_log_prob,
            ),
        ]
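
# The accept/reject step above is the standard Metropolis correction. A minimal
# NumPy sketch of the same rule, vectorized over chains; all names and values
# here are illustrative, not taken from the kernel above.
import numpy as np

rng = np.random.default_rng(42)
num_chains = 8
current_state = rng.normal(size=num_chains)
proposed_state = current_state + rng.normal(scale=0.5, size=num_chains)

# Pretend energy changes from a leapfrog proposal.
energy_change = rng.normal(scale=0.1, size=num_chains)
log_accept_ratio = -energy_change

# Accept when log(u) < log_accept_ratio, with u ~ Uniform[0, 1).
is_accepted = np.log(rng.uniform(size=num_chains)) < log_accept_ratio
next_state = np.where(is_accepted, proposed_state, current_state)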
Example #57
    def training_graph(self,
                       input_data,
                       input_labels,
                       data_spec=None,
                       epoch=None,
                       **tree_kwargs):
        """Constructs a TF graph for training a random forest.

    Args:
      input_data: A tensor or SparseTensor or placeholder for input data.
      input_labels: A tensor or placeholder for labels associated with
        input_data.
      data_spec: A list of tf.dtype values specifying the original types of
        each column.
      epoch: A tensor or placeholder for the epoch the training data comes from.
      **tree_kwargs: Keyword arguments passed to each tree's training_graph.

    Returns:
      The last op in the random forest training graph.
    """
        data_spec = [constants.DATA_FLOAT] if data_spec is None else data_spec
        tree_graphs = []
        for i in range(self.params.num_trees):
            with ops.device(self.device_assigner.get_device(i)):
                seed = self.params.base_random_seed
                if seed != 0:
                    seed += i
                # If using bagging, randomly select some of the input.
                tree_data = input_data
                tree_labels = input_labels
                if self.params.bagging_fraction < 1.0:
                    # TODO(thomaswc): This does sampling without replacement.  Consider
                    # also allowing sampling with replacement as an option.
                    batch_size = array_ops.slice(array_ops.shape(input_data),
                                                 [0], [1])
                    r = random_ops.random_uniform(batch_size, seed=seed)
                    mask = math_ops.less(
                        r,
                        array_ops.ones_like(r) * self.params.bagging_fraction)
                    gather_indices = array_ops.squeeze(array_ops.where(mask),
                                                       squeeze_dims=[1])
                    # TODO(thomaswc): Calculate out-of-bag data and labels, and store
                    # them for use in calculating statistics later.
                    tree_data = array_ops.gather(input_data, gather_indices)
                    tree_labels = array_ops.gather(input_labels,
                                                   gather_indices)
                if self.params.bagged_features:
                    tree_data = self._bag_features(i, tree_data)

                initialization = self.trees[i].tree_initialization()

                with ops.control_dependencies([initialization]):
                    tree_graphs.append(self.trees[i].training_graph(
                        tree_data,
                        tree_labels,
                        seed,
                        data_spec=data_spec,
                        epoch=([0] if epoch is None else epoch),
                        **tree_kwargs))

        return control_flow_ops.group(*tree_graphs, name='train')
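
# The bagging branch above keeps each input row independently with probability
# `bagging_fraction` (sampling without replacement) and gathers the selected
# rows. A small NumPy sketch of that mask-and-gather pattern; shapes and names
# here are illustrative.
import numpy as np

rng = np.random.default_rng(0)
input_data = rng.normal(size=(10, 4))
input_labels = rng.integers(0, 2, size=10)
bagging_fraction = 0.6

mask = rng.uniform(size=input_data.shape[0]) < bagging_fraction
gather_indices = np.flatnonzero(mask)       # analogous to squeeze(where(mask))
tree_data = input_data[gather_indices]      # analogous to gather(input_data, ...)
tree_labels = input_labels[gather_indices]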
Example #58
def rotate_transpose(x, shift, name="rotate_transpose"):
    """Circularly moves dims left or right.

  Effectively identical to:

  ```python
  numpy.transpose(x, numpy.roll(numpy.arange(len(x.shape)), shift))
  ```

  When `validate_args=False` additional graph-runtime checks are
  performed. These checks entail moving data from GPU to CPU.

  Example:

    ```python
    x = ...  # Tensor of shape [1, 2, 3, 4].
    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
    rotate_transpose(x, 7) == rotate_transpose(x, 3)
    rotate_transpose(x, -7) == rotate_transpose(x, -3)
    ```

  Args:
    x: `Tensor`.
    shift: `Tensor`. Number of dimensions to transpose left (shift<0) or
      transpose right (shift>0).
    name: Python `str`. The name to give this op.

  Returns:
    rotated_x: Input `Tensor` with dimensions circularly rotated by shift.

  Raises:
    TypeError: if shift is not integer type.
  """
    with ops.name_scope(name, values=[x, shift]):
        x = ops.convert_to_tensor(x, name="x")
        shift = ops.convert_to_tensor(shift, name="shift")
        # We do not assign back to preserve constant-ness.
        check_ops.assert_integer(shift)
        shift_value_static = tensor_util.constant_value(shift)
        ndims = x.get_shape().ndims
        if ndims is not None and shift_value_static is not None:
            if ndims < 2: return x
            shift_value_static = np.sign(shift_value_static) * (
                abs(shift_value_static) % ndims)
            if shift_value_static == 0: return x
            perm = np.roll(np.arange(ndims), shift_value_static)
            return array_ops.transpose(x, perm=perm)
        else:
            # Consider if we always had a positive shift, and some specified
            # direction.
            # When shifting left we want the new array:
            #   last(x, n-shift) + first(x, shift)
            # and if shifting right then we want:
            #   last(x, shift) + first(x, n-shift)
            # Observe that last(a) == slice(a, n) and first(a) == slice(0, a).
            # Also, we can encode direction and shift as one: direction * shift.
            # Combining these facts, we have:
            #   a = cond(shift<0, -shift, n-shift)
            #   last(x, n-a) + first(x, a) == x[a:n] + x[0:a]
            # Finally, we transform shift by modulo length so it can be specified
            # independently from the array upon which it operates (like python).
            ndims = array_ops.rank(x)
            shift = array_ops.where(math_ops.less(shift, 0),
                                    math_ops.mod(-shift, ndims),
                                    ndims - math_ops.mod(shift, ndims))
            first = math_ops.range(0, shift)
            last = math_ops.range(shift, ndims)
            perm = array_ops.concat([last, first], 0)
            return array_ops.transpose(x, perm=perm)
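
# A quick NumPy check of the documented equivalence; a sketch for intuition,
# not part of the library code.
import numpy as np

def np_rotate_transpose(x, shift):
    """NumPy analogue: circularly rotate the dimension order by `shift`."""
    return np.transpose(x, np.roll(np.arange(x.ndim), shift))

x = np.zeros([1, 2, 3, 4])
assert np_rotate_transpose(x, -1).shape == (2, 3, 4, 1)
assert np_rotate_transpose(x, 1).shape == (4, 1, 2, 3)
assert np_rotate_transpose(x, 7).shape == np_rotate_transpose(x, 3).shape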
Example #59
    def call(self, inputs, states, edge_types, cell_mask, training=True):  # inputs: batch_size*embedding_dim, states:4*batch_size*embedding_dim, cell_mask: batch_size*recurrent_size
        batch_size = inputs.shape[0]
        state_size = len(states)
        if state_size > self.recurrent_size:
            raise ValueError("length of states exceeds recurrent_size.")
        if self.use_bias:
            unstacked_biases = array_ops.unstack(self.bias)  # unstacked_biases: (recurrent_size+1)*embedding_dim
            input_bias, recurrent_bias = unstacked_biases[0], unstacked_biases[1:]  # input_bias: (3*embedding_dim), recurrent_bias: recurrent_size*(3*embedding_dim)

        matrix_x = K.dot(inputs, self.kernel)  # matrix_x: batch_size*(3*embedding_dim)
        if self.use_bias:
            matrix_x = K.bias_add(matrix_x, input_bias)

        x_z = matrix_x[:, :self.units]  # x_z: batch_size*embedding_dim
        x_r = matrix_x[:, self.units: 2 * self.units]  # x_r: batch_size*embedding_dim
        x_h = matrix_x[:, 2 * self.units:]  # x_h: batch_size*embedding_dim

        def _expand_mask(mask_t, input_t, fixed_dim=1):  # mask_t: batch_size*1, input_t: batch_size*embedding_dim
            assert not nest.is_sequence(mask_t)
            assert not nest.is_sequence(input_t)
            rank_diff = len(input_t.shape) - len(mask_t.shape)  # rank_diff: 0
            for _ in range(rank_diff):
                mask_t = array_ops.expand_dims(mask_t, -1)
            multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:]  # multiples: [1, embedding_dim]
            return array_ops.tile(mask_t, multiples)
        accumulate_h = array_ops.zeros([batch_size, self.units])  # accumulate_h: batch_size*embedding_dim
        accumulate_z_h = array_ops.zeros([batch_size, self.units])  # accumulate_z_h: batch_size*embedding_dim
        accumulate_z = array_ops.zeros([batch_size, self.units])  # accumulate_z: batch_size*embedding_dim
        loop = 1 if args['ablationD'] else self.recurrent_size

        z_list = []
        h_list = []
        for k in range(loop):
            # edge embedding
            edge_embed = self.edge_embeddings(edge_types[:, k])  # edge_embed: batch_size*embedding
            # mask
            tiled_mask_t = _expand_mask(cell_mask[:, k], edge_embed)  # tiled_mask_t: batch_size*embedding_dim
            edge_embed = array_ops.where(tiled_mask_t, edge_embed, array_ops.ones_like(edge_embed))  # edge_embed: batch_size*embedding_dim
            state = states[k]  # state: batch_size*embedding_dim
            h_list.append(state)

            matrix_inner = K.dot(state, self.recurrent_kernel[k])  # matrix_inner: batch_size*(3*embedding_dim), states[k]: batch_size*embedding_dim
            if self.use_bias:
                matrix_inner = K.bias_add(matrix_inner, recurrent_bias[k])

            recurrent_z = matrix_inner[:, :self.units]  # recurrent_z: batch_size*embedding_dim
            recurrent_r = matrix_inner[:, self.units: 2 * self.units]  # recurrent_r: batch_size*embedding_dim

            # add for softmax attention
            z_list.append(recurrent_z)

            z = self.recurrent_activation(x_z + recurrent_z)  # z: batch_size*embedding_dim
            r = self.recurrent_activation(x_r + recurrent_r)  # r: batch_size*embedding_dim

            # comment for sum_after
            recurrent_h = r * matrix_inner[:, 2 * self.units:]  # recurrent_h: batch_size*embedding_dim
            recurrent_h = array_ops.where(tiled_mask_t, recurrent_h, array_ops.zeros_like(recurrent_h))  # recurrent_h: batch_size*embedding_dim
            accumulate_h = accumulate_h + recurrent_h  # accumulate_h: batch_size*embedding_dim

        hh = self.activation(x_h + accumulate_h / loop)  # hh: batch_size*embedding_dim
        h_list.append(hh)  # h_list: input_hidden without linear
        z_list.append(hh)  # z_list: input_hidden after linear

        hidden_bank = tf.transpose(tf.stack(z_list, axis=0), [1, 0, 2])  # hidden_bank: batch_size * (recurrent_size + 1) * embedding_dim
        x_z_temp = tf.tile(tf.expand_dims(x_z, axis=1), [1, hidden_bank.shape[1], 1])  # x_z_temp = batch_size * (recurrent_size + 1) * embedding_dim
        prob_logits = tf.matmul(tf.tile(tf.expand_dims(tf.transpose(self.v, [1, 0]), axis=0), [batch_size, 1, 1]), tf.transpose(self.activation(x_z_temp + hidden_bank), [0, 2, 1]))
        prob_logits = tf.squeeze(tf.transpose(prob_logits, [0, 2, 1]), axis=2)
        mask_list = []
        cell_mask_slices = tf.split(cell_mask, num_or_size_splits=cell_mask.shape[1], axis=1)
        for tensor in cell_mask_slices:
            mask_list.append(tensor)
        hh_mask = tf.ones([batch_size, 1], dtype=tf.bool)  # hh_mask: batch_size * 1
        mask_list.append(hh_mask)
        new_mask = tf.squeeze(tf.stack(mask_list, axis=1), axis=2)  # new_mask: batch_size * (recurrent_size + 1)
        prob_logits_temp = array_ops.where(new_mask, prob_logits, -np.inf * array_ops.ones_like(prob_logits))
        prob_soft = self.softmax(prob_logits_temp)  # prob_soft: batch_size * (recurrent_size + 1)
        prob_soft_temp = tf.tile(tf.expand_dims(prob_soft, axis=2), [1, 1, hidden_bank.shape[2]])  # prob_soft_temp: batch_size * (recurrent_size + 1) * embedding_dim
        output_hidden_bank = tf.transpose(tf.stack(h_list, axis=0), [1, 0, 2])  # output_hidden_bank: batch_size * (recurrent_size + 1) * embedding_dim
        h = tf.reduce_sum((output_hidden_bank * prob_soft_temp), axis=1)  # h: batch_size * embedding_dim

        return h, [h]
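
# The attention step above masks invalid slots with -inf before the softmax so
# they receive zero probability. A minimal NumPy sketch of that masked-softmax
# pattern (standalone, not tied to the cell's variables):
import numpy as np

def masked_softmax(logits, mask):
    """Softmax over the last axis, assigning zero probability where mask is False."""
    masked_logits = np.where(mask, logits, -np.inf)
    shifted = masked_logits - masked_logits.max(axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)

logits = np.array([[1.0, 2.0, 3.0],
                   [0.5, 0.5, 0.5]])
mask = np.array([[True, False, True],
                 [True, True, False]])
probs = masked_softmax(logits, mask)  # masked slots get exactly zero weight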
Example #60
def mean_pairwise_squared_error(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES):
  """Adds a pairwise-errors-squared loss to the training procedure.

  Unlike `mean_squared_error`, which is a measure of the differences between
  corresponding elements of `predictions` and `labels`,
  `mean_pairwise_squared_error` is a measure of the differences between pairs of
  corresponding elements of `predictions` and `labels`.

  For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], the three
  pairs of differences are summed to compute the loss:
    loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

  Note that since the inputs are of shape `[batch_size, d0, ... dN]`, the
  corresponding pairs are computed within each batch sample but not across
  samples within a batch. For example, if `predictions` represents a batch of
  16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
  is drawn from each image, but not across images.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector.

  Args:
    labels: The ground truth output tensor, whose shape must match the shape of
      `predictions`.
    predictions: The predicted outputs, a tensor of size
      `[batch_size, d0, .. dN]` where N+1 is the total number of dimensions in
      `predictions`.
    weights: Coefficients for the loss: a scalar, a tensor of shape
      `[batch_size]`, or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.

  Returns:
    A scalar `Tensor` that returns the weighted loss.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "mean_pairwise_squared_error",
                      (predictions, labels, weights)) as scope:
    weights = math_ops.cast(weights, dtype=dtypes.float32)
    labels = math_ops.cast(labels, dtype=dtypes.float32)
    with ops.control_dependencies((
        weights_broadcast_ops.assert_broadcastable(weights, labels),)):
      predictions = math_ops.cast(predictions, dtype=dtypes.float32)
      predictions.get_shape().assert_is_compatible_with(labels.get_shape())

      diffs = math_ops.subtract(predictions, labels)

      axis = math_ops.range(1, array_ops.rank(diffs))

      sum_squares_diff_per_batch = math_ops.reduce_sum(
          math_ops.square(diffs), axis=axis, keepdims=True)
      num_present_per_batch = _num_present(diffs, weights, per_batch=True)

      term1 = 2.0 * math_ops.div_no_nan(
          sum_squares_diff_per_batch,
          math_ops.maximum(num_present_per_batch - 1, 0),
          name="value")

      sum_diff = math_ops.reduce_sum(diffs, axis=axis, keepdims=True)
      term2 = 2.0 * math_ops.div_no_nan(
          math_ops.square(sum_diff),
          math_ops.maximum(
              math_ops.multiply(num_present_per_batch,
                                num_present_per_batch - 1), 0),
          name="value")

      weighted_losses = math_ops.multiply(term1 - term2, weights)
      loss = math_ops.reduce_sum(weighted_losses)

      mean_loss = array_ops.where(
          math_ops.reduce_sum(num_present_per_batch) > 0,
          loss,
          array_ops.zeros_like(loss),
          name="value")
      util.add_loss(mean_loss, loss_collection)
      return mean_loss
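
# term1 - term2 above evaluates the pairwise loss in closed form, using the
# identity sum_{i<j} (d_i - d_j)^2 = n * sum_i d_i^2 - (sum_i d_i)^2, so the
# O(n^2) set of pairs is never materialized. A small unweighted NumPy check of
# that identity; the function names here are illustrative.
import numpy as np

def mean_pairwise_sq_err_bruteforce(labels, predictions):
    """Averages ((d_i - d_j))^2 over unordered pairs, with d = predictions - labels."""
    d = predictions - labels
    n = d.size
    pairs = [(d[i] - d[j]) ** 2 for i in range(n) for j in range(i + 1, n)]
    return np.mean(pairs)

def mean_pairwise_sq_err_closed_form(labels, predictions):
    """Same quantity via the term1 - term2 decomposition used above."""
    d = predictions - labels
    n = d.size
    term1 = 2.0 * np.sum(d ** 2) / (n - 1)
    term2 = 2.0 * np.sum(d) ** 2 / (n * (n - 1))
    return term1 - term2

labels = np.array([1.0, 2.0, 4.0])
predictions = np.array([1.5, 1.0, 5.0])
assert np.isclose(mean_pairwise_sq_err_bruteforce(labels, predictions),
                  mean_pairwise_sq_err_closed_form(labels, predictions))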