def train(self, sentences):
    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
    tokens_sparse = tf.sparse.SparseTensor(
        indices=token_ids, values=token_values, dense_shape=token_dense_shape)
    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")

    sparse_lookup_ids = tf.sparse.SparseTensor(
        indices=tokens_sparse.indices,
        values=self._words_to_indices(tokens_sparse.values),
        dense_shape=tokens_sparse.dense_shape)
    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)

    # Targets are the next word for each word of the sentence.
    tokens_ids_seq = lookup_ids[:, 0:-1]
    tokens_ids_target = lookup_ids[:, 1:]

    tokens_prefix = tokens[:, 0:-1]

    # Mask determining which positions we care about for a loss: all positions
    # that have a valid non-terminal token.
    mask = tf.logical_and(
        tf.logical_not(tf.equal(tokens_prefix, "")),
        tf.logical_not(tf.equal(tokens_prefix, "<E>")))

    input_mask = tf.cast(mask, tf.int32)

    with tf.GradientTape() as t:
      sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
                                                   tokens_ids_seq)

      lstm_initial_state = self._lstm_cell.get_initial_state(
          sentence_embeddings)

      lstm_output = self._rnn_layer(
          inputs=sentence_embeddings, initial_state=lstm_initial_state)

      # Flatten the [batch, time] LSTM outputs into a single batch of vectors.
      lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])

      logits = self._logit_layer(lstm_output)

      targets = tf.reshape(tokens_ids_target, [-1])
      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=logits)

      # Final loss is the mean loss for all token losses.
      final_loss = tf.math.divide(
          tf.reduce_sum(tf.multiply(losses, weights)),
          tf.reduce_sum(weights),
          name="final_loss")

    watched = t.watched_variables()
    gradients = t.gradient(final_loss, watched)

    for w, g in zip(watched, gradients):
      w.assign_sub(g)

    return final_loss
Example #2
    def validate(self, sentences):
        tokens, lookup_ids = self._tokens_to_lookup_ids(sentences)
        # Targets are the next word for each word of the sentence.
        tokens_ids_seq = lookup_ids[:, 0:-1]
        tokens_ids_target = lookup_ids[:, 1:]
        tokens_prefix = tokens[:, 0:-1]

        # Mask determining which positions we care about for a loss: all positions
        # that have a valid non-terminal token.
        mask = tf.logical_and(tf.logical_not(tf.equal(tokens_prefix, "")),
                              tf.logical_not(tf.equal(tokens_prefix, "<E>")))

        input_mask = tf.cast(mask, tf.int32)

        lstm_output = self.model(tokens_ids_seq)
        lstm_output = tf.reshape(lstm_output, [-1, self._state_size])
        logits = self._logit_layer(lstm_output)

        targets = tf.reshape(tokens_ids_target, [-1])
        weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                                logits=logits)

        # Final loss is the mean loss for all token losses.
        final_loss = tf.math.divide(tf.reduce_sum(tf.multiply(losses,
                                                              weights)),
                                    tf.reduce_sum(weights),
                                    name="final_validation_loss")

        return final_loss
Example #3
    def _filter(graph: jraph.GraphsTuple) -> tf.Tensor:

        if ("positions" not in graph.nodes) or ("positions_targets"
                                                not in graph.nodes) or (
                                                    "positions_nan_mask"
                                                    not in graph.globals):
            raise ValueError("Conformer features not available to filter.")

        any_nan = tf.logical_not(
            tf.squeeze(graph.globals["positions_nan_mask"]))
        return any_nan if with_nans else tf.logical_not(any_nan)
    def _inverse_log_det_jacobian(self, y, use_saved_statistics=False):
        if not self.batchnorm.built:
            # Create variables.
            self.batchnorm.build(y.shape)

        event_dims = self.batchnorm.axis
        reduction_axes = [
            i for i in range(len(y.shape)) if i not in event_dims
        ]

        # At training-time, ildj is computed from the mean and log-variance across
        # the current minibatch.
        # We use multiplication instead of tf.where() to get easier broadcasting.
        log_variance = tf.math.log(
            tf.where(
                tf.logical_or(use_saved_statistics,
                              tf.logical_not(self._training)),
                self.batchnorm.moving_variance,
                tf.nn.moments(x=y, axes=reduction_axes, keepdims=True)[1]) +
            self.batchnorm.epsilon)

        # TODO(b/137216713): determine whether it's unsafe for the reduce_sums below
        # to happen across all axes.
        # `gamma` and `log Var(y)` reductions over event_dims.
        # Log(total change in area from gamma term).
        log_total_gamma = tf.reduce_sum(tf.math.log(self.batchnorm.gamma))

        # Log(total change in area from log-variance term).
        log_total_variance = tf.reduce_sum(log_variance)
        # The ildj is scalar, as it does not depend on the values of x and is
        # constant across minibatch elements.
        return log_total_gamma - 0.5 * log_total_variance
Example #5
 def testMaskingForwardBias(self):
     l = tf.keras.layers.Dense(12)
     ml = layers.MaskedLayer(l, name='test')
     x = tf.random.uniform((3, 5))
     # Building the layer and initializing the parameters.
     ml(x)
     l_bias = ml.layer.weights[1]
     b_mask = tf.random.uniform(l_bias.shape, maxval=2, dtype=tf.int32)
     # To get pruned parameters.
     b_mask_not_bool = tf.logical_not(tf.cast(b_mask, tf.bool))
     ml.set_mask(b_mask, is_bias=True)
     with tf.GradientTape() as tape:
         y = ml(x)
         # All weights under the mask expected to be zero after forward call.
         self.assertEqual(
             tf.math.count_nonzero(tf.boolean_mask(l_bias,
                                                   b_mask_not_bool)), 0)
         loss = tf.reduce_sum(y)
     grads = tape.gradient(loss, l.variables)
     optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
     optimizer.apply_gradients(list(zip(grads, l.variables)))
     # Weights are updated and they are not necessarily zero anymore.
     self.assertNotEqual(
         tf.math.count_nonzero(tf.boolean_mask(l_bias, b_mask_not_bool)), 0)
     # All weights under the mask expected to be zero after forward call.
     # Don't need the return value.
     ml(x)
     self.assertEqual(
         tf.math.count_nonzero(tf.boolean_mask(l_bias, b_mask_not_bool)), 0)
Example #6
 def testSetBiasMasking(self):
     l = tf.keras.layers.Dense(12, bias_initializer='glorot_uniform')
     ml = layers.MaskedLayer(l, name='test')
     with self.assertRaises(AssertionError):
         ml.set_mask(tf.zeros(10), is_bias=True)
     x = tf.random.uniform((3, 5))
     # Building the layer and initializing the parameters.
     ml(x)
     with self.assertRaises(AssertionError):
         # Wrong mask_shape
         ml.set_mask(tf.zeros((5, 12)), is_bias=True)
     l_bias = ml.layer.weights[1]
     b_mask = tf.random.uniform(l_bias.shape, maxval=2, dtype=tf.int32)
     # To get pruned parameters.
     b_mask_not_bool = tf.logical_not(tf.cast(b_mask, tf.bool))
     ml.set_mask(b_mask, is_bias=True)
     self.assertIsInstance(ml.mask_bias, tf.Variable)
     self.assertAllEqual(b_mask, ml.mask_bias.numpy())
     self.assertEqual(l_bias.dtype, ml.mask_bias.dtype)
     # Check the assign works.
     b_mask = tf.random.uniform(l_bias.shape, maxval=2, dtype=tf.int32)
     ml.set_mask(b_mask, is_bias=True)
     self.assertAllEqual(b_mask, ml.mask_bias.numpy())
     self.assertAllEqual(ml.mask_weight.numpy(),
                         tf.ones_like(ml.mask_weight))
     # weights are not masked yet
     self.assertNotEqual(
         tf.math.count_nonzero(tf.boolean_mask(l_bias, b_mask_not_bool)), 0)
def prune_completely_outside_window(boxlist, window, scope=None):
    """Prunes bounding boxes that fall completely outside of the given window.

  The function clip_to_window prunes bounding boxes that fall
  completely outside the window, but also clips any bounding boxes that
  partially overflow. This function does not clip partially overflowing boxes.

  Args:
    boxlist: a BoxList holding M_in boxes.
    window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax]
      of the window
    scope: name scope.

  Returns:
    pruned_boxlist: a new BoxList with all bounding boxes partially or fully in
      the window.
    valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes
     in the input tensor.
  """
    with tf.name_scope(scope, 'PruneCompletelyOutsideWindow'):
        y_min, x_min, y_max, x_max = tf.split(value=boxlist.get(),
                                              num_or_size_splits=4,
                                              axis=1)
        win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
        coordinate_violations = tf.concat([
            tf.greater_equal(y_min, win_y_max),
            tf.greater_equal(x_min, win_x_max),
            tf.less_equal(y_max, win_y_min),
            tf.less_equal(x_max, win_x_min)
        ], 1)
        valid_indices = tf.reshape(
            tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))),
            [-1])
        return gather(boxlist, valid_indices), valid_indices
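A minimal usage sketch (not part of the source): it assumes the TF Object Detection API environment this snippet comes from, namely the `box_list.BoxList` class, the module-level `gather` helper, and the TF1-style two-argument `tf.name_scope` used above.

boxes = box_list.BoxList(tf.constant([
    [0.1, 0.1, 0.4, 0.4],   # overlaps the window -> kept
    [1.2, 1.2, 1.5, 1.5],   # completely outside  -> pruned
]))
window = tf.constant([0.0, 0.0, 1.0, 1.0])  # [ymin, xmin, ymax, xmax]
pruned_boxlist, valid_indices = prune_completely_outside_window(boxes, window)
# pruned_boxlist.get() -> [[0.1, 0.1, 0.4, 0.4]], valid_indices -> [0]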
def prune_outside_window(boxlist, window, scope=None):
    """Prunes bounding boxes that fall outside a given window.

  This function prunes bounding boxes that even partially fall outside the given
  window. See also clip_to_window which only prunes bounding boxes that fall
  completely outside the window, and clips any bounding boxes that partially
  overflow.

  Args:
    boxlist: a BoxList holding M_in boxes.
    window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax]
      of the window
    scope: name scope.

  Returns:
    pruned_corners: a tensor with shape [M_out, 4] where M_out <= M_in
    valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes
     in the input tensor.
  """
    with tf.name_scope(scope, 'PruneOutsideWindow'):
        y_min, x_min, y_max, x_max = tf.split(value=boxlist.get(),
                                              num_or_size_splits=4,
                                              axis=1)
        win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
        coordinate_violations = tf.concat([
            tf.less(y_min, win_y_min),
            tf.less(x_min, win_x_min),
            tf.greater(y_max, win_y_max),
            tf.greater(x_max, win_x_max)
        ], 1)
        valid_indices = tf.reshape(
            tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))),
            [-1])
        return gather(boxlist, valid_indices), valid_indices
Example #9
 def _is_converged(converged, num_iterations, *ignored_args):  # pylint:disable=unused-argument
   # It is important to ensure that not_converged is a tensor. If
   # converged is not a tensor but a Python bool, then the overloaded
   # op '~' acts as bitwise complement so ~True = -2 and ~False = -1.
   # In that case, the loop will never terminate.
   not_converged = tf.logical_not(converged)
   return (not_converged if max_iterations is None
           else (not_converged & (num_iterations < max_iterations)))
Example #10
def is_cudnn_supported_inputs(mask, time_major):
    if time_major:
        mask = tf.transpose(mask)

    return tf.logical_and(
        is_sequence_right_padded(mask),
        tf.logical_not(has_fully_masked_sequence(mask)),
    )
Example #11
def has_fully_masked_sequence(mask):
    # See https://github.com/tensorflow/tensorflow/issues/33148 for more details.
    # The cuDNN kernel errors out if the input sequence contains any fully masked
    # data. We work around this issue by rerouting the computation to the standard
    # kernel until the issue on the cuDNN side has been fixed.
    # A fully masked sequence contains only False values. To make the check easy,
    # we invert the booleans and test whether any sequence is all True.
    return tf.reduce_any(tf.reduce_all(tf.logical_not(mask), axis=1))
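A quick, hedged check of the behaviour described above (assuming `import tensorflow as tf`): the second row is fully masked, so the whole batch is routed away from the cuDNN kernel.

mask = tf.constant([[True, True, False],     # right-padded sequence
                    [False, False, False]])  # fully masked sequence
has_fully_masked_sequence(mask)  # => tf.Tensor(True, shape=(), dtype=bool)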
def kl_divergence(distribution_a,
                  distribution_b,
                  allow_nan_stats=True,
                  name=None):
    """Get the KL-divergence KL(distribution_a || distribution_b).
    If there is no KL method registered specifically for `type(distribution_a)`
    and `type(distribution_b)`, then the class hierarchies of these types are
    searched.
    If one KL method is registered between any pairs of classes in these two
    parent hierarchies, it is used.
    If more than one such registered method exists, the method whose registered
    classes have the shortest sum MRO paths to the input types is used.
    If more than one such shortest path exists, the first method
    identified in the search is used (favoring a shorter MRO distance to
    `type(distribution_a)`).
    Args:
      distribution_a: The first distribution.
      distribution_b: The second distribution.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.
    Returns:
      A Tensor with the batchwise KL-divergence between `distribution_a`
      and `distribution_b`.
    Raises:
      NotImplementedError: If no KL method is defined for distribution types
        of `distribution_a` and `distribution_b`.
    """
    kl_fn = _registered_kl(type(distribution_a), type(distribution_b))
    if kl_fn is None:
        raise NotImplementedError(
            "No KL(distribution_a || distribution_b) registered for distribution_a "
            "type {} and distribution_b type {}".format(
                type(distribution_a).__name__,
                type(distribution_b).__name__))

    name = name or "KullbackLeibler"
    with tf.name_scope(name):
        # pylint: disable=protected-access
        with distribution_a._name_and_control_scope(name + "_a"):
            with distribution_b._name_and_control_scope(name + "_b"):
                kl_t = kl_fn(distribution_a, distribution_b, name=name)
                if allow_nan_stats:
                    return kl_t

    # Check KL for NaNs
    kl_t = tf.identity(kl_t, name="kl")

    with tf.control_dependencies([
            tf.debugging.Assert(
                tf.logical_not(tf.reduce_any(tf.math.is_nan(kl_t))),
                [("KL calculation between {} and {} returned NaN values "
                  "(and was called with allow_nan_stats=False). Values:".
                  format(distribution_a.name, distribution_b.name)), kl_t])
    ]):
        return tf.identity(kl_t, name="checked_kl")
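A short usage sketch, not from the source: it assumes TensorFlow Probability is installed and that an analytic KL is registered for the chosen pair (it is for two Normals).

import tensorflow_probability as tfp
tfd = tfp.distributions

a = tfd.Normal(loc=0., scale=1.)
b = tfd.Normal(loc=1., scale=2.)
# Registered closed form: log(2) + (1**2 + 1**2) / (2 * 2**2) - 0.5 ~= 0.443
kl = kl_divergence(a, b, allow_nan_stats=False)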
    def _flip_feature(self, sampler_state, idx):
        """Proposes flipping the sparsity indicator of the `idx`th feature.

    This method computes the sampler state (including factorized precision
    matrices) for a given sparsity pattern, given the state for a
    related sparsity pattern that differs in a single position. This is
    achieved using rank-1 Cholesky updates running in time
    proportional to `num_features**2`, and so is typically more efficient than
    recomputing the equivalent state from scratch using
    `_initialize_sampler_state`.

    Args:
      sampler_state: instance of `DynamicSpikeSlabSamplerState` collecting
        Tensor quantities relevant to the sampler. See the
        `DynamicSpikeSlabSamplerState` definition for details.
      idx: scalar int `Tensor` index in `[0, num_features)`.

    Returns:
      updated_sampler_state: instance of `DynamicSpikeSlabSamplerState`
        equivalent to `self._initialize_sampler_state(targets, new_nonzeros)`,
        where `new_nonzeros` is equal to `nonzeros` with the `idx`th entry
        negated.
    """
        with tf.name_scope('flip_feature_indicator'):
            was_nonzero = tf.gather(sampler_state.nonzeros, idx, axis=-1)
            new_nonzeros = _set_vector_index(sampler_state.nonzeros, idx,
                                             tf.logical_not(was_nonzero))
            # Update the weight posterior mean and precision for the new nonzeros.
            # (and also update the prior, used to compute the marginal likelihood).
            indices = tf.where(new_nonzeros)[:, 0]
            conditional_prior_precision_chol = tf.linalg.cholesky(
                tf.gather(tf.gather(self.weights_prior_precision, indices),
                          indices,
                          axis=1))
            conditional_posterior_precision_chol = tf.linalg.cholesky(
                tf.gather(tf.gather(sampler_state.weights_posterior_precision,
                                    indices),
                          indices,
                          axis=1))
            sub_x_transpose_y = tf.gather(sampler_state.x_transpose_y, indices)
            conditional_weights_mean = tf.linalg.cholesky_solve(
                conditional_posterior_precision_chol,
                sub_x_transpose_y[..., tf.newaxis])[..., 0]
            return self._compute_log_prob(
                nonzeros=new_nonzeros,
                y_transpose_y=sampler_state.y_transpose_y,
                conditional_prior_precision_chol=
                conditional_prior_precision_chol,
                conditional_posterior_precision_chol=(
                    conditional_posterior_precision_chol),
                weights_posterior_precision=sampler_state.
                weights_posterior_precision,
                observation_noise_variance_posterior_scale=(
                    self.observation_noise_variance_prior_scale +
                    (sampler_state.y_transpose_y - tf.reduce_sum(
                        conditional_weights_mean * sub_x_transpose_y, axis=-1))
                    / 2),
                x_transpose_y=sampler_state.x_transpose_y)
Example #14
  def train(self, sentences):
    tokens, lookup_ids = self._tokens_to_lookup_ids(sentences)
    # Targets are the next word for each word of the sentence.
    tokens_ids_seq = lookup_ids[:, 0:-1]
    tokens_ids_target = lookup_ids[:, 1:]
    tokens_prefix = tokens[:, 0:-1]

    # Mask determining which positions we care about for a loss: all positions
    # that have a valid non-terminal token.
    mask = tf.logical_and(
        tf.logical_not(tf.equal(tokens_prefix, "")),
        tf.logical_not(tf.equal(tokens_prefix, "<E>")))

    input_mask = tf.cast(mask, tf.int32)

    with tf.GradientTape() as t:
      # sentence_embeddings = tf.nn.embedding_lookup(self._embeddings, tokens_ids_seq)
      lstm_output = self.model(tokens_ids_seq)
      lstm_output = tf.reshape(lstm_output, [-1, self._state_size])
      logits = self._logit_layer(lstm_output)

      targets = tf.reshape(tokens_ids_target, [-1])
      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=logits)

      # Final loss is the mean loss for all token losses.
      final_loss = tf.math.divide(
          tf.reduce_sum(tf.multiply(losses, weights)),
          tf.reduce_sum(weights),
          name="final_loss")

    watched = t.watched_variables()
    gradients = t.gradient(final_loss, watched)
    self.optimizer.apply_gradients(zip(gradients, watched))

    #for w, g in zip(watched, gradients):
    #  w.assign_sub(g)

    return final_loss
Example #15
def loss_function(y_pred, y):

    # shape of y: [batch_size, Ty]
    # shape of y_pred: [batch_size, Ty, output_vocab_size]
    sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    mask = tf.logical_not(tf.math.equal(y, 0))  # 0 where y is padding (y == 0), else 1
    mask = tf.cast(mask, dtype=loss.dtype)
    loss = mask * loss
    loss = tf.reduce_mean(loss)
    return loss
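A hedged usage sketch (assuming `import tensorflow as tf`): label id 0 is treated as padding, so those positions contribute zero to the loss.

y_true = tf.constant([[3, 1, 0, 0]])    # [batch_size, Ty]; 0 marks padding
y_pred = tf.random.uniform((1, 4, 5))   # [batch_size, Ty, output_vocab_size] logits
loss = loss_function(y_pred, y_true)    # scalar; padded positions are zeroed out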
Example #16
def _week_day_mappers(weekend_mask):
    """Creates functions to map from ordinals to week days and inverse.

  Creates functions to map from ordinal space (i.e. days since 31 Dec 0) to
  week days. The function assigns the value of 0 to the first non weekend
  day in the week starting on Sunday, 31 Dec 1 through to Saturday, 6 Jan 1 and
  the value assigned to each successive work day is incremented by 1. For a day
  that is not a week day, this count is not incremented from the previous week
  day (hence, multiple ordinal days may have the same week day value).

  Args:
    weekend_mask: A bool `Tensor` of length 7 or None. The weekend mask.

  Returns:
    A tuple of callables.
      `forward`: Takes one `Tensor` argument containing ordinals and returns a
        tuple of two `Tensor`s of the same shape as the input. The first
        `Tensor` is of type `int32` and contains the week day value. The second
        is a bool `Tensor` indicating whether the supplied ordinal was a weekend
        day (i.e. True where the day is a weekend day and False otherwise).
      `backward`: Takes one int32 `Tensor` argument containing week day values
        and returns an int32 `Tensor` containing ordinals for those week days.
  """
    if weekend_mask is None:
        default_forward = lambda x: (x, tf.zeros_like(x, dtype=tf.bool))
        identity = lambda x: x
        return default_forward, identity
    weekend_mask = tf.convert_to_tensor(weekend_mask, dtype=tf.bool)
    weekend_mask = tf.roll(weekend_mask, -_DAYOFWEEK_0, axis=0)
    weekday_mask = tf.logical_not(weekend_mask)
    weekday_offsets = tf.cumsum(tf.cast(weekday_mask, dtype=tf.int32))
    num_workdays = weekday_offsets[-1]
    weekday_offsets -= 1  # Adjust the first workday to index 0.
    ordinal_offsets = tf.convert_to_tensor([0, 1, 2, 3, 4, 5, 6],
                                           dtype=tf.int32)
    ordinal_offsets = ordinal_offsets[weekday_mask]

    def forward(ordinals):
        """Adjusts the ordinals by removing the number of weekend days so far."""
        mod, remainder = ordinals // 7, ordinals % 7
        weekday_values = mod * num_workdays + tf.gather(
            weekday_offsets, remainder)
        is_weekday = tf.gather(weekday_mask, remainder)
        return weekday_values, is_weekday

    def backward(weekday_values):
        """Converts from weekend adjusted values to ordinals."""
        return ((weekday_values // num_workdays) * 7 +
                tf.gather(ordinal_offsets, weekday_values % num_workdays))

    return forward, backward
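A hedged usage sketch: with Saturday and Sunday marked as the weekend, `forward` maps ordinals to a running business-day count plus a weekday indicator, and `backward` maps those counts back to week-day ordinals. The position of each day in `weekend_mask` follows the module's `_DAYOFWEEK_0` convention, so treat the Monday-first mask below as illustrative only.

weekend_mask = tf.constant([False, False, False, False, False, True, True])
forward, backward = _week_day_mappers(weekend_mask)
weekday_values, is_weekday = forward(tf.constant([737000, 737001, 737002]))
ordinals_back = backward(weekday_values)  # ordinals of the corresponding week days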
Example #17
    def _apply_scores(self, scores, value, scores_mask=None, training=None):
        """Applies attention scores to the given value tensor.

        To use this method in your attention layer, follow the steps:

        * Use `query` tensor of shape `[batch_size, Tq]` and `key` tensor of
          shape `[batch_size, Tv]` to calculate the attention `scores`.
        * Pass `scores` and `value` tensors to this method. The method applies
          `scores_mask`, calculates `attention_distribution = softmax(scores)`,
          then returns `matmul(attention_distribution, value)`.
        * Apply `query_mask` and return the result.

        Args:
          scores: Scores float tensor of shape `[batch_size, Tq, Tv]`.
          value: Value tensor of shape `[batch_size, Tv, dim]`.
          scores_mask: A boolean mask `Tensor` of shape `[batch_size, 1, Tv]` or
            `[batch_size, Tq, Tv]`. If given, scores at positions where
            `scores_mask==False` do not contribute to the result. It must
            contain at least one `True` value in each line along the last
            dimension.
          training: Python boolean indicating whether the layer should behave in
            training mode (adding dropout) or in inference mode (no dropout).

        Returns:
          Tensor of shape `[batch_size, Tq, dim]`.
          Attention scores after masking and softmax with shape
            `[batch_size, Tq, Tv]`.
        """
        if scores_mask is not None:
            padding_mask = tf.logical_not(scores_mask)
            # Bias so padding positions do not contribute to attention
            # distribution.  Note 65504. is the max float16 value.
            if scores.dtype is tf.float16:
                scores -= 65504.0 * tf.cast(padding_mask, dtype=scores.dtype)
            else:
                scores -= 1.0e9 * tf.cast(padding_mask, dtype=scores.dtype)
        if training is None:
            training = backend.learning_phase()
        weights = tf.nn.softmax(scores)

        if self.dropout > 0:

            def dropped_weights():
                return self._random_generator.dropout(weights,
                                                      rate=self.dropout)

            weights = control_flow_util.smart_cond(
                training, dropped_weights, lambda: tf.identity(weights))
        return tf.matmul(weights, value), weights
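A standalone, hedged illustration of the masking trick used above (assuming `import tensorflow as tf`): positions where the mask is `False` receive a large negative bias, so softmax gives them essentially zero weight.

scores = tf.constant([[[1.0, 2.0, 3.0]]])           # [batch_size, Tq, Tv]
scores_mask = tf.constant([[[True, True, False]]])
padding_mask = tf.logical_not(scores_mask)
biased = scores - 1.0e9 * tf.cast(padding_mask, scores.dtype)
weights = tf.nn.softmax(biased)  # ~[[0.27, 0.73, 0.00]]: the masked slot is ignored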
Example #18
 def __call__(self, cache,
              new_items):
   datawise_matches = []
   for key in self.keys:
     cache_vals = cache.data[key]
     new_items_vals = new_items[key]
     if cache_vals.dtype.is_floating:
       raise NotImplementedError('Floating datatypes are not yet implemented.')
     cache_vals = tf.expand_dims(cache_vals, axis=0)
     new_items_vals = tf.expand_dims(new_items_vals, axis=1)
     elementwise = cache_vals == new_items_vals
     datawise = tf.reduce_all(elementwise, axis=range(2, tf.rank(elementwise)))
     datawise_matches.append(datawise)
   all_keys_datawise = tf.stack(datawise_matches, axis=2)
   all_keys_match = tf.reduce_all(all_keys_datawise, axis=2)
   in_cache = tf.reduce_any(all_keys_match, axis=1)
   return tf.logical_not(in_cache)
Example #19
 def _log_prob(self, x):
     x = tf.convert_to_tensor(x, name='x')
     right_indices = tf.minimum(
         tf.size(self.outcomes) - 1,
         tf.reshape(
             tf.searchsorted(self.outcomes,
                             values=tf.reshape(x, shape=[-1]),
                             side='right'), ps.shape(x)))
     use_right_indices = self._is_equal_or_close(
         x, tf.gather(self.outcomes, indices=right_indices))
     left_indices = tf.maximum(0, right_indices - 1)
     use_left_indices = self._is_equal_or_close(
         x, tf.gather(self.outcomes, indices=left_indices))
     log_probs = self._categorical.log_prob(
         tf.where(use_left_indices, left_indices, right_indices))
     return tf.where(tf.logical_not(use_left_indices | use_right_indices),
                     dtype_util.as_numpy_dtype(log_probs.dtype)(-np.inf),
                     log_probs)
    def compute_loss_and_acc(self, rnn_output_logits: tf.Tensor,
                             target_token_seq: tf.Tensor) -> LanguageModelLoss:
        """
        Args:
            rnn_output_logits: tf.float32 Tensor of shape [B, T, V], representing
                logits as computed by the language model.
            target_token_seq: tf.int32 Tensor of shape [B, T], representing
                the target token sequence.

        Returns:
            LanguageModelLoss tuple, containing both the average per-token loss
            as well as the number of (non-padding) token predictions and how many
            of those were correct.
        
        Note:
            We assume that the two inputs are shifted by one from each other, i.e.,
            that rnn_output_logits[i, t, :] are the logits for sample i after consuming
            input t; hence its target output is assumed to be target_token_seq[i, t+1].
        """
        # TODO 5# 4) Compute CE loss for all but the last timestep:
        token_ce_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=target_token_seq[:, 1:],
            logits=rnn_output_logits[:, :-1, :])
        # tf.reduce_mean(token_ce_loss) here would be redundant; the mean is taken after masking in TODO 7 below.

        # TODO 6# Compute number of (correct) predictions
        pad_id = self.vocab.get_id_or_unk(self.vocab.get_pad())
        mask = tf.logical_not(tf.equal(target_token_seq, pad_id))[:, 1:]

        # compute predictions correctness and drop the padding by applying the mask
        predictions_status = tf.boolean_mask(
            tf.equal(target_token_seq[:, 1:],
                     tf.argmax(rnn_output_logits[:, :-1], axis=2)), mask)

        num_tokens = len(predictions_status)
        num_correct_tokens = tf.math.count_nonzero(predictions_status,
                                                   dtype=tf.float32)

        # TODO 7# Mask out CE loss for padding tokens
        token_ce_loss = tf.boolean_mask(token_ce_loss, mask)
        token_ce_loss = tf.reduce_mean(token_ce_loss)

        return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens)
  def _get_rpn_samples(self, match_results):
    """Computes anchor labels.

    This function performs subsampling for foreground (fg) and background (bg)
    anchors.
    Args:
      match_results: An integer tensor with shape [N] representing the
        matching results of anchors. (1) match_results[i]>=0,
        meaning that column i is matched with row match_results[i].
        (2) match_results[i]=-1, meaning that column i is not matched.
        (3) match_results[i]=-2, meaning that column i is ignored.
    Returns:
      score_targets: an integer tensor with a shape of [N].
        (1) score_targets[i]=1, the anchor is a positive sample.
        (2) score_targets[i]=0, the anchor is a negative sample.
        (3) score_targets[i]=-1, the anchor is ignored (don't care).
    """
    sampler = (
        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=self._rpn_fg_fraction, is_static=False))
    # indicator includes both positive and negative labels.
    # labels includes only positive labels.
    # positives = indicator & labels.
    # negatives = indicator & !labels.
    # ignore = !indicator.
    indicator = tf.greater(match_results, -2)
    labels = tf.greater(match_results, -1)

    samples = sampler.subsample(
        indicator, self._rpn_batch_size_per_im, labels)
    positive_labels = tf.where(
        tf.logical_and(samples, labels),
        tf.constant(2, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    negative_labels = tf.where(
        tf.logical_and(samples, tf.logical_not(labels)),
        tf.constant(1, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    ignore_labels = tf.fill(match_results.shape, -1)

    return (ignore_labels + positive_labels + negative_labels,
            positive_labels, negative_labels)
Example #22
def _add_conformer_features(
    graph,
    conformer_features,
    augment_with_random_mirror_symmetry: bool,
    noise_std: float,
    is_training: bool,
):
    """Adds conformer features."""
    if not isinstance(graph.nodes, dict):
        raise ValueError("Expected a dict type for `graph.nodes`.")
    # Remove mean position to center around a canonical origin.
    positions = conformer_features["conformer"]
    # NaN's appear in ~0.13% of training, 0.104% of validation and 0.16% of test
    # nodes.
    # See this colab: http://shortn/_6UcuosxY7x.
    nan_mask = tf.reduce_any(tf.math.is_nan(positions))

    positions = tf.where(nan_mask, tf.constant(0., positions.dtype), positions)
    positions -= tf.reduce_mean(positions, axis=0, keepdims=True)

    # Optionally augment with a random rotation.
    if is_training:
        rot_mat = conformer_utils.get_random_rotation_matrix(
            augment_with_random_mirror_symmetry)
        positions = conformer_utils.rotate(positions, rot_mat)
    positions_targets = positions

    # Optionally add noise to the positions.
    if noise_std and is_training:
        positions = tf.random.normal(tf.shape(positions), positions, noise_std)

    return graph._replace(
        nodes=dict(positions=positions,
                   positions_targets=positions_targets,
                   **graph.nodes),
        globals={
            "positions_nan_mask": tf.expand_dims(tf.logical_not(nan_mask),
                                                 axis=0),
            **(graph.globals if isinstance(graph.globals, dict) else {})
        })
Example #23
 def _log_prob(self, x):
     x = tf.convert_to_tensor(value=x, name='x')
     right_indices = tf.minimum(
         tf.size(input=self.outcomes) - 1,
         tf.reshape(
             tf.searchsorted(self.outcomes,
                             values=tf.reshape(x, shape=[-1]),
                             side='right'),
             dist_util.prefer_static_shape(x)))
     use_right_indices = self._is_equal_or_close(
         x, tf.gather(self.outcomes, indices=right_indices))
     left_indices = tf.maximum(0, right_indices - 1)
     use_left_indices = self._is_equal_or_close(
         x, tf.gather(self.outcomes, indices=left_indices))
     log_probs = self._categorical.log_prob(
         tf1.where(use_left_indices, left_indices, right_indices))
     should_be_neg_inf = tf.broadcast_to(
         tf.logical_not(use_left_indices | use_right_indices),
         shape=dist_util.prefer_static_shape(log_probs))
     return tf1.where(
         should_be_neg_inf,
         tf.fill(dist_util.prefer_static_shape(should_be_neg_inf),
                 dtype_util.as_numpy_dtype(log_probs.dtype)(-np.inf)),
         log_probs)
def fit(
    model_matrix,
    response,
    model,
    model_coefficients_start=None,
    predicted_linear_response_start=None,
    l2_regularizer=None,
    dispersion=None,
    offset=None,
    convergence_criteria_fn=None,
    learning_rate=None,
    fast_unsafe_numerics=True,
    maximum_iterations=None,
    l2_regularization_penalty_factor=None,
    name=None):
  """Runs multiple Fisher scoring steps.

  Args:
    model_matrix: (Batch of) `float`-like, matrix-shaped `Tensor` where each row
      represents a sample's features.
    response: (Batch of) vector-shaped `Tensor` where each element represents a
      sample's observed response (to the corresponding row of features). Must
      have same `dtype` as `model_matrix`.
    model: `tfp.glm.ExponentialFamily`-like instance which implicitly
      characterizes a negative log-likelihood loss by specifying the
      distribution's `mean`, `gradient_mean`, and `variance`.
    model_coefficients_start: Optional (batch of) vector-shaped `Tensor`
      representing the initial model coefficients, one for each column in
      `model_matrix`. Must have same `dtype` as `model_matrix`.
      Default value: Zeros.
    predicted_linear_response_start: Optional `Tensor` with `shape`, `dtype`
      matching `response`; represents `offset` shifted initial linear
      predictions based on `model_coefficients_start`.
      Default value: `offset` if `model_coefficients is None`, and
      `tf.linalg.matvec(model_matrix, model_coefficients_start) + offset`
      otherwise.
    l2_regularizer: Optional scalar `Tensor` representing L2 regularization
      penalty, i.e.,
      `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w||_2^2`.
      Default value: `None` (i.e., no L2 regularization).
    dispersion: Optional (batch of) `Tensor` representing `response` dispersion,
      i.e., as in, `p(y|theta) := exp((y theta - A(theta)) / dispersion)`.
      Must broadcast with rows of `model_matrix`.
      Default value: `None` (i.e., "no dispersion").
    offset: Optional `Tensor` representing constant shift applied to
      `predicted_linear_response`.  Must broadcast to `response`.
      Default value: `None` (i.e., `tf.zeros_like(response)`).
    convergence_criteria_fn: Python `callable` taking:
      `is_converged_previous`, `iter_`, `model_coefficients_previous`,
      `predicted_linear_response_previous`, `model_coefficients_next`,
      `predicted_linear_response_next`, `response`, `model`, `dispersion` and
      returning a `bool` `Tensor` indicating that Fisher scoring has converged.
      See `convergence_criteria_small_relative_norm_weights_change` as an
      example function.
      Default value: `None` (i.e.,
      `convergence_criteria_small_relative_norm_weights_change`).
    learning_rate: Optional (batch of) scalar `Tensor` used to dampen iterative
      progress. Typically only needed if optimization diverges, should be no
      larger than `1` and typically very close to `1`.
      Default value: `None` (i.e., `1`).
    fast_unsafe_numerics: Optional Python `bool` indicating if faster, less
      numerically accurate methods can be employed for computing the weighted
      least-squares solution.
      Default value: `True` (i.e., "fast but possibly diminished accuracy").
    maximum_iterations: Optional maximum number of iterations of Fisher scoring
      to run; "and-ed" with result of `convergence_criteria_fn`.
      Default value: `None` (i.e., `infinity`).
    l2_regularization_penalty_factor: Optional (batch of) vector-shaped
      `Tensor`, representing a separate penalty factor to apply to each model
      coefficient, length equal to columns in `model_matrix`. Each penalty
      factor multiplies l2_regularizer to allow differential regularization. Can
      be 0 for some coefficients, which implies no regularization. Default is 1
      for all coefficients.
      `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w *
        l2_regularization_penalty_factor||_2^2`
      Default value: `None` (i.e., no per coefficient regularization).
    name: Python `str` used as name prefix to ops created by this function.
      Default value: `"fit"`.

  Returns:
    model_coefficients: (Batch of) vector-shaped `Tensor`; represents the
      fitted model coefficients, one for each column in `model_matrix`.
    predicted_linear_response: `response`-shaped `Tensor` representing linear
      predictions based on new `model_coefficients`, i.e.,
      `tf.linalg.matvec(model_matrix, model_coefficients) + offset`.
    is_converged: `bool` `Tensor` indicating that the returned
      `model_coefficients` met the `convergence_criteria_fn` criteria within the
      `maximum_iterations` limit.
    iter_: `int32` `Tensor` indicating the number of iterations taken.

  #### Example

  ```python
  from __future__ import print_function
  import numpy as np
  import tensorflow as tf
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  def make_dataset(n, d, link, scale=1., dtype=np.float32):
    model_coefficients = tfd.Uniform(
        low=np.array(-1, dtype),
        high=np.array(1, dtype)).sample(d, seed=42)
    radius = np.sqrt(2.)
    model_coefficients *= radius / tf.linalg.norm(model_coefficients)
    model_matrix = tfd.Normal(
        loc=np.array(0, dtype),
        scale=np.array(1, dtype)).sample([n, d], seed=43)
    scale = tf.convert_to_tensor(scale, dtype)
    linear_response = tf.tensordot(
        model_matrix, model_coefficients, axes=[[1], [0]])
    if link == 'linear':
      response = tfd.Normal(loc=linear_response, scale=scale).sample(seed=44)
    elif link == 'probit':
      response = tf.cast(
          tfd.Normal(loc=linear_response, scale=scale).sample(seed=44) > 0,
          dtype)
    elif link == 'logit':
      response = tfd.Bernoulli(logits=linear_response).sample(seed=44)
    else:
      raise ValueError('unrecognized true link: {}'.format(link))
    return model_matrix, response, model_coefficients

  X, Y, w_true = make_dataset(n=int(1e6), d=100, link='probit')

  w, linear_response, is_converged, num_iter = tfp.glm.fit(
      model_matrix=X,
      response=Y,
      model=tfp.glm.BernoulliNormalCDF())
  log_likelihood = tfp.glm.BernoulliNormalCDF().log_prob(Y, linear_response)

  print('is_converged: ', is_converged.numpy())
  print('    num_iter: ', num_iter.numpy())
  print('    accuracy: ', np.mean((linear_response > 0.) == tf.cast(Y, bool)))
  print('    deviance: ', 2. * np.mean(log_likelihood))
  print('||w0-w1||_2 / (1+||w0||_2): ', (np.linalg.norm(w_true - w, ord=2) /
                                         (1. + np.linalg.norm(w_true, ord=2))))

  # ==>
  # is_converged:  True
  #     num_iter:  6
  #     accuracy:  0.804382
  #     deviance:  -0.820746600628
  # ||w0-w1||_2 / (1+||w0||_2):  0.00619245105309
  ```

  """
  with tf.name_scope(name or 'fit'):
    [
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset,
    ] = prepare_args(
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset)
    if convergence_criteria_fn is None:
      convergence_criteria_fn = (
          convergence_criteria_small_relative_norm_weights_change())

    def _body(
        is_converged_previous,
        iter_,
        model_coefficients_previous,
        predicted_linear_response_previous):
      """`tf.while_loop` body."""
      model_coefficients_next, predicted_linear_response_next = fit_one_step(
          model_matrix,
          response,
          model,
          model_coefficients_previous,
          predicted_linear_response_previous,
          l2_regularizer,
          dispersion,
          offset,
          learning_rate,
          fast_unsafe_numerics,
          l2_regularization_penalty_factor,
          name)
      is_converged_next = convergence_criteria_fn(
          is_converged_previous=is_converged_previous,
          iter_=iter_,
          model_coefficients_previous=model_coefficients_previous,
          predicted_linear_response_previous=predicted_linear_response_previous,
          model_coefficients_next=model_coefficients_next,
          predicted_linear_response_next=predicted_linear_response_next,
          response=response,
          model=model,
          dispersion=dispersion)
      return [
          is_converged_next,
          iter_ + 1,
          model_coefficients_next,
          predicted_linear_response_next,
      ]

    # while not converged:
    #   fit_one_step
    [
        is_converged,
        iter_,
        model_coefficients,
        predicted_linear_response,
    ] = tf.while_loop(
        cond=lambda is_converged, *args: tf.logical_not(is_converged),
        body=_body,
        loop_vars=[
            tf.zeros([], np.bool_),  # is_converged
            tf.zeros([], np.int32),  # iter_
            model_coefficients_start,
            predicted_linear_response_start,
        ],
        maximum_iterations=maximum_iterations)

    return [
        model_coefficients,
        predicted_linear_response,
        is_converged,
        iter_
    ]
    def _maybe_validate_shape_override(self, override_shape, base_is_scalar_fn,
                                       static_base_shape, is_init):
        """Helper which ensures override batch/event_shape are valid."""

        assertions = []
        concretized_shape = None

        # Check valid dtype
        if is_init:  # No xor check because `dtype` cannot change.
            dtype_ = override_shape.dtype
            if dtype_ is None:
                if concretized_shape is None:
                    concretized_shape = tf.convert_to_tensor(override_shape)
                dtype_ = concretized_shape.dtype
            if dtype_util.base_dtype(dtype_) not in {tf.int32, tf.int64}:
                raise TypeError('Shape override must be integer type; '
                                'saw {}.'.format(dtype_util.name(dtype_)))

        # Check non-negative elements
        if is_init != tensor_util.is_ref(override_shape):
            override_shape_ = tf.get_static_value(override_shape)
            msg = 'Shape override must have non-negative elements.'
            if override_shape_ is not None:
                if np.any(np.array(override_shape_) < 0):
                    raise ValueError('{} Saw: {}'.format(msg, override_shape_))
            elif self.validate_args:
                if concretized_shape is None:
                    concretized_shape = tf.convert_to_tensor(override_shape)
                assertions.append(
                    assert_util.assert_non_negative(concretized_shape,
                                                    message=msg))

        # Check valid shape
        override_ndims_ = tensorshape_util.rank(override_shape.shape)
        if is_init != (override_ndims_ is None):
            msg = 'Shape override must be a vector.'
            if override_ndims_ is not None:
                if override_ndims_ != 1:
                    raise ValueError(msg)
            elif self.validate_args:
                if concretized_shape is None:
                    concretized_shape = tf.convert_to_tensor(override_shape)
                override_rank = tf.rank(concretized_shape)
                assertions.append(
                    assert_util.assert_equal(override_rank, 1, message=msg))

        static_base_rank = tensorshape_util.rank(static_base_shape)

        # Determine if the override shape is `[]` (static_override_dims == [0]),
        # in which case the base distribution may be nonscalar.
        static_override_dims = tensorshape_util.dims(override_shape.shape)

        if is_init != (static_base_rank is None
                       or static_override_dims is None):
            msg = 'Base distribution is not scalar.'
            if static_base_rank is not None and static_override_dims is not None:
                if static_base_rank != 0 and static_override_dims != [0]:
                    raise ValueError(msg)
            elif self.validate_args:
                if concretized_shape is None:
                    concretized_shape = tf.convert_to_tensor(override_shape)
                override_is_empty = tf.logical_not(
                    self._has_nonzero_rank(concretized_shape))
                assertions.append(
                    assert_util.assert_equal(tf.logical_or(
                        base_is_scalar_fn(), override_is_empty),
                                             True,
                                             message=msg))
        return assertions
Example #26
 def _cond(state):
     """Continue if iterations remain and stopping condition is not met."""
     return ((state.num_iterations < max_iterations) & tf.logical_not(
         stopping_condition(state.converged, state.failed)))
Example #27
def minimize(loss_fn,
             num_steps,
             optimizer,
             convergence_criterion=None,
             batch_convergence_reduce_fn=tf.reduce_all,
             trainable_variables=None,
             trace_fn=_trace_loss,
             return_full_length_trace=True,
             jit_compile=False,
             seed=None,
             name='minimize'):
    """Minimize a loss function using a provided optimizer.

  Args:
    loss_fn: Python callable with signature `loss = loss_fn()`, where `loss`
      is a `Tensor` loss to be minimized. This may optionally take a `seed`
      keyword argument, used to specify a per-iteration seed for stochastic
      loss functions (a stateless `Tensor` seed will be passed; see
      `tfp.random.sanitize_seed`).
    num_steps: Python `int` maximum number of steps to run the optimizer.
    optimizer: Optimizer instance to use. This may be a TF1-style
      `tf.train.Optimizer`, TF2-style `tf.optimizers.Optimizer`, or any Python
      object that implements `optimizer.apply_gradients(grads_and_vars)`.
    convergence_criterion: Optional instance of
      `tfp.optimizer.convergence_criteria.ConvergenceCriterion`
      representing a criterion for detecting convergence. If `None`,
      the optimization will run for `num_steps` steps, otherwise, it will run
      for at *most* `num_steps` steps, as determined by the provided criterion.
      Default value: `None`.
    batch_convergence_reduce_fn: Python `callable` of signature
      `has_converged = batch_convergence_reduce_fn(batch_has_converged)`
      whose input is a `Tensor` of boolean values of the same shape as the
      `loss` returned by `loss_fn`, and output is a scalar
      boolean `Tensor`. This determines the behavior of batched
      optimization loops when `loss_fn`'s return value is non-scalar.
      For example, `tf.reduce_all` will stop the optimization
      once all members of the batch have converged, `tf.reduce_any` once *any*
      member has converged,
      `lambda x: tf.reduce_mean(tf.cast(x, tf.float32)) > 0.5` once more than
      half have converged, etc.
      Default value: `tf.reduce_all`.
    trainable_variables: list of `tf.Variable` instances to optimize with
      respect to. If `None`, defaults to the set of all variables accessed
      during the execution of `loss_fn()`.
      Default value: `None`.
    trace_fn: Python callable with signature `traced_values = trace_fn(
      traceable_quantities)`, where the argument is an instance of
      `tfp.math.MinimizeTraceableQuantities` and the returned `traced_values`
      may be a `Tensor` or nested structure of `Tensor`s. The traced values are
      stacked across steps and returned.
      The default `trace_fn` simply returns the loss. In general, trace
      functions may also examine the gradients, values of parameters,
      the state propagated by the specified `convergence_criterion`, if any (if
      no convergence criterion is specified, this will be `None`),
      as well as any other quantities captured in the closure of `trace_fn`,
      for example, statistics of a variational distribution.
      Default value: `lambda traceable_quantities: traceable_quantities.loss`.
    return_full_length_trace: Python `bool` indicating whether to return a trace
      of the full length `num_steps`, even if a convergence criterion stopped
      the optimization early, by tiling the value(s) traced at the final
      optimization step. This enables use in contexts such as XLA that require
      shapes to be known statically.
      Default value: `True`.
    jit_compile: If True, compiles the minimization loop using
      XLA. XLA performs compiler optimizations, such as fusion, and attempts to
      emit more efficient code. This may drastically improve the performance.
      See the docs for `tf.function`. (In JAX, this will apply `jax.jit`).
      Default value: `False`.
    seed: PRNG seed for stochastic losses; see `tfp.random.sanitize_seed.`
      Default value: `None`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: 'minimize'.

  Returns:
    trace: `Tensor` or nested structure of `Tensor`s, according to the
      return type of `trace_fn`. Each `Tensor` has an added leading dimension
      stacking the trajectory of the traced values over the course of the
      optimization. The size of this dimension is equal to `num_steps` if
      a convergence criterion was not specified and/or
      `return_full_length_trace=True`, and otherwise it is equal to the
      number of optimization steps taken.

  ### Examples

  To minimize the scalar function `(x - 5)**2`:

  ```python
  x = tf.Variable(0.)
  loss_fn = lambda: (x - 5.)**2
  losses = tfp.math.minimize(loss_fn,
                             num_steps=100,
                             optimizer=tf.optimizers.Adam(learning_rate=0.1))

  # In TF2/eager mode, the optimization runs immediately.
  print("optimized value is {} with loss {}".format(x, losses[-1]))
  ```

  In graph mode (e.g., inside of `tf.function` wrapping), retrieving any Tensor
  that depends on the minimization op will trigger the optimization:

  ```python
  with tf.control_dependencies([losses]):
    optimized_x = tf.identity(x)  # Use a dummy op to attach the dependency.
  ```

  We can attempt to automatically detect convergence and stop the optimization
  by passing an instance of
  `tfp.optimizer.convergence_criteria.ConvergenceCriterion`. For example, to
  stop the optimization once a moving average of the per-step decrease in loss
  drops below `0.01`:

  ```python
  losses = tfp.math.minimize(
    loss_fn, num_steps=1000, optimizer=tf.optimizers.Adam(learning_rate=0.1),
    convergence_criterion=(
      tfp.optimizer.convergence_criteria.LossNotDecreasing(atol=0.01)))
  ```

  Here `num_steps=1000` defines an upper bound: the optimization will be
  stopped after 1000 steps even if no convergence is detected.

  In some cases, we may want to track additional context inside the
  optimization. We can do this by defining a custom `trace_fn`. Note that
  the `trace_fn` is passed the loss and gradients, as well as any auxiliary
  state maintained by the convergence criterion (if any), for example, moving
  averages of the loss or gradients, but it may also report the
  values of trainable parameters or other derived quantities by capturing them
  in its closure. For example, we can capture `x` and track its value over the
  optimization:

  ```python
  # `x` is the tf.Variable instance defined above.
  trace_fn = lambda traceable_quantities: {
    'loss': traceable_quantities.loss, 'x': x}
  trace = tfp.math.minimize(loss_fn, num_steps=100,
                            optimizer=tf.optimizers.Adam(0.1),
                            trace_fn=trace_fn)
  print(trace['loss'].shape,   # => [100]
        trace['x'].shape)      # => [100]
  ```

  When optimizing a batch of losses, some batch members will converge before
  others. The optimization will continue until the condition defined by the
  `batch_convergence_reduce_fn` becomes `True`. During these additional steps,
  converged elements will continue to be updated and may become unconverged.
  The convergence status of batch members can be diagnosed by tracing
  `has_converged`:

  ```python
  batch_size = 10
  x = tf.Variable([0.] * batch_size)
  trace_fn = lambda traceable_quantities: {
    'loss': traceable_quantities.loss,
    'has_converged': traceable_quantities.has_converged}
  trace = tfp.math.minimize(loss_fn, num_steps=100,
                            optimizer=tf.optimizers.Adam(0.1),
                            trace_fn=trace_fn,
                            convergence_criterion=(
      tfp.optimizer.convergence_criteria.LossNotDecreasing(atol=0.01)))

  for i in range(batch_size):
    print('Batch element {} final state is {}converged.'
          ' It first converged at step {}.'.format(
          i, '' if trace['has_converged'][-1, i] else 'not ',
          np.argmax(trace['has_converged'][:, i])))
  ```

  """

    if jit_compile:
        # Run the entire minimization inside a jit-compiled function. This is
        # typically faster than jit-compiling the individual steps.
        parameters = dict(locals())
        parameters['jit_compile'] = False

        @tf.function(autograph=False, jit_compile=True)
        def run_jitted_minimize():
            return minimize(**parameters)

        return run_jitted_minimize()

    def convergence_detected(step,
                             seed,
                             trace_arrays,
                             has_converged=None,
                             convergence_criterion_state=None):
        del step
        del seed
        del trace_arrays
        del convergence_criterion_state
        return (has_converged is not None  # Convergence criterion in use.
                and batch_convergence_reduce_fn(has_converged))

    # Main optimization routine.
    with tf.name_scope(name) as name:
        seed = samplers.sanitize_seed(seed, salt='minimize')

        # Take an initial training step to obtain the initial loss and values, which
        # will define the shape(s) of the `TensorArray`(s) that we create to hold
        # the results, and are used to initialize the convergence criterion.
        # This will trigger tf.function tracing of `optimizer_step_fn`, which is
        # then reused inside the training loop (i.e., it is only traced once).
        optimizer_step_fn = _make_optimizer_step_fn(
            loss_fn=loss_fn,
            optimizer=optimizer,
            trainable_variables=trainable_variables)
        initial_loss, initial_grads, initial_parameters = optimizer_step_fn(
            seed=seed)
        has_converged = None
        initial_convergence_criterion_state = None
        if convergence_criterion is not None:
            has_converged = tf.zeros(tf.shape(initial_loss), dtype=tf.bool)
            initial_convergence_criterion_state = convergence_criterion.bootstrap(
                initial_loss, initial_grads, initial_parameters)
        initial_traced_values = trace_fn(
            MinimizeTraceableQuantities(
                loss=initial_loss,
                gradients=initial_grads,
                parameters=initial_parameters,
                step=0,
                has_converged=has_converged,
                convergence_criterion_state=initial_convergence_criterion_state
            ))

        trace_arrays = _initialize_arrays(
            initial_values=initial_traced_values,
            num_steps=num_steps,
            truncate_at_convergence=(convergence_criterion is not None
                                     and not return_full_length_trace))

        # Run the optimization loop.
        with tf.control_dependencies([initial_loss]):
            potential_loop_vars = (1, seed, trace_arrays, has_converged,
                                   initial_convergence_criterion_state)
            results = tf.while_loop(
                cond=lambda *args: tf.logical_not(convergence_detected(*args)),  # pylint: disable=no-value-for-parameter
                body=_make_training_loop_body(
                    optimizer_step_fn=optimizer_step_fn,
                    convergence_criterion=convergence_criterion,
                    trace_fn=trace_fn),
                loop_vars=[x for x in potential_loop_vars if x is not None],
                parallel_iterations=1,
                maximum_iterations=num_steps - 1)
            indices, _, trace_arrays = results[:3]  # Guaranteed to be present.

            if convergence_criterion is not None and return_full_length_trace:
                # Fill out the trace by tiling the last written values.
                last_written_idx = tf.reduce_max(indices) - 1
                trace_arrays = tf.nest.map_structure(
                    lambda ta: _tile_last_written_value(ta, last_written_idx),
                    trace_arrays)

        return tf.nest.map_structure(lambda array: array.stack(), trace_arrays)
Example #28
def logical_not(x):
    x = array_ops.array(x, dtype=np.bool_)
    return utils.tensor_to_ndarray(tf.logical_not(x.data))
Example #29
 def f(x):
     if x.dtype == tf.bool:
         return tf.logical_not(x)
     return tf.bitwise.invert(x)
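A small, hedged demo (the snippet above is a nested helper, so this is illustrative only; assumes `import tensorflow as tf`): booleans are negated logically, integers are inverted bitwise.

f(tf.constant([True, False]))      # => [False, True]
f(tf.constant([0, 1], tf.int32))   # => [-1, -2]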
Example #30
 def loop_cond(i, decodes_BxT, unused_cache_BxU_dict):
   finished_B = tf.reduce_any(tf.equal(decodes_BxT, eos_id), axis=1)
   return tf.logical_and(i < max_decode_len,
                         tf.logical_not(tf.reduce_all(finished_B)))