Example #1
  def train(self, sentences):
    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
    tokens_sparse = tf.sparse.SparseTensor(
        indices=token_ids, values=token_values, dense_shape=token_dense_shape)
    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")

    sparse_lookup_ids = tf.sparse.SparseTensor(
        indices=tokens_sparse.indices,
        values=self._words_to_indices(tokens_sparse.values),
        dense_shape=tokens_sparse.dense_shape)
    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)

    # Targets are the next word for each word of the sentence.
    tokens_ids_seq = lookup_ids[:, 0:-1]
    tokens_ids_target = lookup_ids[:, 1:]

    tokens_prefix = tokens[:, 0:-1]

    # Mask determining which positions we care about for the loss: all
    # positions that have a valid non-terminal token.
    mask = tf.logical_and(
        tf.logical_not(tf.equal(tokens_prefix, "")),
        tf.logical_not(tf.equal(tokens_prefix, "<E>")))

    input_mask = tf.cast(mask, tf.int32)

    with tf.GradientTape() as t:
      sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
                                                   tokens_ids_seq)

      lstm_initial_state = self._lstm_cell.get_initial_state(
          sentence_embeddings)

      lstm_output = self._rnn_layer(
          inputs=sentence_embeddings, initial_state=lstm_initial_state)

      # Flatten the LSTM outputs from [batch, time, output_size] to
      # [batch * time, output_size] so the logit layer applies to every position.
      lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])

      logits = self._logit_layer(lstm_output)

      targets = tf.reshape(tokens_ids_target, [-1])
      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=logits)

      # Final loss is the weighted mean of the per-token losses.
      final_loss = tf.math.divide(
          tf.reduce_sum(tf.multiply(losses, weights)),
          tf.reduce_sum(weights),
          name="final_loss")

    watched = t.watched_variables()
    gradients = t.gradient(final_loss, watched)

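    # Apply the raw gradients directly, i.e. a plain SGD step with an implicit
    # learning rate of 1.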
    for w, g in zip(watched, gradients):
      w.assign_sub(g)

    return final_loss
Example #2
    def train_step(inputs):
        """Build `step_fn` for efficientnet learning."""
        images, labels = inputs
        images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
        labels = tf.tile(labels, [FLAGS.ensemble_size, 1])

        num_replicas = tf.cast(strategy.num_replicas_in_sync, tf.float32)
        l2_coeff = tf.cast(FLAGS.l2, tf.float32)

        with tf.GradientTape() as tape:
            logits = model(images, training=True)
            logits = tf.cast(logits, tf.float32)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(
                    labels,
                    logits,
                    from_logits=True,
                    label_smoothing=FLAGS.label_smoothing))

            filtered_variables = []
            for var in model.trainable_variables:
                # Apply l2 on the slow weights and bias terms. This excludes BN
                # parameters and fast weight approximate posterior/prior
                # parameters; be careful with their naming scheme.
                if 'kernel' in var.name or 'bias' in var.name:
                    filtered_variables.append(tf.reshape(var, (-1, )))

            l2_loss = 2 * tf.nn.l2_loss(
                tf.concat(filtered_variables, axis=0))
            loss = negative_log_likelihood + l2_coeff * l2_loss
            scaled_loss = loss / num_replicas

        grads = tape.gradient(scaled_loss, model.trainable_weights)

        # Separate learning rate implementation.
        if FLAGS.fast_weight_lr_multiplier != 1.0:
            grads_and_vars = []
            for grad, var in zip(grads, model.trainable_variables):
                # Apply a different learning rate to the fast weights. This
                # excludes BN and slow weights; be careful with the naming scheme.
                if ('batch_norm' not in var.name and 'kernel' not in var.name):
                    grads_and_vars.append(
                        (grad * FLAGS.fast_weight_lr_multiplier, var))
                else:
                    grads_and_vars.append((grad, var))
            optimizer.apply_gradients(grads_and_vars)
        else:
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        sparse_labels = tf.cast(
            tf.math.argmax(labels, axis=-1, output_type=tf.int32), tf.float32)
        probs = tf.nn.softmax(logits)
        metrics['train/loss'].update_state(loss)
        metrics['train/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['train/accuracy'].update_state(labels, logits)
        metrics['train/ece'].update_state(sparse_labels, probs)

        step_info = {
            'loss/negative_log_likelihood':
            negative_log_likelihood / num_replicas,
            'loss/total_loss': scaled_loss,
        }
        return step_info
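A minimal dispatch sketch, not part of the original example, showing how a per-replica step function like `train_step` is typically driven under a `tf.distribute` strategy; `strategy`, `train_iterator`, and `steps_per_epoch` are assumed to be defined elsewhere:

@tf.function
def train_epoch(train_iterator, steps_per_epoch):
    # Run the per-replica step once per iteration; tf.range keeps the loop
    # inside the compiled graph.
    for _ in tf.range(steps_per_epoch):
        strategy.run(train_step, args=(next(train_iterator),))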
Example #3
 def _call(self, r):
     mean = grad_mean = tf.math.exp(r)
     variance = mean + mean**2 / tf.cast(self._total_count, r.dtype)
     return mean, variance, grad_mean
Example #4
 def _call(self, r):
     mean = tf.math.reciprocal(r)
     grad_mean = -r**-2
     s = tf.cast(self._scale, r.dtype)
     variance = tf.fill(tf.shape(r), s**2.)
     return mean, variance, grad_mean
Example #5
 def _as_distribution(self, r):
     concentration = DeferredTensor(self._concentration,
                                    lambda x: tf.cast(x, r.dtype),
                                    dtype=r.dtype)
     return tfd.Gamma(concentration=concentration,
                      rate=DeferredTensor(r, lambda x: tf.math.exp(-x)))
Example #6
 def _as_distribution(self, r):
     total_count = DeferredTensor(self._total_count,
                                  lambda x: tf.cast(x, r.dtype),
                                  dtype=r.dtype)
     return tfd.Binomial(total_count=total_count, logits=r)
Example #7
 def sparse_read(self, indices, name=None):
     """Reads the value of this variable sparsely, using `gather`."""
     val = self._variable.sparse_read(indices, name=name)
     return tf.cast(val, self._cast_dtype)
Example #8
 def value(self):
     val = self._variable.value()
     if not self._should_cast():
         return val
     return tf.cast(val, self._cast_dtype)
Example #9
def mixup_or_cutmix(batch):
    """Randomly applies one of cutmix or mixup to a batch."""
    logging.info('Randomly applying cutmix or mixup with 50% chance!')
    return tf.cond(
        tf.cast(tf.random.uniform([], maxval=2, dtype=tf.int32), tf.bool),
        lambda: my_mixup(batch), lambda: my_cutmix(batch))
Example #10
 def cast_fn(batch):
     batch = dict(**batch)
     batch['images'] = tf.cast(batch['images'], _to_tf_dtype(dtype))
     return batch
Example #11
def _milstein_step(*, dim, i, written_count, current_state, result, drift_fn,
                   volatility_fn, grad_volatility_fn, wiener_mean, num_samples,
                   times, dt, sqrt_dt, keep_mask, random_type, seed,
                   normal_draws, input_gradients, stratonovich_order,
                   aux_normal_draws):
    """Performs one step of Milstein scheme."""
    current_time = times[i + 1]
    written_count = tf.cast(written_count, tf.int32)
    if normal_draws is not None:
        dw = normal_draws[i]
    else:
        dw = random.mv_normal_sample((num_samples, ),
                                     mean=wiener_mean,
                                     random_type=random_type,
                                     seed=seed)
    if aux_normal_draws is not None:
        stratonovich_draws = []
        for j in range(3):
            stratonovich_draws.append(
                tf.reshape(aux_normal_draws[j][i],
                           [num_samples, dim, stratonovich_order]))
    else:
        stratonovich_draws = []
        # Three sets of normal draws for Stratonovich integrals.
        for j in range(3):
            stratonovich_draws.append(
                random.mv_normal_sample(
                    (num_samples, ),
                    mean=tf.zeros((dim, stratonovich_order),
                                  dtype=current_state.dtype,
                                  name='stratonovich_draws_{}'.format(j)),
                    random_type=random_type,
                    seed=seed))

    if dim == 1:
        drift = drift_fn(current_time, current_state)
        vol = volatility_fn(current_time, current_state)
        grad_vol = grad_volatility_fn(current_time, current_state,
                                      tf.ones_like(current_state))
        next_state = _milstein_1d(dw=dw,
                                  dt=dt[i],
                                  sqrt_dt=sqrt_dt[i],
                                  current_state=current_state,
                                  drift=drift,
                                  vol=vol,
                                  grad_vol=grad_vol)
    else:
        drift = drift_fn(current_time, current_state)
        vol = volatility_fn(current_time, current_state)
        # This is a list of size equal to the dimension of the state space `dim`.
        # It contains tensors of shape [num_samples, dim, wiener_dim] representing
        # the gradient of the volatility function. In our case, the dimension of the
        # wiener process `wiener_dim` is equal to the state dimension `dim`.
        grad_vol = [
            grad_volatility_fn(current_time, current_state, start)
            for start in input_gradients
        ]
        next_state = _milstein_nd(dim=dim,
                                  num_samples=num_samples,
                                  dw=dw,
                                  dt=dt[i],
                                  sqrt_dt=sqrt_dt[i],
                                  current_state=current_state,
                                  drift=drift,
                                  vol=vol,
                                  grad_vol=grad_vol,
                                  stratonovich_draws=stratonovich_draws,
                                  stratonovich_order=stratonovich_order)

    result = utils.maybe_update_along_axis(tensor=result,
                                           do_update=keep_mask[i + 1],
                                           ind=written_count,
                                           axis=1,
                                           new_tensor=tf.expand_dims(
                                               next_state, axis=1))
    written_count += tf.cast(keep_mask[i + 1], dtype=tf.int32)
    return i + 1, written_count, next_state, result
Example #12
def sample(*,
           dim,
           drift_fn,
           volatility_fn,
           times,
           time_step=None,
           num_time_steps=None,
           num_samples=1,
           initial_state=None,
           grad_volatility_fn=None,
           random_type=None,
           seed=None,
           swap_memory=True,
           skip=0,
           precompute_normal_draws=True,
           watch_params=None,
           stratonovich_order=5,
           dtype=None,
           name=None):
    r"""Returns a sample paths from the process using the Milstein method.

  For an Ito process,

  ```
    dX = a(t, X_t) dt + b(t, X_t) dW_t
  ```
  given drift `a`, volatility `b` and derivative of volatility `b'`, the
  Milstein method generates a
  sequence {Y_n} approximating X

  ```
  Y_{n+1} = Y_n + a(t_n, Y_n) dt + b(t_n, Y_n) dW_n + \frac{1}{2} b(t_n, Y_n)
  b'(t_n, Y_n) ((dW_n)^2 - dt)
  ```
  where `dt = t_{n+1} - t_n`, `dW_n = (N(0, t_{n+1}) - N(0, t_n))` and `N` is a
  sample from the Normal distribution.

  In higher dimensions, when `a(t, X_t)` is a d-dimensional vector valued
  function and `W_t` is a d-dimensional Wiener process, we have for the kth
  element of the expansion:

  ```
  Y_{n+1}[k] = Y_n[k] + a(t_n, Y_n)[k] dt + \sum_{j=1}^d b(t_n, Y_n)[k, j]
  dW_n[j] + \sum_{j_1=1}^d \sum_{j_2=1}^d L_{j_1} b(t_n, Y_n)[k, j_2] I(j_1,
  j_2)
  ```
  where `L_{j} = \sum_{i=1}^d b(t_n, Y_n)[i, j] \frac{\partial}{\partial x^i}`
  is an operator and `I(j_1, j_2) = \int_{t_n}^{t_{n+1}} \int_{t_n}^{s_1}
  dW_{s_2}[j_1] dW_{s_1}[j_2]` is a multiple Ito integral.


  See [1] and [2] for details.

  #### References
  [1]: Wikipedia. Milstein method:
  https://en.wikipedia.org/wiki/Milstein_method
  [2]: Peter E. Kloeden,  Eckhard Platen. Numerical Solution of Stochastic
    Differential Equations. Springer. 1992

  Args:
    dim: Python int greater than or equal to 1. The dimension of the Ito
      Process.
    drift_fn: A Python callable to compute the drift of the process. The
      callable should accept two real `Tensor` arguments of the same dtype. The
      first argument is the scalar time t, the second argument is the value of
      Ito process X - tensor of shape `batch_shape + [dim]`. The result is
      value of drift a(t, X). The return value of the callable is a real
      `Tensor` of the same dtype as the input arguments and of shape
      `batch_shape + [dim]`.
    volatility_fn: A Python callable to compute the volatility of the process.
      The callable should accept two real `Tensor` arguments of the same dtype
      as `times`. The first argument is the scalar time t, the second argument
      is the value of Ito process X - tensor of shape `batch_shape + [dim]`. The
      result is value of volatility b(t, X). The return value of the callable is
      a real `Tensor` of the same dtype as the input arguments and of shape
      `batch_shape + [dim, dim]`.
    times: Rank 1 `Tensor` of increasing positive real values. The times at
      which the path points are to be evaluated.
    time_step: An optional scalar real `Tensor` - maximal distance between
      points in grid in Milstein schema.
      Either this or `num_time_steps` should be supplied.
      Default value: `None`.
    num_time_steps: An optional scalar integer `Tensor` - a total number of time
      steps performed by the algorithm. The maximal distance between points in
      grid is bounded by `times[-1] / (num_time_steps - times.shape[0])`.
      Either this or `time_step` should be supplied.
      Default value: `None`.
    num_samples: Positive scalar `int`. The number of paths to draw.
      Default value: 1.
    initial_state: `Tensor` of shape `[dim]`. The initial state of the
      process.
      Default value: None which maps to a zero initial state.
    grad_volatility_fn: An optional python callable to compute the gradient of
      `volatility_fn`. The callable should accept three real `Tensor` arguments
      of the same dtype as `times`. The first argument is the scalar time t. The
      second argument is the value of Ito process X - tensor of shape
      `batch_shape + [dim]`. The third argument is a tensor of input gradients
      of shape `batch_shape + [dim]` to pass to `gradient.fwd_gradient`. The
      result is a list of values corresponding to the forward gradient of
      volatility b(t, X) with respect to X. The return value of the callable is
      a list of size `dim` containing real `Tensor`s of the same dtype as the
      input arguments and of shape `batch_shape + [dim, dim]`. Each index of the
      list corresponds to a dimension of the state. If `None`, the gradient is
      computed from `volatility_fn` using forward differentiation.
    random_type: Enum value of `RandomType`. The type of (quasi)-random number
      generator to use to generate the paths.
      Default value: None which maps to the standard pseudo-random numbers.
    seed: Seed for the random number generator. The seed is only relevant if
      `random_type` is one of `[STATELESS, PSEUDO, HALTON_RANDOMIZED,
      PSEUDO_ANTITHETIC, STATELESS_ANTITHETIC]`. For `PSEUDO`,
      `PSEUDO_ANTITHETIC` and `HALTON_RANDOMIZED` the seed should be a Python
      integer. For `STATELESS` and `STATELESS_ANTITHETIC` the seed must be
      supplied as an integer `Tensor` of shape `[2]`.
      Default value: `None` which means no seed is set.
    swap_memory: A Python bool. Whether GPU-CPU memory swap is enabled for this
      op. See an equivalent flag in `tf.while_loop` documentation for more
      details. Useful when computing a gradient of the op since `tf.while_loop`
      is used to propagate stochastic process in time.
      Default value: True.
    skip: `int32` 0-d `Tensor`. The number of initial points of the Sobol or
      Halton sequence to skip. Used only when `random_type` is 'SOBOL',
      'HALTON', or 'HALTON_RANDOMIZED', otherwise ignored.
      Default value: `0`.
    precompute_normal_draws: Python bool. Indicates whether the noise increments
      `N(0, t_{n+1}) - N(0, t_n)` are precomputed. For `HALTON` and `SOBOL`
      random types the increments are always precomputed. While the resulting
      graph consumes more memory, the performance gains might be significant.
      Default value: `True`.
    watch_params: An optional list of zero-dimensional `Tensor`s of the same
      `dtype` as `initial_state`. If provided, specifies `Tensor`s with respect
      to which the differentiation of the sampling function will happen. A more
      efficient algorithm is used when `watch_params` are specified. Note that
      the function becomes differentiable only w.r.t. these `Tensor`s and the
      `initial_state`. The gradient w.r.t. any other `Tensor` is set to zero.
    stratonovich_order: A positive integer. The number of terms to use when
      calculating the approximate Stratonovich integrals in the multidimensional
      scheme. Stratonovich integrals are an alternative to Ito integrals, and
      can be used interchangeably when defining the higher order terms in the
      update equation. We use Stratonovich integrals here because they have a
      convenient approximation scheme for calculating cross terms involving
      different components of the Wiener process. See Eq. 8.10 in Section 5.8 of
      [2]. Default value: `5`.
    dtype: `tf.Dtype`. If supplied the dtype for the input and output `Tensor`s.
      Default value: None which means that the dtype implied by `times` is used.
    name: Python string. The name to give this op.
      Default value: `None` which maps to `milstein_sample`.
  """
    name = name or 'milstein_sample'
    with tf.name_scope(name):
        if stratonovich_order <= 0:
            raise ValueError(
                '`stratonovich_order` must be a positive integer.')
        times = tf.convert_to_tensor(times, dtype=dtype)
        if dtype is None:
            dtype = times.dtype
        if initial_state is None:
            initial_state = tf.zeros(dim, dtype=dtype)
        initial_state = tf.convert_to_tensor(initial_state,
                                             dtype=dtype,
                                             name='initial_state')
        num_requested_times = tf.shape(times)[0]
        # Create a time grid for the Milstein scheme.
        if num_time_steps is not None and time_step is not None:
            raise ValueError(
                'Only one of either `num_time_steps` or `time_step` '
                'should be defined but not both')
        if time_step is None:
            if num_time_steps is None:
                raise ValueError(
                    'Either `num_time_steps` or `time_step` should be '
                    'defined.')
            num_time_steps = tf.convert_to_tensor(num_time_steps,
                                                  dtype=tf.int32,
                                                  name='num_time_steps')
            time_step = times[-1] / tf.cast(num_time_steps, dtype=dtype)
        else:
            time_step = tf.convert_to_tensor(time_step,
                                             dtype=dtype,
                                             name='time_step')
        times, keep_mask, time_indices = utils.prepare_grid(
            times=times,
            time_step=time_step,
            num_time_steps=num_time_steps,
            dtype=dtype)
        if watch_params is not None:
            watch_params = [
                tf.convert_to_tensor(param, dtype=dtype)
                for param in watch_params
            ]
        if grad_volatility_fn is None:

            def _grad_volatility_fn(current_time, current_state,
                                    input_gradients):
                return gradient.fwd_gradient(
                    functools.partial(volatility_fn, current_time),
                    current_state,
                    input_gradients=input_gradients,
                    unconnected_gradients=tf.UnconnectedGradients.ZERO)

            grad_volatility_fn = _grad_volatility_fn

        input_gradients = None
        if dim > 1:
            input_gradients = tf.unstack(tf.eye(dim, dtype=dtype))
            input_gradients = [
                tf.broadcast_to(start, [num_samples, dim])
                for start in input_gradients
            ]

        return _sample(dim=dim,
                       drift_fn=drift_fn,
                       volatility_fn=volatility_fn,
                       grad_volatility_fn=grad_volatility_fn,
                       times=times,
                       time_step=time_step,
                       keep_mask=keep_mask,
                       num_requested_times=num_requested_times,
                       num_samples=num_samples,
                       initial_state=initial_state,
                       random_type=random_type,
                       seed=seed,
                       swap_memory=swap_memory,
                       skip=skip,
                       precompute_normal_draws=precompute_normal_draws,
                       watch_params=watch_params,
                       time_indices=time_indices,
                       input_gradients=input_gradients,
                       stratonovich_order=stratonovich_order,
                       dtype=dtype)
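A hypothetical usage sketch (not from the source) for a one-dimensional geometric-Brownian-motion-style process, assuming the `sample` function above is in scope; the drift and volatility callables follow the shape contract described in the docstring:

def drift_fn(t, x):
    # a(t, X) = mu * X, shape batch_shape + [dim].
    return 0.05 * x

def volatility_fn(t, x):
    # b(t, X), shape batch_shape + [dim, dim].
    return 0.2 * tf.expand_dims(x, axis=-1)

paths = sample(
    dim=1,
    drift_fn=drift_fn,
    volatility_fn=volatility_fn,
    times=[0.25, 0.5, 1.0],
    time_step=0.01,
    num_samples=1000,
    initial_state=[1.0])

The result holds one simulated state per requested time for each of the `num_samples` paths.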
Example #13
def find_interval_index(query_xs,
                        interval_lower_xs,
                        last_interval_is_closed=False,
                        dtype=None,
                        name=None):
    """Function to find the index of the interval where query points lies.

  Given a list of adjacent half-open intervals [x_0, x_1), [x_1, x_2), ...,
  [x_{n-1}, x_n), [x_n, inf), described by a list [x_0, x_1, ..., x_{n-1}, x_n],
  returns the index of the interval in which each query point lies. If
  x >= x_n, n is returned, and if x < x_0, -1 is returned. If
  `last_interval_is_closed` is set to `True`, the last interval [x_{n-1}, x_n]
  is interpreted as closed (including x_n).

  ### Example

  ```python
  interval_lower_xs = [0.25, 0.5, 1.0, 2.0, 3.0]
  query_xs = [0.25, 3.0, 5.0, 0.0, 0.5, 0.8]
  result = find_interval_index(query_xs, interval_lower_xs)
  # result == [0, 4, 4, -1, 1, 1]
  ```

  Args:
    query_xs: Rank 1 real `Tensor` of any size, the list of x coordinates for
      which the interval index is to be found.
    interval_lower_xs: Rank 1 `Tensor` of the same dtype as `query_xs`. The
      values x_0, ..., x_n that define the interval starts. The values must be
      strictly increasing.
    last_interval_is_closed: If set to `True`, the last interval is interpreted
      as closed.
    dtype: Optional `tf.Dtype`. If supplied, the dtype for `query_xs` and
      `interval_lower_xs`.
      Default value: None which maps to the default dtype inferred by TensorFlow
        (float32).
    name: Optional name of the operation.

  Returns:
    A tensor that matches the shape of `query_xs` with dtype=int32 containing
    the indices of the intervals containing query points. `-1` means the query
    point lies before all intervals and `n-1` means that the point lies in the
    last half-open interval (if `last_interval_is_closed` is `False`) or that
    the point lies to the right of all intervals (if `last_interval_is_closed`
    is `True`).
  """
    with tf.compat.v1.name_scope(
            name,
            default_name='find_interval_index',
            values=[query_xs, interval_lower_xs, last_interval_is_closed]):
        # TODO(b/138988951): add ability to validate that intervals are increasing.
        # TODO(b/138988951): validate that if last_interval_is_closed, input size
        # must be > 1.
        query_xs = tf.convert_to_tensor(query_xs, dtype=dtype)
        interval_lower_xs = tf.convert_to_tensor(interval_lower_xs,
                                                 dtype=dtype)

        # Result assuming that last interval is half-open.
        indices = tf.searchsorted(interval_lower_xs, query_xs,
                                  side='right') - 1

        # Handling the branch if the last interval is closed.
        last_index = tf.shape(interval_lower_xs)[-1] - 1
        last_x = tf.gather(interval_lower_xs, [last_index], axis=-1)
        # `should_cap` is a boolean tensor that is true in a cell iff `indices`
        # at that cell equals the last index and the query x is <= the last
        # boundary value.
        should_cap = tf.logical_and(tf.equal(indices, last_index),
                                    tf.less_equal(query_xs, last_x))

        # Cap to last_index - 1 where `should_cap` is true; otherwise keep
        # last_index.
        caps = last_index - tf.cast(should_cap, dtype=tf.dtypes.int32)

        return tf.compat.v1.where(last_interval_is_closed,
                                  tf.minimum(indices, caps), indices)
Example #14
 def fold_in(seed, axes):
     for name in axes:
         axis_index = get_axis_index(name)
         seed = samplers.fold_in(seed, tf.cast(axis_index, tf.int32))
     return seed
Example #15
  def _head(self, neck_outputs):

    # <tf.float32>[time * batch_size, 1, hidden_dim]
    visual_feature = neck_outputs['visual_feature']
    # <tf.float32>[time * batch_size, num_tokens, hidden_dim]
    text_feature = neck_outputs['text_feature']

    # <tf.float32>[time, batch_size, 1, hidden_dim]
    visual_feature = tf.reshape(
        visual_feature,
        [self._current_num_timesteps, self._current_batch_size] +
        visual_feature.shape[1:].as_list())

    # <tf.float32>[batch_size, time, hidden_dim]
    visual_feature = tf.squeeze(visual_feature, axis=2)
    visual_feature = tf.transpose(visual_feature, [1, 0, 2])

    first_true = utils.get_first_true_column(
        tf.reshape(neck_outputs[constants.DISC_MASK],
                   [self._current_num_timesteps, self._current_batch_size]))

    # <tf.float32>[batch_size, num_tokens, hidden_dim]
    text_feature = tf.cond(
        tf.keras.backend.any(first_true),
        lambda: tf.boolean_mask(text_feature, tf.reshape(first_true, [-1])),
        lambda: tf.reshape(text_feature, [
            self._current_num_timesteps, self._current_batch_size
        ] + text_feature.shape[1:].as_list())[0, :, :, :])
    # visual_feature = tf.nn.l2_normalize(visual_feature, axis=2)
    # text_feature = tf.nn.l2_normalize(text_feature, axis=2)

    # <tf.float32>[batch_size, time, num_tokens]
    alpha_i_j = tf.matmul(visual_feature,
                          tf.transpose(text_feature, perm=[0, 2, 1]))
    # <tf.float32>[batch_size, time, num_tokens]
    ealpha_i_j = tf.exp(alpha_i_j)
    sum_i_j = tf.tile(
        tf.expand_dims(tf.reduce_sum(ealpha_i_j, 2), 2),
        [1, 1, tf.shape(ealpha_i_j)[2]])
    mask = tf.cast(
        tf.transpose(
            tf.reshape(neck_outputs[constants.DISC_MASK],
                       [self._current_num_timesteps, self._current_batch_size]),
            perm=[1, 0]), tf.float32)
    # <tf.float32>[batch, time, num_tokens]
    c_i_j = tf.divide(ealpha_i_j, sum_i_j)
    # <tf.float32>[batch, time]
    score = tf.reduce_sum(c_i_j * alpha_i_j, 2)

    escore = tf.exp(-1 * score) * mask
    sum_escore = tf.tile(
        tf.expand_dims(tf.reduce_sum(escore, 1), 1), [1, tf.shape(escore)[1]])
    score_weight = tf.divide(escore, sum_escore)
    similarities = tf.reduce_sum(mask * score * score_weight, 1)
    similarities = tf.expand_dims(similarities, axis=0)
    # [time_step, batch_size]
    similarities = tf.tile(similarities, [self._current_num_timesteps, 1])

    # Apply an affine transform.
    similarities = similarities * self.affine_a + self.affine_b

    output_a = tf.reshape(tf.convert_to_tensor(self.affine_a), [1, 1])
    output_b = tf.reshape(tf.convert_to_tensor(self.affine_b), [1, 1])

    output_a = tf.tile(output_a,
                       [self._current_num_timesteps, self._current_batch_size])
    output_b = tf.tile(output_b,
                       [self._current_num_timesteps, self._current_batch_size])

    return common.AgentOutput(
        policy_logits=similarities, baseline=(output_a, output_b))
Example #16
def spline_slope_constraint(s, dtype=tf.float32):
    """Maps `s` to all positive with `s[..., 0] == s[..., -1] == 1`."""
    # Slice off a position since this is nknots - 2 vs nknots - 1 for bin sizes.
    min_slope = 1e-2
    return tf.math.softplus(tf.cast(s[..., :-1], dtype)) + min_slope
Example #17
def _preprocess_train_image(image, mean_rgb, stddev_rgb):
  image = tf.cast(image, tf.float32)
  image = _augment_image(image)
  image = (image - mean_rgb) / stddev_rgb
  return image
Example #18
 def read_value(self):
     val = self._variable.read_value()
     return tf.cast(val, self._cast_dtype)
Example #19
def _preprocess_eval_image(image, mean_rgb, stddev_rgb):
  image = tf.cast(image, tf.float32)
  image = (image - mean_rgb) / stddev_rgb
  return image
Example #20
 def gather_nd(self, indices, name=None):
     """Gather slices of the variable into a Tensor."""
     val = self._variable.gather_nd(indices, name=name)
     return tf.cast(val, self._cast_dtype)
Example #21
def nelder_mead_one_step(current_simplex,
                         current_objective_values,
                         objective_function=None,
                         dim=None,
                         func_tolerance=None,
                         position_tolerance=None,
                         batch_evaluate_objective=False,
                         reflection=None,
                         expansion=None,
                         contraction=None,
                         shrinkage=None,
                         name=None):
    """A single iteration of the Nelder Mead algorithm."""
    with tf1.name_scope(name, 'nelder_mead_one_step'):
        domain_dtype = current_simplex.dtype.base_dtype
        order = tf.argsort(current_objective_values,
                           direction='ASCENDING',
                           stable=True)
        (best_index, worst_index,
         second_worst_index) = order[0], order[-1], order[-2]

        worst_vertex = current_simplex[worst_index]

        (best_objective_value, worst_objective_value,
         second_worst_objective_value) = (
             current_objective_values[best_index],
             current_objective_values[worst_index],
             current_objective_values[second_worst_index])

        # Compute the centroid of the face opposite the worst vertex.
        face_centroid = tf.reduce_sum(input_tensor=current_simplex,
                                      axis=0) - worst_vertex
        face_centroid /= tf.cast(dim, domain_dtype)

        # Reflect the worst vertex through the opposite face.
        reflected = face_centroid + reflection * (face_centroid - worst_vertex)
        objective_at_reflected = objective_function(reflected)

        num_evaluations = 1
        has_converged = _check_convergence(current_simplex,
                                           current_simplex[best_index],
                                           best_objective_value,
                                           worst_objective_value,
                                           func_tolerance, position_tolerance)

        def _converged_fn():
            return (True, current_simplex, current_objective_values, 0)

        case0 = has_converged, _converged_fn
        accept_reflected = (
            (objective_at_reflected < second_worst_objective_value) &
            (objective_at_reflected >= best_objective_value))
        accept_reflected_fn = _accept_reflected_fn(current_simplex,
                                                   current_objective_values,
                                                   worst_index, reflected,
                                                   objective_at_reflected)
        case1 = accept_reflected, accept_reflected_fn
        do_expansion = objective_at_reflected < best_objective_value
        expansion_fn = _expansion_fn(objective_function, current_simplex,
                                     current_objective_values, worst_index,
                                     reflected, objective_at_reflected,
                                     face_centroid, expansion)
        case2 = do_expansion, expansion_fn
        do_outside_contraction = (
            (objective_at_reflected < worst_objective_value) &
            (objective_at_reflected >= second_worst_objective_value))
        outside_contraction_fn = _outside_contraction_fn(
            objective_function, current_simplex, current_objective_values,
            face_centroid, best_index, worst_index, reflected,
            objective_at_reflected, contraction, shrinkage,
            batch_evaluate_objective)
        case3 = do_outside_contraction, outside_contraction_fn
        default_fn = _inside_contraction_fn(
            objective_function, current_simplex, current_objective_values,
            face_centroid, best_index, worst_index, worst_objective_value,
            contraction, shrinkage, batch_evaluate_objective)
        (converged, next_simplex, next_objective_at_simplex,
         case_evals) = prefer_static.case([case0, case1, case2, case3],
                                          default=default_fn,
                                          exclusive=False)
        next_simplex.set_shape(current_simplex.shape)
        next_objective_at_simplex.set_shape(current_objective_values.shape)
        return (converged, next_simplex, next_objective_at_simplex,
                num_evaluations + case_evals)
Example #22
 def _call(self, r):
     c = tf.cast(self._concentration, r.dtype)
     er = tf.math.exp(r)
     mean = grad_mean = er * c
     variance = er * mean
     return mean, variance, grad_mean
Example #23
def _at_least_x_are_equal(a, b, x):
    """At least `x` of `a` and `b` `Tensors` are equal."""
    match = tf.equal(a, b)
    match = tf.cast(match, tf.int32)
    return tf.greater_equal(tf.reduce_sum(match), x)
Example #24
 def _call(self, r):
     mean = tf.identity(r)
     grad_mean = tf.ones_like(r)
     s = tf.cast(self._scale, r.dtype)
     variance = tf.fill(tf.shape(r), s**2.)
     return mean, variance, grad_mean
Example #25
def equal32(x, y):
    return tf.cast(tf.equal(x, y), tf.float32)
Example #26
 def _as_distribution(self, r):
     scale = DeferredTensor(self._scale,
                            lambda x: tf.cast(x, r.dtype),
                            dtype=r.dtype)
     return tfd.Normal(loc=DeferredTensor(r, tf.math.reciprocal),
                       scale=scale)
Example #27
def compute_loss_and_metrics(mu,
                             log_sigma_sq,
                             regression_targets,
                             labels,
                             task_type,
                             model_uncertainty,
                             loss_config,
                             regularization_loss=0.,
                             confidence_interval=95,
                             mode='train'):
    """Computes loss statistics and other metrics."""

    scalars_to_log = dict()
    vectors_to_log = dict()
    scalars_to_log['regularization_loss'] = regularization_loss
    vectors_to_log['mu'] = mu

    if task_type == TASK_CLASSIFICATION:
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=mu, labels=labels, name='cross_entropy')
        classification_loss = tf.reduce_mean(cross_entropy, name='class_loss')
        total_loss = classification_loss
        sigma = None
        scalars_to_log['classification_loss'] = classification_loss

        predicted_labels = tf.argmax(mu, axis=1)
        correct_predictions = equal32(predicted_labels, labels)

    else:
        regression_loss = mse_loss(mu, regression_targets)
        if 'mse_normalize' in loss_config and loss_config['mse_normalize']:
            assert task_type in [
                TASK_GROUNDED_UNNORMALIZED_REGRESSION,
                TASK_NORMALIZED_REGRESSION
            ]
            regression_loss = normalize_regression_loss(regression_loss, mu)

        avg_regression_loss = tf.reduce_mean(regression_loss)
        vectors_to_log['regression_loss'] = regression_loss
        scalars_to_log['regression_loss'] = avg_regression_loss

        scalars_to_log['avg_mu'] = tf.reduce_mean(mu)
        scalars_to_log['var_mu'] = tf.reduce_mean(
            mse_loss(mu, tf.reduce_mean(mu)))

        predicted_labels = tf.cast(mu > 0, tf.int64)
        correct_predictions = equal32(predicted_labels, labels)

        if model_uncertainty:
            # This implements Eq. (1) in https://arxiv.org/pdf/1612.01474.pdf
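            # For a Gaussian likelihood, Eq. (1) reads
            #   -log p(y | x) = 0.5 * log(sigma^2) + (y - mu)^2 / (2 * sigma^2) + const.
            # Assuming `mse_loss` returns the per-example squared error, the two
            # terms below reproduce this up to the constant and a factor of 2,
            # with sigma^2 parameterized as exp(log_sigma_sq).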
            inv_sigma_sq = tf.math.exp(-log_sigma_sq)
            scaled_regression_loss = regression_loss * inv_sigma_sq
            scaled_regression_loss = tf.reduce_mean(scaled_regression_loss)
            uncertainty_loss = tf.reduce_mean(log_sigma_sq)
            total_loss = uncertainty_loss + scaled_regression_loss

            scalars_to_log['uncertainty_loss'] = uncertainty_loss
            scalars_to_log['scaled_regression_loss'] = scaled_regression_loss
            scalars_to_log['uncertainty_plus_scaled_regression'] = total_loss

            sigma = tf.math.exp(log_sigma_sq / 2.)
            vectors_to_log['sigma'] = sigma
            scalars_to_log['avg_sigma'] = tf.reduce_mean(sigma)
            var_sigma = tf.reduce_mean(mse_loss(sigma, tf.reduce_mean(sigma)))
            scalars_to_log['var_sigma'] = var_sigma

            # Compute # of labels that fall into the confidence interval.
            std_factor = get_std_factor_from_confidence_percent(
                confidence_interval)
            lower_bound = mu - std_factor * sigma
            upper_bound = mu + std_factor * sigma
            preds = tf.logical_and(tf.greater(regression_targets, lower_bound),
                                   tf.less(regression_targets, upper_bound))
            percent_in_conf_interval = tf.reduce_mean(
                tf.cast(preds, tf.float32))
            scalars_to_log[
                'percent_in_conf_interval'] = percent_in_conf_interval * 100

            error_sigma_corr = tfp.stats.correlation(x=regression_loss,
                                                     y=sigma,
                                                     event_axis=None)
            scalars_to_log['error_sigma_correlation'] = error_sigma_corr

            dists = tfp.distributions.Normal(mu, sigma)
            probs = dists.prob(regression_targets)
            scalars_to_log['avg_prob'] = tf.reduce_mean(probs)

        else:
            total_loss = avg_regression_loss

    loss_name = str(mode) + '_loss'
    total_loss = tf.add(total_loss, regularization_loss, name=loss_name)
    scalars_to_log[loss_name] = total_loss
    vectors_to_log['correct_predictions'] = correct_predictions
    scalars_to_log['prediction_accuracy'] = tf.reduce_mean(correct_predictions)

    # Validate that the metrics output are exactly those expected.
    expected = get_all_metric_names(task_type, model_uncertainty, loss_config,
                                    mode, False)
    assert set(expected) == set(scalars_to_log.keys())

    return scalars_to_log, vectors_to_log
Example #28
 def _call(self, r):
     mean = tf.math.softplus(r)
     grad_mean = tf.math.sigmoid(r)
     variance = mean + mean**2 / tf.cast(self._total_count, r.dtype)
     return mean, variance, grad_mean
Example #29
# (This snippet picks up inside a `model.fit(...)` call; the preceding lines
# are omitted in the source.)
          epochs=1,
          validation_data=(test_images, test_labels))

converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

tflite_models_dir = pathlib.Path("/tmp/mnist_tflite_models/")
tflite_models_dir.mkdir(exist_ok=True, parents=True)

tflite_model_file = tflite_models_dir / "mnist_model.tflite"
tflite_model_file.write_bytes(tflite_model)

converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]

mnist_train, _ = tf.keras.datasets.mnist.load_data()
images = tf.cast(mnist_train[0], tf.float32) / 255.0
mnist_ds = tf.data.Dataset.from_tensor_slices((images)).batch(1)


def representative_data_gen():
    for input_value in mnist_ds.take(100):
        yield [input_value]


converter.representative_dataset = representative_data_gen

tflite_model_quant = converter.convert()
tflite_model_quant_file = tflite_models_dir / "mnist_model_quant.tflite"
tflite_model_quant_file.write_bytes(tflite_model_quant)

# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
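A short follow-up sketch, not part of the original excerpt, showing how the quantized model produced above could be loaded and run with the TFLite interpreter; the file and dataset names are the ones defined above:

interpreter = tf.lite.Interpreter(model_path=str(tflite_model_quant_file))
interpreter.allocate_tensors()
input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]

# Run a single batched example through the quantized model.
for test_image in mnist_ds.take(1):
    interpreter.set_tensor(input_index, test_image)
    interpreter.invoke()
    predictions = interpreter.get_tensor(output_index)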
Example #30
def spline_bin_size_constraint(x, lo=-1, hi=1, dtype=tf.float32):
    """Maps innermost axis of `x` to positive values."""
    nbins = tf.cast(tf.shape(x)[-1], dtype)
    min_width = 1e-2
    scale = hi - lo - nbins * min_width
    return tf.math.softmax(tf.cast(x, dtype)) * scale + min_width
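These constraint helpers (this one together with `spline_slope_constraint` from Example #16) look like parameter transforms for a rational-quadratic spline bijector; below is a hedged wiring sketch, where `nbins` and the unconstrained parameter tensors `w`, `h`, and `s` are hypothetical placeholders for what a conditioning network would normally produce:

import tensorflow as tf
import tensorflow_probability as tfp

nbins = 32
# Hypothetical unconstrained parameters, one value per bin.
w = tf.random.normal([nbins])
h = tf.random.normal([nbins])
s = tf.random.normal([nbins])

# Bin widths/heights sum to the spline range; slopes are positive with one
# position sliced off by the constraint (interior knots only).
bijector = tfp.bijectors.RationalQuadraticSpline(
    bin_widths=spline_bin_size_constraint(w),
    bin_heights=spline_bin_size_constraint(h),
    knot_slopes=spline_slope_constraint(s))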
Example #31
def main(argv):
    del argv  # unused arg
    if not FLAGS.use_gpu:
        raise ValueError('Only GPU is currently supported.')
    if FLAGS.num_cores > 1:
        raise ValueError('Only a single accelerator is currently supported.')
    tf.enable_v2_behavior()
    tf.random.set_seed(FLAGS.seed)
    tf.io.gfile.makedirs(FLAGS.output_dir)

    batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
    steps_per_eval = IMAGENET_VALIDATION_IMAGES // batch_size

    dataset_test = utils.ImageNetInput(is_training=False,
                                       data_dir=FLAGS.data_dir,
                                       batch_size=FLAGS.per_core_batch_size,
                                       use_bfloat16=False).input_fn()
    test_datasets = {'clean': dataset_test}
    corruption_types, max_intensity = utils.load_corrupted_test_info()
    for name in corruption_types:
        for intensity in range(1, max_intensity + 1):
            dataset_name = '{0}_{1}'.format(name, intensity)
            test_datasets[dataset_name] = utils.load_corrupted_test_dataset(
                name=name,
                intensity=intensity,
                batch_size=FLAGS.per_core_batch_size,
                drop_remainder=True,
                use_bfloat16=False)

    model = deterministic_model.resnet50(input_shape=(224, 224, 3),
                                         num_classes=NUM_CLASSES)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())
    # Search for checkpoints from their index file; then remove the index suffix.
    ensemble_filenames = tf.io.gfile.glob(
        os.path.join(FLAGS.checkpoint_dir, '**/*.index'))
    ensemble_filenames = [filename[:-6] for filename in ensemble_filenames]
    ensemble_size = len(ensemble_filenames)
    logging.info('Ensemble size: %s', ensemble_size)
    logging.info('Ensemble number of weights: %s',
                 ensemble_size * model.count_params())
    logging.info('Ensemble filenames: %s', str(ensemble_filenames))
    checkpoint = tf.train.Checkpoint(model=model)

    # Write model predictions to files.
    num_datasets = len(test_datasets)
    for m, ensemble_filename in enumerate(ensemble_filenames):
        checkpoint.restore(ensemble_filename)
        for n, (name, test_dataset) in enumerate(test_datasets.items()):
            filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
            filename = os.path.join(FLAGS.output_dir, filename)
            if not tf.io.gfile.exists(filename):
                logits = []
                test_iterator = iter(test_dataset)
                for _ in range(steps_per_eval):
                    features, _ = next(test_iterator)
                    logits.append(model(features, training=False))

                logits = tf.concat(logits, axis=0)
                with tf.io.gfile.GFile(filename, 'w') as f:
                    np.save(f, logits.numpy())
            percent = (m * num_datasets +
                       (n + 1)) / (ensemble_size * num_datasets)
            message = (
                '{:.1%} completion for prediction: ensemble member {:d}/{:d}. '
                'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size,
                                           n + 1, num_datasets))
            logging.info(message)

    metrics = {
        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
        'test/gibbs_cross_entropy': tf.keras.metrics.Mean(),
        'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/ece':
        ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
    }
    corrupt_metrics = {}
    for name in test_datasets:
        corrupt_metrics['test/nll_{}'.format(name)] = tf.keras.metrics.Mean()
        corrupt_metrics['test/accuracy_{}'.format(name)] = (
            tf.keras.metrics.SparseCategoricalAccuracy())
        corrupt_metrics['test/ece_{}'.format(
            name)] = ed.metrics.ExpectedCalibrationError(
                num_bins=FLAGS.num_bins)

    # Evaluate model predictions.
    for n, (name, test_dataset) in enumerate(test_datasets.items()):
        logits_dataset = []
        for m in range(ensemble_size):
            filename = '{dataset}_{member}.npy'.format(dataset=name, member=m)
            filename = os.path.join(FLAGS.output_dir, filename)
            with tf.io.gfile.GFile(filename, 'rb') as f:
                logits_dataset.append(np.load(f))

        logits_dataset = tf.convert_to_tensor(logits_dataset)
        test_iterator = iter(test_dataset)
        for step in range(steps_per_eval):
            _, labels = next(test_iterator)
            logits = logits_dataset[:, (step * batch_size):((step + 1) *
                                                            batch_size)]
            labels = tf.cast(tf.reshape(labels, [-1]), tf.int32)
            negative_log_likelihood = tf.reduce_mean(
                ensemble_negative_log_likelihood(labels, logits))
            per_probs = tf.nn.softmax(logits)
            probs = tf.reduce_mean(per_probs, axis=0)
            if name == 'clean':
                gibbs_ce = tf.reduce_mean(gibbs_cross_entropy(labels, logits))
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/gibbs_cross_entropy'].update_state(gibbs_ce)
                metrics['test/accuracy'].update_state(labels, probs)
                metrics['test/ece'].update_state(labels, probs)
            else:
                corrupt_metrics['test/nll_{}'.format(name)].update_state(
                    negative_log_likelihood)
                corrupt_metrics['test/accuracy_{}'.format(name)].update_state(
                    labels, probs)
                corrupt_metrics['test/ece_{}'.format(name)].update_state(
                    labels, probs)

        message = (
            '{:.1%} completion for evaluation: dataset {:d}/{:d}'.format(
                (n + 1) / num_datasets, n + 1, num_datasets))
        logging.info(message)

    corrupt_results = utils.aggregate_corrupt_metrics(
        corrupt_metrics, corruption_types, max_intensity,
        FLAGS.alexnet_errors_path)
    total_results = {name: metric.result() for name, metric in metrics.items()}
    total_results.update(corrupt_results)
    logging.info('Metrics: %s', total_results)