Пример #1
0
        def step_fn(inputs):
            """Evaluates one batch on a replica and updates the test metrics.

            The image batch is tiled `FLAGS.num_models` times along axis 0 so
            that the model produces one block of predictions per ensemble
            member. Per-member metrics are updated from each block, and the
            ensemble metrics are updated from the mean of the member
            probabilities.

            Args:
              inputs: an `(images, labels)` pair.
            """
            images, labels = inputs
            tiled_images = tf.tile(images, [FLAGS.num_models, 1, 1, 1])
            logits = model(tiled_images, training=False)
            if FLAGS.use_bfloat16:
                # Compute the softmax in float32.
                logits = tf.cast(logits, tf.float32)

            # One probability tensor per ensemble member.
            member_probs_list = tf.split(tf.nn.softmax(logits),
                                         num_or_size_splits=FLAGS.num_models,
                                         axis=0)
            for member_index, member_probs in enumerate(member_probs_list):
                member_loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, member_probs)
                test_nlls[member_index].update_state(member_loss)
                test_accs[member_index].update_state(labels, member_probs)

            # Ensemble prediction: average the member probabilities.
            ensemble_probs = tf.reduce_mean(member_probs_list, axis=0)
            ensemble_ce = tf.keras.losses.sparse_categorical_crossentropy(
                labels, ensemble_probs)
            test_nll.update_state(tf.reduce_mean(ensemble_ce))
            test_accuracy.update_state(labels, ensemble_probs)
Пример #2
0
    def __call__(self, mask_outputs, mask_targets, select_class_targets):
        """Computes the mask loss of Mask-RCNN.

        This function implements the mask loss of Mask-RCNN. As the
        `mask_outputs` produces `num_classes` masks for each RoI, the
        reference model expands `mask_targets` to match the shape of
        `mask_outputs` and selects only the target that the RoI has a maximum
        overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py)  # pylint: disable=line-too-long
        Instead, this implementation selects the `mask_outputs` by the
        `class_targets` so that it doesn't expand `mask_targets`. Note that the
        selection logic is done in the post-processing of mask_rcnn_fn in
        mask_rcnn_architecture.py.

        Args:
          mask_outputs: a float tensor representing the prediction for each
            mask, with a shape of
            [batch_size, num_masks, mask_height, mask_width].
          mask_targets: a float tensor representing the binary mask of ground
            truth labels for each mask with a shape of
            [batch_size, num_masks, mask_height, mask_width].
          select_class_targets: a tensor with a shape of
            [batch_size, num_masks], representing the foreground mask targets.

        Returns:
          mask_loss: a float tensor representing the sigmoid cross-entropy
            over masks whose class target is positive, reduced with
            `SUM_BY_NONZERO_WEIGHTS`.
        """
        with tf.compat.v1.name_scope('mask_loss'):
            static_shape = mask_outputs.get_shape().as_list()
            batch_size, num_masks, mask_height, mask_width = static_shape

            # Only masks with a positive class target contribute to the loss;
            # spread that per-mask indicator over every pixel of the mask.
            is_foreground = tf.greater(select_class_targets, 0)
            per_mask_weights = tf.reshape(is_foreground,
                                          [batch_size, num_masks, 1, 1])
            weights = tf.tile(per_mask_weights,
                              [1, 1, mask_height, mask_width])

            reduction = tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS
            return tf.compat.v1.losses.sigmoid_cross_entropy(
                mask_targets,
                mask_outputs,
                weights=weights,
                reduction=reduction)
Пример #3
0
 def _fast_rcnn_box_loss(self,
                         box_outputs,
                         box_targets,
                         class_targets,
                         normalizer=1.0,
                         delta=1.):
     """Computes the Huber box regression loss over foreground boxes.

     Args:
       box_outputs: predicted box tensor.
       box_targets: target box tensor.
       class_targets: class targets; entries greater than 0 mark the boxes
         that contribute to the loss.
       normalizer: extra divisor applied to the reduced loss.
       delta: Huber loss threshold. It is typically around the mean value of
         the regression targets; for instance, the regression targets of a
         512x512 input with 6 anchors on the P2-P6 pyramid are about
         [0.1, 0.1, 0.2, 0.2].

     Returns:
       The Huber loss reduced with `SUM_BY_NONZERO_WEIGHTS`, divided by
       `normalizer`.
     """
     with tf.compat.v1.name_scope('fast_rcnn_box_loss'):
         # Replicate the foreground indicator across the 4 box coordinates.
         is_foreground = tf.greater(class_targets, 0)
         coordinate_weights = tf.tile(
             tf.expand_dims(is_foreground, axis=2), [1, 1, 4])

         # The loss is first normalized by the number of non-zero weights,
         # then by the caller-provided normalizer.
         reduction = tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS
         huber = tf.compat.v1.losses.huber_loss(
             box_targets,
             box_outputs,
             weights=coordinate_weights,
             delta=delta,
             reduction=reduction)
         return huber / normalizer
    def class_loss(self,
                   cls_outputs,
                   cls_targets,
                   num_positives,
                   ignore_label=-2):
        """Computes RetinaNet classification (focal) loss.

        Args:
          cls_outputs: classification outputs passed to `focal_loss`.
          cls_targets: integer class targets; they are one-hot encoded with
            `self._num_classes` classes.
          num_positives: value forwarded to `focal_loss`.
          ignore_label: target value whose positions are excluded from the
            loss.

        Returns:
          The sum of the focal loss over all non-ignored positions.
        """
        # One-hot encode the labels and fold the trailing two axes together.
        one_hot_targets = tf.one_hot(cls_targets, self._num_classes)
        bs, height, width, _, _ = one_hot_targets.get_shape().as_list()
        one_hot_targets = tf.reshape(one_hot_targets,
                                     [bs, height, width, -1])
        loss = focal_loss(cls_outputs, one_hot_targets,
                          self._focal_loss_alpha, self._focal_loss_gamma,
                          num_positives)

        # 0.0 where the target equals `ignore_label`, 1.0 everywhere else.
        is_ignored = tf.equal(cls_targets, ignore_label)
        keep_weights = tf.where(
            is_ignored,
            tf.zeros_like(cls_targets, dtype=tf.float32),
            tf.ones_like(cls_targets, dtype=tf.float32),
        )
        # Repeat the weight once per class so it lines up with `loss`.
        keep_weights = tf.tile(tf.expand_dims(keep_weights, -1),
                               [1, 1, 1, 1, self._num_classes])
        keep_weights = tf.reshape(keep_weights, tf.shape(input=loss))
        return tf.reduce_sum(input_tensor=keep_weights * loss)
Пример #5
0
    def _sample_n(self, n, seed=None, conditional_input=None, training=False):
        """Samples from the distribution, with optional conditional input.

        Args:
          n: `int`, number of samples desired.
          seed: `int`, seed for RNG. Setting a random seed enforces
            reproducibility of the samples between sessions (not within a
            single session).
          conditional_input: `Tensor` on which to condition the distribution
            (e.g. class labels), or `None`.
          training: `bool` or `None`. If `bool`, it controls the dropout layer,
            where `True` implies dropout is active. If `None`, it defers to
            Keras' handling of train/eval status.

        Returns:
          samples: a `Tensor` of shape `[n, height, width, num_channels]`.
        """
        if conditional_input is not None:
            conditional_input = tf.convert_to_tensor(conditional_input,
                                                     dtype=self.dtype)
            conditional_event_rank = tensorshape_util.rank(
                self.conditional_shape)
            conditional_sample_rank = prefer_static.rank(
                conditional_input) - conditional_event_rank

            # If `conditional_input` has no sample dimensions, prepend a sample
            # dimension
            if conditional_sample_rank == 0:
                conditional_input = conditional_input[tf.newaxis, ...]
                conditional_sample_rank = 1

            # Read the shape only after the optional sample dimension has been
            # prepended. Reading it earlier makes the slices below operate on
            # the un-expanded shape: the event shape loses its first dimension
            # and `repeat` is divided by the first event dimension.
            conditional_input_shape = prefer_static.shape(conditional_input)

            # Assert that the conditional event shape in the `PixelCnnNetwork` is the
            # same as that implied by `conditional_input`.
            conditional_event_shape = conditional_input_shape[
                conditional_sample_rank:]
            with tf.control_dependencies([
                    tf.assert_equal(self.conditional_shape,
                                    conditional_event_shape)
            ]):
                conditional_sample_shape = conditional_input_shape[
                    :conditional_sample_rank]
                # Number of times the flattened conditional batch is repeated
                # to cover the `n` requested samples.
                repeat = n // prefer_static.reduce_prod(
                    conditional_sample_shape)
                h = tf.reshape(
                    conditional_input,
                    prefer_static.concat([(-1, ), self.conditional_shape],
                                         axis=0))
                # Tile along the leading axis only; event axes get multiple 1.
                h = tf.tile(
                    h,
                    prefer_static.pad([repeat],
                                      paddings=[[0, conditional_event_rank]],
                                      constant_values=1))

        # Start from uniform noise in [-1, 1) and run the network once to get
        # the initial samples.
        samples_0 = tf.random.uniform(prefer_static.concat(
            [(n, ), self.event_shape], axis=0),
                                      minval=-1.,
                                      maxval=1.,
                                      dtype=self.dtype,
                                      seed=seed)
        inputs = samples_0 if conditional_input is None else [samples_0, h]
        params_0 = self.network(inputs, training=training)
        samples_0 = self._sample_channels(*params_0, seed=seed)

        image_height, image_width, _ = tensorshape_util.as_list(
            self.event_shape)

        def loop_body(index, samples):
            """Loop for iterative pixel sampling.

            Args:
              index: 0D `Tensor` of type `int32`. Index of the current pixel.
              samples: 4D `Tensor`. Images with pixels sampled in raster
                order, up to pixel `[index]`, with dimensions
                `[batch_size, height, width, num_channels]`.

            Returns:
              index: the incremented pixel index.
              samples: 4D `Tensor`. Images with pixels sampled in raster
                order, up to and including pixel `[index]`, with dimensions
                `[batch_size, height, width, num_channels]`.
            """
            inputs = samples if conditional_input is None else [samples, h]
            params = self.network(inputs, training=training)
            samples_new = self._sample_channels(*params, seed=seed)

            # Update the current pixel. Move the batch axis last so that a
            # single (row, col) scatter index addresses the pixel in every
            # sample at once.
            samples = tf.transpose(samples, [1, 2, 3, 0])
            samples_new = tf.transpose(samples_new, [1, 2, 3, 0])
            row, col = index // image_width, index % image_width
            updates = samples_new[row, col, ...][tf.newaxis, ...]
            samples = tf.tensor_scatter_nd_update(samples, [[row, col]],
                                                  updates)
            samples = tf.transpose(samples, [3, 0, 1, 2])

            return index + 1, samples

        index0 = tf.zeros([], dtype=tf.int32)

        # Construct the while loop for sampling
        total_pixels = image_height * image_width
        loop_cond = lambda ind, _: tf.less(ind, total_pixels)  # noqa: E731
        init_vars = (index0, samples_0)
        _, samples = tf.while_loop(loop_cond,
                                   loop_body,
                                   init_vars,
                                   parallel_iterations=1)

        # Map samples from [-1, 1] to [low, high] and round to whole values.
        transformed_samples = (self._low + 0.5 * (self._high - self._low) *
                               (samples + 1.))
        return tf.round(transformed_samples)
Пример #6
0
    def testExplicitBlocks(self, dynamic_shape, batch_shape):
        """Checks `Blockwise` against applying each bijector to its own block.

        Args:
          dynamic_shape: if true, inputs are fed through placeholders with
            unknown static shape.
          batch_shape: list of batch dimensions to prepend to the length-6
            input vector.
        """
        block_sizes = tf.convert_to_tensor(value=[2, 1, 3])
        block_sizes = tf1.placeholder_with_default(
            block_sizes,
            shape=([None] * len(block_sizes.shape)
                   if dynamic_shape else block_sizes.shape))
        exp = tfb.Exp()
        sp = tfb.Softplus()
        aff = tfb.Affine(scale_diag=[2., 3., 4.])
        blockwise = tfb.Blockwise(bijectors=[exp, sp, aff],
                                  block_sizes=block_sizes,
                                  maybe_changes_size=False)

        x = tf.cast([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=tf.float32)
        # Each iteration prepends a new leading axis, so walk `batch_shape`
        # from its last entry to its first; otherwise `x` ends up with the
        # reversed batch shape and the assertions below only hold for
        # palindromic batch shapes.
        for s in reversed(batch_shape):
            x = tf.expand_dims(x, 0)
            x = tf.tile(x, [s] + [1] * (tensorshape_util.rank(x.shape) - 1))
        x = tf1.placeholder_with_default(
            x, shape=None if dynamic_shape else x.shape)

        # Identity to break the caching.
        blockwise_y = tf.identity(blockwise.forward(x))
        blockwise_fldj = blockwise.forward_log_det_jacobian(x, event_ndims=1)
        blockwise_x = blockwise.inverse(blockwise_y)
        blockwise_ildj = blockwise.inverse_log_det_jacobian(blockwise_y,
                                                            event_ndims=1)

        # Static shapes are only known when no placeholders hide them.
        if not dynamic_shape:
            self.assertEqual(blockwise_y.shape, batch_shape + [6])
            self.assertEqual(blockwise_fldj.shape, batch_shape + [])
            self.assertEqual(blockwise_x.shape, batch_shape + [6])
            self.assertEqual(blockwise_ildj.shape, batch_shape + [])
        self.assertAllEqual(self.evaluate(tf.shape(blockwise_y)),
                            batch_shape + [6])
        self.assertAllEqual(self.evaluate(tf.shape(blockwise_fldj)),
                            batch_shape + [])
        self.assertAllEqual(self.evaluate(tf.shape(blockwise_x)),
                            batch_shape + [6])
        self.assertAllEqual(self.evaluate(tf.shape(blockwise_ildj)),
                            batch_shape + [])

        # Reference values: apply each bijector to its own slice (sizes
        # 2, 1, 3) and combine the results.
        expl_y = tf.concat([
            exp.forward(x[..., :2]),
            sp.forward(x[..., 2:3]),
            aff.forward(x[..., 3:]),
        ],
                           axis=-1)
        expl_fldj = sum([
            exp.forward_log_det_jacobian(x[..., :2], event_ndims=1),
            sp.forward_log_det_jacobian(x[..., 2:3], event_ndims=1),
            aff.forward_log_det_jacobian(x[..., 3:], event_ndims=1)
        ])
        expl_x = tf.concat([
            exp.inverse(expl_y[..., :2]),
            sp.inverse(expl_y[..., 2:3]),
            aff.inverse(expl_y[..., 3:])
        ],
                           axis=-1)
        expl_ildj = sum([
            exp.inverse_log_det_jacobian(expl_y[..., :2], event_ndims=1),
            sp.inverse_log_det_jacobian(expl_y[..., 2:3], event_ndims=1),
            aff.inverse_log_det_jacobian(expl_y[..., 3:], event_ndims=1)
        ])

        self.assertAllClose(self.evaluate(expl_y), self.evaluate(blockwise_y))
        self.assertAllClose(self.evaluate(expl_fldj),
                            self.evaluate(blockwise_fldj))
        self.assertAllClose(self.evaluate(expl_x), self.evaluate(blockwise_x))
        self.assertAllClose(self.evaluate(expl_ildj),
                            self.evaluate(blockwise_ildj))
Пример #7
0
    def _sample_n(self, n, seed=None):
        """Draws `n` samples from the mixture.

        Two code paths exist. When `self._use_static_graph` is set, every
        component is sampled `n` times and a one-hot mask built from the
        categorical draws selects one component per sample. Otherwise the
        categorical draws are used to partition sample indices by component,
        each component is sampled only as many times as it was selected, and
        the results are stitched back into the original order.

        Args:
          n: number of samples to draw.
          seed: seed passed to the categorical sampler and used to create the
            `SeedStream` that seeds each component's sampler.

        Returns:
          A tensor of samples. In the non-static path its static shape is set
          to the categorical sample shape concatenated with
          `self.event_shape`.
        """
        if self._use_static_graph:
            # This sampling approach is almost the same as the approach used by
            # `MixtureSameFamily`. The differences are due to having a list of
            # `Distribution` objects rather than a single object, and maintaining
            # random seed management that is consistent with the non-static code
            # path.
            samples = []
            cat_samples = self.cat.sample(n, seed=seed)
            stream = SeedStream(seed, salt='Mixture')

            for c in range(self.num_components):
                samples.append(self.components[c].sample(n, seed=stream()))
            # Stack the components on a new axis placed just before the event
            # dimensions.
            stack_axis = -1 - tensorshape_util.rank(self._static_event_shape)
            x = tf.stack(samples, axis=stack_axis)  # [n, B, k, E]
            npdt = dtype_util.as_numpy_dtype(x.dtype)
            mask = tf.one_hot(
                indices=cat_samples,  # [n, B]
                depth=self._num_components,  # == k
                on_value=npdt(1),
                off_value=npdt(0))  # [n, B, k]
            mask = distribution_util.pad_mixture_dimensions(
                mask, self, self._cat,
                tensorshape_util.rank(
                    self._static_event_shape))  # [n, B, k, [1]*e]
            # Only the selected component survives the mask, so the sum over
            # the component axis picks it out.
            return tf.reduce_sum(x * mask, axis=stack_axis)  # [n, B, E]

        # Use a Python int for `n` when its value is statically known.
        n = tf.convert_to_tensor(n, name='n')
        static_n = tf.get_static_value(n)
        n = int(static_n) if static_n is not None else n
        cat_samples = self.cat.sample(n, seed=seed)

        # For each of the sample, batch and event shapes, prefer the static
        # value and fall back to a dynamic one when it is not fully defined.
        static_samples_shape = cat_samples.shape
        if tensorshape_util.is_fully_defined(static_samples_shape):
            samples_shape = tensorshape_util.as_list(static_samples_shape)
            samples_size = tensorshape_util.num_elements(static_samples_shape)
        else:
            samples_shape = tf.shape(cat_samples)
            samples_size = tf.size(cat_samples)
        static_batch_shape = self.batch_shape
        if tensorshape_util.is_fully_defined(static_batch_shape):
            batch_shape = tensorshape_util.as_list(static_batch_shape)
            batch_size = tensorshape_util.num_elements(static_batch_shape)
        else:
            batch_shape = tf.shape(cat_samples)[1:]
            batch_size = tf.reduce_prod(batch_shape)
        static_event_shape = self.event_shape
        if tensorshape_util.is_fully_defined(static_event_shape):
            event_shape = np.array(
                tensorshape_util.as_list(static_event_shape), dtype=np.int32)
        else:
            # Resolved inside the component loop from an actual sample.
            event_shape = None

        # Get indices into the raw cat sampling tensor. We will
        # need these to stitch sample values back out after sampling
        # within the component partitions.
        samples_raw_indices = tf.reshape(tf.range(0, samples_size),
                                         samples_shape)

        # Partition the raw indices so that we can use
        # dynamic_stitch later to reconstruct the samples from the
        # known partitions.
        partitioned_samples_indices = tf.dynamic_partition(
            data=samples_raw_indices,
            partitions=cat_samples,
            num_partitions=self.num_components)

        # Copy the batch indices n times, as we will need to know
        # these to pull out the appropriate rows within the
        # component partitions.
        batch_raw_indices = tf.reshape(tf.tile(tf.range(0, batch_size), [n]),
                                       samples_shape)

        # Explanation of the dynamic partitioning below:
        #   batch indices are i.e., [0, 1, 0, 1, 0, 1]
        # Suppose partitions are:
        #     [1 1 0 0 1 1]
        # After partitioning, batch indices are cut as:
        #     [batch_indices[x] for x in 2, 3]
        #     [batch_indices[x] for x in 0, 1, 4, 5]
        # i.e.
        #     [1 1] and [0 0 0 0]
        # Now we sample n=2 from part 0 and n=4 from part 1.
        # For part 0 we want samples from batch entries 1, 1 (samples 0, 1),
        # and for part 1 we want samples from batch entries 0, 0, 0, 0
        #   (samples 0, 1, 2, 3).
        partitioned_batch_indices = tf.dynamic_partition(
            data=batch_raw_indices,
            partitions=cat_samples,
            num_partitions=self.num_components)
        samples_class = [None for _ in range(self.num_components)]

        stream = SeedStream(seed, salt='Mixture')

        for c in range(self.num_components):
            # Number of draws that selected component `c`.
            n_class = tf.size(partitioned_samples_indices[c])
            samples_class_c = self.components[c].sample(n_class, seed=stream())

            if event_shape is None:
                # Infer the event shape from the sample by skipping the sample
                # axis and the batch axes.
                batch_ndims = prefer_static.rank_from_shape(batch_shape)
                event_shape = tf.shape(samples_class_c)[1 + batch_ndims:]

            # Pull out the correct batch entries from each index.
            # To do this, we may have to flatten the batch shape.

            # For sample s, batch element b of component c, we get the
            # partitioned batch indices from
            # partitioned_batch_indices[c]; and shift each element by
            # the sample index. The final lookup can be thought of as
            # a matrix gather along locations (s, b) in
            # samples_class_c where the n_class rows correspond to
            # samples within this component and the batch_size columns
            # correspond to batch elements within the component.
            #
            # Thus the lookup index is
            #   lookup[c, i] = batch_size * s[i] + b[c, i]
            # for i = 0 ... n_class[c] - 1.
            lookup_partitioned_batch_indices = (
                batch_size * tf.range(n_class) + partitioned_batch_indices[c])
            # Flatten (sample, batch) into one axis so a single gather works.
            samples_class_c = tf.reshape(
                samples_class_c,
                tf.concat([[n_class * batch_size], event_shape], 0))
            samples_class_c = tf.gather(samples_class_c,
                                        lookup_partitioned_batch_indices,
                                        name='samples_class_c_gather')
            samples_class[c] = samples_class_c

        # Stitch back together the samples across the components.
        lhs_flat_ret = tf.dynamic_stitch(indices=partitioned_samples_indices,
                                         data=samples_class)
        # Reshape back to proper sample, batch, and event shape.
        ret = tf.reshape(lhs_flat_ret,
                         tf.concat([samples_shape, event_shape], 0))
        # Restore whatever static shape information is available.
        tensorshape_util.set_shape(
            ret,
            tensorshape_util.concatenate(static_samples_shape,
                                         self.event_shape))
        return ret
        def step_fn(inputs):
            """Runs one training step on a replica and updates train metrics.

            Args:
              inputs: a mapping with `'features'` (images) and `'labels'`
                entries.
            """
            images = inputs['features']
            labels = inputs['labels']
            # Every ensemble member sees the same images.
            images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])

            # generate lambdas
            lambdas = log_uniform_sample(per_core_batch_size,
                                         lambda_parameters)
            lambdas = tf.reshape(lambdas,
                                 (FLAGS.ensemble_size * per_core_batch_size,
                                  lambdas_config.dim))

            with tf.GradientTape() as tape:
                logits = model([images, lambdas], training=True)

                if FLAGS.use_gibbs_ce:
                    # Average of single model CEs
                    # tiling of labels should be only done for Gibbs CE loss
                    labels = tf.tile(labels, [FLAGS.ensemble_size])
                    negative_log_likelihood = tf.reduce_mean(
                        tf.keras.losses.sparse_categorical_crossentropy(
                            labels, logits, from_logits=True))
                else:
                    # Ensemble CE uses no tiling of the labels
                    negative_log_likelihood = ensemble_crossentropy(
                        labels, logits, FLAGS.ensemble_size)
                # Note: Divide l2_loss by sample_size (this differs from uncertainty_
                # baselines implementation.)
                l2_loss = sum(model.losses) / train_sample_size
                loss = negative_log_likelihood + l2_loss
                # Scale the loss given the TPUStrategy will reduce sum all gradients.
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)

            # Separate learning rate for fast weights: gradients of non-batch-
            # norm variables whose name contains 'alpha' or 'gamma' are scaled
            # by `FLAGS.fast_weight_lr_multiplier`.
            grads_and_vars = []
            for grad, var in zip(grads, model.trainable_variables):
                if (('alpha' in var.name or 'gamma' in var.name)
                        and 'batch_norm' not in var.name):
                    grads_and_vars.append(
                        (grad * FLAGS.fast_weight_lr_multiplier, var))
                else:
                    grads_and_vars.append((grad, var))
            optimizer.apply_gradients(grads_and_vars)

            probs = tf.nn.softmax(logits)
            per_probs = tf.split(probs,
                                 num_or_size_splits=FLAGS.ensemble_size,
                                 axis=0)
            per_probs_stacked = tf.stack(per_probs, axis=0)
            # NOTE(review): `labels` is only tiled on the Gibbs CE path, while
            # `probs`/`logits` always cover all ensemble members; confirm the
            # metrics below receive matching batch sizes when
            # `FLAGS.use_gibbs_ce` is false.
            metrics['train/ece'].add_batch(probs, label=labels)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, logits)
            diversity = rm.metrics.AveragePairwiseDiversity()
            diversity.add_batch(per_probs_stacked,
                                num_models=FLAGS.ensemble_size)
            diversity_results = diversity.result()
            for k, v in diversity_results.items():
                metrics['train/' + k].update_state(v)
 def sigmoid_metric_transform(metrics: tf.Tensor):
     """Applies a sigmoid to the metric columns selected by the mask.

     `sigmoid_metric_mask` (from the enclosing scope) is repeated once per
     batch row; where the mask is true the sigmoid of the metric is returned,
     elsewhere the metric is passed through unchanged.
     """
     first_leaf = tf.nest.flatten(metrics)[0]
     batch_size = tf.shape(first_leaf)[0]
     tiled_mask = tf.tile(sigmoid_metric_mask, [batch_size])
     num_metrics = len(sigmoid_metric_mask)
     sigmoid_batch_mask = tf.reshape(tiled_mask, [batch_size, num_metrics])
     squashed = tf.sigmoid(metrics)
     return tf.where(sigmoid_batch_mask, squashed, metrics)
Пример #10
0
def sample_halton_sequence(dim,
                           num_results=None,
                           sequence_indices=None,
                           dtype=tf.float32,
                           randomized=True,
                           seed=None,
                           name=None):
    r"""Returns a sample from the `dim` dimensional Halton sequence.

  Warning: The sequence elements take values only between 0 and 1. Care must be
  taken to appropriately transform the domain of a function if it differs from
  the unit cube before evaluating integrals using Halton samples. It is also
  important to remember that quasi-random numbers without randomization are not
  a replacement for pseudo-random numbers in every context. Quasi random numbers
  are completely deterministic and typically have significant negative
  autocorrelation unless randomization is used.

  Computes the members of the low discrepancy Halton sequence in dimension
  `dim`. The `dim`-dimensional sequence takes values in the unit hypercube in
  `dim` dimensions. Currently, only dimensions up to 1000 are supported. The
  prime base for the k-th axes is the k-th prime starting from 2. For example,
  if `dim` = 3, then the bases will be [2, 3, 5] respectively and the first
  element of the non-randomized sequence will be: [0.5, 0.333, 0.2]. For a more
  complete description of the Halton sequences see
  [here](https://en.wikipedia.org/wiki/Halton_sequence). For low discrepancy
  sequences and their applications see
  [here](https://en.wikipedia.org/wiki/Low-discrepancy_sequence).

  If `randomized` is true, this function produces a scrambled version of the
  Halton sequence introduced by [Owen (2017)][1]. For the advantages of
  randomization of low discrepancy sequences see [here](
  https://en.wikipedia.org/wiki/Quasi-Monte_Carlo_method#Randomization_of_quasi-Monte_Carlo).

  The number of samples produced is controlled by the `num_results` and
  `sequence_indices` parameters. The user must supply either `num_results` or
  `sequence_indices` but not both.
  The former is the number of samples to produce starting from the first
  element. If `sequence_indices` is given instead, the specified elements of
  the sequence are generated. For example, sequence_indices=tf.range(10) is
  equivalent to specifying num_results=10.

  #### Examples

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp

  # Produce the first 1000 members of the Halton sequence in 3 dimensions.
  num_results = 1000
  dim = 3
  sample = tfp.mcmc.sample_halton_sequence(
    dim,
    num_results=num_results,
    seed=127)

  # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
  # hypercube.
  powers = tf.range(1.0, limit=dim + 1)
  integral = tf.reduce_mean(tf.reduce_prod(sample ** powers, axis=-1))
  true_value = 1.0 / tf.reduce_prod(powers + 1.0)
  with tf.Session() as session:
    values = session.run((integral, true_value))

  # Produces a relative absolute error of 1.7%.
  print("Estimated: %f, True Value: %f" % values)

  # Now skip the first 1000 samples and recompute the integral with the next
  # thousand samples. The sequence_indices argument can be used to do this.


  sequence_indices = tf.range(start=1000, limit=1000 + num_results,
                              dtype=tf.int32)
  sample_leaped = tfp.mcmc.sample_halton_sequence(
      dim,
      sequence_indices=sequence_indices,
      seed=111217)

  integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
                                                  axis=-1))
  with tf.Session() as session:
    values = session.run((integral_leaped, true_value))
  # Now produces a relative absolute error of 0.05%.
  print("Leaped Estimated: %f, True Value: %f" % values)
  ```

  Args:
    dim: Positive Python `int` representing each sample's `event_size.` Must
      not be greater than 1000.
    num_results: (Optional) Positive scalar `Tensor` of dtype int32. The number
      of samples to generate. Either this parameter or sequence_indices must
      be specified but not both. If this parameter is None, then the behaviour
      is determined by the `sequence_indices`.
      Default value: `None`.
    sequence_indices: (Optional) `Tensor` of dtype int32 and rank 1. The
      elements of the sequence to compute specified by their position in the
      sequence. The entries index into the Halton sequence starting with 0 and
      hence, must be whole numbers. For example, sequence_indices=[0, 5, 6] will
      produce the first, sixth and seventh elements of the sequence. If this
      parameter is None, then the `num_results` parameter must be specified
      which gives the number of desired samples starting from the first sample.
      Default value: `None`.
    dtype: (Optional) The dtype of the sample. One of: `float16`, `float32` or
      `float64`.
      Default value: `tf.float32`.
    randomized: (Optional) bool indicating whether to produce a randomized
      Halton sequence. If True, applies the randomization described in
      [Owen (2017)][1].
      Default value: `True`.
    seed: PRNG seed; see `tfp.random.sanitize_seed` for details. Only used if
      `randomized` is True. If not supplied and `randomized` is True, no seed is
      set.
      Default value: `None`.
    name:  (Optional) Python `str` describing ops managed by this function. If
      not supplied, the name scope 'sample' is used.
      Default value: `None`.

  Returns:
    halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
      and `shape` `[num_results, dim]` if `num_results` was specified or shape
      `[s, dim]` where s is the size of `sequence_indices` if `sequence_indices`
      were specified.

  Raises:
    ValueError: if both or neither of `sequence_indices` and `num_results` were
      specified, if dimension `dim` is less than 1 or greater than 1000, or if
      `dtype` is not a floating-point type.

  #### References

  [1]: Art B. Owen. A randomized Halton algorithm in R. _arXiv preprint
       arXiv:1706.02808_, 2017. https://arxiv.org/abs/1706.02808
  """
    if dim < 1 or dim > _MAX_DIMENSION:
        raise ValueError(
            'Dimension must be between 1 and {}. Supplied {}'.format(
                _MAX_DIMENSION, dim))
    # Exactly one of `num_results` / `sequence_indices` must be given.
    if (num_results is None) == (sequence_indices is None):
        raise ValueError('Either `num_results` or `sequence_indices` must be'
                         ' specified but not both.')

    if not dtype_util.is_floating(dtype):
        raise ValueError('dtype must be of `float`-type')

    with tf.name_scope(name or 'sample'):
        # Here and in the following, the shape layout is as follows:
        # [sample dimension, event dimension, coefficient dimension].
        # The coefficient dimension is an intermediate axes which will hold the
        # weights of the starting integer when expressed in the (prime) base for
        # an event dimension.
        if num_results is not None:
            num_results = tf.convert_to_tensor(num_results)
        if sequence_indices is not None:
            sequence_indices = tf.convert_to_tensor(sequence_indices)
        indices = _get_indices(num_results, sequence_indices, dtype)
        # The first `dim` primes, one base per event dimension.
        radixes = tf.constant(_PRIMES[0:dim], dtype=dtype, shape=[dim, 1])

        max_sizes_by_axes = _base_expansion_size(tf.reduce_max(indices),
                                                 radixes)

        max_size = tf.reduce_max(max_sizes_by_axes)

        # The powers of the radixes that we will need. Note that there is a bit
        # of an excess here. Suppose we need the place value coefficients of 7
        # in base 2 and 3. For 2, we will have 3 digits but we only need 2 digits
        # for base 3. However, we can only create rectangular tensors so we
        # store both expansions in a [2, 3] tensor. This leads to the problem that
        # we might end up attempting to raise large numbers to large powers. For
        # example, base 2 expansion of 1024 has 10 digits. If we were in 10
        # dimensions, then the 10th prime (29) we will end up computing 29^10 even
        # though we don't need it. We avoid this by setting the exponents for each
        # axes to 0 beyond the maximum value needed for that dimension.
        exponents_by_axes = tf.tile([tf.range(max_size)], [dim, 1])

        # The mask is True for the coefficients that are needed, i.e. whose
        # exponent is below the expansion size of that axis. Exponents outside
        # the mask are capped to 0 and their coefficients are zeroed below.
        weight_mask = exponents_by_axes < max_sizes_by_axes
        capped_exponents = tf.where(weight_mask, exponents_by_axes,
                                    tf.constant(0, exponents_by_axes.dtype))
        weights = radixes**capped_exponents
        # The following computes the base b expansion of the indices. Suppose,
        # x = a0 + a1*b + a2*b^2 + ... Then, performing a floor div of x with
        # the vector (1, b, b^2, b^3, ...) will produce
        # (a0 + s1 * b, a1 + s2 * b, ...) where s_i are coefficients we don't care
        # about. Noting that all a_i < b by definition of place value expansion,
        # we see that taking the elements mod b of the above vector produces the
        # place value expansion coefficients.
        coeffs = tf.math.floordiv(indices, weights)
        coeffs *= tf.cast(weight_mask, dtype)
        coeffs %= radixes
        if not randomized:
            # Deterministic sequence: sum of digit / base^(position + 1).
            coeffs /= radixes
            return tf.reduce_sum(coeffs / weights, axis=-1)

        shuffle_seed, zero_correction_seed = samplers.split_seed(
            seed, salt='MCMCSampleHaltonSequence')

        coeffs = _randomize(coeffs, radixes, seed=shuffle_seed)
        # Remove the contribution from randomizing the trailing zero for the
        # axes where max_size_by_axes < max_size. This will be accounted
        # for separately below (using zero_correction).
        coeffs *= tf.cast(weight_mask, dtype)
        coeffs /= radixes
        base_values = tf.reduce_sum(coeffs / weights, axis=-1)

        # The randomization used in Owen (2017) does not leave 0 invariant. While
        # we have accounted for the randomization of the first `max_size_by_axes`
        # coefficients, we still need to correct for the trailing zeros. Luckily,
        # this is equivalent to adding a uniform random value scaled so the first
        # `max_size_by_axes` coefficients are zero. The following statements perform
        # this correction.
        zero_correction = samplers.uniform([dim, 1],
                                           seed=zero_correction_seed,
                                           dtype=dtype)
        zero_correction /= radixes**max_sizes_by_axes
        return base_values + tf.reshape(zero_correction, [-1])
Пример #11
0
def sample(dim,
           num_results=None,
           sequence_indices=None,
           randomized=True,
           randomization_params=None,
           seed=None,
           validate_args=False,
           dtype=None,
           name=None):
    r"""Returns a sample from the `dim` dimensional Halton sequence.

  Warning: The sequence elements take values only between 0 and 1. Care must be
  taken to appropriately transform the domain of a function if it differs from
  the unit cube before evaluating integrals using Halton samples. It is also
  important to remember that quasi-random numbers without randomization are not
  a replacement for pseudo-random numbers in every context. Quasi random numbers
  are completely deterministic and typically have significant negative
  autocorrelation unless randomization is used.

  Computes the members of the low discrepancy Halton sequence in dimension
  `dim`. The `dim`-dimensional sequence takes values in the unit hypercube in
  `dim` dimensions. Currently, only dimensions up to 1000 are supported. The
  prime base for the k-th axes is the k-th prime starting from 2. For example,
  if `dim` = 3, then the bases will be [2, 3, 5] respectively and the first
  element of the non-randomized sequence will be: [0.5, 0.333, 0.2]. For a more
  complete description of the Halton sequences see
  [here](https://en.wikipedia.org/wiki/Halton_sequence). For low discrepancy
  sequences and their applications see
  [here](https://en.wikipedia.org/wiki/Low-discrepancy_sequence).

  If `randomized` is true, this function produces a scrambled version of the
  Halton sequence introduced by [Owen (2017)][1]. For the advantages of
  randomization of low discrepancy sequences see [here](
  https://en.wikipedia.org/wiki/Quasi-Monte_Carlo_method#Randomization_of_quasi-Monte_Carlo).

  The number of samples produced is controlled by the `num_results` and
  `sequence_indices` parameters. The user must supply either `num_results` or
  `sequence_indices` but not both.
  The former is the number of samples to produce starting from the first
  element. If `sequence_indices` is given instead, the specified elements of
  the sequence are generated. For example, sequence_indices=tf.range(10) is
  equivalent to specifying num_results=10.

  #### Examples

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp

  # Produce the first 1000 members of the Halton sequence in 3 dimensions.
  num_results = 1000
  dim = 3
  sample, params = qmc.halton.sample(
    dim,
    num_results=num_results,
    seed=127)

  # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
  # hypercube.
  powers = tf.range(1.0, limit=dim + 1)
  integral = tf.reduce_mean(tf.reduce_prod(sample ** powers, axis=-1))
  true_value = 1.0 / tf.reduce_prod(powers + 1.0)
  with tf.Session() as session:
    values = session.run((integral, true_value))

  # Produces a relative absolute error of 1.7%.
  print ("Estimated: %f, True Value: %f" % values)

  # Now skip the first 1000 samples and recompute the integral with the next
  # thousand samples. The sequence_indices argument can be used to do this.


  sequence_indices = tf.range(start=1000, limit=1000 + num_results,
                              dtype=tf.int32)
  sample_leaped, _ = qmc.halton.sample(
      dim,
      sequence_indices=sequence_indices,
      randomization_params=params)

  integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
                                                  axis=-1))
  with tf.Session() as session:
    values = session.run((integral_leaped, true_value))
  # Now produces a relative absolute error of 0.05%.
  print ("Leaped Estimated: %f, True Value: %f" % values)
  ```

  Args:
    dim: Positive Python `int` representing each sample's `event_size.` Must not
      be greater than 1000.
    num_results: (Optional) Positive scalar `Tensor` of dtype int32. The number
      of samples to generate. Either this parameter or sequence_indices must be
      specified but not both. If this parameter is None, then the behaviour is
      determined by the `sequence_indices`.
      Default value: `None`.
    sequence_indices: (Optional) `Tensor` of dtype int32 and rank 1. The
      elements of the sequence to compute specified by their position in the
      sequence. The entries index into the Halton sequence starting with 0 and
      hence, must be whole numbers. For example, sequence_indices=[0, 5, 6] will
      produce the first, sixth and seventh elements of the sequence. If this
      parameter is None, then the `num_results` parameter must be specified
      which gives the number of desired samples starting from the first sample.
      Default value: `None`.
    randomized: (Optional) bool indicating whether to produce a randomized
      Halton sequence. If True, applies the randomization described in [Owen
      (2017)][1]. For reproducible output supply either `seed` or
      `randomization_params`. If neither is given, the zero correction is drawn
      with the stateful `tf.random.uniform` and `seed=None` is forwarded to the
      permutation helper.
      Default value: `True`.
    randomization_params: (Optional) Instance of `HaltonParams` that fully
      describes the randomization behavior. If provided and randomized is True,
      its permutations and zero correction are reused instead of being computed
      from scratch. If randomized is False, this parameter has no effect.
      Default value: `None`. In this case with randomized = True, the necessary
        randomization parameters will be computed from scratch.
    seed: (Optional) Python integer to seed the random number generator. Used
      only if `randomized` is True. When the zero correction has to be drawn,
      the stateless generator is seeded with the pair `(seed, seed)`.
      Default value: `None`.
    validate_args: If True, checks that the maximum index is not exceeded and
      that the dimension `dim` is at least 1 and does not exceed the maximum
      supported dimension (1000).
      Default value: `False`.
    dtype: Optional `dtype`. The dtype of the output `Tensor` (either `float32`
    or `float64`).
      Default value: `None` which maps to the `float32`.
    name:  (Optional) Python `str` describing ops managed by this function. If
      not supplied the name of this function is used.
      Default value: "halton_sample".

  Returns:
    halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
      and `shape` `[num_results, dim]` if `num_results` was specified or shape
      `[s, dim]` where s is the size of `sequence_indices` if `sequence_indices`
      were specified.
    randomization_params: None if randomized is False. Otherwise an instance of
      `HaltonParams` holding the permutations returned by the randomization
      helper and the zero correction that was applied (the supplied one when
      `randomization_params` was given, a freshly drawn one otherwise).

  Raises:
    ValueError: if both `sequence_indices` and `num_results` were specified, or
      if neither of them was.
    InvalidArgumentError: if `validate_args` is True and the maximum supported
      sequence index is exceeded or `dim` is outside the supported range.

  #### References

  [1]: Art B. Owen. A randomized Halton algorithm in R. _arXiv preprint
       arXiv:1706.02808_, 2017. https://arxiv.org/abs/1706.02808
  """
    # Exactly one of the two ways of selecting sequence elements is allowed.
    if (num_results is None) == (sequence_indices is None):
        raise ValueError('Either `num_results` or `sequence_indices` must be'
                         ' specified but not both.')
    dtype = dtype or tf.float32

    with tf.compat.v1.name_scope(name,
                                 'halton_sample',
                                 values=[num_results, sequence_indices]):
        # Here and in the following, the shape layout is as follows:
        # [sample dimension, event dimension, coefficient dimension].
        # The coefficient dimension is an intermediate axes which will hold the
        # weights of the starting integer when expressed in the (prime) base for
        # an event dimension.
        if num_results is not None:
            num_results = tf.convert_to_tensor(value=num_results,
                                               dtype=tf.int32,
                                               name='num_results')
        if sequence_indices is not None:
            sequence_indices = tf.convert_to_tensor(value=sequence_indices,
                                                    dtype=tf.int32,
                                                    name='sequence_indices')
        indices = _get_indices(num_results, sequence_indices, dtype)

        runtime_assertions = []
        if validate_args:
            runtime_assertions.append(
                tf.compat.v1.assert_less_equal(
                    tf.reduce_max(indices),
                    tf.constant(_MAX_INDEX_BY_DTYPE[dtype], dtype=dtype),
                    message=
                    ('Maximum sequence index exceeded. Maximum index for dtype %s '
                     'is %d.' % (dtype, _MAX_INDEX_BY_DTYPE[dtype]))))
            # The check is `dim >= 1`, so the message must not claim that
            # `dim == 1` is rejected.
            runtime_assertions.append(
                tf.compat.v1.assert_greater_equal(
                    dim,
                    1,
                    message='`dim` should be greater than or equal to 1'))
            runtime_assertions.append(
                tf.compat.v1.assert_less_equal(
                    dim,
                    _MAX_DIMENSION,
                    message='`dim` should be less than or equal to %d' %
                    _MAX_DIMENSION))

        with tf.compat.v1.control_dependencies(runtime_assertions):
            radixes = tf.convert_to_tensor(_PRIMES,
                                           dtype=dtype,
                                           name='radixes')
            radixes = tf.reshape(radixes[0:dim], shape=[dim, 1])

            max_sizes_by_axes = tf.convert_to_tensor(
                _MAX_SIZES_BY_AXES[dtype],
                dtype=dtype,
                name='max_sizes_by_axes')[:dim]
            max_size = tf.reduce_max(max_sizes_by_axes)

            # The powers of the radixes that we will need. Note that there is a bit
            # of an excess here. Suppose we need the place value coefficients of 7
            # in base 2 and 3. For 2, we will have 3 digits but we only need 2 digits
            # for base 3. However, we can only create rectangular tensors so we
            # store both expansions in a [2, 3] tensor. This leads to the problem that
            # we might end up attempting to raise large numbers to large powers. For
            # example, base 2 expansion of 1024 has 10 digits. If we were in 10
            # dimensions, then the 10th prime (29) we will end up computing 29^10 even
            # though we don't need it. We avoid this by setting the exponents for each
            # axes to 0 beyond the maximum value needed for that dimension.
            exponents_by_axes = tf.tile([tf.range(max_size, dtype=dtype)],
                                        [dim, 1])

            # The mask is true for those coefficients that are irrelevant.
            weight_mask = exponents_by_axes >= max_sizes_by_axes
            # Multiplicative complement of the mask: 1 where a coefficient is
            # relevant, 0 where it is not. Built once and reused below.
            relevant_mask = 1. - tf.cast(weight_mask, dtype)
            capped_exponents = tf.where(weight_mask,
                                        tf.zeros_like(exponents_by_axes),
                                        exponents_by_axes)
            weights = radixes**capped_exponents
            # The following computes the base b expansion of the indices. Suppose,
            # x = a0 + a1*b + a2*b^2 + ... Then, performing a floor div of x with
            # the vector (1, b, b^2, b^3, ...) will produce
            # (a0 + s1 * b, a1 + s2 * b, ...) where s_i are coefficients we don't care
            # about. Noting that all a_i < b by definition of place value expansion,
            # we see that taking the elements mod b of the above vector produces the
            # place value expansion coefficients.
            coeffs = tf.compat.v1.floor_div(indices, weights)
            coeffs *= relevant_mask
            coeffs %= radixes
            if not randomized:
                coeffs /= radixes
                return tf.reduce_sum(input_tensor=coeffs / weights,
                                     axis=-1), None

            if randomization_params is None:
                perms, zero_correction = None, None
            else:
                perms, zero_correction = randomization_params
            coeffs, perms = _randomize(coeffs, radixes, seed, perms=perms)
            # Remove the contribution from randomizing the trailing zero for the
            # axes where max_size_by_axes < max_size. This will be accounted
            # for separately below (using zero_correction).
            coeffs *= relevant_mask
            coeffs /= radixes
            base_values = tf.reduce_sum(input_tensor=coeffs / weights, axis=-1)

            # The randomization used in Owen (2017) does not leave 0 invariant. While
            # we have accounted for the randomization of the first `max_size_by_axes`
            # coefficients, we still need to correct for the trailing zeros. Luckily,
            # this is equivalent to adding a uniform random value scaled so the first
            # `max_size_by_axes` coefficients are zero. The following statements
            # perform this correction.
            if zero_correction is None:
                if seed is None:
                    # No seed available: fall back to the stateful generator,
                    # which makes the result non-reproducible.
                    zero_correction = tf.random.uniform([dim, 1], dtype=dtype)
                else:
                    zero_correction = tf.random.stateless_uniform([dim, 1],
                                                                  seed=(seed,
                                                                        seed),
                                                                  dtype=dtype)
                zero_correction /= radixes**max_sizes_by_axes
                zero_correction = tf.reshape(zero_correction, [-1])

            return base_values + zero_correction, HaltonParams(
                perms, zero_correction)
Пример #12
0
    def solve_nu_zeta(self,
                      dataset: dataset_lib.OffpolicyDataset,
                      target_policy: tf_policy.TFPolicy,
                      regularizer: float = 1e-6):
        """Performs one update of the nu / zeta tables and returns the losses.

    On the first call (detected by the absence of `self._td_mat`) all episodes
    are read from `dataset`, the tabular feature matrix, TD matrix and bias
    are built, `self._nu` / `self._nu2` are initialized from the bias, and the
    intermediate tensors are cached on `self`. Later calls reuse the cache, so
    `dataset` and `target_policy` are only read on that first call.

    Every call then
      1. evaluates the saddle-point loss from the current nu / zeta values,
      2. bisects `self._alpha` (16 iterations) on the sign of
         `divergence - self._two_sided_limit`,
      3. takes a Newton-type step, scaled by 0.1, on `self._nu` and
         `self._nu2`,
      4. moves `self._zeta` and `self._zeta2` 10% of the way towards their
         newly computed values.

    Side effects: reassigns `self._nu`, `self._nu2`, `self._zeta`,
    `self._zeta2`, assigns `self._alpha`, writes one `tf.summary.scalar` per
    divergence entry and prints diagnostics to stdout.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.
      regularizer: A small constant multiplied by the identity and added to
        the Hessian blocks before the linear solve.

    Returns:
      A list `[avg_saddle_loss * sign, avg_loss * sign, divergence]`, where
      `sign` is `self._algae_alpha_sign`.
    """

        if not hasattr(self, '_td_mat'):
            # Set up env_steps.
            episodes, valid_steps = dataset.get_all_episodes(
                limit=self._limit_episodes)
            # One less than the step axis: every sample pairs a step with its
            # successor (see the `0:T` and `1:T + 1` slices below).
            total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1
            num_episodes = tf.shape(valid_steps)[0]
            num_samples = num_episodes * total_num_steps_per_episode
            valid_and_not_last = tf.logical_and(valid_steps,
                                                episodes.discount > 0)
            # Flat indices (into the [num_samples] layout) of the steps that
            # are valid and have a positive discount.
            valid_indices = tf.squeeze(
                tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

            # The first step of each episode, repeated once per step so it
            # lines up with `env_step` / `next_env_step`.
            initial_env_step = tf.nest.map_structure(
                lambda t: tf.squeeze(
                    tf.reshape(
                        tf.repeat(t[:, 0:1, ...],
                                  axis=1,
                                  repeats=total_num_steps_per_episode),
                        [num_samples, -1])), episodes)
            initial_env_step = tf.nest.map_structure(
                lambda t: tf.gather(t, valid_indices), initial_env_step)
            tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep(
                initial_env_step)

            env_step = tf.nest.map_structure(
                lambda t: tf.squeeze(
                    tf.reshape(t[:, 0:total_num_steps_per_episode, ...],
                               [num_samples, -1])), episodes)
            env_step = tf.nest.map_structure(
                lambda t: tf.gather(t, valid_indices), env_step)
            tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(
                env_step)

            next_env_step = tf.nest.map_structure(
                lambda t: tf.squeeze(
                    tf.reshape(t[:, 1:total_num_steps_per_episode + 1, ...],
                               [num_samples, -1])), episodes)
            next_env_step = tf.nest.map_structure(
                lambda t: tf.gather(t, valid_indices), next_env_step)
            tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep(
                next_env_step)

            # get probabilities
            initial_target_probs = target_policy.distribution(
                tfagents_initial_env_step).action.probs_parameter()
            next_target_probs = target_policy.distribution(
                tfagents_next_env_step).action.probs_parameter()

            # First, get the nu_loss and data weights
            #current_nu_loss = self._get_nu_loss(initial_env_step, env_step,
            #                                    next_env_step, target_policy)
            #data_weight, _ = self._get_weights(current_nu_loss)

            # # debug only and to reproduce dual dice result, DELETE
            # data_weight = tf.ones_like(data_weight)

            state_action_count = self._get_state_action_counts(env_step)
            # NOTE(review): `counts` is not read anywhere else in this method.
            counts = tf.reduce_sum(
                tf.one_hot(state_action_count, self._dimension), 0)
            gamma_sample = tf.pow(self._gamma,
                                  tf.cast(env_step.step_num, tf.float32))

            # # debug only and to reproduce dual dice result, DELETE
            # gamma_sample = tf.ones_like(gamma_sample)

            # now we need to expand_dims to include action space in extra dimensions
            #data_weights = tf.reshape(data_weight, [-1, self._num_limits])
            # both are data sample weights for L2 problem, needs to be normalized later
            #gamma_data_weights = tf.reshape(gamma_sample, [-1, 1]) * data_weights

            # Pair every initial observation with every action index.
            initial_states = tf.tile(
                tf.reshape(initial_env_step.observation, [-1, 1]),
                [1, self._num_actions])
            initial_actions = tf.tile(
                tf.reshape(tf.range(self._num_actions), [1, -1]),
                [initial_env_step.observation.shape[0], 1])
            initial_nu_indices = self._get_index(initial_states,
                                                 initial_actions)

            # linear term w.r.t. initial distribution
            #b_vec_2 = tf.stack([
            #    tf.reduce_sum(
            #        tf.reshape(
            #            data_weights[:, itr] / tf.reduce_sum(data_weights[:, itr]),
            #            [-1, 1]) * tf.reduce_sum(
            #                tf.one_hot(initial_nu_indices, self._dimension) *
            #                (1 - self._gamma) *
            #                tf.expand_dims(initial_target_probs, axis=-1),
            #                axis=1),
            #        axis=0) for itr in range(self._num_limits)
            #],
            #                   axis=0)

            # Same pairing for the successor observations.
            next_states = tf.tile(
                tf.reshape(next_env_step.observation, [-1, 1]),
                [1, self._num_actions])
            next_actions = tf.tile(
                tf.reshape(tf.range(self._num_actions), [1, -1]),
                [next_env_step.observation.shape[0], 1])
            next_nu_indices = self._get_index(next_states, next_actions)
            # Absorbing successors get index -1; `tf.one_hot` maps an
            # out-of-range index to an all-zero row, so they drop out of
            # `a_vec` below.
            next_nu_indices = tf.where(
                tf.expand_dims(next_env_step.is_absorbing(), -1),
                -1 * tf.ones_like(next_nu_indices), next_nu_indices)

            nu_indices = self._get_index(env_step.observation, env_step.action)

            target_log_probabilities = target_policy.distribution(
                tfagents_env_step).action.log_prob(env_step.action)
            if not self._solve_for_state_action_ratio:
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())
            else:
                policy_ratio = tf.ones([
                    target_log_probabilities.shape[0],
                ])
            policy_ratios = tf.tile(tf.reshape(policy_ratio, [-1, 1]),
                                    [1, self._num_actions])

            # the tabular feature vector
            a_vec = tf.one_hot(nu_indices, self._dimension) - tf.reduce_sum(
                self._gamma *
                tf.expand_dims(next_target_probs * policy_ratios, axis=-1) *
                tf.one_hot(next_nu_indices, self._dimension),
                axis=1)

            # linear term w.r.t. reward
            #b_vec_1 = tf.stack([
            #    tf.reduce_sum(
            #        tf.reshape(
            #            (gamma_data_weights[:, itr] /
            #             tf.reduce_sum(gamma_data_weights[:, itr])) * self._reward_fn(env_step), #/
            #            #tf.cast(state_action_count, tf.float32),
            #            [-1, 1]) * a_vec,
            #        axis=0) for itr in range(self._num_limits)
            #],
            #                   axis=0)
            # quadratic term of feature
            # Get weighted outer product by using einsum to save computing resource!
            #a_mat = tf.stack([
            #    tf.einsum(
            #        'ai, a, aj -> ij', a_vec,
            #        #1.0 / tf.cast(state_action_count, tf.float32),
            #        gamma_data_weights[:, itr] /
            #        tf.reduce_sum(gamma_data_weights[:, itr]),
            #        a_vec)
            #    for itr in range(self._num_limits)
            #],
            #                 axis=0)

            # Sum over samples of one_hot(index)^T * a_vec, each sample scaled
            # by 1 / state_action_count.
            td_mat = tf.einsum('ai, a, aj -> ij',
                               tf.one_hot(nu_indices, self._dimension),
                               1.0 / tf.cast(state_action_count, tf.float32),
                               a_vec)

            weighted_rewards = policy_ratio * self._reward_fn(env_step)

            # Per-index sum of weighted rewards, scaled the same way.
            bias = tf.reduce_sum(
                tf.one_hot(nu_indices, self._dimension) *
                tf.reshape(weighted_rewards, [-1, 1]) * 1.0 /
                tf.cast(state_action_count, tf.float32)[:, None],
                axis=0)

            # Initialize
            self._nu = np.ones_like(self._nu) * bias[:, None]
            self._nu2 = np.ones_like(self._nu2) * bias[:, None]

            # Cache everything the per-call update below needs.
            self._a_vec = a_vec
            self._td_mat = td_mat
            self._bias = bias
            self._weighted_rewards = weighted_rewards
            self._state_action_count = state_action_count
            self._nu_indices = nu_indices
            self._initial_nu_indices = initial_nu_indices
            self._initial_target_probs = initial_target_probs
            self._gamma_sample = gamma_sample
            # NOTE(review): the assignment above is immediately overwritten, so
            # the per-step discount weights are all ones from here on. Looks
            # like a leftover of the "debug only" switch above; confirm intent.
            self._gamma_sample = tf.ones_like(gamma_sample)

        # Saddle-point loss for the first (nu, zeta) pair.
        saddle_bellman_residuals = (tf.matmul(self._a_vec, self._nu) -
                                    self._weighted_rewards[:, None])
        saddle_bellman_residuals *= -1 * self._algae_alpha_sign
        saddle_zetas = tf.gather(self._zeta, self._nu_indices)
        saddle_initial_nu_values = tf.reduce_sum(  # Average over actions.
            self._initial_target_probs[:, :, None] *
            tf.gather(self._nu, self._initial_nu_indices),
            axis=1)
        saddle_init_nu_loss = ((1 - self._gamma) * saddle_initial_nu_values *
                               self._algae_alpha_sign)

        # The same quantities for the second pair, with the opposite sign.
        saddle_bellman_residuals2 = (tf.matmul(self._a_vec, self._nu2) -
                                     self._weighted_rewards[:, None])
        saddle_bellman_residuals2 *= 1 * self._algae_alpha_sign
        saddle_zetas2 = tf.gather(self._zeta2, self._nu_indices)
        saddle_initial_nu_values2 = tf.reduce_sum(  # Average over actions.
            self._initial_target_probs[:, :, None] *
            tf.gather(self._nu2, self._initial_nu_indices),
            axis=1)
        saddle_init_nu_loss2 = ((1 - self._gamma) * saddle_initial_nu_values2 *
                                -1 * self._algae_alpha_sign)

        saddle_loss = 0.5 * (
            saddle_init_nu_loss + saddle_bellman_residuals * saddle_zetas +
            -tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas) +
            -saddle_init_nu_loss2 + -saddle_bellman_residuals2 * saddle_zetas2
            + tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas2))
        # Binary search to find best alpha.
        # The bounds have two entries, presumably one per limit
        # (`self._num_limits`); TODO confirm.
        left = tf.constant([-8., -8.])
        right = tf.constant([32., 32.])
        for _ in range(16):
            mid = 0.5 * (left + right)
            self._alpha.assign(mid)
            weights, log_weights = self._get_weights(
                saddle_loss * self._gamma_sample[:, None])

            divergence = self._compute_divergence(weights, log_weights)
            divergence_violation = divergence - self._two_sided_limit
            # Where the limit is exceeded raise the lower bound, otherwise
            # lower the upper bound.
            left = tf.where(divergence_violation > 0., mid, left)
            right = tf.where(divergence_violation > 0., right, mid)
        self._alpha.assign(0.5 * (left + right))
        weights, log_weights = self._get_weights(saddle_loss *
                                                 self._gamma_sample[:, None])

        gamma_data_weights = tf.stop_gradient(weights *
                                              self._gamma_sample[:, None])
        #print(tf.concat([gamma_data_weights, saddle_loss], axis=-1))
        avg_saddle_loss = (
            tf.reduce_sum(gamma_data_weights * saddle_loss, axis=0) /
            tf.reduce_sum(gamma_data_weights, axis=0))

        # Total weight falling on each table index, then looked up per sample.
        weighted_state_action_count = tf.reduce_sum(
            tf.one_hot(self._nu_indices, self._dimension)[:, :, None] *
            weights[:, None, :],
            axis=0)
        weighted_state_action_count = tf.gather(weighted_state_action_count,
                                                self._nu_indices)
        # Weighted counterparts of the cached TD matrix and bias, with a
        # leading axis `b` taken from the second axis of `weights`.
        my_td_mat = tf.einsum(
            'ai, ab, ab, aj -> bij',
            tf.one_hot(self._nu_indices, self._dimension),
            #1.0 / tf.cast(self._state_action_count, tf.float32),
            1.0 / weighted_state_action_count,
            weights,
            self._a_vec)
        my_bias = tf.reduce_sum(
            tf.transpose(weights)[:, :, None] *
            tf.one_hot(self._nu_indices, self._dimension)[None, :, :] *
            tf.reshape(self._weighted_rewards, [1, -1, 1]) *
            #1.0 / tf.cast(self._state_action_count, tf.float32)[None, :, None],
            1.0 / tf.transpose(weighted_state_action_count)[:, :, None],
            axis=1)

        #print('hello', saddle_initial_nu_values[:1], saddle_zetas[:3],
        #      self._nu[:2], my_bias[:, :2], saddle_loss[:4])

        # The tape is persistent because it is queried several times, including
        # for Jacobians after the `with` block has exited. The first-order
        # gradients are taken inside the block so that they are recorded too.
        with tf.GradientTape(watch_accessed_variables=False,
                             persistent=True) as tape:
            tape.watch([self._nu, self._nu2, self._alpha])
            bellman_residuals = tf.matmul(
                my_td_mat,
                tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
            bellman_residuals = tf.transpose(tf.squeeze(bellman_residuals, -1))
            bellman_residuals = tf.gather(bellman_residuals, self._nu_indices)
            initial_nu_values = tf.reduce_sum(  # Average over actions.
                self._initial_target_probs[:, :, None] *
                tf.gather(self._nu, self._initial_nu_indices),
                axis=1)

            bellman_residuals *= self._algae_alpha_sign

            init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                            self._algae_alpha_sign)

            nu_loss = (tf.math.square(bellman_residuals) / 2.0 +
                       tf.math.abs(self._algae_alpha) * init_nu_loss)

            loss = (gamma_data_weights * nu_loss /
                    tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

            bellman_residuals2 = tf.matmul(
                my_td_mat,
                tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
            bellman_residuals2 = tf.transpose(
                tf.squeeze(bellman_residuals2, -1))
            bellman_residuals2 = tf.gather(bellman_residuals2,
                                           self._nu_indices)
            initial_nu_values2 = tf.reduce_sum(  # Average over actions.
                self._initial_target_probs[:, :, None] *
                tf.gather(self._nu2, self._initial_nu_indices),
                axis=1)

            bellman_residuals2 *= -1 * self._algae_alpha_sign

            init_nu_loss2 = ((1 - self._gamma) * initial_nu_values2 * -1 *
                             self._algae_alpha_sign)

            nu_loss2 = (tf.math.square(bellman_residuals2) / 2.0 +
                        tf.math.abs(self._algae_alpha) * init_nu_loss2)

            loss2 = (gamma_data_weights * nu_loss2 /
                     tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

            divergence = self._compute_divergence(weights, log_weights)
            divergence_violation = divergence - self._two_sided_limit

            # NOTE(review): `alpha_loss` is only referenced by the
            # commented-out alpha update further down.
            alpha_loss = (-tf.exp(self._alpha) *
                          tf.stop_gradient(divergence_violation))

            # Quadratic penalty on the last row of each nu table.
            extra_loss = tf.reduce_sum(tf.math.square(self._nu[-1, :]))
            extra_loss2 = tf.reduce_sum(tf.math.square(self._nu2[-1, :]))
            nu_grad = tape.gradient(loss + extra_loss, [self._nu])[0]
            nu_grad2 = tape.gradient(loss2 + extra_loss2, [self._nu2])[0]
        avg_loss = tf.reduce_sum(0.5 * (loss - loss2) /
                                 tf.math.abs(self._algae_alpha),
                                 axis=0)
        # Keep, for each limit `i`, only the Jacobian block that couples column
        # `i` of the gradient with column `i` of nu.
        nu_jacob = tape.jacobian(nu_grad, [self._nu])[0]
        nu_hess = tf.stack(
            [nu_jacob[:, i, :, i] for i in range(self._num_limits)], axis=0)

        nu_jacob2 = tape.jacobian(nu_grad2, [self._nu2])[0]
        nu_hess2 = tf.stack(
            [nu_jacob2[:, i, :, i] for i in range(self._num_limits)], axis=0)

        for idx, div in enumerate(divergence):
            tf.summary.scalar('divergence%d' % idx, div)

        #alpha_grads = tape.gradient(alpha_loss, [self._alpha])
        #alpha_grad_op = self._alpha_optimizer.apply_gradients(
        #    zip(alpha_grads, [self._alpha]))
        #self._alpha.assign(tf.minimum(8., tf.maximum(-8., self._alpha)))

        #print(self._alpha, tf.concat([weights, nu_loss], -1))
        #regularizer = 0.1
        # Newton-type step: solve (H + regularizer * I) x = -grad per limit and
        # move nu by 0.1 * x.
        nu_transformed = tf.transpose(
            tf.squeeze(
                tf.linalg.solve(
                    nu_hess + regularizer * tf.eye(self._dimension),
                    tf.expand_dims(-tf.transpose(nu_grad), axis=-1))))
        self._nu = self._nu + 0.1 * nu_transformed
        nu_transformed2 = tf.transpose(
            tf.squeeze(
                tf.linalg.solve(
                    nu_hess2 + regularizer * tf.eye(self._dimension),
                    tf.expand_dims(-tf.transpose(nu_grad2), axis=-1))))
        self._nu2 = self._nu2 + 0.1 * nu_transformed2

        # Diagnostics written to stdout on every call.
        print(avg_loss * self._algae_alpha_sign,
              avg_saddle_loss * self._algae_alpha_sign, self._nu[:2],
              divergence)
        #print(init_nu_loss[:8], init_nu_loss[-8:])
        #print(bellman_residuals[:8])
        #print(self._nu[:3], self._zeta[:3])

        # Recompute the residuals with the updated nu and move zeta 10% of the
        # way towards the resulting value.
        zetas = tf.matmul(my_td_mat,
                          tf.transpose(self._nu)[:, :, None]) - my_bias[:, :,
                                                                        None]
        zetas = tf.transpose(tf.squeeze(zetas, -1))
        zetas *= -self._algae_alpha_sign
        zetas /= tf.math.abs(self._algae_alpha)
        self._zeta = self._zeta + 0.1 * (zetas - self._zeta)

        # Same update for the second pair, with the opposite sign.
        zetas2 = tf.matmul(my_td_mat,
                           tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :,
                                                                          None]
        zetas2 = tf.transpose(tf.squeeze(zetas2, -1))
        zetas2 *= 1 * self._algae_alpha_sign
        zetas2 /= tf.math.abs(self._algae_alpha)
        self._zeta2 = self._zeta2 + 0.1 * (zetas2 - self._zeta2)

        #self._zeta = (
        #    tf.einsum('ij,ja-> ia', self._td_mat, self._nu) -
        #    tf.transpose(my_bias))
        #self._zeta *= -tf.reshape(self._algae_alpha_sign, [1, self._num_limits])
        #self._zeta /= tf.math.abs(self._algae_alpha)
        return [
            avg_saddle_loss * self._algae_alpha_sign,
            avg_loss * self._algae_alpha_sign, divergence
        ]
    def __call__(self, roi_features, class_indices, is_training=None):
        """Runs the Mask-RCNN mask head and keeps one mask per ROI.

        The ROI features pass through four 3x3 convolutions, one stride-2
        transposed convolution and a final 1x1 convolution that emits one
        logit map per class. For every ROI only the map of the class given in
        `class_indices` is kept.

        Args:
          roi_features: A ROI feature tensor of shape
            [batch_size, num_rois, height_l, width_l, num_filters]. Every
            dimension except the first is read from the static shape.
          class_indices: a Tensor of shape [batch_size, num_rois], indicating
            which class the ROI is. Its static shape must be fully defined
            because it is used to build the gather indices.
          is_training: `boolean`, forwarded to the batch-norm/ReLU layers.

        Returns:
          mask_outputs: a tensor with a shape of
            [batch_size, num_rois, mrcnn_resolution, mrcnn_resolution] holding
            the mask logits of the selected class of each ROI.
        """
        def _msra_equivalent_stddev(kernel_size, fan_out):
            """Returns the stddev of random normal initialization as MSRAFill."""
            # Reference: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/filler_op.h#L445-L463  # pylint: disable=line-too-long
            # E.g. a (3, 3) kernel with fan out 256 gives
            # (2 / (3 * 3 * 256)) ** 0.5 = 0.029.
            return (2 / (kernel_size[0] * kernel_size[1] * fan_out))**0.5

        def _msra_normal(kernel_size, fan_out):
            """Builds the random-normal kernel initializer for one layer."""
            return tf.keras.initializers.RandomNormal(
                stddev=_msra_equivalent_stddev(kernel_size, fan_out))

        with backend.get_graph().as_default():
            with tf.name_scope('mask_head'):
                static_shape = roi_features.get_shape().as_list()
                _, num_rois, height, width, filters = static_shape
                # Fold batch and ROI dimensions together for the 2-D layers.
                net = tf.reshape(roi_features, [-1, height, width, filters])

                conv_kernel = (3, 3)
                conv_filters = 256
                for layer_idx in range(4):
                    conv_layer = tf.keras.layers.Conv2D(
                        conv_filters,
                        kernel_size=conv_kernel,
                        strides=(1, 1),
                        padding='same',
                        dilation_rate=(1, 1),
                        activation=None,
                        kernel_initializer=_msra_normal(
                            conv_kernel, conv_filters),
                        bias_initializer=tf.zeros_initializer(),
                        name='mask-conv-l%d' % layer_idx)
                    net = conv_layer(net)
                    net = self._batch_norm_relu()(net, is_training=is_training)

                # Stride-2 transposed convolution: doubles height and width.
                deconv_kernel = (2, 2)
                deconv_filters = 256
                upsample_layer = tf.keras.layers.Conv2DTranspose(
                    deconv_filters,
                    kernel_size=deconv_kernel,
                    strides=(2, 2),
                    padding='valid',
                    activation=None,
                    kernel_initializer=_msra_normal(
                        deconv_kernel, deconv_filters),
                    bias_initializer=tf.zeros_initializer(),
                    name='conv5-mask')
                net = upsample_layer(net)
                net = self._batch_norm_relu()(net, is_training=is_training)

                # One logit map per class.
                logits_kernel = (1, 1)
                logits_filters = self._num_classes
                logits_layer = tf.keras.layers.Conv2D(
                    logits_filters,
                    kernel_size=logits_kernel,
                    strides=(1, 1),
                    padding='valid',
                    kernel_initializer=_msra_normal(
                        logits_kernel, logits_filters),
                    bias_initializer=tf.zeros_initializer(),
                    name='mask_fcn_logits')
                mask_outputs = logits_layer(net)
                per_roi_shape = [
                    -1, num_rois, self._mrcnn_resolution,
                    self._mrcnn_resolution, self._num_classes
                ]
                mask_outputs = tf.reshape(mask_outputs, per_roi_shape)

                with tf.name_scope('masks_post_processing'):
                    # TODO(pengchong): Figure out the way not to use the static inferred
                    # batch size.
                    batch_size, num_masks = class_indices.get_shape().as_list()
                    # Move the class axis next to the ROI axis so a single
                    # (batch, roi, class) index selects a whole mask.
                    mask_outputs = tf.transpose(a=mask_outputs,
                                                perm=[0, 1, 4, 2, 3])
                    batch_column = tf.expand_dims(tf.range(batch_size), axis=1)
                    batch_indices = tf.tile(batch_column, [1, num_masks])
                    mask_row = tf.expand_dims(tf.range(num_masks), axis=0)
                    mask_indices = tf.tile(mask_row, [batch_size, 1])
                    gather_indices = tf.stack(
                        [batch_indices, mask_indices, class_indices], axis=2)
                    mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
            return mask_outputs
    def test_vimco_and_gradient(self):
        """Checks the `csiszar_vimco` loss and its gradient against NumPy.

        The loss value is compared with a NumPy recomputation from the same
        samples. The gradient is rebuilt in NumPy from two TF-computed
        pieces (see the comments before the final assertions).
        """
        dims = 5  # Event dimension of both distributions.
        num_draws = int(1e3)
        num_batch_draws = int(3)
        seed = test_util.test_seed()

        # The tape is persistent because three separate gradients/jacobians
        # are taken from it after the block.
        with tf.GradientTape(persistent=True) as tape:
            f = lambda logu: tfp.vi.kl_reverse(logu, self_normalized=False)
            # NumPy stand-in for `f`, used to build the expected values.
            np_f = lambda logu: -logu

            # `s` is the only quantity differentiated against; it is a
            # constant, so it has to be watched explicitly.
            s = tf.constant(1.)
            tape.watch(s)
            p = tfd.MultivariateNormalFullCovariance(covariance_matrix=tridiag(
                dims, diag_value=1, offdiag_value=0.5))

            # Variance is very high when approximating Forward KL, so we make
            # scale_diag large. This ensures q "covers" p and thus Var_q[p/q] is
            # smaller.
            # NOTE(review): `f` above is `kl_reverse` and `s` is 1.0, so the
            # remark above may be stale -- confirm.
            q = tfd.MultivariateNormalDiag(scale_diag=tf.tile([s], [dims]))

            vimco = tfp.vi.csiszar_vimco(f=f,
                                         p_log_prob=p.log_prob,
                                         q=q,
                                         num_draws=num_draws,
                                         num_batch_draws=num_batch_draws,
                                         seed=seed)

            # We want the seed to be the same since we will use computations
            # with the same underlying sample to show correctness of vimco.
            if tf.executing_eagerly():
                tf1.set_random_seed(seed)
            x = q.sample(sample_shape=[num_draws, num_batch_draws], seed=seed)
            # Treat the samples as constants so that gradients only flow
            # through the log-prob evaluations below.
            x = tf.stop_gradient(x)
            logu = p.log_prob(x) - q.log_prob(x)
            # `[::-1][0]` picks the last item returned by `log_soomean_exp`
            # (presumably the log of the plain mean -- confirm).
            f_log_sum_u = f(tfp.stats.log_soomean_exp(logu, axis=0)[::-1][0])
            q_log_prob_x = q.log_prob(x)

        grad_vimco = tape.gradient(vimco, s)
        grad_mean_f_log_sum_u = tape.gradient(f_log_sum_u, s) / num_batch_draws
        jacobian_logqx = tape.jacobian(q_log_prob_x, s)

        # Evaluate everything in one call; trailing underscores mark the
        # evaluated values.
        [
            logu_,
            jacobian_logqx_,
            vimco_,
            grad_vimco_,
            f_log_sum_u_,
            grad_mean_f_log_sum_u_,
        ] = self.evaluate([
            logu,
            jacobian_logqx,
            vimco,
            grad_vimco,
            f_log_sum_u,
            grad_mean_f_log_sum_u,
        ])

        np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu_)

        # Test VIMCO loss is correct.
        self.assertAllClose(np_f(np_log_avg_u).mean(axis=0),
                            vimco_,
                            rtol=1e-4,
                            atol=1e-5)

        # Test gradient of VIMCO loss is correct.
        #
        # To make this computation we'll inject two gradients from TF:
        # - grad[mean(f(log(sum(p(x)/q(x)))))]
        # - jacobian[log(q(x))].
        #
        # We now justify why using these (and only these) TF values for
        # ground-truth does not undermine the completeness of this test.
        #
        # Regarding `grad_mean_f_log_sum_u_`, note that we validate the
        # correctness of the zero-th order derivative (for each batch member).
        # Since `tfp.vi.csiszar_vimco_helper` itself does not manipulate any
        # gradient information, we can safely rely on TF.
        # NOTE(review): this test itself only calls
        # `self._csiszar_vimco_helper`; the `tfp.vi` name above may be
        # outdated -- confirm.
        self.assertAllClose(np_f(np_log_avg_u),
                            f_log_sum_u_,
                            rtol=1e-4,
                            atol=1e-5)
        #
        # Regarding `jacobian_logqx_`, note that testing the gradient of
        # `q.log_prob` is outside the scope of this unit-test thus we may safely
        # use TF to find it.

        # The `mean` is across batches and the `sum` is across iid samples.
        np_grad_vimco = (grad_mean_f_log_sum_u_ + np.mean(np.sum(
            jacobian_logqx_ * (np_f(np_log_avg_u) - np_f(np_log_sooavg_u)),
            axis=0),
                                                          axis=0))

        # Looser tolerances than for the loss value above.
        self.assertAllClose(np_grad_vimco, grad_vimco_, rtol=0.03, atol=1e-3)
 def exact_kl(s):
     """Returns `tfd.kl_divergence(q, p)` for the scale `s`.

     `q` is a diagonal normal whose `d` scales all equal `s`; `p` is a normal
     with the tridiagonal covariance built by `tridiag`. `d` is taken from
     the enclosing scope.
     """
     target_covariance = tridiag(d, diag_value=1, offdiag_value=0.5)
     target = tfd.MultivariateNormalFullCovariance(
         covariance_matrix=target_covariance)
     surrogate = tfd.MultivariateNormalDiag(scale_diag=tf.tile([s], [d]))
     return tfd.kl_divergence(surrogate, target)
def crop_mask_in_target_box(masks,
                            boxes,
                            target_boxes,
                            output_size,
                            sample_offset=0):
    """Crop masks in target boxes.

    Every mask is zero-padded by 2 pixels on each side, the target box is
    re-expressed in the pixel grid of that padded mask, and the region is
    then sampled with `selective_crop_and_resize`.

    Args:
      masks: A tensor with a shape of [batch_size, num_masks, height, width].
        The static shape must be fully defined.
      boxes: a float tensor representing box coordinates that tightly enclose
        masks with a shape of [batch_size, num_masks, 4] in un-normalized
        coordinates. A box is represented by [ymin, xmin, ymax, xmax].
      target_boxes: a float tensor representing target box coordinates for
        masks with a shape of [batch_size, num_masks, 4] in un-normalized
        coordinates. A box is represented by [ymin, xmin, ymax, xmax].
      output_size: A scalar to indicate the output crop size. Only square
        outputs are currently supported.
      sample_offset: a float number in [0, 1] indicating the subpixel sample
        offset from grid point.

    Returns:
      A 4-D tensor representing the mask crops, of shape
      [batch_size, num_masks, output_size, output_size].
    """
    pad = 2  # Width of the zero border added on every side of each mask.
    with tf.name_scope('crop_mask_in_target_box'):
        batch_size, num_masks, height, width = masks.get_shape().as_list()
        padded_height = height + 2 * pad
        padded_width = width + 2 * pad

        # Pad zeros on the boundary of masks.
        masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
        masks = tf.image.pad_to_bounding_box(masks, pad, pad, padded_height,
                                             padded_width)
        masks = tf.reshape(
            masks, [batch_size, num_masks, padded_height, padded_width, 1])

        # Projects target box locations and sizes to corresponding cropped
        # mask coordinates.
        gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(value=boxes,
                                                          num_or_size_splits=4,
                                                          axis=2)
        bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(value=target_boxes,
                                                          num_or_size_splits=4,
                                                          axis=2)
        gt_heights = gt_y_max - gt_y_min + _EPSILON
        gt_widths = gt_x_max - gt_x_min + _EPSILON
        # Vertical quantities scale with the mask height and horizontal ones
        # with the mask width. (Previously x offsets used `height` and box
        # heights used `width`, which was only right for square masks.)
        y_transform = (bb_y_min - gt_y_min) * height / gt_heights + pad
        x_transform = (bb_x_min - gt_x_min) * width / gt_widths + pad
        h_transform = (bb_y_max - bb_y_min) * height / gt_heights
        w_transform = (bb_x_max - bb_x_min) * width / gt_widths

        # Largest valid (y, x) index inside the padded mask.
        boundaries = tf.concat([
            tf.cast(tf.ones_like(y_transform) * (padded_height - 1),
                    dtype=tf.float32),
            tf.cast(tf.ones_like(x_transform) * (padded_width - 1),
                    dtype=tf.float32)
        ],
                               axis=-1)

        # Reshape tensors to have the right shape for selective_crop_and_resize.
        transformed_boxes = tf.concat(
            [y_transform, x_transform, h_transform, w_transform], -1)
        # Each mask plays the role of its own "level", so box i is cropped
        # from mask i.
        levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
                         [batch_size, 1])

        cropped_masks = selective_crop_and_resize(masks,
                                                  transformed_boxes,
                                                  levels,
                                                  boundaries,
                                                  output_size,
                                                  sample_offset=sample_offset)
        # Drop the singleton channel axis added above.
        cropped_masks = tf.squeeze(cropped_masks, axis=-1)

    return cropped_masks
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5):
    """Crop and resize boxes on a set of feature maps.

    Given multiple feature maps indexed by level, and a set of boxes each
    assigned to one level, crops every box from its feature map and resizes
    it to a fixed (output_size, output_size) grid.

    This follows the ROIAlign technique (see
    https://arxiv.org/pdf/1703.06870.pdf, figure 3 for reference): for each
    output grid point the four neighboring feature points are combined by
    bilinear interpolation.

    For performance, the gather and the interpolation are done for all levels
    in a single operation:
      1. The multi-level features are gathered into a
         [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
         Tensor holding the four neighboring feature points of every vertex
         of the output grid.
      2. An interpolation kernel of shape
         [batch_size, num_boxes, output_size*2, output_size*2] is computed.
         Its last two axes can be seen as stacked 2x2 interpolation kernels,
         one per vertex of the output grid.
      3. The gathered features are multiplied element-wise by the kernel and
         reduced to output_size by a 2x2 average pooling.

    Args:
      features: a 5-D tensor of shape
        [batch_size, num_levels, max_height, max_width, num_filters] from
        which the crops are taken. Its static shape is read with `as_list()`.
      boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
        information of each box w.r.t. the corresponding feature map.
        boxes[:, :, 0:2] are the grid position in (y, x) (float) of the
        top-left corner of each box. boxes[:, :, 2:4] are the box sizes in
        (h, w) (float) in pixels of the corresponding feature map.
      box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1]
        representing the 0-based feature level index of each box.
      boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2]
        representing the boundary (in (y, x)) of the corresponding feature
        map for each box. Grid indices beyond the boundary are clipped to it.
      output_size: a scalar indicating the output crop size.
      sample_offset: a float number in [0, 1] indicating the subpixel sample
        offset from grid point.

    Returns:
      features_per_box: a 5-D tensor of shape
        [batch_size, num_boxes, output_size, output_size, num_filters]
        representing the cropped features.
    """
    (batch_size, num_levels, max_feature_height, max_feature_width,
     num_filters) = features.get_shape().as_list()
    _, num_boxes, _ = boxes.get_shape().as_list()
    # Every output vertex contributes two indices per axis (floor, floor + 1).
    doubled_size = output_size * 2

    # Compute the grid position w.r.t. the corresponding feature map.
    corner_y, corner_x = boxes[:, :, 0], boxes[:, :, 1]
    size_h, size_w = boxes[:, :, 2], boxes[:, :, 3]
    steps = [i + sample_offset for i in range(output_size)]
    box_grid_x = tf.stack(
        [corner_x + step * size_w / output_size for step in steps], axis=2)
    box_grid_y = tf.stack(
        [corner_y + step * size_h / output_size for step in steps], axis=2)

    # Compute indices for gather operation: floor, clamp below at zero and
    # clip both neighbors to the per-box boundary.
    box_grid_y0 = tf.maximum(0., tf.floor(box_grid_y))
    box_grid_x0 = tf.maximum(0., tf.floor(box_grid_x))
    x_limit = tf.expand_dims(boundaries[:, :, 1], -1)
    y_limit = tf.expand_dims(boundaries[:, :, 0], -1)
    x_neighbors = tf.stack([
        tf.minimum(box_grid_x0, x_limit),
        tf.minimum(box_grid_x0 + 1, x_limit),
    ], axis=3)
    y_neighbors = tf.stack([
        tf.minimum(box_grid_y0, y_limit),
        tf.minimum(box_grid_y0 + 1, y_limit),
    ], axis=3)

    flat_neighbor_shape = [batch_size, num_boxes, doubled_size]
    x_indices = tf.cast(tf.reshape(x_neighbors, flat_neighbor_shape),
                        dtype=tf.int32)
    y_indices = tf.cast(tf.reshape(y_neighbors, flat_neighbor_shape),
                        dtype=tf.int32)

    # Strides of the flattened [batch, level, height, width] layout.
    height_dim_offset = max_feature_width
    level_dim_offset = max_feature_height * height_dim_offset
    batch_dim_offset = num_levels * level_dim_offset

    batch_term = tf.tile(
        tf.reshape(
            tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
        [1, num_boxes, doubled_size, doubled_size])
    level_term = tf.tile(
        tf.reshape(box_levels * level_dim_offset,
                   [batch_size, num_boxes, 1, 1]),
        [1, 1, doubled_size, doubled_size])
    row_term = tf.tile(
        tf.reshape(y_indices * height_dim_offset,
                   [batch_size, num_boxes, doubled_size, 1]),
        [1, 1, 1, doubled_size])
    column_term = tf.tile(
        tf.reshape(x_indices, [batch_size, num_boxes, 1, doubled_size]),
        [1, 1, doubled_size, 1])
    indices = tf.reshape(batch_term + level_term + row_term + column_term,
                         [-1])

    flat_features = tf.reshape(features, [-1, num_filters])
    gathered_shape = [
        batch_size, num_boxes, doubled_size, doubled_size, num_filters
    ]
    features_per_box = tf.reshape(tf.gather(flat_features, indices),
                                  gathered_shape)

    # The RoIAlign feature f can be computed by bilinear interpolation of four
    # neighboring feature points f0, f1, f2, and f3.
    # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
    #                       [f10, f11]]
    # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
    # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
    ly = box_grid_y - box_grid_y0
    lx = box_grid_x - box_grid_x0
    hy = 1.0 - ly
    hx = 1.0 - lx
    kernel_x = tf.reshape(tf.stack([hx, lx], axis=3),
                          [batch_size, num_boxes, 1, doubled_size])
    kernel_y = tf.reshape(tf.stack([hy, ly], axis=3),
                          [batch_size, num_boxes, doubled_size, 1])
    # Implicit broadcasting builds the full kernel. The factor 4 cancels the
    # averaging done by the 2x2 pooling below.
    interpolation_kernel = kernel_y * kernel_x * 4

    # Interpolates the gathered features with computed interpolation kernels.
    features_per_box *= tf.cast(tf.expand_dims(interpolation_kernel, axis=4),
                                dtype=features_per_box.dtype)
    pooling_input_shape = [
        batch_size * num_boxes, doubled_size, doubled_size, num_filters
    ]
    features_per_box = tf.reshape(features_per_box, pooling_input_shape)
    features_per_box = tf.nn.avg_pool2d(input=features_per_box,
                                        ksize=[1, 2, 2, 1],
                                        strides=[1, 2, 2, 1],
                                        padding='VALID')
    output_shape = [
        batch_size, num_boxes, output_size, output_size, num_filters
    ]
    features_per_box = tf.reshape(features_per_box, output_shape)

    return features_per_box
    def _parse_train_data(self, data):
        """Turns one decoded example into a training image and its labels.

        Args:
          data: the decoded tensor dictionary from TfExampleDecoder.

        Returns:
          image: image tensor that is preprocessed to have normalized value
            and dimension [output_size[0], output_size[1], 3]
          labels: a dictionary of tensors used for training. The following
            describes {key: value} pairs in the dictionary.
            image_info: a 2D `Tensor` that encodes the information of the
              image and the applied preprocessing, as returned by
              `input_utils.resize_and_crop_image`. Rows 0 and 1 are
              [original_height, original_width] and
              [scaled_height, scaled_width]; row 2 is used here as the image
              scale and row 3 as the crop offset.
            anchor_boxes: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor
              with shape [height_l, width_l, 4] representing anchor boxes at
              each level.
            rpn_score_targets: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor
              with shape [height_l, width_l, anchors_per_location]. The
              height_l and width_l represent the dimension of class logits at
              l-th level.
            rpn_box_targets: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor
              with shape [height_l, width_l, anchors_per_location * 4]. The
              height_l and width_l represent the dimension of bounding box
              regression output at l-th level.
            gt_boxes: Groundtruth bounding box annotations. The box is
              represented in [y1, x1, y2, x2] format. The coordinates are
              w.r.t the scaled image that is fed to the network. The tensor
              is padded with -1 to the fixed dimension
              [self._max_num_instances, 4].
            gt_classes: Groundtruth classes annotations. The tensor is padded
              with -1 to the fixed dimension [self._max_num_instances].
            gt_masks: groundtruth masks cropped by the bounding box and
              resized to a fixed size determined by mask_crop_size. Only
              present when masks are included.
        """
        gt_classes = data['groundtruth_classes']
        gt_boxes = data['groundtruth_boxes']
        if self._include_mask:
            gt_masks = data['groundtruth_instance_masks']

        crowd_flags = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(gt_classes)[0]

            def _non_crowd_indices():
                return tf.where(tf.logical_not(crowd_flags))[:, 0]

            def _all_indices():
                return tf.cast(tf.range(num_groundtruths), tf.int64)

            with tf.control_dependencies([num_groundtruths, crowd_flags]):
                # With no crowd annotations at all, keep every groundtruth.
                kept = tf.cond(tf.greater(tf.size(crowd_flags), 0),
                               _non_crowd_indices, _all_indices)
            gt_classes = tf.gather(gt_classes, kept)
            gt_boxes = tf.gather(gt_boxes, kept)
            if self._include_mask:
                gt_masks = tf.gather(gt_masks, kept)

        # Gets original image and its size.
        raw_image = data['image']
        original_size = tf.shape(raw_image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(raw_image)

        # Flips image (and its annotations) randomly when enabled.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, gt_boxes, gt_masks = (
                    input_utils.random_horizontal_flip(
                        image, gt_boxes, gt_masks))
            else:
                image, gt_boxes = input_utils.random_horizontal_flip(
                    image, gt_boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        gt_boxes = box_utils.denormalize_boxes(gt_boxes, original_size)

        # Resizes and crops image.
        padded_size = input_utils.compute_padded_size(self._output_size,
                                                      2**self._max_level)
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=padded_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()
        scaled_size = (image_height, image_width)

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        gt_boxes = input_utils.resize_and_crop_boxes(gt_boxes, image_scale,
                                                     scaled_size, offset)

        # Filters out ground truth boxes that are all zeros.
        non_empty = box_utils.get_non_empty_box_indices(gt_boxes)
        gt_boxes = tf.gather(gt_boxes, non_empty)
        gt_classes = tf.gather(gt_classes, non_empty)
        if self._include_mask:
            gt_masks = tf.gather(gt_masks, non_empty)
            # Add the crop offset back to both box corners, then normalize
            # by image_info[1, :] to get crop windows for the masks.
            corner_offset = tf.cast(tf.tile(tf.expand_dims(offset, axis=0),
                                            [1, 2]),
                                    dtype=tf.float32)
            mask_windows = box_utils.normalize_boxes(gt_boxes + corner_offset,
                                                     image_info[1, :])
            num_masks = tf.shape(gt_masks)[0]
            gt_masks = tf.image.crop_and_resize(
                tf.expand_dims(gt_masks, axis=-1),
                mask_windows,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method='bilinear')
            gt_masks = tf.squeeze(gt_masks, axis=-1)

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, scaled_size)
        anchor_labeler = anchor.RpnAnchorLabeler(input_anchor,
                                                 self._rpn_match_threshold,
                                                 self._rpn_unmatched_threshold,
                                                 self._rpn_batch_size_per_im,
                                                 self._rpn_fg_fraction)
        class_column = tf.cast(tf.expand_dims(gt_classes, axis=-1),
                               dtype=tf.float32)
        rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
            gt_boxes, class_column)

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
            'rpn_score_targets': rpn_score_targets,
            'rpn_box_targets': rpn_box_targets,
            'gt_boxes': input_utils.pad_to_fixed_size(
                gt_boxes, self._max_num_instances, -1),
            'gt_classes': input_utils.pad_to_fixed_size(
                gt_classes, self._max_num_instances, -1),
        }
        if self._include_mask:
            labels['gt_masks'] = input_utils.pad_to_fixed_size(
                gt_masks, self._max_num_instances, -1)

        return image, labels
Пример #19
0
 def _make_pairs(x):
     """Repeats every row of the rank-2 tensor `x` twice, back to back."""
     # [n, d] -> [n, 1, d] -> [n, 2, d]; the reshape then interleaves the
     # copies as row0, row0, row1, row1, ...
     duplicated = tf.tile(x[:, tf.newaxis, :], [1, 2, 1])
     return tf.reshape(duplicated, [-1, x.shape[-1]])
Пример #20
0
    def evaluate_multiclass(self, predictions, weights):
        """Evaluates the multiclass hinge loss on the given predictions.

        Given a rank-2 `Tensor` of predictions with shape (n, k), where n is
        the number of examples and k is the number of classes, and a rank-2
        `Tensor` of weights with shape (m, k), where m is broadcastable to n,
        this method will return a `Tensor` of shape (n,) where the ith
        element is:

        ```python
        hinge_loss[i] = weights[i, 0] + sum_{j=0}^{num_classes - 2} (
            (weights[i, j+1] - weights[i, j]) * max_{l=j+1}^{num_classes-1}
            max{0, margin + predictions[i, l] - mean_{k=0}^j predictions[i, k]}
        )
        ```

        where we've assumed (without loss of generality) that the weights and
        predictions are ordered in such a way that
        weights[i, j] <= weights[i, j+1]. In the implementation, of course, we
        cannot simply assume this, and actually perform a sort.

        This is admittedly a somewhat strange-seeming formulation, and it's
        complicated and expensive to implement. The reason it was chosen is
        that it satisfies the following properties:

        1. It's shift invariant: adding a constant to every weight will shift
           the loss by the same constant.
        2. It's scale invariant: multiplying every weight by a constant will
           scale the loss by the same constant.
        3. When there are only two classes, it's equivalent to the binary
           hinge loss implemented in evaluate_binary_classification().
        4. When the weights represent a misclassification rate (i.e.
           weights[i, 0] = 0 and weights[i, j] = 1 for j > 0, assuming the
           weights are sorted), it's equivalent to the usual multiclass hinge
           misclassification loss.
        5. It's convex in the predictions, and upper bounds the multiclass 0-1
           loss when margin >= 1.

        Args:
          predictions: a `Tensor` of shape (n, k), where n is the number of
            examples and k is the number of classes.
          weights: a `Tensor` of shape (m, k), where m is broadcastable to n.
            This `Tensor` is *not* necessarily non-negative.

        Returns:
          A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
          hinge losses for each example.

        Raises:
          TypeError: if "predictions" is not a floating-point `Tensor`, or
            "weights" is not a `Tensor`.
          ValueError: if "predictions" and "weights" have different numbers of
            columns (i.e. if the number of classes is inconsistent).
        """
        num_classes = helpers.get_num_columns_of_2d_tensor(
            predictions, name="multiclass predictions")
        weights_num_classes = helpers.get_num_columns_of_2d_tensor(
            weights, name="weights")
        if weights_num_classes != num_classes:
            raise ValueError(
                "weights must have the same number of columns as "
                "predictions ({} vs. {}): did you specify num_classes "
                "correctly when you created your context?".format(
                    weights_num_classes, num_classes))
        dtype = predictions.dtype.base_dtype
        if not dtype.is_floating:
            raise TypeError("multiclass predictions must be floating-point")
        zero = tf.zeros(1, dtype=dtype)

        weights_rows = tf.shape(weights)[0]
        predictions_rows = tf.shape(predictions)[0]

        # We start out by finding a permutation for each row that will cause the
        # weights to be nondecreasing.
        weights_permutation = tf.argsort(weights, axis=1)
        # This won't work if predictions_rows isn't divisible by weights_rows
        # (tf.stack() below will fail), but we require weights to be broadcastable
        # to predictions (usually, weights_rows will either be 1, or equal to
        # predictions_rows).
        # Floor division keeps the repeat count an integer tensor: true
        # division of the int32 shape tensors would yield a float, which
        # tf.tile() does not accept as `multiples`.
        predictions_permutation = tf.tile(
            weights_permutation, [predictions_rows // weights_rows, 1])

        # First we create a Tensor of shape [weights_rows, num_classes, 2], for
        # which:
        #   weights_indices[i, j, 0] = i
        #   weights_indices[i, j, 1] = weights_permutation[i, j]
        # Next, we use gather_nd to re-organize the weights such that:
        #   new_weights[i, j] = old_weights[i, weights_permutation[i, j]]
        weights_iota = tf.range(weights_rows)
        weights_iota = tf.expand_dims(weights_iota, axis=-1)
        weights_iota = tf.tile(weights_iota, [1, num_classes])
        weights_indices = tf.stack([weights_iota, weights_permutation], axis=2)
        weights = tf.gather_nd(tf.cast(weights, dtype=dtype), weights_indices)

        # Next we create a Tensor of shape [predictions_rows, num_classes, 2], for
        # which:
        #   predictions_indices[i, j, 0] = i
        #   predictions_indices[i, j, 1] = predictions_permutation[i, j]
        # Next, we use gather_nd to re-organize the predictions such that:
        #   new_predictions[i, j] =
        #       old_predictions[i, predictions_permutation[i, j]]
        predictions_iota = tf.range(predictions_rows)
        predictions_iota = tf.expand_dims(predictions_iota, axis=-1)
        predictions_iota = tf.tile(predictions_iota, [1, num_classes])
        predictions_indices = tf.stack(
            [predictions_iota, predictions_permutation], axis=2)
        predictions = tf.gather_nd(predictions, predictions_indices)

        # At this point, every row of weights and predictions has been sorted in
        # such a way that the weights are nondecreasing. We wish to calculate the
        # following:
        #   result[i] = weights[i, 0] + \sum_{j=0}^{num_classes - 2} (
        #     (weights[i, j+1] - weights[i, j]) * max_{l=j+1}^{num_classes-1}
        #     max{0, margin + predictions[i, l] - mean_{k=0}^j predictions[i, k]}
        #   )
        # Notice that the innermost max is a hinge.
        result = weights[:, 0]
        # num_classes is a Python int, so this loop is unrolled when the
        # ops are built.
        for ii in range(num_classes - 1):
            scale = weights[:, ii + 1] - weights[:, ii]
            # The "included" predictions are those in the above max over l, and the
            # "excluded" predictions are those in the above mean over k.
            included = predictions[:, (ii + 1):num_classes]
            included = tf.reduce_max(included, axis=1)
            excluded = predictions[:, 0:(ii + 1)]
            excluded = tf.reduce_mean(excluded, axis=1)
            result += scale * tf.maximum(zero,
                                         self._margin + included - excluded)

        return result
Пример #21
0
def simpson(func, lower, upper, num_points=1001, dtype=None, name=None):
  """Approximates a definite integral with composite Simpson's 1/3 rule.

  `func` is sampled on an evenly spaced grid of `num_points` points between
  `lower` and `upper`. The samples are combined with the Simpson weights
  `1, 4, 2, 4, ..., 2, 4, 1` and scaled by `dx / 3`, which corresponds to
  integrating piecewise quadratic interpolants of `func` [1].

  ## References
  [1] Weisstein, Eric W. "Simpson's Rule." From MathWorld - A Wolfram Web
      Resource. http://mathworld.wolfram.com/SimpsonsRule.html

  ## Example
  ```python
    f = lambda x: x*x
    a = tf.constant(0.0)
    b = tf.constant(3.0)
    simpson(f, a, b, num_points=1001) # 9.0
  ```

  Args:
    func: Python callable to integrate. It takes a single `Tensor` and should
      return a `Tensor` of the same shape and dtype. It is called once with the
      whole grid, a `Tensor` of shape `lower.shape + [num_points]` and of the
      same dtype as `lower`.
    lower: `Tensor` or Python float with the lower limits of integration.
      `func` is integrated between each pair of corresponding `lower` and
      `upper` entries.
    upper: `Tensor` of the same shape and dtype as `lower`, or Python float,
      with the upper limits of integration.
    num_points: Scalar int32 `Tensor`. Number of grid points at which `func` is
      evaluated. Must be odd and at least 3 (both are asserted).
      Default value: 1001.
    dtype: Optional `tf.Dtype` used to convert `lower` and `upper`. The result
      has the same dtype.
      Default value: None which maps to the dtype of `lower`.
    name: Python str. The name to give to the ops created by this function.
      Default value: None which maps to 'integrate_simpson_composite'.

  Returns:
    `Tensor` with the value of the definite integral; the trailing grid axis of
    `func(grid)` is summed out.
  """
  with tf.compat.v1.name_scope(
      name, default_name='integrate_simpson_composite', values=[lower, upper]):
    lower = tf.convert_to_tensor(lower, dtype=dtype, name='lower')
    # From here on everything follows the dtype that `lower` ended up with.
    dtype = lower.dtype
    upper = tf.convert_to_tensor(upper, dtype=dtype, name='upper')
    num_points = tf.convert_to_tensor(
        num_points, dtype=tf.int32, name='num_points')

    at_least_three = tf.debugging.assert_greater_equal(num_points, 3)
    is_odd = tf.debugging.assert_equal(num_points % 2, 1)

    with tf.compat.v1.control_dependencies([at_least_three, is_odd]):
      # Width of one grid interval: the range is split into num_points - 1
      # equal pieces.
      span = upper - lower
      num_intervals = tf.cast(num_points, dtype=dtype) - 1
      dx = span / num_intervals

      # Add a trailing axis so the limits broadcast against the grid offsets.
      step = tf.expand_dims(dx, -1)
      start = tf.expand_dims(lower, -1)
      grid = start + step * tf.cast(tf.range(num_points), dtype=dtype)

      # Simpson weights: 1, then (num_points - 3) // 2 copies of (4, 2), then
      # the closing (4, 1).
      simpson_weights = tf.concat(
          [
              tf.constant([1.0], dtype=dtype),
              tf.tile(
                  tf.constant([4.0, 2.0], dtype=dtype),
                  [(num_points - 3) // 2]),
              tf.constant([4.0, 1.0], dtype=dtype),
          ],
          axis=0)

    samples = func(grid)
    return tf.reduce_sum(samples * simpson_weights, axis=-1) * dx / 3
Пример #22
0
def update_confusion_matrix_variables(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    top_k=None,
    class_id=None,
    sample_weight=None,
    multi_label=False,
    label_weights=None,
    thresholds_distributed_evenly=False,
):
    """Returns op to update the given confusion matrix variables.

    For every pair of values in y_true and y_pred:

    true_positive: y_true == True and y_pred > thresholds
    false_negatives: y_true == True and y_pred <= thresholds
    true_negatives: y_true == False and y_pred <= thresholds
    false_positive: y_true == False and y_pred > thresholds

    The results will be weighted and added together. When multiple thresholds are
    provided, we will repeat the same for every threshold.

    For estimation of these metrics over a stream of data, the function creates an
    `update_op` operation that updates the given variables.

    If `sample_weight` is `None`, weights default to 1.
    Use weights of 0 to mask values.

    Args:
      variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
        and corresponding variables to update as values. If `None`, the
        function returns immediately without creating any op.
      y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
      y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
        the range `[0, 1]`.
      thresholds: A float value, float tensor, python list, or tuple of float
        thresholds in `[0, 1]`, or NEG_INF (used when top_k is set).
      top_k: Optional int, indicates that the positive labels should be limited to
        the top k predictions.
      class_id: Optional int, limits the prediction and labels to the class
        specified by this argument.
      sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
        `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
        be either `1`, or the same as the corresponding `y_true` dimension).
      multi_label: Optional boolean indicating whether multidimensional
        prediction/labels should be treated as multilabel responses, or flattened
        into a single label. When True, the values of `variables_to_update` must
        have a second dimension equal to the number of labels in y_true and
        y_pred, and those tensors must not be RaggedTensors.
      label_weights: (optional) tensor of non-negative weights for multilabel
        data. The weights are applied when calculating TP, FP, FN, and TN without
        explicit multilabel handling (i.e. when the data is to be flattened).
        Must be `None` when `multi_label` is True.
      thresholds_distributed_evenly: Boolean, whether the thresholds are evenly
        distributed within the list. An optimized method will be used if this is
        the case. See _update_confusion_matrix_variables_optimized() for more
        details.

    Returns:
      Update op, or `None` when `variables_to_update` is `None`.

    Raises:
      ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
        `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
        `variables_to_update` contains invalid keys, or if `label_weights` is
        given together with `multi_label=True`.
    """
    if multi_label and label_weights is not None:
        raise ValueError(
            "`label_weights` for multilabel data should be handled "
            "outside of `update_confusion_matrix_variables` when "
            "`multi_label` is True.")
    # Nothing to update: return None rather than an op.
    if variables_to_update is None:
        return
    # At least one key must be a recognised confusion-matrix entry.
    if not any(key
               for key in variables_to_update if key in list(ConfusionMatrix)):
        raise ValueError(
            "Please provide at least one valid confusion matrix "
            "variable to update. Valid variable key options are: "
            f'"{list(ConfusionMatrix)}". Received: "{variables_to_update.keys()}"'
        )

    # Labels, predictions, thresholds and weights are all cast to the dtype of
    # the first variable in the dict.
    variable_dtype = list(variables_to_update.values())[0].dtype

    y_true = tf.cast(y_true, dtype=variable_dtype)
    y_pred = tf.cast(y_pred, dtype=variable_dtype)

    if thresholds_distributed_evenly:
        # Check whether the thresholds has any leading or tailing epsilon added
        # for floating point imprecision. The leading and tailing threshold will be
        # handled a bit differently as the corner case.
        # At this point, thresholds should be a list/array with more than 2 items,
        # and ranged between [0, 1]. See is_evenly_distributed_thresholds() for more
        # details.
        # This must be evaluated before `thresholds` is converted to a tensor
        # below, and is only defined (and only used) on this code path.
        thresholds_with_epsilon = thresholds[0] < 0.0 or thresholds[-1] > 1.0

    thresholds = tf.convert_to_tensor(thresholds, dtype=variable_dtype)
    # Static size of the leading thresholds dimension.
    num_thresholds = thresholds.shape.as_list()[0]

    if multi_label:
        # True when `thresholds` is rank 1, i.e. a single set of thresholds is
        # shared by every label.
        one_thresh = tf.equal(
            tf.cast(1, dtype=tf.int32),
            tf.rank(thresholds),
            name="one_set_of_thresholds_cond",
        )
    else:
        # NOTE(review): presumably checks ragged compatibility and replaces the
        # inputs by their flat values; the helper is defined elsewhere - confirm.
        [y_pred, y_true
         ], _ = ragged_assert_compatible_and_get_flat_values([y_pred, y_true],
                                                             sample_weight)
        one_thresh = tf.cast(True, dtype=tf.bool)

    # Every key must be a recognised confusion-matrix entry.
    invalid_keys = [
        key for key in variables_to_update if key not in list(ConfusionMatrix)
    ]
    if invalid_keys:
        raise ValueError(
            f'Invalid keys: "{invalid_keys}". '
            f'Valid variable key options are: "{list(ConfusionMatrix)}"')

    # NOTE(review): squeeze_or_expand_dimensions presumably aligns the ranks of
    # predictions, labels and weights - confirm against losses_utils.
    if sample_weight is None:
        y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(
            y_pred, y_true)
    else:
        sample_weight = tf.cast(sample_weight, dtype=variable_dtype)
        (
            y_pred,
            y_true,
            sample_weight,
        ) = losses_utils.squeeze_or_expand_dimensions(
            y_pred, y_true, sample_weight=sample_weight)
    y_pred.shape.assert_is_compatible_with(y_true.shape)

    if top_k is not None:
        y_pred = _filter_top_k(y_pred, top_k)
    if class_id is not None:
        # Keep only the slice for `class_id` along the last axis.
        y_true = y_true[..., class_id]
        y_pred = y_pred[..., class_id]

    if thresholds_distributed_evenly:
        # Delegate to the optimized implementation for evenly spaced thresholds.
        return _update_confusion_matrix_variables_optimized(
            variables_to_update,
            y_true,
            y_pred,
            thresholds,
            multi_label=multi_label,
            sample_weights=sample_weight,
            label_weights=label_weights,
            thresholds_with_epsilon=thresholds_with_epsilon,
        )

    pred_shape = tf.shape(y_pred)
    num_predictions = pred_shape[0]
    # Number of labels per prediction: 1 for rank-1 predictions, otherwise the
    # product of all non-leading dimensions.
    if y_pred.shape.ndims == 1:
        num_labels = 1
    else:
        num_labels = tf.math.reduce_prod(pred_shape[1:], axis=0)
    # Tile factor along the label axis: `num_labels` when `one_thresh` is true,
    # otherwise 1.
    thresh_label_tile = tf.where(one_thresh, num_labels,
                                 tf.ones([], dtype=tf.int32))

    # Reshape predictions and labels, adding a dim for thresholding.
    if multi_label:
        predictions_extra_dim = tf.expand_dims(y_pred, 0)
        labels_extra_dim = tf.expand_dims(tf.cast(y_true, dtype=tf.bool), 0)
    else:
        # Flatten predictions and labels when not multilabel.
        predictions_extra_dim = tf.reshape(y_pred, [1, -1])
        labels_extra_dim = tf.reshape(tf.cast(y_true, dtype=tf.bool), [1, -1])

    # Tile the thresholds for every prediction.
    if multi_label:
        thresh_pretile_shape = [num_thresholds, 1, -1]
        thresh_tiles = [1, num_predictions, thresh_label_tile]
        data_tiles = [num_thresholds, 1, 1]
    else:
        thresh_pretile_shape = [num_thresholds, -1]
        thresh_tiles = [1, num_predictions * num_labels]
        data_tiles = [num_thresholds, 1]

    thresh_tiled = tf.tile(tf.reshape(thresholds, thresh_pretile_shape),
                           tf.stack(thresh_tiles))

    # Tile the predictions for every threshold.
    preds_tiled = tf.tile(predictions_extra_dim, data_tiles)

    # Compare predictions and threshold.
    pred_is_pos = tf.greater(preds_tiled, thresh_tiled)

    # Tile labels by number of thresholds
    label_is_pos = tf.tile(labels_extra_dim, data_tiles)

    if sample_weight is not None:
        # Broadcast the weights to the predictions, then lay them out like the
        # tiled data (here `thresh_tiles` is reused as the reshape target).
        sample_weight = tf.__internal__.ops.broadcast_weights(
            tf.cast(sample_weight, dtype=variable_dtype), y_pred)
        weights_tiled = tf.tile(tf.reshape(sample_weight, thresh_tiles),
                                data_tiles)
    else:
        weights_tiled = None

    if label_weights is not None and not multi_label:
        # Label weights are folded into the sample weights (or used alone when
        # no sample weights were given).
        label_weights = tf.expand_dims(label_weights, 0)
        label_weights = tf.__internal__.ops.broadcast_weights(
            label_weights, y_pred)
        label_weights_tiled = tf.tile(tf.reshape(label_weights, thresh_tiles),
                                      data_tiles)
        if weights_tiled is None:
            weights_tiled = label_weights_tiled
        else:
            weights_tiled = tf.multiply(weights_tiled, label_weights_tiled)

    update_ops = []

    # Adds to `var` the (optionally weighted) count of positions where both
    # boolean masks are true, summed over axis 1.
    def weighted_assign_add(label, pred, weights, var):
        label_and_pred = tf.cast(tf.logical_and(label, pred), dtype=var.dtype)
        if weights is not None:
            label_and_pred *= tf.cast(weights, dtype=var.dtype)
        return var.assign_add(tf.reduce_sum(label_and_pred, 1))

    # Maps each confusion-matrix key to the (label mask, prediction mask) pair
    # whose conjunction is counted for that key.
    loop_vars = {
        ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
    }
    update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
    update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
    update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update

    # The negated masks are only built when a requested entry needs them.
    # Entries registered here but absent from `variables_to_update` are
    # filtered out by the loop below.
    if update_fn or update_tn:
        pred_is_neg = tf.logical_not(pred_is_pos)
        loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos,
                                                      pred_is_neg)

    if update_fp or update_tn:
        label_is_neg = tf.logical_not(label_is_pos)
        loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg,
                                                      pred_is_pos)
        if update_tn:
            loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (
                label_is_neg,
                pred_is_neg,
            )

    for matrix_cond, (label, pred) in loop_vars.items():

        # Only variables the caller actually supplied are updated.
        if matrix_cond in variables_to_update:
            update_ops.append(
                weighted_assign_add(label, pred, weights_tiled,
                                    variables_to_update[matrix_cond]))

    return tf.group(update_ops)
        def step_fn(inputs):
            """Evaluates one per-replica batch and updates the test metrics.

            Args:
              inputs: mapping with a 'features' entry (the images) and a
                'labels' entry.

            The images are tiled `ensemble_size` times, lambdas are drawn for
            every ensemble member, and the model is run once on the tiled
            batch. For the 'clean' dataset, per-member, Gibbs, ensemble and
            diversity metrics are updated; for any other dataset only the
            corresponding entries of `corrupt_metrics` are updated.
            """
            # Only the images are tiled here; labels are tiled later, and only
            # where a per-member target is required.
            features = inputs['features']
            labels = inputs['labels']
            tiled_images = tf.tile(features, [ensemble_size, 1, 1, 1])

            # Draw the lambdas; when num_eval_samples is non-negative the mean
            # lambda is prepended to the samples along axis 1.
            sampled_lambdas = log_uniform_sample(n_samples, lambda_parameters)
            if num_eval_samples < 0:
                lambdas = sampled_lambdas
            else:
                mean_lambdas = tf.expand_dims(
                    log_uniform_mean(lambda_parameters), 1)
                lambdas = tf.concat((mean_lambdas, sampled_lambdas), 1)

            # lambdas has shape (ens size, samples, dim of lambdas); repeat
            # along axis 1 and flatten so there is one row per tiled image.
            batch_lambdas = tf.reshape(
                tf.repeat(lambdas, per_core_batch_size, axis=1),
                (ensemble_size * per_core_batch_size, -1))

            # Single forward pass over all members, then split per member.
            logits = model([tiled_images, batch_lambdas], training=False)
            all_probs = tf.nn.softmax(logits)
            per_probs = tf.split(all_probs,
                                 num_or_size_splits=ensemble_size,
                                 axis=0)

            is_clean = dataset_name == 'clean'

            # Per-member metrics and Gibbs metrics (average per-member
            # performance) are only tracked on the clean dataset.
            if is_clean:
                for member in range(FLAGS.ensemble_size):
                    # Record the first lambda sample of each batch-ensemble
                    # member.
                    split_index = member * (ensemble_size //
                                            FLAGS.ensemble_size)
                    member_probs = per_probs[split_index]
                    member_nll = (
                        tf.keras.losses.sparse_categorical_crossentropy(
                            labels, member_probs))
                    nll_key = 'test/nll_member_{}'.format(member)
                    accuracy_key = 'test/accuracy_member_{}'.format(member)
                    metrics[nll_key].update_state(member_nll)
                    metrics[accuracy_key].update_state(labels, member_probs)

                tiled_labels = tf.tile(labels, [ensemble_size])
                gibbs_nll = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        tiled_labels, logits, from_logits=True))
                metrics['test/gibbs_nll'].update_state(gibbs_nll)
                metrics['test/gibbs_accuracy'].update_state(
                    tiled_labels, all_probs)

            # Ensemble performance: the loss comes from the helper, the
            # probabilities are the mean over the per-member splits.
            ensemble_nll = ensemble_crossentropy(labels, logits,
                                                 ensemble_size)
            ensemble_probs = tf.reduce_mean(per_probs, axis=0)

            if is_clean:
                metrics['test/negative_log_likelihood'].update_state(
                    ensemble_nll)
                metrics['test/accuracy'].update_state(labels, ensemble_probs)
                metrics['test/ece'].add_batch(ensemble_probs, label=labels)

                # Pairwise diversity between the members, clean data only.
                stacked_member_probs = tf.stack(per_probs, axis=0)
                diversity = rm.metrics.AveragePairwiseDiversity()
                diversity.add_batch(stacked_member_probs,
                                    num_models=ensemble_size)
                for key, value in diversity.result().items():
                    metrics['test/' + key].update_state(value)
            else:
                nll_metric = corrupt_metrics['test/nll_{}'.format(
                    dataset_name)]
                nll_metric.update_state(ensemble_nll)
                accuracy_metric = corrupt_metrics['test/accuracy_{}'.format(
                    dataset_name)]
                accuracy_metric.update_state(labels, ensemble_probs)
                ece_metric = corrupt_metrics['test/ece_{}'.format(
                    dataset_name)]
                ece_metric.add_batch(ensemble_probs, label=labels)
Пример #24
0
def _replicate(n, tensor):
  """Replicate the input tensor n times along a new (major) dimension.

  Args:
    n: Number of copies to make (scalar integer).
    tensor: `Tensor` (or value convertible to one) to replicate.

  Returns:
    `Tensor` with a new leading axis of size `n`; each slice along that axis
    is a copy of `tensor`.
  """
  # TODO(axch) Does this already exist somewhere?  Should it get contributed?
  # Derive the number of trailing `1` multiples from the run-time rank rather
  # than from the static `tensor.shape`: converting a static shape with
  # unknown dimensions to a Tensor raises, while the rank is always available.
  trailing_ones = tf.ones([tf.rank(tensor)], dtype=tf.int32)
  multiples = tf.concat([[n], trailing_ones], axis=0)
  return tf.tile(tf.expand_dims(tensor, axis=0), multiples)
def get_true_shapes(input_tensor):
  """Returns the per-example shape of `input_tensor`, repeated for each example.

  Args:
    input_tensor: `Tensor` whose leading dimension is treated as the batch.

  Returns:
    2-D `Tensor` with one row per batch element; every row is the dynamic
    shape of `input_tensor` with the leading (batch) dimension removed.
  """
  dynamic_shape = tf.shape(input_tensor)
  # Shape of a single example as a [1, rank - 1] row, ready to be tiled.
  example_shape_row = tf.expand_dims(dynamic_shape[1:], axis=0)
  return tf.tile(example_shape_row, [dynamic_shape[0], 1])
Пример #26
0
    def __init__(self,
                 num_timesteps,
                 period,
                 frequency_multipliers,
                 drift_scale,
                 initial_state_prior,
                 observation_noise_scale=0.,
                 initial_step=0,
                 validate_args=False,
                 allow_nan_stats=True,
                 name=None):
        """Constructs a smooth seasonal state space model.

        Args:
          num_timesteps: Scalar `int` `Tensor` number of timesteps to model
            with this distribution.
          period: positive scalar `float` `Tensor` giving the number of
            timesteps required for the longest cyclic effect to repeat.
          frequency_multipliers: One-dimensional `float` `Tensor` listing the
            frequencies (cyclic components) included in the model, as
            multipliers of the base/fundamental frequency `2. * pi / period`.
            Each component is specified by the number of times it repeats per
            period, and adds two latent dimensions to the model. A smooth
            seasonal model that can represent any periodic function is given
            by `frequency_multipliers = [1, 2, ..., floor(period / 2)]`.
            However, it is often desirable to enforce a smoothness assumption
            (and reduce the computational burden) by dropping some of the
            higher frequencies.
          drift_scale: Scalar (any additional dimensions are treated as batch
            dimensions) `float` `Tensor` indicating the standard deviation of
            the latent state transitions.
          initial_state_prior: instance of `tfd.MultivariateNormal`
            representing the prior distribution on latent states. Must have
            event shape `[num_features]`.
          observation_noise_scale: Scalar (any additional dimensions are
            treated as batch dimensions) `float` `Tensor` indicating the
            standard deviation of the observation noise.
            Default value: `0.`.
          initial_step: scalar `int` `Tensor` specifying the starting
            timestep.
            Default value: `0`.
          validate_args: Python `bool`. Whether to validate input with
            asserts. If `validate_args` is `False`, and the inputs are
            invalid, correct behavior is not guaranteed.
            Default value: `False`.
          allow_nan_stats: Python `bool`. If `False`, raise an exception if a
            statistic (e.g. mean/mode/etc...) is undefined for any batch
            member. If `True`, batch members with valid parameters leading to
            undefined statistics will return NaN for this statistic.
            Default value: `True`.
          name: Python `str` name prefixed to ops created by this class.
            Default value: 'SmoothSeasonalStateSpaceModel'.
        """
        with tf.name_scope(name or 'SmoothSeasonalStateSpaceModel') as name:

            # One dtype for all parameters; note that the observation noise
            # scale does not take part in choosing it.
            dtype = dtype_util.common_dtype([
                period, frequency_multipliers, drift_scale, initial_state_prior
            ])

            def _as_tensor(value, tensor_name):
                # Converts a constructor argument to the common dtype.
                return tf.convert_to_tensor(value=value,
                                            name=tensor_name,
                                            dtype=dtype)

            period = _as_tensor(period, 'period')
            frequency_multipliers = _as_tensor(frequency_multipliers,
                                               'frequency_multipliers')
            drift_scale = _as_tensor(drift_scale, 'drift_scale')
            observation_noise_scale = _as_tensor(observation_noise_scale,
                                                 'observation_noise_scale')

            num_frequencies = static_num_frequencies(frequency_multipliers)

            # The [1., 0.] pattern is repeated once per frequency, giving a
            # single observation row over 2 * num_frequencies latent dims.
            observation_pattern = tf.constant([[1., 0.]], dtype=dtype)
            observation_matrix = tf.tile(observation_pattern,
                                         multiples=[1, num_frequencies])

            transition_matrix = build_smooth_seasonal_transition_matrix(
                period=period,
                frequency_multipliers=frequency_multipliers,
                dtype=dtype)

            self._drift_scale = drift_scale
            self._observation_noise_scale = observation_noise_scale
            self._period = period
            self._frequency_multipliers = frequency_multipliers

            # The same drift scale is used for every latent dimension.
            latent_ones = tf.ones([2 * num_frequencies], dtype=dtype)
            transition_noise = tfd.MultivariateNormalDiag(
                scale_diag=drift_scale[..., tf.newaxis] * latent_ones,
                name='transition_noise')
            observation_noise = tfd.MultivariateNormalDiag(
                scale_diag=observation_noise_scale[..., tf.newaxis],
                name='observation_noise')

            super(SmoothSeasonalStateSpaceModel, self).__init__(
                num_timesteps=num_timesteps,
                transition_matrix=transition_matrix,
                transition_noise=transition_noise,
                observation_matrix=observation_matrix,
                observation_noise=observation_noise,
                initial_state_prior=initial_state_prior,
                initial_step=initial_step,
                allow_nan_stats=allow_nan_stats,
                validate_args=validate_args,
                name=name)
Пример #27
0
def soft_multivariate_quantiles(x,
                                quantiles,
                                quantile_width=None,
                                **kwargs):
  """Computes soft multivariate quantiles via optimal transport.

  Transport multivariate input values in x onto 2^d + 1 weighted points:
  the 2^d vertices of the hypercube {-1, 1}^d plus the origin. Target weights
  are adjusted so that those values in x that are transported to the origin
  correspond to those concentrating around the quantile of interest.

  Args:
   x: Tensor<float> of shape [batch, N, d]. All three dimensions are read from
     the static shape of `x`.
   quantiles: Tensor<float> of shape [r, d], r targeted quantiles of dimension d
   quantile_width: (float) mass given to the bucket supposed to attract points
     whose value concentrate around the desired quantile value. Bigger width
     means that we allow the soft quantile to be a mixture of more points
     further away from the quantile. If None, the width is set at 2/N where N
     is the number of values considered (the size of axis 1 of `x`).
   **kwargs: see sinkhorn.autodiff_sinkhorn for possible extra parameters.

  Returns:
    A Tensor<float> of shape [batch, r, d] with the r multivariate soft
    quantiles of every batch entry.

  """
  quantiles = tf.constant(quantiles, tf.float32)
  batch_size = x.shape[0]
  # `n` is used as a dimension in the shapes below, so it must stay an
  # integer; the float copy is only for arithmetic.
  n = x.shape[1]
  n_float = tf.cast(n, tf.float32)
  d = x.shape[2]
  if quantile_width is None:
    quantile_width = 2 / n_float
  num_quantiles = tf.shape(quantiles)[0]
  hypercube_vertices = tf.constant(
      list(itertools.product([-1, 1], repeat=d)), tf.float32)
  # weights attached to vertices for each quantile. this is n_quantiles x 2^d
  # (after the product over coordinates below).
  weights = quantiles[:, tf.newaxis, :]**(
      0.5 * (1 - hypercube_vertices))[tf.newaxis, Ellipsis]
  weights *= (1 - quantiles)[:, tf.newaxis, :]**(
      0.5 * (1 + hypercube_vertices))[tf.newaxis, Ellipsis]

  weights = (1 - quantile_width) * tf.reduce_prod(weights, axis=2)
  # adding weights for quantile itself (in position 0).
  weights = tf.concat((quantile_width * tf.ones((num_quantiles, 1)), weights),
                      axis=1)
  # augmenting and formatting as batch_size * 2^d + 1 * num_quantiles
  weights = tf.reshape(
      tf.tile(tf.transpose(weights), [batch_size, 1]),
      [batch_size, 2**d + 1, num_quantiles])
  # set target locations, by adding the point at 0 that will absorb the quantile
  # augment it with batch_size
  y = tf.concat((tf.zeros((1, d), dtype=tf.float32), hypercube_vertices),
                axis=0)
  y = tf.reshape(tf.tile(y, [batch_size, 1]), [batch_size, 2**d + 1, d])
  # center x
  x_mean = tf.reduce_mean(x, axis=1)
  x = x - x_mean[:, tf.newaxis, :]
  # Uniform source weights: every one of the n points carries mass 1 / n.
  source_weights = tf.ones(
      [batch_size, n, num_quantiles], dtype=tf.float32) / n_float
  transports = sinkhorn.autodiff_sinkhorn(
      x, y, source_weights, weights, **kwargs)

  # recover convex combinations resulting from transporting to central point
  # in all batches and quantile variations.
  transports = 1 / quantile_width * tf.reshape(transports[:, :, 0, :],
                                               [batch_size, n, -1])
  # apply these convex combinations to data points + recenter.
  all_soft_quantiles = tf.reduce_sum(
      transports[:, :, :, tf.newaxis] *
      x[:, :, tf.newaxis, :],
      axis=1) + x_mean[:, tf.newaxis, :]
  # reshape those quantiles after having applied convex combinations.
  return tf.reshape(all_soft_quantiles, [batch_size, num_quantiles, d])
Пример #28
0
    def train_step(inputs):
        """Runs one training step for the efficientnet ensemble.

        Args:
          inputs: pair `(images, labels)`. Both are tiled
            `FLAGS.ensemble_size` times along the leading axis before the
            forward pass.

        Returns:
          Dict with 'loss/negative_log_likelihood' (the negative
          log-likelihood divided by the number of replicas) and
          'loss/total_loss' (the replica-scaled total loss).
        """
        images, labels = inputs
        tiled_images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
        tiled_labels = tf.tile(labels, [FLAGS.ensemble_size, 1])

        replica_count = tf.cast(strategy.num_replicas_in_sync, tf.float32)
        l2_coeff = tf.cast(FLAGS.l2, tf.float32)

        with tf.GradientTape() as tape:
            logits = tf.cast(model(tiled_images, training=True), tf.float32)
            per_example_nll = tf.keras.losses.categorical_crossentropy(
                tiled_labels,
                logits,
                from_logits=True,
                label_smoothing=FLAGS.label_smoothing)
            negative_log_likelihood = tf.reduce_mean(per_example_nll)

            # Apply l2 on the slow weights and bias terms. This excludes BN
            # parameters and fast weight approximate posterior/prior
            # parameters, but pay caution to their naming scheme.
            regularized_flat = [
                tf.reshape(var, (-1, ))
                for var in model.trainable_variables
                if 'kernel' in var.name or 'bias' in var.name
            ]

            # NOTE(review): FLAGS.l2 is multiplied in here and again through
            # `l2_coeff` on the next line, so the penalty scales with
            # FLAGS.l2 squared - confirm this is intended.
            l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
                tf.concat(regularized_flat, axis=0))
            loss = negative_log_likelihood + l2_coeff * l2_loss
            # Scale by the replica count before differentiating.
            scaled_loss = loss / replica_count

        grads = tape.gradient(scaled_loss, model.trainable_weights)

        # Separate learning rate implementation.
        lr_multiplier = FLAGS.fast_weight_lr_multiplier
        if lr_multiplier == 1.0:
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        else:

            def _is_fast_weight(var):
                # Fast weights are everything that is neither BN nor a slow
                # (kernel) weight; this relies on the naming scheme.
                return ('batch_norm' not in var.name
                        and 'kernel' not in var.name)

            grads_and_vars = [
                (grad * lr_multiplier if _is_fast_weight(var) else grad, var)
                for grad, var in zip(grads, model.trainable_variables)
            ]
            optimizer.apply_gradients(grads_and_vars)

        # The ECE metric is fed class indices (as floats) rather than the
        # one-hot labels.
        class_indices = tf.math.argmax(tiled_labels,
                                       axis=-1,
                                       output_type=tf.int32)
        sparse_labels = tf.cast(class_indices, tf.float32)
        probs = tf.nn.softmax(logits)
        metrics['train/loss'].update_state(loss)
        metrics['train/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['train/accuracy'].update_state(tiled_labels, logits)
        metrics['train/ece'].update_state(sparse_labels, probs)

        return {
            'loss/negative_log_likelihood':
            negative_log_likelihood / replica_count,
            'loss/total_loss': scaled_loss,
        }
Пример #29
0
def _replicate(n, tensor):
    """Replicate the input tensor n times along a new (major) dimension.

    Args:
      n: Number of copies to make; a scalar integer `Tensor` or a Python int.
      tensor: `Tensor` to replicate.

    Returns:
      `Tensor` with a new leading axis of size `n`; each slice along that
      axis is a copy of `tensor`.
    """
    # TODO(axch) Does this already exist somewhere?  Should it get contributed?
    # Normalize `n` first so that `n.dtype` below also works when a plain
    # Python int is passed; Tensor inputs are returned unchanged.
    n = tf.convert_to_tensor(n)
    multiples = tf.concat([[n], tf.ones([tf.rank(tensor)], dtype=n.dtype)],
                          axis=0)
    return tf.tile(tensor[tf.newaxis], multiples)
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
                              min_mask_level, mask_crop_size):
    """Crop the FPN features at the appropriate levels for each detection.

    The feature tensor is flattened to rows of `num_downsample_channels`
    values, and every crop pixel is fetched with one `tf.gather` using a flat
    row index of the form

        batch * batch_stride + level * level_stride + y * row_stride + x

    Static shapes are read with `get_shape().as_list()`, and the batch size
    and number of instances obtained that way are used to build the index
    tensors. No clipping or bounds checking of the crop window is done here.

    Args:
      features: a float tensor of shape [batch_size, num_levels,
        max_feature_size, max_feature_size, num_downsample_channels].
      level_boxes: a float Tensor of the level boxes to crop from, of shape
        [batch_size, num_instances, 4]. It is cast to int32; only entries 0
        (starting y) and 1 (starting x) of the last dimension are read.
      detection_prior_levels: an int Tensor of instance assigned level of
        shape [batch_size, num_instances].
      min_mask_level: minimum FPN level to crop mask feature from. It is
        subtracted from `detection_prior_levels` to index the level axis.
      mask_crop_size: an int of mask crop size.

    Returns:
      crop_features: a float Tensor of shape [batch_size * num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
    """
    (num_batches, level_count, feature_size, _,
     channel_count) = features.get_shape().as_list()
    _, instance_count, _ = level_boxes.get_shape().as_list()
    int_boxes = tf.cast(level_boxes, tf.int32)
    assert instance_count == detection_prior_levels.get_shape().as_list()[1]

    # Expand each starting coordinate into the full run of `mask_crop_size`
    # consecutive coordinates; results are [batch, instances, mask_crop_size].
    col_starts = int_boxes[:, :, 1]
    row_starts = int_boxes[:, :, 0]
    crop_cols = tf.stack(
        [col_starts + offset for offset in range(mask_crop_size)], axis=2)
    crop_rows = tf.stack(
        [row_starts + offset for offset in range(mask_crop_size)], axis=2)

    # Row strides of the flattened [batch * level * height * width, channels]
    # view of `features`.
    row_stride = feature_size
    level_stride = feature_size * row_stride
    batch_stride = level_count * level_stride

    # TODO(weicheng) change this to gather_nd for better readability.
    # Each term below is expanded to
    # [batch, instances, mask_crop_size, mask_crop_size] before summing.
    batch_term = tf.tile(
        tf.reshape(tf.range(num_batches) * batch_stride,
                   [num_batches, 1, 1, 1]),
        [1, instance_count, mask_crop_size, mask_crop_size])
    relative_levels = detection_prior_levels - min_mask_level
    level_term = tf.tile(
        tf.reshape(relative_levels * level_stride,
                   [num_batches, instance_count, 1, 1]),
        [1, 1, mask_crop_size, mask_crop_size])
    row_term = tf.tile(
        tf.reshape(crop_rows * row_stride,
                   [num_batches, instance_count, mask_crop_size, 1]),
        [1, 1, 1, mask_crop_size])
    col_term = tf.tile(
        tf.reshape(crop_cols,
                   [num_batches, instance_count, 1, mask_crop_size]),
        [1, 1, mask_crop_size, 1])
    flat_indices = tf.reshape(
        batch_term + level_term + row_term + col_term, [-1])

    flat_features = tf.reshape(features, [-1, channel_count])
    gathered = tf.gather(flat_features, flat_indices)
    crop_shape = [
        num_batches * instance_count, mask_crop_size, mask_crop_size,
        channel_count
    ]
    return tf.reshape(gathered, crop_shape)