Example #1
  def _noisy_identity_kernel_initializer(shape,
                                         dtype=tf.float32,
                                         partition_info=None):
    """Constructs a noisy identity kernel.

    Args:
      shape: List of integers. Represents shape of result.
      dtype: data type for values in result.
      partition_info: Partition information for initializer functions. Ignored.

    Returns:
      Tensor of desired shape and dtype such that applying it as a convolution
        kernel results in a noisy near-identity operation.

    Raises:
      ValueError: If shape does not define a valid kernel.
                  If filter width and height differ.
                  If filter width and height are not odd numbers.
                  If number of input and output channels are not multiples of
                    base_num_channels.
    """
    if len(shape) != 4:
      raise ValueError("Convolution kernels must be rank 4.")

    filter_height, filter_width, in_channels, out_channels = shape

    if filter_width != filter_height:
      raise ValueError(
          "Noisy identity initializer only works for square filters.")
    if filter_width % 2 != 1:
      raise ValueError(
          "Noisy identity initializer requires filters have odd height and "
          "width.")
    if (in_channels % base_num_channels != 0 or
        out_channels % base_num_channels != 0):
      raise ValueError("in_channels and out_channels must both be multiples of "
                       "base_num_channels.")

    middle_pixel = filter_height // 2
    is_middle_pixel = tf.logical_and(
        tf.equal(_range_along_dimension(0, shape), middle_pixel),
        tf.equal(_range_along_dimension(1, shape), middle_pixel))
    is_same_channel_multiple = tf.equal(
        tf.floordiv(
            _range_along_dimension(2, shape) * base_num_channels, in_channels),
        tf.floordiv(
            _range_along_dimension(3, shape) * base_num_channels, out_channels))
    noise = tf.truncated_normal(shape, stddev=stddev, dtype=dtype)
    return tf.where(
        tf.logical_and(is_same_channel_multiple, is_middle_pixel),
        tf.ones(
            shape, dtype=dtype) * (base_num_channels / out_channels),
        noise)
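A note on the channel predicate above: is_same_channel_multiple marks position (i, j) when i * base_num_channels // in_channels == j * base_num_channels // out_channels, i.e. when input channel i and output channel j fall in the same group of base_num_channels. A plain-Python check of that arithmetic (the constants here are illustrative, not from the excerpt):

base_num_channels, in_channels, out_channels = 4, 8, 4
matches = [(i, j)
           for i in range(in_channels)
           for j in range(out_channels)
           if (i * base_num_channels) // in_channels
           == (j * base_num_channels) // out_channels]
print(matches)  # inputs {0,1} pair with output 0, {2,3} with output 1, ...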
Example #2
def fpn_feature_levels(num_levels, unit_scale_index, image_ratio, boxes):
  """Returns fpn feature level for each box based on its area.

  See section 4.2 of https://arxiv.org/pdf/1612.03144.pdf for details.

  Args:
    num_levels: An integer indicating the number of feature levels to crop boxes
      from.
    unit_scale_index: A 0-based integer indicating the index of the feature
      map which most closely matches the resolution of the pretrained model.
    image_ratio: A float indicating the ratio of input image area to pretraining
      image area.
    boxes: A float tensor of shape [batch, num_boxes, 4] containing boxes of the
      form [ymin, xmin, ymax, xmax] in normalized coordinates.

  Returns:
    An int32 tensor of shape [batch_size, num_boxes] containing feature indices.
  """
  assert num_levels > 0, (
      '`num_levels` must be > 0. Found {}'.format(num_levels))
  assert unit_scale_index < num_levels and unit_scale_index >= 0, (
      '`unit_scale_index` must be in [0, {}). Found {}.'.format(
          num_levels, unit_scale_index))
  box_height_width = boxes[:, :, 2:4] - boxes[:, :, 0:2]
  areas_sqrt = tf.sqrt(tf.reduce_prod(box_height_width, axis=2))
  log_2 = tf.cast(tf.log(2.0), dtype=boxes.dtype)
  levels = tf.cast(
      tf.floordiv(tf.log(areas_sqrt * image_ratio), log_2)
      +
      unit_scale_index,
      dtype=tf.int32)
  levels = tf.maximum(0, tf.minimum(num_levels - 1, levels))
  return levels
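As a concrete reading of the formula above: with unit_scale_index=2, num_levels=5 and image_ratio=1.0, a box whose sqrt-area equals the unit scale stays at level 2, and each halving of the box side drops one level until the clamp kicks in. A plain-Python mirror of the tensor math (constants are illustrative):

import math

def fpn_level(areas_sqrt, image_ratio=1.0, unit_scale_index=2, num_levels=5):
  # floor(log2(sqrt(area) * image_ratio)) + unit_scale_index, then clamp.
  level = math.floor(math.log(areas_sqrt * image_ratio, 2)) + unit_scale_index
  return max(0, min(num_levels - 1, level))

print(fpn_level(1.0))   # 2: unit-scale box maps to unit_scale_index
print(fpn_level(0.25))  # 0: two levels down, clamped into [0, num_levels - 1]
print(fpn_level(8.0))   # 4: three levels up, clamped at num_levels - 1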
Example #3
File: vncsmc.py Project: amoretti86/phylo
    def extend_partial_state(self, JCK, potentials, map_to_indices, l_br, r_br, r):
        shape_1 = tf.cast(ncr(self.N-r, 2)*self.M, tf.int32)

        indices = tf.cast(tf.random.categorical(potentials, 1), tf.int32)
        indices_remainder = tf.floordiv(indices, self.M)

        coalesced_indices = tf.cast(tf.gather_nd(map_to_indices, indices_remainder), tf.int32)
        transformed_coalesced_indices = tf.cast(
            self.N*10*tf.reduce_sum(tf.one_hot(coalesced_indices, self.N-r), axis=1), tf.int32)
        all_indices = tf.tile(tf.expand_dims(tf.range(self.N-r), axis=0), [self.K,1])
        remaining_indices, _ = tf.nn.top_k(all_indices - transformed_coalesced_indices, self.N - r - 2)
        JC_keep = gather_across_2d(JCK, remaining_indices, self.N-r, self.N-r-2)
        particles = gather_across_2d(JCK, coalesced_indices, self.N-r, 2)
        particle1 = particles[:, 0]
        particle2 = particles[:, 1]
        # Form new state
        particle_coalesced = particle1 + '+' + particle2
        # Form new Jump Chain
        JCK = tf.concat([JC_keep, tf.expand_dims(particle_coalesced, axis=1)], axis=1)
        
        q_log_proposal = gather_across_2d(potentials, indices, shape_1, 1)
        q_log_proposal = tf.reduce_mean(q_log_proposal, axis=1)  # q should be [K, 1] but has shape [K, ?]; reduce_mean collapses the ? dimension to 1
        l_br = gather_across_2d(l_br, indices, shape_1, 1)
        l_br = tf.squeeze(tf.reduce_mean(l_br, axis=1))
        r_br = gather_across_2d(r_br, indices, shape_1, 1)
        r_br = tf.squeeze(tf.reduce_mean(r_br, axis=1))

        return coalesced_indices, remaining_indices, q_log_proposal, l_br, r_br, JCK
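One detail worth spelling out: each row of potentials has ncr(N - r, 2) * M entries, and the layout implied by the floordiv is M branch-length cells per candidate pair, so tf.floordiv(indices, self.M) recovers the pair index and the remainder would give the grid cell. A hedged scalar sketch of that decode (values are illustrative):

M = 4            # branch-length grid cells per candidate pair (illustrative)
flat_index = 10  # a sample drawn from one row of `potentials`
pair_index = flat_index // M  # which pair of subtrees coalesces -> 2
cell_index = flat_index % M   # which branch-length cell         -> 2
print(pair_index, cell_index)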
Example #4
    def _map_fn(index):
      x = tf.floordiv(index, 2)
      y = tf.floormod(index, 2)

      label = tf.cast(index + 1, tf.float32)
      label = tf.reshape(label, [1])

      target_dense = tf.stack([x + y, x + y + 1])
      return ({KEY_NAME: dense_to_sparse(target_dense, tf.int64)}, label)
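The floordiv/floormod pair in _map_fn walks a flat index over a 2x2 grid; indices 0..3 decode to (x, y) = (0, 0), (0, 1), (1, 0), (1, 1). The same arithmetic in plain Python:

for index in range(4):
  x, y = index // 2, index % 2
  print(index, (x, y))  # 0 (0, 0), 1 (0, 1), 2 (1, 0), 3 (1, 1)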
Example #5
    def define_predictions(self, features, outputs):
        """Define model predictions."""
        predictions = {
            "example_id": features["example_id"],
            "service_id": features["service_id"],
            "is_real_example": features["is_real_example"],
        }
        # Scores are output for each intent.
        # Note that the intent indices are shifted by 1 to account for NONE intent.
        predictions["intent_status"] = tf.argmax(
            outputs["logit_intent_status"], axis=-1)

        # Scores are output for each requested slot.
        predictions["req_slot_status"] = tf.sigmoid(
            outputs["logit_req_slot_status"])

        # For categorical slots, the status of each slot and the predicted value are
        # output.
        predictions["cat_slot_status"] = tf.argmax(
            outputs["logit_cat_slot_status"], axis=-1)
        predictions["cat_slot_value"] = tf.argmax(
            outputs["logit_cat_slot_value"], axis=-1)

        # For non-categorical slots, the status of each slot and the indices for
        # spans are output.
        predictions["noncat_slot_status"] = tf.argmax(
            outputs["logit_noncat_slot_status"], axis=-1)
        start_scores = tf.nn.softmax(outputs["logit_noncat_slot_start"],
                                     axis=-1)
        end_scores = tf.nn.softmax(outputs["logit_noncat_slot_end"], axis=-1)
        _, max_num_slots, max_num_tokens = end_scores.get_shape().as_list()
        batch_size = tf.shape(end_scores)[0]
        # Find the span with the maximum sum of scores for start and end indices.
        total_scores = (tf.expand_dims(start_scores, axis=3) +
                        tf.expand_dims(end_scores, axis=2))
        # Mask out scores where start_index > end_index.
        start_idx = tf.reshape(tf.range(max_num_tokens), [1, 1, -1, 1])
        end_idx = tf.reshape(tf.range(max_num_tokens), [1, 1, 1, -1])
        invalid_index_mask = tf.tile((start_idx > end_idx),
                                     [batch_size, max_num_slots, 1, 1])
        total_scores = tf.where(invalid_index_mask,
                                tf.zeros_like(total_scores), total_scores)
        max_span_index = tf.argmax(tf.reshape(
            total_scores, [-1, max_num_slots, max_num_tokens**2]),
                                   axis=-1)
        span_start_index = tf.floordiv(max_span_index, max_num_tokens)
        span_end_index = tf.floormod(max_span_index, max_num_tokens)
        predictions["noncat_slot_start"] = span_start_index
        predictions["noncat_slot_end"] = span_end_index
        # Add inverse alignments.
        predictions["noncat_alignment_start"] = features[
            "noncat_alignment_start"]
        predictions["noncat_alignment_end"] = features["noncat_alignment_end"]

        return predictions
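The span decode at the end is the standard flatten-then-unflatten trick: argmax over the [max_num_tokens**2] flattened score grid, then floordiv/floormod by max_num_tokens to recover (start, end). A NumPy sketch of the same decode (shapes are illustrative):

import numpy as np

T = 3                                          # max_num_tokens (illustrative)
total_scores = np.random.rand(T, T)            # [start_index, end_index] grid
total_scores[np.tril_indices(T, k=-1)] = 0.0   # zero out start > end, as above
flat = int(np.argmax(total_scores.reshape(-1)))
start, end = flat // T, flat % T               # floordiv / floormod decode
assert start <= end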
Example #6
def _loop_body(i_, img_, xstartpreds_):
  # Sample p(x_{t-1} | x_t) as usual
  sample, pred_xstart = self.p_sample(
      denoise_fn=denoise_fn, x=img_, t=tf.fill([shape[0]], i_),
      noise_fn=noise_fn, return_pred_xstart=True)
  assert sample.shape == pred_xstart.shape == shape
  # Keep track of prediction of x0
  insert_mask = tf.equal(tf.floordiv(i_, include_xstartpred_freq),
                         tf.range(num_recorded_xstartpred, dtype=tf.int32))
  insert_mask = tf.reshape(
      tf.cast(insert_mask, dtype=tf.float32),
      [1, num_recorded_xstartpred, *([1] * len(shape[1:]))])  # [1, N, 1, 1, 1]
  new_xstartpreds = (insert_mask * pred_xstart[:, None, ...] +
                     (1. - insert_mask) * xstartpreds_)
  return [i_ - 1, sample, new_xstartpreds]
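The insert_mask above is a one-hot over recording slots: tf.equal(i // include_xstartpred_freq, range(N)) is true exactly for slot i // freq, so x0 predictions are snapshotted once every include_xstartpred_freq steps. A scalar check with illustrative constants:

freq, num_recorded = 10, 4
for i in (35, 29, 10, 0):
  slot = i // freq
  print(i, [int(slot == k) for k in range(num_recorded)])
# 35 -> slot 3, 29 -> slot 2, 10 -> slot 1, 0 -> slot 0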
Example #7
def ae_latent_softmax(latents_pred, latents_discrete, hparams):
    """Latent prediction and loss."""
    vocab_size = 2**hparams.z_size
    if hparams.num_decode_blocks < 2:
        latents_logits = tf.layers.dense(latents_pred,
                                         vocab_size,
                                         name="extra_logits")
        if hparams.logit_normalization:
            latents_logits *= tf.rsqrt(
                1e-8 + tf.reduce_mean(tf.square(latents_logits)))

        loss = None
        if latents_discrete is not None:
            if hparams.soft_em:
                # latents_discrete is actually one-hot of multinomial samples
                assert hparams.num_decode_blocks == 1
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=latents_discrete, logits=latents_logits)
            else:
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=latents_discrete, logits=latents_logits)
        sample = multinomial_sample(latents_logits, vocab_size,
                                    hparams.sampling_temp)
        return sample, loss

    # Multi-block case.
    vocab_bits = int(math.log(vocab_size, 2))
    assert vocab_size == 2**vocab_bits
    assert vocab_bits % hparams.num_decode_blocks == 0
    block_vocab_size = 2**(vocab_bits // hparams.num_decode_blocks)
    latents_logits = [
        tf.layers.dense(latents_pred,
                        block_vocab_size,
                        name="extra_logits_%d" % i)
        for i in range(hparams.num_decode_blocks)
    ]
    loss = None
    if latents_discrete is not None:
        losses = []
        for i in range(hparams.num_decode_blocks):
            d = tf.floormod(tf.floordiv(latents_discrete, block_vocab_size**i),
                            block_vocab_size)
            losses.append(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=d, logits=latents_logits[i]))
        loss = sum(losses)
    samples = [
        multinomial_sample(l, block_vocab_size, hparams.sampling_temp)
        for l in latents_logits
    ]
    sample = sum([s * block_vocab_size**i for i, s in enumerate(samples)])
    return sample, loss
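The multi-block branch is a base-B digit decomposition: block i trains against digit (x // B**i) % B, and sampling recombines the digits as sum(s_i * B**i). A plain-Python round trip showing the two operations are inverses:

B = 16  # block_vocab_size (illustrative)

def digits(x, num_blocks):
  return [(x // B**i) % B for i in range(num_blocks)]

def recombine(ds):
  return sum(d * B**i for i, d in enumerate(ds))

x = 0x2af
assert digits(x, 3) == [0xf, 0xa, 0x2]  # lower-endian digits
assert recombine(digits(x, 3)) == x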
Example #8
def unravel_index_2d(indices, dims):
    """Unravel index, for 2D inputs only.

  See NumPy's unravel_index.

  Args:
    indices: <int32> [num_elements], coordinates into 2D row-major tensor.
    dims: (N, M), dimensions of the 2D tensor.

  Returns:
    coordinates: <int32> [2, num_elements], row (1st) and column (2nd) indices.
  """
    row_inds = tf.floordiv(indices, dims[1])
    col_inds = tf.floormod(indices, dims[1])
    return tf.stack([row_inds, col_inds], axis=0)
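The function is the TF counterpart of numpy.unravel_index restricted to 2D; a quick NumPy check of the expected output:

import numpy as np

# Row-major 3x4 grid: flat index 5 -> (1, 1), flat index 11 -> (2, 3).
rows, cols = np.unravel_index([0, 5, 11], (3, 4))
print(rows, cols)  # [0 1 2] [0 1 3]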
Example #9
    def bucket_fn(x):
        """Compute the element bucket and update the histogram."""
        ix = len_fn(x)
        if ix.dtype == tf.int32:
            ix = tf.to_int64(ix)
        elif ix.dtype != tf.int64:
            raise ValueError("Len function returned a non-int")

        adds_to_bins = tf.to_int64(tf.greater(hist_bounds, ix))
        # pad with a 1 for the "larger than all" bin
        adds_to_bins = tf.pad(adds_to_bins, [[0, 1]], constant_values=1)
        new_counts = tf.assign_add(hist_counts, adds_to_bins)
        bin_ix = n_hist_binds - tf.reduce_sum(adds_to_bins)
        # Computes the quantile based on the counts of the example's bucket
        bucket_ix = tf.floordiv(((n_buckets - 1) * new_counts[bin_ix]),
                                new_counts[-1])
        return bucket_ix
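The final floordiv turns a running histogram into a quantile bucket: new_counts[bin_ix] / new_counts[-1] approximates the empirical CDF at the example's length bin, scaled into {0, ..., n_buckets - 1}. A scalar illustration:

n_buckets = 4
bin_count, total_count = 60, 100  # CDF estimate at this bin = 0.6
bucket_ix = ((n_buckets - 1) * bin_count) // total_count
print(bucket_ix)  # 1: the 60th percentile lands in the second of 4 buckets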
Example #10
def compute_progress(current_image_id, stable_stage_num_images,
                     transition_stage_num_images, num_blocks):
    """Computes the training progress.

  The training alternates between stable phase and transition phase.
  The `progress` indicates the training progress, i.e. the training is at
  - a stable phase p if progress = p
  - a transition stage between p and p + 1 if progress = p + fraction
  where p = 0, 1, 2, ...

  Note the max value of progress is `num_blocks` - 1.

  In terms of LOD (of the original implementation):
  progress = `num_blocks` - 1 - LOD

  Args:
    current_image_id: A scalar integer `Tensor` of the current image id,
        counting from 0.
    stable_stage_num_images: An integer representing the number of images in
        each stable stage.
    transition_stage_num_images: An integer representing the number of images in
        each transition stage.
    num_blocks: Number of network blocks.

  Returns:
    A scalar float `Tensor` of the training progress.
  """
    # Note when current_image_id >= min_total_num_images - 1 (which means we
    # are already at the highest resolution), we want to keep progress constant.
    # Therefore, cap current_image_id here.
    capped_current_image_id = tf.minimum(
        current_image_id,
        min_total_num_images(stable_stage_num_images,
                             transition_stage_num_images, num_blocks) - 1)

    stage_num_images = stable_stage_num_images + transition_stage_num_images
    progress_integer = tf.floordiv(capped_current_image_id, stage_num_images)
    progress_fraction = tf.maximum(
        0.0,
        tf.to_float(
            tf.mod(capped_current_image_id, stage_num_images) -
            stable_stage_num_images) /
        tf.to_float(transition_stage_num_images))
    return tf.to_float(progress_integer) + progress_fraction
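A worked example of the arithmetic, with stable_stage_num_images = transition_stage_num_images = 1000: image id 500 is in the first stable phase (progress 0.0), while image id 3500 is halfway through the second transition (progress 1.5). A plain-Python mirror (the min_total_num_images cap is omitted in this sketch):

def progress(image_id, stable=1000, transition=1000):
  stage = stable + transition
  fraction = max(0.0, (image_id % stage - stable) / float(transition))
  return image_id // stage + fraction

print(progress(500))   # 0.0
print(progress(3500))  # 1.5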
Example #11
    def int_to_bit(self, x_int, num_bits, base=2):
        """Turn x_int representing numbers into a bitwise (lower-endian) tensor.

    Args:
        x_int: Tensor containing integer to be converted into base
        notation.
        num_bits: Number of bits in the representation.
        base: Base of the representation.

    Returns:
        The corresponding number expressed in base `base`, as a float tensor
        of lower-endian digits.
    """
        x_l = tf.to_int32(tf.expand_dims(x_int, axis=-1))
        # pylint: disable=g-complex-comprehension
        x_labels = [
            tf.floormod(tf.floordiv(tf.to_int32(x_l),
                                    tf.to_int32(base)**i), tf.to_int32(base))
            for i in range(num_bits)
        ]
        res = tf.concat(x_labels, axis=-1)
        return tf.to_float(res)
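For the default base 2 this is the lower-endian bit expansion; 5 with num_bits=4 becomes [1, 0, 1, 0]. The same digit formula in plain Python:

x, base, num_bits = 5, 2, 4
print([(x // base**i) % base for i in range(num_bits)])  # [1, 0, 1, 0]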
Example #12
  def loss_function(self, inputs, build_network_result):
    """Computes the ctc loss for the current batch of predictions.

    Args:
      inputs: the input list of the model.
      build_network_result: a BuildNetworkResult returned by build_network().

    Returns:
      The loss tensor of the model.
    """
    logits = build_network_result.logits
    actual_time_steps = inputs[2]
    probs = tf.nn.softmax(logits)
    ctc_time_steps = tf.shape(probs)[1]
    ctc_input_length = tf.to_float(
        tf.multiply(actual_time_steps, ctc_time_steps))
    ctc_input_length = tf.to_int32(
        tf.floordiv(ctc_input_length, tf.to_float(self.max_time_steps)))

    label_length = inputs[3]
    label_length = tf.to_int32(tf.squeeze(label_length))
    ctc_input_length = tf.to_int32(tf.squeeze(ctc_input_length))

    labels = inputs[1]
    sparse_labels = tf.to_int32(
        tf.keras.backend.ctc_label_dense_to_sparse(labels, label_length))
    y_pred = tf.log(
        tf.transpose(probs, perm=[1, 0, 2]) + tf.keras.backend.epsilon())

    losses = tf.expand_dims(
        tf.nn.ctc_loss(
            labels=sparse_labels,
            inputs=y_pred,
            sequence_length=ctc_input_length,
            ignore_longer_outputs_than_inputs=True),
        axis=1)
    loss = tf.reduce_mean(losses)
    return loss
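The floordiv here rescales each utterance's true frame count into the CTC time base: ctc_input_length = floor(actual_time_steps * ctc_time_steps / max_time_steps), i.e. the same fraction of the (possibly downsampled) logit sequence. A scalar check with illustrative values:

actual_time_steps, ctc_time_steps, max_time_steps = 300, 75, 600
print((actual_time_steps * ctc_time_steps) // max_time_steps)  # 37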
Example #13
def multilevel_crop_and_resize(features,
                               boxes,
                               output_size=7,
                               use_einsum_gather=False):
    """Crop and resize on multilevel feature pyramid.

  Generate the (output_size, output_size) set of pixels for each input box
  by first locating the box into the correct feature level, and then cropping
  and resizing it using the corresponding feature map of that level.

  Here is the step-by-step algorithm with use_einsum_gather=True:
  1. Compute sampling points and their four neighbors for each output point.
     Each box is mapped to [output_size, output_size] points.
     Each output point is averaged among #sampling_ratio^2 points.
     Each sampling point is computed using bilinear
     interpolation of its four neighboring points on the feature map.
  2. Gather output points separately for each level. Gather and computation of
     output points are done only for the boxes mapped to this level.
     2.1. Compute the indices of the four neighboring points of each sampling
          point, for x and y separately, of shape
          [batch_size, num_boxes, output_size, 2].
     2.2. Compute the interpolation kernel for the x and y axes separately, of
          shape [batch_size, num_boxes, output_size, 2, 1].
     2.3. The features are collected into a
          [batch_size, num_boxes, output_size, output_size, num_filters]
          Tensor.
          Instead of a one-step algorithm, a two-step approach is used.
          That is, first, an intermediate output is stored with a shape of
          [batch_size, num_boxes, output_size, width, num_filters];
          second, the final output is produced with a shape of
          [batch_size, num_boxes, output_size, output_size, num_filters].

          Bilinear interpolation is done during the two-step gather:
          f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
                                [f10, f11]]
          [[f00, f01],
           [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
          where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.

          Note:
            a. Use one_hot with einsum to replace gather;
            b. Bilinear interpolation and averaging of
               multiple sampling points are fused into the one_hot vector.

  Args:
    features: A dictionary with key as pyramid level and value as features. The
      features are in shape of [batch_size, height_l, width_l, num_filters].
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
      a box with [y1, x1, y2, x2] in un-normalized coordinates.
    output_size: A scalar to indicate the output crop size.
    use_einsum_gather: whether to use einsum in place of gather. Einsum can
      improve performance when the feature size is not large, and it is also
      friendly to model partitioning. Gather performs better when the feature
      size is very large and there are multiple box levels.

  Returns:
    A 5-D tensor representing feature crop of shape
    [batch_size, num_boxes, output_size, output_size, num_filters].
  """

    with tf.name_scope('multilevel_crop_and_resize'):
        levels = list(features.keys())
        min_level = min(levels)
        max_level = max(levels)
        batch_size, max_feature_height, max_feature_width, num_filters = (
            features[min_level].get_shape().as_list())
        if batch_size is None:
            batch_size = tf.shape(features[min_level])[0]
        _, num_boxes, _ = boxes.get_shape().as_list()

        # Assigns boxes to the right level.
        box_width = boxes[:, :, 3] - boxes[:, :, 1]
        box_height = boxes[:, :, 2] - boxes[:, :, 0]
        areas_sqrt = tf.sqrt(box_height * box_width)
        levels = tf.cast(
            tf.floordiv(tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0,
            dtype=tf.int32)
        # Maps levels between [min_level, max_level].
        levels = tf.minimum(max_level, tf.maximum(levels, min_level))

        # Projects box location and sizes to corresponding feature levels.
        scale_to_level = tf.cast(tf.pow(tf.constant(2.0),
                                        tf.cast(levels, tf.float32)),
                                 dtype=boxes.dtype)
        boxes /= tf.expand_dims(scale_to_level, axis=2)
        box_width /= scale_to_level
        box_height /= scale_to_level
        boxes = tf.concat([
            boxes[:, :, 0:2],
            tf.expand_dims(box_height, -1),
            tf.expand_dims(box_width, -1)
        ],
                          axis=-1)

        if use_einsum_gather:

            def two_step_gather_per_level(features_level, mask):
                """Performs two-step gather using einsum for every level of features."""
                (_, feature_height, feature_width,
                 _) = features_level.get_shape().as_list()
                boundaries = tf.tile(
                    tf.expand_dims(
                        tf.expand_dims([feature_height, feature_width], 0), 0),
                    [batch_size, num_boxes, 1])
                boundaries = tf.cast(boundaries, boxes.dtype)
                kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
                    boxes, boundaries, output_size, sample_offset=0.5)

                # shape is:
                # [batch_size, num_boxes, output_size, 2, spatial_size]
                box_grid_y_one_hot, box_grid_x_one_hot = get_grid_one_hot(
                    box_gridy0y1, box_gridx0x1, feature_height, feature_width)

                # # shape is [batch_size, num_boxes, output_size, spatial_size]
                box_grid_y_weight = tf.reduce_sum(tf.multiply(
                    box_grid_y_one_hot, kernel_y),
                                                  axis=-2)
                box_grid_x_weight = tf.reduce_sum(tf.multiply(
                    box_grid_x_one_hot, kernel_x),
                                                  axis=-2)

                # shape is [batch_size, num_boxes, output_size, width, feature]
                y_outputs = tf.einsum(
                    'bhwf,bnyh->bnywf', features_level,
                    tf.cast(box_grid_y_weight, dtype=features_level.dtype))

                # shape is [batch_size, num_boxes, output_size, output_size, feature]
                x_outputs = tf.einsum(
                    'bnywf,bnxw->bnyxf', y_outputs,
                    tf.cast(box_grid_x_weight, dtype=features_level.dtype))

                outputs = tf.where(tf.equal(mask, tf.zeros_like(mask)),
                                   tf.zeros_like(x_outputs), x_outputs)
                return outputs

            features_per_box = tf.zeros(
                [batch_size, num_boxes, output_size, output_size, num_filters],
                dtype=features[min_level].dtype)
            for level in range(min_level, max_level + 1):
                level_equal = tf.equal(levels, level)
                mask = tf.tile(
                    tf.reshape(level_equal, [batch_size, num_boxes, 1, 1, 1]),
                    [1, 1, output_size, output_size, num_filters])
                features_per_box += two_step_gather_per_level(
                    features[level], mask)

            return features_per_box

        # Stack feature pyramid into a features_all of shape
        # [batch_size, levels, height, width, num_filters].
        features_all = []
        feature_heights = []
        feature_widths = []
        for level in range(min_level, max_level + 1):
            shape = features[level].get_shape().as_list()
            feature_heights.append(shape[1])
            feature_widths.append(shape[2])
            # Concat tensor of [batch_size, height_l * width_l, num_filters] for each
            # levels.
            features_all.append(
                tf.reshape(features[level], [batch_size, -1, num_filters]))
        features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])

        # Calculate height_l * width_l for each level.
        level_dim_sizes = [
            feature_widths[i] * feature_heights[i]
            for i in range(len(feature_widths))
        ]
        # level_dim_offsets is accumulated sum of level_dim_size.
        level_dim_offsets = [0]
        for i in range(len(feature_widths) - 1):
            level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
        batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
        level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
        height_dim_sizes = tf.constant(feature_widths, tf.int32)

        # Maps levels to [0, max_level-min_level].
        levels -= min_level
        level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
        boundary = tf.cast(
            tf.concat([
                tf.expand_dims([[tf.cast(max_feature_height, tf.float32)]] /
                               level_strides - 1,
                               axis=-1),
                tf.expand_dims([[tf.cast(max_feature_width, tf.float32)]] /
                               level_strides - 1,
                               axis=-1),
            ],
                      axis=-1), boxes.dtype)

        # Compute grid positions.
        kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
            boxes, boundary, output_size, sample_offset=0.5)

        x_indices = tf.cast(tf.reshape(
            box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
                            dtype=tf.int32)
        y_indices = tf.cast(tf.reshape(
            box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
                            dtype=tf.int32)

        batch_size_offset = tf.tile(
            tf.reshape(
                tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
            [1, num_boxes, output_size * 2, output_size * 2])
        # Get level offset for each box. Each box belongs to one level.
        levels_offset = tf.tile(
            tf.reshape(tf.gather(level_dim_offsets, levels),
                       [batch_size, num_boxes, 1, 1]),
            [1, 1, output_size * 2, output_size * 2])
        y_indices_offset = tf.tile(
            tf.reshape(
                y_indices *
                tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
                [batch_size, num_boxes, output_size * 2, 1]),
            [1, 1, 1, output_size * 2])
        x_indices_offset = tf.tile(
            tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
            [1, 1, output_size * 2, 1])
        indices = tf.reshape(
            batch_size_offset + levels_offset + y_indices_offset +
            x_indices_offset, [-1])

        # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
        # performance.
        features_per_box = tf.reshape(tf.gather(features_r2, indices), [
            batch_size, num_boxes, output_size * 2, output_size * 2,
            num_filters
        ])

        # Bilinear interpolation.
        features_per_box = feature_bilinear_interpolation(
            features_per_box, kernel_y, kernel_x)
        return features_per_box
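The box-to-level assignment above is the FPN heuristic level = floor(log2(sqrt(area) / 224)) + 4, clamped to the available pyramid: a 224x224 box lands on level 4 and each doubling of the side adds a level. A plain-Python check (the min/max levels here are illustrative):

import math

def box_level(areas_sqrt, min_level=2, max_level=6):
  level = math.floor(math.log(areas_sqrt / 224.0, 2)) + 4
  return min(max_level, max(level, min_level))

print(box_level(224.0))  # 4
print(box_level(448.0))  # 5
print(box_level(10.0))   # 2: small boxes clamp to min_level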
Example #14
def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections per image given the model outputs.

  Args:
    cls_outputs: a tensor with shape [N, num_classes], which stacks class
      logit outputs on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the cls_outputs should be the output of softmax().
    box_outputs: a tensor with shape [N, num_classes*4], which stacks box
      regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width]
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of candidates
      after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: Tuple of tensors corresponding to the number of valid boxes,
    box coordinates, object categories for each box, and box scores,
    respectively.
  """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=pre_nms_num_detections, sorted=False)
    classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(box_outputs,
                             [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    class_indices = classes
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, class_indices], axis=1))

    # apply bounding box regression to anchors
    boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(boxes, image_info[0], image_info[1])

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    # Skip background class.
    for class_i in range(num_classes):
        # Compute bitmask for the given classes.
        class_i_bitmask = tf.cast(tf.equal(classes, class_i),
                                  top_k_scores.dtype)
        # This works because score is in [0, 1].
        class_i_scores = top_k_scores * class_i_bitmask
        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))
        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(class_i_scores,
                                            class_i_post_nms_indices)
        mask = tf.less(tf.range(post_nms_num_detections),
                       [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores),
                                  class_i + 1)
        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # sort all results.
    post_nms_scores, sorted_indices = tf.nn.top_k(tf.to_float(post_nms_scores),
                                                  k=post_nms_num_detections,
                                                  sorted=True)
    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    valid_mask = tf.where(tf.greater(post_nms_scores, 0),
                          tf.ones_like(post_nms_scores),
                          tf.zeros_like(post_nms_scores))
    num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
    box_classes = tf.to_float(post_nms_classes)
    return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
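The mod/floordiv pair near the top undoes the flattening of the [N, num_classes - 1] score matrix: in row-major order, flat_index = anchor * (num_classes - 1) + class, so floordiv recovers the anchor and mod the class. A scalar check:

num_fg_classes = 3  # num_classes - 1 after dropping the background column
flat_index = 7
anchor = flat_index // num_fg_classes  # 2
cls = flat_index % num_fg_classes      # 1
assert flat_index == anchor * num_fg_classes + cls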
Example #15
def generate_detections_per_image(cls_outputs,
                                  box_outputs,
                                  anchor_boxes,
                                  pre_nms_num_detections=1000,
                                  post_nms_num_detections=100,
                                  nms_threshold=0.3):
    """Generate the final detections per image given the model outputs.

  Args:
    cls_outputs: a tensor with shape [N, num_classes], which stacks class
      logit outputs on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the cls_outputs should be the output of softmax().
    box_outputs: a tensor with shape [N, num_classes*4], which stacks box
      regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of candidates
      after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.

  Returns:
    detections: Tuple of tensors corresponding to the number of valid boxes,
    box coordinates, object categories for each box, and box scores,
    respectively.
  """
    num_classes = cls_outputs.get_shape().as_list()[1]

    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=pre_nms_num_detections, sorted=False)
    classes = tf.mod(top_k_indices_with_classes, num_classes)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes)
    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.gather(box_outputs, top_k_indices)

    # apply bounding box regression to anchors
    boxes = _decode_boxes(box_outputs, anchor_boxes)
    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    for class_i in range(num_classes):
        # Compute bitmask for the given classes.
        class_i_bitmask = tf.cast(tf.equal(classes, class_i),
                                  top_k_scores.dtype)
        # This works because score is in [0, 1].
        class_i_scores = top_k_scores * class_i_bitmask
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))

        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(class_i_scores,
                                            class_i_post_nms_indices)
        mask = tf.less(tf.range(post_nms_num_detections),
                       [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores),
                                  class_i + 1)
        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # sort all results.
    post_nms_scores, sorted_indices = tf.nn.top_k(tf.to_float(post_nms_scores),
                                                  k=post_nms_num_detections,
                                                  sorted=True)
    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)
    valid_mask = tf.where(tf.greater(post_nms_scores, 0),
                          tf.ones_like(post_nms_scores),
                          tf.zeros_like(post_nms_scores))
    num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
    box_classes = tf.to_float(post_nms_classes)

    return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
Example #16
def hier_homography_fmask_estimator(color_inputs, num_param=8, num_layer=7,
                                    num_level=3, dropout_keep_prob=0.8,
                                    reuse=None, is_training=True,
                                    trainable=True,
                                    scope='hier_hmg'):
  """A hierarchical neural network with mask for homograhy estimation.

  Args:
    color_inputs: batch of input image pairs of data type float32 and of shape
      [batch_size, height, width, 6]
    num_param: the number of parameters for homography (default 8)
    num_layer: the number of convolutional layers in the motion feature network
    num_level: the number of hierarchical levels
    dropout_keep_prob: the percentage of activation values that are kept
    reuse: whether to reuse this network weights
    is_training: whether used for training or testing
    trainable: whether this network is to be trained or not
    scope: the scope of variables in this function

  Returns:
    a list of homographies estimated at each level, and a list of inputs
    warped by the homographies from the preceding levels
  """
  _, h_input, w_input = color_inputs.get_shape().as_list()[0 : 3]
  vgg_inputs = (color_inputs[Ellipsis, 3 : 6] * 256 + 128) - VGG_MEANS

  with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'):
    with slim.arg_scope([slim.conv2d, slim.fully_connected], trainable=False):
      with slim.arg_scope([slim.conv2d], normalizer_fn=None):
        with slim.arg_scope(contrib_slim_nets_vgg.vgg_arg_scope()):
          sfeature, _ = contrib_slim_nets_vgg.vgg_16(
              vgg_inputs,
              1000,
              predictions_fn=slim.softmax,
              global_pool=False,
              is_training=False,
              reuse=reuse,
              spatial_squeeze=True,
              final_endpoint='pool5',
              scope='vgg_16')

  gray_image1 = tf.image.rgb_to_grayscale(color_inputs[Ellipsis, 0 : 3])
  gray_image2 = tf.image.rgb_to_grayscale(color_inputs[Ellipsis, 3 : 6])
  inputs = tf.concat([gray_image1, gray_image2], 3)

  hmgs_list = []
  warped_list = []
  with tf.variable_scope(scope, [inputs], reuse=reuse):
    for level_index in range(num_level):
      scale = 2 ** (num_level - 1 - level_index)
      h = tf.to_float(tf.floordiv(h_input, scale))
      w = tf.to_float(tf.floordiv(w_input, scale))
      inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
      if level_index == 0:
        mfeature = hier_base_layers(inputs_il,
                                    num_layer + 1 - num_level + level_index,
                                    level_index, is_training=is_training,
                                    trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        hmgs_list.append(hmgs_il)
      else:
        warped, _ = hmg_util.homography_scale_warp_per_batch(
            inputs_il[:, :, :, 0], w / 2, h / 2, hmgs_list[level_index - 1])
        pre_warped_inputs_il = tf.stack([warped, inputs_il[:, :, :, 1]], -1)
        warped_list.append(pre_warped_inputs_il)
        mfeature = hier_base_layers(pre_warped_inputs_il,
                                    num_layer + 1 - num_level + level_index,
                                    level_index, is_training=is_training,
                                    trainable=trainable)
        if level_index == num_level - 1:
          mfeature = fmask_layers_semantic(mfeature, sfeature, level_index,
                                           is_training=is_training,
                                           trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        new_hmgs_il = hmg_util.homography_shift_mult_batch(
            hmgs_list[level_index - 1], w / 2, h / 2, hmgs_il, w, h, w, h)
        hmgs_list.append(new_hmgs_il)
  return hmgs_list, warped_list
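Each pyramid level sees the input downscaled by 2**(num_level - 1 - level_index), with the target size computed by integer floordiv before the resize. With an illustrative 480x640 input and num_level=3:

h_input, w_input, num_level = 480, 640, 3
for level_index in range(num_level):
  scale = 2 ** (num_level - 1 - level_index)
  print(level_index, (h_input // scale, w_input // scale))
# 0 (120, 160), 1 (240, 320), 2 (480, 640): coarse to fine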
Example #17
def hier_homography_estimator(inputs, num_param=8, num_layer=7, num_level=3,
                              dropout_keep_prob=0.8, reuse=None,
                              is_training=True, trainable=True,
                              final_endpoint=None, scope='hier_hmg'):
  """A hierarchical VGG-style neural network for homograhy estimation.

  Args:
    inputs: batch of input image pairs of data type float32 and of shape
      [batch_size, height, width, 2]
    num_param: the number of parameters for homography (default 8)
    num_layer: the number of convolutional layers in the motion feature network
    num_level: the number of hierarchical levels
    dropout_keep_prob: the percentage of activation values that are kept
    reuse: whether to reuse this network weights
    is_training: whether used for training or testing
    trainable: whether this network is to be trained or not
    final_endpoint: specifies the endpoint to construct the network up to
    scope: the scope of variables in this function

  Returns:
    a list of homographies at each level and motion feature maps if
    final_endpoint='mfeature'; otherwise a list of images warped by the list of
    corresponding homographies
  """
  _, h_input, w_input = inputs.get_shape().as_list()[0:3]
  hmgs_list = []
  warped_list = []
  with tf.variable_scope(scope, [inputs], reuse=reuse):
    for level_index in range(num_level):
      scale = 2 ** (num_level - 1 - level_index)
      h = tf.to_float(tf.floordiv(h_input, scale))
      w = tf.to_float(tf.floordiv(w_input, scale))
      inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
      if level_index == 0:
        mfeature = hier_base_layers(inputs_il,
                                    num_layer + 1 - num_level + level_index,
                                    level_index, is_training=is_training,
                                    trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        hmgs_list.append(hmgs_il)
      else:
        warped, _ = hmg_util.homography_scale_warp_per_batch(
            inputs_il[:, :, :, 0], w / 2, h / 2, hmgs_list[level_index - 1])
        pre_warped_inputs_il = tf.stack([warped, inputs_il[:, :, :, 1]], -1)
        warped_list.append(pre_warped_inputs_il)
        if level_index == num_level - 1 and final_endpoint == 'mfeature':
          mfeature = hier_base_layers(pre_warped_inputs_il,
                                      num_layer - num_level + level_index,
                                      level_index, is_training=is_training,
                                      trainable=trainable)
          return hmgs_list, mfeature
        else:
          mfeature = hier_base_layers(pre_warped_inputs_il,
                                      num_layer + 1 - num_level + level_index,
                                      level_index, is_training=is_training,
                                      trainable=trainable)
        hmgs_il = homography_regression(mfeature, num_param, level_index,
                                        dropout_keep_prob=dropout_keep_prob,
                                        is_training=is_training,
                                        trainable=trainable)
        new_hmgs_il = hmg_util.homography_shift_mult_batch(
            hmgs_list[level_index - 1], w / 2, h / 2, hmgs_il, w, h, w, h)
        hmgs_list.append(new_hmgs_il)
  return hmgs_list, warped_list
Example #18
def multilevel_crop_and_resize(features, boxes, output_size=7):
    """Crop and resize on multilevel feature pyramid.

  Generate the (output_size, output_size) set of pixels for each input box
  by first locating the box into the correct feature level, and then cropping
  and resizing it using the corresponding feature map of that level.

  Args:
    features: A dictionary with key as pyramid level and value as features. The
      features are in shape of [batch_size, height_l, width_l, num_filters].
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
      a box with [y1, x1, y2, x2] in un-normalized coordinates.
    output_size: A scalar to indicate the output crop size.

  Returns:
    A 5-D tensor representing feature crop of shape
    [batch_size, num_boxes, output_size, output_size, num_filters].
  """

    with tf.name_scope('multilevel_crop_and_resize'):
        levels = list(features.keys())
        min_level = min(levels)
        max_level = max(levels)
        batch_size, max_feature_height, max_feature_width, num_filters = (
            features[min_level].get_shape().as_list())
        if batch_size is None:
            batch_size = tf.shape(features[min_level])[0]
        _, num_boxes, _ = boxes.get_shape().as_list()

        # Stack feature pyramid into a features_all of shape
        # [batch_size, levels, height, width, num_filters].
        features_all = []
        feature_heights = []
        feature_widths = []
        for level in range(min_level, max_level + 1):
            shape = features[level].get_shape().as_list()
            feature_heights.append(shape[1])
            feature_widths.append(shape[2])
            # Concat tensor of [batch_size, height_l * width_l, num_filters] for each
            # levels.
            features_all.append(
                tf.reshape(features[level], [batch_size, -1, num_filters]))
        features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])

        # Calculate height_l * width_l for each level.
        level_dim_sizes = [
            feature_widths[i] * feature_heights[i]
            for i in range(len(feature_widths))
        ]
        # level_dim_offsets is accumulated sum of level_dim_size.
        level_dim_offsets = [0]
        for i in range(len(feature_widths) - 1):
            level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
        batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
        level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
        height_dim_sizes = tf.constant(feature_widths, tf.int32)

        # Assigns boxes to the right level.
        box_width = boxes[:, :, 3] - boxes[:, :, 1]
        box_height = boxes[:, :, 2] - boxes[:, :, 0]
        areas_sqrt = tf.sqrt(box_height * box_width)
        levels = tf.cast(
            tf.floordiv(tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0,
            dtype=tf.int32)
        # Maps levels between [min_level, max_level].
        levels = tf.minimum(max_level, tf.maximum(levels, min_level))

        # Projects box location and sizes to corresponding feature levels.
        scale_to_level = tf.cast(tf.pow(tf.constant(2.0),
                                        tf.cast(levels, tf.float32)),
                                 dtype=boxes.dtype)
        boxes /= tf.expand_dims(scale_to_level, axis=2)
        box_width /= scale_to_level
        box_height /= scale_to_level
        boxes = tf.concat([
            boxes[:, :, 0:2],
            tf.expand_dims(box_height, -1),
            tf.expand_dims(box_width, -1)
        ],
                          axis=-1)

        # Maps levels to [0, max_level-min_level].
        levels -= min_level
        level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
        boundary = tf.cast(
            tf.concat([
                tf.expand_dims([[tf.cast(max_feature_height, tf.float32)]] /
                               level_strides - 1,
                               axis=-1),
                tf.expand_dims([[tf.cast(max_feature_width, tf.float32)]] /
                               level_strides - 1,
                               axis=-1),
            ],
                      axis=-1), boxes.dtype)

        # Compute grid positions.
        kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
            boxes, boundary, output_size, sample_offset=0.5)

        x_indices = tf.cast(tf.reshape(
            box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
                            dtype=tf.int32)
        y_indices = tf.cast(tf.reshape(
            box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
                            dtype=tf.int32)

        batch_size_offset = tf.tile(
            tf.reshape(
                tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
            [1, num_boxes, output_size * 2, output_size * 2])
        # Get level offset for each box. Each box belongs to one level.
        levels_offset = tf.tile(
            tf.reshape(tf.gather(level_dim_offsets, levels),
                       [batch_size, num_boxes, 1, 1]),
            [1, 1, output_size * 2, output_size * 2])
        y_indices_offset = tf.tile(
            tf.reshape(
                y_indices *
                tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
                [batch_size, num_boxes, output_size * 2, 1]),
            [1, 1, 1, output_size * 2])
        x_indices_offset = tf.tile(
            tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
            [1, 1, output_size * 2, 1])
        indices = tf.reshape(
            batch_size_offset + levels_offset + y_indices_offset +
            x_indices_offset, [-1])

        # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
        # performance.
        features_per_box = tf.reshape(tf.gather(features_r2, indices), [
            batch_size, num_boxes, output_size * 2, output_size * 2,
            num_filters
        ])

        # Bilinear interpolation.
        features_per_box = feature_bilinear_interpolation(
            features_per_box, kernel_y, kernel_x)
        return features_per_box
Example #19
def _preprocess(self, obs, obs_shape):
  """Preprocess the input."""
  obs = tf.cast(obs, tf.float32)
  obs = tf.image.resize_bilinear(obs, obs_shape)
  denom = tf.constant(256 // self.quantization_factor, dtype=tf.float32)
  return tf.floordiv(obs, denom)
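The floordiv quantizes pixel intensities into self.quantization_factor levels: dividing by 256 // quantization_factor and flooring maps 0..255 onto 0..quantization_factor - 1. A scalar check with quantization_factor = 8:

quantization_factor = 8
denom = 256 // quantization_factor  # 32
for pixel in (0, 31, 32, 255):
  print(pixel, pixel // denom)      # 0 -> 0, 31 -> 0, 32 -> 1, 255 -> 7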
Example #20
def generate_detections_per_image_op(cls_outputs,
                                     box_outputs,
                                     anchor_boxes,
                                     image_id,
                                     image_info,
                                     num_detections=100,
                                     pre_nms_num_detections=1000,
                                     nms_threshold=0.3,
                                     bbox_reg_weights=(10., 10., 5., 5.)):
    """Generates detections with model outputs and anchors.

  Args:
    cls_outputs: a Tensor with shape [N, num_classes], which stacks class
      logit outputs on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the cls_outputs should be the output of softmax().
    box_outputs: a Tensor with shape [N, num_classes*4], which stacks
      box regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a Tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_id: an integer number to specify the image id.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width]
    num_detections: Number of detections after NMS.
    pre_nms_num_detections: Number of candidates before NMS.
    nms_threshold: a float number to specify the threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, ymin, xmin, ymax, xmax, score, class]
  """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()

    # Removes background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=pre_nms_num_detections, sorted=True)
    classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(box_outputs,
                             [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    box_outputs = tf.gather_nd(box_outputs,
                               tf.stack([top_k_indices, classes], axis=1))

    # Applies bounding box regression to anchors.
    boxes = box_utils.batch_decode_box_outputs_op(
        tf.expand_dims(anchor_boxes, axis=0),
        tf.expand_dims(box_outputs, axis=0), bbox_reg_weights)[0]
    boxes = box_utils.clip_boxes(tf.expand_dims(boxes, axis=0),
                                 tf.expand_dims(image_info[:2], axis=0))[0]

    classes = tf.tile(tf.reshape(classes, [1, pre_nms_num_detections]),
                      [num_classes - 1, 1])
    scores = tf.tile(tf.reshape(top_k_scores, [1, pre_nms_num_detections]),
                     [num_classes - 1, 1])
    boxes = tf.tile(tf.reshape(boxes, [1, pre_nms_num_detections, 4]),
                    [num_classes - 1, 1, 1])

    class_bitmask = tf.tile(
        tf.reshape(tf.range(num_classes - 1), [num_classes - 1, 1]),
        [1, pre_nms_num_detections])
    scores = tf.where(tf.equal(classes, class_bitmask), scores,
                      tf.zeros_like(scores))
    scores = tf.where(tf.greater(scores, 0.05), scores, tf.zeros_like(scores))
    # Reshape classes to be compatible with the top_k function.
    classes = tf.reshape(classes, [num_classes - 1, pre_nms_num_detections, 1])
    scores, sorted_tensors = box_utils.top_k(scores,
                                             k=pre_nms_num_detections,
                                             tensors=[boxes, classes])
    boxes = sorted_tensors[0]
    classes = tf.reshape(sorted_tensors[1],
                         [num_classes - 1, pre_nms_num_detections])

    idx, num_valid = non_max_suppression.non_max_suppression_padded(
        scores,
        boxes,
        max_output_size=num_detections,
        iou_threshold=nms_threshold,
        level=0)

    post_nms_boxes = non_max_suppression.gather_boxes_by_indices(
        boxes, num_detections, idx, num_valid)
    post_nms_scores = non_max_suppression.gather_scores_by_indices(
        scores, num_detections, idx, num_valid)

    # Sorts all results.
    sorted_scores, sorted_indices = tf.nn.top_k(tf.to_float(
        tf.reshape(post_nms_scores, [-1])),
                                                k=num_detections,
                                                sorted=True)
    post_nms_boxes = tf.gather(tf.reshape(post_nms_boxes, [-1, 4]),
                               sorted_indices)
    classes = tf.batch_gather(classes, idx)
    post_nms_classes = tf.gather(tf.reshape(classes, [-1]), sorted_indices) + 1

    if isinstance(image_id, int):
        image_id = tf.constant(image_id)
    image_id = tf.reshape(image_id, [])
    detections_result = tf.stack([
        tf.to_float(tf.fill(tf.shape(sorted_scores), image_id)),
        post_nms_boxes[:, 0],
        post_nms_boxes[:, 1],
        post_nms_boxes[:, 2],
        post_nms_boxes[:, 3],
        sorted_scores,
        tf.to_float(post_nms_classes),
    ],
                                 axis=1)
    return detections_result
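
The top-k above runs over the flattened [num_boxes, num_classes - 1] score
matrix, so each returned index encodes both an anchor and a class; the
tf.mod / tf.floordiv pair recovers that pair. A minimal NumPy sketch of the
same index arithmetic (illustrative only; the sizes are made up):

import numpy as np

num_boxes, num_fg_classes = 4, 3  # hypothetical sizes
scores = np.random.rand(num_boxes, num_fg_classes)
flat_indices = np.argsort(scores.ravel())[::-1][:5]  # top-5 flat indices

# Same arithmetic as the tf.mod / tf.floordiv pair above.
class_ids = flat_indices % num_fg_classes    # column (class) index
anchor_ids = flat_indices // num_fg_classes  # row (anchor) index

assert np.all(scores[anchor_ids, class_ids] == scores.ravel()[flat_indices])
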
Example #21
0
def multilevel_crop_and_resize(features,
                               boxes,
                               output_size=7,
                               is_gpu_inference=False):
  """Crop and resize on multilevel feature pyramid.

  Generate the (output_size, output_size) set of pixels for each input box
  by first locating the box into the correct feature level, and then cropping
  and resizing it using the corresponding feature map of that level.

  Args:
    features: A dictionary with key as pyramid level and value as features. The
      features are in shape of [batch_size, height_l, width_l, num_filters].
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
      a box with [y1, x1, y2, x2] in un-normalized coordinates.
    output_size: A scalar to indicate the output crop size.
    is_gpu_inference: whether to build the model for GPU inference.

  Returns:
    A 5-D tensor representing feature crop of shape
    [batch_size, num_boxes, output_size, output_size, num_filters].
  """
  with tf.name_scope('multilevel_crop_and_resize'):
    levels = features.keys()
    min_level = min(levels)
    max_level = max(levels)
    _, max_feature_height, max_feature_width, _ = (
        features[min_level].get_shape().as_list())
    # Stack feature pyramid into a features_all of shape
    # [batch_size, levels, height, width, num_filters].
    features_all = []
    for level in range(min_level, max_level + 1):
      features_all.append(
          tf.image.pad_to_bounding_box(features[level], 0, 0,
                                       max_feature_height, max_feature_width))
    features_all = tf.stack(features_all, axis=1)

    # Assign boxes to the right level.
    box_width = tf.squeeze(boxes[:, :, 3:4] - boxes[:, :, 1:2], axis=-1)
    box_height = tf.squeeze(boxes[:, :, 2:3] - boxes[:, :, 0:1], axis=-1)
    areas_sqrt = tf.sqrt(box_height * box_width)
    levels = tf.floordiv(tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0
    if not is_gpu_inference:
      levels = tf.cast(levels, dtype=tf.int32)

    # Map levels between [min_level, max_level].
    levels = tf.minimum(
        float(max_level) if is_gpu_inference else max_level,
        tf.maximum(levels,
                   float(min_level) if is_gpu_inference else min_level))

    # Project box location and sizes to corresponding feature levels.
    scale_to_level = tf.cast(
        tf.pow(
            tf.constant(2.0),
            levels if is_gpu_inference else tf.cast(levels, tf.float32)),
        dtype=boxes.dtype)
    boxes /= tf.expand_dims(scale_to_level, axis=2)
    box_width /= scale_to_level
    box_height /= scale_to_level
    boxes = tf.concat([boxes[:, :, 0:2],
                       tf.expand_dims(box_height, -1),
                       tf.expand_dims(box_width, -1)], axis=-1)

    # Map levels to [0, max_level-min_level].
    levels -= min_level
    level_strides = tf.pow(
        [[2.0]], levels if is_gpu_inference else tf.cast(levels, tf.float32))
    boundary = tf.cast(
        tf.concat([
            tf.expand_dims([[tf.cast(max_feature_height, tf.float32)]] /
                           level_strides - 1,
                           axis=-1),
            tf.expand_dims([[tf.cast(max_feature_width, tf.float32)]] /
                           level_strides - 1,
                           axis=-1),
        ], axis=-1),
        boxes.dtype)

    return selective_crop_and_resize(
        features_all, boxes, levels, boundary, output_size, is_gpu_inference)
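
The level assignment above implements the FPN heuristic: a box whose
sqrt-area equals 224 pixels (the ImageNet pretraining crop size) maps to
level 4, and every doubling of box scale moves it one level up, clamped to
[min_level, max_level]. A small pure-Python check of that formula (an
illustration, not part of the example; the level bounds are assumed):

import math

def assign_level(box_height, box_width, min_level=2, max_level=5):
  # floor(log2(sqrt(area) / 224)) + 4, then clamp -- mirrors the TF ops above.
  level = math.floor(math.log2(math.sqrt(box_height * box_width) / 224.0)) + 4
  return min(max_level, max(min_level, level))

assert assign_level(224, 224) == 4  # unit scale maps to level 4
assert assign_level(448, 448) == 5  # doubling the scale moves one level up
assert assign_level(64, 64) == 2    # small boxes clamp to min_level
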
Example #22
0
File: ops.py Project: xjx0524/models
    boxes: A float tensor of shape [batch, num_boxes, 4] containing boxes of the
      form [ymin, xmin, ymax, xmax] in normalized coordinates.

  Returns:
    An int32 tensor of shape [batch_size, num_boxes] containing feature indices.
  """
  assert num_levels > 0, (
      '`num_levels` must be > 0. Found {}'.format(num_levels))
  assert unit_scale_index < num_levels and unit_scale_index >= 0, (
      '`unit_scale_index` must be in [0, {}). Found {}.'.format(
          num_levels, unit_scale_index))
  box_height_width = boxes[:, :, 2:4] - boxes[:, :, 0:2]
  areas_sqrt = tf.sqrt(tf.reduce_prod(box_height_width, axis=2))
  log_2 = tf.cast(tf.log(2.0), dtype=boxes.dtype)
  levels = tf.cast(
      tf.floordiv(tf.log(areas_sqrt * image_ratio), log_2)
      +
      unit_scale_index,
      dtype=tf.int32)
  levels = tf.maximum(0, tf.minimum(num_levels - 1, levels))
  return levels
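
Here the boxes are in normalized coordinates, so sqrt(area) is the box side
length as a fraction of the image; scaling it by image_ratio expresses the
box relative to the pretraining image, and a box at exactly pretraining
scale lands on unit_scale_index. A quick illustrative check with made-up
numbers:

import math

def level_for_box(area_sqrt, image_ratio, unit_scale_index, num_levels):
  # floor(log2(sqrt(area) * image_ratio)) + unit_scale_index, then clamp.
  level = math.floor(math.log2(area_sqrt * image_ratio)) + unit_scale_index
  return max(0, min(num_levels - 1, level))

# A box at pretraining scale maps to unit_scale_index.
assert level_for_box(1.0, 1.0, unit_scale_index=2, num_levels=5) == 2
# A half-scale box maps one level down.
assert level_for_box(0.5, 1.0, unit_scale_index=2, num_levels=5) == 1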


def bfloat16_to_float32_nested(input_nested):
  """Converts bfloat16 tensors in a nested structure to float32.

  Args:
    input_nested: A Python dict, values being Tensor or Python list/tuple of
      Tensor or Non-Tensor.

  Returns:
    The same nested structure as `input_nested`, with any bfloat16 tensors
      converted to float32.
  """
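
The body of this function is cut off in the listing. A minimal sketch of one
way such a conversion could look, assuming tf.nest.map_structure is
available (this is a guess at the intent, not the original implementation):

import tensorflow as tf

def _bfloat16_to_float32_sketch(input_nested):
  """Casts every bfloat16 tensor in a nested structure to float32."""
  def _maybe_cast(value):
    if isinstance(value, tf.Tensor) and value.dtype == tf.bfloat16:
      return tf.cast(value, tf.float32)
    return value
  return tf.nest.map_structure(_maybe_cast, input_nested)
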
    def create_perturbation_ops(self, minibatch, synonym_values, vocab_table):
        """Perturb data_batch using synonym_values."""
        data_batch = _pad_fixed(
            utils.get_padded_indexes(vocab_table, minibatch.tokens,
                                     self.batch_size),
            axis=1,
            padded_length=self.config['max_padded_length'])

        # synonym_values: [vocab_size x max_num_synonyms]
        # data_batch: [batch_size x seq_length]
        # [batch_size x seq_length x max_num_synonyms] - synonyms for each token.
        # Defaults to same word in case of no other synonyms.
        synonym_ids = tf.gather(synonym_values, data_batch, axis=0)

        # Split along batchsize. Elements shape: [seq_length x max_num_synonyms].
        synonym_ids_per_example = tf.unstack(synonym_ids, axis=0)

        # Loop across batch.
        # synonym_ids_this_example shape: [seq_length x max_num_synonyms]
        sequence_positions_across_batch, values_across_batch = [], []
        for i_sample, synonym_ids_this_example in enumerate(
                synonym_ids_per_example):
            # [num_nonzero, 2]. The rows are pairs of (t,s), where t is an index for
            # a time step, and s is an index into the max_num_synonyms dimension.
            nonzero_indices = tf.where(synonym_ids_this_example)

            # shape [num_nonzero]. Corresponding to the entries at nonzero_indices
            synonym_tokens = tf.gather_nd(params=synonym_ids_this_example,
                                          indices=nonzero_indices)

            # [num_nonzero] - Of the (t,s) pairs in nonzero_indices, pick only the
            # time dimension (t), corresponding to perturbation positions in the
            # sequence.
            perturbation_positions_this_example = nonzero_indices[:, 0]

            # The main logic is done. Now follows padding to a fixed length of
            # num_perturbations. However, this cannot be done with 0-padding, as it
            # would introduce a new (zero) vertex. Instead, we duplicate existing
            # tokens as perturbations (which have no effect), until we have reached a
            # total of num_perturbations perturbations. In this case, the padded
            # tokens are the original tokens from the data_batch. The padded positions
            # are all the positions (using range) corresponding to the padded tokens.

            # How often seq-length fits into maximum num perturbations
            padding_multiplier = tf.floordiv(
                self.config['num_perturbations'],
                tf.cast(minibatch.num_tokens[i_sample], tf.int32)) + 1

            # Original tokens, shape [seq_length].
            original_tokens = data_batch[
                i_sample, :minibatch.num_tokens[i_sample]]
            # [padding_multiplier * seq_length]. Repeat several times, use as padding.
            padding_tokens = tf.tile(original_tokens,
                                     multiples=[padding_multiplier])
            synonym_tokens_padded = tf.concat(
                [synonym_tokens,
                 tf.cast(padding_tokens, dtype=tf.int64)],
                axis=0)
            # Crops to exactly num_perturbations entries.
            synonym_tokens_padded = (
                synonym_tokens_padded[:self.config['num_perturbations']])

            # [seq_length] padding sequence positions with tiles of range()
            pad_positions = tf.range(minibatch.num_tokens[i_sample], delta=1)
            # [padding_multiplier*seq_length]
            padding_positions = tf.tile(pad_positions,
                                        multiples=[padding_multiplier])
            perturbation_positions_this_example_padded = tf.concat(
                [perturbation_positions_this_example,
                 tf.cast(padding_positions, dtype=tf.int64)],
                axis=0)
            # Crops to exactly num_perturbations entries.
            sequence_positions_padded = (
                perturbation_positions_this_example_padded[
                    :self.config['num_perturbations']])

            # Collect across the batch for tf.stack later.
            sequence_positions_across_batch.append(sequence_positions_padded)
            values_across_batch.append(synonym_tokens_padded)

        # Both [batch_size x num_perturbations]
        perturbation_positions = tf.stack(sequence_positions_across_batch,
                                          axis=0)
        perturbation_tokens = tf.stack(values_across_batch, axis=0)

        # Explicitly setting the shape to self.config['num_perturbations']
        perturbation_positions_shape = perturbation_positions.shape.as_list()
        perturbation_positions_shape[1] = self.config['num_perturbations']
        perturbation_positions.set_shape(perturbation_positions_shape)
        perturbation_tokens_shape = perturbation_tokens.shape.as_list()
        perturbation_tokens_shape[1] = self.config['num_perturbations']
        perturbation_tokens.set_shape(perturbation_tokens_shape)

        return Perturbation(positions=perturbation_positions,
                            tokens=perturbation_tokens)
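
The padding scheme above repeats each example's own tokens (a no-op
perturbation) until num_perturbations entries exist, then crops. A tiny
pure-Python illustration of the arithmetic with made-up sizes:

num_perturbations = 7    # hypothetical config value
tokens = [11, 12, 13]    # seq_length = 3
synonym_tokens = [42]    # one real synonym found

multiplier = num_perturbations // len(tokens) + 1  # 7 // 3 + 1 == 3
padded = (synonym_tokens + tokens * multiplier)[:num_perturbations]
assert padded == [42, 11, 12, 13, 11, 12, 13]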