def _noisy_identity_kernel_initializer(shape,
                                       dtype=tf.float32,
                                       partition_info=None):
  """Builds a convolution kernel that is a near-identity plus noise.

  `base_num_channels` and `stddev` are read from the enclosing scope.

  Args:
    shape: List of four integers, unpacked as
      `[filter_height, filter_width, in_channels, out_channels]`.
    dtype: Data type of the values in the returned kernel.
    partition_info: Partition information handed to initializer functions.
      Not used.

  Returns:
    A tensor of the requested shape and dtype. Entries at the spatial centre
    whose input and output channel fall into the same channel multiple hold
    `base_num_channels / out_channels`; all remaining entries are drawn from
    a truncated normal with standard deviation `stddev`.

  Raises:
    ValueError: If `shape` is not of length 4, if the filter is not square,
      if the filter size is not odd, or if `in_channels` / `out_channels`
      are not both multiples of `base_num_channels`.
  """
  if len(shape) != 4:
    raise ValueError("Convolution kernels must be rank 4.")

  kernel_h, kernel_w, channels_in, channels_out = shape

  if kernel_w != kernel_h:
    raise ValueError(
        "Noisy identity initializer only works for square filters.")
  if kernel_w % 2 != 1:
    raise ValueError(
        "Noisy identity initializer requires filters have odd height and "
        "width.")
  if not (channels_in % base_num_channels == 0 and
          channels_out % base_num_channels == 0):
    raise ValueError("in_channels and out_channels must both be multiples of "
                     "base_num_channels.")

  # Spatial mask: true only at the centre tap of the (square, odd) filter.
  center = kernel_h // 2
  on_center_row = tf.equal(_range_along_dimension(0, shape), center)
  on_center_col = tf.equal(_range_along_dimension(1, shape), center)
  at_center = tf.logical_and(on_center_row, on_center_col)

  # Channel mask: true where the input and output channel land in the same
  # one of the `base_num_channels` groups.
  in_group = tf.floordiv(
      _range_along_dimension(2, shape) * base_num_channels, channels_in)
  out_group = tf.floordiv(
      _range_along_dimension(3, shape) * base_num_channels, channels_out)
  same_group = tf.equal(in_group, out_group)

  # The noise op is created at this point (after the masks, before the
  # final select) so that its position in the graph is unchanged.
  noise = tf.truncated_normal(shape, stddev=stddev, dtype=dtype)

  identity_mask = tf.logical_and(same_group, at_center)
  identity_value = tf.ones(shape, dtype=dtype) * (
      base_num_channels / channels_out)
  return tf.where(identity_mask, identity_value, noise)
def fpn_feature_levels(num_levels, unit_scale_index, image_ratio, boxes):
  """Assigns every box to an FPN feature level according to its size.

  See section 4.2 of https://arxiv.org/pdf/1612.03144.pdf for details.

  Args:
    num_levels: Integer number of feature levels boxes can be cropped from.
      Must be positive.
    unit_scale_index: 0-based integer index of the feature map whose
      resolution most closely matches that of the pretrained model. Must lie
      in `[0, num_levels)`.
    image_ratio: A float, the ratio of input image area to pretraining image
      area.
    boxes: A float tensor of shape [batch, num_boxes, 4] holding boxes as
      [ymin, xmin, ymax, xmax] in normalized coordinates.

  Returns:
    An int32 tensor of shape [batch_size, num_boxes] with the feature level
    index of each box, clipped to `[0, num_levels - 1]`.
  """
  assert num_levels > 0, (
      '`num_levels` must be > 0. Found {}'.format(num_levels))
  assert 0 <= unit_scale_index < num_levels, (
      '`unit_scale_index` must be in [0, {}). Found {}.'.format(
          num_levels, unit_scale_index))

  # Side length of each box, taken as sqrt(height * width).
  heights_widths = boxes[:, :, 2:4] - boxes[:, :, 0:2]
  box_areas = tf.reduce_prod(heights_widths, axis=2)
  side_lengths = tf.sqrt(box_areas)

  # floor(log2(side * ratio)) expressed through natural logs.
  ln_two = tf.cast(tf.log(2.0), dtype=boxes.dtype)
  ln_scaled_side = tf.log(side_lengths * image_ratio)
  raw_levels = tf.cast(
      tf.floordiv(ln_scaled_side, ln_two) + unit_scale_index, dtype=tf.int32)

  # Clamp into the range of levels that actually exist.
  capped_levels = tf.minimum(num_levels - 1, raw_levels)
  return tf.maximum(0, capped_levels)
def extend_partial_state(self, JCK, potentials, map_to_indices, l_br, r_br, r):
    """Samples one pair of particles per row and merges it in the jump chain.

    One index per row is drawn from `potentials` with `tf.random.categorical`.
    The pair of particle positions it refers to is looked up through
    `map_to_indices`, the two particles are joined with '+' and appended to
    the particles that were not chosen.

    Args:
        JCK: Tensor of particle labels. Its entries are joined with string
            concatenation below, so it is presumably a `[K, N - r]` string
            tensor (TODO confirm against the caller).
        potentials: Logits handed to `tf.random.categorical`; also gathered
            at the sampled index to form the returned proposal value.
        map_to_indices: Tensor indexed with `tf.gather_nd` by
            `sampled_index // self.M`; presumably maps a pair id to the two
            particle positions of that pair (TODO confirm).
        l_br: Tensor gathered at the sampled index; presumably left branch
            values (confirm against caller).
        r_br: Tensor gathered at the sampled index; presumably right branch
            values (confirm against caller).
        r: Value subtracted from `self.N` to obtain the number of particles
            currently in `JCK`.

    Returns:
        A tuple `(coalesced_indices, remaining_indices, q_log_proposal, l_br,
        r_br, JCK)` where `JCK` is the new jump chain with the merged
        particle appended as the last column.
    """
    # Size passed to gather_across_2d for `potentials`, `l_br` and `r_br`.
    shape_1 = tf.cast(ncr(self.N-r, 2)*self.M, tf.int32)
    indices = tf.cast(tf.random.categorical(potentials, 1), tf.int32)
    # NOTE(review): despite the name this is the quotient of the sampled
    # index by self.M, not a remainder.
    indices_remainder = tf.floordiv(indices, self.M)
    coalesced_indices = tf.cast(tf.gather_nd(map_to_indices, indices_remainder), tf.int32)
    # Large penalty (N * 10) at every chosen position; zero elsewhere.
    transformed_coalesced_indices = tf.cast(
        self.N*10*tf.reduce_sum(tf.one_hot(coalesced_indices, self.N-r), axis=1), tf.int32)
    all_indices = tf.tile(tf.expand_dims(tf.range(self.N-r), axis=0), [self.K,1])
    # After subtracting the penalty the chosen positions are the smallest
    # values, so the top N - r - 2 entries are the positions NOT chosen.
    # top_k returns them from largest to smallest.
    remaining_indices, _ = tf.nn.top_k(all_indices - transformed_coalesced_indices, self.N - r - 2)
    JC_keep = gather_across_2d(JCK, remaining_indices, self.N-r, self.N-r-2)
    particles = gather_across_2d(JCK, coalesced_indices, self.N-r, 2)
    particle1 = particles[:, 0]
    particle2 = particles[:, 1]
    # Form new state
    particle_coalesced = particle1 + '+' + particle2
    # Form new Jump Chain
    JCK = tf.concat([JC_keep, tf.expand_dims(particle_coalesced, axis=1)], axis=1)
    q_log_proposal = gather_across_2d(potentials, indices, shape_1, 1)
    q_log_proposal = tf.reduce_mean(q_log_proposal, axis=1)  # q should be Kx1, but is Kx?, and reduce_mean simply changes ? to 1
    l_br = gather_across_2d(l_br, indices, shape_1, 1)
    l_br = tf.squeeze(tf.reduce_mean(l_br, axis=1))
    r_br = gather_across_2d(r_br, indices, shape_1, 1)
    r_br = tf.squeeze(tf.reduce_mean(r_br, axis=1))
    return coalesced_indices, remaining_indices, q_log_proposal, l_br, r_br, JCK
def _map_fn(index):
  """Turns an integer index into a (sparse features, label) pair.

  Args:
    index: Integer tensor.

  Returns:
    A tuple `(features, label)`. `features` maps `KEY_NAME` to the sparse
    form (via `dense_to_sparse`, as int64) of the two values
    `[index // 2 + index % 2, index // 2 + index % 2 + 1]`. `label` is
    `index + 1` as a float32 tensor of shape [1].
  """
  half = tf.floordiv(index, 2)
  parity = tf.floormod(index, 2)

  label = tf.reshape(tf.cast(index + 1, tf.float32), [1])

  first_value = half + parity
  target_dense = tf.stack([first_value, first_value + 1])
  features = {KEY_NAME: dense_to_sparse(target_dense, tf.int64)}
  return (features, label)
def define_predictions(self, features, outputs):
  """Builds the dictionary of model predictions.

  Args:
    features: Dict of input features. The entries "example_id",
      "service_id", "is_real_example", "noncat_alignment_start" and
      "noncat_alignment_end" are copied into the result unchanged.
    outputs: Dict of model logits keyed by "logit_intent_status",
      "logit_req_slot_status", "logit_cat_slot_status",
      "logit_cat_slot_value", "logit_noncat_slot_status",
      "logit_noncat_slot_start" and "logit_noncat_slot_end".

  Returns:
    Dict mapping prediction names to tensors.
  """

  def _argmax_last_axis(logit_key):
    return tf.argmax(outputs[logit_key], axis=-1)

  predictions = {
      key: features[key]
      for key in ("example_id", "service_id", "is_real_example")
  }

  # One score per intent.
  # Note that the intent indices are shifted by 1 to account for NONE intent.
  predictions["intent_status"] = _argmax_last_axis("logit_intent_status")

  # One independent probability per requested slot.
  predictions["req_slot_status"] = tf.sigmoid(
      outputs["logit_req_slot_status"])

  # Categorical slots: the status of each slot and its predicted value.
  predictions["cat_slot_status"] = _argmax_last_axis("logit_cat_slot_status")
  predictions["cat_slot_value"] = _argmax_last_axis("logit_cat_slot_value")

  # Non-categorical slots: the status of each slot and the span indices.
  predictions["noncat_slot_status"] = _argmax_last_axis(
      "logit_noncat_slot_status")
  start_probs = tf.nn.softmax(outputs["logit_noncat_slot_start"], axis=-1)
  end_probs = tf.nn.softmax(outputs["logit_noncat_slot_end"], axis=-1)
  _, num_slots, num_tokens = end_probs.get_shape().as_list()
  batch_size = tf.shape(end_probs)[0]

  # pair_scores[b, s, i, j] = start_probs[b, s, i] + end_probs[b, s, j].
  pair_scores = (tf.expand_dims(start_probs, axis=3) +
                 tf.expand_dims(end_probs, axis=2))

  # Zero out every (start, end) pair in which the start lies after the end.
  token_positions = tf.range(num_tokens)
  start_idx = tf.reshape(token_positions, [1, 1, -1, 1])
  end_idx = tf.reshape(token_positions, [1, 1, 1, -1])
  start_after_end = tf.tile(start_idx > end_idx,
                            [batch_size, num_slots, 1, 1])
  pair_scores = tf.where(start_after_end, tf.zeros_like(pair_scores),
                         pair_scores)

  # Best pair per slot, found on the flattened (start, end) grid and then
  # split back into its row (start) and column (end).
  flat_scores = tf.reshape(pair_scores, [-1, num_slots, num_tokens**2])
  best_flat_index = tf.argmax(flat_scores, axis=-1)
  predictions["noncat_slot_start"] = tf.floordiv(best_flat_index, num_tokens)
  predictions["noncat_slot_end"] = tf.floormod(best_flat_index, num_tokens)

  # Add inverse alignments.
  predictions["noncat_alignment_start"] = features["noncat_alignment_start"]
  predictions["noncat_alignment_end"] = features["noncat_alignment_end"]

  return predictions
def _loop_body(i_, img_, xstartpreds_):
  """Runs one sampling step and records the current prediction of x0.

  Reads `self`, `denoise_fn`, `noise_fn`, `shape`, `include_xstartpred_freq`
  and `num_recorded_xstartpred` from the enclosing scope.

  Args:
    i_: Current timestep.
    img_: Current sample x_t.
    xstartpreds_: Tensor of recorded x0 predictions, with the recording slot
      on axis 1.

  Returns:
    `[i_ - 1, sample, new_xstartpreds]`: the next timestep, the sample drawn
    for it, and the recorded predictions with slot
    `i_ // include_xstartpred_freq` (if it exists) overwritten by the current
    prediction of x0.
  """
  # Sample p(x_{t-1} | x_t) as usual
  timesteps = tf.fill([shape[0]], i_)
  sample, pred_xstart = self.p_sample(
      denoise_fn=denoise_fn,
      x=img_,
      t=timesteps,
      noise_fn=noise_fn,
      return_pred_xstart=True)
  assert sample.shape == pred_xstart.shape == shape

  # Keep track of prediction of x0: a 0/1 selector over the recording slots.
  target_slot = tf.floordiv(i_, include_xstartpred_freq)
  slot_ids = tf.range(num_recorded_xstartpred, dtype=tf.int32)
  selector = tf.cast(tf.equal(target_slot, slot_ids), dtype=tf.float32)
  # Reshape so it broadcasts against xstartpreds_: [1, N, 1, 1, 1]
  selector_shape = [1, num_recorded_xstartpred] + [1] * len(shape[1:])
  selector = tf.reshape(selector, selector_shape)

  written = selector * pred_xstart[:, None, ...]
  retained = (1. - selector) * xstartpreds_
  new_xstartpreds = written + retained
  return [i_ - 1, sample, new_xstartpreds]
def _single_block_latent_softmax(latents_pred, latents_discrete, hparams,
                                 vocab_size):
  """Predicts latents with a single softmax over the full vocabulary."""
  logits = tf.layers.dense(latents_pred, vocab_size, name="extra_logits")
  if hparams.logit_normalization:
    logits *= tf.rsqrt(1e-8 + tf.reduce_mean(tf.square(logits)))

  if latents_discrete is None:
    loss = None
  elif hparams.soft_em:
    # latents_discrete is actually one-hot of multinomial samples
    assert hparams.num_decode_blocks == 1
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=latents_discrete, logits=logits)
  else:
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=latents_discrete, logits=logits)

  sample = multinomial_sample(logits, vocab_size, hparams.sampling_temp)
  return sample, loss


def _multi_block_latent_softmax(latents_pred, latents_discrete, hparams,
                                vocab_size):
  """Predicts latents as several base-`block_vocab_size` digits."""
  num_blocks = hparams.num_decode_blocks
  vocab_bits = int(math.log(vocab_size, 2))
  assert vocab_size == 2**vocab_bits
  assert vocab_bits % num_blocks == 0
  block_vocab_size = 2**(vocab_bits // num_blocks)

  logits_per_block = [
      tf.layers.dense(
          latents_pred, block_vocab_size, name="extra_logits_%d" % block)
      for block in range(num_blocks)
  ]

  loss = None
  if latents_discrete is not None:
    # Digit `block` of the target in base `block_vocab_size`, least
    # significant digit first.
    loss = sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.floormod(
                tf.floordiv(latents_discrete, block_vocab_size**block),
                block_vocab_size),
            logits=block_logits)
        for block, block_logits in enumerate(logits_per_block))

  digit_samples = [
      multinomial_sample(block_logits, block_vocab_size,
                         hparams.sampling_temp)
      for block_logits in logits_per_block
  ]
  # Recombine the sampled digits into a single integer latent.
  weighted_digits = [
      digit * block_vocab_size**block
      for block, digit in enumerate(digit_samples)
  ]
  sample = sum(weighted_digits)
  return sample, loss


def ae_latent_softmax(latents_pred, latents_discrete, hparams):
  """Predicts discrete latents and computes the prediction loss.

  With fewer than two decode blocks a single softmax over `2**z_size`
  classes is used. Otherwise the latent is split into
  `hparams.num_decode_blocks` equally sized digits, each predicted by its own
  softmax.

  Args:
    latents_pred: Tensor the logits are computed from with a dense layer.
    latents_discrete: Target latents, or None to skip the loss. When
      `hparams.soft_em` is set (single-block case only) these are one-hot
      multinomial samples; otherwise integer class ids.
    hparams: Hyperparameters; reads `z_size`, `num_decode_blocks`,
      `logit_normalization`, `soft_em` and `sampling_temp`.

  Returns:
    A tuple `(sample, loss)`; `loss` is None when `latents_discrete` is None.
  """
  vocab_size = 2**hparams.z_size
  if hparams.num_decode_blocks < 2:
    return _single_block_latent_softmax(latents_pred, latents_discrete,
                                        hparams, vocab_size)
  # Multi-block case.
  return _multi_block_latent_softmax(latents_pred, latents_discrete, hparams,
                                     vocab_size)
def unravel_index_2d(indices, dims):
  """Converts flat row-major indices into (row, column) pairs, 2D only.

  See Numpy's unravel.

  Args:
    indices: <int32> [num_elements], flat coordinates into a row-major 2D
      tensor.
    dims: (N, M), dimensions of the 2D tensor. Only `dims[1]` is read.

  Returns:
    coordinates: <int32> [2, num_elements], with the row indices first and
    the column indices second.
  """
  num_cols = dims[1]
  rows = tf.floordiv(indices, num_cols)
  cols = tf.floormod(indices, num_cols)
  return tf.stack([rows, cols], axis=0)
def bucket_fn(x):
  """Updates the length histogram with `x` and returns the bucket of `x`.

  Reads `len_fn`, `hist_bounds`, `hist_counts`, `n_hist_binds` and
  `n_buckets` from the enclosing scope, and adds to `hist_counts` as a side
  effect.

  Args:
    x: Element handed to `len_fn`.

  Returns:
    The bucket index of `x`.

  Raises:
    ValueError: If `len_fn` returns a tensor that is neither int32 nor int64.
  """
  length = len_fn(x)
  length_dtype = length.dtype
  if length_dtype != tf.int32 and length_dtype != tf.int64:
    raise ValueError("Len function returned a non-int")
  if length_dtype == tf.int32:
    length = tf.to_int64(length)

  # 1 for every bound strictly greater than the length, 0 otherwise.
  increments = tf.to_int64(tf.greater(hist_bounds, length))
  # pad with a 1 for the "larger than all" bin
  increments = tf.pad(increments, [[0, 1]], constant_values=1)
  updated_counts = tf.assign_add(hist_counts, increments)

  bin_index = n_hist_binds - tf.reduce_sum(increments)
  # Computes the quantile based on the counts of the example's bucket
  scaled_count = (n_buckets - 1) * updated_counts[bin_index]
  return tf.floordiv(scaled_count, updated_counts[-1])
def compute_progress(current_image_id, stable_stage_num_images,
                     transition_stage_num_images, num_blocks):
  """Computes the training progress from the number of images seen.

  Training alternates between a stable phase and a transition phase. The
  returned `progress` encodes where training currently is:
    - progress = p means stable phase p,
    - progress = p + fraction means the transition between p and p + 1,
  for p = 0, 1, 2, ...

  Note the max value of progress is `num_blocks` - 1.

  In terms of LOD (of the original implementation):
    progress = `num_blocks` - 1 - LOD

  Args:
    current_image_id: A scalar integer `Tensor` of the current image id,
      counted from 0.
    stable_stage_num_images: An integer, the number of images in each stable
      stage.
    transition_stage_num_images: An integer, the number of images in each
      transition stage.
    num_blocks: Number of network blocks.

  Returns:
    A scalar float `Tensor` of the training progress.
  """
  # Once the final resolution is reached progress has to stay constant, so
  # the image id is capped at the last id that still changes it.
  last_image_id = min_total_num_images(
      stable_stage_num_images, transition_stage_num_images, num_blocks) - 1
  image_id = tf.minimum(current_image_id, last_image_id)

  images_per_stage = stable_stage_num_images + transition_stage_num_images
  whole_stages = tf.floordiv(image_id, images_per_stage)

  # Images seen inside the transition part of the current stage; negative
  # while still in the stable part, hence the clamp at zero below.
  images_into_transition = (
      tf.mod(image_id, images_per_stage) - stable_stage_num_images)
  transition_fraction = tf.maximum(
      0.0,
      tf.to_float(images_into_transition) /
      tf.to_float(transition_stage_num_images))

  return tf.to_float(whole_stages) + transition_fraction
def int_to_bit(self, x_int, num_bits, base=2):
  """Expands integers into their digits, least significant digit first.

  Args:
    x_int: Tensor containing the integers to be converted.
    num_bits: Number of digits to produce for each integer.
    base: Base of the representation.

  Returns:
    A float tensor with one extra trailing axis of size `num_bits`; entry `i`
    on that axis is `(x_int // base**i) % base`.
  """
  base_i32 = tf.to_int32(base)
  values = tf.to_int32(tf.to_int32(tf.expand_dims(x_int, axis=-1)))

  digits = []
  for position in range(num_bits):
    shifted = tf.floordiv(values, base_i32**position)
    digits.append(tf.floormod(shifted, base_i32))

  return tf.to_float(tf.concat(digits, axis=-1))
def loss_function(self, inputs, build_network_result):
  """Computes the mean CTC loss over the current batch.

  Args:
    inputs: The input list of the model. `inputs[1]` holds the dense labels,
      `inputs[2]` the actual number of time steps and `inputs[3]` the label
      lengths.
    build_network_result: A BuildNetworkResult returned by build_network();
      its `logits` are used.

  Returns:
    The scalar loss tensor of the model.
  """
  labels, actual_time_steps, label_length = inputs[1], inputs[2], inputs[3]

  probs = tf.nn.softmax(build_network_result.logits)

  # Rescale the true input lengths from the `self.max_time_steps` axis to the
  # time axis of the network output.
  output_time_steps = tf.shape(probs)[1]
  scaled_steps = tf.to_float(
      tf.multiply(actual_time_steps, output_time_steps))
  ctc_input_length = tf.to_int32(
      tf.floordiv(scaled_steps, tf.to_float(self.max_time_steps)))

  label_length = tf.to_int32(tf.squeeze(label_length))
  ctc_input_length = tf.to_int32(tf.squeeze(ctc_input_length))

  sparse_labels = tf.to_int32(
      tf.keras.backend.ctc_label_dense_to_sparse(labels, label_length))

  # Time-major log-probabilities; epsilon keeps the log finite.
  time_major_probs = tf.transpose(probs, perm=[1, 0, 2])
  y_pred = tf.log(time_major_probs + tf.keras.backend.epsilon())

  per_example_loss = tf.nn.ctc_loss(
      labels=sparse_labels,
      inputs=y_pred,
      sequence_length=ctc_input_length,
      ignore_longer_outputs_than_inputs=True)
  return tf.reduce_mean(tf.expand_dims(per_example_loss, axis=1))
def multilevel_crop_and_resize(features, boxes, output_size=7,
                               use_einsum_gather=False):
  """Crop and resize on multilevel feature pyramid.

  Generate the (output_size, output_size) set of pixels for each input box
  by first locating the box into the correct feature level, and then cropping
  and resizing it using the corresponding feature map of that level.

  Here is the step-by-step algorithm with use_einsum_gather=True:
  1. Compute sampling points and their four neighbors for each output point.
     Each box is mapped to [output_size, output_size] points.
     Each output point is averaged among #sampling_ratio^2 points.
     Each sampling point is computed using bilinear
     interpolation of its four neighboring points on the feature map.
  2. Gather output points separately for each level. Gather and computation of
     output points are done for the boxes mapped to this level only.
     2.1. Compute indices of four neighboring points of each sampling
          point for x and y separately of shape
          [batch_size, num_boxes, output_size, 2].
     2.2. Compute the interpolation kernel for axis x and y separately of
          shape [batch_size, num_boxes, output_size, 2, 1].
     2.3. The features are collected into a
          [batch_size, num_boxes, output_size, output_size, num_filters]
          Tensor.
          Instead of a one-step algorithm, a two-step approach is used.
          That is, first, an intermediate output is stored with a shape of
          [batch_size, num_boxes, output_size, width, num_filters];
          second, the final output is produced with a shape of
          [batch_size, num_boxes, output_size, output_size, num_filters].

          Bilinear interpolation is done during the two step gather:
          f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
                                [f10, f11]]
          [[f00, f01],
           [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
          where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.

          Note:
            a. Use one_hot with einsum to replace gather;
            b. Bilinear interpolation and averaging of
               multiple sampling points are fused into the one_hot vector.

  Args:
    features: A dictionary with key as pyramid level and value as features.
      The features are in shape of [batch_size, height_l, width_l, num_filters].
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
      represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
    output_size: A scalar to indicate the output crop size.
    use_einsum_gather: use einsum to replace gather or not. Replacing gather
      with einsum can improve performance when feature size is not large,
      einsum is friendly with model partition as well. Gather's performance is
      better when feature size is very large and there are multiple box levels.

  Returns:
    A 5-D tensor representing feature crop of shape
    [batch_size, num_boxes, output_size, output_size, num_filters].
  """
  with tf.name_scope('multilevel_crop_and_resize'):
    levels = list(features.keys())
    min_level = min(levels)
    max_level = max(levels)
    batch_size, max_feature_height, max_feature_width, num_filters = (
        features[min_level].get_shape().as_list())
    # Fall back to the dynamic batch size when it is not known statically.
    if batch_size is None:
      batch_size = tf.shape(features[min_level])[0]
    _, num_boxes, _ = boxes.get_shape().as_list()

    # Assigns boxes to the right level.
    box_width = boxes[:, :, 3] - boxes[:, :, 1]
    box_height = boxes[:, :, 2] - boxes[:, :, 0]
    areas_sqrt = tf.sqrt(box_height * box_width)
    # NOTE(review): 224.0 and 4.0 are hard-coded; presumably the canonical
    # box size and canonical level of the FPN level-assignment rule -- confirm.
    levels = tf.cast(
        tf.floordiv(tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0,
        dtype=tf.int32)
    # Maps levels between [min_level, max_level].
    levels = tf.minimum(max_level, tf.maximum(levels, min_level))

    # Projects box location and sizes to corresponding feature levels.
    scale_to_level = tf.cast(
        tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
        dtype=boxes.dtype)
    boxes /= tf.expand_dims(scale_to_level, axis=2)
    box_width /= scale_to_level
    box_height /= scale_to_level
    # From here on each box is [y1, x1, height, width] in level coordinates.
    boxes = tf.concat([
        boxes[:, :, 0:2],
        tf.expand_dims(box_height, -1),
        tf.expand_dims(box_width, -1)
    ], axis=-1)

    if use_einsum_gather:
      def two_step_gather_per_level(features_level, mask):
        """Performs two-step gather using einsum for every level of features."""
        (_, feature_height,
         feature_width, _) = features_level.get_shape().as_list()
        boundaries = tf.tile(
            tf.expand_dims(
                tf.expand_dims([feature_height, feature_width], 0), 0),
            [batch_size, num_boxes, 1])
        boundaries = tf.cast(boundaries, boxes.dtype)
        kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
            boxes, boundaries, output_size, sample_offset=0.5)

        # shape is:
        # [batch_size, num_boxes, output_size, 2, spatial_size]
        box_grid_y_one_hot, box_grid_x_one_hot = get_grid_one_hot(
            box_gridy0y1, box_gridx0x1, feature_height, feature_width)

        # # shape is [batch_size, num_boxes, output_size, spatial_size]
        box_grid_y_weight = tf.reduce_sum(tf.multiply(
            box_grid_y_one_hot, kernel_y), axis=-2)
        box_grid_x_weight = tf.reduce_sum(tf.multiply(
            box_grid_x_one_hot, kernel_x), axis=-2)

        # shape is [batch_size, num_boxes, output_size, width, feature]
        y_outputs = tf.einsum(
            'bhwf,bnyh->bnywf', features_level,
            tf.cast(box_grid_y_weight, dtype=features_level.dtype))

        # shape is [batch_size, num_boxes, output_size, output_size, feature]
        x_outputs = tf.einsum(
            'bnywf,bnxw->bnyxf', y_outputs,
            tf.cast(box_grid_x_weight, dtype=features_level.dtype))

        # Zero the crops of boxes that are not assigned to this level.
        outputs = tf.where(tf.equal(mask, tf.zeros_like(mask)),
                           tf.zeros_like(x_outputs), x_outputs)
        return outputs

      # Every box contributes at exactly one level, so summing the masked
      # per-level crops yields the crop from the assigned level.
      features_per_box = tf.zeros(
          [batch_size, num_boxes, output_size, output_size, num_filters],
          dtype=features[min_level].dtype)
      for level in range(min_level, max_level + 1):
        level_equal = tf.equal(levels, level)
        mask = tf.tile(
            tf.reshape(level_equal, [batch_size, num_boxes, 1, 1, 1]),
            [1, 1, output_size, output_size, num_filters])
        features_per_box += two_step_gather_per_level(
            features[level], mask)

      return features_per_box

    # Stack feature pyramid into a features_all of shape
    # [batch_size, levels, height, width, num_filters].
    features_all = []
    feature_heights = []
    feature_widths = []
    for level in range(min_level, max_level + 1):
      shape = features[level].get_shape().as_list()
      feature_heights.append(shape[1])
      feature_widths.append(shape[2])
      # Concat tensor of [batch_size, height_l * width_l, num_filters] for each
      # levels.
      features_all.append(
          tf.reshape(features[level], [batch_size, -1, num_filters]))
    features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])

    # Calculate height_l * width_l for each level.
    level_dim_sizes = [
        feature_widths[i] * feature_heights[i]
        for i in range(len(feature_widths))
    ]
    # level_dim_offsets is accumulated sum of level_dim_size.
    level_dim_offsets = [0]
    for i in range(len(feature_widths) - 1):
      level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
    batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
    level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
    # Despite the name this holds the WIDTH of each level: it is the row
    # stride by which a y index is multiplied in the flattened layout below.
    height_dim_sizes = tf.constant(feature_widths, tf.int32)

    # Maps levels to [0, max_level-min_level].
    levels -= min_level
    level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
    # Largest valid (y, x) index per box, derived from the size of the
    # min_level feature map.
    # NOTE(review): assumes every level halves the spatial size of the
    # previous one -- confirm against the feature extractor.
    boundary = tf.cast(
        tf.concat([
            tf.expand_dims([[tf.cast(max_feature_height, tf.float32)]] /
                           level_strides - 1,
                           axis=-1),
            tf.expand_dims([[tf.cast(max_feature_width, tf.float32)]] /
                           level_strides - 1,
                           axis=-1),
        ], axis=-1), boxes.dtype)

    # Compute grid positions.
    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
        boxes, boundary, output_size, sample_offset=0.5)

    x_indices = tf.cast(
        tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
        dtype=tf.int32)
    y_indices = tf.cast(
        tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
        dtype=tf.int32)

    # The flat index into features_r2 is the sum of four offsets:
    # batch offset + level offset + y * level_width + x.
    batch_size_offset = tf.tile(
        tf.reshape(
            tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
        [1, num_boxes, output_size * 2, output_size * 2])
    # Get level offset for each box. Each box belongs to one level.
    levels_offset = tf.tile(
        tf.reshape(tf.gather(level_dim_offsets, levels),
                   [batch_size, num_boxes, 1, 1]),
        [1, 1, output_size * 2, output_size * 2])
    y_indices_offset = tf.tile(
        tf.reshape(
            y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
            [batch_size, num_boxes, output_size * 2, 1]),
        [1, 1, 1, output_size * 2])
    x_indices_offset = tf.tile(
        tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
        [1, 1, output_size * 2, 1])
    indices = tf.reshape(
        batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
        [-1])

    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
    # performance.
    features_per_box = tf.reshape(
        tf.gather(features_r2, indices),
        [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])

    # Bilinear interpolation.
    features_per_box = feature_bilinear_interpolation(
        features_per_box, kernel_y, kernel_x)
    return features_per_box
def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
  """Generate the final detections per image given the model outputs.

  The background class (column 0) is dropped, the `pre_nms_num_detections`
  best (anchor, class) pairs are decoded into boxes, NMS is run separately
  for every foreground class and the `post_nms_num_detections` best results
  over all classes are returned.

  Args:
    cls_outputs: a tensor with shape [N, num_classes], which stacks class
      logit outputs on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the cls_outputs should be the output of softmax().
      Both dimensions must be statically known.
    box_outputs: a tensor with shape [N, num_classes*4], which stacks box
      regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width]. Only the first two
      entries are read, to clip the decoded boxes.
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of candidates
      after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: Tuple of tensors corresponding to number of valid boxes,
    box coordinates, object categories for each boxes, and box scores
    -- respectively. Categories are 1-based because the background class
    keeps index 0.
  """
  num_boxes, num_classes = cls_outputs.get_shape().as_list()
  num_foreground_classes = num_classes - 1

  # Remove background class scores.
  cls_outputs = cls_outputs[:, 1:num_classes]
  top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
      tf.reshape(cls_outputs, [-1]),
      k=pre_nms_num_detections,
      sorted=False)
  # The flattened index is anchor * num_foreground_classes + class, so
  # `classes` always lies in [0, num_foreground_classes).
  classes = tf.mod(top_k_indices_with_classes, num_foreground_classes)
  top_k_indices = tf.floordiv(top_k_indices_with_classes,
                              num_foreground_classes)

  anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
  box_outputs = tf.reshape(
      box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
  class_indices = classes
  box_outputs = tf.gather_nd(
      box_outputs, tf.stack([top_k_indices, class_indices], axis=1))

  # apply bounding box regression to anchors
  boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
  boxes = box_utils.clip_boxes(boxes, image_info[0], image_info[1])

  list_of_all_boxes = []
  list_of_all_scores = []
  list_of_all_classes = []
  # Iterate over the foreground classes only. Iterating up to `num_classes`
  # would run one extra NMS for a class index that `classes` can never take
  # and append padding entries labelled with the out-of-range class
  # `num_classes`.
  for class_i in range(num_foreground_classes):
    # Compute bitmask for the given classes.
    class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
    # This works because score is in [0, 1].
    class_i_scores = top_k_scores * class_i_bitmask
    # The TPU and CPU have different behaviors for
    # tf.image.non_max_suppression_padded (b/116754376).
    (class_i_post_nms_indices,
     class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
         tf.to_float(boxes),
         tf.to_float(class_i_scores),
         post_nms_num_detections,
         iou_threshold=nms_threshold,
         score_threshold=0.05,
         pad_to_max_output_size=True,
         name='nms_detections_' + str(class_i))
    class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
    class_i_post_nms_scores = tf.gather(class_i_scores,
                                        class_i_post_nms_indices)
    # Zero the scores of the padding entries beyond the valid NMS results.
    mask = tf.less(tf.range(post_nms_num_detections), [class_i_nms_num_valid])
    class_i_post_nms_scores = tf.where(
        mask, class_i_post_nms_scores, tf.zeros_like(class_i_post_nms_scores))
    # +1 restores the original class id (background is class 0).
    class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores), class_i + 1)
    list_of_all_boxes.append(class_i_post_nms_boxes)
    list_of_all_scores.append(class_i_post_nms_scores)
    list_of_all_classes.append(class_i_classes)

  post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
  post_nms_scores = tf.concat(list_of_all_scores, axis=0)
  post_nms_classes = tf.concat(list_of_all_classes, axis=0)

  # sort all results.
  post_nms_scores, sorted_indices = tf.nn.top_k(
      tf.to_float(post_nms_scores),
      k=post_nms_num_detections,
      sorted=True)
  post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
  post_nms_classes = tf.gather(post_nms_classes, sorted_indices)
  # Padding entries carry a score of exactly zero, so counting the positive
  # scores gives the number of real detections.
  valid_mask = tf.where(
      tf.greater(post_nms_scores, 0), tf.ones_like(post_nms_scores),
      tf.zeros_like(post_nms_scores))
  num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
  box_classes = tf.to_float(post_nms_classes)
  return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
def generate_detections_per_image(cls_outputs,
                                  box_outputs,
                                  anchor_boxes,
                                  pre_nms_num_detections=1000,
                                  post_nms_num_detections=100,
                                  nms_threshold=0.3):
  """Turns raw model outputs for one image into its final detections.

  The `pre_nms_num_detections` best (anchor, class) pairs are decoded into
  boxes, NMS is applied separately per class and the
  `post_nms_num_detections` best results over all classes are returned.

  Args:
    cls_outputs: a tensor with shape [N, num_classes] stacking the class
      outputs of all feature levels, where N is the total number of anchors.
      These should be the output of softmax(). num_classes must be statically
      known.
    box_outputs: a tensor stacking the box regression outputs of all feature
      levels, one row per anchor. Rows are gathered per anchor and passed to
      `_decode_boxes`.
      NOTE(review): previously documented as [N, num_classes*4]; confirm the
      per-row layout `_decode_boxes` expects.
    anchor_boxes: a tensor with shape [N, 4] stacking the anchors of all
      feature levels.
    pre_nms_num_detections: an integer, the number of candidates kept before
      NMS.
    post_nms_num_detections: an integer, the number of candidates kept after
      NMS.
    nms_threshold: a float, the IOU threshold of NMS.

  Returns:
    detections: Tuple of tensors corresponding to number of valid boxes,
    box coordinates, object categories for each boxes, and box scores
    -- respectively.
  """
  num_classes = cls_outputs.get_shape().as_list()[1]

  # Best (anchor, class) pairs over the flattened score matrix; the flat
  # index is anchor * num_classes + class.
  flat_scores = tf.reshape(cls_outputs, [-1])
  top_k_scores, flat_indices = tf.nn.top_k(
      flat_scores, k=pre_nms_num_detections, sorted=False)
  classes = tf.mod(flat_indices, num_classes)
  anchor_indices = tf.floordiv(flat_indices, num_classes)

  anchor_boxes = tf.gather(anchor_boxes, anchor_indices)
  box_outputs = tf.gather(box_outputs, anchor_indices)

  # apply bounding box regression to anchors
  boxes = _decode_boxes(box_outputs, anchor_boxes)

  boxes_per_class = []
  scores_per_class = []
  labels_per_class = []
  for class_i in range(num_classes):
    # Scores of the candidates of this class; all other candidates get zero.
    # This works because score is in [0, 1].
    belongs_to_class = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
    scores_i = top_k_scores * belongs_to_class

    kept_indices, num_kept = tf.image.non_max_suppression_padded(
        tf.to_float(boxes),
        tf.to_float(scores_i),
        post_nms_num_detections,
        iou_threshold=nms_threshold,
        score_threshold=0.05,
        pad_to_max_output_size=True,
        name='nms_detections_' + str(class_i))

    kept_boxes = tf.gather(boxes, kept_indices)
    kept_scores = tf.gather(scores_i, kept_indices)
    # Zero the scores of the padding entries beyond the valid NMS results.
    is_real = tf.less(tf.range(post_nms_num_detections), [num_kept])
    kept_scores = tf.where(is_real, kept_scores, tf.zeros_like(kept_scores))
    kept_labels = tf.fill(tf.shape(kept_scores), class_i + 1)

    boxes_per_class.append(kept_boxes)
    scores_per_class.append(kept_scores)
    labels_per_class.append(kept_labels)

  all_boxes = tf.concat(boxes_per_class, axis=0)
  all_scores = tf.concat(scores_per_class, axis=0)
  all_labels = tf.concat(labels_per_class, axis=0)

  # sort all results.
  post_nms_scores, order = tf.nn.top_k(
      tf.to_float(all_scores), k=post_nms_num_detections, sorted=True)
  post_nms_boxes = tf.gather(all_boxes, order)
  post_nms_classes = tf.gather(all_labels, order)

  # Padding entries have a score of exactly zero; count the positive ones.
  has_positive_score = tf.greater(post_nms_scores, 0)
  valid_mask = tf.where(has_positive_score, tf.ones_like(post_nms_scores),
                        tf.zeros_like(post_nms_scores))
  num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
  box_classes = tf.to_float(post_nms_classes)
  return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
def hier_homography_fmask_estimator(color_inputs, num_param=8, num_layer=7,
                                    num_level=3, dropout_keep_prob=0.8,
                                    reuse=None, is_training=True,
                                    trainable=True, scope='hier_hmg'):
    """A hierarchical neural network with mask for homography estimation.

    A frozen VGG-16 (up to 'pool5') is applied to the second image of each
    pair, and its output is passed to `fmask_layers_semantic` on the last
    level (only when `num_level > 1`). Homographies are regressed level by
    level on grayscale versions of the image pair: level 0 regresses directly
    from the resized pair, and each later level warps the first image with the
    previous level's homography before regressing and combining the result with
    the previous homography.

    Args:
      color_inputs: batch of input image pairs of data type float32 and of
        shape [batch_size, height, width, 6]
      num_param: the number of parameters for homography (default 8)
      num_layer: the number of convolutional layers in the motion feature
        network
      num_level: the number of hierarchical levels
      dropout_keep_prob: the percentage of activation values that are kept
      reuse: whether to reuse this network weights
      is_training: whether used for training or testing
      trainable: whether this network is to be trained or not
      scope: the scope of variables in this function

    Returns:
      A tuple `(hmgs_list, warped_list)`: the list of homographies, one per
      level, and the list of pre-warped input pairs, one per level after the
      first.
    """
    h_input, w_input = color_inputs.get_shape().as_list()[1:3]
    vgg_inputs = (color_inputs[..., 3:6] * 256 + 128) - VGG_MEANS

    # The VGG branch is built with trainable=False and is_training=False.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'), \
            slim.arg_scope([slim.conv2d, slim.fully_connected],
                           trainable=False), \
            slim.arg_scope([slim.conv2d], normalizer_fn=None), \
            slim.arg_scope(contrib_slim_nets_vgg.vgg_arg_scope()):
        sfeature, _ = contrib_slim_nets_vgg.vgg_16(
            vgg_inputs,
            1000,
            predictions_fn=slim.softmax,
            global_pool=False,
            is_training=False,
            reuse=reuse,
            spatial_squeeze=True,
            final_endpoint='pool5',
            scope='vgg_16')

    gray_first = tf.image.rgb_to_grayscale(color_inputs[..., 0:3])
    gray_second = tf.image.rgb_to_grayscale(color_inputs[..., 3:6])
    inputs = tf.concat([gray_first, gray_second], 3)

    layer_kwargs = dict(is_training=is_training, trainable=trainable)
    hmgs_list = []
    warped_list = []
    with tf.variable_scope(scope, [inputs], reuse=reuse):
        for level_index in range(num_level):
            # Coarsest level first: the input is downscaled by 2**(levels left).
            scale = 2 ** (num_level - 1 - level_index)
            h = tf.to_float(tf.floordiv(h_input, scale))
            w = tf.to_float(tf.floordiv(w_input, scale))
            inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
            depth = num_layer + 1 - num_level + level_index
            is_first_level = level_index == 0

            if is_first_level:
                level_inputs = inputs_il
            else:
                # Warp the first image with the previous level's homography.
                warped, _ = hmg_util.homography_scale_warp_per_batch(
                    inputs_il[:, :, :, 0], w / 2, h / 2,
                    hmgs_list[level_index - 1])
                level_inputs = tf.stack([warped, inputs_il[:, :, :, 1]], -1)
                warped_list.append(level_inputs)

            mfeature = hier_base_layers(
                level_inputs, depth, level_index, **layer_kwargs)
            if not is_first_level and level_index == num_level - 1:
                mfeature = fmask_layers_semantic(
                    mfeature, sfeature, level_index, **layer_kwargs)
            hmgs_il = homography_regression(
                mfeature, num_param, level_index,
                dropout_keep_prob=dropout_keep_prob, **layer_kwargs)

            if is_first_level:
                hmgs_list.append(hmgs_il)
            else:
                hmgs_list.append(hmg_util.homography_shift_mult_batch(
                    hmgs_list[level_index - 1], w / 2, h / 2,
                    hmgs_il, w, h, w, h))
    return hmgs_list, warped_list
def hier_homography_estimator(inputs, num_param=8, num_layer=7, num_level=3,
                              dropout_keep_prob=0.8, reuse=None,
                              is_training=True, trainable=True,
                              final_endpoint=None, scope='hier_hmg'):
    """A hierarchical VGG-style neural network for homography estimation.

    Level 0 regresses a homography directly from the resized input pair. Each
    later level warps the first channel with the previous level's homography,
    regresses a homography on the pre-warped pair and combines it with the
    previous one.

    Args:
      inputs: batch of input image pairs of data type float32 and of shape
        [batch_size, height, width, 2]
      num_param: the number of parameters for homography (default 8)
      num_layer: the number of convolutional layers in the motion feature
        network
      num_level: the number of hierarchical levels
      dropout_keep_prob: the percentage of activation values that are kept
      reuse: whether to reuse this network weights
      is_training: whether used for training or testing
      trainable: whether this network is to be trained or not
      final_endpoint: specifies the endpoint to construct the network up to
      scope: the scope of variables in this function

    Returns:
      If `final_endpoint == 'mfeature'` and the last level is not level 0, a
      tuple `(hmgs_list, mfeature)` holding the homographies of all levels
      before the last and the motion feature map of the last level (built with
      one layer fewer than usual). Otherwise a tuple `(hmgs_list, warped_list)`
      holding one homography per level and the pre-warped input pairs of every
      level after the first.
    """
    h_input, w_input = inputs.get_shape().as_list()[1:3]
    layer_kwargs = dict(is_training=is_training, trainable=trainable)
    hmgs_list = []
    warped_list = []
    with tf.variable_scope(scope, [inputs], reuse=reuse):
        for level_index in range(num_level):
            # Coarsest level first: the input is downscaled by 2**(levels left).
            scale = 2 ** (num_level - 1 - level_index)
            h = tf.to_float(tf.floordiv(h_input, scale))
            w = tf.to_float(tf.floordiv(w_input, scale))
            inputs_il = tf.image.resize_images(inputs, tf.to_int32([h, w]))
            depth = num_layer + 1 - num_level + level_index
            is_first_level = level_index == 0

            if is_first_level:
                level_inputs = inputs_il
            else:
                # Warp the first channel with the previous level's homography.
                warped, _ = hmg_util.homography_scale_warp_per_batch(
                    inputs_il[:, :, :, 0], w / 2, h / 2,
                    hmgs_list[level_index - 1])
                level_inputs = tf.stack([warped, inputs_il[:, :, :, 1]], -1)
                warped_list.append(level_inputs)
                if (level_index == num_level - 1
                        and final_endpoint == 'mfeature'):
                    # Stop at the motion features of the last level.
                    mfeature = hier_base_layers(
                        level_inputs, depth - 1, level_index, **layer_kwargs)
                    return hmgs_list, mfeature

            mfeature = hier_base_layers(
                level_inputs, depth, level_index, **layer_kwargs)
            hmgs_il = homography_regression(
                mfeature, num_param, level_index,
                dropout_keep_prob=dropout_keep_prob, **layer_kwargs)

            if is_first_level:
                hmgs_list.append(hmgs_il)
            else:
                hmgs_list.append(hmg_util.homography_shift_mult_batch(
                    hmgs_list[level_index - 1], w / 2, h / 2,
                    hmgs_il, w, h, w, h))
    return hmgs_list, warped_list
def multilevel_crop_and_resize(features, boxes, output_size=7):
    """Crop and resize on multilevel feature pyramid.

    Generates the (output_size, output_size) set of pixels for each input box
    by first assigning the box to a feature level, then gathering the
    2 * output_size by 2 * output_size neighbouring feature values from that
    level and bilinearly interpolating them.

    Args:
      features: A dictionary with key as pyramid level and value as features.
        The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
      output_size: A scalar to indicate the output crop size.

    Returns:
      A 5-D tensor representing feature crop of shape
      [batch_size, num_boxes, output_size, output_size, num_filters].
    """
    with tf.name_scope('multilevel_crop_and_resize'):
        pyramid_levels = list(features.keys())
        min_level = min(pyramid_levels)
        max_level = max(pyramid_levels)
        batch_size, max_feature_height, max_feature_width, num_filters = (
            features[min_level].get_shape().as_list())
        if batch_size is None:
            # Fall back to the dynamic batch size.
            batch_size = tf.shape(features[min_level])[0]
        _, num_boxes, _ = boxes.get_shape().as_list()
        grid_size = output_size * 2

        # Flatten every level to [batch_size, height_l * width_l, num_filters],
        # concatenate the levels and collapse to a rank-2 lookup table.
        flattened_levels = []
        feature_heights = []
        feature_widths = []
        for level in range(min_level, max_level + 1):
            level_shape = features[level].get_shape().as_list()
            feature_heights.append(level_shape[1])
            feature_widths.append(level_shape[2])
            flattened_levels.append(
                tf.reshape(features[level], [batch_size, -1, num_filters]))
        features_r2 = tf.reshape(
            tf.concat(flattened_levels, 1), [-1, num_filters])

        # Number of rows each level occupies, and the row at which it starts.
        level_dim_sizes = [
            width * height
            for width, height in zip(feature_widths, feature_heights)
        ]
        level_starts = []
        rows_so_far = 0
        for size in level_dim_sizes:
            level_starts.append(rows_so_far)
            rows_so_far += size
        batch_dim_size = rows_so_far
        level_dim_offsets = tf.constant(level_starts, tf.int32)
        # A step of one in y skips one row of the map, i.e. `width` entries.
        height_dim_sizes = tf.constant(feature_widths, tf.int32)

        # Assigns boxes to the right level:
        # floor(log2(sqrt(area) / 224)) + 4, clamped to the available levels.
        box_width = boxes[:, :, 3] - boxes[:, :, 1]
        box_height = boxes[:, :, 2] - boxes[:, :, 0]
        areas_sqrt = tf.sqrt(box_height * box_width)
        levels = tf.cast(
            tf.floordiv(tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0,
            dtype=tf.int32)
        levels = tf.minimum(max_level, tf.maximum(levels, min_level))

        # Projects box location and sizes to corresponding feature levels.
        scale_to_level = tf.cast(
            tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
            dtype=boxes.dtype)
        boxes = boxes / tf.expand_dims(scale_to_level, axis=2)
        box_width = box_width / scale_to_level
        box_height = box_height / scale_to_level
        # Boxes become [y1, x1, height, width] at the assigned level.
        boxes = tf.concat(
            [
                boxes[:, :, 0:2],
                tf.expand_dims(box_height, -1),
                tf.expand_dims(box_width, -1),
            ],
            axis=-1)

        # Maps levels to [0, max_level-min_level].
        levels = levels - min_level
        level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))

        def _last_index(extent):
            """Largest valid index along `extent` at each box's level."""
            return tf.expand_dims(
                [[tf.cast(extent, tf.float32)]] / level_strides - 1, axis=-1)

        boundary = tf.cast(
            tf.concat(
                [
                    _last_index(max_feature_height),
                    _last_index(max_feature_width),
                ],
                axis=-1),
            boxes.dtype)

        # Compute grid positions.
        kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
            boxes, boundary, output_size, sample_offset=0.5)

        x_indices = tf.cast(
            tf.reshape(box_gridx0x1, [batch_size, num_boxes, grid_size]),
            dtype=tf.int32)
        y_indices = tf.cast(
            tf.reshape(box_gridy0y1, [batch_size, num_boxes, grid_size]),
            dtype=tf.int32)

        # Row of `features_r2` = batch offset + level offset + y * width + x.
        batch_size_offset = tf.tile(
            tf.reshape(
                tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
            [1, num_boxes, grid_size, grid_size])
        # Get level offset for each box. Each box belongs to one level.
        levels_offset = tf.tile(
            tf.reshape(
                tf.gather(level_dim_offsets, levels),
                [batch_size, num_boxes, 1, 1]),
            [1, 1, grid_size, grid_size])
        y_indices_offset = tf.tile(
            tf.reshape(
                y_indices
                * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
                [batch_size, num_boxes, grid_size, 1]),
            [1, 1, 1, grid_size])
        x_indices_offset = tf.tile(
            tf.reshape(x_indices, [batch_size, num_boxes, 1, grid_size]),
            [1, 1, grid_size, 1])
        indices = tf.reshape(
            batch_size_offset + levels_offset + y_indices_offset
            + x_indices_offset,
            [-1])

        # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get
        # similar performance.
        gathered = tf.reshape(
            tf.gather(features_r2, indices),
            [batch_size, num_boxes, grid_size, grid_size, num_filters])

        # Bilinear interpolation.
        return feature_bilinear_interpolation(gathered, kernel_y, kernel_x)
def _preprocess(self, obs, obs_shape):
    """Resize the observation and quantize its values.

    Args:
      obs: the observation tensor; it is cast to float32 before resizing.
      obs_shape: the target size passed to `tf.image.resize_bilinear`.

    Returns:
      The resized observation floor-divided by
      `256 // self.quantization_factor`.
    """
    resized = tf.image.resize_bilinear(tf.cast(obs, tf.float32), obs_shape)
    # Width of one quantization bucket, as a float32 constant.
    bucket_width = tf.constant(
        256 // self.quantization_factor, dtype=tf.float32)
    return tf.floordiv(resized, bucket_width)
def generate_detections_per_image_op(cls_outputs,
                                     box_outputs,
                                     anchor_boxes,
                                     image_id,
                                     image_info,
                                     num_detections=100,
                                     pre_nms_num_detections=1000,
                                     nms_threshold=0.3,
                                     bbox_reg_weights=(10., 10., 5., 5.)):
    """Generates detections with model outputs and anchors.

    The background column is dropped, the `pre_nms_num_detections` best
    (anchor, class) pairs are decoded and clipped, and batched NMS is run with
    one row per foreground class. The survivors of all classes are then merged
    and the `num_detections` highest-scoring ones are returned.

    Args:
      cls_outputs: a Tensor with shape [N, num_classes], which stacks class
        outputs on all feature levels. The N is the number of total anchors on
        all levels. The num_classes is the number of classes predicted by the
        model. Note that the cls_outputs should be the output of softmax().
      box_outputs: a Tensor with shape [N, num_classes*4], which stacks box
        regression outputs on all feature levels.
      anchor_boxes: a Tensor with shape [N, 4], which stacks anchors on all
        feature levels.
      image_id: an integer number (Python int or tensor) to specify the image
        id.
      image_info: a tensor of shape [5] which encodes the input image's
        [height, width, scale, original_height, original_width]
      num_detections: Number of detections after NMS.
      pre_nms_num_detections: Number of candidates before NMS.
      nms_threshold: a float number to specify the threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights on
        (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      detections: detection results in a tensor with each row representing
        [image_id, ymin, xmin, ymax, xmax, score, class]
    """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    num_fg_classes = num_classes - 1

    # Removes background class scores.
    fg_scores = cls_outputs[:, 1:num_classes]
    top_k_scores, flat_indices = tf.nn.top_k(
        tf.reshape(fg_scores, [-1]), k=pre_nms_num_detections, sorted=True)
    # A flat index is anchor_index * num_fg_classes + class_index.
    class_ids = tf.mod(flat_indices, num_fg_classes)
    anchor_ids = tf.floordiv(flat_indices, num_fg_classes)

    top_anchors = tf.gather(anchor_boxes, anchor_ids)
    fg_box_outputs = tf.reshape(
        box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    # Pick, for every candidate, the regression output of its own class.
    top_box_outputs = tf.gather_nd(
        fg_box_outputs, tf.stack([anchor_ids, class_ids], axis=1))

    # Applies bounding box regression to anchors. The box utilities are called
    # with a leading batch dimension of one, which is stripped again with [0].
    decoded_boxes = box_utils.batch_decode_box_outputs_op(
        tf.expand_dims(top_anchors, axis=0),
        tf.expand_dims(top_box_outputs, axis=0),
        bbox_reg_weights)[0]
    clipped_boxes = box_utils.clip_boxes(
        tf.expand_dims(decoded_boxes, axis=0),
        tf.expand_dims(image_info[:2], axis=0))[0]

    # Replicate the candidates once per foreground class.
    tiled_classes = tf.tile(
        tf.reshape(class_ids, [1, pre_nms_num_detections]),
        [num_fg_classes, 1])
    tiled_scores = tf.tile(
        tf.reshape(top_k_scores, [1, pre_nms_num_detections]),
        [num_fg_classes, 1])
    tiled_boxes = tf.tile(
        tf.reshape(clipped_boxes, [1, pre_nms_num_detections, 4]),
        [num_fg_classes, 1, 1])

    # Row i keeps only the scores of candidates whose class is i.
    row_class = tf.tile(
        tf.reshape(tf.range(num_fg_classes), [num_fg_classes, 1]),
        [1, pre_nms_num_detections])
    masked_scores = tf.where(
        tf.equal(tiled_classes, row_class),
        tiled_scores,
        tf.zeros_like(tiled_scores))
    # Scores of at most 0.05 are zeroed out as well.
    masked_scores = tf.where(
        tf.greater(masked_scores, 0.05),
        masked_scores,
        tf.zeros_like(masked_scores))

    # Reshape classes to be compatible with the top_k function.
    classes_3d = tf.reshape(
        tiled_classes, [num_fg_classes, pre_nms_num_detections, 1])
    ranked_scores, ranked_tensors = box_utils.top_k(
        masked_scores,
        k=pre_nms_num_detections,
        tensors=[tiled_boxes, classes_3d])
    ranked_boxes = ranked_tensors[0]
    ranked_classes = tf.reshape(
        ranked_tensors[1], [num_fg_classes, pre_nms_num_detections])

    idx, num_valid = non_max_suppression.non_max_suppression_padded(
        ranked_scores,
        ranked_boxes,
        max_output_size=num_detections,
        iou_threshold=nms_threshold,
        level=0)

    post_nms_boxes = non_max_suppression.gather_boxes_by_indices(
        ranked_boxes, num_detections, idx, num_valid)
    post_nms_scores = non_max_suppression.gather_scores_by_indices(
        ranked_scores, num_detections, idx, num_valid)

    # Sorts all results.
    sorted_scores, sorted_indices = tf.nn.top_k(
        tf.to_float(tf.reshape(post_nms_scores, [-1])),
        k=num_detections,
        sorted=True)
    final_boxes = tf.gather(
        tf.reshape(post_nms_boxes, [-1, 4]), sorted_indices)
    kept_classes = tf.batch_gather(ranked_classes, idx)
    # Shift by one so that the emitted class ids skip the background slot.
    final_classes = tf.gather(
        tf.reshape(kept_classes, [-1]), sorted_indices) + 1

    image_id_scalar = image_id
    if isinstance(image_id_scalar, int):
        image_id_scalar = tf.constant(image_id_scalar)
    image_id_scalar = tf.reshape(image_id_scalar, [])

    columns = [tf.to_float(tf.fill(tf.shape(sorted_scores), image_id_scalar))]
    columns.extend(final_boxes[:, coord] for coord in range(4))
    columns.append(sorted_scores)
    columns.append(tf.to_float(final_classes))
    return tf.stack(columns, axis=1)
def multilevel_crop_and_resize(features, boxes, output_size=7,
                               is_gpu_inference=False):
    """Crop and resize on multilevel feature pyramid.

    Every feature level is padded to the size of the finest level and the
    levels are stacked. Each box is then assigned a level, projected onto it
    and handed to `selective_crop_and_resize`.

    Args:
      features: A dictionary with key as pyramid level and value as features.
        The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
      output_size: A scalar to indicate the output crop size.
      is_gpu_inference: whether to build the model for GPU inference. When
        true, the per-box levels are kept as floats instead of being cast to
        int32.

    Returns:
      A 5-D tensor representing feature crop of shape
      [batch_size, num_boxes, output_size, output_size, num_filters].
    """
    with tf.name_scope('multilevel_crop_and_resize'):
        pyramid_levels = features.keys()
        min_level = min(pyramid_levels)
        max_level = max(pyramid_levels)
        _, max_feature_height, max_feature_width, _ = (
            features[min_level].get_shape().as_list())

        # Stack feature pyramid into a features_all of shape
        # [batch_size, levels, height, width, num_filters].
        features_all = tf.stack(
            [
                tf.image.pad_to_bounding_box(
                    features[level], 0, 0,
                    max_feature_height, max_feature_width)
                for level in range(min_level, max_level + 1)
            ],
            axis=1)

        def _as_float(level_tensor):
            """Levels are already float in GPU inference; cast otherwise."""
            if is_gpu_inference:
                return level_tensor
            return tf.cast(level_tensor, tf.float32)

        # Assign boxes to the right level:
        # floor(log2(sqrt(area) / 224)) + 4, clamped to the available levels.
        box_width = tf.squeeze(boxes[:, :, 3:4] - boxes[:, :, 1:2], axis=-1)
        box_height = tf.squeeze(boxes[:, :, 2:3] - boxes[:, :, 0:1], axis=-1)
        areas_sqrt = tf.sqrt(box_height * box_width)
        levels = (
            tf.floordiv(tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0)
        if is_gpu_inference:
            # Keep the clamp bounds in the same (float) type as `levels`.
            lower_bound, upper_bound = float(min_level), float(max_level)
        else:
            levels = tf.cast(levels, dtype=tf.int32)
            lower_bound, upper_bound = min_level, max_level

        # Map levels between [min_level, max_level].
        levels = tf.minimum(upper_bound, tf.maximum(levels, lower_bound))

        # Project box location and sizes to corresponding feature levels.
        scale_to_level = tf.cast(
            tf.pow(tf.constant(2.0), _as_float(levels)), dtype=boxes.dtype)
        boxes = boxes / tf.expand_dims(scale_to_level, axis=2)
        box_width = box_width / scale_to_level
        box_height = box_height / scale_to_level
        # Boxes become [y1, x1, height, width] at the assigned level.
        boxes = tf.concat(
            [
                boxes[:, :, 0:2],
                tf.expand_dims(box_height, -1),
                tf.expand_dims(box_width, -1),
            ],
            axis=-1)

        # Map levels to [0, max_level-min_level].
        levels = levels - min_level
        level_strides = tf.pow([[2.0]], _as_float(levels))

        def _last_index(extent):
            """Largest valid index along `extent` at each box's level."""
            return tf.expand_dims(
                [[tf.cast(extent, tf.float32)]] / level_strides - 1, axis=-1)

        boundary = tf.cast(
            tf.concat(
                [
                    _last_index(max_feature_height),
                    _last_index(max_feature_width),
                ],
                axis=-1),
            boxes.dtype)

        return selective_crop_and_resize(
            features_all, boxes, levels, boundary, output_size,
            is_gpu_inference)
boxes: A float tensor of shape [batch, num_boxes, 4] containing boxes of the form [ymin, xmin, ymax, xmax] in normalized coordinates. Returns: An int32 tensor of shape [batch_size, num_boxes] containing feature indices. """ assert num_levels > 0, ( '`num_levels` must be > 0. Found {}'.format(num_levels)) assert unit_scale_index < num_levels and unit_scale_index >= 0, ( '`unit_scale_index` must be in [0, {}). Found {}.'.format( num_levels, unit_scale_index)) box_height_width = boxes[:, :, 2:4] - boxes[:, :, 0:2] areas_sqrt = tf.sqrt(tf.reduce_prod(box_height_width, axis=2)) log_2 = tf.cast(tf.log(2.0), dtype=boxes.dtype) levels = tf.cast( tf.floordiv(tf.log(areas_sqrt * image_ratio), log_2) + unit_scale_index, dtype=tf.int32) levels = tf.maximum(0, tf.minimum(num_levels - 1, levels)) return levels def bfloat16_to_float32_nested(input_nested): """Convert float32 tensors in a nested structure to bfloat16. Args: input_nested: A Python dict, values being Tensor or Python list/tuple of Tensor or Non-Tensor. Returns:
def create_perturbation_ops(self, minibatch, synonym_values, vocab_table):
    """Build, per example, a fixed number of (position, token) perturbations.

    For each example, the non-zero synonym ids of its tokens are collected
    together with the sequence positions they belong to. The lists are then
    padded with the example's own tokens (and their positions) and cropped to
    exactly `self.config['num_perturbations']` entries.

    Args:
      minibatch: batch object providing `tokens` and `num_tokens`.
      synonym_values: [vocab_size x max_num_synonyms] table of synonym ids.
      vocab_table: vocabulary table used to turn tokens into indexes.

    Returns:
      A `Perturbation` whose `positions` and `tokens` are both
      [batch_size x num_perturbations].
    """
    num_perturbations = self.config['num_perturbations']
    # [batch_size x seq_length]
    data_batch = _pad_fixed(
        utils.get_padded_indexes(
            vocab_table, minibatch.tokens, self.batch_size),
        axis=1,
        padded_length=self.config['max_padded_length'])

    # [batch_size x seq_length x max_num_synonyms] - synonyms for each token.
    # Defaults to same word in case of no other synonyms.
    synonym_ids = tf.gather(synonym_values, data_batch, axis=0)

    positions_per_example = []
    tokens_per_example = []
    # Loop across the batch; each element is [seq_length x max_num_synonyms].
    for i_sample, example_synonyms in enumerate(
            tf.unstack(synonym_ids, axis=0)):
        seq_length = minibatch.num_tokens[i_sample]

        # [num_nonzero, 2]. The rows are pairs of (t,s), where t is an index
        # for a time step, and s is an index into the max_num_synonyms
        # dimension.
        nonzero_indices = tf.where(example_synonyms)
        # [num_nonzero]. The synonym ids found at `nonzero_indices`.
        synonym_tokens = tf.gather_nd(
            params=example_synonyms, indices=nonzero_indices)
        # [num_nonzero]. Only the time step of each pair, i.e. the position
        # in the sequence that the synonym would perturb.
        synonym_positions = nonzero_indices[:, 0]

        # The lists must be padded to a fixed length of num_perturbations.
        # Zero-padding would introduce a new (zero) vertex, so the example's
        # original tokens are repeated instead; substituting a token by itself
        # has no effect.

        # How often seq-length fits into maximum num perturbations
        padding_multiplier = tf.floordiv(
            num_perturbations, tf.cast(seq_length, tf.int32)) + 1

        # [padding_multiplier * seq_length]. Original tokens, tiled.
        padding_tokens = tf.tile(
            data_batch[i_sample, :seq_length],
            multiples=[padding_multiplier])
        padded_tokens = tf.concat(
            [synonym_tokens, tf.cast(padding_tokens, dtype=tf.int64)],
            axis=0)

        # [padding_multiplier * seq_length]. Matching positions, tiled.
        padding_positions = tf.tile(
            tf.range(seq_length, delta=1), multiples=[padding_multiplier])
        padded_positions = tf.concat(
            [synonym_positions, tf.cast(padding_positions, dtype=tf.int64)],
            axis=0)

        # Crop at exact num_perturbations size and collect for tf.stack.
        positions_per_example.append(padded_positions[:num_perturbations])
        tokens_per_example.append(padded_tokens[:num_perturbations])

    # Both [batch_size x max_n_perturbations]
    perturbation_positions = tf.stack(positions_per_example, axis=0)
    perturbation_tokens = tf.stack(tokens_per_example, axis=0)

    # Explicitly setting the shape to self.config['num_perturbations']
    for stacked in (perturbation_positions, perturbation_tokens):
        static_shape = stacked.shape.as_list()
        static_shape[1] = num_perturbations
        stacked.set_shape(static_shape)

    return Perturbation(positions=perturbation_positions,
                        tokens=perturbation_tokens)