def DistanceBetweenCentroidsAndBBoxesFastAndFurious(centroids, bboxes, masks):
  """Returns a per-entry smooth-L1 distance between centroids and bboxes.

  The distance/loss loosely follows the 'Fast and Furious' paper by Luo et
  al., CVPR'18. It is only one of several possible distance definitions.

  Args:
    centroids: [..., 4]. x/y/w/h for bboxes.
    bboxes: [..., 4]. ymin/xmin/ymax/xmax for bboxes.
    masks: [...]. masks[i] == 1 means the i-th entry (centroids[i] and
      bboxes[i]) should be considered in the distance/loss calculation.

  Returns:
    A [...] tensor. The i-th value is the distance measure of centroids[i] and
    bboxes[i].
  """

  def _FloorAtEpsilon(t):
    # Keeps divisors and log arguments at or above a tiny positive value.
    return tf.maximum(tf.constant(1e-8, t.dtype), t)

  pred_x, pred_y, pred_w, pred_h = tf.unstack(centroids, axis=-1, num=4)
  # The "gt" prefix means 'ground truth'.
  gt_x, gt_y, gt_w, gt_h = tf.unstack(BBoxesToXYWH(bboxes), axis=-1, num=4)

  # Every term below is zero wherever masks[i] is 0.
  # Center offsets, normalized by the ground-truth extent.
  offset_x = py_utils.CheckNumerics(
      masks * (pred_x - gt_x) / _FloorAtEpsilon(gt_w))
  offset_y = py_utils.CheckNumerics(
      masks * (pred_y - gt_y) / _FloorAtEpsilon(gt_h))
  # Log-ratio of predicted size to ground-truth size.
  scale_w = py_utils.CheckNumerics(
      masks * tf.math.log(_FloorAtEpsilon(pred_w) / _FloorAtEpsilon(gt_w)))
  scale_h = py_utils.CheckNumerics(
      masks * tf.math.log(_FloorAtEpsilon(pred_h) / _FloorAtEpsilon(gt_h)))

  distance = _SmoothL1Norm(offset_x)
  distance += _SmoothL1Norm(offset_y)
  distance += _SmoothL1Norm(scale_w)
  distance += _SmoothL1Norm(scale_h)
  return distance
def FProp(self, theta, inputs, paddings=None):
  """Applies group normalization to `inputs`.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers.
    inputs: The inputs tensor with shape [batch_size, height, width, channel].
    paddings: The paddings tensor with shape [batch_size, height]. Intended to
      be used for sequence processing where `height` is `time`.

  Returns:
    The normalized tensor, shaped like `inputs`, when `paddings` is None.
    Otherwise an (output, paddings) pair, with `paddings` passed through
    unchanged.
  """
  p = self.params
  batch, height, width, channels = tf.unstack(
      tf.shape(inputs), axis=0, num=4)

  # The minimum group size is capped by the layer dimension. When the
  # configured grouping yields groups no larger than that minimum, use the
  # minimum as the group size and derive the group count from it.
  smallest_group = p.dim if p.dim <= p.min_group_size else p.min_group_size
  num_groups = p.num_groups
  group_size = p.dim // num_groups
  if group_size <= smallest_group:
    group_size = smallest_group
    num_groups = p.dim // group_size

  with tf.name_scope(p.name):
    x = tf.reshape(inputs, [batch, height, width, num_groups, group_size])
    # Statistics are taken over height, width and the within-group axis.
    reduce_axes = [1, 2, 4]
    if paddings is not None:
      expanded_paddings = tf.reshape(paddings, [batch, height, 1, 1, 1])
      norm_mean, norm_variance = ComputeMomentsWithPadding(
          x, expanded_paddings, reduce_axes, keepdims=True)
    else:
      counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
          x, axes=reduce_axes, keepdims=True)
      norm_mean, norm_variance = tf.nn.normalize_moments(
          counts, means_ss, variance_ss, None)

    norm_mean = py_utils.CheckNumerics(
        norm_mean, 'mean of %s failed numeric check' % p.name)
    norm_variance = py_utils.CheckNumerics(
        norm_variance, 'variance of %s failed numeric check' % p.name)

    beta = theta.beta
    gamma = theta.gamma

    # One mean/variance per (example, group).
    stats_shape = [batch, 1, 1, num_groups, 1]
    sanity_checks = [
        py_utils.assert_greater_equal(
            norm_variance, tf.cast(0., norm_variance.dtype)),
        py_utils.assert_shape_match(stats_shape, tf.shape(norm_mean)),
        py_utils.assert_shape_match(stats_shape, tf.shape(norm_variance)),
    ]
    with tf.control_dependencies(sanity_checks):
      x = (x - norm_mean) / tf.sqrt(norm_variance + self._epsilon)
      # Collapse the (group, group_size) axes back into the channel axis.
      x = tf.reshape(x, [batch, height, width, channels])
      gn_output = x * gamma + beta
      gn_output = tf.reshape(gn_output, [batch, height, width, channels])

    if paddings is not None:
      return gn_output, paddings
    return gn_output
def _testTransparentInputs(self,
                           num_layers=6,
                           dtype=tf.float32,
                           is_eval_mode=False):
  """Builds decoder test inputs whose encodings carry a per-layer axis.

  Args:
    num_layers: Size of the trailing (layer) axis of the random encodings.
    dtype: Dtype passed to `self._Inputs` and used for the encodings.
    is_eval_mode: If False, the encodings are split into a list with one
      tensor per layer. If True, they stay a single rank-4 tensor.

  Returns:
    An (encoder_outputs, tgts, num_hyps) tuple from `self._Inputs`, with
    `encoder_outputs.encoded` replaced by the random encodings.
  """
  # Call _Inputs before drawing random numbers so the call order is stable.
  encoder_outputs, tgts, num_hyps = self._Inputs(dtype)

  src_time, src_batch, emb_dims = 5, 4, 4
  random_values = np.random.normal(
      size=[src_time, src_batch, emb_dims, num_layers])
  src_enc = tf.constant(random_values, dtype=dtype)
  if not is_eval_mode:
    # One [src_time, src_batch, emb_dims] tensor per layer.
    src_enc = tf.unstack(src_enc, axis=3)

  encoder_outputs.encoded = src_enc
  return (encoder_outputs, tgts, num_hyps)
def _check_paddings(self, paddings):
  """Attaches runtime sanity checks to `paddings`.

  Two conditions are asserted: `paddings` contains at least one zero, and
  every slice along its first axis is non-decreasing.

  Args:
    paddings: The paddings tensor. Its first dimension must be statically
      known, because it is unstacked into per-slice tensors.

  Returns:
    `paddings` with the assertions attached as dependencies.
  """
  with tf.name_scope('check_paddings'):
    slice_checks = [
        tf.is_non_decreasing(one_slice) for one_slice in tf.unstack(paddings)
    ]
    all_non_decr = tf.stack(slice_checks)
    has_zero = tf.reduce_any(tf.equal(paddings, 0.0))
    assertions = [
        tf.assert_equal(
            has_zero, True, message='must have at least one zero value.'),
        tf.assert_equal(
            all_non_decr, True, message='must be non-decreasing'),
    ]
    paddings = py_utils.with_dependencies(assertions, paddings)
  return paddings
def TopKAccuracy(k, logits, labels, weights):
  """Computes the weighted top-k accuracy.

  Args:
    k: An int scalar. Top-k.
    logits: A [N, C] float tensor.
    labels: A [N] int vector.
    weights: A [N] float vector.

  Returns:
    A float scalar: the weighted fraction of examples whose label is among the
    k highest-scoring classes.
  """
  logits = py_utils.HasRank(logits, 2)
  num_examples, _ = tf.unstack(tf.shape(logits), num=2)
  labels = py_utils.HasShape(labels, [num_examples])
  weights = py_utils.HasShape(weights, [num_examples])

  is_hit = tf.nn.in_top_k(targets=labels, predictions=logits, k=k)
  weighted_hits = tf.reduce_sum(tf.cast(is_hit, weights.dtype) * weights)
  # The floor avoids dividing by zero when all weights are zero.
  total_weight = tf.maximum(1e-8, tf.reduce_sum(weights))
  return weighted_hits / total_weight
def ComputeXentOutput(softmax_layer,
                      softmax_theta,
                      activations,
                      labels,
                      num_samples=1):
  """Computes the softmax cross-entropy output, or logits without labels.

  Args:
    softmax_layer: The softmax layer; its `Logits` and `FProp` are called.
    softmax_theta: The weights passed to `softmax_layer` as `theta`.
    activations: A rank-3 tensor. Its first two dimensions are read as
      (seqlen, batch).
    labels: None, or a map containing `class_weights` and either `class_ids`
      or `class_probabilities`.
    num_samples: If greater than 1, the label tensors are tiled this many
      times along the batch axis before being handed to the softmax layer.

  Returns:
    If `labels` is None, a `NestedMap` whose only field `logits` has shape
    [seqlen, batch, -1]. Otherwise whatever `softmax_layer.FProp` returns.
  """
  seqlen, batch, _ = tf.unstack(tf.shape(activations), num=3)

  if labels is None:
    # We can only compute the logits here. `seqlen` and `batch` come from
    # `activations` itself, so the flattened matrix must have exactly
    # seqlen * batch rows. Any other row count (e.g. multiplying by
    # num_samples) would split each feature vector across several rows.
    logits = softmax_layer.Logits(
        theta=softmax_theta,
        inputs=tf.reshape(activations, [seqlen * batch, -1]))
    return py_utils.NestedMap(
        logits=tf.reshape(logits, [seqlen, batch, -1]))

  tile_labels = num_samples > 1
  class_weights = labels.class_weights
  if tile_labels:
    class_weights = tf.tile(class_weights, [1, num_samples])

  if 'class_ids' in labels:
    # labels.class_ids: [len, batch]
    class_ids = labels.class_ids
    if tile_labels:
      class_ids = tf.tile(class_ids, [1, num_samples])
    return softmax_layer.FProp(
        theta=softmax_theta,
        inputs=activations,
        class_weights=class_weights,
        class_ids=class_ids)

  assert 'class_probabilities' in labels
  class_probabilities = labels.class_probabilities
  if tile_labels:
    # class_probabilities is documented elsewhere in this file as
    # [time, batch, vocab_size], and tf.tile needs one multiple per
    # dimension. Tile the batch axis only and leave the vocab axis alone.
    class_probabilities = tf.tile(class_probabilities, [1, num_samples, 1])
  return softmax_layer.FProp(
      theta=softmax_theta,
      inputs=activations,
      class_weights=class_weights,
      class_probabilities=class_probabilities)
def ResidualsToBBoxes(self,
                      anchor_bboxes,
                      residuals,
                      min_angle_rad=-np.pi,
                      max_angle_rad=np.pi):
  r"""Converts anchor_boxes and residuals to predicted bboxes.

  This converts predicted residuals into bboxes using the following
  formulae::

    x_predicted = x_a + x_residual * diagonal_xy
    y_predicted = y_a + y_residual * diagonal_xy
    z_predicted = z_a + z_residual * dz_a

    dx_predicted = dx_a * exp(dx_residual)
    dy_predicted = dy_a * exp(dy_residual)
    dz_predicted = dz_a * exp(dz_residual)

    # Adding the residual, and bounding it between
    # [min_angle_rad, max_angle_rad]
    phi_predicted = WrapAngleRad(phi_a + phi_residual,
                                 min_angle_rad,
                                 max_angle_rad)

  These equations follow from those in LocalizationResiduals, where we solve
  for the \*_gt variables.

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    residuals: tf.float32 of the same shape as anchor_bboxes containing
      predicted residuals at each anchor location.
    min_angle_rad: Scalar with the minimum angle allowed (before wrapping) in
      radians.
    max_angle_rad: Scalar with the maximum angle allowed (before wrapping) in
      radians. This value usually should be pi.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with predicted
    bboxes.
  """
  bbox_shape = py_utils.GetShape(anchor_bboxes)
  anchor_bboxes = py_utils.with_dependencies(
      [py_utils.assert_equal(bbox_shape[-1], 7)], anchor_bboxes)
  residuals = py_utils.HasShape(residuals, bbox_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  x_res, y_res, z_res, dx_res, dy_res, dz_res, phi_res = tf.unstack(
      residuals, num=7, axis=-1)

  # Center: x/y offsets are scaled by the anchor's x-y diagonal, z by its
  # height.
  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))
  x_predicted = x_a + x_res * diagonal_xy
  y_predicted = y_a + y_res * diagonal_xy
  z_predicted = z_a + z_res * dz_a

  # Dimensions: residuals are log-ratios relative to the anchor.
  dx_predicted = dx_a * tf.exp(dx_res)
  dy_predicted = dy_a * tf.exp(dy_res)
  dz_predicted = dz_a * tf.exp(dz_res)

  # We bound the angle between [min_angle_rad, max_angle_rad], which should
  # be passed in depending on the heading handling in the calling model.
  # If the model uses a sine(delta_phi) transformation in the loss, then it
  # cannot distinguish direction and a [0, np.pi]
  # [min_angle_rad, max_angle_rad] should be used.
  # If there is a heading encoding that is directional, most likely you
  # should use a [-np.pi, np.pi] [min_angle_rad, max_angle_rad].
  phi_predicted = geometry.WrapAngleRad(phi_a + phi_res, min_angle_rad,
                                        max_angle_rad)

  predicted = [
      x_predicted, y_predicted, z_predicted,
      dx_predicted, dy_predicted, dz_predicted,
      phi_predicted,
  ]  # pyformat: disable
  return tf.stack(predicted, axis=-1)
def LocalizationResiduals(self, anchor_bboxes, assigned_gt_bboxes):
  """Computes the anchor residuals for every bbox.

  For a given bbox, compute residuals in the following way:

    Let ``anchor_bbox = (x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a)``
    and ``assigned_gt_bbox = (x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt)``

    Define ``diagonal_xy = sqrt(dx_a^2 + dy_a^2)``

    Then the corresponding residuals are given by::

      x_residual = (x_gt - x_a) / (diagonal_xy)
      y_residual = (y_gt - y_a) / (diagonal_xy)
      z_residual = (z_gt - z_a) / (dz_a)

      dx_residual = log(dx_gt / dx_a)
      dy_residual = log(dy_gt / dy_a)
      dz_residual = log(dz_gt / dz_a)

      phi_residual = phi_gt - phi_a

    The normalization for x and y residuals by the diagonal was first
    proposed by [1]. Intuitively, this reflects that objects can usually
    move freely in the x-y plane, including diagonally. On the other hand,
    moving in the z-axis (up and down) can be considered orthogonal to x-y.

    For phi_residual, one way to frame the loss is with
    SmoothL1(sine(phi_residual - phi_predicted)).
    The use of sine to wrap the phi residual was proposed by [2]. This
    stems from the observation that bboxes at phi and phi + pi are the same
    bbox, fully overlapping in 3D space, except that the direction is
    different. Note that the use of sine makes this residual invariant to
    direction when a symmetric loss like SmoothL1 is used. In
    ResidualsToBBoxes, we ensure that the phi predicted is between [0, pi).

  The Huber (SmoothL1) loss can then be applied to the delta between these
  target residuals and the model predicted residuals.

  [1] VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection
      https://arxiv.org/abs/1711.06396

  [2] SECOND: Sparsely Embedded Convolutional Detection
      https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    assigned_gt_bboxes: tf.float32 of the same shape as anchor_bboxes
      containing the corresponding assigned ground-truth bboxes.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with target
    residuals for every corresponding bbox.
  """
  anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
  anchor_bboxes = py_utils.with_dependencies(
      [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
  assigned_gt_bboxes = py_utils.HasShape(assigned_gt_bboxes,
                                         anchor_bboxes_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt = tf.unstack(
      assigned_gt_bboxes, num=7, axis=-1)

  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

  # The anchor dimensions is usually a hard-coded param given to the input
  # generator and should not be 0. We use CheckNumerics to ensure that is the
  # case.
  x_residual = py_utils.CheckNumerics((x_gt - x_a) / diagonal_xy)
  y_residual = py_utils.CheckNumerics((y_gt - y_a) / diagonal_xy)
  z_residual = py_utils.CheckNumerics((z_gt - z_a) / dz_a)

  # tf.math.log is the namespaced spelling already used elsewhere in this
  # file. The bare tf.log alias only exists in the TF1 API.
  dx_residual = py_utils.CheckNumerics(tf.math.log(dx_gt / dx_a))
  dy_residual = py_utils.CheckNumerics(tf.math.log(dy_gt / dy_a))
  dz_residual = py_utils.CheckNumerics(tf.math.log(dz_gt / dz_a))

  phi_residual = phi_gt - phi_a

  return tf.stack([
      x_residual, y_residual, z_residual,
      dx_residual, dy_residual, dz_residual,
      phi_residual,
  ], axis=-1)  # pyformat: disable
def _Extract(self, features):
  """Extracts per-object label tensors, padded to `p.max_num_objects`.

  Args:
    features: A map of parsed features. The keys read here are
      'image/source_id', 'object/image/bbox/{xmin,xmax,ymin,ymax}',
      'object/velo/bbox/{dim_xyz,xyz,phi}', 'object/has_3d_info',
      'object/occlusion', 'object/truncation', 'object/label' and
      'transform/velo_to_image_plane'.

  Returns:
    A `NestedMap` of label tensors. Per-object tensors are padded or trimmed
    so their leading dimension is `p.max_num_objects`.
  """
  p = self.params
  max_objects = p.max_num_objects

  source_id = py_utils.HasShape(features['image/source_id'], [])

  xmin = _Dense(features['object/image/bbox/xmin'])
  xmax = _Dense(features['object/image/bbox/xmax'])
  ymin = _Dense(features['object/image/bbox/ymin'])
  ymax = _Dense(features['object/image/bbox/ymax'])

  # 2d bounding box in image coordinates.
  bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=1)
  bboxes_count = tf.shape(bboxes)[0]
  bboxes = py_utils.PadOrTrimTo(bboxes, [max_objects, 4])
  # Padding is 0 for the first bboxes_count slots and 1 for the rest.
  real_box_indicator = py_utils.PadOrTrimTo(
      tf.ones([bboxes_count]), [max_objects])
  bboxes_padding = 1.0 - real_box_indicator

  dim_xyz = tf.reshape(_Dense(features['object/velo/bbox/dim_xyz']), [-1, 3])
  loc_xyz = tf.reshape(_Dense(features['object/velo/bbox/xyz']), [-1, 3])
  phi = tf.reshape(_Dense(features['object/velo/bbox/phi']), [-1, 1])
  # bboxes_3d is in [x, y, z, dx, dy, dz, phi].
  bboxes_3d = tf.concat([loc_xyz, dim_xyz, phi], axis=1)

  # Top-down boxes: each 3d box reduced to its x-y extent, with z and
  # heading ignored.
  cx, cy, _, dx, dy, _, _ = tf.unstack(bboxes_3d, num=7, axis=-1)
  bboxes_td = tf.stack([
      cy - dy / 2,
      cx - dx / 2,
      cy + dy / 2,
      cx + dx / 2,
  ], axis=-1)  # pyformat: disable
  bboxes_td = py_utils.PadOrTrimTo(bboxes_td, [max_objects, 4])

  has_3d_info = tf.cast(_Dense(features['object/has_3d_info']), tf.float32)
  bboxes_3d_mask = py_utils.PadOrTrimTo(has_3d_info, [max_objects])
  bboxes_td_mask = bboxes_3d_mask

  # Fill in difficulties from bounding box height, truncation and occlusion.
  bb_height = ymax - ymin
  box_image_height = py_utils.PadOrTrimTo(bb_height, [max_objects])
  box_image_height = box_image_height * bboxes_3d_mask

  # Occlusion level, 0 to 3: 0 means fully visible, 1 means partly occluded.
  occlusion = tf.reshape(_Dense(features['object/occlusion']), [-1])
  occlusion = tf.cast(occlusion, tf.float32)
  occlusion = py_utils.PadOrTrimTo(occlusion, [max_objects])
  occlusion = occlusion * bboxes_3d_mask

  # Truncation: 0 -> not truncated, 1.0 -> truncated
  truncation = tf.reshape(_Dense(features['object/truncation']), [-1])
  truncation = py_utils.PadOrTrimTo(truncation, [max_objects])
  truncation = truncation * bboxes_3d_mask

  difficulties = ComputeKITTIDifficulties(box_image_height, occlusion,
                                          truncation)
  difficulties = py_utils.PadOrTrimTo(difficulties, [max_objects])

  # Make a batch axis to call BBoxCorners, and take the first result back.
  bbox3d_corners = geometry.BBoxCorners(bboxes_3d[tf.newaxis, ...])[0]

  # Project the 3D bbox to the image plane.
  velo_to_image_plane = features['transform/velo_to_image_plane']
  corners_on_image = geometry.PointsToImagePlane(
      tf.reshape(bbox3d_corners, [-1, 3]), velo_to_image_plane)
  # Output is [num_objects, 8 corners per object, (x, y)].
  corners_on_image = tf.reshape(corners_on_image, [-1, 8, 2])
  bboxes3d_proj_to_image_plane = py_utils.PadOrTrimTo(
      corners_on_image, [max_objects, 8, 2])

  texts = features['object/label'].values
  labels = ops.static_map_string_int(x=texts, keys=self.KITTI_CLASS_NAMES)
  labels = py_utils.PadOrTrimTo(labels, [max_objects])
  texts = py_utils.PadOrTrimTo(texts, [max_objects])

  # Filter labels by setting bboxes_padding, bboxes_3d_mask, and
  # bboxes_td_mask appropriately.
  if p.filter_labels is None:
    filtered_bboxes_3d_mask = bboxes_3d_mask
  else:
    valid_labels = tf.constant([p.filter_labels])
    # 1.0 where the object's label is one of the labels to keep.
    label_matches = tf.equal(tf.expand_dims(labels, 1), valid_labels)
    bbox_mask = tf.cast(tf.reduce_any(label_matches, axis=1), tf.float32)
    # A slot stays unpadded only if it was a real box and its label is kept.
    bboxes_padding = 1 - bbox_mask * (1 - bboxes_padding)
    filtered_bboxes_3d_mask = bboxes_3d_mask * bbox_mask
    bboxes_td_mask = bboxes_td_mask * bbox_mask

  # Placeholder for counting the number of laser points that reside within
  # each 3-d bounding box. This must be filled in outside of this function
  # based on the loaded 3-d laser points.
  bboxes_3d_num_points = tf.zeros([max_objects], dtype=tf.int32)
  bboxes_3d_num_points = py_utils.PadOrTrimTo(bboxes_3d_num_points,
                                              [max_objects])

  # Pad bboxes_3d.
  bboxes_3d = py_utils.PadOrTrimTo(bboxes_3d, [max_objects, 7])

  return py_utils.NestedMap(
      source_id=source_id,
      bboxes_count=bboxes_count,
      bboxes=bboxes,
      bboxes_padding=bboxes_padding,
      bboxes_3d=bboxes_3d,
      bboxes_3d_mask=filtered_bboxes_3d_mask,
      unfiltered_bboxes_3d_mask=bboxes_3d_mask,
      bboxes3d_proj_to_image_plane=bboxes3d_proj_to_image_plane,
      bboxes_td=bboxes_td,
      bboxes_td_mask=bboxes_td_mask,
      bboxes_3d_num_points=bboxes_3d_num_points,
      labels=labels,
      texts=texts,
      box_image_height=box_image_height,
      occlusion=occlusion,
      truncation=truncation,
      difficulties=difficulties)
def FProp(self, theta, inputs, paddings, state0=None, labels=None):
  """Computes xent loss given the language model input activations.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: Input activation. A tensor of shape [time, batch, model_dim].
    paddings: A 0/1 tensor of shape [time, batch].
    state0: Not used for Transformer.
    labels: If not None, a `.NestedMap` containing the following fields:

      - class_weights, a tensor with shape [time, batch] containing the
        weights for each target word.
      - class_ids, a tensor with shape [time, batch] of int32 dtype containing
        the target class labels.
      - class_probabilities, a tensor with shape [time, batch, vocab_size] of
        float values indicating class-membership probabilities.

  Returns:
    An (xent_output, None) pair. With labels, `xent_output` is the value
    returned by the softmax layer's FProp. Without labels it holds only the
    softmax logits. In both cases `xent_output.last_hidden` is set to the
    output of the last transformer layer.
  """
  p = self.params
  inputs = py_utils.HasRank(inputs, 3)
  seqlen, batch, _ = tf.unstack(tf.shape(inputs), num=3)
  inputs = py_utils.HasShape(inputs, [seqlen, batch, p.model_dim])
  paddings = py_utils.HasShape(paddings, [seqlen, batch])

  # [time, model_dim] -> [time, 1, model_dim] so it broadcasts over batch.
  position_embs = self.position_emb.FProp(theta.position_emb, seqlen)
  position_embs = tf.expand_dims(position_embs, 1)
  # [time, batch, model_dim]
  hidden = self.input_dropout.FProp(theta.input_dropout,
                                    inputs + position_embs)

  # Run the transformer stack; every layer maps
  # [time, batch, model_dim] -> [time, batch, model_dim].
  for layer, layer_theta in zip(self.trans, theta.trans):
    layer_out, _ = layer.FProp(layer_theta, hidden, paddings)
    hidden = layer_out

  if labels is None:
    # We can only compute the logits here.
    flat_hidden = tf.reshape(layer_out, [seqlen * batch, -1])
    logits = self.softmax.Logits(theta=theta.softmax, inputs=flat_hidden)
    xent_output = py_utils.NestedMap(
        logits=tf.reshape(logits, [seqlen, batch, -1]))
  elif 'class_ids' in labels:
    xent_output = self.softmax.FProp(
        theta=theta.softmax,
        inputs=layer_out,
        class_weights=labels.class_weights,
        class_ids=labels.class_ids)
  else:
    assert 'class_probabilities' in labels
    xent_output = self.softmax.FProp(
        theta=theta.softmax,
        inputs=layer_out,
        class_weights=labels.class_weights,
        class_probabilities=labels.class_probabilities)

  xent_output.last_hidden = layer_out
  return xent_output, None
def FProp(self, theta, inputs, paddings, state0, labels=None):
  """Forward compute for the mixture-of-experts language model.

  The inputs are embedded and fed to `1 + p.number_of_experts` RNNs. RNN 0
  drives a softmax gate over the remaining (expert) RNNs. The gated mixture of
  expert outputs then goes either through `self.merge` or directly into the
  output softmax.

  Args:
    theta: Weights of this layer and its children.
    inputs: A rank-2 tensor of ids. Its two dimensions are read as
      (seqlen, batch).
    paddings: A tensor with the same shape as `inputs`.
    state0: Initial recurrent state with an `rnns` list and, when
      `p.add_postgating_rnn` is set, a `merge` entry. Must be truthy.
    labels: A map with `class_weights` and `class_ids`. It is dereferenced
      directly when `p.add_postgating_rnn` is False, so it must not be None in
      that configuration.

  Returns:
    An (xent_loss, state1) pair, where `state1` holds the updated state of
    every RNN (and of `merge` when it is used).
  """
  p = self.params
  ids = py_utils.HasRank(inputs, 2)
  paddings = py_utils.HasShape(paddings, tf.shape(ids))
  seqlen, batch = tf.unstack(tf.shape(inputs), num=2)
  assert state0

  paddings_3d = tf.expand_dims(paddings, axis=2)

  # RNNs
  if p.shared_emb:
    # One lookup, shared by the gate RNN and every expert RNN.
    shared_act = self.emb.EmbLookup(theta.emb, inputs)
    emb_act = [shared_act] * (1 + p.number_of_experts)
  else:
    emb_act = [
        self.emb[i].EmbLookup(theta.emb[i], inputs)
        for i in range(1 + p.number_of_experts)
    ]

  state1 = py_utils.NestedMap(rnns=[])
  rnns_act = []
  for i, act in enumerate(emb_act):
    rnn_out, rnn_state = self.rnns[i].FProp(theta.rnns[i], act, paddings_3d,
                                            state0.rnns[i])
    rnns_act.append(py_utils.HasRank(rnn_out, 3))
    state1.rnns.append(rnn_state)

  gate_act, expert_acts = rnns_act[0], rnns_act[1:]

  # [time, batch, experts, dims].
  expert_stacked = tf.stack(expert_acts, axis=2)

  # Compute gating softmax. The 0-th rnns is used as the expert
  # predictor. Because SoftmaxLayer.Logits takes a matrix as input,
  # we reshape its activation to a matrix here.
  gate_logits = self.domain_predictor_softmax.Logits(
      theta.domain_predictor_softmax,
      tf.reshape(gate_act, [seqlen * batch, -1]))
  # [time, batch, experts]
  gating = tf.reshape(tf.nn.softmax(gate_logits), [seqlen, batch, -1])

  # Mix the experts:
  # [time, batch, 1, experts] x [time, batch, experts, dims]
  #   -> [time, batch, 1, dims] -> [time, batch, dims]
  mixed = tf.matmul(tf.expand_dims(gating, axis=2), expert_stacked)
  combined = tf.squeeze(mixed, axis=2)

  if not p.add_postgating_rnn:
    xent_loss = self.output_softmax.FProp(
        theta=theta.output_softmax,
        inputs=combined,
        class_weights=labels.class_weights,
        class_ids=labels.class_ids)
  else:
    # Note that this layer includes 1 or more RNN layers followed
    # by a softmax.
    xent_loss, state1.merge = self.merge.FProp(theta.merge, combined,
                                               paddings, state0.merge, labels)

  return xent_loss, state1
def FProp(self,
          theta,
          inputs,
          paddings,
          state0,
          labels=None,
          direct_features=None):
  """Computes xent loss given the language model input activations.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: input activation. A tensor of shape [time, batch, dims].
    paddings: a 0/1 tensor of shape [time, batch].
    state0: A `.NestedMap` containing the initial recurrent state. Must not be
      None.
    labels: If not None, a `.NestedMap` containing the following fields.

      - class_weights, a tensor with shape [time, batch] containing the
        weights for each target word.
      - class_ids, a tensor with shape [time, batch] of int32 dtype containing
        the target class labels.
      - class_probabilities, a tensor with shape [time, batch, vocab_size] of
        float values indicating class-membership probabilities.
    direct_features: If not None, a tensor of [time, batch,
      direct_feature_dims] that is concatenated to the output of the last RNN
      layer.

  Returns:
    An (xent_output, state1) pair, where `state1` is the next recurrent state.
    With labels, `xent_output` is the value returned by the softmax layer's
    FProp. Without labels it contains the softmax logits, probabilities
    (.probs) and log-probabilities (.log_probs). In both cases
    `xent_output.last_hidden` is the activation fed to the softmax.
  """
  inputs = py_utils.HasRank(inputs, 3)
  seqlen, batch, _ = tf.unstack(tf.shape(inputs), num=3)
  paddings = py_utils.HasShape(paddings, [seqlen, batch])
  assert state0 is not None

  # The RNN stack takes paddings with an explicit trailing axis.
  paddings_3d = tf.expand_dims(paddings, 2)
  activation, state1 = self.rnns.FProp(theta.rnns, inputs, paddings_3d,
                                       state0)

  if direct_features is not None:
    direct_features = py_utils.HasRank(direct_features, 3)
    # Append the extra features along the feature axis.
    activation = tf.concat([activation, direct_features], axis=2)

  if labels is None:
    # We can only compute the logits here.
    flat_activation = tf.reshape(activation, [seqlen * batch, -1])
    logits = self.softmax.Logits(theta=theta.softmax, inputs=flat_activation)
    xent_output = py_utils.NestedMap(
        logits=tf.reshape(logits, [seqlen, batch, -1]))
    xent_output.probs = tf.nn.softmax(xent_output.logits)
    xent_output.log_probs = tf.nn.log_softmax(xent_output.logits)
  elif 'class_ids' in labels:
    xent_output = self.softmax.FProp(
        theta=theta.softmax,
        inputs=activation,
        class_weights=labels.class_weights,
        class_ids=labels.class_ids)
  else:
    assert 'class_probabilities' in labels
    xent_output = self.softmax.FProp(
        theta=theta.softmax,
        inputs=activation,
        class_weights=labels.class_weights,
        class_probabilities=labels.class_probabilities)

  xent_output.last_hidden = activation
  return xent_output, state1
def Logits(self, theta, inputs, paddings, *args, **kwargs):
  """Returns all-zero logits for the whole sequence.

  Args:
    theta: Unused.
    inputs: A tensor of rank at least 2. Its first two dimensions are read as
      (time, batch).
    paddings: Unused.
    *args: Unused; accepted for signature compatibility.
    **kwargs: Unused; accepted for signature compatibility.

  Returns:
    A zero tensor of shape [time, batch, p.vocab_size] and dtype `p.dtype`.
  """
  del theta, paddings, args, kwargs  # Unused.
  p = self.params
  # Pass `num` explicitly: without it tf.unstack has to infer the element
  # count from the static shape of the sliced shape vector. That is unknown
  # when the rank of `inputs` is not statically known.
  seqlen, batch = tf.unstack(tf.shape(inputs)[:2], num=2)
  return tf.zeros([seqlen, batch, p.vocab_size], dtype=p.dtype)
def _Preprocess(self, raw):
  """Standardizes every image in `raw` independently.

  Args:
    raw: A batch of images. The leading (batch) dimension must be statically
      known, because the batch is unstacked into individual images.

  Returns:
    A tensor with the same static shape as `raw`, in which each image has been
    passed through `tf.image.per_image_standardization`.
  """
  images = tf.unstack(raw)
  standardized = [
      tf.image.per_image_standardization(image) for image in images
  ]
  data = tf.stack(standardized)
  # Re-attach the static shape of the input to the re-stacked result.
  data.set_shape(raw.shape)
  return data
def ResidualsToBBoxes(self, anchor_bboxes, residuals):
  r"""Converts anchor_boxes and residuals to predicted bboxes.

  This converts predicted residuals into bboxes using the following formulae:

    x_predicted = x_a + x_residual \* diagonal_xy
    y_predicted = y_a + y_residual \* diagonal_xy
    z_predicted = z_a + z_residual \* dz_a

    dx_predicted = dx_a \* exp(dx_residual)
    dy_predicted = dy_a \* exp(dy_residual)
    dz_predicted = dz_a \* exp(dz_residual)

    phi_predicted = floormod(phi_a + phi_residual, pi)

  These equations follow from those in LocalizationResiduals, where we solve
  for the \*_gt variables.

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    residuals: tf.float32 of the same shape as anchor_bboxes containing
      predicted residuals at each anchor location.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with predicted
    bboxes.
  """
  bbox_shape = py_utils.GetShape(anchor_bboxes)
  anchor_bboxes = py_utils.with_dependencies(
      [py_utils.assert_equal(bbox_shape[-1], 7)], anchor_bboxes)
  residuals = py_utils.HasShape(residuals, bbox_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  x_res, y_res, z_res, dx_res, dy_res, dz_res, phi_res = tf.unstack(
      residuals, num=7, axis=-1)

  # Center: x/y offsets are scaled by the anchor's x-y diagonal, z by its
  # height.
  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))
  x_predicted = x_a + x_res * diagonal_xy
  y_predicted = y_a + y_res * diagonal_xy
  z_predicted = z_a + z_res * dz_a

  # Dimensions: residuals are log-ratios relative to the anchor.
  dx_predicted = dx_a * tf.exp(dx_res)
  dy_predicted = dy_a * tf.exp(dy_res)
  dz_predicted = dz_a * tf.exp(dz_res)

  # Assuming a sine(delta_phi) transformation is used in the loss, then, it
  # is not possible to distinguish direction, hence, we use floormod here to
  # ensure that the predicted_phi is always in [0, np.pi) for consistency.
  # A separate direction classifier should be added the model if needed.
  phi_predicted = tf.floormod(phi_a + phi_res, np.pi)

  predicted = [
      x_predicted, y_predicted, z_predicted,
      dx_predicted, dy_predicted, dz_predicted,
      phi_predicted,
  ]  # pyformat: disable
  return tf.stack(predicted, axis=-1)