def chunked_causal_numerator_func(qs, ks, vs):
  """Computes the un-normalized FAVOR causal attention numerator in chunks.

  The sequence axis is walked `_ITER_CHUNK_SIZE` steps at a time so that only
  one chunk of per-step key/value outer products is materialized at once.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].
    vs: value tensor of the shape [L,B,H,D].

  Returns:
    A pair (result, sums). `result` is the not-normalized FAVOR causal
    attention A_{masked}V; `sums` is the prefix-sum state after the last step.
  """
  seq_len = qs.shape[0]
  # Running sum of key/value outer products, carried from chunk to chunk.
  sums = tf.zeros_like(ks[0])[..., None] * tf.zeros_like(vs[0])[..., None, :]
  pieces = []
  for lo in range(0, seq_len, _ITER_CHUNK_SIZE):
    hi = min(seq_len, lo + _ITER_CHUNK_SIZE)
    outer = tf.einsum("sijk,sijl->sijkl", ks[lo:hi], vs[lo:hi])
    # Prefix sums inside the chunk, offset by everything seen before it.
    prefix = sums[None, ...] + tf.math.cumsum(outer, axis=0)
    sums = prefix[-1]
    pieces.append(tf.einsum("sijkl,sijk->sijl", prefix, qs[lo:hi]))

  return tf.concat(pieces, axis=0), sums
def _Moments(inputs, mask, enable_cross_replica_sum_on_tpu=False):
  """Returns the masked mean and variance of `inputs`.

  Every dimension except the last is reduced. Positions where `mask` is zero
  contribute neither to the sums nor to the element count.

  Args:
    inputs: Tensor of values. Must have the same rank as `mask`.
    mask: Non-negative tensor that is multiplied (broadcast) against `inputs`.
    enable_cross_replica_sum_on_tpu: If True and `py_utils.use_tpu()` holds,
      sums and counts are aggregated with `tf.tpu.cross_replica_sum`.

  Returns:
    A (mean, variance) pair.
  """
  validations = [
      py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
      py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
  ]
  inputs = py_utils.with_dependencies(validations, inputs)
  reduce_over_dims = tf.range(0, tf.rank(mask) - 1)

  masked_inputs = inputs * tf.cast(mask, inputs.dtype)
  sum_v = tf.reduce_sum(masked_inputs, reduce_over_dims)
  count_v = tf.reduce_sum(mask, reduce_over_dims)
  # The mask may be smaller than the inputs along broadcast dimensions. Because
  # inputs * mask broadcast successfully, every input dimension is a multiple
  # of the matching mask dimension, so the count is scaled by that ratio.
  mask_multiplier = tf.shape(inputs)[:-1] // tf.shape(mask)[:-1]
  count_v *= tf.cast(tf.reduce_prod(mask_multiplier), count_v.dtype)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_v = tf.tpu.cross_replica_sum(sum_v)
    count_v = tf.tpu.cross_replica_sum(count_v)

  # Guards the divisions below when there are no valid elements.
  count_v = tf.maximum(count_v, 1.0)
  mean = sum_v / count_v

  centered = inputs - mean
  sum_vv = tf.reduce_sum(centered * centered * mask, reduce_over_dims)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_vv = tf.tpu.cross_replica_sum(sum_vv)

  non_negative = py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv))
  variance = py_utils.with_dependencies([non_negative], sum_vv / count_v)
  return mean, variance
def Bak(inputs, outputs, d_outputs):
  """Backward step: walks the sub-layers in reverse order.

  Each layer's `ReverseAndGrad` returns its reconstructed inputs together with
  the input and theta gradients; the reconstructed inputs become the outputs
  seen by the preceding layer.

  Args:
    inputs: Unused.
    outputs: A pair (output activations, per-layer step seeds).
    d_outputs: A sequence whose first element is the gradient with respect to
      the output activations.

  Returns:
    The list [seed gradient, d_theta, d_inputs, extra_grads]. The seed
    gradient and every entry of extra_grads are zeros.
  """
  del inputs  # unused
  acts, step_seeds = outputs
  d_acts = d_outputs[0]

  per_layer_d_theta = []
  for idx in reversed(range(num_layers)):
    f_seed, g_seed = step_seeds[idx]
    acts, d_inputs, layer_d_theta = self.sub_layers[idx].ReverseAndGrad(
        theta.sub_layers[idx], acts, d_acts, f_seed, g_seed, *extra_inputs)
    per_layer_d_theta.append(layer_d_theta)
    # The gradient w.r.t. this layer's inputs feeds the previous layer.
    d_acts = d_inputs

  py_utils.ResetStepSeed(final_step_seed)
  d_theta = py_utils.NestedMap(global_step=tf.zeros_like(initial_step_seed))
  # Gradients were collected last-layer-first; restore layer order.
  d_theta.sub_layers = per_layer_d_theta[::-1]

  extra_grads = [tf.zeros_like(t) for t in extra_inputs]
  return [tf.zeros_like(initial_step_seed), d_theta, d_inputs, extra_grads]
def chunked_causal_numerator_grad(qs, ks, vs, sums, res_grad):
  """Backward pass of not-normalized FAVOR causal attention using chunks.

  The sequence is traversed from the last step to the first. The prefix-sum
  state at each step is recovered by subtracting key/value outer products from
  `sums`, so the forward prefix sums do not have to be stored.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].
    vs: value tensor of the shape [L,B,H,D].
    sums: last prefix sum state.
    res_grad: incoming gradient; it is sliced per step along axis 0 and
      contracted against the value dimension (presumably [L,B,H,D], i.e. the
      gradient of the forward result -- confirm against the caller).

  Returns:
    Gradient of qs.
    Gradient of ks.
    Gradient of vs.
  """
  # Running sum of q (x) res_grad outer products, accumulated back to front.
  grads = tf.zeros_like(ks[0])[..., None] * tf.zeros_like(vs[0])[..., None, :]
  gr_sums = sums

  q_grads, k_grads, v_grads = [], [], []

  # Reverse everything once so that plain forward slicing walks back in time.
  res_grad = res_grad[::-1]
  qs_rev = qs[::-1]
  ks_rev = ks[::-1]
  vs_rev = vs[::-1]
  seq_len = qs_rev.shape[0]

  for lo in range(0, seq_len, _ITER_CHUNK_SIZE):
    hi = min(seq_len, lo + _ITER_CHUNK_SIZE)

    # Outer products for all but the last step of the chunk, shifted by one
    # with a leading zero so that step s sees the state *including* step s.
    kv = tf.einsum("sijk,sijl->sijkl", ks_rev[lo:hi - 1], vs_rev[lo:hi - 1])
    kv = tf.concat([tf.zeros_like(gr_sums[None, ...]), kv], axis=0)
    state = gr_sums[None, ...] - tf.math.cumsum(kv, axis=0)
    # Remove the chunk's last step to obtain the state for the next chunk.
    gr_sums = state[-1] - tf.einsum("ijk,ijl->ijkl", ks_rev[hi - 1],
                                    vs_rev[hi - 1])

    q_grads.append(tf.einsum("sijkl,sijl->sijk", state, res_grad[lo:hi]))

    qg = tf.einsum("sijk,sijl->sijkl", qs_rev[lo:hi], res_grad[lo:hi])
    qg = grads[None, ...] + tf.math.cumsum(qg, axis=0)
    grads = qg[-1]

    k_grads.append(tf.einsum("sijkl,sijl->sijk", qg, vs_rev[lo:hi]))
    v_grads.append(tf.einsum("sijkl,sijk->sijl", qg, ks_rev[lo:hi]))

  # Undo the reversal so the gradients line up with the original ordering.
  q_grads = tf.concat(q_grads, axis=0)[::-1]
  k_grads = tf.concat(k_grads, axis=0)[::-1]
  v_grads = tf.concat(v_grads, axis=0)[::-1]

  return q_grads, k_grads, v_grads
def CornerLoss(self, gt_bboxes, predicted_bboxes, symmetric=True):
  """Corner regularization loss.

  An alternative regression loss for box residuals, used in the
  Frustum-PointNets paper [1]. All 8 corners of the predicted and ground
  truth boxes are computed, and a scaled Huber loss is applied to the
  per-corner distances and summed over the corners. This encourages the
  model to maximize the IoU of its predictions.

  [1] Frustum PointNets for 3D Object Detection from RGB-D Data
      https://arxiv.org/pdf/1711.08488.pdf

  Args:
    gt_bboxes: tf.float32 of shape [..., 7] which contains (x, y, z, dx, dy,
      dz, phi), corresponding to ground truth bbox parameters.
    predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
      predicted bbox parameters.
    symmetric: boolean. If True, the result is the minimum of the corner loss
      against the gt box and against the gt box rotated by 180 degrees.

  Returns:
    tf.float32 Tensor of shape [...] where each entry contains the corner loss
    for the corresponding bbox.
  """
  bbox_shape = py_utils.GetShape(gt_bboxes)
  flat_shape = [bbox_shape[0], -1, 7]
  gt_bboxes = tf.reshape(gt_bboxes, flat_shape)
  predicted_bboxes = tf.reshape(predicted_bboxes, flat_shape)

  gt_corners = geometry.BBoxCorners(gt_bboxes)
  predicted_corners = geometry.BBoxCorners(predicted_bboxes)

  def _SummedCornerLoss(target_corners):
    """Huber loss on corner distances, summed over the corner axis."""
    dist = tf.norm(predicted_corners - target_corners, axis=-1)
    per_corner = self.ScaledHuberLoss(
        labels=tf.zeros_like(dist), predictions=dist)
    return tf.reduce_sum(per_corner, axis=-1)

  huber_loss = _SummedCornerLoss(gt_corners)

  if symmetric:
    # A box and its 180-degree rotation occupy the same volume, so take the
    # smaller of the two losses.
    rot = tf.constant([[[0., 0., 0., 0., 0., 0., np.pi]]], dtype=tf.float32)
    rotated_gt_corners = geometry.BBoxCorners(gt_bboxes + rot)
    huber_loss = tf.minimum(huber_loss, _SummedCornerLoss(rotated_gt_corners))

  return tf.reshape(huber_loss, bbox_shape[:-1])
def ComputeMoments(inputs,
                   padding,
                   reduce_over_dims,
                   cumulative_axis=None,
                   enable_cross_replica_sum_on_tpu=False,
                   keepdims=False):
  """Returns the mean and variance of the non-padded entries of `inputs`.

  Args:
    inputs: Tensor of values. Must have the same rank as `padding`.
    padding: Tensor multiplied (broadcast) against `inputs` as `1 - padding`;
      `1 - padding` must be non-negative.
    reduce_over_dims: Dimensions to reduce over.
    cumulative_axis: If not None, sums and counts are additionally accumulated
      with `tf.math.cumsum` along this axis.
    enable_cross_replica_sum_on_tpu: If True and `py_utils.use_tpu()` holds,
      sums and counts are aggregated with `tf.tpu.cross_replica_sum`.
    keepdims: Forwarded to the `tf.reduce_sum` calls.

  Returns:
    A (mean, variance) pair.
  """
  mask = 1.0 - padding
  validations = [
      py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
      py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
  ]
  inputs = py_utils.with_dependencies(validations, inputs)

  def _MaskedSum(values):
    """Sums over the reduced dims, then optionally along cumulative_axis."""
    total = tf.reduce_sum(values, reduce_over_dims, keepdims=keepdims)
    if cumulative_axis is not None:
      total = tf.math.cumsum(total, axis=cumulative_axis)
    return total

  def _SizeOnReducedDims(tensor):
    """Number of elements of `tensor` along the reduced dims."""
    return tf.reduce_prod(tf.gather(tf.shape(tensor), reduce_over_dims))

  sum_v = tf.reduce_sum(
      inputs * tf.cast(mask, inputs.dtype), reduce_over_dims, keepdims=keepdims)
  count_v = tf.reduce_sum(mask, reduce_over_dims, keepdims=keepdims)
  if cumulative_axis is not None:
    sum_v = tf.math.cumsum(sum_v, axis=cumulative_axis)
    count_v = tf.math.cumsum(count_v, axis=cumulative_axis)

  # The mask may be smaller than the inputs along broadcast dimensions. Because
  # inputs * mask broadcast successfully, the input size is a multiple of the
  # mask size, so the count is scaled by that ratio.
  mask_multiplier = tf.math.truediv(
      _SizeOnReducedDims(inputs), _SizeOnReducedDims(mask))
  count_v *= tf.cast(mask_multiplier, count_v.dtype)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_v = tf.tpu.cross_replica_sum(sum_v)
    count_v = tf.tpu.cross_replica_sum(count_v)

  # Guards the divisions below when there are no valid elements.
  count_v = tf.maximum(count_v, 1.0)
  mean = sum_v / count_v

  sum_vv = _MaskedSum((inputs - mean) * (inputs - mean) * mask)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_vv = tf.tpu.cross_replica_sum(sum_vv)

  non_negative = py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv))
  variance = py_utils.with_dependencies([non_negative], sum_vv / count_v)
  return mean, variance
def chunked_causal_denominator_func(qs, ks):
  """Computes the FAVOR causal-attention normalizer (denominator) in chunks.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].

  Returns:
    A pair (result, sums). For every step, `result` holds the sum over the
    last axis of the query times the running sum of keys up to and including
    that step. `sums` is the running key sum after the last step.
  """
  seq_len = qs.shape[0]
  # Running sum of keys, carried from chunk to chunk.
  sums = tf.zeros_like(ks[0])
  pieces = []
  for lo in range(0, seq_len, _ITER_CHUNK_SIZE):
    hi = min(seq_len, lo + _ITER_CHUNK_SIZE)
    prefix = sums[None, ...] + tf.math.cumsum(ks[lo:hi], axis=0)
    sums = prefix[-1]
    pieces.append(tf.reduce_sum(qs[lo:hi] * prefix, axis=3))

  return tf.concat(pieces, axis=0), sums
def _internal_apply_dense(self, grad, var, magnitude_optimizer_apply_fn,
                          direction_optimizer_apply_fn):  # pylint: disable=g-doc-args
  """Main optimization logic of AdaGraft, which calls the child optimizers.

  Args:
    grad: Tensor containing gradients.
    var: Tensor containing parameter values.
    magnitude_optimizer_apply_fn: Apply magnitude optimizer.
    direction_optimizer_apply_fn: Apply direction optimizer.

  Returns:
    The final update op, which increments var by the grafted step. When
    `self.use_global_norm` is set, the returned op instead stores the
    direction step in the 'scratch_copy' slot, with var restored to its
    original value.

  Pseudocode:
    - Copy weights into scratch space 'scratch_copy'.
    - Run magnitude_optimizer in-place.
    - Use scratch copy to figure out how far we moved ('magnitude_step').
    - Copy weights back.
    - Run direction_optimizer in-place.
    - Move weights along the line segment with scratch_copy.
  """

  # Remember which variables take part in the global-norm computation.
  if self.use_global_norm:
    self._variables.append(var)

  # Slot with current parameter values
  scratch_slot = self.get_slot(var, "scratch_copy")
  old_var = tf.assign(scratch_slot, var)

  # The snapshot must be taken before the magnitude optimizer mutates var.
  with tf.control_dependencies([old_var]):
    m_updated_var = magnitude_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

  # Run magnitude optimizer and compute the norm of the update.
  with tf.control_dependencies([m_updated_var]):
    m_step = var - old_var
    m_step_norm = tf.norm(m_step)
    # Persist the norm in a slot when it is needed later (diagnostics or the
    # global-norm computation).
    if self.diagnostic or self.use_global_norm:
      m_step_norm = tf.assign(self.get_slot(var, "m_step_norm"), m_step_norm)

  # Run direction optimizer and compute its norm, and the direction.
  # var is first restored to the snapshot, once the magnitude norm is known.
  with tf.control_dependencies([m_step_norm]):
    flushed_var = tf.assign(var, old_var)
  with tf.control_dependencies([flushed_var]):
    d_updated_var = direction_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

  # Run an update of the direction optimizer with magnitude optimizer norm.
  with tf.control_dependencies([d_updated_var]):
    d_step = var - old_var
    d_step_norm = tf.norm(d_step)
    if self.diagnostic or self.use_global_norm:
      d_step_norm = tf.assign(self.get_slot(var, "d_step_norm"), d_step_norm)
    if self.use_global_norm:
      # Global mode: restore var and stash the direction step in the scratch
      # slot instead of applying a per-variable grafted step here.
      flushed_var = tf.assign(var, old_var)
      with tf.control_dependencies([d_step_norm, flushed_var]):
        return tf.assign(scratch_slot, d_step)
    # Rescale the direction step to the magnitude step's norm; a zero
    # direction step yields a zero update.
    step = tf.where(
        tf.greater(d_step_norm, 0),
        (m_step_norm / tf.maximum(d_step_norm, 1e-30)) * d_step,
        tf.zeros_like(d_step))
    return tf.assign(var, old_var + self._learning_rate_tensor * step)
def _finish(self, update_ops, name_scope):
  """Finishes both child optimizers and, if enabled, applies global grafting.

  With `self.use_global_norm`, the per-variable step norms stored in the
  "m_step_norm" / "d_step_norm" slots are combined into global norms, and
  every variable in `self._variables` is moved along the direction step kept
  in its "scratch_copy" slot, rescaled by the ratio of the global norms.

  Args:
    update_ops: List of ops produced by the per-variable apply calls.
      Everything built here is sequenced after them.
    name_scope: Name given to the returned group op. It is also passed, with
      "_m" / "_d" appended, to the child optimizers' `_finish`.

  Returns:
    An op grouping the final updates.
  """
  with tf.control_dependencies(update_ops):
    ops1 = self.magnitude_optimizer._finish([], name_scope + "_m")  # pylint: disable=protected-access
    ops2 = self.direction_optimizer._finish([], name_scope + "_d")  # pylint: disable=protected-access

    if self.use_global_norm:  # apply global grafting
      with tf.control_dependencies([ops1, ops2]):
        # Accumulate the squared norms as plain tensors. Accumulating through
        # tf.assign_add on fresh Variables leaves ops nothing depends on, so
        # in graph mode they never run and the multiplier sees zeros.
        m_global_norm = 0.
        d_global_norm = 0.
        for var in self._variables:
          m_global_norm += self.get_slot(var, "m_step_norm")**2
          d_global_norm += self.get_slot(var, "d_step_norm")**2

        multiplier = tf.sqrt(m_global_norm / tf.maximum(d_global_norm, 1e-30))

        step_ops = []
        for var in self._variables:
          d_step = self.get_slot(var, "scratch_copy")
          # Use this variable's own norm; do not reuse whatever the previous
          # loop happened to leave behind for the last variable.
          d_step_norm = self.get_slot(var, "d_step_norm")
          step = tf.where(
              tf.greater(d_step_norm, 0), multiplier * d_step,
              tf.zeros_like(d_step))
          step_ops.append(
              tf.assign_add(var, self._learning_rate_tensor * step))
        return tf.group(*step_ops, name=name_scope)

  return tf.group(*([ops1, ops2] + update_ops), name=name_scope)
def _common_gpipe_transformer_encoder_fprop(
    layer, layer_class, theta, source_vecs, source_paddings, target_vecs,
    target_paddings, source_segment_id, target_segment_id, labels,
    label_weights, transparent_acc, transparent_acc_helper):
  """GPipe encoder FProp.

  Runs the parent class' FProp on the source side and passes every other
  argument through unchanged, so the returned tuple can feed the next layer.

  For transparent layers, `transparent_acc` accumulates the layer inputs
  weighted by the leading entry of `transparent_acc_helper`. The helper is
  shortened by one entry per layer. On the final encoder layer the output is
  merged with the accumulator and both are cleared.

  Returns:
    A tuple with the same order as the arguments from `source_vecs` onwards,
    with `source_vecs` replaced by the layer output.
  """
  # NOTE(review): the nesting of the transparent branch was reconstructed from
  # source whose indentation was lost -- confirm against the original file.
  p = layer.params
  h, _ = super(layer_class, layer).FProp(
      theta, source_vecs, source_paddings, source_segment_id=source_segment_id)
  h.set_shape(source_vecs.shape)

  if p.is_transparent:
    if p.transparent_merger_tpl is not None:
      # This layer owns the merger: compute the weights and start a fresh
      # accumulator.
      transparent_acc_helper = layer.transparent_merger.FProp(
          theta.transparent_merger)
      transparent_acc = tf.zeros_like(source_vecs)
    input_weight = transparent_acc_helper[0]
    transparent_acc = transparent_acc + input_weight * source_vecs
    if p.final_enc_layer:
      h = transparent_acc + h * transparent_acc_helper[-1]
      transparent_acc = None
      transparent_acc_helper = None
    else:
      # Drop the weight just consumed.
      transparent_acc_helper = transparent_acc_helper[1:]

  if p.normalize_output:
    h = layer.layer_norm.FProp(theta.layer_norm, h)

  return (h, source_paddings, target_vecs, target_paddings, source_segment_id,
          target_segment_id, labels, label_weights, transparent_acc,
          transparent_acc_helper)
def _GetOutputs(enc, dec):
  """Runs `enc` and then `dec` on the test inputs; returns the decoder vec."""
  x, seg_id, pos_id = self._GetInputs()

  enc_outs = enc.FPropDefaultTheta(
      py_utils.NestedMap(
          vec=x,
          segment_id=seg_id,
          segment_pos=pos_id,
          aux_loss=tf.constant(0.0)))

  # The decoder sees the encoder output with all-zero encoder segment info.
  dec_outs = dec.FPropDefaultTheta(
      py_utils.NestedMap(
          vec=x,
          segment_id=seg_id,
          segment_pos=pos_id,
          encoder_output=enc_outs.vec,
          encoder_segment_id=tf.zeros_like(seg_id),
          encoder_segment_pos=tf.zeros_like(pos_id),
          aux_loss=enc_outs.aux_loss))
  return dec_outs.vec
def _ApplyAndReset():
  """Applies `var_grad` scaled by 1 / p.accum_steps, then zeroes it.

  Returns:
    An op grouping the assignments that reset the second element of every
    flattened `var_grad` pair to zeros. The assignments only run after the
    optimizer's Apply op.
  """
  scaled_var_grad = py_utils.ApplyGradMultiplier(var_grad, 1. / p.accum_steps)
  apply_op = self._opt.Apply(lr, scaled_var_grad)
  with tf.control_dependencies([apply_op]):
    reset_ops = [
        tf.assign(acc, tf.zeros_like(acc)) for _, acc in var_grad.Flatten()
    ]
    return tf.group(*reset_ops)
def _MoeOrFFLayer(self, theta, inputs, paddings):
  """FProp through whichever of the feed-forward or MoE end layer exists.

  Args:
    theta: Layer theta: A NestedMap of Tensors.
    inputs: A Tensor of shape [batch, seqlen, dim0].
    paddings: A Tensor of shape [batch, seqlen].

  Returns:
    out_nmap: A NestedMap of output tensors:
      * features: Tensor of shape [batch, seqlen, dim0].
      * paddings: A Tensor of shape [batch, seqlen].
      * aux_loss: [Optional] Scalar tensor. Only set on the MoE path.
  """
  if 'fflayer_end' in self.children:
    ff_out = self.fflayer_end.FProp(theta.fflayer_end, inputs, paddings)
    return py_utils.NestedMap(features=ff_out, paddings=paddings)

  # MoE path. Segment ids: 0 - padded positions and 1 - non-padded positions.
  segment_ids = tf.cast(1. - paddings, tf.int32)
  # Segment positions are not used but required by MoE.
  segment_pos = tf.zeros_like(segment_ids)
  ys, aux_loss = self.fflayer_end_moe.FProp(theta.fflayer_end_moe, inputs,
                                            segment_ids, segment_pos)
  return py_utils.NestedMap(features=ys, paddings=paddings, aux_loss=aux_loss)
def _ComputeBN(self, inputs, paddings, gamma, beta, norm_mean, norm_variance):
  """Normalizes `inputs` with the given moments, scale and offset.

  Asserts that the variance is non-negative and that both moments match the
  last dimension of `inputs`. The fused kernel is used when
  `p.use_fused_batch_norm_for_eval` is set and the layer is in eval mode or
  has frozen statistics. Padding is applied to the output only when
  `p.set_padded_output_to_zero` is set.

  Returns:
    The batch-normalized tensor.
  """
  p = self.params
  last_dim = [tf.shape(inputs)[-1]]
  checks = [
      py_utils.assert_greater_equal(norm_variance,
                                    tf.zeros_like(norm_variance)),
      py_utils.assert_shape_match(last_dim, tf.shape(norm_mean)),
      py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                  tf.shape(norm_variance)),
  ]
  with tf.control_dependencies(checks):
    use_fused = p.use_fused_batch_norm_for_eval and (self.do_eval or
                                                     p.freeze_bn_stats)
    if use_fused:
      bn_output, _, _ = nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          norm_mean,
          norm_variance,
          self._epsilon,
          is_training=False)
    else:
      bn_output = tf.nn.batch_normalization(inputs, norm_mean, norm_variance,
                                            beta, gamma, self._epsilon)

    if p.set_padded_output_to_zero:
      bn_output = py_utils.ApplyPadding(paddings, bn_output)

  return bn_output
def grad(res_grad):
  """Step-by-step backward pass for the causal numerator.

  Walks the sequence from the last step to the first, recovering each prefix
  state by subtracting that step's key/value outer product from `sums`.

  Args:
    res_grad: incoming gradient, indexed per step along axis 0.

  Returns:
    Gradients with respect to qs, ks and vs, in that order.
  """
  # Running sum of q (x) res_grad outer products, accumulated back to front.
  grads = tf.zeros_like(tf.einsum("ijk,ijl->ijkl", ks[0], vs[0]))
  gr_sums = sums

  q_grads, k_grads, v_grads = [], [], []

  for index in reversed(range(qs.shape[0])):
    step_grad = res_grad[index]
    q_grads.append(
        tf.einsum("ijkl,ijl->ijk", gr_sums, step_grad)[None, ...])
    grads = grads + tf.einsum("ijk,ijl->ijkl", qs[index], step_grad)
    k_grads.append(tf.einsum("ijkl,ijl->ijk", grads, vs[index])[None, ...])
    v_grads.append(tf.einsum("ijkl,ijk->ijl", grads, ks[index])[None, ...])
    # Step back: remove this step's contribution from the prefix state.
    gr_sums = gr_sums - tf.einsum("ijk,ijl->ijkl", ks[index], vs[index])

  # The lists were filled last-step-first; flip them before concatenating.
  q_grads = tf.concat(q_grads[::-1], axis=0)
  k_grads = tf.concat(k_grads[::-1], axis=0)
  v_grads = tf.concat(v_grads[::-1], axis=0)

  return q_grads, k_grads, v_grads
def FProp(self, theta, inputs, paddings=None):
  """Apply batch normalization.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor.  Shaped [..., dim].
    paddings: The paddings tensor.  Shaped [..., 1], with the same rank as the
      input tensor. When None, `self._GetDefaultPaddings(inputs)` is used.

  Returns:
    Output after applying batch normalization, with the same shape as
    'inputs'. The output is multiplied by `1 - paddings`.
  """
  p = self.params
  if paddings is None:
    paddings = self._GetDefaultPaddings(inputs)

  with tf.name_scope(p.name):
    norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments(
        theta, inputs, paddings)

    # The moments must be valid and match the feature dimension.
    checks = [
        py_utils.assert_greater_equal(norm_variance,
                                      tf.zeros_like(norm_variance)),
        py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                    tf.shape(norm_mean)),
        py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                    tf.shape(norm_variance)),
    ]
    with tf.control_dependencies(checks):
      bn_output = tf.nn.batch_normalization(inputs, norm_mean, norm_variance,
                                            beta, gamma, self._epsilon)
      bn_output *= 1.0 - paddings
    return bn_output
def BatchMakeRotationMatrix(yaw, clockwise=False):
  """Builds a batch of 3x3 rotation matrices about the z axis from yaw angles.

  Args:
    yaw: float tensor representing a yaw angle in radians.
    clockwise: Whether to have the rotation be applied clockwise (True) or
      counter-clockwise (False). Defaults to counter-clockwise to maintain
      same semantics to MakeRotationMatrix.

  Returns:
    A [N, 3, 3] tensor corresponding to a rotation matrix.
  """
  # A clockwise rotation is a counter-clockwise rotation by the negated angle.
  if clockwise:
    yaw = -yaw

  cos = tf.cos(yaw)
  sin = tf.sin(yaw)
  zero = tf.zeros_like(cos)
  one = tf.ones_like(cos)

  # Row-major entries of the matrix; the reshape folds them into 3x3.
  entries = [
      cos, -sin, zero,
      sin, cos, zero,
      zero, zero, one,
  ]  # pyformat: disable
  return tf.reshape(tf.stack(entries, axis=-1), [-1, 3, 3])
def _TokenizeOneSentence(i, text, token_ids_ta, target_ids_ta, paddings_ta):
  """Tokenizes sentence `i` and writes its padded ids into the TensorArrays.

  Args:
    i: Index of the sentence to process; a Python int or a tensor.
    text: The collection of sentences, indexed by `i`.
    token_ids_ta: TensorArray receiving the SOS-prefixed ids at index `i`.
    target_ids_ta: TensorArray receiving the target ids at index `i`.
    paddings_ta: TensorArray receiving the paddings at index `i` (0 for real
      tokens, 1 for padded positions).

  Returns:
    (i + 1, text, token_ids_ta, target_ids_ta, paddings_ta), mirroring the
    argument structure so the function can be iterated.
  """
  # Tensor indices need a gather; Python ints can index directly.
  if tf.is_tensor(i):
    text_i = tf.gather(text, i)
  else:
    text_i = text[i]
  ids = self._tokenizer.tokenize(text_i).merge_dims(0, -1)
  ids.set_shape([None])

  if append_eos:
    ids = tf.concat([ids, [self.eos_id]], axis=0)
  sos_ids = tf.concat([[self.sos_id], ids], axis=0)
  if p.prepend_sos:
    ids = sos_ids

  # This truncates after the EOS is added, so some sentences might
  # not have EOS at the end.
  token_ids_ta = token_ids_ta.write(
      i, py_utils.PadOrTrimTo(sos_ids, [max_length], 0))
  target_ids_ta = target_ids_ta.write(
      i, py_utils.PadOrTrimTo(ids, [max_length], 0))
  paddings_ta = paddings_ta.write(
      i,
      py_utils.PadOrTrimTo(
          tf.zeros_like(ids, dtype=tf.float32), [max_length], 1.))

  # Hand back the `text` argument that was received. The previous code
  # returned `strs`, a name that is neither a parameter nor a local here.
  return i + 1, text, token_ids_ta, target_ids_ta, paddings_ta
def chunked_causal_denominator_grad(qs, ks, sums, res_grad):
  """Backward pass of FAVOR normalizer in causal attention using chunks.

  The sequence is traversed from the last step to the first. The running key
  sum at each step is recovered by subtracting keys from `sums`.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].
    sums: last prefix sum state.
    res_grad: incoming gradient; it is sliced per step along axis 0 and
      broadcast over the last axis (presumably [L,B,H], i.e. the gradient of
      the forward result -- confirm against the caller).

  Returns:
    Gradients of qs.
    Gradients of ks.
  """
  # Running sum of q * res_grad, accumulated back to front.
  k_grad = tf.zeros_like(ks[0])
  gr_sums = sums

  q_grads, k_grads = [], []

  # Reverse everything once so that plain forward slicing walks back in time.
  res_grad = res_grad[::-1]
  qs_rev = qs[::-1]
  ks_rev = ks[::-1]
  seq_len = qs_rev.shape[0]

  for lo in range(0, seq_len, _ITER_CHUNK_SIZE):
    hi = min(seq_len, lo + _ITER_CHUNK_SIZE)

    # Keys for all but the last step of the chunk, shifted by one with a
    # leading zero so that step s sees the state *including* step s.
    shifted = tf.concat(
        [tf.zeros_like(gr_sums[None, ...]), ks_rev[lo:hi - 1]], axis=0)
    state = gr_sums[None, ...] - tf.math.cumsum(shifted, axis=0)
    # Remove the chunk's last key to obtain the state for the next chunk.
    gr_sums = state[-1] - ks_rev[hi - 1]

    q_grads.append(tf.einsum("sijk,sij->sijk", state, res_grad[lo:hi]))

    scaled_q = tf.einsum("sijk,sij->sijk", qs_rev[lo:hi], res_grad[lo:hi])
    scaled_q = k_grad[None, ...] + tf.math.cumsum(scaled_q, axis=0)
    k_grad = scaled_q[-1]
    k_grads.append(scaled_q)

  # Undo the reversal so the gradients line up with the original ordering.
  q_grads = tf.concat(q_grads, axis=0)[::-1]
  k_grads = tf.concat(k_grads, axis=0)[::-1]

  return q_grads, k_grads
def _GetAP(self, gt_bbox, gt_imgid, pd_bbox, pd_imgid, pd_score):
  """Evaluates `ops.average_precision3d` in a fresh graph.

  Nothing is marked as ignored: both ignore inputs are all zeros.

  Returns:
    The evaluated [iou, pr] values.
  """
  graph = tf.Graph()
  with graph.as_default():
    no_gt_ignore = tf.zeros_like(gt_imgid, dtype=tf.int32)
    no_pd_ignore = tf.zeros_like(pd_imgid, dtype=tf.int32)
    iou, pr = ops.average_precision3d(
        iou_threshold=0.5,
        groundtruth_bbox=gt_bbox,
        groundtruth_imageid=gt_imgid,
        groundtruth_ignore=no_gt_ignore,
        prediction_bbox=pd_bbox,
        prediction_imageid=pd_imgid,
        prediction_score=pd_score,
        prediction_ignore=no_pd_ignore,
        num_recall_points=41,
        algorithm='KITTI')
  with self.session(graph=graph) as sess:
    return sess.run([iou, pr])
def testMoEModelDimReshapeFProp(self):
  """Test to verify MoEBuilder.MoE() supports dynamic shapes.

  Test without this change fails.
  """
  builder_params = gshard_builder.DenseBuilder.Params().Set(
      e_dim=2,
      c_dim=2,
      deterministic_dropout=True,
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=4,
      attention_num_heads=2,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=2,
      ff_dim=8,
      attention_key_value_dim=2,
      moe_hidden_dim=8)
  builder = builder_params.Instantiate()

  sub_layers = [
      builder.DecSelfAttentionRelativeBias('dec_self_attention'),
      builder.MoE('moe', decoder=True),
  ]
  p = builder.DecoderLayerStack(
      'decoder', sub_layers=sub_layers, num=2, use_repeat_layer=True)

  with self.session(graph=tf.Graph()) as sess:
    tf.random.set_seed(2019)
    # we will reduce the length_dim by 2 dynamically.
    layer = p.Instantiate()
    inputs, segment_ids, segment_pos = self._GetInputs(reshape_m=True)
    dec_inputs = py_utils.NestedMap(
        vec=inputs,
        segment_id=segment_ids,
        segment_pos=segment_pos,
        encoder_output=inputs,
        encoder_segment_id=tf.zeros_like(segment_ids),
        encoder_segment_pos=tf.zeros_like(segment_pos),
        aux_loss=tf.constant(0.0))
    # Verify length dimension shape is dynamic(a Tensor).
    out = layer.FPropDefaultTheta(dec_inputs).vec
    # Success criterion: building and running the graph does not raise.
    sess.run(tf.global_variables_initializer())
    sess.run([out])
def FProp(self, theta, inputs, paddings, domain_ids=None):
  """Applies data augmentation by randomly masking the spectrum of inputs.

  With a single configured domain, the augmentation network is applied to the
  whole batch. With several, each configured domain gets its own augmentation,
  which is kept only where `domain_ids` equals that domain. Positions that
  match no configured domain keep their original values.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers.
    inputs: A tensor of shape [batch, time, freq, num_channels].
    paddings: A 0/1 tensor of shape [batch, time].
    domain_ids: input domain_ids of shape [batch, time].

  Returns:
    A pair of 2 tensors:

    - augmented_inputs: A tensor of shape [batch, time, freq, num_channels].
    - paddings: A 0/1 tensor of shape [batch, time].
  """
  p = self.params

  # A tensor seed in case stateless random ops are needed.
  global_seed = None
  if p.use_input_dependent_random_seed:
    global_seed = _global_seed_from_inputs(inputs)

  batch_size, series_length, _, _ = py_utils.GetShape(inputs)

  if len(p.domain_ids) <= 1:
    augmented_inputs = self._AugmentationNetwork(
        series_length,
        inputs,
        paddings,
        global_seed=global_seed,
        domain_id_index=0)
    return augmented_inputs, paddings

  augmented_inputs = tf.zeros_like(inputs)
  # Progressively zeroed wherever some configured domain matches.
  original_inputs = inputs
  for index, domain_id in enumerate(p.domain_ids):
    augmented_domain = self._AugmentationNetwork(
        series_length,
        inputs,
        paddings,
        global_seed=global_seed,
        domain_id_index=index)
    target_domain = tf.cast(
        tf.expand_dims(tf.tile([domain_id], [batch_size]), -1), dtype=p.dtype)
    # [batch, time].
    domain_mask = tf.cast(tf.equal(domain_ids, target_domain), dtype=p.dtype)
    augmented_domain = tf.einsum(
        'bxyc,bx->bxyc',
        augmented_domain,
        domain_mask,
        name='einsum_domainmasking')
    original_inputs = tf.einsum(
        'bxyc,bx->bxyc',
        original_inputs,
        1.0 - domain_mask,
        name='einsum_domainmasking2')
    augmented_inputs = augmented_domain + augmented_inputs

  # Add back the positions that no domain claimed.
  augmented_inputs = original_inputs + augmented_inputs
  return augmented_inputs, paddings
def _testElmanHelper(self, seqlen, use_grad, stop_fn=None):
  """Checks recurrent.Recurrent() against a statically unrolled Elman cell.

  Outputs, final state and gradients of both computations must match.

  Args:
    seqlen: Number of time steps.
    use_grad: If True, `self.ElmanGrad` is passed as the cell gradient.
      Otherwise no custom cell gradient is supplied.
    stop_fn: Optional early-stop predicate. When it fires in the unrolled
      version, the remaining steps are filled with zeros.
  """
  with self.session() as sess:
    tf.set_random_seed(342462)

    batch = 3
    dims = 4
    theta = py_utils.NestedMap()
    theta.w = self.Rand([2 * dims, dims])
    theta.b = self.Rand([dims])
    state0 = py_utils.NestedMap()
    state0.h = self.Rand([batch, dims])
    inputs = py_utils.NestedMap()
    inputs.x = self.Rand([seqlen, batch, dims])
    grad_targets = [theta.w, theta.b, state0.h, inputs.x]

    # Static unrolled.
    state = state0
    unrolled = []
    for step in range(seqlen):
      step_input = py_utils.NestedMap()
      step_input.x = inputs.x[step, :]
      state, _ = self.Elman(theta, state, step_input)
      unrolled.append(state.h)
      if stop_fn and stop_fn(step + 1, theta, state):
        # Pad the steps that were skipped so the stacked shape still matches.
        unrolled.extend(
            tf.zeros_like(unrolled[-1]) for _ in range(seqlen - step - 1))
        break
    acc0, final0 = tf.stack(unrolled), state.h
    loss0 = tf.reduce_sum(acc0) + tf.reduce_sum(final0)
    (dw0, db0, dh0, di0) = tf.gradients(loss0, grad_targets)

    # Uses the Recurrent() library.
    acc1, final1 = recurrent.Recurrent(
        theta=theta,
        state0=state0,
        inputs=inputs,
        cell_fn=self.Elman,
        cell_grad=self.ElmanGrad if use_grad else None,
        stop_fn=stop_fn)
    acc1, final1 = acc1.h, final1.h
    loss1 = tf.reduce_sum(acc1) + tf.reduce_sum(final1)
    (dw1, db1, dh1, di1) = tf.gradients(loss1, grad_targets)

    # Fetches a bunch of values and compare them. The list alternates
    # (unrolled, Recurrent) pairs.
    fetched = sess.run([
        acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0, di1
    ])
    for expected, actual in zip(fetched[0::2], fetched[1::2]):
      self.assertAllClose(expected, actual)
def _MoeOrFFLayer(self, theta, fflayer_name, in_nmap):
  """FProp through the named feed-forward child or its MoE counterpart.

  Args:
    theta: Layer theta: A NestedMap of Tensors.
    fflayer_name: Child FFLayer name as created in __init__. For example:
      'fflayer_end'. This assumes the moe_layer if created would have the
      convention as (`fflayer_name` + `_moe`).
    in_nmap: Nested Map containing the following:
      * features: A Tensor of shape [batch, seqlen, dim0].
      * paddings: A Tensor of shape [batch, seqlen].
      * aux_loss: [Optional] aux loss if present in input batch.

  Returns:
    out_nmap: A copy of `in_nmap` with these entries updated:
      * features: Tensor of shape [batch, seqlen, dim0].
      * aux_loss: Only on the MoE path. The MoE auxiliary loss, with the input
        aux loss added when one is present.

  Raises:
    AssertionError: If neither child exists, or the MoE child has no theta.
  """
  out_nmap = in_nmap.copy()

  if fflayer_name in self.children:
    out_nmap.features = self.children[fflayer_name].FProp(
        theta.GetItem(fflayer_name), in_nmap.features, in_nmap.paddings)
    return out_nmap

  moe_fflayer_name = fflayer_name + '_moe'
  if moe_fflayer_name not in self.children:
    raise AssertionError(
        '{} child layer not present.'.format(moe_fflayer_name))
  if moe_fflayer_name not in theta:
    raise AssertionError(
        '{} layer theta not present.'.format(moe_fflayer_name))

  # 0 - padded positions and 1 - non-padded positions.
  segment_ids = tf.cast(1. - in_nmap.paddings, tf.int32)
  # Segment positions are not used but required by MoE.
  segment_pos = tf.zeros_like(segment_ids)
  moe_out = self.children[moe_fflayer_name].FProp(
      theta.GetItem(moe_fflayer_name),
      py_utils.NestedMap(
          vec=in_nmap.features,
          segment_id=segment_ids,
          segment_pos=segment_pos))
  out_nmap.features = moe_out.vec

  aux_loss = moe_out.aux_loss
  if 'aux_loss' in in_nmap:
    assert not aux_loss.shape.rank, 'MoE aux-loss should be a scalar.'
    if len(py_utils.GetShape(in_nmap.aux_loss)) == 1:
      # The incoming loss is per-example: replicate the scalar MoE loss.
      b_size = py_utils.GetShape(in_nmap.aux_loss)[0]
      aux_loss = tf.tile(tf.expand_dims(aux_loss, axis=0), [b_size])
    assert in_nmap.aux_loss.shape.rank == aux_loss.shape.rank
    aux_loss += in_nmap.aux_loss

  # Add 'aux_loss' in out_nmap.
  out_nmap.aux_loss = aux_loss
  return out_nmap
def _TestStreamStepHelper(self, **kwargs):
  """Checks that StreamStep over strided slices matches a full FProp.

  Recognized kwargs (all are also forwarded to `self._GetParams`):
    input_dim: Required. Feature dimension of the generated inputs.
    stride: Time steps fed per StreamStep call. Defaults to 1.
    right_context: Extra future frames; covered by feeding fully padded
      steps after the real input is exhausted. Defaults to 0.
    tol: Absolute and relative tolerance of the comparison. Defaults to 1e-6.
  """
  batch_size = 2
  max_seqlen = 32
  input_dim = kwargs['input_dim']
  stride = kwargs.get('stride', 1)
  # max_seqlen is divisible by stride.
  assert max_seqlen % stride == 0
  right_context = kwargs.get('right_context', 0)

  # Prepares inputs.
  inputs, paddings = self._GetInputs(batch_size, max_seqlen, input_dim)

  # Gets params
  p = self._GetParams(**kwargs)

  # Builds graph.
  with self.session(use_gpu=False) as sess:
    l = p.Instantiate()
    init_op = tf.global_variables_initializer()

    # Reference: one FProp over the whole sequence, zeroed at padded steps.
    fprop_out = self._FProp(l, inputs, paddings)
    base_outputs = self._GetFPropOutput(fprop_out)
    out_rank = py_utils.GetRank(base_outputs)
    base_outputs *= py_utils.AppendDims(1. - paddings, out_rank - 2)

    # zero_state is tried without theta first, then with it.
    try:
      state = l.zero_state(batch_size)
    except TypeError:
      state = l.zero_state(l.theta, batch_size)

    num_real_steps = max_seqlen // stride
    num_flush_steps = int(math.ceil(right_context / stride))
    outputs = []
    for step in range(num_real_steps + num_flush_steps):
      if step < num_real_steps:
        begin, end = stride * step, stride * (step + 1)
        step_inputs = inputs[:, begin:end]
        step_paddings = paddings[:, begin:end]
      else:
        # Past the end of the input: feed fully padded zeros.
        step_inputs = tf.zeros_like(inputs[:, 0:stride])
        step_paddings = tf.ones_like(paddings[:, 0:stride])
      output, _, state = l.StreamStep(l.theta, step_inputs, step_paddings,
                                      state)
      outputs.append(output)

    outputs = tf.concat(outputs, axis=1)
    outputs = self._NormalizeStreamStepOutput(outputs, paddings, right_context,
                                              max_seqlen)

    sess.run(init_op)

    expected, actual = sess.run([base_outputs, outputs])
    print(f'expected: {repr(expected)}, {expected.shape}')
    print(f'actual: {repr(actual)}, {actual.shape}')
    print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
    print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
    tol = kwargs.get('tol', 1e-6)
    self.assertAllClose(expected, actual, atol=tol, rtol=tol)
def grad_fn(d_outputs):
  """Gradient function of the entmax output.

  Args:
    d_outputs: Gradient with respect to the output.

  Returns:
    The input gradient, returned twice (one entry per differentiated
    argument -- presumably; confirm against the forward function).
  """
  with tf.name_scope("entmax_grad"):
    # p^(2 - alpha) where p is positive, zero elsewhere.
    gppr = tf.where(p_m > 0, tf.math.pow(p_m, 2.0 - alpha),
                    tf.zeros_like(p_m))
    d_inputs = d_outputs * gppr
    # Subtract the gppr-weighted mean of the upstream gradient along `axis`.
    numerator = tf.math.reduce_sum(d_inputs, axis)
    denominator = tf.math.reduce_sum(gppr, axis)
    q = tf.expand_dims(numerator / denominator, axis)
    d_inputs -= q * gppr
    return d_inputs, d_inputs
def _ParseRecord(self, record):
  """Reads and parses a single record.

  Args:
    record: A serialized example with fixed-length 'input_ids', 'input_mask',
      'masked_lm_positions', 'masked_lm_ids' and 'masked_lm_weights' features.

  Returns:
    A NestedMap with `ids` (original ids, masked positions restored),
    `segment_ids`, `paddings` and `segment_pos`. Unless `p.remove_mask` is
    set, it also carries `masked_ids` and `masked_pos`. The first EOS token is
    removed from every field and a trailing 0 keeps the length unchanged.
  """
  p = self.params

  int_feature = tf.int64
  name_to_features = {
      'input_ids':
          tf.io.FixedLenFeature([p.max_sequence_length], int_feature),
      'input_mask':
          tf.io.FixedLenFeature([p.max_sequence_length], int_feature),
      'masked_lm_positions':
          tf.io.FixedLenFeature([p.max_predictions_per_seq], int_feature),
      'masked_lm_ids':
          tf.io.FixedLenFeature([p.max_predictions_per_seq], int_feature),
      'masked_lm_weights':
          tf.io.FixedLenFeature([p.max_predictions_per_seq], tf.float32),
  }
  example = tf.io.parse_single_example(record, name_to_features)

  # The summed weights give the number of leading prediction slots to keep.
  mask_length = tf.cast(
      tf.reduce_sum(example['masked_lm_weights']), dtype=tf.int32)
  masked_lm_positions = tf.slice(example['masked_lm_positions'], [0],
                                 [mask_length])
  masked_lm_ids = tf.cast(
      tf.slice(example['masked_lm_ids'], [0], [mask_length]), dtype=tf.int32)

  ret = py_utils.NestedMap()
  ret.masked_ids = tf.cast(example['input_ids'], dtype=tf.int32)
  # Get back non-masked, original ids.
  ret.ids = tf.tensor_scatter_nd_update(
      tensor=ret.masked_ids,
      indices=tf.reshape(masked_lm_positions, [-1, 1]),
      updates=masked_lm_ids)
  # 1.0 at every masked position, 0.0 elsewhere.
  ret.masked_pos = tf.tensor_scatter_nd_update(
      tensor=tf.zeros_like(ret.masked_ids, dtype=tf.float32),
      indices=tf.reshape(masked_lm_positions, [-1, 1]),
      updates=tf.ones_like(masked_lm_ids, dtype=tf.float32))
  ret.segment_ids = tf.cast(example['input_mask'], dtype=tf.float32)

  first_eos_idx = tf.where(tf.math.equal(ret.ids, p.eos_token_id))[0][0]

  def _RemoveFirstEos(x):
    # We remove the element at position `first_eos_idx`, and pad with 0
    # to keep length unchanged.
    before = x[:first_eos_idx]
    after = x[first_eos_idx + 1:]
    zero = tf.constant(0, shape=(1,), dtype=x.dtype)
    return tf.concat([before, after, zero], axis=0)

  ret = ret.Transform(_RemoveFirstEos)
  ret.paddings = 1.0 - ret.segment_ids
  pos = tf.cast(tf.range(p.max_sequence_length), dtype=tf.float32)
  ret.segment_pos = tf.cast(ret.segment_ids * pos, dtype=tf.int32)

  if p.remove_mask:
    del ret.masked_pos
    del ret.masked_ids
  return ret
def NMSIndices(self,
               bboxes,
               scores,
               max_output_size,
               nms_iou_threshold=0.3,
               score_threshold=0.01):
  """Apply NMS to a series of 3d bounding boxes in 7-DOF format.

  Suppression is done on the axis-aligned x/y footprint of each box; height
  and rotation are ignored.

  Args:
    bboxes: A [num_boxes, 7] floating point Tensor of bounding boxes in [x, y,
      z, dx, dy, dz, phi] format.
    scores: A [num_boxes] floating point Tensor containing box scores.
    max_output_size: Maximum number of boxes to predict per input.
    nms_iou_threshold: IoU threshold to use when determining whether two boxes
      overlap for purposes of suppression.
    score_threshold: The score threshold passed to NMS that allows NMS to
      quickly ignore irrelevant boxes.

  Returns:
    The NMS indices and the mask of the padded indices. Padded index slots
    are set to 0 and have mask value 0.
  """
  bboxes = py_utils.HasShape(bboxes, [-1, 7])

  # Extract x, y, w, h, then convert to extrema.
  #
  # Note that we drop the rotation angle because we don't have an NMS
  # operation that takes rotation into account.
  xywh_columns = [bboxes[:, 0], bboxes[:, 1], bboxes[:, 3], bboxes[:, 4]]
  bboxes_extrema = geometry.XYWHToBBoxes(tf.stack(xywh_columns, axis=-1))

  # Compute NMS with padding; we use the padded version so this function can
  # be used in a map_fn.  This function returns the scalar number of boxes
  # for each example.
  #
  # The default IoU threshold is 0.3 since our anchor boxes have rotations
  # that make an IoU threshold of 0.5 possibly too high.
  nms_index_padded, num_valid = tf.image.non_max_suppression_padded(
      bboxes_extrema,
      scores,
      iou_threshold=nms_iou_threshold,
      max_output_size=max_output_size,
      score_threshold=score_threshold,
      pad_to_max_output_size=True)

  # Return the mask of valid indices instead of just a scalar number.
  valid_part = tf.ones([num_valid])
  padded_part = tf.zeros([max_output_size - num_valid])
  mask = tf.concat([valid_part, padded_part], axis=0)
  nms_index_padded = tf.where(mask > 0, nms_index_padded,
                              tf.zeros_like(nms_index_padded))
  return nms_index_padded, mask
def _ApplyAndReset():
  """Applies the accumulated gradients, zeroes them, and bumps global_step.

  When `self._apply_crs_to_grad` is set, each accumulator is first summed
  across replicas with `tf.tpu.cross_replica_sum`.

  Returns:
    An op grouping the accumulator resets (which run after the apply op) and
    the global-step increment.
  """
  if self._apply_crs_to_grad:
    grads_to_apply = [
        tf.tpu.cross_replica_sum(accum.read_value()) for accum in accums
    ]
  else:
    grads_to_apply = accums

  apply_op = self._opt.apply_gradients(list(zip(grads_to_apply, variables)))
  # The accumulators may only be cleared once their values have been applied.
  with tf.control_dependencies([apply_op]):
    zero_op = [tf.assign(accum, tf.zeros_like(accum)) for accum in accums]
  return tf.group(zero_op, tf.assign_add(global_step, 1))
def BBoxCorners(bboxes):
  """Extract the corner points from a 7-DOF bbox representation.

  A unit cube centered at the origin is scaled by the box dimensions, rotated
  about the z axis by phi, and translated to the box location.

  Args:
    bboxes: A [batch, num_boxes, 7] floating point bounding box representation
      ([x, y, z, dx, dy, dz, phi]).

  Returns:
    A [batch, num_boxes, 8, 3] floating point Tensor containing
      the corner (x, y, z) points for every bounding box.
  """
  # Code adapted from vale/soapbox codebase.
  #
  # Corners in normalized box frame (unit cube centered at origin).
  #
  # Dimensions is [length, width, height].
  unit_corners = tf.constant([
      [0.5, 0.5, 0.5],  # top
      [-0.5, 0.5, 0.5],  # top
      [-0.5, -0.5, 0.5],  # top
      [0.5, -0.5, 0.5],  # top
      [0.5, 0.5, -0.5],  # bottom
      [-0.5, 0.5, -0.5],  # bottom
      [-0.5, -0.5, -0.5],  # bottom
      [0.5, -0.5, -0.5],  # bottom
  ])

  batch, nb, _ = py_utils.GetShape(bboxes, 3)

  # Extract location, dimension, and rotation.
  location = bboxes[:, :, :3]
  dimensions = bboxes[:, :, 3:6]
  phi_world = bboxes[:, :, 6]

  # Convert rotation_phis into rotation matrices along unit z.
  cos = tf.cos(phi_world)
  sin = tf.sin(phi_world)
  zero = tf.zeros_like(cos)
  one = tf.ones_like(cos)
  matrix_entries = [cos, -sin, zero, sin, cos, zero, zero, zero, one]
  rotations_world = tf.reshape(
      tf.stack(matrix_entries, axis=2), [batch, nb, 3, 3])

  # Create axis-aligned corners from length/width/height.
  scaled = tf.einsum('bni,ji->bnji', dimensions, unit_corners)

  # Rotate the corners coordinates to the rotated world frame.
  rotated = tf.einsum('bnij,bnkj->bnki', rotations_world, scaled)

  # Translate corners to the world location.
  return rotated + tf.reshape(location, (batch, nb, 1, 3))