def _testBeamSearch(self,
                    expected_values,
                    dtype=tf.float32,
                    init_step_ids=False,
                    has_task_ids=False):
  """Runs transformer beam search decoding and compares it to golden values.

  Args:
    expected_values: Mapping read for the keys 'topk_ids', 'topk_lens',
      'topk_scores', 'atten_vec_0', 'atten_vec_1' and 'normalized_score'.
    dtype: Forwarded to both `_DecoderParams` and `_Inputs`.
    init_step_ids: Forwarded to both `_DecoderParams` and `_Inputs`.
    has_task_ids: Forwarded to `_Inputs`.
  """
  tf.random.set_seed(_TF_RANDOM_SEED)
  src_batch, src_time = 4, 5

  decoder_params = self._DecoderParams(
      dtype=dtype, init_step_ids=init_step_ids)
  decoder_params.beam_search.num_hyps_per_beam = 2
  decoder_params.beam_search.coverage_penalty = 0.0
  decoder_params.beam_search.length_normalization = 0
  transformer_dec = decoder.TransformerDecoder(decoder_params)

  encoder_outputs, _, _ = self._Inputs(
      dtype=dtype, has_task_ids=has_task_ids, init_step_ids=init_step_ids)
  decode_op = transformer_dec.BeamSearchDecode(encoder_outputs)
  # The MT decoder leaves topk_decoded as None, which sess.run() cannot
  # fetch; substitute a placeholder tensor so the whole tuple can be run.
  decode_op = decode_op._replace(topk_decoded=tf.constant(0, tf.float32))

  with self.session(use_gpu=True) as sess:
    tf.global_variables_initializer().run()
    actual = sess.run(decode_op)

  # Shape checks: (expected shape, name of the output field).
  hyps_per_beam = decoder_params.beam_search.num_hyps_per_beam
  total_hyps = src_batch * hyps_per_beam
  shape_expectations = (
      ((src_time, total_hyps), 'done_hyps'),
      ((src_batch, hyps_per_beam), 'topk_hyps'),
      ((total_hyps, src_time), 'topk_ids'),
      ((total_hyps,), 'topk_lens'),
      ((src_batch, hyps_per_beam), 'topk_scores'),
  )
  for want_shape, field in shape_expectations:
    self.assertTupleEqual(want_shape, getattr(actual, field).shape)

  # Value checks for ids, lengths and scores.
  self.assertAllEqual(expected_values['topk_ids'], actual.topk_ids)
  self.assertAllEqual(expected_values['topk_lens'], actual.topk_lens)
  self.assertAllClose(expected_values['topk_scores'], actual.topk_scores)

  # Attention probabilities are read from the second serialized hypothesis.
  serialized_hyp = actual.topk_hyps.flatten()[1]
  hyp = Hypothesis()
  hyp.ParseFromString(serialized_hyp)
  print('HYP:', hyp)
  atten_vec_0, atten_vec_1 = [
      list(np.array(hyp.atten_vecs[step].prob)) for step in (0, 1)
  ]
  self.assertAllClose(atten_vec_0, expected_values['atten_vec_0'])
  self.assertAllClose(atten_vec_1, expected_values['atten_vec_1'])

  # Normalized score of the same hypothesis.
  CompareToGoldenSingleFloat(self, expected_values['normalized_score'],
                             hyp.normalized_score)
def testDecoderSampleTargetSequences(self):
  """Samples target sequences and checks them against golden ids/lengths.

  Also verifies that sampling again with the same `random_seed` reproduces
  the ids and paddings, and that a different `random_seed` gives different
  ids.
  """
  golden_ids = [
      [6, 2, 2, 2, 2],
      [0, 0, 7, 5, 1],
      [6, 1, 5, 1, 5],
      [6, 7, 7, 4, 4],
  ]
  golden_lens = [2, 5, 5, 5]
  src_len, batch_size = 5, 4

  params = self._DecoderParams(
      vn_config=py_utils.VariationalNoiseParams(None, False, False),
      num_classes=8)
  params.target_seq_len = 5
  params.random_seed = 1

  session_config = tf.config_pb2.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(do_function_inlining=False)))

  with self.session(use_gpu=False, config=session_config) as sess:
    # Seeds are set before any op is created so the goldens stay valid.
    tf.random.set_seed(8372740)
    np.random.seed(35315)
    dec = params.Instantiate()

    encodings = tf.constant(
        np.random.normal(size=[src_len, batch_size, params.source_dim]),
        dtype=tf.float32)
    encoding_paddings = tf.constant(
        [[0.0, 0.0, 0.0, 0.0],
         [0.0, 0.0, 0.0, 1.0],
         [0.0, 1.0, 1.0, 1.0],
         [0.0, 1.0, 1.0, 1.0],
         [0.0, 1.0, 1.0, 1.0]],
        dtype=tf.float32)
    encoder_outputs = py_utils.NestedMap(
        encoded=encodings, padding=encoding_paddings)

    def _BuildSampler(seed):
      """Adds a SampleTargetSequences subgraph seeded with `seed`."""
      return dec.SampleTargetSequences(
          dec.theta, encoder_outputs, random_seed=tf.cast(seed, tf.int32))

    sampled = _BuildSampler(123)
    self.assertAllEqual([batch_size, params.target_seq_len],
                        sampled.ids.shape)
    self.evaluate(tf.global_variables_initializer())

    first = sess.run(sampled)
    print('ids=%s' % np.array_repr(first.ids))
    lens = np.sum(1 - first.paddings, axis=1)
    print('lens=%s' % lens)
    self.assertAllEqual(golden_lens, lens)
    self.assertAllEqual(golden_ids, first.ids)

    # Same seed: ids and paddings must match the first sample.
    repeat = sess.run(_BuildSampler(123))
    self.assertAllEqual(first.ids, repeat.ids)
    self.assertAllEqual(first.paddings, repeat.paddings)

    # Different seed: the ids must differ from the goldens.
    other = sess.run(_BuildSampler(123456))
    self.assertNotAllClose(golden_ids, other.ids)
def ComputeLoss(self, theta, predictions, input_batch):
  """Builds the classification and regression losses of the sparse detector.

  Args:
    theta: A `.NestedMap` object containing variable values of this task
      (not referenced in this method).
    predictions: A `.NestedMap` providing `residuals` and
      `classification_logits`.
    input_batch: A `.NestedMap` providing `cell_center_xyz`, `anchor_bboxes`,
      `anchor_localization_residuals`, `assigned_gt_labels`,
      `assigned_cls_mask` and `assigned_reg_mask`.

  Returns:
    A pair `(metrics_dict, per_example_dict)`:

    - metrics_dict maps str keys to `(metric, weight)` pairs; it contains
      'loss', the individual loss components, and whatever
      `_BBoxDimensionErrors` returns.
    - per_example_dict holds the predicted residuals and logits, the decoded
      predicted / ground-truth boxes, and the regression weights.
  """
  p = self.params
  batch_size, num_centers = py_utils.GetShape(input_batch.cell_center_xyz, 2)

  def _PerAnchorShape(*trailing_dims):
    """Returns [batch, centers, anchors_per_center, *trailing_dims]."""
    return [batch_size, num_centers, p.num_anchor_bboxes_per_center
           ] + list(trailing_dims)

  # Validate the shapes of all inputs up front.
  anchor_bboxes = py_utils.HasShape(input_batch.anchor_bboxes,
                                    _PerAnchorShape(7))
  gt_residuals = py_utils.HasShape(input_batch.anchor_localization_residuals,
                                   _PerAnchorShape(7))
  predicted_residuals = py_utils.HasShape(predictions.residuals,
                                          _PerAnchorShape(7))
  gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                _PerAnchorShape())
  predicted_logits = py_utils.HasShape(predictions.classification_logits,
                                       _PerAnchorShape(p.num_classes))

  # Classification weights come from assigned_cls_mask, which is 0 for
  # ignored anchors (IOU neither high enough for foreground nor low enough
  # for background).
  cls_weights = py_utils.HasShape(input_batch.assigned_cls_mask,
                                  _PerAnchorShape())
  cls_weights = tf.reshape(cls_weights, _PerAnchorShape(1))

  # Each anchor has num_classes prediction heads; scale each head by its
  # per-class loss weight via broadcasting.
  per_class_weight = tf.constant([[[p.per_class_loss_weight]]],
                                 dtype=tf.float32)
  per_class_weight = py_utils.HasShape(per_class_weight,
                                       [1, 1, 1, p.num_classes])
  cls_weights = cls_weights * per_class_weight
  cls_weights = py_utils.HasShape(cls_weights, _PerAnchorShape(p.num_classes))

  # Regression weights come from assigned_reg_mask, which is 1 only for
  # foreground anchors.
  reg_weights = py_utils.HasShape(input_batch.assigned_reg_mask,
                                  _PerAnchorShape())
  reg_weights = tf.reshape(reg_weights, _PerAnchorShape(1))

  if p.loss_norm_type == LossNormType.NORM_BY_NUM_POS_PER_CENTER:
    foreground_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                                        _PerAnchorShape())
    # Foreground anchor count per center, floored at one.
    num_foreground = tf.reduce_sum(foreground_mask, axis=2)
    normalizer = tf.maximum(num_foreground, tf.ones_like(num_foreground))
    normalizer = tf.reshape(normalizer, [batch_size, num_centers, 1, 1])
    # Also divide by the number of centers so the loss does not scale with
    # it.
    normalizer = normalizer * num_centers
    cls_weights = cls_weights / normalizer
    reg_weights = reg_weights / normalizer

  focal_loss = py_utils.SigmoidCrossEntropyFocalLoss(
      logits=predicted_logits,
      labels=tf.one_hot(gt_labels, p.num_classes),
      alpha=p.focal_loss_alpha,
      gamma=p.focal_loss_gamma)
  masked_focal_loss = focal_loss * cls_weights
  # TODO(jngiam): Consider normalizing by num_foreground_anchors for each
  # example instead. This would match the 1/N_positive normalization in
  # point pillars.
  # Sum over centers, boxes and classes, then average over the batch.
  classification_loss = tf.reduce_mean(
      tf.reduce_sum(masked_focal_loss, axis=[1, 2, 3]))

  # Huber (SmoothL1) loss on the location and dimension residuals.
  loc_and_dims_loss = self._utils_3d.ScaledHuberLoss(
      labels=gt_residuals[..., :6],
      predictions=predicted_residuals[..., :6],
      delta=p.huber_loss_delta)

  # TODO(jngiam): Consider other methods for rotation loss such as softmax
  # binning.
  # Rotation uses SmoothL1(sin(delta)) so that the loss does not depend on
  # the direction of the rotation error.
  rotation_delta = predicted_residuals[..., 6:] - gt_residuals[..., 6:]
  rotation_loss = self._utils_3d.ScaledHuberLoss(
      labels=tf.zeros_like(rotation_delta),
      predictions=tf.sin(rotation_delta),
      delta=p.huber_loss_delta)

  location_loss = loc_and_dims_loss[..., :3]
  dimension_loss = loc_and_dims_loss[..., 3:6]

  gt_bboxes = self._utils_3d.ResidualsToBBoxes(anchor_bboxes, gt_residuals)
  predicted_bboxes = self._utils_3d.ResidualsToBBoxes(anchor_bboxes,
                                                      predicted_residuals)

  def _MaskedSumOverBatch(unreduced_loss):
    """Masks with reg_weights, sums everything, divides by batch_size."""
    return tf.reduce_sum(unreduced_loss * reg_weights) / batch_size

  reg_rot_loss = _MaskedSumOverBatch(rotation_loss)
  reg_loc_loss = _MaskedSumOverBatch(location_loss)
  reg_dim_loss = _MaskedSumOverBatch(dimension_loss)

  # Do not create corner loss graph if weight is 0.0
  # TODO(bcyang): Remove condition after fixing corner loss NaN issue
  if p.corner_loss_weight != 0.0:
    corner_loss = self._utils_3d.CornerLoss(
        gt_bboxes=gt_bboxes, predicted_bboxes=predicted_bboxes)
    reg_corner_loss = _MaskedSumOverBatch(
        tf.expand_dims(corner_loss, axis=-1))
  else:
    reg_corner_loss = 0.0

  regression_loss = (
      p.location_loss_weight * reg_loc_loss +
      p.dimension_loss_weight * reg_dim_loss +
      p.rotation_loss_weight * reg_rot_loss +
      p.corner_loss_weight * reg_corner_loss)

  total_loss = (
      p.loss_weight_localization * regression_loss +
      p.loss_weight_classification * classification_loss)

  metrics_dict = py_utils.NestedMap({
      'loss': (total_loss, batch_size),
      'loss/regression': (regression_loss, batch_size),
      'loss/regression/loc': (reg_loc_loss, batch_size),
      'loss/regression/dim': (reg_dim_loss, batch_size),
      'loss/regression/rot': (reg_rot_loss, batch_size),
      'loss/regression/corner': (reg_corner_loss, batch_size),
      'loss/classification': (classification_loss, batch_size),
  })
  # Add the bounding-box dimension error metrics.
  metrics_dict.update(
      self._BBoxDimensionErrors(gt_bboxes, predicted_bboxes, reg_weights))

  per_example_dict = py_utils.NestedMap({
      'residuals': predicted_residuals,
      'classification_logits': predicted_logits,
      'predicted_bboxes': predicted_bboxes,
      'gt_bboxes': gt_bboxes,
      'reg_weights': reg_weights,
  })
  return metrics_dict, per_example_dict
def _GetMask(self,
             batch_size,
             choose_range,
             mask_size,
             max_length=None,
             masks_per_frame=0.0,
             multiplicity=1,
             dtype=tf.float32,
             max_ratio=1.0):
  """Returns fixed size multi-masks starting from random positions.

  A multi-mask is a mask obtained by applying multiple masks.

  This function when max_length is given:
    1) Sample random mask lengths less than max_length with shape
       (batch_size, multiplicity).
    2) Truncate lengths to a max of (choose_range * max_ratio), so that each
       mask is fully contained within the corresponding sequence.
    3) Random sample start points of shape (batch_size, multiplicity) within
       (choose_range - lengths).
    4) For each batch, multiple masks (whose number is given by the
       multiplicity) are constructed.
    5) Return a mask of shape (batch_size, mask_size) where masks are
       obtained by composing the masks constructed in step 4). If
       masks_per_frame > 0, the number is given by
       min(masks_per_frame * choose_range, multiplicity). If not, all the
       masks are composed. The masked regions are set to zero.

  This function when max_length is not given:
    1) Sample random mask lengths less than (choose_range * max_ratio) with
       shape (batch_size, multiplicity).
    2) Proceed to steps 3), 4) and 5) of the above.

  Args:
    batch_size: Batch size. Integer number.
    choose_range: Range within which the masked entries must lie. Tensor of
      shape (batch_size,).
    mask_size: Size of the mask. Integer number.
    max_length: Maximum number of allowed consecutive masked entries. Integer
      number or None. A falsy or non-positive value is treated as "not
      given".
    masks_per_frame: Number of masks per frame. Float number. If > 0, the
      multiplicity of the mask is set to be masks_per_frame * choose_range.
    multiplicity: Maximum number of total masks. Integer number.
    dtype: Floating point data type used for every intermediate computation
      and for the returned mask (before the optional cast to
      `p.fprop_dtype`).
    max_ratio: Maximum portion of the entire range allowed to be masked.
      Float number.

  Returns:
    mask: a fixed size multi-mask starting from a random position with shape
    (batch_size, mask_size). It is cast to `p.fprop_dtype` when that is set
    and differs from `p.dtype`.
  """
  p = self.params
  # Non-empty random seed values are only used for testing
  # seed_1 and seed_2 are set separately to avoid correlation of
  # mask size and mask position.
  if p.random_seed:
    seed_1 = p.random_seed + 1
    seed_2 = 2 * p.random_seed
  else:
    seed_1 = p.random_seed
    seed_2 = p.random_seed

  # Sample lengths for multiple masks.
  if max_length and max_length > 0:
    max_length = tf.broadcast_to(tf.cast(max_length, dtype), (batch_size,))
  else:
    max_length = tf.cast(choose_range, dtype=dtype) * max_ratio
  masked_portion = tf.random.uniform((batch_size, multiplicity),
                                     minval=0.0,
                                     maxval=1.0,
                                     dtype=dtype,
                                     seed=seed_1)
  masked_frame_size = tf.einsum('b,bm->bm', max_length, masked_portion)
  masked_frame_size = tf.cast(masked_frame_size, dtype=tf.int32)

  # Make sure the sampled length was smaller than max_ratio * length_bound.
  # Note that sampling in this way was biased
  # (shorter sequence may over-masked.)
  choose_range = tf.expand_dims(choose_range, -1)
  choose_range = tf.tile(choose_range, [1, multiplicity])
  length_bound = tf.cast(choose_range, dtype=dtype)
  length_bound = tf.cast(max_ratio * length_bound, dtype=tf.int32)
  length = tf.minimum(masked_frame_size, tf.maximum(length_bound, 1))

  # Choose starting point. The sample must be drawn in `dtype`: it is
  # multiplied by a `dtype` tensor on the next line, and TensorFlow does not
  # promote mixed float types, so a float32 sample would fail for any other
  # `dtype`.
  random_start = tf.random.uniform((batch_size, multiplicity),
                                   maxval=1.0,
                                   dtype=dtype,
                                   seed=seed_2)
  start_with_in_valid_range = random_start * tf.cast(
      (choose_range - length + 1), dtype=dtype)
  start = tf.cast(start_with_in_valid_range, tf.int32)
  end = start + length - 1

  # Shift starting and end point by small value so the strict comparisons
  # below include both integer endpoints. `delta` is created in `dtype` for
  # the same mixed-dtype reason as `random_start`.
  delta = tf.constant(0.1, dtype=dtype)
  start = tf.expand_dims(tf.cast(start, dtype) - delta, -1)
  start = tf.tile(start, [1, 1, mask_size])
  end = tf.expand_dims(tf.cast(end, dtype) + delta, -1)
  end = tf.tile(end, [1, 1, mask_size])

  # Construct pre-mask of shape (batch_size, multiplicity, mask_size).
  diagonal = tf.expand_dims(
      tf.expand_dims(tf.cast(tf.range(mask_size), dtype=dtype), 0), 0)
  diagonal = tf.tile(diagonal, [batch_size, multiplicity, 1])
  pre_mask = tf.cast(
      tf.logical_and(diagonal < end, diagonal > start), dtype=dtype)

  # Sum masks with appropriate multiplicity.
  if masks_per_frame > 0:
    # Keep only the first masks_per_frame * choose_range masks per example.
    multiplicity_weights = tf.tile(
        tf.expand_dims(tf.range(multiplicity, dtype=dtype), 0),
        [batch_size, 1])
    multiplicity_tensor = masks_per_frame * tf.cast(choose_range, dtype=dtype)
    multiplicity_weights = tf.cast(
        multiplicity_weights < multiplicity_tensor, dtype=dtype)
    pre_mask = tf.einsum('bmt,bm->bt', pre_mask, multiplicity_weights)
  else:
    pre_mask = tf.reduce_sum(pre_mask, 1)
  # Any position covered by at least one mask becomes 0; the rest stay 1.
  mask = tf.cast(1.0 - tf.cast(pre_mask > 0, dtype=dtype), dtype=dtype)

  if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
    mask = tf.cast(mask, p.fprop_dtype)

  return mask
def __init__(self, params):
  """Initializes the base class with `params` and sets the input batch size.

  Args:
    params: Passed unchanged to the parent class constructor.
  """
  super().__init__(params)
  # The input batch size is a constant scalar tensor with value 1.
  self._input_batch_size = tf.constant(1)
def AssignAnchors(self,
                  anchor_bboxes,
                  gt_bboxes,
                  gt_bboxes_labels,
                  gt_bboxes_mask,
                  foreground_assignment_threshold=0.5,
                  background_assignment_threshold=0.35,
                  background_class_id=0,
                  force_match=True,
                  similarity_fn=None):
  """Matches every anchor to its best ground-truth box (SSD-based).

  Each anchor is paired with its highest-scoring ground-truth box; one
  ground-truth box may be paired with several anchors. Every anchor then
  falls into one of three cases:

  - Foreground (score >= foreground_assignment_threshold): assigned_gt_labels
    is the label of the matched box, assigned_cls_mask is 1.0 and
    assigned_reg_mask is 1.0.
  - Background (score <= background_assignment_threshold): assigned_gt_labels
    is background_class_id, assigned_cls_mask is 1.0 and assigned_reg_mask is
    0.0.
  - Ignored (anything in between): assigned_gt_labels is background_class_id
    and both masks are 0.0.

  A detection loss would usually weight the classification loss by
  assigned_cls_mask (foreground and background only) and the regression loss
  by assigned_reg_mask (foreground only).

  Both thresholds should be tuned per dataset.

  TODO(jngiam): Consider having a separate threshold for regression boxes; a
  separate threshold is used in PointRCNN.

  Args:
    anchor_bboxes: tf.float32. [A, 7], where [..., :] corresponds to box
      parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes: tf.float32. [G, 7], where [..., :] corresponds to ground truth
      box parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes_labels: tensor with shape [G]. Ground truth labels for each
      bounding box.
    gt_bboxes_mask: tensor with shape [G]. Mask for ground truth boxes, 1 iff
      the gt_bbox is a real bbox.
    foreground_assignment_threshold: Anchors whose score is >= this value are
      foreground.
    background_assignment_threshold: Anchors whose score is <= this value are
      background.
    background_class_id: Label given to anchors that are not foreground.
    force_match: Boolean. When enabled, an anchor that is the top-scoring
      anchor for a real ground-truth box, and for which that box is also the
      anchor's top-scoring box, is treated as foreground as long as the score
      is > 0.
    similarity_fn: Function taking the anchor and ground-truth bboxes and
      returning an [A, G] matrix of non-negative similarity scores (greater
      means more similar), to which the thresholds are applied. Defaults to
      IOU2DRotatedBoxes when None.

  Returns:
    NestedMap with the following keys

    - assigned_gt_idx: shape [A] index of the assigned ground truth box, or
      -1 for anchors that are not foreground.
    - assigned_gt_bbox: shape [A, 7] bbox parameters assigned to each anchor.
    - assigned_gt_similarity_score: shape [A] score between the anchor and
      its matched gt bbox (0 when the matched box is padding).
    - assigned_gt_labels: shape [A] label assigned to bbox.
    - assigned_cls_mask: shape [A] classification loss mask; 1.0 for
      foreground or background anchors, else 0.0.
    - assigned_reg_mask: shape [A] regression loss mask; 1.0 for foreground
      anchors, else 0.0. Background anchors have no regression targets.
  """
  score_fn = self.IOU2DRotatedBoxes if similarity_fn is None else similarity_fn

  # Validate input shapes and read the anchor / ground-truth counts.
  anchor_bboxes = py_utils.HasShape(anchor_bboxes, [-1, 7])
  num_anchors, _ = py_utils.GetShape(anchor_bboxes, 2)
  gt_bboxes = py_utils.HasShape(gt_bboxes, [-1, 7])
  num_gt, _ = py_utils.GetShape(gt_bboxes, 2)

  # [A, G] pairwise similarity.
  scores = py_utils.HasShape(
      score_fn(anchor_bboxes, gt_bboxes), [num_anchors, num_gt])

  # Best ground-truth box (and its score) for every anchor.
  best_score = tf.reduce_max(scores, axis=1)
  best_gt_idx = tf.argmax(scores, axis=1)

  if force_match:
    # Best score for every ground-truth box, kept as [1, G] for broadcasting.
    best_score_per_gt = tf.reduce_max(scores, axis=0, keepdims=True)

    # A pair is force matched when each side is the other's top match, the
    # score is positive, and the ground-truth box is real (not padding).
    is_top_anchor_for_gt = tf.equal(scores, best_score_per_gt)
    is_top_gt_for_anchor = tf.equal(scores, best_score[..., tf.newaxis])
    has_positive_score = tf.greater(scores, 0.)
    is_real_gt = tf.cast(gt_bboxes_mask[tf.newaxis, ...], tf.bool)
    forced_pairs = (
        is_top_anchor_for_gt & is_top_gt_for_anchor & has_positive_score &
        is_real_gt)

    is_forced_anchor = tf.reduce_any(forced_pairs, axis=1)
    forced_gt_idx = tf.argmax(tf.cast(forced_pairs, tf.int32), axis=1)

    # Forced anchors take their index from the forced pair, so the later
    # gather picks the box that triggered the force match.
    best_gt_idx = tf.where(is_forced_anchor, forced_gt_idx, best_gt_idx)

  # Zero the score of anchors whose best match is a padded box.
  matched_gt_mask = tf.array_ops.batch_gather(gt_bboxes_mask, best_gt_idx)
  best_score = tf.where(
      tf.equal(matched_gt_mask, 1), best_score, tf.zeros_like(best_score))

  # Threshold into background / foreground.
  is_background = tf.less_equal(best_score, background_assignment_threshold)
  is_foreground = tf.greater_equal(best_score,
                                   foreground_assignment_threshold)
  if force_match:
    # Forced anchors are always foreground and never background.
    is_background = tf.logical_and(is_background,
                                   tf.logical_not(is_forced_anchor))
    is_foreground = tf.logical_or(is_foreground, is_forced_anchor)

  # Append a dummy background box so that non-foreground anchors have
  # something to gather; its index equals the original number of gt boxes.
  dummy_bbox = tf.constant([[0, 0, 0, 1, 1, 1, 0]], dtype=tf.float32)
  dummy_idx = tf.cast(py_utils.GetShape(gt_bboxes, 1)[0], tf.int64)
  gt_bboxes = tf.concat([gt_bboxes, dummy_bbox], axis=0)
  gt_bboxes_labels = tf.concat([gt_bboxes_labels, [background_class_id]],
                               axis=0)

  # Foreground anchors gather their matched box, everything else the dummy.
  gather_idx = tf.where(is_foreground, best_gt_idx,
                        tf.ones_like(best_gt_idx) * dummy_idx)
  assigned_gt_bbox = tf.array_ops.batch_gather(gt_bboxes, gather_idx)
  assigned_gt_labels = tf.array_ops.batch_gather(gt_bboxes_labels, gather_idx)

  # Loss masks.
  assigned_cls_mask = tf.cast(
      tf.logical_or(is_background, is_foreground), tf.float32)
  assigned_reg_mask = tf.cast(is_foreground, tf.float32)

  # Report the dummy box as index -1.
  assigned_gt_idx = tf.where(
      tf.equal(gather_idx, dummy_idx),
      tf.ones_like(gather_idx) * -1, gather_idx)
  assigned_gt_idx = tf.cast(assigned_gt_idx, tf.int32)

  return py_utils.NestedMap(
      assigned_gt_idx=assigned_gt_idx,
      assigned_gt_bbox=assigned_gt_bbox,
      assigned_gt_similarity_score=best_score,
      assigned_gt_labels=assigned_gt_labels,
      assigned_cls_mask=assigned_cls_mask,
      assigned_reg_mask=assigned_reg_mask)
def BeamSearchDecode(self,
                     theta,
                     encoder_outputs,
                     num_hyps_per_beam_override=0,
                     init_beam_search_state=None,
                     pre_beam_search_step_callback=None,
                     post_beam_search_step_callback=None,
                     max_steps=None):
  """Performs beam-search based decoding.

  Args:
    theta: A NestedMap object containing weights' values of the decoder layer
      and its children layers.
    encoder_outputs: A NestedMap containing encoder outputs to be passed to
      the callbacks. Mostly opaque to BeamSearchHelper, except that it should
      contain either a 'seq_lengths' field of shape [source_batch_size] or a
      'padding' field of shape [source_max_lengths, source_batch_size].
    num_hyps_per_beam_override: If set to a value <= 0, this parameter is
      ignored. If set to a value > 0, then this value will be used to override
      `p.num_hyps_per_beam`.
    init_beam_search_state: The `InitBeamSearchState` callback. Please refer
      to the class header comments for more details. Although the default is
      None, this callback is invoked unconditionally, so callers must supply
      it.
    pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    post_beam_search_step_callback: The `PostBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    max_steps: maximum beam search steps. If None, use
      self.params.target_seq_len.

  Returns:
    A `BeamSearchDecodeOutput` built from the top-k hypotheses, their ids,
    lengths and scores, a `None` in the fifth position, and the final
    "other" states returned by the loop.
  """
  p = self.params
  num_hyps_per_beam = p.num_hyps_per_beam
  if num_hyps_per_beam_override > 0:
    num_hyps_per_beam = num_hyps_per_beam_override
  if max_steps is None:
    max_steps = p.target_seq_len

  initial_results, other_states = init_beam_search_state(
      theta, encoder_outputs, num_hyps_per_beam)

  # The total number of hypotheses is read from the leading dimension of the
  # initial log probs; the number of beams follows from it.
  num_hyps = tf.shape(initial_results.log_probs)[0]
  num_beams = num_hyps // num_hyps_per_beam

  # Use the caller-provided initial step ids when present; otherwise start
  # every hypothesis from p.target_sos_id.
  if 'step_ids' in initial_results:
    # [num_hyps, 1]
    step_ids = tf.ensure_shape(initial_results.step_ids, [None, 1])
  else:
    step_ids = tf.fill([num_hyps, 1],
                       tf.constant(p.target_sos_id, dtype=tf.int32))

  # Initial loop state. best_scores starts at a very negative sentinel; all
  # other buffers start zeroed.
  min_score = -1e36
  best_scores = (tf.zeros(shape=[num_beams], dtype=p.dtype) + min_score)
  cumulative_scores = tf.zeros(shape=[num_hyps], dtype=p.dtype)
  in_scores = tf.zeros([max_steps, num_hyps], dtype=p.dtype)
  in_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
  in_prev_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
  in_done_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.string)
  bs_atten_probs = tf.zeros(
      [max_steps, num_hyps,
       tf.shape(initial_results.atten_probs)[1]],
      dtype=p.dtype)
  beam_done = tf.zeros([num_beams], dtype=tf.bool)
  cur_step = tf.constant(0, dtype=tf.int32)
  all_done = tf.constant(False, dtype=tf.bool)
  # NOTE: the position of in_done_hyps (index 5) in this tuple is relied on
  # below when reading the loop result.
  core_bs_states = (best_scores, cumulative_scores, in_scores, in_hyps,
                    in_prev_hyps, in_done_hyps, bs_atten_probs, beam_done)

  def LoopContinue(cur_step, all_done, unused_step_ids, unused_core_bs_states,
                   unused_other_states_list):
    # Keep going until max_steps is reached or all_done is set.
    return tf.math.logical_and(cur_step < max_steps,
                               tf.math.logical_not(all_done))

  def LoopBody(cur_step, unused_all_done, step_ids, core_bs_states,
               other_states_list):
    # The "other" states travel through the loop as a flat list; they are
    # re-packed into the structure of other_states for the step and
    # flattened again on the way out.
    (cur_step, all_done, new_step_ids, new_bs_states,
     new_other_states) = self._BeamSearchStep(
         theta, encoder_outputs, cur_step, step_ids, core_bs_states,
         other_states.Pack(other_states_list), num_hyps_per_beam,
         pre_beam_search_step_callback, post_beam_search_step_callback)
    return (cur_step, all_done, new_step_ids, new_bs_states,
            new_other_states.Flatten())

  flat_other_states = other_states.Flatten()
  _, _, _, final_bs_states, flat_final_other_states = tf.while_loop(
      LoopContinue,
      LoopBody,
      loop_vars=(cur_step, all_done, step_ids, core_bs_states,
                 flat_other_states),
      parallel_iterations=10,
      back_prop=False,
      swap_memory=False,
      shape_invariants=(tf.TensorShape(cur_step.get_shape()),
                        tf.TensorShape(all_done.get_shape()),
                        tf.TensorShape(step_ids.get_shape()),
                        _GetShapes(core_bs_states),
                        _GetShapes(flat_other_states, none_shapes=True)))
  # Index 5 of the core states is in_done_hyps: [max_steps, num_hyps].
  final_done_hyps = final_bs_states[5]
  final_other_states = other_states.Pack(flat_final_other_states)

  # Assume that `padding` has shape [source_max_lengths, source_batch_size]
  # by default, and compute `encoded_seq_lengths` accordingly. This can be
  # overridden by directly passing `seq_lengths` in the `encoder_outputs`
  # NestedMap.
  encoded_seq_lengths = getattr(encoder_outputs, 'seq_lengths', None)
  if encoded_seq_lengths is None:
    source_paddings = encoder_outputs.padding
    if isinstance(source_paddings, py_utils.NestedMap):
      # For a NestedMap of paddings only the first flattened entry is used.
      encoded_seq_lengths = tf.cast(
          tf.round(
              tf.reduce_sum(
                  1.0 - tf.transpose(source_paddings.Flatten()[0]), 1)),
          tf.int32)
    else:
      encoded_seq_lengths = tf.cast(
          tf.round(
              tf.reduce_sum(
                  1.0 - tf.cast(tf.transpose(source_paddings), tf.float32),
                  1)), tf.int32)

  # [num_beams, num_hyps_per_beam].
  topk_hyps = ops.top_k_terminated_hyps(
      final_done_hyps,
      encoded_seq_lengths,
      k=num_hyps_per_beam,
      num_hyps_per_beam=num_hyps_per_beam,
      length_normalization=p.length_normalization,
      coverage_penalty=p.coverage_penalty,
      target_seq_length_ratio=p.target_seq_length_ratio)
  # [num_beams * num_hyps_per_beam, ...].
  # A tensor-valued max_steps cannot be passed as max_seq_length, so 0 is
  # passed instead in that case.
  max_seq_length = 0 if isinstance(max_steps, tf.Tensor) else max_steps
  topk_ids, topk_lens, topk_scores = ops.unpack_hyp(
      tf.reshape(topk_hyps, [-1]), max_seq_length=max_seq_length)
  # [num_beams, num_hyps_per_beam].
  topk_scores = tf.reshape(topk_scores, tf.shape(topk_hyps))

  return BeamSearchDecodeOutput(topk_hyps, topk_ids, topk_lens, topk_scores,
                                None, final_other_states)
def testConv2DLayerStridedWithPaddingFProp(self, seq_len):
  """Check strided convs get the same values for different length dim.

  Builds a stride-[2, 2] `Conv2DLayerWithPadding` with a constant-1.0 3x3
  filter, feeds it three sequences with 5, 4 and 3 unpadded frames (padded
  out to `seq_len`), and compares the output and output padding to golden
  values.

  Args:
    seq_len: Length of the time dimension of the input. Golden outputs exist
      only for 5 and 6.

  Raises:
    ValueError: If the output comparison is reached with a `seq_len` that has
      no golden output (anything other than 5 or 6).
  """
  # TODO(isaace): THIS TEST SHOWS THAT THERE IS A BUG IN THE CODE.
  with self.session(use_gpu=True):
    batch_size = 3
    expected_seq_len = 3

    params = conv_layers.Conv2DLayerWithPadding.Params()
    params.weight_norm = False
    params.filter_stride = [2, 2]
    params.name = 'conv'
    params.filter_shape = [3, 3, 1, 1]
    params.params_init = py_utils.WeightInit.Constant(1.0)
    conv_layer = params.Instantiate()

    # Set up the padding for the sequence length. (starting at 5).
    in_padding = tf.constant([
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 1],
    ], tf.float32)
    in_padding = tf.pad(
        in_padding, [[0, 0], [0, seq_len - 5]], constant_values=1.0)

    # Frame t of every sequence holds the value t + 1 in all 3 columns.
    inputs = 1.0 + tf.tile(
        tf.reshape(tf.range(seq_len, dtype=tf.float32), [1, seq_len, 1, 1]),
        [batch_size, 1, 3, 1])
    inputs = py_utils.ApplyPadding(
        tf.reshape(in_padding, [batch_size, seq_len, 1, 1]), inputs)
    # [[[[1], [1], [1]], [[2], [2], [2]], [[3], [3], [3]], [[4], [4], [4]],
    #   [[5], [5], [5]], [[0], [0], [0]]],
    #  [[[1], [1], [1]], [[2], [2], [2]], [[3], [3], [3]], [[4], [4], [4]],
    #   [[0], [0], [0]], [[0], [0], [0]]],
    #  [[[1], [1], [1]], [[2], [2], [2]], [[3], [3], [3]], [[0], [0], [0]],
    #   [[0], [0], [0]], [[0], [0], [0]]]]
    inputs = py_utils.Debug(inputs)

    output, out_padding = conv_layer.FPropDefaultTheta(inputs, in_padding)

    output = py_utils.Debug(output)
    out_padding = py_utils.Debug(out_padding)

    self.evaluate(tf.global_variables_initializer())
    output, out_padding = self.evaluate([output, out_padding])

    self.assertEqual((batch_size, expected_seq_len, 2, 1), output.shape)
    self.assertAllClose([
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 1],
    ], out_padding)

    # This here shows a bug in the implementation; the output should be the
    # same. Also there are bugs with the output not having the correct
    # padding.
    if seq_len == 5:
      self.assertAllClose([
          [[[6], [6]], [[18], [18]], [[18], [18]]],
          [[[6], [6]], [[18], [18]], [[8], [8]]],
          [[[6], [6]], [[10], [10]], [[0], [0]]],
      ], output)
    elif seq_len == 6:
      self.assertAllClose([
          [[[12], [12]], [[24], [24]], [[10], [10]]],
          [[[12], [12]], [[14], [14]], [[0], [0]]],
          [[[12], [12]], [[6], [6]], [[0], [0]]],
      ], output)
    else:
      # The placeholder must be interpolated explicitly; a plain string
      # literal would print "{seq_len}" verbatim.
      raise ValueError(
          'Test does not handle length {seq_len}'.format(seq_len=seq_len))
def testCausalConv2DLayerStridedWithPaddingFProp(self, seq_len):
  """Check strided convs get the same values for different length dim.

  Runs a constant-weight causal conv (filter shape [3, 1, 1, 1], stride
  [2, 2]) over five sequences with unpadded lengths 5, 4, 3, 2 and 1, padded
  on the right up to `seq_len`, and checks output and padding against golden
  values.

  Args:
    seq_len: Length of the (padded) time dimension of the input.
  """
  # TODO(isaace): THIS TEST SHOWS THAT THERE IS A BUG WITH PADDING
  expected_out_padding = [
      [0, 0, 1],
      [0, 0, 1],
      [0, 1, 1],
      [0, 1, 1],
      [1, 1, 1],
  ]
  # NOTE: There is a bug in the output not being padded correctly.
  expected_output = [
      [[[1], [1]], [[6], [6]], [[12], [12]]],
      [[[1], [1]], [[6], [6]], [[7], [7]]],
      [[[1], [1]], [[6], [6]], [[3], [3]]],
      [[[1], [1]], [[3], [3]], [[0], [0]]],
      [[[1], [1]], [[1], [1]], [[0], [0]]],
  ]

  with self.session(use_gpu=True):
    batch_size = 5
    expected_seq_len = 3

    layer_p = conv_layers.CausalConv2DLayerWithPadding.Params().Set(
        weight_norm=False,
        filter_stride=[2, 2],
        name='conv',
        filter_shape=[3, 1, 1, 1],
        params_init=py_utils.WeightInit.Constant(1.0))
    layer = layer_p.Instantiate()

    # Set up the padding for the sequence length. (starting at 5).
    base_padding = tf.constant([
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 1, 1, 1, 1],
    ], tf.float32)
    extra_steps = seq_len - 5
    in_padding = tf.pad(
        base_padding, [[0, 0], [0, extra_steps]], constant_values=1.0)

    # Time step t (0-based) carries the value t + 1 in each of the 3 columns.
    time_ramp = tf.reshape(
        tf.range(seq_len, dtype=tf.float32), [1, seq_len, 1, 1])
    unmasked = 1.0 + tf.tile(time_ramp, [batch_size, 1, 3, 1])
    padding_4d = tf.reshape(in_padding, [batch_size, seq_len, 1, 1])
    inputs = py_utils.Debug(py_utils.ApplyPadding(padding_4d, unmasked))

    output, out_padding = layer.FPropDefaultTheta(inputs, in_padding)

    output = py_utils.Debug(output)
    out_padding = py_utils.Debug(out_padding)

    self.evaluate(tf.global_variables_initializer())
    output, out_padding = self.evaluate([output, out_padding])

    self.assertEqual((batch_size, expected_seq_len, 2, 1), output.shape)
    self.assertAllClose(expected_out_padding, out_padding)
    self.assertAllClose(expected_output, output)
def testTargetSequenceSamplerWithNumHypsPerBeam4(self, use_recurrent):
  """Checks sampled ids/lengths with num_hyps_per_beam=4 against goldens.

  Four sampler configurations are exercised in turn: the default one (with
  `use_recurrent` forwarded), `top_k=1`, `top_k=5` and `temperature=0.2`.

  Args:
    use_recurrent: Forwarded to the sampler params of the first
      configuration only.
  """
  with self.session(use_gpu=False):
    np.random.seed(9384758)
    tf.random.set_seed(8274758)
    vocab_size, src_len, tgt_len, batch_size = 12, 5, 7, 2

    def _InitCallback(unused_theta, unused_encoder_outputs,
                      num_hyps_per_beam):
      self.assertEqual(4, num_hyps_per_beam)
      zero_logits = tf.zeros((batch_size * num_hyps_per_beam, vocab_size),
                             dtype=tf.float32)
      return (py_utils.NestedMap(log_probs=zero_logits),
              py_utils.NestedMap(step=tf.constant(0)))

    def _PreStepCallback(unused_theta, unused_encoder_outputs,
                         unused_step_ids, states, num_hyps_per_beam):
      self.assertEqual(4, num_hyps_per_beam)
      step_logits = tf.random.stateless_normal(
          [batch_size * num_hyps_per_beam, vocab_size], seed=[8273747, 9])
      return (py_utils.NestedMap(log_probs=step_logits),
              py_utils.NestedMap(step=states.step + 1))

    def _PostStepCallback(unused_theta, unused_encoder_outputs,
                          unused_new_step_ids, states):
      return states

    encoder_outputs = py_utils.NestedMap(
        encoded=tf.random.stateless_normal([src_len, batch_size, 8],
                                           seed=[982774838, 9]),
        padding=tf.constant(
            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
            dtype=tf.float32))
    theta = py_utils.NestedMap()
    random_seed = tf.constant(123)

    def _SampleAndCheck(expected_ids, expected_lens, **sampler_overrides):
      """Builds a sampler, samples once and compares against the goldens."""
      sampler_p = target_sequence_sampler.TargetSequenceSampler.Params().Set(
          name='bsh',
          target_seq_len=tgt_len,
          num_hyps_per_beam=4,
          **sampler_overrides)
      sampler = sampler_p.Instantiate()
      decoder_output = sampler.Sample(theta, encoder_outputs, random_seed,
                                      _InitCallback, _PreStepCallback,
                                      _PostStepCallback)
      ids, lens = self.evaluate([
          decoder_output.ids,
          tf.reduce_sum(1 - decoder_output.paddings, 1),
      ])
      print(np.array_repr(ids))
      print(np.array_repr(lens))
      self.assertAllEqual(expected_ids, ids)
      self.assertAllEqual(expected_lens, lens)

    # Default sampling.
    _SampleAndCheck(
        expected_ids=[[9, 0, 2, 2, 2, 2, 2],
                      [0, 0, 11, 8, 1, 0, 7],
                      [10, 4, 8, 4, 9, 3, 7],
                      [8, 9, 10, 3, 4, 2, 2],
                      [11, 9, 7, 9, 8, 7, 11],
                      [1, 4, 2, 2, 2, 2, 2],
                      [2, 2, 2, 2, 2, 2, 2],
                      [9, 3, 6, 9, 6, 2, 2]],
        expected_lens=[3, 7, 7, 6, 7, 3, 1, 6],
        use_recurrent=use_recurrent)

    # top_k=1.
    _SampleAndCheck(
        expected_ids=[[0, 0, 0, 0, 0, 0, 0],
                      [7, 7, 7, 7, 7, 7, 7],
                      [7, 7, 7, 7, 7, 7, 7],
                      [0, 0, 0, 0, 0, 0, 0],
                      [8, 8, 8, 8, 8, 8, 8],
                      [10, 10, 10, 10, 10, 10, 10],
                      [2, 2, 2, 2, 2, 2, 2],
                      [6, 6, 6, 6, 6, 6, 6]],
        expected_lens=[7, 7, 7, 7, 7, 7, 1, 7],
        top_k=1)

    # top_k=5.
    _SampleAndCheck(
        expected_ids=[[5, 0, 0, 0, 8, 0, 6],
                      [7, 7, 10, 0, 7, 7, 0],
                      [11, 7, 11, 7, 11, 7, 10],
                      [3, 4, 4, 9, 1, 9, 1],
                      [10, 11, 9, 11, 9, 9, 10],
                      [10, 2, 2, 2, 2, 2, 2],
                      [2, 2, 2, 2, 2, 2, 2],
                      [9, 6, 1, 9, 5, 6, 10]],
        expected_lens=[7, 7, 7, 7, 7, 2, 1, 7],
        top_k=5)

    # temperature=0.2.
    _SampleAndCheck(
        expected_ids=[[0, 0, 0, 0, 0, 0, 9],
                      [0, 0, 11, 7, 1, 0, 7],
                      [7, 7, 7, 7, 7, 6, 7],
                      [0, 0, 3, 0, 0, 0, 0],
                      [9, 8, 8, 8, 8, 8, 9],
                      [2, 2, 2, 2, 2, 2, 2],
                      [2, 2, 2, 2, 2, 2, 2],
                      [6, 5, 6, 6, 6, 1, 6]],
        expected_lens=[7, 7, 7, 7, 7, 1, 1, 7],
        temperature=0.2)
def InitBeamSearchCallBack(unused_theta, unused_encoder_outputs,
                           num_hyps_per_beam):
  """Returns all-zero initial log-probs and a step counter starting at 0.

  Also asserts that exactly one hypothesis per beam is requested. `self`,
  `batch_size` and `vocab_size` are read from the enclosing scope.
  """
  self.assertEqual(1, num_hyps_per_beam)
  initial_results = py_utils.NestedMap(
      log_probs=tf.zeros((batch_size, vocab_size), dtype=tf.float32))
  initial_states = py_utils.NestedMap(step=tf.constant(0))
  return (initial_results, initial_states)
def testTargetSequenceSamplerWithEOC(self, use_recurrent):
  """Checks sampling with an end-of-chunk id against golden ids/lengths.

  The same goldens are verified twice: first with `use_recurrent` forwarded
  to the sampler params, then with `use_stop_fn=True`.

  Args:
    use_recurrent: Forwarded to the sampler params of the first run only.
  """
  with self.session(use_gpu=False):
    np.random.seed(9384758)
    tf.random.set_seed(8274758)
    vocab_size, src_len, tgt_len, batch_size = 4, 5, 20, 2

    # NOTE: `p` is rebound further down; the step callbacks read it at call
    # time, so they always see the params of the sampler currently running.
    p = target_sequence_sampler.TargetSequenceSampler.Params().Set(
        name='bsh',
        target_seq_len=tgt_len,
        target_eoc_id=0,
        use_recurrent=use_recurrent)
    seq_sampler = p.Instantiate()

    def _InitCallback(unused_theta, unused_encoder_outputs,
                      num_hyps_per_beam):
      self.assertEqual(1, num_hyps_per_beam)
      initial_results = py_utils.NestedMap(
          log_probs=tf.zeros((batch_size, vocab_size), dtype=tf.float32),
          is_last_chunk=tf.constant(False, shape=[batch_size]))
      initial_states = py_utils.NestedMap(
          step=tf.constant(0),
          src_step=tf.zeros([batch_size], dtype=tf.int32))
      return initial_results, initial_states

    def _PreStepCallback(unused_theta, unused_encoder_outputs,
                         unused_step_ids, states, num_hyps_per_beam):
      self.assertEqual(1, num_hyps_per_beam)
      noise = tf.random.stateless_normal([batch_size, vocab_size],
                                         seed=[8273747, 9])
      # Make it never predict <eos>.
      eos_penalty = tf.one_hot([p.target_eos_id], vocab_size, 1e30)
      logits = noise - eos_penalty
      on_last_chunk = tf.equal(states.src_step, src_len - 1)
      return (py_utils.NestedMap(
          log_probs=logits, is_last_chunk=on_last_chunk), states)

    def _PostStepCallback(unused_theta, unused_encoder_outputs, new_step_ids,
                          states):
      next_step = states.step + 1
      # Advance the source position whenever an end-of-chunk id was emitted.
      emitted_eoc = tf.cast(
          tf.equal(new_step_ids, p.target_eoc_id), dtype=tf.int32)
      return py_utils.NestedMap(
          step=next_step, src_step=states.src_step + emitted_eoc)

    encoder_outputs = py_utils.NestedMap(
        encoded=tf.random.stateless_normal([src_len, batch_size, 8],
                                           seed=[982774838, 9]),
        padding=tf.constant(
            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
            dtype=tf.float32))
    theta = py_utils.NestedMap()
    random_seed = tf.constant(123)

    golden_ids = [
        [0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [0, 0, 3, 3, 1, 0, 3, 0, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
    ]
    golden_lens = [5, 11]

    def _SampleAndCheck(sampler):
      """Samples once with `sampler` and compares against the goldens."""
      decoder_output = sampler.Sample(theta, encoder_outputs, random_seed,
                                      _InitCallback, _PreStepCallback,
                                      _PostStepCallback)
      ids, lens = self.evaluate([
          decoder_output.ids,
          tf.reduce_sum(1 - decoder_output.paddings, 1),
      ])
      print(np.array_repr(ids))
      print(np.array_repr(lens))
      self.assertAllEqual(golden_ids, ids)
      self.assertAllEqual(golden_lens, lens)

    _SampleAndCheck(seq_sampler)

    # Now do the same, except with use_stop_fn=True.
    p = target_sequence_sampler.TargetSequenceSampler.Params().Set(
        name='bsh', target_seq_len=tgt_len, target_eoc_id=0, use_stop_fn=True)
    seq_sampler = p.Instantiate()
    _SampleAndCheck(seq_sampler)
def _runBeamSearchOpHelper(self,
                           hyp_size,
                           num_beams,
                           seq_len,
                           init_best_score,
                           probs,
                           init_atten_probs,
                           atten_probs,
                           beam_size=3.0,
                           ensure_full_beam=False,
                           force_eos_in_last_step=False,
                           local_eos_threshold=-100.0,
                           independence=True,
                           use_v2=True):
  """Runs the beam search step op once per entry in `probs`.

  All beam search state starts out zeroed (with `best_scores` offset by
  `init_best_score`); the state returned by each step is fed into the next
  one and the final state is evaluated in a CPU session.

  Args:
    hyp_size: Total number of hypotheses across all beams (python int).
    num_beams: Number of beams (python int).
    seq_len: Leading dimension of the `scores`, `hyps`, `prev_hyps` and
      `done_hyps` state tensors.
    init_best_score: Value added to the zero-initialized `best_scores`.
    probs: Iterable of per-step inputs; one op invocation is built for each
      entry, with the entry's index passed as the step number.
    init_atten_probs: Passed unchanged to every step.
    atten_probs: Initial attention-probs state; replaced by each step's
      output.
    beam_size: Forwarded to the op.
    ensure_full_beam: Forwarded to the op.
    force_eos_in_last_step: Forwarded to the op.
    local_eos_threshold: Forwarded to the op.
    independence: Forwarded as `beam_independence`; only used when `use_v2`
      is True.
    use_v2: If True use `ops.beam_search_step`, otherwise
      `ops.beam_search_step_deprecated`.

  Returns:
    The evaluated tuple (best_scores, cumulative_scores, scores, hyps,
    prev_hyps, done_hyps, atten_probs, done, beam_done). With
    `use_v2=False` the op does not produce `beam_done`, so the initial
    all-False value is returned for it.
  """
  eos_id = 2
  # Floor division keeps this an int; `/` would hand a float to the op's
  # num_hyps_per_beam argument.
  num_hyps_per_beam = hyp_size // num_beams
  best_scores = tf.zeros([num_beams])
  cumulative_scores = tf.zeros([hyp_size])
  scores = tf.zeros([seq_len, hyp_size])
  hyps = tf.zeros([seq_len, hyp_size], dtype=tf.int32)
  prev_hyps = tf.zeros([seq_len, hyp_size], dtype=tf.int32)
  done_hyps = tf.constant('', shape=[seq_len, hyp_size], dtype=tf.string)
  best_scores += init_best_score
  beam_done = tf.zeros([num_beams], dtype=tf.bool)

  for i, prob in enumerate(probs):
    if use_v2:
      (best_scores, cumulative_scores, scores, hyps, prev_hyps, done_hyps,
       atten_probs, beam_done, done) = ops.beam_search_step(
           prob,
           init_atten_probs,
           best_scores,
           cumulative_scores,
           scores,
           hyps,
           prev_hyps,
           done_hyps,
           atten_probs,
           beam_done, [],
           i,
           eos_id=eos_id,
           beam_size=beam_size,
           ensure_full_beam=ensure_full_beam,
           num_hyps_per_beam=num_hyps_per_beam,
           valid_eos_max_logit_delta=0.1,
           force_eos_in_last_step=force_eos_in_last_step,
           local_eos_threshold=local_eos_threshold,
           beam_independence=independence)
    else:
      (best_scores, cumulative_scores, scores, hyps, prev_hyps, done_hyps,
       atten_probs, done) = ops.beam_search_step_deprecated(
           prob,
           init_atten_probs,
           best_scores,
           cumulative_scores,
           scores,
           hyps,
           prev_hyps,
           done_hyps,
           atten_probs, [],
           i,
           eos_id=eos_id,
           beam_size=beam_size,
           ensure_full_beam=ensure_full_beam,
           num_hyps_per_beam=num_hyps_per_beam,
           valid_eos_max_logit_delta=0.1,
           force_eos_in_last_step=force_eos_in_last_step,
           local_eos_threshold=local_eos_threshold)

  with self.session(use_gpu=False):
    (best_scores, cumulative_scores, scores, hyps, prev_hyps, done_hyps,
     atten_probs, done, beam_done) = self.evaluate([
         best_scores, cumulative_scores, scores, hyps, prev_hyps, done_hyps,
         atten_probs, done, beam_done
     ])

  return (best_scores, cumulative_scores, scores, hyps, prev_hyps, done_hyps,
          atten_probs, done, beam_done)
def testForwardPass(self):
  """Runs the transformer encoder forward and compares against goldens.

  The encoded output and the embedded inputs are each summed over axis 0 and
  compared with golden values; the shapes of the encoded output and of the
  padding are checked as well.
  """
  # pyformat: disable
  # pylint: disable=bad-whitespace
  golden_enc_out_sum = [[
      49.45291519, -31.5743885, 39.43684387, -47.67513275, 35.39754105,
      14.41970444, 29.58752823, -43.06747055, 24.09403419, -7.62717247,
      18.48112106, 20.42408371, 5.1519866, -19.66542244, 29.81095314,
      56.90407944
  ], [
      55.26333618, -30.39743614, 29.68314743, -37.61392975, 43.02292252,
      13.88345146, 15.73033905, -24.68696213, 24.70776558, -29.18026161,
      15.41469955, 27.77672577, -5.36326742, -22.78984642, 22.15843391,
      22.7237072
  ]]
  golden_emb_out_sum = [[
      3.11785889, 1.33086884, -1.96904886, -4.81911993, 1.25389254,
      1.52582073, 0.79906291, 4.07078457, -1.20546532, -2.97308111,
      0.22460097, 2.99702668, -2.29453254, 6.06631422, 1.68836212,
      5.35728741
  ], [
      1.41723049, -1.39409399, -1.49569404, -0.24654561, 1.09658146,
      4.51638842, 2.72023368, -0.45651400, 3.46091199, -0.43925080,
      1.02091551, 3.89704037, 1.87841535, -0.27947778, -0.91630745,
      1.34230828
  ]]
  # pylint: enable=bad-whitespace
  # pyformat: enable

  with self.session(use_gpu=False) as sess:
    batch_size, seq_len = 2, 21
    tf.set_random_seed(8372749040)
    enc_params = self._EncoderParams()
    mt_enc = encoder.TransformerEncoder(enc_params)

    token_ids = np.random.randint(
        low=0, high=63, size=[batch_size, seq_len], dtype=np.int32)
    batch = py_utils.NestedMap(
        ids=tf.constant(token_ids),
        paddings=tf.zeros([batch_size, seq_len]))

    out = mt_enc.FPropDefaultTheta(batch)
    enc_out_sum = tf.reduce_sum(out.encoded, 0)
    emb_out_sum = tf.reduce_sum(out.embedded_inputs, 0)
    enc_padding = out.padding

    tf.global_variables_initializer().run()
    fetched = sess.run([out.encoded, enc_out_sum, emb_out_sum, enc_padding])
    (actual_enc_out, actual_enc_out_sum, actual_emb_out_sum,
     actual_padding) = fetched

    self.assertAllEqual(actual_enc_out.shape,
                        [seq_len, batch_size, enc_params.model_dim])
    self.assertAllEqual(actual_padding.shape, [seq_len, batch_size])
    self.assertAllClose(
        golden_enc_out_sum, actual_enc_out_sum, rtol=1e-05, atol=1e-05)
    self.assertAllClose(
        golden_emb_out_sum, actual_emb_out_sum, rtol=1e-05, atol=1e-05)
def testCustomStepIds(self):
  """Checks beam search when the init callback supplies its own step_ids.

  The initial results include an all-zero `step_ids` tensor; the decoded
  top-k ids, lengths and scores are compared with golden values.
  """
  with self.session(use_gpu=False) as sess:
    np.random.seed(9384758)
    tf.set_random_seed(8274758)
    vocab_size, src_len, tgt_len = 12, 5, 7
    num_hyps_per_beam = 3
    src_batch_size = 2
    tgt_batch_size = src_batch_size * num_hyps_per_beam

    helper_p = beam_search_helper.BeamSearchHelper.Params().Set(
        name='bsh', target_seq_len=tgt_len)
    bs_helper = helper_p.Instantiate()

    def _InitState(unused_theta, unused_encoder_outputs,
                   unused_num_hyps_per_beam):
      atten_probs = tf.constant(
          np.random.normal(size=(tgt_batch_size, src_len)), dtype=tf.float32)
      initial_results = py_utils.NestedMap(
          log_probs=tf.zeros([tgt_batch_size, vocab_size]),
          atten_probs=atten_probs,
          step_ids=tf.zeros([tgt_batch_size, 1], dtype=tf.int32))
      return initial_results, py_utils.NestedMap(atten_probs=atten_probs)

    def _PreStep(unused_theta, unused_encoder_outputs, unused_step_ids,
                 states, unused_num_hyps_per_beam):
      atten_probs = tf.identity(states.atten_probs)
      logits = tf.random_normal([tgt_batch_size, vocab_size], seed=8273747)
      step_results = py_utils.NestedMap(
          atten_probs=atten_probs, log_probs=logits)
      return step_results, states

    def _PostStep(unused_theta, unused_encoder_outputs, unused_new_step_ids,
                  states):
      return states

    encoder_outputs = py_utils.NestedMap(
        encoded=tf.random_normal([src_len, src_batch_size, 8],
                                 seed=982774838),
        padding=tf.constant(
            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
            dtype=tf.float32))

    theta = py_utils.NestedMap()
    decoder_output = bs_helper.BeamSearchDecode(theta, encoder_outputs,
                                                num_hyps_per_beam, _InitState,
                                                _PreStep, _PostStep)

    topk_ids, topk_lens, topk_scores = sess.run([
        decoder_output.topk_ids, decoder_output.topk_lens,
        decoder_output.topk_scores
    ])
    for fetched in (topk_ids, topk_lens, topk_scores):
      print(np.array_repr(fetched))

    golden_topk_ids = [
        [4, 3, 4, 3, 2, 0, 0],
        [4, 3, 11, 2, 0, 0, 0],
        [4, 3, 6, 2, 0, 0, 0],
        [6, 0, 4, 6, 6, 11, 2],
        [6, 0, 4, 6, 1, 2, 0],
        [6, 0, 4, 6, 6, 2, 0],
    ]
    golden_topk_lens = [5, 4, 4, 7, 6, 6]
    golden_topk_scores = [
        [8.27340603, 6.26949024, 5.59490776],
        [9.74691486, 8.46679497, 7.14809656],
    ]
    self.assertEqual(golden_topk_ids, topk_ids.tolist())
    self.assertEqual(golden_topk_lens, topk_lens.tolist())
    self.assertAllClose(golden_topk_scores, topk_scores)
def Pos(x):
  """Returns `x` clamped elementwise from below at 1e-8 (in x's dtype)."""
  lower_bound = tf.constant(1e-8, x.dtype)
  return tf.maximum(lower_bound, x)
def testGreedySearchHelper(self):
  """Runs GreedySearchHelper end to end and compares against goldens.

  The decoded ids, lengths and done flags for a source batch of two are
  checked against golden values.
  """
  with self.session(use_gpu=False) as sess:
    np.random.seed(9384758)
    tf.set_random_seed(8274758)
    vocab_size, src_len, tgt_len = 12, 5, 7
    src_batch_size = 2
    tgt_batch_size = src_batch_size

    helper_p = beam_search_helper.GreedySearchHelper.Params().Set(
        name='gsh', target_seq_len=tgt_len)
    gs_helper = helper_p.Instantiate()

    def _InitState(unused_theta, unused_encoder_outputs,
                   unused_num_hyps_per_beam):
      atten_probs = tf.constant(
          np.random.normal(size=(tgt_batch_size, src_len)), dtype=tf.float32)
      initial_results = py_utils.NestedMap(
          log_probs=tf.zeros([tgt_batch_size, vocab_size]),
          atten_probs=atten_probs)
      return initial_results, py_utils.NestedMap(atten_probs=atten_probs)

    def _PreStep(unused_theta, unused_encoder_outputs, unused_step_ids,
                 states, unused_num_hyps_per_beam):
      atten_probs = tf.identity(states.atten_probs)
      logits = tf.random_normal([tgt_batch_size, vocab_size], seed=8273747)
      step_results = py_utils.NestedMap(
          atten_probs=atten_probs, log_probs=logits)
      return step_results, states

    def _PostStep(unused_theta, unused_encoder_outputs, unused_new_step_ids,
                  states):
      return states

    encoder_outputs = py_utils.NestedMap(
        encoded=tf.random_normal([src_len, src_batch_size, 8],
                                 seed=982774838),
        padding=tf.constant(
            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
            dtype=tf.float32))

    theta = py_utils.NestedMap()
    decode_tensors = gs_helper.GreedySearchDecode(theta, encoder_outputs,
                                                  _InitState, _PreStep,
                                                  _PostStep)
    (final_hyp_ids, final_hyp_lens, final_done_hyps) = decode_tensors

    (final_hyp_ids, final_hyp_lens, final_done_hyps) = sess.run(
        [final_hyp_ids, final_hyp_lens, final_done_hyps])

    for fetched in (final_hyp_ids, final_hyp_lens, final_done_hyps):
      print(np.array_repr(fetched))

    golden_hyp_ids = [[2, 2, 6, 7, 1, 9, 4], [3, 9, 3, 9, 6, 5, 10]]
    golden_hyp_lens = [1, 7]
    golden_done_hyps = [True, False]
    self.assertEqual(golden_hyp_ids, final_hyp_ids.tolist())
    self.assertEqual(golden_hyp_lens, final_hyp_lens.tolist())
    self.assertEqual(golden_done_hyps, final_done_hyps.tolist())
def testDecoderFPropWithAdapters(self):
  """Create decoder with adapters, and verify that FProp runs."""
  with self.session(use_gpu=False):
    tf.random.set_seed(8372749040)

    noise_cfg = py_utils.VariationalNoiseParams(None, True, False, seed=12345)
    dec_p = self._DecoderParams(num_rnn_layers=2, vn_config=noise_cfg)
    dec_p.rnn_cell_dim = 3
    dec_p.adapter_layer_tpl.Set(
        bottleneck_dim=4,
        num_tasks=16,
        projection_params_init=py_utils.WeightInit.Gaussian(0.01))
    dec_p.adapter_task_id_field = 'domain_ids'
    dec = dec_p.Instantiate()

    fprop_dtype = py_utils.FPropDtype(dec_p)

    src_seq_len = 5
    encoder_outputs = py_utils.NestedMap(
        encoded=tf.random.normal([src_seq_len, 2, 8],
                                 seed=982774838,
                                 dtype=fprop_dtype),
        padding=tf.constant(
            [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
            dtype=fprop_dtype),
        domain_ids=tf.constant(np.random.randint(low=0, high=16, size=[2])))

    def _TimeToBatchMajor(rows, dtype):
      """Builds a constant from time-major `rows` and transposes it."""
      return tf.transpose(tf.constant(rows, dtype=dtype))

    # shape=[4, 5]
    target_ids = _TimeToBatchMajor(
        [[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15], [5, 6, 7, 8],
         [10, 5, 2, 5]], tf.int32)
    # shape=[4, 5]
    target_labels = _TimeToBatchMajor(
        [[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13], [5, 7, 8, 10],
         [10, 5, 2, 4]], tf.int32)
    # shape=[4, 5]
    target_paddings = _TimeToBatchMajor(
        [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
         [1, 1, 1, 0]], fprop_dtype)
    target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings

    # ids/labels/weights/paddings are all in [batch, time] shape.
    targets = py_utils.NestedMap(
        ids=target_ids,
        labels=target_labels,
        weights=target_weights,
        paddings=target_paddings,
        transcripts=target_transcripts)

    decoder_outputs = dec.FPropDefaultTheta(encoder_outputs, targets)
    metrics = decoder_outputs.metrics
    per_sequence_loss = decoder_outputs.per_sequence['loss']

    self.assertIn('fraction_of_correct_next_step_preds', metrics)
    self.evaluate(tf.global_variables_initializer())
    metrics_val, per_sequence_loss_val = self.evaluate(
        [metrics, per_sequence_loss])
    tf.logging.info('metrics=%s, per_sequence_loss=%s', metrics_val,
                    per_sequence_loss_val)
    self.assertEqual(metrics_val['loss'], metrics_val['log_pplx'])
    # Target batch size is 4. Therefore, we should expect 4 here.
    self.assertEqual(per_sequence_loss_val.shape, (4,))
def Value(self):
  """Returns `params.value` as a constant tensor of dtype `params.dtype`."""
  p = self.params
  return tf.constant(p.value, p.dtype)
def FarthestPointSampler(points,
                         padding,
                         num_sampled_points,
                         precomputed_squared_distance=None,
                         num_seeded_points=0,
                         random_seed=None):
  """Samples num_sampled_points from points using farthest point sampling.

  Algorithm:
  1. Start by selecting a random point and adding to a selected set.
  2. For all remaining points, find the furthest point from those selected.
  3. Add furthest point to selected.
  4. Repeat 2-3 until num_sampled_points are selected.

  More details at https://en.wikipedia.org/wiki/Farthest-first_traversal

  The output of this function can be used with tf.array_ops.batch_gather to
  extract the desired points, for example:
  tf.array_ops.batch_gather(points, sampled_idx)

  Args:
    points: floating point tf.Tensor of shape [N, P1, dims]
    padding: A floating point tf.Tensor of shape [N, P1] with 0 if the point
      is real, and 1 otherwise.
    num_sampled_points: integer number of points to sample. Must not exceed
      P1 (enforced with an assert op).
    precomputed_squared_distance: optional tf.Tensor of shape [N, P1, P1] of
      distances between each point. if None, distances will be computed on the
      fly.
    num_seeded_points: If num_seeded_points > 0, then the first
      num_seeded_points in points are considered to be seeded in the FPS
      sampling. Note that we assume that these points are *not* padded, and do
      not check padding when seeding them.
    random_seed: optional integer random seed to use with all the random ops.

  Returns:
    A tuple of tf.Tensors (sampled_idx, closest_idx) of types
    (tf.int32, tf.int32).

    sampled_idx is of shape [N, num_sampled_points] representing the indices
    selected using the sampler. Values are indices into the P1 input points,
    i.e. in the range [0, P1).

    closest_idx is of shape [N, P1] representing the indices of the closest
    sampled points for each input point. closest_idx is used in PCNN as part of
    the pooling operation: each point is assigned to the closest sampled point
    and a max is taken over them. Values are positions within the sampled
    set, i.e. in the range [0, num_sampled_points).
  """
  points = py_utils.HasRank(points, 3)
  batch_size, num_points, dims = py_utils.GetShape(points, 3)

  points = py_utils.with_dependencies(
      [py_utils.assert_greater_equal(num_points, num_sampled_points)], points)

  # Add a tiny bit of noise to the distance matrix or points so all
  # points are unique. This will also ensure true repeated points
  # like padded points are only selected after all valid points are selected.
  if precomputed_squared_distance is not None:
    precomputed_squared_distance = py_utils.HasShape(
        precomputed_squared_distance, [batch_size, num_points, num_points])
    precomputed_squared_distance += tf.random.uniform(
        (batch_size, num_points, 1),
        minval=1e-6,
        maxval=1e-5,
        dtype=tf.float32,
        seed=random_seed)
  else:
    points += tf.random.uniform((batch_size, num_points, dims),
                                minval=1e-6,
                                maxval=1e-5,
                                dtype=tf.float32,
                                seed=random_seed)

  # TensorArray to store the sampled indices in the loop.
  sampled_idx = tf.TensorArray(tf.int32, num_sampled_points)

  # Initialize distance_to_selected to inf for all points.
  distance_to_selected = float('inf') * tf.ones((batch_size, num_points))

  # For tracking the index to the closest selected point.
  closest_idx = tf.zeros((batch_size, num_points), dtype=tf.int32)

  # Current loop index counter.
  curr_idx = tf.constant(0, dtype=tf.int32)

  # Get number of valid points (1 is padded, so num_points - num_padded).
  num_valid_points = tf.cast(
      tf.cast(num_points, dtype=tf.float32) - tf.reduce_sum(padding, axis=1),
      dtype=tf.int32)

  def _BodyFn(curr_idx, distance_to_selected, sampled_idx, closest_idx):
    """Loop body for farthest point sampler."""

    def _GetRandomRealPoint():
      """Select the first point.

      For the first point, we want any random real (non padded) point, so we
      create a random values per point, and then set all padded ones to
      some large value (more than the maxval). We then take the min per batch
      element to get the first points.

      Returns:
        Tensor containing the index of a random point selected for each example
        in the batch.
      """
      random_values = tf.random.uniform((batch_size, num_points),
                                        minval=0,
                                        maxval=1,
                                        dtype=tf.float32,
                                        seed=random_seed)
      # Padded entries become padding * 10, which exceeds the uniform maxval
      # of 1 whenever padding is 1, so argmin never picks them while a real
      # point exists.
      random_values = tf.where(
          tf.equal(padding, 0.0), random_values, padding * 10)
      return tf.argmin(random_values, axis=1, output_type=tf.int32)

    def _GetFurthestPoint():
      """Get point that is furthest from those already selected.

      We also bias the sampling towards real points by setting the distance
      to padded points negative until we are out of real points.

      Returns:
        Tensor containing the index of the next farthest point selected for
        each example in the batch.
      """
      # Set padded points distance to negative so they aren't selected.
      padding_masked_distance_to_selected = tf.where(
          tf.equal(padding, 0.0), distance_to_selected, -1.0 * tf.ones(
              (batch_size, num_points), dtype=tf.float32))
      # But only do this when we still have valid points left.
      padding_masked_distance_to_selected = tf.where(
          tf.less(curr_idx, num_valid_points),
          padding_masked_distance_to_selected, distance_to_selected)
      return tf.argmax(
          padding_masked_distance_to_selected, axis=-1, output_type=tf.int32)

    def _GetSeededPoint():
      """Select a seeded point.

      Seeded points are assumed to be at the beginning of the original points.

      Returns:
        Tensor containing the index of the next seeded point to select for each
        example in the batch.
      """
      return tf.ones((batch_size,), dtype=tf.int32) * curr_idx

    # Select indices for this loop iteration.
    def _Seeded():
      # With seeding: take the seeds in order first, then farthest points.
      return tf.cond(
          tf.less(curr_idx, num_seeded_points), _GetSeededPoint,
          _GetFurthestPoint)

    def _Real():
      # Without seeding: random real point first, then farthest points.
      return tf.cond(
          tf.equal(curr_idx, 0), _GetRandomRealPoint, _GetFurthestPoint)

    new_selected = tf.cond(tf.greater(num_seeded_points, 0), _Seeded, _Real)
    sampled_idx = sampled_idx.write(curr_idx, new_selected)

    # Extract the distance to the latest point selected to update
    # distance_to_selected.
    new_selected_gather_idx = tf.stack([tf.range(batch_size), new_selected],
                                       axis=1)
    if precomputed_squared_distance is not None:
      new_distance = tf.gather_nd(precomputed_squared_distance,
                                  new_selected_gather_idx)
    else:
      new_points = tf.reshape(
          tf.gather_nd(points, new_selected_gather_idx), [batch_size, 1, dims])
      new_distance = tf.reshape(
          SquaredDistanceMatrix(points, new_points), [batch_size, num_points])

    is_newly_closest = tf.less(new_distance, distance_to_selected)
    distance_to_selected = tf.minimum(distance_to_selected, new_distance)

    # Track the index to the closest selected point.
    new_selected_tiled = tf.tile([[curr_idx]], [batch_size, num_points])
    closest_idx = tf.cond(
        tf.equal(curr_idx, 0),
        # At the first loop iteration, the init points are the closest.
        lambda: new_selected_tiled,
        # Otherwise, update with the new points based on the distances.
        lambda: tf.where(is_newly_closest, new_selected_tiled, closest_idx))
    return curr_idx + 1, distance_to_selected, sampled_idx, closest_idx

  _, _, sampled_idx, closest_idx = tf.while_loop(
      lambda curr_idx, *args: tf.less(curr_idx, num_sampled_points),
      _BodyFn,
      loop_vars=(curr_idx, distance_to_selected, sampled_idx, closest_idx),
      back_prop=False,
      maximum_iterations=num_sampled_points)

  sampled_idx = sampled_idx.stack()  # num_sampled_points x n
  sampled_idx = tf.transpose(sampled_idx, [1, 0])

  # Restore the static shape when both dimensions are known python ints.
  if isinstance(batch_size, int) and isinstance(num_sampled_points, int):
    sampled_idx.set_shape((batch_size, num_sampled_points))

  return sampled_idx, closest_idx
def GreedySearchDecode(self,
                       theta,
                       encoder_outputs,
                       init_beam_search_state=None,
                       pre_beam_search_step_callback=None,
                       post_beam_search_step_callback=None,
                       max_steps=None):
  """Performs greedy-search based decoding.

  Args:
    theta: A NestedMap object containing weights' values of the decoder layer
      and its children layers.
    encoder_outputs: A NestedMap containing encoder outputs to be passed to
      the callbacks.
    init_beam_search_state: The `InitBeamSearchState` callback. Please refer
      to the class header comments for more details.
    pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    post_beam_search_step_callback: The `PostBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    max_steps: maximum beam search steps. If None, use
      self.params.target_seq_len.

  Returns:
    A tuple (hyp_ids, hyp_lens, done_hyps). Note that num_hyps is same as
    src_batch_size.

      - hyp_ids: [num_hyps, max_step]. Hyps end with <eos> token if the <eos>
        token is encountered during search.
      - hyp_lens: [num_hyps].
      - done_hyps: [num_hyps], whether or not an eos is encountered.
  """
  p = self.params
  max_steps = p.target_seq_len if max_steps is None else max_steps

  # Greedy search always runs with a single hypothesis per beam.
  initial_results, other_states = init_beam_search_state(
      theta, encoder_outputs, 1)

  num_hyps = tf.shape(initial_results.log_probs)[0]

  if 'step_ids' not in initial_results:
    # No custom start ids were supplied: begin every hyp with <sos>.
    step_ids = tf.fill([num_hyps, 1],
                       tf.constant(p.target_sos_id, dtype=tf.int32))
  else:
    # [num_hyps, 1]
    step_ids = tf.ensure_shape(initial_results.step_ids, [None, 1])

  cur_step = tf.constant(0, dtype=tf.int32)
  done_hyps = inplace_ops.empty(
      shape=[num_hyps], dtype=tf.bool, init=True, name='done_hyps')
  hyp_lens = inplace_ops.empty(
      shape=[num_hyps], dtype=tf.int32, init=True, name='hyp_lens')
  hyp_ids = inplace_ops.empty(
      shape=[max_steps, num_hyps], dtype=tf.int32, init=True, name='hyp_ids')

  def _KeepGoing(step, unused_step_ids, unused_hyp_ids, unused_hyp_lens,
                 done, unused_other_states_list):
    """Continues while steps remain and at least one hyp is unfinished."""
    steps_remain = step < max_steps
    some_unfinished = tf.math.logical_not(tf.reduce_all(done))
    return tf.math.logical_and(steps_remain, some_unfinished)

  def _RunStep(step, ids_in, ids_so_far, lens_so_far, done, states_list):
    """Runs one greedy step on the packed states and re-flattens them."""
    packed_states = other_states.Pack(states_list)
    (next_step, ids_out, ids_so_far, lens_so_far, done,
     next_states) = self._GreedySearchStep(
         theta, encoder_outputs, step, ids_in, ids_so_far, lens_so_far, done,
         packed_states, pre_beam_search_step_callback,
         post_beam_search_step_callback)
    return (next_step, ids_out, ids_so_far, lens_so_far, done,
            next_states.Flatten())

  flat_other_states = other_states.Flatten()
  tensor_loop_vars = (cur_step, step_ids, hyp_ids, hyp_lens, done_hyps)
  invariants = tuple(
      tf.TensorShape(t.get_shape()) for t in tensor_loop_vars)
  invariants += (_GetShapes(flat_other_states, none_shapes=True),)

  loop_outputs = tf.while_loop(
      _KeepGoing,
      _RunStep,
      loop_vars=tensor_loop_vars + (flat_other_states,),
      parallel_iterations=10,
      back_prop=False,
      swap_memory=False,
      shape_invariants=invariants)
  _, _, final_hyp_ids, final_hyp_lens, final_done_hyps, _ = loop_outputs

  # transpose hyp_ids so it matches BeamSearchDecode's output
  final_hyp_ids = tf.transpose(final_hyp_ids)
  return final_hyp_ids, final_hyp_lens, final_done_hyps
def try_apply_dense(self, grad, var):
  """Traces the dense Adafactor arithmetic for `var` without assigning anything.

  Every intermediate tensor of the update is stored under a descriptive key
  and paired with a scalar "all elements are finite" check. No slot variable
  and no model variable is written by this method.

  Recording order (which is also the order of the returned checks):
    factored:     grad_squared, parameter_scale, new_vr, new_vc, mult, x,
                  [new_m], subtrahend
    non-factored: grad_squared, parameter_scale, new_v, x, [new_m], subtrahend
  where `new_m` is only present when `self._beta1` is truthy.

  Args:
    grad: Gradient for `var`. Must not be None.
    var: The variable whose update is traced.

  Returns:
    A tuple `(is_finite_checks, stats)`:
      - is_finite_checks: list of `tf.reduce_all(tf.math.is_finite(t))`
        tensors, one per recorded intermediate, in recording order.
      - stats: dict mapping the intermediate's name to its tensor.
  """
  assert grad is not None
  # Threaded through _Record unchanged; kept so the graph built here matches
  # the one the original code produced.
  passthrough = tf.constant(True)
  finite_flags = []
  recorded = {}

  work_dtype = var.dtype  # TODO(lepikhin): add to params
  grad = tf.cast(grad, work_dtype)
  factored_dims = self._factored_dims(var.shape.as_list())
  if factored_dims:
    vr = self.get_slot(var, 'vr')
    vc = self.get_slot(var, 'vc')
  else:
    v = self.get_slot(var, 'v')
  if self._beta1:
    m = self.get_slot(var, 'm')

  def _Record(token, key, value):
    """Stores `value` under `key`, appends its finiteness check."""
    recorded[key] = value
    finite_flags.append(tf.reduce_all(tf.math.is_finite(value)))
    return token

  with tf.variable_scope(var.name[:-2] + '/Adafactor'):
    grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, work_dtype)
    passthrough = _Record(passthrough, 'grad_squared', grad_squared)

    decay_rate = tf.cast(self._decay_rate, var.dtype)
    old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype

    assert self._multiply_by_parameter_scale
    if self._multiply_by_parameter_scale:
      parameter_scale = self._parameter_scale(old_val)
      passthrough = _Record(passthrough, 'parameter_scale', parameter_scale)
      learning_rate = tf.cast(self._learning_rate, work_dtype)
      update_scale = self._parameter_scale(old_val) * learning_rate
    else:
      update_scale = self._learning_rate
    mixing_rate = tf.cast(1.0 - decay_rate, work_dtype)
    update_scale = tf.cast(update_scale, work_dtype)

    if factored_dims:
      vr_axis, vc_axis = factored_dims
      row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
      col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
      # Exponential moving averages of the squared-gradient row/col means.
      new_vr = vr * decay_rate + row_mean * mixing_rate
      new_vc = vc * decay_rate + col_mean * mixing_rate
      passthrough = _Record(passthrough, 'new_vr', new_vr)
      passthrough = _Record(passthrough, 'new_vc', new_vc)
      # vr / vc are intentionally not assigned here.
      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
      r_factor = tf.math.rsqrt(new_vr / long_term_mean)
      c_factor = tf.math.rsqrt(new_vc)
      mult = tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
          c_factor, vc_axis)
      passthrough = _Record(passthrough, 'mult', mult)
      x = grad * mult
    else:
      new_v = v * decay_rate + grad_squared * mixing_rate
      passthrough = _Record(passthrough, 'new_v', new_v)
      # v is intentionally not assigned here.
      x = grad * tf.math.rsqrt(new_v)

    assert self._clipping_threshold is not None
    if self._clipping_threshold is not None:
      clipping_denom = tf.maximum(
          tf.constant(1.0, work_dtype),
          _ReduceRms(x) / tf.constant(self._clipping_threshold, work_dtype))
      x = x / clipping_denom
    passthrough = _Record(passthrough, 'x', x)

    subtrahend = x * update_scale
    if self._beta1:
      new_m = (
          m * tf.constant(self._beta1, dtype=work_dtype) +
          subtrahend * tf.constant(1.0 - self._beta1, dtype=work_dtype))
      subtrahend = new_m
      passthrough = _Record(passthrough, 'new_m', new_m)
      # m is intentionally not assigned here.

    # `subtrahend` is the amount that would be subtracted from `var`; it is
    # only recorded, never applied.
    passthrough = _Record(passthrough, 'subtrahend', subtrahend)

  return finite_flags, recorded
def FProp(self, theta, input_batch):
  """Embeds the source ids and encodes them with the transformer stack.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    input_batch: A `.NestedMap` with fields:

      - ids: The inputs tensor. It is expected to be of shape [batch, time].
      - paddings: The paddings tensor. Expected shape [batch, time].
      - task_ids: If p.task_emb is provided, must contain per-token task ids
        of shape [batch, time].
      - segment_ids, segment_pos: Read only when p.packed_input is set.

  Returns:
    A NestedMap containing

    - encoded: The encoded features, either a tensor of shape
      [time, batch, depth], or a list of tensors if is_transparent is set in
      transformer_stack.
    - padding: of shape [time, batch]
    - segment_id: [time, batch] if packed inputs are supported by the model
      (and all layers), or None otherwise.
    - embedded_inputs: [time, batch, depth] embedded inputs tokens without
      positional encodings.
  """
  p = self.params
  with tf.name_scope(p.name):
    src_segment_id = None
    src_segment_pos = None

    # ids and paddings must agree in shape, and ids must be rank 2.
    shape_check = py_utils.assert_shape_match(
        tf.shape(input_batch.ids), tf.shape(input_batch.paddings))
    rank_check = py_utils.assert_equal(tf.rank(input_batch.ids), 2)
    input_ids = py_utils.with_dependencies([shape_check, rank_check],
                                           input_batch.ids)

    truncate_inputs = (
        not py_utils.use_tpu() and
        tf.flags.FLAGS.transformer_encoder_truncates_inputs)
    if truncate_inputs:
      # Longest non-padded length in the batch; everything past it is cut.
      valid_lengths = tf.reduce_sum(1.0 - input_batch.paddings, 1)
      max_seq_length = tf.cast(tf.reduce_max(valid_lengths), tf.int32)
      # Guard: the part being cut off must consist of padding only.
      tail_is_padding = py_utils.assert_equal(
          tf.constant(True, tf.bool),
          tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
      paddings = py_utils.with_dependencies([tail_is_padding],
                                            input_batch.paddings)
      input_ids = input_ids[:, :max_seq_length]
      paddings = paddings[:, :max_seq_length]
      if p.packed_input:
        src_segment_id = input_batch.segment_ids[:, :max_seq_length]
        src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
    else:
      paddings = input_batch.paddings
      if p.packed_input:
        src_segment_id = input_batch.segment_ids
        src_segment_pos = input_batch.segment_pos

    max_time = tf.shape(input_ids)[1]
    emb_dim = p.token_emb.embedding_dim

    # Token embeddings, looked up on the flattened ids then restored to
    # [batch, time, dim].
    flat_ids = tf.reshape(input_ids, [-1])
    input_embs = self.token_emb.EmbLookup(theta.token_emb, flat_ids)
    input_embs = tf.reshape(input_embs, [-1, max_time, emb_dim])
    # [time, batch, dim]; kept aside before positional information is added.
    orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

    # Positional embeddings.
    if p.packed_input:
      position_embs = self.position_emb.FPropWithPosition(
          theta.position_emb, src_segment_pos)
    else:
      position_embs = self.position_emb.FProp(theta.position_emb, max_time)
      position_embs = tf.reshape(position_embs, [1, max_time, emb_dim])
    input_embs += position_embs

    if p.task_emb:
      input_embs += self.task_emb.EmbLookup(theta.task_emb,
                                            input_batch.task_ids)

    if p.model_dim != p.token_emb.embedding_dim:
      input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

    paddings = tf.transpose(paddings)
    if p.packed_input:
      src_segment_id = tf.transpose(src_segment_id)
    input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

    # [time, batch, dim]
    transformer_input = tf.transpose(input_embs, [1, 0, 2])

  if not p.is_eval and p.apply_source_mask:
    # Treat positions holding the source mask id as additional padding.
    pad_dtype = paddings.dtype
    source_mask = tf.where(
        tf.equal(input_ids, p.source_mask_id),
        tf.ones_like(input_ids, dtype=pad_dtype),
        tf.zeros_like(input_ids, dtype=pad_dtype))
    # Keep the combined padding within [0, 1].
    paddings = tf.clip_by_value(paddings + tf.transpose(source_mask), 0.0,
                                1.0)

  encoded, padding, segment_id = self.transformer_stack.FProp(
      theta.transformer_stack, transformer_input, paddings, src_segment_id)
  return py_utils.NestedMap(
      encoded=encoded,
      padding=padding,
      segment_id=segment_id,
      embedded_inputs=orig_input_embs)
def _resource_apply_dense(self, grad, var):
  """Builds the Adafactor update ops for a dense gradient.

  When `self._cond_is_finite` is set, every assignment is wrapped in a
  `tf.cond` on a running "all intermediates so far are finite" flag, so an
  assignment is skipped if any intermediate checked before it is non-finite.

  Args:
    grad: Gradient for `var`, or None.
    var: The variable to update.

  Returns:
    An empty list (after logging a warning) when `grad` is None; otherwise a
    `tf.group` of the slot assignments and the `assign_sub` on `var`.
  """
  if grad is None:
    tf.logging.warning('Gradient is None for variable %s' % var.name)
    return []

  work_dtype = var.dtype  # TODO(lepikhin): add to params
  grad = tf.cast(grad, work_dtype)
  factored_dims = self._factored_dims(var.shape.as_list())
  if factored_dims:
    vr = self.get_slot(var, 'vr')
    vc = self.get_slot(var, 'vc')
  else:
    v = self.get_slot(var, 'v')
  if self._beta1:
    m = self.get_slot(var, 'm')

  all_finite = tf.constant(True)

  def _AndFinite(flag, value):
    """ANDs `flag` with "value has no NaN/Inf"; no-op if checks are off."""
    if not self._cond_is_finite:
      return flag
    flag = tf.math.logical_and(flag, tf.reduce_all(tf.math.is_finite(value)))
    has_no_inf = tf.reduce_all(tf.math.logical_not(tf.math.is_inf(value)))
    return tf.math.logical_and(flag, has_no_inf)

  def _Guarded(assign_fn, target, value):
    """Runs assign_fn(target, value), gated on the current `all_finite`."""
    if not self._cond_is_finite:
      return assign_fn(target, value)
    # Reads `all_finite` as it stands at call time, i.e. it only covers the
    # intermediates checked before this assignment was built.
    return tf.cond(all_finite, lambda: assign_fn(target, value),
                   lambda: target)

  with tf.variable_scope(var.name[:-2] + '/Adafactor'):
    grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, work_dtype)
    all_finite = _AndFinite(all_finite, grad_squared)

    decay_rate = tf.cast(self._decay_rate, var.dtype)
    old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
    if self._multiply_by_parameter_scale:
      learning_rate = tf.cast(self._learning_rate, work_dtype)
      update_scale = self._parameter_scale(old_val) * learning_rate
    else:
      update_scale = self._learning_rate
    mixing_rate = tf.cast(1.0 - decay_rate, work_dtype)
    update_scale = tf.cast(update_scale, work_dtype)

    update_ops = []
    if factored_dims:
      vr_axis, vc_axis = factored_dims
      row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
      col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
      # Exponential moving averages of the squared-gradient row/col means.
      new_vr = vr * decay_rate + row_mean * mixing_rate
      new_vc = vc * decay_rate + col_mean * mixing_rate
      all_finite = _AndFinite(all_finite, new_vr)
      all_finite = _AndFinite(all_finite, new_vc)
      vr_update = _Guarded(tf.assign, vr, new_vr)
      vc_update = _Guarded(tf.assign, vc, new_vc)
      update_ops.extend([vr_update, vc_update])
      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
      r_factor = tf.math.rsqrt(new_vr / long_term_mean)
      c_factor = tf.math.rsqrt(new_vc)
      x = grad * tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
          c_factor, vc_axis)
    else:
      new_v = v * decay_rate + grad_squared * mixing_rate
      all_finite = _AndFinite(all_finite, new_v)
      update_ops.append(_Guarded(tf.assign, v, new_v))
      x = grad * tf.math.rsqrt(new_v)

    if self._clipping_threshold is not None:
      clipping_denom = tf.maximum(
          tf.constant(1.0, work_dtype),
          _ReduceRms(x) / tf.constant(self._clipping_threshold, work_dtype))
      x = x / clipping_denom

    subtrahend = x * update_scale
    if self._beta1:
      new_m = (
          m * tf.constant(self._beta1, dtype=work_dtype) +
          subtrahend * tf.constant(1.0 - self._beta1, dtype=work_dtype))
      subtrahend = new_m
      all_finite = _AndFinite(all_finite, new_m)
      update_ops.append(_Guarded(tf.assign, m, new_m))

    # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
    # for the case of bfloat16 activations, so as to avoid repeatedly
    # rounding the slice value, which results in poor quality.
    all_finite = _AndFinite(all_finite, subtrahend)
    update_ops.append(_Guarded(tf.assign_sub, var, subtrahend))

  return tf.group(*update_ops)
def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
  """Checks the decoder's symbolic gradients against numeric gradients.

  Builds a float64 decoder on fixed inputs, verifies the loss against a
  golden value twice (to catch non-determinism), then compares each
  trainable variable's symbolic gradient with a numerically estimated one.

  Variables for which `tf.gradients` yields no gradient are skipped for BOTH
  the symbolic and the numeric side, so the two lists always stay aligned.

  Args:
    func_inline: Value forwarded to the session's
      `OptimizerOptions.do_function_inlining`.
  """
  config = tf.config_pb2.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(
              do_function_inlining=func_inline)))
  with self.session(use_gpu=False, config=config) as sess:
    tf.random.set_seed(8372749040)
    np.random.seed(274854)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    p.dtype = tf.float64

    dec = p.Instantiate()
    src_seq_len = 5
    src_enc = tf.constant(
        np.random.uniform(size=(src_seq_len, 2, 8)), tf.float64)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float64)
    encoder_outputs = py_utils.NestedMap(
        encoded=src_enc, padding=src_enc_padding)
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]],
                    dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]],
                    dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]],
                    dtype=tf.float64))
    target_transcripts = tf.constant(
        ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings

    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets).metrics
    loss = metrics['loss'][0]
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)

    def DenseGrad(var, grad):
      """Returns `grad` as a dense tensor, or None if there is no gradient."""
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.math.unsorted_segment_sum(grad.values, grad.indices,
                                            tf.shape(var)[0])
      return None

    dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]
    # Filter variables and gradients TOGETHER. Filtering only the symbolic
    # gradients (and computing numeric gradients for every variable) would
    # pair gradients of different variables as soon as one gradient is None.
    checked = [(v, g) for (v, g) in zip(all_vars, dense_grads)
               if g is not None]

    self.evaluate(tf.global_variables_initializer())
    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
    # Second run to make sure the function is deterministic.
    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

    symbolic_grads = [g.eval() for (_, g) in checked]
    numerical_grads = [
        test_utils.ComputeNumericGradient(sess, loss, v)
        for (v, _) in checked
    ]
    for x, y in zip(symbolic_grads, numerical_grads):
      self.assertAllClose(x, y)
def _InputBatch(self):
  """Returns a NestedMap whose `inp` is a float32 [128, 3] tensor of ones."""
  inp = tf.constant(1.0, shape=[128, 3], dtype=tf.float32)
  return py_utils.NestedMap(inp=inp)
def _InputBatch(self):
  """Builds a fixed-seed random src/tgt batch.

  The batch has 10 rows and 7 columns. Ids and labels are random int32
  values, weights are all ones and paddings are all zeros.

  When `self.params.split` is set, every tensor is split into two halves
  along axis 0 and a `tf.cond` on the parity of the global step selects the
  first half on even steps and the second half on odd steps.

  Returns:
    A NestedMap with `src.{ids, paddings}` and
    `tgt.{ids, labels, paddings, weights}`.
  """
  np.random.seed(1)
  bs, sl = 10, 7

  def _RandomIds():
    return tf.constant(
        np.random.randint(low=0, high=8192 - 1, size=[bs, sl],
                          dtype=np.int32))

  def _PickHalf(halves):
    """Selects halves[0] on even global steps, halves[1] on odd ones."""
    is_even_step = tf.equal(tf.mod(py_utils.GetGlobalStep(), 2), 0)
    return tf.cond(is_even_step, lambda: halves[0], lambda: halves[1])

  # The three random draws must stay in this order to reproduce the data.
  src_ids = _RandomIds()
  tgt_ids = _RandomIds()
  tgt_labels = _RandomIds()
  tgt_weights = tf.constant(np.ones(shape=[bs, sl], dtype=np.float32))
  src_paddings = tf.zeros([bs, sl])
  tgt_paddings = tf.zeros([bs, sl])

  ret = py_utils.NestedMap()
  ret.src = py_utils.NestedMap()
  ret.tgt = py_utils.NestedMap()

  fields = [
      src_ids, src_paddings, tgt_ids, tgt_labels, tgt_paddings, tgt_weights
  ]
  if self.params.split:
    # Split everything first, then build the conds, in field order.
    split_fields = [tf.split(t, 2, 0) for t in fields]
    fields = [_PickHalf(halves) for halves in split_fields]

  (src_ids, src_paddings, tgt_ids, tgt_labels, tgt_paddings,
   tgt_weights) = fields
  ret.src.ids = src_ids
  ret.src.paddings = src_paddings
  ret.tgt.ids = tgt_ids
  ret.tgt.labels = tgt_labels
  ret.tgt.paddings = tgt_paddings
  ret.tgt.weights = tgt_weights
  return ret
def _InputBatch(self):
  """Returns a list of 8 NestedMaps, each with a float32 [16, 3] `inp` of ones."""
  batches = []
  for _ in range(8):
    inp = tf.constant(1.0, shape=[16, 3], dtype=tf.float32)
    batches.append(py_utils.NestedMap(inp=inp))
  return batches
def testOrientedNMSIndices(self):
  """Tests BatchedOrientedNMSIndices with scalar and per-class thresholds.

  Runs the op on one batch of 7 boxes with 3 class scores each, first with
  scalar thresholds and then with per-class threshold lists, and checks the
  kept box indices and scores for every class.
  """
  utils_3d = detection_3d_lib.Utils3D()

  # Assignments and IoU scores calculated offline.
  bboxes_data = tf.constant(
      [[
          [10.35, 8.429, -1.003, 3.7, 1.64, 1.49, 1.582],
          [10.35, 8.429, -1.003, 3.7, 1.64, 1.49, 0.0],  # box 0 rotated
          [11.5, 8.429, -1.003, 3.7, 1.64, 1.49, 1.0],  # Rotated to overlap
          [13.01, 8.149, -0.953, 4.02, 1.55, 1.52, 1.592],
          [13.51, 8.39, -1.0, 4.02, 1.55, 1.52, 1.592],  # Slight translation
          [13.51, 8.39, -1.0, 1.0, 1.0, 1.52, 1.592],  # Smaller box
          [13.51, 8.39, -1.0, 1.0, 1.0, 1.52, 1.9],  # Smaller box
      ]],
      dtype=tf.float32)

  # Notes on the data:
  # With 3 classes and an NMS IoU threshold of 0.1:
  # Keep box [0, 3] for class 0
  # Keep box [6] only for class 1
  # Keep box [2] for class 2
  scores_data = tf.constant([[
      [0.9, 0.1, 0.0],
      [0.89, 0.1, 0.01],
      [0.5, 0.01, 0.49],
      [0.8, 0.1, 0.1],
      [0.79, 0.11, 0.2],
      [0.2, 0.8, 0.1],
      [0.1, 0.9, 0.0],
  ]],
                            dtype=tf.float32)

  def _AssertPerClassResults(indices, scores, valid_mask, expected):
    """Checks kept indices/scores of batch element 0 for every class.

    Args:
      indices: numpy output of the op holding the kept box indices.
      scores: numpy output of the op holding the kept box scores.
      valid_mask: numpy output of the op marking which entries are valid.
      expected: list with one (expected_indices, expected_scores) pair per
        class, in class order.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy
    # 1.20 and removed in 1.24, where it raises AttributeError.
    class_masks = [
        valid_mask[0, cls_idx, :].astype(bool)
        for cls_idx in range(len(expected))
    ]
    # Check the correct number of valid results per class.
    for class_mask, (expected_indices, _) in zip(class_masks, expected):
      self.assertEqual(class_mask.sum(), len(expected_indices))
    # Check the results for each class.
    for cls_idx, class_mask in enumerate(class_masks):
      expected_indices, expected_scores = expected[cls_idx]
      self.assertAllEqual(indices[0, cls_idx, class_mask], expected_indices)
      self.assertAllClose(scores[0, cls_idx, class_mask], expected_scores)

  with self.session() as sess:
    outputs = utils_3d.BatchedOrientedNMSIndices(
        bboxes_data,
        scores_data,
        nms_iou_threshold=0.1,
        score_threshold=0.3,
        max_boxes_per_class=5)
    indices, scores, valid_mask = sess.run(outputs)
    _AssertPerClassResults(indices, scores, valid_mask, [
        ([0, 3], [0.9, 0.8]),
        ([6], [0.9]),
        ([2], [0.49]),
    ])

    # Use a list of score thresholds instead. The higher class-0 score
    # threshold (0.899) leaves only box 0 for that class.
    outputs = utils_3d.BatchedOrientedNMSIndices(
        bboxes_data,
        scores_data,
        nms_iou_threshold=[0.1, 0.1, 0.1],
        score_threshold=[0.899, 0.5, 0.3],
        max_boxes_per_class=5)
    indices, scores, valid_mask = sess.run(outputs)
    _AssertPerClassResults(indices, scores, valid_mask, [
        ([0], [0.9]),
        ([6], [0.9]),
        ([2], [0.49]),
    ])
def ScaleGradients(self, var_grads, gradient_adjuster=None):
  """Clips or rescales gradients as configured by the training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    A `.NestedMap` containing

    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
      where gradients have already been scaled.
    - grad_scale: the gradient scale. 0 if gradient updates should be skipped
      for the step. (Optional, only returned in case global norm clipping is
      used.)

  Raises:
    ValueError: if both `clip_gradient_single_norm_to_value` and
      `clip_gradient_norm_to_value` are set.
  """
  p = self.params

  # Per-variable norm summaries, then the global norms. Note that the global
  # grad norm may be nan, which may cause grad_scale to be nan.
  for key, var_grad in var_grads.FlattenItems():
    summary_name = py_utils.SanitizeScopeKey(key) + '/' + p.name
    summary_utils.AddNormSummary(summary_name, var_grad)
  pairs = py_utils.Flatten(var_grads)
  all_grad_norm = tf.sqrt(py_utils.SumSquared([grad for (_, grad) in pairs]))
  all_var_norm = tf.sqrt(py_utils.SumSquared([var for (var, _) in pairs]))
  norm_is_nan = tf.math.is_nan(all_grad_norm)
  norm_is_inf = tf.math.is_inf(all_grad_norm)
  grad_norm_is_nan_or_inf = tf.math.logical_or(norm_is_nan, norm_is_inf)
  self._AddEvalMetric('grad_norm_is_nan_or_inf', grad_norm_is_nan_or_inf,
                      tf.constant(1.0))

  # Optional gradient adjustment. It runs after the global norm has been
  # computed, so the norm reflects the unadjusted gradients.
  if gradient_adjuster is not None:
    tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
    var_grads = gradient_adjuster(var_grads)

  # NaN/Inf detection. The global norm can still be inf even if none of the
  # individual gradients is, so both signals are combined.
  has_nan_or_inf = tf.math.logical_or(
      py_utils.HasNanOrInfGradient(var_grads), grad_norm_is_nan_or_inf)
  self._AddEvalMetric('has_nan_or_inf', has_nan_or_inf, tf.constant(1.0))

  result = py_utils.NestedMap()

  if p.clip_gradient_single_norm_to_value:
    # Currently using both types of clipping simultaneously is unsupported.
    if p.clip_gradient_norm_to_value:
      raise ValueError(
          'Cannot use clip_gradient_single_norm_to_value=%f and '
          'clip_gradient_norm_to_value=%f.' %
          (p.clip_gradient_single_norm_to_value,
           p.clip_gradient_norm_to_value))
    result.final_var_grads = py_utils.ApplyGradNormClipping(
        var_grads, p.clip_gradient_single_norm_to_value)
    return result

  grad_scale = self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)
  # grad_norm/all is both a eval metric(collected by trainer) and a summary
  # (collected by controller).
  summary_utils.scalar(f'grad_norm/all/{p.name}', all_grad_norm)
  self._AddEvalMetric('grad_norm/all', all_grad_norm, tf.constant(1.0))
  self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
  self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
  scaled_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
  result.grad_scale = grad_scale
  result.final_var_grads = scaled_var_grads
  return result