def testDecoderSampleTargetSequences(self):
  p = self._DecoderParams(
      vn_config=py_utils.VariationalNoiseParams(None, False, False),
      num_classes=8)
  p.target_seq_len = 5
  p.random_seed = 1
  config = tf.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(do_function_inlining=False)))
  with self.session(use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372740)
    np.random.seed(35315)
    dec = p.Instantiate()
    source_sequence_length = 5
    batch_size = 4
    source_encodings = tf.constant(
        np.random.normal(
            size=[source_sequence_length, batch_size, p.source_dim]),
        dtype=tf.float32)
    source_encoding_padding = tf.constant(
        [[0.0, 0.0, 0.0, 0.0],
         [0.0, 0.0, 0.0, 1.0],
         [0.0, 1.0, 1.0, 1.0],
         [0.0, 1.0, 1.0, 1.0],
         [0.0, 1.0, 1.0, 1.0]],
        dtype=tf.float32)
    encoder_outputs = py_utils.NestedMap(
        encoded=source_encodings, padding=source_encoding_padding)
    sampled_sequences = dec.SampleTargetSequences(
        dec.theta, encoder_outputs, random_seed=tf.to_int32(123))
    self.assertAllEqual([batch_size, p.target_seq_len],
                        sampled_sequences.ids.shape)
    tf.global_variables_initializer().run()
    decoder_output = sess.run(sampled_sequences)
    print('ids=%s' % np.array_repr(decoder_output.ids))
    lens = np.sum(1 - decoder_output.paddings, axis=1)
    print('lens=%s' % lens)
    # pyformat: disable
    # pylint: disable=bad-whitespace,bad-continuation
    expected_ids = [[6, 2, 2, 2, 2],
                    [0, 0, 7, 5, 1],
                    [6, 1, 5, 1, 5],
                    [6, 7, 7, 4, 4]]
    # pylint: enable=bad-whitespace,bad-continuation
    # pyformat: enable
    expected_lens = [2, 5, 5, 5]
    self.assertAllEqual(expected_lens, lens)
    self.assertAllEqual(expected_ids, decoder_output.ids)

    # Sample again with the same random seed.
    decoder_output2 = sess.run(
        dec.SampleTargetSequences(
            dec.theta, encoder_outputs, random_seed=tf.to_int32(123)))
    # Get the same output.
    self.assertAllEqual(decoder_output.ids, decoder_output2.ids)
    self.assertAllEqual(decoder_output.paddings, decoder_output2.paddings)

    # Sample again with a different random seed.
    decoder_output3 = sess.run(
        dec.SampleTargetSequences(
            dec.theta, encoder_outputs, random_seed=tf.to_int32(123456)))
    # Get different sequences.
    self.assertNotAllClose(expected_ids, decoder_output3.ids)
def _Extract(self, features):
  p = self.params
  # Label values match the proto enum car.open_dataset.Label.Type. The value
  # range is [1..4] for non-background labels.
  labels = tf.to_int32(_Dense(features['labels']))
  labels = py_utils.PadOrTrimTo(labels, [p.max_num_objects])
  label_ids = tf.reshape(_Dense(features['label_ids'], ''), [-1])
  label_ids = py_utils.PadOrTrimTo(label_ids, [p.max_num_objects], '')
  bboxes_3d = tf.reshape(_Dense(features['bboxes_3d']), [-1, 7])
  bboxes_3d_mask = tf.ones([tf.shape(bboxes_3d)[0]])
  bboxes_3d_num_points = tf.to_int32(
      _Dense(features['bboxes_3d_num_points']))
  bboxes_3d = py_utils.PadOrTrimTo(bboxes_3d, [p.max_num_objects, 7])
  bboxes_3d_mask = py_utils.PadOrTrimTo(bboxes_3d_mask, [p.max_num_objects])
  bboxes_3d_num_points = py_utils.PadOrTrimTo(bboxes_3d_num_points,
                                              [p.max_num_objects])
  label_metadata = tf.reshape(_Dense(features['label_metadata']), [-1, 4])
  label_metadata = py_utils.PadOrTrimTo(label_metadata,
                                        [p.max_num_objects, 4])
  detection_difficulties = py_utils.PadOrTrimTo(
      tf.to_int32(_Dense(features['detection_difficulties'])),
      [p.max_num_objects])
  tracking_difficulties = py_utils.PadOrTrimTo(
      tf.to_int32(_Dense(features['tracking_difficulties'])),
      [p.max_num_objects])
  unfiltered_bboxes_3d_mask = bboxes_3d_mask

  if p.filter_labels:
    valid_labels = tf.constant([p.filter_labels])
    bbox_mask = tf.reduce_any(
        tf.equal(tf.expand_dims(labels, 1), valid_labels), axis=1)
    bboxes_3d_mask *= tf.to_float(bbox_mask)

  outputs = {
      'labels': labels,
      'label_ids': label_ids,
      'detection_difficulties': detection_difficulties,
      'tracking_difficulties': tracking_difficulties,
      'bboxes_3d': bboxes_3d,
      'bboxes_3d_mask': bboxes_3d_mask,
      'bboxes_3d_num_points': bboxes_3d_num_points,
      'unfiltered_bboxes_3d_mask': unfiltered_bboxes_3d_mask,
      'speed': label_metadata[:, :2],
      'acceleration': label_metadata[:, 2:],
  }

  return py_utils.NestedMap(outputs)
def SequenceConcat(x, x_paddings, y, y_paddings, pad=0):
  """Concats sequence `x` with sequence `y`.

  This function is length aware (based on the paddings).

  Args:
    x: A sequence of tokens of shape [batch_size, x_len_max].
    x_paddings: The paddings of `x`.
    y: A sequence of tokens of shape [batch_size, y_len_max].
    y_paddings: The paddings of `y`.
    pad: The <pad> token to fill the concatenated sequence (of type integer).

  Returns:
    A tuple.
      - Concatenation of `x` and `y` of shape
        [batch_size, x_len_max + y_len_max].
      - Paddings of the concatenation of shape
        [batch_size, x_len_max + y_len_max].
  """
  # Get the length (w/ eos).
  x_len = tf.to_int32(tf.round(tf.reduce_sum(1 - x_paddings, 1)))
  y_len = tf.to_int32(tf.round(tf.reduce_sum(1 - y_paddings, 1)))

  batch_size = py_utils.GetShape(x)[0]
  y_len_max = py_utils.GetShape(y)[1]

  # Pad `x` with necessary <pad>.
  x = tf.concat([x, tf.fill(py_utils.GetShape(y), pad)], 1)
  # Replace all <pad> with 0.
  x = tf.where(tf.not_equal(x, pad), x, tf.fill(py_utils.GetShape(x), 0))

  # Compute the write indices of `y` in `xy`.
  indices = tf.stack([
      tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, y_len_max]),
      (tf.tile(tf.expand_dims(tf.range(y_len_max), 0), [batch_size, 1]) +
       tf.expand_dims(x_len, 1)),
  ], 2)

  xy = x + tf.scatter_nd(indices, y, py_utils.GetShape(x))

  # We need to remap all <pad> to `pad`.
  xy = tf.where(
      tf.less(tf.expand_dims(tf.range(py_utils.GetShape(xy)[1]), 0),
              tf.expand_dims(x_len + y_len, 1)), xy,
      tf.fill(py_utils.GetShape(xy), pad))
  xy_paddings = 1 - tf.sequence_mask(x_len + y_len,
                                     py_utils.GetShape(xy)[1],
                                     x_paddings.dtype)
  return xy, xy_paddings
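# Illustrative usage sketch (not part of the original library; the helper
# name _ExampleSequenceConcat and the toy values are hypothetical). It shows
# the padding-aware behavior of SequenceConcat: row 1's valid tokens from `y`
# land directly after row 1's single valid token in `x`, not after column 2.
# Assumes the usual TF1 graph/session environment of this module.
def _ExampleSequenceConcat():
  x = tf.constant([[1, 2], [10, 0]], dtype=tf.int32)
  x_paddings = tf.constant([[0., 0.], [0., 1.]])  # x lengths: [2, 1].
  y = tf.constant([[3, 0], [20, 30]], dtype=tf.int32)
  y_paddings = tf.constant([[0., 1.], [0., 0.]])  # y lengths: [1, 2].
  xy, xy_paddings = SequenceConcat(x, x_paddings, y, y_paddings, pad=0)
  # Expected: xy == [[1, 2, 3, 0], [10, 20, 30, 0]] and
  # xy_paddings == [[0., 0., 0., 1.], [0., 0., 0., 1.]].
  return xy, xy_paddings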
def CreateDenseCoordinates(self, ranges):
  """Create a matrix of coordinate locations corresponding to a dense grid.

  Example: To create (x, y) coordinates over a 10x10 grid with step size 1,
  call CreateDenseCoordinates([(1, 10, 10), (1, 10, 10)]).

  Args:
    ranges: A list of 3-tuples, each tuple is expected to contain (min, max,
      num_steps). Each list element corresponds to one dimension. Each tuple
      will be passed into tf.lin_space (np.linspace semantics) to create the
      values for a single dimension.

  Returns:
    tf.float32 tensor of shape [total_points, len(ranges)], where
    total_points = product of all num_steps.
  """
  total_points = int(np.prod([r_steps for _, _, r_steps in ranges]))
  cycle_steps = total_points
  stack_coordinates = []
  for r_start, r_stop, r_steps in ranges:
    values = tf.lin_space(
        tf.to_float(r_start), tf.to_float(r_stop), tf.to_int32(r_steps))
    cycle_steps //= r_steps
    gather_idx = (tf.range(total_points) // cycle_steps) % r_steps
    stack_coordinates.append(tf.gather(values, gather_idx))
  return tf.stack(stack_coordinates, axis=1)
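# Illustrative sketch (hypothetical helper; `utils` is assumed to be an
# instance of the class that defines CreateDenseCoordinates). For the ranges
# [(0, 1, 2), (0, 1, 2)], the cycle_steps/gather_idx arithmetic enumerates
# the 2x2 grid in row-major order:
#   [[0., 0.], [0., 1.], [1., 0.], [1., 1.]]
def _ExampleDenseCoordinates(utils):
  return utils.CreateDenseCoordinates([(0, 1, 2), (0, 1, 2)])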
def SequenceAppendToken(x, x_paddings, token, extend=False):
  """Appends <token> to sequence `x`.

  Args:
    x: A sequence of tokens of shape [batch_size, x_len_max].
    x_paddings: The paddings of `x`.
    token: The token to append (of type integer).
    extend: Whether to extend `x` along the length dimension. This must be
      True if any sequence in `x` already has length `x_len_max`, or else an
      invalid sequence will be emitted.

  Returns:
    A tuple.
      - The new sequence, Tensor of shape [batch_size, x_len_max].
      - The new paddings, Tensor of shape [batch_size, x_len_max].
  """
  batch_size = py_utils.GetShape(x)[0]
  x_len = tf.to_int32(tf.round(tf.reduce_sum(1 - x_paddings, 1)))
  if extend:
    x = tf.pad(x, [[0, 0], [0, 1]])
  # Mask all invalid entries of `x` to 0.
  x *= tf.sequence_mask(x_len, py_utils.GetShape(x)[1], x.dtype)
  # Append the <token> based on `x_len`.
  x += tf.scatter_nd(
      tf.stack([tf.range(batch_size), x_len], axis=1),
      tf.cast(tf.fill([batch_size], token), x.dtype), py_utils.GetShape(x))
  x_paddings = 1 - tf.sequence_mask(x_len + 1,
                                    py_utils.GetShape(x)[1],
                                    x_paddings.dtype)
  return x, x_paddings
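# Illustrative usage sketch (hypothetical helper and values, not in the
# original source): appends a token (id 2 here, e.g. <eos>) after the last
# valid position of each row. With extend=True a column is added, so the
# full-length second row still has room for the appended token.
def _ExampleSequenceAppendToken():
  x = tf.constant([[3, 4, 0], [5, 6, 7]], dtype=tf.int32)
  x_paddings = tf.constant([[0., 0., 1.], [0., 0., 0.]])  # Lengths [2, 3].
  x, x_paddings = SequenceAppendToken(x, x_paddings, token=2, extend=True)
  # Expected: x == [[3, 4, 2, 0], [5, 6, 7, 2]] and
  # x_paddings == [[0., 0., 0., 1.], [0., 0., 0., 0.]].
  return x, x_paddings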
def _FlatOutputProcessor(inputs):
  """Returns a flattened list of 'processor(inputs)'."""
  output, bucketing_key = processor(inputs)
  if isinstance(output, list):
    assert output
    assert all(isinstance(x, tf.Tensor) for x in output), '{}'.format(output)
  else:
    assert isinstance(output, py_utils.NestedMap), '{}'.format(output)
    assert output
    assert all(
        isinstance(x, tf.Tensor) for x in output.Flatten()), '{}'.format(
            output.DebugString())
  bucketing_key = tf.to_int32(bucketing_key)
  tf.logging.debug('Processor outputs=%s bucketing_key=%s', output,
                   bucketing_key)
  output_tmpl.values = output
  flat_output_tmpl = output_tmpl.Flatten()
  tf.logging.debug('Processor flat outputs=%s', flat_output_tmpl)
  tf.logging.debug('extra_inputs=%s extra_args=%s extra_vars=%s',
                   function.get_extra_inputs(), function.get_extra_args(),
                   function.get_extra_vars())
  assert not function.get_extra_args(), (
      'fns {} is not pure: extra_args={}'.format(processor,
                                                 function.get_extra_args()))
  return flat_output_tmpl + [bucketing_key]
def ComputeKITTIDifficulties(box_image_height, occlusion, truncation):
  """Compute difficulties from box height, occlusion, and truncation."""
  # Easy: No occlusion, max truncation 15%.
  easy_level = tf.to_int32((box_image_height >= 40.) & (occlusion <= 0.) &
                           (truncation <= 0.15)) * 3
  # Moderate: max occlusion: partly occluded, max truncation 30%.
  moderate_level = tf.to_int32((occlusion <= 1.) & (truncation <= 0.3) &
                               (box_image_height >= 25.)) * 2
  # Hard: Difficult to see, max truncation 50%.
  hard_level = tf.to_int32((occlusion <= 2.) & (truncation <= 0.5) &
                           (box_image_height >= 25.)) * 1

  # Occlusion = 3 and higher truncation is "super hard", and
  # will map to 0 (ignored).
  difficulties = tf.maximum(tf.maximum(hard_level, moderate_level),
                            easy_level)

  return difficulties
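# Illustrative sketch (hypothetical helper and values): a 45px-high,
# unoccluded, untruncated box satisfies all three level tests, and the
# tf.maximum cascade keeps the highest (easiest) level, 3. A 30px box with
# occlusion 2 and truncation 0.4 only passes the hard test and maps to 1.
def _ExampleKITTIDifficulties():
  return ComputeKITTIDifficulties(
      box_image_height=tf.constant([45., 30.]),
      occlusion=tf.constant([0., 2.]),
      truncation=tf.constant([0.0, 0.4]))  # Expected: [3, 1].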
def add_labels(self, feature, labels, points_xyz):
  """Add 3d bounding box labels into the output feature map.

  Args:
    feature: A tf.Example feature map.
    labels: A repeated car.open_dataset.Label proto.
    points_xyz: A numpy array of shape [-1, 3] with the pointcloud. This is
      used to calculate the number of points in each 3D bounding box.
  """
  label_classes = []
  label_ids = []
  detection_difficulty_levels = []
  tracking_difficulty_levels = []
  bboxes = []
  label_md = []

  for label in labels:
    box = label.box
    bbox_3d = [
        box.center_x, box.center_y, box.center_z, box.length, box.width,
        box.height, box.heading
    ]
    md = [
        label.metadata.speed_x, label.metadata.speed_y,
        label.metadata.accel_x, label.metadata.accel_y
    ]
    label_md += md
    bboxes += bbox_3d
    label_classes += [label.type]
    label_ids += [tf.compat.as_bytes(label.id)]
    detection_difficulty_levels += [label.detection_difficulty_level]
    tracking_difficulty_levels += [label.tracking_difficulty_level]

  # Calculate the number of points in each ground truth box, which is needed
  # to fill in difficulty levels for each ground truth and to filter boxes
  # with fewer points than a configurable minimum.
  points_xyz = tf.convert_to_tensor(points_xyz, dtype=tf.float32)
  bboxes_3d = tf.convert_to_tensor(
      np.array(bboxes).reshape(-1, 7), dtype=tf.float32)
  points_in_bboxes_mask = geometry.IsWithinBBox3D(points_xyz, bboxes_3d)
  bboxes_3d_num_points = tf.reduce_sum(
      tf.to_int32(points_in_bboxes_mask), axis=0, keepdims=False)
  bboxes_3d_num_points = bboxes_3d_num_points.numpy().reshape([-1])

  bboxes = np.array(bboxes).reshape(-1)
  label_md = np.array(label_md).reshape(-1)
  feature['labels'].int64_list.value[:] = label_classes
  feature['label_ids'].bytes_list.value[:] = label_ids
  feature['detection_difficulties'].int64_list.value[:] = (
      detection_difficulty_levels)
  feature['tracking_difficulties'].int64_list.value[:] = (
      tracking_difficulty_levels)
  feature['bboxes_3d'].float_list.value[:] = list(bboxes)
  feature['label_metadata'].float_list.value[:] = list(label_md)
  feature['bboxes_3d_num_points'].int64_list.value[:] = list(
      bboxes_3d_num_points)
def _ConcatOnehotFn(input_data):
  """Concat the input features with a onehot version of the label ids."""
  features = input_data.features
  label = input_data.label
  num_pts = tf.shape(features)[1]
  label_one_hot = tf.one_hot(tf.to_int32(label), depth=16)
  label_one_hot = tf.tile(tf.expand_dims(label_one_hot, 1), [1, num_pts, 1])
  input_data.features = tf.concat([features, label_one_hot], axis=-1)
  return input_data
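# Illustrative sketch (hypothetical helper and values): the 16-way one-hot of
# each example's label is tiled across the point dimension and concatenated
# onto the per-point features, so [batch, num_pts, d] becomes
# [batch, num_pts, d + 16].
def _ExampleConcatOnehot():
  input_data = py_utils.NestedMap(
      features=tf.zeros([2, 5, 3]),  # [batch=2, num_pts=5, d=3].
      label=tf.constant([1, 4]))
  out = _ConcatOnehotFn(input_data)
  # Expected: out.features has shape [2, 5, 19].
  return out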
def _ComputeDecoderMetrics(self, decoder_outs, input_batch):
  """Computes metrics on output from decoder.

  Args:
    decoder_outs: A `BeamSearchDecodeOutput`, a namedtuple containing the
      decode results.
    input_batch: A `NestedMap` of tensors representing the source, target,
      and other components of the input batch.

  Returns:
    A dict of Tensors containing decoder output and metrics.
  """
  p = self.params
  topk = self._GetTopK(decoder_outs)
  tgt = self._GetTargetForDecoderMetrics(input_batch)
  transcripts = self.input_generator.IdsToStrings(
      tgt.labels,
      tf.to_int32(tf.round(tf.reduce_sum(1.0 - tgt.paddings, 1) - 1.0)))

  # Filter out all isolated '<noise>' tokens.
  noise_pattern = ' <noise> |^<noise> | <noise>$|^<noise>$'
  filtered_refs = tf.regex_replace(transcripts, noise_pattern, ' ')
  filtered_hyps = tf.regex_replace(topk.decoded, noise_pattern, ' ')
  # Compute translation quality scores for all hyps.
  filtered_refs = tf.tile(
      tf.reshape(filtered_refs, [-1, 1]),
      [1, p.decoder.beam_search.num_hyps_per_beam])
  filtered_hyps = tf.reshape(filtered_hyps, [-1])
  filtered_refs = tf.reshape(filtered_refs, [-1])
  norm_wer_errors, norm_wer_words = self._ComputeNormalizedWER(
      filtered_hyps, filtered_refs)

  ret_dict = {
      'target_ids': tgt.ids,
      'target_labels': tgt.labels,
      'target_weights': tgt.weights,
      'target_paddings': tgt.paddings,
      'transcripts': transcripts,
      'topk_decoded': topk.decoded,
      'topk_ids': topk.ids,
      'topk_lens': topk.lens,
      'topk_scores': topk.scores,
      'norm_wer_errors': norm_wer_errors,
      'norm_wer_words': norm_wer_words,
  }

  if not py_utils.use_tpu():
    ret_dict['utt_id'] = input_batch.sample_ids

  ret_dict.update(
      self.AddAdditionalDecoderMetricsToGraph(topk, filtered_hyps,
                                              filtered_refs, input_batch,
                                              decoder_outs))
  return ret_dict
def _BeamSearchDecode(self, input_batch):
  p = self.params
  with tf.name_scope('fprop'), tf.name_scope(p.name):
    encoder_outputs = self.enc.FPropDefaultTheta(input_batch.src)
    encoder_outputs = self.dec.AddExtraDecodingInfo(encoder_outputs,
                                                    input_batch.tgt)
    decoder_outs = self.dec.BeamSearchDecode(encoder_outputs)
    topk_hyps = decoder_outs.topk_hyps
    topk_ids = decoder_outs.topk_ids
    topk_lens = decoder_outs.topk_lens
    topk_scores = decoder_outs.topk_scores

    slen = tf.to_int32(
        tf.round(tf.reduce_sum(1 - input_batch.src.paddings, 1) - 1))
    srcs = self.input_generator.IdsToStrings(
        input_batch.src.ids, slen, self._GetTokenizerKeyToUse('src'))
    topk_decoded = self.input_generator.IdsToStrings(
        topk_ids, topk_lens - 1, self._GetTokenizerKeyToUse('tgt'))
    topk_decoded = tf.reshape(topk_decoded, tf.shape(topk_hyps))
    topk_scores = tf.reshape(topk_scores, tf.shape(topk_hyps))

    refs = self.input_generator.IdsToStrings(
        input_batch.tgt.labels,
        tf.to_int32(
            tf.round(
                tf.reduce_sum(1.0 - input_batch.tgt.paddings, 1) - 1.0)),
        self._GetTokenizerKeyToUse('tgt'))

    ret_dict = {
        'target_ids': input_batch.tgt.ids,
        'target_labels': input_batch.tgt.labels,
        'target_weights': input_batch.tgt.weights,
        'target_paddings': input_batch.tgt.paddings,
        'sources': srcs,
        'targets': refs,
        'topk_decoded': topk_decoded,
        'topk_lens': topk_lens,
        'topk_scores': topk_scores,
    }
    return ret_dict
def _ProcessLine(self, line):
  """A single-text-line processor.

  Gets a string tensor representing a line of text that has been read from
  the input file, and splits it into graphemes (characters). We use the
  original characters as the target labels, and the lowercased and
  punctuation-removed characters as the source labels.

  Args:
    line: a 1D string tensor.

  Returns:
    A list of tensors, in the order expected by __init__.
  """
  # Tokenize the input into integer ids.
  # tgt_ids has the start-of-sentence token prepended, and tgt_labels has the
  # end-of-sentence token appended.
  tgt_ids, tgt_labels, tgt_paddings = self.StringsToIds(
      tf.convert_to_tensor([line]))

  def Normalize(line):
    # Lowercase and remove punctuation.
    line = line.lower().translate(None, string.punctuation.encode('utf-8'))
    # Convert multiple consecutive spaces to a single one.
    line = b' '.join(line.split())
    return line

  normalized_line = tf.py_func(Normalize, [line], tf.string, stateful=False)
  _, src_labels, src_paddings = self.StringsToIds(
      tf.convert_to_tensor([normalized_line]), is_source=True)
  # The model expects the source without a start-of-sentence token.
  src_ids = src_labels

  # Compute the length for bucketing.
  bucket_key = tf.to_int32(
      tf.round(
          tf.maximum(tf.reduce_sum(1.0 - src_paddings),
                     tf.reduce_sum(1.0 - tgt_paddings))))
  tgt_weights = 1.0 - tgt_paddings

  # Return tensors in an order consistent with __init__.
  out_tensors = [
      src_ids, src_paddings, tgt_ids, tgt_paddings, tgt_labels, tgt_weights
  ]
  return [tf.squeeze(t, axis=0) for t in out_tensors], bucket_key
def SequenceLength(padding):
  """Computes the length of a sequence based on binary padding.

  Args:
    padding: A tensor of binary paddings shaped [batch, seqlen].

  Returns:
    seq_lens, A tensor of shape [batch] containing the non-padded length of
    each sequence in `padding` along the batch dimension.
  """
  seq_lens = tf.to_int32(tf.round(tf.reduce_sum(1 - padding, axis=1)))

  # Get rid of any extra dimensions.
  batch_size = tf.shape(padding)[0]
  seq_lens = tf.reshape(seq_lens, [batch_size], name='seq_lens')

  return seq_lens
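# Illustrative usage sketch (hypothetical helper and values): each row's
# length is the count of zeros in its padding vector.
def _ExampleSequenceLength():
  padding = tf.constant([[0., 0., 1., 1.],
                         [0., 0., 0., 0.]])
  return SequenceLength(padding)  # Expected: [2, 4].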
def Filter(self, outputs):
  """Optionally filters the data based on context info."""
  p = self.params
  if p.equality_filters is None:
    return 1

  allowed_example = tf.convert_to_tensor(True)
  for filter_key, filter_values in p.equality_filters:
    if filter_key not in outputs:
      raise ValueError(
          'Filter key `{}` not found in extracted data.'.format(filter_key))
    has_allowed_data = tf.reduce_any(
        tf.equal(outputs[filter_key], filter_values))
    allowed_example = tf.logical_and(allowed_example, has_allowed_data)

  not_allowed_example = 1 - tf.to_int32(allowed_example)
  return 1 + (not_allowed_example * input_extractor.BUCKET_UPPER_BOUND)
def Proc(record):
  """Parses a serialized tf.Example record."""
  outputs = [
      ('source_id', tf.VarLenFeature(tf.int64)),
      ('source_padding', tf.VarLenFeature(tf.float32)),
      ('target_id', tf.VarLenFeature(tf.int64)),
      ('target_padding', tf.VarLenFeature(tf.float32)),
      ('target_label', tf.VarLenFeature(tf.int64)),
      ('target_weight', tf.VarLenFeature(tf.float32)),
  ]
  features = tf.parse_single_example(record, dict(outputs))
  for k, v in six.iteritems(features):
    features[k] = v.values
  bucket_key = tf.to_int32(
      tf.maximum(tf.reduce_sum(1.0 - features['source_padding']),
                 tf.reduce_sum(1.0 - features['target_padding'])))
  return [features[k] for k, _ in outputs] + [bucket_key]
def _MaybePadSourceInputs(self, src_inputs, src_paddings):
  p = self.params
  if not p.append_eos_frame:
    return src_inputs, src_paddings

  per_src_len = tf.reduce_sum(1 - src_paddings, 1)
  per_src_len += 1
  max_src_len = tf.reduce_max(per_src_len)
  input_shape = tf.shape(src_inputs)
  input_len = tf.maximum(input_shape[1], tf.to_int32(max_src_len))
  pad_steps = input_len - input_shape[1]
  src_inputs = tf.concat([
      src_inputs,
      tf.zeros(
          inplace_ops.inplace_update(input_shape, 1, pad_steps),
          src_inputs.dtype)
  ], 1)
  src_paddings = 1 - tf.sequence_mask(
      tf.reshape(per_src_len, [input_shape[0]]), tf.reshape(input_len, []),
      src_paddings.dtype)
  return src_inputs, src_paddings
def bleu_score(predictions, labels, **unused_kwargs):
  """BLEU score computation between labels and predictions.

  An approximate BLEU scoring method since we do not glue word pieces or
  decode the ids and tokenize the output. By default, we use ngram order of 4
  and use brevity penalty. Also, this does not have beam search.

  Args:
    predictions: tensor, model predictions.
    labels: tensor, gold output.

  Returns:
    bleu: float, approx bleu score.
  """
  outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
  # Convert the outputs and labels to a [batch_size, input_length] tensor.
  outputs = tf.squeeze(outputs, axis=[-1, -2])
  labels = tf.squeeze(labels, axis=[-1, -2])

  bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32)
  return bleu, tf.constant(1.0)
def Proc(record):
  """Parses a serialized tf.Example record."""
  features = [
      ('uttid', tf.VarLenFeature(tf.string)),
      ('transcript', tf.VarLenFeature(tf.string)),
      ('frames', tf.VarLenFeature(tf.float32)),
  ]
  example = tf.parse_single_example(record, dict(features))
  fval = {k: v.values for k, v in six.iteritems(example)}
  # Reshape the flattened vector into its original time-major
  # representation.
  fval['frames'] = tf.reshape(
      fval['frames'], shape=[-1, self.params.frame_size])
  # Input duration determines the bucket.
  bucket_key = tf.to_int32(tf.shape(fval['frames'])[0])
  if self.params.append_eos_frame:
    bucket_key += 1
  tgt_ids, tgt_labels, tgt_paddings = self.StringsToIds(fval['transcript'])
  src_paddings = tf.zeros([tf.shape(fval['frames'])[0]], dtype=tf.float32)
  return (fval['uttid'], tgt_ids, tgt_labels, tgt_paddings, fval['frames'],
          src_paddings, bucket_key)
def _FlatOutputProcessor(source_id, record):
  """Returns a flattened list of 'processor(inputs)'."""
  processor_spec = tf_inspect.getargspec(processor)
  tf.logging.debug('GenericInput.processor.argspec=%s', processor_spec)
  processor_args = set(processor_spec.args) - set(['self'])
  if len(processor_args) == 1:
    output, bucketing_key = processor(record)
  elif processor_args == set(['source_id', 'record']):
    output, bucketing_key = processor(source_id=source_id, record=record)
  else:
    raise ValueError(
        'GenericInput: processor should take either a single arg '
        'or two args named as "source_id" and "record". '
        'Actual: %s' % processor_args)
  if isinstance(output, list):
    assert output
    assert all(isinstance(x, tf.Tensor) for x in output), '{}'.format(output)
  else:
    assert isinstance(output, py_utils.NestedMap), '{}'.format(output)
    assert output
    assert all(
        isinstance(x, tf.Tensor) for x in output.Flatten()), '{}'.format(
            output.DebugString())
  bucketing_key = tf.to_int32(bucketing_key)
  tf.logging.debug('Processor outputs=%s bucketing_key=%s', output,
                   bucketing_key)
  output_tmpl.out_values = output
  flat_output_tmpl = output_tmpl.Flatten()
  tf.logging.debug('Processor flat outputs=%s', flat_output_tmpl)
  tf.logging.debug('extra_inputs=%s extra_args=%s extra_vars=%s',
                   function.get_extra_inputs(), function.get_extra_args(),
                   function.get_extra_vars())
  assert not function.get_extra_args(), (
      'fns {} is not pure: extra_args={}'.format(processor,
                                                 function.get_extra_args()))
  return flat_output_tmpl + [bucketing_key]
def FProp(self, theta, x, x_paddings=None, eos_id=1,
          force_sample_last_token=True):
  """Applies SymbolInsertionLayer.

  We take in `x`, which represents the groundtruth sequence (i.e., English
  sequence). We return a sampled rollin (observed) canvas (i.e., random
  subset of the English sequence), as well as the target (indices) for an
  insertion-based model (i.e., the targets given the random observed subset).

  Args:
    theta: Ignored, this can be None.
    x: The symbol ids of shape `[batch_size, time_dim]`.
    x_paddings: The paddings (1 or 0) of shape `[batch_size, time_dim]` where
      0 is valid and 1 is invalid.
    eos_id: The <eos> token id to represent end-of-slot.
    force_sample_last_token: Set True to force sample the last token of `x`.

  Returns:
    A `NestedMap`.
      - canvas: The canvas (based off of the `rollin_policy`) of shape
        [batch_size, c_dim]. Note that, `c_dim` <= `time_dim` but need not be
        equal.
      - canvas_indices: The canvas indices (into `x`).
      - canvas_paddings: The paddings of `canvas_indices`.
      - target_indices: The target indices of shape [num_targets, 3].
        `num_targets` is the number of total targets in the entire batch.
        [:, 0] captures the batch, [:, 1] captures the slot, and [:, 2]
        captures the token. Each row [batch, slot, vocab] represents the
        indices of the target -- i.e., the batch, slot and vocab combination
        of the target. Typical usage of these indices is to tf.gather_nd
        the log-probs (from the softmax layer).
      - target_weights: The target weights.

  Raises:
    ValueError: If invalid params.
  """
  p = self.params

  batch_size = py_utils.GetShape(x)[0]
  time_dim = py_utils.GetShape(x)[1]

  if x_paddings is None:
    x_paddings = tf.zeros([batch_size, time_dim], tf.float32)

  oracle_policy = p.oracle_policy
  rollin_policy = (
      oracle_policy if p.rollin_policy == 'oracle' else p.rollin_policy)

  if rollin_policy != 'uniform':
    raise ValueError('Unknown or unsupported rollin policy: %s' %
                     rollin_policy)
  if oracle_policy != 'uniform':
    raise ValueError('Unknown or unsupported oracle policy: %s' %
                     oracle_policy)

  x_len = tf.to_int32(tf.round(tf.reduce_sum(1 - x_paddings, 1)))

  # Compute the desired length per example in the batch.
  ratio = tf.random.uniform([batch_size], 0.0, 1.0, seed=p.random_seed)
  if force_sample_last_token:
    c_len = tf.minimum(
        tf.cast(ratio * tf.cast(x_len, tf.float32), tf.int32),
        x_len - 1) + 1
  else:
    c_len = tf.minimum(
        tf.cast(ratio * tf.cast(x_len + 1, tf.float32), tf.int32), x_len)
  # Compute the maximum length across the batch.
  c_len_max = tf.reduce_max(c_len)

  # Grab subset of random valid indices per example.
  z_logits = tf.cast(
      tf.expand_dims(tf.range(time_dim), 0) >= tf.expand_dims(x_len, 1),
      tf.float32) * -1e9
  if force_sample_last_token:
    # Force sample the last token -- i.e., as indexed by `x_len - 1`. We can
    # accomplish this by adding +LARGE_NUMBER to the logits.
    z_logits += tf.cast(
        tf.equal(tf.expand_dims(tf.range(time_dim), 0),
                 tf.expand_dims(x_len - 1, 1)), tf.float32) * 1e9
  # Gumbel-max trick to sample (we only sample valid positions per sample in
  # the batch).
  z = -tf.math.log(-tf.math.log(
      tf.random.uniform([batch_size, time_dim], seed=p.random_seed)))
  unused_c_values, c_indices = tf.nn.top_k(z_logits + z, time_dim)

  # Trim everything > c_len_max.
  c_indices = c_indices[:, :c_len_max]

  # Invalidate any indices >= c_len, we use the last index as the default
  # invalid index.
  c_indices = tf.where(
      tf.expand_dims(tf.range(c_len_max), 0) < tf.expand_dims(c_len, 1),
      c_indices, tf.fill(py_utils.GetShape(c_indices), time_dim - 1))

  # Materialize the canvas.
  c_indices = tf.sort(c_indices)
  c = tf.gather_nd(
      x,
      tf.stack([
          tf.reshape(
              tf.tile(tf.expand_dims(tf.range(batch_size), 1),
                      [1, c_len_max]), [-1]),
          tf.reshape(c_indices, [-1])
      ], 1))
  c = tf.reshape(c, [batch_size, c_len_max])

  # Compute the paddings.
  c_paddings = 1 - tf.sequence_mask(c_len, c_len_max, dtype=x_paddings.dtype)
  c *= tf.cast(1 - c_paddings, tf.int32)

  indices = tf.concat([
      tf.reshape(
          tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, c_len_max]),
          [batch_size * c_len_max, 1]),
      tf.reshape(c_indices, [batch_size * c_len_max, 1])
  ], 1)
  x_token_is_observed = tf.scatter_nd(
      indices, tf.ones([batch_size * c_len_max], tf.int32),
      py_utils.GetShape(x))
  # `x_segments` captures which slot each `x` belongs to (both observed and
  # tokens that need to be observed).
  x_segments = tf.cumsum(x_token_is_observed, 1, exclusive=True)

  x_token_is_observed = tf.cast(x_token_is_observed, tf.bool)
  prev_x_token_is_observed = tf.pad(
      x_token_is_observed[:, :-1], [[0, 0], [1, 0]], constant_values=True)
  x_token_is_observed = tf.reshape(x_token_is_observed, [-1])
  prev_x_token_is_observed = tf.reshape(prev_x_token_is_observed, [-1])
  x_is_valid = tf.cast(1 - x_paddings, tf.bool)
  x_is_valid = tf.reshape(x_is_valid, [-1])

  # Remap all the observed to <eos>, note some of these need a zero weight
  # (or else there would be <eos> and valid token in the same slot).
  target_indices = tf.cast(tf.reshape(x, [-1, 1]), tf.int32)
  target_indices = tf.where(
      x_token_is_observed,
      tf.fill(py_utils.GetShape(target_indices), eos_id), target_indices)

  # TODO(williamchan): We give uniform 1.0 weight, however, math suggests
  # we may want to weigh this term by the original sequence length.
  target_weights = tf.ones_like(target_indices, tf.float32)

  # We need to set all the weights for <eos> which actually have valid tokens
  # in the slot to zero.
  target_weights = tf.where(
      x_token_is_observed & ~prev_x_token_is_observed,
      tf.zeros_like(target_weights), target_weights)

  # TODO(williamchan): Consider dropping the entries w/ weight zero.

  # Add the batch and slot indices.
  target_indices = tf.concat([
      tf.reshape(
          tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, time_dim]),
          [batch_size * time_dim, 1]),
      tf.reshape(x_segments, [-1, 1]), target_indices
  ], 1)

  # Select only the valid indices. The selected valid ones include slots w/
  # <eos>.
  target_indices = target_indices[x_is_valid]
  target_weights = target_weights[x_is_valid]

  return py_utils.NestedMap(
      canvas=c,
      canvas_indices=c_indices,
      canvas_paddings=c_paddings,
      target_indices=target_indices,
      target_weights=target_weights)
def Sample(self, decoder_theta, encoder_outputs, random_seed,
           init_state_callback, pre_step_callback, post_step_callback):
  """Samples target sequences, one target sequence per source sequence.

  (Please see beam_search_helper.py for description of decoder callbacks.)

  Args:
    decoder_theta: A NestedMap object containing weights' values of the
      decoder layer and its children layers, to be passed to decoder
      callbacks.
    encoder_outputs: the outputs of the encoder, to be passed to callbacks.
    random_seed: a scalar int32 tensor representing the random seed.
    init_state_callback: decoder._InitBeamSearchStateCallback.
    pre_step_callback: decoder._PreBeamSearchStepCallback.
    post_step_callback: decoder._PostBeamSearchStepCallback.

  Returns:
    A NestedMap containing the following tensors:
      - 'logits': [batch, max_target_length, vocab_size], representing the
        distribution from which target sequences are sampled.
      - 'ids': [batch, max_target_length] of int32, representing the target
        sequence ids, not including target_sos_id, but maybe ending with
        target_eos_id if end-of-sequence is reached before target_seq_len.
      - 'paddings': [batch, max_target_length] of 0/1, where 1 represents
        a padded timestep.
  """
  p = self.params
  assert p.temperature > 0
  # 'recurrent_theta' represents all cross-timestep information used by the
  # recurrent loop below, including layer theta and encoder outputs.
  recurrent_theta = py_utils.NestedMap(
      theta=decoder_theta,
      random_seed=random_seed,
      encoder_outputs=encoder_outputs)
  bs_result, bs_state = init_state_callback(
      recurrent_theta.theta, encoder_outputs, num_hyps_per_beam=1)
  batch = tf.shape(bs_result.log_probs)[0]
  recurrent_state0 = py_utils.NestedMap(
      timestep=tf.zeros(shape=[], dtype=tf.int32),
      logits=bs_result.log_probs,
      # Start with target_sos_id.
      ids=tf.fill([batch], tf.to_int32(p.target_sos_id)),
      bs_state=bs_state)
  inputs = py_utils.NestedMap(dummy=tf.zeros([p.target_seq_len, batch]))

  def Step(recurrent_theta, state0, inputs):
    """Computes one decoder step."""
    del inputs
    with tf.name_scope('single_sampler_step'):
      # Compute logits and states.
      bs_result, bs_state1 = pre_step_callback(
          recurrent_theta.theta,
          recurrent_theta.encoder_outputs,
          tf.expand_dims(state0.ids, 1),  # [batch, 1].
          state0.bs_state,
          num_hyps_per_beam=1)
      batch = tf.shape(bs_result.log_probs)[0]
      state1 = py_utils.NestedMap(timestep=state0.timestep + 1)
      state1.logits = bs_result.log_probs
      # Sample ids from logits. [batch].
      state1.ids = tf.reshape(
          tf.random.stateless_multinomial(
              state1.logits / p.temperature,
              num_samples=1,
              seed=tf.stack([recurrent_theta.random_seed, state0.timestep]),
              output_dtype=state0.ids.dtype,
              name='sample_next_id'), [batch])
      if 'is_last_chunk' in bs_result and p.target_eoc_id >= 0:
        state1.ids = tf.where(
            tf.logical_and(bs_result.is_last_chunk,
                           tf.equal(state1.ids, p.target_eoc_id)),
            tf.fill(tf.shape(state1.ids), p.target_eos_id), state1.ids)
      state1.bs_state = post_step_callback(recurrent_theta.theta,
                                           recurrent_theta.encoder_outputs,
                                           state1.ids, bs_state1)
    return state1, py_utils.NestedMap()

  accumulated_states, _ = recurrent.Recurrent(recurrent_theta,
                                              recurrent_state0, inputs, Step)
  result = py_utils.NestedMap(
      logits=tf.transpose(accumulated_states.logits, [1, 0, 2]),
      ids=tf.transpose(accumulated_states.ids))
  result.paddings = tf.cast(
      _ComputePaddings(result.ids, p.target_eos_id), result.logits.dtype)
  # Force ids to be eos_id if the timestep is padded.
  result.ids = tf.where(
      tf.equal(result.paddings, 0), result.ids,
      tf.fill(tf.shape(result.ids), p.target_eos_id))
  static_batch_size = bs_result.log_probs.shape[0]
  result.ids.set_shape([static_batch_size, p.target_seq_len])
  result.paddings.set_shape([static_batch_size, p.target_seq_len])
  return result
def _ComputePaddings(ids, eos_id):
  is_eos = tf.to_int32(tf.equal(ids, eos_id))
  # eos_in_prefix[i, j] = any(ids[i, k] == eos_id for k in range(j))
  eos_in_prefix = tf.cumsum(is_eos, axis=-1, exclusive=True)
  return tf.where(
      tf.equal(eos_in_prefix, 0), tf.zeros_like(ids), tf.ones_like(ids))
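# Illustrative sketch (hypothetical helper and values): the exclusive cumsum
# marks every position strictly after the first eos, so the eos itself stays
# unpadded while everything following it is padded. This matches Sample's
# contract that ids may end with target_eos_id.
def _ExampleComputePaddings():
  ids = tf.constant([[5, 2, 7, 7]])  # eos_id == 2 appears at position 1.
  return _ComputePaddings(ids, eos_id=2)  # Expected: [[0, 0, 1, 1]].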
def _InferenceSubgraph_Default(self):
  """Default inference subgraph.

  Returns:
    (fetches, feeds), with:
      - fetches: A dictionary of fetches, containing:
        - log_pplx_per_token: A matrix of shape [batch, time]. [i, j] is
          i-th input text's j-th token's log prob.
        - paddings: A matrix of shape [batch, time]. The padding mask.
        - log_pplx_per_sample: A vector of shape [batch]. [i] is i-th input
          text's log prob.
        - num_oovs_per_sample: A vector of shape [batch] counting the total
          number of out-of-vocabulary tokens in each input.
        - tokens_from_labels: A vector of shape [batch] returning the
          predicted tokens as a sequence after mapping them back to strings
          from ids using the vocabulary.
        - ids: A matrix of shape [batch, time]. [i, j] is i-th input text's
          j-th token's id.
      - feeds: A dictionary of feeds, containing:
        - text: A placeholder for a vector of strings.
  """
  text = tf.placeholder(tf.string, shape=[None])
  # [batch, time]
  ids, labels, paddings = self.input_generator.StringsToIds(text)
  lengths = tf.reduce_sum(tf.to_int32(1 - paddings), axis=1)
  tokens_from_labels = self.input_generator.IdsToStrings(labels, lengths)
  oovs = tf.equal(labels, self.input_generator.tokenizer.unk_id)
  num_oovs_per_sample = tf.to_int32(
      tf.reduce_sum(tf.to_float(oovs) * (1 - paddings), axis=1))
  # [time, batch]
  ids, paddings, labels, weights = self._TrimIfPossibleThenTranspose(
      ids, paddings, labels, 1.0 - paddings)
  batch_size = tf.shape(ids)[1]
  xent_output, _ = self.lm.FPropDefaultTheta(
      inputs=ids,
      paddings=paddings,
      state0=self.lm.zero_state(self.theta.lm, batch_size),
      labels=py_utils.NestedMap(class_ids=labels, class_weights=weights))

  per_example_xent = py_utils.HasShape(xent_output.per_example_xent,
                                       tf.shape(ids))
  log_pplx_per_sample = tf.reduce_sum(
      per_example_xent * (1 - paddings), axis=0)
  fetches = {
      'log_pplx_per_token':  # [batch, time]
          tf.transpose(per_example_xent),
      'paddings':  # [batch, time]
          tf.transpose(paddings),
      'lengths':  # [batch]
          lengths,
      'log_pplx_per_sample':  # [batch]
          log_pplx_per_sample,
      'num_oovs_per_sample':  # [batch], int32
          num_oovs_per_sample,
      'tokens_from_labels':  # [batch], string
          tokens_from_labels,
      'ids':  # [batch, time], int32
          ids
  }
  feeds = {'text': text}
  return fetches, feeds
def BeamSearchDecode(self,
                     theta,
                     encoder_outputs,
                     num_hyps_per_beam_override=0,
                     init_beam_search_state=None,
                     pre_beam_search_step_callback=None,
                     post_beam_search_step_callback=None,
                     max_steps=None):
  """Performs beam-search based decoding.

  Args:
    theta: A NestedMap object containing weights' values of the decoder
      layer and its children layers.
    encoder_outputs: A NestedMap containing encoder outputs to be passed to
      the callbacks.
    num_hyps_per_beam_override: If set to a value <= 0, this parameter is
      ignored. If set to a value > 0, then this value will be used to
      override `p.num_hyps_per_beam`.
    init_beam_search_state: The `InitBeamSearchState` callback. Please refer
      to the class header comments for more details.
    pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    post_beam_search_step_callback: The `PostBeamSearchStepCallback`
      callback. Please refer to the class header comments for more details.
    max_steps: maximum beam search steps. If None, use
      self.params.target_seq_len.

  Returns:
    A `BeamSearchDecodeOutput`.
  """
  p = self.params
  num_hyps_per_beam = p.num_hyps_per_beam
  if num_hyps_per_beam_override > 0:
    num_hyps_per_beam = num_hyps_per_beam_override
  if max_steps is None:
    max_steps = p.target_seq_len

  initial_results, other_states = init_beam_search_state(
      theta, encoder_outputs, num_hyps_per_beam)

  num_hyps = tf.shape(initial_results.log_probs)[0]
  num_beams = num_hyps // num_hyps_per_beam

  if 'step_ids' in initial_results:
    # [num_hyps, 1]
    step_ids = tf.ensure_shape(initial_results.step_ids, [None, 1])
  else:
    step_ids = tf.fill([num_hyps, 1],
                       tf.constant(p.target_sos_id, dtype=tf.int32))

  min_score = -1e36
  best_scores = (tf.zeros(shape=[num_beams], dtype=p.dtype) + min_score)
  cumulative_scores = tf.zeros(shape=[num_hyps], dtype=p.dtype)
  in_scores = tf.zeros([max_steps, num_hyps], dtype=p.dtype)
  in_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
  in_prev_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
  in_done_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.string)
  bs_atten_probs = tf.zeros(
      [max_steps, num_hyps,
       tf.shape(initial_results.atten_probs)[1]],
      dtype=p.dtype)
  cur_step = tf.constant(0, dtype=tf.int32)
  all_done = tf.constant(False, dtype=tf.bool)
  core_bs_states = (best_scores, cumulative_scores, in_scores, in_hyps,
                    in_prev_hyps, in_done_hyps, bs_atten_probs)

  def LoopContinue(cur_step, all_done, unused_step_ids,
                   unused_core_bs_states, unused_other_states_list):
    return tf.logical_and(cur_step < max_steps, tf.logical_not(all_done))

  def LoopBody(cur_step, unused_all_done, step_ids, core_bs_states,
               other_states_list):
    (cur_step, all_done, new_step_ids, new_bs_states,
     new_other_states) = self._BeamSearchStep(
         theta, encoder_outputs, cur_step, step_ids, core_bs_states,
         other_states.Pack(other_states_list), num_hyps_per_beam,
         pre_beam_search_step_callback, post_beam_search_step_callback)
    return (cur_step, all_done, new_step_ids, new_bs_states,
            new_other_states.Flatten())

  flat_other_states = other_states.Flatten()
  _, _, _, final_bs_states, flat_final_other_states = tf.while_loop(
      LoopContinue,
      LoopBody,
      loop_vars=(cur_step, all_done, step_ids, core_bs_states,
                 flat_other_states),
      parallel_iterations=10,
      back_prop=False,
      swap_memory=False,
      shape_invariants=(tf.TensorShape(cur_step.get_shape()),
                        tf.TensorShape(all_done.get_shape()),
                        tf.TensorShape(step_ids.get_shape()),
                        _GetShapes(core_bs_states),
                        _GetShapes(flat_other_states, none_shapes=True)))

  # [target_seq_len, num_beams * num_hyps_per_beam].
  final_done_hyps = final_bs_states[5]
  final_other_states = other_states.Pack(flat_final_other_states)

  # TODO(rpang): avoid inspecting 'encoder_outputs'.
  source_paddings = encoder_outputs.padding
  if isinstance(source_paddings, py_utils.NestedMap):
    source_seq_lengths = tf.to_int32(
        tf.reduce_sum(1.0 - tf.transpose(source_paddings.Flatten()[0]), 1))
  else:
    source_seq_lengths = tf.to_int32(
        tf.reduce_sum(1.0 - tf.transpose(source_paddings), 1))

  # [num_beams, num_hyps_per_beam].
  topk_hyps = ops.top_k_terminated_hyps(
      final_done_hyps,
      source_seq_lengths,
      k=num_hyps_per_beam,
      num_hyps_per_beam=num_hyps_per_beam,
      length_normalization=p.length_normalization,
      coverage_penalty=p.coverage_penalty,
      target_seq_length_ratio=p.target_seq_length_ratio,
      eoc_id=p.target_eoc_id,
      merge_paths=p.merge_paths)
  # [num_beams * num_hyps_per_beam, ...].
  max_seq_length = 0 if isinstance(max_steps, tf.Tensor) else max_steps
  topk_ids, topk_lens, topk_scores = ops.unpack_hyp(
      tf.reshape(topk_hyps, [-1]), max_seq_length=max_seq_length)
  # [num_beams, num_hyps_per_beam].
  topk_scores = tf.reshape(topk_scores, tf.shape(topk_hyps))

  return BeamSearchDecodeOutput(final_done_hyps, topk_hyps, topk_ids,
                                topk_lens, topk_scores, None,
                                final_other_states)
def AssignAnchors(self,
                  anchor_bboxes,
                  gt_bboxes,
                  gt_bboxes_labels,
                  gt_bboxes_mask,
                  foreground_assignment_threshold=0.5,
                  background_assignment_threshold=0.35,
                  background_class_id=0,
                  force_match=True,
                  similarity_fn=None):
  """Assigns anchors to bboxes using a similarity function (SSD-based).

  Each anchor box is assigned to the top matching ground truth box. Ground
  truth boxes can be assigned to multiple anchor boxes.

  Assignments can result in 3 outcomes:
    Positive assignment (if score >= foreground_assignment_threshold):
      assigned_gt_labels will reflect the assigned box label and
      assigned_cls_mask will be set to 1.0.
    Background assignment (if score <= background_assignment_threshold):
      assigned_gt_labels will be background_class_id and assigned_cls_mask
      will be set to 1.0.
    Ignore assignment (otherwise):
      assigned_gt_labels will be background_class_id and assigned_cls_mask
      will be set to 0.0.

  The detection loss function would usually:
    Use assigned_cls_mask for weighting the classification loss. The mask is
      set such that the loss applies to foreground and background
      assignments only - ignored anchors will be set to 0.
    Use assigned_reg_mask for weighting the regression loss. The mask is set
      such that the loss applies to foreground assignments only.

  The thresholds (foreground_assignment_threshold and
  background_assignment_threshold) should be tuned per dataset.

  TODO(jngiam): Consider having a separate threshold for regression boxes;
  a separate threshold is used in PointRCNN.

  Args:
    anchor_bboxes: tf.float32. [A, 7], where [..., :] corresponds to box
      parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes: tf.float32. [G, 7], where [..., :] corresponds to ground
      truth box parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes_labels: tensor with shape [G]. Ground truth labels for each
      bounding box.
    gt_bboxes_mask: tensor with shape [G]. Mask for ground truth boxes, 1
      iff the gt_bbox is a real bbox.
    foreground_assignment_threshold: Similarity score threshold for
      assigning foreground bounding boxes; scores need to be >=
      foreground_assignment_threshold to be assigned to foreground.
    background_assignment_threshold: Similarity score threshold for
      assigning background bounding boxes; scores need to be <=
      background_assignment_threshold to be assigned to background.
    background_class_id: class id to be assigned to anchors_gt_class if no
      anchor boxes match.
    force_match: Boolean specifying if force matching is enabled. If force
      matching is enabled, then matched anchors which are also the highest
      scoring with a ground-truth box are considered foreground matches as
      long as their similarity score > 0.
    similarity_fn: Function that computes a similarity score (e.g., IOU)
      between pairs of bounding boxes. This function should take in two
      tensors corresponding to anchor and ground-truth bboxes, and return a
      matrix [A, G] with the similarity score between each pair of bboxes.
      The score must be non-negative, with greater scores representing more
      similar boxes. The fore/background_assignment_thresholds will be
      applied to this score to determine if an anchor is foreground,
      background or ignored. If set to None, the function will default to
      IOU2DRotatedBoxes.

  Returns:
    NestedMap with the following keys:
      assigned_gt_bbox: shape [A, 7] bbox parameters assigned to each
        anchor.
      assigned_gt_similarity_score: shape [A] (iou) score between the anchor
        and the gt bbox.
      assigned_gt_labels: shape [A] label assigned to bbox.
      assigned_cls_mask: shape [A] mask for classification loss per anchor.
        This should be 1.0 if the anchor has a foreground or background
        assignment; otherwise, it will be assigned to 0.0.
      assigned_reg_mask: shape [A] mask for regression loss per anchor. This
        should be 1.0 if the anchor has a foreground assignment; otherwise,
        it will be assigned to 0.0. Note: background anchors do not have
        regression targets.
  """
  if similarity_fn is None:
    similarity_fn = self.IOU2DRotatedBoxes

  # Shape validation.
  anchor_bboxes = py_utils.HasShape(anchor_bboxes, [-1, 7])
  num_anchor_bboxes, _ = py_utils.GetShape(anchor_bboxes, 2)
  gt_bboxes = py_utils.HasShape(gt_bboxes, [-1, 7])
  num_gt_bboxes, _ = py_utils.GetShape(gt_bboxes, 2)

  # Compute similarity score and reduce max by anchors and by ground-truth.
  similarity_score = similarity_fn(anchor_bboxes, gt_bboxes)
  similarity_score = py_utils.HasShape(similarity_score,
                                       [num_anchor_bboxes, num_gt_bboxes])

  # Reduce over ground-truth boxes, so we have the max score per anchor.
  anchor_max_score = tf.reduce_max(similarity_score, axis=1)
  anchor_max_idx = tf.argmax(similarity_score, axis=1)

  if force_match:
    # Reduce over anchors, so we have the max score per ground truth box.
    gt_max_score = tf.reduce_max(similarity_score, axis=0, keep_dims=True)

    # Force matches occur when the top matching gt bbox for an anchor is the
    # top matching anchor for the gt bbox. When force matching, we match
    # these boxes as long as their similarity score exceeds 0.
    force_matches = (
        tf.equal(similarity_score, gt_max_score)
        & tf.equal(similarity_score, anchor_max_score[..., tf.newaxis])
        & tf.greater(similarity_score, 0.)
        & tf.cast(gt_bboxes_mask[tf.newaxis, ...], tf.bool))
    force_match_indicator = tf.reduce_any(force_matches, axis=1)
    force_match_idx = tf.argmax(tf.to_int32(force_matches), axis=1)

    # In assigning foreground/background anchors later,
    # force_match_indicator is used to determine which anchors are force
    # foreground, and the index assigned will be taken from anchor_max_idx.

    # Force matches must also be the max scoring gt bbox per anchor.
    # We overwrite anchor_max_idx to ensure that the right match is done.
    anchor_max_idx = tf.where(force_match_indicator, force_match_idx,
                              anchor_max_idx)

  # Ensure that max score boxes are not padded boxes by setting score to 0
  # for boxes that are padded.
  gathered_mask = tf.batch_gather(gt_bboxes_mask, anchor_max_idx)
  anchor_max_score = tf.where(
      tf.equal(gathered_mask, 1), anchor_max_score,
      tf.zeros_like(anchor_max_score))

  # Boolean tensors corresponding to whether an anchor is background or
  # foreground based on thresholding.
  background_anchors = tf.less_equal(anchor_max_score,
                                     background_assignment_threshold)
  foreground_anchors = tf.greater_equal(anchor_max_score,
                                        foreground_assignment_threshold)

  if force_match:
    # Background anchors are below threshold and not force matches.
    background_anchors &= ~force_match_indicator
    # Foreground anchors are above thresholds or force matches.
    foreground_anchors |= force_match_indicator

  # Add dummy background bbox to gt_boxes to facilitate batch gather.
  dummy_bbox = tf.constant([[0, 0, 0, 1, 1, 1, 0]], dtype=tf.float32)

  # Since we are concatenating the dummy bbox, the index corresponds to the
  # number of boxes.
  dummy_bbox_idx = py_utils.GetShape(gt_bboxes, 1)[0]

  gt_bboxes = tf.concat([gt_bboxes, dummy_bbox], axis=0)
  gt_bboxes_labels = tf.concat([gt_bboxes_labels, [background_class_id]],
                               axis=0)

  # Gather indices so that all foreground boxes are gathered from gt_bboxes,
  # while all background and ignore boxes gather the dummy_bbox.
  anchor_gather_idx = tf.where(
      foreground_anchors, anchor_max_idx,
      tf.constant(
          dummy_bbox_idx,
          shape=py_utils.GetShape(anchor_max_idx),
          dtype=anchor_max_idx.dtype))

  # Gather the bboxes and weights.
  assigned_gt_bbox = tf.batch_gather(gt_bboxes, anchor_gather_idx)
  assigned_gt_labels = tf.batch_gather(gt_bboxes_labels, anchor_gather_idx)

  # Set masks for classification and regression losses.
  assigned_cls_mask = tf.to_float(background_anchors | foreground_anchors)
  assigned_reg_mask = tf.to_float(foreground_anchors)

  return py_utils.NestedMap(
      assigned_gt_bbox=assigned_gt_bbox,
      assigned_gt_similarity_score=anchor_max_score,
      assigned_gt_labels=assigned_gt_labels,
      assigned_cls_mask=assigned_cls_mask,
      assigned_reg_mask=assigned_reg_mask)
def bucket_fn(num):
  # Drops record if num[0] is odd.
  return tf.cond(
      tf.equal(tf.mod(num[0], 2), 0), lambda: 1,
      lambda: -tf.to_int32(num[0]))
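# Illustrative sketch (hypothetical helper and values): even leading elements
# map to bucket 1 and are kept; odd ones map to a negative bucket key, which
# the generic input pipeline treats as "drop this record".
def _ExampleBucketFn():
  return bucket_fn(tf.constant([4])), bucket_fn(tf.constant([3]))
  # Expected: (1, -3).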
def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
  """Takes a tensor of strings and returns id/padding tensors.

  This generates `token_ids`, `target_ids`, and `paddings` in the format
  that is expected for tokenizers. This performs padding to a fixed length
  and appends the end-of-sentence token as appropriate.

  Args:
    strs: a string Tensor.
    max_length: a python integer. The second dimension of the returned
      arrays. All sequences are padded or truncated to that length.
    append_eos: a python bool. See `BaseTokenizer` for explanation.
    languages: A vector of strings with the same length as `strs`.

  Returns:
    token_ids: a tensor of sequences of WPM ids starting with SOS. Sequences
      always end with EOS unless the sequence exceeds the maximum length.
      Always padded with EOS.
    target_ids: a tensor of sequences of WPM ids not starting with SOS but
      ending with EOS. Always padded with EOS.
    paddings: a tensor of floats indicating, at each position, whether the
      corresponding position is padded.
  """
  p = self.params
  if append_eos is None:
    append_eos = p.append_eos

  batch_size = py_utils.GetShape(strs)[0]
  token_ids_ta = tf.TensorArray(tf.int32, batch_size)
  target_ids_ta = tf.TensorArray(tf.int32, batch_size)
  paddings_ta = tf.TensorArray(tf.float32, batch_size)

  def _TokenizeOneSentence(i, strs, token_ids_ta, target_ids_ta,
                           paddings_ta):
    """Tokenizes a single sentence."""
    ids, _ = self._wpm_encoder.Encode(strs[i])

    if append_eos:
      ids = tf.concat([ids, [self.eos_id]], axis=0)

    # This truncates after the eos is added, so some sentences might
    # not have </s> at the end.
    token_ids_ta = token_ids_ta.write(
        i,
        py_utils.PadOrTrimTo(
            tf.concat([[self.sos_id], ids], axis=0), [max_length],
            self.eos_id))
    target_ids_ta = target_ids_ta.write(
        i, py_utils.PadOrTrimTo(ids, [max_length], self.eos_id))
    paddings_ta = paddings_ta.write(
        i,
        py_utils.PadOrTrimTo(
            tf.zeros_like(ids, dtype=tf.float32), [max_length], 1.))

    return i + 1, strs, token_ids_ta, target_ids_ta, paddings_ta

  _, _, token_ids_ta, target_ids_ta, paddings_ta = tf.while_loop(
      lambda i, *_: i < batch_size,
      _TokenizeOneSentence,
      loop_vars=(tf.constant(0, tf.int32), strs, token_ids_ta,
                 target_ids_ta, paddings_ta),
      parallel_iterations=30,
      back_prop=False)

  token_ids = token_ids_ta.stack()
  target_ids = target_ids_ta.stack()
  paddings = paddings_ta.stack()

  if not p.pad_to_max_length:
    maxlen = tf.to_int32(
        tf.round(tf.reduce_max(tf.reduce_sum(1.0 - paddings, axis=1))))
    token_ids = token_ids[:, :maxlen]
    target_ids = target_ids[:, :maxlen]
    paddings = paddings[:, :maxlen]

  return token_ids, target_ids, paddings
def ComputeLoss(self, theta, predictions, input_batch):
  """Computes loss and other metrics for the given predictions.

  Args:
    theta: A `.NestedMap` object containing variable values of this task.
    predictions: The output of `ComputePredictions`, contains: logits - [b,
      nx, ny, nz, na, 7 + num_classes]. na is the number of anchor boxes per
      cell. [..., :7] are (dx, dy, dz, dw, dl, dh, dt).
    input_batch: The input batch from which we access the groundtruth.

  Returns:
    Two dicts defined as BaseTask.ComputeLoss.
  """
  p = self.params
  predicted_residuals = py_utils.HasShape(
      predictions.residuals, [-1, -1, -1, -1, p.num_anchors, 7])
  predicted_class_logits = py_utils.HasShape(
      predictions.classification_logits,
      [-1, -1, -1, -1, p.num_anchors, p.num_classes])
  bs, nx, ny, nz, na, _ = py_utils.GetShape(predicted_class_logits, 6)

  # Compute class and regression weights.
  class_weights = input_batch.assigned_cls_mask
  class_weights = py_utils.HasShape(class_weights, [bs, nx, ny, nz, na])
  reg_weights = input_batch.assigned_reg_mask
  reg_weights = py_utils.HasShape(reg_weights, [bs, nx, ny, nz, na])
  reg_weights = tf.expand_dims(reg_weights, -1)

  if p.loss_norm_type == LossNormType.NORM_BY_NUM_POSITIVES:
    # Compute number of positive anchors per example.
    foreground_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                                        [bs, nx, ny, nz, na])
    # Sum to get the number of foreground anchors for each example.
    loss_normalization = tf.reduce_sum(foreground_mask, axis=[1, 2, 3, 4])
    loss_normalization = tf.maximum(loss_normalization,
                                    tf.ones_like(loss_normalization))
    # Reshape for broadcasting.
    loss_normalization = tf.reshape(loss_normalization, [bs, 1, 1, 1, 1, 1])

    class_weights /= loss_normalization
    reg_weights /= loss_normalization

  # Classification loss.
  assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                         [bs, nx, ny, nz, na])
  class_loss = py_utils.SigmoidCrossEntropyFocalLoss(
      logits=predicted_class_logits,
      labels=tf.one_hot(assigned_gt_labels, p.num_classes),
      alpha=p.focal_loss_alpha,
      gamma=p.focal_loss_gamma)
  class_loss *= class_weights[..., tf.newaxis]
  class_loss_sum = tf.reduce_sum(class_loss)

  # Regression loss.
  anchor_localization_residuals = py_utils.HasShape(
      input_batch.anchor_localization_residuals, [bs, nx, ny, nz, na, 7])

  # Location and dimensions loss.
  reg_loc_and_dims_loss = self._utils.ScaledHuberLoss(
      predictions=py_utils.HasShape(predicted_residuals[..., :6],
                                    [bs, nx, ny, nz, na, 6]),
      labels=anchor_localization_residuals[..., :6],
      delta=1 / (3.**2))

  # Rotation loss with SmoothL1(sin(delta)).
  rot_delta = (
      predicted_residuals[..., 6:] -
      input_batch.anchor_localization_residuals[..., 6:])
  reg_rot_loss = self._utils.ScaledHuberLoss(
      predictions=tf.sin(rot_delta),
      labels=tf.zeros_like(rot_delta),
      delta=1 / (3.**2))

  # Direction loss.
  if p.direction_classifier_weight > 0.0:
    # The target rotations are in the assigned_gt_bbox tensor,
    # which already has assigned a gt bounding box to every anchor.
    rot_target = input_batch.assigned_gt_bbox[..., 6]
    # If rotation is > 0, the class is 1, else it is 0.
    rot_dir = tf.to_int32(rot_target > 0.)

    # Compute one-hot labels as a target.
    rot_dir_onehot = tf.one_hot(rot_dir, 2)

    # Manually handle loss reduction.
    dir_loss = tf.losses.softmax_cross_entropy(
        onehot_labels=rot_dir_onehot,
        logits=predictions.predicted_dir,
        weights=tf.squeeze(reg_weights, axis=-1),
        reduction=tf.losses.Reduction.NONE)
    # Reduce across all dimensions (we'll divide by the batch size below).
    dir_loss_sum = tf.reduce_sum(dir_loss)
  else:
    dir_loss_sum = 0.0

  # Compute loss contribution from location and dimension separately.
  reg_loc_loss = reg_loc_and_dims_loss[..., :3] * reg_weights
  reg_loc_loss_sum = tf.reduce_sum(reg_loc_loss)

  reg_dim_loss = reg_loc_and_dims_loss[..., 3:6] * reg_weights
  reg_dim_loss_sum = tf.reduce_sum(reg_dim_loss)

  # Compute rotation loss contribution.
  reg_rot_loss *= reg_weights
  reg_rot_loss_sum = tf.reduce_sum(reg_rot_loss)

  # Num. predictions.
  # TODO(zhifengc): Consider other normalization factors. E.g., # of bboxes.
  preds = tf.cast(bs, class_loss_sum.dtype)

  # Normalize all of the components by batch size.
  reg_loc_loss = reg_loc_loss_sum / preds
  reg_dim_loss = reg_dim_loss_sum / preds
  reg_rot_loss = reg_rot_loss_sum / preds
  class_loss = class_loss_sum / preds
  dir_loss = dir_loss_sum / preds

  # Compute total localization regression loss.
  reg_loss = (
      p.location_loss_weight * reg_loc_loss +
      p.dimension_loss_weight * reg_dim_loss +
      p.rotation_loss_weight * reg_rot_loss)

  # Apply weights to normalized class losses.
  loss = (
      class_loss * p.classification_loss_weight +
      reg_loss * p.localization_loss_weight +
      dir_loss * p.direction_classifier_weight)

  metrics_dict = {
      'loss': (loss, preds),
      'loss/class': (class_loss, preds),
      'loss/reg': (reg_loss, preds),
      'loss/reg/rot': (reg_rot_loss, preds),
      'loss/reg/loc': (reg_loc_loss, preds),
      'loss/reg/dim': (reg_dim_loss, preds),
      'loss/dir': (dir_loss, preds),
  }

  per_example_dict = {
      'residuals': predicted_residuals,
      'classification_logits': predicted_class_logits,
  }
  return metrics_dict, per_example_dict