def _RelPositionBiasCausal(query, abs_pos_emb):
  """Computes relative position bias for causal self attention."""
  _, t, n, h = py_utils.GetShape(query)

  abs_pos_emb = py_utils.HasShape(abs_pos_emb, [2 * t - 1, n, h])

  # abs_pos_emb is [-(T-1), -(T-2), ... 0, 1, 2, ... T-1]
  # Retain only half and change order to [T-1, T-2, ... 0]
  # [T, N, H]
  abs_pos_emb = tf.reverse(abs_pos_emb, [0])[:t]

  # [B, N, T, L=T]
  term_bd = tf.einsum('BTNH,LNH->BNTL', query, abs_pos_emb)

  # Perform shifting.
  term_bd = tf.reverse(term_bd, [2, 3])
  term_bd = RelShift(term_bd)
  return tf.reverse(term_bd, [2, 3])
def ComputeLoss(self, theta, input_batch, predicted):
  diff = predicted - input_batch.tgt_ids
  per_example_loss = diff * diff
  batch_dim = py_utils.GetShape(per_example_loss)[0]

  def replicate_var(name):
    return tf.convert_to_tensor(
        [self._private_vars[name]] * batch_dim, dtype=tf.float32)

  metrics = {'loss': (tf.reduce_sum(per_example_loss), batch_dim)}
  per_example_tensors = {
      'input': input_batch.src_ids,
      'loss': per_example_loss,
      'diff': diff,
      'm': replicate_var('m'),
      'b': replicate_var('b'),
  }
  return metrics, per_example_tensors
def _BuildCrossBatchMixingDataSource(self):
  """Reads and returns input batches from a p.file_pattern list.

  `p.file_pattern` should be a list of (file_pattern, weight,
  optional_bprop_filter) tuples. Every batch returned is filled from one
  source only, and batches are mixed proportionally to the weights.
  Additionally, backprop filters may be applied to the different input
  sources.

  Returns:
    A tuple which contains the output of `self._DataSourceFromFilePattern()`
    and a tensor of size [batch_size, number of data sources] which contains
    the source selected for each element in the input batch. With cross batch
    mixing the complete input batch comes from the same source.

  Raises:
    ValueError: If unknown token type.
  """
  p = self.params
  input_file_pattern = p.file_pattern

  def _MakeDataSourceFromFilePatternFunc(file_pattern):
    # It's important to invoke self._DataSourceFromFilePattern() inside the
    # lambda to make sure that the record is drawn from the data source only
    # if it will be used.
    return lambda: self._DataSourceFromFilePattern(file_pattern)

  inputs = []
  weights = []
  self._bprop_variable_filters = []
  for input_entry in input_file_pattern:
    file_pattern, weight = input_entry[:2]
    inputs.append(_MakeDataSourceFromFilePatternFunc(file_pattern))
    weights.append(weight)
    bprop_variable_filter = input_entry[2] if len(input_entry) > 2 else ''
    self._bprop_variable_filters.append(bprop_variable_filter)
  data_source, selected_bprop = py_utils.MixByWeight(inputs, weights)
  # TODO(neerajgaur): Remove _bprop_onehot and change code that uses it to
  # use source_selected from input_batch.
  self._bprop_onehot = selected_bprop
  batch_size = py_utils.GetShape(tf.nest.flatten(data_source)[0])[0]
  return data_source, tf.tile(
      tf.expand_dims(selected_bprop, 0), [batch_size, 1])
def FProp(self, theta, input_batch):
  """Performs signal processing on a sequence of PCM data.

  NOTE: This implementation does not currently support paddings; they are
  accepted only for compatibility with the super-class.
  TODO(laurenzo): Rework this to support paddings.

  Args:
    theta: Layer theta.
    input_batch: PCM input map:
      - 'src_inputs': int16 or float32 tensor of PCM audio data, scaled to
        +/-32768 (versus [-1..1)!). Shaped: [batch, frame_count].
      - 'paddings': per frame 0/1 paddings. Shaped: [batch, frame].

  Returns:
    NestedMap of encoder inputs which can be passed directly to a compatible
    encoder and contains:
      - 'src_inputs': inputs to the encoder, minimally of shape
        [batch, time, ...].
      - 'paddings': a 0/1 tensor of shape [batch, time].
  """
  p = self.params
  pcm_audio_data = input_batch.src_inputs
  batch_size, frame_count = py_utils.GetShape(pcm_audio_data, 2)
  mel_spectrogram_norm = self._FPropChunk(theta, pcm_audio_data)

  # Stacking across the whole sequence.
  assert p.left_context == 2, 'Only p.left_context == 2 is implemented.'
  first_frame = mel_spectrogram_norm[:, 0:1, :]
  padded_mel_spectrogram = tf.concat(
      (first_frame, first_frame, mel_spectrogram_norm), axis=1)
  frame_count = tf.shape(padded_mel_spectrogram)[1] // 3
  triple_mel = tf.reshape(padded_mel_spectrogram[:, 0:3 * frame_count, :],
                          [batch_size, frame_count, 3 * p.num_bins])
  output_padding = 0 * tf.reduce_sum(triple_mel, axis=2)

  # Add feature dim. Shape = [batch, time, features, 1]
  outputs = tf.expand_dims(triple_mel, -1)
  return py_utils.NestedMap(src_inputs=outputs, paddings=output_padding)
def _FrequencyMask(self,
                   inputs,
                   global_seed,
                   dtype=tf.float32,
                   domain_id_index=0):
  """Applies frequency masking with given degree to inputs.

  Args:
    inputs: Batch of input features of shape (batch_size, time_length,
      num_freq, channels).
    global_seed: an integer seed tensor for stateless random ops.
    dtype: Data type.
    domain_id_index: domain id index.

  Returns:
    Inputs with random frequency masking applied.
  """
  p = self.params

  # Mask parameters.
  freq_mask_max_bins = p.freq_mask_max_bins[domain_id_index]
  multiplicity = p.freq_mask_count[domain_id_index]

  # If masking length or count is zero, do nothing.
  if freq_mask_max_bins == 0 or multiplicity == 0:
    return inputs

  # Arguments to pass to mask generator.
  batch_size, _, num_freq, _ = py_utils.GetShape(inputs)
  choose_range = tf.cast(
      tf.broadcast_to(num_freq, (batch_size,)), dtype=tf.int32)
  # Create masks in frequency direction and apply.
  block_arrays = self._GetMask(
      tf.shape(inputs)[0],
      choose_range=choose_range,
      mask_size=num_freq,
      global_seed=global_seed,
      max_length=freq_mask_max_bins,
      masks_per_frame=0.0,
      multiplicity=multiplicity,
      dtype=dtype,
      max_ratio=1.0)
  outputs = tf.einsum('bxyc,by->bxyc', inputs, block_arrays)
  return outputs
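# A minimal standalone sketch of the masking einsum above (plain TF2 eager,
# no lingvo dependencies; the keep-mask values below are invented for
# illustration). block_arrays is a per-example 0/1 keep-mask over frequency
# bins, broadcast across time and channels by the 'bxyc,by->bxyc' contraction.
import tensorflow as tf

demo_inputs = tf.ones([1, 2, 4, 1])          # [batch, time, freq, channels]
demo_keep = tf.constant([[1., 0., 0., 1.]])  # bins 1-2 masked out
demo_masked = tf.einsum('bxyc,by->bxyc', demo_inputs, demo_keep)
# demo_masked[0, :, :, 0] == [[1., 0., 0., 1.],
#                             [1., 0., 0., 1.]]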
def SequenceTrimLastToken(x, x_paddings):
  """Trims the last token off of sequence `x`, and sets trimmed elements to 0.

  Args:
    x: A sequence of tokens of shape [batch_size, x_len_max].
    x_paddings: The paddings of `x`.

  Returns:
    A tuple.
      - The new sequence, Tensor of shape [batch_size, x_len_max].
      - The new paddings, Tensor of shape [batch_size, x_len_max].
  """
  x_len = tf.reduce_sum(1 - x_paddings, 1)
  x_len_max = py_utils.GetShape(x)[1]
  x_trimmed_len = tf.maximum(x_len - 1, 0)
  x_trimmed_paddings = tf.sequence_mask(x_trimmed_len, x_len_max,
                                        x_paddings.dtype)
  x_trimmed = x * tf.cast(x_trimmed_paddings, x.dtype)
  return x_trimmed, 1 - x_trimmed_paddings
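# Hypothetical usage sketch for SequenceTrimLastToken (assumes TF2 eager mode
# and the lingvo py_utils dependency above; values invented for illustration).
import tensorflow as tf

demo_x = tf.constant([[7, 8, 9, 5], [3, 4, 5, 0]], dtype=tf.int32)
demo_paddings = tf.constant([[0., 0., 0., 0.], [0., 0., 0., 1.]])
trimmed, trimmed_paddings = SequenceTrimLastToken(demo_x, demo_paddings)
# trimmed          == [[7, 8, 9, 0], [3, 4, 0, 0]]
# trimmed_paddings == [[0., 0., 0., 1.], [0., 0., 1., 1.]]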
def _Slice(tensor):
  """Return a slice of this tensor at time=state0.t."""
  shape = py_utils.GetShape(tensor)
  # All zeros except for t in the time dimension.
  # e.g. if params.axis=1, begin is [0, t, 0, 0, 0, ...]
  begin = tf.one_hot(self.params.axis, tf.rank(tensor), on_value=state0.t)
  # Same as shape, but with a 1 in the time dimension.
  # e.g. if params.axis=1, shape is [shape[0], 1, shape[2], shape[3], ...]
  size = tf.concat([
      shape[0:self.params.axis],
      tf.constant([1], dtype=tf.int32),
      shape[self.params.axis + 1:]
  ], axis=0)
  # Make a slice where the time dimension is fixed at state0.t.
  time_slice = tf.slice(tensor, begin, size)
  # Remove the time dimension.
  return tf.squeeze(time_slice, axis=self.params.axis)
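# Standalone sketch of the begin/size construction above, for axis=1 and t=3
# on a [2, 5, 4] tensor (plain TF2 eager; mirrors the closure without
# self.params/state0, names invented for illustration).
import tensorflow as tf

demo_tensor = tf.zeros([2, 5, 4])
demo_axis, demo_t = 1, 3
demo_begin = tf.one_hot(demo_axis, tf.rank(demo_tensor), on_value=demo_t)
# demo_begin == [0, 3, 0]
demo_size = tf.concat(
    [tf.shape(demo_tensor)[:demo_axis], [1],
     tf.shape(demo_tensor)[demo_axis + 1:]], axis=0)
# demo_size == [2, 1, 4]
demo_slice = tf.squeeze(
    tf.slice(demo_tensor, demo_begin, demo_size), axis=demo_axis)
# demo_slice.shape == [2, 4]: the time dimension is gone.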
def RelShift(x):
  """Performs relative shift on a 4D tensor (first 2 axes are batching dims).

  Given input of shape [?, ?, W, W], this does "relative shifting" for the
  last two dims, s.t. output[b, n, i, j] = 0 if i > j else input[b, n, i, j-i]

  Args:
    x: A Tensor of shape [?, ?, W, W]

  Returns:
    A Tensor of the same shape as input with its content shifted (as described
    above).
  """
  b, n, w, _ = py_utils.GetShape(x)
  x = py_utils.HasShape(x, [-1, -1, w, w])
  x = tf.pad(x, ((0, 0), (0, 0), (0, 0), (0, 1)))
  x = tf.reshape(x, [b, n, w + 1, w])
  x = x[:, :, :w, :]
  return x
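# A standalone numeric sketch of the pad-reshape-slice trick above (plain TF2
# eager, no lingvo). Note that, for W >= 3, entries strictly below the first
# subdiagonal come out as wrapped values rather than exact zeros; callers are
# expected to mask the lower triangle (e.g. via the reverses in
# _RelPositionBiasCausal above).
import tensorflow as tf

w = 3
# demo[0, 0, i, j] == j, so shifts are easy to read off.
demo = tf.broadcast_to(tf.range(w, dtype=tf.float32), [1, 1, w, w])
demo = tf.pad(demo, ((0, 0), (0, 0), (0, 0), (0, 1)))  # [1, 1, W, W+1]
demo = tf.reshape(demo, [1, 1, w + 1, w])
demo = demo[:, :, :w, :]
# demo[0, 0] == [[0., 1., 2.],
#                [0., 0., 1.],   row i holds input[i, j - i] for j >= i
#                [2., 0., 0.]]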
def IsSpecialExample(task_ids, special_task_ids):
  """A utility function indicating whether inputs belong to specific tasks.

  Args:
    task_ids: Task ids for the input batch. Tensor of shape [batch].
    special_task_ids: A list of specified task ids.

  Returns:
    A tensor of size [batch] indicating whether each sample in the batch
    belongs to one of the specified tasks.
  """
  batch_size = py_utils.GetShape(task_ids)[0]
  return tf.reduce_any(
      tf.equal(
          tf.expand_dims(task_ids, -1),
          tf.cast(
              tf.broadcast_to(special_task_ids,
                              [batch_size, len(special_task_ids)]),
              tf.int32)), -1)
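# Hypothetical usage sketch (assumes TF2 eager mode and the lingvo py_utils
# dependency above; ids invented for illustration). Each example is checked
# for membership in the special-id set.
import tensorflow as tf

demo_task_ids = tf.constant([0, 3, 7, 3], dtype=tf.int32)
demo_special = IsSpecialExample(demo_task_ids, special_task_ids=[3, 7])
# demo_special == [False, True, True, True]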
def StreamStep(self, theta, inputs, paddings, state0):
  """Applies a single step of convolution to input_tensor.

  Only supports 1d causal convolution. Doesn't support dilation.

  Args:
    theta: A NestedMap of layer params.
    inputs: A Tensor of shape [b, t, 1, c].
    paddings: A 0/1 valued tensor of shape [b, t].
    state0: A NestedMap of tensors of the same struct as returned by
      zero_state().

  Returns:
    outputs: A Tensor of shape [b, t, 1, c * channel_multiplier].
    padding: the same as input paddings.
    state1: A NestedMap of the same struct as input state.
  """
  p = self.params
  assert p.filter_shape[1] == 1, (
      'StreamStep only supports 1d causal convolution.')
  assert p.filter_stride[0] == 1, "StreamStep doesn't support striding."
  assert p.dilation_rate == (1, 1), "StreamStep doesn't support dilation."

  with tf.name_scope(p.name):
    inputs = py_utils.HasShape(inputs, [-1, -1, 1, p.filter_shape[2]])
    paddings = py_utils.HasShape(paddings, py_utils.GetShape(inputs)[:2])

    concat_inputs = tf.concat(
        [state0.context, inputs * (1 - py_utils.AppendDims(paddings, 2))],
        axis=1)
    outputs = tf.nn.depthwise_conv2d(
        concat_inputs,
        self._GetWeight(theta),
        strides=(1, 1, 1, 1),
        dilations=(1, 1),
        data_format='NHWC',
        padding='VALID')
    new_context = concat_inputs[:, -(p.filter_shape[0] - 1):]
    return outputs, paddings, py_utils.NestedMap(context=new_context)
def _InitBeamSearchStateCallback(self,
                                 theta,
                                 source_encs,
                                 source_paddings,
                                 num_hyps_per_beam,
                                 additional_source_info=None):
  """Returns initial beam search states.

  Args:
    theta: A `.NestedMap` of layer weights.
    source_encs: A tensor of shape [src_len, src_batch, source_dim].
    source_paddings: A tensor of shape [src_len, src_batch].
    num_hyps_per_beam: An int, number of hyps to keep per source sentence.
    additional_source_info: a `.NestedMap` of tensors containing extra context
      information about the source that may be useful for decoding.

  Returns:
    A tuple (initial_results, states).
      initial_results: a `.NestedMap` of initial results.
        atten_probs: The initial attention probs, of shape
          [tgt_batch, src_len].
      states: a `.NestedMap` of initial model states.
        rnn_states: Initial state of the RNN.
        atten_context: Initial attention context vector.
        atten_states: Initial attention state.
  """
  # additional_source_info is currently not used.
  del additional_source_info
  num_beams = py_utils.GetShape(source_encs)[1]
  num_hyps = num_beams * num_hyps_per_beam
  rnn_states, init_atten_context, atten_probs, atten_states = (
      self._InitDecoder(theta, source_encs, source_paddings, num_hyps))

  initial_results = py_utils.NestedMap({'atten_probs': atten_probs})

  return initial_results, py_utils.NestedMap({
      'rnn_states': rnn_states,
      'atten_context': init_atten_context,
      'atten_probs': atten_probs,
      'atten_states': atten_states,
  })
def _InitBeamSearchStateCallback(self, theta, encoder_outputs,
                                 num_hyps_per_beam):
  """Returns initial beam search states.

  Args:
    theta: a NestedMap of parameters.
    encoder_outputs: a NestedMap computed by encoder.
    num_hyps_per_beam: An int, number of hyps to keep per source sentence.

  Returns:
    A tuple (initial_results, states).
      initial_results: a `.NestedMap` of initial results.
        atten_probs: The initial attention probs, of shape
          [tgt_batch, src_len].
      states: a `.NestedMap` of initial model states.
        rnn_states: Initial state of the RNN.
        atten_context: Initial attention context vector.
        atten_states: Initial attention state.
  """
  p = self.params
  num_beams = py_utils.GetShape(encoder_outputs.padding)[1]
  num_hyps = num_beams * num_hyps_per_beam
  rnn_states, init_atten_context, atten_probs, atten_states = (
      self._InitDecoder(theta, encoder_outputs, num_hyps))

  initial_results = py_utils.NestedMap(
      log_probs=tf.zeros([num_hyps, p.softmax.num_classes],
                         dtype=py_utils.FPropDtype(p)),
      atten_probs=atten_probs)

  return initial_results, py_utils.NestedMap({
      'rnn_states': rnn_states,
      'atten_context': init_atten_context,
      'atten_probs': atten_probs,
      'atten_states': atten_states,
  })
def _Normalize(self, theta, grouped_inputs, group_mean, group_variance):
  p = self.params
  group_mean = py_utils.CheckNumerics(
      group_mean, f'mean of {p.name} failed numeric check.')
  group_variance = py_utils.CheckNumerics(
      group_variance, f'variance of {p.name} failed numeric check.')

  input_shape = py_utils.GetShape(grouped_inputs)
  moment_shape = list(input_shape)
  if p.input_rank == 4:
    moment_shape[2] = 1
    moment_shape[-1] = 1
  else:
    moment_shape[-1] = 1
  if not p.cumulative:
    # If not cumulative, the seqlen dimension is also reduced.
    moment_shape[1] = 1

  group_mean = py_utils.HasShape(group_mean, moment_shape)
  group_variance = py_utils.HasShape(group_variance, moment_shape)
  group_variance = py_utils.with_dependencies([
      py_utils.assert_greater_equal(group_variance,
                                    tf.cast(0, group_variance.dtype))
  ], group_variance)

  if group_variance.dtype == tf.bfloat16:
    # tf.rsqrt is not implemented for bfloat16, hence we always cast into
    # tf.float32.
    group_stddev_inv = tf.cast(
        tf.math.rsqrt(tf.cast(group_variance + self._epsilon, tf.float32)),
        group_mean.dtype)
  else:
    group_stddev_inv = tf.math.rsqrt(group_variance + self._epsilon)

  grouped_inputs = (grouped_inputs - group_mean) * group_stddev_inv
  # Merges the last two dims.
  grouped_inputs = tf.reshape(grouped_inputs, input_shape[:-2] + [-1])

  # Note: the real gamma to use is 1 + gamma.
  outputs = grouped_inputs * (theta.gamma + 1) + theta.beta
  return outputs
def _PaddedMeanFn(inp):
  """Apply padded mean using reduce_sum and dividing by # real points."""
  # Replace all padded features with 0 by masking the padded features out.
  mask = 1 - inp.padding
  features = inp.features * mask[..., tf.newaxis]
  features = tf.reduce_sum(features, axis=-2)
  num_real_points = tf.reduce_sum(mask, axis=-1, keep_dims=True)
  # If a batch of points is all padded, num_real_points will be zero;
  # remember that before clamping the divisor below.
  all_padded = tf.equal(num_real_points, 0.)
  # Prevent the divisor of our padded mean from ever being 0, so that
  # the gradient flowing back through this op doesn't give us NaNs.
  num_real_points = tf.maximum(num_real_points, 1)
  features = features / num_real_points

  # Replace features of all padded points by zeros, so that we don't get any
  # downstream issue with NaNs. Note that inf * 0 = NaN.
  all_padded = tf.broadcast_to(all_padded, py_utils.GetShape(features))
  features = tf.where(all_padded, tf.zeros_like(features), features)
  return py_utils.CheckNumerics(features)
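# A standalone numeric sketch of the padded mean above (plain TF2 eager, no
# lingvo; values invented for illustration): two points, the second padded
# out, so the mean should use only the first point's features.
import tensorflow as tf

demo_features = tf.constant([[[2., 4.], [100., 100.]]])  # [1, points=2, 2]
demo_padding = tf.constant([[0., 1.]])                   # 1.0 == padded
demo_mask = 1. - demo_padding
demo_sum = tf.reduce_sum(demo_features * demo_mask[..., tf.newaxis], axis=-2)
demo_count = tf.maximum(
    tf.reduce_sum(demo_mask, axis=-1, keepdims=True), 1.)
# demo_sum / demo_count == [[2., 4.]]: the padded point is excluded.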
def FProp(self, theta, inputs):
  """Apply dropout to inputs.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor.

  Returns:
    inputs with dropout applied at training time.
  """
  p = self.params
  if p.keep_prob >= 1.0 or p.is_eval:
    return inputs

  with tf.name_scope(p.name):
    mb_tensor = gpipe.GetOverWriteGlobalStep()
    if p.burn_in_steps > 0:
      current_step = tf.cast(mb_tensor // p.num_micro_batches, inputs.dtype)
      current_ratio = current_step / tf.cast(p.burn_in_steps, inputs.dtype)
      current_ratio = tf.minimum(tf.cast(1.0, inputs.dtype), current_ratio)
      keep_prob = (1 - current_ratio * (1 - p.keep_prob))
    else:
      keep_prob = tf.cast(p.keep_prob, inputs.dtype)

    seeds = gpipe.GenerateStepSeedPair(p)
    noise_shape = py_utils.GetShape(inputs)
    if p.noise_shape_dim and p.noise_shape_dim < inputs.shape.ndims:
      for d in range(inputs.shape.ndims):
        if d != p.noise_shape_dim:
          noise_shape[d] = 1
    random_tensor = (tf.cast(keep_prob, tf.float32) +
                     tf.contrib.stateless.stateless_random_uniform(
                         noise_shape, seed=seeds, dtype=tf.float32))
    binary_tensor = tf.cast(tf.floor(random_tensor), inputs.dtype)
    ret = tf.div(inputs, keep_prob) * binary_tensor
    ret.set_shape(inputs.get_shape())
    return ret
def ComputeConvOutputPadding(paddings,
                             window,
                             stride,
                             padding_algorithm='SAME',
                             v2_padding=False):
  """Computes paddings for convolution and pooling output.

  WARNING: This implementation is buggy; prefer using
  ComputeConvOutputPaddingV2.

  out_padding[i] == 1 iff any in_padding corresponding to that output is 1.

  Args:
    paddings: The paddings tensor. It is expected to be of shape
      [batch, time].
    window: The size of the windows.
    stride: The time-stride between adjacent windows.
    padding_algorithm: 'SAME' or 'VALID'.
    v2_padding: Prefer setting to True. The default implementation is buggy
      for strided convolutions.

  Returns:
    out_padding, the new padding tensor of size [batch, ceil(time / stride)].
  """
  if v2_padding:
    return _ComputeConvOutputPaddingV2(paddings, window, stride,
                                       padding_algorithm)

  if stride == 1:
    return paddings

  # Pad so input_length divides stride.
  input_length = py_utils.GetShape(paddings)[1]
  pad_len = (input_length + stride - 1) // stride * stride - input_length
  paddings = tf.pad(paddings, [[0, 0], [0, pad_len]], constant_values=1.0)
  out_padding = tf.nn.pool(
      tf.expand_dims(paddings, -1),
      [window],
      'MAX',
      padding=padding_algorithm,
      strides=[stride],
  )
  return tf.squeeze(out_padding, -1)
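# Hedged sketch of the (buggy) v1 path above with stride 2 (assumes the
# lingvo dependencies above; values invented for illustration). The length-5
# padding row is right-padded to length 6 with 1.0, then max-pooled with
# window=2, stride=2.
import tensorflow as tf

demo_paddings = tf.constant([[0., 0., 0., 1., 1.]])
demo_out = ComputeConvOutputPadding(demo_paddings, window=2, stride=2)
# demo_out == [[0., 1., 1.]]: an output frame is padded whenever its window
# touches any padded (or appended) input frame.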
def _GetBetaGamma(self, theta, inputs, **kwargs):
  assert 'class_emb' in kwargs
  class_emb = kwargs['class_emb']

  # class_emb is a one-hot vector of shape [batch, class_emb_dim=num_classes].
  class_ids = tf.math.argmax(class_emb, axis=-1, output_type=tf.int32)
  # [batch, dim]
  # Not using matmul/einsum to avoid potential precision problem on TPU with
  # sparse inputs.
  beta = tf.gather(theta.beta, class_ids)
  gamma = tf.gather(theta.gamma, class_ids)

  # Extend to [batch, 1, ... 1, dim]
  batch = py_utils.GetShape(inputs)[0]
  to_shape = tf.concat(
      [[batch],
       tf.ones([py_utils.GetRank(inputs) - 2], tf.int32),
       [self.params.dim]],
      axis=0)
  beta = tf.reshape(beta, to_shape)
  gamma = tf.reshape(gamma, to_shape)
  return beta, gamma
def PrepareSequenceForPlot(tensor, padding, name):
  """Prepares a sequence feature for plotting.

  The sequence feature is transposed and channels are flattened.

  Args:
    tensor: An n-D Tensor of shape [batch, time, ...].
    padding: A Tensor of shape [batch, time].
    name: A string as the name of the reshaped Tensor, which will be used as
      the subcaption for plotting.

  Returns:
    A tuple of:
      reshaped_tensor: A 3-D Tensor of shape [batch, dim, time].
      sequence_length: A 1-D Tensor of shape [batch].
  """
  # Flatten any dimensions beyond the third into the third.
  batch_size, max_len = py_utils.GetShape(tensor, 2)
  plot_tensor = tf.reshape(tensor, [batch_size, max_len, -1])
  plot_tensor = tf.transpose(plot_tensor, [0, 2, 1], name=name)
  return (plot_tensor, SequenceLength(padding))
def _Extract(self, features):
  """Returns the laser Tensor."""
  p = self.params
  ret = super()._Extract(features)

  all_vxyz = []
  all_classes = []
  for lidar in p.lidar_names:
    for ri in p.lidar_returns:
      feature_name = 'laser_%s_%s' % (lidar, ri)
      laser_data = tf.reshape(
          _Dense(features[feature_name]), [-1, 3 + p.num_features])
      num = py_utils.GetShape(laser_data)[0]
      # We expect lidar_$lidar_$ri and lidar_$lidar_$ri_flow to have the same
      # number of points.
      feature_name += '_flow'
      laser_data = tf.reshape(_Dense(features[feature_name]), [num, 3 + 1])
      points_vxyz = laser_data[..., 0:3]
      points_classes = laser_data[..., 3]

      all_vxyz += [points_vxyz]
      all_classes += [points_classes]

  # Stack all of the points along the major dimension.
  points_vxyz = tf.concat(all_vxyz, axis=0)
  points_class = tf.concat(all_classes, axis=0)

  # The precomputed class uses -1 to mean 5 in our current code.
  points_class = tf.where(
      tf.less(points_class, 0), 5. * tf.ones_like(points_class), points_class)

  if p.max_num_points is not None:
    assert 'points_padding' in ret
    points_vxyz = py_utils.PadOrTrimTo(points_vxyz, [p.max_num_points, 3])
    points_class = py_utils.PadOrTrimTo(points_class, [p.max_num_points])

  assert 'points_xyz' in ret
  ret.world_flow = points_vxyz
  ret.pointwise_class = tf.cast(points_class, tf.int32)
  return ret
def BBoxCorners2D(bboxes):
  """Extract the corner points from a 5-DOF bbox representation.

  Args:
    bboxes: A [..., 5] floating point bounding box representation
      ([x, y, dx, dy, phi]).

  Returns:
    A [..., 4, 2] floating point Tensor containing the corner (x, y) points
    for every bounding box.
  """
  corners = tf.constant([
      [0.5, 0.5],
      [-0.5, 0.5],
      [-0.5, -0.5],
      [0.5, -0.5],
  ])

  leading_shape = py_utils.GetShape(bboxes)[:-1]

  # Extract location, dimension, and rotation.
  location = bboxes[..., :2]
  dimensions = bboxes[..., 2:4]
  phi_world = bboxes[..., 4]

  # Convert rotation_phis into rotation matrices along unit z.
  cos = tf.cos(phi_world)
  sin = tf.sin(phi_world)
  rotations_world = tf.reshape(
      tf.stack([cos, -sin, sin, cos], axis=-1), leading_shape + [2, 2])

  # Create axis-aligned corners from length/width/height.
  corners = tf.einsum('...i,ji->...ji', dimensions, corners)

  # Rotate the corners coordinates to the rotated world frame.
  corners = tf.einsum('...ij,...kj->...ki', rotations_world, corners)

  # Translate corners to the world location.
  corners = corners + tf.reshape(location, leading_shape + [1, 2])
  return corners
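# Hypothetical usage sketch (assumes the lingvo dependencies above; numbers
# invented for illustration): a 4 x 2 box centered at (1, 1) with no
# rotation.
import tensorflow as tf

demo_bboxes = tf.constant([[1., 1., 4., 2., 0.]])  # [x, y, dx, dy, phi]
demo_corners = BBoxCorners2D(demo_bboxes)
# demo_corners[0] == [[3., 2.], [-1., 2.], [-1., 0.], [3., 0.]]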
def _BBoxesAndLogits(self, input_batch):
  """Decode an input batch, computing predicted bboxes from residuals."""
  _, per_example_dict = self.FPropTower(self.theta, input_batch)

  # Decode residuals.
  predicted_bboxes = self._utils.ResidualsToBBoxes(
      input_batch.anchor_bboxes, per_example_dict['residuals'])

  # predicted_bboxes is a [batch, nx, ny, nz, na, 7] Tensor.
  batch_size, nx, ny, nz, na, _ = py_utils.GetShape(predicted_bboxes, 6)
  num_boxes = nx * ny * nz * na

  # Reshape to [batch_size, num_boxes, 7].
  predicted_bboxes = tf.reshape(predicted_bboxes, [batch_size, num_boxes, 7])

  classification_logits = tf.reshape(
      per_example_dict['classification_logits'], [batch_size, num_boxes, -1])

  return py_utils.NestedMap({
      'predicted_bboxes': predicted_bboxes,
      'classification_logits': classification_logits
  })
def _FrequencyWarp(self,
                   inputs,
                   global_seed,
                   dtype=tf.float32,
                   domain_id_index=0):
  """Applies frequency warping with given degree to inputs.

  Args:
    inputs: Batch of input features of shape (batch_size, time_length,
      num_freq, channels).
    global_seed: an integer seed tensor for stateless random ops.
    dtype: Data type.
    domain_id_index: Domain ID index.

  Returns:
    Inputs with random frequency warping applied.
  """
  p = self.params
  batch_size, _, num_freq, _ = py_utils.GetShape(inputs)

  # Get parameters for warping.
  freq_warp_max_bins = p.freq_warp_max_bins[domain_id_index]

  # If maximum warp length is zero, do nothing.
  if freq_warp_max_bins == 0:
    return inputs
  choose_range = tf.ones((batch_size,), dtype=tf.int32) * num_freq

  # Create warping matrix in frequency direction and apply.
  warp_matrix = self._GetWarpMatrix(
      batch_size,
      choose_range=choose_range,
      matrix_size=num_freq,
      global_seed=global_seed,
      max_warp_frames=freq_warp_max_bins,
      dtype=dtype)

  return self.EinsumBxycBzyBxzc(
      inputs, warp_matrix, name='einsum_forfreqwarping')
def testMoEFPropDynamicShapes(self):
  """Tests that MoEBuilder.MoE() supports dynamic shapes.

  Without dynamic-shape support in MoE(), this test fails.
  """
  batch_dim = 2
  length_dim = 4
  input_dim = 4
  builder = gshard_builder.MoEBuilder.Params().Set(
      model_dim=input_dim, num_devices=2, moe_hidden_dim=16, e_dim=2, c_dim=2)
  p = builder.Instantiate().MoE('moe')
  with self.session(graph=tf.Graph()) as sess:
    tf.random.set_seed(2019)
    # We will reduce the length_dim by 2 dynamically.
    layer = p.Instantiate()
    inputs, segment_ids, segment_pos = self._CreateDynamicShapeInputs(
        batch_dim, length_dim, input_dim)
    # Verify that the length dimension shape is dynamic (a Tensor).
    self.assertIsInstance(py_utils.GetShape(inputs)[1], tf.Tensor)
    out, aux_loss = layer.FPropDefaultTheta(inputs, segment_ids, segment_pos)
    sess.run(tf.global_variables_initializer())
    _ = sess.run([out, aux_loss])
def _PadAndReshapeSpec(self, mel_spectrogram, mel_spectrogram_paddings):
  p = self.params
  batch_size = py_utils.GetShape(mel_spectrogram)[0]
  # Stack and sub-sample. Only subsampling with a stride of the stack size
  # is supported.
  if p.stack_left_context > 0:
    # Since left context is leading, pad the left by duplicating the first
    # frame.
    stack_size = 1 + p.stack_left_context
    mel_spectrogram = tf.concat(
        [mel_spectrogram[:, 0:1, :]] * p.stack_left_context +
        [mel_spectrogram],
        axis=1)
    mel_spectrogram_paddings = tf.concat(
        [mel_spectrogram_paddings[:, 0:1]] * p.stack_left_context +
        [mel_spectrogram_paddings],
        axis=1)
    # Note that this is the maximum number of frames. Actual frame count
    # depends on padding.
    stacked_frame_dim = tf.shape(mel_spectrogram)[1] // stack_size
    mel_spectrogram = tf.reshape(
        mel_spectrogram[:, 0:stack_size * stacked_frame_dim, :],
        [batch_size, stacked_frame_dim, stack_size * p.num_bins])
    # After stacking paddings, pad if any source frame was padded.
    # Stacks into [batch_size, stacked_frame_dim, stack_size] like the
    # spectrogram stacking above, and then reduces the stack_size dim to the
    # max (effectively, making padding = 1.0 if any of the pre-stacked frames
    # were 1.0). Final shape is [batch_size, stacked_frame_dim].
    mel_spectrogram_paddings = tf.reshape(
        mel_spectrogram_paddings[:, 0:stack_size * stacked_frame_dim],
        [batch_size, stacked_frame_dim, stack_size])
    mel_spectrogram_paddings = tf.reduce_max(mel_spectrogram_paddings, axis=2)

  # Add feature dim. Shape = [batch, time, features, 1]
  mel_spectrogram = tf.expand_dims(mel_spectrogram, -1)
  return mel_spectrogram, mel_spectrogram_paddings
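# A standalone numeric sketch of the stacking above for stack_left_context=2
# (plain TF2 eager, no lingvo; shapes invented for illustration): the first
# frame is duplicated twice on the left, then every stack_size=3 consecutive
# frames merge into one stacked frame.
import tensorflow as tf

demo_spec = tf.reshape(tf.range(8, dtype=tf.float32), [1, 4, 2])
demo_first = demo_spec[:, 0:1, :]
demo_padded = tf.concat([demo_first, demo_first, demo_spec], axis=1)
# demo_padded has 6 frames; 6 // 3 == 2 stacked frames of 3 * 2 features.
demo_stacked = tf.reshape(demo_padded[:, :6, :], [1, 2, 6])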
def FProp(self, theta, inputs):
  """Apply projection to inputs.

  Args:
    theta: A NestedMap object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor. Shaped [..., input_dims].

  Returns:
    Projected inputs.
  """
  p = self.params
  with tf.name_scope(p.name):
    computation_cost.Add(
        self, 'flops',
        tf.reduce_prod(tf.cast(tf.shape(inputs)[:-1], tf.int64)) * tf.cast(
            symbolic.EvalExpr(symbolic.TENSOR_VALUES,
                              p.input_dims * p.output_dims), tf.int64) * 2)
    use_tpu = py_utils.use_tpu()
    shape = inputs.shape
    if use_tpu and (shape is not None and shape.rank is not None and
                    shape.rank < 26):
      # Avoids reshape if feasible and uses Einsum.
      if shape.rank == 2:
        return tf.matmul(inputs, theta.w)
      else:
        s = ''.join([chr(x) for x in range(97, 123)])  # abc...xyz
        r = shape.rank
        return tf.einsum('{0}y,yz->{0}z'.format(s[:r - 1]), inputs, theta.w)

    input_dim = py_utils.GetShape(inputs)[-1]
    act = tf.matmul(tf.reshape(inputs, [-1, input_dim]), theta.w)
    output_dim = tf.shape(theta.w)[-1]
    act = tf.reshape(
        act, tf.concat([tf.shape(inputs)[:-1], [output_dim]], axis=0))
    return act
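# Sketch of the einsum equation generated above for a rank-3 input: with
# r == 3, s[:r - 1] == 'ab', so the equation is 'aby,yz->abz', a batched
# matmul of [a, b, y] inputs against the [y, z] weight (plain TF2 eager;
# shapes invented for illustration).
import tensorflow as tf

demo_inputs = tf.zeros([2, 3, 4])
demo_w = tf.zeros([4, 5])
demo_out = tf.einsum('aby,yz->abz', demo_inputs, demo_w)
# demo_out.shape == [2, 3, 5]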
def FProp(self, theta, inputs, paddings, class_emb):
  """Apply batch normalization.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor. Shaped [batch, ..., dim].
    paddings: The paddings tensor. Shaped [batch, ..., 1], with the same rank
      as the input tensor.
    class_emb: The conditioning inputs, shaped [batch, emb_dim].

  Returns:
    Output after applying batch normalization, with the same shape as
    'inputs'.
  """
  if py_utils.testonly_skip_norm_layers():
    return inputs

  p = self.params
  batch = py_utils.GetShape(inputs)[0]
  class_emb = py_utils.HasShape(class_emb, [batch, p.class_emb_dim])
  if not py_utils.use_tpu():
    class_emb = py_utils.with_dependencies([
        py_utils.assert_less_equal(
            tf.cast(class_emb, tf.int32), 1, name='one_hot_assert1'),
        py_utils.assert_greater_equal(
            tf.cast(class_emb, tf.int32), 0, name='one_hot_assert2'),
        py_utils.assert_equal(
            tf.ones([batch], tf.int32),
            tf.cast(tf.reduce_sum(class_emb, -1), tf.int32),
            name='one_hot_assert3'),
    ], class_emb)

  with tf.name_scope(p.name):
    norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments(
        theta, inputs, paddings=paddings, class_emb=class_emb)
    return self._ComputeBN(inputs, paddings, gamma, beta, norm_mean,
                           norm_variance)
def ConvertToBlocks(x, block_size, padding_val=0.0):
  """Turns a sequence into non-overlapping blocks.

  Args:
    x: a tensor of [batch, time, ...].
    block_size: int. Number of time frames in a block.
    padding_val: float. Value of the padded frames.

  Returns:
    A tensor of [batch, num_blocks, block_size, ...], with necessary paddings,
    where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
  """
  shape = py_utils.GetShape(x)
  b, t = shape[:2]
  if block_size < 1:
    raise ValueError(
        'block_size must be at least 1, got {}'.format(block_size))
  w = block_size
  # Pad t to be a multiple of w.
  num_blocks = (t + w - 1) // w
  pad_to_length = num_blocks * w
  padded = py_utils.PadSequenceDimension(x, pad_to_length, padding_val)
  reshaped = tf.reshape(padded, [b, num_blocks, w] + shape[2:])
  return reshaped
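# Hypothetical usage sketch (assumes the lingvo dependencies above; values
# invented for illustration): a length-5 sequence blocked into blocks of 2
# picks up one frame of padding.
import tensorflow as tf

demo_x = tf.reshape(tf.range(10, dtype=tf.float32), [1, 5, 2])
demo_blocks = ConvertToBlocks(demo_x, block_size=2, padding_val=0.)
# demo_blocks.shape == [1, 3, 2, 2]; the last frame of the last block is
# padding_val.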
def _CreateFrustumMask(self, bbox_corners_image, bbox2d_corners_image_clipped,
                       image_height, image_width):
  """Creates a box mask for boxes whose projections fall outside of image."""
  p = self.params
  batch_size, num_boxes = py_utils.GetShape(bbox_corners_image, 2)
  if not p.filter_predictions_outside_frustum:
    return tf.ones(shape=(batch_size, num_boxes), dtype=tf.float32)

  def _MinMax(bbox_corners):
    """Computes the min and max over corners."""
    bbox_min = tf.reduce_min(bbox_corners, axis=-1)
    bbox_max = tf.reduce_max(bbox_corners, axis=-1)
    bbox_min = py_utils.HasShape(bbox_min, [batch_size, num_boxes])
    bbox_max = py_utils.HasShape(bbox_max, [batch_size, num_boxes])
    return bbox_min, bbox_max

  bbox_min_x, bbox_max_x = _MinMax(bbox_corners_image[:, :, :, 0])
  bbox_min_y, bbox_max_y = _MinMax(bbox_corners_image[:, :, :, 1])

  # Compute the fraction of the clipped 2d image projection and the full 2d
  # image projection. We simply need to divide the area of each cropped box
  # by the area of the full box to get the overlap fraction.
  original_area = (bbox_max_x - bbox_min_x) * (bbox_max_y - bbox_min_y)
  bbox_clipped_x_min = bbox2d_corners_image_clipped[..., 0]
  bbox_clipped_y_min = bbox2d_corners_image_clipped[..., 1]
  bbox_clipped_x_max = bbox2d_corners_image_clipped[..., 2]
  bbox_clipped_y_max = bbox2d_corners_image_clipped[..., 3]
  clipped_area = (bbox_clipped_x_max - bbox_clipped_x_min) * (
      bbox_clipped_y_max - bbox_clipped_y_min)
  fraction = clipped_area / original_area
  frustum_mask = (fraction > p.truncation_threshold)
  frustum_mask = py_utils.HasShape(frustum_mask, [batch_size, num_boxes])
  frustum_mask = tf.cast(frustum_mask, tf.float32)
  return frustum_mask
def _ReshapeToMono2D(self, pcm_audio_data, paddings):
  """Reshapes a 3D or 4D input to 2D.

  Since the input to FProp can be 3D or 4D (see class comments), this will
  collapse it back to a 2D, mono shape for internal processing.

  Args:
    pcm_audio_data: 2D, 3D or 4D audio input. See class comments. Must have a
      rank.
    paddings: Original paddings shaped to the first two dims of
      pcm_audio_data.

  Returns:
    Tuple of 2D [batch_size, timestep] mono audio data, new paddings.
  """
  shape = py_utils.GetShape(pcm_audio_data)
  rank = len(shape)
  if rank == 2:
    return pcm_audio_data, paddings
  elif rank == 3:
    # [batch, time, channel]
    with tf.control_dependencies([tf.assert_equal(shape[2], 1)]):
      return tf.squeeze(pcm_audio_data, axis=2), paddings
  elif rank == 4:
    # [batch, time, packet, channel]
    batch_size, orig_time, orig_packet_size, channel = shape
    time = orig_time * orig_packet_size
    with tf.control_dependencies([tf.assert_equal(channel, 1)]):
      pcm_audio_data = tf.reshape(pcm_audio_data, (batch_size, time))
      # Transform paddings into the new time base with a padding per time
      # step vs per packet by duplicating each packet.
      paddings = tf.reshape(
          tf.tile(tf.expand_dims(paddings, axis=2), [1, 1, orig_packet_size]),
          (batch_size, time))
      return pcm_audio_data, paddings
  else:
    raise ValueError('Illegal pcm_audio_data shape')
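# A standalone sketch of the rank-4 padding expansion above (plain TF2 eager;
# values invented for illustration): each per-packet padding value is
# duplicated packet_size times so the new time base has one padding entry
# per sample.
import tensorflow as tf

demo_paddings = tf.constant([[0., 1.]])  # [batch=1, packets=2]
demo_packet_size = 3
demo_expanded = tf.reshape(
    tf.tile(tf.expand_dims(demo_paddings, axis=2), [1, 1, demo_packet_size]),
    (1, 6))
# demo_expanded == [[0., 0., 0., 1., 1., 1.]]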
def CornersToImagePlane(self, corners, velo_to_image_plane):
  """Project 3d box corners to the image plane.

  Args:
    corners: A [batch, num_boxes, 8, 3] floating point tensor containing the
      8 corner points for each 3d bounding box.
    velo_to_image_plane: A [batch, 3, 4] batch set of projection matrices from
      velo xyz to image plane xy. After multiplication, you need to divide by
      the last coordinate to recover 2D pixel locations.

  Returns:
    A [batch, num_boxes, 8, 2] floating point Tensor containing the 3D
    bounding box corners projected to the image plane.
  """
  batch_size, num_boxes, _, _ = py_utils.GetShape(corners, 4)

  def CornersToPlaneBody(args):
    """Body of function to convert each bounding box to the image plane."""
    (corners, velo_to_image_plane) = args
    # corners[i] is [num_boxes, 8, 3]: flatten the points in this batch and
    # do the conversion in one call.
    bbox_corners = tf.reshape(corners, [-1, 3])
    image_plane_corners = geometry.PointsToImagePlane(bbox_corners,
                                                      velo_to_image_plane)
    image_plane_corners = tf.reshape(image_plane_corners, [-1, 8, 2])
    return image_plane_corners

  corners_in_image_plane = tf.map_fn(
      fn=CornersToPlaneBody,
      elems=(corners, velo_to_image_plane),
      dtype=tf.float32,
      back_prop=False)

  corners_in_image_plane = py_utils.HasShape(corners_in_image_plane,
                                             [batch_size, num_boxes, 8, 2])
  return corners_in_image_plane