def _GetWarpMatrix(self,
                   batch_size,
                   choose_range,
                   matrix_size,
                   global_seed,
                   max_warp_frames=None,
                   dtype=tf.float32,
                   max_ratio=1.0):
  """Returns warp matrices starting from random positions.

  When max_warp_frames is truthy and positive:
    1) Sample integer warp displacements with minval=-max_warp_frames and
       maxval=max_warp_frames + 1 to yield a shift tensor of shape
       (batch_size,).
    2) Clip each shift to a magnitude of at most
       int(max_ratio * choose_range).
    3) Sample one origin point per example, shape (batch_size,), computed as
       1 + int(u * max(choose_range - 2, 0)) with u sampled with maxval=1.0.
    4) Return a batch of 1-D linear maps that fix the boundary points and
       move the origin point by the shift.

  Otherwise (max_warp_frames is None, zero or negative):
    1) Sample a ratio with minval=-1.0 and maxval=1.0 and scale it by
       int(max_ratio * choose_range) to yield the shift tensor of shape
       (batch_size,).
    2) Proceed through steps 2), 3) and 4) above.

  Args:
    batch_size: Batch size. Integer number.
    choose_range: Range within which the warp reference points must lie.
      Tensor of shape (batch_size,).
    matrix_size: Dimension of vector space warp matrix is applied to. Integer
      number.
    global_seed: an integer seed tensor for stateless random ops.
    max_warp_frames: Upper-bound on the warp distance. Integer or None.
    dtype: Data type.
    max_ratio: Maximum ratio between the shift distance and choose_range.
      Float number.

  Returns:
    warp_matrix: An array of fixed size warp matrices with shape
    (batch_size, matrix_size, matrix_size).
  """
  p = self.params

  # Non-empty random seed values are only used for testing or when using
  # stateless random ops. The shift seeds (seed_3 / seed_4) and the origin
  # seed (seed_5) are set separately to avoid correlation of warp magnitude
  # and origin position.
  if p.use_input_dependent_random_seed:
    seed_3 = global_seed + 3
    seed_4 = global_seed + 4
    seed_5 = global_seed + 5
  elif p.random_seed:
    seed_3 = p.random_seed - 1
    seed_4 = p.random_seed - 1
    seed_5 = 2 * p.random_seed + 1
  else:
    seed_3 = p.random_seed
    seed_4 = p.random_seed
    seed_5 = p.random_seed

  choose_range_dtype = tf.cast(choose_range, dtype=dtype)
  length_upper_bound = tf.cast(max_ratio * choose_range_dtype, dtype=tf.int32)

  # Set shift length.
  random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
  if max_warp_frames and max_warp_frames > 0:
    shift = random_uniform(
        shape=(batch_size,),
        minval=-1 * max_warp_frames,
        maxval=max_warp_frames + 1,
        dtype=tf.int32,
        seed=seed_3)
  else:
    random_ratio = random_uniform(
        shape=(batch_size,),
        minval=-1.0,
        maxval=1.0,
        dtype=dtype,
        seed=seed_4)
    shift = tf.cast(random_ratio * tf.cast(length_upper_bound, dtype=dtype),
                    tf.int32)

  # Make sure the sampled length is no larger than max_ratio * choose_range.
  # Note that sampling in this way is biased.
  # (Shorter sequences may be over-warped.)
  final_shift = tf.maximum(-length_upper_bound,
                           tf.minimum(shift, length_upper_bound))

  # Choose origin anchor point.
  # The int32 cast used to be overwritten by an expression built from the
  # raw `choose_range`, which made the cast dead code; derive mid_range from
  # the cast value so it is an integer count regardless of the input dtype.
  mid_range = tf.maximum(tf.cast(choose_range, dtype=tf.int32) - 2, 0)
  random_origin = random_uniform(shape=(batch_size,), maxval=1.0, seed=seed_5)
  origin_with_in_valid_range = random_origin * tf.cast(mid_range, dtype=dtype)
  origin = tf.cast(origin_with_in_valid_range, tf.int32) + 1

  # Set destination point of the origin anchor point under the warp map.
  destination = origin + final_shift

  # Cast origin and destination.
  origin = tf.cast(origin, dtype=dtype)
  destination = tf.cast(destination, dtype=dtype)

  return self._ConstructWarpMatrix(
      batch_size=batch_size,
      matrix_size=matrix_size,
      origin=origin,
      destination=destination,
      choose_range=choose_range_dtype,
      dtype=dtype)
def _XYZFromRangeImage(self,
                       lidar_image,
                       lidar_image_mask,
                       extrinsics,
                       inclinations,
                       pixel_pose=None,
                       frame_pose=None):
  """Extract the cartesian coordinates from the range image.

  Args:
    lidar_image: [H, W, C] range image Tensor.
    lidar_image_mask: [H, W] boolean indicating which 2d coordinates in the
      lidar image are present.
    extrinsics: [4, 4] float matrix representing transformation matrix to
      world coordinates.
    inclinations: [V] beam inclinations vector.
    pixel_pose: Optional [H, W, 4, 4] tensor representing per pixel pose of
      GBR (the original note gave [64, 2650, 4, 4]).
    frame_pose: [4, 4] matrix representing vehicle to world transformation.
      Required whenever pixel_pose is given.

  Returns:
    [H, W, 3] range image cartesian coordinates.

  Raises:
    ValueError: If pixel_pose is set but frame_pose is None.
  """
  # Fail fast: validate the argument combination before building any ops
  # (previously this was only detected after the per-pixel einsum was built).
  if pixel_pose is not None and frame_pose is None:
    raise ValueError('frame_pose must be set when pixel_pose is set.')

  height, width, channels = py_utils.GetShape(lidar_image, 3)

  conversion_dtype = tf.float32
  lidar_image = tf.cast(lidar_image, conversion_dtype)
  extrinsics = tf.cast(extrinsics, conversion_dtype)
  inclinations = tf.cast(inclinations, conversion_dtype)
  inclinations = tf.reverse(inclinations, axis=[-1])

  # Scalar yaw of the extrinsics rotation, subtracted from every azimuth.
  az_correction = py_utils.HasShape(
      tf.atan2(extrinsics[1, 0], extrinsics[0, 0]), [])
  # Column centers as fractions of the image width, in decreasing order.
  ratios = (tf.cast(tf.range(width, 0, -1), dtype=conversion_dtype) -
            .5) / tf.cast(width, conversion_dtype)
  ratios = py_utils.HasShape(ratios, [width])

  azimuth = (ratios * 2. - 1.) * np.pi - az_correction[..., tf.newaxis]
  azimuth = py_utils.HasShape(azimuth, [width])

  # Zero out every channel of pixels that are not present.
  lidar_image_mask = lidar_image_mask[..., tf.newaxis]
  lidar_image_mask = tf.tile(lidar_image_mask, [1, 1, channels])
  lidar_image = tf.where(lidar_image_mask, lidar_image,
                         tf.zeros_like(lidar_image))
  lidar_image_range = lidar_image[..., 0]

  azimuth = py_utils.HasShape(azimuth[tf.newaxis, ...], [1, width])
  inclinations = py_utils.HasShape(inclinations[..., tf.newaxis], [height, 1])

  cos_azimuth = tf.cos(azimuth)
  sin_azimuth = tf.sin(azimuth)
  cos_incl = tf.cos(inclinations)
  sin_incl = tf.sin(inclinations)

  # Spherical to cartesian conversion.
  x = cos_azimuth * cos_incl * lidar_image_range
  y = sin_azimuth * cos_incl * lidar_image_range
  z = sin_incl * lidar_image_range

  lidar_image_points = tf.stack([x, y, z], -1)
  lidar_image_points = py_utils.HasShape(lidar_image_points,
                                         [height, width, 3])
  rotation = extrinsics[0:3, 0:3]
  translation = extrinsics[0:3, 3][tf.newaxis, ...]

  # Transform the image points in cartesian coordinates to
  # the world coordinate system using the extrinsics matrix.
  #
  # We first flatten the points, apply rotation, then
  # reshape to restore the original input and then apply
  # translation.
  lidar_image_points = tf.matmul(
      tf.reshape(lidar_image_points, [-1, 3]), rotation, transpose_b=True)
  lidar_image_points = tf.reshape(lidar_image_points, [height, width, 3])
  lidar_image_points += translation

  lidar_image_points = py_utils.HasShape(lidar_image_points,
                                         [height, width, 3])

  # GBR uses per pixel pose.
  if pixel_pose is not None:
    pixel_pose_rotation = pixel_pose[..., 0:3, 0:3]
    pixel_pose_translation = pixel_pose[..., 0:3, 3]
    lidar_image_points = tf.einsum(
        'hwij,hwj->hwi', pixel_pose_rotation,
        lidar_image_points) + pixel_pose_translation

    # To vehicle frame corresponding to the given frame_pose
    # [4, 4]
    world_to_vehicle = tf.linalg.inv(frame_pose)
    world_to_vehicle_rotation = world_to_vehicle[0:3, 0:3]
    world_to_vehicle_translation = world_to_vehicle[0:3, 3]
    # [H, W, 3]
    lidar_image_points = tf.einsum(
        'ij,hwj->hwi', world_to_vehicle_rotation, lidar_image_points
    ) + world_to_vehicle_translation[tf.newaxis, tf.newaxis, :]

  return lidar_image_points
def _SingleClassDecodeWithNMS(predicted_bboxes,
                              classification_scores,
                              nms_iou_threshold,
                              score_threshold,
                              max_boxes_per_class=None):
  """Runs class-agnostic NMS and reshapes the result to a per-class layout.

  Args:
    predicted_bboxes: [batch_size, num_boxes, 7] float Tensor containing
      predicted bounding box coordinates.
    classification_scores: [batch_size, num_boxes, num_classes] float Tensor
      containing predicted classification scores for each box.
    nms_iou_threshold: Python float. IoU threshold to use when determining
      whether two boxes overlap for purposes of suppression.
    score_threshold: Python float. The score threshold passed to NMS that
      allows NMS to quickly ignore irrelevant boxes.
    max_boxes_per_class: The maximum number of boxes per example to emit. If
      None, this value is set to num_boxes from the shape of
      predicted_bboxes.

  Returns:
    A tuple (nms_indices, predicted_bboxes, bbox_scores, valid_mask):

    - nms_indices: Indices of the boxes selected after NMS, exactly as
      returned by BatchedNMSIndices (it is not tiled per class here).
    - predicted_bboxes: Filtered bboxes after NMS, tiled over classes, of
      shape [batch_size, num_classes, max_boxes_per_class, 7].
    - bbox_scores: The gathered classification scores of shape
      [batch_size, num_classes, max_boxes_per_class].
    - valid_mask: The NMS validity mask tiled over classes, of shape
      [batch_size, num_classes, max_boxes_per_class].

  Raises:
    ValueError: If either threshold is not a Python float.
  """
  utils_3d = detection_3d_lib.Utils3D()
  predicted_bboxes = py_utils.HasShape(predicted_bboxes, [-1, -1, 7])
  batch_size, num_predicted_boxes, _ = py_utils.GetShape(predicted_bboxes)
  classification_scores = py_utils.HasShape(
      classification_scores, [batch_size, num_predicted_boxes, -1])
  _, _, num_classes = py_utils.GetShape(classification_scores)

  if not isinstance(nms_iou_threshold, float):
    raise ValueError('Single class NMS only supports a scalar '
                     '`nms_iou_threshold`.')
  if not isinstance(score_threshold, float):
    raise ValueError('Single class NMS only supports a scalar '
                     '`score_threshold`.')

  if max_boxes_per_class is None:
    max_boxes_per_class = num_predicted_boxes

  # TODO(jngiam): Change to be per-class bboxes, and hence, per-class NMS, and
  # per-class thresholding.
  # [batch, num_predicted_boxes]
  best_class_score = tf.reduce_max(classification_scores, axis=-1)

  # The most likely label is the class with the highest score.
  best_class = tf.argmax(classification_scores, axis=-1)

  # Class 0 is treated as background: zero the NMS score of boxes whose most
  # likely class is background so that they do not dominate the NMS.
  is_foreground = tf.cast(best_class > 0, tf.float32)
  best_class_score *= is_foreground

  # Compute NMS for every sample in the batch.
  nms_indices, valid_mask = utils_3d.BatchedNMSIndices(
      predicted_bboxes,
      best_class_score,
      nms_iou_threshold=nms_iou_threshold,
      score_threshold=score_threshold,
      max_num_boxes=max_boxes_per_class)

  # Reorder the box data and scores according to the NMS selection.
  selected_bboxes = tf.array_ops.batch_gather(predicted_bboxes, nms_indices)
  selected_scores = tf.array_ops.batch_gather(classification_scores,
                                              nms_indices)

  # Match the output format of MultiClassOrientedDecodeWithNMS, whose outputs
  # all lead with [batch_size, num_classes, max_boxes_per_class]. Since this
  # NMS is not class specific, boxes and mask are tiled num_classes times and
  # the scores are transposed so that the class axis comes second.
  per_class_bboxes = tf.tile(selected_bboxes[:, tf.newaxis, :, :],
                             [1, num_classes, 1, 1])
  per_class_scores = tf.transpose(selected_scores, (0, 2, 1))
  per_class_scores = py_utils.HasShape(
      per_class_scores, [batch_size, num_classes, max_boxes_per_class])
  per_class_valid = tf.tile(valid_mask[:, tf.newaxis, :], [1, num_classes, 1])

  return nms_indices, per_class_bboxes, per_class_scores, per_class_valid
def FProp(self, theta, input_batch):
  """Embeds source ids and transforms with TransformerStack.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    input_batch: A `.NestedMap` object containing:
      ids - The inputs tensor of shape [batch, time].
      paddings - The ids' paddings of shape [batch, time].
      segment_ids, segment_pos - Only read when p.packed_input is set.

  Returns:
    A '.NestedMap' object containing:
      encoded - The encoded features of shape [time, batch, dim] or [batch,
        time, dim], depending p.output_data_format.
      padding - The encoded features' padding of shape [time, batch] or
        [batch, time].
      seq_lengths - int32 sum of (1 - padding) over axis 1, computed before
        any transposition; used by beam_search_helper.
      segment_id - The segmentation of packed inputs of shape [time, batch] or
        [batch, time] if it is supported by the model, or None otherwise.
      embedded_inputs - The embedded inputs tokens without positional
        encodings of shape [time, batch, dim] or [batch, time, dim].
  """
  p = self.params
  with tf.name_scope(p.name):
    # All of the following are [batch, time].
    token_ids = input_batch.ids
    paddings = input_batch.paddings
    segment_ids = input_batch.segment_ids if p.packed_input else None

    batch = py_utils.GetShape(token_ids)[0]
    time = py_utils.GetShape(token_ids)[1]

    # Embedding layer; the softmax layer doubles as the embedding table when
    # embeddings are shared.
    # [batch, time, dim]
    if p.shared_emb:
      token_embs = self.softmax.EmbLookup(theta.softmax, token_ids)
    else:
      token_embs = self.token_emb.EmbLookup(theta.token_emb, token_ids)
    # Kept aside so the caller gets embeddings without positional encodings.
    orig_input_embs = token_embs

    # [1, time, dim]
    if p.packed_input:
      pos_lookup = self.position_emb.FPropWithPosition(
          theta.position_emb, input_batch.segment_pos)
    else:
      pos_lookup = self.position_emb.FProp(theta.position_emb, time)
    position_embs = tf.expand_dims(pos_lookup, 0)

    # [batch, time, dim]
    summed_embs = token_embs + position_embs

    dropout_dtype = p.input_dropout_tpl.fprop_dtype
    if dropout_dtype:
      summed_embs = tf.cast(summed_embs, dropout_dtype)
      paddings = tf.cast(paddings, dropout_dtype)

    summed_embs = self.input_dropout.FProp(theta.input_dropout, summed_embs)

    # Explicitly set the input shape of Transformer layers, to avoid
    # unknown shape error occurred to tf.einsum on nonTPU devices.
    # [batch, time, dim]
    transformer_input = tf.reshape(summed_embs, [batch, time, p.model_dim])

    # Compute self-attention segment mask once.
    if p.packed_input:
      segment_mask = batch_major_attention.SegmentMask(
          segment_ids, segment_ids, dtype=transformer_input.dtype)
    else:
      segment_mask = tf.zeros([batch, 1, time, time])

    encoded, padding = self.transformer_stack.FProp(
        theta.transformer_stack, transformer_input, paddings, segment_mask)

    if p.final_layer_norm:
      encoded = self.final_ln.FProp(theta.final_ln, encoded)

    # Computed while `padding` is still batch major, so axis 1 is time.
    seq_lengths = tf.cast(tf.reduce_sum(1. - padding, axis=1), tf.int32)

    if p.output_data_format == 'TBC':
      encoded = tf.transpose(encoded, [1, 0, 2])  # [time, batch, dim]
      padding = tf.transpose(padding)  # [time, batch]
      if p.packed_input:
        segment_ids = tf.transpose(segment_ids)
      else:
        segment_ids = None
      orig_input_embs = tf.transpose(orig_input_embs, [1, 0, 2])

    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        seq_lengths=seq_lengths,  # used by beam_search_helper.
        segment_id=segment_ids,
        embedded_inputs=orig_input_embs)
def BeamSearchDecode(self,
                     theta,
                     encoder_outputs,
                     num_hyps_per_beam_override=0,
                     init_beam_search_state=None,
                     pre_beam_search_step_callback=None,
                     post_beam_search_step_callback=None,
                     max_steps=None):
  """Performs beam-search based decoding.

  Args:
    theta: A NestedMap object containing weights' values of the decoder layer
      and its children layers.
    encoder_outputs: A NestedMap containing encoder outputs to be passed to
      the callbacks. Mostly opaque to BeamSearchHelper, except that it should
      contain either a 'seq_lengths' field of shape [source_batch_size] or a
      'padding' field (read as `encoder_outputs.padding`) of shape
      [source_max_lengths, source_batch_size]. 'padding' may itself be a
      NestedMap, in which case its first flattened entry is used.
    num_hyps_per_beam_override: If set to a value <= 0, this parameter is
      ignored. If set to a value > 0, then this value will be used to override
      `p.num_hyps_per_beam`.
    init_beam_search_state: The `InitBeamSearchState` callback. Please refer
      to the class header comments for more details.
    pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    post_beam_search_step_callback: The `PostBeamSearchStepCallback` callback.
      Please refer to the class header comments for more details.
    max_steps: maximum beam search steps. If None, use
      self.params.target_seq_len.

  Returns:
    A `BeamSearchDecodeOutput`.
  """
  p = self.params
  num_hyps_per_beam = p.num_hyps_per_beam
  if num_hyps_per_beam_override > 0:
    num_hyps_per_beam = num_hyps_per_beam_override
  if max_steps is None:
    max_steps = p.target_seq_len

  initial_results, other_states = init_beam_search_state(
      theta, encoder_outputs, num_hyps_per_beam)

  num_hyps = tf.shape(initial_results.log_probs)[0]
  num_beams = num_hyps // num_hyps_per_beam

  if 'step_ids' in initial_results:
    # [num_hyps, 1]
    step_ids = tf.ensure_shape(initial_results.step_ids, [None, 1])
  else:
    step_ids = tf.fill([num_hyps, 1],
                       tf.constant(p.target_sos_id, dtype=tf.int32))

  # Per-beam best score starts at a very low sentinel value.
  min_score = -1e36
  best_scores = (tf.zeros(shape=[num_beams], dtype=p.dtype) + min_score)
  cumulative_scores = tf.zeros(shape=[num_hyps], dtype=p.dtype)
  in_scores = tf.zeros([max_steps, num_hyps], dtype=p.dtype)
  in_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
  in_prev_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
  in_done_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.string)
  bs_atten_probs = tf.zeros(
      [max_steps, num_hyps,
       tf.shape(initial_results.atten_probs)[1]],
      dtype=p.dtype)
  cur_step = tf.constant(0, dtype=tf.int32)
  all_done = tf.constant(False, dtype=tf.bool)
  # NOTE: `in_done_hyps` is at index 5; it is read back by position below.
  core_bs_states = (best_scores, cumulative_scores, in_scores, in_hyps,
                    in_prev_hyps, in_done_hyps, bs_atten_probs)

  def LoopContinue(cur_step, all_done, unused_step_ids, unused_core_bs_states,
                   unused_other_states_list):
    return tf.math.logical_and(cur_step < max_steps,
                               tf.math.logical_not(all_done))

  def LoopBody(cur_step, unused_all_done, step_ids, core_bs_states,
               other_states_list):
    (cur_step, all_done, new_step_ids, new_bs_states,
     new_other_states) = self._BeamSearchStep(
         theta, encoder_outputs, cur_step, step_ids, core_bs_states,
         other_states.Pack(other_states_list), num_hyps_per_beam,
         pre_beam_search_step_callback, post_beam_search_step_callback)
    return (cur_step, all_done, new_step_ids, new_bs_states,
            new_other_states.Flatten())

  # The loop carries `other_states` as a flat list; it is re-packed into the
  # original structure inside the body and after the loop.
  flat_other_states = other_states.Flatten()
  _, _, _, final_bs_states, flat_final_other_states = tf.while_loop(
      LoopContinue,
      LoopBody,
      loop_vars=(cur_step, all_done, step_ids, core_bs_states,
                 flat_other_states),
      parallel_iterations=10,
      back_prop=False,
      swap_memory=False,
      shape_invariants=(tf.TensorShape(cur_step.get_shape()),
                        tf.TensorShape(all_done.get_shape()),
                        tf.TensorShape(step_ids.get_shape()),
                        _GetShapes(core_bs_states),
                        _GetShapes(flat_other_states, none_shapes=True)))
  # [target_seq_len, num_beams * num_hyps_per_beam].
  final_done_hyps = final_bs_states[5]
  final_other_states = other_states.Pack(flat_final_other_states)

  # Assume that `padding` has shape [source_max_lengths, source_batch_size]
  # by default, and compute `encoded_seq_lengths` accordingly. This can be
  # overridden by directly passing `seq_lengths` in the `encoder_outputs`
  # NestedMap.
  encoded_seq_lengths = getattr(encoder_outputs, 'seq_lengths', None)
  if encoded_seq_lengths is None:
    source_paddings = encoder_outputs.padding
    if isinstance(source_paddings, py_utils.NestedMap):
      source_paddings = source_paddings.Flatten()[0]
    # Always sum in float32. The NestedMap case used to skip this cast, so
    # low-precision (or non-float) paddings produced inexact lengths there.
    encoded_seq_lengths = tf.cast(
        tf.round(
            tf.reduce_sum(
                1.0 - tf.cast(tf.transpose(source_paddings), tf.float32),
                1)), tf.int32)

  # [num_beams, num_hyps_per_beam].
  topk_hyps = ops.top_k_terminated_hyps(
      final_done_hyps,
      encoded_seq_lengths,
      k=num_hyps_per_beam,
      num_hyps_per_beam=num_hyps_per_beam,
      length_normalization=p.length_normalization,
      coverage_penalty=p.coverage_penalty,
      target_seq_length_ratio=p.target_seq_length_ratio)
  # [num_beams * num_hyps_per_beam, ...].
  # A tensor-valued max_steps cannot be handed over as a static length, so 0
  # is passed to unpack_hyp in that case.
  max_seq_length = 0 if isinstance(max_steps, tf.Tensor) else max_steps
  topk_ids, topk_lens, topk_scores = ops.unpack_hyp(
      tf.reshape(topk_hyps, [-1]), max_seq_length=max_seq_length)
  # [num_beams, num_hyps_per_beam].
  topk_scores = tf.reshape(topk_scores, tf.shape(topk_hyps))

  return BeamSearchDecodeOutput(final_done_hyps, topk_hyps, topk_ids,
                                topk_lens, topk_scores, None,
                                final_other_states)
def _resource_apply_dense(self, grad, var):
  """Builds the Adafactor update ops for a single dense variable.

  The second-moment estimate is kept either factored (slots 'vr' and 'vc')
  or unfactored (slot 'v'), depending on `self._factored_dims`. When
  `self._cond_is_finite` is set, every assignment is wrapped in a `tf.cond`
  on the finiteness flag accumulated up to that point.

  Args:
    grad: Gradient tensor for `var`, or None.
    var: The variable to update.

  Returns:
    An op grouping all slot and variable updates, or an empty list when
    `grad` is None.
  """
  if grad is None:
    tf.logging.warning('Gradient is None for variable %s' % var.name)
    return []

  grad_dtype = var.dtype  # TODO(lepikhin): add to params
  grad = tf.cast(grad, grad_dtype)

  factored_dims = self._factored_dims(var.shape.as_list())
  if factored_dims:
    vr = self.get_slot(var, 'vr')
    vc = self.get_slot(var, 'vc')
  else:
    v = self.get_slot(var, 'v')
  if self._beta1:
    m = self.get_slot(var, 'm')

  # Running "everything seen so far is finite" flag. It is re-bound below;
  # _GuardedAssign reads whatever value it holds at the time of the call.
  all_finite = tf.constant(True)

  def _TrackFinite(flag, tensor):
    """Folds the finiteness of `tensor` into `flag` (no-op when disabled)."""
    if self._cond_is_finite:
      flag = tf.math.logical_and(
          flag, tf.reduce_all(tf.math.is_finite(tensor)))
      flag = tf.math.logical_and(
          flag, tf.reduce_all(tf.math.logical_not(tf.math.is_inf(tensor))))
    return flag

  def _GuardedAssign(assign_fn, target, value):
    """Applies assign_fn(target, value), skipping it if non-finite so far."""
    if self._cond_is_finite:
      return tf.cond(all_finite, lambda: assign_fn(target, value),
                     lambda: target)
    return assign_fn(target, value)

  with tf.variable_scope(var.name[:-2] + '/Adafactor'):
    grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, grad_dtype)
    all_finite = _TrackFinite(all_finite, grad_squared)

    decay_rate = tf.cast(self._decay_rate, var.dtype)
    old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
    if self._multiply_by_parameter_scale:
      update_scale = self._parameter_scale(old_val) * tf.cast(
          self._learning_rate, grad_dtype)
    else:
      update_scale = self._learning_rate
    mixing_rate = tf.cast(1.0 - decay_rate, grad_dtype)
    update_scale = tf.cast(update_scale, grad_dtype)

    updates = []
    if factored_dims:
      vr_axis, vc_axis = factored_dims
      row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
      col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
      # Exponential moving averages of the row / column statistics.
      new_vr = vr * decay_rate + row_mean * mixing_rate
      new_vc = vc * decay_rate + col_mean * mixing_rate
      all_finite = _TrackFinite(all_finite, new_vr)
      all_finite = _TrackFinite(all_finite, new_vc)
      updates.append(_GuardedAssign(tf.assign, vr, new_vr))
      updates.append(_GuardedAssign(tf.assign, vc, new_vc))
      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
      r_factor = tf.math.rsqrt(new_vr / long_term_mean)
      c_factor = tf.math.rsqrt(new_vc)
      x = grad * tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
          c_factor, vc_axis)
    else:
      new_v = v * decay_rate + grad_squared * mixing_rate
      all_finite = _TrackFinite(all_finite, new_v)
      updates.append(_GuardedAssign(tf.assign, v, new_v))
      x = grad * tf.math.rsqrt(new_v)

    if self._clipping_threshold is not None:
      # Scale the update down when its RMS exceeds the clipping threshold.
      clipping_denom = tf.maximum(
          tf.constant(1.0, grad_dtype),
          py_utils.ReduceRms(x) /
          tf.constant(self._clipping_threshold, grad_dtype))
      x /= clipping_denom

    subtrahend = x * update_scale
    if self._beta1:
      new_m = (
          m * tf.constant(self._beta1, dtype=grad_dtype) +
          subtrahend * tf.constant(1.0 - self._beta1, dtype=grad_dtype))
      subtrahend = new_m
      all_finite = _TrackFinite(all_finite, new_m)
      updates.append(_GuardedAssign(tf.assign, m, new_m))

    # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
    # for the case of bfloat16 activations, so as to avoid repeatedly
    # rounding the slice value, which results in poor quality.
    all_finite = _TrackFinite(all_finite, subtrahend)
    updates.append(_GuardedAssign(tf.assign_sub, var, subtrahend))
    return tf.group(*updates)
def ApplyClipping(self, theta, x):
  """Clips `x` symmetrically to the cap given by the clipping schedule.

  Args:
    theta: Weights container; `theta.cc_schedule` is passed to the schedule.
    x: The tensor to clip.

  Returns:
    `x` unchanged when `p.cc_schedule` is not set, otherwise `x` clipped to
    [-cap, cap], where cap is the schedule state cast to `x.dtype`.
  """
  if self.params.cc_schedule:
    limit = tf.cast(self.cc_schedule.GetState(theta.cc_schedule), x.dtype)
    return tf.clip_by_value(x, -limit, limit)
  return x
def FProp(self, theta, current_step):
  """Returns the step-wise exponential decay factor for `current_step`.

  Args:
    theta: Unused.
    current_step: The current step; cast to float32.

  Returns:
    p.decay ** floor(current_step / p.num_steps_per_decay).
  """
  p = self.params
  step = tf.cast(current_step, tf.float32)
  # `tf.div` is deprecated; for float operands the `/` operator performs the
  # same true division.
  num_decays = tf.floor(step / float(p.num_steps_per_decay))
  return tf.pow(p.decay, num_decays)
def FProp(self, theta, current_step):
  """Returns multiplier / sqrt(max(current_step, warmup_steps)).

  Args:
    theta: Unused.
    current_step: The current step; cast to float32.

  Returns:
    The scaled inverse-square-root value for the clamped step.
  """
  p = self.params
  # Steps below p.warmup_steps are clamped up to it.
  clamped_step = tf.maximum(tf.cast(current_step, tf.float32), p.warmup_steps)
  return tf.math.rsqrt(clamped_step) * p.multiplier
def _CastFloats(v):
  """Casts floating-point `v` to the fprop dtype of the enclosing `p`.

  Args:
    v: A tensor-like object exposing `dtype`, or None.

  Returns:
    None for None, `v` cast to py_utils.FPropDtype(p) when its dtype is
    floating, and `v` itself otherwise.
  """
  if v is None:
    return None
  if v.dtype.is_floating:
    return tf.cast(v, py_utils.FPropDtype(p))
  return v
def FProp(self, theta, current_step):
  """Evaluates `self._exp` at `current_step` cast to `self.params.dtype`.

  Args:
    theta: Unused.
    current_step: The current step.

  Returns:
    The value returned by `self._exp` for the cast step.
  """
  step = tf.cast(current_step, dtype=self.params.dtype)
  return self._exp(step)
def _Gradient(inputs, _, original_grad):
  """Computes the GradDrop-transformed gradient.

  Uses `self` and `p` from the enclosing scope.

  Args:
    inputs: Tensor that the per-loss gradients are multiplied with (only its
      sign is used when p.use_input_sign_only is set).
    _: Unused.
    original_grad: The incoming gradient. It provides the output dtype and,
      when p.keep_gradnorm_constant is set, the norm to rescale to.

  Returns:
    The sum of the masked per-loss gradients, cast to original_grad.dtype
    and optionally rescaled to the norm of original_grad.

  Raises:
    ValueError: If none of the losses yields a gradient.
  """
  # Compute the gradients for each loss w.r.t. the inputs.
  # TODO(jngiam): Look into whether TF dedups this computation.
  per_loss_grads = []
  for loss, unused_leak_ratio in self._losses:
    per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
    if per_loss_grad is None:
      tf.logging.warn(
          'Loss %s did not result in a gradient during '
          'GradDrop computation.', loss)
    else:
      per_loss_grads.append(per_loss_grad)

  if not per_loss_grads:
    raise ValueError('No valid gradients for GradDrop.')

  # Multiply the gradients with the inputs.
  grads = per_loss_grads
  if p.use_input_sign_only:
    # Inputs within epsilon of zero get 1 added before the abs, which keeps
    # the division below away from zero.
    input_abs = tf.abs(
        tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
    grads = [grad * ((inputs) / (input_abs)) for grad in grads]
  else:
    grads = [grad * inputs for grad in grads]

  # Sum gradient over batch, assuming that batch is always on dim 0.
  if p.marginalize_batch_dim:
    grads = [tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads]

  # First discretize all gradients into their sign values.
  grad_sign_positive = [tf.cast(grad > 0.0, tf.float32) for grad in grads]
  grad_sign_negative = [tf.cast(grad < 0.0, tf.float32) for grad in grads]

  # Calculate the probability of positive gradients based on equation (1)
  # in the GradDrop paper.
  grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
  prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
  # Implementation of different scales for the keep function. Larger
  # scales result in steeper keep functions.
  prob_pos *= p.keep_prob_function_scale

  if p.keep_prob_function == 'sigmoid':
    # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
    # allows the function scale in sigmoid to be compatible with the
    # function scale in the linear case.
    prob_pos = tf.sigmoid(4.0 * prob_pos)
  elif p.keep_prob_function == 'linear':
    prob_pos += 0.5

  # The main, default mode of GradDrop. Only gradients of one sign are kept,
  # and which sign is calculated via equation (1) of the main paper.
  # After this line prob_pos is +0.5 where positive gradients are kept and
  # -0.5 where negative gradients are kept.
  prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                     tf.float32) - 0.5
  grad_masks = [(gsp - gsn) * prob_pos >= 0
                for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)]

  # This diag value gives us the percentage of grads which are kept.
  gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
  diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
  summary_utils.scalar('average_grad_mask', diag)

  # Masked-out entries still pass `leak` of their gradient through.
  leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
  transformed_per_loss_grads = [
      grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
      for (leak, grad,
           grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
  ]

  transformed_grad = tf.cast(
      tf.add_n(transformed_per_loss_grads), original_grad.dtype)

  if not p.keep_gradnorm_constant:
    return transformed_grad

  transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
  original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
  # Rescale to the norm of the original gradient: divide by the transformed
  # norm and multiply by the original one. (The ratio used to be inverted,
  # which produced a norm of transformed_norm**2 / original_norm instead.)
  return transformed_grad * original_grad_norm / (
      transformed_grad_norm + p.epsilon)
def _TimeMask(self,
              inputs,
              seq_lengths,
              global_seed,
              noisify=False,
              gaussian_noise=False,
              dtype=tf.float32,
              domain_id_index=0):
  """Applies time masking with given degree to inputs.

  Args:
    inputs: Batch of input features of shape (batch_size, time_length,
      num_freq, channels).
    seq_lengths: The actual sequence lengths, of shape (batch_size,), within
      which the masks are sampled.
    global_seed: an integer seed tensor for stateless random ops.
    noisify: Whether to noisify the masked out regions.
    gaussian_noise: Whether to use gaussian noise when noisifying.
    dtype: Data type.
    domain_id_index: Index into the per-domain masking parameters.

  Returns:
    Inputs with random time masking applied.
  """
  p = self.params

  # Per-domain time masking parameters.
  max_frames = p.time_mask_max_frames[domain_id_index]
  masks_per_frame = p.time_masks_per_frame[domain_id_index]
  dynamic_max_frames = p.use_dynamic_time_mask_max_frames[domain_id_index]
  mask_count = p.time_mask_count[domain_id_index]
  max_ratio = p.time_mask_max_ratio[domain_id_index]

  # Nothing to do when the maximum mask length is zero (and not dynamic),
  # when the ratio is non-positive, or when no masks are requested.
  zero_length = ((max_frames == 0 and not dynamic_max_frames) or
                 max_ratio <= 0.0)
  if zero_length or mask_count == 0:
    return inputs

  seq_lengths = tf.cast(seq_lengths, tf.int32)
  batch_size, time_length, _, _ = py_utils.GetShape(inputs)

  # When using dynamic time mask size, discard upper-bound on
  # maximum allowed frames for time mask.
  if dynamic_max_frames:
    max_frames = None

  # Create masks in time direction and apply.
  block_arrays = self._GetMask(
      batch_size,
      choose_range=seq_lengths,
      mask_size=time_length,
      global_seed=global_seed,
      max_length=max_frames,
      masks_per_frame=masks_per_frame,
      multiplicity=mask_count,
      dtype=dtype,
      max_ratio=max_ratio)

  # Non-empty random seed values are only used for testing or when using
  # stateless random ops. seed_6 drives the noise scale factor and seed_7 the
  # noise itself.
  if p.use_input_dependent_random_seed:
    seed_6 = global_seed + 6
    seed_7 = global_seed + 7
  else:
    seed_6 = seed_7 = p.random_seed

  outputs = self.EinsumBxycBxBxyc(
      inputs, block_arrays, name='einsum_formasking')
  if not noisify:
    return outputs

  # Sample noise with standard deviation factor * 0.1 + 0.0001 (or 1.0 for
  # gaussian noise).
  # TODO(ngyuzh): Make sure this won't affect EOS.
  if gaussian_noise:
    stddev = 1.0
  else:
    random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
    factor = random_uniform(
        shape=(), minval=1.0, maxval=2.0, dtype=dtype, seed=seed_6)
    stddev = factor * 0.1 + 0.0001
  random_normal = _random_normal_op(p.use_input_dependent_random_seed)
  # One noise value per (batch, time, freq) position; shared over channels.
  noise_shape = [
      tf.shape(inputs)[0],
      tf.shape(inputs)[1],
      tf.shape(inputs)[2]
  ]
  noise = random_normal(shape=noise_shape, stddev=stddev, seed=seed_7)
  if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
    noise = tf.cast(noise, p.fprop_dtype)
  # Keep the noise only where the inputs were masked out.
  outputs_mask = self.EinsumBxyBxBxy(
      noise, 1.0 - block_arrays, name='einsum_fornoisymasking')
  return outputs + tf.expand_dims(outputs_mask, -1)
def _ConstructWarpMatrix(self, batch_size, matrix_size, origin, destination,
                         choose_range, dtype):
  """Returns warp matrices according to origin, destination and choose_range.

  Builds a batch of warp matrices, each implementing a piecewise linear map
  that fixes the points 0 and `choose_range` and sends the anchor point
  `origin` to `destination`.

  For the warping matrix to be non-singular, destination must lie in the
  range 1 <= destination <= choose_range - 1, so a destination out of this
  range is adjusted to be in this range before the matrix is constructed.

  With the slopes
    slope_0 = origin / destination,
    slope_1 = (choose_range - origin) / (choose_range - destination),
    slope_2 = 1.0,
  the origin point orig_i of the mapped coordinate i is:
    i < destination:                 slope_0 * i
    destination <= i < choose_range: slope_1 * i
                                     - (slope_1 - slope_0) * destination
    i >= choose_range:               i

  Denoting n_i = ceil(orig_i), the element warp[i][j] is:
    j = n_i:     1 - n_i + orig_i
    j = n_i - 1: n_i - orig_i
    otherwise:   0
  so that applying the matrix to pixels yields
    warped_pixel[i] = (n_i - orig_i) * pixel[n_i - 1]
                      + (1 - n_i + orig_i) * pixel[n_i].

  Args:
    batch_size: Batch size. Integer number.
    matrix_size: Dimension of the vector space the warp matrix is applied to.
      Integer number.
    origin: Origin anchor point for warping. Tensor of shape (batch_size,) and
      data type dtype.
    destination: Destination of the origin anchor point upon warping. Tensor
      of shape (batch_size,) and data type dtype.
    choose_range: Range within which the warp reference points must lie.
      Tensor of shape (batch_size,) data type dtype.
    dtype: Data type of origin, destination, choose_range and the output warp
      matrix.

  Returns:
    warp_matrix: An array of fixed size warp matrices with shape
    (batch_size, matrix_size, matrix_size).
  """
  p = self.params

  def _TileAcrossCoordinates(per_example):
    """Expands a (batch_size,) tensor to (batch_size, matrix_size)."""
    tiled = tf.broadcast_to(per_example, (matrix_size, batch_size))
    return tf.transpose(tiled)

  # Entries of destination must be in the range
  # 1 <= destination <= choose_range - 1
  # for warp matrix to have non-singular values.
  destination = tf.minimum(tf.maximum(destination, 1.0), choose_range - 1.0)

  destination_bc = _TileAcrossCoordinates(destination)
  choose_range_bc = _TileAcrossCoordinates(choose_range)

  # Slopes of the three pieces of the piecewise linear map.
  slope_0 = origin / destination
  slope_1 = (choose_range - origin) / (choose_range - destination)
  slope_2 = 1.0

  # Origin coordinate of every target coordinate i, per example. The relu
  # terms switch the slope at `destination` and again at `choose_range`.
  coords = tf.broadcast_to(
      tf.cast(tf.range(matrix_size), dtype=dtype), (batch_size, matrix_size))
  first_piece = self.EinsumBBmBm(slope_0, coords)
  second_piece = self.EinsumBBmBm(slope_1 - slope_0,
                                  tf.nn.relu(coords - destination_bc))
  third_piece = self.EinsumBBmBm(slope_2 - slope_1,
                                 tf.nn.relu(coords - choose_range_bc))
  origin_coords = first_piece + second_piece + third_piece
  # Expand to a batch of matrices whose entry [i][j] is the origin
  # coordinate of i.
  origin_coords = tf.broadcast_to(origin_coords,
                                  (matrix_size, batch_size, matrix_size))
  origin_coords = tf.transpose(origin_coords, perm=[1, 2, 0])

  # Batch of coordinate matrices whose entry [i][j] is j.
  column_index = tf.broadcast_to(
      tf.cast(tf.range(matrix_size), dtype=dtype),
      (batch_size, matrix_size, matrix_size))

  # The warp matrix is the hat function applied element-wise to the
  # difference, giving the interpolation weights described above.
  warp_matrix = _hat(origin_coords - column_index)
  if p.fprop_dtype is not None and p.fprop_dtype != dtype:
    warp_matrix = tf.cast(warp_matrix, p.fprop_dtype)

  return warp_matrix
def _StepNum():
  """Returns the global step (created on demand) cast to tf.float32."""
  global_step = tf.train.get_or_create_global_step()
  return tf.cast(global_step, tf.float32)
def AddAttentionSummaryBatchMajor(name,
                                  attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
  """Emits an image summary of batch-major attention probabilities.

  Unlike AddAttentionSummary(), every tensor here carries the batch dimension
  in axis 0.

  Note that the entries of `attention_tensors` are replaced in place by
  shape-checked identity tensors.

  Args:
    name: Summary name.
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for
      the source sequence, or a list of such tensors. A list must hold either
      one entry (shared) or one entry per attention tensor.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for
      the target sequence, or a list of such tensors. A list must hold either
      one entry (shared) or one entry per attention tensor.
    transcripts: Optional, transcripts shaped [batch, source_len] for the
      source sequence. Only passed to the first subplot.
    max_outputs: Integer maximum number of elements of the batch to plot.

  Raises:
    ValueError: If a paddings list has a length that is neither 1 nor
      len(attention_tensors).
  """

  def _AsList(paddings):
    """Wraps a single paddings tensor into a one-element list."""
    return paddings if isinstance(paddings, list) else [paddings]

  def _Pick(items, index):
    """Returns the shared entry if there is exactly one, else items[index]."""
    return items[0] if len(items) == 1 else items[index]

  # Validate the number of paddings entries (source first, then target).
  for paddings in (src_paddings, tgt_paddings):
    count = len(paddings) if isinstance(paddings, list) else 1
    if not (count == 1 or count == len(attention_tensors)):
      raise ValueError('Bad length of paddings list {}'.format(count))

  # Attach a shape assertion to every attention tensor.
  for idx, atten in enumerate(attention_tensors):
    src = _Pick(_AsList(src_paddings), idx)
    tgt = _Pick(_AsList(tgt_paddings), idx)
    tgt_shape = py_utils.GetShape(tgt)
    actual_shape = py_utils.GetShape(atten)
    expected_shape = (
        tgt_shape[:2] + [py_utils.GetShape(src)[1]] + tgt_shape[2:])
    shape_check = py_utils.assert_equal(actual_shape, expected_shape)
    checked = py_utils.with_dependencies([shape_check], atten)
    # Reuse the original op name (tensor name minus the ':<index>' suffix).
    attention_tensors[idx] = tf.identity(checked,
                                         re.sub(':.*$', '', atten.name))

  if not _ShouldAddSummary():
    return

  src_lens = [SequenceLength(x) for x in _AsList(src_paddings)]
  tgt_lens = [SequenceLength(x) for x in _AsList(tgt_paddings)]

  with plot.MatplotlibFigureSummary(
      name + '/Attention',
      max_outputs=max_outputs,
      gridspec_kwargs={'hspace': 0.3}) as fig:
    for n, atten in enumerate(attention_tensors):
      # Diagnostic metric that decreases as attention picks up.
      max_entropy = tf.math.log(tf.cast(_Pick(src_lens, n), tf.float32))
      max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
      neg_p_log_p = -atten * tf.math.log(atten + 1e-10)
      atten_normalized_entropy = neg_p_log_p / max_entropy
      scalar(name + '/Attention/average_normalized_entropy/%d' % n,
             tf.reduce_mean(atten_normalized_entropy))
      plot_args = [atten, _Pick(src_lens, n), _Pick(tgt_lens, n)]
      if transcripts is not None and n == 0:
        plot_args.append(transcripts)
      fig.AddSubplot(
          plot_args,
          TrimPaddingAndPlotAttention,
          title=atten.name,
          xlabel='Input',
          ylabel='Output')
def try_apply_dense(self, grad, var):
  """Builds the tensors of a dense update step without assigning anything.

  The intermediate quantities of the update are computed and recorded, but
  every assignment to the slots or to `var` is commented out below, so this
  method only produces diagnostics.

  Args:
    grad: Gradient tensor for `var`. Must not be None. It is cast to
      `var.dtype`.
    var: The variable the update would be applied to. Its slots ('vr'/'vc' or
      'v', plus 'm' when self._beta1 is set) are read via self.get_slot.

  Returns:
    A tuple (is_finite_checks, stats):
    - is_finite_checks: list with one
      `tf.reduce_all(tf.math.is_finite(x))` tensor per recorded quantity, in
      recording order.
    - stats: dict mapping the recorded quantity name (e.g. 'grad_squared',
      'new_vr', 'subtrahend') to its tensor.
  """
  assert grad is not None

  # NOTE(review): `cond` is only threaded through _Upd and is never read
  # otherwise; _Upd returns it unchanged.
  cond = tf.constant(True)
  is_finite_checks = []
  stats = {}

  grad_dtype = var.dtype  # TODO(lepikhin): add to params
  grad = tf.cast(grad, grad_dtype)
  # A truthy result selects the factored (vr/vc) second-moment slots.
  factored_dims = self._factored_dims(var.shape.as_list())
  if factored_dims:
    vr = self.get_slot(var, 'vr')
    vc = self.get_slot(var, 'vc')
  else:
    v = self.get_slot(var, 'v')
  if self._beta1:
    m = self.get_slot(var, 'm')

  def _Upd(c, k, x):
    """Records x under key k and appends its all-finite check; returns c."""
    stats[k] = x
    is_finite_checks.append(tf.reduce_all(tf.math.is_finite(x)))
    return c

  # var.name[:-2] drops the last two characters of the variable name
  # (presumably the ':0' output suffix -- TODO confirm).
  with tf.variable_scope(var.name[:-2] + '/Adafactor'):
    grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, grad_dtype)
    cond = _Upd(cond, 'grad_squared', grad_squared)  # 0 (factored)
    decay_rate = tf.cast(self._decay_rate, var.dtype)
    old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
    # NOTE(review): the assert makes the `else` branch below unreachable
    # unless assertions are disabled.
    assert self._multiply_by_parameter_scale
    if self._multiply_by_parameter_scale:
      parameter_scale = self._parameter_scale(old_val)
      cond = _Upd(cond, 'parameter_scale', parameter_scale)  # 1 (factored)
      # NOTE(review): self._parameter_scale(old_val) is evaluated a second
      # time here instead of reusing `parameter_scale`.
      update_scale = self._parameter_scale(old_val) * tf.cast(
          self._learning_rate, grad_dtype)
    else:
      update_scale = self._learning_rate
    mixing_rate = tf.cast(1.0 - decay_rate, grad_dtype)
    update_scale = tf.cast(update_scale, grad_dtype)
    if factored_dims:
      d0, d1 = factored_dims
      vr_axis, vc_axis = d0, d1
      grad_squared_row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
      grad_squared_col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
      # new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
      new_vr = vr * decay_rate + grad_squared_row_mean * mixing_rate
      # new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
      new_vc = vc * decay_rate + grad_squared_col_mean * mixing_rate
      cond = _Upd(cond, 'new_vr', new_vr)  # 2 (factored)
      cond = _Upd(cond, 'new_vc', new_vc)  # 3 (factored)
      # Slot assignments intentionally disabled:
      # vr_update = _Wrap(tf.assign, vr, new_vr)
      # vc_update = _Wrap(tf.assign, vc, new_vc)
      # updates.extend([vr_update, vc_update])
      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
      r_factor = tf.math.rsqrt(new_vr / long_term_mean)
      c_factor = tf.math.rsqrt(new_vc)
      # Re-insert the reduced axes so the two factors broadcast against grad.
      mult = tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
          c_factor, vc_axis)
      cond = _Upd(cond, 'mult', mult)  # 4 (factored)
      x = grad * mult
    else:
      new_v = v * decay_rate + grad_squared * mixing_rate
      cond = _Upd(cond, 'new_v', new_v)
      # Slot assignment intentionally disabled:
      # v_update = _Wrap(tf.assign, v, new_v)
      # updates.append(v_update)
      x = grad * tf.math.rsqrt(new_v)
    assert self._clipping_threshold is not None
    if self._clipping_threshold is not None:
      # Scale x down only when its RMS exceeds the clipping threshold.
      clipping_denom = tf.maximum(
          tf.constant(1.0, grad_dtype),
          py_utils.ReduceRms(x) /
          tf.constant(self._clipping_threshold, grad_dtype))
      x /= clipping_denom
    cond = _Upd(cond, 'x', x)
    subtrahend = x * update_scale
    if self._beta1:
      # Exponential moving average of the subtrahend with weight beta1.
      new_m = (
          m * tf.constant(self._beta1, dtype=grad_dtype) +
          subtrahend * tf.constant(1.0 - self._beta1, dtype=grad_dtype))
      subtrahend = new_m
      cond = _Upd(cond, 'new_m', new_m)
      # updates.append(_Wrap(tf.assign, m, new_m))
    # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
    # for the case of bfloat16 activations, so as to avoid repeatedly
    # rounding the slice value, which results in poor quality.
    cond = _Upd(cond, 'subtrahend', subtrahend)  # 5 (factored)

    # Variable assignment intentionally disabled:
    # var_update = _Wrap(tf.assign_sub, var, subtrahend)
    # updates.append(var_update)

    return is_finite_checks, stats
def _BuildMetric(self, feed_data, classid):
  """Builds the Waymo metric op's fetch tensors and the matching feed_dict.

  Args:
    feed_data: a NestedMap returned by _GetData(), or None. When None, dummy
      NaN scalars and all-zero curves are returned with an empty feed_dict.
    classid: integer class id used for every ground-truth and predicted box.

  Returns:
    A tuple of 3 dicts:
      - scalar_metrics: a dict mapping all the metric names to fetch tensors.
      - curves: a dict mapping all the curve names to fetch tensors.
      - feed_dict: a dict mapping the tensors in feed_tensors to feed values.
  """
  breakdown_names = config_util.get_breakdown_names_from_config(
      self._waymo_metric_config)

  if feed_data is None:
    nan_scalar = tf.constant(np.nan)
    zero_curve = tf.zeros(
        [self.metadata.NumberOfPrecisionRecallPoints(), 2], tf.float32)
    scalar_metrics = {'ap': nan_scalar, 'ap_ha_weighted': nan_scalar}
    curve_metrics = {'pr': zero_curve, 'pr_ha_weighted': zero_curve}
    for breakdown in breakdown_names:
      scalar_metrics['ap_%s' % breakdown] = nan_scalar
      scalar_metrics['ap_ha_weighted_%s' % breakdown] = nan_scalar
      curve_metrics['pr_%s' % breakdown] = zero_curve
      curve_metrics['pr_ha_weighted_%s' % breakdown] = zero_curve
    return scalar_metrics, curve_metrics, {}

  # One placeholder per fed array; creation order matches feed_dict order.
  gt_bbox_ph = tf.placeholder(tf.float32)
  gt_imgid_ph = tf.placeholder(tf.int32)
  pd_bbox_ph = tf.placeholder(tf.float32)
  pd_imgid_ph = tf.placeholder(tf.int32)
  pd_score_ph = tf.placeholder(tf.float32)
  feed_dict = {
      gt_bbox_ph: feed_data.gt.bbox,
      gt_imgid_ph: feed_data.gt.imgid,
      pd_bbox_ph: feed_data.pd.bbox,
      pd_imgid_ph: feed_data.pd.imgid,
      pd_score_ph: feed_data.pd.score,
  }

  # Every box is tagged with the same class id.
  gt_count = feed_data.gt.imgid.shape[0]
  pd_count = feed_data.pd.imgid.shape[0]
  gt_class_ids = tf.constant(classid, dtype=tf.uint8, shape=[gt_count])
  pd_class_ids = tf.constant(classid, dtype=tf.uint8, shape=[pd_count])

  ap, ap_ha, pr, pr_ha, _ = py_metrics_ops.detection_metrics(
      prediction_bbox=pd_bbox_ph,
      prediction_type=pd_class_ids,
      prediction_score=pd_score_ph,
      prediction_frame_id=tf.cast(pd_imgid_ph, tf.int64),
      prediction_overlap_nlz=tf.zeros_like(pd_imgid_ph, dtype=tf.bool),
      ground_truth_bbox=gt_bbox_ph,
      ground_truth_type=gt_class_ids,
      ground_truth_frame_id=tf.cast(gt_imgid_ph, tf.int64),
      ground_truth_difficulty=tf.zeros_like(gt_imgid_ph, dtype=tf.uint8),
      config=self._waymo_metric_config.SerializeToString())

  # The op's outputs are indexed by breakdown along the leading dimension.
  # The un-suffixed entries take index 0; each named breakdown takes its own
  # index.
  scalar_metrics = {'ap': ap[0], 'ap_ha_weighted': ap_ha[0]}
  curve_metrics = {'pr': pr[0], 'pr_ha_weighted': pr_ha[0]}
  for index, breakdown in enumerate(breakdown_names):
    scalar_metrics['ap_%s' % breakdown] = ap[index]
    scalar_metrics['ap_ha_weighted_%s' % breakdown] = ap_ha[index]
    curve_metrics['pr_%s' % breakdown] = pr[index]
    curve_metrics['pr_ha_weighted_%s' % breakdown] = pr_ha[index]
  return scalar_metrics, curve_metrics, feed_dict
def AssignAnchors(self,
                  anchor_bboxes,
                  gt_bboxes,
                  gt_bboxes_labels,
                  gt_bboxes_mask,
                  foreground_assignment_threshold=0.5,
                  background_assignment_threshold=0.35,
                  background_class_id=0,
                  force_match=True,
                  similarity_fn=None):
  """Assigns anchors to bboxes using a similarity function (SSD-based).

  Each anchor box is assigned to the top matching ground truth box.
  Ground truth boxes can be assigned to multiple anchor boxes.

  Assignments can result in 3 outcomes:

    - Positive assignment (if score >= foreground_assignment_threshold):
      assigned_gt_labels will reflect the assigned box label and
      assigned_cls_mask will be set to 1.0
    - Background assignment (if score <= background_assignment_threshold):
      assigned_gt_labels will be background_class_id and assigned_cls_mask
      will be set to 1.0
    - Ignore assignment (otherwise):
      assigned_gt_labels will be background_class_id and assigned_cls_mask
      will be set to 0.0

  The detection loss function would usually:

    - Use assigned_cls_mask for weighting the classification loss. The mask
      is set such that the loss applies to foreground and background
      assignments only - ignored anchors will be set to 0.
    - Use assigned_reg_mask for weighting the regression loss. The mask is set
      such that the loss applies to foreground assignments only.

  The thresholds (foreground_assignment_threshold and
  background_assignment_threshold) should be tuned per dataset.

  TODO(jngiam): Consider having a separate threshold for regression boxes; a
  separate threshold is used in PointRCNN.

  Args:
    anchor_bboxes: tf.float32. [A, 7], where [..., :] corresponds to box
      parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes: tf.float32. [G, 7], where [..., :] corresponds to ground truth
      box parameters (x, y, z, dx, dy, dz, r).
    gt_bboxes_labels: tensor with shape [G]. Ground truth labels for each
      bounding box.
    gt_bboxes_mask: tensor with shape [G]. Mask for ground truth boxes, 1 iff
      the gt_bbox is a real bbox.
    foreground_assignment_threshold: Similarity score threshold for assigning
      foreground bounding boxes; scores need to be >=
      foreground_assignment_threshold to be assigned to foreground.
    background_assignment_threshold: Similarity score threshold for assigning
      background bounding boxes; scores need to be <=
      background_assignment_threshold to be assigned to background.
    background_class_id: class id reported in assigned_gt_labels for every
      anchor that is not a foreground match.
    force_match: Boolean specifying if force matching is enabled. If
      force matching is enabled, then matched anchors which are also the
      highest scoring with a ground-truth box are considered foreground
      matches as long as their similarity score > 0.
    similarity_fn: Function that computes a similarity score (e.g., IOU)
      between pairs of bounding boxes. This function should take in two
      tensors corresponding to anchor and ground-truth bboxes, and return a
      matrix [A, G] with the similarity score between each pair of bboxes. The
      score must be non-negative, with greater scores representing more
      similar. The fore/background_assignment_thresholds will be applied to
      this score to determine if an anchor is foreground, background or
      ignored. If set to None, the function will default to
      IOU2DRotatedBoxes.

  Returns:
    NestedMap with the following keys

    - assigned_gt_bbox: shape [A, 7] bbox parameters assigned to each anchor.
    - assigned_gt_similarity_score: shape [A] (iou) score between the anchor
      and the gt bbox.
    - assigned_gt_labels: shape [A] label assigned to bbox.
    - assigned_cls_mask: shape [A] mask for classification loss per anchor.
      This should be 1.0 if the anchor has a foreground or background
      assignment; otherwise, it will be assigned to 0.0.
    - assigned_reg_mask: shape [A] mask for regression loss per anchor.
      This should be 1.0 if the anchor has a foreground assignment;
      otherwise, it will be assigned to 0.0.
      Note: background anchors do not have regression targets.
  """
  if similarity_fn is None:
    similarity_fn = self.IOU2DRotatedBoxes

  # Shape validation.
  anchor_bboxes = py_utils.HasShape(anchor_bboxes, [-1, 7])
  num_anchor_bboxes, _ = py_utils.GetShape(anchor_bboxes, 2)
  gt_bboxes = py_utils.HasShape(gt_bboxes, [-1, 7])
  num_gt_bboxes, _ = py_utils.GetShape(gt_bboxes, 2)

  # Compute similarity score and reduce max by anchors and by ground-truth.
  similarity_score = similarity_fn(anchor_bboxes, gt_bboxes)
  similarity_score = py_utils.HasShape(similarity_score,
                                       [num_anchor_bboxes, num_gt_bboxes])

  # Reduce over ground-truth boxes, so we have the max score per anchor.
  anchor_max_score = tf.reduce_max(similarity_score, axis=1)
  anchor_max_idx = tf.argmax(similarity_score, axis=1)

  if force_match:
    # Reduce over anchors, so we have the max score per ground truth box.
    # Use `keepdims`: `keep_dims` is a deprecated alias that the TF2
    # reduce_max signature no longer accepts.
    gt_max_score = tf.reduce_max(similarity_score, axis=0, keepdims=True)

    # Force matches occur when the top matching gt bbox for an anchor is the
    # top matching anchor for the gt bbox. When force matching, we match
    # these boxes as long as their similarity score exceeds 0.
    force_matches = (
        tf.equal(similarity_score, gt_max_score)
        & tf.equal(similarity_score, anchor_max_score[..., tf.newaxis])
        & tf.greater(similarity_score, 0.)
        & tf.cast(gt_bboxes_mask[tf.newaxis, ...], tf.bool))
    force_match_indicator = tf.reduce_any(force_matches, axis=1)
    force_match_idx = tf.argmax(tf.cast(force_matches, tf.int32), axis=1)

    # In assigning foreground/background anchors later, force_match_indicator
    # is used to determine which anchors are force foreground, and the index
    # assigned will be taken from anchor_max_idx.

    # Force matchers must also be the max scoring gt bbox per anchor.
    # We overwrite anchor_max_idx to ensure that the right match is done.
    anchor_max_idx = tf.where(force_match_indicator, force_match_idx,
                              anchor_max_idx)

  # Ensure that max score boxes are not padded boxes by setting score to 0
  # for boxes that are padded.
  gathered_mask = tf.batch_gather(gt_bboxes_mask, anchor_max_idx)
  anchor_max_score = tf.where(
      tf.equal(gathered_mask, 1), anchor_max_score,
      tf.zeros_like(anchor_max_score))

  # Boolean tensors corresponding to whether an anchor is background or
  # foreground based on thresholding.
  background_anchors = tf.less_equal(anchor_max_score,
                                     background_assignment_threshold)
  foreground_anchors = tf.greater_equal(anchor_max_score,
                                        foreground_assignment_threshold)
  if force_match:
    # Background anchors are below threshold and not force matches.
    background_anchors &= ~force_match_indicator
    # Foreground anchors are above thresholds or force matches.
    foreground_anchors |= force_match_indicator

  # Add dummy background bbox to gt_boxes to facilitate batch gather.
  dummy_bbox = tf.constant([[0, 0, 0, 1, 1, 1, 0]], dtype=tf.float32)

  # Since we are concatenating the dummy bbox, the index corresponds to the
  # number of boxes.
  dummy_bbox_idx = py_utils.GetShape(gt_bboxes, 1)[0]

  gt_bboxes = tf.concat([gt_bboxes, dummy_bbox], axis=0)
  gt_bboxes_labels = tf.concat([gt_bboxes_labels, [background_class_id]],
                               axis=0)

  # Gather indices so that all foreground boxes are gathered from gt_bboxes,
  # while all background and ignore boxes gather the dummy_bbox.
  anchor_gather_idx = tf.where(
      foreground_anchors, anchor_max_idx,
      tf.constant(
          dummy_bbox_idx,
          shape=py_utils.GetShape(anchor_max_idx),
          dtype=anchor_max_idx.dtype))

  # Gather the bboxes and weights.
  assigned_gt_bbox = tf.batch_gather(gt_bboxes, anchor_gather_idx)
  assigned_gt_labels = tf.batch_gather(gt_bboxes_labels, anchor_gather_idx)

  # Set masks for classification and regression losses.
  assigned_cls_mask = tf.cast(background_anchors | foreground_anchors,
                              tf.float32)
  assigned_reg_mask = tf.cast(foreground_anchors, tf.float32)

  return py_utils.NestedMap(
      assigned_gt_bbox=assigned_gt_bbox,
      assigned_gt_similarity_score=anchor_max_score,
      assigned_gt_labels=assigned_gt_labels,
      assigned_cls_mask=assigned_cls_mask,
      assigned_reg_mask=assigned_reg_mask)
def ComputeAndUpdateMoments(self, theta, inputs, paddings=None, **kwargs):
  """Picks the normalization moments and refreshes the moving statistics.

  In eval mode, or when p.freeze_bn_stats is set, the stored moving mean and
  variance are returned untouched. Otherwise batch moments are computed, the
  moving statistics are updated from them, and histogram summaries are added.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor. Shaped [..., dim].
    paddings: The paddings tensor. Shaped [..., 1], with the same rank as the
      input tensor. Defaults to self._GetDefaultPaddings(inputs).
    **kwargs: Additional inputs, forwarded to self._GetBetaGamma.

  Returns:
    Tuple of (mean, variance, beta, gamma).
  """
  p = self.params
  if paddings is None:
    paddings = self._GetDefaultPaddings(inputs)
  # The innermost paddings dimension must be exactly 1.
  paddings_last_dim_check = py_utils.assert_shape_match(
      [tf.shape(paddings)[-1]], [1])
  inputs = py_utils.with_dependencies([paddings_last_dim_check], inputs)

  with tf.name_scope(p.name):
    moving_mean = self.vars.moving_mean
    moving_variance = self.vars.moving_variance

    if not (self.do_eval or p.freeze_bn_stats):
      # Reduce over every dimension except the last one.
      paddings_rank = tf.rank(paddings)
      reduce_over_dims = tf.range(0, paddings_rank - 1)
      mean, variance = ComputeMoments(inputs, paddings, reduce_over_dims,
                                      None,
                                      p.enable_cross_replica_sum_on_tpu)

      py_utils.UpdateBatchNormVars(moving_mean, mean, self._decay)
      py_utils.UpdateBatchNormVars(moving_variance, variance, self._decay)

      # Histograms of batch stats, moving stats and their differences.
      summary_utils.histogram('%s_mean' % p.name, tf.cast(mean, tf.float32))
      summary_utils.histogram('%s_variance' % p.name,
                              tf.cast(variance, tf.float32))
      summary_utils.histogram('%s_moving_mean' % p.name,
                              tf.cast(moving_mean, tf.float32))
      summary_utils.histogram('%s_moving_variance' % p.name,
                              tf.cast(moving_variance, tf.float32))
      mean_diff = tf.cast(mean, moving_mean.dtype.base_dtype) - moving_mean
      summary_utils.histogram('%s_mean_diff' % p.name,
                              tf.cast(mean_diff, tf.float32))
      variance_diff = (
          tf.cast(variance, moving_variance.dtype.base_dtype) -
          moving_variance)
      summary_utils.histogram('%s_variance_diff' % p.name,
                              tf.cast(variance_diff, tf.float32))

      if not p.use_moving_avg_in_training:
        # Normalize with the statistics of the current batch.
        norm_mean, norm_variance = mean, variance
      else:
        # Normalize with the global statistics. The control dependencies on
        # the batch moments make sure moving_mean and moving_variance get
        # updated on every training step.
        norm_mean = py_utils.with_dependencies([mean], moving_mean)
        norm_variance = py_utils.with_dependencies([variance],
                                                   moving_variance)
    else:
      # Eval / frozen stats: normalize with the stored moving statistics.
      norm_mean, norm_variance = moving_mean, moving_variance

    norm_mean = py_utils.CheckNumerics(
        norm_mean, 'mean of %s failed numeric check' % p.name)
    norm_variance = py_utils.CheckNumerics(
        norm_variance, 'variance of %s failed numeric check' % p.name)

    beta, gamma = self._GetBetaGamma(theta, inputs, **kwargs)
    return norm_mean, norm_variance, beta, gamma
def FProp(self, theta, input_batch):
  """Embeds source ids and transforms with TransformerStack.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    input_batch: A `.NestedMap` with fields:

      - ids: The inputs tensor. It is expected to be of shape [batch, time].
      - paddings: The paddings tensor. Expected shape [batch, time].
      - task_ids: If p.task_emb is provided, must contain per-token task
        ids of shape [batch, time].
      - segment_ids, segment_pos: Read only when p.packed_input is set.

  Returns:
    A NestedMap containing

    - encoded: The encoded features, either a tensor of shape
      [time, batch, depth], or a list of tensors if is_transparent is set in
      transformer_stack.
    - padding: of shape [time, batch]
    - segment_id: [time, batch] if packed inputs are supported by the model
      (and all layers), or None otherwise.
    - embedded_inputs: [time, batch, depth] embedded inputs tokens without
      positional encodings.
  """

  p = self.params
  with tf.name_scope(p.name):
    src_segment_id = None
    src_segment_pos = None
    # ids and paddings must have matching shapes, and ids must be rank 2.
    input_ids = py_utils.with_dependencies([
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
        py_utils.assert_equal(tf.rank(input_batch.ids), 2)
    ], input_batch.ids)

    if (not py_utils.use_tpu() and
        tf.flags.FLAGS.transformer_encoder_truncates_inputs):
      # Off-TPU and with the flag set: truncate the time dimension to the
      # largest per-row count of non-padded positions in the batch.
      max_seq_length = tf.cast(
          tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
          tf.int32)
      # Everything beyond max_seq_length must be padding.
      paddings = py_utils.with_dependencies([
          py_utils.assert_equal(
              tf.constant(True, tf.bool),
              tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
      ], input_batch.paddings)
      input_ids = input_ids[:, :max_seq_length]
      paddings = paddings[:, :max_seq_length]
      if p.packed_input:
        src_segment_id = input_batch.segment_ids[:, :max_seq_length]
        src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
    else:
      paddings = input_batch.paddings
      if p.packed_input:
        src_segment_id = input_batch.segment_ids
        src_segment_pos = input_batch.segment_pos

    max_time = tf.shape(input_ids)[1]

    # Input token embeddings + positional embeddings
    if not p.shared_emb:
      input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                            tf.reshape(input_ids, [-1]))
    else:
      # With shared embeddings the lookup goes through the softmax layer.
      input_embs = self.softmax.EmbLookup(theta.softmax,
                                          tf.reshape(input_ids, [-1]))

    input_embs = tf.reshape(input_embs,
                            [-1, max_time, p.token_emb.embedding_dim])
    # [time, batch, dim]
    # Snapshot taken before positional/task embeddings are added; returned
    # as `embedded_inputs`.
    orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

    if p.packed_input:
      position_embs = self.position_emb.FPropWithPosition(
          theta.position_emb, src_segment_pos)
    else:
      position_embs = self.position_emb.FProp(theta.position_emb, max_time)
      # Leading 1 lets the position embeddings broadcast over the batch.
      position_embs = tf.reshape(position_embs,
                                 [1, max_time, p.token_emb.embedding_dim])
    input_embs += position_embs

    if p.task_emb:
      input_embs += self.task_emb.EmbLookup(theta.task_emb,
                                            input_batch.task_ids)

    # Project only when the embedding dim differs from the model dim.
    if p.model_dim != p.token_emb.embedding_dim:
      input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

    # Switch paddings (and segment ids) to time-major.
    paddings = tf.cast(tf.transpose(paddings), py_utils.FPropDtype(p))
    if p.packed_input:
      src_segment_id = tf.transpose(src_segment_id)
    input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

    # [time, batch, dim]
    transformer_input = tf.transpose(input_embs, [1, 0, 2])

  if not self.do_eval and p.apply_source_mask:
    # Augment padding for masked source word positions.
    dtype = paddings.dtype
    # 1 where the (batch-major) input id equals p.source_mask_id, else 0.
    source_mask = tf.where(
        tf.equal(input_ids, p.source_mask_id),
        tf.ones_like(input_ids, dtype=dtype),
        tf.zeros_like(input_ids, dtype=dtype))
    # Make sure padding is between 0 and 1.
    paddings = tf.clip_by_value(paddings + tf.transpose(source_mask), 0.0,
                                1.0)

  encoded, padding, segment_id = self.transformer_stack.FProp(
      theta.transformer_stack, transformer_input, paddings, src_segment_id)
  return py_utils.NestedMap(
      encoded=encoded,
      padding=padding,
      segment_id=segment_id,
      embedded_inputs=orig_input_embs)
def Update(self, value):
  """Adds value (cast to the accumulator's dtype) to the accumulator."""
  current = self.GetValue()
  increment = tf.cast(value, self.dtype)
  self.SetValue(current + increment)
def _BeamSearchStep(self, theta, encoder_outputs, cur_step, step_ids,
                    core_bs_states, other_states, num_hyps_per_beam,
                    pre_beam_search_step_callback,
                    post_beam_search_step_callback):
  """Extend beam search hyps for one step.

    | num_beams = Number of source sequences to be decoded.
    | num_hyps_per_beam = Number of hyps to keep per source sequence.
    | num_hyps = num_beams * num_hyps_per_beam
    | src_seq_len = Number of time steps in the source sequence.
    | src_batch = Number of examples in the source sequence.
    | tgt_seq_len = Maximum allowed time steps in the target sequence.
    | tgt_batch = num_hyps_per_beam * src_batch

  The step runs the pre-step callback, calls `ops.beam_search_step`, reorders
  `other_states` according to the previous-hyp ids chosen for this step, and
  finally runs the post-step callback.

  Args:
    theta: A `.NestedMap` object containing weights' values of the decoder
      layer and its children layers.
    encoder_outputs: A `.NestedMap` containing encoder outputs to be passed to
      the callbacks.
    cur_step: A scalar int tensor, the current time step, 0-based.
    step_ids: An int tensor of shape [num_hyps, 1]. The input ids to the
      current search step.
    core_bs_states: A tuple of core beam search states. This list is
      maintained by this helper class.
    other_states: A `.NestedMap` of other beam search states. This
      `.NestedMap` is managed and updated by the client. It is expected that
      each of its member tensors are of rank >= 1. t[i, ...] is the state of
      the i-th hyp at the beginning of this search step.
    num_hyps_per_beam: Num of hyps to keep per beam.
    pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback.
      See class header comments for more details.
    post_beam_search_step_callback: The `PostBeamSearchStepCallback` callback.
      See class header comments for more details.

  Returns:
    A tuple of following elements for the next beam search step,
    (next step, all_done, step_ids, core_bs_states, other_states)
  """
  p = self.params

  bs_results, other_states = pre_beam_search_step_callback(
      theta, encoder_outputs, step_ids, other_states, num_hyps_per_beam)

  (best_scores, cumulative_scores, in_scores, in_hyps, in_prev_hyps,
   in_done_hyps, in_atten_probs) = core_bs_states

  # An empty list is passed in place of is_last_chunk when the model does not
  # use an end-of-chunk id.
  (out_best_scores, out_cumulative_scores, out_scores, out_hyps,
   out_prev_hyps, out_done_hyps, out_atten_probs,
   all_done) = ops.beam_search_step(
       tf.cast(bs_results.log_probs, dtype=p.dtype),
       tf.cast(bs_results.atten_probs, dtype=p.dtype),
       best_scores,
       cumulative_scores,
       in_scores,
       in_hyps,
       in_prev_hyps,
       in_done_hyps,
       in_atten_probs,
       bs_results.is_last_chunk if self._model_uses_eoc_id else [],
       cur_step,
       eoc_id=p.target_eoc_id,
       eos_id=p.target_eos_id,
       beam_size=p.beam_size,
       num_hyps_per_beam=num_hyps_per_beam,
       valid_eos_max_logit_delta=p.valid_eos_max_logit_delta,
       merge_paths=p.merge_paths,
       allow_empty_terminated_hyp=p.allow_empty_terminated_hyp,
       ensure_full_beam=p.ensure_full_beam,
       force_eos_in_last_step=p.force_eos_in_last_step,
       local_eos_threshold=p.local_eos_threshold)

  # The ids selected at this step become the inputs of the next step; they
  # are reshaped to match `step_ids` and given its static shape.
  new_step_ids = tf.reshape(out_hyps[cur_step, :], tf.shape(step_ids))
  new_step_ids.set_shape(step_ids.get_shape())

  # [num_hyps_per_beam * num_beams].
  old_hyp_ids = tf.reshape(
      tf.slice(out_prev_hyps, begin=[cur_step, 0], size=[1, -1]), [-1])

  if p.batch_major_compute:
    # Transformed the indices into the key/value cache for fast decoding
    # (prefix_states in other_states) due to the num_hyps dimension of
    # cache is computed as num_beams by num_hyps_per_beam, which is different
    # from the old_hyp_ids assumption (num_hyps_per_beam by num_beams).
    # Both transpose and recomputation are required to correct the indices.
    num_beams = tf.shape(best_scores)[0]
    # [num_beams * num_hyps_per_beam].
    old_hyp_ids_in_cache_order = tf.reshape(
        tf.transpose(tf.reshape(old_hyp_ids, [num_hyps_per_beam, -1])), [-1])
    old_hyp_ids_in_cache_order = (
        (old_hyp_ids_in_cache_order % num_beams) * num_hyps_per_beam +
        old_hyp_ids_in_cache_order // num_beams)

  new_bs_states = (out_best_scores, out_cumulative_scores, out_scores,
                   out_hyps, out_prev_hyps, out_done_hyps, out_atten_probs)

  def ReOrderHyps(key, x_in):
    """Reorders x_in based on prev hyp ids.

    Non-tensor values, and tensors whose static rank is falsy (0 or unknown),
    are returned unchanged.
    """
    # old_hyp_ids_in_cache_order only exists when p.batch_major_compute is
    # set, which is exactly when it is selected here.
    correct_old_hyp_ids = (
        old_hyp_ids_in_cache_order if p.batch_major_compute else old_hyp_ids)
    if (isinstance(x_in, tf.Tensor) and x_in.shape.ndims):
      if x_in.shape.ndims > 2 and not p.batch_major_state:
        # Use corrected indices only here for batch major compute as key/value
        # caches are the states being affected.
        x_out = tf.gather(x_in, correct_old_hyp_ids, axis=1)
      elif key in POSSIBLY_TIME_MAJOR_STATE_KEYS:
        # These keys are gathered along the last axis with the uncorrected
        # ids.
        x_out = tf.gather(x_in, old_hyp_ids, axis=-1)
      else:
        x_out = tf.gather(x_in, correct_old_hyp_ids)
      # Gathering keeps the shape; restore the static shape information.
      x_out.set_shape(x_in.get_shape())
      return x_out
    else:
      return x_in

  new_other_states = other_states.TransformWithKey(ReOrderHyps)

  final_other_states = post_beam_search_step_callback(theta, encoder_outputs,
                                                      new_step_ids,
                                                      new_other_states)

  return (cur_step + 1, all_done, new_step_ids, new_bs_states,
          final_other_states)
def _StreamMoments(self, inputs, paddings, cached_sum, cached_count,
                   cached_var):
  """Computes mean and variance over the valid data points in inputs.

  Sums, counts and squared deviations are accumulated along the time axis
  (cumulative sum over T) on top of the cached values, so the moments at time
  t cover the cached history plus steps 0..t of this call.

  Args:
    inputs: [B, T, F, N, G] or [B, T, N, G]
    paddings: [B, T, 1, 1, 1] or [B, T, 1, 1]
    cached_sum: [B, 1, 1, N, 1] or [B, 1, N, 1]
    cached_count: same shape as cached_sum.
    cached_var: same shape as cached_sum.

  Returns:
    mean: [B, T, 1, N, 1] or [B, T, N, 1]
    variance: same shape as mean.
    new_cached_sum: same shape as cached_sum.
    new_cached_count: same shape as cached_count.
    new_cached_var: same shape as cached_var.
  """
  tf.logging.vlog(1, 'inputs: %r', inputs)
  tf.logging.vlog(1, 'paddings: %r', paddings)
  tf.logging.vlog(1, 'cached_sum: %r', cached_sum)
  tf.logging.vlog(1, 'cached_count: %r', cached_count)

  # Zero out the padded positions so they do not contribute to the sums.
  mask = 1.0 - paddings
  inputs *= tf.cast(mask, inputs.dtype)

  input_rank = py_utils.GetRank(inputs)
  assert input_rank is not None, (f'inputs rank must be staic for '
                                  f'{repr(inputs)}')
  reduce_over_dims = list(range(input_rank))
  # Skip B, T, and N. Reduce {F,G} or just G.
  reduce_over_dims = reduce_over_dims[2:-2] + reduce_over_dims[-1:]
  tf.logging.vlog(1, 'reduce_over_dims: %s', reduce_over_dims)

  # [B, T, 1, N, 1] or [B, T, N, 1]
  sum_v = tf.reduce_sum(inputs, reduce_over_dims, keepdims=True)
  sum_v = tf.math.cumsum(sum_v, axis=1)
  sum_v += cached_sum

  # [B, T, 1, 1, 1] or [B, T, 1, 1]
  count_v = tf.reduce_sum(mask, reduce_over_dims, keepdims=True)
  count_v = tf.math.cumsum(count_v, axis=1)
  input_shape = py_utils.GetShape(inputs)
  # The mask has size 1 along the reduced dims, so scale the count by the
  # number of elements actually reduced per (batch, time, group) entry.
  if input_rank == 5:
    # [B, T, F, N, G]: F * G elements. For rank-4 inputs index -3 would be
    # the time axis, so this branch must be limited to rank 5.
    multiplier = input_shape[-1] * input_shape[-3]
  else:
    # [B, T, N, G]: G elements.
    multiplier = input_shape[-1]
  count_v *= multiplier
  count_v += cached_count
  # Avoid dividing by zero when nothing valid has been seen yet.
  count_v = tf.maximum(count_v, 1.0)

  tf.logging.vlog(1, 'sum_v: %r', sum_v)
  tf.logging.vlog(1, 'count_v: %r', count_v)

  mean = sum_v / count_v
  sum_vv = tf.reduce_sum(
      (inputs - mean)**2 * mask, reduce_over_dims, keepdims=True)
  sum_vv = tf.math.cumsum(sum_vv, axis=1)
  sum_vv += cached_var

  # The statistics at the last time step become the new cache.
  cached_sum = sum_v[:, -1:]
  cached_count = count_v[:, -1:]
  cached_var = sum_vv[:, -1:]

  variance = py_utils.with_dependencies([
      py_utils.assert_greater_equal(sum_vv, tf.cast(0, sum_vv.dtype)),
  ], sum_vv / count_v)
  return mean, variance, cached_sum, cached_count, cached_var
def _Extract(self, features):
  """Assembles per-laser range images, calibration and per-pixel xyz.

  Args:
    features: A mapping of feature name to feature value. The names read here
      are 'pose', '<laser>_<returns>', '<laser>_<returns>_shape',
      '<laser>_extrinsics', '<laser>_beam_inclination_min' and
      '<laser>_beam_inclination_max' (CBR lasers), and
      '<laser>_beam_inclinations' and '<laser>_pose' (GBR lasers). Every
      value is passed through `_Dense` before use (presumably a
      sparse-to-dense conversion; confirm against its definition).

  Returns:
    A NestedMap with, for each laser, '<laser>_extrinsics' ([4, 4]) and
    '<laser>_beam_inclinations'; for each GBR laser, '<laser>_pose'; and for
    each (laser, returns) pair, '<laser>_<returns>' holding a NestedMap with
    'xyz', 'features' (the range image) and 'mask' (float32).
  """
  p = self.params
  all_lasers = p.cbr_laser_names + p.gbr_laser_names
  range_images = {}
  outputs = {}

  frame_pose = tf.reshape(_Dense(features['pose']), [4, 4])

  for laser in all_lasers:
    # Range images are stored flat alongside their shape; restore the shape
    # and verify it against the configured shape for this laser type.
    expected_shape = (
        p.cbr_ri_shape if laser in p.cbr_laser_names else p.gbr_ri_shape)
    for returns in p.returns:
      image_key = '%s_%s' % (laser, returns)
      stored_shape = tf.reshape(
          _Dense(features['%s_%s_shape' % (laser, returns)]), [-1])
      image = tf.reshape(_Dense(features[image_key]), stored_shape)
      range_images[image_key] = py_utils.HasShape(image, expected_shape)

    # Per-laser extrinsics matrix.
    outputs['%s_extrinsics' % laser] = tf.reshape(
        _Dense(features['%s_extrinsics' % laser]), [4, 4])

  # CBR lasers describe their inclinations by a (min, max) pair of scalars.
  for laser in p.cbr_laser_names:
    bounds = [
        tf.reshape(_Dense(features[name_format % laser]), [])
        for name_format in ('%s_beam_inclination_min',
                            '%s_beam_inclination_max')
    ]
    outputs['%s_beam_inclinations' % laser] = tf.stack(bounds, axis=0)

  # GBR lasers carry an explicit vector of 64 inclinations.
  for laser in p.gbr_laser_names:
    outputs['%s_beam_inclinations' % laser] = tf.reshape(
        _Dense(features['%s_beam_inclinations' % laser]), [64])

  # Attach xyz coordinates to every range image pixel.
  for laser in all_lasers:
    extrinsics = outputs['%s_extrinsics' % laser]
    inclinations = outputs['%s_beam_inclinations' % laser]

    if laser in p.cbr_laser_names:
      ri_shape = p.cbr_ri_shape
      # Expand the (min, max) pair into one inclination per image row by
      # linear interpolation evaluated at the row centers.
      height = ri_shape[0]
      lowest = inclinations[0]
      highest = inclinations[1]
      span = highest - lowest
      row_centers = .5 + tf.cast(tf.range(0, height), tf.float32)
      fraction = row_centers / tf.cast(height, tf.float32)
      inclinations = (fraction * span) + lowest
    else:
      ri_shape = p.gbr_ri_shape

    # Only GBR lasers provide a per-pixel pose.
    pixel_pose = None
    if laser in p.gbr_laser_names:
      pixel_pose = tf.reshape(
          _Dense(features['%s_pose' % laser]),
          shape=p.gbr_ri_shape[0:2] + [4, 4])
      outputs['%s_pose' % laser] = pixel_pose

    for returns in p.returns:
      image_key = '%s_%s' % (laser, returns)
      range_image = tf.reshape(range_images[image_key], ri_shape)
      # A pixel is valid when its first channel is non-negative.
      valid = range_image[..., 0] >= 0
      xyz = tf.cast(
          self._XYZFromRangeImage(range_image, valid, extrinsics,
                                  inclinations, pixel_pose, frame_pose),
          tf.float32)

      # Produce the NestedMap of xyz, features, mask.
      outputs[image_key] = py_utils.NestedMap({
          'xyz': xyz,
          'features': range_image,
          'mask': tf.cast(valid, tf.float32),
      })

  return py_utils.NestedMap(outputs)
def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
  """Converts a batch of strings into fixed-length id and padding tensors.

  Each string is encoded with the WPM encoder, optionally terminated with
  EOS, and then padded or truncated to `max_length`. Truncation happens after
  EOS is appended, so over-long sentences may lose their EOS.

  Args:
    strs: a string Tensor.
    max_length: a python integer. The second dimension of the returned arrays.
      All sequences are padded or truncated to that length.
    append_eos: a python bool, or None to use `p.append_eos`. See
      `BaseTokenizer` for explanation.
    languages: A vector of strings with the same length as `strs`. Not read by
      this implementation.

  Returns:
    A tuple of 3 tensors:

    - token_ids: sequences of WPM ids starting with SOS, padded with EOS.
    - target_ids: sequences of WPM ids without the leading SOS, padded with
      EOS.
    - paddings: floats that are 0.0 at positions holding target ids and 1.0
      at padded positions.

    When `p.pad_to_max_length` is false, all three are trimmed along the
    second dimension to the longest unpadded length in the batch.
  """
  p = self.params
  if append_eos is None:
    append_eos = p.append_eos

  batch_size = py_utils.GetShape(strs)[0]
  token_ids_ta = tf.TensorArray(tf.int32, batch_size)
  target_ids_ta = tf.TensorArray(tf.int32, batch_size)
  paddings_ta = tf.TensorArray(tf.float32, batch_size)

  def _HasMoreSentences(i, *_):
    """Loop condition: true while sentences remain."""
    return i < batch_size

  def _TokenizeOneSentence(i, strs, token_ids_ta, target_ids_ta, paddings_ta):
    """Tokenizes a single sentence."""
    ids, _ = self._wpm_encoder.Encode(strs[i])
    if append_eos:
      ids = tf.concat([ids, [self.eos_id]], axis=0)

    # This truncates after the eos is added, so some sentences might
    # not have </s> at the end.
    token_row = py_utils.PadOrTrimTo(
        tf.concat([[self.sos_id], ids], axis=0), [max_length], self.eos_id)
    token_ids_ta = token_ids_ta.write(i, token_row)

    target_row = py_utils.PadOrTrimTo(ids, [max_length], self.eos_id)
    target_ids_ta = target_ids_ta.write(i, target_row)

    padding_row = py_utils.PadOrTrimTo(
        tf.zeros_like(ids, dtype=tf.float32), [max_length], 1.)
    paddings_ta = paddings_ta.write(i, padding_row)

    return i + 1, strs, token_ids_ta, target_ids_ta, paddings_ta

  initial_state = (tf.constant(0, tf.int32), strs, token_ids_ta,
                   target_ids_ta, paddings_ta)
  _, _, token_ids_ta, target_ids_ta, paddings_ta = tf.while_loop(
      _HasMoreSentences,
      _TokenizeOneSentence,
      loop_vars=initial_state,
      parallel_iterations=30,
      back_prop=False)

  token_ids = token_ids_ta.stack()
  target_ids = target_ids_ta.stack()
  paddings = paddings_ta.stack()

  if p.pad_to_max_length:
    return token_ids, target_ids, paddings

  # Trim to the longest unpadded sequence in the batch.
  unpadded_lengths = tf.reduce_sum(1.0 - paddings, axis=1)
  maxlen = tf.cast(tf.round(tf.reduce_max(unpadded_lengths)), tf.int32)
  token_ids = token_ids[:, :maxlen]
  target_ids = target_ids[:, :maxlen]
  paddings = paddings[:, :maxlen]
  return token_ids, target_ids, paddings
def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
  """Converts a batch of strings into fixed-length id and padding tensors.

  Each string is tokenized with `self._tokenizer`, optionally terminated with
  EOS, and then padded or truncated to `max_length`. Truncation happens after
  EOS is appended, so over-long sentences may lose their EOS.

  Args:
    strs: a string Tensor.
    max_length: a python integer. The second dimension of the returned arrays.
      All sequences are padded or truncated to that length.
    append_eos: a python bool, or None to use `p.append_eos`.
    languages: Unused by this implementation.

  Returns:
    A tuple of 3 tensors:

    - token_ids: ids starting with SOS, padded with 0.
    - target_ids: ids without the leading SOS unless `p.prepend_sos` is set,
      in which case they equal the unpadded token ids; padded with 0.
    - paddings: floats that are 0.0 at positions holding target ids and 1.0
      at padded positions.

    When `p.pad_to_max_length` is false, all three are trimmed along the
    second dimension to the longest unpadded length in the batch.
  """
  del languages
  p = self.params
  if append_eos is None:
    append_eos = p.append_eos

  batch_size = py_utils.GetShape(strs)[0]
  token_ids_ta = tf.TensorArray(tf.int32, batch_size)
  target_ids_ta = tf.TensorArray(tf.int32, batch_size)
  paddings_ta = tf.TensorArray(tf.float32, batch_size)

  def _TokenizeOneSentence(i, text, token_ids_ta, target_ids_ta, paddings_ta):
    """Tokenizes a single sentence."""
    if tf.is_tensor(i):
      text_i = tf.gather(text, i)
    else:
      text_i = text[i]
    # Collapse all dimensions of the tokenizer output into one id vector.
    ids = self._tokenizer.tokenize(text_i).merge_dims(0, -1)
    ids.set_shape([None])

    if append_eos:
      ids = tf.concat([ids, [self.eos_id]], axis=0)
    sos_ids = tf.concat([[self.sos_id], ids], axis=0)
    if p.prepend_sos:
      ids = sos_ids

    # This truncates after the EOS is added, so some sentences might
    # not have EOS at the end.
    token_ids_ta = token_ids_ta.write(
        i, py_utils.PadOrTrimTo(sos_ids, [max_length], 0))
    target_ids_ta = target_ids_ta.write(
        i, py_utils.PadOrTrimTo(ids, [max_length], 0))
    paddings_ta = paddings_ta.write(
        i,
        py_utils.PadOrTrimTo(
            tf.zeros_like(ids, dtype=tf.float32), [max_length], 1.))

    # Pass the loop variable through unchanged rather than capturing the
    # enclosing `strs` tensor from outside the loop body.
    return i + 1, text, token_ids_ta, target_ids_ta, paddings_ta

  _, _, token_ids_ta, target_ids_ta, paddings_ta = tf.while_loop(
      lambda i, *_: i < batch_size,
      _TokenizeOneSentence,
      loop_vars=(tf.constant(0, tf.int32), strs, token_ids_ta, target_ids_ta,
                 paddings_ta),
      parallel_iterations=30,
      back_prop=False)

  token_ids = token_ids_ta.stack()
  target_ids = target_ids_ta.stack()
  paddings = paddings_ta.stack()

  if not p.pad_to_max_length:
    # Trim to the longest unpadded sequence in the batch.
    maxlen = tf.cast(
        tf.round(tf.reduce_max(tf.reduce_sum(1.0 - paddings, axis=1))),
        tf.int32)
    token_ids = token_ids[:, :maxlen]
    target_ids = target_ids[:, :maxlen]
    paddings = paddings[:, :maxlen]

  return token_ids, target_ids, paddings
def MaybeCastToFPropDtype(x):
  """Casts `x` to the fprop dtype when it currently has the params dtype.

  Args:
    x: A value exposing `.dtype`, or None.

  Returns:
    `x` cast to `self._params.fprop_dtype` if its dtype equals
    `self._params.dtype`; otherwise `x` unchanged (including None).
  """
  if x is None:
    return x
  if x.dtype == self._params.dtype:
    return tf.cast(x, self._params.fprop_dtype)
  return x
def _ComputePaddings(ids, eos_id):
  """Marks every position that comes after the first EOS as padding.

  Args:
    ids: A tensor of ids; the scan runs along its last axis.
    eos_id: The id that terminates a sequence.

  Returns:
    A tensor shaped and typed like `ids` holding 0 up to and including the
    first `eos_id` along the last axis and 1 at every later position.
  """
  eos_hits = tf.cast(tf.equal(ids, eos_id), tf.int32)
  # num_eos_before[i, j] = number of k < j with ids[i, k] == eos_id
  num_eos_before = tf.cumsum(eos_hits, axis=-1, exclusive=True)
  no_eos_before = tf.equal(num_eos_before, 0)
  unpadded = tf.zeros_like(ids)
  padded = tf.ones_like(ids)
  return tf.where(no_eos_before, unpadded, padded)
def _GetMask(self,
             batch_size,
             choose_range,
             mask_size,
             global_seed,
             max_length=None,
             masks_per_frame=0.0,
             multiplicity=1,
             dtype=tf.float32,
             max_ratio=1.0):
  """Returns fixed size multi-masks starting from random positions.

  A multi-mask is a mask obtained by applying multiple masks.

  This function when max_length is given:
    1) Samples random mask lengths less than max_length with shape
       (batch_size, multiplicity).
    2) Truncates lengths to a max of (choose_range * max_ratio), so that each
       mask is fully contained within the corresponding sequence.
    3) Randomly samples start points of shape (batch_size, multiplicity)
       within (choose_range - lengths).
    4) For each batch entry, constructs multiple masks (whose number is given
       by the multiplicity).
    5) Returns a mask of shape (batch_size, mask_size) obtained by composing
       the masks constructed in step 4). If masks_per_frame > 0, the number
       of composed masks is min(masks_per_frame * choose_range,
       multiplicity). If not, all the masks are composed. The masked regions
       are set to zero.

  This function when max_length is not given:
    1) Samples random mask lengths less than (choose_range * max_ratio) with
       shape (batch_size, multiplicity).
    2) Proceeds to steps 3), 4) and 5) of the above.

  Args:
    batch_size: Batch size. Integer number.
    choose_range: Range within which the masked entries must lie. Tensor of
      shape (batch_size,).
    mask_size: Size of the mask. Integer number.
    global_seed: an integer seed tensor for stateless random ops.
    max_length: Maximum number of allowed consecutive masked entries. Integer
      number or None.
    masks_per_frame: Number of masks per frame. Float number. If > 0, the
      multiplicity of the mask is set to be masks_per_frame * choose_range.
    multiplicity: Maximum number of total masks. Integer number.
    dtype: Data type used for all floating point intermediates.
    max_ratio: Maximum portion of the entire range allowed to be masked. Float
      number.

  Returns:
    mask: a fixed size multi-mask starting from a random position with shape
      (batch_size, mask_size). It is cast to p.fprop_dtype when that is set
      and differs from p.dtype.
  """
  p = self.params
  # Non-empty random seed values are only used for testing or when using
  # stateless random ops. seed_1 and seed_2 are set separately to avoid
  # correlation of mask size and mask position.
  if p.use_input_dependent_random_seed:
    seed_1 = global_seed + 1
    seed_2 = global_seed + 2
  elif p.random_seed:
    seed_1 = p.random_seed + 1
    seed_2 = 2 * p.random_seed
  else:
    seed_1 = p.random_seed
    seed_2 = p.random_seed

  # Sample lengths for multiple masks.
  if max_length and max_length > 0:
    max_length = tf.broadcast_to(tf.cast(max_length, dtype), (batch_size,))
  else:
    max_length = tf.cast(choose_range, dtype=dtype) * max_ratio
  random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
  masked_portion = random_uniform(
      shape=(batch_size, multiplicity),
      minval=0.0,
      maxval=1.0,
      dtype=dtype,
      seed=seed_1)
  masked_frame_size = self.EinsumBBmBm(max_length, masked_portion)
  masked_frame_size = tf.cast(masked_frame_size, dtype=tf.int32)

  # Make sure the sampled length is smaller than max_ratio * length_bound.
  # Note that sampling in this way is biased
  # (shorter sequences may be over-masked.)
  choose_range = tf.expand_dims(choose_range, -1)
  choose_range = tf.tile(choose_range, [1, multiplicity])
  length_bound = tf.cast(choose_range, dtype=dtype)
  length_bound = tf.cast(max_ratio * length_bound, dtype=tf.int32)
  length = tf.minimum(masked_frame_size, tf.maximum(length_bound, 1))

  # Choose starting point. Sample in `dtype` so the product with the cast
  # range below does not mix floating point types.
  random_start = random_uniform(
      shape=(batch_size, multiplicity), maxval=1.0, dtype=dtype, seed=seed_2)
  start_with_in_valid_range = random_start * tf.cast(
      (choose_range - length + 1), dtype=dtype)
  start = tf.cast(start_with_in_valid_range, tf.int32)
  end = start + length - 1

  # Shift starting and end point by small value so the strict comparisons
  # below include both integer endpoints. The constant is created in `dtype`
  # to match the tensors it is combined with.
  delta = tf.constant(0.1, dtype=dtype)
  start = tf.expand_dims(tf.cast(start, dtype) - delta, -1)
  start = tf.tile(start, [1, 1, mask_size])
  end = tf.expand_dims(tf.cast(end, dtype) + delta, -1)
  end = tf.tile(end, [1, 1, mask_size])

  # Construct pre-mask of shape (batch_size, multiplicity, mask_size).
  diagonal = tf.expand_dims(
      tf.expand_dims(tf.cast(tf.range(mask_size), dtype=dtype), 0), 0)
  diagonal = tf.tile(diagonal, [batch_size, multiplicity, 1])
  pre_mask = tf.cast(
      tf.math.logical_and(diagonal < end, diagonal > start), dtype=dtype)

  # Sum masks with appropriate multiplicity.
  if masks_per_frame > 0:
    # Keep only the first (masks_per_frame * choose_range) masks per entry.
    multiplicity_weights = tf.tile(
        tf.expand_dims(tf.range(multiplicity, dtype=dtype), 0),
        [batch_size, 1])
    multiplicity_tensor = masks_per_frame * tf.cast(choose_range, dtype=dtype)
    multiplicity_weights = tf.cast(
        multiplicity_weights < multiplicity_tensor, dtype=dtype)
    pre_mask = self.EinsumBmtBmBt(pre_mask, multiplicity_weights)
  else:
    pre_mask = tf.reduce_sum(pre_mask, 1)
  # Positions covered by at least one mask become 0, all others 1.
  mask = tf.cast(1.0 - tf.cast(pre_mask > 0, dtype=dtype), dtype=dtype)

  if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
    mask = tf.cast(mask, p.fprop_dtype)

  return mask