def _testHelper(self, base_frnn_p, frnn_p, packed_input=False):
  """Checks that two layers agree on outputs, final state and gradients.

  Both layers are instantiated from their params, run on the same test inputs
  and the same initial state, and differentiated through the same scalar loss.
  The layer built from `base_frnn_p` supplies the expected values.

  Args:
    base_frnn_p: Params of the reference layer.
    frnn_p: Params of the layer under test.
    packed_input: Forwarded to `self._GetTestInputs`.
  """
  inputs, padding, m0, c0, segment_id = self._GetTestInputs(packed_input)
  reference_layer = base_frnn_p.Instantiate()
  tested_layer = frnn_p.Instantiate()

  with self.session() as sess:
    tf.global_variables_initializer().run()
    state0 = py_utils.NestedMap(m=m0, c=c0)

    def _RunForwardBackward(layer):
      """Builds FProp, loss and grads for `layer`, then evaluates them."""
      act, state = layer.FPropDefaultTheta(
          inputs, padding, state0=state0, segment_id=segment_id)
      # Compute grads
      act_term = tf.reduce_sum(tf.math.square(act))
      state_term = tf.reduce_sum(state.m * state.c * state.c)
      loss = -tf.log(tf.sigmoid((act_term + state_term)))
      grads = tf.gradients(loss, layer.vars.Flatten())
      return sess.run([act, state, grads])

    # The reference layer is built and evaluated before the tested one.
    expected_act, expected_state, expected_grads = _RunForwardBackward(
        reference_layer)
    actual_act, actual_state, actual_grads = _RunForwardBackward(tested_layer)

  logged_values = (
      ('expected_act:{}', expected_act),
      ('actual_act:{}', actual_act),
      ('expected_state:{}', expected_state),
      ('actual_state:{}', actual_state),
      ('expected_grads:{}', expected_grads),
      ('actual_grads:{}', actual_grads),
  )
  for template, value in logged_values:
    tf.logging.info(template.format(value))

  self.assertAllClose(expected_act, actual_act)
  self.assertAllClose(expected_state.m, actual_state.m)
  self.assertAllClose(expected_state.c, actual_state.c)

  # Variable names of the tested layer label each gradient comparison.
  var_names = [name for name, _ in tested_layer.vars.FlattenItems()]
  for name, want, got in zip(var_names, expected_grads, actual_grads):
    self.assertAllClose(want, got, msg=name)
def Elman(theta, state0, inputs):
  """Runs one recurrent step: h1 = sigmoid(concat([x, h0]) * w + b).

  Args:
    theta: Holds the weight `w` and the bias `b`.
    state0: Previous state; `state0.h` is the previous hidden value.
    inputs: Step inputs. `inputs.x` is required; `inputs.padding` is optional.

  Returns:
    A (state1, extras) tuple. `state1` carries the new `h` and, when a padding
    was supplied, that `padding`. `extras` is a NestedMap with the same `h`.
  """
  prev_h = state0.h
  weight = theta.w
  bias = theta.b
  step_input = inputs.x

  # 1st part: project the concatenated input and previous hidden value.
  stacked = tf.concat([step_input, prev_h], axis=1)
  projected = py_utils.Matmul(stacked, weight)

  # 2nd part: squash, then let the padding decide between new and old value.
  padding = inputs.get('padding', None)
  squashed = tf.sigmoid(projected + bias)
  # _ApplyPadding is defined elsewhere; presumably it keeps `v_pad` where the
  # padding is set and `v_no_pad` otherwise -- TODO confirm.
  next_h = _ApplyPadding(padding, v_no_pad=squashed, v_pad=prev_h)

  next_state = py_utils.NestedMap(h=next_h)
  if padding is not None:
    next_state.padding = padding
  extras = py_utils.NestedMap(h=next_h)
  return (next_state, extras)
def Inference(self):
  """Builds the inference graph.

  The 'default' subgraph fetches the per-class NMS outputs under the keys
  per_class_predicted_bboxes, per_class_predicted_bbox_scores (already
  multiplied by the valid mask) and per_class_valid_mask. Its feeds are the
  flattened input placeholders.

  Returns:
    A dictionary whose values are a tuple of fetches and feeds.
  """
  p = self.params
  with tf.name_scope('inference'):
    placeholders = self._Placeholders()
    predictions = self.ComputePredictions(self.theta, placeholders)
    bboxes_and_logits = self._BBoxesAndLogits(placeholders, predictions)
    # Per the previous documentation of this method, predicted_bboxes is
    # [batch_size, num_boxes, 7] and the scores are
    # [batch_size, num_boxes, num_classes] -- TODO confirm.
    predicted_bboxes = bboxes_and_logits.predicted_bboxes
    scores = tf.sigmoid(bboxes_and_logits.classification_logits)

    nms_outputs = detection_decoder.DecodeWithNMS(
        predicted_bboxes,
        scores,
        nms_iou_threshold=p.nms_iou_threshold,
        score_threshold=p.nms_score_threshold,
        max_boxes_per_class=p.max_nms_boxes,
        use_oriented_per_class_nms=p.use_oriented_per_class_nms)
    # The first element of the NMS result is not needed here.
    _, nms_bboxes, nms_scores, nms_valid_mask = nms_outputs
    nms_scores = nms_scores * nms_valid_mask

    # TODO(vrv): Fix the inference graph for KITTI, since we need
    # to apply frustum clipping. This requires customizing the
    # inference placeholders for each model.
    fetches = {
        'per_class_predicted_bboxes': nms_bboxes,
        'per_class_predicted_bbox_scores': nms_scores,
        'per_class_valid_mask': nms_valid_mask
    }
    feeds = dict(placeholders.FlattenItems())
  return {'default': (fetches, feeds)}
def Gate(x):
  """Gates the first half of `x` by the sigmoid of its second half.

  Args:
    x: Tensor that is split into two equal parts along its last axis.

  Returns:
    first_half * sigmoid(second_half).
  """
  value_half, gate_half = tf.split(x, num_or_size_splits=2, axis=-1)
  return value_half * tf.sigmoid(gate_half)
def Decode(self, input_batch):
  """Decode an input batch, computing predicted bboxes from residuals.

  Args:
    input_batch: The batch to decode. It is passed to ComputePredictions,
      _BBoxesAndLogits, ComputeLoss and the output decoder. When
      p.decode_include_residuals is set, its anchor_localization_residuals,
      assigned_gt_labels and anchor_bboxes fields are read as well.

  Returns:
    A NestedMap holding the per-class NMS results
    (per_class_predicted_bboxes, per_class_predicted_bbox_scores,
    per_class_valid_mask), visualization_weights, optionally the gathered
    residual tensors, whatever self.output_decoder.ProcessOutputs returns,
    and global_step.
  """
  p = self.params
  predictions = self.ComputePredictions(self.theta, input_batch)
  bboxes_and_logits = self._BBoxesAndLogits(input_batch, predictions)
  predicted_bboxes = bboxes_and_logits.predicted_bboxes
  batch_size, num_bboxes, _ = py_utils.GetShape(predicted_bboxes, 3)
  logits = py_utils.HasShape(bboxes_and_logits.classification_logits,
                             [batch_size, num_bboxes, p.num_classes])
  scores = tf.sigmoid(logits)

  # The loss computation may provide a per-example scaler for the scores.
  _, per_example = self.ComputeLoss(self.theta, predictions, input_batch)
  if 'score_scaler' in per_example:
    scores = scores * per_example['score_scaler']

  with tf.device('/cpu:0'):
    # Decode the predicted bboxes, performing NMS.
    nms_idxs, nms_bboxes, nms_scores, nms_valid_mask = (
        detection_decoder.DecodeWithNMS(
            predicted_bboxes,
            scores,
            nms_iou_threshold=p.nms_iou_threshold,
            score_threshold=p.nms_score_threshold,
            max_boxes_per_class=p.max_nms_boxes,
            use_oriented_per_class_nms=p.use_oriented_per_class_nms))

    # nms_valid_mask is [batch, num_classes, num_boxes] Tensor that
    # indicates which boxes were selected by NMS. Each example will have a
    # different number of chosen bboxes, so the mask is present to allow us
    # to keep the boxes as a batched dense Tensor.
    #
    # We mask the scores by the valid mask so that none of these boxes
    # will be interpreted as valid.
    nms_scores = nms_scores * nms_valid_mask

  visualization_weights = py_utils.HasShape(
      nms_scores, [batch_size, p.num_classes, p.max_nms_boxes])
  # For top down visualization, filter boxes whose scores are not above the
  # visualization threshold.
  passes_threshold = tf.greater_equal(
      visualization_weights, p.visualization_classification_threshold)
  visualization_weights = tf.where(passes_threshold, visualization_weights,
                                   tf.zeros_like(visualization_weights))

  model_outputs = py_utils.NestedMap({
      'per_class_predicted_bboxes': nms_bboxes,
      'per_class_predicted_bbox_scores': nms_scores,
      'per_class_valid_mask': nms_valid_mask,
  })
  decoder_outputs = py_utils.NestedMap(
      per_class_predicted_bboxes=nms_bboxes,
      per_class_predicted_bbox_scores=nms_scores,
      per_class_valid_mask=nms_valid_mask,
      visualization_weights=visualization_weights)

  if p.decode_include_residuals:
    # Including the residuals in the decoder output makes it possible to save
    # the outputs for further analysis. Note that we ensure that the outputs
    # match the per-class NMS output format of [batch, num_classes, ...].

    def _ReshapeGather(tensor):
      """Reshapes tensor and then gathers using the nms indices."""
      flat = tf.reshape(tensor, [batch_size, num_bboxes, -1])
      gathered = tf.gather(flat, nms_idxs, batch_dims=1)
      if p.use_oriented_per_class_nms:
        return gathered
      # Tile so that the data fits the expected per class shape of
      # [batch_size, num_classes, ...]. When *not* using oriented NMS, the
      # num_classes dimension will be missing since the indices will not
      # have it.
      return tf.tile(gathered[:, tf.newaxis, :, :], [1, p.num_classes, 1, 1])

    residual_sources = (
        ('per_class_gt_residuals', input_batch.anchor_localization_residuals),
        ('per_class_gt_labels', input_batch.assigned_gt_labels),
        ('per_class_residuals', predictions.residuals),
        ('per_class_logits', predictions.classification_logits),
        ('per_class_anchor_boxes', input_batch.anchor_bboxes),
    )
    decoder_outputs.update(
        {key: _ReshapeGather(source) for key, source in residual_sources})

  processed = self.output_decoder.ProcessOutputs(input_batch, model_outputs)
  decoder_outputs.update(processed)

  # Produce global step as an output (which is the step
  # of the checkpoint being decoded.)
  decoder_outputs.global_step = py_utils.GetGlobalStep()
  return decoder_outputs
def Decode(self, input_batch):
  """Decode an input batch, computing predicted bboxes from residuals.

  Args:
    input_batch: The batch to decode; passed to _BBoxesAndLogits and to the
      output decoder.

  Returns:
    A NestedMap holding the per-class NMS results
    (per_class_predicted_bboxes, per_class_predicted_bbox_scores,
    per_class_valid_mask), visualization_weights, whatever
    self.output_decoder.ProcessOutputs returns, and global_step.
  """
  p = self.params

  bboxes_and_logits = self._BBoxesAndLogits(input_batch)
  predicted_bboxes = bboxes_and_logits.predicted_bboxes
  batch_size, num_bboxes, _ = py_utils.GetShape(predicted_bboxes, 3)
  logits = py_utils.HasShape(bboxes_and_logits.classification_logits,
                             [batch_size, num_bboxes, p.num_classes])
  scores = tf.sigmoid(logits)

  # Score scaler.
  if 'score_scaler' in bboxes_and_logits:
    scores = scores * bboxes_and_logits.score_scaler

  with tf.device('/cpu:0'):
    # Decode the predicted bboxes, performing NMS.
    # NOTE(review): three values are unpacked here, while Inference in this
    # file unpacks four from the same call -- confirm which DecodeWithNMS
    # signature this method is meant to run against.
    nms_bboxes, nms_scores, nms_valid_mask = (
        detection_decoder.DecodeWithNMS(
            predicted_bboxes,
            scores,
            nms_iou_threshold=p.nms_iou_threshold,
            score_threshold=p.nms_score_threshold,
            max_boxes_per_class=p.max_nms_boxes,
            use_oriented_per_class_nms=p.use_oriented_per_class_nms))

    # nms_valid_mask is [batch, num_classes, num_boxes] Tensor that
    # indicates which boxes were selected by NMS. Each example will have a
    # different number of chosen bboxes, so the mask is present to allow us
    # to keep the boxes as a batched dense Tensor.
    #
    # We mask the scores by the valid mask so that none of these boxes
    # will be interpreted as valid.
    nms_scores = nms_scores * nms_valid_mask

  visualization_weights = py_utils.HasShape(
      nms_scores, [batch_size, p.num_classes, p.max_nms_boxes])
  # For top down visualization, filter boxes whose scores are not above the
  # visualization threshold.
  passes_threshold = tf.greater_equal(
      visualization_weights, p.visualization_classification_threshold)
  visualization_weights = tf.where(passes_threshold, visualization_weights,
                                   tf.zeros_like(visualization_weights))

  model_outputs = py_utils.NestedMap({
      'per_class_predicted_bboxes': nms_bboxes,
      'per_class_predicted_bbox_scores': nms_scores,
      'per_class_valid_mask': nms_valid_mask,
  })
  decoder_outputs = py_utils.NestedMap(
      per_class_predicted_bboxes=nms_bboxes,
      per_class_predicted_bbox_scores=nms_scores,
      per_class_valid_mask=nms_valid_mask,
      visualization_weights=visualization_weights)

  processed = self.output_decoder.ProcessOutputs(input_batch, model_outputs)
  decoder_outputs.update(processed)

  # Produce global step as an output (which is the step
  # of the checkpoint being decoded.)
  decoder_outputs.global_step = py_utils.GetGlobalStep()
  return decoder_outputs
def _GLU(self, gated_inputs, act_inputs):
  """Gates the activated `act_inputs` by the sigmoid of `gated_inputs`.

  Args:
    gated_inputs: Tensor fed to the sigmoid gate.
    act_inputs: Tensor passed through self._ApplyActivation together with
      p.glu_activation.

  Returns:
    The product of the activated tensor and the sigmoid gate.
  """
  activation = self.params.glu_activation
  activated = self._ApplyActivation(act_inputs, activation)
  gate = tf.sigmoid(gated_inputs)
  return activated * gate
def _GLU(self, inputs):
  """Splits `inputs` in two and gates the activated second half.

  Args:
    inputs: Tensor split into two equal parts along its last axis. The first
      part feeds the sigmoid gate; the second part is passed through
      self._ApplyActivation together with p.glu_activation.

  Returns:
    The product of the activated second half and the sigmoid of the first.
  """
  activation = self.params.glu_activation
  gate_half, act_half = tf.split(inputs, num_or_size_splits=2, axis=-1)
  activated = self._ApplyActivation(act_half, activation)
  gate = tf.sigmoid(gate_half)
  return activated * gate
def _Gradient(inputs, _, original_grad):
  """Combines the per-loss gradients using GradDrop sign masking.

  For every (loss, leak_ratio) pair in `self._losses` the gradient of the loss
  w.r.t. `self._output_tensor` is taken. A random sign mask is drawn from the
  probability of positive gradients, and each per-loss gradient is scaled by
  `leak + (1 - leak) * mask` before all of them are summed.

  Args:
    inputs: Tensor the gradients are multiplied with (or only its sign, when
      p.use_input_sign_only is set) before their signs are compared.
    _: Unused.
    original_grad: Incoming gradient. It provides the output dtype and, when
      p.keep_gradnorm_constant is set, the norm the result is rescaled to.

  Returns:
    The transformed gradient, cast to `original_grad.dtype`.

  Raises:
    ValueError: If none of the losses produces a gradient.
  """
  # Compute the gradients for each loss w.r.t. the inputs.
  # TODO(jngiam): Look into whether TF dedups this computation.
  #
  # The leak ratio of every loss that yields a gradient is recorded together
  # with that gradient, so both lists stay index-aligned even when a loss is
  # skipped. Building the leak ratios from all losses would pair gradients
  # with the leak ratio of a different loss after any skip.
  per_loss_grads = []
  leak_ratios = []
  for loss, leak_ratio in self._losses:
    per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
    if per_loss_grad is None:
      tf.logging.warning(
          'Loss %s did not result in a gradient during '
          'GradDrop computation.', loss)
    else:
      per_loss_grads.append(per_loss_grad)
      leak_ratios.append(leak_ratio)

  if not per_loss_grads:
    raise ValueError('No valid gradients for GradDrop.')

  # Multiply the gradients with the inputs.
  grads = per_loss_grads
  if p.use_input_sign_only:
    # Adding the indicator keeps the denominator away from zero for inputs
    # whose magnitude is at most epsilon.
    input_abs = tf.abs(
        tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
    grads = [grad * ((inputs) / (input_abs)) for grad in grads]
  else:
    grads = [grad * inputs for grad in grads]

  # Sum gradient over batch, assuming that batch is always on dim 0.
  if p.marginalize_batch_dim:
    grads = [
        tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads
    ]

  # First discretize all gradients into their sign values.
  grad_sign_positive = [
      tf.cast(grad > 0.0, tf.float32) for grad in grads
  ]
  grad_sign_negative = [
      tf.cast(grad < 0.0, tf.float32) for grad in grads
  ]

  # Calculate the probability of positive gradients based on equation (1)
  # in the GradDrop paper.
  grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
  prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))

  # Implementation of different scales for the keep function. Larger
  # scales result in steeper keep functions.
  prob_pos *= p.keep_prob_function_scale

  if p.keep_prob_function == 'sigmoid':
    # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
    # allows the function scale in sigmoid to be compatible with the
    # function scale in the linear case.
    prob_pos = tf.sigmoid(4.0 * prob_pos)
  elif p.keep_prob_function == 'linear':
    prob_pos += 0.5

  # The main, default mode of GradDrop. Only gradients of one sign are kept,
  # and which sign is calculated via equation (1) of the main paper.
  # After this line prob_pos is +0.5 where positive gradients are kept and
  # -0.5 where negative gradients are kept.
  prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                     tf.float32) - 0.5
  grad_masks = [
      (gsp - gsn) * prob_pos >= 0
      for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
  ]

  # This diag value gives us the percentage of grads which are kept.
  gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
  diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
  summary_utils.scalar('average_grad_mask', diag)

  # A masked-out entry still passes `leak` of its gradient; a kept entry
  # passes all of it.
  transformed_per_loss_grads = [
      grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
      for (leak, grad, grad_mask) in zip(leak_ratios, per_loss_grads,
                                         grad_masks)
  ]

  transformed_grad = tf.cast(tf.add_n(transformed_per_loss_grads),
                             original_grad.dtype)

  if not p.keep_gradnorm_constant:
    return transformed_grad

  # Rescale so the result has (up to epsilon) the norm of original_grad.
  transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
  original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
  return transformed_grad * original_grad_norm / (
      transformed_grad_norm + p.epsilon)
def _GatedTanhFn(inputs):
  """Gated tanh: tanh of the second half times sigmoid of the first half.

  Args:
    inputs: Tensor split into two equal parts along its last axis.

  Returns:
    tanh(second_half) * sigmoid(first_half).
  """
  gate_half, act_half = tf.split(inputs, num_or_size_splits=2, axis=-1)
  squashed = tf.tanh(act_half)
  gate = tf.sigmoid(gate_half)
  return squashed * gate
def _GLUFn(inputs):
  """Gates the second half of `inputs` by the sigmoid of its first half.

  Args:
    inputs: Tensor split into two equal parts along its last axis.

  Returns:
    second_half * sigmoid(first_half).
  """
  gate_half, value_half = tf.split(inputs, num_or_size_splits=2, axis=-1)
  gate = tf.sigmoid(gate_half)
  return value_half * gate