def test_masked_minimum(self):
    """Checks `utils.masked_minimum` under full, partial and empty masks.

    The same two input rows are evaluated three times; only the fed mask
    changes. With an all-zero mask the expected values coincide with the
    largest entry of each input row.
    """
    tf.reset_default_graph()

    data_ph = tf.placeholder(tf.float32, shape=[None, None])
    mask_ph = tf.placeholder(tf.float32, shape=[None, None])
    masked_minimums = utils.masked_minimum(data_ph, mask_ph)

    # (fed mask, expected result) pairs, evaluated in this order.
    cases = [
        ([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], [[-2.0], [-5.0]]),
        ([[0, 1, 1, 0, 1], [1, 1, 1, 0, 1]], [[0.0], [-4.0]]),
        ([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], [[2.0], [-1.0]]),
    ]

    with self.test_session() as sess:
        for mask_rows, expected in cases:
            feed = {
                data_ph: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                          [-2.0, -1.0, -3.0, -5.0, -4.0]],
                mask_ph: mask_rows,
            }
            result = sess.run(masked_minimums, feed_dict=feed)
            self.assertAllClose(result, expected)
def build_loss(self, predictions, **kwargs):
    """Build tf graph to compute the semihard triplet loss.

    Args:
      predictions: dict of prediction results keyed by name. The entries
        under `_FIELD_IMAGE_ID`, `_FIELD_IMAGE_IDS_GATHERED` and
        `_FIELD_SIMILARITY` are read.

    Returns:
      loss_dict: dict of loss tensors keyed by name; it holds the single
        key 'triplet_loss'.
    """
    options = self._model_proto

    image_id = predictions[_FIELD_IMAGE_ID]
    image_ids_gathered = predictions[_FIELD_IMAGE_IDS_GATHERED]
    similarity = predictions[_FIELD_SIMILARITY]

    distance = 1.0 - similarity

    # An entry is a positive pair when the two image ids match.
    anchor_ids = tf.expand_dims(image_id, axis=1)
    gathered_ids = tf.expand_dims(image_ids_gathered, axis=0)
    pos_mask = tf.cast(tf.equal(anchor_ids, gathered_ids), tf.float32)
    neg_mask = 1.0 - pos_mask

    # D_ap: either the masked average or the masked maximum over positives.
    reduce_positives = (utils.masked_avg
                        if options.triplet_ap_use_avg
                        else utils.masked_maximum)
    distance_ap = reduce_positives(distance, pos_mask)

    # negatives_outside: smallest D_an where D_an > D_ap.
    outside_mask = tf.cast(tf.greater(distance, distance_ap),
                           tf.float32) * neg_mask
    negatives_outside = utils.masked_minimum(distance, outside_mask)

    # negatives_inside: largest D_an.
    negatives_inside = utils.masked_maximum(distance, neg_mask)

    # distance_an: the semihard negatives. Rows without any negative beyond
    # D_ap fall back to negatives_inside.
    has_outside = tf.greater(
        tf.reduce_sum(outside_mask, axis=1, keepdims=True), 0.0)
    distance_an = tf.where(has_outside, negatives_outside, negatives_inside)

    # Triplet loss.
    hinge = tf.maximum(distance_ap - distance_an + options.triplet_margin, 0)
    return {'triplet_loss': tf.reduce_mean(hinge)}
def build_loss(self, predictions, examples, **kwargs):
    """Build tf graph to compute loss.

    Computes a triplet hinge loss from the similarity matrix found in
    `predictions`. The negative distance is either the semihard negative or
    the masked minimum over all negatives, depending on
    `options.triplet_loss_use_semihard`. Two scalar summaries are emitted:
    'loss/num_loss_examples' (count of non-zero hinge terms) and
    'loss/triplet_loss'.

    Args:
      predictions: dict of prediction results keyed by name.
      examples: dict of inputs keyed by name. Not read by this method.

    Returns:
      loss_dict: dict of loss tensors keyed by name; it holds the single
        key 'triplet_loss'.
    """
    options = self._model_proto

    # Extracts tensors and shapes.
    (image_id, image_ids_gathered,
     similarity) = (predictions[VisualW2vPredictions.image_id],
                    predictions[VisualW2vPredictions.image_ids_gathered],
                    predictions[VisualW2vPredictions.similarity])

    # Triplet loss.
    # Distance matrix, shape = [batch, num_captions_in_batch].
    distance = 1.0 - similarity

    # An entry is a positive pair when the two image ids match.
    pos_mask = tf.cast(
        tf.equal(
            tf.expand_dims(image_id, axis=1),
            tf.expand_dims(image_ids_gathered, axis=0)), tf.float32)
    neg_mask = 1.0 - pos_mask

    distance_ap = utils.masked_maximum(distance, pos_mask)

    if options.triplet_loss_use_semihard:
        # Use the semihard.

        # negatives_outside: smallest D_an where D_an > D_ap.
        mask = tf.cast(tf.greater(distance, distance_ap), tf.float32)
        mask = mask * neg_mask
        negatives_outside = utils.masked_minimum(distance, mask)

        # negatives_inside: largest D_an.
        negatives_inside = utils.masked_maximum(distance, neg_mask)

        # distance_an: the semihard negatives. Rows without any negative
        # beyond D_ap fall back to negatives_inside.
        mask_condition = tf.greater(
            tf.reduce_sum(mask, axis=1, keepdims=True), 0.0)
        distance_an = tf.where(mask_condition, negatives_outside,
                               negatives_inside)
    else:
        # Use the hardest.
        distance_an = utils.masked_minimum(distance, neg_mask)

    # Triplet loss.
    losses = tf.maximum(
        distance_ap - distance_an + options.triplet_loss_margin, 0)

    num_loss_examples = tf.count_nonzero(losses, dtype=tf.float32)
    loss = tf.reduce_mean(losses)

    tf.summary.scalar('loss/num_loss_examples', num_loss_examples)
    tf.summary.scalar('loss/triplet_loss', loss)

    return {'triplet_loss': loss}
def build_loss(self, predictions, examples, **kwargs):
    """Build tf graph to compute loss.

    Assembles, in order: the multi-instance detection (MIDN) cross-entropy
    loss, an optional triplet loss, one OICR cross-entropy loss per
    refinement iteration, and a min-entropy term. The min-entropy term is
    registered through `tf.losses.add_loss` instead of being placed in the
    returned dict. Scalar summaries are emitted for the min-entropy loss,
    the mean masked maximum of `midn_proba_r_given_c`, and (when the triplet
    loss is enabled) the count of non-zero triplet hinge terms.

    Args:
      predictions: dict of prediction results keyed by name.
      examples: dict of inputs keyed by name. Not read by this method.

    Returns:
      loss_dict: dict of loss tensors keyed by name.

    Raises:
      AssertionError: if `classification_loss_use_sum` is set, or if HARDEST
        negative mining is requested; both configurations are disabled.
      ValueError: if `midn_loss_negative_mining` is neither NONE nor HARDEST.
    """
    options = self._model_proto

    loss_dict = {}

    # NOTE(review): the source was flattened, so the extent of this name
    # scope is reconstructed; it is assumed to cover everything up to the
    # return statement - TODO confirm (it affects summary tag prefixes).
    with tf.name_scope('losses'):

        # Extract image-level labels.
        labels = self._extract_class_label(
            class_texts=slim.flatten(
                predictions[NOD3Predictions.training_only_caption_strings]),
            vocabulary_list=self._vocabulary_list)

        # A prediction model from caption to class

        # Loss of the multi-instance detection network.
        midn_class_logits = predictions[NOD3Predictions.midn_class_logits]
        losses = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels, logits=midn_class_logits)

        # Hard-negative mining.
        if options.midn_loss_negative_mining == nod3_model_pb2.NOD3Model.NONE:
            if options.classification_loss_use_sum:
                # Raised explicitly (instead of `assert False`) so the guard
                # also holds when Python runs with assertions stripped (-O).
                raise AssertionError(
                    'classification_loss_use_sum is disabled.')

            if options.caption_as_label:
                # Only examples having at least one positive label
                # contribute to the averaged loss.
                loss_masks = tf.to_float(tf.reduce_any(labels > 0, axis=-1))
                loss_dict['midn_cross_entropy_loss'] = tf.multiply(
                    tf.squeeze(
                        utils.masked_avg(
                            tf.reduce_mean(losses, axis=-1),
                            mask=loss_masks,
                            dim=0)), options.midn_loss_weight)
            else:
                loss_dict['midn_cross_entropy_loss'] = tf.multiply(
                    tf.reduce_mean(losses), options.midn_loss_weight)

        elif (options.midn_loss_negative_mining ==
              nod3_model_pb2.NOD3Model.HARDEST):
            # Raised explicitly for the same reason as above.
            raise AssertionError('HARDEST negative mining is disabled.')

        else:
            raise ValueError('Invalid negative mining method.')

        # Triplet loss
        if options.triplet_loss_weight > 0:
            # NOTE(review): the "gathered" ids are read from the same
            # `image_id` key as the anchor ids - confirm this is intended.
            (image_id, image_ids_gathered,
             similarity) = (predictions[NOD3Predictions.image_id],
                            predictions[NOD3Predictions.image_id],
                            predictions[NOD3Predictions.similarity])

            distance = 1.0 - similarity

            # An entry is a positive pair when the two image ids match.
            pos_mask = tf.cast(
                tf.equal(
                    tf.expand_dims(image_id, axis=1),
                    tf.expand_dims(image_ids_gathered, axis=0)), tf.float32)
            neg_mask = 1.0 - pos_mask

            distance_ap = utils.masked_maximum(distance, pos_mask)

            if options.triplet_loss_use_semihard:
                # Use the semihard.

                # negatives_outside: smallest D_an where D_an > D_ap.
                mask = tf.cast(tf.greater(distance, distance_ap), tf.float32)
                mask = mask * neg_mask
                negatives_outside = utils.masked_minimum(distance, mask)

                # negatives_inside: largest D_an.
                negatives_inside = utils.masked_maximum(distance, neg_mask)

                # distance_an: the semihard negatives.
                mask_condition = tf.greater(
                    tf.reduce_sum(mask, axis=1, keepdims=True), 0.0)
                distance_an = tf.where(mask_condition, negatives_outside,
                                       negatives_inside)
            else:
                # Use the hardest.
                distance_an = utils.masked_minimum(distance, neg_mask)

            losses = tf.maximum(
                distance_ap - distance_an + options.triplet_loss_margin, 0)

            num_loss_examples = tf.count_nonzero(losses, dtype=tf.float32)
            triplet_loss = tf.reduce_mean(losses)
            loss_dict['triplet_loss'] = tf.multiply(
                triplet_loss, options.triplet_loss_weight)

        # Losses of the online instance classifier refinement network.
        (num_proposals,
         proposals) = (predictions[DetectionResultFields.num_proposals],
                       predictions[DetectionResultFields.proposal_boxes])
        batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)

        proposal_scores_0 = predictions[
            NOD3Predictions.oicr_proposal_scores + '_at_0']
        if options.oicr_use_proba_r_given_c:
            proposal_scores_0 = predictions[
                NOD3Predictions.midn_proba_r_given_c]

        # Prepend an all-zero column along the last axis.
        # NOTE(review): assumed to apply to both score sources above, i.e.
        # to sit outside the preceding `if` - TODO confirm.
        proposal_scores_0 = tf.concat(
            [tf.fill([batch, max_num_proposals, 1], 0.0), proposal_scores_0],
            axis=-1)

        # The OICR losses are multiplied by zero until the global step
        # exceeds `oicr_start_step`.
        global_step = tf.train.get_or_create_global_step()
        oicr_loss_mask = tf.cast(global_step > options.oicr_start_step,
                                 tf.float32)

        for i in range(options.oicr_iterations):
            proposal_scores_1 = predictions[
                NOD3Predictions.oicr_proposal_scores +
                '_at_{}'.format(i + 1)]
            # Scores of the previous stage act as fixed supervision; no
            # gradient flows back through them.
            oicr_cross_entropy_loss_at_i = model_utils.calc_oicr_loss(
                labels,
                num_proposals,
                proposals,
                tf.stop_gradient(proposal_scores_0),
                proposal_scores_1,
                scope='oicr_{}'.format(i + 1),
                iou_threshold=options.oicr_iou_threshold)
            loss_dict['oicr_cross_entropy_loss_at_{}'.format(
                i + 1)] = tf.multiply(
                    oicr_loss_mask * oicr_cross_entropy_loss_at_i,
                    options.oicr_loss_weight)

            proposal_scores_0 = tf.nn.softmax(proposal_scores_1, axis=-1)

        # Min-entropy loss.
        mask = tf.sequence_mask(
            num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
        proba_r_given_c = predictions[NOD3Predictions.midn_proba_r_given_c]
        losses = tf.log(proba_r_given_c + _EPSILON)
        losses = tf.squeeze(
            utils.masked_sum_nd(data=losses, mask=mask, dim=1), axis=1)
        min_entropy_loss = tf.reduce_mean(
            tf.reduce_sum(losses * labels, axis=1))
        min_entropy_loss = tf.multiply(min_entropy_loss,
                                       options.min_entropy_loss_weight)

        max_proba = tf.reduce_mean(
            utils.masked_maximum(
                data=proba_r_given_c, mask=tf.expand_dims(mask, -1), dim=1))

        # Registered in the TF losses collection, not in `loss_dict`.
        tf.losses.add_loss(min_entropy_loss)

        # `num_loss_examples` exists only when the triplet branch was built.
        if options.triplet_loss_weight > 0:
            tf.summary.scalar('loss/num_loss_examples', num_loss_examples)
        tf.summary.scalar('loss/min_entropy_loss', min_entropy_loss)
        tf.summary.scalar('loss/max_proba', max_proba)

    return loss_dict
def _predict_similarity(self, examples):
    """Builds the tf graph that scores every image against every caption.

    Images are encoded and projected, the captions of the batch are gathered
    and embedded, and a pairwise similarity is computed between the two
    L2-normalized feature sets. The similarity is then pooled either with
    saliency-derived attention weights or with a plain average, depending on
    `options.use_saliency_score`. In the saliency branch, scalar and
    histogram summaries are emitted, `self.visualize` is called, and
    optional attention regularizers are registered via `tf.losses.add_loss`.

    Args:
      examples: dict of input tensors keyed by name.

    Returns:
      predictions: dict of prediction results keyed by name, holding the
        image ids, the gathered image ids and the similarity.
    """
    options = self._model_proto
    is_training = self._is_training

    # Extracts input data fields.
    image = examples[InputDataFields.image]
    image_id = examples[InputDataFields.image_id]
    num_captions = examples[InputDataFields.num_captions]
    caption_strings = examples[InputDataFields.caption_strings]
    caption_lengths = examples[InputDataFields.caption_lengths]

    image_feature = self._encode_images(
        image,
        cnn_name=options.cnn_name,
        cnn_trainable=options.cnn_trainable,
        cnn_weight_decay=options.cnn_weight_decay,
        cnn_feature_map=options.cnn_feature_map,
        cnn_dropout_keep_prob=options.cnn_dropout_keep_prob,
        cnn_checkpoint=options.cnn_checkpoint,
        cnn_scope=GAPVariableScopes.cnn,
        is_training=is_training)

    gathered = model_utils.gather_in_batch_captions(
        image_id, num_captions, caption_strings, caption_lengths)
    (image_ids_gathered, caption_strings_gathered,
     caption_lengths_gathered) = gathered

    # Extract image feature, shape =
    #   [batch, feature_height * feature_width, common_dimensions].
    with tf.name_scope(OperationNames.image_model):
        image_feature = self._project_images(
            image_feature,
            common_dimensions=options.common_dimensions,
            scope=GAPVariableScopes.image_proj,
            hyperparams=options.image_proj_hyperparams,
            is_training=is_training)
        (batch, feature_height, feature_width,
         common_dimensions) = utils.get_tensor_shape(image_feature)
        # Flatten the two spatial axes into a single region axis.
        image_feature = tf.reshape(image_feature,
                                   [batch, -1, common_dimensions])

    # Extract caption feature, shape =
    #   [num_captions_in_batch, max_caption_length, common_dimensions].
    vocabulary_list = self._read_vocabulary(options.vocabulary_file)
    tf.logging.info("Read a vocabulary with %i words.", len(vocabulary_list))

    with tf.name_scope(OperationNames.text_model):
        caption_feature = self._encode_captions(
            caption_strings_gathered,
            vocabulary_list=vocabulary_list,
            common_dimensions=options.common_dimensions,
            scope=GAPVariableScopes.word_embedding,
            is_training=is_training)
        (_, max_caption_length,
         common_dimensions) = utils.get_tensor_shape(caption_feature)

    # Calculates similarity matrix, shape=[batch, num_captions_in_batch].
    # NOTE(review): the source was flattened, so the extent of this name
    # scope is reconstructed; it is assumed to cover both pooling branches
    # below - TODO confirm (it affects summary tag prefixes).
    with tf.name_scope(OperationNames.calc_pairwise_similarity):

        # Compute dot-product similarity.
        normalized_image = tf.nn.l2_normalize(image_feature, axis=-1)
        normalized_caption = tf.nn.l2_normalize(caption_feature, axis=-1)
        similarity = self._calc_pairwise_similarity(
            image_feature=normalized_image,
            text_feature=normalized_caption,
            dropout_keep_prob=options.dropout_keep_prob,
            is_training=is_training)

        # Zero out similarity entries that belong to padded word slots.
        word_mask = tf.sequence_mask(
            caption_lengths_gathered,
            maxlen=max_caption_length,
            dtype=tf.float32)
        broadcast_word_mask = tf.expand_dims(tf.expand_dims(word_mask, 0), 0)
        similarity = similarity * broadcast_word_mask

        if options.use_saliency_score:

            # Predict saliency score.
            #   image_saliency shape = [batch, num_regions].
            #   caption_saliency shape =
            #     [num_captions_in_batch, max_caption_length].
            image_saliency = self._calc_saliency_score(
                image_feature,
                scope=GAPVariableScopes.image_saliency,
                hyperparams=options.image_saliency_hyperparams,
                is_training=is_training)

            if options.l2_norm_for_word_saliency:
                caption_feature = tf.nn.l2_normalize(caption_feature, axis=-1)
            caption_saliency = self._calc_saliency_score(
                caption_feature,
                scope=GAPVariableScopes.word_saliency,
                hyperparams=options.word_saliency_hyperparams,
                is_training=is_training)

            # Apply masked attention.
            image_attention = tf.nn.softmax(image_saliency, axis=-1)
            caption_attention = utils.masked_softmax(
                caption_saliency, word_mask, dim=-1)

            tf.summary.scalar(
                'loss/image_attention_max',
                tf.reduce_mean(tf.reduce_max(image_attention, axis=1)))
            tf.summary.scalar(
                'loss/image_attention_min',
                tf.reduce_mean(tf.reduce_min(image_attention, axis=1)))
            tf.summary.scalar(
                'loss/caption_attention_max',
                tf.reduce_mean(
                    utils.masked_maximum(caption_attention, word_mask,
                                         dim=1)))
            tf.summary.scalar(
                'loss/caption_attention_min',
                tf.reduce_mean(
                    utils.masked_minimum(caption_attention, word_mask,
                                         dim=1)))

            # Optional regularizers on the log of the attention weights;
            # the clamp keeps the argument of the log away from zero.
            if options.image_regularizer_weight > 0.0:
                log_image_attention = tf.log(
                    tf.maximum(image_attention, _LOG_SMALL_NUMBER))
                image_reg_loss = tf.multiply(
                    options.image_regularizer_weight,
                    tf.reduce_mean(
                        tf.reduce_sum(log_image_attention, axis=1)))
                tf.losses.add_loss(image_reg_loss)
                tf.summary.scalar('loss/image_attention_log_loss',
                                  image_reg_loss)

            if options.text_regularizer_weight > 0.0:
                log_caption_attention = tf.log(
                    tf.maximum(caption_attention, _LOG_SMALL_NUMBER))
                text_reg_loss = tf.multiply(
                    options.text_regularizer_weight,
                    tf.reduce_mean(
                        tf.reduce_sum(log_caption_attention * word_mask,
                                      axis=1)))
                tf.losses.add_loss(text_reg_loss)
                tf.summary.scalar('loss/caption_attention_log_loss',
                                  text_reg_loss)

            # Pair every image attention weight with every caption
            # attention weight through the same pairwise routine.
            saliency_mask = self._calc_pairwise_similarity(
                image_feature=tf.expand_dims(image_attention, -1),
                text_feature=tf.expand_dims(caption_attention, -1),
                dropout_keep_prob=options.dropout_keep_prob,
                is_training=is_training)

            # Compute weighted sum.
            similarity = tf.reduce_sum(similarity * saliency_mask,
                                       axis=[1, 3])

            self.visualize(
                image,
                tf.reshape(image_saliency,
                           [-1, feature_height, feature_width]))

            tf.summary.histogram('image_saliency', image_saliency)
            tf.summary.histogram('text_saliency', caption_saliency)

        else:

            # Simple Global Average Pooling.
            summed_similarity = tf.reduce_sum(similarity, axis=[1, 3])
            # _SMALL_NUMBER keeps the divisor non-zero.
            num_pairs = _SMALL_NUMBER + tf.cast(
                feature_width * feature_height * caption_lengths_gathered,
                tf.float32)
            similarity = tf.div(summed_similarity, num_pairs)

    return {
        GAPPredictions.image_id: image_id,
        GAPPredictions.image_ids_gathered: image_ids_gathered,
        GAPPredictions.similarity: similarity,
    }