Пример #1
0
    def test_masked_minimum(self):
        """Checks `utils.masked_minimum` with full, partial and empty masks."""
        tf.reset_default_graph()

        values_ph = tf.placeholder(tf.float32, shape=[None, None])
        mask_ph = tf.placeholder(tf.float32, shape=[None, None])
        minimum_op = utils.masked_minimum(values_ph, mask_ph)

        # The same two rows are fed in every case; only the mask changes.
        rows = [[-2.0, 1.0, 2.0, -1.0, 0.0],
                [-2.0, -1.0, -3.0, -5.0, -4.0]]

        # Each case pairs a mask with the expected per-row result.
        cases = (
            # Every element selected.
            ([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], [[-2.0], [-5.0]]),
            # Some elements masked out.
            ([[0, 1, 1, 0, 1], [1, 1, 1, 0, 1]], [[0.0], [-4.0]]),
            # Nothing selected: the expected values equal each row's maximum.
            ([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], [[2.0], [-1.0]]),
        )

        with self.test_session() as sess:
            for mask_rows, expected in cases:
                actual = sess.run(minimum_op,
                                  feed_dict={
                                      values_ph: rows,
                                      mask_ph: mask_rows
                                  })
                self.assertAllClose(actual, expected)
Пример #2
0
    def build_loss(self, predictions, **kwargs):
        """Builds the semi-hard triplet loss from the predicted similarities.

        Args:
          predictions: dict of prediction results keyed by name. The image id,
            gathered image ids and similarity entries are read.
          **kwargs: accepted for interface compatibility; not read here.

        Returns:
          A dict with a single key, 'triplet_loss', mapped to the mean loss.
        """
        options = self._model_proto

        anchor_ids = predictions[_FIELD_IMAGE_ID]
        gathered_ids = predictions[_FIELD_IMAGE_IDS_GATHERED]
        similarity = predictions[_FIELD_SIMILARITY]

        distance = 1.0 - similarity

        # An entry is a positive when the anchor id equals the gathered id;
        # the two expand_dims calls let the comparison broadcast pairwise
        # (assumes both id tensors are 1-D -- TODO confirm).
        same_id = tf.equal(tf.expand_dims(anchor_ids, axis=1),
                           tf.expand_dims(gathered_ids, axis=0))
        pos_mask = tf.cast(same_id, tf.float32)
        neg_mask = 1.0 - pos_mask

        # D_ap: either the average or the largest positive distance.
        reduce_positives = (utils.masked_avg if options.triplet_ap_use_avg else
                            utils.masked_maximum)
        distance_ap = reduce_positives(distance, pos_mask)

        # negatives_outside: smallest D_an among negatives with D_an > D_ap.
        farther_than_ap = tf.cast(tf.greater(distance, distance_ap),
                                  tf.float32)
        outside_mask = farther_than_ap * neg_mask
        negatives_outside = utils.masked_minimum(distance, outside_mask)

        # negatives_inside: largest D_an over all negatives.
        negatives_inside = utils.masked_maximum(distance, neg_mask)

        # distance_an: take the "outside" negative when at least one exists
        # for the row, otherwise fall back to the "inside" one.
        outside_count = tf.reduce_sum(outside_mask, axis=1, keepdims=True)
        has_outside = tf.greater(outside_count, 0.0)
        distance_an = tf.where(has_outside, negatives_outside,
                               negatives_inside)

        # Hinge on the margin violation.
        violation = distance_ap - distance_an + options.triplet_margin
        hinge = tf.maximum(violation, 0)

        return {'triplet_loss': tf.reduce_mean(hinge)}
Пример #3
0
  def build_loss(self, predictions, examples, **kwargs):
    """Builds the triplet loss graph and registers its scalar summaries.

    Args:
      predictions: dict of prediction results keyed by name. The image id,
        gathered image ids and similarity entries are read.
      examples: dict of inputs keyed by name; not read here.
      **kwargs: accepted for interface compatibility; not read here.

    Returns:
      A dict mapping 'triplet_loss' to the mean triplet loss tensor.
    """
    options = self._model_proto

    image_id = predictions[VisualW2vPredictions.image_id]
    image_ids_gathered = predictions[VisualW2vPredictions.image_ids_gathered]
    similarity = predictions[VisualW2vPredictions.similarity]

    # Distance matrix, shape = [batch, num_captions_in_batch].
    distance = 1.0 - similarity

    # Positives are the entries whose image id matches the gathered id.
    is_positive = tf.equal(
        tf.expand_dims(image_id, axis=1),
        tf.expand_dims(image_ids_gathered, axis=0))
    pos_mask = tf.cast(is_positive, tf.float32)
    neg_mask = 1.0 - pos_mask

    distance_ap = utils.masked_maximum(distance, pos_mask)

    if not options.triplet_loss_use_semihard:
      # Hardest negative: the smallest D_an over all negatives.
      distance_an = utils.masked_minimum(distance, neg_mask)
    else:
      # Semihard negative.
      # negatives_outside: smallest D_an where D_an > D_ap.
      beyond_ap = tf.cast(tf.greater(distance, distance_ap), tf.float32)
      outside_mask = beyond_ap * neg_mask
      negatives_outside = utils.masked_minimum(distance, outside_mask)

      # negatives_inside: largest D_an.
      negatives_inside = utils.masked_maximum(distance, neg_mask)

      # Prefer the outside negative whenever the row has at least one.
      outside_count = tf.reduce_sum(outside_mask, axis=1, keepdims=True)
      has_outside = tf.greater(outside_count, 0.0)
      distance_an = tf.where(has_outside, negatives_outside, negatives_inside)

    # Hinge on the margin violation.
    violation = distance_ap - distance_an + options.triplet_loss_margin
    losses = tf.maximum(violation, 0)

    # Number of entries that still contribute a non-zero loss.
    num_loss_examples = tf.count_nonzero(losses, dtype=tf.float32)
    loss = tf.reduce_mean(losses)

    tf.summary.scalar('loss/num_loss_examples', num_loss_examples)
    tf.summary.scalar('loss/triplet_loss', loss)
    return {'triplet_loss': loss}
Пример #4
0
    def build_loss(self, predictions, examples, **kwargs):
        """Builds the tf graph that computes the training losses.

        The returned dict holds the multi-instance detection (MIDN)
        cross-entropy loss, optionally the weighted triplet loss, and one OICR
        cross-entropy loss per refinement iteration. The min-entropy loss is
        not put in the dict; it is registered through `tf.losses.add_loss`.
        Scalar summaries are emitted as a side effect.

        Args:
          predictions: dict of prediction results keyed by name.
          examples: dict of inputs keyed by name; not read in this method.
          **kwargs: not read in this method.

        Returns:
          loss_dict: dict of loss tensors keyed by name
            ('midn_cross_entropy_loss', 'triplet_loss' when
            `options.triplet_loss_weight > 0`, and
            'oicr_cross_entropy_loss_at_<i>' for i = 1..oicr_iterations).

        Raises:
          ValueError: if `options.midn_loss_negative_mining` is neither NONE
            nor HARDEST.
        """
        options = self._model_proto

        loss_dict = {}

        with tf.name_scope('losses'):

            # Extract image-level labels.

            labels = self._extract_class_label(
                class_texts=slim.flatten(predictions[
                    NOD3Predictions.training_only_caption_strings]),
                vocabulary_list=self._vocabulary_list)

            # A prediction model from caption to class

            # Loss of the multi-instance detection network.

            midn_class_logits = predictions[NOD3Predictions.midn_class_logits]
            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels, logits=midn_class_logits)

            # Hard-negative mining.

            if options.midn_loss_negative_mining == nod3_model_pb2.NOD3Model.NONE:
                if options.classification_loss_use_sum:
                    # NOTE(review): this branch is disabled by the assertion;
                    # the assignment below only runs if assertions are
                    # stripped (e.g. `python -O`). Confirm whether it should
                    # raise explicitly instead.
                    assert False
                    loss_dict['midn_cross_entropy_loss'] = tf.multiply(
                        tf.reduce_mean(tf.reduce_sum(losses, axis=-1)),
                        options.midn_loss_weight)
                else:
                    if options.caption_as_label:
                        # Only examples with at least one positive label
                        # contribute to the averaged loss.
                        loss_masks = tf.to_float(
                            tf.reduce_any(labels > 0, axis=-1))
                        loss_dict['midn_cross_entropy_loss'] = tf.multiply(
                            tf.squeeze(
                                utils.masked_avg(tf.reduce_mean(losses,
                                                                axis=-1),
                                                 mask=loss_masks,
                                                 dim=0)),
                            options.midn_loss_weight)
                    else:
                        loss_dict['midn_cross_entropy_loss'] = tf.multiply(
                            tf.reduce_mean(losses), options.midn_loss_weight)
            elif options.midn_loss_negative_mining == nod3_model_pb2.NOD3Model.HARDEST:
                # NOTE(review): HARDEST mining is likewise disabled by the
                # assertion; the code after it is unreachable unless
                # assertions are stripped.
                assert False
                loss_masks = self._midn_loss_mine_hardest_negative(
                    labels, losses)
                loss_dict['midn_cross_entropy_loss'] = tf.reduce_mean(
                    utils.masked_avg(data=losses, mask=loss_masks, dim=1))
            else:
                raise ValueError('Invalid negative mining method.')

            # Triplet loss
            if options.triplet_loss_weight > 0:
                # NOTE(review): `image_ids_gathered` is read from the same
                # key as `image_id`, so the positive mask compares the image
                # ids against themselves. Confirm this is intentional and not
                # a copy-paste slip.
                (image_id, image_ids_gathered,
                 similarity) = (predictions[NOD3Predictions.image_id],
                                predictions[NOD3Predictions.image_id],
                                predictions[NOD3Predictions.similarity])

                distance = 1.0 - similarity
                pos_mask = tf.cast(
                    tf.equal(tf.expand_dims(image_id, axis=1),
                             tf.expand_dims(image_ids_gathered, axis=0)),
                    tf.float32)
                neg_mask = 1.0 - pos_mask
                distance_ap = utils.masked_maximum(distance, pos_mask)

                if options.triplet_loss_use_semihard:

                    # Use the semihard.

                    # negatives_outside: smallest D_an where D_an > D_ap.

                    mask = tf.cast(tf.greater(distance, distance_ap),
                                   tf.float32)
                    mask = mask * neg_mask
                    negatives_outside = utils.masked_minimum(distance, mask)

                    # negatives_inside: largest D_an.

                    negatives_inside = utils.masked_maximum(distance, neg_mask)

                    # distance_an: the semihard negatives. Rows without any
                    # "outside" negative fall back to `negatives_inside`.

                    mask_condition = tf.greater(
                        tf.reduce_sum(mask, axis=1, keepdims=True), 0.0)

                    distance_an = tf.where(mask_condition, negatives_outside,
                                           negatives_inside)

                else:

                    # Use the hardest.

                    distance_an = utils.masked_minimum(distance, neg_mask)

                # `losses` is rebound here; the MIDN losses above have already
                # been folded into `loss_dict`.
                losses = tf.maximum(
                    distance_ap - distance_an + options.triplet_loss_margin, 0)

                # Only defined when the triplet loss is enabled; the summary
                # at the end of the method is guarded by the same condition.
                num_loss_examples = tf.count_nonzero(losses, dtype=tf.float32)
                triplet_loss = tf.reduce_mean(losses)

                loss_dict['triplet_loss'] = tf.multiply(
                    triplet_loss, options.triplet_loss_weight)

            # Losses of the online instance classifier refinement network.

            (num_proposals,
             proposals) = (predictions[DetectionResultFields.num_proposals],
                           predictions[DetectionResultFields.proposal_boxes])
            batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)

            proposal_scores_0 = predictions[
                NOD3Predictions.oicr_proposal_scores + '_at_0']
            if options.oicr_use_proba_r_given_c:
                proposal_scores_0 = predictions[
                    NOD3Predictions.midn_proba_r_given_c]

            # Prepend an all-zero column along the last axis (presumably the
            # background class -- TODO confirm).
            proposal_scores_0 = tf.concat([
                tf.fill([batch, max_num_proposals, 1], 0.0), proposal_scores_0
            ],
                                          axis=-1)

            # The OICR losses are multiplied by 0 until the global step
            # exceeds `options.oicr_start_step`, and by 1 afterwards.
            global_step = tf.train.get_or_create_global_step()
            oicr_loss_mask = tf.cast(global_step > options.oicr_start_step,
                                     tf.float32)

            for i in range(options.oicr_iterations):
                proposal_scores_1 = predictions[
                    NOD3Predictions.oicr_proposal_scores +
                    '_at_{}'.format(i + 1)]
                # The previous stage's scores are passed through
                # stop_gradient, so gradients from this loss do not flow back
                # through them.
                oicr_cross_entropy_loss_at_i = model_utils.calc_oicr_loss(
                    labels,
                    num_proposals,
                    proposals,
                    tf.stop_gradient(proposal_scores_0),
                    proposal_scores_1,
                    scope='oicr_{}'.format(i + 1),
                    iou_threshold=options.oicr_iou_threshold)
                loss_dict['oicr_cross_entropy_loss_at_{}'.format(
                    i + 1)] = tf.multiply(
                        oicr_loss_mask * oicr_cross_entropy_loss_at_i,
                        options.oicr_loss_weight)

                # The softmax of this stage's scores feeds the next iteration.
                proposal_scores_0 = tf.nn.softmax(proposal_scores_1, axis=-1)

            # Min-entropy loss.

            # `mask` is rebound here to select the first `num_proposals`
            # entries out of `max_num_proposals`.
            mask = tf.sequence_mask(num_proposals,
                                    maxlen=max_num_proposals,
                                    dtype=tf.float32)
            proba_r_given_c = predictions[NOD3Predictions.midn_proba_r_given_c]
            # _EPSILON keeps the log argument away from zero.
            losses = tf.log(proba_r_given_c + _EPSILON)
            losses = tf.squeeze(utils.masked_sum_nd(data=losses,
                                                    mask=mask,
                                                    dim=1),
                                axis=1)
            min_entropy_loss = tf.reduce_mean(
                tf.reduce_sum(losses * labels, axis=1))
            min_entropy_loss = tf.multiply(min_entropy_loss,
                                           options.min_entropy_loss_weight)

            # Mean of the masked maximum along dim 1; used only for the
            # summary below.
            max_proba = tf.reduce_mean(
                utils.masked_maximum(data=proba_r_given_c,
                                     mask=tf.expand_dims(mask, -1),
                                     dim=1))
            # Registered via tf.losses rather than returned in `loss_dict`.
            tf.losses.add_loss(min_entropy_loss)

        # Same guard as the triplet block that defines `num_loss_examples`.
        if options.triplet_loss_weight > 0:
            tf.summary.scalar('loss/num_loss_examples', num_loss_examples)
        tf.summary.scalar('loss/min_entropy_loss', min_entropy_loss)
        tf.summary.scalar('loss/max_proba', max_proba)

        return loss_dict
Пример #5
0
    def _predict_similarity(self, examples):
        """Builds the tf graph that predicts image-caption similarity.

        Images and the captions gathered from the batch are encoded, compared
        pairwise, and pooled into the final similarity either by a
        saliency-weighted sum (`options.use_saliency_score`) or by global
        average pooling. In the saliency branch, summaries are emitted and
        optional attention regularizers are registered via
        `tf.losses.add_loss` as side effects.

        Args:
          examples: dict of input tensors keyed by name.

        Returns:
          predictions: dict of prediction results keyed by name, holding the
            image ids, the gathered image ids and the similarity tensor.
        """
        options = self._model_proto
        is_training = self._is_training

        # Extracts input data fields.

        (image, image_id, num_captions, caption_strings,
         caption_lengths) = (examples[InputDataFields.image],
                             examples[InputDataFields.image_id],
                             examples[InputDataFields.num_captions],
                             examples[InputDataFields.caption_strings],
                             examples[InputDataFields.caption_lengths])

        image_feature = self._encode_images(
            image,
            cnn_name=options.cnn_name,
            cnn_trainable=options.cnn_trainable,
            cnn_weight_decay=options.cnn_weight_decay,
            cnn_feature_map=options.cnn_feature_map,
            cnn_dropout_keep_prob=options.cnn_dropout_keep_prob,
            cnn_checkpoint=options.cnn_checkpoint,
            cnn_scope=GAPVariableScopes.cnn,
            is_training=is_training)

        # Gather the captions of the batch together with the id of the image
        # each one belongs to (presumably flattening the per-image caption
        # lists -- TODO confirm against model_utils).
        (image_ids_gathered, caption_strings_gathered,
         caption_lengths_gathered) = model_utils.gather_in_batch_captions(
             image_id, num_captions, caption_strings, caption_lengths)

        # Extract image feature, shape =
        #   [batch, feature_height * feature_width, common_dimensions].

        with tf.name_scope(OperationNames.image_model):
            image_feature = self._project_images(
                image_feature,
                common_dimensions=options.common_dimensions,
                scope=GAPVariableScopes.image_proj,
                hyperparams=options.image_proj_hyperparams,
                is_training=is_training)

            # feature_height / feature_width are kept for the visualization
            # reshape and the average-pooling denominator further down.
            (batch, feature_height, feature_width,
             common_dimensions) = utils.get_tensor_shape(image_feature)
            image_feature = tf.reshape(image_feature,
                                       [batch, -1, common_dimensions])

        # Extract caption feature, shape =
        #   [num_captions_in_batch, max_caption_length, common_dimensions].

        vocabulary_list = self._read_vocabulary(options.vocabulary_file)
        tf.logging.info("Read a vocabulary with %i words.",
                        len(vocabulary_list))

        with tf.name_scope(OperationNames.text_model):
            caption_feature = self._encode_captions(
                caption_strings_gathered,
                vocabulary_list=vocabulary_list,
                common_dimensions=options.common_dimensions,
                scope=GAPVariableScopes.word_embedding,
                is_training=is_training)

            # Only `max_caption_length` is used below; `common_dimensions` is
            # rebound and `num_captions_in_batch` is not read again.
            (num_captions_in_batch, max_caption_length,
             common_dimensions) = utils.get_tensor_shape(caption_feature)

        # Calculates similarity matrix, shape=[batch, num_captions_in_batch].

        with tf.name_scope(OperationNames.calc_pairwise_similarity):

            # Compute dot-product similarity on l2-normalized features.

            similarity = self._calc_pairwise_similarity(
                image_feature=tf.nn.l2_normalize(image_feature, axis=-1),
                text_feature=tf.nn.l2_normalize(caption_feature, axis=-1),
                dropout_keep_prob=options.dropout_keep_prob,
                is_training=is_training)

            # Zero out positions beyond each caption's length. The mask gets
            # two leading singleton axes so it broadcasts against
            # `similarity` (assumes similarity is
            # [batch, num_regions, num_captions_in_batch, max_caption_length]
            # -- TODO confirm).
            word_mask = tf.sequence_mask(caption_lengths_gathered,
                                         maxlen=max_caption_length,
                                         dtype=tf.float32)
            similarity = similarity * tf.expand_dims(
                tf.expand_dims(word_mask, 0), 0)

            if options.use_saliency_score:

                # Predict saliency score.
                #   image_saliency shape = [batch, num_regions].
                #   caption_saliency shape = [num_captions_in_batch, max_caption_length].

                image_saliency = self._calc_saliency_score(
                    image_feature,
                    scope=GAPVariableScopes.image_saliency,
                    hyperparams=options.image_saliency_hyperparams,
                    is_training=is_training)

                # Optionally normalize the word features before scoring their
                # saliency; the similarity above already used normalized
                # copies.
                if options.l2_norm_for_word_saliency:
                    caption_feature = tf.nn.l2_normalize(caption_feature,
                                                         axis=-1)
                caption_saliency = self._calc_saliency_score(
                    caption_feature,
                    scope=GAPVariableScopes.word_saliency,
                    hyperparams=options.word_saliency_hyperparams,
                    is_training=is_training)

                # Apply masked attention: a plain softmax for the image
                # saliency, a softmax restricted by `word_mask` for the words.

                image_attention = tf.nn.softmax(image_saliency, axis=-1)
                caption_attention = utils.masked_softmax(caption_saliency,
                                                         word_mask,
                                                         dim=-1)

                # Monitoring summaries: mean of the per-row extreme attention
                # values.
                tf.summary.scalar(
                    'loss/image_attention_max',
                    tf.reduce_mean(tf.reduce_max(image_attention, axis=1)))
                tf.summary.scalar(
                    'loss/image_attention_min',
                    tf.reduce_mean(tf.reduce_min(image_attention, axis=1)))
                tf.summary.scalar(
                    'loss/caption_attention_max',
                    tf.reduce_mean(
                        utils.masked_maximum(caption_attention,
                                             word_mask,
                                             dim=1)))
                tf.summary.scalar(
                    'loss/caption_attention_min',
                    tf.reduce_mean(
                        utils.masked_minimum(caption_attention,
                                             word_mask,
                                             dim=1)))

                # Optional regularizer on the image attention: weight times
                # the batch mean of the summed log-attention. The attention is
                # clamped from below by _LOG_SMALL_NUMBER before the log.
                if options.image_regularizer_weight > 0.0:
                    log_image_attention = tf.log(
                        tf.maximum(image_attention, _LOG_SMALL_NUMBER))
                    loss = tf.multiply(
                        options.image_regularizer_weight,
                        tf.reduce_mean(
                            tf.reduce_sum(log_image_attention, axis=1)))
                    tf.losses.add_loss(loss)
                    tf.summary.scalar('loss/image_attention_log_loss', loss)

                # Same regularizer for the caption attention; `word_mask`
                # removes the masked positions from the sum.
                if options.text_regularizer_weight > 0.0:
                    log_caption_attention = tf.log(
                        tf.maximum(caption_attention, _LOG_SMALL_NUMBER))
                    loss = tf.multiply(
                        options.text_regularizer_weight,
                        tf.reduce_mean(
                            tf.reduce_sum(log_caption_attention * word_mask,
                                          axis=1)))
                    tf.losses.add_loss(loss)
                    tf.summary.scalar('loss/caption_attention_log_loss', loss)

                # Reuse the pairwise-similarity helper on the attention
                # values (with a trailing singleton axis) to build the
                # weights applied to `similarity`.
                saliency_mask = self._calc_pairwise_similarity(
                    image_feature=tf.expand_dims(image_attention, -1),
                    text_feature=tf.expand_dims(caption_attention, -1),
                    dropout_keep_prob=options.dropout_keep_prob,
                    is_training=is_training)

                # Compute weighted sum over axes 1 and 3.

                similarity = tf.reduce_sum(similarity * saliency_mask,
                                           axis=[1, 3])

                self.visualize(
                    image,
                    tf.reshape(image_saliency,
                               [-1, feature_height, feature_width]))
                tf.summary.histogram('image_saliency', image_saliency)
                tf.summary.histogram('text_saliency', caption_saliency)

            else:

                # Simple Global Average Pooling: sum over axes 1 and 3, then
                # divide by (feature_width * feature_height * caption length).
                # _SMALL_NUMBER guards against a zero denominator.

                similarity = tf.div(
                    tf.reduce_sum(similarity, axis=[1, 3]),
                    _SMALL_NUMBER + tf.cast(
                        feature_width * feature_height *
                        caption_lengths_gathered, tf.float32))

        predictions = {
            GAPPredictions.image_id: image_id,
            GAPPredictions.image_ids_gathered: image_ids_gathered,
            GAPPredictions.similarity: similarity,
        }
        return predictions