Example #1
  def _build_midn_network(self,
                          num_proposals,
                          proposal_features,
                          num_classes=20):
    """Builds the Multiple Instance Detection Network.

    MIDN: An attention network.

    Args:
      num_proposals: A [batch] int tensor.
      proposal_features: A [batch, max_num_proposals, features_dims] 
        float tensor.
      num_classes: Number of classes.

    Returns:
      logits: A [batch, num_classes] float tensor.
      proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
    """
    with tf.name_scope('multi_instance_detection'):

      batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
      mask = tf.sequence_mask(
          num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
      mask = tf.expand_dims(mask, axis=-1)

      # Calculates the attention score: proposal `r` given class `c`.
      #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

      logits_r_given_c = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='midn/proba_r_given_c')
      logits_r_given_c = tf.multiply(mask, logits_r_given_c)
      proba_r_given_c = utils.masked_softmax(
          data=logits_r_given_c, mask=mask, dim=1)
      proba_r_given_c = tf.multiply(mask, proba_r_given_c)
      tf.summary.histogram('midn/logits_r_given_c', logits_r_given_c)

      # Calculates the weighted logits:
      #   logits_c_given_r shape = [batch, max_num_proposals, num_classes].
      #   logits shape = [batch, num_classes].

      logits_c_given_r = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='midn/proba_c_given_r')
      proba_c_given_r = tf.nn.softmax(logits_c_given_r)
      proba_c_given_r = tf.multiply(mask, proba_c_given_r)
      tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)

      # Aggregates the logits.

      logits = tf.multiply(logits_c_given_r, proba_r_given_c)
      logits = tf.reduce_sum(logits, axis=1)
      tf.summary.histogram('midn/logits', logits)

    return logits, proba_r_given_c
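
The aggregation at the end computes, for each class `c`, a proposal-weighted sum of the per-proposal class logits: logits[b, c] = sum_r proba_r_given_c[b, r, c] * logits_c_given_r[b, r, c]. A minimal NumPy restatement of that contraction (toy shapes, purely illustrative):

import numpy as np

batch, max_num_proposals, num_classes = 2, 3, 4
logits_c_given_r = np.random.randn(batch, max_num_proposals, num_classes)
proba_r_given_c = np.random.rand(batch, max_num_proposals, num_classes)

# Same contraction as tf.reduce_sum(tf.multiply(...), axis=1) above.
logits = np.einsum('brc,brc->bc', logits_c_given_r, proba_r_given_c)
assert logits.shape == (batch, num_classes)
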
Example #2
    def test_masked_softmax(self):
        tf.reset_default_graph()

        data = tf.placeholder(tf.float32, shape=[None, None])
        mask = tf.placeholder(tf.float32, shape=[None, None])
        masked_softmax = utils.masked_softmax(data, mask)

        with self.test_session() as sess:
            result = sess.run(masked_softmax,
                              feed_dict={
                                  data: [[1, 1, 1, 1], [1, 1, 1, 1]],
                                  mask: [[1, 1, 1, 1], [1, 1, 1, 1]]
                              })
            self.assertAllClose(
                result, [[0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25]])

            result = sess.run(masked_softmax,
                              feed_dict={
                                  data: [[1, 1, 1, 1], [1, 1, 1, 1]],
                                  mask: [[1, 1, 0, 0], [0, 0, 1, 1]]
                              })
            self.assertAllClose(result,
                                [[0.5, 0.5, 0.0, 0.0], [0.0, 0.0, 0.5, 0.5]])
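
The test pins down the contract of `utils.masked_softmax`: masked-out entries receive (nearly) zero probability and the surviving entries renormalize among themselves. A minimal sketch of an implementation consistent with these assertions (an assumption; the repository's actual helper may differ in its default `dim` and in numerical details):

import tensorflow as tf

def masked_softmax(data, mask, dim=-1):
  # Push masked logits toward -inf so they contribute ~0 probability
  # after exponentiation, then apply a regular softmax.
  padding = -1e9 * (1.0 - mask)
  return tf.nn.softmax(data + padding, axis=dim)
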
Example #3
  def encode(self, feature, length, scope=None):
    """Encodes sequence features into representation.

    Args:
      feature: A [batch, max_sequence_len, dims] float tensor.
      length: A [batch] int tensor.
      scope: Variable scope name for the fully connected layers.

    Returns:
      A [batch, dims] float tensor.
    """
    options = self._model_proto
    is_training = self._is_training

    mask = tf.sequence_mask(
        length, maxlen=utils.get_tensor_shape(feature)[1], dtype=tf.float32)

    # Compute attention distribution.

    node = feature
    for i in range(options.hidden_layers):
      node = tf.contrib.layers.fully_connected(
          inputs=node,
          num_outputs=feature.get_shape()[-1].value,
          scope=scope + '/hidden_{}'.format(i))
    logits = tf.contrib.layers.fully_connected(
        inputs=node, num_outputs=1, activation_fn=None, scope=scope)

    probas = utils.masked_softmax(
        data=logits, mask=tf.expand_dims(mask, axis=-1), dim=1)
    feature = utils.masked_sum_nd(data=feature * probas, mask=mask, dim=1)

    # Summary.

    #tf.summary.histogram('attn/probas/' + scope, probas)
    #tf.summary.histogram('attn/logits/' + scope, logits)

    return tf.squeeze(feature, axis=1)
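
`utils.masked_sum_nd` pools the attention-weighted features over the sequence dimension while keeping that dimension, which is why the caller finishes with `tf.squeeze(feature, axis=1)`. A plausible sketch consistent with this usage (an assumption, not the repository's actual implementation):

import tensorflow as tf

def masked_sum_nd(data, mask, dim=1):
  # data: [batch, T, dims]; mask: [batch, T] with 1 for valid steps.
  # Zero out padded steps, then sum over `dim`, keeping the dimension,
  # so the result is [batch, 1, dims].
  return tf.reduce_sum(data * tf.expand_dims(mask, axis=-1),
                       axis=dim, keepdims=True)
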
Example #4
    def create_graph(self, proposal_repr, slogan_repr, label_repr,
                     dbpedia_repr, proposal_mask, slogan_mask, label_mask,
                     label_to_slogan_mask, dbpedia_mask,
                     dbpedia_to_slogan_mask):
        """Creates graph."""
        options = self._options
        is_training = self._is_training

        (batch_i, embedding_dims, max_proposal_num, max_label_num,
         max_slogan_num,
         max_dbpedia_num) = (proposal_repr.get_shape()[0].value,
                             proposal_repr.get_shape()[-1].value,
                             utils.get_tensor_shape(proposal_repr)[1],
                             utils.get_tensor_shape(label_repr)[1],
                             utils.get_tensor_shape(slogan_repr)[1],
                             utils.get_tensor_shape(dbpedia_repr)[1])

        # Create access matrix.

        access_matrix = self._create_access_matrix(
            max_proposal_num, max_slogan_num, max_label_num,
            label_to_slogan_mask, max_dbpedia_num, dbpedia_to_slogan_mask,
            batch_i)
        tf.summary.histogram('histogram/access_matrix', access_matrix)

        sentinel_mask = tf.ones([batch_i, 1])
        sentinel_repr = tf.zeros(
            [batch_i, 1, proposal_repr.get_shape()[-1].value])
        node_mask = tf.concat(
            [sentinel_mask, proposal_mask, slogan_mask, label_mask,
             dbpedia_mask],
            axis=1)

        # Layer level-0 inference.

        with tf.variable_scope('layer_lv0_inference'):

            # lv0 predictions.

            (lv0_proposal_scores, lv0_slogan_scores,
             lv0_label_to_proposal_scores,
             lv0_dbpedia_to_slogan_scores) = self._create_lv0_edge_scores(
                 proposal_repr, slogan_repr, label_repr, dbpedia_repr,
                 proposal_mask, slogan_mask, label_mask, dbpedia_mask)

            # Create lv0 graph, edges to sentinel are not updated.

            node_to_node = self._create_adjacency_matrix(
                proposal_to_sentinel=tf.zeros([batch_i, 1, max_proposal_num]),
                slogan_to_sentinel=tf.zeros([batch_i, 1, max_slogan_num]),
                proposal_to_proposal=tf.linalg.diag(lv0_proposal_scores),
                slogan_to_slogan=tf.linalg.diag(lv0_slogan_scores),
                label_to_proposal=lv0_label_to_proposal_scores,
                dbpedia_to_slogan=lv0_dbpedia_to_slogan_scores)

            adjacency = utils.masked_softmax(
                node_to_node,
                mask=tf.multiply(access_matrix,
                                 tf.expand_dims(node_mask, axis=1)),
                dim=-1)
            adjacency = tf.multiply(
                adjacency,
                tf.multiply(tf.expand_dims(node_mask, 1),
                            tf.expand_dims(node_mask, 2)))

            node_repr = tf.concat(
                [sentinel_repr, proposal_repr, slogan_repr, label_repr,
                 dbpedia_repr],
                axis=1)
            node_repr = tf.matmul(adjacency, node_repr)

        # Layer level-1 inference.

        with tf.variable_scope('layer_lv1_inference'):

            # Update representation.

            proposal_repr = tf.slice(
                node_repr,
                begin=[0, 1, 0],
                size=[batch_i, max_proposal_num, embedding_dims])
            slogan_repr = tf.slice(
                node_repr,
                begin=[0, 1 + max_proposal_num, 0],
                size=[batch_i, max_slogan_num, embedding_dims])

            # lv1 predictions.

            (lv1_proposal_scores,
             lv1_slogan_scores) = self._create_lv1_edge_scores(
                 proposal_repr, slogan_repr, proposal_mask, slogan_mask)

            # Create lv1 graph, update edges between nodes and the sentinel.

            node_to_node = self._create_adjacency_matrix(
                proposal_to_sentinel=tf.expand_dims(lv1_proposal_scores, 1),
                slogan_to_sentinel=tf.expand_dims(lv1_slogan_scores, 1),
                proposal_to_proposal=tf.linalg.diag(lv0_proposal_scores),
                slogan_to_slogan=tf.linalg.diag(lv0_slogan_scores),
                label_to_proposal=lv0_label_to_proposal_scores,
                dbpedia_to_slogan=lv0_dbpedia_to_slogan_scores)

            adjacency = utils.masked_softmax(
                node_to_node,
                mask=access_matrix * tf.expand_dims(node_mask, axis=1),
                dim=-1)
            adjacency = tf.multiply(
                adjacency,
                tf.multiply(tf.expand_dims(node_mask, 1),
                            tf.expand_dims(node_mask, 2)))

            node_repr = tf.concat(
                [sentinel_repr, proposal_repr, slogan_repr, label_repr,
                 dbpedia_repr],
                axis=1)
            node_repr = tf.matmul(adjacency, node_repr)

        tf.summary.histogram('histogram/adjacency_logits', node_to_node)

        # Sparse loss.
        self_loop_values = tf.linalg.diag_part(adjacency)
        slogan_values = tf.slice(self_loop_values,
                                 begin=[0, 1 + max_proposal_num],
                                 size=[batch_i, max_slogan_num])
        if options.HasField('sparse_loss_weight'):
            slogan_value_masks = tf.less(slogan_values, 1)
            sparse_loss = -tf.div(
                tf.reduce_sum(
                    tf.boolean_mask(slogan_values, slogan_value_masks)),
                1e-8 + tf.reduce_sum(tf.to_float(slogan_value_masks)))

            tf.summary.scalar('loss/sparse_loss', sparse_loss)
            tf.losses.add_loss(
                tf.multiply(sparse_loss,
                            options.sparse_loss_weight,
                            name='sparse_loss'))

        image_repr = node_repr[:, 0, :]
        return image_repr, adjacency, node_to_node
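
The sparse loss reads the self-loop weights off the adjacency diagonal and, for slogan nodes whose self-loop is still below 1, maximizes their mean (the leading minus sign turns that into a loss), pushing each slogan to attend mostly to itself. A toy NumPy restatement of the indexing (hypothetical node layout and sizes):

import numpy as np

batch_i, max_proposal_num, max_slogan_num = 2, 2, 3
num_nodes = 1 + max_proposal_num + max_slogan_num  # [sentinel, proposals, slogans]
adjacency = np.random.rand(batch_i, num_nodes, num_nodes)

self_loop_values = np.diagonal(adjacency, axis1=1, axis2=2)  # [batch, nodes]
slogan_values = self_loop_values[:, 1 + max_proposal_num:
                                 1 + max_proposal_num + max_slogan_num]

below_one = slogan_values < 1.0
sparse_loss = -slogan_values[below_one].sum() / (1e-8 + below_one.sum())
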
Example #5
    def create_graph(self, proposal_repr, slogan_repr, label_repr,
                     dbpedia_repr, proposal_mask, slogan_mask, label_mask,
                     label_to_slogan_mask, dbpedia_mask,
                     dbpedia_to_slogan_mask):
        """Creates graph."""
        options = self._options
        is_training = self._is_training

        (batch_i, embedding_dims, max_proposal_num, max_label_num,
         max_slogan_num,
         max_dbpedia_num) = (proposal_repr.get_shape()[0].value,
                             proposal_repr.get_shape()[-1].value,
                             utils.get_tensor_shape(proposal_repr)[1],
                             utils.get_tensor_shape(label_repr)[1],
                             utils.get_tensor_shape(slogan_repr)[1],
                             utils.get_tensor_shape(dbpedia_repr)[1])
        # Create access matrix.

        access_matrix = self._create_access_matrix(
            max_proposal_num, max_slogan_num, max_label_num,
            label_to_slogan_mask, max_dbpedia_num, dbpedia_to_slogan_mask,
            batch_i)
        tf.summary.histogram('histogram/access_matrix', access_matrix)

        # Get the graph predictions.

        sentinel_mask = tf.ones([batch_i, 1])
        sentinel_repr = tf.zeros(
            [batch_i, 1, proposal_repr.get_shape()[-1].value])

        # Build adjacency matrix.

        node_to_node = self._create_adjacency_matrix(
            proposal_to_sentinel=self._create_edge_weights_helper(
                sentinel_repr, proposal_repr, 'proposal_to_sentinel'),
            slogan_to_sentinel=self._create_edge_weights_helper(
                sentinel_repr, slogan_repr, 'slogan_to_sentinel'),
            proposal_to_proposal=self._create_edge_weights_helper(
                proposal_repr, proposal_repr, 'proposal_to_proposal'),
            slogan_to_slogan=self._create_edge_weights_helper(
                slogan_repr, slogan_repr, 'slogan_to_slogan'),
            label_to_proposal=self._create_edge_weights_helper(
                proposal_repr, label_repr, 'label_to_proposal'),
            dbpedia_to_slogan=self._create_edge_weights_helper(
                slogan_repr, dbpedia_repr, 'dbpedia_to_slogan'))

        tf.summary.histogram('histogram/adjacency_logits', node_to_node)

        node_mask = tf.concat(
            [sentinel_mask, proposal_mask, slogan_mask, label_mask,
             dbpedia_mask],
            axis=1)
        node_repr = tf.concat(
            [sentinel_repr, proposal_repr, slogan_repr, label_repr,
             dbpedia_repr],
            axis=1)

        adjacency = utils.masked_softmax(
            node_to_node,
            mask=access_matrix * tf.expand_dims(node_mask, axis=1),
            dim=-1)
        adjacency = tf.multiply(
            adjacency,
            tf.multiply(tf.expand_dims(node_mask, 1),
                        tf.expand_dims(node_mask, 2)))

        for _ in range(2):
            node_repr = tf.matmul(adjacency, node_repr)

        image_repr = node_repr[:, 0, :]

        return image_repr, adjacency, node_to_node
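
Both `create_graph` variants share the same normalization recipe: a row-wise masked softmax over the edges permitted by `access_matrix` (with padded columns masked out), then zeroing the rows and columns of padded nodes, then one or more rounds of `tf.matmul(adjacency, node_repr)`. A toy NumPy version of the recipe (hypothetical sizes; the masked softmax behaves as assumed in Example #2):

import numpy as np

def masked_row_softmax(logits, mask):
  # Hypothetical stand-in for utils.masked_softmax(..., dim=-1).
  logits = logits + (1.0 - mask) * -1e9
  e = np.exp(logits - logits.max(axis=-1, keepdims=True))
  return e / e.sum(axis=-1, keepdims=True)

num_nodes, dims = 4, 8
node_mask = np.array([1.0, 1.0, 1.0, 0.0])       # the last node is padding
access_matrix = np.ones((num_nodes, num_nodes))  # which edges are allowed
node_to_node = np.random.randn(1, num_nodes, num_nodes)

adjacency = masked_row_softmax(
    node_to_node, access_matrix * node_mask[None, None, :])
adjacency = adjacency * node_mask[None, :, None] * node_mask[None, None, :]

node_repr = np.random.randn(1, num_nodes, dims)
for _ in range(2):                               # two rounds of propagation
  node_repr = adjacency @ node_repr
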
Example #6
    def _build_latent_network(self,
                              num_proposals,
                              proposal_features,
                              num_classes=20,
                              num_latent_factors=20,
                              proba_h_given_c=None):
        """Builds the Multiple Instance Detection Network.

    MIDN: An attention network.

    Args:
      num_proposals: A [batch] int tensor.
      proposal_features: A [batch, max_num_proposals, features_dims] 
        float tensor.
      num_classes: Number of classes.
      num_latent_factors: Number of latent factors `h`.
      proba_h_given_c: A [num_latent_factors, num_classes] float tensor.

    Returns:
      logits: A [batch, num_classes] float tensor.
      proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
      proba_h_given_c: A [num_latent_factors, num_classes] float tensor.
    """
        if proba_h_given_c is not None:
            assert proba_h_given_c.get_shape()[0].value == num_latent_factors

        with tf.name_scope('multi_instance_detection'):

            batch, max_num_proposals, _ = utils.get_tensor_shape(
                proposal_features)
            mask = tf.sequence_mask(num_proposals,
                                    maxlen=max_num_proposals,
                                    dtype=tf.float32)
            mask = tf.expand_dims(mask, axis=-1)

            # Calculates the values of following tensors:
            #   logits_c_given_r shape = [batch, max_num_proposals, num_classes].
            #   logits_r_given_h shape = [batch, max_num_proposals, num_hiddens].
            #   logits_h_given_c shape = [num_latent_factors, num_classes].

            with tf.variable_scope('midn'):
                logits_c_given_r = slim.fully_connected(
                    proposal_features,
                    num_outputs=num_classes,
                    activation_fn=None,
                    scope='proba_c_given_r')
                logits_r_given_h = slim.fully_connected(
                    proposal_features,
                    num_outputs=num_latent_factors,
                    activation_fn=None,
                    scope='proba_r_given_h')

                if proba_h_given_c is None:
                    logits_h_given_c = slim.fully_connected(
                        tf.diag(tf.ones([num_classes])),
                        num_outputs=num_latent_factors,
                        activation_fn=None,
                        scope='proba_h_given_c')
                    logits_h_given_c = tf.transpose(logits_h_given_c)
                    proba_h_given_c = tf.nn.softmax(logits_h_given_c, axis=0)
                    tf.summary.histogram('midn/logits_h_given_c',
                                         logits_h_given_c)

            # Marginalize `h` to get proba_r_given_c.

            logits_r_given_c = tf.matmul(
                tf.reshape(logits_r_given_h, [-1, num_latent_factors]),
                proba_h_given_c)
            logits_r_given_c = tf.reshape(
                logits_r_given_c, [batch, max_num_proposals, num_classes])

            proba_r_given_c = utils.masked_softmax(data=logits_r_given_c,
                                                   mask=mask,
                                                   dim=1)
            proba_r_given_c = tf.multiply(mask, proba_r_given_c)

            # Aggregates the logits.

            logits = tf.multiply(logits_c_given_r, proba_r_given_c)
            logits = tf.reduce_sum(logits, axis=1)

            tf.summary.histogram('midn/logits', logits)
            tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)
            tf.summary.histogram('midn/logits_r_given_h', logits_r_given_h)

        return logits, proba_r_given_c, proba_h_given_c
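
The marginalization over the latent factor `h` is a single matrix product: the per-proposal logits over latent factors are contracted against p(h|c), whose columns sum to one. A toy shape check (hypothetical dimensions):

import numpy as np

batch, max_num_proposals = 2, 5
num_latent_factors, num_classes = 8, 20

logits_r_given_h = np.random.randn(batch, max_num_proposals, num_latent_factors)
proba_h_given_c = np.random.rand(num_latent_factors, num_classes)
proba_h_given_c /= proba_h_given_c.sum(axis=0, keepdims=True)  # columns sum to 1

logits_r_given_c = (logits_r_given_h.reshape(-1, num_latent_factors)
                    @ proba_h_given_c)
logits_r_given_c = logits_r_given_c.reshape(batch, max_num_proposals, num_classes)
assert logits_r_given_c.shape == (batch, max_num_proposals, num_classes)
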
Example #7
    def _build_midn_network(self,
                            num_proposals,
                            proposal_features,
                            num_classes=20):
        """Builds the Multiple Instance Detection Network.

    MIDN: An attention network.

    Args:
      num_proposals: A [batch] int tensor.
      proposal_features: A [batch, max_num_proposals, features_dims] 
        float tensor.
      num_classes: Number of classes.

    Returns:
      class_logits: A [batch, num_classes] float tensor.
      proposal_scores: A [batch, max_num_proposals, num_classes] float tensor.
      proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
    """
        with tf.name_scope('multi_instance_detection'):

            batch, max_num_proposals, _ = utils.get_tensor_shape(
                proposal_features)
            mask = tf.sequence_mask(num_proposals,
                                    maxlen=max_num_proposals,
                                    dtype=tf.float32)
            mask = tf.expand_dims(mask, axis=-1)

            # Calculates the values of following tensors:
            #   logits_r_given_c shape = [batch, max_num_proposals, num_classes].
            #   logits_c_given_r shape = [batch, max_num_proposals, num_classes].

            with tf.variable_scope('midn'):
                logits_r_given_c = slim.fully_connected(
                    proposal_features,
                    num_outputs=num_classes,
                    activation_fn=None,
                    scope='proba_r_given_c')
                logits_c_given_r = slim.fully_connected(
                    proposal_features,
                    num_outputs=num_classes,
                    activation_fn=None,
                    scope='proba_c_given_r')

            # Calculates the detection scores.

            proba_r_given_c = utils.masked_softmax(
                data=tf.multiply(mask, logits_r_given_c), mask=mask, dim=1)
            proba_r_given_c = tf.multiply(mask, proba_r_given_c)

            # Aggregates the logits.

            class_logits = tf.multiply(logits_c_given_r, proba_r_given_c)
            class_logits = utils.masked_sum(data=class_logits,
                                            mask=mask,
                                            dim=1)

            proposal_scores = tf.multiply(tf.nn.sigmoid(class_logits),
                                          proba_r_given_c)
            #proposal_scores = tf.multiply(
            #    tf.nn.softmax(class_logits), proba_r_given_c)

            tf.summary.histogram('midn/logits_r_given_c', logits_r_given_c)
            tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)
            tf.summary.histogram('midn/proposal_scores', proposal_scores)
            tf.summary.histogram('midn/class_logits', class_logits)

        return (tf.squeeze(class_logits, axis=1), proposal_scores,
                proba_r_given_c)
Example #8
  def build_prediction(self, examples, **kwargs):
    """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: Additional keyword arguments, e.g., the specific prediction task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
    options = self._model_proto
    is_training = self._is_training

    # Image CNN features.

    inputs = examples[InputDataFields.image]
    image_features = model_utils.calc_cnn_feature(
        inputs, options.cnn_options, is_training=is_training)

    with slim.arg_scope(
        build_hyperparams(options.image_fc_hyperparams, is_training)):
      image_features = slim.fully_connected(
          image_features,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='image')

    # Text Global-Average-Pooling features.

    (image_id, num_captions, caption_strings,
     caption_lengths) = (examples[InputDataFields.image_id],
                         examples[InputDataFields.num_captions],
                         examples[InputDataFields.caption_strings],
                         examples[InputDataFields.caption_lengths])
    image_id = tf.string_to_number(image_id, out_type=tf.int64)

    (image_ids_gathered, caption_strings_gathered,
     caption_lengths_gathered) = model_utils.gather_in_batch_captions(
         image_id, num_captions, caption_strings, caption_lengths)

    (caption_token_ids_gathered,
     caption_features_gathered) = self._extract_text_feature(
         caption_strings_gathered,
         caption_lengths_gathered,
         vocabulary_list=self._open_vocabulary_list,
         initial_embedding=self._open_vocabulary_initial_embedding,
         embedding_dims=options.embedding_dims,
         trainable=options.train_word_embedding,
         max_norm=None)

    with slim.arg_scope(
        build_hyperparams(options.text_fc_hyperparams, is_training)):
      if visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
        attn = slim.fully_connected(
            caption_features_gathered,
            num_outputs=1,
            activation_fn=None,
            scope='caption_attn')
        attn = tf.squeeze(attn, axis=-1)
      caption_features_gathered = slim.fully_connected(
          caption_features_gathered,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='caption')

    oov = len(self._open_vocabulary_list)
    caption_masks_gathered = tf.logical_not(
        tf.equal(caption_token_ids_gathered, oov))
    caption_masks_gathered = tf.to_float(caption_masks_gathered)

    if visual_w2v_model_pb2.VisualW2vModel.GAP == options.text_feature_extractor:
      caption_features_gathered = utils.masked_avg_nd(
          data=caption_features_gathered, mask=caption_masks_gathered, dim=1)
      caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
    elif visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
      attn = utils.masked_softmax(attn, mask=caption_masks_gathered, dim=-1)
      caption_features_gathered = tf.multiply(
          tf.expand_dims(attn, axis=-1), caption_features_gathered)
      caption_features_gathered = utils.masked_sum_nd(
          caption_features_gathered, mask=caption_masks_gathered, dim=1)
      caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
    else:
      raise ValueError('Invalid text feature extractor.')

    # Export token embeddings.

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      _, token_embeddings = self._encode_tokens(
          tokens=tf.constant(self._open_vocabulary_list),
          embedding_dims=options.embedding_dims,
          vocabulary_list=self._open_vocabulary_list,
          initial_embedding=self._open_vocabulary_initial_embedding,
          trainable=options.train_word_embedding)
      with slim.arg_scope(
          build_hyperparams(options.text_fc_hyperparams, is_training)):
        token_embeddings = slim.fully_connected(
            token_embeddings,
            num_outputs=options.shared_dims,
            activation_fn=None,
            scope='caption')
    var_to_assign = tf.get_variable(
        name='weights_proj',
        shape=[len(self._open_vocabulary_list), options.shared_dims])
    var_to_assign = tf.assign(var_to_assign, token_embeddings)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, var_to_assign)

    tf.summary.histogram('token_embedding_proj', token_embeddings)

    # Compute similarity.

    similarity = model_utils.calc_pairwise_similarity(
        feature_a=image_features,
        feature_b=caption_features_gathered,
        l2_normalize=True,
        dropout_keep_prob=options.cross_modal_dropout_keep_prob,
        is_training=is_training)

    predictions = {
        VisualW2vPredictions.image_id: image_id,
        VisualW2vPredictions.image_ids_gathered: image_ids_gathered,
        VisualW2vPredictions.similarity: similarity,
        VisualW2vPredictions.word2vec: var_to_assign,
    }
    return predictions
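
For the GAP branch, `utils.masked_avg_nd` averages the token features over valid (non-OOV) positions only, again keeping the reduced dimension for the caller to squeeze. A plausible sketch consistent with that call (an assumption about the helper, not its actual implementation):

import tensorflow as tf

def masked_avg_nd(data, mask, dim=1):
  # data: [batch, T, dims]; mask: [batch, T] with 1 for valid tokens.
  mask = tf.expand_dims(mask, axis=-1)
  total = tf.reduce_sum(data * mask, axis=dim, keepdims=True)
  count = tf.maximum(tf.reduce_sum(mask, axis=dim, keepdims=True), 1e-8)
  return total / count  # [batch, 1, dims]
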
Example #9
    def _predict_similarity(self, examples):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        # Extracts input data fields.

        (image, image_id, num_captions, caption_strings,
         caption_lengths) = (examples[InputDataFields.image],
                             examples[InputDataFields.image_id],
                             examples[InputDataFields.num_captions],
                             examples[InputDataFields.caption_strings],
                             examples[InputDataFields.caption_lengths])

        image_feature = self._encode_images(
            image,
            cnn_name=options.cnn_name,
            cnn_trainable=options.cnn_trainable,
            cnn_weight_decay=options.cnn_weight_decay,
            cnn_feature_map=options.cnn_feature_map,
            cnn_dropout_keep_prob=options.cnn_dropout_keep_prob,
            cnn_checkpoint=options.cnn_checkpoint,
            cnn_scope=GAPVariableScopes.cnn,
            is_training=is_training)

        (image_ids_gathered, caption_strings_gathered,
         caption_lengths_gathered) = model_utils.gather_in_batch_captions(
             image_id, num_captions, caption_strings, caption_lengths)

        # Extract image feature, shape =
        #   [batch, feature_height * feature_width, common_dimensions].

        with tf.name_scope(OperationNames.image_model):
            image_feature = self._project_images(
                image_feature,
                common_dimensions=options.common_dimensions,
                scope=GAPVariableScopes.image_proj,
                hyperparams=options.image_proj_hyperparams,
                is_training=is_training)

            (batch, feature_height, feature_width,
             common_dimensions) = utils.get_tensor_shape(image_feature)
            image_feature = tf.reshape(image_feature,
                                       [batch, -1, common_dimensions])

        # Extract caption feature, shape =
        #   [num_captions_in_batch, max_caption_length, common_dimensions].

        vocabulary_list = self._read_vocabulary(options.vocabulary_file)
        tf.logging.info("Read a vocabulary with %i words.",
                        len(vocabulary_list))

        with tf.name_scope(OperationNames.text_model):
            caption_feature = self._encode_captions(
                caption_strings_gathered,
                vocabulary_list=vocabulary_list,
                common_dimensions=options.common_dimensions,
                scope=GAPVariableScopes.word_embedding,
                is_training=is_training)

            (num_captions_in_batch, max_caption_length,
             common_dimensions) = utils.get_tensor_shape(caption_feature)

        # Calculates similarity matrix, shape=[batch, num_captions_in_batch].

        with tf.name_scope(OperationNames.calc_pairwise_similarity):

            # Compute dot-product similarity.

            similarity = self._calc_pairwise_similarity(
                image_feature=tf.nn.l2_normalize(image_feature, axis=-1),
                text_feature=tf.nn.l2_normalize(caption_feature, axis=-1),
                dropout_keep_prob=options.dropout_keep_prob,
                is_training=is_training)

            word_mask = tf.sequence_mask(caption_lengths_gathered,
                                         maxlen=max_caption_length,
                                         dtype=tf.float32)
            similarity = similarity * tf.expand_dims(
                tf.expand_dims(word_mask, 0), 0)

            if options.use_saliency_score:

                # Predict saliency score.
                #   image_saliency shape = [batch, num_regions].
                #   caption_saliency shape = [num_captions_in_batch, max_caption_length].

                image_saliency = self._calc_saliency_score(
                    image_feature,
                    scope=GAPVariableScopes.image_saliency,
                    hyperparams=options.image_saliency_hyperparams,
                    is_training=is_training)

                if options.l2_norm_for_word_saliency:
                    caption_feature = tf.nn.l2_normalize(caption_feature,
                                                         axis=-1)
                caption_saliency = self._calc_saliency_score(
                    caption_feature,
                    scope=GAPVariableScopes.word_saliency,
                    hyperparams=options.word_saliency_hyperparams,
                    is_training=is_training)

                # Apply masked attention.

                image_attention = tf.nn.softmax(image_saliency, axis=-1)
                caption_attention = utils.masked_softmax(caption_saliency,
                                                         word_mask,
                                                         dim=-1)

                tf.summary.scalar(
                    'loss/image_attention_max',
                    tf.reduce_mean(tf.reduce_max(image_attention, axis=1)))
                tf.summary.scalar(
                    'loss/image_attention_min',
                    tf.reduce_mean(tf.reduce_min(image_attention, axis=1)))
                tf.summary.scalar(
                    'loss/caption_attention_max',
                    tf.reduce_mean(
                        utils.masked_maximum(caption_attention,
                                             word_mask,
                                             dim=1)))
                tf.summary.scalar(
                    'loss/caption_attention_min',
                    tf.reduce_mean(
                        utils.masked_minimum(caption_attention,
                                             word_mask,
                                             dim=1)))

                if options.image_regularizer_weight > 0.0:
                    log_image_attention = tf.log(
                        tf.maximum(image_attention, _LOG_SMALL_NUMBER))
                    loss = tf.multiply(
                        options.image_regularizer_weight,
                        tf.reduce_mean(
                            tf.reduce_sum(log_image_attention, axis=1)))
                    tf.losses.add_loss(loss)
                    tf.summary.scalar('loss/image_attention_log_loss', loss)

                if options.text_regularizer_weight > 0.0:
                    log_caption_attention = tf.log(
                        tf.maximum(caption_attention, _LOG_SMALL_NUMBER))
                    loss = tf.multiply(
                        options.text_regularizer_weight,
                        tf.reduce_mean(
                            tf.reduce_sum(log_caption_attention * word_mask,
                                          axis=1)))
                    tf.losses.add_loss(loss)
                    tf.summary.scalar('loss/caption_attention_log_loss', loss)

                saliency_mask = self._calc_pairwise_similarity(
                    image_feature=tf.expand_dims(image_attention, -1),
                    text_feature=tf.expand_dims(caption_attention, -1),
                    dropout_keep_prob=options.dropout_keep_prob,
                    is_training=is_training)

                # Compute weighted sum.

                similarity = tf.reduce_sum(similarity * saliency_mask,
                                           axis=[1, 3])

                self.visualize(
                    image,
                    tf.reshape(image_saliency,
                               [-1, feature_height, feature_width]))
                tf.summary.histogram('image_saliency', image_saliency)
                tf.summary.histogram('text_saliency', caption_saliency)

            else:

                # Simple Global Average Pooling.

                similarity = tf.div(
                    tf.reduce_sum(similarity, axis=[1, 3]),
                    _SMALL_NUMBER + tf.cast(
                        feature_width * feature_height *
                        caption_lengths_gathered, tf.float32))

        predictions = {
            GAPPredictions.image_id: image_id,
            GAPPredictions.image_ids_gathered: image_ids_gathered,
            GAPPredictions.similarity: similarity,
        }
        return predictions
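
With `use_saliency_score` enabled, the final similarity is a doubly attention-weighted sum over image regions and caption words: similarity[b, c] = sum over r, t of image_attention[b, r] * caption_attention[c, t] * similarity[b, r, c, t]. A toy NumPy equivalent of the outer-product-then-reduce step (hypothetical shapes):

import numpy as np

batch, num_regions, num_captions, max_len = 2, 3, 4, 5
similarity = np.random.randn(batch, num_regions, num_captions, max_len)
image_attention = np.random.rand(batch, num_regions)
caption_attention = np.random.rand(num_captions, max_len)

# Same as multiplying by `saliency_mask` and reducing over axes [1, 3].
weighted = np.einsum('brct,br,ct->bc',
                     similarity, image_attention, caption_attention)
assert weighted.shape == (batch, num_captions)
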
Example #10
    def build_prediction(self, examples, **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: Additional keyword arguments, e.g., the specific prediction task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        predictions = {
            DetectionResultFields.num_proposals: num_proposals,
            DetectionResultFields.proposal_boxes: proposals,
        }

        # FRCNN.
        #   `proposal_features` shape = [batch, max_num_proposals, feature_dims].
        #   `proposal_masks` shape = [batch, max_num_proposals].

        proposal_features = self._extract_frcnn_feature(
            inputs, num_proposals, proposals)

        batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
        proposal_masks = tf.sequence_mask(num_proposals,
                                          maxlen=max_num_proposals,
                                          dtype=tf.float32)

        # Build the SADDN predictions.
        #   `logits_c_given_r` shape = [batch, max_num_proposals, num_classes].
        #   `logits_r_given_c` shape = [batch, max_num_proposals, num_classes].

        with tf.variable_scope('SADDN'), \
            slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):

            logits_c_given_r = slim.fully_connected(
                proposal_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='proba_c_given_r')
            logits_r_given_c = slim.fully_connected(
                proposal_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='proba_r_given_c')

            proba_c_given_r = tf.nn.softmax(logits_c_given_r)
            proba_r_given_c = utils.masked_softmax(
                data=logits_r_given_c,
                mask=tf.expand_dims(proposal_masks, axis=-1),
                dim=1)
            proba_r_given_c = tf.multiply(
                tf.expand_dims(proposal_masks, axis=-1), proba_r_given_c)

        tf.summary.image('inputs', inputs, max_outputs=10)
        model_utils.visl_proposals(inputs,
                                   num_proposals,
                                   proposals,
                                   name='proposals',
                                   top_k=2000)

        # SADDN iterations.

        logits_at_0 = utils.masked_avg_nd(data=logits_c_given_r,
                                          mask=proposal_masks,
                                          dim=1)
        logits_at_0 = tf.squeeze(logits_at_0, axis=1)

        logits_at_i = logits_at_0
        for i in range(options.saddn_iterations):
            # Infer proba_c, then marginalize to get the proposal weights.

            proba_c_at_i = tf.nn.softmax(logits_at_i)

            proba_r_at_i = tf.multiply(tf.expand_dims(proba_c_at_i, axis=1),
                                       proba_r_given_c)
            proba_r_at_i = tf.reduce_sum(proba_r_at_i, axis=-1, keepdims=True)

            # Infer the detection results at iter `i`.

            (num_detections_at_i, detection_boxes_at_i, detection_scores_at_i,
             detection_classes_at_i) = model_utils.post_process(
                 proposals, proba_r_at_i * proba_c_given_r)

            (predictions[StackedAttnPredictions.logits + '_at_{}'.format(i)],
             predictions[DetectionResultFields.num_detections +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_boxes +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_scores +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_classes +
                         '_at_{}'.format(i)]) = (logits_at_i,
                                                 num_detections_at_i,
                                                 detection_boxes_at_i,
                                                 detection_scores_at_i,
                                                 detection_classes_at_i)

            model_utils.visl_proposals_top_k(
                inputs,
                num_detections_at_i,
                detection_boxes_at_i,
                detection_scores_at_i,
                tf.gather(self._vocabulary_list,
                          tf.to_int32(detection_classes_at_i - 1)),
                name='detection_{}'.format(i))

            # `logits_at_i` for the next iteration.

            logits_at_i = tf.multiply(proba_r_at_i, logits_c_given_r)
            logits_at_i = tf.reduce_sum(logits_at_i, axis=1)

        return predictions
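
Each SADDN iteration first marginalizes the current class posterior into per-proposal weights, p(r) = sum_c p(c) * p(r|c), and then re-aggregates the per-proposal class logits with those weights. A toy check of the two contractions (hypothetical shapes):

import numpy as np

batch, max_num_proposals, num_classes = 2, 3, 20
proba_c = np.random.rand(batch, num_classes)         # p(c) at iteration i
proba_r_given_c = np.random.rand(batch, max_num_proposals, num_classes)
logits_c_given_r = np.random.randn(batch, max_num_proposals, num_classes)

# proba_r_at_i, kept as [batch, max_num_proposals, 1].
proba_r = np.einsum('bc,brc->br', proba_c, proba_r_given_c)[..., None]

# The next iteration's logits: sum over proposals of p(r) * logits(c|r).
logits_next = (proba_r * logits_c_given_r).sum(axis=1)
assert logits_next.shape == (batch, num_classes)
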
Example #11
File: mil_model.py  Project: yekeren/WSOD
    def _build_midn_network(self,
                            num_proposals,
                            proposal_feature,
                            num_classes=20,
                            attention_normalizer=1.0,
                            attention_tanh=False,
                            attention_scale_factor=5.0):
        """Builds the Multiple Instance Detection Network.

    MIDN: An attention network.

    Args:
      num_proposals: A [batch] int tensor.
      proposal_feature: A [batch, max_num_proposals, feature_dims] 
        float tensor.
      num_classes: Number of classes.
      attention_normalizer: Scalar divisor applied to both branch logits.
      attention_tanh: If True, squash the branch logits with a scaled tanh.
      attention_scale_factor: Scale applied after the tanh squashing.

    Returns:
      proposal_scores: A [batch, max_num_proposals, num_classes] float tensor.
    """
        with tf.name_scope('multi_instance_detection'):

            _, max_num_proposals, _ = utils.get_tensor_shape(proposal_feature)

            # branch1/branch2 shape = [batch, max_num_proposals, num_classes].
            branch1 = slim.fully_connected(proposal_feature,
                                           num_outputs=num_classes,
                                           activation_fn=None,
                                           scope='midn/branch1')
            branch2 = slim.fully_connected(proposal_feature,
                                           num_outputs=num_classes,
                                           activation_fn=None,
                                           scope='midn/branch2')
            branch1 = branch1 / attention_normalizer
            branch2 = branch2 / attention_normalizer

            if attention_tanh:
                branch1 = attention_scale_factor * tf.nn.tanh(branch1)
                branch2 = attention_scale_factor * tf.nn.tanh(branch2)

            proba_c_given_r = tf.nn.softmax(branch1, axis=2)

            mask = tf.sequence_mask(num_proposals,
                                    maxlen=max_num_proposals,
                                    dtype=tf.float32)
            mask = tf.expand_dims(mask, axis=-1)
            proba_r_given_c = utils.masked_softmax(data=branch2,
                                                   mask=mask,
                                                   dim=1)

            proposal_scores = tf.multiply(proba_c_given_r, proba_r_given_c)

            # Alternative normalization: divide branch1 by np.sqrt(num_classes)
            # and branch2 by np.sqrt(max_num_proposals) before the softmaxes.

        tf.summary.histogram('midn/branch1', branch1)
        tf.summary.histogram('midn/branch2', branch2)
        tf.summary.histogram('midn/proposal_scores', proposal_scores)

        return proposal_scores
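
The two branches factorize the proposal score as p(c|r) * p(r|c): branch1 is softmax-normalized over classes, branch2 over the unpadded proposals. A toy NumPy check of the resulting per-class scores (hypothetical shapes; the proposal mask is omitted for brevity):

import numpy as np

def softmax(x, axis):
  e = np.exp(x - x.max(axis=axis, keepdims=True))
  return e / e.sum(axis=axis, keepdims=True)

batch, max_num_proposals, num_classes = 2, 3, 4
branch1 = np.random.randn(batch, max_num_proposals, num_classes)
branch2 = np.random.randn(batch, max_num_proposals, num_classes)

proba_c_given_r = softmax(branch1, axis=2)  # normalize over classes
proba_r_given_c = softmax(branch2, axis=1)  # normalize over proposals

proposal_scores = proba_c_given_r * proba_r_given_c
class_scores = proposal_scores.sum(axis=1)  # [batch, num_classes]

# Each class score is a convex combination of p(c|r) values, hence <= 1.
assert np.all(class_scores <= 1.0 + 1e-6)
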