Example No. 1
    def build_loss(self, predictions, **kwargs):
        """Build tf graph to compute loss.

    Args:
      predictions: dict of prediction results keyed by name.

    Returns:
      loss_dict: dict of loss tensors keyed by name.
    """
        options = self._model_proto

        (image_id, image_ids_gathered,
         similarity) = (predictions[_FIELD_IMAGE_ID],
                        predictions[_FIELD_IMAGE_IDS_GATHERED],
                        predictions[_FIELD_SIMILARITY])

        distance = 1.0 - similarity

        pos_mask = tf.cast(
            tf.equal(tf.expand_dims(image_id, axis=1),
                     tf.expand_dims(image_ids_gathered, axis=0)), tf.float32)
        neg_mask = 1.0 - pos_mask

        if options.triplet_ap_use_avg:
            distance_ap = utils.masked_avg(distance, pos_mask)
        else:
            distance_ap = utils.masked_maximum(distance, pos_mask)

        # negatives_outside: smallest D_an where D_an > D_ap.

        mask = tf.cast(tf.greater(distance, distance_ap), tf.float32)
        mask = mask * neg_mask
        negatives_outside = utils.masked_minimum(distance, mask)

        # negatives_inside: largest D_an.

        negatives_inside = utils.masked_maximum(distance, neg_mask)

        # distance_an: the semihard negatives.

        mask_condition = tf.greater(tf.reduce_sum(mask, axis=1, keepdims=True),
                                    0.0)

        distance_an = tf.where(mask_condition, negatives_outside,
                               negatives_inside)

        # Triplet loss.

        losses = tf.maximum(distance_ap - distance_an + options.triplet_margin,
                            0)

        return {
            'triplet_loss': tf.reduce_mean(losses),
        }
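The semi-hard mining above relies on the masked reduction helpers in utils. Their implementation is not shown on this page; a minimal sketch that is consistent with the expected values in Example No. 2 below (shift each row by its extremum so that masked-out entries can never win the reduction) could look like this. The function bodies are an assumption, not the repository's code:

import tensorflow as tf

def masked_maximum(data, mask, dim=1):
    """Row-wise max over `dim`, ignoring entries where mask == 0 (assumed sketch)."""
    # Shift by the row minimum so masked-out entries cannot exceed kept ones;
    # with an all-zero mask this falls back to the row minimum.
    axis_minimums = tf.reduce_min(data, dim, keepdims=True)
    return tf.reduce_max(
        (data - axis_minimums) * mask, dim, keepdims=True) + axis_minimums

def masked_minimum(data, mask, dim=1):
    """Row-wise min over `dim`, ignoring entries where mask == 0 (assumed sketch)."""
    # Mirror image of masked_maximum: shift by the row maximum instead.
    axis_maximums = tf.reduce_max(data, dim, keepdims=True)
    return tf.reduce_min(
        (data - axis_maximums) * mask, dim, keepdims=True) + axis_maximums

def masked_avg(data, mask, dim=1):
    """Row-wise mean over the entries where mask == 1 (assumed sketch)."""
    total = tf.reduce_sum(data * mask, dim, keepdims=True)
    count = tf.maximum(tf.reduce_sum(mask, dim, keepdims=True), 1e-12)
    return total / count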
Example No. 2
    def test_masked_maximum(self):
        tf.reset_default_graph()

        data = tf.placeholder(tf.float32, shape=[None, None])
        mask = tf.placeholder(tf.float32, shape=[None, None])
        masked_maximums = utils.masked_maximum(data, mask)

        with self.test_session() as sess:
            result = sess.run(masked_maximums,
                              feed_dict={
                                  data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                                         [-2.0, -1.0, -3.0, -5.0, -4.0]],
                                  mask: [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]
                              })
            self.assertAllClose(result, [[2.0], [-1.0]])

            result = sess.run(masked_maximums,
                              feed_dict={
                                  data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                                         [-2.0, -1.0, -3.0, -5.0, -4.0]],
                                  mask: [[1, 1, 0, 1, 1], [0, 0, 1, 1, 1]]
                              })
            self.assertAllClose(result, [[1.0], [-3.0]])

            result = sess.run(masked_maximums,
                              feed_dict={
                                  data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                                         [-2.0, -1.0, -3.0, -5.0, -4.0]],
                                  mask: [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
                              })
            self.assertAllClose(result, [[-2.0], [-5.0]])
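utils.masked_minimum is used symmetrically in Example No. 1 to pick negatives_outside. A companion test in the same style might look like the sketch below; the expected values assume masked_minimum mirrors masked_maximum, i.e. an all-zero mask falls back to the row maximum, which is an assumption rather than documented behavior:

    def test_masked_minimum(self):
        tf.reset_default_graph()

        data = tf.placeholder(tf.float32, shape=[None, None])
        mask = tf.placeholder(tf.float32, shape=[None, None])
        masked_minimums = utils.masked_minimum(data, mask)

        with self.test_session() as sess:
            # Only the unmasked entries take part in the minimum.
            result = sess.run(masked_minimums,
                              feed_dict={
                                  data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                                         [-2.0, -1.0, -3.0, -5.0, -4.0]],
                                  mask: [[0, 1, 1, 0, 1], [1, 1, 0, 0, 1]]
                              })
            self.assertAllClose(result, [[0.0], [-4.0]])

            # Assumed fallback: an all-zero mask yields the row maximum.
            result = sess.run(masked_minimums,
                              feed_dict={
                                  data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                                         [-2.0, -1.0, -3.0, -5.0, -4.0]],
                                  mask: [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
                              })
            self.assertAllClose(result, [[2.0], [-1.0]])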
Example No. 3
    def build_prediction(self, examples, **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      prediction_task: the specific prediction task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        # Text Global-Maximum-Pooling features.

        (caption_string,
         caption_length) = (examples[InputDataFields.concat_caption_string],
                            examples[InputDataFields.concat_caption_length])

        (caption_token_ids, caption_features) = self._extract_text_feature(
            caption_string,
            caption_length,
            vocabulary_list=self._open_vocabulary_list,
            initial_embedding=self._open_vocabulary_initial_embedding,
            embedding_dims=options.embedding_dims,
            trainable=options.train_word_embedding,
            max_norm=None)

        with slim.arg_scope(
                build_hyperparams(options.text_fc_hyperparams, is_training)):
            caption_features = slim.fully_connected(
                caption_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='caption')

        oov = len(self._open_vocabulary_list)
        caption_masks = tf.to_float(
            tf.logical_not(tf.equal(caption_token_ids, oov)))

        # logits shape = [batch, num_classes].

        logits = utils.masked_maximum(data=caption_features,
                                      mask=tf.expand_dims(caption_masks,
                                                          axis=-1),
                                      dim=1)
        logits = tf.squeeze(logits, axis=1)

        predictions = {
            TextClassificationPredictions.vocab:
            tf.constant(self._vocabulary_list),
            TextClassificationPredictions.logits: logits,
        }
        return predictions
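The pooled logits keep only in-vocabulary tokens: the lookup table maps unknown words to the single OOV bucket at index len(self._open_vocabulary_list), and those positions are zeroed out of caption_masks so they cannot dominate the max-pooling. A small standalone illustration of that pooling step (hypothetical values; utils refers to the same module used throughout these examples):

# Per-token class scores, shape [batch=1, num_tokens=3, num_classes=2].
caption_features = tf.constant(
    [[[0.2, -1.0],
      [5.0, 9.0],    # an OOV token with spuriously large scores.
      [0.7, 0.3]]], dtype=tf.float32)

# 1.0 for in-vocabulary tokens, 0.0 for the OOV bucket.
caption_masks = tf.constant([[1.0, 0.0, 1.0]])

logits = utils.masked_maximum(data=caption_features,
                              mask=tf.expand_dims(caption_masks, axis=-1),
                              dim=1)
logits = tf.squeeze(logits, axis=1)  # [[0.7, 0.3]]: the OOV row is ignored.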
Example No. 4
  def _predict(self,
               text_strings,
               text_lengths,
               vocabulary_list,
               vocabulary_word_embedding,
               hidden_units,
               output_units,
               dropout_keep_proba=1.0,
               regularizer=1e-5,
               is_training=False):
    """Predicts labels using the texts.

    Args:
      text_strings: A [batch, num_tokens] string tensor.
      text_lengths: A [batch] int tensor.
      vocabulary_list: A list of string of length vocab_size.
      vocabulary_word_embedding: A [vocab_size, embedding_dims] numpy array.
    """
    # Initial embeddings.

    init_width = 0.03
    oov_emb = init_width * (
        np.random.rand(1, vocabulary_word_embedding.shape[-1]) * 2 - 1)
    embedding_array_data = np.concatenate([vocabulary_word_embedding, oov_emb],
                                          axis=0)

    # Word embedding process.

    with tf.name_scope('word_embedding'):
      table = tf.contrib.lookup.index_table_from_tensor(
          vocabulary_list, num_oov_buckets=1)
      embedding_weights = tf.get_variable(
          name='weights',
          initializer=embedding_array_data.astype(np.float32),
          trainable=False)  # Freeze the word embedding.

      token_ids = table.lookup(text_strings)
      token_embs = tf.nn.embedding_lookup(
          embedding_weights, token_ids, max_norm=None)

    # Multilayer perceptron.

    with tf.variable_scope('text_classifier'):

      oov = len(vocabulary_list)
      masks = tf.to_float(tf.logical_not(tf.equal(token_ids, oov)))

      hiddens = slim.fully_connected(
          token_embs,
          num_outputs=hidden_units,
          activation_fn=None,
          trainable=is_training,
          weights_regularizer=tf.contrib.layers.l2_regularizer(regularizer),
          scope='layer1')
      hiddens = utils.masked_maximum(
          data=hiddens, mask=tf.expand_dims(masks, axis=-1), dim=1)
      hiddens = tf.squeeze(hiddens, axis=1)
      hiddens = tf.nn.relu(hiddens)
      hiddens = slim.dropout(
          hiddens, dropout_keep_proba, is_training=is_training)

      logits = slim.fully_connected(
          hiddens,
          num_outputs=output_units,
          activation_fn=None,
          trainable=is_training,
          weights_regularizer=tf.contrib.layers.l2_regularizer(regularizer),
          scope='layer2')
    return logits
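A typical way to wire _predict into a training graph is sketched below. The surrounding names (model, vocabulary_list, word_embedding, num_classes) and the sigmoid cross-entropy loss are assumptions for illustration, not the repository's code; the table initializer is needed because index_table_from_tensor creates a lookup table:

text_strings = tf.placeholder(tf.string, shape=[None, None])
text_lengths = tf.placeholder(tf.int32, shape=[None])
labels = tf.placeholder(tf.float32, shape=[None, num_classes])

logits = model._predict(
    text_strings,
    text_lengths,
    vocabulary_list=vocabulary_list,
    vocabulary_word_embedding=word_embedding,  # [vocab_size, dims] numpy array.
    hidden_units=400,                          # Hypothetical hyperparameters.
    output_units=num_classes,
    dropout_keep_proba=0.5,
    is_training=True)

# Multi-label classification: one sigmoid per class (assumed loss).
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))

with tf.Session() as sess:
  sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
  # Feed batches of tokenized captions and their labels, then run a train op.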
Example No. 5
  def extract_labels(self, examples):
    """Extracts the pseudo labels.
    Args:
      examples: A dictionary involving image-level annotations.
    Returns:
      labels: A [batch, num_classes] tensor denoting the presence of classes.
    """
    init_width = 0.03
    embedding_dims = self._open_vocabulary_word_embedding.shape[-1]

    classes_to_match = _replace_class_names(self._classes)

    # Check if all classes appear in the open-vocabulary.
    for class_name in classes_to_match:
      if not class_name in self._open_vocabulary_list:
        raise ValueError('Class %s has no vector representation.' % class_name)

    with tf.name_scope('word_vector_match_extractor'):

      # Create hash table and word embedding weights.

      table = tf.contrib.lookup.index_table_from_tensor(
          self._open_vocabulary_list, num_oov_buckets=1)
      oov_emb = init_width * (np.random.rand(1, embedding_dims) * 2 - 1)
      embedding_array_data = np.concatenate(
          [self._open_vocabulary_word_embedding, oov_emb], axis=0)
      embedding_weights = tf.get_variable(
          name='weights',
          initializer=embedding_array_data.astype(np.float32),
          trainable=False)  # Freeze the word embedding.

      # Lookup to get the class/token embeddings.

      class_embs = tf.nn.embedding_lookup(
          embedding_weights,
          table.lookup(tf.constant(classes_to_match)),
          max_norm=None)
      token_ids = table.lookup(examples[InputDataFields.concat_caption_string])
      token_embs = tf.nn.embedding_lookup(
          embedding_weights, token_ids, max_norm=None)

      # Compute token-to-class similarity and apply max-pooling.
      # Max-pooling: i.e., treat the top-1 as a match.
      #   similarity shape = [batch, max_num_tokens, num_classes].
      #   similarity_pooled shape = [batch, num_classes]

      batch, num_tokens = utils.get_tensor_shape(
          examples[InputDataFields.concat_caption_string])

      similarity = self._cosine_similarity(class_embs, token_embs)

      oov = len(self._open_vocabulary_list)
      mask = tf.not_equal(token_ids, oov)
      similarity_pooled = utils.masked_maximum(
          data=similarity,
          mask=tf.expand_dims(tf.to_float(mask), axis=-1),
          dim=1)
      similarity_pooled = tf.squeeze(similarity_pooled, axis=1)

      labels_most_similar = tf.one_hot(
          indices=tf.argmax(similarity_pooled, axis=-1),
          depth=self.num_classes,
          dtype=tf.float32)
      labels_most_similar = tf.where(
          tf.reduce_any(mask, axis=-1),
          x=labels_most_similar,
          y=tf.zeros(shape=[batch, self.num_classes]))

      # Consider the exact match.

      labels_exact_match = _match_labels(
          class_texts=examples[InputDataFields.concat_caption_string],
          vocabulary_list=classes_to_match)

      return tf.where(
          tf.reduce_any(labels_exact_match > 0, axis=-1),
          x=labels_exact_match,
          y=labels_most_similar)
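The _cosine_similarity helper is not shown in this example. Given the shapes implied above (class_embs of shape [num_classes, dims], token_embs of shape [batch, num_tokens, dims], and a [batch, max_num_tokens, num_classes] similarity), a minimal sketch would l2-normalize both operands and take pairwise dot products; the body below is an assumption, not the repository's implementation:

  def _cosine_similarity(self, class_embs, token_embs):
    """Assumed sketch: cosine similarity between every token and every class.

    Args:
      class_embs: A [num_classes, dims] float tensor.
      token_embs: A [batch, num_tokens, dims] float tensor.

    Returns:
      A [batch, num_tokens, num_classes] float tensor of cosine similarities.
    """
    class_embs = tf.nn.l2_normalize(class_embs, axis=-1)
    token_embs = tf.nn.l2_normalize(token_embs, axis=-1)
    return tf.einsum('btd,cd->btc', token_embs, class_embs)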