def build_loss(self, predictions, **kwargs):
  """Builds tf graph to compute loss.

  Args:
    predictions: dict of prediction results keyed by name.

  Returns:
    loss_dict: dict of loss tensors keyed by name.
  """
  options = self._model_proto

  (image_id, image_ids_gathered,
   similarity) = (predictions[_FIELD_IMAGE_ID],
                  predictions[_FIELD_IMAGE_IDS_GATHERED],
                  predictions[_FIELD_SIMILARITY])

  distance = 1.0 - similarity

  pos_mask = tf.cast(
      tf.equal(
          tf.expand_dims(image_id, axis=1),
          tf.expand_dims(image_ids_gathered, axis=0)), tf.float32)
  neg_mask = 1.0 - pos_mask

  if options.triplet_ap_use_avg:
    distance_ap = utils.masked_avg(distance, pos_mask)
  else:
    distance_ap = utils.masked_maximum(distance, pos_mask)

  # negatives_outside: smallest D_an where D_an > D_ap.
  mask = tf.cast(tf.greater(distance, distance_ap), tf.float32)
  mask = mask * neg_mask
  negatives_outside = utils.masked_minimum(distance, mask)

  # negatives_inside: largest D_an.
  negatives_inside = utils.masked_maximum(distance, neg_mask)

  # distance_an: the semihard negatives.
  mask_condition = tf.greater(
      tf.reduce_sum(mask, axis=1, keepdims=True), 0.0)
  distance_an = tf.where(mask_condition, negatives_outside, negatives_inside)

  # Triplet loss.
  losses = tf.maximum(distance_ap - distance_an + options.triplet_margin, 0)

  return {
      'triplet_loss': tf.reduce_mean(losses),
  }
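# NOTE (sketch): the loss above relies on utils.masked_avg, utils.masked_minimum,
# and utils.masked_maximum, whose implementations are not shown in this excerpt.
# The two helpers below are a plausible reconstruction, following the
# shift-by-extremum formulation used in
# tf.contrib.losses.metric_learning.triplet_semihard_loss; treat them as
# assumptions about the utils module, not the repository's actual code.
# A matching sketch of utils.masked_maximum appears after the unit test below.
import tensorflow as tf


def masked_minimum(data, mask, dim=1):
  """Row-wise min of `data` along `dim` over entries where mask > 0 (sketch)."""
  axis_maximums = tf.reduce_max(data, dim, keepdims=True)
  return tf.reduce_min(
      (data - axis_maximums) * mask, dim, keepdims=True) + axis_maximums


def masked_avg(data, mask, dim=1):
  """Row-wise mean of `data` along `dim` over entries where mask > 0 (sketch).

  The small epsilon guards against division by zero when a row has no
  positive mask entries; this is an assumed convention.
  """
  sums = tf.reduce_sum(data * mask, dim, keepdims=True)
  counts = tf.maximum(tf.reduce_sum(mask, dim, keepdims=True), 1e-8)
  return sums / counts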
def test_masked_maximum(self):
  tf.reset_default_graph()

  data = tf.placeholder(tf.float32, shape=[None, None])
  mask = tf.placeholder(tf.float32, shape=[None, None])
  masked_maximums = utils.masked_maximum(data, mask)

  with self.test_session() as sess:
    result = sess.run(
        masked_maximums,
        feed_dict={
            data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                   [-2.0, -1.0, -3.0, -5.0, -4.0]],
            mask: [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]
        })
    self.assertAllClose(result, [[2.0], [-1.0]])

    result = sess.run(
        masked_maximums,
        feed_dict={
            data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                   [-2.0, -1.0, -3.0, -5.0, -4.0]],
            mask: [[1, 1, 0, 1, 1], [0, 0, 1, 1, 1]]
        })
    self.assertAllClose(result, [[1.0], [-3.0]])

    result = sess.run(
        masked_maximums,
        feed_dict={
            data: [[-2.0, 1.0, 2.0, -1.0, 0.0],
                   [-2.0, -1.0, -3.0, -5.0, -4.0]],
            mask: [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
        })
    self.assertAllClose(result, [[-2.0], [-5.0]])
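# NOTE (sketch): one implementation of utils.masked_maximum that satisfies the
# expectations in the test above (including the fall-back to the row minimum
# when the mask is all zeros) is the shift-by-minimum formulation below. It is
# an assumed reconstruction, not necessarily the exact code under test.
import tensorflow as tf


def masked_maximum(data, mask, dim=1):
  """Row-wise max of `data` along `dim` over entries where mask > 0 (sketch)."""
  # Shift by the row minimum so that masked-out entries (multiplied by 0)
  # can never win the max; then shift back.
  axis_minimums = tf.reduce_min(data, dim, keepdims=True)
  return tf.reduce_max(
      (data - axis_minimums) * mask, dim, keepdims=True) + axis_minimums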
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction.

  Args:
    examples: dict of input tensors keyed by name.

  Returns:
    predictions: dict of prediction results keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  # Text Global-Maximum-Pooling features.
  (caption_string,
   caption_length) = (examples[InputDataFields.concat_caption_string],
                      examples[InputDataFields.concat_caption_length])

  (caption_token_ids, caption_features) = self._extract_text_feature(
      caption_string,
      caption_length,
      vocabulary_list=self._open_vocabulary_list,
      initial_embedding=self._open_vocabulary_initial_embedding,
      embedding_dims=options.embedding_dims,
      trainable=options.train_word_embedding,
      max_norm=None)

  with slim.arg_scope(
      build_hyperparams(options.text_fc_hyperparams, is_training)):
    caption_features = slim.fully_connected(
        caption_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='caption')

  # Mask out the out-of-vocabulary (OOV) tokens before pooling.
  oov = len(self._open_vocabulary_list)
  caption_masks = tf.to_float(
      tf.logical_not(tf.equal(caption_token_ids, oov)))

  # logits shape = [batch, num_classes].
  logits = utils.masked_maximum(
      data=caption_features,
      mask=tf.expand_dims(caption_masks, axis=-1),
      dim=1)
  logits = tf.squeeze(logits, axis=1)

  predictions = {
      TextClassificationPredictions.vocab:
          tf.constant(self._vocabulary_list),
      TextClassificationPredictions.logits:
          logits,
  }
  return predictions
def _predict(self,
             text_strings,
             text_lengths,
             vocabulary_list,
             vocabulary_word_embedding,
             hidden_units,
             output_units,
             dropout_keep_proba=1.0,
             regularizer=1e-5,
             is_training=False):
  """Predicts labels using the texts.

  Args:
    text_strings: A [batch, num_tokens] string tensor.
    text_lengths: A [batch] int tensor.
    vocabulary_list: A list of strings of length vocab_size.
    vocabulary_word_embedding: A [vocab_size, embedding_dims] numpy array.
    hidden_units: Number of units in the hidden layer.
    output_units: Number of output units.
    dropout_keep_proba: Keep probability of the dropout layer.
    regularizer: Weight of the L2 regularizer.
    is_training: If True, build the training graph.

  Returns:
    logits: A [batch, output_units] float tensor.
  """
  # Initial embeddings: append a randomly initialized OOV embedding.
  init_width = 0.03
  oov_emb = init_width * (
      np.random.rand(1, vocabulary_word_embedding.shape[-1]) * 2 - 1)
  embedding_array_data = np.concatenate(
      [vocabulary_word_embedding, oov_emb], axis=0)

  # Word embedding process.
  with tf.name_scope('word_embedding'):
    table = tf.contrib.lookup.index_table_from_tensor(
        vocabulary_list, num_oov_buckets=1)
    embedding_weights = tf.get_variable(
        name='weights',
        initializer=embedding_array_data.astype(np.float32),
        trainable=False)  # Freeze the word embedding.
    token_ids = table.lookup(text_strings)
    token_embs = tf.nn.embedding_lookup(
        embedding_weights, token_ids, max_norm=None)

  # Multilayer perceptron.
  with tf.variable_scope('text_classifier'):
    oov = len(vocabulary_list)
    masks = tf.to_float(tf.logical_not(tf.equal(token_ids, oov)))
    hiddens = slim.fully_connected(
        token_embs,
        num_outputs=hidden_units,
        activation_fn=None,
        trainable=is_training,
        weights_regularizer=tf.contrib.layers.l2_regularizer(regularizer),
        scope='layer1')
    hiddens = utils.masked_maximum(
        data=hiddens, mask=tf.expand_dims(masks, axis=-1), dim=1)
    hiddens = tf.squeeze(hiddens, axis=1)
    hiddens = tf.nn.relu(hiddens)
    hiddens = slim.dropout(
        hiddens, dropout_keep_proba, is_training=is_training)
    logits = slim.fully_connected(
        hiddens,
        num_outputs=output_units,
        activation_fn=None,
        trainable=is_training,
        weights_regularizer=tf.contrib.layers.l2_regularizer(regularizer),
        scope='layer2')
  return logits
def extract_labels(self, examples):
  """Extracts the pseudo labels.

  Args:
    examples: A dictionary involving image-level annotations.

  Returns:
    labels: A [batch, num_classes] tensor denoting the presence of classes.
  """
  init_width = 0.03
  embedding_dims = self._open_vocabulary_word_embedding.shape[-1]

  classes_to_match = _replace_class_names(self._classes)

  # Check if all classes appear in the open vocabulary.
  for class_name in classes_to_match:
    if not class_name in self._open_vocabulary_list:
      raise ValueError('Class %s has no vector representation.' % class_name)

  with tf.name_scope('word_vector_match_extractor'):
    # Create hash table and word embedding weights.
    table = tf.contrib.lookup.index_table_from_tensor(
        self._open_vocabulary_list, num_oov_buckets=1)
    oov_emb = init_width * (np.random.rand(1, embedding_dims) * 2 - 1)
    embedding_array_data = np.concatenate(
        [self._open_vocabulary_word_embedding, oov_emb], axis=0)
    embedding_weights = tf.get_variable(
        name='weights',
        initializer=embedding_array_data.astype(np.float32),
        trainable=False)  # Freeze the word embedding.

    # Lookup to get the class/token embeddings.
    class_embs = tf.nn.embedding_lookup(
        embedding_weights,
        table.lookup(tf.constant(classes_to_match)),
        max_norm=None)
    token_ids = table.lookup(examples[InputDataFields.concat_caption_string])
    token_embs = tf.nn.embedding_lookup(
        embedding_weights, token_ids, max_norm=None)

    # Compute token-to-class similarity and apply max-pooling,
    # i.e., treat the top-1 as a match.
    # similarity shape = [batch, max_num_tokens, num_classes].
    # similarity_pooled shape = [batch, num_classes].
    batch, num_tokens = utils.get_tensor_shape(
        examples[InputDataFields.concat_caption_string])
    similarity = self._cosine_similarity(class_embs, token_embs)

    oov = len(self._open_vocabulary_list)
    mask = tf.not_equal(token_ids, oov)
    similarity_pooled = utils.masked_maximum(
        data=similarity,
        mask=tf.expand_dims(tf.to_float(mask), axis=-1),
        dim=1)
    similarity_pooled = tf.squeeze(similarity_pooled, axis=1)

    labels_most_similar = tf.one_hot(
        indices=tf.argmax(similarity_pooled, axis=-1),
        depth=self.num_classes,
        dtype=tf.float32)
    labels_most_similar = tf.where(
        tf.reduce_any(mask, axis=-1),
        x=labels_most_similar,
        y=tf.zeros(shape=[batch, self.num_classes]))

  # Consider the exact match.
  labels_exact_match = _match_labels(
      class_texts=examples[InputDataFields.concat_caption_string],
      vocabulary_list=classes_to_match)

  return tf.where(
      tf.reduce_any(labels_exact_match > 0, axis=-1),
      x=labels_exact_match,
      y=labels_most_similar)
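# NOTE (sketch): self._cosine_similarity is not shown in this excerpt. A minimal,
# hypothetical implementation consistent with the shapes used in extract_labels
# (class_embs: [num_classes, dims], token_embs: [batch, num_tokens, dims] ->
# similarity: [batch, num_tokens, num_classes]) could look like the standalone
# function below; it is an assumption, not the actual method.
import tensorflow as tf


def cosine_similarity(class_embs, token_embs):
  """Cosine similarity between every token and every class embedding (sketch)."""
  class_embs = tf.nn.l2_normalize(class_embs, axis=-1)
  token_embs = tf.nn.l2_normalize(token_embs, axis=-1)
  # Contract the embedding dimension:
  # [batch, num_tokens, dims] x [num_classes, dims] -> [batch, num_tokens, num_classes].
  return tf.einsum('btd,cd->btc', token_embs, class_embs)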