def predict(self, inputs, **kwargs):
    """Predicts the resulting tensors.

    Args:
      inputs: A dictionary of input tensors keyed by names.

    Returns:
      predictions: A dictionary of prediction tensors keyed by name.
    """
    is_training = self._is_training
    options = self._model_proto

    (answer_choices, answer_choices_len,
     answer_label) = (inputs[InputFields.answer_choices_with_question],
                      inputs[InputFields.answer_choices_with_question_len],
                      inputs[InputFields.answer_label])
    batch_size = answer_choices.shape[0]

    # Convert tokens to ids.
    token_to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                                   options.unk_token_id)
    answer_choices_token_ids = token_to_id_layer(answer_choices)
    answer_choices_token_ids_reshaped = tf.reshape(
        answer_choices_token_ids, [batch_size * NUM_CHOICES, -1])

    # Convert word ids to embedding vectors.
    glove_embedding_array = create_embedding_matrix(options.glove_file,
                                                    options.vocab_file)
    embedding = tf.get_variable('word/embedding',
                                initializer=glove_embedding_array,
                                trainable=True)
    answer_choices_embs_reshaped = tf.nn.embedding_lookup(
        embedding, answer_choices_token_ids_reshaped, max_norm=None)

    # Encode the sequence using BiLSTM model.
    with tf.variable_scope('answer_choice_encoder'):
      _, answer_choices_feature_reshaped = rnn.RNN(
          answer_choices_embs_reshaped,
          tf.reshape(answer_choices_len, [batch_size * NUM_CHOICES]),
          options.rnn_config,
          is_training=is_training)
    answer_choices_feature = tf.reshape(answer_choices_feature_reshaped,
                                        [batch_size, NUM_CHOICES, -1])

    # Classification layer.
    output = tf.compat.v1.layers.dense(answer_choices_feature,
                                       units=1,
                                       activation=None)
    output = tf.squeeze(output, axis=-1)

    return {FIELD_ANSWER_PREDICTION: output}
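# The predictors in this section seed their word embeddings with
# `create_embedding_matrix(glove_file, vocab_file)`, whose definition is not
# shown here. The sketch below is a minimal, assumed implementation: it loads
# GloVe vectors for every vocabulary token and falls back to a small random
# vector for out-of-vocabulary tokens. The file formats and the 300-d default
# are assumptions, not the repository's actual code.
def create_embedding_matrix_sketch(glove_file, vocab_file, embedding_dims=300):
  """Builds a [vocab_size, embedding_dims] numpy array from GloVe vectors."""
  import numpy as np

  # Read the vocabulary, one token per line.
  with open(vocab_file, 'r', encoding='utf-8') as fid:
    vocab = [line.strip('\n') for line in fid]

  # Read the GloVe file: `token v1 v2 ... vd` per line.
  glove = {}
  with open(glove_file, 'r', encoding='utf-8') as fid:
    for line in fid:
      parts = line.rstrip().split(' ')
      glove[parts[0]] = np.asarray(parts[1:], dtype=np.float32)

  # Look up each vocabulary token; unseen tokens get a small random vector.
  rng = np.random.RandomState(0)
  rows = [
      glove.get(token,
                rng.uniform(-0.05, 0.05, size=embedding_dims).astype(np.float32))
      for token in vocab
  ]
  return np.stack(rows, axis=0)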
def _recognition_to_cognition(self, question_inp_features, question_len,
                              answer_inp_features, answer_len,
                              object_features, num_objects):
    """Creates the `RecognitionToCognition` network.

    Args:
      question_inp_features: Input question features, a
        [batch*NUM_CHOICES, max_question_len, feature_dims] float tensor.
      question_len: Question length, a [batch*NUM_CHOICES] int tensor.
      answer_inp_features: Input answer features, a
        [batch*NUM_CHOICES, max_answer_len, feature_dims] float tensor.
      answer_len: Answer length, a [batch*NUM_CHOICES] int tensor.
      object_features: Object features, a
        [batch, max_num_objects, object_dims] float tensor.
      num_objects: A [batch] int tensor.

    Returns:
      final_features: A [batch, output_dims] float tensor.
      answer_seq_features: Contextualized answer features, a
        [batch*NUM_CHOICES, max_answer_len, feature_dims] float tensor.
    """
    is_training = self._is_training
    options = self._model_proto

    (question_max_len, answer_max_len) = (tf.shape(question_inp_features)[1],
                                          tf.shape(answer_inp_features)[1])
    batch_size = object_features.shape[0]
    max_num_objects = tf.shape(object_features)[1]

    # Encode the sequence using BiLSTM model.
    with tf.variable_scope('grounding_encoder'):
      question_seq_features, _ = rnn.RNN(question_inp_features,
                                         question_len,
                                         options.rnn_config,
                                         is_training=is_training)
    with tf.variable_scope('grounding_encoder', reuse=True):
      answer_seq_features, _ = rnn.RNN(answer_inp_features,
                                       answer_len,
                                       options.rnn_config,
                                       is_training=is_training)

    # Get the question features attended by the answers.
    #   qa_mask: [batch*NUM_CHOICES, question_max_len, 1].
    #   qa_similarity: [batch*NUM_CHOICES, question_max_len, answer_max_len].
    #   qa_attention_weights: [batch*NUM_CHOICES, question_max_len, answer_max_len].
    #   attended_question: [batch*NUM_CHOICES, answer_max_len, feature_dims].
    qa_mask = tf.expand_dims(
        tf.sequence_mask(question_len, question_max_len, dtype=tf.float32), 2)
    with tf.variable_scope('qa_bilinear'):
      qa_similarity = attention_ops.bilinear(question_seq_features,
                                             answer_seq_features)
    qa_attention_weights = masked_ops.masked_softmax(data=qa_similarity,
                                                     mask=qa_mask,
                                                     dim=1)
    attended_question = tf.einsum('bqa,bqd->bad', qa_attention_weights,
                                  question_seq_features)

    # Attention over the objects.
    #   oa_mask: [batch, max_num_objects, 1].
    #   oa_similarity: [batch*NUM_CHOICES, max_num_objects, answer_max_len].
    #   oa_attention_weights: [batch*NUM_CHOICES, max_num_objects, answer_max_len].
    #   attended_objects: [batch*NUM_CHOICES, answer_max_len, object_dims].
    tile_fn = lambda x: tf.gather(
        tf.expand_dims(x, 1), [0] * NUM_CHOICES, axis=1)
    object_features = tf.reshape(
        tile_fn(object_features),
        [batch_size * NUM_CHOICES, -1, object_features.shape[-1]])
    num_objects = tf.reshape(tile_fn(num_objects), [-1])

    oa_mask = tf.expand_dims(
        tf.sequence_mask(num_objects, max_num_objects, dtype=tf.float32), 2)
    with tf.variable_scope('oa_bilinear'):
      oa_similarity = attention_ops.bilinear(object_features,
                                             answer_seq_features)
    oa_attention_weights = masked_ops.masked_softmax(data=oa_similarity,
                                                     mask=oa_mask,
                                                     dim=1)
    attended_objects = tf.einsum('boa,bod->bad', oa_attention_weights,
                                 object_features)

    # Reasoning module.
    reasoning_inp_features = tf.concat(
        [answer_seq_features, attended_question, attended_objects], -1)
    with tf.variable_scope('reasoning'):
      reasoning_seq_features, _ = rnn.RNN(reasoning_inp_features,
                                          answer_len,
                                          options.rnn_config,
                                          is_training=is_training)

    # Pool features from the sequence.
    pooling_fn = (masked_ops.masked_max_nd
                  if options.use_max_pooling else masked_ops.masked_avg_nd)
    final_seq_features = tf.concat([
        reasoning_seq_features, answer_seq_features, attended_question,
        attended_objects
    ], -1)
    final_features = pooling_fn(data=final_seq_features,
                                mask=tf.sequence_mask(answer_len,
                                                      answer_max_len,
                                                      dtype=tf.float32),
                                dim=1)

    # Export summaries.
    tf.compat.v1.summary.histogram('attention/qa_similarity', qa_similarity)
    tf.compat.v1.summary.histogram('attention/oa_similarity', oa_similarity)

    return (tf.squeeze(final_features, 1), answer_seq_features)
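# Both attention blocks in `_recognition_to_cognition` call
# `attention_ops.bilinear(x, y)`, which is expected to return a
# [batch, len_x, len_y] similarity matrix. Its real implementation is not part
# of this section; the sketch below is one common way to realize a bilinear
# score, s[b, i, j] = x[b, i]^T W y[b, j], assuming static feature dimensions
# and the module-level `import tensorflow as tf`. Names are illustrative only.
def bilinear_similarity_sketch(x, y, scope='bilinear'):
  """Computes a learned bilinear similarity between two sequences."""
  with tf.variable_scope(scope):
    # W has shape [dims_x, dims_y]; requires static last dimensions.
    w = tf.get_variable('weights',
                        shape=[x.shape[-1], y.shape[-1]],
                        initializer=tf.glorot_uniform_initializer())
    # Project x into y's feature space: [batch, len_x, dims_y].
    x_proj = tf.einsum('bid,de->bie', x, w)
    # Pairwise dot products: [batch, len_x, len_y].
    return tf.einsum('bie,bje->bij', x_proj, y)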
def predict(self, inputs, **kwargs):
    """Predicts the resulting tensors.

    Args:
      inputs: A dictionary of input tensors keyed by names.

    Returns:
      predictions: A dictionary of prediction tensors keyed by name.
    """
    is_training = self._is_training
    options = self._model_proto

    (num_objects, object_bboxes, object_labels, object_scores,
     object_features) = (inputs[InputFields.num_objects],
                         inputs[InputFields.object_bboxes],
                         inputs[InputFields.object_labels],
                         inputs[InputFields.object_scores],
                         inputs[InputFields.object_features])
    (answer_choices, answer_choices_len,
     answer_label) = (inputs[InputFields.answer_choices_with_question],
                      inputs[InputFields.answer_choices_with_question_len],
                      inputs[InputFields.answer_label])
    batch_size = answer_choices.shape[0]

    # Image feature.
    object_masks = tf.sequence_mask(num_objects,
                                    tf.shape(object_bboxes)[1],
                                    dtype=tf.float32)
    # object_features = tf.compat.v1.layers.dense(object_features,
    #                                             units=512,
    #                                             activation=tf.nn.tanh)
    image_feature = masked_ops.masked_avg_nd(object_features,
                                             object_masks,
                                             dim=1)

    # Convert tokens to ids.
    token_to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                                   options.unk_token_id)
    answer_choices_token_ids = token_to_id_layer(answer_choices)
    answer_choices_token_ids_reshaped = tf.reshape(
        answer_choices_token_ids, [batch_size * NUM_CHOICES, -1])

    # Convert word ids to embedding vectors.
    glove_embedding_array = create_embedding_matrix(options.glove_file,
                                                    options.vocab_file)
    embedding = tf.get_variable('word/embedding',
                                initializer=glove_embedding_array,
                                trainable=True)
    answer_choices_embs_reshaped = tf.nn.embedding_lookup(
        embedding, answer_choices_token_ids_reshaped, max_norm=None)

    # Encode the sequence using BiLSTM model.
    with tf.variable_scope('answer_choice_encoder'):
      _, answer_choices_feature_reshaped = rnn.RNN(
          answer_choices_embs_reshaped,
          tf.reshape(answer_choices_len, [batch_size * NUM_CHOICES]),
          options.rnn_config,
          is_training=is_training)
    answer_choices_feature = tf.reshape(answer_choices_feature_reshaped,
                                        [batch_size, NUM_CHOICES, -1])

    # Classification layers. Note: the second dense layer must consume the
    # hidden activations, not the concatenated inputs.
    inputs = tf.concat(
        [answer_choices_feature,
         tf.tile(image_feature, [1, NUM_CHOICES, 1])], -1)
    output = tf.compat.v1.layers.dense(inputs,
                                       units=512,
                                       activation=tf.nn.relu6)
    output = tf.compat.v1.layers.dense(output, units=1, activation=None)
    output = tf.squeeze(output, axis=-1)

    return {FIELD_ANSWER_PREDICTION: output}
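# The image feature above is produced by `masked_ops.masked_avg_nd(data, mask,
# dim)`, which is expected to average `data` over `dim` while ignoring padded
# positions and to keep the reduced axis (hence the later tile / squeeze on a
# [batch, 1, dims] result). The helper's actual code is not shown in this
# section; this is a minimal assumed equivalent for rank-3 inputs.
def masked_avg_nd_sketch(data, mask, dim=1, epsilon=1e-12):
  """Masked mean of `data` along `dim`; `mask` broadcasts over the feature axis."""
  mask = tf.expand_dims(mask, axis=-1)                        # [batch, len, 1]
  total = tf.reduce_sum(data * mask, axis=dim, keepdims=True)
  count = tf.maximum(tf.reduce_sum(mask, axis=dim, keepdims=True), epsilon)
  return total / count                                        # [batch, 1, dims]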
def generate_adversarial_masks(self,
                               choice_ids,
                               choice_lengths,
                               question_lengths,
                               labels,
                               hard=True):
    """Samples adversarial shortcut masks over the answer tokens.

    A BiLSTM over a separate (non-BERT) embedding of the choices scores each
    token; a Gumbel-Softmax sample over the answer positions selects the
    probable shortcut token.

    Args:
      choice_ids: A [batch, NUM_CHOICES, max_choice_len] int tensor of token ids.
      choice_lengths: A [batch, NUM_CHOICES] int tensor.
      question_lengths: A [batch, NUM_CHOICES] int tensor; the leading question
        tokens are excluded from the sampled mask.
      labels: A [batch] int tensor of ground-truth answer indices.
      hard: If True, applies the straight-through trick to return one-hot samples.

    Returns:
      a_sample: The sampled mask, a [batch, NUM_CHOICES, max_choice_len] tensor.
      choice_shortcut_logits: The masked logits used for sampling.
      choice_features: Contextualized choice features.
      temperature: The Gumbel-Softmax temperature.
      choice_masks: A float mask of valid answer positions.
    """
    options = self._model_proto
    is_training = self._is_training

    batch_size = choice_ids.shape[0]
    max_choice_len = tf.shape(choice_ids)[-1]

    with tf.variable_scope('adversarial'):
      # Lookup for token embeddings.
      # Note: DO NOT share it with BERT, use a brand-new embedding matrix instead.
      with tf.variable_scope("embeddings", reuse=False):
        (choice_embeddings_reshaped, _) = bert_modeling.embedding_lookup(
            input_ids=tf.reshape(choice_ids, [batch_size * NUM_CHOICES, -1]),
            vocab_size=self._bert_config.vocab_size,
            embedding_size=self._bert_config.hidden_size,
            initializer_range=self._bert_config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=False,
            word_embedding_trainable=options.adversarial_train_word_embedding)
      choice_lengths_reshaped = tf.reshape(choice_lengths, [-1])

      # Create label embedding.
      if options.use_label_embedding:
        full_label_embeddings = tf.get_variable(
            name='label_embedding',
            shape=[2, self._bert_config.hidden_size],
            initializer=bert_modeling.create_initializer(
                self._bert_config.initializer_range))
        one_hot_labels = tf.one_hot(labels, NUM_CHOICES, on_value=1, off_value=0)
        label_embeddings = tf.nn.embedding_lookup(full_label_embeddings,
                                                  one_hot_labels)
        label_embeddings_reshaped = tf.reshape(
            label_embeddings,
            [batch_size * NUM_CHOICES, 1, self._bert_config.hidden_size])
        choice_embeddings_reshaped += label_embeddings_reshaped

      # Layer norm.
      choice_embeddings_reshaped = bert_modeling.layer_norm_and_dropout(
          choice_embeddings_reshaped,
          dropout_prob=self._bert_config.hidden_dropout_prob)

      # RNN.
      choice_features_reshaped, _ = rnn.RNN(choice_embeddings_reshaped,
                                            choice_lengths_reshaped,
                                            options=options.adversarial_rnn,
                                            is_training=is_training)

      # Fully-connected layer.
      choice_features = tf.reshape(
          choice_features_reshaped,
          [batch_size, NUM_CHOICES, -1, choice_features_reshaped.shape[-1]])
      choice_shortcut_logits = slim.fully_connected(choice_features,
                                                    num_outputs=1,
                                                    activation_fn=None,
                                                    scope='logits')
      choice_shortcut_logits = tf.multiply(
          options.adversarial_logits_scale,
          tf.squeeze(choice_shortcut_logits, -1))
    # END - with tf.variable_scope('adversarial'):

    # Gumbel-Softmax to get the probable shortcut.
    choice_masks = tf.logical_and(
        tf.sequence_mask(choice_lengths, maxlen=max_choice_len),
        tf.logical_not(
            tf.sequence_mask(question_lengths, maxlen=max_choice_len)))
    choice_masks = tf.cast(choice_masks, tf.float32)

    temperature = tf.Variable(options.temperature_init_value,
                              name='adversarial/temperature_var',
                              trainable=options.temperature_trainable,
                              dtype=tf.float32)
    temperature = tf.maximum(temperature, EPSILON)

    tf.summary.histogram('shortcut/logits', choice_shortcut_logits)
    tf.summary.scalar('metrics/temperature', temperature)

    choice_shortcut_logits = choice_shortcut_logits - INF * (1.0 - choice_masks)
    tf.summary.histogram('shortcut/probas',
                         tf.nn.softmax(choice_shortcut_logits))

    a_sample = RelaxedOneHotCategorical(temperature,
                                        logits=choice_shortcut_logits,
                                        allow_nan_stats=False).sample()
    if hard:
      k = tf.shape(choice_shortcut_logits)[-1]
      a_hard_sample = tf.cast(tf.one_hot(tf.argmax(a_sample, -1), k),
                              a_sample.dtype)
      a_sample = tf.stop_gradient(a_hard_sample - a_sample) + a_sample

    # Returns the mask sampled from the distribution.
    return a_sample, choice_shortcut_logits, choice_features, temperature, choice_masks
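# The `hard=True` branch above uses the straight-through trick: the forward
# pass sees the one-hot argmax of the relaxed sample, while gradients flow
# through the soft Gumbel-Softmax sample. The snippet below isolates that trick
# on toy logits. It assumes `RelaxedOneHotCategorical` comes from
# tensorflow_probability, which may differ from the import used in this
# repository; the function and its values are illustrative only.
def straight_through_gumbel_softmax_example():
  import tensorflow_probability as tfp

  logits = tf.constant([[2.0, 0.5, -1.0, 0.0]])
  dist = tfp.distributions.RelaxedOneHotCategorical(temperature=0.5,
                                                    logits=logits)
  soft_sample = dist.sample()                        # [1, 4], sums to 1.
  hard_sample = tf.one_hot(tf.argmax(soft_sample, -1),
                           depth=tf.shape(logits)[-1],
                           dtype=soft_sample.dtype)  # [1, 4], exactly one-hot.
  # Forward pass: hard one-hot; backward pass: gradient of the soft sample.
  return tf.stop_gradient(hard_sample - soft_sample) + soft_sample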
def predict(self, inputs, **kwargs):
    """Predicts the resulting tensors.

    Args:
      inputs: A dictionary of input tensors keyed by names.

    Returns:
      predictions: A dictionary of prediction tensors keyed by name.
    """
    options = self._model_proto
    is_training = self._is_training

    token_to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                                   options.unk_token_id)
    fc_scope_fn = hyperparams.build_hyperparams(options.fc_hyperparams,
                                                is_training)

    # Extract input fields.
    (question, question_len, answer_choices,
     answer_choices_len) = (inputs[InputFields.question],
                            inputs[InputFields.question_len],
                            inputs[InputFields.answer_choices],
                            inputs[InputFields.answer_choices_len])
    batch_size = answer_choices.shape[0]

    # Convert question tokens into token ids.
    question_token_ids = token_to_id_layer(question)

    # Convert answer choice tokens into token ids.
    answer_choices_token_ids = token_to_id_layer(answer_choices)
    answer_choices_token_ids = tf.reshape(answer_choices_token_ids,
                                          [batch_size * NUM_CHOICES, -1])
    answer_choices_len = tf.reshape(answer_choices_len,
                                    [batch_size * NUM_CHOICES])

    # Convert word ids to embedding vectors.
    glove_embedding_array = create_embedding_matrix(options.glove_file,
                                                    options.vocab_file)
    embedding = tf.get_variable('word/embedding',
                                initializer=glove_embedding_array,
                                trainable=True)
    question_embs = tf.nn.embedding_lookup(embedding,
                                           question_token_ids,
                                           max_norm=None)
    answer_choices_embs = tf.nn.embedding_lookup(embedding,
                                                 answer_choices_token_ids,
                                                 max_norm=None)

    # Tile the question embeddings.
    question_embs = tf.gather(tf.expand_dims(question_embs, 1),
                              [0] * NUM_CHOICES,
                              axis=1)
    question_embs = tf.reshape(
        question_embs, [batch_size * NUM_CHOICES, -1, question_embs.shape[-1]])
    question_len = tf.gather(tf.expand_dims(question_len, 1),
                             [0] * NUM_CHOICES,
                             axis=1)
    question_len = tf.reshape(question_len, [batch_size * NUM_CHOICES])

    # Encode the sequence using BiLSTM model.
    with tf.variable_scope('question_encoder'):
      _, question_features = rnn.RNN(question_embs,
                                     question_len,
                                     options.rnn_config,
                                     is_training=is_training)
    with tf.variable_scope('answer_choice_encoder'):
      _, answer_features = rnn.RNN(answer_choices_embs,
                                   answer_choices_len,
                                   options.rnn_config,
                                   is_training=is_training)
    final_features = tf.concat(
        [answer_features, answer_features * question_features], axis=-1)

    # MLP.
    with slim.arg_scope(fc_scope_fn()):
      with tf.variable_scope('classification'):
        with tf.variable_scope('hidden'):
          output = tf.contrib.layers.fully_connected(final_features,
                                                     num_outputs=1024,
                                                     activation_fn=tf.nn.relu)
          output = tf.contrib.layers.dropout(
              output,
              keep_prob=options.dropout_keep_prob,
              is_training=is_training)
        with tf.variable_scope('output'):
          output = tf.contrib.layers.fully_connected(output,
                                                     num_outputs=1,
                                                     activation_fn=None)

    output = tf.reshape(output, [batch_size, NUM_CHOICES])
    return {
        FIELD_ANSWER_PREDICTION: output,
    }
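# Each `predict` above returns [batch, NUM_CHOICES] logits under
# FIELD_ANSWER_PREDICTION. A typical objective, assumed here and not shown in
# this section, treats the choices as classes of a single softmax over the
# `answer_label` field. The helper below is a hypothetical sketch of that
# wiring; the repository's actual loss construction may differ.
def build_loss_and_accuracy_sketch(predictions, inputs):
  logits = predictions[FIELD_ANSWER_PREDICTION]        # [batch, NUM_CHOICES]
  labels = inputs[InputFields.answer_label]            # [batch]
  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                     logits=logits))
  accuracy = tf.reduce_mean(
      tf.cast(tf.equal(tf.cast(tf.argmax(logits, -1), labels.dtype), labels),
              tf.float32))
  return loss, accuracy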