def test_masked_avg_nd(self):
  """Checks `ops.masked_avg_nd` on a partial mask and on an all-zero mask."""
  data = [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
          [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]]]

  # Each case pairs a per-element mask with the averaged output expected for
  # `data`; the second case pins that a fully masked-out row averages to zeros.
  cases = [
      ([[1, 0, 1], [0, 1, 0]], [[[3, 4]], [[9, 10]]]),
      ([[0, 0, 0], [0, 0, 0]], [[[0, 0]], [[0, 0]]]),
  ]
  for mask_values, expected in cases:
    mask = tf.convert_to_tensor(mask_values, dtype=tf.float32)
    self.assertAllClose(ops.masked_avg_nd(data=data, mask=mask), expected)
def _encode_knowledge(self,
                      tokens,
                      tokens_len,
                      vocab_file,
                      glove_file,
                      slim_fc_scope,
                      default_dims=128,
                      is_training=True):
  """Encodes knowledge tokens into one averaged embedding per sentence.

  Looks up an embedding for every knowledge token and averages the embeddings
  over the knowledge axis (axis 2), ignoring positions beyond `tokens_len`.

  Args:
    tokens: A [batch, max_sentence_len, max_knowledge_len] int tensor.
    tokens_len: A [batch, max_sentence_len] int tensor.
    vocab_file: Vocabulary file forwarded to `_create_embedding_matrix`.
    glove_file: GloVe file forwarded to `_create_embedding_matrix`.
    slim_fc_scope: Slim arg scope applied around the optional projection layer.
    default_dims: Forwarded to `_create_embedding_matrix`; also the output
      width of the projection applied when the embedding matrix has a
      different last dimension.
    is_training: Not used by this method.

  Returns:
    A [batch, max_sentence_len, dims] float tensor.
  """
  initial_embedding = _create_embedding_matrix(glove_file,
                                               vocab_file,
                                               default_dims=default_dims)
  embedding_table = tf.get_variable('knowledge/embedding',
                                    initializer=initial_embedding,
                                    trainable=True)

  # Project the whole embedding table when its width differs from the
  # requested dimensionality.
  needs_projection = embedding_table.shape[-1] != default_dims
  if needs_projection:
    with slim.arg_scope(slim_fc_scope):
      embedding_table = slim.fully_connected(embedding_table,
                                             num_outputs=default_dims,
                                             activation_fn=None,
                                             scope='glove_projection')

  embedded_tokens = tf.nn.embedding_lookup(embedding_table,
                                           tokens,
                                           max_norm=None)

  # 1.0 for valid knowledge positions, 0.0 for padding.
  max_knowledge_len = tf.shape(tokens)[2]
  valid_mask = tf.sequence_mask(lengths=tokens_len,
                                maxlen=max_knowledge_len,
                                dtype=tf.float32)

  averaged = masked_ops.masked_avg_nd(data=embedded_tokens,
                                      mask=valid_mask,
                                      dim=2)
  # Drop the averaged-over axis that is squeezed here as size 1.
  return tf.squeeze(averaged, axis=2)
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Averages the object features over the valid objects of each image, encodes
  every answer choice with the RNN encoder configured by `options.rnn_config`,
  concatenates the two, and scores each choice with a two-layer MLP.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. It holds a
      single entry, `FIELD_ANSWER_PREDICTION`, with one score per answer
      choice.
  """
  is_training = self._is_training
  options = self._model_proto

  # Only the fields this method actually consumes are read.
  num_objects = inputs[InputFields.num_objects]
  object_bboxes = inputs[InputFields.object_bboxes]
  object_features = inputs[InputFields.object_features]
  answer_choices = inputs[InputFields.answer_choices_with_question]
  answer_choices_len = inputs[InputFields.answer_choices_with_question_len]

  batch_size = answer_choices.shape[0]

  # Image feature: average object features over the valid (unpadded) objects.
  object_masks = tf.sequence_mask(num_objects,
                                  tf.shape(object_bboxes)[1],
                                  dtype=tf.float32)
  image_feature = masked_ops.masked_avg_nd(object_features,
                                           object_masks,
                                           dim=1)

  # Convert tokens to ids.
  token_to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                                 options.unk_token_id)
  answer_choices_token_ids = token_to_id_layer(answer_choices)
  answer_choices_token_ids_reshaped = tf.reshape(
      answer_choices_token_ids, [batch_size * NUM_CHOICES, -1])

  # Convert word ids to embedding vectors.
  glove_embedding_array = create_embedding_matrix(options.glove_file,
                                                  options.vocab_file)
  embedding = tf.get_variable('word/embedding',
                              initializer=glove_embedding_array,
                              trainable=True)
  answer_choices_embs_reshaped = tf.nn.embedding_lookup(
      embedding, answer_choices_token_ids_reshaped, max_norm=None)

  # Encode every answer choice as one sequence with the configured RNN.
  with tf.variable_scope('answer_choice_encoder'):
    _, answer_choices_feature_reshaped = rnn.RNN(
        answer_choices_embs_reshaped,
        tf.reshape(answer_choices_len, [batch_size * NUM_CHOICES]),
        options.rnn_config,
        is_training=is_training)
  answer_choices_feature = tf.reshape(answer_choices_feature_reshaped,
                                      [batch_size, NUM_CHOICES, -1])

  # Fuse: repeat the image feature along axis 1 so it lines up with the
  # per-choice features, then concatenate on the feature axis.
  fused_feature = tf.concat([
      answer_choices_feature,
      tf.tile(image_feature, [1, NUM_CHOICES, 1])
  ], -1)

  # Two-layer scoring head. The final layer must consume the hidden
  # activation; previously it was fed the fused feature directly, which left
  # the 512-unit hidden layer disconnected from the prediction.
  # NOTE: this changes the kernel shape of the final dense layer, so
  # checkpoints trained with the old wiring will not restore that layer.
  hidden = tf.compat.v1.layers.dense(fused_feature,
                                     units=512,
                                     activation=tf.nn.relu6)
  output = tf.compat.v1.layers.dense(hidden, units=1, activation=None)
  output = tf.squeeze(output, axis=-1)

  return {FIELD_ANSWER_PREDICTION: output}
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Draws the detected boxes to an image summary, extracts Fast-RCNN features
  for the boxes and averages them over the valid objects, encodes every
  answer choice with BERT, concatenates the pooled BERT output with the image
  feature, and scores each choice with a linear layer.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. It holds a
      single entry, `FIELD_ANSWER_PREDICTION`, with one score per answer
      choice.
  """
  is_training = self._is_training
  options = self._model_proto

  image = inputs[InputFields.img_data]
  height = inputs[InputFields.img_height]
  width = inputs[InputFields.img_width]
  num_objects = inputs[InputFields.num_objects]
  object_bboxes = inputs[InputFields.object_bboxes]
  object_labels = inputs[InputFields.object_labels]
  object_scores = inputs[InputFields.object_scores]
  answer_choices = inputs[InputFields.answer_choices_with_question]
  answer_choices_len = inputs[InputFields.answer_choices_with_question_len]

  # Visualize image and object bboxes.
  batch_size = image.shape[0]
  image_batch_shape = tf.shape(image)
  object_bboxes = _to_batch_coordinates(object_bboxes, height, width,
                                        image_batch_shape[1],
                                        image_batch_shape[2])
  image_with_boxes = visualization.draw_bounding_boxes_on_image_tensors(
      image, num_objects, object_bboxes, object_labels, object_scores)
  tf.summary.image('vcr/detection', image_with_boxes, max_outputs=10)

  # Extract FRCNN feature and average it over the valid (unpadded) objects.
  frcnn_features = fast_rcnn.FastRCNN(tf.cast(image, tf.float32),
                                      object_bboxes,
                                      options=options.fast_rcnn_config,
                                      is_training=is_training)
  object_masks = tf.sequence_mask(num_objects,
                                  tf.shape(object_bboxes)[1],
                                  dtype=tf.float32)
  image_feature = masked_ops.masked_avg_nd(frcnn_features,
                                           object_masks,
                                           dim=1)

  # Convert tokens into token ids.
  token_to_id_layer = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                                 options.bert_unk_token_id)
  answer_choices_token_ids = token_to_id_layer(answer_choices)
  answer_choices_token_ids_reshaped = tf.reshape(
      answer_choices_token_ids, [batch_size * NUM_CHOICES, -1])

  answer_choices_mask = tf.sequence_mask(
      answer_choices_len, maxlen=tf.shape(answer_choices)[-1])
  answer_choices_mask_reshaped = tf.reshape(answer_choices_mask,
                                            [batch_size * NUM_CHOICES, -1])

  # Bert prediction.
  bert_config = BertConfig.from_json_file(options.bert_config_file)
  bert_model = BertModel(bert_config,
                         is_training,
                         input_ids=answer_choices_token_ids_reshaped,
                         input_mask=answer_choices_mask_reshaped)

  answer_choices_cls_feature_reshaped = bert_model.get_pooled_output()
  answer_choices_cls_feature = tf.reshape(answer_choices_cls_feature_reshaped,
                                          [batch_size, NUM_CHOICES, -1])

  # Initialize the BERT variables from the pre-trained checkpoint. The
  # assignment map used to be built and then dropped, so nothing in this
  # method ever applied the checkpoint to the graph.
  # NOTE(review): confirm no external trainer already restores these weights.
  assignment_map, _ = get_assignment_map_from_checkpoint(
      tf.global_variables(), options.bert_checkpoint_file)
  tf.compat.v1.train.init_from_checkpoint(options.bert_checkpoint_file,
                                          assignment_map)

  # Fuse image feature: repeat it along axis 1 so it lines up with the
  # per-choice BERT features, then concatenate on the feature axis.
  image_feature_tiled = tf.tile(image_feature, [1, NUM_CHOICES, 1])
  answer_choices_cls_feature = tf.concat(
      [answer_choices_cls_feature, image_feature_tiled], -1)

  # Classification layer.
  output = tf.compat.v1.layers.dense(answer_choices_cls_feature,
                                     units=1,
                                     activation=None)
  output = tf.squeeze(output, axis=-1)

  return {FIELD_ANSWER_PREDICTION: output}