def _get_instruction_encoder(self):
  """Return instruction encoder module."""
  return instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=self.config['pretrained_embed_path'],
      oov_bucket_size=self.config['oov_bucket_size'])
def __init__(self, config, mode=None):
  """Initialize R2R Agent."""
  super(R2RAgent, self).__init__(name='agent_r2r')
  self._instruction_encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=config.pretrained_embed_path,
      oov_bucket_size=config.oov_bucket_size,
      vocab_size=config.vocab_size,
      word_embed_dim=config.word_embed_dim,
      mode=mode,
  )
  self._embed_action = config.embed_action if hasattr(
      config, 'embed_action') else False
  self._image_encoder = image_encoder.ImageEncoder(
      256,
      512,
      num_hidden_layers=2,
      concat_context=self._embed_action,
      mode=mode)
  # Learnable transform of initial state from instruction encoder.
  self._encoder_transform = tf.keras.layers.Dense(
      4 * 512, name='encoder_transform')
  # Text attention.
  self._text_attention_size = 512
  self._text_attention_project_hidden = tf.keras.layers.Dense(
      self._text_attention_size, name='text_attention_project_hidden')
  self._text_attention_project_text = tf.keras.layers.Dense(
      self._text_attention_size, name='text_attention_project_text')
  self._text_attention = tf.keras.layers.Attention(
      use_scale=True, name='text_attention')
  # Visual attention.
  self._visual_attention_size = 256
  self._visual_attention_project_ctext = tf.keras.layers.Dense(
      self._visual_attention_size, name='vis_attention_project_ctext')
  self._visual_attention_project_feature = tf.keras.layers.Dense(
      self._visual_attention_size, name='vis_attention_project_feature')
  self._visual_attention = tf.keras.layers.Attention(
      use_scale=True, name='vis_attention')
  # Action predictor projection.
  self._action_projection_size = 256
  self._project_feature = tf.keras.layers.Dense(
      self._action_projection_size, name='action_layer_project_feature')
  self._project_action = tf.keras.layers.Dense(
      self._action_projection_size, name='action_layer_project_action')
  # Dot product over the last dimension.
  self._dot_product = tf.keras.layers.Dot(axes=2)
  # Value network.
  self._value_network = self._get_value_network()
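# Usage sketch (illustrative, not from the source): the constructor above only
# reads plain attributes off `config`, so any object exposing the accessed
# fields works. The values below are hypothetical placeholders, not the
# project's defaults.
#
#   from types import SimpleNamespace
#
#   config = SimpleNamespace(
#       pretrained_embed_path='',  # empty path: train embeddings from scratch
#       oov_bucket_size=1,
#       vocab_size=1082,
#       word_embed_dim=300,
#       embed_action=False)
#   agent = R2RAgent(config, mode='train')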
def __init__(self, config, mode=None, name=None):
  """Initialize the Discriminator Agent."""
  super(DiscriminatorAgent, self).__init__(
      name=name if name else 'discriminator_r2r')
  use_bert_emb = (
      config.use_bert_emb if hasattr(config, 'use_bert_emb') else False)
  self._instruction_encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=config.pretrained_embed_path,
      oov_bucket_size=config.oov_bucket_size,
      vocab_size=config.vocab_size,
      word_embed_dim=config.word_embed_dim,
      l2_scale=config.l2_scale,
      dropout=config.dropout,
      layernorm=config.layernorm,
      use_bert_embeddings=use_bert_emb,
      mode=mode)
  # If False, the text and image encoders are independent of each other.
  self._init_with_text_state = (
      config.init_image_enc_with_text_state
      if hasattr(config, 'init_image_enc_with_text_state') else True)
  self._embed_prev_action = (
      config.embed_prev_action
      if hasattr(config, 'embed_prev_action') else False)
  self._embed_next_action = (
      config.embed_next_action
      if hasattr(config, 'embed_next_action') else False)
  self._use_attn_pooling = (
      config.use_attn_pooling
      if hasattr(config, 'use_attn_pooling') else True)
  image_enc_attention_dim = (
      config.image_enc_attention_dim
      if hasattr(config, 'image_enc_attention_dim') else 256)
  image_enc_hidden_dim = (
      config.image_enc_hidden_dim
      if hasattr(config, 'image_enc_hidden_dim') else 512)
  self._image_encoder = image_encoder.ImageEncoder(
      attention_space_size=image_enc_attention_dim,
      num_lstm_units=image_enc_hidden_dim,
      num_hidden_layers=2,
      l2_scale=config.l2_scale,
      dropout=config.dropout,
      concat_context=config.concat_context,
      layernorm=config.layernorm,
      mode=mode,
      use_attention_pooling=self._use_attn_pooling)
  # Learnable projection of initial decoder state from instruction encoder.
  self._project_decoder_input_states = (
      config.project_decoder_input_states
      if hasattr(config, 'project_decoder_input_states') else False)
  if self._project_decoder_input_states:
    self._encoder_projection = tf.keras.layers.Dense(
        4 * 512, name='encoder_projection')
  # Trainable affine parameters (scale and shift).
  self.affine_a = tf.Variable(1.0, dtype=tf.float32, trainable=True)
  self.affine_b = tf.Variable(0.0, dtype=tf.float32, trainable=True)
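# Hedged sketch (the scoring code is not part of this excerpt): a common way
# to use the trainable affine pair above is to calibrate a raw
# instruction-trajectory alignment score before the sigmoid. Names below are
# illustrative, not taken from the source.
#
#   logits = self.affine_a * similarity + self.affine_b
#   prob = tf.sigmoid(logits)
#
# Keeping the scale and shift trainable lets the model adjust the score range
# without touching the encoders themselves.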
def test_call_r2r(self):
  encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path='',
      oov_bucket_size=1)
  # A batch with a single padded token sequence.
  tokens = tf.constant([[3, 4, 5, 1, 6, 0]])
  result = encoder(tokens)
  # Per-token sequence encoding: [batch, time, 512].
  self.assertEqual(result[0].shape, [1, 6, 512])
  # (h, c) LSTM states for each of the two hidden layers: [batch, 512] each.
  self.assertEqual(result[1][0][0].shape, [1, 512])
  self.assertEqual(result[1][0][1].shape, [1, 512])
  self.assertEqual(result[1][1][0].shape, [1, 512])
  self.assertEqual(result[1][1][1].shape, [1, 512])
def test_call_ndh(self):
  encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=None,
      oov_bucket_size=1,
      vocab_size=1082,
      word_embed_dim=300)
  tokens = tf.constant([[3, 4, 5, 1, 6, 0]])
  result = encoder(tokens)
  # Same output contract as the R2R case above.
  self.assertEqual(result[0].shape, [1, 6, 512])
  self.assertEqual(result[1][0][0].shape, [1, 512])
  self.assertEqual(result[1][0][1].shape, [1, 512])
  self.assertEqual(result[1][1][0].shape, [1, 512])
  self.assertEqual(result[1][1][1].shape, [1, 512])
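# For reference, the return structure both tests above exercise (names are
# illustrative; shapes follow directly from the assertions):
#
#   seq_output, states = encoder(tokens)  # seq_output: [batch, time, 512]
#   (h0, c0), (h1, c1) = states           # one (h, c) pair per hidden layer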
def __init__(self, config):
  """Initialize the Discriminator Agent."""
  super(DiscriminatorAgent, self).__init__(name='discriminator_r2r')
  self._instruction_encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=config.pretrained_embed_path,
      oov_bucket_size=config.oov_bucket_size,
      vocab_size=config.vocab_size,
      word_embed_dim=config.word_embed_dim,
  )
  self._image_encoder = image_encoder.ImageEncoder(
      attention_space_size=256, num_lstm_units=512, num_hidden_layers=2)
  # Trainable affine parameters (scale and shift).
  self.affine_a = tf.Variable(1.0, dtype=tf.float32, trainable=True)
  self.affine_b = tf.Variable(0.0, dtype=tf.float32, trainable=True)
def __init__(self, config):
  """Initialize R2R Agent."""
  super(R2RAgent, self).__init__(name='agent_r2r')
  self._instruction_encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=config.pretrained_embed_path,
      oov_bucket_size=config.oov_bucket_size,
      vocab_size=config.vocab_size,
      word_embed_dim=config.word_embed_dim,
  )
  self._image_encoder = image_encoder.ImageEncoder(
      attention_space_size=256, num_lstm_units=512, num_hidden_layers=2)
  # Text attention.
  self._text_attention_size = 512
  self._text_attention_project_hidden = tf.keras.layers.Dense(
      self._text_attention_size)
  self._text_attention_project_text = tf.keras.layers.Dense(
      self._text_attention_size)
  self._text_attention = tf.keras.layers.Attention(use_scale=True)
  # Visual attention.
  self._visual_attention_size = 256
  self._visual_attention_project_ctext = tf.keras.layers.Dense(
      self._visual_attention_size)
  self._visual_attention_project_feature = tf.keras.layers.Dense(
      self._visual_attention_size)
  self._visual_attention = tf.keras.layers.Attention(use_scale=True)
  # Action predictor projection.
  self._action_projection_size = 256
  self._project_feature = tf.keras.layers.Dense(self._action_projection_size)
  self._project_action = tf.keras.layers.Dense(self._action_projection_size)
  # Dot product over the last dimension.
  self._dot_product = tf.keras.layers.Dot(axes=2)
  # Value network.
  self._value_network = self._get_value_network()
def __init__(self, config, mode=None):
  """Initialize the Agent."""
  super(MTEnvAgAgent, self).__init__(name='agent_mt_envag')
  # Optional auxiliary classifiers.
  self._ins_classifier = None
  self._scan_classifier = None
  if config.classify_instructions:
    self._ins_classifier = self._get_ins_classifier(config.classifier_dropout)
  if config.classify_scans:
    self._scan_classifier = self._get_scan_classifier(config.classifier_dropout)
  self._instruction_encoder = instruction_encoder.InstructionEncoder(
      num_hidden_layers=2,
      output_dim=256,
      pretrained_embed_path=config.pretrained_embed_path,
      oov_bucket_size=config.oov_bucket_size,
      vocab_size=config.vocab_size,
      word_embed_dim=config.word_embed_dim,
      mode=mode,
  )
  # Optionally use a separate encoder for NDH instructions.
  self._ndh_instruction_encoder = None
  if config.use_separate_encoders:
    self._ndh_instruction_encoder = instruction_encoder.InstructionEncoder(
        num_hidden_layers=2,
        output_dim=256,
        pretrained_embed_path=config.pretrained_embed_path,
        oov_bucket_size=config.oov_bucket_size,
        vocab_size=config.vocab_size,
        word_embed_dim=config.word_embed_dim,
        mode=mode,
    )
  self._image_encoder = image_encoder.ImageEncoder(
      attention_space_size=256,
      num_lstm_units=512,
      num_hidden_layers=2,
      mode=mode)
  # Text attention.
  self._text_attention_size = 512
  self._text_attention_project_hidden = tf.keras.layers.Dense(
      self._text_attention_size, name='text_attention_project_hidden')
  self._text_attention_project_text = tf.keras.layers.Dense(
      self._text_attention_size, name='text_attention_project_text')
  self._text_attention = tf.keras.layers.Attention(
      use_scale=True, name='text_attention')
  # Visual attention.
  self._visual_attention_size = 256
  self._visual_attention_project_ctext = tf.keras.layers.Dense(
      self._visual_attention_size, name='vis_attention_project_ctext')
  self._visual_attention_project_feature = tf.keras.layers.Dense(
      self._visual_attention_size, name='vis_attention_project_feature')
  self._visual_attention = tf.keras.layers.Attention(
      use_scale=True, name='vis_attention')
  # Action predictor projection.
  self._action_projection_size = 256
  self._project_feature = tf.keras.layers.Dense(
      self._action_projection_size, name='action_layer_project_feature')
  self._project_action = tf.keras.layers.Dense(
      self._action_projection_size, name='action_layer_project_action')
  # Dot product over the last dimension.
  self._dot_product = tf.keras.layers.Dot(axes=2)
  # Value network.
  self._value_network = self._get_value_network()
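# Hedged sketch of how the action head declared above is typically applied
# (the actual forward pass lives elsewhere in the agent; tensor names are
# illustrative): both projections map into the same 256-dim space, and the
# Dot layer with axes=2 contracts that last dimension to score each action.
#
#   feature_proj = self._project_feature(action_features)  # [b, num_actions, 256]
#   action_proj = self._project_action(lstm_output)        # [b, 1, 256]
#   action_logits = self._dot_product(
#       [action_proj, feature_proj])                       # [b, 1, num_actions]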