Example #1
 def _get_instruction_encoder(self):
   """Return instruction encoder module."""
   return instruction_encoder.InstructionEncoder(
       num_hidden_layers=2,
       output_dim=256,
       pretrained_embed_path=self.config['pretrained_embed_path'],
       oov_bucket_size=self.config['oov_bucket_size'])
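This helper only constructs the encoder; the tests in Examples #4 and #5 below pin down its call contract. A minimal sketch of exercising the encoder directly, assuming the valan.r2r module layout and using dummy token ids (mirroring Example #4):

import tensorflow as tf
from valan.r2r import instruction_encoder

encoder = instruction_encoder.InstructionEncoder(
    num_hidden_layers=2,
    output_dim=256,
    pretrained_embed_path='',  # no pretrained embeddings in this sketch
    oov_bucket_size=1)
tokens = tf.constant([[3, 4, 5, 1, 6, 0]])  # one tokenized instruction
text_enc_outputs, states = encoder(tokens)
# text_enc_outputs: [1, 6, 512]; states: an (h, c) pair per LSTM layer.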
Example #2
    def __init__(self, config, mode=None):
        """Initialize R2R Agent."""
        super(R2RAgent, self).__init__(name='agent_r2r')

        self._instruction_encoder = instruction_encoder.InstructionEncoder(
            num_hidden_layers=2,
            output_dim=256,
            pretrained_embed_path=config.pretrained_embed_path,
            oov_bucket_size=config.oov_bucket_size,
            vocab_size=config.vocab_size,
            word_embed_dim=config.word_embed_dim,
            mode=mode,
        )

        self._embed_action = getattr(config, 'embed_action', False)
        self._image_encoder = image_encoder.ImageEncoder(
            attention_space_size=256,
            num_lstm_units=512,
            num_hidden_layers=2,
            concat_context=self._embed_action,
            mode=mode)

        # Learnable transform of initial state from instruction encoder.
        self._encoder_transform = tf.keras.layers.Dense(
            4 * 512, name='encoder_transform')

        # Text attention.
        self._text_attention_size = 512
        self._text_attention_project_hidden = tf.keras.layers.Dense(
            self._text_attention_size, name='text_attention_project_hidden')
        self._text_attention_project_text = tf.keras.layers.Dense(
            self._text_attention_size, name='text_attention_project_text')
        self._text_attention = tf.keras.layers.Attention(use_scale=True,
                                                         name='text_attention')

        # Visual attention.
        self._visual_attention_size = 256
        self._visual_attention_project_ctext = tf.keras.layers.Dense(
            self._visual_attention_size, name='vis_attention_project_ctext')
        self._visual_attention_project_feature = tf.keras.layers.Dense(
            self._visual_attention_size, name='vis_attention_project_feature')
        self._visual_attention = tf.keras.layers.Attention(
            use_scale=True, name='vis_attention')

        # Action predictor projection.
        self._action_projection_size = 256
        self._project_feature = tf.keras.layers.Dense(
            self._action_projection_size, name='action_layer_project_feature')
        self._project_action = tf.keras.layers.Dense(
            self._action_projection_size, name='action_layer_project_action')
        # Dot product over the last dimension.
        self._dot_product = tf.keras.layers.Dot(axes=2)

        # Value network.
        self._value_network = self._get_value_network()
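Example #2 only declares the attention layers; the wiring happens elsewhere in the agent. Below is a self-contained sketch of the text-attention step implied by the layer names, with dummy tensors standing in for the real decoder state and instruction encodings (the shapes are assumptions, not taken from the snippet):

import tensorflow as tf

project_hidden = tf.keras.layers.Dense(512)  # decoder state -> attention space
project_text = tf.keras.layers.Dense(512)    # encoded text -> attention space
attention = tf.keras.layers.Attention(use_scale=True)

hidden_state = tf.zeros([1, 1, 512])      # dummy current decoder state
text_enc_outputs = tf.zeros([1, 6, 512])  # dummy per-token text encodings
query = project_hidden(hidden_state)
values = project_text(text_enc_outputs)
attended_text = attention([query, values])  # -> [1, 1, 512]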
Example #3
    def __init__(self, config, mode=None, name=None):
        """Initialize R2R Agent."""
        super(DiscriminatorAgent,
              self).__init__(name=name if name else 'discriminator_r2r')
        use_bert_emb = (config.use_bert_emb
                        if hasattr(config, 'use_bert_emb') else False)
        self._instruction_encoder = instruction_encoder.InstructionEncoder(
            num_hidden_layers=2,
            output_dim=256,
            pretrained_embed_path=config.pretrained_embed_path,
            oov_bucket_size=config.oov_bucket_size,
            vocab_size=config.vocab_size,
            word_embed_dim=config.word_embed_dim,
            l2_scale=config.l2_scale,
            dropout=config.dropout,
            layernorm=config.layernorm,
            use_bert_embeddings=use_bert_emb,
            mode=mode)

        # If False, the text and image encoders are independent of each other.
        self._init_with_text_state = getattr(
            config, 'init_image_enc_with_text_state', True)
        self._embed_prev_action = getattr(config, 'embed_prev_action', False)
        self._embed_next_action = getattr(config, 'embed_next_action', False)
        self._use_attn_pooling = getattr(config, 'use_attn_pooling', True)
        image_enc_attention_dim = getattr(
            config, 'image_enc_attention_dim', 256)
        image_enc_hidden_dim = getattr(config, 'image_enc_hidden_dim', 512)
        self._image_encoder = image_encoder.ImageEncoder(
            attention_space_size=image_enc_attention_dim,
            num_lstm_units=image_enc_hidden_dim,
            num_hidden_layers=2,
            l2_scale=config.l2_scale,
            dropout=config.dropout,
            concat_context=config.concat_context,
            layernorm=config.layernorm,
            mode=mode,
            use_attention_pooling=self._use_attn_pooling)

        # Learnable projection of initial decoder state from instruction encoder.
        self._project_decoder_input_states = getattr(
            config, 'project_decoder_input_states', False)
        if self._project_decoder_input_states:
            self._encoder_projection = tf.keras.layers.Dense(
                4 * 512, name='encoder_projection')

        self.affine_a = tf.Variable(1.0, dtype=tf.float32, trainable=True)
        self.affine_b = tf.Variable(0.0, dtype=tf.float32, trainable=True)
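affine_a and affine_b are declared but never used in this snippet. A hedged guess based only on the names and initial values (an identity transform): they calibrate the discriminator's raw similarity score into a logit, along the lines of

import tensorflow as tf

affine_a = tf.Variable(1.0, dtype=tf.float32, trainable=True)
affine_b = tf.Variable(0.0, dtype=tf.float32, trainable=True)
similarity = tf.constant([0.37])          # dummy raw alignment score
logit = affine_a * similarity + affine_b  # learnable rescale and shift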
Example #4
 def test_call_r2r(self):
   encoder = instruction_encoder.InstructionEncoder(
       num_hidden_layers=2,
       output_dim=256,
       pretrained_embed_path='',
       oov_bucket_size=1)
   # A batch containing a single tokenized instruction.
   tokens = tf.constant([[3, 4, 5, 1, 6, 0]])
   result = encoder(tokens)
   self.assertEqual(result[0].shape, [1, 6, 512])
   self.assertEqual(result[1][0][0].shape, [1, 512])
   self.assertEqual(result[1][0][1].shape, [1, 512])
   self.assertEqual(result[1][1][0].shape, [1, 512])
   self.assertEqual(result[1][1][1].shape, [1, 512])
Example #5
  def test_call_ndh(self):
    encoder = instruction_encoder.InstructionEncoder(
        num_hidden_layers=2,
        output_dim=256,
        pretrained_embed_path=None,
        oov_bucket_size=1,
        vocab_size=1082,
        word_embed_dim=300)

    tokens = tf.constant([[3, 4, 5, 1, 6, 0]])
    result = encoder(tokens)
    self.assertEqual(result[0].shape, [1, 6, 512])
    self.assertEqual(result[1][0][0].shape, [1, 512])
    self.assertEqual(result[1][0][1].shape, [1, 512])
    self.assertEqual(result[1][1][0].shape, [1, 512])
    self.assertEqual(result[1][1][1].shape, [1, 512])
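Both tests assert the same output contract: result[0] is the per-token encoding (512 units, i.e. 2 x output_dim, which hints at a bidirectional encoder, though the snippets alone do not confirm that), and result[1] holds one (h, c) LSTM state pair per hidden layer. Reusing the encoder and tokens from the test above, the unpacking reads:

text_enc_outputs, states = encoder(tokens)
(h0, c0), (h1, c1) = states  # one (h, c) pair per hidden layer
# text_enc_outputs: [1, 6, 512]; each of h0, c0, h1, c1: [1, 512]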
Example #6
  def __init__(self, config):
    """Initialize R2R Agent."""
    super(DiscriminatorAgent, self).__init__(name='discriminator_r2r')

    self._instruction_encoder = instruction_encoder.InstructionEncoder(
        num_hidden_layers=2,
        output_dim=256,
        pretrained_embed_path=config.pretrained_embed_path,
        oov_bucket_size=config.oov_bucket_size,
        vocab_size=config.vocab_size,
        word_embed_dim=config.word_embed_dim,
    )
    self._image_encoder = image_encoder.ImageEncoder(
        attention_space_size=256, num_lstm_units=512, num_hidden_layers=2)
    self.affine_a = tf.Variable(1.0, dtype=tf.float32, trainable=True)
    self.affine_b = tf.Variable(0.0, dtype=tf.float32, trainable=True)
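A minimal construction sketch for this agent. The config below is a hypothetical stand-in built with types.SimpleNamespace (field values borrowed from the NDH test in Example #5), not the project's real configuration object:

import types

config = types.SimpleNamespace(
    pretrained_embed_path=None,
    oov_bucket_size=1,
    vocab_size=1082,
    word_embed_dim=300)
agent = DiscriminatorAgent(config)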
Example #7
File: agent.py  Project: diegozd/valan
    def __init__(self, config):
        """Initialize R2R Agent."""
        super(R2RAgent, self).__init__(name='agent_r2r')

        self._instruction_encoder = instruction_encoder.InstructionEncoder(
            num_hidden_layers=2,
            output_dim=256,
            pretrained_embed_path=config.pretrained_embed_path,
            oov_bucket_size=config.oov_bucket_size,
            vocab_size=config.vocab_size,
            word_embed_dim=config.word_embed_dim,
        )

        self._image_encoder = image_encoder.ImageEncoder(
            attention_space_size=256, num_lstm_units=512, num_hidden_layers=2)

        # Text attention.
        self._text_attention_size = 512
        self._text_attention_project_hidden = tf.keras.layers.Dense(
            self._text_attention_size)
        self._text_attention_project_text = tf.keras.layers.Dense(
            self._text_attention_size)
        self._text_attention = tf.keras.layers.Attention(use_scale=True)

        # Visual attention.
        self._visual_attention_size = 256
        self._visual_attention_project_ctext = tf.keras.layers.Dense(
            self._visual_attention_size)
        self._visual_attention_project_feature = tf.keras.layers.Dense(
            self._visual_attention_size)
        self._visual_attention = tf.keras.layers.Attention(use_scale=True)

        # Action predictor projection.
        self._action_projection_size = 256
        self._project_feature = tf.keras.layers.Dense(
            self._action_projection_size)
        self._project_action = tf.keras.layers.Dense(
            self._action_projection_size)
        # Dot product over the last dimension.
        self._dot_product = tf.keras.layers.Dot(axes=2)

        # Value network.
        self._value_network = self._get_value_network()
Example #8
    def __init__(self, config, mode=None):
        """Initialize the Agent."""
        super(MTEnvAgAgent, self).__init__(name='agent_mt_envag')

        self._ins_classifier = None
        self._scan_classifier = None
        if config.classify_instructions:
            self._ins_classifier = self._get_ins_classifier(
                config.classifier_dropout)
        if config.classify_scans:
            self._scan_classifier = self._get_scan_classifier(
                config.classifier_dropout)

        self._instruction_encoder = instruction_encoder.InstructionEncoder(
            num_hidden_layers=2,
            output_dim=256,
            pretrained_embed_path=config.pretrained_embed_path,
            oov_bucket_size=config.oov_bucket_size,
            vocab_size=config.vocab_size,
            word_embed_dim=config.word_embed_dim,
            mode=mode,
        )

        self._ndh_instruction_encoder = None
        if config.use_separate_encoders:
            self._ndh_instruction_encoder = instruction_encoder.InstructionEncoder(
                num_hidden_layers=2,
                output_dim=256,
                pretrained_embed_path=config.pretrained_embed_path,
                oov_bucket_size=config.oov_bucket_size,
                vocab_size=config.vocab_size,
                word_embed_dim=config.word_embed_dim,
                mode=mode,
            )

        self._image_encoder = image_encoder.ImageEncoder(
            attention_space_size=256,
            num_lstm_units=512,
            num_hidden_layers=2,
            mode=mode)

        # Text attention.
        self._text_attention_size = 512
        self._text_attention_project_hidden = tf.keras.layers.Dense(
            self._text_attention_size, name='text_attention_project_hidden')
        self._text_attention_project_text = tf.keras.layers.Dense(
            self._text_attention_size, name='text_attention_project_text')
        self._text_attention = tf.keras.layers.Attention(use_scale=True,
                                                         name='text_attention')

        # Visual attention.
        self._visual_attention_size = 256
        self._visual_attention_project_ctext = tf.keras.layers.Dense(
            self._visual_attention_size, name='vis_attention_project_ctext')
        self._visual_attention_project_feature = tf.keras.layers.Dense(
            self._visual_attention_size, name='vis_attention_project_feature')
        self._visual_attention = tf.keras.layers.Attention(
            use_scale=True, name='vis_attention')

        # Action predictor projection.
        self._action_projection_size = 256
        self._project_feature = tf.keras.layers.Dense(
            self._action_projection_size, name='action_layer_project_feature')
        self._project_action = tf.keras.layers.Dense(
            self._action_projection_size, name='action_layer_project_action')
        # Dot product over the last dimension.
        self._dot_product = tf.keras.layers.Dot(axes=2)

        # Value network.
        self._value_network = self._get_value_network()
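Across these agents, the Dot(axes=2) layer suggests that candidate actions are scored by a batched dot product between the projected state feature and the projected action embeddings; that wiring is an inference from the layer names, not code shown above. A self-contained sketch with dummy shapes:

import tensorflow as tf

project_feature = tf.keras.layers.Dense(256)
project_action = tf.keras.layers.Dense(256)
dot_product = tf.keras.layers.Dot(axes=2)

feature = tf.zeros([1, 1, 640])  # dummy fused state, one per time step
actions = tf.zeros([1, 8, 640])  # dummy embeddings for 8 candidate actions
scores = dot_product([project_feature(feature), project_action(actions)])
# scores: [1, 1, 8] -- one logit per candidate action.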