Example #1
    def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
                 rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
        super(Configuring, self).__init__()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size
        self.max_len = max_len

        proj_navigable_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias,
            'relu': opts.mlp_relu
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)

        self.lang_position = PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=max_len)

        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

        self.r_linear = nn.Linear(rnn_hidden_size + 128, 2)
    
        self.sm = nn.Softmax(dim=1)

        self.num_predefined_action = 1

        self.state_attention = StateAttention()

        self.config_fc = nn.Linear(768, 512, bias=False)

        if opts.monitor_sigmoid:
            self.critic = nn.Sequential(
                # nn.Linear(max_len + rnn_hidden_size, 1),
                nn.Linear(10 + rnn_hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            self.critic = nn.Sequential(
                # nn.Linear(max_len + rnn_hidden_size, 1),
                nn.Linear(10 + rnn_hidden_size, 1),
                nn.Tanh()
            )
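
Every snippet on this page instantiates a PositionalEncoding module, but its definition is not included. The sketch below is the standard sinusoidal encoding (as in the PyTorch Transformer tutorial), written to match the (d_model, dropout, max_len) signature used in Examples #1, #2, #4 and #5; Example #3 additionally passes a device argument. Treat it as an illustrative stand-in, not the implementation these models were trained with.

import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (illustrative sketch, batch-first input)."""

    def __init__(self, d_model, dropout=0.1, max_len=80):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute a (max_len, d_model) table of sin/cos position values.
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model); add the encoding for the first seq_len positions.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
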
Example #2
    def __init__(self,
                 opts,
                 img_fc_dim,
                 img_fc_use_batchnorm,
                 img_dropout,
                 img_feat_input_dim,
                 rnn_hidden_size,
                 rnn_dropout,
                 max_len,
                 fc_bias=True,
                 max_navigable=16):
        super(Regretful, self).__init__()

        self.opts = opts
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable

        proj_navigable_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1])

        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)
        self.positional_encoding = PositionalEncoding(rnn_hidden_size,
                                                      dropout=0.1,
                                                      max_len=max_len)
        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)
        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size,
                                rnn_hidden_size)

        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1],
                                    rnn_hidden_size,
                                    bias=fc_bias)

        self.critic_fc = nn.Linear(max_len + rnn_hidden_size, 1)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

        self.critic_valueDiff_fc = nn.Linear(1, 2)
        self.relu = nn.ReLU(inplace=True)
        self.softmax = nn.Softmax(dim=1)

        self.move_fc = nn.Linear(img_fc_dim[-1],
                                 img_fc_dim[-1] + opts.tiled_len)

        self.num_predefined_action = 1
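
Each constructor also calls build_mlp(**proj_navigable_kwargs) to project navigable image features. The helper is not shown in these snippets; the following is one plausible sketch that accepts the keyword names used above (input_dim, hidden_dims, use_batchnorm, dropout, fc_bias, relu). Nothing beyond those keyword names is taken from the original code.

import torch.nn as nn

def build_mlp(input_dim, hidden_dims, use_batchnorm=False,
              dropout=0.0, fc_bias=True, relu=True):
    # Stack Linear (+ optional BatchNorm1d / ReLU / Dropout) blocks,
    # one block per entry of hidden_dims.
    layers = []
    dims = [input_dim] + list(hidden_dims)
    for i in range(len(dims) - 1):
        layers.append(nn.Linear(dims[i], dims[i + 1], bias=fc_bias))
        if use_batchnorm:
            layers.append(nn.BatchNorm1d(dims[i + 1]))
        if relu:
            layers.append(nn.ReLU(inplace=True))
        if dropout > 0:
            layers.append(nn.Dropout(p=dropout))
    return nn.Sequential(*layers)
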
Example #3
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, device='cpu'):
        super(TransformerModel, self).__init__()
        self.device = device
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, self.device, dropout).to(self.device)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout).to(self.device)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers).to(self.device)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        # Note: with CrossEntropyLoss, the decoder output should not also be
        # passed through log_softmax; either switch to NLLLoss or remove the
        # log_softmax.

        self.init_weights()
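
The note in Example #3 boils down to the fact that nn.CrossEntropyLoss already combines LogSoftmax and NLLLoss, so applying log_softmax to the decoder output before it would apply the log-softmax twice. A minimal standalone check:

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)           # raw decoder outputs (batch, ntoken)
targets = torch.randint(0, 10, (4,))  # gold token ids

ce = nn.CrossEntropyLoss()(logits, targets)
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), targets)
assert torch.allclose(ce, nll)  # CrossEntropyLoss == log_softmax + NLLLoss
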
Example #4
    def __init__(
        self,
        opts,
        img_fc_dim,
        img_fc_use_batchnorm,
        img_dropout,
        img_feat_input_dim,
        rnn_hidden_size,
        rnn_dropout,
        max_len,
        film_size=2048,
        fc_bias=True,
        max_navigable=16,
        conv_hidden=2048,
        num_resblocks=8,
    ):
        super(SelfMonitoring, self).__init__()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size
        self.max_len = max_len

        proj_navigable_kwargs = {
            "input_dim": img_feat_input_dim,
            "hidden_dims": img_fc_dim,
            "use_batchnorm": img_fc_use_batchnorm,
            "dropout": img_dropout,
            "fc_bias": fc_bias,
            "relu": opts.mlp_relu,
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size,
                                rnn_hidden_size)

        self.lang_position = PositionalEncoding(rnn_hidden_size,
                                                dropout=0.1,
                                                max_len=max_len)

        self.logit_fc = nn.Linear(film_size, img_fc_dim[-1])

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1],
                                    rnn_hidden_size,
                                    bias=fc_bias)

        if opts.monitor_sigmoid:
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Sigmoid())
        else:
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Tanh())

        self.num_predefined_action = 1

        # EDIT: add FiLM
        self.resnet = torch.hub.load("pytorch/vision:v0.5.0",
                                     "resnet152",
                                     pretrained=True)
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-2])

        self.film_gen = FiLMGenerator(
            context_size=rnn_hidden_size,
            num_resblocks=num_resblocks,
            conv_hidden=conv_hidden,
        )

        self.film = FiLMedResBlocks(
            num_blocks=num_resblocks,
            conv_hidden=conv_hidden,
            with_batch_norm=True,
        )

        self.film_tail = nn.AdaptiveAvgPool2d(1)
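
Example #4 conditions ResNet feature maps with FiLMGenerator and FiLMedResBlocks, whose definitions are not part of this snippet. FiLM (feature-wise linear modulation) scales and shifts each channel of a feature map with gamma and beta vectors predicted from a conditioning vector (here, presumably the LSTM hidden state). The layer below is a minimal illustrative sketch of that idea, not the referenced implementation.

import torch.nn as nn

class FiLMLayer(nn.Module):
    """Minimal FiLM: modulate conv features with (gamma, beta) from a context vector."""

    def __init__(self, context_size, num_channels):
        super().__init__()
        # One linear layer predicts a gamma and a beta for every channel.
        self.to_gamma_beta = nn.Linear(context_size, 2 * num_channels)

    def forward(self, feature_map, context):
        # feature_map: (B, C, H, W); context: (B, context_size)
        gamma, beta = self.to_gamma_beta(context).chunk(2, dim=1)
        gamma = gamma.unsqueeze(-1).unsqueeze(-1)  # (B, C, 1, 1)
        beta = beta.unsqueeze(-1).unsqueeze(-1)
        return gamma * feature_map + beta
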
Example #5
    def __init__(self,
                 opts,
                 img_fc_dim,
                 img_fc_use_batchnorm,
                 img_dropout,
                 img_feat_input_dim,
                 rnn_hidden_size,
                 rnn_dropout,
                 max_len,
                 fc_bias=True,
                 max_navigable=16):
        super(SelfMonitoring, self).__init__()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size
        self.max_len = max_len

        proj_navigable_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias,
            'relu': opts.mlp_relu
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        # NOTE: the first LSTMCell is immediately overridden by the second and
        # never used; only the 2587-dimensional input variant is kept.
        # self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size,
        #                         rnn_hidden_size)
        self.lstm = nn.LSTMCell(2587, rnn_hidden_size)

        self.lang_position = PositionalEncoding(rnn_hidden_size,
                                                dropout=0.1,
                                                max_len=max_len)

        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])
        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1],
                                    rnn_hidden_size,
                                    bias=fc_bias)

        if opts.monitor_sigmoid:  # not the branch used in this setup
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Sigmoid())
        else:  # the branch used in this setup
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Tanh())

        self.num_predefined_action = 1
        self.object_t_size = 17  # object_size
        self.place_t_size = 10  # place_Size

        self.object_attention_layer = ObjectDotAttention(
            rnn_hidden_size, self.object_t_size)
        self.place_attention_layer = VisualSoftDotAttention(
            rnn_hidden_size, self.place_t_size)
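
For orientation, a constructor call consistent with Example #5's signature might look like the sketch below. It assumes the supporting modules (build_mlp, PositionalEncoding, SoftAttention, ObjectDotAttention, VisualSoftDotAttention) are importable, and every numeric value, import path, and opts field shown is illustrative rather than taken from the original configuration.

from types import SimpleNamespace
# from model import SelfMonitoring  # hypothetical import path

# Illustrative values only; the real options and dimensions come from the
# project's argument parser and feature extractor, not from these snippets.
opts = SimpleNamespace(mlp_relu=True, monitor_sigmoid=False)

agent = SelfMonitoring(
    opts=opts,
    img_fc_dim=[128],            # hidden_dims for build_mlp; last entry feeds the attention/LSTM
    img_fc_use_batchnorm=False,
    img_dropout=0.5,
    img_feat_input_dim=2176,     # per-view image feature dimension (assumed)
    rnn_hidden_size=512,
    rnn_dropout=0.5,
    max_len=80,
)
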