def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
             rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
    super(Configuring, self).__init__()

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.max_navigable = max_navigable
    self.feature_size = img_feat_input_dim
    self.hidden_size = rnn_hidden_size
    self.max_len = max_len

    proj_navigable_kwargs = {
        'input_dim': img_feat_input_dim,
        'hidden_dims': img_fc_dim,
        'use_batchnorm': img_fc_use_batchnorm,
        'dropout': img_dropout,
        'fc_bias': fc_bias,
        'relu': opts.mlp_relu
    }
    self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

    self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
    self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

    self.soft_attn = SoftAttention()
    self.dropout = nn.Dropout(p=rnn_dropout)

    self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)
    self.lang_position = PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=max_len)

    self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])
    self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

    self.r_linear = nn.Linear(rnn_hidden_size + 128, 2)
    self.sm = nn.Softmax(dim=1)

    self.num_predefined_action = 1

    self.state_attention = StateAttention()
    self.config_fc = nn.Linear(768, 512, bias=False)

    if opts.monitor_sigmoid:
        self.critic = nn.Sequential(
            # nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Linear(10 + rnn_hidden_size, 1),
            nn.Sigmoid()
        )
    else:
        self.critic = nn.Sequential(
            # nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Linear(10 + rnn_hidden_size, 1),
            nn.Tanh()
        )
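# ---------------------------------------------------------------------------
# Hedged sketch: every constructor in this file projects navigable-direction
# features through a build_mlp helper that is defined elsewhere in the repo.
# The sketch below is one plausible version, assuming the helper simply stacks
# Linear (+ optional BatchNorm1d / ReLU / Dropout) layers with the keyword
# arguments used above; the actual implementation may differ.
# ---------------------------------------------------------------------------
import torch.nn as nn

def build_mlp_sketch(input_dim, hidden_dims, use_batchnorm=False,
                     dropout=0.0, fc_bias=True, relu=True):
    """Stack Linear layers of sizes input_dim -> hidden_dims[0] -> ... -> hidden_dims[-1]."""
    layers = []
    in_dim = input_dim
    for out_dim in hidden_dims:
        layers.append(nn.Linear(in_dim, out_dim, bias=fc_bias))
        if use_batchnorm:
            layers.append(nn.BatchNorm1d(out_dim))
        if relu:
            layers.append(nn.ReLU(inplace=True))
        if dropout > 0:
            layers.append(nn.Dropout(p=dropout))
        in_dim = out_dim
    return nn.Sequential(*layers)

# Hypothetical usage (feature sizes are illustrative only):
# proj_navigable_mlp = build_mlp_sketch(2176, [128], dropout=0.5, relu=True)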
def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
             rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
    super(Regretful, self).__init__()

    self.opts = opts
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.max_navigable = max_navigable

    proj_navigable_kwargs = {
        'input_dim': img_feat_input_dim,
        'hidden_dims': img_fc_dim,
        'use_batchnorm': img_fc_use_batchnorm,
        'dropout': img_dropout,
        'fc_bias': fc_bias
    }
    self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

    self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1])
    self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

    self.positional_encoding = PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=max_len)

    self.soft_attn = SoftAttention()
    self.dropout = nn.Dropout(p=rnn_dropout)

    self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)

    self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])
    self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

    self.critic_fc = nn.Linear(max_len + rnn_hidden_size, 1)
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()

    self.critic_valueDiff_fc = nn.Linear(1, 2)
    self.relu = nn.ReLU(inplace=True)
    self.softmax = nn.Softmax(dim=1)

    self.move_fc = nn.Linear(img_fc_dim[-1], img_fc_dim[-1] + opts.tiled_len)

    self.num_predefined_action = 1
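# ---------------------------------------------------------------------------
# Hedged sketch: SoftAttention() is shared by these constructors but defined
# elsewhere in the repo.  The sketch below shows one standard soft dot-product
# attention over a set of context vectors; the real module's interface (e.g.
# separately projected context, masking conventions) may differ.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

class SoftAttentionSketch(nn.Module):
    def forward(self, h, context, mask=None):
        # h:       (batch, dim)          query, e.g. an LSTM hidden state
        # context: (batch, seq_len, dim) candidate vectors to attend over
        # mask:    (batch, seq_len)      nonzero where positions are padded
        attn = torch.bmm(context, h.unsqueeze(2)).squeeze(2)          # (batch, seq_len)
        if mask is not None:
            attn = attn.masked_fill(mask.bool(), float('-inf'))
        attn = torch.softmax(attn, dim=1)
        weighted = torch.bmm(attn.unsqueeze(1), context).squeeze(1)   # (batch, dim)
        return weighted, attn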
def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
             rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
    super(ConfiguringObject, self).__init__()

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.max_navigable = max_navigable
    self.feature_size = img_feat_input_dim
    self.hidden_size = rnn_hidden_size

    proj_navigable_obj_kwargs = {
        'input_dim': 152,
        'hidden_dims': img_fc_dim,
        'use_batchnorm': img_fc_use_batchnorm,
        'dropout': img_dropout,
        'fc_bias': fc_bias,
        'relu': opts.mlp_relu
    }
    self.proj_navigable_obj_mlp = build_mlp(**proj_navigable_obj_kwargs)

    proj_navigable_img_kwargs = {
        'input_dim': img_feat_input_dim,
        'hidden_dims': img_fc_dim,
        'use_batchnorm': img_fc_use_batchnorm,
        'dropout': img_dropout,
        'fc_bias': fc_bias,
        'relu': opts.mlp_relu
    }
    self.proj_navigable_img_mlp = build_mlp(**proj_navigable_img_kwargs)

    self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=False)

    self.soft_attn = SoftAttention()
    self.state_attention = StateAttention()
    self.config_obj_attention = ConfigObjAttention()

    self.dropout = nn.Dropout(p=rnn_dropout)

    # self.lstm = nn.LSTMCell(img_fc_dim[-1] + 768, rnn_hidden_size)
    self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)

    self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=False)
    self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

    self.proj_out = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)

    # self.logit_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1])
    self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])

    self.r_linear = nn.Linear(rnn_hidden_size + 128, 4)
    self.image_linear = nn.Linear(img_feat_input_dim, img_fc_dim[-1])

    self.config_fc = nn.Linear(768, 512, bias=False)
    # self.config_atten_linear = nn.Linear(768, 128)
    self.config_atten_linear = nn.Linear(512, 128)

    self.sm = nn.Softmax(dim=1)

    if opts.monitor_sigmoid:
        self.critic = nn.Sequential(
            # nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Linear(10 + rnn_hidden_size, 1),
            nn.Sigmoid()
        )
    else:
        self.critic = nn.Sequential(
            # nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Linear(10 + rnn_hidden_size, 1),
            nn.Tanh()
        )

    # Fixed, non-trainable 4x2 transform.  (Variable is a no-op wrapper in
    # modern PyTorch; a plain tensor with requires_grad=False is equivalent.)
    self.r_transform = Variable(
        torch.tensor([[1, 0, 0.75, 0.5], [0, 1, 0.25, 0.5]]).transpose(0, 1),
        requires_grad=False)
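# ---------------------------------------------------------------------------
# Hedged sketch: the constructor above defines r_linear (4 logits) alongside a
# fixed 4x2 matrix r_transform, but the forward pass is not shown here.  One
# plausible reading, sketched purely as an assumption, is that a softmax over
# the 4 logits is projected through r_transform into a 2-way soft decision.
# ---------------------------------------------------------------------------
import torch

def soft_gate_sketch(r_logits, r_transform):
    # r_logits:    (batch, 4)  output of self.r_linear
    # r_transform: (4, 2)      the fixed matrix registered in the constructor
    probs = torch.softmax(r_logits, dim=1)        # (batch, 4)
    return probs @ r_transform.to(probs.device)   # (batch, 2) soft decision

# Example with the constructor's matrix (illustrative only):
# r_transform = torch.tensor([[1, 0, 0.75, 0.5], [0, 1, 0.25, 0.5]]).transpose(0, 1)
# gate = soft_gate_sketch(torch.randn(8, 4), r_transform)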
def __init__(
    self,
    opts,
    img_fc_dim,
    img_fc_use_batchnorm,
    img_dropout,
    img_feat_input_dim,
    rnn_hidden_size,
    rnn_dropout,
    max_len,
    film_size=2048,
    fc_bias=True,
    max_navigable=16,
    conv_hidden=2048,
    num_resblocks=8,
):
    super(SelfMonitoring, self).__init__()

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.max_navigable = max_navigable
    self.feature_size = img_feat_input_dim
    self.hidden_size = rnn_hidden_size
    self.max_len = max_len

    proj_navigable_kwargs = {
        "input_dim": img_feat_input_dim,
        "hidden_dims": img_fc_dim,
        "use_batchnorm": img_fc_use_batchnorm,
        "dropout": img_dropout,
        "fc_bias": fc_bias,
        "relu": opts.mlp_relu,
    }
    self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

    self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
    self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

    self.soft_attn = SoftAttention()
    self.dropout = nn.Dropout(p=rnn_dropout)

    self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)
    self.lang_position = PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=max_len)

    self.logit_fc = nn.Linear(film_size, img_fc_dim[-1])
    self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

    if opts.monitor_sigmoid:
        self.critic = nn.Sequential(
            nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Sigmoid())
    else:
        self.critic = nn.Sequential(
            nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Tanh())

    self.num_predefined_action = 1

    # EDIT: add FiLM
    self.resnet = torch.hub.load("pytorch/vision:v0.5.0", "resnet152", pretrained=True)
    self.resnet = nn.Sequential(*list(self.resnet.children())[:-2])
    self.film_gen = FiLMGenerator(
        context_size=rnn_hidden_size,
        num_resblocks=num_resblocks,
        conv_hidden=conv_hidden,
    )
    self.film = FiLMedResBlocks(
        num_blocks=num_resblocks,
        conv_hidden=conv_hidden,
        with_batch_norm=True,
    )
    self.film_tail = nn.AdaptiveAvgPool2d(1)
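# ---------------------------------------------------------------------------
# Hedged sketch: FiLMGenerator and FiLMedResBlocks are defined elsewhere in
# this repo.  The sketch below shows the core FiLM idea (Perez et al., 2018):
# a context vector predicts per-channel (gamma, beta) pairs that modulate
# convolutional features.  The block structure, normalisation choice, and
# conditioning input here are assumptions, not the repo's implementation.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

class FiLMGeneratorSketch(nn.Module):
    def __init__(self, context_size, num_resblocks, conv_hidden):
        super().__init__()
        # one (gamma, beta) pair per channel, per residual block
        self.fc = nn.Linear(context_size, num_resblocks * 2 * conv_hidden)
        self.num_resblocks = num_resblocks
        self.conv_hidden = conv_hidden

    def forward(self, context):
        # context: (batch, context_size) -> gammas, betas: (batch, num_blocks, conv_hidden)
        film = self.fc(context).view(-1, self.num_resblocks, 2, self.conv_hidden)
        return film[:, :, 0], film[:, :, 1]

class FiLMedResBlockSketch(nn.Module):
    def __init__(self, conv_hidden, with_batch_norm=True):
        super().__init__()
        self.conv = nn.Conv2d(conv_hidden, conv_hidden, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(conv_hidden, affine=False) if with_batch_norm else nn.Identity()

    def forward(self, x, gamma, beta):
        # x: (batch, C, H, W); gamma, beta: (batch, C)
        out = self.bn(self.conv(x))
        out = gamma.unsqueeze(-1).unsqueeze(-1) * out + beta.unsqueeze(-1).unsqueeze(-1)
        return x + torch.relu(out)   # residual connection around the modulated features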
def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
             rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
    super(SelfMonitoring, self).__init__()

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.max_navigable = max_navigable
    self.feature_size = img_feat_input_dim
    self.hidden_size = rnn_hidden_size
    self.max_len = max_len

    proj_navigable_kwargs = {
        'input_dim': img_feat_input_dim,
        'hidden_dims': img_fc_dim,
        'use_batchnorm': img_fc_use_batchnorm,
        'dropout': img_dropout,
        'fc_bias': fc_bias,
        'relu': opts.mlp_relu
    }
    self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

    self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
    self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

    self.soft_attn = SoftAttention()
    self.dropout = nn.Dropout(p=rnn_dropout)

    # The original code first built an LSTMCell with input size
    # img_fc_dim[-1] * 2 + rnn_hidden_size and then immediately overwrote it;
    # only the second definition (hard-coded input size 2587) takes effect.
    self.lstm = nn.LSTMCell(2587, rnn_hidden_size)

    self.lang_position = PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=max_len)

    self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])
    self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

    if opts.monitor_sigmoid:
        self.critic = nn.Sequential(
            nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Sigmoid())
    else:
        # default path: monitor_sigmoid is off, so the Tanh critic is used
        self.critic = nn.Sequential(
            nn.Linear(max_len + rnn_hidden_size, 1),
            nn.Tanh())

    self.num_predefined_action = 1

    self.object_t_size = 17  # object size
    self.place_t_size = 10   # place size
    self.object_attention_layer = ObjectDotAttention(rnn_hidden_size, self.object_t_size)
    self.place_attention_layer = VisualSoftDotAttention(rnn_hidden_size, self.place_t_size)
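# ---------------------------------------------------------------------------
# Hedged sketch: PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=...)
# is used for the language features in several constructors above but defined
# elsewhere.  The sketch below is the standard sinusoidal encoding from
# "Attention Is All You Need" (assuming an even d_model and a batch-first
# input); the repo's version may differ in buffer shape or dropout placement.
# ---------------------------------------------------------------------------
import math
import torch
import torch.nn as nn

class PositionalEncodingSketch(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=80):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))   # (1, max_len, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model) language features; add position info, then dropout
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)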