def __init__(self, args):
    """Build the question+image / history / dialog LSTM stack.

    Args:
        args: namespace with vocab_size, embed_size, img_feature_size,
            rnn_hidden_size, num_layers, dropout.
    """
    super().__init__()
    self.args = args
    # BUG FIX: removed stray debug statement `print(args.dropout)`.
    self.word_embed = nn.Embedding(args.vocab_size, args.embed_size,
                                   padding_idx=0)
    # Question tokens are concatenated with image features at every step.
    self.ques_img_rnn = nn.LSTM(args.embed_size + args.img_feature_size,
                                args.rnn_hidden_size, args.num_layers,
                                batch_first=True, dropout=args.dropout)
    self.hist_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                            args.num_layers, batch_first=True,
                            dropout=args.dropout)
    # Dialog-level RNN consumes [question+image ; history] states.
    self.dialog_rnn = nn.LSTM(args.rnn_hidden_size * 2, args.rnn_hidden_size,
                              args.num_layers, batch_first=True,
                              dropout=args.dropout)
    # Inputs are right-padded variable-length sequences; DynamicRNN
    # handles packing/unpacking.
    self.ques_img_rnn = DynamicRNN(self.ques_img_rnn)
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.dialog_rnn = DynamicRNN(self.dialog_rnn)
def __init__(self, args):
    """Build the late-fusion encoder: history/question LSTMs + linear fusion.

    Args:
        args: namespace with vocab_size, embed_size, rnn_hidden_size,
            num_layers, dropout, img_feature_size, weight_init.
    """
    super().__init__()
    self.args = args
    self.word_embed = nn.Embedding(args.vocab_size, args.embed_size,
                                   padding_idx=0)
    self.hist_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                            args.num_layers, batch_first=True,
                            dropout=args.dropout)
    self.ques_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                            args.num_layers, batch_first=True,
                            dropout=args.dropout)
    self.dropout = nn.Dropout(p=args.dropout)
    # questions and history are right padded sequences of variable length
    # use the DynamicRNN utility module to handle them properly
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)
    # fusion layer: concatenated [image ; question ; history] -> hidden
    fusion_size = args.img_feature_size + args.rnn_hidden_size * 2
    self.fusion = nn.Linear(fusion_size, args.rnn_hidden_size)
    # FIX: the non-in-place init names (nn.init.xavier_uniform, etc.) are
    # deprecated and removed in modern PyTorch; use the trailing-underscore
    # in-place variants instead.
    if args.weight_init == 'xavier':
        nn.init.xavier_uniform_(self.fusion.weight)
    elif args.weight_init == 'kaiming':
        nn.init.kaiming_uniform_(self.fusion.weight)
    nn.init.constant_(self.fusion.bias, 0)
def __init__(self, args):
    """Set up embeddings, sentence/history LSTMs, bottom-up attention and
    a stack of REFER layers.

    Args:
        args: namespace with vocab_size, embed_size, hidden_size, dropout.
    """
    super().__init__()
    self.args = args
    self.bn = nn.BatchNorm1d(args.hidden_size)
    # 300-d word vectors, no embedding dropout.
    self.word_embed = WordEmbedding(args.vocab_size, 300, .0)
    # Two-layer LSTMs over variable-length padded sequences, wrapped in
    # DynamicRNN for packing support.
    self.sent_embed = DynamicRNN(
        nn.LSTM(args.embed_size, args.hidden_size, 2,
                dropout=args.dropout, batch_first=True))
    self.hist_embed = DynamicRNN(
        nn.LSTM(args.embed_size, args.hidden_size, 2,
                dropout=args.dropout, batch_first=True))
    # Bottom-up attention over 2048-d region features.
    self.bup_att = FIND(2048, 1024, 1024)
    self.q_net = FCNet([1024, 1024])
    self.v_net = FCNet([2048, 1024])
    self.linear = nn.Linear(args.hidden_size * 2, args.hidden_size)
    # Two stacked REFER blocks.
    self.layer_stack = nn.ModuleList(
        REFER(d_model=512, d_inner=1024, n_head=4,
              d_k=256, d_v=256, dropout=0.2)
        for _ in range(2))
def __init__(self, args):
    """Build an image-attention encoder over a 49x512 conv feature grid.

    Args:
        args: namespace with vocab_size, embed_size, rnn_hidden_size,
            num_layers, dropout; args.img_feature_size is overwritten here.
    """
    super().__init__()
    self.args = args
    img_num_features = 512  # channels per spatial location
    img_size = 49           # flattened spatial grid (presumably 7x7)
    self.args.img_feature_size = (img_size, img_num_features)
    self.word_embed = nn.Embedding(args.vocab_size, args.embed_size,
                                   padding_idx=0)
    self.ques_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                            args.num_layers, batch_first=True,
                            dropout=args.dropout)
    self.ques_rnn = DynamicRNN(self.ques_rnn)
    self.hist_q_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                              args.num_layers, batch_first=True,
                              dropout=args.dropout)
    self.hist_q_rnn = DynamicRNN(self.hist_q_rnn)
    # NOTE(review): hist_a_rnn is not wrapped in DynamicRNN unlike its
    # siblings — confirm whether answer sequences are fixed-length.
    self.hist_a_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                              args.num_layers, batch_first=True,
                              dropout=args.dropout)
    self.qa_fusion = nn.Linear(args.rnn_hidden_size * 2, args.rnn_hidden_size)
    self.hist_rnn = nn.LSTM(args.rnn_hidden_size, args.rnn_hidden_size,
                            args.num_layers, batch_first=True,
                            dropout=args.dropout)
    self.qh_fusion = nn.Linear(args.rnn_hidden_size * 2, args.rnn_hidden_size)
    self.c_projection = nn.Linear(args.rnn_hidden_size, args.rnn_hidden_size)
    self.f_projection = nn.Linear(img_num_features, args.rnn_hidden_size)
    # Transpose + Matrix Multiplication Stuff in Forward
    # Softmax stuff in forward (nn.softmax)
    # Elementwise multiplication to get attended image
    # Currently f^att_t, alpha_t, c_t
    self.final_concat_size = img_num_features + img_size + args.rnn_hidden_size
    # BUG FIX: original passed bare `final_concat_size` (undefined name,
    # NameError at construction); must read the attribute set above.
    self.final_fusion = nn.Linear(self.final_concat_size,
                                  args.rnn_hidden_size)
def __init__(self, args):
    """Build the question encoder (BERT or LSTM) and the audio-visual
    fusion layer.

    Args:
        args: namespace with lang_model, dropout, vocab_size, embed_size,
            rnn_hidden_size, num_layers, audio_feature_size,
            visual_feature_size.
    """
    super().__init__()
    self.args = args
    self.dropout = nn.Dropout(p=args.dropout)
    if self.args.lang_model == 'bert':
        # Pretrained BERT encodes the question directly.
        self.ques_net = BertModel.from_pretrained("bert-base-uncased",
                                                  return_dict=True)
    else:
        # Word embedding + packed LSTM over right-padded questions.
        self.word_embed = nn.Embedding(args.vocab_size, args.embed_size,
                                       padding_idx=0)
        question_lstm = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                                args.num_layers, batch_first=True,
                                dropout=args.dropout)
        self.ques_net = DynamicRNN(question_lstm)
    # Fuse [audio ; visual ; question] into one hidden vector.
    fusion_size = (args.audio_feature_size
                   + args.visual_feature_size
                   + args.rnn_hidden_size)
    self.fusion = nn.Linear(fusion_size, args.rnn_hidden_size)
    self.weight_init()
def __init__(self, args, encoder):
    """Discriminative decoder: scores answer options with an LSTM.

    Args:
        args: namespace with embed_size, rnn_hidden_size.
        encoder: encoder module whose word embedding is shared here.
    """
    super().__init__()
    self.args = args
    # Share the word embedding with the encoder.
    self.word_embed = encoder.word_embed
    # Options are variable-length right-padded sequences; wrap the LSTM
    # in DynamicRNN so they are packed correctly.
    self.option_rnn = DynamicRNN(
        nn.LSTM(args.embed_size, args.rnn_hidden_size, batch_first=True))
    self.log_softmax = nn.LogSoftmax(dim=1)
def __init__(self, args, encoder):
    """Decoder that embeds answer options with a 512-unit LSTM.

    Args:
        args: namespace with embed_size, dropout.
        encoder: encoder module whose word embedding is shared here.
    """
    super().__init__()
    self.args = args
    # Share the word embedding with the encoder.
    self.word_embed = encoder.word_embed
    # FIX: the original passed dropout=args.dropout to a single-layer
    # LSTM; PyTorch applies dropout only between stacked layers, so it
    # was a no-op that merely raised a UserWarning. Dropped for identical
    # behavior without the warning.
    self.opt_embed = nn.LSTM(args.embed_size, 512, batch_first=True)
    self.opt_embed = DynamicRNN(self.opt_embed)
    self.log_softmax = nn.LogSoftmax(dim=1)
def __init__(self, args):
    """Hierarchical encoder: per-turn question+image and history LSTMs
    feeding a dialog-level LSTM.

    Args:
        args: namespace with img_feature_size, img_embed_size, vocab_size,
            embed_size, rnn_hidden_size, num_layers, dropout.
    """
    super().__init__()
    self.args = args
    # Project raw image features down to the embedding space.
    self.img_embed = nn.Linear(args.img_feature_size, args.img_embed_size)
    self.word_embed = nn.Embedding(args.vocab_size, args.embed_size,
                                   padding_idx=0)
    # Question tokens concatenated with the image embedding per step.
    ques_img_feature_size = args.embed_size + args.img_embed_size
    self.ques_img_rnn = DynamicRNN(
        nn.LSTM(ques_img_feature_size, args.rnn_hidden_size,
                args.num_layers, batch_first=True, dropout=args.dropout))
    self.hist_rnn = DynamicRNN(
        nn.LSTM(args.embed_size, args.rnn_hidden_size, args.num_layers,
                batch_first=True, dropout=args.dropout))
    # The original paper used no fusion layer between the two encoders,
    # so none is created here; the dialog RNN consumes the raw
    # concatenation of both hidden states.
    # NOTE(review): unlike the other RNNs, dialog_rnn is not wrapped in
    # DynamicRNN — confirm dialog sequences are fixed length.
    self.dialog_rnn = nn.LSTM(args.rnn_hidden_size * 2,
                              args.rnn_hidden_size, args.num_layers,
                              batch_first=True, dropout=args.dropout)
def __init__(self, args, encoder):
    """Decoder that scores options, sharing language modules with the
    encoder.

    Args:
        args: namespace with lang_model, embed_size, rnn_hidden_size.
        encoder: encoder module providing ques_net (BERT path) or
            word_embed (LSTM path).
    """
    super().__init__()
    self.args = args
    if args.lang_model == 'bert':
        # In the original paper the decoder and encoder share the word
        # embedding but keep separate RNNs. It is unclear whether
        # assigning encoder.ques_net.embeddings onto a fresh BertModel
        # would truly share the embedding layer, e.g.:
        #   self.option_net = BertModel.from_pretrained(
        #       'bert-base-uncased', return_dict=True)
        #   self.option_net.embeddings = encoder.ques_net.embeddings
        # so the whole question network is reused instead.
        self.option_net = encoder.ques_net
    else:
        # Share only the embedding; options get their own packed LSTM.
        self.word_embed = encoder.word_embed
        self.option_net = DynamicRNN(
            nn.LSTM(args.embed_size, args.rnn_hidden_size,
                    batch_first=True))