def __getitem__(self, idx):
    idx = self.movie_idx[idx]
    F = self.data[idx][2]
    A = self.data[idx][1]
    T = self.data[idx][3]
    y = self.data[idx][4]
    combined = np.hstack([F, A, T])  # shape: (timestamps, sum of modality dims)

    # Convert to torch tensors
    F = torch.Tensor(F)
    A = torch.Tensor(A)
    T = torch.Tensor(T)
    # y = torch.Tensor(y)

    # Instantiate fusion classes
    FA = fusions.Block([F.shape[1], A.shape[1]], T.shape[1])
    FAT = fusions.Block([T.shape[1], T.shape[1]],
                        F.shape[1] + A.shape[1] + T.shape[1])

    # Compute fusions
    temp_output_FA = FA([F, A])
    final_FAT = FAT([temp_output_FA, T])
    # return final_FAT.cpu().detach().numpy(), y
    return combined, y, F, A, T
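# Note (illustrative, not from the original code): the Block layers above are
# re-created inside __getitem__, so their weights are freshly randomized on
# every call and `final_FAT` is never trained. A minimal sketch of the usual
# pattern, with a hypothetical BlockFusionModel and assumed dimensions, builds
# the fusion once in a module and reuses it:
import torch
import torch.nn as nn
from block import fusions  # block.bootstrap.pytorch

class BlockFusionModel(nn.Module):
    def __init__(self, dim_f, dim_a, dim_t):
        super().__init__()
        # Constructed once, so the bilinear parameters are shared across samples
        self.fa = fusions.Block([dim_f, dim_a], dim_t)
        self.fat = fusions.Block([dim_t, dim_t], dim_f + dim_a + dim_t)

    def forward(self, F, A, T):
        fa = self.fa([F, A])      # fuse the first two modalities
        return self.fat([fa, T])  # fuse the intermediate result with the third

# Usage with assumed example dimensions
model = BlockFusionModel(dim_f=128, dim_a=64, dim_t=300)
out = model(torch.randn(4, 128), torch.randn(4, 64), torch.randn(4, 300))
print(out.shape)  # torch.Size([4, 492])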
def __init__(self, image_model, fusion_method, config):
    super(TextImageGazeBertEncoder, self).__init__()
    self.hidden_size = config.hidden_size
    self.fusion_method = fusion_method

    # Select the text/image fusion operator
    if fusion_method == 'concat':
        self.fc = nn.Linear(self.hidden_size * 2, self.hidden_size)
    elif fusion_method == 'mcb':
        self.fusion = fusions.MCB([self.hidden_size, self.hidden_size], self.hidden_size)
    elif fusion_method == 'mlb':
        self.fusion = fusions.MLB([self.hidden_size, self.hidden_size], self.hidden_size)
    elif fusion_method == 'mutan':
        self.fusion = fusions.Mutan([self.hidden_size, self.hidden_size], self.hidden_size)
    elif fusion_method == 'block':
        self.fusion = fusions.Block([self.hidden_size, self.hidden_size], self.hidden_size)

    # Image encoder that also consumes gaze input
    if image_model == 'vgg':
        from model.vgg import VggEncoder
        self.image_gaze_encoder = VggEncoder(self.hidden_size, gaze=True)
    elif image_model == 'resnet':
        from model.resnet import ResNetEncoder
        self.image_gaze_encoder = ResNetEncoder(self.hidden_size, gaze=True)

    from model.bert import BertEncoder
    self.text_encoder = BertEncoder(config)

    # Learnable hidden_size x hidden_size matrix, Xavier-initialized
    M = torch.FloatTensor(self.hidden_size, self.hidden_size)
    init.xavier_normal_(M)
    self.M = nn.Parameter(M, requires_grad=True)
def __init__(self, cfg):
    super(RelationVKG, self).__init__()
    q_dim = cfg['rnn_dim'] * 2 if cfg['rnn_bidirection'] else cfg['rnn_dim']

    self.w_emb = WordEmbedding(cfg['n_vocab'], cfg['word_embedding_dim'])
    self.w_emb.init_embedding(cfg['word_dic_file'], cfg['embedding_file'])
    self.q_emb = QuestionEmbedding(cfg['word_embedding_dim'], cfg['rnn_dim'],
                                   cfg['rnn_layer'], cfg['rnn_type'],
                                   keep_seq=True,
                                   bidirectional=cfg['rnn_bidirection'])
    self.reasoning_net = ReasoningUnit(cfg['v_dim'], q_dim, cfg['rel_dim'],
                                       cfg['node_att_hid_dim'],
                                       gat_att_hid_dim=cfg['gat_att_hid_dim'],
                                       gat_out_dim=cfg['v_dim'],
                                       gat_n_att=cfg['gat_n_att'],
                                       gat_multi_head_type="concat",
                                       que_self_att_enable=cfg['ques_self_att_enable'],
                                       node_att_enable=cfg['node_att_enable'],
                                       gat_enable=cfg['gat_enable'],
                                       spatial_feature_enable=cfg['spatial_feature_enable'],
                                       recurrent=cfg['recurrent'],
                                       dropout=cfg['dropout'],
                                       wn=cfg['wn'])

    # Select the visual/question fusion operator
    if cfg['fuse_type'] == 'LinearSum':
        self.fuse_net = fusions.LinearSum([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                          dropout_input=cfg['dropout'])
    elif cfg['fuse_type'] == 'MFB':
        self.fuse_net = fusions.MFB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    dropout_input=cfg['dropout'])
    elif cfg['fuse_type'] == 'MLB':
        self.fuse_net = fusions.MLB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=2 * cfg['fused_dim'],
                                    dropout_input=cfg['dropout'])
    elif cfg['fuse_type'] == 'BLOCK':
        self.fuse_net = fusions.Block([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                      mm_dim=2 * cfg['fused_dim'],
                                      dropout_input=cfg['dropout'])

    self.classifier = SimpleClassifier(cfg['fused_dim'], cfg['classifier_hid_dim'],
                                       cfg['classes'], 0.5)
def __init__(self, image_model, fusion_method, id_to_vec, emb_size, vocab_size,
             config, device='cuda:0'):
    super(TextImageTransformerEncoder, self).__init__()
    self.hidden_size = config.hidden_size
    self.fusion_method = fusion_method

    if fusion_method == 'concat':
        self.fc = nn.Linear(self.hidden_size * 2, self.hidden_size)
    elif fusion_method == 'mcb':
        self.fusion = fusions.MCB([self.hidden_size, self.hidden_size], self.hidden_size)
    elif fusion_method == 'mlb':
        self.fusion = fusions.MLB([self.hidden_size, self.hidden_size], self.hidden_size)
    elif fusion_method == 'mutan':
        self.fusion = fusions.Mutan([self.hidden_size, self.hidden_size], self.hidden_size)
    elif fusion_method == 'block':
        self.fusion = fusions.Block([self.hidden_size, self.hidden_size], self.hidden_size)

    if image_model == 'vgg':
        from model.vgg import VggEncoder
        self.image_encoder = VggEncoder(self.hidden_size)
    elif image_model == 'resnet':
        from model.resnet import ResNetEncoder
        self.image_encoder = ResNetEncoder(self.hidden_size)

    from model.transformer import TransformerEncoder
    self.context_encoder = TransformerEncoder(id_to_vec, emb_size, vocab_size, config, device)
    self.response_encoder = TransformerEncoder(id_to_vec, emb_size, vocab_size, config, device)

    M = torch.FloatTensor(self.hidden_size, self.hidden_size)
    init.xavier_normal_(M)
    self.M = nn.Parameter(M, requires_grad=True)
def __init__(self, vid_encoder, qns_encoder, ans_decoder, max_len_v, max_len_q, device):
    """
    Reasoning with Heterogeneous Graph Alignment for Video Question Answering (AAAI 2020)
    """
    super(HGA, self).__init__()
    self.vid_encoder = vid_encoder
    self.qns_encoder = qns_encoder
    self.ans_decoder = ans_decoder
    self.max_len_v = max_len_v
    self.max_len_q = max_len_q
    self.device = device

    hidden_size = vid_encoder.dim_hidden
    input_dropout_p = vid_encoder.input_dropout_p

    self.q_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)
    self.v_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)

    self.co_attn = CoAttention(hidden_size, n_layers=vid_encoder.n_layers,
                               dropout_p=input_dropout_p)
    self.adj_learner = AdjLearner(hidden_size, hidden_size, dropout=input_dropout_p)
    self.gcn = GCN(hidden_size, hidden_size, hidden_size, num_layers=2,
                   dropout=input_dropout_p)
    self.gcn_atten_pool = nn.Sequential(
        nn.Linear(hidden_size, hidden_size // 2),
        nn.Tanh(),
        nn.Linear(hidden_size // 2, 1),
        nn.Softmax(dim=-1))  # dim=-2 for attention pooling, otherwise sum pooling

    self.global_fusion = fusions.Block([hidden_size, hidden_size], hidden_size,
                                       dropout_input=input_dropout_p)
    self.fusion = fusions.Block([hidden_size, hidden_size], hidden_size)
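# Standalone shape check for the Block fusions above (illustrative; the hidden
# size of 512 and the pooled inputs are assumed example values, not taken from
# the original model):
import torch
from block import fusions

hidden_size = 512
global_fusion = fusions.Block([hidden_size, hidden_size], hidden_size)
q_global = torch.randn(8, hidden_size)  # e.g., a pooled question representation
v_global = torch.randn(8, hidden_size)  # e.g., a pooled video representation
fused = global_fusion([q_global, v_global])
print(fused.shape)  # torch.Size([8, 512])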
def __getitem__(self, idx):
    idx = self.movie_idx[idx]
    F = self.new_data[idx][1]
    Va = self.new_data[idx][2]
    emb_desc = self.new_data[idx][3]
    emb_sit = self.new_data[idx][4]
    emb_sce = self.new_data[idx][5]
    emb_trans = self.new_data[idx][6]
    y = self.new_data[idx][7]
    combined = np.hstack([F, Va, emb_desc, emb_sit, emb_sce, emb_trans])

    # Convert to torch tensors
    F = torch.Tensor(F)
    Va = torch.Tensor(Va)
    emb_desc = torch.Tensor(emb_desc)
    emb_sit = torch.Tensor(emb_sit)
    emb_sce = torch.Tensor(emb_sce)
    emb_trans = torch.Tensor(emb_trans)

    # Instantiate fusion classes
    fusion1 = fusions.Block([F.shape[1], Va.shape[1]], emb_desc.shape[1])
    fusion2 = fusions.Block([emb_desc.shape[1], emb_desc.shape[1]],
                            F.shape[1] + Va.shape[1] + emb_desc.shape[1])
    fusion3 = fusions.Block([emb_sit.shape[1], emb_sce.shape[1]], emb_trans.shape[1])
    fusion4 = fusions.Block([emb_trans.shape[1], emb_trans.shape[1]],
                            emb_sit.shape[1] + emb_sce.shape[1] + emb_trans.shape[1])

    # Compute fusions
    temp_output_fusion1 = fusion1([F, Va])
    first_three = fusion2([temp_output_fusion1, emb_desc])
    temp_output_fusion2 = fusion3([emb_sit, emb_sce])
    second_three = fusion4([temp_output_fusion2, emb_trans])

    fusion5 = fusions.Block([first_three.shape[1], second_three.shape[1]],
                            first_three.shape[1] + second_three.shape[1])
    final_fused = fusion5([first_three, second_three])
    # Note: final_fused is computed but never returned; as in the __getitem__
    # above, the per-call Block layers are randomly initialized and untrained.
    return combined, y, F, Va, emb_desc, emb_sit, emb_sce, emb_trans
def __init__(self, config, max_num_region=200, chunks=50, default_gpu=True, dropout_prob=0.1):
    super(Vilbert, self).__init__(config)
    self.vilbert = BertModel(config)
    # Fuse flattened region features (max_num_region * v_hidden_size) with a
    # bi_hidden_size vector, producing one output value per region. Note: the
    # `chunks` constructor argument is shadowed by the hard-coded chunks=100.
    self.fusion = fusions.Block(
        input_dims=[max_num_region * config.v_hidden_size, config.bi_hidden_size],
        output_dim=max_num_region,
        mm_dim=512,
        chunks=100)
    self.dropout = nn.Dropout(dropout_prob)
    self.vision_logit = nn.Linear(config.v_hidden_size, 1)
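# Standalone shape sketch for the fusion above (all dimension values here are
# assumed examples, not taken from the original config):
import torch
from block import fusions

max_num_region, v_hidden, bi_hidden = 200, 64, 768
fusion = fusions.Block(input_dims=[max_num_region * v_hidden, bi_hidden],
                       output_dim=max_num_region, mm_dim=512, chunks=100)
regions = torch.randn(2, max_num_region * v_hidden)  # flattened region features
pooled = torch.randn(2, bi_hidden)                   # cross-modal summary vector
scores = fusion([regions, pooled])
print(scores.shape)  # torch.Size([2, 200]) -- one value per region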
def __init__(self, vocab_size, s_layers, s_bidirectional, s_rnn_cell, s_embedding,
             resnet_input_size, c3d_input_size, v_layers, v_bidirectional,
             v_rnn_cell, hidden_size, dropout_p=0.0, gcn_layers=2, num_heads=8,
             answer_vocab_size=None, q_max_len=35, v_max_len=80, tf_layers=2,
             two_loss=False, fusion_type='none', ablation='none'):
    super().__init__()
    self.model_name = 'TwoLSTMandBlock'
    self.task = 'none'
    self.tf_layers = tf_layers
    self.two_loss = two_loss
    self.fusion_type = fusion_type
    self.ablation = ablation
    v_input_size = resnet_input_size
    self.q_max_len = q_max_len
    self.v_max_len = v_max_len

    self.dropout = nn.Dropout(p=dropout_p)
    self.q_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)
    self.v_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)

    self.sentence_encoder = SentenceEncoderRNN(vocab_size, hidden_size,
                                               input_dropout_p=dropout_p,
                                               dropout_p=dropout_p,
                                               n_layers=s_layers,
                                               bidirectional=s_bidirectional,
                                               rnn_cell=s_rnn_cell,
                                               embedding=s_embedding)
    self.compress_c3d = nlpnn.WeightDropLinear(c3d_input_size, resnet_input_size,
                                               weight_dropout=dropout_p, bias=False)
    # self.video_fusion = fusions.Block(
    #     [v_input_size, v_input_size], v_input_size)
    self.video_fusion = nlpnn.WeightDropLinear(2 * v_input_size, v_input_size,
                                               weight_dropout=dropout_p, bias=False)
    self.video_encoder = VideoEncoderRNN(v_input_size, hidden_size,
                                         input_dropout_p=dropout_p,
                                         dropout_p=dropout_p,
                                         n_layers=v_layers,
                                         bidirectional=v_bidirectional,
                                         rnn_cell=v_rnn_cell)
    self.transformer_encoder = SelfTransformerEncoder(hidden_size, tf_layers,
                                                      dropout_p, vocab_size,
                                                      q_max_len, v_max_len,
                                                      embedding=s_embedding,
                                                      position=True)
    # ! masked
    # Note: the num_heads parameter is unused in this constructor; the
    # attention modules below hard-code 8 heads.
    self.crossover_transformer = MaskedCrossoverTransformer(q_max_len=q_max_len,
                                                            v_max_len=v_max_len,
                                                            num_heads=8,
                                                            num_layers=tf_layers,
                                                            dropout=dropout_p)
    self.q_transformer = SelfTransformer(q_max_len, num_heads=8,
                                         num_layers=tf_layers,
                                         dropout=dropout_p, position=False)
    self.v_transformer = SelfTransformer(v_max_len, num_heads=8,
                                         num_layers=tf_layers,
                                         dropout=dropout_p, position=False)
    self.q_selfattn = SelfAttention(hidden_size, n_layers=tf_layers,
                                    dropout_p=dropout_p)
    self.v_selfattn = SelfAttention(hidden_size, n_layers=tf_layers,
                                    dropout_p=dropout_p)
    self.co_attn = CoAttention(hidden_size, n_layers=tf_layers,
                               dropout_p=dropout_p)
    self.single_attn_semantic = SingleAttention(hidden_size, n_layers=tf_layers,
                                                dropout_p=dropout_p)
    self.single_attn_visual = SingleAttention(hidden_size, n_layers=tf_layers,
                                              dropout_p=dropout_p)
    self.co_concat_attn = CoConcatAttention(hidden_size, n_layers=tf_layers,
                                            dropout_p=dropout_p)
    self.co_siamese_attn = CoSiameseAttention(hidden_size, n_layers=tf_layers,
                                              dropout_p=dropout_p)

    self.crossover_fusion = fusions.Block([hidden_size, hidden_size], hidden_size,
                                          dropout_input=dropout_p)
    self.adj_learner = AdjLearner(hidden_size, hidden_size, dropout=dropout_p)
    # self.evo_adj_learner = EvoAdjLearner(
    #     hidden_size, hidden_size, dropout=dropout_p)
    self.gcn = GCN(hidden_size, hidden_size, hidden_size,
                   num_layers=gcn_layers, dropout=dropout_p)
    self.gcn_atten_pool = nn.Sequential(
        nn.Linear(hidden_size, hidden_size // 2),
        nn.Tanh(),
        nn.Linear(hidden_size // 2, 1),
        nn.Softmax(dim=-1))
    self.video_adj_learner = AdjLearner(v_input_size, v_input_size,
                                        dropout=dropout_p)
    self.video_gcn = GCN(v_input_size, v_input_size, v_input_size,
                         num_layers=1, dropout=dropout_p)
    self.video_coattn = CoAttention(v_input_size, n_layers=1, dropout_p=dropout_p)
    self.global_fusion = fusions.Block([hidden_size, hidden_size], hidden_size,
                                       dropout_input=dropout_p)

    if answer_vocab_size is not None:
        # Open-ended setting: fuse to logits over the answer vocabulary
        self.fusion = fusions.Block([hidden_size, hidden_size], answer_vocab_size)
        self.fc_fusion = nn.Linear(hidden_size, answer_vocab_size)
    else:
        # Otherwise produce a single score
        self.fusion = fusions.Block([hidden_size, hidden_size], 1)
        self.fc_fusion = nn.Linear(hidden_size, 1)
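# Illustrative usage of the answer head above (the forward logic here is an
# assumption for demonstration, not the original model's forward pass):
import torch
from block import fusions

hidden_size, answer_vocab_size = 256, 1000
fusion = fusions.Block([hidden_size, hidden_size], answer_vocab_size)
q_summary = torch.randn(4, hidden_size)  # pooled question features
v_summary = torch.randn(4, hidden_size)  # pooled video features
logits = fusion([q_summary, v_summary])
print(logits.shape)  # torch.Size([4, 1000]) -- one logit per candidate answer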