def __init__(self, image_model, fusion_method, config):
    """Joint encoder of BERT text features and gaze-conditioned image features.

    Args:
        image_model: image backbone selector, 'vgg' or 'resnet'.
        fusion_method: multimodal fusion selector, one of
            'concat', 'mcb', 'mlb', 'mutan', 'block'.
        config: BERT-style config object; only ``config.hidden_size`` is
            read here (the rest is passed through to ``BertEncoder``).

    Raises:
        ValueError: if ``fusion_method`` or ``image_model`` is unrecognized.
            (The previous code silently left ``self.fusion`` /
            ``self.image_gaze_encoder`` unset, failing later at forward time.)
    """
    super(TextImageGazeBertEncoder, self).__init__()
    self.hidden_size = config.hidden_size
    self.fusion_method = fusion_method
    if fusion_method == 'concat':
        # Plain concatenation followed by a projection back to hidden_size.
        self.fc = nn.Linear(self.hidden_size * 2, self.hidden_size)
    else:
        # Bilinear-style fusion operators from the `fusions` package; all
        # take [in_dim_a, in_dim_b] and an output dim.
        fusion_classes = {
            'mcb': fusions.MCB,
            'mlb': fusions.MLB,
            'mutan': fusions.Mutan,
            'block': fusions.Block,
        }
        if fusion_method not in fusion_classes:
            raise ValueError('Unsupported fusion_method: {!r}'.format(fusion_method))
        self.fusion = fusion_classes[fusion_method](
            [self.hidden_size, self.hidden_size], self.hidden_size)
    # Image/gaze encoder backbone (imported lazily so only the chosen
    # backbone's dependencies are loaded).
    if image_model == 'vgg':
        from model.vgg import VggEncoder
        self.image_gaze_encoder = VggEncoder(self.hidden_size, gaze=True)
    elif image_model == 'resnet':
        from model.resnet import ResNetEncoder
        self.image_gaze_encoder = ResNetEncoder(self.hidden_size, gaze=True)
    else:
        raise ValueError('Unsupported image_model: {!r}'.format(image_model))
    from model.bert import BertEncoder
    self.text_encoder = BertEncoder(config)
    # Learnable bilinear interaction matrix between the two modalities,
    # Xavier-initialized.
    M = torch.FloatTensor(self.hidden_size, self.hidden_size)
    init.xavier_normal_(M)
    self.M = nn.Parameter(M, requires_grad=True)
def __init__(self, image_model, fusion_method, id_to_vec, emb_size, vocab_size, config, device='cuda:0'):
    """Joint encoder of transformer text features and image features.

    Builds separate transformer encoders for dialogue context and response,
    an image encoder, a fusion operator, and a bilinear matching matrix.

    Args:
        image_model: image backbone selector, 'vgg' or 'resnet'.
        fusion_method: multimodal fusion selector, one of
            'concat', 'mcb', 'mlb', 'mutan', 'block'.
        id_to_vec: token-id -> embedding-vector mapping for the transformers.
        emb_size: word embedding dimension.
        vocab_size: vocabulary size.
        config: config object; only ``config.hidden_size`` is read here.
        device: torch device string passed to the transformer encoders.

    Raises:
        ValueError: if ``fusion_method`` or ``image_model`` is unrecognized.
            (The previous code silently left ``self.fusion`` /
            ``self.image_encoder`` unset, failing later at forward time.)
    """
    super(TextImageTransformerEncoder, self).__init__()
    self.hidden_size = config.hidden_size
    self.fusion_method = fusion_method
    if fusion_method == 'concat':
        # Plain concatenation followed by a projection back to hidden_size.
        self.fc = nn.Linear(self.hidden_size * 2, self.hidden_size)
    else:
        # Bilinear-style fusion operators from the `fusions` package.
        fusion_classes = {
            'mcb': fusions.MCB,
            'mlb': fusions.MLB,
            'mutan': fusions.Mutan,
            'block': fusions.Block,
        }
        if fusion_method not in fusion_classes:
            raise ValueError('Unsupported fusion_method: {!r}'.format(fusion_method))
        self.fusion = fusion_classes[fusion_method](
            [self.hidden_size, self.hidden_size], self.hidden_size)
    # Image encoder backbone (imported lazily so only the chosen backbone's
    # dependencies are loaded).
    if image_model == 'vgg':
        from model.vgg import VggEncoder
        self.image_encoder = VggEncoder(self.hidden_size)
    elif image_model == 'resnet':
        from model.resnet import ResNetEncoder
        self.image_encoder = ResNetEncoder(self.hidden_size)
    else:
        raise ValueError('Unsupported image_model: {!r}'.format(image_model))
    from model.transformer import TransformerEncoder
    # Separate (non-shared) encoders for context and response.
    self.context_encoder = TransformerEncoder(id_to_vec, emb_size, vocab_size, config, device)
    self.response_encoder = TransformerEncoder(id_to_vec, emb_size, vocab_size, config, device)
    # Learnable bilinear matching matrix, Xavier-initialized.
    M = torch.FloatTensor(self.hidden_size, self.hidden_size)
    init.xavier_normal_(M)
    self.M = nn.Parameter(M, requires_grad=True)
def __init__(self, cfg):
    """Relational VQA model: word/question embedding -> graph reasoning -> fusion -> classifier.

    Args:
        cfg: dict of hyper-parameters. Keys read here include 'rnn_dim',
            'rnn_bidirection', 'n_vocab', 'word_embedding_dim', 'v_dim',
            'fuse_type', 'fused_dim', 'dropout', 'classes', and the
            reasoning-unit options.

    Raises:
        ValueError: if ``cfg['fuse_type']`` is not one of 'LinearSum',
            'MFB', 'MLB', 'BLOCK'. (The previous independent ``if`` chain
            had no else branch and silently left ``self.fuse_net`` unset.)
    """
    super(RelationVKG, self).__init__()
    # A bidirectional RNN doubles the question feature dimension.
    q_dim = cfg['rnn_dim'] * 2 if cfg['rnn_bidirection'] else cfg['rnn_dim']
    self.w_emb = WordEmbedding(cfg['n_vocab'], cfg['word_embedding_dim'])
    self.w_emb.init_embedding(cfg['word_dic_file'], cfg['embedding_file'])
    # keep_seq=True: downstream reasoning attends over per-token states.
    self.q_emb = QuestionEmbedding(cfg['word_embedding_dim'], cfg['rnn_dim'],
                                   cfg['rnn_layer'], cfg['rnn_type'],
                                   keep_seq=True,
                                   bidirectional=cfg['rnn_bidirection'])
    self.reasoning_net = ReasoningUnit(
        cfg['v_dim'], q_dim, cfg['rel_dim'], cfg['node_att_hid_dim'],
        gat_att_hid_dim=cfg['gat_att_hid_dim'],
        gat_out_dim=cfg['v_dim'],
        gat_n_att=cfg['gat_n_att'],
        gat_multi_head_type="concat",
        que_self_att_enable=cfg['ques_self_att_enable'],
        node_att_enable=cfg['node_att_enable'],
        gat_enable=cfg['gat_enable'],
        spatial_feature_enable=cfg['spatial_feature_enable'],
        recurrent=cfg['recurrent'],
        dropout=cfg['dropout'],
        wn=cfg['wn'])
    # Multimodal fusion operator; fail fast on unknown types instead of
    # deferring an AttributeError to forward time.
    fuse_type = cfg['fuse_type']
    if fuse_type == 'LinearSum':
        self.fuse_net = fusions.LinearSum([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                          dropout_input=cfg['dropout'])
    elif fuse_type == 'MFB':
        self.fuse_net = fusions.MFB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    dropout_input=cfg['dropout'])
    elif fuse_type == 'MLB':
        self.fuse_net = fusions.MLB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=2 * cfg['fused_dim'],
                                    dropout_input=cfg['dropout'])
    elif fuse_type == 'BLOCK':
        self.fuse_net = fusions.Block([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                      mm_dim=2 * cfg['fused_dim'],
                                      dropout_input=cfg['dropout'])
    else:
        raise ValueError('Unsupported fuse_type: {!r}'.format(fuse_type))
    self.classifier = SimpleClassifier(cfg['fused_dim'], cfg['classifier_hid_dim'],
                                       cfg['classes'], 0.5)
def __init__(self, cfg):
    """Bottom-up attention VQA baseline: question embedding + top-down attention -> fusion -> classifier.

    Args:
        cfg: dict of hyper-parameters. Keys read here include 'rnn_dim',
            'rnn_bidirection', 'n_vocab', 'word_embedding_dim', 'v_dim',
            'fuse_type', 'fused_dim', 'dropout', 'classes'.

    Raises:
        ValueError: if ``cfg['fuse_type']`` is not one of 'LinearSum',
            'MFB', 'MLB', 'MFH', 'MCB'. (The previous independent ``if``
            chain had no else branch and silently left ``self.fuse_net``
            unset.)
    """
    super(BottomUp, self).__init__()
    self.cfg = cfg
    # A bidirectional RNN doubles the question feature dimension.
    q_dim = cfg['rnn_dim'] * 2 if cfg['rnn_bidirection'] else cfg['rnn_dim']
    self.w_emb = WordEmbedding(cfg['n_vocab'], cfg['word_embedding_dim'])
    self.w_emb.init_embedding(cfg['word_dic_file'], cfg['embedding_file'])
    # keep_seq=False: only the final question state is used for attention.
    self.q_emb = QuestionEmbedding(cfg['word_embedding_dim'], cfg['rnn_dim'],
                                   cfg['rnn_layer'], cfg['rnn_type'],
                                   keep_seq=False,
                                   bidirectional=cfg['rnn_bidirection'])
    self.v_att = NewAttention(cfg['v_dim'], q_dim, cfg['fused_dim'])
    # Multimodal fusion operator; fail fast on unknown types instead of
    # deferring an AttributeError to forward time.
    fuse_type = cfg['fuse_type']
    if fuse_type == 'LinearSum':
        self.fuse_net = fusions.LinearSum([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                          dropout_input=cfg['dropout'])
    elif fuse_type == 'MFB':
        self.fuse_net = fusions.MFB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=1000, factor=5,
                                    dropout_input=cfg['dropout'])
    elif fuse_type == 'MLB':
        self.fuse_net = fusions.MLB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=2 * cfg['fused_dim'],
                                    dropout_input=cfg['dropout'])
    elif fuse_type == 'MFH':
        self.fuse_net = fusions.MFH([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=1000, factor=5,
                                    dropout_input=cfg['dropout'])
    elif fuse_type == 'MCB':
        # NOTE: uses the dedicated compact-bilinear-pooling package rather
        # than fusions.MCB (the fusions variant was previously tried and
        # left commented out).
        from compact_bilinear_pooling import CompactBilinearPooling
        self.fuse_net = CompactBilinearPooling(cfg['v_dim'], q_dim, cfg['fused_dim'])
    else:
        raise ValueError('Unsupported fuse_type: {!r}'.format(fuse_type))
    self.classifier = SimpleClassifier(cfg['fused_dim'], cfg['classifier_hid_dim'],
                                       cfg['classes'], 0.5)