def __init__(self,
             txt_enc={},
             self_q_att=False,
             n_step=3,
             shared=False,
             cell={},
             agg={},
             classif={},
             wid_to_word={},
             word_to_wid={},
             aid_to_ans=[],
             ans_to_aid={}):
    super(MuRelNet, self).__init__()
    self.self_q_att = self_q_att
    self.n_step = n_step
    self.shared = shared
    self.cell = cell
    self.agg = agg
    assert self.agg['type'] in ['max', 'mean']
    self.classif = classif
    self.wid_to_word = wid_to_word
    self.word_to_wid = word_to_wid
    self.aid_to_ans = aid_to_ans
    self.ans_to_aid = ans_to_aid

    # Modules
    self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
    if self.self_q_att:
        self.q_att_linear0 = nn.Linear(2400, 512)
        self.q_att_linear1 = nn.Linear(512, 2)

    # Either one shared cell applied n_step times, or one cell per step.
    if self.shared:
        self.cell = MuRelCell(**cell)
    else:
        self.cells = nn.ModuleList(
            [MuRelCell(**cell) for i in range(self.n_step)])

    if 'fusion' in self.classif:
        self.classif_module = block.factory_fusion(self.classif['fusion'])
    elif 'mlp' in self.classif:
        self.classif_module = MLP(self.classif['mlp'])
    else:
        raise ValueError(self.classif.keys())

    Logger().log_value('nparams',
                       sum(p.numel() for p in self.parameters() if p.requires_grad),
                       should_print=True)
    Logger().log_value('nparams_txt_enc',
                       self.get_nparams_txt_enc(),
                       should_print=True)

    self.buffer = None
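# Illustrative sketch (not from the source): how the agg['type'] option asserted
# above typically collapses per-region cell outputs into a single vector. Tensor
# sizes are hypothetical; the real aggregation happens in this class's forward().
import torch

mm = torch.randn(8, 36, 2048)       # (batch, n_regions, dim) after the MuRel cells
agg_type = 'max'                    # must be 'max' or 'mean', as asserted above
if agg_type == 'max':
    mm_agg = mm.max(dim=1)[0]       # keep the strongest region per feature
else:
    mm_agg = mm.mean(dim=1)         # average over regions
print(mm_agg.shape)                 # torch.Size([8, 2048])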
def __init__(
    self,
    txt_enc={},
    dim_q=2400,
    max_ans=15,
    self_q_att=False,
    wid_to_word={},
    word_to_wid={},
    aid_to_ans=[],
    ans_to_aid={},
    output="classification",
):
    super().__init__()
    self.self_q_att = self_q_att
    self.wid_to_word = wid_to_word
    self.word_to_wid = word_to_wid
    self.aid_to_ans = aid_to_ans
    self.ans_to_aid = ans_to_aid
    self.max_ans = max_ans
    self.output = output

    if self.output == "classification":
        self.classifier = nn.Sequential(
            nn.Linear(2048 + dim_q, 1024),
            nn.ReLU(),
            nn.Linear(1024, len(aid_to_ans)),
        )
    elif self.output == "regression":
        self.classifier = nn.Sequential(
            nn.Linear(2048 + dim_q, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1),
        )
    else:
        # Fail fast instead of silently leaving self.classifier undefined.
        raise ValueError(self.output)

    # Modules
    self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
    if self.self_q_att:
        self.q_att_linear0 = nn.Linear(dim_q, 512)
        self.q_att_linear1 = nn.Linear(512, 2)

    Logger().log_value(
        "nparams",
        sum(p.numel() for p in self.parameters() if p.requires_grad),
        should_print=True,
    )
    Logger().log_value("nparams_txt_enc", self.get_nparams_txt_enc(), should_print=True)

    self.buffer = None
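# Illustrative sketch (not from the source): the two heads built above differ
# only in their last layer. Sizes are hypothetical (dim_q=2400, 10 answers).
import torch
import torch.nn as nn

fused = torch.randn(8, 2048 + 2400)  # concatenated (visual, question) features
clf = nn.Sequential(nn.Linear(2048 + 2400, 1024), nn.ReLU(), nn.Linear(1024, 10))
reg = nn.Sequential(nn.Linear(2048 + 2400, 1024), nn.ReLU(), nn.Linear(1024, 1))
print(clf(fused).shape)  # torch.Size([8, 10]) -> logits over the answer set
print(reg(fused).shape)  # torch.Size([8, 1])  -> one scalar (e.g. a count)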
def __init__(self,
             txt_enc={},
             q_max_length=14,
             glimpse=2,
             objects=36,
             feat_dims={},
             biattention={},
             wid_to_word={},
             word_to_wid={},
             aid_to_ans=[],
             ans_to_aid={}):
    super(BanNet, self).__init__()
    self.glimpse = glimpse
    self.q_max_length = q_max_length
    self.objects = objects
    self.wid_to_word = wid_to_word
    self.word_to_wid = word_to_wid
    self.aid_to_ans = aid_to_ans
    self.ans_to_aid = ans_to_aid

    # Modules
    # The skip-thoughts text encoder is built here.
    self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
    self.v_att = BilinearAttentionMap(**feat_dims, glimpse=glimpse)

    # One bilinear pooling + projection pair per glimpse.
    self.b_net = []
    self.q_prj = []
    self.c_prj = []
    for i in range(glimpse):
        self.b_net.append(
            LowLankBilinearPooling(**feat_dims, h_out=None, k=1))
        self.q_prj.append(
            FCLayer(feat_dims['h_dim'], feat_dims['h_dim'], '', .2))
        self.c_prj.append(
            FCLayer(objects + 1, feat_dims['h_dim'], 'ReLU', .0))
    self.b_net = nn.ModuleList(self.b_net)
    self.q_prj = nn.ModuleList(self.q_prj)
    self.c_prj = nn.ModuleList(self.c_prj)

    self.classifier = Classifier(
        feat_dims['h_dim'], feat_dims['h_dim'] * 2, 3000, 0.5)

    Logger().log_value('nparams',
                       sum(p.numel() for p in self.parameters() if p.requires_grad),
                       should_print=True)
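# Illustrative sketch (not from the source): the per-glimpse residual update
# that b_net/q_prj lists like the ones above are typically used for in
# BAN-style models. The joint feature is a stand-in for the bilinear pooling
# output b_net[g](v, q, att[:, g]); all shapes are hypothetical.
import torch
import torch.nn as nn

h_dim, glimpse = 1024, 2
q = torch.randn(8, 14, h_dim)                 # question hidden states
q_prj = nn.ModuleList([nn.Linear(h_dim, h_dim) for _ in range(glimpse)])
for g in range(glimpse):
    joint = torch.randn(8, h_dim)             # stand-in for the pooled joint feature
    q = q + q_prj[g](joint).unsqueeze(1)      # residual update, broadcast over words
print(q.shape)                                # torch.Size([8, 14, 1024])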
def get_text_enc(self, vocab_words, options):
    """Return the text encoding network.

    Note: builds the encoder from self.wid_to_word; the vocab_words
    argument is currently unused.
    """
    return factory_text_enc(self.wid_to_word, options)
def __init__(
    self,
    txt_enc={},
    self_q_att=False,
    max_ans=None,
    # model
    add_coords=True,
    self_attention_vision=True,
    fusion_vision=True,
    num_heads_self_att=None,
    intermediate_dim_self_att=None,
    hidden_dim=2048,
    # fusion
    fusion_mm_dim=1200,
    fusion_activ="relu",
    attention_scaling=1.0,
    residual_fusion=False,
    # output
    output=None,
    output_on="final",
    output_params={},
    wid_to_word={},
    word_to_wid={},
    aid_to_ans=[],
    ans_to_aid={},
):
    super().__init__()
    self.self_q_att = self_q_att
    self.wid_to_word = wid_to_word
    self.word_to_wid = word_to_wid
    self.aid_to_ans = aid_to_ans
    self.ans_to_aid = ans_to_aid
    self.max_ans = max_ans
    self.attention_scaling = attention_scaling
    self.add_coords = add_coords
    self.self_attention_vision = self_attention_vision
    self.fusion_vision = fusion_vision
    self.residual_fusion = residual_fusion
    self.output_on = output_on
    self.output = output
    self.output_params = output_params

    # Modules
    self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
    if self.self_q_att:
        self.q_att_linear0 = nn.Linear(2400, 512)
        self.q_att_linear1 = nn.Linear(512, 2)

    self.q_projection = nn.Linear(2400, hidden_dim)
    self.hidden_dim = hidden_dim
    if hidden_dim != 2048:
        self.v_projection = nn.Linear(2048, hidden_dim)

    def transformer_block():
        return TransformerBlock(
            hidden_dim,
            hidden_dim,
            hidden_dim,
            num_heads=num_heads_self_att,
            intermediate_dim=intermediate_dim_self_att,
        )

    def fusion_block():
        return MLB(
            [hidden_dim, hidden_dim],
            hidden_dim,
            mm_dim=fusion_mm_dim,
            activ_input=fusion_activ,
            activ_output=fusion_activ,
            normalize=True,
            dropout_input=0.1,
            dropout_pre_lin=0.0,
            dropout_output=0.0,
        )

    # Different modules
    if self.self_attention_vision:
        self.self_attention_vision_block = nn.ModuleList([transformer_block()])
        Logger().log_value(
            "self-att-vision.nparams",
            self.get_nparams(self.self_attention_vision_block),
            should_print=True,
        )
    if self.fusion_vision:
        self.vi_fusion_blocks = nn.ModuleList([fusion_block()])
        Logger().log_value(
            "vi_fusion_blocks.nparams",
            self.get_nparams(self.vi_fusion_blocks),
            should_print=True,
        )

    # Text
    self.question_cells = nn.ModuleList(
        [nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())])

    self.output_module = RegressionFusionOutput(**output_params)

    if self.add_coords:
        self.lin_coords = nn.Linear(4, hidden_dim)

    Logger().log_value(
        "output.nparams",
        self.get_nparams(self.output_module),
        should_print=True,
    )
    Logger().log_value(
        "nparams",
        self.get_nparams(self),
        should_print=True,
    )
    Logger().log_value("nparams_txt_enc", self.get_nparams_txt_enc(), should_print=True)
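# Illustrative sketch (not from the source): the two-glimpse question
# self-attention that q_att_linear0/q_att_linear1 above are sized for.
# Hypothetical tensor sizes; the real forward pass is defined elsewhere.
import torch
import torch.nn as nn
import torch.nn.functional as F

q = torch.randn(8, 14, 2400)                        # (batch, n_words, 2400)
lin0, lin1 = nn.Linear(2400, 512), nn.Linear(512, 2)
alpha = F.softmax(lin1(F.relu(lin0(q))), dim=1)     # (8, 14, 2): one map per glimpse
q_att = torch.cat([(alpha[:, :, g:g + 1] * q).sum(1) for g in range(2)], dim=1)
print(q_att.shape)  # torch.Size([8, 4800]): the two attended vectors, concatenated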