def __init__(self, config, vocabulary): super().__init__() self.config = config self.word_embed = nn.Embedding( len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX ) self.hist_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"] ) self.ques_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"] ) self.dropout = nn.Dropout(p=config["dropout"]) # questions and history are right padded sequences of variable length # use the DynamicRNN utility module to handle them properly self.hist_rnn = DynamicRNN(self.hist_rnn) self.ques_rnn = DynamicRNN(self.ques_rnn) # fusion layer fusion_size = config["img_feature_size"] + config["lstm_hidden_size"] * 2 self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"]) nn.init.kaiming_uniform_(self.fusion.weight) nn.init.constant_(self.fusion.bias, 0)
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.dropout = config["dropout"]  # note: reassigned to an nn.Dropout module below
    self.nhid = config["lstm_hidden_size"]
    self.img_feature_size = config["img_feature_size"]
    self.ninp = config["word_embedding_size"]
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.ques_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.dropout = nn.Dropout(p=config["dropout_fc"])
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    # question + caption attention on image
    self.Wq2 = nn.Linear(self.nhid, self.nhid)
    self.Wh2 = nn.Linear(self.nhid, self.nhid)
    self.Wi2 = nn.Linear(self.img_feature_size, self.nhid)
    self.Wall2 = nn.Linear(self.nhid, 1)

    # fusion
    self.Wq3 = nn.Linear(self.nhid, self.nhid)
    self.Wc3 = nn.Linear(self.nhid, self.nhid)
    self.fusion = nn.Linear(self.nhid * 2 + self.img_feature_size, self.nhid)

    # caption attention on image
    self.Wc4 = nn.Linear(self.nhid, self.nhid)
    self.Wi4 = nn.Linear(self.img_feature_size, self.nhid)
    self.Wall4 = nn.Linear(self.nhid, 1)

    self.q_multi1 = nn.Linear(self.nhid, self.nhid)
    self.q_multi2 = nn.Linear(self.nhid, 3)

    # question attention on history
    self.Wq1 = nn.Linear(self.nhid, self.nhid)
    self.Wh1 = nn.Linear(self.nhid, self.nhid)
    self.Wqh1 = nn.Linear(self.nhid, 1)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight.data)
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.vocabulary = vocabulary
    self.mcan_config = Cfgs()
    # ans embedding size
    # self.image_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])
    self.use_hist = config.get("use_hist", False)
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.word_embed_size_for_rnn = config["word_embedding_size"]
    self.ques_rnn = nn.LSTM(
        self.word_embed_size_for_rnn, config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True,
    )  # SA: removing dropout for mcan
    self.dropout = nn.Dropout(p=config["dropout"])

    # questions and history are right padded sequences of variable length
    # use the DynamicRNN utility module to handle them properly
    if self.use_hist:
        self.hist_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])
        self.hist_rnn = nn.LSTM(
            self.word_embed_size_for_rnn, config["lstm_hidden_size"],
            config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
        )
        self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    # project image features to lstm_hidden_size for computing attention
    self.image_features_projection = nn.Linear(
        config["img_feature_size"], config["lstm_hidden_size"]
    )

    fusion_size = config["lstm_hidden_size"] * 2
    self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

    nn.init.kaiming_uniform_(self.image_features_projection.weight)
    nn.init.constant_(self.image_features_projection.bias, 0)
    nn.init.kaiming_uniform_(self.fusion.weight)
    nn.init.constant_(self.fusion.bias, 0)
def __init__(self, config, vocabulary): super().__init__() self.config = config self.word_embed = nn.Embedding( len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX ) self.hist_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], bidirectional=True ) self.ques_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], bidirectional=True ) # questions and history are right padded sequences of variable length # use the DynamicRNN utility module to handle them properly self.hist_rnn = DynamicRNN(self.hist_rnn) self.ques_rnn = DynamicRNN(self.ques_rnn) # self attention for question self.Q_ATT_ans = Q_ATT(config) self.Q_ATT_ref = Q_ATT(config) # question-based history attention self.H_ATT_ans = H_ATT(config) # modules self.RvA_MODULE = RvA_MODULE(config) self.V_Filter = V_Filter(config) # fusion layer self.fusion = nn.Sequential( nn.Dropout(p=config["dropout_fc"]), nn.Linear( config["img_feature_size"] + config["word_embedding_size"] + config["lstm_hidden_size"] * 2, config["lstm_hidden_size"] ) ) # other useful functions self.softmax = nn.Softmax(dim=-1) # initialization for m in self.modules(): if isinstance(m, nn.Linear): nn.init.kaiming_uniform(m.weight.data) if m.bias is not None: nn.init.constant_(m.bias.data, 0)
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    use_glove = config.get("use_glove", False)
    print("encoder use_glove:{}".format(use_glove))
    if not use_glove:
        self.word_embed = nn.Embedding(
            len(vocabulary), config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
    else:
        self.word_embed = nn.Embedding.from_pretrained(vocabulary.get_vocab_emb_tensors())
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.ques_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.dropout = nn.Dropout(p=config["dropout"])

    # questions and history are right padded sequences of variable length
    # use the DynamicRNN utility module to handle them properly
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    # project image features to lstm_hidden_size for computing attention
    self.image_features_projection = nn.Linear(
        config["img_feature_size"], config["lstm_hidden_size"]
    )
    # fc layer for image * question to attention weights
    self.attention_proj = nn.Linear(config["lstm_hidden_size"], 1)

    # fusion layer (attended_image_features + question + history)
    fusion_size = config["img_feature_size"] + config["lstm_hidden_size"] * 2
    self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

    nn.init.kaiming_uniform_(self.image_features_projection.weight)
    nn.init.constant_(self.image_features_projection.bias, 0)
    nn.init.kaiming_uniform_(self.fusion.weight)
    nn.init.constant_(self.fusion.bias, 0)
def __init__(self, config, vocabulary, glove, elmo):
    super().__init__()
    self.config = config
    self.glove_embed = nn.Embedding(len(vocabulary), config["glove_embedding_size"])
    self.elmo_embed = nn.Embedding(len(vocabulary), config["elmo_embedding_size"])
    self.glove_embed.weight.data = glove
    self.elmo_embed.weight.data = elmo
    # self.glove_embed.weight.requires_grad = False
    self.elmo_embed.weight.requires_grad = False
    self.embed_change = nn.Linear(config["elmo_embedding_size"], config["word_embedding_size"])
    self.hist_rnn = nn.LSTM(
        config["glove_embedding_size"] + config["word_embedding_size"],
        config["lstm_hidden_size"], config["lstm_num_layers"],
        batch_first=True, dropout=config["dropout"],
    )
    self.ques_rnn = nn.LSTM(
        config["glove_embedding_size"] + config["word_embedding_size"],
        config["lstm_hidden_size"], config["lstm_num_layers"],
        batch_first=True, dropout=config["dropout"],
    )

    # questions and history are right padded sequences of variable length
    # use the DynamicRNN utility module to handle them properly
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    self.semantic_module = Semantic(config)
    self.visual_module = Visual(config)
    self.img_cap_layer = nn.Linear(
        config["img_feature_size"] + config["captionsize_todecoder"],
        config["img_feature_size"] + config["captionsize_todecoder"],
    )
    self.MemNetFusion = MemNet(config)
    self.dropout = nn.Dropout(p=config["dropout"])

    # initialization
    # NOTE: self.fusion is not created in this constructor; the next two calls
    # assume a `fusion` linear layer is defined elsewhere in the class.
    nn.init.kaiming_uniform_(self.fusion.weight)
    nn.init.constant_(self.fusion.bias, 0)
    nn.init.kaiming_uniform_(self.embed_change.weight)
    nn.init.constant_(self.embed_change.bias, 0)
    nn.init.kaiming_uniform_(self.img_cap_layer.weight)
    nn.init.constant_(self.img_cap_layer.bias, 0)
def __init__(self, config, vocabulary): super().__init__() self.config = config self.dropout = nn.Dropout(p=config["dropout"]) #self.word_embed = nn.Embedding(len(vocab), config["word_embedding_size"], padding_idx=vocab.PAD_INDEX) weights = get_pretrained_weights(vocabulary) self.word_embed = nn.Embedding.from_pretrained(weights) self.ques_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.hist_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.ques_rnn = DynamicRNN(self.ques_rnn) self.hist_rnn = DynamicRNN(self.hist_rnn) self.image_features_projection = nn.Linear(config["img_feature_size"], config["lstm_hidden_size"]) self.attention_proj = nn.Linear(config["lstm_hidden_size"], 1) fusion_size = (config["img_feature_size"] + config["lstm_hidden_size"] * 2) self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"]) nn.init.kaiming_uniform_(self.image_features_projection.weight) nn.init.constant_(self.image_features_projection.bias, 0) nn.init.kaiming_uniform_(self.fusion.weight) nn.init.constant_(self.fusion.bias, 0)
def __init__(self, config, vocabulary): super().__init__() self.config = config self.word_embed = nn.Embedding( len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX, ) self.hist_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.ques_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.dropout = nn.Dropout(p=config["dropout"]) # questions and history are right padded sequences of variable length # use the DynamicRNN utility module to handle them properly self.hist_rnn = DynamicRNN(self.hist_rnn) self.ques_rnn = DynamicRNN(self.ques_rnn) # project image features to lstm_hidden_size for computing attention self.image_features_projection = nn.Linear(config["img_feature_size"], config["lstm_hidden_size"]) # fc layer for image * question to attention weights self.attention_proj = nn.Linear(config["lstm_hidden_size"], 1) # fusion layer (attended_image_features + question + history) fusion_size = (config["img_feature_size"] + config["lstm_hidden_size"] * 2) self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"]) # block fusion opt = {} opt['type'] = 'block' opt['input_dims'] = [2048, 512] opt['output_dim'] = 2560 self.fusionb = factory_fusion(opt) nn.init.kaiming_uniform_(self.image_features_projection.weight) nn.init.constant_(self.image_features_projection.bias, 0)
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_enc_num_layers"], batch_first=True,
        dropout=config["lstm_dropout"], bidirectional=True,
    )
    self.ques_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_enc_num_layers"], batch_first=True,
        dropout=config["lstm_dropout"], bidirectional=True,
    )
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    self.util1 = UtilityLayer(
        hidden_dim=config["lstm_hidden_size"], feedforward_dim=2048, nhead=8, dropout=0.1
    )
    self.util2 = UtilityLayer(
        hidden_dim=config["lstm_hidden_size"], feedforward_dim=2048, nhead=8, dropout=0.1
    )
    self.summary_attn = SummaryAttn(
        dim=config["lstm_hidden_size"], num_attn=3, dropout=config["model_dropout"]
    )
    self.context_fusion = nn.Linear(2 * config["lstm_hidden_size"], config["lstm_hidden_size"])
    self.v_proj = nn.Linear(config["img_feature_size"], config["lstm_hidden_size"])
    self.j_proj = nn.Linear(config["lstm_hidden_size"], config["lstm_hidden_size"])
    self.q_proj = nn.Linear(config["lstm_hidden_size"] * 2, config["lstm_hidden_size"])
    self.h_proj = nn.Linear(config["lstm_hidden_size"] * 2, config["lstm_hidden_size"])
def __init__(self, config, vocabulary): super().__init__() self.config = config self.word_embed = nn.Embedding( len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX, ) self.hist_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.ques_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.dropout = nn.Dropout(p=config["dropout"]) # questions and history are right padded sequences of variable length # use the DynamicRNN utility module to handle them properly self.hist_rnn = DynamicRNN(self.hist_rnn) self.ques_rnn = DynamicRNN(self.ques_rnn) # fc layer for image * question to attention weights self.Wk = nn.Linear(config["lstm_hidden_size"], config["lstm_hidden_size"]) self.Wq = nn.Linear(config["lstm_hidden_size"], config["lstm_hidden_size"]) self.Wv = nn.Linear(config["lstm_hidden_size"], config["lstm_hidden_size"]) self.project_attention = nn.Linear(config["lstm_hidden_size"], 1) # fusion layer (attended_image_features + question + history) fusion_size = (config["lstm_hidden_size"] * 2) self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"]) #nn.init.kaiming_uniform_(self.image_features_projection.weight) #nn.init.constant_(self.image_features_projection.bias, 0) nn.init.kaiming_uniform_(self.fusion.weight) nn.init.constant_(self.fusion.bias, 0)
def __init__(self, config, vocabulary, bert_model=None, stage=1):
    super().__init__()
    self.config = config
    if config["word_embedding_type"] == "glove":
        self.word_embed = nn.Embedding.from_pretrained(vocabulary.get_vocab_emb_tensors())
    elif config["word_embedding_type"] == "bert":
        self.word_embed = BertRefEmbedding(bert_model)
    else:
        self.word_embed = nn.Embedding(
            len(vocabulary), config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
    self.option_rnn = nn.LSTM(
        config["word_embedding_size"], config["decoder_lstm_hidden_size"],
        num_layers=config["decoder_lstm_num_layers"], batch_first=True,
        dropout=config["decoder_lstm_dropout"],
    )
    # Options are variable length padded sequences, use DynamicRNN.
    self.option_rnn = DynamicRNN(self.option_rnn)
def __init__(self, config, vocabulary): super().__init__() self.config = config self.nhid = config["lstm_hidden_size"] self.word_embed = nn.Embedding( len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX, ) self.option_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) self.a2a = nn.Linear(self.nhid *2, self.nhid) self.option_rnn = DynamicRNN(self.option_rnn) path = "data/qt_scores.json" file = open(path, 'r') self.count_dict = json.loads(file.read()) file.close() file = open('data/qt_count.json','r') self.qt_file = json.loads(file.read()) self.qt_list = list(self.qt_file.keys()) file.close()
def __init__(self, config, vocabulary, glove, elmo):
    super().__init__()
    self.config = config
    self.glove_embed = nn.Embedding(len(vocabulary), config["glove_embedding_size"])
    self.elmo_embed = nn.Embedding(len(vocabulary), config["elmo_embedding_size"])
    self.glove_embed.weight.data = glove
    self.elmo_embed.weight.data = elmo
    # self.glove_embed.weight.requires_grad = False
    self.elmo_embed.weight.requires_grad = False
    self.embed_change = nn.Linear(config["elmo_embedding_size"], config["word_embedding_size"])
    self.option_rnn = nn.LSTM(
        config["glove_embedding_size"] + config["word_embedding_size"],
        config["lstm_hidden_size"], config["lstm_num_layers"],
        batch_first=True, dropout=config["dropout"],
    )
    self.option_rnn = DynamicRNN(self.option_rnn)
    self.dropout = nn.Dropout(p=config["dropout"])
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.dropout = config["dropout"]  # note: reassigned to an nn.Dropout module below
    self.nhid = config["lstm_hidden_size"]
    self.img_feature_size = config["img_feature_size"]
    self.ninp = config["word_embedding_size"]
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.option_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.dropout = nn.Dropout(p=config["dropout_fc"])
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.option_rnn = DynamicRNN(self.option_rnn)

    self.Wc = nn.Linear(self.nhid * 2, self.nhid)
    self.Wd = nn.Linear(self.nhid, self.nhid)
    self.Wall = nn.Linear(self.nhid, 1)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight.data)
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)

    # initialize the learnable user dict from precomputed answer features
    initial_path = "data/100ans_feature.npy"
    initial_answer_feat = np.load(initial_path)
    self.user_dict = nn.Parameter(torch.FloatTensor(initial_answer_feat))
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.option_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"], batch_first=True
    )
    # options are variable length padded sequences, use DynamicRNN
    self.option_rnn = DynamicRNN(self.option_rnn)
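# `DynamicRNN`, used throughout this file, wraps an LSTM so that right-padded,
# variable-length batches are packed before the recurrence. Its implementation is
# not part of this listing; the class below is a minimal illustrative sketch of
# that pattern (sort by length, pack, run the LSTM, restore the original order),
# not the project's DynamicRNN.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class PackedRNNSketch(nn.Module):
    def __init__(self, rnn):
        super().__init__()
        self.rnn = rnn

    def forward(self, seq_input, seq_lens):
        # seq_input: (batch, max_len, emb), seq_lens: (batch,) true lengths
        sorted_lens, fwd_order = torch.sort(seq_lens, descending=True)
        _, bwd_order = torch.sort(fwd_order)  # inverse permutation
        packed = pack_padded_sequence(
            seq_input[fwd_order], sorted_lens.cpu(), batch_first=True
        )
        outputs, (h_n, c_n) = self.rnn(packed)
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)
        # restore the original batch order before returning
        return outputs[bwd_order], (h_n[:, bwd_order], c_n[:, bwd_order])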
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_enc_num_layers"], batch_first=True,
        dropout=config["lstm_dropout"], bidirectional=True,
    )
    self.ques_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_enc_num_layers"], batch_first=True,
        dropout=config["lstm_dropout"], bidirectional=True,
    )
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    self.backbone = BackboneNetwork(dropout=config["model_dropout"])

    self.v_proj = weight_norm(
        nn.Linear(config["img_feature_size"], config["lstm_hidden_size"]), dim=None
    )
    self.j_proj = weight_norm(
        nn.Linear(config["lstm_hidden_size"], config["lstm_hidden_size"]), dim=None
    )
    self.q_proj = weight_norm(
        nn.Linear(config["lstm_hidden_size"] * 2, config["lstm_hidden_size"]), dim=None
    )
    self.h_proj = weight_norm(
        nn.Linear(config["lstm_hidden_size"] * 2, config["lstm_hidden_size"]), dim=None
    )
def __init__(self, config, vocab):
    super().__init__()
    self.config = config
    # Getting pretrained weights
    weights = get_pretrained_weights(vocab)
    self.word_embed = nn.Embedding.from_pretrained(weights)
    # Discriminative decoder generates scores for each option
    self.option_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.option_rnn = DynamicRNN(self.option_rnn)
def __init__(self, config, vocabulary): super().__init__() self.config = config self.word_embed = nn.Embedding( len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX, ) self.option_rnn = nn.LSTM( config["word_embedding_size"], config["lstm_hidden_size"], config["lstm_num_layers"], batch_first=True, dropout=config["dropout"], ) # self.a2a = nn.Linear(self.nhid * 2, self.nhid) # this is useless in this version # Options are variable length padded sequences, use DynamicRNN. self.option_rnn = DynamicRNN(self.option_rnn)
def __init__(self, config):
    super().__init__()
    self.config = config
    self.cap_rnn = nn.LSTM(
        config["glove_embedding_size"] + config["word_embedding_size"],
        config["lstm_hidden_size"], config["lstm_num_layers"],
        batch_first=True, dropout=config["dropout"],
    )
    self.cap_rnn = DynamicRNN(self.cap_rnn)

    self.ques_push = nn.Linear(config["lstm_hidden_size"], config["ques_change_num"])
    self.caption_push = nn.Linear(config["lstm_hidden_size"], config["caption_change_num"])
    self.caption_gate = nn.Linear(
        int(2 * config["lstm_hidden_size"]), int(2 * config["lstm_hidden_size"])
    )
    self.caption_dim_change = nn.Linear(
        int(2 * config["lstm_hidden_size"]), config["lstm_hidden_size"]
    )
    self.dropout = nn.Dropout(p=config["dropout"])

    nn.init.kaiming_uniform_(self.ques_push.weight)
    nn.init.constant_(self.ques_push.bias, 0)
    nn.init.kaiming_uniform_(self.caption_push.weight)
    nn.init.constant_(self.caption_push.bias, 0)
    nn.init.kaiming_uniform_(self.caption_gate.weight)
    nn.init.constant_(self.caption_gate.bias, 0)
    nn.init.kaiming_uniform_(self.caption_dim_change.weight)
    nn.init.constant_(self.caption_dim_change.bias, 0)
def __init__(self, config, vocabulary, num_rounds=10):
    """
    SA: TODO: num_rounds is hardcoded to 10 for now. The mask is defined in
    __init__ for speed, since it is static; ideally it should live in forward.
    We need a better way of masking.

    :param config:
    :param vocabulary:
    :param num_rounds:
    """
    super().__init__()
    self.config = config
    self.vocabulary = vocabulary
    self.mcan_config = Cfgs()
    # ans embedding size
    self.image_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])
    self.use_hist = config.get("use_hist", False)
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.word_embed_size_for_rnn = config["word_embedding_size"]
    self.ques_rnn = nn.LSTM(
        self.word_embed_size_for_rnn, config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True,
    )  # SA: removing dropout for mcan
    self.dropout = nn.Dropout(p=config["dropout"])

    # questions and history are right padded sequences of variable length
    # use the DynamicRNN utility module to handle them properly
    if self.use_hist:
        self.hist_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])
        self.hist_rnn = nn.LSTM(
            self.word_embed_size_for_rnn, config["lstm_hidden_size"],
            config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
        )
        self.hist_rnn = DynamicRNN(self.hist_rnn)

    self.vqa_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])
    self.mask_prev_rounds_ = self.mask_prev_rounds(
        num_rounds=num_rounds, emb_size=config["lstm_hidden_size"]
    )

    self.ques_rnn = DynamicRNN(self.ques_rnn)

    # project image features to lstm_hidden_size for computing attention
    self.image_features_projection = nn.Linear(
        config["img_feature_size"], config["lstm_hidden_size"]
    )

    fusion_size = config["lstm_hidden_size"] * 2
    self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

    nn.init.kaiming_uniform_(self.image_features_projection.weight)
    nn.init.constant_(self.image_features_projection.bias, 0)
    nn.init.kaiming_uniform_(self.fusion.weight)
    nn.init.constant_(self.fusion.bias, 0)
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    # if config["fix_word_embedding"] == True:
    #     self.word_embed.weight.requires_grad = False
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True,
        dropout=config["dropout"], bidirectional=True,
    )
    self.ques_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True,
        dropout=config["dropout"], bidirectional=True,
    )

    # questions and history are right padded sequences of variable length
    # use the DynamicRNN utility module to handle them properly
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)

    # img_feature_size = config["img_feature_size"] + config["img_loc_size"]
    img_feature_size = config["img_feature_size"]
    lstm_hidden_size = config["lstm_hidden_size"]
    word_embed_size = config["word_embedding_size"]
    self.img_feature_size = img_feature_size
    self.lstm_hidden_size = lstm_hidden_size
    self.word_embed_size = word_embed_size
    self.relu = nn.ReLU()

    # new: attention
    # embedding
    self.Wii = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(img_feature_size, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)
    self.Wqi = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(word_embed_size, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)
    self.Wq_fuse_g = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(word_embed_size, img_feature_size),
        nn.Sigmoid())
    self.Wqq_ans = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)
    self.Wqq_ref = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)
    self.Wqq_inf = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(word_embed_size, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)
    self.Whh_ref = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)
    self.Wqh_ref = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
        nn.BatchNorm1d(lstm_hidden_size), self.relu)

    # attention
    self.Wia = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]), nn.Linear(lstm_hidden_size, 1))
    self.Wqa_ans = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]), nn.Linear(lstm_hidden_size, 1))
    self.Wqa_ref = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]), nn.Linear(lstm_hidden_size, 1))
    self.Wha_ans = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]), nn.Linear(lstm_hidden_size, 1))
    self.Wha_ref = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size, 1))
    self.Wh_ref = nn.Linear(2, 1)

    # referring to history
    self.Wq_inf = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]), nn.Linear(lstm_hidden_size, 2))

    # fusion
    self.fusion_v = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(img_feature_size, lstm_hidden_size * 2),
        nn.BatchNorm1d(lstm_hidden_size * 2), self.relu)
    self.fusion_q = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, lstm_hidden_size * 2),
        nn.BatchNorm1d(lstm_hidden_size * 2), self.relu)
    self.fusion = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(
            img_feature_size + lstm_hidden_size * 2 + lstm_hidden_size * 2,
            lstm_hidden_size * config["ans_cls_num"]))
    self.fusion_cls = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(lstm_hidden_size * 2, config["ans_cls_num"]))

    # other useful functions
    self.softmax = nn.Softmax(dim=-1)
    # self.G_softmax = F.gumbel_softmax()

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight.data)
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)
def __init__(self, config, vocabulary):
    super().__init__()
    self.config = config
    self.dropout = config["dropout"]  # note: reassigned to an nn.Dropout module below
    self.nhid = config["lstm_hidden_size"]
    self.img_feature_size = config["img_feature_size"]
    self.ninp = config["word_embedding_size"]
    self.head_num = config["head_num"]
    self.word_embed = nn.Embedding(
        len(vocabulary), config["word_embedding_size"],
        padding_idx=vocabulary.PAD_INDEX,
    )
    self.hist_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.ques_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.cap_rnn = nn.LSTM(
        config["word_embedding_size"], config["lstm_hidden_size"],
        config["lstm_num_layers"], batch_first=True, dropout=config["dropout"],
    )
    self.dropout = nn.Dropout(p=config["dropout_fc"])
    self.hist_rnn = DynamicRNN(self.hist_rnn)
    self.ques_rnn = DynamicRNN(self.ques_rnn)
    self.cap_rnn = DynamicRNN(self.cap_rnn)

    # question + caption attention on image
    self.Wq2 = nn.Sequential(self.dropout, nn.Linear(self.nhid * 2, self.nhid))
    self.Wi2 = nn.Sequential(self.dropout, nn.Linear(self.img_feature_size, self.nhid))
    self.Wall2 = nn.Linear(self.nhid, 1)

    # question attention on caption
    self.Wqs3 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wcs3 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wc3 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wall3 = nn.Linear(self.nhid, 1)
    self.c2c = nn.Sequential(self.dropout, nn.Linear(self.ninp, self.nhid))

    # caption attention on question
    self.Wqs5 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wcs5 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wq5 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wall5 = nn.Linear(self.nhid, 1)
    self.q2q = nn.Sequential(self.dropout, nn.Linear(self.ninp, self.nhid))

    # question attention on history
    self.Wq1 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wh1 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
    self.Wqh1 = nn.Linear(self.nhid, 1)

    # caption attention on image
    self.Wc4 = nn.Sequential(self.dropout, nn.Linear(self.nhid * 2, self.nhid))
    self.Wi4 = nn.Sequential(self.dropout, nn.Linear(self.img_feature_size, self.nhid))
    self.Wall4 = nn.Linear(self.nhid, 1)

    # fusion
    self.i2i = nn.Sequential(self.dropout, nn.Linear(self.img_feature_size, self.nhid))
    self.fusion_1 = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid * 2 + self.img_feature_size + self.nhid, self.nhid),
        nn.LeakyReLU())
    self.fusion_2 = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid * 2 + self.img_feature_size + self.nhid, self.nhid),
        nn.LeakyReLU())
    self.fusion_3 = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid * 2 + self.img_feature_size + self.nhid, self.nhid),
        nn.LeakyReLU())

    self.q_ref = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid * 2, self.nhid), nn.LeakyReLU(),
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid, 2), nn.LeakyReLU())
    self.q_multi = nn.Sequential(
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid * 2, self.nhid), nn.LeakyReLU(),
        nn.Dropout(p=config["dropout_fc"]),
        nn.Linear(self.nhid, 3), nn.LeakyReLU())

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight.data)
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)