Example #1
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(
            len(vocabulary), config["word_embedding_size"], padding_idx=vocabulary.PAD_INDEX
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"]
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"]
        )
        self.dropout = nn.Dropout(p=config["dropout"])

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # fusion layer
        fusion_size = config["img_feature_size"] + config["lstm_hidden_size"] * 2
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
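
Every example below wraps its LSTMs in DynamicRNN. The utility itself is not part of this listing; here is a minimal sketch of what it typically does, assuming it packs the right-padded batches before running the wrapped RNN (actual return values and sorting details may differ per project):

from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class DynamicRNN(nn.Module):
    """Run a batch_first RNN over right-padded variable-length sequences."""

    def __init__(self, rnn):
        super().__init__()
        self.rnn = rnn

    def forward(self, seq_input, seq_lens, initial_state=None):
        # packing skips the padded timesteps; enforce_sorted=False lets
        # PyTorch sort/unsort the batch by length internally
        packed = pack_padded_sequence(
            seq_input, seq_lens.cpu(), batch_first=True, enforce_sorted=False
        )
        outputs, (h_n, c_n) = self.rnn(packed, initial_state)
        # restore a right-padded (batch, max_len, hidden) tensor
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)
        return outputs, (h_n, c_n)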
Example #2
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config
        self.dropout = config['dropout']
        self.nhid = config['lstm_hidden_size']
        self.img_feature_size = config['img_feature_size']
        self.ninp = config['word_embedding_size']
        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.dropout = nn.Dropout(p=config["dropout_fc"])
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # question & caption attention on image
        self.Wq2 = nn.Linear(self.nhid, self.nhid)
        self.Wh2 = nn.Linear(self.nhid, self.nhid)
        self.Wi2 = nn.Linear(self.img_feature_size, self.nhid)
        self.Wall2 = nn.Linear(self.nhid, 1)

        # fusion
        self.Wq3 = nn.Linear(self.nhid, self.nhid)
        self.Wc3 = nn.Linear(self.nhid, self.nhid)
        self.fusion = nn.Linear(self.nhid * 2 + self.img_feature_size,
                                self.nhid)
        # caption attention on image
        self.Wc4 = nn.Linear(self.nhid, self.nhid)
        self.Wi4 = nn.Linear(self.img_feature_size, self.nhid)
        self.Wall4 = nn.Linear(self.nhid, 1)

        self.q_multi1 = nn.Linear(self.nhid, self.nhid)
        self.q_multi2 = nn.Linear(self.nhid, 3)

        # question attention on history
        self.Wq1 = nn.Linear(self.nhid, self.nhid)
        self.Wh1 = nn.Linear(self.nhid, self.nhid)
        self.Wqh1 = nn.Linear(self.nhid, 1)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    nn.init.constant_(m.bias.data, 0)
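
The Wq2/Wh2/Wi2/Wall2 projections above are the usual pieces of additive attention over image regions. The forward pass is not part of this listing; a hedged sketch of how such projections are typically combined:

import torch
import torch.nn.functional as F

def attend_image(ques_feat, hist_feat, img_feat, Wq2, Wh2, Wi2, Wall2):
    # ques_feat, hist_feat: (batch, nhid)
    # img_feat: (batch, regions, img_feature_size)
    joint = torch.tanh(
        Wq2(ques_feat).unsqueeze(1)
        + Wh2(hist_feat).unsqueeze(1)
        + Wi2(img_feat)
    )                                                      # (batch, regions, nhid)
    weights = F.softmax(Wall2(joint).squeeze(-1), dim=-1)  # (batch, regions)
    attended = torch.bmm(weights.unsqueeze(1), img_feat).squeeze(1)
    return attended                                        # (batch, img_feature_size)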
Example #3
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config
        self.vocabulary = vocabulary

        self.mcan_config = Cfgs()
        # ans embedding size
        # self.image_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])

        self.use_hist = config.get("use_hist", False)

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.word_embed_size_for_rnn = config["word_embedding_size"]

        self.ques_rnn = nn.LSTM(
            self.word_embed_size_for_rnn,
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True)
        # SA: removing dropout for mcan

        self.dropout = nn.Dropout(p=config["dropout"])

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly

        if self.use_hist:
            self.hist_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])

            self.hist_rnn = nn.LSTM(
                self.word_embed_size_for_rnn,
                config["lstm_hidden_size"],
                config["lstm_num_layers"],
                batch_first=True,
                dropout=config["dropout"]
            )
            self.hist_rnn = DynamicRNN(self.hist_rnn)


        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # project image features to lstm_hidden_size for computing attention
        self.image_features_projection = nn.Linear(
            config["img_feature_size"], config["lstm_hidden_size"]
        )

        fusion_size = (
            config["lstm_hidden_size"] * 2
        )
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        nn.init.kaiming_uniform_(self.image_features_projection.weight)
        nn.init.constant_(self.image_features_projection.bias, 0)
        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
Example #4
File: rva.py  Project: yuleiniu/rva
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(
            len(vocabulary), 
            config["word_embedding_size"], 
            padding_idx=vocabulary.PAD_INDEX
        )

        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"], 
            bidirectional=True
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"], 
            bidirectional=True
        )        
        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)
        
        # self attention for question
        self.Q_ATT_ans = Q_ATT(config)
        self.Q_ATT_ref = Q_ATT(config)
        # question-based history attention
        self.H_ATT_ans = H_ATT(config)

        # modules
        self.RvA_MODULE = RvA_MODULE(config)
        self.V_Filter = V_Filter(config)

        # fusion layer
        self.fusion = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(
                config["img_feature_size"] + config["word_embedding_size"] + config["lstm_hidden_size"] * 2, 
                config["lstm_hidden_size"]
            )
        )
        # other useful functions
        self.softmax = nn.Softmax(dim=-1)

        # initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    nn.init.constant_(m.bias.data, 0)
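
Note the bidirectional=True flags: a bidirectional LSTM emits 2 * lstm_hidden_size features per timestep, which is why the fusion layer above (and similar layers in later examples) sizes its input with lstm_hidden_size * 2. A quick standalone check:

import torch
from torch import nn

rnn = nn.LSTM(300, 512, 2, batch_first=True, bidirectional=True)
out, (h, c) = rnn(torch.randn(4, 20, 300))
print(out.shape)  # torch.Size([4, 20, 1024]) -- 2 * hidden_size
print(h.shape)    # torch.Size([4, 4, 512])   -- num_layers * num_directions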
Example #5
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        use_glove = config.get('use_glove', False)
        print("encoder use_glove:{}".format(use_glove))
        if not use_glove:
            self.word_embed = nn.Embedding(
                len(vocabulary),
                config["word_embedding_size"],
                padding_idx=vocabulary.PAD_INDEX,
            )
        else:
            self.word_embed = nn.Embedding.from_pretrained(
                vocabulary.get_vocab_emb_tensors())

        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.dropout = nn.Dropout(p=config["dropout"])

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # project image features to lstm_hidden_size for computing attention
        self.image_features_projection = nn.Linear(config["img_feature_size"],
                                                   config["lstm_hidden_size"])

        # fc layer for image * question to attention weights
        self.attention_proj = nn.Linear(config["lstm_hidden_size"], 1)

        # fusion layer (attended_image_features + question + history)
        fusion_size = (config["img_feature_size"] +
                       config["lstm_hidden_size"] * 2)
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        nn.init.kaiming_uniform_(self.image_features_projection.weight)
        nn.init.constant_(self.image_features_projection.bias, 0)
        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
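
The use_glove branch assumes vocabulary.get_vocab_emb_tensors() returns a (vocab_size, word_embedding_size) float tensor of pretrained vectors. A hypothetical illustration of that contract:

import torch
from torch import nn

# hypothetical: row i holds the GloVe vector for vocabulary index i
glove_matrix = torch.randn(11000, 300)

# freeze=True (the default) keeps the vectors fixed; freeze=False fine-tunes them
word_embed = nn.Embedding.from_pretrained(glove_matrix, freeze=True)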
Example #6
    def __init__(self, config, vocabulary, glove, elmo):
        super().__init__()
        self.config = config

        self.glove_embed = nn.Embedding(len(vocabulary),
                                        config["glove_embedding_size"])
        self.elmo_embed = nn.Embedding(len(vocabulary),
                                       config["elmo_embedding_size"])
        self.glove_embed.weight.data = glove
        self.elmo_embed.weight.data = elmo
        #self.glove_embed.weight.requires_grad = False
        self.elmo_embed.weight.requires_grad = False
        self.embed_change = nn.Linear(config["elmo_embedding_size"],
                                      config["word_embedding_size"])

        self.hist_rnn = nn.LSTM(config["glove_embedding_size"] +
                                config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_num_layers"],
                                batch_first=True,
                                dropout=config["dropout"])
        self.ques_rnn = nn.LSTM(config["glove_embedding_size"] +
                                config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_num_layers"],
                                batch_first=True,
                                dropout=config["dropout"])
        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        self.semantic_module = Semantic(config)
        self.visual_module = Visual(config)

        self.img_cap_layer = nn.Linear(
            config["img_feature_size"] + config["captionsize_todecoder"],
            config["img_feature_size"] + config["captionsize_todecoder"])

        self.MemNetFusion = MemNet(config)

        self.dropout = nn.Dropout(p=config["dropout"])

        # initialization

        # NOTE: self.fusion is initialized here but never defined in this
        # __init__; as written this raises AttributeError unless self.fusion
        # is created elsewhere (e.g. in a subclass).
        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)

        nn.init.kaiming_uniform_(self.embed_change.weight)
        nn.init.constant_(self.embed_change.bias, 0)

        nn.init.kaiming_uniform_(self.img_cap_layer.weight)
        nn.init.constant_(self.img_cap_layer.bias, 0)
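
Both RNNs above take glove_embedding_size + word_embedding_size inputs, so the two embeddings are presumably concatenated after embed_change projects the frozen ELMo vectors down to word_embedding_size. A hedged sketch of that embedding step:

import torch

def embed_tokens(tokens, glove_embed, elmo_embed, embed_change):
    # tokens: (batch, seq_len) integer indices
    glove_vecs = glove_embed(tokens)              # (batch, seq_len, glove_size)
    elmo_vecs = embed_change(elmo_embed(tokens))  # (batch, seq_len, word_embedding_size)
    return torch.cat([glove_vecs, elmo_vecs], dim=-1)  # matches the LSTM input size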
Example #7
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.dropout = nn.Dropout(p=config["dropout"])

        # self.word_embed = nn.Embedding(len(vocab), config["word_embedding_size"], padding_idx=vocab.PAD_INDEX)
        weights = get_pretrained_weights(vocabulary)
        self.word_embed = nn.Embedding.from_pretrained(weights)

        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.ques_rnn = DynamicRNN(self.ques_rnn)
        self.hist_rnn = DynamicRNN(self.hist_rnn)

        self.image_features_projection = nn.Linear(config["img_feature_size"],
                                                   config["lstm_hidden_size"])
        self.attention_proj = nn.Linear(config["lstm_hidden_size"], 1)

        fusion_size = (config["img_feature_size"] +
                       config["lstm_hidden_size"] * 2)
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        nn.init.kaiming_uniform_(self.image_features_projection.weight)
        nn.init.constant_(self.image_features_projection.bias, 0)
        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
Example #8
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.dropout = nn.Dropout(p=config["dropout"])

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # project image features to lstm_hidden_size for computing attention
        self.image_features_projection = nn.Linear(config["img_feature_size"],
                                                   config["lstm_hidden_size"])

        # fc layer for image * question to attention weights
        self.attention_proj = nn.Linear(config["lstm_hidden_size"], 1)

        # fusion layer (attended_image_features + question + history)
        fusion_size = (config["img_feature_size"] +
                       config["lstm_hidden_size"] * 2)
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        # block fusion
        opt = {}
        opt['type'] = 'block'
        opt['input_dims'] = [2048, 512]
        opt['output_dim'] = 2560
        self.fusionb = factory_fusion(opt)

        nn.init.kaiming_uniform_(self.image_features_projection.weight)
        nn.init.constant_(self.image_features_projection.bias, 0)
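
factory_fusion appears to come from the block.bootstrap fusion library, whose fusion modules are called on a list of inputs matching input_dims. A hedged usage sketch (encoder is a hypothetical instance of this class):

import torch

# built with opt['input_dims'] = [2048, 512] and opt['output_dim'] = 2560
img_feat = torch.randn(8, 2048)   # e.g. pooled image features
ques_feat = torch.randn(8, 512)   # e.g. question encoding
fused = encoder.fusionb([img_feat, ques_feat])  # -> (8, 2560)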
Example #9
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(len(vocabulary),
                                       config["word_embedding_size"],
                                       padding_idx=vocabulary.PAD_INDEX)

        self.hist_rnn = nn.LSTM(config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_enc_num_layers"],
                                batch_first=True,
                                dropout=config["lstm_dropout"],
                                bidirectional=True)

        self.ques_rnn = nn.LSTM(config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_enc_num_layers"],
                                batch_first=True,
                                dropout=config["lstm_dropout"],
                                bidirectional=True)

        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        self.util1 = UtilityLayer(hidden_dim=config["lstm_hidden_size"],
                                  feedforward_dim=2048,
                                  nhead=8,
                                  dropout=0.1)
        self.util2 = UtilityLayer(hidden_dim=config["lstm_hidden_size"],
                                  feedforward_dim=2048,
                                  nhead=8,
                                  dropout=0.1)
        self.summary_attn = SummaryAttn(dim=config["lstm_hidden_size"],
                                        num_attn=3,
                                        dropout=config["model_dropout"])
        self.context_fusion = nn.Linear(2 * config["lstm_hidden_size"],
                                        config["lstm_hidden_size"])

        self.v_proj = nn.Linear(config["img_feature_size"],
                                config["lstm_hidden_size"])

        self.j_proj = nn.Linear(config["lstm_hidden_size"],
                                config["lstm_hidden_size"])

        self.q_proj = nn.Linear(config["lstm_hidden_size"] * 2,
                                config["lstm_hidden_size"])

        self.h_proj = nn.Linear(config["lstm_hidden_size"] * 2,
                                config["lstm_hidden_size"])
Example #10
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )

        self.dropout = nn.Dropout(p=config["dropout"])

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # fc layer for image * question to attention weights
        self.Wk = nn.Linear(config["lstm_hidden_size"],
                            config["lstm_hidden_size"])
        self.Wq = nn.Linear(config["lstm_hidden_size"],
                            config["lstm_hidden_size"])
        self.Wv = nn.Linear(config["lstm_hidden_size"],
                            config["lstm_hidden_size"])

        self.project_attention = nn.Linear(config["lstm_hidden_size"], 1)
        # fusion layer (attended_image_features + question + history)
        fusion_size = (config["lstm_hidden_size"] * 2)
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        #nn.init.kaiming_uniform_(self.image_features_projection.weight)
        #nn.init.constant_(self.image_features_projection.bias, 0)
        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
Example #11
    def __init__(self, config, vocabulary, bert_model=None, stage=1):
        super().__init__()
        self.config = config

        if config['word_embedding_type'] == 'glove':
            self.word_embed = nn.Embedding.from_pretrained(vocabulary.get_vocab_emb_tensors())
        elif config['word_embedding_type'] == 'bert':
            self.word_embed = BertRefEmbedding(bert_model)
        else:
            self.word_embed = nn.Embedding(
                len(vocabulary),
                config["word_embedding_size"],
                padding_idx=vocabulary.PAD_INDEX,
            )

        self.option_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["decoder_lstm_hidden_size"],
            num_layers=config["decoder_lstm_num_layers"],
            batch_first=True,
            dropout=config["decoder_lstm_dropout"],
        )

        # Options are variable length padded sequences, use DynamicRNN.
        self.option_rnn = DynamicRNN(self.option_rnn)
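
A discriminative decoder like this one typically scores each answer option by a dot product between the dialog encoding and the option's LSTM encoding. The forward pass is not shown in this listing; a hedged sketch of that scoring step:

import torch

def score_options(encoder_output, option_encodings):
    # encoder_output: (batch, rounds, hidden)
    # option_encodings: (batch, rounds, num_options, hidden)
    scores = torch.matmul(option_encodings, encoder_output.unsqueeze(-1))
    return scores.squeeze(-1)  # (batch, rounds, num_options)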
Example #12
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config
        self.nhid = config["lstm_hidden_size"]

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.option_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.a2a = nn.Linear(self.nhid * 2, self.nhid)
        self.option_rnn = DynamicRNN(self.option_rnn)
        with open("data/qt_scores.json", "r") as f:
            self.count_dict = json.load(f)
        with open("data/qt_count.json", "r") as f:
            self.qt_file = json.load(f)
        self.qt_list = list(self.qt_file.keys())
Example #13
    def __init__(self, config, vocabulary, glove, elmo):
        super().__init__()
        self.config = config

        self.glove_embed = nn.Embedding(
            len(vocabulary), config["glove_embedding_size"]
        )
        self.elmo_embed = nn.Embedding(
            len(vocabulary), config["elmo_embedding_size"]
        )
        self.glove_embed.weight.data = glove
        self.elmo_embed.weight.data = elmo
        #self.glove_embed.weight.requires_grad = False
        self.elmo_embed.weight.requires_grad = False
        self.embed_change = nn.Linear(
            config["elmo_embedding_size"], config["word_embedding_size"]
        )


        self.option_rnn = nn.LSTM(config["glove_embedding_size"] + config["word_embedding_size"],
                                  config["lstm_hidden_size"],
                                  config["lstm_num_layers"],
                                  batch_first=True,
                                  dropout=config["dropout"])
        self.option_rnn = DynamicRNN(self.option_rnn)

        self.dropout = nn.Dropout(p=config["dropout"])
Example #14
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config
        self.dropout = config['dropout']
        self.nhid = config['lstm_hidden_size']
        self.img_feature_size = config['img_feature_size']
        self.ninp = config['word_embedding_size']

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.option_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )

        self.dropout = nn.Dropout(p=config["dropout_fc"])
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.option_rnn = DynamicRNN(self.option_rnn)
        self.Wc = nn.Linear(self.nhid * 2, self.nhid)
        self.Wd = nn.Linear(self.nhid, self.nhid)
        self.Wall = nn.Linear(self.nhid, 1)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    nn.init.constant_(m.bias.data, 0)
        initial_path = 'data/100ans_feature.npy'
        initial_answer_feat = np.load(initial_path)
        self.user_dict = nn.Parameter(torch.FloatTensor(initial_answer_feat))
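
Wrapping the loaded answer features in nn.Parameter registers them as model weights, so this "user dict" is updated by the optimizer along with everything else. A minimal illustration with hypothetical shapes:

import numpy as np
import torch
from torch import nn

feats = np.random.randn(100, 512).astype(np.float32)  # hypothetical (100, nhid)
user_dict = nn.Parameter(torch.from_numpy(feats))
print(user_dict.requires_grad)  # True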
Example #15
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(len(vocabulary),
                                       config["word_embedding_size"],
                                       padding_idx=vocabulary.PAD_INDEX)
        self.option_rnn = nn.LSTM(config["word_embedding_size"],
                                  config["lstm_hidden_size"],
                                  batch_first=True)

        # options are variable length padded sequences, use DynamicRNN
        self.option_rnn = DynamicRNN(self.option_rnn)
Example #16
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(len(vocabulary),
                                       config["word_embedding_size"],
                                       padding_idx=vocabulary.PAD_INDEX)
        self.hist_rnn = nn.LSTM(config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_enc_num_layers"],
                                batch_first=True,
                                dropout=config["lstm_dropout"],
                                bidirectional=True)
        self.ques_rnn = nn.LSTM(config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_enc_num_layers"],
                                batch_first=True,
                                dropout=config["lstm_dropout"],
                                bidirectional=True)
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)
        self.backbone = BackboneNetwork(dropout=config["model_dropout"])
        self.v_proj = weight_norm(nn.Linear(config["img_feature_size"],
                                            config["lstm_hidden_size"]),
                                  dim=None)

        self.j_proj = weight_norm(nn.Linear(config["lstm_hidden_size"],
                                            config["lstm_hidden_size"]),
                                  dim=None)

        self.q_proj = weight_norm(nn.Linear(config["lstm_hidden_size"] * 2,
                                            config["lstm_hidden_size"]),
                                  dim=None)

        self.h_proj = weight_norm(nn.Linear(config["lstm_hidden_size"] * 2,
                                            config["lstm_hidden_size"]),
                                  dim=None)
Example #17
    def __init__(self, config, vocab):
        super().__init__()
        self.config = config
        # Getting pretrained weights
        weights = get_pretrained_weights(vocab)
        self.word_embed = nn.Embedding.from_pretrained(weights)

        # Discriminative decoder generates scores for each option
        self.option_rnn = nn.LSTM(config["word_embedding_size"],
                                  config["lstm_hidden_size"],
                                  config["lstm_num_layers"],
                                  batch_first=True,
                                  dropout=config["dropout"])

        self.option_rnn = DynamicRNN(self.option_rnn)
Example #18
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.option_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        # self.a2a = nn.Linear(self.nhid * 2, self.nhid)  # this is useless in this version
        # Options are variable length padded sequences, use DynamicRNN.
        self.option_rnn = DynamicRNN(self.option_rnn)
Example #19
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.cap_rnn = nn.LSTM(config["glove_embedding_size"] +
                               config["word_embedding_size"],
                               config["lstm_hidden_size"],
                               config["lstm_num_layers"],
                               batch_first=True,
                               dropout=config["dropout"])

        self.cap_rnn = DynamicRNN(self.cap_rnn)

        self.ques_push = nn.Linear(config["lstm_hidden_size"],
                                   config["ques_change_num"])

        self.caption_push = nn.Linear(config["lstm_hidden_size"],
                                      config["caption_change_num"])

        self.caption_gate = nn.Linear(int(2 * config["lstm_hidden_size"]),
                                      int(2 * config["lstm_hidden_size"]))

        self.caption_dim_change = nn.Linear(
            int(2 * config["lstm_hidden_size"]), config["lstm_hidden_size"])

        self.dropout = nn.Dropout(p=config["dropout"])

        nn.init.kaiming_uniform_(self.ques_push.weight)
        nn.init.constant_(self.ques_push.bias, 0)

        nn.init.kaiming_uniform_(self.caption_push.weight)
        nn.init.constant_(self.caption_push.bias, 0)

        nn.init.kaiming_uniform_(self.caption_gate.weight)
        nn.init.constant_(self.caption_gate.bias, 0)

        nn.init.kaiming_uniform_(self.caption_dim_change.weight)
        nn.init.constant_(self.caption_dim_change.bias, 0)
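
caption_gate and caption_dim_change above suggest a sigmoid-gated fusion of a 2 * lstm_hidden_size question/caption concatenation. The forward pass is not in this listing; a hedged sketch of how such a gate is usually applied:

import torch

def gate_caption(ques_feat, cap_feat, caption_gate, caption_dim_change):
    # ques_feat, cap_feat: (batch, lstm_hidden_size)
    joint = torch.cat([ques_feat, cap_feat], dim=-1)    # (batch, 2 * hidden)
    gated = torch.sigmoid(caption_gate(joint)) * joint  # elementwise gate
    return caption_dim_change(gated)                    # (batch, hidden)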
Example #20
    def __init__(self, config, vocabulary, num_rounds=10):
        """
        SA: TODO num_rounds is hardcoded to 10 for now.
        The mask is defined in __init__ for speed, since it is static;
        ideally it would be built in forward. A better approach to
        masking is needed.

        :param config:
        :param vocabulary:
        :param num_rounds:
        """
        super().__init__()
        self.config = config
        self.vocabulary = vocabulary

        self.mcan_config = Cfgs()
        # ans embedding size
        self.image_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])

        self.use_hist = config.get("use_hist", False)

        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.word_embed_size_for_rnn = config["word_embedding_size"]

        self.ques_rnn = nn.LSTM(
            self.word_embed_size_for_rnn,
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True)
        # SA: removing dropout for mcan

        self.dropout = nn.Dropout(p=config["dropout"])

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly

        if self.use_hist:
            self.hist_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])

            self.hist_rnn = nn.LSTM(
                self.word_embed_size_for_rnn,
                config["lstm_hidden_size"],
                config["lstm_num_layers"],
                batch_first=True,
                dropout=config["dropout"]
            )
            self.hist_rnn = DynamicRNN(self.hist_rnn)


            self.vqa_MCAN_Net = MCAN_Net(self.mcan_config, answer_size=config["lstm_hidden_size"])

            self.mask_prev_rounds_ = self.mask_prev_rounds(num_rounds=num_rounds,
                                                           emb_size=config["lstm_hidden_size"])


        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # project image features to lstm_hidden_size for computing attention
        self.image_features_projection = nn.Linear(
            config["img_feature_size"], config["lstm_hidden_size"]
        )

        fusion_size = (
            config["lstm_hidden_size"] * 2
        )
        self.fusion = nn.Linear(fusion_size, config["lstm_hidden_size"])

        nn.init.kaiming_uniform_(self.image_features_projection.weight)
        nn.init.constant_(self.image_features_projection.bias, 0)
        nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
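
mask_prev_rounds, built once in __init__ per the docstring, presumably yields a lower-triangular mask so round t attends only to rounds 1..t of the history. A hedged sketch of such a mask:

import torch

def mask_prev_rounds(num_rounds=10, emb_size=512):
    # (num_rounds, num_rounds, emb_size): row t keeps rounds <= t
    mask = torch.tril(torch.ones(num_rounds, num_rounds))
    return mask.unsqueeze(-1).expand(-1, -1, emb_size)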
Example #21
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config

        self.word_embed = nn.Embedding(len(vocabulary),
                                       config["word_embedding_size"],
                                       padding_idx=vocabulary.PAD_INDEX)
        # if config["fix_word_embedding"] == True:
        #     self.word_embed.weight.requires_grad = False

        self.hist_rnn = nn.LSTM(config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_num_layers"],
                                batch_first=True,
                                dropout=config["dropout"],
                                bidirectional=True)
        self.ques_rnn = nn.LSTM(config["word_embedding_size"],
                                config["lstm_hidden_size"],
                                config["lstm_num_layers"],
                                batch_first=True,
                                dropout=config["dropout"],
                                bidirectional=True)
        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # img_feature_size = config["img_feature_size"] + config["img_loc_size"]
        img_feature_size = config["img_feature_size"]
        lstm_hidden_size = config["lstm_hidden_size"]
        word_embed_size = config["word_embedding_size"]
        self.img_feature_size = img_feature_size
        self.lstm_hidden_size = lstm_hidden_size
        self.word_embed_size = word_embed_size
        self.relu = nn.ReLU()

        # new: attention
        # embedding
        self.Wii = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                 nn.Linear(img_feature_size, lstm_hidden_size),
                                 nn.BatchNorm1d(lstm_hidden_size), self.relu)

        self.Wqi = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                 nn.Linear(word_embed_size, lstm_hidden_size),
                                 nn.BatchNorm1d(lstm_hidden_size), self.relu)

        self.Wq_fuse_g = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(word_embed_size, img_feature_size), nn.Sigmoid())

        self.Wqq_ans = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
            nn.BatchNorm1d(lstm_hidden_size), self.relu)

        self.Wqq_ref = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
            nn.BatchNorm1d(lstm_hidden_size), self.relu)

        self.Wqq_inf = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(word_embed_size, lstm_hidden_size),
            nn.BatchNorm1d(lstm_hidden_size), self.relu)

        self.Whh_ref = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
            nn.BatchNorm1d(lstm_hidden_size), self.relu)

        self.Wqh_ref = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
            nn.BatchNorm1d(lstm_hidden_size), self.relu)

        # attention
        self.Wia = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                 nn.Linear(lstm_hidden_size, 1))
        self.Wqa_ans = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                     nn.Linear(lstm_hidden_size, 1))
        self.Wqa_ref = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                     nn.Linear(lstm_hidden_size, 1))
        self.Wha_ans = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                     nn.Linear(lstm_hidden_size, 1))
        self.Wha_ref = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, lstm_hidden_size),
            nn.Dropout(p=config["dropout_fc"]), nn.Linear(lstm_hidden_size, 1))
        self.Wh_ref = nn.Linear(2, 1)

        # referring to history
        self.Wq_inf = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                    nn.Linear(lstm_hidden_size, 2))
        # fusion
        self.fusion_v = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(img_feature_size, lstm_hidden_size * 2),
            nn.BatchNorm1d(lstm_hidden_size * 2), self.relu)
        self.fusion_q = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, lstm_hidden_size * 2),
            nn.BatchNorm1d(lstm_hidden_size * 2), self.relu)
        self.fusion = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(
                img_feature_size + lstm_hidden_size * 2 + lstm_hidden_size * 2,
                lstm_hidden_size * config["ans_cls_num"]))
        self.fusion_cls = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(lstm_hidden_size * 2, config["ans_cls_num"]))
        # other useful functions
        self.softmax = nn.Softmax(dim=-1)
        # F.gumbel_softmax is a function, not a module, so it cannot be
        # instantiated here; it would be applied directly in forward:
        # self.G_softmax = F.gumbel_softmax()

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    nn.init.constant_(m.bias.data, 0)
Example #22
    def __init__(self, config, vocabulary):
        super().__init__()
        self.config = config
        self.dropout = config['dropout']
        self.nhid = config['lstm_hidden_size']
        self.img_feature_size = config['img_feature_size']
        self.ninp = config['word_embedding_size']
        self.head_num = config['head_num']
        self.word_embed = nn.Embedding(
            len(vocabulary),
            config["word_embedding_size"],
            padding_idx=vocabulary.PAD_INDEX,
        )
        self.hist_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.ques_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.cap_rnn = nn.LSTM(
            config["word_embedding_size"],
            config["lstm_hidden_size"],
            config["lstm_num_layers"],
            batch_first=True,
            dropout=config["dropout"],
        )
        self.dropout = nn.Dropout(p=config["dropout_fc"])
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)
        self.cap_rnn = DynamicRNN(self.cap_rnn)

        # question & caption attention on image
        self.Wq2 = nn.Sequential(self.dropout,
                                 nn.Linear(self.nhid * 2, self.nhid))
        self.Wi2 = nn.Sequential(self.dropout,
                                 nn.Linear(self.img_feature_size, self.nhid))
        self.Wall2 = nn.Linear(self.nhid, 1)

        # question attention on caption
        self.Wqs3 = nn.Sequential(self.dropout,
                                  nn.Linear(self.nhid, self.nhid))
        self.Wcs3 = nn.Sequential(self.dropout,
                                  nn.Linear(self.nhid, self.nhid))
        self.Wc3 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
        self.Wall3 = nn.Linear(self.nhid, 1)
        self.c2c = nn.Sequential(self.dropout, nn.Linear(self.ninp, self.nhid))

        # caption attention on question
        self.Wqs5 = nn.Sequential(self.dropout,
                                  nn.Linear(self.nhid, self.nhid))
        self.Wcs5 = nn.Sequential(self.dropout,
                                  nn.Linear(self.nhid, self.nhid))
        self.Wq5 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
        self.Wall5 = nn.Linear(self.nhid, 1)
        self.q2q = nn.Sequential(self.dropout, nn.Linear(self.ninp, self.nhid))
        # question attention on history
        self.Wq1 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
        self.Wh1 = nn.Sequential(self.dropout, nn.Linear(self.nhid, self.nhid))
        self.Wqh1 = nn.Linear(self.nhid, 1)
        # caption attention on image
        self.Wc4 = nn.Sequential(self.dropout,
                                 nn.Linear(self.nhid * 2, self.nhid))
        self.Wi4 = nn.Sequential(self.dropout,
                                 nn.Linear(self.img_feature_size, self.nhid))
        self.Wall4 = nn.Linear(self.nhid, 1)
        # fusion
        self.i2i = nn.Sequential(self.dropout,
                                 nn.Linear(self.img_feature_size, self.nhid))
        self.fusion_1 = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(self.nhid * 2 + self.img_feature_size + self.nhid,
                      self.nhid), nn.LeakyReLU())
        self.fusion_2 = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(self.nhid * 2 + self.img_feature_size + self.nhid,
                      self.nhid), nn.LeakyReLU())
        self.fusion_3 = nn.Sequential(
            nn.Dropout(p=config["dropout_fc"]),
            nn.Linear(self.nhid * 2 + self.img_feature_size + self.nhid,
                      self.nhid), nn.LeakyReLU())
        self.q_ref = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                   nn.Linear(self.nhid * 2, self.nhid),
                                   nn.LeakyReLU(),
                                   nn.Dropout(p=config["dropout_fc"]),
                                   nn.Linear(self.nhid, 2), nn.LeakyReLU())
        self.q_multi = nn.Sequential(nn.Dropout(p=config["dropout_fc"]),
                                     nn.Linear(self.nhid * 2, self.nhid),
                                     nn.LeakyReLU(),
                                     nn.Dropout(p=config["dropout_fc"]),
                                     nn.Linear(self.nhid, 3), nn.LeakyReLU())
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    nn.init.constant_(m.bias.data, 0)
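
q_multi ends in a 3-way Linear, matching the three fusion branches fusion_1/fusion_2/fusion_3; a common pattern is to softmax those logits and take a weighted sum of the branch outputs. A hedged sketch of that combination:

import torch
import torch.nn.functional as F

def combine_branches(q_multi_logits, f1, f2, f3):
    # q_multi_logits: (batch, 3); f1, f2, f3: (batch, nhid)
    w = F.softmax(q_multi_logits, dim=-1)
    branches = torch.stack([f1, f2, f3], dim=1)     # (batch, 3, nhid)
    return (w.unsqueeze(-1) * branches).sum(dim=1)  # (batch, nhid)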