def __init__(self, args):
        """Build the encoder: word embedding plus three stacked LSTM stages.

        Args:
            args: namespace with vocab_size, embed_size, img_feature_size,
                rnn_hidden_size, num_layers and dropout.
        """
        super().__init__()
        self.args = args
        # FIX: removed stray debug `print(args.dropout)` left in the constructor.

        # Token embedding; index 0 embeds the padding token.
        self.word_embed = nn.Embedding(args.vocab_size,
                                       args.embed_size,
                                       padding_idx=0)

        # Encodes question word embeddings concatenated with image
        # features at each timestep.
        self.ques_img_rnn = nn.LSTM(args.embed_size + args.img_feature_size,
                                    args.rnn_hidden_size,
                                    args.num_layers,
                                    batch_first=True,
                                    dropout=args.dropout)

        # Encodes dialog-history token embeddings.
        self.hist_rnn = nn.LSTM(args.embed_size,
                                args.rnn_hidden_size,
                                args.num_layers,
                                batch_first=True,
                                dropout=args.dropout)

        # Input width is 2*hidden — presumably the concatenation of the
        # question+image and history encodings (TODO confirm in forward).
        self.dialog_rnn = nn.LSTM(args.rnn_hidden_size * 2,
                                  args.rnn_hidden_size,
                                  args.num_layers,
                                  batch_first=True,
                                  dropout=args.dropout)

        # Inputs are right-padded variable-length sequences; DynamicRNN
        # packs/unpacks them around the wrapped LSTM.
        self.ques_img_rnn = DynamicRNN(self.ques_img_rnn)
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.dialog_rnn = DynamicRNN(self.dialog_rnn)
    def __init__(self, args):
        """Build the late-fusion encoder: embeddings, two LSTMs, fusion layer.

        Args:
            args: namespace with vocab_size, embed_size, rnn_hidden_size,
                num_layers, dropout, img_feature_size and weight_init
                ('xavier' or 'kaiming').
        """
        super().__init__()
        self.args = args

        self.word_embed = nn.Embedding(args.vocab_size, args.embed_size, padding_idx=0)
        self.hist_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size, args.num_layers,
                                batch_first=True, dropout=args.dropout)
        self.ques_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size, args.num_layers,
                                batch_first=True, dropout=args.dropout)
        self.dropout = nn.Dropout(p=args.dropout)

        # questions and history are right padded sequences of variable length
        # use the DynamicRNN utility module to handle them properly
        self.hist_rnn = DynamicRNN(self.hist_rnn)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # fusion layer: image features + question and history encodings
        fusion_size = args.img_feature_size + args.rnn_hidden_size * 2
        self.fusion = nn.Linear(fusion_size, args.rnn_hidden_size)

        # FIX: nn.init.xavier_uniform / kaiming_uniform / constant are the
        # deprecated (removed in modern PyTorch) aliases — use the in-place
        # trailing-underscore variants, applied to the parameter directly
        # instead of via .data.
        if args.weight_init == 'xavier':
            nn.init.xavier_uniform_(self.fusion.weight)
        elif args.weight_init == 'kaiming':
            nn.init.kaiming_uniform_(self.fusion.weight)
        nn.init.constant_(self.fusion.bias, 0)
# --- Example #3 ---
    def __init__(self, args):
        """Build the attention model: embeddings, two DynamicRNN LSTMs,
        bottom-up attention and a stack of REFER layers."""
        super().__init__()
        self.args = args

        self.bn = nn.BatchNorm1d(args.hidden_size)
        # 300-d word vectors with zero embedding dropout.
        self.word_embed = WordEmbedding(args.vocab_size, 300, 0.0)

        # Two-layer LSTMs over sentence and history tokens; DynamicRNN
        # wraps each to handle variable-length padded sequences.
        self.sent_embed = DynamicRNN(
            nn.LSTM(args.embed_size, args.hidden_size, 2,
                    dropout=args.dropout, batch_first=True))
        self.hist_embed = DynamicRNN(
            nn.LSTM(args.embed_size, args.hidden_size, 2,
                    dropout=args.dropout, batch_first=True))

        # Bottom-up attention over 2048-d region features.
        self.bup_att = FIND(2048, 1024, 1024)

        self.q_net = FCNet([1024, 1024])
        self.v_net = FCNet([2048, 1024])
        self.linear = nn.Linear(args.hidden_size * 2, args.hidden_size)

        # Two stacked REFER layers with fixed hyper-parameters.
        self.layer_stack = nn.ModuleList(
            REFER(d_model=512, d_inner=1024, n_head=4,
                  d_k=256, d_v=256, dropout=0.2)
            for _ in range(2))
# --- Example #4 ---
    def __init__(self, args):
        """Build the spatial-attention encoder.

        Args:
            args: namespace with vocab_size, embed_size, rnn_hidden_size,
                num_layers and dropout; args.img_feature_size is overwritten
                with the (regions, channels) tuple below.
        """
        super().__init__()
        self.args = args
        # Spatial image features: 49 regions of 512 channels each
        # (presumably a 7x7 CNN grid — TODO confirm against the loader).
        img_num_features = 512
        img_size = 49
        self.args.img_feature_size = (img_size, img_num_features)

        self.word_embed = nn.Embedding(args.vocab_size,
                                       args.embed_size,
                                       padding_idx=0)

        # Question encoder; DynamicRNN handles right-padded sequences.
        self.ques_rnn = nn.LSTM(args.embed_size,
                                args.rnn_hidden_size,
                                args.num_layers,
                                batch_first=True,
                                dropout=args.dropout)
        self.ques_rnn = DynamicRNN(self.ques_rnn)

        # Per-round history question encoder.
        self.hist_q_rnn = nn.LSTM(args.embed_size,
                                  args.rnn_hidden_size,
                                  args.num_layers,
                                  batch_first=True,
                                  dropout=args.dropout)
        self.hist_q_rnn = DynamicRNN(self.hist_q_rnn)
        # NOTE(review): unlike the other encoders, hist_a_rnn and hist_rnn
        # are NOT wrapped in DynamicRNN — confirm this is intentional.
        self.hist_a_rnn = nn.LSTM(args.embed_size,
                                  args.rnn_hidden_size,
                                  args.num_layers,
                                  batch_first=True,
                                  dropout=args.dropout)

        # Fuses the per-round question/answer encodings.
        self.qa_fusion = nn.Linear(args.rnn_hidden_size * 2,
                                   args.rnn_hidden_size)
        self.hist_rnn = nn.LSTM(args.rnn_hidden_size,
                                args.rnn_hidden_size,
                                args.num_layers,
                                batch_first=True,
                                dropout=args.dropout)

        # Fuses question and history encodings.
        self.qh_fusion = nn.Linear(args.rnn_hidden_size * 2,
                                   args.rnn_hidden_size)

        self.c_projection = nn.Linear(args.rnn_hidden_size,
                                      args.rnn_hidden_size)
        self.f_projection = nn.Linear(img_num_features, args.rnn_hidden_size)

        # Transpose + Matrix Multiplication Stuff in Forward
        # Softmax stuff in forward (nn.softmax)
        # Elementwise multiplication to get attended image

        # Currently f^att_t, alpha_t, c_t
        self.final_concat_size = img_num_features + img_size + args.rnn_hidden_size
        # FIX: original referenced bare `final_concat_size`, which raised
        # NameError at construction time; it must read the attribute.
        self.final_fusion = nn.Linear(self.final_concat_size,
                                      args.rnn_hidden_size)
# --- Example #5 ---
    def __init__(self, args):
        """Build the question encoder (BERT or embedding+LSTM) and the
        audio/visual/question fusion layer, then run weight init."""
        super().__init__()
        self.args = args

        self.dropout = nn.Dropout(p=args.dropout)

        if self.args.lang_model == 'bert':
            # Pretrained BERT encodes the question directly.
            self.ques_net = BertModel.from_pretrained("bert-base-uncased",
                                                      return_dict=True)
        else:
            # Learned embedding + LSTM; DynamicRNN packs the right-padded
            # variable-length question sequences.
            self.word_embed = nn.Embedding(args.vocab_size,
                                           args.embed_size,
                                           padding_idx=0)
            ques_lstm = nn.LSTM(args.embed_size,
                                args.rnn_hidden_size,
                                args.num_layers,
                                batch_first=True,
                                dropout=args.dropout)
            self.ques_net = DynamicRNN(ques_lstm)

        # Fuse audio + visual features with the question encoding.
        self.fusion = nn.Linear(args.audio_feature_size
                                + args.visual_feature_size
                                + args.rnn_hidden_size,
                                args.rnn_hidden_size)

        self.weight_init()
    def __init__(self, args, encoder):
        """Build the option decoder, sharing the encoder's word embedding."""
        super().__init__()
        self.args = args

        # share word embedding with the encoder
        self.word_embed = encoder.word_embed

        # options are variable length padded sequences, use DynamicRNN
        self.option_rnn = DynamicRNN(
            nn.LSTM(args.embed_size, args.rnn_hidden_size, batch_first=True))

        self.log_softmax = nn.LogSoftmax(dim=1)
# --- Example #7 ---
 def __init__(self, args, encoder):
     """Build the option decoder with a 512-unit DynamicRNN LSTM,
     sharing the encoder's word embedding."""
     super().__init__()
     self.args = args
     # Share the word embedding with the encoder.
     self.word_embed = encoder.word_embed
     # NOTE(review): dropout on a single-layer nn.LSTM is a no-op and
     # PyTorch warns about it — confirm one layer is intended here.
     self.opt_embed = DynamicRNN(
         nn.LSTM(args.embed_size,
                 512,
                 batch_first=True,
                 dropout=args.dropout))
     self.log_softmax = nn.LogSoftmax(dim=1)
# --- Example #8 ---
    def __init__(self, args):
        """Build the encoder: image projection, word embedding, and three
        LSTM stages (question+image, history, dialog)."""
        super().__init__()
        self.args = args

        # Project raw image features into the image-embedding space.
        self.img_embed = nn.Linear(args.img_feature_size, args.img_embed_size)
        self.word_embed = nn.Embedding(args.vocab_size,
                                       args.embed_size,
                                       padding_idx=0)

        # Question tokens concatenated with the embedded image at each
        # timestep; DynamicRNN handles right-padded sequences.
        self.ques_img_rnn = DynamicRNN(
            nn.LSTM(args.embed_size + args.img_embed_size,
                    args.rnn_hidden_size,
                    args.num_layers,
                    batch_first=True,
                    dropout=args.dropout))

        # Dialog-history encoder, also DynamicRNN-wrapped.
        self.hist_rnn = DynamicRNN(
            nn.LSTM(args.embed_size,
                    args.rnn_hidden_size,
                    args.num_layers,
                    batch_first=True,
                    dropout=args.dropout))

        # The original paper used no fusion layer, so none is built here
        # (a previous revision had an optional xavier/kaiming-initialized
        # nn.Linear fusion — intentionally omitted).
        # Dialog-level LSTM over 2*hidden-wide inputs — presumably the
        # concatenated question+image and history encodings; note it is
        # NOT wrapped in DynamicRNN, unlike the other two.
        self.dialog_rnn = nn.LSTM(args.rnn_hidden_size * 2,
                                  args.rnn_hidden_size,
                                  args.num_layers,
                                  batch_first=True,
                                  dropout=args.dropout)
# --- Example #9 ---
    def __init__(self, args, encoder):
        """Build the option decoder; its language model mirrors the
        encoder's (shared BERT, or shared embedding + own LSTM)."""
        super().__init__()
        self.args = args

        if args.lang_model == 'bert':
            # Reuse the encoder's BERT instance outright, so question and
            # option encoders share ALL weights. NOTE(review): the original
            # paper shares only the word embedding and keeps separate RNNs
            # per side — confirm full sharing is intended here.
            self.option_net = encoder.ques_net
        else:
            # Share the word embedding; options get their own LSTM,
            # wrapped in DynamicRNN for variable-length padded sequences.
            self.word_embed = encoder.word_embed
            option_lstm = nn.LSTM(args.embed_size,
                                  args.rnn_hidden_size,
                                  batch_first=True)
            self.option_net = DynamicRNN(option_lstm)