Example No. 1
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    q_att = weight_norm(nn.Linear(num_hid, 1), dim=None)

    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb2 = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1,
                               False, .0)
    v_att2 = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    v_net2 = FCNet([dataset.v_dim, num_hid])
    q_att2 = weight_norm(nn.Linear(num_hid, 1), dim=None)

    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, 3, .5)
        classifier2 = SimpleClassifier(num_hid, num_hid * 2,
                                       dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, q_att, b_net, q_prj,
                        c_prj, q_net, v_net, classifier, classifier2, counter,
                        op, gamma, w_emb2, q_emb2, v_att2, v_net2, q_att2)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
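A minimal usage sketch, not part of the source: the dataset class name is hypothetical, and the builder only needs an object exposing dictionary.ntoken, v_dim, and num_ans_candidates.

train_dset = VQAFeatureDataset('train', dictionary)  # hypothetical dataset class
model = build_ban(train_dset, num_hid=1280, op='c', gamma=8, task='vqa')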
Example No. 2
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    c_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_net = FCNet([c_emb.num_hid, num_hid])
    classifier = SimpleClassifier(2 * num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, c_emb, v_att, q_net, v_net, c_net,
                     classifier)
Example No. 3
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb1 = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    q_emb2 = QuestionEmbedding(300, num_hid, 1, False, 0.0)  # built but never passed to AttentionModel
    v_att = StackAttention(num_hid, num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([num_hid, num_hid])
    linear = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return AttentionModel(w_emb, q_emb1, v_att, q_net, v_net, classifier,
                          linear)
Example No. 4
    def __init__(self, opt):
        super(Model, self).__init__()
        num_hid = opt.num_hid
        activation = opt.activation
        dropG = opt.dropG
        dropW = opt.dropW
        dropout = opt.dropout
        dropL = opt.dropL
        norm = opt.norm
        dropC = opt.dropC
        self.opt = opt

        self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
        self.w_emb.init_embedding(opt.dataroot + 'glove6b_init_300d.npy')
        self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                       bidirect=False, dropout=dropG, rnn_type='GRU')

        self.q_net = FCNet([self.q_emb.num_hid, num_hid], dropout=dropL, norm=norm, act=activation)
        self.gv_net = FCNet([opt.v_dim, num_hid], dropout=dropL, norm=norm, act=activation)

        self.gv_att_1 = Att_3(v_dim=opt.v_dim, q_dim=self.q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm,
                              act=activation)
        self.gv_att_2 = Att_3(v_dim=opt.v_dim, q_dim=self.q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm,
                              act=activation)
        self.classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid, out_dim=opt.ans_dim,
                                           dropout=dropC, norm=norm, act=activation)

        self.normal = nn.BatchNorm1d(num_hid, affine=False)
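The source shows only the constructor; below is a hedged sketch of the forward pass these modules imply, assuming the usual attend-fuse-classify pipeline (the real method may differ).

    def forward(self, q_tokens, gv):
        w_emb = self.w_emb(q_tokens)             # [B, T, 300] word vectors
        q_repr = self.q_emb(w_emb)               # [B, num_hid] question encoding
        att_1 = self.gv_att_1(gv, q_repr)        # first attention glimpse over regions
        att_2 = self.gv_att_2(gv, q_repr)        # second glimpse
        gv_emb = ((att_1 + att_2) * gv).sum(1)   # attended visual feature [B, v_dim]
        joint = self.q_net(q_repr) * self.gv_net(gv_emb)  # gated fusion [B, num_hid]
        return self.classifier(joint)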
Example No. 5
    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()

        self.args = args
        # init word embedding module, question embedding module, biAttention network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.cat)
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim, 1, False, .0, args.rnn)
        self.bi_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
        self.bi_resnet = BiResNet(args, dataset)
        self.classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_ans_candidates, args)


        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)
            
        # Loading the other net
        if args.other_model:
            self.unet = Resnet50Encoder()
Example No. 6
def build_model_APD(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                    dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    v_att = Att_PD(v_dim=dataset.v_dim,
                   q_dim=q_emb.num_hid,
                   num_hid=num_hid,
                   dropout=dropout,
                   norm=norm,
                   act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example No. 7
def build_SAN(dataset, args):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, 0.0, args.rnn)
    v_att = StackedAttention(args.num_stacks, dataset.v_dim, args.num_hid,
                             args.num_hid, dataset.num_ans_candidates,
                             args.dropout)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # init classifier
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         ae_v_emb)
    elif args.maml:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         None)
    elif args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, ae_v_emb)
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, None)
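A hedged call sketch, not from the source, using only the flag names build_SAN reads above; leaving tfidf out of the namespace skips the hasattr branch, and the rnn value is illustrative.

from argparse import Namespace

args = Namespace(op='', num_hid=1024, rnn='LSTM', num_stacks=2, dropout=0.5,
                 maml=False, autoencoder=False)  # no tfidf attribute
model = build_SAN(dataset, args)  # dataset as in the other examples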
Example No. 8
def build_model_P_exact(dataset, num_hid, dropout, norm, activation):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=0.0)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=0,
                              rnn_type='GRU')

    v_att = Att_P(v_dim=dataset.v_dim,
                  q_dim=q_emb.num_hid,
                  num_hid=num_hid,
                  dropout=dropout,
                  norm=norm,
                  act=activation)
    q_net = GTH(q_emb.num_hid, num_hid, dropout=0, norm=norm, act=activation)
    v_net = GTH(dataset.v_dim, num_hid, dropout=0, norm=norm, act=activation)

    classifier = PaperClassifier(in_dim=num_hid,
                                 hid_dim_1=300,
                                 hid_dim_2=2048,
                                 out_dim=dataset.num_ans_candidates,
                                 dropout=0,
                                 norm=norm,
                                 act=activation)
    return Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example No. 9
    def __init__(self, opt):
        super(Model, self).__init__()
        self.dictionary = Dictionary.load_from_file(opt.dataroot +
                                                    'dictionary.pkl')
        num_hid = 128
        activation = opt.activation
        dropG = opt.dropG
        dropW = opt.dropW
        dropout = opt.dropout
        dropL = opt.dropL
        norm = opt.norm
        dropC = opt.dropC
        self.opt = opt

        self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
        self.w_emb.init_embedding(opt.dataroot + 'glove6b_init_300d.npy')
        self.q_emb = QuestionEmbedding(in_dim=300,
                                       num_hid=num_hid,
                                       nlayers=1,
                                       bidirect=False,
                                       dropout=dropG,
                                       rnn_type='GRU')
        self.q_net = FCNet([self.q_emb.num_hid, num_hid],
                           dropout=dropL,
                           norm=norm,
                           act=activation)
        self.classifier = SimpleClassifier(
            in_dim=num_hid,
            hid_dim=num_hid // 2,
            out_dim=2,  # opt.test_candi_ans_num
            dropout=dropC,
            norm=norm,
            act=activation)
        self.normal = nn.BatchNorm1d(num_hid, affine=False)
Example No. 10
def build_ban(num_token,
              v_dim,
              num_hid,
              num_ans,
              op='',
              gamma=4,
              reasoning=False):
    w_emb = WordEmbedding(num_token, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    if not reasoning:
        v_att = BiAttention(v_dim, num_hid, num_hid, gamma)
    else:
        v_att = BiAttention(v_dim, num_hid, num_hid, 1)

    # constructing the model
    b_net = []
    q_prj = []
    c_prj = []
    objects = 36  # minimum number of boxes, originally 10
    for i in range(gamma):
        b_net.append(BCNet(v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifier(num_hid, num_hid * 2, num_ans, .5)
    counter = Counter(objects)
    if not reasoning:
        return BanModel(w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                        counter, op, gamma, num_hid)
    else:
        return BanModel_Reasoning(w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                                  classifier, counter, op, gamma, num_hid)
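A hedged call sketch for this standalone variant; all values are illustrative assumptions (2048-d region features and a 3129-answer vocabulary are common in bottom-up-attention VQA setups, but are not taken from the source).

model = build_ban(num_token=20000, v_dim=2048, num_hid=1280, num_ans=3129,
                  op='c', gamma=8, reasoning=False)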
Example No. 11
    def __init__(self, size_question, path_init):
        super(classify_model, self).__init__()
        self.w_emb = WordEmbedding(size_question, 300, 0.0, False)
        self.w_emb.init_embedding(path_init)
        self.q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0, 'GRU')
        self.q_final = QuestionAttention(1024)
        self.f_fc1 = linear(1024, 256)
        self.f_fc2 = linear(256, 64)
        self.f_fc3 = linear(64, 2)
Example No. 12
def build_baseline_Implicit(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Implicit(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel_Implicit(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example No. 13
    def __init__(self, size_question, path_init):
        super(typeAttention, self).__init__()
        self.w_emb = WordEmbedding(size_question, 300, 0.0, False)
        self.w_emb.init_embedding(path_init)
        self.q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0, 'GRU')
        self.q_final = QuestionAttention(1024)
        self.f_fc1 = linear(1024, 2048)
        self.f_fc2 = linear(2048, 1024)
        self.f_fc3 = linear(1024, 1024)
Example No. 14
def build_caq(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example No. 15
def build_model(dataset, v_dim, num_hid, logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, logger)
Example No. 16
def build_questionembedding_model(num_hid):
    questionembedding_config = QuestionEmbeddingConfig(
        input_dim=300,
        hidden_dim=num_hid,
        num_layers=1,
        is_bidirect=False,
        dropout_prob=0.5,
        rnn_type="GRU"
    )
    return QuestionEmbedding(questionembedding_config)
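Unlike the positional-argument variants above, this builder passes a config object. A minimal sketch of what QuestionEmbeddingConfig might look like, assumed rather than taken from the source:

from dataclasses import dataclass

@dataclass
class QuestionEmbeddingConfig:
    input_dim: int = 300
    hidden_dim: int = 1024
    num_layers: int = 1
    is_bidirect: bool = False
    dropout_prob: float = 0.0
    rnn_type: str = "GRU"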
Example No. 17
def build_attention_model(dataset, args):
    num_hid = args.num_hid
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = SoftAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return Attention_Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example No. 18
    def __init__(self, ndim=1024, n_hid=1024, dropout=0.0):
        super(YNclassifier, self).__init__()
        self.ndim = ndim
        self.n_hid = n_hid
        self.dropout = dropout
        self.rnn = QuestionEmbedding(ndim, n_hid, 1, False, dropout)
        self.activ = nn.LeakyReLU()
        self.Lin = nn.Sequential(nn.Linear(n_hid, 512), self.activ,
                                 nn.Linear(512, 128), self.activ,
                                 nn.Linear(128, 1))
Example No. 19
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_LL = SimpleClassifier(
        num_hid * 4, num_hid // 8, 1, 0.5)  # integer division keeps hid_dim an int
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL,
                      classifier_All)
Example No. 20
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att1 = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    v_att2 = Attention(dataset.v_dim, q_emb.num_hid + dataset.v_dim, num_hid)

    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return SANModel1(w_emb, q_emb, v_att1, v_att2, q_net, v_net, classifier)
Example No. 21
def build_baseline(dataset):
    opt = config.parse_opt()
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, 1,
                                  opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_att, classifier, v_emb)
Example No. 22
def build_baseline2(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = nn.Linear(dataset.v_dim, 300)
    v_bn = nn.BatchNorm1d(300, momentum=0.01)
    lstm = nn.LSTM(300, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm, v_bn)
Example No. 23
def build_dualatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, args)
    return model
Example No. 24
def build_baseline(dataset, opt):
    opt = config.parse_opt()  # note: overrides the opt passed in
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                   opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                            opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, dataset.num_ans,
                                  opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att,
                     classifier, opt)
Example No. 25
def build_BAN(dataset, args, priotize_using_counter=None):  # None defers to args.use_counter
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # Optional module: counter for BAN
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter or priotize_using_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(
            BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter or priotize_using_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, maml_v_emb, None)
    elif args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, None, ae_v_emb)
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                     classifier, counter, args, None, None)
Example No. 26
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_1 = MLP(input_dim=1024,
              dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, c_1, c_2)
Example No. 27
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net, classifier_V,
                           classifier_Q, classifier_All)
Example No. 28
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
Example No. 29
def build_caq_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid // 2])
    v_net = FCNet([dataset.v_dim, num_hid // 2])
    updated_query_composer = FCNet([num_hid + num_hid // 2, num_hid])
    neighbour_attention = MultiHeadedAttention(4, num_hid // 2, dropout=0.1)
    Dropout_C = nn.Dropout(0.1)

    classifier = SimpleClassifier(num_hid // 2, num_hid * 2,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, updated_query_composer,
                    neighbour_attention, Dropout_C, classifier, dataset)
Example No. 30
def build_baseline(dataset, opt):
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                   opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                            opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN * 2, opt.MID_DIM, 1,
                                  opt.FC_DROPOUT)
    ques_att = Q_Att(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    # vlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])
    # rlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])

    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att,
                     classifier, ques_att, opt)