def build_stackatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = NewAttention(dataset.v_dim, 2048 + q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    query_net = FCNet([dataset.v_dim, num_hid])

    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModelStackAtt(w_emb, q_emb, v_att, q_net, v_net, query_net,
                              classifier, args)
    return model
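WordEmbedding and QuestionEmbedding are not defined in these snippets. Judging by the call signatures used throughout (vocabulary size, 300-d word vectors, hidden size, layer count, bidirectional flag, dropout), they wrap nn.Embedding and a GRU. The sketch below is an assumption about that shape, not the exact implementation:

import torch.nn as nn


class WordEmbedding(nn.Module):
    """Token ids -> dropout-regularized word vectors (sketch)."""

    def __init__(self, ntoken, emb_dim, dropout):
        super().__init__()
        # one extra row reserved as the padding index, as in common VQA baselines
        self.emb = nn.Embedding(ntoken + 1, emb_dim, padding_idx=ntoken)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.emb(x))


class QuestionEmbedding(nn.Module):
    """Word vectors -> recurrent question feature (sketch)."""

    def __init__(self, in_dim, num_hid, nlayers, bidirect, dropout,
                 rnn_type='GRU'):
        super().__init__()
        rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
        self.rnn = rnn_cls(in_dim, num_hid, nlayers, bidirectional=bidirect,
                           dropout=dropout, batch_first=True)
        self.num_hid = num_hid * (2 if bidirect else 1)

    def forward(self, x):
        output, _ = self.rnn(x)  # [batch, seq_len, num_hid]
        # last time step as the question embedding; real implementations
        # handle padding and direction concatenation more carefully
        return output[:, -1]

    def forward_all(self, x):
        output, _ = self.rnn(x)  # per-token states, as used in the class-based examples below
        return output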
Example #2
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)

    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)

    q_net = FCNet([q_emb.num_hid, num_hid])  # match dimensions
    v_net = FCNet([dataset.v_dim, num_hid])  # match dimensions
    u_net = FCNet([dataset.v_dim, num_hid])  # match dimensions

    classifier = SimpleClassifier(num_hid * 2, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, u_net, classifier)
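FCNet and SimpleClassifier are likewise used throughout without being shown. In the widely copied bottom-up-attention VQA baseline they are a stack of weight-normalized Linear + ReLU layers and a two-layer answer head; the variants above that also take dropout/norm/act arguments extend the same idea. A minimal sketch under that assumption:

import torch.nn as nn
from torch.nn.utils import weight_norm


class FCNet(nn.Module):
    """MLP: weight-normalized Linear + ReLU for each consecutive pair of dims (sketch)."""

    def __init__(self, dims):
        super().__init__()
        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None))
            layers.append(nn.ReLU())
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        return self.main(x)


class SimpleClassifier(nn.Module):
    """Answer head: Linear -> ReLU -> Dropout -> Linear over the answer candidates (sketch)."""

    def __init__(self, in_dim, hid_dim, out_dim, dropout):
        super().__init__()
        self.main = nn.Sequential(
            weight_norm(nn.Linear(in_dim, hid_dim), dim=None),
            nn.ReLU(),
            nn.Dropout(dropout),
            weight_norm(nn.Linear(hid_dim, out_dim), dim=None),
        )

    def forward(self, x):
        return self.main(x)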
Example #3
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
Example #4
def build_caq_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid // 2])
    v_net = FCNet([dataset.v_dim, num_hid // 2])
    updated_query_composer = FCNet([num_hid + num_hid // 2, num_hid])
    neighbour_attention = MultiHeadedAttention(4, num_hid // 2, dropout=0.1)
    Dropout_C = nn.Dropout(0.1)

    classifier = SimpleClassifier(num_hid // 2, num_hid * 2,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, updated_query_composer,
                    neighbour_attention, Dropout_C, classifier, dataset)
Example #5
def build_baseline(dataset, opt):
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                   opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                            opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN * 2, opt.MID_DIM, 1,
                                  opt.FC_DROPOUT)
    ques_att = Q_Att(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    # vlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])
    # rlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])

    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att,
                     classifier, ques_att, opt)
Example #6
def build_baseline(dataset, opt):
    opt = config.parse_opt()  # note: this re-parses options and overrides the opt argument passed in
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                   opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                            opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att,
                     opt)
Example #7
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, q_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    cls_net = FCNet([dataset.cls_dim, num_hid])
    attr_net = FCNet([dataset.attr_dim, num_hid])

    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    classifier = SimpleClassifier(fusion_dim, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    return BaseModel(w_emb, q_emb, v_att, cls_att, attr_att, q_net, v_net,
                     cls_net, attr_net, classifier, mcb)
Example #8
def build_model_A2x3(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                     dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    v_att_1 = Att_2(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_2 = Att_2(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_3 = Att_2(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model_3(w_emb, q_emb, v_att_1, v_att_2, v_att_3, q_net, v_net,
                   classifier)
Example #9
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    if not dataset.bert:
        q_att = SelfAttention(q_emb.num_hid, num_hid)
        v_att = NewAttention(dataset.v_dim + 2, q_emb.num_hid, num_hid)
        q_net = FCNet([q_emb.num_hid, num_hid])
    else:
        q_att = SelfAttention(768, num_hid)
        q_emb = FCNet([768, 768])
        v_att = NewAttention(dataset.v_dim, 768, num_hid)
        q_net = FCNet([768, num_hid])
    v_net = FCNet([dataset.v_dim + 2, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, q_att, v_att, q_net, v_net, classifier,
                     dataset.bert)
Example #10
def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                    classifier, counter, op, gamma)
Example #11
    def __init__(self, opt):
        super(UpDn, self).__init__()
        num_hid = opt.num_hid
        activation = opt.activation
        dropG = opt.dropG
        dropW = opt.dropW
        dropout = opt.dropout
        dropL = opt.dropL
        norm = opt.norm
        dropC = opt.dropC
        self.opt = opt
        print(f"ntokens {opt.ntokens}")
        self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
        self.w_emb.init_embedding(f'{opt.data_dir}/glove6b_init_300d.npy')
        # self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
        #                                bidirect=False, dropout=dropG, rnn_type='GRU')
        self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid)

        self.q_net = FCNet([self.q_emb.num_hid, num_hid],
                           dropout=dropL,
                           norm=norm,
                           act=activation)
        self.gv_net = FCNet([2048, num_hid],
                            dropout=dropL,
                            norm=norm,
                            act=activation)

        self.gv_att_1 = Att_3(v_dim=2048,
                              q_dim=self.q_emb.num_hid,
                              num_hid=num_hid,
                              dropout=dropout,
                              norm=norm,
                              act=activation)
        self.gv_att_2 = Att_3(v_dim=2048,
                              q_dim=self.q_emb.num_hid,
                              num_hid=num_hid,
                              dropout=dropout,
                              norm=norm,
                              act=activation)
        self.classifier = SimpleClassifier(in_dim=num_hid,
                                           hid_dim=2 * num_hid,
                                           out_dim=3129,
                                           dropout=dropC,
                                           norm=norm,
                                           act=activation)
Example #12
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           size=64,
                           dropout_hid=0.0,
                           gamma_r=0.0,
                           adv_mode="wgan",
                           logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid,
                     dataset.v_dim, reconstruction, size, dropout_hid, gamma_r,
                     adv_mode, logger)
Example #13
def build_model_A3x2_h(dataset, num_hid, dropout, norm, activation, dropL,
                       dropG, dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    print('v_dim: %d\tq_dim: %d\tnum_hid: %d\t num ans candidates: %d' %
          (dataset.v_dim, q_emb.num_hid, num_hid, dataset.num_ans_candidates))
    v_att_1 = Att_3(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_2 = Att_3(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    h_net = HNet([1280, 100, 100], [1280, 1280])

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model_h(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net, h_net,
                   classifier)
Example #14
def build_SAN(dataset, args):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if "c" not in args.op else 600, args.num_hid,
                              1, False, 0.0, args.rnn)
    v_att = StackedAttention(
        args.num_stacks,
        dataset.v_dim,
        args.num_hid,
        args.num_hid,
        dataset.num_ans_candidates,
        args.dropout,
    )
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + "/" + args.maml_model_path
        print("load initial weights MAML from: %s" % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + "/" + args.ae_model_path
        print("load initial weights DAE from: %s" % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path, args.map_location))
    # Loading tfidf weighted embedding
    if hasattr(args, "tfidf"):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # init classifier
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, args)
    # construct the VQA model and return it
    if args.maml and args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         ae_v_emb)
    elif args.maml:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         None)
    elif args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, ae_v_emb)
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, None)
Example #15
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid], 'Sigmoid')
    v_net = FCNet([dataset.v_dim, num_hid])
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                        q_net, v_net, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
Example #16
def build_fine(dataset, num_hid, args):

    cnn = getattr(resnet, args.cnn_model)()
    cnn.load_state_dict(
        torch.load(os.path.join(args.model_root, args.cnn_model + '.pth')))
    my_cnn = myResnet(cnn)

    for param in my_cnn.parameters():
        param.requires_grad = False
    for param in my_cnn.resnet.layer4.parameters():
        param.requires_grad = True

    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.4)
    v_att = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModelWithCNN(w_emb, q_emb, v_att, q_net, v_net, classifier,
                             my_cnn, args)
    return model
Example #17
def build_model_P_mod(dataset, num_hid, dropout, norm, activation, dropL,
                      dropG, dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    v_att = Att_P(v_dim=dataset.v_dim,
                  q_dim=q_emb.num_hid,
                  num_hid=num_hid,
                  dropout=dropout,
                  norm=norm,
                  act=activation)
    q_net = GTH(q_emb.num_hid,
                num_hid,
                dropout=dropL,
                norm=norm,
                act=activation)
    v_net = GTH(dataset.v_dim,
                num_hid,
                dropout=dropL,
                norm=norm,
                act=activation)

    classifier = PaperClassifier(in_dim=num_hid,
                                 hid_dim_1=300,
                                 hid_dim_2=2048,
                                 out_dim=dataset.num_ans_candidates,
                                 dropout=dropC,
                                 norm=norm,
                                 act=activation)
    return Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example #18
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           layer=4,
                           size=64,
                           variant='',
                           finetune=False,
                           use_residual=False,
                           use_feat_loss=False,
                           dropout_hid=False,
                           dropout_unet=False,
                           logger=None):

    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid,
                     dataset.v_dim, reconstruction, layer, size, variant,
                     finetune, use_residual, use_feat_loss, dropout_hid,
                     dropout_unet, logger)
Example #19
class typeAttention(nn.Module):
    def __init__(self, size_question, path_init):
        super(typeAttention, self).__init__()
        self.w_emb = WordEmbedding(size_question, 300, 0.0, False)
        self.w_emb.init_embedding(path_init)
        self.q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0, 'GRU')
        self.q_final = QuestionAttention(1024)
        self.f_fc1 = linear(1024, 2048)
        self.f_fc2 = linear(2048, 1024)
        self.f_fc3 = linear(1024, 1024)

    def forward(self, question):
        w_emb = self.w_emb(question)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        q_final = self.q_final(w_emb, q_emb)  # b, 1024

        x_f = self.f_fc1(q_final)
        x_f = F.relu(x_f)
        x_f = self.f_fc2(x_f)
        x_f = F.dropout(x_f)  # note: F.dropout defaults to training=True, so it stays active even in eval mode
        x_f = F.relu(x_f)
        x_f = self.f_fc3(x_f)

        return x_f
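A quick aside on the F.dropout call in forward above: unlike nn.Dropout, the functional form does not track module train/eval state, so it keeps dropping activations at inference time unless a training flag is passed explicitly. A small self-contained check in plain PyTorch:

import torch
import torch.nn.functional as F

x = torch.ones(1, 8)
torch.manual_seed(0)
print(F.dropout(x))                        # zeros appear even outside training
print(F.dropout(x, training=False))        # identity: all ones
print(torch.nn.Dropout(p=0.5).eval()(x))   # nn.Dropout respects eval(): all ones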
Example #20
def build_BAN(dataset, args, priotize_using_counter=False):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if "c" not in args.op else 600, args.num_hid,
                              1, False, 0.0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + "/" + args.maml_model_path
        print("load initial weights MAML from: %s" % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + "/" + args.ae_model_path
        print("load initial weights DAE from: %s" % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path, args.map_location))
    # Loading tfidf weighted embedding
    if hasattr(args, "tfidf"):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # Optional module: counter for BAN
    use_counter = (args.use_counter if priotize_using_counter is None else
                   priotize_using_counter)
    if use_counter or priotize_using_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(
            BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], "", 0.2))
        if use_counter or priotize_using_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], "ReLU", 0.0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # construct the VQA model and return it
    if args.maml and args.autoencoder:
        return BAN_Model(
            dataset,
            w_emb,
            q_emb,
            v_att,
            b_net,
            q_prj,
            c_prj,
            classifier,
            counter,
            args,
            maml_v_emb,
            ae_v_emb,
        )
    elif args.maml:
        return BAN_Model(
            dataset,
            w_emb,
            q_emb,
            v_att,
            b_net,
            q_prj,
            c_prj,
            classifier,
            counter,
            args,
            maml_v_emb,
            None,
        )
    elif args.autoencoder:
        return BAN_Model(
            dataset,
            w_emb,
            q_emb,
            v_att,
            b_net,
            q_prj,
            c_prj,
            classifier,
            counter,
            args,
            None,
            ae_v_emb,
        )
    return BAN_Model(
        dataset,
        w_emb,
        q_emb,
        v_att,
        b_net,
        q_prj,
        c_prj,
        classifier,
        counter,
        args,
        None,
        None,
    )
Example #21
        utils.assert_eq(len(tokens), max_length)
        ans_tokens.append(tokens)
    return ans_tokens


def create_answer_embedding(ans_list, dictionary, w_emb, ans_emb):
    ans_tokens = tokenize(ans_list, dictionary)
    ans_tokens = torch.from_numpy(np.array(ans_tokens))
    answer_embedding = torch.zeros(3129, 1024)
    for idx, ans in enumerate(ans_tokens):
        ans = ans.unsqueeze(0)
        w = w_emb(ans)
        ans = ans_emb(w)
        answer_embedding[idx] = ans.squeeze()

    with open('data/answer_embedding.pkl', 'wb') as f:
        cPickle.dump(answer_embedding, f)


if __name__ == '__main__':
    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    w_emb = WordEmbedding(dictionary.ntoken, 300, .0, 'c')
    w_emb.init_embedding('data/glove6b_init_300d.npy', None, None)
    ans_emb = QuestionEmbedding(600, 1024, 1, False, .0)

    ans2label_path = os.path.join('data', 'cache', 'trainval_ans2label.pkl')
    ans2label = cPickle.load(open(ans2label_path, 'rb'))
    ans_list = [ans for ans in ans2label]
    create_answer_embedding(ans_list, dictionary, w_emb, ans_emb)
Example #22
    return loss


def compute_score_with_logits(logits, labels):
    logits = torch.max(logits, 1)[1].data # argmax
    one_hots = torch.zeros(*labels.size()).cuda()
    one_hots.scatter_(1, logits.view(-1, 1), 1)
    scores = (one_hots * labels)
    return scores
pred2.shape
tmp = instance_bce_with_logits(pred2.to('cpu'), a2.to('cpu'))
tmp.size()
tmp.shape
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)
loss = torch.nn.functional.binary_cross_entropy_with_logits(input, target)
loss.backward()
len(tmp[0])
tmp
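# instance_bce_with_logits is called above but not defined in this fragment.
# Below is a minimal sketch of the usual formulation in bottom-up-attention VQA
# code (an assumption), plus a CPU-only rewrite of compute_score_with_logits so
# the scoring step can be tried without a GPU. The _sketch/_cpu names are
# placeholders, not part of the original file.
import torch


def instance_bce_with_logits_sketch(logits, labels):
    # mean BCE over all entries, rescaled by the number of answer candidates
    loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels)
    loss *= labels.size(1)
    return loss


def compute_score_with_logits_cpu(logits, labels):
    # argmax answer -> one-hot -> read off the soft VQA accuracy target
    pred = torch.max(logits, 1)[1]
    one_hots = torch.zeros_like(labels)
    one_hots.scatter_(1, pred.view(-1, 1), 1)
    return one_hots * labels


demo_logits = torch.randn(2, 5)
demo_labels = torch.tensor([[0., 0.3, 1.0, 0., 0.],
                            [1.0, 0., 0., 0.6, 0.]])
instance_bce_with_logits_sketch(demo_logits, demo_labels)
compute_score_with_logits_cpu(demo_logits, demo_labels).sum(dim=1)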
from language_model import QuestionEmbedding, WordEmbedding, QuestionEmbedding2
num_hid = 1024
w_emb = WordEmbedding(dictionary.ntoken, 300, 0.0)
q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
q_emb2 = QuestionEmbedding2(300, num_hid, 1, True, 0.0)
w_emb = w_emb(q)
q_emb = q_emb(w_emb)  # [batch, q_dim]
q_emb2 = q_emb2(w_emb)
q_emb.shape
q.shape
q_emb2.shape
type(q_emb2)
Example #23
class BAN_Model(nn.Module):
    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()

        self.args = args
        # init word embedding module, question embedding module, biAttention network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.cat)
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim, 1, False, .0, args.rnn)
        self.bi_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
        self.bi_resnet = BiResNet(args, dataset)
        self.classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_ans_candidates, args)


        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)
            
        # Loading the other net
        if args.other_model:
            self.unet = Resnet50Encoder()
        

    def forward(self, v, q):
        """Forward
        v: [batch, num_objs, obj_dim]
        b: [batch, num_objs, b_dim]
        q: [batch_size, seq_length]
        return: logits, not probs
        """
        # get visual feature
        if self.args.maml:
            maml_v_emb = self.maml(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if self.args.autoencoder:
            encoder = self.ae.forward_pass(v[1])
            decoder = self.ae.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        if self.args.maml and self.args.autoencoder:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        if self.args.other_model:
            v_emb = self.unet(v)  # input: (b, c, h, w) with c == 3; output: (b, c, 1, 1)
            v_emb = v_emb.squeeze(3).squeeze(2).unsqueeze(1)  # -> (b, 1, c)

        # get textual feature
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb) # [batch, q_len, q_dim]
        # Attention
        att_p, logits = self.bi_att(v_emb, q_emb) # b x g x v x q
        # bilinear residual network
        last_output = self.bi_resnet(v_emb, q_emb, att_p)
        if self.args.autoencoder:
            return last_output, decoder
        return last_output

    def classify(self, input_feats):
        return self.classifier(input_feats)
Example #24
class BAN_Model(nn.Module):
    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()

        self.args = args
        # init word embedding module, question embedding module, biAttention network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0,
                                   args.cat)
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim,
                                       1, False, .0, args.rnn)

        # for close att+ resnet + classify
        self.close_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                     args.glimpse)
        self.close_resnet = BiResNet(args, dataset)
        self.close_classifier = SimpleClassifier(args.hid_dim,
                                                 args.hid_dim * 2,
                                                 dataset.num_close_candidates,
                                                 args)

        # for open_att + resnet + classify
        self.open_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                    args.glimpse)
        self.open_resnet = BiResNet(args, dataset)
        self.open_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2,
                                                dataset.num_open_candidates,
                                                args)

        # type attention: b * 1024
        self.typeatt = typeAttention(dataset.dictionary.ntoken,
                                     './data/glove6b_init_300d.npy')

        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)

        # Loading the other net
        if args.other_model:
            pass

    def forward(self, v, q, a, answer_target):
        """Forward
        v: [batch, num_objs, obj_dim]
        b: [batch, num_objs, b_dim]
        q: [batch_size, seq_length]
        return: logits, not probs
        """
        # get visual feature
        if self.args.maml:
            maml_v_emb = self.maml(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if self.args.autoencoder:
            encoder = self.ae.forward_pass(v[1])
            decoder = self.ae.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        if self.args.maml and self.args.autoencoder:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        if self.args.other_model:
            pass

        # get type attention
        type_att = self.typeatt(q)

        # get textual feature (global)
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]

        # get open & close feature
        v_open, v_close, q_open, q_close, a_open, a_close, typeatt_open, typeatt_close = seperate(
            v_emb, q_emb, a, type_att, answer_target)

        # diverse Attention -> (open + close)
        # att_p, logits = self.bi_att(v_emb, q_emb) # b x g x v x q
        att_close, _ = self.close_att(v_close, q_close)
        att_open, _ = self.open_att(v_open, q_open)

        # bilinear residual network
        # last_output = self.bi_resnet(v_emb,q_emb,att_p)
        last_output_close = self.close_resnet(v_close, q_close, att_close)
        last_output_open = self.open_resnet(v_open, q_open, att_open)

        # type attention (5.19 try)
        last_output_close = last_output_close * typeatt_close
        last_output_open = last_output_open * typeatt_open

        if self.args.autoencoder:
            return last_output_close, last_output_open, a_close, a_open, decoder
        return last_output_close, last_output_open, a_close, a_open

    def classify(self, close_feat, open_feat):
        return self.close_classifier(close_feat), self.open_classifier(
            open_feat)

    def forward_classify(self, v, q, a, classify):
        # get visual feature
        if self.args.maml:
            maml_v_emb = self.maml(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if self.args.autoencoder:
            encoder = self.ae.forward_pass(v[1])
            decoder = self.ae.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        if self.args.maml and self.args.autoencoder:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        if self.args.other_model:
            pass

        # get type attention
        type_att = self.typeatt(q)

        # get textual feature (global)
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]

        # get open & close feature
        answer_target = classify(q)
        _, predicted = torch.max(answer_target, 1)
        v_open, v_close, q_open, q_close, a_open, a_close, typeatt_open, typeatt_close = seperate(
            v_emb, q_emb, a, type_att, predicted)

        # diverse Attention -> (open + close)
        # att_p, logits = self.bi_att(v_emb, q_emb) # b x g x v x q
        att_close, _ = self.close_att(v_close, q_close)
        att_open, _ = self.open_att(v_open, q_open)

        # bilinear residual network
        # last_output = self.bi_resnet(v_emb,q_emb,att_p)
        last_output_close = self.close_resnet(v_close, q_close, att_close)
        last_output_open = self.open_resnet(v_open, q_open, att_open)

        # type attention (5.19 try)
        last_output_close = last_output_close * typeatt_close
        last_output_open = last_output_open * typeatt_open

        if self.args.autoencoder:
            return last_output_close, last_output_open, a_close, a_open, decoder
        return last_output_close, last_output_open, a_close, a_open