示例#1
0
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    """Assemble a BAN model.

    Returns a ``BanModel`` when ``task == 'vqa'`` and a ``BanModel_flickr``
    when ``task == 'flickr'`` (any other task returns None).
    """
    # When 'c' appears in op, the word embedding is concatenated to 600-d.
    emb_dim = 600 if 'c' in op else 300

    # First question/visual branch.
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, op)
    q_emb = QuestionEmbedding(emb_dim, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    q_att = weight_norm(nn.Linear(num_hid, 1), dim=None)

    # Second, structurally identical branch.
    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, op)
    q_emb2 = QuestionEmbedding(emb_dim, num_hid, 1, False, 0.0)
    v_att2 = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    v_net2 = FCNet([dataset.v_dim, num_hid])
    q_att2 = weight_norm(nn.Linear(num_hid, 1), dim=None)

    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        # One bilinear/projection/count triplet per glimpse.
        for _ in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        # First classifier has a fixed 3-way output; second covers the
        # full answer vocabulary.
        classifier = SimpleClassifier(num_hid, num_hid * 2, 3, .5)
        classifier2 = SimpleClassifier(num_hid, num_hid * 2,
                                       dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, q_att, b_net, q_prj,
                        c_prj, q_net, v_net, classifier, classifier2, counter,
                        op, gamma, w_emb2, q_emb2, v_att2, v_net2, q_att2)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
示例#2
0
def build_baseline0_newatt(dataset, num_hid):
    """Build a baseline with object/class/attribute streams fused by MCB."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)

    # One question-guided attention head per visual stream.
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, ques_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, ques_emb.num_hid, num_hid)

    # Project every modality into the shared num_hid space.
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    cls_proj = FCNet([dataset.cls_dim, num_hid])
    attr_proj = FCNet([dataset.attr_dim, num_hid])

    # Compact bilinear pooling expands the fused vector to 16000-d,
    # which is what the classifier consumes.
    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    answer_head = SimpleClassifier(fusion_dim, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)

    return BaseModel(word_emb, ques_emb, vis_att, cls_att, attr_att,
                     ques_proj, vis_proj, cls_proj, attr_proj, answer_head,
                     mcb)
    def __init__(self,
                 vocab_size,
                 embed_hidden=300,
                 mlp_hidden=512):
        """Set up question encoding, attention, and the classifier MLP.

        Args:
            vocab_size: size of the output vocabulary.
            embed_hidden: word-embedding width fed to the LSTM.
            mlp_hidden: hidden width used throughout the module.
        """
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size

        # Bidirectional LSTM over word embeddings; its 2*mlp_hidden output
        # is projected back down to mlp_hidden.
        self.q_emb = nn.LSTM(embed_hidden, mlp_hidden,
                             batch_first=True, bidirectional=True)
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        self.v_att = NewAttention(mlp_hidden, mlp_hidden, mlp_hidden)
        # Earlier FCNet/SimpleClassifier head kept for reference:
        '''self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)'''
        # MLP over flattened 7x7 features concatenated with a mlp_hidden
        # vector. NOTE(review): there is no final projection to vocab_size
        # in this Sequential — presumably applied elsewhere; confirm.
        self.classifier = nn.Sequential(
            nn.Linear(mlp_hidden * 7 *7 + mlp_hidden, mlp_hidden*8),
            nn.BatchNorm1d(mlp_hidden*8),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(mlp_hidden * 8, mlp_hidden*8),
            nn.BatchNorm1d(mlp_hidden*8),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
        )
示例#4
0
 def __init__(self, image_emb_size, qst_emb_size, no_ans):
     """Set up question-guided image attention and a linear answer layer.

     Args:
         image_emb_size: width of the image feature vectors.
         qst_emb_size: width of the question embedding.
         no_ans: number of answer classes.
     """
     super(VQA_Model, self).__init__()
     # Classifier consumes the concatenated image+question embedding.
     emb_size = image_emb_size + qst_emb_size
     self.img_att = NewAttention(image_emb_size, qst_emb_size, qst_emb_size)
     # Earlier variant kept for reference:
     #fc1_size = 1024
     #emb_size = fc1_size + qst_desc_emb_size
     #self.linear1 =  nn.Linear(img_ques_emb_size, fc1_size)
     self.linear = nn.Linear(emb_size, no_ans)
示例#5
0
def build_baseline0_newatt(dataset, num_hid):
    """Build a baseline with question self-attention; supports BERT features.

    With ``dataset.bert`` set, the RNN question encoder is replaced by an
    FCNet over 768-d features.
    """
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    if dataset.bert:
        # BERT path: 768-d question features; visual attention over raw v_dim.
        ques_att = SelfAttention(768, num_hid)
        ques_emb = FCNet([768, 768])  # replaces the RNN encoder above
        vis_att = NewAttention(dataset.v_dim, 768, num_hid)
        ques_proj = FCNet([768, num_hid])
    else:
        # RNN path: visual features carry 2 extra dims for the attention.
        ques_att = SelfAttention(ques_emb.num_hid, num_hid)
        vis_att = NewAttention(dataset.v_dim + 2, ques_emb.num_hid, num_hid)
        ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim + 2, num_hid])
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, ques_att, vis_att, ques_proj,
                     vis_proj, answer_head, dataset.bert)
示例#6
0
def build_baseline0_newatt(dataset, num_hid):
    """Build the standard bottom-up/top-down baseline model."""
    word_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                     answer_head)
示例#7
0
 def __init__(self, image_emb_size, qst_emb_size, no_ans):
     """Set up attention plus projected-fusion classifier for VQA.

     Args:
         image_emb_size: width of the image feature vectors.
         qst_emb_size: width of the question embedding.
         no_ans: number of answer classes.
     """
     super(VQA_Model, self).__init__()
     num_hid = 1024  # fixed joint-embedding width
     #emb_size = image_emb_size + qst_emb_size
     self.img_att = NewAttention(image_emb_size, qst_emb_size, qst_emb_size)
     # self.linear =  nn.Linear(emb_size, no_ans)
     # NOTE(review): q_net is sized by image_emb_size and v_net by
     # qst_emb_size — the names look swapped; confirm against forward().
     self.q_net = FCNet([image_emb_size, num_hid])
     self.v_net = FCNet([qst_emb_size, num_hid])
     self.classifier = SimpleClassifier(num_hid, num_hid * 2, no_ans, 0.5)
示例#8
0
def visualize_vqe(dataset, num_hid, att_dim, dec_dim):
    """Build a VQE model with an ST decoder for explanation generation.

    ``att_dim`` is accepted for signature compatibility but unused here.
    """
    word_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    generator = STDecoder(dataset.v_dim, num_hid, 300, dec_dim,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQE(word_emb, ques_emb, vis_att, ques_proj, vis_proj, generator)
示例#9
0
    def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
        """Set up the question LSTM, its projection, and visual attention.

        Args:
            vocab_size: size of the output vocabulary (stored only).
            embed_hidden: word-embedding width fed to the LSTM.
            mlp_hidden: hidden width used throughout the module.
        """
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size

        # Bidirectional LSTM; its 2*mlp_hidden output is projected back
        # down to mlp_hidden before attention.
        self.q_emb = nn.LSTM(embed_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.v_att = NewAttention(mlp_hidden, mlp_hidden, mlp_hidden)
def build_LL_newatt(dataset, num_hid):
    """Build the baseline plus an auxiliary LL (loss-learning) classifier.

    Args:
        dataset: dataset object providing dictionary, v_dim and the
            answer-candidate count.
        num_hid: hidden width of the joint embedding.

    Returns:
        An LL4ALModel combining the standard VQA head with a 1-output
        LL head.
    """
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # BUG FIX: `num_hid / 8` is true division in Python 3 and yields a
    # float, which is invalid as a layer width — use floor division.
    classifier_LL = SimpleClassifier(num_hid * 4, num_hid // 8, 1, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL,
                      classifier_All)
示例#11
0
def build_baseline0_newatt(dataset, num_hid):
    """Build the baseline plus an MLP/linear pair (c_1, c_2) over answers.

    NOTE(review): the MLP widths are hard-coded to 1024, which implicitly
    assumes ``num_hid == 1024`` — confirm before changing num_hid.
    """
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    c_1 = MLP(input_dim=1024,
              dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                     answer_head, c_1, c_2)
示例#12
0
def build_vqae3_split(dataset, num_hid, att_dim, dec_dim):
    """Build a split VQA-E model with two parallel attention branches.

    ``att_dim`` is accepted for signature compatibility but unused here.
    """
    word_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    # First attention branch.
    att_1 = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    q_proj_1 = FCNet([ques_emb.num_hid, num_hid])
    v_proj_1 = FCNet([dataset.v_dim, num_hid])
    # Second, structurally identical branch.
    att_2 = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    q_proj_2 = FCNet([ques_emb.num_hid, num_hid])
    v_proj_2 = FCNet([dataset.v_dim, num_hid])
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    generator = STDecoder(dataset.v_dim, num_hid, 300, dec_dim,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    # Explanation encoder shares the decoder's embedding table.
    expl_emb = ExplainEmbedding(generator.embed, 300, num_hid, 1, False, 0.0,
                                'GRU')
    proj_vq = FCNet([num_hid, num_hid])
    proj_e = FCNet([expl_emb.num_hid, num_hid])
    return Split_VQAE(word_emb, ques_emb, att_1, q_proj_1, v_proj_1, att_2,
                      q_proj_2, v_proj_2, answer_head, generator, expl_emb,
                      proj_vq, proj_e)
示例#13
0
def build_vqae_newatt(dataset, num_hid, att_dim, dec_dim):
    """Build a VQA-E model: standard classifier plus an ST explanation decoder.

    ``att_dim`` is accepted for signature compatibility but unused here.
    """
    word_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    generator = STDecoder(dataset.v_dim, num_hid, 300, dec_dim,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQAE(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                answer_head, generator)
def build_multimodal_newatt(dataset, num_hid):
    """Build a model with separate visual-only, question-only and joint heads."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    # NOTE(review): the visual-only head hard-codes a 2048-d input instead
    # of num_hid — presumably the raw feature width; confirm intended.
    classifier_V = SimpleClassifier(2048, num_hid * 2,
                                    dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(num_hid, num_hid * 2,
                                    dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, 0.5)
    return MultiModalModel(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                           classifier_V, classifier_Q, classifier_All)
def build_stackatt(dataset, num_hid, args):
    """Build the stacked-attention model variant."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    ques_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    # Attention query concatenates a 2048-d feature with the question
    # encoding, hence the widened second argument.
    vis_att = NewAttention(dataset.v_dim, 2048 + ques_emb.num_hid, num_hid,
                           0.2)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    query_proj = FCNet([dataset.v_dim, num_hid])

    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)

    return BaseModelStackAtt(word_emb, ques_emb, vis_att, ques_proj,
                             vis_proj, query_proj, answer_head, args)
示例#16
0
def build_CCB_model(dataset, num_hid):
    """Build the CCB model with paired ct/cx projections per modality."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    # Two projections each for question and image (ct / cx streams).
    q_ct = FCNet([ques_emb.num_hid, num_hid])
    q_cx = FCNet([ques_emb.num_hid, num_hid])
    v_ct = FCNet([dataset.v_dim, num_hid])
    v_cx = FCNet([dataset.v_dim, num_hid])
    head_fq = SimpleClassifier(num_hid, num_hid * 2,
                               dataset.num_ans_candidates, 0.5)
    head_vq = SimpleClassifier(num_hid, num_hid * 2,
                               dataset.num_ans_candidates, 0.5)
    # NOTE: CCB_Model expects v_cx as its last positional argument.
    return CCB_Model(word_emb, ques_emb, vis_att, q_ct, q_cx, v_ct,
                     head_fq, head_vq, v_cx)
示例#17
0
def build_lstm_vqa(dataset, num_hid, att_dim, dec_dim):
    """Build a VQA model with an SAT decoder and a GRU cell over features."""
    word_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    generator = SATDecoder(dataset.v_dim, num_hid, 300, att_dim, dec_dim,
                           dataset.explanation_dictionary.ntoken, 1, 0.5)
    # A single GRU cell rather than a full nn.GRU; the earlier variant was:
    # att_emb = nn.GRU(dataset.v_dim, num_hid, 1, False, batch_first=True)
    att_emb = nn.GRUCell(dataset.v_dim, num_hid)
    answer_head = SimpleClassifier(num_hid, 2 * num_hid,
                                   dataset.num_ans_candidates, 0.5)
    return LSTM_VQA(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                    generator, att_emb, answer_head)
示例#18
0
def build_vqae2_newatt(dataset, num_hid, emb_rnn='GRU'):
    """Build a VQA-E v2 model: classifier plus explanation decoder/encoder.

    NOTE(review): most widths are hard-coded to 1024 and ignore num_hid,
    which only sizes the explanation embedding — confirm this is intended.
    """
    hid = 1024
    word_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, hid, hid)
    ques_proj = FCNet([hid, hid])
    vis_proj = FCNet([dataset.v_dim, hid])
    answer_head = SimpleClassifier(hid, hid * 2, dataset.num_ans_candidates,
                                   0.5)
    generator = STDecoder(dataset.v_dim, hid, 300, hid,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    # Explanation encoder shares the decoder's embedding table.
    expl_emb = ExplainEmbedding(generator.embed, 300, num_hid, 1, False, 0.0,
                                emb_rnn)
    expl_proj = FCNet([expl_emb.num_hid, hid])
    return VQAE2(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                 answer_head, generator, expl_emb, expl_proj)
示例#19
0
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           size=64,
                           dropout_hid=0.0,
                           gamma_r=0.0,
                           adv_mode="wgan",
                           logger=None):
    """Build the baseline, forwarding reconstruction/adversarial settings.

    The extra arguments (reconstruction, size, dropout_hid, gamma_r,
    adv_mode, logger) are passed straight through to BaseModel.
    """
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                     answer_head, num_hid, dataset.v_dim, reconstruction,
                     size, dropout_hid, gamma_r, adv_mode, logger)
示例#20
0
    def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
        """Set up visual attention and the classifier MLP.

        Args:
            vocab_size: size of the output vocabulary (stored only).
            embed_hidden: word-embedding width (unused in this variant).
            mlp_hidden: hidden width used throughout the module.
        """
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size

        # Attention over visual features conditioned on a 768-d query
        # (presumably BERT-sized — TODO confirm against the forward pass).
        self.v_att = NewAttention(mlp_hidden, 768, mlp_hidden)
        # Earlier FCNet/SimpleClassifier head kept for reference:
        '''self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)'''
        # MLP over flattened 7x7 features. NOTE(review): there is no final
        # projection to vocab_size in this Sequential — presumably applied
        # elsewhere; confirm.
        self.classifier = nn.Sequential(
            nn.Linear(mlp_hidden * 7 * 7, mlp_hidden * 8),
            nn.BatchNorm1d(mlp_hidden * 8),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(mlp_hidden * 8, mlp_hidden * 8),
            nn.BatchNorm1d(mlp_hidden * 8),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
        )
示例#21
0
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           layer=4,
                           size=64,
                           variant='',
                           finetune=False,
                           use_residual=False,
                           use_feat_loss=False,
                           dropout_hid=False,
                           dropout_unet=False,
                           logger=None):
    """Build the baseline, threading reconstruction options to BaseModel.

    All arguments after num_hid are passed straight through to BaseModel;
    the module construction itself is the standard pipeline.
    """
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    vis_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    answer_head = SimpleClassifier(num_hid, num_hid * 2,
                                   dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, vis_att, ques_proj, vis_proj,
                     answer_head, num_hid, dataset.v_dim, reconstruction,
                     layer, size, variant, finetune, use_residual,
                     use_feat_loss, dropout_hid, dropout_unet, logger)
示例#22
0
def build_baseline0_newatt2(args, num_hid):
    """Build the paired question/history attention model with a netG decoder."""
    # All widths are doubled (bidirectional encoders); hoist the products.
    feat_dim = args.nhid * 2
    hid2 = num_hid * 2
    w_emb = WordEmbedding(args.vocab_size, args.ninp, 0.0)
    q_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    h_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    ques_dim = q_emb.num_hid * 2
    # Attention modules for question, history, and their cross pairings.
    v_att = NewAttention(feat_dim, ques_dim, hid2)
    h_att = NewAttention(feat_dim, ques_dim, hid2)
    qih_att = NewAttention(feat_dim, ques_dim, hid2)
    qhi_att = NewAttention(feat_dim, ques_dim, hid2)
    # Projection nets into the shared hid2 space.
    q_net = FCNet([ques_dim, hid2])
    v_net = FCNet([feat_dim, hid2])
    h_net = FCNet([feat_dim, hid2])
    qih_net = FCNet([feat_dim, hid2])
    qhi_net = FCNet([feat_dim, hid2])
    qhih_att = NewAttention(feat_dim, ques_dim, hid2)
    qihi_att = NewAttention(feat_dim, ques_dim, hid2)

    decoder = netG(args)
    return BaseModel2(w_emb, q_emb, h_emb, v_att, h_att, q_net, v_net,
                      h_net, qih_att, qhi_att, qih_net, qhi_net, decoder,
                      args, qhih_att, qihi_att)