Example No. 1
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att1 = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    v_att2 = Attention(dataset.v_dim, num_hid, num_hid)

    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return SANModel2(w_emb, q_emb, v_att1, v_att2, q_net, v_net, classifier)
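
Note: every example on this page composes the same small set of building blocks. For orientation, here is a minimal sketch of FCNet as it appears in the widely used bottom-up-attention-vqa reference implementation that most of these snippets derive from; treat it as an assumption and check the repository a given example came from (several examples below pass extra dropout/norm/act arguments, i.e. a richer variant of this class):

import torch.nn as nn
from torch.nn.utils import weight_norm

class FCNet(nn.Module):
    """Non-linear fully connected network: weight-normed Linear + ReLU per layer."""
    def __init__(self, dims):
        super(FCNet, self).__init__()
        layers = []
        for i in range(len(dims) - 1):
            layers.append(weight_norm(nn.Linear(dims[i], dims[i + 1]), dim=None))
            layers.append(nn.ReLU())
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        return self.main(x)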
Example No. 2
def build_VQE_newatt_2(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    generator = STDecoder(
        dataset.v_dim, num_hid, 300, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQAE2(w_emb, q_emb, v_att, q_net, v_net, None, generator, None,
                 None)
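
The builders above also assume the WordEmbedding / QuestionEmbedding interface: word vectors feed an RNN whose num_hid attribute sizes the downstream FCNets. A sketch of that interface, with signatures inferred from the calls in these examples and bodies approximating the common implementation (an assumption, not the verbatim source):

import torch.nn as nn

class WordEmbedding(nn.Module):
    # WordEmbedding(ntoken, emb_dim, dropout): token ids -> dropped-out word vectors
    def __init__(self, ntoken, emb_dim, dropout):
        super(WordEmbedding, self).__init__()
        self.emb = nn.Embedding(ntoken + 1, emb_dim, padding_idx=ntoken)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.emb(x))

class QuestionEmbedding(nn.Module):
    # QuestionEmbedding(in_dim, num_hid, nlayers, bidirect, dropout, rnn_type)
    def __init__(self, in_dim, num_hid, nlayers, bidirect, dropout, rnn_type='GRU'):
        super(QuestionEmbedding, self).__init__()
        rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
        self.rnn = rnn_cls(in_dim, num_hid, nlayers, bidirectional=bidirect,
                           dropout=dropout, batch_first=True)
        self.num_hid = num_hid  # read by the builders as q_emb.num_hid

    def forward(self, x):
        output, _ = self.rnn(x)  # [batch, seq, num_hid * ndirections]
        return output[:, -1]     # last step as the question representation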
Example No. 3
    def __init__(self, v_dim, q_dim, num_hid, com=False):
        super().__init__()  # nn.Module must be initialised before sub-modules are assigned
        self.q_net = FCNet([q_dim, num_hid])
        self.v_net = FCNet([v_dim, num_hid])
        self.com = com
        if com:
            layers = [
                nn.Dropout(0.2, inplace=True),
                weight_norm(nn.Linear(num_hid, num_hid), dim=None)  #,
                #nn.ReLU()
            ]
            self.f = nn.Sequential(*layers)
Example No. 4
    def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size

        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)
        self.mlp_hidden = mlp_hidden
Example No. 5
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_LL = SimpleClassifier(
        num_hid * 4, num_hid // 8, 1, 0.5)  # '//': layer sizes must be ints
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL,
                      classifier_All)
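
SimpleClassifier(in_dim, hid_dim, out_dim, dropout) appears in nearly every builder; the integer-division fix above matters precisely because its layer widths must be ints. A minimal sketch matching the four-argument calls on this page (again an assumption based on the common implementation):

import torch.nn as nn
from torch.nn.utils import weight_norm

class SimpleClassifier(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout):
        super(SimpleClassifier, self).__init__()
        self.main = nn.Sequential(
            weight_norm(nn.Linear(in_dim, hid_dim), dim=None),
            nn.ReLU(),
            nn.Dropout(dropout, inplace=True),
            weight_norm(nn.Linear(hid_dim, out_dim), dim=None),
        )

    def forward(self, x):
        return self.main(x)  # unnormalised logits over the answer candidates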
Example No. 6
def attention_mfh(dataset,
                  num_hid,
                  dropout,
                  norm,
                  activation,
                  drop_L,
                  drop_G,
                  drop_W,
                  drop_C,
                  mfb_out_dim,
                  bidirect_val=False):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=drop_W)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=bidirect_val,
                              dropout=drop_G,
                              rnn_type='GRU')
    v_att = Base_Att(v_dim=dataset.v_dim,
                     q_dim=q_emb.num_hid,
                     num_hid=num_hid,
                     dropout=dropout,
                     bidirect=bidirect_val,
                     norm=norm,
                     act=activation)
    if not bidirect_val:
        q_net = FCNet([num_hid, num_hid],
                      dropout=drop_L,
                      norm=norm,
                      act=activation)
        #v_net = FCNet([dataset.v_dim, num_hid], dropout= drop_L, norm= norm, act= activation)
    else:
        q_net = FCNet([2 * num_hid, num_hid],
                      dropout=drop_L,
                      norm=norm,
                      act=activation)

    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=drop_L,
                  norm=norm,
                  act=activation)
    mfh_net = mfh_baseline(QUEST_EMBED=num_hid,
                           VIS_EMBED=num_hid,
                           MFB_OUT_DIM=mfb_out_dim)
    classifier = SimpleClassifier(in_dim=2 * mfb_out_dim,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=drop_C,
                                  norm=norm,
                                  act=activation)
    return (VQA_Model_MFH(w_emb, q_emb, v_att, q_net, v_net, mfh_net,
                          classifier))
Example No. 7
def build_BAN(dataset, args, priotize_using_counter=None):  # None defers to args.use_counter
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # Optional module: counter for BAN
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter or priotize_using_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(
            BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter or priotize_using_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, maml_v_emb, None)
    elif args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, None, ae_v_emb)
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                     classifier, counter, args, None, None)
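
build_BAN reads a number of attributes off args. A hypothetical Namespace listing every field the function touches (the values are illustrative, not taken from the source):

from argparse import Namespace

args = Namespace(
    op='c',                 # word-embedding option; 'c' doubles the RNN input dim to 600
    num_hid=1024,           # shared hidden size
    rnn='LSTM',             # RNN type for QuestionEmbedding
    gamma=2,                # number of BAN glimpses / residual blocks
    maml=False,             # load the pre-trained MAML CNN
    autoencoder=False,      # load the pre-trained auto-encoder
    RAD_dir='data_RAD',     # directory holding the pre-trained weights
    maml_model_path='pretrained_maml.weights',
    ae_model_path='pretrained_ae.pth',
    eps_cnn=1e-5,           # SimpleCNN normalisation hyper-parameters
    momentum_cnn=0.05,
    tfidf=True,             # tfidf-weighted word-embedding loading
    use_counter=False,      # enable the Counter module
)
model = build_BAN(dataset, args)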
Example No. 8
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_1 = MLP(input_dim=1024,
              dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, c_1, c_2)
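
For context, builders like this one are typically wired up in a training script along the following lines; Dictionary, VQAFeatureDataset, and the file paths are assumptions modelled on the reference bottom-up-attention-vqa codebase:

from dataset import Dictionary, VQAFeatureDataset  # assumed module layout

dictionary = Dictionary.load_from_file('data/dictionary.pkl')
train_dset = VQAFeatureDataset('train', dictionary)

model = build_baseline0_newatt(train_dset, num_hid=1024)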
Example No. 9
def build_baseline1(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, num_hid, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    lstm = nn.LSTM(num_hid, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm,
                     w_emb2)
Example No. 10
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb1 = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    q_emb2 = QuestionEmbedding(300)
    v_att = StackAttention(num_hid, num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([num_hid, num_hid])
    linear = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return AttentionModel(w_emb, q_emb1, v_att, q_net, v_net, classifier,
                          linear)
Example No. 11
    def __init__(
            self,
            encoder,
            gpu_mode,
            conv_hidden=24,
            embed_hidden=300,
            lstm_hidden=300,
            mlp_hidden=512
    ):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        #self.vocab_size = self.encoder.get_num_labels()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()

        #self.agent_label_lookup = nn.Embedding(self.vocab_size, embed_hidden)

        self.conv_agent = vgg16_modified()
        self.conv_verb = vgg16_modified_feat()

        self.q_word_count = len(self.encoder.question_words)
        self.w_emb = nn.Embedding(self.q_word_count, embed_hidden)

        self.word_att = BigAttention(mlp_hidden, embed_hidden, mlp_hidden)

        self.vqa_model = TopDown()

        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])

        self.classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.n_verbs, 0.5)
Example No. 12
def build_vqae_newatt(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    generator = STDecoder(
        dataset.v_dim, num_hid, 300, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQAE(w_emb, q_emb, v_att, q_net, v_net, classifier, generator)
Example No. 13
def build_baseline4(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = AttQuestionEmbedding(300, 1024, 1, 0, 512, 1, 2, 0)
    v_att = doubel_project_attention(dataset.v_dim, q_emb.num_hid,
                                     q_emb.num_hid, 0.2)
    q_net = FCNet([num_hid, num_hid], 0.0, "weight", "ReLU")
    v_net = FCNet([dataset.v_dim, num_hid], 0.0, "weight", "ReLU")
    rn = RN4(dataset.v_dim, num_hid, "weight", "ReLU", 0.0)
    sfu = SFU(dataset.v_dim, dataset.v_dim)
    classifier = SimpleClassifier(num_hid, 5000, dataset.num_ans_candidates,
                                  0.5)
    return BaseModel1(w_emb, q_emb, v_att, q_net, v_net, rn, sfu, classifier)
Example No. 14
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    c_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_net = FCNet([c_emb.num_hid, num_hid])
    classifier = SimpleClassifier(2 * num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, c_emb, v_att, q_net, v_net, c_net,
                     classifier)
Example No. 15
    def __init__(self, v_dim, q_dim, num_hid, norm, act, dropout=0.0):
        super(Att_PD, self).__init__()
        norm_layer = get_norm(norm)
        self.nonlinear = FCNet([v_dim + q_dim, num_hid, num_hid],
                               dropout=dropout,
                               norm=norm,
                               act=act)
        self.nonlinear_gate = FCNet([v_dim + q_dim, num_hid, num_hid],
                                    dropout=dropout,
                                    norm=norm,
                                    act='Sigmoid')
        self.linear = norm_layer(nn.Linear(num_hid, 1), dim=None)
Example No. 16
    def __init__(self, v_dim, q_dim, num_hid, norm, act, dropout=0.0):
        super(Att_2, self).__init__()
        norm_layer = get_norm(norm)
        self.v_proj = FCNet([v_dim, num_hid],
                            dropout=dropout,
                            norm=norm,
                            act=act)
        self.q_proj = FCNet([q_dim, num_hid],
                            dropout=dropout,
                            norm=norm,
                            act=act)
        self.linear = norm_layer(nn.Linear(q_dim, 1), dim=None)
Example No. 17
def attention_baseline(dataset,
                       num_hid,
                       dropout,
                       norm,
                       activation,
                       drop_L,
                       drop_G,
                       drop_W,
                       drop_C,
                       bidirect_val=False):
    print('Here in the attention baseline')
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=drop_W)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=bidirect_val,
                              dropout=drop_G,
                              rnn_type='GRU')
    #bert_emb=BertEmbedding(in_dim=7168,num_hid=num_hid)

    v_att = Base_Att(v_dim=dataset.v_dim,
                     q_dim=q_emb.num_hid,
                     num_hid=num_hid,
                     dropout=dropout,
                     bidirect=bidirect_val,
                     norm=norm,
                     act=activation)
    if not bidirect_val:
        q_net = FCNet([num_hid, num_hid],
                      dropout=drop_L,
                      norm=norm,
                      act=activation)
        #v_net = FCNet([dataset.v_dim, num_hid], dropout= drop_L, norm= norm, act= activation)
    else:
        q_net = FCNet([2 * num_hid, num_hid],
                      dropout=drop_L,
                      norm=norm,
                      act=activation)

    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=drop_L,
                  norm=norm,
                  act=activation)
    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=drop_C,
                                  norm=norm,
                                  act=activation)
    return (VQA_Model(w_emb, q_emb, v_att, q_net, v_net, classifier))
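
The bidirect_val branch above exists because a bidirectional RNN concatenates both directions, doubling the feature size q_net must accept. A quick standalone check:

import torch
import torch.nn as nn

gru = nn.GRU(300, 128, num_layers=1, bidirectional=True, batch_first=True)
out, _ = gru(torch.randn(2, 14, 300))
print(out.shape)  # torch.Size([2, 14, 256]): 2 * num_hid, hence FCNet([2 * num_hid, num_hid])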
Example No. 18
    def __init__(self, embed_hidden=300, mlp_hidden=512):
        super(TopDown, self).__init__()

        self.q_emb = nn.LSTM(embed_hidden + mlp_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.q_prep = FCNet([mlp_hidden, mlp_hidden])
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
Example No. 19
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net, classifier_V,
                           classifier_Q, classifier_All)
Example No. 20
def build_baseline(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att1 = Att_3(dataset.v_dim, num_hid, num_hid, "weight", "ReLU")
    v_att2 = Att_3(dataset.v_dim, num_hid, num_hid, "weight", "ReLU")
    q_net = FCNet([num_hid, num_hid], 0.0, "weight", "ReLU")
    v_net = FCNet([dataset.v_dim, num_hid], 0.0, "weight", "ReLU")
    rn = RN(dataset.v_dim, "weight", "ReLU", 0.0)
    sfu = SFU(dataset.v_dim, dataset.v_dim)
    classifier = SimpleClassifier(num_hid, 5000, dataset.num_ans_candidates,
                                  0.5)
    return BaseModel(w_emb, q_emb, v_att1, v_att2, q_net, v_net, rn, sfu,
                     classifier)
Example No. 21
    def __init__(self,
                 vocab_size,
                 embed_hidden=300,
                 mlp_hidden=512):
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size

        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
Example No. 22
    def __init__(self,
                 image_feat_dim,
                 txt_rnn_embeding_dim,
                 hidden_size,
                 dropout=0.2):
        super(project_attention, self).__init__()
        self.image_feat_dim = image_feat_dim
        self.txt_embeding_dim = txt_rnn_embeding_dim
        self.Fa_image = FCNet([image_feat_dim, hidden_size], 0.0, "weight",
                              "LeakyReLU")
        self.Fa_txt = FCNet([txt_rnn_embeding_dim, hidden_size], 0.0, "weight",
                            "LeakyReLU")
        self.dropout = nn.Dropout(dropout)
        self.lc = nn.Linear(hidden_size, 1)
Example No. 23
def build_caq_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid // 2])
    v_net = FCNet([dataset.v_dim, num_hid // 2])
    updated_query_composer = FCNet([num_hid + num_hid // 2, num_hid])
    neighbour_attention = MultiHeadedAttention(4, num_hid // 2, dropout=0.1)
    Dropout_C = nn.Dropout(0.1)

    classifier = SimpleClassifier(num_hid // 2, num_hid * 2,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, updated_query_composer,
                    neighbour_attention, Dropout_C, classifier, dataset)
Example No. 24
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
Example No. 25
    def __init__(self, v_dim, q_dim, num_hid, output_channel=36, kernel_size=1,
                 stride=1, instance_norm=0, padding_type='same', l2_norm=0,
                 concat=1, leaky_relu=None, last_no_relu=None,
                 num_conv_layer=1, conv_norm=0, softmax=0, dropout=0.2):
        super(SigSoftAttention, self).__init__()

        self.output_channel = output_channel
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding_type = padding_type
        self.l2_norm = l2_norm
        self.concat = concat
        self.softmax = softmax
        self.num_conv_layer = num_conv_layer
        if self.num_conv_layer == 2:
            conv_hid_dim = 256

        self.v_proj = FCNet([v_dim, num_hid], leaky_relu, last_no_relu)
        self.q_proj = FCNet([q_dim, num_hid], leaky_relu, last_no_relu)
        self.dropout = nn.Dropout(dropout)

        assert stride == 1
        if padding_type == 'same':
            padding_num = (self.kernel_size - 1) // 2  # integer division: Conv2d padding must be an int
        elif padding_type == 'valid':
            padding_num = 0
            self.zero_padding = nn.ConstantPad2d((self.kernel_size - 1) // 2, -1)  # there is a (x+1)/2 operation later
        else:
            raise ValueError('padding_type must be "same" or "valid"')

        if self.concat:
            if self.num_conv_layer == 2:
                self.conv2 = conv_weight_norm(nn.Conv2d(2*num_hid, conv_hid_dim, self.kernel_size, self.stride, padding=padding_num, bias=True), conv_norm=conv_norm)
                self.conv2_relu = nn.LeakyReLU(negative_slope=0.3)
                self.conv1 = conv_weight_norm(nn.Conv2d(conv_hid_dim, self.output_channel, self.kernel_size, self.stride, padding=padding_num, bias=True), conv_norm=conv_norm)
            else:
                self.conv1 = conv_weight_norm(nn.Conv2d(2*num_hid, self.output_channel, self.kernel_size, self.stride, padding=padding_num, bias=True), conv_norm=conv_norm)
        else:
            if self.num_conv_layer == 2:
                self.conv2 = conv_weight_norm(nn.Conv2d(num_hid, conv_hid_dim, self.kernel_size, self.stride, padding=padding_num, bias=True), conv_norm=conv_norm)
                self.conv2_relu = nn.LeakyReLU(negative_slope=0.3)
                self.conv1 = conv_weight_norm(nn.Conv2d(conv_hid_dim, self.output_channel, self.kernel_size, self.stride, padding=padding_num, bias=True), conv_norm=conv_norm)
            else:
                self.conv1 = conv_weight_norm(nn.Conv2d(num_hid, self.output_channel, self.kernel_size, self.stride, padding=padding_num, bias=True), conv_norm=conv_norm)

        self.sigmoid = nn.Sigmoid()
        self.instance_norm = instance_norm
        if self.instance_norm:
            if self.num_conv_layer == 2:
                self.conv2_in = nn.InstanceNorm2d(conv_hid_dim)
            self.conv1_in = nn.InstanceNorm2d(self.output_channel) 
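
With stride asserted to be 1, the 'same' branch keeps the spatial size by padding (kernel_size - 1) // 2 (exact for odd kernel sizes), which is why the integer division above matters: nn.Conv2d expects integer padding. A quick check:

import torch
import torch.nn as nn

k = 3
conv = nn.Conv2d(8, 8, k, stride=1, padding=(k - 1) // 2)
x = torch.randn(1, 8, 14, 14)
print(conv(x).shape)  # torch.Size([1, 8, 14, 14]): spatial dims preserved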
Example No. 26
    def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.n_roles = self.encoder.get_num_roles()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.max_role_count = self.encoder.get_max_role_count()
        self.n_role_q_vocab = len(self.encoder.question_words)

        self.conv = vgg16_modified()
        self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
        self.w_emb = nn.Embedding(self.n_role_q_vocab + 1,
                                  embed_hidden,
                                  padding_idx=self.n_role_q_vocab)
        self.q_emb = nn.LSTM(embed_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.q_prep = FCNet([mlp_hidden, mlp_hidden])
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        #self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)

        self.conv_hidden = self.conv.base_size()
        self.mlp_hidden = mlp_hidden
        self.embed_hidden = embed_hidden
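
The two torchvision pipelines in Example No. 26 follow the standard pattern: random augmentation at train time, deterministic Resize + CenterCrop at eval time. A standalone illustration of the eval path (the synthetic image is a stand-in):

import torchvision as tv
from PIL import Image

normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])
dev_transform = tv.transforms.Compose([
    tv.transforms.Resize(224),
    tv.transforms.CenterCrop(224),
    tv.transforms.ToTensor(),
    normalize,
])

img = Image.new('RGB', (256, 320))  # stand-in for a real photo
print(dev_transform(img).shape)     # torch.Size([3, 224, 224])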
Example No. 27
def build_stackatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = NewAttention(dataset.v_dim, 2048 + q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    query_net = FCNet([dataset.v_dim, num_hid])

    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModelStackAtt(w_emb, q_emb, v_att, q_net, v_net, query_net,
                              classifier, args)
    return model
Example No. 28
    def __init__(self,
                 c_dim,
                 num_hid,
                 q_dim,
                 nlayers,
                 bidirect,
                 dropout,
                 rnn_type='LSTM',
                 v_dim=2048):
        """Module for question embedding
        """
        super(CaptionQuestionImageRNN, self).__init__()
        assert rnn_type == 'LSTM' or rnn_type == 'GRU'
        rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
        norm_layer = get_norm('weight')
        self.rnn_att = rnn_cls(c_dim,
                               num_hid,
                               nlayers,
                               bidirectional=bidirect,
                               dropout=dropout,
                               batch_first=True)

        self.rnn_c = rnn_cls(c_dim,
                             num_hid,
                             nlayers,
                             bidirectional=bidirect,
                             dropout=dropout,
                             batch_first=True)

        self.q_emb_for_c = FCNet([q_dim, num_hid],
                                 dropout=0.2,
                                 norm='weight',
                                 act='LeakyReLU')
        self.att_logits = norm_layer(nn.Linear(num_hid, 1), dim=None)

        self.v_emb_for_c = FCNet([v_dim, num_hid],
                                 dropout=0.2,
                                 norm='weight',
                                 act='LeakyReLU')
        self.v_att_logits = norm_layer(nn.Linear(num_hid, 1), dim=None)

        self.Sig = nn.Sigmoid()
        self.c_dim = c_dim
        self.q_dim = q_dim
        self.num_hid = num_hid
        self.nlayers = nlayers
        self.ndirections = int(bidirect) + 1
        self.rnn_type = rnn_type
        self.v_dim = v_dim
Example No. 29
def build_lstm_vqa(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    generator = SATDecoder(
        dataset.v_dim, num_hid, 300, att_dim, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    #att_emb = nn.GRU(dataset.v_dim, num_hid, 1, False, batch_first=True)
    att_emb = nn.GRUCell(dataset.v_dim, num_hid)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return LSTM_VQA(w_emb, q_emb, v_att, q_net, v_net, generator, att_emb,
                    classifier)
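
Example No. 29 swaps the commented-out nn.GRU for nn.GRUCell, which consumes one time step per call instead of a whole padded sequence, a natural fit for stepwise attention. Illustratively:

import torch
import torch.nn as nn

cell = nn.GRUCell(2048, 512)  # (input_size, hidden_size), matching att_emb above
h = torch.zeros(4, 512)       # batch of 4, zero-initialised hidden state
for _ in range(3):            # e.g. one attended feature vector per step
    h = cell(torch.randn(4, 2048), h)
print(h.shape)                # torch.Size([4, 512])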
Example No. 30
def build_baseline0_gcn(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding_all(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)

    q_net0 = FCNet([q_emb.num_hid, num_hid])
    v_net0 = FCNet([dataset.v_dim, num_hid])

    gcn = FCNet([num_hid, num_hid])

    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return GraphModel(w_emb, q_emb, v_att, q_net0, v_net0, gcn, q_net, v_net,
                      classifier)