# WordEmbedding, QuestionEmbedding, Attention, SimpleClassifier, BaseModel
# and config are project-local; see the surrounding modules.
def build_baseline(embeddings, num_ans_candidates):
    vision_features = config.output_features
    visual_glimpses = config.visual_glimpses
    question_features = hidden_features = config.hid_dim

    # Pretrained word vectors, then a unidirectional GRU question encoder.
    w_emb = WordEmbedding(embeddings, dropout=0.0)
    q_emb = QuestionEmbedding(w_dim=300, hid_dim=question_features,
                              nlayers=1, bidirect=False, dropout=0.0)
    # Question-guided visual attention over the image-region features.
    v_att = Attention(
        v_dim=vision_features,
        q_dim=question_features * q_emb.ndirections,
        hid_dim=hidden_features,
        glimpses=visual_glimpses,
    )
    classifier = SimpleClassifier(
        in_dim=(question_features * q_emb.ndirections, vision_features),
        hid_dim=(hidden_features, hidden_features * 2),
        out_dim=num_ans_candidates,
        dropout=0.5)
    return BaseModel(w_emb, q_emb, v_att, classifier)
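# The builders in this file read their sizes from a module-level `config`.
# For quick experimentation a stand-in namespace like the one below works;
# the concrete values are illustrative assumptions, not project defaults.
from types import SimpleNamespace

config = SimpleNamespace(
    output_features=2048,   # assumed dimension of each image-region feature
    visual_glimpses=2,      # assumed number of attention glimpses
    hid_dim=1024,           # assumed hidden size for question/fusion layers
    use_rubi=False,         # assumed: toggles the RuBi question-only branch
    fusion_type='mul',      # assumed: 'mul' or concatenation fusion
)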
import torch

def baseline(args, dataset, pretrained=False):
    # initialise model
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, args.num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, args.num_hid)
    q_net = FCNet([args.num_hid, args.num_hid])
    v_net = FCNet([dataset.v_dim, args.num_hid])
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, 0.5)
    model = BaseModel(args, w_emb, q_emb, v_att, q_net, v_net, classifier)

    # map weights to CPU when no GPU is available
    map_location = None
    if not model.cuda_available:
        map_location = torch.device('cpu')

    # download and load pretrained weights
    if pretrained:
        key = 'baseline-vqa'
        url = pretrained_urls[key]
        model.load_state_dict(
            download_model(key, url, map_location=map_location)['model'],
            strict=False)
    else:
        key = 'untrained'

    # set model name
    model.name = key
    return model
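# Illustrative use of the baseline builder above. args only needs num_hid
# here; the value is an assumption, and the dataset object is whatever VQA
# dataset constructor the project provides.
import argparse

def demo_baseline(dataset):
    args = argparse.Namespace(num_hid=1024)          # assumed hidden size
    model = baseline(args, dataset, pretrained=True)
    print(model.name)                                # -> 'baseline-vqa'
    return model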
def build_regat(dataset, args):
    print("Building ReGAT model with %s relation and %s fusion method" %
          (args.relation_type, args.fusion))

    # Word embedding model, e.g.:
    # WordEmbedding(
    #   (emb): Embedding(19902, 300, padding_idx=19901)
    #   (emb_): Embedding(19902, 300, padding_idx=19901)
    #   (dropout): Dropout(p=0.0, inplace=False)
    # )
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)

    # Question embedding; a 'c' op concatenates word vectors, doubling the
    # GRU input size, e.g.:
    # QuestionEmbedding(
    #   (rnn): GRU(600, 1024, batch_first=True)
    # )
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600,
                              args.num_hid, 1, False, .0)

    # Question self-attention, e.g.:
    # QuestionSelfAttention(
    #   (drop): Dropout(p=0.2, inplace=False)
    #   (W1_self_att_q): FCNet(
    #     (main): Sequential(
    #       (0): Dropout(p=0.2, inplace=False)
    #       (1): Linear(in_features=1024, out_features=1024, bias=True)
    #     )
    #   )
    #   (W2_self_att_q): FCNet(
    #     (main): Sequential(
    #       (0): Linear(in_features=1024, out_features=1, bias=True)
    #     )
    #   )
    # )
    q_att = QuestionSelfAttention(args.num_hid, .2)

    if args.relation_type == "semantic":
        # semantic relations: explicit encoder with semantic labels
        v_relation = ExplicitRelationEncoder(
            dataset.v_dim, args.num_hid, args.relation_dim,
            args.dir_num, args.sem_label_num,
            num_heads=args.num_heads, num_steps=args.num_steps,
            nongt_dim=args.nongt_dim,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    elif args.relation_type == "spatial":
        # spatial relations: explicit encoder with spatial labels
        v_relation = ExplicitRelationEncoder(
            dataset.v_dim, args.num_hid, args.relation_dim,
            args.dir_num, args.spa_label_num,
            num_heads=args.num_heads, num_steps=args.num_steps,
            nongt_dim=args.nongt_dim,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    else:
        # otherwise: implicit relations
        v_relation = ImplicitRelationEncoder(
            dataset.v_dim, args.num_hid, args.relation_dim,
            args.dir_num, args.imp_pos_emb_dim, args.nongt_dim,
            num_heads=args.num_heads, num_steps=args.num_steps,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)

    # classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    # fusion method
    gamma = 0
    if args.fusion == "ban":
        joint_embedding = BAN(args.relation_dim, args.num_hid, args.ban_gamma)
        gamma = args.ban_gamma
    elif args.fusion == "butd":
        joint_embedding = BUTD(args.relation_dim, args.num_hid, args.num_hid)
    else:
        # MuTAN fuses and classifies in one module, so the separate
        # classifier is dropped.
        joint_embedding = MuTAN(args.relation_dim, args.num_hid,
                                dataset.num_ans_candidates, args.mutan_gamma)
        gamma = args.mutan_gamma
        classifier = None

    return ReGAT(dataset, w_emb, q_emb, q_att, v_relation, joint_embedding,
                 classifier, gamma, args.fusion, args.relation_type)
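# Illustrative call into build_regat. The field values below are
# assumptions modelled on the attributes the builder reads, not verified
# project defaults.
import argparse

regat_args = argparse.Namespace(
    relation_type='implicit',   # 'semantic' | 'spatial' | else -> implicit
    fusion='ban',               # 'ban' | 'butd' | else -> mutan
    op='c',                     # concatenate word vectors (600-d GRU input)
    num_hid=1024, relation_dim=1024, dir_num=2, imp_pos_emb_dim=64,
    nongt_dim=20, num_heads=16, num_steps=1,
    residual_connection=True, label_bias=True, ban_gamma=4,
)
# regat_model = build_regat(dataset, regat_args)  # dataset: a VQA dataset object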
def build_baseline_with_onestep(embeddings, num_ans_candidates,
                                debias_mode='LearnedMixin'):
    assert debias_mode in [
        'BiasProduct', 'ReweightByInvBias', 'LearnedMixin', 'Plain'
    ]
    vision_features = config.output_features
    visual_glimpses = config.visual_glimpses
    hidden_features = config.hid_dim
    question_features = config.hid_dim

    w_emb = WordEmbedding(embeddings, dropout=0.0)
    q_emb = QuestionEmbedding(w_dim=300, hid_dim=question_features,
                              nlayers=1, bidirect=False, dropout=0.0)
    v_att = attention.Attention(
        v_dim=vision_features,
        q_dim=question_features,
        hid_dim=hidden_features,
        glimpses=visual_glimpses,
    )
    classifier = SimpleClassifier(
        in_dim=(question_features, visual_glimpses * vision_features),
        hid_dim=(hidden_features, hidden_features * 2),
        out_dim=num_ans_candidates,
        dropout=0.5)

    # mask_v_att = attention.Attention(
    #     v_dim=vision_features,
    #     q_dim=question_features,
    #     hid_dim=hidden_features,
    #     glimpses=visual_glimpses,
    # )
    #
    # mask_classifier = SimpleClassifier(
    #     in_dim=(question_features, vision_features),
    #     hid_dim=(hidden_features, hidden_features * 2),
    #     out_dim=num_ans_candidates,
    #     dropout=0.5
    # )

    # Instantiate the debiasing loss named by debias_mode; the assert above
    # restricts eval() to the four known loss classes.
    debias_loss_fn = eval(debias_mode)()
    return BaseModel_with_Onestep(w_emb, q_emb, v_att, classifier,
                                  debias_loss_fn)
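# eval(debias_mode)() resolves the mode string to a loss class that the
# assert above has whitelisted; an explicit dispatch table is easier to
# audit. A minimal sketch, assuming the four loss classes are in scope:
DEBIAS_LOSSES = {
    'BiasProduct': BiasProduct,
    'ReweightByInvBias': ReweightByInvBias,
    'LearnedMixin': LearnedMixin,
    'Plain': Plain,
}

def make_debias_loss(debias_mode='LearnedMixin', *init_args):
    # Hypothetical helper: unknown modes raise KeyError instead of being
    # evaluated as arbitrary code.
    return DEBIAS_LOSSES[debias_mode](*init_args)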
from torch import nn

def build_baseline_with_twostep(embeddings, num_ans_candidates,
                                debias_mode='LearnedMixin'):
    assert debias_mode in [
        'BiasProduct', 'ReweightByInvBias', 'LearnedMixin', 'Plain'
    ]
    vision_features = config.output_features
    visual_glimpses = config.visual_glimpses
    hidden_features = config.hid_dim
    question_features = config.hid_dim

    w_emb = WordEmbedding(embeddings, dropout=0.0)
    q_emb = QuestionEmbedding(w_dim=300, hid_dim=question_features,
                              nlayers=1, bidirect=False, dropout=0.0)
    v_att = attention.Attention(
        v_dim=vision_features,
        q_dim=question_features,
        hid_dim=hidden_features,
        glimpses=visual_glimpses,
    )
    classifier = SimpleClassifier(
        in_dim=(question_features, visual_glimpses * vision_features),
        hid_dim=(hidden_features, hidden_features * 2),
        out_dim=num_ans_candidates,
        dropout=0.5)

    if config.use_rubi:
        # Question-only branch (RuBi): c1 maps the question representation
        # to answer logits, c2 rescales them for the question-only loss.
        c1 = MLP(
            input_dim=question_features,
            dimensions=[1024, 1024, num_ans_candidates],
        )
        c2 = nn.Linear(num_ans_candidates, num_ans_candidates)
    else:
        c1, c2 = None, None

    # Instantiate the debiasing loss named by debias_mode; its input size
    # depends on whether fusion multiplies or concatenates the features.
    debias_loss_fn = eval(debias_mode)(
        hidden_features if config.fusion_type == 'mul' else hidden_features * 2)
    return BaseModel_with_Twostep(w_emb, q_emb, v_att, classifier,
                                  debias_loss_fn, c1, c2)
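# Sketch of how RuBi-style heads like c1/c2 above are typically wired in a
# forward pass (after Cadene et al., 2019). `fused_logits` and `q_repr` are
# hypothetical tensors from the fused branch and the question encoder; the
# exact wiring inside BaseModel_with_Twostep may differ.
import torch

def rubi_heads(fused_logits, q_repr, c1, c2):
    q_logits = c1(q_repr)                            # question-only prediction
    masked = fused_logits * torch.sigmoid(q_logits)  # mask the fused logits
    q_out = c2(q_logits)                             # head for the QO loss
    return masked, q_out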