def __init__(self, opt):
    super(AttModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.detect_size = opt.detect_size  # number of object classes
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.seg_info_size = 50
    self.fc_feat_size = opt.fc_feat_size + self.seg_info_size
    self.att_feat_size = opt.att_feat_size
    self.att_hid_size = opt.att_hid_size
    self.seq_per_img = opt.seq_per_img
    self.itod = opt.itod
    self.att_input_mode = opt.att_input_mode
    self.transfer_mode = opt.transfer_mode
    self.test_mode = opt.test_mode
    self.enable_BUTD = opt.enable_BUTD
    self.w_grd = opt.w_grd
    self.w_cls = opt.w_cls
    self.num_sampled_frm = opt.num_sampled_frm
    self.num_prop_per_frm = opt.num_prop_per_frm
    self.att_model = opt.att_model
    self.unk_idx = int(opt.wtoi['UNK'])

    if opt.region_attn_mode == 'add':
        self.alpha_net = nn.Linear(self.att_hid_size, 1)
    elif opt.region_attn_mode == 'cat':
        self.alpha_net = nn.Linear(self.att_hid_size * 2, 1)

    self.stride = 32  # downsizing from input image to feature map
    self.t_attn_size = opt.t_attn_size
    self.tiny_value = 1e-8

    if self.enable_BUTD:
        assert self.att_input_mode == 'region'
        self.pool_feat_size = self.att_feat_size
    else:
        self.pool_feat_size = self.att_feat_size + 300 + self.detect_size + 1

    self.min_value = -1e8
    opt.beta = 1
    self.beta = opt.beta

    self.loc_fc = nn.Sequential(nn.Linear(5, 300),
                                nn.ReLU(),
                                nn.Dropout(inplace=True))

    self.embed = nn.Sequential(
        nn.Embedding(self.vocab_size, self.input_encoding_size),  # det is 1-indexed
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm, inplace=True))

    if self.transfer_mode in ('none', 'cls'):
        self.vis_encoding_size = 2048
    elif self.transfer_mode == 'both':
        self.vis_encoding_size = 2348
    elif self.transfer_mode == 'glove':
        self.vis_encoding_size = 300
    else:
        raise NotImplementedError

    self.vis_embed = nn.Sequential(
        nn.Embedding(self.detect_size + 1, self.vis_encoding_size),  # det is 1-indexed
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm, inplace=True))

    self.fc_embed = nn.Sequential(
        nn.Linear(self.fc_feat_size, self.rnn_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm, inplace=True))

    self.seg_info_embed = nn.Sequential(
        nn.Linear(4, self.seg_info_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm, inplace=True))

    self.att_embed = nn.ModuleList([
        nn.Sequential(
            nn.Linear(2048, self.rnn_size // 2),  # for rgb feature
            nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True)),
        nn.Sequential(
            nn.Linear(1024, self.rnn_size // 2),  # for motion feature
            nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))
    ])

    self.att_embed_aux = nn.Sequential(nn.BatchNorm1d(self.rnn_size),
                                       nn.ReLU())

    self.pool_embed = nn.Sequential(
        nn.Linear(self.pool_feat_size, self.rnn_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm, inplace=True))

    self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
    self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

    self.logit = nn.Linear(self.rnn_size, self.vocab_size)

    if opt.obj_interact:
        n_layers = 2
        n_heads = 6
        attn_drop = 0.2
        self.obj_interact = Transformer(self.rnn_size, 0, 0,
                                        d_hidden=int(self.rnn_size / 2),
                                        n_layers=n_layers,
                                        n_heads=n_heads,
                                        drop_ratio=attn_drop,
                                        pe=False)

    if self.att_model == 'transformer':
        n_layers = 2
        n_heads = 6
        attn_drop = 0.2
        print('initialize language decoder transformer...')
        self.cap_model = TransformerDecoder(self.rnn_size, 0, self.vocab_size,
                                            d_hidden=self.rnn_size // 2,
                                            n_layers=n_layers,
                                            n_heads=n_heads,
                                            drop_ratio=attn_drop)

    if opt.t_attn_mode == 'bilstm':  # frame-wise feature encoding
        n_layers = 2
        attn_drop = 0.2
        self.context_enc = nn.LSTM(self.rnn_size, self.rnn_size // 2, n_layers,
                                   dropout=attn_drop, bidirectional=True,
                                   batch_first=True)
    elif opt.t_attn_mode == 'bigru':
        n_layers = 2
        attn_drop = 0.2
        self.context_enc = nn.GRU(self.rnn_size, self.rnn_size // 2, n_layers,
                                  dropout=attn_drop, bidirectional=True,
                                  batch_first=True)
    else:
        raise NotImplementedError

    self.ctx2pool_grd = nn.Sequential(
        nn.Linear(self.att_feat_size, self.vis_encoding_size),  # fc7 layer
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm, inplace=True))

    self.critLM = utils.LMCriterion(opt)

    # initialize the glove weight for the labels.
    # self.det_fc[0].weight.data.copy_(opt.glove_vg_cls)
    # for p in self.det_fc[0].parameters(): p.requires_grad=False
    # self.embed[0].weight.data.copy_(torch.cat((opt.glove_w, opt.glove_clss)))
    # for p in self.embed[0].parameters(): p.requires_grad=False

    # weight transfer for the fc7 layer (pickle files are binary, so open with 'rb')
    with open('data/detectron_weights/fc7_w.pkl', 'rb') as f:
        fc7_w = torch.from_numpy(pickle.load(f))
    with open('data/detectron_weights/fc7_b.pkl', 'rb') as f:
        fc7_b = torch.from_numpy(pickle.load(f))
    self.ctx2pool_grd[0].weight[:self.att_feat_size].data.copy_(fc7_w)
    self.ctx2pool_grd[0].bias[:self.att_feat_size].data.copy_(fc7_b)

    if self.transfer_mode in ('cls', 'both'):
        # find the nearest-neighbour class for transfer
        with open('data/detectron_weights/cls_score_w.pkl', 'rb') as f:
            cls_score_w = torch.from_numpy(pickle.load(f))  # 1601 x 2048
        with open('data/detectron_weights/cls_score_b.pkl', 'rb') as f:
            cls_score_b = torch.from_numpy(pickle.load(f))  # 1601

        assert len(opt.itod) + 1 == opt.glove_clss.size(0)  # index 0 is background
        assert len(opt.vg_cls) == opt.glove_vg_cls.size(0)  # index 0 is background

        # cosine similarity between detector (VG) classes and target classes
        sim_matrix = torch.matmul(
            opt.glove_vg_cls / torch.norm(opt.glove_vg_cls, dim=1).unsqueeze(1),
            (opt.glove_clss / torch.norm(opt.glove_clss, dim=1).unsqueeze(1)).transpose(1, 0))

        max_sim, matched_cls = torch.max(sim_matrix, dim=0)
        self.max_sim = max_sim
        self.matched_cls = matched_cls

        vis_classifiers = opt.glove_clss.new(self.detect_size + 1, cls_score_w.size(1)).fill_(0)
        self.vis_classifiers_bias = nn.Parameter(
            opt.glove_clss.new(self.detect_size + 1).fill_(0))
        vis_classifiers[0] = cls_score_w[0]  # background
        self.vis_classifiers_bias[0].data.copy_(cls_score_b[0])
        for i in range(1, self.detect_size + 1):
            vis_classifiers[i] = cls_score_w[matched_cls[i]]
            self.vis_classifiers_bias[i].data.copy_(cls_score_b[matched_cls[i]])
            if max_sim[i].item() < 0.9:
                print('index: {}, similarity: {:.2}, {}, {}'.format(
                    i, max_sim[i].item(), opt.itod[i], opt.vg_cls[matched_cls[i]]))

        if self.transfer_mode == 'cls':
            self.vis_embed[0].weight.data.copy_(vis_classifiers)
        else:
            self.vis_embed[0].weight.data.copy_(
                torch.cat((vis_classifiers, opt.glove_clss), dim=1))
    elif self.transfer_mode == 'glove':
        self.vis_embed[0].weight.data.copy_(opt.glove_clss)
    elif self.transfer_mode == 'none':
        print('No knowledge transfer...')
    else:
        raise NotImplementedError

    # for p in self.ctx2pool_grd.parameters(): p.requires_grad=False
    # for p in self.vis_embed[0].parameters(): p.requires_grad=False

    if opt.enable_visdom:
        import visdom
        self.vis = visdom.Visdom(server=opt.visdom_server, env='vis-' + opt.id)
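# For reference: the knowledge-transfer branch above matches each target object class to
# its most similar Visual Genome detector class by cosine similarity between GloVe vectors,
# then reuses that detector's classifier weights. The snippet below is a minimal,
# self-contained sketch of just that matching step; the tensor sizes and random tensors
# are stand-ins for the real `opt.glove_vg_cls`, `opt.glove_clss`, and `cls_score_w`.
import torch

glove_vg_cls = torch.randn(1601, 300)   # GloVe vectors of detector (VG) classes
glove_clss = torch.randn(431, 300)      # GloVe vectors of target classes (index 0 = background)
cls_score_w = torch.randn(1601, 2048)   # detector classifier weights

# Row-normalize so the matmul below yields cosine similarities.
vg_norm = glove_vg_cls / glove_vg_cls.norm(dim=1, keepdim=True)
tgt_norm = glove_clss / glove_clss.norm(dim=1, keepdim=True)
sim_matrix = vg_norm @ tgt_norm.t()     # sim_matrix[i, j]: VG class i vs. target class j

# For each target class, pick the most similar VG class and reuse its weights.
max_sim, matched_cls = sim_matrix.max(dim=0)   # both of shape [431]
vis_classifiers = cls_score_w[matched_cls]     # [431, 2048] transferred classifier weights
print(max_sim[:5], matched_cls[:5])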
def __init__(self, opts, pretrained_decoder=None, embed=None, logit=None, roi_extractor=None):
    super(DecodeAndGroundCaptionerGVDROI, self).__init__()
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.opts = opts
    self.vocab_size = opts.vocab_size
    self.ix_to_word = opts.itow  # ix_to_word is the same for train and val

    if roi_extractor is not None:
        self.roi_feat_extractor = roi_extractor
    else:
        self.roi_feat_extractor = RegionalFeatureExtractorGVD(opts)

    self.seq_length = opts.seq_length
    self.seq_per_img = opts.seq_per_img
    self.decoder_num_layers = 2  # the Top-Down model has 2 layers of LSTMs
    self.localizer_num_layers = 1  # 1 layer of LSTM for the localizer
    self.rnn_size = opts.rnn_size
    self.ss_prob = 0.0  # scheduled sampling probability
    self.iou_threshold = 0.5

    # ==================================================
    if pretrained_decoder is None:
        self.decoder_core = TopDownDecoderCore(opts)
    else:
        self.decoder_core = pretrained_decoder

    if embed is None:
        if opts.embedding_vocab_plus_1:
            self.embed = nn.Sequential(
                nn.Embedding(opts.vocab_size + 1, opts.input_encoding_size),
                # we probably can't use "padding_idx=0" since <BOS> is also encoded as 0
                nn.ReLU(),
                nn.Dropout(opts.drop_prob_lm))
        else:
            self.embed = nn.Sequential(
                nn.Embedding(opts.vocab_size, opts.input_encoding_size),
                # we probably can't use "padding_idx=0" since <BOS> is also encoded as 0
                nn.ReLU(),
                nn.Dropout(opts.drop_prob_lm))
    else:
        self.embed = embed

    if logit is None:
        if opts.embedding_vocab_plus_1:
            self.logit = nn.Linear(opts.rnn_size, opts.vocab_size + 1)
        else:
            self.logit = nn.Linear(opts.rnn_size, opts.vocab_size)
    else:
        self.logit = logit

    # ==================================================
    # set up the localizer
    self.localizer_num_layers = 1  # 1 layer of LSTM for the localizer
    self.localizer_core = LocalizerNoLSTMCore(opts)

    # set up the reconstructor with the shared AttnLSTM and LangLSTM
    self.attended_roi_decoder_core = AttenedDecoderCore(
        opts, self.decoder_core.att_lstm, self.decoder_core.lang_lstm)

    self.critLM = utils.LMCriterion(opts)
    self.unk_idx = int(opts.wtoi['UNK'])
    self.xe_criterion = utils.LanguageCriterion()
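# Hedged usage sketch (not taken from the repository): the optional constructor arguments
# above let a later stage share the decoder, word embedding, output head, and ROI feature
# extractor of an already-trained captioner instead of re-initializing them.
# `pretrained_caption_model` is a hypothetical, previously trained instance of this class.
model = DecodeAndGroundCaptionerGVDROI(
    opts,
    pretrained_decoder=pretrained_caption_model.decoder_core,
    embed=pretrained_caption_model.embed,
    logit=pretrained_caption_model.logit,
    roi_extractor=pretrained_caption_model.roi_feat_extractor)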
def __init__(self, opt):
    super(AttModel, self).__init__()
    self.image_crop_size = opt.image_crop_size
    self.vocab_size = opt.vocab_size
    self.detect_size = opt.detect_size
    self.input_encoding_size = opt.input_encoding_size
    # self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_hid_size = opt.att_hid_size
    self.finetune_cnn = opt.finetune_cnn
    self.cbs = opt.cbs
    self.cbs_mode = opt.cbs_mode
    self.seq_per_img = 5

    if opt.cnn_backend == 'vgg16':
        self.stride = 16
    else:
        self.stride = 32

    self.att_size = int(opt.image_crop_size / self.stride)
    self.tiny_value = 1e-8
    self.pool_feat_size = self.att_feat_size + 300 * 2
    self.ss_prob = 0.0  # scheduled sampling probability
    self.min_value = -1e8
    opt.beta = 1
    self.beta = opt.beta

    if opt.cnn_backend == 'res101':
        self.cnn = resnet(opt, _num_layers=101, _fixed_block=opt.fixed_block, pretrained=True)
    elif opt.cnn_backend == 'res152':
        self.cnn = resnet(opt, _num_layers=152, _fixed_block=opt.fixed_block, pretrained=True)
    elif opt.cnn_backend == 'vgg16':
        self.cnn = vgg16(opt, pretrained=True)

    self.det_fc = nn.Sequential(nn.Embedding(self.detect_size + 1, 300),
                                nn.ReLU(),
                                nn.Dropout())
    self.loc_fc = nn.Sequential(nn.Linear(5, 300),
                                nn.ReLU(),
                                nn.Dropout())

    self.embed = nn.Sequential(
        nn.Embedding(self.vocab_size + self.detect_size + 1, self.input_encoding_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm))

    self.fc_embed = nn.Sequential(
        nn.Linear(self.fc_feat_size, self.rnn_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm))

    self.att_embed = nn.Sequential(
        nn.Linear(self.att_feat_size, self.rnn_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm))

    self.pool_embed = nn.Sequential(
        nn.Linear(self.pool_feat_size, self.rnn_size),
        nn.ReLU(),
        nn.Dropout(self.drop_prob_lm))

    self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
    self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

    self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)

    self.roi_align = RoIAlignAvg(1, 1, 1.0 / self.stride)
    # self.grid_size = 1
    # self.roi_crop = _RoICrop()

    self.critLM = utils.LMCriterion(opt)
    self.critBN = utils.BNCriterion(opt)
    self.critFG = utils.FGCriterion(opt)

    if opt.self_critical:
        print("load reward function...")
        self.get_self_critical_reward = get_self_critical_reward(opt)
        self.critRL = utils.RewardCriterion(opt)

    # initialize the glove weight for the labels
    self.det_fc[0].weight.data.copy_(opt.glove_clss)
    for p in self.det_fc[0].parameters():
        p.requires_grad = False
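# Note on `pool_feat_size = att_feat_size + 300 * 2` above: it implies each region's pooled
# descriptor concatenates a visual feature with the 300-d `det_fc` class embedding and the
# 300-d `loc_fc` location embedding. The sketch below is only a dimension check under that
# assumption; the sizes, the 5-d box layout, and the concatenation order are illustrative,
# and the actual wiring lives in the forward pass, which is not shown here.
import torch
import torch.nn as nn

att_feat_size = 2048                              # assumed visual feature size
det_fc = nn.Sequential(nn.Embedding(1601, 300), nn.ReLU(), nn.Dropout())
loc_fc = nn.Sequential(nn.Linear(5, 300), nn.ReLU(), nn.Dropout())

roi_feat = torch.randn(10, att_feat_size)         # 10 region features
cls_idx = torch.randint(0, 1601, (10,))           # detected class index per region
box = torch.rand(10, 5)                           # assumed normalized (x1, y1, x2, y2, area)

pool_feat = torch.cat([roi_feat, det_fc(cls_idx), loc_fc(box)], dim=-1)
assert pool_feat.shape[-1] == att_feat_size + 300 * 2  # matches pool_feat_size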
def __init__(self, opt):
    super(AttModel, self).__init__()
    self.image_crop_size = opt.image_crop_size
    self.vocab_size = opt.vocab_size
    self.detect_size = opt.detect_size
    self.input_encoding_size = opt.input_encoding_size
    # self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_hid_size = opt.att_hid_size
    self.finetune_cnn = opt.finetune_cnn
    self.cbs = opt.cbs
    self.cbs_mode = opt.cbs_mode
    self.seq_per_img = 5

    if opt.cnn_backend == 'vgg16':
        self.stride = 16
    else:
        self.stride = 32

    self.att_size = int(opt.image_crop_size / self.stride)
    self.tiny_value = 1e-8

    if opt.relation_type in ('implicit', 'spatial', 'semantic'):
        self.pool_feat_size = opt.relation_dim + 300 * 2
    else:
        self.pool_feat_size = self.att_feat_size + 300 * 2

    self.ss_prob = 0.0  # scheduled sampling probability
    self.min_value = -1e8
    opt.beta = 1
    self.beta = opt.beta

    if opt.cnn_backend == 'res101':
        self.cnn = resnet(opt, _num_layers=101, _fixed_block=opt.fixed_block, pretrained=True)
    elif opt.cnn_backend == 'res152':
        self.cnn = resnet(opt, _num_layers=152, _fixed_block=opt.fixed_block, pretrained=True)
    elif opt.cnn_backend == 'vgg16':
        self.cnn = vgg16(opt, pretrained=True)

    # Object detection model
    # self.faster_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)
    # self.faster_rcnn.eval()
    # self.ppls_threshold = opt.ppls_thresh
    # self.max_proposal = 200
    # self.det_oracle = opt.det_oracle

    self.det_fc = nn.Sequential(nn.Embedding(self.detect_size + 1, 300),
                                nn.ReLU(inplace=opt.inplace),
                                nn.Dropout())
    self.loc_fc = nn.Sequential(nn.Linear(5, 300),
                                nn.ReLU(inplace=opt.inplace),
                                nn.Dropout())

    self.embed = nn.Sequential(
        nn.Embedding(self.vocab_size + self.detect_size + 1, self.input_encoding_size),
        nn.ReLU(inplace=opt.inplace),
        nn.Dropout(self.drop_prob_lm))

    self.fc_embed = nn.Sequential(
        nn.Linear(self.fc_feat_size, self.rnn_size),
        nn.ReLU(inplace=opt.inplace),
        nn.Dropout(self.drop_prob_lm))

    self.att_embed = nn.Sequential(
        nn.Linear(self.att_feat_size, self.rnn_size),
        nn.ReLU(inplace=opt.inplace),
        nn.Dropout(self.drop_prob_lm))

    self.pool_embed = nn.Sequential(
        nn.Linear(self.pool_feat_size, self.rnn_size),
        nn.ReLU(inplace=opt.inplace),
        nn.Dropout(self.drop_prob_lm))

    self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
    self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

    self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)

    # fix the RoIAlign to use the torchvision version
    # self.roi_align = RoIAlignAvg(1, 1, 1.0 / self.stride)
    self.roi_align = RoIAlign((1, 1), 1.0 / self.stride, 0)
    # self.grid_size = 1
    # self.roi_crop = _RoICrop()

    self.critLM = utils.LMCriterion(opt)
    self.critBN = utils.BNCriterion(opt)
    self.critFG = utils.FGCriterion(opt)

    if opt.self_critical:
        print("load reward function...")
        self.get_self_critical_reward = get_self_critical_reward(opt)
        self.critRL = utils.RewardCriterion(opt)

    # initialize the glove weight for the labels
    self.det_fc[0].weight.data.copy_(opt.glove_clss)
    for p in self.det_fc[0].parameters():
        p.requires_grad = False

    # initialize the relation module
    self.nongt_dim = opt.nongt_dim
    self.imp_pos_emb_dim = opt.imp_pos_emb_dim
    self.relation_type = opt.relation_type

    # if opt.implicit_type:
    #     self.imp_relation = ImplicitRelationEncoder(
    #         self.att_feat_size, opt.relation_dim,
    #         opt.dir_num, opt.imp_pos_emb_dim, opt.nongt_dim,
    #         num_heads=opt.num_heads, num_steps=opt.num_steps,
    #         residual_connection=opt.residual_connection,
    #         label_bias=opt.label_bias)
    # if opt.spatial_type:
    #     self.spa_relation = ExplicitRelationEncoder(
    #         self.att_feat_size, opt.relation_dim,
    #         opt.dir_num, opt.spa_label_num,
    #         num_heads=opt.num_heads, num_steps=opt.num_steps,
    #         nongt_dim=opt.nongt_dim,
    #         residual_connection=opt.residual_connection,
    #         label_bias=opt.label_bias)
    # if opt.semantic_type:
    #     self.sem_relation = ExplicitRelationEncoder(
    #         self.att_feat_size, opt.relation_dim,
    #         opt.dir_num, opt.sem_label_num,
    #         num_heads=opt.num_heads,
    #         num_steps=opt.num_steps, nongt_dim=opt.nongt_dim,
    #         residual_connection=opt.residual_connection,
    #         label_bias=opt.label_bias)

    if opt.relation_type == 'implicit':
        self.v_relation = ImplicitRelationEncoder(
            self.att_feat_size, opt.relation_dim,
            opt.dir_num, opt.imp_pos_emb_dim, opt.nongt_dim,
            num_heads=opt.num_heads, num_steps=opt.num_steps,
            residual_connection=opt.residual_connection,
            label_bias=opt.label_bias)
    elif opt.relation_type == 'spatial':
        self.v_relation = ExplicitRelationEncoder(
            self.att_feat_size, opt.relation_dim,
            opt.dir_num, opt.spa_label_num,
            pos_emb_dim=opt.imp_pos_emb_dim,
            num_heads=opt.num_heads, num_steps=opt.num_steps,
            nongt_dim=opt.nongt_dim,
            residual_connection=opt.residual_connection,
            label_bias=opt.label_bias,
            graph_att=opt.graph_attention)
    elif opt.relation_type == 'semantic':
        self.v_relation = ExplicitRelationEncoder(
            self.att_feat_size, opt.relation_dim,
            opt.dir_num, opt.sem_label_num,
            num_heads=opt.num_heads, num_steps=opt.num_steps,
            nongt_dim=opt.nongt_dim,
            residual_connection=opt.residual_connection,
            label_bias=opt.label_bias,
            graph_att=opt.graph_attention)
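# For reference: the `torchvision.ops.RoIAlign` module constructed in the __init__ above
# takes backbone feature maps plus boxes given in input-image coordinates and rescales the
# boxes internally via `spatial_scale`; each box is a row of [batch_index, x1, y1, x2, y2].
# A minimal sketch with made-up shapes; the real proposals come from the detector at
# forward time, not from hand-written tensors like these.
import torch
from torchvision.ops import RoIAlign

stride = 32                                        # backbone downsampling factor
roi_align = RoIAlign((1, 1), spatial_scale=1.0 / stride, sampling_ratio=0)

feat = torch.randn(2, 2048, 18, 18)                # [batch, channels, H/stride, W/stride]
rois = torch.tensor([[0., 30., 40., 200., 300.],   # [batch_index, x1, y1, x2, y2]
                     [1., 10., 10., 120., 150.]])  # boxes in input-image coordinates

pooled = roi_align(feat, rois)                     # -> [num_rois, 2048, 1, 1]
region_feats = pooled.flatten(1)                   # -> [num_rois, 2048]
print(region_feats.shape)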