def __init__(self, opt):
        super(AttModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.detect_size = opt.detect_size  # number of object classes
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.seg_info_size = 50
        self.fc_feat_size = opt.fc_feat_size + self.seg_info_size
        self.att_feat_size = opt.att_feat_size
        self.att_hid_size = opt.att_hid_size
        self.seq_per_img = opt.seq_per_img
        self.itod = opt.itod
        self.att_input_mode = opt.att_input_mode
        self.transfer_mode = opt.transfer_mode
        self.test_mode = opt.test_mode
        self.enable_BUTD = opt.enable_BUTD
        self.w_grd = opt.w_grd
        self.w_cls = opt.w_cls
        self.num_sampled_frm = opt.num_sampled_frm
        self.num_prop_per_frm = opt.num_prop_per_frm
        self.att_model = opt.att_model
        self.unk_idx = int(opt.wtoi['UNK'])

        if opt.region_attn_mode == 'add':
            self.alpha_net = nn.Linear(self.att_hid_size, 1)
        elif opt.region_attn_mode == 'cat':
            self.alpha_net = nn.Linear(self.att_hid_size * 2, 1)

        self.stride = 32  # downsizing from input image to feature map

        self.t_attn_size = opt.t_attn_size
        self.tiny_value = 1e-8

        if self.enable_BUTD:
            assert (self.att_input_mode == 'region')
            self.pool_feat_size = self.att_feat_size
        else:
            self.pool_feat_size = self.att_feat_size + 300 + self.detect_size + 1

        self.min_value = -1e8
        opt.beta = 1
        self.beta = opt.beta

        self.loc_fc = nn.Sequential(nn.Linear(5, 300), nn.ReLU(),
                                    nn.Dropout(inplace=True))

        self.embed = nn.Sequential(
            nn.Embedding(self.vocab_size,
                         self.input_encoding_size),  # det is 1-indexed
            nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))

        if self.transfer_mode in ('none', 'cls'):
            self.vis_encoding_size = 2048
        elif self.transfer_mode == 'both':
            self.vis_encoding_size = 2348
        elif self.transfer_mode == 'glove':
            self.vis_encoding_size = 300
        else:
            raise NotImplementedError

        self.vis_embed = nn.Sequential(
            nn.Embedding(self.detect_size + 1,
                         self.vis_encoding_size),  # det is 1-indexed
            nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))

        self.fc_embed = nn.Sequential(
            nn.Linear(self.fc_feat_size, self.rnn_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))

        self.seg_info_embed = nn.Sequential(
            nn.Linear(4, self.seg_info_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))

        self.att_embed = nn.ModuleList([
            nn.Sequential(
                nn.Linear(2048, self.rnn_size // 2),  # for rgb feature
                nn.ReLU(),
                nn.Dropout(self.drop_prob_lm, inplace=True)),
            nn.Sequential(
                nn.Linear(1024, self.rnn_size // 2),  # for motion feature
                nn.ReLU(),
                nn.Dropout(self.drop_prob_lm, inplace=True))
        ])

        self.att_embed_aux = nn.Sequential(nn.BatchNorm1d(self.rnn_size),
                                           nn.ReLU())

        self.pool_embed = nn.Sequential(
            nn.Linear(self.pool_feat_size, self.rnn_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))

        self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
        self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

        self.logit = nn.Linear(self.rnn_size, self.vocab_size)

        if opt.obj_interact:
            n_layers = 2
            n_heads = 6
            attn_drop = 0.2
            self.obj_interact = Transformer(self.rnn_size,
                                            0,
                                            0,
                                            d_hidden=int(self.rnn_size / 2),
                                            n_layers=n_layers,
                                            n_heads=n_heads,
                                            drop_ratio=attn_drop,
                                            pe=False)

        if self.att_model == 'transformer':
            n_layers = 2
            n_heads = 6
            attn_drop = 0.2
            print('initialize language decoder transformer...')
            self.cap_model = TransformerDecoder(self.rnn_size, 0, self.vocab_size, \
                d_hidden = self.rnn_size//2, n_layers=n_layers, n_heads=n_heads, drop_ratio=attn_drop)

        if opt.t_attn_mode == 'bilstm':  # frame-wise feature encoding
            n_layers = 2
            attn_drop = 0.2
            self.context_enc = nn.LSTM(self.rnn_size, self.rnn_size//2, n_layers, dropout=attn_drop, \
                bidirectional=True, batch_first=True)
        elif opt.t_attn_mode == 'bigru':
            n_layers = 2
            attn_drop = 0.2
            self.context_enc = nn.GRU(self.rnn_size, self.rnn_size//2, n_layers, dropout=attn_drop, \
                bidirectional=True, batch_first=True)
        else:
            raise NotImplementedError

        self.ctx2pool_grd = nn.Sequential(
            nn.Linear(self.att_feat_size, self.vis_encoding_size),  # fc7 layer
            nn.ReLU(),
            nn.Dropout(self.drop_prob_lm, inplace=True))

        self.critLM = utils.LMCriterion(opt)

        # initialize the glove weight for the labels.
        # self.det_fc[0].weight.data.copy_(opt.glove_vg_cls)
        # for p in self.det_fc[0].parameters(): p.requires_grad=False

        # self.embed[0].weight.data.copy_(torch.cat((opt.glove_w, opt.glove_clss)))
        # for p in self.embed[0].parameters(): p.requires_grad=False

        # weights transfer for fc7 layer
        with open('data/detectron_weights/fc7_w.pkl', 'rb') as f:
            fc7_w = torch.from_numpy(pickle.load(f))
        with open('data/detectron_weights/fc7_b.pkl', 'rb') as f:
            fc7_b = torch.from_numpy(pickle.load(f))
        self.ctx2pool_grd[0].weight[:self.att_feat_size].data.copy_(fc7_w)
        self.ctx2pool_grd[0].bias[:self.att_feat_size].data.copy_(fc7_b)

        if self.transfer_mode in ('cls', 'both'):
            # find nearest neighbour class for transfer
            with open('data/detectron_weights/cls_score_w.pkl', 'rb') as f:
                cls_score_w = torch.from_numpy(pickle.load(f))  # 1601 x 2048
            with open('data/detectron_weights/cls_score_b.pkl', 'rb') as f:
                cls_score_b = torch.from_numpy(pickle.load(f))  # 1601

            assert len(opt.itod) + 1 == opt.glove_clss.size(0)  # index 0 is background
            assert len(opt.vg_cls) == opt.glove_vg_cls.size(0)  # index 0 is background

            sim_matrix = torch.matmul(opt.glove_vg_cls/torch.norm(opt.glove_vg_cls, dim=1).unsqueeze(1), \
                (opt.glove_clss/torch.norm(opt.glove_clss, dim=1).unsqueeze(1)).transpose(1,0))

            max_sim, matched_cls = torch.max(sim_matrix, dim=0)
            self.max_sim = max_sim
            self.matched_cls = matched_cls

            vis_classifiers = opt.glove_clss.new(self.detect_size + 1,
                                                 cls_score_w.size(1)).fill_(0)
            self.vis_classifiers_bias = nn.Parameter(
                opt.glove_clss.new(self.detect_size + 1).fill_(0))
            vis_classifiers[0] = cls_score_w[0]  # background
            self.vis_classifiers_bias[0].data.copy_(cls_score_b[0])
            for i in range(1, self.detect_size + 1):
                vis_classifiers[i] = cls_score_w[matched_cls[i]]
                self.vis_classifiers_bias[i].data.copy_(
                    cls_score_b[matched_cls[i]])
                if max_sim[i].item() < 0.9:
                    print('index: {}, similarity: {:.2}, {}, {}'.format(i, max_sim[i].item(), \
                        opt.itod[i], opt.vg_cls[matched_cls[i]]))

            if self.transfer_mode == 'cls':
                self.vis_embed[0].weight.data.copy_(vis_classifiers)
            else:
                self.vis_embed[0].weight.data.copy_(
                    torch.cat((vis_classifiers, opt.glove_clss), dim=1))
        elif self.transfer_mode == 'glove':
            self.vis_embed[0].weight.data.copy_(opt.glove_clss)
        elif self.transfer_mode == 'none':
            print('No knowledge transfer...')
        else:
            raise NotImplementedError

        # for p in self.ctx2pool_grd.parameters(): p.requires_grad=False
        # for p in self.vis_embed[0].parameters(): p.requires_grad=False

        if opt.enable_visdom:
            import visdom
            self.vis = visdom.Visdom(server=opt.visdom_server,
                                     env='vis-' + opt.id)
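
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original class): how the two
# `alpha_net` heads defined above for `region_attn_mode` could be used to
# score region features. The tensor shapes, the tanh squashing and the
# projection inputs are assumptions for illustration only.
import torch
import torch.nn as nn
import torch.nn.functional as F

att_hid_size = 512
alpha_net_add = nn.Linear(att_hid_size, 1)      # 'add' mode: score the summed projections
alpha_net_cat = nn.Linear(att_hid_size * 2, 1)  # 'cat' mode: score the concatenated projections

proj_regions = torch.randn(10, att_hid_size)    # projected region features (e.g. via ctx2pool)
proj_query = torch.randn(1, att_hid_size)       # projected decoder hidden state

# additive scoring: sum, squash, then map to a scalar per region
scores_add = alpha_net_add(torch.tanh(proj_regions + proj_query)).squeeze(-1)

# concatenation scoring: append the query to every region, then map to a scalar
scores_cat = alpha_net_cat(
    torch.cat([proj_regions, proj_query.expand_as(proj_regions)], dim=-1)).squeeze(-1)

attn_weights = F.softmax(scores_add, dim=0)     # attention distribution over the 10 regions
# --------------------------------------------------------------------------
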
    def __init__(self,
                 opts,
                 pretrained_decoder=None,
                 embed=None,
                 logit=None,
                 roi_extractor=None):
        super(DecodeAndGroundCaptionerGVDROI, self).__init__()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.opts = opts
        self.vocab_size = opts.vocab_size
        self.ix_to_word = opts.itow  # ix_to_word is the same for train and val

        if roi_extractor is not None:
            self.roi_feat_extractor = roi_extractor
        else:
            self.roi_feat_extractor = RegionalFeatureExtractorGVD(opts)

        self.seq_length = opts.seq_length
        self.seq_per_img = opts.seq_per_img

        self.decoder_num_layers = 2  # the Top-Down model has 2 layers of LSTMs
        self.localizer_num_layers = 1  # 1 layer of LSTM for localizer
        self.rnn_size = opts.rnn_size

        self.ss_prob = 0.0  # Schedule sampling probability

        self.iou_threshold = 0.5

        # ==================================================
        if pretrained_decoder is None:
            self.decoder_core = TopDownDecoderCore(opts)
        else:
            self.decoder_core = pretrained_decoder

        if embed is None:
            if opts.embedding_vocab_plus_1:
                self.embed = nn.Sequential(
                    nn.Embedding(opts.vocab_size + 1,
                                 opts.input_encoding_size),
                    # we probably can't do "padding_idx=0" since the <BOS> is also encoded as "0"
                    nn.ReLU(),
                    nn.Dropout(opts.drop_prob_lm))
            else:
                self.embed = nn.Sequential(
                    nn.Embedding(opts.vocab_size, opts.input_encoding_size),
                    # we probably can't do "padding_idx=0" since the <BOS> is also encoded as "0"
                    nn.ReLU(),
                    nn.Dropout(opts.drop_prob_lm))
        else:
            self.embed = embed

        if logit is None:
            if opts.embedding_vocab_plus_1:
                self.logit = nn.Linear(opts.rnn_size, opts.vocab_size + 1)
            else:
                self.logit = nn.Linear(opts.rnn_size, opts.vocab_size)
        else:
            self.logit = logit
        # ==================================================

        # set up the localizer
        self.localizer_num_layers = 1  # 1 layer of LSTM for localizer
        self.localizer_core = LocalizerNoLSTMCore(opts)

        # set up the reconstructor with the shared AttnLSTM and LangLSTM
        self.attended_roi_decoder_core = AttenedDecoderCore(
            opts, self.decoder_core.att_lstm, self.decoder_core.lang_lstm)

        self.critLM = utils.LMCriterion(opts)
        self.unk_idx = int(opts.wtoi['UNK'])

        self.xe_criterion = utils.LanguageCriterion()
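
# --------------------------------------------------------------------------
# Illustrative sketch (assumption, not from the original file): the
# `iou_threshold = 0.5` above is presumably used to match predicted regions
# against ground-truth boxes; a standard IoU for two (x1, y1, x2, y2) boxes:
def box_iou(box_a, box_b):
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(ix2 - ix1, 0.0) * max(iy2 - iy1, 0.0)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-8)

# e.g. box_iou([0, 0, 10, 10], [5, 5, 15, 15]) ~= 0.14, below the 0.5 threshold
# --------------------------------------------------------------------------
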
Example #3
    def __init__(self, opt):
        super(AttModel, self).__init__()
        self.image_crop_size = opt.image_crop_size
        self.vocab_size = opt.vocab_size
        self.detect_size = opt.detect_size
        self.input_encoding_size = opt.input_encoding_size
        #self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_hid_size = opt.att_hid_size
        self.finetune_cnn = opt.finetune_cnn
        self.cbs = opt.cbs
        self.cbs_mode = opt.cbs_mode
        self.seq_per_img = 5
        if opt.cnn_backend == 'vgg16':
            self.stride = 16
        else:
            self.stride = 32

        self.att_size = int(opt.image_crop_size / self.stride)
        self.tiny_value = 1e-8

        self.pool_feat_size = self.att_feat_size + 300 * 2
        self.ss_prob = 0.0  # Schedule sampling probability
        self.min_value = -1e8
        opt.beta = 1
        self.beta = opt.beta
        if opt.cnn_backend == 'res101':
            self.cnn = resnet(opt,
                              _num_layers=101,
                              _fixed_block=opt.fixed_block,
                              pretrained=True)
        elif opt.cnn_backend == 'res152':
            self.cnn = resnet(opt,
                              _num_layers=152,
                              _fixed_block=opt.fixed_block,
                              pretrained=True)
        elif opt.cnn_backend == 'vgg16':
            self.cnn = vgg16(opt, pretrained=True)

        self.det_fc = nn.Sequential(nn.Embedding(self.detect_size + 1, 300),
                                    nn.ReLU(), nn.Dropout())

        self.loc_fc = nn.Sequential(nn.Linear(5, 300), nn.ReLU(), nn.Dropout())

        self.embed = nn.Sequential(
            nn.Embedding(self.vocab_size + self.detect_size + 1,
                         self.input_encoding_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm))

        self.fc_embed = nn.Sequential(
            nn.Linear(self.fc_feat_size, self.rnn_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm))

        self.att_embed = nn.Sequential(
            nn.Linear(self.att_feat_size, self.rnn_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm))

        self.pool_embed = nn.Sequential(
            nn.Linear(self.pool_feat_size, self.rnn_size), nn.ReLU(),
            nn.Dropout(self.drop_prob_lm))

        self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
        self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)
        self.roi_align = RoIAlignAvg(1, 1, 1.0 / self.stride)

        #self.grid_size = 1
        #self.roi_crop = _RoICrop()
        self.critLM = utils.LMCriterion(opt)
        self.critBN = utils.BNCriterion(opt)
        self.critFG = utils.FGCriterion(opt)

        if opt.self_critical:
            print("load reward function...")
            self.get_self_critical_reward = get_self_critical_reward(opt)
            self.critRL = utils.RewardCriterion(opt)

        # initialize the glove weight for the labels.
        self.det_fc[0].weight.data.copy_(opt.glove_clss)
        for p in self.det_fc[0].parameters():
            p.requires_grad = False
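
# --------------------------------------------------------------------------
# Illustrative sketch (assumption, not from the original file): `RoIAlignAvg`
# above pools every proposal into a single 1x1 cell on the stride-32 feature
# map. Roughly equivalent behaviour with torchvision's roi_align:
import torch
from torchvision.ops import roi_align

feat_map = torch.randn(1, 2048, 18, 18)            # conv feature map at stride 32
rois = torch.tensor([[0., 32., 32., 256., 256.]])  # (batch_idx, x1, y1, x2, y2) in image pixels
pooled = roi_align(feat_map, rois, output_size=(1, 1), spatial_scale=1.0 / 32)
region_feat = pooled.flatten(1)                    # -> (num_rois, 2048) region features
# --------------------------------------------------------------------------
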
Example #4
    def __init__(self, opt):
        super(AttModel, self).__init__()
        self.image_crop_size = opt.image_crop_size
        self.vocab_size = opt.vocab_size
        self.detect_size = opt.detect_size
        self.input_encoding_size = opt.input_encoding_size
        # self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_hid_size = opt.att_hid_size
        self.finetune_cnn = opt.finetune_cnn
        self.cbs = opt.cbs
        self.cbs_mode = opt.cbs_mode
        self.seq_per_img = 5
        if opt.cnn_backend == 'vgg16':
            self.stride = 16
        else:
            self.stride = 32

        self.att_size = int(opt.image_crop_size / self.stride)
        self.tiny_value = 1e-8

        if opt.relation_type in ('implicit', 'spatial', 'semantic'):
            self.pool_feat_size = opt.relation_dim + 300 * 2
        else:
            self.pool_feat_size = self.att_feat_size + 300 * 2
        self.ss_prob = 0.0  # Schedule sampling probability
        self.min_value = -1e8
        opt.beta = 1
        self.beta = opt.beta
        if opt.cnn_backend == 'res101':
            self.cnn = resnet(opt,
                              _num_layers=101,
                              _fixed_block=opt.fixed_block,
                              pretrained=True)
        elif opt.cnn_backend == 'res152':
            self.cnn = resnet(opt,
                              _num_layers=152,
                              _fixed_block=opt.fixed_block,
                              pretrained=True)
        elif opt.cnn_backend == 'vgg16':
            self.cnn = vgg16(opt, pretrained=True)

        # Object Detection Model
        # self.faster_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)
        # self.faster_rcnn.eval()
        # self.ppls_threshold = opt.ppls_thresh
        # self.max_proposal = 200
        # self.det_oracle = opt.det_oracle

        self.det_fc = nn.Sequential(nn.Embedding(self.detect_size + 1, 300),
                                    nn.ReLU(inplace=opt.inplace), nn.Dropout())

        self.loc_fc = nn.Sequential(nn.Linear(5, 300),
                                    nn.ReLU(inplace=opt.inplace), nn.Dropout())

        self.embed = nn.Sequential(
            nn.Embedding(self.vocab_size + self.detect_size + 1,
                         self.input_encoding_size),
            nn.ReLU(inplace=opt.inplace), nn.Dropout(self.drop_prob_lm))

        self.fc_embed = nn.Sequential(
            nn.Linear(self.fc_feat_size, self.rnn_size),
            nn.ReLU(inplace=opt.inplace), nn.Dropout(self.drop_prob_lm))

        self.att_embed = nn.Sequential(
            nn.Linear(self.att_feat_size, self.rnn_size),
            nn.ReLU(inplace=opt.inplace), nn.Dropout(self.drop_prob_lm))

        self.pool_embed = nn.Sequential(
            nn.Linear(self.pool_feat_size, self.rnn_size),
            nn.ReLU(inplace=opt.inplace), nn.Dropout(self.drop_prob_lm))

        self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
        self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)

        # fix the RoIAlign to use the torchvision version
        # self.roi_align = RoIAlignAvg(1, 1, 1.0 / self.stride)
        self.roi_align = RoIAlign((1, 1), 1.0 / self.stride, 0)

        # self.grid_size = 1
        # self.roi_crop = _RoICrop()
        self.critLM = utils.LMCriterion(opt)
        self.critBN = utils.BNCriterion(opt)
        self.critFG = utils.FGCriterion(opt)

        if opt.self_critical:
            print("load reward function...")
            self.get_self_critical_reward = get_self_critical_reward(opt)
            self.critRL = utils.RewardCriterion(opt)

        # initialize the glove weight for the labels.
        self.det_fc[0].weight.data.copy_(opt.glove_clss)
        for p in self.det_fc[0].parameters():
            p.requires_grad = False

        # initialize relation module
        self.nongt_dim = opt.nongt_dim
        self.imp_pos_emb_dim = opt.imp_pos_emb_dim
        self.relation_type = opt.relation_type

        # if opt.implicit_type:
        #     self.imp_relation = ImplicitRelationEncoder(
        #         self.att_feat_size, opt.relation_dim,
        #         opt.dir_num, opt.imp_pos_emb_dim, opt.nongt_dim,
        #         num_heads=opt.num_heads, num_steps=opt.num_steps,
        #         residual_connection=opt.residual_connection,
        #         label_bias=opt.label_bias)
        # if opt.spatial_type:
        #     self.spa_relation = ExplicitRelationEncoder(
        #         self.att_feat_size, opt.relation_dim,
        #         opt.dir_num, opt.spa_label_num,
        #         num_heads=opt.num_heads, num_steps=opt.num_steps,
        #         nongt_dim=opt.nongt_dim,
        #         residual_connection=opt.residual_connection,
        #         label_bias=opt.label_bias
        #     )
        # if opt.semantic_tpye:
        #     self.sem_relation = ExplicitRelationEncoder(
        #         self.att_feat_size, opt.relation_dim,
        #         opt.dir_num, opt.sem_label_num,
        #         num_heads=opt.num_heads,
        #         num_steps=opt.num_steps, nongt_dim=opt.nongt_dim,
        #         residual_connection=opt.residual_connection,
        #         label_bias=opt.label_bias)
        if opt.relation_type == 'implicit':
            self.v_relation = ImplicitRelationEncoder(
                self.att_feat_size,
                opt.relation_dim,
                opt.dir_num,
                opt.imp_pos_emb_dim,
                opt.nongt_dim,
                num_heads=opt.num_heads,
                num_steps=opt.num_steps,
                residual_connection=opt.residual_connection,
                label_bias=opt.label_bias)
        elif opt.relation_type == 'spatial':
            self.v_relation = ExplicitRelationEncoder(
                self.att_feat_size,
                opt.relation_dim,
                opt.dir_num,
                opt.spa_label_num,
                pos_emb_dim=opt.imp_pos_emb_dim,
                num_heads=opt.num_heads,
                num_steps=opt.num_steps,
                nongt_dim=opt.nongt_dim,
                residual_connection=opt.residual_connection,
                label_bias=opt.label_bias,
                graph_att=opt.graph_attention)
        elif opt.relation_type == 'semantic':
            self.v_relation = ExplicitRelationEncoder(
                self.att_feat_size,
                opt.relation_dim,
                opt.dir_num,
                opt.sem_label_num,
                num_heads=opt.num_heads,
                num_steps=opt.num_steps,
                nongt_dim=opt.nongt_dim,
                residual_connection=opt.residual_connection,
                label_bias=opt.label_bias,
                graph_att=opt.graph_attention)
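
# --------------------------------------------------------------------------
# Illustrative sketch (assumption, not from the original file): the
# `pool_feat_size = ... + 300 * 2` above suggests each region feature is
# concatenated with a 300-d class embedding (det_fc) and a 300-d location
# embedding (loc_fc) before pool_embed projects it down to rnn_size.
import torch

num_rois, att_feat_size = 20, 2048
region_feats = torch.randn(num_rois, att_feat_size)       # pooled visual features
cls_emb = torch.randn(num_rois, 300)                      # det_fc(detected class index)
loc_emb = torch.randn(num_rois, 300)                      # loc_fc(normalized box + area)
pool_feats = torch.cat([region_feats, cls_emb, loc_emb], dim=-1)  # (num_rois, att_feat_size + 600)
# --------------------------------------------------------------------------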