Example #1
    def extract_output(self, vis_inps, vis_gt, save_dir):
        vis_data = self._infer(vis_inps, "save_output", vis_gt)

        qids = vis_data["qids"]
        io_utils.check_and_create_dir(save_dir)
        for i, qid in enumerate(qids):
            # gather the i-th query's outputs into a single dict
            out = dict()
            for k in vis_data.keys():
                out[k] = vis_data[k][i]
            # save this query's output as <save_dir>/<qid>.pkl
            save_path = os.path.join(save_dir, "{}.pkl".format(qid))
            io_utils.write_pkl(save_path, out)
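
io_utils.write_pkl presumably wraps Python's standard pickle module, so each saved per-query file can be read back with a few lines of standard-library code. The sketch below is illustrative only: the function name load_output and the direct use of pickle are assumptions, not part of the repository's io_utils API.

import os
import pickle

def load_output(save_path):
    # Load the per-query dict written by extract_output
    # (keys such as "qids", "grounding_pred", "grounding_gt", "duration", ...).
    # Assumes io_utils.write_pkl used standard pickle serialization.
    with open(save_path, "rb") as f:
        return pickle.load(f)

# Example usage (directory name is a placeholder):
# outputs = [load_output(os.path.join("results", fn))
#            for fn in os.listdir("results") if fn.endswith(".pkl")]
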
Example #2
    def _infer(self, net_inps, mode="forward", gts=None):
        # fetch inputs
        word_labels = net_inps["query_labels"]  # [B,L] (nword == L)
        word_masks = net_inps["query_masks"]  # [B,L]
        c3d_feats = net_inps["video_feats"]  # [B,T,d_v]
        seg_masks = net_inps["video_masks"].squeeze(2)  # [B,T]
        B, nseg, _ = c3d_feats.size()  # nseg == T

        # forward encoders
        # get word-level, sentence-level and segment-level features
        word_feats, sen_feats = self.query_enc(word_labels, word_masks,
                                               "both")  # [B,L,*]
        seg_feats = self.video_enc(c3d_feats, seg_masks)  # [B,nseg,*]

        # get semantic phrase features:
        # se_feats: semantic phrase features [B,nse,*];
        #           ([e^1,...,e^n]) in Eq. (7)
        # se_attw: attention weights for semantic phrase [B,nse,nword];
        #           ([a^1,...,a^n]) in Eq. (6)
        if self.nse > 1:
            se_feats, se_attw = self.sqan(sen_feats, word_feats, word_masks)
        else:
            se_attw = None

        # Local-global video-text interactions
        # sa_feats: semantics-aware segment features [B,nseg,d]; R in Eq. (12)
        # s_attw: aggregating weights [B,nse]
        if self.nse > 1:
            q_feats = se_feats
        else:
            q_feats = sen_feats
        sa_feats, s_attw = self.vti_fn(seg_feats, seg_masks, q_feats)

        # Temporal attentive localization by regression
        # loc: prediction of time span (t^s, t^e)
        # t_attw: temporal attention weights (o)
        loc, t_attw = self.ta_reg_fn(sa_feats, seg_masks)

        if mode == "forward":
            outs = OrderedDict()
            outs["grounding_loc"] = loc
            if self.use_tag_loss:
                outs["tag_attw"] = t_attw
            if self.use_dqa_loss:
                outs["dqa_attw"] = se_attw
        else:
            outs = dict()
            outs["vids"] = gts["vids"]
            outs["qids"] = gts["qids"]
            outs["query_labels"] = net_utils.to_data(net_inps["query_labels"])
            outs["grounding_gt"] = net_utils.to_data(
                gts["grounding_att_masks"])
            outs["grounding_pred"] = net_utils.loc2mask(loc, seg_masks)
            outs["nfeats"] = gts["nfeats"]
            if self.nse > 1:
                outs["se_attw"] = net_utils.to_data(se_attw)
            else:
                outs["se_attw"] = net_utils.to_data(
                    t_attw.new_zeros(t_attw.size(0), 2, 4))
            outs["t_attw"] = net_utils.to_data(t_attw.unsqueeze(1))
            if s_attw is None:
                outs["s_attw"] = net_utils.to_data(
                    t_attw.new_zeros(t_attw.size(0), 2, 4))
            else:
                outs["s_attw"] = net_utils.to_data(s_attw)

            if mode == "save_output":
                outs["duration"] = gts["duration"]
                outs["timestamps"] = gts["timestamps"]
                outs["grounding_pred_loc"] = net_utils.to_data(loc)

        return outs
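
Both methods lean on net_utils.loc2mask to turn a predicted normalized span (t^s, t^e) into a binary mask over the T video segments; in the code above this is what produces outs["grounding_pred"] from loc and seg_masks. The helper below is a minimal re-implementation of that idea for illustration; the name loc2mask_sketch and the exact index rounding and clamping are assumptions and may differ from the actual net_utils implementation.

import torch

def loc2mask_sketch(loc, seg_masks):
    # loc:       [B,2] start/end predictions normalized to [0,1]
    # seg_masks: [B,T] mask of valid (non-padded) segments
    # returns:   [B,T] binary mask that is 1 inside the predicted span
    B, T = seg_masks.size()
    nfeats = seg_masks.float().sum(dim=1)                # valid segments per video
    s_idx = (loc[:, 0] * nfeats).long().clamp(0, T - 1)  # start segment index
    e_idx = (loc[:, 1] * nfeats).long().clamp(0, T - 1)  # end segment index
    pos = torch.arange(T, device=loc.device).unsqueeze(0).expand(B, T)
    span = (pos >= s_idx.unsqueeze(1)) & (pos <= e_idx.unsqueeze(1))
    return span.float() * seg_masks.float()
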