def extract_output(self, vis_inps, vis_gt, save_dir):
    # run inference in "save_output" mode; grounding predictions
    # (including per-segment masks) are produced inside _infer
    vis_data = self._infer(vis_inps, "save_output", vis_gt)
    qids = vis_data["qids"]

    for i, qid in enumerate(qids):
        # split the batched outputs into one record per query
        out = dict()
        for k in vis_data.keys():
            out[k] = vis_data[k][i]

        # save output
        save_path = os.path.join(save_dir, "{}.pkl".format(qid))
        io_utils.check_and_create_dir(save_dir)
        io_utils.write_pkl(save_path, out)
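# Usage sketch (an assumption for illustration, not taken from the repo):
# given a trained model and an evaluation loader that yields (net_inps, gts)
# batches shaped the way _infer expects, per-query outputs could be dumped
# to pickle files like this:
#
#   for vis_inps, vis_gt in eval_loader:
#       model.extract_output(vis_inps, vis_gt, "results/qualitative")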
def _infer(self, net_inps, mode="forward", gts=None):
    # fetch inputs
    word_labels = net_inps["query_labels"]          # [B,L] (nword == L)
    word_masks = net_inps["query_masks"]            # [B,L]
    c3d_feats = net_inps["video_feats"]             # [B,T,d_v]
    seg_masks = net_inps["video_masks"].squeeze(2)  # [B,T]
    B, nseg, _ = c3d_feats.size()                   # nseg == T

    # forward encoders:
    # get word-level, sentence-level and segment-level features
    word_feats, sen_feats = self.query_enc(word_labels, word_masks, "both")  # [B,L,*]
    seg_feats = self.video_enc(c3d_feats, seg_masks)                         # [B,nseg,*]

    # get semantic phrase features:
    # se_feats: semantic phrase features [B,nse,*]; ([e^1,...,e^n]) in Eq. (7)
    # se_attw: attention weights for semantic phrases [B,nse,nword]; ([a^1,...,a^n]) in Eq. (6)
    if self.nse > 1:
        se_feats, se_attw = self.sqan(sen_feats, word_feats, word_masks)
    else:
        se_attw = None

    # local-global video-text interactions
    # sa_feats: semantics-aware segment features [B,nseg,d]; R in Eq. (12)
    # s_attw: aggregating weights [B,nse]
    q_feats = se_feats if self.nse > 1 else sen_feats
    sa_feats, s_attw = self.vti_fn(seg_feats, seg_masks, q_feats)

    # temporal attentive localization by regression
    # loc: predicted time span (t^s, t^e)
    # t_attw: temporal attention weights (o)
    loc, t_attw = self.ta_reg_fn(sa_feats, seg_masks)

    if mode == "forward":
        outs = OrderedDict()
        outs["grounding_loc"] = loc
        if self.use_tag_loss:
            outs["tag_attw"] = t_attw
        if self.use_dqa_loss:
            outs["dqa_attw"] = se_attw
    else:
        # visualization / output-saving modes: detach everything to CPU data
        outs = dict()
        outs["vids"] = gts["vids"]
        outs["qids"] = gts["qids"]
        outs["query_labels"] = net_utils.to_data(net_inps["query_labels"])
        outs["grounding_gt"] = net_utils.to_data(gts["grounding_att_masks"])
        outs["grounding_pred"] = net_utils.loc2mask(loc, seg_masks)
        outs["nfeats"] = gts["nfeats"]
        if self.nse > 1:
            outs["se_attw"] = net_utils.to_data(se_attw)
        else:
            # placeholder attention weights when SQAN is disabled
            outs["se_attw"] = net_utils.to_data(
                t_attw.new_zeros(t_attw.size(0), 2, 4))
        outs["t_attw"] = net_utils.to_data(t_attw.unsqueeze(1))
        if s_attw is None:
            outs["s_attw"] = net_utils.to_data(
                t_attw.new_zeros(t_attw.size(0), 2, 4))
        else:
            outs["s_attw"] = net_utils.to_data(s_attw)

        if mode == "save_output":
            outs["duration"] = gts["duration"]
            outs["timestamps"] = gts["timestamps"]
            outs["grounding_pred_loc"] = net_utils.to_data(loc)

    return outs
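# `net_utils.loc2mask` is used above to turn the regressed span into a
# per-segment mask for visualization. A minimal sketch of the assumed
# semantics (an illustration, not the repo's implementation): rasterize a
# normalized (t^s, t^e) pair in [0, 1] onto the T segments of each video,
# respecting the per-video validity mask. The name `loc2mask_sketch` is
# hypothetical.
import torch

def loc2mask_sketch(loc, seg_masks):
    """loc: [B,2] normalized (start, end); seg_masks: [B,T] segment validity."""
    B, T = seg_masks.size()
    nfeats = seg_masks.float().sum(dim=1, keepdim=True)     # valid segments per video [B,1]
    pos = torch.arange(T, device=seg_masks.device).float()  # segment indices [T]
    start = loc[:, 0:1] * (nfeats - 1)                      # span start in segment units [B,1]
    end = loc[:, 1:2] * (nfeats - 1)                        # span end in segment units [B,1]
    inside = (pos.unsqueeze(0) >= start) & (pos.unsqueeze(0) <= end)  # [B,T]
    return inside.float() * seg_masks.float()               # zero out padded segments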