from copy import deepcopy

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# repo-local helpers (utils.timestamp_to_featstamp, utils.get_candidate_featstamps,
# add_timelines) are assumed to be importable from the surrounding project


def draw_instance_positive_proposals(instance, prop_type="proposal_labels"):
    dr = instance["duration"]
    vid = instance["video_id"]
    vid_labels = instance[prop_type]
    nfeats, K = vid_labels.shape

    # get class names and featstamps for GT segments
    y_names = ["GT_{}".format(i + 1) for i, _ in enumerate(instance["gt_times"])]
    featstamps = [utils.timestamp_to_featstamp(x, nfeats, dr)
                  for x in instance["gt_times"]]
    featstamps_for_proposals = utils.get_candidate_featstamps(nfeats, K)

    for n in range(nfeats):
        found = False
        nth_featstamps = deepcopy(featstamps)
        nth_y_names = deepcopy(y_names)
        # collect the positive proposals ending at feature step n
        for k in range(K):
            if vid_labels[n, k] == 1:
                nth_featstamps.append(featstamps_for_proposals[n][k])
                nth_y_names.append("{}_{}".format(n, k))
                found = True
        if found:
            nth_y_names = np.asarray(nth_y_names)
            # get y-values and unique labels
            uniq_names, uniq_idx, idx = np.unique(nth_y_names, True, True)
            y = (idx + 1) / float(len(uniq_names) + 1)

            # draw durations of each segment
            print("Start drawing timestamps ({})".format(n + 1))
            colors = sns.color_palette("Set1", n_colors=len(uniq_names), desat=.4)
            for fs, y_, i in zip(nth_featstamps, y, idx):
                add_timelines(y_, fs[0], fs[1], color=colors[i])

            # set x-, y-axis
            ax = plt.gca()
            plt.yticks(y[uniq_idx], uniq_names)
            plt.ylim(0, 1)
            plt.xlim(0 - nfeats / 50.0, nfeats + nfeats / 50.0)
            plt.xlabel("Time")
            # plt.savefig("/Users/jonghwan.mun/figs/{:05d}.jpg".format(n))
            # print("saved in ~/figs/{:05d}.jpg".format(n))
            plt.show()
            wait = input("Waiting to show plots")
            plt.clf()
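# `add_timelines` is called above but not defined in this file. Below is a
# minimal sketch of what it is assumed to do (draw one horizontal segment per
# proposal at height `y`, with small end caps), using only matplotlib; the
# repo's actual helper may differ.
def add_timelines(y, xstart, xstop, color="b"):
    """Plot a horizontal timeline from xstart to xstop at height y (sketch)."""
    plt.hlines(y, xstart, xstop, colors=color, lw=4)
    plt.vlines(xstart, y - 0.01, y + 0.01, colors=color, lw=2)
    plt.vlines(xstop, y - 0.01, y + 0.01, colors=color, lw=2)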
def generate_labels(self, config):
    """ Generate and save labels for temporal language grounding:
        1) query_info (.json) with
            - wtoi: word-to-index dictionary (vocabulary)
            - itow: index-to-word dictionary (vocabulary)
            - query_lengths: lengths of queries
        2) query_labels (.h5): qid -> label
        3) grounding_labels (.h5): qid -> label
    """

    """ Query information """
    if not os.path.exists(self.paths["query_labels"]):
        # build vocabulary from training data
        train_ann_path = "data/charades/annotations/charades_sta_train.txt"
        train_aux_path = "data/charades/annotations/Charades_v1_train.csv"
        train_anns, _, _ = self._load_annotation(train_ann_path, train_aux_path)
        wtoi = self._build_vocab(train_anns)
        itow = {v: k for k, v in wtoi.items()}

        # encode queries and save labels (+ lengths)
        L = config.get("max_length", 20)
        encoded = self._encode_query(self.anns, wtoi, L)
        query_labels = io_utils.open_hdf5(self.paths["query_labels"], "w")
        for qid in tqdm(encoded["query_lengths"].keys(), desc="Saving query"):
            _ = query_labels.create_dataset(
                str(qid), data=encoded["query_labels"][qid])
        query_labels.close()

        # save vocabulary and query lengths
        query_info = {
            "wtoi": wtoi,
            "itow": itow,
            "query_lengths": encoded["query_lengths"],
        }
        io_utils.write_json(self.paths["query_info"], query_info)

    """ Grounding information """
    if not os.path.exists(self.paths["grounding_info"]):
        grd_dataset = io_utils.open_hdf5(self.paths["grounding_info"], "w")
        start_pos = grd_dataset.create_group("start_pos")
        end_pos = grd_dataset.create_group("end_pos")
        att_masks = grd_dataset.create_group("att_mask")
        for qid, ann in tqdm(self.anns.items(), desc="Gen. Grd. Labels"):
            # get normalized starting/ending positions
            ts = ann["timestamps"]
            vid_d = ann["duration"]
            start = ts[0] / vid_d
            end = ts[1] / vid_d

            # get attention calibration mask
            vid = ann["video_id"]
            if self.feature_type == "I3D":
                nfeats = np.load(self.feat_path.format(vid)).shape[0]
            else:
                raise NotImplementedError()
            nfeats = min(nfeats, self.S)
            fs = utils.timestamp_to_featstamp(ts, nfeats, vid_d)
            att_mask = np.zeros((self.S))
            att_mask[fs[0]:fs[1] + 1] = 1

            _ = start_pos.create_dataset(qid, data=start, dtype="float")
            _ = end_pos.create_dataset(qid, data=end, dtype="float")
            _ = att_masks.create_dataset(qid, data=att_mask, dtype="float")

        # close the file to flush the grounding labels to disk
        grd_dataset.close()
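# `utils.timestamp_to_featstamp` is used throughout to map a [start, end]
# timestamp in seconds onto indices of the video feature sequence. Below is a
# minimal sketch of the assumed behavior (proportional mapping clamped to valid
# indices); the repo's actual implementation may round or clamp differently.
def timestamp_to_featstamp(timestamp, nfeats, duration):
    """Convert [start, end] in seconds into [start_idx, end_idx] feature indices (sketch)."""
    start, end = timestamp
    start_idx = max(min(int(round(start / duration * nfeats)), nfeats - 1), 0)
    end_idx = max(min(int(round(end / duration * nfeats)), nfeats - 1), start_idx)
    return start_idx, end_idx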
def __getitem__(self, idx):
    # get query id and corresponding video id
    qid = str(self.qids[idx])
    vid = self.anns[qid]["video_id"]
    timestamp = self.anns[qid]["timestamps"]
    duration = self.anns[qid]["duration"]

    # get query labels
    if self.in_memory:
        q_label = self.query_labels[qid]
    else:
        query_labels = h5py.File(self.paths["query_labels"], "r")
        q_label = query_labels[qid][:]
    q_leng = self.query_lengths[qid]

    # get grounding label
    if self.in_memory:
        start_pos = self.s_pos[qid]
        end_pos = self.e_pos[qid]
    else:
        grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
        start_pos = grd_info["start_pos/" + qid][()]
        end_pos = grd_info["end_pos/" + qid][()]

    # get video features
    if self.in_memory:
        vid_feat_all = self.feats[vid]
    else:
        vid_feat_all = np.load(self.feat_path.format(vid)).squeeze()

    # treat defective annotations (end beyond duration, reversed timestamps)
    if timestamp[1] > duration:
        duration = timestamp[1]
    if timestamp[0] > timestamp[1]:
        timestamp = [timestamp[1], timestamp[0]]

    # cropping augmentation, part 1
    cropping = random()
    do_crop = self.cropping_augmentation and self.split == "train" and (
        not self.no_aug) and (cropping < self.cropping_prob)
    if do_crop:
        # sample how much to cut before and after the GT segment
        cut_start = random() * timestamp[0] * self.cropping_factor
        cut_end = random() * (duration - timestamp[1]) * self.cropping_factor

        # crop the full feature sequence
        nfeats_all = vid_feat_all.shape[0]
        keep = utils.timestamp_to_featstamp(
            [cut_start, duration - cut_end], nfeats_all, duration)
        vid_feat_all = vid_feat_all[keep[0]:keep[1] + 1]

        # modify duration, timestamp, and grounding label accordingly
        duration = duration - cut_start - cut_end
        timestamp = [timestamp[0] - cut_start, timestamp[1] - cut_start]
        start_pos = timestamp[0] / duration
        end_pos = timestamp[1] / duration

    # adjust video feats to a fixed length
    vid_feat, nfeats, start_index, end_index = self.get_fixed_length_feat(
        vid_feat_all, self.S, start_pos, end_pos)

    # cropping augmentation, part 2
    if do_crop:
        # the crop invalidates the precomputed mask, so rebuild it
        fs = utils.timestamp_to_featstamp(timestamp, nfeats, duration)
        att_mask = np.zeros((self.S))
        att_mask[fs[0]:fs[1] + 1] = 1
    else:
        # otherwise use the precomputed attention mask
        if self.in_memory:
            att_mask = self.att_mask[qid]
        else:
            att_mask = grd_info["att_mask/" + qid][:]

    # get video masks
    vid_mask = np.zeros((self.S, 1))
    vid_mask[:nfeats] = 1

    instance = {
        "vids": vid,
        "qids": qid,
        "timestamps": timestamp,  # GT location [s, e] (seconds)
        "duration": duration,     # video span (seconds)
        "query_lengths": q_leng,
        "query_labels": torch.LongTensor(q_label).unsqueeze(0),        # [1,L_q_max]
        "query_masks": (torch.FloatTensor(q_label) > 0).unsqueeze(0),  # [1,L_q_max]
        "grounding_start_pos": torch.FloatTensor([start_pos]),  # [1]; normalized
        "grounding_end_pos": torch.FloatTensor([end_pos]),      # [1]; normalized
        "grounding_att_masks": torch.FloatTensor(att_mask),     # [L_v]
        "nfeats": torch.FloatTensor([nfeats]),
        "video_feats": torch.FloatTensor(vid_feat),  # [L_v,D_v]
        "video_masks": torch.ByteTensor(vid_mask),   # [L_v,1]
    }
    return instance
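# `get_fixed_length_feat` is called in __getitem__ but not shown in this file.
# Below is a minimal sketch of the assumed behavior: zero-pad short feature
# sequences up to S steps, uniformly subsample longer ones, and map the
# normalized start/end positions onto indices of the resulting sequence. The
# repo's actual implementation may differ (e.g. in sampling or rounding).
def get_fixed_length_feat(self, feats, S, start_pos, end_pos):
    nfeats_all = feats.shape[0]
    if nfeats_all <= S:
        # pad with zeros up to S steps; only the first nfeats rows are valid
        nfeats = nfeats_all
        out = np.zeros((S, feats.shape[1]), dtype=feats.dtype)
        out[:nfeats] = feats
    else:
        # uniformly subsample S steps from the full sequence
        nfeats = S
        sample_idx = np.linspace(0, nfeats_all - 1, num=S).astype(np.int64)
        out = feats[sample_idx]
    # map normalized positions to indices within the valid feature range
    start_index = int(start_pos * (nfeats - 1))
    end_index = int(end_pos * (nfeats - 1))
    return out, nfeats, start_index, end_index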