Example #1
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# `utils` and `add_timelines` are assumed to be provided by the surrounding module.


def draw_instance_positive_proposals(instance, prop_type="proposal_labels"):
    """Visualize GT segments together with positively labeled proposals."""
    dr = instance["duration"]
    vid = instance["video_id"]
    vid_labels = instance[prop_type]
    nfeats, K = vid_labels.shape
    # get classnames and featstamps for GT segments
    y_names = ["GT_{}".format(i+1) for i,x in enumerate(instance["gt_times"])]
    featstamps = [utils.timestamp_to_featstamp(x, nfeats, dr) for x in instance["gt_times"]]
    featstamps_for_proposals = utils.get_candidate_featstamps(nfeats, K)
    for n in range(nfeats):
        found = False
        nth_featstamps = deepcopy(featstamps)
        nth_y_names = deepcopy(y_names)
        for k in range(K):
            if vid_labels[n, k] == 1:
                nth_featstamps.append(featstamps_for_proposals[n][k])
                nth_y_names.append("{}_{}".format(n, k))
                found = True
        if found:
            nth_y_names = np.asarray(nth_y_names)

            # get y-values and unique labels
            uniq_names, uniq_idx, idx = np.unique(
                nth_y_names, return_index=True, return_inverse=True)
            y = (idx + 1) / float(len(uniq_names) + 1)

            # draw durations of each segment
            print("Start drawing timestamps ({})".format(n+1))
            colors = sns.color_palette("Set1", n_colors=len(uniq_names), desat=.4)
            for fs, y_, i in zip(nth_featstamps, y, idx):
                add_timelines(y_, fs[0], fs[1], color=colors[i])

            # set x-, y-axis
            ax = plt.gca()
            plt.yticks(y[uniq_idx], uniq_names)
            plt.ylim(0,1)
            plt.xlim(0-nfeats/50.0, nfeats+nfeats/50.0)
            plt.xlabel("Time")
            #plt.savefig("/Users/jonghwan.mun/figs/{:05d}.jpg".format(n))
            #print("saved in ~/figs/{:05d}.jpg".format(n))
            plt.show()
    input("Press Enter after viewing the plots")
    plt.clf()
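
The function above depends on two helpers that are not shown: add_timelines, which draws one segment as a horizontal bar, and utils.get_candidate_featstamps, which enumerates the K candidate segments attached to each time step. Minimal sketches of both, assuming add_timelines simply wraps plt.hlines/plt.vlines and that the candidates for step n are the K segments of lengths 1..K ending at n (the project's actual definitions may differ):

import matplotlib.pyplot as plt

def add_timelines(y, xstart, xstop, color="b"):
    # Draw one segment as a horizontal bar at height y with small end caps.
    plt.hlines(y, xstart, xstop, color, lw=4)
    plt.vlines(xstart, y - 0.03, y + 0.03, color, lw=2)
    plt.vlines(xstop, y - 0.03, y + 0.03, color, lw=2)

def get_candidate_featstamps(nfeats, K):
    # Hypothetical layout: for every step n, the K segments of length 1..K
    # that end at n (clipped at the start of the video).
    return [[(max(0, n - k), n) for k in range(K)] for n in range(nfeats)]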
Example #2
    def generate_labels(self, config):
        """ Generate and save labels for temporal language grouding
            1)query_info (.json) with
                - wtoi: word to index dictionary (vocabulary)
                - itow: index to word dictionary (vocabulary)
                - query_lengths: lengths for queries
            2)query_labels (.h5): qid -> label
            3)grounding_labels (.h5): qid -> label
        """
        """ Query information """
        if not os.path.exists(self.paths["query_labels"]):
            # build vocabulary from training data
            train_ann_path = "data/charades/annotations/charades_sta_train.txt"
            train_aux_path = "data/charades/annotations/Charades_v1_train.csv"
            train_anns, _, _ = self._load_annotation(train_ann_path,
                                                     train_aux_path)
            wtoi = self._build_vocab(train_anns)
            itow = {v: k for k, v in wtoi.items()}

            # encode queries and save labels (+lengths)
            L = config.get("max_length", 20)
            encoded = self._encode_query(self.anns, wtoi, L)
            query_labels = io_utils.open_hdf5(self.paths["query_labels"], "w")
            for qid in tqdm(encoded["query_lengths"].keys(),
                            desc="Saving query"):
                _ = query_labels.create_dataset(
                    str(qid), data=encoded["query_labels"][qid])
            query_labels.close()

            # save vocabulary and query length
            query_info = {
                "wtoi": wtoi,
                "itow": itow,
                "query_lengths": encoded["query_lengths"],
            }
            io_utils.write_json(self.paths["query_info"], query_info)
        """ Grounding information """
        if not os.path.exists(self.paths["grounding_info"]):
            grd_dataset = io_utils.open_hdf5(self.paths["grounding_info"], "w")
            start_pos = grd_dataset.create_group("start_pos")
            end_pos = grd_dataset.create_group("end_pos")
            att_masks = grd_dataset.create_group("att_mask")

            for qid, ann in tqdm(self.anns.items(), desc="Gen. Grd. Labels"):
                # get starting/ending positions
                ts = ann["timestamps"]
                vid_d = ann["duration"]
                start = ts[0] / vid_d
                end = ts[1] / vid_d

                # get attention calibration mask
                vid = ann["video_id"]
                if self.feature_type == "I3D":
                    nfeats = np.load(self.feat_path.format(vid)).shape[0]
                else:
                    raise NotImplementedError()

                nfeats = min(nfeats, self.S)

                fs = utils.timestamp_to_featstamp(ts, nfeats, vid_d)
                att_mask = np.zeros((self.S))
                att_mask[fs[0]:fs[1] + 1] = 1

                _ = start_pos.create_dataset(qid, data=start, dtype="float")
                _ = end_pos.create_dataset(qid, data=end, dtype="float")
                _ = att_masks.create_dataset(qid, data=att_mask, dtype="float")

            # save the encoded proposal labels and video ids
            grd_dataset.close()
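
Both generate_labels and the drawing code in Example #1 rely on utils.timestamp_to_featstamp, which converts a [start, end] timestamp in seconds into inclusive feature indices. A minimal sketch, assuming the duration is mapped uniformly onto nfeats features (the rounding in the project's own version may differ):

def timestamp_to_featstamp(timestamp, nfeats, duration):
    # Map a [start, end] pair given in seconds to indices in [0, nfeats - 1].
    start, end = timestamp
    start_idx = min(int(round(start / duration * nfeats)), nfeats - 1)
    end_idx = min(int(round(end / duration * nfeats)), nfeats - 1)
    return start_idx, max(start_idx, end_idx)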
Example #3
    def __getitem__(self, idx):
        # get query id and corresponding video id
        qid = str(self.qids[idx])
        vid = self.anns[qid]["video_id"]
        timestamp = self.anns[qid]["timestamps"]
        duration = self.anns[qid]["duration"]

        # get query labels
        if self.in_memory:
            q_label = self.query_labels[qid]
        else:
            query_labels = h5py.File(self.paths["query_labels"], "r")
            q_label = query_labels[qid][:]
        q_leng = self.query_lengths[qid]

        # get grounding label
        if self.in_memory:
            start_pos = self.s_pos[qid]
            end_pos = self.e_pos[qid]
        else:
            grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
            start_pos = grd_info["start_pos/" + qid][()]
            end_pos = grd_info["end_pos/" + qid][()]

        # get video features
        if self.in_memory:
            vid_feat_all = self.feats[vid]
        else:
            vid_feat_all = np.load(self.feat_path.format(vid)).squeeze()

        # treat defective case
        if timestamp[1] > duration:
            duration = timestamp[1]
        if timestamp[0] > timestamp[1]:
            timestamp = [timestamp[1], timestamp[0]]

        # Cropping Augmentation, part 1
        cropping = random()
        do_crop = self.cropping_augmentation and self.split == "train" and (
            not self.no_aug) and (cropping < self.cropping_prob)
        if do_crop:
            # sample how much to trim before and after the GT segment
            cut_start = random() * timestamp[0] * self.cropping_factor
            cut_end = random() * (duration -
                                  timestamp[1]) * self.cropping_factor
            # modify vid_all
            nfeats_all = vid_feat_all.shape[0]
            keep = utils.timestamp_to_featstamp(
                [cut_start, duration - cut_end], nfeats_all, duration)
            vid_feat_all = vid_feat_all[keep[0]:keep[1] + 1]
            # modify duration, timestamp, grounding label
            duration = duration - cut_start - cut_end
            timestamp = [timestamp[0] - cut_start, timestamp[1] - cut_start]
            start_pos = timestamp[0] / duration
            end_pos = timestamp[1] / duration

        # Adjust video feats
        vid_feat, nfeats, start_index, end_index = self.get_fixed_length_feat(
            vid_feat_all, self.S, start_pos, end_pos)

        # Cropping augmentation, part 2
        if do_crop:
            # if cropped, recompute the attention mask for the shifted timestamp
            fs = utils.timestamp_to_featstamp(timestamp, nfeats, duration)
            att_mask = np.zeros((self.S))
            att_mask[fs[0]:fs[1] + 1] = 1
        else:
            # otherwise, load the precomputed attention mask
            if self.in_memory:
                att_mask = self.att_mask[qid]
            else:
                att_mask = grd_info["att_mask/" + qid][:]

        # get video masks
        vid_mask = np.zeros((self.S, 1))
        vid_mask[:nfeats] = 1

        instance = {
            "vids": vid,
            "qids": qid,
            "timestamps": timestamp,  # GT location [s, e] (second)
            "duration": duration,  # video span (second)
            "query_lengths": q_leng,
            "query_labels":
            torch.LongTensor(q_label).unsqueeze(0),  # [1,L_q_max]
            "query_masks":
            (torch.FloatTensor(q_label) > 0).unsqueeze(0),  # [1,L_q_max]
            "grounding_start_pos":
            torch.FloatTensor([start_pos]),  # [1]; normalized
            "grounding_end_pos":
            torch.FloatTensor([end_pos]),  # [1]; normalized
            "grounding_att_masks": torch.FloatTensor(att_mask),  # [L_v]
            "nfeats": torch.FloatTensor([nfeats]),
            "video_feats": torch.FloatTensor(vid_feat),  # [L_v,D_v]
            "video_masks": torch.ByteTensor(vid_mask),  # [L_v,1]
        }

        return instance
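
__getitem__ also calls self.get_fixed_length_feat, which is not shown here. A minimal sketch of the logic, written as a standalone function and assuming uniform temporal subsampling to at most S steps, zero padding, and conversion of the normalized start/end positions into feature indices (the actual method may sample and round differently):

import numpy as np

def get_fixed_length_feat(feats, S, start_pos, end_pos):
    # Subsample (or keep) the feature sequence so that it fits into S slots.
    nfeats_all = feats.shape[0]
    if nfeats_all <= S:
        keep_idx = np.arange(nfeats_all)
    else:
        keep_idx = np.round(np.linspace(0, nfeats_all - 1, S)).astype(int)
    sampled = feats[keep_idx]
    nfeats = sampled.shape[0]

    # Zero-pad up to S steps; the video mask built in __getitem__ marks the
    # valid prefix.
    out = np.zeros((S, feats.shape[1]), dtype=feats.dtype)
    out[:nfeats] = sampled

    # start_pos / end_pos are normalized, so uniform subsampling leaves them
    # unchanged; only their index counterparts depend on nfeats.
    start_index = int(round(start_pos * (nfeats - 1)))
    end_index = int(round(end_pos * (nfeats - 1)))
    return out, nfeats, start_index, end_index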