Example No. 1
    def __init__(
        self,
        root_path="data/BSLCP",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=16,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        gpu_collation=False,
        word_data_pkl=None,
        featurize_mask="",
        featurize_mode=False,
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.featurize_mode = featurize_mode
        self.featurize_mask = featurize_mask
        self.gpu_collation = gpu_collation
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.stride = stride

        infofile = os.path.join(root_path, "info", "info.pkl")
        self.video_folder = "videos-resized-25fps-256x256-signdict_signbank"

        print(f"Loading {infofile}")
        with open(infofile, "rb") as f:
            data = pkl.load(f)

        self.set_video_metadata(data,
                                meta_key="videos",
                                fixed_sz_frames=gpu_collation)
        self.set_class_names(data=data, word_data_pkl=word_data_pkl)

        self.train = list(
            np.where(np.asarray(data["videos"]["split"]) == 0)[0])  # train
        self.valid = list(
            np.where(np.asarray(data["videos"]["split"]) == 2)[0])  # test

        self.videos = [s.strip() for s in data["videos"]["name"]]

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
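
A minimal usage sketch for this constructor. The enclosing class name is not visible in the snippet, so BSLCP below is an assumed name, and the files under data/BSLCP (info/info.pkl plus the resized video folder) must already exist on disk:

# Hypothetical usage; BSLCP is an assumed name for the class that owns
# the constructor above, and data/BSLCP must hold the expected pickles.
dataset = BSLCP(root_path="data/BSLCP", setname="train", num_in_frames=16)
print(f"{len(dataset.train)} train / {len(dataset.valid)} val clips")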
Example No. 2
    def __init__(
        self,
        info_pkl_json="misc/bsl1k/info-pkls.json",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=16,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        mouthing_prob_thres=0.9,
        gpu_collation=False,
        num_last_frames=20,
        featurize_mode=False,
        featurize_mask="",
        word_data_pkl=None,
        input_type="rgb",
        pose_keys=["body", "face", "lhnd", "rhnd"],
        mask_rgb=None,
        mask_type=None,
        bsl1k_pose_subset=False,
        bsl1k_anno_key="original-mouthings",
    ):
        self.setname = setname  # train, val or test
        self.featurize_mode = featurize_mode
        self.featurize_mask = featurize_mask
        self.gpu_collation = gpu_collation
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.stride = stride
        self.input_type = input_type
        self.pose_keys = pose_keys
        self.mask_rgb = mask_rgb
        self.mask_type = mask_type

        assert self.num_in_frames == 16
        self.num_last_frames = num_last_frames
        print(f"Using only {self.num_last_frames} last frames of videos")

        with open(info_pkl_json, "r") as f:
            pkls = json.load(f)[bsl1k_anno_key]
        infofile = pkls["info"]

        self.video_folder = pkls["videos"]

        print(f"Loading {infofile}")
        with open(infofile, "rb") as f:
            data = pkl.load(f)
        if self.input_type == "pose":
            pose_pkl = pkls["pose"]
            print(f"Loading {pose_pkl}")
            with open(pose_pkl, "rb") as f:
                self.pose_data = pkl.load(f)
        if self.mask_rgb:
            assert bsl1k_pose_subset
            assert mask_type
        if self.mask_rgb == "face":
            face_pkl = pkls["face_bbox"]
            print(f"Loading {face_pkl}")
            with open(face_pkl, "rb") as f:
                self.face_data = pkl.load(f)

        if bsl1k_pose_subset:
            mouth_pkl = pkls["mouth_bbox"]
            print(f"Loading {mouth_pkl}")
            with open(mouth_pkl, "rb") as f:
                self.mouth_data = pkl.load(f)

        self.set_video_metadata(data,
                                meta_key="videos",
                                fixed_sz_frames=gpu_collation)
        subset_ix = self.set_class_names(data=data,
                                         word_data_pkl=word_data_pkl)

        self.train = list(
            np.where(np.asarray(data["videos"]["split"]) == 0)[0])  # train
        self.valid = list(
            np.where(np.asarray(data["videos"]["split"]) == 2)[0])  # test
        self.videos = [s.strip() for s in data["videos"]["name"]]

        # Take subsets based on 'mouthing_prob'
        confident_mouthing = np.where(
            np.asarray(data["videos"]["mouthing_prob"]) >= mouthing_prob_thres
        )[0]
        msg = (
            f"Keeping {len(confident_mouthing)}/{len(data['videos']['mouthing_prob'])} "
            f"videos with more than {mouthing_prob_thres} mouthing confidence."
        )
        print(msg)
        self.train = [i for i in self.train if i in confident_mouthing]
        self.valid = [i for i in self.valid if i in confident_mouthing]

        print("Taking subsets according to word vocab")
        self.train = list(set(self.train).intersection(set(subset_ix)))
        self.valid = list(set(self.valid).intersection(set(subset_ix)))

        if self.input_type == "pose":
            valid_pose_ix = np.where(
                np.array([i is not None for i in self.pose_data["pose"]]))[0]
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(set(self.train).intersection(set(valid_pose_ix)))
            self.valid = list(set(self.valid).intersection(set(valid_pose_ix)))
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        if bsl1k_pose_subset:
            # Valid mouth indices should be equivalent to valid face indices,
            # so filtering on the mouth data covers both.
            valid_mouth_ix = np.where(
                np.array([i is not None for i in self.mouth_data]))[0]
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(
                set(self.train).intersection(set(valid_mouth_ix)))
            self.valid = list(
                set(self.valid).intersection(set(valid_mouth_ix)))
            print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        # Subsample the validation set to roughly 1300 clips if it is larger
        if self.setname == "val" and len(self.valid) > 1300:
            self.valid = self.valid[::int(len(self.valid) / 1300)]

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
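
The distinctive step in this example is the mouthing-confidence filter. A self-contained sketch of just that step, using synthetic probabilities rather than the real info.pkl:

import numpy as np

# Synthetic stand-in for data["videos"]["mouthing_prob"]
mouthing_prob = np.array([0.95, 0.42, 0.91, 0.88, 0.99])
train = [0, 1, 2, 3, 4]
mouthing_prob_thres = 0.9

# Keep only the indices whose mouthing confidence clears the threshold
confident_mouthing = np.where(mouthing_prob >= mouthing_prob_thres)[0]
train = [i for i in train if i in confident_mouthing]
print(train)  # [0, 2, 4]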
Example No. 3
    def __init__(
        self,
        root_path="data/PHOENIX-2014-T-release-v3/PHOENIX-2014-T",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=16,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        gpu_collation=False,
        assign_labels="auto",
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.gpu_collation = gpu_collation
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.stride = stride
        self.assign_labels = assign_labels
        infofile = os.path.join(root_path, "info", "info.pkl")
        print(f"Loading {infofile}")
        with open(infofile, "rb") as f:
            data = pkl.load(f)
        self.videos = [s.strip() for s in data["videos"]["name"]]

        other_class_ix = 1232
        self.classes = data["videos"]["gloss_ids"]
        replace_cnt = 0
        for i, seq in enumerate(self.classes):
            for j, gid in enumerate(seq):
                if gid == -1:
                    replace_cnt += 1
                    self.classes[i][j] = other_class_ix
        print(f"Replaced {replace_cnt} -1s with {other_class_ix}")
        with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f:
            self.class_names = f.read().splitlines()

        # Name for the catch-all class used above for unlabeled glosses
        self.class_names.append("1232 __OTHER__")

        self.video_folder = "videos"
        meta_key = self.video_folder
        if gpu_collation:
            # GPU collation requires all inputs to share the same spatial input size
            self.video_folder = "videos-resized-256fps-256x256"
        self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation)

        self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])
        if self.setname == "val":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0])
        elif self.setname == "test":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])

        if self.assign_labels == "auto":
            self.frame_level_glosses = data["videos"]["alignments"]["gloss_id"]

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
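
The gloss-id cleanup above maps unlabeled glosses (-1) to a catch-all class. The same logic on synthetic sequences makes the in-place replacement concrete:

# Synthetic stand-in for data["videos"]["gloss_ids"]
classes = [[5, -1, 7], [-1, -1], [3]]
other_class_ix = 1232

# Replace every -1 gloss id with the catch-all class index
replace_cnt = 0
for i, seq in enumerate(classes):
    for j, gid in enumerate(seq):
        if gid == -1:
            replace_cnt += 1
            classes[i][j] = other_class_ix
print(replace_cnt)  # 3
print(classes)      # [[5, 1232, 7], [1232, 1232], [3]]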
Example No. 4
    def __init__(
        self,
        root_path="data/wlasl",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=64,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        ram_data=True,
        gpu_collation=False,
        use_bbox=True,
        monolithic_pkl_path="data/pickled-videos/wlasl-compressed-quality-90-resized-256x256.pkl",
        input_type="rgb",
        pose_keys=["body", "face", "lhnd", "rhnd"],
        mask_rgb=None,
        mask_type=None,
        mask_prob=1.0,
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.gpu_collation = gpu_collation
        self.stride = stride
        self.use_bbox = use_bbox
        self.input_type = input_type
        self.pose_keys = pose_keys
        self.mask_rgb = mask_rgb
        self.mask_type = mask_type

        self.video_folder = "videos_360h_25fps"
        if Path(monolithic_pkl_path).exists() and ram_data:
            print(f"Loading monolithic pickle from {monolithic_pkl_path}")
            self.video_data_dict = memcache(monolithic_pkl_path)
        else:
            self.video_data_dict = None

        infofile = os.path.join(root_path, "info", "info.pkl")
        print(f"Loading {infofile}")
        with open(infofile, "rb") as f:
            data = pkl.load(f)

        if self.input_type == "pose":
            pose_pkl = os.path.join(root_path, "info", "pose.pkl")
            print(f"Loading {pose_pkl}")
            with open(pose_pkl, "rb") as f:
                self.pose_data = pkl.load(f)
        if self.mask_rgb:
            assert mask_type
        if self.mask_rgb == "face":
            face_pkl = os.path.join(root_path, "info", "face_bbox.pkl")
            print(f"Loading {face_pkl}")
            with open(face_pkl, "rb") as f:
                self.face_data = pkl.load(f)

        # Mouth bboxes are used below to select the subset of videos with annotations
        if self.input_type == "pose" or self.mask_rgb:
            mouth_pkl = os.path.join(root_path, "info", "mouth_bbox.pkl")
            print(f"Loading {mouth_pkl}")
            with open(mouth_pkl, "rb") as f:
                self.mouth_data = pkl.load(f)

        self.videos = np.asarray([s.strip() for s in data["videos"]["name"]])

        self.classes = data["videos"]["word_id"]
        with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f:
            self.class_names = f.read().splitlines()

        meta_key = self.video_folder
        if gpu_collation and not self.video_data_dict:
            # GPU collation requires all inputs to share the same spatial input size
            self.video_folder = "videos-resized-256fps-256x256"
        self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation)

        bboxes_orig = np.asarray(data["videos"]["box"])
        self.bboxes = []
        for i, bb in enumerate(bboxes_orig):
            ht = data["videos"]["videos_original"]["H"][i]
            wt = data["videos"]["videos_original"]["W"][i]
            xmin, ymin, xmax, ymax = bb
            # Normalize pixel coords to [ymin/H, xmin/W, ymax/H, xmax/W]
            bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt]
            self.bboxes.append(bb_norm)

        self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])
        if self.setname == "val":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0])
        elif self.setname == "test":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])

        if self.input_type == "pose" or self.mask_rgb:
            # Valid mouth ix should be equivalent to valid face ix, valid pose ix etc
            valid_mouth_ix = np.where(
                np.array([i is not None for i in self.mouth_data])
            )[0]
            if self.setname == "val" or self.setname == "test":
                print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(set(self.train).intersection(set(valid_mouth_ix)))
            if self.setname == "val" or self.setname == "test":
                self.valid = list(set(self.valid).intersection(set(valid_mouth_ix)))
                print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
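
The bounding-box normalization above converts pixel boxes (xmin, ymin, xmax, ymax) into [ymin/H, xmin/W, ymax/H, xmax/W] fractions. A one-box sketch with made-up frame dimensions:

# Synthetic frame size and pixel-space box (xmin, ymin, xmax, ymax)
ht, wt = 360, 640
xmin, ymin, xmax, ymax = 160, 36, 480, 324

# Normalize to [ymin/H, xmin/W, ymax/H, xmax/W]
bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt]
print(bb_norm)  # [0.1, 0.25, 0.9, 0.75]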