def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1, fold=1, train=True,
             framewiseTransform=False, transform=None):
    super(HMDB51, self).__init__(root)
    if not 1 <= fold <= 3:
        raise ValueError("fold should be between 1 and 3, got {}".format(fold))
    extensions = ('avi', )
    self.fold = fold
    self.train = train
    classes = list(sorted(list_dir(root)))
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
    self.classes = classes
    video_list = [x[0] for x in self.samples]
    video_clips = VideoClips(video_list, frames_per_clip, step_between_clips)
    self.indices = self._select_fold(video_list, annotation_path, fold, train)
    self.video_clips = video_clips.subset(self.indices)
    self.video_list = [video_list[i] for i in self.indices]
    self.framewiseTransform = framewiseTransform
    self.transform = transform
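# Usage sketch (illustrative only; the paths below are hypothetical placeholders): this HMDB51 variant
# expects the torchvision-style layout of one sub-directory per class under `root` and the official
# split files under `annotation_path`.
# dataset = HMDB51(root="/data/hmdb51/videos", annotation_path="/data/hmdb51/splits",
#                  frames_per_clip=16, step_between_clips=1, fold=1, train=True)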
def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1, frame_rate=None, fold=1,
             train=True, transform=None, _precomputed_metadata=None, num_workers=1, _video_width=0,
             _video_height=0, _video_min_dimension=0, _audio_samples=0):
    super(MYUCF101, self).__init__(root)
    if not 1 <= fold <= 3:
        raise ValueError("fold should be between 1 and 3, got {}".format(fold))
    extensions = ('avi', )
    self.fold = fold
    self.train = train
    classes = list(sorted(list_dir(root)))
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
    self.classes = classes
    video_list = [x[0] for x in self.samples]
    video_clips = VideoClips(
        video_list,
        frames_per_clip,
        step_between_clips,
        frame_rate,
        _precomputed_metadata,
        num_workers=num_workers,
        _video_width=_video_width,
        _video_height=_video_height,
        _video_min_dimension=_video_min_dimension,
        _audio_samples=_audio_samples,
    )
    meta_data_str_ = os.path.join(
        root,
        f"meta_data_train_{train}_fold_{fold}_frames_{frames_per_clip}_skip_{step_between_clips}.pickle")
    if not os.path.exists(meta_data_str_):
        with open(meta_data_str_, 'wb') as ff:
            pickle.dump(video_clips.metadata, ff)
    self.video_clips_metadata = video_clips.metadata
    self.indices = self._select_fold(video_list, annotation_path, fold, train)
    self.video_clips = video_clips.subset(self.indices)
    self.transform = transform
def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1, frame_rate=None, fold=1,
             train=True, transform=None, _precomputed_metadata=None, num_workers=1, _video_width=0,
             _video_height=0, _video_min_dimension=0, _audio_samples=0):
    super(UCF101, self).__init__(root)
    if not 1 <= fold <= 3:
        raise ValueError("fold should be between 1 and 3, got {}".format(fold))
    extensions = ('avi', )
    self.fold = fold
    self.train = train
    classes = list(sorted(list_dir(root)))
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
    self.classes = classes
    video_list = [x[0] for x in self.samples]
    metadata_filepath = os.path.join(root, 'ucf101_metadata.pt')
    if os.path.exists(metadata_filepath):
        metadata = torch.load(metadata_filepath)
    else:
        metadata = None
    video_clips = VideoClips(
        video_list,
        frames_per_clip,
        step_between_clips,
        frame_rate,
        metadata,
        num_workers=num_workers,
        _video_width=_video_width,
        _video_height=_video_height,
        _video_min_dimension=_video_min_dimension,
        _audio_samples=_audio_samples,
    )
    if not os.path.exists(metadata_filepath):
        torch.save(video_clips.metadata, metadata_filepath)
    self.video_clips_metadata = video_clips.metadata
    self.indices = self._select_fold(video_list, annotation_path, fold, train)
    self.video_clips = video_clips.subset(self.indices)
    self.transform = transform
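# Caching note with a usage sketch (illustrative; not part of the original class): VideoClips metadata
# is expensive to compute, so the first construction scans every video and saves `ucf101_metadata.pt`
# under `root`, while later constructions reload it and skip the scan. The paths are hypothetical.
# ds_cold = UCF101("/data/ucf101", "/data/ucf101/ucfTrainTestlist", frames_per_clip=16)  # builds cache
# ds_warm = UCF101("/data/ucf101", "/data/ucf101/ucfTrainTestlist", frames_per_clip=16)  # reuses cache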
def init_data(self, root, frames_per_clip, step_between_clips=6, frame_rate=6, train=True, transform=None,
              _precomputed_metadata=None, num_workers=1, _video_width=0, _video_height=0,
              _video_min_dimension=0, _audio_samples=0):
    super(HMDB51, self).__init__(root)
    extensions = ('avi', )
    if train:
        root = root + "/train"
    else:
        root = root + "/test"
    classes = sorted(list_dir(root))
    class_to_idx = {class_: i for (i, class_) in enumerate(classes)}
    print(class_to_idx)
    self.samples = []
    for target_class in sorted(class_to_idx.keys()):
        class_index = class_to_idx[target_class]
        target_dir = os.path.join(root, target_class)
        for root_curr, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
            for fname in sorted(fnames):
                path = os.path.join(root_curr, fname)
                if os.path.isfile(path):
                    item = path, class_index
                    self.samples.append(item)
    video_paths = [path for (path, _) in self.samples]
    video_clips = VideoClips(
        video_paths,
        frames_per_clip,
        step_between_clips,
        frame_rate,
        _precomputed_metadata,
        num_workers=num_workers,
        _video_width=_video_width,
        _video_height=_video_height,
        _video_min_dimension=_video_min_dimension,
        _audio_samples=_audio_samples,
    )
    self.train = train
    self.classes = classes
    self.video_clips_metadata = video_clips.metadata
    self.indices = self.get_indices(video_paths)
    self.video_clips = video_clips.subset(self.indices)
    self.transform = transform
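# Assumed on-disk layout for init_data above (inferred from the root + "/train" / "/test" handling and
# the per-class directory walk; class and file names are illustrative only):
# root/
#     train/
#         brush_hair/clip_0001.avi
#         cartwheel/clip_0042.avi
#     test/
#         brush_hair/clip_0100.avi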
def __init__(self, feat_path, root, annotation_path, frames_per_clip, extracted_frames_per_clip=2,
             step_between_clips=1, fold=1, train=True, transform=None):
    super(HMDB51FeatureSequenceDataset, self).__init__(root)
    if not 1 <= fold <= 3:
        raise ValueError("fold should be between 1 and 3, got {}".format(fold))
    extensions = ('avi', )
    self.fold = fold
    self.train = train
    self.frames_per_clip = frames_per_clip
    self.extracted_frames_per_clip = extracted_frames_per_clip
    self.step_between_clips = step_between_clips
    with open(feat_path, "rb") as fp:
        self.features = pickle.load(fp)
    classes = list(sorted(list_dir(root)))
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
    self.classes = classes
    video_list = [x[0] for x in self.samples]
    video_clips = VideoClips(video_list, frames_per_clip, step_between_clips)
    self.indices = self._select_fold(video_list, annotation_path, fold, train)
    self.video_clips = video_clips.subset(self.indices)
    self.video_list = [video_list[i] for i in self.indices]
    self.transform = transform
def __init__(self, root, train, resolution, n_frames, fold=1):
    video_root = osp.join(root, 'UCF-101')
    super(UCF101, self).__init__(video_root)
    if not 1 <= fold <= 3:
        raise ValueError("fold should be between 1 and 3, got {}".format(fold))
    self.train = train
    self.fold = fold
    self.resolution = resolution
    self.n_frames = n_frames
    self.annotation_path = os.path.join(root, 'ucfTrainTestlist')
    self.classes = list(sorted(p for p in os.listdir(video_root) if osp.isdir(osp.join(video_root, p))))
    class_to_idx = {self.classes[i]: i for i in range(len(self.classes))}
    self.samples = make_dataset(video_root, class_to_idx, ('avi', ), is_valid_file=None)
    video_list = [x[0] for x in self.samples]
    frames_between_clips = 1 if train else 16
    self.video_clips_fname = os.path.join(root, f'ucf_video_clips_{frames_between_clips}_{n_frames}.pkl')
    if not osp.exists(self.video_clips_fname):
        video_clips = VideoClips(video_paths=video_list,
                                 clip_length_in_frames=n_frames,
                                 frames_between_clips=1,
                                 num_workers=4)
        with open(self.video_clips_fname, 'wb') as f:
            pickle.dump(video_clips, f)
    else:
        with open(self.video_clips_fname, 'rb') as f:
            video_clips = pickle.load(f)
    indices = self._select_fold(video_list, self.annotation_path, fold, train)
    self.size = video_clips.subset(indices).num_clips()
    self._need_init = True
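# Caching note (observation, not original code): unlike the variants above, this loader pickles the
# entire VideoClips object to `ucf_video_clips_*.pkl` and keeps only the clip count here; the
# `_need_init = True` flag suggests the clips are re-loaded lazily elsewhere, which is not shown.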
def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1, frame_rate=None, fold=1,
             train=True, transform=None, dim=[240, 320], chn=1, _precomputed_metadata=None, num_workers=1,
             _video_width=0, _video_height=0, _video_min_dimension=0, _audio_samples=0):
    super(HMDB51, self).__init__(root)
    if fold not in (1, 2, 3):
        raise ValueError("fold should be between 1 and 3, got {}".format(fold))
    extensions = ('avi', )
    self.channel = chn
    self.d_y = dim[0]
    self.d_x = dim[1]
    name_class = "classhmdb.txt"
    f = os.path.join(annotation_path, name_class)
    lb = []
    n_file = []
    with open(f, "r") as fid:
        data = fid.readlines()
    data = [x.strip().split(" ") for x in data]
    lb2 = [x[0] for x in data]
    data = [x[1] for x in data]
    lb.extend(lb2)
    n_file.extend(data)
    cls = n_file
    class_to_idx = {cls[i]: lb[i] for i in range(len(cls))}
    self.classes = cls
    # classes = sorted(list_dir(root))
    # class_to_idx = {class_: i for (i, class_) in enumerate(classes)}
    self.samples = make_dataset(
        self.root,
        class_to_idx,
        extensions,
    )
    video_paths = [path for (path, _) in self.samples]
    video_clips = VideoClips(
        video_paths,
        frames_per_clip,
        step_between_clips,
        frame_rate,
        _precomputed_metadata,
        num_workers=num_workers,
        _video_width=_video_width,
        _video_height=_video_height,
        _video_min_dimension=_video_min_dimension,
        _audio_samples=_audio_samples,
    )
    self.fold = fold
    self.train = train
    # self.classes = classes
    self.indices = self._select_fold(video_paths, annotation_path, fold, train)
    self.video_clips = video_clips.subset(self.indices)
    self.transform = transform
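# Assumed format of classhmdb.txt (inferred from the parsing above: the first column is the label,
# the second column the class name; the actual file contents are not part of this code):
# 0 brush_hair
# 1 cartwheel
# 2 catch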
class KineticsAndFails(VisionDataset):
    FLOW_FPS = 8

    def __init__(self, fails_path, kinetics_path, frames_per_clip, step_between_clips, fps, transform=None,
                 extensions=('.mp4', ), video_clips=None, fails_only=False, val=False,
                 balance_fails_only=False, get_clip_times=False, fails_video_list=None, fns_to_remove=None,
                 load_flow=False, flow_histogram=False, fails_flow_path=None, all_fail_videos=False,
                 selfsup_loss=None, clip_interval_factor=None, labeled_fails=True, debug_dataset=False,
                 anticipate_label=0, data_proportion=1, **kwargs):
        self.clip_len = frames_per_clip / fps
        self.clip_step = step_between_clips / fps
        self.clip_interval_factor = clip_interval_factor
        self.fps = fps
        self.t = transform
        self.load_flow = load_flow
        self.flow_histogram = flow_histogram
        self.video_clips = None
        self.fails_path = fails_path
        self.fails_flow_path = fails_flow_path
        self.selfsup_loss = selfsup_loss
        self.get_clip_times = get_clip_times
        self.anticipate_label = anticipate_label
        data_proportion = 1 if val else data_proportion
        if video_clips:
            self.video_clips = video_clips
        else:
            assert fails_path is None or fails_video_list is None
            video_list = fails_video_list or glob(os.path.join(fails_path, '**', '*.mp4'), recursive=True)
            if not fails_only:
                kinetics_cls = torch.load("PATH/TO/kinetics_classes.pt")
                kinetics_dist = torch.load("PATH/TO/dist.pt")
                s = len(video_list)
                for i, n in kinetics_dist.items():
                    n *= s
                    video_list += sorted(glob(os.path.join(kinetics_path, '**', kinetics_cls[i], '*.mp4'),
                                              recursive=True))[:round(n)]
            self.video_clips = VideoClips(video_list, frames_per_clip, step_between_clips, fps)
        with open("PATH/TO/borders.json") as f:
            self.fails_borders = json.load(f)
        with open("PATH/TO/all_mturk_data.json") as f:
            self.fails_data = json.load(f)
        self.fails_only = fails_only
        self.t_from_clip_idx = lambda idx: ((step_between_clips * idx) / fps,
                                            (step_between_clips * idx + frames_per_clip) / fps)
        if not balance_fails_only:  # no support for recompute clips after balance calc yet
            self.video_clips.compute_clips(frames_per_clip, step_between_clips, fps)
        if video_clips is None and fails_only and labeled_fails:
            # if True:
            if not all_fail_videos:
                idxs = []
                for i, video_path in enumerate(self.video_clips.video_paths):
                    video_path = os.path.splitext(os.path.basename(video_path))[0]
                    if video_path in self.fails_data:
                        idxs.append(i)
                self.video_clips = self.video_clips.subset(idxs)
            # if not val and balance_fails_only:  # balance dataset
            #     ratios = {0: 0.3764, 1: 0.0989, 2: 0.5247}
            self.video_clips.labels = []
            self.video_clips.compute_clips(frames_per_clip, step_between_clips, fps)
            for video_idx, vid_clips in tqdm(enumerate(self.video_clips.clips),
                                             total=len(self.video_clips.clips)):
                video_path = self.video_clips.video_paths[video_idx]
                if all_fail_videos and \
                        os.path.splitext(os.path.basename(video_path))[0] not in self.fails_data:
                    self.video_clips.labels.append([-1 for _ in vid_clips])
                    continue
                t_unit = av.open(video_path, metadata_errors='ignore').streams[0].time_base
                t_fail = sorted(self.fails_data[os.path.splitext(os.path.basename(video_path))[0]]['t'])
                t_fail = t_fail[len(t_fail) // 2]
                if t_fail < 0 or not 0.01 <= statistics.median(
                        self.fails_data[os.path.splitext(os.path.basename(video_path))[0]]['rel_t']) <= 0.99 or \
                        self.fails_data[os.path.splitext(os.path.basename(video_path))[0]]['len'] < 3.2 or \
                        self.fails_data[os.path.splitext(os.path.basename(video_path))[0]]['len'] > 30:
                    self.video_clips.clips[video_idx] = torch.Tensor()
                    self.video_clips.resampling_idxs[video_idx] = torch.Tensor()
                    self.video_clips.labels.append([])
                    continue
                prev_label = 0
                first_one_idx = len(vid_clips)
                first_two_idx = len(vid_clips)
                for clip_idx, clip in enumerate(vid_clips):
                    start_pts = clip[0].item()
                    end_pts = clip[-1].item()
                    t_start = float(t_unit * start_pts)
                    t_end = float(t_unit * end_pts)
                    label = 0
                    if t_start <= t_fail <= t_end:
                        label = 1
                    elif t_start > t_fail:
                        label = 2
                    if label == 1 and prev_label == 0:
                        first_one_idx = clip_idx
                    elif label == 2 and prev_label == 1:
                        first_two_idx = clip_idx
                        break
                    prev_label = label
                self.video_clips.labels.append(
                    [0 for i in range(first_one_idx)] +
                    [1 for i in range(first_one_idx, first_two_idx)] +
                    [2 for i in range(first_two_idx, len(vid_clips))])
                if balance_fails_only and not val:
                    balance_idxs = []
                    counts = (first_one_idx, first_two_idx - first_one_idx, len(vid_clips) - first_two_idx)
                    offsets = torch.LongTensor([0] + list(counts)).cumsum(0)[:-1].tolist()
                    ratios = (1, 0.93, 1 / 0.93)
                    labels = (0, 1, 2)
                    lbl_mode = max(labels, key=lambda i: counts[i])
                    for i in labels:
                        if i != lbl_mode and counts[i] > 0:
                            n_to_add = round(counts[i] * ((counts[lbl_mode] * ratios[i] / counts[i]) - 1))
                            tmp = list(range(offsets[i], counts[i] + offsets[i]))
                            random.shuffle(tmp)
                            tmp_bal_idxs = []
                            while len(tmp_bal_idxs) < n_to_add:
                                tmp_bal_idxs += tmp
                            tmp_bal_idxs = tmp_bal_idxs[:n_to_add]
                            balance_idxs += tmp_bal_idxs
                    if not balance_idxs:
                        continue
                    t = torch.cat((vid_clips, torch.stack([vid_clips[i] for i in balance_idxs])))
                    self.video_clips.clips[video_idx] = t
                    vid_resampling_idxs = self.video_clips.resampling_idxs[video_idx]
                    try:
                        t = torch.cat((vid_resampling_idxs,
                                       torch.stack([vid_resampling_idxs[i] for i in balance_idxs])))
                        self.video_clips.resampling_idxs[video_idx] = t
                    except IndexError:
                        pass
                    self.video_clips.labels[-1] += [self.video_clips.labels[-1][i] for i in balance_idxs]
            clip_lengths = torch.as_tensor([len(v) for v in self.video_clips.clips])
            self.video_clips.cumulative_sizes = clip_lengths.cumsum(0).tolist()
        fns_removed = 0
        if fns_to_remove and not val:
            for i, video_path in enumerate(self.video_clips.video_paths):
                if fns_removed > len(self.video_clips.video_paths) // 4:
                    break
                video_path = os.path.splitext(os.path.basename(video_path))[0]
                if video_path in fns_to_remove:
                    fns_removed += 1
                    self.video_clips.clips[i] = torch.Tensor()
                    self.video_clips.resampling_idxs[i] = torch.Tensor()
                    self.video_clips.labels[i] = []
            clip_lengths = torch.as_tensor([len(v) for v in self.video_clips.clips])
            self.video_clips.cumulative_sizes = clip_lengths.cumsum(0).tolist()
            if kwargs['local_rank'] <= 0:
                print(f'removed videos from {fns_removed} out of {len(self.video_clips.video_paths)} files')
        # if not fails_path.startswith("PATH/TO/scenes"):
        for i, p in enumerate(self.video_clips.video_paths):
            self.video_clips.video_paths[i] = p.replace("PATH/TO/scenes", os.path.dirname(fails_path))
        self.debug_dataset = debug_dataset
        if debug_dataset:
            # self.video_clips = self.video_clips.subset([0])
            pass
        if data_proportion < 1:
            rng = random.Random()
            rng.seed(23719)
            lbls = self.video_clips.labels
            subset_idxs = rng.sample(range(len(self.video_clips.video_paths)),
                                     int(len(self.video_clips.video_paths) * data_proportion))
            self.video_clips = self.video_clips.subset(subset_idxs)
            self.video_clips.labels = [lbls[i] for i in subset_idxs]

    def trim_borders(self, img, fn):
        l, r = self.fails_borders[os.path.splitext(os.path.basename(fn))[0]]
        w = img.shape[2]  # THWC
        if l > 0 and r > 0:
            img = img[:, :, round(w * l):round(w * r)]
        return img

    def __len__(self):
        return self.video_clips.num_clips()

    def compute_clip_times(self, video_idx, clip_idx):
        video_path = self.video_clips.video_paths[video_idx]
        video_path = os.path.join(self.fails_path,
                                  os.path.sep.join(video_path.rsplit(os.path.sep, 2)[-2:]))
        clip_pts = self.video_clips.clips[video_idx][clip_idx]
        start_pts = clip_pts[0].item()
        end_pts = clip_pts[-1].item()
        t_unit = av.open(video_path, metadata_errors='ignore').streams[0].time_base
        t_start = float(t_unit * start_pts)
        t_end = float(t_unit * end_pts)
        return t_start, t_end

    def __getitem__(self, idx):
        if self.load_flow:
            video_idx, clip_idx = self.video_clips.get_clip_location(idx)
            video_path = self.video_clips.video_paths[video_idx]
            video_path = os.path.join(self.fails_path,
                                      os.path.sep.join(video_path.rsplit(os.path.sep, 2)[-2:]))
            label = self.video_clips.labels[video_idx][clip_idx]
            flow_path = os.path.join(
                self.fails_flow_path,
                os.path.sep.join(os.path.splitext(video_path)[0].rsplit(os.path.sep, 2)[-2:]))
            t_start, t_end = self.compute_clip_times(video_idx, clip_idx)
            frame_start = round(t_start * self.FLOW_FPS)
            n_frames = round(self.clip_len * self.FLOW_FPS)
            flow = []
            for frame_i in range(frame_start, frame_start + n_frames):
                frame_fn = os.path.join(flow_path, f'{frame_i:06}.flo')
                try:
                    flow.append(torch.load(frame_fn,
                                           map_location=torch.device('cpu')).permute(1, 2, 0).data.numpy())
                except:
                    pass
            while len(flow) < n_frames:
                flow += flow
            flow = flow[:n_frames]
            flow = torch.Tensor(flow)
            flow = self.trim_borders(flow, video_path)
            if self.t is not None:
                flow = self.t(flow)
            return flow, label, (flow_path, t_start, t_end)
        else:
            video_idx, clip_idx = self.video_clips.get_clip_location(idx)
            if self.anticipate_label:
                assert not self.selfsup_loss, 'no anticipation with self supervision'
                video_path = self.video_clips.video_paths[video_idx]
                label = self.video_clips.labels[video_idx][clip_idx]
                idx -= round(self.anticipate_label / self.clip_step)
                new_video_idx, new_clip_idx = self.video_clips.get_clip_location(idx)
                video, *_ = self.video_clips.get_clip(idx)
                video = self.trim_borders(video, video_path)
                if self.t is not None:
                    video = self.t(video)
                new_t_start, new_t_end = self.compute_clip_times(new_video_idx, new_clip_idx)
                old_t_start, old_t_end = self.compute_clip_times(video_idx, clip_idx)
                if new_video_idx != video_idx or new_t_start > old_t_start:
                    label = -1
                return video, label, (video_path, new_t_start, new_t_end, [])
            video, audio, info, video_idx = self.video_clips.get_clip(idx)
            video_path = self.video_clips.video_paths[video_idx]
            # print(video_path)
            try:
                label = self.video_clips.labels[video_idx][clip_idx]
                # if self.anticipate_label:
                #     video_path = self.video_clips.video_paths[video_idx]
                #     t_fail = statistics.median(
                #         self.fails_data[os.path.splitext(os.path.basename(video_path))[0]]['t'])
                #     t_start, t_end = self.compute_clip_times(video_idx, clip_idx)
                #     t_start += self.anticipate_label
                #     t_end += self.anticipate_label
                #     label = 0
                #     if t_start <= t_fail <= t_end:
                #         label = 1
                #     elif t_start > t_fail:
                #         label = 2
            except:
                label = -1
            if label == 0 or self.fails_only:
                video = self.trim_borders(video, video_path)
            if self.debug_dataset:
                # video[:] = 0
                # video[..., 0] = 255
                pass
            if self.t is not None:
                video = self.t(video)
            t_start = t_end = -1
            if self.get_clip_times:
                t_start, t_end = self.compute_clip_times(video_idx, clip_idx)
            other = []
            if self.selfsup_loss == 'pred_middle' or self.selfsup_loss == 'sort' or self.selfsup_loss == 'ctc':
                k = round(self.clip_len / self.clip_step * self.clip_interval_factor)
                video_l = [video]
                try:
                    pvideo, paudio, pinfo, pvideo_idx = self.video_clips.get_clip(idx - k)
                except:
                    pvideo_idx = -1
                try:
                    nvideo, naudio, ninfo, nvideo_idx = self.video_clips.get_clip(idx + k)
                except:
                    nvideo_idx = -1
                t_start, _ = self.compute_clip_times(*self.video_clips.get_clip_location(idx))
                try:
                    p_t_start, _ = self.compute_clip_times(*self.video_clips.get_clip_location(idx - k))
                except:
                    p_t_start = 1000000000
                try:
                    n_t_start, _ = self.compute_clip_times(*self.video_clips.get_clip_location(idx + k))
                except:
                    n_t_start = -1000000000
                # if pvideo_idx == video_idx:
                #     assert p_t_start < t_start, f"{t_start} <= prev video time {p_t_start}"
                # if nvideo_idx == video_idx:
                #     assert t_start < n_t_start, f"{t_start} >= next video time {n_t_start}"
                if pvideo_idx == video_idx and p_t_start < t_start:
                    pvideo = self.trim_borders(pvideo, video_path)
                    if self.t is not None:
                        pvideo = self.t(pvideo)
                    video_l.insert(0, pvideo)
                else:
                    video_l.insert(0, torch.full_like(video, -1))
                if nvideo_idx == video_idx and t_start < n_t_start:
                    nvideo = self.trim_borders(nvideo, video_path)
                    if self.t is not None:
                        nvideo = self.t(nvideo)
                    video_l.append(nvideo)
                else:
                    video_l.append(torch.full_like(video, -1))
                video_l = torch.stack(video_l)
                video = video_l
                other = [nvideo_idx == video_idx and pvideo_idx == video_idx]
            if self.selfsup_loss == 'fps':
                other = [self.fps]
            other.append(idx)
            return video, label, (video_path, t_start, t_end, *other)
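# Usage sketch (illustrative; all paths and argument values are hypothetical, and the "PATH/TO/..."
# placeholders above must point at the real annotation files before this will run):
# dataset = KineticsAndFails(fails_path="/data/oops/scenes", kinetics_path="/data/kinetics",
#                            frames_per_clip=16, step_between_clips=4, fps=16,
#                            fails_only=True, val=False, local_rank=-1)
# video, label, (video_path, t_start, t_end, *other) = dataset[0]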