def test_video_clips_custom_fps(self):
    """Clips resampled to a custom frame rate keep their length and report that rate.

    Three generated videos of 12 frames each (native rates 3, 4 and 6 fps)
    are cut into clips of 4 frames with a step of 4 frames, once for every
    target frame rate.  Each returned clip must contain exactly 4 frames and
    its info dict must report the requested ``video_fps``.
    """
    clip_length = 4
    target_rates = [1, 3, 4, 10]
    with get_list_of_videos(num_videos=3, sizes=[12, 12, 12], fps=[3, 4, 6]) as video_list:
        for fps in target_rates:
            # Clip length and step between clips are deliberately the same.
            video_clips = VideoClips(video_list, clip_length, clip_length, fps)
            for clip_index in range(video_clips.num_clips()):
                frames, _audio, clip_info, _video_index = video_clips.get_clip(clip_index)
                self.assertEqual(frames.shape[0], clip_length)
                self.assertEqual(clip_info["video_fps"], fps)
def test_video_clips_custom_fps(self, tmpdir):
    """Clips resampled to a custom frame rate keep their length and report that rate.

    Three videos of 12 frames each (native rates 3, 4 and 6 fps) are
    generated under ``tmpdir`` and cut into clips of 4 frames with a step of
    4 frames, using two workers, once for every target frame rate.  Each
    returned clip must contain exactly 4 frames and its info dict must report
    the requested ``video_fps``.
    """
    clip_length = 4
    target_rates = [1, 3, 4, 10]
    video_list = get_list_of_videos(tmpdir, num_videos=3, sizes=[12, 12, 12], fps=[3, 4, 6])
    for fps in target_rates:
        # Clip length and step between clips are deliberately the same.
        video_clips = VideoClips(video_list, clip_length, clip_length, fps, num_workers=2)
        for clip_index in range(video_clips.num_clips()):
            frames, _audio, clip_info, _video_index = video_clips.get_clip(clip_index)
            assert frames.shape[0] == clip_length
            assert clip_info["video_fps"] == fps
class Mice(VisionDataset):
    """Clip-level video classification dataset read from a class-per-folder tree.

    The sorted entries returned by ``list_dir(root)`` are the class names and
    ``make_dataset`` collects the ``(path, class_index)`` samples.  All sample
    videos are handed to a ``VideoClips`` object, so one dataset item is one
    clip of ``frames_per_clip`` frames.

    Args:
        root: directory holding one sub-directory per class.
        frames_per_clip: number of frames in every clip.
        step_between_clips: number of frames between consecutive clips.
        frame_rate: forwarded to ``VideoClips``.
        extensions: file extensions forwarded to ``make_dataset``.
        transform: optional callable applied to the clip's video frames.
        _precomputed_metadata: forwarded to ``VideoClips``.
        num_workers: forwarded to ``VideoClips``.
        _video_width, _video_height, _video_min_dimension, _audio_samples,
        _audio_channels: forwarded unchanged to ``VideoClips``.
    """

    def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None,
                 extensions=("mp4", ), transform=None, _precomputed_metadata=None,
                 num_workers=1, _video_width=0, _video_height=0,
                 _video_min_dimension=0, _audio_samples=0, _audio_channels=0):
        super(Mice, self).__init__(root)

        class_names = sorted(list_dir(root))
        index_of_class = {name: position for position, name in enumerate(class_names)}

        self.samples = make_dataset(self.root, index_of_class, extensions, is_valid_file=None)
        self.classes = class_names

        video_paths = [sample[0] for sample in self.samples]
        self.video_clips = VideoClips(
            video_paths,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            _audio_channels=_audio_channels,
        )
        self.transform = transform

    @property
    def metadata(self):
        """Metadata of the underlying ``VideoClips`` object."""
        return self.video_clips.metadata

    def __len__(self):
        """Total number of clips over all videos."""
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return ``(video, label, video_idx, clip_idx)`` for flat clip index ``idx``.

        ``video_idx`` and ``clip_idx`` are the values reported by
        ``VideoClips.get_clip_location``; ``label`` is the class index of the
        sample the clip was cut from.
        """
        frames, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        label = self.samples[video_idx][1]
        if self.transform is None:
            return frames, label, video_idx, clip_idx
        return self.transform(frames), label, video_idx, clip_idx
class MyVideoDataset(object):
    """Minimal clip dataset: every item is the (video, audio) pair of one clip.

    Clips are 16 frames long, start one frame apart and are resampled to
    15 fps by ``VideoClips``.
    """

    def __init__(self, video_paths):
        """Index all clips of the given video files."""
        self.video_clips = VideoClips(
            video_paths,
            clip_length_in_frames=16,
            frames_between_clips=1,
            frame_rate=15,
        )

    def __getitem__(self, idx):
        """Return ``(video, audio)`` of clip ``idx``; info and video index are dropped."""
        frames, sound, _info, _video_index = self.video_clips.get_clip(idx)
        return frames, sound

    def __len__(self):
        """Total number of clips."""
        return self.video_clips.num_clips()
def sliding_window(video_path, save_path, epoch_id, preprocess=None):
    """Cut a video into overlapping 300-frame clips, augment and save each one.

    Every clip (stride of one frame) is converted to a numpy array, each
    frame is passed through all callables in ``preprocess`` with one set of
    random augmentation parameters per clip, and the result is written to
    ``save_path`` as an ``.mp4`` at 30 fps.

    Args:
        video_path: path of the source video.
        save_path: directory the generated clips are written to.
        epoch_id: number mixed into the hash that names the output files.
        preprocess: optional iterable of callables invoked as
            ``p(frame, rotation=..., scale_factor=..., crop_scale=(y, x))``
            whose return value replaces the frame.  ``None`` (the default)
            means no preprocessing.

    Returns:
        list of the file names (not full paths) written, in clip order.
    """
    # A ``None`` default replaces the former mutable ``[]`` default argument.
    if preprocess is None:
        preprocess = []

    T = 300
    videoclips = VideoClips([video_path], clip_length_in_frames=T, frames_between_clips=1)
    num_clips = len(videoclips)

    filenames = []
    sample_count = -1
    for i in range(num_clips):
        # Chain of hashes used as a file-name generator.
        # NOTE: hash() of a str is salted per interpreter process unless
        # PYTHONHASHSEED is fixed, so names are not reproducible across runs.
        sample_count = hash(
            str(sample_count + 1 + epoch_id * (num_clips / T))) % ((sys.maxsize + 1) * 2)

        # create new preprocess values (one set per clip, shared by its frames)
        rnd = np.random.uniform(-1, 1)
        rotation = 5 * rnd
        scale_factor = np.random.uniform(0.8, 1.2)
        crop_scale_y = np.random.uniform(0.5, 1)
        crop_scale_x = np.random.uniform(0.5, 1)

        # Preprocess
        clip, _, _, _ = videoclips.get_clip(i)
        clip = clip.numpy()
        for f in range(len(clip)):
            for p in preprocess:
                clip[f] = p(clip[f],
                            rotation=rotation,
                            scale_factor=scale_factor,
                            crop_scale=(crop_scale_y, crop_scale_x))
        clip = torch.tensor(clip)

        # Save
        filename = "{}.mp4".format(hex(sample_count))
        filepath = join(save_path, filename)
        torchvision.io.write_video(filepath, clip, 30)
        filenames.append(filename)
        print("{}, {}, {}/{}".format(filepath, epoch_id, i, num_clips))
    return filenames
class MyVideoDataset(data.Dataset):
    """Clip dataset over ``<root>/<name>.mp4`` files with one label per video.

    Clips of ``n_frames`` frames are taken back to back (the step equals the
    clip length) at ``fps`` frames per second.
    """

    def __init__(self, root, data_dirs, labels, n_frames=30, fps=5,
                 spatial_transform=None, temporal_transform=None,
                 random_slice_size=0):
        """Build the clip index.

        Args:
            root: directory containing the videos.
            data_dirs: video names without the ``.mp4`` extension.
            labels: labels, indexed by video index.
            n_frames: clip length and step between clips, in frames.
            fps: frame rate passed to ``VideoClips``.
            spatial_transform: optional callable applied after the temporal one.
            temporal_transform: optional callable applied first.
            random_slice_size: if truthy, ``T.RandomSlice`` of that size is
                applied before any other transform.
        """
        self.videos = [os.path.join(root, name + ".mp4") for name in data_dirs]
        self.labels = labels
        self.video_clips = VideoClips(
            self.videos,
            clip_length_in_frames=n_frames,
            frames_between_clips=n_frames,
            frame_rate=fps,
            num_workers=2,
        )
        self.spatial_transform = spatial_transform
        self.temporal_transform = temporal_transform
        self.data_mean = None
        self.data_std = None
        self.random_slice_size = random_slice_size

    def set_stats(self, mean, std):
        """Store the mean/std used to normalise clips in ``__getitem__``."""
        self.data_mean = mean
        self.data_std = std

    def __getitem__(self, idx):
        """Return ``(idx, video, label, video_idx)`` for clip ``idx``.

        Transform order: random slice, temporal, spatial, normalisation
        (the last only when both statistics have been set).
        """
        clip, _audio, _info, video_idx = self.video_clips.get_clip(idx)

        if self.random_slice_size:
            clip = T.RandomSlice(self.random_slice_size)(clip)
        for transform in (self.temporal_transform, self.spatial_transform):
            if transform is not None:
                clip = transform(clip)
        stats_missing = self.data_mean is None or self.data_std is None
        if not stats_missing:
            clip = T.Normalize(mean=self.data_mean, std=self.data_std)(clip)

        label = self.labels[video_idx]
        source_path = self.video_clips.video_paths[video_idx]
        print(video_idx, "--- ", source_path, "--- ", label)
        return idx, clip, label, video_idx

    def __len__(self):
        """Total number of clips."""
        return self.video_clips.num_clips()
class ClipDataset:
    """Dataset of video clips whose ``VideoClips`` index is cached on disk.

    The cache file name encodes the frame rate, clip length, stride, number
    of videos and a SHA-256 digest of the sorted, ``-``-joined video paths.
    """

    def __init__(self, video_paths, clip_length_in_frames, stride, frame_rate,
                 refresh, cache_dir):
        """Load the clip index from ``cache_dir`` or build and cache it.

        Args:
            video_paths: list of video file paths.
            clip_length_in_frames: number of frames per clip.
            stride: number of frames between consecutive clips.
            frame_rate: frame rate passed to ``VideoClips``.
            refresh: when truthy, ignore an existing cache file and rebuild.
            cache_dir: directory holding the pickled clip index.
        """
        self.frame_rate = frame_rate
        self.clip_length_in_frames = clip_length_in_frames
        self.stride = stride
        self.video_paths = video_paths

        joined_paths = '-'.join(sorted(video_paths))
        digest = hashlib.sha256(joined_paths.encode("utf-8")).hexdigest()
        cache_name = (
            f"fps-{frame_rate}-clip_length-{clip_length_in_frames}-stride{stride}"
            f"num-videos{len(video_paths)}-{digest}"
        )
        cache_file = Path(cache_dir) / cache_name

        use_cache = cache_file.exists() and not refresh
        if not use_cache:
            print("Building new video clips object")
            self.video_clips = VideoClips(
                frame_rate=frame_rate,
                video_paths=video_paths,
                frames_between_clips=stride,
                clip_length_in_frames=clip_length_in_frames,
            )
            cache_file.parent.mkdir(exist_ok=True, parents=True)
            print(f"Writing object to cache at {cache_file}")
            with open(cache_file, "wb") as handle:
                pickle.dump(self.video_clips, handle)
        else:
            print("Reloading cached clips object")
            with open(cache_file, "rb") as handle:
                self.video_clips = pickle.load(handle)

    def __getitem__(self, idx):
        """Return only the video frames of clip ``idx``."""
        frames, _audio, _info, _video_index = self.video_clips.get_clip(idx)
        return frames

    def __len__(self):
        """Total number of clips."""
        return self.video_clips.num_clips()
class VideoIter(data.Dataset):
    """Dataset of fixed-length, frame-strided clips read from raw video files.

    A clip spans ``clip_length * frame_stride`` consecutive frames (clips do
    not overlap) of which every ``frame_stride``-th frame is kept.  The label
    is ``0`` when the video path contains ``"Normal"`` and ``1`` otherwise.
    """

    def __init__(self,
                 clip_length,
                 frame_stride,
                 dataset_path=None,
                 annotation_path=None,
                 video_transform=None,
                 name="<NO_NAME>",
                 shuffle_list_seed=None,
                 single_load=False):
        """Index all clips.

        Args:
            clip_length: number of frames kept per clip.
            frame_stride: distance, in frames, between two kept frames.
            dataset_path: directory walked recursively for video files; takes
                precedence over ``annotation_path`` when not ``None``.
            annotation_path: a list of video paths, or one single video path,
                used when ``dataset_path`` is ``None``.
            video_transform: optional callable applied to every clip.
            name: label used only in the log message.
            shuffle_list_seed: seed of the RNG that picks a replacement index
                when loading a clip fails (a falsy value means seed ``0``).
            single_load: when truthy, videos are indexed one by one with a
                progress bar and merged into a single ``VideoClips`` object.
        """
        super(VideoIter, self).__init__()

        self.dataset_path = dataset_path
        self.frames_stride = frame_stride
        self.video_transform = video_transform
        self.rng = np.random.RandomState(
            shuffle_list_seed if shuffle_list_seed else 0)

        # load video list
        if dataset_path is not None:
            self.video_list = self._get_video_list(
                dataset_path=self.dataset_path)
        elif isinstance(annotation_path, list):
            # Was ``type(annotation_path) == list()``, which compares a type
            # with an empty list and is never true, so a list of paths ended
            # up wrapped in a second list.
            self.video_list = annotation_path
        else:
            self.video_list = [annotation_path]

        self.total_clip_length_in_frames = clip_length * frame_stride

        if single_load:
            print("loading each file at a time")
            self.video_clips = VideoClips(
                video_paths=[self.video_list[0]],
                clip_length_in_frames=self.total_clip_length_in_frames,
                frames_between_clips=self.total_clip_length_in_frames)
            with tqdm(total=len(self.video_list[1:]) + 1,
                      desc=' total % of videos loaded') as pbar1:
                for video_list_used in self.video_list[1:]:
                    print(video_list_used)
                    pbar1.update(1)
                    video_clips_out = VideoClips(
                        video_paths=[video_list_used],
                        clip_length_in_frames=self.total_clip_length_in_frames,
                        frames_between_clips=self.total_clip_length_in_frames)
                    # Merge the single-video index into the combined one by
                    # extending its per-video lists; cumulative sizes continue
                    # from the running total.
                    self.video_clips.clips.append(video_clips_out.clips[0])
                    self.video_clips.cumulative_sizes.append(
                        self.video_clips.cumulative_sizes[-1] +
                        video_clips_out.cumulative_sizes[0])
                    self.video_clips.resampling_idxs.append(
                        video_clips_out.resampling_idxs[0])
                    self.video_clips.video_fps.append(
                        video_clips_out.video_fps[0])
                    self.video_clips.video_paths.append(
                        video_clips_out.video_paths[0])
                    self.video_clips.video_pts.append(
                        video_clips_out.video_pts[0])
        else:
            print("single loader used")
            self.video_clips = VideoClips(
                video_paths=self.video_list,
                clip_length_in_frames=self.total_clip_length_in_frames,
                frames_between_clips=self.total_clip_length_in_frames)

        logging.info(
            "VideoIter:: iterator initialized (phase: '{:s}', num: {:d})".
            format(name, len(self.video_list)))

    def getitem_from_raw_video(self, idx):
        """Load clip ``idx`` and return ``(video, label, clip_idx, dir, file)``.

        ``dir`` is the name of the video's parent directory and ``file`` the
        part of the file name before its first ``.``.
        """
        # get current video info
        video, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]

        # keep every ``frames_stride``-th frame of the clip
        in_clip_frames = list(
            range(0, self.total_clip_length_in_frames, self.frames_stride))
        video = video[in_clip_frames]
        if self.video_transform is not None:
            video = self.video_transform(video)

        label = 0 if "Normal" in video_path else 1

        folder, stem = video_path.split(os.sep)[-2:]
        stem = stem.split('.')[0]

        return video, label, clip_idx, folder, stem

    def __len__(self):
        """Total number of clips."""
        return len(self.video_clips)

    def __getitem__(self, index):
        """Return the clip at ``index``; on any error retry with a random index.

        The loop only ends once a clip has been loaded successfully.
        """
        succ = False
        while not succ:
            try:
                clip_input, label, sampled_idx, folder, stem = self.getitem_from_raw_video(
                    index)
                succ = True
            except Exception as e:
                index = self.rng.choice(range(0, self.__len__()))
                logging.warning(
                    "VideoIter:: ERROR!! (Force using another index:\n{})\n{}".
                    format(index, e))

        return clip_input, label, sampled_idx, folder, stem

    @staticmethod
    def _get_video_list(dataset_path):
        """Return the paths of all files found recursively under ``dataset_path``."""
        assert os.path.exists(
            dataset_path), "VideoIter:: failed to locate: `{}'".format(
                dataset_path)
        vid_list = []
        for path, subdirs, files in os.walk(dataset_path):
            for name in files:
                vid_list.append(os.path.join(path, name))
        return vid_list
class Kinetics400(VisionDataset):
    """
    `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
    dataset, returning two transformed views of every clip.

    Kinetics-400 is an action recognition video dataset.
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.  The clip metadata is
    cached in ``<root>/kinetics_metadata_<split>.pt``, where ``<split>`` is the last path
    component of ``root``; it is loaded when the file exists and written otherwise.

    Args:
        root (string): Root directory of the Kinetics-400 Dataset.
        frames_per_clip (int): number of frames in a clip
        step_between_clips (int): number of frames between each clip
        frame_rate: forwarded to VideoClips.
        extensions (tuple): file extensions forwarded to ``make_dataset``.
        transform (mapping): must provide a ``'video'`` and an ``'audio'`` callable.
            ``__getitem__`` indexes it unconditionally, so the ``None`` default only
            works as long as no item is fetched.
        num_workers (int): forwarded to VideoClips.

    Returns:
        tuple: ``((video_q, video_k), (audio_q, audio_k))`` -- each pair holds two
        independent applications of the matching transform to the same clip.
        No label is returned.
    """

    def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None,
                 extensions=('avi', ), transform=None, num_workers=1,
                 _video_width=0, _video_height=0, _video_min_dimension=0,
                 _audio_samples=0):
        super(Kinetics400, self).__init__(root)

        classes = sorted(list_dir(root))
        class_to_idx = {name: i for i, name in enumerate(classes)}
        self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
        self.classes = classes
        video_list = [x[0] for x in self.samples]

        # Strip trailing slashes *before* taking the last component; the former
        # ``root.split('/')[-1].strip('/')`` produced an empty split name for
        # roots such as ``"data/train/"``.
        split = root.rstrip('/').split('/')[-1]
        metadata_filepath = os.path.join(
            root, 'kinetics_metadata_{}.pt'.format(split))

        if os.path.exists(metadata_filepath):
            metadata = torch.load(metadata_filepath)
        else:
            metadata = None

        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
        )
        self.transform = transform

        # Cache freshly computed metadata for the next run.
        if not os.path.exists(metadata_filepath):
            torch.save(self.video_clips.metadata, metadata_filepath)

    @property
    def metadata(self):
        """Metadata of the underlying VideoClips object."""
        return self.video_clips.metadata

    def __len__(self):
        """Total number of clips."""
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return two video views and two audio views of clip ``idx``."""
        video, audio, info, video_idx = self.video_clips.get_clip(idx)

        # The transforms are applied twice on purpose: each call yields its
        # own view of the same clip.
        video_q = self.transform['video'](video)
        video_k = self.transform['video'](video)
        audio_q = self.transform['audio'](audio)
        audio_k = self.transform['audio'](audio)

        return (video_q, video_k), (audio_q, audio_k)
class Dataset(VisionDataset):
    """Clip dataset whose clips are labelled relative to an annotated failure time.

    ``*.mp4`` files below ``<datapath>/train`` (or ``<datapath>/val`` when
    ``val`` is true) are filtered against the annotation file, cut into
    clips by ``VideoClips`` and labelled per video as a run of ``0`` s, then
    ``1`` s starting at the first clip that contains the failure time, then
    ``2`` s starting at the first following clip that begins after it.

    Both the unfiltered and the filtered ``VideoClips`` objects are cached
    with ``torch.save`` inside the split directory.

    NOTE(review): ``VisionDataset.__init__`` is never called, so attributes
    it would set (e.g. ``root``) do not exist on instances -- confirm this
    is intended.
    """

    def __init__(self, datapath, annotations_path, transforms,
                 cached_all_train_data_name='cached_all_train_data.pt',
                 cached_valid_train_data_name='cached_valid_train_data.pt',
                 cached_all_val_data_name='cached_all_val_data.pt',
                 cached_valid_val_data_name='cached_valid_val_data.pt',
                 get_video_wise=False, val=False, fps=None,
                 frames_per_clip=None, step_between_clips=None, start_id=0):
        """Build (or load from cache) the clip index and the clip labels.

        Args:
            datapath: directory containing the ``train`` and ``val`` folders.
            annotations_path: JSON file mapping a video's base name to a dict
                with at least the keys ``'t'``, ``'rel_t'`` and ``'len'``.
            transforms: callable applied to every clip (may be ``None``).
            cached_*_data_name: cache file names inside the split folder.
            get_video_wise: if true, one item is all clips of one video.
            val: use the ``val`` split instead of ``train``.
            fps: target frame rate, ``16`` when ``None``.
            frames_per_clip: clip length in frames, ``fps`` when ``None``.
            step_between_clips: step between clips in *seconds*
                (``0.25`` when ``None``); converted to frames with ``fps``.
            start_id: offset added to every requested index.
        """
        self.get_video_wise = get_video_wise
        self.start_id = start_id
        self.transforms = transforms

        # Load annotations = fails_data (in original file)
        with open(annotations_path) as f:
            self.annotations = json.load(f)

        # Resolve clip geometry
        if fps is None:
            fps = 16
        if frames_per_clip is None:
            frames_per_clip = fps
        if step_between_clips is None:
            step_between_clips = int(fps * 0.25)  # FPS X seconds = frames
        else:
            step_between_clips = int(fps * step_between_clips)  # FPS X seconds = frames

        # The train and val pipelines differ only in folder, cache names and
        # the wording of the progress messages.
        if val:
            split_name, split_title = 'val', 'Val'
            all_cache_name = cached_all_val_data_name
            valid_cache_name = cached_valid_val_data_name
        else:
            split_name, split_title = 'train', 'train'
            all_cache_name = cached_all_train_data_name
            valid_cache_name = cached_valid_train_data_name

        self.video_clips = self._load_valid_video_clips(
            split_dir=os.path.join(datapath, split_name),
            all_cache_name=all_cache_name,
            valid_cache_name=valid_cache_name,
            split_name=split_name,
            split_title=split_title,
            frames_per_clip=frames_per_clip,
            step_between_clips=step_between_clips,
            fps=fps,
            val=val,
        )

        # Load borders.json : LATER

        # Generate all mini-clips of size frames_per_clip from all video clips
        print('\nGenerating VALID mini-clips and labels from',
              len(self.video_clips.clips), 'videos... VAL=', val)
        self.video_clips.compute_clips(frames_per_clip, step_between_clips, fps)
        self._assign_clip_labels()

        # Leaving the part: balance_fails_only (I dunno what this is!!)
        print('\nNumber of CLIPS generated:', self.video_clips.num_clips())

    def _load_valid_video_clips(self, split_dir, all_cache_name, valid_cache_name,
                                split_name, split_title, frames_per_clip,
                                step_between_clips, fps, val):
        """Return the ``VideoClips`` of all valid videos of one split, using caches."""
        valid_cache = os.path.join(split_dir, valid_cache_name)
        if os.path.exists(valid_cache):
            video_clips = torch.load(valid_cache)
            print('\nLoaded Valid %s data from cache...' % split_title)
            return video_clips

        # Load all data of the split
        all_cache = os.path.join(split_dir, all_cache_name)
        if os.path.exists(all_cache):
            self.all_video_clips = torch.load(all_cache)
            print('\nLoaded all %s data from cache...' % split_name)
        else:
            print('\nProcessing all %s data...' % split_name)
            all_video_list = glob(os.path.join(split_dir, '**', '*.mp4'), recursive=True)
            self.all_video_clips = VideoClips(all_video_list, frames_per_clip,
                                              step_between_clips, fps)
            torch.save(self.all_video_clips, all_cache)

        # Separate out all valid videos
        print('\nSEPARATING VALID VIDEOS... VAL=', val)
        valid_video_paths = []
        print('Computing all clips...')
        self.all_video_clips.compute_clips(frames_per_clip, step_between_clips, fps)
        all_clips = self.all_video_clips.clips
        for video_idx, _ in tqdm(enumerate(all_clips), total=len(all_clips)):
            video_path = self.all_video_clips.video_paths[video_idx]
            if self._is_valid_video(video_path):
                valid_video_paths.append(video_path)

        video_clips = VideoClips(valid_video_paths, frames_per_clip,
                                 step_between_clips, fps)
        torch.save(video_clips, valid_cache)
        print('Saved valid %s data in cache.' % split_name)
        return video_clips

    @staticmethod
    def _read_time_base(video_path):
        """Return the time base of the first stream; the container is closed again."""
        with av.open(video_path, metadata_errors='ignore') as container:
            return container.streams[0].time_base

    def _failure_time(self, video_path):
        """Return the upper median of the annotated ``'t'`` values of a video."""
        key = os.path.splitext(os.path.basename(video_path))[0]
        fail_times = sorted(self.annotations[key]['t'])
        return fail_times[len(fail_times) // 2]

    def _is_valid_video(self, video_path):
        """Tell whether a video is annotated, readable and within the accepted ranges."""
        key = os.path.splitext(os.path.basename(video_path))[0]
        # Ignore if annotation doesnt exist
        if key not in self.annotations:
            return False
        try:
            # Ignore if the file cannot be opened (e.g. moov atom error)
            self._read_time_base(video_path)
            # Ignore if video attribute doesnt qualify
            annotation = self.annotations[key]
            if self._failure_time(video_path) < 0:
                return False
            if not 0.01 <= statistics.median(annotation['rel_t']) <= 0.99:
                return False
            if annotation['len'] < 3.2 or annotation['len'] > 30:
                return False
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit during this long-running scan.
            return False
        return True

    def _assign_clip_labels(self):
        """Fill ``self.video_clips.labels`` with one list of clip labels per video."""
        self.video_clips.labels = []
        clips_per_video = self.video_clips.clips
        for video_idx, vid_clips in tqdm(enumerate(clips_per_video),
                                         total=len(clips_per_video)):
            video_path = self.video_clips.video_paths[video_idx]
            t_unit = self._read_time_base(video_path)
            t_fail = self._failure_time(video_path)

            prev_label = 0
            first_one_idx = len(vid_clips)
            first_two_idx = len(vid_clips)
            for clip_idx, clip in enumerate(vid_clips):  # clip == timestamps
                t_start = float(t_unit * clip[0].item())
                t_end = float(t_unit * clip[-1].item())
                label = 0
                if t_start <= t_fail <= t_end:
                    label = 1
                elif t_start > t_fail:
                    label = 2
                if label == 1 and prev_label == 0:
                    first_one_idx = clip_idx
                elif label == 2 and prev_label == 1:
                    first_two_idx = clip_idx
                    break
                prev_label = label

            num_clips = len(vid_clips)
            self.video_clips.labels.append(
                [0] * first_one_idx +
                [1] * (first_two_idx - first_one_idx) +
                [2] * (num_clips - first_two_idx))

    def __len__(self):
        """Number of videos after ``start_id`` (video-wise mode) or number of clips."""
        if self.get_video_wise:
            return len(self.video_clips.labels) - self.start_id
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return ``(video, labels)`` for a whole video or ``(clip, label)`` for one clip."""
        idx = self.start_id + idx
        if self.get_video_wise:
            # Return all clips of a single video; here idx is a video index.
            labels = self.video_clips.labels[idx]
            start_clip_id = sum(
                len(video_labels) for video_labels in self.video_clips.labels[:idx])
            end_clip_id = start_clip_id + len(labels)
            video = []
            # Separate loop variable: the original reused ``idx`` here.
            for clip_id in range(start_clip_id, end_clip_id):
                clip, _, _, _ = self.video_clips.get_clip(clip_id)
                if self.transforms:
                    clip = self.transforms(clip)
                clip = clip.permute(1, 0, 2, 3)
                video.append(clip.unsqueeze(0))
            video = torch.cat(video, dim=0)
            return video, labels

        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.video_clips.labels[video_idx][clip_idx]
        if self.transforms is not None:
            video = self.transforms(video)
        video = video.permute(1, 0, 2, 3)
        return video, label
class VideoDataset(data.Dataset):
    """Clip dataset with temporal-boundary training targets.

    Every clip spans ``num_videoframes * skip_videoframes`` frames of which
    every ``skip_videoframes``-th frame is kept; consecutive clips start
    ``dist_videoframes`` frames apart.  In ``"train"`` mode only clips that
    overlap at least one annotated segment are used and each item carries
    action/start/end match scores; otherwise every clip is returned together
    with its video key and snippet frame numbers.
    """

    def __init__(self, opt, transforms, subset, fraction=1.):
        """Read annotations and file list, then build or load the clip index.

        The file named by ``opt['<subset>_video_file_list']`` holds one
        ``/path/to/mp4 key-to-df`` pair per line; the key (minus its last
        four characters) must occur in the ``video`` column of the CSV.

        Args:
            opt: mapping providing ``video_info``, ``mode``,
                ``boundary_ratio``, ``skip_videoframes``, ``num_videoframes``,
                ``dist_videoframes``, ``fps`` and
                ``<subset>_video_file_list``.
            transforms: callable applied to the sub-sampled clip.
            subset: ``'train'``, ``'val'`` or ``'full'``.
            fraction: in train mode, fraction (< 1) of the valid items kept.
        """
        self.subset = subset
        self.video_info_path = opt["video_info"]
        self.mode = opt["mode"]
        self.boundary_ratio = opt['boundary_ratio']
        self.skip_videoframes = opt['skip_videoframes']
        self.num_videoframes = opt['num_videoframes']
        self.dist_videoframes = opt['dist_videoframes']
        self.fraction = fraction

        subset_translate = {'train': 'training', 'val': 'validation'}
        self.anno_df = pd.read_csv(self.video_info_path)
        print(self.anno_df)
        print(subset, subset_translate.get(subset))
        if subset != 'full':
            self.anno_df = self.anno_df[self.anno_df.subset ==
                                        subset_translate[subset]]
            print(self.anno_df)

        file_loc = opt['%s_video_file_list' % subset]
        with open(file_loc, 'r') as f:
            lines = [k.strip() for k in f.readlines()]

        file_list = [k.split(' ')[0] for k in lines]
        keys_list = [k.split(' ')[1][:-4] for k in lines]
        print(keys_list[:5])
        # Build the lookup set once instead of once per key.
        annotated_keys = set(self.anno_df.video.unique())
        valid_key_indices = [num for num, k in enumerate(keys_list)
                             if k in annotated_keys]
        self.keys_list = [keys_list[num] for num in valid_key_indices]
        self.file_list = [file_list[num] for num in valid_key_indices]
        print('Number of indices: ', len(valid_key_indices), subset)

        video_info_dir = '/'.join(self.video_info_path.split('/')[:-1])
        clip_length_in_frames = self.num_videoframes * self.skip_videoframes
        frames_between_clips = self.dist_videoframes
        saved_video_clips = os.path.join(
            video_info_dir, 'video_clips.%s.%df.%ds.pkl' %
            (subset, clip_length_in_frames, frames_between_clips))
        if os.path.exists(saved_video_clips):
            print('Path Exists for video_clips: ', saved_video_clips)
            # NOTE: pickle executes code on load; only use trusted caches.
            with open(saved_video_clips, 'rb') as f:
                self.video_clips = pickle.load(f)
        else:
            print('Path does NOT exist for video_clips: ', saved_video_clips)
            self.video_clips = VideoClips(
                self.file_list,
                clip_length_in_frames=clip_length_in_frames,
                frames_between_clips=frames_between_clips,
                frame_rate=opt['fps'])
            # ``with`` guarantees the cache is flushed and closed.
            with open(saved_video_clips, 'wb') as f:
                pickle.dump(self.video_clips, f)
        print('Length of vid clips: ', self.video_clips.num_clips(),
              self.subset)

        if self.mode == "train":
            self.datums = self._retrieve_valid_datums()
            self.datum_indices = list(range(len(self.datums)))
            if fraction < 1:
                print('DOING the subset dataset on %s ...' % subset)
                self._subset_dataset(fraction)
            print('Len of %s datums: ' % subset, len(self.datum_indices))

        self.transforms = transforms

    def _subset_dataset(self, fraction):
        """Keep a random ``fraction`` of the datum indices (uses ``random.shuffle``)."""
        num_datums = int(len(self.datums) * fraction)
        self.datum_indices = list(range(len(self.datums)))
        random.shuffle(self.datum_indices)
        self.datum_indices = self.datum_indices[:num_datums]
        print('These indices: ', len(self.datum_indices), num_datums,
              len(self.datums))
        print(sorted(self.datum_indices)[:10])
        print(sorted(self.datum_indices)[-10:])

    def __len__(self):
        """Number of usable datums in train mode, number of clips otherwise."""
        if self.mode == 'train':
            return len(self.datum_indices)
        return self.video_clips.num_clips()

    def _retrieve_valid_datums(self):
        """Return ``(flat_index, anchor_xmins, anchor_xmaxs, gt_bbox)`` per usable clip.

        The list is cached as a pickle next to the annotation file; the
        cache name encodes the subset, the clip geometry and the clip count.
        """
        video_info_dir = '/'.join(self.video_info_path.split('/')[:-1])
        num_clips = self.video_clips.num_clips()
        saved_data_path = os.path.join(
            video_info_dir, 'saved.%s.nf%d.sf%d.df%d.vid%d.pkl' %
            (self.subset, self.num_videoframes, self.skip_videoframes,
             self.dist_videoframes, num_clips))
        print(saved_data_path)
        if os.path.exists(saved_data_path):
            print('Got saved data.')
            with open(saved_data_path, 'rb') as f:
                return pickle.load(f)

        ret = []
        for flat_index in range(num_clips):
            video_idx, clip_idx = self.video_clips.get_clip_location(
                flat_index)
            start_frame = clip_idx * self.dist_videoframes
            snippets = [
                start_frame + self.skip_videoframes * i
                for i in range(self.num_videoframes)
            ]
            key = self.keys_list[video_idx]
            training_anchors = self._get_training_anchors(snippets, key)
            if not training_anchors:
                continue

            anchor_xmins, anchor_xmaxs, gt_bbox = training_anchors
            ret.append((flat_index, anchor_xmins, anchor_xmaxs, gt_bbox))

        print('Size of data: ', len(ret), flush=True)
        with open(saved_data_path, 'wb') as f:
            pickle.dump(ret, f)
        print('Dumped data...')
        return ret

    def __getitem__(self, index):
        """Return the training tuple or the inference tuple for ``index``.

        Train mode: ``(video_data, match_score_action, match_score_start,
        match_score_end)``.  Other modes: ``(flat_index, video_data,
        video_name, snippets)``.
        """
        # The video_data retrieved has shape [nf * sf, w, h, c].
        # We want to pick every sf'th frame out of that.
        if self.mode == "train":
            datum_index = self.datum_indices[index]
            flat_index, anchor_xmin, anchor_xmax, gt_bbox = self.datums[
                datum_index]
        else:
            flat_index = index

        video, _, _, video_idx = self.video_clips.get_clip(flat_index)

        video_data = video[0::self.skip_videoframes]
        print('Bef transform: ', video_data, type(video_data))
        video_data = self.transforms(video_data)
        print('AFt transform: ', video_data, type(video_data))
        video_data = torch.transpose(video_data, 0, 1)

        if self.mode == "train":
            match_score_action, match_score_start, match_score_end = self._get_train_label(
                gt_bbox, anchor_xmin, anchor_xmax)
            return video_data, match_score_action, match_score_start, match_score_end

        # Snippets are only part of the non-train result.  The location is
        # looked up with the flat clip index (identical to ``index`` here).
        _, clip_idx = self.video_clips.get_clip_location(flat_index)
        start_frame = clip_idx * self.dist_videoframes
        snippets = [
            start_frame + self.skip_videoframes * i
            for i in range(self.num_videoframes)
        ]
        try:
            video_name = self.keys_list[video_idx]
        except Exception:
            print('Whoops: VideoReader ...', video_idx, len(self.keys_list),
                  index, flat_index)
            # Re-raise the real error; continuing used to fail one line
            # later with an unrelated UnboundLocalError on ``video_name``.
            raise
        return flat_index, video_data, video_name, snippets

    def _get_training_anchors(self, snippets, key):
        """Return ``(anchor_xmins, anchor_xmaxs, gt_bbox)`` or ``None``.

        Anchors are ``skip_videoframes``-wide windows centred on each
        snippet.  ``gt_bbox`` lists the annotated ``[startFrame, endFrame]``
        segments of video ``key`` whose ``ioa_with_anchors`` value against
        the whole clip span is positive; ``None`` is returned when there is
        none.

        Raises:
            RuntimeError: if the annotations hold no segment for ``key``.
        """
        tmp_anchor_xmins = np.array(snippets) - self.skip_videoframes / 2.
        tmp_anchor_xmaxs = np.array(snippets) + self.skip_videoframes / 2.
        tmp_gt_bbox = []
        tmp_ioa_list = []
        anno_df_video = self.anno_df[self.anno_df.video == key]
        gt_xmins = anno_df_video.startFrame.values[:]
        gt_xmaxs = anno_df_video.endFrame.values[:]
        if len(gt_xmins) == 0:
            print('Yo wat gt_xmins: ', key)
            # The original bare ``raise`` (no active exception) surfaced as
            # an uninformative RuntimeError; keep the type, add the cause.
            raise RuntimeError(
                'No annotated segments found for video key: %s' % key)

        for idx in range(len(gt_xmins)):
            tmp_ioa = ioa_with_anchors(gt_xmins[idx], gt_xmaxs[idx],
                                       tmp_anchor_xmins[0],
                                       tmp_anchor_xmaxs[-1])
            tmp_ioa_list.append(tmp_ioa)
            if tmp_ioa > 0:
                tmp_gt_bbox.append([gt_xmins[idx], gt_xmaxs[idx]])

        if len(tmp_gt_bbox) > 0:
            # NOTE: Removed the threshold of 0.9... ruh roh.
            return tmp_anchor_xmins, tmp_anchor_xmaxs, tmp_gt_bbox
        return None

    def _get_train_label(self, gt_bbox, anchor_xmin, anchor_xmax):
        """Return per-anchor action, start and end match scores as tensors.

        Start/end regions are centred on each segment boundary and are
        ``max(skip_videoframes, duration * boundary_ratio)`` wide; each score
        is the maximum ``ioa_with_anchors`` value over all segments.
        """
        gt_bbox = np.array(gt_bbox)
        gt_xmins = gt_bbox[:, 0]
        gt_xmaxs = gt_bbox[:, 1]
        # same as gt_len but using the thumos code repo :/.
        gt_duration = gt_xmaxs - gt_xmins
        gt_duration_boundary = np.maximum(self.skip_videoframes,
                                          gt_duration * self.boundary_ratio)
        gt_start_bboxs = np.stack((gt_xmins - gt_duration_boundary / 2,
                                   gt_xmins + gt_duration_boundary / 2),
                                  axis=1)
        gt_end_bboxs = np.stack((gt_xmaxs - gt_duration_boundary / 2,
                                 gt_xmaxs + gt_duration_boundary / 2),
                                axis=1)

        match_score_action = [
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], gt_xmins,
                                 gt_xmaxs)) for jdx in range(len(anchor_xmin))
        ]
        match_score_start = [
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1]))
            for jdx in range(len(anchor_xmin))
        ]
        match_score_end = [
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1]))
            for jdx in range(len(anchor_xmin))
        ]
        return torch.Tensor(match_score_action), torch.Tensor(
            match_score_start), torch.Tensor(match_score_end)
class VideoDataset(data.Dataset):
    """
    Process raw videos to get videoclips

    A clip spans ``clip_length * frame_stride`` consecutive frames (clips do
    not overlap) of which every ``frame_stride``-th frame is kept.  The label
    is ``0`` when the video path contains ``"Normal"`` and ``1`` otherwise.
    """

    def __init__(self,
                 clip_length,
                 frame_stride,
                 frame_rate=None,
                 dataset_path=None,
                 spatial_transform=None,
                 temporal_transform=None,
                 return_label=False,
                 video_formats=("avi", "mp4")):
        """Collect the video files and index their clips.

        Args:
            clip_length: number of frames kept per clip.
            frame_stride: distance, in frames, between two kept frames.
            frame_rate: frame rate passed to ``VideoClips``.
            dataset_path: directory walked recursively for video files.
            spatial_transform: optional callable applied after the temporal one.
            temporal_transform: optional callable applied first.
            return_label: stored on the instance; not used by this class.
            video_formats: accepted file extensions, with or without a
                leading dot.
        """
        super(VideoDataset, self).__init__()
        # video clip properties
        self.frames_stride = frame_stride
        self.total_clip_length_in_frames = clip_length * frame_stride
        self.spatial_transform = spatial_transform
        self.temporal_transform = temporal_transform
        # Copy so that instances never share (or mutate) the default value.
        self.video_formats = list(video_formats)
        # IO
        self.dataset_path = dataset_path
        self.video_list = self._get_video_list(dataset_path=self.dataset_path)
        self.return_label = return_label

        # data loading
        self.video_clips = VideoClips(video_paths=self.video_list,
                                      clip_length_in_frames=self.total_clip_length_in_frames,
                                      frames_between_clips=self.total_clip_length_in_frames,
                                      frame_rate=frame_rate)

    @property
    def video_count(self):
        """Number of video files found."""
        return len(self.video_list)

    def getitem_from_raw_video(self, idx):
        """Load clip ``idx`` and return ``(video, label, (clip_idx, dir, file))``.

        ``dir`` is the name of the video's parent directory and ``file`` the
        part of the file name before its first ``.``.
        """
        video, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]

        # keep every ``frames_stride``-th frame of the clip
        in_clip_frames = list(range(0, self.total_clip_length_in_frames, self.frames_stride))
        video = video[in_clip_frames]

        if self.temporal_transform:
            video = self.temporal_transform(video)

        if self.spatial_transform:
            video = self.spatial_transform(video)

        folder, stem = video_path.split(os.sep)[-2:]
        stem = stem.split('.')[0]

        label = 0 if "Normal" in video_path else 1

        return video, label, (clip_idx, folder, stem)

    def __len__(self):
        """Total number of clips."""
        return len(self.video_clips)

    def __getitem__(self, index):
        """Return the clip at ``index``; on any error retry with a random index.

        The loop only ends once a clip has been loaded successfully.
        """
        succ = False
        while not succ:
            try:
                batch = self.getitem_from_raw_video(index)
                succ = True
            except Exception as e:
                index = np.random.choice(range(0, self.__len__()))
                # Walk to the innermost frame so the logged line is where the
                # error was raised, not the ``try`` line of this method.
                trace_back = sys.exc_info()[2]
                while trace_back.tb_next is not None:
                    trace_back = trace_back.tb_next
                line = trace_back.tb_lineno
                logging.warning(f"VideoIter:: ERROR (line number {line}) !! (Force using another index:\n{index})\n{e}")

        return batch

    def _get_video_list(self, dataset_path):
        """Return all non-hidden files under ``dataset_path`` with an accepted extension.

        The former check was a substring test (``"mp4" in name``), which also
        accepted files such as ``mp4_notes.txt``.
        """
        assert os.path.exists(dataset_path), "VideoIter:: failed to locate: `{}'".format(dataset_path)

        suffixes = tuple('.' + fmt.lstrip('.') for fmt in self.video_formats)
        vid_list = []
        for path, subdirs, files in os.walk(dataset_path):
            for name in files:
                if name.startswith('.') or not name.endswith(suffixes):
                    continue
                vid_list.append(os.path.join(path, name))

        return vid_list
class Kinetics400Indexed(VisionDataset):
    """
    This class is similar to Kinetics400, but use a index file to build
    classes and samples, instead of building them from IO operation.

    The index is a JSON document with a ``'classes'`` entry and a
    ``'samples'`` entry of ``[relative_path, label]`` pairs; paths are joined
    onto ``root``.
    """

    def __init__(self, root, index_path, *, frames_per_clip, step_between_clips,
                 frame_rate, extensions=('mp4', ), transform=None,
                 _precomputed_metadata=None, num_workers=1, _video_width=0,
                 _video_height=0, _video_min_dimension=0, _audio_samples=0):
        super(Kinetics400Indexed, self).__init__(root)

        self.index_path = index_path
        with open(index_path) as index_file:
            index = json.load(index_file)

        # Classes and samples come straight from the index file rather than
        # from scanning ``root`` with ``list_dir`` / ``make_dataset``.
        self.classes = index['classes']
        self.samples = []
        for relative_path, label in index['samples']:
            self.samples.append((os.path.join(root, relative_path), label))

        video_paths = [sample[0] for sample in self.samples]
        self.video_clips = VideoClips(
            video_paths,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
        )
        self.transform = transform

    @property
    def metadata(self):
        """Metadata of the underlying VideoClips object."""
        return self.video_clips.metadata

    def __len__(self):
        """Total number of clips."""
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return a dict with ``video``, ``audio``, ``target`` and ``video_idx``.

        All warnings raised while decoding the clip are suppressed.
        """
        with warnings.catch_warnings():
            # ignore UserWarning: The pts_unit 'pts' gives wrong results and
            # will be removed in a follow-up version. Please use pts_unit 'sec'.
            warnings.simplefilter("ignore")
            frames, audio, _info, video_idx = self.video_clips.get_clip(idx)

        target = self.samples[video_idx][1]
        if self.transform is not None:
            frames = self.transform(frames)

        return dict(video=frames, audio=audio, target=target, video_idx=video_idx)
class VideoIterTrain(data.Dataset):
    """Training dataset yielding strided clips cut from a list of videos.

    The video list comes either from an annotation file (one relative path
    per line, resolved against ``dataset_path``) or, when ``dataset_path`` is
    ``None``, directly from ``annotation_path`` (a list of paths or a single
    path). A clip's label is 0 when its path contains ``"Normal"``, else 1.
    """

    def __init__(self,
                 dataset_path,
                 annotation_path,
                 clip_length,
                 frame_stride,
                 video_transform=None,
                 name="<NO_NAME>",
                 return_item_subpath=False,
                 shuffle_list_seed=None,
                 single_load=False):
        """Build the video list and index its clips with ``VideoClips``.

        Args:
            dataset_path: root directory of the videos, or ``None`` to take
                the video path(s) straight from ``annotation_path``.
            annotation_path: annotation file path; or, when ``dataset_path``
                is ``None``, a list of video paths or one video path.
            clip_length: number of frames returned per clip.
            frame_stride: step between the frames kept from a decoded clip.
            video_transform: optional callable applied to the strided clip.
            name: phase name used only in the log message.
            return_item_subpath: stored on the instance; not used here.
            shuffle_list_seed: seed of the RNG used to pick a fallback index;
                a falsy value means seed 0.
            single_load: when true, index each video with its own
                ``VideoClips`` and merge the results, instead of indexing
                the whole list in one call.
        """
        super(VideoIterTrain, self).__init__()

        self.force_color = True
        if dataset_path is not None:
            self.dataset_path = dataset_path
        self.frames_stride = frame_stride
        self.video_transform = video_transform
        self.return_item_subpath = return_item_subpath
        self.rng = np.random.RandomState(
            shuffle_list_seed if shuffle_list_seed else 0)

        # load video list
        if dataset_path is not None:
            self.video_list = self._get_video_list(
                dataset_path=self.dataset_path,
                annotation_path=annotation_path)
        elif isinstance(annotation_path, list):
            # The previous check `type(x) == list()` compared a type with an
            # empty list and was never true, so lists ended up nested.
            self.video_list = annotation_path
        else:
            self.video_list = [annotation_path]

        self.total_clip_length_in_frames = clip_length * frame_stride

        if single_load:
            print("loading each file at a time")
            self.video_clips = VideoClips(
                video_paths=[self.video_list[0]],
                clip_length_in_frames=self.total_clip_length_in_frames,
                frames_between_clips=self.total_clip_length_in_frames)
            remaining = self.video_list[1:]
            with tqdm(total=len(remaining) + 1,
                      desc=' total % of videos loaded') as pbar1:
                for video_list_used in remaining:
                    print(video_list_used)
                    pbar1.update(1)
                    video_clips_out = VideoClips(
                        video_paths=[video_list_used],
                        clip_length_in_frames=self.total_clip_length_in_frames,
                        frames_between_clips=self.total_clip_length_in_frames)
                    # Splice the single-video index into the merged one.
                    self.video_clips.clips.append(video_clips_out.clips[0])
                    self.video_clips.cumulative_sizes.append(
                        self.video_clips.cumulative_sizes[-1] +
                        video_clips_out.cumulative_sizes[0])
                    self.video_clips.resampling_idxs.append(
                        video_clips_out.resampling_idxs[0])
                    self.video_clips.video_fps.append(
                        video_clips_out.video_fps[0])
                    self.video_clips.video_paths.append(
                        video_clips_out.video_paths[0])
                    self.video_clips.video_pts.append(
                        video_clips_out.video_pts[0])
        else:
            print("single loader used")
            self.video_clips = VideoClips(
                video_paths=self.video_list,
                clip_length_in_frames=self.total_clip_length_in_frames,
                frames_between_clips=self.total_clip_length_in_frames)

        logging.info(
            "VideoIter:: iterator initialized (phase: '{:s}', num: {:d})".
            format(name, len(self.video_list)))

    def getitem_from_raw_video(self, idx):
        """Decode clip ``idx`` and return ``(video, label, clip_idx, dir, file)``.

        ``dir`` and ``file`` are the parent directory name and the file name
        (up to its first dot) of the source video.
        """
        video, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]
        logging.debug("VideoIter:: idx=%s video_idx=%s path=%s",
                      idx, video_idx, video_path)

        # Keep every `frames_stride`-th frame of the decoded clip.
        in_clip_frames = list(
            range(0, self.total_clip_length_in_frames, self.frames_stride))
        video = video[in_clip_frames]
        if self.video_transform is not None:
            video = self.video_transform(video)

        label = 0 if "Normal" in video_path else 1

        dir_name, file_name = video_path.split(os.sep)[-2:]
        file_name = file_name.split('.')[0]

        return video, label, clip_idx, dir_name, file_name

    def __getitem__(self, index):
        """Return the sample at ``index``, retrying on a random index on error."""
        while True:
            try:
                return self.getitem_from_raw_video(index)
            except Exception as e:
                index = self.rng.choice(range(0, self.__len__()))
                logging.warning(
                    "VideoIter:: ERROR!! (Force using another index:\n{})\n{}".
                    format(index, e))

    def __len__(self):
        # NOTE(review): this is the number of videos, while indices are
        # passed to `VideoClips.get_clip` as clip indices -- confirm whether
        # the clip count was intended.
        return len(self.video_list)

    def _get_video_list(self, dataset_path, annotation_path):
        """Read the annotation file and return the listed video paths.

        The first whitespace-separated token of every non-blank line is
        joined onto ``dataset_path``; blank lines are skipped.

        Raises:
            AssertionError: if either path does not exist.
        """
        assert os.path.exists(
            dataset_path
        ), "VideoIter:: failed to locate: `{}'".format(dataset_path)
        assert os.path.exists(
            annotation_path
        ), "VideoIter:: failed to locate: `{}'".format(annotation_path)

        vid_list = []
        with open(annotation_path, 'r') as f:
            for line in f:
                items = line.split()
                if not items:
                    continue
                vid_list.append(os.path.join(dataset_path, items[0]))
        return vid_list
class _SomethingSomethingV2Dataset:
    """Clip dataset built from a label-map JSON file and a labels JSON file.

    Each entry of the labels file supplies an ``id`` and a ``template``; the
    template (with square brackets removed) is looked up in the label map to
    obtain the integer class, and the video is expected at
    ``<video_dir>/<id>.webm``. Clip extraction is delegated to ``VideoClips``.
    """

    def __init__(
        self,
        video_dir,
        label_map_json,
        labels_json,
        frames_per_clip,
        step_between_clips=1,
        frame_rate=None,
        transform=None,
        _precomputed_metadata=None,
        num_workers=1,
        _video_width=0,
        _video_height=0,
        _video_min_dimension=0,
    ) -> None:
        """Validate the annotation files and index every listed video.

        Args:
            video_dir: directory containing the ``<id>.webm`` files.
            label_map_json: JSON file mapping label text to a class id.
            labels_json: JSON file listing samples with ``id``/``template``.
            frames_per_clip, step_between_clips, frame_rate: forwarded
                positionally to ``VideoClips``.
            transform: optional callable applied to each clip's video.
            _precomputed_metadata, num_workers, _video_width, _video_height,
                _video_min_dimension: forwarded to ``VideoClips`` unchanged.

        Raises:
            AssertionError: if an annotation file or a video is missing, or
                a sample's label is absent from the label map.
        """
        for data_file in [label_map_json, labels_json]:
            assert os.path.exists(
                data_file), f"Data file {data_file} is missing"

        with open(label_map_json, "r") as fp:
            label_map = json.load(fp)

        with open(labels_json, "r") as fp:
            samples = json.load(fp)

        self.samples = []
        for sample in samples:
            video_id = sample["id"]
            # Templates carry placeholders in square brackets; the label map
            # keys are looked up without the brackets.
            label = sample["template"].replace("[", "").replace("]", "")
            assert label in label_map, f"Unknown label: {label}"

            video_path = os.path.join(video_dir, f"{video_id}.webm")
            assert os.path.exists(video_path), f"{video_path} is missing"

            self.samples.append((video_path, int(label_map[label])))

        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
        )
        self.transform = transform

    @property
    def metadata(self):
        """Metadata exposed by the underlying ``VideoClips`` object."""
        return self.video_clips.metadata

    def __len__(self):
        """Total number of clips across all videos."""
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return ``(video, audio, label)`` for clip ``idx``."""
        video, audio, _info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label
class VideoIterVal(data.Dataset):
    """Validation dataset yielding strided clips cut from annotated videos.

    The annotation file lists one video path per line, relative to
    ``dataset_path``. A clip's label is 0 when its path contains
    ``"Normal"``, else 1.
    """

    def __init__(self,
                 dataset_path,
                 annotation_path,
                 clip_length,
                 frame_stride,
                 video_transform=None,
                 name="<NO_NAME>",
                 return_item_subpath=False,
                 shuffle_list_seed=None):
        """Read the annotation file and index the clips with ``VideoClips``.

        Args:
            dataset_path: root directory of the videos.
            annotation_path: text file listing one relative path per line.
            clip_length: number of frames returned per clip.
            frame_stride: step between the frames kept from a decoded clip.
            video_transform: optional callable applied to the decoded clip.
            name: phase name used only in the log message.
            return_item_subpath: stored on the instance; not used here.
            shuffle_list_seed: seed of the RNG used to pick a fallback index;
                a falsy value means seed 0.
        """
        super(VideoIterVal, self).__init__()

        # load params
        self.frames_stride = frame_stride
        self.dataset_path = dataset_path
        self.video_transform = video_transform
        self.return_item_subpath = return_item_subpath
        self.rng = np.random.RandomState(
            shuffle_list_seed if shuffle_list_seed else 0)

        # load video list
        self.video_list = self._get_video_list(dataset_path=self.dataset_path,
                                               annotation_path=annotation_path)
        self.total_clip_length_in_frames = clip_length * frame_stride
        self.video_clips = VideoClips(
            video_paths=self.video_list,
            clip_length_in_frames=self.total_clip_length_in_frames,
            frames_between_clips=self.total_clip_length_in_frames)

        logging.info(
            "VideoIter:: iterator initialized (phase: '{:s}', num: {:d})".
            format(name, len(self.video_list)))

    def getitem_from_raw_video(self, idx):
        """Decode clip ``idx`` and return ``(video, label, clip_idx, dir, file)``.

        ``dir`` and ``file`` are the parent directory name and the file name
        (up to its first dot) of the source video.
        """
        video, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]

        # NOTE(review): the transform runs before the frame sub-sampling
        # below, so the stride indexes the transform's output along its
        # first axis -- confirm the transform keeps time on that axis.
        if self.video_transform is not None:
            video = self.video_transform(video)

        label = 0 if "Normal" in video_path else 1

        dir_name, file_name = video_path.split(os.sep)[-2:]
        file_name = file_name.split('.')[0]

        in_clip_frames = list(
            range(0, self.total_clip_length_in_frames, self.frames_stride))
        return video[in_clip_frames], label, clip_idx, dir_name, file_name

    def __getitem__(self, index):
        """Return the sample at ``index``, retrying on a random index on error."""
        while True:
            try:
                return self.getitem_from_raw_video(index)
            except Exception as e:
                index = self.rng.choice(range(0, self.__len__()))
                logging.warning(
                    "VideoIter:: ERROR!! (Force using another index:\n{})\n{}".
                    format(index, e))

    def __len__(self):
        # NOTE(review): this is the number of videos, while indices are
        # passed to `VideoClips.get_clip` as clip indices -- confirm whether
        # the clip count was intended.
        return len(self.video_list)

    def _get_video_list(self, dataset_path, annotation_path):
        """Read the annotation file and return the listed video paths.

        The first whitespace-separated token of every non-blank line is
        joined onto ``dataset_path``; blank lines are skipped.

        Raises:
            AssertionError: if either path does not exist.
        """
        assert os.path.exists(
            dataset_path
        ), "VideoIter:: failed to locate: `{}'".format(dataset_path)
        assert os.path.exists(
            annotation_path
        ), "VideoIter:: failed to locate: `{}'".format(annotation_path)

        vid_list = []
        with open(annotation_path, 'r') as f:
            for line in f:
                items = line.split()
                if not items:
                    continue
                vid_list.append(os.path.join(dataset_path, items[0]))
        return vid_list
class VideoIter(data.Dataset):
    """Dataset yielding strided clips from every mp4 found under a directory.

    Videos whose name (up to the first dot) already appears among the files
    under ``FEATURES_PATH`` are left out of the video list.
    """

    # Directory whose file names mark videos to skip. Kept as the original
    # hard-coded location; override on a subclass or instance to change it.
    FEATURES_PATH = r'/Users/eitankosman/PycharmProjects/anomaly_features'

    def __init__(self,
                 clip_length,
                 frame_stride,
                 dataset_path=None,
                 video_transform=None,
                 return_label=False):
        """Collect the videos and index their clips with ``VideoClips``.

        Args:
            clip_length: number of frames returned per clip.
            frame_stride: step between the frames kept from a decoded clip.
            dataset_path: directory walked recursively for videos.
            video_transform: optional callable applied to the strided clip.
            return_label: when true, samples include a 0/1 label.
        """
        super(VideoIter, self).__init__()

        # video clip properties
        self.frames_stride = frame_stride
        self.total_clip_length_in_frames = clip_length * frame_stride
        self.video_transform = video_transform

        # IO
        self.dataset_path = dataset_path
        self.video_list = self._get_video_list(dataset_path=self.dataset_path)
        self.return_label = return_label

        # data loading
        self.video_clips = VideoClips(
            video_paths=self.video_list,
            clip_length_in_frames=self.total_clip_length_in_frames,
            frames_between_clips=self.total_clip_length_in_frames,
        )

    @property
    def video_count(self):
        """Number of videos in the list (not the number of clips)."""
        return len(self.video_list)

    def getitem_from_raw_video(self, idx):
        """Decode clip ``idx``.

        Returns ``(video, label, clip_idx, dir, file)`` when ``return_label``
        is set, otherwise ``(video, clip_idx, dir, file)``. The label is 0
        when the video path contains ``"Normal"``, else 1.
        """
        video, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]

        # Keep every `frames_stride`-th frame of the decoded clip.
        in_clip_frames = list(
            range(0, self.total_clip_length_in_frames, self.frames_stride))
        video = video[in_clip_frames]
        if self.video_transform is not None:
            video = self.video_transform(video)

        dir_name, file_name = video_path.split(os.sep)[-2:]
        file_name = file_name.split('.')[0]

        if self.return_label:
            label = 0 if "Normal" in video_path else 1
            return video, label, clip_idx, dir_name, file_name

        return video, clip_idx, dir_name, file_name

    def __len__(self):
        return len(self.video_clips)

    def __getitem__(self, index):
        """Return the sample at ``index``, retrying on a random index on error."""
        while True:
            try:
                return self.getitem_from_raw_video(index)
            except Exception as e:
                index = np.random.choice(range(0, self.__len__()))
                logging.warning(
                    "VideoIter:: ERROR!! (Force using another index:\n{})\n{}".
                    format(index, e))

    def _get_video_list(self, dataset_path):
        """Return paths of the videos under ``dataset_path`` still to process.

        A file is kept when its name contains ``mp4`` and its stem is not
        among the file stems found under ``FEATURES_PATH``. A missing or
        empty features directory simply means nothing is skipped.

        Raises:
            AssertionError: if ``dataset_path`` does not exist.
        """
        assert os.path.exists(
            dataset_path), "VideoIter:: failed to locate: `{}'".format(
                dataset_path)

        feature_stems = [
            file_name.split('.')[0]
            for _, _, files in os.walk(self.FEATURES_PATH)
            for file_name in files
        ]
        print(len(feature_stems))
        # Set lookup; the earlier np.concatenate raised ValueError when the
        # walk produced no directories at all.
        existing_features = set(feature_stems)

        vid_list = []
        skp = 0
        for path, _subdirs, files in os.walk(dataset_path):
            for name in files:
                if 'mp4' not in name:
                    continue
                if name.split('.')[0] in existing_features:
                    print(f"Skipping {name}")
                    skp += 1
                    continue
                vid_list.append(os.path.join(path, name))

        print(f"Skipped {skp}")
        return vid_list
class Kinetics400(VisionDataset):
    """
    `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
    dataset.

    Kinetics-400 is an action recognition video dataset.
    This dataset considers every video as a collection of video clips of
    fixed size, specified by ``frames_per_clip``, where the step in frames
    between each clip is given by ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if
    ``frames_per_clip=5`` and ``step_between_clips=5``, the dataset size will
    be (2 + 3) = 5, where the first two elements will come from video 1, and
    the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip``
    elements, so not all frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (string): Root directory of the Kinetics-400 Dataset.
        frames_per_clip (int): number of frames in a clip
        step_between_clips (int): number of frames between each clip
        frame_rate: forwarded to ``VideoClips``.
        extensions: file extensions passed to ``make_dataset``.
        transform (callable, optional): A function/transform that takes in a
            TxHxWxC video and returns a transformed version.
        cached: accepted but not used by this class.
        _precomputed_metadata: forwarded to ``VideoClips``.

    Returns:
        video (Tensor[T, H, W, C]): the `T` video frames
        audio(Tensor[K, L]): the audio frames, where `K` is the number of
            channels and `L` is the number of points
        label (int): class of the video clip
    """

    def __init__(self, root, frames_per_clip, step_between_clips=1,
                 frame_rate=None, extensions=('mp4', ), transform=None,
                 cached=None, _precomputed_metadata=None):
        super(Kinetics400, self).__init__(root)

        # One class per sub-directory of root, indexed in sorted order.
        classes = list(sorted(list_dir(root)))
        class_to_idx = {classes[i]: i for i in range(len(classes))}
        self.samples = make_dataset(self.root, class_to_idx, extensions,
                                    is_valid_file=None)
        self.classes = classes
        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
        )
        self.transform = transform

    def __len__(self):
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return ``(video, audio, label)``; on a decode error a random clip is used."""
        while True:
            try:
                video, audio, info, video_idx = self.video_clips.get_clip(idx)
                break
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C and
                # SystemExit still terminate this retry loop.
                print('skipped idx', idx)
                idx = np.random.randint(self.__len__())

        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label
class _MiniKinetics200Dataset:
    """Clip dataset built from a CSV-like list of ``video_id,class_name,class_label``.

    Each listed video is expected at
    ``<root>/<class name with '_' replaced by ' '>/<video_id>.<extension>``;
    entries whose file does not exist are silently left out. Clip extraction
    is delegated to ``VideoClips``.
    """

    def __init__(
        self,
        root,
        data_file,
        frames_per_clip,
        step_between_clips=1,
        frame_rate=None,
        extension="mp4",
        transform=None,
        _precomputed_metadata=None,
        num_workers=1,
        _video_width=0,
        _video_height=0,
        _video_min_dimension=0,
        _audio_samples=0,
        _audio_channels=0,
    ) -> None:
        """Read the data file and index every video that exists on disk.

        Args:
            root: directory holding one sub-directory per class.
            data_file: text file with one ``id,class_name,label`` per line.
            frames_per_clip, step_between_clips, frame_rate: forwarded
                positionally to ``VideoClips``.
            extension: file extension of the videos, without the dot.
            transform: optional callable applied to each clip's video.
            _precomputed_metadata, num_workers, _video_width, _video_height,
                _video_min_dimension, _audio_samples, _audio_channels:
                forwarded to ``VideoClips`` unchanged.

        Raises:
            AssertionError: if ``data_file`` does not exist.
            ValueError: if a non-blank line does not have exactly three
                comma-separated fields or its label is not an integer.
        """
        assert os.path.exists(data_file), f"Data file {data_file} is missing"

        self.samples = []
        with open(data_file, "r") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                video_id, class_name, class_label = line.split(",")
                class_name = class_name.replace("_", " ")
                video_path = os.path.join(
                    root,
                    class_name,
                    f"{video_id}.{extension}",
                )
                if os.path.exists(video_path):
                    self.samples.append([video_path, int(class_label)])

        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            _audio_channels=_audio_channels,
        )
        self.transform = transform

    @property
    def metadata(self):
        """Metadata exposed by the underlying ``VideoClips`` object."""
        return self.video_clips.metadata

    def __len__(self):
        """Total number of clips across all videos."""
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return ``(video, audio, label)`` for clip ``idx``."""
        video, audio, _info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label
def main(input_dir, output_dir):
    """Run pose estimation over every 16-frame clip and save the results.

    Reads ``annotations.csv`` from ``input_dir`` (first column: video file,
    second column: label), splits each video into non-overlapping clips,
    feeds each clip through the pose model, and writes one JSON file per
    clip under ``<output_dir>/data/<label>/<video name>/``. Finally appends
    one summary row per video directory to ``<output_dir>/annotations.csv``.
    """
    device = "cuda"
    config = load_config("config.json")

    annotations = pd.read_csv(join(input_dir, "annotations.csv"))
    annotations_out = join(output_dir, "annotations.csv")
    labels = list(annotations.iloc[:, 1])
    video_names = [
        join(input_dir, annotations.iloc[row, 0])
        for row in range(len(annotations))
    ]

    subset_size = 16
    videoclips = VideoClips(video_names,
                            clip_length_in_frames=subset_size,
                            frames_between_clips=subset_size)

    composed = Compose([
        FactorCrop(config["model"]["downsample"],
                   dest_size=config["dataset"]["image_size"]),
        RTPosePreprocessing(),
        ToRTPoseInput(0),
    ])

    model = PoseModel()
    model = model.to(device)
    model.load_state_dict(
        torch.load("model/PoseModel/weights/vgg19.pth",
                   map_location=torch.device(device)))

    counter = {}        # str(video index) -> index of the latest clip saved
    subpart_count = {}  # relative output dir -> (label, last index, clip size)

    for clip_no in range(len(videoclips)):
        vframes, _, info, video_idx = videoclips.get_clip(clip_no)
        label = labels[video_idx]
        video_name = basename(video_names[video_idx])
        video_dir = join(output_dir, "data", label, video_name)
        if not exists(video_dir):
            mkdir(video_dir)

        # First clip of a video gets 0, every later one increments.
        video_key = str(video_idx)
        counter[video_key] = counter.get(video_key, -1) + 1

        sample = composed({"data": vframes.numpy(), "type": "video"})
        vframes = sample["data"]
        # Drop references early to keep peak memory down.
        del sample

        with torch.no_grad():
            (branch1, branch2), _ = model(vframes.to(device))
        del vframes

        paf = branch1.data.cpu().numpy().transpose(0, 2, 3, 1)
        heatmap = branch2.data.cpu().numpy().transpose(0, 2, 3, 1)

        # Construct humans on every frame.
        frames = [
            paf_to_pose_cpp(heatmap[frame], paf[frame], config)
            for frame in range(len(paf))
        ]
        del paf
        del heatmap

        metadata = {
            "filename": video_names[video_idx],
            "body_part_translation": body_part_translation,
            "body_construction": body_part_construction,
            "label": labels[video_idx],
            "video_properties": info,
            "subpart": counter[video_key]
        }

        save_name = join(video_dir, str(counter[video_key]) + ".json")
        save_humans(save_name, frames, metadata)

        dir_name = join("data", label, video_name)
        first_label, last_index, size = subpart_count.get(
            dir_name, (label, -1, subset_size))
        subpart_count[dir_name] = (first_label, last_index + 1, size)
        print(save_name)

    with open(annotations_out, "a") as f:
        for key, (l, n, ss) in subpart_count.items():
            f.write("{},{},{},{}\n".format(key, l, n, ss))
class VideoDataset(VisionDataset):
    """Clip dataset over ``<root>/train`` or ``<root>/val`` with a cached clip index.

    The ``VideoClips`` index is loaded from a ``clips_*.pt`` file inside the
    split directory when it exists, and built and saved there otherwise.
    """

    def __init__(self, root, train, frames_per_clip=16, step_between_clips=1,
                 frame_rate=16, transform=None, extensions=('mp4',),
                 label_fn=lambda x, *_: x, local_rank=-1,
                 get_label_only=False):
        """Load or build the clip index for the selected split.

        Args:
            root: dataset root containing ``train`` and ``val`` directories.
            train: selects the ``train`` split when true, else ``val``.
            frames_per_clip, step_between_clips, frame_rate: clip parameters
                passed to ``VideoClips``; they are also part of the cache
                file name.
            transform: optional callable applied to each clip's video.
            extensions: file extensions searched recursively under the split.
            label_fn: bound to the instance as a method; called with the
                item index to produce the label returned by ``__getitem__``.
            local_rank: only ranks ``<= 0`` print the "generating" notice.
            get_label_only: when true, ``__getitem__`` skips decoding.
        """
        split = 'train' if train else 'val'
        root = os.path.join(root, split)
        self.root = root
        super().__init__(root)

        self.transform = transform
        # Takes the __getitem__ idx and returns auxiliary label information.
        self.label_fn = MethodType(label_fn, self)
        self.get_label_only = get_label_only

        cache_name = f'clips_{split}_{frames_per_clip}_{step_between_clips}_{frame_rate}.pt'
        clips_fn = os.path.join(root, cache_name)
        try:
            self.video_clips = torch.load(clips_fn)
        except FileNotFoundError:
            video_list = []
            for ext in extensions:
                for found in Path(root).rglob(f'*.{ext}'):
                    video_list.append(str(found))
            random.shuffle(video_list)
            if local_rank <= 0:
                print('Generating video clips file: ' + clips_fn)
            self.video_clips = VideoClips(
                video_list,
                frames_per_clip,
                step_between_clips,
                frame_rate,
                num_workers=32
            )
            torch.save(self.video_clips, clips_fn)

        # Per-video clip counts; `update_subset` zeroes entries to hide videos.
        self.video_clips.clip_sizes = torch.as_tensor(
            [len(video) for video in self.video_clips.clips])

    def __len__(self):
        """Total number of clips according to the clip index."""
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        """Return ``(video, placeholder, label)`` for item ``idx``.

        The middle element is always ``torch.Tensor([0])``; with
        ``get_label_only`` the video is replaced by the same placeholder.
        """
        if self.get_label_only:
            return torch.Tensor([0]), torch.Tensor([0]), self.label_fn(idx)

        try:
            # Takes in index w.r.t. the original clip sizes.
            video = self.video_clips.get_clip(idx)[0]
        except IndexError:
            # Work around an off-by-one in the VideoClips resampling indices:
            # pull the last index back by one and retry.
            vi, ci = self.video_clips.get_clip_location(idx)
            self.video_clips.resampling_idxs[vi][ci][-1] -= 1
            video = self.video_clips.get_clip(idx)[0]

        if self.transform is not None:
            video = self.transform(video)
        return video, torch.Tensor([0]), self.label_fn(idx)

    def update_subset(self, paths, path_transform=None):
        """Keep only videos whose (optionally transformed) path is in ``paths``.

        Videos outside the subset get a clip size of 0, and the cumulative
        sizes of the clip index are recomputed from the clip sizes.
        """
        wanted = set(paths)
        clips = self.video_clips
        for position, video_path in enumerate(clips.video_paths):
            if path_transform:
                video_path = path_transform(video_path)
            if video_path not in wanted:
                clips.clip_sizes[position] = 0
        clips.cumulative_sizes = clips.clip_sizes.cumsum(0).tolist()

    def use_partial_data(self, fraction):
        """Restrict the dataset to the first ``fraction`` of the video paths."""
        all_paths = self.video_clips.video_paths
        keep = round(fraction * len(all_paths))
        self.update_subset(all_paths[:keep])
class VideoIter(data.Dataset):
    """Dataset yielding strided clips from every mp4 found under a directory."""

    def __init__(self,
                 clip_length,
                 frame_stride,
                 dataset_path=None,
                 video_transform=None,
                 return_label=False):
        """Collect the videos and index their clips with ``VideoClips``.

        Args:
            clip_length: number of frames returned per clip.
            frame_stride: step between the frames kept from a decoded clip.
            dataset_path: directory walked recursively for videos.
            video_transform: optional callable applied to the strided clip.
            return_label: when true, samples include a 0/1 label.
        """
        super(VideoIter, self).__init__()

        # video clip properties
        self.frames_stride = frame_stride
        self.total_clip_length_in_frames = clip_length * frame_stride
        self.video_transform = video_transform

        # IO
        self.dataset_path = dataset_path
        self.video_list = self._get_video_list(dataset_path=self.dataset_path)
        self.return_label = return_label

        # data loading
        self.video_clips = VideoClips(
            video_paths=self.video_list,
            clip_length_in_frames=self.total_clip_length_in_frames,
            frames_between_clips=self.total_clip_length_in_frames,
        )

    @property
    def video_count(self):
        """Number of videos in the list (not the number of clips)."""
        return len(self.video_list)

    def getitem_from_raw_video(self, idx):
        """Decode clip ``idx``.

        Returns ``(video, label, clip_idx, dir, file)`` when ``return_label``
        is set, otherwise ``(video, clip_idx, dir, file)``. The label is 0
        when the video path contains ``"Normal"``, else 1.
        """
        video, _, _, _ = self.video_clips.get_clip(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]

        # Keep every `frames_stride`-th frame of the decoded clip.
        in_clip_frames = list(
            range(0, self.total_clip_length_in_frames, self.frames_stride))
        video = video[in_clip_frames]
        if self.video_transform is not None:
            video = self.video_transform(video)

        dir_name, file_name = video_path.split(os.sep)[-2:]
        file_name = file_name.split('.')[0]

        if self.return_label:
            label = 0 if "Normal" in video_path else 1
            return video, label, clip_idx, dir_name, file_name

        return video, clip_idx, dir_name, file_name

    def __len__(self):
        return len(self.video_clips)

    def __getitem__(self, index):
        """Return the sample at ``index``, retrying on a random index on error.

        Every failure is logged with the line number at which the exception
        was raised.
        """
        while True:
            try:
                return self.getitem_from_raw_video(index)
            except Exception as e:
                index = np.random.choice(range(0, self.__len__()))
                # The first traceback entry is this method's own `try` line;
                # follow the chain to the frame that actually raised.
                trace_back = e.__traceback__
                while trace_back.tb_next is not None:
                    trace_back = trace_back.tb_next
                line = trace_back.tb_lineno
                logging.warning(
                    f"VideoIter:: ERROR (line number {line}) !! (Force using another index:\n{index})\n{e}"
                )

    def _get_video_list(self, dataset_path):
        """Return the path of every file under ``dataset_path`` whose name contains ``mp4``.

        Raises:
            AssertionError: if ``dataset_path`` does not exist.
        """
        assert os.path.exists(
            dataset_path), "VideoIter:: failed to locate: `{}'".format(
                dataset_path)

        vid_list = []
        for path, _subdirs, files in os.walk(dataset_path):
            for name in files:
                if 'mp4' not in name:
                    continue
                vid_list.append(os.path.join(path, name))
        return vid_list
class GymnasticsVideo(data.Dataset):
    """Dataset of frame-skipped clips from the mp4 files of one directory.

    The ``VideoClips`` index is cached as a pickle next to the videos and
    reused on later runs with the same video count and clip geometry.
    """

    def __init__(self,
                 transforms=None,
                 train=True,
                 test=False,
                 count_videos=-1,
                 count_clips=-1,
                 skip_videoframes=5,
                 num_videoframes=100,
                 dist_videoframes=50,
                 video_directory=None,
                 fps=5):
        """Index the videos and select the clips that will be served.

        Args:
            transforms: callable applied to every frame-skipped clip.
            train: stored on the instance; not otherwise used here.
            test: accepted but not used by this class.
            count_videos: use only the first N videos; ``<= 0`` means all.
            count_clips: use only the first N clips of each video; ``<= 0``
                means all.
            skip_videoframes: keep every N-th frame of a decoded clip.
            num_videoframes: number of frames kept per clip.
            dist_videoframes: frames between the starts of adjacent clips.
            video_directory: directory holding the mp4 files and the cache.
            fps: frame rate passed to ``VideoClips``.
        """
        self.train = train
        self.transforms = transforms
        self.video_directory = video_directory
        self.skip_videoframes = skip_videoframes
        self.num_videoframes = num_videoframes
        self.dist_videoframes = dist_videoframes

        self.video_files = sorted([
            os.path.join(video_directory, f)
            for f in os.listdir(video_directory)
            if f.endswith('mp4')
        ])
        if count_videos > 0:
            self.video_files = self.video_files[:count_videos]

        # Decode enough frames that skipping still leaves num_videoframes.
        clip_length_in_frames = self.num_videoframes * self.skip_videoframes
        frames_between_clips = self.dist_videoframes
        self.saved_video_clips = os.path.join(
            video_directory, 'video_clips.%dnf.%df.%ds.pkl' %
            (count_videos, clip_length_in_frames, frames_between_clips))

        if os.path.exists(self.saved_video_clips):
            print('Path Exists for video_clips: ', self.saved_video_clips)
            # NOTE: unpickling runs arbitrary code; only load caches this
            # program wrote itself.
            with open(self.saved_video_clips, 'rb') as cache_file:
                self.video_clips = pickle.load(cache_file)
        else:
            print('Path does NOT exist for video_clips: ',
                  self.saved_video_clips)
            self.video_clips = VideoClips(
                self.video_files,
                clip_length_in_frames=clip_length_in_frames,
                frames_between_clips=frames_between_clips,
                frame_rate=fps)
            with open(self.saved_video_clips, 'wb') as cache_file:
                pickle.dump(self.video_clips, cache_file)

        self.datums = self._retrieve_valid_datums(count_videos, count_clips)
        print(self.datums)

    def __len__(self):
        return len(self.datums)

    def _retrieve_valid_datums(self, count_videos, count_clips):
        """Return ``(flat_index, video_idx, clip_idx)`` for every clip to serve.

        Iteration stops at the first clip whose video index reaches
        ``count_videos`` (when positive); clips whose per-video index reaches
        ``count_clips`` (when positive) are skipped.
        """
        num_clips = self.video_clips.num_clips()
        ret = []
        for flat_index in range(num_clips):
            video_idx, clip_idx = self.video_clips.get_clip_location(
                flat_index)
            if count_videos > 0 and video_idx >= count_videos:
                # We reached the max number of videos we want.
                break
            if count_clips > 0 and clip_idx >= count_clips:
                # We reached the max number of clips for this video.
                continue
            ret.append((flat_index, video_idx, clip_idx))

        return ret

    def __getitem__(self, index):
        """Return ``(video_data, index)`` for the ``index``-th selected clip."""
        flat_idx, _video_idx, _clip_idx = self.datums[index]
        video, _, _, _ = self.video_clips.get_clip(flat_idx)
        # Keep every skip_videoframes-th frame of the decoded clip.
        video_data = video[0::self.skip_videoframes]
        # Original author's note: the transform yields
        # [ch, num_videoframes, 64, 64] -- TODO confirm against the transform.
        video_data = self.transforms(video_data)
        # Swap the first two axes of the transformed clip.
        video_data = torch.transpose(video_data, 0, 1)
        return video_data, index
class KineticsAndFails(VisionDataset):
    """Clip dataset over "fails" videos, optionally mixed with Kinetics videos.

    Clips are indexed by a ``VideoClips`` object. When labelling is enabled,
    each clip of a fail video gets a label relative to the annotated failure
    time: ``0`` for clips before it, ``1`` for clips containing it and ``2``
    for clips after it; ``-1`` marks clips without a label.
    """

    # Frame rate used to turn clip times (seconds) into `.flo` frame indices.
    FLOW_FPS = 8

    def __init__(self,
                 fails_path,
                 kinetics_path,
                 frames_per_clip,
                 step_between_clips,
                 fps,
                 transform=None,
                 extensions=('.mp4', ),
                 video_clips=None,
                 fails_only=False,
                 val=False,
                 balance_fails_only=False,
                 get_clip_times=False,
                 fails_video_list=None,
                 fns_to_remove=None,
                 load_flow=False,
                 flow_histogram=False,
                 fails_flow_path=None,
                 all_fail_videos=False,
                 selfsup_loss=None,
                 clip_interval_factor=None,
                 labeled_fails=True,
                 debug_dataset=False,
                 anticipate_label=0,
                 data_proportion=1,
                 **kwargs):
        """Build (or adopt) the clip index and attach per-clip labels.

        Args:
            fails_path: Root directory of the fail videos; searched
                recursively for ``*.mp4`` unless ``fails_video_list`` is given.
            kinetics_path: Root directory of the Kinetics videos.
            frames_per_clip: Number of frames per clip.
            step_between_clips: Frame step between consecutive clips.
            fps: Frame rate the clips are resampled to.
            transform: Optional callable applied to every returned clip.
            extensions: Unused.
            video_clips: Pre-built ``VideoClips`` to reuse instead of scanning.
            fails_only: If true, no Kinetics videos are added.
            val: Validation mode; disables balancing, file removal and
                ``data_proportion`` sub-sampling.
            balance_fails_only: Oversample minority labels within each video.
            get_clip_times: Report clip start/end times from ``__getitem__``.
            fails_video_list: Explicit list of fail video paths.
            fns_to_remove: Video base names whose clips are dropped (training
                only, capped at roughly a quarter of the files).
            load_flow: Serve precomputed optical flow instead of RGB frames.
            flow_histogram: Stored on the instance; not otherwise used here.
            fails_flow_path: Root directory of the flow files.
            all_fail_videos: Keep fail videos that have no annotation (their
                clips are labelled ``-1``) instead of filtering them out.
            selfsup_loss: ``'pred_middle'``, ``'sort'`` or ``'ctc'`` return a
                (previous, current, next) clip triplet; ``'fps'`` returns the
                frame rate as extra info.
            clip_interval_factor: Scales the distance, in clips, to the
                neighbours used by the triplet losses.
            labeled_fails: Enables the labelling pass.
            debug_dataset: Stored on the instance.
            anticipate_label: If non-zero, seconds by which the returned clip
                precedes the clip whose label is returned.
            data_proportion: Fraction of videos kept (fixed-seed sample).
            **kwargs: ``local_rank`` (optional, default 0) gates the summary
                print after file removal.
        """
        self.clip_len = frames_per_clip / fps
        self.clip_step = step_between_clips / fps
        self.clip_interval_factor = clip_interval_factor
        self.fps = fps
        self.t = transform
        self.load_flow = load_flow
        self.flow_histogram = flow_histogram
        self.video_clips = None
        self.fails_path = fails_path
        self.fails_flow_path = fails_flow_path
        self.selfsup_loss = selfsup_loss
        self.get_clip_times = get_clip_times
        self.anticipate_label = anticipate_label
        data_proportion = 1 if val else data_proportion

        if video_clips:
            self.video_clips = video_clips
        else:
            assert fails_path is None or fails_video_list is None
            video_list = fails_video_list or glob(
                os.path.join(fails_path, '**', '*.mp4'), recursive=True)
            if not fails_only:
                kinetics_cls = torch.load("PATH/TO/kinetics_classes.pt")
                kinetics_dist = torch.load("PATH/TO/dist.pt")
                # Every Kinetics class contributes a number of videos
                # proportional to the number of fail videos.
                s = len(video_list)
                for i, n in kinetics_dist.items():
                    n *= s
                    video_list += sorted(
                        glob(os.path.join(kinetics_path, '**',
                                          kinetics_cls[i], '*.mp4'),
                             recursive=True))[:round(n)]
            self.video_clips = VideoClips(video_list, frames_per_clip,
                                          step_between_clips, fps)

        with open("PATH/TO/borders.json") as f:
            self.fails_borders = json.load(f)
        with open("PATH/TO/all_mturk_data.json") as f:
            self.fails_data = json.load(f)
        self.fails_only = fails_only
        # Maps a clip index within a video to its nominal (start, end) seconds.
        self.t_from_clip_idx = lambda idx: (
            (step_between_clips * idx) / fps,
            (step_between_clips * idx + frames_per_clip) / fps)

        if not balance_fails_only:
            # no support for recompute clips after balance calc yet
            self.video_clips.compute_clips(frames_per_clip,
                                           step_between_clips, fps)

        if video_clips is None and fails_only and labeled_fails:
            if not all_fail_videos:
                # Keep only the videos that have an annotation entry.
                idxs = [
                    i for i, video_path in enumerate(
                        self.video_clips.video_paths)
                    if self._video_key(video_path) in self.fails_data
                ]
                self.video_clips = self.video_clips.subset(idxs)
            self.video_clips.labels = []
            self.video_clips.compute_clips(frames_per_clip,
                                           step_between_clips, fps)
            for video_idx, vid_clips in tqdm(
                    enumerate(self.video_clips.clips),
                    total=len(self.video_clips.clips)):
                video_path = self.video_clips.video_paths[video_idx]
                key = self._video_key(video_path)
                if all_fail_videos and key not in self.fails_data:
                    self.video_clips.labels.append([-1 for _ in vid_clips])
                    continue
                t_unit = self._stream_time_base(video_path)
                annotation = self.fails_data[key]
                # Upper median of the annotated failure times.
                t_fail = sorted(annotation['t'])
                t_fail = t_fail[len(t_fail) // 2]
                # Discard videos with an invalid failure time, a failure at
                # the very start or end, or a length outside [3.2, 30].
                if t_fail < 0 or not 0.01 <= statistics.median(
                        annotation['rel_t']) <= 0.99 or \
                        annotation['len'] < 3.2 or \
                        annotation['len'] > 30:
                    self.video_clips.clips[video_idx] = torch.Tensor()
                    self.video_clips.resampling_idxs[video_idx] = \
                        torch.Tensor()
                    self.video_clips.labels.append([])
                    continue

                first_one_idx, first_two_idx = self._find_label_boundaries(
                    vid_clips, t_unit, t_fail)
                n_clips = len(vid_clips)
                self.video_clips.labels.append(
                    [0] * first_one_idx +
                    [1] * (first_two_idx - first_one_idx) +
                    [2] * (n_clips - first_two_idx))

                if balance_fails_only and not val:
                    counts = (first_one_idx, first_two_idx - first_one_idx,
                              n_clips - first_two_idx)
                    balance_idxs = self._oversample_indices(counts)
                    if not balance_idxs:
                        continue
                    # Append duplicates of the sampled clips (and of their
                    # resampling indices and labels) to this video.
                    self.video_clips.clips[video_idx] = torch.cat(
                        (vid_clips,
                         torch.stack([vid_clips[i] for i in balance_idxs])))
                    vid_resampling_idxs = self.video_clips.resampling_idxs[
                        video_idx]
                    try:
                        resampled = torch.cat(
                            (vid_resampling_idxs,
                             torch.stack([
                                 vid_resampling_idxs[i] for i in balance_idxs
                             ])))
                        self.video_clips.resampling_idxs[
                            video_idx] = resampled
                    except IndexError:
                        pass
                    self.video_clips.labels[-1] += [
                        self.video_clips.labels[-1][i] for i in balance_idxs
                    ]
            self._refresh_cumulative_sizes()

        fns_removed = 0
        if fns_to_remove and not val:
            for i, video_path in enumerate(self.video_clips.video_paths):
                if fns_removed > len(self.video_clips.video_paths) // 4:
                    break
                if self._video_key(video_path) in fns_to_remove:
                    fns_removed += 1
                    self.video_clips.clips[i] = torch.Tensor()
                    self.video_clips.resampling_idxs[i] = torch.Tensor()
                    self.video_clips.labels[i] = []
            self._refresh_cumulative_sizes()
            # `local_rank` is optional: treat a missing value as rank 0
            # instead of failing with KeyError.
            if kwargs.get('local_rank', 0) <= 0:
                print(
                    f'removed videos from {fns_removed} out of {len(self.video_clips.video_paths)} files'
                )

        # Re-root paths recorded under the placeholder directory.
        for i, p in enumerate(self.video_clips.video_paths):
            self.video_clips.video_paths[i] = p.replace(
                "PATH/TO/scenes", os.path.dirname(fails_path))
        self.debug_dataset = debug_dataset

        if data_proportion < 1:
            # Fixed seed so that the same subset is drawn on every run.
            rng = random.Random()
            rng.seed(23719)
            lbls = self.video_clips.labels
            subset_idxs = rng.sample(
                range(len(self.video_clips.video_paths)),
                int(len(self.video_clips.video_paths) * data_proportion))
            self.video_clips = self.video_clips.subset(subset_idxs)
            self.video_clips.labels = [lbls[i] for i in subset_idxs]

    @staticmethod
    def _video_key(video_path):
        """Return the file name of ``video_path`` without directory or extension."""
        return os.path.splitext(os.path.basename(video_path))[0]

    @staticmethod
    def _stream_time_base(video_path):
        """Return the time base of the first stream, closing the container."""
        with av.open(video_path, metadata_errors='ignore') as container:
            return container.streams[0].time_base

    @staticmethod
    def _find_label_boundaries(vid_clips, t_unit, t_fail):
        """Return the indices of the first label-1 and first label-2 clips.

        Both default to ``len(vid_clips)``. The second index is only set on a
        1 -> 2 transition.
        NOTE(review): a direct 0 -> 2 transition (no clip containing
        ``t_fail``) leaves both at the default, so every clip of the video is
        labelled 0 -- confirm this is intended.
        """
        first_one_idx = first_two_idx = len(vid_clips)
        prev_label = 0
        for clip_idx, clip in enumerate(vid_clips):
            t_start = float(t_unit * clip[0].item())
            t_end = float(t_unit * clip[-1].item())
            if t_start <= t_fail <= t_end:
                label = 1
            elif t_start > t_fail:
                label = 2
            else:
                label = 0
            if label == 1 and prev_label == 0:
                first_one_idx = clip_idx
            elif label == 2 and prev_label == 1:
                first_two_idx = clip_idx
                break
            prev_label = label
        return first_one_idx, first_two_idx

    @staticmethod
    def _oversample_indices(counts):
        """Return clip indices to duplicate so minority labels approach the mode.

        ``counts`` holds the number of clips labelled 0, 1 and 2 (stored
        contiguously in that order). Indices are drawn by shuffling each
        minority range and cycling through it.
        """
        offsets = torch.LongTensor([0] + list(counts)).cumsum(
            0)[:-1].tolist()
        ratios = (1, 0.93, 1 / 0.93)
        labels = (0, 1, 2)
        lbl_mode = max(labels, key=lambda i: counts[i])
        balance_idxs = []
        for i in labels:
            if i != lbl_mode and counts[i] > 0:
                n_to_add = round(
                    counts[i] *
                    ((counts[lbl_mode] * ratios[i] / counts[i]) - 1))
                tmp = list(range(offsets[i], counts[i] + offsets[i]))
                random.shuffle(tmp)
                tmp_bal_idxs = []
                while len(tmp_bal_idxs) < n_to_add:
                    tmp_bal_idxs += tmp
                balance_idxs += tmp_bal_idxs[:n_to_add]
        return balance_idxs

    def _refresh_cumulative_sizes(self):
        """Recompute ``cumulative_sizes`` after per-video clips were edited."""
        clip_lengths = torch.as_tensor(
            [len(v) for v in self.video_clips.clips])
        self.video_clips.cumulative_sizes = clip_lengths.cumsum(0).tolist()

    def trim_borders(self, img, fn):
        """Crop axis 2 of ``img`` to the border fractions stored for ``fn``.

        The crop is only applied when both the left and right fractions are
        positive; otherwise ``img`` is returned unchanged.
        """
        l, r = self.fails_borders[self._video_key(fn)]
        w = img.shape[2]  # THWC
        if l > 0 and r > 0:
            img = img[:, :, round(w * l):round(w * r)]
        return img

    def __len__(self):
        """Return the total number of clips."""
        return self.video_clips.num_clips()

    def compute_clip_times(self, video_idx, clip_idx):
        """Return ``(t_start, t_end)`` in seconds for the given clip.

        Times are the presentation timestamps of the clip's first and last
        frame multiplied by the stream time base. The video is looked up under
        ``fails_path`` using the last two components of its recorded path.
        """
        video_path = self.video_clips.video_paths[video_idx]
        video_path = os.path.join(
            self.fails_path,
            os.path.sep.join(video_path.rsplit(os.path.sep, 2)[-2:]))
        clip_pts = self.video_clips.clips[video_idx][clip_idx]
        start_pts = clip_pts[0].item()
        end_pts = clip_pts[-1].item()
        t_unit = self._stream_time_base(video_path)
        t_start = float(t_unit * start_pts)
        t_end = float(t_unit * end_pts)
        return t_start, t_end

    def __getitem__(self, idx):
        """Return ``(clip, label, info)`` for the flat clip index ``idx``.

        ``info`` is ``(flow_path, t_start, t_end)`` in flow mode,
        ``(video_path, t_start, t_end, [])`` in anticipation mode and
        ``(video_path, t_start, t_end, *other)`` otherwise, where ``other``
        ends with ``idx``.
        """
        if self.load_flow:
            return self._get_flow_item(idx)
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        if self.anticipate_label:
            return self._get_anticipation_item(idx, video_idx, clip_idx)
        return self._get_video_item(idx, clip_idx)

    def _get_flow_item(self, idx):
        """Load the precomputed flow frames covering clip ``idx``."""
        video_idx, clip_idx = self.video_clips.get_clip_location(idx)
        video_path = self.video_clips.video_paths[video_idx]
        video_path = os.path.join(
            self.fails_path,
            os.path.sep.join(video_path.rsplit(os.path.sep, 2)[-2:]))
        label = self.video_clips.labels[video_idx][clip_idx]
        flow_path = os.path.join(
            self.fails_flow_path,
            os.path.sep.join(
                os.path.splitext(video_path)[0].rsplit(os.path.sep,
                                                       2)[-2:]))
        t_start, t_end = self.compute_clip_times(video_idx, clip_idx)
        frame_start = round(t_start * self.FLOW_FPS)
        n_frames = round(self.clip_len * self.FLOW_FPS)
        flow = []
        for frame_i in range(frame_start, frame_start + n_frames):
            frame_fn = os.path.join(flow_path, f'{frame_i:06}.flo')
            try:
                flow.append(
                    torch.load(frame_fn,
                               map_location=torch.device('cpu')).permute(
                                   1, 2, 0).data.numpy())
            except Exception:
                # Best effort: missing or unreadable frames are skipped and
                # made up for by repeating the loaded ones below.
                pass
        if n_frames > 0 and not flow:
            # Doubling an empty list never grows it; fail loudly instead of
            # spinning forever in the padding loop.
            raise FileNotFoundError(
                f'no flow frames could be loaded from {flow_path}')
        while len(flow) < n_frames:
            flow += flow
        flow = flow[:n_frames]
        flow = torch.Tensor(flow)
        flow = self.trim_borders(flow, video_path)
        if self.t is not None:
            flow = self.t(flow)
        return flow, label, (flow_path, t_start, t_end)

    def _get_anticipation_item(self, idx, video_idx, clip_idx):
        """Return an earlier clip paired with the label of clip ``idx``.

        The label becomes ``-1`` when the earlier clip belongs to another
        video or does not start before the labelled clip.
        """
        assert not self.selfsup_loss, 'no anticipation with self supervision'
        video_path = self.video_clips.video_paths[video_idx]
        label = self.video_clips.labels[video_idx][clip_idx]
        idx -= round(self.anticipate_label / self.clip_step)
        new_video_idx, new_clip_idx = self.video_clips.get_clip_location(
            idx)
        video, *_ = self.video_clips.get_clip(idx)
        video = self.trim_borders(video, video_path)
        if self.t is not None:
            video = self.t(video)
        new_t_start, new_t_end = self.compute_clip_times(
            new_video_idx, new_clip_idx)
        old_t_start, _ = self.compute_clip_times(video_idx, clip_idx)
        if new_video_idx != video_idx or new_t_start > old_t_start:
            label = -1
        return video, label, (video_path, new_t_start, new_t_end, [])

    def _get_video_item(self, idx, clip_idx):
        """Return the RGB clip ``idx``, as a triplet for the ordering losses."""
        video, _, _, video_idx = self.video_clips.get_clip(idx)
        video_path = self.video_clips.video_paths[video_idx]
        try:
            label = self.video_clips.labels[video_idx][clip_idx]
        except Exception:
            # No label available for this clip (e.g. labels were never
            # computed): mark it as unlabelled.
            label = -1
        if label == 0 or self.fails_only:
            video = self.trim_borders(video, video_path)
        if self.t is not None:
            video = self.t(video)
        t_start = t_end = -1
        if self.get_clip_times:
            t_start, t_end = self.compute_clip_times(video_idx, clip_idx)
        other = []
        if self.selfsup_loss in ('pred_middle', 'sort', 'ctc'):
            # Neighbours lie `k` clips before and after the current clip.
            k = round(self.clip_len / self.clip_step *
                      self.clip_interval_factor)
            video_l = [video]
            try:
                pvideo, _, _, pvideo_idx = self.video_clips.get_clip(idx - k)
            except Exception:
                pvideo_idx = -1
            try:
                nvideo, _, _, nvideo_idx = self.video_clips.get_clip(idx + k)
            except Exception:
                nvideo_idx = -1
            t_start, _ = self.compute_clip_times(
                *self.video_clips.get_clip_location(idx))
            # The sentinels make the ordering checks below fail when a
            # neighbour's time cannot be computed.
            try:
                p_t_start, _ = self.compute_clip_times(
                    *self.video_clips.get_clip_location(idx - k))
            except Exception:
                p_t_start = 1000000000
            try:
                n_t_start, _ = self.compute_clip_times(
                    *self.video_clips.get_clip_location(idx + k))
            except Exception:
                n_t_start = -1000000000
            # A neighbour is used only if it comes from the same video and is
            # correctly ordered in time; otherwise a -1 filled placeholder of
            # the same shape takes its place.
            if pvideo_idx == video_idx and p_t_start < t_start:
                pvideo = self.trim_borders(pvideo, video_path)
                if self.t is not None:
                    pvideo = self.t(pvideo)
                video_l.insert(0, pvideo)
            else:
                video_l.insert(0, torch.full_like(video, -1))
            if nvideo_idx == video_idx and t_start < n_t_start:
                nvideo = self.trim_borders(nvideo, video_path)
                if self.t is not None:
                    nvideo = self.t(nvideo)
                video_l.append(nvideo)
            else:
                video_l.append(torch.full_like(video, -1))
            video = torch.stack(video_l)
            other = [nvideo_idx == video_idx and pvideo_idx == video_idx]
        if self.selfsup_loss == 'fps':
            other = [self.fps]
        other.append(idx)
        return video, label, (video_path, t_start, t_end, *other)