def from_directory(
    cls,
    path: str,
    fps: float = 30.0,
    multithreaded_io: bool = False,
    path_order_cache: Optional[Dict[str, List[str]]] = None,
):
    """
    Args:
        path (str): path to frame video directory.
        fps (float): the target fps for the video. This is needed to link the
            frames to a second timestamp in the video.
        multithreaded_io (bool): controls whether parallelizable io operations
            are performed across multiple threads.
        path_order_cache (dict): An optional mapping from directory-path to the
            list of frames in the directory in numerical order. Used to speed
            up loading by caching the frame paths.
    """
    if path_order_cache is not None and path in path_order_cache:
        return cls.from_frame_paths(path_order_cache[path], fps, multithreaded_io)

    assert g_pathmgr.isdir(path), f"{path} is not a directory"
    rel_frame_paths = g_pathmgr.ls(path)

    def natural_keys(text):
        return [int(c) if c.isdigit() else c for c in re.split(r"(\d+)", text)]

    rel_frame_paths.sort(key=natural_keys)
    frame_paths = [os.path.join(path, f) for f in rel_frame_paths]
    if path_order_cache is not None:
        path_order_cache[path] = frame_paths
    return cls.from_frame_paths(frame_paths, fps, multithreaded_io)
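# A minimal standalone sketch of why the natural sort above matters: with the
# hypothetical frame names below, a plain lexicographic sort puts "frame_10.jpg"
# before "frame_2.jpg", while natural_keys sorts by the embedded frame number.
import re

def natural_keys(text):
    return [int(c) if c.isdigit() else c for c in re.split(r"(\d+)", text)]

frames = ["frame_10.jpg", "frame_2.jpg", "frame_1.jpg"]
print(sorted(frames))                    # ['frame_1.jpg', 'frame_10.jpg', 'frame_2.jpg']
print(sorted(frames, key=natural_keys))  # ['frame_1.jpg', 'frame_2.jpg', 'frame_10.jpg']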
def get_last_checkpoint():
    """Retrieves the most recent checkpoint (highest epoch number)."""
    checkpoint_dir = get_checkpoint_dir()
    checkpoints = [f for f in g_pathmgr.ls(checkpoint_dir) if _NAME_PREFIX in f]
    # The sort is lexicographic, so checkpoint names are assumed to embed
    # zero-padded epoch numbers.
    last_checkpoint_name = sorted(checkpoints)[-1]
    return os.path.join(checkpoint_dir, last_checkpoint_name)
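# A standalone illustration (hypothetical file names) of why the lexicographic
# sort above only finds the highest epoch when epoch numbers are zero-padded,
# as the checkpoint names this code operates on appear to be.
padded = ["model_epoch_0002.pyth", "model_epoch_0010.pyth"]
print(sorted(padded)[-1])    # model_epoch_0010.pyth -- the highest epoch
unpadded = ["model_epoch_2.pyth", "model_epoch_10.pyth"]
print(sorted(unpadded)[-1])  # model_epoch_2.pyth -- NOT the highest epoch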
def has_checkpoint(path_to_job):
    """
    Determines if the given directory contains a checkpoint.
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    d = get_checkpoint_dir(path_to_job)
    files = g_pathmgr.ls(d) if g_pathmgr.exists(d) else []
    return any("checkpoint" in f for f in files)
def _construct_imdb(self):
    """Constructs the imdb."""
    # Compile the split data path
    split_path = os.path.join(self.data_path, self.mode)
    logger.info("{} data path: {}".format(self.mode, split_path))
    # Images are stored per class in subdirs (format: n<number>)
    split_files = g_pathmgr.ls(split_path)
    self._class_ids = sorted(f for f in split_files if re.match(r"^n[0-9]+$", f))
    # Map ImageNet class ids to contiguous ids
    self._class_id_cont_id = {v: i for i, v in enumerate(self._class_ids)}
    # Construct the image db
    self._imdb = []
    for class_id in self._class_ids:
        cont_id = self._class_id_cont_id[class_id]
        im_dir = os.path.join(split_path, class_id)
        for im_name in g_pathmgr.ls(im_dir):
            im_path = os.path.join(im_dir, im_name)
            self._imdb.append({"im_path": im_path, "class": cont_id})
    logger.info("Number of images: {}".format(len(self._imdb)))
    logger.info("Number of classes: {}".format(len(self._class_ids)))
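# The class-subdir regex above in isolation: WordNet-style ids such as
# "n01440764" match, while stray entries (e.g. a "README") are filtered out.
# The directory names here are illustrative only.
import re

entries = ["n01440764", "n01443537", "README"]
print(sorted(e for e in entries if re.match(r"^n[0-9]+$", e)))
# -> ['n01440764', 'n01443537']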
def _get_filenames(self, data_path: str):
    fnames = []
    for fname in sorted(g_pathmgr.ls(data_path)):
        # Only put images in fnames.
        if not fname.endswith(".jpg"):
            continue
        full_fname = os.path.join(data_path, fname)
        fnames.append(full_fname)
    return np.array(fnames)
def get_checkpoint_resume_files(
    checkpoint_folder: str,
    config: AttrDict,
    skip_final: bool = False,
    latest_checkpoint_resume_num: int = 1,
):
    """
    Get the checkpoint file from which the model should be resumed. We look at all
    the checkpoints in the checkpoint_folder and, if the final model checkpoint
    exists (starts with `model_final_`) and we are not overriding it, return the
    final checkpoint. Otherwise find the latest checkpoint.

    Args:
        checkpoint_folder (str): path to the checkpoint folder.
        config (AttrDict): root config
        skip_final (bool): whether the final model checkpoint should be skipped or not
        latest_checkpoint_resume_num (int): which Nth-latest checkpoint to resume
            from. Sometimes the latest checkpoints could be corrupt, so this option
            helps to resume from a few checkpoints before the last one instead.
    """
    all_files = g_pathmgr.ls(checkpoint_folder)
    all_iters = []
    replace_prefix = "model_phase"
    # If we checkpoint at iterations too, we start from an iteration checkpoint,
    # since that's later than the phase-end checkpoint. Sometimes, it's also
    # possible that there is no phase.
    if config.CHECKPOINT.CHECKPOINT_ITER_FREQUENCY > 0:
        replace_prefix = "model_iteration"

    for f in all_files:
        # If we have finished training, we pick the finished-training file; the
        # checkpoint is saved as "model_final_checkpoint". Otherwise, we pick
        # the latest phase checkpoint.
        if "model_final" in f and not skip_final:
            return f
        if replace_prefix in f:
            iter_num = f.replace(".torch", "").replace(replace_prefix, "")
            if iter_num.isdigit():
                all_iters.append(int(iter_num))

    # Make sure the checkpoint resume number is in bounds
    checkpoint_resume_num = max(0, latest_checkpoint_resume_num - 1)
    # len(all_iters) - 1 is the last index; checkpoint_resume_num can't be beyond that.
    checkpoint_resume_num = min(len(all_iters) - 1, checkpoint_resume_num)
    logging.info(f"checkpoint_resume_num: {checkpoint_resume_num}")
    if len(all_iters) > 0:
        all_iters.sort(reverse=True)
        last_iter = int(all_iters[checkpoint_resume_num])
        filename = f"{replace_prefix}{last_iter}.torch"
        return filename
    else:
        return None
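# A standalone sketch (hypothetical file names) of the Nth-latest selection
# rule above: names are parsed to integers before sorting, and
# latest_checkpoint_resume_num=2 picks the second-latest phase checkpoint.
files = ["model_phase9.torch", "model_phase10.torch", "model_phase11.torch"]
iters = sorted(
    (int(f.replace(".torch", "").replace("model_phase", "")) for f in files),
    reverse=True,
)
latest_checkpoint_resume_num = 2
resume_idx = min(len(iters) - 1, max(0, latest_checkpoint_resume_num - 1))
print(f"model_phase{iters[resume_idx]}.torch")  # -> model_phase10.torch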
def delete_checkpoints(checkpoint_dir=None, keep="all"):
    """Deletes unneeded checkpoints, keep can be "all", "last", or "none"."""
    assert keep in ["all", "last", "none"], "Invalid keep setting: {}".format(keep)
    checkpoint_dir = checkpoint_dir if checkpoint_dir else get_checkpoint_dir()
    if keep == "all" or not g_pathmgr.exists(checkpoint_dir):
        return 0
    checkpoints = [f for f in g_pathmgr.ls(checkpoint_dir) if _NAME_PREFIX in f]
    checkpoints = sorted(checkpoints)[:-1] if keep == "last" else checkpoints
    for checkpoint in checkpoints:
        g_pathmgr.rm(os.path.join(checkpoint_dir, checkpoint))
    return len(checkpoints)
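# Sketch of the keep="last" branch above with hypothetical names: everything
# except the lexicographically-last checkpoint is slated for deletion.
checkpoints = ["model_epoch_0001.pyth", "model_epoch_0002.pyth", "model_epoch_0003.pyth"]
print(sorted(checkpoints)[:-1])  # -> ['model_epoch_0001.pyth', 'model_epoch_0002.pyth']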
def get_last_checkpoint(path_to_job):
    """
    Get the last checkpoint from the checkpointing folder.
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    d = get_checkpoint_dir(path_to_job)
    names = g_pathmgr.ls(d) if g_pathmgr.exists(d) else []
    names = [f for f in names if "checkpoint" in f]
    assert len(names), "No checkpoints found in '{}'.".format(d)
    # Sort the checkpoints by epoch.
    name = sorted(names)[-1]
    return os.path.join(d, name)
def build_encoded_manifest_from_nested_directory(
    data_directory_path: str,
) -> Dict[str, EncodedVideoInfo]:
    """
    Creates a dictionary from video_id to EncodedVideoInfo for
    encoded videos in the given directory.

    Args:
        data_directory_path (str): The folder to ls to find encoded video files.

    Returns:
        Dict[str, EncodedVideoInfo] mapping video_id to EncodedVideoInfo
        for each file in 'data_directory_path'
    """
    encoded_video_infos = {}
    for participant_id in g_pathmgr.ls(data_directory_path):
        participant_folder_path = f"{data_directory_path}/{participant_id}"
        for video_file_name in g_pathmgr.ls(participant_folder_path):
            # The video id is the first 6 characters of the file name
            # (e.g. "P01_01" for "P01_01.MP4").
            video_id = video_file_name[:6]
            video_full_path = f"{participant_folder_path}/{video_file_name}"
            encoded_video_infos[video_id] = EncodedVideoInfo(video_id, video_full_path)
    return encoded_video_infos
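# A runnable sketch of the nested layout the builder above expects, using
# hypothetical Epic-Kitchens-style names created in a temporary directory.
import os
import tempfile

root = tempfile.mkdtemp()
for participant, video_file in [("P01", "P01_01.MP4"), ("P01", "P01_02.MP4"), ("P02", "P02_01.MP4")]:
    os.makedirs(os.path.join(root, participant), exist_ok=True)
    open(os.path.join(root, participant, video_file), "w").close()

for participant in sorted(os.listdir(root)):
    folder = os.path.join(root, participant)
    for video_file in sorted(os.listdir(folder)):
        print(video_file[:6], "->", os.path.join(folder, video_file))
# Prints video ids P01_01, P01_02, P02_01 alongside their full paths.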
def get_filelist_labels_images_paths(input_path):
    dataset_split_summary = {}
    img_paths, img_labels = [], []
    label_paths = g_pathmgr.ls(input_path)
    dataset_split_summary["labels"] = label_paths
    dataset_split_summary["num_labels"] = len(label_paths)
    print(f"{len(label_paths)} classes found.")
    total_split_examples = 0
    # Populate img_paths and img_labels based on the torchvision ImageFolder
    # file structure.
    for label in label_paths:
        label_path = os.path.join(input_path, label)
        images = g_pathmgr.ls(label_path)
        print(f"{len(images)} examples found for {label}.")
        total_split_examples += len(images)
        for image in images:
            img_path = os.path.join(label_path, image)
            img_paths.append(img_path)
            img_labels.append(label)
    # Record and print the dataset summary.
    dataset_split_summary["num_examples"] = total_split_examples
    print(f"{total_split_examples} total examples found.")
    return dataset_split_summary, img_paths, img_labels
def add_participant_video_frames(participant_id: str, participant_path: str) -> None:
    participant_frames = sorted(g_pathmgr.ls(str(participant_path)))
    for frame_file_name in participant_frames:
        file_extension = frame_file_name.split(".")[-1]
        frame_name = frame_file_name[: -(len(file_extension) + 1)]
        [path_participant_id, path_video_id, path_frame_id] = frame_name.split("_")
        assert path_participant_id == participant_id
        video_id = f"{path_participant_id}_{path_video_id}"
        if video_id not in video_frames:
            # This is the first frame we have seen from the video with video_id.
            video_frames[video_id] = VideoFrameInfo(
                video_id=video_id,
                location=participant_path,
                frame_file_stem=f"{video_id}_",
                frame_string_length=len(frame_name),
                min_frame_number=int(path_frame_id),
                max_frame_number=int(path_frame_id),
                file_extension=file_extension,
            )
        else:
            video_frame_info = video_frames[video_id]
            # Check that this new frame has the same format as the other frames
            # for this video and that it is the next frame in order; if so,
            # update the frame info for this video to reflect the extra frame.
            # We don't need to check video_id or frame_file_stem as they are a
            # function of video_id, which is aligned within the dictionary.
            assert video_frame_info.frame_string_length == len(frame_name)
            assert video_frame_info.location == participant_path, (
                f"Frames for {video_id} found in two paths: "
                f"{video_frame_info.location} and {participant_path}"
            )
            assert video_frame_info.max_frame_number + 1 == int(path_frame_id)
            assert (
                video_frame_info.file_extension == file_extension
            ), f"Frames with two different file extensions found for video {video_id}"
            video_frames[video_id] = VideoFrameInfo(
                video_id=video_frame_info.video_id,
                location=video_frame_info.location,
                frame_file_stem=video_frame_info.frame_file_stem,
                frame_string_length=video_frame_info.frame_string_length,
                min_frame_number=video_frame_info.min_frame_number,
                max_frame_number=int(path_frame_id),  # update the running max
                file_extension=video_frame_info.file_extension,
            )
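# Standalone sketch of the frame-name parsing above on one hypothetical
# Epic-Kitchens-style name: participant, video, and frame number are the
# underscore-separated parts once the extension is stripped.
frame_file_name = "P01_01_0000000123.jpg"
file_extension = frame_file_name.split(".")[-1]
frame_name = frame_file_name[: -(len(file_extension) + 1)]
path_participant_id, path_video_id, path_frame_id = frame_name.split("_")
print(f"{path_participant_id}_{path_video_id}", int(path_frame_id))  # -> P01_01 123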
def has_final_checkpoint(
    checkpoint_folder: str, final_checkpoint_pattern: str = "model_final"
):
    """
    Check whether the final checkpoint exists in the checkpoint folder. The
    final checkpoint is recognized by the prefix "model_final_" in VISSL.

    Args:
        checkpoint_folder (str): path to the checkpoint folder.
        final_checkpoint_pattern (str): what prefix is used to save the final checkpoint.

    Returns:
        has_final_checkpoint: whether the final checkpoint exists or not
    """
    checkpointed_files = g_pathmgr.ls(checkpoint_folder)
    torch_files = filter(lambda x: x.endswith(".torch"), checkpointed_files)
    final_files = filter(lambda x: final_checkpoint_pattern in x, torch_files)
    return len(list(final_files)) > 0
def has_checkpoint(checkpoint_folder: str, skip_final: bool = False):
    """
    Check whether there are any checkpoints at all in the checkpoint folder.

    Args:
        checkpoint_folder (str): path to the checkpoint folder
        skip_final (bool): if a checkpoint with the `model_final_` prefix exists,
            whether to skip it and train.

    Returns:
        checkpoint_exists (bool): whether checkpoint exists or not
    """
    checkpointed_files = g_pathmgr.ls(checkpoint_folder)
    checkpoint_exists = False
    for f in checkpointed_files:
        if f.endswith(".torch") and ("model_final" not in f or not skip_final):
            checkpoint_exists = True
            break
    return checkpoint_exists
def get_shard_file_names(
    input_dir: str,
    split: str,
    layer: str,
    sorted: bool = True,
) -> List[ExtractedFeaturesShardPaths]:
    """
    Get the list of files needed to load the extracted features.
    """
    # List all the files containing the features for a given dataset split
    # and a given layer.
    feature_regex = re.compile(rf"(.*)_{split}_{layer}_features.npy")
    prefixes = []
    for file_path in g_pathmgr.ls(input_dir):
        match = feature_regex.match(file_path)
        if match is not None:
            prefixes.append(match.group(1))

    # Sort the shards by file name if required: it might be useful if the
    # algorithm that uses the shards is influenced by ordering.
    if sorted:
        prefixes.sort()

    # Collect all the files needed to merge the features dumped by the
    # different GPUs.
    shard_paths = []
    for prefix in prefixes:
        feat_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_features.npy")
        targets_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_targets.npy")
        indices_file = os.path.join(input_dir, f"{prefix}_{split}_{layer}_inds.npy")
        shard_paths.append(
            ExtractedFeaturesShardPaths(
                feature_file=feat_file,
                targets_file=targets_file,
                indices_file=indices_file,
            )
        )
    return shard_paths
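# How the shard regex above groups files, using hypothetical shard names with
# split="train" and layer="heads": only *_features.npy files contribute a prefix.
import re

feature_regex = re.compile(r"(.*)_train_heads_features.npy")
files = [
    "chunk0_train_heads_features.npy",
    "chunk1_train_heads_features.npy",
    "chunk0_train_heads_targets.npy",  # no match: not a features file
]
prefixes = [m.group(1) for m in map(feature_regex.match, files) if m is not None]
print(prefixes)  # -> ['chunk0', 'chunk1']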
def load(self, num_samples=None):
    """
    Load the data ground truth and parse the data so it's ready to be used.
    """
    # Load the dataset GT
    self.lab_root = f"{self.path}/lab/"
    self.img_root = f"{self.path}/jpg/"
    logging.info(f"Loading data: {self.path}")
    lab_filenames = np.sort(g_pathmgr.ls(self.lab_root))
    # Get the filenames without the extension
    self.img_filenames = [
        e[:-4]
        for e in np.sort(g_pathmgr.ls(self.img_root))
        if e[:-4] not in self.blacklisted
    ]
    # Parse the label files. There are some challenges, as filenames do not
    # correspond exactly to query names. Go through all the labels to:
    # i) map names to filenames and vice versa
    # ii) get the relevant regions of interest of the queries,
    # iii) get the indexes of the dataset images that are queries
    # iv) get the relevants / non-relevants list
    self.relevants = {}
    self.junk = {}
    self.non_relevants = {}
    self.filename_to_name = {}
    self.name_to_filename = OrderedDict()
    self.q_roi = {}
    for e in lab_filenames:
        if e.endswith("_query.txt"):
            q_name = e[: -len("_query.txt")]
            with g_pathmgr.open(f"{self.lab_root}/{e}") as fopen:
                q_data = fopen.readline().split(" ")
            if q_data[0].startswith("oxc1_"):
                q_filename = q_data[0][5:]
            else:
                q_filename = q_data[0]
            self.filename_to_name[q_filename] = q_name
            self.name_to_filename[q_name] = q_filename
            with g_pathmgr.open(f"{self.lab_root}/{q_name}_ok.txt") as fopen:
                good = {e.strip() for e in fopen}
            with g_pathmgr.open(f"{self.lab_root}/{q_name}_good.txt") as fopen:
                good = good.union({e.strip() for e in fopen})
            with g_pathmgr.open(f"{self.lab_root}/{q_name}_junk.txt") as fopen:
                junk = {e.strip() for e in fopen}
            good_plus_junk = good.union(junk)
            self.relevants[q_name] = [
                i
                for i in range(len(self.img_filenames))
                if self.img_filenames[i] in good
            ]
            self.junk[q_name] = [
                i
                for i in range(len(self.img_filenames))
                if self.img_filenames[i] in junk
            ]
            self.non_relevants[q_name] = [
                i
                for i in range(len(self.img_filenames))
                if self.img_filenames[i] not in good_plus_junk
            ]
            self.q_roi[q_name] = np.array(
                [float(q) for q in q_data[1:]], dtype=np.float32
            )

    self.q_names = list(self.name_to_filename.keys())
    self.q_index = np.array(
        [self.img_filenames.index(self.name_to_filename[qn]) for qn in self.q_names]
    )
    self.N_images = len(self.img_filenames)
    self.N_queries = len(self.q_index)
    if num_samples is not None:
        self.N_queries = min(self.N_queries, num_samples)
        self.N_images = min(self.N_images, num_samples)
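# Sketch of the query-line parsing above on one hypothetical Oxford-style
# label line: the "oxc1_" prefix is stripped from the image filename and the
# remaining fields form the query region of interest.
q_data = "oxc1_all_souls_000013 136.5 34.1 648.5 955.7".split(" ")
q_filename = q_data[0][5:] if q_data[0].startswith("oxc1_") else q_data[0]
q_roi = [float(q) for q in q_data[1:]]
print(q_filename)  # -> all_souls_000013
print(q_roi)       # -> [136.5, 34.1, 648.5, 955.7]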
def has_checkpoint():
    """Determines if there are checkpoints available."""
    checkpoint_dir = get_checkpoint_dir()
    if not g_pathmgr.exists(checkpoint_dir):
        return False
    return any(_NAME_PREFIX in f for f in g_pathmgr.ls(checkpoint_dir))
def build_frame_manifest_from_flat_directory(
    data_directory_path: str, multithreaded: bool
) -> Dict[str, VideoFrameInfo]:
    """
    Args:
        data_directory_path (str): Path or URI to EpicKitchenDataset data.
            Data at this path must be a folder of structure:
            {
                "{video_id}": [
                    "frame_{frame_number}.{file_extension}",
                    "frame_{frame_number}.{file_extension}",
                    "frame_{frame_number}.{file_extension}",
                    ...
                ],
                ...
            }
        multithreaded (bool): controls whether io operations are performed across
            multiple threads.

    Returns:
        Dictionary mapping video_id of available videos to the locations of their
        underlying frame files.
    """
    video_frames = {}
    video_ids = g_pathmgr.ls(str(data_directory_path))

    def add_video_frames(video_id: str, video_path: str) -> None:
        video_frame_file_names = sorted(g_pathmgr.ls(video_path))
        for frame in video_frame_file_names:
            file_extension = frame.split(".")[-1]
            frame_name = frame[: -(len(file_extension) + 1)]
            stem, path_frame_id = frame_name.split("_")
            if video_id not in video_frames:
                video_frames[video_id] = VideoFrameInfo(
                    video_id=video_id,
                    location=video_path,
                    frame_file_stem=f"{stem}_",
                    frame_string_length=len(frame_name),
                    min_frame_number=int(path_frame_id),
                    max_frame_number=int(path_frame_id),
                    file_extension=file_extension,
                )
            else:
                video_frame_info = video_frames[video_id]
                # Check that this new frame has the same format as the other
                # frames for this video and that it is the next frame in order;
                # if so, update the frame info for this video to reflect the
                # extra frame. We don't need to check video_id or
                # frame_file_stem as they are a function of video_id, which is
                # aligned within the dictionary.
                assert video_frame_info.frame_string_length == len(frame_name)
                assert video_frame_info.location == video_path, (
                    f"Frames for {video_id} found in two paths: "
                    f"{video_frame_info.location} and {video_path}"
                )
                assert video_frame_info.max_frame_number + 1 == int(path_frame_id)
                assert (
                    video_frame_info.file_extension == file_extension
                ), f"Frames with two different file extensions found for video {video_id}"
                video_frames[video_id] = VideoFrameInfo(
                    video_id=video_frame_info.video_id,
                    location=video_frame_info.location,
                    frame_file_stem=video_frame_info.frame_file_stem,
                    frame_string_length=video_frame_info.frame_string_length,
                    min_frame_number=video_frame_info.min_frame_number,
                    max_frame_number=int(path_frame_id),  # update the running max
                    file_extension=video_frame_info.file_extension,
                )

    video_paths = [
        (video_id, f"{data_directory_path}/{video_id}") for video_id in video_ids
    ]
    # Kick off frame indexing for all videos.
    optional_threaded_foreach(add_video_frames, video_paths, multithreaded)
    return video_frames
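# Standalone sketch of the flat-layout frame parsing above with a hypothetical
# name: "frame_0000000010.jpg" splits into a stem, a frame number, and an extension.
frame = "frame_0000000010.jpg"
file_extension = frame.split(".")[-1]
frame_name = frame[: -(len(file_extension) + 1)]
stem, path_frame_id = frame_name.split("_")
print(stem, int(path_frame_id), file_extension)  # -> frame 10 jpg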
def ls(path: str) -> List[str]:
    # Delegate to the registered IOPathManager if one is set up; otherwise
    # fall back to listing the local filesystem.
    if IOPathManager:
        return IOPathManager.ls(path)
    return os.listdir(path)
if __name__ == "__main__":
    """
    Example usage:

    python extra_scripts/convert_folder_to_filelist.par \
        -i "manifold://ssl_framework/tree/datasets/food_101/" \
        -o "manifold://ssl_framework/tree/datasets/food_101/"
    """
    args = get_argument_parser().parse_args()
    setup_path_manager()

    ground_truth_splits = ["train", "trainval", "val", "test"]
    available_splits = g_pathmgr.ls(args.input)
    dataset_summary = {}
    if not any(split in available_splits for split in ground_truth_splits):
        # The dataset doesn't have any splits, so we just read it as is.
        print("Dataset has no splits...")
        dataset_summary, img_paths, img_labels = get_filelist_labels_images_paths(
            args.input
        )
        out_image_filepath = os.path.join(args.output, "images.npy")
        out_label_filepath = os.path.join(args.output, "labels.npy")
        save_img_labels_filelist(
            img_paths, img_labels, out_image_filepath, out_label_filepath
        )
    else:
        for split in ["train", "trainval", "val", "test"]:
            if not g_pathmgr.exists(os.path.join(args.input, split)):