def save_checkpoint(path_to_job, model, optimizer, epoch, cfg):
    """
    Save a checkpoint.
    Args:
        path_to_job (string): the path to the folder of the current job.
        model (model): model whose weights are saved to the checkpoint.
        optimizer (optim): optimizer whose historical state is saved.
        epoch (int): current epoch of the model.
        cfg (CfgNode): configs to save.
    """
    # Save checkpoints only from the master process.
    if not du.is_master_proc(cfg.NUM_GPUS * cfg.NUM_SHARDS):
        return
    # Ensure that the checkpoint dir exists.
    g_pathmgr.mkdirs(get_checkpoint_dir(path_to_job))
    # Omit the DDP wrapper in the multi-gpu setting.
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
    normalized_sd = sub_to_normal_bn(sd)
    # Record the state.
    checkpoint = {
        "epoch": epoch,
        "model_state": normalized_sd,
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint.
    path_to_checkpoint = get_path_to_checkpoint(path_to_job, epoch + 1)
    with g_pathmgr.open(path_to_checkpoint, "wb") as f:
        torch.save(checkpoint, f)
    return path_to_checkpoint
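# A minimal, self-contained variant of the save pattern above (assumptions:
# local filesystem only, no iopath/g_pathmgr, no BN state renaming). This is a
# sketch for illustration, not the project's actual helper.
import os
import torch

def save_checkpoint_simple(out_dir, model, optimizer, epoch):
    # Unwrap the DistributedDataParallel container, if present.
    raw_model = model.module if hasattr(model, "module") else model
    os.makedirs(out_dir, exist_ok=True)
    checkpoint = {
        "epoch": epoch,
        "model_state": raw_model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }
    path = os.path.join(out_dir, "checkpoint_epoch_{:05d}.pyth".format(epoch + 1))
    with open(path, "wb") as f:
        torch.save(checkpoint, f)
    return path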
def makedir(dir_path):
    """
    Create the directory if it does not exist.
    """
    is_success = False
    try:
        if not g_pathmgr.exists(dir_path):
            g_pathmgr.mkdirs(dir_path)
        is_success = True
    except BaseException:
        logging.info(f"Error creating directory: {dir_path}")
    return is_success
def make_checkpoint_dir(path_to_job):
    """
    Creates the checkpoint directory (if not present already).
    Args:
        path_to_job (string): the path to the folder of the current job.
    """
    checkpoint_dir = os.path.join(path_to_job, "checkpoints")
    # Create the checkpoint dir from the master process.
    if du.is_master_proc() and not g_pathmgr.exists(checkpoint_dir):
        try:
            g_pathmgr.mkdirs(checkpoint_dir)
        except Exception:
            pass
    return checkpoint_dir
def load_cfg_fom_args(description="Config options."):
    """Load config from command line args and set any specified options."""
    current_time = datetime.now().strftime("%y%m%d_%H%M%S")
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--cfg", dest="cfg_file", type=str, required=True,
                        help="Config file location")
    parser.add_argument("opts", default=None, nargs=argparse.REMAINDER,
                        help="See conf.py for all options")
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    merge_from_file(args.cfg_file)
    cfg.merge_from_list(args.opts)

    log_dest = os.path.basename(args.cfg_file)
    log_dest = log_dest.replace('.yaml', '_{}.txt'.format(current_time))

    g_pathmgr.mkdirs(cfg.SAVE_DIR)
    cfg.LOG_TIME, cfg.LOG_DEST = current_time, log_dest
    cfg.freeze()

    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s] [%(filename)s: %(lineno)4d]: %(message)s",
        datefmt="%y/%m/%d %H:%M:%S",
        handlers=[
            logging.FileHandler(os.path.join(cfg.SAVE_DIR, cfg.LOG_DEST)),
            logging.StreamHandler()
        ])

    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    random.seed(cfg.RNG_SEED)
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK

    logger = logging.getLogger(__name__)
    version = [torch.__version__, torch.version.cuda,
               torch.backends.cudnn.version()]
    logger.info(
        "PyTorch Version: torch={}, cuda={}, cudnn={}".format(*version))
    logger.info(cfg)
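# Hypothetical entry point (assumes the surrounding module is the repo's
# conf.py, exposing the global `cfg`), invoked e.g. as:
#   python main.py --cfg cfgs/example.yaml SOME.KEY value
if __name__ == "__main__":
    load_cfg_fom_args(description="Example run.")
    # cfg is now frozen, seeds are fixed, and logging writes both to
    # os.path.join(cfg.SAVE_DIR, cfg.LOG_DEST) and to the console.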
def setup_seed(rank):
    """Sets up environment for training or testing."""
    if rank == 0:
        g_pathmgr.mkdirs(cfg.OUT_DIR)
        config.dump_cfg()
    if cfg.RNG_SEED:
        np.random.seed(cfg.RNG_SEED + rank)
        torch.manual_seed(cfg.RNG_SEED + rank)
        random.seed(cfg.RNG_SEED + rank)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    else:
        torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
        torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
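# Note on the seeding above: offsetting cfg.RNG_SEED by `rank` gives each
# distributed process a distinct but reproducible RNG stream (useful for
# per-process data augmentation), while cudnn.deterministic=True trades
# kernel-selection speed for run-to-run reproducibility.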
def create_sun397_disk_filelist_dataset(input_path: str, output_path: str, seed: int):
    """
    Create the partitions "train", "trainval", "val" and "test" from the input
    path of SUN397 by allocating 70% of the samples of each class to "train",
    10% to "val" and 20% to "test".
    """
    random.seed(seed)
    g_pathmgr.mkdirs(output_path)

    # List all the available classes in SUN397 and their paths.
    image_folder = os.path.join(input_path, "SUN397")
    class_names_file = os.path.join(image_folder, "ClassName.txt")
    class_paths = []
    with open(class_names_file, "r") as f:
        for line in f:
            path = line.strip()
            if path.startswith("/"):
                path = path[1:]
            class_paths.append(path)

    # For each label, split the samples in train/val/test and add them
    # to the list of samples associated to each split.
    splits_data = {
        "train": SplitData(),
        "val": SplitData(),
        "test": SplitData(),
        "trainval": SplitData(),
    }
    for i, class_path in tqdm(enumerate(class_paths), total=len(class_paths)):
        full_class_path = os.path.join(image_folder, class_path)
        image_names = os.listdir(full_class_path)
        splits = split_sample_list(image_names)
        for split, images in splits.items():
            for image_name in images:
                image_path = os.path.join(full_class_path, image_name)
                splits_data[split].image_paths.append(image_path)
                splits_data[split].image_labels.append(i)

    # Save each split.
    for split, samples in splits_data.items():
        image_output_path = os.path.join(output_path, f"{split}_images.npy")
        with g_pathmgr.open(image_output_path, mode="wb") as f:
            np.save(f, np.array(samples.image_paths))
        label_output_path = os.path.join(output_path, f"{split}_labels.npy")
        with g_pathmgr.open(label_output_path, mode="wb") as f:
            np.save(f, np.array(samples.image_labels))
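# `split_sample_list` is called above but not shown here. A minimal sketch
# consistent with the docstring (70% train / 10% val / 20% test, with
# "trainval" assumed to combine train and val); the actual implementation in
# the source may differ.
import random

def split_sample_list(image_names):
    names = list(image_names)
    random.shuffle(names)
    n_train = int(0.7 * len(names))
    n_val = int(0.1 * len(names))
    train = names[:n_train]
    val = names[n_train:n_train + n_val]
    test = names[n_train + n_val:]
    return {"train": train, "val": val, "test": test, "trainval": train + val}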
def cache_url(url_or_file, cache_dir, base_url=_PYCLS_BASE_URL):
    """Download the file specified by the URL to the cache_dir and return the
    path to the cached file. If the argument is not a URL, simply return it as is.
    """
    is_url = re.match(r"^(?:http)s?://", url_or_file, re.IGNORECASE) is not None
    if not is_url:
        return url_or_file
    url = url_or_file
    assert url.startswith(base_url), "url must start with: {}".format(base_url)
    cache_file_path = url.replace(base_url, cache_dir)
    if g_pathmgr.exists(cache_file_path):
        return cache_file_path
    cache_file_dir = os.path.dirname(cache_file_path)
    if not g_pathmgr.exists(cache_file_dir):
        g_pathmgr.mkdirs(cache_file_dir)
    logger.info("Downloading remote file {} to {}".format(url, cache_file_path))
    download_url(url, cache_file_path)
    return cache_file_path
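# `download_url` is assumed to stream the remote file to disk. A minimal
# standard-library stand-in (the project's own helper may add a progress bar
# or retries):
import urllib.request

def download_url(url, dst_file_path, chunk_size=8192):
    with urllib.request.urlopen(url) as response, open(dst_file_path, "wb") as f:
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            f.write(chunk)
    return dst_file_path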
def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_master_proc():
        # Ensure that the output dir exists
        g_pathmgr.mkdirs(cfg.OUT_DIR)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log torch, cuda, and cudnn versions
    version = [torch.__version__, torch.version.cuda, torch.backends.cudnn.version()]
    logger.info("PyTorch Version: torch={}, cuda={}, cudnn={}".format(*version))
    # Log the config as both human readable and as a json
    if cfg.VERBOSE:
        logger.info("Config:\n{}".format(cfg))
    logger.info(logging.dump_log_data(cfg, "cfg", None))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    random.seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def save_checkpoint(model, optimizer, epoch, best_acc1, best):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if torch.distributed.get_rank() != 0:
        return
    # Ensure that the checkpoint dir exists
    g_pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "best_acc1": best_acc1,
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    # If best, copy the checkpoint to the best checkpoint
    if best:
        shutil.copyfile(checkpoint_file, os.path.join(cfg.OUT_DIR, "best.pth.tar"))
    return checkpoint_file
def adapt_train_database_extract_config(config, checkpoint_folder):
    """
    Adapt the config for extracting features of the retrieval database
    (TEST split) and, if PCA whitening is enabled, of the whitening
    training set (TRAIN split).
    """
    config.DATA.TRAIN.DATA_SOURCES = []
    config.DATA.TRAIN.DATA_PATHS = []
    config.DATA.TRAIN.DATA_LIMIT = -1
    if config.IMG_RETRIEVAL.TRAIN_PCA_WHITENING:
        config.DATA.TRAIN.DATA_SOURCES = ["disk_filelist"]
        config.DATA.TRAIN.DATA_PATHS = [
            f"{config.IMG_RETRIEVAL.DATASET_PATH}/{config.IMG_RETRIEVAL.TRAIN_DATASET_NAME}/train_images.npy"  # NOQA
        ]
    config.DATA.TEST.DATA_SOURCES = ["disk_filelist"]
    if config.IMG_RETRIEVAL.USE_DISTRACTORS:
        config.DATA.TEST.DATA_PATHS = [
            f"{config.IMG_RETRIEVAL.DATASET_PATH}/{config.IMG_RETRIEVAL.EVAL_DATASET_NAME}/database_with_distractors_images.npy"  # NOQA
        ]
    else:
        config.DATA.TEST.DATA_PATHS = [
            f"{config.IMG_RETRIEVAL.DATASET_PATH}/{config.IMG_RETRIEVAL.EVAL_DATASET_NAME}/database_images.npy"  # NOQA
        ]

    output_dir = os.path.join(checkpoint_folder, "train_database")
    g_pathmgr.mkdirs(output_dir)
    config.EXTRACT_FEATURES.OUTPUT_DIR = output_dir

    if config.IMG_RETRIEVAL.DEBUG_MODE:
        config.DATA.TRAIN.DATA_LIMIT = 10
        config.DATA.TEST.DATA_LIMIT = 50

    # Images are all of different sizes.
    config.DATA.TRAIN.BATCHSIZE_PER_REPLICA = 1
    config.DATA.TEST.BATCHSIZE_PER_REPLICA = 1
    config.DATA.TRAIN.TRANSFORMS = get_extract_features_transforms(config)
    config.DATA.TEST.TRANSFORMS = get_extract_features_transforms(config)

    return config
def adapt_query_extract_config(config, checkpoint_folder):
    """
    Adapt the config for extracting features of the retrieval query images
    (TEST split only).
    """
    config.DATA.TRAIN.DATA_SOURCES = []
    config.DATA.TRAIN.DATA_PATHS = []
    config.DATA.TRAIN.DATASET_NAMES = []
    config.DATA.TRAIN.DATA_LIMIT = 0
    config.DATA.TEST.DATA_SOURCES = ["disk_filelist"]
    config.DATA.TEST.DATA_PATHS = [
        f"{config.IMG_RETRIEVAL.DATASET_PATH}/{config.IMG_RETRIEVAL.EVAL_DATASET_NAME}/query_images.npy"  # NOQA
    ]

    output_dir = os.path.join(checkpoint_folder, "query")
    g_pathmgr.mkdirs(output_dir)
    config.EXTRACT_FEATURES.OUTPUT_DIR = output_dir

    if config.IMG_RETRIEVAL.DEBUG_MODE:
        config.DATA.TEST.DATA_LIMIT = 10

    # Images are all of different sizes.
    config.DATA.TEST.BATCHSIZE_PER_REPLICA = 1
    config.DATA.TEST.TRANSFORMS = get_extract_features_transforms(config)

    return config
def save_checkpoint(model, optimizer, epoch, best):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    g_pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with g_pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # If best, copy the checkpoint to the best checkpoint
    if best:
        with g_pathmgr.open(checkpoint_file, "rb") as src:
            with g_pathmgr.open(get_checkpoint_best(), "wb") as dst:
                copyfileobj(src, dst)
    return checkpoint_file
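# Note: the best-checkpoint copy above streams bytes with copyfileobj through
# g_pathmgr.open handles rather than using shutil.copyfile, since iopath paths
# may resolve to non-local storage backends; copyfileobj only needs a pair of
# file-like objects.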
def mkdirs(path: str) -> None:
    """Create the directory (and missing parents), delegating to IOPathManager
    when available."""
    if IOPathManager:
        return IOPathManager.mkdirs(path)
    os.makedirs(path, exist_ok=True)
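# Sketch of how IOPathManager is typically bound for the fallback pattern
# above (an assumption; the exact import in the source may differ):
try:
    from iopath.common.file_io import g_pathmgr as IOPathManager
except ImportError:
    IOPathManager = None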
def _get_output_dir(self, cfg_out_dir):
    odir = f"{cfg_out_dir}/{self.layer}"
    g_pathmgr.mkdirs(odir)
    logging.info(f"Output directory for SVM results: {odir}")
    return odir
def _create_dataset_split(cfg: AttrDict, data_split: str, features_dim: int,
                          kmeans, pca: Optional[PCA] = None):
    """
    Scan the dataset split and create a new classification dataset out of it
    where each image is associated to the closest centroid in feature space.
    """
    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    data_name = cfg.CLUSTERFIT.FEATURES.DATASET_NAME
    layer_name = cfg.CLUSTERFIT.FEATURES.LAYER_NAME
    logging.info(
        f"Computing cluster label assignment for each sample in {data_split}..."
    )
    indices = []
    distances = []
    target_clusters = []
    shard_paths = ExtractedFeaturesLoader.get_shard_file_names(
        input_dir=cfg.CLUSTERFIT.FEATURES.PATH,
        split=data_split.lower(),
        layer=cfg.CLUSTERFIT.FEATURES.LAYER_NAME,
    )
    for shard_path in shard_paths:
        shard_content = ExtractedFeaturesLoader.load_feature_shard(shard_path)
        shard_features = shard_content.features
        # TODO: factorize the normalization below with the training code.
        # Reshape and L2-normalize the loaded features.
        shard_features = shard_features.reshape(shard_features.shape[0], -1)
        shard_features_norm = np.linalg.norm(shard_features, axis=1) + 1e-5
        shard_features = shard_features / shard_features_norm[:, np.newaxis]
        if pca is not None:
            shard_features = pca.transform(shard_features)
            shard_features = np.ascontiguousarray(shard_features)
        shard_distances, shard_cluster_labels = kmeans.index.search(shard_features, 1)
        indices.extend(shard_content.indices)
        distances.extend(shard_distances)
        target_clusters.extend(shard_cluster_labels)

    # Save the clustering data and hard cluster labels for the images.
    logging.info("Saving centroids and cluster assignments to file...")
    dataset_image_paths = get_image_paths(cfg, split=data_split)
    image_paths = [dataset_image_paths[i] for i in indices]
    data_split = data_split.lower()
    clustering_output_dict = {
        "sample_indices": indices,
        "hard_labels": target_clusters,
        "centroids": kmeans.centroids,
        "distances": distances,
        "images": image_paths,
    }
    output_dir = cfg.CLUSTERFIT.OUTPUT_DIR
    g_pathmgr.mkdirs(output_dir)
    output_prefix = (
        f"{data_name}_{data_split}_{layer_name}_N{num_clusters}_D{features_dim}"
    )
    cluster_output_filepath = os.path.join(output_dir, f"{output_prefix}.pkl")
    labels_output_filepath = os.path.join(output_dir, f"{output_prefix}_labels.npy")
    image_path_filepath = os.path.join(output_dir, f"{output_prefix}_images.npy")
    out_images = np.array(image_paths)
    out_hard_labels = np.array(target_clusters, dtype=np.int64).reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_images, image_path_filepath)
    save_file(out_hard_labels, labels_output_filepath)
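# Hypothetical reader for the artifacts written above, assuming `save_file`
# serializes .pkl files with pickle and .npy files with np.save (an assumption
# about its dispatch behavior):
import os
import pickle
import numpy as np

def load_clusterfit_outputs(output_dir, output_prefix):
    with open(os.path.join(output_dir, f"{output_prefix}.pkl"), "rb") as f:
        clustering = pickle.load(f)  # sample indices, hard labels, centroids, ...
    images = np.load(os.path.join(output_dir, f"{output_prefix}_images.npy"))
    labels = np.load(os.path.join(output_dir, f"{output_prefix}_labels.npy"))
    return clustering, images, labels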