def test_logger_files(self) -> None:
    self.assertTrue(
        PathManager.exists(
            glob.glob(os.path.join(self._tmpdir, "logs", "train*"))[0]
        )
    )
    self.assertTrue(PathManager.exists(os.path.join(self._tmpdir, "train.log")))
    self.assertTrue(PathManager.exists(os.path.join(self._tmpdir, "logs")))
def _try_download(self):
    _is_master = is_master()

    if self._already_downloaded:
        return

    needs_download = False
    model_file = getattr(self.config, "model_file", None)

    if model_file is None:
        if _is_master:
            warnings.warn(
                "'model_file' key is required but missing "
                "from FastTextProcessor's config."
            )
        needs_download = True
    else:
        # If model_file is already an existing path, don't join it to the cache dir
        if not PathManager.exists(model_file):
            model_file = os.path.join(get_multimodelity_cache_dir(), model_file)

        if not PathManager.exists(model_file):
            if _is_master:
                warnings.warn(f"No model file present at {model_file}.")
            needs_download = True

    if needs_download:
        logger.info("Downloading FastText bin")
        model_file = self._download_model()

    self.model_file = model_file
    self._already_downloaded = True
    synchronize()
def assert_files(self, folder):
    files_needed = self.JSONL_PHASE_ONE_FILES
    phase_one = True

    for file in files_needed:
        try:
            assert PathManager.exists(
                os.path.join(folder, "data", file)
            ), f"{file} doesn't exist in {folder}"
        except AssertionError:
            phase_one = False

    if not phase_one:
        files_needed = self.JSONL_PHASE_TWO_FILES
        for file in files_needed:
            assert PathManager.exists(
                os.path.join(folder, "data", file)
            ), f"{file} doesn't exist in {folder}"
    else:
        warnings.warn(
            "You are on Phase 1 of the Hateful Memes Challenge. "
            "Please update to Phase 2."
        )

    files_needed = self.IMAGE_FILES
    exists = False

    for file in files_needed:
        exists = exists or PathManager.exists(os.path.join(folder, "data", file))

    if not exists:
        raise AssertionError("Neither img nor img.tar.gz exists in the zip")

    return phase_one
def __init__(self, config: Config, *args, **kwargs):
    super().__init__()
    model_data_dir = get_absolute_path(config.model_data_dir)

    # Resolve relative paths against model_data_dir; absolute paths are kept as-is
    weights_file = config.weights_file
    bias_file = config.bias_file
    if not os.path.isabs(weights_file):
        weights_file = os.path.join(model_data_dir, weights_file)
    if not os.path.isabs(bias_file):
        bias_file = os.path.join(model_data_dir, bias_file)

    if not PathManager.exists(bias_file) or not PathManager.exists(weights_file):
        download_path = download_pretrained_model("detectron.vmb_weights")
        weights_file = get_absolute_path(os.path.join(download_path, "fc7_w.pkl"))
        bias_file = get_absolute_path(os.path.join(download_path, "fc7_b.pkl"))

    with PathManager.open(weights_file, "rb") as w:
        weights = pickle.load(w)
    with PathManager.open(bias_file, "rb") as b:
        bias = pickle.load(b)

    out_dim = bias.shape[0]

    self.lc = nn.Linear(config.in_dim, out_dim)
    self.lc.weight.data.copy_(torch.from_numpy(weights))
    self.lc.bias.data.copy_(torch.from_numpy(bias))
    self.out_dim = out_dim
def test_finalize_and_resume_file(self):
    with mock_env_with_temp() as d:
        checkpoint = Checkpoint(self.trainer)
        self._init_early_stopping(checkpoint)
        self._do_a_pass()
        checkpoint.finalize()
        original = deepcopy(self.trainer.model)

        pth_path = os.path.join(d, "simple_final.pth")
        self.assertTrue(PathManager.exists(pth_path))

        self._do_a_pass()

        after_a_pass = deepcopy(self.trainer.model)
        original_optimizer = deepcopy(self.trainer.optimizer)

        self.trainer.config.checkpoint.resume_file = pth_path

        with contextlib.redirect_stdout(StringIO()):
            checkpoint.load_state_dict()

        self.assertTrue(
            compare_state_dicts(
                self.trainer.model.state_dict(), original.state_dict()
            )
        )
        self.assertFalse(
            compare_state_dicts(
                self.trainer.model.state_dict(), after_a_pass.state_dict()
            )
        )
        self.assertTrue(
            self._compare_optimizers(self.trainer.optimizer, original_optimizer)
        )
def resolve_cache_dir(env_variable="multimodelity_CACHE_DIR", default="multimodelity"):
    # Some of this follows what "transformers" does for its cache resolution
    try:
        from torch.hub import _get_torch_home

        torch_cache_home = _get_torch_home()
    except ImportError:
        torch_cache_home = os.path.expanduser(
            os.getenv(
                "TORCH_HOME",
                os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"),
            )
        )
    default_cache_path = os.path.join(torch_cache_home, default)

    cache_path = os.getenv(env_variable, default_cache_path)

    if not PathManager.exists(cache_path):
        try:
            PathManager.mkdirs(cache_path)
        except PermissionError:
            cache_path = os.path.join(get_multimodelity_root(), ".multimodelity_cache")
            PathManager.mkdirs(cache_path)

    return cache_path
def setup_output_folder(folder_only: bool = False):
    """Sets up and returns the output file where the logs will be placed
    based on the configuration passed. Usually "save_dir/logs/train_<timestamp>.log".
    If env.log_dir is passed, logs will be directly saved in this folder.

    Args:
        folder_only (bool, optional): If folder should be returned and not the file.
            Defaults to False.

    Returns:
        str: folder or file path depending on folder_only flag
    """
    save_dir = get_multimodelity_env(key="save_dir")
    time_format = "%Y_%m_%dT%H_%M_%S"
    log_filename = "train_"
    log_filename += Timer().get_time_hhmmss(None, format=time_format)
    log_filename += ".log"

    log_folder = os.path.join(save_dir, "logs")

    env_log_dir = get_multimodelity_env(key="log_dir")
    if env_log_dir:
        log_folder = env_log_dir

    if not PathManager.exists(log_folder):
        PathManager.mkdirs(log_folder)

    if folder_only:
        return log_folder

    log_filename = os.path.join(log_folder, log_filename)

    return log_filename
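# Illustrative usage sketch (not part of the original source): how a trainer
# typically calls setup_output_folder to decide where logging output goes.
# The returned paths shown in comments assume default env settings.
def _example_setup_logging():
    log_file = setup_output_folder()  # e.g. "<save_dir>/logs/train_<timestamp>.log"
    log_folder = setup_output_folder(folder_only=True)  # e.g. "<save_dir>/logs"
    return log_file, log_folder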
def restore(self):
    synchronize()
    logger.info("Restoring checkpoint")
    best_path = os.path.join(self.ckpt_foldername, self.ckpt_prefix + "best.ckpt")

    if PathManager.exists(best_path):
        self._load(best_path, force=True)
def resolve_dir(env_variable, default="data"):
    default_dir = os.path.join(resolve_cache_dir(), default)
    dir_path = os.getenv(env_variable, default_dir)

    if not PathManager.exists(dir_path):
        PathManager.mkdirs(dir_path)

    return dir_path
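# Illustrative sketch (not part of the original source): how resolve_cache_dir
# and resolve_dir compose. resolve_dir honors the env variable if set, else it
# creates "<cache_dir>/<default>". The env variable name below is hypothetical.
def _example_resolve_dirs():
    cache_dir = resolve_cache_dir()  # e.g. "~/.cache/torch/multimodelity"
    data_dir = resolve_dir("multimodelity_DATA_DIR")  # "<cache_dir>/data" unless overridden
    return cache_dir, data_dir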
def load_state_dict(self):
    ckpt_config = self.config.checkpoint

    suffix = "best.ckpt" if ckpt_config.resume_best else "current.ckpt"
    reverse_suffix = "best.ckpt" if not ckpt_config.resume_best else "current.ckpt"
    ckpt_filepath = os.path.join(self.ckpt_foldername, self.ckpt_prefix + suffix)

    # On an interrupt-and-resume, ckpt_config.resume_file may be set. But if
    # checkpoints were already created in the save dir and resume is true
    # (signifying an interrupt resume), skip loading the resume file and use
    # those checkpoints instead.
    if (
        ckpt_config.resume_file is not None or ckpt_config.resume_zoo is not None
    ) and (not ckpt_config.resume or not PathManager.exists(ckpt_filepath)):
        if ckpt_config.resume_file and PathManager.exists(ckpt_config.resume_file):
            self._load(
                ckpt_config.resume_file,
                load_pretrained=ckpt_config.resume_pretrained,
            )
            return
        # resume_file doesn't exist, try the zoo now
        elif ckpt_config.resume_zoo is not None:
            self._load(
                ckpt_config.resume_zoo,
                load_zoo=True,
                load_pretrained=ckpt_config.resume_pretrained,
            )
            return
        else:
            raise RuntimeError(f"{ckpt_config.resume_file} doesn't exist")

    if ckpt_config.resume:
        if PathManager.exists(ckpt_filepath):
            self._load(ckpt_filepath)
        else:
            warnings.warn(
                "Tried to resume but checkpoint filepath {} "
                "is not present. Trying {}, otherwise skipping.".format(
                    ckpt_filepath, reverse_suffix
                )
            )
            ckpt_filepath = ckpt_filepath.replace(suffix, reverse_suffix)
            if PathManager.exists(ckpt_filepath):
                self._load(ckpt_filepath)
def __init__(self, vocab_file, data_dir=None):
    if not os.path.isabs(vocab_file) and data_dir is not None:
        vocab_file = get_absolute_path(os.path.join(data_dir, vocab_file))

    if not PathManager.exists(vocab_file):
        raise RuntimeError(f"Vocab file {vocab_file} for vocab dict doesn't exist")

    self.word_list = load_str_list(vocab_file)
    self._build()
def get_default_config_path():
    directory = os.path.dirname(os.path.abspath(__file__))
    configs_dir = os.path.join(directory, "..", "configs")

    # Check for fb defaults
    fb_defaults = os.path.join(configs_dir, "fb_defaults.yaml")
    if PathManager.exists(fb_defaults):
        return fb_defaults
    else:
        return os.path.join(configs_dir, "defaults.yaml")
def load_yaml(f):
    # Convert to absolute path for loading includes
    abs_f = get_absolute_path(f)

    try:
        mapping = OmegaConf.load(abs_f)
        f = abs_f
    except FileNotFoundError as e:
        # Check if this file might be relative to root?
        # TODO: Later test if this can be removed
        relative = os.path.abspath(os.path.join(get_multimodelity_root(), f))
        if not PathManager.isfile(relative):
            raise e
        else:
            f = relative
            mapping = OmegaConf.load(f)

    if mapping is None:
        mapping = OmegaConf.create()

    includes = mapping.get("includes", [])

    if not isinstance(includes, collections.abc.Sequence):
        raise AttributeError(
            "Includes must be a list, {} provided".format(type(includes))
        )

    include_mapping = OmegaConf.create()

    multimodelity_root_dir = get_multimodelity_root()

    for include in includes:
        original_include_path = include
        include = os.path.join(multimodelity_root_dir, include)

        # If the path doesn't exist relative to multimodelity root,
        # try relative to the current file
        if not PathManager.exists(include):
            include = os.path.join(os.path.dirname(f), original_include_path)

        current_include_mapping = load_yaml(include)
        include_mapping = OmegaConf.merge(include_mapping, current_include_mapping)

    mapping.pop("includes", None)

    mapping = OmegaConf.merge(include_mapping, mapping)

    return mapping
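# Illustrative sketch (not part of the original source) of load_yaml's include
# semantics: included files are merged first, then the including file's own
# keys override them. The file names and keys below are hypothetical.
#
# base.yaml:                experiment.yaml:
#   training:                 includes:
#     batch_size: 32            - base.yaml
#     lr: 0.01                training:
#                               lr: 0.001
def _example_load_yaml_includes():
    config = load_yaml("experiment.yaml")
    # config.training.batch_size == 32 (inherited from base.yaml)
    # config.training.lr == 0.001 (overridden by experiment.yaml)
    return config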
def __init__(self, vocab_file, embedding_file, data_dir=None, *args, **kwargs):
    """Use this vocab class when you have a custom vocabulary as well as a
    custom embeddings file. This inherits the base vocab class, so you get
    the predefined tokens with this one.

    IMPORTANT: To init your embedding, get your vectors from this class's
    object by calling the `get_vectors` function

    Parameters
    ----------
    vocab_file : str
        Path of custom vocabulary
    embedding_file : str
        Path to custom embedding initialization file
    data_dir : str
        Path to data directory if embedding file is not an absolute path.
        Default: None
    """
    super().__init__(vocab_file)
    self.type = "custom"

    if not os.path.isabs(embedding_file) and data_dir is not None:
        embedding_file = os.path.join(data_dir, embedding_file)
        embedding_file = get_absolute_path(embedding_file)

    if not PathManager.exists(embedding_file):
        raise RuntimeError(f"Embedding file path {embedding_file} doesn't exist")

    embedding_vectors = torch.from_numpy(np.load(embedding_file))

    self.vectors = torch.FloatTensor(self.get_size(), len(embedding_vectors[0]))

    # The first four vectors belong to the predefined tokens
    for i in range(0, 4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    for i in range(4, self.get_size()):
        self.vectors[i] = embedding_vectors[i - 4]
def _load_pretrained_model(model_name_or_path, *args, **kwargs):
    if PathManager.exists(model_name_or_path):
        download_path = model_name_or_path
        model_name = model_name_or_path
    else:
        download_path = download_pretrained_model(model_name_or_path, *args, **kwargs)
        model_name = model_name_or_path

    configs = glob.glob(os.path.join(download_path, "*.yaml"))
    assert len(configs) <= 1, (
        "Multiple yaml files with the pretrained model. "
        + "multimodelity doesn't know what to do."
    )

    ckpts = []
    allowed_ckpt_types = [f"*{ext}" for ext in ALLOWED_CHECKPOINT_EXTS]
    for ckpt_type in allowed_ckpt_types:
        ckpts.extend(glob.glob(os.path.join(download_path, ckpt_type)))

    assert (
        len(ckpts) == 1
    ), "None or multiple checkpoint files. multimodelity doesn't know what to do."

    _hack_imports()

    with PathManager.open(ckpts[0], "rb") as f:
        ckpt = torch.load(f, map_location=lambda storage, loc: storage)

    # If no config file is present, the checkpoint itself must carry the config
    if len(configs) == 0:
        assert "config" in ckpt, (
            "No configs provided with pretrained model"
            " while checkpoint also doesn't have configuration."
        )
        config = ckpt["config"]
    else:
        config = load_yaml(configs[0])

    model_config = config.get("model_config", config)
    ckpt = ckpt.get("model", ckpt)

    # Also handle the case where model_name is a path
    model_config = model_config.get(model_name.split(os.path.sep)[-1].split(".")[0])

    return {"config": model_config, "checkpoint": ckpt, "full_config": config}
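# Illustrative sketch (not part of the original source): _load_pretrained_model
# accepts either an existing local directory or a zoo key; the paths and the
# zoo key below are hypothetical. The directory must hold at most one *.yaml
# and exactly one checkpoint matching ALLOWED_CHECKPOINT_EXTS.
def _example_load_pretrained():
    # From a local folder that already contains config and checkpoint:
    local = _load_pretrained_model("/path/to/downloaded/model")
    # From the model zoo (triggers a download first):
    zoo = _load_pretrained_model("some_model.some_variant")
    return local["config"], zoo["checkpoint"]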
def _download_model(self):
    _is_master = is_master()

    model_file_path = os.path.join(get_multimodelity_cache_dir(), "wiki.en.bin")

    if not _is_master:
        return model_file_path

    if PathManager.exists(model_file_path):
        logger.info(f"Vectors already present at {model_file_path}.")
        return model_file_path

    import requests
    from tqdm import tqdm

    from multimodelity.common.constants import FASTTEXT_WIKI_URL

    PathManager.mkdirs(os.path.dirname(model_file_path))
    response = requests.get(FASTTEXT_WIKI_URL, stream=True)

    with PathManager.open(model_file_path, "wb") as f:
        pbar = tqdm(
            total=int(response.headers["Content-Length"]) / 4096,
            miniters=50,
            disable=not _is_master,
        )

        idx = 0
        for data in response.iter_content(chunk_size=4096):
            if data:
                if idx % 50 == 0:
                    pbar.update(len(data))
                f.write(data)
                idx += 1

        pbar.close()

    logger.info(f"fastText bin downloaded at {model_file_path}.")

    return model_file_path
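# Illustrative sketch (not part of the original source): the distributed
# pattern _try_download/_download_model implement together. Only the master
# rank downloads; non-master ranks return the expected path immediately and
# every rank then meets at the synchronize() barrier before using the file.
def _example_distributed_download(processor):
    processor._try_download()  # master downloads, others just compute the path
    return processor.model_file  # safe to read on all ranks after synchronize()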
def get_absolute_path(paths):
    # String check should be first as Sequence would pass for string too
    if isinstance(paths, str):
        # If path is absolute return it directly
        if os.path.isabs(paths):
            return paths

        possible_paths = [
            # Direct path
            paths
        ]
        # Now, try relative to user_dir if it exists
        from multimodelity.utils.configuration import get_multimodelity_env

        user_dir = get_multimodelity_env(key="user_dir")
        if user_dir:
            possible_paths.append(os.path.join(user_dir, paths))

        multimodelity_root = get_multimodelity_root()
        # Relative to root folder of multimodelity install
        possible_paths.append(os.path.join(multimodelity_root, "..", paths))
        # Relative to multimodelity root
        possible_paths.append(os.path.join(multimodelity_root, paths))

        # Test all these paths, if any exists return
        for path in possible_paths:
            if PathManager.exists(path):
                # URIs
                if path.find("://") == -1:
                    return os.path.abspath(path)
                else:
                    return path

        # If nothing works, return original path so that it throws an error
        return paths
    elif isinstance(paths, collections.abc.Iterable):
        return [get_absolute_path(path) for path in paths]
    else:
        raise TypeError(
            "Paths passed to dataset should either be string or list"
        )
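# Illustrative sketch (not part of the original source): the resolution order
# get_absolute_path tries for a relative path. The paths below are hypothetical.
def _example_get_absolute_path():
    # Tried in order: the path itself, <user_dir>/<path>,
    # <multimodelity_root>/../<path>, <multimodelity_root>/<path>;
    # the first that exists wins, otherwise the input is returned unchanged.
    single = get_absolute_path("configs/defaults.yaml")
    many = get_absolute_path(["a.yaml", "b.yaml"])  # lists resolve element-wise
    return single, many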
def __init__(self, trainer):
    """
    Generates a path for saving model which can also be used for resuming
    from a checkpoint.
    """
    self.trainer = trainer

    self.config = self.trainer.config
    self.save_dir = get_multimodelity_env(key="save_dir")
    self.model_name = self.config.model

    self.ckpt_foldername = self.save_dir

    self.device = get_current_device()

    self.ckpt_prefix = ""

    if hasattr(self.trainer.model, "get_ckpt_name"):
        self.ckpt_prefix = self.trainer.model.get_ckpt_name() + "_"

    self.pth_filepath = os.path.join(
        self.ckpt_foldername, self.ckpt_prefix + self.model_name + "_final.pth"
    )

    self.models_foldername = os.path.join(self.ckpt_foldername, "models")
    if not PathManager.exists(self.models_foldername):
        PathManager.mkdirs(self.models_foldername)

    self.save_config()

    self.repo_path = updir(os.path.abspath(__file__), n=3)
    self.git_repo = None
    if git and self.config.checkpoint.save_git_details:
        try:
            self.git_repo = git.Repo(self.repo_path)
        except git.exc.InvalidGitRepositoryError:
            # Not a git repo, don't do anything
            pass

    self.max_to_keep = self.config.checkpoint.max_to_keep
    self.saved_iterations = []
def __init__(self, vocab_file=None, embedding_dim=300, data_dir=None, *args, **kwargs):
    """Vocab class to be used when you want to train word embeddings from
    scratch based on a custom vocab. It initializes random vectors for the
    vocabulary you pass. Get the vectors using the `get_vectors` function.
    This will also create random embeddings for some predefined words like
    PAD - <pad>, SOS - <s>, EOS - </s>, UNK - <unk>.

    Parameters
    ----------
    vocab_file : str
        Path of the vocabulary file containing one word per line
    embedding_dim : int
        Size of the embedding
    """
    self.type = "base"
    self.word_dict = {}
    self.itos = {}

    self.itos[self.PAD_INDEX] = self.PAD_TOKEN
    self.itos[self.SOS_INDEX] = self.SOS_TOKEN
    self.itos[self.EOS_INDEX] = self.EOS_TOKEN
    self.itos[self.UNK_INDEX] = self.UNK_TOKEN

    self.word_dict[self.SOS_TOKEN] = self.SOS_INDEX
    self.word_dict[self.EOS_TOKEN] = self.EOS_INDEX
    self.word_dict[self.PAD_TOKEN] = self.PAD_INDEX
    self.word_dict[self.UNK_TOKEN] = self.UNK_INDEX

    index = len(self.itos.keys())

    self.total_predefined = len(self.itos.keys())

    if vocab_file is not None:
        if not os.path.isabs(vocab_file) and data_dir is not None:
            vocab_file = os.path.join(data_dir, vocab_file)
            vocab_file = get_absolute_path(vocab_file)

        if not PathManager.exists(vocab_file):
            raise RuntimeError("Vocab not found at " + vocab_file)

        with PathManager.open(vocab_file, "r") as f:
            for line in f:
                self.itos[index] = line.strip()
                self.word_dict[line.strip()] = index
                index += 1

    self.word_dict[self.SOS_TOKEN] = self.SOS_INDEX
    self.word_dict[self.EOS_TOKEN] = self.EOS_INDEX
    self.word_dict[self.PAD_TOKEN] = self.PAD_INDEX
    self.word_dict[self.UNK_TOKEN] = self.UNK_INDEX

    # Return unk index by default
    self.stoi = defaultdict(self.get_unk_index)
    self.stoi.update(self.word_dict)

    self.vectors = torch.FloatTensor(self.get_size(), embedding_dim)
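# Illustrative sketch (not part of the original source): lookups on the
# vocab's stoi mapping never raise; unknown words fall back to the UNK index
# via the defaultdict. The words below are hypothetical.
def _example_vocab_lookup(vocab):
    known = vocab.stoi["the"]  # index from the vocab file, if present
    unknown = vocab.stoi["definitely-not-in-vocab"]  # == vocab.get_unk_index()
    return known, unknown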
def convert(self):
    config = self.configuration.get_config()
    data_dir = config.env.data_dir

    if self.args.multimodelity_data_folder:
        data_dir = self.args.multimodelity_data_folder

    bypass_checksum = False
    if self.args.bypass_checksum:
        bypass_checksum = bool(self.args.bypass_checksum)

    print(f"Data folder is {data_dir}")
    print(f"Zip path is {self.args.zip_file}")

    base_path = os.path.join(data_dir, "datasets", "hateful_memes", "defaults")

    images_path = os.path.join(base_path, "images")
    PathManager.mkdirs(images_path)

    move_dir = False
    if self.args.move:
        move_dir = bool(self.args.move)

    if not bypass_checksum:
        self.checksum(self.args.zip_file, self.POSSIBLE_CHECKSUMS)

    src = self.args.zip_file
    dest = images_path
    if move_dir:
        print(f"Moving {src}")
        move(src, dest)
    else:
        print(f"Copying {src}")
        copy(src, dest)

    print(f"Unzipping {src}")
    self.decompress_zip(
        dest, fname=os.path.basename(src), password=self.args.password
    )

    phase_one = self.assert_files(images_path)

    annotations_path = os.path.join(base_path, "annotations")
    PathManager.mkdirs(annotations_path)
    annotations = (
        self.JSONL_PHASE_ONE_FILES
        if phase_one is True
        else self.JSONL_PHASE_TWO_FILES
    )

    for annotation in annotations:
        print(f"Moving {annotation}")
        src = os.path.join(images_path, "data", annotation)
        dest = os.path.join(annotations_path, annotation)
        move(src, dest)

    images = self.IMAGE_FILES

    for image_file in images:
        src = os.path.join(images_path, "data", image_file)
        if PathManager.exists(src):
            print(f"Moving {image_file}")
        else:
            continue
        dest = os.path.join(images_path, image_file)
        move(src, dest)
        if src.endswith(".tar.gz"):
            decompress(dest, fname=image_file, delete_original=False)
def test_save_and_load_state_dict(self):
    with mock_env_with_temp() as d:
        checkpoint = Checkpoint(self.trainer)
        self._init_early_stopping(checkpoint)
        self._do_a_pass()
        # Test normal case
        checkpoint.save(1500)

        self.assertTrue(
            PathManager.exists(os.path.join(d, "models", "model_1500.ckpt"))
        )
        self.assertTrue(PathManager.exists(os.path.join(d, "current.ckpt")))
        self.assertFalse(PathManager.exists(os.path.join(d, "best.ckpt")))
        os.remove(os.path.join(d, "models", "model_1500.ckpt"))
        os.remove(os.path.join(d, "current.ckpt"))

        best_model = deepcopy(self.trainer.model)
        best_optimizer = deepcopy(self.trainer.optimizer)

        # Test with update_best
        checkpoint.save(2000, update_best=True)

        self.assertTrue(
            PathManager.exists(os.path.join(d, "models", "model_2000.ckpt"))
        )
        self.assertTrue(PathManager.exists(os.path.join(d, "best.ckpt")))
        self.assertTrue(PathManager.exists(os.path.join(d, "current.ckpt")))

        self._do_a_pass()
        checkpoint.save(2500)

        # Test resume
        self.trainer.config.checkpoint.resume = True

        current_model = deepcopy(self.trainer.model)
        current_optimizer = deepcopy(self.trainer.optimizer)
        checkpoint.load_state_dict()

        self.assertFalse(
            compare_state_dicts(
                self.trainer.model.state_dict(), best_model.state_dict()
            )
        )
        self.assertTrue(
            compare_state_dicts(
                self.trainer.model.state_dict(), current_model.state_dict()
            )
        )
        self.assertFalse(
            self._compare_optimizers(self.trainer.optimizer, best_optimizer)
        )
        self.assertTrue(
            self._compare_optimizers(self.trainer.optimizer, current_optimizer)
        )

        base_0_weight_current = self.trainer.model.base[0].weight.data.clone()

        # Test resume_best
        self.trainer.config.checkpoint.resume = True
        self.trainer.config.checkpoint.resume_best = True

        checkpoint.load_state_dict()

        self.assertTrue(
            compare_state_dicts(
                self.trainer.model.state_dict(), best_model.state_dict()
            )
        )
        self.assertTrue(
            self._compare_optimizers(self.trainer.optimizer, best_optimizer)
        )
        self.assertFalse(
            self._compare_optimizers(self.trainer.optimizer, current_optimizer)
        )

        base_0_weight_best = self.trainer.model.base[0].weight.data.clone()

        self.trainer.config.checkpoint.resume_best = False

        # Test distributed settings
        self.trainer.model = torch.nn.DataParallel(self.trainer.model)
        checkpoint.load_state_dict()

        weight_to_be_tested = self.trainer.model.module.base[0].weight
        weight_device = weight_to_be_tested.device

        self.assertTrue(
            torch.equal(
                weight_to_be_tested, base_0_weight_current.to(weight_device)
            )
        )
        self.assertFalse(
            torch.equal(weight_to_be_tested, base_0_weight_best.to(weight_device))
        )
def test_file_io_exists(self):
    self.assertEqual(
        PathManager.exists(self._tmpfile), os.path.exists(self._tmpfile)
    )
    fake_path = os.path.join(self._tmpdir, uuid.uuid4().hex)
    self.assertEqual(PathManager.exists(fake_path), os.path.exists(fake_path))