def _get_upstream(self):
    """Instantiate the upstream model and register it through `self._init_model`.

    The model class comes from one of two places:

    - if `self.args` carries `from_hf_hub` and it equals `True`, the repo named by
      `self.args.upstream` is downloaded from the Hugging Face Hub, its folder is
      appended to `sys.path` and `UpstreamExpert` is imported from its `expert` module;
    - otherwise the class is looked up by name on `hub`.

    Returns:
        Whatever `self._init_model` returns for the constructed model.
    """
    # NOTE: the explicit `== True` comparison (rather than plain truthiness) is kept on purpose.
    wants_hub_download = "from_hf_hub" in self.args and self.args.from_hf_hub == True  # noqa: E712

    if not wants_hub_download:
        upstream_cls = getattr(hub, self.args.upstream)
        checkpoint = self.args.upstream_ckpt
    else:
        from huggingface_hub import snapshot_download

        print(f'[Runner] - Downloading upstream model {self.args.upstream} from the Hugging Face Hub')
        repo_dir = snapshot_download(self.args.upstream, use_auth_token=True)
        # The downloaded repo ships its own `expert` module; make it importable.
        sys.path.append(repo_dir)
        from expert import UpstreamExpert

        upstream_cls = UpstreamExpert
        checkpoint = os.path.join(repo_dir, self.args.upstream_model_name)

    refresh = self.args.upstream_refresh

    # Ranks > 0 block here until rank 0 reaches its own barrier below, i.e. until
    # rank 0 has finished constructing the model; they then never refresh.
    if is_initialized() and get_rank() > 0:
        torch.distributed.barrier()
        refresh = False

    upstream_model = upstream_cls(
        ckpt=checkpoint,
        model_config=self.args.upstream_model_config,
        refresh=refresh,
    )
    upstream_model = upstream_model.to(self.args.device)

    # Rank 0 releases the waiting ranks once its model is built.
    if is_initialized() and get_rank() == 0:
        torch.distributed.barrier()

    return self._init_model(
        model=upstream_model,
        name='Upstream',
        trainable=self.args.upstream_trainable,
        interfaces=["get_downsample_rates"],
    )
def check_download_model_with_regex(self, regex, allow=True):
    """Download the `main` branch filtered by `regex` and verify the snapshot contents.

    Args:
        regex: Pattern(s) forwarded to `snapshot_download`.
        allow: When true, `regex` is passed as `allow_regex`; otherwise it is
            passed as `ignore_regex`.
    """
    # Test `main` branch
    allow_regex = regex if allow else None
    ignore_regex = None if allow else regex

    with tempfile.TemporaryDirectory() as tmpdirname:
        storage_folder = snapshot_download(
            f"{USER}/{REPO_NAME}",
            revision="main",
            cache_dir=tmpdirname,
            allow_regex=allow_regex,
            ignore_regex=ignore_regex,
        )

        # folder contains only the two files contributed; the .gitattributes
        # must have been filtered out by the regex
        folder_contents = os.listdir(storage_folder)
        self.assertEqual(len(folder_contents), 2)
        self.assertIn("dummy_file.txt", folder_contents)
        self.assertIn("dummy_file_2.txt", folder_contents)
        self.assertNotIn(".gitattributes", folder_contents)

        with open(os.path.join(storage_folder, "dummy_file.txt"), "r") as f:
            contents = f.read()
        self.assertEqual(contents, "v2")

        # folder name contains the revision's commit sha.
        self.assertIn(self.second_commit_hash, storage_folder)
def load_model_ensemble_and_task_from_hf_hub(
    model_id,
    cache_dir: Optional[str] = None,
    arg_overrides: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
):
    """Download a model repo from the Hugging Face Hub and load its checkpoints.

    Args:
        model_id: Hub repo identifier passed to `snapshot_download`.
        cache_dir: Directory used as download cache; defaults to
            `~/.cache/fairseq` when not given.
        arg_overrides: Optional overrides forwarded to
            `load_model_ensemble_and_task`. The mapping is copied, so the
            caller's dict is never modified; its `"data"` entry is replaced by
            the downloaded snapshot folder in the copy.
        **kwargs: Extra keyword arguments forwarded to `snapshot_download`.

    Returns:
        The result of `load_model_ensemble_and_task` called on every `*.pt`
        file found at the top level of the downloaded snapshot.

    Raises:
        ImportError: If `huggingface_hub` is not installed.
    """
    try:
        from huggingface_hub import snapshot_download
    except ImportError as err:
        raise ImportError(
            "You need to install huggingface_hub to use `load_from_hf_hub`. "
            "See https://pypi.org/project/huggingface-hub/ for installation."
        ) from err

    library_name = "fairseq"
    cache_dir = cache_dir or (Path.home() / ".cache" / library_name).as_posix()
    cache_dir = snapshot_download(
        model_id, cache_dir=cache_dir, library_name=library_name, **kwargs
    )

    # Work on a copy: writing "data" into the caller's dict would leak this
    # call's snapshot path into any later use of the same overrides object.
    _arg_overrides = dict(arg_overrides) if arg_overrides else {}
    _arg_overrides["data"] = cache_dir

    return load_model_ensemble_and_task(
        [p.as_posix() for p in Path(cache_dir).glob("*.pt")],
        arg_overrides=_arg_overrides,
    )
def __init__(self, model_id: str):
    """Download `model_id` from the Hub and load the `PreTrainedPipeline` it ships.

    The snapshot folder is appended to `sys.path` so that the repo's own
    `pipeline` module can be imported. If the repo contains a
    `requirements.txt`, it is installed with pip first.

    Args:
        model_id: Hub repo identifier passed to `snapshot_download`.

    Raises:
        subprocess.CalledProcessError: If the pip installation fails.
    """
    filepath = snapshot_download(model_id)
    sys.path.append(filepath)

    if "requirements.txt" in os.listdir(filepath):
        pip_command = [sys.executable, "-m", "pip", "install"]
        # PIP_CACHE is optional: only pass --cache-dir when it is configured
        # instead of failing with a KeyError when the variable is unset.
        cache_dir = os.environ.get("PIP_CACHE")
        if cache_dir is not None:
            pip_command += ["--cache-dir", cache_dir]
        pip_command += ["-r", os.path.join(filepath, "requirements.txt")]
        subprocess.check_call(pip_command)

    # Imported late: the module only becomes importable after the sys.path
    # update (and possibly the requirements installation) above.
    from pipeline import PreTrainedPipeline

    self.model = PreTrainedPipeline(filepath)

    # 16000 by default if the pipeline does not specify a sampling rate
    self.sampling_rate = getattr(self.model, "sampling_rate", 16000)
def from_hub(repo_id: str, **kwargs: Any):
    """Instantiate & load a pretrained model from HF hub.

    >>> from doctr.models import from_hub
    >>> model = from_hub("mindee/fasterrcnn_mobilenet_v3_large_fpn")

    Args:
        repo_id: HuggingFace model hub repo
        kwargs: kwargs of `hf_hub_download` or `snapshot_download`

    Returns:
        Model loaded with the checkpoint

    Raises:
        ValueError: If the task declared in the repo's `config.json` is not
            supported (`obj_detection` is only supported when PyTorch is available).
    """
    # Get the config
    with open(hf_hub_download(repo_id, filename="config.json", **kwargs), "rb") as f:
        cfg = json.load(f)

    # `arch` and `task` select the constructor; they are not part of the model cfg
    arch = cfg.pop("arch")
    task = cfg.pop("task")

    if task == "classification":
        model = models.classification.__dict__[arch](
            pretrained=False, classes=cfg["classes"], num_classes=cfg["num_classes"]
        )
    elif task == "detection":
        model = models.detection.__dict__[arch](pretrained=False)
    elif task == "recognition":
        model = models.recognition.__dict__[arch](
            pretrained=False, input_shape=cfg["input_shape"], vocab=cfg["vocab"]
        )
    elif task == "obj_detection" and is_torch_available():
        model = models.obj_detection.__dict__[arch](
            pretrained=False,
            image_mean=cfg["mean"],
            image_std=cfg["std"],
            max_size=cfg["input_shape"][-1],
            num_classes=len(cfg["classes"]),
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError on `model` below.
        raise ValueError(
            f"Unsupported task {task!r} for repo {repo_id!r} "
            "(note: 'obj_detection' requires PyTorch)"
        )

    # update model cfg
    model.cfg = cfg

    # Load checkpoint
    if is_torch_available():
        state_dict = torch.load(
            hf_hub_download(repo_id, filename="pytorch_model.bin", **kwargs),
            map_location="cpu",
        )
        model.load_state_dict(state_dict)
    else:  # tf
        repo_path = snapshot_download(repo_id, **kwargs)
        model.load_weights(os.path.join(repo_path, "tf_model", "weights"))

    return model
def pull_from_hf_model_hub(specifier: str, version: str = None, **kwargs) -> str:
    """Download a snapshot of the Hub repo `specifier` and return its local path.

    Args:
        specifier: Hub repo identifier passed to `snapshot_download`.
        version: Revision to download; forwarded as `revision`.
        **kwargs: Only `cache_dir` is read (defaulting to `None`); any other
            keyword argument is ignored.

    Returns:
        The value returned by `snapshot_download`.
    """
    cache_location = kwargs.pop("cache_dir", None)
    return snapshot_download(
        specifier,
        revision=version,
        cache_dir=cache_location,
        library_name="adapter-transformers",
        library_version=__adapters_version__,
    )
def test_download_model(self):
    """Snapshots of the `main` branch and of a pinned commit contain the expected files."""
    # Test `main` branch
    with tempfile.TemporaryDirectory() as tmpdirname:
        storage_folder = snapshot_download(
            f"{USER}/{REPO_NAME}", revision="main", cache_dir=tmpdirname
        )

        # folder contains the two files contributed and the .gitattributes
        folder_contents = os.listdir(storage_folder)
        self.assertEqual(len(folder_contents), 3)
        self.assertIn("dummy_file.txt", folder_contents)
        self.assertIn("dummy_file_2.txt", folder_contents)
        self.assertIn(".gitattributes", folder_contents)

        with open(os.path.join(storage_folder, "dummy_file.txt"), "r") as f:
            contents = f.read()
        self.assertEqual(contents, "v2")

        # folder name contains the revision's commit sha.
        self.assertIn(self.second_commit_hash, storage_folder)

    # Test with specific revision
    with tempfile.TemporaryDirectory() as tmpdirname:
        storage_folder = snapshot_download(
            f"{USER}/{REPO_NAME}",
            revision=self.first_commit_hash,
            cache_dir=tmpdirname,
        )

        # at this revision the folder contains a single contributed file
        # and the .gitattributes
        folder_contents = os.listdir(storage_folder)
        self.assertEqual(len(folder_contents), 2)
        self.assertIn("dummy_file.txt", folder_contents)
        self.assertIn(".gitattributes", folder_contents)

        with open(os.path.join(storage_folder, "dummy_file.txt"), "r") as f:
            contents = f.read()
        self.assertEqual(contents, "v1")

        # folder name contains the revision's commit sha.
        self.assertIn(self.first_commit_hash, storage_folder)
def _hf_hub_download(url, model_identifier: str, filename: Optional[str], cache_dir: Union[str, Path]) -> str:
    """Fetch a single file or a whole repo snapshot from the Hub and return its cache path.

    Args:
        url: The original resource URL; stored as `resource` in the meta file.
        model_identifier: Repo id, optionally followed by `@<revision>`.
        filename: File inside the repo to download, or `None` to download a
            snapshot of the whole repo.
        cache_dir: Cache directory handed to the `hf_hub` download helpers.

    Returns:
        The cache path of the downloaded file or snapshot folder, as a string.
    """
    revision: Optional[str]
    pieces = model_identifier.split("@")
    if len(pieces) > 1:
        repo_id, revision = pieces[0], pieces[1]
    else:
        repo_id, revision = model_identifier, None

    if filename is None:
        snapshot_dir = hf_hub.snapshot_download(repo_id, revision=revision, cache_dir=cache_dir)
        cache_path = str(snapshot_dir)
        meta_path = cache_path + ".json"
        # Need to write the meta file for snapshot downloads if it doesn't exist.
        with FileLock(cache_path + ".lock", read_only_ok=True):
            if not os.path.exists(meta_path):
                _Meta(
                    resource=url,
                    cached_path=cache_path,
                    creation_time=time.time(),
                    extraction_dir=True,
                    size=_get_resource_size(cache_path),
                ).to_file()
        return cache_path

    hub_url = hf_hub.hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
    downloaded = hf_hub.cached_download(
        url=hub_url,
        library_name="allennlp",
        library_version=VERSION,
        cache_dir=cache_dir,
    )
    cache_path = str(downloaded)
    # HF writes its own meta '.json' file which uses the same format we used to use and
    # still support, but is missing some fields that we like to have.
    # So we overwrite it when we can.
    with FileLock(cache_path + ".lock", read_only_ok=True):
        meta = _Meta.from_path(cache_path + ".json")
        # The file HF writes will have 'resource' set to the 'http' URL corresponding to
        # the 'hf://' URL, but we want 'resource' to be the original 'hf://' URL.
        if meta.resource != url:
            meta.resource = url
            meta.to_file()
    return cache_path
def get_model(
    revision: Optional[str] = typer.Argument(None, callback=_url_callback),
    model_dir: Path = typer.Argument(
        None,
        envvar="MODEL_DIR",
        help="Optionally specify a directory to store model files in",
    ),
    local_only=False,
) -> Path:  # pragma: no cover
    """Downloads models, defaults to the latest available model"""
    # NOTE(review): `revision` is accepted (and run through `_url_callback`) but is
    # not forwarded; the download below always uses `revision=None`. Confirm intent.
    with console.status("Getting model", spinner="dots"):
        snapshot_path = snapshot_download(
            MODEL_REPO_ID,
            cache_dir=model_dir,
            revision=None,
            local_files_only=local_only,
        )
    return Path(snapshot_path)
def _from_pretrained(
    cls,
    model_id,
    revision,
    cache_dir,
    force_download,
    proxies,
    resume_download,
    local_files_only,
    use_auth_token,
    **model_kwargs,
):
    """Load a Keras model from a local directory or from a Hub snapshot.

    If `model_id` is an existing directory it is loaded directly; otherwise the
    repo is fetched with `snapshot_download` and loaded from the snapshot folder.
    The `config` entry of `model_kwargs` (if any) is removed before loading and
    attached to the returned model as `model.config`.

    TODO - Some args above aren't used since we are calling
    snapshot_download instead of hf_hub_download.

    Raises:
        ImportError: If TensorFlow is not available.
    """
    if not is_tf_available():
        raise ImportError(
            "Called a Tensorflow-specific function but could not import it."
        )
    import tensorflow as tf

    # TODO - Figure out what to do about these config values. Config is not going to be needed to load model
    cfg = model_kwargs.pop("config", None)

    # Root is either a local filepath matching model_id or a cached snapshot
    if os.path.isdir(model_id):
        storage_folder = model_id
    else:
        storage_folder = snapshot_download(
            repo_id=model_id,
            revision=revision,
            cache_dir=cache_dir,
            library_name="keras",
            library_version=get_tf_version(),
        )

    loaded = tf.keras.models.load_model(storage_folder, **model_kwargs)

    # For now, we add a new attribute, config, to store the config loaded from the hub/a local dir.
    loaded.config = cfg
    return loaded
def test_wav2vec2_with_lm(self):
    """Transcribe a downloaded Spanish sample with the LM-backed processor."""
    checkpoint = "patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm"

    # Take the first file of the downloaded sample repo as the audio input
    sample_dir = snapshot_download("patrickvonplaten/common_voice_es_sample")
    audio_path = glob.glob(sample_dir + "/*")[0]
    waveform = librosa.load(audio_path, sr=16_000)[0]

    model = TFWav2Vec2ForCTC.from_pretrained(checkpoint)
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(checkpoint)

    encoded = processor(waveform, return_tensors="tf")
    logits = model(encoded.input_values).logits
    decoded = processor.batch_decode(logits.numpy())
    transcription = decoded.text

    self.assertEqual(transcription[0], "el libro ha sido escrito por cervantes")
def test_decoder_local_files(self):
    """A processor loaded from a local snapshot exposes the same decoder files as the cache."""
    local_dir = snapshot_download("hf-internal-testing/processor_with_lm")
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(local_dir)

    decoder = processor.decoder
    language_model = decoder.model_container[decoder._model_key]

    # Two levels above the kenlm model file is the directory to compare against
    kenlm_path = Path(language_model._kenlm_model.path.decode("utf-8"))
    path_to_cached_dir = kenlm_path.parent.parent.absolute()

    local_decoder_files = sorted(os.listdir(local_dir))
    expected_decoder_files = sorted(os.listdir(path_to_cached_dir))

    # test that both decoder from hub and local files in cache are the same
    self.assertListEqual(local_decoder_files, expected_decoder_files)
def from_pretrained_fastai(
    repo_id: str,
    revision: Optional[str] = None,
):
    """
    Load a pretrained fastai model from the Hub or from a local directory.

    Args:
        repo_id (`str`):
            Where the `fastai.Learner` lives. Either:
                - an existing local directory (e.g. `./my_model_directory/`), which is
                  used as-is; or
                - a Hugging Face Hub repo id
                  (e.g. 'espejelomar/fatai-pet-breeds-classification'), which is
                  downloaded with `snapshot_download`.
            In both cases the folder is expected to contain `model.pkl`; it is also
            passed to `_check_fastai_fastcore_pyproject_versions`.
        revision (`str`, *optional*):
            Revision at which the repo's files are downloaded. See documentation of
            `snapshot_download`. Only used when `repo_id` is not a local directory.

    Returns:
        The object returned by `fastai.learner.load_learner` for `model.pkl`.
    """
    _check_fastai_fastcore_versions()

    # A local directory is used directly; anything else is fetched from the Hub.
    # `snapshot_download` returns the folder where the model was stored.
    if os.path.isdir(repo_id):
        storage_folder = repo_id
    else:
        storage_folder = snapshot_download(
            repo_id=repo_id,
            revision=revision,
            library_name="fastai",
            library_version=get_fastai_version(),
        )

    _check_fastai_fastcore_pyproject_versions(storage_folder)

    from fastai.learner import load_learner

    # NOTE(review): `model.pkl` is presumably unpickled by `load_learner`;
    # only load repos you trust.
    learner_path = os.path.join(storage_folder, "model.pkl")
    return load_learner(learner_path)
def test_with_local_lm_fast(self):
    """An ASR pipeline built from a local snapshot is of type `ctc_with_lm` and decodes audio."""
    local_dir = snapshot_download("hf-internal-testing/processor_with_lm")
    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model=local_dir,
    )
    self.assertEqual(speech_recognizer.type, "ctc_with_lm")

    dataset = load_dataset(
        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
    )
    dataset = dataset.sort("id")
    audio = dataset[40]["audio"]["array"]

    # Feed the same clip repeated twice
    n_repeats = 2
    audio_tiled = np.tile(audio, n_repeats)
    output = speech_recognizer([audio_tiled], batch_size=2)

    self.assertEqual(output, [{"text": ANY(str)}])
    first_text = output[0]["text"]
    self.assertEqual(first_text[:6], "<s> <s")
def __init__(self, model_id: str):
    """Download `model_id` from the Hub and load the `PreTrainedModel` it ships.

    The snapshot folder is appended to `sys.path` so that the repo's own `model`
    module can be imported; a `requirements.txt` in the repo is installed with
    pip beforehand.

    Args:
        model_id: Hub repo identifier passed to `snapshot_download`.
    """
    # IMPLEMENT_THIS
    # Preload here everything inference will need (model, processors, tokenizer, ...).
    # This function is only called once, so do all the heavy processing I/O here
    # IMPLEMENT_THIS : Please define a `self.sampling_rate` for this pipeline
    # to automatically read the input correctly
    repo_dir = snapshot_download(model_id)
    sys.path.append(repo_dir)

    has_requirements = "requirements.txt" in os.listdir(repo_dir)
    if has_requirements:
        requirements_file = os.path.join(repo_dir, "requirements.txt")
        install_command = [sys.executable, "-m", "pip", "install", "-r", requirements_file]
        subprocess.check_call(install_command)

    # Imported late: `model` only becomes importable after the sys.path update above.
    from model import PreTrainedModel

    self.model = PreTrainedModel(repo_dir)
    self.sampling_rate = 16000
def test_download_model_local_only_multiple(self):
    """`local_files_only` loads behave as expected when several revisions are cached."""
    repo_id = f"{USER}/{REPO_NAME}"

    # Test `main` branch
    with tempfile.TemporaryDirectory() as tmpdirname:
        # download both from branch and from commit
        snapshot_download(
            repo_id,
            cache_dir=tmpdirname,
        )
        snapshot_download(
            repo_id,
            revision=self.first_commit_hash,
            cache_dir=tmpdirname,
        )

        # now load from cache and make sure a warning is raised
        with self.assertWarns(Warning):
            snapshot_download(
                repo_id,
                cache_dir=tmpdirname,
                local_files_only=True,
            )

    # cache multiple commits and make sure correct commit is taken
    with tempfile.TemporaryDirectory() as tmpdirname:
        # first download folder to cache it
        snapshot_download(
            repo_id,
            cache_dir=tmpdirname,
        )
        # now load folder from another branch
        snapshot_download(
            repo_id,
            revision="other",
            cache_dir=tmpdirname,
        )

        # now make sure that loading "main" branch gives correct branch
        storage_folder = snapshot_download(
            repo_id,
            cache_dir=tmpdirname,
            local_files_only=True,
        )

        # folder contains three entries, among them dummy_file.txt and the .gitattributes
        folder_contents = os.listdir(storage_folder)
        self.assertEqual(len(folder_contents), 3)
        self.assertIn("dummy_file.txt", folder_contents)
        self.assertIn(".gitattributes", folder_contents)

        with open(os.path.join(storage_folder, "dummy_file.txt"), "r") as f:
            contents = f.read()
        self.assertEqual(contents, "v2")

        # folder name contains the 2nd commit sha and not the 3rd
        self.assertIn(self.second_commit_hash, storage_folder)
def test_download_private_model(self):
    """A private repo cannot be downloaded anonymously but can be with a token.

    The repo is switched to private for the duration of the test and is always
    switched back to public afterwards, even when an assertion fails, so that a
    failure here cannot leak the private state into other tests.
    """
    self._api.update_repo_visibility(
        token=self._token, repo_id=REPO_NAME, private=True
    )
    try:
        # Test download fails without token
        with tempfile.TemporaryDirectory() as tmpdirname:
            with self.assertRaisesRegex(
                requests.exceptions.HTTPError, "404 Client Error"
            ):
                _ = snapshot_download(
                    f"{USER}/{REPO_NAME}", revision="main", cache_dir=tmpdirname
                )

        # Test we can download with token from cache
        with tempfile.TemporaryDirectory() as tmpdirname:
            HfFolder.save_token(self._token)
            storage_folder = snapshot_download(
                f"{USER}/{REPO_NAME}",
                revision="main",
                cache_dir=tmpdirname,
                use_auth_token=True,
            )

            # folder contains the two files contributed and the .gitattributes
            folder_contents = os.listdir(storage_folder)
            self.assertEqual(len(folder_contents), 3)
            self.assertIn("dummy_file.txt", folder_contents)
            self.assertIn("dummy_file_2.txt", folder_contents)
            self.assertIn(".gitattributes", folder_contents)

            with open(os.path.join(storage_folder, "dummy_file.txt"), "r") as f:
                contents = f.read()
            self.assertEqual(contents, "v2")

            # folder name contains the revision's commit sha.
            self.assertIn(self.second_commit_hash, storage_folder)

        # Test we can download with explicit token
        with tempfile.TemporaryDirectory() as tmpdirname:
            storage_folder = snapshot_download(
                f"{USER}/{REPO_NAME}",
                revision="main",
                cache_dir=tmpdirname,
                use_auth_token=self._token,
            )

            # folder contains the two files contributed and the .gitattributes
            folder_contents = os.listdir(storage_folder)
            self.assertEqual(len(folder_contents), 3)
            self.assertIn("dummy_file.txt", folder_contents)
            self.assertIn("dummy_file_2.txt", folder_contents)
            self.assertIn(".gitattributes", folder_contents)

            with open(os.path.join(storage_folder, "dummy_file.txt"), "r") as f:
                contents = f.read()
            self.assertEqual(contents, "v2")

            # folder name contains the revision's commit sha.
            self.assertIn(self.second_commit_hash, storage_folder)
    finally:
        # Always restore public visibility, whatever happened above.
        self._api.update_repo_visibility(
            token=self._token, repo_id=REPO_NAME, private=False
        )