def download_checkpoint(cls, pretrained_model_name: str,
                        cache_dir=cache_dir) -> str:
    r"""Download the specified pre-trained checkpoint, and return the
    directory in which the checkpoint is cached.

    Args:
        pretrained_model_name (str): Name of the model checkpoint. Must be
            a key of ``cls._MODEL2URL``.
        cache_dir (str, optional): Path to the cache directory. If `None`,
            uses the directory returned by
            ``default_download_dir(cls._MODEL_NAME)``.

    Returns:
        Path to the cache directory (``<cache dir>/<model name>``) as a
        string.

    Raises:
        ValueError: If ``pretrained_model_name`` is not a key of
            ``cls._MODEL2URL``.
    """
    # NOTE(review): the default of `cache_dir` is bound to an outer-scope
    # name when this function is defined -- confirm that this is intended.
    if pretrained_model_name not in cls._MODEL2URL:
        raise ValueError(
            f"Pre-trained model not found: {pretrained_model_name}")
    download_path = cls._MODEL2URL[pretrained_model_name]

    if cache_dir is None:
        cache_path = default_download_dir(cls._MODEL_NAME)
    else:
        cache_path = Path(cache_dir)
    cache_path = cache_path / pretrained_model_name

    # An existing directory is treated as a complete cache entry.
    if cache_path.exists():
        print(f"Using cached pre-trained {cls._MODEL_NAME} checkpoint "
              f"from {cache_path}.")
        return str(cache_path)

    if isinstance(download_path, str):
        # A single URL is downloaded with `extract=True`.
        filename = get_filename(download_path)
        maybe_download(download_path, cache_path, extract=True)
        # Remove the compressed file once it has been extracted.
        (cache_path / filename).unlink()
        # If the archive was extracted into a new sub-directory, move its
        # contents one level up so they sit directly under `cache_path`.
        folder = None
        for entry in cache_path.iterdir():
            if entry.is_dir():
                folder = entry
        if folder is not None:
            for entry in folder.iterdir():
                entry.rename(entry.parents[1] / entry.name)
            folder.rmdir()
    else:
        # Otherwise `download_path` is an iterable of URLs, each downloaded
        # into the cache directory.
        for path in download_path:
            maybe_download(path, cache_path)

    print(f"Pre-trained {cls._MODEL_NAME} checkpoint "
          f"{pretrained_model_name} cached to {cache_path}")
    return str(cache_path)
def download_checkpoint(cls, pretrained_model_name: str,
                        cache_dir: Optional[str] = None) -> str:
    r"""Download the specified pre-trained checkpoint, and return the
    directory in which the checkpoint is cached.

    Args:
        pretrained_model_name (str): Name of the model checkpoint. Must be
            a key of ``cls._MODEL2URL``.
        cache_dir (str, optional): Path to the cache directory. If `None`,
            uses the default directory given by
            :meth:`~default_download_dir`.

    Returns:
        Path to the cache directory (``<cache dir>/<model name>``) as a
        string.

    Raises:
        ValueError: If ``pretrained_model_name`` is not a key of
            ``cls._MODEL2URL``.
    """
    if pretrained_model_name not in cls._MODEL2URL:
        raise ValueError(
            f"Pre-trained model not found: {pretrained_model_name}")
    download_path = cls._MODEL2URL[pretrained_model_name]

    if cache_dir is None:
        cache_path = default_download_dir(cls._MODEL_NAME)
    else:
        cache_path = Path(cache_dir)
    cache_path = cache_path / pretrained_model_name

    # An existing directory is treated as a complete cache entry.
    if cache_path.exists():
        print(f"Using cached pre-trained {cls._MODEL_NAME} checkpoint "
              f"from {cache_path}.")
        return str(cache_path)

    if isinstance(download_path, str):
        # A single URL is downloaded with `extract=True`; the archive name
        # is the last path component of the URL.
        filename = download_path.split('/')[-1]
        maybe_download(download_path, cache_path, extract=True)
        folder = None
        for entry in cache_path.iterdir():
            if entry.is_dir():
                folder = entry
        # Remove the compressed file once it has been extracted.
        (cache_path / filename).unlink()
        # Only flatten when the archive was extracted into a sub-directory.
        # Asserting here would abort after `cache_path` already exists, so
        # the next call would report a cache that still holds the archive.
        if folder is not None:
            for entry in folder.iterdir():
                entry.rename(entry.parents[1] / entry.name)
            folder.rmdir()
    else:
        # Otherwise `download_path` is an iterable of URLs, each downloaded
        # into the cache directory.
        for path in download_path:
            maybe_download(path, cache_path)

    print(f"Pre-trained {cls._MODEL_NAME} checkpoint "
          f"{pretrained_model_name} cached to {cache_path}")
    return str(cache_path)
def setUp(self):
    r"""Download the sample SentencePiece model into a fresh temporary
    directory, load a tokenizer from it, and save the tokenizer back into
    the same directory.
    """
    self.tmp_dir = tempfile.TemporaryDirectory()
    workdir = self.tmp_dir.name
    model_url = ('https://github.com/google/sentencepiece/blob/master/'
                 'python/test/test_model.model?raw=true')
    self.SAMPLE_VOCAB = maybe_download(model_url, workdir)
    self.tokenizer = SentencePieceTokenizer.load(self.SAMPLE_VOCAB)
    self.tokenizer.save(workdir)
def setUp(self):
    r"""Download the SentencePiece test fixture into a fresh temporary
    directory, load an XLNet tokenizer from it with accents kept, and save
    the tokenizer back into the same directory.
    """
    self.tmp_dir = tempfile.TemporaryDirectory()
    workdir = self.tmp_dir.name
    model_url = ('https://github.com/huggingface/transformers/raw/main/tests/'
                 'fixtures/test_sentencepiece.model')
    self.SAMPLE_VOCAB = maybe_download(model_url, workdir)
    load_configs = {'keep_accents': True}
    self.tokenizer = XLNetTokenizer.load(self.SAMPLE_VOCAB,
                                         configs=load_configs)
    self.tokenizer.save(workdir)
def setUp(self):
    r"""Download the SentencePiece test fixture into a fresh temporary
    directory, load an XLNet tokenizer from it with accents kept, and save
    the tokenizer back into the same directory.
    """
    self.tmp_dir = tempfile.TemporaryDirectory()
    workdir = self.tmp_dir.name
    model_url = (
        'https://github.com/gpengzhi/pytorch-transformers/blob/master/'
        'pytorch_transformers/tests/fixtures/test_sentencepiece.model'
        '?raw=true')
    self.SAMPLE_VOCAB = maybe_download(model_url, workdir)
    load_configs = {'keep_accents': True}
    self.tokenizer = XLNetTokenizer.load(self.SAMPLE_VOCAB,
                                         configs=load_configs)
    self.tokenizer.save(workdir)
def test_train(self):
    r"""Construct a tokenizer from a downloaded text file and run the
    text -> token -> text and text -> id -> text mappings on every line of
    that file. The test passes if none of these calls raises.
    """
    # The context manager removes the temporary directory deterministically,
    # even when the test fails part-way, instead of relying on implicit
    # finalization of the `TemporaryDirectory` object.
    with tempfile.TemporaryDirectory() as tmp_dir:
        text_file = maybe_download(
            'https://github.com/google/sentencepiece/blob/master/'
            'data/botchan.txt?raw=true', tmp_dir)
        hparams = {
            "vocab_file": None,
            "text_file": text_file,
            "vocab_size": 1000,
        }
        tokenizer = SentencePieceTokenizer(hparams=hparams)

        with open(text_file, 'r', encoding='utf-8') as file:
            for line in file:
                tokenizer.map_token_to_text(
                    tokenizer.map_text_to_token(line))
                tokenizer.map_id_to_text(tokenizer.map_text_to_id(line))
def setUp(self):
    r"""Create the test data: download two sample images into a temporary
    directory, write one record per image to ``test.pkl`` with
    ``RecordData.writer``, keep the written values in
    ``self._dataset_valid``, and build ``self._hparams`` pointing at the
    record file.
    """
    self._test_dir = tempfile.mkdtemp()

    cat_in_snow = maybe_download(
        'https://storage.googleapis.com/download.tensorflow.org/'
        'example_images/320px-Felis_catus-cat_on_snow.jpg',
        self._test_dir, 'cat_0.jpg')
    williamsburg_bridge = maybe_download(
        'https://storage.googleapis.com/download.tensorflow.org/'
        'example_images/194px-New_East_River_Bridge_from_Brooklyn_'
        'det.4a09796u.jpg',
        self._test_dir, 'bridge_0.jpg')

    # `np.str` was only an alias of the builtin `str` and has been removed
    # from NumPy (AttributeError since 1.24), so the builtin is used here.
    _feature_types = {
        'height': ('tf.int64', 'FixedLenFeature', 1),
        'width': ('tf.int64', 'FixedLenFeature', 1),
        'label': ('tf.int64', 'stacked_tensor', 1),
        'shape': (np.int64, 'VarLenFeature'),
        'image_raw': (bytes, 'stacked_tensor'),
        'variable1': (str, 'FixedLenFeature'),
        'variable2': ('tf.int64', 'FixedLenFeature'),
    }
    self._feature_convert_types = {
        'variable1': 'tf.float32',
        'variable2': 'tf.string',
    }
    _image_options = {}
    self._unconvert_features = ['height', 'width', 'label']

    # Values written to the record file, collected per feature name.
    self._dataset_valid = {
        'height': [],
        'width': [],
        'shape': [],
        'label': [],
        'image_raw': [],
        'variable1': [],
        'variable2': [],
    }
    _toy_image_labels_valid = {
        cat_in_snow: 0,
        williamsburg_bridge: 1,
    }
    _toy_image_shapes = {
        cat_in_snow: (213, 320, 3),
        williamsburg_bridge: (239, 194),
    }
    _record_filepath = os.path.join(self._test_dir, 'test.pkl')

    # Prepare validation data: one record per image.
    with RecordData.writer(_record_filepath, _feature_types) as writer:
        for image_path, label in _toy_image_labels_valid.items():
            with open(image_path, 'rb') as fid:
                image_data = fid.read()
            image_shape = _toy_image_shapes[image_path]

            single_data = {
                'height': image_shape[0],
                'width': image_shape[1],
                'shape': image_shape,
                'label': label,
                'image_raw': image_data,
                'variable1': "1234567890",
                'variable2': 9876543210,
            }
            for key, value in single_data.items():
                self._dataset_valid[key].append(value)
            writer.write(single_data)

    self._hparams = {
        "num_epochs": 1,
        "batch_size": 1,
        "shuffle": False,
        "dataset": {
            "files": _record_filepath,
            "feature_original_types": _feature_types,
            "feature_convert_types": self._feature_convert_types,
            "image_options": [_image_options],
        }
    }