def __init__(self, root=os.path.join(get_home_dir(), 'models')):
    """Set up the BPE tokenizer, downloading the BPE rank file into ``root`` if needed.

    Parameters
    ----------
    root : str
        Directory used to cache the BPE rank file
        (default: ``<home>/models``).

    Raises
    ------
    ImportError
        If the third-party ``regex`` package is not installed.
    ValueError
        If the rank file fails its SHA-1 check after download.
    """
    # The token pattern below uses Unicode property classes (\p{L}, \p{N}),
    # which the stdlib `re` module does not support -- hence `regex`.
    try:
        import regex  # pylint: disable=import-outside-toplevel
        self._regex = regex
    except ImportError:
        raise ImportError('GPT2BPETokenizer requires regex. '
                          'To install regex, use pip install -U regex')
    super(GPT2BPETokenizer, self).__init__()
    root = os.path.expanduser(root)
    file_name, sha1_hash = self.bpe_ranks_file_hash
    file_path = os.path.join(root, file_name)
    # (Re-)download only when the rank file is missing or its hash mismatches.
    if not os.path.exists(file_path) or not check_sha1(
            file_path, sha1_hash):
        if os.path.exists(file_path):
            print(
                'Detected mismatch in the content of BPE rank file. Downloading again.'
            )
        else:
            print('BPE rank file is not found. Downloading.')
        if not os.path.exists(root):
            try:
                os.makedirs(root)
            except OSError as e:
                # A concurrent process may have created the directory;
                # only an EEXIST on an actual directory is benign.
                if e.errno == errno.EEXIST and os.path.isdir(root):
                    pass
                else:
                    raise e
        # Timestamp prefix keeps concurrent downloads from clobbering
        # each other's archive.
        prefix = str(time.time())
        zip_file_path = os.path.join(root, prefix + file_name)
        repo_url = _get_repo_url()
        if repo_url[-1] != '/':
            repo_url = repo_url + '/'
        archive_name, archive_hash = self.bpe_ranks_archive_hash
        _url_format = '{repo_url}gluon/dataset/vocab/{file_name}'
        download(_url_format.format(repo_url=repo_url,
                                    file_name=archive_name),
                 path=zip_file_path,
                 sha1_hash=archive_hash,
                 overwrite=True)
        with zipfile.ZipFile(zip_file_path) as zf:
            # NOTE(review): when a corrupt rank file already exists,
            # extraction is skipped here (file_path exists), so the mismatch
            # path appears to end at the ValueError below -- confirm intended.
            if not os.path.exists(file_path):
                zf.extractall(root)
        try:
            os.remove(zip_file_path)
        except OSError as e:
            # file has already been removed.
            if e.errno == 2:
                pass
            else:
                raise e
        if not check_sha1(file_path, sha1_hash):
            raise ValueError(
                'Downloaded file has different hash. Please try again.')
    self._read_bpe_ranks(file_path)
    self._cache = {}
    # Byte-level BPE pre-tokenization: contractions, letter runs, digit runs,
    # other symbol runs, and whitespace.
    self._token_pattern = self._regex.compile(
        r'\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+'
        r'| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+')
def _load_pretrained_vocab(name, root, cls=None):
    """Load the accompanying vocabulary object for pre-trained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str
        Location for keeping the model vocabulary.
    cls : nlp.Vocab or nlp.vocab.BERTVocab, default nlp.Vocab

    Returns
    -------
    Vocab or nlp.vocab.BERTVocab
        Loaded vocabulary object for the pre-trained model.

    Raises
    ------
    ValueError
        If the downloaded vocabulary file fails its SHA-1 check.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]
    # Fast path: an intact cached copy.
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return _load_vocab_file(file_path, cls)
        else:
            print(
                'Detected mismatch in the content of model vocab file. Downloading again.'
            )
    else:
        print('Vocab file is not found. Downloading.')
    # exist_ok=True replaces the racy exists()-then-makedirs/errno dance:
    # it tolerates concurrent creation of `root` while still raising if
    # `root` exists as a regular file.
    os.makedirs(root, exist_ok=True)
    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = _get_repo_url()
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path, overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)
    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path, cls)
    else:
        raise ValueError(
            'Downloaded file has different hash. Please try again.')
def _load_pretrained_vocab(name, root, cls=None):
    """Load the accompanying vocabulary object for pre-trained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str
        Location for keeping the model vocabulary.
    cls : nlp.Vocab or nlp.vocab.BERTVocab, default nlp.Vocab

    Returns
    -------
    Vocab or nlp.vocab.BERTVocab
        Loaded vocabulary object for the pre-trained model.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]
    # Random per-call scratch names so concurrent callers never share paths.
    temp_num = str(random.Random().randint(1, sys.maxsize))
    temp_root = os.path.join(root, temp_num)
    temp_file_path = os.path.join(temp_root, file_name + '.vocab')
    temp_zip_file_path = os.path.join(root, temp_num + file_name + '.zip')
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            # Cached copy is intact -- reuse it.
            return _load_vocab_file(file_path, cls)
        else:
            print('Detected mismatch in the content of model vocab file. Downloading again.')
    else:
        print('Vocab file is not found. Downloading.')
    utils.mkdir(root)
    repo_url = _get_repo_url()
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=temp_zip_file_path, overwrite=True)
    with zipfile.ZipFile(temp_zip_file_path) as zf:
        if not os.path.exists(file_path):
            # Extract into a scratch dir, then atomically move into place so
            # readers never observe a half-written vocab file.
            utils.mkdir(temp_root)
            zf.extractall(temp_root)
            os.replace(temp_file_path, file_path)
            shutil.rmtree(temp_root)
    # NOTE(review): when a *corrupt* cached file exists, the branch above skips
    # extraction (file_path still exists), so this check raises -- confirm
    # whether the mismatch path is meant to recover instead.
    # NOTE(review): temp_zip_file_path does not appear to be removed here.
    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path, cls)
    else:
        raise ValueError('Downloaded file has different hash. Please try again.')
def _load_pretrained_vocab(name, root=os.path.join('~', '.mxnet', 'models')):
    """Load the accompanying vocabulary object for pretrained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str, default '~/.mxnet/models'
        Location for keeping the model parameters.

    Returns
    -------
    Vocab
        Loaded vocabulary object for the pre-trained model.

    Raises
    ------
    ValueError
        If the downloaded vocabulary file fails its SHA-1 check.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]
    # Fast path: an intact cached copy.
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return _load_vocab_file(file_path)
        else:
            print(
                'Detected mismatch in the content of model vocab file. Downloading again.'
            )
    else:
        print('Vocab file is not found. Downloading.')
    # exist_ok=True avoids the TOCTOU race where another process creates
    # `root` between the exists() check and makedirs().
    os.makedirs(root, exist_ok=True)
    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = _get_repo_url()
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path, overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)
    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path)
    else:
        raise ValueError(
            'Downloaded file has different hash. Please try again.')
def _get_xlnet_tokenizer(dataset_name, root):
    """Return an ``XLNetTokenizer`` backed by the cached sentencepiece file,
    downloading the archive into ``root`` first.

    Parameters
    ----------
    dataset_name : str
        Must be '126gb' (case-insensitive) -- the only supported dataset.
    root : str
        Cache directory for the tokenizer file.

    Raises
    ------
    ValueError
        If the tokenizer file fails its SHA-1 check after download.
    """
    assert dataset_name.lower() == '126gb'
    root = os.path.expanduser(root)
    file_path = os.path.join(root, 'xlnet_126gb-871f0b3c.spiece')
    sha1_hash = '871f0b3c13b92fc5aea8fba054a214c420e302fd'
    if os.path.exists(file_path):
        if not check_sha1(file_path, sha1_hash):
            print(
                'Detected mismatch in the content of model tokenizer. Downloading again.'
            )
    else:
        print('Tokenizer file is not found. Downloading.')
    # NOTE(review): there is no early return above -- the archive appears to be
    # downloaded on every call, even when the cached file is valid; only the
    # extraction below is skipped in that case. Confirm this is intended.
    if not os.path.exists(root):
        try:
            os.makedirs(root)
        except OSError as e:
            # A concurrent process may have created the directory;
            # only an EEXIST on an actual directory is benign.
            if e.errno == errno.EEXIST and os.path.isdir(root):
                pass
            else:
                raise e
    repo_url = _get_repo_url()
    # Timestamp prefix keeps concurrent downloads from clobbering each other.
    prefix = str(time.time())
    zip_file_path = os.path.join(root, prefix + 'xlnet_126gb-871f0b3c.zip')
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url,
                                file_name='xlnet_126gb-871f0b3c'),
             path=zip_file_path, overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        # NOTE(review): when a corrupt tokenizer file exists, extraction is
        # skipped here, so the mismatch path ends at the ValueError below.
        if not os.path.exists(file_path):
            zf.extractall(root)
    try:
        os.remove(zip_file_path)
    except OSError as e:
        # file has already been removed.
        if e.errno == 2:
            pass
        else:
            raise e
    if not check_sha1(file_path, sha1_hash):
        raise ValueError(
            'Downloaded file has different hash. Please try again.')
    tokenizer = XLNetTokenizer(file_path)
    return tokenizer
def _load_pretrained_vocab(name, root=os.path.join('~', '.mxnet', 'models')):
    """Fetch (and cache) the vocabulary accompanying a pre-trained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str, default '~/.mxnet/models'
        Location for keeping the model parameters.

    Returns
    -------
    Vocab
        Loaded vocabulary object for the pre-trained model.
    """
    vocab_stem = '{name}-{short_hash}'.format(name=name,
                                              short_hash=short_hash(name))
    cache_dir = os.path.expanduser(root)
    vocab_path = os.path.join(cache_dir, vocab_stem + '.vocab')
    expected_sha1 = _vocab_sha1[name]

    # Fast path: a cached copy whose content hash checks out.
    if os.path.exists(vocab_path):
        if check_sha1(vocab_path, expected_sha1):
            return _load_vocab_file(vocab_path)
        print('Detected mismatch in the content of model vocab file. Downloading again.')
    else:
        print('Vocab file is not found. Downloading.')

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    archive_path = os.path.join(cache_dir, vocab_stem + '.zip')
    base_url = _get_repo_url()
    if base_url[-1] != '/':
        base_url += '/'
    download(_url_format.format(repo_url=base_url, file_name=vocab_stem),
             path=archive_path, overwrite=True)
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(cache_dir)
    os.remove(archive_path)

    # Guard clause: refuse to hand back a file with the wrong hash.
    if not check_sha1(vocab_path, expected_sha1):
        raise ValueError('Downloaded file has different hash. Please try again.')
    return _load_vocab_file(vocab_path)
def test_pretrained_gpt2(model_name, tmp_path):
    """Compare pre-trained GPT-2 logits against stored reference values."""
    expected_hashes = {
        'gpt2_117m': '29526682508d03a7c54c598e889f77f7b4608df0',
        'gpt2_345m': '6680fd2a3d7b737855536f480bc19d166f15a3ad',
    }
    model_hash = expected_hashes[model_name]

    # Fetch the reference logits for this model into the pytest tmp dir.
    logits_file = '{model_name}_gt_logits-{short_hash}.npy'.format(
        model_name=model_name, short_hash=model_hash[:8])
    url_format = '{repo_url}gluon/dataset/test/{file_name}'
    local_path = os.path.join(str(tmp_path), logits_file)
    download(url_format.format(repo_url=_get_repo_url(), file_name=logits_file),
             path=local_path, sha1_hash=model_hash)
    reference_logits = np.load(local_path)

    model, vocab = get_model(model_name, dataset_name='openai_webtext')
    tokenizer = GPT2BPETokenizer()
    detokenizer = GPT2BPEDetokenizer()
    model.hybridize()

    text = ' natural language processing tools such as gluonnlp and torchtext'
    token_ids = vocab[tokenizer(text)]
    batch = mx.nd.expand_dims(mx.nd.array(token_ids), axis=0)
    logits, _states = model(batch, None)
    npt.assert_allclose(logits.asnumpy(), reference_logits, 1E-5, 1E-5)
def _download_vocab_tokenizer(root, file_name, file_ext, file_path):
    """Download ``file_name``'s archive and install its payload at ``file_path``.

    Parameters
    ----------
    root : str
        Cache directory in which scratch files and dirs are created.
    file_name : str
        Base name of the remote archive (without extension).
    file_ext : str
        Extension of the payload file expected inside the archive.
    file_path : str
        Final destination for the extracted payload.
    """
    utils.mkdir(root)
    # Random per-call scratch names so concurrent callers never collide.
    temp_num = str(random.Random().randint(1, sys.maxsize))
    temp_root = os.path.join(root, temp_num)
    temp_file_path = os.path.join(temp_root, file_name + file_ext)
    temp_zip_file_path = os.path.join(temp_root,
                                      temp_num + '_' + file_name + '.zip')
    repo_url = _get_repo_url()
    # NOTE(review): the zip is written under temp_root before utils.mkdir(temp_root)
    # runs below -- presumably download() creates parent dirs; confirm.
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=temp_zip_file_path, overwrite=True)
    with zipfile.ZipFile(temp_zip_file_path) as zf:
        # Fail fast if the archive does not contain the expected payload.
        assert file_name + file_ext in zf.namelist(
        ), '{} not part of {}. Only have: {}'.format(file_name + file_ext,
                                                     file_name + '.zip',
                                                     zf.namelist())
        utils.mkdir(temp_root)
        zf.extractall(temp_root)
        # Atomic move so readers never observe a partially written file.
        os.replace(temp_file_path, file_path)
        # NOTE(review): temp_root still holds the zip that `zf` has open when
        # rmtree runs; fine on POSIX, would fail on Windows -- confirm targets.
        shutil.rmtree(temp_root)