def get_vocab_path(self):
    """
    Gets the local path of the module's vocabulary file, downloading it
    into the cache directory first if it is not already present.

    Returns:
        str: Path to the cached 'bert-base-cased-vocab.txt' file.
    """
    save_path = os.path.join(DATA_HOME, 'bert-base-cased', 'bert-base-cased-vocab.txt')
    # os.path.isfile already returns False for non-existent paths, so the
    # extra os.path.exists check in the original was redundant.
    if not os.path.isfile(save_path):
        url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt"
        download(url, os.path.join(DATA_HOME, 'bert-base-cased'))
    return save_path
def get_vocab_path(self):
    """
    Gets the local path of the module's vocabulary file, downloading it
    into the cache directory first if it is not already present.

    Returns:
        str: Path to the cached ernie_tiny 'vocab.txt' file.
    """
    save_path = os.path.join(DATA_HOME, 'ernie_tiny', 'vocab.txt')
    # os.path.isfile already returns False for non-existent paths, so the
    # extra os.path.exists check in the original was redundant.
    if not os.path.isfile(save_path):
        url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/vocab.txt"
        download(url, os.path.join(DATA_HOME, 'ernie_tiny'))
    return save_path
def _download_and_uncompress_dataset(self, destination: str, url: str):
    """
    Downloads dataset and uncompresses it.

    Args:
        destination (:obj:`str`): The dataset cached directory.
        url (:obj: str): The link to be downloaded a dataset.
    """
    # Guard clause: nothing to do when the dataset is already cached.
    if os.path.exists(destination):
        logger.info("Dataset {} already cached.".format(destination))
        return

    dataset_package = download(url=url, path=DATA_HOME)
    # Only unpack if the downloaded file is a recognized archive format.
    if is_xarfile(dataset_package):
        unarchive(dataset_package, DATA_HOME)
def get_tokenizer(self, tokenize_chinese_chars=True):
    """
    Gets the tokenizer that is customized for this module.

    Ensures the sentencepiece model and the word-segmentation dictionary
    are cached locally (downloading either one if missing) before
    constructing the tokenizer.

    Args:
        tokenize_chinese_chars (:obj: bool , defaults to :obj: True):
            Whether to tokenize chinese characters or not.
            NOTE(review): this flag is accepted but never forwarded to
            ErnieTinyTokenizer below — confirm whether that is intended.

    Returns:
        tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module.
    """
    # os.path.isfile already returns False for non-existent paths, so the
    # extra os.path.exists checks in the original were redundant.
    spm_path = os.path.join(DATA_HOME, 'ernie_tiny', 'spm_cased_simp_sampled.model')
    if not os.path.isfile(spm_path):
        url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/spm_cased_simp_sampled.model"
        download(url, os.path.join(DATA_HOME, 'ernie_tiny'))

    word_dict_path = os.path.join(DATA_HOME, 'ernie_tiny', 'dict.wordseg.pickle')
    if not os.path.isfile(word_dict_path):
        url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/dict.wordseg.pickle"
        download(url, os.path.join(DATA_HOME, 'ernie_tiny'))

    return ErnieTinyTokenizer(self.get_vocab_path(), spm_path, word_dict_path)
def download_file_and_uncompress(self, url: str, save_path: str, print_progress: bool):
    """
    Downloads an archive from *url* into a temporary directory, then
    extracts it under *save_path*.

    Args:
        url (str): Link of the archive to download.
        save_path (str): Directory the archive is extracted into.
        print_progress (bool): When True, render a progress bar for both
            the download and the decompression steps.
    """
    with utils.generate_tempdir() as tmp_dir:
        # print_progress never changes mid-call, so the original's two
        # separate checks collapse into a single if/else.
        if print_progress:
            with log.ProgressBar('Download {}'.format(url)) as bar:
                for path, done, total in utils.download_with_progress(url=url, path=tmp_dir):
                    bar.update(float(done) / total)
            with log.ProgressBar('Decompress {}'.format(path)) as bar:
                for path, done, total in xarfile.unarchive_with_progress(name=path, path=save_path):
                    bar.update(float(done) / total)
        else:
            path = utils.download(url=url, path=tmp_dir)
            path = xarfile.unarchive(name=path, path=save_path)