def catalog(self, argv):
    """Print dataset/corpus available for download.

    Fetches the remote corpus catalog and prints one line per corpus with
    its latest available version. When ``corpus.get_corpus_db_detail``
    returns a record for that corpus, its version is appended as
    ``(Local: <version>)``.

    :param argv: command-line arguments; not used by this subcommand
    :return: None. Output is written to standard output.
    """
    catalog_url = corpus.corpus_db_url()
    corpus_db = corpus.get_corpus_db(catalog_url)
    # The fetch result is falsy when the catalog could not be retrieved
    # (the same check is applied before downloading a corpus); bail out
    # with a message instead of failing on ``.json()``.
    if not corpus_db:
        print(f"Cannot download corpus catalog from: {catalog_url}")
        return

    corpus_db = corpus_db.json()

    print("Dataset/corpus available for download:")
    for name in sorted(corpus_db):
        line = f"- {name} {corpus_db[name]['latest_version']}"
        corpus_info = corpus.get_corpus_db_detail(name)
        if corpus_info:
            line += f" (Local: {corpus_info['version']})"
        print(line)

    print(
        "\nUse subcommand 'get' to download a dataset.\n\n"
        "Example: thainlp data get crfcut\n"
    )
def download(
    name: str, force: bool = False, url: str = None, version: str = None
) -> bool:
    """
    Download corpus.

    The available corpus names can be seen in this file:
    https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json

    The corpus is downloaded when ``force`` is true or when the local
    registry has no record of this corpus at the requested version.
    After a download, the local registry keeps a single record per corpus
    name: an existing record is updated in place, otherwise a new one is
    inserted.

    :param str name: corpus name, exactly as it appears in the catalog
    :param bool force: force download even if the requested version is
                       already registered locally
    :param str url: URL of the corpus catalog; when empty, the default
                    from ``corpus_db_url()`` is used
    :param str version: Version of the corpus; when ``None``, the
                        catalog's ``latest_version`` is used

    :return: **True** if the corpus is found and successfully downloaded
             (or is already up to date).
             Otherwise, it returns **False**.
    :rtype: bool

    :Example:
    ::

        from pythainlp.corpus import download

        download('wiki_lm_lstm', force=True)
        # output:
        # Corpus: wiki_lm_lstm
        # - Downloading: wiki_lm_lstm 0.1
        # thwiki_lm.pth:  26%|██▌       | 114k/434k [00:00<00:00, 690kB/s]

    By default, downloaded corpus and model will be saved in
    ``$HOME/pythainlp-data/``
    (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
    """
    if not url:
        url = corpus_db_url()

    corpus_db = get_corpus_db(url)
    if not corpus_db:
        print(f"Cannot download corpus catalog from: {url}")
        return False

    corpus_db = corpus_db.json()

    # Check if the corpus is available in the catalog.
    if name not in corpus_db:
        print("Corpus not found:", name)
        return False

    # Look up with the same key that was just tested for membership.
    corpus_entry = corpus_db[name]
    print("Corpus:", name)
    if version is None:
        version = corpus_entry["latest_version"]

    # NOTE(review): a version missing from the catalog is not handled here
    # and surfaces as a lookup error on the catalog data.
    corpus_versions = corpus_entry["versions"][version]
    file_name = corpus_versions["filename"]

    local_db = TinyDB(corpus_db_path())
    try:
        query = Query()
        # TinyDB queries are combined with ``&``; Python's ``and`` would
        # silently discard the left-hand condition.
        found = local_db.search(
            (query.name == name) & (query.version == version)
        )

        if found and not force:
            # The requested version is already registered locally.
            print("- Already up to date.")
            return True

        print(f"- Downloading: {name} {version}")
        _download(corpus_versions["download_url"], file_name)
        _check_hash(file_name, corpus_versions["md5"])

        # Register only after the download and hash check completed.
        record = {"name": name, "version": version, "filename": file_name}
        if local_db.search(query.name == name):
            # Replace the existing record (possibly an older version)
            # rather than adding a duplicate entry for the same corpus.
            local_db.update(record, query.name == name)
        else:
            local_db.insert(record)
    finally:
        # Release the local registry even if the download or check fails.
        local_db.close()

    return True