def _download_install(name: str) -> None:
    """
    Make sure the archive for ``name`` is downloaded and unpacked.

    If ``get_corpus_path(name)`` reports no local copy, version "1.0" of the
    corpus is force-downloaded and the gzip-compressed archive is extracted
    into the current working directory. Independently of that, when the
    location returned by ``get_full_data_path(name)`` does not exist yet, it
    is created and the archive is extracted into it.

    :param str name: corpus name of the archive to install
    """
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")
        # The context manager closes the archive even when extraction fails.
        # NOTE(review): extractall() trusts member paths inside a downloaded
        # archive; consider an extraction filter if the source is untrusted.
        with tarfile.open(get_corpus_path(name), "r:gz") as tar:
            tar.extractall()

    data_dir = get_full_data_path(name)
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=data_dir)
def _download(url: str, dst: str) -> int:
    """
    Download ``url`` into the file named by ``get_full_data_path(dst)``.

    A partial file already present at that location is continued with an
    HTTP Range request instead of being downloaded again from the start.

    @param: url to download file
    @param: dst place to put the file
    @return: size in bytes announced by Content-Length, or -1 when the
        server does not announce one
    """
    target = get_full_data_path(dst)

    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    # Look at the file that is actually written below, not at ``dst``
    # relative to the current working directory.
    first_byte = os.path.getsize(target) if os.path.exists(target) else 0
    if 0 <= file_size <= first_byte:
        # Already complete; nothing to fetch.
        return file_size
    if file_size < 0:
        # Unknown total size: resuming cannot be validated, start over.
        first_byte = 0

    # Open-ended range: byte positions are inclusive, so "first-size" would
    # point one byte past the end of the resource.
    headers = {"Range": "bytes=%s-" % first_byte} if first_byte else {}
    req = requests.get(url, headers=headers, stream=True)
    if first_byte and req.status_code != 206:
        # Server ignored the Range header and sends the whole body.
        first_byte = 0

    pbar = tqdm(
        total=file_size if file_size >= 0 else None,
        initial=first_byte,
        unit="B",
        unit_scale=True,
        desc=url.split("/")[-1],
    )
    try:
        # Append only when continuing; "wb" would throw away the bytes that
        # the Range request deliberately skipped.
        with open(target, "ab" if first_byte else "wb") as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    finally:
        pbar.close()
    return file_size
def test_path(self):
    """Path helpers: full data path keeps the file name, base paths are str."""
    filename = "ttc_freq.txt"
    full_path = get_full_data_path(filename)
    self.assertTrue(full_path.endswith(filename))

    # Both base-directory getters are expected to hand back plain strings.
    for getter in (get_pythainlp_data_path, get_pythainlp_path):
        self.assertIsInstance(getter(), str)
def _download(url: str, dst: str) -> int:
    """
    Download helper.

    Streams ``url`` into the file named by ``get_full_data_path(dst)``,
    showing a tqdm progress bar when tqdm can be imported and printing
    "Done." otherwise.

    @param: url to download file
    @param: dst place to put the file
    @return: size in bytes announced by Content-Length, or -1 when the
        server does not announce one
    """
    _CHUNK_SIZE = 64 * 1024  # 64 KiB

    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    r = requests.get(url, stream=True)
    with open(get_full_data_path(dst), "wb") as f:
        pbar = None
        try:
            from tqdm import tqdm
        except ImportError:
            # tqdm is optional; fall back to the plain "Done." message.
            pass
        else:
            # The header is absent for e.g. chunked responses; tqdm accepts
            # total=None for an unknown length.
            content_length = r.headers.get("Content-Length")
            pbar = tqdm(
                total=int(content_length)
                if content_length is not None
                else None
            )

        # Compare against None explicitly: a tqdm bar's truth value depends
        # on its total, so "if pbar:" is not a presence check.
        try:
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
                    if pbar is not None:
                        pbar.update(len(chunk))
        finally:
            if pbar is not None:
                pbar.close()

        if pbar is None:
            print("Done.")
    return file_size
def download_(url: str, dst: str):
    """
    Download ``url`` into the file named by ``get_full_data_path(dst)``.

    A partial file already present at that location is continued with an
    HTTP Range request instead of being downloaded again from the start.

    @param: url to download file
    @param: dst place to put the file
    @return: size in bytes announced by Content-Length, or -1 when the
        server does not announce one (returned on every path, not only
        when the file was already complete)
    """
    target = get_full_data_path(dst)

    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    # Look at the file that is actually written below, not at ``dst``
    # relative to the current working directory.
    first_byte = os.path.getsize(target) if os.path.exists(target) else 0
    if 0 <= file_size <= first_byte:
        # Already complete; nothing to fetch.
        return file_size
    if file_size < 0:
        # Unknown total size: resuming cannot be validated, start over.
        first_byte = 0

    # Open-ended range: byte positions are inclusive, so "first-size" would
    # point one byte past the end of the resource.
    headers = {"Range": "bytes=%s-" % first_byte} if first_byte else {}
    req = requests.get(url, headers=headers, stream=True)
    if first_byte and req.status_code != 206:
        # Server ignored the Range header and sends the whole body.
        first_byte = 0

    pbar = tqdm(
        total=file_size if file_size >= 0 else None,
        initial=first_byte,
        unit="B",
        unit_scale=True,
        desc=url.split("/")[-1],
    )
    try:
        # Append only when continuing; "wb" would throw away the bytes that
        # the Range request deliberately skipped.
        with open(target, "ab" if first_byte else "wb") as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    finally:
        pbar.close()
    return file_size
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path.

    :param str name: corpus name
    :return: path to the corpus or **None** if the corpus doesn't
             exist in the device
    :rtype: str

    :Example:

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    corpus_db_detail = get_corpus_db_detail(name)

    # An entry that carries "file_name" or "file" but no "filename" triggers
    # a catalog update (presumably these are keys of an older catalog
    # layout - confirm against _update_all).
    if (
        corpus_db_detail
        and corpus_db_detail.get("filename") is None
        and (
            corpus_db_detail.get("file_name") is not None
            or corpus_db_detail.get("file") is not None
        )
    ):
        _update_all()
        # Re-read the entry so the check below sees what the update wrote
        # instead of the stale copy fetched before it.
        corpus_db_detail = get_corpus_db_detail(name)

    # check if the corpus is in local catalog, download if not
    if not corpus_db_detail or not corpus_db_detail.get("filename"):
        download(name)
        corpus_db_detail = get_corpus_db_detail(name)

    if corpus_db_detail and corpus_db_detail.get("filename"):
        # corpus is in the local catalog, get full path to the file
        path = get_full_data_path(corpus_db_detail.get("filename"))
        # check if the corpus file actually exists, download if not
        if not os.path.exists(path):
            download(name)
        if os.path.exists(path):
            return path

    return None
def _check_hash(dst: str, md5: str) -> None:
    """
    Verify the MD5 digest of the file named by ``get_full_data_path(dst)``.

    @param: dst place to put the file
    @param: md5 expected MD5 hex digest of the file; an empty value or "-"
        skips the check
    @raise: Exception when the computed digest differs from ``md5``
    """
    if not md5 or md5 == "-":
        return

    # Hash in fixed-size blocks so a large file is never held in memory
    # as a whole.
    digest = hashlib.md5()
    with open(get_full_data_path(dst), "rb") as f:
        for block in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(block)

    if md5 != digest.hexdigest():
        raise Exception("Hash does not match expected.")
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path

    Looks ``name`` up in the local corpus catalog; when the catalog lists it
    but the file is missing on disk, ``download(name)`` is called before the
    path is returned.

    :param string name: corpus name
    :return: path to the corpus or **None** if the corpus is not listed in
             the local catalog
    :rtype: str

    :Example:

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    db = TinyDB(corpus_db_path())
    try:
        # One query is enough; the result list stays usable after close().
        records = db.search(Query().name == name)
    finally:
        # Release the catalog before download() runs and even if the
        # lookup raises.
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    if not os.path.exists(path):
        download(name)
    return path
def _download(url: str, dst: str) -> int:
    """
    Stream ``url`` into the file named by ``get_full_data_path(dst)``.

    @param: url to download file
    @param: dst place to put the file
    @return: size in bytes announced by Content-Length, or -1 when the
        server does not announce one
    """
    _CHUNK_SIZE = 1024 * 64

    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    r = requests.get(url, stream=True)
    # The header is absent for e.g. chunked responses; tqdm accepts
    # total=None for an unknown length.
    content_length = r.headers.get("Content-Length")
    with open(get_full_data_path(dst), "wb") as f:
        pbar = tqdm(
            total=int(content_length) if content_length is not None else None
        )
        try:
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
        finally:
            # Close the bar even when the transfer is interrupted.
            pbar.close()
    return file_size
def word_freqs():
    """
    Get word frequency from Thai Textbook Corpus (TTC)

    Each usable line of ``ttc_freq.txt`` holds a word followed by an integer
    count, separated by whitespace. Lines with fewer than two fields (for
    example blank lines) are skipped.

    :return: list of ``(word, count)`` tuples in file order
    """
    path = get_full_data_path("ttc_freq.txt")  # try local copy first
    if not os.path.exists(path):  # if fail, download from internet
        download_data("ttc")

    listword = []
    # The with-block closes the file; no explicit close() is needed.
    with open(path, "r", encoding="utf8") as f:
        for line in f.read().splitlines():
            fields = line.split()
            if len(fields) < 2:
                continue
            listword.append((fields[0], int(fields[1])))
    return listword
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path

    Looks ``name`` up in the local corpus catalog; when the catalog lists it
    but the file is missing on disk, ``download(name)`` is called before the
    path is returned.

    :param string name: corpus name
    :return: path to the corpus, or None when the catalog has no entry
             for ``name``
    """
    db = TinyDB(corpus_db_path())
    try:
        # One query is enough; the result list stays usable after close().
        records = db.search(Query().name == name)
    finally:
        # Close on every path, including "not found" and lookup errors.
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    if not os.path.exists(path):
        download(name)
    return path
# NOTE(review): the return annotation is a list literal rather than a type;
# switch it to Optional[str] once typing is imported at file level.
def get_corpus_path(name: str) -> [str, None]:
    """
    Get corpus path

    Looks ``name`` up in the local corpus catalog; when the catalog lists it
    but the file is missing on disk, ``download(name)`` is called before the
    path is returned.

    :param string name: corpus name
    :return: path to the corpus, or None when the catalog has no entry
             for ``name``
    """
    db = TinyDB(corpus_db_path())
    try:
        # One query is enough; the result list stays usable after close().
        records = db.search(Query().name == name)
    finally:
        # Close on every path, including "not found" and lookup errors.
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    if not os.path.exists(path):
        download(name)
    return path
from pythainlp.tools import get_full_data_path, get_pythainlp_path from tinydb import TinyDB # Remote and local corpus databases _CORPUS_DIRNAME = "corpus" _CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME) # remote corpus catalog URL _CORPUS_DB_URL = ("https://pythainlp.github.io/pythainlp-corpus/db.json") # local corpus catalog filename _CORPUS_DB_FILENAME = "db.json" # local corpus catalog full path _CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME) # create a local corpus database if it does not already exist if not os.path.exists(_CORPUS_DB_PATH): TinyDB(_CORPUS_DB_PATH).close() def corpus_path() -> str: """ Get path where corpus files are kept locally. """ return _CORPUS_PATH def corpus_db_url() -> str: """
def _get_translate_path(model: str, *path: str) -> str:
    """
    Join ``path`` components onto the location of ``model``.

    :param str model: name passed to ``get_full_data_path``
    :param str path: further path components, appended in order
    :return: the joined path
    """
    model_root = get_full_data_path(model)
    return os.path.join(model_root, *path)
def get_corpus_path(
    name: str, version: Union[str, None] = None
) -> Union[str, None]:
    """
    Get corpus path.

    :param str name: corpus name
    :param str version: version passed on to ``download`` whenever the
                        corpus has to be fetched
    :return: path to the corpus or **None** if the corpus doesn't
             exist in the device
    :rtype: str

    :Example:

    (Please see the filename from
    `this file <https://pythainlp.github.io/pythainlp-corpus/db.json>`_)

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    # To pin a corpus to a custom location, add it here; a name found in
    # this mapping is returned as-is and the catalog lookup is skipped.
    _CUSTOMIZE = {
        # "the corpus name":"path"
    }
    if name in _CUSTOMIZE:
        return _CUSTOMIZE[name]

    # check if the corpus is in local catalog, download if not
    corpus_db_detail = get_corpus_db_detail(name)

    if not corpus_db_detail or not corpus_db_detail.get("filename"):
        download(name, version=version)
        corpus_db_detail = get_corpus_db_detail(name)

    if corpus_db_detail and corpus_db_detail.get("filename"):
        # corpus is in the local catalog, get full path to the file
        path = get_full_data_path(corpus_db_detail.get("filename"))
        # check if the corpus file actually exists, download if not;
        # keep the requested version so both download paths agree
        if not os.path.exists(path):
            download(name, version=version)
        if os.path.exists(path):
            return path

    return None
import requests
from pythainlp.tools import get_full_data_path, get_pythainlp_path
from tinydb import Query, TinyDB
from tqdm import tqdm

# Remote and local corpus databases

_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# remote corpus catalog URL
_CORPUS_DB_URL = (
    "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/2.0/db.json"
)

# local corpus catalog filename
_CORPUS_DB_FILENAME = "db.json"

# local corpus catalog full path
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Create a local corpus database if it does not already exist, and close
# the handle right away so no open file is left behind at import time.
if not os.path.exists(_CORPUS_DB_PATH):
    TinyDB(_CORPUS_DB_PATH).close()


def corpus_path() -> str:
    """Return the corpus directory path (``_CORPUS_PATH``)."""
    return _CORPUS_PATH


def corpus_db_url() -> str:
    """Return the URL of the remote corpus catalog (``_CORPUS_DB_URL``)."""
    return _CORPUS_DB_URL


def corpus_db_path() -> str:
    """Return the full path of the local corpus catalog file."""
    return _CORPUS_DB_PATH