from pathlib import Path

from pybo import BoTokenizer


def process_folders(config, format_func, user_vocabs=None, remove_page_info=True):
    """Tokenize every .txt file in the input folder and write the output of
    format_func to the output folder, keeping the original file names."""
    if user_vocabs is None:  # avoid a mutable default argument
        user_vocabs = []
    in_folder = config['Exec']['input_folder']
    out_folder = config['Exec']['output_folder']
    tok_profile = config['Exec']['tok_profile']

    in_folder = Path(in_folder)
    out_folder = Path(out_folder)
    assert in_folder.is_dir()        # check the input folder exists
    out_folder.mkdir(exist_ok=True)  # ensure the output folder exists

    # keyword aligned with the other BoTokenizer call sites in this repo
    tok = BoTokenizer(tok_profile, toadd_filenames=user_vocabs)
    # If config is a configparser.ConfigParser (the ['Exec'] lookups suggest so),
    # its values are strings and bool('False') would be True; getboolean()
    # parses the flag correctly.
    if config['Exec'].getboolean('rebuild_trie'):
        tok.tok.trie.rebuild_trie()

    for f in in_folder.glob('*.txt'):
        content = f.read_text(encoding='utf-8-sig')
        if remove_page_info:
            content = clean_lines(content)  # helper assumed to be defined elsewhere
        tokens = tok.tokenize(content)
        out = format_func(tokens)
        out = ' '.join(out).replace('\n ', '\n')
        out_file = out_folder / f.name
        out_file.write_text(out, encoding='utf-8-sig')
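
# Hedged usage sketch for process_folders: format_func receives the token list
# and must return an iterable of strings. The lambda below, which keeps only
# each token's text via the 'content' attribute (the attribute used in the
# demo script further down), is illustrative, not the repo's own formatter.
def _example_process_folders(config):
    process_folders(config, format_func=lambda tokens: [t.content for t in tokens])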

def tokenize_folder(config, user_vocabs=None):
    """Tokenize every .txt file in the input folder and write the tokenized
    version to the output folder, appending the configured suffix to each name."""
    if user_vocabs is None:  # avoid a mutable default argument
        user_vocabs = []
    in_folder = config['Exec']['input_folder']
    out_folder = config['Exec']['output_folder']
    tok_profile = config['Exec']['tok_profile']
    suffix = config['Exec']['tokenized_suffix']

    in_folder = Path(in_folder)
    out_folder = Path(out_folder)
    assert in_folder.is_dir()        # check the input folder exists
    out_folder.mkdir(exist_ok=True)  # ensure the output folder exists

    tok = BoTokenizer(tok_profile, toadd_filenames=user_vocabs)
    if config['Exec'].getboolean('rebuild_trie'):  # see note in process_folders()
        tok.tok.trie.rebuild_trie()

    for f in in_folder.glob('*.txt'):
        content = f.read_text(encoding='utf-8-sig')
        tokens = tok.tokenize(content)
        out = get_tokenized_string(tokens)  # helper assumed to be defined elsewhere
        out = ' '.join(out).replace('\n ', '\n')
        out_file = out_folder / (f.stem + suffix + f.suffix)
        out_file.write_text(out, encoding='utf-8-sig')
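
# Hedged usage sketch for tokenize_folder, assuming config is a standard
# configparser.ConfigParser (the ['Exec'] lookups above suggest this). The
# file name 'config.ini' and the values are illustrative; only the key names
# come from the code above:
#
#     [Exec]
#     input_folder = texts
#     output_folder = texts_tokenized
#     tok_profile = POS
#     tokenized_suffix = _tokenized
#     rebuild_trie = false
def _example_tokenize_folder():
    import configparser
    config = configparser.ConfigParser()
    config.read('config.ini', encoding='utf-8')
    tokenize_folder(config)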

from typing import List


def pybo_tok(text: str, profile: str) -> List[PyboToken]:
    # PyboToken is pybo's token class; assumed to be imported elsewhere in the repo
    tok = BoTokenizer(profile)
    return tok.tokenize(text)
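
# Example call (hedged: the 'POS' profile name is taken from the demo below):
#     tokens = pybo_tok('བཀྲ་ཤིས་བདེ་ལེགས།', 'POS')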

from pathlib import Path

from pybo import BoTokenizer


def get_vocab_files(vocab_folder):
    """Helper function to get the absolute paths of all .txt files in a given dir."""
    files = Path(vocab_folder).glob('*.txt')
    abs_file_paths = [Path.cwd() / f for f in files]
    return abs_file_paths


if __name__ == '__main__':
    custom_vocab_filenames = get_vocab_files('vocabs')
    tok = BoTokenizer('POS', toadd_filenames=custom_vocab_filenames)

    in_str = 'བཀྲ་ཤིས་བདེ་ལེགས། ཀཀཀ། པཔཔ།'
    tokens = tok.tokenize(in_str)
    for t in tokens:
        print(f'{t.content}: {t.pos}')
    # བཀྲ་ཤིས་: NOUN
    # བདེ་ལེགས: NOUN
    # །: punct
    # ཀཀཀ: TEST
    # །: punct
    # པཔཔ: TEST
    # །: punct