Example #1
from pathlib import Path

from pybo import BoTokenizer


def process_folders(config,
                    format_func,
                    user_vocabs=None,
                    remove_page_info=True):
    # avoid the mutable-default pitfall: a shared list default would be
    # reused across calls
    if user_vocabs is None:
        user_vocabs = []
    in_folder = config['Exec']['input_folder']
    out_folder = config['Exec']['output_folder']
    tok_profile = config['Exec']['tok_profile']
    suffix = config['Exec']['tokenized_suffix']  # read but unused here (cf. Example #2)

    in_folder = Path(in_folder)
    out_folder = Path(out_folder)

    assert in_folder.is_dir()  # check the input folder exists
    out_folder.mkdir(exist_ok=True)  # ensure the output folder exists

    tok = BoTokenizer(tok_profile, user_word_list=user_vocabs)
    # caution: if config is a ConfigParser, this value is a string and
    # bool() is True for any non-empty string; use
    # config['Exec'].getboolean('rebuild_trie') to parse 'true'/'false'
    if bool(config['Exec']['rebuild_trie']):
        tok.tok.trie.rebuild_trie()

    in_files = in_folder.glob('*.txt')
    for f in in_files:
        content = f.read_text(encoding='utf-8-sig')
        if remove_page_info:
            content = clean_lines(content)
        tokens = tok.tokenize(content)
        out = format_func(tokens)
        out = ' '.join(out).replace('\n ', '\n')
        out_file = out_folder / f.name  # keep the input file's name
        out_file.write_text(out, encoding='utf-8-sig')
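
A minimal driver sketch for process_folders, assuming config is a plain dict of dicts (a configparser.ConfigParser exposing the same keys works identically). Every folder name and value below is a placeholder, and remove_page_info is disabled because the clean_lines helper is defined elsewhere:

config = {
    'Exec': {
        'input_folder': 'corpus_in',       # must already exist
        'output_folder': 'corpus_out',     # created if missing
        'tok_profile': 'POS',              # pybo tokenizer profile
        'tokenized_suffix': '_tokenized',  # read but unused by this variant
        'rebuild_trie': '',                # empty string is falsy: no rebuild
    }
}

# format_func maps a token list to a list of strings; here we simply keep
# each token's text
process_folders(config,
                lambda tokens: [t.content for t in tokens],
                remove_page_info=False)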
Example #2
from pathlib import Path

from pybo import BoTokenizer


def tokenize_folder(config, user_vocabs=None):
    # avoid the mutable-default pitfall: a shared list default would be
    # reused across calls
    if user_vocabs is None:
        user_vocabs = []
    in_folder = config['Exec']['input_folder']
    out_folder = config['Exec']['output_folder']
    tok_profile = config['Exec']['tok_profile']
    suffix = config['Exec']['tokenized_suffix']

    in_folder = Path(in_folder)
    out_folder = Path(out_folder)

    assert in_folder.is_dir()  # check the input folder exists
    out_folder.mkdir(exist_ok=True)  # ensure the output folder exists

    tok = BoTokenizer(tok_profile, toadd_filenames=user_vocabs)
    # caution: if config is a ConfigParser, this value is a string and
    # bool() is True for any non-empty string; use
    # config['Exec'].getboolean('rebuild_trie') to parse 'true'/'false'
    if bool(config['Exec']['rebuild_trie']):
        tok.tok.trie.rebuild_trie()

    in_files = in_folder.glob('*.txt')
    for f in in_files:
        content = f.read_text(encoding='utf-8-sig')
        tokens = tok.tokenize(content)
        out = get_tokenized_string(tokens)
        out = ' '.join(out).replace('\n ', '\n')
        out_file = out_folder / (f.stem + suffix + f.suffix)  # insert suffix before the extension
        out_file.write_text(out, encoding='utf-8-sig')
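
If the config is read from an .ini file with configparser, the section consumed by tokenize_folder might look like the sketch below; the keys match what the code reads, the values are illustrative, and get_tokenized_string is assumed to be in scope. Note that ConfigParser values are strings, so bool() is True for any non-empty value; leaving rebuild_trie empty (or switching to getboolean) is the safe way to skip the rebuild:

import configparser

# illustrative .ini content; keys match what tokenize_folder reads,
# values are placeholders
ini_text = """
[Exec]
input_folder = corpus_in
output_folder = corpus_out
tok_profile = POS
tokenized_suffix = _tokenized
rebuild_trie =
"""

config = configparser.ConfigParser()
config.read_string(ini_text)
tokenize_folder(config)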
Example #3
from typing import List

# assumption: PyboToken (pybo's token class) is importable from the package
# root; the exact import path may vary across pybo versions
from pybo import BoTokenizer, PyboToken


def pybo_tok(text: str, profile: str) -> List[PyboToken]:
    tok = BoTokenizer(profile)
    return tok.tokenize(text)
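
Note that pybo_tok builds a fresh BoTokenizer on every call, which means the profile (and the trie behind it) is loaded each time. Code that tokenizes many strings will usually be faster creating one tokenizer up front and reusing it, as the folder-processing examples above do.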
Example #4
from pybo import BoTokenizer
from pathlib import Path


def get_vocab_files(vocab_folder):
    """helper function to get the absolute paths of all .txt files in a give dir"""
    files = Path(vocab_folder).glob('*.txt')
    abs_file_paths = [Path.cwd() / f for f in files]
    return abs_file_paths


if __name__ == '__main__':
    custom_vocab_filenames = get_vocab_files('vocabs')

    tok = BoTokenizer('POS', toadd_filenames=custom_vocab_filenames)

    in_str = 'བཀྲ་ཤིས་བདེ་ལེགས། ཀཀཀ། པཔཔ།'
    tokens = tok.tokenize(in_str)
    for t in tokens:
        print(f'{t.content}: {t.pos}')

    # expected output:
    # བཀྲ་ཤིས་: NOUN
    # བདེ་ལེགས: NOUN
    # །: punct
    # ཀཀཀ: TEST
    # །: punct
    # པཔཔ: TEST
    # །: punct
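
The nonsense syllables ཀཀཀ and པཔཔ come back tagged TEST rather than with a standard POS, which indicates the custom word lists collected from the vocabs folder were indeed loaded into the tokenizer's trie.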