示例#1
0
def test_config():
    """Check the default config filename and the trie data paths it resolves."""
    cfg = Config()

    # cfg.filename is a Path object, so only its last component is compared
    assert cfg.filename.name == "botok.yaml"

    # Without modifications: the profile lists its sections, each section
    # holds Path objects pointing at resource files.
    main, custom = cfg.get_tok_data_paths("POS")
    assert list(main) == ["words", "words_non_inflected"]
    assert isinstance(main["words"][0], Path)

    # No custom data has been requested so far
    assert len(custom) == 0

    # Passing a modification folder populates the custom sections
    modif_path = "trie_data/"
    main, custom = cfg.get_tok_data_paths("POS", modifs=modif_path)
    assert sorted(custom) == sorted(["words", "words_skrt"])

    # The modification folder itself holds these entries
    folder_entries = [entry.parts[-1] for entry in Path(modif_path).glob("*")]
    assert sorted(folder_entries) == sorted(
        ["adjustment", "remove", "words", "words_skrt"])

    # In "custom" mode the folder replaces the main profile
    main, custom = cfg.get_tok_data_paths(modif_path, mode="custom")
    assert sorted(main) == sorted(
        ["words", "words_non_inflected", "words_skrt"])
示例#2
0
文件: cli.py 项目: Esukhia/pybo
def tok(**kwargs):
    """Tokenize one text file, or every ``*.txt`` file found in a directory.

    Keys read from ``kwargs``:
        input_dir: path of the input file or directory.
        dialect_name: when truthy, the dialect pack is selected by name.
        dialect_path: when truthy (and no name is given), the dialect pack
            is loaded from this path.
        rebuild_trie: forwarded to ``WordTokenizer`` as ``build_trie``.
        tags: when truthy, installed as the default argument of ``pybo_mod``.
        o: output directory; when ``None`` a sibling ``<input>_tok``
            directory is used instead.

    Each input file ``<stem>.txt`` is written to ``<output_dir>/<stem>_tok.txt``.
    The output directory is created when missing, whether it was given
    through ``o`` or derived from the input path.
    """
    input_dir = Path(kwargs["input_dir"])
    dialect_name = kwargs["dialect_name"]
    dialect_path = kwargs["dialect_path"]
    rebuild = kwargs["rebuild_trie"]

    # Resolve the botok config: explicit name first, then explicit path,
    # then the previously saved pybo config, and finally the defaults.
    if dialect_name:
        config = Config(dialect_name=dialect_name)
        save_config(config.dialect_pack_path)
    elif dialect_path:
        config = Config.from_path(dialect_path)
        save_config(config.dialect_pack_path)
    else:
        pybo_config = load_config()
        if not pybo_config:
            config = Config()
            save_config(config.dialect_pack_path)
        else:
            config = Config.from_path(pybo_config["dialect_pack_path"])

    print(
        f"[INFO] Using `{config.dialect_pack_path.name}` dialect pack for tokenization ..."
    )

    wt = WordTokenizer(config=config, build_trie=rebuild)

    def pybo_tok(in_str):
        return wt.tokenize(in_str)

    # Select and order the tags by overriding pybo_mod's default argument
    if kwargs["tags"]:
        pybo_mod.__defaults__ = (list(kwargs["tags"]), )

    # Work out which files to process and where results go by default
    if input_dir.is_dir():
        in_files = input_dir.glob("*.txt")
        default_output_dir = input_dir.parent / (input_dir.name + "_tok")
    elif input_dir.is_file():
        in_files = [input_dir]
        default_output_dir = input_dir.parent / (input_dir.stem + "_tok")
    else:
        print("[INFO] Invalid input directory or file!!!")
        return

    if kwargs["o"] is not None:
        output_dir = Path(kwargs["o"])
    else:
        output_dir = default_output_dir
    # Previously only the derived directory was created; a user-supplied
    # directory that did not exist yet was left missing.
    output_dir.mkdir(parents=True, exist_ok=True)

    for in_file in in_files:
        out_file = output_dir / (in_file.stem + "_tok.txt")
        text = Text(in_file, out_file)
        text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form)
示例#3
0
def test_reset(base_path):
    """A config built for a named dialect pack returns to "general" after reset()."""
    cfg = Config(dialect_name="kangyur")
    assert cfg.dialect_pack_path == base_path / "kangyur"

    cfg.reset()
    assert cfg.dialect_pack_path == base_path / "general"
示例#4
0
def test_add_dialect_pack():
    """Adding a dialect pack must change both the dictionary and the adjustments."""
    cfg = Config()
    # Deep copies, so later changes to cfg cannot leak into the snapshots
    dictionary_before = copy.deepcopy(cfg.dictionary)
    adjustments_before = copy.deepcopy(cfg.adjustments)

    extra_pack = Path("./tests/data/trie_dialect_pack")
    cfg.add_dialect_pack(extra_pack)

    assert cfg.dictionary != dictionary_before
    assert cfg.adjustments != adjustments_before
示例#5
0
def test_syl_tokenize():
    """Tokenize a noisy Tibetan string and compare the texts of the tokens."""
    source = " མཐའི་རྒྱ་མཚོའི་གླིང་། ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ"

    # Chunk the input and prepare its syllables for the trie
    chunks = TokChunks(source)
    chunks.serve_syls_to_trie()

    # Build the tokenizer on a trie made from the default config
    cfg = Config()
    tokenizer = Tokenize(
        Trie(BoSyl, cfg.profile, cfg.dictionary, cfg.adjustments))

    produced = [token.text for token in tokenizer.tokenize(chunks)]

    # current: [' མཐའི་', 'རྒྱ་མཚོའི་', '། ', 'གླིང་', 'བཀྲ་', 'ཤི་', 'tr ', 'ཤིས་  ', 'བདེ་་ལེ གས', '། ', 'བདེ་',
    #          'བཀྲ་ཤིས་', 'ཀཀ', 'ལེགས་']
    assert produced == [
        " མཐའི་",
        "རྒྱ་མཚོའི་",
        "གླིང་",
        "། ",
        "ཤི་",
        "བཀྲ་ཤིས་  ",
        "tr ",
        "བདེ་་ལེ གས",
        "། ",
        "བཀྲ་ཤིས་",
        "བདེ་ལེགས་",
        "ཀཀ",
    ]
示例#6
0
def test_multiple_words_per_entry():
    """One trie entry can carry several senses; both must be found."""
    pack_config = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "POS", pack_config.dictionary, pack_config.adjustments)

    lookup = trie.has_word(syls("ལྟར་"))

    # Sense coming from the affixed verb form
    verb_sense = {
        "lemma": "ལྟ་",
        "pos": "VERB",
        "freq": 123,
        "affixed": True,
    }
    assert verb_sense in lookup["data"]["senses"]

    # Sense coming from the non-affixed adverb
    adverb_sense = {
        "lemma": "ལྟར་",
        "pos": "ADV",
        "freq": 456,
        "affixed": False,
    }
    assert adverb_sense in lookup["data"]["senses"]
示例#7
0
def test_defaults(base_path):
    """A default Config points at the "general" pack and lists .tsv data files."""
    cfg = Config()

    # Default dialect pack path
    assert cfg.dialect_pack_path == base_path / "general"
    assert cfg.dialect_pack_path.is_dir()

    # Trie data: every listed file must be a .tsv file
    for section in ("words", "rules"):
        assert section in cfg.dictionary
        assert all(fn.suffix == ".tsv" for fn in cfg.dictionary[section])

    # Segmentation adjustments: entries without a suffix are skipped,
    # all others must be .tsv files
    for section in ("remove", "rules", "words", "words_skrt"):
        assert section in cfg.adjustments
        assert all(
            fn.suffix == ".tsv"
            for fn in cfg.adjustments[section]
            if fn.suffix
        )
示例#8
0
def test_empty_config():
    """Loading the empty dialect pack yields no dictionary and no adjustments."""
    empty_pack = "./tests/data/empty_dialect_pack"
    cfg = Config.from_path(empty_pack)

    for loaded in (cfg.dictionary, cfg.adjustments):
        assert not loaded
示例#9
0
def test_custome_dialect_pack(base_path):
    """Selecting a dialect pack by name resolves to an existing folder under base_path."""
    pack_name = "kangyur"
    pack_path = Config(dialect_name=pack_name).dialect_pack_path

    assert pack_path == base_path / pack_name
    assert pack_path.is_dir()
示例#10
0
from pathlib import Path

from botok import WordTokenizer, Text, Config

###########################################
# Tokenize a mixed sample (Tibetan text, Latin letters, Tibetan digits and
# punctuation) with a WordTokenizer built from default arguments.
# NOTE: this first `tokens` result is overwritten below before being used.
in_str = "ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"
WT = WordTokenizer()
tokens = WT.tokenize(in_str)

# Tokenize the same short string twice through Text: once with a default
# Config and once with the test dialect pack, then print both results so
# they can be compared.
in_str = "ལ་པོ་ལ་པོ་ལ་པོ་"
t = Text(in_str, tok_params={"config": Config()})
tokens = t.tokenize_words_raw_text
tt = Text(
    in_str,
    tok_params={"config": Config.from_path("./tests/data/trie_dialect_pack")},
)
ttokens = tt.tokenize_words_raw_text
print(tokens)
print(ttokens)
###########################################

#
# ### Extract token-string / POS pairs ########
#
# tagged = ['"{}"/{}'.format(w.text, w.pos) for w in tokens]
# print(', '.join(tagged))
#
#
# ### Extract the cleaned version of the tokens
#
# cleaned = [w.text_cleaned for w in tokens if w.text_cleaned]
示例#11
0
def test_adj_config():
    """Check the adjustment data paths of the "basic" profile with a modification folder."""
    cfg = Config()
    main, custom = cfg.get_adj_data_paths("basic", modifs="trie_data")

    # Concatenating the file names must give exactly one expected name each
    main_names = "".join(path.name for path in main)
    assert main_names == "rdr_basis.tsv"

    custom_names = "".join(path.name for path in custom)
    assert custom_names == "test.tsv"
示例#12
0
def test_createtrie():
    """Walk through adding, inflecting, annotating and deactivating trie entries."""
    pack_config = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "empty", pack_config.dictionary,
                pack_config.adjustments)

    # add() works, but should never be called directly: it does not inflect
    # entries, so the tokenizer won't behave as expected. Only use it for
    # words that can never be inflected, such as case particles.
    trie.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"})
    not_found = {"exists": False, "data": {"_": {}}}
    assert trie.has_word(syls("གྲུབ་མཐའི་")) == not_found

    # inflect_n_modify_trie() is the proper way to add entries
    trie.inflect_n_modify_trie("གྲུབ་མཐའ་")
    inflected_entry = {
        "exists": True,
        "data": {
            "_": {},
            "affixation": {"len": 2, "type": "gi", "aa": True},
        },
    }
    assert trie.has_word(syls("གྲུབ་མཐའི་")) == inflected_entry

    # with skrt=True the entry also carries the "skrt" flag and a sense
    trie.inflect_n_modify_trie("ཀ་ར་", skrt=True)
    skrt_entry = {
        "exists": True,
        "data": {
            "_": {},
            "affixation": {"len": 1, "type": "la", "aa": False},
            "skrt": True,
            "senses": [{"lemma": "", "affixed": True}],
        },
    }
    assert trie.has_word(syls("ཀ་རར་")) == skrt_entry

    # 'freq' is hard-coded in Trie, just as 'lemma' and 'pos' are;
    # the stored frequency is an int
    trie.inflect_n_add_data("གྲུབ་མཐའ་\t\t\t\t532")
    entry_with_freq = {
        "exists": True,
        "data": {
            "_": {},
            "affixation": {"len": 2, "type": "gi", "aa": True},
            "senses": [{"freq": 532, "affixed": True}],
        },
    }
    assert trie.has_word(syls("གྲུབ་མཐའི་")) == entry_with_freq

    # like add(), deactivate() is not meant to be used directly;
    # use inflect_n_modify_trie("word", deactivate=True) instead
    trie.deactivate(syls("ཀ་ར་"))
    deactivated = trie.has_word(syls("ཀ་ར་"))
    assert deactivated["exists"] is False  # since 'ཀ་ར་' has been deactivated
示例#13
0
# coding: utf8
from collections import defaultdict
from pathlib import Path

from botok import BoSyl, Config, TokChunks, Trie

# Module-level Config built with default arguments when this module is imported.
# NOTE(review): test_createtrie below shadows this name with its own local
# `config`; presumably other tests not visible here rely on it — confirm.
config = Config()


def syls(string):
    """Return the syllables that TokChunks extracts from ``string``."""
    chunks = TokChunks(string)
    return chunks.get_syls()


def test_createtrie():
    """Show that add() does not inflect entries, then add the word properly."""
    pack_config = Config.from_path("./tests/data/trie_dialect_pack")
    trie = Trie(BoSyl, "empty", pack_config.dictionary,
                pack_config.adjustments)

    # add() works, but should never be called directly: it does not inflect
    # entries, so the tokenizer won't behave as expected. Only use it for
    # words that can never be inflected, such as case particles.
    trie.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"})
    not_found = {"exists": False, "data": {"_": {}}}
    assert trie.has_word(syls("གྲུབ་མཐའི་")) == not_found

    # inflect_n_modify_trie() is the proper way to add entries
    trie.inflect_n_modify_trie("གྲུབ་མཐའ་")
示例#14
0
def empty_wt():
    """Build a WordTokenizer configured from the empty dialect pack."""
    return WordTokenizer(
        config=Config.from_path("./tests/data/empty_dialect_pack"))