Example #1
import pytest
from fugashi import Tagger, UnidicFeatures17


def test_accent(text, accent):
    # This checks for correct handling of feature fields containing commas as reported in #13
    tagger = Tagger()
    tokens = tagger.parseToNodeList(text)
    # Skip if UnidicFeatures17 is used because it doesn't have an 'aType' attribute
    if tokens and isinstance(tokens[0].feature, UnidicFeatures17):
        pytest.skip()
    accent_ = [tok.feature.aType for tok in tokens]
    assert accent_ == accent
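The test functions in these examples receive text and the expected values through pytest parametrization, which the snippets do not show. A minimal sketch of how such a case could be wired up, using a hypothetical ACCENT_CASES name and placeholder data (the real suite defines its own pairs):

import pytest

# Hypothetical (sentence, expected aType list) pairs; actual values depend on the installed dictionary.
ACCENT_CASES = [
    # ('日本語の例文', ['0', '1']),  # placeholder pair, not verified
]

@pytest.mark.parametrize('text,accent', ACCENT_CASES)
def test_accent(text, accent):
    ...  # body as in Example #1 above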
Example #2
from fugashi import Tagger


def test_tokens(text, saved):
    # Testing the token objects is tricky, so instead just check surfaces.
    # TODO: maybe save serialized nodes to compare?
    tagger = Tagger()
    tokens = [str(tok) for tok in tagger.parseToNodeList(text)]
    assert tokens == saved
Example #3
from fugashi import Tagger


def test_accent(text, accent):
    # This checks for correct handling of feature fields containing commas as reported in #13
    tagger = Tagger()
    accent_ = [tok.feature.aType for tok in tagger.parseToNodeList(text)]
    assert accent_ == accent
Example #4
from fugashi import Tagger


def test_pos(text, tags):
    # There should be a pos property when using the default tagger.
    tagger = Tagger()
    tags_ = [tok.pos for tok in tagger.parseToNodeList(text)]
    assert tags_ == tags
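In fugashi, a token's pos property is a comma-joined string of the leading UniDic part-of-speech fields. A quick way to see what the default tagger produces (a sketch, assuming a default dictionary such as unidic-lite is installed):

from fugashi import Tagger

# Assumes a default UniDic dictionary (e.g. unidic-lite) is installed.
tagger = Tagger()
for tok in tagger.parseToNodeList('麩菓子は美味しい'):
    print(tok.surface, tok.pos)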
#!/usr/bin/env python
# Count surface-form frequencies in a Japanese text file with fugashi.
from collections import Counter

from fugashi import Tagger

tt = Tagger()
wc = Counter()

# 'wagahai.txt' is assumed to be UTF-8 encoded Japanese text.
with open('wagahai.txt', encoding='utf-8') as f:
    for line in f:
        for word in tt.parseToNodeList(line.strip()):
            wc[word.surface] += 1
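The script above only fills the counter; a short follow-up, continuing the same script, can display the most frequent words:

# Continuation of the script above: show the ten most frequent surface forms.
for surface, count in wc.most_common(10):
    print(count, surface)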