def test_accent(text, accent):
    """Regression test for feature fields containing commas (issue #13)."""
    tagger = Tagger()
    tokens = tagger.parseToNodeList(text)
    # The UnidicFeatures17 feature set lacks the 'aType' attribute, so this
    # check cannot run under that dictionary — skip instead of failing.
    if tokens and isinstance(tokens[0].feature, UnidicFeatures17):
        pytest.skip()
    assert [token.feature.aType for token in tokens] == accent
def test_tokens(text, saved):
    """Check tokenization output against saved surface strings."""
    # Comparing token objects directly is tricky, so compare their string
    # forms instead.
    # TODO: maybe save serialized nodes to compare?
    tagger = Tagger()
    surfaces = [str(node) for node in tagger.parseToNodeList(text)]
    assert surfaces == saved
def test_accent(text, accent):
    """Check correct handling of feature fields containing commas (issue #13)."""
    # NOTE(review): a function named test_accent also appears earlier in this
    # file; if both live in the same module the later definition shadows the
    # earlier one — confirm these belong to separate test modules.
    tagger = Tagger()
    observed = []
    for node in tagger.parseToNodeList(text):
        observed.append(node.feature.aType)
    assert observed == accent
def test_pos(text, tags):
    """The default tagger should expose a pos property on each token."""
    tagger = Tagger()
    observed = [node.pos for node in tagger.parseToNodeList(text)]
    assert tags == observed
#!/usr/bin/env python
"""Count word-surface frequencies in wagahai.txt using fugashi."""
from collections import Counter

from fugashi import Tagger

tt = Tagger()
wc = Counter()
# Fixes: use a context manager so the file handle is closed (the original
# never closed it), and pass an explicit encoding so decoding does not
# depend on the platform locale. Imports are grouped at the top per PEP 8.
with open('wagahai.txt', encoding='utf-8') as f:
    for line in f:
        for word in tt.parseToNodeList(line.strip()):
            wc[word.surface] += 1