def test_lexicon(self):
    """Check both lookup tables of a Lexicon after a single add()."""
    lex = Lexicon()
    lex.add("FOO", "bar", "foo")

    # Lemma -> list of (key regex, stem, tags); no tags were supplied,
    # so the third element is expected to be an empty set.
    expected_stems = [("bar", "foo", set())]
    self.assertEqual(lex.lemma_to_stems["FOO"], expected_stems)

    # Stem -> set of (lemma, key regex, tags-as-tuple).
    expected_keys = {("FOO", "bar", ())}
    self.assertEqual(lex.stem_to_lemma_key_regex["foo"], expected_keys)
def setUp(self):
    """Build the fixture: a one-stem lexicon, one stemming rule, one Inflexion."""
    stem_lexicon = Lexicon()
    stem_lexicon.add("FOO", "bar", "foo")

    rule_set = StemmingRuleSet()
    # Keep whatever add() returns on the test case as self.rule.
    self.rule = rule_set.add("barista", "|o><|llow")

    # Expose the Inflexion on the test case before wiring it up.
    engine = self.inflexion = Inflexion()
    engine.add_lexicon(stem_lexicon)
    engine.add_stemming_rule_set(rule_set)
def test_find_stems_with_tags_2(self):
    """Two differently tagged stems under one lemma/key regex are both found."""
    lex = Lexicon()
    # Same lemma and key regex, distinct stem and tag set for each entry.
    for stem, tags in (("faa", {"-a"}), ("fee", {"-b"})):
        lex.add("FOO", "bar", stem, tags)

    found = lex.find_stems("FOO", "barista")
    self.assertEqual(found, {"faa", "fee"})
def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    """Load a YAML lexicon file into a Lexicon plus form and accent overrides.

    The YAML document is expected to be a mapping of lemma -> entry.
    Entries without a "stems" key are skipped entirely. For every other
    entry the following keys are read:

    * "stems": mapping of part number (e.g. "1-", "3+A") to a stems value.
      The part number is translated to a key regex and every (stem, tag)
      pair produced by ``split_stem_tags`` is added to the lexicon.
      A null "stems" value is treated as empty.
    * "stem_overrides": iterable of (key_regex, stems) pairs, added the
      same way; pairs whose stems value is None are ignored.
    * "forms": mapping of key -> form, stored under ``(lemma, key)``.
    * "accents": iterable of (key_regex, form) pairs, collected per lemma.

    Args:
        lexicon_file: Path of the YAML file to read.
        pre_processor: Callable applied to each stem before it is added
            to the lexicon. Defaults to the identity function.

    Returns:
        A ``(lexicon, form_override, accent_override)`` tuple, where
        ``form_override`` is a dict keyed by ``(lemma, key)`` and
        ``accent_override`` is a defaultdict mapping each lemma to a list
        of ``(key_regex, form)`` tuples.

    Raises:
        KeyError: If an entry uses a part number that has no key regex.
    """
    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
    }

    lexicon = Lexicon()
    form_override = {}
    accent_override = defaultdict(list)

    def add_stems(lemma, key_regex, stems):
        # Register every (stem, tag) pair of one stems value.
        for stem, tag in split_stem_tags(stems):
            lexicon.add(lemma, key_regex, pre_processor(stem), tag)

    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    with open(lexicon_file, encoding="utf-8") as f:
        # safe_load replaces the bare yaml.load(f): PyYAML 6 refuses to run
        # load() without a Loader, and older releases would build arbitrary
        # Python objects from tagged nodes. An empty document parses to
        # None, which is treated as "no entries".
        entries = yaml.safe_load(f) or {}

    for lemma, entry in entries.items():
        if "stems" not in entry:
            continue

        # Process part numbers in sorted order; a null "stems" is empty.
        for partnum, stems in sorted((entry["stems"] or {}).items()):
            add_stems(lemma, partnum_to_key_regex[partnum], stems)

        for key_regex, stems in entry.get("stem_overrides", []):
            if stems is not None:
                add_stems(lemma, key_regex, stems)

        for key, form in entry.get("forms", {}).items():
            form_override[(lemma, key)] = form

        for key_regex, form in entry.get("accents", []):
            accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override
def test_lexicon(self):
    """Verify the forward and reverse mappings created by one add() call."""
    subject = Lexicon()
    subject.add("FOO", "bar", "foo")

    stems_for_lemma = subject.lemma_to_stems["FOO"]
    # No tags were passed to add(), so an empty set is expected here.
    self.assertEqual(stems_for_lemma, [("bar", "foo", set())])

    keys_for_stem = subject.stem_to_lemma_key_regex["foo"]
    # The reverse mapping holds the tags as an (empty) tuple.
    self.assertEqual(keys_for_stem, {("FOO", "bar", ())})
def test_find_stems_with_tags_2(self):
    """find_stems returns both stems when they differ only by stem and tag."""
    subject = Lexicon()
    tagged_stems = (
        ("faa", {"-a"}),
        ("fee", {"-b"}),
    )
    # Both stems share the lemma "FOO" and the key regex "bar".
    for stem, tags in tagged_stems:
        subject.add("FOO", "bar", stem, tags)

    expected = {"faa", "fee"}
    self.assertEqual(subject.find_stems("FOO", "barista"), expected)
def test_find_stems(self):
    """A stem added under key regex "bar" is found when looking up "barista"."""
    lex = Lexicon()
    lex.add("FOO", "bar", "foo")

    found = lex.find_stems("FOO", "barista")
    self.assertEqual(found, {"foo"})
def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    """Load a YAML lexicon file into a Lexicon plus form and accent overrides.

    The YAML document is expected to be a mapping of lemma -> entry. The
    following keys of each entry are read, all of them optional:

    * "stems": mapping of part number (e.g. "1-", "3+A", "M") to a stems
      value. The part number is translated to a key regex and every
      (stem, tag) pair produced by ``split_stem_tags`` is added to the
      lexicon. A missing, null or empty "stems" contributes nothing.
    * "stem_overrides": iterable of (key_regex, stems) pairs, added the
      same way; pairs whose stems value is None are ignored.
    * "forms": mapping of key -> form, stored under ``(lemma, key)``.
    * "accents": iterable of (key_regex, form) pairs, collected per lemma.

    Args:
        lexicon_file: Path of the YAML file to read.
        pre_processor: Callable applied to each stem before it is added
            to the lexicon. Defaults to the identity function.

    Returns:
        A ``(lexicon, form_override, accent_override)`` tuple, where
        ``form_override`` is a dict keyed by ``(lemma, key)`` and
        ``accent_override`` is a defaultdict mapping each lemma to a list
        of ``(key_regex, form)`` tuples.

    Raises:
        KeyError: If an entry uses a part number that has no key regex.
    """
    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
        "8-": "Z[MP]",
        "M": "..M",
        "F": "..F",
        "N": "..N",
    }

    lexicon = Lexicon()
    form_override = {}
    accent_override = defaultdict(list)

    def add_stems(lemma, key_regex, stems):
        # Register every (stem, tag) pair of one stems value.
        for stem, tag in split_stem_tags(stems):
            lexicon.add(lemma, key_regex, pre_processor(stem), tag)

    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    with open(lexicon_file, encoding="utf-8") as f:
        # safe_load replaces the bare yaml.load(f): PyYAML 6 refuses to run
        # load() without a Loader, and older releases would build arbitrary
        # Python objects from tagged nodes. An empty document parses to
        # None, which is treated as "no entries".
        entries = yaml.safe_load(f) or {}

    for lemma, entry in entries.items():
        # A missing, null or empty "stems" all collapse to "no part numbers";
        # the rest are processed in sorted part-number order.
        for partnum, stems in sorted((entry.get("stems") or {}).items()):
            add_stems(lemma, partnum_to_key_regex[partnum], stems)

        for key_regex, stems in entry.get("stem_overrides", []):
            if stems is not None:
                add_stems(lemma, key_regex, stems)

        for key, form in entry.get("forms", {}).items():
            form_override[(lemma, key)] = form

        for key_regex, form in entry.get("accents", []):
            accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override