def setUpClass(cls):
    """Builds the case/number noun paradigm shared by the tests.

    The finished paradigm is stored on `cls.paradigm`; its stems are given
    as an explicit list of strings.
    """
    super().setUpClass()
    case_feature = features.Feature("case", "nom", "gen", "dat", "acc", "abl")
    number_feature = features.Feature("num", "sg", "pl")
    # Gender is deliberately left out: it belongs to the stem, not the ending.
    noun = features.Category(case_feature, number_feature)
    nomsg = features.FeatureVector(noun, "case=nom", "num=sg")
    stem = paradigms.make_byte_star_except_boundary()

    def make_slot(ending, *specs):
        # Pairs the suffixation of `ending` with the feature vector it marks.
        return (paradigms.suffix(ending, stem),
                features.FeatureVector(noun, *specs))

    # The nominative singular slot reuses `nomsg`, which is also the lemma
    # feature vector passed to the paradigm below.
    slots = [(paradigms.suffix("+a", stem), nomsg)]
    slots.extend(
        make_slot(ending, case_spec, number_spec)
        for ending, case_spec, number_spec in (
            ("+ae", "case=gen", "num=sg"),
            ("+ae", "case=dat", "num=sg"),
            ("+am", "case=acc", "num=sg"),
            ("+ā", "case=abl", "num=sg"),
            ("+ae", "case=nom", "num=pl"),
            ("+ārum", "case=gen", "num=pl"),
            ("+īs", "case=dat", "num=pl"),
            ("+ās", "case=acc", "num=pl"),
            ("+īs", "case=abl", "num=pl"),
        ))
    known_stems = ["aqu", "bell", "caus", "cicād", "mens", "naut", "puell"]
    cls.paradigm = paradigms.Paradigm(
        category=noun,
        slots=slots,
        lemma_feature_vector=nomsg,
        stems=known_stems)
def setUpClass(cls):
    """Builds the case/number noun paradigm over an open-ended stem set.

    Instead of listing stems, the single stem entry is an acceptor for any
    non-empty sequence drawn from a small vowel and consonant inventory. The
    finished paradigm is stored on `cls.paradigm`.
    """
    super().setUpClass()
    case_feature = features.Feature("case", "nom", "gen", "dat", "acc", "abl")
    number_feature = features.Feature("num", "sg", "pl")
    # Gender is deliberately left out: it belongs to the stem, not the ending.
    noun = features.Category(case_feature, number_feature)
    nomsg = features.FeatureVector(noun, "case=nom", "num=sg")
    stem = paradigms.make_byte_star_except_boundary()

    # (ending, case spec, number spec) for every slot after the lemma slot.
    oblique_endings = (
        ("+ae", "case=gen", "num=sg"),
        ("+ae", "case=dat", "num=sg"),
        ("+am", "case=acc", "num=sg"),
        ("+ā", "case=abl", "num=sg"),
        ("+ae", "case=nom", "num=pl"),
        ("+ārum", "case=gen", "num=pl"),
        ("+īs", "case=dat", "num=pl"),
        ("+ās", "case=acc", "num=pl"),
        ("+īs", "case=abl", "num=pl"),
    )
    # The nominative singular slot reuses `nomsg`, which is also the lemma.
    slots = [(paradigms.suffix("+a", stem), nomsg)] + [
        (paradigms.suffix(ending, stem),
         features.FeatureVector(noun, case_spec, number_spec))
        for ending, case_spec, number_spec in oblique_endings
    ]

    vowel = pynini.union("a", "i", "e", "o", "u")
    consonant = pynini.union("b", "c", "d", "f", "g", "h", "l", "m", "n",
                             "p", "q", "r", "s", "t")
    # One or more segments, each either a vowel or a consonant.
    any_segment = vowel | consonant
    stem_acceptor = any_segment.closure(1)
    cls.paradigm = paradigms.Paradigm(
        category=noun,
        slots=slots,
        lemma_feature_vector=nomsg,
        stems=[stem_acceptor])
def setUpClass(cls):
    """Builds a Latin noun paradigm whose stems carry stem-ID tags.

    Sets three class attributes:
      * `cls.noun`: the case/number category.
      * `cls.paradigm`: the paradigm, with rewrite rules that look across
        the stem-ID tag sitting between the stem and the "+" boundary.
      * `cls.delete_stem_ids`: a rewrite that deletes the stem-ID tags.
    """
    super().setUpClass()
    case = features.Feature("case", "nom", "gen", "dat", "acc", "abl")
    number = features.Feature("num", "sg", "pl")
    # The category is built exactly once; it is reused for every feature
    # vector and for the sigma-star of every rewrite rule below.
    cls.noun = features.Category(case, number)
    nomsg = features.FeatureVector(cls.noun, "case=nom", "num=sg")
    stem = paradigms.make_byte_star_except_boundary()
    slots = [
        (paradigms.suffix("+s", stem), nomsg),
        (paradigms.suffix("+is", stem),
         features.FeatureVector(cls.noun, "case=gen", "num=sg")),
        (paradigms.suffix("+ī", stem),
         features.FeatureVector(cls.noun, "case=dat", "num=sg")),
        (paradigms.suffix("+em", stem),
         features.FeatureVector(cls.noun, "case=acc", "num=sg")),
        (paradigms.suffix("+e", stem),
         features.FeatureVector(cls.noun, "case=abl", "num=sg")),
        (paradigms.suffix("+ēs", stem),
         features.FeatureVector(cls.noun, "case=nom", "num=pl")),
        (paradigms.suffix("+um", stem),
         features.FeatureVector(cls.noun, "case=gen", "num=pl")),
        (paradigms.suffix("+ibus", stem),
         features.FeatureVector(cls.noun, "case=dat", "num=pl")),
        (paradigms.suffix("+ēs", stem),
         features.FeatureVector(cls.noun, "case=acc", "num=pl")),
        (paradigms.suffix("+ibus", stem),
         features.FeatureVector(cls.noun, "case=abl", "num=pl")),
    ]
    velar = pynini.union("c", "ct", "g")
    v = pynini.union("a", "i", "ī", "e", "ē", "u")
    # Builds way more stem IDs than we need to show that this is efficient.
    stem_ids = paradigms.build_stem_ids(1000, 101000)
    rules = [
        # c, ct, g -> x in nominative singular. Note the spelling of "cs" as
        # "x" in Latin breaks the segmentation. One might also consider
        # representing this as "c+s".
        pynini.cdrewrite(
            pynini.cross(velar, "x") + stem_ids + pynini.cross("+s", "+"),
            "", "", cls.noun.sigma_star),
        # s -> r before a stem ID, the boundary and a vowel.
        pynini.cdrewrite(
            pynini.cross("s", "r"), "", stem_ids + "+" + v,
            cls.noun.sigma_star),
        # s+s -> s: deletes the suffixal "s" after a stem-final "s".
        pynini.cdrewrite(
            pynini.cross("s", ""), "s" + stem_ids + "+", "",
            cls.noun.sigma_star),
    ]
    cls.paradigm = paradigms.Paradigm(
        category=cls.noun,
        slots=slots,
        lemma_feature_vector=nomsg,
        stems=["noct__1000__", "ōs__1001__", "pac__1002__", "rēg__1003__"],
        rules=rules)
    # Strips the stem-ID tags wherever they occur.
    cls.delete_stem_ids = pynini.cdrewrite(
        pynutil.delete(stem_ids), "", "", cls.noun.sigma_star)
def setUpClass(cls):
    """Builds a Latin noun paradigm with three stem-final rewrite rules.

    The rules operate directly on the stem + "+" + ending string, and the
    finished paradigm is stored on `cls.paradigm`.
    """
    super().setUpClass()
    case_feature = features.Feature("case", "nom", "gen", "dat", "acc", "abl")
    number_feature = features.Feature("num", "sg", "pl")
    # Gender is deliberately left out: it belongs to the stem, not the ending.
    noun = features.Category(case_feature, number_feature)
    nomsg = features.FeatureVector(noun, "case=nom", "num=sg")
    stem = paradigms.make_byte_star_except_boundary()

    # The nominative singular slot reuses `nomsg`, which is also the lemma.
    slots = [(paradigms.suffix("+s", stem), nomsg)]
    for ending, case_spec, number_spec in (
            ("+is", "case=gen", "num=sg"),
            ("+ī", "case=dat", "num=sg"),
            ("+em", "case=acc", "num=sg"),
            ("+e", "case=abl", "num=sg"),
            ("+ēs", "case=nom", "num=pl"),
            ("+um", "case=gen", "num=pl"),
            ("+ibus", "case=dat", "num=pl"),
            ("+ēs", "case=acc", "num=pl"),
            ("+ibus", "case=abl", "num=pl"),
    ):
        slots.append((paradigms.suffix(ending, stem),
                      features.FeatureVector(noun, case_spec, number_spec)))

    velar = pynini.union("c", "ct", "g")
    vowel = pynini.union("a", "i", "ī", "e", "ē", "u")
    # c, ct, g plus the "+s" ending surface as "x+". Spelling "cs" as "x"
    # blurs the stem/ending segmentation; "c+s" would be an alternative.
    velar_to_x = pynini.cdrewrite(
        pynini.cross(velar + "+s", "x+"), "", "", noun.sigma_star)
    # /s/ becomes "r" before the boundary and a vowel (a non-Gorman theory
    # of this alternation).
    rhotacism = pynini.cdrewrite(
        pynini.cross("s", "r"), "", "+" + vowel, noun.sigma_star)
    # s+s -> s.
    degemination = pynini.cdrewrite(
        pynini.cross("s+s", "s+"), "", "", noun.sigma_star)

    cls.paradigm = paradigms.Paradigm(
        category=noun,
        slots=slots,
        lemma_feature_vector=nomsg,
        stems=["noct", "ōs", "pac", "rēg"],
        rules=[velar_to_x, rhotacism, degemination])
def setUpClass(cls):
    """Builds a verb paradigm with a bare slot and an "um"-affixed slot.

    The "focus=none" slot is the unchanged stem. The "focus=actor" slot
    inserts "um" either after the stem-initial consonant(s) or, for
    vowel-initial stems, at the very beginning. The finished paradigm is
    stored on `cls.paradigm`.
    """
    super().setUpClass()
    focus = features.Feature("focus", "none", "actor")
    verb = features.Category(focus)
    none = features.FeatureVector(verb, "focus=none")
    v = pynini.union("a", "e", "i", "o", "u")
    # Each consonant is listed once: a repeated entry adds a redundant
    # parallel arc to the union and duplicate paths through `um`.
    c = pynini.union("b", "d", "f", "g", "h", "k", "l", "ly", "m", "n", "ng",
                     "ny", "p", "r", "s", "t", "ts", "w", "y", "z")
    stem = paradigms.make_byte_star_except_boundary()
    um = pynini.union(
        # Consonant-initial: C+ "+um+" V ..., i.e. "um" is infixed.
        c.plus + pynutil.insert("+um+") + v + stem,
        # Vowel-initial: "um+" V ..., i.e. "um" is prefixed.
        pynutil.insert("um+") + v + stem)
    slots = [(stem, none), (um, features.FeatureVector(verb, "focus=actor"))]
    cls.paradigm = paradigms.Paradigm(
        category=verb,
        slots=slots,
        lemma_feature_vector=none,
        stems=["bilang", "ibig", "lipad", "kopya", "punta"])
def testFeatureVector(self):
    """Checks feature-vector expansion via `self.fm` and rejection of bad specs."""
    # Gender is left unspecified, so every gender value should appear.
    dat_sg = features.FeatureVector(self.noun, "num=sg", "case=dat")
    dat_sg_filled = dat_sg.acceptor @ self.fm
    self.assertSameElements(
        dat_sg_filled.paths().ostrings(),
        (
            "[case=dat][gen=fem][num=sg]",
            "[case=dat][gen=mas][num=sg]",
            "[case=dat][gen=neu][num=sg]",
        ))

    # Number is left unspecified, so both number values should appear.
    nom_fem = features.FeatureVector(self.noun, "gen=fem", "case=nom")
    nom_fem_filled = nom_fem.acceptor @ self.fm
    self.assertSameElements(
        nom_fem_filled.paths().ostrings(),
        (
            "[case=nom][gen=fem][num=sg]",
            "[case=nom][gen=fem][num=pl]",
        ))

    # Illegal combinations must be rejected: a value from another feature,
    # an unknown value, and an unknown feature name.
    for bad_spec in ("gen=acc", "gen=foofoo", "wiggywoggy=fem"):
        with self.assertRaises(features.Error):
            features.FeatureVector(self.noun, bad_spec, "case=nom")
def setUpClass(cls):
    """Builds a verb paradigm whose suffixes impose stem templates.

    Two of the four slots first force the stem into a fixed
    consonant/vowel shape and then attach a suffix. The finished paradigm
    is stored on `cls.paradigm`.
    """
    super().setUpClass()
    # It is not clear that "aspect" is exactly the right concept here.
    aspect = features.Feature("aspect", "root", "dubitative", "gerundial",
                              "durative")
    verb = features.Category(aspect)
    root = features.FeatureVector(verb, "aspect=root")
    stem = paradigms.make_byte_star_except_boundary()

    vowels = ("a", "i", "o", "u")
    any_vowel = pynini.union(*vowels)
    consonant = pynini.union("c", "m", "h", "l", "y", "k", "ʔ", "d", "n", "w",
                             "t")

    # First template, CVCC^?: keep one vowel after the first consonant and
    # delete any further vowels.
    drop_any_vowel = pynutil.delete(any_vowel)
    cvcc = (consonant + any_vowel + drop_any_vowel.ques + consonant +
            drop_any_vowel.star + consonant.ques).optimize()

    # Second template, CVCVVC^?: the inserted long vowel copies the first
    # vowel, which is easiest to express with one branch per vowel.
    cvcvvc = pynini.Fst()
    for vowel in vowels:
        drop_vowel = pynutil.delete(vowel)
        long_vowel = pynutil.insert(vowel + vowel)
        cvcvvc.union(consonant + vowel + drop_vowel.ques + consonant +
                     drop_vowel.star + long_vowel + consonant.ques)
    cvcvvc.optimize()

    dubitative = features.FeatureVector(verb, "aspect=dubitative")
    gerundial = features.FeatureVector(verb, "aspect=gerundial")
    durative = features.FeatureVector(verb, "aspect=durative")
    slots = [
        (stem, root),
        (paradigms.suffix("+al", stem), dubitative),
        (paradigms.suffix("+inay", stem @ cvcc), gerundial),
        (paradigms.suffix("+ʔaa", stem @ cvcvvc), durative),
    ]
    cls.paradigm = paradigms.Paradigm(
        category=verb,
        slots=slots,
        lemma_feature_vector=root,
        stems=["caw", "cuum", "hoyoo", "diiyl", "ʔilk", "hiwiit"])
def testUnification(self):
    """Checks `FeatureVector.unify` on equal, clashing, partial and foreign vectors."""
    dat_sg = features.FeatureVector(self.noun, "num=sg", "case=dat")

    # Two identical vectors unify to that same vector.
    dat_sg_twin = features.FeatureVector(self.noun, "num=sg", "case=dat")
    self.assertEqual(dat_sg.unify(dat_sg_twin), dat_sg)

    # A clash on the case value makes unification fail.
    nom_sg = features.FeatureVector(self.noun, "num=sg", "case=nom")
    self.assertFalse(dat_sg.unify(nom_sg))

    # Unspecified features are free, so two partial vectors combine into
    # the fully specified one.
    sg_only = features.FeatureVector(self.noun, "num=sg")
    dat_only = features.FeatureVector(self.noun, "case=dat")
    self.assertEqual(sg_only.unify(dat_only), dat_sg)

    # Vectors from different categories never unify.
    some_other_category = features.Category(self.number, self.gender)
    foreign_sg = features.FeatureVector(some_other_category, "num=sg")
    self.assertFalse(sg_only.unify(foreign_sg))
def setUpClass(cls):
    """Builds two related noun paradigms that differ in accent placement.

    `cls.paradigm_a` ("accent A") uses unaccented endings. `cls.paradigm_b`
    ("accent B") names paradigm A as its parent, supplies accented endings
    for the slots it lists, and adds a rule that removes an accent from any
    vowel followed later by another accented vowel.
    """
    super().setUpClass()
    case_feature = features.Feature("case", "nom", "gen", "dat", "acc", "ins",
                                    "prp")
    number_feature = features.Feature("num", "sg", "pl")
    noun = features.Category(case_feature, number_feature)
    stem = paradigms.make_byte_star_except_boundary()
    nomsg = features.FeatureVector(noun, "case=nom", "num=sg")

    def vector(*specs):
        # Feature vector over the noun category.
        return features.FeatureVector(noun, *specs)

    def suffixed(ending, *specs):
        # Pairs the suffixation of `ending` with the feature vector it marks.
        return (paradigms.suffix(ending, stem), vector(*specs))

    # Accent A has stem stress.
    slots_a = [
        (stem, nomsg),
        suffixed("+a", "case=gen", "num=sg"),
        suffixed("+u", "case=dat", "num=sg"),
        (stem, vector("case=acc", "num=sg")),
        suffixed("+om", "case=ins", "num=sg"),
        suffixed("+e", "case=prp", "num=sg"),
        suffixed("+y", "case=nom", "num=pl"),
        suffixed("+ov", "case=gen", "num=pl"),
        suffixed("+am", "case=dat", "num=pl"),
        suffixed("+y", "case=acc", "num=pl"),
        suffixed("+ami", "case=ins", "num=pl"),
        suffixed("+ax", "case=prp", "num=pl"),
    ]
    cls.paradigm_a = paradigms.Paradigm(
        category=noun,
        name="hard stem masculine accent A",
        slots=slots_a,
        lemma_feature_vector=nomsg,
        stems=["grádus", "žurnál"],
    )

    # Accent B has stress-shift to the desinence except in the nom./acc.
    accent_pairs = [
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
        ("ý", "y"),
    ]
    deaccentuation_map = pynini.string_map(accent_pairs)
    accented_vowel = pynini.project(deaccentuation_map, "input")
    # An accented vowel loses its accent whenever another accented vowel
    # occurs anywhere to its right.
    later_accent = noun.sigma_star + accented_vowel
    deaccentuation = pynini.cdrewrite(
        deaccentuation_map, "", later_accent, noun.sigma_star).optimize()

    # Only the slots with accented endings are listed; the nom./acc.
    # singular slots are absent here.
    slots_b = [
        suffixed("+á", "case=gen", "num=sg"),
        suffixed("+ú", "case=dat", "num=sg"),
        suffixed("+óm", "case=ins", "num=sg"),
        suffixed("+é", "case=prp", "num=sg"),
        suffixed("+ý", "case=nom", "num=pl"),
        suffixed("+óv", "case=gen", "num=pl"),
        suffixed("+ám", "case=dat", "num=pl"),
        suffixed("+ý", "case=acc", "num=pl"),
        suffixed("+ámi", "case=ins", "num=pl"),
        suffixed("+áx", "case=prp", "num=pl"),
    ]
    cls.paradigm_b = paradigms.Paradigm(
        category=noun,
        name="hard stem masculine accent B",
        slots=slots_b,
        parent_paradigm=cls.paradigm_a,
        lemma_feature_vector=nomsg,
        stems=["górb", "stól"],
        rules=[deaccentuation])