예제 #1
0
def test_scope_concat_scope():
    """Check patterns made of parenthesised scopes.

    Two scopes in sequence, '(abc)(abc)', are expected to accept 'abcabc'
    and reject 'ab'. A lone scope, '(abef)', is expected to accept 'abef'.
    """
    # Two scopes one after the other.
    doubled = StemGuesser('(abc)(abc)', '', [(None, 0.0)]).fst
    assert accepts(doubled, 'abcabc')
    assert not accepts(doubled, 'ab')

    # A single scope on its own.
    single = StemGuesser('(abef)', '', [(None, 0.0)]).fst
    assert accepts(single, 'abef')
예제 #2
0
def test_union_concat_scope():
    """Check unions ('[...]') concatenated with a scope ('(...)').

    Each pattern is paired with the strings the test expects it to accept.
    """
    expectations = (
        ('[abc](de)', ('cde',)),
        ('[abc](de)[fgh]', ('cdef', 'adeg')),
        ('[abc](ce)[fgh]', ('acef',)),
    )
    for pattern, accepted_words in expectations:
        machine = StemGuesser(pattern, '', [(None, 0.0)]).fst
        for word in accepted_words:
            assert accepts(machine, word)
예제 #3
0
def test_sigma_star_symbol_sigma_star():
    """Check patterns with a mandatory element between two starred parts.

    '.*j.*' is expected to reject '' and 'a' but accept 'j'. The patterns
    '[CV]*[CV][CV]*' and '.*..*' are both expected to reject the empty
    string. All patterns are built with ``nahuatl_alphabet``.
    """
    around_j = StemGuesser('.*j.*', '', [(None, 0.0)], nahuatl_alphabet).fst
    for rejected in ('', 'a'):
        assert not accepts(around_j, rejected)
    assert accepts(around_j, 'j')

    # Both spellings require at least one symbol, so '' must be rejected.
    for pattern in ('[CV]*[CV][CV]*', '.*..*'):
        needs_one_symbol = StemGuesser(
            pattern, '', [(None, 0.0)], nahuatl_alphabet
        ).fst
        assert not accepts(needs_one_symbol, '')
예제 #4
0
def test_sigma_star_following_sigma():
    """Check the pattern '..*' with two different alphabets.

    With a small inline alphabet the empty string is expected to be
    rejected. With ``nahuatl_alphabet`` the test expects 'a', 'at' and 'atp'
    to be accepted and '' to be rejected.
    """
    tiny_alphabet = {'C': ['b', 'c'], 'V': ['a']}
    tiny_machine = StemGuesser('..*', '', [(None, 0.0)], tiny_alphabet).fst
    assert not accepts(tiny_machine, '')

    nahuatl_machine = StemGuesser(
        '..*', '', [(None, 0.0)], nahuatl_alphabet
    ).fst
    verdicts = (
        ('a', True),
        ('', False),
        ('at', True),
        ('atp', True),
    )
    for word, expected in verdicts:
        assert bool(accepts(nahuatl_machine, word)) == expected
예제 #5
0
def test_closure_no_alphabet():
    """Check 'CV*' when no alphabet is passed to StemGuesser.

    The test expects 'C' followed by zero to three 'V' symbols to be
    accepted, and 'CVC' to be rejected.
    """
    machine = StemGuesser('CV*', '', [(None, 0.0)]).fst
    # Produces 'C', 'CV', 'CVV', 'CVVV' in that order.
    for vowel_count in range(4):
        assert accepts(machine, 'C' + 'V' * vowel_count)
    assert not accepts(machine, 'CVC')
예제 #6
0
def test_closure_of_scope_no_alphabet():
    """Check '(CV)*' when no alphabet is passed to StemGuesser.

    The test expects '', 'CV' and 'CVCV' to be accepted, and 'CVV' and
    'CCV' to be rejected.
    """
    machine = StemGuesser('(CV)*', '', [(None, 0.0)]).fst
    for word in ('', 'CV', 'CVCV'):
        assert accepts(machine, word)
    for word in ('CVV', 'CCV'):
        assert not accepts(machine, word)
예제 #7
0
def test_sigma_star_following():
    """Check 't.*' built with ``nahuatl_alphabet``.

    The test expects 't', 'ta' and 'tta' to be accepted, and '' and 'at'
    to be rejected.
    """
    machine = StemGuesser('t.*', '', [(None, 0.0)], nahuatl_alphabet).fst
    verdicts = (
        ('t', True),
        ('', False),
        ('ta', True),
        ('tta', True),
        ('at', False),
    )
    for word, expected in verdicts:
        assert bool(accepts(machine, word)) == expected
예제 #8
0
def test_sigma_star_preceding():
    """Check '.*t' built with ``nahuatl_alphabet``.

    The test expects 't', 'at' and 'att' to be accepted, and '' and 'ta'
    to be rejected.
    """
    machine = StemGuesser('.*t', '', [(None, 0.0)], nahuatl_alphabet).fst
    verdicts = (
        ('t', True),
        ('', False),
        ('at', True),
        ('att', True),
        ('ta', False),
    )
    for word, expected in verdicts:
        assert bool(accepts(machine, word)) == expected
예제 #9
0
def test_closure_of_scope_preceding_symbol():
    """Check '(CV)*C' when no alphabet is passed to StemGuesser.

    The test expects 'CVC', 'CVCVC' and 'C' to be accepted, and 'CCV' and
    '' to be rejected.
    """
    machine = StemGuesser('(CV)*C', '', [(None, 0.0)]).fst
    verdicts = (
        ('CCV', False),
        ('CVC', True),
        ('CVCVC', True),
        ('C', True),
        ('', False),
    )
    for word, expected in verdicts:
        assert bool(accepts(machine, word)) == expected
예제 #10
0
def test_symbol_closure():
    """Check the closure of a single symbol, 'a*'.

    The test expects zero to four repetitions of 'a' to be accepted and
    'ab' to be rejected.
    """
    machine = StemGuesser('a*', '', [(None, 0.0)]).fst
    # Produces '', 'a', 'aa', 'aaa', 'aaaa' in that order.
    for repetitions in range(5):
        assert accepts(machine, 'a' * repetitions)
    assert not accepts(machine, 'ab')
예제 #11
0
def test_sigma_star_even_number():
    """Check two sigma-stars in a row, '.*.*', built with ``nahuatl_alphabet``.

    The test expects every probe string to be accepted: the empty string
    and strings of one, two and three symbols. Each input is checked once.
    """
    machine = StemGuesser('.*.*', '', [(None, 0.0)], nahuatl_alphabet).fst
    for word in ('', 'a', 't', 'at', 'atp'):
        assert accepts(machine, word)
예제 #12
0
def parser_from_stem(stem):
    """Compile a parser from a stem pattern and an 'Absolutive' suffix slot.

    Builds a starting StemGuesser named 'NounStem' from ``stem`` (using
    ``nawat_alphabet``) that continues to the 'Absolutive' slot, plus that
    slot with its three suffix entries, and passes both to ``compile``.

    NOTE(review): ``compile`` is presumably the project's compiler imported
    elsewhere in the file rather than the Python builtin -- confirm.

    Args:
        stem: Pattern string handed to StemGuesser as its first argument.

    Returns:
        Whatever ``compile`` returns for the set of the two slots.
    """
    noun_stem = StemGuesser(
        stem,
        'NounStem',
        [('Absolutive', 0.0)],
        alphabet=nawat_alphabet,
        start=True,
    )
    absolutive_entries = [
        ('-t', 't', [(None, 0.0)], 0.0),
        ('-ti', 'ti', [(None, 0.0)], 0.0),
        # In this case the l actually belongs to the stem.
        ('l-li', 'li', [(None, 0.0)], 0.0),
    ]
    absolutive = Slot('Absolutive', absolutive_entries)
    return compile({noun_stem, absolutive})
예제 #13
0
def test_closure_of_union_no_alphabet():
    """Check '[CV]*V[CV]*V[CV]*' when no alphabet is passed to StemGuesser.

    The pattern requires two 'V' symbols; the test labels strings it
    accepts as bimoraic and strings it rejects as not bimoraic.
    """
    machine = StemGuesser('[CV]*V[CV]*V[CV]*', '', [(None, 0.0)]).fst

    bimoraic = ('CVVCV', 'VV', 'VVC', 'CVCV', 'CVCVC')
    for word in bimoraic:
        assert accepts(machine, word)

    not_bimoraic = ('CV', 'CC', 'CCV')
    for word in not_bimoraic:
        assert not accepts(machine, word)
예제 #14
0
def test_concat():
    """Check plain concatenation of symbols, 'CVCV', with no alphabet.

    The test expects exactly 'CVCV' to be accepted, and 'CVC' and 'CVV'
    to be rejected.
    """
    machine = StemGuesser('CVCV', '', [(None, 0.0)]).fst
    assert accepts(machine, 'CVCV')
    for word in ('CVC', 'CVV'):
        assert not accepts(machine, word)
예제 #15
0
def test_sigma_star_alone():
    """Check '.*' on its own, built with ``nahuatl_alphabet``.

    The test expects '', 'a', 'ann' and 'nn' all to be accepted.
    """
    machine = StemGuesser('.*', '', [(None, 0.0)], nahuatl_alphabet).fst
    for word in ('', 'a', 'ann', 'nn'):
        assert accepts(machine, word)
예제 #16
0
def test_sigma_in_middle():
    """Check a single '.' between two fixed symbols, 'p.p'.

    Built with ``nahuatl_alphabet``. The test expects 'pop', 'pip' and
    'psp' to be accepted, and 'pp' to be rejected.
    """
    machine = StemGuesser('p.p', '', [(None, 0.0)], nahuatl_alphabet).fst
    for word in ('pop', 'pip', 'psp'):
        assert accepts(machine, word)
    assert not accepts(machine, 'pp')
예제 #17
0
def test_sigma_concatenated():
    """Check three '.' in a row, '...', built with ``nahuatl_alphabet``.

    The test expects 'tap' to be accepted, and '', 'ta' and 'main' to be
    rejected.
    """
    machine = StemGuesser('...', '', [(None, 0.0)], nahuatl_alphabet).fst
    assert accepts(machine, 'tap')
    for word in ('', 'ta', 'main'):
        assert not accepts(machine, word)
예제 #18
0
from morphotactics.stem_guesser import StemGuesser
import pynini

# Symbol classes passed to StemGuesser as its alphabet: each key is a class
# name usable in a pattern and each value lists the symbols in that class
# (presumably 'C' = consonants, including multi-letter ones, 'V' = vowels).
nahuatl_alphabet = dict(
    C=[
        'm', 'n', 'p', 't', 'k',
        'kw', 'h', 'ts', 'tl', 'ch',
        's', 'l', 'x', 'j', 'w',
    ],
    V=['a', 'e', 'i', 'o'],
)

# The same two-'V' pattern written with explicit [CV] unions ...
bimoraic_fsa = StemGuesser(
    '[CV]*V[CV]*V[CV]*', '', [(None, 0.0)], nahuatl_alphabet
).fst
# ... and with the '.' wildcard in place of [CV].
bimoraic_fsa_sigma_form = StemGuesser(
    '.*V.*V.*', '', [(None, 0.0)], nahuatl_alphabet
).fst
# Note: StemGuesser('.*V.*V.*', ...) != StemGuesser('[CV]*V[CV]*V[CV]*', ...)
# for the same alphabet, because state optimization numbers the states
# differently; the two still accept the same language.


def accepts(fst, input_str):
    """Report whether composing ``input_str`` with ``fst`` yields any states.

    Args:
        fst: The automaton to test against.
        input_str: The string composed on the left of ``fst``.

    Returns:
        True when the composed machine has at least one state (treated by
        the callers in this file as "the string is accepted"), False when
        it has none.
    """
    composed = pynini.compose(input_str, fst)
    state_count = composed.num_states()
    return state_count != 0


def is_bimoraic(oov_stem):
    """Return whether ``oov_stem`` is accepted by ``bimoraic_fsa``.

    ``bimoraic_fsa`` is the module-level automaton built from the
    '[CV]*V[CV]*V[CV]*' pattern.
    """
    matched = accepts(bimoraic_fsa, oov_stem)
    return matched


def is_bimoraic_sigma_form(oov_stem):
    """Return whether ``oov_stem`` is accepted by ``bimoraic_fsa_sigma_form``.

    ``bimoraic_fsa_sigma_form`` is the module-level automaton built from the
    '.*V.*V.*' pattern.
    """
    matched = accepts(bimoraic_fsa_sigma_form, oov_stem)
    return matched


def test_sigma_concatenated():
예제 #19
0
def test_union_concat_union():
    """Check two unions in sequence, '[abc][abc]', with no alphabet.

    The test expects the two-symbol string 'ab' to be accepted and the
    six-symbol string 'abcabc' to be rejected.
    """
    machine = StemGuesser('[abc][abc]', '', [(None, 0.0)]).fst
    too_long, two_symbols = 'abcabc', 'ab'
    assert not accepts(machine, too_long)
    assert accepts(machine, two_symbols)
예제 #20
0
 # This structure can happen within a sentence too, so even though it doesn't
 # occur all too often, the best way to deal with it is to always include the
 # subject prefix that predicates take.
 Slot('Subject', [
     ('n-', 'n', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0),
     ('ni-', 'ni', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0),
     ('t-', 't', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0),
     ('ti-', 'ti', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 0.0),
     ('0-', '', [('NounStem', 0.0), ('PossessedNoun', 0.0)], 100.0),  # the most common case by far
 ], start=True),
 Slot('NounStem', [
     ('', '', [('NounStemC', 0.0), ('NounStemV', 0.0)], 0.0),
 ]),
 StemGuesser('.*C', 'NounStemC', [
     ('C-Absolutive', 100.0),
     (None, 0.0),  # This rarer case mostly occurs when ending in -l or -s with more than one mora
     ('tsin', 100.0),
     ('Locative', 0.0)
 ], alphabet=nawat_alphabet),
 Slot('C-Absolutive', [('-ti', 'ti', [(None, 0.0)], 0.0)]),
 StemGuesser('.*V', 'NounStemV', [
     ('V-Absolutive', 100.0),
     ('tsin', 100.0),
     ('Locative', 0.0)
 ], alphabet=nawat_alphabet),
 Slot('V-Absolutive', [
     ('-t', 't', [(None, 0.0)], 0.0),
     ('l-li', 'li', [(None, 0.0)], 0.0)  # Here, l is actually part of the stem, but easier to do this way
 ]),
 Slot('PossessedNoun', [
     ('no-', 'no', [('PossessedNounStem', 0.0)], 0.0),
     ('n-', 'n', [('oPossessedNounStem', 0.0)], 0.0),