Пример #1
0
def test_parse_multiple(eat_fst_txt: str):
    """
    A file holding more than one Foma FST must be rejected by the parser.

    Foma itself tolerates several FSTs in one file; this parser explicitly
    does not, so the same FST text repeated twice has to raise
    FSTParseError.
    """
    doubled_fst = eat_fst_txt + eat_fst_txt

    expect_rejection = pytest.raises(FSTParseError)
    with expect_rejection:
        parse_text(doubled_fst)
Пример #2
0
def test_parse_states() -> None:
    """
    Check that every layout of state/arc line is parsed.

    The ##states## section of the fixture mixes lines with two, three, four
    and five fields, followed by the all -1 terminator line.
    """
    parsed = parse_text("""##foma-net 1.0##
##props##
2 390211 90019 390213 5 -1 1 2 2 1 0 2
##sigma##
0 @_EPSILON_SYMBOL_@
##states##
0 0 0 0
0 1
0 0 2
1 -1 -1 1
2 0 0 2 1
-1 -1 -1 -1 -1
##end##
""")

    first, second, third = map(StateID, range(3))

    # Every arc in the fixture is an epsilon:epsilon transition.
    expected_arcs = {
        Arc(first, Epsilon, Epsilon, first),
        Arc(first, Epsilon, Epsilon, second),
        Arc(first, Epsilon, Epsilon, third),
        Arc(third, Epsilon, Epsilon, third),
    }
    assert parsed.arcs == expected_arcs
    assert parsed.accepting_states == {1, 2}
Пример #3
0
def test_parse_bad_symbols() -> None:
    """
    The parser must fail when a symbol entry is defined twice.
    """
    # Index 0 appears twice in the ##sigma## section below.
    duplicate_index_fst = """##foma-net 1.0##
##props##
2 390211 90019 390213 5 -1 1 2 2 1 0 2
##sigma##
0 @_EPSILON_SYMBOL_@
3 @P.UN.ON@
0 +Err/Orth
##states##
-1 -1 -1 -1 -1
##end##
"""

    expect_failure = pytest.raises(FSTParseError)
    with expect_failure:
        parse_text(duplicate_index_fst)
Пример #4
0
def test_parse_simple(eat_fst_txt: str):
    """
    Parse a small FST that has no flag diacritics and check its contents.

    The expected alphabet is the multi-character tags listed below plus the
    distinct letters used in "eats", "eaten", "eating" and "ate".
    """
    fst = parse_text(eat_fst_txt)

    expected_tags = set("+3P +Mass +N +Past +PastPart "
                        "+PresPart +Sg +V".split())
    expected_letters = set("eats eaten eating ate".replace(" ", ""))
    expected_size = len(expected_tags) + len(expected_letters)

    # Alphabet: overall size first, then each category.
    assert len(fst.sigma) == 15
    assert expected_size == len(fst.sigma)
    assert stringified_set(fst.multichar_symbols) == expected_tags
    assert stringified_set(fst.graphemes) == expected_letters

    # Shape of the network itself.
    assert len(fst.states) == 15
    assert len(fst.arcs) == 19
    assert fst.accepting_states == {14}
Пример #5
0
def test_parse_fst_with_flag_diacritics(english_flags_fst_txt: str) -> None:
    """
    Parse a Foma FST that uses flag diacritics and check its contents.

    The alphabet is expected to split into three categories: flag
    diacritics, multi-character symbols and single graphemes.
    """
    fst = parse_text(english_flags_fst_txt)
    assert len(fst.sigma) == 22

    expected_flags = set("@C.UN@ @D.UN@ @P.UN.ON@".split())
    expected_tags = set("+Adj +Inf +Pl +V UN+ ".split())
    expected_letters = set("a b d e i k l n o p r s u y".split())

    # The three categories together must account for the whole alphabet.
    expected_size = (
        len(expected_tags) + len(expected_letters) + len(expected_flags)
    )
    assert expected_size == len(fst.sigma)

    assert stringified_set(fst.multichar_symbols) == expected_tags
    assert stringified_set(fst.flag_diacritics) == expected_flags
    assert stringified_set(fst.graphemes) == expected_letters

    # Shape of the network itself.
    assert len(fst.states) == 21
    assert len(fst.arcs) == 27
    assert fst.accepting_states == {20}
Пример #6
0
def test_parse_symbols() -> None:
    """
    Check that the entries of the ##sigma## section are parsed properly.

    The fixture declares epsilon, one flag diacritic, one multi-character
    symbol and one non-ASCII grapheme, and no states.
    """
    fst_text = """##foma-net 1.0##
##props##
2 390211 90019 390213 5 -1 1 2 2 1 0 2
##sigma##
0 @_EPSILON_SYMBOL_@
3 @P.UN.ON@
4 +Err/Orth
5 î
##states##
-1 -1 -1 -1 -1
##end##
"""
    parsed = parse_text(fst_text)

    assert parsed.has_epsilon
    assert all(isinstance(entry, Symbol) for entry in parsed.sigma.values())

    # Epsilon is reported through has_epsilon, not listed in sigma.
    expected_symbols = {"@P.UN.ON@", "+Err/Orth", "î"}
    assert stringified_set(parsed.sigma) == expected_symbols
Пример #7
0
def test_parse_whitespace_in_sigma() -> None:
    """
    Check that whitespace characters used as sigma symbols are kept.

    Entries 3, 4 and 5 are U+0020, U+00A0 and U+00AD. They are written as
    escape sequences in the fixture literal, so the text handed to the
    parser contains the actual characters.
    """
    fst_text = """##foma-net 1.0##
##props##
2 390211 90019 390213 5 -1 1 2 2 1 0 2
##sigma##
0 @_EPSILON_SYMBOL_@
1 @_UNKNOWN_SYMBOL_@
2 @_IDENTITY_SYMBOL_@
3 \u0020
4 \u00A0
5 \u00AD
##states##
-1 -1 -1 -1 -1
##end##
"""
    fst = parse_text(fst_text)

    # Only the three whitespace-like graphemes are counted in sigma.
    assert len(fst.sigma) == 3

    expected_graphemes = {' ', '\N{NO-BREAK SPACE}', '\N{SOFT HYPHEN}'}
    assert stringified_set(fst.graphemes) == expected_graphemes
Пример #8
0
def test_parse_at_symbol() -> None:
    """
    Regression test: parsing FST_WITH_AT_TEXT must complete without raising.

    It used to throw "NotImplementedError". The test passes as long as
    parse_text() returns; the parse result itself is not inspected.
    """
    # NOTE(review): FST_WITH_AT_TEXT is defined outside this function;
    # presumably it is an FST whose alphabet contains an "@" symbol -- confirm.
    parse_text(FST_WITH_AT_TEXT)