Example #1
def test_shunting_yard_postfix_re():
    obtained = list(
        shunting_yard_postfix(
            tokenizer_re("(12)?(345)*?(67)+89"), MAP_OPERATORS_RE
        )
    )
    assert obtained == [
        '1', '2', '.', '?', '3', '4', '.', '5', '.', '*', '?', '.',
        '6', '7', '.', '+', '.', '8', '.', '9', '.'
    ]
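The `.` tokens in the expected output are concatenation operators that `tokenizer_re` makes explicit before the shunting-yard pass reorders everything into postfix. A minimal sketch on a simpler pattern, assuming these names are importable from pybgl's shunting-yard module (the import path is an assumption, as no imports appear in these snippets):

from pybgl.shunting_yard_postfix import (  # assumed module path
    MAP_OPERATORS_RE, shunting_yard_postfix, tokenizer_re
)

# "ab|c" tokenizes to ['a', '.', 'b', '|', 'c']; since concatenation binds
# tighter than '|', the postfix form is ['a', 'b', '.', 'c', '|'].
print(list(shunting_yard_postfix(tokenizer_re("ab|c"), MAP_OPERATORS_RE)))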
Example #2
def test_rpn_deque_ast():
    tokenized = tokenizer_re("(a?b)*?c+d")
    ast = Ast()
    output = RpnDequeAst(map_operators=MAP_OPERATORS_RE, ast=ast)
    ret = shunting_yard_postfix(tokenized, MAP_OPERATORS_RE, output=output)
    assert num_vertices(ast) == 11
    assert num_edges(ast) == 10
    [root] = ret  # The output deque holds a single item: the AST root vertex.
    assert root == 10
    from pybgl.graphviz import graph_to_html
    graph_to_html(ast)  # Smoke test: rendering the AST to HTML must not raise.
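The test only checks that rendering the AST does not raise. Outside a test you would typically keep the result; a minimal sketch, assuming graph_to_html returns the HTML as a string (the output path is illustrative, not from the tests):

html = graph_to_html(ast)
with open("ast.html", "w") as f:  # illustrative path
    f.write(html)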
Example #3
def test_tokenizer_re_implicit():
    map_input_expected = {
        "123?(4|5)*67": "1.2.3?.(4|5)*.6.7",
        "(1?2)*?3+4": "(1?.2)*?.3+.4",
        "a\\dx": "a.\\d.x",
        "a\\d+x": "a.\\d+.x",
        "a[0-9]x": "a.[0-9].x",
        "a[0-9]+x": "a.[0-9]+.x",
        "a{1,2}+x": "a{1,2}+.x",
    }
    for (regexp, expected) in map_input_expected.items():
        obtained = "".join(tokenizer_re(regexp))
        assert obtained == expected
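Because the test joins the tokens into a string, the token boundaries stay implicit; spelled out, the "a\\dx" case yields the list below (consistent with the escape-sequence tests in Example #6):

assert list(tokenizer_re("a\\dx")) == ["a", ".", "\\d", ".", "x"]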
Example #4
def test_tokenizer_re_classes():
    # Explicit concatenation
    assert list(tokenizer_re("a.[0-9].b", cat=None)) == [
        "a", ".", "[0-9]", ".", "b"
    ]
    assert list(tokenizer_re("a.[^0-9].b", cat=None)) == [
        "a", ".", "[^0-9]", ".", "b"
    ]
    assert list(tokenizer_re("a.[(].b", cat=None)) == [
        "a", ".", "[(]", ".", "b"
    ]
    assert list(tokenizer_re("[a-z].[0-9]", cat=None)) == [
        "[a-z]", ".", "[0-9]"
    ]
    # Implicit concatenation
    assert list(tokenizer_re("a[0-9]b")) == ["a", ".", "[0-9]", ".", "b"]
    assert list(tokenizer_re("a[^0-9]b")) == ["a", ".", "[^0-9]", ".", "b"]
    assert list(tokenizer_re("a[(]b")) == ["a", ".", "[(]", ".", "b"]
    assert list(tokenizer_re("[a-z][0-9]")) == ["[a-z]", ".", "[0-9]"]
Example #5
def test_tokenizer_re_char_repetitions():
    # Explicit concatenation
    assert list(tokenizer_re("x{1,3}.y", cat=None)) == ["x", "{1,3}", ".", "y"]
    assert list(tokenizer_re("x{3}.y", cat=None)) == ["x", "{3}", ".", "y"]
    assert list(tokenizer_re("x{3,}.y", cat=None)) == ["x", "{3,}", ".", "y"]
    # Implicit concatenation
    assert list(tokenizer_re("x{1,3}y")) == ["x", "{1,3}", ".", "y"]
    assert list(tokenizer_re("x{3}y")) == ["x", "{3}", ".", "y"]
    assert list(tokenizer_re("x{3,}y")) == ["x", "{3,}", ".", "y"]
Example #6
def test_tokenizer_re_escape_sequence():
    # Explicit concatenation
    assert list(tokenizer_re("a.\\d.b", cat=None)) == ["a", ".", "\\d", ".", "b"]
    assert list(tokenizer_re("a.\\s.b", cat=None)) == ["a", ".", "\\s", ".", "b"]
    assert list(tokenizer_re("a.\\w.b", cat=None)) == ["a", ".", "\\w", ".", "b"]
    # Implicit concatenation
    assert list(tokenizer_re("a\\db")) == ["a", ".", "\\d", ".", "b"]
    assert list(tokenizer_re("a\\sb")) == ["a", ".", "\\s", ".", "b"]
    assert list(tokenizer_re("a\\wb")) == ["a", ".", "\\w", ".", "b"]
Example #7
def thompson_compile_nfa(expression: str, whole_alphabet=None) -> tuple:
    # Returns (nfa, q0, f): the compiled NFA, its initial state, and its
    # final state. Relies on `from collections import deque` and on pybgl's
    # Thompson helpers (Nfa, set_final, concatenation, alternation, ...),
    # which are assumed to be defined in the enclosing module.
    if not expression:
        # The empty regexp compiles to a single state that is both
        # initial and final.
        g = Nfa(1)
        set_final(0, g)
        return (g, 0, 0)
    if whole_alphabet is None:
        whole_alphabet = DEFAULT_ALPHABET
    expression = list(tokenizer_re(expression, cat="."))

    class ThompsonShuntingYardVisitor(DefaultShuntingYardVisitor):
        def __init__(self):
            self.cur_id = 0
            self.nfas = deque()

        def on_push_output(self, a):
            if a in {".", "|"}:
                # Binary operator: pop the two operand NFAs and combine them.
                (nfa2, q02, f2) = self.nfas.pop()
                (nfa1, q01, f1) = self.nfas.pop()
                f = concatenation if a == "." else alternation
                (nfa1, q01, f1) = f(nfa1, q01, f1, nfa2, q02, f2)
            elif a in {"?", "*", "+"}:
                # Unary repetition operator: pop one operand NFA and wrap it.
                (nfa1, q01, f1) = self.nfas.pop()
                f = (
                    zero_or_one if a == "?"
                    else zero_or_more if a == "*"
                    else one_or_more
                )
                (nfa1, q01, f1) = f(nfa1, q01, f1)
            elif a[0] == "{":
                # Bounded repetition: {m}, {m,} or {m,n}.
                (nfa1, q01, f1) = self.nfas.pop()
                (m, n) = parse_repetition(a)
                (nfa1, q01, f1) = repetition_range(nfa1, q01, f1, m, n)
            elif a[0] == "[":
                # Character class, e.g. [0-9] or [^a-z].
                chars = parse_bracket(a, whole_alphabet)
                (nfa1, q01, f1) = bracket(chars)
            elif a[0] == "\\":
                # Escape sequence, e.g. \d, \s, \w.
                chars = parse_escaped(a, whole_alphabet)
                (nfa1, q01, f1) = bracket(chars)
            else:
                # Plain literal character.
                (nfa1, q01, f1) = literal(a)
            self.nfas.append((nfa1, q01, f1))

    vis = ThompsonShuntingYardVisitor()
    shunting_yard_postfix(expression, map_operators=MAP_OPERATORS_RE, vis=vis)
    assert len(vis.nfas) == 1
    (nfa, q0, f) = vis.nfas.pop()
    return (nfa, q0, f)
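A minimal usage sketch for the function above, reusing the pattern from the earlier tests; rendering assumes an Nfa can be drawn with graph_to_html just like the Ast in Example #2:

from pybgl.graphviz import graph_to_html  # as imported in Example #2

# q0 is the initial state and f the final state of the compiled NFA.
(nfa, q0, f) = thompson_compile_nfa("(a?b)*?c+d")
graph_to_html(nfa)  # e.g. to display the automaton in a notebook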
Example #8
def test_tokenizer_re_parenthesis():
    assert list(tokenizer_re("(a.b.c)+", cat=None)) == [
        "(", "a", ".", "b", ".", "c", ")", "+"
    ]
    assert list(tokenizer_re("(abc)+")) == [
        "(", "a", ".", "b", ".", "c", ")", "+"
    ]
Example #9
def test_shunting_yard_ast():
    tokenized = tokenizer_re("(a?b)*?c+d")
    (ast, root) = shunting_yard_ast(tokenized, MAP_OPERATORS_RE)
    assert num_vertices(ast) == 11
    assert num_edges(ast) == 10
    assert root == 10
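As the matching assertions suggest, this is presumably the convenience form of Example #2: shunting_yard_ast wires up the Ast and RpnDequeAst plumbing internally and returns the (ast, root) pair directly.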
Example #10
def test_tokenizer_re_explicit():
    # With cat=None no implicit concatenation is inserted: the "." tokens
    # come from the input itself, and the run "11" stays a single token.
    assert list(tokenizer_re("11.2.(3+.4*.5?)", cat=None)) == [
        "11", ".", "2", ".", "(", "3", "+", ".", "4", "*", ".", "5", "?", ")"
    ]