Exemplo n.º 1
0
 def test_escape_sequences(self):
     # Every backslash escape in the raw pattern must lex to a single SYMBOL
     # token whose symbol is the character that the escape denotes.
     lexer = RegexScanner(r'\0\a\b\t\n\v\f\r\\')
     expected_chars = '\0\a\b\t\n\v\f\r\\'
     for expected in expected_chars:
         tok = lexer.lex()
         self.assertEqual(tok.category, Token.SYMBOL)
         self.assertEqual(tok.symbol, expected)
     # Once all escapes are consumed only EOF may remain.
     trailing = lexer.lex()
     self.assertEqual(trailing.category, Token.EOF)
Exemplo n.º 2
0
 def test_escape_metachars(self):
     # A backslash-escaped operator character must come back as a plain
     # SYMBOL token carrying that character, not as an operator token.
     lexer = RegexScanner(r'\*\+\|\(\)')
     literals = '*+|()'
     for literal in literals:
         tok = lexer.lex()
         self.assertEqual(tok.category, Token.SYMBOL)
         self.assertEqual(tok.symbol, literal)
     # The input is exhausted after the five escaped characters.
     trailing = lexer.lex()
     self.assertEqual(trailing.category, Token.EOF)
Exemplo n.º 3
0
    def test_eof(self):
        # An empty pattern yields EOF straight away, and repeated calls after
        # the end has been reached must keep yielding EOF.
        lexer = RegexScanner('')
        for _ in range(3):
            self.assertEqual(lexer.lex().category, Token.EOF)
Exemplo n.º 4
0
 def test_chars(self):
     # Ordinary characters lex one at a time into SYMBOL tokens, in order.
     pattern = 'abc'
     lexer = RegexScanner(pattern)
     for expected in pattern:
         tok = lexer.lex()
         self.assertEqual(tok.category, Token.SYMBOL)
         self.assertEqual(tok.symbol, expected)
     # After the last character the scanner reports EOF.
     trailing = lexer.lex()
     self.assertEqual(trailing.category, Token.EOF)
Exemplo n.º 5
0
    def test_parens(self):
        # Each parenthesis becomes its own LPAREN/RPAREN token. The second
        # input is unbalanced and is still expected to lex without error.
        cases = (
            ('(())',
             (Token.LPAREN, Token.LPAREN, Token.RPAREN, Token.RPAREN)),
            (')((()',
             (Token.RPAREN, Token.LPAREN, Token.LPAREN, Token.LPAREN,
              Token.RPAREN)),
        )
        for pattern, categories in cases:
            lexer = RegexScanner(pattern)
            for category in categories:
                self.assertEqual(lexer.lex().category, category)
            # Every input must end with EOF.
            self.assertEqual(lexer.lex().category, Token.EOF)
Exemplo n.º 6
0
    def test_invalid_range_in_char_class(self):
        # A descending range ('z-a') must be rejected with ScanningError,
        # both in a plain class and in a negated one.
        for pattern in ('[z-a]', '[^z-a]'):
            lexer = RegexScanner(pattern)
            with self.assertRaises(ScanningError):
                lexer.lex()
Exemplo n.º 7
0
    def test_unmatched_bracket_in_char_class(self):
        # A character class that is never closed with ']' must raise
        # ScanningError, whether or not the class is negated.
        for pattern in ('[a-', '[^a-'):
            lexer = RegexScanner(pattern)
            with self.assertRaises(ScanningError):
                lexer.lex()
Exemplo n.º 8
0
    def test_simple_char_class(self):
        # Plain class that follows an ordinary character: the first token
        # (for the leading '0') is discarded and the class is checked next.
        lexer = RegexScanner('0[abc]')
        lexer.lex()
        plain = lexer.lex()
        self.assertEqual(plain.category, Token.CHARCLASS)
        self.assertEqual(plain.char_class, {'a', 'b', 'c'})

        # Negated class: everything in SIGMA except the listed members.
        lexer = RegexScanner('[^abc]')
        negated = lexer.lex()
        self.assertEqual(negated.category, Token.CHARCLASS)
        self.assertEqual(negated.char_class, set(SIGMA) - {'a', 'b', 'c'})
Exemplo n.º 9
0
def main():
    """Generate C scanner source from regexes read on standard input.

    Command-line options select optional debug outputs: a log of lexed
    tokens, the parsed ASTs, and Graphviz dumps of the NFA, the DFA and the
    minimized DFA. The generated C source is written to ``--c-source``,
    which defaults to standard output.

    The regex scanner is closed on exit, including when a pipeline stage
    raises.
    """
    parser = argparse.ArgumentParser(description='Generate programs for scanning of text.')

    parser.add_argument('-l', '--lex', type=argparse.FileType('w'), metavar='FILE',
                        help='emit a log of lexed tokens to a specified file')
    parser.add_argument('-a', '--ast', type=argparse.FileType('w'), metavar='FILE',
                        help='emit the parsed regex ASTs to a specified file')
    parser.add_argument('-n', '--nfa', type=argparse.FileType('w'), metavar='FILE',
                        help='write the NFA for Graphviz dot rendering')
    parser.add_argument('-d', '--dfa', type=argparse.FileType('w'), metavar='FILE',
                        help='write the DFA for Graphviz dot rendering')
    parser.add_argument('-m', '--min-dfa', type=argparse.FileType('w'), metavar='FILE',
                        help='write the minimized DFA for Graphviz dot rendering')
    parser.add_argument('-c', '--c-source', type=argparse.FileType('w'),
                        metavar='FILE', default=sys.stdout,
                        help='write the C source code for a scanner (defaults to stdout)')

    args = parser.parse_args()

    rescanner = RegexScanner(sys.stdin, args.lex)
    try:
        reparser = RegexParser(rescanner)

        asts = reparser.parse_top_level()
        # The output choice does not change per AST, so test it once.
        if args.ast:
            for ast in asts:
                print(ast, file=args.ast)

        # Pipeline: ASTs -> NFA -> DFA -> minimized DFA, dumping each
        # intermediate automaton only when its option was given.
        nfa = asts_to_nfa(asts)
        if args.nfa:
            nfa.print_graphviz(args.nfa)

        dfa = nfa.to_dfa()
        if args.dfa:
            dfa.print_graphviz(args.dfa)

        min_dfa = dfa.minimized()
        if args.min_dfa:
            min_dfa.print_graphviz(args.min_dfa)

        scangen = TableDrivenScannerGenerator(min_dfa)
        args.c_source.write(scangen.c_source())
    finally:
        # Release the scanner even if parsing or generation failed.
        rescanner.close()
Exemplo n.º 10
0
    def test_backslash_in_char_class(self):
        # Inside a character class the backslash is not an escape: the
        # backslash and the 'n' are two separate members of the class.
        members = {'\\', 'n'}
        cases = (
            (r'[\n]', members),
            (r'[^\n]', set(SIGMA) - members),
        )
        for pattern, expected in cases:
            tok = RegexScanner(pattern).lex()
            self.assertEqual(tok.category, Token.CHARCLASS)
            self.assertEqual(tok.char_class, expected)
Exemplo n.º 11
0
    def test_caret_in_char_class(self):
        # A caret that is not the first character of the class is an
        # ordinary member; only a leading caret negates the class.
        members = {'a', '^'}
        cases = (
            ('[a^]', members),
            ('[^a^]', set(SIGMA) - members),
        )
        for pattern, expected in cases:
            tok = RegexScanner(pattern).lex()
            self.assertEqual(tok.category, Token.CHARCLASS)
            self.assertEqual(tok.char_class, expected)
Exemplo n.º 12
0
    def test_hyphen_and_bracket_in_char_class(self):
        # A ']' right after the opening (or after the negating caret) and a
        # '-' right before the closing bracket are both literal members.
        members = {']', '-'}
        cases = (
            ('[]-]', members),
            ('[^]-]', set(SIGMA) - members),
        )
        for pattern, expected in cases:
            tok = RegexScanner(pattern).lex()
            self.assertEqual(tok.category, Token.CHARCLASS)
            self.assertEqual(tok.char_class, expected)
Exemplo n.º 13
0
 def test_trailing_backslash(self):
     # The pattern is a single backslash with nothing after it to escape;
     # lexing it must raise ScanningError.
     lexer = RegexScanner('\\')
     self.assertRaises(ScanningError, lexer.lex)
Exemplo n.º 14
0
 def test_eol(self):
     # Each newline produces its own EOL token; EOF follows the last one.
     lexer = RegexScanner('\n\n')
     for category in (Token.EOL, Token.EOL, Token.EOF):
         self.assertEqual(lexer.lex().category, category)
Exemplo n.º 15
0
    def test_trailing_hyphen_in_char_class(self):
        # A hyphen directly before the closing bracket does not start a
        # range; it is a literal member, in plain and negated classes alike.
        with_a = {'a', '-'}
        hyphen_only = {'-'}
        cases = (
            ('[a-]', with_a),
            ('[-]', hyphen_only),
            ('[^a-]', set(SIGMA) - with_a),
            ('[^-]', set(SIGMA) - hyphen_only),
        )
        for pattern, expected in cases:
            tok = RegexScanner(pattern).lex()
            self.assertEqual(tok.category, Token.CHARCLASS)
            self.assertEqual(tok.char_class, expected)
Exemplo n.º 16
0
 def test_pipe(self):
     # A bare '|' lexes to a PIPE token, immediately followed by EOF.
     lexer = RegexScanner('|')
     for category in (Token.PIPE, Token.EOF):
         self.assertEqual(lexer.lex().category, category)
Exemplo n.º 17
0
 def test_plus(self):
     # A bare '+' lexes to a PLUS token, immediately followed by EOF.
     lexer = RegexScanner('+')
     for category in (Token.PLUS, Token.EOF):
         self.assertEqual(lexer.lex().category, category)
Exemplo n.º 18
0
    def test_char_class_range(self):
        # Ranges expand to every character between their endpoints, several
        # ranges in one class are unioned, and a leading caret complements
        # the result against SIGMA.
        lowercase = {chr(code) for code in range(ord('a'), ord('z') + 1)}
        digits = {chr(code) for code in range(ord('0'), ord('9') + 1)}
        alphanumeric = lowercase | digits
        cases = (
            ('[a-z]', lowercase),
            ('[^a-z]', set(SIGMA) - lowercase),
            ('[a-z0-9]', alphanumeric),
            ('[^a-z0-9]', set(SIGMA) - alphanumeric),
        )
        for pattern, expected in cases:
            tok = RegexScanner(pattern).lex()
            self.assertEqual(tok.category, Token.CHARCLASS)
            self.assertEqual(tok.char_class, expected)
Exemplo n.º 19
0
    def test_closing_bracket_in_char_class(self):
        # A ']' that comes first in the class (or first after the negating
        # caret) is a literal member rather than the end of the class.
        bracket_only = {']'}
        bracket_and_three = {']', '3'}
        cases = (
            ('[]]', bracket_only),
            ('[]3]', bracket_and_three),
            ('[^]]', set(SIGMA) - bracket_only),
            ('[^]3]', set(SIGMA) - bracket_and_three),
        )
        for pattern, expected in cases:
            tok = RegexScanner(pattern).lex()
            self.assertEqual(tok.category, Token.CHARCLASS)
            self.assertEqual(tok.char_class, expected)
Exemplo n.º 20
0
def main():
    """Generate C scanner source from regexes read on standard input.

    Command-line options select optional debug outputs: a log of lexed
    tokens, the parsed ASTs, and Graphviz dumps of the NFA, the DFA and the
    minimized DFA. The generated C source is written to ``--c-source``,
    which defaults to standard output.

    The regex scanner is closed on exit, including when a pipeline stage
    raises.
    """
    parser = argparse.ArgumentParser(
        description='Generate programs for scanning of text.')

    parser.add_argument('-l',
                        '--lex',
                        type=argparse.FileType('w'),
                        metavar='FILE',
                        help='emit a log of lexed tokens to a specified file')
    parser.add_argument('-a',
                        '--ast',
                        type=argparse.FileType('w'),
                        metavar='FILE',
                        help='emit the parsed regex ASTs to a specified file')
    parser.add_argument('-n',
                        '--nfa',
                        type=argparse.FileType('w'),
                        metavar='FILE',
                        help='write the NFA for Graphviz dot rendering')
    parser.add_argument('-d',
                        '--dfa',
                        type=argparse.FileType('w'),
                        metavar='FILE',
                        help='write the DFA for Graphviz dot rendering')
    parser.add_argument(
        '-m',
        '--min-dfa',
        type=argparse.FileType('w'),
        metavar='FILE',
        help='write the minimized DFA for Graphviz dot rendering')
    parser.add_argument(
        '-c',
        '--c-source',
        type=argparse.FileType('w'),
        metavar='FILE',
        default=sys.stdout,
        help='write the C source code for a scanner (defaults to stdout)')

    args = parser.parse_args()

    rescanner = RegexScanner(sys.stdin, args.lex)
    try:
        reparser = RegexParser(rescanner)

        asts = reparser.parse_top_level()
        # The output choice does not change per AST, so test it once.
        if args.ast:
            for ast in asts:
                print(ast, file=args.ast)

        # Pipeline: ASTs -> NFA -> DFA -> minimized DFA, dumping each
        # intermediate automaton only when its option was given.
        nfa = asts_to_nfa(asts)
        if args.nfa:
            nfa.print_graphviz(args.nfa)

        dfa = nfa.to_dfa()
        if args.dfa:
            dfa.print_graphviz(args.dfa)

        min_dfa = dfa.minimized()
        if args.min_dfa:
            min_dfa.print_graphviz(args.min_dfa)

        scangen = TableDrivenScannerGenerator(min_dfa)
        args.c_source.write(scangen.c_source())
    finally:
        # Release the scanner even if parsing or generation failed.
        rescanner.close()
Exemplo n.º 21
0
 def test_star(self):
     # A bare '*' lexes to a STAR token, immediately followed by EOF.
     lexer = RegexScanner('*')
     for category in (Token.STAR, Token.EOF):
         self.assertEqual(lexer.lex().category, category)