def test_match_7(self):
    '''
    A DFA for [^(*SOL)(*EOL)a]* should stop before an end-of-line marker.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^(*SOL)(*EOL)a]*').dfa()
    outcome = list(matcher.match([str('1'), EOL]))
    # label, matched prefix, remaining stream
    expected = [[str('label')], [str('1')], [EOL]]
    assert outcome == expected, outcome
def __init__(self, alphabet=None, use=True, matcher=NfaRegexp):
    '''
    Create the rewriter; the Unicode alphabet is used when none is given.
    '''
    alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
    description = fmt('CompileRegexp({0}, {1}, {2})', alphabet, use, matcher)
    super(CompileRegexp, self).__init__(Rewriter.COMPILE_REGEXP, description)
    self.alphabet = alphabet
    self.use = use
    self.matcher = matcher
def __init__(self, alphabet=None, use=True, matcher=NfaRegexp):
    '''
    Create the rewriter, defaulting the alphabet to Unicode.
    '''
    if alphabet is None:
        alphabet = UnicodeAlphabet.instance()
    super(CompileRegexp, self).__init__(
        Rewriter.COMPILE_REGEXP,
        format('CompileRegexp({0}, {1}, {2})', alphabet, use, matcher))
    (self.alphabet, self.use, self.matcher) = (alphabet, use, matcher)
def __get_alphabet(self):
    '''
    Get the alphabet used.

    Typically this is Unicode, which is the default.  It is needed for
    the generation of regular expressions.
    '''
    # import locally to avoid a circular dependency at module load time
    from lepl.regexp.unicode import UnicodeAlphabet
    alphabet = self.__alphabet
    if not alphabet:
        # create the Unicode default lazily, on first use
        alphabet = UnicodeAlphabet.instance()
        self.__alphabet = alphabet
    return alphabet
def __init__(self, alphabet=None, discard=None, lexer=None):
    '''
    Create the lexer rewriter; defaults are Unicode and whitespace discard.
    '''
    alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
    # use '' to have no discard at all
    discard = '[ \t\r\n]+' if discard is None else discard
    super(AddLexer, self).__init__(
        Rewriter.LEXER,
        name=fmt('Lexer({0}, {1}, {2})', alphabet, discard, lexer))
    self.alphabet = alphabet
    self.discard = discard
    # any falsy lexer falls back to the default Lexer class
    self.lexer = Lexer if not lexer else lexer
def __init__(self, alphabet=None, discard=None, source=None):
    '''
    Create the lexer rewriter with Unicode / whitespace defaults.
    '''
    if alphabet is None:
        alphabet = UnicodeAlphabet.instance()
    if discard is None:
        # use '' to have no discard at all
        discard = '[ \t\r\n]'
    name = format('Lexer({0}, {1}, {2})', alphabet, discard, source)
    super(AddLexer, self).__init__(Rewriter.LEXER, name)
    self.alphabet = alphabet
    self.discard = discard
    self.source = source
def compile(self, alphabet=None):
    '''
    Convert the regexp if necessary.
    '''
    alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
    # pylint: disable-msg=E0203
    # set in constructor via _kargs
    if self.alphabet is None:
        # adopt the supplied (or default) alphabet only when unset
        self.alphabet = alphabet
    self.regexp = self.__to_regexp(self.regexp, self.alphabet)
    self.compiled = True
def __init__(self, alphabet=None, discard=None, source=None):
    '''
    Build the rewriter, filling in default alphabet and discard pattern.
    '''
    if alphabet is None:
        alphabet = UnicodeAlphabet.instance()
    # use '' to have no discard at all
    if discard is None:
        discard = '[ \t\r\n]'
    description = format('Lexer({0}, {1}, {2})', alphabet, discard, source)
    super(AddLexer, self).__init__(Rewriter.LEXER, description)
    (self.alphabet, self.discard, self.source) = (alphabet, discard, source)
# terms of the LGPL License and not to allow others to use your version
# of this file under the MPL, indicate your decision by deleting the
# provisions above and replace them with the notice and other provisions
# required by the LGPL License. If you do not delete the provisions
# above, a recipient may use your version of this file under either the
# MPL or the LGPL License.

'''
Tests for the lepl.regexp.interval module.
'''

from unittest import TestCase

from lepl.regexp.interval import IntervalMap, TaggedFragments, Character
from lepl.regexp.unicode import UnicodeAlphabet

UNICODE = UnicodeAlphabet.instance()

# pylint: disable-msg=C0103, C0111, C0301, C0324
# (dude this is just a test)


class IntervalMapTest(TestCase):

    def test_single(self):
        '''A single (1, 2) -> 12 entry maps the closed interval, None outside.'''
        m = IntervalMap()
        m[(1, 2)] = 12
        # idiom fix: compare against None with 'is', not '==' (PEP 8 / E711)
        assert m[0] is None, m[0]
        assert m[1] == 12, m[1]
        assert m[1.5] == 12, m[1.5]
        assert m[2] == 12, m[2]
        assert m[3] is None, m[3]
'''

from unittest import TestCase
#from logging import basicConfig, DEBUG

from lepl import RegexpError, DEFAULT_STREAM_FACTORY
from lepl.regexp.core import NfaGraph, NfaToDfa, Compiler
from lepl.regexp.unicode import UnicodeAlphabet
from lepl.stream.simple import StringHelper
from lepl.support.lib import fmt

# pylint: disable-msg=C0103, C0111, C0301, R0201, R0904
# (dude this is just a test)

# shared alphabet instance used by every test in this module
UNICODE = UnicodeAlphabet.instance()


def _test_parser(regexp):
    # Compile a single regexp against the shared Unicode alphabet.
    return Compiler.single(UNICODE, regexp)


def label(text):
    # Wrap text in the '(?P<label>...)' group that str(compiled) produces.
    return fmt('(?P<label>{0!s})', text)


class CharactersTest(TestCase):

    def test_unicode_dot(self):
        # round-trip: the parsed '.' should display as the labelled source
        #basicConfig(level=DEBUG)
        c = _test_parser('.')
        assert label('.') == str(c), str(c)
        c = _test_parser('.\\.')
def __init__(self, regexp, alphabet=None):
    '''
    Store the regexp, defaulting the alphabet to Unicode, and clear the
    matcher cache.
    '''
    if alphabet is None:
        alphabet = UnicodeAlphabet.instance()
    super(NfaRegexp, self).__init__(regexp, alphabet)
    # compiled matcher is built lazily elsewhere
    self.__cached_matcher = None
def line_aware(self, alphabet=None, parser_factory=None,
               discard=None, tabsize=-1,
               block_policy=None, block_start=None):
    '''
    Configure the parser for line aware behaviour.  This clears the
    current setting and sets many different options.

    Although these options are required for "line aware" parsing,
    you normally do not need to call this because it is called by
    `default_line_aware` .

    `alphabet` is the alphabet used; by default it is assumed to be
    Unicode and it will be extended to include start and end of line
    markers.

    `parser_factory` is used to generate a regexp parser.  If this is
    unset then the parser used depends on whether blocks are being used.
    If so, then the HideSolEolParser is used (so that you can specify
    tokens without worrying about SOL and EOL); otherwise a normal
    parser is used.

    `discard` is a regular expression which is matched against the
    stream if lexing otherwise fails.  A successful match is discarded.
    If None then the usual token default is used (whitespace).  To
    disable, use an empty string.

    `tabsize`, if not None, should be the number of spaces used to
    replace tabs.

    `block_policy` should be the number of spaces in an indent, if
    blocks are used (or an appropriate function).  By default (ie if
    `block_start` is given) it is taken to be DEFAULT_POLICY.

    `block_start` is the initial indentation, if blocks are used.  By
    default (ie if `block_policy` is given) 0 is used.

    To enable blocks ("offside rule" parsing), at least one of
    `block_policy` and `block_start` must be given.
    '''
    # imports kept local: the offside modules are only needed here
    from lepl.offside.matchers import DEFAULT_TABSIZE
    from lepl.offside.regexp import LineAwareAlphabet, \
        make_hide_sol_eol_parser
    from lepl.offside.stream import LineAwareStreamFactory, \
        LineAwareTokenSource
    from lepl.regexp.str import make_str_parser
    from lepl.regexp.unicode import UnicodeAlphabet
    # discard any existing configuration before layering these options
    self.clear()
    # blocks are enabled when either block option is supplied
    use_blocks = block_policy is not None or block_start is not None
    if use_blocks:
        self.blocks(block_policy, block_start)
    # a negative tabsize (the default, -1) selects DEFAULT_TABSIZE;
    # None and 0 are passed through unchanged
    if tabsize and tabsize < 0:
        tabsize = DEFAULT_TABSIZE
    if alphabet is None:
        alphabet = UnicodeAlphabet.instance()
    # choose a parser factory appropriate to block usage unless given
    if not parser_factory:
        if use_blocks:
            parser_factory = make_hide_sol_eol_parser
        else:
            parser_factory = make_str_parser
    # wrap the alphabet so it understands SOL/EOL markers
    self.alphabet(LineAwareAlphabet(alphabet, parser_factory))
    self.set_alphabet_arg()
    if use_blocks:
        self.set_block_policy_arg(block_policy)
    self.lexer(alphabet=self.__get_alphabet(), discard=discard,
               source=LineAwareTokenSource.factory(tabsize))
    self.stream_factory(LineAwareStreamFactory(self.__get_alphabet()))
    # fluent interface: return self for chaining
    return self
def test_match_3(self):
    '''
    A greedy DFA match of [^a]* over a string should stop at 'a'.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^a]*').dfa()
    outcome = list(matcher.match(str('123a')))
    # label, matched prefix, remaining stream
    assert outcome == [[str('label')], str('123'), str('a')], outcome
def test_match_2(self):
    '''
    An NFA match of a single [^a] should consume exactly one character.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^a]').nfa()
    outcome = list(matcher.match(str('123a')))
    # one result tuple: label, match, remainder
    assert outcome == [(str('label'), str('1'), str('23a'))], outcome
def test_match_5(self):
    '''
    A DFA for [^a]* should consume a leading SOL marker along with text.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^a]*').dfa()
    outcome = list(matcher.match([SOL, str('1'), str('a')]))
    # label, matched prefix (including SOL), remaining stream
    assert outcome == [[str('label')], [SOL, str('1')], [str('a')]], outcome