Exemplo n.º 1
0
 def test_match_7(self):
     '''A negated class excluding SOL/EOL markers stops at the EOL token.'''
     aware = LineAwareAlphabet(UnicodeAlphabet.instance(), make_str_parser)
     dfa = Compiler.single(aware, '[^(*SOL)(*EOL)a]*').dfa()
     outcome = list(dfa.match([str('1'), EOL]))
     expected = [[str('label')], [str('1')], [EOL]]
     assert outcome == expected, outcome
Exemplo n.º 2
0
 def test_match_7(self):
     '''Match [^(*SOL)(*EOL)a]* against ['1', EOL]: '1' matches, EOL remains.'''
     line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                       make_str_parser)
     matcher = Compiler.single(line_alphabet, '[^(*SOL)(*EOL)a]*').dfa()
     seen = list(matcher.match([str('1'), EOL]))
     assert seen == [[str('label')], [str('1')], [EOL]], seen
Exemplo n.º 3
0
 def __init__(self, alphabet=None, use=True, matcher=NfaRegexp):
     '''
     Configure the regexp-compilation rewriter.  With no alphabet given,
     the Unicode alphabet singleton is used.
     '''
     alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
     super(CompileRegexp, self).__init__(
         Rewriter.COMPILE_REGEXP,
         fmt('CompileRegexp({0}, {1}, {2})', alphabet, use, matcher))
     self.alphabet = alphabet
     self.use = use
     self.matcher = matcher
Exemplo n.º 4
0
 def __init__(self, alphabet=None, use=True, matcher=NfaRegexp):
     # Default to the Unicode alphabet singleton when none is supplied.
     if alphabet is None:
         alphabet = UnicodeAlphabet.instance()
     # NOTE(review): `format` here is presumably a lepl string-formatting
     # helper (later renamed `fmt`) imported elsewhere in the file; the
     # builtin format() does not accept this call signature — verify the
     # module's imports.
     super(CompileRegexp, self).__init__(Rewriter.COMPILE_REGEXP,
         format('CompileRegexp({0}, {1}, {2})', alphabet, use, matcher))
     self.alphabet = alphabet
     self.use = use
     self.matcher = matcher
Exemplo n.º 5
0
 def __get_alphabet(self):
     '''
     Return the alphabet in use, lazily creating the default (Unicode)
     on first access.  It is needed to generate regular expressions.
     '''
     from lepl.regexp.unicode import UnicodeAlphabet
     self.__alphabet = self.__alphabet or UnicodeAlphabet.instance()
     return self.__alphabet
Exemplo n.º 6
0
 def __init__(self, alphabet=None, discard=None, lexer=None):
     '''
     Configure the lexer rewriter.  Defaults: the Unicode alphabet and a
     whitespace discard expression; pass '' to disable discarding.
     '''
     alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
     # '' (rather than None) means "no discard at all"
     discard = '[ \t\r\n]+' if discard is None else discard
     super(AddLexer, self).__init__(
         Rewriter.LEXER,
         name=fmt('Lexer({0}, {1}, {2})', alphabet, discard, lexer))
     self.alphabet = alphabet
     self.discard = discard
     self.lexer = Lexer if not lexer else lexer
Exemplo n.º 7
0
 def __init__(self, alphabet=None, discard=None, source=None):
     '''
     Configure the lexer rewriter.  Defaults: the Unicode alphabet and a
     whitespace discard expression; pass '' to disable discarding.
     '''
     if alphabet is None:
         alphabet = UnicodeAlphabet.instance()
     if discard is None:
         # '' (rather than None) means "no discard at all"
         discard = '[ \t\r\n]'
     label = format('Lexer({0}, {1}, {2})', alphabet, discard, source)
     super(AddLexer, self).__init__(Rewriter.LEXER, label)
     self.alphabet = alphabet
     self.discard = discard
     self.source = source
Exemplo n.º 8
0
 def __get_alphabet(self):
     '''
     Get the alphabet used.
     
     Typically this is Unicode, which is the default.  It is needed for
     the generation of regular expressions. 
     '''
     # Local import — presumably avoids a circular module dependency;
     # TODO confirm against the package layout.
     from lepl.regexp.unicode import UnicodeAlphabet
     # Lazily create the default on first access (any falsy value triggers it).
     if not self.__alphabet:
         self.__alphabet = UnicodeAlphabet.instance()
     return self.__alphabet
Exemplo n.º 9
0
 def compile(self, alphabet=None):
     '''
     Convert the stored regexp if necessary, using the instance alphabet
     when one was set in the constructor, else the given (or Unicode
     default) alphabet.
     '''
     alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
     # pylint: disable-msg=E0203
     # set in constructor via _kargs
     if self.alphabet is None:
         self.alphabet = alphabet
     self.regexp = self.__to_regexp(self.regexp, self.alphabet)
     self.compiled = True
Exemplo n.º 10
0
 def __init__(self, alphabet=None, discard=None, source=None):
     # Default to the Unicode alphabet singleton.
     if alphabet is None:
         alphabet = UnicodeAlphabet.instance()
     # use '' to have no discard at all
     if discard is None:
         discard = '[ \t\r\n]'
     # NOTE(review): `format` is presumably a lepl formatting helper (later
     # renamed `fmt`); the builtin format() would reject this call — verify
     # the module's imports.
     super(AddLexer, self).__init__(
         Rewriter.LEXER,
         format('Lexer({0}, {1}, {2})', alphabet, discard, source))
     self.alphabet = alphabet
     self.discard = discard
     self.source = source
Exemplo n.º 11
0
# terms of the LGPL License and not to allow others to use your version
# of this file under the MPL, indicate your decision by deleting the
# provisions above and replace them with the notice and other provisions
# required by the LGPL License.  If you do not delete the provisions
# above, a recipient may use your version of this file under either the
# MPL or the LGPL License.
'''
Tests for the lepl.regexp.interval module.
'''

from unittest import TestCase

from lepl.regexp.interval import IntervalMap, TaggedFragments, Character
from lepl.regexp.unicode import UnicodeAlphabet

UNICODE = UnicodeAlphabet.instance()

# pylint: disable-msg=C0103, C0111, C0301, C0324
# (dude this is just a test)


class IntervalMapTest(TestCase):

    def test_single(self):
        '''A single stored interval answers lookups inside it; None outside.'''
        imap = IntervalMap()
        imap[(1, 2)] = 12
        for probe, expected in ((0, None), (1, 12), (1.5, 12),
                                (2, 12), (3, None)):
            assert imap[probe] == expected, imap[probe]
Exemplo n.º 12
0
'''

from unittest import TestCase

#from logging import basicConfig, DEBUG
from lepl import RegexpError, DEFAULT_STREAM_FACTORY
from lepl.regexp.core import NfaGraph, NfaToDfa, Compiler
from lepl.regexp.unicode import UnicodeAlphabet
from lepl.stream.simple import StringHelper
from lepl.support.lib import fmt

# pylint: disable-msg=C0103, C0111, C0301, R0201, R0904
# (dude this is just a test)


UNICODE = UnicodeAlphabet.instance()


def _test_parser(regexp):
    '''Compile *regexp* against the module-level Unicode alphabet.'''
    return Compiler.single(UNICODE, regexp)

def label(text):
    '''Wrap *text* in the named-group syntax the compiler emits for labels.'''
    return fmt('(?P<label>{0!s})', text)
    
class CharactersTest(TestCase):
    
    def test_unicode_dot(self):
        #basicConfig(level=DEBUG)
        # '.' should round-trip through the parser as a labelled expression.
        c = _test_parser('.')
        assert label('.') == str(c), str(c)
        # NOTE(review): this example appears truncated here in the scraped
        # source — the assertions for the following parse are missing.
        c = _test_parser('.\\.')
Exemplo n.º 13
0
 def __init__(self, regexp, alphabet=None):
     '''
     Store the regexp and alphabet (Unicode by default); the compiled
     matcher is built lazily and cached.
     '''
     if alphabet is None:
         alphabet = UnicodeAlphabet.instance()
     super(NfaRegexp, self).__init__(regexp, alphabet)
     self.__cached_matcher = None
Exemplo n.º 14
0
 def __init__(self, regexp, alphabet=None):
     # Fall back to the Unicode alphabet singleton when none is given.
     alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
     super(NfaRegexp, self).__init__(regexp, alphabet)
     # Compiled matcher is created lazily on first use.
     self.__cached_matcher = None
Exemplo n.º 15
0
    def line_aware(self, alphabet=None, parser_factory=None,
                   discard=None, tabsize=-1, 
                   block_policy=None, block_start=None):
        '''
        Configure the parser for line aware behaviour.  This clears the
        current setting and sets many different options.
        
        Although these options are required for "line aware" parsing,
        you normally do not need to call this because it is called by 
        `default_line_aware` .
        
        `alphabet` is the alphabet used; by default it is assumed to be Unicode
        and it will be extended to include start and end of line markers.
        
        `parser_factory` is used to generate a regexp parser.  If this is unset
        then the parser used depends on whether blocks are being used.  If so,
        then the HideSolEolParser is used (so that you can specify tokens 
        without worrying about SOL and EOL); otherwise a normal parser is
        used.
        
        `discard` is a regular expression which is matched against the stream
        if lexing otherwise fails.  A successful match is discarded.  If None
        then the usual token default is used (whitespace).  To disable, use
        an empty string.
        
        `tabsize`, if not None, should be the number of spaces used to replace
        tabs.
        
        `block_policy` should be the number of spaces in an indent, if blocks 
        are used (or an appropriate function).  By default (ie if `block_start`
        is given) it is taken to be DEFAULT_POLICY.
        
        `block_start` is the initial indentation, if blocks are used.  By 
        default (ie if `block_policy` is given) 0 is used.
        
        To enable blocks ("offside rule" parsing), at least one of 
        `block_policy` and `block_start` must be given.
        '''
        from lepl.offside.matchers import DEFAULT_TABSIZE
        from lepl.offside.regexp import LineAwareAlphabet, \
            make_hide_sol_eol_parser
        from lepl.offside.stream import LineAwareStreamFactory, \
            LineAwareTokenSource
        from lepl.regexp.str import make_str_parser
        from lepl.regexp.unicode import UnicodeAlphabet
        
        # Start from a clean configuration (discards existing settings).
        self.clear()
        
        # Blocks ("offside rule") are enabled when either option is supplied.
        use_blocks = block_policy is not None or block_start is not None
        if use_blocks:
            self.blocks(block_policy, block_start)
            
        # A negative tabsize (the default, -1) selects the library default;
        # None or 0 leaves tabs untouched.
        if tabsize and tabsize < 0:
            tabsize = DEFAULT_TABSIZE
        if alphabet is None:
            alphabet = UnicodeAlphabet.instance()
        # With blocks, hide SOL/EOL from user regexps; otherwise use the
        # plain string parser.
        if not parser_factory:
            if use_blocks:
                parser_factory = make_hide_sol_eol_parser
            else:
                parser_factory = make_str_parser
        self.alphabet(LineAwareAlphabet(alphabet, parser_factory))

        self.set_alphabet_arg()
        if use_blocks:
            self.set_block_policy_arg(block_policy)
        # Wire the lexer and stream factory to the line-aware alphabet.
        self.lexer(alphabet=self.__get_alphabet(), discard=discard, 
                   source=LineAwareTokenSource.factory(tabsize))
        self.stream_factory(LineAwareStreamFactory(self.__get_alphabet()))
        
        return self
Exemplo n.º 16
0
 def test_match_3(self):
     '''[^a]* consumes '123' from '123a', leaving 'a'.'''
     aware = LineAwareAlphabet(UnicodeAlphabet.instance(), make_str_parser)
     dfa = Compiler.single(aware, '[^a]*').dfa()
     outcome = list(dfa.match(str('123a')))
     expected = [[str('label')], str('123'), str('a')]
     assert outcome == expected, outcome
Exemplo n.º 17
0
 def test_match_2(self):
     '''NFA match of [^a] consumes a single character from '123a'.'''
     aware = LineAwareAlphabet(UnicodeAlphabet.instance(), make_str_parser)
     nfa = Compiler.single(aware, '[^a]').nfa()
     outcome = list(nfa.match(str('123a')))
     assert outcome == [(str('label'), str('1'), str('23a'))], outcome
Exemplo n.º 18
0
 def test_match_5(self):
     '''[^a]* consumes the SOL marker and '1', stopping at 'a'.'''
     line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                       make_str_parser)
     matcher = Compiler.single(line_alphabet, '[^a]*').dfa()
     seen = list(matcher.match([SOL, str('1'), str('a')]))
     expected = [[str('label')], [SOL, str('1')], [str('a')]]
     assert seen == expected, seen
Exemplo n.º 19
0
    def line_aware(self,
                   alphabet=None,
                   parser_factory=None,
                   discard=None,
                   tabsize=-1,
                   block_policy=None,
                   block_start=None):
        '''
        Configure the parser for line aware behaviour.  This clears the
        current setting and sets many different options.
        
        Although these options are required for "line aware" parsing,
        you normally do not need to call this because it is called by 
        `default_line_aware` .
        
        `alphabet` is the alphabet used; by default it is assumed to be Unicode
        and it will be extended to include start and end of line markers.
        
        `parser_factory` is used to generate a regexp parser.  If this is unset
        then the parser used depends on whether blocks are being used.  If so,
        then the HideSolEolParser is used (so that you can specify tokens 
        without worrying about SOL and EOL); otherwise a normal parser is
        used.
        
        `discard` is a regular expression which is matched against the stream
        if lexing otherwise fails.  A successful match is discarded.  If None
        then the usual token default is used (whitespace).  To disable, use
        an empty string.
        
        `tabsize`, if not None, should be the number of spaces used to replace
        tabs.
        
        `block_policy` should be the number of spaces in an indent, if blocks 
        are used (or an appropriate function).  By default (ie if `block_start`
        is given) it is taken to be DEFAULT_POLICY.
        
        `block_start` is the initial indentation, if blocks are used.  By 
        default (ie if `block_policy` is given) 0 is used.
        
        To enable blocks ("offside rule" parsing), at least one of 
        `block_policy` and `block_start` must be given.
        '''
        from lepl.offside.matchers import DEFAULT_TABSIZE
        from lepl.offside.regexp import LineAwareAlphabet, \
            make_hide_sol_eol_parser
        from lepl.offside.stream import LineAwareStreamFactory, \
            LineAwareTokenSource
        from lepl.regexp.str import make_str_parser
        from lepl.regexp.unicode import UnicodeAlphabet

        # Reset any existing configuration before applying these options.
        self.clear()

        # Either block option switches on "offside rule" handling.
        use_blocks = block_policy is not None or block_start is not None
        if use_blocks:
            self.blocks(block_policy, block_start)

        # tabsize < 0 (the default) means "use the library default";
        # None or 0 skips tab replacement.
        if tabsize and tabsize < 0:
            tabsize = DEFAULT_TABSIZE
        if alphabet is None:
            alphabet = UnicodeAlphabet.instance()
        # Block mode hides SOL/EOL markers from user-written regexps.
        if not parser_factory:
            if use_blocks:
                parser_factory = make_hide_sol_eol_parser
            else:
                parser_factory = make_str_parser
        self.alphabet(LineAwareAlphabet(alphabet, parser_factory))

        self.set_alphabet_arg()
        if use_blocks:
            self.set_block_policy_arg(block_policy)
        # Connect lexer and stream factory to the line-aware alphabet.
        self.lexer(alphabet=self.__get_alphabet(),
                   discard=discard,
                   source=LineAwareTokenSource.factory(tabsize))
        self.stream_factory(LineAwareStreamFactory(self.__get_alphabet()))

        return self