Пример #1
0
 def clone_and(use, original, *matchers):
     '''
     We can convert an And only if all the sub-matchers have possible
     regular expressions, and even then we must tag the result unless
     an add transform is present.
     '''
     # The transform functions wrapped around the original And matcher.
     wrapper = original.wrapper.functions
     # Unless an `add` transform is present the result must be tagged so
     # a later stage knows the characters still need joining.
     add_reqd = True
     if wrapper:
         if wrapper[0] is add:
             # The add transform is subsumed by the regexp itself; drop it.
             wrapper = wrapper[1:]
             add_reqd = False
         else:
             # Any other leading transform cannot be expressed as a regexp.
             raise Unsuitable
     try:
         # combine all
         (use, regexps) = \
             RegexpContainer.to_regexps(use, matchers, have_add=None)
         # if we have regexp sub-expressions, join them
         regexp = Sequence(alphabet_, *regexps)
         log.debug(format('And: cloning {0}', regexp))
         return RegexpContainer.build(original, regexp, alphabet_, 
                                      regexp_type, use, add_reqd=add_reqd,
                                      wrapper=wrapper)
     except Unsuitable:
         # combine contiguous matchers where possible
         if add_reqd:
             raise
         def unpack(matcher):
             # Return (original matcher, regexp) or (original, None) when
             # this particular sub-matcher cannot be converted.
             original = RegexpContainer.to_matcher(matcher)
             try:
                 return (original, 
                         RegexpContainer.to_regexps(use, [matcher], 
                                                    have_add=None)[1][0])
             except Unsuitable:
                 return (original, None)
         output = []
         (regexps, originals) = ([], [])
         for (matcher, regexp) in [unpack(matcher) for matcher in matchers]:
             if regexp:
                 # Convertible: extend the current contiguous run.
                 regexps.append(regexp)
                 originals.append(matcher)
             else:
                 # Not convertible: flush the accumulated run first.
                 if len(regexps) > 1:
                     # combine regexps
                     output.append(
                         regexp_type(Sequence(alphabet_, *regexps), 
                                      alphabet_))
                 else:
                     # A run of 0 or 1 is not worth combining; keep originals.
                     output.extend(originals)
                 output.append(matcher)
                 (regexps, originals) = ([], [])
         # Flush any trailing run of convertible matchers.
         if len(regexps) > 1:
             output.append(
                 regexp_type(Sequence(alphabet_, *regexps), alphabet_))
         else:
             output.extend(originals)
         merged = And(*output)
         # Re-apply the original wrapper transforms to the merged matcher.
         return merged.compose(original.wrapper)
Пример #2
0
 def and_(a, b):
     '''
     Add space only in the case when both consume something.
     '''
     # Four alternatives, tried in order: the separator is only inserted
     # when both sides actually consume input.
     both_consume = And(Consumer(a), separator, Consumer(b))
     only_left = And(Consumer(a), Consumer(b, False))
     only_right = And(Consumer(a, False), Consumer(b))
     neither = And(Consumer(a, False), Consumer(b, False))
     return Or(both_consume, only_left, only_right, neither)
Пример #3
0
def SkipTo(matcher, include=True):
    '''
    Consume everything up to (and including, if include is True, as it is by
    default) the matcher.  Returns all the skipped data, joined.
    '''
    skipped = Star(AnyBut(matcher))
    if not include:
        # Join only the skipped prefix; assert (without consuming results
        # into the join) that the matcher follows.
        return And(Add(skipped), Lookahead(matcher))
    # Join the skipped data together with the matcher's own results.
    return Add(And(skipped, matcher))
Пример #4
0
def SingleLineString(quote='"', escape='\\', exclude='\n'):
    '''
    Like `String`,  but will not match across multiple lines.
    '''
    quote_ = Literal(quote)
    # Content is any character that is neither a quote nor excluded.
    body = AnyBut(Or(quote_, Any(exclude)))
    if escape:
        # An escaped quote also counts as content (the escape is dropped).
        body = Or(body, And(Drop(escape), quote_))
    joined = Repeat(body, add_=True)
    return And(Drop(quote_), joined, Drop(quote_))
Пример #5
0
def String(quote='"', escape='\\'):
    '''
    Match a string with quotes that can be escaped.  This will match across
    newlines (see `SingleLineString` for an alternative).
    '''
    quote_ = Literal(quote)
    body = AnyBut(quote_)
    if escape:
        # The escaped-quote alternative is listed first so it is tried first.
        body = Or(And(Drop(escape), quote_), body)
    joined = Repeat(body, add_=True)
    return And(Drop(quote_), joined, Drop(quote_))
Пример #6
0
def SkipString(quote='"', escape='\\', ignore='\n', empty='', join=__add__):
    '''
    Like `String`, matching across multiple lines, but will silently 
    drop newlines.
    '''
    quote_ = Literal(quote)
    # Content excludes both quotes and the ignored characters.
    body = AnyBut(Or(quote_, Any(ignore)))
    if escape:
        body = Or(body, And(Drop(escape), quote_))
    # Ignored characters are consumed but contribute no results.
    body = Or(body, Drop(Any(ignore)))
    accumulated = Repeat(body, reduce=(empty, join))
    return And(Drop(quote_), accumulated, Drop(quote_))
Пример #7
0
def SingleLineString(quote='"', escape='\\', exclude='\n', empty='',
                     join=__add__):
    '''
    Like `String`,  but will not match across multiple lines.
    '''
    quote_ = Literal(quote)
    # Content is any character that is neither a quote nor excluded.
    body = AnyBut(Or(quote_, Any(exclude)))
    if escape:
        # An escaped quote also counts as content (the escape is dropped).
        body = Or(body, And(Drop(escape), quote_))
    accumulated = Repeat(body, reduce=(empty, join))
    return And(Drop(quote_), accumulated, Drop(quote_))
Пример #8
0
def String(quote='"', escape='\\', empty='', join=__add__):
    '''
    Match a string with quotes that can be escaped.  This will match across
    newlines (see `SingleLineString` for an alternative).

    More generally, a string is a grouping of results.  Setting `empty` and
    `join` correctly will allow this matcher to work with a variety of types.
    '''
    quote_ = Literal(quote)
    body = AnyBut(quote_)
    if escape:
        # The escaped-quote alternative is listed first so it is tried first.
        body = Or(And(Drop(escape), quote_), body)
    accumulated = Repeat(body, reduce=(empty, join))
    return And(Drop(quote_), accumulated, Drop(quote_))
Пример #9
0
def UnsignedEFloat(decimal='.', exponent='eE'):
    '''
    Match an `UnsignedFloat` followed by an optional exponent 
    (e+02 etc).
    '''
    # The exponent part ("e+02" etc) may be entirely absent.
    exponent_part = Optional(And(Any(exponent), SignedInteger()))
    return Join(UnsignedFloat(decimal), exponent_part)
Пример #10
0
def UnsignedEReal(decimal='.', exponent='eE'):
    '''
    Match an `UnsignedReal` followed by an optional exponent 
    (e+02 etc).  This will match both integer and float values.
    '''
    # The exponent part ("e+02" etc) may be entirely absent.
    exponent_part = Optional(And(Any(exponent), SignedInteger()))
    return Join(UnsignedReal(decimal), exponent_part)
Пример #11
0
    def _match(self, stream_in):
        '''
        Pull indent and call the policy and update the global value, 
        then evaluate the contents.
        '''
        # detect a nested call
        key = s_key(stream_in)
        if key in self.__streams:
            # Already matching this block at this stream position; return
            # without yielding to break infinite left recursion.
            self._debug('Avoided left recursive call to Block.')
            return
        self.__streams.add(key)
        try:
            ((tokens, token_stream), _) = s_next(stream_in)
            (indent, _) = s_line(token_stream, True)
            if START not in tokens:
                # No line-start token here, so a block cannot begin.
                raise StopIteration
            current = self.__monitor.indent
            policy = self.policy(current, indent)

            generator = And(*self.lines)._match(stream_in)
            while True:
                # The indent policy is active only while the contents are
                # being matched; pop even if the sub-match raises.
                self.__monitor.push_level(policy)
                try:
                    results = yield generator
                finally:
                    self.__monitor.pop_level()
                yield results
        finally:
            # Always unregister so the block can be matched again later.
            self.__streams.remove(key)
Пример #12
0
 def _replacements(self, separator):
     '''
     Require the separator on each `And`.
     '''
     # Handle circular dependencies
     from lepl.matchers.combine import And
     def and_(a, b):
         # Insert the (required) separator between the two matchers.
         return And(a, separator, b)
     return (and_, self._repeat(separator))
Пример #13
0
 def __init__(self):
     '''
     Populate the namespace with the default operator definitions.
     '''
     # Map each operator constant to the matcher (or factory) it invokes.
     operators = {
         ADD: lambda a, b: Add(And(a, b)),
         AND: And,
         OR: Or,
         APPLY: Apply,
         APPLY_RAW: lambda a, b: Apply(a, b, raw=True),
         NOT: Drop,
         KARGS: KApply,
         RAISE: lambda a, b: KApply(a, raise_error(b)),
         REPEAT: RepeatWrapper,
         FIRST: First,
         MAP: Map,
         REDUCE: None,
     }
     super(TokenNamespace, self).__init__(operators)
Пример #14
0
def AnyBut(exclude=None):
    '''
    Match any character except those specified (or, if a matcher is used as
    the exclude, if the matcher fails).
    
    The argument should be a list of tokens (or a string of suitable 
    characters) to exclude, or a matcher.  If omitted all tokens are accepted.
    '''
    # Negative lookahead: succeed only where the excluded matcher fails,
    # then consume a single token.
    not_excluded = ~Lookahead(coerce_(exclude, Any))
    return And(not_excluded, Any())
Пример #15
0
 def repeat(m, st=0, sp=None, d=0, s=None, a=False, r=None):
     '''
     Wrap `Repeat` to adapt the separator.
     '''
     if s is None:
         # No explicit separator; use the ambient one.
         sep = separator
     elif a:
         # Explicit separator used as-is.
         sep = s
     else:
         # Surround the explicit separator with the ambient separator.
         sep = And(separator, s, separator)
     return RepeatWrapper(m, st, sp, d, sep, a, r)
Пример #16
0
        def and_(matcher_a, matcher_b):
            '''
            Combine two matchers.
            '''
            # Split each matcher into its required part plus a flag saying
            # whether the original was optional.
            (requireda, optionala) = non_optional_copy(matcher_a)
            (requiredb, optionalb) = non_optional_copy(matcher_b)

            if not (optionala or optionalb):
                # Neither side optional: simply insert the separator.
                return And(matcher_a, separator, matcher_b)
            else:
                # At least one side is optional: the separator must only be
                # consumed when the optional side actually matches, so build
                # the applicable alternatives and discard the None entries.
                matcher = Or(*filter((lambda x: x is not None), [
                    And(Optional(And(requireda, separator)), requiredb
                        ) if optionala else None,
                    And(requireda, Optional(And(separator, requiredb))
                        ) if optionalb else None
                ]))
                if optionala and optionalb:
                    # making this explicit allows chaining (we can detect it
                    # when called again in a tree of "ands")
                    matcher = Optional(matcher)
                return matcher
Пример #17
0
 def __init__(self):
     '''
     Populate the namespace with the default operator definitions.
     '''
     # Handle circular dependencies
     from lepl.matchers.error import raise_error
     from lepl.matchers.derived import Space, Add, Apply, KApply, Drop, Map
     from lepl.matchers.combine import And, Or, First
     # Map each operator constant to the matcher (or factory) it invokes.
     operators = {
         SPACE_OPT: lambda a, b: And(a, Space()[0:, ...], b),
         SPACE_REQ: lambda a, b: And(a, Space()[1:, ...], b),
         ADD: lambda a, b: Add(And(a, b)),
         AND: And,
         OR: Or,
         APPLY: Apply,
         APPLY_RAW: lambda a, b: Apply(a, b, raw=True),
         NOT: Drop,
         KARGS: KApply,
         RAISE: lambda a, b: KApply(a, raise_error(b)),
         REPEAT: RepeatWrapper,
         FIRST: First,
         MAP: Map,
         REDUCE: None,
     }
     super(OperatorNamespace, self).__init__(operators)
Пример #18
0
def Word(chars=NfaRegexp('[^%s]' % whitespace), body=None):
    '''
    Match a sequence of non-space characters, joining them together. 
     
    chars and body, if given as strings, define possible characters to use
    for the first and rest of the characters in the word, respectively.
    If body is not given, then chars is used for the entire word.
    They can also specify matchers, which typically should match only a
    single character.
    
    So ``Word(Upper(), Lower())`` would match names that being with an upper
    case letter, for example, while ``Word(AnyBut(Space()))`` (the default)
    matches any sequence of non-space characters. 
    '''
    first = coerce_(chars, Any)
    # When no separate body is given, reuse the first-character matcher.
    rest = first if body is None else coerce_(body, Any)
    return Add(And(first, Star(rest)))
Пример #19
0
def Repeat(matcher, start=0, stop=None, algorithm=DEPTH_FIRST, 
            separator=None, add_=False):
    '''
    This is called by the [] operator.  It repeats the given matcher between
    start and stop number of times (inclusive).  If ``add`` is true then the
    results are joined with `Add`. If ``separator`` is given then each
    repetition is separated by that matcher.
    '''
    first = coerce_(matcher)
    # Each repetition after the first is preceded by the separator (if any).
    if separator is None:
        rest = first
    else:
        rest = And(coerce_(separator, Regexp), first)
    start = 0 if start is None else start
    # Validate the arguments before building anything.
    assert_type('The start index for Repeat or [...]', start, int)
    assert_type('The stop index for Repeat or [...]', stop, int, none_ok=True)
    assert_type('The algorithm/increment for Repeat or [...]', algorithm, str)
    if start < 0:
        raise ValueError('Repeat or [...] cannot have a negative start.')
    if stop is not None and stop < start:
        raise ValueError('Repeat or [...] must have a stop '
                         'value greater than or equal to the start.')
    if algorithm not in 'dbgn':
        raise ValueError('Repeat or [...] must have a step (algorithm) '
                         'of d, b, g or n.')
    # Optionally join the repeated results into a single result.
    wrap = Add if add_ else Identity
    options = {
        DEPTH_FIRST:
            wrap(DepthFirst(first=first, start=start,
                            stop=stop, rest=rest)),
        BREADTH_FIRST:
            wrap(BreadthFirst(first=first, start=start,
                              stop=stop, rest=rest)),
        GREEDY:
            wrap(OrderByResultCount(BreadthFirst(first=first, start=start,
                                                 stop=stop, rest=rest))),
        NON_GREEDY:
            wrap(OrderByResultCount(BreadthFirst(first=first, start=start,
                                                 stop=stop, rest=rest),
                                    False))}
    return options[algorithm]
Пример #20
0
    def __init__(self, clean_html=True):
        # Whether HTML should be cleaned before processing — TODO confirm
        # where this flag is consumed (not visible in this block).
        self.clean_html = clean_html

        # Punctuation and left/right context exception sets, presumably used
        # elsewhere when deciding whether a numeric match is valid — verify
        # against the methods that read them.
        self._punctuation = '!"#&\'()*+,.;<=>?@[\\]^_`{|}~'
        self._lctx_1_exceptions = set('/ :'.split())
        self._lctx_2_exceptions = set('discount redeem voucher'.split())
        self._rctx_1_exceptions = set('/ : th am pm hour hours %'.split())
        self._rctx_2_exceptions = set('discount redeem voucher'.split())

        # LEPL Real Number Matchers (w/thousands)
        # Zero or more ",ddd" groups with the commas dropped and the three
        # digits joined.
        _comma_three_digits = Join(Drop(','), Add(Digit()[3]))[:]
        # Thousands groups optionally followed by a decimal part.
        _thousand_group = Or(
            Join(_comma_three_digits, Any('.'), UnsignedInteger()),
            Join(_comma_three_digits, Optional(Any('.'))))
        # A real number, with or without thousands separators, converted
        # to float via the >> transform.
        _real = Or(Join(UnsignedInteger(), _thousand_group),
                   UnsignedReal()) >> float
        # Everything up to the next real number, joined into one result.
        _any = Join(Star(AnyBut(_real)))
        self._real_partition_matcher = Star(And(_any, _real, _any))
        # Reals separated by (dropped) runs of whitespace, commas, hyphens.
        self._real_simple_matcher = _real[:,
                                          Drop(
                                              Star(Or(Whitespace(), Any(',-')))
                                          )]
Пример #21
0
    def _match(self, stream_in):
        '''
        Pull indent and call the policy and update the global value, 
        then evaluate the contents.
        '''
        # detect a nested call
        (_line_no, _line_off, char_off, _desc, _text) = stream_in.location
        if char_off in self.__streams:
            # Already matching a block at this character offset; return
            # without yielding to break infinite left recursion.
            self._debug('Avoided left recursive call to Block.')
            return
        self.__streams.add(char_off)
        try:
            (indent, _stream) = yield self.indent._match(stream_in)
            current = self.__monitor.indent
            self.__monitor.push_level(self.policy(current, indent))
            # this flags we have pushed and need to pop
            # NOTE(review): the monitor reference is discarded after the
            # push; presumably something else pops the level — confirm.
            self.__monitor = None

            generator = And(*self.lines)._match(stream_in)
            while True:
                yield (yield generator)
        finally:
            # Always unregister so the block can be matched again later.
            self.__streams.remove(char_off)
Пример #22
0
 def __build_matcher(self, stream_in):
     '''
     Build a matcher that, when it is evaluated, will return the 
     matcher results for the columns.  We base this on `And`, but need
     to force the correct streams.
     '''
     def force_out(replacement):
         '''
         Generate a transformer function that replaces the stream_out.
         '''
         def replace_out(_stream, matcher):
             # Run the wrapped matcher, then substitute the given stream
             # as the continuation stream.
             (results, _stream_out) = matcher()
             return (results, replacement)
         return replace_out
     # left and right are the indices for the column
     # matchers is the list of matchers that will be joined by And
     # previous is the "column before", which must be modified so that
     # it returns the correct stream_out for the next matcher
     right, matchers, previous = 0, [], Empty()
     columns = list(zip(self.indices, self.matchers))
     if self.skip: 
         # this takes the entire stream_in and applies it to skip
         columns.append(((0, None), Drop(self.skip)))
     else:
         # this takes everything to the right of the previous column
         columns.append((None, Empty()))
     for (col, matcher) in columns:
         try:
             # A column given as an explicit (left, right) pair.
             (left, right) = col
         except TypeError:
             # A column given as a width (or None for "rest of line"):
             # it starts where the previous column ended.
             left = right
             right = None if col is None else right + col
         # Wrap the PREVIOUS matcher so its output stream becomes this
         # column's slice of the input, feeding the next matcher.
         matchers.append(Transform(previous, 
                                   force_out(stream_in[left:right])))
         previous = matcher
     # The last column's matcher runs unmodified at the end of the chain.
     matchers.append(previous)
     return And(*matchers)
Пример #23
0
def SignedInteger():
    '''Match a sequence of digits with an optional initial sign.'''
    sign = Optional(Any('+-'))
    # Join the sign (if present) and the digits into a single result.
    return Add(And(sign, UnsignedInteger()))
Пример #24
0
 def test_simple(self):
     #basicConfig(level=DEBUG)
     # A single Any() matches one value, joined into one result list.
     self.assert_join([1], Any(), [[1]])
     # And(Any(), Any()) consumes exactly two values.
     self.assert_join([1,2], And(Any(), Any()), [[1, 2]])
     # Extra input beyond the matched prefix does not appear in the result.
     self.assert_join([1,2,3], And(Any(), Any()), [[1, 2]])
     # Too little input for both Any()s: no match at all.
     self.assert_join([1], And(Any(), Any()), [])
Пример #25
0
def Join(*matchers):
    '''
    Combine many matchers together with Add(And(...)).
    It can be used indirectly by placing ``+`` between matchers.
    '''
    # Sequence the matchers, then merge their results into one.
    combined = And(*matchers)
    return Add(combined)
Пример #26
0
    def clone_and(use, original, *matchers):
        '''
        We can convert an And only if all the sub-matchers have possible
        regular expressions, and even then we must tag the result unless
        an add transform is present.
        '''
        # The transform functions wrapped around the original matcher,
        # if it supports wrapping at all.
        if hasattr(original, 'wrapper'):
            wrapper = original.wrapper.functions
        else:
            wrapper = None
        # Unless an `add` transform is present the result must be tagged so
        # a later stage knows the characters still need joining.
        add_reqd = True
        if wrapper:
            if wrapper[0] is add:
                # The add transform is subsumed by the regexp itself.
                wrapper = wrapper[1:]
                add_reqd = False
            else:
                # Any other leading transform cannot be expressed as regexp.
                raise Unsuitable
        try:
            # combine all
            (use, regexps) = \
                RegexpContainer.to_regexps(use, matchers, have_add=None)
            # if we have regexp sub-expressions, join them
            regexp = Sequence(alphabet_, *regexps)
            log.debug(fmt('And: cloning {0}', regexp))
            return RegexpContainer.build(original,
                                         regexp,
                                         alphabet_,
                                         regexp_type,
                                         use,
                                         add_reqd=add_reqd,
                                         wrapper=wrapper)
        except Unsuitable:
            # combine contiguous matchers where possible
            if add_reqd:
                raise

            def unpack(matcher):
                # Return (original matcher, regexp) or (original, None)
                # when this sub-matcher cannot be converted.
                original = RegexpContainer.to_matcher(matcher)
                try:
                    return (original,
                            RegexpContainer.to_regexps(use, [matcher],
                                                       have_add=None)[1][0])
                except Unsuitable:
                    return (original, None)

            output = []
            (regexps, originals) = ([], [])
            for (matcher, regexp) in [unpack(matcher) for matcher in matchers]:
                if regexp:
                    # Convertible: extend the current contiguous run.
                    regexps.append(regexp)
                    originals.append(matcher)
                else:
                    # Not convertible: flush the accumulated run first.
                    if len(regexps) > 1:
                        # combine regexps
                        output.append(
                            regexp_type(Sequence(alphabet_, *regexps),
                                        alphabet_))
                    else:
                        # A run of 0 or 1 is not worth combining.
                        output.extend(originals)
                    output.append(matcher)
                    (regexps, originals) = ([], [])
            # Flush any trailing run of convertible matchers.
            if len(regexps) > 1:
                output.append(
                    regexp_type(Sequence(alphabet_, *regexps), alphabet_))
            else:
                output.extend(originals)
            merged = And(*output)
            # Re-apply the original wrapper transforms to the merged matcher.
            return merged.compose(original.wrapper)
Пример #27
0
def Repeat(matcher,
           start=0,
           stop=None,
           limit=None,
           algorithm=DEPTH_FIRST,
           separator=None,
           add_=False,
           reduce=None):
    '''
    This is called by the [] operator.  It repeats the given matcher between
    `start` and `stop` number of times (inclusive).
    
    If `limit` is given it is an upper limit on the number of different
    results returned on backtracking.
    
    `algorithm` selects the repeat algorithm to use.
    
    If `separator` is given then each repetition is separated by that matcher.
    
    If `add_` is true then the results are joined with `Add` (once all
    results are obtained).
    
    If `reduce` is given it should be a pair (zero, join) where
    `join(results, next)` is used to accumulate results and `zero` is the
    initial value of `results`.  This is implemented via `Reduce`.

    `reduce` and `add_` cannot be given together.
    '''
    first = coerce_(matcher)
    if separator is None:
        rest = first
    else:
        # Each repetition after the first is preceded by the separator.
        rest = And(coerce_(separator, Regexp), first)
    if start is None:
        start = 0
    # allow duck typing (mutable values - IntVar etc)


#    assert_type('The start index for Repeat or [...]', start, int)
#    assert_type('The stop index for Repeat or [...]', stop, int, none_ok=True)
#    assert_type('The limit value (step index) for Repeat or [...]', limit, int, none_ok=True)
#    assert_type('The algorithm (step index) for Repeat or [...]', algorithm, str)
#    if start < 0:
#        raise ValueError('Repeat or [...] cannot have a negative start.')
#    if stop is not None and stop < start:
#        raise ValueError('Repeat or [...] must have a stop '
#                         'value greater than or equal to the start.')
#    if 'dbgn'.find(algorithm) == -1:
#        raise ValueError('Repeat or [...] must have a step (algorithm) '
#                         'of d, b, g or n.')
    if add_ and reduce:
        raise ValueError('Repeat cannot apply both add_ and reduce')
    elif add_:
        process = Add
    elif reduce:
        # `reduce` here is the (zero, join) parameter, shadowing the builtin.
        process = lambda r: Reduce(r, reduce[0], reduce[1])
    else:
        process = Identity
    # All four variants are constructed eagerly; only the one selected by
    # `algorithm` is kept.
    matcher = {
        DEPTH_FIRST:
        process(DepthFirst(first=first, start=start, stop=stop, rest=rest)),
        BREADTH_FIRST:
        process(BreadthFirst(first=first, start=start, stop=stop, rest=rest)),
        GREEDY:
        process(
            OrderByResultCount(
                BreadthFirst(first=first, start=start, stop=stop, rest=rest))),
        NON_GREEDY:
        process(
            OrderByResultCount(
                BreadthFirst(first=first, start=start, stop=stop, rest=rest),
                False))
    }[algorithm]
    if limit is not None:
        # Cap the number of alternative results produced on backtracking.
        matcher = Limit(matcher, count=limit)
    return matcher