def test():
  """Smoke-test patternToExpression on two sample filename regexes."""
  import ClearMap.Utils.InverseRegularExpression as ire
  import sre_parse as sre

  reload(ire)
  expr = '/test/test_(?P<row>\d{4})_(?P<col>\d{3}).tif'
  ire.patternToExpression(sre.parse(expr))

  reload(ire)
  expr = r'/test/test_(?:\d)_(?P<col>\d{3})_[7-9][.](?=col)tif$'
  ire.patternToExpression(sre.parse(expr))
Exemplo n.º 2
0
def doReversing(p):
	"""Parse the regex string *p* and return its reversal via reverse()."""
	dbg("Pattern:" + p)
	parsed = sre_parse.parse(p, 0)
	return reverse(parsed)
Exemplo n.º 3
0
def charclass_runner(pat):
    """Compare regexlint's character-class expansion against sre_parse's."""
    tree = Regex().get_parse_tree(pat)
    regexlint_version = tree.children[0].matching_character_codes
    sre_parsed = sre_parse.parse(pat)
    print(sre_parsed)
    if isinstance(sre_parsed[0][1], int):
        sre_chars = sre_parsed          # bare literal token
    else:
        sre_chars = sre_parsed[0][1]    # contents of an [...] class
    print('inner', sre_chars)
    golden = list(expand_sre_in(sre_chars))
    order_matters = True
    try:
        negated = (sre_parsed[0][0] == sre_constants.NOT_LITERAL or
                   sre_parsed[0][1][0][0] == sre_constants.NEGATE)
    except TypeError:
        # token payload was not subscriptable -- not a negated class
        negated = False
    if negated:
        golden = [i for i in range(256) if i not in golden]
        order_matters = False

    print('sre_parse', golden)
    print('regexlint', regexlint_version)
    if order_matters:
        assert golden == regexlint_version
    else:
        print('extra:', sorted(set(regexlint_version) - set(golden)))
        print('missing:', sorted(set(golden) - set(regexlint_version)))

        assert sorted(golden) == sorted(regexlint_version)
Exemplo n.º 4
0
def reverse_group_map(re_str):
    """Map group index -> group name for the named groups in *re_str*.

    Arguments:
        re_str (str): regular expression source.

    Returns:
        dict: {group_index: group_name}, the inverse of re.Pattern.groupindex.
    """
    compiled = re.compile(re_str)
    # The original also called sre_parse.parse() and bound groupindex twice;
    # both results were unused, so the redundant work is dropped.
    return {index: group for group, index in compiled.groupindex.items()}
Exemplo n.º 5
0
Arquivo: raz.py Projeto: jdukes/raz
 def __init__(self, pattern, flags=0):
     """Store *pattern* and its sre_parse tree; raise ErrorUnparseable on bad input.

     NOTE(review): Python 2 syntax (`except error, e`); `error` is presumably
     sre_constants.error -- confirm against the module's imports.
     """
     self.pattern = pattern
     try:
         self.parsed_pattern = sre_parse.parse(pattern, flags)
     except error, e:
         raise ErrorUnparseable(
             'Invalid regex %s failed: %s' % (pattern,e.message))
Exemplo n.º 6
0
def compile_regexp_to_noncapturing(pattern, flags=0):
    """
    Convert all grouping parentheses in the given regexp pattern to
    non-capturing groups, and return the result.  E.g.:

        >>> from nltk.internals import compile_regexp_to_noncapturing
        >>> compile_regexp_to_noncapturing('ab(c(x+)(z*))?d')
        'ab(?:c(?:x+)(?:z*))?d'

    :type pattern: str
    :rtype: str
    """
    def convert_regexp_to_noncapturing_parsed(parsed_pattern):
        res_data = []
        for key, value in parsed_pattern.data:
            if key == sre_constants.SUBPATTERN:
                index, subpattern = value
                value = (None, convert_regexp_to_noncapturing(subpattern))
            elif key == sre_constants.GROUPREF:
                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
            res_data.append((key, value))
        parsed_pattern.data = res_data
        parsed_pattern.pattern.groups = 1
        parsed_pattern.pattern.groupdict = {}
        return parsed_pattern

    return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)))
Exemplo n.º 7
0
def compile(p, flags=0):
    """Convert a pattern string or parsed pattern list to a compiled SRE object."""
    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # FIXME: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100, \
        "sorry, but this version only supports 100 named groups"

    # build the name<->index maps for named groups
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups - 1,
        groupindex, indexgroup)
Exemplo n.º 8
0
 def __init__(self, lexicons, init_state=None, flags=0):
     """Build one re.Scanner-style matcher per lexicon.

     *lexicons* maps a start state to a list of (phrase, action) pairs;
     each phrase is wrapped in its own numbered group so the matching
     action can be recovered from the group index.
     """
     # All the regexp magic below is copied from re.Scanner from
     # the standard library.
     import sre_compile
     import sre_parse
     from sre_constants import BRANCH, SUBPATTERN
     if init_state is None:
         init_state = State()
     if not hasattr(init_state, 'start'):
         init_state.start = None
     self.init_state = init_state
     self.lexicons = lexicons
     self.scanners = {}
     for start, lexicon in lexicons.iteritems():  # Python 2 dict API
         # combine phrases into a compound pattern
         p, a = [], []
         s = sre_parse.Pattern()
         s.flags = flags
         for phrase, action in lexicon:
             # group number len(p)+1 identifies which phrase matched
             p.append(sre_parse.SubPattern(s, [
                         (SUBPATTERN, (len(p)+1,
                                       sre_parse.parse(phrase, flags))),
                         ]))
             a.append(action)
         p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
         # NOTE(review): p was just rebound to the BRANCH wrapper, so len(p)
         # is 1 here; the stdlib Scanner sets groups to phrase count + 1
         # before branching -- confirm this is intentional.
         s.groups = len(p)
         self.scanners[start] = sre_compile.compile(p).match, a
Exemplo n.º 9
0
def regex_slice(expr, start, end):
    """
    Get a slice of a regex by calling regex_index on each index.

    Note that this can return expressions that are overly general: for example,
    it can mix characters from both branches of a regex. Being more specific
    than that would take more work.

        >>> regex_slice('test', 0, 1)
        't'
        >>> regex_slice('t?est', 0, 2)
        '[te][es]'
        >>> regex_slice('mo+', 3, 8)
        'ooooo'

    """
    if start < 0 or end < 0:
        raise NotImplementedError("Can't take negative slices of a regex yet")
    pieces = []
    for position in range(start, end):
        # re-parse each iteration so each index works on a fresh tree
        options = _regex_index_pattern(parse(expr), position)
        if len(options) == 0:
            return None
        if len(options) == 1:
            pieces.append(unparse(options[0]))
        else:
            alt = round_trip(unparse(('branch', (None, options))))
            pieces.append('(%s)' % (alt,) if '|' in alt else alt)
    return ''.join(pieces)
Exemplo n.º 10
0
def clean_pattern(pattern):
    """
    Cleans URL patterns
     * pattern => token
     * '2'     => ('literal', 50)
     * '2|3'   => ('in', [('literal', 50), ('literal', 51)])
    """
    star = '*'
    literals = []
    parsed = sre_parse.parse(pattern)

    for op, arg in parsed:
        if op == LITERAL:
            literals.append(quote(unichr(arg).encode('utf8')))
        elif op == AT:
            pass
        elif literals[-1:] != [star]:
            # collapse every run of non-literal tokens into a single star
            literals.append(star)

    rule = '/' + ''.join(literals)

    if parsed and not rule.endswith(star):
        rule += '$' if parsed[-1] == (AT, AT_END) else star

    return rule
Exemplo n.º 11
0
    def __init__(self, lexicon, flags=FLAGS):
        """Build a scanner from *lexicon* (objects with a .pattern regex).

        Each token pattern becomes capturing group idx+1 of one compound
        BRANCH pattern; self.actions[group] recovers the matched token
        (slot 0 is a placeholder since group numbers start at 1).
        """
        self.actions = [None]
        # combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        # NOTE(kgibbs): These lines must be added to make this file work under
        # Python 2.2, which is commonly used at Google.
        def enumerate(obj):
            # deliberately shadows the builtin (absent in Python 2.2)
            i = -1
            for item in obj:
                i += 1
                yield i, item
        # NOTE(kgibbs): End changes.
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                raise
            p.append(subpattern)
            self.actions.append(token)

        s.groups = len(p)+1  # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
Exemplo n.º 12
0
def isFileExpression(source):
    """Checks if filename is a regular expression denoting a file list
    
    Arguments:
        source (str): source file name
        
    Returns:
        bool: True if source is true regular expression with at least one non-literal
        
    Note:
        The any character '.' is not treated as a non-literal because of possible filename extensions
    """    
    
    # Python 2 code: `basestring` does not exist on Python 3.
    if not isinstance(source, basestring):
        return False;
    
    # an existing plain file is not treated as an expression
    if isFile(source):
        return False;
    else:
        #searchRegex = re.compile('.*\\\\d\{(?P<digit>\d)\}.*').search
        #m = searchRegex(source);
        #if m is None:
        #    return False;
        #else:
        #    return True;
        
        #parse regular expression 
        p = sre_parse.parse(source);
        # NOTE(review): comparing opcodes against the strings 'literal'/'any'
        # assumes the old sre_parse where opcodes were plain strings; on
        # modern Pythons they are named int constants -- confirm target version.
        for l in p:
          #note: allow for a filname.ext patterns although this is strictly a regular expression which should be denoted as filename\.ext
          if l[0] != 'literal' and l[0] != 'any':
            return True;
        
        return False;
Exemplo n.º 13
0
def re_replace_literals(text, mapping):
    """Rebuild *text*'s parse tree with literals substituted via *mapping*.

    Raises NotImplementedError or re.error.
    """
    assert isinstance(text, unicode)

    parsed = sre_parse.parse(text)
    return _construct_regexp(parsed, mapping)
Exemplo n.º 14
0
def charclass_runner(pat):
    """Compare regexlint's character-class expansion with sre_parse's
    golden expansion for *pat* (Python 2: print statements, string opcodes)."""
    r = Regex().get_parse_tree(pat)
    regexlint_version = r.children[0].matching_character_codes
    sre_parsed = sre_parse.parse(pat)
    print sre_parsed
    if isinstance(sre_parsed[0][1], int):
        sre_chars = sre_parsed
    else:
        sre_chars = sre_parsed[0][1]
    golden = list(expand_sre_in(sre_chars))
    order_matters = True
    try:
        # negated literal or negated class: golden becomes the complement in 0..255
        if (sre_parsed[0][0] == 'not_literal' or
            sre_parsed[0][1][0][0] == 'negate'):
            golden = [i for i in range(256) if i not in golden]
            order_matters = False
    except TypeError:
        # token payload was not subscriptable -- not a character class
        pass

    print golden
    print regexlint_version
    if order_matters:
        assert golden == regexlint_version
    else:
        assert sorted(golden) == sorted(regexlint_version)
Exemplo n.º 15
0
def compile(p, flags=0):
    """Convert a pattern string or parsed subpattern to a compiled SRE object."""
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
        )

    # build the name<->index maps for named groups
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups - 1,
        groupindex, indexgroup)
Exemplo n.º 16
0
def compile(p, flags=0):
    """Convert a pattern string or parsed subpattern to a compiled SRE object."""
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # build the name<->index maps for named groups
    state = p.state
    groupindex = state.groupdict
    indexgroup = [None] * state.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(
        pattern, flags | state.flags, code,
        state.groups - 1,
        groupindex, tuple(indexgroup))
Exemplo n.º 17
0
def regex_score(regex, search_string):
    """
    Returns a closeness score of how well the regex matches the string.
    Will return -1 if it doesn't match.

    Arguments:
        regex: compiled regex object (its .pattern is used for scoring).
        search_string (str): string to score the regex against.
    """
    match = re.search(regex, search_string)

    if not match:
        return -1

    # Base score is the longest distance between regex, match,
    # and search_string
    regex_match_dist = levenshtein_distance(
        regex.pattern.lower(), match.group(0).lower())
    match_string_dist = levenshtein_distance(
        match.group(0).lower(), search_string.lower())

    score = max(regex_match_dist, match_string_dist)

    # Adjust score: Special anchors slightly reduce distance
    for opcode, argument in sre_parse.parse(regex.pattern):
        if str(opcode) == 'AT':
            # BUG FIX: original tested `== 'AT_BEGINNING' or 'AT_END'`,
            # which is always true ('AT_END' is a truthy string), so every
            # anchor was discounted by 1.
            if str(argument) in ('AT_BEGINNING', 'AT_END'):
                # ^ or $, adjust 1 edit
                score -= 1

            if str(argument) == 'AT_BOUNDARY':
                # all other anchors reduce 2 edits
                score -= 2

    return score if score >= 0 else 0
Exemplo n.º 18
0
def ipermute(p):
    r"""Generate permutations (returns an iterable rather than an array or list)

    >>> list(ipermute(r'[A-Z]\d'))
    ['A0', 'B0', ..., 'Z9']
    """
    # list(...) replaces the original identity comprehension
    # `[tok for tok in sre_parse.parse(p)]` -- same tokens, less code.
    return permute_toks(list(sre_parse.parse(p)))
Exemplo n.º 19
0
def base_regex_strategy(regex, parsed=None):
    """Build a drawing strategy for the compiled *regex*.

    *parsed* may be supplied to skip re-parsing the pattern.
    NOTE(review): `sre` is presumably sre_parse -- confirm module imports.
    """
    if parsed is None:
        parsed = sre.parse(regex.pattern)
    return clear_cache_after_draw(_strategy(
        parsed,
        Context(flags=regex.flags),
        regex.pattern
    ))
Exemplo n.º 20
0
 def test_re_inverse(self):
     """Smoke test: generate matching and non-matching strings for RE."""
     import sre_parse
     RE = r'(firstleft|)somestring(\s.*|) \S(a|b) [fgh]+ {2,3}R(\S)'
     print(sre_parse.parse(RE))
     # NOTE(review): results are never asserted; this only checks that the
     # generators run repeatedly without raising.
     for i in range(20):
         ms = re_inverse.make_match_string(RE)
     for i in range(20):
         ms = re_inverse.make_nonmatch_string(RE)
Exemplo n.º 21
0
def base_regex_strategy(regex, parsed=None):
    """Build a drawing strategy for the compiled *regex*.

    Parses with the regex's own flags when *parsed* is not supplied; the
    final argument tells _strategy whether the pattern is text or bytes.
    """
    if parsed is None:
        parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
    return clear_cache_after_draw(
        _strategy(
            parsed, Context(flags=regex.flags), isinstance(regex.pattern, text_type)
        )
    )
Exemplo n.º 22
0
    def __init__(self, pattern, flag = 0, escape = None):
        """Split *pattern* into its named 'begin'/'middle'/'end' subgroups.

        Validates the group combination ('begin' mandatory; 'end' requires
        'middle'), computes whether the begin group is an exact literal
        match, and whether the overall match can be unbounded ('infinity').
        Python 2 code (dict.iteritems).
        """
        self.name = None
        self.group = None
        self.flag = flag
        self.lexer = None
        self.escape = escape

        # translate our own IGNORE_CASE flag to the sre flag
        scf = 0
        if flag & self.IGNORE_CASE:
            scf |= sre_compile.SRE_FLAG_IGNORECASE

        cpattern = re.compile(pattern, scf)
        groupidx = cpattern.groupindex

        self.parsed = sre_parse.parse(pattern, scf)

        self.begin = None
        self.middle = None
        self.end = None
        self.exact = False

        # locate the named groups among the top-level SUBPATTERN tokens
        # NOTE(review): unpacking (gidx, val) assumes the pre-3.6 two-element
        # SUBPATTERN payload -- confirm target Python version.
        for (op, (gidx, val)) in self.parsed:

            for (gname, idx) in groupidx.iteritems():
                if op == sre_constants.SUBPATTERN and gidx == idx:
                    if gname == "begin":
                        self.begin = MatchPattern(self, val, (flag & self.IGNORE_CASE) == self.IGNORE_CASE)
                    elif gname == "middle":
                        self.middle = MatchPattern(self, val, (flag & self.IGNORE_CASE) == self.IGNORE_CASE)
                    elif gname == "end":
                        self.end = MatchPattern(self, val, (flag & self.IGNORE_CASE) == self.IGNORE_CASE)

        if self.begin is None:
            raise Exception("Need to define 'begin' group!")

        if self.end is not None and self.middle is None:
            raise Exception("Need to define 'middle' group when 'end' defined!")

        # begin-only patterns made purely of literals can be matched exactly
        if self.middle is None and self.end is None:
            _ex = True
            for (_op, _val) in self.begin._pattern:
                if _op != sre_constants.LITERAL:
                    _ex = False
                    break
            self.exact = _ex

        # unbounded width is only allowed in the middle group
        if self.begin.maxWidth() >= self.INF_WIDTH:
            raise Exception("Begin group don't support ifinity match!")

        if self.end and self.end.maxWidth() >= self.INF_WIDTH:
            raise Exception("End group don't support infinity match!");

        if self.middle and self.middle.maxWidth() >= self.INF_WIDTH:
            self.infinity = True
            self.middle.infinity = True
        else:
            self.infinity = False
def expressionToPattern(expression):
  """Parse a regular expression into a manipulable sre pattern tree.

  Arguments:
    expression (str): regular expression

  Returns:
    object: parsed pattern
  """
  return sre.parse(expression)
Exemplo n.º 24
0
def colorize_re(regexp):
    r"""
    @return: The HTML code for a colorized version of the pattern for
        the given SRE regular expression.  If C{colorize_re} can't
        figure out how to colorize the regexp, then it will simply return
        the (uncolorized) pattern, with C{'&'}, C{'<'}, and C{'>'}
        escaped as HTML entities.  The colorized expression includes
        spans with the following css classes:
          - X{re}: The entire regular expression.
          - X{re-char}: Special characters (such as C{'.'}, C{'\('}), 
            character categories (such as C{'\w'}), and locations
            (such as C{'\b'}).
          - X{re-op}: Operators (such as C{'*'} and C{'|'}).
          - X{re-group}: Grouping constructs (such as C{'(...)'}).
          - X{re-ref} References (such as C{'\1'})
    @rtype: C{string}
    @param regexp: The regular expression to colorize.
    @type regexp: C{SRE_Pattern} or C{string}
    @raise sre_constants.error: If regexp is not a valid regular
        expression.
    """
    # Accept a raw str/unicode pattern or a pattern object with
    # .pattern/.flags; byte strings are decoded before parsing so they
    # can be rendered.  Python 2 code (`unicode` type).
    if isinstance(regexp, str):
        pat = decode_with_backslashreplace(regexp)
        tree = sre_parse.parse(pat)
        
    elif isinstance(regexp, unicode):
        tree = sre_parse.parse(regexp)
        
    elif hasattr(regexp, 'pattern') and hasattr(regexp, 'flags'):
        if isinstance(regexp.pattern, str):
            pat = decode_with_backslashreplace(regexp.pattern)
            tree = sre_parse.parse(pat, regexp.flags)
            
        elif isinstance(regexp.pattern, unicode):
            tree = sre_parse.parse(regexp.pattern, regexp.flags)
            
        else:
            raise TypeError("Bad regexp object -- pattern is not a string")
    else:
        raise TypeError("Expected a regexp or a string")

    # wrap the rendered parse tree in the top-level span
    return ('<span class="%s">%s</span>' %
            (RE_TAG, _colorize_re(tree, 1)))
Exemplo n.º 25
0
        def make_pattern(rules, flags=0):
            """Compile a rules to single branch with groups."""
            # One shared Pattern state; slot 0 of subpatterns is unused since
            # group numbers start at 1 (one group per rule).
            pattern = Pattern()
            pattern.flags = flags
            pattern.subpatterns = [None] * (len(rules) + 1)

            # Wrap each rule's regex in its own numbered SUBPATTERN and join
            # them all under a single BRANCH (alternation).
            return sre_compile(SubPattern(pattern, [
                (BRANCH, (None, [SubPattern(pattern, [
                    (SUBPATTERN, (group, parse(regex, flags, pattern))),
                ]) for group, (regex, _) in enumerate(rules, 1)]))
            ]))
Exemplo n.º 26
0
def regex_pieces(regex):
    """
    Separates a regex into independent pieces.

        >>> regex_pieces('[abc]de+')
        ['[abc]', 'd', 'e+']
    """
    # each parsed token is unparsed back into its own source fragment
    return [unparse([piece]) for piece in parse(regex)]
Exemplo n.º 27
0
def make_nonmatch_string(regexp, flags=0):
    """Given a string that is a regular expression,
    return a string (perhaps with some randomness) that is certain to
    NOT produce a match.
    """
    parsed = sre_parse.parse(regexp, get_flags(flags))
    result = _make_match_string_from_pattern(parsed, True)
    if __debug__:
        # sanity check: the generated string must really not match
        if compile(regexp, flags).match(result):
            raise GeneratorError("'%s' matches '%s'" % (result, regexp))
    return result
Exemplo n.º 28
0
    def __init__(self, rules, flags=0):
        """Build a scanner for *rules* = [(name, regex), ...].

        Each regex becomes capturing group N (numbered from 1) of a single
        BRANCH pattern; self.rules[N-1] is the rule name for group N.
        """
        pattern = Pattern()
        pattern.flags = flags
        # +1 because group 0 is the whole match
        pattern.groups = len(rules) + 1

        self.rules = [name for name, _ in rules]
        self._scanner = sre_compile(SubPattern(pattern, [
            (BRANCH, (None, [SubPattern(pattern, [
                (SUBPATTERN, (group, parse(regex, flags, pattern))),
            ]) for group, (_, regex) in enumerate(rules, 1)]))
        ])).scanner
Exemplo n.º 29
0
 def __init__(self, lexicon, flags=0):
     """re.Scanner-style setup: one capturing group per (phrase, action) pair."""
     from sre_constants import BRANCH, SUBPATTERN
     self.lexicon = lexicon
     p = []
     s = sre_parse.Pattern()
     s.flags = flags
     for (phrase, action) in lexicon:
         # group len(p)+1 identifies which phrase matched
         p.append(sre_parse.SubPattern(s, [(SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags)))]))
     s.groups = len(p) + 1
     p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
     self.scanner = sre_compile.compile(p)
Exemplo n.º 30
0
def _make_url_form(regexp):
    cre = re.compile(regexp, re.I)
    # Build reverse format from re parse tree.
    indexmap = dict([(v,k) for k,v in cre.groupindex.items()])
    collect = []
    for op, val in sre_parse.parse(regexp, re.I):
        if op is sre_parse.LITERAL:
            collect.append(chr(val))
        elif op is sre_parse.SUBPATTERN:
            name = indexmap[val[0]]
            collect.append(r'%%(%s)s' % name)
    return cre, "".join(collect)
Exemplo n.º 31
0
    def compile(self):
        """Parse and compile self.pattern, mirroring sre_compile internals.

        Populates self.subpattern/self.code/self.groupindex/self.indexgroup
        and builds self.regex through the C (or debug) _sre module.
        """
        self.subpattern = sre_parse.parse(self.pattern, self.flags)
        self.code = sre_compile._code(self.subpattern, self.flags)

        # groups=0, groupindex={}, indexgroup=[None]
        self.groupindex = self.subpattern.pattern.groupdict
        self.indexgroup = [None] * self.subpattern.pattern.groups
        for k, i in self.groupindex.items():
            self.indexgroup[i] = k

        # pick the debug shim or the real C module
        module = _sre_ if self.debug else _sre
        self.regex = getattr(module, 'compile')(
            self.pattern, self.flags | self.subpattern.pattern.flags,
            self.code, self.subpattern.pattern.groups - 1, self.groupindex,
            tuple(self.indexgroup))
        # NOTE(review): bare attribute accesses -- presumably properties with
        # printing side effects; confirm they are not meant to be calls.
        self.dump
        print('-' * 76)
        self.dis
Exemplo n.º 32
0
    def __init__(self, lexicon, flags=FLAGS):
        """Build a scanner from *lexicon* (objects with a .pattern regex).

        Each token pattern becomes capturing group idx+1 of one compound
        BRANCH pattern; self.actions[group] recovers the matched token
        (slot 0 is a placeholder since group numbers start at 1).
        """
        self.actions = [None]
        # combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                raise
            p.append(subpattern)
            self.actions.append(token)

        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
Exemplo n.º 33
0
Arquivo: re.py Projeto: ybay/yZhPy
 def __init__(self, lexicon, flags=0):
     """re.Scanner __init__ using the modern open/closegroup sre_parse API."""
     from sre_constants import BRANCH, SUBPATTERN
     if isinstance(flags, RegexFlag):
         flags = flags.value
     self.lexicon = lexicon
     # combine phrases into a compound pattern
     p = []
     s = sre_parse.Pattern()
     s.flags = flags
     for phrase, action in lexicon:
         gid = s.opengroup()
         p.append(
             sre_parse.SubPattern(s, [
                 # (group, add_flags, del_flags, subpattern) -- 3.6+ layout
                 (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
             ]))
         s.closegroup(gid, p[-1])
     p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
     self.scanner = sre_compile.compile(p)
Exemplo n.º 34
0
    def __init__(self, rules, flags=0):
        """Build a scanner for *rules* = [(name, regex), ...], recording the
        span of group numbers each rule's regex occupies."""
        pattern = Pattern()
        pattern.flags = flags
        if sys.version_info < (3, 0):
            # Python 2's sre needs the group count pre-declared
            pattern.groups = len(rules) + 1
        # NOTE(review): `name` inside the lambda is late-bound to the loop
        # variable below, so opengroup prefixes nested group names with the
        # rule currently being parsed -- confirm this is intentional.
        _og = pattern.opengroup
        pattern.opengroup = lambda n: _og(n and '%s\x00%s' % (name, n) or n)

        self.rules = []
        subpatterns = []
        for group, (name, regex) in enumerate(rules, 1):
            # groups opened while parsing this regex fall in
            # (last_group, pattern.groups - 1]
            last_group = pattern.groups - 1
            subpatterns.append(
                SubPattern(pattern, [(SUBPATTERN, (group, parse(regex, flags,
                                                                pattern))), ]))
            self.rules.append((name, last_group, pattern.groups - 1))
        self._scanner = sre_compile(
            SubPattern(pattern, [(BRANCH, (None, subpatterns))])).scanner
def compile(p, flags=0):
    """Convert a pattern string or parsed pattern to a compiled SRE object."""
    if isstring(p):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    if p.pattern.groups > 100:
        raise AssertionError(
            'sorry, but this version only supports 100 named groups')

    # build the name<->index maps for named groups
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(pattern, flags, code, p.pattern.groups - 1, groupindex,
                        indexgroup)
Exemplo n.º 36
0
    def __init__(self, tokens, ignore=()):
        """Validate token definitions and build the lexer's matchers.

        Rejects tokens whose regexps do not compile or can match the empty
        string, and ignore-names that are not defined.
        """
        assert all(isinstance(t, TokenDef) for t in tokens), tokens

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        for t in tokens:
            try:
                re.compile(t.pattern.to_regexp())
            except:
                raise LexError("Cannot compile token %s: %s" %
                               (t.name, t.pattern))

            # getwidth() -> (min, max) possible match width; min 0 means
            # the token could match the empty string
            width = sre_parse.parse(t.pattern.to_regexp()).getwidth()
            if width[0] == 0:
                raise LexError(
                    "Lexer does not allow zero-width tokens. (%s: %s)" %
                    (t.name, t.pattern))

        token_names = {t.name for t in tokens}
        for t in ignore:
            if t not in token_names:
                raise LexError(
                    "Token '%s' was marked to ignore but it is not defined!" %
                    t)

        # Init
        self.newline_types = [
            t.name for t in tokens
            if _regexp_has_newline(t.pattern.to_regexp())
        ]
        self.ignore_types = [t for t in ignore]

        # higher-priority / longer tokens first so they win ties
        tokens.sort(key=lambda x: (x.pattern.priority, len(x.pattern.value)),
                    reverse=True)

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        self.tokens = tokens

        self.mres = build_mres(tokens)
Exemplo n.º 37
0
def _match_pattern(compiled_regex, pattern, orig_smtstr, pos, endpos=None):
    """Symbolically match *pattern* against *orig_smtstr* starting at *pos*.

    Returns a match object with positions, the originating regex and its
    groups filled in, or None when no match is possible.
    """
    space = orig_smtstr.statespace
    parsed_pattern = parse(pattern, compiled_regex.flags)
    smtstr = _slice_match_area(orig_smtstr, pos, endpos)
    match = _internal_match_patterns(space, parsed_pattern,
                                     compiled_regex.flags, smtstr, pos)
    if match is not None:
        match.pos = pos
        match.endpos = endpos if endpos is not None else len(orig_smtstr)
        match.re = compiled_regex
        match.string = orig_smtstr
        # fill None in unmatched groups:
        while len(match._groups) < compiled_regex.groups + 1:
            match._groups.append(None)
        # Link up any named groups:
        for name, num in compiled_regex.groupindex.items():
            (_, start, end) = match._groups[num]
            match._groups[num] = (name, start, end)
    return match
Exemplo n.º 38
0
 def parse(cls, s):
     """Extract the literal keyword substrings from regex *s*.

     Walks the sre parse tree; runs of literal characters become keywords,
     split wherever a non-literal token appears.  A backslash toggles a
     quoted state so an escaped backslash is kept literally.
     NOTE(review): comparing opcodes to the string "literal" assumes the
     old sre_parse where opcodes were plain strings -- confirm the target
     Python version.
     """
     keywords = []
     current = []
     quoted = False
     for t, x in sre_parse.parse(s):
         if t == "literal":
             if x == 92:  # \
                 if quoted:
                     current += ["\\"]
                     quoted = False
                 else:
                     quoted = True
             else:
                 current += [chr(x)]
         elif current:
             # non-literal token terminates the current keyword run
             keywords += ["".join(current)]
             current = []
     if current:
         keywords += ["".join(current)]
     return keywords
Exemplo n.º 39
0
def compile(p, flags=0):
    """Convert a pattern string or parsed pattern to a compiled SRE object."""
    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    assert p.pattern.groups <= 100, \
        'sorry, but this version only supports 100 named groups'

    # build the name<->index maps for named groups
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(pattern, flags, code, p.pattern.groups - 1,
                        groupindex, indexgroup)
Exemplo n.º 40
0
def make_scanner(lexicon, flags=FLAGS):
    """Build a scan-one-token closure from *lexicon*.

    Tokens whose pattern is one of \[, { or " get a fast path keyed on
    their single character; everything else is matched through one compound
    BRANCH pattern whose matching group number indexes into *actions*.
    """
    actions = [None]
    # Combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    charpatterns = {}
    p = []
    idx = 0
    for token in lexicon:
        if token.pattern in (r'\[', r'{', r'"'):
            # single-character fast path, keyed on the char itself
            charpatterns[token.pattern[-1]] = token
        idx += 1
        phrase = token.pattern
        try:
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            raise
        p.append(subpattern)
        actions.append(token)

    s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p).scanner

    def _scan_once(string, idx=0, context=None):
        # Try the single-character fast path first.
        try:
            action = charpatterns[string[idx]]
        except KeyError:
            pass
        except IndexError:
            # ran off the end of the input
            raise StopIteration
        else:
            return action((string, idx + 1), context)

        m = scanner(string, idx).match()
        if m is None or m.end() == idx:
            raise StopIteration
        return actions[m.lastindex](m, context)

    return _scan_once
Exemplo n.º 41
0
    def get_brackets_values(pattern):
        """Expand a (possibly POSIX-classed) bracket expression into the list
        of characters it matches, each escaped via unicode_escape."""
        posix_classes = {
            '[:alnum:]': 'a-zA-Z0-9',
            '[:alpha:]': 'a-zA-Z',
            '[:ascii:]': '\\x00-\\x7f',
            '[:blank:]': ' \\t',
            '[:cntrl:]': '\\x00-\\x1f\\x7f',
            '[:digit:]': '0-9',
            '[:graph:]': '\\x21-\\x7e',
            '[:lower:]': 'a-z',
            '[:print:]': '\\x20-\\x7e',
            '[:punct:]': '!"\\#$%&\'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~',
            '[:space:]': ' \\t\\r\\n\\v\\f',
            '[:upper:]': 'A-Z',
            '[:word:]': 'A-Za-z0-9_',
            '[:xdigit:]': 'A-Fa-f0-9'
        }

        def esc(code):
            # render one character code in escaped, printable form
            return chr(code).encode('unicode_escape').decode("utf-8")

        # rewrite POSIX class names into plain character ranges first
        for posix_name in posix_classes:
            pattern = pattern.replace(posix_name, posix_classes[posix_name])

        data = sre_parse.parse(pattern).data
        if type(data[0][1]) is int:
            # a bare literal, e.g. 'x'
            return [esc(data[0][1])]

        matches = []
        for item in data[0][1]:
            if type(item) is int:
                return [esc(item)]
            if sre_parse.LITERAL == item[0]:
                matches.append(esc(item[1]))
            elif sre_parse.RANGE == item[0]:
                matches += [esc(x) for x in range(item[1][0], 1 + item[1][1])]
        return matches
Exemplo n.º 42
0
def compile(p, flags=0):
    """Convert a pattern string or parsed pattern to a compiled SRE object."""
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # build the name<->index maps for named groups
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, num in groupindex.items():
        indexgroup[num] = name

    return _sre.compile(pattern, flags | p.pattern.flags, code,
                        p.pattern.groups - 1, groupindex, indexgroup)
Exemplo n.º 43
0
    def _compile_hook(pattern: str, flags: int) -> "AtherisPatternProxy":
        """Overrides re._compile.

        Generates a string matching *pattern* (cached per pattern in
        pattern_gen_map), sanity-checks it against the real compiled regex,
        and returns the compiled pattern wrapped in an AtherisPatternProxy
        that carries the generated match.
        """
        generated = ""
        if pattern not in pattern_gen_map:
            pat = sre_parse.parse(pattern)
            generated = gen_match(pat)
            # Check that the pattern actually matches
            check_pattern = pattern
            try:
                # Convert our pattern to a string if necessary
                check_pattern = pattern.decode("utf-8")  # type: ignore
            except AttributeError:
                # Already a string
                pass
            except Exception as e:  # pylint: disable=broad-except
                # Not sure what went wrong.
                sys.stderr.write(
                    f"Could not convert the pattern {pattern} to a " +
                    f"utf-8 string: {e}\n")
            try:
                if original_compile_func(check_pattern,
                                         flags).search(generated) is None:
                    # BUG FIX: the second half of this message was a plain
                    # string, so it printed the literal text '{_pattern}'
                    # (an undefined name) instead of the pattern.
                    sys.stderr.write(
                        f"ERROR: generated match '{generated}' did not " +
                        f"match the RegEx pattern '{pattern}'!\n")
            except Exception as e:  # pylint: disable=broad-except
                sys.stderr.write(
                    "Could not check the generated match against the " +
                    f"RegEx pattern: {e}\n")
            pattern_gen_map[pattern] = generated
        else:
            generated = pattern_gen_map[pattern]

        # Create the `re.Pattern` object. We will wrap this in a proxy later on.
        re_object = original_compile_func(pattern, flags)

        # Return the wrapped `re.Pattern` object.
        return AtherisPatternProxy(re_object, generated)
Exemplo n.º 44
0
def _match_pattern(
    compiled_regex: re.Pattern,
    pattern: str,
    orig_smtstr: SymbolicStr,
    pos: int,
    endpos: Optional[int] = None,
) -> Optional[_Match]:
    """Symbolically match `pattern` against `orig_smtstr` starting at `pos`.

    Returns a `_Match` describing the captured groups, or None when the
    pattern cannot match.
    """
    if pos == 0:
        # Leading "^" / "\A" anchors are no-ops when matching from the very
        # start of the string, so strip them before parsing.
        pattern = pattern.lstrip("^")
        while pattern.startswith(r"\A"):
            pattern = pattern[2:]

    space = orig_smtstr.statespace
    parsed = parse(pattern, compiled_regex.flags)
    truncated = _slice_tail(orig_smtstr, endpos)
    part = _internal_match_patterns(
        space, parsed, compiled_regex.flags, truncated, pos
    )
    if part is None:
        return None
    return _Match(part._groups, pos, endpos, compiled_regex, orig_smtstr)
Exemplo n.º 45
0
def expand_sub(string, template, debug=0, mode='all'):
    """Yield (expansion, substitution) pairs for a regular expression.

    For every string the regex `string` could match, yield a tuple of that
    string and the result of applying the replacement `template` to it.

        string : regular expression to expand
        template : transformation to apply to each expansion
        mode : 'all' yields every possible shortest matching string,
            'first' only the first of those, 'random' one random match
    """
    parsed = sre_parse.parse(string, flags=sre_parse.SRE_FLAG_VERBOSE)
    # The iteration machinery reads the requested mode off the parsed
    # pattern object — presumably consumed inside _iterate (confirm).
    parsed.mode = mode
    compiled_template = sre_parse.parse_template(template, parsed)
    if debug:
        print(parsed)
        print(compiled_template)
    for match in _iterate(parsed, parsed.data, MatchObj(parsed, "")):
        match.patient = 0
        yield (match.string, sre_parse.expand_template(compiled_template, match))
Exemplo n.º 46
0
def regex_len(regex):
    """
    Compute the (minimum, maximum) possible length of strings matched by
    `regex`. MAXREPEAT stands in for "very long or unbounded".

        >>> regex_len('test')
        (4, 4)
        >>> regex_len('t.st')
        (4, 4)
        >>> regex_len('.*')
        (0, MAXREPEAT)
        >>> regex_len('fo?o')
        (2, 3)
        >>> regex_len('mo{2,7}')
        (3, 8)
        >>> regex_len('(foo)+')
        (3, MAXREPEAT)
        >>> regex_len('s?e?q?u?e?n?c?e?')
        (0, 8)
    """
    parsed = parse(regex)
    return _regex_len_pattern(parsed)
Exemplo n.º 47
0
def regex_index(regex, index):
    """
    Index into a regex, producing a smaller regex describing what can match
    at that character position.

        >>> regex_index('test', 0)
        't'
        >>> regex_index('t?est', 0)
        '[te]'
        >>> regex_index('fa(la){2,}', 2)
        'l'
        >>> regex_index('fa(la){2,}', 6)
        'l'
        >>> regex_index('.*', 99)
        '.'
    """
    options = _regex_index_pattern(parse(regex), index)
    # Nothing can occupy this position: mirror sequence indexing semantics.
    if not options:
        raise IndexError
    if len(options) == 1:
        return unparse(options[0])
    # Several alternatives: fold them into a single branch expression.
    return round_trip(unparse((BRANCH, (None, options))))
Exemplo n.º 48
0
def compile_regexp_to_noncapturing(pattern, flags=0):
    """
    Compile the regexp pattern after switching all grouping parentheses
    in the given regexp pattern to non-capturing groups.

    :type pattern: str
    :rtype: str
    """
    def convert_regexp_to_noncapturing_parsed(parsed_pattern):
        res_data = []
        for key, value in parsed_pattern.data:
            if key == sre_constants.SUBPATTERN:
                index, subpattern = value
                value = (None, convert_regexp_to_noncapturing_parsed(subpattern))
            elif key == sre_constants.GROUPREF:
                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
            res_data.append((key, value))
        parsed_pattern.data = res_data
        parsed_pattern.pattern.groups = 1
        parsed_pattern.pattern.groupdict = {}
        return parsed_pattern

    return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)), flags=flags)
Exemplo n.º 49
0
    def ranking(item):
        """Sorting key: ranks patterns from most to least specific.

        `item` is a dict with a 'pattern' key holding a regex fragment.
        Returns a (max_width, length, alphabetical) tuple suitable for
        use as a sort key.
        """
        regex = item['pattern'] + '$'
        regex_max_width = int(sre_parse.parse(regex).getwidth()[1])

        # Capture group should not impact length
        l = re.sub(r'[()]', '', item['pattern'])
        # Two-character specifiers (e.g. "\d") should count as one character.
        # BUG FIX: the replacement was '\1' — the control character \x01,
        # not a backreference. The measured length was coincidentally the
        # same, but the intent is to keep the escaped character itself.
        l = re.sub(r'\\(\w)', r'\1', l)
        length = len(l)

        # "\d" and "\w" placed before "."
        a = re.sub(r'[\\]', u"\U0010FFFD", item['pattern'])
        # "atf" before "(atf)" with "at." last
        a = re.sub(r'[(]', '', a)
        # Make sure regex symbols sort after letters or numbers
        alphabetical = re.sub(r'[.?*)]', u"\U0010FFFF", a)

        # Patterns with unbounded repeats like \d+ or .* can match
        # arbitrarily wide strings.
        if regex_max_width >= int(sre_constants.MAXREPEAT):
            # in this case the longer string is more specific
            length = -length
        return (regex_max_width, length, alphabetical)
Exemplo n.º 50
0
def compile(p, flags=0):
    """Internal: compile a pattern string or pre-parsed SubPattern into a
    ready-to-use _sre pattern object."""
    if isstring(p):
        source = p
        p = sre_parse.parse(p, flags)
    else:
        source = None

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # Build the group-name maps in both directions: name -> index comes
    # from the parser state, index -> name is derived from it.
    groupindex = p.state.groupdict
    index_to_name = {num: name for name, num in groupindex.items()}
    indexgroup = tuple(index_to_name.get(i) for i in range(p.state.groups))

    return _sre.compile(source, flags | p.state.flags, code,
                        p.state.groups - 1, groupindex, indexgroup)
Exemplo n.º 51
0
    def transform_format_string_into_regex(self):
        """Convert the mangled format string into a compiled regex object.

        {{ ... }} placeholders are replaced with named capture groups so
        that matching a param stream against the result yields a dict of
        parameter names to values. Anchors are added only when the format
        string does not already supply them.
        """
        param_match = r'\1["\']?(?P<\2>(?:(?<=\').+?(?=\')|(?<=").+?(?=")|{.+?}|.+?))["\']?'
        regex_str = re.sub(r'(\s*)' + self._snippets['optional'],
                           r'(?:' + param_match + r')?', self._format)
        regex_str = re.sub(r'(\s*)' + self._snippets['required'], param_match,
                           regex_str)

        # Parse once so we can inspect the pattern for existing anchors.
        tokens = parse(regex_str, flags=re.DOTALL)

        # Anchor at the beginning unless the pattern already is.
        begin_anchors = ((AT, AT_BEGINNING), (AT, AT_BEGINNING_STRING))
        if not search_regex_tokens(begin_anchors, tokens):
            regex_str = r'^\s*' + regex_str

        # Anchor at the end unless the pattern already is.
        end_anchors = ((AT, AT_END), (AT, AT_END_STRING))
        if not search_regex_tokens(end_anchors, tokens, backwards=True):
            regex_str = regex_str + r'\s*$'

        return re.compile(regex_str, re.DOTALL)
Exemplo n.º 52
0
def group_names(expression):
    """Returns the names of groups in the regular expression

  Arguments
  ---------
  expression : str
    The regular expression.

  Returns
  -------
  names : list of str
      The group names in the regular expression sorted according to appearance.
  """
    # Local import keeps this helper self-contained.
    import sre_parse

    #parse regular expression for name expressions
    parsed = sre_parse.parse(expression)
    # sre_parse.SubPattern.pattern was renamed to .state in Python 3.8;
    # support both spellings so this works on old and new interpreters.
    state = getattr(parsed, 'state', None) or parsed.pattern
    gd = state.groupdict  # maps group name -> 1-based group index

    #sort names by group index, i.e. order of appearance in the pattern
    # (replaces the previous numpy.argsort round-trip with plain sorted()).
    return sorted(gd, key=gd.get)
Exemplo n.º 53
0
def regex(regex):
    """Return strategy that generates strings that match given regex.

    Accepts either a pattern string or a compiled regex (`re.compile()`).
    Flags (e.g. `re.IGNORECASE`, `re.DOTALL`, `re.UNICODE`) influence
    generation, whether passed via `re.compile()` or inline `(?iLmsux)`
    groups.

    Some tricky constructs are only partly supported: "^" and "$" do not
    affect generation; positive lookahead/lookbehind groups are treated
    as plain groups; negative lookahead/lookbehind groups do nothing;
    ternary groups ('(?(name)yes-pattern|no-pattern)') are not supported
    at all.
    """
    # Normalize: a plain string becomes a compiled pattern.
    if not hasattr(regex, 'pattern'):
        regex = re.compile(regex)

    codes = sre.parse(regex.pattern)

    # Filter through the real matcher so only genuine matches survive.
    return _strategy(codes, Context(flags=regex.flags)).filter(regex.match)
Exemplo n.º 54
0
    def __init__(self, pattern, flags=0, charset=CHARSET, max_count=None):
        """Set up enumeration of the strings matched by `pattern`.

        Parameters
        ----------
        pattern : str
            Regular expression to enumerate.
        flags : int
            `re` flags; IGNORECASE, UNICODE and LOCALE are rejected.
        charset : str
            Characters that "." may stand for (newline removed unless
            DOTALL is set).
        max_count : int or None
            Cap for unbounded repetitions; defaults to MAX_REPEAT_COUNT.

        Raises
        ------
        ParseError
            If an unsupported flag is passed.
        """
        # If the RE module cannot compile it, we give up quickly
        self.matcher = re.compile(r'(?:%s)\Z' % pattern, flags)
        # "." should not produce newlines unless DOTALL was requested.
        if not flags & re.DOTALL:
            charset = ''.join(c for c in charset if c != '\n')
        self.charset = charset

        # Group name -> group number, taken from the compiled pattern.
        self.named_group_lookup = self.matcher.groupindex

        # These flags change which strings match in ways this enumerator
        # does not model, so reject them outright.
        if flags & re.IGNORECASE:
            raise ParseError(
                'Flag "i" not supported. https://github.com/google/sre_yield/issues/4'
            )
        elif flags & re.UNICODE:
            raise ParseError(
                'Flag "u" not supported. https://github.com/google/sre_yield/issues/3'
            )
        elif flags & re.LOCALE:
            raise ParseError(
                'Flag "l" not supported. https://github.com/google/sre_yield/issues/5'
            )

        # Cap for unbounded repetitions ("*", "+", "{n,}").
        if max_count is None:
            self.max_count = MAX_REPEAT_COUNT
        else:
            self.max_count = max_count

        # NOTE(review): presumably flipped elsewhere (self.groupref?) when a
        # backreference is encountered — confirm against the full class.
        self.has_groupref = False

        # Configure the parser backends
        # Maps each sre opcode to a callable producing its possible values.
        # NOTE(review): `xrange` is presumably a py2/py3 compat alias defined
        # elsewhere in this module — confirm.
        self.backends = {
            sre_constants.LITERAL:
            lambda y: [chr(y)],
            sre_constants.RANGE:
            lambda l, h: [chr(c) for c in xrange(l, h + 1)],
            sre_constants.SUBPATTERN:
            self.maybe_save,
            sre_constants.BRANCH:
            self.branch_values,
            sre_constants.MIN_REPEAT:
            self.max_repeat_values,
            sre_constants.MAX_REPEAT:
            self.max_repeat_values,
            sre_constants.AT:
            self.empty_list,
            sre_constants.ASSERT:
            self.empty_list,
            sre_constants.ASSERT_NOT:
            self.empty_list,
            sre_constants.ANY:
            lambda _: self.in_values(((sre_constants.NEGATE, ), )),
            sre_constants.IN:
            self.in_values,
            sre_constants.NOT_LITERAL:
            self.not_literal,
            sre_constants.CATEGORY:
            self.category,
            sre_constants.GROUPREF:
            self.groupref,
        }
        # Now build a generator that knows all possible patterns
        self.raw = self.sub_values(sre_parse.parse(pattern, flags))
        # Configure this class instance to know about that result
        self.length = self.raw.__len__()
Exemplo n.º 55
0
 def max_width(self):
     """Return the upper bound on match length of this object's regex."""
     widths = sre_parse.parse(self.to_regexp()).getwidth()
     return widths[1]
Exemplo n.º 56
0
    def get_extracted_param_value(self):
        """
        Match command against the format string and extract parameters from the command string.

        :return: mapping of parameter names to their extracted string values.
        :rtype: ``dict``

        :raises ParseException: if the command doesn't match the format
            string, or nothing was supplied and no defaults exist.
        """
        result = {}

        param_stream = self._param_stream

        # As there's a lot of questions about using regular expressions,
        # I'll try to be thorough when documenting this code.

        # I'll split the whole convoluted regex into snippets to make it
        # a bit more readable (hopefully).
        snippets = dict()

        # Formats for keys and values: key is a non-spaced string,
        # value is anything in quotes or curly braces, or a single word.
        snippets['key'] = r'\s*(\S+?)\s*'
        snippets['value'] = r'""|\'\'|"(.+?)"|\'(.+?)\'|({.+?})|(\S+)'

        # Extended value: also matches unquoted text (caution).
        snippets['ext_value'] = r'""|\'\'|"(.+?)"|\'(.+?)\'|({.+?})|(.+?)'

        # Key-value pair:
        snippets['pairs'] = r'(?:^|\s+){key}=({value})'.format(**snippets)

        # End of string: multiple space-separated key-value pairs:
        snippets['ending'] = r'.*?(({pairs}\s*)*)$'.format(**snippets)

        # Default value in optional parameters:
        snippets['default'] = r'\s*=\s*(?:{ext_value})\s*'.format(**snippets)

        # Optional parameter (has a default value):
        snippets[
            'optional'] = '{{' + snippets['key'] + snippets['default'] + '}}'

        # Required parameter (no default value):
        snippets['required'] = '{{' + snippets['key'] + '}}'

        # 1. Matching the arbitrary key-value pairs at the end of the command
        # to support extra parameters (not specified in the format string),
        # and cutting them from the command string afterwards.
        ending_pairs = re.match(snippets['ending'], param_stream, re.DOTALL)
        has_ending_pairs = ending_pairs and ending_pairs.group(1)
        if has_ending_pairs:
            kv_pairs = re.findall(snippets['pairs'], ending_pairs.group(1),
                                  re.DOTALL)
            param_stream = param_stream.replace(ending_pairs.group(1), '')
        # Pad the stream with spaces — presumably so the whitespace groups
        # in the generated regex can match at the string edges (confirm).
        param_stream = " %s " % (param_stream)

        # 2. Matching optional parameters (with default values).
        optional = re.findall(snippets['optional'], self._format, re.DOTALL)

        # Transforming our format string into a regular expression,
        # substituting {{ ... }} with regex named groups, so that param_stream
        # matched against this expression yields a dict of params with values.
        param_match = r'\1["\']?(?P<\2>(?:(?<=\').+?(?=\')|(?<=").+?(?=")|{.+?}|.+?))["\']?'
        reg = re.sub(r'(\s*)' + snippets['optional'],
                     r'(?:' + param_match + r')?', self._format)
        reg = re.sub(r'(\s*)' + snippets['required'], param_match, reg)

        # Parse the generated pattern so we can check for existing anchors.
        reg_tokens = parse(reg, flags=re.DOTALL)

        # Add a beginning anchor if none exists
        if not search_regex_tokens(
            ((AT, AT_BEGINNING), (AT, AT_BEGINNING_STRING)), reg_tokens):
            reg = r'^\s*' + reg

        # Add an ending anchor if none exists
        if not search_regex_tokens(
            ((AT, AT_END), (AT, AT_END_STRING)), reg_tokens, backwards=True):
            reg = reg + r'\s*$'

        # 3. Matching the command against our regex to get the param values
        matched_stream = re.match(reg, param_stream, re.DOTALL)

        if not matched_stream:
            # If no match is found we throw since this indicates provided user string (command)
            # didn't match the provided format string
            raise ParseException(
                'Command "%s" doesn\'t match format string "%s"' %
                (self._param_stream, self._format))

        # Compiling results from the steps 1-3.
        if matched_stream:
            result = matched_stream.groupdict()

        # Fill optional parameters with their defaults when no value matched.
        for param in optional:
            matched_value = result[param[0]] if matched_stream else None
            matched_result = matched_value or ''.join(param[1:])
            if matched_result is not None:
                result[param[0]] = matched_result

        # Merge the extra key=value pairs that were cut off in step 1.
        if has_ending_pairs:
            for pair in kv_pairs:
                result[pair[0]] = ''.join(pair[2:])

        if self._format and not (self._param_stream.strip()
                                 or any(result.values())):
            raise ParseException(
                'No value supplied and no default value found.')

        return result
Exemplo n.º 57
0
def re_compile(expr):
    """NOT_RPYTHON"""
    # Parse once, then return the low-level opcode list together with the
    # total group count (including group 0) for the interpreter-level matcher.
    pattern = sre_parse.parse(expr)
    # sre_parse.SubPattern.pattern was renamed to .state in CPython 3.8;
    # support both spellings.
    state = getattr(pattern, 'state', None) or pattern.pattern
    return (sre_compile._code(pattern, 0), state.groups)
Exemplo n.º 58
0
def get_regexp_width(regexp):
    """Return the (min, max) match width of `regexp`.

    Raises ValueError if `regexp` is not a valid regular expression.
    """
    try:
        parsed = sre_parse.parse(regexp)
    except sre_constants.error:
        raise ValueError(regexp)
    return parsed.getwidth()
Exemplo n.º 59
0
def base_regex_strategy(regex, parsed=None):
    """Build the core generation strategy for a compiled regex,
    parsing its pattern first if a parse tree was not supplied."""
    if parsed is None:
        parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
    is_unicode = isinstance(regex.pattern, str)
    strategy = _strategy(parsed, Context(flags=regex.flags), is_unicode)
    return clear_cache_after_draw(strategy)
Exemplo n.º 60
0
def PrintRegex(pat):
    """Parse `pat` and pretty-print its sre parse tree inside brackets."""
    tree = sre_parse.parse(pat)
    print('\t\t[')
    PrintTree(tree)
    print('\t\t]')