def _regex_optional(defn, defs, grpid):
    """Convert an optional expression (``e?``) into a regex where possible.

    The operand ``defn.args[0]`` is first converted with ``_regex``. If the
    result is a regex, ``?`` is appended to its pattern and a single regex is
    returned; otherwise the converted operand is wrapped in ``Optional``.
    """
    subdef = defn.args[0]
    d = _regex(subdef, defs, grpid)
    if d.op != RGX:
        return Optional(d)
    # A regex quantifier binds only to the atom immediately before it, so the
    # pattern may be left bare only when it is a single atom: a dot, a
    # character class, or a literal of exactly one character. A longer
    # literal such as "ab" would otherwise produce 'ab?' ('a' followed by an
    # optional 'b'), and an empty literal would produce the invalid
    # pattern '?'. Everything else is wrapped in a non-capturing group.
    atomic = (subdef.op in (DOT, CLS)
              or (subdef.op == LIT and len(subdef.args[0]) == 1))
    subpat = d.args[0] if atomic else f'(?:{d.args[0]})'
    return Regex(f'{subpat}?')
def _regex_plus(defn, defs, grpid):
    """Convert a one-or-more expression (``e+``) into a regex where possible.

    The operand ``defn.args[0]`` is first converted with ``_regex``. If the
    result is a regex, a single regex is returned in which the repetition is
    captured inside a lookahead and then consumed through a backreference;
    otherwise the converted operand is wrapped in ``Plus``.

    ``grpid`` is an iterator that supplies the number used to build a fresh
    group name for the lookahead capture.
    """
    subdef = defn.args[0]
    d = _regex(subdef, defs, grpid)
    if d.op != RGX:
        return Plus(d)
    # A regex quantifier binds only to the atom immediately before it, so the
    # pattern may be left bare only when it is a single atom: a dot, a
    # character class, or a literal of exactly one character. A longer
    # literal such as "ab" would otherwise produce 'ab+' ('a' followed by
    # one or more 'b'), and an empty literal would produce the invalid
    # pattern '+'. Everything else is wrapped in a non-capturing group.
    atomic = (subdef.op in (DOT, CLS)
              or (subdef.op == LIT and len(subdef.args[0]) == 1))
    subpat = d.args[0] if atomic else f'(?:{d.args[0]})'
    gid = f'_{next(grpid)}'
    # The lookahead captures the greedy repetition and the backreference then
    # consumes exactly the captured text.
    return Regex(f'(?=(?P<{gid}>{subpat}+))(?P={gid})')
def _regex_sequence(defn, defs, grpid):
    """Convert a sequence, merging adjacent regex items into one regex.

    Every item of ``defn.args[0]`` is converted with ``_regex``. Consecutive
    results whose op is ``RGX`` are concatenated into a single ``Regex``;
    all other results are kept as they are, in their original order. The
    resulting items are returned wrapped in ``Sequence``.
    """
    converted = [_regex(item, defs, grpid) for item in defn.args[0]]
    merged = []
    for op, run in groupby(converted, key=lambda d: d.op):
        if op != RGX:
            merged.extend(run)
            continue
        # only join regexes in sequence if unstructured
        joined = ''.join(rgx.args[0] for rgx in run)
        merged.append(Regex(joined))
    return Sequence(*merged)
def _regex_choice(defn, defs, grpid):
    """Convert an ordered choice, merging adjacent regex alternatives.

    Every alternative in ``defn.args[0]`` is converted with ``_regex``. A run
    of two or more consecutive ``RGX`` results is combined into one regex:
    the alternatives are joined with ``|`` inside a named group captured by
    a lookahead, which is then consumed through a backreference. Single
    regexes and non-regex results are kept unchanged. The resulting items
    are returned wrapped in ``Choice``.

    ``grpid`` is an iterator that supplies the number used to build a fresh
    group name for each merged run.
    """
    alternatives = [_regex(alt, defs, grpid) for alt in defn.args[0]]
    merged = []
    for op, run in groupby(alternatives, key=lambda d: d.op):
        run = list(run)
        if op != RGX or len(run) < 2:
            # Nothing to combine: keep the item(s) as converted.
            merged.extend(run)
            continue
        gid = f'_{next(grpid)}'
        alternation = '|'.join(rgx.args[0] for rgx in run)
        merged.append(Regex(f'(?=(?P<{gid}>{alternation}))(?P={gid})'))
    return Choice(*merged)
def test_regex_not_dot():
    """Check the regexes expected for a negative lookahead followed by a dot."""
    # Without ``common=True`` the lookahead and the dot are expected verbatim.
    actual = rload(r'A <- !"a" .')
    expected = grm({'A': Regex(r'(?!a)(?s:.)')})
    assert actual == expected

    # With ``common=True`` a negated character class is expected instead.
    actual = rload(r'A <- !"a" .', common=True)
    expected = grm({'A': Regex(r'[^a]')})
    assert actual == expected

    actual = rload(r'A <- !"\\" .', common=True)
    expected = grm({'A': Regex(r'[^\\]')})
    assert actual == expected

    actual = rload(r'A <- ![\\] .', common=True)
    expected = grm({'A': Regex(r'[^\\]')})
    assert actual == expected

    actual = rload(r'A <- ![abc] .', common=True)
    expected = grm({'A': Regex(r'[^abc]')})
    assert actual == expected

    # The negated class is itself quantifiable when the pair is repeated.
    actual = rload(r'A <- (![abc] .)*', common=True)
    expected = grm({'A': Regex(r'(?=(?P<_1>[^abc]*))(?P=_1)')})
    assert actual == expected
def test_regex():
    """Check the regexes expected for a range of grammar expressions."""
    # Plain terminals and sequences of terminals.
    actual = rload(r'A <- "a"')
    expected = grm({'A': Regex(r'a')})
    assert actual == expected

    actual = rload(r'A <- "a" [bc]')
    expected = grm({'A': Regex(r'a[bc]')})
    assert actual == expected

    # A capture wraps the regex rather than being merged into it.
    actual = rload(r'A <- ~("a" [bc])')
    expected = grm({'A': Capture(Regex(r'a[bc]'))})
    assert actual == expected

    # A nonterminal reference stays a separate item of the sequence.
    actual = rload(r'A <- "a" B B <- [bc]')
    expected = grm({
        'A': Sequence(Regex('a'), Nonterminal('B')),
        'B': Regex('[bc]'),
    })
    assert actual == expected

    # Repetitions use a lookahead capture followed by a backreference.
    actual = rload(r'A <- .* "a"')
    expected = grm({'A': Regex(r'(?=(?P<_1>(?s:.)*))(?P=_1)a')})
    assert actual == expected

    actual = rload(r'A <- "a"* [bc]+')
    expected = grm(
        {'A': Regex(r'(?=(?P<_1>a*))(?P=_1)(?=(?P<_2>[bc]+))(?P=_2)')})
    assert actual == expected

    actual = rload(r'A <- "a" ~([bc] / "d")*')
    expected = grm({
        'A': Sequence(
            Regex(r'a'),
            Capture(
                Regex(r'(?=(?P<_2>(?:(?=(?P<_1>[bc]|d))(?P=_1))*))(?P=_2)'))),
    })
    assert actual == expected

    # Choices between regexes are merged; a capture keeps them separate.
    actual = rload(r'A <- "ab" / "abc"')
    expected = grm({'A': Regex(r'(?=(?P<_1>ab|abc))(?P=_1)')})
    assert actual == expected

    actual = rload(r'A <- "a"* / ~"b"')
    expected = grm(
        {'A': Choice(Regex(r'(?=(?P<_1>a*))(?P=_1)'), Capture(Regex(r'b')))})
    assert actual == expected
def _regex_not(defn, defs, grpid):
    """Convert a negative lookahead (``!e``) into a regex where possible.

    The operand ``defn.args[0]`` is converted with ``_regex``. A regex result
    is wrapped in ``(?!...)``; any other result is wrapped in ``Not``.
    """
    operand = _regex(defn.args[0], defs, grpid)
    if operand.op != RGX:
        return Not(operand)
    pattern = operand.args[0]
    return Regex(f'(?!{pattern})')
def _regex_and(defn, defs, grpid):
    """Convert a positive lookahead (``&e``) into a regex where possible.

    The operand ``defn.args[0]`` is converted with ``_regex``. A regex result
    is wrapped in ``(?=...)``; any other result is wrapped in ``And``.
    """
    operand = _regex(defn.args[0], defs, grpid)
    if operand.op != RGX:
        return And(operand)
    pattern = operand.args[0]
    return Regex(f'(?={pattern})')
def _regex_class(defn, defs, grpid):
    """Build a regex character class from a class definition.

    ``defn.args[0]`` is iterated as ``(a, b)`` pairs: a pair with a truthy
    ``b`` becomes the escaped range ``a-b``, otherwise only the escaped ``a``
    is emitted. A truthy ``defn.args[1]`` negates the class with a leading
    ``^``. ``defs`` and ``grpid`` are not used.
    """
    negation = '^' if defn.args[1] else ''
    members = []
    for start, end in defn.args[0]:
        if end:
            members.append(f'{re.escape(start)}-{re.escape(end)}')
        else:
            members.append(re.escape(start))
    body = ''.join(members)
    return Regex(f'[{negation}{body}]')
def _regex_literal(defn, defs, grpid):
    """Build a regex matching the literal ``defn.args[0]`` exactly.

    The literal is passed through ``re.escape`` so that regex metacharacters
    in it are matched verbatim. ``defs`` and ``grpid`` are not used.
    """
    escaped = re.escape(defn.args[0])
    return Regex(escaped)
def _regex_dot(defn, defs, grpid):
    """Build the regex used for the dot (any character) expression.

    The inline ``s`` flag makes ``.`` match newlines as well. None of the
    arguments are used.
    """
    any_char = '(?s:.)'
    return Regex(any_char)
def test_Regex():
    """``Regex`` should equal an ``Op.RGX`` ``Def`` of ``(pattern, flags)``."""
    # Flags are expected to be 0 when not given.
    without_flags = Regex('foo')
    assert without_flags == Def(Op.RGX, ('foo', 0))

    with_flags = Regex('foo', flags=1)
    assert with_flags == Def(Op.RGX, ('foo', 1))