def test_bad_reduction_bug():
    # DEFECT: "0{2}|1{2}" was erroneously reduce()d to "[01]{2}"
    bad = parse("0{2}|1{2}").to_fsm({"0", "1", fsm.anything_else})
    assert bad.accepts("00")
    assert bad.accepts("11")
    assert not bad.accepts("01")
    assert str(parse("0|[1-9]|ab").reduce()) == "\\d|ab"
def test_bug_36_2():
    etc1 = parse("/etc/.*").to_fsm()
    etc2 = parse("/etc/something.*").to_fsm()
    assert etc1.accepts("/etc/something")
    assert etc2.accepts("/etc/something")
    assert not etc1.isdisjoint(etc2)
    assert not etc2.isdisjoint(etc1)
def format_check_orm_regex(instance):
    # pylint:disable=broad-except
    try:
        lego.parse(instance)
    except Exception:
        return False
    return True
def test_special_cases_for_charclass():
    # A hyphen in the first or last position of a character class is literal
    a = parse('[- ]')
    assert a.matches('-')
    assert a.matches(' ')

    a = parse('[ -]')
    assert a.matches('-')
    assert a.matches(' ')
def is_string_subtype(s1, s2):
    if s2.get("type") != "string":
        return False
    #
    s1 = JsonString(s1)
    s2 = JsonString(s2)
    #
    # uninhabited = handle_uninhabited_types(s1, s2)
    # if uninhabited != None:
    #     return uninhabited
    #
    is_sub_interval = is_sub_interval_from_optional_ranges(
        s1.min, s1.max, s2.min, s2.max)
    if not is_sub_interval:
        return False
    #
    # At this point, length is compatible,
    # so we now only need to worry about the pattern.
    if s1.pattern == s2.pattern:
        return True
    elif s2.pattern is None or s2.pattern == "":
        return True
    elif s1.pattern is None or s1.pattern == "":
        return False
    else:
        # s1 <: s2 iff L(s1) & complement(L(s2)) is empty
        regex1 = parse(s1.pattern)
        regex2 = parse(s2.pattern)
        result = regex1 & regex2.everythingbut()
        return result.empty()
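# Illustrative sketch (not from the sources above): the core inclusion test
# that is_string_subtype() reduces to, assuming greenery's lego API.
# `pattern_included` is a hypothetical helper: L(p1) <= L(p2) iff
# L(p1) & complement(L(p2)) is empty.
from greenery.lego import parse

def pattern_included(p1, p2):
    return (parse(p1) & parse(p2).everythingbut()).empty()

assert pattern_included("a{2,3}", "a+")
assert not pattern_included("a*", "a+")  # "" matches a* but not a+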
def test_bug_36_1():
    etc1 = parse(".*").to_fsm()
    etc2 = parse("s.*").to_fsm()
    assert etc1.accepts("s")
    assert etc2.accepts("s")
    assert not etc1.isdisjoint(etc2)
    assert not etc2.isdisjoint(etc1)
def test_parse_anchors():
    assert str(parse(r"\ba\b")) == r"\ba\b"
    assert str(parse(r"^a$")) == r"^a$"
    assert str(parse(r"\Aa\Z")) == r"\Aa\Z"
    assert str(parse(r"\Ga\z")) == r"\Ga\z"
    a = parse(r"^a$")
    mults = list(list(a.concs)[0].mults)
    assert mults[0] == caret
    assert mults[2] == dollar
def regex_isProperSubset(s1, s2):
    '''Regex proper subset is quite expensive to compute,
    so we break it into two separate checks and do the more
    expensive one only if the cheaper one passes first.'''
    s1 = parse(s1).reduce()
    s2 = parse(s2).reduce()
    if not s1.equivalent(s2):
        return (s1 & s2.everythingbut()).empty()
    return False
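# Usage sketch for regex_isProperSubset() above (assumes the greenery
# parse API imported by the surrounding code):
assert regex_isProperSubset("a", "a?")            # strict inclusion
assert not regex_isProperSubset("a?", "a{0,1}")   # equivalent, so not proper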
def test_new_reduce():
    # The @reduce_after decorator has been removed from many methods since it
    # takes unnecessary time which the user may not wish to spend.
    # This alters the behaviour of several methods and also exposes a new
    # opportunity for conc.reduce()
    assert conc.parse("a()").reduce() == charclass.parse("a")
    assert conc.parse("a()()").reduce() == charclass.parse("a")
    assert conc.parse("a.b()()").reduce() == conc.parse("a.b")
    assert str(parse("a.b()()")) == "a.b()()"
    assert str(parse("a.b()()").reduce()) == "a.b"
def test_hex_escapes():
    # Should be able to parse e.g. "\\x40"
    assert parse("\\x00") == parse("\x00")
    assert parse("\\x40") == parse("@")
    assert parse("[\\x40]") == parse("[@]")
    assert parse("[\\x41-\\x5a]") == parse("[A-Z]")

    # escape sequences are not preserved
    assert str(parse("\\x09")) == "\\t"

    # Printing ASCII control characters? You should get hex escapes
    assert str(parse("\\x00")) == "\\x00"
def regex_meet(s1, s2):
    if s1 and s2:
        ret = parse(s1) & parse(s2)
        return str(ret.reduce()) if not ret.empty() else None
    elif s1:
        return s1
    elif s2:
        return s2
    else:
        return None
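# Usage sketch for regex_meet() above. The exact reduced string returned by
# greenery may vary, so the example checks language equivalence instead.
meet = regex_meet("[ab]+", "[bc]+")    # the intersection language is b+
assert parse(meet).equivalent(parse("b+"))
assert regex_meet("a", "b") is None    # disjoint patterns meet to None
assert regex_meet("a*", None) == "a*"  # a missing side passes the other through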
def test_fsm():
    # You should be able to to_fsm() a single lego piece without supplying a
    # specific alphabet. That should be determinable from context.
    assert str(from_fsm(parse("a.b").to_fsm())) == "a.b"  # not "a[ab]b"

    # A suspiciously familiar example
    bad = parse("0{2}|1{2}").to_fsm()
    assert bad.accepts("00")
    assert bad.accepts("11")
    assert not bad.accepts("01")
    assert str(parse("0|[1-9]|ab").reduce()) == "\\d|ab"
def test_silly_reduction():
    # This one is horrendous and we have to jump through some hoops to get to
    # a sensible result. Probably not a good unit test actually.
    long = \
        "(aa|bb*aa)a*|((ab|bb*ab)|(aa|bb*aa)a*b)((ab|bb*ab)|(aa|bb*aa)a*b)*" + \
        "(aa|bb*aa)a*|((ab|bb*ab)|(aa|bb*aa)a*b)((ab|bb*ab)|(aa|bb*aa)a*b)*"
    long = parse(long)
    long = reversed(long.to_fsm())
    long = reversed(from_fsm(long))
    assert str(long) == "[ab]*a[ab]"

    short = "[ab]*a?b*|[ab]*b?a*"
    assert str(parse(".*") & parse(short)) == "[ab]*"
def verify(subregex, supregex):
    # e.g. supregex: "\d", subregex: "\d{3}"
    # Wrap both in ".*" so containment (not full match) is compared.
    p_subregex = parse(".*" + subregex + ".*")
    p_supregex = parse(".*" + supregex + ".*")
    s = p_subregex & (p_supregex.everythingbut())
    if s.empty():
        print("Verified" + "------ " + subregex + " " + supregex)
    else:
        print("Not pass!" + "------ " + subregex + " " + supregex)
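# Usage sketch for verify() above:
verify(r"\d{3}", r"\d")  # "Verified": a string containing three digits contains one
verify(r"\d", r"\d{3}")  # "Not pass!": e.g. "1" has a digit but no 3-digit run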
def test_fsm():
    # You should be able to to_fsm() a single lego piece without supplying a
    # specific alphabet. That should be determinable from context.
    assert parse("a.b").to_fsm().accepts("acb")

    bad = parse("0{2}|1{2}").to_fsm({"0", "1", fsm.anything_else})
    assert bad.accepts("00")
    assert bad.accepts("11")
    assert not bad.accepts("01")

    bad = parse("0{2}|1{2}").to_fsm()
    assert bad.accepts("00")
    assert bad.accepts("11")
    assert not bad.accepts("01")
def difference(self, other):
    """Find the difference of two regexps.

    This method uses the greenery library to find and reduce the
    difference pattern between two regexes.

    * If `other` is a string, a Python dict or a FiniteSet, it is
      converted to a regex pattern, after which it is parsed by the
      `greenery.lego.parse` method and its difference with the pattern
      of `self` is found. See more details here:
      https://github.com/qntm/greenery
    * If `other` is an instance of `EmptySet`, the difference is a copy
      of `self`.
    * If `other` is an instance of `UniversalSet`, the difference is an
      instance of `EmptySet`.

    Parameters
    ----------
    other : set, str, re._pattern_type, RegexSet

    Returns
    -------
    result : RegexSet
        The difference set
    """
    if self.pattern is None:
        return RegexSet.empty()

    other_exp = []
    if isinstance(other, set):
        for exp in other:
            exp_str = _regex_to_string(exp)
            if exp_str is not None:
                other_exp.append(parse(exp_str))
    else:
        other_str = _regex_to_string(other)
        if other_str is not None:
            other_exp.append(parse(other_str))
        else:
            return self.copy()

    complement_exp = parse(self.pattern)
    for exp in other_exp:
        complement_exp = complement_exp.difference(exp)

    return RegexSet(str(complement_exp.reduce()))
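# Sketch of the greenery operation that difference() above delegates to
# (the RegexSet plumbing is project-specific and omitted here):
diff = parse("[ab]+").difference(parse("b+"))
assert diff.matches("a") and diff.matches("ab")
assert not diff.matches("bb")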
def make_regex(pattern):
    if pattern is None:
        return None

    expression = []
    for n, element in enumerate(pattern):
        if element:
            if len(element) > 1:
                expression.append('[{}]'.format(''.join(element)))
            else:
                expression.append(element)
        else:
            # hack with completely optional None characters
            # (specifying length may yield an incorrect pattern)
            expression.append('.*')

    # no_end_chars = {'$'}
    # if expression[-1] not in no_end_chars:
    #     expression.append('$')

    # optimise the expression with lego
    expression = lego.parse(''.join(expression))
    return expression
def test_statement_regex_mutual_exclusivity():
    fsa_list = [
        lego.parse(deverbosify(module._STATEMENT_REGEX.pattern))
        for module in PROBE_MODULES
    ]
    for fsa1, fsa2 in itertools.combinations(fsa_list, 2):
        yield assert_non_overlapping, fsa1, fsa2
def create_pfsm_from_fsm(self):
    fsm_obj = parse(self.reg_exp).to_fsm()

    self.alphabet = list(
        set([str(i) for i in list(fsm_obj.alphabet)]) - set(['anything_else'])
    )

    states = list(fsm_obj.states)
    self.add_states(states)

    initials = [fsm_obj.initial]
    I = [
        np.log(1 / len(initials)) if state in initials else LOG_EPS
        for state in self.states
    ]
    self.set_I(I)
    self.I_backup = self.I.copy()

    finals = list(fsm_obj.finals)
    F = [
        np.log(self.STOP_P) if state in finals else LOG_EPS
        for state in self.states
    ]
    self.set_F(F)
    self.F_backup = self.F.copy()

    transitions = fsm_obj.map
    for state_i in transitions:
        trans = transitions[state_i]
        for symbol in list(trans):
            if str(symbol) == 'anything_else':
                del trans[symbol]
        transitions[state_i] = trans

    for state_i in transitions:
        trans = transitions[state_i]
        state_js = np.array(list(trans.values()))
        if len(state_js) == 0:
            self.F[state_i] = 0.
        else:
            symbols_js = np.array(list(trans.keys()))
            if self.F[state_i] != LOG_EPS:
                probs = np.array([
                    (1.0 - np.exp(self.F[state_i])) / len(symbols_js)
                    for i in range(len(symbols_js))
                ])
            else:
                probs = np.array([
                    1.0 / len(symbols_js) for i in range(len(symbols_js))
                ])
            for state_j in np.unique(state_js):
                idx = np.where(state_js == state_j)[0]
                symbols = list(symbols_js[idx])
                self.add_transitions(state_i, state_j, symbols,
                                     list(probs[idx]))
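# Sketch of the FSM layout the method above iterates over (assuming
# greenery's fsm objects): `map` is a dict of state -> {symbol: next_state},
# and the synthetic "anything_else" symbol is filtered out before transition
# probabilities are assigned.
fsm_obj = parse("ab*").to_fsm()
for state_i, trans in fsm_obj.map.items():
    symbols = [sym for sym in trans if str(sym) != 'anything_else']
    print(state_i, symbols, state_i in fsm_obj.finals)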
def regex_isSubset(s1, s2):
    '''regex subset is quite expensive to compute,
    especially for complex patterns.'''
    if s1 and s2:
        s1 = parse(s1).reduce()
        s2 = parse(s2).reduce()
        try:
            s1.cardinality()
            s2.cardinality()
            return set(s1.strings()).issubset(s2.strings())
        except OverflowError:
            return s1.equivalent(s2) or (s1 & s2.everythingbut()).empty()
    elif s1:
        return True
    elif s2:
        return False
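# Usage sketch for regex_isSubset() above: finite languages take the
# string-enumeration fast path; infinite ones (cardinality() raises
# OverflowError) fall back to the equivalence/everythingbut check.
assert regex_isSubset("ab|cd", "ab|cd|ef")  # finite fast path
assert regex_isSubset("a{2,}", "a+")        # infinite, FSM-based check
assert not regex_isSubset("a+", "a{2,}")    # "a" is the witness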
def test_complexify():
    # Complexify!
    gen = (parse("[bc]*[ab]*") & parse("[ab]*[bc]*")).strings()
    assert next(gen) == ""
    assert next(gen) == "a"
    assert next(gen) == "b"
    assert next(gen) == "c"
    assert next(gen) == "aa"
    assert next(gen) == "ab"
    # no "ac"
    assert next(gen) == "ba"
    assert next(gen) == "bb"
    assert next(gen) == "bc"
    # no "ca"
    assert next(gen) == "cb"
    assert next(gen) == "cc"
    assert next(gen) == "aaa"
def test_block_comment_regex():
    # I went through several incorrect regexes for C block comments. Here we
    # show why the first few attempts were incorrect.
    a = parse("/\\*(([^*]|\\*+[^*/])*)\\*/")
    assert a.matches("/**/")
    assert not a.matches("/***/")
    assert not a.matches("/****/")

    b = parse("/\\*(([^*]|\\*[^/])*)\\*/")
    assert b.matches("/**/")
    assert not b.matches("/***/")
    assert b.matches("/****/")

    c = parse("/\\*(([^*]|\\*+[^*/])*)\\*+/")
    assert c.matches("/**/")
    assert c.matches("/***/")
    assert c.matches("/****/")
def test_everythingbut():
    # Regexes are usually gibberish but we make a few claims
    a = parse("a")
    notA = a.everythingbut().to_fsm()
    assert notA.accepts("")
    assert not notA.accepts("a")
    assert notA.accepts("aa")

    # everythingbut(), called twice, should take us back to where we started.
    beer = parse("beer")
    notBeer = beer.everythingbut()
    beer2 = notBeer.everythingbut()
    assert str(beer2) == "be{2}r"

    # ".*" becomes "[]" and vice versa under this call.
    everything = parse(".*")
    assert str(everything.everythingbut()) == str(nothing)
    assert str(nothing.everythingbut()) == str(everything)
def test_charclass_str():
    assert str(w) == "\\w"
    assert str(d) == "\\d"
    assert str(s) == "\\s"
    assert str(charclass("a")) == "a"
    assert str(charclass("{")) == "\\{"
    assert str(charclass("\t")) == "\\t"
    assert str(charclass("ab")) == "[ab]"
    assert str(charclass("a{")) == "[a{]"
    assert str(charclass("a\t")) == "[\\ta]"
    assert str(charclass("a-")) == "[\\-a]"
    assert str(charclass("a[")) == "[\\[a]"
    assert str(charclass("a]")) == "[\\]a]"
    assert str(charclass("ab")) == "[ab]"
    assert str(charclass("abc")) == "[abc]"
    assert str(charclass("abcd")) == "[a-d]"
    assert str(charclass("abcdfghi")) == "[a-df-i]"
    assert str(charclass("^")) == "^"
    assert str(charclass("\\")) == "\\\\"
    assert str(charclass("a^")) == "[\\^a]"
    assert str(charclass("0123456789a")) == "[0-9a]"
    assert str(charclass("\t\v\r A")) == "[\\t\\v\\r A]"
    assert str(charclass("\n\f A")) == "[\\n\\f A]"
    assert str(charclass("\t\n\v\f\r A")) == "[\\t-\\r A]"
    assert str(charclass(
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz|"
    )) == "[0-9A-Z_a-z|]"
    assert str(W) == "\\W"
    assert str(D) == "\\D"
    assert str(S) == "\\S"
    assert str(dot) == "."
    assert str(~charclass("")) == "."
    assert str(~charclass("a")) == "[^a]"
    assert str(~charclass("{")) == "[^{]"
    assert str(~charclass("\t")) == "[^\\t]"
    assert str(~charclass("^")) == "[^\\^]"

    # Arbitrary ranges
    assert str(parse("[\\w:;<=>?@\\[\\\\\\]\\^`]")) == "[0-z]"
    # TODO: what if \d is a proper subset of `chars`?

    # escape sequences are not preserved
    assert str(parse("\\x09")) == "\\t"

    # Printing ASCII control characters? You should get hex escapes
    assert str(parse("\\x00")) == "\\x00"
def issubset(regex1, regex2):
    # Return "Verified" if regex1 is a subset of regex2, i.e. every string
    # matched by regex1 can also be matched by regex2; otherwise return a
    # counterexample string (or "Timeout" if none was produced in time).
    subregex = parse(regex1)
    supregex = parse(regex2)
    s = subregex & (supregex.everythingbut())
    start = time.time()
    elapsed = 0
    if s.empty():
        return "Verified"
    else:
        # "Timeout" is the sentinel: loop until a real counterexample is
        # generated or the 30-second budget is spent.
        counterexample = "Timeout"
        while elapsed < 30:
            if counterexample != "Timeout":
                break
            generator = s.strings()
            counterexample = next(generator)
            elapsed = time.time() - start
        return str(counterexample)
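# Usage sketch for issubset() above: on failure it returns the shortest
# available counterexample drawn from subregex & ~supregex.
assert issubset("a{2}", "a+") == "Verified"
assert issubset("a+", "a{2}") == "a"  # "a" matches a+ but not a{2}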
def test_wildcard_generator():
    # Generator needs to handle wildcards as well. Wildcards come last.
    gen = parse("a.b").strings(otherchar="*")
    assert next(gen) == "aab"
    assert next(gen) == "abb"
    assert next(gen) == "a*b"
    try:
        next(gen)
        assert False
    except StopIteration:
        assert True
def regex_isSubset(s1, s2):
    '''regex subset is quite expensive to compute,
    especially for complex patterns.'''
    if s1 and s2:
        s1 = parse(s1).reduce()
        s2 = parse(s2).reduce()
        try:
            s1.cardinality()
            s2.cardinality()
            return set(s1.strings()).issubset(s2.strings())
        except OverflowError:
            # at least one language is infinite; fall back to FSM checks
            return s1.equivalent(s2) or (s1 & s2.everythingbut()).empty()
        except Exception as e:
            # greenery throws bare Exceptions in some cases, e.g.
            # `raise Exception("Please choose an 'otherchar'")` -- see
            # https://github.com/qntm/greenery/blob/master/greenery/lego.py
            exit_with_msg("regex failure from greenery", e)
    elif s1:
        return True
    elif s2:
        return False
def issubset(self, other):
    """Test regexp inclusion relation.

    Tests if the set defined by `self` is included in the set defined
    by `other`.

    Parameters
    ----------
    other : set, str, re._pattern_type, RegexSet
        Another regex to test inclusion.

    Returns
    -------
    `True` if `self` defines a subset of `other`, `False` otherwise

    Raises
    ------
    AttributeSetError
        If the type of `other` is not recognized.
    """
    if self.pattern is None:
        return True
    else:
        self_exp = parse(self.pattern)

        def included(a):
            if isinstance(a, str):
                other_exp = parse(a)
            elif isinstance(a, re._pattern_type):
                other_exp = parse(a.pattern)
            elif isinstance(a, RegexSet):
                if a.pattern:
                    other_exp = parse(a.pattern)
                else:
                    return False
            else:
                raise AttributeSetError(
                    "Regexp object should be of type `str` or "
                    "`re._pattern_type`!"
                )
            return (self_exp & other_exp.everythingbut()).empty()

        if isinstance(other, set):
            res = True
            for element in other:
                if element is not None and not included(element):
                    res = False
                    break
        else:
            res = included(other)

        return res
def test_infinite_generation():
    # Infinite generator, flummoxes both depth-first and breadth-first searches
    gen = parse("a*b*").strings()
    assert next(gen) == ""
    assert next(gen) == "a"
    assert next(gen) == "b"
    assert next(gen) == "aa"
    assert next(gen) == "ab"
    assert next(gen) == "bb"
    assert next(gen) == "aaa"
    assert next(gen) == "aab"
    assert next(gen) == "abb"
    assert next(gen) == "bbb"
    assert next(gen) == "aaaa"
def dfa_from_regex(s, alphabet=None):
    """
    Using greenery to convert regex to a minimal (canonical) DFA
    :param str s: the input regular expression
    :param str alphabet: (optional) the alphabet for the required output DFA
    :return: MinDFA object, with language equivalent to the input's regex language
    """
    # TODO: consider runtime impact for using alphabet...
    # alphabet = None
    f = parse(s).to_fsm(alphabet)
    # for canonical rep -- transform to minimal FSM.
    # Note: fsm.reduce() returns a new FSM rather than mutating in place.
    f = f.reduce()
    res = MinDFA.dfa_from_fsm(f)
    # TODO: currently assuming input str as regex only has '*' operator for infinity
    if '*' not in s:
        res.is_all_words = MinDFA.Ternary.FALSE
    return res
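# Sketch of the greenery half of dfa_from_regex() above (MinDFA is
# project-specific and not shown): to_fsm() followed by reduce() yields a
# minimal DFA for the regex's language.
f = parse("(a|b)*abb").to_fsm()
f = f.reduce()  # reduce() returns a new, minimal FSM
assert f.accepts("abb")
assert not f.accepts("ab")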
def regex_to_dfa():
    received_json = request.get_json(silent=True)
    received_regex = received_json['regex']
    constructed_regex = lego.parse(received_regex)
    constructed_fsm = constructed_regex.to_fsm()
    alphabet = list(constructed_fsm.alphabet)
    prepared_alphabet = [
        letter for letter in alphabet
        if not isinstance(letter, fsm.anything_else_cls)
    ]
    response = {
        "alphabet": prepared_alphabet,
        "states": list(constructed_fsm.states),
        "finals": list(constructed_fsm.finals),
        "initial": str(constructed_fsm.initial),
        "map": constructed_fsm.map,
    }
    return jsonify(response)
def pfsm_from_fsm(self, reg_exp):
    fsm_obj = parse(reg_exp).to_fsm()
    self.alphabet = sorted(
        [str(i) for i in list(fsm_obj.alphabet) if str(i) != "anything_else"]
    )
    self.add_states(list(fsm_obj.states))
    self.set_I(
        [np.log(1) if q == fsm_obj.initial else LOG_EPS for q in self.states]
    )
    self.set_F(
        [
            np.log(self.STOP_P) if q in list(fsm_obj.finals) else LOG_EPS
            for q in self.states
        ]
    )
    for q_i in fsm_obj.map:
        transition = {
            symbol: v
            for symbol, v in fsm_obj.map[q_i].items()
            if str(symbol) != "anything_else"
        }
        q_js = np.array(list(transition.values()))
        if len(q_js) == 0:
            self.F[q_i] = 0.0
        else:
            symbols_js = np.array(list(transition.keys()))
            dividend = 1.0 if self.F[q_i] == LOG_EPS else 1.0 - np.exp(self.F[q_i])
            probs = np.array([dividend / len(symbols_js) for _ in symbols_js])
            for q_j in np.unique(q_js):
                idx = np.where(q_js == q_j)[0]
                self.add_transitions(
                    q_i, q_j, list(symbols_js[idx]), list(probs[idx])
                )
def test_bug_slow():
    # issue #43
    import time
    m = fsm.fsm(
        alphabet={'R', 'L', 'U', 'D'},
        states={
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
            11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        },
        initial=0,
        finals={20},
        map={
            0: {'D': 1, 'U': 2},
            1: {'L': 3},
            2: {'L': 4},
            3: {'U': 5},
            4: {'D': 6},
            5: {'R': 7},
            6: {'R': 8},
            7: {'U': 9},
            8: {'D': 10},
            9: {'L': 11},
            10: {'L': 12},
            11: {'L': 13},
            12: {'L': 14},
            13: {'D': 15},
            14: {'U': 16},
            15: {'R': 17},
            16: {'R': 18},
            17: {'D': 19},
            18: {'U': 19},
            19: {'L': 20},
            20: {},
        },
    )
    t1 = time.time()
    l = from_fsm(m)
    t2 = time.time()
    assert (t2 - t1) < 60  # should finish in way under 1s
    assert l == parse("(DLURULLDRD|ULDRDLLURU)L").reduce()
def test_pattern_parsing():
    assert pattern.parse("abc|def(ghi|jkl)") == pattern(
        conc(
            mult(charclass("a"), one),
            mult(charclass("b"), one),
            mult(charclass("c"), one),
        ),
        conc(
            mult(charclass("d"), one),
            mult(charclass("e"), one),
            mult(charclass("f"), one),
            mult(
                pattern(
                    conc(
                        mult(charclass("g"), one),
                        mult(charclass("h"), one),
                        mult(charclass("i"), one),
                    ),
                    conc(
                        mult(charclass("j"), one),
                        mult(charclass("k"), one),
                        mult(charclass("l"), one),
                    ),
                ),
                one,
            ),
        ),
    )

    # Accept the "non-capturing group" syntax, "(?: ... )" but give it no
    # special significance
    assert parse("(?:)") == parse("()")
    assert parse("(?:abc|def)") == parse("(abc|def)")
    parse("(:abc)")  # should give no problems

    # Named groups
    assert pattern.parse("(?P<ng1>abc)") == parse("(abc)")
def test_isinstance_bug():
    # Problem relating to isinstance(). The class "mult" was occurring as both
    # lego.mult and as __main__.mult and apparently these count as different
    # classes for some reason, so isinstance(m, mult) was returning False.
    starfree = (
        parse("").everythingbut()
        + parse("aa")
        + parse("").everythingbut()
    ).everythingbut()
def regex_matches_string(regex=None, s=None):
    if regex:
        return parse(regex).matches(s)
    else:
        return True
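# Usage sketch for regex_matches_string() above: greenery's matches() is a
# full match, unlike re.search().
assert regex_matches_string(r"\d+", "123")
assert not regex_matches_string(r"\d+", "a123b")  # not a full match
assert regex_matches_string(None, "anything")     # no regex: vacuously True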
def test_set_ops():
    assert parse("[abcd]") - parse("a") == charclass.parse("[bcd]")
    assert parse("[abcd]") ^ parse("[cdef]") == charclass.parse("[abef]")
def complement_of_string_pattern(s):
    return str(parse(s).everythingbut().reduce())
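# Usage sketch for complement_of_string_pattern() above:
comp = complement_of_string_pattern("a")
assert parse(comp).matches("")
assert parse(comp).matches("aa")
assert not parse(comp).matches("a")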
def intersection(self, other):
    """Find the intersection of two regexps.

    This method uses the greenery library to find and reduce the
    intersection pattern.

    * If `other` is a string, a Python dict or a FiniteSet, it is
      converted to a regex pattern, after which it is parsed by the
      `greenery.lego.parse` method and its intersection with the pattern
      of `self` is found. The greenery library finds the intersection
      between two regexes by constructing the corresponding FSMs (finite
      state machines) and finding their intersection, after which it is
      converted back to a regex. See more details here:
      https://github.com/qntm/greenery
    * If `other` is an instance of `EmptySet`, the intersection is an
      `EmptySet` object.
    * If `other` is an instance of `UniversalSet`, the intersection is a
      copy of `self`.

    Parameters
    ----------
    other : set, str, re._pattern_type, RegexSet

    Returns
    -------
    result : RegexSet
        The intersection set
    """
    if self.pattern is None:
        return RegexSet.empty()

    if self.is_universal():
        if isinstance(other, set):
            universal_flag = True
            other_exp = []
            for el in other:
                exp = RegexSet(_regex_to_string(el))
                other_exp.append(exp)
                if not exp.is_universal():
                    universal_flag = False
            if universal_flag:
                return RegexSet.universal()
            else:
                result_obj = RegexSet.empty()
                for exp in other_exp:
                    result_obj.union(exp)
                return result_obj
        else:
            other_obj = RegexSet(_regex_to_string(other))
            if other_obj.is_universal():
                return RegexSet.universal()
            else:
                return other_obj

    self_exp = parse(self.pattern)

    other_exp = []
    if isinstance(other, set):
        for exp in other:
            exp_str = _regex_to_string(exp)
            if exp_str is None:
                return RegexSet.empty()
            other_exp.append(parse(exp_str))
    elif isinstance(other, UniversalSet):
        return copy.deepcopy(self)
    elif isinstance(other, EmptySet):
        return EmptySet()
    else:
        other_str = _regex_to_string(other)
        if other_str is None:
            return RegexSet.empty()
        other_exp.append(parse(other_str))

    intersect_exp = self_exp
    for exp in other_exp:
        intersect_exp = intersect_exp.intersection(exp)

    return RegexSet(str(intersect_exp))
def test_regex_reversal():
    assert reversed(parse("b")) == parse("b")
    assert reversed(parse("e*")) == parse("e*")
    assert reversed(parse("bear")) == parse("raeb")
    assert reversed(parse("beer")) == parse("reeb")
    assert reversed(parse("abc|def|ghi")) == parse("cba|fed|ihg")
    assert reversed(parse("(abc)*d")) == parse("d(cba)*")
import sys

from greenery.lego import parse

subregex = parse(sys.argv[1])
supregex = parse(sys.argv[2])
s = subregex & (supregex.everythingbut())
if s.empty():
    print("subset")
else:
    print("notsubset")
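# Usage sketch (hypothetical file name for the script above):
#   $ python check_subset.py "a{2}" "a+"
#   subset
#   $ python check_subset.py "a+" "a{2}"
#   notsubset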
# This code is in the public domain.
# http://qntm.org/greenery

import sys

from greenery.lego import parse

regexes = sys.argv[1:]
if len(regexes) < 2:
    print("Please supply several regexes to compute their intersection, union and concatenation.")
    print("E.g. \"19.*\" \"\\d{4}-\\d{2}-\\d{2}\"")
else:
    p = parse(regexes[0])
    for regex in regexes[1:]:
        p &= parse(regex)
    print("Intersection: %s" % (p,))

    p = parse(regexes[0])
    for regex in regexes[1:]:
        p |= parse(regex)
    print("Union: %s" % (p,))

    p = parse(regexes[0])
    for regex in regexes[1:]:
        p += parse(regex)
    print("Concatenation: %s" % (p,))
import sys

from greenery.lego import lego, parse

regexes = sys.argv[1:]
if len(regexes) < 2:
    print("Please supply several regexes to compute their intersection, union and concatenation.")
    print("E.g. \"19.*\" \"\\d{4}-\\d{2}-\\d{2}\"")
else:
    regexes = [parse(regex) for regex in regexes]
    print("Intersection: %s" % (lego.intersection(*regexes).reduce(),))
    print("Union: %s" % (lego.union(*regexes).reduce(),))
    print("Concatenation: %s" % (lego.concatenate(*regexes).reduce(),))
def _isObjectSubtype(s1, s2):
    '''The general intuition is that a json object with more keys is more
    restrictive than a similar object with fewer keys. E.g.:
    if corresponding keys have the same schemas, then
    {name: {..}, age: {..}} <: {name: {..}}
    {name: {..}, age: {..}} />: {name: {..}}
    So the subtype checking is divided into two major parts:
    I) lhs keys/patterns/additional should be a superset of rhs
    II) schemas of comparable keys should have lhs <: rhs
    '''
    if s2.type != "object":
        return False

    # Check properties range
    is_sub_interval = s1.interval in s2.interval
    if not is_sub_interval:
        print_db("__00__")
        return False

    # If ranges are ok, check another trivial case of almost identical objects.
    # This is some sort of performance heuristic.
    if set(s1.required).issuperset(s2.required) \
            and s1.properties == s2.properties \
            and s1.patternProperties == s2.patternProperties \
            and (s1.additionalProperties == s2.additionalProperties
                 or (utils.is_dict(s1.additionalProperties)
                     and s1.additionalProperties.isSubtype(s2.additionalProperties))):
        print_db("__01__")
        return True

    def get_schema_for_key(k, s):
        '''Searches for a matching key and gets the corresponding schema(s).
        Returns an iterable because if a key matches more than one pattern,
        that key's schema has to match all corresponding pattern schemas.
        '''
        if k in s.properties.keys():
            return [s.properties[k]]
        else:
            ret = []
            for k_ in s.patternProperties.keys():
                if utils.regex_matches_string(k_, k):
                    # in case a key has to be checked against patternProperties,
                    # it has to adhere to all schemas whose patterns match the key.
                    ret.append(s.patternProperties[k_])
            if ret:
                return ret
        return [s.additionalProperties]

    # Check that required keys satisfy subtyping.
    # lhs required keys should be a superset of rhs required keys.
    if not set(s1.required).issuperset(s2.required):
        print_db("__02__")
        return False
    # If required keys are properly defined, check their corresponding
    # schemas and make sure they are subtypes.
    # This is required because you could have a required key which does not
    # have an explicit schema defined by the json object.
    else:
        for k in set(s1.required).intersection(s2.required):
            for lhs_ in get_schema_for_key(k, s1):
                for rhs_ in get_schema_for_key(k, s2):
                    if lhs_:
                        if rhs_:
                            if not lhs_.isSubtype(rhs_):
                                print_db("__03__")
                                return False
                    else:
                        print_db("__04__")
                        return False

    # Missing keys on the rhs
    # I) Simple case:
    #    lhs = {"properties": {p1: {string}}
    #    rhs = {"properties": {p1: {string}, p2: {int}}}
    #    >> this means lhs is NOT a subtype of rhs cuz lhs
    #    would accept any p2 that does not necessarily match
    #    the type int of the p2 on the rhs
    # II) what if
    #    lhs = {"properties": {p1: {string},
    #           "patternProperties": {p2: {int}}}
    #    again, ideally this means lhs is NOT a subtype of rhs
    #    because lhs accepts any property name with pattern .*p2.*
    # III) however, the tricky case is: it could happen that every string
    #    matched by patternProperties on the lhs exists as a property
    #    or property pattern on the rhs, then we need to do picky and
    #    enumerative checks cuz it could be that indeed lhs isSubtype of rhs.

    # break it down to subcases
    # if set(s1.properties.keys()).issubset(s2.properties.keys()) \
    #         and len(s1.properties.keys()) < len(s2.properties.keys()) \
    #         and len(s1.patternProperties.keys()) == 0:

    # TODO: The following is very inefficient. Can we do better?
    # lhs_keys = "|".join(k for k in s1.properties.keys(
    # )) + "|".join(utils.regex_unanchor(k) for k in s1.patternProperties.keys())
    # rhs_keys = "|".join(k for k in s2.properties.keys(
    # )) + "|".join(utils.regex_unanchor(k) for k in s2.patternProperties.keys())
    # lhs_keys_proper_subset_rhs_keys = utils.regex_isProperSubset(
    #     lhs_keys, rhs_keys)
    # if lhs_keys_proper_subset_rhs_keys:
    #     print_db("__05__")
    #     return False

    extra_keys_on_rhs = set(s2.properties.keys()).difference(
        s1.properties.keys())
    for k in extra_keys_on_rhs.copy():
        for k_ in s1.patternProperties.keys():
            if utils.regex_matches_string(k_, k):
                extra_keys_on_rhs.remove(k)
                break  # already removed; avoid KeyError on a second match
    if extra_keys_on_rhs:
        if not s1.additionalProperties:
            print_db("__05__")
            return False
        else:
            for k in extra_keys_on_rhs:
                if not s1.additionalProperties.isSubtype(s2.properties[k]):
                    print_db("__06__")
                    return False

    extra_patterns_on_rhs = set(s2.patternProperties.keys()).difference(
        s1.patternProperties.keys())
    for k in extra_patterns_on_rhs.copy():
        for k_ in s1.patternProperties.keys():
            if utils.regex_isSubset(k, k_):
                extra_patterns_on_rhs.remove(k)
                break  # already removed; avoid KeyError on a second match
    if extra_patterns_on_rhs:
        if not s1.additionalProperties:
            print_db("__07__")
            return False
        else:
            for k in extra_patterns_on_rhs:
                if not s1.additionalProperties.isSubtype(s2.patternProperties[k]):
                    try:
                        # an OverflowError means regex k is infinite
                        parse(k).cardinality()
                    except OverflowError:
                        print_db("__08__")
                        return False

    # missing_props_from_lhs = set(
    #     s2.properties.keys()) - set(s1.properties.keys())
    # for k in missing_props_from_lhs:
    #     for k_ in s1.patternProperties.keys():
    #         if utils.regex_matches_string(k_, k):
    #             if not s1.patternProperties[k_].isSubtype(s2.properties[k]):
    #                 return False

    # Now, lhs has a patternProperty which is a subtype of a property on the
    # rhs. Ideally, at this point, I'd like to check that EVERY property
    # matched by this pattern also exists on the rhs.
    # from greenery.lego import parse
    # p = parse(k_)
    # try:
    #     p.cardinality

    # first, matching properties should be subtype pairwise
    unmatched_lhs_props_keys = set(s1.properties.keys())
    for k in s1.properties.keys():
        if k in s2.properties.keys():
            unmatched_lhs_props_keys.discard(k)
            if not s1.properties[k].isSubtype(s2.properties[k]):
                return False
        # for the remaining keys, make sure they either don't exist
        # in rhs or, if they do, their schemas should be subtypes
        else:
            for k_ in s2.patternProperties:
                # if utils.regex_isSubset(k, k_):
                if utils.regex_matches_string(k_, k):
                    unmatched_lhs_props_keys.discard(k)
                    if not s1.properties[k].isSubtype(s2.patternProperties[k_]):
                        return False

    # second, matching patternProperties should be subtype pairwise
    unmatched_lhs_pProps_keys = set(s1.patternProperties.keys())
    for k in s1.patternProperties.keys():
        for k_ in s2.patternProperties.keys():
            if utils.regex_isSubset(k_, k):
                unmatched_lhs_pProps_keys.discard(k)
                if not s1.patternProperties[k].isSubtype(s2.patternProperties[k_]):
                    return False

    # third,
    # fourth,
    if s2.additionalProperties == True:
        return True
    elif s2.additionalProperties == False:
        if s1.additionalProperties == True:
            return False
        elif unmatched_lhs_props_keys or unmatched_lhs_pProps_keys:
            return False
        else:
            return True
    else:
        for k in unmatched_lhs_props_keys:
            if not s1.properties[k].isSubtype(s2.additionalProperties):
                return False
        for k in unmatched_lhs_pProps_keys:
            if not s1.patternProperties[k].isSubtype(s2.additionalProperties):
                return False
        if s1.additionalProperties == True:
            return False
        elif s1.additionalProperties == False:
            return True
        else:
            return s1.additionalProperties.isSubtype(s2.additionalProperties)
def test_equivalence():
    assert parse("aa*").equivalent(parse("a*a"))
    assert parse("([ab]*a|[bc]*c)?b*").equivalent(parse("b*(a[ab]*|c[bc]*)?"))