def test14():
    """Pull the sixth colon-separated field out of a passwd-style line.

    The callback counts ":" separators: the field starts right after the
    fifth one and ends where the sixth one begins, at which point matching
    is stopped.
    """
    passwd_line = "backup:x:34:34:backup:/var/backups:/usr/sbin/nologin"
    wanted_field = 6
    sar = RegexpSar()
    separators_seen = VarPointer()
    field_start = VarPointer()
    field_end = VarPointer()

    def on_separator(from_pos, to_pos):
        separators_seen.append(1)
        sar.continue_from(to_pos)
        seen = separators_seen.val
        if seen == wanted_field - 1:
            # The wanted field begins immediately after this separator.
            field_start.set(to_pos)
        elif seen == wanted_field:
            # The wanted field ends where this separator starts; stop scanning.
            field_end.set(from_pos)
            sar.stop_match()

    sar.add_regexp(":", on_separator)
    sar.match(passwd_line)

    if field_end.val <= 0:
        # Fail on purpose: the closing separator was never reached.
        validate(0 == 1, "no match found")
    else:
        validate("/var/backups", passwd_line[field_start.val:field_end.val])
def test18():
    """Two regexps, each with its own titled callback, report one match apiece."""
    text = "first abbbbc second 123 end"
    found = []
    titled_patterns = [
        ["ab+c", "First Match"],
        ["\\d+", "Second Match"],
    ]
    sar = RegexpSar()

    def make_recorder(title):
        # Factory: binds each callback to its own title instead of the loop variable.
        def record(from_pos, to_pos):
            found.append(title + ": " + text[from_pos:to_pos])
            sar.continue_from(to_pos)

        return record

    for pattern, title in titled_patterns:
        sar.add_regexp(pattern, make_recorder(title))

    sar.match(text)

    validate(2, len(found))
    validate("First Match: abbbbc", found[0])
    validate("Second Match: 123", found[1])
class StateSar:
    """RegexpSar wrapper whose regexps only fire while a given state is active.

    Every regexp is registered together with a state. Its callback is invoked
    only when the current state equals that state at the time of the match.
    """

    def __init__(self, init_state):
        """Create the wrapper with ``init_state`` as the current state."""
        super().__init__()
        self._sar = RegexpSar()
        self._state = init_state
        # The initial state is a known state too; without this, switching
        # back to it raised "Unknown state" unless a regexp was registered
        # for it.
        self._all_states = {init_state}

    def add_regexp(self, state, regexp, regexp_callback):
        """Register ``regexp`` so ``regexp_callback`` runs only in ``state``.

        ``regexp_callback`` is called as ``regexp_callback(from_pos, to_pos)``.
        """
        self._all_states.add(state)

        def inner_callback(from_pos, to_pos):
            # NOTE(review): the original indentation was lost; continue_from()
            # is assumed to belong inside the state check, so matches for
            # inactive states do not move the scan position — confirm.
            if self._state == state:
                regexp_callback(from_pos, to_pos)
                self._sar.continue_from(to_pos)

        self._sar.add_regexp(regexp, inner_callback)

    def match(self, match_str):
        """Run all registered regexps over ``match_str``."""
        self._sar.match(match_str)

    def set_state(self, new_state):
        """Switch the current state.

        Raises:
            ValueError: if ``new_state`` is neither the initial state nor a
                state some regexp was registered for.
        """
        if new_state not in self._all_states:
            # ValueError is still caught by callers using ``except Exception``.
            raise ValueError("Unknown state: " + str(new_state))
        self._state = new_state

    def get_state(self):
        """Return the current state."""
        return self._state
def test5():
    """A literal pattern embedded in a longer string is reported exactly once."""
    hits = VarPointer()
    sar = RegexpSar()
    pattern = "abcd"
    text = "qabcdef"

    def count_hit(from_pos, to_pos):
        return hits.append(1)

    sar.add_regexp(pattern, count_hit)
    sar.match(text)

    validate(1, hits.val)
def test6():
    """match_at() at index 3 is expected to capture the digit run "3456"."""
    captured = VarPointer()
    sar = RegexpSar()
    text = "0123456 789"

    def capture(from_pos, to_pos):
        return captured.set(text[from_pos:to_pos])

    sar.add_regexp("\\d+", capture)
    sar.match_at(text, 3)

    validate("3456", captured.val)
def test10():
    """Without continue_from(), "\\w+" is reported from every start position."""
    text = "abc"
    pieces = []
    sar = RegexpSar()

    def collect(from_pos, to_pos):
        return pieces.append(text[from_pos:to_pos])

    sar.add_regexp("\\w+", collect)
    sar.match(text)

    # Each suffix of the word is expected, in order of its start position.
    for index, expected in enumerate(("abc", "bc", "c")):
        validate(expected, pieces[index])
    validate(3, len(pieces))
def test4():
    """match_from() only reports matches located at or after the start index.

    "abcd" adds 1 and "nm" adds 2 to the accumulator: starting at index 1 the
    expected total is 3, starting at index 2 it is 2.
    """
    total = VarPointer()
    sar = RegexpSar()

    def add_one(from_pos, to_pos):
        return total.append(1)

    def add_two(from_pos, to_pos):
        return total.append(2)

    sar.add_regexp("abcd", add_one)
    sar.add_regexp("nm", add_two)
    text = "qabcdefnmq"

    sar.match_from(text, 1)
    validate(3, total.val)

    # Reset the accumulator before the second run.
    total.set(0)
    sar.match_from(text, 2)
    validate(2, total.val)
def test15():
    """Two patterns sharing one callback leave the flag set to 1 for "123abc"."""
    text = "123abc"
    flag = VarPointer()
    sar = RegexpSar()

    def mark(from_pos, to_pos):
        flag.set(1)

    # Both patterns are registered, in this order, with the same callback.
    for pattern in ("\\d+", "\\w+"):
        sar.add_regexp(pattern, mark)

    sar.match(text)
    validate(1, flag.val)
def test11():
    """"\\a+" on "123abc456" is expected to yield the single match "abc"."""
    text = "123abc456"
    pieces = []
    sar = RegexpSar()

    def collect(from_pos, to_pos):
        pieces.append(text[from_pos:to_pos])
        # Resume one character past the end of the match.
        sar.continue_from(to_pos + 1)

    sar.add_regexp("\\a+", collect)
    sar.match(text)

    validate("abc", pieces[0])
    validate(1, len(pieces))
def test20():
    """Match only at one given position: index 5 is expected to yield "bb22"."""
    text = "aa11 bb22 cc33 dd44"
    word_at_position = VarPointer()
    sar = RegexpSar()

    def remember(from_pos, to_pos):
        word_at_position.val = text[from_pos:to_pos]

    sar.add_regexp("\\w+", remember)
    sar.match_at(text, 5)

    validate("bb22", word_at_position.val)
def test17():
    """At least one of "\\d+" / "\\a+" must fire on "mm abc nn"."""
    text = "mm abc nn"
    flag = VarPointer()
    sar = RegexpSar()

    def mark(from_pos, to_pos):
        flag.set(1)

    for pattern in ("\\d+", "\\a+"):
        sar.add_regexp(pattern, mark)
    sar.match(text)

    if flag.val == 1:
        return
    # Force a failure: no callback ran.
    validate(0 == 1, "regexp should match")
def test9():
    """stop_match() after the first hit: the recorded span covers all of "1234"."""
    span_start = VarPointer()
    span_end = VarPointer()
    sar = RegexpSar()
    text = "1234"
    pattern = "\\d+"

    def remember_and_stop(from_pos, to_pos):
        span_start.set(from_pos)
        span_end.set(to_pos)
        # End matching here so later matches cannot overwrite the span.
        sar.stop_match()

    sar.add_regexp(pattern, remember_and_stop)
    sar.match(text)

    validate(text, text[span_start.val:span_end.val])
def test22():
    """Find the digit that directly follows a letter in "aa bb2cc dd".

    The "\\a" callback records the position of the latest letter match; the
    "\\d" callback stores the digit position when it is exactly one past it.
    """
    sar = RegexpSar()
    last_alpha = VarPointer()
    last_alpha.val = -1  # -1 means no letter has been seen yet
    anchor = VarPointer()

    def on_alpha(from_pos, to_pos):
        return last_alpha.set(from_pos)

    def on_digit(digit_pos, to_pos):
        if last_alpha.val == -1:
            return
        if digit_pos - last_alpha.val == 1:
            anchor.set(digit_pos)

    sar.add_regexp("\\a", on_alpha)
    sar.add_regexp("\\d", on_digit)
    sar.match("aa bb2cc dd")

    validate(5, anchor.val)
def test21():
    """Chain two matchers: the second is run exactly where the first match ends.

    The first matcher finds "a+", hands its end position to the second matcher
    via match_at(), and then stops; "b+c" is expected to fire once.
    """
    text = "aaaaaaaaaaaaaaaaaaaaaaaaaabbbc"
    tail_hits = VarPointer()
    head_sar = RegexpSar()
    tail_sar = RegexpSar()

    def on_head(from_pos, to_pos):
        tail_sar.match_at(text, to_pos)
        head_sar.stop_match()

    def on_tail(from_pos, to_pos):
        return tail_hits.append(1)

    tail_sar.add_regexp("b+c", on_tail)
    head_sar.add_regexp("a+", on_head)
    head_sar.match(text)

    validate(1, tail_hits.val)
def test16():
    """Resuming one past each match skips a character: "123" then "bc"."""
    text = "123abc"
    pieces = []
    sar = RegexpSar()

    def collect(from_pos, to_pos):
        pieces.append(text[from_pos:to_pos])
        # Skips the character right after the match.
        sar.continue_from(to_pos + 1)

    for pattern in ("\\d+", "\\a+"):
        sar.add_regexp(pattern, collect)
    sar.match(text)

    validate("123", pieces[0])
    validate("bc", pieces[1])
    validate(2, len(pieces))
def match_test(all_regexp, match_string, match_expected):
    """Assert which regexps fire on ``match_string``.

    The regexp at index ``i`` adds ``2**i`` to a running total every time its
    callback is called; after matching, the total must equal
    ``match_expected``.

    Raises:
        AssertionError: if the total differs from ``match_expected``.
    """
    sar = RegexpSar()
    total = 0

    def make_adder(weight):
        # Factory: gives each callback its own weight rather than sharing the
        # loop variable.
        def add_weight(from_pos, to_pos):
            nonlocal total
            total += weight

        return add_weight

    for index, pattern in enumerate(all_regexp):
        sar.add_regexp(pattern, make_adder(2 ** index))

    sar.match(match_string)
    assert match_expected == total, f"Match fail for [{', '.join(all_regexp)}] in >>{match_string}<<"
def test13():
    """Two patterns matching at the same start are both reported for "123abc"."""
    text = "123abc"
    pieces = []
    sar = RegexpSar()

    def collect(from_pos, to_pos):
        pieces.append(text[from_pos:to_pos])
        sar.continue_from(to_pos)

    for pattern in ("\\d+", "\\w+"):
        sar.add_regexp(pattern, collect)
    sar.match(text)

    validate("123", pieces[0])
    # TODO check if this is a real error?
    validate("123abc", pieces[1])
    validate(2, len(pieces))
def test7():
    """Count all "abcd" occurrences, then repeat while stopping at the second."""
    # Part 1: every occurrence is counted.
    all_hits = VarPointer()
    counting_sar = RegexpSar()

    def count_all(from_pos, to_pos):
        return all_hits.append(1)

    counting_sar.add_regexp("abcd", count_all)
    counting_sar.match("qabcdeabcdkabcdf")
    validate(3, all_hits.val)

    # Part 2: the callback stops matching once it has been called twice.
    limited_hits = VarPointer()
    limited_sar = RegexpSar()

    def count_until_second(from_pos, to_pos):
        limited_hits.append(1)
        if limited_hits.val == 2:
            limited_sar.stop_match()

    limited_sar.add_regexp("abcd", count_until_second)
    limited_sar.match("qabcdeabcdkabcdf")
    validate(2, limited_hits.val)
def test19():
    """Capture the third "\\w+" match and stop matching there."""
    text = "aa11 bb22 cc33 dd44"
    words_seen = VarPointer()
    third_word = VarPointer()
    sar = RegexpSar()

    def on_word(from_pos, to_pos):
        words_seen.append(1)
        if words_seen.val != 3:
            # Not the wanted one yet: resume after this word.
            sar.continue_from(to_pos)
            return
        third_word.val = text[from_pos:to_pos]
        sar.stop_match()

    sar.add_regexp("\\w+", on_word)
    sar.match(text)

    validate(3, words_seen.val)
    validate("cc33", third_word.val)
def test8():
    """The callback receives the match span; the matcher is reusable afterwards."""
    span_start = VarPointer()
    span_end = VarPointer()
    sar = RegexpSar()
    text = "qabcdef"
    pattern = "abcd"

    def remember_span(from_pos, to_pos):
        span_start.set(from_pos)
        span_end.set(to_pos)

    sar.add_regexp(pattern, remember_span)

    # First run: "abcd" is expected at [1, 5) and slices back to the pattern.
    sar.match(text)
    validate(1, span_start.val)
    validate(5, span_end.val)
    validate(pattern, text[span_start.val:span_end.val])

    # Second run on another string with the same matcher, after a reset.
    span_start.set(0)
    span_end.set(0)
    sar.match("qqqqabcdttt")
    validate(4, span_start.val)
    validate(8, span_end.val)
match_num = None
match_str = "hello world 123 abc 456 789"

# [regexp, description] pairs. The backslashes are escaped so the literals do
# not rely on Python's invalid-escape-sequence fallback; the string values are
# unchanged.
regexps = [
    ['\\w+', 'word'],
    ['\\d+', 'number'],
]


def find_second_match(description):
    """Build a callback that prints only the second match it is called for.

    Each returned callback keeps its own counter, so every regexp reports its
    own second match, labelled with ``description``.
    """
    match_count = 0

    def callback(from_pos, to_pos):
        nonlocal match_count
        match_count += 1
        if match_count == 2:
            print("Match: " + str(description) + ": " + match_str[from_pos:to_pos])
        # Always resume after the match, whether or not it was printed.
        sar.continue_from(to_pos)

    return callback


# NOTE(review): `sar` is not created in this excerpt — presumably a RegexpSar
# instance defined earlier in the file; confirm.
for cur_regexp in regexps:
    sar.add_regexp(cur_regexp[0], find_second_match(cur_regexp[1]))

# Expected output:
#   Match: word: world
#   Match: number: 456
sar.match(match_str)
def test3():
    """A pattern that does not occur in the text never triggers its callback."""
    hits = VarPointer()
    sar = RegexpSar()

    def count_hit(from_pos, to_pos):
        return hits.append(1)

    sar.add_regexp("abcd", count_hit)
    sar.match("qqqqqqqqqqqq")

    validate(0, hits.val)
# Text that the callbacks below slice matches out of.
# NOTE(review): c_fh is opened outside this excerpt — presumably a handle on
# the C source file being scanned; confirm.
c_content = c_fh.read()
# NOTE(review): not used within this excerpt — presumably the call being
# searched for further down; confirm.
required_method = "malloc"
sar = RegexpSar()

# Most recent text matched by the "\\w+" regexp (None until the first match).
last_found_word = None


def found_name(from_pos, to_pos):
    """Remember the matched word and resume matching right after it."""
    global last_found_word
    last_found_word = c_content[from_pos:to_pos]
    sar.continue_from(to_pos)


sar.add_regexp("\\w+", found_name)

# name of last encountered function
last_found_function = None


def found_function(from_pos, to_pos):
    """On a "(" match, record the most recently matched word as a function name."""
    global last_found_function
    last_found_function = last_found_word


sar.add_regexp("(", found_function)

# name of the function whose body we are currently inside
inside_function_name = None
curly_bracket_count = 0
# set next sentence start position to end of current sentence position + 1 sentence_start_position = to_pos + 1 # reset words in sentence to false is_alice_in_sentence = False is_cat_in_sentence = False is_rabbit_in_sentence = False # get Alice In Wonderland book content with open("./alice_in_wonderland.txt", "r") as alice_book: text = alice_book.read() # add alice/cat/rabbit regexps sar.add_regexp('\\^\\walice\\^\\w', find_alice) sar.add_regexp('\\^\\wAlice\\^\\w', find_alice) sar.add_regexp('\\^\\wcat\\^\\w', find_cat) sar.add_regexp('\\^\\wCat\\^\\w', find_cat) sar.add_regexp('\\^\\wrabbit\\^\\w', find_rabbit) sar.add_regexp('\\^\\wRabbit\\^\\w', find_rabbit) # add end of sentence regexps, # NOTE that they point to the same callback for cur_re in ['\\.', '\\?', '!', ';']: sar.add_regexp(cur_re, end_of_sentence) # run the regexps on the text sar.match(text) # the matching has completed, show the results:
from regexp_sar import RegexpSar

sar = RegexpSar()
match_str = "hello world abc"

# Each callback receives the span of its match; match_str[from_pos:to_pos] is
# the matched text. The backslash in the first pattern is escaped so the
# literal does not rely on Python's invalid-escape-sequence fallback (the
# string value is unchanged).
sar.add_regexp(
    '\\w+',
    lambda from_pos, to_pos: print("Match Word: " + match_str[from_pos:to_pos]))
sar.add_regexp(
    'world',
    lambda from_pos, to_pos: print(
        "Found world from: " + str(from_pos) + " to: " + str(to_pos)))

# Expected output (no callback calls continue_from(), so the word pattern is
# reported again from every following character of a word):
#   Match Word: hello
#   Match Word: ello
#   Match Word: llo
#   Match Word: lo
#   Match Word: o
#   Match Word: world
#   Found world from: 6 to: 11
#   Match Word: orld
#   Match Word: rld
#   Match Word: ld
#   Match Word: d
#   Match Word: abc
#   Match Word: bc
#   Match Word: c
sar.match(match_str)
# Number of times any SAR regexp matched.
match_count = 0


# this will be called every time any regexp was matched in the text
def count_matches(from_pos, to_pos):
    """Count one more SAR match; the match span itself is not used."""
    global match_count
    match_count += 1


# NOTE(review): regexp_list and match_str, and the `re` / `time` modules, come
# from outside this excerpt.

# benchmark native Python regexp engine performance
print("Start python match")
# generate Python regexp string using alternation
python_native_re = f"({'|'.join(regexp_list)})"
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
# find amount of matches found with native Python regexp engine
python_re_match_count = len(re.findall(python_native_re, match_str))
print(f"Done in {time.perf_counter() - start} seconds {python_re_match_count} matches")

# create a SAR instance
sar = RegexpSar()
# append all the regexps for the SAR instance; they share one counting callback
for cur_re in regexp_list:
    sar.add_regexp(cur_re, count_matches)

# benchmark SAR performance
print("Start SAR Match")
start = time.perf_counter()
sar.match(match_str)
print(f"Done Match in: {time.perf_counter() - start} seconds. {match_count} matches!")