def test_qualified_re_split(self): self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) self.assertEqual(re.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) self.assertEqual(re.split("(:*)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c'])
def split_setences(text):
    """Split *text* at every ".", "!" or "?" character.

    The separators themselves are dropped.  Fragments are returned
    exactly as ``re.split`` produces them: surrounding whitespace is kept
    and a trailing separator yields a final empty string.

    The (misspelled) function name is left unchanged so existing callers
    keep working.

    Args:
        text: The string to split.

    Returns:
        A new list of string fragments, in order of appearance.
    """
    # re.split already returns a fresh list, so no copy loop is needed.
    return re.split(r"\.|!|\?", text)
def test_re_split(self): self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(re.split("(:*)", ":a:b::c"), ['', ':', 'a', ':', 'b', '::', 'c']) self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(re.split("(:)*", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c']) self.assertEqual(re.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c']) self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), ['', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c']) self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c'])
def extract_entities(text, deduplication=False):
    """Extract candidate entity strings from *text* and split *text* on them.

    Steps, in order:

    1. A normalised copy of *text* is produced with ``remove_accents`` and
       ``remove_digits`` and scanned with the pattern returned by
       ``build_regexpression``; every match is a candidate.
    2. Each candidate is run through ``nlp``.  If the first token's
       part-of-speech tag is listed in ``tags_exclusions`` its text is
       removed from the candidate, and in that case the second token is
       checked and removed the same way.
    3. Candidates longer than two characters are stripped and kept unless
       they appear in ``stopwords`` or ``preprositions``.
    4. The original *text* is split on the surviving candidates; because
       the alternation is a capturing group, the candidates themselves
       appear in the split result.

    Args:
        text: Input string.
        deduplication: When true, each distinct candidate is reported only
            once (first occurrence wins).

    Returns:
        A dict with ``'text'`` (list produced by splitting *text*) and
        ``'tokens'`` (list of kept candidate strings).
    """
    text_preprocessed = remove_accents(text)
    text_preprocessed = remove_digits(text_preprocessed)
    regexp2bat = build_regexpression()
    matches = re.finditer(regexp2bat, text_preprocessed)
    pt_patterns = PortuguesePatterns()

    phase_0_entities = [match.group() for match in matches]

    phase_1_entities = []
    for token in phase_0_entities:
        # Whitespace-only candidates can never survive the final check
        # below, and an empty one would give an empty doc (doc[0] fails).
        if not token.strip():
            continue
        doc = nlp(token)
        if doc[0].pos_ in pt_patterns.tags_exclusions:
            token = token.replace(doc[0].text, '', 1)
            # Blank tag marks "first token removed" for the next check.
            doc[0].pos_ = ''
        if doc[0].pos_ == '' and len(doc) > 1:
            if doc[1].pos_ in pt_patterns.tags_exclusions:
                token = token.replace(doc[1].text, '', 1)
        if token.strip() != '':
            phase_1_entities.append(token)

    unique_tokens = []
    seen = set()
    for token in phase_1_entities:
        # NOTE(review): the length filter runs before strip(), so a
        # candidate may end up shorter than three characters once
        # stripped.  Kept as-is; confirm whether that is intended.
        if len(token) <= 2:
            continue
        token = token.strip()
        if token in pt_patterns.stopwords or token in pt_patterns.preprositions:
            continue
        if deduplication:
            if token in seen:
                continue
            seen.add(token)
        unique_tokens.append(token)

    if unique_tokens:
        # Candidates are literal text, so escape them: an unescaped ".",
        # "(" or "+" would otherwise change or break the split pattern.
        origin = "(" + "|".join(re.escape(token) for token in unique_tokens) + ")"
        parts = re.split(origin, text)
    else:
        # An empty alternation "()" matches everywhere and would split the
        # text between every character; with nothing to split on, return
        # the text as a single piece.
        parts = [text]

    return {'text': parts, 'tokens': unique_tokens}
def _unescape(text):
    """Return *text* with backslash escape sequences expanded."""
    return text.encode("utf-8").decode("unicode-escape")


def _build_table(code):
    """Parse ``key&key&...&value`` rows separated by ``;`` into a dict.

    Every field of a row except the last is a key mapped to the row's
    last field.  The look-behind in both patterns keeps a separator that
    follows a backslash (itself preceded by a non-backslash character)
    from being treated as a separator.
    """
    table = {}
    for row in pcre.split(r"(?<![^\\]\\);", code):
        fields = pcre.split(r"(?<![^\\]\\)&", row)
        for key in fields[:-1]:
            table[key] = fields[-1]
    return table


def _apply_substitutions(text, subs):
    """Apply consecutive (pattern, replacement) pairs of *subs* to *text*."""
    for index in range(0, len(subs), 2):
        text = pcre.sub(subs[index], subs[index + 1], text)
    return text


def _chained_program(pieces, fallback):
    """Rebuild the back-tick-prefixed follow-up program, or return *fallback*."""
    if len(pieces) > 1:
        return "`" + "`".join(pieces[1:])
    return fallback


def execute(mode, code, input_str):
    """Run one program step selected by *mode*, then print or chain its result.

    Modes (as implemented below):
        l  look *input_str* up in a table parsed from *code*; "?" is the
           fallback key.
        f  ``code % literal`` where the literal is parsed from *input_str*.
        F  like ``f``, and the input for a chained step becomes the total
           length of the formatted value(s).
        g  print every string ``exrex.generate`` yields for *code*; terminal.
        h  like ``g`` on ``code % input_str`` (input escaped when a str);
           terminal.
        p  repeat ``~...~`` (and, for a sequence literal, ``%...%``) spans.
        P  repeat the single character before each ``~``.
        e  translate *input_str* character by character through the table.
        o  print the part before the first back-tick, chain the rest.
        s  apply pattern/replacement pairs to the input.
        d  delete every match of each pattern from the input.
        S  like ``s`` but print the substituted text immediately.
        i / I  append the input to *code* (``I`` inserts a newline).
        anything else: the result is *code* itself.

    If the result starts with a back-tick it is treated as another
    program: the character after the back-tick is its mode, and the text
    after an unescaped ``!`` (if any) is its input; otherwise the input
    comes from ``get_input(input_str)``.  Any other result is printed
    with escape sequences expanded.

    Args:
        mode: Single-character mode selector.
        code: Program text for this step.
        input_str: Input for this step.

    Returns:
        None.  Output is written with ``print``.
    """
    result = ""
    if mode == "l":
        table = _build_table(code)
        result = table[input_str] if input_str in table else table["?"]
    elif mode == "f":
        result = code % ast.literal_eval(input_str)
    elif mode == "F":
        literal = ast.literal_eval(input_str)
        result = code % literal
        if isinstance(literal, tuple):
            input_str = str(sum(len(str(x)) for x in literal))
        else:
            input_str = str(len(str(literal)))
    elif mode == "g":
        for string in exrex.generate(code):
            print(_unescape(string))
        return  # Generate is always terminal
    elif mode == "h":
        if isinstance(input_str, str):
            input_str = pcre.escape(input_str)
        for string in exrex.generate(code % input_str):
            print(_unescape(string))
        return
    elif mode == "p":
        literal = ast.literal_eval(input_str)
        if isinstance(literal, int):
            result = pcre.sub(r"(?<![^\\]\\)~(.+?)(?<![^\\]\\)~",
                              r"\1" * literal, code, flags=pcre.DOTALL)
        else:
            # First expand ~...~ spans with literal[0], then %...% spans
            # with literal[1].
            expanded = pcre.sub(r"~(.+?)~", r"\1" * literal[0], code,
                                flags=pcre.DOTALL)
            result = pcre.sub(r"(?<![^\\]\\)%(.+?)(?<![^\\]\\)%",
                              r"\1" * literal[1], expanded,
                              flags=pcre.DOTALL)
    elif mode == "P":
        result = pcre.sub(r"(.)(?<![^\\]\\)~",
                          r"\1" * ast.literal_eval(input_str), code,
                          flags=pcre.DOTALL)
    elif mode == "e":
        table = _build_table(code)
        # Translate this step's own input one character at a time.  The
        # previous code iterated a name ``i`` that is local to this
        # function (it was a loop index in other branches) and therefore
        # unbound here, and indexed the table with it instead of the
        # loop variable.
        result = "".join(table[char] for char in input_str)
    elif mode == "o":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        print(_unescape(pieces[0]))
        # With nothing after the first back-tick there is no program to
        # chain; a lone "`" result would make ``result[1]`` below fail.
        result = _chained_program(pieces, "")
    elif mode == "s":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
        input_str = _apply_substitutions(input_str, subs)
        result = _chained_program(pieces, input_str)
    elif mode == "d":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
        for sub in subs:
            input_str = pcre.sub(sub, "", input_str)
        result = _chained_program(pieces, input_str)
    elif mode == "S":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
        output = _apply_substitutions(input_str, subs)
        result = _chained_program(pieces, "")
        print(_unescape(output))
    elif mode == "i":
        result = code + input_str
    elif mode == "I":
        result = code + "\n" + input_str
    else:
        result = code

    if len(result) > 0 and result[0] == "`":
        input_pieces = pcre.split(r"(?<![^\\]\\)!", result)
        if len(input_pieces) >= 2:
            execute(result[1], input_pieces[0][2:],
                    "!".join(input_pieces[1:]))
        else:
            execute(result[1], result[2:], get_input(input_str))
    else:
        print(_unescape(result))
# NOTE(review): the statements before the ``__main__`` guard read like the
# closing part of a function body (they use ``result``, ``code`` and
# ``input_str`` and call ``execute``); the enclosing ``def`` is not visible
# in this view, so the nesting shown here should be confirmed against the
# full file.
result = code
# A result that starts with a back-tick is run as a further program: the
# character after the back-tick is passed as the mode, and the text after
# the first "!" matched by the pattern below (if any) is passed as input.
if len(result) > 0 and result[0] == "`":
    input_pieces = pcre.split(r"(?<![^\\]\\)!", result)
    if len(input_pieces) >= 2:
        execute(result[1], input_pieces[0][2:], "!".join(input_pieces[1:]))
    else:
        execute(result[1], result[2:], get_input(input_str))
else:
    # Expand backslash escape sequences before printing.
    print(result.encode("utf-8").decode("unicode-escape"))

# Script entry point: read the program file named by the first command-line
# argument and run it.
if __name__ == "__main__":
    args = docopt(__doc__)
    pcre.enable_re_template_mode()
    # The program is read as raw bytes.
    with open(sys.argv[1], 'rb') as file:
        string = file.read()
    # NOTE(review): a file whose SHA-256 equals this hard-coded digest is
    # passed to exec() after one line is read from stdin into ``i``.
    # exec() runs arbitrary code; confirm this special case is intended.
    if hashlib.sha256(string).hexdigest() == "bca4894ae7cf4919e3b3977583df930c8f4bf5b75c8bf5ada9de1d9607ef846b":
        i = input()
        exec(string)
    else:
        # The first byte of the file selects the mode.
        mode = chr(string[0])
        # NOTE(review): ``string`` is bytes, so ``str(string)`` yields its
        # repr (``"b'...'"``) rather than decoded text — confirm whether a
        # ``decode`` was intended for the ``-u`` option.
        code = str(string) if args['-u'] else decompress(string)
        # Text after the first "!" matched by the pattern is handed to
        # ``get_input``; without one, ``get_input`` receives "".
        input_pieces = pcre.split(r"(?<![^\\]\\)!", code)
        if len(input_pieces) >= 2:
            i = get_input("!".join(input_pieces[1:]))
        else:
            i = get_input("")
        execute(mode, input_pieces[0], i)
def execute(mode, code, input_str):
    """Run one program step selected by *mode*, then print or chain its result.

    This version delegates to ``handle_table``, ``handle_subs``,
    ``handle_pieces`` and ``unescape``, which are defined outside this
    block; their exact behaviour is not visible here.

    Modes (as implemented below):
        l  look *input_str* up in the table built from *code*; "?" is the
           fallback key.
        f  ``code % literal`` where the literal is parsed from *input_str*.
        F  like ``f``, and the input for a chained step becomes the total
           length of the formatted value(s).
        g  print every string ``exrex.generate`` yields for *code*; terminal.
        h  like ``g`` on ``code % input_str`` (input escaped when a str);
           terminal.
        p  repeat ``~...~`` (and, for a sequence literal, ``%...%``) spans.
        P  repeat the single character before each ``~``.
        e  translate *input_str* character by character through the table.
        o  print the part before the first back-tick, chain the rest.
        s  apply substitutions to the input via ``handle_subs``.
        d  delete every match of each pattern from the input.
        S  like ``s`` but print the substituted text immediately.
        i / I  append the input to *code* (``I`` inserts a newline).
        anything else: the result is *code* itself.

    If the result starts with a back-tick it is run as another program:
    the character after the back-tick is its mode, and the text after the
    first "!" matched by the split pattern (if any) is its input;
    otherwise the input comes from ``get_input(input_str)``.  Any other
    result is printed through ``unescape``.

    Args:
        mode: Single-character mode selector.
        code: Program text for this step.
        input_str: Input for this step.

    Returns:
        None.  Output is written with ``print``.
    """
    result = ""
    if mode == "l":
        rows = (pcre.split(r"(?<![^\\]\\)&", row)
                for row in pcre.split(r"(?<![^\\]\\);", code))
        table = handle_table(rows)
        result = table[input_str] if input_str in table else table["?"]
    elif mode == "f":
        result = code % ast.literal_eval(input_str)
    elif mode == "F":
        literal = ast.literal_eval(input_str)
        result = code % literal
        if isinstance(literal, tuple):
            input_str = str(sum(len(str(x)) for x in literal))
        else:
            input_str = str(len(str(literal)))
    elif mode == "g":
        for string in exrex.generate(code):
            print(unescape(string))
        return  # Generate is always terminal
    elif mode == "h":
        if isinstance(input_str, str):
            input_str = pcre.escape(input_str)
        for string in exrex.generate(code % input_str):
            print(unescape(string))
        return
    elif mode == "p":
        literal = ast.literal_eval(input_str)
        if isinstance(literal, int):
            result = pcre.sub(r"(?<![^\\]\\)~(.+?)(?<![^\\]\\)~",
                              r"\1" * literal, code, flags=pcre.DOTALL)
        else:
            # First expand ~...~ spans with literal[0], then %...% spans
            # with literal[1].
            expanded = pcre.sub(r"~(.+?)~", r"\1" * literal[0], code,
                                flags=pcre.DOTALL)
            result = pcre.sub(r"(?<![^\\]\\)%(.+?)(?<![^\\]\\)%",
                              r"\1" * literal[1], expanded,
                              flags=pcre.DOTALL)
    elif mode == "P":
        result = pcre.sub(r"(.)(?<![^\\]\\)~",
                          r"\1" * ast.literal_eval(input_str), code,
                          flags=pcre.DOTALL)
    elif mode == "e":
        rows = (pcre.split(r"(?<![^\\]\\)&", row)
                for row in pcre.split(r"(?<![^\\]\\);", code))
        table = handle_table(rows)
        # Translate this step's own input one character at a time.  The
        # previous code read a module-level name ``i`` instead of
        # ``input_str`` and indexed the table with it rather than with
        # the loop variable.
        result = "".join(table[char] for char in input_str)
    elif mode == "o":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        print(unescape(pieces[0]))
        result = handle_pieces(pieces[1:], "")
    elif mode == "s":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
        input_str = handle_subs(input_str, subs)
        result = handle_pieces(pieces[1:], input_str)
    elif mode == "d":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
        for sub in subs:
            input_str = pcre.sub(sub, "", input_str)
        result = handle_pieces(pieces[1:], input_str)
    elif mode == "S":
        pieces = pcre.split(r"(?<![^\\]\\)`", code)
        subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
        output = unescape(handle_subs(input_str, subs))
        result = handle_pieces(pieces[1:], "")
        print(output)
    elif mode == "i":
        result = code + input_str
    elif mode == "I":
        result = code + "\n" + input_str
    else:
        result = code

    if len(result) > 0 and result[0] == "`":
        input_pieces = pcre.split(r"(?<![^\\]\\)!", result)
        if len(input_pieces) >= 2:
            execute(result[1], input_pieces[0][2:],
                    "!".join(input_pieces[1:]))
        else:
            execute(result[1], result[2:], get_input(input_str))
    else:
        print(unescape(result))