def __init__(self):
    """Set up the regexes and character classifier used for preprocessing."""
    Preprocess.__init__(self)
    # One or more '=' on each side of a run of non-'=' characters
    # (e.g. "== heading ==" style markup).
    self.equal_regex = re.compile(r'=+[^=]+=+')
    # A non-nested parenthesised span, either full-width (U+FF08 ... U+FF09)
    # or ASCII "(...)". The first alternative used to be the ASCII group
    # `([^()]+)`, which matches every run of non-parenthesis characters
    # rather than a parenthesised span. The full-width parentheses are
    # written as \uXXXX escapes so they survive character normalisation.
    self.pars_regex = re.compile(r'\uff08[^\uff08\uff09]+\uff09|\([^()]+\)')
    self.chartype = Chartype()
class JaWikiPreprocess(Preprocess):
    """Preprocessor that filters and cleans text through a fixed pipeline.

    The pipeline (see ``execute``) drops text containing characters that are
    neither Japanese nor ASCII according to ``Chartype``, strips ``=...=``
    markup and parenthesised spans, and then applies the generic clean-up
    steps inherited from ``Preprocess``.
    """

    def __init__(self):
        """Compile the regexes and create the character classifier."""
        super().__init__()
        # One or more '=' on each side of a run of non-'=' characters
        # (e.g. "== heading ==" style markup).
        self.equal_regex = re.compile(r'=+[^=]+=+')
        # A non-nested parenthesised span, either full-width
        # (U+FF08 ... U+FF09) or ASCII "(...)". The first alternative used to
        # be the ASCII group `([^()]+)`, which matched every run of
        # non-parenthesis characters and made `remove_pars` erase the whole
        # text. The full-width parentheses are written as \uXXXX escapes so
        # they survive character normalisation.
        self.pars_regex = re.compile(
            r'\uff08[^\uff08\uff09]+\uff09|\([^()]+\)'
        )
        self.chartype = Chartype()

    def _is_nihongo(self, text):
        """Return True if every character of *text* is Japanese or ASCII.

        Classification is delegated to ``Chartype.is_nihongo`` and
        ``Chartype.is_ascii``. An empty *text* yields True.
        """
        return all(
            self.chartype.is_nihongo(ch) or self.chartype.is_ascii(ch)
            for ch in text
        )

    def _subs(self, regex: "re.Pattern[str]", repl: str, text: str) -> str:
        """Return *text* with every match of *regex* replaced by *repl*."""
        return regex.sub(repl, text)

    def remove_equal(self, text: str) -> str:
        """Delete every ``=...=`` span matched by ``equal_regex``."""
        return self._subs(self.equal_regex, "", text)

    def remove_pars(self, text: str) -> str:
        """Delete every parenthesised span matched by ``pars_regex``."""
        return self._subs(self.pars_regex, "", text)

    def ignore(self, text: str) -> str:
        """Return *text* unchanged if it passes ``_is_nihongo``, else ``""``."""
        return text if self._is_nihongo(text) else ""

    def execute(self, text: str) -> str:
        """Run *text* through the full preprocessing pipeline.

        The steps are applied in order, each receiving the previous step's
        output. ``ignore`` runs first, so rejected text is already empty
        before the remaining steps see it.
        """
        pipeline = [
            self.ignore,
            self.remove_equal,
            self.remove_pars,
            self.remove_newline,
            self.remove_link,
            self.convert_cont_spaces,
            self.strip,
        ]
        result = text
        for step in pipeline:
            result = step(result)
        return result