Пример #1
0
    def __init__(self):
        """Set up the regexes and the character classifier for this instance."""
        Preprocess.__init__(self)

        # One or more '=', a run of non-'=' characters, then one or more '='
        # (the shape of wiki heading markup such as "== Title ==").
        self.equal_regex = re.compile(r'=+[^=]+=+')
        # A non-nested parenthesised span, written with either full-width
        # (U+FF08 / U+FF09) or ASCII parentheses.
        # The former first alternative, `([^()]+)`, was a capture group that
        # matched every run of non-parenthesis text, so substituting matches
        # with "" erased ordinary text instead of parenthesised spans.
        # NOTE(review): the full-width alternative is presumed to be the
        # original intent for Japanese text -- confirm.
        self.pars_regex = re.compile(
            r'\uFF08[^\uFF08\uFF09]+\uFF09|\([^()]+\)')

        self.chartype = Chartype()
Пример #2
0
class JaWikiPreprocess(Preprocess):
    """Text-cleaning pipeline built on ``Preprocess``.

    Adds three steps of its own: a filter that discards text containing
    characters ``Chartype`` does not classify as nihongo or ASCII, removal
    of ``=...=`` markup, and removal of parenthesised spans. The remaining
    steps used by ``execute`` are not defined here and are expected to come
    from ``Preprocess``.
    """

    def __init__(self):
        """Set up the regexes and the character classifier for this instance."""
        super().__init__()

        # One or more '=', a run of non-'=' characters, then one or more '='
        # (the shape of wiki heading markup such as "== Title ==").
        self.equal_regex = re.compile(r'=+[^=]+=+')
        # A non-nested parenthesised span, written with either full-width
        # (U+FF08 / U+FF09) or ASCII parentheses.
        # The former first alternative, `([^()]+)`, was a capture group that
        # matched every run of non-parenthesis text, so substituting matches
        # with "" erased ordinary text instead of parenthesised spans.
        # NOTE(review): the full-width alternative is presumed to be the
        # original intent for Japanese text -- confirm.
        self.pars_regex = re.compile(
            r'\uFF08[^\uFF08\uFF09]+\uFF09|\([^()]+\)')

        self.chartype = Chartype()

    def _is_nihongo(self, text):
        """Return True if every character of *text* is nihongo or ASCII.

        Classification is delegated to ``self.chartype``. An empty *text*
        yields True because ``all`` of an empty iterable is True.
        """
        return all(
            self.chartype.is_nihongo(s) or self.chartype.is_ascii(s)
            for s in text)

    def _subs(self, regex: "re.Pattern[str]", repl: str, text: str) -> str:
        """Return *text* with every match of *regex* replaced by *repl*."""
        return regex.sub(repl, text)

    def remove_equal(self, text: str) -> str:
        """Return *text* with every ``=...=`` span removed."""
        return self._subs(self.equal_regex, "", text)

    def remove_pars(self, text: str) -> str:
        """Return *text* with non-nested parenthesised spans removed.

        Only the innermost level of nested parentheses is removed by a
        single call, since the pattern does not match across parentheses.
        """
        return self._subs(self.pars_regex, "", text)

    def ignore(self, text: str) -> str:
        """Return *text* unchanged if it is all nihongo/ASCII, else ``""``."""
        return text if self._is_nihongo(text) else ""

    def execute(self, text: str) -> str:
        """Run the full cleaning pipeline on *text* and return the result.

        Steps are applied in order, each receiving the previous step's
        output: ``ignore``, ``remove_equal``, ``remove_pars``,
        ``remove_newline``, ``remove_link``, ``convert_cont_spaces``,
        ``strip``.
        """
        funcs = [
            self.ignore,
            self.remove_equal,
            self.remove_pars,
            self.remove_newline,
            self.remove_link,
            self.convert_cont_spaces,
            self.strip,
        ]
        _text = text
        for func in funcs:
            _text = func(_text)
        return _text