class TextProcessor(object):
    """Split text into sentences and optionally tokenize each one.

    Sentence splitting is delegated to an external command run through
    ``subprocess``; tokenization is delegated to an ``ExternalProcessor``
    built from the ``tokenizer`` argument.
    """

    def __init__(self, splitter=None, tokenizer=None):
        """Store the splitter command and set up the optional tokenizer.

        Args:
            splitter: Command line of the sentence splitter as a single
                string (it is split on whitespace before being executed).
                A falsy value disables sentence splitting.
            tokenizer: Value handed to ``ExternalProcessor``. A falsy value
                disables tokenization.
        """
        self.split_cmd = splitter
        self.tokenizer = None
        if tokenizer:
            self.tokenizer = ExternalProcessor(tokenizer)

    def split_sentences(self, text):
        """Run the splitter command on ``text`` and return its output lines.

        The text is sent UTF-8 encoded, followed by a newline, on the
        command's standard input; its standard output is decoded as UTF-8
        and split on ``"\\n"``.

        Args:
            text: Text to split. A falsy value short-circuits to ``[]``
                without starting the command.

        Returns:
            A list of strings, one per output line. The list may contain
            empty strings (for example after a trailing newline).
        """
        if not text:
            return []
        proc = subprocess.Popen(
            self.split_cmd.split(),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        # The newline must be bytes: ``bytes + str`` raises TypeError on
        # Python 3, while ``b"\n"`` works on both Python 2 and Python 3.
        out, _ = proc.communicate(input=text.encode("utf-8") + b"\n")
        return out.decode("utf-8").split("\n")

    def process(self, text):
        """Yield the stripped, optionally tokenized, lines of ``text``.

        When a splitter command is configured the text is first passed to
        ``split_sentences``; otherwise the whole text is treated as a single
        line. Lines that are empty after stripping are skipped.

        Args:
            text: Text to process. A falsy value yields nothing.

        Yields:
            Each non-blank line, stripped; when a tokenizer is configured
            the line is passed through ``self.tokenizer.process`` first.
        """
        if not text:
            return
        lines = self.split_sentences(text) if self.split_cmd else [text]
        for line in lines:
            if not line.strip():
                continue
            if self.tokenizer:
                yield self.tokenizer.process(line).strip()
            else:
                yield line.strip()
def __init__(self, splitter=None, tokenizer=None):
    """Record the splitter command and build the optional tokenizer.

    Args:
        splitter: Stored unchanged as ``self.split_cmd``.
        tokenizer: When truthy, passed to ``ExternalProcessor`` and the
            result is stored as ``self.tokenizer``; otherwise
            ``self.tokenizer`` is ``None``.
    """
    self.split_cmd = splitter
    self.tokenizer = ExternalProcessor(tokenizer) if tokenizer else None