def do(self, document: dict) -> dict:
    """Tokenize one field of ``document`` and store the space-joined pieces.

    Reads ``document[self.field]``, passes it through
    ``text_normalizer.normalize`` when ``self.normalize`` is truthy, encodes
    the result with ``self.sp.encode_as_pieces`` and writes the pieces,
    joined by single spaces, to ``document[self.output_field]``.

    Args:
        document: Mapping that must contain the key ``self.field``.

    Returns:
        The same ``document`` object, updated in place.

    Raises:
        KeyError: If ``self.field`` is missing from ``document``.
    """
    raw = document[self.field]
    cleaned = text_normalizer.normalize(raw) if self.normalize else raw
    document[self.output_field] = " ".join(self.sp.encode_as_pieces(cleaned))
    return document
def do(self, document: dict) -> Optional[str]:
    """Score the not-yet-seen sentences of a document and keep the cheap ones.

    The text under ``self.field`` is split on newlines, each non-empty line
    is split into sentences by ``self.splitter``, and sentences whose hash is
    already in ``self.hashes`` are dropped (new hashes are recorded, so this
    method mutates ``self.hashes``). Every remaining sentence is normalized,
    encoded with ``self.sp`` and scored by ``self.lm``; the score is turned
    into a perplexity with ``perplexity.pp``.

    Args:
        document: Mapping that may contain the key ``self.field``.

    Returns:
        Newline-separated ``"<perplexity>\\t<sentence>"`` rows for sentences
        whose perplexity is strictly between 0 and ``self.threshold``, or
        ``None`` when the field is missing/empty or no sentence qualifies.
    """
    content: Optional[str] = document.get(self.field)
    if not content:
        return None

    # Phase 1: break each non-empty line into sentences.
    candidates = []
    for line in content.split("\n"):
        if line:
            candidates.extend(self.splitter.split(text=line))

    # Phase 2: keep only sentences never seen before by this instance.
    fresh = []
    for sentence in candidates:
        if not sentence:
            continue
        digest = dedup.str_hash(sentence)
        if digest not in self.hashes:
            self.hashes.add(digest)
            fresh.append(sentence)

    # Phase 3: compute one perplexity per fresh sentence.
    scored = []
    for sentence in fresh:
        pieces = self.sp.encode_as_pieces(text_normalizer.normalize(sentence))
        log_score = self.lm.score(" ".join(pieces))
        n_pieces = len(pieces)
        # -1 marks "no pieces"; it can never pass the `> 0` filter below.
        pp = perplexity.pp(log_score, n_pieces) if n_pieces else -1
        scored.append((pp, sentence))

    # Phase 4: keep rows with 0 < perplexity < threshold.
    rows = [f"{pp}\t{sentence}" for pp, sentence in scored if self.threshold > pp > 0]
    return "\n".join(rows) or None
def do(self, text: str):
    """Normalize ``text`` and tokenize it with ``self.moses``.

    The text first goes through ``text_normalizer.normalize`` with case and
    number normalization disabled, punctuation normalization enabled, and
    accent handling controlled by ``self.rm_accent``. Spacing is then
    adjusted by ``text_normalizer.normalize_spacing_for_tok`` for
    ``self.lang`` before tokenization.

    Args:
        text: The raw string to tokenize.

    Returns:
        Whatever ``self.moses.tokenize`` returns when called with
        ``return_str=True`` and ``escape=False``.
    """
    normalized = text_normalizer.normalize(
        text,
        accent=self.rm_accent,
        case=False,
        numbers=False,
        punct=True,
    )
    spaced = text_normalizer.normalize_spacing_for_tok(normalized, language=self.lang)
    return self.moses.tokenize(spaced, return_str=True, escape=False)
def do(self, document: dict) -> Optional[dict]:
    """Tokenize one field of ``document`` with a language-specific encoder.

    The encoder is looked up with ``self.get_sp(document.get("language"))``.
    When no encoder is available the document is returned untouched;
    otherwise the (optionally normalized) text of ``self.field`` is encoded
    and the space-joined pieces are stored under ``self.output_field``.

    Args:
        document: Mapping that must contain ``self.field`` and may contain
            a ``"language"`` key.

    Returns:
        The same ``document`` object, updated in place when an encoder was
        found.

    Raises:
        KeyError: If ``self.field`` is missing from ``document``.
    """
    raw = document[self.field]
    # Normalization happens before the encoder lookup, as in the original flow.
    text = text_normalizer.normalize(raw) if self.normalize else raw
    sp = self.get_sp(document.get("language"))
    if sp is not None:
        document[self.output_field] = " ".join(sp.encode_as_pieces(text))
    return document
def extract_opening_text(source, n_docs: int = 10_000):
    """Yield the normalized ``"opening_text"`` of up to ``n_docs`` documents.

    Documents are read with ``jsonql.read_jsons(source)``. Falsy documents
    and documents without a non-empty ``"opening_text"`` are skipped and do
    not count towards the limit.

    Args:
        source: Anything accepted by ``jsonql.read_jsons``.
        n_docs: Maximum number of texts to yield. A value ``<= 0`` yields
            nothing and does not read ``source`` at all.

    Yields:
        ``text_normalizer.normalize(text)`` for each kept document.
    """
    # Guard up front: the limit used to be checked only after a yield, so
    # n_docs <= 0 still produced one text.
    if n_docs <= 0:
        return

    n_yielded = 0
    for doc in jsonql.read_jsons(source):
        text = doc.get("opening_text") if doc else None
        if not text:
            continue
        yield text_normalizer.normalize(text)
        n_yielded += 1
        if n_yielded >= n_docs:
            return
def do(self, document: dict) -> Optional[str]:  # type: ignore
    """Score every line of a document and return one row per line.

    Lines come from ``self.get_lines(document)`` and the model from
    ``self.get_lm(document.get("language"))``. Each line is optionally
    normalized, scored with ``model.score`` and converted by ``pp`` using a
    length of "whitespace-separated tokens + 1".

    Args:
        document: Mapping describing the document; may contain ``"language"``.

    Returns:
        Newline-separated ``"<pp value>\\t<line>"`` rows, or ``None`` when
        there are no lines or no model for the document.
    """
    lines = self.get_lines(document)
    model = self.get_lm(document.get("language"))
    if not (lines and model):
        return None

    rows = []
    for raw_line in lines:
        line = text_normalizer.normalize(raw_line) if self.normalize else raw_line
        log_score = model.score(line)
        # NOTE(review): the +1 presumably accounts for an end-of-sentence
        # token added by the model — confirm.
        n_tokens = len(line.split()) + 1
        rows.append(f"{pp(log_score, n_tokens)}\t{line}")
    return "\n".join(rows)
def do(self, document: dict) -> dict:
    """Attach a document-level ``pp`` score to ``document``.

    Lines come from ``self.get_lines(document)`` and the model from
    ``self.get_lm(document.get("language"))``. Log scores and lengths
    ("whitespace-separated tokens + 1" per line) are summed over all lines,
    combined by ``pp`` and rounded to one decimal place.

    Args:
        document: Mapping describing the document; may contain ``"language"``.

    Returns:
        The same ``document`` object. ``self.output_field`` is set only when
        both lines and a model are available.
    """
    lines = self.get_lines(document)
    model = self.get_lm(document.get("language"))
    if not (lines and model):
        return document

    total_log_score, total_length = 0, 0
    for raw_line in lines:
        line = text_normalizer.normalize(raw_line) if self.normalize else raw_line
        total_log_score += model.score(line)
        # NOTE(review): the +1 presumably accounts for an end-of-sentence
        # token added by the model — confirm.
        total_length += len(line.split()) + 1

    document[self.output_field] = round(pp(total_log_score, total_length), 1)
    return document
def test_numbers():
    """Check ``txt.normalize`` with the ``numbers`` flag on and off.

    With ``numbers=True`` every digit of the sample is expected to become
    ``"0"``; with ``numbers=False`` the sample must come back unchanged.
    """
    weird = "023456789 | 0123456789"
    expected_by_flag = {
        True: "000000000 | 0000000000",
        False: weird,
    }
    for numbers_flag, expected in expected_by_flag.items():
        assert txt.normalize(weird, numbers=numbers_flag) == expected