from pathlib import Path
from typing import Optional, Set

from kenlm import Model as KenlmModel
from sentence_splitter import SentenceSplitter
from sentencepiece import SentencePieceProcessor

from cc_net import dedup, jsonql, perplexity, text_normalizer


class ExtractSentences(jsonql.Transformer):
    """Split documents into sentences, deduplicate them across the stream,
    and keep only sentences whose LM perplexity falls below `threshold`."""

    def __init__(
        self,
        sp_model: Path,
        lm_model: Path,
        field: str = "raw_content",
        threshold: float = float("+inf"),
    ):
        super().__init__()
        self.sp_model = sp_model
        self.lm_model = lm_model
        self.field = field
        self.threshold = threshold
        self.sp: SentencePieceProcessor = None
        self.lm: KenlmModel = None
        self.splitter: SentenceSplitter = None
        self.hashes: Set[int] = set()

    def _prepare(self):
        # Models are loaded lazily, so the transformer can be serialized
        # before any heavyweight state is attached.
        self.sp = SentencePieceProcessor()
        self.sp.load(str(self.sp_model))
        self.splitter = SentenceSplitter("en")
        self.lm = KenlmModel(str(self.lm_model))

    def do(self, document: dict) -> Optional[str]:
        content: Optional[str] = document.get(self.field)
        if not content:
            return None
        all_sentences = [
            s for l in content.split("\n") if l for s in self.splitter.split(text=l)
        ]

        # Drop sentences already seen in this stream (hash-based dedup).
        unique_sentences = []
        for s in all_sentences:
            if not s:
                continue
            h = dedup.str_hash(s)
            if h in self.hashes:
                continue
            self.hashes.add(h)
            unique_sentences.append(s)

        # Score each sentence: normalize, tokenize with SentencePiece,
        # then turn the KenLM log-score into a perplexity.
        scores = []
        for sentence in unique_sentences:
            normalized = text_normalizer.normalize(sentence)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score = self.lm.score(" ".join(pieces))
            pp = -1
            if len(pieces):
                pp = perplexity.pp(log_score, len(pieces))
            scores.append(pp)

        # Keep sentences with a valid perplexity below the threshold.
        res = filter(
            lambda pp_s: self.threshold > pp_s[0] > 0,
            zip(scores, unique_sentences),
        )
        return "\n".join(f"{pp}\t{s}" for (pp, s) in res) or None
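
# --- Usage sketch (not part of the original source) -------------------------
# A minimal, hypothetical example of driving ExtractSentences directly; the
# model paths are placeholders. cc_net's perplexity.pp converts a KenLM log10
# score into a perplexity, 10 ** (-log_score / length), so lower values mean
# more fluent sentences.

extractor = ExtractSentences(
    sp_model=Path("models/en.sp.model"),  # placeholder SentencePiece model
    lm_model=Path("models/en.arpa.bin"),  # placeholder KenLM model
    threshold=200.0,  # keep sentences with perplexity in (0, 200)
)
extractor._prepare()  # load the models eagerly for this standalone demo

doc = {"raw_content": "First sentence. Second sentence.\nFirst sentence."}
print(extractor.do(doc))  # "pp\tsentence" lines, duplicates removed, or None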
from kenlm import Model
from numpy import average, std

# LanguageFeatureGenerator is the abstract base class provided by the
# surrounding feature-generation package.


class KenLMFeatureGenerator(LanguageFeatureGenerator):
    '''
    Provide n-gram features by querying a language model via the KenLM Python wrapper.
    @param model: the filename of the compact language model to be loaded
    @type model: str
    @param language: the language that this model has been trained on
    @type language: str
    '''
    feature_names = [
        'kenlm_unk_pos_abs_avg', 'kenlm_unk_pos_abs_std',
        'kenlm_unk_pos_abs_min', 'kenlm_unk_pos_abs_max',
        'kenlm_unk_pos_rel_avg', 'kenlm_unk_pos_rel_std',
        'kenlm_unk_pos_rel_min', 'kenlm_unk_pos_rel_max',
        'kenlm_unk', 'kenlm_unk_len',
        'kenlm_length_avg', 'kenlm_length_std',
        'kenlm_length_min', 'kenlm_length_max',
        'kenlm_probs_avg', 'kenlm_probs_std',
        'kenlm_probs_min', 'kenlm_probs_max',
        'kenlm_probs_pos_max', 'kenlm_probs_pos_min',
        'kenlm_probs_low', 'kenlm_probs_high',
        'kenlm_probs_low_pos_avg', 'kenlm_probs_low_pos_std',
        'kenlm_prob',
        # The same features are also exposed under the generic 'lm_' prefix.
        'lm_unk_pos_abs_avg', 'lm_unk_pos_abs_std',
        'lm_unk_pos_abs_min', 'lm_unk_pos_abs_max',
        'lm_unk_pos_rel_avg', 'lm_unk_pos_rel_std',
        'lm_unk_pos_rel_min', 'lm_unk_pos_rel_max',
        'lm_unk', 'lm_unk_len',
        'lm_length_avg', 'lm_length_std',
        'lm_length_min', 'lm_length_max',
        'lm_probs_avg', 'lm_probs_std',
        'lm_probs_min', 'lm_probs_max',
        'lm_probs_pos_max', 'lm_probs_pos_min',
        'lm_probs_low', 'lm_probs_high',
        'lm_probs_low_pos_avg', 'lm_probs_low_pos_std',
        'lm_prob',
    ]

    def __init__(self, language=None, model=None, bos=True, eos=True, **kwargs):
        '''
        Load the model
        '''
        self.model = Model(model)
        self.language = language
        self.bos = bos
        self.eos = eos

    def get_features_string(self, string):
        total_score = self.model.score(string, bos=self.bos, eos=self.eos)
        partial_scores = self.model.full_scores(string, bos=self.bos, eos=self.eos)

        ngram_lengths = []
        probs = []
        unk_count = 0
        unk_pos = []
        unk_tokens = []

        tokens = string.split()
        tokens_iter = iter(tokens)
        pos = 0
        # full_scores yields one (log10 prob, matched n-gram length, oov flag)
        # tuple per token, plus a final one for the end-of-sentence marker.
        for pos, (prob, ngram_length, oov) in enumerate(partial_scores):
            try:
                token = next(tokens_iter)
            except StopIteration:
                # The end-of-sentence score has no corresponding token.
                token = ""
            if oov:
                unk_count += 1
                unk_pos.append(pos)
                unk_tokens.append(token)
            ngram_lengths.append(ngram_length)
            probs.append(prob)
        pos += 1

        unk_rel_pos = [(unk_pos_item * 1.00) / len(tokens) for unk_pos_item in unk_pos]
        unk_len = sum(len(token) for token in unk_tokens)
        if len(unk_pos) == 0:
            # Avoid min()/max() on empty lists when there are no OOV tokens.
            unk_pos = [0]
            unk_rel_pos = [0]

        features = {
            'unk_pos_abs_avg': average(unk_pos),
            'unk_pos_abs_std': std(unk_pos),
            'unk_pos_abs_min': min(unk_pos),
            'unk_pos_abs_max': max(unk_pos),
            'unk_pos_rel_avg': average(unk_rel_pos),
            'unk_pos_rel_std': std(unk_rel_pos),
            'unk_pos_rel_min': min(unk_rel_pos),
            'unk_pos_rel_max': max(unk_rel_pos),
            'unk': unk_count,
            'unk_len': unk_len,
            'length_avg': average(ngram_lengths),
            'length_std': std(ngram_lengths),
            'length_min': min(ngram_lengths),
            'length_max': max(ngram_lengths),
            'probs_avg': average(probs),
            'probs_std': std(probs),
            'probs_min': min(probs),
            'probs_max': max(probs),
            'probs_pos_max': probs.index(max(probs)),
            'probs_pos_min': probs.index(min(probs)),
            'probs_low': self._standouts(probs, -1),
            'probs_high': self._standouts(probs, +1),
            'probs_low_pos_avg': average(self._standout_pos(probs, -1)),
            'probs_low_pos_std': std(self._standout_pos(probs, -1)),
            'prob': total_score,
        }
        # Expose every feature twice: under the engine-specific 'kenlm_'
        # prefix and the generic 'lm_' prefix (see feature_names).
        features = {
            prefix + name: value
            for prefix in ('kenlm_', 'lm_')
            for name, value in features.items()
        }
        return features

    def _standouts(self, vector, sign):
        '''Count values more than one standard deviation above (sign=+1)
        or below (sign=-1) the mean.'''
        std_value = std(vector)
        avg_value = average(vector)
        standout = 0
        for value in vector:
            if sign * value > sign * avg_value + std_value:
                standout += 1
        return standout

    def _standout_pos(self, vector, sign):
        '''1-based positions of the values counted by _standouts.'''
        std_value = std(vector)
        avg_value = average(vector)
        standout = []
        for pos, value in enumerate(vector, start=1):
            if sign * value > sign * avg_value + std_value:
                standout.append(pos)
        return standout
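
# --- Usage sketch (not part of the original source) -------------------------
# A hypothetical example; "en.blm" is a placeholder for a binarized KenLM
# model file. Each call returns the full feature dictionary for one sentence.

gen = KenLMFeatureGenerator(language="en", model="en.blm")
features = gen.get_features_string("this is a test sentence")
print(features["kenlm_prob"])       # total log10 probability of the sentence
print(features["kenlm_unk"])        # number of out-of-vocabulary tokens
print(features["kenlm_probs_low"])  # tokens scored well below the average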
import string
import warnings
from typing import List

import spacy
from kenlm import Model as KenLMModel
from spacy.tokens import Doc, Span, Token


class KenLMScorer(Scorer):  # Scorer is the project's abstract base class
    name = "kenlm"

    def __init__(self, model=None, path=None, nlp=None, lowercase=True):
        if model:
            self.model = model
        elif path:
            self.model = KenLMModel(path)
        else:
            raise ValueError("KenLMScorer needs either a loaded model or a path.")
        self._check_model()
        if nlp:
            self.nlp = nlp
        else:
            self.nlp = spacy.load("en_core_web_sm")
        self.lowercase = lowercase

    def _check_model(self):
        assert isinstance(self.model, KenLMModel)
        # A working model returns a negative log10 probability.
        assert self.model.score("testing !") < 0

    def preprocess(self, segment):
        """
        SpaCy tokenize + lowercase, ignoring extra whitespace.
        - Doc, Span, Token: use the tokens directly
        - str: convert to a Doc first
        """
        if isinstance(segment, Token):
            # A single Token is not iterable: wrap it in a list.
            tok = [] if segment.is_space else [segment.text]
        elif isinstance(segment, (Doc, Span)):
            # spaCy tokens, skipping whitespace-only tokens
            tok = [token.text for token in segment if not token.is_space]
        elif isinstance(segment, str):
            doc = self.nlp(segment, disable=self.nlp.pipe_names)
            return self.preprocess(doc)
        else:
            raise TypeError(f"Cannot preprocess segment of type {type(segment)!r}")
        if self.lowercase:
            tok = [token.lower() for token in tok]
        return " ".join(tok)

    def __call__(self, segment, score_type="perplexity"):
        text = self.preprocess(segment)
        word_count = len(text.split())
        if word_count < 2:
            warnings.warn(f"Scorer: received {word_count} tokens, expected >= 2.")
            return float("-inf")

        if isinstance(segment, Doc):
            # A full document: assume sentence boundaries on both sides.
            bos = True
            eos = True
        if isinstance(segment, (Span, Token)):
            # A fragment: assume no sentence boundaries.
            bos = False
            eos = False
        if isinstance(segment, str):
            # A raw string: guess from capitalization and final punctuation.
            bos = text.capitalize() == text
            eos = text[-1] in string.punctuation

        # KenLM returns a log10 probability.
        score = self.model.score(text, bos=bos, eos=eos)
        if score_type == "log":
            return score
        elif score_type == "perplexity":
            prob = 10.0 ** score
            return prob ** (-1 / word_count)
        else:
            raise NotImplementedError

    def score_suggestion(self, doc, span, suggestion):
        # Splice the suggestion tokens into the document in place of the span.
        text = " ".join([doc[: span.start].text] + suggestion + [doc[span.end :].text])
        return self(text)

    def sort_suggestions(self, spans: List[Span]) -> List[Span]:
        # Order each span's suggestions by ascending perplexity (best first).
        for span in spans:
            if len(span._.suggestions) > 1:
                span._.suggestions = sorted(
                    span._.suggestions,
                    key=lambda x: self.score_suggestion(
                        span.doc, span, [t.text for t in x]
                    ),
                )
        return spans
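
# --- Usage sketch (not part of the original source) -------------------------
# A hypothetical example; "en.arpa.bin" is a placeholder KenLM model path.
# The scorer accepts spaCy objects or raw strings and returns a perplexity
# by default (lower is better), or the raw log10 probability on request.

nlp = spacy.load("en_core_web_sm")
scorer = KenLMScorer(path="en.arpa.bin", nlp=nlp)

doc = nlp("The quick brown fox jumps over the lazy dog.")
print(scorer(doc))                    # perplexity; bos/eos assumed True for a Doc
print(scorer(doc, score_type="log"))  # raw log10 probability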