def next_word_by_tokens(self, tokens):
    """Return the top-10 next-word candidates given a token history."""
    candidate = set(self.high_freq)
    state = kenlm.State()
    state2 = kenlm.State()
    # Map the last five tokens to vocabulary ids (0 stands for <unk>).
    tids = [self.voc.get(x, 0) for x in tokens[-5:]]
    candidate |= {self.voc.get(x, 0) for x in tokens}
    if len(tids) < 5:
        # Short history: condition on the beginning-of-sentence symbol.
        self.model.BeginSentenceWrite(state)
    else:
        self.model.NullContextWrite(state)
    # state query: advance the LM state through the context ids
    for tid in tids:
        self.model.BaseScore(state, str(tid), state2)
        state, state2 = state2, state
    # find max candidate: rank every id by its conditional log probability
    ranking = []
    for tid in candidate:
        p = self.model.BaseScore(state, str(tid), state2)
        heapq.heappush(ranking, (p, tid))
    return ['<unk>' if max_id == 0 else self.id2word[max_id][1]
            for max_p, max_id in heapq.nlargest(10, ranking)]
def _update_lm_state(self):
    """Rebuild the LM state from the most recent history words."""
    self.lm_state = kenlm.State()
    tmp_state = kenlm.State()
    self.lm.BeginSentenceWrite(self.lm_state)
    for w in self.history[-6:]:
        self.lm.BaseScore(self.lm_state, w, tmp_state)
        # Swap so lm_state always holds the freshly written state.
        self.lm_state, tmp_state = tmp_state, self.lm_state
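# A minimal standalone sketch of the state-chaining idiom used above
# ('example.arpa' is a placeholder model path, not from the original code).
# Chaining BaseScore calls through State objects reproduces the base-10 log
# probability that model.score(sentence, bos=True, eos=True) returns.
import kenlm

def score_with_states(model, sentence):
    in_state, out_state = kenlm.State(), kenlm.State()
    model.BeginSentenceWrite(in_state)  # context starts at <s>
    total = 0.0
    for word in sentence.split():
        total += model.BaseScore(in_state, word, out_state)
        in_state, out_state = out_state, in_state  # swap instead of realloc
    total += model.BaseScore(in_state, '</s>', out_state)
    return total  # log10, matches model.score(sentence)

# model = kenlm.Model('example.arpa')
# assert abs(score_with_states(model, 'hello world') -
#            model.score('hello world')) < 1e-4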
def vocab_prob_given_ngram(lm, v_prev_ngram, trg_vocab, trg_vocab_i2w,
                           given=False, wid=True):
    if wid:
        v_prev_ngram = [trg_vocab_i2w[i] for i in v_prev_ngram if i != -1]
    # debug(str(v_prev_ngram))
    logps, wids = [], []
    if given:
        # Conditional scores: advance a state through the context first.
        state_in = kenlm.State()
        lm.NullContextWrite(state_in)
        # lm.BeginSentenceWrite(state_in)
        for w in v_prev_ngram:
            ngram_state = kenlm.State()
            lm.BaseScore(state_in, w, ngram_state)
            state_in = ngram_state
        # Score from state_in, which holds the final context state and is
        # defined even when v_prev_ngram is empty.
        for w, idx in trg_vocab.items():
            state_out = kenlm.State()
            log_prob = lm.BaseScore(state_in, w, state_out)
            logps.append(log_prob)
            wids.append(idx)
    else:
        # Joint scores: score the whole n-gram string at once.
        for w, idx in trg_vocab.items():
            new_gram = ' '.join(v_prev_ngram + [w])
            log_prob = lm.score(new_gram, bos=False, eos=False)
            logps.append(log_prob)
            wids.append(idx)
    return logps, wids
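# Hedged usage sketch for vocab_prob_given_ngram; the model path and the
# tiny vocabulary below are illustrative assumptions, not from the source.
# With given=True the values are conditional log10 P(w | context); with
# given=False they are joint log10 scores of the string context + w.
lm = kenlm.Model('example.arpa')  # placeholder path
trg_vocab = {'cat': 0, 'dog': 1, 'sat': 2}
trg_vocab_i2w = {i: w for w, i in trg_vocab.items()}
logps, wids = vocab_prob_given_ngram(lm, ['the', 'black'], trg_vocab,
                                     trg_vocab_i2w, given=True, wid=False)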
def _infer_instance_lm(self, instance: str, *args, **kwargs):
    # Build a list of (score, candidate) alternatives per input token.
    candidates = []
    for incorrect in instance.split():
        if any(c not in self.dictionary.alphabet for c in incorrect):
            candidates.append([(0, incorrect)])
        else:
            res = self.find_candidates(incorrect, k=self.candidates_count,
                                       prop_threshold=1e-6)
            if res:
                candidates.append([(score, candidate)
                                   for candidate, score in res])
            else:
                candidates.append([(0, incorrect)])
    candidates.append([(0, '</s>')])
    state = kenlm.State()
    self.lm.BeginSentenceWrite(state)
    # Beam entries: (total score, LM state, words so far).
    beam = [(0, state, [])]
    for sublist in candidates:
        new_beam = []
        for beam_score, beam_state, beam_words in beam:
            for score, candidate in sublist:
                state = kenlm.State()
                c_score = self.lm.BaseScore(beam_state, candidate, state)
                new_beam.append((beam_score + score + c_score, state,
                                 beam_words + [candidate]))
        # Sort on the score only: kenlm.State objects are not orderable,
        # so plain tuple sorting would raise TypeError on score ties.
        new_beam.sort(key=lambda entry: entry[0], reverse=True)
        beam = new_beam[:self.beam_size]
    score, state, words = beam[0]
    return ' '.join(words[:-1])  # drop the trailing </s>
def test_get_words_with_prefix(self):
    self.vocabtrie.add_word('abc')
    stateIn = kenlm.State()
    stateOut = kenlm.State()
    words_with_probs = self.vocabtrie.get_words_with_prefix(
        'a', self.language_model, stateIn, stateOut).pop(0)
    self.assertEqual(words_with_probs[0], 'abc', 'Returned item is not equal')
def get_state(context_tokens, lm):
    """Advance a KenLM state through `context_tokens` and return it."""
    if context_tokens is None or len(context_tokens) == 0:
        return kenlm.State()
    instate = kenlm.State()
    for w in context_tokens:
        # Allocate a fresh output state each step: reusing a single
        # outstate would alias instate and outstate after the first
        # iteration, making BaseScore read and write the same object.
        outstate = kenlm.State()
        lm.BaseScore(instate, w, outstate)
        instate = outstate
    return instate
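# Example use of get_state ('example.arpa' is a placeholder path). The
# returned state encodes up to (order - 1) trailing context tokens, so a
# follow-up BaseScore yields the conditional log probability.
lm = kenlm.Model('example.arpa')
ctx_state = get_state(['the', 'quick', 'brown'], lm)
out = kenlm.State()
print(lm.BaseScore(ctx_state, 'fox', out))  # log10 P(fox | the quick brown)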
def get_context_state(self, context, model):
    state_in = kenlm.State()
    state_out = kenlm.State()
    context = '<s> ' + context
    context_words = context.split()
    for w in context_words:
        # Advance the state; without this call the returned state would
        # never leave its initial (empty) value.
        model.BaseScore(state_in, w, state_out)
        state_in = state_out
        state_out = kenlm.State()
    return state_in, state_out
def get_context_state(self, context, model, vocab_id):
    state_in = kenlm.State()
    state_out = kenlm.State()
    context = '<s> ' + self.format_context(context, vocab_id)
    context_words = context.split()
    for w in context_words:
        # Advance the state; the score itself is discarded here.
        model.BaseScore(state_in, w, state_out)
        state_in = state_out
        state_out = kenlm.State()
    return state_in, state_out
def compute_word_logprob(model, current_state, target_word):
    """Natural-log probability of `target_word`, scored character by character."""
    word_pred = 0.0
    if target_word == EOS:
        stateOut = kenlm.State()
        word_pred += model.BaseScore(current_state, str(target_word),
                                     stateOut) * LOG10
    else:
        stateIn = current_state
        for token in list(target_word):
            stateOut = kenlm.State()
            word_pred += model.BaseScore(stateIn, token, stateOut) * LOG10
            stateIn = stateOut
    return word_pred
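# Why the LOG10 factor above: KenLM's BaseScore returns base-10 log
# probabilities, while the decoder accumulates natural logs. LOG10 is
# assumed to be ln(10), so multiplying converts log10 p into ln p.
import math

LOG10 = math.log(10)  # ~2.302585
log10_p = -1.5        # a typical BaseScore return value
ln_p = log10_p * LOG10
assert abs(math.exp(ln_p) - 10 ** log10_p) < 1e-12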
def _cond_probs(self, history) -> mx.nd.NDArray:
    startstate = kenlm.State()
    self.lm.NullContextWrite(startstate)
    # Advance the state through the history.
    for word in history:
        endstate = kenlm.State()
        self.lm.BaseScore(startstate, word, endstate)
        startstate = endstate
    # base-10 log score
    # ONLY works on cpu
    return mx.nd.array([
        self.lm.BaseScore(startstate, word, kenlm.State())
        for word in self.vocab
    ])
def get_cond_log_prob(self, sequence):
    """log10 P(sequence[-1] | sequence[:-1]), truncated to the model order."""
    sequence = sequence[-self.lm.order:]
    in_state = kenlm.State()
    self.lm.NullContextWrite(in_state)
    out_state = kenlm.State()
    lm_prob = 0.0  # defined even for an empty sequence
    for word in sequence:
        lm_prob = self.lm.BaseScore(in_state, word, out_state)
        in_state, out_state = out_state, in_state
    # Only the last word's conditional score is returned.
    return lm_prob
def eval_logprobs_for_words(self, state, next_words):
    new_state = kenlm.State()  # scratch state, reused for every candidate
    logprobs = np.empty(len(next_words))
    for next_idx, word_idx in enumerate(next_words):
        logprobs[next_idx] = self.model.base_score_from_idx(
            state, word_idx, new_state)
    logprobs *= LOG10  # convert base-10 log scores to natural log
    return logprobs
def score_seq_by_word(self, state, words):
    """Per-word natural-log scores, advancing the state as it goes."""
    scores = []
    for word in words:
        new_state = kenlm.State()
        scores.append(LOG10 * self.model.base_score_from_idx(
            state, self.model.vocab_index(word), new_state))
        state = new_state
    return scores
def generate_phrase(model, context_toks, length, prefix_logprobs=None, **kw):
    if context_toks[0] == '<s>':
        state, _ = model.get_state(context_toks[1:], bos=True)
    else:
        state, _ = model.get_state(context_toks, bos=False)
    phrase = context_toks[:]
    generated_logprobs = np.empty(length)
    for i in range(length):
        next_words, probs = next_word_probs(
            model, state, phrase[-1], prefix_logprobs=prefix_logprobs, **kw)
        if len(next_words) == 0:
            raise GenerationFailedException
        # The prefix constraint only applies to the first generated word.
        prefix_logprobs = None
        picked_subidx = np.random.choice(len(probs), p=probs)
        picked_idx = next_words[picked_subidx]
        # Advance the LM state by the sampled word.
        new_state = kenlm.State()
        model.model.base_score_from_idx(state, picked_idx, new_state)
        state = new_state
        word = model.id2str[picked_idx]
        phrase.append(word)
        generated_logprobs[i] = np.log(probs[picked_subidx])
    return phrase[len(context_toks):], generated_logprobs
def next_word_logprobs_raw(self, state, prev_word, prefix_logprobs=None):
    bigrams = self.unfiltered_bigrams
    if prefix_logprobs is not None:
        next_words = []
        prior_logprobs = []
        for logprob, prefix in prefix_logprobs:
            for word, word_idx in self.vocab_trie.items(prefix):
                next_words.append(word_idx)
                prior_logprobs.append(logprob)
    else:
        next_words = bigrams.get(prev_word, [])
        if len(next_words) == 0:
            next_words = self.vocab
        next_words = [w for w in next_words
                      if w != self.eos and w != self.eop]
    if len(next_words) == 0:
        return [], np.zeros(0)
    new_state = kenlm.State()
    logprobs = np.empty(len(next_words))
    for next_idx, word in enumerate(next_words):
        # next_words holds vocabulary ids (callers such as
        # get_conditional_logprobs below look entries up by id),
        # so score by id rather than by surface string.
        logprob = self.model.base_score_from_idx(state, word, new_state)
        if prefix_logprobs is not None:
            logprob += prior_logprobs[next_idx]
        logprobs[next_idx] = logprob
    logprobs *= LOG10
    return next_words, logprobs
def score_seq_by_word(self, state, words):
    scores = []
    for word in words:
        new_state = kenlm.State()
        scores.append(LOG10 * self.model.BaseScore(state, word, new_state))
        state = new_state
    return scores
def score_seq(self, state, words):
    """Total natural-log score of `words`, plus the final state."""
    score = 0.
    for word in words:
        new_state = kenlm.State()
        score += self.model.BaseScore(state, word, new_state)
        state = new_state
    return score * LOG10, state
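# Hedged usage of score_seq / score_seq_by_word: `scorer` stands for any
# instance whose self.model is a KenLM model (an assumption of this sketch).
# score_seq also returns the final state, so scoring can be resumed from
# where it stopped.
start = kenlm.State()
scorer.model.BeginSentenceWrite(start)
total_ln, end_state = scorer.score_seq(start, ['the', 'cat'])
more_ln, _ = scorer.score_seq(end_state, ['sat', 'down'])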
def __init__(self, uid, name, order, path, bos, eos):
    """
    A language model scorer (KenLM only).

    :param uid: unique id (int)
    :param name: prefix for features
    :param order: n-gram order
    :param path: path to a kenlm model (ARPA or binary)
    :param bos: a Terminal symbol representing the left boundary of the sentence
    :param eos: a Terminal symbol representing the right boundary of the sentence
    """
    super(StatelessLM, self).__init__(uid, name)
    self._order = order
    self._bos = bos
    self._eos = eos
    self._path = path
    self._model = klm.Model(path)
    # Two features: log probability and OOV count.
    self._features = (name, '{0}_OOV'.format(name))
    # get the initial state
    self._initial = klm.State()
    self._model.BeginSentenceWrite(self._initial)
def __call__(self, token, state):
    """
    Args:
        token (th.Tensor): V, previous tokens
        state (list[list[State]] or None): LM states
    Return:
        score (Tensor): N x V, LM scores
        state (list[list[State]]), new states
    """
    device = token.device
    token = token.tolist()
    if state is None:
        init_state = kenlm.State()
        self.ngram_lm.BeginSentenceWrite(init_state)
        prev_state = [init_state for _ in range(len(token))]
    else:
        assert len(token) == len(state)
        prev_state = [s[token[i]] for i, s in enumerate(state)]
    scores, states = [], []
    # Use a distinct loop variable to avoid shadowing the `state` argument.
    for prev in prev_state:
        score, new_state = self._step(prev)
        scores.append(score)
        states.append(new_state)
    scores = th.stack(scores).to(device)
    return scores, states
def get_conditional_logprobs(base_model, context_toks, seq,
                             prefix_logprobs=None):
    import kenlm
    context_toks = ['<s>'] + context_toks
    state, _ = base_model.get_state(context_toks, bos=True)
    possible_word_indices = []
    offset_of_chosen_word = []
    base_logprobs = []
    for i, word in enumerate(seq):
        word_idx = base_model.model.vocab_index(word)
        next_words, logprobs = base_model.next_word_logprobs_raw(
            state, context_toks[-1] if i == 0 else seq[i - 1],
            prefix_logprobs=prefix_logprobs)
        # At this point we're past the first word, so no more prefix logprobs.
        prefix_logprobs = None
        # Store results.
        #
        # Note that since what we're scoring was always generated by the main
        # model, there will never be an <unk>.
        possible_word_indices.append(np.asanyarray(next_words))
        offset_of_chosen_word.append(next_words.index(word_idx))
        base_logprobs.append(logprobs)
        # Advance the model state.
        new_state = kenlm.State()
        base_model.model.base_score_from_idx(state, word_idx, new_state)
        state = new_state
    return possible_word_indices, offset_of_chosen_word, base_logprobs
def score_seq(self, state, words):
    score = 0.
    for word in words:
        new_state = kenlm.State()
        score += self.model.base_score_from_idx(
            state, self.model.vocab_index(word), new_state)
        state = new_state
    return score * LOG10, state
def expand_token(self, prev: Hypothesis, token: str,
                 token_score: float) -> Hypothesis:
    if prev.lm_state is None:
        # First expansion: start from the beginning-of-sentence state.
        prev_state = kenlm.State()
        self.model.BeginSentenceWrite(prev_state)
    else:
        prev_state = prev.lm_state
    new_lm_state = kenlm.State()
    token_lm_score = self.model.BaseScore(prev_state, token, new_lm_state)
    hyp = copy.deepcopy(prev)
    hyp.expand_by_token(token, token_score, token_lm_score, new_lm_state)
    return hyp
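# A minimal, hypothetical Hypothesis compatible with expand_token above;
# the real class in the source presumably carries more bookkeeping.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Hypothesis:
    tokens: List[str] = field(default_factory=list)
    score: float = 0.0
    lm_state: Optional[kenlm.State] = None

    def expand_by_token(self, token, token_score, token_lm_score, lm_state):
        self.tokens.append(token)
        self.score += token_score + token_lm_score
        self.lm_state = lm_state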
def get_score(self, cand_parents, cand_syms, lang_model):
    """
    Score every possible next symbol with the saved language model.

    Args:
        cand_parents: last selected top candidates
        cand_syms: last selected top char indices
        lang_model: the language model
    Return:
        scores: the LM scores
    """
    scale = 1.0 / np.log10(np.e)  # ln(10): converts log10 scores to ln
    num_cands = len(cand_syms)
    scores = np.zeros((num_cands, self.num_syms))
    new_states = np.zeros((num_cands, self.num_syms), dtype=object)
    # Map symbol indices to the LM's character vocabulary.
    chars = [str(x) for x in range(self.num_syms)]
    chars[self.sos] = "<s>"
    chars[self.eos] = "</s>"
    chars[0] = "<space>"
    for i in range(num_cands):
        parent = cand_parents[i]
        kenlm_state_list = self.cand_kenlm_states[parent]
        kenlm_state = kenlm_state_list[cand_syms[i]]
        for sym in range(self.num_syms):
            char = chars[sym]
            out_state = kenlm.State()
            score = scale * lang_model.BaseScore(kenlm_state, char, out_state)
            scores[i, sym] = score
            new_states[i, sym] = out_state
    self.cand_kenlm_states = new_states
    return scores
def reset(self):
    """Reset the LM states before predicting on a new sequence."""
    kenlm_state = kenlm.State()
    self.lang_model.BeginSentenceWrite(kenlm_state)
    # One row of identical <s> states, one column per symbol.
    self.cand_kenlm_states = np.array([[kenlm_state] * self.num_syms])
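# Hedged usage of reset/get_score: `scorer` is an instance of the class
# above (assumed to already hold scorer.lang_model), and 'char_lm.bin' is
# a placeholder character LM. reset() seeds one row of <s> states; each
# get_score call then advances every candidate state by one symbol.
scorer.reset()
char_lm = kenlm.Model('char_lm.bin')
scores = scorer.get_score(cand_parents=[0], cand_syms=[scorer.sos],
                          lang_model=char_lm)
# scores[i, sym] holds the natural-log LM score of emitting `sym` next.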
def main():
    vt = VocabTrie()
    state_in = kenlm.State()
    state_out = kenlm.State()
    model = kenlm.LanguageModel('resources/lm_word_medium.kenlm')
    vt.add_word('hel')
    vt.add_word('help')
    vt.add_word('hi')
    vt.add_word('hello')
    vt.add_word('hellboy')
    vt.add_word('helen')
    print(vt.contains_word('hell'))
    print(vt.get_words_with_prefix('he', model, state_in, state_out))
def featurize_yield(self, projection):
    """
    :param projection: sequence of Terminal objects
    :return: weight
    """
    qa = klm.State()
    qb = klm.State()
    self._model.BeginSentenceWrite(qa)
    log_prob = 0.0
    oov = 0.0
    for word in projection:
        r = self._model.BaseFullScore(qa, word.surface, qb)
        log_prob += r.log_prob
        oov += int(r.oov)
        qa, qb = qb, qa
    # Close the sentence with the end-of-sentence symbol.
    log_prob += self._model.BaseScore(qa, self._eos.surface, qb)
    return np.array([log_prob, oov])
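# BaseFullScore, used above, returns a FullScoreReturn whose .log_prob,
# .ngram_length and .oov fields drive the OOV count ('example.arpa' is a
# placeholder path; the probe word is deliberately unlikely to be known).
m = klm.Model('example.arpa')
qa, qb = klm.State(), klm.State()
m.BeginSentenceWrite(qa)
r = m.BaseFullScore(qa, 'zzzunseen', qb)
print(r.log_prob, r.ngram_length, r.oov)  # oov is True for unknown words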
def batch_advance(lm, inner_states, w, out_states):
    """Advance every state in `inner_states` by word `w`."""
    probs = []
    for state in inner_states:
        out_states.append(kenlm.State())
        probs.append(lm.BaseScore(state, w, out_states[-1]))
    return probs
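# Usage sketch for batch_advance ('example.arpa' is a placeholder): advance
# a batch of hypotheses by the same word and collect per-hypothesis scores.
lm = kenlm.Model('example.arpa')
s0 = kenlm.State()
lm.BeginSentenceWrite(s0)
out_states = []
probs = batch_advance(lm, [s0], 'hello', out_states)
# probs[i] is log10 P('hello' | state_i); out_states[i] is the new state.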
def featurize_final(self, context):
    """
    :param context: a state
    :return: a vector with the </s> log probability and OOV flag
    """
    out_state = klm.State()
    score = self._model.BaseFullScore(context, self._eos.surface, out_state)
    return np.array([score.log_prob, float(score.oov)])
def featurize(self, word, context):
    """
    :param word: a Terminal
    :param context: a state
    :returns: weight, state
    """
    out_state = klm.State()
    score = self._model.BaseFullScore(context, word.surface, out_state)
    return np.array([score.log_prob, float(score.oov)]), out_state
def get_trans_prob_use_kenlm(self, *args):
    '''
    Get the transition probability. Scores are log-scaled, so the closer
    a score is to 0, the higher the transition probability.
    :param args: candidate words, e.g. 中国/钟国/忠国
    :return: e.g. -9.256282567977905
    '''
    word_list = args
    state = kenlm.State()
    state1 = kenlm.State()
    self.model.NullContextWrite(state)
    acc = 0.0
    for index, word in enumerate(word_list):
        # Alternate the two states so each word is scored in the
        # context left behind by the previous one.
        if index % 2 == 0:
            acc += self.model.BaseScore(state, word, state1)
        else:
            acc += self.model.BaseScore(state1, word, state)
    return acc
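# Contrast of the two initialisers seen throughout these snippets
# ('example.arpa' is a placeholder path): NullContextWrite scores the first
# word with no context at all, while BeginSentenceWrite conditions it on <s>.
m = kenlm.Model('example.arpa')
null_s, bos_s, out = kenlm.State(), kenlm.State(), kenlm.State()
m.NullContextWrite(null_s)
m.BeginSentenceWrite(bos_s)
print(m.BaseScore(null_s, 'hello', out))  # log10 P(hello)
print(m.BaseScore(bos_s, 'hello', out))   # log10 P(hello | <s>)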