def consume(self, word):
    """Pass through to slave predictor. """
    if not self.trgt_map:
        self.slave_predictor.consume(word)
    else:
        self.slave_predictor.consume(utils.common_get(
            self.trgt_map, word, utils.UNK_ID))
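# utils.common_get is the helper every snippet in this section relies on.
# Its implementation is not part of this section; the function below is a
# minimal sketch of the assumed contract, inferred from the call sites: a
# lookup with a default value that works for both dict posteriors and
# array-like posteriors. The name common_get_sketch is hypothetical.
def common_get_sketch(container, key, default):
    """Assumed semantics of utils.common_get (sketch, not the real code)."""
    if isinstance(container, dict):
        return container.get(key, default)
    return container[key] if 0 <= key < len(container) else default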
def predict_next(self):
    """Looks up ngram scores via self.scores. """
    cur_hist_length = len(self.history)
    this_scores = [[] for _ in xrange(cur_hist_length + 1)]
    this_unk_scores = [[] for _ in xrange(cur_hist_length + 1)]
    for pos in xrange(len(self.scores)):
        this_scores[0].append(self.scores[pos])
        this_unk_scores[0].append(self.unk_scores[pos])
        acc = 0.0
        for order, word in enumerate(self.history):
            if pos + order + 1 >= len(self.scores):
                break
            acc += utils.common_get(self.scores[pos + order],
                                    word,
                                    self.unk_scores[pos + order])
            this_scores[order + 1].append(acc + self.scores[pos + order + 1])
            this_unk_scores[order + 1].append(
                acc + self.unk_scores[pos + order + 1])
    combined_scores = []
    combined_unk_scores = []
    for order, (scores, unk_scores) in enumerate(zip(this_scores,
                                                     this_unk_scores)):
        if scores and order + 1 >= self.min_order:
            score_matrix = np.vstack(scores)
            combined_scores.append(logsumexp(score_matrix, axis=0))
            combined_unk_scores.append(utils.log_sum(unk_scores))
    if not combined_scores:
        self.cur_unk_score = 0.0
        return {}
    self.cur_unk_score = sum(combined_unk_scores)
    return sum(combined_scores)
def _combine_posteriors_norm_reduced(self,
                                     non_zero_words,
                                     posteriors,
                                     unk_probs):
    """Combine predictor posteriors according to the normalization
    scheme ``CLOSED_VOCAB_SCORE_NORM_REDUCED``. For more information
    on closed vocabulary predictor score normalization see the
    documentation on the ``CLOSED_VOCAB_SCORE_NORM_*`` vars.

    Args:
        non_zero_words (set): All words with positive probability
        posteriors: Predictor posterior distributions calculated
                    with ``predict_next()``
        unk_probs: UNK probabilities of the predictors, calculated
                   with ``get_unk_probability``

    Returns:
        combined,score_breakdown: like in ``apply_predictors()``
    """
    n_predictors = len(self.predictors)
    score_breakdown_raw = {}
    for trgt_word in non_zero_words:
        score_breakdown_raw[trgt_word] = [
            (utils.common_get(posteriors[idx], trgt_word, unk_probs[idx]), w)
            for idx, (_, w) in enumerate(self.predictors)]
    sums = []
    for idx in xrange(n_predictors):
        sums.append(utils.log_sum([preds[idx][0]
                                   for preds in
                                   score_breakdown_raw.itervalues()]))
    return self._combine_posteriors_with_renorm(score_breakdown_raw, sums)
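# utils.log_sum above is assumed to do a numerically stable log-sum-exp
# over a plain list of log scores, mirroring the scipy logsumexp used on
# score matrices elsewhere in this section. A sketch under that
# assumption (hypothetical, not the actual utils implementation):
import math

def log_sum_sketch(log_scores):
    """log(exp(s_1) + ... + exp(s_n)), stabilized by the max trick."""
    m = max(log_scores)
    if m == float("-inf"):  # all inputs are log(0)
        return m
    return m + math.log(sum(math.exp(s - m) for s in log_scores))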
def decode(self, src_sentence):
    """Scores the current target sentence (forced decoding) with all
    predictors and stores posteriors and UNK scores in
    ``self.last_meta_data``.
    """
    self.initialize_predictors(src_sentence)
    trg_sentence = self.trg_sentences[self.current_sen_id] + [utils.EOS_ID]
    score_breakdown = []
    score = 0.0
    all_posteriors = []
    all_unk_scores = []
    for trg_word in trg_sentence:
        self.apply_predictors_count += 1
        breakdown = []
        posteriors = []
        unk_scores = []
        for (p, w) in self.predictors:
            if isinstance(p, UnboundedVocabularyPredictor):
                posterior = p.predict_next([trg_word])
            else:
                posterior = p.predict_next()
            unk_prob = p.get_unk_probability(posterior)
            pred_score = utils.common_get(posterior, trg_word, unk_prob)
            breakdown.append((pred_score, w))
            score += pred_score * w
            posteriors.append(posterior)
            unk_scores.append(unk_prob)
        all_posteriors.append(posteriors)
        all_unk_scores.append(unk_scores)
        score_breakdown.append(breakdown)
        self.consume(trg_word)
    self.add_full_hypo(core.Hypothesis(trg_sentence, score, score_breakdown))
    self.last_meta_data = {
        "src_sentence": np.array(src_sentence + [utils.EOS_ID]),
        "trg_sentence": np.array(trg_sentence),
        "posteriors": all_posteriors,
        "unk_scores": all_unk_scores
    }
    return self.full_hypos
def _combine_posteriors_norm_none(self, non_zero_words, posteriors,
                                  unk_probs):
    """Combine predictor posteriors according to the normalization
    scheme ``CLOSED_VOCAB_SCORE_NORM_NONE``. For more information on
    closed vocabulary predictor score normalization see the
    documentation on the ``CLOSED_VOCAB_SCORE_NORM_*`` vars.

    Args:
        non_zero_words (set): All words with positive probability
        posteriors: Predictor posterior distributions calculated
                    with ``predict_next()``
        unk_probs: UNK probabilities of the predictors, calculated
                   with ``get_unk_probability``

    Returns:
        combined,score_breakdown: like in ``apply_predictors()``
    """
    combined = {}
    score_breakdown = {}
    for trgt_word in non_zero_words:
        preds = [(utils.common_get(posteriors[idx],
                                   trgt_word,
                                   unk_probs[idx]), w)
                 for idx, (_, w) in enumerate(self.predictors)]
        combined[trgt_word] = self.combi_predictor_method(preds)
        score_breakdown[trgt_word] = preds
    return combined, score_breakdown
def _combine_posteriors_norm_reduced(self,
                                     non_zero_words,
                                     posteriors,
                                     unk_probs,
                                     pred_weights,
                                     top_n=0):
    """Combine predictor posteriors according to the normalization
    scheme ``CLOSED_VOCAB_SCORE_NORM_REDUCED``. For more information
    on closed vocabulary predictor score normalization see the
    documentation on the ``CLOSED_VOCAB_SCORE_NORM_*`` vars.

    Args:
        non_zero_words (set): All words with positive probability
        posteriors: Predictor posterior distributions calculated
                    with ``predict_next()``
        unk_probs: UNK probabilities of the predictors, calculated
                   with ``get_unk_probability``
        pred_weights (list): Predictor weights
        top_n (int): Not implemented!

    Returns:
        combined,score_breakdown: like in ``apply_predictors()``
    """
    n_predictors = len(self.predictors)
    score_breakdown_raw = {}
    for trgt_word in non_zero_words:
        score_breakdown_raw[trgt_word] = [
            (utils.common_get(posteriors[idx], trgt_word, unk_probs[idx]), w)
            for idx, w in enumerate(pred_weights)]
    sums = []
    for idx in xrange(n_predictors):
        sums.append(utils.log_sum([preds[idx][0]
                                   for preds in
                                   score_breakdown_raw.itervalues()]))
    return self._combine_posteriors_with_renorm(score_breakdown_raw, sums)
def _get_stub_prob_bounded(self):
    """get_stub_prob implementation for bounded vocabulary slave
    predictors.
    """
    word = self.words.get(self.word_stub)
    return common_get(self.slave_posterior,
                      word if word else utils.UNK_ID,
                      self.slave_unk)
def consume_single(self, predictor):
    """Scores the next unconsumed token with the current posterior
    and invalidates the posterior.
    """
    if not self.unconsumed:
        return
    if self.posterior is not None:
        self.pending_score += utils.common_get(self.posterior,
                                               self.unconsumed[0],
                                               self.posterior[utils.UNK_ID])
        self.posterior = None
def get_unk_probability(self, posterior):
    """Returns negative infinity if UNK is not in the lattice.
    Otherwise, returns the UNK score.

    Returns:
        float. UNK score, or negative infinity if UNK is not in the
        lattice.
    """
    return utils.common_get(posterior, utils.UNK_ID, utils.NEG_INF)
def _get_stub_prob_unbounded(self):
    """get_stub_prob implementation for unbounded vocabulary slave
    predictors.
    """
    word = self.words.get(self.word_stub)
    if word:
        posterior = self.slave_predictor.predict_next([word])
        return common_get(posterior, word, self.slave_unk)
    return self.slave_unk
def expand(self, decoder):
    """Scores the next token of each predictor stub which does not
    have a full score yet.
    """
    for pidx, (p, _) in enumerate(decoder.predictors):
        stub = self.pred_stubs[pidx]
        if not stub.has_full_score():
            p.set_state(copy.deepcopy(stub.pred_state))
            p.consume(stub.tokens[stub.score_pos - 1])
            posterior = p.predict_next()
            stub.score_next(utils.common_get(
                posterior,
                stub.tokens[stub.score_pos],
                p.get_unk_probability(posterior)))
            stub.pred_state = p.get_state()
def _get_complete_continuations(self, hypo, min_hypo_score):
    """This is a generator which yields the complete continuations
    of ``hypo`` in descending order of score.
    """
    min_score = min_hypo_score - hypo.score
    if min_score > 0.0:
        return
    pred_weights = map(lambda el: el[1], self.predictors)
    # Get initial continuations by searching with predictors separately
    start_posteriors = self._get_word_initial_posteriors(hypo)
    pred_states = self.get_predictor_states()
    keys = {}
    for pidx, (p, w) in enumerate(self.predictors):
        stubs = self._search_full_words(p,
                                        start_posteriors[pidx],
                                        self.toks[pidx],
                                        min_score / w)
        n_added = 0
        for stub in stubs:
            key = self.toks[pidx].tokens2key(stub.tokens)
            if is_key_complete(key):
                if key in keys:
                    # Add to existing continuation
                    prev_stub = keys[key].pred_stubs[pidx]
                    if prev_stub is None or prev_stub.score < stub.score:
                        keys[key].pred_stubs[pidx] = stub
                elif n_added < self.beam_size:
                    # Create new continuation
                    n_added += 1
                    stubs = [None] * len(self.predictors)
                    stubs[pidx] = stub
                    keys[key] = Continuation(hypo, stubs, key)
    # Fill in stubs which are set to None
    for cont in keys.itervalues():
        for pidx in xrange(len(self.predictors)):
            if cont.pred_stubs[pidx] is None:
                stub = PredictorStub(self.toks[pidx].key2tokens(cont.key),
                                     pred_states[pidx])
                stub.score_next(utils.common_get(
                    start_posteriors[pidx],
                    stub.tokens[0],
                    start_posteriors[pidx][utils.UNK_ID]))
                cont.pred_stubs[pidx] = stub
    conts = [(-c.calculate_score(pred_weights), c)
             for c in keys.itervalues()]
    heapq.heapify(conts)
    # Iterate through conts, expand if necessary, yield if complete
    while conts:
        s, cont = heapq.heappop(conts)
        if cont.is_complete():
            yield -s, cont
        else:  # Need to rescore with secondary predictors
            cont.expand(self)
            heapq.heappush(conts,
                           (-cont.calculate_score(pred_weights), cont))
def _get_stub_prob_unbounded(self, ch):
    """get_stub_prob implementation for unbounded vocabulary slave
    predictors. (The LM is an unbounded vocabulary predictor.)
    """
    word = self.words.get(self.word_stub)
    if word:
        if ch in [utils.EOS_ID]:  # end of word char
            posterior = self.slave_predictor.predict_next([word], 1)
        else:  # segmentation boundary: ch in [self.sync_symb]
            posterior = self.slave_predictor.predict_next([word])
        return utils.common_get(posterior, word, self.slave_unk)
    return self.slave_unk
def score(self, token, predictor):
    """Returns a score which can be added if ``token`` is consumed
    next. This is not necessarily the full score but an upper bound
    on it: continuations will have a score lower than or equal to
    this. We only use the current posterior vector and do not
    consume tokens with the wrapped predictor.
    """
    if token and self.unconsumed:
        self.consume_all(predictor)
    s = self.pending_score
    if token:
        s += utils.common_get(self.posterior,
                              token,
                              self.posterior[utils.UNK_ID])
    return s
def _combine_posteriors_norm_none(self,
                                  non_zero_words,
                                  posteriors,
                                  unk_probs,
                                  top_n=0):
    """Combine predictor posteriors according to the normalization
    scheme ``CLOSED_VOCAB_SCORE_NORM_NONE``. For more information on
    closed vocabulary predictor score normalization see the
    documentation on the ``CLOSED_VOCAB_SCORE_NORM_*`` vars.

    Args:
        non_zero_words (set): All words with positive probability
        posteriors: Predictor posterior distributions calculated
                    with ``predict_next()``
        unk_probs: UNK probabilities of the predictors, calculated
                   with ``get_unk_probability``
        top_n (int): If positive, return only top n words

    Returns:
        combined,score_breakdown: like in ``apply_predictors()``
    """
    if isinstance(non_zero_words, xrange) and top_n > 0:
        n_words = len(non_zero_words)
        scaled_posteriors = []
        for posterior, unk_prob, (_, weight) in zip(
                posteriors, unk_probs, self.predictors):
            if isinstance(posterior, dict):
                arr = np.full(n_words, unk_prob)
                for word, score in posterior.iteritems():
                    arr[word] = score
                scaled_posteriors.append(arr * weight)
            else:
                n_unks = n_words - len(posterior)
                if n_unks:
                    posterior = np.concatenate(
                        (posterior, np.full(n_unks, unk_prob)))
                scaled_posteriors.append(posterior * weight)
        combined_scores = np.sum(scaled_posteriors, axis=0)
        non_zero_words = utils.argmax_n(combined_scores, top_n)
    combined = {}
    score_breakdown = {}
    for trgt_word in non_zero_words:
        preds = [(utils.common_get(posteriors[idx],
                                   trgt_word,
                                   unk_probs[idx]), w)
                 for idx, (_, w) in enumerate(self.predictors)]
        combined[trgt_word] = self.combi_predictor_method(preds)
        score_breakdown[trgt_word] = preds
    return combined, score_breakdown
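# utils.argmax_n above is assumed to return the indices of the top_n
# highest combined scores. A sketch of that contract (hypothetical; the
# returned indices are not assumed to be sorted):
import numpy as np

def argmax_n_sketch(arr, n):
    """Indices of the n largest entries of arr, in no particular order."""
    return np.argpartition(arr, -n)[-n:]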
def _combine_posteriors_norm_non_zero(self,
                                      non_zero_words,
                                      posteriors,
                                      unk_probs,
                                      pred_weights,
                                      top_n=0):
    """Combine predictor posteriors according to the normalization
    scheme ``CLOSED_VOCAB_SCORE_NORM_NON_ZERO``. For more information
    on closed vocabulary predictor score normalization see the
    documentation on the ``CLOSED_VOCAB_SCORE_NORM_*`` vars.

    Args:
        non_zero_words (set): All words with positive probability
        posteriors: Predictor posterior distributions calculated
                    with ``predict_next()``
        unk_probs: UNK probabilities of the predictors, calculated
                   with ``get_unk_probability``
        pred_weights (list): Predictor weights
        top_n (int): If positive, return only top n words

    Returns:
        combined,score_breakdown: like in ``apply_predictors()``
    """
    if isinstance(non_zero_words, range) and top_n > 0:
        non_zero_words = Decoder._scale_combine_non_zero_scores(
            len(non_zero_words), posteriors, unk_probs, pred_weights, top_n)
    combined = {}
    score_breakdown = {}
    for trgt_word in non_zero_words:
        preds = [(utils.common_get(posteriors[idx],
                                   trgt_word,
                                   unk_probs[idx]), w)
                 for idx, w in enumerate(pred_weights)]
        combi_score = self.combi_predictor_method(preds)
        if abs(combi_score) <= EPS_P:
            continue
        combined[trgt_word] = combi_score
        score_breakdown[trgt_word] = preds
    return combined, score_breakdown
def consume_all(self, predictor):
    """Consume all unconsumed tokens and update pred_state,
    pending_score, and posterior accordingly.

    Args:
        predictor (Predictor): Predictor instance
    """
    if not self.unconsumed:
        return
    if self.posterior is None:
        self.update_posterior(predictor)
    predictor.set_state(copy.deepcopy(self.pred_state))
    for token in self.unconsumed:
        self.pending_score += utils.common_get(self.posterior,
                                               token,
                                               self.posterior[utils.UNK_ID])
        predictor.consume(token)
        self.posterior = predictor.predict_next()
    self.pred_state = copy.deepcopy(predictor.get_state())
    self.unconsumed = []
def _get_token_score(self, token, predictor):
    """Look up ``token`` in ``self.posterior``. """
    return utils.common_get(self.posterior,
                            token,
                            predictor.get_unk_probability(self.posterior))
def get_unk_probability(self, posterior):
    """Fetch posterior[t2t_unk_id], or 0.0 (log of 1) if the maximum
    number of history sentences is exceeded.
    """
    if len(self.history_sentences) > self.max_sentences:
        return 0.0
    return utils.common_get(posterior, self._t2t_unk_id, utils.NEG_INF)
def get_unk_probability(self, posterior):
    """Returns self.other_scores[n_aligned_words]."""
    return utils.common_get(self.other_scores, self.n_aligned_words, 0.0)
def get_unk_probability(self, posterior):
    """Use the NPLM UNK score if it exists. """
    return utils.common_get(posterior, utils.UNK_ID, utils.NEG_INF)
def _update_slave_vars(self, posterior):
    """Caches the UNK, GO, and EOS scores of the slave predictor."""
    self.slave_unk = self.slave_predictor.get_unk_probability(posterior)
    self.slave_go = common_get(posterior, utils.GO_ID, self.slave_unk)
    self.slave_eos = common_get(posterior, utils.EOS_ID, self.slave_unk)
def get_unk_probability(self, posterior):
    """Fetch posterior[utils.UNK_ID]"""
    return utils.common_get(posterior, utils.UNK_ID, utils.NEG_INF)
def predict_next(self):
    """Returns self.pop_scores[n_aligned_words] for POP and EOS."""
    score = utils.common_get(self.pop_scores, self.n_aligned_words, 0.0)
    return {self.pop_id: score, utils.EOS_ID: score, 6: 0.0, 7: 0.0}
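# Note the division of labor in this predictor interface: predict_next
# may return a sparse dict posterior like the one above, and every word
# missing from it falls back to the score from get_unk_probability via
# utils.common_get in the posterior combination functions.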
def get_unk_probability(self, posterior):
    """Fetch posterior[t2t_unk_id]"""
    return utils.common_get(posterior, self._t2t_unk_id, utils.NEG_INF)