def test_new_model_order():
    lm = ARPAModelSimple()
    assert lm.order() is None
    for p in PARSERS:
        lm = arpa.loadf(TEST_ARPA, parser=p)[0]
        assert lm.order() == 5
def load_adaptation_sample(filename):
    if filename.endswith(".gz"):
        A_models = arpa.load(gzip.open(filename, mode='rt'))
    else:
        A_models = arpa.loadf(filename)
    A = A_models[0]
    return A
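# Hedged usage sketch for the loader above (filenames are illustrative):
#   A = load_adaptation_sample("adapt.arpa.gz")  # gzip-compressed ARPA file
#   A = load_adaptation_sample("adapt.arpa")     # plain-text ARPA file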
def test_loadf_dumpf():
    lm = arpa.loadf(TEST_ARPA)[0]
    out = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
    arpa.dumpf(lm, out.name)
    out.close()
    assert filecmp.cmp(TEST_ARPA, out.name, shallow=False)
    os.unlink(out.name)
def get_sentence_score(sentence_indexes, models, test_set, probabilities, guesses,
                       alpha_start, alpha_transition):
    logger = logging.getLogger('recognizer')
    # print("Alpha start {}".format(alpha_start))
    # print("Alpha transition {}".format(alpha_transition))
    top_best = 3
    lm_models = arpa.loadf("ukn.3.lm")
    lm = lm_models[0]
    emission_scores = get_emission_scores(sentence_indexes, models, test_set)
    if alpha_start and alpha_transition:
        guess = get_viterbi_sentence(emission_scores, alpha_start, alpha_transition)
    else:
        guess = list(emission_scores.idxmax(axis=0))
    guesses.extend(guess)
    word_probabilities = [v for k, v in emission_scores.to_dict().items()]
    probabilities.extend(word_probabilities)
    logger.debug("Guess {}".format(guess))
    logger.debug("Probability {}".format(word_probabilities))
    return emission_scores
def __init__(self, labels, model_path=None, alpha=0.5, beta=0.5, cutoff_top_n=40,
             cutoff_prob=-2.1, beam_width=64, blank_id=0, space_id=60, vocab=None,
             trie_path=None):
    self.NEG_INF = -float("inf")
    self.labels = labels
    self.model_path = model_path
    self.beam_size = beam_width
    self.alpha = alpha
    self.beta = beta
    self.blank_id = blank_id
    self.cutoff_top_n = cutoff_top_n
    self.cutoff_prob = cutoff_prob
    self.vocab = vocab
    self.space_id = space_id
    self.lm = arpa.loadf(self.model_path)[0]
    self.trie_path = trie_path
    self.trie_root = CustomUnpickler(open(self.trie_path, 'rb')).load()
def load_realigning_LM(self):
    self.N_range = (
        self.realigning_lm_params['min_number_of_words'],
        self.realigning_lm_params['max_number_of_words'],
    )
    self.stt_end_tokens = ['</s>', '<s>']
    logging.info(f"Loading LM for realigning: {self.realigning_lm_params['arpa_language_model']}")
    return arpa.loadf(self.realigning_lm_params['arpa_language_model'])[0]
def test_loadf_dumpf_write():
    for p in PARSERS:
        for suf in ['.arpa', '.gz']:
            # read
            lm1 = arpa.loadf(TEST_ARPA, parser=p)[0]
            # write
            out1 = tempfile.NamedTemporaryFile(mode='w+t', suffix=suf, delete=False)
            arpa.dumpf(lm1, out1.name)
            out1.close()
            # read again
            lm2 = arpa.loadf(out1.name, parser=p)[0]
            # write again
            out2 = tempfile.NamedTemporaryFile(mode='w+t', suffix='.arpa', delete=False)
            arpa.dumpf(lm2, out2.name)
            out2.close()
            # compare
            assert filecmp.cmp(TEST_ARPA, out2.name, shallow=False)
            os.unlink(out2.name)
def test_manual_contains():
    lm = arpa.loadf(TEST_ARPA)[0]
    assert 'foo' in lm
    with pytest.raises(ValueError):
        assert ('foo', ) in lm
    with pytest.raises(ValueError):
        assert 'a little' in lm
    with pytest.raises(ValueError):
        assert ('a', 'little') in lm
def recognize_SLM(models: dict, test_set: SinglesData):
    # recognizer with SLM
    SLMmodel = arpa.loadf("ukn.3.lm")[0]
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    Xlengths = test_set.get_all_Xlengths()
    wordlist = test_set.wordlist
    # Iterate the testing data.
    for video in test_set.sentences_index:
        # Create a word sequence for each video
        word_sequence = [wordlist[index] for index in test_set.sentences_index[video]]
        # Enumerate words in a video
        for i, word_index in enumerate(test_set.sentences_index[video]):
            score_dict = {}
            best_word = ""
            max_score = float('-inf')
            X, lengths = Xlengths[word_index]
            prefix = ""
            """
            # for 2-gram and 3-gram
            if i > 1:
                prefix = word_sequence[i-2: i]
            elif i == 1:
                prefix = word_sequence[i - 1]
            """
            # for 2-gram
            if i > 0:
                prefix = word_sequence[i - 1]
            # Iterate all possible words in models, and calculate the log likelihood
            for key in models:
                try:
                    likelihood = SLMmodel.log_p(prefix + " " + key)
                except Exception:
                    likelihood = 0
                try:
                    model = models[key]
                    logL = model.score(X, lengths) + SLMmodel.log_p(key)
                    score_dict[key] = logL
                    # store the best guess words
                    if logL > max_score:
                        max_score = logL
                        best_word = key
                except Exception:
                    score_dict[key] = -1
            # add prob dictionary and the best guess of each word
            probabilities.append(score_dict)
            guesses.append(best_word)
    return probabilities, guesses
def _test_log_s(sentences, sos, eos):
    lm_me = arpa.loadf(TEST_ARPA)[0]
    lm_ken = kenlm.LanguageModel(TEST_ARPA)
    results_me = []
    results_ken = []
    for sentence in sentences:
        score_me = lm_me.log_s(sentence, sos=sos, eos=eos)
        score_ken = lm_ken.score(sentence, bool(sos), bool(eos))
        results_me.append(score_me)
        results_ken.append(score_ken)
    assert all(round(m - k, 2) == 0 for m, k in zip(results_me, results_ken))
def test_loadf_dumpf_read():
    for p in PARSERS:
        for src in [TEST_ARPA, TEST_ARPA_GZ]:
            # read
            lm = arpa.loadf(src, parser=p)[0]
            # write
            out = tempfile.NamedTemporaryFile(mode='w+t', suffix='.arpa', delete=False)
            arpa.dumpf(lm, out.name)
            out.close()
            # compare
            assert filecmp.cmp(TEST_ARPA, out.name, shallow=False)
            os.unlink(out.name)
def _test_log_p(queries):
    lm_me = arpa.loadf(TEST_ARPA)[0]
    lm_ken = kenlm.LanguageModel(TEST_ARPA)
    results_me = []
    results_ken = []
    for ngram in queries:
        prob_me = lm_me.log_p(ngram)
        prob_ken = list(lm_ken.full_scores(' '.join(ngram), False, False))[-1][0]
        results_me.append(prob_me)
        results_ken.append(prob_ken)
    assert all(round(m - k, 4) == 0 for m, k in zip(results_me, results_ken))
def recognize(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and value is Log Likelihood
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... }]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    # num_cores = multiprocessing.cpu_count()
    startTime = int(round(time.time() * 1000))
    p = os.path.join('data', 'ukn.3.lm')
    language_model = arpa.loadf(p)[0]
    idx = 0
    # Parallel(n_jobs=num_cores)(process_by_sq(models, X, length, probabilities, guesses)
    #                            for _, (X, length) in test_set.get_all_Xlengths().items())
    for _, (X, length) in test_set.get_all_Xlengths().items():
        probability_by_word = {}
        bare_prob = {}
        sq = [si for k, si in test_set.sentences_index.items() if idx in si]
        for word, model in models.items():
            w_score = -float(math.inf)
            try:
                w_score = model.score(X, length)
            except Exception:
                pass
            bare_prob[word] = w_score
            probability_by_word[word] = w_score + calculate_with_LM(
                guesses, idx, language_model, sq[0], word)
        idx += 1
        best_guess = max(probability_by_word.keys(),
                         key=(lambda w: probability_by_word[w]))
        best_bare = max(bare_prob.values())
        # append as a tuple: list.append() takes a single argument
        guesses.append((best_guess, best_bare))
        probabilities.append(probability_by_word)
    endTime = int(round(time.time() * 1000))
    print(endTime - startTime, 'ms')
    return probabilities, [g[0] for g in guesses]
def test_input_equality():
    lm = ARPAModelSimple()
    with pytest.raises(KeyError):
        assert lm.p('foo') == lm.p(('foo', ))
    with pytest.raises(KeyError):
        assert lm.p('xxx') == lm.p(('xxx', ))
    with pytest.raises(KeyError):
        assert lm.p('a little') == lm.p(('a', 'little'))
    with pytest.raises(KeyError):
        assert lm.p('xxx little') == lm.p(('xxx', 'little'))
    lm = arpa.loadf(TEST_ARPA)[0]
    assert lm.p('foo') == lm.p(('foo', ))
    assert lm.p('xxx') == lm.p(('xxx', ))
    assert lm.p('a little') == lm.p(('a', 'little'))
    assert lm.p('xxx little') == lm.p(('xxx', 'little'))
def load_background(filename):
    if filename.endswith(".gz"):
        B_models = arpa.load(gzip.open(filename, mode='rt'))
    else:
        B_models = arpa.loadf(filename)
    B = B_models[0]  # ARPA files may contain several models.

    # We can recover f_B_star (i.e., discounted probabilities) from interpolated
    # probabilities. As B is an interpolated model, i.e.,
    #
    #   p_B(w|h) = f_B_star(w|h) + bow_B(h) * p_B(w|h')
    #
    # we have
    #
    #   f_B_star(w|h) = p_B(w|h) - bow_B(h) * p_B(w|h')
    #
    # where h' = h[1:]
    f_B_star = dict()
    for n in range(2, B.order() + 1):
        print("%d-gram" % n)
        # progress_count = 0
        for e in B._entries(n):
            # entry format: (log10(prob), hw, log10(bow))
            hw = e[1]
            h = hw[:-1]
            h_prime_w = hw[1:]
            f_B_star[hw] = B._base**float(e[0]) - B._base**(
                float(B._bos[h]) + float(log_p(B, h_prime_w)))
            # assert f_B_star[hw] >= 0
            # progress_count += 1
            # if progress_count % 2000 == 0:
            #     print(progress_count)

    # Index structure:
    #   len(h) --> h --> {w | hw is seen in the corpus}, where len(h) >= 1
    B_hist_index = [defaultdict(list) for i in range(B.order())]
    for n in range(2, B.order() + 1):
        print("%d-gram" % n)
        # progress_count = 0
        for e in B._entries(n):
            hw = e[1]
            h = hw[:-1]
            w = hw[-1]
            B_hist_index[len(h)][h].append(w)
            # progress_count += 1
            # if progress_count % 2000 == 0:
            #     print(progress_count)
    return B, f_B_star, B_hist_index
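# A worked check of the recovery above, with illustrative (not real) numbers:
# with base 10, suppose log10 p_B(w|h) = -1.0, log10 bow_B(h) = -0.3, and
# log10 p_B(w|h') = -1.2. Then
#   f_B_star(w|h) = 10**-1.0 - 10**(-0.3 + -1.2) = 0.1 - 10**-1.5 ≈ 0.0684
# which is exactly what the loop computes via
#   B._base**float(e[0]) - B._base**(float(B._bos[h]) + float(log_p(B, h_prime_w)))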
def __init__(self, ngram, vocab=None, base=np.exp(1)):
    self.lm = arpa.loadf(ngram)[0]
    self.base = base
    with open(ngram) as f:
        for line in f:
            if line.startswith("\\1-grams:"):
                break
            if line.startswith("ngram "):
                self.n = int(line.replace("ngram ", "").split("=")[0])
    self.context = ["<s>"]
    self.vocab = None
    is_first_line = True
    is_subword_nmt = True
    if vocab:
        self.vocab = list()
        with open(vocab) as f:
            for line in f:
                w = line.strip()
                if is_first_line:
                    is_first_line = False
                    # compare strings with ==, not identity ('is')
                    if w == "{":
                        is_subword_nmt = True
                        continue
                    else:
                        is_subword_nmt = False
                if is_subword_nmt:
                    if w == "}":
                        break
                    else:
                        p = w.split(": ")
                        v = p[0][1:-1]
                        i = int(p[1][:-1])
                        if v == "<s>":
                            continue
                        else:
                            self.vocab.append(v)
                            if self.vocab[i] != v:
                                print("Wrong word index!! index: %d vocab in the file: %s vocab in the list: %s"
                                      % (i, v, self.vocab[i]))
                else:
                    self.vocab.append(w.split("\t")[0].split(" ")[0].strip())
        print("%d vocabs were loaded for shallow fusion w/ arpa" % len(self.vocab))
def error_detector(lmAdaptPath, sentence, threshold):
    # Reading input language model.
    models = arpa.loadf(lmAdaptPath)
    # ARPA files may contain several models.
    lm = models[0]
    words = sentence.split()
    scores = dict(zip(words, [0] * len(words)))
    n_grams = list(ngrams(words, 3))
    for n_gram in n_grams:
        prob = lm.p(n_gram)
        if prob < threshold:
            for word in n_gram:
                scores[word] += 1
    sent_errors = ['0']
    for n_gram in n_grams:
        if scores[n_gram[1]] > 1:
            sent_errors.append('1')
        else:
            sent_errors.append('0')
    sent_errors.append('0')
    return " ".join(sent_errors)
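# Hedged usage sketch for error_detector (path, sentence, and threshold are illustrative):
#   flags = error_detector("adapt.arpa", "this is a test sentence", threshold=1e-4)
#   print(flags)  # e.g. "0 0 1 1 1 0" -- one flag per word, 1 = suspected error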
import arpa
import re
import pickle
import pandas as pd
import itertools
from functools import reduce
import numpy as np
from asl_data import SinglesData
from asl_utils import show_errors

ukn3 = arpa.loadf("lm/ukn.3.lm")
lm = ukn3[0]
probabilities = pickle.load(open("data/probabilities.pkl", "rb"))
test_set = pickle.load(open("data/test_set.pkl", "rb"))
df_probs = pd.DataFrame(probabilities)
# print(df_probs.head())
lm_factor = 20.0


def score_with_lm1():
    for video_num, indices in test_set.sentences_index.items():
        # visual_model_guesses = df_probs.iloc[indices, :].idxmax(axis=1)
        ngram_indices = []
        for sentence_idx, word_idx in enumerate(indices):
            if ngram_indices:
                ngram_prefix = df_probs.iloc[ngram_indices, :].idxmax(axis=1).tolist()
            row = df_probs.iloc[word_idx, :]
help="If set, save errors in pickle format", action='store_true') args = parser.parse_args() input = args.input lm = args.lm n = args.n threshold = args.threshold print_words = args.print_words save = args.save # Get sentences of asr output. sentences = get_hypothesis(input, True) # Reading input language model. models = arpa.loadf(lm) # ARPA files may contain several models. lm = models[0] # For each sentence find words that have low propability # and keep a score of them. errors = [] for sent in sentences: words = sent.split() scores = dict(zip(words, [0] * len(words))) n_grams = list(ngrams(words, n)) for n_gram in n_grams: prop = lm.p(n_gram) if prop < threshold: for word in n_gram: scores[word] += 1 sent_errors = [0]
#!/usr/bin/env python3
import sys
import arpa
import os
import math
import numpy as np

lmfile = sys.argv[1]
print("lmfile=", lmfile)
lms = arpa.loadf(lmfile)
lm = lms[0]


def log_p(B, ngram):
    # words = B._check_input(ngram)
    # if B._unk:
    #     words = B._replace_unks(words)
    # return log_p_raw(B, words)
    return log_p_raw(B, ngram)


def log_p_raw(B, ngram):
    ret = B._ps.get(ngram, None)
    if ret is not None:
        return ret
    else:
        # if len(ngram) == 1:
        #     raise KeyError
                best_sentence = s
            except Exception:
                continue
        if best_sentence is not None:
            sentence_guesses[video_num] = best_sentence

    errors = 0
    for video_num in sentence_guesses:
        correct_sentence = [test_set.wordlist[i]
                            for i in test_set.sentences_index[video_num]]
        recognised_sentence = sentence_guesses[video_num]
        for c, r in zip(correct_sentence, list(recognised_sentence)):
            if c != r:
                errors += 1
        # print('Correct {}'.format(correct_sentence))
        # print('Recognised {}'.format(recognised_sentence))
        # print()
    print(float(errors) / float(178))


if __name__ == '__main__':
    # use n-gram
    models = train_all_words(features_custom, all_model_selectors['SelectorBIC'])
    test_set = asl.build_test(features_custom)
    # load 3-gram language model
    lm_models = arpa.loadf(os.path.join('data', 'n-grams', 'ukn.3.lm'))
    lm = lm_models[0]
    recognize_ngram(lm, models, test_set)
def recognize_ngram(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and value is Log Likelihood
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... }]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    # This import is necessary to be able to read language models in ARPA files;
    # it can be installed with: pip install arpa
    import arpa
    import itertools
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # TODO implement the recognizer
    probabilities = []
    guesses = []
    probabilities_dict = {}
    guesses_dict = {}
    # load the language model
    lm_models = arpa.loadf('lm3_sri.lm')
    lm = lm_models[0]  # ARPA files may contain several models.
    # for word_id in range(0, len(test_set.get_all_Xlengths())):
    #     probabilities_dict[word_id] = 'None'
    #     guesses_dict[word_id] = 'None'
    for video_index in test_set._load_sentence_word_indices():
        word_ids = test_set._load_sentence_word_indices()[video_index]
        video_probs = collections.OrderedDict()
        for word_id in word_ids:
            current_sequence = test_set.get_item_sequences(word_id)
            current_length = test_set.get_item_Xlengths(word_id)
            probs = {}
            for word, model in models.items():
                try:
                    probs[word] = model.score(current_sequence[0], current_length[1])
                except Exception:
                    print('failed for word_id {} and word: {}'.format(word_id, word))
                    probs[word] = float('-inf')
            if len(word_ids) > 5:
                top_words = sorted(probs, key=probs.get, reverse=True)[:3]
            elif len(word_ids) == 5:
                top_words = sorted(probs, key=probs.get, reverse=True)[:4]
            else:  # len(word_ids) < 5
                top_words = sorted(probs, key=probs.get, reverse=True)[:6]
            probabilities_dict[word_id] = probs
            video_probs[word_id] = {x: probs[x] for x in top_words}
        sentences = list(itertools.product(*video_probs.values()))
        sentences_prob = []
        for sentence_index in range(len(sentences)):
            sentence = sentences[sentence_index]
            visual_prob = 0
            word_index = 0
            for word_id in word_ids:
                word_id_probs = video_probs[word_id]
                visual_prob = visual_prob + word_id_probs[sentence[word_index]]
                word_index = word_index + 1
            sentence_string = ''
            for word in sentence:
                sentence_string = sentence_string + ' ' + word
            try:
                language_prob = lm.log_s(sentence_string.strip())
                alpha = 1
                beta = 25
                sentence_prob = alpha * visual_prob + beta * language_prob
                sentences_prob.append(sentence_prob)
                print(language_prob)
            except Exception:
                print('no language model score for sentence: {}'.format(sentence_string.strip()))
                sentences_prob.append(float('-inf'))
        # find the sentence with the highest prob, then extract word_ids
        max_sentence = sentences[sentences_prob.index(max(sentences_prob))]
        word_index = 0
        for word_id in word_ids:
            guesses_dict[word_id] = max_sentence[word_index]
            word_index = word_index + 1
    for key in sorted(guesses_dict):
        probabilities.append(probabilities_dict[key])
        guesses.append(guesses_dict[key])
    return probabilities, guesses
def recognize_unigram(models: dict, test_set: SinglesData, lm_scaling_factor: int):
    """ Recognize test word sequences from word models set

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :param lm_scaling_factor: int
        multiply the language model probability by this value so it is on a
        closer scale to the HMM log_ls probability
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and value is Log Likelihood
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... }]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    # read in the language model from the ukn.1.lm ARPA file provided
    # note: lm stands for language model
    try:
        language_models = arpa.loadf('./data/ukn.1.lm')
        lm = language_models[0]  # ARPA files may contain several models.
    except Exception:
        print("Problem reading the language model from the ARPA file")
        raise
    # implement the recognizer
    # print("Total Length: {}".format(test_set.num_items))
    for video_num in test_set.sentences_index:
        for word_id in test_set.sentences_index[video_num]:
            log_ls = {}  # dict of log likelihoods per word
            best_score = float("-inf")  # best log_l thus far
            best_guess = None  # best guess for what the test-set word could be
            x, lengths = test_set.get_item_Xlengths(word_id)
            for word, model in models.items():
                try:
                    # Assumes an HMM model
                    log_ls[word] = model.score(x, lengths)
                except Exception:
                    # Unable to process word with this model
                    log_ls[word] = float("-inf")
                else:
                    # Remove a trailing digit from the word, if it has one,
                    # before passing it to the language model
                    word_key = word[:-1] if word[-1].isdigit() else word
                    log_ls[word] = log_ls[word] + lm_scaling_factor * lm.log_p(word_key)
                if log_ls[word] > best_score:
                    best_score = log_ls[word]
                    best_guess = word
                    # print("New Best Guess for {}: {}".format(word_id, best_guess))
            probabilities.append(log_ls)
            guesses.append(best_guess)
    return probabilities, guesses
def recognize(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and value is Log Likelihood
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... }]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    n_gram = 3
    lsm = arpa.loadf("slm/devel-lm-M{}.sri.lm".format(n_gram))
    lm = lsm[0]
    # TODO implement the recognizer
    # go foreach sentence:
    #     for each word in sentence:
    #         probability = {}
    #         for each train-word (model):
    #             score guess-word with model
    #             find in slm: logP of word that has predecessor of current guess-word
    #             total_score = K * logP + logL
    #             probability[train-word] = total_score
    #
    #         # find max total_score
    #         guess_word = max(probability.items(), key=operator.itemgetter(1))[0]
    #         guesses.append(guess_word)
    #         probabilities.append(probability)
    import operator
    K = 50
    for test_X, test_Xlength in test_set.get_all_Xlengths().values():
        probability = {}
        for word, model in models.items():
            # calculate the score for each model (word) and update the 'probabilities' list
            try:
                logL = model.score(test_X, test_Xlength)
                if not guesses:
                    logP = lm.log_p('<s>')
                    score_start = K * logP + logL
                    logP = lm.log_p('</s>')
                    score_end = K * logP + logL
                    if score_end > score_start:
                        logP = lm.log_p('</s>')
                    else:
                        logP = lm.log_p('<s>')
                else:
                    w = get_adjecent_words(guesses, n_gram)
                    logP = lm.log_p(w)
                probability[word] = K * logP + logL
            except Exception:
                probability[word] = float("-inf")
        guess_word = max(probability.items(), key=operator.itemgetter(1))[0]
        guesses.append(guess_word)
        probabilities.append(probability)
    return probabilities, guesses
try:
    import arpa
except ImportError:
    pass
import math  # needed for math.inf below
import os
import pandas


def process_by_sq(models, X, length, probabilities, guesses):
    probability_by_word = {}
    for word, model in models.items():
        try:
            w_score = model.score(X, length)
            probability_by_word[word] = w_score
        except Exception:
            probability_by_word[word] = -float(math.inf)
    best_guess = max(probability_by_word.keys(),
                     key=(lambda w: probability_by_word[w]))
    guesses.append(best_guess)
    probabilities.append(probability_by_word)


if __name__ == '__main__':
    # df_probs = pandas.DataFrame(data={'col1': 4, 'col2': 4}, index=3)
    p = os.path.join('data', 'ukn.3.lm')
    models = arpa.loadf(p)
    print('d')
from asl_data import SinglesData
from asl_utils import show_errors
from string import digits
import pickle
import arpa
import itertools

LM_SCALE = 150
lm1 = arpa.loadf("ukn.1.lm")[0]
lm2 = arpa.loadf("ukn.2.lm")[0]
lm3 = arpa.loadf("ukn.3.lm")[0]
exceptional_words = {
    'SAY-P': 'SAY',
    'IX-P': 'IX',
}
with open('probabilities.pickle', 'rb') as file:
    gm = pickle.load(file)  # gesture model
with open('test_set.pickle', 'rb') as file:
    test_set = pickle.load(file)


def clean_word(word):
    # strip a trailing digit (e.g. 'GO1' -> 'GO'), then map exceptional spellings
    w = word[:-1] if word[-1].isdigit() else word
    return exceptional_words.get(w, w)
import arpa

models = arpa.loadf(
    "/Users/huangruizhe/Downloads/PycharmProjects/lm_adapt/data/c5-end.arpa")
lm = models[0]  # ARPA files may contain several models.

# probability p("5" | "4", "9")
print(lm.p("4 9 5"))
print(lm.log_p("4 9 5"))

# sentence score w/ sentence markers
print(lm.s("4 9 3 4 7 5 7"))
print(lm.log_s("4 9 3 4 7 5 7"))

# sentence score w/o sentence markers
# print(lm.s("4 9 3 4 7 5 7", sos=False, eos=False))
print(lm.log_s("4 9 3 4 7 5 7", sos=False, eos=False))

# entries of order n, e.g. (-0.4317983, ('3', '4'), 0.3461446)
# ref: python-arpa/arpa/models/simple.py
print([e for e in lm._entries(2)])

# vocabulary
print([v for v in lm.vocabulary()])
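# A hedged sketch (same lm as above): lookups of n-grams containing words the
# model cannot score raise KeyError, as other snippets in this collection
# catch, so guard ad-hoc queries:
try:
    print(lm.log_p("4 9 999"))  # "999" is an assumed out-of-vocabulary token
except KeyError:
    print("n-gram not scorable; fall back to a floor score")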
def test_manual_p():
    lm = arpa.loadf(TEST_ARPA)[0]
    assert round(lm.p('<s>'), 4) == 0
def test_manual_log_p_unk():
    lm = arpa.loadf(TEST_ARPA)[0]
    assert lm.log_p('UnladenSwallow') == -1.995635
def one_gram(df_prob):
    # TODO 1-gram lm
    lm = arpa.loadf(os.path.join("data", "ukn.1.lm"))
    print(lm[0].s("JOHN WRITE HOMEWORK"))
    print(lm[0].log_s("JOHN WRITE HOMEWORK"))
sm = sum(features_success_rates.values())
features_weights = [(k, v / sm) for (k, v) in features_success_rates.items()]
pickle.dump(
    {
        "features_probs": features_probs,
        "features_weights": features_weights
    },
    open("data/feature_models_data.pkl", "wb"))
ensemble_guess(features_probs, features_weights, test_set)
# [avg_sequences_probs([ground, norm, polar, delta])
#  for (ground, norm, polar, delta) in list(zip(*model_probs.values()))]
language_model = arpa.loadf("lm/ukn.3.lm")[0]


def ensemble_guess(features_probs=None, features_weights=None, test_set=None):
    if features_probs is None or features_weights is None:
        l = pickle.load(open("data/feature_models_data.pkl", "rb"))
        features_probs = l["features_probs"]
        features_weights = l["features_weights"]
        print("feature models data loaded")
    if test_set is None:
        test_set = asl.build_test(features_ground)
    features_weights = dict(features_weights)
    ensemble_probabilities = [
        merge_seq_dicts(ground, norm, polar, delta, custom, features_weights)
def get_viterbi_sentence(scores, alpha_start=1, alpha_transition=1):
    logger = logging.getLogger('recognizer')
    top = 5
    min_score = 1e6 * (-1)
    lm_models = arpa.loadf("ukn.3.lm")
    lm = lm_models[0]
    states_num, observations_num = scores.shape
    states = list(scores.index)
    observations = list(scores.columns.values)
    viterbi = pd.DataFrame(index=states, columns=observations)
    backpointers = pd.DataFrame(index=states, columns=observations)
    step_0 = 0
    # Initialization step 0
    for state in states:
        emission_score = scores.get_value(state, observations[step_0])
        sentence = ['<s>']
        sentence.append(state)
        transition_score = get_n_gram_score(sentence, n_gram_model=lm, n_gram=3)
        viterbi.set_value(state, observations[step_0],
                          emission_score + alpha_start * transition_score)
        backpointers.set_value(state, observations[step_0], 0)
    # Recursion
    for observation in range(1, len(observations)):
        logger.debug("Observation {}".format(observation))
        # Get the top states from the previous step
        top_states = list(scores.sort_values(by=observations[observation - 1],
                                             ascending=False)[0:top].index)
        for state in states:
            # Get the emission score for the current step
            emission_score = scores.get_value(state, observations[observation])
            # Track the best previous state; initialize once per state, not per
            # candidate (the original reset these inside the loop below)
            best_score = min_score
            best_state = None
            # Get the max of previous emission score + weighted transition score
            for top_state in top_states:
                sentence = []
                sentence.append(top_state)
                sentence.append(state)
                # Get the transition score
                transition_score = get_n_gram_score(sentence, n_gram_model=lm, n_gram=3)
                # Get the previous emission score
                emission_score_previous = scores.get_value(top_state,
                                                           observations[observation - 1])
                # middle_score = alpha_transition * transition_score + emission_score + emission_score_previous
                middle_score = alpha_transition * transition_score + emission_score_previous
                # print("Middle score {}".format(middle_score))
                # Update the max score
                if middle_score > best_score:
                    best_score = middle_score
                    best_state = top_state
            # print("Best score {}".format(best_score))
            state_score = best_score + emission_score
            viterbi.set_value(state, observations[observation], state_score)
            backpointers.set_value(state, observations[observation], best_state)
    # Termination
    # steps = len(observations)
    # last_state = list(viterbi.sort_values(by=observations[steps - 1], ascending=False)[0:1].index)
    # viterbi_sentence = []
    # viterbi_sentence.extend(last_state)
    # for observation in range(steps - 1, 0, -1):
    #     viterbi_sentence.append(backpointers.get_value(
    #         viterbi_sentence[steps - 1 - observation], observations[observation]))
    # return list(reversed(viterbi_sentence))
    # return viterbi
    viterbi_sentence = list(viterbi.idxmax(axis=0))
    return viterbi_sentence
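# Hedged sketch of the standard Viterbi backtrace that the commented-out
# "Termination" block above was aiming at; kept separate so the per-column
# argmax shortcut in get_viterbi_sentence stays intact. Assumes the viterbi
# and backpointers DataFrames produced above; the function name is illustrative.
def viterbi_backtrace(viterbi, backpointers, observations):
    # start from the best final state, then follow backpointers right to left
    path = [viterbi[observations[-1]].idxmax()]
    for obs in reversed(observations[1:]):
        path.append(backpointers.at[path[-1], obs])
    return list(reversed(path))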
def recognize_ngram(models: dict, test_set: SinglesData, probs, BIC_guesses):
    """ Recognize test word sequences from word models set

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and value is Log Likelihood
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... }]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    model = arpa.loadf("devel-lm-M3.sri.lm")
    lm = model[0]  # ARPA files may contain several models.
    test_sequences = list(test_set.get_all_Xlengths().values())
    word_keys = list(test_set.get_all_Xlengths().keys())
    i = -1
    for sentence in test_set.sentences_index.values():
        f = {}
        maxs = float("-inf")
        prob = []
        words = []
        sentenceLength = 0
        for word_index in sentence:
            i += 1
            word = test_set.wordlist[word_index]
            sentenceLength += 1
            try:
                f[word] = probs[word][i]
            except Exception:
                f[word] = float("-inf")
            # These are just the probabilities, unchanged from the BIC recognizer.
            prob.append(f[word])
        # Find the six most probable words and generate the possible permutations
        sixwords = sorted(f, key=f.get, reverse=True)[:6]
        for k in permutations(sixwords, r=sentenceLength):
            l = 0
            for j in range(len(k)):
                l += f[k[j]]
            try:
                # According to one student in the forum, 13 is the best hyperparameter:
                # https://discussions.udacity.com/t/slm-data-for-this-asl-dataset/230822/8?u=spiros
                sentenceLP = l + 13 * lm.log_s(" ".join(k))
                if sentenceLP > maxs:
                    sentence = " ".join(k)
                    maxs = sentenceLP
                    words = list(k)
            except Exception:
                pass
        if not words:
            # Fall back to BIC guesses
            words = BIC_guesses[len(guesses):len(guesses) + sentenceLength]
        probabilities.append(prob)
        guesses += words
    return (probabilities, guesses)
    subs][:args.dataset_limit_train]
model_info.max_dial_len = hcn_utils.get_feature_size_from_data(
    subsets_file_lists[subs][0], feat_sep=True)
if args.states:
    dialogue_settings = {}
    for cond in ['user_plan', 'system_plan']:
        dialogue_settings[cond] = dialogue_utils.dialogue(
            nlg_utils.NLG(args.states, cond), rig_db)
else:
    dialogue_settings = {}
if args.dact_lm:
    logger.debug('Loading DAct language model')
    dact_lm = arpa.loadf(args.dact_lm)[0]
    if DEBUG:
        input(dact_lm)
else:
    dact_lm = None
if model_info.max_dial_len == 0:
    logger.error('No dialogue found')
    sys.exit()
logger.info('Loading actions set')
try:
    with open(os.path.join(task_dir, 'orca_action_set.txt'), 'r') as lfp:
        available_actions = pickle.load(lfp)
except Exception:
    with open(os.path.join(task_dir, 'orca_action_set.txt'), 'r') as lfp:
from utils.word_to_characters import lexicon_dic
import arpa
import re
# from pynlpl.lm import lm

with open('mydata/data/local/lm/phones.txt', 'r') as f:
    alphabet = []
    for char in f:
        alphabet.append(char[0])

NEG_INF = -float("inf")
lexicon_dict = lexicon_dic()
print("dict length : {}".format(len(lexicon_dict)))

# load the language models
lm_models = arpa.loadf(
    "/home/emekonnen/mydata/E2E-ASR-pytorch/mydata/data/local/lm/3-gram.pruned.3e-7.arpa")
# lm_models = lm.ARPALanguageModel("/home/emekonnen/mydata/E2E-ASR-pytorch/mydata/data/local/lm/3-gram.arpa")
lm = lm_models[0]


def compute_probs(trigrams):
    total_probs = 0
    for tri in trigrams:
        try:
            total_probs += lm.log_p(" ".join(tri))
        except KeyError:
            pass
    return total_probs
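# Hedged usage sketch for compute_probs; the padding helper and the sample
# sentence are illustrative, not part of the original module.
def sentence_trigrams(words):
    # pad with sentence markers so edge words still get trigram context
    padded = ["<s>"] + list(words) + ["</s>"]
    return [tuple(padded[i:i + 3]) for i in range(len(padded) - 2)]

# print(compute_probs(sentence_trigrams("the cat sat".split())))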