# Example no. 1
    def __init__(self,
                 word_lm_filename,
                 char_lm_filename,
                 vocab_filename,
                 char_filename,
                 parent=None):
        """Build word- and character-level predictors and copy the
        prediction settings either from *parent* or from kconfig defaults."""
        self.lm_filename = word_lm_filename
        self.vocab_filename = vocab_filename

        self.word_predictor = WordPredictor(word_lm_filename, vocab_filename)
        self.char_predictor = CharacterPredictor(char_lm_filename,
                                                 char_filename)

        self.parent = parent
        if parent is None:
            # No owner: fall back to the module-wide configuration.
            # num_predictions of 0 would mean "return all possible words".
            self.num_predictions = kconfig.N_pred
            self.prob_thres = 0
            self.num_words_total = 26 * self.num_predictions
        else:
            # Inherit the owner's prediction settings.
            self.num_predictions = parent.N_pred
            self.prob_thres = parent.prob_thres
            self.num_words_total = parent.num_words_total

        self.min_log_prob = -float("inf")
        # '' selects the default vocabulary.
        self.vocab_id = ''
def main():
    """Demo: ask the word predictor for completions of a prefix in context."""
    # Name and path of a language model and the vocabulary.
    lm_filename = '../resources/lm_word_medium.kenlm'
    vocab_filename = '../resources/vocab_100k'

    # Create an instance of the WordPredictor class.
    word_predictor = WordPredictor(lm_filename, vocab_filename)

    prefix = 'a'
    context = 'the united states of'

    # How many predictions to return for each character; the default of
    # 0 would return all possible words.
    num_predictions = 3
    min_log_prob = -float('inf')

    # '' selects the default vocabulary.
    vocab_id = ''

    word_list = word_predictor.get_words_with_context(
        prefix, context, vocab_id, num_predictions, min_log_prob)

    # Print every suggested word.
    word_predictor.print_suggestions(word_list)
# Example no. 3
def main():
    """Demo: query the default vocabulary, then add and query a second one."""
    # Name and path of a language model and the vocabulary.
    lm_filename = '../resources/lm_word_medium.kenlm'
    vocab_filename = '../resources/vocab_100k'

    # ******The token file is in '../resources/tokens.txt'
    # If you are running this script, then change the token file path in
    # WordPredictor __init__.
    word_predictor = WordPredictor(lm_filename, vocab_filename)

    # Query against the default ('') vocabulary.
    w, p = word_predictor.get_most_probable_word(
        prefix='w', context='hello', vocab_id='',
        min_log_prob=-float('inf'))
    print('Word: ' + w + ', log probability: ' + str(p))

    # Register a second, smaller vocabulary under its own id.
    new_vocab_filename = '../resources/vocab_20k'
    new_vocab_id = 'small'
    word_predictor.add_vocab(new_vocab_id, new_vocab_filename)

    # Same query, now restricted to the new vocabulary.
    w, p = word_predictor.get_most_probable_word(
        prefix='w', context='hello', vocab_id=new_vocab_id,
        min_log_prob=-float('inf'))
    print('Word: ' + w + ', log probability: ' + str(p))

    # Multi-word suggestions from the new vocabulary.
    words = word_predictor.get_words_with_context(
        'w', 'hello', new_vocab_id, 3, -float('inf'))
    word_predictor.print_suggestions(words)
# Example no. 4
    def __init__(self, lm_filename, vocab_filename):
        """Wrap a WordPredictor and remember the default query settings."""
        self.lm_filename = lm_filename
        self.vocab_filename = vocab_filename
        self.word_predictor = WordPredictor(lm_filename, vocab_filename)

        # Three predictions per character; 0 would mean "all possible words".
        self.num_predictions = 3
        self.min_log_prob = -float("inf")

        # '' selects the default vocabulary.
        self.vocab_id = ''
# Example no. 5
class LanguageModel():
    """Thin wrapper around WordPredictor that returns ranked completions."""

    def __init__(self, lm_filename, vocab_filename):
        self.lm_filename = lm_filename
        self.vocab_filename = vocab_filename

        self.word_predictor = WordPredictor(lm_filename, vocab_filename)

        # How many predictions to request for each character.
        # The predictor's default of 0 would return all possible words.
        self.num_predictions = 3
        self.min_log_prob = -float("inf")

        # '' selects the default vocabulary.
        self.vocab_id = ''

    def get_words(self, context, prefix, num_words):
        """Return up to *num_words* completions of *prefix* given *context*,
        ordered by decreasing log probability.

        Side effect: remembers the last context/prefix on the instance.
        """
        self.context = context
        self.prefix = prefix

        # One sub-list of (word, log_prob) pairs per candidate character.
        lm_results = self.word_predictor.get_words_with_context(
            prefix, context, self.vocab_id, self.num_predictions,
            self.min_log_prob)

        # Flatten all sub-lists, sort best-first, keep the top words only.
        # (Removed two locals, word_preds/word_probs, that were never used.)
        candidates = [pair for sublist in lm_results for pair in sublist]
        candidates.sort(key=lambda pair: -pair[1])
        return [pair[0] for pair in candidates[:num_words]]
def main():
    """Demo: most-probable-word queries for two prefix/context pairs."""
    # We need to provide the name and path of a language model
    # and the vocabulary.
    lm_filename = '../resources/lm_word_medium.kenlm'
    vocab_filename = '../resources/vocab_100k'

    # The WordPredictor class creates a Trie data structure for the given
    # vocabulary with a vocab_id (default ''). Adding another vocabulary
    # goes through add_vocab, illustrated in add_vocab_query.py.
    word_predictor = WordPredictor(lm_filename, vocab_filename)

    def query_and_print(prefix, context):
        # Run a single most-probable-word query and report the result.
        most_prob_word, log_prob = word_predictor.get_most_probable_word(
            prefix, context, vocab_id='', min_log_prob=-float('inf'))
        print('Context: ' + context)
        print('Prefix: ' + prefix)
        print('Most probable word: "' + most_prob_word +
              '" with log probability: ' + str(log_prob))

    # Suppose we have a prefix 'a' and a context 'the united states of';
    # guess the most probable word from that information.
    query_and_print('a', 'the united states of')

    # Another example
    query_and_print('t', 'hello')
# Example no. 7
 def setUp(self):
     """Create the predictor and a raw kenlm model over shared resources."""
     lm_path = '../resources/lm_word_medium.kenlm'
     vocab_path = '../resources/vocab_100k'
     self.wordPredictor = WordPredictor(lm_path, vocab_path)
     self.language_model = kenlm.LanguageModel(lm_path)
     self.vocab_filename = vocab_path
     # '' is the default vocabulary id.
     self.vocab_id = ''
# Example no. 8
class TestWordPredictor(unittest.TestCase):
    """Unit tests for WordPredictor."""

    def setUp(self):
        self.wordPredictor = WordPredictor('../resources/lm_word_medium.kenlm', '../resources/vocab_100k')
        self.language_model = kenlm.LanguageModel('../resources/lm_word_medium.kenlm')
        self.vocab_filename = '../resources/vocab_100k'
        self.vocab_id = ''

    def test_create_new_trie(self):
        wp = self.wordPredictor
        self.assertIsInstance(wp.create_new_trie(self.vocab_filename), VocabTrie, "OK")

    def test_update_char_list_from_string(self):
        # Renamed locals: the originals shadowed the builtins list/str.
        char_list = ['a']
        text = "bc"
        expected = ['a', 'b', 'c']
        self.assertEqual(self.wordPredictor.update_char_list_from_string(char_list, text), expected, "OK")

    def test_create_char_list_from_vocab(self):
        char_set = self.wordPredictor.create_char_list_from_vocab(self.vocab_id, self.vocab_filename)
        #id, char_set = test_res.popitem()
        #self.assertIsInstance(type(id), type(str), "Return type is not same")
        # Was assertIsInstance(type(char_set), type(set), ...), which is
        # vacuously true for any value; check the value itself instead.
        # NOTE(review): the commented popitem() suggests this may return a
        # dict — confirm the actual return type before relying on this.
        self.assertIsInstance(char_set, set, "Return type is not same")

    def test_add_vocab(self, vocab_id='vocab_id'):
        new_trie = self.wordPredictor.create_new_trie(self.vocab_filename)
        # Replaces the redundant assertTrue(!= None)/assertFalse(== None) pair.
        self.assertIsNotNone(new_trie)

    def test_get_vocab_trie(self):
        flag, vocabTr = self.wordPredictor.get_vocab_trie(self.vocab_id)
        self.assertIsInstance(vocabTr, VocabTrie, 'Not OK')
        # Was a vacuous type(...)-vs-type(bool) check; assert the flag itself.
        self.assertIsInstance(flag, bool, "Not OK")

    """
    def test_get_punc_token(self):
        self.assertEqual(self.wordPredictor.get_punc_token(','), ',comma', 'Punctuation and token are not equal')
    """

    def test_get_context_state(self):
        sIn, sOut = self.wordPredictor.get_context_state('<s>', self.language_model, self.vocab_id)
        self.assertIsInstance(sIn, kenlm.State, 'stateIn is not an instance of kenlm.State')
        self.assertIsInstance(sOut, kenlm.State, 'stateOut is not an instance of kenlm.State')

    def test_find_most_probable_word(self):
        pass

    def test_get_words(self):
        pass

    def test__get_words(self):
        suggestion_list = self.wordPredictor._get_words('a', 'the united states of', self.vocab_id, 3, -float('inf'))
        # Was isinstance(type(...), type(str)), which is always true; assert
        # the result is a list (basestring is gone in python 3).
        self.assertIsInstance(suggestion_list, list, "Not a list")

    def test_print_suggestions(self):
        pass

    def test_get_most_likely_word(self):
        word, log_prob = self.wordPredictor.get_most_probable_word('a', 'the united states of', self.vocab_id)
        self.assertEqual(word, 'america', "Not equal")
        self.assertTrue(isinstance(log_prob, numbers.Number), "False")
# Example no. 9
class LanguageModel():
    """Facade combining a word-level and a character-level predictor.

    Produces per-key word suggestions (get_words) and log-normalized
    next-character probabilities (get_char_probs) for a keyboard UI.
    """

    def __init__(self,
                 word_lm_filename,
                 char_lm_filename,
                 vocab_filename,
                 char_filename,
                 parent=None):
        """Load both predictors; take prediction settings from *parent*
        when given, otherwise from the kconfig defaults."""
        self.lm_filename = word_lm_filename
        self.vocab_filename = vocab_filename

        self.word_predictor = WordPredictor(word_lm_filename, vocab_filename)
        self.char_predictor = CharacterPredictor(char_lm_filename,
                                                 char_filename)

        # Define how many predictions you want for each character
        # By default it is set to 0 and will return all possible
        # words
        if parent is not None:
            # Mirror the owning object's prediction settings.
            self.parent = parent
            self.num_predictions = self.parent.N_pred
            self.prob_thres = self.parent.prob_thres
            self.num_words_total = self.parent.num_words_total
        else:
            self.parent = None
            self.num_predictions = kconfig.N_pred
            self.prob_thres = 0
            self.num_words_total = 26 * self.num_predictions
        self.min_log_prob = -float("inf")

        # The default vocab_id is ''
        self.vocab_id = ''

    def get_words(self,
                  context,
                  prefix,
                  keys_li,
                  num_words_total=kconfig.num_words_total):
        """Return (word_preds, word_probs, key_probs) for the given keys.

        For each key in keys_li: a fixed list of 3 suggested words (''
        padding) and 3 matching log probabilities (-inf padding), both
        log-normalized; key_probs comes from get_char_probs.
        Side effect: stores the last context/prefix on the instance.
        """
        if self.parent is not None:
            # Settings may have changed on the parent since __init__;
            # refresh them on every call.
            self.num_predictions = self.parent.N_pred
            self.prob_thres = self.parent.prob_thres
            self.num_words_total = self.parent.num_words_total

        self.context = context
        self.prefix = prefix
        # print("prefix: ", prefix, ", context: ", context)

        word_preds = []
        word_probs = []

        # One sub-list of (word, log_prob) pairs per candidate character.
        lm_results = self.word_predictor.get_words_with_context(
            prefix, context, self.vocab_id, self.num_predictions,
            self.min_log_prob)
        flattened_results = [
            freq for sublist in lm_results for freq in sublist
        ]
        flattened_results.sort(key=lambda x: -x[1])
        # Keep only the globally best num_words_total words (words only).
        flattened_results = [
            word_pair[0] for word_pair in flattened_results[:num_words_total]
        ]

        # print(flattened_results)

        # Re-bucket the surviving words, keyed by the character that
        # follows the prefix in each bucket's first word.
        word_dict = {}
        for word_list in lm_results:
            if len(word_list) > 0:
                cur_word_list = []
                for word_pair in word_list:
                    if word_pair[0] in flattened_results:
                        cur_word_list.append(word_pair)
                word_dict[word_list[0][0][len(prefix)]] = cur_word_list

        for key in keys_li:
            # Fixed-width slots: at most 3 suggestions per key.
            key_word_preds = ["", "", ""]
            key_word_probs = [-float("inf"), -float("inf"), -float("inf")]
            if key in word_dict:
                index = 0
                # NOTE(review): index advances even when a word is skipped by
                # the threshold, and a bucket with more than 3 entries would
                # raise IndexError on the 3-slot lists — this assumes each
                # bucket holds at most 3 words; confirm upstream guarantees.
                for word_tuple in word_dict[key]:
                    if word_tuple[1] >= self.min_log_prob:
                        key_word_preds[index] = word_tuple[0] + " "
                        key_word_probs[index] = word_tuple[1]
                    index += 1
            word_preds += [key_word_preds]
            word_probs += [key_word_probs]

        key_probs = self.get_char_probs(context, prefix, keys_li)

        word_probs = np.array(word_probs)

        # Log-normalize both distributions (lognormalize_factor is defined
        # elsewhere in the project).
        key_probs = key_probs - lognormalize_factor(key_probs)
        word_probs = word_probs - lognormalize_factor(word_probs)

        # nth_min_log_prob = np.partition(word_probs.flatten(), num_words_total)[num_words_total]
        #
        # word_probs = np.where(word_probs >= nth_min_log_prob, word_probs, -float("inf"))
        # Blank out any slot whose probability is -inf.
        word_preds = np.where(word_probs != -float("inf"), word_preds, "")
        # word_preds = np.where(word_probs >= nth_min_log_prob, word_preds, "")

        return word_preds.tolist(), word_probs.tolist(), key_probs

    def get_char_probs(self, context, prefix, keys_li):
        """Next-character log probabilities for each key in keys_li.

        The predictor's " " entry is re-keyed to kconfig.space_char; each
        probability is floored at log(1/30) and missing keys get -inf.
        """
        key_results = dict(self.char_predictor.get_characters(context +
                                                              prefix))
        key_results[kconfig.space_char] = key_results[" "]
        del key_results[" "]

        key_probs = np.array([
            max(key_results[key], np.log(1 / 30))
            if key in key_results else -float("inf") for key in keys_li
        ])
        return key_probs