def train_n_gram(data, ids, n):
    """ Returns an n-gram model.

        Args:
            data    - the original data frame containing all the data
            ids     - the IDs of the dialogues the n-gram model will train on
            n       - the order of the n-gram model

        Returns:
            An NLTK MLE language model fit on all the n-grams (up to order n) in the training set.
    """
    # Extracts all the dialogue act classes.
    unique_dialogue_acts = sorted(list(set(data['dialogue_act'])))

    # Make n-grams of all the dialogues with the given IDs.
    training_dialogues = []
    for ID in ids:
        training_dialogue = list(
            data[data['dialogue_id'] == ID]['dialogue_act'])
        training_dialogues.append(training_dialogue)

    # Get every n-gram of order 1 up to n from the training dialogues.
    n_grams = []
    for i in range(n):
        n_grams = n_grams + [
            list(ngrams(dialogue, n - i)) for dialogue in training_dialogues
        ]

    # Trains the n-gram model on the dialogue n-grams and the unique dialogue acts.
    lm = MLE(n)
    lm.fit(n_grams, unique_dialogue_acts)
    return lm
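A minimal usage sketch (an assumption, not part of the original project), using a small illustrative pandas DataFrame with the dialogue_id and dialogue_act columns the function reads:

import pandas as pd

# Two short dialogues annotated with dialogue acts (illustrative data).
data = pd.DataFrame({
    'dialogue_id':  ['d1', 'd1', 'd1', 'd2', 'd2'],
    'dialogue_act': ['greet', 'inform', 'bye', 'greet', 'bye'],
})

lm = train_n_gram(data, ids=['d1'], n=2)
print(lm.score('inform', ['greet']))  # P(inform | greet) under the MLE model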
Example #2
File: nextWord.py  Project: arrgee23/ml
def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:  # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])

    except Exception:  # Fall back to a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Prepare padded everygrams up to order n.
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train an MLE model of order n (5, as set above).

    model.fit(train_data, padded_sents)
    #print(model.vocab)

    return model
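Once fitted, the model can be queried like any nltk.lm model; a small sketch (exact numbers depend on the corpora available locally):

model = makeModel()
print(len(model.vocab))                    # vocabulary size
print(model.score('the'))                  # unigram relative frequency of 'the'
print(model.generate(10, random_seed=42))  # sample 10 tokens from the model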
Example #3
class FormatJudge:
    """Detects format errors on a tabular data set."""
    def __init__(self, generator: PatternGenerator, n: int = 3, dim: int = 1):
        self.generator = generator
        self.n = n
        self.dim = dim

    def __call__(self, o: Any) -> list:
        return self.judge(o)

    def fit(self, values: List[Any]):
        patterns = [self.generator(v) for v in values]
        padded_patterns = [pad_both_ends(p, n=self.n) for p in patterns]
        ngrams_ = [ngrams(pp, n=self.n) for pp in padded_patterns]

        self.vocab = list(flatten(
            pad_both_ends(p, n=self.n) for p in patterns))
        self.model = MLE(self.n)
        self.model.fit(ngrams_, self.vocab)

    def judge(self, o: Any) -> list:
        scores = []
        p = self.generator(o)
        p = list(pad_both_ends(p, n=self.n))
        for i, v in enumerate(p):
            if i < self.n - 1:
                continue
            letters = []
            for j in range(i - (self.n - 1), i):
                letters.append(p[j])
            scores.append(self.model.score(v, letters))
        return heapq.nsmallest(self.dim, scores)
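PatternGenerator is not defined in this snippet; a hypothetical character-class generator and a usage sketch (both assumptions, not the original project's code) could look like:

def char_class_pattern(value):
    # Map letters to 'A' and digits to '9'; keep other characters as-is.
    return ['A' if c.isalpha() else '9' if c.isdigit() else c for c in str(value)]

judge = FormatJudge(char_class_pattern, n=3, dim=1)
judge.fit(['2021-01-01', '2021-02-15', '2021-03-31'])
print(judge('2021/04/05'))  # the lowest n-gram scores flag unlikely character patterns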
Example #4
class LM_nGram(LM_BaseModel):
    def __init__(self, args, generator, config=None):
        LM_BaseModel.__init__(self, args, generator, config)
        self.ngram = MLE(2)

    def fit(self, steps):
        tokens = [step.tree.list() for step in steps]
        train_data = [
            nltk.bigrams(t,
                         pad_right=True,
                         pad_left=True,
                         left_pad_symbol="<s>",
                         right_pad_symbol="</s>") for t in tokens
        ]
        words = [word for sent in tokens for word in sent]
        words.extend(["<s>", "</s>"])
        padded_vocab = Vocabulary(words)
        self.ngram.fit(train_data, padded_vocab)

    def forward(self, steps):
        probs = []
        for step in steps:
            words = step.tree.list()
            ngrams = nltk.bigrams(words,
                                  pad_right=True,
                                  pad_left=True,
                                  left_pad_symbol="<s>",
                                  right_pad_symbol="</s>")
            prob = [
                self.ngram.score(ngram[-1], ngram[:-1]) for ngram in ngrams
            ]
            probs.append(sum(prob) / len(prob))
            if len(probs) % 100 == 0:
                print(len(probs), sum(probs) / len(probs))
        return probs
Example #5
    def fit(self, sequences: List[List]):
        train, vocab = padded_everygram_pipeline(self.config.GRAM_SIZE, sequences)
        model = MLE(self.config.GRAM_SIZE)
        model.fit(train, vocab)
        self.model = model
        if self.config.SAVE_PATH:
            self.save_model(self.config.SAVE_PATH)
Example #6
File: ex_ngram.py  Project: mfkiwl/hdlp
def train_ngram_model(src_dict: dict, ngram_order=N_GRAM_ORDER):
    print(f"Training {ngram_order}-gram model on train dataset...")
    train_data, padded_sents = padded_everygram_pipeline(
        ngram_order, src_dict["train"])
    model = MLE(ngram_order)
    model.fit(train_data, padded_sents)
    return model
Example #8
def getEveryModel(n: int, text: List, ngrams):
    """ get mixed-n model """
    lm = MLE(n)

    train, vocab = padded_everygram_pipeline(n, text)

    lm.fit(train, vocab)

    return lm
Example #9
def create_LanguageModel(Docs, model_type, ngram):
    text = " ".join(Docs)
    text = text.replace("\\n"," ")
    tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)]
    train_data, padded_sents = padded_everygram_pipeline(ngram, tokenized_text)
    model = MLE(ngram)
    if model_type != "MLE":
        model = KneserNeyInterpolated(ngram) 
    model.fit(train_data, padded_sents)
    return model
Example #10
    def train_model(self, corpus, n):
        """
        Train a count-based language model with MLE.
        :param corpus: [['a','b'],['a','b','c']]
        """
        train_data, padded_sents = padded_everygram_pipeline(n, corpus)
        # train model
        t1 = time.time()
        self.model = MLE(n)
        self.model.fit(train_data, padded_sents)
        print('training LM took {} seconds'.format(time.time() - t1))
Example #11
class UrlNGram:
    def __init__(self, urls, n=2):
        self.ngram = MLE(n)
        train_data, padded_sents = padded_everygram_pipeline(n, urls)
        self.ngram.fit(train_data, padded_sents)

    def get_entropy(self, url):
        return self.ngram.entropy(list(url))

    def get_perplexity(self, url):
        return self.ngram.perplexity(list(url))
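A usage sketch with illustrative URLs; get_entropy and get_perplexity operate on the characters of the URL (list(url)):

urls = [list(u) for u in ['example.com/home', 'example.com/about', 'example.com/contact']]
lm = UrlNGram(urls, n=2)

print(lm.get_entropy('example.com/help'))  # characters resembling the training URLs score low
print(lm.get_entropy('zq9!!__##'))         # unseen characters push entropy toward infinity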
Example #12
def create_and_fit_model(corpus):
    # Receives a corpus tokenized by sentence and by word.
    train_data, padded_sents = padded_everygram_pipeline(NGRAM, corpus)

    # Create the model
    model = MLE(NGRAM)

    # Fit it to the data
    model.fit(train_data, padded_sents)

    return model
Example #13
def ngram_language_model(corpus, n):
    # form of corpus: [[word, word, word], [word, word, ..., word]]
    # ngram language model
    train_data, padded_sents = padded_everygram_pipeline(n, corpus)
    print(padded_sents)
    model = MLE(n)
    model.fit(train_data, padded_sents)
    print(len(model.vocab))

    with open('ngram_model.pkl', 'wb') as fout:
        pickle.dump(model, fout)
    return model
Example #14
    def get_MLELM(self, tokens, n_gram=2) -> MLE:
        '''
            Trains an MLE language model and caches it on the class so it can be reused.
        '''
        # Lists are not hashable, so cache under a tuple key.
        key = tuple(tokens)
        if key not in self.lms:
            paddedLine = [list(pad_both_ends(tokens, n=n_gram))]
            train, vocab = padded_everygram_pipeline(n_gram, paddedLine)
            lm = MLE(n_gram)
            lm.fit(train, vocab)
            self.lms[key] = lm

        return self.lms[key]
Example #15
def treinando_modelo_MLE(lista_de_textos):
    # Join all sentences into a single string.
    todas_as_questoes = " ".join(lista_de_textos)
    # Split the text into words on whitespace.
    todas_as_palavras = WhitespaceTokenizer().tokenize(todas_as_questoes)
    # Add padding symbols to each word and build the vocabulary (here, the letters of each word).
    treino_bigram, vocab = padded_everygram_pipeline(2, todas_as_palavras)
    # Create an MLE model for bigrams.
    modelo = MLE(2)
    # Train the model.
    modelo.fit(treino_bigram, vocab)

    return modelo
Example #16
def main() -> None:
    """Main entrypoint."""
    # Create an argument parser for parsing CLI arguments
    parser = ArgumentParser(
        description=
        "A tool to train an AI to predict the probability of a word in a sentence"
    )

    # Add parameters for the server connection
    parser.add_argument("-i",
                        "--input",
                        required=True,
                        type=str,
                        help="The input file to read from")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        type=str,
                        help="The output file to serialize the model to")
    parser.add_argument("-l",
                        "--language",
                        required=True,
                        type=str,
                        help="The name of the language to use")

    # Parse the arguments
    options = parser.parse_args()

    # Read and extract tokens
    tokens = []
    with open(options.input, "r") as file:
        raw_text = file.read()
        # Tokenize the text.
        tokens = [
            list(map(str.lower, word_tokenize(sentence)))
            for sentence in sent_tokenize(raw_text, language=options.language)
        ]

    # n-gram size (trigram)
    n = 3

    # Prepare train data
    train_data, padded_sentences = padded_everygram_pipeline(n, tokens)

    # Train a Maximum Likelihood Estimation (MLE) model
    model = MLE(n)
    model.fit(train_data, padded_sentences)

    with open(options.output, "wb") as file:
        pickle.dump(model, file)
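A sketch of loading the serialized model back and querying it (the file name here is illustrative):

import pickle

with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

print(model.score('the'))                # unigram probability of 'the'
print(model.generate(5, random_seed=1))  # sample 5 tokens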
Example #17
def train_model(text, n_gram=3):
    """
    Train a count-based (statistical) language model with MLE.
    :param text: [['a','b'],['a','b','c']]
    """
    print('train size={}'.format(len(text)))
    train_data, padded_sents = padded_everygram_pipeline(n_gram, text)

    # train model
    model = MLE(n_gram)  # Train an MLE model of order n_gram (3 by default).
    model.fit(train_data, padded_sents)
    print('vocabulary size={}'.format(len(model.vocab)))

    return model
Example #18
class ngram_language_model:
    def __init__(self):
        self.model = None
        self.valid_text = None

    def build_and_train_model(self):
        text = brown.sents(categories=[
            'adventure', 'belles_lettres', 'editorial', 'fiction',
            'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery',
            'news', 'religion', 'reviews', 'romance', 'science_fiction'
        ])
        valid_text = []
        for sentence in text:
            words = []
            for word in sentence:
                words.extend(nltk.word_tokenize(word))
            valid_text.append(words)

        self.valid_text = valid_text

        n = 3  # length of largest everygram

        train_data, padded_sents = padded_everygram_pipeline(n, valid_text)

        self.model = MLE(n)
        self.model.fit(train_data, padded_sents)
        return

    def make_predictions(self, msg, number_of_predictions=5):
        """
        makes prediction for the next possible words using the available words
        """
        sentence = []
        for x in msg.strip().split():
            sentence.extend(nltk.word_tokenize(x))
        alpha = 0.1
        beta = 0.3
        gamma = 0.6
        predictions = []
        prediction_dict = {}
        for word in self.model.vocab:
            alpha_prob = alpha * self.model.score(word)
            beta_prob = beta * self.model.score(word, sentence[-1:])
            gamma_prob = gamma * self.model.score(word, sentence[-2:])
            prob = alpha_prob + beta_prob + gamma_prob
            predictions.append((word, prob))
        predictions.sort(key=lambda x: x[1], reverse=True)
        for word, prob in predictions[:number_of_predictions]:
            prediction_dict[word] = prob
        return prediction_dict
Example #19
def trainNGramModelForWords():
    newsListOne = []
    text = ''
    with open("combined.txt", 'r', encoding='utf-8', errors='ignore') as outfile:
        newslist = json.load(outfile)
    for news in newslist:
        newsListOne.extend(news)
    text = ' '.join([str(elem) for elem in newsListOne])
    tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                      for sent in sent_tokenize(text)]
    n = 3
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train a trigram maximum likelihood estimation model.
    model.fit(train_data, padded_sents)
    return model
Example #20
    def create_and_fit_model(self, corpus):
        # Receives a corpus tokenized by sentence and by word.
        print('first step')
        train_data, padded_sents = padded_everygram_pipeline(
            self.ngram, corpus)

        print('creating model')
        # Create the model
        model = MLE(self.ngram)

        print('fitting data')
        # Fit it to the data
        model.fit(train_data, padded_sents)
        print('fitted')

        return model
Example #21
def train_ngram(corpus, n, words=True):
    """
    Train ngram (POS) language model from a corpus.
    """

    # Read the corpus file
    if corpus[-4:] == '.txt':
        with open(os.path.join('data', 'corpora', corpus),
                  encoding='utf8') as f:
            text = f.read()

    elif corpus[-4:] == '.pkl':
        with open(os.path.join('data', 'corpora', corpus), 'rb') as f:
            text = pickle.load(f)

    # Lowercase if the model will be trained on words (to be skipped for
    # POS tags)
    if words:
        if type(text) is list:
            for sent_idx, sent in enumerate(text):
                for word_idx, word in enumerate(sent):
                    text[sent_idx][word_idx] = word.lower()
        elif type(text) is str:
            text = text.lower()

    # Tokenize
    if type(text) is str:
        text = sent_tokenize(text)
        if words:
            text = [word_tokenize(sent) for sent in text]
        else:
            text = [sent.split() for sent in text]

    # Train ngram language model
    # Do not apply any n-gram smoothing techniques to the model.
    train_data, vocab = padded_everygram_pipeline(n, text)
    lm = MLE(n)
    lm.fit(train_data, vocab)

    # Save the model
    with open(
            os.path.join('data', 'models',
                         corpus[:-4] + '_' + str(n) + 'gram' + '.pkl'),
            'wb') as f:
        pickle.dump(lm, f, 4)

    return lm
Example #22
def create_LanguageModel(docs, model_type="MLE", ngram=3):
    global _ngram
    _ngram = ngram
    tokenized_text = []

    new_docs = preprocess(docs)

    for d in new_docs:
        text = sent_tokenize(d, language="turkish")
        for sent in text:
            temp = []
            for i in word_tokenize(sent, language="turkish"):
                temp.append(i.lower())
            tokenized_text.append(temp)

    training_ngrams, vocab = padded_everygram_pipeline(ngram, tokenized_text)

    if model_type == "MLE":
        model = MLE(ngram)  #, vocabulary=Vocabulary(vocab))
        model.fit(training_ngrams, vocab)
        # print(model.vocab)
        return model
    elif model_type == "KneserNeyInterpolated":
        model = KneserNeyInterpolated(ngram)
        model.fit(training_ngrams, vocab)  # padded_sents)
        # print(model.vocab)
        return model
    else:
        print("Unkown Model Type")
        return 0
Example #23
def _train(n: int, texts: List):
    """ texts: a list of already-tokenized texts. """
    lm = MLE(n)

    train, vocab = [], set([])
    for t in texts:
        g = ngrams(t,
                   n,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>')
        g = list(g)
        vocab = vocab | set(t)
        train.append(g)

    lm.fit(train, vocabulary_text=list(vocab))
    return lm
Example #24
def generate_sentence(model: MLE, length: int, seed=None):
    # A random default argument would be drawn only once at import time; pick a fresh seed per call.
    if seed is None:
        seed = random.randint(0, 10**10)
    content = []
    for token in model.generate(length, random_seed=seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)
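detokenize is not defined in this snippet; a minimal stand-in based on NLTK's Treebank detokenizer (an assumption about the original helper):

from nltk.tokenize.treebank import TreebankWordDetokenizer

def detokenize(tokens):
    # Join tokens back into a plain sentence string.
    return TreebankWordDetokenizer().detokenize(tokens)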
Example #25
    def create_model(self, corpus_name):
        print('reading corpus')
        reader = PlaintextCorpusReader(CORPUS_DIR, corpus_name)

        print('padded everygram pipeline')
        train_data, vocab = padded_everygram_pipeline(self.ngram,
                                                      (reader.sents()))

        print('creating model')
        # Create the model
        model = MLE(self.ngram)

        print('fitting data')
        # Fit it to the data
        model.fit(train_data, vocab)
        print('fitted')

        return model
Example #26
def get_tweet(model: MLE):
    tweet = []
    for word in model.generate(30, text_seed=["<s>"]):
        if word == "<s>":
            continue
        elif word == "</s>":
            break
        tweet.append(word)
    return tweet
Example #27
def generateNGram(tweets, lexicon):
    n = 4
    train_data, padded_sents = padded_everygram_pipeline(n, lexicon)
    model = MLE(n)
    model.fit(train_data, padded_sents)
    # Map each lexicon entry to its count so the membership check below is meaningful.
    ngramdict = {}
    for i in lexicon:
        ngramdict[i] = model.counts[i]
    featureset = []
    lemmatizer = WordNetLemmatizer()
    for t in tweets:
        words = word_tokenize(t)
        words = [lemmatizer.lemmatize(i) for i in words]
        features = np.zeros(len(lexicon))
        for w in words:
            if w in ngramdict:
                features[lexicon.index(w)] += 1
        featureset.append(list(features))
    return featureset
Example #28
def getModel(n: int, text: List):
    """ 在这里训练模型 """
    lm = MLE(n)

    # get train, vocab
    train, vocab = [], set([])
    for t in text:
        g = ngrams(t,
                   n,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>')
        g = list(g)
        vocab = vocab | set(t)
        train.append(g)

    lm.fit(train, vocabulary_text=list(vocab))

    return lm
Example #29
def build_word_model(corpus, order=3):
    """
    Creates character-level n-gram word model.
    """

    words = ' '.join(corpus).split()  # Flatten corpus into words
    words = [w for w in words if re.match(r'[a-z]', w)]  # Use clean words only

    vocab = set()
    data = []

    for word in words:
        w = unigrams(word)
        vocab.update(w)
        data.append(ngramize(w, order))

    model = MLE(order)
    model.fit(data, vocabulary_text=vocab)

    return model
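unigrams and ngramize are not shown here; plausible character-level stand-ins (assumptions, not the original project's helpers):

from nltk.lm.preprocessing import padded_everygrams

def unigrams(word):
    # Split a word into its characters.
    return list(word)

def ngramize(chars, order):
    # Pad the character sequence and emit every n-gram up to the given order.
    return list(padded_everygrams(order, chars))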
Example #30
class MleTrigramTests(unittest.TestCase):
    """MLE trigram model tests"""

    score_tests = [
        # count(d | b, c) = 1
        # count(b, c) = 1
        ("d", ("b", "c"), 1),
        # count(d | c) = 1
        # count(c) = 1
        ("d", ["c"], 1),
        # total number of tokens is 18, of which "a" occurred 2 times
        ("a", None, 2.0 / 18),
        # in vocabulary but unseen
        ("z", None, 0),
        # out of vocabulary should use "UNK" score
        ("y", None, 3.0 / 18),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocab)
        self.model.fit(training_text)
Example #32
class NgramModelTextGenerationTests(unittest.TestCase):
    """Using MLE estimator, generate some text."""

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocab)
        self.model.fit(training_text)

    def test_generate_one_no_context(self):
        self.assertEqual(self.model.generate(random_seed=3), "<UNK>")

    def test_generate_one_limiting_context(self):
        # We don't need random_seed for contexts with only one continuation
        self.assertEqual(self.model.generate(text_seed=["c"]), "d")
        self.assertEqual(self.model.generate(text_seed=["b", "c"]), "d")
        self.assertEqual(self.model.generate(text_seed=["a", "c"]), "d")

    def test_generate_one_varied_context(self):
        # When context doesn't limit our options enough, seed the random choice
        self.assertEqual(
            self.model.generate(text_seed=("a", "<s>"), random_seed=2), "a"
        )

    def test_generate_cycle(self):
        # Add a cycle to the model: bd -> b, db -> d
        more_training_text = [list(padded_everygrams(self.model.order, list("bdbdbd")))]
        self.model.fit(more_training_text)
        # Test that we can escape the cycle
        self.assertEqual(
            self.model.generate(7, text_seed=("b", "d"), random_seed=5),
            ["b", "d", "b", "d", "b", "d", "</s>"],
        )

    def test_generate_with_text_seed(self):
        self.assertEqual(
            self.model.generate(5, text_seed=("<s>", "e"), random_seed=3),
            ["<UNK>", "a", "d", "b", "<UNK>"],
        )

    def test_generate_oov_text_seed(self):
        self.assertEqual(
            self.model.generate(text_seed=("aliens",), random_seed=3),
            self.model.generate(text_seed=("<UNK>",), random_seed=3),
        )

    def test_generate_None_text_seed(self):
        # should crash with type error when we try to look it up in vocabulary
        with self.assertRaises(TypeError):
            self.model.generate(text_seed=(None,))

        # This will work
        self.assertEqual(
            self.model.generate(text_seed=None, random_seed=3),
            self.model.generate(random_seed=3),
        )
Example #33
    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocab)
        self.model.fit(training_text)
Example #34
class MleBigramTests(unittest.TestCase):
    """unit tests for MLENgramModel class"""

    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        logscore = self.model.logscore("d", ["e"])

        self.assertTrue(math.isinf(logscore))

    def test_entropy_perplexity_seen(self):
        # ngrams seen during training
        trained = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Ngram = Log score
        # <s>, a    = -1
        # a, b      = -1
        # b, UNK    = -1
        # UNK, a    = -1.585
        # a, d      = -1
        # d, </s>   = -1
        # TOTAL logscores   = -6.585
        # - AVG logscores   = 1.0975
        H = 1.0975
        perplexity = 2.1398
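        # NLTK defines perplexity as 2 ** entropy, so 2 ** 1.0975 ≈ 2.1398.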

        self.assertAlmostEqual(H, self.model.entropy(trained), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(trained), places=4)

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]

        self.assertTrue(math.isinf(self.model.entropy(untrained)))
        self.assertTrue(math.isinf(self.model.perplexity(untrained)))

    def test_entropy_perplexity_unigrams(self):
        # word = score, log score
        # <s>   = 0.1429, -2.8074
        # a     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # UNK   = 0.2143, -2.2224
        # d     = 0.1429, -2.8074
        # c     = 0.0714, -3.8073
        # </s>  = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        H = 3.0095
        perplexity = 8.0529

        text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]

        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)