Exemplo n.º 1
0
 def test_detokenize(self):
     """Check that ``corpus.detokenize`` joins a token list back into text.

     Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and was
     removed in Python 3.12.
     """
     tokens = ['oh', 'you', "can't", 'help', 'that', 'said', 'the', 'cat']
     expected = "oh you can't help that said the cat"
     expected_trailing_space = "oh you can't help that said the cat "
     text = corpus.detokenize(tokens)
     self.assertEqual(expected, text, "Should be same text")
     # NOTE(review): the two expected strings differ only by a trailing
     # space, so a plain ``str`` result cannot satisfy both assertions.
     # Confirm which form ``detokenize`` should produce and drop the other.
     self.assertEqual(expected_trailing_space, text, "Should be same text")
Exemplo n.º 2
0
def generate():
    """Train a model on a user-chosen file and return one generated text.

    Prompts for the training file name and the n-gram order, tokenizes each
    line of the file with ``corpus.tokenize``, trains the model returned by
    ``init(n)`` on those token sequences, and detokenizes one generated
    sequence.

    Returns:
        The result of ``corpus.detokenize`` applied to ``model.generate()``.

    Raises:
        OSError: if the training file cannot be opened.
        ValueError: if the n-gram order entered is not an integer.
    """
    file_name = input("Please enter a training set's filename:")
    # The context manager guarantees the training file is closed, even if
    # the prompt, model construction or tokenization raises.
    with open(file_name) as training_file:
        n = int(input("Please input a n-grams value 'n':"))
        model = init(n)
        sequences = [corpus.tokenize(line) for line in training_file]
    model.train(sequences)
    return corpus.detokenize(model.generate())
Exemplo n.º 3
0
def generate_save():
    """Train a model interactively and write generated texts to a file.

    Prompts, in order, for the output file name, the number of texts to
    generate, the training file name and the n-gram order. Each line of the
    training file is tokenized with ``corpus.tokenize`` and the model
    returned by ``init(n)`` is trained on the resulting sequences. The
    requested number of texts is then generated, detokenized, and written to
    the output file, one text per line.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if a numeric prompt is answered with a non-integer.
    """
    filename = input('Please input a filename:')
    number = int(input('Please input number of desire text:'))

    file_name = input("Please enter a training set's filename:")
    # Close the training file deterministically; the original rebound the
    # handle to the output file and never closed this one.
    with open(file_name) as training_file:
        n = int(input("Please input a n-grams value 'n':"))
        model = init(n)
        sequences = [corpus.tokenize(line) for line in training_file]
    model.train(sequences)

    # Build the whole output first so the output file is only created once
    # generation has succeeded, as before.
    new_text = ''.join(
        corpus.detokenize(model.generate()) + '\n' for _ in range(number)
    )

    with open(filename, 'w') as output_file:
        output_file.write(new_text)
Exemplo n.º 4
0
 def test_short(self):
     """A token list holding only '.' must detokenize to '.'."""
     detokenized = corpus.detokenize(['.'])
     self.assertEqual(detokenized, '.')
Exemplo n.º 5
0
 def test_generate(self):
     """Generated output, once detokenized, must be a string.

     Trains a trigram ``lm.LanguageModel`` on a single sentence and checks
     the type of the detokenized generation.
     """
     model = lm.LanguageModel(3)
     model.train([['This', 'is', 'an', 'apple', '.']])
     # assertIsInstance reports the offending type on failure, unlike
     # assertTrue(type(...) == str), which only reports "False is not true".
     self.assertIsInstance(corpus.detokenize(model.generate()), str)
Exemplo n.º 6
0
def main():
    """Run the interactive console menu for the language model.

    Repeatedly prints the menu, reads a choice from standard input and
    dispatches on it:

    1. create an ``lm.LanguageModel`` with a user-supplied ``n``;
    2. call ``load`` on the model with a user-supplied file name;
    3. print one generated, detokenized text;
    4. write a user-supplied number of generated texts to a file;
    5. print ``p_next`` for the tokenized, lower-cased input text;
    6. print the rounded perplexity of the model;
    7. exit via ``sys.exit(0)``.

    Options 2-6 need a model; if none has been created yet a hint is printed
    and the menu is shown again, where the original crashed on an unbound
    variable.
    """
    menu_lines = (
        "Press 1 : Create a new language model with a user-specified n",
        "Press 2 : Load texts from a file, and train the language model on those texts",
        "Press 3 : Generate a text from the language model, and print it to the screen",
        "Press 4 : Generate a user-specified number of texts from the language model, and write them to a file",
        "Press 5 : Print the predicted  next word's probability distribution",
        "Press 6 : Perplexity of language model",
        "Press 7 : Exit",
        "Enter your choice (integer) ",
    )
    needs_model = ("2", "3", "4", "5", "6")

    # No model exists until the user picks option 1.
    c = None

    while True:
        for line in menu_lines:
            print(line)
        text = input()

        if text in needs_model and c is None:
            print()
            print("No language model has been created yet. Press 1 first.")
            continue

        if text == "1":
            print()
            print("Enter the value of n(integer value)")
            n = int(input())
            c = lm.LanguageModel(n)
            print("The value for ngram language model is ", n, "gram model")

        elif text == "2":
            print()
            print("You have pressed 2")
            print("Enter the filename")
            filename = input()
            c.load(filename)

        elif text == "3":
            print()
            print("You have pressed 3 ")
            print("Generate a random text")
            print(corpus.detokenize(c.generate()))

        elif text == "4":
            print()
            print("You have pressed 4 ")
            print("Enter the number for how many random texts you want")
            number_random = int(input())
            print("Enter the filename you want to save for random text")
            filename = input()
            # ``with`` closes the file even if generation fails, and
            # ``range`` yields nothing for a negative count, where the
            # original count-down loop never terminated.
            with open(filename, "w") as file:
                for _ in range(number_random):
                    file.write(corpus.detokenize(c.generate()) + "\n")

        elif text == "5":
            print()
            print("You have pressed 5 ")
            print(
                "Enter the text and predict the next word's probability distribution"
            )
            s = input().lower()
            print(c.p_next(corpus.tokenize(s)))

        elif text == "6":
            print()
            print("You have pressed 6 ")
            print("Perplexity of the current language model is ",
                  round(c.perplexity()))

        elif text == "7":
            print()
            print("You have pressed 7 for exit")
            print("Exiting the main program")
            sys.exit(0)

        else:
            print(
                "Incorrect input. Please enter correct input for selecting option"
            )
Exemplo n.º 7
0
 def test_empty(self):
     """An empty token list must detokenize to the empty string."""
     detokenized = corpus.detokenize([])
     self.assertEqual(detokenized, '')
Exemplo n.º 8
0
def generate5(mdl):
    """Write five generated texts to 'new_shakespeare.txt'.

    Each text is ``cp.detokenize(mdl.generate())`` followed by a blank line.
    The file is opened in write mode, replacing any existing content.
    """
    with open('new_shakespeare.txt', 'w') as out_file:
        for _ in range(5):
            generated = cp.detokenize(mdl.generate())
            out_file.write(generated)
            out_file.write('\n\n')
Exemplo n.º 9
0
def generate(mdl):
    """Print one generated, detokenized text preceded by a newline."""
    generated = cp.detokenize(mdl.generate())
    print('\n' + generated)
Exemplo n.º 10
0
 def test_detokenize(self):
     """Detokenizing ``self.input_tokens`` must give the reference sentence.

     The test id is printed first so it shows up in the test output.
     """
     print("id: " + self.id())
     expected = "Simple array for testing detokenization"
     actual = corpus.detokenize(self.input_tokens)
     self.assertEqual(actual, expected)
Exemplo n.º 11
0
def test_detokenize_produces_expected_tokens(tokens, detokenized):
    """``detokenize(tokens)``, wrapped in a list, must equal ``detokenized``."""
    wrapped_result = [detokenize(tokens)]
    assert wrapped_result == detokenized
Exemplo n.º 12
0
def test_detokenize_handles_arbitrary_texts(tokens):
    """Smoke test: ``detokenize`` must accept ``tokens`` without raising."""
    wrapped_result = [detokenize(tokens)]
    # NOTE(review): a one-element list is always truthy, so this assertion
    # cannot fail; the test effectively only checks that no exception is
    # raised. Confirm whether a stronger check on the result was intended.
    assert wrapped_result