Example #1
def test_models_have_correct_lambda_size():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(0, lm.n - 2):
        model = lm.models[i]
        assert len(model.lambdas) == len(model.hist_words_dct)
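
Every test in these excerpts leans on an open_file helper that is never shown. A minimal sketch, assuming it reads the fixture and returns the flat token list that lm.train expects (the project's real helper may tokenize differently):

def open_file(path):
    # Hypothetical stand-in: lowercase the file and split '.' off
    # as its own token, matching the token lists asserted below.
    with open(path, encoding='utf-8') as f:
        text = f.read().lower()
    return text.replace('.', ' . ').split()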
Example #2
def test_models_have_correct_n():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(0, lm.n - 2):
        model = lm.models[i]
        assert model.n == i + 2
Example #3
def test_perplexity_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    perp = round(lm.perplexity(2, math.log(0.5)), 5)
    correct = round(math.sqrt(2), 5)
    assert perp == correct
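
The expected value is consistent with perplexity(N, log_prob) computing exp(-log_prob / N): with N = 2 tokens and a total log-probability of log 0.5, exp(log 2 / 2) = 2 ** 0.5 = sqrt(2). (An inference from the assertion; the method body is not shown.)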
Example #4
def test_kn_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['text', 'shall', 'train']) == -2.0770634192748685
    assert lm.kn_evaluate(['this', 'text', 'dog']) == -3.1656313103493887
    assert lm.kn_evaluate(['the', 'brown', 'cat']) == -2.4724841297894433
Example #5
File: hmm1.py Project: TPLink32/nlp
def __init__(self):
    self.lm = LanguageModel('RenMinData.txt')
    self.dict = {}
    self.words = []
    self.max_len_word = 0
    self.load_dict('dict.txt')  # presumably fills self.dict, self.words, self.max_len_word
    self.graph = None
    self.viterbi_cache = {}  # memo table for the Viterbi decoding step
Example #6
def test_models_have_correct_beginning_grams():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sorted(lm.models[0].beginning_grams) \
        == sorted(['this', 'shall', 'PAD'])
    assert sorted(lm.models[1].beginning_grams) \
        == sorted(['PAD this', 'this text', 'PAD PAD', 'shall train'])
Example #7
def test_laplace_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['this', 'shall', 'train', 'PAD']) \
        == -2.890371757896165
    assert lm.laplace_evaluate(['dog', 'text', '.', 'PAD']) \
        == (math.log(1 / 9) + math.log(1 / 2))
Example #8
    def __init__(self):
        super().__init__()

        self.model_lm = LanguageModel()
        self.model_ct = ContentTransfer()
        self.kb = KnowledgeBase()
        self.ranker = Ranker(self.model_lm)
        self.local = True
Example #9
def test_train_creates_expected_hist_words_dict():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    assert sorted(list(model.hist_words_dct.keys())) \
        == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    assert list(model.hist_words_dct['this'].keys()) == ['text']
    assert list(model.hist_words_dct['text'].keys()) == ['.']
    assert list(model.hist_words_dct['shall'].keys()) == ['train']
    assert list(model.hist_words_dct['train'].keys()) == ['text']
    assert list(model.hist_words_dct['PAD'].keys()) == ['this']
    assert sorted(list(model.hist_words_dct['.'].keys())) \
        == sorted(['PAD', 'shall'])
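
Taken together, these assertions effectively reconstruct the fixture: kn_test.txt appears to tokenize to ['this', 'text', '.', 'shall', 'train', 'text', '.'] with PAD markers framing each sentence, i.e. something like "This text. Shall train text." That reading also matches the num_tokens == 7 and five-unigram assertions in Example #23.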
Example #10
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
        == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
        == sorted(['text', 'sample'])
Example #11
def main():
    p = get_argparser()
    args = p.parse_args()

    lm = LanguageModel()
    lm.configure_logger(level=logging.DEBUG if args.DEBUG else logging.INFO,
                        write_file=True)

    if args.train and args.data_path:
        lm.train(args.data_path,
                 output_path=args.train,
                 learning_rate=args.learning_rate,
                 hidden_size=args.hidden_size,
                 batch_size=args.batch_size,
                 max_epoch=args.max_epoch)

    elif args.test and args.data_path:
        lm.predict(args.test, args.data_path)

    else:
        # Well, this is silly.
        p.print_help()
        exit(2)
Example #12
def test_laplace_produces_expected_values2():
    lm = LanguageModel(1)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['text']) == math.log(3 / 12)
    assert lm.laplace_evaluate(['dog']) == math.log(1 / 12)
Example #13
                waited += 1
                if waited >= patience:
                    break
            era_index += 1
            era_loss = 0.
            era_samples = 0

    # Persist the final checkpoint, named after the number of completed eras.
    torch.save(checkpoint, os.path.join(save_dir, f"{era_index}_eras.pt"))
    return checkpoint


if __name__ == "__main__":
    vocab_path = "data/vocab.txt"
    in_tokens = 2
    embedding_size = 128
    with open(vocab_path) as r:
        vocab = [line.strip() for line in r]
    assert len(vocab) == len(set(vocab)), "vocab entries must be unique"
    vocab_size = len(vocab) + 1  # one extra index, presumably reserved (e.g. for padding)

    model = LanguageModel(in_tokens, vocab_size, embedding_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    train(model,
          optimizer,
          vocab, ["data/parted/0.txt"], ["data/parted/1.txt"],
          batch_size=32,
          max_train_eras=100,
          batches_per_era=100,
          max_val_batches=10)
Example #14
# -*- coding: utf-8 -*-
from lm import LanguageModel
from memoize import Memoize

lm = LanguageModel()


def splits(text, max_len=10):
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), max_len))]


@Memoize
def segment(text):
    text = text.strip()
    if not text:
        return []

    candidates = [[left] + segment(right) for left, right in splits(text)]
    return max(candidates, key=lm.get_words_prob)


if __name__ == '__main__':
    test = [
        'colorlessgreenideassleepfuriously.', 'ihaveadream.',
        'howtotrainadragon.', 'canwetakeaphotoofyou?'
    ]

    for text in test:
        words = segment(text)
        print(text, '->', ' '.join(words))
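
The @Memoize decorator is what keeps this tractable: uncached, segment recurses over an exponential number of split sequences, whereas caching by suffix bounds the work to roughly len(text) * max_len subproblems (assuming Memoize keys on the text argument).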
Example #15
def test_ngram(self):
    result = LanguageModel(2).get_ngrams(["hello", "world", "lmao"])
    self.assertEqual(result, [(None, 'hello'), ('hello', 'world'),
                              ('world', 'lmao'), ('lmao', None)])
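
The asserted output matches bigrams over a token list padded with a single None on each side. A minimal sketch of such a method for the n == 2 case (an assumption; the project's implementation, which presumably generalizes over self.n, is not shown):

def get_ngrams(self, tokens):
    # Pad both ends with None, then pair each item with its successor.
    padded = [None] + tokens + [None]
    return list(zip(padded, padded[1:]))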
Example #16
def test_discount():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.discount == 0.75
Example #17
def test_p_next_sums_to_one():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sum(lm.p_next(['this', 'text']).values()) == 1
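
One caveat: comparing a sum of floats to 1 with == only passes if rounding error happens to cancel; math.isclose(sum(...), 1.0) or pytest.approx(1.0) is the more robust pattern for this kind of check.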
Example #18
def setUp(self):
    self.lm = LanguageModel(3)
    self.token_sequences = [['the', 'cat', 'runs'], ['the', 'dog', 'runs']]
    self.lm.train(self.token_sequences)
Example #19
def test_models_have_correct_vocab_size():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.models[0].ngram_vocab_size == 7
    assert lm.models[1].ngram_vocab_size == 9
Example #20
def test_kn_produces_expected_values_n4():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['shall', 'train', 'text',
                           '.']) == -0.7742507185722116
Example #21
    else:
        source = Reader(options.input)

    if options.output == '-':
        writer = sys.stdout
    else:
        writer = Writer(options.output)

    if debug:
        rules.DEBUG = 1

    config = Config(options.config)
    if logger.level <= logging.INFO:
        config.write(sys.stderr)

    lm = LanguageModel(config.lm_file, config.lm_order)
    rule_table = RuleTable.load(config.rule_table_file, lm, config)

    extra_feature_funcs = build_extra_feature_funcs(config)
    recombination_checker = CombinedRecombinationChecker(extra_feature_funcs)
    decoder = CKYDecoder(config,
                         rule_table,
                         lm,
                         recombination_checker=recombination_checker,
                         extra_feature_funcs=extra_feature_funcs,
                         checking_hypo=checking,
                         expend_loser=expend_loser)

    logger.info('Start decoding...')

    def translate(data):
Example #22
    parser.add_argument('--vocab_len', type=int, default=19800, dest='vocab_len')
    parser.add_argument('--lr', type=float, default=1e-3, dest='lr')
    parser.add_argument('--minibatch_size', type=int, default=64, dest='minibatch_size')
    parser.add_argument('--num_epochs', type=int, default=30, dest='num_epochs')
    parser.add_argument('--models_folder', default='../lm_models', dest='folder')
    parser.add_argument('--graph_folder', default='../lm_graph', dest='graphs')
    args = parser.parse_args()
    
    # Fit the model

    if args.mode == 'train':
        # Read the initial word vectors
        train_data = np.load('lm_train_data.npy')
        train_labels = np.load('lm_train_labels.npy')
        
        lm = LanguageModel(args.lr, args.num_steps, args.vocab_len, args.minibatch_size)
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            lm.fit(sess, train_data, train_labels, num_epochs=args.num_epochs, folder=args.folder, graph_folder=args.graphs)
    else:
        tweets = dill.load(open("tweets", "rb"))
        w2i = dill.load(open("w2i","rb"))
        i2w = dill.load(open("i2w","rb"))
        word_vector = dill.load(open("word_vecs","rb"))

        start_wd = ["president", "@netanyahu", "democrats", "gop", "congress", "white", "my", "the", "#makeamericagreatagain" ,"republicans", "wall", "@realdonaldtrump", "crooked"]
        input_list = [np.array([[word_vector[w2i[item]]]]) for item in start_wd]

        model = LanguageModel(args.lr, args.num_steps, args.vocab_len, args.minibatch_size)
Example #23
def test_lm_has_correct_number_tokens_and_unigram_types():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.num_tokens == 7
    assert len(lm.unigrams) == 5
Example #24
def main(args):
	"""
	Main function of the program operates based on the argument provided.

	Train
		- Ask for ngram
		- Ask for training file path
		- Train language model
		- Save the trained model

	Generate
		- Load the saved model from pickle file
		- Ask for a beam search (y/n)
			- Ask Beam length
		- Print one generated sentence in terminal
		- Ask for number of sentences to be generated on file
		- Save the input number of sentences in a file (Default: new_shakespeare.txt)

	Perplexity
		- Load Pickle file
		- Ask the test set file path
		- Print perplexity value

	Common
		- Load pickle
		- Ask for the number of most common n-grams
		- Print the most common n-grams with their occurrence counts.

	"""
	if args['train']:
		if not args['--n']:
			ngram = input("Please enter n for n-gram (Default: 3)-\n")
			if not ngram:
				ngram=3
		else:
			ngram=args['--n']
		lm = LanguageModel(int(ngram))

		if not args['--path']:
			path = input("Please enter path of the file-\n")
		else:
			path = args['--path']
		lm.train(readFile(path))
		print("N-gram training completed")
		print("Saving the model")
		with open('trained_model_ngram.pkl', 'wb') as f:
			pickle.dump(lm, f)
		print("Model saved")

	if args['generate']:
		lm = loadPickle()

		if click.confirm('Do you want to generate with Beam search?', default=True):
			lm.beam_flag = True
			beam_size = input("Enter beam size (Default: 20)-\n")
			if beam_size:  # keep the model's default width when the prompt is left empty
				lm.beam_width = int(beam_size)
		else:
			lm.beam_flag = False
		print("Generating one sentence in terminal...")
		print(detokenize(lm.generate()))
		if not args['--lines']:
			noOfText = input("Enter the number of generated sentences to save (Default: 10)-\n")
			if not noOfText:
				noOfText=10
		else:
			noOfText = args['--lines']
		generated = []
		for g in range(0, int(noOfText)):
			generated.append(detokenize(lm.generate()))

		with open('new_shakespeare.txt', 'w') as f:
			for g in generated:
				f.write("%s\n" % g)
		print("Sentence file generated in current folder")

	if args['perplexity']:
		lm = loadPickle()
		if not args['--path']:
			path = input("Please enter path of the test file-\n")
		else:
			path = args['--path']
		print("Perplexity for {}-gram is {}".format(lm.ngram,lm.perplexity(readFile(path))))

	if args['common']:
		lm = loadPickle()
		if args['--number']:
			number = args['--number']
		else:
			number = 5
		lm.count_common_ngram(int(number))
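
main() expects a docopt-style mapping of commands and options. A minimal sketch of driving it directly, with the keys taken from the accesses above (the actual CLI wiring and the corpus file name are assumptions):

if __name__ == '__main__':
    # Hypothetical invocation: train a trigram model on a local corpus file.
    main({'train': True, 'generate': False, 'perplexity': False,
          'common': False, '--n': '3', '--path': 'shakespeare.txt',
          '--lines': None, '--number': None})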
Example #25
import pickle
from lm import LanguageModel

train_filename = "train_sequence.pkl"
model_filename = "model.pkl"

with open(train_filename, "rb") as f:
    dataset = pickle.load(f)

lm = LanguageModel(lidstone_param=3e-4)
lm.fit(dataset)

with open(model_filename, "wb") as f:
    pickle.dump(lm, f)
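
Loading the model back later mirrors the dump, using standard pickle:

with open(model_filename, "rb") as f:
    lm = pickle.load(f)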