def setUp(self):
    """Prepare a translator with training switched off and a bilingual corpus parser.

    Clears the registered xnmt events, creates a model context backed by a
    persistent parameter collection, assembles a ``DefaultTranslator`` and
    points the training corpus / parser at the ``examples/data/head.*`` files.
    """
    xnmt.events.clear()

    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = PersistentParamCollection("some_file", 1)

    # Components are created in the same order the translator receives them.
    src_embedder = SimpleWordEmbedder(context, vocab_size=100)
    encoder = BiLSTMSeqTransducer(context)
    attender = MlpAttender(context)
    trg_embedder = SimpleWordEmbedder(context, vocab_size=100)
    bridge = CopyBridge(context, dec_layers=1)
    decoder = MlpSoftmaxDecoder(context, vocab_size=100, bridge=bridge)

    translator = DefaultTranslator(
        src_embedder=src_embedder,
        encoder=encoder,
        attender=attender,
        trg_embedder=trg_embedder,
        decoder=decoder,
    )
    self.model = translator
    translator.initialize_training_strategy(TrainingStrategy())
    translator.set_train(False)
    translator.initialize_generator()

    # The same two files serve as both training and dev data.
    corpus = BilingualTrainingCorpus(
        train_src="examples/data/head.ja",
        train_trg="examples/data/head.en",
        dev_src="examples/data/head.ja",
        dev_trg="examples/data/head.en",
    )
    self.training_corpus = corpus
    self.corpus_parser = BilingualCorpusParser(
        src_reader=PlainTextReader(),
        trg_reader=PlainTextReader(),
        training_corpus=corpus,
    )
class PretrainedSimpleWordEmbedderSanityTest(unittest.TestCase):
    """Sanity test for loading pretrained vectors into ``PretrainedSimpleWordEmbedder``."""

    def setUp(self):
        """Reset events, read ``head.ja`` through a reader, freeze it, and build a context."""
        xnmt.events.clear()

        reader = PlainTextReader()
        self.input_reader = reader
        # The sentences are discarded; the read is performed only for its effect
        # on the reader (presumably filling its vocabulary -- confirm in PlainTextReader).
        for _ in reader.read_sents('examples/data/head.ja'):
            pass
        reader.freeze()

        context = ModelContext()
        self.context = context
        context.dynet_param_collection = PersistentParamCollection(None, 0)

    def test_load(self):
        """
        Loads the small pretrained vector file and checks that the embedding
        stored for one word matches the corresponding line of that file.
        """
        vec_path = 'examples/data/wiki.ja.vec.small'
        embedder = PretrainedSimpleWordEmbedder(
            self.context, self.input_reader.vocab, vec_path, 300)
        # Dimension check is disabled in this test:
        # self.assertEqual(embedder.embeddings.shape()[::-1], (self.input_reader.vocab_size(), 300))

        with io.open(vec_path, encoding='utf-8') as vecfile:
            # Line at index 9 of the file: the vector for '日'
            fields = next(islice(vecfile, 9, None)).split()

        word = fields[0]
        word_id = self.input_reader.vocab.w2i[word]
        raw_values = fields[1:]

        actual = embedder.embeddings.batch([word_id]).npvalue().tolist()
        expected = np.array(raw_values, dtype=float).tolist()
        self.assertTrue(np.allclose(actual, expected, rtol=1e-5))
def setUp(self):
    """Reset events, read ``head.ja`` through a frozen reader and create the global context."""
    xnmt.events.clear()

    reader = PlainTextReader()
    self.input_reader = reader
    # Sentences are discarded; only the reader's own state after reading matters here.
    for _ in reader.read_sents('examples/data/head.ja'):
        pass
    reader.freeze()

    param_collection = PersistentParamCollection(None, 0)
    self.context = ExpGlobal(dynet_param_collection=param_collection)
def setUp(self):
    """Build a translator with training switched off, a beam-1 generator, and load example data."""
    xnmt.events.clear()

    param_collection = PersistentParamCollection("some_file", 1)
    exp_global = ExpGlobal(dynet_param_collection=param_collection)
    self.exp_global = exp_global

    # Readers and components are created in the order the translator receives them.
    src_reader = PlainTextReader()
    trg_reader = PlainTextReader()
    src_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
    encoder = BiLSTMSeqTransducer(exp_global=exp_global)
    attender = MlpAttender(exp_global=exp_global)
    trg_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
    bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
    decoder = MlpSoftmaxDecoder(exp_global=exp_global, vocab_size=100, bridge=bridge)

    translator = DefaultTranslator(
        src_reader=src_reader,
        trg_reader=trg_reader,
        src_embedder=src_embedder,
        encoder=encoder,
        attender=attender,
        trg_embedder=trg_embedder,
        decoder=decoder,
    )
    self.model = translator
    translator.set_train(False)
    translator.initialize_generator(beam=1)

    # Data is read through the model's own readers.
    self.src_data = list(translator.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(translator.trg_reader.read_sents("examples/data/head.en"))
def setUp(self):
    """Reset events, create the global context and load the example sentences."""
    xnmt.events.clear()

    param_collection = NonPersistentParamCollection()
    self.exp_global = ExpGlobal(dynet_param_collection=param_collection)

    src_reader = PlainTextReader()
    trg_reader = PlainTextReader()
    self.src_reader = src_reader
    self.trg_reader = trg_reader

    self.src_data = list(src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(trg_reader.read_sents("examples/data/head.en"))
def setUp(self):
    """Reset events, create the model context and a parser over the example corpus."""
    xnmt.events.clear()

    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = PersistentParamCollection("some_file", 1)

    # The same two files serve as both training and dev data.
    corpus = BilingualTrainingCorpus(
        train_src="examples/data/head.ja",
        train_trg="examples/data/head.en",
        dev_src="examples/data/head.ja",
        dev_trg="examples/data/head.en",
    )
    self.training_corpus = corpus
    self.corpus_parser = BilingualCorpusParser(
        src_reader=PlainTextReader(),
        trg_reader=PlainTextReader(),
        training_corpus=corpus,
    )
def test_overfitting(self):
    """Run training 50 times on the tiny example corpus and expect a per-word loss of ~0.

    The regimen is configured for one epoch per ``run_training`` call; after
    the last call the summed epoch loss divided by the epoch word count must
    equal 0.0 to two decimal places.
    """
    exp_global = ExpGlobal(
        dynet_param_collection=NonPersistentParamCollection(), dropout=0.0)
    self.exp_global = exp_global
    exp_global.default_layer_dim = 16

    src_path = "examples/data/head.ja"
    trg_path = "examples/data/head.en"

    # One batcher instance is shared by training and the dev evaluation task.
    batcher = SrcBatcher(batch_size=10, break_ties_randomly=False)
    loss_calculator = LossCalculator()

    src_reader = PlainTextReader()
    trg_reader = PlainTextReader()
    src_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
    encoder = BiLSTMSeqTransducer(exp_global)
    attender = MlpAttender(exp_global)
    trg_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
    bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
    decoder = MlpSoftmaxDecoder(exp_global, vocab_size=100, bridge=bridge)
    model = DefaultTranslator(
        src_reader=src_reader,
        trg_reader=trg_reader,
        src_embedder=src_embedder,
        encoder=encoder,
        attender=attender,
        trg_embedder=trg_embedder,
        decoder=decoder,
    )

    dev_tasks = [
        LossEvalTask(model=model,
                     src_file="examples/data/head.ja",
                     ref_file="examples/data/head.en",
                     batcher=batcher)
    ]
    trainer = AdamTrainer(exp_global, alpha=0.1)

    training_regimen = xnmt.training_regimen.SimpleTrainingRegimen(
        exp_global=exp_global,
        src_file=src_path,
        trg_file=trg_path,
        loss_calculator=loss_calculator,
        model=model,
        dev_tasks=dev_tasks,
        run_for_epochs=1,
        trainer=trainer,
        batcher=batcher,
    )
    training_regimen.exp_global = exp_global

    remaining = 50
    while remaining > 0:
        training_regimen.run_training(save_fct=lambda: None, update_weights=True)
        remaining -= 1

    logger = training_regimen.logger
    loss_per_word = logger.epoch_loss.sum() / logger.epoch_words
    self.assertAlmostEqual(0.0, loss_per_word, places=2)
def test_overfitting(self):
    """Run 50 epochs on the tiny example corpus and expect a per-word loss of ~0.

    After the last epoch, the logged ``'loss'`` value divided by the epoch
    word count must equal 0.0 to two decimal places.
    """
    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = PersistentParamCollection("some_file", 1)
    context.default_layer_dim = 16

    # The same two files serve as both training and dev data.
    training_corpus = BilingualTrainingCorpus(
        train_src="examples/data/head.ja",
        train_trg="examples/data/head.en",
        dev_src="examples/data/head.ja",
        dev_trg="examples/data/head.en",
    )

    # Values are evaluated top to bottom, i.e. in the same order as the
    # original one-key-at-a-time construction.
    train_args = {
        'corpus_parser': BilingualCorpusParser(
            training_corpus=training_corpus,
            src_reader=PlainTextReader(),
            trg_reader=PlainTextReader()),
        'training_strategy': TrainingStrategy(),
        'model': DefaultTranslator(
            src_embedder=SimpleWordEmbedder(context, vocab_size=100),
            encoder=BiLSTMSeqTransducer(context),
            attender=MlpAttender(context),
            trg_embedder=SimpleWordEmbedder(context, vocab_size=100),
            decoder=MlpSoftmaxDecoder(context, vocab_size=100),
        ),
        'model_file': None,
        'save_num_checkpoints': 0,
        'trainer': AdamTrainer(context, alpha=0.1),
        'batcher': SrcBatcher(batch_size=10, break_ties_randomly=False),
    }

    training_regimen = xnmt.train.TrainingRegimen(yaml_context=context, **train_args)
    training_regimen.model_context = context

    epochs_left = 50
    while epochs_left > 0:
        training_regimen.one_epoch(update_weights=True)
        epochs_left -= 1

    logger = training_regimen.logger
    loss_per_word = logger.epoch_loss.loss_values['loss'] / logger.epoch_words
    self.assertAlmostEqual(0.0, loss_per_word, places=2)
def test_train_dev_loss_equal(self):
    """Without weight updates, per-word training loss must match the dev loss.

    Training and dev data are the same files, no trainer is supplied, and
    ``run_training`` is called with ``update_weights=False``.
    """
    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = NonPersistentParamCollection()

    training_corpus = BilingualTrainingCorpus(
        train_src="examples/data/head.ja",
        train_trg="examples/data/head.en",
        dev_src="examples/data/head.ja",
        dev_trg="examples/data/head.en",
    )

    # Values are evaluated top to bottom, i.e. in the same order as the
    # original one-key-at-a-time construction.
    train_args = {
        'corpus_parser': BilingualCorpusParser(
            training_corpus=training_corpus,
            src_reader=PlainTextReader(),
            trg_reader=PlainTextReader()),
        'loss_calculator': LossCalculator(),
        'model': DefaultTranslator(
            src_embedder=SimpleWordEmbedder(context, vocab_size=100),
            encoder=BiLSTMSeqTransducer(context),
            attender=MlpAttender(context),
            trg_embedder=SimpleWordEmbedder(context, vocab_size=100),
            decoder=MlpSoftmaxDecoder(context, vocab_size=100),
        ),
        'trainer': None,
        'batcher': SrcBatcher(batch_size=5, break_ties_randomly=False),
        'run_for_epochs': 1,
    }

    training_regimen = xnmt.training_regimen.SimpleTrainingRegimen(
        yaml_context=context, **train_args)
    training_regimen.model_context = context
    training_regimen.run_training(update_weights=False)

    logger = training_regimen.logger
    train_loss_per_word = logger.epoch_loss.loss_values['loss'] / logger.epoch_words
    self.assertAlmostEqual(train_loss_per_word, logger.dev_score.loss)
class TestTruncatedBatchTraining(unittest.TestCase):
    """Compares per-sentence and batched losses on sentences truncated to a common length."""

    def setUp(self):
        """Reset events, create the global context and load the example sentences."""
        xnmt.events.clear()
        param_collection = NonPersistentParamCollection()
        self.exp_global = ExpGlobal(dynet_param_collection=param_collection)

        src_reader = PlainTextReader()
        trg_reader = PlainTextReader()
        self.src_reader = src_reader
        self.trg_reader = trg_reader
        self.src_data = list(src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(trg_reader.read_sents("examples/data/head.en"))

    def assert_single_loss_equals_batch_loss(self, model, pad_src_to_multiple=1):
        """
        Asserts that the sum of single-sentence losses equals the summed batch loss.

        The first five source and target sentences are each cut to the length of
        the shortest one in their group, so no masking is necessary. Source
        sentences are then extended with ``Vocab.ES`` until their length is a
        multiple of ``pad_src_to_multiple``.
        """
        batch_size = 5

        src_batch = self.src_data[:batch_size]
        shortest_src = min(len(sent) for sent in src_batch)
        src_trunc = [sent[:shortest_src] for sent in src_batch]
        for sent in src_trunc:
            # Force an end-of-sentence symbol at the cut, then pad if requested.
            sent[shortest_src - 1] = Vocab.ES
            while len(sent) % pad_src_to_multiple != 0:
                sent.append(Vocab.ES)

        trg_batch = self.trg_data[:batch_size]
        shortest_trg = min(len(sent) for sent in trg_batch)
        trg_trunc = [sent[:shortest_trg] for sent in trg_batch]
        for sent in trg_trunc:
            sent[shortest_trg - 1] = Vocab.ES

        # Accumulate sentence by sentence, each in a fresh computation graph.
        single_loss = 0.0
        for idx in range(batch_size):
            dy.renew_cg()
            loss_expr = model.calc_loss(src=src_trunc[idx],
                                        trg=trg_trunc[idx],
                                        loss_calculator=LossCalculator())
            single_loss += loss_expr.value()

        dy.renew_cg()
        batched_expr = model.calc_loss(src=mark_as_batch(src_trunc),
                                       trg=mark_as_batch(trg_trunc),
                                       loss_calculator=LossCalculator())
        batched_loss = batched_expr.value()
        self.assertAlmostEqual(single_loss, sum(batched_loss), places=4)

    def test_loss_model1(self):
        """Single-layer BiLSTM encoder with MLP attention."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        encoder = BiLSTMSeqTransducer(exp_global)
        attender = MlpAttender(exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator)

    def test_loss_model2(self):
        """Three-layer pyramidal LSTM encoder; source padded to a multiple of 4."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        encoder = PyramidalLSTMSeqTransducer(exp_global, layers=3)
        attender = MlpAttender(exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator, pad_src_to_multiple=4)

    def test_loss_model3(self):
        """Three-layer BiLSTM encoder with MLP attention."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        encoder = BiLSTMSeqTransducer(exp_global, layers=3)
        attender = MlpAttender(exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global=exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator)

    def test_loss_model4(self):
        """Single-layer BiLSTM encoder with dot-product attention."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        encoder = BiLSTMSeqTransducer(exp_global)
        attender = DotAttender(exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator)
class TestBatchTraining(unittest.TestCase):
    """Compares per-sentence and batched losses when the target side is padded and masked."""

    def setUp(self):
        """Reset events, create the global context and load the example sentences."""
        xnmt.events.clear()
        param_collection = NonPersistentParamCollection()
        self.exp_global = ExpGlobal(dynet_param_collection=param_collection)

        src_reader = PlainTextReader()
        trg_reader = PlainTextReader()
        self.src_reader = src_reader
        self.trg_reader = trg_reader
        self.src_data = list(src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(trg_reader.read_sents("examples/data/head.en"))

    def assert_single_loss_equals_batch_loss(self, model, pad_src_to_multiple=1):
        """
        Asserts that the sum of single-sentence losses equals the summed batch loss.

        The source side is cut to the shortest sentence of the batch (and extended
        with ``Vocab.ES`` to a multiple of ``pad_src_to_multiple``). The target
        side is not truncated: for the batched call it is padded with ``Vocab.ES``
        to the longest sentence and accompanied by a mask that is 1.0 on the
        padded positions.
        """
        batch_size = 5

        src_batch = self.src_data[:batch_size]
        shortest_src = min(len(sent) for sent in src_batch)
        src_trunc = [sent[:shortest_src] for sent in src_batch]
        for sent in src_trunc:
            sent[shortest_src - 1] = Vocab.ES
            while len(sent) % pad_src_to_multiple != 0:
                sent.append(Vocab.ES)

        trg_sents = self.trg_data[:batch_size]
        longest_trg = max(len(sent) for sent in trg_sents)
        trg_masks = Mask(np.zeros([batch_size, longest_trg]))
        for row in range(batch_size):
            # Everything past the real sentence length is padding.
            trg_masks.np_arr[row, len(trg_sents[row]):] = 1.0
        trg_padded = [
            list(sent) + [Vocab.ES] * (longest_trg - len(sent))
            for sent in trg_sents
        ]

        # Accumulate sentence by sentence (unpadded targets), each in a fresh graph.
        single_loss = 0.0
        for idx in range(batch_size):
            dy.renew_cg()
            loss_expr = model.calc_loss(src=src_trunc[idx],
                                        trg=trg_sents[idx],
                                        loss_calculator=LossCalculator())
            single_loss += loss_expr.value()

        dy.renew_cg()
        batched_expr = model.calc_loss(src=mark_as_batch(src_trunc),
                                       trg=mark_as_batch(trg_padded, trg_masks),
                                       loss_calculator=LossCalculator())
        batched_loss = batched_expr.value()
        self.assertAlmostEqual(single_loss, sum(batched_loss), places=4)

    def test_loss_model1(self):
        """Single-layer BiLSTM encoder with MLP attention."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
        encoder = BiLSTMSeqTransducer(exp_global=exp_global)
        attender = MlpAttender(exp_global=exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global=exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator)

    def test_loss_model2(self):
        """Three-layer pyramidal LSTM encoder; source padded to a multiple of 4."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
        encoder = PyramidalLSTMSeqTransducer(exp_global=exp_global, layers=3)
        attender = MlpAttender(exp_global=exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global=exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator, pad_src_to_multiple=4)

    def test_loss_model3(self):
        """Three-layer BiLSTM encoder with MLP attention."""
        exp_global = self.exp_global
        src_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
        encoder = BiLSTMSeqTransducer(exp_global=exp_global, layers=3)
        attender = MlpAttender(exp_global=exp_global)
        trg_embedder = SimpleWordEmbedder(exp_global=exp_global, vocab_size=100)
        bridge = CopyBridge(exp_global=exp_global, dec_layers=1)
        decoder = MlpSoftmaxDecoder(exp_global=exp_global, vocab_size=100, bridge=bridge)
        translator = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=encoder,
            attender=attender,
            trg_embedder=trg_embedder,
            decoder=decoder,
        )
        translator.set_train(False)
        self.assert_single_loss_equals_batch_loss(translator)
class TestEncoder(unittest.TestCase):
    """Checks output lengths and masks produced by the different sequence encoders."""

    def setUp(self):
        """Reset events, create the global context and load the example sentences."""
        xnmt.events.clear()
        self.exp_global = ExpGlobal(
            dynet_param_collection=PersistentParamCollection("some_file", 1))

        self.src_reader = PlainTextReader()
        self.trg_reader = PlainTextReader()
        self.src_data = list(self.src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))

    # The two methods below have intentionally empty bodies; they exist only so
    # the test case can raise the corresponding xnmt events itself.
    # NOTE(review): presumably the decorator forwards the call to the model
    # components that handle these events -- confirm in xnmt.events.
    @xnmt.events.register_xnmt_event
    def set_train(self, val):
        pass

    @xnmt.events.register_xnmt_event
    def start_sent(self, src):
        pass

    def assert_in_out_len_equal(self, model):
        """Assert that encoding the first source sentence keeps its length.

        Args:
          model: translator whose ``src_embedder`` and ``encoder`` are exercised.
        """
        dy.renew_cg()
        self.set_train(True)
        src = self.src_data[0]
        self.start_sent(src)
        embeddings = model.src_embedder.embed_sent(src)
        encodings = model.encoder(embeddings)
        self.assertEqual(len(embeddings), len(encodings))

    def test_bi_lstm_encoder_len(self):
        """A three-layer BiLSTM encoder returns one encoding per input embedding."""
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            encoder=BiLSTMSeqTransducer(self.exp_global, layers=3),
            attender=MlpAttender(self.exp_global),
            trg_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            decoder=MlpSoftmaxDecoder(self.exp_global, vocab_size=100),
        )
        self.assert_in_out_len_equal(model)

    def test_uni_lstm_encoder_len(self):
        """A unidirectional LSTM encoder returns one encoding per input embedding."""
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            encoder=UniLSTMSeqTransducer(self.exp_global),
            attender=MlpAttender(self.exp_global),
            trg_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            decoder=MlpSoftmaxDecoder(self.exp_global, vocab_size=100),
        )
        self.assert_in_out_len_equal(model)

    def test_res_lstm_encoder_len(self):
        """A three-layer residual LSTM encoder returns one encoding per input embedding."""
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            encoder=ResidualLSTMSeqTransducer(self.exp_global, layers=3),
            attender=MlpAttender(self.exp_global),
            trg_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            decoder=MlpSoftmaxDecoder(self.exp_global, vocab_size=100),
        )
        self.assert_in_out_len_equal(model)

    def test_py_lstm_encoder_len(self):
        """A three-layer pyramidal LSTM encoder yields ceil(len / 4) encodings.

        Each of the first ten source sentences is padded with ``Vocab.ES``
        before being embedded and encoded.
        """
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            encoder=PyramidalLSTMSeqTransducer(self.exp_global, layers=3),
            attender=MlpAttender(self.exp_global),
            trg_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            decoder=MlpSoftmaxDecoder(self.exp_global, vocab_size=100),
        )
        self.set_train(True)
        for sent_i in range(10):
            dy.renew_cg()
            sent = self.src_data[sent_i]
            # Pad by 4 - (len % 4) symbols; a length already divisible by 4
            # therefore still receives four extra symbols.
            src = sent.get_padded_sent(Vocab.ES, 4 - (len(sent) % 4))
            self.start_sent(src)
            embeddings = model.src_embedder.embed_sent(src)
            encodings = model.encoder(embeddings)
            self.assertEqual(int(math.ceil(len(embeddings) / float(4))),
                             len(encodings))

    def test_py_lstm_mask(self):
        """A one-layer pyramidal LSTM encoder passes the input mask through unchanged.

        The first three batches packed by a ``TrgBatcher`` are encoded; the mask
        on the encodings must be ``None`` when the batch has no mask and
        numerically equal to the batch mask otherwise.
        """
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            encoder=PyramidalLSTMSeqTransducer(self.exp_global, layers=1),
            attender=MlpAttender(self.exp_global),
            trg_embedder=SimpleWordEmbedder(self.exp_global, vocab_size=100),
            decoder=MlpSoftmaxDecoder(self.exp_global, vocab_size=100),
        )

        batcher = xnmt.batcher.TrgBatcher(batch_size=3)
        train_src, _ = batcher.pack(self.src_data, self.trg_data)

        self.set_train(True)
        for sent_i in range(3):
            dy.renew_cg()
            src = train_src[sent_i]
            self.start_sent(src)
            embeddings = model.src_embedder.embed_sent(src)
            encodings = model.encoder(embeddings)
            if src.mask is None:
                # unittest assertion instead of a bare ``assert``: the latter is
                # removed under ``python -O`` and would let the test pass vacuously.
                self.assertIsNone(encodings.mask)
            else:
                np.testing.assert_array_almost_equal(src.mask.np_arr,
                                                     encodings.mask.np_arr)