def run(model_path, test_path, train_path, settings, batch_size, buffer_size, device, model_info, full, confusion):
    """Load a stored model, evaluate it on ``test_path`` and print per-task summaries.

    ``train_path`` is optional; when given, a train Dataset is also passed to
    ``model.evaluate``. Settings are resolved from the model itself when
    available, otherwise from ``settings`` (a settings file path) or defaults.
    """
    model = BaseModel.load(model_path).to(device)

    if model_info:
        print(model)

    # Resolve settings: the model's own stored settings win; otherwise fall
    # back to an explicit settings file or to the package defaults.
    if hasattr(model, '_settings'):  # new models should all have _settings
        settings = model._settings
    else:
        with utils.shutup():
            settings = settings_from_file(settings) if settings else load_default_settings()

    # overwrite defaults with the values passed in by the caller
    settings.batch_size = batch_size
    settings.buffer_size = buffer_size
    settings.device = device

    trainset = None
    if train_path:
        trainset = Dataset(settings, Reader(settings, train_path), model.label_encoder)
    testset = Dataset(settings, Reader(settings, *test_path), model.label_encoder)

    # one scorer per task; print each task's summary
    for scorer in model.evaluate(testset, trainset).values():
        scorer.print_summary(full=full, confusion_matrix=confusion)
def _test_conversion(settings, level='token'):
    """Round-trip check for the lemma task: decoding the encoded lemma targets
    and applying the preprocessor's inverse transform must reproduce the raw
    gold lemmas."""
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    data = Dataset(settings, reader, label_encoder)
    lemma_le = label_encoder.tasks['lemma']

    for (inp, tasks), (raw_inp, raw_tasks) in data.batch_generator(return_raw=True):
        # decode the encoded lemma targets back into strings
        lemma_t, lemma_len = tasks['lemma']
        decoded = []
        for row, length in zip(lemma_t.t().tolist(), lemma_len.tolist()):
            decoded.append(lemma_le.stringify(row, length))
        if level == 'token':
            decoded = [tok for sent in decoded for tok in sent]

        # flatten raw input tokens and raw gold lemmas over the batch
        raw_tokens = [tok for sent in raw_inp for tok in sent]
        gold = [lem for sent in raw_tasks for lem in sent['lemma']]

        # each decoded prediction, inverse-transformed w.r.t. its input token,
        # must equal the gold lemma
        for dec, tok, true in zip(decoded, raw_tokens, gold):
            recovered = lemma_le.preprocessor_fn.inverse_transform(dec, tok)
            assert recovered == true, (dec, tok, true, recovered)
class TestWordCharEncoding(unittest.TestCase):
    """Check that word-level and char-level encodings of a batch agree."""

    def setUp(self):
        settings = settings_from_file(testpath)
        reader = Reader(settings, settings.input_path)
        label_encoder = MultiLabelEncoder.from_settings(settings)
        label_encoder.fit_reader(reader)
        self.data = Dataset(settings, reader, label_encoder)

    def test_lengths(self):
        """Each char sequence must start with <bos> and end with <eos>."""
        ((word, wlen), (char, clen)), _ = next(self.data.batch_generator())
        for c, cl in zip(char.t(), clen):
            self.assertEqual(c[0].item(), self.data.label_encoder.char.get_bos())
            self.assertEqual(c[cl - 1].item(), self.data.label_encoder.char.get_eos())

    def test_word_char(self):
        """Decoding the chars of each word must reproduce the word string."""
        for ((word, wlen), (char, clen)), _ in self.data.batch_generator():
            idx = 0
            total_words = 0
            for sent, nwords in zip(word.t(), wlen):
                # FIX: the inner loop variable used to be named `word`,
                # shadowing the batch word tensor; renamed to `word_id`
                for word_id in sent[:nwords]:
                    # get word string from the word-level vocabulary
                    word_str = self.data.label_encoder.word.inverse_table[word_id]
                    # get chars, removing <bos>/<eos> at both ends
                    chars = char.t()[idx][1:clen[idx] - 1].tolist()
                    chars = ''.join(
                        self.data.label_encoder.char.inverse_transform(chars))
                    self.assertEqual(word_str, chars)
                    idx += 1
                total_words += nwords
            self.assertEqual(idx, total_words, "Checked all words")
def setUp(self):
    """Build a batch-size-1 dataset so instance and batch counts coincide."""
    cfg = settings_from_file(testpath)
    cfg['batch_size'] = 1
    corpus = Reader(cfg, cfg.input_path)
    encoder = MultiLabelEncoder.from_settings(cfg)
    ninsts = encoder.fit(line for _, line in corpus.readsents())
    self.insts = ninsts
    # with batch_size == 1 this equals the instance count
    self.num_batches = ninsts // cfg.batch_size
    self.data = Dataset(cfg, corpus, encoder)
def test_batch_level(self):
    """Splitting off a 5% devset should remove ~5% of the batches
    (checked here with batch_size=20)."""
    cfg = settings_from_file(testpath)
    cfg['batch_size'] = 20
    corpus = Reader(cfg, cfg.input_path)
    encoder = MultiLabelEncoder.from_settings(cfg)
    encoder.fit(line for _, line in corpus.readsents())
    data = Dataset(cfg, corpus, encoder)

    # count batches before splitting off a devset
    pre_batches = sum(1 for _ in data.batch_generator())
    self.assertAlmostEqual(pre_batches, self.insts // 20, delta=delta)

    devset = data.get_dev_split(self.insts, split=0.05)

    # count the batches that remain after the split
    post_batches = sum(1 for _ in data.batch_generator())
    self.assertAlmostEqual(pre_batches * 0.95, post_batches, delta=delta)
def setUp(self):
    """Fit a label encoder on the test corpus and build the Dataset under test."""
    cfg = settings_from_file(testpath)
    corpus = Reader(cfg, cfg.input_path)
    encoder = MultiLabelEncoder.from_settings(cfg)
    encoder.fit_reader(corpus)
    self.data = Dataset(cfg, corpus, encoder)
import os
import tempfile
import unittest
import uuid

import torch

from pie.models import SimpleModel
from pie.data import MultiLabelEncoder, Reader, Dataset
from pie.settings import settings_from_file

# shared fixtures: settings, fitted label encoder and dataset for the tests
testpath = os.path.join(os.path.dirname(__file__), 'testconfig.json')
settings = settings_from_file(testpath)
label_encoder = MultiLabelEncoder.from_settings(settings)
reader = Reader(settings, settings.input_path)
label_encoder.fit_reader(reader)
# FIX: Dataset takes (settings, reader, label_encoder) everywhere else in the
# codebase; the last two arguments were swapped here
dataset = Dataset(settings, reader, label_encoder)


class TestModelSerialization(unittest.TestCase):
    """Saving and re-loading a model must round-trip its label encoder."""

    def setUp(self):
        emb_dim, hidden_size, num_layers = 64, 100, 1
        self.model = SimpleModel(label_encoder, emb_dim, emb_dim,
                                 hidden_size, num_layers)

    def test_serialization(self):
        model = self.model
        # use the platform temp dir instead of a hard-coded /tmp path
        fid = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
        model.save(fid)
        model2 = SimpleModel.load(fid)
        os.remove('{}.tar'.format(fid))  # model.save appends a .tar suffix
        self.assertEqual(model.label_encoder, model2.label_encoder)
# NOTE(review): this fragment is the interior of a larger function/script —
# `label_encoder`, `settings`, `reader` and `ninsts` are defined before this
# chunk. It prints vocabulary/coverage statistics and builds the datasets.
print()
# word-level vocabulary stats: "kept/total=ratio" for types and tokens
types = '{}/{}={:.2f}'.format(*label_encoder.word.get_type_stats())
tokens = '{}/{}={:.2f}'.format(*label_encoder.word.get_token_stats())
print("- {:<15} types={:<10} tokens={:<10}".format("word", types, tokens))
# char-level vocabulary stats
types = '{}/{}={:.2f}'.format(*label_encoder.char.get_type_stats())
tokens = '{}/{}={:.2f}'.format(*label_encoder.char.get_token_stats())
print("- {:<15} types={:<10} tokens={:<10}".format("char", types, tokens))
print()
print("::: Target tasks :::")
print()
# one label encoder per task: report target flag, level and vocab size
for task, le in label_encoder.tasks.items():
    print("- {:<15} target={:<6} level={:<6} vocab={:<6}"
          .format(task, le.target, le.level, len(le)))
print()

trainset = Dataset(settings, reader, label_encoder)

# devset: from an explicit dev file if given, else carved out of the trainset
devset = None
if settings.dev_path:
    devset = Dataset(settings, Reader(settings, settings.dev_path), label_encoder)
    devset = devset.get_batches()
elif settings.dev_split > 0:
    devset = trainset.get_dev_split(ninsts, split=settings.dev_split)
    # account for the instances moved into the devset
    ninsts = ninsts - (len(devset) * settings.batch_size)
else:
    logging.warning("No devset: cannot monitor/optimize training")

testset = None
if settings.test_path:
    testset = Dataset(settings, Reader(settings, settings.test_path), label_encoder)
    # NOTE(review): tail of an enclosing definition not visible in this chunk
    return wembs + cembs


def EmbeddingConcat():
    """Return a merger function that concatenates word and char embeddings
    along the last (feature) dimension."""
    def func(wemb, cemb):
        return torch.cat([wemb, cemb], dim=-1)
    return func


if __name__ == '__main__':
    # ad-hoc smoke test for the embedding modules defined in this file
    from pie.settings import settings_from_file
    from pie.data import Dataset
    settings = settings_from_file('./config.json')
    data = Dataset(settings)
    ((word, wlen), (char, clen)), tasks = next(data.batch_generator())
    print("lemma", tasks['lemma'][0].size(), tasks['lemma'][1])
    print("char", char.size(), clen)
    print("word", word.size(), wlen)
    emb_dim = 20
    wemb = nn.Embedding(len(data.label_encoder.word), emb_dim)
    cemb = RNNEmbedding(len(data.label_encoder.char), emb_dim)
    cnncemb = CNNEmbedding(len(data.label_encoder.char), emb_dim)
    mixer = EmbeddingMixer(20)
    w, (c, _) = wemb(word), cemb(char, clen, wlen)
    output = mixer(w, c)
    output2 = []
class TestDevSplit(unittest.TestCase):
    """Behaviour of Dataset.get_dev_split at instance and batch level."""

    def setUp(self):
        cfg = settings_from_file(testpath)
        cfg['batch_size'] = 1
        corpus = Reader(cfg, cfg.input_path)
        encoder = MultiLabelEncoder.from_settings(cfg)
        ninsts = encoder.fit(line for _, line in corpus.readsents())
        self.insts = ninsts
        # with batch_size == 1 this equals the instance count
        self.num_batches = ninsts // cfg.batch_size
        self.data = Dataset(cfg, corpus, encoder)

    def test_split_length(self):
        """The devset should hold ~5% of all batches."""
        total_batches = sum(1 for _ in self.data.batch_generator())
        dev_batches = sum(
            1 for _ in self.data.get_dev_split(self.insts, split=0.05))
        self.assertAlmostEqual(dev_batches, total_batches * 0.05, delta=delta)

    def test_remaining(self):
        """Devset batches plus remaining batches should add up to the
        original batch count."""
        pre_batches = sum(1 for _ in self.data.batch_generator())
        self.assertEqual(pre_batches, self.insts)  # batch size is 1
        self.assertEqual(pre_batches, self.num_batches)

        devset = self.data.get_dev_split(self.insts, split=0.05)

        post_batches = sum(1 for _ in self.data.batch_generator())
        # FIXME
        self.assertAlmostEqual(len(devset) + post_batches, pre_batches,
                               delta=delta * 5)
        self.assertAlmostEqual(pre_batches * 0.95, post_batches,
                               delta=delta * 5)

    def test_batch_level(self):
        """Same 5% check at batch_size=20."""
        cfg = settings_from_file(testpath)
        cfg['batch_size'] = 20
        corpus = Reader(cfg, cfg.input_path)
        encoder = MultiLabelEncoder.from_settings(cfg)
        encoder.fit(line for _, line in corpus.readsents())
        data = Dataset(cfg, corpus, encoder)

        pre_batches = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(pre_batches, self.insts // 20, delta=delta)

        data.get_dev_split(self.insts, split=0.05)

        post_batches = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(pre_batches * 0.95, post_batches, delta=delta)
def setUp(self):
    """Fit a label encoder from the reader's sentences and build the Dataset."""
    cfg = settings_from_file(testpath)
    corpus = Reader(cfg, cfg.input_path)
    encoder = MultiLabelEncoder.from_settings(cfg)
    encoder.fit(line for _, line in corpus.readsents())
    self.data = Dataset(cfg, corpus, encoder)
            # NOTE(review): tail of an enclosing predict method not visible
            # in this chunk
            raise ValueError()
        preds[task] = hyps
    return preds


if __name__ == '__main__':
    # ad-hoc smoke test: build a dataset, run a loss step and exercise the
    # model's forward/predict paths on one batch
    from pie.settings import settings_from_file
    from pie.data import Dataset, Reader, MultiLabelEncoder
    settings = settings_from_file('./config.json')
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    data = Dataset(settings, reader, label_encoder)
    model = SimpleModel(data.label_encoder, settings.tasks,
                        settings.wemb_dim, settings.cemb_dim,
                        settings.hidden_size, settings.num_layers)
    model.to(settings.device)
    for batch in data.batch_generator():
        model.loss(batch)
        break
    ((word, wlen), (char, clen)), tasks = next(data.batch_generator())
    # word and char embeddings; cemb also returns per-char outputs
    wemb, (cemb, cemb_outs) = model.wemb(word), model.cemb(char, clen, wlen)
    emb = model.merger(wemb, cemb)
    enc_outs = model.encoder(emb, wlen)
    model.pos_decoder.predict(enc_outs, wlen)
    lemma_hyps, _ = model.decoders['lemma'].predict_max(
        cemb_outs,
def run(config_path):
    """Train a SimpleModel from a settings file: seed RNGs, build datasets,
    train, optionally evaluate on a test set, save the model and append dev
    scores to a results CSV."""
    # time-derived seed, applied to all RNGs for reproducibility of a run
    now = datetime.now()
    seed = now.hour * 10000 + now.minute * 100 + now.second
    print("Using seed:", seed)
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    settings = settings_from_file(config_path)

    # check settings
    # - check at least and at most one target
    has_target = False
    for task in settings.tasks:
        if len(settings.tasks) == 1:
            # a single task is implicitly the target
            task['target'] = True
        if task.get('target', False):
            if has_target:
                raise ValueError("Got more than one target task")
            has_target = True
    if not has_target:
        raise ValueError("Needs at least one target task")

    # datasets
    reader = Reader(settings, settings.input_path)
    tasks = reader.check_tasks(expected=None)
    if settings.verbose:
        print("::: Available tasks :::")
        print()
        for task in tasks:
            print("- {}".format(task))
        print()

    # label encoder
    label_encoder = MultiLabelEncoder.from_settings(settings, tasks=tasks)
    if settings.verbose:
        print("::: Fitting data :::")
        print()
    label_encoder.fit_reader(reader)

    if settings.verbose:
        print()
        print("::: Vocabulary :::")
        print()
        # "kept/total=ratio" coverage stats for word and char vocabularies
        types = '{}/{}={:.2f}'.format(*label_encoder.word.get_type_stats())
        tokens = '{}/{}={:.2f}'.format(*label_encoder.word.get_token_stats())
        print("- {:<15} types={:<10} tokens={:<10}".format(
            "word", types, tokens))
        types = '{}/{}={:.2f}'.format(*label_encoder.char.get_type_stats())
        tokens = '{}/{}={:.2f}'.format(*label_encoder.char.get_token_stats())
        print("- {:<15} types={:<10} tokens={:<10}".format(
            "char", types, tokens))
        print()
        print("::: Tasks :::")
        print()
        for task, le in label_encoder.tasks.items():
            print("- {:<15} target={:<6} level={:<6} vocab={:<6}".format(
                task, le.target, le.level, len(le)))
        print()

    trainset = Dataset(settings, reader, label_encoder)

    devset = None
    if settings.dev_path:
        devset = Dataset(settings, Reader(settings, settings.dev_path),
                         label_encoder)
    else:
        logging.warning("No devset: cannot monitor/optimize training")

    # model
    model = SimpleModel(label_encoder, settings.tasks,
                        settings.wemb_dim, settings.cemb_dim,
                        settings.hidden_size, settings.num_layers,
                        dropout=settings.dropout,
                        cell=settings.cell,
                        cemb_type=settings.cemb_type,
                        cemb_layers=settings.cemb_layers,
                        custom_cemb_cell=settings.custom_cemb_cell,
                        linear_layers=settings.linear_layers,
                        scorer=settings.scorer,
                        word_dropout=settings.word_dropout,
                        lm_shared_softmax=settings.lm_shared_softmax,
                        include_lm=settings.include_lm)

    # pretrain(/load pretrained) embeddings
    if model.wemb is not None:
        if settings.pretrain_embeddings:
            print("Pretraining word embeddings")
            wemb_reader = Reader(settings, settings.input_path,
                                 settings.dev_path, settings.test_path)
            weight = get_pretrained_embeddings(
                wemb_reader, label_encoder,
                size=settings.wemb_dim, window=5, negative=5, min_count=1)
            model.wemb.weight.data = torch.tensor(weight, dtype=torch.float32)
        elif settings.load_pretrained_embeddings:
            print("Loading pretrained embeddings")
            if not os.path.isfile(settings.load_pretrained_embeddings):
                # FIX: "eembeddings" typo in the user-facing message
                # NOTE(review): execution still falls through to the init call
                # below even when the file is missing — confirm this is the
                # intended best-effort behaviour
                print("Couldn't find pretrained embeddings in: {}".format(
                    settings.load_pretrained_embeddings))
            initialization.init_pretrained_embeddings(
                settings.load_pretrained_embeddings, label_encoder.word,
                model.wemb)

    # load pretrained weights
    if settings.load_pretrained_encoder:
        model.init_from_encoder(
            pie.Encoder.load(settings.load_pretrained_encoder))

    # freeze embeddings
    if settings.freeze_embeddings:
        model.wemb.weight.requires_grad = False

    model.to(settings.device)

    print("::: Model :::")
    print()
    print(model)
    print()
    print("::: Model parameters :::")
    print()
    trainable = sum(p.nelement() for p in model.parameters()
                    if p.requires_grad)
    total = sum(p.nelement() for p in model.parameters())
    print("{}/{} trainable/total".format(trainable, total))
    print()

    # training
    print("Starting training")
    running_time = time.time()
    trainer = Trainer(settings, model, trainset, reader.get_nsents())
    scores = None
    try:
        scores = trainer.train_epochs(settings.epochs, devset=devset)
    except KeyboardInterrupt:
        print("Stopping training")
    finally:
        # make sure the model leaves training mode even on interrupt
        model.eval()
    running_time = time.time() - running_time

    if settings.test_path:
        print("Evaluating model on test set")
        testset = Dataset(settings, Reader(settings, settings.test_path),
                          label_encoder)
        for task in model.evaluate(testset, trainset).values():
            task.print_summary()

    # save model
    fpath, infix = get_fname_infix(settings)
    if not settings.run_test:
        fpath = model.save(fpath, infix=infix, settings=settings)
        print("Saved best model to: [{}]".format(fpath))

    # append dev scores to a per-target results CSV
    if devset is not None and not settings.run_test:
        scorers = model.evaluate(devset, trainset)
        scores = []
        for task in sorted(scorers):
            scorer = scorers[task]
            result = scorer.get_scores()
            for acc in result:
                scores.append('{}:{:.6f}'.format(task, result[acc]['accuracy']))
                scores.append('{}-support:{}'.format(task, result[acc]['support']))
        path = '{}.results.{}.csv'.format(
            settings.modelname, '-'.join(get_targets(settings)))
        with open(path, 'a') as f:
            line = [infix, str(seed), str(running_time)]
            line += scores
            f.write('{}\n'.format('\t'.join(line)))

    print("Bye!")
        # NOTE(review): tail of an enclosing predict method not visible in
        # this chunk — only decoders for requested tasks are run
        if task in tasks:
            hyps, _ = decoder.predict(enc_outs, wlen)
            preds[task] = hyps
    return preds


if __name__ == '__main__':
    # ad-hoc smoke test: build a dataset, run a loss step and exercise the
    # model's forward/predict paths on one batch
    from pie.settings import settings_from_file
    from pie.data import Dataset, Reader, MultiLabelEncoder
    settings = settings_from_file('./config.json')
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    data = Dataset(settings, reader, label_encoder)
    model = SimpleModel(data.label_encoder,
                        settings.wemb_dim, settings.cemb_dim,
                        settings.hidden_size, settings.num_layers)
    model.to(settings.device)
    for batch in data.batch_generator():
        model.loss(batch)
        break
    ((word, wlen), (char, clen)), tasks = next(data.batch_generator())
    # word and char embeddings; cemb also returns per-char outputs
    wemb, (cemb, cemb_outs) = model.wemb(word), model.cemb(char, clen, wlen)
    emb = model.merger(wemb, cemb)
    enc_outs = model.encoder(emb, wlen)
    model.pos_decoder.predict(enc_outs, wlen)
    # NOTE(review): the call below is cut off at the end of this chunk
    lemma_hyps, _ = model.lemma_decoder.predict_max(
# NOTE(review): `parser` and the positional arguments (model_path, test_path,
# settings) are defined before this chunk
parser.add_argument('--buffer_size', type=int, default=100000)
parser.add_argument('--device', default='cpu')
parser.add_argument('--model_info', action='store_true')
parser.add_argument('--full', action='store_true')
args = parser.parse_args()

model = BaseModel.load(args.model_path).to(args.device)
if args.model_info:
    print(model)

# Resolve settings: the model's own stored settings win; otherwise fall back
# to an explicit settings file or to the package defaults.
if hasattr(model, '_settings'):  # new models should all have _settings
    settings = model._settings
elif args.settings:
    with utils.shutup():
        settings = settings_from_file(args.settings)
else:
    with utils.shutup():
        settings = load_default_settings()

# overwrite defaults with CLI values
settings.batch_size = args.batch_size
settings.buffer_size = args.buffer_size
settings.device = args.device

reader = Reader(settings, *args.test_path)
dataset = Dataset(settings, reader, model.label_encoder)
# materialize batches and move them to the requested device
dataset = device_wrapper(list(dataset.batch_generator()), args.device)

for task in model.evaluate(dataset).values():
    task.print_summary(full=args.full)