def test_regularization(self):
    penalty = self.model.get_regularization_penalty()
    assert penalty == 0

    iterator = BasicIterator(batch_size=32)
    trainer = Trainer(self.model,
                      None,  # optimizer,
                      iterator,
                      self.instances)

    # You get a RuntimeError if you call `model.forward` twice on the same inputs.
    # The data and config are such that the whole dataset is one batch.
    training_batch = next(iterator(self.instances, num_epochs=1))
    validation_batch = next(iterator(self.instances, num_epochs=1))

    training_loss = trainer._batch_loss(training_batch, for_training=True).data
    validation_loss = trainer._batch_loss(validation_batch, for_training=False).data

    # Training loss should include the regularization penalty while validation loss should not;
    # since the penalty is zero for this model, the two losses should be equal.
    assert (training_loss == validation_loss).all()
def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
    # To test:
    # Create a fake data loader that sleeps for 2.5 seconds per epoch, so the total
    # training time for one epoch is slightly greater than 2.5 seconds.
    # Run for 6 epochs, keeping the last 2 models and also keeping a model every 5 seconds.
    # Check the resulting checkpoints. We should then have models at epochs
    # 2, 4, plus the last two at 5 and 6.
    class SlowDataLoader:
        data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)

        def __iter__(self):
            time.sleep(2.5)
            return iter(self.data_loader)

        def __len__(self):
            return len(self.data_loader)

    trainer = Trainer(
        self.model,
        self.optimizer,
        SlowDataLoader(),
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=2,
        keep_serialized_model_every_num_seconds=5,
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        # Epoch N ends up with N - 1 in the file name.
        assert sorted(epochs) == [1, 3, 4, 5]
def test_trainer_saves_and_loads_best_validation_metrics_correctly_1(self):
    # Use "-loss" as the validation metric: run one epoch of original training
    # and one epoch of restored training.

    # Run 1 epoch of original training.
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        validation_metric="-loss",
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()
    _ = trainer._restore_checkpoint()
    best_epoch_1 = trainer._metric_tracker.best_epoch
    best_validation_metrics_epoch_1 = trainer._metric_tracker.best_epoch_metrics
    # best_validation_metrics_epoch_1: {'accuracy': 0.75, 'accuracy3': 1.0, 'loss': 0.6243013441562653}
    assert isinstance(best_validation_metrics_epoch_1, dict)
    assert "loss" in best_validation_metrics_epoch_1

    # Run 1 more epoch of restored training.
    restore_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        validation_metric="-loss",
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    restore_trainer.train()
    _ = restore_trainer._restore_checkpoint()
    best_epoch_2 = restore_trainer._metric_tracker.best_epoch
    best_validation_metrics_epoch_2 = restore_trainer._metric_tracker.best_epoch_metrics

    # Because we use "-loss", the 2nd epoch should be better than the 1st,
    # so the best validation metrics should not be the same.
    assert best_epoch_1 == 0 and best_epoch_2 == 1
    assert best_validation_metrics_epoch_2 != best_validation_metrics_epoch_1
def test_trainer_can_run_and_resume_with_momentum_scheduler(self):
    scheduler = MomentumScheduler.from_params(
        optimizer=self.optimizer,
        params=Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
    )
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        momentum_scheduler=scheduler,
        validation_metric="-loss",
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_scheduler = MomentumScheduler.from_params(
        optimizer=self.optimizer,
        params=Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
    )
    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        momentum_scheduler=new_scheduler,
        validation_metric="-loss",
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 4
    assert new_trainer._momentum_scheduler.last_epoch == 3
    new_trainer.train()
def test_trainer_can_run_gradient_accumulation(self):
    instances = list(self.instances)
    steps_to_accumulate = 2

    trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=2,
        num_gradient_accumulation_steps=steps_to_accumulate,
    )
    assert trainer._num_gradient_accumulation_steps == steps_to_accumulate

    metrics = trainer.train()

    num_batches_trained_per_epoch = trainer._batch_num_total // (metrics["training_epochs"] + 1)
    num_batches_expected = math.ceil(
        math.ceil(len(instances) / self.data_loader.batch_size) / steps_to_accumulate
    )

    assert num_batches_trained_per_epoch == num_batches_expected
def train(model_name, embed_name, attn_name, data_name):
    name = model_name + '_' + embed_name + '_' + attn_name
    logger = get_train_logger(log_path, name, data_name)
    checkpoints = get_train_checkpoints(checkpoints_path, name, data_name)

    dataset_reader = load_dataset_reader(data_name, embed_name, cuda_device)
    train_set, val_set = dataset_reader.load()
    vocab = Vocabulary.from_instances(train_set + val_set)

    iterator = BucketIterator(batch_size=batch_size, sorting_keys=[('text', 'num_tokens')])
    iterator.index_with(vocab=vocab)

    encoder = load_encoder(embed_name, vocab)
    attn = load_attn(attn_name)
    clf = load_model(model_name)(vocab,
                                 encoder=encoder,
                                 attention=attn,
                                 g=dataset_reader.g,
                                 out_dim=dataset_reader.num_labels)
    if cuda_device > -1:
        clf = clf.cuda(cuda_device)

    optimizer = torch.optim.Adam(clf.parameters(), lr=learning_rate)
    trainer = Trainer(
        model=clf,
        optimizer=optimizer,
        iterator=iterator,
        validation_metric='+f-score',
        train_dataset=train_set,
        validation_dataset=val_set,
        patience=10,
        grad_clipping=10,
        num_epochs=epoch,
        cuda_device=cuda_device,
        num_serialized_models_to_keep=1,
        serialization_dir=checkpoints,
    )
    trainer.train()
def test_regularization(self):
    iterator = BasicIterator(batch_size=32)
    trainer = Trainer(self.model,
                      None,  # optimizer,
                      iterator,
                      self.instances)

    # You get a RuntimeError if you call `model.forward` twice on the same inputs.
    # The data and config are such that the whole dataset is one batch.
    training_batch = next(iterator(self.instances, num_epochs=1))
    validation_batch = next(iterator(self.instances, num_epochs=1))

    training_loss = trainer.batch_loss(training_batch, for_training=True).item() / 10
    validation_loss = trainer.batch_loss(validation_batch, for_training=False).item() / 10

    # Training loss should have the regularization penalty, but validation loss should not.
    numpy.testing.assert_almost_equal(training_loss, validation_loss, decimal=0)
def test_trainer_saves_models_at_specified_interval(self):
    iterator = BasicIterator(batch_size=4)
    iterator.index_with(self.vocab)

    trainer = Trainer(
        self.model,
        self.optimizer,
        iterator,
        self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        model_save_interval=0.0001,
    )
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    prefix = "model_state_epoch_*"
    file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in file_names]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == "1"
    assert "." in epochs[0]

    # Now make certain we can restore from a timestamped checkpoint.
    # To do so, remove the end-of-epoch checkpoints for both epochs (file indices 0 and 1),
    # so that we are forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, "model_state_epoch_{}.th".format(k)))
        os.remove(os.path.join(self.TEST_DIR, "training_state_epoch_{}.th".format(k)))
    os.remove(os.path.join(self.TEST_DIR, "best.th"))

    restore_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        model_save_interval=0.0001,
    )
    epoch = restore_trainer._restore_checkpoint()
    assert epoch == 2
    # One batch per epoch.
    assert restore_trainer._batch_num_total == 2
def test_trainer_can_resume_training(self):
    trainer = Trainer(self.model, self.optimizer,
                      self.iterator, self.instances,
                      validation_dataset=self.instances,
                      num_epochs=1, serialization_dir=self.TEST_DIR)
    trainer.train()

    new_trainer = Trainer(self.model, self.optimizer,
                          self.iterator, self.instances,
                          validation_dataset=self.instances,
                          num_epochs=3, serialization_dir=self.TEST_DIR)

    epoch = new_trainer._restore_checkpoint()  # pylint: disable=protected-access
    assert epoch == 1

    tracker = trainer._metric_tracker  # pylint: disable=protected-access
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None  # pylint: disable=protected-access

    new_trainer.train()
def test_production_rule_field_with_multiple_gpus(self):
    wikitables_dir = "allennlp/tests/fixtures/data/wikitables/"
    search_output_directory = wikitables_dir + "action_space_walker_output/"
    wikitables_reader = WikiTablesDatasetReader(
        tables_directory=wikitables_dir,
        offline_logical_forms_directory=search_output_directory,
    )
    instances = wikitables_reader.read(wikitables_dir + "sample_data.examples")
    archive_path = (
        self.FIXTURES_ROOT
        / "semantic_parsing"
        / "wikitables"
        / "serialization"
        / "model.tar.gz"
    )
    model = load_archive(archive_path).model
    model.cuda()

    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(model.vocab)
    trainer = Trainer(
        model, self.optimizer, multigpu_iterator, instances, num_epochs=2, cuda_device=[0, 1]
    )
    trainer.train()
def test_should_stop_early_with_invalid_patience(self):
    for patience in [0, -1, -2, 1.5, 'None']:
        with pytest.raises(ConfigurationError,
                           match='.* is an invalid value for "patience": '
                                 'it must be a positive integer or None '
                                 '\\(if you want to disable early stopping\\)'):
            Trainer(self.model, self.optimizer, self.iterator, self.instances,
                    validation_dataset=self.instances,
                    num_epochs=100,
                    patience=patience,
                    validation_metric="+test")
def test_should_stop_early_with_invalid_patience(self):
    for patience in [0, -1, -2, 1.5, 'None']:
        with pytest.raises(ConfigurationError,
                           message='No ConfigurationError for patience={}'.format(patience)):
            Trainer(self.model, self.optimizer, self.iterator, self.instances,
                    validation_dataset=self.instances,
                    num_epochs=100,
                    patience=patience,
                    validation_metric="+test")
def test_passing_trainer_multiple_gpus_raises_error(self):
    self.model.cuda()

    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(self.vocab)
    with pytest.raises(ConfigurationError):
        Trainer(
            self.model,
            self.optimizer,
            multigpu_iterator,
            self.instances,
            num_epochs=2,
            cuda_device=[0, 1],
        )
def test_trainer_can_run_multiple_gpu(self):
    self.model.cuda()

    class MetaDataCheckWrapper(Model):
        """
        Checks that the metadata field has been correctly split across the batch dimension
        when running on multiple gpus.
        """
        def __init__(self, model):
            super().__init__(model.vocab)
            self.model = model

        def forward(self, **kwargs) -> Dict[str, torch.Tensor]:  # type: ignore # pylint: disable=arguments-differ
            assert 'metadata' in kwargs and 'tags' in kwargs, \
                f'tokens and metadata must be provided. Got {kwargs.keys()} instead.'
            batch_size = kwargs['tokens']['tokens'].size()[0]
            assert len(kwargs['metadata']) == batch_size, \
                f'metadata must be split appropriately. Expected {batch_size} elements, ' \
                f"got {len(kwargs['metadata'])} elements."
            return self.model.forward(**kwargs)

    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(self.vocab)
    trainer = Trainer(MetaDataCheckWrapper(self.model),
                      self.optimizer,
                      multigpu_iterator,
                      self.instances,
                      num_epochs=2,
                      cuda_device=[0, 1])
    metrics = trainer.train()
    assert 'peak_cpu_memory_MB' in metrics
    assert isinstance(metrics['peak_cpu_memory_MB'], float)
    assert metrics['peak_cpu_memory_MB'] > 0
    assert 'peak_gpu_0_memory_MB' in metrics
    assert isinstance(metrics['peak_gpu_0_memory_MB'], int)
    assert 'peak_gpu_1_memory_MB' in metrics
    assert isinstance(metrics['peak_gpu_1_memory_MB'], int)
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset + test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))

    # Copy the config file so the experiment can be reproduced.
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"), "w", encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # Evaluate on the test set.
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # imported here to keep the experiment reproducible
        model.load_state_dict(torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model, test_dataset, iterator,
                                cuda_device=trainer_params.pop("cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"), "w", encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
def _setup(self):
    """Set up the trainer components and local resources."""
    prepare_environment(
        Params({} if self._trainer_config.random_seed is None else {
            "random_seed": self._trainer_config.random_seed,
            "numpy_seed": self._trainer_config.random_seed,
            "pytorch_seed": self._trainer_config.random_seed,
        })
    )

    os.makedirs(self._output_dir, exist_ok=True)

    serialization_params = sanitize(self._allennlp_configuration())
    with open(os.path.join(self._output_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    self._pipeline.vocab.save_to_files(os.path.join(self._output_dir, "vocabulary"))

    for dataset in [self._training, self._validation, self._test]:
        if dataset is not None:
            dataset.index_with(self._pipeline.backbone.vocab)

    trainer_params = Params(
        helpers.sanitize_for_params(self._trainer_config.to_allennlp_trainer())
    )

    pipeline_model = self._pipeline._model

    training_data_loader = create_dataloader(
        self._training,
        self._trainer_config.batch_size,
        self._trainer_config.data_bucketing,
        self._trainer_config.batches_per_epoch,
    )
    validation_data_loader = (
        create_dataloader(
            self._validation,
            self._trainer_config.batch_size,
            self._trainer_config.data_bucketing,
        )
        if self._validation
        else None
    )

    self._trainer = Trainer.from_params(
        model=pipeline_model,
        serialization_dir=self._output_dir,
        data_loader=training_data_loader,
        validation_data_loader=validation_data_loader,
        params=trainer_params,
        epoch_callbacks=self._epoch_callbacks,
    )
def setUp(self):
    super().setUp()
    param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
    self.set_up_model(param_file, self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
    params = Params.from_file(param_file)
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    self.data_loader = DataLoader.from_params(dataset=self.instances, params=params["data_loader"])
    self.trainer = Trainer.from_params(
        model=self.model,
        data_loader=self.data_loader,
        serialization_dir=self.TEST_DIR,
        params=params.get("trainer"),
    )
def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
    # To test:
    # Create an iterator that sleeps for 2.5 seconds per epoch, so the total training
    # time for one epoch is slightly greater than 2.5 seconds.
    # Run for 6 epochs, keeping the last 2 models and also keeping a model every 5 seconds.
    # Check the resulting checkpoints. We should then have models at epochs
    # 2, 4, plus the last two at 5 and 6.
    class WaitingIterator(BasicIterator):
        def _create_batches(self, *args, **kwargs):
            time.sleep(2.5)
            return super()._create_batches(*args, **kwargs)

    iterator = WaitingIterator(batch_size=2)
    iterator.index_with(self.vocab)

    trainer = Trainer(
        self.model,
        self.optimizer,
        iterator,
        self.instances,
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=2,
        keep_serialized_model_every_num_seconds=5,
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        # Epoch N ends up with N - 1 in the file name.
        assert sorted(epochs) == [1, 3, 4, 5]
def setUp(self):
    super().setUp()
    param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
    self.set_up_model(param_file, self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    params = Params.from_file(param_file)
    self.reader = DatasetReader.from_params(params['dataset_reader'])
    self.iterator = DataIterator.from_params(params['iterator'])
    self.trainer = Trainer.from_params(
        self.model,
        self.TEST_DIR,
        self.iterator,
        self.dataset,
        None,
        params.get('trainer'),
    )
def test_should_stop_early_with_increasing_metric(self):
    new_trainer = Trainer(self.model, self.optimizer,
                          self.iterator, self.instances,
                          validation_dataset=self.instances,
                          num_epochs=3, serialization_dir=self.TEST_DIR,
                          patience=5, validation_metric="+test")

    tracker = new_trainer._metric_tracker  # pylint: disable=protected-access

    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([.5, .3, .2, .1, .4, .4])
    assert new_tracker.should_stop_early()

    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([.3, .3, .3, .2, .5, .1])
    assert not new_tracker.should_stop_early()
def setUp(self):
    super(SimpleTaggerRegularizationTest, self).setUp()
    param_file = self.FIXTURES_ROOT / u'simple_tagger' / u'experiment_with_regularization.json'
    self.set_up_model(param_file, self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
    params = Params.from_file(param_file)
    self.reader = DatasetReader.from_params(params[u'dataset_reader'])
    self.iterator = DataIterator.from_params(params[u'iterator'])
    self.trainer = Trainer.from_params(
        self.model,
        self.TEST_DIR,
        self.iterator,
        self.dataset,
        None,
        params.get(u'trainer'),
    )
def test_should_stop_early_with_invalid_patience(self):
    for patience in [0, -1, -2, 1.5, "None"]:
        with pytest.raises(
            ConfigurationError,
            match='.* is an invalid value for "patience": '
            "it must be a positive integer or None "
            "\\(if you want to disable early stopping\\)",
        ):
            Trainer(
                self.model,
                self.optimizer,
                self.data_loader,
                validation_data_loader=self.validation_data_loader,
                num_epochs=100,
                patience=patience,
                validation_metric="+test",
            )
def test_trainer_can_resume_with_lr_scheduler(self):
    lr_scheduler = LearningRateScheduler.from_params(
        self.optimizer, Params({"type": "exponential", "gamma": 0.5})
    )
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=lr_scheduler,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_lr_scheduler = LearningRateScheduler.from_params(
        self.optimizer, Params({"type": "exponential", "gamma": 0.5})
    )
    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=new_lr_scheduler,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 2
    assert new_trainer._learning_rate_scheduler.lr_scheduler.last_epoch == 1
    new_trainer.train()
def test_trainer_can_resume_training_for_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters())

    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
        moving_average=moving_average,
    )
    trainer.train()

    new_moving_average = ExponentialMovingAverage(self.model.named_parameters())
    new_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        moving_average=new_moving_average,
    )

    epoch = new_trainer._restore_checkpoint()
    assert epoch == 1

    tracker = trainer._metric_tracker
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None

    new_trainer.train()
def get_trainer_from_config(config: Params,
                            train_instances: List[Instance],
                            val_instances: List[Instance],
                            vocab: Optional[Vocabulary] = None,
                            device: Optional[int] = -1) -> Trainer:
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")
    vocab = vocab or Vocabulary.from_instances(train_instances)
    model = Model.from_params(model_params, vocab=vocab)
    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(vocab)
    trainer = Trainer.from_params(
        model=model,
        iterator=iterator,
        train_data=train_instances,
        validation_data=val_instances,
        serialization_dir=None,
        params=trainer_params,
    )
    return trainer
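For context, a minimal usage sketch of the helper above. The config path, data file names, and the dataset-reader step are assumptions for illustration only; the helper itself only requires a ``Params`` object with "model", "iterator", and "trainer" keys plus instance lists.

# Illustrative only: file names and the "dataset_reader" key are hypothetical.
config = Params.from_file("experiment.jsonnet")
reader = DatasetReader.from_params(config.pop("dataset_reader"))
train_instances = list(reader.read("train.tsv"))
val_instances = list(reader.read("dev.tsv"))

# Build the trainer on CPU (device=-1) and run training.
trainer = get_trainer_from_config(config, train_instances, val_instances, device=-1)
metrics = trainer.train()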
def test_should_stop_early_with_flat_lining_metric(self):
    # pylint: disable=protected-access
    flatline = [.2] * 6

    tracker = Trainer(self.model, self.optimizer,
                      self.iterator, self.instances,
                      validation_dataset=self.instances,
                      num_epochs=3,
                      serialization_dir=self.TEST_DIR,
                      patience=5,
                      validation_metric="+test")._metric_tracker
    tracker.add_metrics(flatline)
    assert tracker.should_stop_early()

    tracker = Trainer(self.model, self.optimizer,
                      self.iterator, self.instances,
                      validation_dataset=self.instances,
                      num_epochs=3,
                      serialization_dir=self.TEST_DIR,
                      patience=5,
                      validation_metric="-test")._metric_tracker
    tracker.add_metrics(flatline)
    assert tracker.should_stop_early()
def test_should_stop_early_with_increasing_metric(self):
    new_trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        patience=5,
        validation_metric="+test",
    )

    tracker = new_trainer._metric_tracker

    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.5, 0.3, 0.2, 0.1, 0.4, 0.4])
    assert new_tracker.should_stop_early()

    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1])
    assert not new_tracker.should_stop_early()
def test_mode_specified_in_reduce_on_plateau(self):
    # pylint: disable=protected-access
    for mode, metric in [("min", "-custom"), ("max", "+custom")]:
        trainer_params = Params({
            "validation_metric": metric,
            "learning_rate_scheduler": {"type": "reduce_on_plateau", "mode": mode},
            "optimizer": {"type": "adam", "lr": 0.01},
        })
        trainer = Trainer.from_params(model=self.model,
                                      serialization_dir=self.TEST_DIR,
                                      iterator=self.iterator,
                                      train_data=self.instances,
                                      validation_data=self.instances,
                                      params=trainer_params)
        assert trainer._learning_rate_scheduler.lr_scheduler.mode == mode
def setUp(self):
    super().setUp()
    params = Params(
        {
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            },
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "data_loader": {"batch_size": 2},
            "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
        }
    )
    all_datasets = datasets_from_params(params)
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for dataset in all_datasets.values() for instance in dataset),
    )
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    train_data = all_datasets["train"]
    train_data.index_with(vocab)

    data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader"))
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate")

    self.trainer = Trainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        train_data=train_data,
        params=trainer_params,
        validation_data=None,
        validation_iterator=None,
    )
def test_mode_doesnt_agree_with_metric(self):
    # pylint: disable=protected-access
    for mode, metric in [("max", "-custom"), ("min", "+custom")]:
        trainer_params = Params({
            "validation_metric": metric,
            "learning_rate_scheduler": {"type": "reduce_on_plateau", "mode": mode},
            "optimizer": {"type": "adam", "lr": 0.01},
        })
        with self.assertLogs(logger="allennlp.training.util", level="WARNING"):
            # We warn when the metric and the mode don't agree.
            trainer = Trainer.from_params(model=self.model,
                                          serialization_dir=self.TEST_DIR,
                                          iterator=self.iterator,
                                          train_data=self.instances,
                                          validation_data=self.instances,
                                          params=trainer_params)
        assert trainer._learning_rate_scheduler.lr_scheduler.mode == mode
def test_trainer_can_run(self):
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
    )
    metrics = trainer.train()
    assert "best_validation_loss" in metrics
    assert isinstance(metrics["best_validation_loss"], float)
    assert "best_validation_accuracy" in metrics
    assert isinstance(metrics["best_validation_accuracy"], float)
    assert "best_validation_accuracy3" in metrics
    assert isinstance(metrics["best_validation_accuracy3"], float)
    assert "best_epoch" in metrics
    assert isinstance(metrics["best_epoch"], int)

    # Making sure that both increasing and decreasing validation metrics work.
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        validation_metric="+loss",
        num_epochs=2,
    )
    metrics = trainer.train()
    assert "best_validation_loss" in metrics
    assert isinstance(metrics["best_validation_loss"], float)
    assert "best_validation_accuracy" in metrics
    assert isinstance(metrics["best_validation_accuracy"], float)
    assert "best_validation_accuracy3" in metrics
    assert isinstance(metrics["best_validation_accuracy3"], float)
    assert "best_epoch" in metrics
    assert isinstance(metrics["best_epoch"], int)
    assert "peak_cpu_memory_MB" in metrics
    assert isinstance(metrics["peak_cpu_memory_MB"], float)
    assert metrics["peak_cpu_memory_MB"] > 0
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs a training loop on the model using :class:`~allennlp.training.trainer.Trainer`,
    increasing the learning rate from ``start_lr`` to ``end_lr`` and recording the losses.

    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate at which to start the search.
    end_lr: ``float``
        The learning rate up to which the search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss multiplied by
        the stopping factor. If ``None``, the search proceeds until ``end_lr``.

    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Lists of learning rates and the corresponding losses.
        Note: each loss is recorded before applying the corresponding learning rate.
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle)
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    for i, batch in enumerate(train_generator_tqdm):
        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor ** i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best):
            logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
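A minimal sketch of driving this helper directly, assuming ``trainer`` was already built elsewhere (for example via ``Trainer.from_params``); the chosen learning-rate range and the final printout are illustrative, not part of the function above.

# Illustrative only: `trainer` is assumed to already hold a model, iterator, and training data.
learning_rates, losses = search_learning_rate(
    trainer,
    start_lr=1e-5,
    end_lr=1.0,
    num_batches=100,
    linear_steps=False,   # exponential schedule between start_lr and end_lr
)
# A common heuristic is to pick a rate from the region where the loss curve
# drops most steeply, before it starts to blow up.
for lr, loss in zip(learning_rates, losses):
    print(f"lr={lr:.2e} loss={loss:.4f}")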
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs the learning rate search for the given ``num_batches`` and saves the results
    in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        The learning rate at which to start the search.
    end_lr: ``float``
        The learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss multiplied by
        the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))