示例#1
0
    def test_regularization(self):
        """Compare the training and validation batch losses of the fixture model.

        The model is first asserted to report a regularization penalty of
        exactly 0, so both losses are expected to be element-wise equal.
        """
        assert self.model.get_regularization_penalty() == 0

        batcher = BasicIterator(batch_size=32)
        # ``None`` stands in for the optimizer; only ``_batch_loss`` is called here.
        trainer = Trainer(self.model, None, batcher, self.instances)

        # Calling `model.forward` twice on the same inputs raises a RuntimeError,
        # so a separate batch is drawn for each loss. The data and config are
        # such that the whole dataset is one batch.
        batch_for_training = next(batcher(self.instances, num_epochs=1))
        batch_for_validation = next(batcher(self.instances, num_epochs=1))

        loss_for_training = trainer._batch_loss(batch_for_training, for_training=True).data
        loss_for_validation = trainer._batch_loss(batch_for_validation, for_training=False).data

        # Per the original note, only the training loss carries the regularization
        # penalty; with the zero penalty asserted above the two losses must match.
        assert (loss_for_training == loss_for_validation).all()
示例#2
0
    def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
        """Check which checkpoints survive when time-based retention is enabled.

        Setup:
          * a fake data loader sleeps for 2.5 seconds each time it is iterated,
            so the total training time for one epoch is slightly greater than
            2.5 seconds;
          * training runs for 6 epochs, keeping the last 2 models, with models
            also kept every 5 seconds.

        Expected: models at epochs 2 and 4, plus the last two at 5 and 6.
        """
        wrapped_loader = DataLoader(self.instances,
                                    batch_size=2,
                                    collate_fn=allennlp_collate)

        class SlowDataLoader:
            # Class-level attribute: every instance delegates to the same loader.
            data_loader = wrapped_loader

            def __len__(self):
                return len(self.data_loader)

            def __iter__(self):
                # The delay is paid once per pass over the data.
                time.sleep(2.5)
                return iter(self.data_loader)

        trainer = Trainer(
            self.model,
            self.optimizer,
            SlowDataLoader(),
            num_epochs=6,
            serialization_dir=self.TEST_DIR,
            num_serialized_models_to_keep=2,
            keep_serialized_model_every_num_seconds=5,
        )
        trainer.train()

        # Inspect the serialized files for both the model and the training state.
        for pattern in ("model_state_epoch_*", "training_state_epoch_*"):
            matching_paths = glob.glob(os.path.join(self.TEST_DIR, pattern))
            kept_epochs = sorted(
                int(re.search(r"_([0-9])\.th", path).group(1))
                for path in matching_paths
            )
            # Epoch N is stored with N-1 in its file name.
            assert kept_epochs == [1, 3, 4, 5]
示例#3
0
    def test_trainer_saves_and_loads_best_validation_metrics_correctly_1(self):
        """Track the best validation metrics across an original and a restored run.

        Uses ``-loss`` as the validation metric, trains for one epoch, then
        builds a second trainer on the same serialization directory with
        ``num_epochs=2`` and trains again.
        """
        def build_trainer(epoch_budget):
            # Both trainers differ only in how many epochs they may run.
            return Trainer(
                self.model,
                self.optimizer,
                self.iterator,
                self.instances,
                validation_dataset=self.instances,
                validation_metric="-loss",
                num_epochs=epoch_budget,
                serialization_dir=self.TEST_DIR,
            )

        # Phase 1: one epoch of original training.
        first_trainer = build_trainer(1)
        first_trainer.train()
        first_trainer._restore_checkpoint()
        first_tracker = first_trainer._metric_tracker
        best_epoch_1 = first_tracker.best_epoch
        best_validation_metrics_epoch_1 = first_tracker.best_epoch_metrics
        # Example value: {'accuracy': 0.75, 'accuracy3': 1.0, 'loss': 0.6243013441562653}
        assert isinstance(best_validation_metrics_epoch_1, dict)
        assert "loss" in best_validation_metrics_epoch_1

        # Phase 2: one more epoch, run by a trainer sharing the serialization dir.
        second_trainer = build_trainer(2)
        second_trainer.train()
        second_trainer._restore_checkpoint()
        second_tracker = second_trainer._metric_tracker
        best_epoch_2 = second_tracker.best_epoch
        best_validation_metrics_epoch_2 = second_tracker.best_epoch_metrics

        # With -loss the 2nd epoch is expected to beat the 1st, so the best
        # epoch moves on and the best validation metrics must differ.
        assert best_epoch_1 == 0
        assert best_epoch_2 == 1
        assert best_validation_metrics_epoch_2 != best_validation_metrics_epoch_1
示例#4
0
    def test_trainer_can_run_and_resume_with_momentum_scheduler(self):
        """Train with a momentum scheduler, then resume with a fresh scheduler.

        After 4 epochs, a second trainer with a newly built scheduler restores
        the checkpoint; the restored epoch and the scheduler's ``last_epoch``
        are checked before training continues up to 6 epochs.
        """
        def make_scheduler():
            return MomentumScheduler.from_params(
                optimizer=self.optimizer,
                params=Params({
                    "type": "inverted_triangular",
                    "cool_down": 2,
                    "warm_up": 2
                }),
            )

        def make_trainer(momentum_scheduler, epoch_budget):
            return Trainer(
                model=self.model,
                optimizer=self.optimizer,
                iterator=self.iterator,
                momentum_scheduler=momentum_scheduler,
                validation_metric="-loss",
                train_dataset=self.instances,
                validation_dataset=self.instances,
                num_epochs=epoch_budget,
                serialization_dir=self.TEST_DIR,
            )

        # Initial run: 4 epochs.
        initial_trainer = make_trainer(make_scheduler(), 4)
        initial_trainer.train()

        # Resumed run: same serialization dir, brand-new scheduler object.
        resumed_trainer = make_trainer(make_scheduler(), 6)
        restored_epoch = resumed_trainer._restore_checkpoint()
        assert restored_epoch == 4
        assert resumed_trainer._momentum_scheduler.last_epoch == 3
        resumed_trainer.train()
示例#5
0
    def test_trainer_can_run_gradient_accumulation(self):
        """Check the per-epoch batch count when accumulating gradients over 2 steps."""
        instances = list(self.instances)
        steps_to_accumulate = 2

        trainer = Trainer(
            self.model,
            self.optimizer,
            self.data_loader,
            validation_data_loader=self.validation_data_loader,
            num_epochs=2,
            num_gradient_accumulation_steps=steps_to_accumulate,
        )
        assert trainer._num_gradient_accumulation_steps == steps_to_accumulate

        metrics = trainer.train()

        # ``training_epochs`` is incremented by one to obtain the epoch count used
        # as divisor of the total number of batches seen.
        epochs_run = metrics["training_epochs"] + 1
        trained_per_epoch = trainer._batch_num_total // epochs_run

        # Raw loader batches per epoch, then grouped by the accumulation factor;
        # both divisions round up.
        loader_batches = math.ceil(len(instances) / self.data_loader.batch_size)
        expected_per_epoch = math.ceil(loader_batches / steps_to_accumulate)

        assert trained_per_epoch == expected_per_epoch
示例#6
0
文件: train.py 项目: blalalt/bert
def train(model_name, embed_name, attn_name, data_name):
    """Build a classifier from the named components and train it.

    Args:
        model_name: Key passed to ``load_model`` to pick the model class.
        embed_name: Key passed to ``load_dataset_reader`` and ``load_encoder``.
        attn_name: Key passed to ``load_attn``.
        data_name: Key selecting the dataset; also used for the logger and the
            checkpoint location.

    Relies on module-level settings (``log_path``, ``checkpoints_path``,
    ``cuda_device``, ``batch_size``, ``learning_rate``, ``epoch``).
    """
    run_name = "_".join((model_name, embed_name, attn_name))
    logger = get_train_logger(log_path, run_name, data_name)
    checkpoint_dir = get_train_checkpoints(checkpoints_path, run_name, data_name)

    # Data: read both splits and derive the vocabulary from their union.
    reader = load_dataset_reader(data_name, embed_name, cuda_device)
    train_set, val_set = reader.load()
    vocab = Vocabulary.from_instances(train_set + val_set)

    batcher = BucketIterator(batch_size=batch_size,
                             sorting_keys=[('text', 'num_tokens')])
    batcher.index_with(vocab=vocab)

    # Model: encoder and attention are loaded first, then the classifier class.
    encoder = load_encoder(embed_name, vocab)
    attn = load_attn(attn_name)
    model_cls = load_model(model_name)
    clf = model_cls(vocab,
                    encoder=encoder,
                    attention=attn,
                    g=reader.g,
                    out_dim=reader.num_labels)
    if cuda_device > -1:
        # A non-negative device id means the model is moved to that GPU.
        clf = clf.cuda(cuda_device)

    adam = torch.optim.Adam(clf.parameters(), lr=learning_rate)
    Trainer(
        model=clf,
        optimizer=adam,
        iterator=batcher,
        validation_metric='+f-score',
        train_dataset=train_set,
        validation_dataset=val_set,
        patience=10,
        grad_clipping=10,
        num_epochs=epoch,
        cuda_device=cuda_device,
        num_serialized_models_to_keep=1,
        serialization_dir=checkpoint_dir,
    ).train()
示例#7
0
    def test_regularization(self):
        """Compare scaled training and validation losses to zero decimal places."""
        batcher = BasicIterator(batch_size=32)
        # ``None`` stands in for the optimizer; only ``batch_loss`` is called here.
        trainer = Trainer(self.model, None, batcher, self.instances)

        # Calling `model.forward` twice on the same inputs raises a RuntimeError,
        # so each loss gets its own batch. The data and config are such that the
        # whole dataset is one batch.
        train_batch = next(batcher(self.instances, num_epochs=1))
        valid_batch = next(batcher(self.instances, num_epochs=1))

        # Both losses are reduced to Python floats and divided by 10 before the
        # comparison.
        train_loss = trainer.batch_loss(train_batch, for_training=True).item() / 10
        valid_loss = trainer.batch_loss(valid_batch, for_training=False).item() / 10

        # Per the original note, only the training loss carries the regularization
        # penalty; the comparison therefore tolerates a difference (decimal=0).
        numpy.testing.assert_almost_equal(train_loss, valid_loss, decimal=0)
示例#8
0
    def test_trainer_saves_models_at_specified_interval(self):
        """Check interval-based checkpoints and restoring from a timestamped one."""
        batcher = BasicIterator(batch_size=4)
        batcher.index_with(self.vocab)

        Trainer(
            self.model,
            self.optimizer,
            batcher,
            self.instances,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            model_save_interval=0.0001,
        ).train()

        # Collect the epoch labels of every serialized model, in sorted file order.
        model_files = sorted(glob.glob(os.path.join(self.TEST_DIR, "model_state_epoch_*")))
        labels = []
        for path in model_files:
            labels.append(re.search(r"_([0-9\.\-]+)\.th", path).group(1))

        # There should be checkpoints at the end of each epoch and during each,
        # e.g. [0.timestamp, 0, 1.timestamp, 1]
        assert len(labels) == 4
        assert labels[3] == "1"
        assert "." in labels[0]

        # Now make certain we can restore from a timestamped checkpoint: delete
        # the end-of-epoch checkpoints of both epochs (and best.th) so that only
        # the timestamped ones remain.
        end_of_epoch_templates = ("model_state_epoch_{}.th", "training_state_epoch_{}.th")
        for epoch_index in range(2):
            for template in end_of_epoch_templates:
                os.remove(os.path.join(self.TEST_DIR, template.format(epoch_index)))
        os.remove(os.path.join(self.TEST_DIR, "best.th"))

        restore_trainer = Trainer(
            self.model,
            self.optimizer,
            self.iterator,
            self.instances,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            model_save_interval=0.0001,
        )
        restored_epoch = restore_trainer._restore_checkpoint()
        assert restored_epoch == 2
        # One batch per epoch.
        assert restore_trainer._batch_num_total == 2
示例#9
0
    def test_trainer_can_resume_training(self):
        """Train for one epoch, then resume from the checkpoint with a new trainer."""
        # pylint: disable=protected-access
        def build_trainer(epoch_budget):
            return Trainer(
                self.model,
                self.optimizer,
                self.iterator,
                self.instances,
                validation_dataset=self.instances,
                num_epochs=epoch_budget,
                serialization_dir=self.TEST_DIR,
            )

        first_trainer = build_trainer(1)
        first_trainer.train()

        resumed_trainer = build_trainer(3)
        restored_epoch = resumed_trainer._restore_checkpoint()
        assert restored_epoch == 1

        # The tracker inspected here belongs to the first trainer.
        first_tracker = first_trainer._metric_tracker
        assert first_tracker.is_best_so_far()
        assert first_tracker._best_so_far is not None

        resumed_trainer.train()
示例#10
0
    def test_regularization(self):
        """Training and validation losses must be identical for a zero penalty."""
        # Precondition: the fixture model reports no regularization penalty.
        assert self.model.get_regularization_penalty() == 0

        data_iterator = BasicIterator(batch_size=32)
        # The optimizer slot is left as ``None``; only ``_batch_loss`` is used.
        trainer = Trainer(self.model, None, data_iterator, self.instances)

        # A RuntimeError is raised if `model.forward` is called twice on the same
        # inputs, hence one fresh batch per loss. The data and config are such
        # that the whole dataset is one batch.
        first_batch = next(data_iterator(self.instances, num_epochs=1))
        second_batch = next(data_iterator(self.instances, num_epochs=1))

        loss_in_training_mode = trainer._batch_loss(first_batch, for_training=True).data
        loss_in_validation_mode = trainer._batch_loss(second_batch, for_training=False).data

        # Per the original note, the penalty is only part of the training loss;
        # it was asserted to be 0 above, so an exact match is required.
        assert (loss_in_training_mode == loss_in_validation_mode).all()
示例#11
0
    def test_production_rule_field_with_multiple_gpus(self):
        """Train an archived WikiTables model for two epochs on devices 0 and 1."""
        data_root = "allennlp/tests/fixtures/data/wikitables/"
        reader = WikiTablesDatasetReader(
            tables_directory=data_root,
            offline_logical_forms_directory=data_root + "action_space_walker_output/",
        )
        instances = reader.read(data_root + "sample_data.examples")

        # Load the serialized model fixture and move it to the GPU.
        serialization_root = self.FIXTURES_ROOT / "semantic_parsing" / "wikitables" / "serialization"
        model = load_archive(serialization_root / "model.tar.gz").model
        model.cuda()

        batcher = BasicIterator(batch_size=4)
        batcher.index_with(model.vocab)
        Trainer(
            model,
            self.optimizer,
            batcher,
            instances,
            num_epochs=2,
            cuda_device=[0, 1],
        ).train()
示例#12
0
 def test_should_stop_early_with_invalid_patience(self):
     """Every invalid ``patience`` value must make the ``Trainer`` constructor raise."""
     # Regex that the message of the raised ConfigurationError has to match.
     expected_error = ('.* is an invalid value for "patience": '
                       'it must be a positive integer or None '
                       '\\(if you want to disable early stopping\\)')
     invalid_values = (0, -1, -2, 1.5, 'None')
     for bad_patience in invalid_values:
         with pytest.raises(ConfigurationError, match=expected_error):
             Trainer(self.model, self.optimizer, self.iterator, self.instances,
                     validation_dataset=self.instances,
                     num_epochs=100,
                     patience=bad_patience,
                     validation_metric="+test")
示例#13
0
 def test_should_stop_early_with_invalid_patience(self):
     """Every invalid ``patience`` value must make the ``Trainer`` constructor raise.

     The values tried are zero, negative integers, a non-integer float and the
     string ``'None'``; each one is expected to raise ``ConfigurationError``.
     """
     for patience in [0, -1, -2, 1.5, 'None']:
         with pytest.raises(ConfigurationError):
             Trainer(self.model,
                     self.optimizer,
                     self.iterator,
                     self.instances,
                     validation_dataset=self.instances,
                     num_epochs=100,
                     patience=patience,
                     validation_metric="+test")
             # Only reached when the constructor accepted the bad value.
             # ``pytest.raises`` no longer accepts a ``message`` argument, so the
             # custom failure text is produced with ``pytest.fail`` instead.
             pytest.fail('No ConfigurationError for patience={}'.format(
                 patience))
示例#14
0
    def test_passing_trainer_multiple_gpus_raises_error(self):
        """Constructing a ``Trainer`` with a list of CUDA devices must raise."""
        self.model.cuda()

        batcher = BasicIterator(batch_size=4)
        batcher.index_with(self.vocab)

        # Keyword options of the trainer under test; the device list is the
        # part expected to be rejected.
        options = {"num_epochs": 2, "cuda_device": [0, 1]}
        with pytest.raises(ConfigurationError):
            Trainer(self.model, self.optimizer, batcher, self.instances, **options)
示例#15
0
    def test_trainer_can_run_multiple_gpu(self):
        """Train on devices 0 and 1 through a checking wrapper, then verify memory metrics."""
        self.model.cuda()

        class MetaDataCheckWrapper(Model):
            """
            Delegates to the wrapped model after asserting that the metadata
            field has been split across the batch dimension consistently with
            the tokens tensor when running on multiple gpus.
            """
            def __init__(self, model):
                super().__init__(model.vocab)
                self.model = model

            def forward(self, **kwargs) -> Dict[str, torch.Tensor]:  # type: ignore # pylint: disable=arguments-differ
                has_required_keys = 'metadata' in kwargs and 'tags' in kwargs
                assert has_required_keys, \
                    f'tokens and metadata must be provided. Got {kwargs.keys()} instead.'
                # Size of the leading (batch) dimension of the tokens tensor.
                batch_size = kwargs['tokens']['tokens'].size()[0]
                metadata = kwargs['metadata']
                assert len(metadata) == batch_size, \
                    f'metadata must be split appropriately. Expected {batch_size} elements, ' \
                    f"got {len(metadata)} elements."
                return self.model.forward(**kwargs)

        batcher = BasicIterator(batch_size=4)
        batcher.index_with(self.vocab)
        wrapped_model = MetaDataCheckWrapper(self.model)
        trainer = Trainer(wrapped_model,
                          self.optimizer,
                          batcher,
                          self.instances,
                          num_epochs=2,
                          cuda_device=[0, 1])
        metrics = trainer.train()

        # CPU peak memory is reported as a positive float.
        assert 'peak_cpu_memory_MB' in metrics
        cpu_peak = metrics['peak_cpu_memory_MB']
        assert isinstance(cpu_peak, float)
        assert cpu_peak > 0

        # Each GPU gets its own integer-valued entry.
        for gpu_id in (0, 1):
            gpu_key = f'peak_gpu_{gpu_id}_memory_MB'
            assert gpu_key in metrics
            assert isinstance(metrics[gpu_key], int)
示例#16
0
def main(args):
    """Train a model from a config file and optionally evaluate it on a test set.

    Args:
        args: Object exposing ``config_path`` (path of the experiment
            configuration) and ``output_dir`` (directory that receives the
            vocabulary, a copy of the config and the test metrics, and that is
            passed to the trainer as its serialization directory).

    The global logging set up at the start is always torn down again, even when
    data loading, training or evaluation raises.
    """
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    try:
        prepare_environment(params)

        reader = DatasetReader.from_params(params["dataset_reader"])
        train_dataset = reader.read(params.pop("train_data_path", None))
        valid_dataset = reader.read(params.pop("validation_data_path", None))
        test_data_path = params.pop("test_data_path", None)
        if test_data_path:
            # The test instances also contribute to the vocabulary.
            test_dataset = reader.read(test_data_path)
            vocab = Vocabulary.from_instances(train_dataset + valid_dataset + test_dataset)
        else:
            test_dataset = None
            vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

        model_params = params.pop("model", None)
        model = Model.from_params(model_params.duplicate(), vocab=vocab)
        vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))

        # Keep a copy of the config file next to the training outputs.
        config_copy_path = os.path.join(args.output_dir, "config.json")
        with open(args.config_path, "r", encoding="utf-8") as f_in, \
                open(config_copy_path, "w", encoding="utf-8") as f_out:
            f_out.write(f_in.read())

        iterator = DataIterator.from_params(params.pop("iterator", None))
        iterator.index_with(vocab)

        trainer_params = params.pop("trainer", None)
        # The trainer gets a duplicate so ``trainer_params`` can still be read
        # for ``cuda_device`` during evaluation below.
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=args.output_dir,
                                      iterator=iterator,
                                      train_data=train_dataset,
                                      validation_data=valid_dataset,
                                      params=trainer_params.duplicate())
        trainer.train()

        # Evaluate on the test set using the weights stored in ``best.th``.
        if test_dataset:
            logging.info("Evaluating on the test set")
            # Imported late on purpose; the original note ties this to keeping
            # the experiment reproducible.
            import torch
            model.load_state_dict(torch.load(os.path.join(args.output_dir, "best.th")))
            test_metrics = evaluate(model, test_dataset, iterator,
                                    cuda_device=trainer_params.pop("cuda_device", 0),
                                    batch_weight_key=None)
            logging.info(f"Metrics on the test set: {test_metrics}")
            metrics_path = os.path.join(args.output_dir, "test_metrics.txt")
            with open(metrics_path, "w", encoding="utf-8") as f_out:
                f_out.write(f"Metrics on the test set: {test_metrics}")
    finally:
        # Undo the global logging setup on success and on failure alike.
        cleanup_global_logging(stdout_handler)
示例#17
0
    def _setup(self):
        """Prepare the environment, persist config and vocabulary, and build the trainer."""
        # Seed every random source with the configured seed, if there is one.
        seed = self._trainer_config.random_seed
        if seed is None:
            seed_settings = {}
        else:
            seed_settings = {
                "random_seed": seed,
                "numpy_seed": seed,
                "pytorch_seed": seed,
            }
        prepare_environment(Params(seed_settings))
        os.makedirs(self._output_dir, exist_ok=True)

        # Write the sanitized configuration into the output folder.
        serialization_params = sanitize(self._allennlp_configuration())
        config_path = os.path.join(self._output_dir, CONFIG_NAME)
        with open(config_path, "w") as param_file:
            json.dump(serialization_params, param_file, indent=4)

        vocabulary_path = os.path.join(self._output_dir, "vocabulary")
        self._pipeline.vocab.save_to_files(vocabulary_path)

        # Index every available dataset with the backbone vocabulary.
        for dataset in (self._training, self._validation, self._test):
            if dataset is None:
                continue
            dataset.index_with(self._pipeline.backbone.vocab)

        trainer_params = Params(
            helpers.sanitize_for_params(
                self._trainer_config.to_allennlp_trainer()))

        pipeline_model = self._pipeline._model

        training_data_loader = create_dataloader(
            self._training,
            self._trainer_config.batch_size,
            self._trainer_config.data_bucketing,
            self._trainer_config.batches_per_epoch,
        )

        # A validation loader is only built when a validation dataset is set.
        if self._validation:
            validation_data_loader = create_dataloader(
                self._validation,
                self._trainer_config.batch_size,
                self._trainer_config.data_bucketing,
            )
        else:
            validation_data_loader = None

        self._trainer = Trainer.from_params(
            model=pipeline_model,
            serialization_dir=self._output_dir,
            data_loader=training_data_loader,
            validation_data_loader=validation_data_loader,
            params=trainer_params,
            epoch_callbacks=self._epoch_callbacks,
        )
示例#18
0
 def setUp(self):
     """Build the model, dataset reader, data loader and trainer from the fixture config."""
     super().setUp()
     fixtures = self.FIXTURES_ROOT
     param_file = fixtures / "simple_tagger" / "experiment_with_regularization.json"
     data_file = fixtures / "data" / "sequence_tagging.tsv"
     self.set_up_model(param_file, data_file)

     # The same config file drives the reader, the data loader and the trainer.
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     self.data_loader = DataLoader.from_params(
         dataset=self.instances,
         params=params["data_loader"],
     )
     trainer_section = params.get("trainer")
     self.trainer = Trainer.from_params(
         model=self.model,
         data_loader=self.data_loader,
         serialization_dir=self.TEST_DIR,
         params=trainer_section,
     )
示例#19
0
    def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
        """Check which checkpoints survive when time-based retention is enabled.

        Setup:
          * an iterator sleeps for 2.5 seconds whenever it creates batches, so
            the total training time for one epoch is slightly greater than
            2.5 seconds;
          * training runs for 6 epochs, keeping the last 2 models, with models
            also kept every 5 seconds.

        Expected: models at epochs 2 and 4, plus the last two at 5 and 6.
        """
        class WaitingIterator(BasicIterator):
            def _create_batches(self, *args, **kwargs):
                # Delay first, then defer to the regular batching logic.
                time.sleep(2.5)
                return super()._create_batches(*args, **kwargs)

        slow_iterator = WaitingIterator(batch_size=2)
        slow_iterator.index_with(self.vocab)

        Trainer(
            self.model,
            self.optimizer,
            slow_iterator,
            self.instances,
            num_epochs=6,
            serialization_dir=self.TEST_DIR,
            num_serialized_models_to_keep=2,
            keep_serialized_model_every_num_seconds=5,
        ).train()

        # Inspect the serialized files for both the model and the training state.
        for pattern in ("model_state_epoch_*", "training_state_epoch_*"):
            kept_epochs = []
            for path in glob.glob(os.path.join(self.TEST_DIR, pattern)):
                kept_epochs.append(int(re.search(r"_([0-9])\.th", path).group(1)))
            kept_epochs.sort()
            # Epoch N is stored with N-1 in its file name.
            assert kept_epochs == [1, 3, 4, 5]
示例#20
0
 def setUp(self):
     """Build the model, dataset reader, iterator and trainer from the fixture config."""
     super().setUp()
     fixtures = self.FIXTURES_ROOT
     param_file = fixtures / 'simple_tagger' / 'experiment_with_regularization.json'
     data_file = fixtures / 'data' / 'sequence_tagging.tsv'
     self.set_up_model(param_file, data_file)

     # The same config file drives the reader, the iterator and the trainer.
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params['dataset_reader'])
     self.iterator = DataIterator.from_params(params['iterator'])
     trainer_section = params.get('trainer')
     # Arguments are passed positionally, with ``None`` as the fifth one.
     self.trainer = Trainer.from_params(
         self.model, self.TEST_DIR, self.iterator, self.dataset, None, trainer_section
     )
示例#21
0
    def test_should_stop_early_with_increasing_metric(self):
        """Early stopping for a metric that should increase (``+test``), with patience 5."""
        # pylint: disable=protected-access
        base_trainer = Trainer(
            self.model,
            self.optimizer,
            self.iterator,
            self.instances,
            validation_dataset=self.instances,
            num_epochs=3,
            serialization_dir=self.TEST_DIR,
            patience=5,
            validation_metric="+test",
        )
        pristine_tracker = base_trainer._metric_tracker

        # Best value comes first and is never beaten afterwards: stop.
        stalled_history = [.5, .3, .2, .1, .4, .4]
        stalled_tracker = copy.deepcopy(pristine_tracker)
        stalled_tracker.add_metrics(stalled_history)
        assert stalled_tracker.should_stop_early()

        # A new best value appears late in the history: keep going.
        improving_history = [.3, .3, .3, .2, .5, .1]
        improving_tracker = copy.deepcopy(pristine_tracker)
        improving_tracker.add_metrics(improving_history)
        assert not improving_tracker.should_stop_early()
示例#22
0
 def setUp(self):
     """Build the model, dataset reader, iterator and trainer from the fixture config."""
     super(SimpleTaggerRegularizationTest, self).setUp()
     fixtures = self.FIXTURES_ROOT
     param_file = fixtures / u'simple_tagger' / u'experiment_with_regularization.json'
     data_file = fixtures / u'data' / u'sequence_tagging.tsv'
     self.set_up_model(param_file, data_file)

     # The same config file drives the reader, the iterator and the trainer.
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params[u'dataset_reader'])
     self.iterator = DataIterator.from_params(params[u'iterator'])
     trainer_section = params.get(u'trainer')
     # Arguments are passed positionally, with ``None`` as the fifth one.
     self.trainer = Trainer.from_params(
         self.model, self.TEST_DIR, self.iterator, self.dataset, None, trainer_section
     )
示例#23
0
 def test_should_stop_early_with_invalid_patience(self):
     """Every invalid ``patience`` value must make the ``Trainer`` constructor raise."""
     # Regex that the message of the raised ConfigurationError has to match.
     expected_error = (
         '.* is an invalid value for "patience": '
         "it must be a positive integer or None "
         "\\(if you want to disable early stopping\\)"
     )
     invalid_values = (0, -1, -2, 1.5, "None")
     for bad_patience in invalid_values:
         with pytest.raises(ConfigurationError, match=expected_error):
             Trainer(
                 self.model,
                 self.optimizer,
                 self.data_loader,
                 validation_data_loader=self.validation_data_loader,
                 num_epochs=100,
                 patience=bad_patience,
                 validation_metric="+test",
             )
示例#24
0
    def test_trainer_can_resume_with_lr_scheduler(self):
        """Train with an exponential LR scheduler, then resume with a fresh one.

        After 2 epochs, a second trainer with a newly built scheduler restores
        the checkpoint; the restored epoch and the wrapped scheduler's
        ``last_epoch`` are checked before training continues up to 4 epochs.
        """
        def make_scheduler():
            return LearningRateScheduler.from_params(
                self.optimizer, Params({
                    "type": "exponential",
                    "gamma": 0.5
                }))

        def make_trainer(scheduler, epoch_budget):
            return Trainer(
                model=self.model,
                optimizer=self.optimizer,
                iterator=self.iterator,
                learning_rate_scheduler=scheduler,
                train_dataset=self.instances,
                validation_dataset=self.instances,
                num_epochs=epoch_budget,
                serialization_dir=self.TEST_DIR,
            )

        # Initial run: 2 epochs.
        initial_trainer = make_trainer(make_scheduler(), 2)
        initial_trainer.train()

        # Resumed run: same serialization dir, brand-new scheduler object.
        resumed_trainer = make_trainer(make_scheduler(), 4)
        restored_epoch = resumed_trainer._restore_checkpoint()
        assert restored_epoch == 2
        restored_scheduler = resumed_trainer._learning_rate_scheduler.lr_scheduler
        assert restored_scheduler.last_epoch == 1
        resumed_trainer.train()
示例#25
0
    def test_trainer_can_resume_training_for_exponential_moving_average(self):
        """Train one epoch with a moving average, then resume with a new trainer."""
        def build_trainer(average, epoch_budget):
            return Trainer(
                self.model,
                self.optimizer,
                self.iterator,
                self.instances,
                validation_dataset=self.instances,
                num_epochs=epoch_budget,
                serialization_dir=self.TEST_DIR,
                moving_average=average,
            )

        first_average = ExponentialMovingAverage(self.model.named_parameters())
        first_trainer = build_trainer(first_average, 1)
        first_trainer.train()

        # The resumed trainer gets its own, freshly constructed moving average.
        second_average = ExponentialMovingAverage(self.model.named_parameters())
        resumed_trainer = build_trainer(second_average, 3)

        restored_epoch = resumed_trainer._restore_checkpoint()
        assert restored_epoch == 1

        # The tracker inspected here belongs to the first trainer.
        first_tracker = first_trainer._metric_tracker
        assert first_tracker.is_best_so_far()
        assert first_tracker._best_so_far is not None

        resumed_trainer.train()
示例#26
0
def get_trainer_from_config(config: Params,
                            train_instances: List[Instance],
                            val_instances: List[Instance],
                            vocab: Optional[Vocabulary] = None,
                            device: Optional[int] = -1) -> Trainer:
    """
    Assemble a ``Trainer`` from the ``"trainer"``, ``"model"`` and ``"iterator"``
    sections of ``config``.

    Parameters
    ----------
    config : ``Params``
        Experiment configuration; the three sections above are popped from it.
    train_instances : ``List[Instance]``
        Training instances; also used to build the vocabulary when ``vocab`` is
        not supplied.
    val_instances : ``List[Instance]``
        Instances passed to the trainer as validation data.
    vocab : ``Optional[Vocabulary]``
        Vocabulary to use; when falsy, one is built from ``train_instances``.
    device : ``Optional[int]``
        Value written into the trainer section under ``"cuda_device"``.

    Returns
    -------
    ``Trainer``
        A trainer created with ``serialization_dir=None``.
    """
    trainer_section = config.pop("trainer")
    # The caller-provided device always overrides whatever the config says.
    trainer_section["cuda_device"] = device
    model_section = config.pop("model")

    if not vocab:
        vocab = Vocabulary.from_instances(train_instances)

    built_model = Model.from_params(model_section, vocab=vocab)

    data_iterator = DataIterator.from_params(config.pop("iterator"))
    data_iterator.index_with(vocab)

    return Trainer.from_params(
        model=built_model,
        iterator=data_iterator,
        train_data=train_instances,
        validation_data=val_instances,
        serialization_dir=None,
        params=trainer_section)
示例#27
0
    def test_should_stop_early_with_flat_lining_metric(self):
        """
        Feeding six identical metric values to a tracker configured with
        ``patience=5`` must trigger early stopping, both for a ``"+test"`` and
        for a ``"-test"`` validation metric.
        """
        # pylint: disable=protected-access
        flatline = [.2] * 6
        for validation_metric in ("+test", "-test"):
            tracker = Trainer(self.model, self.optimizer,
                              self.iterator, self.instances,
                              validation_dataset=self.instances,
                              num_epochs=3,
                              serialization_dir=self.TEST_DIR,
                              patience=5,
                              validation_metric=validation_metric)._metric_tracker
            tracker.add_metrics(flatline)
            # ``should_stop_early`` has to be *called*: asserting the bare
            # attribute only checks that a bound method object is truthy,
            # which is always the case, so the test could never fail.
            assert tracker.should_stop_early()
示例#28
0
    def test_should_stop_early_with_increasing_metric(self):
        """
        Check the early-stopping decision of a ``"+test"`` metric tracker with
        ``patience=5`` on two six-value histories: one that must stop and one
        that must not.
        """
        trainer_under_test = Trainer(
            self.model,
            self.optimizer,
            self.data_loader,
            validation_data_loader=self.validation_data_loader,
            num_epochs=3,
            serialization_dir=self.TEST_DIR,
            patience=5,
            validation_metric="+test",
        )
        base_tracker = trainer_under_test._metric_tracker

        def tracker_after(history):
            # Each scenario works on its own deep copy so that the metric
            # histories of the two scenarios cannot influence each other.
            clone = copy.deepcopy(base_tracker)
            clone.add_metrics(history)
            return clone

        # The first value is never exceeded by the five values that follow it.
        assert tracker_after([0.5, 0.3, 0.2, 0.1, 0.4, 0.4]).should_stop_early()

        # Here the fifth value (0.5) is larger than everything before it.
        assert not tracker_after([0.3, 0.3, 0.3, 0.2, 0.5, 0.1]).should_stop_early()
示例#29
0
 def test_mode_specified_in_reduce_on_plateau(self):
     """
     A ``reduce_on_plateau`` scheduler configured with an explicit ``mode``
     must expose exactly that mode on the underlying scheduler object.
     """
     # pylint: disable=protected-access
     mode_metric_pairs = (("min", "-custom"), ("max", "+custom"))
     for mode, metric in mode_metric_pairs:
         scheduler_config = {"type": "reduce_on_plateau", "mode": mode}
         optimizer_config = {"type": "adam", "lr": 0.01}
         trainer_params = Params({
             "validation_metric": metric,
             "learning_rate_scheduler": scheduler_config,
             "optimizer": optimizer_config,
         })

         trainer = Trainer.from_params(
             model=self.model,
             serialization_dir=self.TEST_DIR,
             iterator=self.iterator,
             train_data=self.instances,
             validation_data=self.instances,
             params=trainer_params)

         wrapped_scheduler = trainer._learning_rate_scheduler.lr_scheduler
         assert wrapped_scheduler.mode == mode
示例#30
0
    def setUp(self):
        """
        Build a ``simple_tagger`` model, a data loader over the sequence-tagging
        fixture and a trainer, and store the trainer on ``self.trainer``.
        """
        super().setUp()

        # The same fixture file serves as both training and validation data.
        fixture_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
        model_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
        params = Params(
            {
                "model": model_config,
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": fixture_path,
                "validation_data_path": fixture_path,
                "data_loader": {"batch_size": 2},
                "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
            }
        )

        all_datasets = datasets_from_params(params)

        vocab_params = params.pop("vocabulary", {})
        # Lazily walks every instance of every loaded dataset.
        every_instance = (
            instance for dataset in all_datasets.values() for instance in dataset
        )
        vocab = Vocabulary.from_params(vocab_params, instances=every_instance)

        model = Model.from_params(vocab=vocab, params=params.pop("model"))

        train_data = all_datasets["train"]
        train_data.index_with(vocab)
        data_loader = DataLoader.from_params(
            dataset=train_data, params=params.pop("data_loader")
        )

        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate")

        self.trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=data_loader,
            train_data=train_data,
            params=trainer_params,
            validation_data=None,
            validation_iterator=None,
        )
示例#31
0
 def test_mode_doesnt_agree_with_metric(self):
     """
     When the ``reduce_on_plateau`` mode contradicts the sign of the validation
     metric, building the trainer must log a WARNING on the
     ``allennlp.training.util`` logger, and the scheduler must still carry the
     explicitly configured mode.
     """
     # pylint: disable=protected-access
     conflicting_pairs = (("max", "-custom"), ("min", "+custom"))
     for mode, metric in conflicting_pairs:
         scheduler_config = {"type": "reduce_on_plateau", "mode": mode}
         optimizer_config = {"type": "adam", "lr": 0.01}
         trainer_params = Params({
             "validation_metric": metric,
             "learning_rate_scheduler": scheduler_config,
             "optimizer": optimizer_config,
         })

         # we warn when the metric and the mode don't agree
         expected_warning = self.assertLogs(logger="allennlp.training.util",
                                            level="WARNING")
         with expected_warning:
             trainer = Trainer.from_params(
                 model=self.model,
                 serialization_dir=self.TEST_DIR,
                 iterator=self.iterator,
                 train_data=self.instances,
                 validation_data=self.instances,
                 params=trainer_params)

         wrapped_scheduler = trainer._learning_rate_scheduler.lr_scheduler
         assert wrapped_scheduler.mode == mode
示例#32
0
    def test_trainer_can_run(self):
        """
        Train twice for two epochs (default validation metric, then ``"+loss"``)
        and check the type of the "best" entries in the returned metrics.
        """
        float_metric_names = (
            "best_validation_loss",
            "best_validation_accuracy",
            "best_validation_accuracy3",
        )

        def check_best_metrics(results):
            # Every "best" validation entry must be present as a float, and
            # the best epoch must be present as an int.
            for name in float_metric_names:
                assert name in results
                assert isinstance(results[name], float)
            assert "best_epoch" in results
            assert isinstance(results["best_epoch"], int)

        default_trainer = Trainer(
            model=self.model,
            optimizer=self.optimizer,
            iterator=self.iterator,
            train_dataset=self.instances,
            validation_dataset=self.instances,
            num_epochs=2,
        )
        check_best_metrics(default_trainer.train())

        # Making sure that both increasing and decreasing validation metrics work.
        increasing_trainer = Trainer(
            model=self.model,
            optimizer=self.optimizer,
            iterator=self.iterator,
            train_dataset=self.instances,
            validation_dataset=self.instances,
            validation_metric="+loss",
            num_epochs=2,
        )
        metrics = increasing_trainer.train()
        check_best_metrics(metrics)

        # Only the second run is additionally checked for memory reporting.
        assert "peak_cpu_memory_MB" in metrics
        assert isinstance(metrics["peak_cpu_memory_MB"], float)
        assert metrics["peak_cpu_memory_MB"] > 0
示例#33
0
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Sweeps the learning rate from ``start_lr`` to ``end_lr`` while training the
    model held by a :class:`~allennlp.training.trainer.Trainer`, recording the
    loss observed at every step.

    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
        Supplies the model, optimizer, iterator and training data.
    start_lr: ``float``
        Learning rate used for the first batch.
    end_lr: ``float``
        Learning rate reached at step ``num_batches``.
    num_batches: ``int``
        Number of learning rate steps; must be greater than 10. Steps are
        numbered from 0 and the loop stops after step ``num_batches``, so up
        to ``num_batches + 1`` batches are processed.
    linear_steps: ``bool``
        If ``True`` the learning rate grows linearly, otherwise exponentially.
    stopping_factor: ``float``
        Stop as soon as the current loss is NaN or exceeds ``stopping_factor``
        times the best loss recorded so far. If ``None`` the search always runs
        through to ``end_lr``.

    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        The learning rates that were applied and the loss measured for each.
        Note: each loss is computed before the optimizer step taken with the
        corresponding learning rate.

    Raises
    ------
    ConfigurationError
        If ``num_batches`` is not greater than 10.
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    batches = trainer.iterator(trainer.train_data,
                               shuffle=trainer.shuffle)
    progress = Tqdm.tqdm(batches,
                         total=num_batches)

    # Additive increment for a linear sweep, multiplicative factor otherwise.
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    def lr_for_step(step):
        if linear_steps:
            return start_lr + (lr_update_factor * step)
        return start_lr * (lr_update_factor ** step)

    learning_rates = []
    losses = []
    lowest_loss = 1e9

    for step, batch in enumerate(progress):
        current_lr = lr_for_step(step)
        for group in trainer.optimizer.param_groups:
            group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        batch_loss = trainer.batch_loss(batch, for_training=True)
        batch_loss.backward()
        loss_value = batch_loss.detach().cpu().item()

        # Bail out before the optimizer step, so that a diverged batch is
        # neither applied nor recorded.
        if stopping_factor is not None and (
                math.isnan(loss_value) or loss_value > stopping_factor * lowest_loss):
            logger.info(f'Loss ({loss_value}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss_value)

        # Losses from steps 0..10 never become the reference "best" loss.
        if step > 10 and loss_value < lowest_loss:
            lowest_loss = loss_value

        if step == num_batches:
            break

    return learning_rates, losses
示例#34
0
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Builds a model and trainer from ``params``, runs a learning rate search over
    ``num_batches`` steps and saves a plot of the smoothed losses as
    ``lr-losses.png`` inside ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        If ``True`` the learning rate grows linearly, otherwise exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        a multiple of the stopping factor. If ``None`` the search proceeds till
        ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is not empty (after the
        optional forced removal), or if ``datasets_for_vocab_creation`` names a
        dataset that was not loaded.
    """
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    # ``cuda_device`` may be a single id or a list of ids; check each of them.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    devices_to_check = cuda_device if isinstance(cuda_device, list) else [cuda_device]
    for device in devices_to_check:
        check_for_gpu(device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab_params = params.pop("vocabulary", {})
    # Lazily yields the instances of the datasets selected for the vocabulary.
    vocab_instances = (
        instance
        for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    )
    vocab = Vocabulary.from_params(vocab_params, vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")

    # Disable gradients for every parameter whose name matches a "no_grad" regex.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)