Example #1
    def find_lr(
        self,
        trainer_config: TrainerConfiguration,
        find_lr_config: FindLRConfiguration,
        training_data: Union[DataSource, InstancesDataset],
    ):
        """Returns a learning rate scan on the model.

        It increases the learning rate step by step while recording the losses.
        For a guide on how to select the learning rate please refer to this excellent
        [blog post](https://towardsdatascience.com/estimating-optimal-learning-rate-for-a-deep-neural-network-ce32f2556ce0)

        Parameters
        ----------
        trainer_config
            A trainer configuration
        find_lr_config
            A configuration for finding the learning rate
        training_data
            The training data

        Returns
        -------
        (learning_rates, losses)
            Returns a list of learning rates and corresponding losses.
            Note: The losses are recorded before applying the corresponding learning rate
        """
        from biome.text._helpers import create_trainer_for_finding_lr

        # The original pipeline remains unchanged
        find_lr_pipeline = self._make_copy()

        if vocabulary.is_empty(find_lr_pipeline.backbone.vocab,
                               self.config.features.keys):
            raise EmptyVocabError(
                "Found an empty vocabulary. "
                "You probably forgot to create a vocabulary with '.create_vocabulary()'."
            )

        if isinstance(training_data, DataSource):
            training_data = find_lr_pipeline.create_dataset(training_data)

        trainer = create_trainer_for_finding_lr(
            pipeline=find_lr_pipeline,
            trainer_config=trainer_config,
            training_data=training_data,
        )

        learning_rates, losses = search_learning_rate(
            trainer=trainer,
            start_lr=find_lr_config.start_lr,
            end_lr=find_lr_config.end_lr,
            num_batches=find_lr_config.num_batches,
            linear_steps=find_lr_config.linear_steps,
            stopping_factor=find_lr_config.stopping_factor,
        )

        return learning_rates, losses
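
A minimal usage sketch of the scan above. The pipeline definition, the file names, and the constructor arguments are placeholders, and the import paths are assumptions inferred from this section's snippets rather than a verified API; adjust them to your biome.text version:

    from biome.text import Pipeline
    from biome.text.configuration import FindLRConfiguration, TrainerConfiguration
    from biome.text.data import DataSource

    # Hypothetical pipeline and data source; the scan requires a non-empty vocabulary,
    # e.g. one created beforehand with pipeline.create_vocabulary(...)
    pipeline = Pipeline.from_yaml("pipeline.yml")
    train_ds = DataSource(source="train.csv")

    learning_rates, losses = pipeline.find_lr(
        trainer_config=TrainerConfiguration(optimizer={"type": "adam", "lr": 0.001}),
        find_lr_config=FindLRConfiguration(start_lr=1e-5, end_lr=10, num_batches=100),
        training_data=train_ds,
    )

    # Each loss is recorded *before* its learning rate is applied; a common heuristic is to
    # pick a learning rate slightly below the point where the loss starts to diverge.
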
Example #2
    def _setup_for_training(self):
        """Create vocab, configure default loggers/callbacks, create optimizer/lr scheduler, setup best metrics"""
        # create vocab
        if self._vocab_config is not None:
            vocab_datasets = [self._train_instances]
            if (
                self._valid_instances is not None
                and self._vocab_config.include_valid_data
            ):
                vocab_datasets += [self._valid_instances]
            self._pipeline.create_vocab(vocab_datasets, config=self._vocab_config)

        # Check for an empty vocab
        if vocabulary.is_empty(
            self._pipeline.vocab, self._pipeline.config.features.configured_namespaces
        ):
            raise EmptyVocabError(
                "All your features need a non-empty vocabulary for a training!"
            )

        # we give some special attention to these loggers/callbacks
        self._wandb_logger: Optional[WandbLogger] = None
        self._model_checkpoint: Optional[ModelCheckpoint] = None

        # add default callbacks/loggers
        self._trainer_config.callbacks = self._add_default_callbacks()
        if self._trainer_config.logger is not False:
            self._trainer_config.logger = self._add_default_loggers()

        # create optimizer, has to come AFTER creating the vocab!
        self._pipeline.model.optimizer = Optimizer.from_params(
            Params(
                {
                    "model_parameters": self._pipeline.model.named_parameters(),
                    **self._trainer_config.optimizer,
                }
            )
        )

        # create lr scheduler, has to come AFTER creating the optimizer!
        if not (
            self._trainer_config.warmup_steps == 0
            and self._trainer_config.lr_decay is None
        ):
            self._pipeline.model.lr_scheduler = self._create_lr_scheduler()
        else:
            self._pipeline.model.lr_scheduler = None

        # set monitor and mode for best validation metrics
        self._pipeline.model.monitor = self._trainer_config.monitor
        self._pipeline.model.monitor_mode = self._trainer_config.monitor_mode
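
The method only reads a handful of fields from the trainer configuration. Below is a hedged sketch of a configuration that would exercise both the optimizer creation and the lr-scheduler branch; the constructor parameter names are assumptions mirroring the attributes accessed above, and the concrete values are placeholders:

    from biome.text.configuration import TrainerConfiguration

    trainer_config = TrainerConfiguration(
        # unpacked into Params together with the model parameters (an AllenNLP-style optimizer dict)
        optimizer={"type": "adamw", "lr": 3e-5, "weight_decay": 0.01},
        # warmup_steps != 0 (or a non-None lr_decay) makes _setup_for_training create a scheduler
        warmup_steps=500,
        lr_decay="linear",  # placeholder value; accepted values depend on the library version
        # forwarded to the model to track the best validation metric
        monitor="validation_loss",
        monitor_mode="min",
    )
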
Example #3
    def train(
        self,
        output: str,
        training: Union[DataSource, InstancesDataset],
        trainer: Optional[TrainerConfiguration] = None,
        validation: Optional[Union[DataSource, InstancesDataset]] = None,
        test: Optional[Union[DataSource, InstancesDataset]] = None,
        extend_vocab: Optional[VocabularyConfiguration] = None,
        loggers: Optional[List[BaseTrainLogger]] = None,
        restore: bool = False,
        quiet: bool = False,
    ) -> TrainingResults:
        """Launches a training run with the specified configurations and data sources

        Parameters
        ----------
        output:
            The experiment output path
        training:
            The training DataSource
        trainer:
            A trainer configuration; defaults to `TrainerConfiguration()` if not provided
        validation:
            The validation DataSource (optional)
        test:
            The test DataSource (optional)
        extend_vocab:
            Extends the vocabulary tokens with the provided VocabularyConfiguration
        loggers:
            A list of loggers that execute a callback before the training, after each epoch,
            and at the end of the training (see `biome.text.logger.MlflowLogger`, for example)
        restore:
            If enabled, tries to read the previous training state from the `output` folder and
            continues the training process
        quiet:
            If enabled, disables most logging messages, keeping only warning and error messages.
            In any case, all logging output is stored in a file at ${output}/train.log

        Returns
        -------
        training_results
            Training results including the generated model path and the related metrics
        """
        if extend_vocab is not None and isinstance(self, _BlankPipeline):
            raise ActionNotSupportedError(
                "If you want to customize pipeline vocab, please use the `create_vocabulary()` method instead"
            )

        trainer = trainer or TrainerConfiguration()
        try:
            if not restore and os.path.isdir(output):
                shutil.rmtree(output)

            self.__configure_training_logging(output, quiet)

            # The original pipeline remains unchanged
            train_pipeline = self._make_copy()
            vocab = None

            if restore:
                vocab = vocabulary.load_vocabulary(
                    os.path.join(output, "vocabulary"))
            if extend_vocab is not None and not vocab:
                vocab = train_pipeline._extend_vocabulary(
                    train_pipeline.backbone.vocab, vocab_config=extend_vocab)
            if vocab:
                train_pipeline._set_vocab(vocab)

            if vocabulary.is_empty(train_pipeline.backbone.vocab,
                                   self.config.features.keys):
                raise EmptyVocabError(
                    "Found an empty vocabulary. "
                    "You probably forgot to create a vocabulary with '.create_vocabulary()'."
                )

            from ._helpers import PipelineTrainer

            # Convert any DataSource input into an instances dataset the trainer can consume
            datasets = {
                "training": training,
                "validation": validation,
                "test": test,
            }
            for name, dataset in datasets.items():
                if isinstance(dataset, DataSource):
                    datasets[name] = train_pipeline.create_dataset(dataset)

            loggers = loggers or []
            add_default_wandb_logger_if_needed(loggers)

            pipeline_trainer = PipelineTrainer(
                train_pipeline,
                trainer_config=trainer,
                output_dir=output,
                epoch_callbacks=loggers,
                **datasets,
            )

            for logger in loggers:
                try:
                    logger.init_train(
                        pipeline=train_pipeline,
                        trainer_configuration=trainer,
                        **datasets,
                    )
                except Exception as e:
                    self.__LOGGER.warning("Logger %s failed on init_train: %s",
                                          logger, e)

            model_path, metrics = pipeline_trainer.train()
            train_results = TrainingResults(model_path, metrics)

            for logger in loggers:
                try:
                    logger.end_train(train_results)
                except Exception as e:
                    self.__LOGGER.warning("Logger %s failed on end_traing: %s",
                                          logger, e)

            return train_results

        finally:
            self.__restore_training_logging()
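
A hypothetical end-to-end call of the method above; as in the earlier sketch, the pipeline definition, the paths, and the exact constructor arguments are placeholders rather than a verified API:

    from biome.text import Pipeline
    from biome.text.configuration import TrainerConfiguration
    from biome.text.data import DataSource

    pipeline = Pipeline.from_yaml("pipeline.yml")

    results = pipeline.train(
        output="experiments/run_01",
        training=DataSource(source="train.csv"),
        validation=DataSource(source="valid.csv"),
        trainer=TrainerConfiguration(optimizer={"type": "adam", "lr": 0.001}),
        restore=False,  # with restore disabled, any previous run in `output` is wiped
        quiet=True,     # full logs still end up in experiments/run_01/train.log
    )

    # attribute names assumed from the positional TrainingResults(model_path, metrics) above
    print(results.model_path, results.metrics)
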
Example #4
    def _prepare_vocab(
        self,
        vocabulary_folder: Optional[str] = None,
        vocab_config: Optional[Union[str,
                                     VocabularyConfiguration]] = "default",
        training_data: Optional[Dataset] = None,
        lazy: bool = False,
    ):
        """Prepare and set the vocab for a training or learning rate scan.

        Parameters
        ----------
        vocabulary_folder
            If specified, load the vocab from this folder
        vocab_config
            A `VocabularyConfiguration` to create/extend the pipeline's vocabulary if necessary.
            If 'default' (str), we will use the default configuration
            `VocabularyConfiguration(datasets=[training_data])`.
            If None, we will leave the pipeline's vocabulary untouched.
        training_data
            The training data in case we need to construct the default config
        lazy
            If True, dataset instances are lazily loaded from disk; otherwise they are loaded and kept in memory.
        """
        # The transformers feature comes with its own vocab, no need to prepare anything if it is the only feature
        if self.config.features.configured_namespaces == [
                TransformersFeatures.namespace
        ]:
            return

        # If the vocab is empty, we assume this is an untrained pipeline
        # and we want to raise an error if the weights file is not found.
        # Extending the vocab with a non-existent weights file only throws a warning.
        try:
            assert is_url_or_existing_file(
                Path(self.config.features.word.weights_file))
        except AssertionError:
            if vocabulary.is_empty(self.vocab, [WordFeatures.namespace]):
                raise FileNotFoundError(
                    f"Cannot find the weights file {self.config.features.word.weights_file}"
                )
        # no word feature, or weights_file is None
        except (AttributeError, TypeError):
            pass

        if vocabulary_folder is not None:
            self._model.extend_vocabulary(
                Vocabulary.from_files(vocabulary_folder))
            vocab_config = None

        vocab_config = (VocabularyConfiguration(datasets=[training_data])
                        if vocab_config == "default" else vocab_config)
        if vocab_config is not None:
            vocab = vocab_config.build_vocab(pipeline=self, lazy=lazy)
            self._model.extend_vocabulary(vocab)

        if vocabulary.is_empty(self.vocab,
                               self.config.features.configured_namespaces):
            raise EmptyVocabError(
                "All your features need a non-empty vocabulary for a training!"
            )
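
To make the branches above concrete, here is a sketch of how a caller might drive this (private) helper. The pipeline definition, the `Dataset.from_csv` constructors, and the file names are assumptions used only for illustration; the keyword arguments match the signature shown above:

    from biome.text import Dataset, Pipeline
    from biome.text.configuration import VocabularyConfiguration

    pipeline = Pipeline.from_yaml("pipeline.yml")
    train_dataset = Dataset.from_csv("train.csv")
    valid_dataset = Dataset.from_csv("valid.csv")

    # 1. Default: build the vocabulary from the training data only
    pipeline._prepare_vocab(training_data=train_dataset)

    # 2. Explicit configuration, e.g. also including the validation split
    pipeline._prepare_vocab(
        vocab_config=VocabularyConfiguration(datasets=[train_dataset, valid_dataset])
    )

    # 3. Reuse a previously saved vocabulary; an explicit folder takes precedence over vocab_config
    pipeline._prepare_vocab(vocabulary_folder="output/vocabulary")

    # 4. Leave the pipeline's vocabulary untouched (e.g. for an already trained pipeline)
    pipeline._prepare_vocab(vocab_config=None)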