def find_lr(
    self,
    trainer_config: TrainerConfiguration,
    find_lr_config: FindLRConfiguration,
    training_data: Union[DataSource, InstancesDataset],
):
    """Returns a learning rate scan on the model.

    It increases the learning rate step by step while recording the losses.
    For a guide on how to select the learning rate please refer to this excellent
    [blog post](https://towardsdatascience.com/estimating-optimal-learning-rate-for-a-deep-neural-network-ce32f2556ce0)

    Parameters
    ----------
    trainer_config
        A trainer configuration
    find_lr_config
        A configuration for finding the learning rate
    training_data
        The training data

    Returns
    -------
    (learning_rates, losses)
        Returns a list of learning rates and corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate
    """
    from biome.text._helpers import create_trainer_for_finding_lr

    # The original pipeline remains unchanged
    find_lr_pipeline = self._make_copy()

    if vocabulary.is_empty(
        find_lr_pipeline.backbone.vocab, self.config.features.keys
    ):
        raise EmptyVocabError(
            "Found an empty vocabulary. "
            "You probably forgot to create a vocabulary with '.create_vocabulary()'."
        )

    if isinstance(training_data, DataSource):
        training_data = find_lr_pipeline.create_dataset(training_data)

    trainer = create_trainer_for_finding_lr(
        pipeline=find_lr_pipeline,
        trainer_config=trainer_config,
        training_data=training_data,
    )

    learning_rates, losses = search_learning_rate(
        trainer=trainer,
        start_lr=find_lr_config.start_lr,
        end_lr=find_lr_config.end_lr,
        num_batches=find_lr_config.num_batches,
        linear_steps=find_lr_config.linear_steps,
        stopping_factor=find_lr_config.stopping_factor,
    )

    return learning_rates, losses
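# --- A minimal usage sketch for `find_lr` above (not part of the library itself). ---
# It assumes a pipeline whose vocabulary has already been created. The import paths,
# the YAML/CSV file names, the optimizer settings, and the `sources` keyword of
# VocabularyConfiguration are illustrative assumptions; only the `find_lr` keyword
# arguments mirror the signature shown above.

from biome.text import Pipeline
from biome.text.configuration import (
    FindLRConfiguration,
    TrainerConfiguration,
    VocabularyConfiguration,
)
from biome.text.data import DataSource

pipeline = Pipeline.from_yaml("text_classifier.yaml")  # hypothetical pipeline config
train_source = DataSource(source="train.csv")  # hypothetical training data

# `find_lr` raises EmptyVocabError without a vocabulary; the argument shape of
# `create_vocabulary` is assumed here.
pipeline.create_vocabulary(VocabularyConfiguration(sources=[train_source]))

learning_rates, losses = pipeline.find_lr(
    trainer_config=TrainerConfiguration(optimizer={"type": "adam", "lr": 1e-5}),
    find_lr_config=FindLRConfiguration(start_lr=1e-5, end_lr=1e-1, num_batches=100),
    training_data=train_source,
)

# A common heuristic: pick a learning rate roughly one order of magnitude below
# the point where the recorded losses start to increase sharply.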
def _setup_for_training(self):
    """Create vocab, configure default loggers/callbacks, create optimizer/lr scheduler, set up best metrics"""
    # create vocab
    if self._vocab_config is not None:
        vocab_datasets = [self._train_instances]
        if (
            self._valid_instances is not None
            and self._vocab_config.include_valid_data
        ):
            vocab_datasets += [self._valid_instances]
        self._pipeline.create_vocab(vocab_datasets, config=self._vocab_config)

    # Check for an empty vocab
    if vocabulary.is_empty(
        self._pipeline.vocab, self._pipeline.config.features.configured_namespaces
    ):
        raise EmptyVocabError(
            "All your features need a non-empty vocabulary for training!"
        )

    # we give some special attention to these loggers/callbacks
    self._wandb_logger: Optional[WandbLogger] = None
    self._model_checkpoint: Optional[ModelCheckpoint] = None

    # add default callbacks/loggers
    self._trainer_config.callbacks = self._add_default_callbacks()
    if self._trainer_config.logger is not False:
        self._trainer_config.logger = self._add_default_loggers()

    # create optimizer, has to come AFTER creating the vocab!
    self._pipeline.model.optimizer = Optimizer.from_params(
        Params(
            {
                "model_parameters": self._pipeline.model.named_parameters(),
                **self._trainer_config.optimizer,
            }
        )
    )

    # create lr scheduler, has to come AFTER creating the optimizer!
    if not (
        self._trainer_config.warmup_steps == 0
        and self._trainer_config.lr_decay is None
    ):
        self._pipeline.model.lr_scheduler = self._create_lr_scheduler()
    else:
        self._pipeline.model.lr_scheduler = None

    # set monitor and mode for best validation metrics
    self._pipeline.model.monitor = self._trainer_config.monitor
    self._pipeline.model.monitor_mode = self._trainer_config.monitor_mode
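# --- A hedged configuration sketch for the setup above (not library code). ---
# `_setup_for_training` passes `trainer_config.optimizer` to AllenNLP's
# `Optimizer.from_params`, and only builds an lr scheduler when `warmup_steps`
# is non-zero or `lr_decay` is set. The constructor keywords below simply mirror
# the attributes read by the method; the concrete values are assumptions.

from biome.text.configuration import TrainerConfiguration

trainer_config = TrainerConfiguration(
    optimizer={"type": "adamw", "lr": 2e-5, "weight_decay": 0.01},
    warmup_steps=100,  # > 0, so an lr scheduler will be created
    lr_decay="linear",  # illustrative value; together with warmup_steps it defines the schedule
    monitor="validation_loss",  # metric tracked for the best model
    monitor_mode="min",  # "min" for losses, "max" for accuracy-like metrics
)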
def train(
    self,
    output: str,
    training: Union[DataSource, InstancesDataset],
    trainer: Optional[TrainerConfiguration] = None,
    validation: Optional[Union[DataSource, InstancesDataset]] = None,
    test: Optional[Union[DataSource, InstancesDataset]] = None,
    extend_vocab: Optional[VocabularyConfiguration] = None,
    loggers: List[BaseTrainLogger] = None,
    restore: bool = False,
    quiet: bool = False,
) -> TrainingResults:
    """Launches a training run with the specified configurations and data sources

    Parameters
    ----------
    output:
        The experiment output path
    training:
        The training DataSource
    trainer:
        The trainer configuration (optional)
    validation:
        The validation DataSource (optional)
    test:
        The test DataSource (optional)
    extend_vocab:
        Extends the vocabulary tokens with the provided VocabularyConfiguration
    loggers:
        A list of loggers that execute a callback before the training, after each epoch,
        and at the end of the training (see `biome.text.logger.MlflowLogger`, for example)
    restore:
        If enabled, tries to read previous training status from the `output` folder
        and continues the training process
    quiet:
        If enabled, disables most logging messages keeping only warning and error messages.
        In any case, all logging info will be stored into a file at ${output}/train.log

    Returns
    -------
    training_results
        Training results including the generated model path and the related metrics
    """
    if extend_vocab is not None and isinstance(self, _BlankPipeline):
        raise ActionNotSupportedError(
            "If you want to customize pipeline vocab, please use the `create_vocabulary()` method instead"
        )

    trainer = trainer or TrainerConfiguration()

    try:
        if not restore and os.path.isdir(output):
            shutil.rmtree(output)

        self.__configure_training_logging(output, quiet)

        # The original pipeline remains unchanged
        train_pipeline = self._make_copy()

        vocab = None
        if restore:
            vocab = vocabulary.load_vocabulary(os.path.join(output, "vocabulary"))
        if extend_vocab is not None and not vocab:
            vocab = train_pipeline._extend_vocabulary(
                train_pipeline.backbone.vocab, vocab_config=extend_vocab
            )
        if vocab:
            train_pipeline._set_vocab(vocab)

        if vocabulary.is_empty(
            train_pipeline.backbone.vocab, self.config.features.keys
        ):
            raise EmptyVocabError(
                "Found an empty vocabulary. "
                "You probably forgot to create a vocabulary with '.create_vocabulary()'."
            )

        from ._helpers import PipelineTrainer

        datasets = {"training": training, "validation": validation, "test": test}
        for name, dataset in datasets.items():
            if isinstance(dataset, DataSource):
                datasets[name] = train_pipeline.create_dataset(dataset)

        loggers = loggers or []
        add_default_wandb_logger_if_needed(loggers)

        pipeline_trainer = PipelineTrainer(
            train_pipeline,
            trainer_config=trainer,
            output_dir=output,
            epoch_callbacks=loggers,
            **datasets,
        )

        for logger in loggers:
            try:
                logger.init_train(
                    pipeline=train_pipeline,
                    trainer_configuration=trainer,
                    **datasets,
                )
            except Exception as e:
                self.__LOGGER.warning(
                    "Logger %s failed on init_train: %s", logger, e
                )

        model_path, metrics = pipeline_trainer.train()
        train_results = TrainingResults(model_path, metrics)

        for logger in loggers:
            try:
                logger.end_train(train_results)
            except Exception as e:
                self.__LOGGER.warning(
                    "Logger %s failed on end_train: %s", logger, e
                )

        return train_results

    finally:
        self.__restore_training_logging()
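# --- A minimal end-to-end training sketch for `train` above (not part of the library). ---
# The data source paths, the trainer settings, and the `sources` keyword of
# VocabularyConfiguration are illustrative assumptions; the keyword arguments of
# `train` itself mirror the signature shown above.

from biome.text import Pipeline
from biome.text.configuration import TrainerConfiguration, VocabularyConfiguration
from biome.text.data import DataSource

pipeline = Pipeline.from_yaml("text_classifier.yaml")  # hypothetical pipeline config
train_source = DataSource(source="train.csv")  # hypothetical data sources
valid_source = DataSource(source="valid.csv")

# Blank pipelines need a vocabulary first (see the EmptyVocabError raised above).
pipeline.create_vocabulary(VocabularyConfiguration(sources=[train_source]))

results = pipeline.train(
    output="runs/experiment_01",
    training=train_source,
    validation=valid_source,
    trainer=TrainerConfiguration(optimizer={"type": "adam", "lr": 1e-3}),
    restore=False,
    quiet=True,
)
# `results` bundles the serialized model path and the final metrics.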
def _prepare_vocab(
    self,
    vocabulary_folder: Optional[str] = None,
    vocab_config: Optional[Union[str, VocabularyConfiguration]] = "default",
    training_data: Optional[Dataset] = None,
    lazy: bool = False,
):
    """Prepare and set the vocab for a training or learning rate scan.

    Parameters
    ----------
    vocabulary_folder
        If specified, load the vocab from this folder
    vocab_config
        A `VocabularyConfiguration` to create/extend the pipeline's vocabulary if necessary.
        If 'default' (str), we will use the default configuration
        `VocabularyConfiguration(datasets=[training_data])`.
        If None, we will leave the pipeline's vocabulary untouched.
    training_data
        The training data in case we need to construct the default config
    lazy
        If true, dataset instances are lazily loaded from disk,
        otherwise they are loaded and kept in memory.
    """
    # The transformers feature comes with its own vocab,
    # no need to prepare anything if it is the only feature
    if self.config.features.configured_namespaces == [
        TransformersFeatures.namespace
    ]:
        return

    # If the vocab is empty, we assume this is an untrained pipeline
    # and we want to raise an error if the weights file is not found.
    # Extending the vocab with a non-existent weights file only throws a warning.
    try:
        assert is_url_or_existing_file(Path(self.config.features.word.weights_file))
    except AssertionError:
        if vocabulary.is_empty(self.vocab, [WordFeatures.namespace]):
            raise FileNotFoundError(
                f"Cannot find the weights file {self.config.features.word.weights_file}"
            )
    # no word feature, or weights_file is None
    except (AttributeError, TypeError):
        pass

    if vocabulary_folder is not None:
        self._model.extend_vocabulary(Vocabulary.from_files(vocabulary_folder))
        vocab_config = None

    vocab_config = (
        VocabularyConfiguration(datasets=[training_data])
        if vocab_config == "default"
        else vocab_config
    )
    if vocab_config is not None:
        vocab = vocab_config.build_vocab(pipeline=self, lazy=lazy)
        self._model.extend_vocabulary(vocab)

    if vocabulary.is_empty(self.vocab, self.config.features.configured_namespaces):
        raise EmptyVocabError(
            "All your features need a non-empty vocabulary for training!"
        )
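# --- Hedged sketches of how a caller might drive `_prepare_vocab` above (not library code). ---
# The import paths and the `Dataset.from_csv` helper are assumptions; the
# `_prepare_vocab` arguments themselves mirror the signature shown above.

from biome.text import Dataset, Pipeline
from biome.text.configuration import VocabularyConfiguration

pipeline = Pipeline.from_yaml("text_classifier.yaml")  # hypothetical pipeline config
train_dataset = Dataset.from_csv("train.csv")  # hypothetical dataset

# "default": build the vocabulary from the training data
pipeline._prepare_vocab(training_data=train_dataset)

# Explicit configuration, loading instances lazily from disk
pipeline._prepare_vocab(
    vocab_config=VocabularyConfiguration(datasets=[train_dataset]),
    lazy=True,
)

# Load a previously serialized vocabulary instead (skips vocab creation)
pipeline._prepare_vocab(vocabulary_folder="output/vocabulary")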