def test_transformers_and_word(tmp_path, pipeline_dict, trainer_dict, train_dataset):
    """Train a transformer pipeline extended with a word feature layer.

    Deletes the BERT pooler from the head config, adds a word feature, trains,
    and checks the fixed vocabulary sizes before and after reloading the model.
    """
    # Drop the BERT pooler and register an extra word-level feature.
    del pipeline_dict["head"]["pooler"]
    word_feature = {"embedding_dim": 16, "lowercase_tokens": True}
    pipeline_dict["features"].update({"word": word_feature})

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="test")

    output_dir = tmp_path / "output"
    pipeline.train(
        output=str(output_dir),
        trainer=TrainerConfiguration(**trainer_dict),
        training=train_dataset,
    )

    # Check a fixed vocabulary size for the transformer and the word feature.
    assert pipeline.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pipeline.backbone.vocab.get_vocab_size("word") == 273

    # The same sizes must survive a round trip through the model archive.
    pipeline = Pipeline.from_pretrained(str(output_dir / "model.tar.gz"))
    assert pipeline.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pipeline.backbone.vocab.get_vocab_size("word") == 273
def create_trainer_for_finding_lr(
    pipeline: Pipeline,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    pipeline
        The pipeline with the model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    from typing import cast

    prepare_environment(Params({}))

    # Lazy datasets have no `index_with`; only index when supported.
    if hasattr(training_data, "index_with"):
        training_data.index_with(pipeline.backbone.vocab)

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer())
    )
    training_data_loader = create_dataloader(
        training_data, trainer_config.batch_size, trainer_config.data_bucketing
    )

    # `Trainer.from_params` is typed as the base `Trainer`; cast to match the
    # declared return type (consistent with the sibling version of this helper).
    return cast(
        "GradientDescentTrainer",
        Trainer.from_params(
            model=pipeline._model,
            data_loader=training_data_loader,
            params=trainer_params,
            serialization_dir=None,
        ),
    )
def create_trainer_for_finding_lr(
    model: PipelineModel,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    model
        The underlying model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    allennlp_trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer())
    )
    data_loader = create_dataloader(
        training_data, trainer_config.batch_size, trainer_config.data_bucketing
    )

    trainer = Trainer.from_params(
        model=model,
        data_loader=data_loader,
        params=allennlp_trainer_params,
        serialization_dir=None,
    )
    # `from_params` is typed as the base `Trainer`; narrow to the actual type.
    return cast("GradientDescentTrainer", trainer)
def train(
    pipeline_path: str,
    output: str,
    trainer_config: str,
    train_data: str,
    valid_data: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    # A YAML/YML extension means a pipeline config; anything else is a model archive.
    suffix = os.path.splitext(pipeline_path)[1][1:].lower()
    if suffix in ["yaml", "yml"]:
        pipeline = Pipeline.from_yaml(pipeline_path)
    else:
        pipeline = Pipeline.from_pretrained(pipeline_path)

    train_dataset = dataset_from_path(train_data)
    valid_dataset = dataset_from_path(valid_data) if valid_data else None

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_dataset,
        valid_dataset=valid_dataset,
        trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)),
    )
    trainer.fit(output_dir=output)
def test_training_from_pretrained_with_head_replace(pipeline: Pipeline, dataset: Dataset, tmp_path: str):
    """Train, swap the head, and check the copy keeps backbone weights and config."""
    configuration = TrainerConfiguration(
        data_bucketing=True,
        batch_size=2,
        num_epochs=5,
        cuda_device=-1,
    )
    output_dir = os.path.join(tmp_path, "output")
    pipeline.train(output=output_dir, trainer=configuration, training=dataset, quiet=True)

    pipeline.set_head(TestHead)
    pipeline.config.tokenizer_config.max_nr_of_sentences = 3
    copied = pipeline.copy()

    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == pipeline.num_parameters
    assert copied.num_trainable_parameters == pipeline.num_trainable_parameters

    # Backbone weights must be carried over unchanged into the copy.
    original_state = pipeline._model.state_dict()
    for name, tensor in copied._model.state_dict().items():
        if "backbone" in name:
            assert torch.all(torch.eq(tensor, original_state[name]))

    assert copied.backbone.featurizer.tokenizer.config.max_nr_of_sentences == 3
def test_training_from_pretrained_with_head_replace(pipeline, dataset, tmp_path):
    """Train, replace the head, and verify the copied pipeline keeps backbone weights."""
    trainer = Trainer(
        pipeline,
        train_dataset=dataset,
        trainer_config=TrainerConfiguration(batch_size=2, max_epochs=5, gpus=0),
    )
    trainer.fit(tmp_path / "output")

    pipeline.set_head(TestHead)
    pipeline.config.tokenizer_config.max_nr_of_sentences = 3
    copied = pipeline.copy()

    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == pipeline.num_parameters
    assert copied.num_trainable_parameters == pipeline.num_trainable_parameters

    # Backbone weights must be carried over unchanged into the copy.
    original_state = pipeline._model.state_dict()
    for name, tensor in copied._model.state_dict().items():
        if "backbone" in name:
            assert torch.all(torch.eq(tensor, original_state[name]))

    assert copied.backbone.featurizer.tokenizer.config.max_nr_of_sentences == 3
def test_training_from_pretrained_with_head_replace(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Train from a datasource, reload the archive, swap the head, and verify the copy."""
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))

    configuration = TrainerConfiguration(data_bucketing=True, batch_size=2, num_epochs=5)
    results = pipeline_test.train(
        output=os.path.join(tmp_path, "output"),
        trainer=configuration,
        training=training,
        quiet=True,
    )

    trained = Pipeline.from_pretrained(results.model_path)
    trained.set_head(TestHead)
    trained.config.tokenizer.max_nr_of_sentences = 3
    copied = trained._make_copy()

    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == trained.num_parameters
    assert copied.num_trainable_parameters == trained.num_trainable_parameters

    # Backbone weights must be carried over unchanged into the copy.
    original_state = trained._model.state_dict()
    for name, tensor in copied._model.state_dict().items():
        if "backbone" in name:
            assert torch.all(torch.eq(tensor, original_state[name]))

    assert copied.backbone.featurizer.tokenizer.max_nr_of_sentences == 3
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing a classifier made from scratch"""
    pipeline = Pipeline.from_config(pipeline_dict)

    # Prediction with entity spans must work before any training.
    entities = [
        {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
        {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
    ]
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=entities,
    )

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

    # test loading
    Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
def trainer_config() -> TrainerConfiguration:
    """A single-epoch AdamW trainer configuration running on CPU."""
    optimizer = {"type": "adamw", "lr": 0.002}
    return TrainerConfiguration(max_epochs=1, optimizer=optimizer, gpus=0)
def _default_trainable(config, checkpoint_dir=None):
    """A default trainable function used by `tune.run`

    It performs the most straight forward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Execute the training
    """
    if config["silence"]:
        logging.getLogger("biome.text").setLevel(logging.ERROR)

    pipeline = Pipeline.from_config(config["pipeline_config"])
    trainer_config = TrainerConfiguration(**config["trainer_config"])

    vocab_config = config["vocab_config"]
    if vocab_config:
        vocab_config = VocabularyConfiguration(**vocab_config)

    # Ensure a TuneReportCallback is configured, injecting one if missing.
    configured = trainer_config.callbacks
    callback_list = configured if isinstance(configured, list) else [configured]
    has_tune_callback = any(
        isinstance(cb, TuneReportCallback) for cb in callback_list
    )
    if not has_tune_callback:
        tune_callback = TuneReportCallback(metrics=config["metrics"])
        if trainer_config.callbacks is None:
            # No callbacks configured at all: assign the single callback directly.
            trainer_config.callbacks = tune_callback
        else:
            trainer_config.callbacks = callback_list + [tune_callback]

    train_instances = Dataset.load_from_disk(
        config["train_dataset_path"]
    ).to_instances(pipeline=pipeline, disable_tqdm=True)
    valid_instances = Dataset.load_from_disk(
        config["valid_dataset_path"]
    ).to_instances(pipeline=pipeline, disable_tqdm=True)

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_instances,
        valid_dataset=valid_instances,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )
    trainer.fit()
def trainer_config() -> TrainerConfiguration:
    """A two-epoch Adam (amsgrad) trainer configuration running on CPU."""
    optimizer = {"type": "adam", "amsgrad": True, "lr": 0.002}
    return TrainerConfiguration(max_epochs=2, optimizer=optimizer, gpus=0)
def trainer_config(tmp_path) -> TrainerConfiguration:
    """A one-epoch CPU trainer configuration rooted at the test's tmp dir."""
    optimizer = {"type": "adam", "lr": 0.0001}
    return TrainerConfiguration(
        batch_size=16,
        max_epochs=1,
        optimizer=optimizer,
        gpus=0,
        default_root_dir=str(tmp_path),
    )
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""
    pipeline = Pipeline.from_config(pipeline_dict)

    # Prediction on a record pair must work before any training.
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})

    output_dir = tmp_path / "record_bimpm_experiment"
    trainer = TrainerConfiguration(**trainer_dict)
    pipeline.train(
        output=str(output_dir),
        trainer=trainer,
        training=training_dataset,
        validation=training_dataset,
    )
def test_mlflow_logger():
    """Check that MlflowLogger records tags, params, metrics and the model artifact."""
    logger = MlflowLogger(
        experiment_name="test-experiment", run_name="test_run", tag1="my-tag"
    )

    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification, labels=["A", "B"]),
        )
    )
    trainer = TrainerConfiguration()

    # Simulate a full training run: init, per-epoch metrics, end.
    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(0, 10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})
    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run
    # Tags
    assert "test_run" == run.data.tags[mlflow_tags.MLFLOW_RUN_NAME]
    assert "my-tag" == run.data.tags["tag1"]
    # Parameters (fixed local-name typo: was `expected_parmams`)
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type": "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "trainer.num_serialized_models_to_keep": "1",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert expected_params == run.data.params
    # Artifacts: the model directory must have been uploaded to the run.
    assert os.path.basename(model_path) in os.listdir(
        urlparse(run.info.artifact_uri).path
    )
    # Metrics
    for metric in metrics:
        assert (
            metric in run.data.metrics
            and run.data.metrics[metric] == metrics[metric]
        )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke-test prediction, vocabulary creation and training for the LM pipeline."""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[training_data_source]))

    trainer = TrainerConfiguration(**trainer_dict)
    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=trainer,
        training=training_data_source,
        validation=training_data_source,
    )
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""
    pipeline = Pipeline.from_config(pipeline_dict)

    # Prediction must work before any training.
    pipeline.predict(text="my name is juan")

    trainer = TrainerConfiguration(**trainer_dict)
    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=trainer,
        training=training_dataset,
        validation=training_dataset,
    )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke-test prediction, vocabulary creation and training for the NER pipeline."""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="Test this NER machine")

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[training_data_source]))

    trainer = TrainerConfiguration(**trainer_dict)
    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=trainer,
        training=training_data_source,
        validation=training_data_source,
    )
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Check the NER pipeline's outputs/labels and run a short training."""
    pipeline = Pipeline.from_config(pipeline_dict)

    # BILOU tag set derived from the single "NER" span label.
    assert pipeline.output == ["entities", "tags"]
    assert pipeline.head.span_labels == ["NER"]
    assert pipeline.head.labels == ["B-NER", "I-NER", "U-NER", "L-NER", "O"]

    trainer = TrainerConfiguration(**trainer_dict)
    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=trainer,
        training=training_dataset,
    )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke-test prediction, vocabulary creation and training for record bimpm."""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[training_data_source]))

    trainer = TrainerConfiguration(**trainer_dict)
    pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=trainer,
        training=training_data_source,
        validation=training_data_source,
    )
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    """Predictions must match after loading the trained model archive."""
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=dataset,
        trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0),
    )
    trainer.fit(output_path)

    prediction = pipeline.predict("a test")
    reloaded = Pipeline.from_pretrained(output_path / "model.tar.gz")
    prediction_loaded = reloaded.predict("a test")

    assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"])
def test_use_amp(dataset, pipeline, tmp_path, capsys):
    """Training with use_amp=True must be reflected in the logged configuration."""
    trainer_config = TrainerConfiguration(num_epochs=1, batch_size=2, use_amp=True)
    pipeline.train(
        output=str(tmp_path / "test_use_amp_output"),
        training=dataset,
        trainer=trainer_config,
    )
    # The trainer echoes its configuration to stderr.
    assert "use_amp = True" in capsys.readouterr().err
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    """Predictions must match after loading the trained output directory."""
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(num_epochs=1, batch_size=2, cuda_device=-1)
    pipeline.train(output=str(output_path), training=dataset, trainer=trainer_config)

    prediction = pipeline.predict("a test")
    reloaded = Pipeline.from_pretrained(str(output_path))
    prediction_loaded = reloaded.predict("a test")

    assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"])
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Predict, build the vocab, train, and predict again from the reloaded model."""
    pipeline = Pipeline.from_config(pipeline_dict)

    # The same text/entities pair is used before and after training.
    entities = [
        {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
        {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
    ]
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=entities,
    )

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[training_data_source]))
    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )

    # The trained model must be loadable and able to predict again.
    pl_trained = Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
    pl_trained.predict(
        text="The most common audits were about waste and recycling",
        entities=entities,
    )
def test_training_with_logging(pipeline: Pipeline, dataset: Dataset, tmp_path: str):
    """Training must write a train.log routed through the allennlp logger."""
    configuration = TrainerConfiguration(data_bucketing=True, batch_size=2, num_epochs=5)
    output_dir = os.path.join(tmp_path, "output")
    pipeline.train(output=output_dir, trainer=configuration, training=dataset, quiet=True)

    log_path = os.path.join(output_dir, "train.log")
    assert os.path.exists(log_path)
    with open(log_path) as train_log:
        # Skip the first three lines, then every line must come from allennlp.
        for line in train_log.readlines()[3:]:
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_dataset
):
    """Apart from a well specified training, this also tests the vocab creation!"""
    # Fix all RNG seeds so the training loss below is reproducible.
    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]

    vocab_config = VocabularyConfiguration(
        datasets=[train_ds], max_vocab_size={"word": 50}
    )
    output = tmp_path / "output"
    pl.train(
        output=str(output),
        trainer=TrainerConfiguration(**trainer_dict),
        training=train_ds,
        validation=valid_ds,
        vocab_config=vocab_config,
    )

    # Vocab sizes reflect the capped word vocabulary plus special tokens.
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
    assert pl.num_trainable_parameters == 22070

    with (output / "metrics.json").open() as file:
        metrics = json.load(file)
    # It may fail in some systems
    assert metrics["training_loss"] == pytest.approx(0.684, abs=0.003)

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
def _default_trainable(config, reporter):
    """A default trainable function used by `tune.run`

    It performs the most straight forward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up a MLFlow and WandB logger
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Create the vocab if necessary
    - Execute the training
    """
    pipeline = Pipeline.from_config(
        config["pipeline_config"], vocab_path=config["vocab_path"]
    )
    trainer_config = TrainerConfiguration(
        **helpers.sanitize_for_params(config["trainer_config"])
    )

    mlflow.set_tracking_uri(config["mlflow_tracking_uri"])

    train_ds = Dataset.load_from_disk(config["train_dataset_path"])
    valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])

    train_loggers = [
        MlflowLogger(
            experiment_name=config["name"],
            run_name=reporter.trial_name,
            ray_trial_id=reporter.trial_id,
            ray_logdir=reporter.logdir,
        ),
        TuneMetricsLogger(),
    ]
    # WandB goes first in the logger list, but only when available and logged in.
    if is_wandb_installed_and_logged_in():
        train_loggers.insert(0, WandBLogger(project_name=config["name"]))

    pipeline.train(
        output="training",
        training=train_ds,
        validation=valid_ds,
        trainer=trainer_config,
        loggers=train_loggers,
        # A provided vocab path means the vocab is already built.
        vocab_config=None if config["vocab_path"] else "default",
    )
def test_pure_transformers(tmp_path, pipeline_dict, trainer_dict, train_dataset):
    """Testing a Transformer training process and a model load"""
    pipeline = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)

    # Check a fixed vocabulary size for the model.
    assert pipeline.backbone.vocab.get_vocab_size("transformers") == 28996

    pipeline.predict(text="test")

    output = tmp_path / "output"
    pipeline.train(output=str(output), trainer=trainer, training=train_dataset)

    # The vocabulary size must survive a round trip through the model archive.
    pipeline = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pipeline.backbone.vocab.get_vocab_size("transformers") == 28996
def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path):
    """Train with a pretrained word-vector weights file, then load without it."""
    pipeline = Pipeline.from_config(pipeline_config)
    output = tmp_path / "pretrained_word_vector_output"
    pipeline.train(
        output=str(output),
        training=dataset,
        trainer=TrainerConfiguration(num_epochs=1, cuda_device=-1),
    )

    # The embedder must produce the vectors from the weights file.
    instance = pipeline.head.featurize("test")
    instance.index_fields(pipeline.vocab)
    embedded = pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0)
    assert_allclose(embedded, torch.tensor([[0.66, 0.33]]))

    # Loading a pretrained model without the weights file should work
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    assert isinstance(Pipeline.from_pretrained(str(output / "model.tar.gz")), Pipeline)
def test_training_with_logging(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Training must produce a train.log made of allennlp log lines."""
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))
    configuration = TrainerConfiguration(data_bucketing=True, batch_size=2, num_epochs=5)

    output_dir = os.path.join(tmp_path, "output")
    pipeline_test.train(
        output=output_dir, trainer=configuration, training=training, quiet=True
    )

    log_path = os.path.join(output_dir, "train.log")
    assert os.path.exists(log_path)
    with open(log_path) as train_log:
        for line in train_log.readlines():
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
def test_training_with_data_bucketing(pipeline: Pipeline, dataset: Dataset, tmp_path: str):
    """Data bucketing must work with both non-lazy and lazy datasets."""
    configuration = TrainerConfiguration(data_bucketing=True, batch_size=2, num_epochs=5)
    output = os.path.join(tmp_path, "output")

    # Train a fresh copy of the pipeline once per laziness mode.
    for lazy in (False, True):
        pipeline.copy().train(
            output=output,
            trainer=configuration,
            training=dataset,
            validation=dataset,
            lazy=lazy,
        )