Пример #1
0
def test_training_from_pretrained_with_head_replace(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Train, reload, swap the head and verify that a copy mirrors the reloaded pipeline.

    After training, the pipeline is restored from the returned model path, its head
    is replaced by `TestHead` and a tokenizer setting is changed. A copy made via
    `_make_copy` must carry the new head, the same parameter counts, identical
    backbone weights and the modified tokenizer setting.
    """
    train_ds = pipeline_test.create_dataset(datasource_test)
    vocab_config = VocabularyConfiguration(sources=[train_ds])
    pipeline_test.create_vocabulary(vocab_config)

    trainer_config = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    results = pipeline_test.train(
        output=os.path.join(tmp_path, "output"),
        trainer=trainer_config,
        training=train_ds,
        quiet=True,
    )

    reloaded = Pipeline.from_pretrained(results.model_path)
    reloaded.set_head(TestHead)
    reloaded.config.tokenizer.max_nr_of_sentences = 3

    clone = reloaded._make_copy()
    assert isinstance(clone.head, TestHead)
    assert clone.num_parameters == reloaded.num_parameters
    assert clone.num_trainable_parameters == reloaded.num_trainable_parameters

    # Only the backbone weights are compared between the copy and its source
    clone_state = clone._model.state_dict()
    reference_state = reloaded._model.state_dict()
    backbone_keys = [key for key in clone_state if "backbone" in key]
    for key in backbone_keys:
        assert torch.all(torch.eq(clone_state[key], reference_state[key]))

    assert clone.backbone.featurizer.tokenizer.max_nr_of_sentences == 3
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke test: predict with a freshly configured pipeline, build the vocab, then train."""
    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="Test this NER machine")

    vocab_config = VocabularyConfiguration(sources=[training_data_source])
    pl.create_vocabulary(vocab_config)

    # The same data source is used for training and validation
    output_dir = tmp_path / "ner_experiment"
    trainer_config = TrainerConfiguration(**trainer_dict)
    pl.train(
        output=str(output_dir),
        trainer=trainer_config,
        training=training_data_source,
        validation=training_data_source,
    )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke test: predict once, create the vocabulary and run a training into `tmp_path / "lm"`."""
    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="my name is juan")

    vocab_config = VocabularyConfiguration(sources=[training_data_source])
    pl.create_vocabulary(vocab_config)

    # The same data source is used for training and validation
    output_dir = tmp_path / "lm"
    trainer_config = TrainerConfiguration(**trainer_dict)
    pl.train(
        output=str(output_dir),
        trainer=trainer_config,
        training=training_data_source,
        validation=training_data_source,
    )
Пример #4
0
def test_specific_vocab_config(pipeline, train_dataset, valid_dataset):
    """Check the per-namespace vocab sizes after a `Trainer` is built with `include_valid_data=True`."""
    Trainer(
        pipeline,
        train_dataset=train_dataset,
        valid_dataset=valid_dataset,
        vocab_config=VocabularyConfiguration(include_valid_data=True),
    )

    # Expected vocabulary size for each feature namespace
    expected_sizes = (
        (WordFeatures.namespace, 16),
        (CharFeatures.namespace, 19),
        (TransformersFeatures.namespace, 28996),
    )
    for namespace, size in expected_sizes:
        assert pipeline.vocab.get_vocab_size(namespace) == size
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke test: predict on a pair of records, create the vocabulary and train."""
    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(
        record1={"first_name": "Hans"},
        record2={"first_name": "Hansel"},
    )

    vocab_config = VocabularyConfiguration(sources=[training_data_source])
    pl.create_vocabulary(vocab_config)

    # The same data source is used for training and validation
    output_dir = tmp_path / "record_bimpm_experiment"
    trainer_config = TrainerConfiguration(**trainer_dict)
    pl.train(
        output=str(output_dir),
        trainer=trainer_config,
        training=training_data_source,
        validation=training_data_source,
    )
Пример #6
0
    def __init__(
        self,
        pipeline_config: dict,
        trainer_config: TrainerConfiguration,
        train_dataset: Dataset,
        valid_dataset: Dataset,
        vocab_config: Optional[Union[str, VocabularyConfiguration]] = "default",
        metrics: Union[None, str, List[str], Dict[str, str]] = None,
        name: Optional[str] = None,
        trainable: Optional[Callable] = None,
        silence: bool = False,
        **kwargs,
    ):
        """Set up the experiment and forward it to the parent constructor.

        Parameters
        ----------
        pipeline_config
            Pipeline configuration, stored as is.
        trainer_config
            Trainer configuration, stored as a dict via `asdict`.
        train_dataset
            Training data, saved to disk via `self._save_dataset_to_disk`.
        valid_dataset
            Validation data, saved to disk via `self._save_dataset_to_disk`.
        vocab_config
            The string "default" is replaced by `VocabularyConfiguration()`.
            A truthy value is stored as a dict via `asdict`, a falsy one as is.
        metrics
            Stored as is.
        name
            Experiment name. If falsy, a name containing the current date and
            time is generated.
        trainable
            Callable passed on as `run`. If falsy, `self._default_trainable` is used.
        silence
            Stored as is.
        **kwargs
            Forwarded to the parent constructor.

        Raises
        ------
        ValueError
            If `kwargs` contains a 'name', 'run' or 'config' key.
        """
        reserved_keys = ("name", "run", "config")
        if any(key in kwargs for key in reserved_keys):
            raise ValueError(
                "Your `kwargs` must not contain the 'name', 'run' or 'config' key. "
                "These are provided automatically by `TuneExperiment`."
            )

        # save created tmp dirs in this list to clean them up when object gets destroyed
        self._created_tmp_dirs: List[tempfile.TemporaryDirectory] = []

        self._train_dataset_path = self._save_dataset_to_disk(train_dataset)
        self._valid_dataset_path = self._save_dataset_to_disk(valid_dataset)

        self._pipeline_config = pipeline_config
        self._trainer_config = asdict(trainer_config)

        # Resolve the "default" placeholder without shadowing the parameter
        resolved_vocab_config: Optional[VocabularyConfiguration] = (
            VocabularyConfiguration() if vocab_config == "default" else vocab_config
        )
        self._vocab_config: Optional[Dict] = (
            asdict(resolved_vocab_config)
            if resolved_vocab_config
            else resolved_vocab_config
        )

        self.trainable = trainable or self._default_trainable

        self._silence = silence

        self._name = name or f"HPO on {datetime.now().strftime('%Y-%m-%d (%I-%M)')}"
        # Only set the W&B project name when the environment does not already define one
        if not os.environ.get("WANDB_PROJECT"):
            os.environ["WANDB_PROJECT"] = self._name

        self._metrics = metrics

        super().__init__(
            name=self._name, run=self.trainable, config=self.config, **kwargs
        )
Пример #7
0
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Predict, train and predict again with the pipeline reloaded from the output folder."""

    def example_input():
        # Build fresh objects on every call so each prediction gets its own input
        return {
            "text": "The most common audits were about waste and recycling",
            "entities": [
                {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
                {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
            ],
        }

    output_dir = str(tmp_path / "relation_classifier")

    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(**example_input())

    vocab_config = VocabularyConfiguration(sources=[training_data_source])
    pl.create_vocabulary(vocab_config)

    # The same data source is used for training and validation
    pl.train(
        output=output_dir,
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )

    pl_trained = Pipeline.from_pretrained(output_dir)
    pl_trained.predict(**example_input())
Пример #8
0
def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset):
    """Apart from a well specified training, this also tests the vocab creation!"""
    seed_everything(43)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]

    vocab_config = VocabularyConfiguration(max_vocab_size={"word": 50})
    trainer_config = TrainerConfiguration(
        batch_size=64,
        optimizer={
            "type": "adam",
            "lr": 0.01
        },
        max_epochs=5,
        default_root_dir=str(tmp_path),
        gpus=0,  # turn off gpus even if available
    )

    trainer = Trainer(
        pipeline=pl,
        train_dataset=train_ds,
        valid_dataset=valid_ds,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )

    trainer.fit(tmp_path / "output")

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83

    assert pl.num_trainable_parameters == 22070

    evaluation = trainer.test(valid_ds, batch_size=16)

    # Reminder: the value depends on the batch_size!
    assert evaluation["test_loss"] == pytest.approx(0.7404146790504456,
                                                    abs=0.003)

    # Test vocab from a pretrained file: the assertions must target the pipeline
    # loaded from the archive, not the in-memory one already checked above
    pl_loaded = Pipeline.from_pretrained(
        str(tmp_path / "output" / "model.tar.gz"))

    assert pl_loaded.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl_loaded.vocab.get_vocab_size(CharFeatures.namespace) == 83
Пример #9
0
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_dataset
):
    """Apart from a well specified training, this also tests the vocab creation!"""

    def assert_vocab_sizes(pipeline):
        # Expected sizes of the word and char namespaces
        assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 52
        assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 83

    # Seed every random number generator involved
    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds, valid_ds = train_valid_dataset[0], train_valid_dataset[1]
    trainer_config = TrainerConfiguration(**trainer_dict)
    vocab_config = VocabularyConfiguration(
        datasets=[train_ds], max_vocab_size={"word": 50}
    )

    output_dir = tmp_path / "output"
    pl.train(
        output=str(output_dir),
        trainer=trainer_config,
        training=train_ds,
        validation=valid_ds,
        vocab_config=vocab_config,
    )

    assert_vocab_sizes(pl)
    assert pl.num_trainable_parameters == 22070

    metrics = json.loads((output_dir / "metrics.json").read_text())

    # It may fail in some systems
    assert metrics["training_loss"] == pytest.approx(0.684, abs=0.003)

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output_dir / "model.tar.gz"))
    assert_vocab_sizes(pl)
Пример #10
0
    def _default_trainable(config, checkpoint_dir=None):
        """Default trainable: build everything from `config` and run one training.

        Steps performed with the provided `config`:
        - Optionally lower the "biome.text" logger to ERROR (`config["silence"]`)
        - Create the pipeline, the trainer configuration and, if given, the vocab configuration
        - Make sure a `TuneReportCallback` is among the trainer callbacks
        - Load both datasets from disk, turn them into instances and fit a `Trainer`
        """
        if config["silence"]:
            logging.getLogger("biome.text").setLevel(logging.ERROR)

        pipeline = Pipeline.from_config(config["pipeline_config"])
        trainer_config = TrainerConfiguration(**config["trainer_config"])

        vocab_config = config["vocab_config"]
        if vocab_config:
            vocab_config = VocabularyConfiguration(**vocab_config)

        # Normalize the configured callbacks to a list before searching them
        configured = trainer_config.callbacks
        callback_list = configured if isinstance(configured, list) else [configured]
        has_tune_report = any(
            isinstance(callback, TuneReportCallback) for callback in callback_list
        )
        if not has_tune_report:
            tune_callback = TuneReportCallback(metrics=config["metrics"])
            # With no callbacks configured the tune callback is set on its own,
            # otherwise it is appended to the existing ones
            trainer_config.callbacks = (
                tune_callback
                if trainer_config.callbacks is None
                else callback_list + [tune_callback]
            )

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
        train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
        valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

        Trainer(
            pipeline=pipeline,
            train_dataset=train_instances,
            valid_dataset=valid_instances,
            trainer_config=trainer_config,
            vocab_config=vocab_config,
        ).fit()
Пример #11
0
def test_vocab_config(tmp_path, pipeline_config, trainer_config, dataset):
    """Run a tune experiment with `max_vocab_size=1` and check the "word" vocab size of the best model."""
    experiment = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab_config=VocabularyConfiguration(max_vocab_size=1),
        name="test_vocab_config",
        local_dir=str(tmp_path),
    )

    analysis = tune.run(experiment)

    # Pick the run with the lowest validation loss and load its saved model
    best_logdir = Path(analysis.get_best_logdir("validation_loss", "min"))
    trained = Pipeline.from_pretrained(best_logdir / "output" / "model.tar.gz")

    assert trained.vocab.get_vocab_size("word") == 3
Пример #12
0
def test_training_with_logging(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Train quietly and check the written train log and the resulting logger levels."""
    train_ds = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[train_ds]))

    trainer_config = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    pipeline_test.train(
        output=output_dir, trainer=trainer_config, training=train_ds, quiet=True
    )

    # Every line of the train log must mention allennlp
    log_path = os.path.join(output_dir, "train.log")
    assert os.path.exists(log_path)
    with open(log_path) as train_log:
        for line in train_log:
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
Пример #13
0
def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    # The file extension decides how the pipeline is loaded: YAML files are read
    # as a configuration, anything else is treated as a pretrained pipeline
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ("yaml", "yml")
        else Pipeline.from_pretrained(pipeline_path)
    )

    # Validation and test data are optional and stay `None` when no path is given
    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    # The vocabulary is built from every dataset that was actually provided
    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    # The training data is stored under the "train" key; looking it up as
    # "training" raised a KeyError before the training could start
    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        training=datasets["train"],
        validation=datasets["validation"],
        test=datasets["test"],
    )
def test_train(tmp_path, pipeline_dict, trainer_dict, train_data_source):
    """Check the "transformers" vocab size before training and after reloading the trained model."""
    expected_vocab_size = 50265

    pipeline = Pipeline.from_config(pipeline_dict)
    trainer_config = TrainerConfiguration(**trainer_dict)
    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train_data_source]))

    assert (
        pipeline.backbone.vocab.get_vocab_size("transformers") == expected_vocab_size
    )

    pipeline.predict(text="test")

    output_dir = tmp_path / "output"
    pipeline.train(
        output=str(output_dir),
        trainer=trainer_config,
        training=train_data_source,
    )

    # test vocab from a pretrained file
    pipeline = Pipeline.from_pretrained(str(output_dir / "model.tar.gz"))

    assert (
        pipeline.backbone.vocab.get_vocab_size("transformers") == expected_vocab_size
    )
Пример #15
0
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    """Check that the experiment config allows restoring the vocab and both datasets."""
    reference_pl = Pipeline.from_config(pipeline_config)
    vocab = VocabularyConfiguration(datasets=[dataset]).build_vocab(
        pipeline=reference_pl
    )

    experiment = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab=vocab,
    )
    exp_config = experiment.config

    # Pipeline rebuilt purely from what the experiment stored in its config
    restored_pl = Pipeline.from_config(
        exp_config["pipeline_config"], exp_config["vocab_path"]
    )

    reference_pl._model.extend_vocabulary(vocab)
    reference_vocab = reference_pl.backbone.vocab
    restored_vocab = restored_pl.backbone.vocab
    assert reference_vocab._index_to_token == restored_vocab._index_to_token
    assert reference_vocab._token_to_index == restored_vocab._token_to_index

    # Both stored datasets must equal the dataset handed to the experiment
    for path_key in ("train_dataset_path", "valid_dataset_path"):
        assert dataset[:] == Dataset.load_from_disk(exp_config[path_key])[:]
Пример #16
0
def test_training_with_data_bucketing(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Train twice with data bucketing, swapping the lazy and non-lazy dataset roles."""
    lazy_ds = pipeline_test.create_dataset(datasource_test, lazy=True)
    non_lazy_ds = pipeline_test.create_dataset(datasource_test)

    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[lazy_ds]))

    trainer_config = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )

    # First run: lazy training / non-lazy validation; second run: the reverse
    dataset_roles = ((lazy_ds, non_lazy_ds), (non_lazy_ds, lazy_ds))
    for train_ds, valid_ds in dataset_roles:
        pipeline_test.train(
            output=os.path.join(tmp_path, "output"),
            trainer=trainer_config,
            training=train_ds,
            validation=valid_ds,
        )