def test_fail_using_reserved_words():
    """Merging splits works on the raw data but raises ``TypeError`` after a nested map.

    The dataset carries a ``_type`` column next to a column of nested dicts.
    Re-joining its train/test splits succeeds; after mapping an additional
    nested ``new_field`` onto it, re-joining the splits is expected to fail.
    """
    # Reported upstream and may get resolved at some point:
    # https://github.com/huggingface/datasets/issues/1110
    n_rows = 100
    nested_column = [{"a": 1, "b": "two"} for _ in range(n_rows)]
    type_column = ["whatever"] * n_rows
    original = Dataset.from_dict({"a": nested_column, "_type": type_column})

    parts = original.train_test_split()
    merged = Dataset.from_datasets(parts.values())
    assert len(merged) == len(original)

    with_new_field = original.map(
        lambda example: {
            "new_field": {
                "c": str(example["a"]["a"]),
                "d": f"this {example['a']['b']}",
            }
        }
    )
    parts = with_new_field.train_test_split()
    with pytest.raises(TypeError):
        Dataset.from_datasets(parts.values())
def dataset() -> Dataset:
    """Return a four-row dataset with a ``text`` column and a string ``label`` column."""
    texts = ["Test", "this", "shaight", "!"]
    labels = ["0", "1", "0", "0"]
    return Dataset.from_dict({"text": texts, "label": labels})
def test_extending_vocab_with_weights_file(
    pipeline_config, dataset, dataset2, capsys, caplog
):
    """Extend the vocab with the weights file present, then again after deleting it.

    While the weights file exists, the embedding of "this" must match the
    expected pretrained vector. After the file is removed, the last captured
    log record must be the embedding module's "cannot locate" message.
    """
    pipeline = Pipeline.from_config(pipeline_config)

    # Initial vocab creation.
    initial_instances = dataset.to_instances(pipeline)
    pipeline.create_vocab([initial_instances])

    # With the weights file still available, extending the vocab should apply
    # the pretrained weights.
    extension_instances = dataset2.to_instances(pipeline)
    pipeline.create_vocab([extension_instances])

    featurized = pipeline.head.featurize("this")
    featurized.index_fields(pipeline.vocab)
    embedded = pipeline.backbone.embedder(featurized.as_tensor_dict()["text"])
    assert_allclose(embedded, torch.tensor([[0.25, 0.75]]))

    # With the weights file deleted, extending the vocab should log a warning.
    weights_path = Path(pipeline_config["features"]["word"]["weights_file"])
    weights_path.unlink()
    extra = Dataset.from_dict({"text": ["that"], "label": ["good"]})
    pipeline.create_vocab([extra.to_instances(pipeline)])

    logger_name, level, message = caplog.record_tuples[-1]
    assert logger_name == "allennlp.modules.token_embedders.embedding"
    # 30 is the numeric value of logging.WARNING.
    assert level == 30
    assert (
        "Embedding at model_path, "
        "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file."
        in message
    )
def test_extending_vocab_with_weights_file(
    pipeline_config, dataset, dataset2, deactivate_pipeline_trainer, caplog
):
    """Train repeatedly to extend the vocab, with and without the weights file.

    While the weights file exists, the embedding of "this" must match the
    expected pretrained vector. After the file is removed, the first captured
    log record must come from the ``embedding`` module and mention that the
    pretrained file cannot be located.
    """
    pipeline = Pipeline.from_config(pipeline_config)

    # create vocab
    pipeline.train(output="dummy", training=dataset)

    # extending the vocab with the weights file available should apply the
    # pretrained weights
    pipeline.train(output="dummy", training=dataset2)

    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    logging.captureWarnings(True)
    try:
        Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
        pipeline.train(
            output="dummy",
            training=Dataset.from_dict({"text": ["that"], "label": ["good"]}),
        )
    finally:
        # captureWarnings() changes process-wide state; switch it off again so
        # the redirection does not leak into tests that run afterwards.
        logging.captureWarnings(False)

    first_record = caplog.records[0]
    assert first_record.module == "embedding"
    assert "cannot locate the pretrained_file" in first_record.message
def dataset():
    """Return a two-row dataset with ``text`` and ``label`` columns."""
    columns = {
        "text": ["this is", "a test"],
        "label": ["a", "b"],
    }
    return Dataset.from_dict(columns)
def training_dataset() -> Dataset:
    """Build a three-row dataset of record pairs with duplicate/not_duplicate labels."""

    def person(first_name: str, last_name: str) -> dict:
        # Each record is a dict with exactly these two "@"-prefixed keys.
        return {"@first_name": first_name, "@last_name": last_name}

    left_records = [
        person("Hans", "Peter"),
        person("Heinrich", "Meier"),
        person("Hans", "Peter"),
    ]
    right_records = [
        person("Hans", "Petre"),
        person("Heinz", "Meier"),
        person("Hansel", "Peter"),
    ]
    labels = ["duplicate", "not_duplicate", "duplicate"]

    return Dataset.from_dict(
        {"record1": left_records, "record2": right_records, "label": labels}
    )
def training_dataset() -> Dataset:
    """Build a four-row dataset that only has a ``text`` column."""
    sentences = [
        "this is a text",
        "my name is dani",
        "this is a table",
        "my name is paco",
    ]
    return Dataset.from_dict({"text": sentences})
def dataset(tmp_path) -> Dataset:
    """Return a three-row text/label dataset that was saved to and reloaded from disk.

    Args:
        tmp_path: Base directory under which the dataset is stored.
    """
    in_memory = Dataset.from_dict(
        {
            "text": ["A common text", "This is why you get", "Seriosly?, I'm not sure"],
            "label": ["one", "zero", "zero"],
        }
    )

    # Round-trip through disk so the returned dataset can be read lazily.
    storage_dir = str(tmp_path / "test_pipeline_datasets" / "dataset")
    in_memory.save_to_disk(storage_dir)
    return Dataset.load_from_disk(storage_dir)
def test_trainer_configs(configurations_path):
    """Check that every "Trainer" configuration can be used to build a ``Trainer``.

    Each configuration returned by ``_read_configs`` must be a
    ``TrainerConfiguration``, and the ``Trainer`` built from it must wrap a
    ``pytorch_lightning.Trainer``.
    """
    configs = _read_configs(configurations_path, "Trainer")
    pipeline = Pipeline.from_config(
        {
            "name": "test",
            "head": {"type": "TextClassification", "labels": ["pos", "neg"]},
        }
    )
    dataset = Dataset.from_dict({"text": ["test"], "label": ["pos"]})

    for config_name, config in configs.items():
        # Include the configuration name so a failure points at the culprit.
        assert isinstance(
            config, TrainerConfiguration
        ), f"{config_name}: not a TrainerConfiguration"
        trainer = Trainer(
            pipeline=pipeline, train_dataset=dataset, trainer_config=config
        )
        assert isinstance(
            trainer.trainer, pytorch_lightning.Trainer
        ), f"{config_name}: trainer is not a pytorch_lightning.Trainer"
def training_dataset() -> Dataset:
    """Build a two-row dataset with ``text``, per-row ``entities`` and a ``label``."""

    def entity(start: int, end: int, label: str, text: str) -> dict:
        # Key order mirrors the original literals: start, end, label, text.
        return {"start": start, "end": end, "label": label, "text": text}

    sentences = [
        "The most common audits were about waste and recycling.",
        "The company fabricates plastic chairs.",
    ]
    entities_per_sentence = [
        [entity(34, 39, "PN", "waste"), entity(16, 22, "QTY", "audits")],
        [entity(4, 11, "OBJECT", "company"), entity(31, 37, "SUBJECT", "chairs")],
    ]
    relation_labels = ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"]

    return Dataset.from_dict(
        {
            "text": sentences,
            "entities": entities_per_sentence,
            "label": relation_labels,
        }
    )
def test_from_dict():
    """Check column names, column access and length of a ``Dataset`` built from a dict."""
    columns = {"a": [1, 2, 3], "b": [4, 5, 6]}
    built = Dataset.from_dict(columns)

    assert built.dataset.column_names == ["a", "b"]
    assert built["a"] == [1, 2, 3]
    assert len(built) == 3
def dataset2() -> Dataset:
    """Return a single-row dataset: text "this" with label "good"."""
    return Dataset.from_dict({"text": ["this"], "label": ["good"]})
def dataset() -> Dataset:
    """Return a single-row dataset with one sentence and the label "a"."""
    return Dataset.from_dict(
        {
            "text": ["This is a simple test"],
            "label": ["a"],
        }
    )
def dataset() -> Dataset:
    """Return a four-row dataset of single-word texts labelled "good" or "bad"."""
    texts = ["test", "this", "shaight", "good"]
    labels = ["good", "good", "bad", "good"]
    return Dataset.from_dict({"text": texts, "label": labels})
def valid_dataset():
    """Return a two-row dataset with ``text`` and "bad"/"good" ``label`` columns."""
    texts = ["and what about the validation", "do not forget this one"]
    labels = ["bad", "good"]
    return Dataset.from_dict({"text": texts, "label": labels})
def train_dataset():
    """Return a two-row dataset with ``text`` and "good"/"bad" ``label`` columns."""
    texts = ["this is a test", "and another one"]
    labels = ["good", "bad"]
    return Dataset.from_dict({"text": texts, "label": labels})