Example #1
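Builds a small CSV file in a temporary directory and checks that load_dataset("csv", features=...) honors each requested column type (float64, int8, and ClassLabel).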
    def test_features(self):
        n_rows = 10
        n_cols = 3

        def get_features(type):
            return Features({str(i): type for i in range(n_cols)})

        with tempfile.TemporaryDirectory() as tmp_dir:
            with open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8") as f:
                f.write("\n".join(",".join(str(i) for i in range(n_cols)) for _ in range(n_rows + 1)))
            for type in [
                    Value("float64"),
                    Value("int8"),
                    ClassLabel(num_classes=n_cols)
            ]:
                features = get_features(type)
                ds = load_dataset(
                    "csv",
                    data_files=os.path.join(tmp_dir, "table.csv"),
                    cache_dir=tmp_dir,
                    split="train",
                    features=features,
                )
                self.assertEqual(len(ds), n_rows)
                self.assertDictEqual(ds.features, features)
                del ds
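Example #2
A parametrized test that loads JSON Lines data, plain or inside a zip archive, with explicit features; it verifies the expected row count and the first item in both streaming and non-streaming modes.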
def test_load_dataset_zip_jsonl(data_file, streaming, zip_jsonl_path,
                                zip_jsonl_with_dir_path, jsonl_path):
    data_file_paths = {
        "zip_jsonl_path": zip_jsonl_path,
        "zip_jsonl_with_dir_path": zip_jsonl_with_dir_path,
        "jsonl_path": jsonl_path,
    }
    data_files = str(data_file_paths[data_file])
    expected_size = 8 if data_file.startswith("zip") else 4
    features = Features({
        "col_1": Value("string"),
        "col_2": Value("int32"),
        "col_3": Value("float32")
    })
    ds = load_dataset("json",
                      split="train",
                      data_files=data_files,
                      features=features,
                      streaming=streaming)
    if streaming:
        ds_item_counter = 0
        for ds_item in ds:
            if ds_item_counter == 0:
                assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
            ds_item_counter += 1
        assert ds_item_counter == expected_size
    else:
        assert ds.shape[0] == expected_size
        ds_item = next(iter(ds))
        assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
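Example #3
Checks that DatasetBuilder.as_dataset returns the expected dataset class and length for each split argument, with or without keeping the data in memory, after writing dummy Arrow files for the train and test splits.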
def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):
    cache_dir = str(tmp_path)
    dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy")
    os.makedirs(dummy_builder.cache_dir)

    dummy_builder.info.splits = SplitDict()
    dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
    dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

    for info_split in dummy_builder.info.splits:
        with ArrowWriter(
            path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"),
            features=Features({"text": Value("string")}),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory)
    assert isinstance(dataset, expected_dataset_class)
    if isinstance(dataset, DatasetDict):
        assert list(dataset.keys()) == ["train", "test"]
        datasets = dataset.values()
        expected_splits = ["train", "test"]
    elif isinstance(dataset, Dataset):
        datasets = [dataset]
        expected_splits = [split]
    for dataset, expected_split in zip(datasets, expected_splits):
        assert dataset.split == expected_split
        assert len(dataset) == expected_dataset_length
        assert dataset.features == Features({"text": Value("string")})
        assert dataset.column_names == ["text"]
Example #4
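Verifies that every mutating operation on a Features object (item assignment, del, update, pop, popitem, setdefault, clear) keeps the internal _column_requires_decoding mapping in sync with the feature keys.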
    def test_features_dicts_are_synced(self):
        def assert_features_dicts_are_synced(features: Features):
            assert (hasattr(features, "_column_requires_decoding")
                    and features.keys()
                    == features._column_requires_decoding.keys())

        features = Features({"foo": Sequence({"bar": {"my_value": Value("int32")}})})
        assert_features_dicts_are_synced(features)
        features["barfoo"] = Image()
        assert_features_dicts_are_synced(features)
        del features["barfoo"]
        assert_features_dicts_are_synced(features)
        features.update({"foobar": Value("string")})
        assert_features_dicts_are_synced(features)
        features.pop("foobar")
        assert_features_dicts_are_synced(features)
        features.popitem()
        assert_features_dicts_are_synced(features)
        features.setdefault("xyz", Value("bool"))
        assert_features_dicts_are_synced(features)
        features.clear()
        assert_features_dicts_are_synced(features)
Example #5
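Defines a ternary natural language inference task: premise and hypothesis string inputs grounded to common column aliases, and a three-way entailment/neutral/contradiction ClassLabel output.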
 def __init__(self):
     super(TernaryNaturalLanguageInference, self).__init__(
         num_classes=3,
         input_schema=Schema(
             features=OrderedDict([
                 ("premise", Value(dtype="string")),
                 ("hypothesis", Value(dtype="string")),
             ]),
             grounding_candidates={
                 "premise": {"premise", "sentence1"},
                 "hypothesis": {"hypothesis", "sentence2"},
             },
         ),
         output_schema=Schema(
             features=OrderedDict([
                 (
                     "label",
                     ClassLabel(
                         names=["entailment", "neutral", "contradiction"]),
                 ),
             ]),
             grounding_candidates={
                 "label": {"label"},
             },
         ),
         identifier=self.__class__.__name__,
     )
Example #6
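Tests DatasetDict.flatten, which turns nested features into dotted column names such as "a.b.c" in every split.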
 def test_flatten(self):
     dset_split = Dataset.from_dict(
         {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
         features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
     )
     dset = DatasetDict({"train": dset_split, "test": dset_split})
     dset = dset.flatten()
     self.assertDictEqual(dset.column_names, {
         "train": ["a.b.c", "foo"],
         "test": ["a.b.c", "foo"]
     })
     self.assertListEqual(sorted(dset["train"].features.keys()),
                          ["a.b.c", "foo"])
     self.assertDictEqual(
         dset["train"].features,
         Features({
             "a.b.c": Sequence(Value("string")),
             "foo": Value("int64")
         }))
     del dset
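Example #7
Tests IterableDataset.cast: iterating the casted dataset should yield each raw example encoded with the new features.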
def test_iterable_dataset_cast(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 10})
    features = Features({"id": Value("int64"), "label": Value("int64")})
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    new_features = Features({"id": Value("int64"), "label": Value("bool")})
    casted_dataset = dataset.cast(new_features)
    assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]
Example #8
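Appears to be a pytest fixture that builds a ten-row dataset combining Sequence, ClassLabel, nested, and scalar features.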
def dataset():
    n = 10
    features = Features({
        "tokens": Sequence(Value("string")),
        "labels": Sequence(ClassLabel(names=["negative", "positive"])),
        "answers": Sequence({
            "text": Value("string"),
            "answer_start": Value("int32"),
        }),
        "id": Value("int64"),
    })
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{
                "answer_start": [97],
                "text": ["1976"]
            }] * n,
            "id": list(range(n)),
        },
        features=features,
    )
    return dataset
Example #9
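Builds a Summarization task template from a plain dict and checks its task name, input schema, and label schema.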
 def test_from_dict(self):
     input_schema = Features({"text": Value("string")})
     label_schema = Features({"summary": Value("string")})
     template_dict = {"text_column": "input_text", "summary_column": "input_summary"}
     task = Summarization.from_dict(template_dict)
     self.assertEqual("summarization", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #10
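Loads a CSV file, optionally bz2-compressed, in both streaming and non-streaming modes, and checks the returned dataset type and its first item.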
def test_load_dataset_streaming_csv(path_extension, streaming, csv_path, bz2_csv_path):
    paths = {"csv": csv_path, "csv.bz2": bz2_csv_path}
    data_files = str(paths[path_extension])
    features = Features({"col_1": Value("string"), "col_2": Value("int32"), "col_3": Value("float32")})
    ds = load_dataset("csv", split="train", data_files=data_files, features=features, streaming=streaming)
    assert isinstance(ds, IterableDataset if streaming else Dataset)
    ds_item = next(iter(ds))
    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
Example #11
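Exercises load_dataset's caching for CSV files: reloading identical data reuses the same cache file and fingerprint, while passing different features or changing the file contents produces a new cache file and fingerprint.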
    def test_caching(self):
        n_rows = 10

        features = Features({"foo": Value("string"), "bar": Value("string")})

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Use \n for newline. Windows automatically adds the \r when writing the file
            # see https://docs.python.org/3/library/os.html#os.linesep
            with open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8") as f:
                f.write("\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1)))
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            data_file = ds.cache_files[0]["filename"]
            fingerprint = ds._fingerprint
            self.assertEqual(len(ds), n_rows)
            del ds
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            self.assertEqual(ds.cache_files[0]["filename"], data_file)
            self.assertEqual(ds._fingerprint, fingerprint)
            del ds
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                features=features,
                keep_in_memory=False,
            )
            self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            del ds

            with open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8") as f:
                f.write("\n".join(",".join(["Foo", "Bar"]) for _ in range(n_rows + 1)))
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            self.assertEqual(len(ds), n_rows)
            del ds
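Example #12
A MetricInfo definition for a dummy test metric with int64 "inputs" and "targets" features.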
 def _info(self):
     return MetricInfo(
         description="dummy metric for tests",
         citation="insert citation here",
         features=Features({
             "inputs": Value("int64"),
             "targets": Value("int64")
         }),
     )
Example #13
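The same dummy MetricInfo, this time using the conventional "predictions" and "references" feature names.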
 def _info(self):
     return MetricInfo(
         description="dummy metric for tests",
         citation="insert citation here",
         features=Features({
             "predictions": Value("int64"),
             "references": Value("int64")
         }),
     )
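Example #14
Checks that a builder's cache directory is keyed on its features: builders with identical features share a cache directory, while different features yield distinct ones.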
 def test_cache_dir_for_features(self):
     with tempfile.TemporaryDirectory() as tmp_dir:
         f1 = Features({"id": Value("int8")})
         f2 = Features({"id": Value("int32")})
         dummy_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1)
         other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f1)
         self.assertEqual(dummy_builder.cache_dir, other_builder.cache_dir)
         other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, name="dummy", features=f2)
         self.assertNotEqual(dummy_builder.cache_dir, other_builder.cache_dir)
Example #15
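Round-trips nested features through a Dataset, asserting that the reconstructed features and rows match the originals.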
 def test_from_arrow_schema_simple(self):
     data = {"a": [{"b": {"c": "text"}}] * 10, "foo": [1] * 10}
     original_features = Features({"a": {"b": {"c": Value("string")}}, "foo": Value("int64")})
     dset = Dataset.from_dict(data, features=original_features)
     new_features = dset.features
     new_dset = Dataset.from_dict(data, features=new_features)
     self.assertEqual(original_features.type, new_features.type)
     self.assertDictEqual(dset[0], new_dset[0])
     self.assertDictEqual(dset[:], new_dset[:])
Example #16
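Tests align_labels_with_mapping: two splits whose ClassLabel names are ordered differently are remapped to a shared label2id mapping, after which both the integer labels and their string names are verified.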
 def test_align_labels_with_mapping(self):
     train_features = Features({
         "input_text": Value("string"),
         "input_labels": ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"]),
     })
     test_features = Features({
         "input_text": Value("string"),
         "input_labels": ClassLabel(num_classes=3, names=["entailment", "contradiction", "neutral"]),
     })
     train_data = {
         "input_text": ["a", "a", "b", "b", "c", "c"],
         "input_labels": [0, 0, 1, 1, 2, 2]
     }
     test_data = {
         "input_text": ["a", "a", "c", "c", "b", "b"],
         "input_labels": [0, 0, 1, 1, 2, 2]
     }
     label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
     id2label = {v: k for k, v in label2id.items()}
     train_expected_labels = [2, 2, 1, 1, 0, 0]
     test_expected_labels = [2, 2, 0, 0, 1, 1]
     train_expected_label_names = [
         id2label[idx] for idx in train_expected_labels
     ]
     test_expected_label_names = [
         id2label[idx] for idx in test_expected_labels
     ]
     dsets = DatasetDict({
         "train": Dataset.from_dict(train_data, features=train_features),
         "test": Dataset.from_dict(test_data, features=test_features),
     })
     dsets = dsets.align_labels_with_mapping(label2id, "input_labels")
     self.assertListEqual(train_expected_labels,
                          dsets["train"]["input_labels"])
     self.assertListEqual(test_expected_labels,
                          dsets["test"]["input_labels"])
     train_aligned_label_names = [
         dsets["train"].features["input_labels"].int2str(idx)
         for idx in dsets["train"]["input_labels"]
     ]
     test_aligned_label_names = [
         dsets["test"].features["input_labels"].int2str(idx)
         for idx in dsets["test"]["input_labels"]
     ]
     self.assertListEqual(train_expected_label_names,
                          train_aligned_label_names)
     self.assertListEqual(test_expected_label_names,
                          test_aligned_label_names)
Example #17
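Casts one column of a DatasetDict to float64 and verifies the new feature type and Python value type in every split.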
 def test_cast(self):
     dset = self._create_dummy_dataset_dict(multiple_columns=True)
     features = dset["train"].features
     features["col_1"] = Value("float64")
     dset = dset.cast(features)
     for dset_split in dset.values():
         self.assertEqual(dset_split.num_columns, 2)
         self.assertEqual(dset_split.features["col_1"], Value("float64"))
         self.assertIsInstance(dset_split[0]["col_1"], float)
     del dset
Example #18
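A MetricInfo whose features depend on the config name: the multilabel config wraps predictions and references in Sequence.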
 def _info(self):
     return MetricInfo(
         description="dummy metric for tests",
         citation="insert citation here",
         features=Features(
             {"predictions": Sequence(Value("int64")), "references": Sequence(Value("int64"))}
             if self.config_name == "multilabel"
             else {"predictions": Value("int64"), "references": Value("int64")}
         ),
     )
Example #19
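Builds an AutomaticSpeechRecognition task template from a plain dict and checks its task name, input schema, and label schema.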
 def test_from_dict(self):
     input_schema = Features({"audio_file_path": Value("string")})
     label_schema = Features({"transcription": Value("string")})
     template_dict = {
         "audio_file_path_column": "input_audio_file_path",
         "transcription_column": "input_transcription",
     }
     task = AutomaticSpeechRecognition.from_dict(template_dict)
     self.assertEqual("automatic-speech-recognition", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example #20
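Loads a zipped CSV file with explicit features and checks the first item.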
def test_load_dataset_zip_csv(zip_csv_path):
    data_files = str(zip_csv_path)
    features = Features({
        "col_1": Value("string"),
        "col_2": Value("int32"),
        "col_3": Value("float32")
    })
    ds = load_dataset("csv",
                      split="train",
                      data_files=data_files,
                      features=features)
    ds_item = next(iter(ds))
    assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
Example #21
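Flattens a Features object containing a Sequence of nested dicts, asserting both the dotted result and that flatten leaves the original features unchanged.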
 def test_flatten_with_sequence(self):
     features = Features({"foo": Sequence({"bar": {"my_value": Value("int32")}})})
     _features = features.copy()
     flattened_features = features.flatten()
     assert flattened_features == {"foo.bar": [{"my_value": Value("int32")}]}
     assert features == _features, "calling flatten shouldn't alter the current features"
Example #22
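Shows one way to read an Excel file into a Dataset via pandas.read_excel and Dataset.from_pandas with explicit features.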
def test_from_excel_file(resources_data_path):
    """This only shows an example of how one could read in an excel file"""
    str_value = Value("string")
    int_value = Value("int64")
    features = Features(Notification=int_value,
                        Type=str_value,
                        Plant=int_value,
                        Serial=str_value)

    file_path = resources_data_path / "test.xlsx"
    df = pd.read_excel(file_path)

    dataset = Dataset.from_pandas(df, features=features)

    assert len(dataset) > 0
Example #23
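Writes dummy Arrow files for two splits, then exercises as_dataset for the full DatasetDict, a single "train" split, and the composite split expression "train+test[:30%]".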
    def test_as_dataset(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
            os.makedirs(dummy_builder.cache_dir)

            dummy_builder.info.splits = SplitDict()
            dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
            dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

            for split in dummy_builder.info.splits:
                writer = ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"dummy_builder-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                )
                writer.write_batch({"text": ["foo"] * 10})
                writer.finalize()

            dsets = dummy_builder.as_dataset()
            self.assertIsInstance(dsets, DatasetDict)
            self.assertListEqual(list(dsets.keys()), ["train", "test"])
            self.assertEqual(len(dsets["train"]), 10)
            self.assertEqual(len(dsets["test"]), 10)
            self.assertDictEqual(dsets["train"].features,
                                 Features({"text": Value("string")}))
            self.assertDictEqual(dsets["test"].features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dsets["train"].column_names, ["text"])
            self.assertListEqual(dsets["test"].column_names, ["text"])
            del dsets

            dset = dummy_builder.as_dataset("train")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train")
            self.assertEqual(len(dset), 10)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset

            dset = dummy_builder.as_dataset("train+test[:30%]")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train+test[:30%]")
            self.assertEqual(len(dset), 13)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset
Example #24
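Defines a binary sentiment task: a string "text" input grounded to common column aliases and a negative/positive ClassLabel output.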
 def __init__(self):
     super(BinarySentiment, self).__init__(
         num_classes=2,
         input_schema=Schema(
             features=OrderedDict(
                 [
                     ("text", Value(dtype="string")),
                 ]
             ),
             grounding_candidates={
                 "text": {"text", "sentence"},
             },
         ),
         output_schema=Schema(
             features=OrderedDict(
                 [
                     ("label", ClassLabel(names=["negative", "positive"])),
                 ]
             ),
             grounding_candidates={
                 "label": {"label"},
             },
         ),
         identifier=self.__class__.__name__,
     )
Example #25
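A parametrized test of DatasetDict.from_text covering explicit splits, optional features, and the keep_in_memory flag, asserting the resulting shape, column names, split, and dtypes.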
def test_datasetdict_from_text(split, features, keep_in_memory, text_path,
                               tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_text(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example #26
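Checks that the Audio feature is decoded inside Dataset.map, both per example and batched: each mapped item exposes the decoded sampling rate (44100 Hz here, doubled to 88200).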
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path], "text": ["Hello"]}
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    def process_audio_sampling_rate_by_example(example):
        example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"]
        return example

    decoded_dset = dset.map(process_audio_sampling_rate_by_example)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200

    def process_audio_sampling_rate_by_batch(batch):
        double_sampling_rates = []
        for audio in batch["audio"]:
            double_sampling_rates.append(2 * audio["sampling_rate"])
        batch["double_sampling_rate"] = double_sampling_rates
        return batch

    decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200
Example #27
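Loads a dataset builder from an absolute loading-script directory and checks its name and features.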
def test_load_dataset_builder_for_absolute_script_dir(
        dataset_loading_script_dir, data_dir):
    builder = datasets.load_dataset_builder(dataset_loading_script_dir,
                                            data_dir=data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == DATASET_LOADING_SCRIPT_NAME
    assert builder.info.features == Features({"text": Value("string")})
Example #28
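Flattens a plain nested Features dict into dotted keys, again asserting the original features are left untouched.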
 def test_flatten(self):
     features = Features({
         "foo": {
             "bar1": Value("int32"),
             "bar2": {
                 "foobar": Value("string")
             }
         }
     })
     _features = features.copy()
     flattened_features = features.flatten()
     assert flattened_features == {
         "foo.bar1": Value("int32"),
         "foo.bar2.foobar": Value("string")
     }
     assert features == _features, "calling flatten shouldn't alter the current features"
Example #29
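The CSV counterpart of Example #25: DatasetDict.from_csv with the caveat, noted in the code, that CSV loses col_1's string dtype, which defaults to int64.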
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path,
                              tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
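Example #30
A regression test for https://github.com/huggingface/datasets/issues/3505: on an IterableDataset, cast_column to a ClassLabel followed by map should still encode examples with the updated features.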
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn):
    # https://github.com/huggingface/datasets/issues/3505
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": "positive"})
    features = Features(
        {
            "id": Value("int64"),
            "label": Value("string"),
        }
    )
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"]))
    dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x})
    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)
    features["label"] = ClassLabel(names=["negative", "positive"])
    assert [{k: v for k, v in ex.items() if k != "id+1"} for ex in dataset] == [
        features.encode_example(ex) for _, ex in ex_iterable
    ]