def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):
    cache_dir = str(tmp_path)
    dummy_builder = DummyBuilder(cache_dir=cache_dir, name="dummy")
    os.makedirs(dummy_builder.cache_dir)

    dummy_builder.info.splits = SplitDict()
    dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
    dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

    for info_split in dummy_builder.info.splits:
        with ArrowWriter(
            path=os.path.join(dummy_builder.cache_dir, f"dummy_builder-{info_split}.arrow"),
            features=Features({"text": Value("string")}),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():
        dataset = dummy_builder.as_dataset(split=split, in_memory=in_memory)
    assert isinstance(dataset, expected_dataset_class)
    if isinstance(dataset, DatasetDict):
        assert list(dataset.keys()) == ["train", "test"]
        datasets = dataset.values()
        expected_splits = ["train", "test"]
    elif isinstance(dataset, Dataset):
        datasets = [dataset]
        expected_splits = [split]
    for dataset, expected_split in zip(datasets, expected_splits):
        assert dataset.split == expected_split
        assert len(dataset) == expected_dataset_length
        assert dataset.features == Features({"text": Value("string")})
        assert dataset.column_names == ["text"]
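Note: these excerpts never define DummyBuilder. A minimal sketch of the kind of builder they assume; the 100-example payload and the exact base-class hooks are illustrative assumptions, not the library's actual test helper:

import os

from datasets import DatasetInfo, Features, Split, SplitGenerator, Value
from datasets.arrow_writer import ArrowWriter
from datasets.builder import DatasetBuilder


class DummyBuilder(DatasetBuilder):
    def _info(self):
        # Every example in this listing works with a single string column.
        return DatasetInfo(features=Features({"text": Value("string")}))

    def _split_generators(self, dl_manager):
        return [SplitGenerator(name=Split.TRAIN)]

    def _prepare_split(self, split_generator):
        # Write the split to an Arrow file in the cache dir and record its size.
        fname = f"{self.name}-{split_generator.name}.arrow"
        with ArrowWriter(
            features=self.info.features,
            path=os.path.join(self.cache_dir, fname),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 100})
            num_examples, num_bytes = writer.finalize()
        split_generator.split_info.num_examples = num_examples
        split_generator.split_info.num_bytes = num_bytes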
Example #2
    def test_read_files(self):
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            files = [
                {
                    "filename": os.path.join(tmp_dir, "train")
                },
                {
                    "filename": os.path.join(tmp_dir, "test"),
                    "skip": 10,
                    "take": 10
                },
            ]
            dset = Dataset(
                **reader.read_files(files, original_instructions=""))
            self.assertEqual(dset.num_rows, 110)
            self.assertEqual(dset.num_columns, 1)
            self.assertEqual(dset._data_files, files)
            del dset
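ReaderTest is also not shown above. It mocks file access so the read logic can be tested without real Arrow files; a sketch under that assumption, serving 100 dummy rows per requested file (which is why the concatenation above yields 100 + 10 = 110 rows):

import os

import pyarrow as pa

from datasets.arrow_reader import BaseReader


class ReaderTest(BaseReader):
    """Mock reader: one 100-row table per requested file, honoring skip/take."""

    def _get_table_from_filename(self, filename_skip_take, in_memory=False):
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        # A single "filename" column lets tests check which file a row came from.
        table = pa.Table.from_pydict({"filename": [os.path.basename(filename)] * 100})
        if skip is not None and take is not None:
            table = table.slice(skip, take)
        return table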
Example #3
    def test_read(self):
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            instructions = "test[:33%]"
            dset = Dataset(**reader.read(name, instructions, split_infos))
            self.assertEqual(dset["filename"][0], f"{name}-test")
            self.assertEqual(dset.num_rows, 33)
            self.assertEqual(dset.num_columns, 1)

            instructions = ["train", "test[:33%]"]
            datasets_kwargs = [
                reader.read(name, instr, split_infos) for instr in instructions
            ]
            train_dset, test_dset = [
                Dataset(**dataset_kwargs) for dataset_kwargs in datasets_kwargs
            ]
            self.assertEqual(train_dset["filename"][0], f"{name}-train")
            self.assertEqual(train_dset.num_rows, 100)
            self.assertEqual(train_dset.num_columns, 1)
            self.assertEqual(test_dset["filename"][0], f"{name}-test")
            self.assertEqual(test_dset.num_rows, 33)
            self.assertEqual(test_dset.num_columns, 1)
            del train_dset, test_dset
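The instruction strings use the library's split-slicing syntax; percentages resolve against SplitInfo.num_examples, so "test[:33%]" over 100 examples yields 33 rows. The string and object forms are interchangeable; roughly:

from datasets import ReadInstruction

ri = ReadInstruction.from_spec("test[:33%]")
# Approximately equivalent explicit form:
ri2 = ReadInstruction("test", to=33, unit="%")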
Example #4
    def test_as_dataset(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
            os.makedirs(dummy_builder.cache_dir)

            dummy_builder.info.splits = SplitDict()
            dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
            dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

            for split in dummy_builder.info.splits:
                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"dummy_builder-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                ) as writer:
                    writer.write_batch({"text": ["foo"] * 10})
                    writer.finalize()

            dsets = dummy_builder.as_dataset()
            self.assertIsInstance(dsets, DatasetDict)
            self.assertListEqual(list(dsets.keys()), ["train", "test"])
            self.assertEqual(len(dsets["train"]), 10)
            self.assertEqual(len(dsets["test"]), 10)
            self.assertDictEqual(dsets["train"].features,
                                 Features({"text": Value("string")}))
            self.assertDictEqual(dsets["test"].features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dsets["train"].column_names, ["text"])
            self.assertListEqual(dsets["test"].column_names, ["text"])
            del dsets

            dset = dummy_builder.as_dataset("train")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train")
            self.assertEqual(len(dset), 10)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset

            dset = dummy_builder.as_dataset("train+test[:30%]")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train+test[:30%]")
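            # "train+test[:30%]" concatenates all of train (10 rows) with
            # 30% of test (3 rows), hence 13.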
            self.assertEqual(len(dset), 13)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset
Example #5
    def test_read(self):
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            instructions = "test[:33%]"
            dset = Dataset(**reader.read(name, instructions, split_infos))
            self.assertEqual(dset["filename"][0], f"{name}-test")
            self.assertEqual(dset.num_rows, 33)
            self.assertEqual(dset.num_columns, 1)

            instructions1 = ["train", "test[:33%]"]
            instructions2 = [
                Split.TRAIN,
                ReadInstruction.from_spec("test[:33%]")
            ]
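            # The typed specs (Split.TRAIN, ReadInstruction) should yield the
            # same datasets as their plain-string spellings above.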
            for instructions in [instructions1, instructions2]:
                datasets_kwargs = [
                    reader.read(name, instr, split_infos)
                    for instr in instructions
                ]
                train_dset, test_dset = (Dataset(**dataset_kwargs)
                                         for dataset_kwargs in datasets_kwargs)
                self.assertEqual(train_dset["filename"][0], f"{name}-train")
                self.assertEqual(train_dset.num_rows, 100)
                self.assertEqual(train_dset.num_columns, 1)
                self.assertIsInstance(train_dset.split, NamedSplit)
                self.assertEqual(str(train_dset.split), "train")
                self.assertEqual(test_dset["filename"][0], f"{name}-test")
                self.assertEqual(test_dset.num_rows, 33)
                self.assertEqual(test_dset.num_columns, 1)
                self.assertIsInstance(test_dset.split, NamedSplit)
                self.assertEqual(str(test_dset.split), "test[:33%]")
                del train_dset, test_dset
Example #6
    def test_as_dataset_with_post_process_with_index(self):
        def _post_process(self, dataset, resources_paths):
            if os.path.exists(resources_paths["index"]):
                dataset.load_faiss_index("my_index", resources_paths["index"])
                return dataset
            else:
                dataset.add_faiss_index_from_external_arrays(
                    external_arrays=np.ones((len(dataset), 8)),
                    string_factory="Flat",
                    index_name="my_index")
                dataset.save_faiss_index("my_index", resources_paths["index"])
                return dataset

        def _post_processing_resources(self, split):
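            # These relative names are resolved against the builder's cache dir;
            # _post_process then receives the resulting paths as resources_paths.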
            return {"index": "Flat-{split}.faiss".format(split=split)}

        with tempfile.TemporaryDirectory() as tmp_dir:
            dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
            dummy_builder._post_process = types.MethodType(
                _post_process, dummy_builder)
            dummy_builder._post_processing_resources = types.MethodType(
                _post_processing_resources, dummy_builder)
            os.makedirs(dummy_builder.cache_dir)

            dummy_builder.info.splits = SplitDict()
            dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
            dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

            for split in dummy_builder.info.splits:
                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"dummy_builder-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                ) as writer:
                    writer.write_batch({"text": ["foo"] * 10})
                    writer.finalize()

                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"small_dataset-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                ) as writer:
                    writer.write_batch({"text": ["foo"] * 2})
                    writer.finalize()

            dsets = dummy_builder.as_dataset()
            self.assertIsInstance(dsets, DatasetDict)
            self.assertListEqual(list(dsets.keys()), ["train", "test"])
            self.assertEqual(len(dsets["train"]), 10)
            self.assertEqual(len(dsets["test"]), 10)
            self.assertDictEqual(dsets["train"].features,
                                 Features({"text": Value("string")}))
            self.assertDictEqual(dsets["test"].features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dsets["train"].column_names, ["text"])
            self.assertListEqual(dsets["test"].column_names, ["text"])
            self.assertListEqual(dsets["train"].list_indexes(), ["my_index"])
            self.assertListEqual(dsets["test"].list_indexes(), ["my_index"])
            self.assertGreater(dummy_builder.info.post_processing_size, 0)
            self.assertGreater(
                dummy_builder.info.post_processed.resources_checksums["train"]
                ["index"]["num_bytes"], 0)
            del dsets

            dset = dummy_builder.as_dataset("train")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train")
            self.assertEqual(len(dset), 10)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            self.assertListEqual(dset.list_indexes(), ["my_index"])
            del dset

            dset = dummy_builder.as_dataset("train+test[:30%]")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train+test[:30%]")
            self.assertEqual(len(dset), 13)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            self.assertListEqual(dset.list_indexes(), ["my_index"])
            del dset
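Once post-processing has attached "my_index", each returned dataset can be queried. A minimal usage sketch, assuming faiss is installed, `dset` is one of the datasets above, and the query vector is illustrative:

import numpy as np

query = np.ones(8, dtype=np.float32)  # must match the 8-dim vectors indexed above
scores, examples = dset.get_nearest_examples("my_index", query, k=2)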
Example #7
    def test_as_dataset_with_post_process(self):
        def _post_process(self, dataset, resources_paths):
            def char_tokenize(example):
                return {"tokens": list(example["text"])}

            return dataset.map(
                char_tokenize,
                cache_file_name=resources_paths["tokenized_dataset"])

        def _post_processing_resources(self, split):
            return {
                "tokenized_dataset":
                "tokenized_dataset-{split}.arrow".format(split=split)
            }
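        # map() is pointed at the declared resource via cache_file_name, so the
        # tokenized_dataset-{split}.arrow files pre-written in the test body act
        # as a cache and char_tokenize need not be recomputed.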

        with tempfile.TemporaryDirectory() as tmp_dir:
            dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
            dummy_builder.info.post_processed = PostProcessedInfo(
                features=Features({
                    "text": Value("string"),
                    "tokens": [Value("string")]
                }))
            dummy_builder._post_process = types.MethodType(
                _post_process, dummy_builder)
            dummy_builder._post_processing_resources = types.MethodType(
                _post_processing_resources, dummy_builder)
            os.makedirs(dummy_builder.cache_dir)

            dummy_builder.info.splits = SplitDict()
            dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
            dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

            for split in dummy_builder.info.splits:
                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"dummy_builder-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                ) as writer:
                    writer.write_batch({"text": ["foo"] * 10})
                    writer.finalize()

                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"tokenized_dataset-{split}.arrow"),
                    features=Features({
                        "text": Value("string"),
                        "tokens": [Value("string")]
                    }),
                ) as writer:
                    writer.write_batch({
                        "text": ["foo"] * 10,
                        "tokens": [list("foo")] * 10
                    })
                    writer.finalize()

            dsets = dummy_builder.as_dataset()
            self.assertIsInstance(dsets, DatasetDict)
            self.assertListEqual(list(dsets.keys()), ["train", "test"])
            self.assertEqual(len(dsets["train"]), 10)
            self.assertEqual(len(dsets["test"]), 10)
            self.assertDictEqual(
                dsets["train"].features,
                Features({
                    "text": Value("string"),
                    "tokens": [Value("string")]
                }))
            self.assertDictEqual(
                dsets["test"].features,
                Features({
                    "text": Value("string"),
                    "tokens": [Value("string")]
                }))
            self.assertListEqual(dsets["train"].column_names,
                                 ["text", "tokens"])
            self.assertListEqual(dsets["test"].column_names,
                                 ["text", "tokens"])
            del dsets

            dset = dummy_builder.as_dataset("train")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train")
            self.assertEqual(len(dset), 10)
            self.assertDictEqual(
                dset.features,
                Features({
                    "text": Value("string"),
                    "tokens": [Value("string")]
                }))
            self.assertListEqual(dset.column_names, ["text", "tokens"])
            self.assertGreater(dummy_builder.info.post_processing_size, 0)
            self.assertGreater(
                dummy_builder.info.post_processed.resources_checksums["train"]
                ["tokenized_dataset"]["num_bytes"], 0)
            del dset

            dset = dummy_builder.as_dataset("train+test[:30%]")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train+test[:30%]")
            self.assertEqual(len(dset), 13)
            self.assertDictEqual(
                dset.features,
                Features({
                    "text": Value("string"),
                    "tokens": [Value("string")]
                }))
            self.assertListEqual(dset.column_names, ["text", "tokens"])
            del dset

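        # Second variant: a post-process hook with no cached resources; it simply
        # trims whatever dataset it is given down to its first two rows.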
        def _post_process(self, dataset, resources_paths):
            return dataset.select([0, 1], keep_in_memory=True)

        with tempfile.TemporaryDirectory() as tmp_dir:
            dummy_builder = DummyBuilder(cache_dir=tmp_dir, name="dummy")
            dummy_builder._post_process = types.MethodType(
                _post_process, dummy_builder)
            os.makedirs(dummy_builder.cache_dir)

            dummy_builder.info.splits = SplitDict()
            dummy_builder.info.splits.add(SplitInfo("train", num_examples=10))
            dummy_builder.info.splits.add(SplitInfo("test", num_examples=10))

            for split in dummy_builder.info.splits:
                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"dummy_builder-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                ) as writer:
                    writer.write_batch({"text": ["foo"] * 10})
                    writer.finalize()

                with ArrowWriter(
                    path=os.path.join(dummy_builder.cache_dir,
                                      f"small_dataset-{split}.arrow"),
                    features=Features({"text": Value("string")}),
                ) as writer:
                    writer.write_batch({"text": ["foo"] * 2})
                    writer.finalize()

            dsets = dummy_builder.as_dataset()
            self.assertIsInstance(dsets, DatasetDict)
            self.assertListEqual(list(dsets.keys()), ["train", "test"])
            self.assertEqual(len(dsets["train"]), 2)
            self.assertEqual(len(dsets["test"]), 2)
            self.assertDictEqual(dsets["train"].features,
                                 Features({"text": Value("string")}))
            self.assertDictEqual(dsets["test"].features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dsets["train"].column_names, ["text"])
            self.assertListEqual(dsets["test"].column_names, ["text"])
            del dsets

            dset = dummy_builder.as_dataset("train")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train")
            self.assertEqual(len(dset), 2)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset

            dset = dummy_builder.as_dataset("train+test[:30%]")
            self.assertIsInstance(dset, Dataset)
            self.assertEqual(dset.split, "train+test[:30%]")
            self.assertEqual(len(dset), 2)
            self.assertDictEqual(dset.features,
                                 Features({"text": Value("string")}))
            self.assertListEqual(dset.column_names, ["text"])
            del dset