def test_concatenate(self):
    data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
    info1 = DatasetInfo(description="Dataset1")
    info2 = DatasetInfo(description="Dataset2")
    dset1, dset2, dset3 = (
        Dataset.from_dict(data1, info=info1),
        Dataset.from_dict(data2, info=info2),
        Dataset.from_dict(data3),
    )

    dset_concat = concatenate_datasets([dset1, dset2, dset3])
    self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
    self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2\n\n")

def test_read(self):
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    split_dict.add(train_info)
    split_dict.add(test_info)
    info = DatasetInfo(splits=split_dict)
    reader = ReaderTest("", info)

    # A single instruction returns a single dataset: 33% of the 100-example test split.
    instructions = "test[:33%]"
    dset = reader.read(name, instructions, split_infos)
    self.assertEqual(dset["filename"][0], f"{name}-test")
    self.assertEqual(dset.num_rows, 33)
    self.assertEqual(dset.num_columns, 1)

    # A list of instructions returns one dataset per instruction.
    instructions = ["train", "test[:33%]"]
    train_dset, test_dset = reader.read(name, instructions, split_infos)
    self.assertEqual(train_dset["filename"][0], f"{name}-train")
    self.assertEqual(train_dset.num_rows, 100)
    self.assertEqual(train_dset.num_columns, 1)
    self.assertEqual(test_dset["filename"][0], f"{name}-test")
    self.assertEqual(test_dset.num_rows, 33)
    self.assertEqual(test_dset.num_columns, 1)

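# NOTE: hypothetical sketch. The ReaderTest helper used by the reader tests above is
# defined elsewhere in the test module and is not shown in this excerpt. A minimal mock
# could look like the class below: it fakes file reads by returning an in-memory Arrow
# table with a single "filename" column of 100 rows per file, which is what the
# row-count and column-count assertions rely on. The hook name
# (_get_dataset_from_filename), the Dataset construction, and the lack of path
# normalization (the real helper reduces the raw file path to a bare "{name}-{split}"
# string) are assumptions for illustration and have varied across library versions.
# Assumed imports: import pyarrow as pa
#                  from datasets import Dataset
#                  from datasets.arrow_reader import BaseReader
class ReaderTestSketch(BaseReader):
    def _get_dataset_from_filename(self, filename_skip_take):
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        # One fake file = 100 identical rows carrying the file name.
        pa_table = pa.Table.from_pydict({"filename": [filename] * 100})
        if skip is not None and take is not None:
            pa_table = pa_table.slice(skip, take)
        return Dataset(arrow_table=pa_table)
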
def test_from_pandas(self):
    data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
    df = pd.DataFrame.from_dict(data)

    # Without explicit features, the column types are inferred from the DataFrame dtypes.
    dset = Dataset.from_pandas(df)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
    self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

    # Passing matching features explicitly yields the same schema.
    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    dset = Dataset.from_pandas(df, features=features)
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
    self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

    # Features can also be passed through a DatasetInfo.
    features = Features({"col_1": Value("int64"), "col_2": Value("string")})
    dset = Dataset.from_pandas(df, features=features, info=DatasetInfo(features=features))
    self.assertListEqual(dset["col_1"], data["col_1"])
    self.assertListEqual(dset["col_2"], data["col_2"])
    self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"])
    self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")}))

    # Incompatible features (an int column declared as string) raise an Arrow type error.
    features = Features({"col_1": Value("string"), "col_2": Value("string")})
    self.assertRaises(pa.ArrowTypeError, Dataset.from_pandas, df, features=features)

def test_concatenate(self):
    data1, data2, data3 = {"id": [0, 1, 2]}, {"id": [3, 4, 5]}, {"id": [6, 7]}
    dset1, dset2, dset3 = Dataset.from_dict(data1), Dataset.from_dict(data2), Dataset.from_dict(data3)
    dset1._info = DatasetInfo(description="Dataset1")
    dset2._info = DatasetInfo(description="Dataset2")
    dset3._info = None

    dset_concat = concatenate_datasets([dset1, dset2, dset3])
    self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
    self.assertEqual(dset_concat.info.description, "Dataset1\n\nDataset2")

def _create_dummy_dataset(self):
    name = "my_name"
    train_info = SplitInfo(name="train", num_examples=30)
    test_info = SplitInfo(name="test", num_examples=30)
    split_infos = [train_info, test_info]
    split_dict = SplitDict()
    split_dict.add(train_info)
    split_dict.add(test_info)
    info = DatasetInfo(splits=split_dict)
    reader = ReaderTester("", info)
    dset = reader.read(name, "train", split_infos)
    return dset

def test_read_files(self):
    train_info = SplitInfo(name="train", num_examples=100)
    test_info = SplitInfo(name="test", num_examples=100)
    split_dict = SplitDict()
    split_dict.add(train_info)
    split_dict.add(test_info)
    info = DatasetInfo(splits=split_dict)
    reader = ReaderTest("", info)

    # Read the full train file (100 rows) plus a 10-row slice of the test file.
    files = [{"filename": "train"}, {"filename": "test", "skip": 10, "take": 10}]
    dset = reader.read_files(files, original_instructions="")
    self.assertEqual(dset.num_rows, 110)
    self.assertEqual(dset.num_columns, 1)
    self.assertEqual(dset._data_files, files)

def _info(self):
    return DatasetInfo(features=Features({"text": Value("string")}))
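
# NOTE: hypothetical sketch. The _info above belongs to a dummy builder defined in the
# test module; a minimal GeneratorBasedBuilder wired to the same features could look
# like this. The class name, split choice, and generated rows are illustrative
# assumptions, not the test suite's own builder.
# Assumed imports: from datasets import (DatasetInfo, Features, Value, Split,
#                                        SplitGenerator, GeneratorBasedBuilder)
class DummyTextBuilder(GeneratorBasedBuilder):
    def _info(self):
        return DatasetInfo(features=Features({"text": Value("string")}))

    def _split_generators(self, dl_manager):
        # Nothing to download: a single train split generated in memory.
        return [SplitGenerator(name=Split.TRAIN)]

    def _generate_examples(self):
        # Yield (key, example) pairs matching the features declared in _info.
        for i in range(10):
            yield i, {"text": f"example {i}"}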