import pytest

from datasets import DatasetDict, Features, Value

from ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases


@pytest.mark.parametrize("keep_in_memory", [False, True])
def test_datasetdict_from_parquet_keep_in_memory(keep_in_memory, parquet_path, tmp_path):
    cache_dir = tmp_path / "cache"
    expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    # In-memory loading should allocate Arrow memory; memory-mapped loading from the cache should not.
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_parquet({"train": parquet_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
    _check_parquet_datasetdict(dataset, expected_features)
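# A minimal sketch of the shared _check_parquet_datasetdict helper that the tests in
# this file call; its exact assertions are assumptions based on the usual four-row,
# three-column parquet fixture these suites load from.
def _check_parquet_datasetdict(dataset_dict, expected_features, splits=("train",)):
    assert isinstance(dataset_dict, DatasetDict)
    for split in splits:
        dataset = dataset_dict[split]
        assert dataset.num_rows == 4
        assert dataset.num_columns == 3
        assert dataset.column_names == ["col_1", "col_2", "col_3"]
        for feature, expected_dtype in expected_features.items():
            assert dataset.features[feature].dtype == expected_dtype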
# Assumed parametrization: None exercises the branch that loads a default train/test mapping.
@pytest.mark.parametrize("split", [None, "train", "test"])
def test_datasetdict_from_parquet_split(split, parquet_path, tmp_path):
    if split:
        path = {split: parquet_path}
    else:
        split = "train"
        path = {"train": parquet_path, "test": parquet_path}
    cache_dir = tmp_path / "cache"
    expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    dataset = DatasetDict.from_parquet(path, cache_dir=cache_dir)
    _check_parquet_datasetdict(dataset, expected_features, splits=list(path.keys()))
    # Each returned split must report the name it was loaded under.
    assert all(dataset[split].split == split for split in path.keys())
# Assumed parametrization: None lets the schema be inferred; the dicts request explicit casts.
@pytest.mark.parametrize(
    "features",
    [
        None,
        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
        {"col_1": "string", "col_2": "string", "col_3": "string"},
        {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
        {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
    ],
)
def test_datasetdict_from_parquet_features(features, parquet_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = DatasetDict.from_parquet({"train": parquet_path}, features=features, cache_dir=cache_dir)
    _check_parquet_datasetdict(dataset, expected_features)
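# A minimal sketch of the parquet_path fixture the tests above assume; the real
# suite provides it from a shared conftest.py, so the file name and row values
# here are assumptions chosen to match the expected_features dicts.
import pyarrow as pa
import pyarrow.parquet as pq


@pytest.fixture
def parquet_path(tmp_path_factory):
    path = str(tmp_path_factory.mktemp("data") / "file.parquet")
    table = pa.Table.from_pydict(
        {
            "col_1": ["0", "1", "2", "3"],
            "col_2": [0, 1, 2, 3],
            "col_3": [0.0, 1.0, 2.0, 3.0],
        }
    )
    pq.write_table(table, path)
    return path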