示例#1
0
def test_text_dataset_reader(path_type, split, features, keep_in_memory,
                             text_path, tmp_path):
    """Read a text file into a single ``Dataset`` and check shape and schema.

    Args:
        path_type: ``str`` to hand the reader the bare path, ``list`` to hand
            it a one-element list containing the path.
        split: Split passed to the reader; when falsy, the resulting dataset
            is expected to report ``"train"``.
        features: Optional mapping of column name to dtype string. It is
            turned into the ``Features`` passed to the reader and doubles as
            the expected dtypes; when falsy, ``{"text": "string"}`` is
            expected.
        keep_in_memory: Passed to the reader; also selects which Arrow memory
            assertion wraps the read.
        text_path: Text file to read; the assertions expect 4 rows.
        tmp_path: Directory under which the ``cache`` directory is located.

    Raises:
        TypeError: If ``path_type`` is neither a ``str`` nor a ``list``
            subclass.
    """
    if issubclass(path_type, str):
        path = text_path
    elif issubclass(path_type, list):
        path = [text_path]
    else:
        # Without this branch ``path`` would be unbound and the reader call
        # below would fail with an unrelated UnboundLocalError.
        raise TypeError("Unsupported path_type: {!r}".format(path_type))
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    # Capture the expected dtypes before ``features`` is rebound to a
    # ``Features`` object.
    expected_features = features.copy() if features else {"text": "string"}
    if features:
        features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None

    memory_check = (assert_arrow_memory_increases
                    if keep_in_memory else assert_arrow_memory_doesnt_increase)
    with memory_check():
        dataset = TextDatasetReader(path,
                                    split=split,
                                    features=features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory).read()

    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == expected_split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
示例#2
0
def test_text_datasetdict_reader(split, features, keep_in_memory, text_path,
                                 tmp_path):
    """Read text files into a ``DatasetDict`` and verify one of its splits.

    When ``split`` is truthy the text file is loaded under that single split
    name; otherwise it is loaded under both ``"train"`` and ``"test"`` and the
    ``"train"`` entry is the one inspected. ``features`` (a mapping of column
    name to dtype string, or a falsy value) drives both the ``Features``
    passed to the reader and the dtypes expected afterwards.
    """
    if not split:
        split = "train"
        path = {"train": text_path, "test": text_path}
    else:
        path = {split: text_path}
    cache_dir = tmp_path / "cache"

    if features:
        expected_features = features.copy()
        reader_features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        # No explicit schema requested: a single string column is expected.
        expected_features = {"text": "string"}
        reader_features = None

    # Pick the Arrow memory assertion that matches ``keep_in_memory``.
    if keep_in_memory:
        memory_check = assert_arrow_memory_increases
    else:
        memory_check = assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = TextDatasetReader(path,
                                    features=reader_features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory).read()

    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for name, dtype in expected_features.items():
        assert dataset.features[name].dtype == dtype
示例#3
0
def test_csv_dataset_reader(path_type, split, features, keep_in_memory,
                            csv_path, tmp_path):
    """Read a CSV file into a single ``Dataset`` and check shape and schema.

    Args:
        path_type: ``str`` to hand the reader the bare path, ``list`` to hand
            it a one-element list containing the path.
        split: Split passed to the reader; when falsy, the resulting dataset
            is expected to report ``"train"``.
        features: Optional mapping of column name to dtype string. It is
            turned into the ``Features`` passed to the reader and doubles as
            the expected dtypes; when falsy, the CSV defaults below are
            expected.
        keep_in_memory: Passed to the reader; also selects which Arrow memory
            assertion wraps the read.
        csv_path: CSV file to read; the assertions expect 4 rows and the
            columns ``col_1``, ``col_2``, ``col_3``.
        tmp_path: Directory under which the ``cache`` directory is located.

    Raises:
        TypeError: If ``path_type`` is neither a ``str`` nor a ``list``
            subclass.
    """
    if issubclass(path_type, str):
        path = csv_path
    elif issubclass(path_type, list):
        path = [csv_path]
    else:
        # Without this branch ``path`` would be unbound and the reader call
        # below would fail with an unrelated UnboundLocalError.
        raise TypeError("Unsupported path_type: {!r}".format(path_type))
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    # The CSV file loses col_1's string dtype information, so the default
    # for it is "int64" instead of "string".
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64",
    }
    # Capture the expected dtypes before ``features`` is rebound to a
    # ``Features`` object.
    expected_features = (features.copy()
                         if features else default_expected_features)
    if features:
        features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None

    memory_check = (assert_arrow_memory_increases
                    if keep_in_memory else assert_arrow_memory_doesnt_increase)
    with memory_check():
        dataset = CsvDatasetReader(path,
                                   split=split,
                                   features=features,
                                   cache_dir=cache_dir,
                                   keep_in_memory=keep_in_memory).read()

    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
示例#4
0
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path,
                              tmp_path):
    """Build a ``DatasetDict`` via ``DatasetDict.from_csv`` and verify a split.

    When ``split`` is truthy the CSV file is loaded under that single split
    name; otherwise it is loaded under both ``"train"`` and ``"test"`` and the
    ``"train"`` entry is the one inspected. ``features`` (a mapping of column
    name to dtype string, or a falsy value) drives both the ``Features``
    passed to ``from_csv`` and the dtypes expected afterwards.
    """
    if not split:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    else:
        path = {split: csv_path}
    cache_dir = tmp_path / "cache"

    if features:
        expected_features = features.copy()
        csv_features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        # Round-tripping through CSV drops col_1's string dtype, so it is
        # expected back as "int64" rather than "string" by default.
        expected_features = {
            "col_1": "int64",
            "col_2": "int64",
            "col_3": "float64",
        }
        csv_features = None

    # Pick the Arrow memory assertion that matches ``keep_in_memory``.
    if keep_in_memory:
        memory_check = assert_arrow_memory_increases
    else:
        memory_check = assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = DatasetDict.from_csv(path,
                                       features=csv_features,
                                       cache_dir=cache_dir,
                                       keep_in_memory=keep_in_memory)

    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for name, dtype in expected_features.items():
        assert dataset.features[name].dtype == dtype
示例#5
0
def test_datasetdict_from_json(
    split,
    features,
    keep_in_memory,
    jsonl_path,
    tmp_path,
):
    """Build a ``DatasetDict`` via ``DatasetDict.from_json`` and verify a split.

    When ``split`` is truthy the JSON Lines file is loaded under that single
    split name; otherwise it is loaded under both ``"train"`` and ``"test"``
    and the ``"train"`` entry is the one inspected. ``features`` (a mapping of
    column name to dtype string, or a falsy value) drives both the
    ``Features`` passed to ``from_json`` and the dtypes expected afterwards.
    ``field`` is always passed as ``None``.
    """
    if not split:
        split = "train"
        path = {"train": jsonl_path, "test": jsonl_path}
    else:
        path = {split: jsonl_path}
    cache_dir = tmp_path / "cache"

    if features:
        expected_features = features.copy()
        json_features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        expected_features = {
            "col_1": "string",
            "col_2": "int64",
            "col_3": "float64",
        }
        json_features = None

    # Pick the Arrow memory assertion that matches ``keep_in_memory``.
    if keep_in_memory:
        memory_check = assert_arrow_memory_increases
    else:
        memory_check = assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = DatasetDict.from_json(
            path,
            features=json_features,
            cache_dir=cache_dir,
            keep_in_memory=keep_in_memory,
            field=None,
        )

    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for name, dtype in expected_features.items():
        assert dataset.features[name].dtype == dtype
示例#6
0
def test_json_dataset_reader(
    path_type,
    split,
    features,
    keep_in_memory,
    jsonl_path,
    tmp_path,
):
    """Read a JSON Lines file into a ``Dataset`` and check shape and schema.

    Args:
        path_type: ``str`` to hand the reader the bare path, ``list`` to hand
            it a one-element list containing the path.
        split: Split passed to the reader; when falsy, the resulting dataset
            is expected to report ``"train"``.
        features: Optional mapping of column name to dtype string. It is
            turned into the ``Features`` passed to the reader and doubles as
            the expected dtypes; when falsy, the JSON defaults below are
            expected.
        keep_in_memory: Passed to the reader; also selects which Arrow memory
            assertion wraps the read.
        jsonl_path: JSON Lines file to read; the assertions expect 4 rows and
            the columns ``col_1``, ``col_2``, ``col_3``.
        tmp_path: Directory under which the ``cache`` directory is located.

    Raises:
        TypeError: If ``path_type`` is neither a ``str`` nor a ``list``
            subclass.
    """
    if issubclass(path_type, str):
        path = jsonl_path
    elif issubclass(path_type, list):
        path = [jsonl_path]
    else:
        # Without this branch ``path`` would be unbound and the reader call
        # below would fail with an unrelated UnboundLocalError.
        raise TypeError("Unsupported path_type: {!r}".format(path_type))
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    default_expected_features = {
        "col_1": "string",
        "col_2": "int64",
        "col_3": "float64",
    }
    # Capture the expected dtypes before ``features`` is rebound to a
    # ``Features`` object.
    expected_features = (features.copy()
                         if features else default_expected_features)
    if features:
        features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None

    memory_check = (assert_arrow_memory_increases
                    if keep_in_memory else assert_arrow_memory_doesnt_increase)
    with memory_check():
        # ``field`` is passed explicitly as None, as in the original call.
        dataset = JsonDatasetReader(path,
                                    split=split,
                                    features=features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory,
                                    field=None).read()

    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
示例#7
0
def test_csv_datasetdict_reader(split, features, keep_in_memory, csv_path,
                                tmp_path):
    """Read CSV files into a ``DatasetDict`` and verify one of its splits.

    Args:
        split: When truthy, the CSV file is loaded under this single split
            name. Otherwise it is loaded under both ``"train"`` and
            ``"test"`` and the ``"train"`` entry is inspected.
        features: Optional mapping of column name to dtype string. It is
            turned into the ``Features`` passed to the reader and doubles as
            the expected dtypes; when falsy, the CSV defaults below are
            expected.
        keep_in_memory: Passed to the reader; also selects which Arrow memory
            assertion wraps the read.
        csv_path: CSV file to read; the assertions expect 4 rows and the
            columns ``col_1``, ``col_2``, ``col_3``.
        tmp_path: Directory under which the ``cache`` directory is located.
    """
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"

    # The CSV file loses col_1's string dtype information, so the default
    # for it is "int64" instead of "string".
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64",
    }
    # Capture the expected dtypes before ``features`` is rebound to a
    # ``Features`` object.
    expected_features = (features.copy()
                         if features else default_expected_features)
    if features:
        features = Features(
            {name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None

    # Use the same memory-assertion context managers as the other reader
    # tests in this file.
    # NOTE(review): presumably these check the same Arrow allocation delta
    # the manual bookkeeping did -- confirm against their definitions.
    memory_check = (assert_arrow_memory_increases
                    if keep_in_memory else assert_arrow_memory_doesnt_increase)
    with memory_check():
        dataset = CsvDatasetReader(path,
                                   features=features,
                                   cache_dir=cache_dir,
                                   keep_in_memory=keep_in_memory).read()

    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype