示例#1
0
def test_open_dataset_list_of_files(tempdir):
    """Open a dataset from an explicit file list, as pathlib and str paths."""
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    # An exact list of files has to be wrapped in source(); passing the list
    # straight to dataset() would treat each entry as a separate source.
    datasets = [
        ds.dataset(ds.source([path1, path2])),
        ds.dataset(ds.source([str(path1), str(path2)])),
    ]
    for opened in datasets:
        assert opened.schema.equals(table.schema, check_metadata=False)
        scanned = opened.new_scan().finish().to_table()
        assert scanned.replace_schema_metadata().equals(table)
示例#2
0
def test_open_dataset_list_of_files(tempdir):
    """Open a dataset from an explicit file list, as pathlib and str paths."""
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    table = pa.concat_tables(tables)

    # An exact list of files has to be wrapped in source(); dataset() would
    # interpret a bare list as a list of separate sources.
    path_variants = (
        [path1, path2],
        [str(path1), str(path2)],
    )
    for paths in path_variants:
        opened = ds.dataset(ds.source(paths))
        assert opened.schema.equals(table.schema, check_metadata=False)
        # single-threaded scan keeps the row order deterministic
        result = opened.to_table(use_threads=False)
        assert result.equals(table, check_metadata=False)
示例#3
0
def test_dataset_factory(multisourcefs):
    """Build a DatasetFactory from one source and exercise its inspection API."""
    src = ds.source('/plain', filesystem=multisourcefs, format='parquet')
    factory = ds.DatasetFactory([src])

    assert len(factory.sources) == 1
    assert len(factory.inspect_schemas()) == 1
    # every child factory is a SourceFactory, every inspected schema a Schema
    for child in factory.sources:
        assert isinstance(child, ds.SourceFactory)
    for schema in factory.inspect_schemas():
        assert isinstance(schema, pa.Schema)
    # the factory's view agrees with inspecting the source directly
    assert factory.inspect_schemas()[0].equals(src.inspect())
    assert factory.inspect().equals(src.inspect())
    assert isinstance(factory.finish(), ds.Dataset)
示例#4
0
def _check_dataset_from_path(path, table, **kwargs):
    """Check that *path* opens to a dataset whose contents match *table*.

    The path is exercised three ways: as a pathlib object via source(),
    as a string via source(), and as a string passed directly to dataset().
    """
    import pathlib

    assert isinstance(path, pathlib.Path)

    def _verify(dataset):
        # Scan the dataset and compare against the expected table.
        assert dataset.schema.equals(table.schema, check_metadata=False)
        result = dataset.new_scan().finish().to_table()
        assert result.replace_schema_metadata().equals(table)

    # pathlib object through source()
    _verify(ds.dataset(ds.source(path, **kwargs)))
    # string path through source()
    _verify(ds.dataset(ds.source(str(path), **kwargs)))
    # string path passed directly to dataset()
    _verify(ds.dataset(str(path), **kwargs))
示例#5
0
def _check_dataset_from_path(path, table, **kwargs):
    """Check that *path* opens to a dataset whose contents match *table*.

    The path is exercised three ways: as a pathlib object via source(),
    as a string via source(), and as a string passed directly to dataset().
    """
    import pathlib

    assert isinstance(path, pathlib.Path)

    openers = (
        lambda: ds.dataset(ds.source(path, **kwargs)),       # pathlib object
        lambda: ds.dataset(ds.source(str(path), **kwargs)),  # string via source()
        lambda: ds.dataset(str(path), **kwargs),             # string, direct
    )
    for opener in openers:
        dataset = opener()
        assert dataset.schema.equals(table.schema, check_metadata=False)
        # single-threaded scan keeps the row order deterministic
        result = dataset.to_table(use_threads=False)
        assert result.equals(table, check_metadata=False)
示例#6
0
def test_multiple_sources(multisourcefs):
    """Assemble three differently-partitioned sources into one dataset."""
    plain = ds.source('/plain', filesystem=multisourcefs, format='parquet')
    directory_part = ds.source('/schema',
                               filesystem=multisourcefs,
                               format='parquet',
                               partitioning=['week', 'color'])
    hive_part = ds.source('/hive',
                          filesystem=multisourcefs,
                          format='parquet',
                          partitioning='hive')

    assembled = ds.dataset([plain, directory_part, hive_part])
    assert isinstance(assembled, ds.Dataset)

    # Unified schema: the data columns plus every partition field seen above.
    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('month', pa.int32()),
        ('year', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema, check_metadata=False)
示例#7
0
def test_open_dataset_from_source_additional_kwargs(tempdir):
    """dataset() must reject extra kwargs when given an already-built source."""
    _, path = _create_single_file(tempdir)
    source = ds.source(path)
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(source, format="parquet")