Exemplo n.º 1
0
def _check_dataset_from_path(path, table, **kwargs):
    """Assert that a dataset opened from ``path`` reproduces ``table``.

    The dataset is opened four ways: through ``ds.factory`` with the
    ``pathlib.Path`` object, with its string form, with ``path.name``
    while inside ``change_cwd(path.parent)``, and finally by handing the
    string form straight to ``ds.dataset``.  Each result must equal
    ``table`` in both schema and content.  Extra keyword arguments are
    forwarded unchanged to ``ds.factory`` / ``ds.dataset``.
    """
    import pathlib

    def _assert_matches(opened):
        # schema first, then the materialised rows
        assert opened.schema.equals(table.schema)
        # use_threads=False keeps the row order deterministic
        materialised = opened.to_table(use_threads=False)
        assert materialised.equals(table)

    # the caller has to hand in a pathlib object
    assert isinstance(path, pathlib.Path)
    _assert_matches(ds.dataset(ds.factory(path, **kwargs)))

    # same location, spelled as a plain string
    _assert_matches(ds.dataset(ds.factory(str(path), **kwargs)))

    # relative string path: only the final component, from the parent dir
    with change_cwd(path.parent):
        _assert_matches(ds.dataset(ds.factory(path.name, **kwargs)))

    # string path given directly to dataset(), without an explicit factory
    _assert_matches(ds.dataset(str(path), **kwargs))
Exemplo n.º 2
0
def test_open_dataset_list_of_files(tempdir):
    """Open an explicit list of files through ds.factory and compare data."""
    tables, (path1, path2) = _create_directory_of_files(tempdir)
    expected = pa.concat_tables(tables)

    # The explicit file list is routed through ds.factory(); according to
    # the original note, dataset() would interpret such a list as separate
    # sources instead.
    file_lists = (
        [path1, path2],
        [str(path1), str(path2)],
    )
    opened = [ds.dataset(ds.factory(files)) for files in file_lists]

    for dataset in opened:
        assert dataset.schema.equals(expected.schema)
        # use_threads=False keeps the row order deterministic
        loaded = dataset.to_table(use_threads=False)
        assert loaded.equals(expected)
Exemplo n.º 3
0
def test_dataset_factory(multisourcefs):
    """Check a UnionDatasetFactory built from a single child factory."""
    plain = ds.factory('/plain', filesystem=multisourcefs, format='parquet')
    union = ds.UnionDatasetFactory([plain])

    # TODO(bkietz) reintroduce factory.children property
    # exactly one schema is reported, and every entry is a pyarrow Schema
    assert len(union.inspect_schemas()) == 1
    for schema in union.inspect_schemas():
        assert isinstance(schema, pa.Schema)

    # both the single inspected schema and the union's own inspect()
    # result must equal what the child factory inspects
    assert union.inspect_schemas()[0].equals(plain.inspect())
    assert union.inspect().equals(plain.inspect())

    # finishing the factory must produce a Dataset
    assert isinstance(union.finish(), ds.Dataset)
Exemplo n.º 4
0
def test_multiple_factories(multisourcefs):
    """Assemble one dataset from three factories and check its schema."""
    # every factory reads parquet from the same fixture filesystem
    common = {'filesystem': multisourcefs, 'format': 'parquet'}
    factories = [
        ds.factory('/plain', **common),
        ds.factory('/schema', **common, partitioning=['week', 'color']),
        ds.factory('/hive', **common, partitioning='hive'),
    ]

    combined = ds.dataset(factories)
    assert isinstance(combined, ds.Dataset)

    # schema expected on the assembled dataset, in this field order
    fields = [
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ]
    assert combined.schema.equals(pa.schema(fields))
Exemplo n.º 5
0
def _check_dataset_from_path(path, table, **kwargs):
    """Assert that a dataset opened from ``path`` reproduces ``table``.

    The dataset is opened through ``ds.factory`` with the ``pathlib.Path``
    object, through ``ds.factory`` with its string form, and by passing
    the string form directly to ``ds.dataset``.  Schema and content are
    compared with ``check_metadata=False``.  Extra keyword arguments are
    forwarded unchanged to ``ds.factory`` / ``ds.dataset``.
    """
    import pathlib

    def _assert_matches(opened):
        # schema first, then the materialised rows; metadata is not compared
        assert opened.schema.equals(table.schema, check_metadata=False)
        # use_threads=False keeps the row order deterministic
        materialised = opened.to_table(use_threads=False)
        assert materialised.equals(table, check_metadata=False)

    # the caller has to hand in a pathlib object
    assert isinstance(path, pathlib.Path)
    _assert_matches(ds.dataset(ds.factory(path, **kwargs)))

    # same location, spelled as a plain string
    _assert_matches(ds.dataset(ds.factory(str(path), **kwargs)))

    # string path given directly to dataset(), without an explicit factory
    _assert_matches(ds.dataset(str(path), **kwargs))
Exemplo n.º 6
0
def _check_dataset_from_path(path, table, **kwargs):
    """Open ``path`` in several ways and check each dataset against ``table``.

    Every dataset is checked for its concrete type and then handed to
    ``_check_dataset`` together with ``table``.  A single path (via
    ``ds.factory`` or directly via ``ds.dataset``) must produce a
    ``ds.FileSystemDataset``; a list of paths, even with one element,
    must produce a ``ds.UnionDataset``.  Extra keyword arguments are
    forwarded unchanged to ``ds.factory`` / ``ds.dataset``.
    """
    import pathlib

    assert isinstance(path, pathlib.Path)
    # each scenario below runs with the pathlib object first, then the str
    spellings = (path, str(path))

    # through an explicit factory
    for source in spellings:
        dataset = ds.dataset(ds.factory(source, **kwargs))
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # relative string path: only the final component, from the parent dir
    with change_cwd(path.parent):
        dataset = ds.dataset(ds.factory(path.name, **kwargs))
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # passing directly to dataset
    for source in spellings:
        dataset = ds.dataset(source, **kwargs)
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # passing list of files (even of length-1) gives UnionDataset
    for source in spellings:
        dataset = ds.dataset([source], **kwargs)
        assert isinstance(dataset, ds.UnionDataset)
        _check_dataset(dataset, table)