Example #1
def test_construct_from_invalid_sources_raise(multisourcefs):
    child1 = ds.FileSystemDatasetFactory(
        multisourcefs,
        fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    child2 = ds.FileSystemDatasetFactory(
        multisourcefs,
        fs.FileSelector('/schema'),
        format=ds.ParquetFileFormat()
    )

    with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'):
        ds.dataset([child1, child2])

    expected = (
        "Expected a list of path-like or dataset objects. The given list "
        "contains the following types: int"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset([1, 2, 3])

    expected = (
        "Expected a path-like, list of path-likes or a list of Datasets "
        "instead of the given type: NoneType"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset(None)
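A minimal follow-up sketch (not part of the original test, same imports and fixtures assumed): the factories themselves are rejected, but the Dataset objects returned by each factory's .finish() are accepted as list elements.

    union = ds.dataset([child1.finish(), child2.finish()])
    assert isinstance(union, ds.Dataset)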
Example #2
    def _as_generator(self):
        info = self._local_fs_client.get_file_info([self.path])[0]
        if info.type == fs.FileType.NotFound:
            raise FileNotFoundError(f"file {self.path} not found")

        elif info.type == fs.FileType.File:
            for line in self._read_buffer_lines():
                yield line
        else:
            selector = fs.FileSelector(self.path)
            file_infos = self._local_fs_client.get_file_info(selector)
            for file_info in file_infos:
                if file_info.base_name.startswith((".", "_")):
                    continue
                assert (
                    file_info.is_file
                ), f"directory {self.path} contains a subdirectory: {file_info.path}"
                with io.TextIOWrapper(
                        buffer=self._local_fs_client.open_input_stream(
                            f"{self._address.file_path}/{file_info.path}"),
                        encoding="utf-8",
                ) as reader:
                    for line in reader:
                        yield line
Example #3
def main2():
    # By default, MinIO will listen for unencrypted HTTP traffic.
    minio = fs.S3FileSystem(scheme="http", endpoint_override="10.0.0.2:9000")

    # List all contents in a bucket, recursively
    file_selector = fs.FileSelector('customer-data-text', recursive=True)
    print_file_info(minio, file_selector)

    print(read_pafs_file(minio, 'customer-data-text/customer.csv'))
    print(read_pafs_stream(minio, 'customer-data-text/customer.csv'))

    endpoint_url = 'http://10.0.0.2:9000'
    print_boto3_buckets(endpoint_url)

    # TODO: read multiple files using dataset

    # https://stackoverflow.com/questions/45082832/how-to-read-partitioned-parquet-files-from-s3-using-pyarrow-in-python
    file_system = get_s3fs()
    print(file_system.ls('example-data'))

    bucket_uri = 's3://example-data/external-data'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)

    bucket_uri = 's3://example-data/external-clustered'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)
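A hedged sketch of the dataset-based read mentioned in the TODO above, assuming pyarrow.dataset is available as ds (a pyarrow version with CSV dataset support) and that the bucket holds CSV files:

    # Hypothetical continuation of main2(): scan every CSV object in the bucket.
    import pyarrow.dataset as ds
    csv_dataset = ds.dataset('customer-data-text', filesystem=minio, format='csv')
    print(csv_dataset.to_table().num_rows)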
Example #4
    def get(self, local_path):
        bucket, path = parsePath(local_path)
        print(bucket, path)
        fs_client = fs.LocalFileSystem()
        file_info_list = fs_client.get_file_info(
            fs.FileSelector(path, recursive=False))

        files = []
        dirs = []
        for info in file_info_list:
            if info.type == fs.FileType.File:
                files.append({
                    'name': info.base_name,
                    'ext': info.extension,
                    'size': info.size,
                    'mtime': info.mtime.isoformat()
                })
            elif info.type == fs.FileType.Directory:
                dirs.append({
                    'name': info.base_name,
                    'mtime': info.mtime.isoformat()
                })

        self.finish(json.dumps({'files': files, 'dirs': dirs}))
Example #5
def test_open_dataset_from_source_additional_kwargs(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(child, format="parquet")
Example #6
def test_partitioning_factory(mockfs):
    paths_or_selector = fs.FileSelector('subdir', recursive=True)
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key'])
    assert isinstance(partitioning_factory, ds.PartitioningFactory)
    options.partitioning_factory = partitioning_factory

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()
    # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths
    expected_schema = pa.schema([
        ("i64", pa.int64()),
        ("f64", pa.float64()),
        ("str", pa.string()),
        ("group", pa.int32()),
        ("key", pa.string()),
    ])
    assert inspected_schema.equals(expected_schema)

    hive_partitioning_factory = ds.HivePartitioning.discover()
    assert isinstance(hive_partitioning_factory, ds.PartitioningFactory)
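A small continuation sketch (not in the original test): the same factory can be materialized with finish(), and the schema discovered above should carry over to the resulting dataset.

    dataset = factory.finish()
    assert dataset.schema.equals(inspected_schema)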
Example #7
    def ls(self, path: str, recursive=False) -> List[File]:
        path = self._unwrap_path(path)
        files = []
        try:
            curr_path_info = self._client.get_file_info(path)
            res_files = []
            if curr_path_info.type == fs.FileType.File:
                res_files = [curr_path_info]
            elif curr_path_info.type == fs.FileType.Directory:
                res_files = self._client.get_file_info(
                    fs.FileSelector(path, recursive=recursive))

            for file in res_files:
                if file.type == fs.FileType.File:
                    files.append(
                        File(
                            path=self._wrap_path(file.path),
                            size=file.size,
                            # nanoseconds to seconds
                            mtime=int(file.mtime_ns / 1e9)))
        except RuntimeError as error:
            # Workaround: snakebite cannot handle generators
            if str(error) == 'generator raised StopIteration':
                pass
            else:
                raise
        return files
Example #8
    def test_fileinfo_list(self):
        fn = self.get_fresh_key() + "-listdir"
        self.write_file(fn, b"hello1")
        infs = self.s3.get_file_info(
            fs.FileSelector("bucket/", recursive=True))
        fns = [x.path for x in infs]
        self.assertIn(fn, fns)
Example #9
    def _as_generator(self):
        info = self._hdfs_client.get_file_info([self.path])[0]
        if info.type == fs.FileType.NotFound:
            raise FileNotFoundError(f"file {self.path} not found")

        elif info.type == fs.FileType.File:
            # todo:
            with io.TextIOWrapper(
                    buffer=self._hdfs_client.open_input_stream(self.path),
                    encoding="utf-8",
            ) as reader:
                for line in reader:
                    yield line
        else:
            selector = fs.FileSelector(os.path.join("/", self._address.path))
            file_infos = self._hdfs_client.get_file_info(selector)
            for file_info in file_infos:
                if file_info.base_name == "_SUCCESS":
                    continue
                assert (
                    file_info.is_file
                ), f"directory {self.path} contains a subdirectory: {file_info.path}"
                with io.TextIOWrapper(
                        buffer=self._hdfs_client.open_input_stream(
                            f"{self._address.name_node}/{file_info.path}"),
                        encoding="utf-8",
                ) as reader:
                    for line in reader:
                        yield line
Example #10
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema(
            [pa.field('group', pa.int32()),
             pa.field('key', pa.string())]))
    factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
    return factory.finish()
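A hedged usage sketch for the fixture above (hypothetical direct call with a mockfs instance), assuming a pyarrow version where ds.field and Dataset.to_table(filter=...) are available:

    # Hypothetical test body receiving the mockfs fixture: keep only rows
    # from the group=1 partition directory.
    table = dataset(mockfs).to_table(filter=ds.field('group') == 1)
    print(table.column_names)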
Example #11
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemDiscoveryOptions('subdir')
    options.partition_scheme = ds.SchemaPartitionScheme(
        pa.schema(
            [pa.field('group', pa.int32()),
             pa.field('key', pa.string())]))
    discovery = ds.FileSystemDataSourceDiscovery(mockfs, selector, format,
                                                 options)
    schema = discovery.inspect()
    source = discovery.finish()
    return ds.Dataset([source], schema)
Example #12
def test_dataset_union(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    factory = ds.UnionDatasetFactory([child])

    # TODO(bkietz) reintroduce factory.children property
    assert len(factory.inspect_schemas()) == 1
    assert all(isinstance(s, pa.Schema) for s in factory.inspect_schemas())
    assert factory.inspect_schemas()[0].equals(child.inspect())
    assert factory.inspect().equals(child.inspect())
    assert isinstance(factory.finish(), ds.Dataset)
Example #13
         pa.field('f64', pa.float64())])
    assert condition.validate(schema) == pa.bool_()

    i64_is_5 = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression('i64'),
                                       ds.ScalarExpression(5))
    i64_is_7 = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression('i64'),
                                       ds.ScalarExpression(7))
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))
    assert str(condition) == "(i64 > 5:int64)"


@pytest.mark.parametrize('paths_or_selector', [
    fs.FileSelector('subdir', recursive=True),
    [
        'subdir',
        'subdir/1',
        'subdir/1/xxx',
        'subdir/1/xxx/file0.parquet',
        'subdir/2',
        'subdir/2/yyy',
        'subdir/2/yyy/file1.parquet',
    ]
])
def test_file_system_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(