def mockfs(request):
    """Parameter set for the in-memory mock filesystem: identity path
    mapping, with copy/move/append operations all enabled."""
    return {
        'fs': _MockFileSystem(),
        'pathfn': lambda p: p,
        'allow_copy_file': True,
        'allow_move_dir': True,
        'allow_append_to_file': True,
    }
def test_mockfs_mtime_roundtrip(mockfs):
    """A file created on a mock FS pinned to a fixed time must report
    that exact mtime back through get_file_info."""
    pinned = datetime.fromtimestamp(1568799826, timezone.utc)
    mock = _MockFileSystem(pinned)

    # Creating the file is enough; no payload is needed for the mtime check.
    with mock.open_output_stream('foo'):
        pass

    [info] = mock.get_file_info(['foo'])
    assert info.mtime == pinned
def py_mockfs(request):
    """Parameter set like ``mockfs``, but with the mock filesystem wrapped
    in a Python-handler PyFileSystem proxy."""
    return {
        'fs': PyFileSystem(ProxyHandler(_MockFileSystem())),
        'pathfn': lambda p: p,
        'allow_copy_file': True,
        'allow_move_dir': True,
        'allow_append_to_file': True,
    }
def test_filesystem_equals():
    """Equality semantics of filesystem instances and subtree wrappers."""
    local_a = LocalFileSystem()
    local_b = LocalFileSystem()
    mock = _MockFileSystem()

    # equals() is reflexive and holds across equivalent local instances,
    # but rejects non-filesystem arguments outright.
    assert local_a.equals(local_a)
    assert local_a.equals(local_b)
    with pytest.raises(TypeError):
        local_a.equals('string')

    # == / != mirror equals(); unrelated objects compare unequal.
    assert local_a == local_a == local_b
    assert local_a != 4

    # Mock filesystems compare by identity, not by type.
    assert mock == mock
    assert mock != _MockFileSystem()

    # Subtree wrappers compare equal only when both base path and
    # underlying filesystem match.
    assert SubTreeFileSystem('/base', local_a) == SubTreeFileSystem('/base', local_a)
    assert SubTreeFileSystem('/base', local_a) != SubTreeFileSystem('/base', mock)
    assert SubTreeFileSystem('/base', local_a) != SubTreeFileSystem('/other', local_a)
def test_open_dataset_filesystem(tempdir):
    """Opening a single-file dataset with an inferred, explicit, and
    mismatched filesystem."""
    table, path = _create_single_file(tempdir)

    # Filesystem inferred from the path string.
    inferred = ds.dataset(str(path))
    assert inferred.schema.equals(table.schema, check_metadata=False)

    # Local filesystem passed explicitly.
    explicit = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
    assert explicit.schema.equals(table.schema, check_metadata=False)

    # A filesystem that does not contain the file must raise.
    with pytest.raises(FileNotFoundError):
        ds.dataset(str(path), filesystem=fs._MockFileSystem())
def multisourcefs(request):
    """Build a mock filesystem populated with four parquet data sources.

    The generated dataframe is split into four chunks, each written with a
    different layout:

    * ``plain``      -- a flat sequence of parquet files, no partitioning
    * ``schema``     -- directory partitioning by ISO week number and color
    * ``hive``       -- hive partitioning by year and month
    * ``hive_color`` -- hive partitioning by color

    Returns the populated ``_MockFileSystem``.
    """
    request.config.pyarrow.requires('pandas')
    request.config.pyarrow.requires('parquet')
    import pyarrow.parquet as pq

    df = _generate_data(1000)
    mockfs = fs._MockFileSystem()

    # simply split the dataframe into four chunks to construct a data source
    # from each chunk into its own directory
    df_a, df_b, df_c, df_d = np.array_split(df, 4)

    # create a directory containing a flat sequence of parquet files without
    # any partitioning involved
    mockfs.create_dir('plain')
    for i, chunk in enumerate(np.array_split(df_a, 10)):
        path = 'plain/chunk-{}.parquet'.format(i)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with schema partitioning by week and color.
    # NOTE: Series.dt.week was removed in pandas 2.0; isocalendar().week is
    # the equivalent ISO week number.
    mockfs.create_dir('schema')
    for part, chunk in df_b.groupby([df_b.date.dt.isocalendar().week,
                                     df_b.color]):
        folder = 'schema/{}/{}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with hive partitioning by year and month
    mockfs.create_dir('hive')
    for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]):
        folder = 'hive/year={}/month={}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with hive partitioning by color
    mockfs.create_dir('hive_color')
    for part, chunk in df_d.groupby(["color"]):
        folder = 'hive_color/color={}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    return mockfs
def test_open_dataset_filesystem(tempdir):
    """Opening a single-file dataset: filesystem inferred from the path,
    passed explicitly, used with a relative path, and mismatched."""
    table, path = _create_single_file(tempdir)

    # Filesystem inferred from the path string.
    inferred = ds.dataset(str(path))
    assert inferred.schema.equals(table.schema)

    # Local filesystem passed explicitly.
    explicit = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
    assert explicit.schema.equals(table.schema)

    # Relative path resolved against the current working directory.
    with change_cwd(tempdir):
        relative = ds.dataset("test.parquet", filesystem=fs.LocalFileSystem())
        assert relative.schema.equals(table.schema)

    # A filesystem that does not contain the file must raise.
    with pytest.raises(FileNotFoundError):
        ds.dataset(str(path), filesystem=fs._MockFileSystem())
def mockfs():
    """Mock filesystem holding one small parquet file in each of two
    nested directories (``subdir/1/xxx`` and ``subdir/2/yyy``)."""
    import pyarrow.parquet as pq

    mock = fs._MockFileSystem()

    # A tiny two-column table (int64 + float64, five rows) shared by
    # every written file.
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    columns = [list(range(5)), [float(v) for v in range(5)]]
    table = pa.Table.from_batches([pa.record_batch(columns, schema=schema)])

    for index, directory in enumerate(['subdir/1/xxx', 'subdir/2/yyy']):
        mock.create_dir(directory)
        path = '{}/file{}.parquet'.format(directory, index)
        with mock.open_output_stream(path) as out:
            pq.write_table(table, out)

    return mock
def test_type_name():
    """type_name identifies the concrete filesystem implementation."""
    assert LocalFileSystem().type_name == "local"
    assert _MockFileSystem().type_name == "mock"