import dask.dataframe as dd
import numpy as np
import pandas as pd

from spatialpandas import GeoDataFrame, GeoSeries
from spatialpandas.dask import DaskGeoDataFrame
from spatialpandas.io import read_parquet_dask


def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'points': GeoSeries(gp_multipoint[:n]),
        'lines': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })
    ddf = dd.from_pandas(df, npartitions=3)

    path = tmp_path / 'ddf.parq'
    ddf.to_parquet(path)
    ddf_read = read_parquet_dask(path)

    # Check type
    assert isinstance(ddf_read, DaskGeoDataFrame)

    # Check that partition bounds were loaded
    assert set(ddf_read._partition_bounds) == {'points', 'lines'}
    pd.testing.assert_frame_equal(
        ddf['points'].partition_bounds,
        ddf_read._partition_bounds['points'],
    )
    pd.testing.assert_frame_equal(
        ddf['lines'].partition_bounds,
        ddf_read._partition_bounds['lines'],
    )


def test_pack_partitions_to_parquet_glob(gp_multipoint1, gp_multiline1,
                                         gp_multipoint2, gp_multiline2,
                                         tmp_path):
    # Build dataframe1
    n = min(len(gp_multipoint1), len(gp_multiline1))
    df1 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint1[:n]),
        'lines': GeoSeries(gp_multiline1[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf1 = dd.from_pandas(df1, npartitions=3)
    path1 = tmp_path / 'ddf1.parq'
    ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

    # Build dataframe2
    n = min(len(gp_multipoint2), len(gp_multiline2))
    df2 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint2[:n]),
        'lines': GeoSeries(gp_multiline2[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf2 = dd.from_pandas(df2, npartitions=3)
    path2 = tmp_path / 'ddf2.parq'
    ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

    # Load both packed datasets with glob
    ddf_globbed = read_parquet_dask(tmp_path / "ddf*.parq", geometry="lines")

    # Check the number of partitions (< 7 can happen in the case of empty partitions)
    assert ddf_globbed.npartitions <= 7

    # Check contents
    expected_df = pd.concat([ddf_packed1.compute(), ddf_packed2.compute()])
    df_globbed = ddf_globbed.compute()
    pd.testing.assert_frame_equal(df_globbed, expected_df)

    # Check partition bounds
    expected_bounds = {
        'points': pd.concat([
            ddf_packed1._partition_bounds['points'],
            ddf_packed2._partition_bounds['points'],
        ]).reset_index(drop=True),
        'lines': pd.concat([
            ddf_packed1._partition_bounds['lines'],
            ddf_packed2._partition_bounds['lines'],
        ]).reset_index(drop=True),
    }
    expected_bounds['points'].index.name = 'partition'
    expected_bounds['lines'].index.name = 'partition'
    pd.testing.assert_frame_equal(
        expected_bounds['points'], ddf_globbed._partition_bounds['points'])
    pd.testing.assert_frame_equal(
        expected_bounds['lines'], ddf_globbed._partition_bounds['lines'])

    # Check active geometry column
    assert ddf_globbed.geometry.name == 'lines'


def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
                                    use_temp_format, tmp_path_factory):
    # mktemp returns a plain Path; Path objects are no longer usable as
    # context managers as of Python 3.13
    tmp_path = tmp_path_factory.mktemp("spatialpandas", numbered=True)

    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'points': GeoSeries(gp_multipoint[:n]),
        'lines': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf = dd.from_pandas(df, npartitions=3)

    path = tmp_path / 'ddf.parq'
    if use_temp_format:
        # tempdir_format is a template for per-partition scratch directories;
        # the {uuid} and {partition} fields are filled in by
        # pack_partitions_to_parquet
        (tmp_path / 'scratch').mkdir(parents=True, exist_ok=True)
        tempdir_format = str(tmp_path / 'scratch' / 'part-{uuid}-{partition:03d}')
    else:
        tempdir_format = None

    _retry_args = dict(
        wait_exponential_multiplier=10,
        wait_exponential_max=20000,
        stop_max_attempt_number=4,
    )

    ddf_packed = ddf.pack_partitions_to_parquet(
        str(path),
        npartitions=12,
        tempdir_format=tempdir_format,
        _retry_args=_retry_args,
    )

    # Check the number of partitions (< 12 can happen in the case of empty partitions)
    assert ddf_packed.npartitions <= 12

    # Check that rows are now sorted in order of hilbert distance
    total_bounds = df.lines.total_bounds
    hilbert_distances = ddf_packed.lines.map_partitions(
        lambda s: s.hilbert_distance(total_bounds=total_bounds)
    ).compute().values

    # Compute the expected (sorted) hilbert distances
    expected_distances = np.sort(
        df.lines.hilbert_distance(total_bounds=total_bounds).values)

    np.testing.assert_equal(expected_distances, hilbert_distances)
    assert ddf_packed.geometry.name == 'points'

    # Read a subset of the columns back
    columns = ['a', 'lines']
    ddf_read_cols = read_parquet_dask(path, columns=columns)
    pd.testing.assert_frame_equal(
        ddf_read_cols.compute(), ddf_packed[columns].compute())


def load_trajectory_table(root_path, year, month, epsg=2845, head=None):
    """Helper to load the trajectory table."""
    input_path = f"{root_path}/{year}/{month}"

    # Read parquet file(s) from disk
    ddf = read_parquet_dask(input_path)

    # Bring it all into memory, convert to a geopandas GeoDataFrame, and set CRS
    if head:
        gdf = ddf.head(head).to_geopandas().set_crs(f"EPSG:{epsg}")
    else:
        gdf = ddf.compute().to_geopandas().set_crs(f"EPSG:{epsg}")
    return gdf
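
# A minimal usage sketch for the helper above (the root path, year, and month
# are hypothetical; it assumes trajectory parquet data laid out under
# <root_path>/<year>/<month>):
#
#     gdf = load_trajectory_table("/data/trajectories", 2020, "01", head=1000)
#     print(gdf.crs)   # EPSG:2845, the default above
#     print(len(gdf))  # at most 1000 rows, since head was given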


def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
                                    use_temp_format, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'points': GeoSeries(gp_multipoint[:n]),
        'lines': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf = dd.from_pandas(df, npartitions=3)

    path = tmp_path / 'ddf.parq'
    if use_temp_format:
        tempdir_format = str(tmp_path / 'scratch' / 'part-{uuid}-{partition:03d}')
    else:
        tempdir_format = None

    ddf_packed = ddf.pack_partitions_to_parquet(
        path, npartitions=12, tempdir_format=tempdir_format)

    # Check the number of partitions (< 12 can happen in the case of empty partitions)
    assert ddf_packed.npartitions <= 12

    # Check that rows are now sorted in order of hilbert distance
    total_bounds = df.lines.total_bounds
    hilbert_distances = ddf_packed.lines.map_partitions(
        lambda s: s.hilbert_distance(total_bounds=total_bounds)
    ).compute().values

    # Compute the expected (sorted) hilbert distances
    expected_distances = np.sort(
        df.lines.hilbert_distance(total_bounds=total_bounds).values)

    np.testing.assert_equal(expected_distances, hilbert_distances)
    assert ddf_packed.geometry.name == 'points'

    # Read a subset of the columns back
    columns = ['a', 'lines']
    ddf_read_cols = read_parquet_dask(path, columns=columns + ['hilbert_distance'])
    pd.testing.assert_frame_equal(
        ddf_read_cols.compute(), ddf_packed[columns].compute())


def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path_factory):
    # mktemp returns a plain Path; Path objects are no longer usable as
    # context managers as of Python 3.13
    tmp_path = tmp_path_factory.mktemp("spatialpandas", numbered=True)

    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'points': GeoSeries(gp_multipoint[:n]),
        'lines': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })
    ddf = dd.from_pandas(df, npartitions=3)

    path = tmp_path / 'ddf.parq'
    ddf.to_parquet(str(path))
    ddf_read = read_parquet_dask(str(path))

    # Check type
    assert isinstance(ddf_read, DaskGeoDataFrame)

    # Check that partition bounds were loaded, skipping empty partitions
    nonempty = np.nonzero(
        np.asarray(ddf.map_partitions(len).compute() > 0))[0]
    assert set(ddf_read._partition_bounds) == {'points', 'lines'}

    expected_partition_bounds = (
        ddf['points'].partition_bounds.iloc[nonempty].reset_index(drop=True))
    expected_partition_bounds.index.name = 'partition'
    pd.testing.assert_frame_equal(
        expected_partition_bounds,
        ddf_read._partition_bounds['points'],
    )

    expected_partition_bounds = (
        ddf['lines'].partition_bounds.iloc[nonempty].reset_index(drop=True))
    expected_partition_bounds.index.name = 'partition'
    pd.testing.assert_frame_equal(
        expected_partition_bounds,
        ddf_read._partition_bounds['lines'],
    )

    assert ddf_read.geometry.name == 'points'


def test_pack_partitions_to_parquet_list_bounds(
        gp_multipoint1, gp_multiline1,
        gp_multipoint2, gp_multiline2,
        bounds, tmp_path,
):
    # Build dataframe1
    n = min(len(gp_multipoint1), len(gp_multiline1))
    df1 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint1[:n]),
        'lines': GeoSeries(gp_multiline1[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf1 = dd.from_pandas(df1, npartitions=3)
    path1 = tmp_path / 'ddf1.parq'
    ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

    # Build dataframe2
    n = min(len(gp_multipoint2), len(gp_multiline2))
    df2 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint2[:n]),
        'lines': GeoSeries(gp_multiline2[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf2 = dd.from_pandas(df2, npartitions=3)
    path2 = tmp_path / 'ddf2.parq'
    ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

    # Load both packed datasets as an explicit list of paths, clipped to bounds
    ddf_read = read_parquet_dask(
        [str(tmp_path / "ddf1.parq"), str(tmp_path / "ddf2.parq")],
        geometry="points", bounds=bounds)

    # Check the number of partitions (< 7 can happen in the case of empty partitions)
    assert ddf_read.npartitions <= 7

    # Check contents against a coordinate-range selection of the packed datasets
    xslice = slice(bounds[0], bounds[2])
    yslice = slice(bounds[1], bounds[3])
    expected_df = pd.concat([
        ddf_packed1.cx_partitions[xslice, yslice].compute(),
        ddf_packed2.cx_partitions[xslice, yslice].compute()
    ])
    df_read = ddf_read.compute()
    pd.testing.assert_frame_equal(df_read, expected_df)

    # Compute expected partition bounds
    points_bounds = pd.concat([
        ddf_packed1._partition_bounds['points'],
        ddf_packed2._partition_bounds['points'],
    ]).reset_index(drop=True)

    # Normalize the bounds so (x0, y0) is the lower-left corner, then keep
    # only the partitions whose bounds intersect the query rectangle
    x0, y0, x1, y1 = bounds
    x0, x1 = (x0, x1) if x0 <= x1 else (x1, x0)
    y0, y1 = (y0, y1) if y0 <= y1 else (y1, y0)
    partition_inds = ~(
        (points_bounds.x1 < x0) |
        (points_bounds.y1 < y0) |
        (points_bounds.x0 > x1) |
        (points_bounds.y0 > y1)
    )
    points_bounds = points_bounds[partition_inds].reset_index(drop=True)

    lines_bounds = pd.concat([
        ddf_packed1._partition_bounds['lines'],
        ddf_packed2._partition_bounds['lines'],
    ]).reset_index(drop=True)[partition_inds].reset_index(drop=True)

    points_bounds.index.name = 'partition'
    lines_bounds.index.name = 'partition'

    # Check partition bounds
    pd.testing.assert_frame_equal(
        points_bounds, ddf_read._partition_bounds['points'])
    pd.testing.assert_frame_equal(
        lines_bounds, ddf_read._partition_bounds['lines'])

    # Check active geometry column
    assert ddf_read.geometry.name == 'points'
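

# The tests above assume multipoint/multiline fixtures (gp_multipoint,
# gp_multiline, gp_multipoint1, ...). In spatialpandas itself these come from
# hypothesis-generated geopandas arrays; the following is only a minimal,
# deterministic pytest sketch with the same interface (names, sizes, and
# coordinates are illustrative assumptions, not the library's strategies).

import geopandas
import pytest
from shapely.geometry import MultiLineString, MultiPoint


@pytest.fixture
def gp_multipoint():
    # Five small multipoints spread along the diagonal
    return geopandas.GeoSeries(
        [MultiPoint([(i, i), (i + 1, i)]) for i in range(5)])


@pytest.fixture
def gp_multiline():
    # Five short multilines covering roughly the same extent
    return geopandas.GeoSeries(
        [MultiLineString([[(i, i), (i + 1, i + 1)]]) for i in range(5)])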