@classmethod
def init(cls, eltype, data, kdims, vdims):
    import pandas as pd
    from spatialpandas import GeoDataFrame, GeoSeries

    if kdims is None:
        kdims = eltype.kdims
    if vdims is None:
        vdims = eltype.vdims

    if isinstance(data, GeoSeries):
        data = data.to_frame()
    if 'geopandas' in sys.modules:
        import geopandas as gpd
        if isinstance(data, gpd.GeoSeries):
            data = data.to_frame()
        if isinstance(data, gpd.GeoDataFrame):
            data = GeoDataFrame(data)

    if isinstance(data, list):
        if 'shapely' in sys.modules:
            data = from_shapely(data)
        if isinstance(data, list):
            data = from_multi(eltype, data, kdims, vdims)
    elif not isinstance(data, GeoDataFrame):
        raise ValueError(
            "SpatialPandasInterface only supports spatialpandas DataFrames."
        )
    elif 'geometry' not in data:
        cls.geo_column(data)

    index_names = data.index.names if isinstance(
        data, pd.DataFrame) else [data.index.name]
    if index_names == [None]:
        index_names = ['index']

    for kd in kdims + vdims:
        kd = dimension_name(kd)
        if kd in data.columns:
            continue
        if any(kd == ('index' if name is None else name)
               for name in index_names):
            data = data.reset_index()
            break

    return data, {'kdims': kdims, 'vdims': vdims}, {}
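A hedged sketch of what this interface method enables: constructing a HoloViews element directly from a spatialpandas GeoDataFrame. It assumes holoviews is installed with the spatialpandas interface registered; the geometries and the `value` column are illustrative, not from the source.

import pandas as pd
import holoviews as hv
from spatialpandas import GeoDataFrame

# Hypothetical data: two multipoint geometries plus a value column.
gdf = GeoDataFrame({
    'geometry': pd.array([[0, 0, 1, 1], [2, 2, 3, 3]], dtype='multipoint'),
    'value': [1, 2],
})
points = hv.Points(gdf, vdims=['value'])  # init() above wraps the frame
print(type(points.data))  # the spatialpandas GeoDataFrame is kept as-is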
def test_multipoint_cx_frame_selection(gp_multipoint, rect):
    x0, y0, x1, y1 = rect
    expected = GeoDataFrame(
        GeoSeries(gp_multipoint.cx[x0:x1, y0:y1], dtype='multipoint'))
    sp_multipoint = GeoSeries(gp_multipoint).to_frame()
    result = sp_multipoint.cx[x0:x1, y0:y1]
    assert_frame_equal(expected, result, obj='GeoDataFrame')
def test_multipoint_cx_frame_selection_dask(gp_multipoint, rect):
    x0, y0, x1, y1 = rect
    expected = GeoDataFrame(
        GeoSeries(gp_multipoint.cx[x0:x1, y0:y1], dtype='multipoint')
    )
    sp_multipoint = dd.from_pandas(GeoSeries(gp_multipoint).to_frame(),
                                   npartitions=3)
    result = sp_multipoint.cx[x0:x1, y0:y1].compute()
    assert_frame_equal(expected, result, obj='GeoDataFrame')
def read_parquet(path, columns=None):
    # Load using standard pandas read_parquet
    result = pd_read_parquet(path, engine="auto", columns=columns)

    # Import geometry columns, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(path)
    geom_cols = _get_geometry_columns(metadata)
    if geom_cols:
        result = _import_geometry_columns(result, geom_cols)

    # Return result
    return GeoDataFrame(result)
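A minimal round-trip sketch pairing this reader with the matching `to_parquet` writer, via the public `spatialpandas.io` entry points; the frame contents and the file name are illustrative assumptions.

import pandas as pd
from spatialpandas import GeoDataFrame
from spatialpandas.io import to_parquet, read_parquet

# Hypothetical frame with one geometry column and one plain column.
df = GeoDataFrame({
    'a': [0, 1],
    'multipoint': pd.array([[0, 0, 1, 1], [2, 2, 3, 3]], dtype='multipoint'),
})
to_parquet(df, 'example.parq')            # writes geometry metadata too
df_back = read_parquet('example.parq')    # geometry columns re-imported
assert isinstance(df_back, GeoDataFrame)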
def load_kinsa():
    print('Loading Kinsa Data')
    county_df = gpd.read_file(path.expanduser('~/kinsa_geometry.gpkg'),
                              layer='county')
    oneday_df = ddf.read_csv('s3://makepath-demo/kinsa/one_day.csv').compute()
    oneday_df['region_id'] = oneday_df['region_id'].astype(str).str.zfill(5)
    county_df['GEOID'] = county_df['GEOID'].astype(str).str.zfill(5)
    fields = ['GEOID', 'geometry', 'forecast_upper']
    county_df = county_df.join(oneday_df.set_index('region_id'),
                               on='GEOID')[fields]
    return GeoDataFrame(county_df, geometry='geometry')
def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
                                    use_temp_format, tmp_path_factory):
    with tmp_path_factory.mktemp("spatialpandas", numbered=True) as tmp_path:
        # Build dataframe
        n = min(len(gp_multipoint), len(gp_multiline))
        df = GeoDataFrame({
            'points': GeoSeries(gp_multipoint[:n]),
            'lines': GeoSeries(gp_multiline[:n]),
            'a': list(range(n))
        }).set_geometry('lines')
        ddf = dd.from_pandas(df, npartitions=3)

        path = tmp_path / 'ddf.parq'
        if use_temp_format:
            (tmp_path / 'scratch').mkdir(parents=True, exist_ok=True)
            tempdir_format = str(
                tmp_path / 'scratch' / 'part-{uuid}-{partition:03d}')
        else:
            tempdir_format = None

        _retry_args = dict(
            wait_exponential_multiplier=10,
            wait_exponential_max=20000,
            stop_max_attempt_number=4,
        )

        ddf_packed = ddf.pack_partitions_to_parquet(
            str(path),
            npartitions=12,
            tempdir_format=tempdir_format,
            _retry_args=_retry_args,
        )

        # Check the number of partitions (< 12 can happen in the case of
        # empty partitions)
        assert ddf_packed.npartitions <= 12

        # Check that rows are now sorted in order of hilbert distance
        total_bounds = df.lines.total_bounds
        hilbert_distances = ddf_packed.lines.map_partitions(
            lambda s: s.hilbert_distance(total_bounds=total_bounds)
        ).compute().values

        # Compute expected hilbert distances
        expected_distances = np.sort(
            df.lines.hilbert_distance(total_bounds=total_bounds).values)
        np.testing.assert_equal(expected_distances, hilbert_distances)
        assert ddf_packed.geometry.name == 'points'

        # Read columns
        columns = ['a', 'lines']
        ddf_read_cols = read_parquet_dask(path, columns=columns)
        pd.testing.assert_frame_equal(ddf_read_cols.compute(),
                                      ddf_packed[columns].compute())
@classmethod
def split(cls, dataset, start, end, datatype, **kwargs):
    from spatialpandas import GeoDataFrame, GeoSeries
    from ...element import Polygons

    objs = []
    if not len(dataset.data):
        return []
    xdim, ydim = cls.geom_dims(dataset)
    value_dims = [
        dim for dim in dataset.kdims + dataset.vdims
        if dim not in (xdim, ydim)
    ]
    row = dataset.data.iloc[0]
    col = cls.geo_column(dataset.data)
    geom_type = cls.geom_type(dataset)
    if datatype is not None:
        arr = geom_to_array(row[col], geom_type=geom_type)
        d = {(xdim.name, ydim.name): arr}
        d.update({dim.name: row[dim.name] for dim in value_dims})
        ds = dataset.clone(d, datatype=['dictionary'])

    holes = cls.holes(dataset) if cls.has_holes(dataset) else None
    for i, row in dataset.data.iterrows():
        if datatype is None:
            gdf = GeoDataFrame({
                c: GeoSeries([row[c]]) if c == 'geometry' else [row[c]]
                for c in dataset.data.columns
            })
            objs.append(dataset.clone(gdf))
            continue

        geom = row[col]
        gt = geom_type or get_geom_type(dataset.data, col)
        arr = geom_to_array(geom, geom_type=gt)
        d = {xdim.name: arr[:, 0], ydim.name: arr[:, 1]}
        d.update({dim.name: row[dim.name] for dim in value_dims})
        if datatype in ('dictionary', 'columns'):
            if holes is not None:
                d[Polygons._hole_key] = holes[i]
            d['geom_type'] = gt
            objs.append(d)
            continue

        ds.data = d
        if datatype == 'array':
            obj = ds.array(**kwargs)
        elif datatype == 'dataframe':
            obj = ds.dframe(**kwargs)
        else:
            raise ValueError("%s datatype not supported" % datatype)
        objs.append(obj)
    return objs
def test_dataframe_slice_types():
    gdf = GeoDataFrame({
        'a': [3, 2, 1],
        'b': [10, 11, 12],
        'points': pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]],
                           dtype='multipoint'),
        'line': pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]],
                         dtype='line'),
    })

    assert isinstance(gdf['a'], pd.Series)
    assert isinstance(gdf['points'], GeoSeries)
    assert isinstance(gdf[['a', 'b']], pd.DataFrame)
    assert isinstance(gdf[['a', 'line']], GeoDataFrame)
def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    df_read = read_parquet(path)
    assert isinstance(df_read, GeoDataFrame)
    assert all(df == df_read)
def test_parquet_columns(gp_point, gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    columns = ['a', 'multiline']
    df_read = read_parquet(str(path), columns=columns)
    assert isinstance(df_read, GeoDataFrame)
    pd.testing.assert_frame_equal(df[columns], df_read)
def _to_spatialpandas(
    column: List[Union[int, float]],
    polygon_points: List[np.ndarray],
    column_name: str,
):
    from spatialpandas import GeoDataFrame
    from spatialpandas.geometry import PolygonArray

    # spatialpandas expects 1d numpy arrays.
    for i, arrays in enumerate(polygon_points):
        polygon_points[i] = \
            list(map(lambda array: np.reshape(array, -1), arrays))

    df = GeoDataFrame({
        column_name: column,
        "geometry": PolygonArray(polygon_points)
    })
    return df
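A hedged usage sketch for the helper above; the ring coordinates and the `value` column name are illustrative. Each polygon is passed as a list of (n, 2) ring arrays, which the helper flattens into the 1d layout `PolygonArray` expects.

import numpy as np

# Hypothetical input: one CCW square and one CCW triangle, one ring each.
square = [np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.], [0., 0.]])]
triangle = [np.array([[2., 2.], [3., 2.], [2.5, 3.], [2., 2.]])]
gdf = _to_spatialpandas(
    column=[10.0, 20.0],
    polygon_points=[square, triangle],
    column_name='value',
)
print(gdf.geometry.area)  # [1.0, 0.5]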
def test_pack_partitions_to_parquet(gp_multipoint, gp_multiline,
                                    use_temp_format, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'points': GeoSeries(gp_multipoint[:n]),
        'lines': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf = dd.from_pandas(df, npartitions=3)

    path = tmp_path / 'ddf.parq'
    if use_temp_format:
        tempdir_format = str(
            tmp_path / 'scratch' / 'part-{uuid}-{partition:03d}')
    else:
        tempdir_format = None

    ddf_packed = ddf.pack_partitions_to_parquet(path,
                                                npartitions=12,
                                                tempdir_format=tempdir_format)

    # Check the number of partitions (< 12 can happen in the case of empty
    # partitions)
    assert ddf_packed.npartitions <= 12

    # Check that rows are now sorted in order of hilbert distance
    total_bounds = df.lines.total_bounds
    hilbert_distances = ddf_packed.lines.map_partitions(
        lambda s: s.hilbert_distance(total_bounds=total_bounds)
    ).compute().values

    # Compute expected hilbert distances
    expected_distances = np.sort(
        df.lines.hilbert_distance(total_bounds=total_bounds).values)
    np.testing.assert_equal(expected_distances, hilbert_distances)
    assert ddf_packed.geometry.name == 'points'

    # Read columns
    columns = ['a', 'lines']
    ddf_read_cols = read_parquet_dask(path,
                                      columns=columns + ['hilbert_distance'])
    pd.testing.assert_frame_equal(ddf_read_cols.compute(),
                                  ddf_packed[columns].compute())
def test_parquet(gp_point, gp_multipoint, gp_multiline, tmp_path):
    # Build dataframe
    n = min(len(gp_multipoint), len(gp_multiline))
    df = GeoDataFrame({
        'point': GeoSeries(gp_point[:n]),
        'multipoint': GeoSeries(gp_multipoint[:n]),
        'multiline': GeoSeries(gp_multiline[:n]),
        'a': list(range(n))
    })
    df.index.name = 'range_idx'

    path = tmp_path / 'df.parq'
    to_parquet(df, path)
    df_read = read_parquet(str(path),
                           columns=['point', 'multipoint', 'multiline', 'a'])
    assert isinstance(df_read, GeoDataFrame)
    pd.testing.assert_frame_equal(df, df_read)
    assert df_read.index.name == df.index.name
def test_parquet_dask(gp_multipoint, gp_multiline, tmp_path_factory):
    with tmp_path_factory.mktemp("spatialpandas", numbered=True) as tmp_path:
        # Build dataframe
        n = min(len(gp_multipoint), len(gp_multiline))
        df = GeoDataFrame({
            'points': GeoSeries(gp_multipoint[:n]),
            'lines': GeoSeries(gp_multiline[:n]),
            'a': list(range(n))
        })
        ddf = dd.from_pandas(df, npartitions=3)

        path = tmp_path / 'ddf.parq'
        ddf.to_parquet(str(path))
        ddf_read = read_parquet_dask(str(path))

        # Check type
        assert isinstance(ddf_read, DaskGeoDataFrame)

        # Check that partition bounds were loaded
        nonempty = np.nonzero(
            np.asarray(ddf.map_partitions(len).compute() > 0))[0]
        assert set(ddf_read._partition_bounds) == {'points', 'lines'}

        expected_partition_bounds = (
            ddf['points'].partition_bounds.iloc[nonempty].reset_index(
                drop=True))
        expected_partition_bounds.index.name = 'partition'
        pd.testing.assert_frame_equal(
            expected_partition_bounds,
            ddf_read._partition_bounds['points'],
        )

        expected_partition_bounds = (
            ddf['lines'].partition_bounds.iloc[nonempty].reset_index(
                drop=True))
        expected_partition_bounds.index.name = 'partition'
        pd.testing.assert_frame_equal(
            expected_partition_bounds,
            ddf_read._partition_bounds['lines'],
        )

        assert ddf_read.geometry.name == 'points'
def test_active_geometry(use_dask):
    gdf = GeoDataFrame(OrderedDict([
        ('a', [3, 2, 1]),
        ('points', pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]],
                            dtype='multipoint')),
        ('line', pd.array([[0, 0, 1, 1], [2, 2, 3, 3], [4, 4]],
                          dtype='line')),
    ]))

    if use_dask:
        gdf = dd.from_pandas(gdf, npartitions=2)

    # Geometry starts out as the first compatible column in the data frame
    assert gdf.geometry.name == 'points'

    # set_geometry defaults to a copy operation
    assert gdf.set_geometry('line').geometry.name == 'line'
    assert gdf.geometry.name == 'points'

    # set_geometry inplace mutates the geometry column
    if use_dask:
        # inplace not supported for DaskGeoDataFrame
        gdf = gdf.set_geometry('line')
    else:
        gdf.set_geometry('line', inplace=True)
    assert gdf.geometry.name == 'line'

    # Active geometry propagates through slicing
    sliced_gdf = gdf.loc[[0, 2, 1, 0]]
    assert isinstance(sliced_gdf, type(gdf))
    assert sliced_gdf.geometry.name == 'line'

    # Select columns not including active geometry
    selected_gdf = gdf[['a', 'points']]
    with pytest.raises(ValueError):
        selected_gdf.geometry
    assert selected_gdf.set_geometry('points').geometry.name == 'points'
def test_pack_partitions_to_parquet_list_bounds(
        gp_multipoint1, gp_multiline1,
        gp_multipoint2, gp_multiline2,
        bounds, tmp_path,
):
    # Build dataframe1
    n = min(len(gp_multipoint1), len(gp_multiline1))
    df1 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint1[:n]),
        'lines': GeoSeries(gp_multiline1[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf1 = dd.from_pandas(df1, npartitions=3)
    path1 = tmp_path / 'ddf1.parq'
    ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1), npartitions=3)

    # Build dataframe2
    n = min(len(gp_multipoint2), len(gp_multiline2))
    df2 = GeoDataFrame({
        'points': GeoSeries(gp_multipoint2[:n]),
        'lines': GeoSeries(gp_multiline2[:n]),
        'a': list(range(n))
    }).set_geometry('lines')
    ddf2 = dd.from_pandas(df2, npartitions=3)
    path2 = tmp_path / 'ddf2.parq'
    ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2), npartitions=4)

    # Load both packed datasets with glob
    ddf_read = read_parquet_dask(
        [str(tmp_path / "ddf1.parq"), str(tmp_path / "ddf2.parq")],
        geometry="points", bounds=bounds)

    # Check the number of partitions (< 7 can happen in the case of empty
    # partitions)
    assert ddf_read.npartitions <= 7

    # Check contents
    xslice = slice(bounds[0], bounds[2])
    yslice = slice(bounds[1], bounds[3])
    expected_df = pd.concat([
        ddf_packed1.cx_partitions[xslice, yslice].compute(),
        ddf_packed2.cx_partitions[xslice, yslice].compute()
    ])
    df_read = ddf_read.compute()
    pd.testing.assert_frame_equal(df_read, expected_df)

    # Compute expected partition bounds
    points_bounds = pd.concat([
        ddf_packed1._partition_bounds['points'],
        ddf_packed2._partition_bounds['points'],
    ]).reset_index(drop=True)

    x0, y0, x1, y1 = bounds
    x0, x1 = (x0, x1) if x0 <= x1 else (x1, x0)
    y0, y1 = (y0, y1) if y0 <= y1 else (y1, y0)
    partition_inds = ~(
        (points_bounds.x1 < x0) | (points_bounds.y1 < y0) |
        (points_bounds.x0 > x1) | (points_bounds.y0 > y1)
    )
    points_bounds = points_bounds[partition_inds].reset_index(drop=True)

    lines_bounds = pd.concat([
        ddf_packed1._partition_bounds['lines'],
        ddf_packed2._partition_bounds['lines'],
    ]).reset_index(drop=True)[partition_inds].reset_index(drop=True)
    points_bounds.index.name = 'partition'
    lines_bounds.index.name = 'partition'

    # Check partition bounds
    pd.testing.assert_frame_equal(points_bounds,
                                  ddf_read._partition_bounds['points'])
    pd.testing.assert_frame_equal(lines_bounds,
                                  ddf_read._partition_bounds['lines'])

    # Check active geometry column
    assert ddf_read.geometry.name == 'points'
def _perform_read_parquet_dask(
    paths,
    columns,
    filesystem,
    load_divisions,
    geometry=None,
    bounds=None,
    categories=None,
):
    filesystem = validate_coerce_filesystem(paths[0], filesystem)
    datasets = [
        pa.parquet.ParquetDataset(
            path, filesystem=filesystem, validate_schema=False
        ) for path in paths
    ]

    # Create delayed partition for each piece
    pieces = []
    for dataset in datasets:
        # Perform natural sort on pieces so that "part.10" comes after "part.2"
        dataset_pieces = sorted(
            dataset.pieces, key=lambda piece: natural_sort_key(piece.path))
        pieces.extend(dataset_pieces)

    delayed_partitions = [
        delayed(read_parquet)(piece.path, columns=columns,
                              filesystem=filesystem)
        for piece in pieces
    ]

    # Load divisions
    if load_divisions:
        div_mins_list, div_maxes_list = zip(
            *[_load_divisions(dataset) for dataset in datasets])
        div_mins = reduce(lambda a, b: a + b, div_mins_list, [])
        div_maxes = reduce(lambda a, b: a + b, div_maxes_list, [])
    else:
        div_mins = None
        div_maxes = None

    # Load partition bounds
    partition_bounds_list = [
        _load_partition_bounds(dataset) for dataset in datasets
    ]
    if not any([b is None for b in partition_bounds_list]):
        # We have partition bounds for all datasets
        partition_bounds = {}
        for partition_bounds_el in partition_bounds_list:
            for col, col_bounds in partition_bounds_el.items():
                col_bounds_list = partition_bounds.get(col, [])
                col_bounds_list.append(col_bounds)
                partition_bounds[col] = col_bounds_list

        # Concat bounds for each geometry column
        for col in list(partition_bounds):
            partition_bounds[col] = pd.concat(
                partition_bounds[col], axis=0).reset_index(drop=True)
            partition_bounds[col].index.name = 'partition'
    else:
        partition_bounds = {}

    # Use Dask's read_parquet to get metadata
    if columns is not None:
        cols_no_index = [col for col in columns if col != "hilbert_distance"]
    else:
        cols_no_index = None
    meta = dd_read_parquet(
        paths[0],
        columns=cols_no_index,
        filesystem=filesystem,
        engine='pyarrow',
        categories=categories,
        gather_statistics=False,
    )._meta

    # Import geometry columns in meta, not needed for pyarrow >= 0.16
    metadata = _load_parquet_pandas_metadata(paths[0], filesystem=filesystem)
    geom_cols = _get_geometry_columns(metadata)
    if geom_cols:
        meta = _import_geometry_columns(meta, geom_cols)
    meta = GeoDataFrame(meta)

    # Handle geometry in meta
    if geometry:
        meta = meta.set_geometry(geometry)
    geometry = meta.geometry.name

    # Filter partitions by bounding box
    if bounds and geometry in partition_bounds:
        # Unpack bounds coordinates
        x0, y0, x1, y1 = bounds

        # Make sure x0 < x1
        if x0 > x1:
            x0, x1 = x1, x0

        # Make sure y0 < y1
        if y0 > y1:
            y0, y1 = y1, y0

        # Make DataFrame with bounds and parquet piece
        partitions_df = partition_bounds[geometry].assign(
            delayed_partition=delayed_partitions)
        if load_divisions:
            partitions_df = partitions_df.assign(div_mins=div_mins,
                                                 div_maxes=div_maxes)

        inds = ~(
            (partitions_df.x1 < x0) | (partitions_df.y1 < y0) |
            (partitions_df.x0 > x1) | (partitions_df.y0 > y1)
        )
        partitions_df = partitions_df[inds]
        for col in list(partition_bounds):
            partition_bounds[col] = partition_bounds[col][inds]
            partition_bounds[col].reset_index(drop=True, inplace=True)
            partition_bounds[col].index.name = "partition"

        delayed_partitions = partitions_df.delayed_partition.tolist()
        if load_divisions:
            div_mins = partitions_df.div_mins.tolist()
            div_maxes = partitions_df.div_maxes.tolist()

    if load_divisions:
        divisions = div_mins + [div_maxes[-1]]
        if divisions != sorted(divisions):
            raise ValueError(
                "Cannot load divisions because the discovered divisions are "
                "unsorted.\n"
                "Set load_divisions=False to skip loading divisions."
            )
    else:
        divisions = None

    # Create DaskGeoDataFrame
    if delayed_partitions:
        result = from_delayed(delayed_partitions,
                              divisions=divisions,
                              meta=meta,
                              verify_meta=False)
    else:
        # Single partition empty result
        result = from_pandas(meta, npartitions=1)

    # Set partition bounds
    if partition_bounds:
        result._partition_bounds = partition_bounds

    return result
def to_spatialpandas(data, xdim, ydim, columns=[], geom='point'):
    """Converts a list of dictionary format geometries to spatialpandas
    geometries.

    Args:
        data: List of dictionaries representing individual geometries
        xdim: Name of x-coordinates column
        ydim: Name of y-coordinates column
        columns: List of columns to add
        geom: The type of geometry

    Returns:
        A spatialpandas.GeoDataFrame version of the data
    """
    from spatialpandas import GeoSeries, GeoDataFrame
    from spatialpandas.geometry import (
        Point, Line, Polygon, Ring, LineArray, PolygonArray, PointArray,
        MultiLineArray, MultiPolygonArray, MultiPointArray, RingArray
    )
    from ...element import Polygons

    poly = any(Polygons._hole_key in d for d in data) or geom == 'Polygon'
    if poly:
        geom_type = Polygon
        single_array, multi_array = PolygonArray, MultiPolygonArray
    elif geom == 'Line':
        geom_type = Line
        single_array, multi_array = LineArray, MultiLineArray
    elif geom == 'Ring':
        geom_type = Ring
        single_array, multi_array = RingArray, MultiLineArray
    else:
        geom_type = Point
        single_array, multi_array = PointArray, MultiPointArray

    array_type = None
    hole_arrays, geom_arrays = [], []
    for geom in data:
        geom = dict(geom)
        if xdim not in geom or ydim not in geom:
            raise ValueError('Could not find geometry dimensions')
        xs, ys = geom.pop(xdim), geom.pop(ydim)
        xscalar, yscalar = isscalar(xs), isscalar(ys)
        if xscalar and yscalar:
            xs, ys = np.array([xs]), np.array([ys])
        elif xscalar:
            xs = np.full_like(ys, xs)
        elif yscalar:
            ys = np.full_like(xs, ys)
        geom_array = np.column_stack([xs, ys])

        if geom_type in (Polygon, Ring):
            geom_array = ensure_ring(geom_array)

        splits = np.where(
            np.isnan(geom_array[:, :2].astype('float')).sum(axis=1))[0]
        split_geoms = (np.split(geom_array, splits + 1)
                       if len(splits) else [geom_array])
        split_holes = geom.pop(Polygons._hole_key, None)
        if split_holes is not None:
            if len(split_holes) != len(split_geoms):
                raise DataError(
                    'Polygons with holes containing multi-geometries '
                    'must declare a list of holes for each geometry.',
                    SpatialPandasInterface)
            else:
                split_holes = [
                    [ensure_ring(np.asarray(h)) for h in hs]
                    for hs in split_holes
                ]

        geom_arrays.append(split_geoms)
        hole_arrays.append(split_holes)
        if geom_type is Point:
            if len(splits) > 1 or any(len(g) > 1 for g in split_geoms):
                array_type = multi_array
            elif array_type is None:
                array_type = single_array
        elif len(splits):
            array_type = multi_array
        elif array_type is None:
            array_type = single_array

    converted = defaultdict(list)
    for geom, arrays, holes in zip(data, geom_arrays, hole_arrays):
        parts = []
        for i, g in enumerate(arrays):
            if i != (len(arrays) - 1):
                g = g[:-1]
            if len(g) < (3 if poly else 2) and geom_type is not Point:
                continue
            if poly:
                parts.append([])
                subparts = parts[-1]
            else:
                subparts = parts
            subparts.append(g[:, :2])
            if poly and holes is not None:
                subparts += [np.array(h) for h in holes[i]]

        for c, v in geom.items():
            converted[c].append(v)

        if array_type is PointArray:
            parts = parts[0].flatten()
        elif array_type is MultiPointArray:
            parts = np.concatenate([sp.flatten() for sp in parts])
        elif array_type is multi_array:
            parts = [[ssp.flatten() for ssp in sp] if poly else sp.flatten()
                     for sp in parts]
        else:
            parts = ([np.asarray(sp).flatten() for sp in parts[0]]
                     if poly else parts[0].flatten())
        converted['geometry'].append(parts)

    if converted:
        geometries = converted['geometry']
        if array_type is PointArray:
            geometries = np.concatenate(geometries)
        geom_array = array_type(geometries)
        if poly:
            geom_array = geom_array.oriented()
        converted['geometry'] = GeoSeries(geom_array)
    else:
        converted['geometry'] = GeoSeries(single_array([]))
    return GeoDataFrame(converted, columns=['geometry'] + columns)
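A hedged usage sketch of the converter above, assuming the HoloViews module path `holoviews.core.data.spatialpandas`; the coordinate arrays and the `z` column are illustrative.

import numpy as np
from holoviews.core.data.spatialpandas import to_spatialpandas

# Hypothetical input: two line geometries, each a dict of coordinate
# arrays plus a scalar value column 'z'.
data = [
    {'x': np.array([0., 1., 2.]), 'y': np.array([0., 1., 0.]), 'z': 1},
    {'x': np.array([3., 4.]), 'y': np.array([1., 2.]), 'z': 2},
]
gdf = to_spatialpandas(data, 'x', 'y', columns=['z'], geom='Line')
print(gdf.dtypes)  # 'geometry' holds a spatialpandas Line array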
def dask_GeoDataFrame(*args, **kwargs):
    return dd.from_pandas(GeoDataFrame(*args, **kwargs), npartitions=3)
def append_traj_info(
    df: spd.GeoDataFrame,
    gdfs: Dict[str, gpd.GeoDataFrame],
) -> spd.GeoDataFrame:
    """Append trajectory info from shape files.

    Parameters
    ----------
    df
        DataFrame containing trajectories.
    gdfs
        Dict containing the following GeoPandas GeoDataFrames:

        - tracts
        - tsz
        - city
        - county

    Returns
    -------
    DataFrame with appended information.
    """
    df = df.reset_index(drop=True)
    start_locations = gpd.GeoDataFrame(
        geometry=df.start_geohash.apply(geohash_decode_point),
        index=df.index,
        crs="EPSG:4326",
    )
    end_locations = gpd.GeoDataFrame(
        geometry=df.end_geohash.apply(geohash_decode_point),
        index=df.index,
        crs="EPSG:4326",
    )

    tracts = gdfs["tracts"].to_crs("EPSG:4326")
    df["start_CensusBlock2019"] = (
        gpd.sjoin(
            start_locations,
            tracts[["geometry", "GEOID"]].set_index("GEOID"),
            how="left",
        ).rename(columns={"index_right": "start_CensusBlock2019"})
        .drop(columns=["geometry"])
        .astype(str)
        .fillna("unknown")["start_CensusBlock2019"]
        .pipe(lambda s: s.groupby(s.index).head(1)))
    df["end_CensusBlock2019"] = (
        gpd.sjoin(
            end_locations,
            tracts[["geometry", "GEOID"]].set_index("GEOID"),
            how="left",
        ).rename(columns={"index_right": "end_CensusBlock2019"})
        .drop(columns=["geometry"])
        .astype(str)
        .fillna("unknown")["end_CensusBlock2019"]
        .pipe(lambda s: s.groupby(s.index).head(1)))

    tsz = gdfs["tsz"].to_crs("EPSG:4326")
    df["start_TSZ"] = (
        gpd.sjoin(
            start_locations,
            tsz[["geometry", "TSZ"]].set_index("TSZ"),
            how="left",
        ).rename(columns={"index_right": "start_TSZ"})
        .drop(columns=["geometry"])
        .astype(str)
        .fillna("Out of NCTCOG area")["start_TSZ"]
        .pipe(lambda s: s.groupby(s.index).head(1)))
    df["end_TSZ"] = (
        gpd.sjoin(
            end_locations,
            tsz[["geometry", "TSZ"]].set_index("TSZ"),
            how="left",
        ).rename(columns={"index_right": "end_TSZ"})
        .drop(columns=["geometry"])
        .astype(str)
        .fillna("Out of NCTCOG area")["end_TSZ"]
        .pipe(lambda s: s.groupby(s.index).head(1)))

    county = gdfs["county"]  # File already in EPSG:4326
    df["start_county"] = (
        gpd.sjoin(
            start_locations,
            county[["geometry", "CNTY_NM"]].set_index("CNTY_NM"),
            how="left",
        ).rename(columns={"index_right": "start_county"})
        .drop(columns=["geometry"])
        .fillna("unknown")["start_county"]
        .pipe(lambda s: s.groupby(s.index).head(1)))
    df["end_county"] = (
        gpd.sjoin(
            end_locations,
            county[["geometry", "CNTY_NM"]].set_index("CNTY_NM"),
            how="left",
        ).rename(columns={"index_right": "end_county"})
        .drop(columns=["geometry"])
        .fillna("unknown")["end_county"]
        .pipe(lambda s: s.groupby(s.index).head(1)))

    city = gdfs["city"]  # File already in EPSG:4326
    df["start_city"] = (
        gpd.sjoin(
            start_locations,
            city[["geometry", "CITY_NM"]].set_index("CITY_NM"),
            how="left",
        ).rename(columns={"index_right": "start_city"})
        .drop(columns=["geometry"])
        .fillna("unknown")["start_city"]
        .pipe(lambda s: s.groupby(s.index).head(1)))
    df["end_city"] = (
        gpd.sjoin(
            end_locations,
            city[["geometry", "CITY_NM"]].set_index("CITY_NM"),
            how="left",
        ).rename(columns={"index_right": "end_city"})
        .drop(columns=["geometry"])
        .fillna("unknown")["end_city"]
        .pipe(lambda s: s.groupby(s.index).head(1)))

    return df
def test_pack_partitions_to_parquet_glob(gp_multipoint1, gp_multiline1,
                                         gp_multipoint2, gp_multiline2,
                                         tmp_path_factory):
    with tmp_path_factory.mktemp("spatialpandas", numbered=True) as tmp_path:
        # Build dataframe1
        n = min(len(gp_multipoint1), len(gp_multiline1))
        df1 = GeoDataFrame({
            'points': GeoSeries(gp_multipoint1[:n]),
            'lines': GeoSeries(gp_multiline1[:n]),
            'a': list(range(n))
        }).set_geometry('lines')
        ddf1 = dd.from_pandas(df1, npartitions=3)
        path1 = tmp_path / 'ddf1.parq'
        ddf_packed1 = ddf1.pack_partitions_to_parquet(str(path1),
                                                      npartitions=3)

        # Build dataframe2
        n = min(len(gp_multipoint2), len(gp_multiline2))
        df2 = GeoDataFrame({
            'points': GeoSeries(gp_multipoint2[:n]),
            'lines': GeoSeries(gp_multiline2[:n]),
            'a': list(range(n))
        }).set_geometry('lines')
        ddf2 = dd.from_pandas(df2, npartitions=3)
        path2 = tmp_path / 'ddf2.parq'
        ddf_packed2 = ddf2.pack_partitions_to_parquet(str(path2),
                                                      npartitions=4)

        # Load both packed datasets with glob
        ddf_globbed = read_parquet_dask(tmp_path / "ddf*.parq",
                                        geometry="lines")

        # Check the number of partitions (< 7 can happen in the case of
        # empty partitions)
        assert ddf_globbed.npartitions <= 7

        # Check contents
        expected_df = pd.concat([ddf_packed1.compute(),
                                 ddf_packed2.compute()])
        df_globbed = ddf_globbed.compute()
        pd.testing.assert_frame_equal(df_globbed, expected_df)

        # Check partition bounds
        expected_bounds = {
            'points': pd.concat([
                ddf_packed1._partition_bounds['points'],
                ddf_packed2._partition_bounds['points'],
            ]).reset_index(drop=True),
            'lines': pd.concat([
                ddf_packed1._partition_bounds['lines'],
                ddf_packed2._partition_bounds['lines'],
            ]).reset_index(drop=True),
        }
        expected_bounds['points'].index.name = 'partition'
        expected_bounds['lines'].index.name = 'partition'
        pd.testing.assert_frame_equal(
            expected_bounds['points'],
            ddf_globbed._partition_bounds['points'])
        pd.testing.assert_frame_equal(
            expected_bounds['lines'],
            ddf_globbed._partition_bounds['lines'])

        assert ddf_globbed.geometry.name == 'lines'
def _sjoin_pandas_pandas(
    left_df, right_df, how="inner", op="intersects",
    lsuffix="left", rsuffix="right",
):
    from spatialpandas import GeoDataFrame

    # Record original index name(s), generate new index name(s),
    # reset index column(s)
    original_right_df = right_df
    original_left_df = left_df
    right_df, right_index_name, index_right = _record_reset_index(
        original_right_df, rsuffix)
    left_df, left_index_name, index_left = _record_reset_index(
        original_left_df, lsuffix)

    if any(original_left_df.columns.isin(index_left + index_right)) or any(
            original_right_df.columns.isin(index_left + index_right)):
        raise ValueError(
            "'{0}' and '{1}' cannot be column names in the GeoDataFrames"
            " being joined".format(index_left, index_right)
        )

    # Get spatial index for left frame
    sindex = left_df.geometry.sindex
    left_geom = left_df.geometry.array
    right_geom = right_df.geometry.array

    # Get bounds from right geometry
    right_bounds = right_df.geometry.bounds.values

    # Init list of arrays, the same length as right_df, where each array
    # holds the indices into left_df that intersect with the corresponding
    # element.
    left_inds = [np.array([], dtype='uint32')] * len(right_df)
    # right_inds will hold the inds into right_df that correspond to left_inds
    right_inds = [np.array([], dtype='uint32')] * len(right_df)

    # Loop over the right frame
    for i in range(len(right_df)):
        # Get bounds for shape in current row of right_df
        shape_bounds = right_bounds[i, :]

        # Use spatial index on left_df to get indices of shapes with
        # bounding boxes that intersect with these bounds
        candidate_inds = sindex.intersects(shape_bounds)
        if len(candidate_inds) > 0:
            right_shape = right_geom[i]
            intersecting_mask = left_geom.intersects(right_shape,
                                                     inds=candidate_inds)
            intersecting_inds = candidate_inds[intersecting_mask]
            left_inds[i] = intersecting_inds
            right_inds[i] = np.full(len(intersecting_inds), i)

    # Flatten nested arrays of indices
    if left_inds:
        flat_left_inds = np.concatenate(left_inds)
        flat_right_inds = np.concatenate(right_inds)
    else:
        flat_left_inds = np.array([], dtype='uint32')
        flat_right_inds = np.array([], dtype='uint32')

    # Build pandas DataFrame from inds
    result = pd.DataFrame({
        '_key_left': flat_left_inds,
        '_key_right': flat_right_inds
    })

    # Perform join
    if how == "inner":
        result = result.set_index("_key_left")
        joined = (
            left_df
            .merge(result, left_index=True, right_index=True)
            .merge(
                right_df.drop(right_df.geometry.name, axis=1),
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            )
            .set_index(index_left)
            .drop(["_key_right"], axis=1)
        )
        if len(left_index_name) > 1:
            joined.index.names = left_index_name
        else:
            joined.index.name = left_index_name[0]
    elif how == "left":
        result = result.set_index("_key_left")
        joined = (
            left_df
            .merge(result, left_index=True, right_index=True, how="left")
            .merge(
                right_df.drop(right_df.geometry.name, axis=1),
                how="left",
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            )
            .set_index(index_left)
            .drop(["_key_right"], axis=1)
        )
        if len(left_index_name) > 1:
            joined.index.names = left_index_name
        else:
            joined.index.name = left_index_name[0]
    else:  # how == 'right'
        joined = (
            left_df
            .drop(left_df.geometry.name, axis=1)
            .merge(
                result.merge(right_df, left_on="_key_right",
                             right_index=True, how="right"),
                left_index=True,
                right_on="_key_left",
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
                how="right",
            )
            .set_index(index_right)
            .drop(["_key_left", "_key_right"], axis=1)
        )
        if len(right_index_name) > 1:
            joined.index.names = right_index_name
        else:
            joined.index.name = right_index_name[0]

    return GeoDataFrame(joined)
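A hedged end-to-end sketch of the public `sjoin` entry point this helper backs (spatialpandas exposes it at the package root); the point and square geometries are illustrative, not from the source.

import pandas as pd
import spatialpandas as spd
from spatialpandas import GeoDataFrame

# Hypothetical data: three points joined against two unit squares.
points = GeoDataFrame({
    'pt_id': [0, 1, 2],
    'geometry': pd.array([[0.5, 0.5], [1.5, 1.5], [5.0, 5.0]],
                         dtype='point'),
})
squares = GeoDataFrame({
    'sq_id': ['a', 'b'],
    'geometry': pd.array(
        [[[0., 0., 1., 0., 1., 1., 0., 1., 0., 0.]],
         [[1., 1., 2., 1., 2., 2., 1., 2., 1., 1.]]],
        dtype='polygon'),
})
joined = spd.sjoin(points, squares, how='inner')  # point-in-polygon join
print(joined[['pt_id', 'sq_id']])  # point 2 falls outside both squares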