def _basic_checks(
    left_df, right_df, how, lsuffix, rsuffix, allowed_hows=("left", "right", "inner")
):
    """Validate the inputs of a spatial join.

    Both frames must be GeoDataFrames, ``how`` must be a member of
    ``allowed_hows``, and ``'index_'`` concatenated with ``lsuffix`` or
    ``rsuffix`` must not already exist as a column in either frame.
    When ``_check_crs`` reports that the frames disagree,
    ``_crs_mismatch_warn`` is called; that path does not raise here.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    how : str, one of allowed_hows
        join type
    lsuffix : str
        left index suffix
    rsuffix : str
        right index suffix
    allowed_hows : sequence of str, default ("left", "right", "inner")
        accepted values for ``how``

    Raises
    ------
    ValueError
        If either input is not a GeoDataFrame, if ``how`` is not allowed,
        or if a reserved ``index_<suffix>`` column name is already taken.
    """
    # Left is validated before right so the first offending argument is reported.
    for label, frame in (("left_df", left_df), ("right_df", right_df)):
        if not isinstance(frame, GeoDataFrame):
            raise ValueError(f"'{label}' should be GeoDataFrame, got {type(frame)}")

    if how not in allowed_hows:
        raise ValueError(f'`how` was "{how}" but is expected to be in {allowed_hows}')

    if not _check_crs(left_df, right_df):
        # NOTE(review): stacklevel=4 presumably targets the caller of the
        # public join function that invokes this helper - confirm.
        _crs_mismatch_warn(left_df, right_df, stacklevel=4)

    index_left = f"index_{lsuffix}"
    index_right = f"index_{rsuffix}"
    reserved = [index_left, index_right]

    # due to GH 352
    if any(any(frame.columns.isin(reserved)) for frame in (left_df, right_df)):
        raise ValueError(
            f"'{index_left}' and '{index_right}' cannot be names in the frames being"
            " joined"
        )
def sjoin(
    left_df, right_df, how="inner", op="intersects", lsuffix="left", rsuffix="right"
):
    """Spatial join of two GeoDataFrames.

    Candidate pairs are found by querying a spatial index with the bounds of
    every geometry of the other frame; the exact predicate is then evaluated
    on those candidates only, and the surviving pairs drive ordinary
    ``merge`` calls between the two frames.

    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    op : string, default 'intersects'
        Binary predicate, one of {'intersects', 'contains', 'within'}.
        See http://shapely.readthedocs.io/en/latest/manual.html#binary-predicates.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).

    Returns
    -------
    joined
        The merged frame. It is indexed by the original index of ``left_df``
        for ``how`` in {'inner', 'left'} and by the original index of
        ``right_df`` for ``how='right'``; the other frame's index is kept as
        an ``index_<suffix>`` column.

    Raises
    ------
    ValueError
        If an input is not a GeoDataFrame, if ``how`` or ``op`` is not an
        allowed value, or if ``index_<lsuffix>`` / ``index_<rsuffix>`` is
        already a column name in either frame.
    """
    if not isinstance(left_df, GeoDataFrame):
        raise ValueError(
            "'left_df' should be GeoDataFrame, got {}".format(type(left_df))
        )

    if not isinstance(right_df, GeoDataFrame):
        raise ValueError(
            "'right_df' should be GeoDataFrame, got {}".format(type(right_df))
        )

    allowed_hows = ["left", "right", "inner"]
    if how not in allowed_hows:
        raise ValueError(
            '`how` was "%s" but is expected to be in %s' % (how, allowed_hows)
        )

    allowed_ops = ["contains", "within", "intersects"]
    if op not in allowed_ops:
        raise ValueError(
            '`op` was "%s" but is expected to be in %s' % (op, allowed_ops)
        )

    if not _check_crs(left_df, right_df):
        _crs_mismatch_warn(left_df, right_df, stacklevel=3)

    index_left = "index_%s" % lsuffix
    index_right = "index_%s" % rsuffix

    # due to GH 352
    if any(left_df.columns.isin([index_left, index_right])) or any(
        right_df.columns.isin([index_left, index_right])
    ):
        raise ValueError(
            "'{0}' and '{1}' cannot be names in the frames being"
            " joined".format(index_left, index_right)
        )

    # Attempt to re-use spatial indexes, otherwise generate the spatial index
    # for the longer dataframe. If we are joining to an empty dataframe,
    # don't bother generating the index.
    if right_df._sindex_generated or (
        not left_df._sindex_generated and right_df.shape[0] > left_df.shape[0]
    ):
        tree_idx = right_df.sindex if len(left_df) > 0 else None
        tree_idx_right = True
    else:
        tree_idx = left_df.sindex if len(right_df) > 0 else None
        tree_idx_right = False

    # the rtree spatial index only allows limited (numeric) index types, but an
    # index in geopandas may be any arbitrary dtype. so reset both indices now
    # and store references to the original indices, to be reaffixed later.
    # GH 352
    left_df = left_df.copy(deep=True)
    try:
        left_index_name = left_df.index.name
        left_df.index = left_df.index.rename(index_left)
    except TypeError:
        # Renaming with a single string failed; fall back to one generated
        # name per index level.
        index_left = [
            "index_%s" % lsuffix + str(pos)
            for pos, ix in enumerate(left_df.index.names)
        ]
        left_index_name = left_df.index.names
        left_df.index = left_df.index.rename(index_left)
    left_df = left_df.reset_index()

    right_df = right_df.copy(deep=True)
    try:
        right_index_name = right_df.index.name
        right_df.index = right_df.index.rename(index_right)
    except TypeError:
        index_right = [
            "index_%s" % rsuffix + str(pos)
            for pos, ix in enumerate(right_df.index.names)
        ]
        right_index_name = right_df.index.names
        right_df.index = right_df.index.rename(index_right)
    right_df = right_df.reset_index()

    if op == "within":
        # within implemented as the inverse of contains; swap names
        left_df, right_df = right_df, left_df
        tree_idx_right = not tree_idx_right

    r_idx = np.empty((0, 0))
    l_idx = np.empty((0, 0))
    # get rtree spatial index. If tree_idx does not exist, it is due to either a
    # failure to generate the index (e.g., if the column is empty), or the
    # other dataframe is empty so it wasn't necessary to generate it.
    # NOTE: Series.items() is used below instead of Series.iteritems(), which
    # was deprecated in pandas 1.5 and removed in pandas 2.0.
    if tree_idx_right and tree_idx:
        idxmatch = left_df.geometry.apply(lambda x: x.bounds).apply(
            lambda x: list(tree_idx.intersection(x)) if not x == () else []
        )
        idxmatch = idxmatch[idxmatch.apply(len) > 0]
        # indexes of overlapping boundaries
        if idxmatch.shape[0] > 0:
            r_idx = np.concatenate(idxmatch.values)
            l_idx = np.concatenate([[i] * len(v) for i, v in idxmatch.items()])
    elif not tree_idx_right and tree_idx:
        # tree_idx_df == 'left'
        idxmatch = right_df.geometry.apply(lambda x: x.bounds).apply(
            lambda x: list(tree_idx.intersection(x)) if not x == () else []
        )
        idxmatch = idxmatch[idxmatch.apply(len) > 0]
        if idxmatch.shape[0] > 0:
            # indexes of overlapping boundaries
            l_idx = np.concatenate(idxmatch.values)
            r_idx = np.concatenate([[i] * len(v) for i, v in idxmatch.items()])

    if len(r_idx) > 0 and len(l_idx) > 0:
        if compat.USE_PYGEOS:
            import pygeos

            predicate_d = {
                "intersects": pygeos.intersects,
                "contains": pygeos.contains,
                "within": pygeos.contains,
            }
            check_predicates = predicate_d[op]
            res = check_predicates(
                left_df.geometry[l_idx].values.data,
                right_df[right_df.geometry.name][r_idx].values.data,
            )
        else:
            # Vectorize predicate operations
            def find_intersects(a1, a2):
                return a1.intersects(a2)

            def find_contains(a1, a2):
                return a1.contains(a2)

            predicate_d = {
                "intersects": find_intersects,
                "contains": find_contains,
                "within": find_contains,
            }
            check_predicates = np.vectorize(predicate_d[op])
            res = check_predicates(
                left_df.geometry.apply(lambda x: prepared.prep(x))[l_idx],
                right_df[right_df.geometry.name][r_idx],
            )

        result = pd.DataFrame(np.column_stack([l_idx, r_idx, res]))
        result.columns = ["_key_left", "_key_right", "match_bool"]
        # keep only the candidate pairs for which the predicate held
        result = pd.DataFrame(result[result["match_bool"] == 1]).drop(
            "match_bool", axis=1
        )
    else:
        # when output from the join has no overlapping geometries
        result = pd.DataFrame(columns=["_key_left", "_key_right"], dtype=float)

    if op == "within":
        # within implemented as the inverse of contains; swap names
        left_df, right_df = right_df, left_df
        result = result.rename(
            columns={"_key_left": "_key_right", "_key_right": "_key_left"}
        )

    if how == "inner":
        result = result.set_index("_key_left")
        joined = (
            left_df.merge(result, left_index=True, right_index=True)
            .merge(
                right_df.drop(right_df.geometry.name, axis=1),
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            )
            .set_index(index_left)
            .drop(["_key_right"], axis=1)
        )
        if isinstance(index_left, list):
            joined.index.names = left_index_name
        else:
            joined.index.name = left_index_name

    elif how == "left":
        result = result.set_index("_key_left")
        joined = (
            left_df.merge(result, left_index=True, right_index=True, how="left")
            .merge(
                right_df.drop(right_df.geometry.name, axis=1),
                how="left",
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            )
            .set_index(index_left)
            .drop(["_key_right"], axis=1)
        )
        if isinstance(index_left, list):
            joined.index.names = left_index_name
        else:
            joined.index.name = left_index_name

    else:  # how == 'right':
        joined = (
            left_df.drop(left_df.geometry.name, axis=1)
            .merge(
                result.merge(
                    right_df, left_on="_key_right", right_index=True, how="right"
                ),
                left_index=True,
                right_on="_key_left",
                how="right",
            )
            .set_index(index_right)
            .drop(["_key_left", "_key_right"], axis=1)
        )
        if isinstance(index_right, list):
            joined.index.names = right_index_name
        else:
            joined.index.name = right_index_name

    return joined
def overlay(df1, df2, how="intersection", make_valid=True, keep_geom_type=True):
    """Perform spatial overlay between two GeoDataFrames.

    Currently only supports data GeoDataFrames with uniform geometry types,
    i.e. containing only (Multi)Polygons, or only (Multi)Points, or a
    combination of (Multi)LineString and LinearRing shapes.
    Implements several methods that are all effectively subsets of the union.

    Parameters
    ----------
    df1 : GeoDataFrame
    df2 : GeoDataFrame
    how : string
        Method of spatial overlay: 'intersection', 'union',
        'identity', 'symmetric_difference' or 'difference'.
    make_valid : bool, default True
        If True, polygon inputs are passed through ``buffer(0)`` before the
        overlay. If False, a `ValueError` is raised when a polygon input
        contains invalid geometries.
    keep_geom_type : bool
        If True, return only geometries of the same geometry type as df1 has,
        if False, return all resulting geometries. Not applied when
        ``how='difference'``.

    Returns
    -------
    df : GeoDataFrame
        GeoDataFrame with new set of polygons and attributes
        resulting from the overlay

    Raises
    ------
    ValueError
        If ``how`` is not one of the allowed methods, or if
        ``make_valid=False`` and a polygon input has invalid geometries.
    NotImplementedError
        If either input is a GeoSeries or contains mixed geometry types.
    TypeError
        If ``keep_geom_type`` is requested and the first geometry of ``df1``
        is not a (Multi)Polygon, line or (Multi)Point type.
    """
    # Allowed operations
    allowed_hows = [
        "intersection",
        "union",
        "identity",
        "symmetric_difference",
        "difference",  # aka erase
    ]
    # Error Messages
    if how not in allowed_hows:
        raise ValueError(
            "`how` was '{0}' but is expected to be in {1}".format(how, allowed_hows)
        )

    if isinstance(df1, GeoSeries) or isinstance(df2, GeoSeries):
        raise NotImplementedError(
            "overlay currently only implemented for GeoDataFrames"
        )

    if not _check_crs(df1, df2):
        _crs_mismatch_warn(df1, df2, stacklevel=3)

    polys = ["Polygon", "MultiPolygon"]
    lines = ["LineString", "MultiLineString", "LinearRing"]
    points = ["Point", "MultiPoint"]
    for i, df in enumerate([df1, df2]):
        poly_check = df.geom_type.isin(polys).any()
        lines_check = df.geom_type.isin(lines).any()
        points_check = df.geom_type.isin(points).any()
        if sum([poly_check, lines_check, points_check]) > 1:
            raise NotImplementedError(
                "df{} contains mixed geometry types.".format(i + 1)
            )

    # Computations
    # Work on copies so the caller's frames are never modified.
    df1 = df1.copy()
    df2 = df2.copy()
    for df in (df1, df2):
        if not df.geom_type.isin(polys).all():
            continue
        if make_valid:
            df[df._geometry_column_name] = df.geometry.buffer(0)
        else:
            # Honor make_valid=False instead of silently repairing the input.
            num_invalid = int((~df.geometry.is_valid).sum())
            if num_invalid:
                raise ValueError(
                    "You have passed make_valid=False along with "
                    "{} invalid input geometries. "
                    "Use make_valid=True or make sure that all geometries "
                    "are valid before using overlay.".format(num_invalid)
                )

    with warnings.catch_warnings():  # CRS checked above, suppress array-level warning
        warnings.filterwarnings("ignore", message="CRS mismatch between the CRS")
        if how == "difference":
            # 'difference' returns directly: no geometry-type filtering and
            # no '__idx1'/'__idx2' clean-up is applied to it.
            return _overlay_difference(df1, df2)
        elif how == "intersection":
            result = _overlay_intersection(df1, df2)
        elif how == "symmetric_difference":
            result = _overlay_symmetric_diff(df1, df2)
        elif how == "union":
            result = _overlay_union(df1, df2)
        elif how == "identity":
            # identity = the rows of the union that originate from df1
            dfunion = _overlay_union(df1, df2)
            result = dfunion[dfunion["__idx1"].notnull()].copy()

    if keep_geom_type:
        # The geometry family of df1 is taken from its first row; the inputs
        # were checked above to contain a single family.
        geom_type = df1.geom_type.iloc[0]
        if geom_type in polys:
            result = result.loc[result.geom_type.isin(polys)]
        elif geom_type in lines:
            result = result.loc[result.geom_type.isin(lines)]
        elif geom_type in points:
            result = result.loc[result.geom_type.isin(points)]
        else:
            raise TypeError(
                "`keep_geom_type` does not support {}.".format(geom_type)
            )

    result.reset_index(drop=True, inplace=True)
    result.drop(["__idx1", "__idx2"], axis=1, inplace=True)
    return result
def test_check_crs():
    """Check `_check_crs` for a copy of ``T`` given CRS 4326 against ``T``.

    The copy must not match ``T`` by default, must match itself, and must
    match ``T`` once ``allow_none=True`` is passed.
    """
    # presumably T carries no CRS; verify against the module-level fixture
    with_crs = T.copy()
    with_crs.crs = 4326

    cases = (
        ((with_crs, T), {}, False),
        ((with_crs, with_crs), {}, True),
        ((with_crs, T), {"allow_none": True}, True),
    )
    for args, kwargs, expected in cases:
        assert _check_crs(*args, **kwargs) is expected
def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
    """Perform spatial overlay between two GeoDataFrames.

    Currently only supports data GeoDataFrames with uniform geometry types,
    i.e. containing only (Multi)Polygons, or only (Multi)Points, or a
    combination of (Multi)LineString and LinearRing shapes.
    Implements several methods that are all effectively subsets of the union.

    See the User Guide page :doc:`../../user_guide/set_operations` for details.

    Parameters
    ----------
    df1 : GeoDataFrame
    df2 : GeoDataFrame
    how : string
        Method of spatial overlay: 'intersection', 'union',
        'identity', 'symmetric_difference' or 'difference'.
    keep_geom_type : bool
        If True, return only geometries of the same geometry type as df1 has,
        if False, return all resulting geometries. Default is None,
        which will set keep_geom_type to True but warn upon dropping
        geometries.
    make_valid : bool, default True
        If True, any invalid input geometries are corrected with a call to
        `buffer(0)`, if False, a `ValueError` is raised if any input
        geometries are invalid.

    Returns
    -------
    df : GeoDataFrame
        GeoDataFrame with new set of polygons and attributes
        resulting from the overlay

    Examples
    --------
    >>> from shapely.geometry import Polygon
    >>> polys1 = geopandas.GeoSeries([Polygon([(0,0), (2,0), (2,2), (0,2)]),
    ...                               Polygon([(2,2), (4,2), (4,4), (2,4)])])
    >>> polys2 = geopandas.GeoSeries([Polygon([(1,1), (3,1), (3,3), (1,3)]),
    ...                               Polygon([(3,3), (5,3), (5,5), (3,5)])])
    >>> df1 = geopandas.GeoDataFrame({'geometry': polys1, 'df1_data':[1,2]})
    >>> df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2_data':[1,2]})

    >>> geopandas.overlay(df1, df2, how='union')
       df1_data  df2_data                                           geometry
    0       1.0       1.0  POLYGON ((1.00000 2.00000, 2.00000 2.00000, 2....
    1       2.0       1.0  POLYGON ((3.00000 2.00000, 2.00000 2.00000, 2....
    2       2.0       2.0  POLYGON ((3.00000 4.00000, 4.00000 4.00000, 4....
    3       1.0       NaN  POLYGON ((2.00000 1.00000, 2.00000 0.00000, 0....
    4       2.0       NaN  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...
    5       NaN       1.0  MULTIPOLYGON (((2.00000 2.00000, 3.00000 2.000...
    6       NaN       2.0  POLYGON ((3.00000 4.00000, 3.00000 5.00000, 5....

    >>> geopandas.overlay(df1, df2, how='intersection')
       df1_data  df2_data                                           geometry
    0         1         1  POLYGON ((1.00000 2.00000, 2.00000 2.00000, 2....
    1         2         1  POLYGON ((3.00000 2.00000, 2.00000 2.00000, 2....
    2         2         2  POLYGON ((3.00000 4.00000, 4.00000 4.00000, 4....

    >>> geopandas.overlay(df1, df2, how='symmetric_difference')
       df1_data  df2_data                                           geometry
    0       1.0       NaN  POLYGON ((2.00000 1.00000, 2.00000 0.00000, 0....
    1       2.0       NaN  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...
    2       NaN       1.0  MULTIPOLYGON (((2.00000 2.00000, 3.00000 2.000...
    3       NaN       2.0  POLYGON ((3.00000 4.00000, 3.00000 5.00000, 5....

    >>> geopandas.overlay(df1, df2, how='difference')
                                                geometry  df1_data
    0  POLYGON ((2.00000 1.00000, 2.00000 0.00000, 0....         1
    1  MULTIPOLYGON (((2.00000 3.00000, 2.00000 4.000...         2

    >>> geopandas.overlay(df1, df2, how='identity')
       df1_data  df2_data                                           geometry
    0       1.0       1.0  POLYGON ((1.00000 2.00000, 2.00000 2.00000, 2....
    1       2.0       1.0  POLYGON ((3.00000 2.00000, 2.00000 2.00000, 2....
    2       2.0       2.0  POLYGON ((3.00000 4.00000, 4.00000 4.00000, 4....
    3       1.0       NaN  POLYGON ((2.00000 1.00000, 2.00000 0.00000, 0....
    4       2.0       NaN  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...

    See also
    --------
    sjoin : spatial join

    Notes
    ------
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    # Allowed operations (kept as a list: its repr is part of the error text)
    allowed_hows = [
        "intersection",
        "union",
        "identity",
        "symmetric_difference",
        "difference",  # aka erase
    ]
    # Error Messages
    if how not in allowed_hows:
        raise ValueError(
            f"`how` was '{how}' but is expected to be in {allowed_hows}"
        )

    if any(isinstance(frame, GeoSeries) for frame in (df1, df2)):
        raise NotImplementedError(
            "overlay currently only implemented for GeoDataFrames"
        )

    if not _check_crs(df1, df2):
        _crs_mismatch_warn(df1, df2, stacklevel=3)

    # keep_geom_type=None means "behave like True, but warn when rows drop".
    keep_geom_type_warning = keep_geom_type is None
    if keep_geom_type_warning:
        keep_geom_type = True

    polys = ["Polygon", "MultiPolygon"]
    lines = ["LineString", "MultiLineString", "LinearRing"]
    points = ["Point", "MultiPoint"]
    families = (polys, lines, points)

    for position, frame in enumerate((df1, df2), start=1):
        families_present = sum(
            frame.geom_type.isin(family).any() for family in families
        )
        if families_present > 1:
            raise NotImplementedError(
                f"df{position} contains mixed geometry types."
            )

    # Computations
    def _make_valid(df):
        """Return a copy of ``df``; repair or reject invalid polygons."""
        checked = df.copy()
        if not checked.geom_type.isin(polys).all():
            return checked
        invalid = ~checked.geometry.is_valid
        geom_col = checked._geometry_column_name
        if make_valid:
            checked.loc[invalid, geom_col] = checked.loc[invalid, geom_col].buffer(0)
        elif invalid.any():
            raise ValueError(
                "You have passed make_valid=False along with "
                f"{invalid.sum()} invalid input geometries. "
                "Use make_valid=True or make sure that all geometries "
                "are valid before using overlay."
            )
        return checked

    df1 = _make_valid(df1)
    df2 = _make_valid(df2)

    with warnings.catch_warnings():  # CRS checked above, suppress array-level warning
        warnings.filterwarnings("ignore", message="CRS mismatch between the CRS")
        if how == "difference":
            # returned as-is: no geometry-type filtering, no column clean-up
            return _overlay_difference(df1, df2)
        if how == "identity":
            # identity = the rows of the union that originate from df1
            dfunion = _overlay_union(df1, df2)
            result = dfunion[dfunion["__idx1"].notnull()].copy()
        else:
            combine = {
                "intersection": _overlay_intersection,
                "symmetric_difference": _overlay_symmetric_diff,
                "union": _overlay_union,
            }[how]
            result = combine(df1, df2)

    if keep_geom_type:
        column_order = result.keys()
        parts = result.reset_index(drop=True).explode()
        parts = parts.reset_index(level=0)

        rows_before = result.shape[0]
        reference_type = df1.geom_type.iloc[0]
        for family in families:
            if reference_type in family:
                parts = parts.loc[parts.geom_type.isin(family)]
                break
        else:
            raise TypeError(
                "`keep_geom_type` does not support {}.".format(reference_type)
            )

        # level_0 created with above reset_index operation
        # and represents the original geometry collections
        result = parts.dissolve(by="level_0")[column_order]

        num_dropped = rows_before - result.shape[0]
        if num_dropped != 0 and keep_geom_type_warning:
            warnings.warn(
                f"`keep_geom_type=True` in overlay resulted in {num_dropped} dropped "
                "geometries of different geometry types than df1 has. "
                "Set `keep_geom_type=False` to retain all "
                "geometries",
                UserWarning,
                stacklevel=2,
            )

    result.reset_index(drop=True, inplace=True)
    result.drop(["__idx1", "__idx2"], axis=1, inplace=True)
    return result
def clip(gdf, mask, keep_geom_type=False):
    """Clip points, lines, or polygon geometries to the mask extent.

    Both layers must be in the same Coordinate Reference System (CRS).
    The `gdf` will be clipped to the full extent of the clip object.

    If there are multiple polygons in mask, data from `gdf` will be clipped to
    the total boundary of all polygons in mask.

    Parameters
    ----------
    gdf : GeoDataFrame or GeoSeries
        Vector layer (point, line, polygon) to be clipped to mask.
    mask : GeoDataFrame, GeoSeries, (Multi)Polygon
        Polygon vector layer used to clip `gdf`.
        The mask's geometry is dissolved into one geometric feature
        and intersected with `gdf`.
    keep_geom_type : boolean, default False
        If True, return only geometries of original type in case of intersection
        resulting in multiple geometry types or GeometryCollections.
        If False, return all resulting geometries (potentially mixed-types).

    Returns
    -------
    GeoDataFrame or GeoSeries
        Vector data (points, lines, polygons) from `gdf` clipped to
        polygon boundary from mask.

    Raises
    ------
    TypeError
        If `gdf` is not a GeoDataFrame or GeoSeries, or if `mask` is not a
        GeoDataFrame, GeoSeries, Polygon or MultiPolygon.

    Examples
    --------
    Clip points (global cities) with a polygon (the South American continent):

    >>> world = geopandas.read_file(
    ...     geopandas.datasets.get_path('naturalearth_lowres'))
    >>> south_america = world[world['continent'] == "South America"]
    >>> capitals = geopandas.read_file(
    ...     geopandas.datasets.get_path('naturalearth_cities'))
    >>> capitals.shape
    (202, 2)
    >>> sa_capitals = geopandas.clip(capitals, south_america)
    >>> sa_capitals.shape
    (12, 2)
    """
    if not isinstance(gdf, (GeoDataFrame, GeoSeries)):
        raise TypeError(
            "'gdf' should be GeoDataFrame or GeoSeries, got {}".format(type(gdf))
        )

    if not isinstance(mask, (GeoDataFrame, GeoSeries, Polygon, MultiPolygon)):
        # The space after "or" was missing, yielding "GeoSeries or(Multi)Polygon".
        raise TypeError(
            "'mask' should be GeoDataFrame, GeoSeries or "
            "(Multi)Polygon, got {}".format(type(mask))
        )

    mask_is_geo = isinstance(mask, (GeoDataFrame, GeoSeries))

    if mask_is_geo and not _check_crs(gdf, mask):
        _crs_mismatch_warn(gdf, mask, stacklevel=3)

    # Cheap rejection: if the bounding boxes do not overlap nothing can be
    # clipped, so return an empty slice of the input.
    box_mask = mask.total_bounds if mask_is_geo else mask.bounds
    box_gdf = gdf.total_bounds
    if not (
        ((box_mask[0] <= box_gdf[2]) and (box_gdf[0] <= box_mask[2]))
        and ((box_mask[1] <= box_gdf[3]) and (box_gdf[1] <= box_mask[3]))
    ):
        return gdf.iloc[:0]

    # A frame/series mask is collapsed into one geometry.
    poly = mask.geometry.unary_union if mask_is_geo else mask

    geom_types = gdf.geometry.type
    poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
    line_idx = np.asarray(
        (geom_types == "LineString")
        | (geom_types == "LinearRing")
        | (geom_types == "MultiLineString")
    )
    point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))
    geomcoll_idx = np.asarray((geom_types == "GeometryCollection"))

    # Each geometry family is clipped separately; a family that is absent
    # contributes None, which is passed to pd.concat below.
    point_gdf = _clip_points(gdf[point_idx], poly) if point_idx.any() else None
    poly_gdf = _clip_line_poly(gdf[poly_idx], poly) if poly_idx.any() else None
    line_gdf = _clip_line_poly(gdf[line_idx], poly) if line_idx.any() else None
    geomcoll_gdf = (
        _clip_line_poly(gdf[geomcoll_idx], poly) if geomcoll_idx.any() else None
    )

    # Position of every input row, used to restore the input order at the end.
    order = pd.Series(range(len(gdf)), index=gdf.index)
    concat = pd.concat([point_gdf, line_gdf, poly_gdf, geomcoll_gdf])

    if keep_geom_type:
        geomcoll_concat = (concat.geom_type == "GeometryCollection").any()
        geomcoll_orig = geomcoll_idx.any()

        # True when clipping produced collections the input did not have.
        new_collection = geomcoll_concat and not geomcoll_orig

        if geomcoll_orig:
            warnings.warn(
                "keep_geom_type can not be called on a "
                "GeoDataFrame with GeometryCollection."
            )
        else:
            polys = ["Polygon", "MultiPolygon"]
            lines = ["LineString", "MultiLineString", "LinearRing"]
            points = ["Point", "MultiPoint"]
            families = (polys, lines, points)

            # Count the geometry families (points, lines and/or polys) in the input
            orig_types_total = sum(
                gdf.geom_type.isin(family).any() for family in families
            )

            # Count the geometry families in the clipped result
            clip_types_total = sum(
                concat.geom_type.isin(family).any() for family in families
            )

            # Check there aren't any new geom types in the clipped GeoDataFrame
            more_types = orig_types_total < clip_types_total

            if orig_types_total > 1:
                warnings.warn(
                    "keep_geom_type can not be called on a mixed type GeoDataFrame."
                )
            elif new_collection or more_types:
                orig_type = gdf.geom_type.iloc[0]
                if new_collection:
                    concat = concat.explode()
                if orig_type in polys:
                    concat = concat.loc[concat.geom_type.isin(polys)]
                elif orig_type in lines:
                    concat = concat.loc[concat.geom_type.isin(lines)]

    # Return empty GeoDataFrame or GeoSeries if no shapes remain
    if len(concat) == 0:
        return gdf.iloc[:0]

    # Preserve the original order of the input
    if isinstance(concat, GeoDataFrame):
        concat["_order"] = order
        return concat.sort_values(by="_order").drop(columns="_order")

    # GeoSeries input: sort through a temporary frame, return the geometry
    concat = GeoDataFrame(geometry=concat)
    concat["_order"] = order
    return concat.sort_values(by="_order").geometry
def sjoin(
    left_df, right_df, how="inner", op="intersects", lsuffix="left", rsuffix="right"
):
    """Spatial join of two GeoDataFrames.

    Matching row pairs are obtained from a bulk query of one frame's spatial
    index with the geometries of the other frame; the pairs then drive
    ordinary ``merge`` calls between the two frames.

    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    op : string, default 'intersects'
        Binary predicate, one of {'intersects', 'contains', 'within'}.
        See http://shapely.readthedocs.io/en/latest/manual.html#binary-predicates.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).

    Raises
    ------
    ValueError
        If an input is not a GeoDataFrame, if ``how`` or ``op`` is not an
        allowed value, or if ``index_<lsuffix>`` / ``index_<rsuffix>`` is
        already a column name in either frame.
    """
    for label, frame in (("left_df", left_df), ("right_df", right_df)):
        if not isinstance(frame, GeoDataFrame):
            raise ValueError(f"'{label}' should be GeoDataFrame, got {type(frame)}")

    allowed_hows = ["left", "right", "inner"]
    if how not in allowed_hows:
        raise ValueError(f'`how` was "{how}" but is expected to be in {allowed_hows}')

    allowed_ops = ["contains", "within", "intersects"]
    if op not in allowed_ops:
        raise ValueError(f'`op` was "{op}" but is expected to be in {allowed_ops}')

    if not _check_crs(left_df, right_df):
        _crs_mismatch_warn(left_df, right_df, stacklevel=3)

    index_left = "index_%s" % lsuffix
    index_right = "index_%s" % rsuffix
    reserved = [index_left, index_right]

    # due to GH 352
    if any(any(frame.columns.isin(reserved)) for frame in (left_df, right_df)):
        raise ValueError(
            f"'{index_left}' and '{index_right}' cannot be names in the frames being"
            " joined"
        )

    # query index
    with warnings.catch_warnings():
        # We don't need to show our own warning here
        # TODO remove this once the deprecation has been enforced
        warnings.filterwarnings(
            "ignore", "Generated spatial index is empty", FutureWarning
        )
        if op == "within":
            # within is implemented as the inverse of contains
            # contains is a faster predicate
            # see discussion at https://github.com/geopandas/geopandas/pull/1421
            predicate = "contains"
            sindex = left_df.sindex
            input_geoms = right_df.geometry
        else:
            # all other predicates are symmetric
            # keep them the same
            predicate = op
            sindex = right_df.sindex
            input_geoms = left_df.geometry

    if sindex:
        l_idx, r_idx = sindex.query_bulk(input_geoms, predicate=predicate, sort=False)
        result = pd.DataFrame({"_key_left": l_idx, "_key_right": r_idx})
    else:
        # when sindex is empty / has no valid geometries
        result = pd.DataFrame(columns=["_key_left", "_key_right"], dtype=float)

    if op == "within":
        # within is implemented as the inverse of contains
        # flip back the results
        result = result.rename(
            columns={"_key_left": "_key_right", "_key_right": "_key_left"}
        )

    def _stash_index(frame, label, suffix):
        """Copy ``frame`` and move its index into column(s) named by ``label``.

        Returns the reset copy, the column label(s) now holding the index,
        and the original index name(s) to restore on the joined result.
        """
        frame = frame.copy(deep=True)
        try:
            original = frame.index.name
            frame.index = frame.index.rename(label)
        except TypeError:
            # Renaming with a single string failed; fall back to one
            # generated name per index level.
            label = [
                "index_%s" % suffix + str(pos)
                for pos, _ in enumerate(frame.index.names)
            ]
            original = frame.index.names
            frame.index = frame.index.rename(label)
        return frame.reset_index(), label, original

    # the spatial index only allows limited (numeric) index types, but an
    # index in geopandas may be any arbitrary dtype. so reset both indices now
    # and store references to the original indices, to be reaffixed later.
    # GH 352
    left_df, index_left, left_index_name = _stash_index(left_df, index_left, lsuffix)
    right_df, index_right, right_index_name = _stash_index(
        right_df, index_right, rsuffix
    )

    # perform join on the dataframes
    if how == "right":
        left_attrs = left_df.drop(left_df.geometry.name, axis=1)
        right_matches = result.merge(
            right_df, left_on="_key_right", right_index=True, how="right"
        )
        joined = (
            left_attrs.merge(
                right_matches, left_index=True, right_on="_key_left", how="right"
            )
            .set_index(index_right)
            .drop(["_key_left", "_key_right"], axis=1)
        )
        kept_label, kept_name = index_right, right_index_name
    else:
        # 'inner' and 'left' differ only in the merge strategy, which is
        # exactly the value of `how`.
        result = result.set_index("_key_left")
        left_matches = left_df.merge(
            result, left_index=True, right_index=True, how=how
        )
        joined = (
            left_matches.merge(
                right_df.drop(right_df.geometry.name, axis=1),
                how=how,
                left_on="_key_right",
                right_index=True,
                suffixes=("_%s" % lsuffix, "_%s" % rsuffix),
            )
            .set_index(index_left)
            .drop(["_key_right"], axis=1)
        )
        kept_label, kept_name = index_left, left_index_name

    # restore the original index name(s) of the frame whose keys were kept
    if isinstance(kept_label, list):
        joined.index.names = kept_name
    else:
        joined.index.name = kept_name

    return joined
def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
    """Perform spatial overlay between two GeoDataFrames.

    Currently only supports data GeoDataFrames with uniform geometry types,
    i.e. containing only (Multi)Polygons, or only (Multi)Points, or a
    combination of (Multi)LineString and LinearRing shapes.
    Implements several methods that are all effectively subsets of the union.

    See the User Guide page :doc:`../../user_guide/set_operations` for details.

    Parameters
    ----------
    df1 : GeoDataFrame
    df2 : GeoDataFrame
    how : string
        Method of spatial overlay: 'intersection', 'union',
        'identity', 'symmetric_difference' or 'difference'.
    keep_geom_type : bool
        If True, return only geometries of the same geometry type as df1 has,
        if False, return all resulting geometries. Default is None,
        which will set keep_geom_type to True but warn upon dropping
        geometries.
    make_valid : bool, default True
        If True, any invalid input geometries are corrected with a call to
        `buffer(0)`, if False, a `ValueError` is raised if any input
        geometries are invalid.

    Returns
    -------
    df : GeoDataFrame
        GeoDataFrame with new set of polygons and attributes
        resulting from the overlay

    Raises
    ------
    ValueError
        If `how` is not one of the supported methods, or if
        ``make_valid=False`` and a polygonal input contains invalid
        geometries.
    NotImplementedError
        If either input is a GeoSeries, or if an input mixes polygon, line
        and point geometry types.
    TypeError
        If `keep_geom_type` is in effect and the geometry type of the first
        row of `df1` is not a polygon, line or point type.

    Warns
    -----
    UserWarning
        If `keep_geom_type` was left as None and geometries of a different
        type than `df1` were dropped from the result.

    Examples
    --------
    >>> from shapely.geometry import Polygon
    >>> polys1 = geopandas.GeoSeries([Polygon([(0,0), (2,0), (2,2), (0,2)]),
    ...                               Polygon([(2,2), (4,2), (4,4), (2,4)])])
    >>> polys2 = geopandas.GeoSeries([Polygon([(1,1), (3,1), (3,3), (1,3)]),
    ...                               Polygon([(3,3), (5,3), (5,5), (3,5)])])
    >>> df1 = geopandas.GeoDataFrame({'geometry': polys1, 'df1_data':[1,2]})
    >>> df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2_data':[1,2]})

    >>> geopandas.overlay(df1, df2, how='union')
       df1_data  df2_data                                           geometry
    0       1.0       1.0  POLYGON ((2.00000 2.00000, 2.00000 1.00000, 1....
    1       2.0       1.0  POLYGON ((2.00000 2.00000, 2.00000 3.00000, 3....
    2       2.0       2.0  POLYGON ((4.00000 4.00000, 4.00000 3.00000, 3....
    3       1.0       NaN  POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....
    4       2.0       NaN  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...
    5       NaN       1.0  MULTIPOLYGON (((2.00000 2.00000, 3.00000 2.000...
    6       NaN       2.0  POLYGON ((3.00000 5.00000, 5.00000 5.00000, 5....

    >>> geopandas.overlay(df1, df2, how='intersection')
       df1_data  df2_data                                           geometry
    0         1         1  POLYGON ((2.00000 2.00000, 2.00000 1.00000, 1....
    1         2         1  POLYGON ((2.00000 2.00000, 2.00000 3.00000, 3....
    2         2         2  POLYGON ((4.00000 4.00000, 4.00000 3.00000, 3....

    >>> geopandas.overlay(df1, df2, how='symmetric_difference')
       df1_data  df2_data                                           geometry
    0       1.0       NaN  POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....
    1       2.0       NaN  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...
    2       NaN       1.0  MULTIPOLYGON (((2.00000 2.00000, 3.00000 2.000...
    3       NaN       2.0  POLYGON ((3.00000 5.00000, 5.00000 5.00000, 5....

    >>> geopandas.overlay(df1, df2, how='difference')
                                                geometry  df1_data
    0  POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....         1
    1  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...         2

    >>> geopandas.overlay(df1, df2, how='identity')
       df1_data  df2_data                                           geometry
    0       1.0       1.0  POLYGON ((2.00000 2.00000, 2.00000 1.00000, 1....
    1       2.0       1.0  POLYGON ((2.00000 2.00000, 2.00000 3.00000, 3....
    2       2.0       2.0  POLYGON ((4.00000 4.00000, 4.00000 3.00000, 3....
    3       1.0       NaN  POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....
    4       2.0       NaN  MULTIPOLYGON (((3.00000 3.00000, 4.00000 3.000...

    See also
    --------
    sjoin : spatial join
    GeoDataFrame.overlay : equivalent method

    Notes
    -----
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    # Allowed operations
    allowed_hows = [
        "intersection",
        "union",
        "identity",
        "symmetric_difference",
        "difference",  # aka erase
    ]
    # Error Messages
    if how not in allowed_hows:
        raise ValueError(
            "`how` was '{0}' but is expected to be in {1}".format(how, allowed_hows)
        )

    if isinstance(df1, GeoSeries) or isinstance(df2, GeoSeries):
        raise NotImplementedError(
            "overlay currently only implemented for GeoDataFrames"
        )

    # NOTE(review): `_crs_mismatch_warn` is defined elsewhere; presumably it
    # emits a warning attributed to the caller via `stacklevel`.
    if not _check_crs(df1, df2):
        _crs_mismatch_warn(df1, df2, stacklevel=3)

    # `None` means "filter like True, but tell the user when something was
    # dropped"; an explicit True/False stays silent.
    if keep_geom_type is None:
        keep_geom_type = True
        keep_geom_type_warning = True
    else:
        keep_geom_type_warning = False

    polys = ["Polygon", "MultiPolygon"]
    lines = ["LineString", "MultiLineString", "LinearRing"]
    points = ["Point", "MultiPoint"]

    # Each input may only contain geometries from a single family.
    for position, df in enumerate((df1, df2), start=1):
        poly_check = df.geom_type.isin(polys).any()
        lines_check = df.geom_type.isin(lines).any()
        points_check = df.geom_type.isin(points).any()
        if sum([poly_check, lines_check, points_check]) > 1:
            raise NotImplementedError(
                "df{} contains mixed geometry types.".format(position)
            )

    if how == "intersection":
        # Shortcut: when the total bounding boxes do not overlap, the
        # intersection is necessarily empty, so build an empty frame with the
        # expected columns instead of running the overlay.
        box_gdf1 = df1.total_bounds
        box_gdf2 = df2.total_bounds

        if not (
            ((box_gdf1[0] <= box_gdf2[2]) and (box_gdf2[0] <= box_gdf1[2]))
            and ((box_gdf1[1] <= box_gdf2[3]) and (box_gdf2[1] <= box_gdf1[3]))
        ):
            result = df1.iloc[:0].merge(
                df2.iloc[:0].drop(df2.geometry.name, axis=1),
                left_index=True,
                right_index=True,
                suffixes=("_1", "_2"),
            )
            # Move the geometry column to the end.
            return result[
                result.columns.drop(df1.geometry.name).tolist()
                + [df1.geometry.name]
            ]

    # Computations
    def _make_valid(df):
        # Work on a copy so the caller's frame is never modified.
        df = df.copy()
        if df.geom_type.isin(polys).all():
            mask = ~df.geometry.is_valid
            col = df._geometry_column_name
            if make_valid:
                df.loc[mask, col] = df.loc[mask, col].buffer(0)
            elif mask.any():
                raise ValueError(
                    "You have passed make_valid=False along with "
                    f"{mask.sum()} invalid input geometries. "
                    "Use make_valid=True or make sure that all geometries "
                    "are valid before using overlay."
                )
        return df

    df1 = _make_valid(df1)
    df2 = _make_valid(df2)

    with warnings.catch_warnings():  # CRS checked above, suppress array-level warning
        warnings.filterwarnings("ignore", message="CRS mismatch between the CRS")
        if how == "difference":
            result = _overlay_difference(df1, df2)
        elif how == "intersection":
            result = _overlay_intersection(df1, df2)
        elif how == "symmetric_difference":
            result = _overlay_symmetric_diff(df1, df2)
        elif how == "union":
            result = _overlay_union(df1, df2)
        elif how == "identity":
            # identity == the part of the union that originates from df1
            dfunion = _overlay_union(df1, df2)
            result = dfunion[dfunion["__idx1"].notnull()].copy()

        # The difference branch is the only one excluded from this cleanup of
        # the helper index columns.
        if how in ["intersection", "symmetric_difference", "union", "identity"]:
            result.drop(["__idx1", "__idx2"], axis=1, inplace=True)

    if keep_geom_type:
        # The first row of df1 decides which geometry family is retained.
        geom_type = df1.geom_type.iloc[0]
        for family in (polys, lines, points):
            if geom_type in family:
                keep_types = family
                break
        else:
            raise TypeError("`keep_geom_type` does not support {}.".format(geom_type))

        # First we filter the geometry types inside GeometryCollections objects
        # (e.g. GeometryCollection([polygon, point]) -> polygon)
        # we do this separately on only the relevant rows, as this is an expensive
        # operation (an expensive no-op for geometry types other than collections)
        is_collection = result.geom_type == "GeometryCollection"
        if is_collection.any():
            geom_col = result._geometry_column_name
            collections = result[[geom_col]][is_collection]

            exploded = collections.reset_index(drop=True).explode(index_parts=True)
            exploded = exploded.reset_index(level=0)

            # Parts that exist but belong to another geometry family.
            off_type = exploded.geometry.notna() & ~exploded.geom_type.isin(
                keep_types
            )
            exploded.loc[off_type, geom_col] = None
            # Count the parts that were actually removed (the previous
            # "total - null" arithmetic counted the parts that were kept).
            num_dropped_collection = int(off_type.sum())

            # level_0 created with above reset_index operation
            # and represents the original geometry collections
            # TODO avoiding dissolve to call unary_union in this case could further
            # improve performance (we only need to collect geometries in their
            # respective Multi version)
            dissolved = exploded.dissolve(by="level_0")
            result.loc[is_collection, geom_col] = dissolved[geom_col].values
        else:
            num_dropped_collection = 0

        # Now we filter all geometries (in theory we don't need to do this
        # again for the rows handled above for GeometryCollections, but filtering
        # them out is probably more expensive as simply including them when this
        # is typically about only a few rows)
        keep_mask = result.geom_type.isin(keep_types)
        # Rows that were collections have already been counted part-by-part
        # above, so only count the other rows here to avoid double counting.
        num_dropped = int((~keep_mask & ~is_collection).sum())
        result = result.loc[keep_mask]

        if (num_dropped > 0 or num_dropped_collection > 0) and keep_geom_type_warning:
            warnings.warn(
                "`keep_geom_type=True` in overlay resulted in {} dropped "
                "geometries of different geometry types than df1 has. "
                "Set `keep_geom_type=False` to retain all "
                "geometries".format(num_dropped + num_dropped_collection),
                UserWarning,
                stacklevel=2,
            )

    result.reset_index(drop=True, inplace=True)
    return result