def prune_columns(gds: GeoDataFrame):
    """Remove unneeded columns."""
    columns_remove = gds.columns.difference(['osmid', 'geometry', 'main_tag'])
    if len(columns_remove) == 0:
        return
    # NOTE: the original called gds.drop(columns_remove, inplace=True), which
    # tries to drop *rows* labeled by the column names; use columns= instead
    gds.drop(columns=columns_remove, inplace=True)
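
# Usage sketch (hypothetical data; assumes the snippet's GeoDataFrame import
# is in scope): every column outside the whitelist is dropped in place.
from shapely.geometry import Point
import geopandas as gpd

gds = gpd.GeoDataFrame({
    'osmid': [1, 2],
    'main_tag': ['shop', 'amenity'],
    'extra': ['a', 'b'],
    'geometry': [Point(0, 0), Point(1, 1)],
})
prune_columns(gds)
assert set(gds.columns) == {'osmid', 'geometry', 'main_tag'}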
def building_density_per_block(bldgs: gpd.GeoDataFrame,
                               blocks: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    '''
    Adds the total building area, building count, and building density
    as columns on the blocks dataframe
    '''
    assert 'block_id' in bldgs.columns, "ERROR: bldgs dataframe does not have block_id"

    bldgs['bldg_area'] = bldgs.area
    bldgs['bldg_count'] = 1
    bldg_area_by_block = bldgs[['block_id', 'bldg_area', 'bldg_count']].groupby('block_id').sum()
    bldg_area_by_block.reset_index(inplace=True)

    # Drop stale columns from a previous run before merging fresh values
    for c in ['bldg_area', 'bldg_count']:
        if c in blocks.columns:
            blocks.drop(columns=[c], inplace=True)

    blocks = blocks.merge(bldg_area_by_block, how='left', on='block_id')
    blocks['block_area'] = blocks.area
    blocks['bldg_density'] = blocks['bldg_area'] / blocks['block_area']
    blocks.fillna(value=0.0, inplace=True)
    return blocks
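
# Usage sketch (hypothetical data): a quarter-size building inside a unit
# block yields a density of 0.25. CRS handling is ignored here, so the
# .area calls work on raw coordinates.
from shapely.geometry import Polygon
import geopandas as gpd

blocks = gpd.GeoDataFrame({
    'block_id': ['b1'],
    'geometry': [Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
})
bldgs = gpd.GeoDataFrame({
    'block_id': ['b1'],
    'geometry': [Polygon([(0, 0), (0.5, 0), (0.5, 0.5), (0, 0.5)])],
})
blocks = building_density_per_block(bldgs, blocks)
assert blocks.loc[0, 'bldg_density'] == 0.25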
def from_lines(
    gdf: gpd.GeoDataFrame,
    attrs: List[str] = [],
    tolerance: float = .001,
) -> nx.DiGraph:
    if attrs:
        gdf.drop(
            [col for col in gdf.columns
             if col not in attrs and col != 'geometry'],
            inplace=True,
            axis=1,
        )
    # Use the first/last coordinate of each line as source/target nodes
    gdf['_source'] = gdf.geometry.map(lambda geom: geom.coords[0])
    gdf['_target'] = gdf.geometry.map(lambda geom: geom.coords[-1])
    if tolerance > 0:
        # Snap nearby endpoints together by rounding to the number of
        # decimal places implied by the tolerance
        rounding = int(np.ceil(-np.log10(tolerance)))

        def rounder(tup):
            return tuple(round(value, rounding) for value in tup)

        gdf['_source'] = gdf['_source'].map(rounder)
        gdf['_target'] = gdf['_target'].map(rounder)
    return nx.from_pandas_edgelist(gdf, '_source', '_target',
                                   edge_attr=True, create_using=nx.DiGraph)
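
# Usage sketch (hypothetical data; assumes the snippet's gpd/np/nx imports
# are in scope): two touching segments become a three-node directed graph.
from shapely.geometry import LineString
import geopandas as gpd

lines = gpd.GeoDataFrame({
    'name': ['a', 'b'],
    'geometry': [LineString([(0, 0), (1, 0)]),
                 LineString([(1, 0), (2, 0)])],
})
g = from_lines(lines, attrs=['name'])
assert g.number_of_nodes() == 3 and g.number_of_edges() == 2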
def get_pair_footprints(pair_ids: List, plot=True, save_path=None):
    """
    Plot or save overlapping areas between pairs of NAC images

    :param save_path: Path to output a vector file of the geometries, or None
        for no output. The driver is inferred from the file extension.
    :param plot: Create a figure
    :param pair_ids: Pair ids, a list given like
        ['M106761561LExxM1101080055RE', 'M1096364254RExxM1142334242LE']
    :return: GeoDataFrame containing the pair footprints
    """
    pairs = [pair_id.split('xx') for pair_id in pair_ids]
    df = DataFrame(pairs, index=pair_ids, columns=['prod_id_0', 'prod_id_1'])
    df = df.applymap(get_geometry_from_ODE).applymap(wkt.loads)
    df['intersection'] = df.apply(
        lambda a: a['prod_id_0'].intersection(a['prod_id_1']), axis='columns')
    gdf = GeoDataFrame(df, geometry='intersection')
    gdf['pair_ids'] = gdf.index.values

    if save_path:
        # NOTE: the original only set save_driver for '.json', leaving it
        # undefined for other extensions; fall back to None so GeoPandas
        # infers the driver from the extension
        save_driver = 'GeoJSON' if save_path.endswith('.json') else None
        gdf.drop(['prod_id_0', 'prod_id_1'],
                 axis='columns').to_file(save_path, driver=save_driver)

    if plot:
        gdf.geometry = gdf.geometry.boundary
        gdf.plot(column='pair_ids', legend=True,
                 legend_kwds={'loc': 'center left',
                              'bbox_to_anchor': (1, 0.5)})
        pyplot.xlabel('Longitude, degrees E')
        pyplot.ylabel('Latitude, degrees N')

    return gdf
def paginate(
    pop_by_plot: gpd.GeoDataFrame,
    order: Sequence,
    page_distribution: Distribution,
    page_col: str,
    plot_number_col: str,
    district_number_col: str,
    n_plots: int = None,
):
    if not n_plots:
        n_plots = len(pop_by_plot.index)
    if order:
        if n_plots % len(order) != 0:
            raise ValueError('orders and plots do not match')
        pop_by_plot['order'] = order
        pop_by_plot.sort_values(by=['district', 'order'], inplace=True)
        pop_by_plot.drop(columns='order', inplace=True)
    pages = _get_simulated_plots_by_page(page_distribution, n_plots)
    pop_by_page = _get_simulated_pop_by_page(
        pop_by_plot,
        pages,
        page_col=page_col,
        plot_number_col=plot_number_col,
        district_number_col=district_number_col,
    )
    return pop_by_page
def remove_erroneous_pv_polygons(
    self,
    raw_PV_installations_on_rooftop: gpd.GeoDataFrame = None
) -> gpd.GeoDataFrame:
    """
    Removes PV polygons whose aggregated intersected area is larger than
    their original raw area

    Parameters
    ----------
    raw_PV_installations_on_rooftop : geopandas.GeoDataFrame
        GeoDataFrame which must contain the columns "area_inter",
        "raw_area", and "identifier"

    Returns
    -------
    geopandas.GeoDataFrame
        Input GeoDataFrame where erroneous PV polygons have been removed
    """
    # Compute the share of the raw area that the intersected PV polygon covers
    raw_PV_installations_on_rooftop["percentage_intersect"] = (
        raw_PV_installations_on_rooftop["area_inter"]
        / raw_PV_installations_on_rooftop["raw_area"])

    # Group intersections by polygon identifier and sum the percentages
    group_intersection_id = raw_PV_installations_on_rooftop.groupby("identifier").agg({
        "area_inter": "sum",
        "Street": "first",
        "Street_Address": "first",
        "raw_area": "first",
        "City": "first",
        "PostalCode": "first",
        "percentage_intersect": "sum",
    })

    # Find erroneous polygons whose area after intersection is larger than
    # their original (raw) area
    polygone = group_intersection_id[
        group_intersection_id["percentage_intersect"] > 1.1].index.tolist()

    # Filter out the erroneous polygons identified above and all their
    # respective sub-parts
    raw_PV_installations_on_rooftop = raw_PV_installations_on_rooftop.drop(
        raw_PV_installations_on_rooftop.index[
            (raw_PV_installations_on_rooftop["identifier"].isin(polygone))
            & (raw_PV_installations_on_rooftop["percentage_intersect"] < 1)])

    # Drop duplicate identifiers for erroneous polygons
    raw_PV_installations_on_rooftop = raw_PV_installations_on_rooftop.drop(
        raw_PV_installations_on_rooftop.index[
            (raw_PV_installations_on_rooftop["identifier"].isin(polygone))
            & (raw_PV_installations_on_rooftop["identifier"].duplicated())])

    return raw_PV_installations_on_rooftop
def delete_small_polygons(polygons: geopandas.GeoDataFrame, area=1e-6):
    """Delete polygons smaller than ``area``; deletion is in place."""
    todrop = []
    for idx, p in enumerate(polygons.geometry):
        if p.area < area:  # was hardcoded to 1e-6, ignoring the parameter
            todrop.append(idx)
    # enumerate yields positions, so translate them to index labels
    polygons.drop(polygons.index[todrop], inplace=True)
    return len(todrop)
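
# Usage sketch (hypothetical data): the degenerate sliver is dropped in place.
from shapely.geometry import Polygon
import geopandas

polys = geopandas.GeoDataFrame(geometry=[
    Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),               # area 1.0
    Polygon([(0, 0), (1e-4, 0), (1e-4, 1e-4), (0, 1e-4)]),   # area 1e-8
])
assert delete_small_polygons(polys) == 1
assert len(polys) == 1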
def _clean_attributes(self, input_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    for col_name in input_gdf.columns:
        if col_name in self.__USELESS_COLUMNS:
            input_gdf.drop(columns=[col_name], inplace=True)
    if self._ID_DEFAULT_FIELD not in input_gdf.columns:
        input_gdf.loc[:, self._ID_DEFAULT_FIELD] = input_gdf.index.astype(str)
    return input_gdf
def prepare_parcels(bldgs: gpd.GeoDataFrame, blocks: gpd.GeoDataFrame,
                    parcels: gpd.GeoDataFrame) -> pd.DataFrame:
    '''
    For a single GADM, this script (1) creates the PlanarGraph associated
    with each respective parcel and (2) maps all buildings to their
    corresponding parcel. The buildings are converted to centroids and then
    to Node types so they can just be added to the PlanarGraph
    '''
    # Convert buildings to centroids
    bldgs['centroids'] = bldgs['geometry'].centroid
    bldgs.set_geometry('centroids', inplace=True)

    # We want to map each building to a given block to then map the
    # buildings to a parcel
    bldgs = gpd.sjoin(bldgs, blocks, how='left', op='within')
    bldgs.drop(columns=['index_right'], inplace=True)

    # Now, join the parcels with the buildings
    parcels = parcels.merge(bldgs[['block_id', 'centroids']],
                            how='left', on='block_id')
    parcels.rename(columns={'geometry': 'parcel_geometry',
                            'centroids': 'buildings'}, inplace=True)

    # Now collapse on the block and clean
    parcels = parcels.groupby('block_id').agg(list)
    parcels['parcel_geometry'] = parcels['parcel_geometry'].apply(lambda x: x[0])
    parcels['buildings'] = parcels['buildings'].apply(
        lambda x: [] if x == [np.nan] else x)

    # Checks
    assert blocks.shape[0] == parcels.shape[0]  # We should maintain block count
    parcels['buildings_count'] = parcels['buildings'].apply(len)
    # assert parcels['buildings_count'].sum() == bldgs.shape[0]  # We should maintain bldgs count
    parcels.reset_index(inplace=True)

    # Now, create the graph for each parcel
    parcels['planar_graph'] = parcels['parcel_geometry'].apply(
        PlanarGraph.multilinestring_to_planar_graph)

    # And convert the buildings from shapely.Points -> topology.Nodes
    parcels['buildings'] = parcels['buildings'].apply(
        lambda x: [point_to_node(p) for p in x])

    return parcels
def get_flat_priority(pou_src, out_file):
    flat = GeoDataFrame(columns=['id', 'DT', 'geo', 'obj'])
    pou = read_file(pou_src)
    pou = pou[(pou['PURPOSE'] == 'IRRIGATION') & (pou['WRSTATUS'] == 'ACTIVE')]
    pou['ENFRPRIDAT'] = [to_datetime(x) for x in pou['ENFRPRIDAT']]
    pou = pou.rename(columns={'geometry': 'geo', 'ENFRPRIDAT': 'dt'})
    # Sort by enforceable priority date so earlier rights are handled first
    pou = pou.sort_values(by='dt')
    pou = pou[['dt', 'geo', 'OBJECTID']]
    pou = pou.reset_index(drop=True)
    good_rows = [i for i, x in enumerate(pou['dt']) if isinstance(x, Timestamp)]
    pou = pou.loc[good_rows]
    pou = pou.astype({'OBJECTID': int})

    first = True
    ct = 0
    for i, (dt, g, obj) in tqdm(pou.iterrows(), total=pou.shape[0]):
        if first:
            flat.loc[ct] = [ct, dt, g, obj]
            ct += 1
            first = False
        else:
            # NOTE: the original tested any(equal)/any(inter), which is False
            # for [0] and so silently missed matches on the first flat row;
            # test list truthiness instead
            equal = [j for j, x in enumerate(flat['geo']) if g.almost_equals(x)]
            if equal:
                continue
            inter = [j for j, x in enumerate(flat['geo']) if g.intersects(x)]
            if not inter:
                flat.loc[ct] = [ct, dt, g, obj]
                ct += 1
            else:
                # Keep only the area not already claimed by earlier rights
                for ix in inter:
                    g = g.difference(flat.loc[ix]['geo'])
                if g.area > 0:
                    flat.loc[ct] = [ct, dt, g, obj]
                    ct += 1

    good_rows = [i for i, x in enumerate(flat['geo'])
                 if isinstance(x, (Polygon, MultiPolygon))]
    flat = flat.loc[good_rows]
    geo = flat['geo']
    flat['DT'] = [str(x)[:10] for x in flat['DT']]
    flat['dt_int'] = [int(''.join(x.split('-'))) for x in flat['DT']]
    flat.drop(columns=['geo'], inplace=True)
    gdf = GeoDataFrame(flat, geometry=geo, crs='EPSG:32100')
    gdf.to_file(out_file)
def write_outputs(
    cfg: dict,
    bin_gdf: GeoDataFrame,
    eq_gdf: GeoDataFrame,
    write_index: bool = False,
) -> None:
    """
    Writes output GIS files and plots (i.e., maps or MFD plots). All of the
    options for what to write are specified in the `cfg`.

    :param cfg: Configuration for the evaluations, such as that parsed from
        the YAML config file.
    :param bin_gdf: :class:`GeoDataFrame` with the spatial bins for testing
    :param eq_gdf: :class:`GeoDataFrame` with the observed earthquake catalog.
    """
    logger.info("writing outputs")

    if "plots" in cfg["output"].keys():
        write_mfd_plots_to_gdf(bin_gdf, **cfg["output"]["plots"]["kwargs"])

    if "map_epsg" in cfg["config"]:
        # NOTE: the original assigned `out_gdf = out_gdf.to_crs(...)`, which
        # referenced an undefined name; reproject the bin GeoDataFrame instead
        bin_gdf = bin_gdf.to_crs(cfg["config"]["map_epsg"])

    if "bin_gdf" in cfg["output"].keys():
        outfile = cfg["output"]["bin_gdf"]["file"]
        out_format = outfile.split(".")[-1]
        bin_gdf["bin_index"] = bin_gdf.index
        bin_gdf.index = np.arange(len(bin_gdf))

        if out_format == "csv":
            write_bin_gdf_to_csv(outfile, bin_gdf)
        else:
            try:
                bin_gdf.drop("SpacemagBin", axis=1).to_file(
                    outfile,
                    driver=OUTPUT_FILE_MAP[out_format],
                    index=write_index,
                )
            except KeyError:
                raise Exception(f"No writer for {out_format} format")
def add_block_id(bldg_pop: gpd.GeoDataFrame,
                 block: Union[gpd.GeoDataFrame, str],
                 ) -> gpd.GeoDataFrame:
    """
    add_block_id()

    Step 2: some bldg files don't have the block_id, so that may need to be
    joined on.

    NOTE: block can be a path to the block GeoDataFrame, or the already
    loaded GeoDataFrame.

    Joins the block_id column onto the building geodf.
    """
    block = flex_load(block)
    bldg_pop = utils.join_block_building(block, bldg_pop)
    if 'index_right' in bldg_pop.columns:
        bldg_pop.drop(columns=['index_right'], inplace=True)
    return bldg_pop
def add_block_bldg_area(bldg_pop: gpd.GeoDataFrame,
                        block: gpd.GeoDataFrame,
                        ) -> gpd.GeoDataFrame:
    """
    Calculates the total building area per block (in sq. km) and adds that
    to the bldg_pop geodf

    NOTE: despite the signature, `block` is currently unused.
    """
    # Project to EPSG:3395 (meters) so .area is in square meters
    bldg_pop = bldg_pop.to_crs("EPSG:3395")
    bldg_pop['bldg_area'] = bldg_pop.area * 1e-6  # sq. km
    block_bldg_area = bldg_pop[['block_id', 'bldg_area']].groupby('block_id').sum().reset_index()
    block_bldg_area.rename(columns={'bldg_area': 'block_bldg_area'}, inplace=True)
    bldg_pop = bldg_pop.merge(block_bldg_area, how='left', on='block_id')
    bldg_pop = bldg_pop.to_crs("EPSG:4326")
    bldg_pop.drop(columns=["bldg_area"], inplace=True)
    return bldg_pop
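
# Usage sketch (hypothetical data; `block` is unused by this snippet, so
# None is passed): the per-block building area lands on each building row.
from shapely.geometry import Polygon
import geopandas as gpd

bldg_pop = gpd.GeoDataFrame(
    {'block_id': ['b1']},
    geometry=[Polygon([(0, 0), (0.001, 0), (0.001, 0.001), (0, 0.001)])],
    crs="EPSG:4326",
)
bldg_pop = add_block_bldg_area(bldg_pop, block=None)
assert 'block_bldg_area' in bldg_pop.columns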
def convert_GeoPandas_to_Bokeh_format(gdf: gpd.GeoDataFrame) -> ColumnDataSource:
    """
    Function to convert a GeoPandas GeoDataFrame to a Bokeh
    ColumnDataSource object.

    :param: (GeoDataFrame) gdf: GeoPandas GeoDataFrame with polygon(s) under
        the column name 'geometry'.
    :return: ColumnDataSource for Bokeh.
    """
    gdf_new = gdf.drop('geometry', axis=1).copy()
    gdf_new['x'] = gdf.apply(getGeometryCoords, geom='geometry',
                             coord_type='x', shape_type='polygon', axis=1)
    gdf_new['y'] = gdf.apply(getGeometryCoords, geom='geometry',
                             coord_type='y', shape_type='polygon', axis=1)
    return ColumnDataSource(gdf_new)
def __geometrize_gdf(self, gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    try:
        # Parse WKT strings in the 'geometry' column into shapely geometries
        geometry = gdf['geometry'].map(shapely.wkt.loads)
        geometrized = gdf.drop(columns='geometry')
        return gpd.GeoDataFrame(geometrized, geometry=geometry)
    except Exception:  # the column already held geometries, or parsing failed
        return gdf
def dissolve(
    gdf: gpd.GeoDataFrame,
    by: Iterable[str],
    func: Union[Callable, str, list, dict],
    how: Union[Literal["union", "first"],
               Callable[[gpd.GeoSeries], BaseGeometry]] = "union",
) -> gpd.GeoDataFrame:
    """
    Dissolve layer by aggregating features based on common attributes.

    Args:
        gdf: GeoDataFrame with non-empty (Multi)Polygon geometries.
        by: Names of columns to group features by.
        func: Aggregation function for data columns
            (see :meth:`pd.DataFrame.groupby`).
        how: Aggregation function for geometry column. Either 'union'
            (:meth:`gpd.GeoSeries.unary_union`), 'first' (first geometry in
            group), or a function aggregating multiple geometries into one.

    Returns:
        GeoDataFrame with dissolved geometry and data columns, and grouping
        columns set as the index.
    """
    check_gdf(gdf)
    merges = {"union": lambda x: x.unary_union, "first": lambda x: x.iloc[0]}
    data = gdf.drop(columns=gdf.geometry.name).groupby(by=by).aggregate(func)
    geometry = gdf.groupby(by=by, group_keys=False)[gdf.geometry.name].aggregate(
        merges.get(how, how))
    return gpd.GeoDataFrame(geometry, geometry=gdf.geometry.name,
                            crs=gdf.crs).join(data)
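
# Usage sketch (hypothetical data): two adjacent unit squares sharing a
# 'zone' key dissolve into one 1x2 rectangle with the populations summed.
# `check_gdf` is this module's validator; the no-op stub below is an
# assumption for running the snippet standalone.
from shapely.geometry import Polygon
import geopandas as gpd

def check_gdf(gdf):  # stub validator (assumption)
    pass

zones = gpd.GeoDataFrame({
    'zone': ['a', 'a'],
    'pop': [10, 20],
    'geometry': [Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
                 Polygon([(1, 0), (2, 0), (2, 1), (1, 1)])],
})
out = dissolve(zones, by=['zone'], func={'pop': 'sum'})
assert out.loc['a', 'pop'] == 30
assert out.loc['a', 'geometry'].area == 2.0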
def get_aggregate_locations_by_district(
    population_data: pd.DataFrame,
    location_data: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    len_pop = len(population_data.index)
    len_loc = len(location_data.index)
    if len_loc == 0 or len_pop == 0:
        return gpd.GeoDataFrame()
    elif len_loc < len_pop:
        sample_index = interval_sample(population_data.index, len_loc)
        new_geom = gpd.GeoDataFrame(
            {'geometry': location_data.geometry},
            index=sample_index,
        )
        try:
            # align() returns a (left, right) pair; keep the aligned geometry
            # frame (the original assigned the whole tuple to new_geom)
            new_geom, _ = new_geom.align(
                population_data,
                join='outer',
                method='pad',
            )
        except NotImplementedError:
            return gpd.GeoDataFrame()
        location_data = new_geom
    elif len_pop < len_loc:
        sample_index = interval_sample(location_data.index, len_pop)
        location_data = location_data.loc[sample_index]

    location_data = location_data.reset_index()
    location_data = location_data.drop(
        columns=['level_0', 'index'],
        errors='ignore',
    )
    population_data = population_data.reset_index()
    population_data = population_data.drop(
        columns=['plot_number', 'district'],
        errors='ignore',
    )
    geodata = pd.concat([location_data, population_data], axis=1)
    geodata = geodata.drop(
        columns=['index', 'Unnamed: 0'],
        errors='ignore',
    )
    return geodata
def save_selection(df: gpd.GeoDataFrame, name: str, project_path: str) -> None:
    '''
    Saves selection to directory
    '''
    out_dir = Path(project_path) / 'exported'
    out_dir.mkdir(parents=True, exist_ok=True)

    # Find the first unused version number
    v = 0
    out_path = out_dir / "{}.v{}.geojson".format(name, v)
    while out_path.is_file():
        v += 1
        out_path = out_dir / "{}.v{}.geojson".format(name, v)

    if 'x' in df.columns:
        df = df.drop(columns=['x'])
    if 'y' in df.columns:
        df = df.drop(columns=['y'])

    df.to_file(str(out_path), driver='GeoJSON')
    print('Saved to: {}'.format(out_path.resolve()))
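
# Usage sketch (hypothetical data): repeated saves get fresh version numbers
# instead of overwriting, and helper columns 'x'/'y' are stripped first.
import tempfile
from shapely.geometry import Point
import geopandas as gpd

sel = gpd.GeoDataFrame({'x': [0.0], 'y': [0.0]},
                       geometry=[Point(0, 0)], crs="EPSG:4326")
with tempfile.TemporaryDirectory() as tmp:
    save_selection(sel, 'sites', tmp)  # writes exported/sites.v0.geojson
    save_selection(sel, 'sites', tmp)  # writes exported/sites.v1.geojson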
def clip_bands_to_polygon(bands, out_bands, mask):
    with fiona.open(mask, 'r') as src:
        feat = [f for f in src]
    bounds = shape(feat[0]['geometry'])
    df = read_csv(bands)
    gdf = GeoDataFrame(df, geometry=points_from_xy(y=df['LAT_GCS'],
                                                   x=df['Lon_GCS']))
    gdf = clip(gdf, mask=bounds)
    df = DataFrame(gdf.drop(columns='geometry'))
    df.to_csv(out_bands)
def map_choropleth(
    gdf: geopandas.GeoDataFrame,
    color_field,
    *,
    highlight_polygon: str = "",
    min_color: str = "#F4D2D2",
    max_color: str = "#CC0000",
    color_steps: int = 5,
    legend_title: str = None,
) -> alt.Chart:
    """
    Creates a choropleth map of covid data from a geopandas dataframe.

    Args:
        gdf (geopandas.GeoDataFrame): geodataframe of covid data.
        color_field (str): Column from gdf that will be used for the
            choropleth map.
        highlight_polygon (str, optional): Creates a border around a selected
            polygon to emphasise it.
        min_color (str, optional): HSL, RGB, HEX, WEB of min color of
            choropleth range. Defaults to "#F4D2D2"
        max_color (str, optional): HSL, RGB, HEX, WEB of max color of
            choropleth range. Defaults to "#CC0000"
        color_steps (int, optional): Number of steps between min and max for
            final choropleth color range. Defaults to 5.
        legend_title (str, optional): Title for legend. Defaults to
            color_field value.

    Returns:
        Altair chart instance.
    """
    # dropping ID col to avoid warning message from gpdvega/altair
    gdf = gdf.drop(["id"], axis=1)
    data = convert_gfp_to_alt(gdf)

    color_range = list(Color(min_color).range_to(Color(max_color), color_steps))
    color_range = [x.hex for x in color_range]
    legend_title = color_field if not legend_title else legend_title

    chart = (
        alt.Chart(data)
        .mark_geoshape(
            strokeWidth=1,
            stroke="#fff",
            # width=300, height=200
        )
        .project()
        .encode(color=alt.Color(
            f"properties.{color_field}:Q",
            scale=alt.Scale(
                type="quantize",
                nice=True,
                range=color_range,
            ),
            legend=alt.Legend(orient="top", title=legend_title,
                              titleLimit=200),
        ))
        .properties(width=600, height=460))

    if highlight_polygon:
        gdf_highlight = gdf[gdf["NAME"].str.contains(highlight_polygon,
                                                     case=False)]
        # NOTE: the original snippet is truncated here; presumably
        # gdf_highlight is layered on top of `chart` as an outline.

    return chart
def _import_gdf(
    gdf: GeoDataFrame,
    sql_tablename: str,
    geom_type: str,
    uri: str = DEFAULT_DB_URI
) -> None:
    """ Import a geopandas GeoDataFrame to SQL """
    gdf.columns = [x.lower() for x in gdf.columns]
    epsg_code = int(str(gdf.crs).split(":")[1])
    # Convert shapely geometries to WKT elements that GeoAlchemy2 understands
    gdf["geom"] = gdf["geometry"].apply(lambda x: WKTElement(x.wkt, srid=epsg_code))
    # NOTE: the original used drop("geometry", 1, ...); the positional `axis`
    # argument was removed in pandas 2.0
    gdf.drop(columns="geometry", inplace=True)

    engine = sqlalchemy.create_engine(uri)
    gdf.to_sql(
        sql_tablename,
        engine,
        dtype={"geom": Geometry(geom_type.upper(), srid=epsg_code)},
        if_exists="replace",
    )
    engine.dispose()
def __init__(
    self,
    data: gpd.GeoDataFrame,
    variable: str,
    kernel: Kernel,
    cell_size,
    polygon=None,
):
    if not isinstance(data, gpd.GeoDataFrame):
        raise TypeError('data should be a geopandas GeoDataFrame')
    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')
    self.polygon = polygon
    data = data.rename(columns={variable: 'variable'})
    self.cell_size = cell_size

    # Reduce geometries to point locations (the original assigned to the
    # `.points` attribute, which pandas silently ignores)
    data = data.set_geometry(data.geometry.centroid)
    convex = MultiPoint(data.geometry).convex_hull
    if not self.polygon:
        self.polygon = convex.buffer(kernel.bandwidth)
    xmin, ymin, xmax, ymax = self.bbox = self.polygon.bounds

    # Build the estimation grid over the polygon's bounding box
    x = np.arange(xmin, xmax, self.cell_size)
    y = np.arange(ymin, ymax, self.cell_size)
    y = np.flipud(y)
    x, y = np.meshgrid(x, y)
    self.shape = x.shape
    flat = x.flatten()[:, np.newaxis], y.flatten()[:, np.newaxis]
    df = pd.DataFrame(np.hstack(flat), columns=['x', 'y'])

    # Drop grid cells that fall outside the polygon
    outside = [row.Index for row in df.itertuples()
               if not self.polygon.contains(Point(row.x, row.y))]
    self.df = df.drop(outside)
    self.kernel = kernel

    # Pairwise distances between grid cells and data points, then kernel
    # weights and the weighted density per cell
    x1, x2 = np.meshgrid(self.df['x'], data.geometry.x)
    y1, y2 = np.meshgrid(self.df['y'], data.geometry.y)
    self.d = np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    self.w = self.kernel(self.d)
    vals = data['variable'].values.reshape(len(data), 1)
    self.df['density'] = np.sum(self.w * vals, axis=0)

    # Re-insert zeros for the cells outside the polygon, then restore order
    # (Series.append was removed in pandas 2.0, so use pd.concat)
    zeros = pd.Series(np.zeros(len(outside)), index=outside)
    grid = pd.concat([self.df['density'], zeros],
                     verify_integrity=True).sort_index()
    self.grid = grid.values.reshape(self.shape)
def to_geojson(gdf: gpd.GeoDataFrame, file_name, crs="EPSG:4326"):
    r"""
    Write GeoDataFrame to GeoJson file

    Write GeoDataFrame to a GeoJson file with the default coordinate
    reference system (crs) "EPSG:4326". If a GeoDataFrame has multiple
    columns containing geometries, only the column
    `GeoDataFrame.geometry.name` is kept.

    Parameters
    ----------
    crs : str, defaults to "EPSG:4326"
        The coordinate reference system (crs) of the output GeoJson file.

    See Also
    --------
    ~stplanpy.geo.read_shp

    Examples
    --------
    The example data file, "`tl_2011_06_taz10.zip`_", can be downloaded from
    github.

    .. code-block:: python

        from stplanpy import geo

        # Read taz data from zip file
        taz = geo.read_shp("tl_2011_06_taz10.zip")

        # Write to file
        taz.to_geojson("taz.GeoJson")

    .. _tl_2011_06_taz10.zip: https://raw.githubusercontent.com/pctBayArea/stplanpy/main/examples/tl_2011_06_taz10.zip
    """
    # Keep only the active geometry column, reproject, and write
    gdf.drop(gdf.loc[:, (gdf.dtypes == "geometry")
                     & (gdf.columns != gdf.geometry.name)].columns,
             axis=1).to_crs(crs).to_file(file_name, driver="GeoJSON")
def load_crime_stats(population_group=None, crime_list=None, provence=None):
    # lowercase the province name
    if provence is not None:
        provence = provence.lower()

    # get data set dir
    data_path = get_work_path()

    # load and clean police stats
    police_stats = clean_police_stats(
        data_path.joinpath('Police_Statistics___2005_-_2017.csv'))
    if crime_list is not None:
        police_stats = police_stats[police_stats['Crime'].isin(crime_list)]
    if provence is not None:
        police_stats = police_stats.query(f"Province == '{provence}'")

    # population shape file
    pop_stats = clean_popluation_stats(
        data_path.joinpath(
            'population/geo_export_3ec3ac74-ddff-4220-8007-b9b5643f79af.shp'))
    base_group = ['sal_code_i', 'pr_name', 'sp_name', 'geometry']
    if population_group is not None:
        # filter out columns
        pop_stats = pop_stats[pop_groups[population_group] + base_group]
    if provence is not None:
        pop_stats = pop_stats.query(f"pr_name == '{provence}'")

    # shape id to weights
    precinct = clean_area_2_precint(
        data_path.joinpath('Precinct_to_small_area_weights.csv'))

    # munge data
    df = merge(precinct, pop_stats, left_on='small_area', right_on='sal_code_i')
    df = merge(df, police_stats, left_on='precinct', right_on='Police Station')

    # calculate crime per shape file as a proportion of the precinct weight
    df['total_crime'] = df.weight * df.Incidents

    # keep as geo-dataframe
    df = GeoDataFrame(df, crs=pop_stats.crs)

    # clean data frame
    df = df.drop([
        'sal_code_i', 'pr_name', 'sp_name', 'Police Station', 'Incidents',
        'weight'
    ], axis=1)

    # aggregate precincts back into shapes
    temp_df = df.groupby(['small_area', 'Year', 'Crime'])[['total_crime']].sum().round()
    df = df.drop_duplicates(subset=['small_area', 'Year', 'Crime']).drop(
        ['total_crime'], axis=1)
    df = merge(df, temp_df, on=['small_area', 'Year', 'Crime'])

    return df
def geocode_dataframe(address_dataframe, address_column):
    df = address_dataframe.copy()
    df['geocode'] = df[address_column].apply(geolocator.geocode)
    # create a geometry column
    df['geometry'] = df['geocode'].apply(
        lambda x: Point(x.longitude, x.latitude))
    gdf = GeoDataFrame(df, geometry='geometry')
    gdf.crs = {'init': 'epsg:4326'}
    return gdf.drop(columns=['geocode'])
def geocode_dataframe(address_dataframe, address_column):
    df = address_dataframe.copy()
    # geocode address column by apply-ing geolocator.geocode
    df['geocode'] = df[address_column].apply(geolocator.geocode)
    # create a geometry column
    df['geometry'] = df['geocode'].apply(
        lambda x: Point(x.longitude, x.latitude))
    # create a geodataframe called gdf from df with crs 'epsg:4326'
    gdf = GeoDataFrame(df, geometry='geometry', crs='epsg:4326')
    # drop the geocode column from the gdf
    gdf = gdf.drop(['geocode'], axis=1)
    return gdf
def surface_dissim(
    data: gpd.GeoDataFrame,
    group_1_pop_var: str,
    group_2_pop_var: str,
    w: Kernel = None,
):
    if not isinstance(data, gpd.GeoDataFrame):
        raise TypeError('data should be a geopandas GeoDataFrame')
    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    data = data.rename(columns={
        group_1_pop_var: 'group_1_pop_var',
        group_2_pop_var: 'group_2_pop_var',
    })

    # Normalize each group's population to sum to one
    sum_1 = data['group_1_pop_var'].sum()
    data['group_1_pop_var_norm'] = data['group_1_pop_var'] / sum_1
    sum_2 = data['group_2_pop_var'].sum()
    data['group_2_pop_var_norm'] = data['group_2_pop_var'] / sum_2

    if not w:
        points = [(p.x, p.y) for p in data.centroid]
        w = Kernel(points)
    w_, _ = w.full()

    # Kernel-smoothed density surface for each group
    density_1 = w_ * data['group_1_pop_var_norm'].values
    density_2 = w_ * data['group_2_pop_var_norm'].values
    densities = np.vstack([density_1.sum(axis=1), density_2.sum(axis=1)])

    # Dissimilarity = 1 - (intersection / union) of the two surfaces
    v_union = densities.max(axis=0).sum()
    v_intersect = densities.min(axis=0).sum()
    s = 1 - v_intersect / v_union

    core_data = data[['group_1_pop_var', 'group_2_pop_var', 'geometry']]
    return s, core_data
def get_all_connections(roads: gpd.GeoDataFrame,
                        points: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Find long roads, split them into smaller parts, and merge the parts
    back with the remaining short roads.

    Parameters
    ----------
    roads : gpd.GeoDataFrame
        Road geometries.
    points : gpd.GeoDataFrame
        Point locations used when identifying and splitting long roads.

    Returns
    -------
    gpd.GeoDataFrame
        All connections, with long roads replaced by their smaller parts.
    """
    long_roads = find_long_roads(roads=roads, points=points)
    smaller_parts = split_long_roads(long_roads, roads=roads, points=points)
    small_roads = roads.drop(axis=0, index=long_roads.keys())
    gdf_long_roads_split = gpd.GeoDataFrame(crs=roads.crs, geometry=smaller_parts)
    gdf_small_roads = gpd.GeoDataFrame(crs=roads.crs, geometry=small_roads.geometry)
    connections = pd.concat([gdf_small_roads, gdf_long_roads_split],
                            axis=0, ignore_index=True, sort=False)
    print('prepared all connections')
    return connections
def fill_hex_grid(gdf: gpd.GeoDataFrame,
                  geom_column: str = "geometry") -> gpd.GeoDataFrame:
    bbox = gdf.total_bounds
    # Pandas somehow mangles Geopandas geometry column types so that the types
    # become mixed after concatenation and may cause TypeErrors, i.e. some
    # Shapely geometries may be cast as strings in the process. We have to
    # concatenate regular dataframes instead and reconstruct a geodataframe
    # from the hex indices afterwards. Utterly stupid.
    # NOTE: the original hardcoded 'geometry' here, ignoring geom_column
    df = gdf.drop(columns=[geom_column])
    bbox_polygon = box(*bbox)
    hex_column = next((col for col in df.columns if col.startswith("hex")), False)
    if not hex_column:
        raise AssertionError("Cannot calculate clusters, hex column not found.")
    resolution = int(hex_column.replace("hex", ""))
    # H3 polyfill needs geojson-like stuff. geo_json_conformant switches
    # coordinate order
    hexes_in_bbox = h3.polyfill(mapping(bbox_polygon), resolution,
                                geo_json_conformant=True)
    # Add only missing hexes here
    missing_hexes = set(hexes_in_bbox).difference(df[hex_column])
    missing_df = pd.DataFrame(
        list(missing_hexes), columns=[hex_column]
    ).set_index(hex_column, drop=False)
    columns_to_add = df.columns.difference(missing_df.columns)
    for column in columns_to_add:
        # Just add zeroes for missing index values
        missing_df.insert(0, column, 0)
    combined_df = pd.concat((df, missing_df))
    # Add centroid geometries and reconstruct the geodataframe
    centroid_lat_lon = [h3.h3_to_geo(hex) for hex in combined_df[hex_column]]
    centroids = [Point(geom[1], geom[0]) for geom in centroid_lat_lon]
    combined_gdf = gpd.GeoDataFrame(combined_df)
    combined_gdf = combined_gdf.set_geometry(centroids)
    return combined_gdf
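
# Usage sketch (hypothetical data; assumes the h3 v3 API used above, plus
# the snippet's shapely/pandas imports): missing cells inside the bounding
# box are appended with zero counts.
import h3
import geopandas as gpd
from shapely.geometry import Point

cells = [h3.geo_to_h3(60.17, 24.94, 8), h3.geo_to_h3(60.18, 24.95, 8)]
hexes = gpd.GeoDataFrame(
    {'hex8': cells, 'count': [5, 3]},
    geometry=[Point(24.94, 60.17), Point(24.95, 60.18)],
    crs="EPSG:4326",
)
filled = fill_hex_grid(hexes)
assert len(filled) >= len(hexes)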
def _remove_overlapping(
        df: geopandas.GeoDataFrame,
        overlapping_threshold: float) -> geopandas.GeoDataFrame:
    rows_to_remove: Set[int] = set()
    org_len = len(df)
    # Reset the index after filtering so the labels from iterrows() match
    # the positional offsets used with iloc below
    df = df[df.geometry.is_valid].reset_index(drop=True)
    print(f"Removed {org_len - len(df)} invalid geometries")
    for idx_1, row_1 in tqdm(df.iterrows(), total=len(df)):
        # Only compare each pair once
        for idx_2, row_2 in df.iloc[idx_1 + 1:].iterrows():
            if row_1.geometry.intersects(row_2.geometry):
                max_intersection_area = overlapping_threshold * min(
                    row_1.geometry.area, row_2.geometry.area)
                if row_1.geometry.intersection(row_2.geometry).area >= max_intersection_area:
                    rows_to_remove.add(idx_1)
                    rows_to_remove.add(idx_2)
    cleaned_df = df.drop(rows_to_remove)
    print(f"New df has len {len(cleaned_df)}, from {len(df)}")
    return cleaned_df
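
# Usage sketch (hypothetical data; assumes the snippet's tqdm import is in
# scope): two squares overlapping by ~90% are both removed; the far one
# survives.
from shapely.geometry import Polygon
import geopandas

squares = geopandas.GeoDataFrame(geometry=[
    Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
    Polygon([(0.1, 0), (1.1, 0), (1.1, 1), (0.1, 1)]),
    Polygon([(5, 5), (6, 5), (6, 6), (5, 6)]),
])
cleaned = _remove_overlapping(squares, overlapping_threshold=0.5)
assert len(cleaned) == 1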
def assert_geodataframe_equal(left, right, check_dtype=True,
                              check_index_type='equiv',
                              check_column_type='equiv',
                              check_frame_type=True,
                              check_like=False,
                              check_less_precise=False,
                              check_geom_type=False,
                              check_crs=True):
    """
    Check that two GeoDataFrames are equal.

    Parameters
    ----------
    left, right : two GeoDataFrames
    check_dtype : bool, default True
        Whether to check the DataFrame dtype is identical.
    check_index_type, check_column_type : bool, default 'equiv'
        Check that index types are equal.
    check_frame_type : bool, default True
        Check that both are same type (*and* are GeoDataFrames). If False,
        will attempt to convert both into GeoDataFrame.
    check_like : bool, default False
        If True, ignore the order of rows & columns.
    check_less_precise : bool, default False
        If True, use geom_almost_equals. If False, use geom_equals.
    check_geom_type : bool, default False
        If True, check that all the geom types are equal.
    check_crs : bool, default True
        If `check_frame_type` is True, then also check that the crs matches.
    """
    try:
        # added from pandas 0.20
        from pandas.testing import assert_frame_equal, assert_index_equal
    except ImportError:
        from pandas.util.testing import assert_frame_equal, assert_index_equal

    # instance validation
    if check_frame_type:
        assert isinstance(left, GeoDataFrame)
        assert isinstance(left, type(right))
        if check_crs:
            # no crs can be either None or {}
            if not left.crs and not right.crs:
                pass
            else:
                assert left.crs == right.crs
    else:
        if not isinstance(left, GeoDataFrame):
            left = GeoDataFrame(left)
        if not isinstance(right, GeoDataFrame):
            right = GeoDataFrame(right)

    # shape comparison
    assert left.shape == right.shape, (
        'GeoDataFrame shape mismatch, left: {lshape!r}, right: {rshape!r}.\n'
        'Left columns: {lcols!r}, right columns: {rcols!r}'.format(
            lshape=left.shape, rshape=right.shape,
            lcols=left.columns, rcols=right.columns))

    if check_like:
        left, right = left.reindex_like(right), right

    # column comparison
    assert_index_equal(left.columns, right.columns,
                       exact=check_column_type, obj='GeoDataFrame.columns')

    # geometry comparison
    assert_geoseries_equal(
        left.geometry, right.geometry, check_dtype=check_dtype,
        check_less_precise=check_less_precise,
        check_geom_type=check_geom_type, check_crs=False)

    # drop geometries and check remaining columns
    left2 = left.drop([left._geometry_column_name], axis=1)
    right2 = right.drop([right._geometry_column_name], axis=1)
    assert_frame_equal(left2, right2, check_dtype=check_dtype,
                       check_index_type=check_index_type,
                       check_column_type=check_column_type,
                       obj='GeoDataFrame')
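
# Usage sketch (hypothetical data; the companion assert_geoseries_equal from
# the same testing module must be in scope, since the function calls it):
from shapely.geometry import Point

gdf_a = GeoDataFrame({'a': [1, 2]}, geometry=[Point(0, 0), Point(1, 1)])
assert_geodataframe_equal(gdf_a, gdf_a.copy())
# column order is ignored with check_like=True
assert_geodataframe_equal(gdf_a[['geometry', 'a']], gdf_a, check_like=True)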
class TestSpatialJoinNYBB(unittest.TestCase):

    def setUp(self):
        nybb_filename, nybb_zip_path = download_nybb()
        self.polydf = read_file(nybb_zip_path, vfs='zip://' + nybb_filename)
        self.tempdir = tempfile.mkdtemp()
        self.crs = self.polydf.crs
        N = 20
        b = [int(x) for x in self.polydf.total_bounds]
        self.pointdf = GeoDataFrame(
            [{'geometry': Point(x, y),
              'pointattr1': x + y, 'pointattr2': x - y}
             for x, y in zip(range(b[0], b[2], int((b[2] - b[0]) / N)),
                             range(b[1], b[3], int((b[3] - b[1]) / N)))],
            crs=self.crs)

    def tearDown(self):
        shutil.rmtree(self.tempdir)

    def test_geometry_name(self):
        # test sjoin is working with other geometry name
        polydf_original_geom_name = self.polydf.geometry.name
        self.polydf = (self.polydf.rename(columns={'geometry': 'new_geom'})
                                  .set_geometry('new_geom'))
        self.assertNotEqual(polydf_original_geom_name,
                            self.polydf.geometry.name)
        res = sjoin(self.polydf, self.pointdf, how="left")
        self.assertEqual(self.polydf.geometry.name, res.geometry.name)

    def test_sjoin_left(self):
        df = sjoin(self.pointdf, self.polydf, how='left')
        self.assertEqual(df.shape, (21, 8))
        for i, row in df.iterrows():
            self.assertEqual(row.geometry.type, 'Point')
        self.assertTrue('pointattr1' in df.columns)
        self.assertTrue('BoroCode' in df.columns)

    def test_sjoin_right(self):
        # the inverse of left
        df = sjoin(self.pointdf, self.polydf, how="right")
        df2 = sjoin(self.polydf, self.pointdf, how="left")
        self.assertEqual(df.shape, (12, 8))
        self.assertEqual(df.shape, df2.shape)
        for i, row in df.iterrows():
            self.assertEqual(row.geometry.type, 'MultiPolygon')
        for i, row in df2.iterrows():
            self.assertEqual(row.geometry.type, 'MultiPolygon')

    def test_sjoin_inner(self):
        df = sjoin(self.pointdf, self.polydf, how="inner")
        self.assertEqual(df.shape, (11, 8))

    def test_sjoin_op(self):
        # points within polygons
        df = sjoin(self.pointdf, self.polydf, how="left", op="within")
        self.assertEqual(df.shape, (21, 8))
        self.assertEqual(df.loc[1]['BoroName'], 'Staten Island')

        # points contain polygons? never happens so we should have nulls
        df = sjoin(self.pointdf, self.polydf, how="left", op="contains")
        self.assertEqual(df.shape, (21, 8))
        self.assertTrue(np.isnan(df.loc[1]['Shape_Area']))

    def test_sjoin_bad_op(self):
        # AttributeError: 'Point' object has no attribute 'spandex'
        self.assertRaises(ValueError, sjoin,
                          self.pointdf, self.polydf, how="left", op="spandex")

    def test_sjoin_duplicate_column_name(self):
        pointdf2 = self.pointdf.rename(columns={'pointattr1': 'Shape_Area'})
        df = sjoin(pointdf2, self.polydf, how="left")
        self.assertTrue('Shape_Area_left' in df.columns)
        self.assertTrue('Shape_Area_right' in df.columns)

    def test_sjoin_values(self):
        # GH190
        self.polydf.index = [1, 3, 4, 5, 6]
        df = sjoin(self.pointdf, self.polydf, how='left')
        self.assertEqual(df.shape, (21, 8))
        df = sjoin(self.polydf, self.pointdf, how='left')
        self.assertEqual(df.shape, (12, 8))

    @unittest.skipIf(str(pd.__version__) < LooseVersion('0.19'),
                     pandas_0_18_problem)
    @pytest.mark.xfail
    def test_no_overlapping_geometry(self):
        # Note: these tests are for correctly returning GeoDataFrame
        # when result of the join is empty
        df_inner = sjoin(self.pointdf.iloc[17:], self.polydf, how='inner')
        df_left = sjoin(self.pointdf.iloc[17:], self.polydf, how='left')
        df_right = sjoin(self.pointdf.iloc[17:], self.polydf, how='right')

        # Recent Pandas development has introduced a new way of handling
        # merges; this change has altered the output when there are no
        # overlapping geometries
        if str(pd.__version__) > LooseVersion('0.18.1'):
            right_idxs = pd.Series(range(0, 5), name='index_right',
                                   dtype='int64')
        else:
            right_idxs = pd.Series(name='index_right', dtype='int64')

        expected_inner_df = pd.concat(
            [self.pointdf.iloc[:0],
             pd.Series(name='index_right', dtype='int64'),
             self.polydf.drop('geometry', axis=1).iloc[:0]],
            axis=1)
        expected_inner = GeoDataFrame(
            expected_inner_df, crs={'init': 'epsg:4326', 'no_defs': True})

        expected_right_df = pd.concat(
            [self.pointdf.drop('geometry', axis=1).iloc[:0],
             pd.concat([pd.Series(name='index_left', dtype='int64'),
                        right_idxs], axis=1),
             self.polydf],
            axis=1)
        expected_right = GeoDataFrame(
            expected_right_df, crs={'init': 'epsg:4326', 'no_defs': True})\
            .set_index('index_right')

        expected_left_df = pd.concat(
            [self.pointdf.iloc[17:],
             pd.Series(name='index_right', dtype='int64'),
             self.polydf.iloc[:0].drop('geometry', axis=1)],
            axis=1)
        expected_left = GeoDataFrame(
            expected_left_df, crs={'init': 'epsg:4326', 'no_defs': True})

        self.assertTrue(expected_inner.equals(df_inner))
        self.assertTrue(expected_right.equals(df_right))
        self.assertTrue(expected_left.equals(df_left))

    @unittest.skip("Not implemented")
    def test_sjoin_outer(self):
        df = sjoin(self.pointdf, self.polydf, how="outer")
        self.assertEqual(df.shape, (21, 8))
class TestSpatialJoinNYBB:

    def setup_method(self):
        nybb_filename = geopandas.datasets.get_path('nybb')
        self.polydf = read_file(nybb_filename)
        self.crs = self.polydf.crs
        N = 20
        b = [int(x) for x in self.polydf.total_bounds]
        self.pointdf = GeoDataFrame(
            [{'geometry': Point(x, y),
              'pointattr1': x + y, 'pointattr2': x - y}
             for x, y in zip(range(b[0], b[2], int((b[2] - b[0]) / N)),
                             range(b[1], b[3], int((b[3] - b[1]) / N)))],
            crs=self.crs)

    def test_geometry_name(self):
        # test sjoin is working with other geometry name
        polydf_original_geom_name = self.polydf.geometry.name
        self.polydf = (self.polydf.rename(columns={'geometry': 'new_geom'})
                                  .set_geometry('new_geom'))
        assert polydf_original_geom_name != self.polydf.geometry.name
        res = sjoin(self.polydf, self.pointdf, how="left")
        assert self.polydf.geometry.name == res.geometry.name

    def test_sjoin_left(self):
        df = sjoin(self.pointdf, self.polydf, how='left')
        assert df.shape == (21, 8)
        for i, row in df.iterrows():
            assert row.geometry.type == 'Point'
        assert 'pointattr1' in df.columns
        assert 'BoroCode' in df.columns

    def test_sjoin_right(self):
        # the inverse of left
        df = sjoin(self.pointdf, self.polydf, how="right")
        df2 = sjoin(self.polydf, self.pointdf, how="left")
        assert df.shape == (12, 8)
        assert df.shape == df2.shape
        for i, row in df.iterrows():
            assert row.geometry.type == 'MultiPolygon'
        for i, row in df2.iterrows():
            assert row.geometry.type == 'MultiPolygon'

    def test_sjoin_inner(self):
        df = sjoin(self.pointdf, self.polydf, how="inner")
        assert df.shape == (11, 8)

    def test_sjoin_op(self):
        # points within polygons
        df = sjoin(self.pointdf, self.polydf, how="left", op="within")
        assert df.shape == (21, 8)
        assert df.loc[1]['BoroName'] == 'Staten Island'

        # points contain polygons? never happens so we should have nulls
        df = sjoin(self.pointdf, self.polydf, how="left", op="contains")
        assert df.shape == (21, 8)
        assert np.isnan(df.loc[1]['Shape_Area'])

    def test_sjoin_bad_op(self):
        # AttributeError: 'Point' object has no attribute 'spandex'
        with pytest.raises(ValueError):
            sjoin(self.pointdf, self.polydf, how="left", op="spandex")

    def test_sjoin_duplicate_column_name(self):
        pointdf2 = self.pointdf.rename(columns={'pointattr1': 'Shape_Area'})
        df = sjoin(pointdf2, self.polydf, how="left")
        assert 'Shape_Area_left' in df.columns
        assert 'Shape_Area_right' in df.columns

    @pytest.mark.parametrize('how', ['left', 'right', 'inner'])
    def test_sjoin_named_index(self, how):
        # original index names should be unchanged
        pointdf2 = self.pointdf.copy()
        pointdf2.index.name = 'pointid'
        df = sjoin(pointdf2, self.polydf, how=how)
        assert pointdf2.index.name == 'pointid'
        assert self.polydf.index.name is None

    def test_sjoin_values(self):
        # GH190
        self.polydf.index = [1, 3, 4, 5, 6]
        df = sjoin(self.pointdf, self.polydf, how='left')
        assert df.shape == (21, 8)
        df = sjoin(self.polydf, self.pointdf, how='left')
        assert df.shape == (12, 8)

    @pytest.mark.skipif(str(pd.__version__) < LooseVersion('0.19'),
                        reason=pandas_0_18_problem)
    @pytest.mark.xfail
    def test_no_overlapping_geometry(self):
        # Note: these tests are for correctly returning GeoDataFrame
        # when result of the join is empty
        df_inner = sjoin(self.pointdf.iloc[17:], self.polydf, how='inner')
        df_left = sjoin(self.pointdf.iloc[17:], self.polydf, how='left')
        df_right = sjoin(self.pointdf.iloc[17:], self.polydf, how='right')

        # Recent Pandas development has introduced a new way of handling
        # merges; this change has altered the output when there are no
        # overlapping geometries
        if str(pd.__version__) > LooseVersion('0.18.1'):
            right_idxs = pd.Series(range(0, 5), name='index_right',
                                   dtype='int64')
        else:
            right_idxs = pd.Series(name='index_right', dtype='int64')

        expected_inner_df = pd.concat(
            [self.pointdf.iloc[:0],
             pd.Series(name='index_right', dtype='int64'),
             self.polydf.drop('geometry', axis=1).iloc[:0]],
            axis=1)
        expected_inner = GeoDataFrame(
            expected_inner_df, crs={'init': 'epsg:4326', 'no_defs': True})

        expected_right_df = pd.concat(
            [self.pointdf.drop('geometry', axis=1).iloc[:0],
             pd.concat([pd.Series(name='index_left', dtype='int64'),
                        right_idxs], axis=1),
             self.polydf],
            axis=1)
        expected_right = GeoDataFrame(
            expected_right_df, crs={'init': 'epsg:4326', 'no_defs': True})\
            .set_index('index_right')

        expected_left_df = pd.concat(
            [self.pointdf.iloc[17:],
             pd.Series(name='index_right', dtype='int64'),
             self.polydf.iloc[:0].drop('geometry', axis=1)],
            axis=1)
        expected_left = GeoDataFrame(
            expected_left_df, crs={'init': 'epsg:4326', 'no_defs': True})

        assert expected_inner.equals(df_inner)
        assert expected_right.equals(df_right)
        assert expected_left.equals(df_left)

    @pytest.mark.skip("Not implemented")
    def test_sjoin_outer(self):
        df = sjoin(self.pointdf, self.polydf, how="outer")
        assert df.shape == (21, 8)