def update_original_database(new_data, new_data_dups):
    '''
    Appends the last 24 hours of data to the full NRT fire database
    '''

    # Read the original databases in feather format
    gdf = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/bd_completo.feather")
    gdf_dups = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/bd_completo_com_duplicatas.feather"
    )

    for datapoints, updates, label in zip([gdf, gdf_dups],
                                          [new_data, new_data_dups],
                                          ["clean", "duplicated"]):

        # Concatenate with the new data
        datapoints = pd.concat((datapoints, updates))

        # Keep only the data from the current year
        year = datetime.datetime.now().year
        datapoints["datetime"] = pd.to_datetime(datapoints.data)
        datapoints = datapoints[datapoints.datetime.dt.year == year]
        datapoints = datapoints.drop("datetime", axis=1)

        # Reindex
        datapoints = datapoints.reset_index(drop=True)

        # Save the output files
        if label == "clean":
            datapoints = sanitize_api_duplicates(datapoints, "bd_completo")
            save_csv(datapoints,
                     f"{PROJECT_ROOT}/output/csvs/tilesets/bd_completo.csv")
            save_feather(
                datapoints,
                f"{PROJECT_ROOT}/output/feathers/tilesets/bd_completo.feather")
            save_geojson(
                datapoints,
                f"{PROJECT_ROOT}/output/jsons/tilesets/bd_completo.json")

            # Keep the deduplicated data in a variable
            gdf = datapoints.copy()

        elif label == "duplicated":
            #save_csv(datapoints, f"{PROJECT_ROOT}/output/csvs/tilesets/bd_completo_com_duplicatas.csv")
            # write to the feathers directory so the read at the top of this
            # function finds it on the next run
            save_feather(
                datapoints,
                f"{PROJECT_ROOT}/output/feathers/tilesets/bd_completo_com_duplicatas.feather"
            )
            #save_geojson(datapoints, f"{PROJECT_ROOT}/output/csvs/tilesets/bd_completo_com_duplicatas.geojson")

            gdf_dups = datapoints.copy()

    return gdf, gdf_dups
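# A minimal sketch of a caller for update_original_database. It assumes the
# 24h feather produced elsewhere in this pipeline; the duplicated-data input
# path is a hypothetical placeholder, not confirmed by the source.
new_data = gpd.read_feather(
    f"{PROJECT_ROOT}/output/feathers/tilesets/24h.feather")
new_data_dups = gpd.read_feather(
    f"{PROJECT_ROOT}/output/feathers/tilesets/24h_com_duplicatas.feather")  # hypothetical path
gdf, gdf_dups = update_original_database(new_data, new_data_dups)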
def main():
    df_24h = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/24h.feather")
    df_7d = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/7d.feather")
    full_db = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/bd_completo.feather")

    # Create files with the static data on indigenous lands and conservation units
    print("> Creating land databases")
    update_land_datasets(df_24h, df_7d, full_db)
def get_slr(self):
    """Extract SLR for any geometries that overlap bounds where SLR is available.

    Returns
    -------
    dict
        {"slr_acres": <acres>, "slr": [<slr_0ft>, <slr_1ft>, ..., <slr_6ft>]}
    """
    slr_bounds = gp.read_feather(slr_bounds_filename).geometry.values.data[0]
    ix = pg.intersects(self.geometry, slr_bounds)

    if not ix.sum():
        # No overlap
        return None

    # only extract SLR where there are overlaps
    slr_results = extract_slr_by_geometry(
        self.shapes[ix], bounds=pg.total_bounds(self.geometry[ix])
    )

    # None only if no shape mask
    if slr_results is None:
        return None

    slr = [slr_results[i] for i in range(7)]

    return {"slr_acres": slr_results["shape_mask"], "slr": slr}
def read_file(fpath):
    gdf = gpd.read_feather(fpath)
    #gdf = gdf[["Cod_setor", "populacao_residente", "geometry"]]
    return gdf
def summarize_by_huc12(units_df):
    print("Calculating overlap with land ownership and protection")
    ownership = gp.read_feather(
        ownership_filename, columns=["geometry", "FEE_ORGTYP", "GAP_STATUS"])

    index_name = units_df.index.name

    df = intersection(units_df, ownership)

    if not len(df):
        return

    df["acres"] = pg.area(df.geometry_right.values.data) * M2_ACRES

    # drop areas that touch but have no overlap
    df = df.loc[df.acres > 0].copy()

    by_owner = (
        df[["FEE_ORGTYP", "acres"]]
        .groupby([index_name, "FEE_ORGTYP"])
        .acres.sum()
        .astype("float32")
        .round()
        .reset_index()
    )

    by_protection = (
        df[["GAP_STATUS", "acres"]]
        .groupby([index_name, "GAP_STATUS"])
        .acres.sum()
        .astype("float32")
        .round()
        .reset_index()
    )

    by_owner.to_feather(ownership_results_filename)
    by_protection.to_feather(protection_results_filename)
def import_df(list_df, list_prop):
    data_dir = file_fct.get_parent_dir(2, 'data')

    list_final_df = []
    for df_name, a_prop in zip(list_df, list_prop):
        source_df = read_db_list(a_prop)
        import_path = os.path.normcase(
            f'{data_dir}/{a_prop}/{source_df.loc[df_name, "sub_dir"]}/{source_df.loc[df_name, "file_name"]}'
        )
        export_format = source_df.loc[df_name, "file_name"].split('.')[-1]

        if source_df.loc[df_name, 'type'] == 'Pandas':
            if export_format == 'csv':
                importing_df = pandas.read_csv(
                    import_path,
                    sep=source_df.loc[df_name, 'sep'],
                    encoding=source_df.loc[df_name, 'encoding'])
            elif export_format == 'json':
                importing_df = pandas.read_json(import_path, orient="table")

        elif source_df.loc[df_name, 'type'] == 'GeoPandas':
            if export_format in ('csv', 'shp'):
                importing_df = gpd.read_file(import_path)
            elif export_format in ('json', 'geojson'):
                # geopandas has no read_json; read_file handles (Geo)JSON
                importing_df = gpd.read_file(import_path)
            elif export_format == 'feather':
                importing_df = gpd.read_feather(import_path)

        list_final_df.append(importing_df)

    return list_final_df
def test_write_read_feather_expand_user():
    gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
    test_file = "~/test_file.feather"
    gdf.to_feather(test_file)
    f_df = geopandas.read_feather(test_file)
    assert_geodataframe_equal(gdf, f_df, check_crs=True)
    # the "~" was expanded on write, so expand it again to locate the file
    os.remove(os.path.expanduser(test_file))
def aerial_buffer(sample_gdf, layer_dir):
    """
    Aggregate data from layers to sample_gdf according to defined radii

    :param sample_gdf:
    :param layer_dir:
    :return:
    """

    # Test if sample_gdf has crs
    assert sample_gdf.crs is not None, \
        'Assign coordinate reference system to sample_gdf'

    # Test if layers and columns exist within GeoPackage
    gdfs = {
        layer: gpd.read_feather(f'{layer_dir}/{layer}.feather')
        for layer, cols in tqdm(network_layers.items())
    }
    for layer, cols in tqdm(network_layers.items()):
        for column in cols:
            assert column in gdfs[layer].columns, \
                f'{column} column not found in {layer} layer'

    for layer, cols in tqdm(network_layers.items()):
        for column in cols:
            right_gdf = gdfs[layer]

            # Test if right_gdf has crs
            assert right_gdf.crs is not None, \
                f'Assign coordinate reference system to {layer}'

            # Test if column type is categorical or numerical
            if is_numeric_dtype(right_gdf[column]):
                for radius in radii:
                    sample_gdf = Analyst(sample_gdf).buffer_join(
                        right_gdf.loc[:, [column, 'geometry']], radius)
            else:
                for category in right_gdf[column].unique():
                    # copy to avoid SettingWithCopyWarning on the new column
                    filtered = right_gdf[right_gdf[column] == category].copy()
                    filtered[category] = 1
                    for radius in radii:
                        sample_gdf = Analyst(sample_gdf).buffer_join(
                            filtered.loc[:, [category, 'geometry']], radius)

    return sample_gdf
def summarize_by_huc12(units_df):
    """Calculate overlap with PARCAs

    Parameters
    ----------
    units_df : GeoDataFrame
        summary units
    """
    print("Calculating overlap with PARCAs")

    parca = gp.read_feather(parca_filename)

    df = intersection(units_df, parca)
    df["acres"] = pg.area(df.geometry_right.values.data) * M2_ACRES

    # drop areas that touch but have no overlap
    df = df.loc[df.acres > 0].copy()

    # aggregate these back up by ID
    by_parca = (
        df[["parca_id", "name", "description", "acres"]]
        .groupby(by=[df.index.get_level_values(0), "parca_id"])
        .agg({"name": "first", "description": "first", "acres": "sum"})
        .reset_index()
        .rename(columns={"level_0": "id"})
    )
    by_parca.acres = by_parca.acres.astype("float32").round()

    by_parca.to_feather(results_filename)
def get_input_area_boundary(input_area):
    """Extract and union polygons associated with input area into a single
    boundary (Multi)Polygon.

    Parameters
    ----------
    input_area : str
        id of input area

    Returns
    -------
    (Multi)Polygon
    """
    # have to make valid or we get errors during union for FL
    values = [
        e["value"]
        for e in INPUT_AREA_VALUES
        if input_area in set(e["id"].split(","))
    ]

    inputs_df = gp.read_feather(bnd_dir / "input_areas.feather")

    bnd = pg.union_all(
        pg.make_valid(
            inputs_df.loc[inputs_df.value.isin(values)].geometry.values.data
        )
    )

    return bnd
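# Hypothetical usage of get_input_area_boundary; the id below is a placeholder,
# since the valid ids in INPUT_AREA_VALUES are not shown in this snippet.
bnd = get_input_area_boundary("fl")  # placeholder id
print(f"Boundary area: {pg.area(bnd):,.0f} m^2")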
def get_parca(self):
    parca = gp.read_feather(parca_filename)

    df = intersection(pd.DataFrame({"geometry": self.geometry}), parca)
    if not len(df):
        return None

    df["acres"] = pg.area(df.geometry_right.values.data) * M2_ACRES
    df = df.loc[df.acres > 0].copy()

    # aggregate these back up by ID
    by_parca = (
        df[["parca_id", "name", "description", "acres"]]
        .groupby(by=[df.index.get_level_values(0), "parca_id"])
        .agg({"name": "first", "description": "first", "acres": "sum"})
        .reset_index()
        .rename(columns={"level_0": "id"})
    )
    by_parca.acres = by_parca.acres.astype("float32").round()

    return {
        "parca": by_parca[["name", "description", "acres"]].to_dict(
            orient="records")
    }
def get_city_count(data, fpath):
    centroids = gpd.read_feather("../output/city_info.feather")
    centroids = centroids[["code_muni", "geometry"]]

    # We need 6-digit IBGE codes
    data.city_ibge_code = data.city_ibge_code.str.extract(r"(\d{6})")

    # Fix for Vitória
    centroids.loc[centroids.code_muni == "320530", "geometry"] = Point(
        [-40.297984, -20.277465])

    centroids = centroids.merge(data,
                                left_on="code_muni",
                                right_on="city_ibge_code")
    centroids = centroids[["geometry", "deaths"]]
    centroids = centroids[centroids.deaths > 0]

    # Round point coordinates
    centroids.geometry = centroids.geometry.apply(
        lambda x: Point([round(coord, 2) for coord in x.coords[0]]))

    centroids.to_file(f"{fpath}/deaths.json", driver='GeoJSON')
def _read_to_geodf(
    self,
    path: Union[str, os.PathLike],
) -> gpd.GeoDataFrame:
    gdf = gpd.read_feather(path)
    return gdf
def read_feather(self):
    """Read the municipal polygons and build a GeoDataFrame.

    Returns:
        GeoDataFrame: GeoDataFrame of municipal polygons
    """
    return gpd.read_feather(str(self.city_features_path.resolve()))
def _read_multipolygon(
    self, path: Union[str, os.PathLike], fix: bool = True
) -> MultiPolygon:
    multipolygon = MultiPolygon(list(gpd.read_feather(path).geometry))
    if fix:
        multipolygon = self._get_valid_multipolygon(multipolygon)
    return multipolygon
def read_gdfs(f_path, files):
    gdfs = {}
    for file in files:
        if file is not None:
            file_type = file.split('.')[-1]
            if file_type == 'feather':
                gdf = gpd.read_feather(f'{f_path}/{file}').to_crs(26910)
            else:
                gdf = gpd.read_file(f'{f_path}/{file}').to_crs(26910)
            gdfs[file] = gdf
    return gdfs
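# Hypothetical usage of read_gdfs; the directory and file names are
# placeholders. Feather and shapefile inputs alike come back reprojected to
# EPSG:26910 (NAD83 / UTM zone 10N), and None entries are skipped.
gdfs = read_gdfs("data", ["parks.feather", "streets.shp", None])
parks = gdfs["parks.feather"]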
def get_counties(self):
    counties = gp.read_feather(county_filename)[[
        "geometry", "FIPS", "state", "county"
    ]]
    df = (
        sjoin(pd.DataFrame({"geometry": self.geometry}), counties)[[
            "FIPS", "state", "county"
        ]]
        .reset_index(drop=True)
        .sort_values(by=["state", "county"])
    )

    if not len(df):
        return None

    return {"counties": df.to_dict(orient="records")}
def main():
    # Read the required data
    points_24h = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/24h.feather")

    # Build the 24h extracts
    inside_ucs = points_24h[~points_24h.cod_uc.isna()]
    inside_tis = points_24h[~points_24h.cod_ti.isna()]

    ti_most_fire_id = find_place_with_most_fire(points_24h, "cod_ti", position=1)
    ti_most_fire = points_24h[points_24h.cod_ti == ti_most_fire_id]

    uc_most_fire_id = find_place_with_most_fire(points_24h, "cod_uc", position=1)
    uc_most_fire = points_24h[points_24h.cod_uc == uc_most_fire_id]

    # Build the 7d extracts
    grid = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/land_info/grid_20km.feather")
    points_7d = gpd.read_feather(
        f"{PROJECT_ROOT}/output/feathers/tilesets/7d.feather")

    grid_most_fire_1_id = find_grid_with_most_fire(grid, time="7d", position=1)
    grid_most_fire_1 = points_7d[points_7d.cod_box == grid_most_fire_1_id]

    grid_most_fire_2_id = find_grid_with_most_fire(grid, time="7d", position=2)
    grid_most_fire_2 = points_7d[points_7d.cod_box == grid_most_fire_2_id]

    grid_most_fire_3_id = find_grid_with_most_fire(grid, time="7d", position=3)
    grid_most_fire_3 = points_7d[points_7d.cod_box == grid_most_fire_3_id]

    # Save the 24h extracts
    inside_tis.to_file(f"{PROJECT_ROOT}/output/jsons/tilesets/24h_tis.json",
                       driver="GeoJSON")
    inside_ucs.to_file(f"{PROJECT_ROOT}/output/jsons/tilesets/24h_ucs.json",
                       driver="GeoJSON")
    uc_most_fire.to_file(
        f"{PROJECT_ROOT}/output/jsons/tilesets/24h_uc_most_fire.json",
        driver="GeoJSON")
    ti_most_fire.to_file(
        f"{PROJECT_ROOT}/output/jsons/tilesets/24h_ti_most_fire.json",
        driver="GeoJSON")

    # Save the 7d extracts
    grid_most_fire_1.to_file(
        f"{PROJECT_ROOT}/output/jsons/tilesets/7d_grid_1.json", driver="GeoJSON")
    grid_most_fire_2.to_file(
        f"{PROJECT_ROOT}/output/jsons/tilesets/7d_grid_2.json", driver="GeoJSON")
    grid_most_fire_3.to_file(
        f"{PROJECT_ROOT}/output/jsons/tilesets/7d_grid_3.json", driver="GeoJSON")
def run_query(point):
    # Gets information from the user input
    point = parse_input(point)

    # Opens the file with the current count of covid-19 deaths
    target = get_covid_count(measure='deaths')
    cities_info = gpd.read_feather("../output/city_info.feather")

    # Gets the parts of the census tracts with the user data that we need to load
    gdf = find_user_area(point, target)

    # Uses a zero-width buffer to fix self-intersecting shapes
    gdf["geometry"] = gdf.geometry.buffer(0)

    # Creates a spatial index to speed up the search
    spatial_index = gdf.sindex

    # Finds the area that we will need to highlight, along with its population
    radius_data = find_radius(point, gdf, spatial_index, target)

    # Finds information about the user's city
    city_data = find_user_city(point, target, cities_info)

    # If the user's city has fewer people than covid deaths,
    # the closest city that would vanish is itself
    if city_data["pop_2019"] <= target:
        neighbor_data = city_data.copy()
    # Else, finds the closest city with population smaller than the total deaths
    else:
        neighbor_data = find_neighboring_city(point, target, cities_info)

    # Selects two random capitals to highlight
    capitals_data = choose_capitals(point, city_data["code_muni"], cities_info)

    output = {
        "radius": radius_data,
        "user_city": city_data,
        "neighboring_city": neighbor_data,
        "capitals_to_highlight": capitals_data,
    }

    return output
def test_feather_compression(compression, tmpdir):
    """Using compression options should not raise errors, and should return
    identical GeoDataFrame.
    """
    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    filename = os.path.join(str(tmpdir), "test.feather")
    df.to_feather(filename, compression=compression)
    pq_df = read_feather(filename)

    assert isinstance(pq_df, GeoDataFrame)
    assert_geodataframe_equal(df, pq_df)
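# The `compression` argument above is presumably supplied by pytest
# parametrization; a sketch of what that could look like. The codec list is
# an assumption: Feather V2 via pyarrow commonly supports these options. In
# the real test module, the decorator would sit directly above the function
# defined above rather than on this stub.
import pytest

@pytest.mark.parametrize("compression", [None, "uncompressed", "lz4", "zstd"])
def test_feather_compression(compression, tmpdir):
    ...  # body as above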
def summarize_by_huc12(units_df):
    """Calculate spatial join with counties

    Parameters
    ----------
    units_df : GeoDataFrame
        summary units
    """
    print("Calculating spatial join with counties")
    counties = gp.read_feather(county_filename)
    df = (
        sjoin(units_df, counties, how="inner")[["FIPS", "state", "county"]]
        .reset_index()
        .round()
    )
    df.to_feather(results_filename)
def get_ownership(self):
    ownership = gp.read_feather(ownership_filename)
    df = intersection(pd.DataFrame({"geometry": self.geometry}), ownership)

    if not len(df):
        return None

    df["acres"] = pg.area(df.geometry_right.values.data) * M2_ACRES
    df = df.loc[df.acres > 0].copy()

    if not len(df):
        return None

    results = dict()

    by_owner = (
        df[["FEE_ORGTYP", "acres"]]
        .groupby(by="FEE_ORGTYP")
        .acres.sum()
        .astype("float32")
        .to_dict()
    )

    # use the native order of OWNERSHIP to drive order of results
    results["ownership"] = [
        {"label": value["label"], "acres": by_owner[key]}
        for key, value in OWNERSHIP.items()
        if key in by_owner
    ]

    by_protection = (
        df[["GAP_STATUS", "acres"]]
        .groupby(by="GAP_STATUS")
        .acres.sum()
        .astype("float32")
        .to_dict()
    )

    # use the native order of PROTECTION to drive order of results
    results["protection"] = [
        {"label": value["label"], "acres": by_protection[key]}
        for key, value in PROTECTION.items()
        if key in by_protection
    ]

    by_area = (
        df[["AREA_NAME", "FEE_OWNER", "acres"]]
        .groupby(by=[df.index.get_level_values(0), "AREA_NAME", "FEE_OWNER"])
        .acres.sum()
        .astype("float32")
        .round()
        .reset_index()
        .rename(columns={
            "level_0": "id",
            "AREA_NAME": "name",
            "FEE_OWNER": "owner"
        })
        .sort_values(by="acres", ascending=False)
    )
    # drop very small areas, these are not helpful
    by_area = by_area.loc[by_area.acres >= 1].copy()

    results["protected_areas"] = by_area.head(25).to_dict(orient="records")
    results["num_protected_areas"] = len(by_area)

    return results
def test_write(tmp_path):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    ddf = dask_geopandas.from_geopandas(df, npartitions=4)

    basedir = tmp_path / "dataset"
    ddf.to_feather(basedir)

    # each partition (4) is written as a feather file
    paths = list(basedir.glob("*.feather"))
    assert len(paths) == 4

    # each individual file is a valid feather file
    result_part0 = geopandas.read_feather(basedir / "part.0.feather")
    result_part0.index.name = None
    assert_geodataframe_equal(result_part0, df.iloc[:45])
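# A possible round-trip check (a sketch, assuming dask_geopandas.read_feather
# can read back the partitioned directory written above).
result = dask_geopandas.read_feather(basedir)
assert result.npartitions == 4
assert_geodataframe_equal(result.compute().reset_index(drop=True), df)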
def summarize_by_aoi(df, analysis_acres, total_acres):
    """Calculate ranks and areas of overlap within Caribbean Priority
    Watersheds.

    Parameters
    ----------
    df : GeoDataFrame
        area of interest
    analysis_acres : float
        area in acres of area of interest less any area outside SE Blueprint
    total_acres : float
        area in acres of area of interest

    Returns
    -------
    dict
        {
            "priorities": [...],
            "legend": [...],
            "analysis_notes": <analysis_notes>
        }
    """
    car_df = gp.read_feather(caribbean_filename, columns=["geometry", "carrank"])
    df = intersection(df, car_df)
    df["acres"] = pg.area(df.geometry_right.values.data) * M2_ACRES

    # aggregate totals by rank
    by_rank = (
        df[["carrank", "acres"]]
        .groupby(by="carrank")
        .acres.sum()
        .astype("float32")
        .reset_index()
        .sort_values(by="carrank")
    )

    priorities = []
    for ix, row in by_rank.iterrows():
        value = get_rank_value(row.carrank)
        value["acres"] = row.acres
        value["percent"] = 100 * row.acres / analysis_acres
        priorities.append(value)

    # Note: input area remainder deliberately omitted, since all
    # areas outside but close to this input are outside SE Blueprint

    return {
        "priorities": priorities,
        "legend": LEGEND,
        "analysis_notes": get_analysis_notes(),
        "analysis_acres": analysis_acres,
        "total_acres": total_acres,
    }
def get_slr(self):
    slr_bounds = gp.read_feather(slr_bounds_filename).geometry.values.data[0]
    ix = pg.intersects(self.geometry, slr_bounds)

    if not ix.sum():
        # No overlap
        return None

    # only extract SLR where there are overlaps
    slr_results = extract_slr_by_geometry(
        self.shapes[ix], bounds=pg.total_bounds(self.geometry[ix])
    )

    # None only if no shape mask
    if slr_results is None:
        return None

    slr = [slr_results[i] for i in range(7)]

    return {"slr_acres": slr_results["shape_mask"], "slr": slr}
def summarize_by_huc12(geometries):
    """Summarize by HUC12

    Parameters
    ----------
    geometries : Series of pygeos geometries, indexed by HUC12 id
    """
    # find the indexes of the geometries that overlap with SLR bounds; these
    # are the only ones that need to be analyzed for SLR impacts
    slr_bounds = gp.read_feather(slr_bounds_filename).geometry
    tree = pg.STRtree(geometries)
    ix = tree.query(slr_bounds.geometry.values.data[0], predicate="intersects")
    geometries = geometries.iloc[ix].copy()

    if not len(geometries):
        return

    results = []
    index = []
    for huc12, geometry in Bar(
        "Calculating SLR counts for HUC12", max=len(geometries)
    ).iter(geometries.iteritems()):
        zone_results = extract_by_geometry(
            [to_dict(geometry)], bounds=pg.total_bounds(geometry)
        )
        if zone_results is None:
            continue

        index.append(huc12)
        results.append(zone_results)

    df = pd.DataFrame(results, index=index)

    # reorder columns
    df = df[["shape_mask"] + list(df.columns.difference(["shape_mask"]))]

    # extract only areas that actually had SLR pixels
    df = df[df[df.columns[1:]].sum(axis=1) > 0]

    df.columns = [str(c) for c in df.columns]
    df = df.reset_index().rename(columns={"index": "id"}).round()
    df.to_feather(results_filename)
def get_results(self):
    sa_bnd = gp.read_feather(boundary_filename)

    # if area of interest does not intersect SA boundary, there will be no results
    if not pg.intersects(self.geometry, sa_bnd.geometry.values.data).max():
        return None

    results = {
        "type": "",
        "acres": pg.area(self.geometry).sum() * M2_ACRES,
        "name": self.name,
    }

    blueprint_results = self.get_blueprint()
    if blueprint_results is None:
        return None

    results.update(blueprint_results)

    urban_results = self.get_urban()
    if urban_results is not None:
        results.update(urban_results)

    slr_results = self.get_slr()
    if slr_results is not None:
        results.update(slr_results)

    ownership_results = self.get_ownership()
    if ownership_results is not None:
        results.update(ownership_results)

    county_results = self.get_counties()
    if county_results is not None:
        results.update(county_results)

    parca_results = self.get_parca()
    if parca_results is not None:
        results.update(parca_results)

    return results
def get_counties(self):
    """Get county and state names that overlap this area.

    Returns
    -------
    dict
        {"counties": [
            {"FIPS": <FIPS>, "state": <state name>, "county": <county_name>},
            ...
        ]}
    """
    counties = gp.read_feather(county_filename)[[
        "geometry", "FIPS", "state", "county"
    ]]
    df = (
        sjoin(self.gdf, counties)[["FIPS", "state", "county"]]
        .reset_index(drop=True)
        .sort_values(by=["state", "county"])
    )

    if not len(df):
        return None

    return {"counties": df.to_dict(orient="records")}
nhd_dams["damID"] = nhd_dams.index.copy() nhd_dams.damID = nhd_dams.damID.astype("uint32") nhd_dams = nhd_dams.set_index("damID") merged = None for huc2 in huc2s: region_start = time() print(f"----- {huc2} ------") dams = nhd_dams.loc[nhd_dams.HUC2 == huc2, ["geometry"]].copy() print("Reading flowlines...") flowlines = gp.read_feather( clean_dir / huc2 / "flowlines.feather", columns=["lineID", "loop", "geometry", "sizeclass"], ).set_index("lineID") joins = pd.read_feather( clean_dir / huc2 / "flowline_joins.feather", columns=["downstream_id", "upstream_id"], ) ### Find all intersection points with flowlines # we do this before looking for adjacent drain points, since there may be # multiple flowlines of different networks associated with a given dam print(f"Joining {len(dams):,} NHD dams to {len(flowlines):,} flowlines") join_start = time() dams = ( pd.DataFrame( sjoin_geometry(
data_dir = Path("data") boundaries_dir = data_dir / "boundaries" nhd_dir = data_dir / "nhd" barriers_dir = data_dir / "barriers" src_dir = barriers_dir / "source" master_dir = barriers_dir / "master" snapped_dir = barriers_dir / "snapped" qa_dir = barriers_dir / "qa" start = time() ### Read in SARP states and merge print("Reading dams in SARP states") df = gp.read_feather(src_dir / "sarp_dams.feather") print(f"Read {len(df):,} dams in region states") ### Read in non-SARP states and join in # these are for states that overlap with HUC4s that overlap with SARP states print( "Reading dams that fall outside region states, but within HUC4s that overlap with region states..." ) outside_df = gp.read_feather(src_dir / "dams_outer_huc4.feather") # drop any that are in the main dataset, since there are several dams at state lines outside_df = outside_df.loc[~outside_df.SARPID.isin(df.SARPID.unique())].copy() print(f"Read {len(outside_df):,} dams outer HUC4s") df = df.append(outside_df, ignore_index=True, sort=False)