def export_duplicate_areas(dups, path):
    """Export duplicate barriers to a geopackage for QA.

    Buffers each barrier by its duplicate tolerance, dissolves the buffers
    per duplicate group, and writes one polygon per group with the member
    ids / SARPIDs flattened into comma-delimited strings.

    Parameters
    ----------
    dups : DataFrame
        contains pygeos geometries in "geometry", "dup_group" to indicate
        group, and "dup_tolerance" buffer distance per record
    path : str or Path
        output path
    """
    # Work on a copy: the original code overwrote the caller's "geometry"
    # column with buffered polygons, silently mutating the input frame.
    dups = dups.copy()

    # buffer each point by its own tolerance so overlapping tolerance
    # areas within a group dissolve into one polygon
    dups["geometry"] = pg.buffer(dups.geometry, dups.dup_tolerance)
    dissolved = dissolve(dups[["geometry", "dup_group"]], by="dup_group")

    groups = (
        dups[["id", "SARPID", "dup_group"]]
        .join(dissolved.geometry, on="dup_group")
        .groupby("dup_group")
        .agg({"geometry": "first", "SARPID": "unique", "id": "unique"})
    )

    # flatten arrays of ids / SARPIDs to comma-delimited strings so they
    # can be stored as plain attribute fields in the geopackage
    groups["id"] = groups.id.apply(lambda x: ", ".join(str(s) for s in x))
    groups["SARPID"] = groups.SARPID.apply(lambda x: ", ".join(str(s) for s in x))

    to_gpkg(groups, path, crs=CRS)
def export_snap_dist_lines(df, original_locations, out_dir, prefix=""):
    """Creates lines from the original coordinate to the snapped coordinate
    to help QA/QC snapping operation.

    Creates geopackages in out_dir:
    - pre_snap_to_post_snap: line between snapped and unsnapped coordinate
    - pre_snap: unsnapped points
    - post_snap: snapped points

    Parameters
    ----------
    df : DataFrame
        contains pygeos geometries in "geometry" column
    original_locations : DataFrame
        contains pygeos geometries in "geometry" column
    out_dir : Path
    prefix : str
        prefix to add to filename
    """
    keep_cols = ["geometry", "Name", "SARPID", "snapped", "snap_dist", "snap_log"]

    # only snapped records have a distinct before / after pair to inspect
    lines = df.loc[df.snapped, keep_cols].join(
        original_locations.geometry.rename("orig_pt")
    )
    lines["new_pt"] = lines.geometry.copy()
    # replace the point geometry with a connector line (post-snap -> pre-snap)
    lines["geometry"] = connect_points(lines.new_pt, lines.orig_pt)

    # connector lines between the snapped and unsnapped locations
    connectors = lines.drop(columns=["new_pt", "orig_pt"]).reset_index(drop=True)
    to_gpkg(connectors, out_dir / "{}pre_snap_to_post_snap".format(prefix), crs=CRS)

    # original (pre-snap) point locations
    pre = (
        lines.drop(columns=["geometry", "new_pt"])
        .rename(columns={"orig_pt": "geometry"})
        .reset_index(drop=True)
    )
    to_gpkg(pre, out_dir / "{}pre_snap".format(prefix), crs=CRS)

    # snapped (post-snap) point locations
    post = (
        lines.drop(columns=["geometry", "orig_pt"])
        .rename(columns={"new_pt": "geometry"})
        .reset_index(drop=True)
    )
    to_gpkg(post, out_dir / "{}post_snap".format(prefix), crs=CRS)
], ).set_index("lineID") df = df.join(flowlines, on="lineID") df["loop"] = df.loop.fillna(False) print(df.groupby("loop").size()) ### All done processing! print("\n--------------\n") df = df.reset_index(drop=True) to_geofeather(df, master_dir / "waterfalls.feather") print("writing GIS for QA/QC") to_gpkg(df, qa_dir / "waterfalls") # to_shp(df, qa_dir / "waterfalls.shp") # Extract out only the snapped ones df = df.loc[df.snapped & ~(df.duplicate | df.dropped | df.excluded)].reset_index( drop=True) df.lineID = df.lineID.astype("uint32") df.NHDPlusID = df.NHDPlusID.astype("uint64") print("Serializing {0} snapped waterfalls".format(len(df))) to_geofeather( df[["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "waterbody"]], snapped_dir / "waterfalls.feather", )
# Combine per-region barrier networks into one frame
df = deserialize_dfs(
    [
        networks_dir / region / "small_barriers/barriers_network.feather"
        for region in REGION_GROUPS
    ]
)

# upstream network IDs that terminate at a small barrier; the remaining
# networks in the per-region files were cut by dams
networkIDs = df.loc[df.kind == "small_barrier"].upNetID.unique()

# NOTE: dicts iterate their keys directly; list(REGION_GROUPS.keys()) was redundant
for region in REGION_GROUPS:
    print("\n----------------\n processing {}".format(region))

    networks = from_geofeather(
        networks_dir / region / "small_barriers" / "network.feather"
    )

    # Extract only the networks associated with small barriers, the rest are dams
    networks = networks.loc[
        networks.networkID.isin(networkIDs), ["networkID", "geometry"]
    ]

    if len(networks) == 0:
        print("No small barriers in this region, skipping")
        continue

    print("Writing to GPKG")
    to_gpkg(
        networks.reset_index(drop=True),
        data_dir / "tiles" / "small_barriers_network{}".format(region),
        index=False,
        name="networks",
        crs=CRS_WKT,
    )
# fix index data type issues waterbodies.index = waterbodies.index.astype("uint32") print("------------------") print("Serializing {:,} flowlines".format(len(flowlines))) flowlines = flowlines.reset_index() to_geofeather(flowlines, out_dir / "flowlines.feather", crs=CRS) serialize_df(joins.reset_index(drop=True), out_dir / "flowline_joins.feather") print("Serializing {:,} waterbodies".format(len(waterbodies))) to_geofeather(waterbodies.reset_index(), out_dir / "waterbodies.feather", crs=CRS) serialize_df(wb_joins.reset_index(drop=True), out_dir / "waterbody_flowline_joins.feather") print("Serializing {:,} drain points".format(len(drains))) to_geofeather(drains, out_dir / "waterbody_drain_points.feather", crs=CRS) # Serialize to GIS files print("Serializing to GIS files") to_gpkg(flowlines.reset_index(), out_dir / "flowlines", crs=CRS) to_gpkg(waterbodies.reset_index(), out_dir / "waterbodies", crs=CRS) to_gpkg(drains, out_dir / "waterbody_drain_points", crs=CRS) print("Region done in {:.2f}s".format(time() - region_start)) print("==============\nAll done in {:.2f}s".format(time() - start))
# join flowline attributes onto the barriers; records that did not snap
# to a flowline get loop=NaN, normalized to False below
df = df.join(flowlines, on="lineID")
df["loop"] = df.loop.fillna(False)
print(df.groupby("loop").size())

print("\n--------------\n")
df = df.reset_index(drop=True)

print("Serializing {:,} small barriers".format(len(df)))
# pass crs so the CRS is recorded with the output, consistent with the
# equivalent dams serialization script
to_geofeather(df, master_dir / "small_barriers.feather", crs=CRS)

print("writing GIS for QA/QC")
to_gpkg(df, qa_dir / "small_barriers", crs=CRS)

# Extract out only the snapped ones that were not dropped / excluded / duplicates
df = df.loc[df.snapped & ~(df.duplicate | df.dropped | df.excluded)].reset_index(
    drop=True
)
df.lineID = df.lineID.astype("uint32")
df.NHDPlusID = df.NHDPlusID.astype("uint64")

print("Serializing {:,} snapped small barriers".format(len(df)))
to_geofeather(
    df[["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "waterbody"]],
    snapped_dir / "small_barriers.feather",
    crs=CRS,
)
# attach flowline attributes; barriers without a flowline match get
# loop=NaN, which is normalized to False
df = df.join(flowlines, on="lineID")
df["loop"] = df.loop.fillna(False)
print(df.groupby("loop").size())

### All done processing!
print("\n--------------\n")
df = df.reset_index(drop=True)

print("Serializing {:,} dams to master file".format(len(df)))
to_geofeather(df, master_dir / "dams.feather", crs=CRS)

print("writing GIS for QA/QC")
to_gpkg(df, qa_dir / "dams", crs=CRS)

# Keep only dams that snapped to the network and were not flagged out
keep = df.snapped & ~(df.duplicate | df.dropped | df.excluded)
df = df.loc[keep].reset_index(drop=True)
df["lineID"] = df.lineID.astype("uint32")
df["NHDPlusID"] = df.NHDPlusID.astype("uint64")

print("Serializing {:,} snapped dams".format(len(df)))
snapped_cols = ["geometry", "id", "HUC2", "lineID", "NHDPlusID", "loop", "waterbody"]
to_geofeather(df[snapped_cols], snapped_dir / "dams.feather", crs=CRS)
    drains.set_index("wbID").geometry, 250
)
# assign the nearest drain's waterbody id to each matched dam
# (continuation of a nearest-neighbor search started above this chunk;
# 250 is presumably the search radius in CRS units — TODO confirm)
dams.loc[nearest_drains.index, "wbID"] = nearest_drains

print(
    "Found {:,} nearest neighbors in {:.2f}s".format(
        len(nearest_drains), time() - nearest_start
    )
)

print(
    "{:,} dams not associated with waterbodies".format(dams.wbID.isnull().sum())
)
# -1 is the sentinel for "no associated waterbody"
dams.wbID = dams.wbID.fillna(-1)

print("Serializing...")
# dams = to_gdf(dams, crs=CRS).reset_index(drop=True)
dams = dams.reset_index()
to_geofeather(dams, out_dir / "nhd_dams_pt.feather", crs=CRS)
to_gpkg(dams, out_dir / "nhd_dams_pt")

# keep only dam polygons that still have at least one associated dam point
nhd_dams = nhd_dams.loc[nhd_dams.index.isin(dams.id.unique())].reset_index()
to_geofeather(nhd_dams, out_dir / "nhd_dams_poly.feather", crs=CRS)
to_gpkg(nhd_dams, out_dir / "nhd_dams_poly")

# TODO: exporting to shapefile segfaults, not sure why.  Issue likely with conversion to shapely geoms

print("==============\nAll done in {:.2f}s".format(time() - start))

#### Old code - tries to use network topology to reduce set of lines per dam:
# counts = downstreams.apply(len)
# ix = dams.loc[dams.id.isin(counts.loc[counts == 1].index)].index
# dams.loc[ix, "newLineID"] = dams.loc[ix].id.apply(lambda id: downstreams.loc[id][0])
# dams.loc[dams.id.isin(ids)].id.apply(lambda id: downstreams.loc[id][0])
# ix = lines_by_dam.loc[counts == 1]
networks = from_geofeather(networks_dir / region / "dams" / "network.feather") # Extract only the networks associated with small barriers, the rest are dams networks = networks.loc[networks.networkID.isin(networkIDs), ["networkID", "geometry"]] if len(networks) == 0: print("No small barriers in this region, skipping") continue print("Writing to GPKG") to_gpkg( networks.reset_index(drop=True), data_dir / "tiles" / "dam_networks{}".format(region), index=False, name="networks", crs=CRS_WKT, ) ### Region 21 is a special case region = "21" print("\n----------------\n processing {}".format(region)) df = from_geofeather(networks_dir / region / "dams" / "barriers.feather") networkIDs = df.upNetID.unique() networks = from_geofeather(networks_dir / region / "dams" / "network.feather") networks = networks.loc[networks.networkID.isin(networkIDs)] print("Writing to GPKG") to_gpkg(