def process_huc4s(src_dir, huc4s):
    merged = None

    for HUC4 in huc4s:
        print("\n\n------------------- Reading {} -------------------".format(HUC4))

        gdb = src_dir / HUC4 / "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)

        df = read_dataframe(gdb, layer="NHDPlusCatchment", columns=["NHDPlusID"])
        print(f"Read {len(df):,} catchments")

        df = df.dropna(subset=["NHDPlusID"])
        print(
            "Kept {:,} catchments after dropping those without NHDPlusID".format(
                len(df)
            )
        )

        df.NHDPlusID = df.NHDPlusID.astype("uint64")
        df = df.to_crs(CRS)

        merged = append(merged, df)

    df = merged

    # add unique ID
    df["catchID"] = df.index.astype("uint32") + 1

    # add string version of NHDPlusID
    df["NHDIDSTR"] = df.NHDPlusID.astype("str")

    return df
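# NOTE: `append` used throughout these scripts is a project helper, not
# pandas.DataFrame.append.  A minimal sketch of the assumed behavior -- a
# None-tolerant concatenation -- for readers following along; the real helper
# may differ (e.g. in index handling):
import pandas as pd


def append(target, df):
    """Concatenate df onto target, tolerating a still-empty (None) accumulator.

    Sketch only, based on how the accumulator pattern is used above.
    """
    if target is None:
        return df.copy()
    return pd.concat([target, df], ignore_index=True, sort=False)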
f"Found {ix.sum():,} dams associated with waterbodies in {time() - join_start:,.2f}s" ) dams["geometry"] = dams.pt.values dams.loc[ix, "geometry"] = dams.loc[ix].drain.values.data dams.loc[ix, "lineID"] = dams.loc[ix].drainLineID.astype("uint32") dams = dams.drop(columns=["drain", "drainLineID", "pt"]).join( flowlines[["loop", "sizeclass"]], on="lineID" ) # drop duplicates dams = dams.reset_index().drop_duplicates( subset=["damPtID", "damID", "lineID", "geometry"] ) merged = append(merged, dams) print("Region done in {:.2f}s".format(time() - region_start)) print("----------------------------------------------") dams = merged.reset_index(drop=True).join( nhd_dams.drop(columns=["geometry"]), on="damID" ) nhd_dams = nhd_dams.loc[nhd_dams.index.isin(dams.damID.unique())].reset_index() print( f"Found {len(nhd_dams):,} NHD dams and {len(dams):,} NHD dam / flowline crossings" )
"Federal_Status", "State_Status", "SGCN_Listing", "Regional_SGCN", ]].rename( columns={ "HUC12_Code": "HUC12", "Species_Name": "SNAME", "Common_Name": "CNAME", "Federal_Status": "federal", "State_Status": "state", "SGCN_Listing": "sgcn", "Regional_SGCN": "regional", }) merged = append(merged, df) df = merged print("Processing species data") # fix data issues for col in df.columns[1:]: df[col] = df[col].fillna("").str.strip() df = df.loc[df.SNAME.notnull() & (df.SNAME != "")].copy() # drop duplicates df = (df.sort_values(by=["HUC12", "SNAME", "CNAME"]).groupby( ["HUC12", "SNAME"]).first().reset_index()) # Update to overcome taxonomic issues
    # some geometries are invalid, filter them out
    df = df.loc[pg.is_geometry(df.geometry.values.data)].copy()
    if not len(df):
        continue

    df = df.to_crs(CRS)

    # Mark structurally altered types: codes ending in x (excavated), d (ditched),
    # r (artificial substrate), or h (diked).
    # Strip any terminal numbers, then take the last character.
    df["modifier"] = df.nwi_code.str.rstrip("123456789").str[-1:]
    df["altered"] = df.modifier.isin(MODIFIERS)

    waterbodies = append(waterbodies, df.loc[df.nwi_type.isin(["Lake", "Pond"])])
    rivers = append(
        rivers,
        df.loc[(df.nwi_type == "Riverine") & (df.altered)].drop(columns=["nwi_type"]),
    )

### Process waterbodies
# only keep those that intersect flowlines
print(f"Extracted {len(waterbodies):,} NWI lakes and ponds")

left, right = tree.query_bulk(
    waterbodies.geometry.values.data, predicate="intersects"
)
waterbodies = waterbodies.iloc[np.unique(left)].reset_index(drop=True)
print(f"Kept {len(waterbodies):,} that intersect flowlines")

# TODO: explode, repair, dissolve, explode, reset index
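# Quick check of the modifier extraction above, using a few hypothetical NWI codes
# (per the comment, MODIFIERS is assumed to cover the x / d / r / h modifiers):
import pandas as pd

codes = pd.Series(["PUBHx", "PUBH", "R2UBHx3", "L1UBHh"])
modifier = codes.str.rstrip("123456789").str[-1:]  # -> ["x", "H", "x", "h"]
altered = modifier.isin({"x", "d", "r", "h"})      # -> [True, False, True, True]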
def process_huc4s(src_dir, out_dir, huc4s):
    merged_flowlines = None
    merged_joins = None
    merged_waterbodies = None
    merged_points = None
    merged_lines = None
    merged_poly = None
    merged_altered_rivers = None
    merged_marine = None

    for huc4 in huc4s:
        print(f"------------------- Reading {huc4} -------------------")

        huc_id = int(huc4) * 1000000

        gdb = src_dir / huc4 / f"NHDPLUS_H_{huc4}_HU4_GDB.gdb"

        ### Read flowlines and joins
        read_start = time()
        flowlines, joins = extract_flowlines(gdb, target_crs=CRS)
        print(
            f"Read {len(flowlines):,} flowlines in {time() - read_start:.2f} seconds"
        )

        flowlines["HUC4"] = huc4
        joins["HUC4"] = huc4

        # Calculate lineIDs to be unique across all HUC2s
        flowlines["lineID"] += huc_id
        # Set updated lineIDs with the HUC4 prefix
        joins.loc[joins.upstream_id != 0, "upstream_id"] += huc_id
        joins.loc[joins.downstream_id != 0, "downstream_id"] += huc_id

        merged_flowlines = append(merged_flowlines, flowlines)
        merged_joins = append(merged_joins, joins)

        ### Read waterbodies
        read_start = time()
        waterbodies = extract_waterbodies(gdb, target_crs=CRS)
        print(
            "Read {:,} waterbodies in {:.2f} seconds".format(
                len(waterbodies), time() - read_start
            )
        )

        waterbodies["HUC4"] = huc4

        # calculate ids to be unique across region
        waterbodies["wbID"] += huc_id

        ### Only retain waterbodies that intersect flowlines
        print("Intersecting waterbodies and flowlines")
        # use waterbodies to query flowlines since there are many more flowlines
        tree = pg.STRtree(flowlines.geometry.values.data)
        left, right = tree.query_bulk(
            waterbodies.geometry.values.data, predicate="intersects"
        )
        waterbodies = waterbodies.iloc[np.unique(left)].copy()
        print(
            "Retained {:,} waterbodies that intersect flowlines".format(
                len(waterbodies)
            )
        )

        merged_waterbodies = append(merged_waterbodies, waterbodies)

        ### Extract barrier points, lines, polygons
        points = extract_barrier_points(gdb, target_crs=CRS)
        points.HUC4 = huc4
        points["id"] += huc_id
        merged_points = append(merged_points, points)

        lines = extract_barrier_lines(gdb, target_crs=CRS)
        lines.HUC4 = huc4
        lines["id"] += huc_id
        merged_lines = append(merged_lines, lines)

        poly = extract_barrier_polygons(gdb, target_crs=CRS)
        poly.HUC4 = huc4
        poly["id"] += huc_id
        merged_poly = append(merged_poly, poly)

        ### Extract altered rivers
        altered_rivers = extract_altered_rivers(gdb, target_crs=CRS)
        altered_rivers.HUC4 = huc4
        altered_rivers["id"] += huc_id
        merged_altered_rivers = append(merged_altered_rivers, altered_rivers)

        ### Extract marine
        marine = extract_marine(gdb, target_crs=CRS)
        marine.HUC4 = huc4
        merged_marine = append(merged_marine, marine)

    print("--------------------")

    flowlines = merged_flowlines.reset_index(drop=True)
    joins = merged_joins.reset_index(drop=True)
    waterbodies = merged_waterbodies.reset_index(drop=True)
    points = merged_points.reset_index(drop=True)
    lines = merged_lines.reset_index(drop=True)
    poly = merged_poly.reset_index(drop=True)
    altered_rivers = merged_altered_rivers.reset_index(drop=True)
    marine = merged_marine.reset_index(drop=True)

    ### Deduplicate waterbodies that are duplicated between adjacent HUC4s
    print(
        "Removing duplicate waterbodies, starting with {:,}".format(len(waterbodies))
    )

    # Calculate a hash of the WKB bytes of the polygon.
    # This correctly catches polygons that are EXACTLY the same.
    # It will miss those that are NEARLY the same.
waterbodies["hash"] = pd.util.hash_array( pg.to_wkb(waterbodies.geometry.values.data)) id_map = (waterbodies.set_index("wbID")[["hash"]].join( waterbodies.groupby("hash").wbID.first(), on="hash").wbID) # extract out where they are not equal; these are the ones to drop waterbodies = (waterbodies.loc[waterbodies.wbID.isin(id_map)].drop( columns=["hash"]).reset_index(drop=True)) print("{:,} waterbodies remain after removing duplicates".format( len(waterbodies))) ### Update the missing upstream_ids at the joins between HUCs. # These are the segments that are immediately DOWNSTREAM of segments that flow into this HUC4 # We set a new UPSTREAM id for them based on the segment that is next upstream huc_in_idx = joins.loc[joins.type == "huc_in"].index cross_huc_joins = joins.loc[huc_in_idx] new_upstreams = (cross_huc_joins.join( joins.set_index("downstream").downstream_id.rename("new_upstream"), on="upstream", ).new_upstream.fillna(0).astype("uint32")) joins.loc[new_upstreams.index, "upstream_id"] = new_upstreams # update new internal joins joins.loc[(joins.type == "huc_in") & (joins.upstream_id != 0), "type"] = "internal" # remove the duplicate downstreams that used to be terminals for their respective HUCs joins = joins.loc[~(joins.upstream.isin(cross_huc_joins.upstream) & (joins.type == "terminal"))] # remove dead ends joins = joins.loc[~((joins.downstream == 0) & (joins.upstream == 0))].reset_index(drop=True) print("\n--------------------") print(f"serializing {len(flowlines):,} flowlines") flowlines.to_feather(out_dir / "flowlines.feather") joins.to_feather(out_dir / "flowline_joins.feather") print(f"serializing {len(waterbodies):,} waterbodies") waterbodies.to_feather(out_dir / "waterbodies.feather") # DEBUG: # write_dataframe(flowlines, out_dir / "flowlines.gpkg") # write_dataframe(waterbodies, out_dir / "waterbodies.gpkg") if len(points): print(f"serializing {len(points):,} NHD barrier points") points.to_feather(out_dir / "nhd_points.feather") # DEBUG: # write_dataframe(points, out_dir / 'nhd_points.gpkg') if len(lines): print(f"serializing {len(lines):,} NHD barrier lines") lines.to_feather(out_dir / "nhd_lines.feather") # DEBUG: # write_dataframe(lines, out_dir / 'nhd_lines.gpkg') if len(poly): print(f"serializing {len(poly):,} NHD barrier polygons") poly.to_feather(out_dir / "nhd_poly.feather") # DEBUG: # write_dataframe(poly, out_dir / 'nhd_poly.gpkg') if len(altered_rivers): print(f"serializing {len(altered_rivers):,} NHD altered rivers") altered_rivers.to_feather(out_dir / "nhd_altered_rivers.feather") # DEBUG: # write_dataframe(altered_rivers, out_dir / "nhd_altered_rivers.gpkg") if len(marine): print(f"serializing {len(marine):,} NHD marine areas") marine.to_feather(out_dir / "nhd_marine.feather")
def get_network_results(df, network_type, barrier_type=None, rank=True):
    """Read network results, calculate derived metric classes, and calculate tiers.

    Only barriers that are not unranked (invasive spp barriers) have tiers calculated.

    Parameters
    ----------
    df : DataFrame
        barriers data; must contain State and unranked
    network_type : {"dams", "small_barriers"}
        network scenario
    barrier_type : {"dams", "small_barriers", "waterfalls"}, optional (default: None)
        if present, used to filter barrier kind from network results
    rank : bool, optional (default: True)
        if True, results will include tiers for the Southeast and state level

    Returns
    -------
    DataFrame
        Contains network metrics and tiers
    """
    barrier_type = barrier_type or network_type

    huc2s = [huc2 for huc2 in df.HUC2.unique() if huc2]

    networks = (
        read_feathers(
            [
                Path("data/networks/clean") / huc2 / f"{network_type}_network.feather"
                for huc2 in huc2s
            ],
            columns=NETWORK_COLUMNS,
        )
        .rename(columns=NETWORK_COLUMN_NAMES)
        .set_index("id")
    )

    # FIXME: temporary fix
    networks.PercentPerennialUnaltered = networks.PercentPerennialUnaltered.fillna(0)

    # select barrier type
    networks = networks.loc[networks.kind == barrier_type[:-1]].drop(columns=["kind"])

    # Convert dtypes to allow missing data when joined to barriers later
    # NOTE: upNetID or downNetID may be 0 if there aren't networks on that side, but
    # we set to int dtype instead of uint to allow -1 for missing data later
    for col in ["upNetID", "downNetID"]:
        networks[col] = networks[col].astype("int")

    for col in ["NumBarriersDownstream"]:
        networks[col] = networks[col].astype("int16")

    for column in ("Landcover", "SizeClasses", "FlowsToOcean"):
        networks[column] = networks[column].astype("int8")

    # sanity check to make sure no duplicate networks
    if networks.groupby(level=0).size().max() > 1:
        raise Exception(
            f"ERROR: multiple networks found for some {barrier_type} networks"
        )

    networks = networks.join(df[df.columns.intersection(["unranked", "State"])])

    # update data types and calculate total fields
    # calculate size classes GAINED instead of total
    # doesn't apply to those that don't have upstream networks
    networks.loc[networks.SizeClasses > 0, "SizeClasses"] -= 1

    # Calculate miles GAINED if barrier is removed
    # this is the lesser of the upstream or free downstream lengths.
    # Non-free miles downstream (downstream waterbodies) are omitted from this analysis.
    networks["GainMiles"] = networks[["TotalUpstreamMiles", "FreeDownstreamMiles"]].min(
        axis=1
    )
    networks["PerennialGainMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].min(axis=1)

    # TotalNetworkMiles is sum of upstream and free downstream miles
    networks["TotalNetworkMiles"] = networks[
        ["TotalUpstreamMiles", "FreeDownstreamMiles"]
    ].sum(axis=1)
    networks["TotalPerennialNetworkMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].sum(axis=1)

    # Round floating point columns to 3 decimals
    for column in [c for c in networks.columns if c.endswith("Miles")]:
        networks[column] = networks[column].round(3).fillna(-1).astype("float32")

    # Calculate network metric classes
    networks["GainMilesClass"] = classify_gainmiles(networks.GainMiles)
    networks["PerennialGainMilesClass"] = classify_gainmiles(
        networks.PerennialGainMiles
    )

    if not rank:
        return networks.drop(columns=["unranked", "State"], errors="ignore")

    # only calculate ranks / tiers for ranked barriers
    # (exclude unranked invasive spp. barriers)
    to_rank = networks.loc[~networks.unranked]

    ### Calculate regional tiers for SARP (Southeast) region
    # NOTE: this is limited to SARP region; other regions are not ranked at regional level
    # TODO: consider deprecating this
    ix = to_rank.State.isin(SARP_STATE_NAMES)
    sarp_tiers = calculate_tiers(to_rank.loc[ix])
    sarp_tiers = sarp_tiers.rename(
        columns={col: f"SE_{col}" for col in sarp_tiers.columns}
    )

    ### Calculate state tiers for each of total and perennial
    state_tiers = None
    for state in to_rank.State.unique():
        state_tiers = append(
            state_tiers,
            calculate_tiers(to_rank.loc[to_rank.State == state]).reset_index(),
        )

    state_tiers = state_tiers.set_index("id").rename(
        columns={col: f"State_{col}" for col in state_tiers.columns}
    )

    networks = networks.join(sarp_tiers).join(state_tiers)
    for col in [col for col in networks.columns if col.endswith("_tier")]:
        networks[col] = networks[col].fillna(-1).astype("int8")

    return networks.drop(columns=["unranked", "State"])
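# Hedged usage sketch: the feather path below is an assumption about the project
# layout, not something defined here; the function only needs a barriers frame
# indexed by id with HUC2, State, and unranked columns.
import geopandas as gp

dams = gp.read_feather("data/barriers/master/dams.feather").set_index("id")
dams = dams.join(get_network_results(dams, network_type="dams", rank=True))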
def nearest(source, target, max_distance, keep_all=False):
    """Find the nearest target geometry for each record in source, if one can
    be found within max_distance.

    Parameters
    ----------
    source : Series
        contains pygeos geometries
    target : Series
        contains target pygeos geometries to search against
    max_distance : number or ndarray
        radius within which to find target geometries.
        If ndarray, must be equal length to source.
    keep_all : bool (default: False)
        If True, will keep all equidistant results

    Returns
    -------
    DataFrame
        indexed by original index of source, has index of target for each
        nearest target geom.  Includes distance
    """
    left_index_name = source.index.name or "index"
    right_index_name = target.index.name or "index_right"

    tree = pg.STRtree(target.values.data)

    if np.isscalar(max_distance):
        (left_ix, right_ix), distance = tree.nearest_all(
            source.values.data, max_distance=max_distance, return_distance=True
        )

        # Note: there may be multiple equidistant or intersected results, so we take the first
        df = pd.DataFrame(
            {
                right_index_name: target.index.take(right_ix),
                "distance": distance,
            },
            index=source.index.take(left_ix),
        )

    else:  # max_distance is an array, one radius per source record
        merged = None
        for d in np.unique(max_distance):
            ix = max_distance == d
            left = source.loc[ix]
            (left_ix, right_ix), distance = tree.nearest_all(
                left.values.data, max_distance=d, return_distance=True
            )

            merged = append(
                merged,
                pd.DataFrame(
                    {
                        left_index_name: left.index.take(left_ix),
                        right_index_name: target.index.take(right_ix),
                        "distance": distance,
                    },
                ),
            )

        df = merged.set_index(left_index_name)

    if keep_all:
        df = df.reset_index().drop_duplicates().set_index(left_index_name)
    else:
        df = df.groupby(level=0).first()

    df.index.name = source.index.name

    return df
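# Hedged usage sketch: assumes geopandas is backed by pygeos, so that
# GeoSeries.values.data yields the ndarray of pygeos geometries the function expects.
import geopandas as gp
from shapely.geometry import Point

barriers = gp.GeoSeries([Point(0, 0), Point(10, 10)])
snap_targets = gp.GeoSeries([Point(0, 1), Point(50, 50)], index=[100, 101])

# nearest target within 5 units of each barrier; barriers with no target in range drop out
matches = nearest(barriers, snap_targets, max_distance=5)
# -> single row: barrier 0 matched to target 100 (column "index_right") at distance 1.0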
merged = None
for huc2 in units.keys():
    print(f"Processing floodplain stats for {huc2}")

    if huc2 == "02":
        filename = region02_gdb_filename
        layer = "Region002_Catchments_Natl_LCStats"
    else:
        filename = gdb_filename
        layer = layers[huc2]

    df = read_dataframe(filename, layer=layer)
    df["HUC2"] = huc2
    df["NHDPlusID"] = df.NHDIDSTR.astype("uint64")

    cols = [c for c in df.columns if c.startswith("VALUE_")]
    natural_cols = [c for c in cols if int(c.split("_")[1]) in NATURAL_TYPES]

    df["floodplain_km2"] = df[cols].sum(axis=1) * 1e-6
    df["nat_floodplain_km2"] = df[natural_cols].sum(axis=1) * 1e-6

    merged = append(
        merged, df[["NHDPlusID", "HUC2", "nat_floodplain_km2", "floodplain_km2"]]
    )

merged.reset_index(drop=True).to_feather(src_dir / "floodplain_stats.feather")

print("Done in {:.2f}s".format(time() - start))
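# Small worked example (made-up numbers) of the VALUE_* aggregation above: the
# landcover areas are summed in square meters and converted to km2 via * 1e-6.
# NATURAL_TYPES below is a hypothetical stand-in for the set of natural class codes.
import pandas as pd

NATURAL_TYPES = {41, 42, 43}  # hypothetical: forest classes only
stats = pd.DataFrame({"VALUE_41": [2_000_000.0], "VALUE_82": [500_000.0]})
cols = [c for c in stats.columns if c.startswith("VALUE_")]
natural_cols = [c for c in cols if int(c.split("_")[1]) in NATURAL_TYPES]
stats["floodplain_km2"] = stats[cols].sum(axis=1) * 1e-6              # 2.5 km2
stats["nat_floodplain_km2"] = stats[natural_cols].sum(axis=1) * 1e-6  # 2.0 km2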