def calculate_sinuosity(geometries):
    """Calculate sinuosity of the line.

    This is the length of the line divided by the distance between the
    endpoints of the line.  By definition, it is always >=1.

    Lines whose endpoints coincide (straight line distance of 0) are assigned
    a sinuosity of 1.

    Parameters
    ----------
    geometries : Series or ndarray of pygeos geometries

    Returns
    -------
    Series or ndarray
        sinuosity values (float32); a Series indexed like the input when the
        input is a Series.
    """

    first = pg.get_point(geometries, 0)
    last = pg.get_point(geometries, -1)
    straight_line_distance = pg.distance(first, last)

    # default every record to 1, the lower bound of sinuosity
    sinuosity = np.ones((len(geometries),)).astype("float32")

    # if there is no straight line distance there can be no sinuosity
    ix = straight_line_distance > 0

    # Both numerator and denominator must be restricted to the same records;
    # otherwise the shapes differ whenever any record has zero distance.
    # by definition, all values must be at least 1, so clip lower bound
    sinuosity[ix] = (pg.length(geometries[ix]) / straight_line_distance[ix]).clip(1)

    if isinstance(geometries, pd.Series):
        return pd.Series(sinuosity, index=geometries.index)

    return sinuosity
def cut_line_at_points(line, cut_points, tolerance=1e-6):
    """Split a pygeos LineString at the projected locations of points.

    When none of the points project onto the interior of the line, the input
    line is handed back unchanged.

    Parameters
    ----------
    line : pygeos Linestring
    cut_points : list-like of pygeos Points
        each point is projected onto the line; only projections that fall in
        the interior of the line are used as cut locations.
    tolerance : float, optional (default: 1e-6)
        projections closer than this to either endpoint are not treated as
        interior.

    Returns
    -------
    MultiLineStrings (or LineString, if unchanged)
    """
    if pg.get_type_id(line) != 1:
        raise ValueError("line is not a single linestring")

    # distance along the line of every original vertex
    vertex_pts = pg.get_point(line, range(pg.get_num_points(line)))
    vertex_offsets = pg.line_locate_point(line, vertex_pts)
    end_offset = vertex_offsets[-1]

    # distance along the line of every cut point; discard those at or beyond
    # the endpoints (within tolerance)
    split_offsets = pg.line_locate_point(line, cut_points)
    is_interior = (split_offsets > tolerance) & (split_offsets < end_offset - tolerance)
    split_offsets = split_offsets[is_interior]

    if len(split_offsets) == 0:
        # nothing to cut, return original
        return line

    # order the cuts along the line and terminate with the end of the line
    split_offsets.sort()
    split_offsets = np.append(split_offsets, end_offset)

    # TODO: convert this to a pygeos ufunc
    coords = pg.get_coordinates(line)
    split_coords = pg.get_coordinates(pg.line_interpolate_point(line, split_offsets))

    pieces = []
    vertex_ix = 0
    previous_cut = None
    for cut_offset, cut_coord in zip(split_offsets, split_coords):
        # every piece after the first starts at the previous cut location
        piece = [] if previous_cut is None else [previous_cut]

        # consume original vertices that lie before this cut
        while vertex_offsets[vertex_ix] < cut_offset:
            piece.append(coords[vertex_ix])
            vertex_ix += 1

        piece.append(cut_coord)
        pieces.append(pg.linestrings(piece))
        previous_cut = cut_coord

    return pg.multilinestrings(pieces)
) # Now can just reduce dams back to these lineIDs dams = ( dams[["damID", "geometry"]] .join(downstreams, on="damID", how="inner") .drop_duplicates(subset=["damID", "lineID"]) .join(flowlines.geometry.rename("flowline"), on="lineID",) .reset_index(drop=True) ) print(f"Found {len(dams):,} joins between NHD dams and flowlines") ### Extract representative point # Look at either end of overlapping line and use that as representative point. # Otherwise intersect and extract first coordinate of overlapping line last_pt = pg.get_point(dams.flowline.values.data, -1) ix = pg.intersects(dams.geometry.values.data, last_pt) dams.loc[ix, "pt"] = last_pt[ix] # override with upstream most point when both intersect first_pt = pg.get_point(dams.flowline.values.data, 0) ix = pg.intersects(dams.geometry.values.data, first_pt) dams.loc[ix, "pt"] = first_pt[ix] ix = dams.pt.isnull() # WARNING: this might fail for odd intersection geoms; we always take the first line # below pt = pd.Series( pg.get_point( pg.get_geometry( pg.intersection(
def find_dam_face_from_waterbody(waterbody, drain_pt):
    """Extract candidate dam-face segments of a waterbody boundary near a drain point.

    Takes the vertices of the waterbody's (normalized) exterior ring on either
    side of the vertex nearest to the drain point, reduces them via
    `simplify_vw` and `extract_straight_segments`, and builds a linestring
    from the original coordinates for each retained run.  Runs are kept only
    if they pass the length thresholds (MIN_DAM_WIDTH, MAX_WIDTH_RATIO),
    intersect a buffer of MAX_DRAIN_DIST around the drain point, and the
    drain point projects at least MIN_INTERIOR_DIST from either end.

    Parameters
    ----------
    waterbody : pygeos geometry
        presumably a single Polygon (only its exterior ring is used) --
        TODO confirm against callers
    drain_pt : pygeos Point

    Returns
    -------
    ndarray of pygeos LineStrings
        may be empty
    """
    total_area = pg.area(waterbody)
    ring = pg.get_exterior_ring(pg.normalize(waterbody))
    total_length = pg.length(ring)
    num_pts = pg.get_num_points(ring) - 1  # drop closing coordinate
    vertices = pg.get_point(ring, range(num_pts))

    ### Extract line segments that are no more than 1/3 coordinates of polygon
    # starting from the vertex nearest the drain
    # note: lower numbers are to the right
    tree = pg.STRtree(vertices)
    ix = tree.nearest(drain_pt)[1][0]
    side_width = min(num_pts // 3, MAX_SIDE_PTS)
    left_ix = ix + side_width
    right_ix = ix - side_width

    # extract these as a left-to-right line;
    # the slice is reversed so higher vertex indexes come first
    pts = vertices[max(right_ix, 0):min(num_pts, left_ix)][::-1]
    if left_ix >= num_pts:
        # window runs past the end of the ring: wrap around to the start
        pts = np.append(vertices[0:left_ix - num_pts][::-1], pts)
    if right_ix < 0:
        # window runs before the start of the ring: wrap around to the end
        pts = np.append(pts, vertices[num_pts + right_ix:num_pts][::-1])

    coords = pg.get_coordinates(pts)

    if len(coords) > 2:
        # first run a simplification process to extract the major shape and bends
        # then run the straight line algorithm
        simp_coords, simp_ix = simplify_vw(
            coords, min(MAX_SIMPLIFY_AREA, total_area / 100))
        if len(simp_coords) > 2:
            keep_coords, ix = extract_straight_segments(
                simp_coords, max_angle=MAX_STRAIGHT_ANGLE, loops=5)
            # map positions in the simplified coords back to positions in coords
            keep_ix = simp_ix.take(ix)
        else:
            keep_coords = simp_coords
            keep_ix = simp_ix
    else:
        # too few coordinates to simplify; keep them all
        keep_coords = coords
        keep_ix = np.arange(len(coords))

    ### Calculate the length of each run and drop any that are not sufficiently long
    lengths = segment_length(keep_coords)
    ix = (lengths >= MIN_DAM_WIDTH) & (lengths / total_length < MAX_WIDTH_RATIO)
    # (start, end) positions into coords for each retained run
    pairs = np.dstack([keep_ix[:-1][ix], keep_ix[1:][ix]])[0]

    # since ranges are ragged, we have to do this in a loop instead of vectorized
    segments = []
    for start, end in pairs:
        segments.append(pg.linestrings(coords[start:end + 1]))
    segments = np.array(segments)

    # only keep the segments that are close to the drain
    segments = segments[
        pg.intersects(segments, pg.buffer(drain_pt, MAX_DRAIN_DIST)),
    ]
    if not len(segments):
        return segments

    # only keep those where the drain is interior to the line
    pos = pg.line_locate_point(segments, drain_pt)
    lengths = pg.length(segments)
    ix = (pos >= MIN_INTERIOR_DIST) & (pos <= (lengths - MIN_INTERIOR_DIST))
    return segments[ix]
def test_get_point_non_linestring(geom):
    """get_point is expected to yield a missing value for every index."""
    indices = [0, 2, -1]
    points = pygeos.get_point(geom, indices)
    assert pygeos.is_missing(points).all()
def test_get_point(geom):
    """Check positive / negative indexing and out-of-range indexes of get_point."""
    count = pygeos.get_num_points(geom)
    # 0 and -count are expected to yield equal points;
    # count and -(count + 1) are expected to yield missing values
    indices = [0, -count, count, -(count + 1)]
    points = pygeos.get_point(geom, indices)

    assert pygeos.equals(points[0], points[1]).all()
    assert pygeos.is_missing(points[2:4]).all()
def test_get_point():
    """The point at index 1 of `line_string` is expected to equal POINT (1 0)."""
    expected = pygeos.points(1, 0)
    result = pygeos.get_point(line_string, 1)
    assert pygeos.equals(result, expected)
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """
    Extracts flowlines data from NHDPlusHR data product.

    Extract flowlines from NHDPlusHR data product, joins to VAA table,
    extracts joins between flowlines, repairs joins that NHD marks as
    terminals but that actually connect to other flowlines, and filters out
    coastlines and underground conduits.

    Side effect: when any joins are repaired, the affected NHDPlusIDs are
    written to ``/tmp/<gdb name>_bad_joins.csv``.

    Parameters
    ----------
    gdb_path : str or pathlib.Path
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols: list
        List of extra field names to extract from NHDFlowline layer.
        Never mutated here (it is only concatenated into a new list).

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """
    from pathlib import Path

    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path, layer="NHDFlowline", force_2d=True, columns=[flowline_cols],
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=[VAA_COLS])

    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # some valid joins are marked as terminals (downstream==0) in NHD; we need
    # to backfill the missing join info.
    # To do this, we intersect all terminals back with flowlines dropping any
    # that are themselves terminals.  Then we calculate the distance to the upstream
    # point of the intersected line, and the upstream point of the next segment
    # downstream.  We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()
    # get last point, is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    target = df.loc[~df.index.isin(ix)]  # only search against other flowlines
    tree = pg.STRtree(target.geometry.values.data)
    # search within a tolerance of 0.001, these are very very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            # NOTE: `right` holds positions within `target` (the geometries the
            # tree was built from), so geometries must be taken from `target`
            # rather than from the full `df`.
            "upstream_target": pg.get_point(
                target.geometry.values.data.take(right), 0
            ),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple; possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any nan
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)
    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # this also ignores na
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        # wrap in Path so that a plain string path (as documented) also works
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{Path(gdb_path).stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428).  It doesn't work properly;
    # there are many that go through dams and are thus needed to calculate
    # network connectivity and gain of removing a dam.
    print("Filtering out coastlines...")
    coastline_idx = df.loc[df.FType == 566].index
    df = df.loc[~df.index.isin(coastline_idx)].copy()
    print(f"{len(df):,} features after removing coastlines")

    # remove any joins that have coastlines as upstream
    # these are themselves coastline segments
    join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()

    # set the downstream to 0 for any that join coastlines
    # this will enable us to mark these as downstream terminals in
    # the network analysis later
    join_df["marine"] = join_df.downstream.isin(coastline_idx)
    join_df.loc[join_df.marine, "downstream"] = 0
    join_df.loc[join_df.marine, "type"] = "terminal"

    # drop any duplicates (above operation sets some joins to upstream and downstream of 0)
    join_df = join_df.drop_duplicates(subset=["upstream", "downstream"])

    ### Filter out underground connectors
    ix = df.loc[df.FType == 420].index
    print("Removing {:,} underground conduits".format(len(ix)))
    df = df.loc[~df.index.isin(ix)].copy()
    join_df = remove_joins(
        join_df, ix, downstream_col="downstream", upstream_col="upstream"
    )

    ### Label loops for easier removal later
    # WARNING: loops may be very problematic from a network processing standpoint.
    # Include with caution.
    print("Identifying loops")
    df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull())

    idx = df.loc[df.loop].index
    join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx)

    ### Add calculated fields
    # Set our internal master IDs to the original index of the file we start from
    # Assume that we can always fit into a uint32, which is ~400 million records
    # and probably bigger than anything we could ever read in
    # NOTE(review): df is indexed on NHDPlusID at this point, so this casts
    # NHDPlusID values (uint64) to uint32 -- confirm this is intended.
    df["lineID"] = df.index.values.astype("uint32") + 1
    join_df = (
        join_df.join(df.lineID.rename("upstream_id"), on="upstream")
        .join(df.lineID.rename("downstream_id"), on="downstream")
        .fillna(0)
    )

    # the joins / fillna above can change the dtypes; restore integer types
    for col in ("upstream", "downstream"):
        join_df[col] = join_df[col].astype("uint64")

    for col in ("upstream_id", "downstream_id"):
        join_df[col] = join_df[col].astype("uint32")

    ### Calculate size classes
    print("Calculating size class")
    drainage = df.TotDASqKm
    df.loc[drainage < 10, "sizeclass"] = "1a"
    df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b"
    df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2"
    df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a"
    df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b"
    df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4"
    df.loc[drainage >= 25000, "sizeclass"] = "5"

    # Calculate length and sinuosity
    print("Calculating length and sinuosity")
    df["length"] = df.geometry.length.astype("float32")
    df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32")

    # drop columns not useful for later processing steps
    df = df.drop(columns=["FlowDir", "StreamCalc"])

    # calculate incoming joins (have valid upstream, but not in this HUC4)
    join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in"

    return df, join_df
def remove_marine_flowlines(flowlines, joins, marine):
    """Drop flowlines that begin in, or lie mostly within, marine areas.

    Intended for coastal HUC2s.  Joins whose downstream line is removed, and
    joins whose upstream line ends in a marine area, are flagged in the
    "marine" column of the joins.

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
    marine : GeoDataFrame

    Returns
    -------
    (GeoDataFrame, DataFrame)
        flowlines, joins
    """
    marine_geoms = marine.geometry.values.data

    def drop_lines(lines, line_joins, line_ids):
        # mark any that terminated in those as marine, then remove the lines
        # and their joins
        line_joins.loc[line_joins.downstream_id.isin(line_ids), "marine"] = True
        remaining = lines.loc[~lines.index.isin(line_ids)].copy()
        line_joins = remove_joins(
            line_joins,
            line_ids,
            downstream_col="downstream_id",
            upstream_col="upstream_id",
        )
        return remaining, line_joins

    # Remove those that start in marine areas
    start_pts = pg.get_point(flowlines.geometry.values.data, 0)
    _, line_pos = pg.STRtree(start_pts).query_bulk(
        marine_geoms, predicate="intersects"
    )
    ix = flowlines.index.take(np.unique(line_pos))
    print(f"Removing {len(ix):,} flowlines that originate in marine areas")
    flowlines, joins = drop_lines(flowlines, joins, ix)

    # Mark those that end in marine areas as marine
    end_pts = pg.get_point(flowlines.geometry.values.data, -1)
    marine_pos, line_pos = pg.STRtree(end_pts).query_bulk(
        marine_geoms, predicate="intersects"
    )
    ending_in_marine = flowlines.index.take(np.unique(line_pos))
    joins.loc[joins.upstream_id.isin(ending_in_marine), "marine"] = True

    # For any that end in marine but didn't originate there, check the amount of overlap;
    # any that are >= 90% in marine should get cut
    print("Calculating overlap of remaining lines with marine areas")
    matched_lines = flowlines.iloc[line_pos]
    tmp = pd.DataFrame({
        "lineID": matched_lines.index,
        "geometry": matched_lines.geometry.values.data,
        "marine": marine.iloc[marine_pos].geometry.values.data,
    })
    tmp["overlap"] = pg.intersection(tmp.geometry, tmp.marine)
    tmp["pct_overlap"] = 100 * pg.length(tmp.overlap) / pg.length(tmp.geometry)

    mostly_marine = tmp.pct_overlap >= 90
    ix = tmp.loc[mostly_marine].lineID.unique()
    print(f"Removing {len(ix):,} flowlines that mostly overlap marine areas")

    return drop_lines(flowlines, joins, ix)
def create_drain_points(flowlines, joins, waterbodies, wb_joins):
    """Create drain points from furthest downstream point of flowlines that overlap with waterbodies.

    Also adds drain points for headwaters waterbodies (those with a
    flowlineLength of 0) from the upstream-most point of flowlines that
    intersect them.

    WARNING: If multiple flowlines intersect at the drain point, there will be multiple drain points at the same location

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    wb_joins : DataFrame
        waterbody / flowline joins

    Returns
    -------
    GeoDataFrame
        Drain points dataframe
    """
    start = time()

    wb_atts = waterbodies[["altered", "km2", "flowlineLength"]].copy()

    tmp_flowlines = flowlines[[
        "geometry",
        "FCode",
        "FType",
        "MaxElevSmo",
        "MinElevSmo",
        "Slope",
        "TotDASqKm",
        "StreamOrde",
        "sizeclass",
        "HUC4",
        "loop",
    ]].rename(columns={
        "FCode": "lineFCode",
        "FType": "lineFType"
    })

    ### Find the downstream most point(s) on the flowline for each waterbody
    # This is used for snapping barriers, if possible.
    # Drop any where there is no flowline below the drain point (often pipelines
    # that were removed)
    tmp = wb_joins[["lineID", "wbID"]].set_index("lineID")
    drains = (joins.loc[joins.upstream_id.isin(wb_joins.lineID.unique())
                        & (joins.downstream_id != 0)].join(
                            tmp.wbID.rename("upstream_wbID"),
                            on="upstream_id").join(
                                tmp.wbID.rename("downstream_wbID"),
                                on="downstream_id"))

    # Only keep those that terminate outside the same waterbody as the upstream end
    drains = drains.loc[drains.upstream_wbID != drains.downstream_wbID].copy()

    # Join in stats from waterbodies and geometries from flowlines
    drain_pts = (wb_joins.loc[wb_joins.lineID.isin(
        drains.upstream_id.unique())].join(
            wb_atts,
            on="wbID",
        ).join(
            tmp_flowlines[["geometry", "loop", "TotDASqKm"]],
            on="lineID",
        ).reset_index(drop=True))

    # create a point from the last coordinate, which is the furthest one downstream
    drain_pts.geometry = pg.get_point(drain_pts.geometry.values.data, -1)

    # drop any that are downstream terminals; these are most likely waterbodies
    # that do not have further downstream networks (e.g., flow to ocean)
    ix = joins.loc[joins.upstream_id.isin(drain_pts.lineID)
                   & (joins.downstream_id == 0)].upstream_id
    drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)].copy()

    ### Find all drain points that share the same geometry.
    # These are most likely multiple segments that terminate in same drain point,
    # so we need to assign them their common downstream ID instead so that
    # snapping dams to these works properly later (otherwise snapped to only one of segments)
    drain_pts["hash"] = pd.util.hash_array(
        pg.to_wkb(drain_pts.geometry.values.data))
    s = drain_pts.groupby("hash").size()
    ix = drain_pts.hash.isin(s[s > 1].index)
    if ix.sum():
        print(f"Deduplicating {ix.sum():,} duplicate drain points")
        # find downstream_id for each of these, and deduplicate if there are multiple
        # downstreams, favoring the non-loops
        j = (joins.loc[joins.upstream_id.isin(drain_pts.loc[ix].lineID)
                       & (joins.downstream_id != 0),
                       ["upstream_id", "downstream_id", "loop"],
                       ].sort_values(
                           by=["upstream_id", "loop"],
                           ascending=True).groupby(
                               "upstream_id").first().downstream_id)
        drain_pts = drain_pts.join(j, on="lineID")

        # for those at same location that share the same downstream line, use that line instead
        s = (drain_pts.loc[drain_pts.downstream_id.notnull()].groupby(
            "downstream_id").size())
        ix = drain_pts.downstream_id.isin(s[s > 1].index.astype("uint32"))
        drain_pts.loc[ix, "lineID"] = drain_pts.loc[ix].downstream_id.astype(
            "uint32")

        # update the line properties to match that lineID
        lids = drain_pts.loc[ix].lineID.values
        drain_pts.loc[ix, "flowlineLength"] = flowlines.loc[lids, "length"].values
        drain_pts.loc[ix, "loop"] = flowlines.loc[lids].loop.values
        drain_pts.loc[ix, "TotDASqKm"] = flowlines.loc[lids].TotDASqKm.values

        drain_pts = drain_pts.drop(columns=["downstream_id"])

    # keep the first unique drain point and sort the rest so they are oriented
    # from upstream to downstream
    drain_pts = (drain_pts.drop(columns=["hash"]).groupby(
        ["lineID", "wbID"]).first().sort_values(by="TotDASqKm",
                                                ascending=True).reset_index())

    drain_pts = gp.GeoDataFrame(drain_pts, geometry="geometry", crs=flowlines.crs)

    ### Deduplicate drains by network topology
    # Find the downstream-most drains for waterbodies when there are multiple distinct ones per waterbody.
    # These may result from flowlines that cross in and out of waterbodies multiple
    # times (not valid), or there may be drains on downstream loops
    # (esp. at dams) (valid).
    dups = drain_pts.groupby("wbID").size() > 1
    if dups.sum():
        print(
            f"Found {dups.sum():,} waterbodies with multiple drain points; cleaing up"
        )
        # find all waterbodies that have duplicate drains
        ix = drain_pts.wbID.isin(dups[dups].index)
        wb_ids = drain_pts.loc[ix].wbID.unique()
        # find all corresponding line IDs for these waterbodies
        line_ids = wb_joins.loc[wb_joins.wbID.isin(wb_ids)].lineID.unique()
        lines_per_wb = (drain_pts.loc[drain_pts.wbID.isin(wb_ids)].groupby(
            "wbID").lineID.unique())

        # search within 20 degrees removed from ids; this hopefully
        # picks up any gaps where lines exit waterbodies for a ways then re-enter
        # some floodplain areas have very big loops outside waterbody
        pairs = find_joins(
            joins,
            line_ids,
            downstream_col="downstream_id",
            upstream_col="upstream_id",
            expand=20,
        )[["upstream_id", "downstream_id"]]

        # remove any terminal points
        pairs = pairs.loc[(pairs.upstream_id != 0) & (pairs.downstream_id != 0)]

        # create a directed graph facing DOWNSTREAM
        graph = DirectedGraph(pairs, source="upstream_id", target="downstream_id")

        # find all lines that are upstream of other lines
        # these are "parents" in the directed graph
        upstreams = graph.find_all_parents(lines_per_wb.values)
        ix = pd.Series(upstreams).explode().dropna().unique()
        print(
            f"Dropping {len(ix):,} drains that are upstream of other drains in the same waterbody"
        )
        drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)]

    ### check if drain points are on a loop and very close to the junction
    # of the loop and nonloop (e.g., Hoover Dam, HUC2 == 15)
    drain_pts["snap_to_junction"] = False
    drain_pts["snap_dist"] = 0

    drains_by_wb = drain_pts.groupby("wbID").size()
    multiple_drain_wb = drains_by_wb[drains_by_wb > 1].index

    # limit this to drain points on loops where there are multiple drains per waterbody
    loop_pts = drain_pts.loc[drain_pts.loop
                             & (drain_pts.wbID.isin(multiple_drain_wb))].copy()

    # search within 3 degrees removed from ids; this hopefully
    # picks up any downstream junctions
    pairs = find_joins(
        joins,
        loop_pts.lineID.unique(),
        downstream_col="downstream_id",
        upstream_col="upstream_id",
        expand=3,
    )[["upstream_id", "downstream_id"]]

    # drop endpoints
    pairs = pairs.loc[(pairs.upstream_id != 0)
                      & (pairs.downstream_id != 0)].copy()

    # find all junctions that have > 1 flowline upstream of them
    grouped = pairs.groupby("downstream_id").size()
    downstream_junctions = grouped[grouped > 1].index

    # extract upstream endpoint for each junction line
    downstream_junction_pts = pd.Series(
        pg.get_point(flowlines.loc[downstream_junctions].geometry.values.data, 0),
        index=downstream_junctions,
    )

    # find the nearest junctions within 5m tolerance of drain points on loops
    tree = pg.STRtree(downstream_junction_pts.values.data)
    left, right = tree.nearest_all(loop_pts.geometry.values.data, max_distance=5)

    # make sure they are connected on the network
    g = DirectedGraph(pairs, source="upstream_id", target="downstream_id")
    ix = g.is_reachable(loop_pts.iloc[left].lineID.values,
                        downstream_junction_pts.iloc[right].index)
    left = left[ix]
    right = right[ix]

    if len(left):
        print(
            f"Found {len(left)} drains on loops within 5m upstream of a junction, updating them..."
        )
        # NOTE: these are attributed to the flowline that is DOWNSTREAM of the junction point
        # whereas other drains are attributed to the flowline upstream of themselves
        ix = loop_pts.index.take(left)
        drain_pts.loc[ix, "snap_to_junction"] = True
        drain_pts.loc[ix, "snap_dist"] = pg.distance(
            drain_pts.loc[ix].geometry.values.data,
            downstream_junction_pts.iloc[right].values,
        )
        drain_pts.loc[ix, "lineID"] = downstream_junction_pts.iloc[right].index
        drain_pts.loc[ix, "geometry"] = downstream_junction_pts.iloc[right].values

    ### Extract the drain points of upstream headwaters waterbodies
    # these are flowlines that originate at a waterbody
    wb_geom = waterbodies.loc[waterbodies.flowlineLength == 0].geometry
    wb_geom = pd.Series(wb_geom.values.data, index=wb_geom.index)
    # take only the upstream most point
    tmp_flowline_pts = tmp_flowlines[["geometry", "loop", "TotDASqKm"]].copy()
    tmp_flowline_pts["geometry"] = pg.get_point(flowlines.geometry.values.data, 0)
    fl_pt = pd.Series(tmp_flowline_pts.geometry.values.data,
                      index=tmp_flowline_pts.index)
    headwaters = (sjoin_geometry(
        wb_geom, fl_pt, predicate="intersects").rename("lineID").reset_index())
    headwaters = (headwaters.join(
        wb_atts,
        on="wbID",
    ).join(
        tmp_flowline_pts,
        on="lineID",
    ).reset_index(drop=True))
    headwaters["headwaters"] = True
    headwaters["snap_to_junction"] = False
    headwaters["snap_dist"] = 0

    print(
        f"Found {len(headwaters):,} headwaters waterbodies, adding drain points for these too"
    )

    drain_pts["headwaters"] = False
    # DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
    # is the equivalent operation and is available in all pandas versions.
    drain_pts = pd.concat([drain_pts, headwaters], sort=False,
                          ignore_index=True).reset_index(drop=True)

    # join in line properties
    drain_pts = drain_pts.drop(columns=["loop", "TotDASqKm"]).join(
        tmp_flowlines.drop(columns=["geometry"]), on="lineID")

    # calculate unique index
    huc_id = drain_pts["HUC4"].astype("uint16") * 1000000
    drain_pts["drainID"] = drain_pts.index.values.astype("uint32") + huc_id

    # Convert back to GeoDataFrame; above steps make it into a DataFrame
    drain_pts = gp.GeoDataFrame(drain_pts, geometry="geometry", crs=flowlines.crs)
    drain_pts.wbID = drain_pts.wbID.astype("uint32")
    drain_pts.lineID = drain_pts.lineID.astype("uint32")
    drain_pts.flowlineLength = drain_pts.flowlineLength.astype("float32")

    print("Done extracting {:,} waterbody drain points in {:.2f}s".format(
        len(drain_pts),
        time() - start))

    return drain_pts
# NOTE: downstreams is indexed on id, not dams.index downstreams = (lines_by_dam.apply(find_downstreams).reset_index().explode( "lineID").drop_duplicates().set_index("id").lineID) # Now can just reduce dams back to these lineIDs dams = (dams[["id", "GNIS_Name", "geometry"]].join( downstreams, on="id", how="inner").drop_duplicates(subset=["id", "lineID"]).join( flowlines.geometry.rename("line"), on="lineID").reset_index(drop=True)) print("Found {:,} joins between NHD dams and flowlines".format(len(dams))) ### Extract representative point # Look at either end of overlapping line and use that as representative point. # Otherwise intersect and extract first coordinate of overlapping line first = pg.get_point(dams.line, 0) intersects_first = pg.intersects(dams.geometry, first) ix = intersects_first dams.loc[ix, "pt"] = first.loc[ix] ix = ~intersects_first last = pg.get_point(dams.loc[ix].line, -1) intersects_last = pg.intersects(dams.loc[ix].geometry, last) last = last.loc[intersects_last] dams.loc[last.index, "pt"] = last ix = dams.pt.isnull() # WARNING: this might fail for odd intersection geoms pt = pg.get_point( pg.intersection(dams.loc[ix].geometry, dams.loc[ix].line), 0).dropna() dams.loc[pt.index, "pt"] = pt