def test_distance_nan():
    """Distance is NaN whenever either operand is NaN or None."""
    actual = pygeos.distance(
        np.array([point, np.nan, np.nan, point, None, None, point]),
        np.array([np.nan, point, np.nan, None, point, None, point]),
    )
    # only the last pair holds two real geometries (the same point)
    assert actual[-1] == 0.0
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the
    # same dtype (float64) and works on every NumPy version
    assert np.isnan(actual[:-1].astype(float)).all()
def boundary_distance(polygon, points):
    """
    Find the distance between a polygon's boundary and an
    array of points.

    Uses either `shapely` or `pygeos` (5-10x faster) as a backend.

    Parameters
    -------------
    polygon : shapely.geometry.Polygon
      Polygon to query
    points : (n, 2) float
      2D points

    Returns
    ------------
    distance : (n,) float
      Minimum distance from each point to polygon boundary
    """
    try:
        import pygeos
        # the pygeos way is 5-10x faster
        pg_points = pygeos.points(*points.T)
        pg_boundary = pygeos.boundary(pygeos.Geometry(polygon.wkt))
        distance = pygeos.distance(pg_boundary, pg_points)
    except Exception:
        # Fall back on any pygeos failure (missing package or runtime
        # error), but let KeyboardInterrupt / SystemExit propagate
        # instead of silently restarting the work on the slow path.
        # in pure shapely we have to loop
        inverse = polygon.boundary
        distance = np.array([
            inverse.distance(i) for i in MultiPoint(points)])
    return distance
def calculate_sinuosity(geometries):
    """Calculate sinuosity of the line.

    This is the length of the line divided by the distance between the
    endpoints of the line.  By definition, it is always >=1.

    Lines whose endpoints coincide (zero straight line distance) are
    assigned a sinuosity of 1.

    Parameters
    ----------
    geometries : Series or ndarray of pygeos geometries

    Returns
    -------
    Series or ndarray
        sinuosity values
    """
    first = pg.get_point(geometries, 0)
    last = pg.get_point(geometries, -1)
    straight_line_distance = pg.distance(first, last)

    # start at 1 everywhere; entries with no straight line distance keep
    # this default because the ratio is undefined for them
    sinuosity = np.ones((len(geometries), )).astype("float32")

    # if there is no straight line distance there can be no sinuosity
    ix = straight_line_distance > 0

    # Both operands must be restricted to the same subset: dividing the
    # masked lengths by the unmasked distances fails (shape mismatch)
    # as soon as a single line has zero endpoint distance.
    # by definition, all values must be at least 1, so clip lower bound
    sinuosity[ix] = (
        pg.length(geometries[ix]) / straight_line_distance[ix]
    ).clip(1)

    if isinstance(geometries, pd.Series):
        return pd.Series(sinuosity, index=geometries.index)

    return sinuosity
def naive_compute_distance_similarity_matrix(sorted_detections, ground_truths):
    """Computes a similarity based on euclidean distance between all pairs of geometries in a naive fashion.

    The similarity of a pair is ``1 - distance`` between the centroids of the two geometries.

    Args:
        sorted_detections (ndarray, list) : A ndarray of detections stored as:

            * Bounding boxes for a given class where each row is a detection stored as:
              ``[BoundingBox, confidence]``
            * Polygons for a given class where each row is a detection stored as:
              ``[Polygon, confidence]``
            * Points for a given class where each row is a detection stored as:
              ``[Point, confidence]``

        ground_truths (ndarray,list) : A ndarray of ground truth stored as:

            * Bounding boxes for a given class where each row is a ground truth stored as:
              ``[BoundingBox]``
            * Polygons for a given class where each row is a ground truth stored as:
              ``[Polygon]``
            * Points for a given class where each row is a ground truth stored as:
              ``[Point]``

    Returns:
        ndarray : A similarity matrix (#detections, #ground truth)

    """
    # We prepare the distance matrix (#detection, #gt).
    # ``len`` is used for both inputs so that plain lists are accepted as
    # documented (``.shape`` only exists on ndarrays).
    distance_matrix = np.zeros((len(sorted_detections), len(ground_truths)))

    # Naive iterative distance matrix construction (Note: we iterate over the sorted detections)
    for k, ground_truth in enumerate(ground_truths):
        for m, detection in enumerate(sorted_detections):
            distance_matrix[m, k] = distance(centroid(detection[0]), centroid(ground_truth[0]))

    return 1 - distance_matrix
def time_tree_nearest_all_poly_python(self):
    """Benchmark a hand-rolled "nearest all" between points and polygons.

    Buffers every point, queries the tree with the buffers, then keeps,
    for each point, every candidate whose distance ties with the
    smallest distance found for that point.
    """
    # returns all input points

    # arbitrary search radius that seems appropriate for the density of
    # geometries
    search_radius = 200
    search_areas = pygeos.buffer(self.points, search_radius, quadsegs=1)
    left, right = self.tree.query_bulk(search_areas)
    dist = pygeos.distance(self.points.take(left), self.polygons.take(right))

    # order the pairs by left index, then distance (right index breaks ties)
    order = np.lexsort((right, dist, left))
    left, right, dist = left[order], right[order], dist[order]

    # a new run starts wherever the left index changes
    starts = np.r_[True, left[:-1] != left[1:]]
    run_lengths = np.diff(np.r_[np.nonzero(starts)[0], left.shape[0]])

    # after sorting, the first entry of each run is its minimum; spread it
    # over the whole run so every pair that matches it can be extracted
    keep = dist == np.repeat(dist[starts], run_lengths)
    left, right, dist = left[keep], right[keep], dist[keep]
def _op(self, candidates, x, y):
    """Build a dense ``(len(x), len(y))`` matrix of distances for candidate pairs.

    Args:
        candidates: integer array whose column 0 indexes into ``x`` and
            column 1 indexes into ``y``.
        x: first collection, converted with ``as_points``.
        y: second collection, converted with ``as_points``.

    Returns:
        ndarray: matrix holding the distance for every candidate pair and
        ``inf`` everywhere else.
    """
    point_x = as_points(x)
    point_y = as_points(y)

    # `np.Inf` was removed in NumPy 2.0; `np.inf` is the supported spelling.
    # np.full also avoids the temporary array of ones.
    areas = np.full((len(x), len(y)), np.inf)

    rows, cols = candidates[:, 0], candidates[:, 1]
    areas[rows, cols] = distance(point_x[rows], point_y[cols]).squeeze()
    return areas
def time_tree_nearest_points_equidistant_manual_all(self):
    """Benchmark finding all equidistant nearest neighbours by hand.

    Approximates ``nearest_all`` for equidistant results: start from the
    single nearest neighbour of each grid point, then search for any
    other geometries within that same distance.
    """
    points = self.grid_points
    tree = self.grid_point_tree

    # single nearest neighbour per point, and the distance to it
    nearest_left, nearest_right = tree.nearest(points)
    nearest_dist = pygeos.distance(
        points.take(nearest_left), tree.geometries.take(nearest_right)
    )

    # include a slight epsilon to ensure nearest are within this radius
    search_areas = pygeos.buffer(points, nearest_dist + 1e-8)

    # query the tree for others in the same buffer distance
    left, right = tree.query_bulk(search_areas, predicate="intersects")
    dist = pygeos.distance(points.take(left), tree.geometries.take(right))

    # order the pairs by left index, then distance (right index breaks ties)
    order = np.lexsort((right, dist, left))
    left, right, dist = left[order], right[order], dist[order]

    # a new run starts wherever the left index changes
    starts = np.r_[True, left[:-1] != left[1:]]
    run_lengths = np.diff(np.r_[np.nonzero(starts)[0], left.shape[0]])

    # after sorting, the first entry of each run is its minimum; spread it
    # over the whole run so every pair that matches it can be extracted
    keep = dist == np.repeat(dist[starts], run_lengths)
    left, right, dist = left[keep], right[keep], dist[keep]
def dist_F_vectorized(self, road, i, points_array):
    """Penalty for candidate geometries that fall inside the buffer of ``road``.

    One geometry is built per entry of ``points_array`` (a multiline when
    ``LSDisplacer.DIST == 'MIN'``, otherwise individual lines whose
    distances are averaged along axis 1).  The result is
    ``self.buffers[i] - distance`` where the distance does not exceed
    ``self.buffers[i]``, and 0 elsewhere.
    """
    use_min = LSDisplacer.DIST == 'MIN'
    if use_min:
        build = LSDisplacer._multiline_from_points
    else:
        build = LSDisplacer._lines_from_points

    geoms = np.array([build(c, self.talus_lengths) for c in points_array])
    dists = pygeos.distance(road, geoms)
    if not use_min:
        dists = dists.mean(axis=1)

    return np.where(dists > self.buffers[i], 0., self.buffers[i] - dists)
def nearest_network_node_list(gdf_admin, gdf_nodes, sg):
    """Map every admin area to the id of its nearest network node.

    Only nodes whose ``id`` appears in ``sg.vs['name']`` are considered.

    Args:
        gdf_admin: table iterated with ``itertuples``; each row needs a
            ``name`` and a ``centroid`` attribute.
        gdf_nodes: table of nodes with ``id`` and ``geometry`` columns.
        sg: graph object exposing vertex names as ``sg.vs['name']``.

    Returns:
        dict: admin ``name`` -> ``id`` of the node at minimum distance
        from that admin's centroid.
    """
    # positional and label index coincide after the reset, which is what
    # lets the idxmin() label be used with iloc below
    graph_nodes = gdf_nodes.loc[
        gdf_nodes.id.isin(sg.vs['name'])
    ].reset_index(drop=True)

    return {
        admin_.name: graph_nodes.iloc[
            pygeos.distance(admin_.centroid, graph_nodes.geometry).idxmin()
        ].id
        for admin_ in gdf_admin.itertuples()
    }
def near(source, target, distance):
    """Return target geometries within distance of source geometries.

    Only returns records from source that intersected at least one feature
    in target.

    Parameters
    ----------
    source : Series
        contains pygeos geometries
    target : Series
        contains target pygeos geometries to search against
    distance : number or ndarray
        radius within which to find target geometries.
        If ndarray, must be equal length to source.

    Returns
    -------
    DataFrame
        indexed on original index of source
        includes distance, sorted ascending by it
    """
    # candidate pairs: targets that intersect the buffered source geometries
    buffered = pg.buffer(source, distance)
    idx = sjoin_geometry(buffered, target)

    # attach both geometries to every candidate pair
    hits = pd.DataFrame(idx)
    hits = hits.join(source.rename("geometry"), how="inner")
    hits = hits.join(
        target.rename("geometry_right"), on="index_right", how="inner"
    )

    # the joins change the index if hits is empty, causing downstream problems
    if len(hits) == 0:
        hits.index.name = idx.index.name

    hits["distance"] = pg.distance(
        hits.geometry, hits.geometry_right
    ).astype("float32")

    right_name = target.index.name or "index_right"
    result = hits.drop(columns=["geometry", "geometry_right"])
    result = result.rename(columns={"index_right": right_name})
    return result.sort_values(by="distance")
def _extend_line(coords, target, tolerance, snap=True):
    """
    Extends a line geometry to snap on the target within a tolerance.

    With ``snap=True`` the line is extrapolated by ``tolerance`` and, if
    the extrapolation intersects ``target``, the intersection closest to
    the last coordinate is appended; otherwise ``coords`` is returned
    unchanged.  With ``snap=False`` the extrapolated point is always
    appended.
    """
    # last four values of a flat array, or the last two rows flattened
    if len(coords.shape) == 1:
        tail = coords[-4:]
    else:
        tail = coords[-2:].flatten()

    if not snap:
        extrapolation = _get_extrapolated_line(tail, tolerance, point=True)
        return np.vstack([coords, extrapolation])

    extrapolation = _get_extrapolated_line(tail, tolerance)
    int_idx = target.sindex.query(extrapolation, predicate="intersects")
    intersection = pygeos.intersection(
        target.iloc[int_idx].geometry.values.data, extrapolation
    )

    if intersection.size == 0:
        return coords

    if len(intersection) > 1:
        # several hits: keep the first one at minimal distance from the
        # current end of the line
        dists = [
            pygeos.distance(p, pygeos.points(coords[-1])) for p in intersection
        ]
        closest = min(enumerate(dists), key=operator.itemgetter(1))[0]
    else:
        closest = 0

    new_point_coords = pygeos.get_coordinates(intersection[closest])
    coo = np.append(coords, new_point_coords)
    return np.reshape(coo, (len(coo) // 2, 2))
def time_distance(self):
    """Time one ``pygeos.distance`` call on ``self.points`` and ``self.polygon``."""
    # the result is discarded on purpose: only the call itself is measured
    pygeos.distance(self.points, self.polygon)
def street_profile(streets, buildings, distance=3, tick_length=50):
    """Measure width, width deviation and openness of each street.

    Points are interpolated along every street line, two tick lines are
    generated at each point, and the ticks are intersected with the
    building geometries.  The distance from each point to the
    intersection gives the measurement on that side of the street.

    Parameters
    ----------
    streets : table with a ``geometry`` column of lines
        (``.geometry.values.data`` must yield pygeos geometries)
    buildings : table with a ``geometry`` column
        (``.geometry.values.data`` must yield pygeos geometries)
    distance : number, default 3
        divisor used to derive the number of points per line
        (``length // distance`` points spread evenly over the line)
    tick_length : number, default 50
        length passed to the tick helpers; half of it is also the
        fallback side measurement when a side has no intersections

    Returns
    -------
    tuple of (list, list, list)
        ``(widths, deviations, openness)``, one entry per street
    """
    pygeos_lines = streets.geometry.values.data

    list_points = np.empty((0, 2))
    ids = []

    lengths = pygeos.length(pygeos_lines)
    for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):
        pts = pygeos.line_interpolate_point(
            line, np.linspace(0, length, num=int((length) // distance))
        )  # .1 offset to keep a gap between two segments
        list_points = np.append(list_points, pygeos.get_coordinates(pts), axis=0)
        # two ticks are created per point, hence two ids per point
        ids += [ix] * len(pts) * 2

    ticks = []
    # NOTE: `num` is 1-based, so `list_points[num]` is the point AFTER `pt`.
    # The three branches below are independent `if`s: on the first
    # iteration both the "start" and the "in between" branch run, and the
    # second-to-last iteration adds nothing, so every point still ends up
    # with exactly two ticks, in point order.
    for num, pt in enumerate(list_points, 1):
        # start chainage 0
        if num == 1:
            angle = _getAngle(pt, list_points[num])
            line_end_1 = _getPoint1(pt, angle, tick_length / 2)
            angle = _getAngle(line_end_1, pt)
            line_end_2 = _getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

        # everything in between
        if num < len(list_points) - 1:
            angle = _getAngle(pt, list_points[num])
            line_end_1 = _getPoint1(
                list_points[num], angle, tick_length / 2
            )
            angle = _getAngle(line_end_1, list_points[num])
            line_end_2 = _getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, list_points[num]])
            ticks.append([line_end_2, list_points[num]])

        # end chainage
        if num == len(list_points):
            angle = _getAngle(list_points[num - 2], pt)
            line_end_1 = _getPoint1(pt, angle, tick_length / 2)
            angle = _getAngle(line_end_1, pt)
            line_end_2 = _getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

    ticks = pygeos.linestrings(ticks)
    # inp indexes the buildings, res indexes the ticks
    inp, res = pygeos.STRtree(ticks).query_bulk(buildings.geometry.values.data, predicate='intersects')
    intersections = pygeos.intersection(ticks[res], buildings.geometry.values.data[inp])
    # tick t belongs to point t // 2 (two ticks per point)
    distances = pygeos.distance(intersections, pygeos.points(list_points[res // 2]))

    # NaN marks ticks that hit no building
    dists = np.zeros((len(ticks),))
    dists[:] = np.nan
    # NOTE(review): a tick index can occur several times in `res`; no
    # minimum is taken here, so which distance is kept depends on
    # assignment order - confirm this is intended
    dists[res] = distances

    ids = np.array(ids)
    widths = []
    openness = []
    deviations = []
    for i in range(len(streets)):
        f = ids == i
        s = dists[f]
        # ticks alternate between the two sides of the street
        lefts = s[::2]
        rights = s[1::2]
        # a side without any intersection falls back to half the tick length
        left_mean = np.nanmean(lefts) if ~np.isnan(lefts).all() else tick_length / 2
        right_mean = np.nanmean(rights) if ~np.isnan(rights).all() else tick_length / 2
        widths.append(np.mean([left_mean, right_mean]) * 2)
        # share of this street's ticks that hit nothing
        openness.append(np.isnan(s).sum() / (f).sum())
        deviations.append(np.nanstd(s))

    return (widths, deviations, openness)
def test_distance_missing():
    """Distance between a geometry and a missing value (``None``) is NaN."""
    assert np.isnan(pygeos.distance(point, None))
def test_distance_empty():
    """Distance between ``point`` and the ``empty`` fixture is NaN."""
    assert np.isnan(pygeos.distance(point, empty))
def __init__(self, left, right, heights=None, distance=10, tick_length=50, verbose=True):
    """Compute street profile measurements for every row of ``left``.

    Points are interpolated along each line of ``left``; two tick lines
    are generated per point and intersected with the geometries of
    ``right``.  The nearest intersection per tick gives the distance on
    that side.

    Parameters
    ----------
    left : table of lines with a ``geometry`` column and an ``index``
    right : table with a ``geometry`` column and a spatial index
        (``right.sindex``)
    heights : str, Series, list-like or None
        column name in ``right``, or the height values themselves; when
        None no height-based attributes are computed
    distance : number, default 10
        divisor used to derive the number of points per line
    tick_length : number, default 50
        length passed to the tick helpers; half of it is the fallback
        side measurement when a side has no intersections
    verbose : bool, default True
        not used anywhere in this method

    Attributes set
    --------------
    w : Series of widths
    wd : Series of width deviations (NaN filled with 0)
    o : Series of openness (NaN filled with 1)
    h, hd, p : Series of mean height, height deviation and ``h / w``;
        only set when ``heights`` is given
    """
    self.left = left
    self.right = right
    self.distance = distance
    self.tick_length = tick_length

    pygeos_lines = left.geometry.values.data

    list_points = np.empty((0, 2))
    ids = []
    end_markers = []

    lengths = pygeos.length(pygeos_lines)
    for ix, (line, length) in enumerate(zip(pygeos_lines, lengths)):
        pts = pygeos.line_interpolate_point(
            line, np.linspace(0, length, num=int((length) // distance)))
        list_points = np.append(list_points,
                                pygeos.get_coordinates(pts),
                                axis=0)
        # two ticks per point -> two ids per point; the first and last
        # point of every line are flagged as end points
        if len(pts) > 1:
            ids += [ix] * len(pts) * 2
            markers = [True] + ([False] * (len(pts) - 2)) + [True]
            end_markers += markers
        elif len(pts) == 1:
            end_markers += [True]
            ids += [ix] * 2

    ticks = []
    # `num` is 1-based, so list_points[num] is the point AFTER `pt`
    for num, (pt, end) in enumerate(zip(list_points, end_markers), 1):
        if end:
            # end points get two zero-length placeholder ticks so the
            # two-ticks-per-point layout is preserved
            ticks.append([pt, pt])
            ticks.append([pt, pt])
        else:
            angle = self._getAngle(pt, list_points[num])
            line_end_1 = self._getPoint1(pt, angle, tick_length / 2)
            angle = self._getAngle(line_end_1, pt)
            line_end_2 = self._getPoint2(line_end_1, angle, tick_length)
            ticks.append([line_end_1, pt])
            ticks.append([line_end_2, pt])

    ticks = pygeos.linestrings(ticks)

    # inp indexes the ticks, res indexes the rows of `right`
    inp, res = right.sindex.query_bulk(ticks, predicate="intersects")
    intersections = pygeos.intersection(ticks[inp],
                                        right.geometry.values.data[res])
    # tick t belongs to point t // 2 (two ticks per point)
    distances = pygeos.distance(intersections,
                                pygeos.points(list_points[inp // 2]))

    # group the hits per tick; splitting at cumulative counts assumes
    # equal values of `inp` are contiguous (sorted) - TODO confirm
    inp_uni, inp_cts = np.unique(inp, return_counts=True)
    splitter = np.cumsum(inp_cts)[:-1]
    dist_per_res = np.split(distances, splitter)
    inp_per_res = np.split(res, splitter)

    # per tick: the smallest distance and the `right` row that produced it
    # NOTE(review): with no intersections at all np.split yields a single
    # empty group and np.min would raise - confirm callers never hit this
    min_distances = []
    min_inds = []
    for dis, ind in zip(dist_per_res, inp_per_res):
        min_distances.append(np.min(dis))
        min_inds.append(ind[np.argmin(dis)])

    # NaN marks ticks without any intersection
    dists = np.zeros((len(ticks), ))
    dists[:] = np.nan
    dists[inp_uni] = min_distances

    if heights is not None:
        # NOTE(review): self.heights is only assigned in these two
        # branches; a pd.Series argument is used as-is and never stored
        if isinstance(heights, str):
            heights = self.heights = right[heights]
        elif not isinstance(heights, pd.Series):
            heights = self.heights = pd.Series(heights)

        # positional index into `right` of the nearest hit per tick,
        # NaN where the tick hit nothing (None is stored as NaN in a
        # float array)
        blgs = np.zeros((len(ticks), ))
        blgs[:] = None
        blgs[inp_uni] = min_inds
        do_heights = True
    else:
        do_heights = False

    ids = np.array(ids)
    widths = []
    openness = []
    deviations = []
    heights_list = []
    heights_deviations_list = []

    for i in range(len(left)):
        f = ids == i
        s = dists[f]
        # ticks alternate between the two sides of the line
        lefts = s[::2]
        rights = s[1::2]
        # a side without any intersection falls back to half the tick length
        left_mean = np.nanmean(
            lefts) if ~np.isnan(lefts).all() else tick_length / 2
        right_mean = (np.nanmean(rights)
                      if ~np.isnan(rights).all() else tick_length / 2)
        widths.append(np.mean([left_mean, right_mean]) * 2)
        # share of this line's ticks that hit nothing
        openness.append(np.isnan(s).sum() / (f).sum())
        deviations.append(np.nanstd(s))

        if do_heights:
            b = blgs[f]
            # NOTE(review): `b` is a float array used as a positional
            # indexer - verify the installed pandas accepts this
            h = heights.iloc[b[~np.isnan(b)]]
            heights_list.append(h.mean())
            heights_deviations_list.append(h.std())

    self.w = pd.Series(widths, index=left.index)
    self.wd = pd.Series(deviations, index=left.index).fillna(
        0)  # fill for empty intersections
    self.o = pd.Series(openness, index=left.index).fillna(1)

    if do_heights:
        self.h = pd.Series(heights_list, index=left.index).fillna(
            0)  # fill for empty intersections
        self.hd = pd.Series(heights_deviations_list,
                            index=left.index).fillna(
                                0)  # fill for empty intersections
        self.p = self.h / self.w.replace(0, np.nan)  # replace to avoid np.inf
def test_distance():
    """Distances for the point/polygon fixture match the known values."""
    root2 = 2**0.5
    expected = [2 * root2, root2, 0, 0, 0, root2]
    np.testing.assert_allclose(
        pygeos.distance(*point_polygon_testdata), expected
    )
def create_drain_points(flowlines, joins, waterbodies, wb_joins):
    """Create drain points from furthest downstream point of flowlines that overlap with waterbodies.

    WARNING: If multiple flowlines intersect at the drain point, there will be multiple drain points at the same location

    Steps, in order:

    1. take the downstream-most point of every flowline that leaves a waterbody
    2. merge drain points that share the same geometry onto their common
       downstream flowline
    3. drop drains that are upstream of other drains of the same waterbody
    4. snap drains on loops to a junction within 5m, if connected
    5. add drain points for headwaters waterbodies

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    wb_joins : DataFrame
        waterbody / flowline joins

    Returns
    -------
    GeoDataFrame
        Drain points dataframe
    """
    start = time()

    wb_atts = waterbodies[["altered", "km2", "flowlineLength"]].copy()

    tmp_flowlines = flowlines[
        [
            "geometry",
            "FCode",
            "FType",
            "MaxElevSmo",
            "MinElevSmo",
            "Slope",
            "TotDASqKm",
            "StreamOrde",
            "sizeclass",
            "HUC4",
            "loop",
        ]
    ].rename(columns={"FCode": "lineFCode", "FType": "lineFType"})

    ### Find the downstream most point(s) on the flowline for each waterbody
    # This is used for snapping barriers, if possible.
    # Drop any where there is no flowline below the drain point (often pipelines
    # that were removed)
    tmp = wb_joins[["lineID", "wbID"]].set_index("lineID")
    drains = (
        joins.loc[
            joins.upstream_id.isin(wb_joins.lineID.unique())
            & (joins.downstream_id != 0)
        ]
        .join(tmp.wbID.rename("upstream_wbID"), on="upstream_id")
        .join(tmp.wbID.rename("downstream_wbID"), on="downstream_id")
    )

    # Only keep those that terminate outside the same waterbody as the upstream end
    drains = drains.loc[drains.upstream_wbID != drains.downstream_wbID].copy()

    # Join in stats from waterbodies and geometries from flowlines
    drain_pts = (
        wb_joins.loc[wb_joins.lineID.isin(drains.upstream_id.unique())]
        .join(
            wb_atts,
            on="wbID",
        )
        .join(
            tmp_flowlines[["geometry", "loop", "TotDASqKm"]],
            on="lineID",
        )
        .reset_index(drop=True)
    )

    # create a point from the last coordinate, which is the furthest one downstream
    drain_pts.geometry = pg.get_point(drain_pts.geometry.values.data, -1)

    # drop any that are downstream terminals; these are most likely waterbodies
    # that do not have further downstream networks (e.g., flow to ocean)
    ix = joins.loc[
        joins.upstream_id.isin(drain_pts.lineID) & (joins.downstream_id == 0)
    ].upstream_id
    drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)].copy()

    ### Find all drain points that share the same geometry.
    # These are most likely multiple segments that terminate in same drain point,
    # so we need to assign them their common downstream ID instead so that
    # snapping dams to these works properly later (otherwise snapped to only one of segments)
    drain_pts["hash"] = pd.util.hash_array(pg.to_wkb(drain_pts.geometry.values.data))
    s = drain_pts.groupby("hash").size()
    ix = drain_pts.hash.isin(s[s > 1].index)
    if ix.sum():
        print(f"Deduplicating {ix.sum():,} duplicate drain points")
        # find downstream_id for each of these, and deduplicate if there are multiple
        # downstreams, favoring the non-loops
        j = (
            joins.loc[
                joins.upstream_id.isin(drain_pts.loc[ix].lineID)
                & (joins.downstream_id != 0),
                ["upstream_id", "downstream_id", "loop"],
            ]
            .sort_values(by=["upstream_id", "loop"], ascending=True)
            .groupby("upstream_id")
            .first()
            .downstream_id
        )
        drain_pts = drain_pts.join(j, on="lineID")

        # for those at same location that share the same downstream line, use that line instead
        s = (
            drain_pts.loc[drain_pts.downstream_id.notnull()]
            .groupby("downstream_id")
            .size()
        )
        ix = drain_pts.downstream_id.isin(s[s > 1].index.astype("uint32"))
        drain_pts.loc[ix, "lineID"] = drain_pts.loc[ix].downstream_id.astype("uint32")

        # update the line properties to match that lineID
        lids = drain_pts.loc[ix].lineID.values
        drain_pts.loc[ix, "flowlineLength"] = flowlines.loc[lids, "length"].values
        drain_pts.loc[ix, "loop"] = flowlines.loc[lids].loop.values
        drain_pts.loc[ix, "TotDASqKm"] = flowlines.loc[lids].TotDASqKm.values

        drain_pts = drain_pts.drop(columns=["downstream_id"])

    # keep the first unique drain point and sort the rest so they are oriented
    # from upstream to downstream
    drain_pts = (
        drain_pts.drop(columns=["hash"])
        .groupby(["lineID", "wbID"])
        .first()
        .sort_values(by="TotDASqKm", ascending=True)
        .reset_index()
    )

    drain_pts = gp.GeoDataFrame(drain_pts, geometry="geometry", crs=flowlines.crs)

    ### Deduplicate drains by network topology
    # Find the downstream-most drains for waterbodies when there are multiple distinct ones per waterbody.
    # These may result from flowlines that cross in and out of waterbodies multiple
    # times (not valid), or there may be drains on downstream loops
    # (esp. at dams) (valid).
    dups = drain_pts.groupby("wbID").size() > 1
    if dups.sum():
        print(
            f"Found {dups.sum():,} waterbodies with multiple drain points; cleaing up"
        )
        # find all waterbodies that have duplicate drains
        ix = drain_pts.wbID.isin(dups[dups].index)
        wb_ids = drain_pts.loc[ix].wbID.unique()
        # find all corresponding line IDs for these waterbodies
        line_ids = wb_joins.loc[wb_joins.wbID.isin(wb_ids)].lineID.unique()
        lines_per_wb = (
            drain_pts.loc[drain_pts.wbID.isin(wb_ids)].groupby("wbID").lineID.unique()
        )

        # search within 20 degrees removed from ids; this hopefully
        # picks up any gaps where lines exit waterbodies for a ways then re-enter
        # some floodplain areas have very big loops outside waterbody
        pairs = find_joins(
            joins,
            line_ids,
            downstream_col="downstream_id",
            upstream_col="upstream_id",
            expand=20,
        )[["upstream_id", "downstream_id"]]

        # remove any terminal points
        pairs = pairs.loc[(pairs.upstream_id != 0) & (pairs.downstream_id != 0)]

        # create a directed graph facing DOWNSTREAM
        graph = DirectedGraph(pairs, source="upstream_id", target="downstream_id")

        # find all lines that are upstream of other lines
        # these are "parents" in the directed graph
        upstreams = graph.find_all_parents(lines_per_wb.values)
        ix = pd.Series(upstreams).explode().dropna().unique()
        print(
            f"Dropping {len(ix):,} drains that are upstream of other drains in the same waterbody"
        )
        # copy so the column assignments below operate on an independent
        # frame rather than on a filtered slice
        drain_pts = drain_pts.loc[~drain_pts.lineID.isin(ix)].copy()

    ### check if drain points are on a loop and very close to the junction
    # of the loop and nonloop (e.g., Hoover Dam, HUC2 == 15)
    drain_pts["snap_to_junction"] = False
    drain_pts["snap_dist"] = 0

    drains_by_wb = drain_pts.groupby("wbID").size()
    multiple_drain_wb = drains_by_wb[drains_by_wb > 1].index

    # limit this to drain points on loops where there are multiple drains per waterbody
    loop_pts = drain_pts.loc[
        drain_pts.loop & (drain_pts.wbID.isin(multiple_drain_wb))
    ].copy()

    # search within 3 degrees removed from ids; this hopefully
    # picks up any downstream junctions
    pairs = find_joins(
        joins,
        loop_pts.lineID.unique(),
        downstream_col="downstream_id",
        upstream_col="upstream_id",
        expand=3,
    )[["upstream_id", "downstream_id"]]

    # drop endpoints
    pairs = pairs.loc[(pairs.upstream_id != 0) & (pairs.downstream_id != 0)].copy()

    # find all junctions that have > 1 flowline upstream of them
    grouped = pairs.groupby("downstream_id").size()
    downstream_junctions = grouped[grouped > 1].index

    # extract upstream endpoint for each junction line
    downstream_junction_pts = pd.Series(
        pg.get_point(flowlines.loc[downstream_junctions].geometry.values.data, 0),
        index=downstream_junctions,
    )

    # find the nearest junctions within 5m tolerance of drain points on loops
    tree = pg.STRtree(downstream_junction_pts.values.data)
    left, right = tree.nearest_all(loop_pts.geometry.values.data, max_distance=5)

    # make sure they are connected on the network
    g = DirectedGraph(pairs, source="upstream_id", target="downstream_id")
    ix = g.is_reachable(
        loop_pts.iloc[left].lineID.values, downstream_junction_pts.iloc[right].index
    )
    left = left[ix]
    right = right[ix]

    if len(left):
        print(
            f"Found {len(left)} drains on loops within 5m upstream of a junction, updating them..."
        )
        # NOTE: these are attributed to the flowline that is DOWNSTREAM of the junction point
        # whereas other drains are attributed to the flowline upstream of themselves
        ix = loop_pts.index.take(left)
        drain_pts.loc[ix, "snap_to_junction"] = True
        drain_pts.loc[ix, "snap_dist"] = pg.distance(
            drain_pts.loc[ix].geometry.values.data,
            downstream_junction_pts.iloc[right].values,
        )
        drain_pts.loc[ix, "lineID"] = downstream_junction_pts.iloc[right].index
        drain_pts.loc[ix, "geometry"] = downstream_junction_pts.iloc[right].values

    ### Extract the drain points of upstream headwaters waterbodies
    # these are flowlines that originate at a waterbody
    wb_geom = waterbodies.loc[waterbodies.flowlineLength == 0].geometry
    wb_geom = pd.Series(wb_geom.values.data, index=wb_geom.index)
    # take only the upstream most point
    tmp_flowline_pts = tmp_flowlines[["geometry", "loop", "TotDASqKm"]].copy()
    tmp_flowline_pts["geometry"] = pg.get_point(flowlines.geometry.values.data, 0)
    fl_pt = pd.Series(
        tmp_flowline_pts.geometry.values.data, index=tmp_flowline_pts.index
    )
    headwaters = (
        sjoin_geometry(wb_geom, fl_pt, predicate="intersects")
        .rename("lineID")
        .reset_index()
    )
    headwaters = (
        headwaters.join(
            wb_atts,
            on="wbID",
        )
        .join(
            tmp_flowline_pts,
            on="lineID",
        )
        .reset_index(drop=True)
    )
    headwaters["headwaters"] = True
    headwaters["snap_to_junction"] = False
    headwaters["snap_dist"] = 0

    print(
        f"Found {len(headwaters):,} headwaters waterbodies, adding drain points for these too"
    )

    drain_pts["headwaters"] = False
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on every pandas version
    drain_pts = pd.concat(
        [drain_pts, headwaters], sort=False, ignore_index=True
    ).reset_index(drop=True)

    # join in line properties
    drain_pts = drain_pts.drop(columns=["loop", "TotDASqKm"]).join(
        tmp_flowlines.drop(columns=["geometry"]), on="lineID"
    )

    # calculate unique index
    # cast to uint32 before multiplying: the product does not fit in
    # uint16, and NumPy 2 no longer widens the dtype for a Python int
    # operand (older NumPy produced uint32 here as well)
    huc_id = drain_pts["HUC4"].astype("uint32") * 1000000
    drain_pts["drainID"] = drain_pts.index.values.astype("uint32") + huc_id

    # Convert back to GeoDataFrame; above steps make it into a DataFrame
    drain_pts = gp.GeoDataFrame(drain_pts, geometry="geometry", crs=flowlines.crs)
    drain_pts.wbID = drain_pts.wbID.astype("uint32")
    drain_pts.lineID = drain_pts.lineID.astype("uint32")
    drain_pts.flowlineLength = drain_pts.flowlineLength.astype("float32")

    print(
        "Done extracting {:,} waterbody drain points in {:.2f}s".format(
            len(drain_pts), time() - start
        )
    )

    return drain_pts
# some drains are at exact same point as extracted flowline crossing point tmp["same_subnet"] = tmp.lineID == tmp.drainLineID ix = ~tmp.same_subnet tmp.loc[ix, "same_subnet"] = g.is_reachable( tmp.loc[ix].lineID.values, tmp.loc[ix].drainLineID.values, 4 ) # try from other direction ix = ~tmp.same_subnet tmp.loc[ix, "same_subnet"] = g.is_reachable( tmp.loc[ix].drainLineID.values, tmp.loc[ix].lineID.values, 4 ) tmp = tmp.loc[tmp.same_subnet].copy() # take the closest drain to the crossing point if there are multiple on the # same flowline tmp["dist"] = pg.distance(tmp.geometry.values.data, tmp.pt.values.data) use_drains = ( tmp.sort_values(by=["damPtID", "dist"], ascending=True) .drop(columns=["same_subnet", "dist", "pt", "lineID"]) .groupby("damPtID") .first() ) dams = dams.join( use_drains[["drainID", "wbID", "drainLineID", "geometry"]].rename( columns={"geometry": "drain"} ) ) ix = dams.drainID.notnull() print( f"Found {ix.sum():,} dams associated with waterbodies in {time() - join_start:,.2f}s"
def snap_to_large_waterbodies(df, to_snap):
    """Snap to nearest large waterbody.

    NOTE: only run this on dams that could not snap to flowlines, to avoid
    moving them far away.

    This captures large dam centerpoints that are not near enough to flowlines.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    wb = from_geofeather(nhd_dir / "merged" / "large_waterbodies.feather").set_index(
        "wbID"
    )
    drains = (
        from_geofeather(nhd_dir / "merged" / "large_waterbody_drain_points.feather")
        .rename(columns={"id": "drainID"})
        .set_index("drainID")
    )

    # waterbodies whose boundary is within NEAR_WB_TOLERANCE of each dam
    near_wb = nearest(to_snap.geometry, pg.boundary(wb.geometry), NEAR_WB_TOLERANCE)

    # attach the dam geometry and every drain point of the matched
    # waterbody; pairs without a drain point are discarded
    near_wb = (
        pd.DataFrame(near_wb)
        .join(to_snap.geometry)
        .join(
            drains.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID"]]
            .rename(columns={"geometry": "drain"}),
            on="wbID",
        )
        .dropna(subset=["drain"])
    )
    near_wb["snap_dist"] = pg.distance(near_wb.geometry, near_wb.drain)

    # drop any that are > 250 m away, these aren't useful
    near_wb = near_wb.loc[near_wb.snap_dist <= WB_DRAIN_MAX_TOLERANCE].copy()

    # take the closest drain point
    near_wb = near_wb.sort_values(by="snap_dist").groupby(level=0).first()

    ix = near_wb.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_wb.drain
    # The dam is moved onto the drain point, so the recorded snap distance
    # is the dam -> drain distance computed above, not the dam -> waterbody
    # boundary distance coming out of the nearest() search.
    df.loc[ix, "snap_dist"] = near_wb.snap_dist
    df.loc[ix, "snap_ref_id"] = near_wb.drainID
    df.loc[ix, "lineID"] = near_wb.lineID
    df.loc[ix, "wbID"] = near_wb.wbID

    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        WB_DRAIN_MAX_TOLERANCE,
        "m tolerance of drain point of large waterbody that is within ",
        NEAR_WB_TOLERANCE,
        "m of dam",
    )

    # whatever was snapped here no longer needs snapping
    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of large waterbodies and within {}m of the drain point of those waterbodies".format(
            len(near_wb), NEAR_WB_TOLERANCE, WB_DRAIN_MAX_TOLERANCE
        )
    )

    return df, to_snap
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=None):
    """Extract flowlines and their joins from an NHDPlusHR geodatabase.

    Reads the NHDFlowline layer, joins it to the VAA table, repairs joins that
    NHD marks as terminals but that touch another flowline, removes coastlines
    and underground conduits, labels loops, and adds calculated fields
    (lineID, sizeclass, length, sinuosity).

    Parameters
    ----------
    gdb_path : str or pathlib.Path
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols: list, optional (default: None)
        List of extra field names to extract from NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """
    from pathlib import Path

    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + list(extra_flowline_cols or [])
    # NOTE(review): columns is passed as a list wrapping the list of names;
    # left unchanged, confirm this is what read_dataframe expects
    df = read_dataframe(
        gdb_path,
        layer="NHDFlowline",
        force_2d=True,
        columns=[flowline_cols],
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=[VAA_COLS])

    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # some valid joins are marked as terminals (downstream==0) in NHD; we need
    # to backfill the missing join info.
    # To do this, we intersect all terminals back with flowlines dropping any
    # that are themselves terminals.  Then we calculate the distance to the upstream
    # point of the intersected line, and the upstream point of the next segment
    # downstream.  We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()
    # get last point, is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    target = df.loc[~df.index.isin(ix)]  # only search against other flowlines
    tree = pg.STRtree(target.geometry.values.data)
    # search within a tolerance of 0.001, these are very very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines.
            # `right` holds positions into `target` (the tree was built on it),
            # so the geometries must come from `target`, not `df`.
            "upstream_target": pg.get_point(
                target.geometry.values.data.take(right), 0
            ),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple; possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any nan
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)
    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # this also ignores na
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        # Path() lets gdb_path be given as either a str or a Path
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{Path(gdb_path).stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428).  It doesn't work properly;
    # there are many that go through dams and are thus needed to calculate
    # network connectivity and gain of removing a dam.
    print("Filtering out coastlines...")
    coastline_idx = df.loc[df.FType == 566].index
    df = df.loc[~df.index.isin(coastline_idx)].copy()
    print(f"{len(df):,} features after removing coastlines")

    # remove any joins that have coastlines as upstream
    # these are themselves coastline segments
    join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()

    # set the downstream to 0 for any that join coastlines
    # this will enable us to mark these as downstream terminals in
    # the network analysis later
    join_df["marine"] = join_df.downstream.isin(coastline_idx)
    join_df.loc[join_df.marine, "downstream"] = 0
    join_df.loc[join_df.marine, "type"] = "terminal"

    # drop any duplicates (above operation sets some joins to upstream and downstream of 0)
    join_df = join_df.drop_duplicates(subset=["upstream", "downstream"])

    ### Filter out underground connectors
    ix = df.loc[df.FType == 420].index
    print(f"Removing {len(ix):,} underground conduits")
    df = df.loc[~df.index.isin(ix)].copy()
    join_df = remove_joins(
        join_df, ix, downstream_col="downstream", upstream_col="upstream"
    )

    ### Label loops for easier removal later
    # WARNING: loops may be very problematic from a network processing standpoint.
    # Include with caution.
    print("Identifying loops")
    df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull())

    idx = df.loc[df.loop].index
    join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx)

    ### Add calculated fields
    # Set our internal master IDs from the index of the data frame.
    # uint32 holds values up to ~4.29 billion.
    # NOTE(review): the index at this point is NHDPlusID (uint64); casting it
    # to uint32 wraps any value above the uint32 range -- confirm this yields
    # unique lineIDs for the data being processed.
    df["lineID"] = df.index.values.astype("uint32") + 1
    join_df = (
        join_df.join(df.lineID.rename("upstream_id"), on="upstream")
        .join(df.lineID.rename("downstream_id"), on="downstream")
        .fillna(0)
    )

    for col in ("upstream", "downstream"):
        join_df[col] = join_df[col].astype("uint64")

    for col in ("upstream_id", "downstream_id"):
        join_df[col] = join_df[col].astype("uint32")

    ### Calculate size classes
    print("Calculating size class")
    drainage = df.TotDASqKm
    df.loc[drainage < 10, "sizeclass"] = "1a"
    df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b"
    df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2"
    df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a"
    df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b"
    df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4"
    df.loc[drainage >= 25000, "sizeclass"] = "5"

    # Calculate length and sinuosity
    print("Calculating length and sinuosity")
    df["length"] = df.geometry.length.astype("float32")
    df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32")

    # drop columns not useful for later processing steps
    df = df.drop(columns=["FlowDir", "StreamCalc"])

    # calculate incoming joins (have valid upstream, but not in this HUC4)
    join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in"

    return df, join_df
def snap_to_nhd_dams(df, to_snap):
    """Try to move dams in to_snap onto NHD dam points.

    Runs two passes: first dams matched to an NHD dam polygon (searched with
    NHD_DAM_TOLERANCE) are moved to the closest NHD dam point sharing that
    polygon's damID; then remaining dams are matched directly to NHD dam
    points using each dam's own snap_tolerance.

    Results are written into df; dams handled here are removed from to_snap.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """

    def _load_dams(filename):
        # both NHD dam datasets get the same treatment: "id" becomes the
        # "damID" index and a stray "index" column is dropped if present
        raw = from_geofeather(nhd_dir / "merged" / filename)
        renamed = raw.rename(columns={"id": "damID"}).set_index("damID")
        return renamed.drop(columns=["index"], errors="ignore")

    print("Snapping to NHD dams...")

    dam_polys = _load_dams("nhd_dams_poly.feather")
    # NOTE: id is not unique for points
    dam_points = _load_dams("nhd_dams_pt.feather")

    # -1 marks a missing waterbody ID; turn it back into NaN
    dam_points.wbID = dam_points.wbID.replace(-1, np.nan)

    ### Pass 1: dams near NHD dam polygons
    # Those that have multiple dams nearby are usually part of a dam complex
    started = time()
    poly_hits = nearest(
        to_snap.geometry, dam_polys.geometry, distance=NHD_DAM_TOLERANCE
    )

    # bring in every dam point of the matched damID (some are > 1 km away);
    # this can yield several rows per dam
    candidates = (
        poly_hits[["damID"]]
        .join(to_snap.geometry.rename("source_pt"))
        .join(dam_points, on="damID")
    )
    candidates["snap_dist"] = pg.distance(candidates.geometry, candidates.source_pt)

    # order by distance within each dam, then collapse to one row per dam
    ordered = candidates.reset_index().sort_values(by=["id", "snap_dist"])
    closest = ordered.groupby("id").first()

    ix = closest.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = closest.geometry
    df.loc[ix, "snap_dist"] = closest.snap_dist
    df.loc[ix, "snap_ref_id"] = closest.damID
    df.loc[ix, "lineID"] = closest.lineID
    df.loc[ix, "wbID"] = closest.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ", NHD_DAM_TOLERANCE, "m of NHD dam polygon"
    )

    still_unsnapped = ~to_snap.index.isin(ix)
    to_snap = to_snap.loc[still_unsnapped].copy()
    print(
        "Snapped {:,} dams to NHD dam polygons in {:.2f}s".format(
            len(ix), time() - started
        )
    )

    ### Pass 2: dams within their own snapping tolerance of NHD dam points
    started = time()
    # damID repeats across points, so use a fresh unique index to join on
    points_flat = dam_points.reset_index()
    point_hits = nearest(
        to_snap.geometry, points_flat.geometry, distance=to_snap.snap_tolerance
    )
    point_hits = point_hits.rename(columns={"distance": "snap_dist"})

    candidates = point_hits.join(to_snap.geometry.rename("source_pt")).join(
        points_flat, on="index_right"
    )
    ordered = candidates.reset_index().sort_values(by=["id", "snap_dist"])
    closest = ordered.groupby("id").first()

    ix = closest.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = closest.geometry
    df.loc[ix, "snap_dist"] = closest.snap_dist
    df.loc[ix, "snap_ref_id"] = closest.damID
    df.loc[ix, "lineID"] = closest.lineID
    df.loc[ix, "wbID"] = closest.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of NHD dam point but >",
        NHD_DAM_TOLERANCE,
        "m from NHD dam polygon",
    )

    still_unsnapped = ~to_snap.index.isin(ix)
    to_snap = to_snap.loc[still_unsnapped].copy()
    print(
        "Snapped {:,} dams to NHD dam points in {:.2f}s".format(
            len(ix), time() - started
        )
    )

    ### TODO: identify any NHD dam points that didn't get claimed (need to do this after snapping others)

    return df, to_snap
def snap_to_waterbodies(df, to_snap):
    """Attempt to snap points from to_snap to waterbody drain points.

    Works through several passes, each recording results in df and removing
    the snapped dams from to_snap:

    1. dams spatially joined to a waterbody, whose closest drain point for
       that waterbody is within the dam's snap_tolerance
    2. dams further than tolerance from their own waterbody's drain but
       closer to the drain of another waterbody
    3. remaining dams in a waterbody within WB_DRAIN_MAX_TOLERANCE of its drain
    4. dams not handled above that are within tolerance of any drain point

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    ### Attempt to snap to waterbody drain points for major waterbodies
    # Use larger tolerance for larger waterbodies
    print("Snapping to waterbodies and drain points..")
    wb = from_geofeather(nhd_dir / "merged" / "waterbodies.feather").set_index("wbID")
    drains = (
        from_geofeather(nhd_dir / "merged" / "waterbody_drain_points.feather")
        .rename(columns={"id": "drainID"})
        .set_index("drainID")
    )

    ### First pass - find the dams that are contained by waterbodies
    contained_start = time()

    in_wb = sjoin(to_snap, wb, how="inner").index_right.rename("wbID")

    # update wbID in dataset, but this doesn't mean it is snapped
    ix = in_wb.index
    df.loc[ix, "wbID"] = in_wb

    print(
        "Found {:,} dams in waterbodies in {:.2f}s".format(
            len(in_wb), time() - contained_start
        )
    )

    print("Finding nearest drain points...")
    snap_start = time()

    # join back to pygeos geoms and join to drains
    # NOTE: this may produce multiple drains for some waterbodies
    in_wb = (
        pd.DataFrame(in_wb)
        .join(to_snap[["geometry", "snap_tolerance"]])
        .join(
            drains.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID"]]
            .rename(columns={"geometry": "drain"}),
            on="wbID",
        )
        .dropna(subset=["drain"])
    )
    in_wb["snap_dist"] = pg.distance(in_wb.geometry, in_wb.drain)

    # drop any that are > 500 m away, these aren't useful
    in_wb = in_wb.loc[in_wb.snap_dist <= 500].copy()

    # take the closest drain point
    in_wb.index.name = "index"
    in_wb = (
        in_wb.reset_index()
        .sort_values(by=["index", "snap_dist"])
        .groupby("index")
        .first()
    )

    # Any that are within the snap tolerance just snap to that drain
    close_enough = in_wb.loc[in_wb.snap_dist <= in_wb.snap_tolerance]
    ix = close_enough.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = close_enough.drain
    df.loc[ix, "snap_dist"] = close_enough.snap_dist
    df.loc[ix, "snap_ref_id"] = close_enough.drainID
    df.loc[ix, "lineID"] = close_enough.lineID
    df.loc[ix, "wbID"] = close_enough.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point for waterbody that contains this dam",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within tolerance of the drain points for their waterbody in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    # Any that are > tolerance away from their own drain, but within tolerance of another drain
    # should snap to the other drain; these are in chains of multiple waterbodies.
    # Visually confirmed this by looking at several.
    snap_start = time()
    further = in_wb.loc[in_wb.snap_dist > in_wb.snap_tolerance].copy()
    nearest_drains = nearest(further.geometry, drains.geometry, further.snap_tolerance)

    maybe_near_neighbor = further.join(nearest_drains, rsuffix="_nearest")

    # keep only those where the other drain is closer than the dam's own drain
    ix = maybe_near_neighbor.loc[
        maybe_near_neighbor.distance < maybe_near_neighbor.snap_dist
    ].index
    near_neighbor = (
        (
            maybe_near_neighbor.loc[ix]
            .drop(columns=["drain", "drainID", "wbID", "lineID", "snap_dist"])
            .rename(columns={"drainID_nearest": "drainID", "distance": "snap_dist"})
            .join(
                drains[["geometry", "lineID", "wbID"]].rename(
                    columns={"geometry": "drain"}
                ),
                on="drainID",
            )
        )
        .sort_values(by="snap_dist")
        .groupby(level=0)
        .first()
    )

    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_neighbor.drain
    df.loc[ix, "snap_dist"] = near_neighbor.snap_dist
    df.loc[ix, "snap_ref_id"] = near_neighbor.drainID
    df.loc[ix, "lineID"] = near_neighbor.lineID
    df.loc[ix, "wbID"] = near_neighbor.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point for adjacent waterbody",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams close to drain points for an adjacent waterbody in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    # Any that remain and are within WB_DRAIN_MAX_TOLERANCE of the drain of
    # their own waterbody snap to that drain
    further = further.loc[
        ~further.index.isin(ix) & (further.snap_dist <= WB_DRAIN_MAX_TOLERANCE)
    ].copy()

    ix = further.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = further.drain
    df.loc[ix, "snap_dist"] = further.snap_dist
    df.loc[ix, "snap_ref_id"] = further.drainID
    df.loc[ix, "lineID"] = further.lineID
    df.loc[ix, "wbID"] = further.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "-",
        WB_DRAIN_MAX_TOLERANCE,
        "m tolerance of drain point of waterbody that contains this dam",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within <{}m of the drain points for their waterbody".format(
            len(ix), WB_DRAIN_MAX_TOLERANCE
        )
    )

    ### Find the ones that are not in a waterbody but within tolerance of a drain
    # Visually inspected several that had multiple waterbodies nearby
    # in all cases, the nearest one was sufficient
    print("Finding nearest waterbody drains for unsnapped dams...")
    snap_start = time()
    # capture the largest tolerance used for this search now; once to_snap is
    # filtered below it only describes the dams that were NOT snapped
    max_tolerance = to_snap.snap_tolerance.max()
    nearest_drains = nearest(to_snap.geometry, drains.geometry, to_snap.snap_tolerance)

    nearest_drains = nearest_drains.join(to_snap.geometry).join(
        drains[["geometry", "wbID", "lineID"]].rename(columns={"geometry": "drain"}),
        on="drainID",
    )

    ix = nearest_drains.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = nearest_drains.drain
    df.loc[ix, "snap_dist"] = nearest_drains.distance
    df.loc[ix, "snap_ref_id"] = nearest_drains.drainID
    df.loc[ix, "lineID"] = nearest_drains.lineID
    df.loc[ix, "wbID"] = nearest_drains.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ",
        to_snap.loc[ix].snap_tolerance,
        "m tolerance of drain point of waterbody (dam not in waterbody)",
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

    print(
        "Found {:,} dams within {}m of waterbody drain points".format(
            len(ix), max_tolerance
        )
    )

    # TODO: need to track which waterbodies were claimed by dams

    return df, to_snap
def snap_to_nhd_dams(df, to_snap):
    """Attempt to snap points from to_snap to NHD dams.

    Runs three passes, each recording results in df and removing the snapped
    dams from to_snap:

    1. dams within NHD_DAM_TOLERANCE of an NHD dam point
    2. dams matched to an NHD dam polygon (searched with NHD_DAM_TOLERANCE),
       snapped to a dam point with the same damID
    3. dams within their snap_tolerance (capped at NHD_DAM_PT_TOLERANCE) of an
       NHD dam point

    When several dam points are candidates for one dam, rows are sorted by
    sizeclass (descending), loop (ascending), then distance (ascending)
    before one is taken per dam.

    Updates df with snapping results, and returns to_snap as set of dams still
    needing to be snapped after this operation.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    snap_start = time()

    print("=================\nSnapping to NHD dams...")

    nhd_dams_poly = gp.read_feather(
        nhd_dir / "merged" / "nhd_dams_poly.feather", columns=["damID", "geometry"]
    ).set_index("damID")

    # NOTE: there may be multiple points per damID
    nhd_dams = gp.read_feather(
        nhd_dir / "merged" / "nhd_dams_pt.feather",
        columns=["damID", "wbID", "lineID", "loop", "sizeclass", "geometry"],
    ).set_index("damID")
    # set nulls back to na
    nhd_dams.wbID = nhd_dams.wbID.replace(-1, np.nan)

    ### Find dams that are really close (within NHD_DAM_TOLERANCE) to NHD dam points
    near_nhd_pt = nearest(
        pd.Series(to_snap.geometry.values.data, index=to_snap.index),
        pd.Series(nhd_dams.geometry.values.data, index=nhd_dams.index),
        max_distance=NHD_DAM_TOLERANCE,
    )[["damID"]]

    # NOTE: joining on damID brings in every point of that dam; the
    # sort + groupby below reduces this to a single row per dam
    near_nhd_pt = near_nhd_pt.join(to_snap.geometry.rename("source_pt")).join(
        nhd_dams, on="damID"
    )
    near_nhd_pt["snap_dist"] = pg.distance(
        near_nhd_pt.geometry.values.data, near_nhd_pt.source_pt.values.data
    )

    # take the largest, nonloop
    near_nhd_pt = (
        near_nhd_pt.reset_index()
        .sort_values(
            by=["id", "sizeclass", "loop", "snap_dist"],
            ascending=[True, False, True, True],
        )
        .groupby("id")
        .first()
    )

    ix = near_nhd_pt.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd_pt.geometry
    df.loc[ix, "snap_dist"] = near_nhd_pt.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd_pt.damID
    df.loc[ix, "lineID"] = near_nhd_pt.lineID
    df.loc[ix, "wbID"] = near_nhd_pt.wbID
    df.loc[
        ix, "snap_log"
    ] = f"snapped: within {NHD_DAM_TOLERANCE}m tolerance of NHD dam point"

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        f"Snapped {len(ix):,} dams within {NHD_DAM_TOLERANCE} to NHD dam points in {time() - snap_start:.2f}s"
    )

    ### Find dams that are really close (within NHD_DAM_TOLERANCE) to NHD dam polygons
    # Those that have multiple dams nearby are usually part of a dam complex
    # restart the timer so this pass does not report the previous pass's time
    snap_start = time()
    near_nhd = near(
        pd.Series(to_snap.geometry.values.data, index=to_snap.index),
        pd.Series(nhd_dams_poly.geometry.values.data, index=nhd_dams_poly.index),
        distance=NHD_DAM_TOLERANCE,
    )[["damID"]]

    # snap to nearest dam point for that dam (some are > 1 km away)
    # NOTE: this will create multiple entries for some dams; one is selected
    # by the sort + groupby below
    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        nhd_dams, on="damID"
    )
    near_nhd["snap_dist"] = pg.distance(
        near_nhd.geometry.values.data, near_nhd.source_pt.values.data
    )

    # Sort to prioritize larger size classes and non-loops, then distance
    # this also drops duplicates
    near_nhd = (
        near_nhd.reset_index()
        .sort_values(
            by=["id", "sizeclass", "loop", "snap_dist"],
            ascending=[True, False, True, True],
        )
        .groupby("id")
        .first()
    )

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[ix, "snap_log"] = ndarray_append_strings(
        "snapped: within ", NHD_DAM_TOLERANCE, "m of NHD dam polygon"
    )

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        "Snapped {:,} dams to NHD dam polygons in {:.2f}s".format(
            len(ix), time() - snap_start
        )
    )

    ### Find dams that are close (within snapping tolerance) of NHD dam points
    # most of these should have been picked up above, but this picks up ones that are
    # greater than NHD_DAM_TOLERANCE away due to bad locations
    snap_start = time()
    tmp = nhd_dams.reset_index()  # reset index so we have unique index to join on
    near_nhd = nearest(
        pd.Series(to_snap.geometry.values.data, index=to_snap.index),
        pd.Series(tmp.geometry.values.data, index=tmp.index),
        # per-dam tolerance, capped at NHD_DAM_PT_TOLERANCE
        max_distance=np.clip(to_snap.snap_tolerance.values, 0, NHD_DAM_PT_TOLERANCE),
    ).rename(columns={"distance": "snap_dist"})

    near_nhd = near_nhd.join(to_snap.geometry.rename("source_pt")).join(
        tmp, on="index_right"
    )
    near_nhd = (
        near_nhd.reset_index()
        .sort_values(
            by=["id", "sizeclass", "loop", "snap_dist"],
            ascending=[True, False, True, True],
        )
        .groupby("id")
        .first()
    )

    ix = near_nhd.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = near_nhd.geometry
    df.loc[ix, "snap_dist"] = near_nhd.snap_dist
    df.loc[ix, "snap_ref_id"] = near_nhd.damID
    df.loc[ix, "lineID"] = near_nhd.lineID
    df.loc[ix, "wbID"] = near_nhd.wbID
    df.loc[
        ix, "snap_log"
    ] = f"snapped: within {NHD_DAM_PT_TOLERANCE}m tolerance of NHD dam point but >{NHD_DAM_TOLERANCE}m from NHD dam polygon"

    to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()
    print(
        f"Snapped {len(ix):,} dams to NHD dam points in {time() - snap_start:.2f}s"
    )

    return df, to_snap
def snap_to_waterbodies(df, to_snap):
    """Try to move dams in to_snap onto waterbody drain points, one HUC2 at a time.

    For each HUC2 present in to_snap, dams not flagged as lowhead dams
    (LowheadDam != 1) are processed in two passes:

    1. dams matched to a waterbody within 1m are snapped to a drain point of
       that waterbody that lies within the dam's snap_tolerance
    2. dams not matched to any waterbody are matched to the nearest drain
       point (tolerance capped at WB_DRAIN_MAX_TOLERANCE) and then snapped to
       a drain point of that drain's waterbody within snap_tolerance

    In both passes candidate drains are ordered by sizeclass (descending),
    loop (ascending) and distance (ascending) before one is taken per dam.

    Results are written into df; dams handled here are removed from to_snap.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry")
        and snapping tolerance ("snap_tolerance")

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap)
    """
    # NOTE: known lowhead dams are deliberately left out of waterbody snapping
    print("=================\nSnapping to waterbodies and drain points..")

    # ordering applied to candidate drains in both passes
    drain_sort_cols = ["sizeclass", "loop", "snap_dist"]
    drain_sort_dirs = [False, True, True]

    for huc2 in sorted(to_snap.HUC2.unique()):
        print(f"\n----- {huc2} ------")

        region_dir = nhd_dir / "clean" / huc2
        is_candidate = (to_snap.HUC2 == huc2) & (to_snap.LowheadDam != 1)
        candidates = to_snap.loc[is_candidate].copy()

        waterbodies = gp.read_feather(
            region_dir / "waterbodies.feather",
            columns=["wbID", "geometry"],
        ).set_index("wbID")

        drain_points = gp.read_feather(
            region_dir / "waterbody_drain_points.feather",
            columns=["drainID", "wbID", "lineID", "loop", "sizeclass", "geometry"],
        ).set_index("drainID")

        print(
            f"HUC {huc2} selected {len(candidates):,} barriers in region to snap against {len(waterbodies):,} waterbodies"
        )

        ### Pass 1: dams that sit in (within 1m of) a waterbody
        contained_start = time()

        wb_tree = pg.STRtree(waterbodies.geometry.values.data)
        dam_pos, wb_pos = wb_tree.nearest_all(
            candidates.geometry.values.data, max_distance=1
        )
        dam_to_wb = pd.DataFrame(
            {
                "id": candidates.index.values.take(dam_pos),
                "wbID": waterbodies.index.values.take(wb_pos),
            }
        )
        # only the first waterbody match is kept for each dam
        contained = dam_to_wb.groupby("id").first()

        # remembered for pass 2, which skips every dam found in a waterbody
        contained_ids = contained.index

        # record the waterbody; this alone does not mean the dam is snapped
        df.loc[contained.index, "wbID"] = contained.wbID

        print(
            f"Found {len(contained):,} dams in waterbodies in {time() - contained_start:.2f}s"
        )

        print("Finding nearest drain points...")
        snap_start = time()

        # attach dam geometry plus every drain of the matched waterbody;
        # a waterbody may contribute several drains per dam
        drains_by_wb = (
            drain_points.reset_index()
            .set_index("wbID")[["drainID", "lineID", "loop", "sizeclass", "geometry"]]
            .rename(columns={"geometry": "drain"})
        )
        contained = (
            contained.join(to_snap[["geometry", "snap_tolerance"]])
            .join(drains_by_wb, on="wbID")
            .dropna(subset=["drain"])
        )
        contained["snap_dist"] = pg.distance(
            contained.geometry.values.data, contained.drain.values.data
        )

        # keep drains within tolerance, order them, take one row per dam
        within_tolerance = contained.loc[
            contained.snap_dist <= contained.snap_tolerance
        ]
        contained = (
            within_tolerance.reset_index()
            .sort_values(by=drain_sort_cols, ascending=drain_sort_dirs)
            .groupby("id")
            .first()
        )

        ix = contained.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = contained.drain
        df.loc[ix, "snap_dist"] = contained.snap_dist
        df.loc[ix, "snap_ref_id"] = contained.drainID
        df.loc[ix, "lineID"] = contained.lineID
        df.loc[ix, "wbID"] = contained.wbID
        df.loc[ix, "snap_log"] = ndarray_append_strings(
            "snapped: within ",
            to_snap.loc[ix].snap_tolerance,
            "m tolerance of drain point for waterbody that contains this dam",
        )

        unsnapped = ~to_snap.index.isin(ix)
        to_snap = to_snap.loc[unsnapped].copy()

        print(
            f"Found {len(ix):,} dams within tolerance of the drain points for their waterbody in {time() - snap_start:.2f}s"
        )

        ### Pass 2: dams outside waterbodies but within tolerance of a drain
        # Visually inspected several that had multiple waterbodies nearby
        # in all cases, the nearest waterbody was sufficient
        print("Finding nearest waterbody drains for unsnapped dams...")
        snap_start = time()

        # only dams that were not matched to a waterbody in pass 1
        outside_wb = ~candidates.index.isin(contained_ids.unique())
        not_in_wb = candidates.loc[outside_wb].copy()

        nearest_drains = nearest(
            pd.Series(not_in_wb.geometry.values.data, index=not_in_wb.index),
            pd.Series(drain_points.geometry.values.data, index=drain_points.index),
            max_distance=np.clip(
                not_in_wb.snap_tolerance.values, 0, WB_DRAIN_MAX_TOLERANCE
            ),
        )

        # swap the nearest drain for its waterbody, then bring in all drains
        # of that waterbody
        all_wb_drains = (
            drain_points.reset_index()
            .set_index("wbID")[["geometry", "drainID", "lineID", "loop", "sizeclass"]]
            .rename(columns={"geometry": "drain"})
        )
        nearest_drains = (
            nearest_drains.drop(columns=["distance"])
            .join(not_in_wb[["geometry", "snap_tolerance"]])
            .join(drain_points.wbID, on="drainID")
            .drop(columns=["drainID"])
            .join(all_wb_drains, on="wbID")
        )
        nearest_drains["snap_dist"] = pg.distance(
            nearest_drains.geometry.values.data, nearest_drains.drain.values.data
        )

        # keep drains strictly inside tolerance, order them, take one per dam
        inside_tolerance = nearest_drains.loc[
            nearest_drains.snap_dist < nearest_drains.snap_tolerance
        ]
        nearest_drains = (
            inside_tolerance.sort_values(by=drain_sort_cols, ascending=drain_sort_dirs)
            .groupby(level=0)
            .first()
        )

        ix = nearest_drains.index
        df.loc[ix, "snapped"] = True
        df.loc[ix, "geometry"] = nearest_drains.drain
        df.loc[ix, "snap_dist"] = nearest_drains.snap_dist
        df.loc[ix, "snap_ref_id"] = nearest_drains.drainID
        df.loc[ix, "lineID"] = nearest_drains.lineID
        df.loc[ix, "wbID"] = nearest_drains.wbID
        df.loc[
            ix, "snap_log"
        ] = f"snapped: within {WB_DRAIN_MAX_TOLERANCE}m or less of drain point of waterbody (dam not in waterbody)"

        unsnapped = ~to_snap.index.isin(ix)
        to_snap = to_snap.loc[unsnapped].copy()

        print(
            f"Found {len(ix):,} dams within {WB_DRAIN_MAX_TOLERANCE}m or less of waterbody drain points"
        )

    return df, to_snap
def _record_drain_snaps(df, snapped, snap_log):
    """Record dams snapped to drain points in the master dataset.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset; updated in place
    snapped : DataFrame
        indexed by dam id, with columns "drain", "snap_dist", "drainID",
        "lineID", and "wbID"
    snap_log : str
        message written to "snap_log" for every dam in `snapped`

    Returns
    -------
    Index
        ids of the dams recorded as snapped
    """
    ix = snapped.index
    df.loc[ix, "snapped"] = True
    df.loc[ix, "geometry"] = snapped.drain
    df.loc[ix, "snap_dist"] = snapped.snap_dist
    df.loc[ix, "snap_ref_id"] = snapped.drainID
    df.loc[ix, "lineID"] = snapped.lineID
    df.loc[ix, "wbID"] = snapped.wbID
    df.loc[ix, "snap_log"] = snap_log

    return ix


def snap_estimated_dams_to_drains(df, to_snap):
    """Snap estimated dams to waterbody drain points.

    Dams that were estimated from waterbodies (snap_group 3) are snapped to
    the nearest drain point within their snap tolerance (should be very small
    snap_dist).

    Other estimated dams (snap_group 1) often occur inside / immediately
    adjacent to waterbodies; they are assigned to the nearest waterbody within
    1m and snapped to the nearest drain point of that waterbody if it is
    <= 2km away and the waterbody has no more than 2 drain points.

    Dams with ManualReview of 4 or 13 are left untouched.

    Parameters
    ----------
    df : GeoDataFrame
        master dataset, this is where all snapping gets recorded
        (updated in place)
    to_snap : DataFrame
        data frame containing pygeos geometries to snap ("geometry") and
        snapping tolerance ("snap_tolerance"); "snap_group", "ManualReview",
        and "HUC2" columns are also read

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (df, to_snap); snapped dams are removed from the returned to_snap
    """

    snap_start = time()

    # if estimated dam and was not manually reviewed and moved or verified at correct location
    is_estimated = to_snap.snap_group.isin([1, 3]) & (
        ~to_snap.ManualReview.isin([4, 13])
    )
    estimated = to_snap.loc[is_estimated].copy()

    print(f"=================\nSnapping {len(estimated):,} estimated dams...")

    for huc2 in sorted(estimated.HUC2.unique()):
        wb = gp.read_feather(
            nhd_dir / "clean" / huc2 / "waterbodies.feather",
            columns=["wbID", "geometry"],
        ).set_index("wbID")
        drains = gp.read_feather(
            nhd_dir / "clean" / huc2 / "waterbody_drain_points.feather",
            columns=["drainID", "wbID", "lineID", "geometry"],
        ).set_index("drainID")

        in_huc2 = estimated.loc[estimated.HUC2 == huc2].copy()

        # most estimated dams were originally derived from waterbody drain points,
        # so process those first
        from_wb = in_huc2.loc[in_huc2.snap_group == 3]
        if len(from_wb):
            # only the first distinct tolerance is used as the search radius
            max_drain_dist = from_wb.snap_tolerance.unique()[0]

            tree = pg.STRtree(drains.geometry.values.data)
            left, right = tree.nearest_all(
                from_wb.geometry.values.data, max_distance=max_drain_dist
            )

            # take the first drain in case a dam has multiple nearest drains
            drain_joins = (
                pd.DataFrame(
                    {
                        "id": from_wb.index.values.take(left),
                        "geometry": from_wb.geometry.values.take(left),
                        "drainID": drains.index.values.take(right),
                        "drain": drains.geometry.values.take(right),
                        "wbID": drains.wbID.values.take(right),
                        "lineID": drains.lineID.values.take(right),
                    }
                )
                .groupby("id")
                .first()
            )
            drain_joins["snap_dist"] = pg.distance(
                drain_joins.geometry.values.data, drain_joins.drain.values.data
            )

            ix = _record_drain_snaps(
                df,
                drain_joins,
                "snapped: dams estimated from waterbody snapped to nearest drain point",
            )
            to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

            # report against the number of candidates, not the number snapped
            print(
                f"HUC {huc2}: snapped {len(drain_joins):,} of {len(from_wb):,} dams estimated from waterbodies in region to waterbody drain points"
            )

        in_huc2 = in_huc2.loc[in_huc2.snap_group == 1]

        # Some estimated dams are just barely outside their waterbodies
        # so we take the nearest waterbody for each, within a tolerance of 1m
        tree = pg.STRtree(wb.geometry.values.data)
        left, right = tree.nearest_all(in_huc2.geometry.values.data, max_distance=1)

        # take the first waterbody in case of duplicates, then bring in every
        # drain point of that waterbody (one row per dam / drain pair)
        wb_drains = (
            drains[["wbID", "lineID", "geometry"]]
            .reset_index()
            .set_index("wbID")
            .rename(columns={"geometry": "drain"})
        )
        in_wb = (
            pd.DataFrame(
                {
                    "id": in_huc2.index.values.take(left),
                    "wbID": wb.index.values.take(right),
                }
            )
            .groupby("id")
            .first()
            .join(in_huc2.geometry)
            .join(wb_drains, on="wbID")
        )

        in_wb["snap_dist"] = pg.distance(
            in_wb.geometry.values.data, in_wb.drain.values.data
        )

        # sorting by distance first means first() keeps the closest drain per dam
        grouped = in_wb.sort_values(by="snap_dist").groupby(level=0)
        in_wb = grouped.first()

        # any waterbodies that have > 2 drains are dubious fits; remove them
        drain_counts = grouped.size()
        dubious = drain_counts[drain_counts > 2].index
        in_wb = in_wb.loc[~in_wb.index.isin(dubious)].copy()

        # any that are >2,000m away are likely incorrect; some ones near that length are OK
        in_wb = in_wb.loc[in_wb.snap_dist <= 2000].copy()

        ix = _record_drain_snaps(
            df,
            in_wb,
            "snapped: estimated dam in waterbody snapped to nearest drain point",
        )
        to_snap = to_snap.loc[~to_snap.index.isin(ix)].copy()

        print(
            f"HUC {huc2}: snapped {len(in_wb):,} of {len(in_huc2):,} estimated dams in region to waterbody drain points"
        )

    # NOTE(review): this prefix only matches the "estimated dam in waterbody"
    # log message; dams logged as "snapped: dams estimated from waterbody ..."
    # are not included in this count. Confirm whether that is intended.
    print(
        f"Snapped {len(df.loc[df.snap_log.str.startswith('snapped: estimated dam')]):,} estimated dams to waterbody drain points in {time() - snap_start:.2f}s"
    )

    return df, to_snap