def convert_to_bounding_box(input_array, trim_invalid_geometry=False, autocorrect_invalid_geometry=False):
    r"""Convert an input array to a BoundingBox array.

    Args:
        input_array (ndarray, list): A ndarray of BoundingBox optionally followed by a
            confidence value and/or a label where each row is:
            ``[xmin, ymin, xmax, ymax, (confidence), (label)]``
        trim_invalid_geometry (bool): Optional, default to ``False``. If ``True``, rows
            holding an invalid geometry are silently dropped, so the output may have fewer
            rows than the input. If ``False``, an invalid geometry raises an
            :exc:`~playground_metrics.utils.geometry_utils.InvalidGeometryError`.
        autocorrect_invalid_geometry (bool): Optional, default to ``False``. Unused; kept
            so that all the convert functions share the same interface.

    Returns:
        ndarray: A BoundingBox ndarray where each row contains a geometry followed by
        optionally confidence and a label e.g.: ``[BoundingBox, (confidence), (label)]``.
        For an empty input, the tuple ``('undefined', input_array)`` is returned instead.

    Raises:
        ValueError: If ``input_array`` have invalid dimensions.
    """
    array = np.array(input_array, dtype=np.dtype('O'))

    # Nothing to convert: mirror the other converters' empty-input behaviour.
    if array.size == 0:
        return 'undefined', array

    n_dims = len(array.shape)
    if n_dims == 1 or n_dims > 2:
        raise ValueError('Invalid array number of dimensions: '
                         'Expected a 2D array, found {}D.'.format(n_dims))

    # First four columns are the box extents; anything after is confidence/label.
    coordinates = array[:, :4].astype(np.float64)
    n_rows = array.shape[0]
    n_extra = array.shape[1] - 4

    result = np.ndarray((n_rows, n_extra + 1), dtype=np.dtype('O'))
    result[:, 0] = box(coordinates[:, 0], coordinates[:, 1],
                       coordinates[:, 2], coordinates[:, 3])
    result[:, 1:] = array[:, 4:]

    if trim_invalid_geometry:
        result = result[is_valid(result[:, 0]), :]

    return result
def make_valid(geometries):
    """Repair any invalid geometries in the input array.

    The input array is returned untouched when every geometry is already
    valid; otherwise a copy is made and only the invalid entries are
    replaced with their repaired counterparts.

    Parameters
    ----------
    geometries : ndarray of pygeos geometries

    Returns
    -------
    ndarray of pygeos geometries
    """
    invalid = ~pg.is_valid(geometries)
    n_invalid = invalid.sum()
    if not n_invalid:
        return geometries
    repaired = geometries.copy()
    print(f"Repairing {n_invalid} geometries")
    repaired[invalid] = pg.make_valid(repaired[invalid])
    return repaired
def is_valid(data):
    """Return an element-wise validity check for *data*.

    Dispatches to the pygeos implementation when the pygeos backend is
    enabled; otherwise falls back to the generic ``_unary_op`` path
    (treating missing geometries as invalid via ``null_value=False``).
    """
    return (
        pygeos.is_valid(data)
        if compat.USE_PYGEOS
        else _unary_op("is_valid", data, null_value=False)
    )
def convert_to_polygon(input_array, trim_invalid_geometry=False, autocorrect_invalid_geometry=False):
    r"""Convert an input array to a Polygon array.

    Args:
        input_array (ndarray, list): A ndarray of Polygons optionally followed by a confidence
            value and/or a label where each row is:
            ``[[[outer_ring], [inner_rings]], (confidence), (label)]``
        trim_invalid_geometry (bool): Optional, default to ``False``. If set to ``True``
            conversion will ignore invalid geometries and leave them out of ``output_array``.
            This means that the function will return an array where
            ``output_array.shape[0] <= input_array.shape[0]``. If set to ``False``, an invalid
            geometry will raise an
            :exc:`~playground_metrics.utils.geometry_utils.InvalidGeometryError`.
        autocorrect_invalid_geometry (bool): Optional, default to ``False``. Whether to attempt
            correcting a faulty geometry to form a valid one. If set to ``True`` and the
            autocorrect attempt is unsuccessful, it falls back to the behaviour defined in
            ``trim_invalid_geometry``.

    Note:
        * Polygon auto-correction only corrects self-crossing exterior rings, in which case it
          creates one Polygon out of every simple ring which might be extracted from the
          original Polygon exterior.
        * Polygon auto-correction will systematically fail on Polygons with at least one
          inner ring.

    Returns:
        ndarray: A Polygon ndarray where each row contains a geometry followed by optionally
        confidence and a label e.g.: ``[Polygon, (confidence), (label)]``. For an empty input,
        the tuple ``('undefined', input_array)`` is returned instead.

    Raises:
        ValueError: If ``input_array`` have invalid dimensions, or if some geometries could
            not be converted to valid polygons.
    """
    input_array = np.array(input_array, dtype=np.dtype('O'))

    # Mirror the other converters' empty-input behaviour.
    if input_array.size == 0:
        return 'undefined', input_array

    ndim = len(input_array.shape)
    # Accepted layouts: 2D object rows, 3D (row, polygon, ring) or 5D
    # (row, polygon, ring, point, xy) coordinate arrays. Note: a 0D scalar
    # input now gets this clear ValueError instead of a later IndexError.
    if ndim not in (2, 3, 5):
        raise ValueError('Invalid array number of dimensions: '
                         'Expected a 2D array, found {}D.'.format(ndim))
    if ndim == 5 and input_array.shape[4] != 2:
        # Bug fix: report the offending dimension size (previously formatted
        # len(input_array.shape), i.e. the rank, into the message).
        raise ValueError('Invalid array fifth dimension: '
                         'Expected 2, found {}.'.format(input_array.shape[4]))
    if ndim == 3 and input_array.shape[2] != 1:
        # Bug fix: same as above — report shape[2], not the rank.
        raise ValueError('Invalid array third dimension: '
                         'Expected 1, found {}.'.format(input_array.shape[2]))

    object_array = np.ndarray((input_array.shape[0], input_array.shape[1]), dtype=np.dtype('O'))
    for i, coordinate in enumerate(input_array[:, 0]):
        # coordinate[0] is the outer ring; any remaining entries are passed
        # as the second argument to polygons() (inner rings).
        if len(coordinate) > 1:
            geometry = polygons(np.array(coordinate[0], dtype=np.float64),
                                np.array(coordinate[1:], dtype=np.float64))
        else:
            geometry = polygons(np.array(coordinate[0], dtype=np.float64))
        line = [geometry]
        line.extend(input_array[i, 1:])
        object_array[i] = np.array(line, dtype=np.dtype('O'))

    if autocorrect_invalid_geometry:
        # Repair invalid polygons, then clean up the results
        # (presumably _clean_multi_geometries flattens multi-part outputs — confirm).
        object_array[:, 0] = _clean_multi_geometries(
            make_valid(object_array[:, 0]))

    if trim_invalid_geometry:
        object_array = object_array[is_valid(object_array[:, 0]), :]

    # Anything that is still not a polygon at this point cannot be represented.
    if not np.all(is_type(object_array[:, 0], GeometryType.POLYGON)):
        raise ValueError(
            'Conversion is impossible: Some geometries could not be converted to valid polygons.'
        )

    return object_array
# Subtract break geometries from waterbody polygons, repair the results, and
# write the per-HUC2 waterbody layer.
# NOTE(review): this is the tail of a larger per-HUC2 routine; `df`, `breaks`,
# `huc2`, `huc2_dir` and `huc2_start` are defined upstream — confirm against caller.

# Spatially index waterbodies and find which ones are intersected by break geometries.
tree = pg.STRtree(df.geometry.values.data)
left, right = tree.query_bulk(breaks, predicate="intersects")
pairs = pd.DataFrame(
    {"break_geometry": breaks.take(left)}, index=df.index.take(right)
)
# Merge all break geometries per waterbody into a single multipolygon,
# then subtract it from that waterbody.
grouped = pairs.groupby(level=0).break_geometry.apply(
    lambda g: pg.multipolygons(g.values.data)
)
df.loc[grouped.index, "geometry"] = pg.difference(
    df.loc[grouped.index].geometry.values.data, grouped.values
)
df = explode(df).reset_index(drop=True)

# make sure all polygons are valid
ix = ~pg.is_valid(df.geometry.values.data)
if ix.sum():
    print(f"Repairing {ix.sum()} invalid waterbodies")
    df.loc[ix, "geometry"] = pg.make_valid(df.loc[ix].geometry.values.data)

# NOTE(review): re-explode and keep only geometries with pygeos type id 3 —
# presumably polygons, since repair can emit multi-part/collection results; confirm.
df = explode(explode(df))
df = df.loc[pg.get_type_id(df.geometry.values.data) == 3].reset_index()

# assign a new unique wbID
df["wbID"] = df.index.values.astype("uint32") + 1 + int(huc2) * 1000000
df["km2"] = pg.area(df.geometry.values.data) / 1e6

df.to_feather(huc2_dir / "waterbodies.feather")
write_dataframe(df, huc2_dir / "waterbodies.gpkg")

print("--------------------")
print(f"HUC2: {huc2} done in {time() - huc2_start:.0f}s\n\n")
df.loc[(df.nwi_type == "Riverine") & (df.altered)].drop(columns=["nwi_type"]), ) ### Process waterbodies # only keep that intersect flowlines print(f"Extracted {len(waterbodies):,} NWI lakes and ponds") left, right = tree.query_bulk(waterbodies.geometry.values.data, predicate="intersects") waterbodies = waterbodies.iloc[np.unique(left)].reset_index(drop=True) print(f"Kept {len(waterbodies):,} that intersect flowlines") # TODO: explode, repair, dissolve, explode, reset index waterbodies = explode(waterbodies) # make valid ix = ~pg.is_valid(waterbodies.geometry.values.data) if ix.sum(): print(f"Repairing {ix.sum():,} invalid waterbodies") waterbodies.loc[ix, "geometry"] = pg.make_valid( waterbodies.loc[ix].geometry.values.data) # note: nwi_code, nwi_type are discarded here since they aren't used later print("Dissolving adjacent waterbodies") waterbodies = dissolve(waterbodies, by=["altered"]) waterbodies = explode(waterbodies).reset_index(drop=True) waterbodies["km2"] = pg.area(waterbodies.geometry.values.data) / 1e6 waterbodies.to_feather(huc2_dir / "waterbodies.feather") write_dataframe(waterbodies, huc2_dir / "waterbodies.gpkg")
def cut_lines_by_waterbodies(flowlines, joins, waterbodies, wb_joins, out_dir):
    """
    Cut lines by waterbodies.

    1. Intersects all previously intersected flowlines with waterbodies.
    2. For those that cross but are not completely contained by waterbodies, cut them.
    3. Evaluate the cuts, only those that have substantive cuts inside and outside are retained as cuts.
    4. Any flowlines that are not contained or crossing waterbodies are dropped from joins

    Parameters
    ----------
    flowlines : GeoDataFrame
    joins : DataFrame
        flowline joins
    waterbodies : GeoDataFrame
    wb_joins : DataFrame
        waterbody flowline joins
    out_dir : pathlib.Path
        output directory for writing error files, if needed

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame, GeoDataFrame, DataFrame)
        (flowlines, joins, waterbodies, waterbody joins)
    """
    start = time()

    # Only flowlines that participate in a waterbody join are analyzed.
    fl_geom = flowlines.loc[flowlines.index.isin(wb_joins.lineID), ["geometry"]].copy()

    # Many waterbodies have interior polygons (islands); these break the analysis below for cutting lines
    # Extract a new polygon of just their outer boundary
    wb_geom = waterbodies[["geometry"]].copy()
    wb_geom["waterbody"] = pg.polygons(pg.get_exterior_ring(wb_geom.geometry))

    print("Validating waterbodies...")
    ix = ~pg.is_valid(wb_geom.waterbody)
    invalid_count = ix.sum()
    if invalid_count:
        print("{:,} invalid waterbodies found, repairing...".format(invalid_count))

        # Buffer by 0 to fix
        # TODO: may need to do this by a small fraction and simplify instead
        repair_start = time()
        wb_geom.loc[ix, "waterbody"] = pg.buffer(wb_geom.loc[ix].waterbody, 0)
        waterbodies.loc[ix, "geometry"] = wb_geom.loc[ix].waterbody
        print("Repaired geometry in {:.2f}s".format(time() - repair_start))

    # Set indices and create combined geometry object for analysis
    wb_joins = wb_joins.set_index(["lineID", "wbID"])
    geoms = wb_joins.join(fl_geom, how="inner").join(wb_geom.waterbody)

    ### Find contained geometries
    print(
        "Identifying flowlines completely within waterbodies out of {:,} flowline / waterbody combinations...".format(
            len(geoms)
        )
    )
    contained_start = time()
    geoms["inside"] = pg.contains(geoms.waterbody.values, geoms.geometry.values)

    print(
        "Identified {:,} flowlines completely contained by waterbodies in {:.2f}s".format(
            geoms.inside.sum(), time() - contained_start
        )
    )

    # Check for logic errors - no flowline should be completely contained by more than 1 waterbody
    errors = geoms.groupby(level=[0]).inside.sum().astype("uint8") > 1
    if errors.max():
        # this most likely indicates duplicate waterbodies, which should have been resolved before this
        print(
            "ERROR: major logic error - some flowlines claim to be completely contained by multiple waterbodies"
        )
        print(
            "===> error flowlines written to {}/contained_errors.feather".format(
                out_dir
            )
        )
        # NOTE(review): `errors` is a boolean Series, so `isin(errors)` tests
        # membership against its *values* (True/False), not the offending
        # lineIDs — looks like `errors.loc[errors].index` was intended; confirm.
        to_geofeather(
            flowlines.loc[flowlines.index.isin(errors)],
            out_dir / "contained_errors.feather",
            crs=CRS,
        )

    ### Check those that aren't contained to see if they cross
    print("Determining which flowlines actually cross into waterbodies...")
    cross_start = time()
    geoms = geoms.loc[~geoms.inside].copy()
    geoms["crosses"] = pg.crosses(geoms.geometry, geoms.waterbody)
    outside = geoms.loc[~(geoms["crosses"] | geoms.inside)].index

    # keep the ones that cross for further processing
    geoms = geoms.loc[geoms.crosses].copy()

    print(
        "Identified {:,} flowlines completely outside waterbodies and {:,} flowlines that cross waterbody boundaries in {:.2f}s".format(
            len(outside), len(geoms), time() - cross_start
        )
    )

    # Any that do not cross and are not completely within waterbodies should be dropped now
    # Can only drop joins by BOTH lineID and wbID (the index here)
    # Also drop associated waterbodies that no longer have joins
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy()

    # FIXME: for closely adjacent waterbodies, these are important to keep
    # Need to cut them by their multiple polys, update their joins, and feed back into following analysis
    # pg.intersection_all might work here

    # check for multiple crossings - these are errors from NHD that we can drop from here
    errors = geoms.groupby(level=0).size() > 1
    if errors.max():
        print(
            "Found {:,} flowlines that cross multiple waterbodies. These are bad data and will be dropped from waterbody intersection.".format(
                errors.sum()
            )
        )
        # NOTE(review): `errors.index` holds *all* group keys, not just the ones
        # with multiple crossings — presumably `errors.loc[errors].index` was
        # intended here; confirm.
        to_geofeather(
            flowlines.loc[errors.index].reset_index(),
            out_dir / "error_crosses_multiple.feather",
            crs=CRS,
        )

        # completely remove the flowlines from intersections and drop the waterbodies
        wb_joins = wb_joins.loc[
            ~wb_joins.index.get_level_values(0).isin(errors.loc[errors].index)
        ].copy()
        waterbodies = waterbodies.loc[
            waterbodies.index.isin(wb_joins.index.get_level_values(1))
        ].copy()
        geoms = geoms.loc[geoms.index.isin(wb_joins.index)].copy()

    print("Calculating geometric intersection of flowlines and waterbodies...")
    int_start = time()
    geoms = geoms[["geometry", "waterbody"]].join(flowlines.length.rename("origLength"))

    # First, calculate the geometric intersection between the lines and waterbodies
    # WARNING: this intersection may return LineString, MultiLineString, Point, GeometryCollection
    geoms["intersection"] = pg.intersection(geoms.geometry, geoms.waterbody)
    types = pg.get_type_id(geoms.intersection)

    # NOTE: all the points should be captured by the above logic for crosses
    is_point = types.isin([0, 4])
    is_line = types.isin([1, 5])
    others = types[~(is_point | is_line)].unique()

    # GeometryCollection indicates a mess, skip those
    if len(others):
        print(
            "WARNING: Found other types of geometric intersection: {} (n={:,}), these will be dropped".format(
                others, len(types[~(is_point | is_line)])
            )
        )

    # Any that intersect only at a point are OUTSIDE
    outside = geoms.loc[is_point].index  # TODO: confirm this works
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside)].copy()
    print("Identified {:,} more flowlines outside waterbodies".format(len(outside)))

    # Drop those that are not lines from further analysis
    geoms = geoms.loc[is_line].copy()

    # Inspect amount of overlay - if the intersected length is within 1m of final length, it is completely within
    # if it is near 0, it is completely outside
    geoms["length"] = pg.length(geoms.intersection)
    outside = geoms.length < 1
    inside = (geoms.origLength - geoms.length).abs() < 1

    print(
        "Found {:,} more completely outside, {:,} completely inside".format(
            outside.sum(), inside.sum()
        )
    )

    # drop the ones that are outside
    wb_joins = wb_joins.loc[~wb_joins.index.isin(outside[outside].index)].copy()

    # cut the ones that aren't completely inside or outside
    geoms = geoms.loc[~(inside | outside)].copy()

    print("Done evaluating intersection in {:.2f}s".format(time() - int_start))

    if len(geoms):
        print("Cutting {:,} flowlines ...".format(len(geoms)))
        cut_start = time()
        geoms = geoms[["geometry", "waterbody", "origLength"]]

        # WARNING: difference is not precise, the point of split is not exactly at the intersection between lines
        # but within some tolerance. This will cause them to fail the contains() test below.
        boundary = pg.boundary(geoms.waterbody)
        geoms["geometry"] = pg.difference(geoms.geometry, boundary)

        errors = ~pg.is_valid(geoms.geometry)
        if errors.max():
            print("WARNING: geometry errors for {:,} cut lines".format(errors.sum()))

        # A cut should preserve total length; large deviation means the cut failed.
        length = pg.length(geoms.geometry)
        errors = (length - geoms.origLength).abs() > 1
        if errors.max():
            print(
                "WARNING: {:,} lines were not completely cut by waterbodies (maybe shared edge?).\nThese will not be cut".format(
                    errors.sum()
                )
            )
            to_geofeather(
                flowlines.loc[
                    errors.loc[errors].index.get_level_values(0).unique()
                ].reset_index(),
                out_dir / "error_incomplete_cut.feather",
                crs=CRS,
            )

            # remove these from the cut geoms and retain their originals
            geoms = geoms.loc[~errors].copy()

        # Explode the multilines into single line segments
        geoms["geometry"] = explode(geoms.geometry)
        geoms = geoms.explode("geometry")

        # mark those parts of the cut lines that are within waterbodies
        # WARNING: this is not capturing all that should be inside after cutting!
        geoms["iswithin"] = pg.contains(geoms.waterbody, geoms.geometry)

        errors = geoms.groupby(level=0).iswithin.max() == False
        if errors.max():
            print(
                "WARNING: {:,} flowlines that cross waterbodies had no parts contained within those waterbodies".format(
                    errors.sum()
                )
            )
            # NOTE(review): as above, `errors.index` includes every group key, not
            # only the error rows — verify whether `errors.loc[errors].index` was
            # intended.
            to_geofeather(
                flowlines.loc[errors.index].reset_index(),
                out_dir / "error_crosses_but_not_contained.feather",
                crs=CRS,
            )

            # If they cross, assume they are within
            print("Attempting to correct these based on which ones cross")
            ix = geoms.loc[
                geoms.index.get_level_values(0).isin(errors.loc[errors].index)
            ].index
            geoms.loc[ix, "iswithin"] = pg.crosses(
                geoms.loc[ix].geometry, geoms.loc[ix].waterbody
            )
            errors = geoms.groupby(level=0).iswithin.max() == False
            print("{:,} still have no part in a waterbody".format(errors.sum()))

        # calculate total length of within and outside parts
        geoms["length"] = pg.length(geoms.geometry)

        # drop any new segments that are < 1m, these are noise
        print("Dropping {:,} new segments < 1m".format((geoms.length < 1).sum()))
        geoms = geoms.loc[geoms.length >= 1].copy()

        # NOTE(review): `> 1` skips the single-segment case — confirm whether
        # `len(geoms)` (i.e. > 0) was intended.
        if len(geoms) > 1:
            length = geoms.groupby(["lineID", "wbID", "iswithin"]).agg(
                {"length": "sum", "origLength": "first"}
            )

            # Anything within 1 meter of original length is considered unchanged
            # This is so that we ignore slivers
            length["unchanged"] = (length.origLength - length["length"]).abs() < 1
            unchanged = (
                length[["unchanged"]]
                .reset_index()
                .groupby(["lineID", "wbID"])
                .unchanged.max()
                .rename("max_unchanged")
            )
            unchanged = (
                length.reset_index().set_index(["lineID", "wbID"]).join(unchanged)
            )
            is_within = (
                unchanged.loc[unchanged.max_unchanged]
                .reset_index()
                .set_index(["lineID", "wbID"])
                .iswithin
            )

            # For any that are unchanged and NOT within waterbodies,
            # remove them from wb_joins
            ix = is_within.loc[~is_within].index
            wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy()

            # Remove any that are unchanged from intersection analysis
            geoms = geoms.loc[~geoms.index.isin(is_within.index)].copy()

        print(
            "Created {:,} new flowlines by splitting {:,} flowlines at waterbody edges in {:.2f}".format(
                len(geoms),
                len(geoms.index.get_level_values(0).unique()),
                time() - cut_start,
            )
        )

        if len(geoms) > 1:
            ### These are our final new lines to add
            # remove their lineIDs from flowlines and append
            # replace their outer joins to these ones and add intermediates

            # Join in previous line information from flowlines
            new_lines = (
                geoms[["geometry", "length", "iswithin"]]
                .reset_index()
                .set_index("lineID")
                .join(flowlines.drop(columns=["geometry", "length", "sinuosity"]))
                .reset_index()
                .rename(columns={"lineID": "origLineID", "iswithin": "waterbody"})
            )

            error = (
                new_lines.groupby("origLineID").wbID.unique().apply(len).max() > 1
            )
            if error:
                # Watch for errors - if a flowline is cut by multiple waterbodies
                # there will be problems with our logic for splicing in new lines
                # also - our intersection logic above is wrong
                print(
                    """\n========\n MAJOR LOGIC ERROR: multiple waterbodies associated with a single flowline that as been cut. \n========\n """
                )

            # recalculate length and sinuosity
            new_lines["length"] = pg.length(new_lines.geometry).astype("float32")
            new_lines["sinuosity"] = calculate_sinuosity(new_lines.geometry).astype(
                "float32"
            )

            # calculate new IDS
            next_segment_id = int(flowlines.index.max() + 1)
            new_lines["lineID"] = next_segment_id + new_lines.index
            new_lines.lineID = new_lines.lineID.astype("uint32")

            ### Update waterbody joins
            # remove joins replaced by above
            ix = new_lines.set_index(["origLineID", "wbID"]).index
            wb_joins = wb_joins.loc[~wb_joins.index.isin(ix)].copy()

            # add new joins
            wb_joins = (
                wb_joins.reset_index()
                .append(
                    new_lines.loc[new_lines.waterbody, ["lineID", "wbID"]],
                    ignore_index=True,
                    sort=False,
                )
                .set_index(["lineID", "wbID"])
            )

            ### Update flowline joins
            # transform new lines to create new joins
            l = new_lines.groupby("origLineID").lineID

            # the first new line per original line is the furthest upstream, so use its
            # ID as the new downstream ID for anything that had this origLineID as its downstream
            first = l.first().rename("new_downstream_id")

            # the last new line per original line is the furthest downstream...
            last = l.last().rename("new_upstream_id")

            # Update existing joins with the new lineIDs we created at the upstream or downstream
            # ends of segments we just created
            joins = update_joins(
                joins,
                first,
                last,
                downstream_col="downstream_id",
                upstream_col="upstream_id",
            )

            ### Create new line joins for any that weren't inserted above
            # Transform all groups of new line IDs per original lineID, wbID
            # into joins structure
            pairs = lambda a: pd.Series(zip(a[:-1], a[1:]))
            new_joins = (
                new_lines.groupby(["origLineID", "wbID"])
                .lineID.apply(pairs)
                .apply(pd.Series)
                .reset_index()
                .rename(columns={0: "upstream_id", 1: "downstream_id"})
                .join(
                    flowlines[["NHDPlusID", "loop"]].rename(
                        columns={"NHDPlusID": "upstream"}
                    ),
                    on="origLineID",
                )
            )

            # NHDPlusID is same for both sides
            new_joins["downstream"] = new_joins.upstream
            new_joins["type"] = "internal"
            new_joins = new_joins[
                [
                    "upstream",
                    "downstream",
                    "upstream_id",
                    "downstream_id",
                    "type",
                    "loop",
                ]
            ]

            joins = joins.append(
                new_joins, ignore_index=True, sort=False
            ).sort_values(["downstream_id", "upstream_id"])

            ### Update flowlines
            # remove originals now replaced by cut versions here
            flowlines = (
                flowlines.loc[~flowlines.index.isin(new_lines.origLineID)]
                .reset_index()
                .append(
                    new_lines[["lineID"] + list(flowlines.columns) + ["waterbody"]],
                    ignore_index=True,
                    sort=False,
                )
                .sort_values("lineID")
                .set_index("lineID")
            )

        # End cut geometries

    # Update waterbody bool for other flowlines based on those that completely intersected
    # above
    flowlines.loc[
        flowlines.index.isin(wb_joins.index.get_level_values(0).unique()), "waterbody"
    ] = True
    flowlines.waterbody = flowlines.waterbody.fillna(False)

    ### Update waterbodies and calculate flowline stats
    wb_joins = wb_joins.reset_index()
    stats = (
        wb_joins.join(flowlines.length.rename("flowlineLength"), on="lineID")
        .groupby("wbID")
        .flowlineLength.sum()
        .astype("float32")
    )
    waterbodies = waterbodies.loc[waterbodies.index.isin(wb_joins.wbID)].join(stats)

    print("Done cutting flowlines by waterbodies in {:.2f}s".format(time() - start))

    return flowlines, joins, waterbodies, wb_joins
}).set_index("index")) nhd_dams = nhd_dams.join(groups) # Extract composite names for the group name = (nhd_dams.groupby("group").GNIS_Name.unique().apply( lambda n: ", ".join([s for s in n if s]))) nhd_dams = (dissolve( nhd_dams[["HUC2", "group", "geometry"]], by="group").join(name).reset_index(drop=True).rename(columns={ "group": "id" }).set_index("id")) nhd_dams.GNIS_Name = nhd_dams.GNIS_Name.fillna("") # cleanup invalid geometries ix = ~pg.is_valid(nhd_dams.geometry) nhd_dams.loc[ix, "geometry"] = pg.buffer(nhd_dams.loc[ix].geometry, 0.1, quadsegs=1) ### Intersect with flowlines by region merged = None for region, HUC2s in list(REGION_GROUPS.items()): region_start = time() print("\n----- {} ------\n".format(region)) print("Reading flowlines...") flowlines = from_geofeather(src_dir / region / "flowlines.feather").set_index("lineID") joins = deserialize_df(