def process_huc4s(src_dir, huc4s):
    """Read NHDPlusCatchment polygons from each HUC4 GDB, merge them, and add
    unique IDs."""
    merged = None
    for HUC4 in huc4s:
        print("\n\n------------------- Reading {} -------------------".format(
            HUC4))

        gdb = src_dir / HUC4 / "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)

        df = read_dataframe(gdb,
                            layer="NHDPlusCatchment",
                            columns=["NHDPlusID"])
        print(f"Read {len(df):,} catchments")

        df = df.dropna(subset=["NHDPlusID"])

        print("Kept {:,} catchments after dropping those without NHDPlusID".
              format(len(df)))

        df.NHDPlusID = df.NHDPlusID.astype("uint64")

        df = df.to_crs(CRS)
        merged = append(merged, df)

    df = merged

    # add uniqueID
    df["catchID"] = df.index.astype("uint32") + 1

    # add string version of NHDPlusID
    df["NHDIDSTR"] = df.NHDPlusID.astype("str")

    return df
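
Most of these examples accumulate results with an append helper that is defined elsewhere in the source project. A minimal sketch of what such a helper might do, assuming it simply concatenates (Geo)DataFrames and treats a None accumulator as empty:

import pandas as pd


def append(target, df):
    # Hypothetical stand-in for the project's append helper: start a new
    # frame when the accumulator is None, otherwise concatenate.
    if target is None:
        return df.copy()
    return pd.concat([target, df], ignore_index=True, sort=False)
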
        f"Found {ix.sum():,} dams associated with waterbodies in {time() - join_start:,.2f}s"
    )
    dams["geometry"] = dams.pt.values
    dams.loc[ix, "geometry"] = dams.loc[ix].drain.values.data
    dams.loc[ix, "lineID"] = dams.loc[ix].drainLineID.astype("uint32")

    dams = dams.drop(columns=["drain", "drainLineID", "pt"]).join(
        flowlines[["loop", "sizeclass"]], on="lineID"
    )

    # drop duplicates
    dams = dams.reset_index().drop_duplicates(
        subset=["damPtID", "damID", "lineID", "geometry"]
    )

    merged = append(merged, dams)

    print("Region done in {:.2f}s".format(time() - region_start))


print("----------------------------------------------")

dams = merged.reset_index(drop=True).join(
    nhd_dams.drop(columns=["geometry"]), on="damID"
)

nhd_dams = nhd_dams.loc[nhd_dams.index.isin(dams.damID.unique())].reset_index()

print(
    f"Found {len(nhd_dams):,} NHD dams and {len(dams):,} NHD dam / flowline crossings"
)
Example #3
        "Federal_Status",
        "State_Status",
        "SGCN_Listing",
        "Regional_SGCN",
    ]].rename(
        columns={
            "HUC12_Code": "HUC12",
            "Species_Name": "SNAME",
            "Common_Name": "CNAME",
            "Federal_Status": "federal",
            "State_Status": "state",
            "SGCN_Listing": "sgcn",
            "Regional_SGCN": "regional",
        })

    merged = append(merged, df)

df = merged

print("Processing species data")

# fix data issues
for col in df.columns[1:]:
    df[col] = df[col].fillna("").str.strip()

df = df.loc[df.SNAME.notnull() & (df.SNAME != "")].copy()
# drop duplicates
df = (df.sort_values(by=["HUC12", "SNAME", "CNAME"]).groupby(
    ["HUC12", "SNAME"]).first().reset_index())

# Update to overcome taxonomic issues
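
The sort / groupby / first() pattern above keeps exactly one record per (HUC12, SNAME) pair. A small worked example with made-up species records:

import pandas as pd

df = pd.DataFrame(
    {
        "HUC12": ["a", "a", "b"],
        "SNAME": ["Etheostoma", "Etheostoma", "Etheostoma"],
        "CNAME": ["darter", "Darter sp.", "darter"],
    }
)
df = (df.sort_values(by=["HUC12", "SNAME", "CNAME"]).groupby(
    ["HUC12", "SNAME"]).first().reset_index())
print(len(df))  # 2 -- one record per HUC12 / species name
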
Example #4
        # some records have missing or non-geometry values, filter them out
        df = df.loc[pg.is_geometry(df.geometry.values.data)].copy()

        if not len(df):
            continue

        df = df.to_crs(CRS)

        # Mark structurally altered types where
        # codes with x (excavated), d (ditched), r (artificial substrate), h (diked)
        # strip any terminal numbers then take last character

        df["modifier"] = df.nwi_code.str.rstrip("123456789").str[-1:]
        df["altered"] = df.modifier.isin(MODIFIERS)

        waterbodies = append(waterbodies,
                             df.loc[df.nwi_type.isin(["Lake", "Pond"])])
        rivers = append(
            rivers,
            df.loc[(df.nwi_type == "Riverine")
                   & (df.altered)].drop(columns=["nwi_type"]),
        )

    ### Process waterbodies
    # only keep waterbodies that intersect flowlines
    print(f"Extracted {len(waterbodies):,} NWI lakes and ponds")
    left, right = tree.query_bulk(waterbodies.geometry.values.data,
                                  predicate="intersects")
    waterbodies = waterbodies.iloc[np.unique(left)].reset_index(drop=True)
    print(f"Kept {len(waterbodies):,} that intersect flowlines")

    # TODO: explode, repair, dissolve, explode, reset index
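
For reference, the modifier extraction above can be checked in isolation. Here MODIFIERS is assumed to be the set of altered-type codes named in the comments ("x", "d", "r", "h"):

import pandas as pd

MODIFIERS = {"d", "h", "r", "x"}
codes = pd.Series(["PUBHx", "R2UBH", "PUBHh", "L1UBHd3"])
modifier = codes.str.rstrip("123456789").str[-1:]
print(modifier.tolist())  # ['x', 'H', 'h', 'd']
print(modifier.isin(MODIFIERS).tolist())  # [True, False, True, True]
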
Example #5
def process_huc4s(src_dir, out_dir, huc4s):
    merged_flowlines = None
    merged_joins = None
    merged_waterbodies = None
    merged_points = None
    merged_lines = None
    merged_poly = None
    merged_altered_rivers = None
    merged_marine = None

    for huc4 in huc4s:
        print(f"------------------- Reading {huc4} -------------------")

        huc_id = int(huc4) * 1000000

        gdb = src_dir / huc4 / f"NHDPLUS_H_{huc4}_HU4_GDB.gdb"

        ### Read flowlines and joins
        read_start = time()
        flowlines, joins = extract_flowlines(gdb, target_crs=CRS)
        print(
            f"Read {len(flowlines):,} flowlines in {time() - read_start:.2f} seconds"
        )

        flowlines["HUC4"] = huc4
        joins["HUC4"] = huc4

        # Calculate lineIDs to be unique across all HUC2s
        flowlines["lineID"] += huc_id
        # Set updated lineIDs with the HUC4 prefix
        joins.loc[joins.upstream_id != 0, "upstream_id"] += huc_id
        joins.loc[joins.downstream_id != 0, "downstream_id"] += huc_id

        merged_flowlines = append(merged_flowlines, flowlines)
        merged_joins = append(merged_joins, joins)

        ### Read waterbodies
        read_start = time()
        waterbodies = extract_waterbodies(gdb, target_crs=CRS)
        print("Read {:,} waterbodies in  {:.2f} seconds".format(
            len(waterbodies),
            time() - read_start))

        waterbodies["HUC4"] = huc4

        # calculate ids to be unique across region
        waterbodies["wbID"] += huc_id

        ### Only retain waterbodies that intersect flowlines
        print("Intersecting waterbodies and flowlines")
        # use waterbodies to query flowlines since there are many more flowlines
        tree = pg.STRtree(flowlines.geometry.values.data)
        left, right = tree.query_bulk(waterbodies.geometry.values.data,
                                      predicate="intersects")
        waterbodies = waterbodies.iloc[np.unique(left)].copy()
        print("Retained {:,} waterbodies that intersect flowlines".format(
            len(waterbodies)))

        merged_waterbodies = append(merged_waterbodies, waterbodies)

        ### Extract barrier points, lines, polygons
        points = extract_barrier_points(gdb, target_crs=CRS)
        points["HUC4"] = huc4
        points["id"] += huc_id
        merged_points = append(merged_points, points)

        lines = extract_barrier_lines(gdb, target_crs=CRS)
        lines["HUC4"] = huc4
        lines["id"] += huc_id
        merged_lines = append(merged_lines, lines)

        poly = extract_barrier_polygons(gdb, target_crs=CRS)
        poly["HUC4"] = huc4
        poly["id"] += huc_id
        merged_poly = append(merged_poly, poly)

        ### Extract altered rivers
        altered_rivers = extract_altered_rivers(gdb, target_crs=CRS)
        altered_rivers["HUC4"] = huc4
        altered_rivers["id"] += huc_id
        merged_altered_rivers = append(merged_altered_rivers, altered_rivers)

        ### Extract marine
        marine = extract_marine(gdb, target_crs=CRS)
        marine["HUC4"] = huc4
        merged_marine = append(merged_marine, marine)

    print("--------------------")

    flowlines = merged_flowlines.reset_index(drop=True)
    joins = merged_joins.reset_index(drop=True)
    waterbodies = merged_waterbodies.reset_index(drop=True)
    points = merged_points.reset_index(drop=True)
    lines = merged_lines.reset_index(drop=True)
    poly = merged_poly.reset_index(drop=True)
    altered_rivers = merged_altered_rivers.reset_index(drop=True)
    marine = merged_marine.reset_index(drop=True)

    ### Deduplicate waterbodies that are duplicated between adjacent HUC4s
    print("Removing duplicate waterbodies, starting with {:,}".format(
        len(waterbodies)))
    # Calculate a hash of the WKB bytes of the polygon.
    # This correctly catches polygons that are EXACTLY the same.
    # It will miss those that are NEARLY the same.

    waterbodies["hash"] = pd.util.hash_array(
        pg.to_wkb(waterbodies.geometry.values.data))

    id_map = (waterbodies.set_index("wbID")[["hash"]].join(
        waterbodies.groupby("hash").wbID.first(), on="hash").wbID)
    # keep only the first waterbody for each hash; the others are duplicates to drop
    waterbodies = (waterbodies.loc[waterbodies.wbID.isin(id_map)].drop(
        columns=["hash"]).reset_index(drop=True))
    print("{:,} waterbodies remain after removing duplicates".format(
        len(waterbodies)))

    ### Update the missing upstream_ids at the joins between HUCs.
    # These are the segments that are immediately DOWNSTREAM of segments that flow into this HUC4
    # We set a new UPSTREAM id for them based on the segment that is next upstream

    huc_in_idx = joins.loc[joins.type == "huc_in"].index
    cross_huc_joins = joins.loc[huc_in_idx]

    new_upstreams = (cross_huc_joins.join(
        joins.set_index("downstream").downstream_id.rename("new_upstream"),
        on="upstream",
    ).new_upstream.fillna(0).astype("uint32"))
    joins.loc[new_upstreams.index, "upstream_id"] = new_upstreams

    # update new internal joins
    joins.loc[(joins.type == "huc_in") & (joins.upstream_id != 0),
              "type"] = "internal"

    # remove the duplicate downstreams that used to be terminals for their respective HUCs
    joins = joins.loc[~(joins.upstream.isin(cross_huc_joins.upstream) &
                        (joins.type == "terminal"))]

    # remove dead ends
    joins = joins.loc[~((joins.downstream == 0) &
                        (joins.upstream == 0))].reset_index(drop=True)

    print("\n--------------------")

    print(f"serializing {len(flowlines):,} flowlines")
    flowlines.to_feather(out_dir / "flowlines.feather")
    joins.to_feather(out_dir / "flowline_joins.feather")

    print(f"serializing {len(waterbodies):,} waterbodies")
    waterbodies.to_feather(out_dir / "waterbodies.feather")

    # DEBUG:
    # write_dataframe(flowlines, out_dir / "flowlines.gpkg")
    # write_dataframe(waterbodies, out_dir / "waterbodies.gpkg")

    if len(points):
        print(f"serializing {len(points):,} NHD barrier points")
        points.to_feather(out_dir / "nhd_points.feather")
        # DEBUG:
        # write_dataframe(points, out_dir / 'nhd_points.gpkg')

    if len(lines):
        print(f"serializing {len(lines):,} NHD barrier lines")
        lines.to_feather(out_dir / "nhd_lines.feather")
        # DEBUG:
        # write_dataframe(lines, out_dir / 'nhd_lines.gpkg')

    if len(poly):
        print(f"serializing {len(poly):,} NHD barrier polygons")
        poly.to_feather(out_dir / "nhd_poly.feather")
        # DEBUG:
        # write_dataframe(poly, out_dir / 'nhd_poly.gpkg')

    if len(altered_rivers):
        print(f"serializing {len(altered_rivers):,} NHD altered rivers")
        altered_rivers.to_feather(out_dir / "nhd_altered_rivers.feather")
        # DEBUG:
        # write_dataframe(altered_rivers, out_dir / "nhd_altered_rivers.gpkg")

    if len(marine):
        print(f"serializing {len(marine):,} NHD marine areas")
        marine.to_feather(out_dir / "nhd_marine.feather")
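
The duplicate-waterbody step above detects exact duplicates by hashing the WKB bytes of each polygon and keeping the first feature per hash. A minimal standalone sketch of that pattern with hypothetical point data (pg is pygeos, as in the example):

import pandas as pd
import pygeos as pg

geoms = pg.points([0, 0, 1], [0, 0, 2])  # first two geometries are identical
df = pd.DataFrame({"wbID": [1, 2, 3], "geometry": geoms})
df["hash"] = pd.util.hash_array(pg.to_wkb(df.geometry.values))
keep_ids = df.groupby("hash").wbID.first()
df = df.loc[df.wbID.isin(keep_ids)].drop(columns=["hash"])
print(df.wbID.tolist())  # [1, 3] -- the duplicate of wbID 1 is dropped
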
Example #6
def get_network_results(df, network_type, barrier_type=None, rank=True):
    """Read network results, calculate derived metric classes, and calculate
    tiers.

    Tiers are only calculated for barriers that are not unranked
    (i.e., not invasive species barriers).

    Parameters
    ----------
    df : DataFrame
        barriers data; must contain State and unranked
    network_type : {"dams", "small_barriers"}
        network scenario
    barrier_type : {"dams", "small_barriers", "waterfalls"}, optional (default: None)
        if present, used to filter barrier kind from network results
    rank : bool, optional (default: True)
        if True, results will include tiers for the Southeast and state level

    Returns
    -------
    DataFrame
        Contains network metrics and tiers
    """

    barrier_type = barrier_type or network_type

    huc2s = [huc2 for huc2 in df.HUC2.unique() if huc2]

    networks = (
        read_feathers(
            [
                Path("data/networks/clean") / huc2 / f"{network_type}_network.feather"
                for huc2 in huc2s
            ],
            columns=NETWORK_COLUMNS,
        )
        .rename(columns=NETWORK_COLUMN_NAMES)
        .set_index("id")
    )

    # FIXME: temporary fix
    networks.PercentPerennialUnaltered = networks.PercentPerennialUnaltered.fillna(0)

    # select barrier type
    networks = networks.loc[networks.kind == barrier_type[:-1]].drop(columns=["kind"])

    # Convert dtypes to allow missing data when joined to barriers later
    # NOTE: upNetID or downNetID may be 0 if there aren't networks on that side, but
    # we set to int dtype instead of uint to allow -1 for missing data later
    for col in ["upNetID", "downNetID"]:
        networks[col] = networks[col].astype("int")

    for col in ["NumBarriersDownstream"]:
        networks[col] = networks[col].astype("int16")

    for column in ("Landcover", "SizeClasses", "FlowsToOcean"):
        networks[column] = networks[column].astype("int8")

    # sanity check to make sure no duplicate networks
    if networks.groupby(level=0).size().max() > 1:
        raise Exception(
            f"ERROR: multiple networks found for some {barrier_type}"
        )

    networks = networks.join(df[df.columns.intersection(["unranked", "State"])])

    # update data types and calculate total fields
    # calculate size classes GAINED instead of total
    # doesn't apply to those that don't have upstream networks
    networks.loc[networks.SizeClasses > 0, "SizeClasses"] -= 1

    # Calculate miles GAINED if barrier is removed
    # this is the lesser of the upstream or free downstream lengths.
    # Non-free miles downstream (downstream waterbodies) are omitted from this analysis.
    networks["GainMiles"] = networks[["TotalUpstreamMiles", "FreeDownstreamMiles"]].min(
        axis=1
    )
    networks["PerennialGainMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].min(axis=1)

    # TotalNetworkMiles is sum of upstream and free downstream miles
    networks["TotalNetworkMiles"] = networks[
        ["TotalUpstreamMiles", "FreeDownstreamMiles"]
    ].sum(axis=1)
    networks["TotalPerennialNetworkMiles"] = networks[
        ["PerennialUpstreamMiles", "FreePerennialDownstreamMiles"]
    ].sum(axis=1)

    # Round floating point columns to 3 decimals
    for column in [c for c in networks.columns if c.endswith("Miles")]:
        networks[column] = networks[column].round(3).fillna(-1).astype("float32")

    # Calculate network metric classes
    networks["GainMilesClass"] = classify_gainmiles(networks.GainMiles)
    networks["PerennialGainMilesClass"] = classify_gainmiles(
        networks.PerennialGainMiles
    )

    if not rank:
        return networks.drop(columns=["unranked", "State"], errors="ignore")

    # only calculate ranks / tiers for ranked barriers
    # (exclude unranked invasive spp. barriers)
    to_rank = networks.loc[~networks.unranked]

    ### Calculate regional tiers for SARP (Southeast) region
    # NOTE: this is limited to SARP region; other regions are not ranked at regional level
    # TODO: consider deprecating this
    ix = to_rank.State.isin(SARP_STATE_NAMES)
    sarp_tiers = calculate_tiers(to_rank.loc[ix])
    sarp_tiers = sarp_tiers.rename(
        columns={col: f"SE_{col}" for col in sarp_tiers.columns}
    )

    ### Calculate state tiers for each of total and perennial
    state_tiers = None
    for state in to_rank.State.unique():
        state_tiers = append(
            state_tiers,
            calculate_tiers(to_rank.loc[to_rank.State == state]).reset_index(),
        )

    state_tiers = state_tiers.set_index("id").rename(
        columns={col: f"State_{col}" for col in state_tiers.columns}
    )

    networks = networks.join(sarp_tiers).join(state_tiers)
    for col in [col for col in networks.columns if col.endswith("_tier")]:
        networks[col] = networks[col].fillna(-1).astype("int8")

    return networks.drop(columns=["unranked", "State"])
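
As the comments above note, GainMiles is the lesser of a barrier's total upstream miles and its free downstream miles. A tiny worked example of that calculation with made-up values:

import pandas as pd

networks = pd.DataFrame(
    {"TotalUpstreamMiles": [12.0, 3.5], "FreeDownstreamMiles": [4.0, 20.0]}
)
networks["GainMiles"] = networks[
    ["TotalUpstreamMiles", "FreeDownstreamMiles"]
].min(axis=1)
print(networks.GainMiles.tolist())  # [4.0, 3.5]
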
Example #7
def nearest(source, target, max_distance, keep_all=False):
    """Find the nearest target geometry for each record in source, if one
    can be found within distance.

    Parameters
    ----------
    source : Series
        contains pygeos geometries
    target : Series
        contains target pygeos geometries to search against
    max_distance : number or ndarray
        radius within which to find target geometries.
        If ndarray, must be the same length as source.
    keep_all : bool (default: False)
        If True, will keep all equidistant results

    Returns
    -------
    DataFrame
        indexed by the original index of source, with the index of the
        nearest target geometry and the distance for each record
    """

    left_index_name = source.index.name or "index"
    right_index_name = target.index.name or "index_right"

    tree = pg.STRtree(target.values.data)

    if np.isscalar(max_distance):
        (left_ix,
         right_ix), distance = tree.nearest_all(source.values.data,
                                                max_distance=max_distance,
                                                return_distance=True)

        # Note: there may be multiple equidistant or intersected results, so we take the first
        df = pd.DataFrame(
            {
                right_index_name: target.index.take(right_ix),
                "distance": distance,
            },
            index=source.index.take(left_ix),
        )

    else:  # array
        merged = None
        for d in np.unique(max_distance):
            ix = max_distance == d
            left = source.loc[ix]
            (left_ix,
             right_ix), distance = tree.nearest_all(left.values.data,
                                                    max_distance=d,
                                                    return_distance=True)
            merged = append(
                merged,
                pd.DataFrame(
                    {
                        left_index_name: left.index.take(left_ix),
                        right_index_name: target.index.take(right_ix),
                        "distance": distance,
                    }
                ),
            )
        df = merged.set_index(left_index_name)

    if keep_all:
        df = df.reset_index().drop_duplicates().set_index(left_index_name)
    else:
        df = df.groupby(level=0).first()

    df.index.name = source.index.name

    return df
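
The function above is built around pygeos.STRtree.nearest_all (available in pygeos 0.10+), which returns a (2, n) array of source/target index pairs plus distances, limited to max_distance. A minimal standalone illustration with hypothetical points:

import pygeos as pg

source = pg.points([0, 10], [0, 0])   # (0, 0) and (10, 0)
target = pg.points([1, 50], [0, 0])   # (1, 0) and (50, 0)
tree = pg.STRtree(target)
(left_ix, right_ix), distance = tree.nearest_all(
    source, max_distance=5, return_distance=True
)
print(left_ix, right_ix, distance)  # [0] [0] [1.] -- only source 0 has a match
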
Example #8
merged = None

for huc2 in units.keys():
    print(f"Processing floodplain stats for {huc2}")

    if huc2 == "02":
        filename = region02_gdb_filename
        layer = "Region002_Catchments_Natl_LCStats"
    else:
        filename = gdb_filename
        layer = layers[huc2]

    df = read_dataframe(filename, layer=layer)

    df["HUC2"] = huc2
    df["NHDPlusID"] = df.NHDIDSTR.astype("uint64")
    cols = [c for c in df.columns if c.startswith("VALUE_")]
    natural_cols = [c for c in cols if int(c.split("_")[1]) in NATURAL_TYPES]

    df["floodplain_km2"] = df[cols].sum(axis=1) * 1e-6
    df["nat_floodplain_km2"] = df[natural_cols].sum(axis=1) * 1e-6

    merged = append(
        merged,
        df[["NHDPlusID", "HUC2", "nat_floodplain_km2", "floodplain_km2"]])

merged.reset_index(drop=True).to_feather(src_dir / "floodplain_stats.feather")

print("Done in {:.2f}".format(time() - start))