Example #1
def test_get_parts(geom):
    expected_num_parts = pygeos.get_num_geometries(geom)
    expected_parts = pygeos.get_geometry(geom, range(0, expected_num_parts))

    parts = pygeos.get_parts(geom)
    assert len(parts) == expected_num_parts
    assert np.all(pygeos.equals_exact(parts, expected_parts))
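For reference, a minimal sketch of the behavior this test exercises, assuming a pygeos version that provides get_parts (the same API used above):

import numpy as np
import pygeos

mp = pygeos.multipoints([(0, 0), (1, 1), (2, 2)])
parts = pygeos.get_parts(mp)                      # flattens into single parts
print(len(parts), pygeos.get_num_geometries(mp))  # 3 3
# get_geometry is vectorized over the index argument, as in the test above
sub = pygeos.get_geometry(mp, range(3))
print(np.all(pygeos.equals_exact(parts, sub)))    # True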
Example #2
def close_gaps(df, tolerance):
    """Close gaps in LineString geometry where it should be contiguous.

    Snaps both lines to a centroid of a gap in between.

    """
    geom = df.geometry.values.data
    coords = pygeos.get_coordinates(geom)
    indices = pygeos.get_num_coordinates(geom)

    # generate a list of start and end coordinates and create point geometries
    edges = [0]
    i = 0
    for ind in indices:
        ix = i + ind
        edges.append(ix - 1)
        edges.append(ix)
        i = ix
    edges = edges[:-1]
    points = pygeos.points(np.unique(coords[edges], axis=0))

    buffered = pygeos.buffer(points, tolerance)

    dissolved = pygeos.union_all(buffered)

    exploded = [
        pygeos.get_geometry(dissolved, i)
        for i in range(pygeos.get_num_geometries(dissolved))
    ]

    centroids = pygeos.centroid(exploded)

    snapped = pygeos.snap(geom, pygeos.union_all(centroids), tolerance)

    return snapped
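A usage sketch, assuming geopandas with the pygeos backend enabled (so that df.geometry.values.data yields a pygeos array); network.gpkg is a hypothetical input:

import geopandas as gpd

df = gpd.read_file("network.gpkg")        # LineStrings with small gaps
snapped = close_gaps(df, tolerance=0.25)  # numpy array of pygeos LineStrings
df.geometry = gpd.GeoSeries(snapped, index=df.index, crs=df.crs)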
Example #3
def explode(df):
    """Explodes multipart geometries to single parts.  Attributes are copied
    to each individual geometry.

    NOTE: A faster method is not yet available in released pygeos; it is
    proposed in https://github.com/pygeos/pygeos/pull/130, and that branch
    must be checked out and built for this functionality.

    Parameters
    ----------
    df : GeoDataFrame

    Returns
    -------
    GeoDataFrame
    """

    # Fast method:
    # ix, parts = pg.get_parts(df.geometry.values.data)
    # series = pd.Series(parts, index=df.index[ix], name="geometry")
    # return df.drop(columns=["geometry"]).join(series)

    # Slower method
    geometries = df.geometry.values.data
    ix = []
    parts = []
    for i in range(len(df)):
        num_parts = pg.get_num_geometries(geometries[i])
        ix.extend(np.repeat(df.index[i], num_parts))
        parts.extend(pg.get_geometry(geometries[i], range(num_parts)))

    return gp.GeoDataFrame({
        "geometry": parts
    }, index=ix, crs=df.crs).join(df.drop(columns=["geometry"]))
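A usage sketch, assuming the gp/pg aliases above (geopandas/pygeos) and the pygeos backend; the toy MultiPolygon is hypothetical:

import geopandas as gp
from shapely.geometry import MultiPolygon, box

df = gp.GeoDataFrame(
    {"name": ["a"]},
    geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)])],
    crs="EPSG:4326",
)
singles = explode(df)  # two rows; "name" is copied to each part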
Example #4
File: benchmarks.py  Project: 92kns/Shapely
    def time_get_parts_python(self):
        """Python / ufuncs version of get_parts"""

        parts = []
        for i in range(len(self.multipolygons)):
            num_parts = pygeos.get_num_geometries(self.multipolygons[i])
            parts.append(pygeos.get_geometry(self.multipolygons[i], range(num_parts)))

        parts = np.concatenate(parts)
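For comparison, the vectorized path this Python-loop benchmark is presumably measured against is a single call into C (a sketch, assuming the same self.multipolygons fixture):

def time_get_parts(self):
    """Vectorized (C) version of get_parts"""
    pygeos.get_parts(self.multipolygons)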
Example #5
def test_get_parts_array():
    # note: this also verifies that None is handled correctly
    # in the mix; internally it returns -1 for count of geometries
    geom = np.array([None, empty_line_string, multi_point, point, multi_polygon])
    expected_parts = []
    for g in geom:
        for i in range(0, pygeos.get_num_geometries(g)):
            expected_parts.append(pygeos.get_geometry(g, i))

    parts = pygeos.get_parts(geom)
    assert len(parts) == len(expected_parts)
    assert np.all(pygeos.equals_exact(parts, expected_parts))
Example #6
def test_get_parts_geometry_collection_multi():
    """On the first pass, the individual Multi* geometry objects are returned
    from the collection.  On the second pass, the individual singular geometry
    objects within those are returned.
    """
    geom = pygeos.geometrycollections([multi_point, multi_line_string, multi_polygon])
    expected_num_parts = pygeos.get_num_geometries(geom)
    expected_parts = pygeos.get_geometry(geom, range(0, expected_num_parts))

    parts = pygeos.get_parts(geom)
    assert len(parts) == expected_num_parts
    assert np.all(pygeos.equals_exact(parts, expected_parts))

    expected_subparts = []
    for g in np.asarray(expected_parts):
        for i in range(0, pygeos.get_num_geometries(g)):
            expected_subparts.append(pygeos.get_geometry(g, i))

    subparts = pygeos.get_parts(parts)
    assert len(subparts) == len(expected_subparts)
    assert np.all(pygeos.equals_exact(subparts, expected_subparts))
Example #7
def test_get_parts_return_index():
    geom = np.array([multi_point, point, multi_polygon])
    expected_parts = []
    expected_index = []
    for i, g in enumerate(geom):
        for j in range(0, pygeos.get_num_geometries(g)):
            expected_parts.append(pygeos.get_geometry(g, j))
            expected_index.append(i)

    parts, index = pygeos.get_parts(geom, return_index=True)
    assert len(parts) == len(expected_parts)
    assert np.all(pygeos.equals_exact(parts, expected_parts))
    assert np.array_equal(index, expected_index)
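A quick illustration of return_index with hypothetical inputs; this returned index is also what makes the fast explode in Example #3 possible:

import numpy as np
import pygeos

geoms = np.array([pygeos.multipoints([(0, 0), (1, 1)]), pygeos.points(5, 5)])
parts, index = pygeos.get_parts(geoms, return_index=True)
print(index)  # [0 0 1]: maps each part back to its input geometry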
Example #8
def close_gaps(gdf, tolerance):
    """Close gaps in LineString geometry where it should be contiguous.

    Snaps both lines to a centroid of a gap in between.

    Parameters
    ----------
    gdf : GeoDataFrame, GeoSeries
        GeoDataFrame or GeoSeries containing LineString representation of a network.
    tolerance : float
        nodes within a tolerance will be snapped together

    Returns
    -------
    GeoSeries
    
    See also
    --------
    momepy.extend_lines
    momepy.remove_false_nodes

    """
    geom = gdf.geometry.values.data
    coords = pygeos.get_coordinates(geom)
    indices = pygeos.get_num_coordinates(geom)

    # generate a list of start and end coordinates and create point geometries
    edges = [0]
    i = 0
    for ind in indices:
        ix = i + ind
        edges.append(ix - 1)
        edges.append(ix)
        i = ix
    edges = edges[:-1]
    points = pygeos.points(np.unique(coords[edges], axis=0))

    buffered = pygeos.buffer(points, tolerance / 2)

    dissolved = pygeos.union_all(buffered)

    exploded = [
        pygeos.get_geometry(dissolved, i)
        for i in range(pygeos.get_num_geometries(dissolved))
    ]

    centroids = pygeos.centroid(exploded)

    snapped = pygeos.snap(geom, pygeos.union_all(centroids), tolerance)

    return gpd.GeoSeries(snapped, crs=gdf.crs)
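This is the momepy version of the function from Example #2; a usage sketch with a hypothetical input file:

import geopandas as gpd
import momepy

streets = gpd.read_file("streets.gpkg")  # LineString network
streets.geometry = momepy.close_gaps(streets, tolerance=0.5)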
Example #9
def explode(series):
    """Convert multipart geometries to a list of geometries

    Parameters
    ----------
    series : Series

    Returns
    -------
    Series

    """
    return series.apply(
        lambda g: [pg.get_geometry(g, i) for i in range(0, pg.get_num_geometries(g))]
    )
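A usage sketch, assuming the pg alias (pygeos):

import pandas as pd
import pygeos as pg

s = pd.Series([pg.multipoints([(0, 0), (1, 1)]), pg.points(2, 2)])
print(explode(s))  # each row becomes a list of single-part geometries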
Example #10
def extract_waterbodies(gdb_path, target_crs):
    """Extract waterbodies from NHDPlusHR data product that are are not one of
    the excluded types (e.g., estuary, playa, swamp/marsh).

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.

    Returns
    -------
    GeoDataFrame
    """
    print("Reading waterbodies")
    df = read_dataframe(
        gdb_path,
        layer="NHDWaterbody",
        columns=[WATERBODY_COLS],
        force_2d=True,
        where=f"FType not in {tuple(WATERBODY_EXCLUDE_FTYPES)}",
    )
    print("Read {:,} waterbodies".format(len(df)))

    # Convert multipolygons to polygons
    # those we checked that are true multipolygons are errors
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)
    df.geometry = make_valid(df.geometry.values.data)

    print("projecting to target projection")
    df = df.to_crs(target_crs)

    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df.AreaSqKm = df.AreaSqKm.astype("float32")
    df.FType = df.FType.astype("uint16")

    ### Add calculated fields
    df["wbID"] = df.index.values.astype("uint32") + 1

    return df
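A usage sketch; the geodatabase path is hypothetical, and WATERBODY_COLS / WATERBODY_EXCLUDE_FTYPES must already be defined in the module:

df = extract_waterbodies("NHDPLUS_H_0101_HU4_GDB.gdb", target_crs="EPSG:5070")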
Example #11
def to_dict(geometry):
    """Convert pygeos Geometry object to a dictionary representation.
    Equivalent to structure of GeoJSON.

    Parameters
    ----------
    geometry : pygeos Geometry object (singular)

    Returns
    -------
    dict
        GeoJSON dict representation of geometry
    """
    geometry = pg.normalize(geometry)

    def get_ring_coords(polygon):
        # outer ring must be reversed ([::-1]) to be counterclockwise
        coords = [pg.get_coordinates(pg.get_exterior_ring(polygon)).tolist()[::-1]]
        for i in range(pg.get_num_interior_rings(polygon)):
            # inner rings must be reversed ([::-1]) to be clockwise
            coords.append(
                pg.get_coordinates(pg.get_interior_ring(polygon, i)).tolist()[::-1])

        return coords

    geom_type = GEOJSON_TYPE[pg.get_type_id(geometry)]
    coords = []

    if geom_type == "MultiPolygon":
        coords = []
        geoms = pg.get_geometry(geometry,
                                range(pg.get_num_geometries(geometry)))
        for geom in geoms:
            coords.append(get_ring_coords(geom))

    elif geom_type == "Polygon":
        coords = get_ring_coords(geometry)

    else:
        raise NotImplementedError("Not built")

    return {"type": geom_type, "coordinates": coords}
Example #12
def _clean_multi_geometries(object_array):
    """Cleanup a sequence of geometries to remove multi geometries.

    Args:
        object_array (numpy.ndarray): The object array to cleanup

    Returns:
        numpy.ndarray: Cleaned-up object array.

    """
    # Handle multi-geometries
    geometries = object_array[:, 0]
    num_geometries = get_num_geometries(geometries)
    for index in np.nonzero(num_geometries > 1)[0]:
        # one new row per part: the single-part geometry plus the original attributes
        split_geometries = [
            np.concatenate(
                ([get_geometry(geometries[index], i)], object_array[index, 1:])
            )
            for i in range(num_geometries[index])
        ]
        # first part replaces the original row; remaining parts are appended
        object_array[index] = split_geometries[0]
        object_array = np.concatenate(
            (object_array, np.array(split_geometries[1:], dtype=object))
        )

    return object_array
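A usage sketch with a hypothetical two-column object array of (geometry, attribute):

import numpy as np
from pygeos import multipoints, points

arr = np.array(
    [[multipoints([(0, 0), (1, 1)]), "a"], [points(5, 5), "b"]], dtype=object
)
cleaned = _clean_multi_geometries(arr)  # 3 rows: the MultiPoint split in two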
Example #13
File: occult.py  Project: vmario89/occult
def occult(lines: LineCollection, tolerance: float) -> LineCollection:
    """
    Remove occulted lines.

    The order of the geometries in 'lines' matters, see example below.

    'tolerance' controls the distance tolerance between the first and last points
    of a geometry to consider it closed.

    Examples:
        $ vpype line 0 0 5 5 rect 2 2 1 1 occult show  # line is occulted by rect

        $ vpype rect 2 2 1 1 line 0 0 5 5 occult show  # line is NOT occulted by rect,
        as the line is drawn after the rectangle.
    """

    line_arr = np.array(
        [pygeos.linestrings(list(zip(line.real, line.imag))) for line in lines]
    )

    for i, line in enumerate(line_arr):
        coords = pygeos.get_coordinates(line)

        if math.hypot(coords[-1, 0] - coords[0, 0], coords[-1, 1] - coords[0, 1]) < tolerance:
            tree = pygeos.STRtree(line_arr[:i])
            p = pygeos.polygons(coords)
            geom_idx = tree.query(p, predicate="intersects")
            line_arr[geom_idx] = pygeos.set_operations.difference(line_arr[geom_idx], p)

    new_lines = LineCollection()
    for geom in line_arr:
        for i in range(pygeos.get_num_geometries(geom)):
            coords = pygeos.get_coordinates(pygeos.get_geometry(geom, i))
            new_lines.append(coords[:, 0] + coords[:, 1] * 1j)

    return new_lines
Example #14
def test_shared_paths_linestring():
    g1 = pygeos.linestrings([(0, 0), (1, 0), (1, 1)])
    g2 = pygeos.linestrings([(0, 0), (1, 0)])
    actual1 = pygeos.shared_paths(g1, g2)
    assert pygeos.equals(pygeos.get_geometry(actual1, 0), g2)
Example #15
    # NOTE: this snippet begins mid-function; last_pt is presumably the
    # downstream-most point of each flowline, mirroring first_pt below
    ix = pg.intersects(dams.geometry.values.data, last_pt)
    dams.loc[ix, "pt"] = last_pt[ix]

    # override with upstream most point when both intersect
    first_pt = pg.get_point(dams.flowline.values.data, 0)
    ix = pg.intersects(dams.geometry.values.data, first_pt)
    dams.loc[ix, "pt"] = first_pt[ix]

    ix = dams.pt.isnull()
    # WARNING: this might fail for odd intersection geoms; we always take the first line
    # below
    pt = pd.Series(
        pg.get_point(
            pg.get_geometry(
                pg.intersection(
                    dams.loc[ix].geometry.values.data, dams.loc[ix].flowline.values.data
                ),
                0,
            ),
            0,
        ),
        index=dams.loc[ix].index,
    ).dropna()
    dams.loc[pt.index, "pt"] = pt

    # Few should be dropped at this point, since all should have overlapped at least by a point
    errors = dams.pt.isnull()
    if errors.max():
        print(
            f"{errors.sum():,} dam / flowline joins could not be represented as points and were dropped"
        )
Example #16
def test_get_geometry_collection(geom):
    n = pygeos.get_num_geometries(geom)
    actual = pygeos.get_geometry(geom, [0, -n, n, -(n + 1)])
    assert pygeos.equals(actual[0], actual[1]).all()
    assert pygeos.is_missing(actual[2:4]).all()
Example #17
def test_get_geometry_simple(geom):
    actual = pygeos.get_geometry(geom, [0, -1, 1, -2])
    assert pygeos.equals(actual[0], actual[1]).all()
    assert pygeos.is_missing(actual[2:4]).all()
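Both tests rely on get_geometry accepting negative indices (counting from the end) and returning None when out of range; a quick illustration:

import pygeos

mp = pygeos.multipoints([(0, 0), (1, 1), (2, 2)])
print(pygeos.get_geometry(mp, -1))  # POINT (2 2), same as index 2
print(pygeos.get_geometry(mp, 3))   # None: out of range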
Example #18
    os.makedirs(network_dir)

start = time()
print("Reading Puerto Rico networks...")
networks = pio.read_dataframe(gdb,
                              layer=network_layer,
                              as_pygeos=True,
                              columns=[NET_COLS])
src_crs = networks.crs
networks = networks.rename(columns={
    "batNetID": "networkID",
    "StreamOrde": "streamorder"
}).set_index("networkID")

# convert to LineStrings
networks.geometry = pg.get_geometry(networks.geometry, 0)

# project to crs
networks.geometry = to_crs(networks.geometry, src_crs, CRS)

networks["length"] = pg.length(networks.geometry)
networks["miles"] = networks.length * 0.000621371
# sinuosity of each segment
networks["sinuosity"] = calculate_sinuosity(networks.geometry)

# aggregate up to the network
network_length = networks.groupby(level=0)[["length"]].sum()
temp_df = networks[["length", "sinuosity"]].join(network_length,
                                                 rsuffix="_total")

# Calculate length-weighted sinuosity
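The snippet is cut off here; a hedged sketch of how the length-weighted aggregation could be finished from temp_df (not the original code):

temp_df["wtd_sinuosity"] = temp_df["sinuosity"] * (
    temp_df["length"] / temp_df["length_total"]
)
network_sinuosity = temp_df.groupby(level=0)["wtd_sinuosity"].sum()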
Example #19
def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
    """
    Extracts flowlines data from NHDPlusHR data product.
    Extract flowlines from NHDPlusHR data product, joins to VAA table,
    and filters out coastlines.
    Extracts joins between flowlines, and filters out coastlines.

    Parameters
    ----------
    gdb_path : str
        path to the NHD HUC4 Geodatabase
    target_crs: GeoPandas CRS object
        target CRS to project NHD to for analysis, like length calculations.
        Must be a planar projection.
    extra_flowline_cols : list
        List of extra field names to extract from the NHDFlowline layer

    Returns
    -------
    tuple of (GeoDataFrame, DataFrame)
        (flowlines, joins)
    """

    ### Read in flowline data and convert to data frame
    print("Reading flowlines")
    flowline_cols = FLOWLINE_COLS + extra_flowline_cols
    df = read_dataframe(
        gdb_path, layer="NHDFlowline", force_2d=True, columns=[flowline_cols],
    )

    # Index on NHDPlusID for easy joins to other NHD data
    df.NHDPlusID = df.NHDPlusID.astype("uint64")
    df = df.set_index(["NHDPlusID"], drop=False)

    # convert MultiLineStrings to LineStrings (all have a single linestring)
    df.geometry = pg.get_geometry(df.geometry.values.data, 0)

    print("making valid and projecting to target projection")
    df.geometry = make_valid(df.geometry.values.data)
    df = df.to_crs(target_crs)
    print(f"Read {len(df):,} flowlines")

    ### Read in VAA and convert to data frame
    # NOTE: not all records in Flowlines have corresponding records in VAA
    # we drop those that do not since we need these fields.
    print("Reading VAA table and joining...")
    vaa_df = read_dataframe(gdb_path, layer="NHDPlusFlowlineVAA", columns=[VAA_COLS])

    vaa_df.NHDPlusID = vaa_df.NHDPlusID.astype("uint64")
    vaa_df = vaa_df.set_index(["NHDPlusID"])
    df = df.join(vaa_df, how="inner")
    print(f"{len(df):,} features after join to VAA")

    # Simplify data types for smaller files and faster IO
    df.FType = df.FType.astype("uint16")
    df.FCode = df.FCode.astype("uint16")
    df.StreamOrde = df.StreamOrde.astype("uint8")
    df.Slope = df.Slope.astype("float32")
    df.MinElevSmo = df.MinElevSmo.astype("float32")
    df.MaxElevSmo = df.MaxElevSmo.astype("float32")

    ### Read in flowline joins
    print("Reading flowline joins")
    join_df = read_dataframe(
        gdb_path,
        layer="NHDPlusFlow",
        read_geometry=False,
        columns=["FromNHDPID", "ToNHDPID"],
    ).rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
    join_df.upstream = join_df.upstream.astype("uint64")
    join_df.downstream = join_df.downstream.astype("uint64")

    ### Fix errors in NHD
    # some valid joins are marked as terminals (downstream==0) in NHD; we need
    # to backfill the missing join info.
    # To do this, we intersect all terminals back with flowlines dropping any
    # that are themselves terminals.  Then we calculate the distance to the upstream
    # point of the intersected line, and the upstream point of the next segment
    # downstream.  We use the ID of whichever one is closer (must be within 100m).
    ix = join_df.loc[join_df.downstream == 0].upstream.unique()
    # get the last point, which is furthest downstream
    tmp = df.loc[df.index.isin(ix), ["geometry"]].copy()
    tmp["geometry"] = pg.get_point(tmp.geometry.values.data, -1)

    target = df.loc[~df.index.isin(ix)]

    # only search against other flowlines
    tree = pg.STRtree(target.geometry.values.data)
    # search within a tolerance of 0.001, these are very very close
    left, right = tree.nearest_all(tmp.geometry.values.data, max_distance=0.001)

    pairs = pd.DataFrame(
        {
            "left": tmp.index.take(left),
            "right": target.index.take(right),
            "source": tmp.geometry.values.data.take(left),
            # take upstream / downstream points of matched lines
            "upstream_target": pg.get_point(df.geometry.values.data.take(right), 0),
        }
    )

    # drop any pairs where the other side is also a terminal (these appear as
    # V shaped tiny networks that need to be left as is)
    pairs = pairs.loc[~pairs.right.isin(ix)]

    # calculate the next segment downstream (only keep the first if multiple; possible logic issue)
    next_downstream = (
        join_df.loc[(join_df.upstream != 0) & (join_df.downstream != 0)]
        .groupby("upstream")
        .downstream.first()
    )
    pairs["next_downstream"] = pairs.right.map(next_downstream)
    pairs.loc[pairs.next_downstream.notnull(), "downstream_target"] = pg.get_point(
        df.loc[
            pairs.loc[pairs.next_downstream.notnull()].next_downstream
        ].geometry.values.data,
        0,
    )

    pairs["upstream_dist"] = pg.distance(pairs.source, pairs.upstream_target)
    ix = pairs.next_downstream.notnull()
    pairs.loc[ix, "downstream_dist"] = pg.distance(
        pairs.loc[ix].source, pairs.loc[ix].downstream_target
    )

    # this ignores any nan
    pairs["dist"] = pairs[["upstream_dist", "downstream_dist"]].min(axis=1)
    # discard any that are too far (>100m)
    pairs = pairs.loc[pairs.dist <= 100].copy()

    # sort by distance to upstream point of matched flowline; this allows us
    # to sort on those then dedup to calculate a new downstream ID for this source line
    pairs = pairs.sort_values(by=["left", "dist"])

    # set the right value to the next downstream if it is closer
    # this also ignores na
    ix = pairs.downstream_dist < pairs.upstream_dist
    pairs.loc[ix, "right"] = pairs.loc[ix].next_downstream.astype("uint64")

    ids = pairs.groupby("left").right.first()

    if len(ids):
        # save to send to NHD
        pd.DataFrame({"NHDPlusID": ids.index.unique()}).to_csv(
            f"/tmp/{gdb_path.stem}_bad_joins.csv", index=False
        )

        ix = join_df.upstream.isin(ids.index)
        join_df.loc[ix, "downstream"] = join_df.loc[ix].upstream.map(ids)

        print(
            f"Repaired {len(ids):,} joins marked by NHD as terminals but actually joined to flowlines"
        )

    # set join types to make it easier to track
    join_df["type"] = "internal"  # set default
    # upstream-most origin points
    join_df.loc[join_df.upstream == 0, "type"] = "origin"
    # downstream-most termination points
    join_df.loc[join_df.downstream == 0, "type"] = "terminal"

    ### Filter out coastlines and update joins
    # WARNING: we tried filtering out pipelines (FType == 428).  It doesn't work properly;
    # there are many that go through dams and are thus needed to calculate
    # network connectivity and gain of removing a dam.
    print("Filtering out coastlines...")
    coastline_idx = df.loc[df.FType == 566].index
    df = df.loc[~df.index.isin(coastline_idx)].copy()
    print(f"{len(df):,} features after removing coastlines")

    # remove any joins that have coastlines as upstream
    # these are themselves coastline segments
    join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()

    # set the downstream to 0 for any that join coastlines
    # this will enable us to mark these as downstream terminals in
    # the network analysis later
    join_df["marine"] = join_df.downstream.isin(coastline_idx)
    join_df.loc[join_df.marine, "downstream"] = 0
    join_df.loc[join_df.marine, "type"] = "terminal"

    # drop any duplicates (above operation sets some joins to upstream and downstream of 0)
    join_df = join_df.drop_duplicates(subset=["upstream", "downstream"])

    ### Filter out underground connectors
    ix = df.loc[df.FType == 420].index
    print("Removing {:,} underground conduits".format(len(ix)))
    df = df.loc[~df.index.isin(ix)].copy()
    join_df = remove_joins(
        join_df, ix, downstream_col="downstream", upstream_col="upstream"
    )

    ### Label loops for easier removal later
    # WARNING: loops may be very problematic from a network processing standpoint.
    # Include with caution.
    print("Identifying loops")
    df["loop"] = (df.StreamOrde != df.StreamCalc) | (df.FlowDir.isnull())

    idx = df.loc[df.loop].index
    join_df["loop"] = join_df.upstream.isin(idx) | join_df.downstream.isin(idx)

    ### Add calculated fields
    # Set our internal master IDs to the original index of the file we start from
    # Assume that we can always fit into a uint32, which is ~400 million records
    # and probably bigger than anything we could ever read in
    df["lineID"] = df.index.values.astype("uint32") + 1
    join_df = (
        join_df.join(df.lineID.rename("upstream_id"), on="upstream")
        .join(df.lineID.rename("downstream_id"), on="downstream")
        .fillna(0)
    )

    for col in ("upstream", "downstream"):
        join_df[col] = join_df[col].astype("uint64")

    for col in ("upstream_id", "downstream_id"):
        join_df[col] = join_df[col].astype("uint32")

    ### Calculate size classes
    print("Calculating size class")
    drainage = df.TotDASqKm
    df.loc[drainage < 10, "sizeclass"] = "1a"
    df.loc[(drainage >= 10) & (drainage < 100), "sizeclass"] = "1b"
    df.loc[(drainage >= 100) & (drainage < 518), "sizeclass"] = "2"
    df.loc[(drainage >= 518) & (drainage < 2590), "sizeclass"] = "3a"
    df.loc[(drainage >= 2590) & (drainage < 10000), "sizeclass"] = "3b"
    df.loc[(drainage >= 10000) & (drainage < 25000), "sizeclass"] = "4"
    df.loc[drainage >= 25000, "sizeclass"] = "5"

    # Calculate length and sinuosity
    print("Calculating length and sinuosity")
    df["length"] = df.geometry.length.astype("float32")
    df["sinuosity"] = calculate_sinuosity(df.geometry.values.data).astype("float32")

    # drop columns not useful for later processing steps
    df = df.drop(columns=["FlowDir", "StreamCalc"])

    # calculate incoming joins (have valid upstream, but not in this HUC4)
    join_df.loc[(join_df.upstream != 0) & (join_df.upstream_id == 0), "type"] = "huc_in"

    return df, join_df
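A usage sketch; the geodatabase path is hypothetical, and a Path object is needed because gdb_path.stem is used when writing the bad-joins CSV:

from pathlib import Path

flowlines, joins = extract_flowlines(
    Path("NHDPLUS_H_0204_HU4_GDB.gdb"), target_crs="EPSG:5070"
)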
Example #20
    ### Aggregate waterbodies that are in contact / overlapping each other
    waterbodies = from_geofeather(src_dir / region /
                                  "waterbodies.feather").set_index("wbID")
    wb_joins = deserialize_df(src_dir / region /
                              "waterbody_flowline_joins.feather")
    print("Read {:,} waterbodies and {:,} flowine / waterbody joins".format(
        len(waterbodies), len(wb_joins)))

    # TODO: remove this on next full rerun of extract_flowlines...
    waterbodies = waterbodies.drop(columns=["hash"], errors="ignore")
    # Convert multipolygons to single-part polygons
    idx = (
        pg.get_type_id(waterbodies.geometry) == 6
    )  # idx = waterbodies.loc[waterbodies.geometry.type == "MultiPolygon"].index
    waterbodies.loc[idx, "geometry"] = waterbodies.loc[idx].geometry.apply(
        lambda g: pg.get_geometry(g, 0))

    # raise min size
    waterbodies = waterbodies.loc[
        waterbodies.AreaSqKm >= WATERBODY_MIN_SIZE].copy()
    wb_joins = wb_joins.loc[wb_joins.wbID.isin(waterbodies.index)].copy()

    # End TODO:

    # Drop any waterbodies and waterbody joins to flowlines that are no longer present
    # based on above processing of flowlines
    wb_joins = wb_joins.loc[wb_joins.lineID.isin(flowlines.index)].copy()
    to_drop = ~waterbodies.index.isin(wb_joins.wbID)
    print(
        "Dropping {:,} waterbodies that no longer intersect with the flowlines retained above"
        .format(to_drop.sum()))