Example #1
def test_locate_sites(shellmound_sfrdata, reach_id_col, outdir):

    X, Y, rno = zip(*((515459.9, 1189906.1, 202), (515375.2, 1189942.5, 204)))
    df = pd.DataFrame({
        'geometry': [Point(x, y) for x, y in zip(X, Y)],
        'site_no': rno
    })
    sites_shapefile = '{}/sites.shp'.format(outdir)
    df2shp(df, sites_shapefile, crs=5070)
    sfrlines_shapefile = '{}/shellmound_lines.shp'.format(outdir)
    shellmound_sfrdata.export_lines(sfrlines_shapefile)
    # test reading sfrlines as a dataframe
    # and sfrlines without a reach number column
    if reach_id_col is None:
        reach_id_col = 'rno'
        sfrlines = gpd.read_file(sfrlines_shapefile)
        sfrlines.drop('rno', axis=1, inplace=True)
        sfrlines_shapefile = sfrlines
    active_area = box(*shellmound_sfrdata.grid.bounds)
    locs = locate_sites(sites_shapefile,
                        sfrlines_shapefile,
                        active_area,
                        keep_columns=None,
                        reach_id_col=reach_id_col,
                        ireach_col='ireach',
                        iseg_col='iseg',
                        site_number_col='site_no',
                        perimeter_buffer=1000,
                        distance_threshold=1600)
    assert np.array_equal(locs.rno.values, locs.site_no.values)
    # check that iseg and ireach columns are in the located sites table
    # (for modflow-2005 style sfr packages)
    assert 'iseg' in locs.columns
    assert 'ireach' in locs.columns
Example #2
def export_reach_data(reach_data,
                      grid,
                      filename,
                      nodes=None,
                      geomtype='Polygon'):
    """Generic method for exporting data to a shapefile; joins
    attributes in reach_data to geometries in grid using node numbers.
    """
    assert grid is not None, "need grid attribute for export"
    if nodes is not None:
        keep = [n in nodes for n in reach_data.node]
        rd = reach_data.loc[keep].copy()
    else:
        rd = reach_data.copy()
    assert isinstance(
        grid, sfrmaker.grid.Grid), "grid needs to be an sfrmaker.Grid instance"
    assert np.array_equal(grid.df.node.values, np.arange(grid.size))
    assert np.array_equal(grid.df.node.values, grid.df.index.values)
    polygons = grid.df.loc[rd.node, 'geometry'].values
    epsg = grid.crs.epsg
    proj_str = grid.crs.proj_str
    if geomtype.lower() == 'polygon':
        rd['geometry'] = polygons
    elif geomtype.lower() == 'point':
        rd['geometry'] = [p.centroid for p in polygons]
    else:
        raise ValueError('Unrecognized geomtype "{}"'.format(geomtype))
    df2shp(rd, filename, epsg=epsg, proj_str=proj_str)
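The snippet below is a minimal usage sketch for export_reach_data; the `sfrdata` object, node numbers, and output filename are illustrative assumptions, not part of the source above.

# Hypothetical usage: export reach attributes for a few model cells as cell polygons.
# `sfrdata` is assumed to be an sfrmaker SFRData-like object with reach_data and grid attributes.
export_reach_data(sfrdata.reach_data, sfrdata.grid, 'sfr_reaches.shp',
                  nodes=[1200, 1201, 1202], geomtype='Polygon')
# geomtype='Point' would write cell centroids instead of polygons.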
Example #3
def active_area(outfolder):
    active_area_tuple = -90.55, 33.5, -90.16, 33.86
    active_area_poly = box(*active_area_tuple)
    df = pd.DataFrame({'geometry': [active_area_poly], 'id': [0]})
    active_area = os.path.join(outfolder, 'active_area.shp')
    df2shp(df, active_area, crs=4269)
    return active_area_tuple
Example #4
def preprocessed_flowlines(test_data_path, culled_flowlines, outfolder,
                           project_root_path):

    kwargs = culled_flowlines.copy()
    #kwargs['demfile'] = os.path.join(test_data_path, 'meras_100m_dem.tif')
    kwargs['demfile'] = os.path.join(test_data_path, 'meras_30m_dem.tif')
    #kwargs['demfile'] = os.path.join(project_root_path, 'examples/meras/dem_min_elevs_1000.tif')
    kwargs['dem_length_units'] = 'feet'
    kwargs['narwidth_shapefile'] = os.path.join(test_data_path, 'NARwidth.shp')
    kwargs['waterbody_shapefiles'] = os.path.join(
        test_data_path, 'NHDPlus08/NHDSnapshot/Hydrography/NHDWaterbody.shp')
    kwargs['asum_thresh'] = 20.
    kwargs['width_from_asum_a_param'] = 0.0592
    kwargs['width_from_asum_b_param'] = 0.5127
    kwargs['known_connections'] = {
        17955195: 17955197,
        17955197: 17955185,
        17954979: 17954993,
        17954993: 17955075
    }
    kwargs['logger'] = None
    kwargs['output_length_units'] = 'meters'
    kwargs['outfolder'] = outfolder
    kwargs['project_epsg'] = 5070
    preprocessed_flowlines = preprocess_nhdplus(**kwargs)

    # check that the known_connections were routed correctly
    for comid, tocomid in kwargs['known_connections'].items():
        assert preprocessed_flowlines.loc[comid, 'tocomid'] == tocomid

    out_shapefile = os.path.join(outfolder, 'preprocessed_flowlines.shp')
    df2shp(preprocessed_flowlines, out_shapefile, crs=5070)
    return preprocessed_flowlines
Example #5
    def write_shapefile(self, filename='grid.shp'):
        i, j = np.indices((self.nrow, self.ncol))
        df = pd.DataFrame({'node': list(range(len(self.polygons))),
                           'i': i.ravel(),
                           'j': j.ravel(),
                           'geometry': self.polygons
                           })
        df2shp(df, filename, epsg=self.epsg, proj_str=self.proj_str)
Example #6
def write_bbox_shapefile(modelgrid, outshp):
    outline = get_grid_bounding_box(modelgrid)
    df2shp(pd.DataFrame({
        'desc': ['model bounding box'],
        'geometry': [outline]
    }),
           outshp,
           epsg=modelgrid.epsg)
Example #7
def extent_poly():
    extent_poly_ll = box(-92.7, 46.7, -92.6, 46.8)

    extent_poly = project(extent_poly_ll, "+init=epsg:{}".format(4269),
                          "+init=epsg:26915")
    df = pd.DataFrame({'geometry': [extent_poly], 'id': [0]})
    df2shp(df, 'examples/data/bbox.shp', epsg=26915)
    return extent_poly_ll
Example #8
def shapefile_features(polygon_features, test_output_path):
    df = pd.DataFrame({
        'id': list(range(len(polygon_features))),
        'geometry': polygon_features
    })
    shapefile_name = '{}/zstats_features.shp'.format(test_output_path)
    df2shp(df, shapefile_name, epsg=3070)
    return shapefile_name
Example #9
def point_data(test_output_path):
    df = pd.DataFrame({
        'x': [1, 3, 5, 5, 3, 2],
        'y': [1, 1, 1, 3, 2, 4],
        'values': np.random.randn(6),
    })
    df['geometry'] = [Point(x, y) for x, y in zip(df.x, df.y)]
    df2shp(df, test_output_path / 'test_points.shp', crs=5070)
Example #10
def shellmound_active_area(shellmound_grid, outdir):
    """Make a shapefile of the shellmound bounding box."""
    l, r, b, t = shellmound_grid.extent
    bbox = box(l, b, r, t)
    df = pd.DataFrame({'geometry': [bbox], 'id': [0]})
    out_shapefile = os.path.join(outdir, 'shellmound', 'shellmound_bbox.shp')
    gisutils.df2shp(df, out_shapefile, crs=5070)
    return out_shapefile
Example #11
    def write_active_area_shapefile(self, outshp='active_area.shp'):
        if self._active_area is None:
            self.create_active_area_polygon_from_isfr()
        assert isinstance(self._active_area, Polygon), \
            "active area didn't get set correctly (not a shapely Polygon)"
        df = pd.DataFrame({'geometry': [self._active_area],
                           'description': ['Active area where SFR will be applied.']})
        df2shp(df, outshp, crs=self.crs)
Example #12
    def write_shapefile(self, outshp='flowlines.shp'):
        """Write a shapefile of :py:attr:`Lines.df`.

        Parameters
        ----------
        outshp : str, optional
            Shapefile name, by default 'flowlines.shp'
        """
        df2shp(self.df, outshp, crs=self.crs)
Example #13
def export_shapefile(filename, data, modelgrid, kper=None,
                     squeeze=True,
                     epsg=None, proj_str=None, prj=None,
                     verbose=False):
    t0 = time.time()
    if isinstance(data, MFTransientList) or isinstance(data, MfList):
        df = mftransientlist_to_dataframe(data, squeeze=squeeze)
    elif isinstance(data, np.recarray):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        df = data
    else:
        raise TypeError("data needs to be a pandas DataFrame, MFList, or numpy recarray")

    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str

    if 'cellid' in df.columns and isinstance(df['cellid'].values[0], tuple):
        k, i, j = list(zip(*df['cellid']))
        i = np.array(i)
        j = np.array(j)
    elif 'i' in df.columns and 'j' in df.columns:
        i, j = df['i'].values, df['j'].values
    elif 'geometry' not in df.columns:
        raise ValueError('DataFrame needs cellid, (i, j) or geometry '
                         'information to be exported to shapefile.')

    if kper is not None:
        df = df.loc[df.per == kper]
        verts = np.array(modelgrid.get_cell_vertices(i, j))
    elif df is not None:
        verts = modelgrid.get_vertices(i, j)
    # use cell geometries from the model grid
    if 'geometry' not in df.columns:
        polys = np.array([Polygon(v) for v in verts])
        df['geometry'] = polys
        # unfortunately, reaches through inactive cells 
        # lose their cellid (k, i, j) location
        # so there is no way to plot these 
        # without geometries from another source (such as the sfrlines)
        # drop such geometries, which are identified by k, i, j == -1
        invalid_geoms = np.any(df[['k', 'i', 'j']] < 0, axis=1)
        df = df.loc[~invalid_geoms].copy()
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if prj is None:
        prj = modelgrid.prj
    df2shp(df, filename, epsg=epsg, proj_str=proj_str, prj=prj)
    if verbose:
        print("shapefile export took {:.2f}s".format(time.time() - t0))
Example #14
    def export_lines(self, filename=None):
        """Export shapefile of linework"""
        if filename is None:
            filename = '{}_{}_cells.shp'.format(self.package_name, self.package_type)
        if self.package_type == 'sfr':
            data = self.reach_data
        else:
            data = self.stress_period_data
        assert 'geometry' in data.columns and \
               isinstance(data.geometry.values[0], LineString), \
            "No LineStrings in reach_data.geometry"
        df2shp(data, filename, crs=self.grid.crs)
Example #15
def export_shapefile(filename,
                     data,
                     modelgrid,
                     kper=None,
                     squeeze=True,
                     epsg=None,
                     proj_str=None,
                     prj=None,
                     verbose=False):
    t0 = time.time()
    if isinstance(data, MFTransientList) or isinstance(data, MfList):
        df = mftransientlist_to_dataframe(data, squeeze=squeeze)
    elif isinstance(data, np.recarray):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        df = data
    else:
        raise TypeError(
            "data needs to be a pandas DataFrame, MFList, or numpy recarray")

    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str

    if 'cellid' in df.columns and isinstance(df['cellid'].values[0], tuple):
        k, i, j = list(zip(*df['cellid']))
        i = np.array(i)
        j = np.array(j)
    elif 'i' in df.columns and 'j' in df.columns:
        i, j = df['i'].values, df['j'].values
    elif 'geometry' not in df.columns:
        raise ValueError('DataFrame needs cellid, (i, j) or geometry '
                         'information to be exported to shapefile.')

    if kper is not None:
        df = df.loc[df.per == kper]
        verts = np.array(modelgrid.get_cell_vertices(i, j))
    elif df is not None:
        verts = modelgrid.get_vertices(i, j)
    if 'geometry' not in df.columns:
        polys = np.array([Polygon(v) for v in verts])
        df['geometry'] = polys
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if prj is None:
        prj = modelgrid.prj
    df2shp(df, filename, epsg=epsg, proj_str=proj_str, prj=prj)
    if verbose:
        print("shapefile export took {:.2f}s".format(time.time() - t0))
Example #16
def test_locate_sites(shellmound_sfrdata, outdir):

    X, Y, rno = zip(*((515459.9, 1189906.1, 202), (515375.2, 1189942.5, 204)))
    df = pd.DataFrame({
        'geometry': [Point(x, y) for x, y in zip(X, Y)],
        'site_no': rno
    })
    sites_shapefile = '{}/sites.shp'.format(outdir)
    df2shp(df, sites_shapefile, epsg=5070)
    sfrlines_shapefile = '{}/shellmound_lines.shp'.format(outdir)
    shellmound_sfrdata.export_lines(sfrlines_shapefile)
    active_area = box(*shellmound_sfrdata.grid.bounds)
    locs = locate_sites(sites_shapefile,
                        sfrlines_shapefile,
                        active_area,
                        keep_columns=None,
                        reach_id_col='rno',
                        site_number_col='site_no',
                        perimeter_buffer=1000,
                        distance_threshold=1600)
    assert locs.rno.equals(locs.site_no)
Example #17
    def write_shp(self, df, shpname='NWIS_export.shp', **kwargs):
        """Write a shapefile of points from NWIS site file

        Parameters
        ----------
        df: dataframe
            dataframe of site info, must have dec_long_va and dec_lat_va columns with lon/lat in DD

        shpname: string
            Name for output shapefile

        Notes
        -----
        NAD83 is assumed for dec_long_va and dec_lat_va.
        If some entries are in NAD27, a difference of ~5 to >15m will result for WI
        (see http://en.wikipedia.org/wiki/North_American_Datum#/media/File:Datum_Shift_Between_NAD27_and_NAD83.png)
        """
        shpdf = df.copy()
        shpdf['geometry'] = [
            Point(r.dec_long_va, r.dec_lat_va) for i, r in shpdf.iterrows()
        ]
        gisutils.df2shp(shpdf, shpname, epsg=4269)
Example #18
def assign_layers_from_screen_top_botm(data,
                                       model,
                                       flux_col='q',
                                       screen_top_col='screen_top',
                                       screen_botm_col='screen_botm',
                                       label_col='site_no',
                                       across_layers=False,
                                       distribute_by='thickness',
                                       minimum_layer_thickness=2.):
    """Assign model layers to pumping flux data based on
    open interval. Fluxes are applied to each layer proportional
    to the fraction of open interval in that layer.

    Parameters
    ----------
    data : dataframe of well info
        Must have i, j or x, y locations
    model : mfsetup.MF6model or mfsetup.MFnwtModel instance
        Must have dis, and optionally, attached MFsetupGrid instance
    flux_col : column in data with well fluxes
    screen_top_col : column in data with screen top elevations
    screen_botm_col : column in data with screen bottom elevations
    label_col : column with well names (optional; default site_no)
    across_layers : bool
        True to distribute fluxes to multiple layers intersected by the open interval
    distribute_by : str ('thickness' or 'transmissivity')
        Distribute fluxes to layers based on thickness or transmissivity of
        intersected open intervals.

    Returns
    -------
    data : dataframe of well info, modified so that each row represents
        pumping in a single model layer (with fluxes modified proportional
        to the amount of open interval in that layer).
    """
    # inactive cells in either MODFLOW version
    if model.version == 'mf6':
        idomain = model.idomain
    else:
        idomain = model.bas6.ibound.array

    # 'boundname' column is used by wel setup for identifying wells
    if label_col in data.columns:
        data['boundname'] = data[label_col]
    if across_layers:
        raise NotImplementedError('Distributing fluxes to multiple layers')
    else:
        if distribute_by == 'thickness':
            i, j, x, y, screen_botm, screen_top = None, None, None, None, None, None
            if 'i' in data.columns and 'j' in data.columns:
                i, j = data['i'].values, data['j'].values
            elif 'x' in data.columns and 'y' in data.columns:
                x, y = data['x'].values, data['y'].values
            if screen_top_col in data.columns:
                screen_top = data[screen_top_col].values
            if screen_botm_col in data.columns:
                screen_botm = data[screen_botm_col].values
            thicknesses = get_open_interval_thickness(model,
                                                      i=i,
                                                      j=j,
                                                      x=x,
                                                      y=y,
                                                      screen_top=screen_top,
                                                      screen_botm=screen_botm)
            # for each i, j location with a well,
            # get the layer with highest thickness in the open interval
            data['k'] = np.argmax(thicknesses, axis=0)
            # get the thickness for those layers
            all_layers = np.zeros((model.nlay + 1, model.nrow, model.ncol))
            all_layers[0] = model.dis.top.array
            all_layers[1:] = model.dis.botm.array
            layer_thicknesses = -np.diff(all_layers[:, i, j], axis=0)

            # only include thicknesses for valid layers
            # set inactive cells to 0 thickness for the purpose of relocating wells
            layer_thicknesses[idomain[:, i, j] != 1] = 0
            data['idomain'] = idomain[data['k'], i, j]
            data['laythick'] = layer_thicknesses[
                data['k'].values,
                list(range(layer_thicknesses.shape[1]))]
            # flag layers that are too thin or inactive
            inactive = idomain[data.k, data.i, data.j] != 1
            invalid_open_interval = (data['laythick'] <
                                     minimum_layer_thickness) | inactive

            if any(invalid_open_interval):
                outfile = model.cfg['wel']['output_files'][
                    'dropped_wells_file'].format(model.name)

                # move wells that are still in a thin layer to the thickest active layer
                data['orig_layer'] = data['k']
                thickest_layer = np.argmax(layer_thicknesses, axis=0)
                data.loc[invalid_open_interval,
                         'k'] = thickest_layer[invalid_open_interval]
                data['laythick'] = layer_thicknesses[
                    data['k'].values,
                    list(range(layer_thicknesses.shape[1]))]
                data['idomain'] = idomain[data['k'], i, j]

                # record which wells were moved or dropped, and why
                bad_wells = data.loc[invalid_open_interval].copy()
                bad_wells['category'] = 'moved'
                bad_wells[
                    'reason'] = 'longest open interval thickness < {} {} minimum'.format(
                        minimum_layer_thickness, model.length_units)
                bad_wells[
                    'routine'] = __name__ + '.assign_layers_from_screen_top_botm'
                msg = (
                    'Warning: {} of {} wells in layers less than '
                    'specified minimum thickness of {} {}\n'
                    'were moved to the thickest layer at their i, j locations.\n'
                    .format(invalid_open_interval.sum(), len(data),
                            minimum_layer_thickness, model.length_units))
                still_below_minimum = bad_wells[
                    'laythick'] < minimum_layer_thickness
                bad_wells.loc[still_below_minimum, 'category'] = 'dropped'
                bad_wells.loc[
                    still_below_minimum,
                    'reason'] = 'no layer above minimum thickness of {} {}'.format(
                        minimum_layer_thickness, model.length_units)
                n_below = np.sum(still_below_minimum)
                if n_below > 0:
                    msg += (
                        'Out of these, {} of {} total wells remaining in layers less than '
                        'specified minimum thickness of {} {}'
                        ''.format(n_below, len(data), minimum_layer_thickness,
                                  model.length_units))
                    if flux_col in data.columns:
                        pct_flux_below = 100 * bad_wells.loc[
                            still_below_minimum,
                            flux_col].sum() / data[flux_col].sum()
                        msg += ', \nrepresenting {:.2f} % of total flux,'.format(
                            pct_flux_below)

                    msg += '\nwere dropped. See {} for details.'.format(
                        outfile)
                    print(msg)

                # write shapefile and CSV output for wells that were dropped
                cols = [
                    'k', 'i', 'j', 'boundname', 'category', 'laythick',
                    'idomain', 'reason', 'routine', 'x', 'y'
                ]
                if flux_col in data.columns:
                    cols.insert(3, flux_col)
                flux_below = bad_wells.groupby(['k', 'i', 'j'
                                                ]).first().reset_index()[cols]
                append_csv(outfile, flux_below, index=False, float_format='%g')
                if 'x' in flux_below.columns and 'y' in flux_below.columns:
                    flux_below['geometry'] = [
                        Point(xi, yi)
                        for xi, yi in zip(flux_below.x, flux_below.y)
                    ]
                    df2shp(flux_below,
                           outfile[:-4] + '.shp',
                           epsg=model.modelgrid.epsg)

                # cull the wells that are still below the min. layer thickness
                data = data.loc[
                    data['laythick'] > minimum_layer_thickness].copy()

        elif distribute_by == 'transmissivity':
            raise NotImplementedError(
                'Distributing well fluxes by layer transmissivity')

        else:
            raise ValueError(
                'Unrecognized argument for distribute_by: {}'.format(
                    distribute_by))
    return data
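A hedged usage sketch for assign_layers_from_screen_top_botm; `wells` (a DataFrame of well info) and `model` (an mfsetup model) are placeholder names for illustration:

# Hypothetical usage: assign each well to the model layer with the most open interval,
# moving or dropping wells in layers thinner than the stated minimum thickness.
wells = assign_layers_from_screen_top_botm(wells, model,
                                           flux_col='q',
                                           screen_top_col='screen_top',
                                           screen_botm_col='screen_botm',
                                           distribute_by='thickness',
                                           minimum_layer_thickness=2.)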
Example #19
    def write_grid_shapefile(self, outshp='grid.shp'):
        df2shp(self.df, outshp, crs=self.crs)
Example #20
    def assign_monthly_production(self, outfile='processed_swuds.csv'):
        """ Assign production wells for water use, skipping IR (irrigation) and
        TE (thermal electric) to production zones.  If production zones are not
        assigned or if the well bottom doesn't fall into a production zone, then
        the screen_top and screen_bot are assigned using well_depth and the
        default screen length.

        Production is given in cubic m per day.
        todo:  add unit conversion parameter so other units can be used?

        Parameters
        ----------
        outfile: str
            path to final processed monthly water-use file with production zone
            information 
        """

        # fill in missing monthly values with annual value
        for c in self.monthly_cols:
            idx = self.df.loc[self.df[c].isnull()].index.values
            self.df.loc[idx, c] = self.df.loc[idx, 'ANNUAL_VAL']

        # pull out groundwater sites that are not IR, AQ or TE
        self.df = self.df.loc[(self.df['WATER_CD'] == 'GW')
                              & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'IR')
                              & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'AQ')
                              & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'TE')]

        # reshape dataframe to have monthly values in same column
        stacked = pd.DataFrame(self.df[self.monthly_cols].stack())
        stacked.reset_index(inplace=True)
        stacked.rename(columns={
            'level_1': 'month',
            0: 'q_monthly'
        },
                       inplace=True)
        stacked.q_monthly = stacked.q_monthly
        stacked.index = stacked.level_0
        stacked = stacked.join(self.df)
        keep_cols = [c for c in stacked.columns if c not in self.monthly_cols]
        stacked = stacked[keep_cols]
        month = {name: i + 1 for i, name in enumerate(self.monthly_cols)}
        dates = [
            '{}-{:02d}'.format(year, month[month_column_name])
            for year, month_column_name in zip(stacked.YEAR, stacked.month)
        ]
        stacked['datetime'] = pd.to_datetime(dates)
        stacked.sort_values(by=['SITE_NO', 'datetime'], inplace=True)

        # set start and end dates if not already set
        if self.start_date is None:
            self.start_date = stacked.datetime.min()
        if self.end_date is None:
            self.end_date = stacked.datetime.max()

        groups = stacked.groupby('SITE_NO')
        all_groups = []
        for site_no, group in groups:
            group = group.copy()
            group.index = pd.to_datetime(group['datetime'])
            start_date = pd.Timestamp(self.start_date)
            end_date = pd.Timestamp(self.end_date)

            monthly_values_2010 = group.loc[group.datetime.dt.year == 2010]
            monthly_values_2010 = dict(
                zip(monthly_values_2010.datetime.dt.month,
                    monthly_values_2010.q_monthly))
            avg_monthly_values = group.groupby(
                group.index.month).mean().q_monthly.to_dict()
            q_mean = group.q_monthly.mean()

            # reindex the site data to include all months for simulation period
            all_dates = pd.date_range(start_date, end_date, freq='MS')
            group = group.reindex(all_dates)
            # fill empty dates
            q = []
            for month, q_monthly in zip(group.index.month, group.q_monthly):
                # try to use 2010 values if they exist
                if np.isnan(q_monthly):
                    q_monthly = monthly_values_2010.get(month, np.nan)
                # otherwise take the average value for each month
                if np.isnan(q_monthly):
                    q_monthly = avg_monthly_values[month]
                # fill missing months with the mean value for the site
                if np.isnan(q_monthly):
                    q_monthly = q_mean
                q.append(q_monthly)

            # assume most values represent abstraction
            # if sum is positive, invert so that output values are negative
            if np.sum(q) > 0:
                q = -np.array(q)
            group['q'] = q
            #group['q'] = group['q'] * 3785.4  # convert from mgd to cubic m per d
            group['q'] = group['q'] * convert_volume_units(
                self.data_volume_units, self.model_length_units)

            group['site_no'] = f'swuds_{site_no}'
            group['well_elev'] = self.well_elevations[site_no]
            group['depth'] = self.depths[site_no]
            well_botm_depth = self.well_elevations[site_no] - self.depths[
                site_no]
            group['x'] = np.nanmin(group['x'])
            group['y'] = np.nanmin(group['y'])

            # assign a production zone from default dict.  If the bottom of the
            # well does not fall in a zone, or if the dictionary is empty, then
            # the production zone is assigned 'unnamed'
            production_zone = 'unnamed'
            for prod_name in self.prod_zone_top.keys():
                prod_zone_top = self.prod_zone_top[prod_name][site_no]
                prod_zone_bot = self.prod_zone_bot[prod_name][site_no]
                if np.isnan(prod_zone_top) or np.isnan(
                        prod_zone_bot):  # missing zone
                    group['screen_bot'] = self.well_elevations[
                        site_no] - self.depths[site_no]
                    group['screen_top'] = self.well_elevations[
                        site_no] - self.depths[
                            site_no] + self.default_screen_len
                    group['open_int_method'] = 'well depth'
                else:
                    if well_botm_depth < prod_zone_top and well_botm_depth > prod_zone_bot:
                        production_zone = prod_name
                        group['screen_bot'] = prod_zone_bot
                        group['screen_top'] = prod_zone_top
                        group['open_int_method'] = 'production zone'
                    else:
                        group['screen_bot'] = self.well_elevations[
                            site_no] - self.depths[site_no]
                        group['screen_top'] = self.well_elevations[
                            site_no] - self.depths[
                                site_no] + self.default_screen_len
                        group['open_int_method'] = 'well depth'
            group['production_zone'] = production_zone

            # add aquifer name
            group['aquifer_name'] = self.aquifer_names.get(
                group["FROM_AQFR_CD"].values[0], 'unnamed')

            cols = [
                'site_no', 'q', 'q_monthly', 'month', 'well_elev', 'depth',
                'screen_bot', 'screen_top', 'x', 'y'
            ]
            all_groups.append(group[cols])

        self.df = pd.concat(all_groups)
        self.df[
            'start_datetime'] = self.df.index  # start date of each pumping period
        if outfile is not None:
            outfile = Path(outfile)
            self.df.to_csv(outfile, index=False)
            print(
                'processed SWUDS data written to {0} and in dataframe attribute'
                .format(outfile))
            self.df['geometry'] = [
                Point(x, y) for x, y in zip(self.df.x, self.df.y)
            ]
            # write only unique pumping values to shapefile
            to_shapefile = self.df.groupby(['site_no',
                                            'q']).first().reset_index()
            shapefile = outfile.with_suffix('.shp')
            df2shp(to_shapefile, shapefile, crs=self.dest_crs)
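As a usage sketch, assuming `swuds` is an instance of the SWUDS preprocessing class this method belongs to:

# Hypothetical usage: fill missing monthly values, assign production zones, and write
# the processed CSV along with a companion shapefile of unique pumping values.
swuds.assign_monthly_production(outfile='processed_swuds.csv')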
Example #21
    def write_grid_shapefile(self, outshp='grid.shp'):
        df2shp(self.df, outshp, epsg=self.crs.epsg, prj=self.crs.prjfile)
Example #22
def preprocess_headobs(
        data,
        metadata,
        head_data_columns=['head', 'last_head', 'head_std'],
        dem=None,
        dem_units='meters',
        start_date='1998-04-01',
        active_area=None,
        active_area_id_column=None,
        active_area_feature_id=None,
        source_crs=4269,
        dest_crs=5070,
        data_length_units='meters',
        model_length_units='meters',
        geographic_groups=None,
        geographic_groups_col=None,
        max_obsname_len=None,
        outfile='../source_data/observations/head_obs/preprocessed_head_obs.csv'
):
    """Preprocess head observation data, for example, groundwater level data output from the
    `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length units are converted to those of the groundwater model. Open intervals for the wells are
      converted from depths to elevations
    * missing open intervals are filled based on well bottom depths (if available) and the median open
      interval length for the dataset.
    * Wells are categorized based on the quality of the open interval information (see the documentation
      for :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.
        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are in
          `head_data_columns`

    metadata : DataFrame
        Head observation site metadata, e.g. as output from :func:`mapgwm.headobs.get_data`.

        Must have the following columns:

        ================= ==========================================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both
        the average and last head values for the stress period to be considered,
        as well as the variability of water levels contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead elevations.
        Any reprojection to dest_crs is handled automatically, assuming
        the DEM raster has CRS information embedded (arc-ascii grids do not!)
        By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269

    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.

    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`
    """

    df = data.copy()
    # multiplier to convert input length units to model units
    unit_conversion = convert_length_units(data_length_units,
                                           model_length_units)

    # outputs
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting and ending dates here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; drop the timestamps
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    print(
        f'starting with {n_measurements:,d} measurements at {n_sites:,d} unique wells'
    )
    no_data_in_period = df.datetime < stdate

    if np.any(no_data_in_period):
        in_period = df.datetime >= stdate
        n_sites_before = len(
            set(df.loc[no_data_in_period,
                       'site_no']).difference(set(df.loc[in_period,
                                                         'site_no'])))
        print((
            f'culling {no_data_in_period.sum():,d} measurements from {n_sites_before:,d} '
            f'sites that are prior to start date of {start_date}'))
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    well_info = groups.mean().copy()
    well_info = well_info.join(metadata, rsuffix='_meta')
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,
                                          active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=well_info)

    # convert length units; convert screen tops and botms to depths
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        well_location_elevations = get_values_at_points(dem,
                                                        well_info['x'],
                                                        well_info['y'],
                                                        points_crs=dest_crs)
        well_location_elevations *= convert_length_units(
            dem_units, model_length_units)
        well_info.loc[missing_elevations,
                      'well_el'] = well_location_elevations[missing_elevations]

    length_columns = ['well_el'
                      ] + head_data_columns + ['screen_top', 'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']

    # just the data, site numbers, times and aquifer
    head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # #### trim down to only well_info with both estimated water levels and standard deviation
    # monthly measured levels may not have standard deviation
    # (as opposed to monthly statistical estimates)
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to well_info in well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        warnings.warn('culling {} wells not found in metadata table!'.format(
            np.sum(~has_metadata)))
        df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    well_info['obsprefix'] = obsnames
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    well_info['group'] = 'heads'
    well_info = assign_geographic_obsgroups(well_info,
                                            geographic_groups,
                                            geographic_groups_col,
                                            metadata_crs=dest_crs)

    # save out the results
    if outfile is not None:
        df2shp(well_info.drop(['x', 'y'], axis=1),
               out_shapefile,
               index=False,
               crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                                  index=False,
                                                  float_format='%.2f')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info
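A hedged usage sketch for preprocess_headobs; the input tables, raster, and file paths below are assumptions chosen to match the parameter descriptions in the docstring:

# Hypothetical usage: `data` and `metadata` as produced by mapgwm.headobs.get_data()
df, well_info = preprocess_headobs(
    data, metadata,
    head_data_columns=['head', 'last_head'],
    dem='dem_30m.tif', dem_units='meters',
    start_date='1998-04-01',
    source_crs=4269, dest_crs=5070,
    data_length_units='feet', model_length_units='meters',
    geographic_groups='extents/CompositeHydrographArea.shp',
    geographic_groups_col='obsgroup',
    max_obsname_len=13,
    outfile='preprocessed_head_obs.csv')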
Example #23
    def write_shapefile(self, outshp='flowlines.shp'):
        df2shp(self.df, outshp, epsg=self.crs.epsg, prj=self.crs.prjfile)
Example #24
def export_array_contours(filename,
                          a,
                          modelgrid,
                          fieldname='level',
                          interval=None,
                          levels=None,
                          maxlevels=1000,
                          epsg=None,
                          proj_str=None,
                          verbose=False,
                          **kwargs):
    """
    Contour an array using matplotlib; write shapefile of contours.

    Parameters
    ----------
    filename : str
        Path of output file with '.shp' extension.
    a : 2D numpy array
        Array to contour
    epsg : int
        EPSG code. See https://www.epsg-registry.org/ or spatialreference.org
    prj : str
        Existing projection file to be used with new shapefile.
    **kwargs : keyword arguments to matplotlib.axes.Axes.contour

    """
    t0 = time.time()
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str

    if interval is not None:
        kwargs['levels'] = make_levels(a, interval, maxlevels)
    elif levels is not None:
        kwargs['levels'] = levels

    ax = plt.subplots()[-1]
    contours = ax.contour(modelgrid.xcellcenters, modelgrid.ycellcenters, a,
                          **kwargs)
    plt.close()

    if not isinstance(contours, list):
        contours = [contours]

    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str

    geoms = []
    level = []
    for ctr in contours:
        levels = ctr.levels
        for i, c in enumerate(ctr.collections):
            paths = c.get_paths()
            geoms += [
                LineString(p.vertices) if len(p) > 1 else LineString()
                for p in paths
            ]
            level += list(np.ones(len(paths)) * levels[i])

    # assemble the contour levels and geometries into a DataFrame
    df = pd.DataFrame({'level': level, 'geometry': geoms})
    df2shp(df, filename, epsg=epsg, proj_str=proj_str)
    if verbose:
        print("array contour export took {:.2f}s".format(time.time() - t0))
    return
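A brief usage sketch for export_array_contours, assuming `heads` is a 2D array of simulated heads and `m.modelgrid` an attached model grid (both illustrative):

# Hypothetical usage: contour the head array at a 5-unit interval and write
# the resulting contour lines to a shapefile referenced to EPSG 5070.
export_array_contours('head_contours.shp', heads, m.modelgrid,
                      fieldname='level', interval=5., epsg=5070)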
Example #25
def preprocess_te_wateruse(data,
                           start_date=None,
                           end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269,
                           dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate to
      begin using the 2010 rates then (``start_date='2008'``). If no start or end
      date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame
        Pumping rates reindexed to continuous monthly values from `start_date`
        to `end_date`, with negative values indicating withdrawals.

    Notes
    -----
    * time units for TE data and model are assumed to be days

    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull sites to those within the Delta footprint
    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df,
                                      active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(
            estimated_production_surface_units, model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
                                              x,
                                              y,
                                              points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
                                               x,
                                               y,
                                               points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = [
        'site_no', 'start_datetime', 'x', 'y', 'screen_top', 'screen_botm',
        'q', 'geometry'
    ]
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile,
                                                   index=False,
                                                   float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no',
                                           'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
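A hedged usage sketch for preprocess_te_wateruse; the DataFrame and raster/shapefile paths are assumptions that follow the parameter descriptions in the docstring:

# Hypothetical usage: reindex thermoelectric withdrawals to monthly values for 2008-2018,
# sampling screen tops and bottoms from estimated production-zone rasters.
df_monthly = preprocess_te_wateruse(
    te_data,
    start_date='2008-01-01', end_date='2018-12-31',
    active_area='extents/model_area.shp',
    estimated_production_zone_top='mcaq_top.tif',
    estimated_production_zone_botm='mcaq_botm.tif',
    estimated_production_surface_units='feet',
    source_crs=4269, dest_crs=5070,
    data_volume_units='mgal', model_length_units='meters',
    outfile='processed_te_wateruse.csv')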
Example #26
def preprocess_flows(
    data,
    metadata=None,
    flow_data_columns=['flow'],
    start_date=None,
    active_area=None,
    active_area_id_column=None,
    active_area_feature_id=None,
    source_crs=4269,
    dest_crs=5070,
    datetime_col='datetime',
    site_no_col='site_no',
    line_id_col='line_id',
    x_coord_col='x',
    y_coord_col='y',
    name_col='name',
    flow_qualifier_column=None,
    default_qualifier='measured',
    include_sites=None,
    include_line_ids=None,
    source_volume_units='ft3',
    source_time_units='s',
    dest_volume_units='m3',
    dest_time_units='d',
    geographic_groups=None,
    geographic_groups_col=None,
    max_obsname_len=None,
    add_leading_zeros_to_sw_site_nos=False,
    column_renames=None,
    outfile=None,
):
    """Preprocess stream flow observation data, for example, from NWIS or another data source that
    outputs time series in CSV format with site locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : csv file or DataFrame
        Time series of stream flow observations.
        Columns:

        ===================== ======================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        flow_data_columns     Columns of observed streamflow values
        flow_qualifier_column Optional column with qualifiers for flow values
        ===================== ======================================

        Notes:

        * x and y columns can alternatively be in the metadata table
        * flow_data_columns are denoted in `flow_data_columns`; multiple
          columns can be included to process base flow and total flow, or
          other statistics in tandem
        * For example, `flow_qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Stream flow observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================

        Notes:

        * other columns in metadata will be passed through to the metadata output

    flow_data_columns : list of strings
        Columns in data with flow values or their statistics.
        By default, ['flow']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    flow_qualifier_column : str, optional
        Column name in data with flow observation qualifiers, such
        as "measured" or "estimated"
        by default 'category'
    default_qualifier : str, optional
        Default qualifier to populate flow_qualifier_column if it
        is None. By default, "measured"
    include_sites : list-like, optional
        Limit output to these sites.
        by default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to sites represented by these line identifiers.
        by default, None (include all sites)
    source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the source data. By default, 'ft3'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the output (model). By default, 'm3'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. A length of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1).
        If None (the default), observation names are not truncated; PEST++ does
        not limit observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:`~mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different from those listed above.
        For example, if the data file has a 'SITE_NO' column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the renames listed above will be used.
        Note that the renames must be the same as those listed above for
        :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_

    Notes
    -----

    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()
    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} or {} found in {}. Need to specify a site_no_col or line_id_col".format(site_no_col,
                                                                                            line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {
        datetime_col: 'datetime',
        site_no_col: 'site_no',
        line_id_col: 'line_id',
        x_coord_col: 'x',
        y_coord_col: 'y',
        name_col: 'name',
        flow_qualifier_column: 'category'
    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.abc.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [
        c if c not in dest_columns else dest_columns[c]
        for c in flow_data_columns
    ]
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df[line_id_col]

    # read the source data
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError(
                'If metadata are supplied, both data and metadata must '
                'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,
                                          active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=md)

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units
    # ensure that flow values are numeric (may be objects if taken directly from NWIS)
    unit_conversion = (
        convert_volume_units(source_volume_units, dest_volume_units) /
        convert_time_units(source_time_units, dest_time_units))
    for flow_col in flow_data_columns:
        df[flow_col] = pd.to_numeric(df[flow_col],
                                     errors='coerce') * unit_conversion
    df.dropna(subset=flow_data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured)
    # with measured including values derived from baseflow separation or actual measurements)
    # output column name for flow qualifier column:
    dest_flow_qualifier_column = 'category'
    if flow_qualifier_column is not None:
        flow_qualifiers = {
            'calculated': 'measured',
            'base flow separated from measured values': 'measured',
            'measured total flow': 'measured',
            'estimated gaged': 'estimated',
            'estimated ungaged': 'estimated'
        }
        df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace(
            flow_qualifiers)
    else:
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers) for each observation location
    # a 13-character length allows for prefix_yyyymm in 20-character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md,
                                     geographic_groups,
                                     geographic_groups_col,
                                     metadata_crs=dest_crs)

    # data columns
    data_cols = ['site_no', 'line_id', 'datetime'
                 ] + flow_data_columns + ['category']
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #    df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']
    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                           index=False,
                                           float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
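
A hedged usage sketch of preprocess_flows (the file names are placeholders, not from the source); daily flows in ft3/s are converted to m3/d and culled to a model area polygon:

flows, flow_info = preprocess_flows(
    'nwis_daily_flows.csv',            # data: site_no, datetime, x, y, flow columns
    metadata='nwis_site_info.csv',     # optional site information table
    flow_data_columns=['flow'],
    start_date='2010-01-01',
    active_area='active_area.shp',
    source_crs=4269, dest_crs=5070,
    source_volume_units='ft3', source_time_units='s',
    dest_volume_units='m3', dest_time_units='d',
    outfile='processed_flows.csv')
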
Пример #27
0
def assign_layers_from_screen_top_botm(data,
                                       model,
                                       flux_col='q',
                                       screen_top_col='screen_top',
                                       screen_botm_col='screen_botm',
                                       label_col='site_no',
                                       across_layers=False,
                                       distribute_by='transmissivity',
                                       minimum_layer_thickness=2.):
    """Assign model layers to pumping flux data based on
    open interval. Fluxes are applied to each layer proportional
    to the fraction of open interval in that layer.

    Parameters
    ----------
    data : dataframe of well info
        Must have i, j or x, y locations
    model : mfsetup.MF6model or mfsetup.MFnwtModel instance
        Must have dis, and optionally, attached MFsetupGrid instance
    flux_col : column in data with well fluxes
    screen_top_col : column in data with screen top elevations
    screen_botm_col : column in data with screen bottom elevations
    label_col : column with well names (optional; default site_no)
    across_layers : bool
        True to distribute fluxes to multiple layers intersected by the open interval
    distribute_by : str ('thickness' or 'transmissivity')
        Distribute fluxes to layers based on thickness or transmissivity of
        intersected open intervals.

    Returns
    -------
    data : dataframe of well info, modified so that each row represents
        pumping in a single model layer (with fluxes modified proportional
        to the amount of open interval in that layer).
    """
    # inactive cells in either MODFLOW version
    if model.version == 'mf6':
        idomain = model.idomain
    else:
        idomain = model.bas6.ibound.array

    # 'boundname' column is used by wel setup for identifying wells
    if label_col in data.columns:
        data['boundname'] = data[label_col]
    if across_layers:
        raise NotImplementedError('Distributing fluxes to multiple layers')
    else:
        if distribute_by in {'thickness', 'transmissivity'}:
            i, j, x, y, screen_botm, screen_top = None, None, None, None, None, None
            if 'i' in data.columns and 'j' in data.columns:
                i, j = data['i'].values, data['j'].values
            elif 'x' in data.columns and 'y' in data.columns:
                raise NotImplementedError(
                    'Assigning well layers with just x, y')
                x, y = data['x'].values, data['y'].values
            if screen_top_col in data.columns:
                screen_top = data[screen_top_col].values
            if screen_botm_col in data.columns:
                screen_botm = data[screen_botm_col].values

            # get starting heads if available
            no_strt_msg = (
                f'Well setup: distribute_by: {distribute_by} selected '
                'but model has no {} package for computing sat. '
                'thickness.\nUsing full layer thickness.')
            strt3D = None
            if model.version == 'mf6':
                strt_package = 'IC'
            else:
                strt_package = 'BAS6'

            if strt_package not in model.get_package_list():
                warnings.warn(no_strt_msg.format(strt_package), UserWarning)
                strt2D = None
                strt3D = None
            else:
                strt = getattr(getattr(model, strt_package.lower()), 'strt')
                strt3D = strt.array
                strt2D = strt3D[:, i, j]

            thicknesses = get_open_interval_thickness(model,
                                                      heads=strt2D,
                                                      i=i,
                                                      j=j,
                                                      x=x,
                                                      y=y,
                                                      screen_top=screen_top,
                                                      screen_botm=screen_botm)
            hk = np.ones_like(thicknesses)
            if distribute_by == 'transmissivity':
                no_k_msg = (
                    'Well setup: distribute_by: transmissivity selected '
                    'but model has no {} package.\nFalling back to '
                    'distributing wells by layer thickness.')
                if model.version == 'mf6':
                    hk_package = 'NPF'
                    hk_var = 'k'
                elif model.version == 'mfnwt':
                    hk_package = 'UPW'
                    hk_var = 'hk'
                else:
                    hk_package = 'LPF'
                    hk_var = 'hk'

                if hk_package not in model.get_package_list():
                    warnings.warn(no_k_msg.format(hk_package), UserWarning)
                    hk = np.ones_like(thicknesses)
                else:
                    hk = getattr(getattr(model, hk_package.lower()), hk_var)
                    hk = hk.array[:, i, j]

            # for each i, j location with a well,
            # get the layer with highest transmissivity in the open interval
            # if distribute_by == 'thickness' or no hk array,
            # T == thicknesses
            # round to avoid erratic floating point behavior
            # for (nearly) equal quantities
            T = np.round(thicknesses * hk, 2)

            # to get the deepest occurrence of a max value
            # (argmax will result in the first, or shallowest)
            # take the argmax on the reversed view of the array
            # data['k'] = np.argmax(T, axis=0)
            T_r = T[::-1]
            data['k'] = len(T_r) - np.argmax(T_r, axis=0) - 1

            # get thicknesses for all layers
            # (including portions of layers outside open interval)
            all_layers = np.zeros((model.nlay + 1, model.nrow, model.ncol))
            all_layers[0] = model.dis.top.array
            all_layers[1:] = model.dis.botm.array
            all_layer_thicknesses = np.abs(np.diff(all_layers, axis=0))
            layer_thicknesses = -np.diff(all_layers[:, i, j], axis=0)

            # only include thicknesses for valid layers
            # reset thicknesses to sat. thickness
            if strt3D is not None:
                sat_thickness = strt3D - model.dis.botm.array
                # cells where the head is above the layer top
                no_unsat = sat_thickness > all_layer_thicknesses
                sat_thickness[no_unsat] = all_layer_thicknesses[no_unsat]
                # cells where the head is below the cell bottom
                sat_thickness[sat_thickness < 0] = 0
                layer_thicknesses = sat_thickness[:, i, j]

            # set inactive cells to 0 thickness for the purpose of relocating wells
            layer_thicknesses[idomain[:, i, j] < 1] = 0
            data['idomain'] = idomain[data['k'], i, j]
            data['laythick'] = layer_thicknesses[
                data['k'].values,
                list(range(layer_thicknesses.shape[1]))]
            # flag layers that are too thin or inactive
            inactive = idomain[data.k, data.i, data.j] < 1
            invalid_open_interval = (data['laythick'] <
                                     minimum_layer_thickness) | inactive

            if any(invalid_open_interval):
                outfile = model.cfg['wel']['output_files'][
                    'dropped_wells_file'].format(model.name)

                # move wells that are still in a thin layer to the thickest active layer
                data['orig_layer'] = data['k']
                # get T for all layers
                T_all_layers = np.round(layer_thicknesses * hk, 2)

                # to get the deepest occurrence of a max value
                # (argmax will result in the first, or shallowest)
                # take the argmax on the reversed view of the array
                # Tmax_layer = np.argmax(T_all_layers, axis=0)
                T_all_layers_r = T_all_layers[::-1]
                Tmax_layer = len(T_all_layers_r) - np.argmax(T_all_layers_r,
                                                             axis=0) - 1

                data.loc[invalid_open_interval,
                         'k'] = Tmax_layer[invalid_open_interval]
                data['laythick'] = layer_thicknesses[
                    data['k'].values,
                    list(range(layer_thicknesses.shape[1]))]
                data['idomain'] = idomain[data['k'], i, j]

                # record which wells were moved or dropped, and why
                bad_wells = data.loc[invalid_open_interval].copy()
                bad_wells['category'] = 'moved'
                bad_wells[
                    'reason'] = 'longest open interval thickness < {} {} minimum'.format(
                        minimum_layer_thickness, model.length_units)
                bad_wells[
                    'routine'] = __name__ + '.assign_layers_from_screen_top_botm'
                msg = (
                    'Warning: {} of {} wells in layers less than '
                    'specified minimum thickness of {} {}\n'
                    'were moved to the thickest layer at their i, j locations.\n'
                    .format(invalid_open_interval.sum(), len(data),
                            minimum_layer_thickness, model.length_units))
                still_below_minimum = bad_wells[
                    'laythick'] < minimum_layer_thickness
                bad_wells.loc[still_below_minimum, 'category'] = 'dropped'
                bad_wells.loc[
                    still_below_minimum,
                    'reason'] = 'no layer above minimum thickness of {} {}'.format(
                        minimum_layer_thickness, model.length_units)
                n_below = np.sum(still_below_minimum)
                if n_below > 0:
                    msg += (
                        'Out of these, {} of {} total wells remaining in layers less than '
                        'specified minimum thickness of {} {}'
                        ''.format(n_below, len(data), minimum_layer_thickness,
                                  model.length_units))
                    if flux_col in data.columns:
                        pct_flux_below = 100 * bad_wells.loc[
                            still_below_minimum,
                            flux_col].sum() / data[flux_col].sum()
                        msg += ', \nrepresenting {:.2f}% of total flux,'.format(
                            pct_flux_below)

                    msg += '\nwere dropped. See {} for details.'.format(
                        outfile)
                    print(msg)

                # write shapefile and CSV output for wells that were dropped
                cols = [
                    'k', 'i', 'j', 'boundname', 'category', 'laythick',
                    'idomain', 'reason', 'routine', 'x', 'y'
                ]
                if flux_col in data.columns:
                    cols.insert(3, flux_col)
                flux_below = bad_wells.groupby(['k', 'i', 'j'
                                                ]).first().reset_index()[cols]
                append_csv(outfile, flux_below, index=False, float_format='%g')
                if 'x' in flux_below.columns and 'y' in flux_below.columns:
                    flux_below['geometry'] = [
                        Point(xi, yi)
                        for xi, yi in zip(flux_below.x, flux_below.y)
                    ]
                    df2shp(flux_below,
                           outfile[:-4] + '.shp',
                           epsg=model.modelgrid.epsg)

                # cull the wells that are still below the min. layer thickness
                data = data.loc[
                    data['laythick'] > minimum_layer_thickness].copy()
        else:
            raise ValueError(
                'Unrecognized argument for distribute_by: {}'.format(
                    distribute_by))
    return data
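
The layer selection above uses a small numpy idiom worth calling out: np.argmax returns the first (shallowest) occurrence of the maximum, so taking the argmax of a reversed view and re-indexing yields the deepest layer with the maximum transmissivity. A standalone sketch with made-up numbers:

import numpy as np

# transmissivity for 4 layers (rows) at 3 well locations (columns); made-up values
T = np.array([[5., 2., 1.],
              [5., 9., 1.],
              [3., 9., 4.],
              [1., 4., 4.]])

# argmax alone returns the shallowest layer containing the maximum
shallowest = np.argmax(T, axis=0)                  # [0, 1, 2]

# argmax on the reversed view, re-indexed, returns the deepest such layer
T_r = T[::-1]
deepest = len(T_r) - np.argmax(T_r, axis=0) - 1    # [1, 2, 3]
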