예제 #1
def test_plot_wateruse(preprocessed_iwum_data, preprocessed_swuds_data,

    extent = test_data_path / 'extents/shellmound_bbox.shp'
    perioddata = test_data_path / 'shellmound/tables/stress_period_data.csv'
    preprocessed_iwum_data['geometry'] = [
        Point(x, y)
        for x, y in zip(preprocessed_iwum_data.x, preprocessed_iwum_data.y)
    iwum_data = cull_data_to_active_area(preprocessed_iwum_data,
    swuds_data = preprocessed_swuds_data.df
    swuds_data['geometry'] = [
        Point(x, y) for x, y in zip(swuds_data.x, swuds_data.y)
    swuds_data = cull_data_to_active_area(swuds_data,
    wel_files = sorted(
        glob.glob(str(test_data_path / 'shellmound/external/wel_*.dat')))
    wel_files = dict(zip(range(1, len(wel_files) + 1), wel_files))

    add_data = {
        'IWUM estimates': {
            'data': iwum_data
        'SWUDs data': {
            'data': swuds_data
    results = plot_wateruse(wel_files, perioddata, add_data)
    j = 2
예제 #2
    def apply_footprint(
        """Keep sites in the df pandas dataframe that fall
        into the passed bounding shapefile polygon. Requires
        that df dataframe has a Point geometry column as assigned
        in the reproject method.

        active_area: str
            path to shapefile with footprint for current analysis
        active_area_id_column : str, optional
            Column in active_area with feature ids.
            By default, None, in which case all features are used.
        active_area_feature_id : str, optional
            ID of feature to use for active area
            By default, None, in which case all features are used.
        outshp: str
            optional path to output shapefile with points within the footprint
        self.df = cull_data_to_active_area(self.df,
예제 #3
def preprocess_flows(
    """Preprocess stream flow observation data, for example, from NWIS or another data source that
    outputs time series in CSV format with site locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    data : csv file or DataFrame
        Time series of stream flow observations.

        ===================== ======================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        flow_data_columns     Columns of observed streamflow values
        flow_qualifier_column Optional column with qualifiers for flow values
        ===================== ======================================


        * x and y columns can alternatively be in the metadata table
        * flow_data_columns are denoted in `flow_data_columns`; multiple
          columns can be included to process base flow and total flow, or
          other statistics in tandem
        * For example, `flow_qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Stream flow observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================


        * other columns in metadata will be passed through to the metadata output

    flow_data_columns : list of strings
        Columns in data with flow values or their statistics.
        By default, ['q_cfs']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the observation site locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    flow_qualifier_column : str, optional
        Column name in data with flow observation qualifiers, such
        as "measured" or "estimated"
        by default 'category'
    default_qualifier : str, optional
        Default qualifier to populate flow_qualifier_column if it
        is None. By default, "measured"
    include_sites : list-like, optional
        Limit output to these sites.
        by default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to these sites, represented by line identifiers.
        by default, None (include all sites)
    source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the source data. By default, 'ft3'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the output (model). By default, 'm3'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:`~mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different than those listed above.
        For example, if the data file has a 'SITE_NO' column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the renames listed above will be used.
        Note that the renames must be the same as those listed above for
        :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_


    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
        df = data.copy()
    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} or {} found in {}. Need to specify a site_no_col or line_id_col".format(site_no_col,
                                                                                            line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {
        datetime_col: 'datetime',
        site_no_col: 'site_no',
        line_id_col: 'line_id',
        x_coord_col: 'x',
        y_coord_col: 'y',
        name_col: 'name',
        flow_qualifier_column: 'category'
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.Mapping):
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [
        c if c not in dest_columns else dest_columns[c]
        for c in flow_data_columns
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'],
        # the line-id column was already renamed to 'line_id' by
        # df.rename(columns=dest_columns) above, so index by the
        # standardized name (df[line_id_col] raises KeyError for any
        # non-default line_id_col)
        df['site_no'] = df['line_id']

    # read the source data
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError(
                'If metadata are supplied, both data and metadata must '
                'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'],
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units
    # ensure that flow values are numeric (may be objects if taken directly from NWIS)
    unit_conversion = (
        convert_volume_units(source_volume_units, dest_volume_units) /
        convert_time_units(source_time_units, dest_time_units))
    for flow_col in flow_data_columns:
        df[flow_col] = pd.to_numeric(df[flow_col],
                                     errors='coerce') * unit_conversion
    df.dropna(subset=flow_data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured)
    # with measured including values derived from baseflow separation or actual measurements)
    # output column name for flow qualifier column:
    dest_flow_qualifier_column = 'category'
    if flow_qualifier_column is not None:
        flow_qualifiers = {
            'calculated': 'measured',  # 'measured',
            'base flow separated from measured values':
            'measured',  # 'measured',
            'measured total flow': 'measured',
            'estimated gaged': 'estimated',
            'estimated ungaged': 'estimated'
        df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace(
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
            assert name not in unique_obsnames
            name = sn
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md,

    # data columns
    data_cols = ['site_no', 'line_id', 'datetime'
                 ] + flow_data_columns + ['category']
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #    df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']
    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile,
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
예제 #4
def preprocess_iwum_pumping(ncfile,
    """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output and
    assign open interval information, using raster surfaces of the
    top and bottom of an estimated production zone.

    ncfile : file path
        NetCDF output from Irrigation Water Use Model
    start_date : str
        Cull data before this date.
    end_date : str
        Cull data after this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    flux_variable : str
        Variable in ncfile for pumping fluxes. Fluxes are assumed to
        represent total volumes for each time period.
    nc_crs : obj
        Coordinate Reference System (CRS) of ncfile.
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to the pyproj.crs.from_user_input
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    nc_length_units : str, {'meters', 'ft', etc.}
        Length units of pumped volumes in ncfile
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    model_length_units : str, {'meters', 'ft', etc.}
        Length units of model.
    outfile : csv file for output table

    df : DataFrame
        Table of pumping rates in m3/day, location
        and open interval information.


        ============== ================================================
        site_no        index position of pumping rate in ncfile grid
        x              x-coordinate in `dest_crs`
        y              y-coordinate in `dest_crs`
        start_datetime start date of pumping period
        end_datetime   end date of pumping period
        screen_top     screen top elevation, in `model_length_units`
        screen_botm    screen bottom elevation, in `model_length_units`
        q              pumping rate, in model units
        geometry       shapely Point object representing location
        ============== ================================================

    * Time units are assumed to be days.
    * Fluxes are assumed to represent total volumes for each time period
      indicated by the differences between successive values along the time axis of ncfile.
    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    ds_x, ds_y = np.meshgrid(ds['x'], ds['y'])

    # original values are in m3, in each 1 mi2 cell
    # can leave in m3 if reassigning to 1km grid as point values
    length_conversion = convert_volume_units(nc_length_units,
                                             model_length_units) ** 3
    unit_suffix = vol_suffix[model_length_units] + 'd'
    flux_col = 'q'  # 'flux_{}'.format(unit_suffix)  # output field name for fluxes

    # get top/botm elevations
    est_screen_top = None
    est_screen_botm = None
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
        est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y,
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y,
        est_screen_botm *= surf_unit_conversion

        # in any places where screen top is less than the screen botm,
        # set both at the mean
        loc = est_screen_top < est_screen_botm
        means = np.mean([est_screen_top, est_screen_botm], axis=0)
        est_screen_top[loc] = means[loc]
        est_screen_botm[loc] = means[loc]
        print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} '
              f'locations where screen top was < screen bottom')

    dfs = []
    times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values)
    for n, period_start_date in enumerate(times):

        # for each time entry, get the data
        kwargs = {time_variable: period_start_date}
        arr = ds[flux_variable].sel(**kwargs).values

        # make sure pumping sign is  negative
        # based on assumption that values are mostly abstraction
        if arr.sum() > 0:
            arr *= -1

        # set up a dataframe
        data = {'site_no': np.arange(ds_x.size),
                'x': ds_x.ravel(),
                'y': ds_y.ravel(),
        if est_screen_top is not None and est_screen_botm is not None:
            data.update({'screen_top': est_screen_top.ravel(),
                         'screen_botm': est_screen_botm.ravel()
        df = pd.DataFrame(data)
        df['start_datetime'] = period_start_date

        # get the end_date, handling last entry
        if n + 1 < len(times):
            period_end_date = times[n + 1]
            # set end date for last period on previous period length
            last_start = dfs[-1]['start_datetime'].values[0]
            ndays = (pd.Timestamp(period_start_date) -
            period_end_date = period_start_date + pd.Timedelta(ndays, unit='d')

        # convert the time units
        ndays = (pd.Timestamp(period_end_date) -
        assert ndays > 0, "period_end_date {} is before period_start_date {}"\
            .format(period_end_date, period_start_date)
        time_conversion = 1 / ndays  # original quantities are volumes for the time period

        # time indexing in pandas is through last value
        period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d')
        df['end_datetime'] = period_end_date
        df[flux_col] = arr.ravel() * length_conversion * time_conversion

        # keep only negative fluxes (pumping; sign convention enforced above)
        df = df.loc[df[flux_col] < 0]

    df = pd.concat(dfs)

    # site number column (that would be unique from other integers from other data sources)
    df['site_no'] = [f'iwum_{node}' for node in df.site_no]

    # project the data to a destination crs, if provided
    # make a separate metadata dataframe with 1 row per location
    # to avoid redundant operations
    metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']]
    metadata.index = metadata['site_no']
    x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, dest_crs)
    metadata['x'], metadata['y'] = x_pr, y_pr
    metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull the data to the model area, if provided
    if active_area is not None:
        df, metadata = cull_data_to_active_area(df, active_area,
                                      data_crs=dest_crs, metadata=metadata)

    # update data with x,y values projected in metadata
    x = dict(zip(metadata.site_no, metadata.x))
    y = dict(zip(metadata.site_no, metadata.y))
    df['x'] = [x[sn] for sn in df.site_no]
    df['y'] = [y[sn] for sn in df.site_no]
    if outfile is not None:
        outfile = Path(outfile)
        df.to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # Make a plot of iwum output in mgal/day
        out_pdf_path = outfile.parent / 'plots'
        plot_iwum_output(ncfile, flux_variable=flux_variable, outpath=out_pdf_path)

    return df
예제 #5
def preprocess_te_wateruse(data,
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals;
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate to
      begin using the 2010 rates then (``start_date='2008'``). If no start or end
      date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the x, y locations in `data`.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    df_monthly : DataFrame

    * time units for TE data and model are assumed to be days

    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull sites to those within the Delta footprint
    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df,

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(
            estimated_production_surface_units, model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    # default the output time span to the first and last
    # start_datetime values present in the data
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        # was `.mmax()`, which raises AttributeError on a pandas Series
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = [
        'site_no', 'start_datetime', 'x', 'y', 'screen_top', 'screen_botm',
        'q', 'geometry'
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile,
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no',
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
예제 #6
def preprocess_headobs(
        # NOTE(review): this extract appears to have lost lines. Only one parameter
        # is visible here and the closing `):` is absent, although the body reads
        # every name documented below (data, metadata, dem, start_date, ...).
        # The docstring's closing delimiter is likewise not visible. Restore both
        # from the upstream source before using this function.
        head_data_columns=['head', 'last_head', 'head_std'],
    """Preprocess head observation data, for example, groundwater level data output from the
    `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length units are converted to those of the groundwater model. Open intervals for the wells are
      converted from depths to elevations
    * missing open intervals are filled based on well bottom depths (if available) and the median open
      interval length for the dataset.
    * Wells are categorized based on the quality of the open interval information (see the documentation
      for :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.

        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are in
          `head_data_columns`

    metadata : DataFrame
        Well (site) information, indexed by site number,
        e.g. as output from :func:`mapgwm.headobs.get_data`.

        Must have the following columns:

        ================= ==========================================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both
        the average and last head values for the stress period to be considered,
        as well as the variability of water levels contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead elevations.
        Any reprojection to dest_crs is handled automatically, assuming
        the DEM raster has CRS information embedded (arc-ascii grids do not!)
        By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269

    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.

    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`

    # work on a copy so the caller's `data` DataFrame is left untouched
    df = data.copy()
    # multiplier to convert input length units to model units
    # NOTE(review): the call below is cut off in this extract; its remaining
    # argument(s) are not visible.
    unit_conversion = convert_length_units(data_length_units,

    # outputs
    # All output paths are derived from `outfile`:
    #   <outfile>                     -> time series csv
    #   <outfile stem>_info.csv/.shp  -> per-well metadata
    #   open_interval_lengths.pdf     -> plot, written next to outfile
    # `filename` and `ext` are not used in the visible code.
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting date here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; drop the timestamps
    # (normalize() resets the time-of-day component to midnight)
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    # NOTE(review): the statement that consumes the f-string below (presumably
    # a print call) is missing from this extract.
        f'starting with {n_measurements:,d} measurements at {n_sites:,d} unique wells'
    # despite the name, this mask is True for measurements dated *before* stdate
    no_data_in_period = df.datetime < stdate

    if np.any(no_data_in_period):
        # mask of the measurements to keep (on or after the start date)
        in_period = df.datetime >= stdate
        # NOTE(review): the expression inside len( and the enclosing print call
        # are cut off in this extract.
        # NOTE(review): in_period.sum() is the number of measurements that are
        # *kept*, yet the message reports it as the number culled -- confirm
        # whether no_data_in_period.sum() was intended.
        n_sites_before = len(
            f'culling {in_period.sum():,d} measurements from {n_sites_before:,d} '
            f'sites that are prior to start date of {start_date}'))
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    # NOTE(review): mean() is applied to every column of df; recent pandas
    # releases raise on non-numeric columns unless numeric_only=True is
    # passed -- confirm against the pandas version in use.
    well_info = groups.mean().copy()
    # join on the site_no index; metadata columns that collide with the
    # averaged data columns receive a '_meta' suffix
    well_info = well_info.join(metadata, rsuffix='_meta')
    # period of record and number of measurements for each site
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    # NOTE(review): assumes `data` carries 'year' and 'month' columns; with the
    # default errors='raise', drop() fails with a KeyError if they are absent.
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    # replace the geographic coordinates with projected ones and a Point geometry
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    # NOTE(review): the call below is cut off in this extract. `md` is not
    # referenced again in the visible code, so as shown `well_info` itself is
    # not reduced to the active area here -- confirm intent.
    if active_area is not None:
        df, md = cull_data_to_active_area(df,

    # convert length units; convert screen tops and botms from depths to elevations
    # wells without a reported land-surface altitude get one sampled from the DEM
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        # NOTE(review): the call below is cut off in this extract.
        well_location_elevations = get_values_at_points(dem,
        # DEM values are converted from dem_units to model units here
        well_location_elevations *= convert_length_units(
            dem_units, model_length_units)
        # NOTE(review): the start of the assignment target on the next line is
        # missing from this extract.
                      'well_el'] = well_location_elevations[missing_elevations]

    # NOTE(review): 'well_el' is multiplied by unit_conversion for every well
    # below, including values just filled from the DEM that were already
    # converted to model units -- those would be scaled twice whenever
    # unit_conversion != 1.
    length_columns = ['well_el'
                      ] + head_data_columns + ['screen_top', 'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    # depths below land surface -> elevations
    # NOTE(review): 'well_depth' is not among length_columns, so it is
    # subtracted from the converted 'well_el' without being converted itself.
    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']

    # just the data, site numbers, times and aquifer
    # NOTE(review): the visible default for head_data_columns already contains
    # 'head_std', so this produces a duplicate entry; 'head_std' would then be
    # selected twice into df and visited twice by the conversion loop below.
    head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    # keep only the requested columns that are actually present in the data
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # #### trim down to only well_info with both estimated water levels and standard deviation
    # monthly measured levels may not have standard deviation
    # (as opposed to monthly statistical estimates)
    # only the mean head is required; the head_std criterion is disabled below
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    # NOTE(review): this data check is an assert, so it is skipped when Python
    # runs with -O.
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to well_info in well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        # NOTE(review): the format argument and closing parentheses of this
        # warning are cut off in this extract.
        warnings.warn('culling {} wells not found in metadata table!'.format(
        df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            # NOTE(review): the make_obsname call is cut off in this extract,
            # and lines between the assert and `name = sn` appear to be missing
            # (presumably an `else:` branch plus the code that records `name`
            # in unique_obsnames / obsnames). As shown, nothing is appended
            # to obsnames.
            name = make_obsname(sn,
            assert name not in unique_obsnames
            name = sn
    well_info['obsprefix'] = obsnames
    # propagate each site's prefix to its rows in the time series
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    # every well starts in the generic 'heads' group
    well_info['group'] = 'heads'
    # NOTE(review): the call below is cut off in this extract.
    well_info = assign_geographic_obsgroups(well_info,

    # save out the results
    if outfile is not None:
        # NOTE(review): the df2shp and first to_csv calls below are cut off in
        # this extract; their remaining arguments are not visible.
        df2shp(well_info.drop(['x', 'y'], axis=1),
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile,
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info