Example no. 1
def plot_wateruse(wel_files, perioddata, add_data=None,
                  wel_flux_col='q',
                  model_volume_units='$m^3$', model_time_units='day',
                  plot_volume_units='mgal', plot_time_units='day',
                  outfile=None):
    """

    Parameters
    ----------
    wel_files :
        A head line with column names is assumed. For example:
        #k,i,j,q,boundname

    perioddata :
    add_data :
    model_volume_units :
    model_time_units :
    plot_volume_units :
    plot_time_units :

    Returns
    -------

    """

    # read the stress period information
    if not isinstance(perioddata, pd.DataFrame):
        perioddata = pd.read_csv(perioddata)
    else:
        perioddata = perioddata.copy()
    perioddata.index = perioddata['per']

    dfs = []
    for i, f in wel_files.items():
        df = pd.read_csv(f, delim_whitespace=True)
        df.columns = [c.strip('#') for c in df.columns]
        df['per'] = i
        df['start_datetime'] = perioddata.loc[i, 'start_datetime']
        df['end_datetime'] = perioddata.loc[i, 'end_datetime']
        dfs.append(df)
    df = pd.concat(dfs)

    # sum the model pumping by stress period
    period_sums = df.groupby('per').first()
    period_sums[wel_flux_col] = df.groupby('per')[wel_flux_col].sum()
    # fill nan values (from any periods without wel files) with 0s
    period_sums = period_sums.reindex(range(period_sums.index.max() + 1))
    period_sums['start_datetime'] = perioddata['start_datetime']
    period_sums['end_datetime'] = perioddata['end_datetime']
    period_sums[wel_flux_col] = period_sums[wel_flux_col].fillna(0)
    period_sums.index = pd.to_datetime(period_sums['start_datetime'])
    period_sums['WEL package input'] = period_sums[wel_flux_col]
    period_sums = period_sums[['WEL package input', 'start_datetime', 'end_datetime']]

    # convert units
    model_vol_conv = convert_volume_units(model_volume_units, plot_volume_units)
    model_time_conv = convert_time_units(model_time_units, plot_time_units)
    model_conv = model_vol_conv * model_time_conv

    # plot any additional comparison data
    if add_data is not None:
        for label, items in add_data.items():
            # read the stress period information
            if not isinstance(items['data'], pd.DataFrame):
                items['data'] = pd.read_csv(items['data'])
            req_cols = {'q', 'start_datetime'}
            assert not req_cols.difference(items['data'].columns), \
                f"add_data: {label} data must have columns: {req_cols}"

            items['data']['start_datetime'] = pd.to_datetime(items['data']['start_datetime'])
            aux_period_sums = items['data'].groupby('start_datetime').first()
            aux_period_sums[label] = items['data'].groupby('start_datetime')['q'].sum()
            # fill nan values (from any periods without wel files) with 0s
            #aux_period_sums[label].fillna(0, inplace=True)
            aux_period_sums['start_datetime'] = aux_period_sums.index

            period_sums = period_sums.join(aux_period_sums[[label]], how='outer')

    # forward fill nan WEL values values
    # (where other times may have been inserted)
    period_sums['WEL package input'] = period_sums['WEL package input'].ffill()
    #period_sums = period_sums.resample('M').mean() #.ffill()

    # make a plot
    fig, ax = plt.subplots(figsize=(11, 8.5))
    ax = period_sums.plot(ax=ax)
    units_text = f'{model_volume_units}/{model_time_units}'
    ax.set_ylabel(f'Pumpage, in {units_text}')
    ax.set_xlabel('')

    # second axis with another volume unit
    def second_axis_conversion(x):
        return x * model_conv

    def second_axis_conversion_r(x):
        return x * 1 / model_conv
    ax2 = ax.secondary_yaxis('right', functions=(second_axis_conversion,
                                                 second_axis_conversion_r))
    ax2.set_ylabel(f'Pumpage, in {plot_volume_units}/{plot_time_units}')
    #format_xtick_labels(period_sums, ax, maxlabels=30, date_format='%Y-%m-%d')
    h, l = ax.get_legend_handles_labels()
    means = (period_sums.mean(axis=0) * model_conv).to_dict()
    plot_units_text = f'{plot_volume_units}/{plot_time_units}'
    labels_with_means = []
    for label in l:
        new_label = label
        if label in means:
            new_label += f' (mean: {means[label]:g} {plot_units_text})'
        labels_with_means.append(new_label)
    ax.legend(h, labels_with_means)

    if outfile is not None:
        Path(outfile).parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(outfile)
        plt.close()
        print(f'wrote {outfile}')
    else:
        return ax
Example no. 2
def preprocess_flows(
    data,
    metadata=None,
    flow_data_columns=['flow'],
    start_date=None,
    active_area=None,
    active_area_id_column=None,
    active_area_feature_id=None,
    source_crs=4269,
    dest_crs=5070,
    datetime_col='datetime',
    site_no_col='site_no',
    line_id_col='line_id',
    x_coord_col='x',
    y_coord_col='y',
    name_col='name',
    flow_qualifier_column=None,
    default_qualifier='measured',
    include_sites=None,
    include_line_ids=None,
    source_volume_units='ft3',
    source_time_units='s',
    dest_volume_units='m3',
    dest_time_units='d',
    geographic_groups=None,
    geographic_groups_col=None,
    max_obsname_len=None,
    add_leading_zeros_to_sw_site_nos=False,
    column_renames=None,
    outfile=None,
):
    """Preprocess stream flow observation data, for example, from NWIS or another data source that
    outputs time series in CSV format with site locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : csv file or DataFrame
        Time series of stream flow observations.
        Columns:

        ===================== ======================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        flow_data_columns     Columns of observed streamflow values
        flow_qualifier_column Optional column with qualifiers for flow values
        ===================== ======================================

        Notes:

        * x and y columns can alternatively be in the metadata table
        * flow_data_columns are denoted in `flow_data_columns`; multiple
          columns can be included to process base flow and total flow, or
          other statistics in tandem
        * For example, `flow_qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Stream flow observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================

        Notes:

        * other columns in metadata will be passed through to the metadata output

    flow_data_columns : list of strings
        Columns in data with flow values or their statistics.
        By default, ['flow']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    flow_qualifier_column : str, optional
        Column name in data with flow observation qualifiers, such
        as "measured" or "estimated".
        By default, None, in which case all flows are assigned the
        `default_qualifier`.
    default_qualifier : str, optional
        Default qualifier to populate flow_qualifier_column if it
        is None. By default, "measured"
    include_sites : list-like, optional
        Limit output to these sites.
        by default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to the sites associated with these line identifiers.
        by default, None (include all sites)
    source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the source data. By default, 'ft3'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the output (model). By default, 'm3'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:~`mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different than those listed above.
        For example, if the data file has a 'SITE_NO' column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        By default None, in which case the default column names listed above are used.
        Note that the renamed columns must match the names listed above for
        :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_

    Notes
    -----

    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()
    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} nor {} found in {}. Need to specify a site_no_col or line_id_col".format(
            site_no_col, line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {
        datetime_col: 'datetime',
        site_no_col: 'site_no',
        line_id_col: 'line_id',
        x_coord_col: 'x',
        y_coord_col: 'y',
        name_col: 'name',
        flow_qualifier_column: 'category'
    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.abc.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [
        c if c not in dest_columns else dest_columns[c]
        for c in flow_data_columns
    ]
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df['line_id']

    # read the source data
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError(
                'If metadata are supplied, both data and metadata must '
                'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,
                                          active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=md)

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units
    # ensure that flow values are numeric (may be objects if taken directly from NWIS)
    unit_conversion = (
        convert_volume_units(source_volume_units, dest_volume_units) /
        convert_time_units(source_time_units, dest_time_units))
    for flow_col in flow_data_columns:
        df[flow_col] = pd.to_numeric(df[flow_col],
                                     errors='coerce') * unit_conversion
    df.dropna(subset=flow_data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured)
    # with measured including values derived from baseflow separation or actual measurements)
    # output column name for flow qualifier column:
    dest_flow_qualifier_column = 'category'
    if flow_qualifier_column is not None:
        flow_qualifiers = {
            'calculated': 'measured',  # 'measured',
            'base flow separated from measured values':
            'measured',  # 'measured',
            'measured total flow': 'measured',
            'estimated gaged': 'estimated',
            'estimated ungaged': 'estimated'
        }
        df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace(
            flow_qualifiers)
    else:
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md,
                                     geographic_groups,
                                     geographic_groups_col,
                                     metadata_crs=dest_crs)

    # data columns
    data_cols = ['site_no', 'line_id', 'datetime'
                 ] + flow_data_columns + ['category']
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #    df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']
    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                           index=False,
                                           float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
Example no. 3
    def assign_monthly_production(self, outfile='processed_swuds.csv'):
        """ Assign production wells for water use, skipping IR (irrigation) and
        TE (thermal electric) to production zones.  If production zones are not
        assigned or if the well bottom doesn't fall into a production zone, then
        the screen_top and screen_bot are assigned using well_depth and the
        default screen length.

        Production is given in cubic m per day.
        todo:  add unit conversion parameter so other units can be used?

        Parameters
        ----------
        outfile: str
            path to final processed monthly water-use file with production zone
            information 
        """

        # fill in missing monthly values with annual value
        for c in self.monthly_cols:
            idx = self.df.loc[self.df[c].isnull()].index.values
            self.df.loc[idx, c] = self.df.loc[idx, 'ANNUAL_VAL']

        # pull out groundwater sites that are not IR, AQ or TE
        self.df = self.df.loc[(self.df['WATER_CD'] == 'GW')
                              & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'IR')
                              & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'AQ')
                              & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'TE')]

        # reshape dataframe to have monthly values in same column
        stacked = pd.DataFrame(self.df[self.monthly_cols].stack())
        stacked.reset_index(inplace=True)
        stacked.rename(columns={'level_1': 'month', 0: 'q_monthly'},
                       inplace=True)
        stacked.index = stacked.level_0
        stacked = stacked.join(self.df)
        keep_cols = [c for c in stacked.columns if c not in self.monthly_cols]
        stacked = stacked[keep_cols]
        month = {name: i + 1 for i, name in enumerate(self.monthly_cols)}
        dates = [
            '{}-{:02d}'.format(year, month[month_column_name])
            for year, month_column_name in zip(stacked.YEAR, stacked.month)
        ]
        stacked['datetime'] = pd.to_datetime(dates)
        stacked.sort_values(by=['SITE_NO', 'datetime'], inplace=True)

        # set start and end dates if not already set
        if self.start_date is None:
            self.start_date = stacked.datetime.min()
        if self.end_date is None:
            self.end_date = stacked.datetime.max()

        groups = stacked.groupby('SITE_NO')
        all_groups = []
        for site_no, group in groups:
            group = group.copy()
            group.index = pd.to_datetime(group['datetime'])
            start_date = pd.Timestamp(self.start_date)
            end_date = pd.Timestamp(self.end_date)

            monthly_values_2010 = group.loc[group.datetime.dt.year == 2010]
            monthly_values_2010 = dict(
                zip(monthly_values_2010.datetime.dt.month,
                    monthly_values_2010.q_monthly))
            avg_monthly_values = group.groupby(
                group.index.month).mean().q_monthly.to_dict()
            q_mean = group.q_monthly.mean()

            # reindex the site data to include all months for simulation period
            all_dates = pd.date_range(start_date, end_date, freq='MS')
            group = group.reindex(all_dates)
            # fill empty dates
            q = []
            for month, q_monthly in zip(group.index.month, group.q_monthly):
                # try to use 2010 values if they exist
                if np.isnan(q_monthly):
                    q_monthly = monthly_values_2010.get(month, np.nan)
                # otherwise take the average value for each month
                if np.isnan(q_monthly):
                    q_monthly = avg_monthly_values[month]
                # fill missing months with the mean value for the site
                if np.isnan(q_monthly):
                    q_monthly = q_mean
                q.append(q_monthly)

            # assume most values represent abstraction
            # if sum is positive, invert so that output values are negative
            if np.sum(q) > 0:
                q = -np.array(q)
            group['q'] = q
            #group['q'] = group['q'] * 3785.4  # convert from mgd to cubic m per d
            group['q'] = group['q'] * convert_volume_units(
                self.data_volume_units, self.model_length_units)

            group['site_no'] = f'swuds_{site_no}'
            group['well_elev'] = self.well_elevations[site_no]
            group['depth'] = self.depths[site_no]
            well_botm_depth = self.well_elevations[site_no] - self.depths[
                site_no]
            group['x'] = np.nanmin(group['x'])
            group['y'] = np.nanmin(group['y'])

            # assign a production zone from default dict.  If the bottom of the
            # well does not fall in a zone, or if the dictionary is empty; then
            # the production zone is assigned 'unnamed'
            production_zone = 'unnamed'
            for prod_name in self.prod_zone_top.keys():
                prod_zone_top = self.prod_zone_top[prod_name][site_no]
                prod_zone_bot = self.prod_zone_bot[prod_name][site_no]
                if np.isnan(prod_zone_top) or np.isnan(
                        prod_zone_bot):  # missing zone
                    group['screen_bot'] = self.well_elevations[
                        site_no] - self.depths[site_no]
                    group['screen_top'] = self.well_elevations[
                        site_no] - self.depths[
                            site_no] + self.default_screen_len
                    group['open_int_method'] = 'well depth'
                else:
                    if well_botm_depth < prod_zone_top and well_botm_depth > prod_zone_bot:
                        production_zone = prod_name
                        group['screen_bot'] = prod_zone_bot
                        group['screen_top'] = prod_zone_top
                        group['open_int_method'] = 'production zone'
                    else:
                        group['screen_bot'] = self.well_elevations[
                            site_no] - self.depths[site_no]
                        group['screen_top'] = self.well_elevations[
                            site_no] - self.depths[
                                site_no] + self.default_screen_len
                        group['open_int_method'] = 'well depth'
            group['production_zone'] = production_zone

            # add aquifer name
            group['aquifer_name'] = self.aquifer_names.get(
                group["FROM_AQFR_CD"].values[0], 'unnamed')

            cols = [
                'site_no', 'q', 'q_monthly', 'month', 'well_elev', 'depth',
                'screen_bot', 'screen_top', 'x', 'y'
            ]
            all_groups.append(group[cols])

        self.df = pd.concat(all_groups)
        self.df[
            'start_datetime'] = self.df.index  # start date of each pumping period
        if outfile is not None:
            outfile = Path(outfile)
            self.df.to_csv(outfile, index=False)
            print(
                'processed SWUDS data written to {0} and in dataframe attribute'
                .format(outfile))
            self.df['geometry'] = [
                Point(x, y) for x, y in zip(self.df.x, self.df.y)
            ]
            # write only unique pumping values to shapefile
            to_shapefile = self.df.groupby(['site_no',
                                            'q']).first().reset_index()
            shapefile = outfile.with_suffix('.shp')
            df2shp(to_shapefile, shapefile, crs=self.dest_crs)
Example no. 4
def test_convert_volume_units():
    assert np.allclose(convert_volume_units('cubic meters', 'mgal'),
                       264.172 / 1e6)
    assert np.allclose(convert_volume_units('$m^3$', '$ft^3$'), 35.3147)
    assert np.allclose(convert_volume_units('cubic meters', 'cubic feet'),
                       35.3147)
    assert np.allclose(convert_volume_units('cubic feet', 'cubic meters'),
                       0.0283168)
    assert np.allclose(convert_volume_units('meters', 'feet'), 35.3147)
    assert np.allclose(convert_volume_units('feet', 'meters'), 0.0283168)
    assert np.allclose(convert_volume_units('feet3', 'm3'), 0.0283168)
    assert np.allclose(convert_volume_units('feet3', 'meters3'), 0.0283168)
    assert np.allclose(convert_volume_units('gallons', 'ft3'), 1 / 7.48052)
    assert np.allclose(convert_volume_units('gallons', 'm3'),
                       (.3048**3) / 7.48052)
    assert np.allclose(convert_volume_units('gallons', 'acre foot'),
                       1 / 7.48052 / 43560)
    assert np.allclose(convert_volume_units('gallons', 'af'),
                       1 / 7.48052 / 43560)
    assert np.allclose(convert_volume_units('gallons', 'acre-ft'),
                       1 / 7.48052 / 43560)
    assert np.allclose(convert_volume_units('mgal', 'acre-ft'),
                       1e6 / 7.48052 / 43560)
    assert np.allclose(convert_volume_units('liters', 'gallon'), 1 / 3.78541)
    assert np.allclose(convert_volume_units(None, 'cubic feet'), 1.)
    assert np.allclose(convert_volume_units('cubic feet', None), 1.)
    assert np.allclose(convert_volume_units('junk', 'junk'), 1.)
Example no. 5
def resample_pumping_rates(wu_file, wu_points, model,
                           active_area=None,
                           minimum_layer_thickness=2,
                           drop_ids=None,
                           exclude_steady_state=True,
                           dropna=False, na_fill_value=0.,
                           verbose=False):
    """Read water use data from a master file generated from
    WDNR_wu_data.ipynb. Cull data to area of model. Convert
    from monthly gallons to daily averages in m3/d
    for model stress periods.

    Parameters
    ----------
    wu_file : csv file
        Water use data output from the WDNR_wu_data.ipynb.
    wu_points : point shapefile
        Water use locations, generated in the WDNR_wu_data.ipynb
        Must be in same CRS as sr.
    model : flopy.modflow.Modflow instance
        Must have a valid attached .sr attribute defining the model grid.
        Only wells within the bounds of the sr will be retained.
        Sr is also used for row/column lookup.
        Must be in same CRS as wu_points.
    active_area : str (shapefile path) or shapely.geometry.Polygon
        Polygon denoting active area of the model. If specified,
        wells are culled to this area instead of the model bounding box.
        (default None)
    exclude_steady_state : bool
        Exclude steady-state stress periods from resampled output.
        (default True)
    minimum_layer_thickness : scalar
        Minimum layer thickness to have pumping.
    dropna : bool
        Flag to drop times (stress periods) where there is no data for a well
    na_fill_value : float
        If dropna == False, fill missing times (stress periods) with this value.

    Returns
    -------
    wu_data : DataFrame

    """
    assert not np.isnan(na_fill_value), "na_fill_value must be a number!"

    well_info, monthly_data = read_wdnr_monthly_water_use(wu_file,
                                                          wu_points,
                                                          model,
                                                          drop_ids=drop_ids,
                                                          active_area=active_area,
                                                          minimum_layer_thickness=minimum_layer_thickness)
    print('\nResampling pumping rates in {} to model stress periods...'.format(wu_file))
    if dropna:
        print('    wells with no data for a stress period will be dropped from that stress period.')
    else:
        print('    wells with no data for a stress period will be assigned {} pumping rates.'.format(na_fill_value))
    if exclude_steady_state:
        perioddata = model.perioddata.loc[~model.perioddata.steady].copy()
    else:
        perioddata = model.perioddata.copy()

    t0 = time.time()
    # reindex the record at each site to the model stress periods
    dfs = []
    for site, sitedata in monthly_data.groupby('site_no'):
        if site not in well_info.index:
            continue
        sitedata.index = sitedata.datetime
        assert not sitedata.index.duplicated().any()

        if dropna:
            site_period_data = sitedata.reindex(perioddata.start_datetime).dropna(axis=1)
        else:
            site_period_data = sitedata.reindex(perioddata.start_datetime, fill_value=na_fill_value)
            # rows inserted by reindex have all columns set to the fill value
            isna = site_period_data['site_no'] == na_fill_value
            if np.any(isna):
                if verbose:
                    years = sorted(set(site_period_data.index.year[isna.values]))
                    years = ', '.join(str(y) for y in years)
                    print('Site {} has {} times with nans (in years {}) - filling with {}s'.format(
                        site, np.sum(isna), years, na_fill_value))
            site_period_data['site_no'] = site
            site_period_data['year'] = site_period_data.index.year
            site_period_data['month'] = site_period_data.index.month
            site_period_data['datetime'] = site_period_data.index
        assert not site_period_data.isna().any().any()
        site_period_data.index = perioddata.index

        # copy stress periods and lengths from master stress period table
        for col in ['perlen', 'per']:
            site_period_data[col] = perioddata[col]

        # convert units from monthly gallon totals to daily model length units
        site_period_data['gal_d'] = site_period_data['gallons'] / site_period_data['perlen']
        gal_to_model_units = convert_volume_units('gal', get_model_length_units(model))
        site_period_data['q'] = site_period_data.gal_d * gal_to_model_units
        for col in ['i', 'j', 'k']:
            site_period_data[col] = well_info.loc[site, col]
        site_period_data.index = [site] * len(site_period_data)
        dfs.append(site_period_data[['k', 'i', 'j', 'q', 'per']])
    wel_data = pd.concat(dfs)
    # water use fluxes should be negative
    if not wel_data.q.max() <= 0:
        wel_data.loc[wel_data.q.abs() != 0., 'q'] *= -1
    wel_data['boundname'] = ['site{:d}'.format(s) for s in wel_data.index]
    assert not np.any(wel_data.isna()), "Nans in Well Data"
    print("took {:.2f}s\n".format(time.time() - t0))
    return wel_data
Example no. 6
def get_mean_pumping_rates(wu_file, wu_points, model,
                           start_date='2012-01-01', end_date='2018-12-31',
                           period_stats={0: 'mean'},
                           active_area=None,
                           drop_ids=None,
                           minimum_layer_thickness=2):
    """Read water use data from a master file generated from
    WDNR_wu_data.ipynb. Cull data to area of model. Convert
    from monthly gallons to daily averages in m3/d
    for model stress periods.

    Parameters
    ----------
    wu_file : csv file
        Water use data output from the WDNR_wu_data.ipynb.
    wu_points : point shapefile
        Water use locations, generated in the WDNR_wu_data.ipynb
        Must be in same CRS as sr.
    model : flopy.modflow.Modflow instance
        Must have a valid attached .sr attribute defining the model grid.
        Only wells within the bounds of the sr will be retained.
        Sr is also used for row/column lookup.
        Must be in same CRS as wu_points.
    start_date : str (YYYY-MM-dd)
        Start date of time period to average.
    end_date : str (YYYY-MM-dd)
        End date of time period to average.
    period_stats : dict
        Dictionary of statistics keyed by stress period. Statistics include zero values, unless noted.
        keys : 0, 1, 2 ...
        values : str or list; statistic to apply for each stress period
            'mean': mean pumping for the period defined by start_date and end_date
            '<month>': average for a month of the year (e.g. 'august'),
            for the period defined by start_date and end_date
            [stat, start_date, end_date]: apply the statistic to a custom
            period defined by the supplied start and end dates
    minimum_layer_thickness : scalar
        Minimum layer thickness to have pumping.

    Returns
    -------
    wu_data : DataFrame

    """
    start_date, end_date = pd.Timestamp(start_date), pd.Timestamp(end_date)
    well_info, monthly_data = read_wdnr_monthly_water_use(wu_file, wu_points, model,
                                                          active_area=active_area,
                                                          drop_ids=drop_ids,
                                                          minimum_layer_thickness=minimum_layer_thickness)
    if well_info is None:
        return
    # determine period for computing average pumping
    # make a dataframe for each stress period listed
    wel_data = []
    for per, stat in period_stats.items():

        if isinstance(stat, str):
            stat = stat.lower()
        elif isinstance(stat, list):
            stat, start_date, end_date = stat
            start_date, end_date = pd.Timestamp(start_date), pd.Timestamp(end_date)
            stat = stat.lower()
        # slice the monthly values to the period of start_date, end_date
        # aggregate to mean values in m3/d
        # (this section will need some work for generalized transient run setup)
        is_inperiod = (monthly_data.datetime > start_date) & (monthly_data.datetime < end_date)
        inperiod = monthly_data.loc[is_inperiod].copy()

        # compute average daily flux using the sum and number of days for each site
        # (otherwise each month is weighted equally)
        # convert units from monthly gallons to daily gallons
        inperiod['days'] = inperiod.datetime.dt.daysinmonth

        if stat == 'mean':
            period_data = inperiod.copy()
        # mean for given month (e.g. august mean)
        elif stat in months.keys() or stat in months.values():
            period_data = inperiod.loc[inperiod.month == months.get(stat, stat)].copy()
        else:
            raise ValueError('Unrecognized input for stat: {}'.format(stat))

        site_means = period_data.groupby('site_no').mean()
        site_sums = period_data.groupby('site_no').sum()
        site_means['gal_d'] = site_sums['gallons'] / site_sums['days']
        # conversion to model units is based on lenuni variable in DIS package
        gal_to_model_units = convert_volume_units('gal', get_model_length_units(model))
        site_means['q'] = site_means.gal_d * gal_to_model_units
        site_means['per'] = per

        wel_data.append(well_info[['k', 'i', 'j']].join(site_means[['q', 'per']], how='inner'))

    wel_data = pd.concat(wel_data, axis=0)
    # water use fluxes should be negative
    if not wel_data.q.max() <= 0:
        wel_data.loc[wel_data.q.abs() != 0., 'q'] *= -1
    wel_data['boundname'] = ['site{:d}'.format(s) for s in wel_data.index]
    assert not np.any(wel_data.isna()), "Nans in Well Data"
    return wel_data
Example no. 7
def preprocess_iwum_pumping(ncfile,
                            start_date=None,
                            end_date=None,
                            active_area=None,
                            active_area_id_column=None,
                            active_area_feature_id=None,
                            estimated_production_zone_top=None,
                            estimated_production_zone_botm=None,
                            flux_variable='value',
                            nc_crs=5070,
                            dest_crs=5070,
                            nc_length_units='meters',
                            estimated_production_surface_units='meters',
                            model_length_units='meters',
                            outfile=None):
    """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output and
    assign open interval information, using raster surfaces of the
    top and bottom of an estimated production zone.

    Parameters
    ----------
    ncfile : file path
        NetCDF output from Irrigation Water Use Model
    start_date : str
        Cull data before this date.
    end_date : str
        Cull data after this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    flux_variable : str
        Variable in ncfile for pumping fluxes. Fluxes are assumed to
        represent total volumes for each time period.
    nc_crs : obj
        Coordinate Reference System (CRS) of ncfile.
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to the pyproj.crs.from_user_input
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    nc_length_units : str, {'meters', 'ft', etc.}
        Length units of pumped volumes in ncfile
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    model_length_units : str, {'meters', 'ft', etc.}
        Length units of model.
    outfile : csv file for output table

    Returns
    -------
    df : DataFrame
        Table of pumping rates in m3/day, location
        and open interval information.

        Columns:

        ============== ================================================
        site_no        index position of pumping rate in ncfile grid
        x              x-coordinate in `dest_crs`
        y              y-coordinate in `dest_crs`
        start_datetime start date of pumping period
        end_datetime   end date of pumping period
        screen_top     screen top elevation, in `model_length_units`
        screen_botm    screen bottom elevation, in `model_length_units`
        q              pumping rate, in model units
        geometry       shapely Point object representing location
        ============== ================================================

    Notes
    -----
    * Time units are assumed to be days.
    * Fluxes are assumed to represent total volumes for each time period
      indicated by the differences between successive values along the time axis of ncfile.
    """
    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    ds_x, ds_y = np.meshgrid(ds['x'], ds['y'])

    # original values are in m3, in each 1 mi2 cell
    # can leave in m3 if reassigning to 1km grid as point values
    # cube the length conversion factor to convert the gridded volumes
    length_conversion = convert_length_units(nc_length_units,
                                             model_length_units) ** 3
    unit_suffix = vol_suffix[model_length_units] + 'd'
    flux_col = 'q'  # 'flux_{}'.format(unit_suffix)  # output field name for fluxes

    # get top/botm elevations
    est_screen_top = None
    est_screen_botm = None
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y,
                                                points_crs=nc_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y,
                                                 points_crs=nc_crs)
        est_screen_botm *= surf_unit_conversion

        # in any places where screen top is less than the screen botm,
        # set both at the mean
        loc = est_screen_top < est_screen_botm
        means = np.mean([est_screen_top, est_screen_botm], axis=0)
        est_screen_top[loc] = means[loc]
        est_screen_botm[loc] = means[loc]
        print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} '
              f'locations where screen top was < screen bottom')

    dfs = []
    times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values)
    for n, period_start_date in enumerate(times):

        # for each time entry, get the data
        kwargs = {time_variable: period_start_date}
        arr = ds[flux_variable].sel(**kwargs).values

        # make sure the pumping sign is negative,
        # based on the assumption that values are mostly abstraction
        if arr.sum() > 0:
            arr *= -1

        # set up a dataframe
        data = {'site_no': np.arange(ds_x.size),
                'x': ds_x.ravel(),
                'y': ds_y.ravel(),
                 }
        if est_screen_top is not None and est_screen_botm is not None:
            data.update({'screen_top': est_screen_top.ravel(),
                         'screen_botm': est_screen_botm.ravel()
                         }
                        )
        df = pd.DataFrame(data)
        df['start_datetime'] = period_start_date

        # get the end_date, handling last entry
        if n + 1 < len(times):
            period_end_date = times[n + 1]
        else:
            # set end date for the last period based on the previous period length
            last_start = dfs[-1]['start_datetime'].values[0]
            ndays = (pd.Timestamp(period_start_date) -
                     pd.Timestamp(last_start)).days
            period_end_date = period_start_date + pd.Timedelta(ndays, unit='d')

        # convert the time units
        ndays = (pd.Timestamp(period_end_date) -
                 pd.Timestamp(period_start_date)).days
        assert ndays > 0, "period_end_date {} is before period_start_date {}"\
            .format(period_end_date, period_start_date)
        time_conversion = 1 / ndays  # original quantities are volumes for the time period

        # time indexing in pandas is through last value
        period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d')
        df['end_datetime'] = period_end_date
        df[flux_col] = arr.ravel() * length_conversion * time_conversion

        # only include cells with pumping (negative fluxes)
        df = df.loc[df[flux_col] < 0]

        dfs.append(df)
    df = pd.concat(dfs)

    # site number column (that would be unique from other integers from other data sources)
    df['site_no'] = [f'iwum_{node}' for node in df.site_no]

    # project the data to a destination crs, if provided
    # make a separate metadata dataframe with 1 row per location
    # to avoid redundant operations
    metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']]
    metadata.index = metadata['site_no']
    x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, dest_crs)
    metadata['x'], metadata['y'] = x_pr, y_pr
    metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull the data to the model area, if provided
    if active_area is not None:
        df, metadata = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs, metadata=metadata)

    # update data with x,y values projected in metadata
    x = dict(zip(metadata.site_no, metadata.x))
    y = dict(zip(metadata.site_no, metadata.y))
    df['x'] = [x[sn] for sn in df.site_no]
    df['y'] = [y[sn] for sn in df.site_no]
    if outfile is not None:
        outfile = Path(outfile)
        df.to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # Make a plot of iwum output in mgal/day
        out_pdf_path = outfile.parent / 'plots'
        out_pdf_path.mkdir(exist_ok=True)
        plot_iwum_output(ncfile, flux_variable=flux_variable, outpath=out_pdf_path)

    return df
Example no. 8
def plot_iwum_output(ncfile, flux_variable='value', outpath='.'):
    """Make a plot of iwum output in mgal/day
    for comparison with subsequent datasets.
    """

    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    xydims = tuple(i for i, dim_len in enumerate(ds[flux_variable].shape)
                   if dim_len != ds[time_variable].shape[0])
    ts = ds[flux_variable][:, :, :].sum(axis=xydims).to_pandas()
    if ts.index.dtype == object:
        ts.index = pd.to_datetime(ts.index)

    ndays = pd.to_timedelta(np.diff(ts.index)).days.tolist()
    ndays.append(ndays[-1])  # pad the last time period
    df = pd.DataFrame(ts, columns=['m3'])
    df['m3d'] = df['m3'] / ndays # convert volumes to daily rate

    fig, ax = plt.subplots(figsize=(11, 8.5))
    ax = df['m3d'].plot.bar(ax=ax)
    ax.set_ylabel('Cubic meters per day')
    ymin, ymax = ax.get_ylim()
    ax2 = ax.twinx()
    to_mg = convert_volume_units('m3', 'mgal')
    ax2.set_ylim(ymin * to_mg, ymax * to_mg)
    ax2.set_ylabel('Million gallons per day')

    # can't use .mean(),
    # because periods with 0 pumping may not be included
    mean_mgd = df['m3'].sum() * to_mg / np.sum(ndays)
    ax2.axhline(mean_mgd, c='r')
    ax2.text(0.75, 0.9, 'Mean: {:,.0f} mgal/day'.format(mean_mgd),
             transform=ax.transAxes)

    # format the tick labels
    format_xtick_labels(df, ax, maxlabels=30, date_format='%Y-%m-%d')
    #maxlabels = 30
    #xticklabels = df.index.strftime('%Y-%m-%d').tolist()
    #stride = max(int(np.floor(len(xticklabels) / maxlabels)), 1)
    #formatted_labels = []
    #for label in xticklabels[::stride]:
    #    formatted_labels += [label] + [''] * (stride - 1)
    #formatted_labels = formatted_labels[:len(xticklabels)]
    #junk = ax.set_xticklabels(formatted_labels)

    # record the file name and last modified date
    ftime = pd.Timestamp(os.path.getmtime(ncfile), unit='s')
    ax2.text(0.02, 0.98, '{}\n{}'.format(ncfile,
                                         ftime.strftime('%Y-%m-%d')),
             va='top', fontsize=8,
             transform=ax.transAxes)

    # annotate the bars with the values
    for i, p in enumerate(ax.patches):
        value = '{:,.0f}'.format(to_mg * p.get_height())
        ax.annotate(value, (p.get_x() * 1.01, p.get_height() * 1.01),
                    ha='center', fontsize=8)
    ncfile = Path(ncfile)
    ftime = pd.Timestamp(ncfile.stat().st_mtime, unit='s')
    outfile = Path(outpath, f'{ncfile.name}_{ftime:%Y-%m-%d}.pdf')
    plt.savefig(outfile)
    print('wrote {}'.format(outfile))
    plt.close()
Example no. 9
def preprocess_te_wateruse(data,
                           start_date=None,
                           end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269,
                           dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate to
      begin using the 2010 rates then (``start_date='2008'``). If no start or end
      date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame
        Preprocessed thermoelectric pumping data, reindexed to
        continuous monthly values.

    Notes
    -----
    * time units for TE data and model are assumed to be days

    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df,
                                      active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(
            estimated_production_surface_units, model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
                                              x,
                                              y,
                                              points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
                                               x,
                                               y,
                                               points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = [
        'site_no', 'start_datetime', 'x', 'y', 'screen_top', 'screen_botm',
        'q', 'geometry'
    ]
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile,
                                                   index=False,
                                                   float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no',
                                           'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly