def test_plot_wateruse(preprocessed_iwum_data, preprocessed_swuds_data,
                       test_data_path):
    extent = test_data_path / 'extents/shellmound_bbox.shp'
    perioddata = test_data_path / 'shellmound/tables/stress_period_data.csv'
    preprocessed_iwum_data['geometry'] = [Point(x, y) for x, y in
                                          zip(preprocessed_iwum_data.x,
                                              preprocessed_iwum_data.y)]
    iwum_data = cull_data_to_active_area(preprocessed_iwum_data,
                                         active_area=extent,
                                         data_crs=5070)
    swuds_data = preprocessed_swuds_data.df
    swuds_data['geometry'] = [Point(x, y) for x, y in
                              zip(swuds_data.x, swuds_data.y)]
    swuds_data = cull_data_to_active_area(swuds_data,
                                          active_area=extent,
                                          data_crs=5070)
    wel_files = sorted(glob.glob(str(test_data_path / 'shellmound/external/wel_*.dat')))
    wel_files = dict(zip(range(1, len(wel_files) + 1), wel_files))
    add_data = {'IWUM estimates': {'data': iwum_data},
                'SWUDs data': {'data': swuds_data}}
    results = plot_wateruse(wel_files, perioddata, add_data)
def apply_footprint(self, active_area,
                    active_area_id_column=None,
                    active_area_feature_id=None):
    """Keep sites in the df pandas DataFrame that fall within the
    polygon(s) of the passed bounding shapefile. Requires that the df
    DataFrame has a Point geometry column, as assigned in the reproject
    method.

    Parameters
    ----------
    active_area : str
        Path to shapefile with footprint for current analysis.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area.
        By default, None, in which case all features are used.
    """
    self.df = cull_data_to_active_area(self.df, active_area,
                                       active_area_id_column,
                                       active_area_feature_id,
                                       data_crs=self.dest_crs)
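# Illustrative usage sketch for ``apply_footprint`` (not part of the module):
# assumes a preprocessor instance with reprojected point geometries already in
# its ``df`` attribute (the owning class is not shown in this excerpt); the
# shapefile path is hypothetical.
def _example_apply_footprint(preprocessor):  # pragma: no cover
    # cull the sites to a model footprint polygon
    preprocessor.apply_footprint('../source_data/extents/model_footprint.shp')
    # df now only contains sites within the footprint
    print(f'{len(preprocessor.df):,} sites within the footprint')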
def preprocess_flows(data, metadata=None, flow_data_columns=['flow'],
                     start_date=None, active_area=None,
                     active_area_id_column=None,
                     active_area_feature_id=None,
                     source_crs=4269, dest_crs=5070,
                     datetime_col='datetime',
                     site_no_col='site_no',
                     line_id_col='line_id',
                     x_coord_col='x',
                     y_coord_col='y',
                     name_col='name',
                     flow_qualifier_column=None,
                     default_qualifier='measured',
                     include_sites=None,
                     include_line_ids=None,
                     source_volume_units='ft3',
                     source_time_units='s',
                     dest_volume_units='m3',
                     dest_time_units='d',
                     geographic_groups=None,
                     geographic_groups_col=None,
                     max_obsname_len=None,
                     add_leading_zeros_to_sw_site_nos=False,
                     column_renames=None,
                     outfile=None):
    """Preprocess stream flow observation data, for example, from NWIS or another data
    source that outputs time series in CSV format with site locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be
      in geographic coordinates) to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining
      the model area
    * Length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that identify
      the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined
      by polygons (`geographic_groups` parameter)

    Parameters
    ----------
    data : csv file or DataFrame
        Time series of stream flow observations.
        Columns:

        ===================== ======================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        flow_data_columns     Columns of observed streamflow values
        flow_qualifier_column Optional column with qualifiers for flow values
        ===================== ======================================

        Notes:

        * x and y columns can alternatively be in the metadata table
        * flow_data_columns are denoted in `flow_data_columns`; multiple
          columns can be included to process base flow and total flow, or
          other statistics in tandem
        * For example, `flow_qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Stream flow observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================

        Notes:

        * other columns in metadata will be passed through to the metadata output

    flow_data_columns : list of strings
        Columns in data with flow values or their statistics.
        By default, ['flow']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the flow observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:

        - PROJ string
        - Dictionary of PROJ parameters
        - PROJ keyword arguments for parameters
        - JSON string with PROJ parameters
        - CRS WKT string
        - An authority string [i.e. 'epsg:4326']
        - An EPSG integer code [i.e. 4326]
        - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
        - An object with a `to_wkt` method.
        - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    flow_qualifier_column : str, optional
        Column name in data with flow observation qualifiers, such
        as "measured" or "estimated",
        by default 'category'
    default_qualifier : str, optional
        Default qualifier to populate flow_qualifier_column
        if it is None. By default, "measured"
    include_sites : list-like, optional
        Limit output to these sites.
        by default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to sites associated with these line identifiers.
        by default, None (include all sites)
    source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the source data. By default, 'ft3'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the output (model). By default, 'm3'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.

    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:`mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different
        than those listed above. For example, if the data file has a
        'SITE_NO' column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the renames listed above will be used.
        Note that the rename values (the new column names) must match those
        listed above for :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()
    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} nor {} found in {}. Need to specify a site_no_col " \
        "or line_id_col".format(site_no_col, line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {datetime_col: 'datetime',
                    site_no_col: 'site_no',
                    line_id_col: 'line_id',
                    x_coord_col: 'x',
                    y_coord_col: 'y',
                    name_col: 'name',
                    flow_qualifier_column: 'category'
                    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.abc.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [c if c not in dest_columns else dest_columns[c]
                         for c in flow_data_columns]
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df[line_id_col]

    # read the source metadata
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError('If metadata are supplied, both data and '
                             'metadata must have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df, active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs, metadata=md)

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units;
    # ensure that flow values are numeric
    # (may be objects if taken directly from NWIS)
    unit_conversion = (convert_volume_units(source_volume_units,
                                            dest_volume_units) /
                       convert_time_units(source_time_units,
                                          dest_time_units))
    for flow_col in flow_data_columns:
        df[flow_col] = pd.to_numeric(df[flow_col], errors='coerce') * unit_conversion
    df.dropna(subset=flow_data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured,
    # with measured including values derived from baseflow separation
    # or actual measurements)
    # output column name for flow qualifier column:
    dest_flow_qualifier_column = 'category'
    if flow_qualifier_column is not None:
        flow_qualifiers = {'calculated': 'measured',
                           'base flow separated from measured values': 'measured',
                           'measured total flow': 'measured',
                           'estimated gaged': 'estimated',
                           'estimated ungaged': 'estimated'}
        df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace(flow_qualifiers)
    else:
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers)
    # for each observation location;
    # 13 character length allows for prefix_yyyymmm in
    # 20 character observation names (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn, unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md, geographic_groups,
                                     geographic_groups_col,
                                     metadata_crs=dest_crs)

    # data columns
    data_cols = ['site_no', 'line_id', 'datetime'] + flow_data_columns + ['category']
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers,
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #        df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']

    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                           index=False, float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
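# Illustrative usage sketch for ``preprocess_flows`` (assumptions: NWIS-style
# daily-values CSVs with the column names shown below; all file paths are
# hypothetical). With the default units, flows in ft3/s are converted to m3/d,
# a factor of 0.3048**3 * 86400 ≈ 2446.6.
def _example_preprocess_flows():  # pragma: no cover
    flows, flow_sites = preprocess_flows(
        '../source_data/swflows/nwis_dvs.csv',
        metadata='../source_data/swflows/nwis_site_info.csv',
        flow_data_columns=['q_cfs'],
        start_date='1998-04-01',
        active_area='../source_data/extents/model_area.shp',
        source_volume_units='ft3', source_time_units='s',
        dest_volume_units='m3', dest_time_units='d',
        max_obsname_len=13,
        outfile='preprocessed_flows.csv')
    return flows, flow_sites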
def preprocess_iwum_pumping(ncfile,
                            start_date=None,
                            end_date=None,
                            active_area=None,
                            active_area_id_column=None,
                            active_area_feature_id=None,
                            estimated_production_zone_top=None,
                            estimated_production_zone_botm=None,
                            flux_variable='value',
                            nc_crs=5070,
                            dest_crs=5070,
                            nc_length_units='meters',
                            estimated_production_surface_units='meters',
                            model_length_units='meters',
                            outfile=None):
    """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output and
    assign open interval information, using raster surfaces of the top and bottom of an
    estimated production zone.

    Parameters
    ----------
    ncfile : file path
        NetCDF output from Irrigation Water Use Model
    start_date : str
        Cull data before this date.
    end_date : str
        Cull data after this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    flux_variable : str
        Variable in ncfile for pumping fluxes. Fluxes are assumed to
        represent total volumes for each time period.
    nc_crs : obj
        Coordinate Reference System (CRS) of ncfile.
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to pyproj.crs.CRS.from_user_input
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.

        Can be any of:

        - PROJ string
        - Dictionary of PROJ parameters
        - PROJ keyword arguments for parameters
        - JSON string with PROJ parameters
        - CRS WKT string
        - An authority string [i.e. 'epsg:4326']
        - An EPSG integer code [i.e. 4326]
        - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
        - An object with a `to_wkt` method.
        - A :class:`pyproj.crs.CRS` class

    nc_length_units : str, {'meters', 'ft', etc.}
        Length units of pumped volumes in ncfile
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    model_length_units : str, {'meters', 'ft', etc.}
        Length units of model.
    outfile : csv file for output table

    Returns
    -------
    df : DataFrame
        Table of pumping rates in m3/day, location
        and open interval information.

        Columns:

        ============== ================================================
        site_no        index position of pumping rate in ncfile grid
        x              x-coordinate in `dest_crs`
        y              y-coordinate in `dest_crs`
        start_datetime start date of pumping period
        end_datetime   end date of pumping period
        screen_top     screen top elevation, in `model_length_units`
        screen_botm    screen bottom elevation, in `model_length_units`
        q              pumping rate, in model units
        geometry       shapely Point object representing location
        ============== ================================================

    Notes
    -----
    * Time units are assumed to be days.
    * Fluxes are assumed to represent total volumes for each time period
      indicated by the differences between successive values along the
      time axis of ncfile.
""" ds = xr.open_dataset(ncfile) time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0] ds_x, ds_y = np.meshgrid(ds['x'], ds['y']) # original values are in m3, in each 1 mi2 cell # can leave in m3 if reassigning to 1km grid as point values length_conversion = convert_volume_units(nc_length_units, model_length_units) ** 3 unit_suffix = vol_suffix[model_length_units] + 'd' flux_col = 'q' # 'flux_{}'.format(unit_suffix) # output field name for fluxes # get top/botm elevations est_screen_top = None est_screen_botm = None if estimated_production_zone_top is not None and \ estimated_production_zone_botm is not None: surf_unit_conversion = convert_length_units(estimated_production_surface_units, model_length_units) est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y, points_crs=nc_crs) est_screen_top *= surf_unit_conversion est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y, points_crs=nc_crs) est_screen_botm *= surf_unit_conversion # in any places where screen top is less than the screen botm, # set both at the mean loc = est_screen_top < est_screen_botm means = np.mean([est_screen_top, est_screen_botm], axis=0) est_screen_top[loc] = means[loc] est_screen_botm[loc] = means[loc] print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} ' f'locations where screen top was < screen bottom') dfs = [] times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values) for n, period_start_date in enumerate(times): # for each time entry, get the data kwargs = {time_variable: period_start_date} arr = ds[flux_variable].sel(**kwargs).values # make sure pumping sign is negative # based on assumption that values are mostly abstraction if arr.sum() > 0: arr *= -1 # set up a dataframe data = {'site_no': np.arange(ds_x.size), 'x': ds_x.ravel(), 'y': ds_y.ravel(), } if est_screen_top is not None and est_screen_botm is not None: data.update({'screen_top': est_screen_top.ravel(), 'screen_botm': est_screen_botm.ravel() } ) df = pd.DataFrame(data) df['start_datetime'] = period_start_date # get the end_date, handling last entry if n + 1 < len(times): period_end_date = times[n + 1] else: # set end date for last period on previous period length last_start = dfs[-1]['start_datetime'].values[0] ndays = (pd.Timestamp(period_start_date) - pd.Timestamp(last_start)).days period_end_date = period_start_date + pd.Timedelta(ndays, unit='d') # convert the time units ndays = (pd.Timestamp(period_end_date) - pd.Timestamp(period_start_date)).days assert ndays > 0, "period_end_date {} is before period_start_date {}"\ .format(period_end_date, period_start_date) time_conversion = 1 / ndays # original quantities are volumes for the time period # time indexing in pandas is through last value period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d') df['end_datetime'] = period_end_date df[flux_col] = arr.ravel() * length_conversion * time_conversion # only includes fluxes > 0 df = df.loc[df[flux_col] < 0] dfs.append(df) df = pd.concat(dfs) # site number column (that would be unique from other integers from other data sources) df['site_no'] = [f'iwum_{node}' for node in df.site_no] # project the data to a destination crs, if provided # make a separate metadata dataframe with 1 row per location # to avoid redundant operations metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']] metadata.index = metadata['site_no'] x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, 
    metadata['x'], metadata['y'] = x_pr, y_pr
    metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull the data to the model area, if provided
    if active_area is not None:
        df, metadata = cull_data_to_active_area(df, active_area,
                                                active_area_id_column,
                                                active_area_feature_id,
                                                data_crs=dest_crs,
                                                metadata=metadata)

    # update data with x, y values projected in metadata
    x = dict(zip(metadata.site_no, metadata.x))
    y = dict(zip(metadata.site_no, metadata.y))
    df['x'] = [x[sn] for sn in df.site_no]
    df['y'] = [y[sn] for sn in df.site_no]

    if outfile is not None:
        outfile = Path(outfile)
        df.to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # make a plot of IWUM output in mgal/day
        out_pdf_path = outfile.parent / 'plots'
        out_pdf_path.mkdir(exist_ok=True)
        plot_iwum_output(ncfile, flux_variable=flux_variable,
                         outpath=out_pdf_path)
    return df
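# Illustrative usage sketch for ``preprocess_iwum_pumping`` (file paths and
# raster names are hypothetical; the IWUM NetCDF output and production zone
# surfaces are assumed to be in EPSG:5070 meters, per the defaults).
def _example_preprocess_iwum_pumping():  # pragma: no cover
    df = preprocess_iwum_pumping(
        '../source_data/water_use/iwum_output.nc',
        start_date='2008-01-01', end_date='2017-12-31',
        active_area='../source_data/extents/model_area.shp',
        estimated_production_zone_top='../source_data/rasters/prod_zone_top.tif',
        estimated_production_zone_botm='../source_data/rasters/prod_zone_botm.tif',
        outfile='../output/iwum_pumping.csv')
    return df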
def preprocess_te_wateruse(data,
                           start_date=None,
                           end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269,
                           dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate to
      begin using the 2010 rates then (``start_date='2008'``). If no start or end
      date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the water use data locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:

        - PROJ string
        - Dictionary of PROJ parameters
        - PROJ keyword arguments for parameters
        - JSON string with PROJ parameters
        - CRS WKT string
        - An authority string [i.e. 'epsg:4326']
        - An EPSG integer code [i.e. 4326]
        - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
        - An object with a `to_wkt` method.
        - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame

    Notes
    -----
    * time units for TE data and model are assumed to be days
    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get the top and bottom of the estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
                                              x, y, points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
                                               x, y, points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values;
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index,
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values,
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill the remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to the site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction;
    # if the sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = ['site_no', 'start_datetime', 'x', 'y',
            'screen_top', 'screen_botm', 'q', 'geometry']
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile, index=False,
                                                   float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to the shapefile
        to_shapefile = df_monthly.groupby(['site_no', 'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
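# Illustrative usage sketch for ``preprocess_te_wateruse`` (assumes a DataFrame
# like that produced by read_te_water_use_spreadsheet; file paths are
# hypothetical). Withdrawal rates in mgal/day are reindexed to monthly values
# and converted to model units (m3/day with the defaults below).
def _example_preprocess_te_wateruse(te_data):  # pragma: no cover
    df_monthly = preprocess_te_wateruse(
        te_data,
        start_date='2008-01-01', end_date='2017-12-31',
        active_area='../source_data/extents/model_area.shp',
        estimated_production_zone_top='../source_data/rasters/prod_zone_top.tif',
        estimated_production_zone_botm='../source_data/rasters/prod_zone_botm.tif',
        estimated_production_surface_units='feet',
        data_volume_units='mgal', model_length_units='meters',
        outfile='../output/te_pumping.csv')
    return df_monthly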
def preprocess_headobs(data, metadata,
                       head_data_columns=['head', 'last_head', 'head_std'],
                       dem=None, dem_units='meters',
                       start_date='1998-04-01',
                       active_area=None,
                       active_area_id_column=None,
                       active_area_feature_id=None,
                       source_crs=4269, dest_crs=5070,
                       data_length_units='meters',
                       model_length_units='meters',
                       geographic_groups=None,
                       geographic_groups_col=None,
                       max_obsname_len=None,
                       outfile='../source_data/observations/head_obs/preprocessed_head_obs.csv'):
    """Preprocess head observation data, for example, groundwater level data output from the
    `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length units are converted to those of the groundwater model. Open intervals for the wells are
      converted from depths to elevations
    * missing open intervals are filled based on well bottom depths (if available) and the median open
      interval length for the dataset.
    * Wells are categorized based on the quality of the open interval information
      (see the documentation for :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by
      polygons (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.
        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are
          in `head_data_columns`

    metadata : DataFrame
        Head observation site information, e.g. as output from :func:`mapgwm.headobs.get_data`.

        Must have the following columns:

        ================= ==========================================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both the average and last
        head values for the stress period to be considered,
        as well as the variability of water levels contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead elevations.
        Any reprojection to dest_crs is handled automatically, assuming
        the DEM raster has CRS information embedded (arc-ascii grids do not!)
        By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:

        - PROJ string
        - Dictionary of PROJ parameters
        - PROJ keyword arguments for parameters
        - JSON string with PROJ parameters
        - CRS WKT string
        - An authority string [i.e. 'epsg:4326']
        - An EPSG integer code [i.e. 4326]
        - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
        - An object with a `to_wkt` method.
        - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.

    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.
    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """
    df = data.copy()

    # multiplier to convert input length units to model units
    unit_conversion = convert_length_units(data_length_units, model_length_units)

    # outputs
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting and ending dates here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; drop the timestamps
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    print(f'starting with {n_measurements:,d} measurements at '
          f'{n_sites:,d} unique wells')
    no_data_in_period = df.datetime < stdate
    if np.any(no_data_in_period):
        in_period = df.datetime >= stdate
        n_sites_before = len(set(df.loc[no_data_in_period, 'site_no'])
                             .difference(set(df.loc[in_period, 'site_no'])))
        print((f'culling {no_data_in_period.sum():,d} measurements from '
               f'{n_sites_before:,d} sites that are prior to '
               f'start date of {start_date}'))
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    well_info = groups.mean().copy()
    well_info = well_info.join(metadata, rsuffix='_meta')
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, well_info = cull_data_to_active_area(df, active_area,
                                                 active_area_id_column,
                                                 active_area_feature_id,
                                                 data_crs=dest_crs,
                                                 metadata=well_info)

    # convert length units; convert screen tops and botms to depths
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        well_location_elevations = get_values_at_points(dem, well_info['x'],
                                                        well_info['y'],
                                                        points_crs=dest_crs)
        well_location_elevations *= convert_length_units(dem_units,
                                                         model_length_units)
        well_info.loc[missing_elevations, 'well_el'] = \
            well_location_elevations[missing_elevations]

    length_columns = ['well_el'] + head_data_columns + ['screen_top', 'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']

    # just the data, site numbers, times and aquifer
    if 'head_std' not in head_data_columns:
        head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # trim down to only well_info with estimated water levels
    # (monthly measured levels may not have standard deviation,
    # as opposed to monthly statistical estimates)
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information;
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to wells in the well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        warnings.warn('culling {} wells not found in metadata table!'
                      .format(np.sum(~has_metadata)))
        df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers)
    # for each observation location;
    # 13 character length allows for prefix_yyyymmm in
    # 20 character observation names (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn, unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    well_info['obsprefix'] = obsnames
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    well_info['group'] = 'heads'
    well_info = assign_geographic_obsgroups(well_info, geographic_groups,
                                            geographic_groups_col,
                                            metadata_crs=dest_crs)

    # save out the results
    if outfile is not None:
        df2shp(well_info.drop(['x', 'y'], axis=1), out_shapefile,
               index=False, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                                  index=False,
                                                  float_format='%.2f')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info
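# Illustrative usage sketch for ``preprocess_headobs`` (assumes data and
# metadata tables like those returned by mapgwm.headobs.get_data; the file
# paths are hypothetical). Heads in feet are converted to model meters.
def _example_preprocess_headobs(data, metadata):  # pragma: no cover
    df, well_info = preprocess_headobs(
        data, metadata,
        head_data_columns=['head', 'last_head'],
        dem='../source_data/rasters/dem_m.tif', dem_units='meters',
        start_date='1998-04-01',
        active_area='../source_data/extents/model_area.shp',
        data_length_units='feet', model_length_units='meters',
        max_obsname_len=13,
        outfile='../output/preprocessed_head_obs.csv')
    return df, well_info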