def test_sample_elevations_different_proj(dem, tylerforks_sfrdata, datapath):
    sfr = tylerforks_sfrdata
    sampled_elevs1 = sfr.sample_reach_elevations(dem, method='buffers',
                                                 smooth=True)
    sampled_elevs1 = np.array(list(sampled_elevs1.values()))

    reach1_geom = sfr.reach_data.geometry[0]
    crs1 = sfr.crs
    crs2 = get_authority_crs(3070)
    sfr._crs = crs2
    sfr.reach_data['geometry'] = project(sfr.reach_data['geometry'].values,
                                         crs1, crs2)
    reach1_geom_3070 = sfr.reach_data.geometry[0]

    # verify that the reaches were reprojected
    assert reach1_geom.intersection(reach1_geom_3070).area == 0

    sampled_elevs2 = sfr.sample_reach_elevations(dem, method='buffers',
                                                 smooth=True)
    sampled_elevs2 = np.array(list(sampled_elevs2.values()))
    rms_error = np.sqrt(np.mean((sampled_elevs2 - sampled_elevs1)**2))
    assert rms_error < 0.5  # not sure why the elevations don't match better

    # verify that at least the first reach is the same
    reach1_geom_projected_back_100buffer = project(reach1_geom_3070,
                                                   crs2, crs1).buffer(100)
    assert np.allclose(reach1_geom_projected_back_100buffer.area,
                       reach1_geom.buffer(100).area)
def reproject(self, x_coord_col='FROM_DEC_LONG_VA',
              y_coord_col='FROM_DEC_LAT_VA', key='SITE_NO'):
    """Reproject from self.source_crs to self.dest_crs using gisutils.

    Parameters
    ----------
    x_coord_col : str, optional
        Column name in data with x-coordinates,
        by default 'FROM_DEC_LONG_VA'
    y_coord_col : str, optional
        Column name in data with y-coordinates,
        by default 'FROM_DEC_LAT_VA'
    key : str
        Key for the locations dictionary that is made,
        by default 'SITE_NO'
    """
    x_reprj, y_reprj = project(zip(self.df[x_coord_col], self.df[y_coord_col]),
                               self.source_crs, self.dest_crs)
    self.df['x'] = x_reprj
    self.df['y'] = y_reprj
    self.df['geometry'] = [Point(x, y) for x, y in zip(x_reprj, y_reprj)]

    # drop entries if no location information
    self.df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # make dictionary of locations keyed by site number
    self.locations = dict(zip(self.df[key],
                              zip(self.df['x'], self.df['y'])))
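# A minimal usage sketch for the coordinate-reprojection pattern above, not a
# definitive API reference: the site numbers, coordinates, and CRS codes are
# hypothetical. gisutils.project is assumed to accept an (x, y) tuple of
# array-likes (the form used elsewhere in this collection); the method above
# passes zipped (x, y) pairs instead.
import pandas as pd
from gisutils import project

sites = pd.DataFrame({'SITE_NO': ['04027000', '04027500'],
                      'FROM_DEC_LONG_VA': [-90.9, -91.0],
                      'FROM_DEC_LAT_VA': [46.5, 46.6]})
x, y = project((sites['FROM_DEC_LONG_VA'].values, sites['FROM_DEC_LAT_VA'].values),
               4269, 26915)  # NAD83 geographic -> UTM zone 15N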
def to_crs(self, dest_crs):
    """Reproject the LineStrings in :py:attr:`Lines.df` to a different
    Coordinate Reference System.

    Parameters
    ----------
    dest_crs : obj
        A Python int, dict, str, or :py:class:`pyproj.crs.CRS` instance
        passed to the :py:meth:`pyproj.crs.CRS.from_user_input`
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
        Can be any of:

          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    """
    assert self.crs is not None, "No crs for flowlines"
    assert dest_crs is not None, "No destination CRS."
    dest_crs = get_authority_crs(dest_crs)
    print('\nreprojecting hydrography from\n{}\nto\n{}\n'.format(
        self.crs, dest_crs))
    geoms = project(self.df.geometry, self.crs, dest_crs)
    assert np.isfinite(np.max(geoms[0].xy[0])), \
        "Invalid reprojection; check CRS for lines and grid."
    self.df['geometry'] = geoms
    self.crs = dest_crs
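# Hedged usage sketch for Lines.to_crs. The shapefile name is hypothetical,
# and the from_shapefile keyword is an assumption about the sfrmaker API
# rather than a confirmed call signature.
from sfrmaker import Lines

lines = Lines.from_shapefile('flowlines.shp', crs=4269)
lines.to_crs(5070)  # reprojects the LineStrings in lines.df to EPSG:5070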
def get_bbox(feature, dest_crs):
    """Get bounding box for a Polygon feature.

    Parameters
    ----------
    feature : str (shapefile path), shapely Polygon or GeoJSON
    dest_crs : proj str
        Desired output coordinate system (shapefiles only)
    """
    if isinstance(feature, str):
        with fiona.open(feature) as src:
            l, b, r, t = src.bounds
            bbox_src_crs = box(*src.bounds)
            shpcrs = crs(proj_str=to_string(src.crs))
        if dest_crs is not None and shpcrs != dest_crs:
            bbox_dest_crs = project(bbox_src_crs,
                                    shpcrs.proj_str, dest_crs.proj_str)
            l, b, r, t = bbox_dest_crs.bounds
        filter = (l, b, r, t)
    elif isinstance(feature, Polygon):
        filter = feature.bounds
    elif isinstance(feature, dict):
        try:
            filter = shape(feature).bounds
        except Exception as ex:
            print(ex)
            print("Supplied dictionary doesn't appear to be valid GeoJSON.")
    return filter
def extent_poly():
    extent_poly_ll = box(-92.7, 46.7, -92.6, 46.8)
    extent_poly = project(extent_poly_ll,
                          "+init=epsg:{}".format(4269), "+init=epsg:26915")
    df = pd.DataFrame({'geometry': [extent_poly], 'id': [0]})
    df2shp(df, 'examples/data/bbox.shp', epsg=26915)
    return extent_poly_ll
def reproject(self, dest_proj_str):
    assert self.crs.proj_str is not None, "No proj_str string for flowlines"
    assert dest_proj_str is not None, "No destination CRS."
    print('\nreprojecting hydrography from\n{}\nto\n{}\n'.format(
        self.crs.proj_str, dest_proj_str))
    geoms = project(self.df.geometry, self.crs.proj_str, dest_proj_str)
    assert np.isfinite(np.max(geoms[0].xy[0])), \
        "Invalid reprojection; check CRS for lines and grid."
    self.df['geometry'] = geoms
    self.crs.proj_str = dest_proj_str
def test_lines_from_NHDPlus(tylerforks_lines_from_NHDPlus):
    lines = tylerforks_lines_from_NHDPlus
    tf = lines.df.name == 'Tyler Forks'
    lines_pr = project(lines.df.geometry, 4269, 26915)
    line_lengths = np.array([g.length for g in lines_pr])
    expected_asum1s = lines.df['asum2'] - line_lengths
    # add dropna due to some lines along boundary
    # not being in the PFVAA subset used for the test
    assert np.allclose(lines.df['asum1'].dropna(),
                       expected_asum1s.dropna(), atol=10)
    assert np.all(lines.df.loc[tf, 'asum1'].dropna() > 95000)
    assert isinstance(lines, Lines)
def _compute_geometries(self, df):
    datum = np.array([coord_datums_epsg[d] for d in df.dec_coord_datum_cd])
    datums = set(datum)
    x1, y1 = df.dec_long_va.values, df.dec_lat_va.values
    x2 = np.ones(len(df), dtype=float) * np.nan
    y2 = np.ones(len(df), dtype=float) * np.nan
    for dtm in datums:
        pr1 = "epsg:{}".format(dtm)
        loc = datum == dtm
        x2[loc], y2[loc] = gisutils.project((x1[loc], y1[loc]),
                                            pr1, self.proj_str)
    geoms = [Point(x, y) for x, y in zip(x2, y2)]
    return geoms
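# Standalone sketch of the per-datum reprojection pattern used above:
# group points by source datum, then reproject each group in a single call.
# The datum codes, EPSG lookups, and coordinates are hypothetical.
import numpy as np
from gisutils import project

datum = np.array(['NAD83', 'NAD27', 'NAD83'])
epsg = {'NAD83': 4269, 'NAD27': 4267}
x1 = np.array([-90.1, -90.2, -90.3])
y1 = np.array([43.1, 43.2, 43.3])
x2 = np.full(len(x1), np.nan)
y2 = np.full(len(y1), np.nan)
for dtm in set(datum):
    loc = datum == dtm
    x2[loc], y2[loc] = project((x1[loc], y1[loc]),
                               'epsg:{}'.format(epsg[dtm]), 'epsg:5070')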
def _read_extent_shapefile(self, shpfile, buffer=0):
    import fiona
    from fiona.crs import to_string, from_epsg
    print('reading extent from {}...'.format(shpfile))
    with fiona.open(shpfile) as shp:
        g = shape(next(iter(shp))['geometry'])
        shp_crs = to_string(shp.crs)
    if to_string(from_epsg(coord_datums_epsg[self.datum])) != shp_crs:
        print('reprojecting extent from {} to {}'.format(
            shp_crs, self.proj_str))
        return gisutils.project(g, shp_crs, self.proj_str)
    else:
        return g
def test_get_upstream_area():
    catchments = ['/Users/aleaf/Documents/NHDPlus/NHDPlusGL/NHDPlus04/NHDPlusCatchment/Catchment.shp',
                  '/Users/aleaf/Documents/NHDPlus/NHDPlusMS/NHDPlus07/NHDPlusCatchment/Catchment.shp']
    plusflow = ['/Users/aleaf/Documents/NHDPlus/NHDPlusGL/NHDPlus04/NHDPlusAttributes/PlusFlow.dbf',
                '/Users/aleaf/Documents/NHDPlus/NHDPlusMS/NHDPlus07/NHDPlusAttributes/PlusFlow.dbf']
    nodasites = '/Users/aleaf/Documents/USFS/Nicolet/targets/north/flux_field_no_da.shp'
    flowlines = ['/Users/aleaf/Documents/NHDPlus/NHDPlusGL/NHDPlus04/NHDSnapshot/Hydrography/NHDFlowline.shp',
                 '/Users/aleaf/Documents/NHDPlus/NHDPlusMS/NHDPlus07/NHDSnapshot/Hydrography/NHDFlowline.shp']
    nearfield = '/Users/aleaf/Documents/USFS/Nicolet/shps/Nicolet_north_NF.shp'

    with fiona.open(nearfield) as src:
        nf = shape(next(iter(src))['geometry'])
    nf = project(nf, '+init=epsg:26716', '+init=epsg:4269')
    bbox = nf.bounds

    noda = shp2df(nodasites)
    get_upstream_area(noda.geometry.tolist(), plusflow, flowlines,
                      catchments, nf)
def read_polygon_feature(feature, dest_crs, feature_crs=None):
    """Read a geometric feature from a shapefile, shapely geometry object,
    or collection of shapely geometry objects. Reproject to dest_crs
    if the feature is in a different CRS.

    Parameters
    ----------
    feature : shapely Polygon, list of Polygons, or shapefile path
        Polygons must be in same CRS as linework; shapefile features will be
        reprojected if their crs is different.
    dest_crs : instance of sfrmaker.crs
        Output CRS for the feature.

    Returns
    -------
    feature : shapely geometry object
    """
    if isinstance(feature, str):
        with fiona.open(feature) as src:
            feature_crs = crs(src.crs)
        geoms = shp2df(feature)['geometry'].values
        feature = unary_union(geoms)
    elif isinstance(feature, collections.Iterable):
        if isinstance(feature[0], dict):
            try:
                feature = [shape(f) for f in feature]
            except Exception as ex:
                print(ex)
                print("Supplied dictionary doesn't appear to be valid GeoJSON.")
        feature = unary_union(feature)
    elif isinstance(feature, dict):
        try:
            feature = shape(feature)
        except Exception as ex:
            print(ex)
            print("Supplied dictionary doesn't appear to be valid GeoJSON.")
    elif isinstance(feature, Polygon):
        pass
    else:
        raise TypeError("Unrecognized feature input.")
    if feature_crs is not None and feature_crs != dest_crs:
        feature = project(feature, feature_crs.proj_str, dest_crs.proj_str)
    return feature.buffer(0)
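# Hedged usage sketch for the GeoJSON-dict branch of read_polygon_feature.
# The coordinates are hypothetical; with dest_crs=None and no feature_crs,
# the polygon is returned (buffered by 0) without reprojection.
geojson = {'type': 'Polygon',
           'coordinates': [[(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]]}
active_area = read_polygon_feature(geojson, dest_crs=None)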
def get_bbox(feature, dest_crs):
    """Get bounding box for a Polygon feature.

    Parameters
    ----------
    feature : str (shapefile path), shapely Polygon or GeoJSON
    dest_crs : obj
        Output coordinate reference system for the bounding box
        (shapefile features are reprojected to dest_crs if needed).
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    """
    if isinstance(feature, str):
        with fiona.open(feature) as src:
            l, b, r, t = src.bounds
            bbox_src_crs = box(*src.bounds)
        shpcrs = get_shapefile_crs(feature)
        if dest_crs is not None and shpcrs != dest_crs:
            bbox_dest_crs = project(bbox_src_crs, shpcrs, dest_crs)
            l, b, r, t = bbox_dest_crs.bounds
        filter = (l, b, r, t)
    elif isinstance(feature, Polygon):
        filter = feature.bounds
    elif isinstance(feature, dict):
        try:
            filter = shape(feature).bounds
        except Exception as ex:
            print(ex)
            print("Supplied dictionary doesn't appear to be valid GeoJSON.")
    return filter
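# Hedged usage sketch for get_bbox: bounding box of a (hypothetical)
# shapefile, reprojected to EPSG:5070 via a pyproj CRS object.
from gisutils import get_authority_crs

l, b, r, t = get_bbox('extent.shp', get_authority_crs(5070))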
def setup_lake_info(model):

    # lake package must have a source_data block
    # (e.g. for supplying shapefile delineating lake extents)
    source_data = model.cfg.get('lak', {}).get('source_data')
    if source_data is None or 'lak' not in model.package_list:
        return
    lakesdata = model.load_features(**source_data['lakes_shapefile'])
    lakesdata_proj_str = get_proj_str(source_data['lakes_shapefile']['filename'])
    id_column = source_data['lakes_shapefile']['id_column'].lower()
    name_column = source_data['lakes_shapefile'].get('name_column', 'name').lower()
    nlakes = len(lakesdata)

    # make dataframe with lake IDs, names and locations
    centroids = project([g.centroid for g in lakesdata.geometry],
                        lakesdata_proj_str, 'epsg:4269')
    df = pd.DataFrame({'lak_id': np.arange(1, nlakes + 1),
                       'feat_id': lakesdata[id_column].values,
                       'name': lakesdata[name_column].values,
                       'latitude': [c.y for c in centroids],
                       'geometry': lakesdata['geometry']
                       })

    # get starting stages from model top, for specifying ranges
    stages = []
    for lakid in df['lak_id']:
        loc = model._lakarr2d == lakid
        est_stage = model.dis.top.array[loc].min()
        stages.append(est_stage)
    df['strt'] = np.array(stages)

    # save a lookup file mapping lake ids to hydroids
    lookup_file = model.cfg['lak']['output_files']['lookup_file'].format(model.name)
    df.drop('geometry', axis=1).to_csv(lookup_file, index=False)

    # clean up names
    df['name'].replace('nan', '', inplace=True)
    df['name'].replace(' ', '', inplace=True)
    return df
def preprocess_flows(data, metadata=None, flow_data_columns=['flow'],
                     start_date=None, active_area=None,
                     active_area_id_column=None,
                     active_area_feature_id=None,
                     source_crs=4269, dest_crs=5070,
                     datetime_col='datetime',
                     site_no_col='site_no',
                     line_id_col='line_id',
                     x_coord_col='x',
                     y_coord_col='y',
                     name_col='name',
                     flow_qualifier_column=None,
                     default_qualifier='measured',
                     include_sites=None,
                     include_line_ids=None,
                     source_volume_units='ft3',
                     source_time_units='s',
                     dest_volume_units='m3',
                     dest_time_units='d',
                     geographic_groups=None,
                     geographic_groups_col=None,
                     max_obsname_len=None,
                     add_leading_zeros_to_sw_site_nos=False,
                     column_renames=None,
                     outfile=None,
                     ):
    """Preprocess stream flow observation data, for example, from NWIS or
    another data source that outputs time series in CSV format with site
    locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system;
      assumed to be in geographic coordinates) to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of
      polygons defining the model area
    * length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that
      identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic
      areas defined by polygons (`geographic_groups` parameter)

    Parameters
    ----------
    data : csv file or DataFrame
        Time series of stream flow observations.
        Columns:

        ===================== ==============================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        flow_data_columns     Columns of observed streamflow values
        flow_qualifier_column Optional column with qualifiers for flow values
        ===================== ==============================================

        Notes:

        * x and y columns can alternatively be in the metadata table
        * flow_data_columns are denoted in `flow_data_columns`; multiple columns
          can be included to process base flow and total flow, or other
          statistics in tandem
        * For example, `flow_qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Stream flow observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================

        Notes:

        * other columns in metadata will be passed through to the metadata output

    flow_data_columns : list of strings
        Columns in data with flow values or their statistics.
        By default, ['flow']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    flow_qualifier_column : str, optional
        Column name in data with flow observation qualifiers, such
        as "measured" or "estimated"
        by default 'category'
    default_qualifier : str, optional
        Default qualifier to populate flow_qualifier_column if it
        is None. By default, "measured"
    include_sites : list-like, optional
        Limit output to these sites.
        by default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to sites associated with these line identifiers.
        by default, None (include all sites)
    source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the source data. By default, 'ft3'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the output (model). By default, 'm3'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.
    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:~`mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different
        than those listed above. For example, if the data file has a 'SITE_NO'
        column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the renames listed above will be used.
        Note that the renames must be the same as those listed above for
        :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()
    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} or {} found in {}. Need to specify a site_no_col or line_id_col".format(
            site_no_col, line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {datetime_col: 'datetime',
                    site_no_col: 'site_no',
                    line_id_col: 'line_id',
                    x_coord_col: 'x',
                    y_coord_col: 'y',
                    name_col: 'name',
                    flow_qualifier_column: 'category'
                    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [c if c not in dest_columns else dest_columns[c]
                         for c in flow_data_columns]
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'], add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df[line_id_col]

    # read the source metadata
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError('If metadata are supplied, both data and metadata must '
                             'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'], add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df, active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs, metadata=md)

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units
    # ensure that flow values are numeric (may be objects if taken directly from NWIS)
    unit_conversion = (convert_volume_units(source_volume_units, dest_volume_units) /
                       convert_time_units(source_time_units, dest_time_units))
    for flow_col in flow_data_columns:
        df[flow_col] = pd.to_numeric(df[flow_col], errors='coerce') * unit_conversion
    df.dropna(subset=flow_data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured,
    #  with measured including values derived from baseflow separation or actual measurements)
    # output column name for flow qualifier column:
    dest_flow_qualifier_column = 'category'
    if flow_qualifier_column is not None:
        flow_qualifiers = {'calculated': 'measured',
                           'base flow separated from measured values': 'measured',
                           'measured total flow': 'measured',
                           'estimated gaged': 'estimated',
                           'estimated ungaged': 'estimated'}
        df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace(flow_qualifiers)
    else:
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn, unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md, geographic_groups,
                                     geographic_groups_col,
                                     metadata_crs=dest_crs)

    # data columns
    data_cols = ['site_no', 'line_id', 'datetime'] + flow_data_columns + ['category']
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #        df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']

    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile, index=False, float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
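# Hedged usage sketch for preprocess_flows. The file names and flow column
# are hypothetical; units follow the defaults (ft3/s in, m3/d out), and the
# site coordinates are assumed to be NAD83 geographic (epsg:4269).
data, metadata = preprocess_flows('nwis_baseflows.csv',
                                  metadata='nwis_sites.csv',
                                  flow_data_columns=['q_cfs'],
                                  start_date='1998-04-01',
                                  source_crs=4269, dest_crs=5070,
                                  max_obsname_len=13,
                                  outfile='preprocessed_flows.csv')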
def preprocess_headobs(data, metadata,
                       head_data_columns=['head', 'last_head', 'head_std'],
                       dem=None, dem_units='meters',
                       start_date='1998-04-01',
                       active_area=None,
                       active_area_id_column=None,
                       active_area_feature_id=None,
                       source_crs=4269, dest_crs=5070,
                       data_length_units='meters',
                       model_length_units='meters',
                       geographic_groups=None,
                       geographic_groups_col=None,
                       max_obsname_len=None,
                       outfile='../source_data/observations/head_obs/preprocessed_head_obs.csv'):
    """Preprocess head observation data, for example, groundwater level data output from the
    `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in
      geographic coordinates) to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining
      the model area
    * length units are converted to those of the groundwater model. Open intervals for the wells
      are converted from depths to elevations
    * missing open intervals are filled based on well bottom depths (if available) and the median
      open interval length for the dataset.
    * Wells are categorized based on the quality of the open interval information (see the
      documentation for :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that identify the location
      are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by
      polygons (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.
        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are in `head_data_columns`

    metadata : DataFrame
        Head observation site information, e.g. as output from :func:`mapgwm.headobs.get_data`.
        Must have the following columns:

        ================= ==========================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both
        the average and last head values for the stress period to be considered,
        as well as the variability of water levels contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead elevations.
        Any reprojection to dest_crs is handled automatically, assuming
        the DEM raster has CRS information embedded (arc-ascii grids do not!)
        By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.
    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """
    df = data.copy()
    # multiplier to convert input length units to model units
    unit_conversion = convert_length_units(data_length_units, model_length_units)

    # outputs
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting and ending dates here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; drop the timestamps
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    print(f'starting with {n_measurements:,d} measurements at {n_sites:,d} unique wells')
    no_data_in_period = df.datetime < stdate
    if np.any(no_data_in_period):
        in_period = df.datetime >= stdate
        n_sites_before = len(set(df.loc[no_data_in_period, 'site_no'])
                             .difference(set(df.loc[in_period, 'site_no'])))
        print((f'culling {no_data_in_period.sum():,d} measurements from {n_sites_before:,d} '
               f'sites that are prior to start date of {start_date}'))
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    well_info = groups.mean().copy()
    well_info = well_info.join(metadata, rsuffix='_meta')
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, well_info = cull_data_to_active_area(df, active_area,
                                                 active_area_id_column,
                                                 active_area_feature_id,
                                                 data_crs=dest_crs,
                                                 metadata=well_info)

    # convert length units; convert screen tops and botms to depths
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        well_location_elevations = get_values_at_points(dem, well_info['x'], well_info['y'],
                                                        points_crs=dest_crs)
        well_location_elevations *= convert_length_units(dem_units, model_length_units)
        well_info.loc[missing_elevations, 'well_el'] = well_location_elevations[missing_elevations]

    length_columns = ['well_el'] + head_data_columns + ['screen_top', 'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']

    # just the data, site numbers, times and aquifer
    head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # trim down to only well_info with both estimated water levels and standard deviation
    # monthly measured levels may not have standard deviation
    # (as opposed to monthly statistical estimates)
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to well_info in well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        warnings.warn('culling {} wells not found in metadata table!'.format(
            np.sum(~has_metadata)))
    df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn, unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    well_info['obsprefix'] = obsnames
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    well_info['group'] = 'heads'
    well_info = assign_geographic_obsgroups(well_info, geographic_groups,
                                            geographic_groups_col,
                                            metadata_crs=dest_crs)

    # save out the results
    if outfile is not None:
        df2shp(well_info.drop(['x', 'y'], axis=1), out_shapefile,
               index=False, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile, index=False,
                                                  float_format='%.2f')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info
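# Hedged usage sketch for preprocess_headobs. The data and metadata
# DataFrames would typically come from mapgwm.headobs.get_data; the output
# path is hypothetical.
df, well_info = preprocess_headobs(data, metadata,
                                   data_length_units='feet',
                                   model_length_units='meters',
                                   start_date='1998-04-01',
                                   source_crs=4269, dest_crs=5070,
                                   max_obsname_len=13,
                                   outfile='preprocessed_head_obs.csv')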
def rasterize(feature, grid, id_column=None,
              include_ids=None,
              epsg=None,
              proj4=None, dtype=np.float32):
    """Rasterize a feature onto the model grid, using
    the rasterio.features.rasterize method. Features are intersected
    if they contain the cell center.

    Parameters
    ----------
    feature : str (shapefile path), list of shapely objects,
              or dataframe with geometry column
    id_column : str
        Column with unique integer identifying each feature; values
        from this column will be assigned to the output raster.
    grid : grid.StructuredGrid instance
    epsg : int
        EPSG code for feature coordinate reference system. Optional,
        but an epsg code or proj4 string must be supplied if feature
        isn't a shapefile, and isn't in the same CRS as the model.
    proj4 : str
        Proj4 string for feature CRS (optional)
    dtype : dtype
        Datatype for the output array

    Returns
    -------
    2D numpy array with intersected values
    """
    try:
        from rasterio import features
        from rasterio import Affine
    except ImportError:
        print('This method requires rasterio.')
        return

    #trans = Affine(sr.delr[0], 0., sr.xul,
    #               0., -sr.delc[0], sr.yul) * Affine.rotation(sr.rotation)
    trans = grid.transform

    if isinstance(feature, str):
        proj4 = get_proj_str(feature)
        df = shp2df(feature)
    elif isinstance(feature, pd.DataFrame):
        df = feature.copy()
    elif isinstance(feature, collections.Iterable):
        # list of shapefiles
        if isinstance(feature[0], str):
            proj4 = get_proj_str(feature[0])
            df = shp2df(feature)
        else:
            df = pd.DataFrame({'geometry': feature})
    elif not isinstance(feature, collections.Iterable):
        df = pd.DataFrame({'geometry': [feature]})
    else:
        print('unrecognized feature input')
        return

    # handle shapefiles in different CRS than model grid
    reproject = False
    if proj4 is not None:
        if proj4 != grid.proj_str:
            reproject = True
    elif epsg is not None and grid.epsg is not None:
        if epsg != grid.epsg:
            reproject = True
            from fiona.crs import to_string, from_epsg
            proj4 = to_string(from_epsg(epsg))
    if reproject:
        df['geometry'] = project(df.geometry.values, proj4, grid.proj_str)

    # subset to include_ids
    if id_column is not None and include_ids is not None:
        df = df.loc[df[id_column].isin(include_ids)].copy()

    # create list of GeoJSON features, with unique value for each feature
    if id_column is None:
        numbers = range(1, len(df) + 1)
    # if IDs are strings, get a number for each one
    # pd.DataFrame.unique() generally preserves order
    elif df[id_column].dtype == object:
        unique_values = df[id_column].unique()
        values = dict(zip(unique_values, range(1, len(unique_values) + 1)))
        numbers = [values[n] for n in df[id_column]]
    else:
        numbers = df[id_column].tolist()

    geoms = list(zip(df.geometry, numbers))
    result = features.rasterize(geoms,
                                out_shape=(grid.nrow, grid.ncol),
                                transform=trans)
    assert result.sum(axis=(0, 1)) != 0, "Nothing was intersected!"
    return result.astype(dtype)
def rasterize(feature, grid, id_column=None,
              include_ids=None,
              crs=None, epsg=None, proj4=None,
              dtype=np.float32, **kwargs):
    """Rasterize a feature onto the model grid, using
    the rasterio.features.rasterize method. Features are intersected
    if they contain the cell center.

    Parameters
    ----------
    feature : str (shapefile path), list of shapely objects,
              or dataframe with geometry column
    id_column : str
        Column with unique integer identifying each feature; values
        from this column will be assigned to the output raster.
    grid : grid.StructuredGrid instance
    crs : obj
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    dtype : dtype
        Datatype for the output array
    **kwargs : keyword arguments to rasterio.features.rasterize()
        https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html

    Returns
    -------
    2D numpy array with intersected values
    """
    try:
        from rasterio import Affine, features
    except ImportError:
        print('This method requires rasterio.')
        return

    if epsg is not None:
        warnings.warn("The epsg argument is deprecated. Use crs instead, "
                      "which requires gisutils >= 0.2",
                      DeprecationWarning)
    if proj4 is not None:
        warnings.warn("The proj4 argument is deprecated. Use crs instead, "
                      "which requires gisutils >= 0.2",
                      DeprecationWarning)
    if crs is not None:
        if version.parse(gisutils.__version__) < version.parse('0.2.0'):
            raise ValueError("The crs argument requires gisutils >= 0.2")
        from gisutils import get_authority_crs
        crs = get_authority_crs(crs)

    trans = grid.transform

    # keyword arguments for reading shapefile input with shp2df
    # (kept separate from **kwargs, which go to rasterio.features.rasterize)
    shp2df_kwargs = {}
    if isinstance(feature, str):
        proj4 = get_proj_str(feature)
        shp2df_kwargs = {'dest_crs': grid.crs}
        shp2df_kwargs = get_input_arguments(shp2df_kwargs, shp2df)
        df = shp2df(feature, **shp2df_kwargs)
    elif isinstance(feature, pd.DataFrame):
        df = feature.copy()
    elif isinstance(feature, collections.Iterable):
        # list of shapefiles
        if isinstance(feature[0], str):
            proj4 = get_proj_str(feature[0])
            shp2df_kwargs = {'dest_crs': grid.crs}
            shp2df_kwargs = get_input_arguments(shp2df_kwargs, shp2df)
            df = shp2df(feature, **shp2df_kwargs)
        else:
            df = pd.DataFrame({'geometry': feature})
    elif not isinstance(feature, collections.Iterable):
        df = pd.DataFrame({'geometry': [feature]})
    else:
        print('unrecognized feature input')
        return

    # handle shapefiles in different CRS than model grid
    if 'dest_crs' not in shp2df_kwargs:
        reproject = False
        # todo: consolidate rasterize reprojection to just use crs
        if crs is not None:
            if crs != grid.crs:
                df['geometry'] = project(df.geometry.values, crs, grid.crs)
        if proj4 is not None:
            if proj4 != grid.proj_str:
                reproject = True
        elif epsg is not None and grid.epsg is not None:
            if epsg != grid.epsg:
                reproject = True
                from fiona.crs import from_epsg, to_string
                proj4 = to_string(from_epsg(epsg))
        if reproject:
            df['geometry'] = project(df.geometry.values, proj4, grid.proj_str)

    # subset to include_ids
    if id_column is not None and include_ids is not None:
        df = df.loc[df[id_column].isin(include_ids)].copy()

    # create list of GeoJSON features, with unique value for each feature
    if id_column is None:
        numbers = range(1, len(df) + 1)
    # if IDs are strings, get a number for each one
    # pd.DataFrame.unique() generally preserves order
    elif df[id_column].dtype == object:
        unique_values = df[id_column].unique()
        values = dict(zip(unique_values, range(1, len(unique_values) + 1)))
        numbers = [values[n] for n in df[id_column]]
    else:
        numbers = df[id_column].tolist()

    geoms = list(zip(df.geometry, numbers))
    result = features.rasterize(geoms,
                                out_shape=(grid.nrow, grid.ncol),
                                transform=trans, **kwargs)
    assert result.sum(axis=(0, 1)) != 0, "Nothing was intersected!"
    return result.astype(dtype)
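# Hedged usage sketch for the crs-aware rasterize above. The shapefile and
# its 'zone' attribute are hypothetical, and modelgrid stands in for any
# grid.StructuredGrid-like object with nrow, ncol, crs, and transform
# attributes. all_touched is forwarded to rasterio.features.rasterize.
zones2d = rasterize('zones.shp', modelgrid, id_column='zone',
                    dtype=np.int32, all_touched=True)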
def setup_ghb_data(model):

    m = model
    source_data = model.cfg['ghb'].get('source_data').copy()
    # get the GHB cells
    # todo: generalize more of the GHB setup code and move it somewhere else
    if 'shapefile' in source_data:
        shapefile_data = source_data['shapefile']
        key = [k for k in shapefile_data.keys() if 'filename' in k.lower()][0]
        shapefile_name = shapefile_data.pop(key)
        ghbcells = rasterize(shapefile_name, m.modelgrid, **shapefile_data)
    else:
        raise NotImplementedError('Only shapefile input supported for GHBs')

    cond = model.cfg['ghb'].get('cond')
    if cond is None:
        raise KeyError("key 'cond' not found in GHB yaml input. "
                       "Must supply conductance via this key for GHB setup.")

    # sample DEM for minimum elevation in each cell with a GHB
    # todo: GHB: allow time-varying bheads via csv input
    vertices = np.array(m.modelgrid.vertices)[ghbcells.flat > 0, :, :]
    polygons = [Polygon(vrts) for vrts in vertices]
    if 'dem' in source_data:
        key = [k for k in source_data['dem'].keys() if 'filename' in k.lower()][0]
        dem_filename = source_data['dem'].pop(key)
        with rasterio.open(dem_filename) as src:
            meta = src.meta

            # reproject the polygons to the dem crs if needed
            try:
                from gisutils import get_authority_crs
                dem_crs = get_authority_crs(src.crs)
            except:
                dem_crs = pyproj.crs.CRS.from_user_input(src.crs)
            if dem_crs != m.modelgrid.crs:
                polygons = project(polygons, m.modelgrid.crs, dem_crs)

        all_touched = False
        if meta['transform'][0] > m.modelgrid.delr[0]:
            all_touched = True
        results = zonal_stats(polygons, dem_filename, stats='min',
                              all_touched=all_touched)
        min_elevs = np.ones((m.nrow * m.ncol), dtype=float) * np.nan
        min_elevs[ghbcells.flat > 0] = np.array([r['min'] for r in results])
        units_key = [k for k in source_data['dem'] if 'units' in k]
        if len(units_key) > 0:
            min_elevs *= convert_length_units(source_data['dem'][units_key[0]],
                                              model.length_units)
        min_elevs = np.reshape(min_elevs, (m.nrow, m.ncol))
    else:
        raise NotImplementedError('Must supply DEM to sample for GHB elevations\n'
                                  '(GHB: source_data: dem:)')

    # make a DataFrame with MODFLOW input
    i, j = np.indices((m.nrow, m.ncol))
    df = pd.DataFrame({'per': 0,
                       'k': 0,
                       'i': i.flat,
                       'j': j.flat,
                       'bhead': min_elevs.flat,
                       'cond': cond})
    df.dropna(axis=0, inplace=True)

    # assign layers so that bhead is above botms
    df['k'] = get_layer(model.dis.botm.array, df.i, df.j, df.bhead)

    # remove GHB cells from places where the specified head is below the model
    below_bottom_of_model = df.bhead < model.dis.botm.array[-1, df.i, df.j] + 0.01
    df = df.loc[~below_bottom_of_model].copy()

    # exclude inactive cells
    k, i, j = df.k, df.i, df.j
    if model.version == 'mf6':
        active_cells = model.idomain[k, i, j] >= 1
    else:
        active_cells = model.ibound[k, i, j] >= 1
    df = df.loc[active_cells]
    return df
def baseflow_summary(self, field_sites, field_measurements, daily_values,
                     q90_window=20, output_proj4=None):

    fm = field_measurements
    dvs = daily_values

    if fm['measurement_dt'].dtype != 'datetime64[ns]':
        fm['measurement_dt'] = pd.to_datetime(fm.measurement_dt)

    # reproject the output X, Y coordinates
    if output_proj4 is not None:
        print('reprojecting output from\n{}\nto\n{}...'.format(
            self.proj4, output_proj4))
        field_sites['geometry'] = gisutils.project(field_sites['geometry'],
                                                   self.proj4, output_proj4)

    fm_site_no = []
    Qm = []
    measurement_dt = []
    measured_rating_diff = []
    drainage_area = []
    station_nm = []
    index_station = []
    indexQr = []
    indexQ90 = []
    X, Y = [], []
    for i in range(len(fm)):
        mdt = fm.measurement_dt.tolist()[i]
        Dt = dt.datetime(mdt.year, mdt.month, mdt.day)
        for site_no, data in list(dvs.items()):
            # check if index station covers measurement date
            try:
                dv = data.loc[Dt]
            except KeyError:
                continue
            site_no = dv.site_no
            DDcd = [k for k in list(data.keys())
                    if '00060' in k and not 'cd' in k][0]
            try:
                Qr = float(dv[DDcd])  # handle ice and other non numbers
            except (ValueError, TypeError):
                continue

            # get q90 values for window
            # (pd.Timedelta no longer accepts year units; use days)
            half_window = pd.Timedelta(0.5 * q90_window * 365.25, unit='D')
            q90start = pd.Timestamp(Dt) - half_window
            q90end = pd.Timestamp(Dt) + half_window
            values = pd.to_numeric(data.loc[q90start:q90end, DDcd],
                                   errors='coerce')
            q90 = values.quantile(q=0.1)

            # append last to avoid mismatches in length
            site_info = field_sites.loc[fm.site_no.values[i]]
            fm_site_no.append(fm.site_no.values[i])
            station_nm.append(site_info['station_nm'])
            Qm.append(fm.discharge_va.values[i])
            measurement_dt.append(fm.measurement_dt.tolist()[i])
            measured_rating_diff.append(fm.measured_rating_diff.values[i])
            drainage_area.append(site_info['drain_area_va'])
            index_station.append(site_no)
            indexQr.append(Qr)
            indexQ90.append(q90)
            X.append(site_info['geometry'].xy[0][0])
            Y.append(site_info['geometry'].xy[1][0])

    df = pd.DataFrame({'site_no': fm_site_no,
                       'station_nm': station_nm,
                       'datetime': measurement_dt,
                       'Qm': Qm,
                       'quality': measured_rating_diff,
                       'drn_area': drainage_area,
                       'idx_station': index_station,
                       'indexQr': indexQr,
                       'indexQ90': indexQ90,
                       'X': X,
                       'Y': Y})

    df['est_error'] = [self.est_error.get(q.lower(), self.default_error)
                       for q in df.quality]
    df = df[['site_no', 'datetime', 'Qm', 'quality', 'est_error',
             'idx_station', 'indexQr', 'indexQ90', 'drn_area',
             'station_nm', 'X', 'Y']]
    return df
def setup_wel_data(model, for_external_files=True):
    """Performs the part of well package setup that is independent of
    MODFLOW version. Returns a DataFrame with the information
    needed to set up stress_period_data.
    """
    # default options for distributing fluxes vertically
    vfd_defaults = {'across_layers': False,
                    'distribute_by': 'thickness',
                    'screen_top_col': 'screen_top',
                    'screen_botm_col': 'screen_botm',
                    'minimum_layer_thickness': model.cfg['wel'].get('minimum_layer_thickness', 2.)
                    }

    # master dataframe for stress period data
    columns = ['per', 'k', 'i', 'j', 'q', 'boundname']
    df = pd.DataFrame(columns=columns)

    # check for source data
    datasets = model.cfg['wel'].get('source_data')

    # delete the dropped wells file if it exists, to avoid confusion
    dropped_wells_file = model.cfg['wel']['output_files']['dropped_wells_file'].format(model.name)
    if os.path.exists(dropped_wells_file):
        os.remove(dropped_wells_file)

    # get well package input from source (parent) model in lieu of source data
    # todo: fetching correct well package from mf6 parent model
    if datasets is None and model.cfg['parent'].get('default_source_data') \
            and hasattr(model.parent, 'wel'):

        # get well stress period data from mfnwt or mf6 model
        parent = model.parent
        spd = get_package_stress_period_data(parent, package_name='wel')

        # map the parent stress period data to inset stress periods
        periods = spd.groupby('per')
        dfs = []
        for inset_per, parent_per in model.parent_stress_periods.items():
            if parent_per in periods.groups:
                period = periods.get_group(parent_per)
                if len(dfs) > 0 and period.drop('per', axis=1).equals(dfs[-1].drop('per', axis=1)):
                    continue
                else:
                    dfs.append(period)
        spd = pd.concat(dfs)

        parent_well_i = spd.i.copy()
        parent_well_j = spd.j.copy()
        parent_well_k = spd.k.copy()

        # set boundnames based on well locations in parent model
        parent_name = parent.name
        spd['boundname'] = ['{}_({},{},{})'.format(parent_name, pk, pi, pj)
                            for pk, pi, pj in zip(parent_well_k,
                                                  parent_well_i,
                                                  parent_well_j)]

        # project parent well locations to the inset model CRS
        parent_well_x = parent.modelgrid.xcellcenters[parent_well_i, parent_well_j]
        parent_well_y = parent.modelgrid.ycellcenters[parent_well_i, parent_well_j]
        coords = project((parent_well_x, parent_well_y),
                         parent.modelgrid.proj_str,
                         model.modelgrid.proj_str)
        geoms = [Point(x, y) for x, y in zip(*coords)]
        bounds = model.modelgrid.bbox
        within = [g.within(bounds) for g in geoms]
        i, j = get_ij(model.modelgrid,
                      parent_well_x[within],
                      parent_well_y[within])
        spd = spd.loc[within].copy()
        spd['i'] = i
        spd['j'] = j
        df = df.append(spd)

    # read source data and map onto model space and time discretization
    # multiple types of source data can be submitted
    elif datasets is not None:
        for k, v in datasets.items():

            # determine the format
            if 'csvfile' in k.lower():  # generic csv
                # read csv file and aggregate flow rates to model stress periods
                # sum well fluxes co-located in a cell
                sd = TransientTabularSourceData.from_config(v, resolve_duplicates_with='sum',
                                                            dest_model=model)
                csvdata = sd.get_data()
                csvdata.rename(columns={v['data_column']: 'q',
                                        v['id_column']: 'boundname'}, inplace=True)
                if 'k' not in csvdata.columns:
                    if model.nlay > 1:
                        vfd = vfd_defaults.copy()
                        vfd.update(v.get('vertical_flux_distribution', {}))
                        csvdata = assign_layers_from_screen_top_botm(csvdata, model, **vfd)
                    else:
                        csvdata['k'] = 0
                df = df.append(csvdata[columns])

            elif k.lower() == 'wells':  # generic dict
                added_wells = {k: v for k, v in v.items() if v is not None}
                if len(added_wells) > 0:
                    aw = pd.DataFrame(added_wells).T
                    aw['boundname'] = aw.index
                else:
                    aw = None
                if aw is not None:
                    if 'x' in aw.columns and 'y' in aw.columns:
                        aw['i'], aw['j'] = get_ij(model.modelgrid,
                                                  aw['x'].values,
                                                  aw['y'].values)
                    aw['per'] = aw['per'].astype(int)
                    aw['k'] = aw['k'].astype(int)
                    df = df.append(aw)

            elif k.lower() == 'wdnr_dataset':  # custom input format for WI DNR
                # Get steady-state pumping rates
                check_source_files([v['water_use'],
                                    v['water_use_points']])

                # fill out period stats
                period_stats = v['period_stats']
                if isinstance(period_stats, str):
                    period_stats = {kper: period_stats for kper in range(model.nper)}

                # separate out stress periods with period mean statistics vs.
                # those to be resampled based on start/end dates
                resampled_periods = {k: v for k, v in period_stats.items()
                                     if v == 'resample'}
                periods_with_dataset_means = {k: v for k, v in period_stats.items()
                                              if k not in resampled_periods}

                if len(periods_with_dataset_means) > 0:
                    wu_means = get_mean_pumping_rates(v['water_use'],
                                                      v['water_use_points'],
                                                      period_stats=periods_with_dataset_means,
                                                      drop_ids=v.get('drop_ids'),
                                                      model=model)
                    df = df.append(wu_means)
                if len(resampled_periods) > 0:
                    wu_resampled = resample_pumping_rates(v['water_use'],
                                                          v['water_use_points'],
                                                          drop_ids=v.get('drop_ids'),
                                                          exclude_steady_state=True,
                                                          model=model)
                    df = df.append(wu_resampled)

    # boundary fluxes from parent model
    if model.perimeter_bc_type == 'flux':
        assert model.parent is not None, "need parent model for TMR cut"

        # boundary fluxes
        kstpkper = [(0, 0)]
        tmr = Tmr(model.parent, model)

        # parent periods to copy over
        kstpkper = [(0, per) for per in model.cfg['model']['parent_stress_periods']]
        bfluxes = tmr.get_inset_boundary_fluxes(kstpkper=kstpkper)
        bfluxes['boundname'] = 'boundary_flux'
        df = df.append(bfluxes)

    for col in ['per', 'k', 'i', 'j']:
        df[col] = df[col].astype(int)

    # drop any k, i, j locations that are inactive
    if model.version == 'mf6':
        inactive = model.dis.idomain.array[df.k.values,
                                           df.i.values,
                                           df.j.values] != 1
    else:
        inactive = model.bas6.ibound.array[df.k.values,
                                           df.i.values,
                                           df.j.values] != 1

    # record dropped wells in csv file
    # (which might contain wells dropped by other routines)
    if np.any(inactive):
        #inactive_i, inactive_j = df.loc[inactive, 'i'].values, df.loc[inactive, 'j'].values
        dropped = df.loc[inactive].copy()
        dropped = dropped.groupby(['k', 'i', 'j']).first().reset_index()
        dropped['reason'] = 'in inactive cell'
        dropped['routine'] = __name__ + '.setup_wel_data'
        # append to existing file if it exists
        append_csv(dropped_wells_file, dropped, index=False, float_format='%g')
        df = df.loc[~inactive].copy()

    copy_fluxes_to_subsequent_periods = False
    if copy_fluxes_to_subsequent_periods and len(df) > 0:
        df = copy_fluxes_to_subsequent_periods(df)

    wel_lookup_file = model.cfg['wel']['output_files']['lookup_file'].format(model.name)
    wel_lookup_file = os.path.join(model._tables_path, os.path.split(wel_lookup_file)[1])
    model.cfg['wel']['output_files']['lookup_file'] = wel_lookup_file

    # verify that all wells have a boundname
    if df.boundname.isna().any():
        no_name = df.boundname.isna()
        k, i, j = df.loc[no_name, ['k', 'i', 'j']].T.values
        names = ['({},{},{})'.format(k, i, j) for k, i, j in zip(k, i, j)]
        df.loc[no_name, 'boundname'] = names
        assert not df.boundname.isna().any()

    # save a lookup file with well site numbers/categories
    df.sort_values(by=['boundname', 'per'], inplace=True)
    df[['per', 'k', 'i', 'j', 'q', 'boundname']].to_csv(wel_lookup_file, index=False)

    # convert to one-based and comment out header if df will be written straight to external file
    if for_external_files:
        df.rename(columns={'k': '#k'}, inplace=True)
        df['#k'] += 1
        df['i'] += 1
        df['j'] += 1
    return df
def setup_structured_grid(xoff=None, yoff=None, xul=None, yul=None,
                          nrow=None, ncol=None, nlay=None,
                          dxy=None, delr=None, delc=None,
                          top=None, botm=None,
                          rotation=0.,
                          parent_model=None, snap_to_NHG=False,
                          features=None, features_shapefile=None,
                          id_column=None, include_ids=None,
                          buffer=1000,
                          crs=None, epsg=None,
                          model_length_units=None,
                          grid_file='grid.json',
                          bbox_shapefile=None, **kwargs):
    """Set up a structured grid, from either an origin (xoff, yoff) and
    specified dimensions, or the buffered bounding box of features of interest."""
    print('setting up model grid...')
    t0 = time.time()

    # conversions for model/parent model units to meters
    # set regular flag for handling delc/delr
    to_meters_inset = convert_length_units(model_length_units, 'meters')
    regular = True
    if dxy is not None:
        delr_m = np.round(dxy * to_meters_inset, 4)  # dxy is specified in model units
        delc_m = delr_m
    if delr is not None:
        delr_m = np.round(delr * to_meters_inset, 4)  # delr is specified in model units
        if not np.isscalar(delr_m):
            if len(set(delr_m)) == 1:
                delr_m = delr_m[0]
            else:
                regular = False
    if delc is not None:
        delc_m = np.round(delc * to_meters_inset, 4)  # delc is specified in model units
        if not np.isscalar(delc_m):
            if len(set(delc_m)) == 1:
                delc_m = delc_m[0]
            else:
                regular = False
    if parent_model is not None:
        to_meters_parent = convert_length_units(get_model_length_units(parent_model), 'meters')
        # parent model grid spacing in meters
        parent_delr_m = np.round(parent_model.dis.delr.array[0] * to_meters_parent, 4)
        if not parent_delr_m % delr_m == 0:
            raise ValueError('inset delr spacing of {} must be a factor of parent spacing of {}'
                             .format(delr_m, parent_delr_m))
        parent_delc_m = np.round(parent_model.dis.delc.array[0] * to_meters_parent, 4)
        if not parent_delc_m % delc_m == 0:
            raise ValueError('inset delc spacing of {} must be a factor of parent spacing of {}'
                             .format(delc_m, parent_delc_m))

    if epsg is not None:
        crs = pyproj.crs.CRS.from_epsg(epsg)
    elif crs is not None:
        from gisutils import get_authority_crs
        crs = get_authority_crs(crs)
    elif parent_model is not None:
        crs = parent_model.modelgrid.crs

    # option 1: make grid from xoff, yoff and specified dimensions
    if xoff is not None and yoff is not None:
        assert nrow is not None and ncol is not None, \
            "Need to specify nrow and ncol if specifying xoffset and yoffset."
        if regular:
            height_m = np.round(delc_m * nrow, 4)
            width_m = np.round(delr_m * ncol, 4)
        else:
            height_m = np.sum(delc_m)
            width_m = np.sum(delr_m)

        # optionally align grid with national hydrogeologic grid (NHG)
        # grids snapping to the NHG must have spacings that are a factor of 1 km
        if snap_to_NHG:
            assert regular and np.allclose(1000 % delc_m, 0, atol=1e-4)
            x, y = get_point_on_national_hydrogeologic_grid(xoff, yoff,
                                                            offset='edge', op=np.floor)
            xoff = x
            yoff = y
            rotation = 0.

        # need to specify xul, yul in case snapping to parent
        # todo: allow snapping to parent grid on xoff, yoff
        if rotation != 0:
            raise NotImplementedError('Rotated grids not supported.')
        xul = xoff
        yul = yoff + height_m

    # option 2: make grid using buffered feature bounding box
    else:
        if features is None and features_shapefile is not None:
            # Make sure shapefile and bbox filter are in dest (model) CRS
            # TODO: CRS wrangling could be added to shp2df as a feature
            reproject_filter = False
            try:
                from gisutils import get_shapefile_crs
                features_crs = get_shapefile_crs(features_shapefile)
                if features_crs != crs:
                    reproject_filter = True
            except Exception:
                features_crs = get_proj_str(features_shapefile)
                reproject_filter = True
            filter = None
            if parent_model is not None:
                if reproject_filter:
                    filter = project(parent_model.modelgrid.bbox,
                                     parent_model.modelgrid.crs, features_crs).bounds
                else:
                    filter = parent_model.modelgrid.bbox.bounds
            shp2df_kwargs = {'dest_crs': crs}
            shp2df_kwargs = get_input_arguments(shp2df_kwargs, shp2df)
            df = shp2df(features_shapefile,
                        filter=filter, **shp2df_kwargs)

            # optionally subset shapefile data to specified features
            if id_column is not None and include_ids is not None:
                df = df.loc[df[id_column].isin(include_ids)]
            # use all features by default
            features = df.geometry.tolist()

        # convert multiple features to a MultiPolygon
        if isinstance(features, list):
            if len(features) > 1:
                features = MultiPolygon(features)
            else:
                features = features[0]

        # size the grid based on the bbox for features
        x1, y1, x2, y2 = features.bounds
        L = buffer  # distance from area of interest to boundary
        xul = x1 - L
        yul = y2 + L
        height_m = np.round(yul - (y1 - L), 4)  # initial model height from buffer distance
        width_m = np.round((x2 + L) - xul, 4)
        rotation = 0.  # rotation not supported with this option

    # align model with parent grid if there is a parent model
    # (and not snapping to national hydrogeologic grid)
    if parent_model is not None and not snap_to_NHG:

        # get location of coinciding cell in parent model for upper left
        pi, pj = parent_model.modelgrid.intersect(xul, yul)
        verts = np.array(parent_model.modelgrid.get_cell_vertices(pi, pj))
        xul, yul = verts[:, 0].min(), verts[:, 1].max()

        # adjust the dimensions to align remaining corners
        def roundup(number, increment):
            return int(np.ceil(number / increment) * increment)
        # height is along rows (delc); width is along columns (delr)
        height = roundup(height_m, parent_delc_m)
        width = roundup(width_m, parent_delr_m)

        # update nrow, ncol after snapping to parent grid
        if regular:
            nrow = int(height / delc_m)  # height is in meters
            ncol = int(width / delr_m)

    # set the grid configuration dictionary
    # spacing is in meters (consistent with projected CRS)
    # (modelgrid object will be updated automatically from this dictionary)
    grid_cfg = {'nrow': int(nrow), 'ncol': int(ncol),
                'nlay': nlay, 'delr': delr_m, 'delc': delc_m,
                'xoff': xoff, 'yoff': yoff,
                'xul': xul, 'yul': yul,
                'rotation': rotation,
                'lenuni': 2
                }
    if regular:
        grid_cfg['delr'] = np.ones(grid_cfg['ncol'], dtype=float) * grid_cfg['delr']
        grid_cfg['delc'] = np.ones(grid_cfg['nrow'], dtype=float) * grid_cfg['delc']
    grid_cfg['delr'] = grid_cfg['delr'].tolist()  # for serializing to json
    grid_cfg['delc'] = grid_cfg['delc'].tolist()

    # renames for flopy modelgrid
    renames = {'rotation': 'angrot'}
    for k, v in renames.items():
        if k in grid_cfg:
            grid_cfg[v] = grid_cfg.pop(k)

    # add epsg or wkt if there isn't an epsg
    if epsg is not None:
        grid_cfg['epsg'] = epsg
    elif crs is not None:
        if 'epsg' in crs.srs.lower():
            grid_cfg['epsg'] = int(crs.srs.split(':')[1])
        else:
            grid_cfg['wkt'] = crs.srs
    else:
        warnings.warn('No coordinate system reference provided for model grid! '
                      'Model input data may not be mapped correctly.')

    # set up the model grid instance
    grid_cfg['top'] = top
    grid_cfg['botm'] = botm
    grid_cfg.update(kwargs)  # update with any kwargs from function call
    kwargs = get_input_arguments(grid_cfg, MFsetupGrid)
    modelgrid = MFsetupGrid(**kwargs)
    modelgrid.cfg = grid_cfg

    # write grid info to json, and shapefile of bbox
    # omit top and botm arrays from json representation of grid
    # (just the horizontal discretization)
    del grid_cfg['top']
    del grid_cfg['botm']

    fileio.dump(grid_file, grid_cfg)
    if bbox_shapefile is not None:
        write_bbox_shapefile(modelgrid, bbox_shapefile)
    print("finished in {:.2f}s\n".format(time.time() - t0))
    return modelgrid
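
# Hedged usage sketch for setup_structured_grid; coordinates, spacings, ids
# and file names below are hypothetical.

# option 1: explicit origin and dimensions
modelgrid = setup_structured_grid(xoff=682688., yoff=5139052.,
                                  nrow=112, ncol=160, nlay=1, dxy=500.,
                                  model_length_units='meters', epsg=3070)

# option 2: buffered bounding box of selected features, aligned with the grid
# of an existing parent model (assumed loaded elsewhere as parent_model, which
# also supplies nrow and ncol)
modelgrid = setup_structured_grid(features_shapefile='features.shp',
                                  id_column='COMID', include_ids=['1815297'],
                                  buffer=2000., dxy=250., nlay=1,
                                  parent_model=parent_model,
                                  model_length_units='meters')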
def extent_poly():
    """Model extent polygon in EPSG:5070, reprojected to lat/lon (EPSG:4269)."""
    # authority strings replace the deprecated '+init=epsg:' proj syntax
    extent_poly = box(390000, 1330000, 500000, 1455000)
    extent_poly_ll = project(extent_poly, 'epsg:5070', 'epsg:4269')
    return extent_poly_ll
def preprocess_te_wateruse(data, start_date=None, end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269, dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if the input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate
      to begin using the 2010 rates then (``start_date='2008'``). If no start
      or end date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by
        :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the locations in `data`.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame
        Table of continuous monthly pumping rates.

    Notes
    -----
    * time units for TE data and model are assumed to be days
    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]
    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull data to the model area
    if active_area is not None:
        df = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top, x, y,
                                              points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, x, y,
                                               points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if the sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = ['site_no', 'start_datetime', 'x', 'y',
            'screen_top', 'screen_botm', 'q', 'geometry']
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile,
                                                   index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no', 'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
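
# Hedged usage sketch for preprocess_te_wateruse; the site number, coordinates
# and rates below are invented. With the default source_crs of epsg:4269,
# x and y are longitude and latitude.
import pandas as pd

data = pd.DataFrame({'site_no': ['55075', '55075'],
                     'start_datetime': pd.to_datetime(['2010', '2015']),
                     'x': [-90.23, -90.23],
                     'y': [35.14, 35.14],
                     'q': [1.5, 2.0]})  # mgal/day
df_monthly = preprocess_te_wateruse(data,
                                    start_date='2008-01-01',
                                    end_date='2017-12-31',
                                    data_volume_units='mgal',
                                    model_length_units='meters',
                                    outfile='te_pumping.csv')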
def preprocess_iwum_pumping(ncfile,
                            start_date=None,
                            end_date=None,
                            active_area=None,
                            active_area_id_column=None,
                            active_area_feature_id=None,
                            estimated_production_zone_top=None,
                            estimated_production_zone_botm=None,
                            flux_variable='value',
                            nc_crs=5070,
                            dest_crs=5070,
                            nc_length_units='meters',
                            estimated_production_surface_units='meters',
                            model_length_units='meters',
                            outfile=None):
    """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output
    and assign open interval information, using raster surfaces of the top and
    bottom of an estimated production zone.

    Parameters
    ----------
    ncfile : file path
        NetCDF output from Irrigation Water Use Model
    start_date : str
        Cull data before this date.
    end_date : str
        Cull data after this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    flux_variable : str
        Variable in ncfile with pumping fluxes. Fluxes are assumed to
        represent total volumes for each time period.
    nc_crs : obj
        Coordinate Reference System (CRS) of ncfile.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    nc_length_units : str, {'meters', 'ft', etc.}
        Length units of pumped volumes in ncfile
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    model_length_units : str, {'meters', 'ft', etc.}
        Length units of model.
    outfile : str, optional
        CSV file for output table.

    Returns
    -------
    df : DataFrame
        Table of pumping rates in m3/day, location
        and open interval information.

        Columns:

        ============== ================================================
        site_no        index position of pumping rate in ncfile grid
        x              x-coordinate in `dest_crs`
        y              y-coordinate in `dest_crs`
        start_datetime start date of pumping period
        end_datetime   end date of pumping period
        screen_top     screen top elevation, in `model_length_units`
        screen_botm    screen bottom elevation, in `model_length_units`
        q              pumping rate, in model units
        geometry       shapely Point object representing location
        ============== ================================================

    Notes
    -----
    * Time units are assumed to be days.
    * Fluxes are assumed to represent total volumes for each time period
      indicated by the differences between successive values along the
      time axis of ncfile.
    """
    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    ds_x, ds_y = np.meshgrid(ds['x'], ds['y'])

    # original values are in m3, in each 1 mi2 cell
    # can leave in m3 if reassigning to 1km grid as point values
    length_conversion = convert_volume_units(nc_length_units,
                                             model_length_units) ** 3
    unit_suffix = vol_suffix[model_length_units] + 'd'
    flux_col = 'q'  # 'flux_{}'.format(unit_suffix)  # output field name for fluxes

    # get top/botm elevations
    est_screen_top = None
    est_screen_botm = None
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y,
                                              points_crs=nc_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y,
                                               points_crs=nc_crs)
        est_screen_botm *= surf_unit_conversion

        # in any places where the screen top is less than the screen botm,
        # set both to their mean
        loc = est_screen_top < est_screen_botm
        means = np.mean([est_screen_top, est_screen_botm], axis=0)
        est_screen_top[loc] = means[loc]
        est_screen_botm[loc] = means[loc]
        print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} '
              f'locations where screen top was < screen bottom')

    dfs = []
    times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values)
    for n, period_start_date in enumerate(times):

        # for each time entry, get the data
        kwargs = {time_variable: period_start_date}
        arr = ds[flux_variable].sel(**kwargs).values

        # make sure the pumping sign is negative
        # based on the assumption that values are mostly abstraction
        if arr.sum() > 0:
            arr *= -1

        # set up a dataframe
        data = {'site_no': np.arange(ds_x.size),
                'x': ds_x.ravel(),
                'y': ds_y.ravel(),
                }
        if est_screen_top is not None and est_screen_botm is not None:
            data.update({'screen_top': est_screen_top.ravel(),
                         'screen_botm': est_screen_botm.ravel()
                         })
        df = pd.DataFrame(data)
        df['start_datetime'] = period_start_date

        # get the end_date, handling the last entry
        if n + 1 < len(times):
            period_end_date = times[n + 1]
        else:
            # set the end date for the last period based on the previous period length
            last_start = dfs[-1]['start_datetime'].values[0]
            ndays = (pd.Timestamp(period_start_date) -
                     pd.Timestamp(last_start)).days
            period_end_date = period_start_date + pd.Timedelta(ndays, unit='d')

        # convert the time units
        ndays = (pd.Timestamp(period_end_date) -
                 pd.Timestamp(period_start_date)).days
        assert ndays > 0, "period_end_date {} is before period_start_date {}" \
            .format(period_end_date, period_start_date)
        # original quantities are volumes for the time period
        time_conversion = 1 / ndays

        # time indexing in pandas is through the last value
        period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d')
        df['end_datetime'] = period_end_date
        df[flux_col] = arr.ravel() * length_conversion * time_conversion

        # only retain cells with withdrawals (negative fluxes)
        df = df.loc[df[flux_col] < 0]

        dfs.append(df)
    df = pd.concat(dfs)

    # site number column (unique from integers used by other data sources)
    df['site_no'] = [f'iwum_{node}' for node in df.site_no]

    # project the data to a destination crs, if provided
    # make a separate metadata dataframe with 1 row per location
    # to avoid redundant operations
    metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']]
    metadata.index = metadata['site_no']
    x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, dest_crs)
    metadata['x'], metadata['y'] = x_pr, y_pr
    metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull the data to the model area, if provided
    if active_area is not None:
        df, metadata = cull_data_to_active_area(df, active_area,
                                                active_area_id_column,
                                                active_area_feature_id,
                                                data_crs=dest_crs, metadata=metadata)

    # update data with x,y values projected in metadata
    x = dict(zip(metadata.site_no, metadata.x))
    y = dict(zip(metadata.site_no, metadata.y))
    df['x'] = [x[sn] for sn in df.site_no]
    df['y'] = [y[sn] for sn in df.site_no]
    if outfile is not None:
        outfile = Path(outfile)
        df.to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # Make a plot of IWUM output in mgal/day
        out_pdf_path = outfile.parent / 'plots'
        out_pdf_path.mkdir(exist_ok=True)
        plot_iwum_output(ncfile, flux_variable=flux_variable, outpath=out_pdf_path)
    return df
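
# Hedged call sketch for preprocess_iwum_pumping; all file paths below are
# hypothetical placeholders for the project's source data.
df = preprocess_iwum_pumping('iwum_output.nc',
                             start_date='2011-01-01',
                             end_date='2018-12-31',
                             estimated_production_zone_top='production_zone_top.tif',
                             estimated_production_zone_botm='production_zone_botm.tif',
                             nc_length_units='meters',
                             estimated_production_surface_units='feet',
                             model_length_units='meters',
                             outfile='iwum_pumping.csv')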