Example #1
def test_sample_elevations_different_proj(dem, tylerforks_sfrdata, datapath):
    sfr = tylerforks_sfrdata
    sampled_elevs1 = sfr.sample_reach_elevations(dem,
                                                 method='buffers',
                                                 smooth=True)
    sampled_elevs1 = np.array(list(sampled_elevs1.values()))

    reach1_geom = sfr.reach_data.geometry[0]
    crs1 = sfr.crs
    crs2 = get_authority_crs(3070)
    sfr._crs = crs2
    sfr.reach_data['geometry'] = project(sfr.reach_data['geometry'].values,
                                         crs1, crs2)
    reach1_geom_3070 = sfr.reach_data.geometry[0]

    # verify that the reaches were reprojected
    assert reach1_geom.intersection(reach1_geom_3070).area == 0
    sampled_elevs2 = sfr.sample_reach_elevations(dem,
                                                 method='buffers',
                                                 smooth=True)
    sampled_elevs2 = np.array(list(sampled_elevs2.values()))
    rms_error = np.sqrt(np.mean((sampled_elevs2 - sampled_elevs1)**2))
    assert rms_error < 0.5  # not sure why the elevations don't match better

    # verify that at least the first reach is the same
    reach1_geom_projected_back_100buffer = project(reach1_geom_3070, crs2,
                                                   crs1).buffer(100)
    assert np.allclose(reach1_geom_projected_back_100buffer.area,
                       reach1_geom.buffer(100).area)
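
A standalone round-trip check in the same spirit as the test above can be sketched with plain shapely geometries; the coordinates and EPSG codes below are only illustrative, with gisutils.project and get_authority_crs used as in the test:

import numpy as np
from shapely.geometry import LineString
from gisutils import get_authority_crs, project

# a short line in Wisconsin Transverse Mercator coordinates (illustrative)
line = LineString([(500000., 400000.), (500100., 400100.)])
crs1 = get_authority_crs(3070)
crs2 = get_authority_crs(5070)

# reproject and project back; the round trip should nearly reproduce the original
line_5070 = project(line, crs1, crs2)
line_back = project(line_5070, crs2, crs1)
assert np.allclose(line.buffer(100).area, line_back.buffer(100).area, rtol=1e-4)
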
Example #2
    def reproject(self,
                  x_coord_col='FROM_DEC_LONG_VA',
                  y_coord_col='FROM_DEC_LAT_VA',
                  key='SITE_NO'):
        """ Reproject from self.source_crs to self.dest_crs using gisutils

        Parameters
        ----------
        x_coord_col : str, optional
            Column name in data with x-coordinates,
            by default 'FROM_DEC_LONG_VA'
        y_coord_col : str, optional
            Column name in data with y-coordinates,
            by default 'FROM_DEC_LAT_VA'
        key : str, optional
            Column to use as keys for the locations dictionary,
            by default 'SITE_NO'
        """
        x_reprj, y_reprj = project(
            zip(self.df[x_coord_col], self.df[y_coord_col]), self.source_crs,
            self.dest_crs)
        self.df['x'] = x_reprj
        self.df['y'] = y_reprj
        self.df['geometry'] = [Point(x, y) for x, y in zip(x_reprj, y_reprj)]

        # drop entries if no location information
        self.df.dropna(subset=['x', 'y'], axis=0, inplace=True)

        # make dictionary of location
        self.locations = dict(
            list(zip(self.df[key], list(zip(self.df['x'], self.df['y'])))))
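
A minimal, self-contained sketch of the coordinate-pair form of project() used in reproject() above; the site numbers, coordinates, and EPSG codes are hypothetical:

import pandas as pd
from shapely.geometry import Point
from gisutils import project

df = pd.DataFrame({'SITE_NO': ['0001', '0002'],
                   'FROM_DEC_LONG_VA': [-92.65, -92.70],
                   'FROM_DEC_LAT_VA': [46.75, 46.72]})

# project() accepts an iterable of (x, y) pairs and returns arrays of x and y
x, y = project(zip(df['FROM_DEC_LONG_VA'], df['FROM_DEC_LAT_VA']),
               4269, 26915)
df['geometry'] = [Point(xi, yi) for xi, yi in zip(x, y)]
locations = dict(zip(df['SITE_NO'], zip(x, y)))
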
Example #3
    def to_crs(self, dest_crs):
        """Reproject the LineStrings in :py:attr:`Lines.df` to
        a different Coordinate Reference System.

        Parameters
        ----------
        dest_crs : obj
            A Python int, dict, str, or :py:class:`pyproj.crs.CRS` instance
            passed to :py:meth:`pyproj.crs.CRS.from_user_input`.
            See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
            Can be any of:

              - PROJ string
              - Dictionary of PROJ parameters
              - PROJ keyword arguments for parameters
              - JSON string with PROJ parameters
              - CRS WKT string
              - An authority string [e.g. 'epsg:4326']
              - An EPSG integer code [e.g. 4326]
              - A tuple of ("auth_name", "auth_code") [e.g. ('epsg', '4326')]
              - An object with a `to_wkt` method.
              - A :class:`pyproj.crs.CRS` class
        """
        assert self.crs is not None, "No crs for flowlines"
        assert dest_crs is not None, "No destination CRS."

        dest_crs = get_authority_crs(dest_crs)
        print('\nreprojecting hydrography from\n{}\nto\n{}\n'.format(
            self.crs, dest_crs))
        geoms = project(self.df.geometry, self.crs, dest_crs)
        assert np.isfinite(np.max(geoms[0].xy[0])), \
            "Invalid reprojection; check CRS for lines and grid."
        self.df['geometry'] = geoms
        self.crs = dest_crs
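
The same pattern works on a bare sequence of shapely geometries, outside of the Lines class; the coordinates and codes below are only for illustration:

import numpy as np
from shapely.geometry import LineString
from gisutils import get_authority_crs, project

flowlines = [LineString([(-92.70, 46.70), (-92.65, 46.75)]),
             LineString([(-92.65, 46.75), (-92.60, 46.80)])]

dest_crs = get_authority_crs(26915)  # UTM zone 15N
reprojected = project(flowlines, 4269, dest_crs)
# quick validity check, as in Lines.to_crs above
assert np.isfinite(np.max(reprojected[0].xy[0]))
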
Example #4
def get_bbox(feature, dest_crs):
    """Get bounding box for a Polygon feature.

    Parameters
    ----------
    feature : str (shapefile path), shapely Polygon or GeoJSON
    dest_crs : proj str
        Desired output coordinate system (shapefiles only)
    """
    if isinstance(feature, str):
        with fiona.open(feature) as src:
            l, b, r, t = src.bounds
            bbox_src_crs = box(*src.bounds)
            shpcrs = crs(proj_str=to_string(src.crs))
        if dest_crs is not None and shpcrs != dest_crs:
            bbox_dest_crs = project(bbox_src_crs, shpcrs.proj_str,
                                    dest_crs.proj_str)
            l, b, r, t = bbox_dest_crs.bounds
            # x, y = project([(l, b),
            #                (r, t)], shpcrs.proj_str, dest_crs.proj_str)
            # filter = (x[0], y[0], x[1], y[1])
            # else:
        filter = (l, b, r, t)
    elif isinstance(feature, Polygon):
        filter = feature.bounds
    elif isinstance(feature, dict):
        try:
            filter = shape(feature).bounds
        except Exception as ex:
            print(ex)
            print("Supplied dictionary doesn't appear to be valid GeoJSON.")
    return filter
Example #5
def extent_poly():
    extent_poly_ll = box(-92.7, 46.7, -92.6, 46.8)

    extent_poly = project(extent_poly_ll, "+init=epsg:{}".format(4269),
                          "+init=epsg:26915")
    df = pd.DataFrame({'geometry': [extent_poly], 'id': [0]})
    df2shp(df, 'examples/data/bbox.shp', epsg=26915)
    return extent_poly_ll
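
The "+init=epsg:XXXX" strings above reflect the older PROJ.4 idiom; elsewhere in these examples the same reprojection is specified with plain authority strings or bare EPSG integers, which project() also accepts. A sketch of the equivalent calls (the codes match the fixture above):

from shapely.geometry import box
from gisutils import project

extent_poly_ll = box(-92.7, 46.7, -92.6, 46.8)

# equivalent to the "+init=epsg:..." call in the fixture above
extent_poly_utm = project(extent_poly_ll, 'epsg:4269', 'epsg:26915')
# EPSG integers also work (as in the NHDPlus lines test in Example #7 below)
extent_poly_utm2 = project(extent_poly_ll, 4269, 26915)
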
Example #6
    def reproject(self, dest_proj_str):
        assert self.crs.proj_str is not None, "No proj_str string for flowlines"
        assert dest_proj_str is not None, "No destination CRS."

        print('\nreprojecting hydrography from\n{}\nto\n{}\n'.format(
            self.crs.proj_str, dest_proj_str))
        geoms = project(self.df.geometry, self.crs.proj_str, dest_proj_str)
        assert np.isfinite(np.max(geoms[0].xy[0])), \
            "Invalid reprojection; check CRS for lines and grid."
        self.df['geometry'] = geoms
        self.crs.proj_str = dest_proj_str
Example #7
def test_lines_from_NHDPlus(tylerforks_lines_from_NHDPlus):
    lines = tylerforks_lines_from_NHDPlus

    tf = lines.df.name == 'Tyler Forks'
    lines_pr = project(lines.df.geometry, 4269, 26915)
    line_lengths = np.array([g.length for g in lines_pr])
    expected_asum1s = lines.df['asum2'] - line_lengths
    # add dropna due to some lines along boundary
    # not being in the PFVAA subset used for the test
    assert np.allclose(lines.df['asum1'].dropna(),
                       expected_asum1s.dropna(),
                       atol=10)
    assert np.all(lines.df.loc[tf, 'asum1'].dropna() > 95000)
    assert isinstance(lines, Lines)
Example #8
    def _compute_geometries(self, df):

        datum = np.array([coord_datums_epsg[d] for d in df.dec_coord_datum_cd])
        datums = set(datum)
        x1, y1 = df.dec_long_va.values, df.dec_lat_va.values
        x2 = np.ones(len(df), dtype=float) * np.nan
        y2 = np.ones(len(df), dtype=float) * np.nan
        for dtm in datums:
            pr1 = "epsg:{}".format(dtm)
            loc = datum == dtm
            x2[loc], y2[loc] = gisutils.project((x1[loc], y1[loc]), pr1,
                                                self.proj_str)
        geoms = [Point(x, y) for x, y in zip(x2, y2)]
        return geoms
Example #9
    def _read_extent_shapefile(self, shpfile, buffer=0):

        import fiona
        from fiona.crs import to_string, from_epsg

        print('reading extent from {}...'.format(shpfile))
        shp = fiona.open(shpfile)
        g = shape(next(iter(shp))['geometry'])

        if to_string(from_epsg(coord_datums_epsg[self.datum])) != to_string(
                shp.crs):
            print('reprojecting extent from {} to {}'.format(
                to_string(shp.crs), self.proj_str))
            return gisutils.project(g, to_string(shp.crs), self.proj_str)
        else:
            return g
Example #10
def test_get_upstream_area():

    catchments = ['/Users/aleaf/Documents/NHDPlus/NHDPlusGL/NHDPlus04/NHDPlusCatchment/Catchment.shp',
                  '/Users/aleaf/Documents/NHDPlus/NHDPlusMS/NHDPlus07/NHDPlusCatchment/Catchment.shp']
    plusflow = ['/Users/aleaf/Documents/NHDPlus/NHDPlusGL/NHDPlus04/NHDPlusAttributes/PlusFlow.dbf',
                '/Users/aleaf/Documents/NHDPlus/NHDPlusMS/NHDPlus07/NHDPlusAttributes/PlusFlow.dbf']
    nodasites = '/Users/aleaf/Documents/USFS/Nicolet/targets/north/flux_field_no_da.shp'
    flowlines = ['/Users/aleaf/Documents/NHDPlus/NHDPlusGL/NHDPlus04/NHDSnapshot/Hydrography/NHDFlowline.shp',
                 '/Users/aleaf/Documents/NHDPlus/NHDPlusMS/NHDPlus07/NHDSnapshot/Hydrography/NHDFlowline.shp']
    nearfield = '/Users/aleaf/Documents/USFS/Nicolet/shps/Nicolet_north_NF.shp'

    nf = shape(next(iter(fiona.open(nearfield)))['geometry'])
    nf = project(nf, '+init=epsg:26716', '+init=epsg:4269')
    bbox = nf.bounds

    noda = shp2df(nodasites)

    get_upstream_area(noda.geometry.tolist(), plusflow, flowlines, catchments, nf)
Example #11
def read_polygon_feature(feature, dest_crs, feature_crs=None):
    """Read a geometric feature from a shapefile, shapely geometry object,
    or collection of shapely geometry objects. Reproject to dest_crs
    if the feature is in a different CRS.

    Parameters
    ----------
    feature : shapely Polygon, list of Polygons, or shapefile path
            Polygons must be in same CRS as linework; shapefile
            features will be reprojected if their crs is different.
    dest_crs : instance of sfrmaker.crs
        Output CRS for the feature.
    feature_crs : instance of sfrmaker.crs, optional
        CRS of the feature, if it is not a shapefile (shapefile CRS is
        read from the .prj file). Features in a different CRS than
        dest_crs are reprojected. By default, None.

    Returns
    -------
    feature : shapely geometry object
    """
    if isinstance(feature, str):
        with fiona.open(feature) as src:
            feature_crs = crs(src.crs)
        geoms = shp2df(feature)['geometry'].values
        feature = unary_union(geoms)
    elif isinstance(feature, collections.Iterable):
        if isinstance(feature[0], dict):
            try:
                feature = [shape(f) for f in feature]
            except Exception as ex:
                print(ex)
                print(
                    "Supplied dictionary doesn't appear to be valid GeoJSON.")
        feature = unary_union(feature)
    elif isinstance(feature, dict):
        try:
            feature = shape(feature)
        except Exception as ex:
            print(ex)
            print("Supplied dictionary doesn't appear to be valid GeoJSON.")
    elif isinstance(feature, Polygon):
        pass
    else:
        raise TypeError("Unrecognized feature input.")
    if feature_crs is not None and feature_crs != dest_crs:
        feature = project(feature, feature_crs.proj_str, dest_crs.proj_str)
    return feature.buffer(0)
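
The core geometry handling in read_polygon_feature (dissolving a collection of polygons with unary_union and repairing the result with a zero-width buffer) can be sketched with shapely alone:

from shapely.geometry import box, shape
from shapely.ops import unary_union

# two overlapping polygons, e.g. read from GeoJSON-like dicts
features = [box(0, 0, 2, 2).__geo_interface__,
            box(1, 1, 3, 3).__geo_interface__]
polygons = [shape(f) for f in features]

# dissolve into a single feature and repair any invalid geometry
feature = unary_union(polygons).buffer(0)
assert feature.area == 7.0  # 4 + 4 - 1 of overlap
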
Example #12
def get_bbox(feature, dest_crs):
    """Get bounding box for a Polygon feature.

    Parameters
    ----------
    feature : str (shapefile path), shapely Polygon or GeoJSON
    dest_crs : obj
        Desired output coordinate reference system (shapefiles only).
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [e.g. 'epsg:4326']
          - An EPSG integer code [e.g. 4326]
          - A tuple of ("auth_name", "auth_code") [e.g. ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    """
    if isinstance(feature, str):
        with fiona.open(feature) as src:
            l, b, r, t = src.bounds
            # build the box from bounds captured while the collection is open
            bbox_src_crs = box(l, b, r, t)
        shpcrs = get_shapefile_crs(feature)
        if dest_crs is not None and shpcrs != dest_crs:
            bbox_dest_crs = project(bbox_src_crs, shpcrs, dest_crs)
            l, b, r, t = bbox_dest_crs.bounds
        filter = (l, b, r, t)
    elif isinstance(feature, Polygon):
        filter = feature.bounds
    elif isinstance(feature, dict):
        try:
            filter = shape(feature).bounds
        except Exception as ex:
            print(ex)
            print("Supplied dictionary doesn't appear to be valid GeoJSON.")
    return filter
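
Outside of get_bbox, the reprojected-bounds pattern it implements for shapefile input can be reproduced with shapely and gisutils.project alone (coordinates and codes are illustrative):

from shapely.geometry import box
from gisutils import project

# bounds of an area of interest in geographic coordinates (NAD83)
l, b, r, t = -92.7, 46.7, -92.6, 46.8
bbox_src_crs = box(l, b, r, t)

# reproject the box and take its bounds in the destination CRS,
# e.g. for use as a bbox filter when reading features with fiona
bbox_dest_crs = project(bbox_src_crs, 4269, 5070)
filter = bbox_dest_crs.bounds  # (xmin, ymin, xmax, ymax)
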
Example #13
def setup_lake_info(model):

    # lake package must have a source_data block
    # (e.g. for supplying shapefile delineating lake extents)
    source_data = model.cfg.get('lak', {}).get('source_data')
    if source_data is None or 'lak' not in model.package_list:
        return
    lakesdata = model.load_features(**source_data['lakes_shapefile'])
    lakesdata_proj_str = get_proj_str(source_data['lakes_shapefile']['filename'])
    id_column = source_data['lakes_shapefile']['id_column'].lower()
    name_column = source_data['lakes_shapefile'].get('name_column', 'name').lower()
    nlakes = len(lakesdata)

    # make dataframe with lake IDs, names and locations
    centroids = project([g.centroid for g in lakesdata.geometry],
                        lakesdata_proj_str, 'epsg:4269')
    df = pd.DataFrame({'lak_id': np.arange(1, nlakes + 1),
                       'feat_id': lakesdata[id_column].values,
                       'name': lakesdata[name_column].values,
                       'latitude': [c.y for c in centroids],
                       'geometry': lakesdata['geometry']
                       })
    # get starting stages from model top, for specifying ranges
    stages = []
    for lakid in df['lak_id']:
        loc = model._lakarr2d == lakid
        est_stage = model.dis.top.array[loc].min()
        stages.append(est_stage)
    df['strt'] = np.array(stages)

    # save a lookup file mapping lake ids to hydroids
    lookup_file = model.cfg['lak']['output_files']['lookup_file'].format(model.name)
    df.drop('geometry', axis=1).to_csv(lookup_file, index=False)

    # clean up names
    df['name'].replace('nan', '', inplace=True)
    df['name'].replace(' ', '', inplace=True)
    return df
Example #14
def preprocess_flows(
    data,
    metadata=None,
    flow_data_columns=['flow'],
    start_date=None,
    active_area=None,
    active_area_id_column=None,
    active_area_feature_id=None,
    source_crs=4269,
    dest_crs=5070,
    datetime_col='datetime',
    site_no_col='site_no',
    line_id_col='line_id',
    x_coord_col='x',
    y_coord_col='y',
    name_col='name',
    flow_qualifier_column=None,
    default_qualifier='measured',
    include_sites=None,
    include_line_ids=None,
    source_volume_units='ft3',
    source_time_units='s',
    dest_volume_units='m3',
    dest_time_units='d',
    geographic_groups=None,
    geographic_groups_col=None,
    max_obsname_len=None,
    add_leading_zeros_to_sw_site_nos=False,
    column_renames=None,
    outfile=None,
):
    """Preprocess stream flow observation data, for example, from NWIS or another data source that
    outputs time series in CSV format with site locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : csv file or DataFrame
        Time series of stream flow observations.
        Columns:

        ===================== ======================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        flow_data_columns     Columns of observed streamflow values
        flow_qualifier_column Optional column with qualifiers for flow values
        ===================== ======================================

        Notes:

        * x and y columns can alternatively be in the metadata table
        * flow_data_columns are denoted in `flow_data_columns`; multiple
          columns can be included to process base flow and total flow, or
          other statistics in tandem
        * For example, `flow_qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Stream flow observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================

        Notes:

        * other columns in metadata will be passed through to the metadata output

    flow_data_columns : list of strings
        Columns in data with flow values or their statistics.
        By default, ['flow']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [e.g. 'epsg:4326']
          - An EPSG integer code [e.g. 4326]
          - A tuple of ("auth_name", "auth_code") [e.g. ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    flow_qualifier_column : str, optional
        Column name in data with flow observation qualifiers, such
        as "measured" or "estimated".
        By default, None, in which case all flows are assigned
        the `default_qualifier`.
    default_qualifier : str, optional
        Default qualifier to populate flow_qualifier_column if it
        is None. By default, "measured"
    include_sites : list-like, optional
        Limit output to these sites.
        By default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to the sites represented by these line identifiers.
        By default, None (include all sites)
    source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the source data. By default, 'ft3'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Volume units of the output (model). By default, 'm3'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:~`mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different than those listed above.
        For example, if the data file has a 'SITE_NO' column instead of 'SITE_BADGE'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the renames listed above will be used.
        Note that the renames must be the same as those listed above for
        :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_

    Notes
    -----

    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()
    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} or {} found in {}. Need to specify a site_no_col or line_id_col".format(site_no_col,
                                                                                            line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {
        datetime_col: 'datetime',
        site_no_col: 'site_no',
        line_id_col: 'line_id',
        x_coord_col: 'x',
        y_coord_col: 'y',
        name_col: 'name',
        flow_qualifier_column: 'category'
    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [
        c if c not in dest_columns else dest_columns[c]
        for c in flow_data_columns
    ]
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df['line_id']

    # read the metadata
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError(
                'If metadata are supplied, both data and metadata must '
                'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,
                                          active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=md)

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units
    # ensure that flow values are numeric (may be objects if taken directly from NWIS)
    unit_conversion = (
        convert_volume_units(source_volume_units, dest_volume_units) /
        convert_time_units(source_time_units, dest_time_units))
    for flow_col in flow_data_columns:
        df[flow_col] = pd.to_numeric(df[flow_col],
                                     errors='coerce') * unit_conversion
    df.dropna(subset=flow_data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured)
    # with measured including values derived from baseflow separation or actual measurements)
    # output column name for flow qualifier column:
    dest_flow_qualifier_column = 'category'
    if flow_qualifier_column is not None:
        flow_qualifiers = {
            'calculated': 'measured',  # 'measured',
            'base flow separated from measured values':
            'measured',  # 'measured',
            'measured total flow': 'measured',
            'estimated gaged': 'estimated',
            'estimated ungaged': 'estimated'
        }
        df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace(
            flow_qualifiers)
    else:
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md,
                                     geographic_groups,
                                     geographic_groups_col,
                                     metadata_crs=dest_crs)

    # data columns
    data_cols = ['site_no', 'line_id', 'datetime'
                 ] + flow_data_columns + ['category']
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #    df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']
    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                           index=False,
                                           float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
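
A minimal call to preprocess_flows might look like the sketch below; the file names, dates, and output paths are hypothetical, and the import path follows the :func:`mapgwm.swflows.preprocess_flows` reference in the docstring above:

from mapgwm.swflows import preprocess_flows

# hypothetical input files with the columns described above
# (site_no, datetime, x, y, flow, ...)
data, metadata = preprocess_flows(
    'source_data/streamflow_obs.csv',
    metadata='source_data/streamflow_site_info.csv',
    flow_data_columns=['flow'],
    start_date='2000-01-01',
    source_crs=4269,             # site locations in NAD83 geographic coordinates
    dest_crs=5070,               # model CRS
    source_volume_units='ft3',
    source_time_units='s',
    dest_volume_units='m3',
    dest_time_units='d',
    outfile='processed/flow_obs.csv')
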
Example #15
def preprocess_headobs(
        data,
        metadata,
        head_data_columns=['head', 'last_head', 'head_std'],
        dem=None,
        dem_units='meters',
        start_date='1998-04-01',
        active_area=None,
        active_area_id_column=None,
        active_area_feature_id=None,
        source_crs=4269,
        dest_crs=5070,
        data_length_units='meters',
        model_length_units='meters',
        geographic_groups=None,
        geographic_groups_col=None,
        max_obsname_len=None,
        outfile='../source_data/observations/head_obs/preprocessed_head_obs.csv'
):
    """Preprocess head observation data, for example, groundwater level data output from the
    `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length units are converted to those of the groundwater model. Open intervals for the wells are
      converted from depths to elevations
    * missing open intervals are filled based on well bottom depths (if available) and the median open
      interval length for the dataset.
    * Wells are categorized based on the quality of the open interval information (see the documentation
      for :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.
        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are in
          `head_data_columns`

    metadata : DataFrame
        Head observation site metadata, e.g. as output from :func:`mapgwm.headobs.get_data`.

        Must have the following columns:

        ================= ==========================================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both
        the average and last head values for the stress period to be considered,
        as well as the variability of water levels contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead elevations.
        Any reprojection to dest_crs is handled automatically, assuming
        the DEM raster has CRS information embedded (arc-ascii grids do not!)
        By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [e.g. 'epsg:4326']
          - An EPSG integer code [e.g. 4326]
          - A tuple of ("auth_name", "auth_code") [e.g. ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269

    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within the
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.

    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """

    df = data.copy()
    # multiplier to convert input length units to model units
    unit_conversion = convert_length_units(data_length_units,
                                           model_length_units)

    # outputs
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting and ending dates here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; drop the timestamps
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    print(
        f'starting with {n_measurements:,d} measurements at {n_sites:,d} unique wells'
    )
    no_data_in_period = df.datetime < stdate

    if np.any(no_data_in_period):
        in_period = df.datetime >= stdate
        n_sites_before = len(
            set(df.loc[no_data_in_period,
                       'site_no']).difference(set(df.loc[in_period,
                                                         'site_no'])))
        print((
            f'culling {no_data_in_period.sum():,d} measurements from {n_sites_before:,d} '
            f'sites that are prior to start date of {start_date}'))
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    well_info = groups.mean().copy()
    well_info = well_info.join(metadata, rsuffix='_meta')
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, well_info = cull_data_to_active_area(df,
                                                 active_area,
                                                 active_area_id_column,
                                                 active_area_feature_id,
                                                 data_crs=dest_crs,
                                                 metadata=well_info)

    # convert length units; convert screen tops and botms to depths
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        well_location_elevations = get_values_at_points(dem,
                                                        well_info['x'],
                                                        well_info['y'],
                                                        points_crs=dest_crs)
        well_location_elevations *= convert_length_units(
            dem_units, model_length_units)
        well_info.loc[missing_elevations,
                      'well_el'] = well_location_elevations[missing_elevations]

    length_columns = ['well_el'
                      ] + head_data_columns + ['screen_top', 'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']

    # just the data, site numbers, times and aquifer
    if 'head_std' not in head_data_columns:
        head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # #### trim down to only well_info with both estimated water levels and standard deviation
    # monthly measured levels may not have standard deviation
    # (as opposed to monthly statistical estimates)
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to well_info in well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        warnings.warn('culling {} wells not found in metadata table!'.format(
            np.sum(~has_metadata)))
        df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    well_info['obsprefix'] = obsnames
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    well_info['group'] = 'heads'
    well_info = assign_geographic_obsgroups(well_info,
                                            geographic_groups,
                                            geographic_groups_col,
                                            metadata_crs=dest_crs)

    # save out the results
    if outfile is not None:
        df2shp(well_info.drop(['x', 'y'], axis=1),
               out_shapefile,
               index=False,
               crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                                  index=False,
                                                  float_format='%.2f')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info
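
Similarly for preprocess_headobs, a hedged sketch with hypothetical input files; data and metadata are expected to carry the columns described in the docstring above (e.g. as produced by :func:`mapgwm.headobs.get_data`):

import pandas as pd
from mapgwm.headobs import preprocess_headobs

# hypothetical CSVs with the columns described in the docstring above
data = pd.read_csv('source_data/headobs_data.csv')
metadata = pd.read_csv('source_data/headobs_metadata.csv', index_col='site_no')

df, well_info = preprocess_headobs(
    data, metadata,
    head_data_columns=['head', 'last_head'],
    dem='source_data/dem_30m.tif',   # used to fill missing wellhead elevations
    dem_units='meters',
    start_date='1998-04-01',
    source_crs=4269,
    dest_crs=5070,
    data_length_units='feet',
    model_length_units='meters',
    outfile='processed/preprocessed_head_obs.csv')
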
Example #16
def rasterize(feature, grid, id_column=None,
              include_ids=None,
              epsg=None,
              proj4=None, dtype=np.float32):
    """Rasterize a feature onto the model grid, using
    the rasterio.features.rasterize method. Features are intersected
    if they contain the cell center.

    Parameters
    ----------
    feature : str (shapefile path), list of shapely objects,
              or dataframe with geometry column
    id_column : str
        Column with unique integer identifying each feature; values
        from this column will be assigned to the output raster.
    grid : grid.StructuredGrid instance
    epsg : int
        EPSG code for feature coordinate reference system. Optional,
        but an epsg code or proj4 string must be supplied if feature
        isn't a shapefile, and isn't in the same CRS as the model.
    proj4 : str
        Proj4 string for feature CRS (optional)
    dtype : dtype
        Datatype for the output array

    Returns
    -------
    2D numpy array with intersected values

    """
    try:
        from rasterio import features
        from rasterio import Affine
    except:
        print('This method requires rasterio.')
        return

    #trans = Affine(sr.delr[0], 0., sr.xul,
    #               0., -sr.delc[0], sr.yul) * Affine.rotation(sr.rotation)
    trans = grid.transform

    if isinstance(feature, str):
        proj4 = get_proj_str(feature)
        df = shp2df(feature)
    elif isinstance(feature, pd.DataFrame):
        df = feature.copy()
    elif isinstance(feature, collections.Iterable):
        # list of shapefiles
        if isinstance(feature[0], str):
            proj4 = get_proj_str(feature[0])
            df = shp2df(feature)
        else:
            df = pd.DataFrame({'geometry': feature})
    elif not isinstance(feature, collections.Iterable):
        df = pd.DataFrame({'geometry': [feature]})
    else:
        print('unrecognized feature input')
        return

    # handle shapefiles in different CRS than model grid
    reproject = False
    if proj4 is not None:
        if proj4 != grid.proj_str:
            reproject = True
    elif epsg is not None and grid.epsg is not None:
        if epsg != grid.epsg:
            reproject = True
            from fiona.crs import to_string, from_epsg
            proj4 = to_string(from_epsg(epsg))
    if reproject:
        df['geometry'] = project(df.geometry.values, proj4, grid.proj_str)

    # subset to include_ids
    if id_column is not None and include_ids is not None:
        df = df.loc[df[id_column].isin(include_ids)].copy()

    # create list of GeoJSON features, with unique value for each feature
    if id_column is None:
        numbers = range(1, len(df)+1)
    # if IDs are strings, get a number for each one
    # pd.DataFrame.unique() generally preserves order
    elif df[id_column].dtype == object:
        unique_values = df[id_column].unique()
        values = dict(zip(unique_values, range(1, len(unique_values) + 1)))
        numbers = [values[n] for n in df[id_column]]
    else:
        numbers = df[id_column].tolist()

    geoms = list(zip(df.geometry, numbers))
    result = features.rasterize(geoms,
                                out_shape=(grid.nrow, grid.ncol),
                                transform=trans)
    assert result.sum(axis=(0, 1)) != 0, "Nothing was intersected!"
    return result.astype(dtype)
Example #17
def rasterize(feature,
              grid,
              id_column=None,
              include_ids=None,
              crs=None,
              epsg=None,
              proj4=None,
              dtype=np.float32,
              **kwargs):
    """Rasterize a feature onto the model grid, using
    the rasterio.features.rasterize method. Features are intersected
    if they contain the cell center.

    Parameters
    ----------
    feature : str (shapefile path), list of shapely objects,
              or dataframe with geometry column
    id_column : str
        Column with unique integer identifying each feature; values
        from this column will be assigned to the output raster.
    grid : grid.StructuredGrid instance
    crs : obj
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`
        Can be any of:

          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [e.g. 'epsg:4326']
          - An EPSG integer code [e.g. 4326]
          - A tuple of ("auth_name", "auth_code") [e.g. ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

    dtype : dtype
        Datatype for the output array
    **kwargs : keyword arguments to rasterio.features.rasterize()
        https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html

    Returns
    -------
    2D numpy array with intersected values

    """
    try:
        from rasterio import Affine, features
    except:
        print('This method requires rasterio.')
        return

    if epsg is not None:
        warnings.warn(
            "The epsg argument is deprecated. Use crs instead, "
            "which requires gisutils >= 0.2", DeprecationWarning)
    if proj4 is not None:
        warnings.warn(
            "The proj4 argument is deprecated. Use crs instead, "
            "which requires gisutils >= 0.2", DeprecationWarning)
    if crs is not None:
        if version.parse(gisutils.__version__) < version.parse('0.2.0'):
            raise ValueError("The crs argument requires gisutils >= 0.2")
        from gisutils import get_authority_crs
        crs = get_authority_crs(crs)

    trans = grid.transform

    kwargs = {}
    if isinstance(feature, str):
        proj4 = get_proj_str(feature)
        kwargs = {'dest_crs': grid.crs}
        kwargs = get_input_arguments(kwargs, shp2df)
        df = shp2df(feature, **kwargs)
    elif isinstance(feature, pd.DataFrame):
        df = feature.copy()
    elif isinstance(feature, collections.Iterable):
        # list of shapefiles
        if isinstance(feature[0], str):
            proj4 = get_proj_str(feature[0])
            kwargs = {'dest_crs': grid.crs}
            kwargs = get_input_arguments(kwargs, shp2df)
            df = shp2df(feature, **kwargs)
        else:
            df = pd.DataFrame({'geometry': feature})
    elif not isinstance(feature, collections.Iterable):
        df = pd.DataFrame({'geometry': [feature]})
    else:
        print('unrecognized feature input')
        return

    # handle shapefiles in different CRS than model grid
    if 'dest_crs' not in kwargs:
        reproject = False
        # todo: consolidate rasterize reprojection to just use crs
        if crs is not None:
            if crs != grid.crs:
                df['geometry'] = project(df.geometry.values, crs, grid.crs)
        if proj4 is not None:
            if proj4 != grid.proj_str:
                reproject = True
        elif epsg is not None and grid.epsg is not None:
            if epsg != grid.epsg:
                reproject = True
                from fiona.crs import from_epsg, to_string
                proj4 = to_string(from_epsg(epsg))
        if reproject:
            df['geometry'] = project(df.geometry.values, proj4, grid.proj_str)

    # subset to include_ids
    if id_column is not None and include_ids is not None:
        df = df.loc[df[id_column].isin(include_ids)].copy()

    # create list of GeoJSON features, with unique value for each feature
    if id_column is None:
        numbers = range(1, len(df) + 1)
    # if IDs are strings, get a number for each one
    # pd.DataFrame.unique() generally preserves order
    elif df[id_column].dtype == object:
        unique_values = df[id_column].unique()
        values = dict(zip(unique_values, range(1, len(unique_values) + 1)))
        numbers = [values[n] for n in df[id_column]]
    else:
        numbers = df[id_column].tolist()

    geoms = list(zip(df.geometry, numbers))
    result = features.rasterize(geoms,
                                out_shape=(grid.nrow, grid.ncol),
                                transform=trans)
    assert result.sum(axis=(0, 1)) != 0, "Nothing was intersected!"
    return result.astype(dtype)
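
A usage sketch for the newer rasterize() above; grid is assumed to be a grid.StructuredGrid instance (e.g. model.modelgrid in a modflow-setup model), and the shapefile and id column names are hypothetical:

# 'grid' is a grid.StructuredGrid instance, e.g. model.modelgrid
lakes2d = rasterize('source_data/lakes.shp', grid,
                    id_column='HYDROID',
                    include_ids=[1000001, 1000002])
assert lakes2d.shape == (grid.nrow, grid.ncol)
# cells inside the selected lake polygons carry the corresponding HYDROID values;
# everything else is zero
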
Example #18
def setup_ghb_data(model):

    m = model
    source_data = model.cfg['ghb'].get('source_data').copy()
    # get the GHB cells
    # todo: generalize more of the GHB setup code and move it somewhere else
    if 'shapefile' in source_data:
        shapefile_data = source_data['shapefile']
        key = [k for k in shapefile_data.keys() if 'filename' in k.lower()][0]
        shapefile_name = shapefile_data.pop(key)
        ghbcells = rasterize(shapefile_name, m.modelgrid, **shapefile_data)
    else:
        raise NotImplementedError('Only shapefile input supported for GHBs')

    cond = model.cfg['ghb'].get('cond')
    if cond is None:
        raise KeyError("key 'cond' not found in GHB yaml input. "
                       "Must supply conductance via this key for GHB setup.")

    # sample DEM for minimum elevation in each cell with a GHB
    # todo: GHB: allow time-varying bheads via csv input
    vertices = np.array(m.modelgrid.vertices)[ghbcells.flat > 0, :, :]
    polygons = [Polygon(vrts) for vrts in vertices]
    if 'dem' in source_data:
        key = [
            k for k in source_data['dem'].keys() if 'filename' in k.lower()
        ][0]
        dem_filename = source_data['dem'].pop(key)
        with rasterio.open(dem_filename) as src:
            meta = src.meta

        # reproject the polygons to the dem crs if needed
        try:
            from gisutils import get_authority_crs
            dem_crs = get_authority_crs(src.crs)
        except:
            dem_crs = pyproj.crs.CRS.from_user_input(src.crs)
        if dem_crs != m.modelgrid.crs:
            polygons = project(polygons, m.modelgrid.crs, dem_crs)

        all_touched = False
        if meta['transform'][0] > m.modelgrid.delr[0]:
            all_touched = True
        results = zonal_stats(polygons,
                              dem_filename,
                              stats='min',
                              all_touched=all_touched)
        min_elevs = np.ones((m.nrow * m.ncol), dtype=float) * np.nan
        min_elevs[ghbcells.flat > 0] = np.array([r['min'] for r in results])
        units_key = [k for k in source_data['dem'] if 'units' in k]
        if len(units_key) > 0:
            min_elevs *= convert_length_units(source_data['dem'][units_key[0]],
                                              model.length_units)
        min_elevs = np.reshape(min_elevs, (m.nrow, m.ncol))
    else:
        raise NotImplementedError(
            'Must supply DEM to sample for GHB elevations\n'
            '(GHB: source_data: dem:)')

    # make a DataFrame with MODFLOW input
    i, j = np.indices((m.nrow, m.ncol))
    df = pd.DataFrame({
        'per': 0,
        'k': 0,
        'i': i.flat,
        'j': j.flat,
        'bhead': min_elevs.flat,
        'cond': cond
    })
    df.dropna(axis=0, inplace=True)

    # assign layers so that bhead is above botms
    df['k'] = get_layer(model.dis.botm.array, df.i, df.j, df.bhead)
    # remove GHB cells where the specified head is below the bottom of the model
    below_bottom_of_model = df.bhead < model.dis.botm.array[-1, df.i,
                                                            df.j] + 0.01
    df = df.loc[~below_bottom_of_model].copy()

    # exclude inactive cells
    k, i, j = df.k, df.i, df.j
    if model.version == 'mf6':
        active_cells = model.idomain[k, i, j] >= 1
    else:
        active_cells = model.ibound[k, i, j] >= 1
    df = df.loc[active_cells]
    return df
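# A hedged sketch of the configuration block that setup_ghb_data() reads,
# inferred from the dictionary keys accessed above. The exact key spellings
# ('filename', 'elevation_units') are illustrative; the code only requires keys
# containing the substrings 'filename' and 'units'. File paths and the
# conductance value are hypothetical, and `model` is assumed to exist.
model.cfg['ghb'] = {
    'source_data': {
        'shapefile': {
            'filename': 'ghb_features.shp',   # hypothetical path
            # any remaining entries are passed to rasterize() as keyword arguments
        },
        'dem': {
            'filename': 'dem.tif',            # hypothetical path
            'elevation_units': 'feet',        # converted to model length units
        },
    },
    'cond': 1000.,  # required; conductance in model units
}
ghb_df = setup_ghb_data(model)  # DataFrame with per, k, i, j, bhead, cond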
Example #19
0
def baseflow_summary(self,
                     field_sites,
                     field_measurements,
                     daily_values,
                     q90_window=20,
                     output_proj4=None):

    fm = field_measurements
    dvs = daily_values

    if fm['measurement_dt'].dtype != 'datetime64[ns]':
        fm['measurement_dt'] = pd.to_datetime(fm.measurement_dt)

    # reproject the output X, Y coordinates
    if output_proj4 is not None:
        print('reprojecting output from\n{}\nto\n{}...'.format(
            self.proj4, output_proj4))
        field_sites['geometry'] = gisutils.project(field_sites, self.proj4,
                                                   output_proj4)

    fm_site_no = []
    Qm = []
    measurement_dt = []
    measured_rating_diff = []
    drainage_area = []
    station_nm = []
    index_station = []
    indexQr = []
    indexQ90 = []
    X, Y = [], []
    for i in range(len(fm)):
        mdt = fm.measurement_dt.tolist()[i]
        Dt = dt.datetime(mdt.year, mdt.month, mdt.day)
        for site_no, data in list(dvs.items()):

            # check if index station covers measurement date
            try:
                dv = data.loc[Dt]
            except KeyError:
                continue
            site_no = dv.site_no
            DDcd = [
                k for k in list(data.keys()) if '00060' in k and not 'cd' in k
            ][0]
            try:
                Qr = float(dv[DDcd])  # handle ice and other non-numeric values
            except (ValueError, TypeError):
                continue

            # get q90 values for a window of q90_window years
            # centered on the measurement date
            # (years approximated as 365.25 days; 'Y' Timedelta units
            # are no longer supported by pandas)
            q90start = pd.Timestamp(Dt) - pd.Timedelta(0.5 * q90_window * 365.25,
                                                       unit='D')
            q90end = pd.Timestamp(Dt) + pd.Timedelta(0.5 * q90_window * 365.25,
                                                     unit='D')
            values = pd.to_numeric(data.loc[q90start:q90end, DDcd],
                                   errors='coerce')
            q90 = values.quantile(q=0.1)

            # append last to avoid mismatches in length
            site_info = field_sites.loc[fm.site_no.values[i]]
            fm_site_no.append(fm.site_no.values[i])
            station_nm.append(site_info['station_nm'])
            Qm.append(fm.discharge_va.values[i])
            measurement_dt.append(fm.measurement_dt.tolist()[i])
            measured_rating_diff.append(fm.measured_rating_diff.values[i])
            drainage_area.append(site_info['drain_area_va'])
            index_station.append(site_no)
            indexQr.append(Qr)
            indexQ90.append(q90)
            X.append(site_info['geometry'].xy[0][0])
            Y.append(site_info['geometry'].xy[1][0])

    df = pd.DataFrame({
        'site_no': fm_site_no,
        'station_nm': station_nm,
        'datetime': measurement_dt,
        'Qm': Qm,
        'quality': measured_rating_diff,
        'drn_area': drainage_area,
        'idx_station': index_station,
        'indexQr': indexQr,
        'indexQ90': indexQ90,
        'X': X,
        'Y': Y
    })
    df['est_error'] = [
        self.est_error.get(q.lower(), self.default_error) for q in df.quality
    ]
    df = df[[
        'site_no', 'datetime', 'Qm', 'quality', 'est_error', 'idx_station',
        'indexQr', 'indexQ90', 'drn_area', 'station_nm', 'X', 'Y'
    ]]
    return df
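# A small self-contained sketch of the Q90 statistic used above: the flow that
# is exceeded 90 percent of the time is the 10th percentile of the daily values,
# which is why values.quantile(q=0.1) is computed over the q90_window of daily
# records surrounding each measurement. The synthetic series below is only for
# illustration.
import numpy as np
import pandas as pd

daily_q = pd.Series(np.random.lognormal(mean=3., sigma=1., size=365 * 20))
q90 = daily_q.quantile(q=0.1)
# roughly 90 percent of the daily flows equal or exceed Q90
assert (daily_q >= q90).mean() >= 0.9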
Example #20
0
def setup_wel_data(model, for_external_files=True):
    """Performs the part of well package setup that is independent of
    MODFLOW version. Returns a DataFrame with the information
    needed to set up stress_period_data.
    """
    # default options for distributing fluxes vertically
    vfd_defaults = {
        'across_layers': False,
        'distribute_by': 'thickness',
        'screen_top_col': 'screen_top',
        'screen_botm_col': 'screen_botm',
        'minimum_layer_thickness': model.cfg['wel'].get(
            'minimum_layer_thickness', 2.)
    }

    # master dataframe for stress period data
    columns = ['per', 'k', 'i', 'j', 'q', 'boundname']
    df = pd.DataFrame(columns=columns)

    # check for source data
    datasets = model.cfg['wel'].get('source_data')

    # delete the dropped wells file if it exists, to avoid confusion
    dropped_wells_file = model.cfg['wel']['output_files'][
        'dropped_wells_file'].format(model.name)
    if os.path.exists(dropped_wells_file):
        os.remove(dropped_wells_file)

    # get well package input from source (parent) model in lieu of source data
    # todo: fetching correct well package from mf6 parent model
    if datasets is None and model.cfg['parent'].get('default_source_data') \
        and hasattr(model.parent, 'wel'):

        # get well stress period data from mfnwt or mf6 model
        parent = model.parent
        spd = get_package_stress_period_data(parent, package_name='wel')
        # map the parent stress period data to inset stress periods
        periods = spd.groupby('per')
        dfs = []
        for inset_per, parent_per in model.parent_stress_periods.items():
            if parent_per in periods.groups:
                period = periods.get_group(parent_per)
                if len(dfs) > 0 and period.drop('per', axis=1).equals(
                        dfs[-1].drop('per', axis=1)):
                    continue
                else:
                    dfs.append(period)
        spd = pd.concat(dfs)

        parent_well_i = spd.i.copy()
        parent_well_j = spd.j.copy()
        parent_well_k = spd.k.copy()

        # set boundnames based on well locations in parent model
        parent_name = parent.name
        spd['boundname'] = [
            '{}_({},{},{})'.format(parent_name, pk, pi, pj)
            for pk, pi, pj in zip(parent_well_k, parent_well_i, parent_well_j)
        ]

        parent_well_x = parent.modelgrid.xcellcenters[parent_well_i,
                                                      parent_well_j]
        parent_well_y = parent.modelgrid.ycellcenters[parent_well_i,
                                                      parent_well_j]
        coords = project((parent_well_x, parent_well_y),
                         model.modelgrid.proj_str, parent.modelgrid.proj_str)
        geoms = [Point(x, y) for x, y in zip(*coords)]
        bounds = model.modelgrid.bbox
        within = [g.within(bounds) for g in geoms]
        i, j = get_ij(model.modelgrid, parent_well_x[within],
                      parent_well_y[within])
        spd = spd.loc[within].copy()
        spd['i'] = i
        spd['j'] = j
        df = df.append(spd)

    # read source data and map onto model space and time discretization
    # multiple types of source data can be submitted
    elif datasets is not None:
        for k, v in datasets.items():

            # determine the format
            if 'csvfile' in k.lower():  # generic csv
                #  read csv file and aggregate flow rates to model stress periods
                #  sum well fluxes co-located in a cell
                sd = TransientTabularSourceData.from_config(
                    v, resolve_duplicates_with='sum', dest_model=model)
                csvdata = sd.get_data()
                csvdata.rename(columns={
                    v['data_column']: 'q',
                    v['id_column']: 'boundname'
                },
                               inplace=True)
                if 'k' not in csvdata.columns:
                    if model.nlay > 1:
                        vfd = vfd_defaults.copy()
                        vfd.update(v.get('vertical_flux_distribution', {}))
                        csvdata = assign_layers_from_screen_top_botm(
                            csvdata, model, **vfd)
                    else:
                        csvdata['k'] = 0
                df = df.append(csvdata[columns])

            elif k.lower() == 'wells':  # generic dict
                added_wells = {k: v for k, v in v.items() if v is not None}
                if len(added_wells) > 0:
                    aw = pd.DataFrame(added_wells).T
                    aw['boundname'] = aw.index
                else:
                    aw = None
                if aw is not None:
                    if 'x' in aw.columns and 'y' in aw.columns:
                        aw['i'], aw['j'] = get_ij(model.modelgrid,
                                                  aw['x'].values,
                                                  aw['y'].values)
                    aw['per'] = aw['per'].astype(int)
                    aw['k'] = aw['k'].astype(int)
                    df = df.append(aw)

            elif k.lower() == 'wdnr_dataset':  # custom input format for WI DNR
                # Get steady-state pumping rates
                check_source_files([v['water_use'], v['water_use_points']])

                # fill out period stats
                period_stats = v['period_stats']
                if isinstance(period_stats, str):
                    period_stats = {
                        kper: period_stats
                        for kper in range(model.nper)
                    }

                # separate out stress periods with period mean statistics vs.
                # those to be resampled based on start/end dates
                resampled_periods = {
                    k: v
                    for k, v in period_stats.items() if v == 'resample'
                }
                periods_with_dataset_means = {
                    k: v
                    for k, v in period_stats.items()
                    if k not in resampled_periods
                }

                if len(periods_with_dataset_means) > 0:
                    wu_means = get_mean_pumping_rates(
                        v['water_use'],
                        v['water_use_points'],
                        period_stats=periods_with_dataset_means,
                        drop_ids=v.get('drop_ids'),
                        model=model)
                    df = df.append(wu_means)
                if len(resampled_periods) > 0:
                    wu_resampled = resample_pumping_rates(
                        v['water_use'],
                        v['water_use_points'],
                        drop_ids=v.get('drop_ids'),
                        exclude_steady_state=True,
                        model=model)
                    df = df.append(wu_resampled)

    # boundary fluxes from parent model
    if model.perimeter_bc_type == 'flux':
        assert model.parent is not None, "need parent model for TMR cut"

        # boundary fluxes
        kstpkper = [(0, 0)]
        tmr = Tmr(model.parent, model)

        # parent periods to copy over
        kstpkper = [(0, per)
                    for per in model.cfg['model']['parent_stress_periods']]
        bfluxes = tmr.get_inset_boundary_fluxes(kstpkper=kstpkper)
        bfluxes['boundname'] = 'boundary_flux'
        df = df.append(bfluxes)

    for col in ['per', 'k', 'i', 'j']:
        df[col] = df[col].astype(int)

    # drop any k, i, j locations that are inactive
    if model.version == 'mf6':
        inactive = model.dis.idomain.array[df.k.values, df.i.values,
                                           df.j.values] != 1
    else:
        inactive = model.bas6.ibound.array[df.k.values, df.i.values,
                                           df.j.values] != 1

    # record dropped wells in csv file
    # (which might contain wells dropped by other routines)
    if np.any(inactive):
        #inactive_i, inactive_j = df.loc[inactive, 'i'].values, df.loc[inactive, 'j'].values
        dropped = df.loc[inactive].copy()
        dropped = dropped.groupby(['k', 'i', 'j']).first().reset_index()
        dropped['reason'] = 'in inactive cell'
        dropped['routine'] = __name__ + '.setup_wel_data'
        append_csv(dropped_wells_file, dropped, index=False,
                   float_format='%g')  # append to existing file if it exists
    df = df.loc[~inactive].copy()

    # option to copy fluxes to subsequent stress periods (currently disabled)
    copy_fluxes = False
    if copy_fluxes and len(df) > 0:
        df = copy_fluxes_to_subsequent_periods(df)

    wel_lookup_file = model.cfg['wel']['output_files']['lookup_file'].format(
        model.name)
    wel_lookup_file = os.path.join(model._tables_path,
                                   os.path.split(wel_lookup_file)[1])
    model.cfg['wel']['output_files']['lookup_file'] = wel_lookup_file

    # verify that all wells have a boundname
    if df.boundname.isna().any():
        no_name = df.boundname.isna()
        k, i, j = df.loc[no_name, ['k', 'i', 'j']].T.values
        names = ['({},{},{})'.format(k, i, j) for k, i, j in zip(k, i, j)]
        df.loc[no_name, 'boundname'] = names
    assert not df.boundname.isna().any()

    # save a lookup file with well site numbers/categories
    df.sort_values(by=['boundname', 'per'], inplace=True)
    df[['per', 'k', 'i', 'j', 'q', 'boundname']].to_csv(wel_lookup_file,
                                                        index=False)

    # convert to one-based and comment out header if df will be written straight to external file
    if for_external_files:
        df.rename(columns={'k': '#k'}, inplace=True)
        df['#k'] += 1
        df['i'] += 1
        df['j'] += 1
    return df
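# A small sketch of the for_external_files handling at the end of
# setup_wel_data(): when the table will be written directly to an external
# MODFLOW file, the zero-based k, i, j indices are shifted to one-based and the
# layer column is renamed '#k', presumably so a header line beginning with that
# column reads as a comment in the external file. The values below are
# illustrative only.
import pandas as pd

spd = pd.DataFrame({'per': [0], 'k': [0], 'i': [10], 'j': [20],
                    'q': [-500.], 'boundname': ['well_a']})
spd.rename(columns={'k': '#k'}, inplace=True)
spd['#k'] += 1
spd['i'] += 1
spd['j'] += 1
print(spd[['#k', 'i', 'j', 'q', 'boundname']].to_csv(index=False))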
Example #21
0
def setup_structured_grid(xoff=None,
                          yoff=None,
                          xul=None,
                          yul=None,
                          nrow=None,
                          ncol=None,
                          nlay=None,
                          dxy=None,
                          delr=None,
                          delc=None,
                          top=None,
                          botm=None,
                          rotation=0.,
                          parent_model=None,
                          snap_to_NHG=False,
                          features=None,
                          features_shapefile=None,
                          id_column=None,
                          include_ids=None,
                          buffer=1000,
                          crs=None,
                          epsg=None,
                          model_length_units=None,
                          grid_file='grid.json',
                          bbox_shapefile=None,
                          **kwargs):
    """"""
    print('setting up model grid...')
    t0 = time.time()

    # conversions for model/parent model units to meters
    # set regular flag for handling delc/delr
    to_meters_inset = convert_length_units(model_length_units, 'meters')
    regular = True
    if dxy is not None:
        delr_m = np.round(dxy * to_meters_inset,
                          4)  # dxy is specified in model units
        delc_m = delr_m
    if delr is not None:
        delr_m = np.round(delr * to_meters_inset,
                          4)  # delr is specified in model units
        if not np.isscalar(delr_m):
            if len(set(delr_m)) == 1:
                delr_m = delr_m[0]
            else:
                regular = False
    if delc is not None:
        delc_m = np.round(delc * to_meters_inset,
                          4)  # delc is specified in model units
        if not np.isscalar(delc_m):
            if len(set(delc_m)) == 1:
                delc_m = delc_m[0]
            else:
                regular = False
    if parent_model is not None:
        to_meters_parent = convert_length_units(
            get_model_length_units(parent_model), 'meters')
        # parent model grid spacing in meters
        parent_delr_m = np.round(
            parent_model.dis.delr.array[0] * to_meters_parent, 4)
        if not parent_delr_m % delr_m == 0:
            raise ValueError(
                'inset delr spacing of {} must be a factor of parent spacing of {}'
                .format(delr_m, parent_delr_m))
        parent_delc_m = np.round(
            parent_model.dis.delc.array[0] * to_meters_parent, 4)
        if not parent_delc_m % delc_m == 0:
            raise ValueError(
                'inset delc spacing of {} must be a factor of parent spacing of {}'
                .format(delc_m, parent_delc_m))

    if epsg is not None:
        crs = pyproj.crs.CRS.from_epsg(epsg)
    elif crs is not None:
        from gisutils import get_authority_crs
        crs = get_authority_crs(crs)
    elif parent_model is not None:
        crs = parent_model.modelgrid.crs

    # option 1: make grid from xoff, yoff and specified dimensions
    if xoff is not None and yoff is not None:
        assert nrow is not None and ncol is not None, \
            "Need to specify nrow and ncol if specifying xoffset and yoffset."
        if regular:
            height_m = np.round(delc_m * nrow, 4)
            width_m = np.round(delr_m * ncol, 4)
        else:
            height_m = np.sum(delc_m)
            width_m = np.sum(delr_m)

        # optionally align grid with national hydrologic grid
        # grids snapping to the NHG must have spacings that are a factor of 1 km
        if snap_to_NHG:
            assert regular and np.allclose(1000 % delc_m, 0, atol=1e-4)
            x, y = get_point_on_national_hydrogeologic_grid(xoff,
                                                            yoff,
                                                            offset='edge',
                                                            op=np.floor)
            xoff = x
            yoff = y
            rotation = 0.

        # need to specify xul, yul in case snapping to parent
        # todo: allow snapping to parent grid on xoff, yoff
        if rotation != 0:
            raise NotImplementedError('Rotated grids not supported.')
        xul = xoff
        yul = yoff + height_m

    # option 2: make grid using buffered feature bounding box
    else:
        if features is None and features_shapefile is not None:
            # Make sure shapefile and bbox filter are in dest (model) CRS
            # TODO: CRS wrangling could be added to shp2df as a feature
            reproject_filter = False
            try:
                from gisutils import get_shapefile_crs
                features_crs = get_shapefile_crs(features_shapefile)
                if features_crs != crs:
                    reproject_filter = True
            except Exception:
                features_crs = get_proj_str(features_shapefile)
                reproject_filter = True
            filter = None
            if parent_model is not None:
                if reproject_filter:
                    filter = project(parent_model.modelgrid.bbox,
                                     parent_model.modelgrid.crs,
                                     features_crs).bounds
                else:
                    filter = parent_model.modelgrid.bbox.bounds
            shp2df_kwargs = {'dest_crs': crs}
            shp2df_kwargs = get_input_arguments(shp2df_kwargs, shp2df)
            df = shp2df(features_shapefile, filter=filter, **shp2df_kwargs)

            # optionally subset shapefile data to specified features
            if id_column is not None and include_ids is not None:
                df = df.loc[df[id_column].isin(include_ids)]
            # use all features by default
            features = df.geometry.tolist()

            # convert multiple features to a MultiPolygon
            if isinstance(features, list):
                if len(features) > 1:
                    features = MultiPolygon(features)
                else:
                    features = features[0]

            # size the grid based on the bbox for features
            x1, y1, x2, y2 = features.bounds
            L = buffer  # distance from area of interest to boundary
            xul = x1 - L
            yul = y2 + L
            height_m = np.round(yul - (y1 - L),
                                4)  # initial model height from buffer distance
            width_m = np.round((x2 + L) - xul, 4)
            rotation = 0.  # rotation not supported with this option

    # align model with parent grid if there is a parent model
    # (and not snapping to national hydrologic grid)
    if parent_model is not None and not snap_to_NHG:

        # get location of coinciding cell in parent model for upper left
        pi, pj = parent_model.modelgrid.intersect(xul, yul)
        verts = np.array(parent_model.modelgrid.get_cell_vertices(pi, pj))
        xul, yul = verts[:, 0].min(), verts[:, 1].max()

        # adjust the dimensions to align remaining corners
        def roundup(number, increment):
            return int(np.ceil(number / increment) * increment)

        # the grid height spans the rows (delc); the width spans the columns (delr)
        height = roundup(height_m, parent_delc_m)
        width = roundup(width_m, parent_delr_m)

        # update nrow, ncol after snapping to parent grid
        if regular:
            nrow = int(height / delc_m)  # height and width are in meters
            ncol = int(width / delr_m)

    # set the grid configuration dictionary
    # spacing is in meters (consistent with projected CRS)
    # (modelgrid object will be updated automatically from this dictionary)
    #if rotation == 0.:
    #    xll = xul
    #    yll = yul - model.height
    grid_cfg = {
        'nrow': int(nrow),
        'ncol': int(ncol),
        'nlay': nlay,
        'delr': delr_m,
        'delc': delc_m,
        'xoff': xoff,
        'yoff': yoff,
        'xul': xul,
        'yul': yul,
        'rotation': rotation,
        'lenuni': 2
    }
    if regular:
        grid_cfg['delr'] = np.ones(grid_cfg['ncol'],
                                   dtype=float) * grid_cfg['delr']
        grid_cfg['delc'] = np.ones(grid_cfg['nrow'],
                                   dtype=float) * grid_cfg['delc']
    grid_cfg['delr'] = grid_cfg['delr'].tolist()  # for serializing to json
    grid_cfg['delc'] = grid_cfg['delc'].tolist()

    # renames for flopy modelgrid
    renames = {'rotation': 'angrot'}
    for k, v in renames.items():
        if k in grid_cfg:
            grid_cfg[v] = grid_cfg.pop(k)

    # add epsg or wkt if there isn't an epsg
    if epsg is not None:
        grid_cfg['epsg'] = epsg
    elif crs is not None:
        if 'epsg' in crs.srs.lower():
            grid_cfg['epsg'] = int(crs.srs.split(':')[1])
        else:
            grid_cfg['wkt'] = crs.srs
    else:
        warnings.warn('No coordinate system reference provided for model grid! '
                      'Model input data may not be mapped correctly.')

    # set up the model grid instance
    grid_cfg['top'] = top
    grid_cfg['botm'] = botm
    grid_cfg.update(kwargs)  # update with any kwargs from function call
    kwargs = get_input_arguments(grid_cfg, MFsetupGrid)
    modelgrid = MFsetupGrid(**kwargs)
    modelgrid.cfg = grid_cfg

    # write grid info to json, and shapefile of bbox
    # omit top and botm arrays from the json representation of the grid
    # (the json file describes just the horizontal discretization)
    del grid_cfg['top']
    del grid_cfg['botm']

    fileio.dump(grid_file, grid_cfg)
    if bbox_shapefile is not None:
        write_bbox_shapefile(modelgrid, bbox_shapefile)
    print("finished in {:.2f}s\n".format(time.time() - t0))
    return modelgrid
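# A minimal usage sketch of setup_structured_grid(), using option 1 (build the
# grid from a lower-left origin and specified dimensions). The coordinates,
# EPSG code and spacing are hypothetical; top and botm elevations could be
# supplied through the corresponding keyword arguments.
modelgrid = setup_structured_grid(xoff=500000., yoff=1200000.,    # lower-left corner
                                  nrow=100, ncol=120, nlay=3,
                                  dxy=250.,                       # cell size in model units
                                  rotation=0.,
                                  epsg=5070,                      # hypothetical CRS
                                  model_length_units='meters',
                                  grid_file='grid.json')
print(modelgrid.nrow, modelgrid.ncol, modelgrid.delr[0])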
Example #22
0
def extent_poly():
    extent_poly = box(390000, 1330000, 500000, 1455000)
    extent_poly_ll = project(extent_poly, 'epsg:5070', 'epsg:4269')
    return extent_poly_ll
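# Follow-up sketch: the fixture above returns the model extent reprojected to
# geographic coordinates (NAD83), so its bounds give approximate lon/lat limits
# (assuming gisutils.project returns x=longitude, y=latitude ordering).
extent_ll = extent_poly()
west, south, east, north = extent_ll.bounds
print('longitude range: {:.3f} to {:.3f}'.format(west, east))
print('latitude range: {:.3f} to {:.3f}'.format(south, north))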
Example #23
0
def preprocess_te_wateruse(data,
                           start_date=None,
                           end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269,
                           dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate to
      begin using the 2010 rates then (``start_date='2008'``). If no start or end
      date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        By default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area.
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame
        Table of withdrawal rates reindexed to continuous monthly values, with
        columns 'site_no', 'start_datetime', 'x', 'y', 'screen_top',
        'screen_botm', 'q' (volumes in model length units cubed, per day;
        negative values indicate withdrawals) and 'geometry'
        (shapely Points in `dest_crs`).

    Notes
    -----
    * time units for TE data and model are assumed to be days

    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull data to sites within the area of interest (active_area)
    if active_area is not None:
        df = cull_data_to_active_area(df,
                                      active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(
            estimated_production_surface_units, model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
                                              x,
                                              y,
                                              points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
                                               x,
                                               y,
                                               points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = [
        'site_no', 'start_datetime', 'x', 'y', 'screen_top', 'screen_botm',
        'q', 'geometry'
    ]
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile,
                                                   index=False,
                                                   float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no',
                                           'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
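# A hedged usage sketch for preprocess_te_wateruse(). The input DataFrame and
# raster/output file paths are hypothetical placeholders; the required input
# columns follow the table in the docstring above.
import pandas as pd

te_data = pd.DataFrame({
    'site_no': ['plant_1', 'plant_1'],
    'start_datetime': pd.to_datetime(['2010', '2015']),
    'x': [-90.2, -90.2],      # longitude, in source_crs (epsg:4269 by default)
    'y': [35.1, 35.1],        # latitude
    'q': [2.5, 3.0],          # withdrawals, in mgal/day by default
})
df_monthly = preprocess_te_wateruse(
    te_data,
    start_date='2008-01-01', end_date='2017-12-31',
    estimated_production_zone_top='prod_zone_top.tif',    # hypothetical raster
    estimated_production_zone_botm='prod_zone_botm.tif',  # hypothetical raster
    data_volume_units='mgal', model_length_units='meters',
    outfile='processed_te_wateruse.csv')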
Example #24
0
def preprocess_iwum_pumping(ncfile,
                            start_date=None,
                            end_date=None,
                            active_area=None,
                            active_area_id_column=None,
                            active_area_feature_id=None,
                            estimated_production_zone_top=None,
                            estimated_production_zone_botm=None,
                            flux_variable='value',
                            nc_crs=5070,
                            dest_crs=5070,
                            nc_length_units='meters',
                            estimated_production_surface_units='meters',
                            model_length_units='meters',
                            outfile=None):
    """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output and
    assign open interval information, using raster surfaces of the
    top and bottom of an estimated production zone.

    Parameters
    ----------
    ncfile : file path
        NetCDF output from Irrigation Water Use Model
    start_date : str
        Cull data before this date.
    end_date : str
        Cull data after this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        By default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area.
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    flux_variable : str
        Variable in ncfile for pumping fluxes. Fluxes are assumed to
        represent total volumes for each time period.
    nc_crs : obj
        Coordinate Reference System (CRS) of ncfile.
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to the pyproj.crs.from_user_input
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    nc_length_units : str, {'meters', 'ft', etc.}
        Length units of pumped volumes in ncfile
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    model_length_units : str, {'meters', 'ft', etc.}
        Length units of model.
    outfile : str, optional
        Path for an output CSV file of the pumping table. By default, None.

    Returns
    -------
    df : DataFrame
        Table of pumping rates in m3/day, location
        and open interval information.

        Columns:

        ============== ================================================
        site_no        index position of pumping rate in ncfile grid
        x              x-coordinate in `dest_crs`
        y              y-coordinate in `dest_crs`
        start_datetime start date of pumping period
        end_datetime   end date of pumping period
        screen_top     screen top elevation, in `model_length_units`
        screen_botm    screen bottom elevation, in `model_length_units`
        q              pumping rate, in model units
        geometry       shapely Point object representing location
        ============== ================================================

    Notes
    -----
    * Time units are assumed to be days.
    * Fluxes are assumed to represent total volumes for each time period
      indicated by the differences between successive values along the time axis of ncfile.
    """
    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    ds_x, ds_y = np.meshgrid(ds['x'], ds['y'])

    # original values are in m3, in each 1 mi2 cell
    # can leave in m3 if reassigning to 1km grid as point values
    length_conversion = convert_volume_units(nc_length_units,
                                             model_length_units) ** 3
    unit_suffix = vol_suffix[model_length_units] + 'd'
    flux_col = 'q'  # 'flux_{}'.format(unit_suffix)  # output field name for fluxes

    # get top/botm elevations
    est_screen_top = None
    est_screen_botm = None
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y,
                                                points_crs=nc_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y,
                                                 points_crs=nc_crs)
        est_screen_botm *= surf_unit_conversion

        # in any places where screen top is less than the screen botm,
        # set both at the mean
        loc = est_screen_top < est_screen_botm
        means = np.mean([est_screen_top, est_screen_botm], axis=0)
        est_screen_top[loc] = means[loc]
        est_screen_botm[loc] = means[loc]
        print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} '
              f'locations where screen top was < screen bottom')

    dfs = []
    times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values)
    for n, period_start_date in enumerate(times):

        # for each time entry, get the data
        kwargs = {time_variable: period_start_date}
        arr = ds[flux_variable].sel(**kwargs).values

        # make sure the pumping sign is negative,
        # based on the assumption that values are mostly abstraction
        if arr.sum() > 0:
            arr *= -1

        # set up a dataframe
        data = {'site_no': np.arange(ds_x.size),
                'x': ds_x.ravel(),
                'y': ds_y.ravel(),
                 }
        if est_screen_top is not None and est_screen_botm is not None:
            data.update({'screen_top': est_screen_top.ravel(),
                         'screen_botm': est_screen_botm.ravel()
                         }
                        )
        df = pd.DataFrame(data)
        df['start_datetime'] = period_start_date

        # get the end_date, handling last entry
        if n + 1 < len(times):
            period_end_date = times[n + 1]
        else:
            # set end date for the last period based on the previous period length
            last_start = dfs[-1]['start_datetime'].values[0]
            ndays = (pd.Timestamp(period_start_date) -
                     pd.Timestamp(last_start)).days
            period_end_date = period_start_date + pd.Timedelta(ndays, unit='d')

        # convert the time units
        ndays = (pd.Timestamp(period_end_date) -
                 pd.Timestamp(period_start_date)).days
        assert ndays > 0, "period_end_date {} is before period_start_date {}"\
            .format(period_end_date, period_start_date)
        time_conversion = 1 / ndays  # original quantities are volumes for the time period

        # time indexing in pandas is through last value
        period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d')
        df['end_datetime'] = period_end_date
        df[flux_col] = arr.ravel() * length_conversion * time_conversion

        # only keep cells with nonzero pumping (negative fluxes)
        df = df.loc[df[flux_col] < 0]

        dfs.append(df)
    df = pd.concat(dfs)

    # site number column (that would be unique from other integers from other data sources)
    df['site_no'] = [f'iwum_{node}' for node in df.site_no]

    # project the data to a destination crs, if provided
    # make a separate metadata dataframe with 1 row per location
    # to avoid redundant operations
    metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']]
    metadata.index = metadata['site_no']
    x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, dest_crs)
    metadata['x'], metadata['y'] = x_pr, y_pr
    metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull the data to the model area, if provided
    if active_area is not None:
        df, metadata = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs, metadata=metadata)

    # update data with x,y values projected in metadata
    x = dict(zip(metadata.site_no, metadata.x))
    y = dict(zip(metadata.site_no, metadata.y))
    df['x'] = [x[sn] for sn in df.site_no]
    df['y'] = [y[sn] for sn in df.site_no]
    if outfile is not None:
        outfile = Path(outfile)
        df.to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # Make a plot of iwum output in mgal/day
        out_pdf_path = outfile.parent / 'plots'
        out_pdf_path.mkdir(exist_ok=True)
        plot_iwum_output(ncfile, flux_variable=flux_variable, outpath=out_pdf_path)

    return df
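# A hedged usage sketch for preprocess_iwum_pumping(). The NetCDF and raster
# file names are hypothetical placeholders; the units and CRS arguments echo
# the defaults documented above.
df = preprocess_iwum_pumping(
    'iwum_output.nc',                                     # hypothetical IWUM NetCDF
    start_date='2011-01-01', end_date='2017-12-31',
    estimated_production_zone_top='prod_zone_top.tif',    # hypothetical raster
    estimated_production_zone_botm='prod_zone_botm.tif',  # hypothetical raster
    nc_length_units='meters', model_length_units='meters',
    outfile='iwum_pumping.csv')
# one row per IWUM grid cell and stress period, with negative q for withdrawals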