Example #1
0
def xy_to_gpd(id_col, x_col, y_col, df=None, crs=2193):
    """
    Function to convert a DataFrame with x and y coordinates to a GeoDataFrame.

    Parameters
    ----------
    id_col: str or list of str
        The column(s) from the DataFrame to be returned. Either a single column name or a list of column names.
    x_col: str or ndarray
        Either the column name that has the x values within df or an array of x values.
    y_col: str or ndarray
        Same as x_col except for y.
    df: DataFrame or None
        The DataFrame with the location data. Required when id_col, x_col, and y_col are column names.
    crs: int, str, or dict
        The projection of the data.

    Returns
    -------
    GeoDataFrame
        Of points.
    """

    if isinstance(x_col, str):
        geometry = [Point(xy) for xy in zip(df[x_col], df[y_col])]
    else:
        x1 = select_sites(x_col)
        y1 = select_sites(y_col)
        geometry = [Point(xy) for xy in zip(x1, y1)]
    if isinstance(id_col, str) and (df is not None):
        id_data = df[id_col]
    elif isinstance(id_col, list):
        if df is not None:
            id_data = df[id_col]
        else:
            id_data = id_col
    elif isinstance(id_col, (np.ndarray, pd.Series, pd.Index)):
        id_data = id_col
    else:
        raise ValueError('id_data could not be determined')
    if isinstance(crs, int):
        crs1 = convert_crs(crs)
    elif isinstance(crs, (str, dict)):
        crs1 = crs
    else:
        raise ValueError('crs must be an int, str, or dict')
    gpd1 = gpd.GeoDataFrame(id_data, geometry=geometry, crs=crs1)
    return gpd1
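# Usage sketch (not part of the original source). Assumes the module-level
# imports used by xy_to_gpd (pandas, geopandas, shapely's Point, select_sites,
# convert_crs) are available; the site names and NZTM coordinates are illustrative.
import pandas as pd

sites_df = pd.DataFrame({'site': ['SQ30001', 'SQ30002'],
                         'NZTMX': [1570100.0, 1571500.0],
                         'NZTMY': [5180200.0, 5181300.0]})
sites_gdf = xy_to_gpd('site', 'NZTMX', 'NZTMY', df=sites_df, crs=2193)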
Example #2
0
def hydstra_site_mod_time(sites=None):
    """
    Function to extract modification times from Hydstra data archive files. Returns a DataFrame of sites by modification date. The modification date is in GMT.

    Parameters
    ----------
    sites : list, array, Series, or None
        If sites is not None, then return only the given sites.

    Returns
    -------
    DataFrame
    """

    site_files_path = r'\\fileservices02\ManagedShares\Data\Hydstra\prod\hyd\dat\hyd'
    files1 = rd_dir(site_files_path, 'A')
    file_sites = [os.path.splitext(i)[0] for i in files1]

    if sites is not None:
        sites1 = select_sites(sites).astype(str)
        sites2 = [i.replace('/', '_') for i in sites1]
        file_sites1 = [i for i in file_sites if i in sites2]
    else:
        file_sites1 = file_sites

    mod_times = pd.to_datetime([round(os.path.getmtime(os.path.join(site_files_path, i + '.A'))) for i in file_sites1], unit='s')

    df = pd.DataFrame({'site': file_sites1, 'mod_time': mod_times})
    return df
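# Usage sketch (not part of the original source). The site numbers are
# illustrative; the function reads the Hydstra archive path hard-coded above.
mod_df = hydstra_site_mod_time(sites=[69607, 70105])
recently_changed = mod_df[mod_df.mod_time > '2017-01-01']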
Example #3
0
def rd_blocklist(sites, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'], start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01'):
    """
    Wrapper function to extract info about when data has changed between modification dates.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    datasources : list of str
        Hydstra datasource codes (usually ['A']).
    variables : list of str
        The Hydstra variable codes (e.g. '140' is flow).
    start : str
        The start time in the format of '2001-01-01'.
    end : str
        Same formatting as start.
    start_modified: str
        The starting date of the modification.
    end_modified: str
        The ending date of the modification.

    Returns
    -------
    DataFrame
        With site, data_source, varto, from_mod_date, and to_mod_date.
    """
    ### Process sites
    sites1 = select_sites(sites).tolist()

    ### Open connection
    hyd = openHyDb()
    with hyd as h:
        df = h.get_ts_blockinfo(sites1, start=start, end=end, datasources=datasources, variables=variables, start_modified=start_modified, end_modified=end_modified)
    return df
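# Usage sketch (not part of the original source). Requires the hydllp
# connection (openHyDb); the site numbers and dates are illustrative.
blocks = rd_blocklist([69607, 70105], variables=['140'],
                      start_modified='2017-01-01', end_modified='2017-02-01')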
Example #4
0
    def get_ts_traces(self, site_list, start=0, end=0, varfrom=100, varto=140, interval='day', multiplier=1, datasource='A', data_type='mean', qual_codes=[30, 20, 10, 11, 21, 18], report_time=None):
        """

        """

        # Convert the site list to a comma delimited string of sites
        sites = select_sites(site_list).astype(str)
        site_list_str = ','.join([str(site) for site in sites])

        ### Datetime conversion - with dates < 1900
        c1900 = pd.Timestamp('1900-01-01')
        if start != 0:
            start1 = pd.Timestamp(start)
            if start1 > c1900:
                start = start1.strftime('%Y%m%d%H%M%S')
            else:
                start = start1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')
        if end != 0:
            end1 = pd.Timestamp(end)
            if end1 > c1900:
                end = end1.strftime('%Y%m%d%H%M%S')
            else:
                end = end1.isoformat(' ').replace('-', '').replace(' ', '').replace(':', '')

        ts_traces_request = {'function': 'get_ts_traces',
                             'version': 2,
                             'params': {'site_list': site_list_str,
                                        'start_time': start,
                                        'end_time': end,
                                        'varfrom': varfrom,
                                        'varto': varto,
                                        'interval': interval,
                                        'datasource': datasource,
                                        'data_type': data_type,
                                        'multiplier': multiplier,
                                        'report_time': report_time}}

        ts_traces_request = self.query_by_dict(ts_traces_request)
        j1 = ts_traces_request['return']['traces']

        ### Convert json to a dataframe
        sites = [str(f['site']) for f in j1]

        out1 = pd.DataFrame()
        for i in range(len(j1)):
            df1 = pd.DataFrame(j1[i]['trace'])
            if not df1.empty:
                df1.rename(columns={'v': 'data', 't': 'time', 'q': 'qual_code'}, inplace=True)
                df1['data'] = pd.to_numeric(df1['data'], errors='coerce')
                df1['time'] = pd.to_datetime(df1['time'], format='%Y%m%d%H%M%S')
                df1['qual_code'] = pd.to_numeric(df1['qual_code'], errors='coerce', downcast='integer')
                df1['site'] = sites[i]
                df2 = df1[df1.qual_code.isin(qual_codes)]
                out1 = pd.concat([out1, df2])

        out2 = out1.set_index(['site', 'time'])[['data', 'qual_code']]

        return out2
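# Usage sketch (not part of the original source). As in the wrapper functions
# elsewhere in the module, the method is called on an open hydllp connection;
# the site number and dates are illustrative.
with openHyDb() as h:
    traces = h.get_ts_traces([69607], start='2015-01-01', end='2015-12-31',
                             varfrom=100, varto=140, interval='day')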
Example #5
0
def rd_hydstra_db(sites, start=0, end=0, datasource='A', data_type='mean', varfrom=100, varto=140, interval='day', multiplier=1, qual_codes=[30, 20, 10, 11, 21, 18], report_time=None, sites_chunk=20, print_sites=False, export_path=None):
    """
    Wrapper function over hydllp to read in data from Hydstra's database. Must be run in a 32bit python. If either start_time or end_time is not 0, then they both need a date.

    Parameters
    ----------
    sites : list, array, one column csv file, or dataframe
        Site numbers.
    start : str or int of 0
        The start time in the format of either '2001-01-01' or 0 (for all data).
    end : str or int of 0
        Same formatting as start.
    datasource : str
        Hydstra datasource code (usually 'A').
    data_type : str
        mean, maxmin, max, min, start, end, first, last, tot, point, partialtot, or cum.
    varfrom : int or float
        The hydstra source data variable (100.00 is water level).
    varto : int or float
        The hydstra conversion data variable (140.00 is flow).
    interval : str
        The frequency of the output data (year, month, day, hour, minute, second, period). If data_type is 'point', then interval cannot be 'period' (use anything else, it doesn't matter).
    multiplier : int
        The multiplier applied to the interval (e.g. interval='day' with multiplier=7 gives weekly values).
    qual_codes : list of int
        The quality codes in Hydstra for filtering the data.
    report_time : str or None
        Passed through to the hydllp get_ts_traces request.
    sites_chunk : int
        Number of sites to request from hydllp at one time. Do not change unless you understand what it does.
    print_sites : bool
        Should each chunk of sites be printed as it is processed?
    export_path : str or None
        Path to save the output via save_df, or None to skip saving.

    Returns
    -------
    DataFrame
        In long format with site and time as a MultiIndex.
    """

    ### Process sites into workable chunks
    sites1 = select_sites(sites)
    n_chunks = int(np.ceil(len(sites1) / float(sites_chunk)))
    sites2 = np.array_split(sites1, n_chunks)

    ### Run instance of hydllp
    data = pd.DataFrame()
    for i in sites2:
        if print_sites:
            print(i)
        ### Open connection
        hyd = openHyDb()
        with hyd as h:
            df = h.get_ts_traces(i, start=start, end=end, datasource=datasource, data_type=data_type, varfrom=varfrom, varto=varto, interval=interval, multiplier=multiplier, qual_codes=qual_codes, report_time=report_time)
        data = pd.concat([data, df])

    if isinstance(export_path, str):
        save_df(data, export_path)

    return data
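# Usage sketch (not part of the original source). Must be run in 32-bit Python
# with the hydllp connection available; the site numbers and export path are
# illustrative.
flow = rd_hydstra_db([69607, 70105], start='2010-01-01', end='2015-12-31',
                     varfrom=100, varto=140, interval='day',
                     export_path='hydstra_flow.csv')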
Example #6
0
    def get_ts_blockinfo(self, site_list, datasources=['A'], variables=['100', '10', '110', '140', '130', '143', '450'], start='1900-01-01', end='2100-01-01', start_modified='1900-01-01', end_modified='2100-01-01', fill_gaps=0, auditinfo=0):
        """

        """

        # Convert the site list to a comma delimited string of sites
        sites = select_sites(site_list).astype(str)
        site_list_str = ','.join([str(site) for site in sites])

        ### Datetime conversion
        start = pd.Timestamp(start).strftime('%Y%m%d%H%M%S')
        end = pd.Timestamp(end).strftime('%Y%m%d%H%M%S')
        start_modified = pd.Timestamp(start_modified).strftime('%Y%m%d%H%M%S')
        end_modified = pd.Timestamp(end_modified).strftime('%Y%m%d%H%M%S')

        ### dict request
        ts_blockinfo_request = {"function": "get_ts_blockinfo",
                                "version": 2,
                                "params": {'site_list': site_list_str,
                                           'datasources': datasources,
                                           'variables': variables,
                                           'starttime': start,
                                           'endtime': end,
                                           'start_modified': start_modified,
                                           'end_modified': end_modified
                                           }}

        ts_blockinfo_result = self.query_by_dict(ts_blockinfo_request)
        blocks = ts_blockinfo_result['return']['blocks']
        df1 = pd.DataFrame(blocks)
        if df1.empty:
            return df1
        else:
            df1['endtime'] = pd.to_datetime(df1['endtime'], format='%Y%m%d%H%M%S')
            df1['starttime'] = pd.to_datetime(df1['starttime'], format='%Y%m%d%H%M%S')
            df1['variable'] = pd.to_numeric(df1['variable'], errors='coerce', downcast='integer')
            df2 = df1[['site', 'datasource', 'variable', 'starttime', 'endtime']].sort_values(['site', 'variable', 'starttime'])
            df2.rename(columns={'datasource': 'data_source', 'variable': 'varto', 'starttime': 'from_mod_date', 'endtime': 'to_mod_date'}, inplace=True)

            return df2
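# Usage sketch (not part of the original source). Mirrors the rd_blocklist
# wrapper above; the site number and dates are illustrative.
with openHyDb() as h:
    blocks = h.get_ts_blockinfo([69607], variables=['140'],
                                start_modified='2017-01-01',
                                end_modified='2017-02-01')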
Example #7
0
def crc_band_flow(site_lst=None, crc_lst=None, names=False):
    """
    Function to determine the min flow conditions for each flow site, band, and crc.
    """

    ### Database parameters
    # crc, sites, and bands

    server = 'SQL2012PROD03'
    database = 'LowFlows'

    crc_table = 'vLowFlowConsents2'

    # id and gauge site

    gauge_table = 'LowFlowSite'

    # Internal site id, band, and min flow

    min_flow_table = 'LowFlowSiteBandPeriodAllocation'

    ## fields and associated column names
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    crc_names = ['id', 'band', 'crc']

    if names:
        gauge_fields = ['SiteID', 'RefDBaseKey', 'Waterway', 'Location']
        gauge_names = ['id', 'site', 'Waterway', 'Location']
    else:
        gauge_fields = ['SiteID', 'RefDBaseKey']
        gauge_names = ['id', 'site']

    min_flow_fields = ['SiteID', 'BandNo', 'PeriodNo', 'Allocation', 'Flow']
    min_flow_names = ['id', 'band', 'mon', 'allo', 'min_flow']

    ### Load in data

    crc = rd_sql(server, database, crc_table, crc_fields)
    # Rename the columns first so the 'crc' column exists before stripping whitespace
    crc.columns = crc_names
    crc['crc'] = crc['crc'].str.strip()

    gauge = rd_sql(server, database, gauge_table, gauge_fields)
    gauge.columns = gauge_names

    min_flow = rd_sql(server, database, min_flow_table, min_flow_fields)
    min_flow.columns = min_flow_names

    ### Remove min flows that are not restricted
    min_flow1 = min_flow[min_flow.allo < 100]

    ### Lots of table merges!
    crc_min_flow = pd.merge(crc, min_flow1, on=['id', 'band'])
    crc_min_gauge = pd.merge(gauge, crc_min_flow, on='id').drop('id', axis=1)

    ### Query results
    if crc_lst is not None:
        crc_sel = select_sites(crc_lst)
        sel1 = crc_min_gauge[np.in1d(crc_min_gauge.crc, crc_sel)]
    else:
        sel1 = crc_min_gauge
    if site_lst is not None:
        site_sel = select_sites(site_lst).astype(str)
        sel2 = sel1[np.in1d(sel1.site, site_sel)]
    else:
        sel2 = sel1

    return sel2
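# Usage sketch (not part of the original source). Requires access to the
# LowFlows database via rd_sql; the site number is illustrative.
min_flow_conds = crc_band_flow(site_lst=[69607], names=True)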
Example #8
0
def flow_ros(select=all, start_date='1900-01-01', end_date='2016-06-30', fill_na=False, flow_csv='S:/Surface Water/shared/base_data/flow/flow_data.csv', min_flow_cond_csv='S:/Surface Water/shared/base_data/usage/restrictions/min_flow_cond.csv', min_flow_id_csv='S:/Surface Water/shared/base_data/usage/restrictions/min_flow_id.csv', min_flow_mon_csv='S:/Surface Water/shared/base_data/usage/restrictions/mon_min_flow.csv', min_flow_restr_csv='S:/Surface Water/shared/base_data/usage/restrictions/min_flow_restr.csv'):
    """
    Function to estimate the percent allowable abstraction per band_id.

    Arguments:\n
    select -- Either a list, array, dataframe, or signle column csv file of site numbers.\n
    start_date / end_date -- The start and/or end date for the results as a string.\n
    *_csv -- csv files necessary for the analysis.
    """

    def norm_eval(series):
        if series['lower'] == '0':
            lower1 = '-1'
        else:
            lower1 = series['lower']
        stmt = '(' + series['object'] + '[' + str(int(series['site'])) + ']' + ' <= ' + series['upper'] + ')' + ' & ' + '(' + series['object'] + '[' + str(int(series['site'])) + ']' + ' > ' + lower1 + ')'
        return stmt

    def stmt_set(norm_conds, other_conds):
        if (len(norm_conds) > 0) & (len(other_conds) > 0):
            max1 = norm_conds.loc[norm_conds.index[-1], 'upper']
            new1 = norm_conds.iloc[0, :]
            new1.loc['upper'] = '100000'
            new1.loc['lower'] = max1
            new1.loc['cond_id'] = 0
            norm_conds.loc['a', :] = new1
            stmt = [norm_eval(norm_conds.loc[x,:]) for x in norm_conds.index]
            other_stmt = other_conds.other.tolist()
            stmt.extend(other_stmt)
            ids = norm_conds.cond_id.tolist()
            ids.extend(other_conds.cond_id.tolist())
        elif (len(norm_conds) > 0):
            max1 = norm_conds.loc[norm_conds.index[-1], 'upper']
            new1 = norm_conds.iloc[0, :]
            new1.loc['upper'] = '100000'
            new1.loc['lower'] = max1
            new1.loc['cond_id'] = 0
            norm_conds.loc['a', :] = new1
            stmt = [norm_eval(norm_conds.loc[x,:]) for x in norm_conds.index]
            ids = norm_conds.cond_id.tolist()
        elif (len(other_conds) > 0):
            stmt = other_conds.other.tolist()
            ids = other_conds.cond_id.tolist()
        return [stmt, ids]

    def pro_rata(flow, lower, upper):
        perc = (flow - lower) * 100 / (upper - lower)
        perc[perc < 0] = 0
        return perc

    ### Read in data tables
    min_flow_cond = pd.read_csv(min_flow_cond_csv).dropna(how='all')
    min_flow_id = pd.read_csv(min_flow_id_csv).dropna(how='all')
    min_flow_mon = pd.read_csv(min_flow_mon_csv).dropna(how='all')
    min_flow_restr = pd.read_csv(min_flow_restr_csv).dropna(how='all')
    if isinstance(flow_csv, str):
        flow1 = pd.read_csv(flow_csv)
        flow1.loc[:, 'time'] = pd.to_datetime(flow1.loc[:, 'time'])
        flow = flow1.pivot_table('data', 'time', 'site')
    else:
        flow = flow_csv
    flow.columns = flow.columns.astype('int32')

    ### Select specific site bands
    if select is not all:
        bands1 = select_sites(select).astype(str)
        min_flow_id = min_flow_id[np.in1d(min_flow_id.site.astype(str), bands1)]

    ### Add in additional data from hydrotel if needed
    if sum(min_flow_id.site == 69607) > 0:
        hydrotel_flow_sites = [696501]
        hydrotel_wl_sites = [69660]
        opuha_flow = rd_hydrotel(hydrotel_flow_sites, mtype='flow_tel', resample='day', fun='avg', pivot=True).value
        opuha_flow.columns = opuha_flow.columns.astype(int)
        wl = rd_hydrotel(hydrotel_wl_sites, mtype='swl_tel', resample='day', fun='avg', pivot=True).value
        wl.columns = wl.columns.astype(int)

        UF = (1.288 * flow[69615] + 0.673 * flow[69616] + 2.438 * flow[69618] - 2.415)
        UF.name = 1696297

        flow = pd.concat([flow, opuha_flow, UF], axis=1)
        flow.columns = flow.columns.astype(int)

    ### Create monthly time series of flow restrictions
    mon_series1 = pd.DataFrame(flow.index.month, index=flow.index, columns=['mon'])
    mon_series = pd.merge(mon_series1, min_flow_mon, on='mon', how='left')
    mon_series.index = mon_series1.index

    ### Run through each band
    ## Create blank dataframe
    c1 = min_flow_id.site.tolist()
    c2 = min_flow_id.allo_band_id.tolist()

    index1 = pd.MultiIndex.from_tuples(list(zip(*[c1, c2])))
    if sum(min_flow_id.site == 69607) > 0:
        eval_dict = {'flow': flow, 'wl': wl, 'mon_series': mon_series}
    else:
        eval_dict = {'flow': flow, 'mon_series': mon_series}

    allow1 = pd.DataFrame(np.nan, index=flow.index, columns=index1)

    for j in min_flow_id.index:
        site_id = min_flow_id.site[j]
        band_id = min_flow_id.allo_band_id[j]
        t1 = min_flow_id.loc[j, :]
        cond_id = literal_eval(t1['cond_id'])
        cond_id.extend([0])
        restr_id = literal_eval(t1['restr_id'])
        restr_id.extend(['r100'])
        cond_restr = dict(zip(cond_id, restr_id))

        conds1 = min_flow_cond[np.in1d(min_flow_cond.cond_id, cond_id)]
        norm_conds = conds1[conds1.object != 'other']
        other_conds = conds1[conds1.object == 'other']

        stmt, ids = stmt_set(norm_conds, other_conds)

        df1 = pd.concat((eval(x, globals(), eval_dict) for x in stmt), axis=1)
        df1.columns = ids
        df2 = df1.copy()
        df2.loc[:, :] = np.nan

        perc_restr = {}
        for x in cond_restr:
            if cond_restr[x] != 'pro_rata':
                perc_restr.update({x: eval(min_flow_restr.loc[min_flow_restr.restr_id == cond_restr[x], 'restr_cond'].values[0], globals(), eval_dict)})
            else:
                seta = norm_conds.loc[norm_conds.cond_id == x,:]
                pr1 = pro_rata(flow[int(seta.site)], float(seta.lower), float(seta.upper))
                perc_restr.update({x: pr1})

        for i in perc_restr:
            index = df1[i].dropna().index[np.where(df1[i].dropna())[0]]
            if isinstance(perc_restr[i], pd.Series):
                df2.loc[index, i] = perc_restr[i][index]
            else:
                df2.loc[index, i] = perc_restr[i]

        ## Take the most restrictive between the conditions
        df3 = df2.min(axis=1)

        ### Process exemptions
        if t1['exempt_id'] is not np.nan:
            exempt_id = literal_eval(t1['exempt_id'])
            exempt_restr_id = literal_eval(t1['exempt_restr_id'])
            exempt_cond_restr = dict(zip(exempt_id, exempt_restr_id))

            conds1 = min_flow_cond[np.in1d(min_flow_cond.cond_id, exempt_id)]
            norm_conds = conds1[conds1.object != 'other']
            other_conds = conds1[conds1.object == 'other']

            stmt, ids = stmt_set(norm_conds, other_conds)

            df1 = pd.concat((eval(x, globals(), eval_dict) for x in stmt), axis=1)
            df1.columns = ids
            df2 = df1.copy()
            df2.loc[:, :] = np.nan

            perc_restr = {x: eval(min_flow_restr.loc[min_flow_restr.restr_id == exempt_cond_restr[x], 'restr_cond'].values[0], globals(), eval_dict) for x in exempt_cond_restr}

            for i in perc_restr:
                index = df1[i].dropna().index[np.where(df1[i].dropna())[0]]
                if isinstance(perc_restr[i], pd.Series):
                    df2.loc[index, i] = perc_restr[i][index]
                else:
                    df2.loc[index, i] = perc_restr[i]

            ## Take the most restrictive for the exemptions
            df3_exempt = df2.min(axis=1)

            ### Take the least restrictive between the primary conditions and the exemptions
            allow1.loc[:, (site_id, band_id)] = pd.concat([df3, df3_exempt], axis=1).max(axis=1)

        else:
            allow1.loc[:, (site_id, band_id)] = df3

    ### Constrain results to dates
    allow2 = allow1[start_date:end_date].round(1)
    if fill_na:
        allow2 = allow2.ffill()
    return allow2
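# Usage sketch (not part of the original source). Requires the shared-drive
# csv files set as defaults above; the site number and dates are illustrative.
allow_perc = flow_ros(select=[69607], start_date='2010-07-01',
                      end_date='2015-06-30', fill_na=True)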
Example #9
0
def restr_days(select, period='A-JUN', months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], min_sites_shp='S:/Surface Water/shared/GIS_base/vector/low_flows/min_flows_sites_Cant.shp', sites_col='ReferenceN', export=True, export_path='restr_days.csv'):
    """
    Function to determine the number of days on restriction per period according to the LowFlows database.

    Parameters
    ----------
    select: list or str
        Can either be a list of gauging site numbers or a shapefile polygon of an area that contains min flow sites.
    period: str
        Pandas time series code for the time period.
    months: list of int
        The specific months to include in the query.
    min_sites_shp: str
        A point shapefile of the min flow sites, used when select is a polygon shapefile.
    sites_col: str
        The column containing the site numbers in min_sites_shp (or in the csv passed to select).
    export: bool
        Should the results be exported to csv?
    export_path: str
        The csv path for the export.

    Returns
    -------
    DataFrame
    """

    ########################################
    ### Parameters

    ## Query fields - Be sure to use single quotes for the names!!!

    restr_fields = ['SiteID', 'RestrictionDate', 'BandNo', 'BandAllocation']
#    sites_fields = ['SiteID', 'RefDBaseKey','RecordNo', 'WellNo']
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    sites_fields = ['Siteid', 'RefDBaseKey']


    ## Equivalent short names for analyses - Use these names!!!

    restr_names = ['SiteID', 'dates', 'band_num', 'band_restr']
#    sites_names = ['SiteID', 'gauge_num', 'crc', 'wap']
    crc_names = ['SiteID', 'band_num', 'crc']
    sites_names = ['SiteID', 'gauge_num']

    ## Databases

    #statement = "SELECT * FROM "

    # daily restrictions

    server1 = 'SQL2012PROD03'
    database1 = 'LowFlows'

    restr_table = 'LowFlows.dbo.LowFlowSiteRestrictionDaily'
    restr_where = {'SnapshotType': ['Live']}

    # Sites info

    server2 = 'SQL2012PROD03'
    database2 = 'LowFlows'

    sites_table = 'LowFlows.dbo.vLowFlowSite'

    # crc, sites, and bands

    server3 = 'SQL2012PROD03'
    database3 = 'LowFlows'

    crc_table = 'LowFlows.dbo.vLowFlowConsents2'

    ########################################
    ## Make the sites selection
    if isinstance(select, str):
        if select.endswith('.shp'):
            sites3 = sel_sites_poly(select, min_sites_shp)[sites_col].unique()
        else:
            sites3 = pd.read_csv(select)[sites_col].unique()
    elif isinstance(select, (list, np.ndarray)):
        sites3 = select_sites(select)
    else:
        raise ValueError('select must be a shapefile/csv path or a list/ndarray of site numbers')

    ########################################
    ### Read in data

    sites = rd_sql(server2, database2, sites_table, sites_fields)
    sites.columns = sites_names

    sites4 = sites.loc[sites.gauge_num.isin(sites3.astype(str)), 'SiteID'].unique().astype('int32').tolist()

    restr_where.update({'SiteID': sites4})

    restr = rd_sql(server1, database1, restr_table, restr_fields, restr_where).drop_duplicates(keep='last')
    restr.columns = restr_names

    crc = rd_sql(server3, database3, crc_table, crc_fields)
    crc.columns = crc_names

    ##################################
    ### Calculate the number of days on full and partial restriction

    ## Remove anything above 100%
    restr1 = restr[restr.band_restr <= 100].copy()

    ## Recategorize band restr
    partial_index = (restr1.band_restr > 0) & (restr1.band_restr < 100)

    restr1.loc[partial_index, 'band_restr'] = 101
    restr1.loc[restr1.band_restr == 100, 'band_restr'] = 103
    restr1.loc[restr1.band_restr == 0, 'band_restr'] = 102

    ## Restrict by months
    mon_index = restr1.dates.dt.month.isin(months)
    restr1 = restr1[mon_index]

    ## Do the work
    def sp_count(df, num):
        df.index = df.dates
        df_grp = df[df.band_restr == num].resample(period)
        df_count = df_grp['band_restr'].count()
        return df_count

    restr1_grp = restr1.groupby(['SiteID', 'band_num'])

    partial1 = restr1_grp.apply(sp_count, 101)
    partial1.name = 'partial'
    full1 = restr1_grp.apply(sp_count, 102)
    full1.name = 'full'
#    no1 = restr1_grp.apply(sp_count, 103)

    tot1 = pd.concat([partial1, full1], axis=1)
    tot1.index.names = ['SiteID', 'band_num', 'dates']
    if partial1.empty:
        tot1['partial'] = 0
    if full1.empty:
        tot1['full'] = 0
#    tot1.columns = ['partial', 'full']

    tot2 = tot1.reset_index()

    ## Relabel the sites to actually be site number
    sites2 = sites.drop_duplicates()
    tot3 = pd.merge(tot2, sites2, on='SiteID', how='left')
    tot3.loc[tot3.partial.isnull(), 'partial'] = 0
    tot3.loc[tot3.full.isnull(), 'full'] = 0

    tot3 = tot3[tot3.gauge_num.notnull()]

    ## Summarize the results
    restr2 = tot3[['gauge_num', 'band_num', 'dates', 'partial', 'full']]

    if export:
        restr2.to_csv(export_path, index=False)
    return restr2
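# Usage sketch (not part of the original source). Requires access to the
# LowFlows database via rd_sql; the site numbers and months are illustrative.
restr_summary = restr_days([69607, 70105], period='A-JUN',
                           months=[10, 11, 12, 1, 2, 3, 4], export=False)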
Example #10
0
def rd_squalarc(sites,
                mtypes=None,
                from_date=None,
                to_date=None,
                convert_dtl=False,
                dtl_method=None,
                export=None):
    """
    Function to read in "squalarc" data. Which is atually stored in the mssql db.

    Parameters
    ----------
    sites: ndarray, list, or str
        The site names as a list, array, csv with the first column as the site names, or a polygon shapefile of the area of interest.
    mtypes: list or None
        A list of measurement type names to be in the output. Leaving it empty returns all mtypes.
    from_date: str
        A start date string in the format '2010-01-01'.
    to_date: str
        An end date string in the format '2011-01-01'.
    convert_dtl: bool
        Should values under the detection limit be converted to numeric?
    dtl_method: str
        The method to use to convert values under a detection limit to numeric. None or 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis and includes an additional column dtl_ratio referring to the ratio of values under the detection limit.
    export: str or None
        Either None or a string path to a csv file.
    """

    #### Read in sites
    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, gpd.GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05',
                              'Squalarc',
                              'SITES',
                              col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry',
                                                               axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05',
                              'Wells',
                              'WELL_DETAILS',
                              col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry',
                                                               axis=1)

        sites2 = pd.concat([sw_sites2.site, gw_sites2.site]).astype(str).tolist()
    else:
        sites2 = pd.Series(sites1, name='site').astype(str).tolist()

    #### Extract the rest of the data
    if len(sites2) > 10000:
        n_chunks = int(np.ceil(len(sites2) * 0.0001))
        sites3 = [sites2[i::n_chunks] for i in range(n_chunks)]
        samples_tab = pd.DataFrame()
        for i in sites3:
            samples_tab1 = rd_sql('SQL2012PROD05',
                                  'Squalarc',
                                  '"SQL_SAMPLE_METHODS+"',
                                  col_names=[
                                      'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                      'Collect_Date', 'Collect_Time',
                                      'PA_NAME', 'PARAM_UNITS', 'SRESULT'
                                  ],
                                  where_col='Site_ID',
                                  where_val=i)
            samples_tab1.columns = [
                'site', 'sample_id', 'source', 'date', 'time', 'parameter',
                'units', 'val'
            ]
            samples_tab1.loc[:, 'source'] = samples_tab1.loc[:, 'source'].str.lower()
            samples_tab = pd.concat([samples_tab, samples_tab1])
    else:
        samples_tab = rd_sql('SQL2012PROD05',
                             'Squalarc',
                             '"SQL_SAMPLE_METHODS+"',
                             col_names=[
                                 'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                 'Collect_Date', 'Collect_Time', 'PA_NAME',
                                 'PARAM_UNITS', 'SRESULT'
                             ],
                             where_col='Site_ID',
                             where_val=sites2)
        samples_tab.columns = [
            'site', 'sample_id', 'source', 'date', 'time', 'parameter',
            'units', 'val'
        ]
        samples_tab.loc[:, 'source'] = samples_tab.loc[:, 'source'].str.lower()

    samples_tab2 = samples_tab.copy()
    num_test = pd.to_numeric(samples_tab2.loc[:, 'time'], errors='coerce')
    samples_tab2.loc[num_test.isnull(), 'time'] = '0000'
    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('.', '', regex=False)
    samples_tab2 = samples_tab2[samples_tab2.date.notnull()]
    #    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('9999', '0000')
    time1 = pd.to_datetime(samples_tab2.time, format='%H%M', errors='coerce')
    time1[time1.isnull()] = pd.Timestamp('2000-01-01 00:00:00')
    datetime1 = pd.to_datetime(
        samples_tab2.date.dt.date.astype(str) + ' ' +
        time1.dt.time.astype(str))
    samples_tab2.loc[:, 'date'] = datetime1
    samples_tab2 = samples_tab2.drop('time', axis=1)
    samples_tab2.loc[samples_tab2.val.isnull(), 'val'] = np.nan
    samples_tab2.loc[samples_tab2.val == 'N/A', 'val'] = np.nan

    #### Select within time range
    if isinstance(from_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] >= from_date]
    if isinstance(to_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] <= to_date]

    if mtypes is not None:
        mtypes1 = select_sites(mtypes)
        data = samples_tab2[samples_tab2.parameter.isin(mtypes1)].reset_index(
            drop=True)
    else:
        data = samples_tab2.reset_index(drop=True)

    #### Correct poorly typed in site names
    data.loc[:, 'site'] = data.loc[:, 'site'].str.upper().str.replace(' ', '')

    #### Convert detection limit values
    if convert_dtl:
        less1 = data['val'].str.match('<')
        if less1.sum() > 0:
            less1.loc[less1.isnull()] = False
            data2 = data.copy()
            data2.loc[less1,
                      'val'] = pd.to_numeric(
                          data.loc[less1, 'val'].str.replace('<', ''),
                          errors='coerce') * 0.5
            if dtl_method in (None, 'standard'):
                data3 = data2
            if dtl_method == 'trend':
                df1 = data2.loc[less1]
                count1 = data.groupby('parameter')['val'].count()
                count1.name = 'tot_count'
                count_dtl = df1.groupby('parameter')['val'].count()
                count_dtl.name = 'dtl_count'
                count_dtl_val = df1.groupby('parameter')['val'].nunique()
                count_dtl_val.name = 'dtl_val_count'
                combo1 = pd.concat([count1, count_dtl, count_dtl_val],
                                   axis=1,
                                   join='inner')
                combo1['dtl_ratio'] = (combo1['dtl_count'] /
                                       combo1['tot_count']).round(2)

                ## conditionals
                #            param1 = combo1[(combo1['dtl_ratio'] <= 0.4) | (combo1['dtl_ratio'] == 1)]
                #            under_40 = data['parameter'].isin(param1.index)
                param2 = combo1[(combo1['dtl_ratio'] > 0.4)
                                & (combo1['dtl_val_count'] != 1)]
                over_40 = data['parameter'].isin(param2.index)

                ## Calc detection limit values
                data3 = pd.merge(data,
                                 combo1['dtl_ratio'].reset_index(),
                                 on='parameter',
                                 how='left')
                data3.loc[:, 'val_dtl'] = data2['val']

                max_dtl_val = data2[over_40 & less1].groupby(
                    'parameter')['val'].transform('max')
                max_dtl_val.name = 'dtl_val_max'
                data3.loc[over_40 & less1, 'val_dtl'] = max_dtl_val
        else:
            data3 = data
    else:
        data3 = data

    #### Return and export
    if isinstance(export, str):
        data3.to_csv(export, encoding='utf-8', index=False)
    return data3
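# Usage sketch (not part of the original source). Requires access to the
# Squalarc and Wells databases via rd_sql; the site names, measurement type,
# and dates are illustrative.
wq_data = rd_squalarc(['SQ30001', 'SQ30002'], mtypes=['Nitrate Nitrogen'],
                      from_date='2010-01-01', to_date='2015-12-31',
                      convert_dtl=True, dtl_method='trend')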
Example #11
0
def rd_henry(sites,
             from_date=None,
             to_date=None,
             agg_day=True,
             sites_by_col=False,
             min_filter=None,
             export=None):
    """
    Function to read in gaugings data from the "Henry DB". Hopefully, they keep this around for a while longer.

    Parameters
    ----------
    sites: list or str
        Either a list of site names or a file path string that contains a column of site names.
    from_date: str
        A date string for the start of the data (e.g. '2010-01-01').
    to_date: str
        A date string for the end of the data.
    agg_day: bool
        Should the gauging dates be aggregated down to the day (as opposed to keeping the hour and minute)? Gaugings are aggregated by the mean.
    sites_by_col: bool
        'False' does not make a single DateTimeIndex, rather it is indexed by site and date (long format). 'True' creates a single DateTimeIndex with the columns as gauging sites (will create many NAs).
    min_filter: int or None
        Minimum number of days required for the gaugings output.
    export: str or None
        Either a string path to a csv file or None.
    """
    def resample1(df):
        # Daily mean of the flow values for a single site group
        df.index = df.date
        df2 = df[['flow']].resample('D').mean()
        return df2

    #### Fields and names for databases

    ## Query fields - Be sure to use single quotes for the names!!!

    fields = ['SiteNo', 'SampleDate', 'Flow']

    ## Equivalent short names for analyses - Use these names!!!

    names = ['site', 'date', 'flow']

    #### Databases

    ### Gaugings data

    server = 'SQL2012PROD03'
    database = 'DataWarehouse'

    table = 'DataWarehouse.dbo.F_SG_BGauging'
    where_col = 'SiteNo'

    ## Will change to the following!!! Or stay as a duplicate...

    # database1 = 'Hydstra'

    # table1 = 'Hydstra.dbo.GAUGINGS'

    ########################################
    ### Read in data

    sites1 = select_sites(sites).tolist()
    data = rd_sql(server=server,
                  database=database,
                  table=table,
                  col_names=fields,
                  where_col=where_col,
                  where_val=sites1).dropna()
    data.columns = names

    ### Aggregate duplicates

    data2 = data.groupby(['site', 'date']).mean().reset_index()

    ### Aggregate by day

    if agg_day:
        data3 = data2.groupby(['site']).apply(resample1).reset_index().dropna()
    else:
        data3 = data2

    ### Filter out sites with less than min_filter
    if min_filter is not None:
        count1 = data3.groupby('site')['flow'].count()
        count_index = count1[count1 >= min_filter].index
        data3 = data3[np.in1d(data3.site.values, count_index)]

    ### Select within date range
    if from_date is not None:
        data3 = data3[data3.date >= from_date]
    if to_date is not None:
        data3 = data3[data3.date <= to_date]

    ### reorganize data with sites as columns and dates as index

    if sites_by_col:
        data4 = data3.pivot(index='date', columns='site').xs('flow',
                                                             axis=1).round(4)
    else:
        data4 = data3.round(4)

    if isinstance(export, str):
        if sites_by_col:
            data4.to_csv(export)
        else:
            data4.to_csv(export, index=False)

    return data4
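# Usage sketch (not part of the original source). Requires access to the
# DataWarehouse gaugings table via rd_sql; the site numbers are illustrative.
gaugings = rd_henry([69607, 70105], from_date='2000-01-01', agg_day=True,
                    sites_by_col=True, min_filter=10)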
Example #12
0
def rd_hydrotel(sites,
                hydro_id,
                from_date=None,
                to_date=None,
                resample_code='D',
                period=1,
                val_round=3,
                min_count=None,
                pivot=False,
                export_path=None):
    """
    Function to extract time series data from the hydrotel database.

    Parameters
    ----------
    sites: list, array, dataframe, or str
        Site list or a str path to a single column csv file of site names/numbers.
    hydro_id: str
        'river / flow / rec / raw', 'aq / wl / rec / raw', 'atmos / precip / rec / raw', 'river / wl / rec / raw', or 'river / T / rec / raw'.
    from_date: str or None
        The start date in the format '2000-01-01'.
    to_date: str or None
        The end date in the format '2000-01-01'.
    resample_code : str
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period: int
        The number of resampling periods. e.g. period = 2 and resample = 'D' would be to resample the values over a 2 day period.
    val_round: int
        The number of decimals to round the values.
    min_count: int or None
        The minimum number of values required per resampling period, passed through to rd_sql_ts.
    pivot: bool
        Should the output be pivotted into wide format?
    export_path: str or None
        The path and file name to be saved.

    Returns
    -------
    Series or DataFrame
        A MultiIndex Pandas Series if pivot is False and a DataFrame if True
    """
    #### Import data and select the correct sites

    sites = select_sites(sites)
    if hydro_id == 'atmos / precip / rec / raw':
        site_ob1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'],
                          'ExtSysId',
                          sites.astype('int32').tolist())
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'],
                           'Site', site_ob1.Site.tolist())
        site_val1 = pd.merge(site_val0, site_ob1, on='Site')
    elif hydro_id in ['aq / wl / rec / raw', 'aq / T / rec / raw']:
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites)].copy()
        site_val1.loc[:, 'ExtSysId'] = site_val1.loc[:, 'Name']
    else:
        site_val1 = rd_sql(server, database, sites_tab, sites_col, 'ExtSysId',
                           sites.astype('int32').tolist())

    if site_val1.empty:
        raise ValueError('No site(s) in database')

    site_val1.loc[:, 'ExtSysId'] = pd.to_numeric(site_val1.loc[:, 'ExtSysId'],
                                                 errors='ignore')
    site_val1 = site_val1.drop_duplicates('ExtSysId')
    site_val = site_val1.Site.astype('int32').tolist()
    if isinstance(hydro_id, (list, np.ndarray, pd.Series)):
        hydro_ids = [hydro_ids_dict[i] for i in hydro_id]
    elif isinstance(hydro_id, str):
        hydro_ids = [hydro_ids_dict[hydro_id]]
    else:
        raise ValueError('hydro_id must be a str, list, ndarray, or Series.')
    hydro_ids_val = rd_sql(server, database, hydro_ids_tab, hydro_ids_col,
                           'Name', hydro_ids)

    where_col = {
        'Site': site_val,
        'ObjectVariant': hydro_ids_val.ObjectVariant.astype('int32').tolist(),
        'ObjectType': hydro_ids_val.ObjectType.astype('int32').tolist()
    }

    object_val1 = rd_sql(server, database, objects_tab, objects_col, where_col)
    if hydro_id == 'aq / wl / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Water Level']
    elif hydro_id == 'atmos / precip / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Rainfall']
    elif hydro_id == 'river / T / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Water Temperature']
    object_val = object_val1.Object.values.astype(int).tolist()

    #### Rearrange data
    point_val1 = rd_sql(server,
                        database,
                        points_tab,
                        points_col,
                        where_col='Object',
                        where_val=object_val)
    point_val = point_val1.Point.values.astype(int).tolist()

    #### Big merge
    comp_tab1 = pd.merge(site_val1, object_val1[['Object', 'Site']], on='Site')
    comp_tab2 = pd.merge(comp_tab1, point_val1, on='Object')
    comp_tab2.set_index('Point', inplace=True)

    #### Pull out the data
    ### Make SQL statement
    data1 = rd_sql_ts(server,
                      database,
                      data_tab,
                      'Point',
                      'DT',
                      'SampleValue',
                      resample_code,
                      period,
                      resample_dict[hydro_id],
                      val_round, {'Point': point_val},
                      from_date=from_date,
                      to_date=to_date,
                      min_count=min_count)['SampleValue']

    data1.index.names = ['site', 'time']
    data1.name = 'value'
    site_numbers = [
        comp_tab2.loc[i, 'ExtSysId'] for i in data1.index.levels[0]
    ]
    data1.index = data1.index.set_levels(site_numbers, level='site')

    if pivot:
        data3 = data1.unstack(0)
    else:
        data3 = data1

    #### Export and return
    if export_path is not None:
        save_df(data3, export_path)

    return data3
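# Usage sketch (not part of the original source). Requires the Hydrotel
# database and the module-level table/column constants referenced above; the
# site number is illustrative.
precip = rd_hydrotel([403711], hydro_id='atmos / precip / rec / raw',
                     from_date='2017-01-01', to_date='2017-12-31',
                     resample_code='D', pivot=True)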
Example #13
0
def stream_nat(
        sites,
        catch_shp=r'S:\Surface Water\shared\GIS_base\vector\catchments\catch_delin_recorders.shp',
        include_gw=True,
        max_date='2015-06-30',
        sd_hdf='S:/Surface Water/shared/base_data/usage/sd_est_all_mon_vol.h5',
        flow_csv=None,
        crc_shp=r'S:\Surface Water\shared\GIS_base\vector\allocations\allo_gis.shp',
        catch_col='site',
        pivot=False,
        return_data=False,
        export_path=None):
    """
    Function to naturalize stream flows from monthly sums of usage.

    Parameters
    ----------
    sites: list, ndarray, Series
        A list of recorder sites to be naturalised.
    catch_shp: str
        A shapefile of the delineated catchments for all recorders.
    include_gw: bool
        Should stream depleting GW takes be included?
    max_date: str
        The last date to be naturalised. In the form of '2015-06-30'.
    sd_hdf: str
        The hdf file of all the crc/waps with estimated usage and allocation.
    flow_csv: str or None
        If None, then use the hydro class to import the data. Otherwise, flow data can be imported as a csv file with the first column as datetime and each other column as a recorder site in m3/s. It can also be a dataframe.
    crc_shp: str
        A shapefile of all of the locations of the crc/waps.
    catch_col: str
        The column in catch_shp containing the site numbers.
    pivot: bool
        Should the output be pivotted?
    return_data: bool
        Should the allocation/usage time series be returned?
    export_path: str or None
        Path to save results as either hdf or csv (or None).

    Returns
    -------
    DataFrame
    """

    qual_codes = [10, 18, 20, 50]

    ### Read in data
    ## Site numbers
    sites1 = select_sites(sites)

    ## Stream depletion
    sd = pd.read_hdf(sd_hdf)
    sd.time = pd.to_datetime(sd.time)
    if include_gw:
        sd1 = sd[sd.time <= max_date]
    else:
        sd1 = sd[(sd.take_type == 'Take Surface Water')
                 & (sd.time <= max_date)]

    ## Recorder flow
    if isinstance(flow_csv, str):
        flow = rd_ts(flow_csv)
    elif isinstance(flow_csv, pd.DataFrame):
        flow = flow_csv.copy()
    elif isinstance(flow_csv, pd.Series):
        flow = flow_csv.copy()
    else:
        raise ValueError('Pass something useful to flow_csv.')
    if isinstance(flow, pd.DataFrame):
        # Convert the wide flow table into a long Series indexed by site and time
        flow.columns = flow.columns.astype(int)
        flow.index.name = 'time'
        flow.columns.name = 'site'
        flow = flow.stack()
        flow.name = 'flow'
        flow.index = flow.index.reorder_levels(['site', 'time'])
        flow = flow.sort_index()

    ## crc shp
    crc_loc = gpd.read_file(crc_shp)
    crc_loc1 = pd.merge(
        crc_loc[[
            'crc', 'take_type', 'allo_block', 'wap', 'use_type', 'geometry'
        ]],
        sd[['crc', 'take_type', 'allo_block', 'wap',
            'use_type']].drop_duplicates(),
        on=['crc', 'take_type', 'allo_block', 'wap', 'use_type'])

    ## Catchment areas shp
    catch = gpd.read_file(catch_shp).drop('NZREACH', axis=1)
    catch = catch[catch[catch_col].isin(sites1)]

    ### Spatial processing of WAPs, catchments, and sites
    ## WAPs to catchments sjoin
    crc_catch, catch2 = pts_poly_join(crc_loc1, catch, catch_col)

    #    id_areas = catch2.area.copy()
    #    tot_areas = catch2.area.copy()
    #
    #    ## Unique catchments/gauges
    ##    sites = wap_catch[catch_col].unique()
    #    sites2 = catch[catch_col].unique()

    ### Next data import
    ## Gaugings
    #    gaugings = rd_henry(sites=sites.astype('int32'), agg_day=True, sites_by_col=True)
    #    gaugings.columns = gaugings.columns.astype(int)

    ## site specific flow
    #    rec_sites = flow.columns[in1d(flow.columns, sites)]
    #    gauge_sites = sites[~in1d(sites, rec_sites)]
    #    gauge_sites2 = gaugings.columns[in1d(gaugings.columns, gauge_sites)]
    #    site_flow = flow[rec_sites]
    #    gaugings = gaugings[gauge_sites2]

    ### filter down the sites
    sd1a = pd.merge(crc_catch,
                    sd1,
                    on=['crc', 'take_type', 'allo_block', 'wap',
                        'use_type']).drop('geometry', axis=1)

    ### Remove excessive usages
    sd1a = sd1a[~((sd1a.sd_usage / sd1a.ann_restr_allo_m3 / 12) >= 1.5)]

    ### Calc SD for site and month
    sd2 = sd1a.groupby(['site', 'time'])['sd_usage'].sum().reset_index()
    days1 = sd2.time.dt.daysinmonth
    sd2['sd_rate'] = sd2.sd_usage / days1 / 24 / 60 / 60

    ### Resample SD to daily time series
    days2 = pd.to_timedelta((days1 / 2).round().astype('int32'), unit='D')
    sd3 = sd2.drop('sd_usage', axis=1)
    sd3.loc[:, 'time'] = sd3.loc[:, 'time'] - days2
    grp1 = sd3.groupby(['site'])
    first1 = grp1.first()
    last1 = sd2.groupby('site')[['time', 'sd_rate']].last()
    first1.loc[:, 'time'] = pd.to_datetime(
        first1.loc[:, 'time'].dt.strftime('%Y-%m') + '-01')
    sd4 = pd.concat([first1.reset_index(), sd3,
                     last1.reset_index()
                     ]).reset_index(drop=True).sort_values(['site', 'time'])
    sd5 = sd4.set_index('time')
    sd6 = sd5.groupby('site').apply(
        lambda x: x.resample('D').interpolate(method='pchip'))['sd_rate']

    ### Naturalise flows
    nat1 = pd.concat([flow, sd6], axis=1, join='inner')
    nat1['nat_flow'] = nat1['flow'] + nat1['sd_rate']

    ## Normalize to area if desired
    #    if norm_area:
    #        # recorder flow in mm/day
    #        site_order = tot_areas[flow1.columns].values / 60 / 60 / 24 / 1000
    #        flow_norm = flow1.div(site_order)
    #        nat_flow_norm = nat_flow.div(site_order)
    #
    #        # Gauges flow in mm/day
    #        site_order = tot_areas[gaugings1.columns].values / 60 / 60 / 24 / 1000
    #        gaugings_norm = gaugings1.div(site_order)
    #        nat_gauge_norm = nat_gauge.div(site_order)
    #
    #        ### Export and return results
    #        if export:
    #            nat_flow_norm.to_csv(export_rec_flow_path)
    #            nat_gauge_norm.to_csv(export_gauge_flow_path)
    #        return([flow_norm, gaugings_norm, nat_flow_norm, nat_gauge_norm])
    #    else:
    #        if export:
    #            nat_flow.to_csv(export_rec_flow_path)
    #            nat_gauge.to_csv(export_gauge_flow_path)
    #        return([flow1, gaugings1, nat_flow, nat_gauge])
    if pivot:
        nat2 = nat1.round(3).unstack('site')
    else:
        nat2 = nat1.round(3)
    if isinstance(export_path, str):
        save_df(nat2, export_path)
    if return_data:
        return nat2, sd1a
    else:
        return nat2
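# Usage sketch (not part of the original source). Requires the shared-drive
# shapefile/hdf defaults above plus a flow source; the site numbers and the
# flow csv path are illustrative.
nat_flow, usage_ts = stream_nat([69607, 70105], flow_csv='flow_data.csv',
                                pivot=True, return_data=True)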
Example #14
0
def rec_catch_del(sites_shp, rec_streams_shp, rec_catch_shp, sites_col='site', buffer_dis=400, catch_output=None):
    """
    Catchment delineation using the REC streams and catchments.

    Parameters
    ----------
    sites_shp : str path or GeoDataFrame
        Points shapefile of the sites along the streams or the equivalent GeoDataFrame.
    rec_streams_shp : str path, GeoDataFrame, or dict
        str path to the REC streams shapefile, the equivalent GeoDataFrame, or a dict of parameters to read in an mssql table using the rd_sql function.
    rec_catch_shp : str path, GeoDataFrame, or dict
        str path to the REC catchment shapefile, the equivalent GeoDataFrame, or a dict of parameters to read in an mssql table using the rd_sql function.
    sites_col : str
        The column name of the site numbers in the sites_shp.
    buffer_dis : int
        The buffer distance used to find the closest REC stream segment to each site.
    catch_output : str or None
        The output polygon shapefile path of the catchment delineation.

    Returns
    -------
    GeoDataFrame
        Polygons
    """

    ### Parameters


    ### Modifications {NZREACH: {NZTNODE/NZFNODE: node # to change}}
    mods = {13053151: {'NZTNODE': 13055874}, 13048353: {'NZTNODE': 13048851}, 13048498: {'NZTNODE': 13048851}}

    ### Load data
    if isinstance(rec_catch_shp, gpd.GeoDataFrame):
        rec_catch = rec_catch_shp.copy()
    elif isinstance(rec_catch_shp, str):
        if rec_catch_shp.endswith('shp'):
            rec_catch = gpd.read_file(rec_catch_shp)
        else:
            raise ValueError('If rec_catch_shp is a str, then it must be a path to a shapefile.')
    elif isinstance(rec_catch_shp, dict):
        rec_catch = rd_sql(**rec_catch_shp)

    if isinstance(rec_streams_shp, gpd.GeoDataFrame):
        rec_streams = rec_streams_shp.copy()
    elif isinstance(rec_streams_shp, str):
        if rec_streams_shp.endswith('shp'):
            rec_streams = gpd.read_file(rec_streams_shp)
        else:
            raise ValueError('If rec_streams_shp is a str, then it must be a path to a shapefile.')
    elif isinstance(rec_streams_shp, dict):
        rec_streams = rd_sql(**rec_streams_shp)

    pts = select_sites(sites_shp)

    ### make mods
    for i in mods:
        rec_streams.loc[rec_streams['NZREACH'] == i, list(mods[i].keys())] = list(mods[i].values())

    ### Find closest REC segment to points
    pts_seg = closest_line_to_pts(pts, rec_streams, line_site_col='NZREACH', buffer_dis=buffer_dis)
    nzreach = pts_seg.copy().NZREACH.unique()

    ### Find all upstream reaches
    reaches = find_upstream_rec(nzreach, rec_streams_shp=rec_streams)

    ### Extract associated catchments
    rec_catch = extract_rec_catch(reaches, rec_catch_shp=rec_catch)

    ### Aggregate individual catchments
    rec_shed = agg_rec_catch(rec_catch)
    rec_shed.columns = ['NZREACH', 'geometry', 'area']
    rec_shed1 = rec_shed.merge(pts_seg.drop('geometry', axis=1), on='NZREACH')

    ### Export and return
    if catch_output is not None:
        rec_shed1.to_file(catch_output)
    return rec_shed1
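# Usage sketch (not part of the original source). The shapefile paths are
# illustrative; any of the inputs can also be passed as GeoDataFrames (or, for
# the REC layers, as rd_sql parameter dicts).
catchments = rec_catch_del('flow_sites.shp', 'rec_streams.shp',
                           'rec_catchments.shp', sites_col='site',
                           catch_output='catch_del.shp')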