Example #1
def poly_import(irr_type_dict, paw_dict, paw_ratio=0.67):
    """
    Function to import polygon input data (currently irrigation type and PAW).

    Inputs are dictionaries that reference either an MSSQL table with a geometry column or a shapefile. If a dictionary references an SQL table, the keys should be 'server', 'database', 'table', and 'column'. If it references a shapefile, the keys should be 'shp' and 'column'. All values should be strings.
    """

    if not all([isinstance(irr_type_dict, dict), isinstance(paw_dict, dict)]):
        raise TypeError("'irr_type_dict' and 'paw_dict' must be dictionaries.")

    for d in (irr_type_dict, paw_dict):
        if 'column' not in d:
            raise TypeError("The key 'column' must be in the dictionaries.")
        if not isinstance(d['column'], str):
            raise TypeError("The key 'column' must be a string.")

    if 'shp' in irr_type_dict.keys():
        if not isinstance(irr_type_dict['shp'], str):
            raise TypeError("If 'shp' is in the dict, then it must be a string path to a shapefile.")
        irr1 = gpd.read_file(irr_type_dict['shp'])[[irr_type_dict['column'], 'geometry']]
    else:
        irr1 = rd_sql(irr_type_dict['server'], irr_type_dict['database'], irr_type_dict['table'], [irr_type_dict['column']], geo_col=True)
    irr1.rename(columns={irr_type_dict['column']: 'irr_type'}, inplace=True)

    if 'shp' in paw_dict.keys():
        if not isinstance(paw_dict['shp'], str):
            raise TypeError("If 'shp' is in the dict, then it must be a string path to a shapefile.")
        paw1 = gpd.read_file(paw_dict['shp'])[[paw_dict['column'], 'geometry']]
    else:
        paw1 = rd_sql(paw_dict['server'], paw_dict['database'], paw_dict['table'], [paw_dict['column']], geo_col=True)
    paw1.rename(columns={paw_dict['column']: 'paw'}, inplace=True)
    paw1.loc[:, 'paw'] = paw1.loc[:, 'paw'] * paw_ratio

    return irr1, paw1
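A minimal usage sketch, assuming rd_sql and geopandas (gpd) are importable; the shapefile path, server, table, and column names below are hypothetical:

irr_dict = {'shp': 'irrigation_type.shp', 'column': 'irr_type_col'}
paw_dict = {'server': 'SQL2012PROD05', 'database': 'GIS', 'table': 'PAW_POLY', 'column': 'paw_mid'}

irr, paw = poly_import(irr_dict, paw_dict, paw_ratio=0.67)  # hypothetical inputs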
Example #2
def metconnect_id_loc(sites=None,
                      mc_server='SQL2012PROD03',
                      mc_db='MetConnect',
                      mc_site_table='RainFallPredictionSites',
                      mc_cols=['MetConnectID', 'SiteString', 'TidedaID'],
                      gis_server='SQL2012PROD05'):
    """
    Function to extract the metconnect id table with geometry location.

    Parameters
    ----------
    sites : list of int or None
        The site numbers to extract from the table, or None for all.

    Returns
    -------
    GeoDataFrame
    """

    ### Input parameters
    #    hy_server = 'SQL2012PROD05'
    #    hy_db = 'Hydrotel'
    #    pts_table = 'Points'
    #    objs_table = 'Objects'
    #    sites_table = 'Sites'
    #
    #    pts_cols = ['Point', 'Object']
    #    objs_cols = ['Object', 'Site']
    #    sites_cols = ['Site', 'ExtSysId']

    loc_db = 'Bgauging'
    loc_table = 'RSITES'

    loc_cols = ['SiteNumber', 'NZTMX', 'NZTMY']

    ## Import tables
    mc1 = rd_sql(mc_server, mc_db, mc_site_table, mc_cols)
    mc2 = mc1[~mc1.SiteString.str.startswith('M')].copy()  # .copy() avoids SettingWithCopyWarning on the .loc assignment below
    mc2.columns = ['MetConnectID', 'site_name', 'ExtSysId']
    mc2 = mc2[(mc2.MetConnectID != 7) & mc2.ExtSysId.notnull()].copy()
    mc2.loc[:, 'ExtSysId'] = mc2.loc[:, 'ExtSysId'].astype(int)
    if sites is not None:
        # Assumption: 'sites' refers to the ExtSysId site numbers; the original left this parameter unused
        mc2 = mc2[mc2.ExtSysId.isin(sites)]

    #    hy_pts = rd_sql(hy_server, hy_db, pts_table, pts_cols, 'Point', mc2.Point.tolist())
    #    hy_objs = rd_sql(hy_server, hy_db, objs_table, objs_cols, 'Object', hy_pts.Object.tolist())
    #    hy_sites = rd_sql(hy_server, hy_db, sites_table, sites_cols, 'Site', hy_objs.Site.tolist())
    #    hy_sites['ExtSysId'] = to_numeric(hy_sites['ExtSysId'])
    hy_loc = rd_sql(gis_server, loc_db, loc_table, loc_cols, 'SiteNumber',
                    mc2.ExtSysId.tolist())
    hy_loc.columns = ['ExtSysId', 'x', 'y']

    #    t1 = merge(mc2, hy_pts, on='Point')
    #    t2 = merge(t1, hy_objs, on='Object')
    #    t3 = merge(t2, hy_sites, on='Site')
    t4 = pd.merge(mc2, hy_loc, on='ExtSysId')

    hy_xy = xy_to_gpd('MetConnectID', 'x', 'y', t4)

    return hy_xy
Example #3
def rating_changes(sites=None, from_mod_date=None, to_mod_date=None):
    """
    Function to determine flow rating changes during a specified period.

    Parameters
    ----------
    sites: list of str
        List of sites to be returned. None includes all sites.
    from_mod_date: str
        The starting date when the data has been modified.
    to_mod_date: str
        The ending date when the data has been modified.

    Returns
    -------
    DataFrame
        With site, varfrom, varto, and from_date
    """
    ### Parameters
    server = 'SQL2012PROD03'
    database = 'Hydstra'

    table_per = 'RATEPER'
    table_hed = 'RATEHED'
    fields_per = ['STATION', 'VARFROM', 'VARTO', 'SDATE', 'STIME', 'REFTAB', 'PHASE']
    names_per = ['site', 'varfrom', 'varto', 'sdate', 'stime', 'reftab', 'phase']
    fields_hed = ['STATION', 'VARFROM', 'VARTO', 'TABLE', 'RELDATE']
    names_hed = ['site', 'varfrom', 'varto', 'reftab', 'date']

    ### Read data
    if isinstance(sites, list):
        where_col = {'STATION': sites}
    else:
        where_col = None

    rate_hed = rd_sql(server, database, table_hed, fields_hed, where_col, rename_cols=names_hed, from_date=from_mod_date, to_date=to_mod_date, date_col='RELDATE')
    rate_hed['site'] = rate_hed['site'].str.strip()

    where_per = {'STATION': rate_hed['site'].astype(str).unique().tolist()}

    rate_per = rd_sql(server, database, table_per, fields_per, where_per, rename_cols=names_per, where_op='OR')
    rate_per['site'] = rate_per['site'].str.strip()
    time1 = pd.to_timedelta(rate_per['stime'] // 100, unit='H') + pd.to_timedelta(rate_per['stime'] % 100, unit='m')
    rate_per['sdate'] = rate_per['sdate'] + time1
    rate_per = rate_per.sort_values(['site', 'sdate']).reset_index(drop=True).drop('stime', axis=1)

    rate_per1 = pd.merge(rate_per, rate_hed[['site', 'reftab']], on=['site', 'reftab'])
    rate_per2 = rate_per1.groupby('site')['sdate'].min().reset_index()
    rate_per2.columns = ['site', 'from_date']

    rate_per2['varfrom'] = 100
    rate_per2['varto'] = 140

    return rate_per2[['site', 'varfrom', 'varto', 'from_date']]
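A usage sketch; the site number and modification window below are hypothetical:

changes = rating_changes(sites=['69607'], from_mod_date='2018-01-01', to_mod_date='2018-06-30')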
Example #4
def _rd_hydro_geo_mssql(self, server, database, table, geo_dict):
    """
    Function to select sites based on the geo attributes.
    """

    sites1 = rd_sql(server, database, table, 'site', geo_dict)
    sites2 = sites1.site.astype(str).values.tolist()
    return sites2
Example #5
def telem_corr_sites(site_num=None):
    """
    Function to determine if sites are telemetered or are correlated from telemetered sites in Hydrotel. Output is a list of correlated sites.

    Parameters
    ----------
    site_num: list of str
        Site numbers for the selection.

    Returns
    -------
    List of str
        List of site numbers that are correlated sites.
    """
    ### Parameters
    server = 'SQL2012PROD05'
    database = 'Hydrotel'
    sites_tab = 'Sites'
    obj_tab = 'Objects'

    sites_fields = ['Site', 'ExtSysID']
    obj_fields = ['Site', 'Name']

    where_dict = {'Name': ['calculated flow']}

    ### Read in data
    if isinstance(site_num, list):
        sites = rd_sql(server, database, sites_tab, sites_fields,
                       {'ExtSysID': site_num})
        sites['ExtSysID'] = pd.to_numeric(sites['ExtSysID'], 'coerce')
    else:
        sites = rd_sql(server, database, sites_tab, sites_fields)
        sites['ExtSysID'] = pd.to_numeric(sites['ExtSysID'], 'coerce')
        sites = sites[sites.ExtSysID.notnull()]

    sites['Site'] = sites['Site'].astype('int32')

    where_dict.update({'Site': sites.Site.tolist()})

    obj = rd_sql(server, database, obj_tab, obj_fields, where_dict)
    corr_sites = sites[sites.Site.isin(obj.Site)]

    return corr_sites.ExtSysID.astype('int32').astype(str).tolist()
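A usage sketch with hypothetical site numbers; passing None instead scans all Hydrotel sites with a numeric ExtSysID:

corr_sites = telem_corr_sites(['69607', '70105'])  # hypothetical site numbers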
Example #6
def hydstra_sites_var(varto=None, data_source='A', server='SQL2012PROD03', database='Hydstra'):
    """
    Function to extract all of the sites associated with specific varto codes.

    Parameters
    ----------
    varto: list of int or int
        The Hydstra specific variable codes. None equates to all varto's.
    data_source: str
        The Hydstra data source ID. 'A' is archive.

    Returns
    -------
    DataFrame
        With site, data_source, varfrom, and varto
    """
    ### Parameters
    period_tab = 'PERIOD'

    period_cols = ['STATION', 'VARFROM', 'VARIABLE']
    period_names = ['site', 'varfrom', 'varto']

    ## Removals
    rem_dict = {'165131': [140, 140], '69302': [140, 140], '71106': [140, 140], '366425': [140, 140]}

    ### Import

    if varto is None:
        period_where = {'DATASOURCE': data_source}
    elif isinstance(varto, int):
        period_where = {'DATASOURCE': data_source, 'VARIABLE': [varto]}
    elif isinstance(varto, list):
        period_where = {'DATASOURCE': data_source, 'VARIABLE': varto}
    else:
        raise TypeError('varto must be None, int, or list')

    period1 = rd_sql(server, database, period_tab, period_cols, where_col=period_where, rename_cols=period_names)
    period1.loc[:, 'site'] = period1.site.str.strip()

    ### Determine the variables to extract
    period2 = period1[period1.varto.isin(period1.varto.round())].sort_values('site')
    period2 = period2[period2.varto != 101]
    for i in rem_dict:
        period2 = period2[~((period2.site == i) & (period2.varfrom == rem_dict[i][0]) & (period2.varto == rem_dict[i][1]))]

    ### Convert variables to int
    period3 = period2.copy()
    period3['varfrom'] = period3['varfrom'].astype('int32')
    period3['varto'] = period3['varto'].astype('int32')

    ### Return
    return period3
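A usage sketch; varto=140 is the flow variable code used elsewhere in this module (see rating_changes above):

flow_sites = hydstra_sites_var(varto=140)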
Example #7
def hydrotel_sites_by_hydroid(hydro_id):
    """
    Function to return all the sites associated with a particular hydro_id.

    Parameters
    ----------
    hydro_id: str
        The HydroPandas hydro_id.

    Returns
    -------
    DataFrame
        of hydrotel_id and ECan site id.
    """
    if hydro_id in ['aq / wl / rec / raw', 'aq / T / rec / raw']:
        sites1 = rd_sql(server, db_wells, wells_tab, wells_col)[wells_col[0]]
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites1)].copy()
    elif hydro_id in ['atmos / precip / rec / raw']:
        sites1 = rd_sql(server, db_bgauging, bg_tab, bg_col,
                        {'RainfallSite': 'R'})[bg_col[0]]
        site_val1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'],
                           'ExtSysId',
                           sites1.astype('int32').tolist()).sort_values('Site')


#        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'], 'Site', site_ob1.Site.tolist())
#        site_val1 = pd.merge(site_val0, site_ob1, on='Site')
    else:
        sites1 = rd_sql(server, db_bgauging, bg_tab, bg_col,
                        {'RainfallSite': 'N'})[bg_col[0]]
        site_val1 = rd_sql(server, database, sites_tab, ['Site', 'ExtSysId'],
                           'ExtSysId',
                           sites1.astype('int32').tolist()).sort_values('Site')
    site_val1.columns = ['hydrotel_id', 'site']
    site_val2 = site_val1.drop_duplicates('site')

    return site_val2
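A usage sketch; note the function relies on module-level connection constants (server, database, sites_tab, objects_tab, etc.) being defined elsewhere in the module:

precip_sites = hydrotel_sites_by_hydroid('atmos / precip / rec / raw')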
Example #8
def priority_gaugings(num_previous_months=2):
    """
    Function to extract the gauging sites for low flow restrictions that haven't been gauged in a while.

    Parameters
    ----------
    num_previous_months: int
        The number of previous months to query over.

    Returns
    -------
    DataFrame
    """
    ########################################
    ### Parameters

    is_active = True

    ## Query fields - Be sure to use single quotes for the names!!!
    ass_fields = ['SiteID', 'MeasuredDate', 'Flow']
    sites_fields = ['Siteid', 'RefDBaseKey', 'Waterway', 'Location']
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    site_type_fields = ['SiteID', 'BandNo', 'RestrictionType']

    ## Equivalent short names for analyses - Use these names!!!
    ass_names = ['SiteID', 'date', 'flow']
    sites_names = ['SiteID', 'site', 'Waterway', 'Location']
    crc_names = ['SiteID', 'band_num', 'crc']
    site_type_names = ['SiteID', 'band_num', 'restr_type']

    ## Databases

    server1 = 'SQL2012PROD03'
    database1 = 'LowFlows'

    # assessments table
    ass_table = 'LowFlowSiteAssessment'

    # Sites info
    sites_table = 'LowFlowSite'

    # crc, sites, and bands
    crc_table = 'LowFlows.dbo.vLowFlowConsents2'

    # lowflow or residual flow site
    site_type_table = 'LowFlowSiteBand'

    ########################################
    ### Read in data

    today1 = pd.to_datetime(date.today())
    from_date = today1 - pd.DateOffset(months=num_previous_months)

    sites = rd_sql(server1, database1, sites_table, sites_fields, {'isActive': [is_active], 'RefDBase': ['Gauging']}, rename_cols=sites_names)

    ass = rd_sql(server1, database1, ass_table, ass_fields, {'MethodID': [1, 2, 3]}, rename_cols=ass_names)

    crc = rd_sql(server1, database1, crc_table, crc_fields, {'isCurrent': [is_active]}, rename_cols=crc_names)

    site_type = rd_sql(server1, database1, site_type_table, site_type_fields, {'isActive': [is_active]}, rename_cols=site_type_names)

    #######################################
    ### Process data

    ## max ass
    max_ass1 = ass.groupby('SiteID')['date'].max().reset_index()
    max_ass = pd.merge(ass, max_ass1, on=['SiteID', 'date']).set_index('SiteID')
    max_ass.columns = ['last_gauging', 'flow']

    ## crc counts
    crc_count = crc.groupby('SiteID')['crc'].count()
    crc_count.name = 'crc_count'

    ## Only low flow sites
    lowflow_site1 = site_type[site_type.restr_type == 'LowFlow'].SiteID.unique()
    lowflow_sites = sites[sites.SiteID.isin(lowflow_site1)].set_index('SiteID')

    ## Combine all df
    sites_lastg = pd.concat([lowflow_sites, max_ass, crc_count], axis=1, join='inner')

    ## Filter out sites that have been gauged recently
    sites_lastg2 = sites_lastg[sites_lastg['last_gauging'] < from_date].sort_values('crc_count', ascending=False)

    ## Add in min and max triggers
    basic, complete = min_max_trig()
    basic2 = basic[basic.mon == today1.month].copy().drop('mon', axis=1).set_index('SiteID')
    basic2['trig_date'] = today1.date()

    ## Combine
    sites_lastg3 = pd.concat([sites_lastg2, basic2], axis=1, join='inner').set_index('site')

    ### Return
    return sites_lastg3
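A usage sketch extending the query window to three months:

priority_sites = priority_gaugings(num_previous_months=3)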
Example #9
def low_flow_restr(sites_num=None, from_date=None, to_date=None, only_restr=True):
    """
    Function to determine the flow sites currently on restriction.

    Parameters
    ----------
    sites_num : list or None
        A list of sites to return, or all sites if None.
    from_date: str
        The start date in the format '2017-01-01'.
    to_date : str
        The end date in the format '2017-01-01'.
    only_restr : bool
        Should only the sites that are on some kind of restriction be returned?

    Returns
    -------
    DataFrames
        Two DataFrames are returned. One is a summary of each site on restriction and one has the sites and bands on restriction.

    Notes
    -----
    Do not query low flow history beyond the last season, as band and consent history is not stored; the tables reflect only the currently active bands and consents.
    """

    ########################################
    ### Parameters

    is_active = True
    hour1 = datetime.now().hour
    today1 = date.today()

    ## Query fields - Be sure to use single quotes for the names!!!
    restr_fields = ['SiteID', 'RestrictionDate', 'BandNo', 'BandAllocation', 'AsmtFlow']
    sites_fields = ['Siteid', 'RefDBaseKey', 'Waterway', 'Location']
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    site_type_fields = ['SiteID', 'BandNo', 'RestrictionType']

    ## Equivalent short names for analyses - Use these names!!!
    restr_names = ['SiteID', 'band_num', 'date', 'flow', 'band_allo']
    sites_names = ['SiteID', 'site', 'waterway', 'location']
    crc_names = ['SiteID', 'band_num', 'crc']
    site_type_names = ['SiteID', 'band_num', 'site_type']
    ass_names = ['SiteID', 'flow_method', 'applies_date', 'date']

    ## Databases

    server1 = 'SQL2012PROD03'
    database1 = 'LowFlows'

    # daily restrictions
    restr_table = 'LowFlows.dbo.LowFlowSiteRestrictionDaily'
    restr_where = {'SnapshotType': ['Live']}

    # Sites info
    sites_table = 'LowFlowSite'

    # crc, sites, and bands
    crc_table = 'LowFlows.dbo.vLowFlowConsents2'

    # lowflow or residual flow site
    site_type_table = 'LowFlowSiteBand'

    # assessments table
    ass_table = 'LowFlowSiteAssessment'

    # Ass stmt
    ass_stmt = "select SiteID, MethodID, AppliesFromDate, MeasuredDate from LowFlows.dbo.LowFlowSiteAssessment t1 WHERE EXISTS(SELECT 1 FROM LowFlows.dbo.LowFlowSiteAssessment t2 WHERE t2.SiteID = t1.SiteID GROUP BY t2.SiteID HAVING t1.MeasuredDate = MAX(t2.MeasuredDate))"

    ## Method dict
    method_dict = {1: 'Gauged', 2: 'Visually Gauged', 3: 'Telemetered', 4: 'Manually Calculated', 5: 'Correlated from Telem'}


    ########################################
    ### Read in data

    sites = rd_sql(server1, database1, sites_table, sites_fields, {'isActive': [is_active]}, rename_cols=sites_names)

    if only_restr:
        allo_values = list(np.arange(100))
        allo_values.extend(list(np.arange(103, 110)))
    else:
        allo_values = list(np.arange(110))

    restr_day = rd_sql_ts(server=server1, database=database1, table=restr_table, groupby_cols=['SiteID', 'BandNo'], date_col='RestrictionDate',  values_cols=['AsmtFlow', 'BandAllocation'], from_date=from_date, to_date=to_date, where_col={'BandAllocation': allo_values})
    restr_day = restr_day.reset_index()
    restr_day.columns = restr_names

    crc = rd_sql(server1, database1, crc_table, crc_fields, {'isCurrent': [is_active]}, rename_cols=crc_names)

    site_type = rd_sql(server1, database1, site_type_table, site_type_fields, {'isActive': [is_active]}, rename_cols=site_type_names)

#    yesterday = str(to_datetime(to_date) - DateOffset(days=1))

#    tel_sites_all = rd_sql(server1, database1, ass_table, ['SiteID'], {'MethodID': [3], 'AppliesFromDate': [to_date]})
#    tel_sites = rd_sql(server1, database1, ass_table, ['SiteID'], {'MethodID': [3, 4], 'AppliesFromDate': [to_date], 'MeasuredDate': [yesterday]}).SiteID

    ass1 = rd_sql(server1, database1, stmt=ass_stmt)
    ass1.columns = ass_names

    #######################################
    ### Process data

    ## Filter sites if needed
    if isinstance(sites_num, list):
        if isinstance(sites_num[0], str):
            sites = sites[sites.site.isin(sites_num)]
        else:
            raise ValueError('sites_num must be a list of strings')

    ## Periods by month
    p_set_site, p_set = min_max_trig(restr_day.SiteID.unique().tolist())

    ## Trigger flows
    restr_day['mon'] = restr_day['date'].dt.month
    restr2 = pd.merge(restr_day, p_set, on=['SiteID', 'band_num', 'mon'], how='left')

    ## crc counts
    crc_count = crc.groupby(['SiteID', 'band_num'])['crc'].count()
    crc_count.name = 'crc_count'

    ## Combine restr with crc
    restr_crc = pd.merge(restr2, crc_count.reset_index(), on=['SiteID', 'band_num'])

    ## Not only lowflow sites
#    lowflow_site = site_type[site_type.restr_type == 'LowFlow'].copy().drop('restr_type', axis=1)
    restr_crc = pd.merge(restr_crc, site_type, on=['SiteID', 'band_num'], how='inner')

    ## Add in how it was measured and when
    site_type1 = ass1[ass1.SiteID.isin(restr_day.SiteID.unique())].copy()
    tel_sites1 = site_type1[site_type1.flow_method == 3].SiteID
    tel_sites2 = sites.loc[sites.SiteID.isin(tel_sites1), 'site']
    corr_sites1 = telem_corr_sites(tel_sites2.astype('int32').tolist())
    corr_sites2 = sites.loc[sites.site.isin(corr_sites1), 'SiteID']
    site_type1.loc[site_type1.SiteID.isin(corr_sites2), 'flow_method'] = 5
    if to_date is not None:
        site_type1['days_since_flow_est'] = (pd.to_datetime(to_date) - site_type1.date).dt.days
    else:
        site_type1['days_since_flow_est'] = (pd.to_datetime(today1) - site_type1.date).dt.days
    if (hour1 >= 17) | (hour1 < 14):
        site_type1['days_since_flow_est'] = site_type1['days_since_flow_est'] - 1

    site_type2 = site_type1.replace({'flow_method': method_dict}).drop(['applies_date', 'date'], axis=1)

    sites1 = pd.merge(sites, site_type2, on='SiteID')
    num1 = pd.to_numeric(sites1.site, 'coerce')
    sites1.loc[num1.isnull(), 'flow_method'] = 'GW manual'

    ## Aggregate to site and date
    grp1 = restr_crc.sort_values('site_type').groupby(['SiteID', 'date'])
    crcs1 = grp1['crc_count'].sum()
    flow_site = grp1[['site_type', 'flow', 'mon']].first()
    crc_flow = pd.concat([flow_site, crcs1], axis=1).reset_index()

    restr_sites1 = pd.merge(crc_flow, p_set_site, on=['SiteID', 'mon'], how='left').drop('mon', axis=1)

    ## Add in the restriction categories
    restr_sites1['restr_category'] = 'No'
    restr_sites1.loc[(restr_sites1['flow'] <= restr_sites1['min_trig']), 'restr_category'] = 'Full'
    restr_sites1.loc[(restr_sites1['flow'] < restr_sites1['max_trig']) & (restr_sites1['flow'] > restr_sites1['min_trig']), 'restr_category'] = 'Partial'

    ## Add in site numbers
    restr_crc_sites = pd.merge(sites1, restr_crc.drop(['mon', 'min_allo_perc', 'max_allo_perc'], axis=1), on='SiteID').drop('SiteID', axis=1).sort_values(['waterway', 'date', 'min_trig'])
    restr_sites = pd.merge(sites1, restr_sites1, on='SiteID').drop('SiteID', axis=1).sort_values(['waterway', 'date', 'min_trig'])

    ## Correct for duplicate primary keys
    restr_crc_sites.drop_duplicates(['site', 'band_num', 'date'], keep='last', inplace=True)
    restr_sites.drop_duplicates(['site', 'date'], keep='last', inplace=True)

    ######################################
    ### Return

    return restr_sites, restr_crc_sites
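A usage sketch with a hypothetical one-month window; only sites on some kind of restriction are returned by default:

restr_sites, restr_crc_sites = low_flow_restr(from_date='2018-01-01', to_date='2018-01-31')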
Example #10
def min_max_trig(SiteID=None, is_active=True):
    """
    Function to determine the min/max triggers.

    Parameters
    ----------
    SiteID: list of str
        Lowflows internal site numbers for filtering.
    is_active: bool
        Should the output only return active sites/bands?

    Returns
    -------
    DataFrames
        Outputs two DataFrames. The first includes the min and max trigger levels for all bands per site, while the second has the min and max trigger levels for each site and band.
    """

    ######################################
    ### Parameters

    period_fields = ['SiteID', 'BandNo', 'PeriodNo', 'fmDate', 'toDate']
    period_names = ['SiteID', 'band_num', 'period', 'from_date', 'to_date']
    min_flow_fields = ['SiteID', 'BandNo', 'PeriodNo', 'Allocation', 'Flow']
    min_flow_names = ['SiteID', 'band_num', 'period', 'allo_perc', 'trig_level']
    site_type_fields = ['SiteID', 'BandNo']
    site_type_names = ['SiteID', 'band_num']

    ## Databases

    # daily restrictions
    server1 = 'SQL2012PROD03'
    database1 = 'LowFlows'

    # Internal site id, band, and min flow
    min_flow_table = 'LowFlowSiteBandPeriodAllocation'

    # period info
    period_table = 'LowFlowSiteBandPeriod'

    # site band active
    site_type_table = 'LowFlowSiteBand'

    #######################################
    ### Read in data

    periods0 = rd_sql(server1, database1, period_table, period_fields, rename_cols=period_names)

    if isinstance(SiteID, list):
        restr_val = rd_sql(server1, database1, min_flow_table, min_flow_fields, {'SiteID': SiteID}, rename_cols=min_flow_names)
    else:
        restr_val = rd_sql(server1, database1, min_flow_table, min_flow_fields, rename_cols=min_flow_names)

    site_type = rd_sql(server1, database1, site_type_table, site_type_fields, {'isActive': [is_active]}, rename_cols=site_type_names)

    #######################################
    ### Process data

    ## Fix duplicate zero allocations at zero flow
    grp1 = restr_val.groupby(['SiteID', 'band_num', 'period'])
    zeros1 = grp1.min()
    zeros2 = zeros1[zeros1.trig_level == 0]['allo_perc']
    zeros3 = pd.merge(restr_val, zeros2.reset_index(), on=['SiteID', 'band_num', 'period', 'allo_perc'])
    max_zero = zeros3.groupby(['SiteID', 'band_num', 'period', 'allo_perc'])['trig_level'].max()

    all_trig = restr_val.groupby(['SiteID', 'band_num', 'period', 'allo_perc'])['trig_level'].min()

    all_trig[max_zero.index] = max_zero

    ## Periods by month
    periods = pd.merge(periods0, site_type, on=['SiteID', 'band_num'])

    periods['from_mon'] = periods['from_date'].dt.month
    periods['to_mon'] = periods['to_date'].dt.month

    periods1 = periods.groupby(['SiteID', 'band_num', 'period']).apply(lambda x: pd.Series(np.arange(x.from_mon, x.to_mon + 1)))
    periods1.index = periods1.index.droplevel(3)
    periods1.name = 'mon'
    periods1 = periods1.reset_index().drop_duplicates(['SiteID', 'band_num', 'mon'])

    periods2 = pd.merge(periods1, all_trig.reset_index(), on=['SiteID', 'band_num', 'period']).drop('period', axis=1)
    p_min = periods2[~periods2.allo_perc.isin([103, 105, 106, 107, 108, 109])].groupby(['SiteID', 'band_num', 'mon']).min()
    p_min.columns = ['min_allo_perc', 'min_trig']
    p_max = periods2.groupby(['SiteID', 'band_num', 'mon']).max()
    p_max.columns = ['max_allo_perc', 'max_trig']

    p_min_site = p_min.reset_index().groupby(['SiteID', 'mon'])['min_trig'].min()
    p_max_site = p_max.reset_index().groupby(['SiteID', 'mon'])['max_trig'].max()
    p_set = pd.concat([p_min, p_max], axis=1).reset_index()
    p_set_site = pd.concat([p_min_site, p_max_site], axis=1).reset_index()

    return p_set_site, p_set
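A usage sketch; with no arguments the triggers for all active sites and bands are returned:

p_set_site, p_set = min_max_trig()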
Example #11
def ros_proc(allo_use, date_col='date', allo_col='allo', min_days=150, export_use_ros_path=None, export_ann_use_ros_path=None):
    """
    Function to process the reliability of supply data from the low flows db and merge it with the allo_use.
    """

    def resample2(df, min_days=20):
        df.index = df.date
        df_grp = df.resample('M')
        df_count = df_grp['band_restr'].count()
        df2 = (df_grp['band_restr'].sum() / df_count).round(2)
        df2[df_count < min_days] = np.nan
        df2[df2 > 100] = 100
        return df2

    def resample_ann(df):
        df.index = df.date
        df_grp = df.resample('A-JUN')
        df2 = df_grp.sum().round(2)
        df_mean = df_grp['band_restr'].mean().round(2)
        df_first = df_grp[['band', 'site']].first()
        df2['site'] = df_first['site']
        df2['band'] = df_first['band']
        df2['band_restr'] = df_mean
        return df2

    def resample1(df):
        df.index = df.date
        df_grp = df.resample('A-JUN')
        df2 = df_grp['up_allo_m3'].transform(sum).round(2)
        return df2

    #########################################
    ### Read in data
    sql1 = sql_arg()

    restr = rd_sql(**sql1.get_dict('lowflow_restr_day'))
    crc = rd_sql(**sql1.get_dict('lowflow_band_crc')).drop('active', axis=1)
    sites = rd_sql(**sql1.get_dict('lowflow_gauge')).drop(['active', 'DB'], axis=1)

    ## Rename columns

#    restr.columns = restr_names
#    sites.columns = sites_names
#    crc.columns = crc_names
#    usage_names = usage.columns.values.tolist()
#
#    list1 = [[usage_names[0]], ['dates'], usage_names[2:]]
#    usage.columns = [item for sublist in list1 for item in sublist]

    #######################################
    ### Prepare data

    allo_use1 = allo_use.copy()
    allo_use1[date_col] = pd.to_datetime(allo_use1[date_col])  # .loc[date_col] would assign a row, not the column
    allo_use1 = allo_use1[allo_use1[allo_col].notnull()]
    restr.loc[restr.band_restr > 100, 'band_restr'] = 100

    #########################################################
    ### Aggregate the daily low flow data to monthly

    restr2_grp1 = restr.groupby(['lowflow_id', 'band'])

    restr2_mon = restr2_grp1.apply(resample2).reset_index()
    restr2_mon = restr2_mon[restr2_mon.band_restr.notnull()]

    #####################################
    ### Combine all together

    ## crc to siteID and band

    crc_band_mon = pd.merge(crc, restr2_mon, on=['lowflow_id', 'band'])
    use_band = pd.merge(allo_use1, crc_band_mon, on=['crc', date_col])

    grp1_names = ['crc', date_col, 'take_type', 'allo_block', 'wap']
    use_band_grp1 = use_band.groupby(grp1_names)

    use_band_min = use_band_grp1.apply(lambda x: x[x.band_restr == x.band_restr.min()]).drop(grp1_names, axis=1).reset_index().drop(['level_5'], axis=1)

    use_band_min2 = use_band_min.drop_duplicates(subset=grp1_names)

    ## Calc new allocation based on flow restrictions

    use_band2 = use_band_min2.copy()
    use_band2['up_allo_m3'] = use_band_min2[allo_col]*use_band_min2['band_restr']*0.01
#    use_band2['up_allo_wqn_m3'] = use_band_min2['mon_vol_wqn']*use_band_min2['band_restr']*0.01

    use_band2.loc[:, 'up_allo_m3'] = use_band2.loc[:, 'up_allo_m3'].round(2)
#    use_band2.loc[:, 'up_allo_wqn_m3'] = use_band2.loc[:, 'up_allo_wqn_m3'].round(2)

    ## Merge other data together for saving

    use_ros1 = pd.merge(use_band2, sites, on=['lowflow_id'], how='left')

#    use_ros1 = merge(use_restr, allo, on=['crc', 'take_type', 'use_type'], how='left')

    ## Combine with all allocation data and update ann allo for all crc

#    use_ros1 = merge(usage, use_restr2[['crc', 'dates', 'band', 'band_restr', 'up_allo_m3', 'up_allo_wqn_m3', 'site']], on=['crc', 'dates'], how='left')

    use_ros1.loc[use_ros1.up_allo_m3.isnull(), 'up_allo_m3'] = use_ros1.loc[use_ros1.up_allo_m3.isnull(), allo_col]
#    use_ros1.loc[use_ros1.up_allo_wqn_m3.isnull(), 'up_allo_wqn_m3'] = use_ros1.loc[use_ros1.up_allo_wqn_m3.isnull(), 'mon_vol_wqn']

    use_ros1.loc[:, 'up_allo_m3'] = use_ros1.loc[:, 'up_allo_m3'].round(2)
#    use_ros1.loc[:, 'up_allo_wqn_m3'] = use_ros1.loc[:, 'up_allo_wqn_m3'].round(2)

    ## Aggregate to annual allocation for monthly dataframe
    res1 = use_ros1[['crc', date_col, 'take_type', 'allo_block', 'wap', 'up_allo_m3']].groupby(['crc', 'take_type', 'allo_block', 'wap']).apply(resample1).reset_index()
    res1.columns = ['crc', 'take_type', 'allo_block', 'wap', date_col, 'ann_up_allo']
    use_ros1b = pd.merge(use_ros1, res1, on=grp1_names, how='left')
    use_ros1b = use_ros1b[use_ros1b['ann_up_allo'] != 0].drop(['lowflow_id'], axis=1)
    use_ros1b.columns = ['crc', date_col, 'take_type', 'allo_block', 'wap', 'mon_allo_m3', 'mon_usage_m3', 'band', 'band_restr', 'mon_restr_allo_m3', 'gauge_num', 'ann_restr_allo_m3']

    ## Make annual aggregations
    ann_ros = use_ros1.groupby(['crc', 'take_type', 'allo_block', 'wap']).apply(resample_ann)
    ann_ros1 = ann_ros[(ann_ros[allo_col] != 0) & ann_ros[allo_col].notnull()].reset_index().drop(['lowflow_id'], axis=1)
    ann_ros1.columns = ['crc', 'take_type', 'allo_block', 'wap', date_col, 'ann_allo_m3', 'ann_usage_m3', 'band', 'band_restr', 'ann_restr_allo_m3', 'gauge_num']

    ## Save data

    if isinstance(export_use_ros_path, str):
        save_df(use_ros1b, export_use_ros_path, index=False)
        save_df(ann_ros1, export_ann_use_ros_path, index=False)

    return [use_ros1b, ann_ros1]
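A usage sketch; allo_use is a hypothetical monthly allocation/usage table that must contain at least the columns crc, date, take_type, allo_block, wap, and allo used above, and sql_arg() must resolve the lowflow table arguments:

allo_use = pd.read_csv('allo_use.csv')  # hypothetical input table
use_ros, ann_ros = ros_proc(allo_use, date_col='date', allo_col='allo')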
Example #12
def rd_henry(sites,
             from_date=None,
             to_date=None,
             agg_day=True,
             sites_by_col=False,
             min_filter=None,
             export=None):
    """
    Function to read in gaugings data from the "Henry DB". Hopefully, they keep this around for a while longer.

    Parameters
    ----------
    sites: list or str
        Either a list of site names or a file path string that contains a column of site names.
    from_date: str
        A date string for the start of the data (e.g. '2010-01-01').
    to_date: str
        A date string for the end of the data.
    agg_day: bool
        Should the gauging dates be aggregated down to the day (as opposed to keeping the hour and minute)? Gaugings are aggregated by the mean.
    sites_by_col: bool
        'False' does not make a single DateTimeIndex, rather it is indexed by site and date (long format). 'True' creates a single DateTimeIndex with the columns as gauging sites (will create many NAs).
    min_filter: int or None
        Minimum number of days required for the gaugings output.
    export: str or None
        Either a string path to a csv file or None.
    """
    def resample1(df):
        df.index = df.date
        df2 = df.resample('D').mean()
        return df2

    #### Fields and names for databases

    ## Query fields - Be sure to use single quotes for the names!!!

    fields = ['SiteNo', 'SampleDate', 'Flow']

    ## Equivalent short names for analyses - Use these names!!!

    names = ['site', 'date', 'flow']

    #### Databases

    ### Gaugings data

    server = 'SQL2012PROD03'
    database = 'DataWarehouse'

    table = 'DataWarehouse.dbo.F_SG_BGauging'
    where_col = 'SiteNo'

    ## Will change to the following!!! Or stay as a duplicate...

    # database1 = 'Hydstra'

    # table1 = 'Hydstra.dbo.GAUGINGS'

    ########################################
    ### Read in data

    sites1 = select_sites(sites).tolist()
    data = rd_sql(server=server,
                  database=database,
                  table=table,
                  col_names=fields,
                  where_col=where_col,
                  where_val=sites1).dropna()
    data.columns = names

    ### Aggregate duplicates

    data2 = data.groupby(['site', 'date']).mean().reset_index()

    ### Aggregate by day

    if agg_day:
        data3 = data2.groupby(['site']).apply(resample1).reset_index().dropna()
    else:
        data3 = data2

    ### Filter out sites with less than min_filter
    if min_filter is not None:
        count1 = data3.groupby('site')['flow'].count()
        count_index = count1[count1 >= min_filter].index
        data3 = data3[np.in1d(data3.site.values, count_index)]

    ### Select within date range
    if from_date is not None:
        data3 = data3[data3.date >= from_date]
    if to_date is not None:
        data3 = data3[data3.date <= to_date]

    ### reorganize data with sites as columns and dates as index

    if sites_by_col:
        data4 = data3.pivot(index='date', columns='site').xs('flow',
                                                             axis=1).round(4)
    else:
        data4 = data3.round(4)

    if isinstance(export, str):
        if sites_by_col:
            data4.to_csv(export)
        else:
            data4.to_csv(export, index=False)

    return data4
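A usage sketch with hypothetical gauging site numbers:

gaugings = rd_henry(['69607', '69618'], from_date='2010-01-01', agg_day=True)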
Example #13
def _proc_hydro_sql(self,
                    sites_sql_fun,
                    mtype_dict,
                    mtype,
                    sites=None,
                    from_date=None,
                    to_date=None,
                    qual_codes=None,
                    min_count=None,
                    buffer_dis=0,
                    resample_code=None,
                    period=1,
                    fun='mean'):
    """
    Convenience function for reading in mssql data from standardized hydro tables.
    """

    if isinstance(sites, gpd.GeoDataFrame):
        loc1 = sites_sql_fun()
        sites1 = sel_sites_poly(loc1, sites, buffer_dis).index.astype(str)
    else:
        sites1 = pd.Series(sites).astype(str)

    h1 = self.copy()
    if isinstance(mtype_dict, (list, tuple)):
        for i in range(len(mtype_dict)):
            site1 = mtype_dict[i]['site_col']

            sites_stmt = 'select distinct ' + site1 + ' from ' + mtype_dict[i]['table']
            sites2 = rd_sql(mtype_dict[i]['server'],
                            mtype_dict[i]['database'],
                            stmt=sites_stmt).astype(str)[site1]
            sites3 = sites2[sites2.isin(sites1)].astype(str).tolist()
            if not sites3:
                raise ValueError('No sites in database')
            if mtype_dict[i]['qual_col'] is None:
                qual_codes = None
            h1 = h1._rd_hydro_mssql(sites=sites3,
                                    mtype=mtype,
                                    from_date=from_date,
                                    to_date=to_date,
                                    qual_codes=qual_codes,
                                    min_count=min_count,
                                    resample_code=resample_code,
                                    period=period,
                                    fun=fun,
                                    **mtype_dict[i])
    elif isinstance(mtype_dict, dict):
        site1 = mtype_dict['site_col']

        sites_stmt = 'select distinct ' + site1 + ' from ' + mtype_dict['table']
        sites2 = rd_sql(mtype_dict['server'],
                        mtype_dict['database'],
                        stmt=sites_stmt).astype(str)[site1]
        sites3 = sites2[sites2.isin(sites1)].astype(str).tolist()
        if not sites3:
            raise ValueError('No sites in database')
        if mtype_dict['qual_col'] is None:
            qual_codes = None
        h1 = h1._rd_hydro_mssql(sites=sites3,
                                mtype=mtype,
                                from_date=from_date,
                                to_date=to_date,
                                qual_codes=qual_codes,
                                min_count=min_count,
                                resample_code=resample_code,
                                period=period,
                                fun=fun,
                                **mtype_dict)
    elif callable(mtype_dict):
        h1 = mtype_dict(h1,
                        sites=sites1,
                        mtype=mtype,
                        from_date=from_date,
                        to_date=to_date,
                        min_count=min_count)

    return h1
Example #14
def rd_hydrotel(sites,
                hydro_id,
                from_date=None,
                to_date=None,
                resample_code='D',
                period=1,
                val_round=3,
                min_count=None,
                pivot=False,
                export_path=None):
    """
    Function to extract time series data from the hydrotel database.

    Parameters
    ----------
    sites: list, array, dataframe, or str
        Site list or a str path to a single column csv file of site names/numbers.
    hydro_id: str
        'river / flow / rec / raw', 'aq / wl / rec / raw', 'atmos / precip / rec / raw', 'river / wl / rec / raw', or 'river / T / rec / raw'.
    from_date: str or None
        The start date in the format '2000-01-01'.
    to_date: str or None
        The end date in the format '2000-01-01'.
    resample_code : str
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period: int
        The number of resampling periods. e.g. period = 2 and resample = 'D' would be to resample the values over a 2 day period.
    val_round: int
        The number of decimals to round the values.
    pivot: bool
        Should the output be pivotted into wide format?
    export_path: str or None
        The path and file name to be saved.

    Returns
    -------
    Series or DataFrame
        A MultiIndex Pandas Series if pivot is False and a DataFrame if True
    """
    #### Import data and select the correct sites

    sites = select_sites(sites)
    if hydro_id == 'atmos / precip / rec / raw':
        site_ob1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'],
                          'ExtSysId',
                          sites.astype('int32').tolist())
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'],
                           'Site', site_ob1.Site.tolist())
        site_val1 = pd.merge(site_val0, site_ob1, on='Site')
    elif hydro_id in ['aq / wl / rec / raw', 'aq / T / rec / raw']:
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites)].copy()
        site_val1.loc[:, 'ExtSysId'] = site_val1.loc[:, 'Name']
    else:
        site_val1 = rd_sql(server, database, sites_tab, sites_col, 'ExtSysId',
                           sites.astype('int32').tolist())

    if site_val1.empty:
        raise ValueError('No site(s) in database')

    site_val1.loc[:, 'ExtSysId'] = pd.to_numeric(site_val1.loc[:, 'ExtSysId'],
                                                 errors='ignore')
    site_val1 = site_val1.drop_duplicates('ExtSysId')
    site_val = site_val1.Site.astype('int32').tolist()
    if isinstance(hydro_id, (list, np.ndarray, pd.Series)):
        hydro_ids = [hydro_ids_dict[i] for i in hydro_id]
    elif isinstance(hydro_id, str):
        hydro_ids = [hydro_ids_dict[hydro_id]]
    else:
        raise ValueError('hydro_id must be a str, list, ndarray, or Series.')
    hydro_ids_val = rd_sql(server, database, hydro_ids_tab, hydro_ids_col,
                           'Name', hydro_ids)

    where_col = {
        'Site': site_val,
        'ObjectVariant': hydro_ids_val.ObjectVariant.astype('int32').tolist(),
        'ObjectType': hydro_ids_val.ObjectType.astype('int32').tolist()
    }

    object_val1 = rd_sql(server, database, objects_tab, objects_col, where_col)
    if hydro_id == 'aq / wl / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Water Level']
    elif hydro_id == 'atmos / precip / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Rainfall']
    elif hydro_id == 'river / T / rec / raw':
        object_val1 = object_val1[object_val1.Name == 'Water Temperature']
    object_val = object_val1.Object.values.astype(int).tolist()

    #### Rearrange data
    point_val1 = rd_sql(server,
                        database,
                        points_tab,
                        points_col,
                        where_col='Object',
                        where_val=object_val)
    point_val = point_val1.Point.values.astype(int).tolist()

    #### Big merge
    comp_tab1 = pd.merge(site_val1, object_val1[['Object', 'Site']], on='Site')
    comp_tab2 = pd.merge(comp_tab1, point_val1, on='Object')
    comp_tab2.set_index('Point', inplace=True)

    #### Pull out the data
    ### Make SQL statement
    data1 = rd_sql_ts(server,
                      database,
                      data_tab,
                      'Point',
                      'DT',
                      'SampleValue',
                      resample_code,
                      period,
                      resample_dict[hydro_id],
                      val_round, {'Point': point_val},
                      from_date=from_date,
                      to_date=to_date,
                      min_count=min_count)['SampleValue']

    data1.index.names = ['site', 'time']
    data1.name = 'value'
    site_numbers = [
        comp_tab2.loc[i, 'ExtSysId'] for i in data1.index.levels[0]
    ]
    data1.index.set_levels(site_numbers, level='site', inplace=True)

    if pivot:
        data3 = data1.unstack(0)
    else:
        data3 = data1

    #### Export and return
    if export_path is not None:
        save_df(data3, export_path)

    return data3
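A usage sketch pulling a year of daily flow for a hypothetical site number:

flow = rd_hydrotel(['69607'], 'river / flow / rec / raw', from_date='2017-01-01', to_date='2017-12-31', pivot=True)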
Example #15
def restr_days(select, period='A-JUN', months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], min_sites_shp='S:/Surface Water/shared/GIS_base/vector/low_flows/min_flows_sites_Cant.shp', sites_col='ReferenceN', export=True, export_path='restr_days.csv'):
    """
    Function to determine the number of days on restriction per period according to the LowFlows database.

    Parameters
    ----------
    select: list or str
        Can either be a list of gauging site numbers or a shapefile polygon of an area that contains min flow sites.
    period: str
        Pandas time series code for the time period.
    months: list of int
        The specific months to include in the query.

    Returns
    -------
    DataFrame
    """

    ########################################
    ### Parameters

    ## Query fields - Be sure to use single quotes for the names!!!

    restr_fields = ['SiteID', 'RestrictionDate', 'BandNo', 'BandAllocation']
#    sites_fields = ['SiteID', 'RefDBaseKey','RecordNo', 'WellNo']
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    sites_fields = ['Siteid', 'RefDBaseKey']


    ## Equivalent short names for analyses - Use these names!!!

    restr_names = ['SiteID', 'dates', 'band_num', 'band_restr']
#    sites_names = ['SiteID', 'gauge_num', 'crc', 'wap']
    crc_names = ['SiteID', 'band_num', 'crc']
    sites_names = ['SiteID', 'gauge_num']

    ## Databases

    #statement = "SELECT * FROM "

    # daily restrictions

    server1 = 'SQL2012PROD03'
    database1 = 'LowFlows'

    restr_table = 'LowFlows.dbo.LowFlowSiteRestrictionDaily'
    restr_where = {'SnapshotType': ['Live']}

    # Sites info

    server2 = 'SQL2012PROD03'
    database2 = 'LowFlows'

    sites_table = 'LowFlows.dbo.vLowFlowSite'

    # crc, sites, and bands

    server3 = 'SQL2012PROD03'
    database3 = 'LowFlows'

    crc_table = 'LowFlows.dbo.vLowFlowConsents2'

    ########################################
    ## Make the sites selection
    if isinstance(select, str):
        if select.endswith('.shp'):
            sites3 = sel_sites_poly(select, min_sites_shp)[sites_col].unique()
        else:
            sites3 = pd.read_csv(select)[sites_col].unique()
    elif isinstance(select, (list, np.ndarray)):
        sites3 = select_sites(select)

    ########################################
    ### Read in data

    sites = rd_sql(server2, database2, sites_table, sites_fields)
    sites.columns = sites_names

    sites4 = sites.loc[sites.gauge_num.isin(sites3.astype(str)), 'SiteID'].unique().astype('int32').tolist()

    restr_where.update({'SiteID': sites4})

    restr = rd_sql(server1, database1, restr_table, restr_fields, restr_where).drop_duplicates(keep='last')
    restr.columns = restr_names

    crc = rd_sql(server3, database3, crc_table, crc_fields)
    crc.columns = crc_names

    ##################################
    ### Calculate the number of days on full and partial restriction

    ## Remove anything above 100%
    restr1 = restr[restr.band_restr <= 100].copy()  # .copy() avoids SettingWithCopyWarning on the .loc reassignments below

    ## Recategorize band restr
    partial_index = (restr1.band_restr > 0) & (restr1.band_restr < 100)

    restr1.loc[partial_index, 'band_restr'] = 101
    restr1.loc[restr1.band_restr == 100, 'band_restr'] = 103
    restr1.loc[restr1.band_restr == 0, 'band_restr'] = 102

    ## Restrict by months
    mon_index = restr1.dates.dt.month.isin(months)
    restr1 = restr1[mon_index]

    ## Do the work
    def sp_count(df, num):
        df.index = df.dates
        df_grp = df[df.band_restr == num].resample(period)
        df_count = df_grp['band_restr'].count()
        return df_count

    restr1_grp = restr1.groupby(['SiteID', 'band_num'])

    partial1 = restr1_grp.apply(sp_count, 101)
    partial1.name = 'partial'
    full1 = restr1_grp.apply(sp_count, 102)
    full1.name = 'full'
#    no1 = restr1_grp.apply(sp_count, 103)

    tot1 = pd.concat([partial1, full1], axis=1)
    tot1.index.names = ['SiteID', 'band_num', 'dates']
    if partial1.empty:
        tot1['partial'] = 0
    if full1.empty:
        tot1['full'] = 0
#    tot1.columns = ['partial', 'full']

    tot2 = tot1.reset_index()

    ## Relabel the sites to actually be site number
    sites2 = sites.drop_duplicates()
    tot3 = pd.merge(tot2, sites2, on='SiteID', how='left')
    tot3.loc[tot3.partial.isnull(), 'partial'] = 0
    tot3.loc[tot3.full.isnull(), 'full'] = 0

    tot3 = tot3[tot3.gauge_num.notnull()]

    ## Summarize the results
    restr2 = tot3[['gauge_num', 'band_num', 'dates', 'partial', 'full']]

    if export:
        restr2.to_csv(export_path, index=False)
    return restr2
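A usage sketch with a hypothetical site list; export=False skips the CSV write:

restr = restr_days(['69607'], period='A-JUN', export=False)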
Example #16
def rd_squalarc(sites,
                mtypes=None,
                from_date=None,
                to_date=None,
                convert_dtl=False,
                dtl_method=None,
                export=None):
    """
    Function to read in "squalarc" data. Which is atually stored in the mssql db.

    Parameters
    ----------
    sites: ndarray, list, or str
        The site names as a list, array, csv with the first column as the site names, or a polygon shapefile of the area of interest.
    mtypes: list or None
        A list of measurement type names to be in the output. Leaving it empty returns all mtypes.
    from_date: str
        A start date string in the format '2010-01-01'.
    to_date: str
        An end date string in the format '2011-01-01'.
    convert_dtl: bool
        Should values under the detection limit be converted to numeric?
    dtl_method: str
        The method to use to convert values under a detection limit to numeric. None or 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis, which includes an additional column dtl_ratio referring to the ratio of values under the detection limit.
    export: str or None
        Either None or a string path to a csv file.
    """

    #### Read in sites
    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, gpd.GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05',
                              'Squalarc',
                              'SITES',
                              col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry',
                                                               axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05',
                              'Wells',
                              'WELL_DETAILS',
                              col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry',
                                                               axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = pd.Series(sites1, name='site').astype(str).tolist()

    #### Extract the rest of the data
    if len(sites2) > 10000:
        n_chunks = int(np.ceil(len(sites2) * 0.0001))
        sites3 = [sites2[i::n_chunks] for i in range(n_chunks)]
        samples_tab = pd.DataFrame()
        for i in sites3:
            samples_tab1 = rd_sql('SQL2012PROD05',
                                  'Squalarc',
                                  '"SQL_SAMPLE_METHODS+"',
                                  col_names=[
                                      'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                      'Collect_Date', 'Collect_Time',
                                      'PA_NAME', 'PARAM_UNITS', 'SRESULT'
                                  ],
                                  where_col='Site_ID',
                                  where_val=i)
            samples_tab1.columns = [
                'site', 'sample_id', 'source', 'date', 'time', 'parameter',
                'units', 'val'
            ]
            samples_tab1.loc[:, 'source'] = samples_tab1.loc[:, 'source'].str.lower()
            samples_tab = pd.concat([samples_tab, samples_tab1])
    else:
        samples_tab = rd_sql('SQL2012PROD05',
                             'Squalarc',
                             '"SQL_SAMPLE_METHODS+"',
                             col_names=[
                                 'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                 'Collect_Date', 'Collect_Time', 'PA_NAME',
                                 'PARAM_UNITS', 'SRESULT'
                             ],
                             where_col='Site_ID',
                             where_val=sites2)
        samples_tab.columns = [
            'site', 'sample_id', 'source', 'date', 'time', 'parameter',
            'units', 'val'
        ]
        samples_tab.loc[:, 'source'] = samples_tab.loc[:, 'source'].str.lower()

    samples_tab2 = samples_tab.copy()
    num_test = pd.to_numeric(samples_tab2.loc[:, 'time'], 'coerce')
    samples_tab2.loc[num_test.isnull(), 'time'] = '0000'
    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('.', '')
    samples_tab2 = samples_tab2[samples_tab2.date.notnull()]
    #    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('9999', '0000')
    time1 = pd.to_datetime(samples_tab2.time, format='%H%M', errors='coerce')
    time1[time1.isnull()] = pd.Timestamp('2000-01-01 00:00:00')
    datetime1 = pd.to_datetime(
        samples_tab2.date.dt.date.astype(str) + ' ' +
        time1.dt.time.astype(str))
    samples_tab2.loc[:, 'date'] = datetime1
    samples_tab2 = samples_tab2.drop('time', axis=1)
    samples_tab2.loc[samples_tab2.val.isnull(), 'val'] = np.nan
    samples_tab2.loc[samples_tab2.val == 'N/A', 'val'] = np.nan

    #### Select within time range
    if isinstance(from_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] >= from_date]
    if isinstance(to_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] <= to_date]

    if mtypes is not None:
        mtypes1 = select_sites(mtypes)
        data = samples_tab2[samples_tab2.parameter.isin(mtypes1)].reset_index(
            drop=True)
    else:
        data = samples_tab2.reset_index(drop=True)

    #### Correct poorly typed in site names
    data.loc[:, 'site'] = data.loc[:, 'site'].str.upper().str.replace(' ', '')

    #### Convert detection limit values
    if convert_dtl:
        less1 = data['val'].str.match('<')
        if less1.sum() > 0:
            less1.loc[less1.isnull()] = False
            data2 = data.copy()
            data2.loc[less1, 'val'] = pd.to_numeric(
                data.loc[less1, 'val'].str.replace('<', ''), errors='coerce') * 0.5
            if dtl_method in (None, 'standard'):
                data3 = data2
            if dtl_method == 'trend':
                df1 = data2.loc[less1]
                count1 = data.groupby('parameter')['val'].count()
                count1.name = 'tot_count'
                count_dtl = df1.groupby('parameter')['val'].count()
                count_dtl.name = 'dtl_count'
                count_dtl_val = df1.groupby('parameter')['val'].nunique()
                count_dtl_val.name = 'dtl_val_count'
                combo1 = pd.concat([count1, count_dtl, count_dtl_val],
                                   axis=1,
                                   join='inner')
                combo1['dtl_ratio'] = (combo1['dtl_count'] /
                                       combo1['tot_count']).round(2)

                ## conditionals
                #            param1 = combo1[(combo1['dtl_ratio'] <= 0.4) | (combo1['dtl_ratio'] == 1)]
                #            under_40 = data['parameter'].isin(param1.index)
                param2 = combo1[(combo1['dtl_ratio'] > 0.4)
                                & (combo1['dtl_val_count'] != 1)]
                over_40 = data['parameter'].isin(param2.index)

                ## Calc detection limit values
                data3 = pd.merge(data,
                                 combo1['dtl_ratio'].reset_index(),
                                 on='parameter',
                                 how='left')
                data3.loc[:, 'val_dtl'] = data2['val']

                max_dtl_val = data2[over_40 & less1].groupby(
                    'parameter')['val'].transform('max')
                max_dtl_val.name = 'dtl_val_max'
                data3.loc[over_40 & less1, 'val_dtl'] = max_dtl_val
        else:
            data3 = data
    else:
        data3 = data

    #### Return and export
    if isinstance(export, str):
        data3.to_csv(export, encoding='utf-8', index=False)
    return data3
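A usage sketch; the site name and measurement type below are hypothetical:

wq_data = rd_squalarc(['SQ30313'], mtypes=['Nitrate Nitrogen'], from_date='2010-01-01', to_date='2015-12-31', convert_dtl=True)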
Example #17
def crc_band_flow(site_lst=None, crc_lst=None, names=False):
    """
    Function to determine the min flow conditions for each flow site, band, and crc.
    """

    ### Database parameters
    # crc, sites, and bands

    server = 'SQL2012PROD03'
    database = 'LowFlows'

    crc_table = 'vLowFlowConsents2'

    # id and gauge site

    gauge_table = 'LowFlowSite'

    # Internal site id, band, and min flow

    min_flow_table = 'LowFlowSiteBandPeriodAllocation'

    ## fields and associated column names
    crc_fields = ['SiteID', 'BandNo', 'RecordNo']
    crc_names = ['id', 'band', 'crc']

    if names:
        gauge_fields = ['SiteID', 'RefDBaseKey', 'Waterway', 'Location']
        gauge_names = ['id', 'site', 'Waterway', 'Location']
    else:
        gauge_fields = ['SiteID', 'RefDBaseKey']
        gauge_names = ['id', 'site']

    min_flow_fields = ['SiteID', 'BandNo', 'PeriodNo', 'Allocation', 'Flow']
    min_flow_names = ['id', 'band', 'mon', 'allo', 'min_flow']

    ### Load in data

    crc = rd_sql(server, database, crc_table, crc_fields)
    crc.columns = crc_names
    crc['crc'] = crc['crc'].str.strip()  # strip after renaming; the 'crc' column does not exist before the rename

    gauge = rd_sql(server, database, gauge_table, gauge_fields)
    gauge.columns = gauge_names

    min_flow = rd_sql(server, database, min_flow_table, min_flow_fields)
    min_flow.columns = min_flow_names

    ### Remove min flows that are not restricted
    min_flow1 = min_flow[min_flow.allo < 100]

    ### Lots of table merges!
    crc_min_flow = pd.merge(crc, min_flow1, on=['id', 'band'])
    crc_min_gauge = pd.merge(gauge, crc_min_flow, on='id').drop('id', axis=1)

    ### Query results
    if crc_lst is not None:
        crc_sel = select_sites(crc_lst)
        sel1 = crc_min_gauge[np.in1d(crc_min_gauge.crc, crc_sel)]
    else:
        sel1 = crc_min_gauge
    if site_lst is not None:
        site_sel = select_sites(site_lst).astype(str)
        sel2 = sel1[np.in1d(sel1.site, site_sel)]
    else:
        sel2 = sel1

    return sel2
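A usage sketch filtering to a hypothetical gauging site, with waterway/location names included:

min_flows = crc_band_flow(site_lst=['69607'], names=True)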
Example #18
def rec_catch_del(sites_shp, rec_streams_shp, rec_catch_shp, sites_col='site', buffer_dis=400, catch_output=None):
    """
    Catchment delineation using the REC streams and catchments.

    Parameters
    ----------
    sites_shp : str path or GeoDataFrame
        Points shapefile of the sites along the streams or the equivalent GeoDataFrame.
    rec_streams_shp : str path, GeoDataFrame, or dict
        str path to the REC streams shapefile, the equivalent GeoDataFrame, or a dict of parameters to read in an mssql table using the rd_sql function.
    rec_catch_shp : str path, GeoDataFrame, or dict
        str path to the REC catchment shapefile, the equivalent GeoDataFrame, or a dict of parameters to read in an mssql table using the rd_sql function.
    sites_col : str
        The column name of the site numbers in the sites_shp.
    catch_output : str or None
        The output polygon shapefile path of the catchment delineation.

    Returns
    -------
    GeoDataFrame
        Polygons
    """

    ### Modifications {NZREACH: {NZTNODE/NZFNODE: node # to change}}
    mods = {13053151: {'NZTNODE': 13055874}, 13048353: {'NZTNODE': 13048851}, 13048498: {'NZTNODE': 13048851}}

    ### Load data
    if isinstance(rec_catch_shp, gpd.GeoDataFrame):
        rec_catch = rec_catch_shp.copy()
    elif isinstance(rec_catch_shp, str):
        if rec_catch_shp.endswith('shp'):
            rec_catch = gpd.read_file(rec_catch_shp)
        else:
            raise ValueError('If rec_catch_shp is a str, then it must be a path to a shapefile.')
    elif isinstance(rec_catch_shp, dict):
        rec_catch = rd_sql(**rec_catch_shp)

    if isinstance(rec_streams_shp, gpd.GeoDataFrame):
        rec_streams = rec_streams_shp.copy()
    elif isinstance(rec_streams_shp, str):
        if rec_streams_shp.endswith('shp'):
            rec_streams = gpd.read_file(rec_streams_shp)
        else:
            raise ValueError('If rec_streams_shp is a str, then it must be a path to a shapefile.')
    elif isinstance(rec_streams_shp, dict):
        rec_streams = rd_sql(**rec_streams_shp)

    pts = select_sites(sites_shp)

    ### make mods
    for i in mods:
        rec_streams.loc[rec_streams['NZREACH'] == i, list(mods[i].keys())] = list(mods[i].values())  # list() needed; .loc does not accept dict views

    ### Find closest REC segment to points
    pts_seg = closest_line_to_pts(pts, rec_streams, line_site_col='NZREACH', buffer_dis=buffer_dis)
    nzreach = pts_seg.copy().NZREACH.unique()

    ### Find all upstream reaches
    reaches = find_upstream_rec(nzreach, rec_streams_shp=rec_streams)

    ### Extract associated catchments
    rec_catch = extract_rec_catch(reaches, rec_catch_shp=rec_catch)

    ### Aggregate individual catchments
    rec_shed = agg_rec_catch(rec_catch)
    rec_shed.columns = ['NZREACH', 'geometry', 'area']
    rec_shed1 = rec_shed.merge(pts_seg.drop('geometry', axis=1), on='NZREACH')

    ### Export and return
    if catch_output is not None:
        rec_shed1.to_file(catch_output)
    return rec_shed1
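A usage sketch with hypothetical shapefile paths:

catchments = rec_catch_del('flow_sites.shp', 'rec_streams.shp', 'rec_catchments.shp', sites_col='site', catch_output='catch_del.shp')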