Example #1
def rd_waps_geo(sites=None):
    if sites is not None:
        site_geo = rd_sql('SQL2012PROD05',
                          'Wells',
                          'WELL_DETAILS', ['WELL_NO', 'NZTMX', 'NZTMY'],
                          where_col='WELL_NO',
                          where_val=sites)
    else:
        site_geo = rd_sql('SQL2012PROD05', 'Wells', 'WELL_DETAILS',
                          ['WELL_NO', 'NZTMX', 'NZTMY'])

    site_geo.rename(columns={'WELL_NO': 'site'}, inplace=True)
    index1 = (site_geo.NZTMX > 1300000) & (site_geo.NZTMX < 1700000) & (
        site_geo.NZTMY > 5000000) & (site_geo.NZTMY < 5400000)
    site_geo0 = site_geo[index1]
    site_geo2 = xy_to_gpd(df=site_geo0,
                          id_col='site',
                          x_col='NZTMX',
                          y_col='NZTMY')
    #    site_geo2.loc[:, 'site'] = site_geo2.loc[:, 'site'].str.upper().str.replace(' ', '')
    #    site_geo2 = site_geo2.drop_duplicates()
    site_geo2.loc[:, 'site'] = to_numeric(site_geo2.loc[:, 'site'],
                                          errors='ignore')

    return (site_geo2.set_index('site'))
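A minimal usage sketch (assuming the rd_sql, xy_to_gpd, and to_numeric helpers used above are importable and the SQL2012PROD05 server is reachable; the well numbers are hypothetical):

# Hypothetical WELL_NO values -- substitute real wells from Wells.WELL_DETAILS.
waps_geo = rd_waps_geo(sites=['M35/0001', 'L36/0052'])
print(waps_geo.head())  # GeoDataFrame indexed by site with NZTM point geometry

# sites=None pulls every well that falls inside the NZTM bounding-box filter.
all_waps = rd_waps_geo()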
Example #2
def rd_sw_rain_geo(sites=None):
    if sites is not None:
        site_geo = rd_sql('SQL2012PROD05',
                          'Bgauging',
                          'RSITES',
                          col_names=['SiteNumber', 'NZTMX', 'NZTMY'],
                          where_col='SiteNumber',
                          where_val=sites)
    else:
        site_geo = rd_sql('SQL2012PROD05',
                          'Bgauging',
                          'RSITES',
                          col_names=['SiteNumber', 'NZTMX', 'NZTMY'])

    site_geo.columns = ['site', 'NZTMX', 'NZTMY']
    site_geo.loc[:, 'site'] = to_numeric(site_geo.loc[:, 'site'],
                                         errors='ignore')

    site_geo2 = xy_to_gpd(df=site_geo,
                          id_col='site',
                          x_col='NZTMX',
                          y_col='NZTMY')
    site_geo3 = site_geo2.loc[site_geo2.site > 0, :]
    site_geo3.loc[:, 'site'] = site_geo3.loc[:, 'site'].astype('int32')
    return (site_geo3.set_index('site'))
Example #3
def metconnect_id_loc(sites=None,
                      mc_server='SQL2012PROD03',
                      mc_db='MetConnect',
                      mc_site_table='RainFallPredictionSites',
                      mc_cols=['MetConnectID', 'SiteString', 'TidedaID'],
                      gis_server='SQL2012PROD05'):
    """
    Function to extract the metconnect id table with geometry location.

    Parameters
    ----------
    sites : list of int or None
        The site numbers to extract from the table, or None for all. Note that the function body does not currently apply this filter.

    Returns
    -------
    GeoDataFrame
    """

    ### Input parameters
    #    hy_server = 'SQL2012PROD05'
    #    hy_db = 'Hydrotel'
    #    pts_table = 'Points'
    #    objs_table = 'Objects'
    #    sites_table = 'Sites'
    #
    #    pts_cols = ['Point', 'Object']
    #    objs_cols = ['Object', 'Site']
    #    sites_cols = ['Site', 'ExtSysId']

    loc_db = 'Bgauging'
    loc_table = 'RSITES'

    loc_cols = ['SiteNumber', 'NZTMX', 'NZTMY']

    ## Import tables
    mc1 = rd_sql(mc_server, mc_db, mc_site_table, mc_cols)
    mc2 = mc1[~mc1.SiteString.str.startswith('M')]
    mc2.columns = ['MetConnectID', 'site_name', 'ExtSysId']
    mc2 = mc2[(mc2.MetConnectID != 7) & mc2.ExtSysId.notnull()]
    mc2.loc[:, 'ExtSysId'] = mc2.loc[:, 'ExtSysId'].astype(int)

    #    hy_pts = rd_sql(hy_server, hy_db, pts_table, pts_cols, 'Point', mc2.Point.tolist())
    #    hy_objs = rd_sql(hy_server, hy_db, objs_table, objs_cols, 'Object', hy_pts.Object.tolist())
    #    hy_sites = rd_sql(hy_server, hy_db, sites_table, sites_cols, 'Site', hy_objs.Site.tolist())
    #    hy_sites['ExtSysId'] = to_numeric(hy_sites['ExtSysId'])
    hy_loc = rd_sql(gis_server, loc_db, loc_table, loc_cols, 'SiteNumber',
                    mc2.ExtSysId.tolist())
    hy_loc.columns = ['ExtSysId', 'x', 'y']

    #    t1 = merge(mc2, hy_pts, on='Point')
    #    t2 = merge(t1, hy_objs, on='Object')
    #    t3 = merge(t2, hy_sites, on='Site')
    t4 = merge(mc2, hy_loc, on='ExtSysId')

    hy_xy = xy_to_gpd('MetConnectID', 'x', 'y', t4)

    return (hy_xy)
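A short usage sketch (assuming the default servers and tables above are reachable):

# Returns a GeoDataFrame of non-'M' MetConnect sites with point geometry
# built from the Bgauging RSITES NZTM coordinates.
mc_sites = metconnect_id_loc()
print(mc_sites.head())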
Example #4
def flow_sites_to_shp(sites='All',
                      min_flow_only=False,
                      export=False,
                      export_path='sites.shp'):
    """
    Function to create a geopandas/shapefile from flow sites.
    """

    ### Import from databases
    if min_flow_only:
        min_flow_sites = rd_sql('SQL2012PROD05',
                                'Wells',
                                '"vMinimumFlowSites+Consent+Well_classes"',
                                col_names=[
                                    'RefDbase', 'RefDbaseKey',
                                    'restrictionType', 'RecordNo', 'WellNo'
                                ],
                                where_col='RefDbase',
                                where_val=['Gauging', 'Hydrotel'])
        min_flow_sites.columns = ['type', 'site', 'restr', 'crc', 'wap']
        min_flow_sites['site'] = min_flow_sites['site'].astype(int)
        min_flow_sites = min_flow_sites[min_flow_sites.restr == 'LowFlow']

    site_geo = rd_sql('SQL2012PROD05',
                      'GIS',
                      'vGAUGING_NZTM',
                      col_names=['SiteNumber', 'RIVER', 'SITENAME'],
                      geo_col=True)
    site_geo.columns = ['site', 'river', 'site_name', 'geometry']
    site_geo['river'] = site_geo.river.apply(lambda x: x.title())
    site_geo['site_name'] = site_geo.site_name.apply(lambda x: x.title())
    site_geo['site_name'] = site_geo.site_name.apply(
        lambda x: x.replace(' (Recorder)', ''))
    site_geo['site_name'] = site_geo.site_name.apply(
        lambda x: x.replace('Sh', 'SH'))
    site_geo['site_name'] = site_geo.site_name.apply(
        lambda x: x.replace('Ecs', 'ECS'))

    ### Select sites
    if isinstance(sites, str):
        if sites == 'All':
            sites_sel_geo = site_geo
        elif sites.endswith('.shp'):
            poly = read_file(sites)
            sites_sel_geo = sel_sites_poly(poly, site_geo)
        else:
            raise ValueError('If sites is a str, then it must be a shapefile.')
    else:
        sites_sel = select_sites(sites).astype('int32')
        sites_sel_geo = site_geo[in1d(site_geo.site, sites_sel)]
    if min_flow_only:
        sites_sel_geo = sites_sel_geo[in1d(sites_sel_geo.site,
                                           min_flow_sites.site.values)]

    ### Export and return
    if export:
        sites_sel_geo.to_file(export_path)
    return (sites_sel_geo)
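Usage sketches (the shapefile path is hypothetical; behaviour follows from the code above):

# All gauging sites, written to a shapefile.
all_sites = flow_sites_to_shp(export=True, export_path='all_sites.shp')

# Only minimum-flow sites inside a (hypothetical) catchment polygon.
lf_sites = flow_sites_to_shp(sites='catchment.shp', min_flow_only=True)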
Example #5
def rd_site_geo_attr(sites):
    geo_attr = rd_sql('SQL2012DEV01',
                      'Hydro',
                      'site_geo_attr',
                      where_col='site',
                      where_val=sites)
    return (geo_attr)
Example #6
def _rd_hydro_geo_mssql(self, server, database, table, geo_dict):
    """
    Function to select sites based on the geo attributes.
    """

    sites1 = rd_sql(server, database, table, 'site', geo_dict)
    sites2 = sites1.site.astype(str).values.tolist()
    return(sites2)
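Judging from how rd_sql is called with a dict where_col elsewhere in these examples (see Example #12), geo_dict appears to map column names to lists of allowed values. A hedged sketch with hypothetical attribute columns and a hypothetical parent object:

# Hypothetical geo-attribute filter -- the real column names may differ.
geo_dict = {'catchment': ['Selwyn'], 'site_type': ['river']}
sites = hydro_obj._rd_hydro_geo_mssql('SQL2012DEV01', 'Hydro',
                                      'site_geo_attr', geo_dict)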
Example #7
def poly_import(irr_type_dict, paw_dict, paw_ratio=0.67):
    """
    Function to import polygon input data. At the moment, these include irrigation type and PAW. Inputs are dictionaries that reference either an MSSQL table with a geometry column or a shapefile. If the dictionary references an sql table then the keys should be 'server', 'database', 'table', and 'column'. If the dictionary references a shapefile, then the keys should be 'shp' and 'column'. All values should be strings.
    """

    if not all([isinstance(irr_type_dict, dict), isinstance(paw_dict, dict)]):
        raise TypeError("'irr_type_dict' and 'paw_dict' must be dictionaries.")

    ## Both dictionaries need a 'column' key (paw_dict['column'] is used below).
    for d in (irr_type_dict, paw_dict):
        if 'column' in d.keys():
            if not isinstance(d['column'], str):
                raise TypeError("The key 'column' must be a string.")
        else:
            raise TypeError("The key 'column' must be in both dictionaries.")

    if 'shp' in irr_type_dict.keys():
        if not isinstance(irr_type_dict['shp'], str):
            raise TypeError(
                "If 'shp' is in the dict, then it must be a string path to a shapefile."
            )
        irr1 = read_file(
            irr_type_dict['shp'])[[irr_type_dict['column'], 'geometry']]
    else:
        irr1 = rd_sql(irr_type_dict['server'],
                      irr_type_dict['database'],
                      irr_type_dict['table'], [irr_type_dict['column']],
                      geo_col=True)
    irr1.rename(columns={irr_type_dict['column']: 'irr_type'}, inplace=True)

    if 'shp' in paw_dict.keys():
        if not isinstance(paw_dict['shp'], str):
            raise TypeError(
                "If 'shp' is in the dict, then it must be a string path to a shapefile."
            )
        paw1 = read_file(paw_dict['shp'])[[paw_dict['column'], 'geometry']]
    else:
        paw1 = rd_sql(paw_dict['server'],
                      paw_dict['database'],
                      paw_dict['table'], [paw_dict['column']],
                      geo_col=True)
    paw1.rename(columns={paw_dict['column']: 'paw'}, inplace=True)
    paw1.loc[:, 'paw'] = paw1.loc[:, 'paw'] * paw_ratio

    return (irr1, paw1)
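The two accepted dictionary shapes, following the docstring (all server, table, column, and file names below are hypothetical):

# MSSQL-backed polygon layer: 'server', 'database', 'table', and 'column'.
irr_dict = {'server': 'SQL2012PROD05', 'database': 'GIS',
            'table': 'IRRIGATION_TYPES', 'column': 'irr_type'}

# Shapefile-backed polygon layer: 'shp' and 'column'.
paw_dict = {'shp': 'paw_polygons.shp', 'column': 'PAW_MID'}

irr1, paw1 = poly_import(irr_dict, paw_dict, paw_ratio=0.67)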
Example #8
def _proc_hydro_sql(self, sites_sql_fun, mtype_dict, mtype, sites=None, from_date=None, to_date=None, qual_codes=None, min_count=None, buffer_dis=0, resample_code=None, period=1, fun='mean'):
    """
    Convenience function for reading in mssql data from standardized hydro tables.
    """

    if isinstance(sites, GeoDataFrame):
        loc1 = sites_sql_fun()
        sites1 = sel_sites_poly(loc1, sites, buffer_dis).index.astype(str)
    else:
        sites1 = Series(sites).astype(str)

    h1 = self.copy()
    if isinstance(mtype_dict, (list, tuple)):
        for i in range(len(mtype_dict)):
            site1 = mtype_dict[i]['site_col']

            sites_stmt = 'select distinct ' + site1 + ' from ' + mtype_dict[i]['table']
            sites2 = rd_sql(mtype_dict[i]['server'], mtype_dict[i]['database'], stmt=sites_stmt).astype(str)[site1]
            sites3 = sites2[sites2.isin(sites1)].astype(str).tolist()
            if not sites3:
                raise ValueError('No sites in database')
            if mtype_dict[i]['qual_col'] is None:
                qual_codes = None
            h1 = h1._rd_hydro_mssql(sites=sites3, mtype=mtype, from_date=from_date, to_date=to_date, qual_codes=qual_codes, min_count=min_count, resample_code=resample_code, period=period, fun=fun, **mtype_dict[i])
    elif isinstance(mtype_dict, dict):
        site1 = mtype_dict['site_col']

        sites_stmt = 'select distinct ' + site1 + ' from ' + mtype_dict['table']
        sites2 = rd_sql(mtype_dict['server'], mtype_dict['database'], stmt=sites_stmt).astype(str)[site1]
        sites3 = sites2[sites2.isin(sites1)].astype(str).tolist()
        if not sites3:
            raise ValueError('No sites in database')
        if mtype_dict['qual_col'] is None:
            qual_codes = None
        h1 = h1._rd_hydro_mssql(sites=sites3, mtype=mtype, from_date=from_date, to_date=to_date, qual_codes=qual_codes, min_count=min_count, resample_code=resample_code, period=period, fun=fun, **mtype_dict)
    elif callable(mtype_dict):
        h1 = mtype_dict(h1, sites=sites1, mtype=mtype, from_date=from_date, to_date=to_date, min_count=min_count)

    return(h1)
Example #9
def rd_niwa_geo():
    site_geo = rd_sql('SQL2012PROD05',
                      'GIS',
                      'NIWA_NZTM_NIWA_STATIONS',
                      col_names=['gml_id'],
                      geo_col=True)
    site_geo.loc[:, 'gml_id'] = site_geo.loc[:, 'gml_id'].str.replace(
        'stations.', '')
    site_geo.loc[:, 'gml_id'] = to_numeric(site_geo.loc[:, 'gml_id'],
                                           errors='coerce')
    site_geo.columns = ['site', 'geometry']

    return (site_geo.set_index('site'))
Example #10
def pts_sql_join(pts, sql_codes):
    """
    Function to perform spatial joins on sql tables that are polygon layers.
    """

    sql1 = sql_arg()

    for i in sql_codes:
        poly = rd_sql(**sql1.get_dict(i))
        pts = sjoin(pts, poly, how='left', op='within').drop('index_right',
                                                             axis=1)

    return (pts)
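A usage sketch (the codes are hypothetical keys understood by the sql_arg helper; pts is a point GeoDataFrame):

# Each code pulls one polygon layer via sql_arg/rd_sql and left-joins its
# attributes onto the points that fall within the polygons.
pts2 = pts_sql_join(pts, ['catch_gis', 'cwms_gis'])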
Example #11
def rd_henry(sites, from_date=None, to_date=None, agg_day=True, sites_by_col=False, min_filter=None, export=False,
             export_path='gauge_flows.csv'):
    """
    Function to read in gaugings data from the "Henry DB". Hopefully, they keep this around for a while longer.

    Arguments:\n
    sites -- Either a list of site names or a file path string that contains a column of site names.\n
    from_date -- A date string for the start of the data (e.g. '2010-01-01').\n
    to_date -- A date string for the end of the data.\n
    agg_day -- Should the gauging dates be aggregated down to the day, as opposed to keeping the hour and minute? Gaugings are aggregated by the mean.\n
    sites_by_col -- 'False' does not make a time series; rather, it is organized by site, date, and gauging. 'True' creates a time series with the columns as gauging sites (will create many NAs).\n
    min_filter -- Minimum number of gaugings required for a site to be kept in the output.\n
    export -- Should the output be exported to csv?\n
    export_path -- The csv file path for the export.

    def resample1(df):
        df.index = df.date
        df2 = df.resample('D').mean()
        return (df2)

    #### Fields and names for databases

    ## Query fields - Be sure to use single quotes for the names!!!

    fields = ['SiteNo', 'SampleDate', 'Flow']

    ## Equivelant short names for analyses - Use these names!!!

    names = ['site', 'date', 'flow']

    #### Databases

    ### Gaugings data

    server = 'SQL2012PROD03'
    database = 'DataWarehouse'

    table = 'DataWarehouse.dbo.F_SG_BGauging'
    where_col = 'SiteNo'

    ## Will change to the following!!! Or stay as a duplicate...

    # database1 = 'Hydstra'

    # table1 = 'Hydstra.dbo.GAUGINGS'

    ########################################
    ### Read in data

    sites1 = select_sites(sites).tolist()
    data = rd_sql(server=server, database=database, table=table, col_names=fields, where_col=where_col,
                  where_val=sites1).dropna()
    data.columns = names

    ### Aggregate duplicates

    data2 = data.groupby(['site', 'date']).mean().reset_index()

    ### Aggregate by day

    if agg_day:
        data3 = data2.groupby(['site']).apply(resample1).reset_index().dropna()
    else:
        data3 = data2

    ### Filter out sites with less than min_filter
    if min_filter is not None:
        count1 = data3.groupby('site')['flow'].count()
        count_index = count1[count1 >= min_filter].index
        data3 = data3[in1d(data3.site.values, count_index)]

    ### Select within date range
    if from_date is not None:
        data3 = data3[data3.date >= from_date]
    if to_date is not None:
        data3 = data3[data3.date <= to_date]

    ### reorganize data with sites as columns and dates as index

    if sites_by_col:
        data4 = data3.pivot(index='date', columns='site').xs('flow', axis=1).round(4)
    else:
        data4 = data3.round(4)

    if export:
        if sites_by_col:
            data4.to_csv(export_path)
        else:
            data4.to_csv(export_path, index=False)

    return (data4)
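Usage sketches with hypothetical site numbers:

# Daily-aggregated gaugings for two (hypothetical) sites over 2015.
flows = rd_henry([68001, 69505], from_date='2015-01-01', to_date='2015-12-31')

# Wide format, keeping only sites with at least 10 gaugings.
flows_wide = rd_henry([68001, 69505], sites_by_col=True, min_filter=10)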
Example #12
def rd_hydrotel(sites, mtype='river_flow_cont_raw', from_date=None, to_date=None, resample_code='D', period=1,
                fun='mean', val_round=3, min_count=None, pivot=False, export_path=None):
    """
    Function to extract time series data from the hydrotel database.

    Parameters
    ----------
    sites : list, array, dataframe, or str
        Site list or a str path to a single column csv file of site names/numbers.
    mtype : str
        One of 'river_flow_cont_raw', 'aq_wl_cont_raw', 'atmos_precip_cont_raw', 'river_wl_cont_raw', or 'river_wtemp_cont_raw' (the keys of mtypes_dict below).
    from_date : str or None
        The start date in the format '2000-01-01'.
    to_date : str or None
        The end date in the format '2000-01-01'.
    resample_code : str
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period : int
        The number of resampling periods. e.g. period = 2 and resample = 'D' would be to resample the values over a 2 day period.
    fun : str
        The resampling function. i.e. mean, sum, count, min, or max. No median yet...
    val_round : int
        The number of decimals to round the values.
    pivot : bool
        Should the output be pivoted into wide format?
    export_path : str or None
        The path and file name to be saved.

    Returns
    -------
    Series or DataFrame
        A MultiIndex Pandas Series if pivot is False and a DataFrame if True
    """

    #### mtypes dict
    mtypes_dict = {'river_flow_cont_raw': 'Flow Rate', 'aq_wl_cont_raw': 'Water Level',
                   'atmos_precip_cont_raw': 'Rainfall Depth', 'river_wl_cont_raw': 'Water Level',
                   'river_wtemp_cont_raw': 'Water Temperature'}

    #### Database parameters
    server = 'SQL2012PROD05'
    database = 'Hydrotel'

    data_tab = 'Hydrotel.dbo.Samples'
    points_tab = 'Hydrotel.dbo.Points'
    objects_tab = 'Hydrotel.dbo.Objects'
    mtypes_tab = 'Hydrotel.dbo.ObjectVariants'
    sites_tab = 'Hydrotel.dbo.Sites'

    data_col = ['Point', 'DT', 'SampleValue']
    points_col = ['Point', 'Object']
    objects_col = ['Object', 'Site', 'Name', 'ObjectVariant']
    mtypes_col = ['ObjectVariant', 'Name']
    sites_col = ['Site', 'Name', 'ExtSysId']

    #### Import data and select the correct sites

    sites = select_sites(sites)
    if mtype == 'atmos_precip_cont_raw':
        site_ob1 = rd_sql(server, database, objects_tab, ['Site', 'ExtSysId'], 'ExtSysId',
                          sites.astype('int32').tolist())
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'], 'Site', site_ob1.Site.tolist())
        site_val1 = merge(site_val0, site_ob1, on='Site')
    elif mtype == 'aq_wl_cont_raw':
        site_val0 = rd_sql(server, database, sites_tab, ['Site', 'Name'])
        site_val0.loc[:, 'Name'] = site_val0.apply(lambda x: x.Name.split(' ')[0], axis=1)
        site_val1 = site_val0[site_val0.Name.isin(sites)]
        site_val1.loc[:, 'ExtSysId'] = site_val1.loc[:, 'Name']
    else:
        site_val1 = rd_sql(server, database, sites_tab, sites_col, 'ExtSysId', sites.astype('int32').tolist())

    if site_val1.empty:
        raise ValueError('No site(s) in database')

    site_val1.loc[:, 'ExtSysId'] = to_numeric(site_val1.loc[:, 'ExtSysId'], errors='ignore')
    site_val = site_val1.Site.astype('int32').tolist()
    if isinstance(mtype, (list, ndarray, Series)):
        mtypes = [mtypes_dict[i] for i in mtype]
    elif isinstance(mtype, str):
        mtypes = [mtypes_dict[mtype]]
    else:
        raise ValueError('mtype must be a str, list, ndarray, or Series.')
    mtypes_val = rd_sql(server, database, mtypes_tab, mtypes_col, 'Name', mtypes)

    where_col = {'Site': site_val, 'ObjectVariant': mtypes_val.ObjectVariant.astype('int32').tolist()}

    object_val1 = rd_sql(server, database, objects_tab, objects_col, where_col)
    ## Filter on the current mtype names (the old 'gwl_tel'/'precip_tel' names
    ## can never match the mtypes_dict keys above).
    if mtype == 'aq_wl_cont_raw':
        object_val1 = object_val1[object_val1.Name == 'Water Level']
    if mtype == 'atmos_precip_cont_raw':
        object_val1 = object_val1[object_val1.Name == 'Rainfall']
    object_val = object_val1.Object.values.astype(int).tolist()

    #### Rearrange data
    point_val1 = rd_sql(server, database, points_tab, points_col, where_col='Object', where_val=object_val)
    point_val = point_val1.Point.values.astype(int).tolist()

    #### Big merge
    comp_tab1 = merge(site_val1, object_val1[['Object', 'Site']], on='Site')
    comp_tab2 = merge(comp_tab1, point_val1, on='Object')
    comp_tab2.set_index('Point', inplace=True)

    #### Pull out the data
    ### Make SQL statement
    data1 = rd_sql_ts(server, database, data_tab, 'Point', 'DT', 'SampleValue', resample_code, period, fun, val_round,
                      {'Point': point_val}, from_date=from_date, to_date=to_date, min_count=min_count)['SampleValue']

    data1.index.names = ['site', 'time']
    data1.name = 'value'
    site_numbers = [comp_tab2.loc[i, 'ExtSysId'] for i in data1.index.levels[0]]
    data1.index.set_levels(site_numbers, level='site', inplace=True)

    if pivot:
        data3 = data1.unstack(0)
    else:
        data3 = data1

    #### Export and return
    if export_path is not None:
        save_df(data3, export_path)

    return data3
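A usage sketch with hypothetical site numbers (mtype must be one of the mtypes_dict keys):

# Daily mean river flow, pivoted so each site is a column.
flow = rd_hydrotel([68001, 69505], mtype='river_flow_cont_raw',
                   from_date='2017-01-01', to_date='2017-06-30',
                   resample_code='D', fun='mean', pivot=True)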
Example #13
def rd_squalarc(sites,
                mtypes=None,
                from_date=None,
                to_date=None,
                convert_dtl=False,
                dtl_method=None,
                export_path=None):
    """
    Function to read in "squalarc" data, which is actually stored in the MSSQL db.

    Arguments:\n
    sites -- The site names as a list, array, csv with the first column as the site names, or a polygon shapefile of the area of interest.\n
    mtypes -- A list of measurement type names to be in the output. Leaving it empty returns all mtypes.\n
    from_date -- A start date string in the form of '2010-01-01'.\n
    to_date -- An end date string in the form of '2011-01-01'.\n
    convert_dtl -- Should values under the detection limit be converted to numeric?\n
    dtl_method -- The method to use to convert values under a detection limit to numeric. None or 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis which includes an additional column, dtl_ratio, referring to the ratio of values under the detection limit.
    """

    #### Read in sites
    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05',
                              'Squalarc',
                              'SITES',
                              col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry',
                                                               axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05',
                              'Wells',
                              'WELL_DETAILS',
                              col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry',
                                                               axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = Series(sites1, name='site').astype(str).tolist()

    #### Extract the rest of the data
    if len(sites2) > 10000:
        n_chunks = int(ceil(len(sites2) * 0.0001))
        sites3 = [sites2[i::n_chunks] for i in range(n_chunks)]
        samples_tab = DataFrame()
        for i in sites3:
            samples_tab1 = rd_sql('SQL2012PROD05',
                                  'Squalarc',
                                  '"SQL_SAMPLE_METHODS+"',
                                  col_names=[
                                      'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                      'Collect_Date', 'Collect_Time',
                                      'PA_NAME', 'PARAM_UNITS', 'SRESULT'
                                  ],
                                  where_col='Site_ID',
                                  where_val=i)
            samples_tab1.columns = [
                'site', 'sample_id', 'source', 'date', 'time', 'parameter',
                'units', 'val'
            ]
            samples_tab1.loc[:, 'source'] = samples_tab1.loc[:, 'source'].str.lower()
            samples_tab = concat([samples_tab, samples_tab1])
    else:
        samples_tab = rd_sql('SQL2012PROD05',
                             'Squalarc',
                             '"SQL_SAMPLE_METHODS+"',
                             col_names=[
                                 'Site_ID', 'SAMPLE_NO', 'ME_TYP',
                                 'Collect_Date', 'Collect_Time', 'PA_NAME',
                                 'PARAM_UNITS', 'SRESULT'
                             ],
                             where_col='Site_ID',
                             where_val=sites2)
        samples_tab.columns = [
            'site', 'sample_id', 'source', 'date', 'time', 'parameter',
            'units', 'val'
        ]
        samples_tab.loc[:, 'source'] = samples_tab.loc[:, 'source'].str.lower()

    samples_tab2 = samples_tab.copy()
    num_test = to_numeric(samples_tab2.loc[:, 'time'], 'coerce')
    samples_tab2.loc[num_test.isnull(), 'time'] = '0000'
    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('.', '')
    samples_tab2 = samples_tab2[samples_tab2.date.notnull()]
    #    samples_tab2.loc[:, 'time'] = samples_tab2.loc[:, 'time'].str.replace('9999', '0000')
    time1 = to_datetime(samples_tab2.time, format='%H%M', errors='coerce')
    time1[time1.isnull()] = Timestamp('2000-01-01 00:00:00')
    datetime1 = to_datetime(
        samples_tab2.date.dt.date.astype(str) + ' ' +
        time1.dt.time.astype(str))
    samples_tab2.loc[:, 'date'] = datetime1
    samples_tab2 = samples_tab2.drop('time', axis=1)
    samples_tab2.loc[samples_tab2.val.isnull(), 'val'] = nan
    samples_tab2.loc[samples_tab2.val == 'N/A', 'val'] = nan

    #### Select within time range
    if isinstance(from_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] >= from_date]
    if isinstance(to_date, str):
        samples_tab2 = samples_tab2[samples_tab2['date'] <= to_date]

    if mtypes is not None:
        mtypes1 = select_sites(mtypes)
        data = samples_tab2[samples_tab2.parameter.isin(mtypes1)].reset_index(
            drop=True)
    else:
        data = samples_tab2.reset_index(drop=True)

    #### Correct poorly typed in site names
    data.loc[:, 'site'] = data.loc[:, 'site'].str.upper().str.replace(' ', '')

    #### Convert detection limit values
    if convert_dtl:
        less1 = data['val'].str.match('<')
        if less1.sum() > 0:
            less1.loc[less1.isnull()] = False
            data2 = data.copy()
            data2.loc[less1, 'val'] = to_numeric(
                data.loc[less1, 'val'].str.replace('<', ''),
                errors='coerce') * 0.5
            if dtl_method in (None, 'standard'):
                data3 = data2
            if dtl_method == 'trend':
                df1 = data2.loc[less1]
                count1 = data.groupby('parameter')['val'].count()
                count1.name = 'tot_count'
                count_dtl = df1.groupby('parameter')['val'].count()
                count_dtl.name = 'dtl_count'
                count_dtl_val = df1.groupby('parameter')['val'].nunique()
                count_dtl_val.name = 'dtl_val_count'
                combo1 = concat([count1, count_dtl, count_dtl_val],
                                axis=1,
                                join='inner')
                combo1['dtl_ratio'] = (combo1['dtl_count'] /
                                       combo1['tot_count']).round(2)

                ## conditionals
                #            param1 = combo1[(combo1['dtl_ratio'] <= 0.4) | (combo1['dtl_ratio'] == 1)]
                #            under_40 = data['parameter'].isin(param1.index)
                param2 = combo1[(combo1['dtl_ratio'] > 0.4)
                                & (combo1['dtl_val_count'] != 1)]
                over_40 = data['parameter'].isin(param2.index)

                ## Calc detection limit values
                data3 = merge(data,
                              combo1['dtl_ratio'].reset_index(),
                              on='parameter',
                              how='left')
                data3.loc[:, 'val_dtl'] = data2['val']

                max_dtl_val = data2[over_40 & less1].groupby(
                    'parameter')['val'].transform('max')
                max_dtl_val.name = 'dtl_val_max'
                data3.loc[over_40 & less1, 'val_dtl'] = max_dtl_val
        else:
            data3 = data
    else:
        data3 = data

    #### Return and export
    if isinstance(export_path, str):
        data3.to_csv(export_path, encoding='utf-8', index=False)
    return (data3)
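A usage sketch (site names and the measurement type are hypothetical):

# Nitrate results for two (hypothetical) sites, with values under the
# detection limit converted to half the limit.
wq = rd_squalarc(['SQ30147', 'SQ30148'], mtypes=['Nitrate Nitrogen'],
                 from_date='2010-01-01', to_date='2018-01-01',
                 convert_dtl=True, dtl_method='standard')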
Example #14
def rd_ht_wq_data(hts,
                  sites=None,
                  mtypes=None,
                  start=None,
                  end=None,
                  dtl_method=None,
                  output_site_data=False,
                  mtype_params=None,
                  sample_params=None):
    """
    Function to read data from an hts file and optionally select specific sites and aggregate the data.

    Parameters
    ----------
    hts : str
        Path to the hts file.
    sites : list
        A list of site names within the hts file.
    mtypes : list
        A list of measurement types that should be returned.
    start : str
        The start date to retrieve from the data in ISO format (e.g. '2011-11-30 00:00').
    end : str
        The end date to retrieve from the data in ISO format (e.g. '2011-11-30 00:00').
    dtl_method : None, 'standard', 'trend'
        The method to use to convert values under a detection limit to numeric. None does no conversion. 'standard' takes half of the detection limit. 'trend' is meant as an output for trend analysis which includes an additional column, dtl_ratio, referring to the ratio of values under the detection limit.
    output_site_data : bool
        Should the site data be output?

    Returns
    -------
    DataFrame
    """

    #    agg_unit_dict = {'l/s': 1, 'm3/s': 1, 'm3/hour': 1, 'mm': 1, 'm3': 4}
    #    unit_convert = {'l/s': 0.001, 'm3/s': 1, 'm3/hour': 1, 'mm': 1, 'm3': 4}

    sites1 = select_sites(sites)

    #### Extract by polygon
    if isinstance(sites1, GeoDataFrame):
        ## Surface water sites
        sw_sites_tab = rd_sql('SQL2012PROD05',
                              'Squalarc',
                              'SITES',
                              col_names=['SITE_ID', 'NZTMX', 'NZTMY'])
        sw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_sw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', sw_sites_tab)
        sites1a = sites1.to_crs(gdf_sw_sites.crs)
        sw_sites2 = sel_sites_poly(gdf_sw_sites, sites1a).drop('geometry',
                                                               axis=1)

        ## Groundwater sites
        gw_sites_tab = rd_sql('SQL2012PROD05',
                              'Wells',
                              'WELL_DETAILS',
                              col_names=['WELL_NO', 'NZTMX', 'NZTMY'])
        gw_sites_tab.columns = ['site', 'NZTMX', 'NZTMY']
        gdf_gw_sites = xy_to_gpd('site', 'NZTMX', 'NZTMY', gw_sites_tab)
        gw_sites2 = sel_sites_poly(gdf_gw_sites, sites1a).drop('geometry',
                                                               axis=1)

        sites2 = sw_sites2.site.append(gw_sites2.site).astype(str).tolist()
    else:
        sites2 = sites1

    ### First read all of the sites in the hts file and select the ones to be read
    sites_df = rd_hilltop_sites(hts,
                                sites=sites2,
                                mtypes=mtypes,
                                rem_wq_sample=False)

    ### Open the hts file
    wqr = Dispatch("Hilltop.WQRetrieval")
    dfile = Dispatch("Hilltop.DataFile")
    try:
        dfile.Open(hts)
    except ValueError:
        print(dfile.errmsg)

    ### Iterate through the hts file
    df_lst = []
    for i in sites_df.index:
        site = sites_df.loc[i, 'site']
        mtype = sites_df.loc[i, 'mtype']
        if mtype == 'WQ Sample':
            continue
        wqr = dfile.FromWQSite(site, mtype)

        ## Set up start and end times and aggregation initiation
        if (start is None):
            start1 = wqr.DataStartTime
        else:
            start1 = start
        if end is None:
            end1 = wqr.DataEndTime
        else:
            end1 = end

        wqr.FromTimeRange(start1, end1)

        ## Extract data
        data = []
        time = []
        sample_p = []
        mtype_p = []

        test_params = sites_df[sites_df.site == site].mtype.unique()
        ## sample_p/mtype_p are initialised above so that the 'if sample_p:'
        ## check further down cannot raise a NameError when this branch is skipped.
        if ('WQ Sample' in test_params) and (isinstance(mtype_params, list)
                                             or isinstance(sample_params, list)):
            while wqr.GetNext:
                data.append(wqr.value)
                time.append(str(pytime_to_datetime(wqr.time)))
                sample_p.append({
                    sp: wqr.params(sp).encode('ascii', 'ignore')
                    for sp in sample_params
                })
                mtype_p.append({
                    mp: wqr.params(mp).encode('ascii', 'ignore')
                    for mp in mtype_params
                })
        else:
            while wqr.GetNext:
                data.append(wqr.value)
                time.append(str(pytime_to_datetime(wqr.time)))

        if data:
            df_temp = DataFrame({
                'time': time,
                'data': data,
                'site': site,
                'mtype': mtype
            })
            if sample_p:
                df_temp = concat(
                    [df_temp, DataFrame(sample_p),
                     DataFrame(mtype_p)], axis=1)
            df_lst.append(df_temp)

    dfile.Close()
    wqr.close()
    if df_lst:
        data = concat(df_lst)
        data.loc[:, 'time'] = to_datetime(data.loc[:, 'time'])
        data1 = to_numeric(data.loc[:, 'data'], errors='coerce')
        data.loc[data1.notnull(), 'data'] = data1[data1.notnull()]
        #        data.loc[:, 'data'].str.replace('*', '')
        data = data.reset_index(drop=True)

        #### Convert detection limit values
        if dtl_method is not None:
            less1 = data['data'].str.match('<')
            if less1.sum() > 0:
                less1.loc[less1.isnull()] = False
                data2 = data.copy()
                data2.loc[less1, 'data'] = to_numeric(
                    data.loc[less1, 'data'].str.replace('<', ''),
                    errors='coerce') * 0.5
                if dtl_method == 'standard':
                    data3 = data2
                if dtl_method == 'trend':
                    df1 = data2.loc[less1]
                    count1 = data.groupby('mtype')['data'].count()
                    count1.name = 'tot_count'
                    count_dtl = df1.groupby('mtype')['data'].count()
                    count_dtl.name = 'dtl_count'
                    count_dtl_val = df1.groupby('mtype')['data'].nunique()
                    count_dtl_val.name = 'dtl_val_count'
                    combo1 = concat([count1, count_dtl, count_dtl_val],
                                    axis=1,
                                    join='inner')
                    combo1['dtl_ratio'] = (combo1['dtl_count'] /
                                           combo1['tot_count']).round(2)

                    ## conditionals
                    param2 = combo1[(combo1['dtl_ratio'] > 0.4)
                                    & (combo1['dtl_val_count'] != 1)]
                    over_40 = data['mtype'].isin(param2.index)

                    ## Calc detection limit values
                    data3 = merge(data,
                                  combo1['dtl_ratio'].reset_index(),
                                  on='mtype',
                                  how='left')
                    data3.loc[:, 'data_dtl'] = data2['data']

                    max_dtl_val = data2[over_40 & less1].groupby(
                        'mtype')['data'].transform('max')
                    max_dtl_val.name = 'dtl_data_max'
                    data3.loc[over_40 & less1, 'data_dtl'] = max_dtl_val
            else:
                data3 = data
        else:
            data3 = data

        if output_site_data:
            sites_df = sites_df[~(sites_df.mtype == 'WQ Sample')]
            return (data3, sites_df)
        else:
            return (data3)
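A usage sketch (requires the Hilltop COM libraries via win32com; the hts path, site, and measurement names are hypothetical):

# Read nitrate data from a (hypothetical) hts file and also return the site table.
data, sites_df = rd_ht_wq_data(r'\\server\hilltop\WQData.hts',
                               sites=['SQ30147'],
                               mtypes=['Nitrate Nitrogen'],
                               dtl_method='trend',
                               output_site_data=True)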