Example #1
def get_MASTER_Chance2014_iodide_obs_file(
    sheetname='S>30 data set',
    skiprows=1,
    file_and_path='./sparse2spatial.rc',
):
    """
    To check on the correlations between the newly extracted climatological
    values, this function extracts the details from Chance2014's master
    spreadsheet to perform comparisons.

    Parameters
    -------
    sheetname (str): name of the excel sheet to use
    skiprows (int): number of rows to skip when reading sheet
    file_and_path (str): folder and filename with location settings as single str

    Returns
    -------
    (pd.DataFrame)
    """
    # Location and filename?
    filename = 'Iodide_correlations_310114_MASTER_TMS_EDIT.xlsx'
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += 'Iodide/inputs/RJC_spreadsheets/'
    # Extract MASTER excel spreadsheet from Chance2014
    df = pd.read_excel(folder + filename,
                       sheet_name=sheetname,
                       skiprows=skiprows)
    return df
Example #2
def Convert_DOC_prod_file_into_Standard_NetCDF():
    """
    Convert Saeed Roshan's file into CF compliant format
    """
    # - convert the surface DOC file into a monthly average file
    # Directory?
    folder = utils.get_file_locations('data_root') + '/DOC/'
    # Filename as a string
    file_str = 'DOC_Accum_rate_SR.nc'
    # Open dataset
    ds = xr.open_dataset(folder+file_str)
    # - Force use of coordinate variables in netCDF
    ds['latitude'] = ds['lat'][0, :].values
    ds['latitude'].attrs = ds['lat'].attrs
    ds['longitude'] = ds['lon'][:, 0].values
    ds['longitude'].attrs = ds['lon'].attrs
    # - Rename dimensions
    dims_dict = {'latitude': 'lat', 'longitude': 'lon'}
    # - Only keep the variables of interest
    var2keep = [u'DOCaccum_avg', u'DOCaccum_std', ]
    var2keep += dims_dict.keys()
    ds = ds.drop(labels=[i for i in ds.variables if i not in var2keep])
    ds = ds.rename(dims_dict)
    # - Add history to the attributes
    d = ds.attrs
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    hst_str = 'File structure/variables updated to CF by TMS ({}) on {}'
    d['History'] = hst_str.format('University of York', date)
    d['Originating author'] = 'SR - Saeed Roshan ([email protected])'
    d['Editing author'] = 'TMS - ([email protected])'
    d['Citation'] = 'doi.org/10.1038/s41467-017-02227-3'
    ds.attrs = d
    # - Save the new NetCDF file
    newfile_str = file_str.split('.nc')[0]+'_TMS_EDIT.nc'
    ds.to_netcdf(folder + newfile_str)
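
The coordinate handling above (promoting 2D lat/lon variables to 1D dimension coordinates before renaming) is a generic xarray pattern; a minimal, self-contained sketch of the same idea on a synthetic dataset, using the current xarray API (drop_vars/rename rather than drop/inplace), is:

import numpy as np
import xarray as xr

# Synthetic dataset with 2D 'lat'/'lon' variables on generic 'y'/'x' dims
lats = np.linspace(-89.5, 89.5, 4)
lons = np.linspace(-179.5, 179.5, 5)
lon2d, lat2d = np.meshgrid(lons, lats)
ds = xr.Dataset({'DOCaccum_avg': (('y', 'x'), np.random.rand(4, 5)),
                 'lat': (('y', 'x'), lat2d),
                 'lon': (('y', 'x'), lon2d)})
# Promote one row/column of the 2D fields to 1D dimension coordinates
ds = ds.assign_coords(y=ds['lat'].values[:, 0], x=ds['lon'].values[0, :])
# Drop the 2D originals, then rename dims/coords to CF-style 'lat'/'lon'
ds = ds.drop_vars(['lat', 'lon']).rename({'y': 'lat', 'x': 'lon'})
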
Example #3
def get_processed_df_obs_mod(reprocess_params=False,
                             target='CH2Br2',
                             filename='s2s_CH2Br2_obs_ancillaries.csv',
                             rm_Skagerrak_data=False,
                             file_and_path='./sparse2spatial.rc',
                             verbose=True,
                             debug=False):
    """
    Get the processed observation and model output

    Parameters
    -------
    reprocess_params (bool): re-process the parameterisations?
    target (str): Name of the target variable (e.g. CH2Br2)
    filename (str): name of the csv file of processed observational data
    rm_Skagerrak_data (bool): remove the data from the Skagerrak region
    file_and_path (str): folder and filename with location settings as single str
    verbose (bool): print verbose statements
    debug (bool): print debug statements

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries.csv'.format(target)
    df = pd.read_csv(folder + filename, encoding='utf-8')
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    return df
def get_iodide_data_from_BODC(file_and_path='./sparse2spatial.rc',
                              filename='Global_Iodide_obs_surface.csv'):
    """
    Get the latest iodide data from .csv file archived with BODC

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    filename (str): name of the csv file of archived data from BODC

    Returns
    -------
    (pd.DataFrame)
    """
    # Print instructions to manually download the data
    prt_str = 'WARNING: automated download from BODC not yet setup \n'
    prt_str += 'Please manually download the latest data from BODC \n'
    prt_str += '*.csv file available from https://doi.org/10/czhx \n'
    print(prt_str)
    # Location of data
    folder = utils.get_file_locations('s2s_root', file_and_path=file_and_path)
    folder += '/Iodide/inputs/'
    # open .csv file and return
    df = pd.read_csv(folder+filename)
    return df
Example #5
def get_processed_df_obs_mod(target='example', file_and_path='./sparse2spatial.rc'):
    """
    Get the processed observation and model output

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    file_and_path (str), folder and filename with location settings as single str

    Returns
    -------
    (pd.DataFrame)

    Notes
    -----
    """
    # Read in processed csv file of observations and ancillaries
    folder = utils.get_file_locations('s2s_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries.csv'.format(target)
    df = pd.read_csv(folder+filename, encoding='utf-8')
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    return df
Example #6
def get_example_obs(target='example', limit_depth_to=20):
    """
    Get the raw sparse observations from a database...

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    limit_depth_to (float), depth from sea surface to include data (metres)

    Returns
    -------
    (pd.DataFrame)
    """
    # File to use (example name string...)
    filename = 'HC_seawater_concs_above_{}m.csv'.format(limit_depth_to)
    # Where is the file?
    s2s_root = utils.get_file_locations('s2s_root')
    folder = '{}/{}/inputs/'.format(s2s_root, target)
    df = pd.read_csv(folder+filename)
    # Variable name?
    Varname = 'example (pM)'
    # Assume using coord variables for now
    LatVar1 = '<native latitude name (+ve N)>'
    LonVar1 = '<native longitude name (+ve E)>'
    # Add time
    TimeVar1 = 'native Date and time (UTC)'
    month_var = 'Month'
    dt = pd.to_datetime(
        df[TimeVar1], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    df['datetime'] = dt

    def get_month(x):
        return x.month
    df[month_var] = df['datetime'].map(get_month)
    # Make sure all values are numeric
    for var in [Varname]+[LatVar1, LonVar1]:
        df.loc[:, var] = pd.to_numeric(df[var].values, errors='coerce')
        # replace flagged values with NaN
        df.replace(999, np.NaN, inplace=True)
        df.replace(-999, np.NaN, inplace=True)
    # Update names to use
    cols2use = ['datetime', 'Month', LatVar1, LonVar1, Varname]
    name_dict = {
        LatVar1: 'Latitude', LonVar1: 'Longitude', month_var: 'Month', Varname: target
    }
    df = df[cols2use].rename(columns=name_dict)
    # Add a unique identifier
    df['NEW_INDEX'] = range(1, df.shape[0]+1)
    # Set to a unique string instead of a number

    def get_unique_Data_Key_ID(x):
        return 'HC_{:0>6}'.format(int(x))
    df['Data_Key_ID'] = df['NEW_INDEX'].map(get_unique_Data_Key_ID)
    # Remove all the NaNs and print to screen the change in dataset size
    t0_shape = df.shape[0]
    df = df.dropna()
    if t0_shape != df.shape[0]:
        pstr = 'WARNING: Dropped obs. (#={}), now have #={} (had #={})'
        print(pstr.format(t0_shape-df.shape[0], df.shape[0], t0_shape))
    return df
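
The cleaning steps above (coercing timestamps and numeric columns, treating ±999 as missing-data flags, building zero-padded IDs, dropping incomplete rows) are generic pandas operations; a small self-contained sketch with made-up column names and values:

import numpy as np
import pandas as pd

# Toy frame mimicking a raw export with mixed types and flag values
df = pd.DataFrame({
    'native Date and time (UTC)': ['2010-06-01 12:00:00', 'not a date',
                                   '2010-07-02 08:30:00'],
    'lat (+ve N)': ['54.1', '-999', '12.0'],
    'lon (+ve E)': ['-4.2', '1.0', '999'],
    'example (pM)': ['12.3', '8.1', '5.5'],
})
# Coerce bad timestamps to NaT rather than raising
df['datetime'] = pd.to_datetime(df['native Date and time (UTC)'], errors='coerce')
df['Month'] = df['datetime'].dt.month
# Coerce non-numeric entries to NaN
for col in ['lat (+ve N)', 'lon (+ve E)', 'example (pM)']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# Treat +/-999 as missing-data flags
df = df.replace([999, -999], np.nan)
# Zero-padded unique identifiers (e.g. 'HC_000001'), then drop incomplete rows
df['Data_Key_ID'] = ['HC_{:0>6}'.format(i) for i in range(1, len(df) + 1)]
df = df.dropna()
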
Example #7
def get_processed_df_obs_mod(reprocess_params=False,
                             filename='Iodine_obs_WOA.csv',
                             rm_Skagerrak_data=False,
                             file_and_path='./sparse2spatial.rc',
                             verbose=True,
                             debug=False):
    """
    Get the processed observation and model output

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    rm_Skagerrak_data (boolean): remove the single data from the Skagerrak region
    reprocess_params (bool):
    filename (str): name of the input file of processed observational data
    verbose (bool): print verbose statements
    debug (bool): print debug statements

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/Iodide/'
    df = pd.read_csv(folder + filename, encoding='utf-8')
    # Add ln of iodide too
    df['ln(Iodide)'] = df['Iodide'].map(np.ma.log)
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    # Make sure month is numeric (if not given)
    month_var = 'Month'
    NaN_months_bool = ~np.isfinite(df[month_var].values)
    NaN_months_df = df.loc[NaN_months_bool, :]
    N_NaN_months = NaN_months_df.shape[0]
    if N_NaN_months > 1:
        print_str = 'DataFrame contains NaNs for {} months - '
        print_str += 'Replacing these with month # 3 months '
        print_str += 'before (hemispheric) summer solstice'
        if verbose:
            print(print_str.format(N_NaN_months))
        NaN_months_df[month_var] = NaN_months_df.apply(
            lambda x: set_backup_month_if_unknown(lat=x['Latitude'],
                                                  debug=False),
            axis=1)
        # Add back into DataFrame
        df.loc[NaN_months_bool, month_var] = NaN_months_df[month_var].values
    # Re-process the parameterisations (Chance et al etc + ensemble)?
    if reprocess_params:
        # Add predictions from literature
        df = get_literature_predicted_iodide(df=df)
        # Add ensemble prediction
        df = get_ensemble_predicted_iodide(rm_Skagerrak_data=rm_Skagerrak_data)
    return df
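
set_backup_month_if_unknown is an external helper not shown in these examples. Judging only from the message printed above ('3 months before (hemispheric) summer solstice'), a plausible minimal sketch is given below; the real function in sparse2spatial may differ in detail.

def set_backup_month_if_unknown(lat, debug=False):
    """
    Fallback month for observations with no date: three months before the
    hemispheric summer solstice (March in the north, September in the south).
    """
    if lat >= 0:
        return 3.0   # Northern hemisphere
    return 9.0       # Southern hemisphere
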
Example #8
def process_iodide_obs_ancillaries_2_csv(rm_Skagerrak_data=False,
                                         add_ensemble=False,
                                         file_and_path='./sparse2spatial.rc',
                                         target='Iodide',
                                         verbose=True):
    """
    Create a csv file of iodide observations and ancillary observations

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    add_ensemble (bool): add the ensemble prediction to input data dataframe
    rm_Skagerrak_data (boolean): remove the single data from the Skagerrak region
    target (str): Name of the target variable (e.g. iodide)
    verbose (bool): print verbose statements

    Returns
    -------
    (None)

    Notes
    -----
     -  Workflow assumes that this step will be run to compile the data
    """
    # Get iodide observations (as a dictionary/DataFrame)
    obs_data_df, obs_metadata_df = get_iodide_obs()
    # Add ancillary obs.
    obs_data_df = extract_ancillaries_from_compiled_file(df=obs_data_df)
    # Save the intermediate file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/'.format(target)
    filename = 'Iodine_obs_WOA_v8_5_1_TEMP_TEST.csv'
    obs_data_df.to_csv(folder + filename, encoding='utf-8')
    # - Add predicted iodide from MacDonald and Chance parameterisations
    obs_data_df = get_literature_predicted_iodide(df=obs_data_df)
    # - Add ensemble prediction by averaging predictions at obs. locations?
    if add_ensemble:
        print('NOTE - models must have already been provided via RFR_dict')
        RFR_dict = build_or_get_models(rm_Skagerrak_data=rm_Skagerrak_data)
        # Now extract the ensemble prediction for the observation locations
        obs_data_df = get_ensemble_predicted_iodide(
            df=obs_data_df,
            use_vals_from_NetCDF=False,
            RFR_dict=RFR_dict,
            rm_Skagerrak_data=rm_Skagerrak_data)
    # - Join dataframes and save as csv.
    filename = 'Iodine_obs_WOA_v8_5_1_ENSEMBLE_csv__avg_nSkag_nOutliers.csv'
    if verbose:
        print(obs_data_df.shape, obs_data_df.columns)
    obs_data_df.to_csv(folder + filename, encoding='utf-8')
    if verbose:
        print('File saved to: ', folder + filename)
Example #9
def get_CHBr3_obs(target='CHBr3', limit_depth_to=20,):
    """
    Get the raw observations from HalOcAt database
    """
    # File to use
    filename = 'HC_seawater_concs_above_{}m.csv'.format(limit_depth_to)
    # Where is the file?
    data_root = utils.get_file_locations('data_root')
    folder = '{}/{}/inputs/'.format(data_root, target)
    df = pd.read_csv(folder+filename)
    # Variable name? - Just use one of the values for now
    Varname = 'CHBr3 (pM)'
    # Assume using coord variables for now
    LatVar1 = 'Sample start latitude (+ve N)'
    LonVar1 = 'Sample start longitude (+ve E)'
    # Add time
    TimeVar1 = 'Date (UTC) and time'
    TimeVar2 = 'Sampling date/time (UT)'
    month_var = 'Month'
    dt = pd.to_datetime(
        df[TimeVar1], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    df['datetime'] = dt

    def get_month(x):
        return x.month
    df[month_var] = df['datetime'].map(get_month)
    # make sure all values are numeric
    for var in [Varname]+[LatVar1, LonVar1]:
        df.loc[:, var] = pd.to_numeric(df[var].values, errors='coerce')
        # replace flagged values with NaN
        df.replace(999, np.NaN, inplace=True)
        df.replace(-999, np.NaN, inplace=True)
    # Update names to use
    cols2use = ['datetime', 'Month', LatVar1, LonVar1, Varname]
    name_dict = {
        LatVar1: 'Latitude', LonVar1: 'Longitude', month_var: 'Month', Varname: target
    }
    df = df[cols2use].rename(columns=name_dict)
    # Add a unique identifier
    df['NEW_INDEX'] = range(1, df.shape[0]+1)
    # Kludge for now: use a name prefix plus the row number

    def get_unique_Data_Key_ID(x):
        return 'HC_{:0>6}'.format(int(x))
    df['Data_Key_ID'] = df['NEW_INDEX'].map(get_unique_Data_Key_ID)
    # Remove all the NaNs
    t0_shape = df.shape[0]
    df = df.dropna()
    if t0_shape != df.shape[0]:
        pstr = 'WARNING: Dropped obs. (#={}), now have #={} (had #={})'
        print(pstr.format(t0_shape-df.shape[0], df.shape[0], t0_shape))
    return df
Example #10
def get_iodide_obs_metadata(file_and_path='./sparse2spatial.rc'):
    """
    Extract and return metadata from metadata csv
    """
    # What is the location of the iodide data?
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/Iodide/inputs/'
    # Filename?
    filename = 'Iodine_climatology_Submitted_data_list_formatted_TMS.xlsx'
    # Extract
    df = pd.read_excel(folder + filename, sheet_name='Full')
    # return as DataFrame
    return df
Example #11
def mk_RAD_NetCDF_monthly():
    """
    Resample shortwave radiation NetCDF from daily to monthly
    """
    # Directory?
    folder = utils.get_file_locations('data_root') +'/GFDL/'
    # Filename as a string
    file_str = 'ncar_rad.15JUNE2009.nc'
    ds = xr.open_dataset(folder + file_str)
    # Resample to monthly means
    ds = ds.resample(TIME='M').mean()
    # Save as NetCDF
    newfile_str = file_str.split('.nc')[0]+'_TMS_EDIT.nc'
    ds.to_netcdf(folder+newfile_str)
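
For reference, the daily-to-monthly resampling can be exercised on a synthetic dataset with the current xarray resample API (the 'TIME' dimension name mirrors the file above; the values are random):

import numpy as np
import pandas as pd
import xarray as xr

# Synthetic daily shortwave-radiation field on a tiny grid
time = pd.date_range('2009-01-01', '2009-12-31', freq='D')
ds = xr.Dataset({'swdn': (('TIME', 'lat', 'lon'),
                          np.random.rand(time.size, 3, 4))},
                coords={'TIME': time,
                        'lat': [-60., 0., 60.],
                        'lon': [0., 90., 180., 270.]})
# Monthly means along the 'TIME' dimension
ds_monthly = ds.resample(TIME='M').mean()
print(ds_monthly['TIME'].size)  # 12
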
Example #12
def process_obs_and_ancillaries_2_csv(target='CH2Br2',
                                      file_and_path='./sparse2spatial.rc'):
    """
    Process the observations and extract ancillary variables for these locations
    """
    # Get the base observations
    df = get_CH2Br2_obs()
    # Extract the ancillary values for these locations
    df = extract_ancillaries_from_compiled_file(df=df)
    # Save the intermediate file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries_v0_0_0.csv'.format(target)
    df.to_csv(folder + filename, encoding='utf-8')
Example #13
def get_WOA18_data(automatically_download=False, target='Iodide'):
    """
    Get data from the World Ocean Atlas 2018 (WOA18)

    Notes
    -------
    https://www.nodc.noaa.gov/OC5/woa18/woa18-preliminary-notes.html
    """
    # Use the data settings for Iodide
    file_and_path = './{}/sparse2spatial.rc'.format(target)
    data_root = utils.get_file_locations('data_root',
                                         file_and_path=file_and_path)
    folder = '{}/data/{}/'.format(data_root, 'WOA18')
    # Now loop through the list of variables to download
    vars_dict2download = store_of_values2download4WOA18()
    for n in vars_dict2download.keys():
        print(n, vars_dict2download[n].items())
        # Extract variables
        d = vars_dict2download[n]
        var = d['var']
        res = d['res']
        period = d['period']
        # Which specific subfolder to save data to?
        sfolder = '{}/{}/'.format(folder, var)
        # If seasonal data (decadally averaged), download monthly and seasonal files
        if (period == 'decav') or (period == 'all'):
            # Get the monthly and seasonal files
            seasons = ['{:0>2}'.format(i + 1) for i in np.arange(16)]
            # download files for season
            for season in seasons:
                WOA18_data4var_period(folder=sfolder,
                                      season=season,
                                      period=period,
                                      res=res,
                                      var=var)
        # If decadally split data, download by season
        else:
            # just get seasonal files
            seasons = ['{:0>2}'.format(i) for i in [13, 14, 15, 16]]
            # download files for season
            for season in seasons:
                WOA18_data4var_period(folder=sfolder,
                                      season=season,
                                      period=period,
                                      res=res,
                                      var=var)
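
The two-digit codes generated above follow the WOA file-naming convention, in which '01'-'12' denote months and '13'-'16' denote seasons; a quick standalone check:

import numpy as np

# '01'-'12' are months, '13'-'16' are seasons in WOA file names
monthly_and_seasonal = ['{:0>2}'.format(i + 1) for i in np.arange(16)]
seasonal_only = ['{:0>2}'.format(i) for i in [13, 14, 15, 16]]
print(monthly_and_seasonal)  # ['01', '02', ..., '12', '13', '14', '15', '16']
print(seasonal_only)         # ['13', '14', '15', '16']
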
Example #14
def process_obs_and_ancillaries_2_csv(target='example', version='v0_0_0',
                                      file_and_path='./sparse2spatial.rc'):
    """
    Process the observations and extract ancillary variables for these locations

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    version (str), version name/number (e.g. semantic version - https://semver.org/)
    file_and_path (str), folder and filename with location settings as single str

    Returns
    -------
    (None)
    """
    # Get the base observations
    df = get_example_obs()
    # Extract the ancillary values for these locations
    df = ancillaries2grid.extract_ancillaries_from_compiled_file(df=df)
    # Save the intermediate file
    folder = utils.get_file_locations('s2s_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries_{}.csv'.format(target, version)
    df.to_csv(folder+filename, encoding='utf-8')


def get_processed_df_obs_mod(target='example', file_and_path='./sparse2spatial.rc'):
    """
    Get the processed observation and model output

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    file_and_path (str), folder and filename with location settings as single str

    Returns
Example #15
def mk_predictions_for_3D_features(dsA=None,
                                   RFR_dict=None,
                                   res='4x5',
                                   models_dict=None,
                                   features_used_dict=None,
                                   stats=None,
                                   folder=None,
                                   target='Iodide',
                                   use_updated_predictor_NetCDF=False,
                                   save2NetCDF=False,
                                   plot2check=False,
                                   models2compare=[],
                                   topmodels=None,
                                   xsave_str='',
                                   add_ensemble2ds=False,
                                   verbose=True,
                                   debug=False):
    """
    Make a NetCDF file of predicted target from feature variables for a given resolution

    Parameters
    ----------
    dsA (xr.Dataset): dataset object with variables to interpolate
    RFR_dict (dict): dictionary of core variables and data
    res (str): horizontal resolution (e.g. 4x5) of Dataset
    save2NetCDF (bool): save interpolated Dataset to as a NetCDF?
    features_used_dict (dict): dictionary of feature variables in models
    models_dict (dict): dictionary of RFR models and their names
    stats (pd.DataFrame): dataframe of statistics on models in models_dict
    folder (str): location of NetCDF file of feature variables
    target (str): name of the species being predicted
    models2compare (list): list of models to make spatial predictions for
    topmodels (list): list of models to make spatial predictions for
    xsave_str (str): string to include as suffix in filename used for saved NetCDF
    add_ensemble2ds (bool): calculate std. dev. and mean for list of topmodels
    verbose (bool): print out verbose output?
    debug (bool): print out debugging output?

    Returns
    -------
    (xr.Dataset)
    """
    # Make sure the core dictionary is provided
    assert (type(RFR_dict) == dict
            ), 'Core variables must be provided as dict (RFR_dict)'
    # Make sure a full list of models was provided
    assert (len(models2compare) > 0), 'A list of models to compare must be provided!'
    # Include all the topmodels in the list of models to compare, if provided
    if isinstance(topmodels, list):
        models2compare += topmodels
    # Remove any double ups in list of of models to predict
    models2compare = list(set(models2compare))
    # Get the variables required here
    if isinstance(models_dict, type(None)):
        models_dict = RFR_dict['models_dict']
    if isinstance(features_used_dict, type(None)):
        features_used_dict = RFR_dict['features_used_dict']
    # Get location to save file and set filename
    if isinstance(folder, type(None)):
        folder = utils.get_file_locations('data_root') + '/data/'
    if isinstance(dsA, type(None)):
        filename = 'Oi_prj_feature_variables_{}.nc'.format(res)
        dsA = xr.open_dataset(folder + filename)
    # - Make a dataset of predictions for each model
    ds_l = []
    for modelname in models2compare:
        # Get the model
        model = models_dict[modelname]
        # Get the testing features
        features_used = utils.get_model_features_used_dict(modelname)
        # Make a Dataset of predicted values
        ds_tmp = utils.mk_da_of_predicted_values(dsA=dsA,
                                                 model=model,
                                                 res=res,
                                                 modelname=modelname,
                                                 features_used=features_used)
        #  Add attributes to the prediction
        ds_tmp = utils.add_attrs2target_ds(ds_tmp,
                                           add_global_attrs=False,
                                           varname=modelname)
        # Save to list
        ds_l += [ds_tmp]
    # Combine datasets
    ds = xr.merge(ds_l)
    # - Also get values for parameterisations
    #     if target == 'Iodide':
    #         # Chance et al (2013)
    #         param = u'Chance2014_STTxx2_I'
    #         arr = utils.calc_I_Chance2014_STTxx2_I(dsA['WOA_TEMP'].values)
    #         ds[param] = ds[modelname]  # use existing array as dummy to fill
    #         ds[param].values = arr
    #         # MacDonald et al (2013)
    #         param = 'MacDonald2014_iodide'
    #         arr = utils.calc_I_MacDonald2014(dsA['WOA_TEMP'].values)
    #         ds[param] = ds[modelname]  # use existing array as dummy to fill
    #         ds[param].values = arr
    # Add ensemble to ds too
    if add_ensemble2ds:
        print('WARNING: Using topmodels for ensemble as calculated here')
        var2template = list(ds.data_vars)[0]
        ds = RFRanalysis.add_ensemble_avg_std_to_dataset(
            ds=ds,
            res=res,
            target=target,
            RFR_dict=RFR_dict,
            topmodels=topmodels,
            var2template=var2template,
            save2NetCDF=False)
    # Add global attributes
    ds = utils.add_attrs2target_ds(ds, add_varname_attrs=False)
    # Save to NetCDF
    if save2NetCDF:
        filename = 'Oi_prj_predicted_{}_{}{}.nc'.format(target, res, xsave_str)
        ds.to_netcdf(filename)
    else:
        return ds
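
utils.mk_da_of_predicted_values is an external helper; a minimal sketch of the general approach it is assumed to take (flatten the gridded feature variables into an (n_samples, n_features) matrix, predict with the fitted model, then unstack back onto the grid) is shown below. It assumes every feature variable shares the ('time', 'lat', 'lon') dimensions and contains no NaNs.

import numpy as np
import xarray as xr

def predict_da_from_gridded_features(dsA, model, features_used, name='prediction'):
    """Predict a gridded field from feature variables with a fitted sklearn model."""
    # Stack the space/time dims into a single 'points' dimension
    stacked = dsA[features_used].stack(points=('time', 'lat', 'lon'))
    # Build the (n_samples, n_features) matrix in the order the model expects
    X = np.column_stack([stacked[v].values for v in features_used])
    # NaNs (e.g. land boxes) would need masking or filling before this call
    y = model.predict(X)
    # Use one stacked feature as a template, overwrite its values, and unstack
    da = stacked[features_used[0]].copy()
    da.values = y
    return da.unstack('points').rename(name)
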
Example #16
def get_stats_on_spatial_predictions_0125x0125(use_annual_mean=True, target='Iodide',
                                               RFR_dict=None, ex_str='',
                                               just_return_df=False, folder=None,
                                               filename=None, rm_Skagerrak_data=False,
                                               debug=False):
    """
    Evaluate the spatial predictions between models at 0.125x0.125

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of core variables and data
    use_annual_mean (bool): use the annual mean of the variable for statistics
    just_return_df (bool): just return the data as a dataframe
    folder (str): folder where the NetCDF of predicted data is located
    filename (str): name of the NetCDF file of predicted data
    ex_str (str): extra string to include in the file name used to save data
    rm_Skagerrak_data (bool): remove the data from the Skagerrak region
    (the above argument is an iodide-specific option)
    debug (bool): print out debugging output?

    Returns
    -------
    (pd.DataFrame, only if just_return_df=True; otherwise None)
    """
    # ----
    # Get spatial prediction data from NetCDF files saved already
    res = '0.125x0.125'
    if isinstance(filename, type(None)):
        if rm_Skagerrak_data:
            extr_file_str = '_No_Skagerrak'
        else:
            extr_file_str = ''
        filename = 'Oi_prj_predicted_{}_{}{}.nc'.format(
            target, res, extr_file_str)
    if isinstance(folder, type(None)):
        data_root = utils.get_file_locations('data_root')
        folder = '{}/outputs/{}/'.format(data_root, target)
    ds = xr.open_dataset(folder + filename)
    # Variables to consider
    vars2analyse = list(ds.data_vars)
    # Add LWI and surface area to array
    ds = utils.add_LWI2array(ds=ds, res=res, var2template='Chance2014_STTxx2_I')
    # Set a name for output to saved as
    file_save_str = 'Oi_prj_annual_stats_global_ocean_{}{}'.format(res, ex_str)
    # ---- build an array with general statistics
    df = pd.DataFrame()
    # -- get general annual stats
    # Take annual average over time (if using annual mean)
    if use_annual_mean:
        ds_tmp = ds.mean(dim='time')
    else:
        ds_tmp = ds
    for var_ in vars2analyse:
        # mask to only consider (100%) water boxes
        arr = ds_tmp[var_].values
        arr = arr[(ds_tmp['IS_WATER'] == True)]
        # save to dataframe
        df[var_] = pd.Series(arr.flatten()).describe()
    # Get area weighted mean too
    vals = []
    # Take annual average over time (if using annual mean) -
    # Q: why does this need to be done twice separately?
    if use_annual_mean:
        ds_tmp = ds.mean(dim='time')
    else:
        ds_tmp = ds
    for var_ in vars2analyse:
        # Mask to only consider (100%) water boxes
        mask = ~(ds_tmp['IS_WATER'] == True)
        arr = np.ma.array(ds_tmp[var_].values, mask=mask)
        # Also mask surface area (s_area)
        s_area_tmp = np.ma.array(ds_tmp['AREA'].values, mask=mask)
        # Save value to list
        vals += [AC.get_2D_arr_weighted_by_X(arr, s_area=s_area_tmp)]
    # Add area weighted mean to df
    df = df.T
    df['mean (weighted)'] = vals
    df = df.T
    #  just return the dataframe of global stats
    if just_return_df:
        return df
    # save the values
    df.T.to_csv(file_save_str+'.csv')
    # ---- print out a more formatted version as a table for the paper
    # remove variables
    topmodels = get_top_models(RFR_dict=RFR_dict, vars2exclude=['DOC', 'Prod'])
    params = [
        'Chance2014_STTxx2_I', 'MacDonald2014_iodide', 'Ensemble_Monthly_mean'
    ]
    # select just the models of interest
    df = df[topmodels + params]
    # rename the models
    rename_titles = {u'Chance2014_STTxx2_I': 'Chance et al. (2014)',
                     u'MacDonald2014_iodide': 'MacDonald et al. (2014)',
                     'Ensemble_Monthly_mean': 'RFR(Ensemble)',
                     'Iodide': 'Obs.',
                     #                    u'Chance2014_Multivariate': 'Chance et al. (2014) (Multi)',
                     }
    df.rename(columns=rename_titles,  inplace=True)
    # Sort the dataframe by the mean weighted vales
    df = df.T
    df.sort_values(by=['mean (weighted)'], ascending=False, inplace=True)
    # rename columns (50% to median and ... )
    cols2rename = {'50%': 'median', 'std': 'std. dev.', }
    df.rename(columns=cols2rename,  inplace=True)
    # rename
    df.rename(index=rename_titles, inplace=True)
    # set column order
    # Set the stats to use
    first_columns = [
        'mean (weighted)', 'std. dev.', '25%', 'median', '75%', 'max',
    ]
    if debug:
        print(df.head())
    df = df[first_columns]
    # save as CSV
    df.round(1).to_csv(file_save_str+'_FOR_TABLE_'+'.csv')

    # ---- Do some further analysis and save this to a text file
    a = open(file_save_str+'_analysis.txt', 'w')
    # Set a header
    print('This file contains global analysis of {} data'.format(target), file=a)
    print('\n', file=a)
    # which files are being analysed?
    print('---- Detail on the predicted fields', file=a)
    models2compare = {
        1: u'RFR(Ensemble)',
        2: u'Chance et al. (2014)',
        3: u'MacDonald et al. (2014)',
        #    1: u'Ensemble_Monthly_mean',
        #    2: u'Chance2014_STTxx2_I',
        #    3:'MacDonald2014_iodide'
        #    1: u'RFR(TEMP+DEPTH+SAL+NO3+DOC)',
        #    2: u'RFR(TEMP+SAL+Prod)',
        #    3: u'RFR(TEMP+DEPTH+SAL)',
    }
    if debug:
        print(df.head())
    df_tmp = df.T[models2compare.values()]
    # What are the core models
    print('Core models being compared are:', file=a)
    for key in models2compare.keys():
        ptr_str = 'model {} - {}'
        print(ptr_str.format(key, models2compare[key]), file=a)
    print('\n', file=a)
    # Now print analysis on predicted fields
    # range in predicted model values
    mean_ = df_tmp.T['mean (weighted)'].values.mean()
    min_ = df_tmp.T['mean (weighted)'].values.min()
    max_ = df_tmp.T['mean (weighted)'].values.max()
    prt_str = 'avg predicted values = {:.5g} ({:.5g}-{:.5g})'
    print(prt_str.format(mean_, min_, max_), file=a)
    # range in predicted model values
    range_ = max_-min_
    prt_str = 'range of predicted avg values = {:.3g}'
    print(prt_str.format(range_, min_, max_), file=a)
    # % of range in predicted model values ( as an error of model choice... )
    pcents_ = range_ / df_tmp.T['mean (weighted)'] * 100
    min_ = pcents_.min()
    max_ = pcents_.max()
    prt_str = 'As a % this is = {:.3g} ({:.5g}-{:.5g})'
    print(prt_str.format(pcents_.mean(), min_, max_), file=a)
    a.close()
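
AC.get_2D_arr_weighted_by_X comes from the external AC_tools package; for orientation, an area-weighted mean of a masked 2D field can be sketched as below (an assumption about what the helper returns, not its actual implementation):

import numpy as np

def area_weighted_mean(values, s_area):
    """Surface-area-weighted mean of a 2D field, ignoring masked/NaN boxes."""
    values = np.ma.masked_invalid(values)
    s_area = np.ma.array(s_area, mask=values.mask)
    return float((values * s_area).sum() / s_area.sum())

# e.g. two boxes with equal values weighted by different areas
print(area_weighted_mean(np.array([[1., 3.]]), np.array([[1., 3.]])))  # 2.5
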
Example #17
def get_stats_on_spatial_predictions_4x5_2x25_by_lat(res='4x5', ex_str='',
                                                     target='Iodide',
                                                     use_annual_mean=False, filename=None,
                                                     folder=None, ds=None,
                                                     var2template='Chance2014_STTxx2_I',
                                                     debug=False):
    """
    Evaluate the spatial predictions between models, binned by latitude

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    res (str): horizontal resolution of dataset (e.g. 4x5)
    debug (bool): print out debugging output?
    var2template (str): variable to use a template for making new variables in ds
    use_annual_mean (bool): use the annual mean of the variable

    Returns
    -------
    (pd.DataFrame)
    """
    if isinstance(ds, type(None)):
        # If filename or folder not given, then use defaults
        if isinstance(filename, type(None)):
            filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
        if isinstance(folder, type(None)):
            data_root = utils.get_file_locations('data_root')
            folder = '{}/{}/outputs/'.format(data_root, target)
        ds = xr.open_dataset(folder + filename)
    # Variables to consider
    vars2analyse = list(ds.data_vars)
    # Add LWI to array
    ds = utils.add_LWI2array(ds=ds, var2template=var2template, res=res)
    # - Get general annual stats
    df = pd.DataFrame()
    # take annual average
    if use_annual_mean:
        ds_tmp = ds.mean(dim='time')
    else:
        ds_tmp = ds
    for var_ in vars2analyse:
        # Mask to only consider (100%) water boxes
        arr = ds_tmp[var_].values
        if debug:
            print(arr.shape, (ds_tmp['IS_WATER'] == False).shape)
        arr[(ds_tmp['IS_WATER'] == False).values] = np.NaN
        # Update values to include np.NaN
        ds_tmp[var_].values = arr
        # Setup series objects to hold stats
        s_mean = pd.Series(dtype=float)
        s_75 = pd.Series(dtype=float)
        s_50 = pd.Series(dtype=float)
        s_25 = pd.Series(dtype=float)
        # Loop by latitude and save stats to the dataframe
        for lat_ in ds['lat'].values:
            vals = ds_tmp[var_].sel(lat=lat_).values
            stats_ = pd.Series(vals.flatten()).dropna().describe()
            # At poles all values will be the same (masked) value
#            if len( set(vals.flatten()) ) == 1:
#                pass
#            else:
            # save quartiles and mean
    #            try:
            s_mean[lat_] = stats_['mean']
            s_25[lat_] = stats_['25%']
            s_75[lat_] = stats_['75%']
            s_50[lat_] = stats_['50%']
    #            except KeyError:
    #                print( 'Values not considered for lat={}'.format( lat_ ) )
        # Save variables to DataFrame
        var_str = '{} - {}'
        stats_dict = {'mean': s_mean, '75%': s_75, '25%': s_25, 'median': s_50}
        for stat_ in stats_dict.keys():
            df[var_str.format(var_, stat_)] = stats_dict[stat_]
    return df
Example #18
def download_data4spec(lev2use=72, spec='LWI', res='0.125',
                       file_prefix='nature_run', doys_list=None, verbose=True,
                       debug=False):
    """
    Download all data for a given species at a given resolution

    Parameters
    -------
    spec (str): variable to extract from archived data
    res (str): horizontal resolution of dataset (e.g. 4x5)
    file_prefix (str): file prefix to add to saved file
    debug (bool): print out debugging output?

    Returns
    -------
    (None)

    Notes
    -----
     - use level=71 for lowest level
     (NetCDF is ordered the opposite way, python 0-71. Xarray numbering makes
     this level=72)
     (or use dictionary through xarray)
    """
    # - local variables
    # Where is the remote data?
    root_url = 'https://opendap.nccs.nasa.gov/dods/OSSE/G5NR-Chem/Heracles/'
#    url_str = root_url+'12.5km/{}_deg/inst/inst1_3d_TRC{}_Nv'.format(res,spec)
    url_str = root_url+'12.5km/{}_deg/tavg/tavg1_2d_chm_Nx'.format(res)
    # Where should i save the data?
    save_dir = utils.get_file_locations('data_root') + '/NASA/LWI/'
    # - Open dataset via URL with xarray
    # Using xarray (issues found with NASA OpenDAP data model - via PyDAP)
    ds = xr.open_dataset(url_str)
    if verbose:
        print(ds, '\n\n\n')
    # Get list of (all) doys to extract (unless provided as argv.)
    if isinstance(doys_list, type(None)):
        doys_list = list(set(ds['time.dayofyear'].values))
    # Variable to extract?
    var_name = '{}'.format(spec.lower())
    # Just test a small extraction.
    # if debug:
    #    data = ds[var_name][:10, lev, :, :]
    # select level and download all data
    ds = ds[var_name][:, :, :]
    # Make sure time is the dimension not module
    time = ds.time
    # - loop days of year (doy)
    # Custom mask
    def is_dayofyear(doy):
        return (doy == doy_)
    # Loop doys
    for doy_ in doys_list[:4]:
        try:
            if verbose:
                print(doy_, spec)
            # Now select the data for this day of year
            ds_tmp = ds.sel(time=is_dayofyear(ds['time.dayofyear']))
            # Save as NetCDF
            year_ = list(set(ds_tmp['time.year'].values))[0]
            # What is the filename?
            fstr = '{}_lev_{}_res_{}_spec_{}_{}_{:0>3}_ctm.nc'
            file2save = fstr.format(file_prefix, lev2use, res, spec, year_, str(doy_))
            # Now save downloaded data as a NetCDF locally...
            if verbose:
                print(save_dir+file2save)
            ds_tmp.to_netcdf(save_dir+file2save)
            # Remove from memory
            del ds_tmp
        except RuntimeError:
            err_str = 'TMS ERROR - FAIL for spec={} (doy={})'.format(
                spec, doy_)
            print(err_str)
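
The day-of-year selection used above can be exercised on a small synthetic dataset; the variable and file names here are placeholders:

import numpy as np
import pandas as pd
import xarray as xr

# Synthetic stand-in for the remotely opened dataset
time = pd.date_range('2006-01-01', periods=10, freq='D')
ds = xr.Dataset({'lwi': (('time', 'lat', 'lon'),
                         np.random.rand(time.size, 2, 3))},
                coords={'time': time, 'lat': [0., 10.], 'lon': [0., 10., 20.]})
# Select a single day of year and save it to its own NetCDF file
doy = 3
ds_doy = ds.sel(time=(ds['time.dayofyear'] == doy))
ds_doy.to_netcdf('example_doy_{:0>3}.nc'.format(doy))
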
Example #19
def add_ensemble_avg_std_to_dataset(res='0.125x0.125', RFR_dict=None, target='Iodide',
                                    stats=None, ds=None, topmodels=None,
                                    var2template='Chance2014_STTxx2_I',
                                    var2use4Ensemble='Ensemble_Monthly_mean',
                                    var2use4std='Ensemble_Monthly_std',
                                    save2NetCDF=True):
    """
    Add ensemble average and std to dataset

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    var2use4Ensemble (str): variable name to use for the ensemble prediction
    var2use4std (str): variable name to use for the ensemble prediction's std dev.
    var2template (str): variable to use as a template for making new variables in ds
    res (str): horizontal resolution of dataset (e.g. 4x5)
    topmodels (list): list of models to include in the ensemble prediction
    save2NetCDF (bool): save the dataset as a NetCDF file
    stats (pd.DataFrame): dataframe of statistics on the models
    RFR_dict (dict): dictionary of core variables and data

    Returns
    -------
    (xr.Dataset)
    """
    # Get existing dataset from NetCDF if ds not provided
    filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
    if isinstance(ds, type(None)):
        data_root = utils.get_file_locations('data_root')
        folder = '{}/{}/'.format(data_root, target)
        ds = xr.open_dataset(folder + filename)
    # Just use the top models (plus derived ensemble variables), if not provided
    if isinstance(topmodels, type(None)):
        # Extract the models...
        if isinstance(RFR_dict, type(None)):
            RFR_dict = build_or_get_models()
        # Get the list of the top performing models
        topmodels = get_top_models(RFR_dict=RFR_dict, vars2exclude=['DOC', 'Prod'])
    # Now get average concentrations and std dev. per month
    avg_ars = []
    std_ars = []
    for month in range(1, 13):
        ars = []
        for var in topmodels:
            ars += [ds[var].sel(time=(ds['time.month'] == month)).values]
        # Concatenate the models
        arr = np.concatenate(ars, axis=0)
        # Save the monthly average and standard deviation
        avg_ars += [np.ma.mean(arr, axis=0)]
        std_ars += [np.ma.std(arr, axis=0)]
    # Combine the arrays and then make the model variable
    # 1st Template an existing variable, then overwrite
    ds[var2use4Ensemble] = ds[var2template].copy()
    ds[var2use4Ensemble].values = np.stack(avg_ars)
    # And repeat for standard deviation
    ds[var2use4std] = ds[var2template].copy()
    ds[var2use4std].values = np.stack(std_ars)
    # Save the list of models used to make ensemble to array
    attrs = ds.attrs.copy()
    attrs['Ensemble_members ({})'.format(var2use4Ensemble)] = ', '.join(topmodels)
    ds.attrs = attrs
    # Save to NetCDF
    if save2NetCDF:
        ds.to_netcdf(filename)
    else:
        return ds
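
An equivalent, more compact way to obtain the same monthly ensemble statistics is to concatenate the chosen model variables along a new 'model' dimension and reduce over it; a sketch, assuming all the model variables share the same time/lat/lon dimensions:

import xarray as xr

def ensemble_mean_and_std(ds, topmodels,
                          var2use4Ensemble='Ensemble_Monthly_mean',
                          var2use4std='Ensemble_Monthly_std'):
    """Add the across-model mean and std dev of the listed predictions to ds."""
    # Stack the chosen model predictions along a new 'model' dimension
    stacked = xr.concat([ds[v].rename('prediction') for v in topmodels],
                        dim='model')
    ds[var2use4Ensemble] = stacked.mean(dim='model')
    ds[var2use4std] = stacked.std(dim='model')
    return ds
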
Example #20
def mk_NetCDF_from_productivity_data():
    """
    Convert productivity .csv file (Behrenfeld and Falkowski, 1997) into a NetCDF file
    """
    # Location of data (update to use public facing host)
    folder = utils.get_file_locations('data_root') + '/Productivity/'
    # Which file to use?
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.csv'
    # Setup coordinates
    lon = np.arange(-180, 180, 1/6.)
    lat = np.arange(-90, 90, 1/6.)
    lat = np.append(lat, [90])
    # Setup time
    varname = 'vgpm'
    months = np.arange(1, 13)
    # Extract data
    df = pd.read_csv(folder+filename, header=None)
    print(df.shape)
    # Extract data by month
    da_l = []
    for n in range(12):
        # Assume the data is in blocks by longitude?
        arr = df.values[:, n*1081: (n+1)*1081].T[None, ...]
        print(arr.shape)
        da_l += [xr.Dataset(
            data_vars={varname: (['time', 'lat', 'lon', ], arr)},
            coords={'lat': lat, 'lon': lon, 'time': [n]})]
    # Concatenate to data xr.Dataset
    ds = xr.concat(da_l, dim='time')
    # Update time ...
    sdate = datetime.datetime(1985, 1, 1)  # Climate model time reference
    ds['time'] = [AC.add_months(sdate, i-1) for i in months]
    # Update to hours since X
    hours = [(AC.dt64_2_dt([i])[0] - sdate).days *
             24. for i in ds['time'].values]
    ds['time'] = hours
    # Add units
    attrs_dict = {'units': 'hours since 1985-01-01 00:00:00'}
    ds['time'].attrs = attrs_dict
    # Add attributes for variable
    attrs_dict = {
        'long_name': "net primary production",
        'units': "mg C / m**2 / day",
    }
    ds[varname].attrs = attrs_dict
    # For latitude...
    attrs_dict = {
        'long_name': "latitude",
        'units': "degrees_north",
        "standard_name": "latitude",
        "axis": "Y",
    }
    ds['lat'].attrs = attrs_dict
    # And longitude...
    attrs_dict = {
        'long_name': "longitude",
        'units': "degrees_east",
        "standard_name": "longitude",
        "axis": "X",
    }
    ds['lon'].attrs = attrs_dict
    # Add extra global attributes
    global_attribute_dictionary = {
        'Title': 'Sea-surface productivity (Behrenfeld and Falkowski, 1997)',
        'Author': 'Tomas Sherwen ([email protected])',
        'Notes': "Data extracted from OCRA and extrapolated to poles by Martin Wadley. NetCDF constructed using xarray (xarray.pydata.org) by Tomas Sherwen. \n NOTES from original site (http://orca.science.oregonstate.edu/) from 'based on the standard vgpm algorithm. npp is based on the standard vgpm, using modis chl, sst4, and par as input; clouds have been filled in the input data using our own gap-filling software. For citation, please reference the original vgpm paper by Behrenfeld and Falkowski, 1997a as well as the Ocean Productivity site for the data.' ",
        'History': 'Last Modified on:' + strftime("%B %d %Y", gmtime()),
        'Conventions': "COARDS",
    }
    ds.attrs = global_attribute_dictionary
    # Save to NetCDF
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.nc'
    ds.to_netcdf(filename, unlimited_dims={'time': True})
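
The 'hours since 1985-01-01 00:00:00' time encoding used above can be reproduced without the AC helpers; a small sketch with illustrative monthly timestamps:

import datetime
import pandas as pd

sdate = datetime.datetime(1985, 1, 1)
# First-of-month timestamps for the climatological year
times = pd.date_range('1985-01-01', periods=12, freq='MS')
# Offsets matching the CF-style units string 'hours since 1985-01-01 00:00:00'
hours = [(t.to_pydatetime() - sdate).total_seconds() / 3600. for t in times]
print(hours[:3])  # [0.0, 744.0, 1416.0]
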
Example #21
def extract_templated_excel_file(limit_depth_to=20,
                                 Data_Key=None,
                                 metadata_df=None,
                                 use_inclusive_limit=False,
                                 file_and_path='./sparse2spatial.rc',
                                 verbose=True,
                                 debug=False):
    """
    Extract an excel file in the iodide template format & return as DataFrame

    Parameters
    -------
    Data_Key (str): unique key for the data file to extract
    metadata_df (pd.DataFrame): DataFrame of metadata for the observational files
    file_and_path (str): folder and filename with location settings as single str
    limit_depth_to (float): depth (m) to limit inclusion of data to
    use_inclusive_limit (bool): limit depth (limit_depth_to) in an inclusive way
    verbose (bool): print verbose statements to screen
    debug (bool): print debugging statements to screen

    Returns
    -------
    (pd.DataFrame)
    """
    # limit_depth_to=20; Data_Key=None; metadata_df=None; debug=False
    # -  Get file details
    # Load metadata file as a DataFrame
    Data_Key_meta = metadata_df[metadata_df.Data_Key == Data_Key]
    # Use TMS updated variable for filename
    #    filename = Data_Key_meta['File name'].values[0]
    filename = Data_Key_meta['File_name_UPDATED'].values[0]
    source = Data_Key_meta['source'].values[0]
    InChance2014 = Data_Key_meta['In Chance2014?'].values[0] == 'Y'
    # - Get directory which contains files
    # Data submitted directly for preparation
    # (as publish by Chance et al (2014) )
    # New data, acquired since 2017
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder = '{}/Iodide/'.format(folder)
    if (not InChance2014):
        folder += '/inputs/new_data/'
    elif ((source == 's') or (source == 'bodc')) and (InChance2014):
        folder += '/inputs/submitted_data/'
    # Data digitised for Chance et al (2014)
    elif (source == 'd') and (InChance2014):
        folder += '/inputs/digitised_data/'
    else:
        print("Source received ('') unknown?!".format(source))
        sys.exit()
    # File specific reading settings?
    read_csv_settings = read_csv_settings_4_data_key_file(Data_Key=Data_Key)
    skiprows, file_extension = read_csv_settings
    # - Read file and process
    if verbose:
        print('reading: {}'.format(filename), Data_Key)
    df = pd.read_excel(folder + filename + file_extension,
                       sheet_name='Data',
                       skiprows=skiprows)
    # Force use of 'Index' column as index to preserve ordering.
    df.index = df['Index'].values
    # Only consider values with a depth value lower than x (e.g. 100m)
    # From Chance et al (2014): On ship-based campaigns, ‘surface’ water is
    # usually collected from an underway pumped seawater inlet
    # (typically at a depth of around 6 m on a 100 m length research
    # ship), and/or sampling bottles mounted on a CTD rosette and
    # closed within a few metres of the sea surface, but during some
    # field campaigns (e.g. winter samples in the Antarctic), only
    # data from 15 m depth was available. In most cases, the water
    # column is thought to be sufficiently homogenous between 0 and
    # 20 m that this choice of depth can be assumed to be representative
    # of concentrations in the top few metres of the water
    # column (see Section 3.4 for a description of the changes in
    # iodine speciation with depth).
    if verbose:
        print(df.columns)
    if use_inclusive_limit:
        df = df.loc[df['Depth'] <=
                    limit_depth_to, :]  # consider values inclusively
    else:
        df = df.loc[df['Depth'] <
                    limit_depth_to, :]  # only consider values less than X
    # Add a column to be a unique identifier and column index
    def get_unique_Data_Key_label(x, Data_Key=Data_Key):
        # Use the index as the number (which now starts from 1)
        x = int(x)
        return '{}_{:0>4}'.format(Data_Key, x)

    # Map to index, then assign to be index
    df['Data_Key_ID'] = df['Index'].map(get_unique_Data_Key_label)
    #    df.index = df['Data_Key_ID']
    # Also add column for data key
    df['Data_Key'] = Data_Key
    return df
Example #22
def get_processed_df_obs_mod(reprocess_params=False, target='CHBr3',
                             filename='s2s_CHBr3_obs_ancillaries.csv',
                             rm_Skagerrak_data=False,
                             file_and_path='./sparse2spatial.rc',
                             verbose=True, debug=False):
    """
    Get the processed observation and model output

    Parameters
    -------
    reprocess_params (bool): re-process the parameterisations?
    target (str): Name of the target variable (e.g. CHBr3)
    filename (str): name of the csv file of processed observational data
    rm_Skagerrak_data (bool): remove the data from the Skagerrak region
    file_and_path (str): folder and filename with location settings as single str
    verbose (bool): print verbose statements
    debug (bool): print debug statements

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries.csv'.format(target)
    df = pd.read_csv(folder+filename, encoding='utf-8')
    # Kludge (temporary) - make Chlorophyll values all floats
#     def mk_float_or_nan(input):
#         try:
#             return float(input)
#         except:
#             return np.nan
#     df['SeaWIFs_ChlrA'] = df['SeaWIFs_ChlrA'].map(mk_float_or_nan)
    # Add ln of iodide too
#    df['ln(Iodide)'] = df['Iodide'].map(np.ma.log)
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    # Add a flag for coastal values
#     coastal_flagged = 'coastal_flagged'
#     if coastal_flagged not in df.columns:
#         df = get_coastal_flag(df=df)
    # Make sure month is numeric (if not given)
#     month_var = 'Month'
#     NaN_months_bool = ~np.isfinite(df[month_var].values)
#     NaN_months_df = df.loc[NaN_months_bool, :]
#     N_NaN_months = NaN_months_df.shape[0]
#     if N_NaN_months > 1:
#         print_str = 'DataFrame contains NaNs for {} months - '
#         print_str += 'Replacing these with month # 3 months '
#         print_str += 'before (hemispheric) summer solstice'
#         if verbose:
#             print(print_str.format(N_NaN_months))
#         NaN_months_df[month_var] = NaN_months_df.apply(lambda x:
#                                                        set_backup_month_if_unkonwn(
#                                                            lat=x['Latitude'],
#                                                            #main_var=var2use,
#                                                            #var2use=var2use,
#                                                            #
#                                                            #Data_key_ID_=Data_key_ID_,
#                                                            debug=False), axis=1)
#         # Add back into DataFrame
#         df.loc[NaN_months_bool, month_var] = NaN_months_df[month_var].values
    # Re-process the parameterisations (Chance et al etc + ensemble)?
#     if reprocess_params:
#                 # Add predictions from literature
#         df = get_literature_predicted_iodide(df=df)
#         # Add ensemble prediction
#         df = get_ensemble_predicted_iodide(
#             rm_Skagerrak_data=rm_Skagerrak_data
#         )
    return df
Example #23
def get_stats_on_spatial_predictions_4x5_2x25(res='4x5', ex_str='', target='Iodide',
                                              use_annual_mean=True, filename=None,
                                              folder=None, just_return_df=False,
                                              var2template='Chance2014_STTxx2_I',
                                              ):
    """
    Evaluate the spatial predictions between models at a resolution of 4x5 or 2x2.5

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    res (str): horizontal resolution of dataset (e.g. 4x5)
    ex_str (str): extra string to include in the file name used to save data
    use_annual_mean (bool): use the annual mean of the variable
    filename (str): name of the NetCDF file of predicted data
    folder (str): folder where the NetCDF of predicted data is located
    just_return_df (bool): just return the data as a dataframe
    var2template (str): variable to use as a template for making new variables in ds

    Returns
    -------
    (pd.DataFrame, only if just_return_df=True; otherwise None)
    """
    # If filename or folder not given, then use defaults
    if isinstance(filename, type(None)):
        filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
    if isinstance(folder, type(None)):
        data_root = utils.get_file_locations('data_root')
        folder = '{}/{}/outputs/'.format(data_root, target)
    ds = xr.open_dataset(folder + filename)
    # variables to consider
    vars2plot = list(ds.data_vars)
    # add LWI and surface area to array
    ds = utils.add_LWI2array(ds=ds, var2template=var2template)
    IS_WATER = ds['IS_WATER'].mean(dim='time')
    # -- get general annual stats in a dataframe
    df = pd.DataFrame()
    for var_ in vars2plot:
        ds_tmp = ds[var_].copy()
        # take annual average
        if use_annual_mean:
            ds_tmp = ds_tmp.mean(dim='time')
        # mask to only consider (100%) water boxes
        arr = ds_tmp.values
        arr = arr[(IS_WATER == True)]
        # save to dataframe
        df[var_] = pd.Series(arr.flatten()).describe()
    # Get area weighted mean
    vals = []
    for var_ in vars2plot:
        ds_tmp = ds[var_]
        # take annual average
        if use_annual_mean:
            ds_tmp = ds_tmp.mean(dim='time')
        # mask to only consider (100%) water boxes
        mask = ~(IS_WATER == True).values
        arr = np.ma.array(ds_tmp.values, mask=mask)
        # also mask the surface area (added to ds by add_LWI2array)
        s_area_tmp = np.ma.array(ds['AREA'].values, mask=mask)
        # save value
        vals += [AC.get_2D_arr_weighted_by_X(arr, s_area=s_area_tmp)]
    # Add area weighted mean to df
    df = df.T
    df['mean (weighted)'] = vals
    df = df.T
    # Save or just return the values
    file_save = 'Oi_prj_annual_stats_global_ocean_{}{}.csv'.format(res, ex_str)
    if just_return_df:
        return df
    df.T.to_csv(file_save)
Example #24
def add_all_Chance2014_correlations(df=None, debug=False, verbose=False):
    """
    Add Chance et al 2014 parameterisations to df (from processed .csv)
    """
    # get details of parameterisations
    #    filename='Chance_2014_Table2_PROCESSED_17_04_19.csv'
    filename = 'Chance_2014_Table2_PROCESSED.csv'
    folder = utils.get_file_locations('data_root')
    folder += '/Iodide/'
    param_df = pd.read_csv(folder + filename)
    # map input variables
    input_dict = {
        'C': 'WOA_TEMP',
        'ChlorA': 'SeaWIFs_ChlrA',
        'K': 'WOA_TEMP_K',
        'Lat': 'Latitude',
        'MLDpd': 'WOA_MLDpd',
        'MLDpt': 'WOA_MLDpt',
        'MLDvd': 'WOA_MLDvd',
        'MLDpd_max': 'WOA_MLDpd_max',
        'MLDpt_max': 'WOA_MLDpt_max',
        'MLDvd_max': 'WOA_MLDvd_max',
        'MLDpd_sum': 'WOA_MLDpd_sum',
        'MLDpt_sum': 'WOA_MLDpt_sum',
        'MLDvd_sum': 'WOA_MLDvd_sum',
        'NO3': 'WOA_Nitrate',
        'Salinity': 'WOA_Salinity',
    }
    # - Loop parameterisations and add to dataframe
    for param in param_df['TMS ID'].values:
        sub_df = param_df[param_df['TMS ID'] == param]
        if debug:
            print(sub_df)
        # extract variables
        data = df[input_dict[sub_df.param.values[0]]].values
        #  Function to use?
        func2use = str(sub_df.function.values[0])
        if debug:
            print(func2use)
        # Do any functions on the data
        if func2use == 'None':
            pass
        elif func2use == 'abs':
            data = abs(data)
        elif func2use == 'inverse':
            data = 1. / data
        elif func2use == 'square':
            data = data**2
#        elif func2use == 'max':
#            print 'Need to add max option!'
#        elif func2use == 'sum':
#            print 'Need to add sum option!'
        else:
            print('function not in list')
            sys.exit()
        # Apply linear scaling: data = (m * data) + c
        m, c = [sub_df[i].values[0] for i in ['m', 'c']]
        data = (m * data) + c
        # Now add the parameterised values to the dataframe
        df[param] = data
    return df
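
Each parameterisation above is an optional transform of the driver variable followed by a linear scaling, data = (m * data) + c; a tiny worked illustration, where the m and c values are made up for the example rather than the published Chance et al. (2014) coefficients:

import numpy as np

# Hypothetical parameterisation: iodide ~ m * (1/SST_K) + c
func2use, m, c = 'inverse', 1.0e5, -300.0
sst_kelvin = np.array([278.0, 288.0, 298.0])
data = sst_kelvin.copy()
if func2use == 'inverse':
    data = 1. / data
predicted = (m * data) + c
print(predicted.round(1))  # [59.7 47.2 35.6]
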
Example #25
def get_core_Chance2014_obs(debug=False, file_and_path='./sparse2spatial.rc'):
    """
    Get core observation data from Chance2014

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    debug (bool): print debugging to screen

    Returns
    -------
    (pd.DataFrame)

    Notes
    -----
     - This assumes that core data is "surface data" above 20m
     - Only considers rows of the csv where there is iodide data.
    """
    # - Get file observational file
    # Directory to use?
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder = '{}/Iodide/inputs/'.format(folder)
    # Filename for <20m iodide data?
    filename = 'Iodide_data_above_20m.csv'
    # Open data as DataFrame
    df = pd.read_csv(folder + filename)
    # - Process the input observational data
    # list of core variables
    core_vars = [
        'Ammonium',
        'Chl-a',
        'Cruise',
        'Data_Key',
        'Data_Key_ID',
        'Date',
        'Day',
        'Depth',
        'Iodate',
        'Iodide',
        'Latitude',
        'Longitude',
        'MLD',
        'Month',
        'MLD(vd)',
        'Nitrate',
        'Nitrite',
        'O2',
        'Organic-I',
        'Salinity',
        'Station',
        'Temperature',
        'Time',
        'Total-I',
        'Unique id',
        'Year',
        u'Method',
        u'ErrorFlag',
    ]
    # 2nd iteration excludes 'MLD(vd)', so remove this.
    core_vars.pop(core_vars.index('MLD(vd)'))
    # The 2nd iteration includes new flag columns, so add these.
    core_vars += [
        'Coastal',
        'LocatorFlag',
        'Province',
    ]
    # Just select core variables
    df = df[core_vars]

    # Remove datapoints that are not numeric
    def make_sure_values_are_floats(x):
        """
        Some values in the dataframe are strings (e.g. "nd" or "###?"); coerce these to NaN
        """
        try:
            x = float(x)
        except (TypeError, ValueError):
            x = np.NaN
        return x

    # TODO: Make this more pythonic
    make_data_floats = [
        'Ammonium', 'Chl-a', 'Iodate', 'Iodide', 'Latitude', 'Longitude',
        'MLD', 'MLD(vd)', 'Month', 'Nitrate', 'Nitrite', 'O2', 'Organic-I',
        'Salinity', 'Temperature', 'Total-I'
    ]
    # 2nd iteration excludes 'MLD(vd)', so remove this.
    make_data_floats.pop(make_data_floats.index('MLD(vd)'))
    # The 2nd iteration includes new flag columns, so add these.
    make_data_floats += [
        'Coastal',
        'LocatorFlag',
        'Province',
    ]
    # v8.4 had further updates.
    make_data_floats += [
        'ErrorFlag',
    ]
    for col in make_data_floats:
        df[col] = df[col].map(make_sure_values_are_floats)[:]
    # Only consider rows where there is iodide data (of the <20 m values, N=930)
    if debug:
        print('I- df shape (inc. NaNs): {}'.format(str(df.shape)))
    df = df[np.isfinite(df['Iodide'])]
    if debug:
        print("I- df post rm'ing NaNs: {}".format(str(df.shape)))
    return df
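# --- Illustrative sketch (not part of the original example) ---
# The per-element float coercion used above can also be expressed in a
# vectorised form with pandas; strings such as "nd" or "###?" become NaN.
# The helper name is an illustrative assumption.
import pandas as pd

def coerce_columns_to_float_sketch(df, columns):
    """Convert the listed columns to floats, mapping unparsable values to NaN"""
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df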
Exemplo n.º 26
0
def Hyperparameter_Tune_model(use_choosen_model=True,
                              model=None,
                              RFR_dict=None,
                              df=None,
                              cv=3,
                              testset='Test set (strat. 20%)',
                              target='Iodide',
                              features_used=None,
                              model_name=None,
                              save_best_estimator=True):
    """
    Driver to tune the hyperparameters of a model

    Parameters
    -------
    testset (str): Testset to use, e.g. stratified sampling over quartiles for 20%:80%
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of core variables and data
    model_name (str): name of model to tune performance of
    features_used (list): list of the features within the model_name model
    save_best_estimator (bool): save the best performing model offline
    model (RandomForestRegressor): Random Forest Regressor model to tune
    cv (int): number of folds of cross-validation to use

    Returns
    -------
    (RandomForestRegressor)
    """
    from sklearn.externals import joblib
    from sklearn.ensemble import RandomForestRegressor
    # Get data to test
    if isinstance(df, type(None)):
        #        df = get_dataset_processed4ML()
        df = RFR_dict['df']

    # Use the model selected from the feature testing
    if use_choosen_model:
        assert_str = "A model should not be passed when use_choosen_model is selected!"
        assert isinstance(model, type(None)), assert_str
        # select a single chosen model
        mdict = get_choosen_model_from_features_selection()
        features_used = mdict['features_used']
        model = mdict['model']
        model_name = mdict['name']

    # - extract training dataset
    test_set = df.loc[df[testset] == True, :]
    train_set = df.loc[df[testset] == False, :]
    # also sub select all vectors for input data
    # ( Making sure to remove the target!!! )
    train_features = df[features_used].loc[train_set.index]
    train_labels = df[[target]].loc[train_set.index]
    test_features = df[features_used].loc[test_set.index]
    test_labels = df[[target]].loc[test_set.index]

    # - Make the base model for comparisons
    base_model = RandomForestRegressor(n_estimators=10,
                                       random_state=42,
                                       criterion='mse')
    base_model.fit(train_features, train_labels)
    quick_model_evaluation(base_model, test_features, test_labels)

    # - First make an initial exploration of the parameter space
    rf_random = Use_RS_CV_to_explore_hyperparams(cv=cv,
                                                 train_features=train_features,
                                                 train_labels=train_labels,
                                                 features_used=features_used)
    # Check the performance by Random searching (RandomizedSearchCV)
    best_random = rf_random.best_estimator_
    best_params_ = rf_random.best_params_
    print(rf_random.best_params_)
    quick_model_evaluation(best_random, test_features, test_labels)

    # - Now do a more focused optimisation
    # get the parameters based on the RandomizedSearchCV output
    param_grid = define_hyperparameter_options2test(
        features_used=features_used,
        best_params_=best_params_,
        param_grid_RandomizedSearchCV=True)
    # Use GridSearchCV
    grid_search = use_GS_CV_to_tune_Hyperparams(
        cv=cv,
        train_features=train_features,
        param_grid=param_grid,
        train_labels=train_labels,
        features_used=features_used,
    )
    print(grid_search.best_params_)
    # Check the performance of the grid search
    BEST_ESTIMATOR = grid_search.best_estimator_
    quick_model_evaluation(BEST_ESTIMATOR, test_features, test_labels)

    # Save the best estimator now for future use
    if save_best_estimator:
        data_root = utils.get_file_locations('data_root')
        folder = '{}/{}/models/LIVE/OPTIMISED_MODELS/'.format(
            data_root, target)
        model_savename = "my_model_{}.pkl".format(model_name)
        joblib.dump(BEST_ESTIMATOR, folder + model_savename)
    # Return the best estimator (whether or not it was also saved to disk)
    return BEST_ESTIMATOR
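# --- Illustrative sketch (not part of the original example) ---
# Use_RS_CV_to_explore_hyperparams is defined elsewhere in the package and
# not shown here. A minimal sketch of the random-search stage it performs,
# assuming a standard scikit-learn RandomizedSearchCV over a
# RandomForestRegressor, could look like this; the parameter ranges are
# illustrative only.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

def explore_hyperparams_randomly_sketch(train_features, train_labels, cv=3,
                                        n_iter=50, random_state=42):
    """Random search over a broad RandomForestRegressor parameter space"""
    param_distributions = {
        'n_estimators': [int(i) for i in np.linspace(10, 500, 10)],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [int(i) for i in np.linspace(10, 110, 11)] + [None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }
    rf_random = RandomizedSearchCV(
        RandomForestRegressor(random_state=random_state),
        param_distributions=param_distributions,
        n_iter=n_iter, cv=cv, n_jobs=-1, random_state=random_state)
    rf_random.fit(train_features, np.ravel(train_labels))
    return rf_random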
Exemplo n.º 27
0
def mk_predictions_NetCDF_4_many_builds(model2use,
                                        res='4x5',
                                        models_dict=None,
                                        features_used_dict=None,
                                        RFR_dict=None,
                                        target='Iodide',
                                        stats=None,
                                        plot2check=False,
                                        rm_Skagerrak_data=False,
                                        debug=False):
    """
    Make a NetCDF file of predicted variables for a given resolution

    Parameters
    -------
    model2use (str): name of the model to use
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of core variables and data
    res (str): horizontal resolution of dataset (e.g. 4x5)
    features_used_dict (dict): dictionary of feature variables in models
    plot2check (bool): make a quick plot to check the prediction
    models_dict (dict): dictionary of RFR models and their names
    stats (pd.DataFrame): dataframe of statistics on models in models_dict
    rm_Skagerrak_data (bool): remove the Skagerrak data
    (the above argument is an iodide-specific option)
    debug (bool): print out debugging output?

    Returns
    -------
    (None)
    """
    from sklearn.externals import joblib
    import gc
    import glob
    # - local variables
    # extract the models...
    if isinstance(RFR_dict, type(None)):
        RFR_dict = build_or_get_models(rm_Skagerrak_data=rm_Skagerrak_data)
    # Get the variables required here
    if isinstance(features_used_dict, type(None)):
        features_used_dict = RFR_dict['features_used_dict']
    # Set the extr_str if rm_Skagerrak_data set to True
    if rm_Skagerrak_data:
        extr_str = '_No_Skagerrak'
    else:
        extr_str = ''
    # Get the location of the gridded feature variables and open them
    data_root = utils.get_file_locations('data_root')
    folder = data_root + '/data/'
    filename = 'Oi_prj_feature_variables_{}.nc'.format(res)
    dsA = xr.open_dataset(folder + filename)
    # Get the location of the ensemble builds of the models
    # (built from data_root, consistent with the other model folders)
    folder_str = '{}/{}/models/LIVE/ENSEMBLE_REPEAT_BUILD{}/'
    folder = folder_str.format(data_root, target, extr_str)
    # - Make a dataset for each model
    ds_l = []
    # Get list of twenty models built
    models_str = folder + '*{}*.pkl'.format(model2use)
    builds4model = glob.glob(models_str)
    print(builds4model, models_str)
    # Print a string to debug the output
    db_str = "Found {} saved models for '{} - glob str:{}'"
    print(db_str.format(len(builds4model), model2use, models_str))
    # Get the numbers for the models in directory
    b_modelnames = [i.split('my_model_')[-1][:-3] for i in builds4model]
    # Check the number of models selected
    ast_str = "There aren't models for {} in {}"
    assert len(b_modelnames) > 1, ast_str.format(model2use, folder)
    # Now loop by model built for ensemble member and predict values
    for n_modelname, b_modelname in enumerate(b_modelnames):
        # Load the model
        model = joblib.load(builds4model[n_modelname])
        # Get testing features
        features_used = features_used_dict[model2use].split('+')
        # Make a DataSet of predicted values
        ds_l += [
            mk_da_of_predicted_values(model=model,
                                      res=res,
                                      dsA=dsA,
                                      modelname=b_modelname,
                                      features_used=features_used)
        ]
        # Force local tidy of garbage
        gc.collect()
    # Combine datasets
    ds = xr.merge(ds_l)
    # - Also get values for existing parameterisations
    if target == 'Iodide':
        # Chance et al. (2014) parameterisation
        param = u'Chance2014_STTxx2_I'
        arr = utils.calc_I_Chance2014_STTxx2_I(dsA['WOA_TEMP'].values)
        ds[param] = ds[b_modelname]  # use existing array as dummy to fill
        ds[param].values = arr
        # MacDonald et al. (2014) parameterisation
        param = 'MacDonald2014_iodide'
        arr = utils.calc_I_MacDonald2014(dsA['WOA_TEMP'].values)
        ds[param] = ds[b_modelname]  # use existing array as dummy to fill
        ds[param].values = arr
    # Do a test diagnostic plot?
    if plot2check:
        for var_ in ds.data_vars:
            # Do a quick plot to check
            arr = ds[var_].mean(dim='time')
            AC.map_plot(arr, res=res)
            plt.title(var_)
            plt.show()
    # Save to NetCDF
    save_name = 'Oi_prj_predicted_{}_{}_ENSEMBLE_BUILDS_{}_{}.nc'
    ds.to_netcdf(save_name.format(target, res, model2use, extr_str))
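# --- Illustrative sketch (not part of the original example) ---
# mk_da_of_predicted_values is defined elsewhere in the package; a minimal
# sketch of the operation it performs (flattening the gridded feature
# variables, predicting, and reshaping back onto the grid) might look like
# the following. It assumes every feature in features_used is a variable in
# dsA on the ('time', 'lat', 'lon') dimensions; the helper name is an
# illustrative assumption.
import numpy as np
import xarray as xr

def predict_on_grid_sketch(model, dsA, features_used, varname='prediction'):
    """Predict with a fitted model onto the (time, lat, lon) grid of dsA"""
    # Stack the gridded features into a 2D (sample, feature) array
    stacked = dsA[features_used].to_array(dim='feature')
    stacked = stacked.stack(sample=('time', 'lat', 'lon'))
    stacked = stacked.transpose('sample', 'feature')
    predictions = model.predict(stacked.values)
    # Reshape back onto the original grid and wrap as a DataArray
    template = dsA[features_used[0]].transpose('time', 'lat', 'lon')
    arr = predictions.reshape(template.shape)
    return xr.DataArray(arr, coords=template.coords, dims=template.dims,
                        name=varname)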
Exemplo n.º 28
0
def process_MLD_csv2NetCDF(debug=False, _fill_value=-9999.9999E+10):
    """
    Process NOAA WOA94 csv files into NetCDF files

    Parameters
    -------
    _fill_value (float): fill value to use for new NetCDF
    debug (bool): perform debugging and verbose printing?

    Returns
    -------
    (xr.Dataset)
    """
    # The MLD fields available are computed from climatological monthly mean
    # profiles of potential temperature and potential density based on three
    # different criteria: a temperature change from the ocean surface of 0.5
    # degree Celsius, a density change from the ocean surface of 0.125
    # (sigma units), and a variable density change from the ocean surface
    # corresponding to a temperature change of 0.5 degree Celsius. The MLD
    # based on the variable density criterion is designed to account for the
    # large variability of the coefficient of thermal expansion that
    # characterizes seawater.
    # Citation: Monterey, G. and Levitus, S., 1997: Seasonal Variability of
    # Mixed Layer Depth for the World Ocean. NOAA Atlas NESDIS 14, U.S.
    # Gov. Printing Office, Wash., D.C., 96 pp. 87 figs. (pdf, 13.0 MB).
    # MLD variables to process
    MLD_vars = ['pt', 'pd', 'vd']
    folder = utils.get_file_locations('data_root') + '/WOA94/'
    # - Loop MLD variables
    for var_ in MLD_vars:
        file_str = 'mld*{}*'.format(var_)
        files = sorted(glob.glob(folder+file_str))
        print(files)
        # Loop files and extract data as an array
        ars = []
        for file in files:
            # Values are assumed to have been output in a row-major way
            # e.g. (lon, lat)
            # Open the file and read it line by line
            with open(file, 'rb') as file_:
                # Extract all values
                lines = [i.split() for i in file_]
                # Convert to floats (masked values, e.g. "-", become NaN),
                # then concatenate into a single "big" list
                big = []
                for n, line in enumerate(lines):
                    for value in line:
                        try:
                            value = float(value)
                        except ValueError:
                            value = np.NaN
                        big += [value]
            # Now reshape
            ars += [np.ma.array(big).reshape((180, 360)).T]
            # Debug by plotting the 2D grid that was just read
            if debug:
                plt.pcolor(np.arange(0, 360), np.arange(0, 180), ars[-1].T)
                plt.colorbar()
                plt.show()
        # Force to be in COARDS format? (e.g. lat, lon) instead of (lon, lat)
        ars = [i.T for i in ars]
        # Fill nans with _fill_value,
        ars = [np.ma.filled(i, fill_value=_fill_value) for i in ars]
        # Then convert to numpy array...
        ars = [np.array(i) for i in ars]
        print([type(i) for i in ars])
        # Force dates
        dates = [datetime.datetime(1985, 1, i+1) for i in range(12)]
        lons = np.arange(0+0.5, 360+0.5, 1)
        lats = np.arange(-90+0.5, 90+0.5, 1)
        res = '1x1'
        # Save to NetCDF
        AC.save_2D_arrays_to_3DNetCDF(ars=ars, dates=dates, varname=var_,
                                      res=res,
                                      filename='WOA94_MLD_1x1_{}'.format(var_),
                                      lons=lons,
                                      lats=lats)
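# --- Illustrative sketch (not part of the original example) ---
# AC.save_2D_arrays_to_3DNetCDF comes from the AC_tools package; an
# xarray-based sketch of the same idea (stacking monthly 2D fields along a
# time axis and writing them to a NetCDF file) is given below as an
# illustration rather than that package's implementation.
import numpy as np
import xarray as xr

def save_2D_arrays_as_3D_netcdf_sketch(ars, dates, lats, lons, varname,
                                       filename):
    """Stack a list of 2D (lat, lon) arrays along time and write to NetCDF"""
    data = np.stack(ars, axis=0)  # shape: (time, lat, lon)
    da = xr.DataArray(data,
                      coords={'time': dates, 'lat': lats, 'lon': lons},
                      dims=('time', 'lat', 'lon'),
                      name=varname)
    da.to_dataset().to_netcdf(filename + '.nc')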
Exemplo n.º 29
0
def Hyperparameter_Tune4choosen_models(RFR_dict=None,
                                       target='Iodide',
                                       cv=7,
                                       testset='Test set (strat. 20%)'):
    """
    Driver to tune multiple RFR models

    Parameters
    -------
    testset (str): Testset to use, e.g. stratified sampling over quartiles for 20%:80%
    cv (int): number of folds of cross-validation to use
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of models, data and shared variables

    Returns
    -------
    (None)
    """
    from sklearn.externals import joblib
    # Get the data for the models
    if isinstance(RFR_dict, type(None)):
        RFR_dict = build_or_get_models()
    # Set models to optimise
    models2compare = get_top_models(RFR_dict=RFR_dict,
                                    vars2exclude=['DOC', 'Prod'])
    # Get variables needed from core dictionary
    features_used_dict = RFR_dict['features_used_dict']
    models_dict = RFR_dict['models_dict']
    # Set folder to use for optimised models
    data_root = utils.get_file_locations('data_root')
    folder = '{}/{}/models/LIVE/OPTIMISED_MODELS/'.format(data_root, target)
    # Loop and save optimised model
    # NOTE: this could be sped up by using more cores
    for model_name in models2compare:
        print('Optimising model: {}'.format(model_name))
        # Get model
        model = models_dict[model_name]
        # get testing features
        features_used = features_used_dict[model_name].split('+')
        # Tune parameters
        BE = Hyperparameter_Tune_model(model=model,
                                       use_choosen_model=False,
                                       save_best_estimator=True,
                                       model_name=model_name,
                                       RFR_dict=RFR_dict,
                                       features_used=features_used,
                                       cv=cv)

    # - Test the tuned models against the test set
    test_the_tuned_models = False
    if test_the_tuned_models:
        # Get the core data
        df = RFR_dict['df']
        # Get the data
        test_set = df.loc[df[testset] == True, :]
        train_set = df.loc[df[testset] == False, :]
        # Test the improvements in the optimised models?
        for model_name in models2compare:
            # - Get existing model
            model = models_dict[model_name]
            # Get testing features
            features_used = features_used_dict[model_name].split('+')
            # -  Get the data
            # ( Make sure to remove the target )
            #        train_features = df[features_used].loc[ train_set.index  ]
            #        train_labels = df[[target]].loc[ train_set.index  ]
            test_features = df[features_used].loc[test_set.index]
            test_labels = df[[target]].loc[test_set.index]
            # - test the existing model
            print(' ---------------- ' * 3)
            print(' ---------------- {}: '.format(model_name))
            print(' - Base values: ')
            quick_model_evaluation(model, test_features, test_labels)
            # - Get optimised model
            try:
                model_savename = "my_model_{}.pkl".format(model_name)
                OPmodel = joblib.load(folder + model_savename)
                print(' - Optimised values: ')
                quick_model_evaluation(OPmodel, test_features, test_labels)
            except IOError:
                # No optimised model has been saved for this model, so skip it
                pass
        # - Test the tuned models against the training set
        # Get the core data
        df = RFR_dict['df']
        # get the data
        test_set = df.loc[df[testset] == True, :]
        train_set = df.loc[df[testset] == False, :]
        # Test the improvements in the optimised models?
        for model_name in models2compare:
            # - Get existing model
            model = models_dict[model_name]
            # get testing features
            features_used = features_used_dict[model_name].split('+')
            # -  Get the data
            # ( Making sure to remove the target!!! )
            train_features = df[features_used].loc[train_set.index]
            train_labels = df[[target]].loc[train_set.index]
            #            test_features = df[features_used].loc[ test_set.index ]
            #            test_labels = df[[target]].loc[ test_set.index ]
            # - test the existing model
            print(' ---------------- ' * 3)
            print(' ---------------- {}: '.format(model_name))
            print(' - Base values: ')
            quick_model_evaluation(model, train_features, train_labels)
            # - Get optimised model
            try:
                model_savename = "my_model_{}.pkl".format(model_name)
                OPmodel = joblib.load(folder + model_savename)
                print(' - Optimised values: ')
                quick_model_evaluation(OPmodel, train_features, train_labels)
            except IOError:
                # No optimised model has been saved for this model, so skip it
                pass
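# --- Illustrative sketch (not part of the original example) ---
# quick_model_evaluation is referenced throughout but defined elsewhere in
# the package; a minimal sketch of such an evaluation, reporting RMSE and
# R^2 on a held-out feature/label set, could be as simple as the following.
# The helper name is an illustrative assumption.
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def quick_model_evaluation_sketch(model, features, labels):
    """Print RMSE and R^2 for a fitted model on the given features/labels"""
    predictions = model.predict(features)
    labels = np.ravel(labels)
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    r2 = r2_score(labels, predictions)
    print('RMSE: {:.2f}, R^2: {:.3f}'.format(rmse, r2))
    return rmse, r2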
Exemplo n.º 30
0
def get_iodide_obs(just_use_submitted_data=False,
                   use_Chance2014_core_data=True,
                   analyse_iodide_values2drop=False,
                   process_new_iodide_obs_file=False,
                   file_and_path='./sparse2spatial.rc',
                   limit_depth_to=20,
                   verbose=True,
                   debug=False):
    """
    Extract iodide observations from the (re-formated) file from Chance2014

    Parameters
    -------
    just_use_submitted_data (bool): just use the data submitted for Chance et al 2014
    use_Chance2014_core_data (bool): just use the core data in Chance2014's analysis
    analyse_iodide_values2drop (bool): check which values should be removed
    process_new_iodide_obs_file (bool): make a new iodide obs. file?
    file_and_path (str): folder and filename with location settings as single str
    limit_depth_to (float): depth (m) to limit inclusion of data to
    verbose (bool): print verbose statements to screen
    debug (bool): print debugging statements to screen

    Returns
    -------
    (pd.DataFrame)

    Notes
    -----
    """
    # What is the location of the iodide data?
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/Iodide/inputs/'
    # Name to save file as
    filename = 'Iodide_data_above_20m.csv'
    # - Get metadata (and keep it as a separate DataFrame)
    metadata_df = get_iodide_obs_metadata()
    # Process new iodide obs. (data) file?
    if process_new_iodide_obs_file:
        # - Extract data?
        # To test processing... just use submitted data?
        if just_use_submitted_data:
            Data_Keys = metadata_df['Data_Key'][metadata_df['source'] == 's']
            print(Data_Keys)
            # Add bodc data
            bool_ = metadata_df['source'] == 'bodc'
            bodc_Data_Keys = metadata_df['Data_Key'].loc[bool_]
            Data_Keys = list(Data_Keys)
            bodc_Data_Keys = list(bodc_Data_Keys)
            print(bodc_Data_Keys)
            Data_Keys = Data_Keys + bodc_Data_Keys
            print(Data_Keys)
        else:  # use all data
            Data_Keys = metadata_df['Data_Key']
        # - Loop by the datasets ("Data_Keys")
        # Setup list to store dataframes
        dfs = []
        # Loop data keys for sites
        for n_Data_Key, Data_Key in enumerate(Data_Keys):
            pcent = float(n_Data_Key) / len(Data_Keys) * 100
            if verbose:
                print(n_Data_Key, Data_Key, pcent)
            # Extract data
            df = extract_templated_excel_file(Data_Key=Data_Key,
                                              metadata_df=metadata_df,
                                              limit_depth_to=limit_depth_to)
            # Save to list
            dfs += [df]
        # Combine dataframes.
        main_df = pd.concat(dfs)
        # Analyse the datapoints that are being removed.
        if analyse_iodide_values2drop:
            # Loop indexes and save out values that are "odd"
            ind2save = []
            tmp_var = 'temp #'
            main_df[tmp_var] = np.arange(main_df.shape[0])
            for ind in main_df[tmp_var].values:
                df_tmp = main_df.loc[main_df[tmp_var] == ind, :]
                try:
                    pd.to_numeric(df_tmp['Iodide'])
                except (ValueError, TypeError):
                    ind2save += [ind]
        # Make sure core values are numeric
        core_numeric_vars = [
            u'Ammonium', u'Chl-a', u'Depth', u'Iodate', u'Iodide', u'Latitude',
            u'Longitude', u'Nitrate', u'Nitrite', u'O2', u'Organic-I',
            u'Salinity', u'Total-I', u'Temperature', u'\u03b4Ammonium',
            u'\u03b4Chl-a', u'\u03b4Iodate', u'\u03b4Iodide', u'\u03b4Nitrate',
            u'\u03b4Nitrite', u'\u03b4Org-I', u'\u03b4Total-I'
        ]
        for var in core_numeric_vars:
            main_df[var] = pd.to_numeric(main_df[var].values, errors='coerce')
        # Save to disk
        main_df.to_csv(folder + filename, encoding='utf-8')
    # - Just use existing file
    else:
        try:
            # Just open existing file
            if use_Chance2014_core_data:
                main_df = get_core_Chance2014_obs()
            else:
                main_df = pd.read_csv(folder + filename, encoding='utf-8')
        except IOError:
            print('Error opening the processed iodide data file')
    # Return DataFrames
    return main_df, metadata_df
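# --- Illustrative usage sketch (not part of the original example) ---
# A minimal way to use the function above: load the processed iodide
# observations (and their metadata) and inspect a few core columns. This
# assumes the module-level imports and settings file used throughout the
# examples in this listing.
obs_df, metadata_df = get_iodide_obs(use_Chance2014_core_data=True)
print(obs_df.shape, metadata_df.shape)
print(obs_df[['Latitude', 'Longitude', 'Iodide']].describe())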