def CPPA_precursor_regions(path_data, keys_options=None):
    #%%
    '''
    Build, per keys-option and per train/test split, the list of precursor
    column names to use from the dataframe stored at ``path_data``.

    Parameters
    ----------
    path_data : str
        Path to an hdf5 file containing 'df_data' (multi-index: split, time).
    keys_options : list of str, optional
        Any of 'robust', 'CPPA', 'PEP'. Defaults to ['CPPA'].
        (Changed from a mutable default argument to a None sentinel.)

    Returns
    -------
    keys_d : dict
        keys_d[option][split] = np.array of unique column names.
    '''
    if keys_options is None:
        keys_options = ['CPPA']
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    # bookkeeping columns that are never precursors
    skip = ['TrainIsTrue', 'RV_mask']
    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if option == 'robust':
                # fix: removed duplicated entries ('ENSO_34' and 'PDO' were
                # each listed twice in the original)
                not_robust = ['0_101_PEPspatcov', 'PDO', 'ENSO_34']
                all_keys = df_data.loc[s].columns[1:]
                all_keys = [k for k in all_keys if k not in skip]
                all_keys = [k for k in all_keys if k not in not_robust]
                # region columns look like '{lag}_{region}_{name}'; keep only
                # the regions judged robust plus all non-region columns.
                # NOTE(review): '0_100_CPPAspatcov' can never equal
                # k.split('_')[1]; if the spatcov column is meant to be kept,
                # '100' is likely the intended entry — confirm.
                robust = ['0_100_CPPAspatcov', '2', '7', '9']
                sst_regs = [k for k in all_keys if len(k.split('_')) == 3]
                other = [k for k in all_keys if len(k.split('_')) != 3]
                keys_ = [k for k in sst_regs if k.split('_')[1] in robust]
                # fix: use extend() instead of a side-effecting list comprehension
                keys_.extend(other)
            elif option == 'CPPA':
                not_robust = ['0_101_PEPspatcov', '0_104_PDO', '0_103_ENSO34',
                              'ENSO_34', 'PDO', '0_900_ENSO34', '0_901_PDO']
                all_keys = df_data.loc[s].columns[1:]
                all_keys = [k for k in all_keys if k not in skip]
                all_keys = [k for k in all_keys if k not in not_robust]
                keys_ = all_keys
            elif option == 'PEP':
                all_keys = df_data.loc[s].columns[1:]
                all_keys = [k for k in all_keys if k not in skip]
                keys_ = [k for k in all_keys
                         if k.split('_')[-1] == 'PEPspatcov']
            keys_d_[s] = np.unique(keys_)
        keys_d[option] = keys_d_
    #%%
    return keys_d
def import_precur_ts(import_prec_ts, df_splits, to_freq, start_end_date, start_end_year):
    '''
    Load externally computed precursor timeseries and align them with the
    train/test splits of the current experiment.

    import_prec_ts has format tuple (name, path_data).

    Returns
    -------
    df_data_ext : pd.DataFrame
        Multi-index (split, time) dataframe with all external timeseries
        merged column-wise.
    '''
    splits = df_splits.index.levels[0]
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for name, path_data in import_prec_ts:
        for s in range(splits.size):
            # skip first col because it is the RV ts
            df_data_e = func_fc.load_hdf5(path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_e.dtypes == 'float64',
                                    df_data_e.dtypes == 'float32')
            cols_ext = list(df_data_e.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'; rewrite columns whose
            # middle token is not an integer label, numbering from 100 up
            lab_int = 100
            # fix: inner loop variable renamed (previously shadowed the outer
            # enumerate index)
            for j, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[j] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            df_data_ext_s[s] = df_data_e[cols_ext]
            # infer native frequency from the first two timestamps
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            if to_freq != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(df_data_ext_s[s],
                                                                   to_freq,
                                                                   start_end_date,
                                                                   start_end_year)[0]
                except KeyError as e:
                    # fix: added missing space between 'not' and 'found'
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(str(e)))
        df_new = pd.concat(list(df_data_ext_s), keys=range(splits.size))
        if counter == 0:
            df_data_ext = df_new
        else:
            # fix: the original merged df_data_ext with itself and discarded
            # the result; merge the newly loaded dataset and keep the result
            df_data_ext = df_data_ext.merge(df_new, left_index=True,
                                            right_index=True)
        # fix: counter was never incremented, so only the last dataset survived
        counter += 1
    return df_data_ext
def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None, seed=int, verbosity=1):
    '''
    Build the target-variable class and a random train/test split.

    ``fullts`` / ``TV_ts`` are timeseries objects with a ``.time`` coordinate
    (full period and target period). When ``precursor_ts`` is given, assert
    that the newly drawn split equals the split stored with that dataset.

    Returns (TV, df_splits).
    '''
    # Define traintest:
    idx_full = pd.to_datetime(fullts.time.values)
    idx_RV = pd.to_datetime(TV_ts.time.values)
    df_RVfullts = pd.DataFrame(fullts.values, index=idx_full)
    df_RV_ts = pd.DataFrame(TV_ts.values, index=idx_RV)

    # stratified methods need the event definition when building the RV class
    if method[:9] == 'ran_strat':
        TV = classes.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        TV = classes.RV_class(df_RVfullts, df_RV_ts)

    if precursor_ts is None:
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
    else:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_splits_imp = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_splits_imp)
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None, seed=int, verbosity=1):
    '''
    Build the target-variable class and a random train/test split, creating a
    default event definition when a stratified split is requested without one.

    Returns (TV, df_splits).
    '''
    # Define traintest:
    df_fullts = pd.DataFrame(fullts.values,
                             index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values,
                            index=pd.to_datetime(TV_ts.time.values))

    needs_default_events = method[:9] == 'ran_strat' and kwrgs_events is None
    if needs_default_events:
        # events need to be defined to enable stratified traintest.
        kwrgs_events = {'event_percentile': 66,
                        'min_dur': 1,
                        'max_break': 0,
                        'grouped': False}
        if verbosity == 1:
            print("kwrgs_events not given, creating stratified traintest split "
                  "based on events defined as exceeding the {}th percentile".format(
                      kwrgs_events['event_percentile']))

    TV = RV_class(df_fullts, df_RV_ts, kwrgs_events)

    if precursor_ts is None:
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
    else:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        stored_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(stored_splits)
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
def forecast_wrapper(datasets=dict, kwrgs_exp=dict, kwrgs_events=dict, stat_model_l=list, lags_i=list, n_boot=0):
    '''
    Run every statistical model in ``stat_model_l`` on the dataset and collect
    validation results.

    dict should have splits (as keys) and concomitant list of keys of that
    particular split.

    Returns
    -------
    dict_sum : dict
        Maps model name -> (df_valid, RV, y_pred_all).
    '''
    # NOTE(review): `path_data` is not a parameter of this function and the
    # `datasets` argument is unused; `path_data` must exist at module level
    # for this call to succeed — confirm the intended source of the data path.
    df_data = func_fc.load_hdf5(path_data)['df_data']
    splits = df_data.index.levels[0]
    # first column is the RV; split 0 is used as reference timeseries
    RVfullts = pd.DataFrame(df_data[df_data.columns[0]][0])
    RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]])
    fit_model_dates = kwrgs_exp['kwrgs_pp']['fit_model_dates']
    RV = func_fc.RV_class(RVfullts, RV_ts, kwrgs_events,
                          fit_model_dates=fit_model_dates)
    RV.TrainIsTrue = df_data['TrainIsTrue']
    RV.RV_mask = df_data['RV_mask']
    # fix: replicate the fit-model mask once per train/test split instead of
    # the hard-coded factor 10 (which only worked for exactly 10 splits)
    fit_model_mask = pd.concat([RV.fit_model_mask] * splits.size, keys=splits)
    df_data = df_data.merge(fit_model_mask, left_index=True, right_index=True)
    RV.prob_clim = func_fc.get_obs_clim(RV)
    dict_sum = {}
    for stat_model in stat_model_l:
        name = stat_model[0]
        df_valid, RV, y_pred_all = func_fc.forecast_and_valid(RV, df_data,
                                                              kwrgs_exp,
                                                              stat_model=stat_model,
                                                              lags_i=lags_i,
                                                              n_boot=n_boot)
        dict_sum[name] = (df_valid, RV, y_pred_all)
    return dict_sum
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    """Run the PCMCI causal-discovery algorithm (tigramite) for one train/test split.

    Assembles the full data array from the RV timeseries plus all actor region
    timeseries (optionally merging externally imported precursor timeseries),
    restricts it to the training dates of split ``s``, runs PCMCI with a
    partial-correlation independence test and FDR correction, and returns the
    bookkeeping dataframe of significant parents together with the assembled
    ``df_data`` (with 'TrainIsTrue' / 'RV_mask' columns appended).

    Parameters
    ----------
    ex : dict
        Experiment settings (keys used here: 'SaveTF', 'fig_subpath',
        'pcA_sets', 'pcA_set', 'alpha_level_tig', 'RV_name', 'vars',
        'import_prec_ts', 'precursor_ts', 'tfreq', 'sstartdate', 'senddate',
        'startyear', 'endyear', 'tigr_tau_max', 'max_comb_actors', 'params').
    outdic_actors : dict
        Mapping variable name -> actor object with per-split region timeseries.
    s : int
        Index of the train/test split to process.
    df_splits : pd.DataFrame
        Multi-index dataframe with 'TrainIsTrue' and 'RV_mask' per split.
    map_proj :
        Map projection, forwarded to the region-plotting helper.

    Returns
    -------
    (df, df_data) : the parents bookkeeping dataframe and the data dataframe.
    """
    #=====================================================================================
    #
    # 4) PCMCI-algorithm
    #
    #=====================================================================================

    # save output: redirect print output to a buffer so the PCMCI log can be
    # written to a text file afterwards (StringIO on py3, a temp file on py2)
    if ex['SaveTF'] == True:
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'), 'w+')
    #%%
    # amount of text printed:
    verbosity = 3
    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for the multiple linear regression model while conditioning
    # on parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure'
           '(finding parents): {}'.format(pc_alpha)))
    print(('alpha level for multiple linear regression model while conditining on parents of '
           'parents: {}'.format(ex['alpha_level_tig'])))

    # Retrieve traintest info
    traintest = df_splits

    # load Response Variable class
    RV = ex[ex['RV_name']]

    # create list with all actors, these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]

    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions with var_names_corr (first entry is RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            # NOTE(review): index_dates keeps the index of the LAST actor with
            # non-empty timeseries — assumes all actors share one time index.
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)
    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:
    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts'] == True:
        # merge externally computed precursor timeseries into df_data
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'; rewrite columns whose
            # middle token is not an integer label, numbering from 100 up
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if char.isdigit():
                    pass
                else:
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                # re-bin external timeseries to the experiment's frequency
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext, to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
            # Expand var_names_corr with the external columns, continuing the
            # region numbering and variable index from the last entry
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext, left_index=True, right_index=True)
    else:
        var_names_full = var_names_corr

    # boolean masks for training dates and RV (target-period) training dates
    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index
    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print((data.shape))

    # get RV datamask (same shape as data)
    data_mask = [True if d in dates_RV_train else False
                 for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [True if d in datesfull_train else False
                              for d in dates_all]
    df_data['RV_mask'] = [True if d in dates_RV else False for d in dates_all]

    T, N = data.shape  # Time, Regions
    # Initialize dataframe object (needed for tigramite functions)
    dataframe = pp.DataFrame(data=data, mask=data_mask, var_names=var_names_full)
    # pc algorithm: only parents for selected_variables are calculated
    parcorr = ParCorr(significance='analytic', mask_type='y',
                      verbosity=verbosity)
    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr,
                  selected_variables=None, verbosity=4)
    # selected_variables : list of integers, optional (default: range(N))
    #     Specify to estimate parents only for selected variables. If None is
    #     passed, parents are estimated for all variables.
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'], pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])
    # correct for the multiple-testing problem (Benjamini-Hochberg FDR)
    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')
    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)
    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci, pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)
    all_parents = sig['parents']
    links_RV = all_parents[0]
    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%
    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)
    #%%
    # flush the captured stdout to a per-split text file and restore stdout
    if ex['SaveTF'] == True:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout
    return df, df_data
def normal_precursor_regions(path_data, keys_options=None, causal=False):
    #%%
    '''
    Select precursor column names per option and per train/test split.

    keys_options=['all', 'only_db_regs', 'sp_and_regs', 'sst+sm+RWT',
                  'sst(CPPA)+sm', 'sst(PEP)+sm', 'sst(PDO,ENSO)+sm',
                  'sst(CPPA)']
    (Default changed from a mutable default argument to a None sentinel.)

    Returns
    -------
    keys_d : dict
        keys_d[option][split] = np.array of unique column names.
    '''
    if keys_options is None:
        keys_options = ['all']
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    # df_sum is only present when causal columns were computed; it is only
    # needed when a causal selection is requested below
    try:
        df_sum = dict_of_dfs['df_sum']
    except KeyError:  # fix: was a bare `except:`, which hid any error type
        pass
    skip = ['all_spatcov']
    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if causal == True or 'causal' in option:
                # causal: take only the columns flagged causal in df_sum
                # (NameError here means df_sum was absent from the file)
                all_keys = df_sum[df_sum['causal']].loc[s].index
            elif causal == False and 'causal' not in option:
                # correlated: all float columns except the RV (first) column
                df_s = df_data.loc[s]
                all_keys = df_s.columns.delete(0)
                # extract only float columns
                mask_f = np.logical_or(df_s.dtypes == 'float64',
                                       df_s.dtypes == 'float32')
                all_keys = all_keys[mask_f[1:].values]
                # remove spatcov_causals
                all_keys = [k for k in all_keys if k[-4:] != 'caus']
            if option == 'all':
                keys_ = [k for k in all_keys if k not in skip]
            elif 'only_db_regs' in option:
                # Regions only (drop every spatial-covariance column)
                keys_ = [k for k in all_keys if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sp_and_regs':
                keys_ = [k for k in all_keys if k not in skip]
            elif option == 'sst(CPPA)':
                skip_ex = ['0_900_ENSO34', '0_901_PDO', '0_101_PEPspatcov',
                           'sm123_spatcov', 'all_spatcov']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if 'sm' not in k]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'sst+sm+RWT':
                keys_ = [k for k in all_keys if k[-7:] != 'v200hpa']
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sst(CPPA)+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO', '0_101_PEPspatcov',
                           'sm123_spatcov', 'all_spatcov']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPAregs+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPApattern+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' in k or 'sm' in k)]
            elif option == 'sm':
                keys_ = [k for k in all_keys if 'sm' in k]
                keys_ = [k for k in keys_ if 'spatcov' not in k]
            elif option == 'sst(PEP)+sm':
                keys_ = [k for k in all_keys if 'sm' in k or 'PEP' in k]
                keys_ = [k for k in keys_ if k != 'sm123_spatcov']
            elif option == 'sst(PDO,ENSO)+sm':
                keys_ = [k for k in all_keys
                         if 'sm' in k or 'PDO' in k or 'ENSO' in k]
                keys_ = [k for k in keys_ if 'spatcov' not in k]
            keys_d_[s] = np.unique(keys_)
        keys_d[option] = keys_d_
    #%%
    return keys_d
def compare_use_spatcov(path_data, causal=True):
    #%%
    '''
    Build four alternative key selections to compare the use of spatial
    covariance columns against region columns.

    Parameters
    ----------
    path_data : str
        Path to an hdf5 file containing 'df_data' and 'df_sum'.
    causal : bool
        If True select only columns flagged causal in df_sum, else all
        columns except the first.

    Returns
    -------
    keys_d : dict
        Maps selection name -> {split: np.array of column names}.
    '''
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    df_sum = dict_of_dfs['df_sum']

    def _keys_per_split(keep):
        # One selection pass over all splits; `keep` decides per key.
        # (fix: replaces four copy-pasted, nearly identical loops)
        keys_d_ = {}
        for s in splits:
            if causal:
                # causal
                all_keys = df_sum[df_sum['causal']].loc[s].index
            else:
                # correlated
                all_keys = df_sum.loc[s].index.delete(0)
            keys_d_[s] = np.array([k for k in all_keys if keep(k)])
        return keys_d_

    keys_d = {}
    # only the aggregated spatial covariance column(s)
    keys_d['Only_all_spatcov'] = _keys_per_split(
        lambda k: ('spatcov_caus' in k) and k[:3] == 'all')
    # Regions + all_spatcov(_caus)
    keys_d['Regions_all_spatcov'] = _keys_per_split(
        lambda k: ('spatcov_caus' not in k) or k[:3] == 'all')
    # only spatcov(_caus) (no all_spatcov)
    keys_d['only_sp_caus_no_all_sp'] = _keys_per_split(
        lambda k: ('spatcov' in k) and ('all' not in k))
    # regions + spatcov(_caus) (no all_spatcov)
    keys_d['Regions_sp_caus_no_all_sp'] = _keys_per_split(
        lambda k: ('all' not in k))
    #%%
    return keys_d
n_ev = int(p * n_steps) rand_true = np.zeros((n_steps)) ind = np.random.choice(range(n_steps), n_ev) rand_true[ind] = 1 rand_scores.append(brier_score_loss(rand_true, np.repeat(p, n_steps))) plt.plot(np.linspace(0, 1, 19), rand_scores) #%% Keep forecast the same, change event definition import func_fc import validation as valid import classes import pandas as pd path_fig = '/Users/semvijverberg/surfdrive/MckinRepl/era5_T2mmax_sst_Northern/ran_strat10_s30/figures' path_data = '/Users/semvijverberg/surfdrive/RGCPD_mcKinnon/t2mmax_E-US_sm123_m01-09_dt10/18jun-17aug_lag0-0_ran_strat10_s30/pcA_none_ac0.05_at0.05_subinfo/fulldata_pcA_none_ac0.05_at0.05_2019-09-24.h5' df_data = func_fc.load_hdf5(path_data)['df_data'] splits = df_data.index.levels[0] RVfullts = pd.DataFrame(df_data[df_data.columns[0]][0]) RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]]) thresholds = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99] blocksize = valid.get_bstrap_size(RV.RVfullts, plot=False) metric = 'AUC-ROC' df_list = [] scores = [] for r in thresholds: kwrgs_events = { 'event_percentile': r, 'min_dur': 1, 'max_break': 0, 'grouped': False
def normal_precursor_regions(path_data, keys_options=None, causal=True):
    #%%
    '''
    Select precursor column names per option and per train/test split.

    keys_options=['all', 'only_db_regs', 'sp_and_regs', 'sst+sm+RWT',
                  'sst(CPPA)+sm', 'sst(PEP)+sm', 'sst(PDO,ENSO)+sm']
    (Default changed from a mutable default argument to a None sentinel.)

    Returns
    -------
    keys_d : dict
        keys_d[option][split] = array of column names in first-seen order.
    '''
    if keys_options is None:
        keys_options = ['all']
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    df_sum = dict_of_dfs['df_sum']
    skip = ['all_spatcov', '0_2_sm123', '0_101_PEPspatcov', 'sm123_spatcov']
    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if causal == True:
                # causal
                # fix: assign the causal columns to `all_keys` (originally
                # assigned to `keys_`, leaving `all_keys` undefined for every
                # option filter below -> NameError when causal=True)
                all_keys = df_sum[df_sum['causal']].loc[s].index
            elif causal == False:
                # correlated
                all_keys = df_sum.loc[s].index.delete(0)
                # remove spatcov_causals
                all_keys = [k for k in all_keys if k[-4:] != 'caus']
            if option == 'all':
                keys_ = [k for k in all_keys if k not in skip]
            elif option == 'only_db_regs':
                # Regions only (drop every spatial-covariance column)
                keys_ = [k for k in all_keys if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sp_and_regs':
                keys_ = [k for k in all_keys if k not in skip]
            elif option == 'sst+sm+RWT':
                keys_ = [k for k in all_keys if k[-7:] != 'v200hpa']
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sst(CPPA)+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPAregs+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPApattern+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' in k or 'sm' in k)]
            elif option == 'sst(PEP)+sm':
                keys_ = [k for k in all_keys if 'sm' in k or 'PEP' in k]
                keys_ = [k for k in keys_ if k != 'sm123_spatcov']
            elif option == 'sst(PDO,ENSO)+sm':
                keys_ = [k for k in all_keys
                         if 'sm' in k or 'PDO' in k or 'ENSO' in k]
                keys_ = [k for k in keys_ if 'spatcov' not in k]
            keys_d_[s] = np.array(list(unique_everseen(keys_)))
        keys_d[option] = keys_d_
    #%%
    return keys_d
'min_dur': 1, 'max_break': 0, 'grouped': False } dict_experiments = {} for dataset, tuple_sett in experiments.items(): ''' Format output is dict( exper_name = dict( model=tuple(df_valid, RV, y_pred) ) ) ''' path_data = tuple_sett[0] kwrgs_exp = tuple_sett[1] dict_of_dfs = func_fc.load_hdf5(path_data) df_data = dict_of_dfs['df_data'] splits = df_data.index.levels[0] tfreq = (df_data.loc[0].index[1] - df_data.loc[0].index[0]).days if tfreq == 1: lags_i = np.arange(0, 70 + 1E-9, max(10, tfreq), dtype=int) else: lags_i = np.array(np.arange(0, 70 + 1E-9, max(10, tfreq)) / max(10, tfreq), dtype=int) # lags_i = np.array([0], dtype=int) if 'keys' not in kwrgs_exp: # if keys not defined, getting causal keys kwrgs_exp['keys'] = exp_fc.normal_precursor_regions(
def calculate_corr_maps(ex, map_proj):
    #%%
    '''
    Calculate correlation maps of each precursor field vs. the Response
    Variable, cluster the significantly correlated gridcells into precursor
    regions (DBSCAN) and optionally plot the correlation maps.

    Parameters
    ----------
    ex : dict
        Experiment settings; also acts as a container for per-variable actor
        objects (ex[var]) and the RV class (ex[ex['RV_name']]).
    map_proj :
        Map projection used for plotting.

    Returns
    -------
    ex : dict
        Updated settings dictionary.
    outdic_actors : dict
        Maps variable name -> actor object holding the correlation map and
        the clustered region labels.
    '''
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    # time-cycle of data: total timesteps in one year
    ex['time_cycle'] = RV.dates[RV.dates.year == RV.startyear].size
    ex['time_range_all'] = [0, RV.dates.size]

    #==================================================================================
    # Start of experiment
    #==================================================================================
    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'][:9] == 'ran_strat':
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)

    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs, ex['tested_yrs'])).all(), "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    # =============================================================================
    # 2) DEFINE PRECURSOR COMMUNITIES
    # - calculate and plot pattern correlation for different fields
    # - create time-series over these regions
    # =============================================================================
    outdic_actors = dict()

    class act:
        # Lightweight container for one precursor field + its correlation map.
        def __init__(self, name, corr_xr, precur_arr):
            # fix: was `self.name = var` (silently relying on the enclosing
            # loop variable instead of the constructor argument)
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        if np.isnan(actor.prec_labels.values).all() == False:
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if ex['plotin1fig'] == False:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(
                ex['params'], ex['RV_name'], var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight', dpi=200)
            if ex['showplot'] == False:
                plt.close()
    #%%
    return ex, outdic_actors