예제 #1
0
def RV_and_traintest(fullts,
                     TV_ts,
                     method=str,
                     kwrgs_events=None,
                     precursor_ts=None,
                     seed=int,
                     verbosity=1):
    '''Build the target-variable (RV) object and a train/test split.

    Parameters
    ----------
    fullts, TV_ts :
        Timeseries objects carrying ``.values`` and a ``.time`` coordinate
        (presumably xarray DataArrays — TODO confirm against callers).
    method : str
        Splitting method; a value starting with ``'ran_strat'`` selects
        stratified splitting based on ``kwrgs_events``.
        NOTE(review): the default ``str`` is the *type object*, not a valid
        method string — callers are expected to always pass a value.
    kwrgs_events : dict, optional
        Event definition forwarded to ``classes.RV_class`` for stratified
        splits.
    precursor_ts : list of tuple, optional
        ``[(name, path_data)]``; when given, the train/test years stored in
        the imported timeseries are compared against the newly drawn split.
    seed : int
        Random seed for the split (the default ``int`` is likewise a
        type-object placeholder).
    verbosity : int
        Passed through as ``verb`` to the split routine.

    Returns
    -------
    TV : classes.RV_class
    df_splits : pd.DataFrame
        Split bookkeeping with 'TrainIsTrue' and 'RV_mask' columns.
    '''
    # Define traintest: convert the timeseries to date-indexed DataFrames.
    df_RVfullts = pd.DataFrame(fullts.values,
                               index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values,
                            index=pd.to_datetime(TV_ts.time.values))
    if method[:9] == 'ran_strat':
        # Stratified split needs the event definition.
        TV = classes.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        TV = classes.RV_class(df_RVfullts, df_RV_ts)

    if precursor_ts is not None:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_splits_imp = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_splits_imp)
    else:
        test_yrs_imp = None

    # Single call site for the split (was duplicated in both branches).
    df_splits = functions_pp.rand_traintest_years(
        TV,
        method=method,
        seed=seed,
        kwrgs_events=kwrgs_events,
        verb=verbosity)

    if test_yrs_imp is not None:
        # Verify the new split reproduces the imported one.
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp,
                         test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
예제 #2
0
def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None,
                     seed=int, verbosity=1):
    """Create the target-variable object plus its train/test split.

    Converts the input timeseries to date-indexed DataFrames, fills in a
    default event definition when a stratified split was requested without
    one, and either validates against an imported split or simply draws a
    new random one.
    """
    # Convert the timeseries into DataFrames indexed by datetime.
    df_fullts = pd.DataFrame(fullts.values,
                             index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values,
                            index=pd.to_datetime(TV_ts.time.values))

    needs_default_events = method[:9] == 'ran_strat' and kwrgs_events is None
    if needs_default_events:
        # events need to be defined to enable stratified traintest.
        kwrgs_events = {'event_percentile': 66,
                        'min_dur': 1,
                        'max_break': 0,
                        'grouped': False}
        if verbosity == 1:
            print("kwrgs_events not given, creating stratified traintest split "
                  "based on events defined as exceeding the {}th percentile".format(
                      kwrgs_events['event_percentile']))

    TV = RV_class(df_fullts, df_RV_ts, kwrgs_events)

    if precursor_ts is None:
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
    else:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_imported = func_fc.load_hdf5(path_data)['df_data']
        df_imported = df_imported.loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_imported)
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
예제 #3
0
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''Load external precursor timeseries and align them with df_splits.

    list_import_ts has format List[tuples],
    [(name, path_data)]

    Parameters
    ----------
    df_splits : pd.DataFrame
        MultiIndex (split, date) frame with 'TrainIsTrue' / 'RV_mask'
        columns used as the reference train/test split.
    start_end_date, start_end_year, start_end_TVdate :
        Date subsetting arguments forwarded to core_pp / functions_pp.
    cols : list or str, optional
        Columns to keep from each imported frame; defaults to every
        non-boolean column. Reset after each file so each file picks its
        own defaults.
    precur_aggr : int
        Target aggregation (days); frames at another frequency are
        re-binned via functions_pp.time_mean_bins.

    Returns
    -------
    pd.DataFrame keyed by split index, with one column group per
    imported file (merged on the index).
    '''
    #%%
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(list_import_ts):

        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if isinstance(df_data_e_all, pd.Series):
            # Normalize a stored Series to a single-column DataFrame.
            df_data_e_all = pd.DataFrame(df_data_e_all)

        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            # Default to every non-boolean column (skip mask columns).
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif isinstance(cols, str):
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            # MultiIndex (split, date): subset dates within each split.
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:,
                                                            dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [
                k for k in df_splits.columns
                if k in ['TrainIsTrue', 'RV_mask']
            ]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            assert _check_traintest, (
                'Train test years of df_splits are not the '
                'same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                # No split info stored: reuse the same frame for every split.
                df_data_e = df_data_e_all

            df_data_ext_s[s] = df_data_e[cols]
            # Native frequency in days, inferred from the first interval.
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days

            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s],
                        precur_aggr,
                        start_end_date,
                        start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    # Fixed message: 'not' and 'found' previously concatenated
                    # into 'notfound'.
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(
                              str(e)))
        # Fixed typos: 'exterinal timeseres' -> 'external timeseries'.
        print(f'loaded in external timeseries: {cols}')

        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            # Merge subsequent files column-wise on the (split, date) index.
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True,
                                            right_index=True)
        counter += 1
        cols = None
    #%%
    return df_data_ext
예제 #4
0
def calculate_corr_maps(ex, map_proj):
    '''Compute correlation maps for all precursor variables and cluster them.

    Parameters
    ----------
    ex : dict
        Experiment dictionary with pre-processed data info (variables,
        paths, filenames, method/seed settings, plotting flags, ...).
        Mutated in place (e.g. 'time_cycle', 'time_range_all') and also
        returned.
    map_proj :
        Map projection handed to the plotting routine.

    Returns
    -------
    ex : dict
        Updated experiment dictionary.
    outdic_actors : dict
        var name -> actor object holding correlation maps and region labels.
    '''
    #%%
    # =============================================================================
    # Load 'exp' dictionary with information of pre-processed data (variables,
    # paths, filenames, etcetera..) and add RGCPD/Tigramite experiment settings
    # =============================================================================
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    ex['time_cycle'] = RV.dates[
        RV.dates.year ==
        RV.startyear].size  # time-cycle of data. total timesteps in one year
    ex['time_range_all'] = [0, RV.dates.size]
    #==================================================================================
    # Start of experiment
    #==================================================================================

    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'][:9] == 'ran_strat':
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)
    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs,
                         ex['tested_yrs'])).all(), "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    # =============================================================================
    # 2) DEFINE PRECURSOR COMMUNITIES:
    # =============================================================================
    # - calculate and plot pattern correlation for different fields
    # - create time-series over these regions
    #=====================================================================================
    outdic_actors = dict()

    class act:
        # Lightweight container for one precursor variable's results.
        def __init__(self, name, corr_xr, precur_arr):
            # BUG FIX: previously assigned the enclosing loop variable
            # `var` instead of the `name` parameter.
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        #        precur_arr = rgcpd.convert_longitude(precur_arr, 'only_east')
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)

        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        # Only plot when at least one region label was found.
        if not np.isnan(actor.prec_labels.values).all():
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if ex['plotin1fig'] == False:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(
                ex['params'], ex['RV_name'], var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight',
                        dpi=200)
            if ex['showplot'] == False:
                plt.close()

    #%%
    return ex, outdic_actors