Пример #1
0
def _unpickle_columns(directory, fname):
    """Return the object unpickled from ``join(directory, fname)``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only
    point this at trusted, locally produced files.
    """
    with open(join(directory, fname), 'rb') as fp:
        return pickle.load(fp)


def _unpickle_l1_columns(directory, fname):
    """Unpickle a collection of column names as a list without 'Run Date'."""
    columns = list(_unpickle_columns(directory, fname))
    # 'Run Date' must never end up in the drop list, so strip it if present.
    if 'Run Date' in columns:
        columns.remove('Run Date')
    return columns


def _load_test_data(base_vars_to_drop=base_vars_to_drop,
                    return_info=None,
                    **parameters):
    """
    Load the testing dataset for one (time, target) pair.

    The set of columns dropped while loading is ``base_vars_to_drop`` plus
    whatever extra columns the ``drop_opt`` feature-selection option selects
    (read from pickled lists on disk).

    Parameters
    ----------
    base_vars_to_drop : list
        Columns that are always dropped. It is concatenated with the extra
        columns using ``+``; when no ``drop_opt`` branch matches it is passed
        to the loader as-is (same object, not a copy).
    return_info : bool, optional
        When truthy, ``test_data['info']`` is returned as a third element.
    **parameters
        Required keys: ``'time'``, ``'target'``, ``'drop_opt'``.
        Optional key: ``'model_name'``. It is interpolated into the file name
        for ``drop_opt == '_drop_irrelevant_features'`` and is required for
        ``drop_opt == '_only_important_pred'``.

    Returns
    -------
    tuple
        ``(examples, target_values)`` or, when ``return_info`` is truthy,
        ``(examples, target_values, info)``.

    Raises
    ------
    KeyError
        If ``'time'``, ``'target'`` or ``'drop_opt'`` is missing.
    TypeError
        If ``drop_opt == '_only_important_pred'`` and no ``model_name`` was
        supplied.
    """
    io = IO()
    time = parameters['time']
    target = parameters['target']
    drop_opt = parameters['drop_opt']
    model_name = parameters.get('model_name', None)

    # Fail before any expensive I/O: the important-predictor selection below
    # evaluates `'Log' in model_name`, which would otherwise raise an opaque
    # TypeError only after the whole dataset has been loaded.
    if drop_opt == '_only_important_pred' and model_name is None:
        raise TypeError(
            "'model_name' is required when drop_opt == '_only_important_pred'")

    corr_dir = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
    l1_dir = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
    time_max_fname = 'time_max_spatial_mean_features.pkl'
    corr_prefix = f'correlated_features_to_drop_{time}_{target}'

    # The branch order matters: several conditions are substring tests.
    if drop_opt == '_drop_high_corr_pred':
        vars_to_drop = base_vars_to_drop + _unpickle_columns(
            corr_dir, f'{corr_prefix}.pkl')

    elif drop_opt == '_drop_0.8_corr_pred':
        vars_to_drop = base_vars_to_drop + _unpickle_columns(
            corr_dir, f'{corr_prefix}_0.8.pkl')

    elif drop_opt in ('_manual_drop_0.9_corr', '_manual_drop_0.8_corr'):
        threshold = '0.9' if drop_opt == '_manual_drop_0.9_corr' else '0.8'
        fname = f'{corr_prefix}_{threshold}_manual_drop_time_max_spatial_mean.pkl'
        vars_to_drop = base_vars_to_drop + _unpickle_columns(corr_dir, fname)
        # `vars_to_drop` is a fresh list here, so extending it in place does
        # not touch the caller's `base_vars_to_drop`.
        vars_to_drop += _unpickle_columns(corr_dir, time_max_fname)

    elif '_manual_drop_time_max_spatial_mean' in drop_opt:
        vars_to_drop = base_vars_to_drop + _unpickle_columns(
            corr_dir, time_max_fname)

    elif drop_opt == '_drop_irrelevant_features':
        # NOTE(review): if `model_name` was not supplied the file name
        # contains the literal text 'None' -- confirm that is intended.
        fname = f'irrelevant_features_to_drop_{time}_{target}_{model_name}.pkl'
        vars_to_drop = base_vars_to_drop + _unpickle_columns(corr_dir, fname)

    elif drop_opt == '_drop_object_morph_pred':
        object_pred = ['area', 'minor_axis_length', 'major_axis_length']
        vars_to_drop = base_vars_to_drop + object_pred

    elif ('L1_based_feature_selection' in drop_opt
          and 'manual' not in drop_opt
          and 'aggres' not in drop_opt):
        fname = f'L1_based_features_to_drop_{time}_{target}.pkl'
        vars_to_drop = base_vars_to_drop + _unpickle_l1_columns(l1_dir, fname)

    elif 'L1_based_feature_selection_aggressive' in drop_opt:
        # File name kept verbatim (including the 'aggresive' spelling and the
        # missing underscore) because it must match the file on disk.
        fname = f'L1_based_features_to_drop_{time}_{target}aggresive.pkl'
        vars_to_drop = base_vars_to_drop + _unpickle_l1_columns(l1_dir, fname)

    elif 'L1_based_feature_selection_with_manual' in drop_opt:
        # NOTE(review): a drop_opt that also contains
        # '_manual_drop_time_max_spatial_mean' is captured by the earlier
        # substring branch and never reaches this one -- confirm the
        # ordering is intended.
        fname = f'L1_based_features_to_drop_{time}_{target}_manual_drop_time_max_spatial_mean.pkl'
        l1_columns = _unpickle_l1_columns(l1_dir, fname)
        # The manually selected features live in the correlation-filtering
        # directory, not in the L1 directory.
        manual_columns = _unpickle_columns(corr_dir, time_max_fname)
        vars_to_drop = base_vars_to_drop + l1_columns + manual_columns

    else:
        vars_to_drop = base_vars_to_drop

    # LOAD DATA
    print(f'Loading {time} {target} data...(from _load_test_data)')
    fname = join(config.ML_DATA_STORAGE_PATH,
                 f'{time}_testing_matched_to_{target}_0km_dataset.pkl')
    test_data = io.load_dataframe(fname=fname,
                                  target_vars=[
                                      'matched_to_tornado_0km',
                                      'matched_to_severe_hail_0km',
                                      'matched_to_severe_wind_0km'
                                  ],
                                  vars_to_drop=vars_to_drop)

    examples = test_data['examples']
    target_values = test_data[f'matched_to_{target}_0km'].values

    if drop_opt == '_only_important_pred':
        # Keep only the predictors ranked important by the stored
        # permutation-importance results, plus 'Run Date'.
        tag = '_drop_high_corr_pred' if 'Log' in model_name else ''
        fname = join(
            '/work/mflora/ML_DATA/permutation_importance/',
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{tag}.pkl'
        )
        perm_imp_results = load_pickle([fname])
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results,
                                  option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(perm_imp_results,
                                                          multipass=True)
        important_vars += ['Run Date']

        examples = examples[important_vars]

    if return_info:
        return examples, target_values, test_data['info']
    return examples, target_values
# Directory the correlation-filtering pickles are read from and written to.
path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'

# For every (time, target) combination: load the training dataset, run the
# correlation filter on its examples, and pickle both the columns to drop and
# the correlated feature pairs.
# NOTE(review): `iterator`, `drop_opt`, `base_vars_to_drop`, `io`, `config`,
# `filter_obj` and `cc_value` are not defined in this excerpt -- presumably
# they are set up earlier in the script; confirm.
for combo in iterator:
    time, target = combo

    if drop_opt == '_manual_drop_time_max_spatial_mean':
        # Re-assigns `path` to the same value it already has above.
        path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path,fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        # Builds a new list; `base_vars_to_drop` itself is not modified.
        vars_to_drop = base_vars_to_drop + columns_to_drop

    # NOTE(review): within this excerpt `vars_to_drop` is only assigned in the
    # branch above. For any other `drop_opt` it must already exist from
    # earlier code, otherwise the load below raises NameError -- confirm.
    print(f'Loading {time} {target} data...')
    fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_matched_to_{target}_0km_dataset.pkl')
    data = io.load_dataframe(fname=fname,
                         target_vars=['matched_to_tornado_0km', 'matched_to_severe_hail_0km','matched_to_severe_wind_0km' ],
                         vars_to_drop=vars_to_drop
                         )
    examples = data['examples']

    # Overwrites `columns_to_drop` (possibly set in the branch above) with the
    # filter's result for this (time, target) pair.
    columns_to_drop, correlated_pairs = filter_obj.correlation_filtering(df=examples, cc_val=cc_value)

    # Persist the columns to drop; the file name encodes time, target, the
    # correlation threshold and the drop option.
    fname = f'correlated_features_to_drop_{time}_{target}_{cc_value}{drop_opt}.pkl'
    with open(join(path,fname), 'wb') as fp:
        pickle.dump(columns_to_drop, fp)

    # Persist the correlated feature pairs under a matching file name.
    fname = f'correlated_feature_pairs_{time}_{target}_{cc_value}{drop_opt}.pkl'
    with open(join(path,fname), 'wb') as fp:
        pickle.dump(correlated_pairs, fp)

    
Пример #3
0
from wofs.util import config
from wofs_ml.io.io import IO, vars_to_drop

import pandas as pd
from os.path import join

# Loads the first-hour training dataset and prepares a date-based
# cross-validation splitter plus a NumPy feature matrix.
# NOTE(review): the original header read "Iterates over model type, target
# type, resampling method"; no such loop is visible in this excerpt --
# presumably it follows later in the file; confirm.
io = IO()

time = 'first_hour'
fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_dataset.pkl')
target_vars = ['matched_to_tornado_0km']

# `vars_to_drop` is the list imported from wofs_ml.io.io at the top of the file.
data = io.load_dataframe(fname=fname,
                         target_vars=target_vars,
                         vars_to_drop=vars_to_drop)

examples = data['examples']
targets = data['matched_to_tornado_0km']

# Position of the 'Run Date' column among the example columns; `index` raises
# ValueError if that column is absent.
feature_names = examples.columns.to_list()
date_col_idx = feature_names.index('Run Date')

# NOTE(review): `DateBasedCV` is not imported in the visible lines -- confirm
# it is in scope. The splitter is given the 'Run Date' column index and the
# targets; presumably it groups folds by run date.
cv = DateBasedCV(n_splits=5,
                 date_col_idx=date_col_idx,
                 y=targets,
                 verbose=True)

# Array form of the examples (presumably a pandas DataFrame); the 'Run Date'
# column is still included at `date_col_idx`.
X = examples.to_numpy()