def _load_test_data(base_vars_to_drop=base_vars_to_drop, return_info=None, **parameters):
    """
    Load the testing dataframe for one lead time / target pair, dropping a
    set of predictor columns selected by ``drop_opt``.

    Parameters
    ----------
    base_vars_to_drop : list of str
        Columns that are always dropped (default is the module-level list,
        bound at definition time).
    return_info : bool, optional
        If truthy, also return the ``'info'`` entry of the loaded dataset.
    **parameters
        Required keys: ``'time'``, ``'target'``, ``'drop_opt'``.
        Optional: ``'model_name'`` (needed for ``'_drop_irrelevant_features'``
        and ``'_only_important_pred'``).

    Returns
    -------
    (examples, target_values) or (examples, target_values, info)
        ``examples`` is a pandas DataFrame of predictors; ``target_values``
        is a numpy array of the matched-to-{target} labels.
    """
    def _load_pkl(directory, fname):
        # Every drop list is stored on disk as a pickled list of column names.
        with open(join(directory, fname), 'rb') as fp:
            return pickle.load(fp)

    def _load_l1_columns(directory, fname):
        # L1-based lists may contain 'Run Date', which must be kept for
        # date-based cross-validation, so strip it if present.
        columns = list(_load_pkl(directory, fname))
        if 'Run Date' in columns:
            columns.remove('Run Date')
        return columns

    io = IO()
    time = parameters['time']
    target = parameters['target']
    drop_opt = parameters['drop_opt']
    model_name = parameters.get('model_name', None)

    corr_path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
    l1_path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
    time_max_fname = 'time_max_spatial_mean_features.pkl'

    # NOTE(review): branch order is significant — the substring test for
    # '_manual_drop_time_max_spatial_mean' precedes the L1 'with_manual'
    # tests, so a drop_opt containing both is handled by the earlier branch.
    if drop_opt == '_drop_high_corr_pred':
        vars_to_drop = base_vars_to_drop + _load_pkl(
            corr_path, f'correlated_features_to_drop_{time}_{target}.pkl')
    elif drop_opt == '_drop_0.8_corr_pred':
        vars_to_drop = base_vars_to_drop + _load_pkl(
            corr_path, f'correlated_features_to_drop_{time}_{target}_0.8.pkl')
    elif drop_opt == '_manual_drop_0.9_corr':
        vars_to_drop = base_vars_to_drop + _load_pkl(
            corr_path,
            f'correlated_features_to_drop_{time}_{target}_0.9_manual_drop_time_max_spatial_mean.pkl')
        vars_to_drop += _load_pkl(corr_path, time_max_fname)
    elif drop_opt == '_manual_drop_0.8_corr':
        vars_to_drop = base_vars_to_drop + _load_pkl(
            corr_path,
            f'correlated_features_to_drop_{time}_{target}_0.8_manual_drop_time_max_spatial_mean.pkl')
        vars_to_drop += _load_pkl(corr_path, time_max_fname)
    elif '_manual_drop_time_max_spatial_mean' in drop_opt:
        vars_to_drop = base_vars_to_drop + _load_pkl(corr_path, time_max_fname)
    elif drop_opt == '_drop_irrelevant_features':
        vars_to_drop = base_vars_to_drop + _load_pkl(
            corr_path,
            f'irrelevant_features_to_drop_{time}_{target}_{model_name}.pkl')
    elif drop_opt == '_drop_object_morph_pred':
        # Object morphology attributes from the segmented storm objects.
        vars_to_drop = base_vars_to_drop + ['area', 'minor_axis_length',
                                            'major_axis_length']
    elif ('L1_based_feature_selection' in drop_opt
            and 'manual' not in drop_opt and 'aggres' not in drop_opt):
        vars_to_drop = base_vars_to_drop + _load_l1_columns(
            l1_path, f'L1_based_features_to_drop_{time}_{target}.pkl')
    elif 'L1_based_feature_selection_aggressive' in drop_opt:
        # NOTE(review): 'aggresive' (sic, no underscore) matches the
        # filename as written on disk — do not "fix" the spelling here.
        vars_to_drop = base_vars_to_drop + _load_l1_columns(
            l1_path, f'L1_based_features_to_drop_{time}_{target}aggresive.pkl')
    elif 'L1_based_feature_selection_with_manual' in drop_opt:
        vars_to_drop = (base_vars_to_drop
                        + _load_l1_columns(
                            l1_path,
                            f'L1_based_features_to_drop_{time}_{target}_manual_drop_time_max_spatial_mean.pkl')
                        + _load_pkl(corr_path, time_max_fname))
    else:
        vars_to_drop = base_vars_to_drop

    # LOAD DATA
    print(f'Loading {time} {target} data...(from _load_test_data)')
    fname = join(config.ML_DATA_STORAGE_PATH,
                 f'{time}_testing_matched_to_{target}_0km_dataset.pkl')
    test_data = io.load_dataframe(
        fname=fname,
        target_vars=['matched_to_tornado_0km',
                     'matched_to_severe_hail_0km',
                     'matched_to_severe_wind_0km'],
        vars_to_drop=vars_to_drop)
    examples = test_data['examples']
    target_values = test_data[f'matched_to_{target}_0km'].values

    if drop_opt == '_only_important_pred':
        # Restrict the predictors to the multipass permutation-importance
        # ranking stored for this model/target/time.
        path = '/work/mflora/ML_DATA/permutation_importance/'
        tag = '_drop_high_corr_pred' if 'Log' in model_name else ''
        fname = join(
            path,
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{tag}.pkl'
        )
        perm_imp_results = load_pickle([fname])
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results,
                                  option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(perm_imp_results,
                                                          multipass=True)
        # Keep 'Run Date' so downstream date-based CV splitting still works.
        important_vars += ['Run Date']
        examples = examples[important_vars]

    if return_info:
        info = test_data['info']
        return examples, target_values, info
    return examples, target_values
path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/' for combo in iterator: time, target = combo if drop_opt == '_manual_drop_time_max_spatial_mean': path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/' fname = f'time_max_spatial_mean_features.pkl' with open(join(path,fname), 'rb') as fp: columns_to_drop = pickle.load(fp) vars_to_drop = base_vars_to_drop + columns_to_drop print(f'Loading {time} {target} data...') fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_matched_to_{target}_0km_dataset.pkl') data = io.load_dataframe(fname=fname, target_vars=['matched_to_tornado_0km', 'matched_to_severe_hail_0km','matched_to_severe_wind_0km' ], vars_to_drop=vars_to_drop ) examples = data['examples'] columns_to_drop, correlated_pairs = filter_obj.correlation_filtering(df=examples, cc_val=cc_value) fname = f'correlated_features_to_drop_{time}_{target}_{cc_value}{drop_opt}.pkl' with open(join(path,fname), 'wb') as fp: pickle.dump(columns_to_drop, fp) fname = f'correlated_feature_pairs_{time}_{target}_{cc_value}{drop_opt}.pkl' with open(join(path,fname), 'wb') as fp: pickle.dump(correlated_pairs, fp)
from wofs.util import config from wofs_ml.io.io import IO, vars_to_drop import pandas as pd from os.path import join # Iterates over model type, target type, resampling method # io = IO() time = 'first_hour' fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_dataset.pkl') target_vars = ['matched_to_tornado_0km'] data = io.load_dataframe(fname=fname, target_vars=target_vars, vars_to_drop=vars_to_drop) examples = data['examples'] targets = data['matched_to_tornado_0km'] feature_names = examples.columns.to_list() date_col_idx = feature_names.index('Run Date') cv = DateBasedCV(n_splits=5, date_col_idx=date_col_idx, y=targets, verbose=True) X = examples.to_numpy()