'cape_ml_ens_mean_spatial_mean', 'shear_v_0to1_ens_mean_spatial_mean', 'hailcast_time_max_ens_mean_of_90th', 'major_axis_length', 'uh_2to5_time_max_ens_mean_of_90th', 'cin_ml_ens_std_spatial_mean', 'minor_axis_length', 'shear_v_0to6_ens_mean_spatial_mean', 'lcl_ml_ens_mean_spatial_mean', 'w_up_time_max_ens_mean_of_90th'] ''' data['X']['mid_level_lapse_rate_ens_mean_spatial_mean'] = data['X']['mid_level_lapse_rate_ens_mean_spatial_mean'] / 2.67765 X['mid_level_lapse_rate_ens_mean_spatial_mean'] = X['mid_level_lapse_rate_ens_mean_spatial_mean'] / 2.67765 fname = join(perm_path, f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc') explainer = InterpretToolkit(X=data['X'],y=data['targets'],estimator_output='probability',) perm_results = explainer.load(fname) #important_vars = perm_results['multipass_rankings__LogisticRegression'].values[:12] #important_vars = ['low_level_lapse_rate_ens_mean_spatial_mean'] important_vars = ['mid_level_lapse_rate_ens_mean_spatial_mean'] all_vars = perm_results['singlepass_rankings__LogisticRegression'].values display_feature_names = {f: to_readable_names([f])[0] for f in all_vars} #display_feature_names = _fix_long_names(display_feature_names) feature_units = {f: get_units(f)for f in all_vars} if option == 'interaction': interaction_index = 'auto' y = None elif option == 'targets':
# Compute interaction-strength (IAS) scores from precomputed ALE curves for
# each (valid time, target) combination and save the results to NetCDF.
time_set = ['first_hour']
target_set = ['tornado']
iterator = itertools.product(time_set, target_set)
for combo in iterator:
    time, target = combo
    # Keyword arguments forwarded to the training-data loader.
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y = _load_train_data(**parameters)
    estimators = load_models(time, target, drop_opt, model_names)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)
    # Example on-disk name:
    # ale_results_all_models_tornado_first_hourL1_based_feature_selection_aggressive.nc
    fnames = join(ale_path,
                  f'ale_results_all_models_{target}_{time}{drop_opt}.nc')
    ale = explainer.load(fnames=fnames)
    # Interaction strength: 10 bootstrap replicates over a 10% subsample.
    results = explainer.interaction_strength(ale,
                                             n_bootstrap=10,
                                             subsample=0.1)
    print(results)
    explainer.save(fname=join(
        ale_path,
        f'ias_score_all_models_{target}_{time}{drop_opt}.nc'),
        data=results)
'lcl_ml_ens_mean_spatial_mean', 'major_axis_length', 'cape_ml_ens_mean_spatial_mean', 'geopotential_height_500mb_ens_mean_spatial_mean', 'minor_axis_length', 'wz_0to2_time_max_ens_mean_of_90th', 'bouyancy_time_min_ens_mean_spatial_mean', 'shear_v_0to1_ens_mean_spatial_mean', 'uh_0to2_time_max_ens_std_spatial_mean'] ######################################## print('First load of the data...') display_feature_names = {f: to_readable_names([f])[0] for f in feature_names} #display_feature_names = _fix_long_names(display_feature_names) feature_units = {f: get_units(f)for f in feature_names} explainer = InterpretToolkit() #fnames = [get_fnames(m, target, time, drop_opt) for m in model_names] fnames = get_fnames(target, time, drop_opt) data = explainer.load(fnames=fnames) fig, axes = explainer.plot_ale( data, features = feature_names, display_feature_names=display_feature_names, display_units=feature_units, title=f'{plt_config.title_dict[target]} {time.replace("_", " ").title()}', hspace=.75 ) fname=f'ale_{target}_{time}_{drop_opt}.png' base_plot.save_figure(fig=fig, fname=fname)
) #if exists(save_fname): # print(f'{save_fname} already exists!') # continue parameters = { 'time': time, 'target': target, 'drop_opt': drop_opt, } X, y, info = _load_train_data(return_info=True, **parameters) estimators = load_models(time, target, drop_opt, model_names) # Subsample time indices to reduce autocorrelations X_subset, y_subset = get_independent_samples(X, y, info) explainer = InterpretToolkit(estimators=estimators, estimator_names=model_names, X=X_subset.copy(), y=y_subset.copy()) background_dataset = shap.sample(X, 100) results = explainer.local_contributions( method='shap', background_dataset=background_dataset, performance_based=True, n_samples=n_samples) results = explainer.save(fname=save_fname, data=results) duration = datetime.datetime.now() - start_time seconds = duration.total_seconds() hours = seconds // 3600 minutes = (seconds % 3600) // 60
): """ """ resample_method = resample_dict[time][target][model_name] return join(path, f'shap_values_{model_name}_{target}_{time}{drop_opt}.pkl') fname = get_fnames(model_name, target, time, drop_opt) with open(fname, 'rb') as pkl_file: data = pickle.load(pkl_file) shap_values, bias = data['shap_values'], data['bias'] features = [var for var in list(data['X'].columns) if 'matched' not in var ] + ['Run Date'] display_feature_names = { feature: to_readable_names([feature])[0] for feature in features } myInterpreter = InterpretToolkit(X=data['X']) fig = myInterpreter.plot_shap( shap_values=shap_values, plot_type='summary', display_feature_names=display_feature_names, ) fname = f'shap_summary_{model_name}_{target}_{time}_{drop_opt}.png' base_plot.save_figure(fig=fig, fname=fname)
# Compute 1-D partial-dependence (PD) curves for every feature for this
# (time, target, drop_opt) combination and save them to NetCDF.
parameters = {
    'time': time,
    'target': target,
    'drop_opt': drop_opt,
}
results_fname = join(
    ale_path, f'pd_1d_results_all_models_{target}_{time}{drop_opt}.nc')
# Skip-if-exists guard was disabled; re-enable to avoid recomputation.
#if exists(results_fname):
#    print(f'{results_fname} already exist!')
#    continue
X, y = _load_train_data(**parameters)
estimators = load_models(time, target, drop_opt, model_names)
explainer = InterpretToolkit(estimators=estimators,
                             estimator_names=model_names,
                             X=X,
                             y=y)
# n_bootstrap / subsample / njobs / n_bins are defined earlier in the script.
results = explainer.pd(features='all',
                       n_bootstrap=n_bootstrap,
                       subsample=subsample,
                       n_jobs=njobs,
                       n_bins=n_bins,
                       )
print(f'Saving {results_fname}...')
explainer.save(fname=results_fname, data=results)
# Wall-clock bookkeeping; start_time is set earlier in the enclosing loop.
duration = datetime.datetime.now() - start_time
seconds = duration.total_seconds()
hours = seconds // 3600
sns.ecdfplot( ax=ax, data=df, x=var, hue=target, legend=False, ) # Load the most important variables path = '/work/mflora/ML_DATA/permutation_importance' perm_imp_fname = join( path, f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc' ) explainer = InterpretToolkit() perm_imp_results = explainer.load(perm_imp_fname) important_vars = perm_imp_results[ f'{mode}_rankings__LogisticRegression'].values important_vars = important_vars[:n_vars] # Convert to pretty feature names readable_feature_names = { feature: to_readable_names([feature])[0] + f' ({get_units(feature)})' for feature in important_vars } parameters = { 'time': time, 'target': target,
# Compute SHAP values for the first model on a random 5000-row subset of the
# training data and pickle the values alongside the subset itself.
save_fname = join(
    path, f'shap_values_{model_names[0]}_{target}_{time}{drop_opt}.pkl')
# Skip-if-exists guard was disabled; re-enable to avoid recomputation.
#if exists(save_fname):
#    print(f'{save_fname} already exists!')
#    continue
parameters = {'time': time, 'target': target, 'drop_opt': drop_opt, }
X, y, info = _load_train_data(return_info=True, **parameters)
estimators = load_models(time, target, drop_opt, model_names)

# Randomly draw 5000 samples (without replacement) from the training dataset.
# NOTE(review): unseeded, so the subset differs between runs — pass a seeded
# RandomState if reproducibility matters.
indices = np.random.choice(len(X), size=5000, replace=False)
X_subset = X.iloc[indices, :].reset_index(drop=True)
y_subset = y[indices]

# BUG FIX: load_models() was called a second time here with identical
# arguments; the redundant (and potentially slow) duplicate call is removed.
explainer = InterpretToolkit(estimators=estimators,
                             estimator_names=model_names,
                             X=X_subset.copy())

# Summarize the full training set into 100 background samples for SHAP.
background_dataset = shap.sample(X, 100)
results = explainer.shap(background_dataset=background_dataset)
shap_values, bias = results[model_names[0]]

data = {
    'shap_values': shap_values,
    'bias': bias,
    'X': X_subset,
    'targets': y_subset,
}
with open(save_fname, 'wb') as pkl_file:
    pickle.dump(data, pkl_file)
X, y, info = _load_train_data(return_info=True, **parameters)

# Split the training data into the high- and low-STP subsets selected by
# indices_tuple (computed earlier from the info dataframe).
X_subsample = []
y_subsample = []
for idxs in indices_tuple:
    X_subsample.append(X.iloc[idxs, :].reset_index(drop=True))
    y_subsample.append(y[idxs])

# Compute permutation importance separately for each STP regime.
for mode, X, y in zip(['high_STP', 'low_STP'], X_subsample, y_subsample):
    start_time = datetime.datetime.now()
    # BUG FIX: the original expression
    #   n_vars = 10 if len(X.columns) else len(X.columns)
    # evaluated to 10 for ANY non-empty feature set (and 0 otherwise), so it
    # could request more variables than exist. Cap at the feature count.
    n_vars = min(10, len(X.columns))

    # Load the models
    estimators = load_models(time, target, drop_opt, model_names)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)
    # Compute the importance; metric/subsample/n_jobs/n_bootstrap/verbose/
    # direction are defined earlier in the script.
    results = explainer.permutation_importance(n_vars=n_vars,
                                               evaluation_fn=metric,
                                               subsample=subsample,
                                               n_jobs=n_jobs,
                                               n_bootstrap=n_bootstrap,
                                               verbose=verbose,
                                               direction=direction)
    results_fname = join(
        perm_imp_path,
        f'permutation_importance_{mode}_{target}_{time}_{data_mode}_{metric}{drop_opt}{direction}.nc'
    )
} X, y, info = _load_train_data(return_info=True, **parameters) dates = info['Run Date'].values random_state = np.random.RandomState(35) random_idxs = random_state.choice(len(X), size=100) background_dataset = X.iloc[random_idxs, :] n_samples = 5 save_fname = join( path, f'shap_values_performance_{model_names[0]}_{target}_{time}{drop_opt}.pkl') estimators = load_models(time, target, drop_opt, model_names) explainer = InterpretToolkit(estimators=estimators, estimator_names=model_names, X=X.copy(), y=np.copy(y)) predictions = estimators[0].predict_proba(X)[:, 1] print(np.max(predictions)) X_test, y_test, _ = _load_test_data(return_info=True, **parameters) _predictions = estimators[0].predict_proba(X_test)[:, 1] print(np.sort(_predictions)[::-1]) performance_dict = get_indices_based_on_performance( estimator=estimators[0], X=X, y=y, n_samples=n_samples, estimator_output='probability',
p_values = [] for n in range(n_vars): p_value = permutation_test(multipass_scores[n,:], scores_to_compare_against[n,:], method='approximate', num_rounds=1000, seed=0) p_values.append(p_value) if p_value > 0.05: print('Probably the same distribution\n') else: print('Probably different distributions\n') p_values = np.array(p_values)>0.05 return p_values def get_fnames(target, time, mode, metric, drop_opt, perm_method, resample=''): return join(path, f'permutation_importance_{atype}_{target}_{time}_{mode}_{metric}{drop_opt}{perm_method}{resample}.nc') explainer = InterpretToolkit() results =[] for target in targets: fname = get_fnames(target, time, mode, metric, drop_opt, perm_method, resample) results.append(explainer.load(fname)) p_values = get_p_values(results[0], ml_models[0], n_vars=10) for (target, time, mode, metric, perm_method)
else: return join(shap_path, f'shap_values_performance_{model_name}_{target}_{time}{drop_opt}.pkl') model_names = ['LogisticRegression'] target = 'severe_hail' if mode is None else 'tornado' time = 'first_hour' drop_opt = 'L1_based_feature_selection_with_manual' perf_keys = ["Best Hits", "Worst False Alarms", "Worst Misses", ] metric = 'norm_aupdc' perm_method = 'backward' ######################################## explainer = InterpretToolkit() fnames = [get_fnames(m, target, time, drop_opt, mode) for m in model_names] dframe = explainer.load(fnames=fnames, dtype='dataframe') feature_names = dframe.attrs['feature_names'] display_feature_names = {f: to_readable_names([f])[0] for f in feature_names} #display_feature_names = _fix_long_names(display_feature_names) feature_units = {f: get_units(f)for f in feature_names} fname = join(perm_path, f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc') perm_results = explainer.load(fname) important_vars = perm_results['multipass_rankings__LogisticRegression'].values[:12] #important_vars=feature_names #important_vars.remove('Run Date') if 'Initialization Time' in important_vars:
# --- Configuration for loading/plotting permutation-importance results -----
ml_models = ['LogisticRegression']
drop_opt = 'L1_based_feature_selection_with_manual'
perm_method = 'backward'
num_vars_to_plot = 10
figsize = (6, 6)

parameters = {'time': time, 'target': 'severe_hail', 'drop_opt': drop_opt}
X, y = _load_train_data(**parameters)
feature_names = list(X.columns)

def get_fnames(target, time, mode, metric, drop_opt, perm_method, resample=''):
    # Build the path to one permutation-importance NetCDF result.
    # NOTE: relies on module-level `path` and `atype` being defined.
    return join(path, f'permutation_importance_{atype}_{target}_{time}_{mode}_{metric}{drop_opt}{perm_method}{resample}.nc')

explainer = InterpretToolkit()  #X=X,y=y)
results = []
for target in targets:
    fname = get_fnames(target, time, mode, metric, drop_opt, perm_method,
                       resample)
    results.append(explainer.load(fname))

# NOTE(review): scikit-explain's 'estimator_output' attr is normally a model
# output type (e.g. 'probability'); storing the model name here looks suspect
# — confirm against the InterpretToolkit API before relying on it.
results[0].attrs['estimator_output'] = ['LogisticRegression']

# Use one example WoFS probability-object file to enumerate feature names,
# dropping verification ('matched') variables and appending 'Run Date'.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
ds = xr.open_dataset(fname)
features = [var for var in list(ds.data_vars)
            if 'matched' not in var] + ['Run Date']
# to_readable_names returns (pretty name, color) per feature.
readable_feature_names = {feature: to_readable_names([feature])[0]
                          for feature in features}
feature_colors = {feature: to_readable_names([feature])[1]
                  for feature in features}
def _load_test_data(base_vars_to_drop=base_vars_to_drop, return_info=None,
                    **parameters):
    """Load the testing dataframe for one (time, target) experiment.

    The columns dropped from the dataframe are controlled by
    parameters['drop_opt'], which selects one of several pre-pickled
    feature-drop lists (correlation filtering, manual drops, L1-based
    selection, etc.).

    Parameters (via **parameters):
        time      : forecast time period (e.g. 'first_hour').
        target    : hazard target (e.g. 'tornado').
        drop_opt  : string key selecting the feature-dropping strategy.
        model_name: optional; required by the '_drop_irrelevant_features'
                    and '_only_important_pred' options.

    Returns:
        (examples, target_values) or, when return_info is truthy,
        (examples, target_values, info).

    NOTE(review): the default `base_vars_to_drop=base_vars_to_drop` binds the
    module-level list at definition time — confirm that list is not mutated
    elsewhere.
    """
    io = IO()
    time = parameters['time']
    target = parameters['target']
    drop_opt = parameters['drop_opt']
    model_name = parameters.get('model_name', None)
    path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
    # Each branch builds vars_to_drop from a pickled list of feature names.
    if drop_opt == '_drop_high_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
    elif drop_opt == '_drop_0.8_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
    elif drop_opt == '_manual_drop_0.9_corr':
        # Correlation drops (0.9 threshold) plus manual time-max/spatial-mean
        # drops, loaded from two separate pickles.
        fname = f'correlated_features_to_drop_{time}_{target}_0.9_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)
        vars_to_drop += add_columns_to_drop
    elif drop_opt == '_manual_drop_0.8_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)
        vars_to_drop += add_columns_to_drop
    elif '_manual_drop_time_max_spatial_mean' in drop_opt:
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
    elif drop_opt == '_drop_irrelevant_features':
        # Requires model_name in parameters.
        fname = f'irrelevant_features_to_drop_{time}_{target}_{model_name}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
    elif drop_opt == '_drop_object_morph_pred':
        # Drop object-morphology predictors outright (no pickle needed).
        object_pred = ['area', 'minor_axis_length', 'major_axis_length']
        vars_to_drop = base_vars_to_drop + object_pred
    elif 'L1_based_feature_selection' in drop_opt and 'manual' not in drop_opt and 'aggres' not in drop_opt:
        # Plain L1-based selection; the 'manual'/'aggres' guards keep the
        # more specific variants below from matching this generic substring.
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop
    elif 'L1_based_feature_selection_aggressive' in drop_opt:
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        # NOTE: 'aggresive' spelling matches the on-disk pickle name.
        fname = f'L1_based_features_to_drop_{time}_{target}aggresive.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop
    elif 'L1_based_feature_selection_with_manual' in drop_opt:
        # L1-based drops (from path1) combined with the manual
        # time-max/spatial-mean drops (from the correlation-results path).
        path1 = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path1, fname), 'rb') as fp:
            columns_to_drop1 = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop1:
            columns_to_drop1.remove('Run Date')
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop2 = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop1 + columns_to_drop2
    else:
        # Default: drop only the baseline variables.
        vars_to_drop = base_vars_to_drop
    # LOAD DATA
    print(f'Loading {time} {target} data...(from _load_test_data)')
    fname = join(config.ML_DATA_STORAGE_PATH,
                 f'{time}_testing_matched_to_{target}_0km_dataset.pkl')
    test_data = io.load_dataframe(fname=fname, target_vars=[
        'matched_to_tornado_0km', 'matched_to_severe_hail_0km',
        'matched_to_severe_wind_0km'
    ], vars_to_drop=vars_to_drop)
    examples = test_data['examples']
    target_values = test_data[f'matched_to_{target}_0km'].values
    if drop_opt == '_only_important_pred':
        # Restrict the examples to the multipass-important variables from a
        # previously computed permutation-importance result.
        path = '/work/mflora/ML_DATA/permutation_importance/'
        if 'Log' in model_name:
            tag = '_drop_high_corr_pred'
        else:
            tag = ''
        fname = join(
            path,
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{tag}.pkl'
        )
        perm_imp_results = load_pickle([fname])
        # NOTE(review): `model=[None]` / set_results / get_important_vars
        # looks like an older InterpretToolkit API — verify against the
        # installed scikit-explain version.
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results,
                                  option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(perm_imp_results,
                                                          multipass=True)
        important_vars += ['Run Date']
        examples = examples[important_vars]
    if return_info:
        info = test_data['info']
        return examples, target_values, info
    else:
        return examples, target_values
return data features = [var for var in list(data['X'].columns) if 'matched' not in var ] + ['Run Date'] display_feature_names = { feature: to_readable_names([feature])[0] for feature in features } feature_colors = { feature: to_readable_names([feature])[1] for feature in features } explainer = InterpretToolkit(estimator_names=model_name, estimator_output='probability') results = shap_values_to_importance(shap_values, estimator_name=model_name, X=data['X']) columns = [r'$\sigma$(SHAP)'] #columns = [r'$\sum$ |SHAP|'] fig = explainer.plot_importance(data=results, method='shap', display_feature_names=display_feature_names, feature_colors=feature_colors, columns=columns) fname = f'shap_importance_{model_name}_{target}_{time}_{drop_opt}.png' base_plot.save_figure(fig=fig, fname=fname)