def plot_ba(experiment, **kwargs):
    """Plot the burned-area prediction for a single experiment.

    Operates on cached data and cached fitted models only; the
    ``check_in_store`` / ``cache_check`` calls fail fast when a required
    cache entry is missing.
    """
    # Require the cached train/test split before loading it.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Require the cached endog/exog/mask triple; only the mask is used here.
    get_endog_exog_mask.check_in_store(experiment)
    master_mask = get_endog_exog_mask(experiment)[2]
    check_master_masks(master_mask)

    # Ensure a fitted model is already cached.
    get_model(X_train, y_train, cache_check=True)

    test_predictions = threading_get_model_predict(
        X_train=X_train,
        y_train=y_train,
        predict_X=X_test,
    )

    ba_plotting(
        *get_ba_plotting_data(test_predictions, y_test, master_mask),
        figure_saver=map_figure_saver(sub_directory=experiment.name),
        **get_aux0_aux1_kwargs(y_test, master_mask),
        filename=f"{experiment.name}_ba_prediction",
    )
def correlation_plot(experiment, **kwargs):
    """Plot the full exogenous-variable correlation matrix for `experiment`.

    Operates on cached data only; `get_data(..., cache_check=True)` fails
    fast when the experiment's data is not cached.
    """
    exp_figure_saver = figure_saver(sub_directory=experiment.name)

    # Operate on cached data only.
    get_data(experiment, cache_check=True)

    _, exog_data, _ = get_endog_exog_mask(experiment)

    def df_cols_to_str(df):
        # Shorten and stringify the column labels in place for plotting.
        df.columns = list(map(lambda s: shorten_features(str(s)), df.columns))
        return df

    # NOTE: a previously commented-out "corr_plot" variant (shift <= 9 subset)
    # was removed here as dead code.
    with exp_figure_saver(f"{experiment.name}_corr_plot_full"):
        corr_plot(
            df_cols_to_str(exog_data[list(sort_variables(exog_data.columns))]),
            rotation=70,
            fig_kwargs={"figsize": (8.2, 6.3)},
        )
        # Grid lines clutter the correlation matrix; turn them off before save.
        plt.grid(False)
def fit_buffered_loo_sample(experiment, radius, max_rad, seed, cache_check=False, **kwargs):
    """Run (or cache-check) a buffered leave-one-out fit for one experiment.

    Returns a `(data_info, hold_out_y, predicted_y)` tuple, where
    `data_info` bundles the sampling metadata.
    """
    # Operate on cached data only.
    get_endog_exog_mask.check_in_store(experiment)
    endog_data, exog_data, master_mask = get_endog_exog_mask(experiment)

    loo_kwargs = dict(
        exog_data=exog_data,
        endog_data=endog_data,
        master_mask=master_mask,
        radius=radius,
        max_rad=max_rad,
        extrapolation_check=False,
        seed=seed,
        verbose=False,
        dpi=300,
    )

    if cache_check:
        return buffered_leave_one_out.check_in_store(**loo_kwargs)

    (
        test_indices,
        n_ignored,
        n_train,
        n_hold_out,
        total_samples,
        hold_out_y,
        predicted_y,
    ) = buffered_leave_one_out(**loo_kwargs)

    # Bundle the sampling metadata separately from the observed/predicted data.
    data_info = (test_indices, n_ignored, n_train, n_hold_out, total_samples)

    # Prevents memory buildup over repeated calls.
    gc.collect()

    return (data_info, hold_out_y, predicted_y)
def fit_random_binary_dilation(experiment, structure, test_frac, seed, cache_check=False, **kwargs):
    """Fit and score a model on a random binary-dilation train/test split.

    When `cache_check` is True, each pipeline stage is verified against the
    cache in order, and the cached-score check result is returned instead of
    the scores themselves.
    """
    if cache_check:
        # Verify the experiment's source data is cached before anything else.
        get_data(experiment, cache_check=True)

    endog_data, exog_data, master_mask = get_endog_exog_mask(experiment)

    split_kwargs = dict(
        exog_data=exog_data,
        endog_data=endog_data,
        master_mask=master_mask,
        structure=structure,
        test_frac=test_frac,
        seed=seed,
        verbose=False,
    )

    if cache_check:
        # NOTE(review): no `return` here — presumably `check_in_store` raises
        # when the split is missing, and execution continues so the later
        # model/score caches are checked too. Confirm against the cache API.
        random_binary_dilation_split.check_in_store(**split_kwargs)

    (
        desc_str,
        data_info,
        X_train,
        X_test,
        y_train,
        y_test,
    ) = random_binary_dilation_split(**split_kwargs)

    # Retrieve (or cache-check) the fitted model; [0] extracts the model from
    # the optional_client_call result tuple.
    model = optional_client_call(
        get_model,
        dict(X_train=X_train, y_train=y_train),
        cache_check=cache_check,
    )[0]

    if cache_check:
        return get_model_scores.check_in_store(model, X_test, X_train, y_test, y_train)

    return data_info, get_model_scores(model, X_test, X_train, y_test, y_train)
def plot_obs_pred_comp(experiment, **kwargs):
    """Map the observed-minus-predicted BA difference for one experiment.

    Operates on cached data and cached fitted models only.
    """
    # Operate on cached data/models only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_val = get_experiment_split_data(experiment)
    get_model(X_train, y_train, cache_check=True)

    get_endog_exog_mask.check_in_store(experiment)
    master_mask = get_endog_exog_mask(experiment)[2]
    check_master_masks(master_mask)

    predictions = threading_get_model_predict(
        X_train=X_train,
        y_train=y_train,
        predict_X=X_test,
    )

    obs_pred_diff_cube = get_obs_pred_diff_cube(y_val, predictions, master_mask)

    saver = map_figure_saver(sub_directory=experiment.name)
    with saver(f"{experiment.name}_obs_pred_comp", sub_directory="predictions"):
        disc_cube_plot(
            obs_pred_diff_cube,
            fig=plt.figure(figsize=(5.1, 2.3)),
            cmap="BrBG",
            cmap_midpoint=0,
            cmap_symmetric=False,
            bin_edges=[-0.01, -0.001, -1e-4, 0, 0.001, 0.01, 0.02],
            extend="both",
            cbar_format=get_sci_format(ndigits=0),
            cbar_pad=0.025,
            cbar_label="Ob. - Pr.",
            **get_aux0_aux1_kwargs(y_val, master_mask),
            loc=(0.83, 0.14),
            height=0.055,
            aspect=1,
            spacing=0.06 * 0.2,
        )
def multi_model_ale_plot(*args, verbose=False, **kwargs):
    """Plot paired 1D ALE comparison figures across multiple experiments.

    Loads cached data and cached fitted models for a fixed set of
    experiments, then, for each variable pair (FAPAR/LAI and SIF/VOD),
    draws side-by-side multi-model ALE columns over several lags and
    saves the figure.

    Fix: the lag regex is now a raw string (`r"(\dM)"`) — the previous
    plain string relied on the invalid escape sequence `\d`, which raises
    a SyntaxWarning on modern Python.
    """
    # Experiments for which data will be plotted.
    experiments = [
        Experiment["ALL"],
        Experiment["TOP15"],
        Experiment["CURR"],
        Experiment["BEST15"],
        Experiment["15VEG_FAPAR"],
        Experiment["15VEG_LAI"],
        Experiment["15VEG_VOD"],
        Experiment["15VEG_SIF"],
        Experiment["CURRDD_FAPAR"],
        Experiment["CURRDD_LAI"],
        Experiment["CURRDD_VOD"],
        Experiment["CURRDD_SIF"],
    ]

    # Operate on cached data/models only.
    experiment_masks = []
    plotting_experiment_data = {}
    for experiment in tqdm(experiments, desc="Loading data"):
        get_data(experiment, cache_check=True)
        get_experiment_split_data.check_in_store(experiment)
        X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)
        get_model(X_train, y_train, cache_check=True)

        experiment_masks.append(get_endog_exog_mask(experiment)[2])
        plotting_experiment_data[experiment] = dict(
            model=get_model(X_train, y_train),
            X_train=X_train,
        )

    # Ensure masks are aligned.
    check_master_masks(*experiment_masks)

    lags = (0, 1, 3, 6, 9)
    for comp_vars in [[variable.FAPAR, variable.LAI], [variable.SIF, variable.VOD]]:
        fig, axes = plt.subplots(5, 2, sharex="col", figsize=(7.0, 5.8))

        # Create general legend labels (with 'X' instead of FAPAR, or LAI, etc...).
        mod_exp_plot_kwargs = deepcopy(experiment_plot_kwargs)
        for plot_kwargs in mod_exp_plot_kwargs.values():
            if plot_kwargs["label"].startswith("15VEG_"):
                plot_kwargs["label"] = "15VEG_X"
            elif plot_kwargs["label"].startswith("CURRDD_"):
                plot_kwargs["label"] = "CURRDD_X"

        # Axis scaling factors; the x-factor is currently a no-op (10**0).
        x_factor_exp = 0
        x_factor = 10**x_factor_exp
        # x_factor_str = rf"$10^{{{x_factor_exp}}}$"

        y_factor_exp = -4
        y_factor = 10**y_factor_exp
        y_factor_str = rf"$10^{{{y_factor_exp}}}$"

        # Left column: first variable, generalised ('X') legend labels.
        multi_model_ale_1d(
            comp_vars[0],
            plotting_experiment_data,
            mod_exp_plot_kwargs,
            verbose=verbose,
            legend_bbox=(0.5, 1.01),
            fig=fig,
            axes=axes[:, 0:1],
            lags=lags,
            x_ndigits=2,
            x_factor=x_factor,
            x_rotation=0,
            y_ndigits=0,
            y_factor=y_factor,
        )
        # Right column: second variable, no legend (shared with left column).
        multi_model_ale_1d(
            comp_vars[1],
            plotting_experiment_data,
            experiment_plot_kwargs,
            verbose=verbose,
            legend=False,
            fig=fig,
            axes=axes[:, 1:2],
            lags=lags,
            x_ndigits=2,
            x_factor=x_factor,
            x_rotation=0,
            y_ndigits=0,
            y_factor=y_factor,
        )

        for ax in axes[:, 1]:
            ax.set_ylabel("")
        for ax in axes[:, 0]:
            # Extract the lag ('3M', '9M', ...) from the x-label, if present.
            lag_match = re.search(r"(\dM)", ax.get_xlabel())
            if lag_match:
                lag_m = f" {lag_match.group(1)}"
            else:
                lag_m = ""
            ax.set_ylabel(f"ALE{lag_m} ({y_factor_str} BA)")
        for ax in axes.flatten():
            ax.set_xlabel("")
        for ax, var in zip(axes[-1], comp_vars):
            # The x-label omits the factor, so it must remain 1 (10**0).
            assert x_factor_exp == 0
            ax.set_xlabel(f"{shorten_features(str(var))} ({variable.units[var]})")
        for ax, title in zip(axes.flatten(), ascii_lowercase):
            ax.text(0.5, 1.05, f"({title})", transform=ax.transAxes)

        margin = 0.4
        for ax in axes.ravel():
            ax.set_xlim(-margin, 20 + margin)

        fig.tight_layout(h_pad=0.4)
        fig.align_labels()

        figure_saver.save_figure(
            fig,
            f"{'__'.join(map(shorten_features, map(str, comp_vars)))}_ale_comp",
            sub_directory="ale_comp",
        )
# Module-level logging and warning configuration.
logger = logging.getLogger(__name__)
enable_logging(level="WARNING")

# Silence known-noisy third-party warnings (iris coordinate handling, SHAP).
warnings.filterwarnings("ignore", ".*Collapsing a non-contiguous coordinate.*")
warnings.filterwarnings("ignore", ".*DEFAULT_SPHERICAL_EARTH_RADIUS.*")
warnings.filterwarnings("ignore", ".*guessing contiguous bounds.*")
warnings.filterwarnings(
    "ignore", 'Setting feature_perturbation = "tree_path_dependent".*')

if __name__ == "__main__":
    experiment = Experiment["15VEG_FAPAR"]

    # Operate on cached model / data only.
    get_endog_exog_mask.check_in_store(experiment)
    endog_data, _, master_mask = get_endog_exog_mask(experiment)
    check_master_masks(master_mask)

    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Require a cached fitted model, then load it.
    get_model(X_train, y_train, cache_check=True)
    rf = get_model(X_train, y_train)

    # Require cached SHAP values for the test set, then load them.
    get_shap_values.check_in_store(rf=rf, X=X_test)
    shap_values = get_shap_values(rf=rf, X=X_test)

    # Analysis / plotting parameters.
    diff_threshold = 0.5
    ptp_threshold_factor = 0.12  # relative to the mean
def prediction_comparisons():
    """Compare ALL and CURR predictions."""
    experiments = [Experiment.ALL, Experiment.CURR]

    # Operate on cached data/models only.
    experiment_data = {}
    experiment_models = {}
    for experiment in experiments:
        get_data(experiment, cache_check=True)
        get_experiment_split_data.check_in_store(experiment)
        X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)
        get_model(X_train, y_train, cache_check=True)

        experiment_data[experiment] = get_endog_exog_mask(experiment)
        experiment_models[experiment] = get_model(X_train, y_train)

    # Ensure masks are aligned.
    check_master_masks(*(data[2] for data in experiment_data.values()))

    # Masks are aligned, so any experiment's mask can stand in for all.
    master_mask = next(iter(experiment_data.values()))[2]

    # Record predictions and errors.
    experiment_predictions = {}
    experiment_errors = {}
    map_experiment_predictions = {}
    map_experiment_errors = {}

    for experiment in experiments:
        X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)
        predicted_test = threading_get_model_predict(
            X_train=X_train,
            y_train=y_train,
            predict_X=X_test,
        )

        print("Experiment:", experiment.name)
        print("mean observed test:", np.mean(y_test.values))
        print("mean predicted test:", np.mean(predicted_test))
        print("lowest observed test:", np.min(y_test.values))
        print(
            "fraction of times this occurs:",
            np.sum(y_test.values == np.min(y_test.values)) / y_test.values.size,
        )
        print("lowest test prediction:", np.min(predicted_test))

        experiment_predictions[experiment] = predicted_test
        experiment_errors[experiment] = y_test.values - predicted_test

        # Re-map flat validation-sample arrays onto the 2D master-mask grid.
        map_experiment_predictions[experiment] = get_mm_data(
            experiment_predictions[experiment], master_mask, kind="val"
        )
        map_experiment_errors[experiment] = get_mm_data(
            experiment_errors[experiment], master_mask, kind="val"
        )

    # |CURR error| - |ALL error|: positive where the second experiment is worse.
    error_mag_diff = np.abs(map_experiment_errors[experiments[1]]) - np.abs(
        map_experiment_errors[experiments[0]]
    )

    # NOTE(review): relies on `experiment` leaking from the loop above, i.e.
    # this re-fetches the LAST experiment's y_test — presumably intentional
    # since the observed values are shared; confirm the splits agree.
    y_test = get_experiment_split_data(experiment)[3]

    rel_error_mag_diff = np.mean(error_mag_diff, axis=0) / np.mean(
        get_mm_data(y_test.values, master_mask, kind="val"), axis=0
    )
    all_rel = get_unmasked(rel_error_mag_diff)

    print(f"% >0: {100 * np.sum(all_rel > 0) / all_rel.size:0.1f}")
    print(f"% <0: {100 * np.sum(all_rel < 0) / all_rel.size:0.1f}")

    fig, ax, cbar = disc_cube_plot(
        dummy_lat_lon_cube(rel_error_mag_diff),
        bin_edges=(-0.5, 0, 0.5),
        extend="both",
        cmap="PiYG",
        cmap_midpoint=0,
        cmap_symmetric=False,
        cbar_label=f"<|Err({experiments[1].name})| - |Err({experiments[0].name})|> / <Ob.>",
        cbar_shrink=0.3,
        cbar_aspect=15,
        cbar_extendfrac=0.1,
        cbar_pad=0.02,
        cbar_format=None,
        **get_aux0_aux1_kwargs(y_test, master_mask),
        loc=(0.79, 0.14),
        height=0.05,
        aspect=1.25,
        spacing=0.06 * 0.2,
    )
    cbar.ax.yaxis.label.set_size(7)
    map_figure_saver.save_figure(
        fig, f"rel_error_mag_diff_{'_'.join(map(attrgetter('name'), experiments))}"
    )