*veg_lag_product, ) for veg_lag_product in product(*veg_lags) ] assert all(len(combination) == 15 for combination in combinations) args = [[], []] for combination in combinations: for i in range(n_splits): args[0].append(combination) args[1].append(i) args_scores = run( combination_fit, *args, cx1_kwargs=cx1_kwargs, return_local_args=True ) if args_scores is None or ( isinstance(args_scores, dict) and set(args_scores) == { "present", "uncached", } ): sys.exit(0) # Load cached data for all combinations / splits. # Get training and test data for all variables.
chosen_ba_data, bins=50, max_lag=2000, n_jobs=get_ncpus(), n_per_job=6000, verbose=True, ) # fig.suptitle(f"{title}, {inds.shape[0]} samples (out of {valid_indices.shape[0]})") ax1.set_ylabel("Semivariance") ax2.set_ylabel("N") ax2.set_yscale("log") ax1.set_xlabel("Lag (km)") for ax in (ax1, ax2): ax.grid() format_label_string_with_exponent(ax1, axis="y") fig.align_labels() figure_saver.save_figure(fig, "mean_gfed4_variogram") def plot_mean_gfed4_variogram(*args, **kwargs): gfed4_variogram(-1) if __name__ == "__main__": cx1_kwargs = dict(ncpus=1, walltime="24:00:00", memory="10GB") run(plot_mean_gfed4_variogram, [None], cx1_kwargs=cx1_kwargs)
islice(param_iter(), None, total), desc=f"2D ALE plotting ({experiment})", total=total, disable=not verbose, ): save_ale_2d( experiment=experiment, model=model, train_set=X_train, features=columns, n_jobs=get_ncpus(), include_first_order=True, plot_samples=plot_samples, figure_saver=exp_figure_saver, ale_factor_exp=plotting_configuration.ale_factor_exps.get( (columns[0].parent, columns[1].parent), -2 ), x_factor_exp=plotting_configuration.factor_exps.get(columns[0].parent, 0), x_ndigits=plotting_configuration.ndigits.get(columns[0].parent, 2), y_factor_exp=plotting_configuration.factor_exps.get(columns[1].parent, 0), y_ndigits=plotting_configuration.ndigits.get(columns[1].parent, 2), ) plt.close("all") if __name__ == "__main__": # Relevant if called with the command 'cx1' instead of 'local'. cx1_kwargs = dict(walltime="01:00:00", ncpus=1, mem="10GB") run(plot_2d_ale, list(Experiment), cx1_kwargs=cx1_kwargs)
Experiment["15VEG_FAPAR"], variable.DRY_DAY_PERIOD[3], "(c) 15VEG_FAPAR", ), axes[1, 1]: ( Experiment["15VEG_FAPAR_MON"], variable.DRY_DAY_PERIOD[3], "(d) 15VEG_FAPAR_MON", ), } for (ax, (experiment, column, title)) in tqdm(plot_spec.items(), desc="ALE plots", disable=not verbose): plot_single_1d_ale(experiment, column, ax=ax, verbose=verbose) ax.set_title(title) gc.collect() for ax in axes[:, 1]: ax.set_ylabel("") fig.tight_layout() fig.align_labels() figure_saver.save_figure(fig, "15VEG_FAPAR_15VEG_FAPAR_MON_ALE_comp") if __name__ == "__main__": # Relevant if called with the command 'cx1' instead of 'local'. cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB") run(plot_clim_mon_ale_comp, [None], cx1_kwargs=cx1_kwargs)
) = random_binary_dilation_split(**split_kwargs) model = optional_client_call( get_model, dict(X_train=X_train, y_train=y_train), cache_check=cache_check, )[0] if cache_check: return get_model_scores.check_in_store(model, X_test, X_train, y_test, y_train) return data_info, get_model_scores(model, X_test, X_train, y_test, y_train) if __name__ == "__main__": cx1_kwargs = dict(walltime="04:00:00", ncpus=32, mem="60GB") experiments = list(Experiment) args = [] for experiment in experiments: for structure_info, structure in structures: for test_frac in [0.1, 0.05, 0.01]: for seed in range(4): args.append((experiment, structure, test_frac, seed)) output = run(fit_random_binary_dilation, *zip(*args), cx1_kwargs=cx1_kwargs) from pprint import pprint pprint(output)
warnings.filterwarnings(
    action="ignore",
    message='Setting feature_perturbation = "tree_path_dependent".*',
)


def get_experiment_model_scores(experiment, cache_check=False, **kwargs):
    """Return the model scores for a single experiment.

    Both the split data and the fitted model are first verified via their
    respective cache checks (`check_in_store` / `cache_check=True`) before
    being retrieved, so this function is meant to operate on cached data and
    cached fitted models only.

    Args:
        experiment: Experiment to retrieve scores for.
        cache_check: If True, return the result of
            `get_model_scores.check_in_store(...)` instead of the scores.
        **kwargs: Accepted but not used here.

    Returns:
        The output of `get_model_scores(...)`, or of its `check_in_store`
        counterpart if `cache_check` is True.

    """
    # Split data: verify cache presence, then load.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Fitted model: verify cache presence, then load.
    get_model(X_train, y_train, cache_check=True)
    fitted_model = get_model(X_train, y_train)

    # The same positional arguments are used for the check and the real call.
    score_args = (fitted_model, X_test, X_train, y_test, y_train)
    if not cache_check:
        return get_model_scores(*score_args)
    return get_model_scores.check_in_store(*score_args)


if __name__ == "__main__":
    # Map each experiment name onto the scores returned for it.
    scores = dict(
        (exp.name, vals)
        for exp, vals in zip(
            list(Experiment),
            run(get_experiment_model_scores, list(Experiment), cx1_kwargs=False),
        )
    )
    print(pd.DataFrame(scores))
if cache_check: return calculate_pfi.check_in_store(*pfi_train_args) return { "train": calculate_pfi(*pfi_train_args), "test": calculate_pfi(*pfi_test_args), } if __name__ == "__main__": # Relevant if called with the command 'cx1' instead of 'local'. cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB") experiments = list(Experiment) args_pfi_results = run( pfi_calc, experiments, cx1_kwargs=cx1_kwargs, return_local_args=True ) if args_pfi_results is None: sys.exit(0) args, kwargs, pfi_results = args_pfi_results pfi_importances = {} for exp, pfi_result in zip(args[0], pfi_results): # Join the train and test data. pfi_importances[exp] = ( pfi_result["train"] .set_index("feature", drop=True) .rename({"weight": "train weight", "std": "train std"}, axis="columns") .join( pfi_result["test"]
def df_cols_to_str(df): df.columns = list(map(lambda s: shorten_features(str(s)), df.columns)) return df # with exp_figure_saver("corr_plot"): # corr_plot( # df_cols_to_str( # exog_data[ # list( # sort_variables( # var for var in exog_data.columns if var.shift <= 9 # ) # ) # ] # ), # fig_kwargs={"figsize": (12, 8)}, # ) # plt.grid(False) with exp_figure_saver(f"{experiment.name}_corr_plot_full"): corr_plot( df_cols_to_str(exog_data[list(sort_variables(exog_data.columns))]), rotation=70, fig_kwargs={"figsize": (8.2, 6.3)}, ) plt.grid(False) if __name__ == "__main__": run(correlation_plot, list(Experiment), cx1_kwargs=False)
X_train, X_test, y_train, y_test = get_experiment_split_data( experiment) for kind in ["train", "test"]: if kind == "train": X = X_train elif kind == "test": X = X_test else: raise ValueError(f"Unknown kind '{kind}'.") N = get_shap_params(X)["max_index"] + 1 indices = np.arange(N) args[0].extend([experiment] * N) args[1].extend(indices) args[2].extend([kind] * N) raw_shap_data = run(shap_values, *args, cx1_kwargs=cx1_kwargs) if raw_shap_data is None: if run_experiments: # Experiments were submitted as CX1 jobs. sys.exit(0) # Otherwise, experiments were already present as a fully cached value. if isinstance(raw_shap_data, dict) and set(raw_shap_data) == { "present", "uncached", }: # Checking was performed. print("Full cache present for:", end="") pprint( set(
y_test=y_test, leave_out=("", *selected_features[experiment]), local_n_jobs=(1 if (get_ncpus() < 4) else (get_ncpus() - 2)), ), cache_check=cache_check, add_client=True, )[0] if cache_check: return IN_STORE return loco_results if __name__ == "__main__": args_loco_results = run( loco_calc, list(Experiment), cx1_kwargs=False, return_local_args=True ) if args_loco_results is None: sys.exit(0) args, kwargs, loco_results = args_loco_results vis_data = {} for experiment, exp_results in zip(args[0], loco_results): for leave_out, results in exp_results.items(): vis_data[(experiment, leave_out)] = results combined_df = pd.DataFrame(vis_data).T combined_df.index.names = ["experiment", "feature"] combined_df.rename(
get_experiment_split_data.check_in_store(experiment) X_train, X_test, y_train, y_test = get_experiment_split_data(experiment) if cache_check: return get_model(X_train=X_train, y_train=y_train, cache_check=True) model = get_model( X_train=X_train, y_train=y_train, parallel_backend_call=( # Use local threading backend - avoid the Dask backend. partial(parallel_backend, "threading", n_jobs=get_ncpus())), ) return model if __name__ == "__main__": cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB") args_models = run( fit_experiment_model, list(Experiment), cx1_kwargs=cx1_kwargs, return_local_args=True, ) if args_models is None: sys.exit(0) args, kwargs, models = args_models models = {exp: fitted_model for exp, fitted_model in zip(args[0], models)}
warnings.filterwarnings("ignore", ".*Collapsing a non-contiguous coordinate.*")
warnings.filterwarnings("ignore", ".*DEFAULT_SPHERICAL_EARTH_RADIUS.*")
warnings.filterwarnings("ignore", ".*guessing contiguous bounds.*")
warnings.filterwarnings(
    "ignore", 'Setting feature_perturbation = "tree_path_dependent".*'
)


def get_experiment_data(experiment, cache_check=False, **kwargs):
    """Return the train/test split data for a single experiment.

    Args:
        experiment: Experiment to retrieve the split data for.
        cache_check: If True, only perform the cache check and return its
            result; the data itself is not loaded.
        **kwargs: Accepted but not used here.

    Returns:
        `(X_train, X_test, y_train, y_test)`, or the result of
        `get_experiment_split_data.check_in_store(experiment)` if
        `cache_check` is True.

    """
    if cache_check:
        # Return the check result instead of falling through to the (possibly
        # expensive) data loading below.
        return get_experiment_split_data.check_in_store(experiment)

    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)
    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    cx1_kwargs = dict(walltime="04:00:00", ncpus=32, mem="60GB")
    experiments = list(Experiment)

    results = run(get_experiment_data, experiments, cx1_kwargs=cx1_kwargs)

    # NOTE(review): assumes `run()` returns None when nothing was computed
    # locally (e.g. jobs were submitted) and a dict with exactly the keys
    # 'present' and 'uncached' when only a cache check was performed - confirm
    # against `run()`. In both cases there is no data to print.
    if results is None or (
        isinstance(results, dict) and set(results) == {"present", "uncached"}
    ):
        raise SystemExit(0)

    experiment_data = dict(zip(experiments, results))

    for (experiment, (X_train, X_test, y_train, y_test)) in experiment_data.items():
        print(f"{experiment} → {y_train.name}")
        pprint(X_train.columns)
        print()
# Prevents memory buildup over repeated calls. gc.collect() return (data_info, hold_out_y, predicted_y) if __name__ == "__main__": # For 40 estimators, ~25 minutes per fit operation. cx1_kwargs = dict(walltime="24:00:00", ncpus=1, mem="5GB") experiments = list(Experiment) max_rad = 50 # Batches of 1000s (x8 rads) submitted as separate CX1 array jobs due to job size limitations. for seeds in [ range(1000), range(1000, 2000), range(2000, 3000), range(3000, 4000) ]: args = [[], [], [], []] for experiment in experiments: for radius in np.linspace(0, max_rad, 8): for seed in seeds: args[0].append(experiment) args[1].append(radius) args[2].append(max_rad) args[3].append(seed) results = run(fit_buffered_loo_sample, *args, cx1_kwargs=cx1_kwargs)
# Parse the command line arguments to determine which experiments to process.
cmd_args = get_parsers()["parser"].parse_args()

if cmd_args.experiment is not None:
    # Resolve the requested experiment names once, then keep the original
    # ordering of `experiments` while filtering.
    requested_experiments = tuple(Experiment[name] for name in cmd_args.experiment)
    chosen_experiments = [exp for exp in experiments if exp in requested_experiments]
else:
    chosen_experiments = experiments.copy()

# Only keep the first experiment if `single` was requested.
chosen_experiments = chosen_experiments[: 1 if cmd_args.single else None]

for experiment in tqdm(
    chosen_experiments,
    desc="Preparing ALE 1D arguments",
    disable=not cmd_args.verbose,
):
    # Operate on cached data / models only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)
    get_model(X_train, y_train, cache_check=True)

    for column in X_train.columns:
        # One (experiment, column) pair per call. `append` is required here:
        # `extend` would add the individual elements of `column` instead,
        # leaving `args[1]` out of step with `args[0]`.
        args[0].append(experiment)
        args[1].append(column)

run(plot_1d_ale, *args, cx1_kwargs=cx1_kwargs)
lag_m = "" ax.set_ylabel(f"ALE{lag_m} ({y_factor_str} BA)") for ax in axes.flatten(): ax.set_xlabel("") for ax, var in zip(axes[-1], comp_vars): assert x_factor_exp == 0 ax.set_xlabel( f"{shorten_features(str(var))} ({variable.units[var]})") for ax, title in zip(axes.flatten(), ascii_lowercase): ax.text(0.5, 1.05, f"({title})", transform=ax.transAxes) margin = 0.4 for ax in axes.ravel(): ax.set_xlim(-margin, 20 + margin) fig.tight_layout(h_pad=0.4) fig.align_labels() figure_saver.save_figure( fig, f"{'__'.join(map(shorten_features, map(str, comp_vars)))}_ale_comp", sub_directory="ale_comp", ) if __name__ == "__main__": run(multi_model_ale_plot, [None], cx1_kwargs=False)
mew=1, clip_on=False, ) ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs) ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs) for ax in (ax1, ax2): ax.set_xticks(list(range(len(experiments)))) figure_saver.save_figure(fig, "model_comp_scores") if __name__ == "__main__": experiment_groups = ( ( Experiment.ALL, Experiment.TOP15, Experiment.CURR, Experiment["15VEG_FAPAR"], Experiment["15VEG_LAI"], Experiment["15VEG_SIF"], Experiment["15VEG_VOD"], Experiment.CURRDD_FAPAR, Experiment.CURRDD_LAI, Experiment.CURRDD_SIF, Experiment.CURRDD_VOD, Experiment.BEST15, ), ) run(plot_score_groups, experiment_groups, cx1_kwargs=False)
X_train=X_train, y_train=y_train, predict_X=X_test, ) obs_pred_diff_cube = get_obs_pred_diff_cube(y_val, u_pre, master_mask) with map_figure_saver(sub_directory=experiment.name)( f"{experiment.name}_obs_pred_comp", sub_directory="predictions"): disc_cube_plot( obs_pred_diff_cube, fig=plt.figure(figsize=(5.1, 2.3)), cmap="BrBG", cmap_midpoint=0, cmap_symmetric=False, bin_edges=[-0.01, -0.001, -1e-4, 0, 0.001, 0.01, 0.02], extend="both", cbar_format=get_sci_format(ndigits=0), cbar_pad=0.025, cbar_label="Ob. - Pr.", **get_aux0_aux1_kwargs(y_val, master_mask), loc=(0.83, 0.14), height=0.055, aspect=1, spacing=0.06 * 0.2, ) if __name__ == "__main__": run(plot_obs_pred_comp, list(Experiment), cx1_kwargs=False)
# Plot the legend in between the two axes. axes[1].legend( loc="center", ncol=5, bbox_to_anchor=( np.mean( [ axes[0].get_position().xmax, axes[1].get_position().xmin, ] ), 0.932, ), bbox_transform=fig.transFigure, handletextpad=0.25, columnspacing=0.5, ) exp_figure_saver.save_figure( fig, f'{experiment.name}_{"__".join(map(shorten_features, map(str, features)))}_ale_shifts', sub_directory="multi_ale", transparent=False, ) if __name__ == "__main__": # Relevant if called with the command 'cx1' instead of 'local'. cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB") run(plot_multi_ale, list(Experiment), cx1_kwargs=cx1_kwargs)
def plot_ba(experiment, **kwargs):
    """Plot test-set predictions for a single experiment using `ba_plotting`.

    The split data, the endog/exog mask and the fitted model are each verified
    via their cache checks before use, so this function is meant to operate on
    cached data and cached fitted models only.

    Args:
        experiment: Experiment to plot. Its `name` is used for the figure
            sub-directory and the output filename.
        **kwargs: Accepted but not used here.

    """
    # Split data: verify cache presence, then load.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Master mask: verify cache presence, then take the third returned item.
    get_endog_exog_mask.check_in_store(experiment)
    mask_outputs = get_endog_exog_mask(experiment)
    master_mask = mask_outputs[2]
    check_master_masks(master_mask)

    # Fitted model: verify cache presence only.
    get_model(X_train, y_train, cache_check=True)

    predict_kwargs = dict(X_train=X_train, y_train=y_train, predict_X=X_test)
    predicted_test = threading_get_model_predict(**predict_kwargs)

    # Assemble the plotting inputs in the same order as they are passed on.
    plotting_data = get_ba_plotting_data(predicted_test, y_test, master_mask)
    experiment_saver = map_figure_saver(sub_directory=experiment.name)
    aux_kwargs = get_aux0_aux1_kwargs(y_test, master_mask)

    ba_plotting(
        *plotting_data,
        figure_saver=experiment_saver,
        **aux_kwargs,
        filename=f"{experiment.name}_ba_prediction",
    )


if __name__ == "__main__":
    run(plot_ba, list(Experiment), cx1_kwargs=False)
# Use the matplotlibrc file located two directories above this file.
mpl.rc_file(Path(__file__).resolve().parents[1] / "matplotlibrc")

# Route 'alepython' loguru output to stderr at WARNING level and above.
loguru_logger.enable("alepython")
loguru_logger.remove()
loguru_logger.add(sys.stderr, level="WARNING")

logger = logging.getLogger(__name__)
enable_logging(level="WARNING")

# Silence known, noisy warnings.
warnings.filterwarnings(
    action="ignore", message=".*Collapsing a non-contiguous coordinate.*"
)
warnings.filterwarnings(action="ignore", message=".*DEFAULT_SPHERICAL_EARTH_RADIUS.*")
warnings.filterwarnings(action="ignore", message=".*guessing contiguous bounds.*")
warnings.filterwarnings(
    action="ignore",
    message='Setting feature_perturbation = "tree_path_dependent".*',
)


def calling_cached(x):
    """Return `cached_example_function(x)`.

    Thin wrapper used as the target of the `run()` calls in this script.

    """
    result = cached_example_function(x)
    return result


if __name__ == "__main__":
    # Relevant if called with the command 'cx1' instead of 'local'.
    cx1_kwargs = {"walltime": "01:00:00", "ncpus": 1, "mem": "1GB"}

    # This works both with single jobs...
    single_job_args = (1,)
    run(calling_cached, single_job_args, cx1_kwargs=cx1_kwargs)

    # ... and array jobs.
    array_job_args = (2, 3, 4)
    run(calling_cached, array_job_args, cx1_kwargs=cx1_kwargs)