def run(args, i_cv):
    """Run one cross-validation fold of the histogram-summary likelihood fit.

    A ``HistogramSummaryComputer`` with 10 bins is fitted on a training sample
    generated at ``config.CALIBRATED``; ``run_iter`` is then called once per
    configuration yielded by ``config.iter_test_config()``.

    Args:
        args: parsed command-line options (not read by this function).
        i_cv: fold index; offsets the generator seeds and names the output
            folder ``DIRECTORY/cv_<i_cv>``.

    Returns:
        pandas.DataFrame with one row per test configuration, also written to
        ``results.csv`` in the fold folder.
    """
    log = logging.getLogger()
    print_line()
    log.info('Running iter n°{}'.format(i_cv))
    print_line()

    # Per-fold output folder.
    out_dir = os.path.join(DIRECTORY, f'cv_{i_cv}')
    os.makedirs(out_dir, exist_ok=True)

    cfg = Config()
    base_seed = SEED + i_cv * 5
    train_gen = Generator(base_seed)
    valid_gen = Generator(base_seed + 1)
    test_gen = Generator(base_seed + 2)

    n_bins = 10
    # Only the features are used to fit the histogram summaries.
    X_train, _y_train, _w_train = train_gen.generate(
        *cfg.CALIBRATED, n_samples=cfg.N_TRAINING_SAMPLES)
    histogram = HistogramSummaryComputer(n_bins=n_bins)
    compute_summaries = histogram.fit(X_train)

    rows = []
    for i_iter, test_config in enumerate(cfg.iter_test_config()):
        rows.append(run_iter(compute_summaries, i_cv, i_iter, test_config,
                             valid_gen, test_gen, out_dir))
    result_table = pd.DataFrame(rows)
    result_table.to_csv(os.path.join(out_dir, 'results.csv'))

    log.info('Plot params')
    for name in cfg.PARAM_NAMES:
        plot_params(name, result_table, title='Likelihood fit', directory=out_dir)
    return result_table
def run_conditional_estimation(args, i_cv):
    """Run the conditional-estimation pipeline for cross-validation fold ``i_cv``.

    Builds the generators and the model, calls ``train_or_load_neural_net``,
    collects validation metrics, then calls
    ``run_conditional_estimation_iter`` once per configuration yielded by
    ``config.iter_test_config()``.

    Args:
        args: parsed options; ``args.cuda`` and ``args.retrain`` are read here
            and ``args`` is forwarded to ``build_model``.
        i_cv: fold index; offsets the seeds and is stored in the output.

    Returns:
        The ``pd.concat`` of the per-iteration results with an ``i_cv``
        column, also written to ``conditional_estimations.csv`` under
        ``model.results_path``.
    """
    log = logging.getLogger()
    print_line()
    log.info('Running iter n°{}'.format(i_cv))
    print_line()

    row = dict(i_cv=i_cv)

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    cfg = Config()
    base_seed = SEED + i_cv * 5
    torch_generator = GeneratorTorch(base_seed, cuda=args.cuda)
    train_gen = TrainGenerator(torch_generator, cuda=args.cuda)
    valid_gen = Generator(base_seed + 1)
    test_gen = Generator(base_seed + 2)

    # SET MODEL
    log.info('Set up classifier')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    train_or_load_neural_net(model, train_gen, retrain=args.retrain)

    # CHECK TRAINING
    log.info('Generate validation data')
    X_valid, y_valid, w_valid = valid_gen.generate(
        *cfg.CALIBRATED, n_samples=cfg.N_VALIDATION_SAMPLES)
    nn_metrics = evaluate_neural_net(model, prefix='valid')
    row.update(nn_metrics)
    clf_metrics = evaluate_classifier(model, X_valid, y_valid, w_valid,
                                      prefix='valid')
    row.update(clf_metrics)

    # MEASUREMENT
    evaluate_summary_computer(model, X_valid, y_valid, w_valid,
                              n_bins=N_BINS, prefix='valid_', suffix='')
    frames = []
    for i_iter, test_config in enumerate(cfg.iter_test_config()):
        frames.append(run_conditional_estimation_iter(
            model, row, i_iter, test_config, valid_gen, test_gen,
            n_bins=N_BINS))

    conditional_estimate = pd.concat(frames)
    conditional_estimate['i_cv'] = i_cv
    csv_path = os.path.join(model.results_path, "conditional_estimations.csv")
    conditional_estimate.to_csv(csv_path)
    log.info('DONE')
    return conditional_estimate
def run(args, i_cv):
    """Run one cross-validation fold of the regressor benchmark.

    Builds the generators and the model, calls ``train_or_load_neural_net``,
    records validation metrics, then calls ``run_iter`` once per
    configuration yielded by ``config.iter_test_config()``.

    Args:
        args: parsed options; ``args.retrain`` is read here and ``args`` is
            forwarded to ``build_model``.
        i_cv: fold index; offsets the generator seeds and is stored in every
            result row.

    Returns:
        pandas.DataFrame of the per-configuration rows, also written to
        ``results.csv`` under ``model.results_path``.
    """
    log = logging.getLogger()
    print_line()
    log.info('Running iter n°{}'.format(i_cv))
    print_line()

    row = dict(i_cv=i_cv)

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    cfg = Config()
    base_seed = SEED + i_cv * 5
    raw_train_gen = Generator(base_seed)
    valid_gen = Generator(base_seed + 1)
    test_gen = Generator(base_seed + 2)
    train_gen = TrainGenerator(param_generator, raw_train_gen)

    # SET MODEL
    log.info('Set up regressor')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    train_or_load_neural_net(model, train_gen, retrain=args.retrain)

    # CHECK TRAINING
    log.info('Generate validation data')
    # The validation sample is drawn but not consumed below; the call is kept
    # so the validation generator is advanced exactly as before.
    _X_valid, _y_valid, _w_valid = valid_gen.generate(
        *cfg.CALIBRATED, n_samples=cfg.N_VALIDATION_SAMPLES)
    nn_metrics = evaluate_neural_net(model, prefix='valid')
    row.update(nn_metrics)
    evaluate_regressor(model, prefix='valid')

    # MEASUREMENT
    row['nfcn'] = NCALL
    rows = []
    for i_iter, test_config in enumerate(cfg.iter_test_config()):
        rows.append(run_iter(model, row, i_iter, test_config,
                             valid_gen, test_gen))
    result_table = pd.DataFrame(rows)
    result_table.to_csv(os.path.join(model.results_path, 'results.csv'))

    log.info('Plot params')
    for name in cfg.PARAM_NAMES:
        plot_params(name, result_table, title=model.full_name,
                    directory=model.results_path)

    log.info('DONE')
    return result_table
def main():
    """Estimate V_stat / V_syst of the mu estimator for the gradient-boosting model.

    For each of 25 nuisance-parameter draws from ``param_generator`` the
    classifier is retrained ``N_ITER`` times and ``mu`` is fitted on a fixed
    test sample. The mean and variance of the fitted ``mu`` are stored per
    draw, then combined as ``V_stat = mean(variances)``,
    ``V_syst = var(means)`` and ``V_total = V_stat + V_syst``, all logged.

    NOTE(review): the source was whitespace-collapsed; the loop nesting below
    is reconstructed from the data flow and should be confirmed.
    """
    # BASIC SETUP
    logger = set_logger()
    args = GB_parse_args(main_description="Training launcher for Gradient boosting on S3D2 benchmark")
    logger.info(args)
    flush(logger)

    # Config
    config = Config()
    config.TRUE = Parameter(r=0.1, lam=2.7, mu=0.1)

    train_generator = Generator(SEED)
    valid_generator = Generator(SEED+1)
    test_generator = Generator(SEED+2)
    X_test, _y_test, w_test = test_generator.generate(
        *config.TRUE, n_samples=config.N_TESTING_SAMPLES)

    # for nuisance in p(nuisance | data)
    nuisance_param_sample = []
    for _ in range(25):
        nuisance_param_sample.append(param_generator().nuisance_parameters)

    average_list = []
    variance_list = []
    all_results = []
    for nuisance_params in nuisance_param_sample:
        logger.info(f"nuisance_params = {nuisance_params}")
        estimator_values = []
        results = dict(zip(config.TRUE.nuisance_parameters_names, nuisance_params))

        def compute_nll(mu):
            # Nuisance parameters are frozen to the current draw; only mu is free.
            return nll_computer(*nuisance_params, mu)

        for i_cv in range(N_ITER):
            clf = build_model(args, i_cv)
            parameters = Parameter(*nuisance_params, config.CALIBRATED.interest_parameters)
            print(parameters)
            n_samples = config.N_TRAINING_SAMPLES
            X_train, y_train, w_train = train_generator.generate(*parameters, n_samples=n_samples)
            logger.info(f"Training {clf.full_name}")
            # TODO : is it OK to provide w_train to the classifier or useless ?
            clf.fit(X_train, y_train, w_train)

            compute_summaries = ClassifierSummaryComputer(clf, n_bins=10)
            # compute_nll reads nll_computer from this scope when it is called.
            nll_computer = NLLComputer(compute_summaries, valid_generator,
                                       X_test, w_test, config=config)
            minimizer = get_minimizer(compute_nll)
            fit_values = evaluate_minuit(minimizer, [config.TRUE.interest_parameters])
            results.update(fit_values)
            all_results.append(results.copy())
            # TODO : Add results to some csv
            estimator_values.append(results['mu'])

        average_list.append(np.mean(estimator_values))
        variance_list.append(np.var(estimator_values))

    logger.info(f"average_list {average_list}")
    logger.info(f"variance_list {variance_list}")
    v_stat = np.mean(variance_list)
    v_syst = np.var(average_list)
    v_total = v_stat + v_syst
    logger.info(f"V_stat = {v_stat}")
    logger.info(f"V_syst = {v_syst}")
    logger.info(f"V_total = {v_total}")
def main():
    """Launch the regressor benchmark for a single run (``i_cv = 0``).

    Builds the model, writes the table returned by ``evaluate_config`` to
    ``config_table.csv``, calls ``train_or_load_neural_net``, then calls
    ``run_iter`` once per configuration yielded by
    ``config.iter_test_config()``. The rows are written to ``results.csv``
    under ``model.results_directory`` and ``plot_params`` is called for
    ``CALIB_PARAM_NAME``.
    """
    # BASIC SETUP
    log = set_logger()
    args = REG_parse_args(
        main_description="Training launcher for Regressor on S3D2 benchmark")
    log.info(args)
    flush(log)

    # Setup model
    log.info("Setup model")
    model = build_model(args, 0)
    os.makedirs(model.results_directory, exist_ok=True)

    # Setup data
    log.info("Setup data")
    cfg = Config()
    config_table = evaluate_config(cfg)
    config_csv = os.path.join(model.results_directory, 'config_table.csv')
    config_table.to_csv(config_csv)
    base_seed = SEED + 99999
    train_gen = TrainGenerator(param_generator, Generator(base_seed))
    valid_gen = Generator(base_seed + 1)
    test_gen = Generator(base_seed + 2)

    row = dict(i_cv=0)

    # TRAINING / LOADING
    train_or_load_neural_net(model, train_gen, retrain=args.retrain)

    # CHECK TRAINING
    nn_metrics = evaluate_neural_net(model, prefix='valid')
    row.update(nn_metrics)
    evaluate_regressor(model, prefix='valid')

    print_line()

    rows = []
    for i_iter, test_config in enumerate(cfg.iter_test_config()):
        rows.append(run_iter(model, row, i_iter, test_config,
                             valid_gen, test_gen))
    result_table = pd.DataFrame(rows)
    result_table.to_csv(os.path.join(model.results_directory, 'results.csv'))

    log.info('Plot params')
    # Only the calibration parameter is plotted here.
    plot_params(CALIB_PARAM_NAME, result_table, title=model.full_name,
                directory=model.results_directory)

    log.info('DONE')
def run_estimation(args, i_cv):
    """Run the estimation pipeline for cross-validation fold ``i_cv``.

    Builds the generators and the model, calls ``train_or_load_pivot`` with
    ``config.N_TRAINING_SAMPLES * N_AUGMENT``, records validation metrics,
    loads the two calibration objects, then calls ``run_estimation_iter`` once
    per configuration yielded by ``config.iter_test_config()``.

    Args:
        args: parsed options; ``args.retrain`` is read here and ``args`` is
            forwarded to ``build_model``.
        i_cv: fold index; offsets the generator seeds and is stored in every
            result row.

    Returns:
        pandas.DataFrame of the per-configuration results, also written to
        ``estimations.csv`` under ``model.results_path``.
    """
    log = logging.getLogger()
    print_line()
    log.info('Running iter n°{}'.format(i_cv))
    print_line()

    row = dict(i_cv=i_cv)

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    cfg = Config()
    base_seed = SEED + i_cv * 5
    raw_train_gen = Generator(base_seed)
    train_gen = TrainGenerator(param_generator, raw_train_gen)
    valid_gen = Generator(base_seed + 1)
    test_gen = Generator(base_seed + 2)

    # SET MODEL
    log.info('Set up classifier')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    n_train = cfg.N_TRAINING_SAMPLES*N_AUGMENT
    train_or_load_pivot(model, train_gen, n_train, retrain=args.retrain)

    # CHECK TRAINING
    log.info('Generate validation data')
    X_valid, y_valid, w_valid = valid_gen.generate(
        *cfg.CALIBRATED, n_samples=cfg.N_VALIDATION_SAMPLES)
    nn_metrics = evaluate_neural_net(model, prefix='valid')
    row.update(nn_metrics)
    clf_metrics = evaluate_classifier(model, X_valid, y_valid, w_valid,
                                      prefix='valid')
    row.update(clf_metrics)

    # MEASUREMENT
    calib_r = load_calib_r(DATA_NAME, BENCHMARK_NAME)
    calib_lam = load_calib_lam(DATA_NAME, BENCHMARK_NAME)
    evaluate_summary_computer(model, X_valid, y_valid, w_valid,
                              n_bins=N_BINS, prefix='valid_', suffix='')
    iter_results = []
    for i_iter, test_config in enumerate(cfg.iter_test_config()):
        iter_results.append(run_estimation_iter(
            model, row, i_iter, test_config, valid_gen, test_gen,
            calib_r, calib_lam, n_bins=N_BINS))
    result_table = pd.DataFrame(iter_results)
    result_table.to_csv(os.path.join(model.results_path, 'estimations.csv'))

    log.info('Plot params')
    for name in cfg.PARAM_NAMES:
        plot_params(name, result_table, title=model.full_name,
                    directory=model.results_path)

    log.info('DONE')
    return result_table
def run_iter(i_cv, i_iter, config, seed, directory):
    """Fit (r, lam, mu) on one sampled test set using the generator's own NLL.

    Args:
        i_cv: cross-validation fold index, stored in the result row.
        i_iter: index of the test configuration, stored in the result row and
            used to name the ``iter_<i_iter>`` sub-folder.
        config: configuration providing ``TRUE``, ``CALIBRATED``,
            ``CALIBRATED_ERROR`` and ``N_TESTING_SAMPLES``.
        seed: seed passed to ``Generator``.
        directory: parent folder for this iteration's outputs.

    Returns:
        dict with ``i_cv``, ``i``, ``n_test_samples`` and whatever
        ``evaluate_minuit`` returns.
    """
    log = logging.getLogger()
    log.info('-' * 45)
    log.info(f'iter : {i_iter}')

    row = {'i_cv': i_cv, 'i': i_iter}
    iter_dir = os.path.join(directory, f'iter_{i_iter}')
    os.makedirs(iter_dir, exist_ok=True)

    log.info(f"True Parameters = {config.TRUE}")
    suffix = f'-mu={config.TRUE.mu:1.2f}_r={config.TRUE.r}_lambda={config.TRUE.lam}'

    generator = Generator(seed)  # test_generator
    data, label = generator.sample_event(*config.TRUE,
                                         size=config.N_TESTING_SAMPLES)
    row['n_test_samples'] = config.N_TESTING_SAMPLES
    debug_label(label)

    def compute_nll(r, lam, mu):
        # Negative log-likelihood of the sampled data at the given parameters.
        return generator.nll(data, r, lam, mu)

    plot_nll_around_min(compute_nll, config.TRUE, iter_dir, suffix)

    log.info('Prepare minuit minimizer')
    minimizer = get_minimizer(compute_nll, config.CALIBRATED,
                              config.CALIBRATED_ERROR)
    minimizer.precision = None
    fit_summary = evaluate_minuit(minimizer, config.TRUE, iter_dir,
                                  suffix=suffix)
    row.update(fit_summary)
    return row
def run(args, i_cv):
    """Run one cross-validation fold with the training/validation steps disabled.

    Only the test generator and the model are built; ``run_iter`` is called
    once per configuration yielded by ``config.iter_test_config()`` with
    ``n_bins=10``. The disabled steps are kept below as comments.

    Args:
        args: parsed options, forwarded to ``build_model`` and ``run_iter``.
        i_cv: fold index; offsets the generator seed and is stored in every
            result row.

    Returns:
        pandas.DataFrame of the per-configuration rows, also written to
        ``results.csv`` under ``model.results_path``.
    """
    logger = logging.getLogger()
    print_line()
    logger.info('Running iter n°{}'.format(i_cv))
    print_line()

    result_row = {'i_cv': i_cv}

    # LOAD/GENERATE DATA
    logger.info('Set up data generator')
    config = Config()
    seed = SEED + i_cv * 5
    # train_generator = Generator(seed)
    # valid_generator = Generator(seed+1)
    test_generator = Generator(seed+2)

    # SET MODEL
    # logger.info('Set up classifier')
    model = build_model(args, i_cv)
    # Make sure the output folder exists before results.csv is written into it.
    os.makedirs(model.results_path, exist_ok=True)
    # flush(logger)

    # TRAINING / LOADING
    # train_or_load_classifier(model, train_generator, config.CALIBRATED, config.N_TRAINING_SAMPLES, retrain=args.retrain)

    # CHECK TRAINING
    # NOTE(review): this message is still logged although the validation steps
    # below are disabled.
    logger.info('Generate validation data')
    # X_valid, y_valid, w_valid = valid_generator.generate(*config.CALIBRATED, n_samples=config.N_VALIDATION_SAMPLES)
    # result_row.update(evaluate_classifier(model, X_valid, y_valid, w_valid, prefix='valid'))

    # MEASUREMENT
    N_BINS = 10
    # evaluate_summary_computer(model, X_valid, y_valid, w_valid, n_bins=N_BINS, prefix='valid_', suffix='')
    result_table = [
        run_iter(model, result_row, i, i_cv, args, test_config,
                 test_generator, n_bins=N_BINS)
        for i, test_config in enumerate(config.iter_test_config())
    ]
    result_table = pd.DataFrame(result_table)
    result_table.to_csv(os.path.join(model.results_path, 'results.csv'))

    logger.info('Plot params')
    for name in config.PARAM_NAMES:
        # NOTE(review): plots go to model.path while the CSV goes to
        # model.results_path -- confirm this is intended.
        plot_params(name, result_table, title=model.full_name,
                    directory=model.path)

    logger.info('DONE')
    return result_table
def explore_links():
    """Plot weighted feature means against mu for three nuisance settings.

    For each of ``config.MIN``, ``config.TRUE`` and ``config.MAX`` and each of
    18 mu values spanning ``config.RANGE.mu``, a sample is generated and the
    weighted means of the features, the label, the classifier score, the
    ``score > 0.9`` indicator and the log-odds of the score are computed.
    One scatter plot per column is saved as ``link_<name>.png`` in
    ``DIRECTORY``.

    NOTE(review): the source was whitespace-collapsed; the plotting loop is
    assumed to run after all settings have been processed.
    """
    generator = Generator(SEED)
    config = Config()
    n_samples = 30_000
    extra_columns = ['Label', 'classifier', 'bin', 'log_p']
    feature_names = list(generator.feature_names) + extra_columns
    mu_range = np.linspace(min(config.RANGE.mu), max(config.RANGE.mu), num=18)
    all_params = {"min": config.MIN, "true": config.TRUE, "max": config.MAX}
    # all_params = {"true":config.TRUE}
    clf = load_some_clf()

    all_average_df = {}
    for params_name, orig_params in all_params.items():
        print(f"computing link between X and mu using {params_name}...")
        rows = []
        target_list = []
        for mu in mu_range:
            params = Parameter(*orig_params.nuisance_parameters, mu)
            data, label, weight = generator.generate(*params, n_samples=n_samples)
            sum_weight = np.sum(weight)
            feature_means = np.sum(data*weight.reshape(-1, 1), axis=0) / sum_weight

            decision = clf.predict_proba(data)[:, 1]
            log_p = np.log(decision / (1 - decision))
            # Same column order as extra_columns: Label, classifier, bin, log_p.
            per_event = [label, decision, (decision > 0.9), log_p]
            extra_means = [np.sum(values*weight, axis=0) / sum_weight
                           for values in per_event]

            rows.append(np.hstack([feature_means] + extra_means))
            target_list.append(mu)
        all_average_df[params_name] = pd.DataFrame(np.array(rows),
                                                   columns=feature_names)

    for name in feature_names:
        for params_name, average_df in all_average_df.items():
            plt.scatter(average_df[name], target_list, label=params_name)
        plt.title(f'Link between weighted mean({name}) and mu')
        plt.ylabel('mu')
        plt.xlabel(f'weighted mean({name})')
        plt.legend()
        plt.savefig(os.path.join(DIRECTORY, f'link_{name}.png'))
        plt.clf()
def run_iter(i_cv, i_iter, config, seed, directory):
    """Grid-based Bayesian inference of (r, lam, mu) on one sampled data set.

    The summed log-density of the data is evaluated on a 21 x 22 x 23 grid
    centred on ``config.TRUE`` (mu clipped to [0, 1]) and combined with
    uniform priors over the same ranges. The result is passed through
    ``softmax`` and marginalised to obtain posterior means, variances and
    standard deviations of each parameter, plus the values returned by
    ``stat_uncertainty`` / ``syst_uncertainty`` for mu.

    Args:
        i_cv: cross-validation fold index, stored in the results.
        i_iter: index of the test configuration, stored in the results and
            used to name the ``iter_<i_iter>`` sub-folder.
        config: configuration providing ``TRUE`` (with ``r``, ``lam``, ``mu``).
        seed: seed passed to ``Generator``.
        directory: parent folder for this iteration's plots.

    Returns:
        dict of estimator values keyed by parameter name, with ``_TRUTH`` /
        ``_ERROR`` suffixed keys and ``*_std``, ``mu_stat``, ``mu_syst``.
    """
    # Init
    logger = logging.getLogger()
    print_line()
    logger.info('running iter n°{}'.format(i_iter))
    directory = os.path.join(directory, f'iter_{i_iter}')
    os.makedirs(directory, exist_ok=True)
    results = dict(i_cv=i_cv, i=i_iter)

    # Config
    # DATA_N_SAMPLES = config.N_TESTING_SAMPLES
    DATA_N_SAMPLES = 9000
    R_MIN = config.TRUE.r - 0.3
    R_MAX = config.TRUE.r + 0.3
    LAM_MIN = config.TRUE.lam - 1
    LAM_MAX = config.TRUE.lam + 1
    MU_MIN = max(0, config.TRUE.mu - 0.1)
    MU_MAX = min(1.0, config.TRUE.mu + 0.1)

    # Distinct sizes per axis so the shape asserts below can catch a sum
    # taken along the wrong axis.
    R_N_SAMPLES = 21
    LAM_N_SAMPLES = 22
    MU_N_SAMPLES = 23

    # Prior
    prior_r = stats.uniform(loc=R_MIN, scale=R_MAX - R_MIN)
    prior_lam = stats.uniform(loc=LAM_MIN, scale=LAM_MAX - LAM_MIN)
    prior_mu = stats.uniform(loc=MU_MIN, scale=MU_MAX - MU_MIN)

    # Param grid
    r_grid = np.linspace(R_MIN, R_MAX, R_N_SAMPLES)
    lam_grid = np.linspace(LAM_MIN, LAM_MAX, LAM_N_SAMPLES)
    mu_grid = np.linspace(MU_MIN, MU_MAX, MU_N_SAMPLES)

    # Data Generator
    generator = Generator(seed)
    data, label = generator.sample_event(*config.TRUE, size=DATA_N_SAMPLES)
    debug_label(label)

    # Compute likelihood
    shape = (R_N_SAMPLES, LAM_N_SAMPLES, MU_N_SAMPLES)
    n_elements = np.prod(shape)
    logger.info(f"3D grid has {n_elements} elements")
    log_likelihood = np.zeros(shape)
    log_prior_proba = np.zeros(shape)
    for i, j, k in get_iter_prod(R_N_SAMPLES, LAM_N_SAMPLES, MU_N_SAMPLES,
                                 progress_bar=True):
        log_likelihood[i, j, k] = generator.log_proba_density(
            data, r_grid[i], lam_grid[j], mu_grid[k]).sum()
        log_prior_proba[i, j, k] = (prior_r.logpdf(r_grid[i])
                                    + prior_lam.logpdf(lam_grid[j])
                                    + prior_mu.logpdf(mu_grid[k]))
    debug_log_proba(log_likelihood, log_prior_proba)

    # Normalization
    # NOTE(review): softmax is assumed to normalise over the whole 3D grid;
    # the debug_proba_sum_one checks below rely on that -- confirm.
    posterior_r_lam_mu = softmax(log_likelihood + log_prior_proba)
    debug_posterior(posterior_r_lam_mu)

    # Marginal posterior param proba (axes are ordered r, lam, mu)
    marginal_r = posterior_r_lam_mu.sum(axis=2).sum(axis=1)
    marginal_lam = posterior_r_lam_mu.sum(axis=2).sum(axis=0)
    marginal_mu = posterior_r_lam_mu.sum(axis=1).sum(axis=0)
    marginal_r_lam = posterior_r_lam_mu.sum(axis=2)
    assert marginal_r.shape == r_grid.shape, "sum along the wrong axis for marginal r"
    assert marginal_lam.shape == lam_grid.shape, "sum along the wrong axis for marginal lam"
    assert marginal_mu.shape == mu_grid.shape, "sum along the wrong axis for marginal mu"
    assert marginal_r_lam.shape == (
        R_N_SAMPLES, LAM_N_SAMPLES), "sum along the wrong axis for marginal (r, lam)"
    debug_marginal(marginal_r, "r")
    debug_marginal(marginal_lam, "lam")
    debug_marginal(marginal_mu, "mu")
    debug_marginal(marginal_r_lam, "r_lam")

    # Conditional posterior
    # Cells where the joint posterior is exactly 0 are left at 0 instead of
    # being divided (avoids 0/0).
    posterior_mu = np.divide(
        posterior_r_lam_mu,
        marginal_r_lam.reshape(R_N_SAMPLES, LAM_N_SAMPLES, 1),
        out=np.zeros_like(posterior_r_lam_mu),
        where=(posterior_r_lam_mu != 0))

    # Minor check
    logger.debug("probability densities should sum to one")
    debug_proba_sum_one(
        posterior_mu * marginal_r_lam.reshape(R_N_SAMPLES, LAM_N_SAMPLES, 1))
    debug_proba_sum_one(posterior_r_lam_mu)
    debug_proba_sum_one(marginal_r)
    debug_proba_sum_one(marginal_mu)

    # Compute estimator values
    sig_ratio = np.sum(label == 1) / DATA_N_SAMPLES
    expect_mu = expectancy(mu_grid, marginal_mu)
    var_mu = variance(mu_grid, marginal_mu)
    std_mu = np.sqrt(var_mu)
    expect_r = expectancy(r_grid, marginal_r)
    var_r = variance(r_grid, marginal_r)
    std_r = np.sqrt(var_r)
    expect_lam = expectancy(lam_grid, marginal_lam)
    var_lam = variance(lam_grid, marginal_lam)
    std_lam = np.sqrt(var_lam)

    stat_err = stat_uncertainty(mu_grid, posterior_mu, marginal_r_lam,
                                reshape=(1, 1, -1))
    syst_err = syst_uncertainty(mu_grid, posterior_mu, marginal_r_lam,
                                reshape=(1, 1, -1))

    i_max, j_max, k_max = np.unravel_index(np.argmax(log_likelihood),
                                           log_likelihood.shape)
    assert np.max(log_likelihood) == log_likelihood[
        i_max, j_max, k_max], "max and argmax should point to the same value"

    # Save estimator values
    results['mu'] = expect_mu
    results['mu' + _TRUTH] = config.TRUE.mu
    results['mu_std'] = std_mu
    results['mu' + _ERROR] = var_mu
    results['mu_stat'] = stat_err
    results['mu_syst'] = syst_err
    results['r'] = expect_r
    results['r' + _TRUTH] = config.TRUE.r
    results['r_std'] = std_r
    results['r' + _ERROR] = var_r
    results['lam'] = expect_lam
    results['lam' + _TRUTH] = config.TRUE.lam
    results['lam_std'] = std_lam
    results['lam' + _ERROR] = var_lam

    # Log estimator values
    logger.info(f"True mu value    = {config.TRUE.mu}")
    logger.info(f"Sig ratio        = {sig_ratio}")
    logger.info(f"E[mu|x]          = {expect_mu}")
    logger.info(f"Var[mu|x]        = {var_mu}")
    logger.info(f"sqrt(Var[mu|x])  = {std_mu}")
    logger.info(f"stat_uncertainty = {stat_err}")
    logger.info(f"syst_uncertainty = {syst_err}")
    logger.info(f"Var - stat       = {var_mu - stat_err}")
    logger.info(f"argmax_mu p(mu|x) = {mu_grid[np.argmax(marginal_mu)]}")
    # Each grid is indexed with its own axis index: (i, j, k) <-> (r, lam, mu).
    # The previous message indexed mu_grid with the lam index j_max.
    logger.info(
        f"argmax_r_lam_mu logp(x|r, lam, mu) = "
        f"{r_grid[i_max]} {lam_grid[j_max]} {mu_grid[k_max]}")

    # Minor checks
    debug_min_max(marginal_mu, 'p(mu | x)')
    debug_min_max(marginal_lam, 'p(lam | x)')
    debug_min_max(marginal_r, 'p(r | x)')
    debug_min_max(posterior_mu, 'p(mu | x, r)')
    debug_min_max(posterior_r_lam_mu, 'p(mu, r | x)')

    # Plots
    plot_infer(mu_grid, marginal_mu, expected_value=expect_mu,
               true_value=config.TRUE.mu, std=std_mu, name='mu',
               directory=directory, fname='marginal_mu.png')
    plot_infer(r_grid, marginal_r, expected_value=expect_r,
               true_value=config.TRUE.r, std=std_r, name='r',
               directory=directory, fname='marginal_r.png')
    plot_infer(lam_grid, marginal_lam, expected_value=expect_lam,
               true_value=config.TRUE.lam, std=std_lam, name='lam',
               directory=directory, fname='marginal_lam.png')

    # plot_distrib(data, generator, config.TRUE, expect_r, expect_mu,
    #              title="data distribution", directory=directory, fname='data_distrib.png')
    return results
def features():
    """Plot a pair grid of sampled features and 1D NLL scans along r, lam and mu.

    Samples 10 000 events at ``config.TRUE``, saves a seaborn pair grid as
    ``pairgrid.png``, prints the NLL at the true parameters, then for each
    parameter scans ``generator.nll`` over 100 values with the other two
    fixed at their true values. The scans are saved as ``NLL_r.png``,
    ``NLL_lambda.png`` and ``NLL_mu.png`` in ``DIRECTORY``.
    """
    config = Config()
    N_SAMPLES = 10_000
    R_MIN = -0.3
    R_MAX = 0.3
    LAM_MIN = 2
    LAM_MAX = 4
    MU_MIN = 0.0
    MU_MAX = 1.0

    generator = Generator(SEED)
    X, label = generator.sample_event(config.TRUE.r, config.TRUE.lam,
                                      config.TRUE.mu, size=N_SAMPLES)
    n_sig = np.sum(label == 1)
    n_bkg = np.sum(label == 0)
    print(f"nb of signal      = {n_sig}")
    print(f"nb of backgrounds = {n_bkg}")

    df = pd.DataFrame(X, columns=["x1", "x2", "x3"])
    df['label'] = label
    g = sns.PairGrid(df, vars=["x1", "x2", "x3"], hue='label')
    g = g.map_upper(sns.scatterplot)
    g = g.map_diag(sns.kdeplot)
    g = g.map_lower(sns.kdeplot, n_levels=6)
    g = g.add_legend()
    # g = g.map_offdiag(sns.kdeplot, n_levels=6)
    g.savefig(os.path.join(DIRECTORY, 'pairgrid.png'))
    plt.clf()

    nll = generator.nll(X, config.TRUE.r, config.TRUE.lam, config.TRUE.mu)
    print(f"NLL = {nll}")

    def plot_nll_scan(grid, nll_at, true_value, short_name, axis_label, fname):
        # Scan the NLL along one parameter and mark the true and best values.
        nll_values = [nll_at(value) for value in grid]
        best_value = grid[np.argmin(nll_values)]
        plt.plot(grid, nll_values, label=f"nll({short_name})")
        plt.axvline(true_value, c="orange", label=f"true {short_name}")
        plt.axvline(best_value, c="red", label="min nll")
        plt.xlabel(axis_label)
        plt.ylabel("NLL")
        plt.title(f"NLL according to {axis_label} param")
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(DIRECTORY, fname))
        plt.clf()

    # Raw strings keep the LaTeX backslashes without relying on Python
    # passing unknown escape sequences ("\l", "\m") through unchanged.
    plot_nll_scan(
        np.linspace(R_MIN, R_MAX, 100),
        lambda r: generator.nll(X, r, config.TRUE.lam, config.TRUE.mu),
        config.TRUE.r, "r", "r", 'NLL_r.png')
    plot_nll_scan(
        np.linspace(LAM_MIN, LAM_MAX, 100),
        lambda lam: generator.nll(X, config.TRUE.r, lam, config.TRUE.mu),
        config.TRUE.lam, "lam", r"$\lambda$", 'NLL_lambda.png')
    plot_nll_scan(
        np.linspace(MU_MIN, MU_MAX, 100),
        lambda mu: generator.nll(X, config.TRUE.r, config.TRUE.lam, mu),
        config.TRUE.mu, "mu", r"$\mu$", 'NLL_mu.png')