Пример #1
0
    def __call__(self, rescale, mu):
        r"""Return the total negative log-likelihood at ``(rescale, mu)``.

        The data term is the binned Poisson NLL
        $\sum_{i=0}^{n_{bin}} rate - n_i \log(rate)$ with $rate = \mu s + b$,
        to which a Gaussian constraint on ``rescale`` is added.

        A new classifier is built and fitted on every call, using training
        data generated at the given ``rescale`` and the calibrated ``mu``.
        Expected rates are the classifier summaries of validation data
        generated at ``(rescale, mu)``; observed counts are the summaries of
        ``self.X_test`` / ``self.w_test``.

        Parameters
        ----------
        rescale
            Value of the rescale parameter used to generate the training and
            validation samples and to evaluate the Gaussian constraint.
        mu
            Value of ``mu`` used to generate the validation sample.

        Returns
        -------
        The sum of the Poisson data NLL and the rescale constraint term.
        """
        config = self.config
        # Same seeding scheme for every call with a given i_cv.
        seed = SEED + self.i_cv * 5
        train_generator = Generator(seed)
        valid_generator = Generator(seed + 1)

        # Train a fresh classifier at the requested rescale value.
        classifier = build_model(self.args, self.i_cv)
        X_train, y_train, w_train = train_generator.generate(
            rescale, config.CALIBRATED.mu, n_samples=config.N_TRAINING_SAMPLES)
        classifier.fit(X_train, y_train, w_train)

        # Labels of the validation sample are not needed for the summaries.
        X_valid, _, w_valid = valid_generator.generate(
            rescale, mu, n_samples=config.N_VALIDATION_SAMPLES)
        valid_summaries = classifier.compute_summaries(X_valid,
                                                       w_valid,
                                                       n_bins=self.n_bins)
        test_summaries = classifier.compute_summaries(self.X_test,
                                                      self.w_test,
                                                      n_bins=self.n_bins)

        # Compute NLL
        EPSILON = 1e-5  # avoid log(0)
        rate = valid_summaries + EPSILON
        data_nll = np.sum(poisson_nll(test_summaries, rate))
        rescale_constraint = gauss_nll(rescale, config.CALIBRATED.rescale,
                                       config.CALIBRATED_ERROR.rescale)
        return data_nll + rescale_constraint
Пример #2
0
def run(args, i_cv):
    """Run one cross-validation iteration of the histogram likelihood fit.

    A histogram summary computer is fitted on training data generated at the
    calibrated parameters, then ``run_iter`` is called once per test
    configuration. The collected rows are saved to ``results.csv`` in the
    ``cv_<i_cv>`` sub-directory of ``DIRECTORY`` and one plot per parameter
    name is produced.

    Parameters
    ----------
    args
        Parsed command-line arguments (not used inside this function).
    i_cv
        Index of the cross-validation iteration; it selects the seeds and the
        output directory.

    Returns
    -------
    pandas.DataFrame built from the rows returned by ``run_iter``.
    """
    log = logging.getLogger()
    print_line()
    log.info(f'Running iter n°{i_cv}')
    print_line()

    out_dir = os.path.join(DIRECTORY, f'cv_{i_cv}')
    os.makedirs(out_dir, exist_ok=True)

    config = Config()
    # Three consecutive seeds: train, validation, test.
    base_seed = SEED + i_cv * 5
    train_generator = Generator(base_seed)
    valid_generator = Generator(base_seed + 1)
    test_generator = Generator(base_seed + 2)

    n_bins = 10
    X_train, y_train, w_train = train_generator.generate(
        *config.CALIBRATED, n_samples=config.N_TRAINING_SAMPLES)
    summary_computer = HistogramSummaryComputer(n_bins=n_bins)
    compute_summaries = summary_computer.fit(X_train)

    rows = []
    for i, test_config in enumerate(config.iter_test_config()):
        row = run_iter(compute_summaries, i_cv, i, test_config,
                       valid_generator, test_generator, out_dir)
        rows.append(row)

    result_table = pd.DataFrame(rows)
    result_table.to_csv(os.path.join(out_dir, 'results.csv'))

    log.info('Plot params')
    for name in config.PARAM_NAMES:
        plot_params(name, result_table,
                    title='Likelihood fit', directory=out_dir)

    return result_table
Пример #3
0
def run(args, i_cv):
    """Scan decision thresholds and record selected signal/background counts.

    A neural net is trained (or loaded), then for every test configuration
    and every one of 500 thresholds in [0, 1] the function counts the
    selected background/signal events on a validation draw (``beta``,
    ``gamma``) and on a test draw (``n``, ``b``, ``s``), together with the
    significance proxies ``s / sqrt(n)`` and ``s / sqrt(b)``.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.cuda`` and ``args.retrain`` are
        read here, the rest is forwarded to ``build_model``.
    i_cv
        Index of the cross-validation iteration; it selects the seeds.

    Returns
    -------
    pandas.DataFrame with one row per (test configuration, threshold).
    """
    logger = logging.getLogger()
    print_line()
    logger.info('Running iter n°{}'.format(i_cv))
    print_line()

    # LOAD/GENERATE DATA
    logger.info('Set up data generator')
    config = Config()
    seed = SEED + i_cv * 5
    train_generator = GeneratorTorch(seed, cuda=args.cuda)
    valid_generator = Generator(seed+1)
    test_generator  = Generator(seed+2)

    # SET MODEL
    logger.info('Set up classifier')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(logger)

    # TRAINING / LOADING
    train_or_load_neural_net(model, train_generator, retrain=args.retrain)

    # MEASUREMENT
    results = []
    for test_config in config.iter_test_config():
        logger.info(f"Running test set : {test_config.TRUE}, {test_config.N_TESTING_SAMPLES} samples")
        for threshold in np.linspace(0, 1, 500):
            # A fresh dict per row, so no copy is needed when appending.
            result_row = {'i_cv': i_cv}
            result_row['threshold'] = threshold
            result_row.update(test_config.TRUE.to_dict(prefix='true_'))
            result_row['n_test_samples'] = test_config.N_TESTING_SAMPLES

            # NOTE(review): both draws below use config.TRUE and
            # config.N_VALIDATION_SAMPLES rather than test_config.TRUE /
            # test_config.N_TESTING_SAMPLES, although the row is labelled
            # with the test_config values - confirm this is intended.
            X, y, w = valid_generator.generate(*config.TRUE, n_samples=config.N_VALIDATION_SAMPLES)
            proba = model.predict_proba(X)
            decision = proba[:, 1]
            selected = decision > threshold
            beta = np.sum(y[selected] == 0)
            gamma = np.sum(y[selected] == 1)
            result_row['beta'] = beta
            result_row['gamma'] = gamma

            X, y, w = test_generator.generate(*config.TRUE, n_samples=config.N_VALIDATION_SAMPLES)
            proba = model.predict_proba(X)
            decision = proba[:, 1]
            selected = decision > threshold
            n_selected = np.sum(selected)
            n_selected_bkg = np.sum(y[selected] == 0)
            n_selected_sig = np.sum(y[selected] == 1)
            result_row['n'] = n_selected
            result_row['b'] = n_selected_bkg
            result_row['s'] = n_selected_sig
            # The denominators can be zero when nothing passes the threshold;
            # no guard is added here, the division result is stored as-is.
            result_row['s_sqrt_n'] = n_selected_sig / np.sqrt(n_selected)
            # Fixed: s / sqrt(b) must divide by the selected background
            # count, not by the total number of selected events.
            result_row['s_sqrt_b'] = n_selected_sig / np.sqrt(n_selected_bkg)
            results.append(result_row)
    results = pd.DataFrame(results)
    print(results)
    return results
Пример #4
0
def run(args, i_cv):
    """Evaluate a classifier for each training-set size in ``N_TRAIN_RANGE``.

    For every size a new model is built, fitted on freshly generated training
    data at the calibrated parameters, and evaluated on a freshly generated
    validation sample. The validation AUC and accuracy are stored per size.

    Parameters
    ----------
    args
        Parsed command-line arguments forwarded to ``build_model``.
    i_cv
        Index of the cross-validation iteration; it selects the seeds.

    Returns
    -------
    pandas.DataFrame with one row per training-set size.
    """
    log = logging.getLogger()
    print_line()
    log.info(f'Running iter n°{i_cv}')
    print_line()

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    config = Config()
    base_seed = SEED + i_cv * 5
    train_generator = Generator(base_seed)
    valid_generator = Generator(base_seed + 1)

    n_bins = 10
    rows = []
    for n_train_samples in N_TRAIN_RANGE:
        # SET MODEL
        log.info('Set up classifier')
        model = build_model(args, i_cv)
        os.makedirs(model.results_path, exist_ok=True)
        flush(log)

        # TRAINING
        X_train, y_train, w_train = train_generator.generate(
            *config.CALIBRATED, n_samples=n_train_samples)
        model.fit(X_train, y_train, w_train)

        # CHECK TRAINING
        log.info('Generate validation data')
        X_valid, y_valid, w_valid = valid_generator.generate(
            *config.CALIBRATED, n_samples=config.N_VALIDATION_SAMPLES)

        # Metric keys returned by the helper carry the size as a suffix.
        metrics = evaluate_classifier(
            model, X_valid, y_valid, w_valid,
            prefix='valid', suffix=f'-{n_train_samples}')
        auc = metrics[f'valid_auc-{n_train_samples}']
        accuracy = metrics[f'valid_accuracy-{n_train_samples}']

        evaluate_summary_computer(
            model, X_valid, y_valid, w_valid,
            n_bins=n_bins, prefix='valid_', suffix=f'{n_train_samples}')

        rows.append({
            'i_cv': i_cv,
            'n_train_samples': n_train_samples,
            'valid_auc': auc,
            'valid_accuracy': accuracy,
        })

    return pd.DataFrame(rows)
Пример #5
0
def run(args, i_cv):
    """Run one cross-validation iteration of the regressor benchmark.

    The regressor is trained (or loaded), evaluated, and then applied to
    every test configuration through ``run_iter``, which returns a pair
    (estimation row, conditional-estimation frame). Both collections are
    written as CSV files into ``model.results_path``.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.retrain`` is read here, the
        rest is forwarded to ``build_model``.
    i_cv
        Index of the cross-validation iteration; it selects the seeds.

    Returns
    -------
    tuple ``(result_table, conditional_estimate)`` of pandas DataFrames.
    """
    log = logging.getLogger()
    print_line()
    log.info(f'Running iter n°{i_cv}')
    print_line()

    result_row = {'i_cv': i_cv}

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    config = Config()
    base_seed = SEED + i_cv * 5
    raw_train_generator = Generator(base_seed)
    valid_generator = Generator(base_seed + 1)
    test_generator = Generator(base_seed + 2)
    train_generator = TrainGenerator(param_generator, raw_train_generator)

    # SET MODEL
    log.info('Set up regressor')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    train_or_load_neural_net(model, train_generator, retrain=args.retrain)

    # CHECK TRAINING
    log.info('Generate validation data')
    # The sample itself is not used below; the call is kept because it is
    # made on the same valid_generator object that run_iter receives later.
    _X_valid, _y_valid, _w_valid = valid_generator.generate(
        *config.CALIBRATED, n_samples=config.N_VALIDATION_SAMPLES)

    result_row.update(evaluate_neural_net(model, prefix='valid'))
    evaluate_regressor(model, prefix='valid')

    # MEASUREMENT
    result_row['nfcn'] = NCALL
    iter_results = []
    for i, test_config in enumerate(config.iter_test_config()):
        iter_results.append(
            run_iter(model, result_row, i, test_config, valid_generator,
                     test_generator))

    estimation_rows = [estimation for estimation, _ in iter_results]
    result_table = pd.DataFrame(estimation_rows)
    result_table.to_csv(os.path.join(model.results_path, 'estimations.csv'))

    log.info('Plot params')
    for name in config.PARAM_NAMES:
        plot_params(name, result_table,
                    title=model.full_name, directory=model.results_path)

    conditional_frames = [conditional for _, conditional in iter_results]
    conditional_estimate = pd.concat(conditional_frames)
    conditional_estimate['i_cv'] = i_cv
    conditional_estimate.to_csv(
        os.path.join(model.results_path, "conditional_estimations.csv"))

    log.info('DONE')
    return result_table, conditional_estimate
def main():
    """Train (or load) the regressor once and evaluate every test config.

    Parses the command line, builds the model, saves the configuration
    table, trains or loads the network, runs ``run_iter`` on each test
    configuration and writes the results and plots into
    ``model.results_directory``.
    """
    # BASIC SETUP
    log = set_logger()
    args = REG_parse_args(
        main_description="Training launcher for Regressor on S3D2 benchmark")
    log.info(args)
    flush(log)

    # Setup model
    log.info("Setup model")
    model = build_model(args, 0)
    os.makedirs(model.results_directory, exist_ok=True)

    # Setup data
    log.info("Setup data")
    config = Config()
    config_table = evaluate_config(config)
    config_table_path = os.path.join(model.results_directory,
                                     'config_table.csv')
    config_table.to_csv(config_table_path)

    # Fixed seed offset for this launcher.
    base_seed = SEED + 99999
    train_generator = TrainGenerator(param_generator, Generator(base_seed))
    valid_generator = Generator(base_seed + 1)
    test_generator = Generator(base_seed + 2)

    i_cv = 0
    result_row = {'i_cv': i_cv}

    # TRAINING / LOADING
    train_or_load_neural_net(model, train_generator, retrain=args.retrain)

    # CHECK TRAINING
    result_row.update(evaluate_neural_net(model, prefix='valid'))
    evaluate_regressor(model, prefix='valid')
    print_line()

    rows = []
    for i, test_config in enumerate(config.iter_test_config()):
        rows.append(
            run_iter(model, result_row, i, test_config, valid_generator,
                     test_generator))
    result_table = pd.DataFrame(rows)
    result_table.to_csv(os.path.join(model.results_directory, 'results.csv'))

    log.info('Plot params')
    for name in [CALIB_PARAM_NAME]:
        plot_params(name, result_table,
                    title=model.full_name,
                    directory=model.results_directory)

    log.info('DONE')
Пример #7
0
def run(args, i_cv):
    """Compute the Fisher information as a function of the number of bins.

    A pivot model is trained (or loaded). After checking that two identical
    Fisher computations agree, the function evaluates, for every test
    configuration and every ``n_bins`` in 1..29, the per-bin ``gamma`` and
    ``beta`` arrays from ``compute_bins`` and the resulting Fisher value.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.retrain`` is read here, the
        rest is forwarded to ``build_model``.
    i_cv
        Index of the cross-validation iteration; it selects the seeds.

    Returns
    -------
    pandas.DataFrame with one row per (test configuration, n_bins).
    """
    log = logging.getLogger()
    print_line()
    log.info(f'Running iter n°{i_cv}')
    print_line()

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    config = Config()
    base_seed = SEED + i_cv * 5
    train_generator = TrainGenerator(param_generator, Generator(base_seed))
    valid_generator = Generator(base_seed + 1)
    test_generator = Generator(base_seed + 2)  # created but not used below

    # SET MODEL
    log.info('Set up classifier')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    n_train = config.N_TRAINING_SAMPLES * N_AUGMENT
    train_or_load_pivot(model, train_generator, n_train, retrain=args.retrain)

    # Sanity check: the same computation done twice must give the same value.
    fisher_values = []
    for _ in range(2):
        bins = compute_bins(model, valid_generator, config, n_bins=3)
        fisher_values.append(compute_fisher(*bins, config.TRUE.mu))
    some_fisher, some_fisher_bis = fisher_values

    assert some_fisher == some_fisher_bis, f"Fisher info should be deterministic but found : {some_fisher} =/= {some_fisher_bis}"

    # MEASUREMENT
    rows = []
    for test_config in config.iter_test_config():
        log.info(f"Running test set : {test_config.TRUE}, {test_config.N_TESTING_SAMPLES} samples")
        for n_bins in range(1, 30):
            gamma_array, beta_array = compute_bins(
                model, valid_generator, test_config, n_bins=n_bins)
            fisher = compute_fisher(gamma_array, beta_array,
                                    test_config.TRUE.mu)

            # Insertion order below fixes the column order of the table.
            row = {'i_cv': i_cv}
            for i, gamma in enumerate(gamma_array, 1):
                row[f'gamma_{i}'] = gamma
            for i, beta in enumerate(beta_array, 1):
                row[f'beta_{i}'] = beta
            row.update(test_config.TRUE.to_dict(prefix='true_'))
            row['n_test_samples'] = test_config.N_TESTING_SAMPLES
            row['fisher'] = fisher
            row['n_bins'] = n_bins
            rows.append(row)

    results = pd.DataFrame(rows)
    print(results)
    return results
Пример #8
0
def run_estimation(args, i_cv):
    """Run one cross-validation iteration of the parameter estimation.

    The classifier is trained (or loaded) at the calibrated parameters and
    evaluated on a validation sample, then ``run_estimation_iter`` is called
    for each test configuration. Results go to ``estimations.csv`` in
    ``model.results_path`` and one plot per parameter name is produced.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.retrain`` is read here, the
        rest is forwarded to ``build_model``.
    i_cv
        Index of the cross-validation iteration; it selects the seeds.

    Returns
    -------
    pandas.DataFrame built from the per-configuration results.
    """
    log = logging.getLogger()
    print_line()
    log.info(f'Running iter n°{i_cv}')
    print_line()

    result_row = {'i_cv': i_cv}

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    config = Config()
    base_seed = SEED + i_cv * 5
    train_generator = Generator(base_seed)
    valid_generator = Generator(base_seed + 1)
    test_generator = Generator(base_seed + 2)

    # SET MODEL
    log.info('Set up classifier')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    train_or_load_classifier(
        model, train_generator, config.CALIBRATED,
        config.N_TRAINING_SAMPLES, retrain=args.retrain)

    # CHECK TRAINING
    log.info('Generate validation data')
    X_valid, y_valid, w_valid = valid_generator.generate(
        *config.CALIBRATED, n_samples=config.N_VALIDATION_SAMPLES)

    valid_metrics = evaluate_classifier(
        model, X_valid, y_valid, w_valid, prefix='valid')
    result_row.update(valid_metrics)

    # MEASUREMENT
    calib_rescale = load_calib_rescale(DATA_NAME, BENCHMARK_NAME)
    evaluate_summary_computer(
        model, X_valid, y_valid, w_valid,
        n_bins=N_BINS, prefix='valid_', suffix='')

    iter_results = []
    for i, test_config in enumerate(config.iter_test_config()):
        iter_results.append(
            run_estimation_iter(model, result_row, i, test_config,
                                valid_generator, test_generator,
                                calib_rescale, n_bins=N_BINS))

    result_table = pd.DataFrame(iter_results)
    result_table.to_csv(os.path.join(model.results_path, 'estimations.csv'))

    log.info('Plot params')
    for name in config.PARAM_NAMES:
        plot_params(name, result_table,
                    title=model.full_name, directory=model.results_path)

    log.info('DONE')
    return result_table
Пример #9
0
def run_conditional_estimation(args, i_cv):
    """Run one cross-validation iteration of the conditional estimation.

    The classifier is trained (or loaded) at the calibrated parameters and
    evaluated on a validation sample, then
    ``run_conditional_estimation_iter`` is called for each test
    configuration. The returned frames are concatenated, tagged with
    ``i_cv`` and saved to ``conditional_estimations.csv`` in
    ``model.results_path``.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.retrain`` is read here, the
        rest is forwarded to ``build_model``.
    i_cv
        Index of the cross-validation iteration; it selects the seeds.

    Returns
    -------
    pandas.DataFrame holding the concatenated conditional estimations.
    """
    log = logging.getLogger()
    print_line()
    log.info(f'Running iter n°{i_cv}')
    print_line()

    result_row = {'i_cv': i_cv}

    # LOAD/GENERATE DATA
    log.info('Set up data generator')
    config = Config()
    base_seed = SEED + i_cv * 5
    train_generator = Generator(base_seed)
    valid_generator = Generator(base_seed + 1)
    test_generator = Generator(base_seed + 2)

    # SET MODEL
    log.info('Set up classifier')
    model = build_model(args, i_cv)
    os.makedirs(model.results_path, exist_ok=True)
    flush(log)

    # TRAINING / LOADING
    train_or_load_classifier(
        model, train_generator, config.CALIBRATED,
        config.N_TRAINING_SAMPLES, retrain=args.retrain)

    # CHECK TRAINING
    log.info('Generate validation data')
    X_valid, y_valid, w_valid = valid_generator.generate(
        *config.CALIBRATED, n_samples=config.N_VALIDATION_SAMPLES)

    valid_metrics = evaluate_classifier(
        model, X_valid, y_valid, w_valid, prefix='valid')
    result_row.update(valid_metrics)

    # MEASUREMENT
    evaluate_summary_computer(
        model, X_valid, y_valid, w_valid,
        n_bins=N_BINS, prefix='valid_', suffix='')

    frames = []
    for i, test_config in enumerate(config.iter_test_config()):
        frames.append(
            run_conditional_estimation_iter(model, result_row, i,
                                            test_config, valid_generator,
                                            test_generator, n_bins=N_BINS))

    conditional_estimate = pd.concat(frames)
    conditional_estimate['i_cv'] = i_cv
    conditional_estimate.to_csv(
        os.path.join(model.results_path, "conditional_estimations.csv"))

    log.info('DONE')
    return conditional_estimate
Пример #10
0
def explore_distribs():
    """Plot the data distribution and the priors on ``rescale`` and ``mu``.

    The ``rescale`` prior is a normal distribution centred on the calibrated
    value with the calibrated error as scale; the ``mu`` prior is uniform on
    [0, 1].
    """
    config = Config()
    generator = Generator()

    # The sampled events are not used below; the draw is kept because it is
    # made on the same generator object that is handed to the plot helper.
    _data, _label = generator.sample_event(
        *config.TRUE, size=config.N_TESTING_SAMPLES)

    calibrated = config.CALIBRATED.rescale
    calibrated_error = config.CALIBRATED_ERROR.rescale
    priors = {
        "rescale": stats.norm(loc=calibrated, scale=calibrated_error),
        "mu": stats.uniform(loc=0, scale=1),
    }

    plot_data_distrib(generator, config)
    for name, prior in priors.items():
        plot_prior(prior, name)
Пример #11
0
def run_iter(i_cv, i_iter, config, seed, directory):
    """Fit ``(rescale, mu)`` with minuit on one generated test sample.

    Parameters
    ----------
    i_cv
        Index of the cross-validation iteration (stored in the result row).
    i_iter
        Index of this iteration; it names the ``iter_<i_iter>`` output
        sub-directory and is stored in the result row.
    config
        Configuration providing ``TRUE``, ``CALIBRATED``,
        ``CALIBRATED_ERROR`` and ``N_TESTING_SAMPLES``.
    seed
        Seed of the generator that produces the test sample.
    directory
        Parent directory in which the iteration directory is created.

    Returns
    -------
    dict with ``i_cv``, ``i`` and the entries returned by
    ``evaluate_minuit``.
    """
    log = logging.getLogger()
    row = {'i_cv': i_cv, 'i': i_iter}

    iter_directory = os.path.join(directory, f'iter_{i_iter}')
    os.makedirs(iter_directory, exist_ok=True)

    log.info(f"True Parameters   = {config.TRUE}")
    suffix = f'-mu={config.TRUE.mu:1.2f}_rescale={config.TRUE.rescale}'

    # This generator plays the role of the test generator.
    generator = Generator(seed)
    data, label = generator.sample_event(
        *config.TRUE, size=config.N_TESTING_SAMPLES)
    debug_label(label)

    def compute_nll(rescale, mu):
        # Negative log-likelihood of the sampled data at (rescale, mu).
        return generator.nll(data, rescale, mu)

    plot_nll_around_min(compute_nll, config.TRUE, iter_directory, suffix)

    log.info('Prepare minuit minimizer')
    minimizer = get_minimizer(
        compute_nll, config.CALIBRATED, config.CALIBRATED_ERROR)
    fit_summary = evaluate_minuit(minimizer, config.TRUE)
    row.update(fit_summary)
    return row
Пример #12
0
def explore_links():
    """Scatter-plot ``mean(x)`` against ``mu`` for several ``rescale`` values.

    For 5 evenly spaced ``rescale`` values and 15 evenly spaced ``mu`` values
    taken from ``config.RANGE``, events are sampled and their mean along
    axis 0 is plotted against ``mu``. The figure is saved as
    ``mean_link.png`` in ``DIRECTORY``.
    """
    config = Config()
    generator = Generator()

    rescale_bounds = config.RANGE.rescale
    rescale_range = np.linspace(min(rescale_bounds),
                                max(rescale_bounds),
                                num=5)
    mu_range = np.linspace(min(config.RANGE.mu), max(config.RANGE.mu), num=15)

    for rescale in rescale_range:
        means = []
        targets = []
        for mu in mu_range:
            events, _labels = generator.sample_event(
                rescale, mu, size=config.N_TESTING_SAMPLES)
            means.append(np.mean(events, axis=0))
            targets.append(mu)
        # One scatter series per rescale value.
        plt.scatter(means, targets, label=f'rescale={rescale}')

    plt.title('Link between mean(x) and mu')
    plt.xlabel('mean(x)')
    plt.ylabel('mu')
    plt.legend()
    plt.savefig(os.path.join(DIRECTORY, 'mean_link.png'))
    plt.clf()
Пример #13
0
def get_generators(i_cv=0):
    """Return the (train, valid, test) generators for iteration ``i_cv``.

    The three generators are seeded with consecutive integers starting at
    ``SEED + i_cv * 5``.
    """
    base_seed = SEED + i_cv * 5
    train_generator, valid_generator, test_generator = (
        Generator(base_seed + offset) for offset in range(3))
    return train_generator, valid_generator, test_generator
Пример #14
0
def run_iter(i_cv, i_iter, config, seed, directory):
    """Estimate ``mu`` and ``rescale`` from a posterior evaluated on a grid.

    A data sample is generated at ``config.TRUE``. The log-likelihood and a
    uniform log-prior are evaluated on a 2D (rescale x mu) grid centred on
    the true values, normalised into a joint posterior, and marginalised.
    Posterior means, variances and the stat/syst uncertainties on ``mu`` are
    stored, logged and plotted.

    Parameters
    ----------
    i_cv
        Index of the cross-validation iteration (stored in the results).
    i_iter
        Index of this iteration; it names the ``iter_<i_iter>`` output
        sub-directory and is stored in the results.
    config
        Configuration object; only ``config.TRUE`` is read here.
    seed
        Seed of the data generator.
    directory
        Parent directory in which the iteration directory is created.

    Returns
    -------
    dict with the estimator values for ``mu`` and ``rescale``.
    """
    # Init
    logger = logging.getLogger()
    print_line()
    logger.info('running iter n°{}'.format(i_iter))
    directory = os.path.join(directory, f'iter_{i_iter}')
    os.makedirs(directory, exist_ok=True)
    results = dict(i_cv=i_cv, i=i_iter)

    # Config
    RESCALE_MIN = config.TRUE.rescale - 0.2
    RESCALE_MAX = config.TRUE.rescale + 0.2

    # The mu window is clipped to [0, 1].
    MU_MIN = max(0, config.TRUE.mu - 0.1)
    MU_MAX = min(1.0, config.TRUE.mu + 0.1)

    MU_N_SAMPLES = 142
    RESCALE_N_SAMPLES = 145
    DATA_N_SAMPLES = 2000

    # Prior
    prior_rescale = stats.uniform(loc=RESCALE_MIN,
                                  scale=RESCALE_MAX - RESCALE_MIN)
    prior_mu = stats.uniform(loc=MU_MIN, scale=MU_MAX - MU_MIN)

    # Param grid
    rescale_grid = np.linspace(RESCALE_MIN, RESCALE_MAX, RESCALE_N_SAMPLES)
    mu_grid = np.linspace(MU_MIN, MU_MAX, MU_N_SAMPLES)

    # Data Generator
    generator = Generator(seed)
    data, label = generator.sample_event(*config.TRUE, size=DATA_N_SAMPLES)
    debug_label(label)

    # Compute likelihood
    # Axis 0 indexes rescale, axis 1 indexes mu.
    shape = (RESCALE_N_SAMPLES, MU_N_SAMPLES)
    n_elements = np.prod(shape)
    # Fixed: the grid is two-dimensional (rescale x mu), not 3D.
    logger.info(f"2D grid has {n_elements} elements")
    log_likelihood = np.zeros(shape)
    log_prior_proba = np.zeros(shape)
    for i, j in get_iter_prod(RESCALE_N_SAMPLES,
                              MU_N_SAMPLES,
                              progress_bar=True):
        log_likelihood[i, j] = generator.log_proba_density(
            data, rescale_grid[i], mu_grid[j]).sum()
        log_prior_proba[i, j] = prior_rescale.logpdf(
            rescale_grid[i]) + prior_mu.logpdf(mu_grid[j])
    debug_log_proba(log_likelihood, log_prior_proba)

    # Normalization
    # NOTE(review): assumes softmax normalises over the whole 2D array so
    # that the joint posterior sums to one - confirm against its definition.
    posterior_rescale_mu = softmax(log_likelihood + log_prior_proba)
    debug_posterior(posterior_rescale_mu)

    # Marginal posterior param proba
    # Summing over axis 1 (mu) leaves rescale; summing over axis 0 leaves mu.
    marginal_rescale = posterior_rescale_mu.sum(axis=1)
    marginal_mu = posterior_rescale_mu.sum(axis=0)
    assert marginal_rescale.shape == rescale_grid.shape, "sum along the wrong axis for marginal rescale"
    assert marginal_mu.shape == mu_grid.shape, "sum along the wrong axis for marginal mu"
    debug_marginal(marginal_rescale, "rescale")
    debug_marginal(marginal_mu, "mu")

    # Conditional posterior
    # p(mu | x, rescale) = p(mu, rescale | x) / p(rescale | x); cells where
    # the joint posterior is exactly zero are left at zero instead of being
    # divided.
    posterior_mu = np.divide(posterior_rescale_mu,
                             marginal_rescale.reshape(RESCALE_N_SAMPLES, 1),
                             out=np.zeros_like(posterior_rescale_mu),
                             where=(posterior_rescale_mu != 0))

    # Minor check
    logger.debug("probability densities should sum to one")
    debug_proba_sum_one(posterior_mu * marginal_rescale.reshape(-1, 1))
    debug_proba_sum_one(posterior_rescale_mu)
    debug_proba_sum_one(marginal_rescale)
    debug_proba_sum_one(marginal_mu)

    # Compute estimator values
    sig_ratio = np.sum(label == 1) / DATA_N_SAMPLES
    expect_mu = expectancy(mu_grid, marginal_mu)
    var_mu = variance(mu_grid, marginal_mu)
    std_mu = np.sqrt(var_mu)
    expect_rescale = expectancy(rescale_grid, marginal_rescale)
    var_rescale = variance(rescale_grid, marginal_rescale)
    std_rescale = np.sqrt(var_rescale)

    stat_err = stat_uncertainty(mu_grid, posterior_mu, marginal_rescale)
    syst_err = syst_uncertainty(mu_grid, posterior_mu, marginal_rescale)

    # Grid position of the maximum likelihood point.
    i_max, j_max = np.unravel_index(np.argmax(log_likelihood),
                                    log_likelihood.shape)
    assert np.max(log_likelihood) == log_likelihood[
        i_max, j_max], "max and argmax should point to the same value"

    # Save estimator values
    # NOTE(review): the `_ERROR` entries hold the posterior variance while
    # the `_std` entries hold its square root - confirm this is what the
    # consumers of these keys expect.
    results['mu'] = expect_mu
    results['mu' + _TRUTH] = config.TRUE.mu
    results['mu_std'] = std_mu
    results['mu' + _ERROR] = var_mu
    results['mu_stat'] = stat_err
    results['mu_syst'] = syst_err
    results['rescale'] = expect_rescale
    results['rescale' + _TRUTH] = config.TRUE.rescale
    results['rescale_std'] = std_rescale
    results['rescale' + _ERROR] = var_rescale

    # Log estimator values
    logger.info(f"True mu value    = {config.TRUE.mu}")
    logger.info(f"Sig ratio         = {sig_ratio}")
    logger.info(f"E[mu|x]          = {expect_mu}")
    logger.info(f"Var[mu|x]        = {var_mu}")
    logger.info(f"sqrt(Var[mu|x])  = {std_mu}")
    logger.info(f"stat_uncertainty = {stat_err}")
    logger.info(f"syst_uncertainty = {syst_err}")
    logger.info(f"Var - stat       = {var_mu - stat_err}")
    logger.info(f"argmax_mu p(mu|x) = {mu_grid[np.argmax(marginal_mu)]}")
    logger.info(
        f"argmax_rescale_mu logp(x|rescale, mu) = {rescale_grid[i_max]} {mu_grid[j_max]}"
    )

    # Minor checks
    debug_min_max(marginal_mu, 'p(mu | x)')
    debug_min_max(marginal_rescale, 'p(rescale | x)')
    debug_min_max(posterior_mu, 'p(mu | x, rescale)')
    debug_min_max(posterior_rescale_mu, 'p(mu, rescale | x)')

    # Plots
    plot_infer(mu_grid,
               marginal_mu,
               expected_value=expect_mu,
               true_value=config.TRUE.mu,
               std=std_mu,
               name='mu',
               directory=directory,
               fname='marginal_mu.png')

    plot_infer(rescale_grid,
               marginal_rescale,
               expected_value=expect_rescale,
               true_value=config.TRUE.rescale,
               std=std_rescale,
               name='rescale',
               directory=directory,
               fname='marginal_rescale.png')

    plot_distrib(data,
                 generator,
                 config.TRUE,
                 expect_rescale,
                 expect_mu,
                 title="data distribution",
                 directory=directory,
                 fname='data_distrib.png')

    return results
Пример #15
0
def main():
    """Decompose the variance of the ``mu`` estimator into stat and syst.

    For each of 25 sampled nuisance-parameter sets, ``N_ITER`` classifiers
    are trained and ``mu`` is fitted with minuit on a fixed test sample. The
    mean over parameter sets of the per-set variance is reported as
    ``V_stat``, the variance of the per-set means as ``V_syst``, and their
    sum as ``V_total``. All fit results go to ``results.csv`` and the three
    variances to ``info.json`` in the model's results path.
    """
    # BASIC SETUP
    log = set_logger()
    args = GB_parse_args(
        main_description=
        "Training launcher for Gradient boosting on S3D2 benchmark")
    log.info(args)
    flush(log)

    # Config
    config = Config()
    config.TRUE = Parameter(rescale=0.9, mu=0.1)
    train_generator = Generator(SEED)
    valid_generator = Generator(SEED + 1)
    test_generator = Generator(SEED + 2)
    X_test, y_test, w_test = test_generator.generate(
        *config.TRUE, n_samples=config.N_TESTING_SAMPLES)

    # for nuisance in p(nuisance | data)
    nuisance_param_sample = []
    for _ in range(25):
        nuisance_param_sample.append(param_generator().nuisance_parameters)

    per_nuisance_mean = []
    per_nuisance_variance = []
    all_rows = []
    for nuisance_params in nuisance_param_sample:
        log.info(f"nuisance_params = {nuisance_params}")
        mu_estimates = []
        for i_cv in range(N_ITER):
            clf = build_model(args, i_cv)
            parameters = Parameter(*nuisance_params,
                                   config.CALIBRATED.interest_parameters)
            print(parameters)

            X_train, y_train, w_train = train_generator.generate(
                *parameters, n_samples=config.N_TRAINING_SAMPLES)
            log.info(f"Training {clf.full_name}")
            clf.fit(X_train, y_train, w_train)

            compute_summaries = ClassifierSummaryComputer(clf, n_bins=10)
            nll_computer = NLLComputer(compute_summaries,
                                       valid_generator,
                                       X_test,
                                       w_test,
                                       config=config)

            def compute_nll(mu):
                # Nuisance parameters are held fixed; only mu is fitted.
                return nll_computer(*nuisance_params, mu)

            minimizer = get_minimizer(compute_nll)
            fit = evaluate_minuit(minimizer,
                                  [config.TRUE.interest_parameters])
            mu_estimates.append(fit['mu'])

            fit['i_cv'] = i_cv
            fit.update(params_to_dict(parameters, suffix='true'))
            all_rows.append(fit.copy())
        per_nuisance_mean.append(np.mean(mu_estimates))
        per_nuisance_variance.append(np.var(mu_estimates))

    # A model instance is built only to obtain the output directory.
    model = build_model(args, 0)
    model.set_info(DATA_NAME, BENCHMARK_NAME, 0)
    save_directory = model.results_path
    os.makedirs(save_directory, exist_ok=True)

    result_table = pd.DataFrame(all_rows)
    result_table.to_csv(os.path.join(save_directory, 'results.csv'))

    log.info(f"average_list {per_nuisance_mean}")
    log.info(f"variance_list {per_nuisance_variance}")
    v_stat = np.mean(per_nuisance_variance)
    v_syst = np.var(per_nuisance_mean)
    v_total = v_stat + v_syst
    for label, value in (("V_stat", v_stat),
                         ("V_syst", v_syst),
                         ("V_total", v_total)):
        log.info(f"{label} = {value}")

    eval_dict = {"V_stat": v_stat, "V_syst": v_syst, "V_total": v_total}
    eval_path = os.path.join(save_directory, 'info.json')
    with open(eval_path, 'w') as f:
        json.dump(eval_dict, f)