Example #1
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt


def run(df):
    teams = df.home_team.unique()
    teams = pd.DataFrame(teams, columns=['team'])
    teams['i'] = teams.index

    df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_home'}).drop(columns='team')
    df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_away'}).drop(columns='team')
    
    observed_home_goals = df.home_score.values
    observed_away_goals = df.away_score.values
    
    home_team = df.i_home.values
    away_team = df.i_away.values
    
    num_teams = len(df.i_home.drop_duplicates())
    num_games = len(home_team)
    
    g = df.groupby('i_away')
    att_starting_points = np.log(g.away_score.mean())
    g = df.groupby('i_home')
    def_starting_points = -np.log(g.away_score.mean())
    with pm.Model() as model:
        # global model parameters
        home = pm.Flat('home')
        sd_att = pm.HalfStudentT('sd_att', nu=3, sigma=2.5)
        sd_def = pm.HalfStudentT('sd_def', nu=3, sigma=2.5)
        intercept = pm.Flat('intercept')
    
        # team-specific model parameters
        atts_star = pm.Normal("atts_star", mu=0, sigma=sd_att, shape=num_teams)
        defs_star = pm.Normal("defs_star", mu=0, sigma=sd_def, shape=num_teams)
    
        atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
        defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
        home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
        away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
    
        # likelihood of observed data
        home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
        away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
    with model:
        trace = pm.sample(1000, tune=1000, cores=3)
    pm.traceplot(trace, var_names=['intercept', 'home', 'sd_att', 'sd_def'])
    bfmi = pm.bfmi(trace)
    max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
    (pm.energyplot(trace, legend=False, figsize=(6, 4))
     .set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)))
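A note on the indexing trick above: the two merges replace team names with integer codes so that atts[home_team] and defs[away_team] can gather the per-team parameters. A minimal sketch with made-up fixtures (pandas only, no sampling):

import pandas as pd

toy = pd.DataFrame({'home_team': ['Wales', 'France'],
                    'away_team': ['France', 'Wales'],
                    'home_score': [16, 21], 'away_score': [21, 16]})
teams = pd.DataFrame({'team': toy.home_team.unique()})
teams['i'] = teams.index

toy = toy.merge(teams, left_on='home_team', right_on='team', how='left')
toy = toy.rename(columns={'i': 'i_home'}).drop(columns='team')
toy = toy.merge(teams, left_on='away_team', right_on='team', how='left')
toy = toy.rename(columns={'i': 'i_away'}).drop(columns='team')
print(toy[['home_team', 'i_home', 'away_team', 'i_away']])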
Example #2

def main(input_dir, output_dir, dataset, model_type, n_samples, n_tune, target_accept, n_cores, seed, init, profile):
    '''Fit log-parabola model to DATASET. 

    Parameters
    ----------
    input_dir : str
        input directory containing subdirectories for each instrument with DL3 data
    output_dir : str
        where to save the results (traces and two plots)
    dataset : string
        telescope name
    model_type : string
        whether to use the profile likelihood ('wstat' or 'profile') or not ('full')
    n_samples : int
        number of samples to draw
    n_tune : int
        number of tuning steps
    target_accept : float
        target accept fraction for the pymc sampler
    n_cores : int
        number of cpu cores to use
    seed : int
        random seed
    init : string
        pymc init string
    profile : bool
        whether to output debugging/profiling information to the console

    Raises
    ------
    NotImplementedError
        This does not yet work on the joint dataset, but that's good enough for me.
    '''
    np.random.seed(seed)

    if dataset == 'joint':
        #TODO need to calculate mu_b for each observation independently.
        raise NotImplementedError('This is not implemented for the joint dataset yet.')
        # observations, lo, hi = load_joint_spectrum_observation(input_dir)
    else:
        p = os.path.join(input_dir, dataset)
        observations, lo, hi = load_spectrum_observations(p)

    prepare_output(output_dir)

    # TODO: this has to happen for every observation independently
    exposure_ratio = observations[0].alpha[0]
    # print(exposure_ratio)
    on_data, off_data = get_observed_counts(observations)

    integrator = init_integrators(observations)

    print('On Data')
    display_data(on_data)

    print('Off Data')
    display_data(off_data)
    
    print('--' * 30)
    print(f'Fitting data for {dataset} in {len(observations)} observations.')
    print(f'Using {len(on_data)} bins with {on_data.sum()} counts in on region and {off_data.sum()} counts in off region.')
    print(f'Fit range is: {(lo, hi) * u.TeV}.')
    model = pm.Model(theano_config={'compute_test_value': 'ignore'})
    with model:
        # amplitude = pm.TruncatedNormal('amplitude', mu=4, sd=1, lower=0.01, testval=4)
        # alpha = pm.TruncatedNormal('alpha', mu=2.5, sd=1, lower=0.00, testval=2.5)
        # beta = pm.TruncatedNormal('beta', mu=0.5, sd=0.5, lower=0.00000, testval=0.5)
        amplitude = pm.HalfFlat('amplitude', testval=4)
        alpha = pm.HalfFlat('alpha', testval=2.5)
        beta = pm.HalfFlat('beta', testval=0.5)

        mu_s = forward_fold_log_parabola_symbolic(integrator, amplitude, alpha, beta, observations)
        # mu_s = forward_fold_log_parabola_analytic(amplitude, alpha, beta, observations)

        if model_type == 'wstat':
            print('Building profiled likelihood model')
            mu_b = pm.Deterministic('mu_b', calc_mu_b(mu_s, on_data, off_data, exposure_ratio))
        else:
            print('Building full likelihood model')
            mu_b = pm.HalfFlat('mu_b', shape=len(off_data))

        pm.Poisson('background', mu=mu_b, observed=off_data, shape=len(off_data))
        pm.Poisson('signal', mu=mu_s + exposure_ratio * mu_b, observed=on_data, shape=len(on_data))


    print('--' * 30)
    print('Model debug information:')
    for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

    if profile:
        model.profile(model.logpt).summary()

    print(model.check_test_point())

    print('--' * 30)
    print('Plotting landscape:')
    fig, _ = plot_landscape(model, off_data)
    fig.savefig(os.path.join(output_dir, 'landscape.pdf'))

    print('--' * 30)
    print('Printing graphs:')
    theano.printing.pydotprint(mu_s, outfile=os.path.join(output_dir, 'graph_mu_s.pdf'), format='pdf', var_with_name_simple=True)  
    theano.printing.pydotprint(mu_s + exposure_ratio * mu_b, outfile=os.path.join(output_dir, 'graph_n_on.pdf'), format='pdf', var_with_name_simple=True)  


    print('--' * 30)
    print('Sampling likelihood:')
    with model:
        trace = pm.sample(n_samples, cores=n_cores, tune=n_tune, init=init, seed=[seed] * n_cores)

    print('--' * 30)
    print(f'Fit results for {dataset}')
    print('posterior means:  ', trace['amplitude'].mean(), trace['alpha'].mean(), trace['beta'].mean())
    print('posterior medians:', np.median(trace['amplitude']), np.median(trace['alpha']), np.median(trace['beta']))

    print('--' * 30)
    # print('Plotting traces')
    # plt.figure()
    # varnames = ['amplitude', 'alpha', 'beta'] if model_type != 'full' else ['amplitude', 'alpha', 'beta', 'mu_b']
    # pm.traceplot(trace, varnames=varnames)
    # plt.savefig(os.path.join(output_dir, 'traces.pdf'))

    p = os.path.join(output_dir, 'num_samples.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_samples}}}')

    p = os.path.join(output_dir, 'num_chains.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_cores}}}')
    
    p = os.path.join(output_dir, 'num_tune.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_tune}}}')

    plt.figure()
    pm.energyplot(trace)
    plt.savefig(os.path.join(output_dir, 'energy.pdf'))

    # plt.figure()
    # pm.autocorrplot(trace, burn=n_tune)
    # plt.savefig(os.path.join(output_dir, 'autocorr.pdf'))
    
    plt.figure()
    pm.forestplot(trace, varnames=['amplitude', 'alpha', 'beta'])
    plt.savefig(os.path.join(output_dir, 'forest.pdf'))
    

    trace_output = os.path.join(output_dir, 'traces')
    print(f'Saving traces to {trace_output}')
    with model:
        pm.save_trace(trace, trace_output, overwrite=True)
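For intuition on the 'wstat' branch above: profiling replaces the per-bin nuisance parameter mu_b with the value that maximises the Poisson likelihood for a given mu_s, instead of sampling it. A hedged numerical sketch of that idea (calc_mu_b in the source presumably uses a closed-form solution; profile_mu_b and the toy counts below are illustrative only):

import numpy as np
from scipy.optimize import minimize_scalar

def profile_mu_b(mu_s, n_on, n_off, alpha):
    # conditional MLE of the background: on ~ Poisson(mu_s + alpha * mu_b),
    # off ~ Poisson(mu_b), maximised over mu_b with mu_s held fixed
    def neg_loglike(mu_b):
        lam_on = mu_s + alpha * mu_b
        return (lam_on - n_on * np.log(lam_on)) + (mu_b - n_off * np.log(mu_b))
    return minimize_scalar(neg_loglike, bounds=(1e-9, 1e6), method='bounded').x

print(profile_mu_b(mu_s=5.0, n_on=12, n_off=40, alpha=0.2))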
Example #3
    u = pm.Normal('u', 0, 0.0001)

    mu = pm.Deterministic('mu', tt.exp(beta + beta1 * x_shared + u))
    # Observed_pred = pm.Weibull("Observed_pred", alpha=mu, beta=sigma, shape=elec_faults.shape)  # posterior predictive
    Observed = pm.Weibull("Observed", alpha=sigma, beta=mu,
                          observed=y_shared)  # observed values

    start = pm.find_MAP()
    # step = pm.Metropolis([switchpoint])
    trace2 = pm.sample(3000, start=start)
chain2 = trace2[1000:]
varnames2 = ['beta', 'early_rate', 'late_rate', 'sigma', 'u']

pm.traceplot(chain2)
plt.show()
pm.energyplot(trace2)
plt.show()
# # plot the autocorrelation curves
# pm.autocorrplot(chain2, varnames2)
# plt.show()
print(pm.dic(trace2, unpooled_model))

# x_shared.set_value([6, 6, 7])
# x_shared1.set_value([20, 40, 40])
# y_shared.set_value([0, 0, 0])
elec_year1 = np.delete(elec_year, np.s_[:6])
elec_year1 = np.append([2, 3, 4, 5, 6, 7], elec_year1)
x_shared.set_value(elec_year1)
with unpooled_model:
    trace3 = pm.sample(3000)
    post_pred = pm.sample_ppc(trace3)
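The commented-out set_value calls above are the standard shared-variable prediction pattern in PyMC3: swap new predictor values (and dummy observations of the same length) into the theano.shared containers, then draw posterior predictive samples from the unchanged model. A self-contained hedged sketch (toy_model and the toy data are made up):

import numpy as np
import pymc3 as pm
import theano

x_shared = theano.shared(np.array([1., 2., 3.]))
y_shared = theano.shared(np.array([2.1, 3.9, 6.2]))

with pm.Model() as toy_model:
    b = pm.Normal('b', 0., 10.)
    pm.Normal('obs', mu=b * x_shared, sd=1., observed=y_shared)
    toy_trace = pm.sample(500, tune=500, cores=1)

x_shared.set_value(np.array([4., 5., 6.]))  # new inputs
y_shared.set_value(np.zeros(3))             # dummy targets of the same length
with toy_model:
    ppc = pm.sample_ppc(toy_trace, samples=500)
print(ppc['obs'].mean(axis=0))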
Example #4
    δ = pm.Normal('δ', 0, sd=(δ_1 * δ_1))
    # δ = pm.Normal('δ', 0, sd=20)  # use this line for δ instead if the model converges poorly
    theta1 = pm.Deterministic('theta1', a0 + (Δ_a).cumsum())

    theta = Bx_.dot(theta1) + δ
    Observed = pm.Normal('Observed', mu=theta, sd=sigma, observed=elec_faults)  # observed values

    # start = pm.find_MAP()
    step1 = pm.Slice([tau1, a_0])
    trace2 = pm.sample(1000, tune=500, step=step1)
chain2 = trace2
varnames1 = ['a0', 'δ', 'sigma', 'tau1']
pm.plot_posterior(chain2, varnames1, kde_plot=True)
plt.show()

pm.energyplot(chain2)  # energy plot: the more the two distributions overlap, the better the model
plt.show()
# plot the autocorrelation curves
varnames1 = ['a0', 'δ', 'sigma', 'tau1']
pm.autocorrplot(chain2, varnames1)
plt.show()
print(pm.df_summary(chain2, varnames1))

print(pm.waic(trace=trace2, model=partial_model))
# ======================================================================
# Posterior analysis:
# plot the posterior predictions against the original data
# ======================================================================
# Bx_.set_value([7,8] , [5,6])
with partial_model:
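The cumsum construction above (theta1 = a0 + Δ_a.cumsum()) is the usual non-centered way to give spline coefficients a random-walk prior: cumulatively summing i.i.d. normal increments is, in distribution, a Gaussian random walk. A quick numpy-only illustration (values made up):

import numpy as np

rng = np.random.default_rng(0)
sigma = 0.5
increments = rng.normal(0., 1., size=10)    # like Δ_a ~ Normal(0, 1)
walk = 2.0 + (sigma * increments).cumsum()  # like a0 + (sigma * Δ_a).cumsum()
print(np.round(walk, 2))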
Example #5
with pm.Model() as model:
    σ_a = pm.HalfCauchy('σ_a', 5.)
    a0 = pm.Normal('a0', 0., 10.)
    Δ_a = pm.Normal('Δ_a', 0., 1., shape=N_MODEL_KNOTS)

    a = pm.Deterministic('a', a0 + (σ_a * Δ_a).cumsum())  # cumsum: cumulative sum of the increments
    σ = pm.HalfCauchy('σ', 5.)

    obs = pm.Normal('obs', Bx_.dot(a), σ, observed=y)

Bx_.set_value(basis_funcs(x_plot))

with model:
    trace = pm.sample(nuts_kwargs={'target_accept': 0.95})

pm.energyplot(trace)
plt.show()

varnames1 = ['σ_a', 'a0', 'Δ_a', 'σ']
pm.traceplot(trace, varnames1)
plt.show()

# posterior analysis
with model:
    pp_trace = pm.sample_ppc(trace, 1000)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x_plot, spline(x_plot), c='k', label="True function")

low, high = np.percentile(pp_trace['obs'], [25, 75], axis=0)
ax.fill_between(x_plot, low, high, color='red', alpha=0.5)  # 'red' was presumably a palette colour in the source
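Bx_ above is presumably a design matrix of B-spline basis functions evaluated at the inputs, so the model is a penalized spline regression with obs ~ Normal(Bx_ . a, σ). A hedged sketch of building such a matrix (SciPy >= 1.8 provides BSpline.design_matrix; the knot vector here is made up):

import numpy as np
from scipy.interpolate import BSpline

x = np.linspace(0., 1., 50)
k = 3                                               # cubic splines
interior = np.linspace(0., 1., 8)
t = np.concatenate([[0.] * k, interior, [1.] * k])  # clamped knot vector
B = BSpline.design_matrix(x, t, k).toarray()        # shape (50, len(t) - k - 1)
print(B.shape)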
Example #6
    start = pm.find_MAP()
    # step = pm.Metropolis()
    # trace2 = pm.sample(nuts_kwargs={'target_accept': 0.95})
    trace2 = pm.sample(3000, tune=1000)
chain2 = trace2
varnames1 = ['σ_a', 'σ_aB', 'theta1', 'theta1B']
pm.traceplot(chain2, varnames1)
plt.show()
varnames1 = ['a0', 'sigma', 'δ', 'δB', 'Δ_a', 'Δ_aB']
pm.traceplot(chain2, varnames1)
plt.show()

plt.plot(trace2['step_size_bar'])
plt.show()

pm.energyplot(chain2)
plt.show()
# plot the autocorrelation curves
varnames1 = ['σ_a', 'a0', 'δ', 'σ_aB', 'Δ_a', 'δB']
pm.autocorrplot(chain2, varnames1)
plt.show()

# posterior analysis
with partial_model:
    pp_trace = pm.sample_ppc(trace2, 1000)

fig, ax = plt.subplots(figsize=(8, 6))

j, k1 = 0, 6
x_plot = np.linspace(1, k1, Num // 7)  # linspace needs an integer sample count
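The diagnostics above (the step_size_bar trace, energy plot, autocorrelation) generalise to any NUTS run. A hedged helper collecting them; quick_nuts_diagnostics is a name of my choosing and assumes a PyMC3 MultiTrace sampled with NUTS:

import matplotlib.pyplot as plt
import pymc3 as pm

def quick_nuts_diagnostics(trace):
    print('divergences:', trace['diverging'].sum())
    print('BFMI:', pm.bfmi(trace))
    plt.plot(trace.get_sampler_stats('step_size_bar'))
    plt.ylabel('time-averaged NUTS step size')
    plt.show()
    pm.energyplot(trace)
    plt.show()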
Example #7
def main(input_dir, config_file, output_dir, dataset, model_type, tau,
         n_samples, n_tune, target_accept, n_cores, seed, init):
    prepare_output(output_dir)

    config = load_config(config_file, dataset)
    fit_range = config['fit_range']
    path = os.path.join(input_dir, dataset)
    observations = load_data(path, config)
    e_true_bins = config['e_true_bins']
    e_reco_bins = config['e_reco_bins']
    # exposure_ratio = observations[0].alpha
    # from IPython import embed; embed()
    on_data, off_data, excess, exposure_ratio = get_observed_counts(
        observations, fit_range=fit_range, bins=e_reco_bins)
    print(f'Exposure ratio {exposure_ratio}')
    # from IPython import embed; embed()
    # print(f'On Data {on_data.shape}\n')
    # display_data(on_data)
    # print(f'\n\n Off Data {off_data.shape}\n')
    # display_data(off_data)
    print(f'Excess {excess.shape} \n')
    idx = np.searchsorted(e_reco_bins.to_value(u.TeV),
                          fit_range.to_value(u.TeV))
    lo, up = idx[0], idx[1] + 1
    indices = np.argwhere(excess > 5)
    display_data(excess, mark_indices=indices, low_index=lo, high_index=up)
    print('--' * 30)
    print(f'Unfolding data for: {dataset.upper()}.')
    # print(f'IRF with {len( config['e_true_bins'] ) - 1, len( config['e_reco_bins'] ) - 1}')
    print(
        f'Using {len(on_data)} bins with {on_data.sum()} counts in on region and {off_data.sum()} counts in off region.'
    )

    area_scaling = 1
    print(observations[0].aeff.data.data.to_value(u.km**2).astype(np.float32) *
          area_scaling)
    model = pm.Model(theano_config={'compute_test_value': 'ignore'})
    with model:
        # mu_b = pm.TruncatedNormal('mu_b', shape=len(off_data), sd=5, mu=off_data, lower=0.01)
        # expected_counts = pm.Lognormal('expected_counts', shape=len(config['e_true_bins']) - 1, testval=10, sd=1)
        expected_counts = pm.TruncatedNormal('expected_counts',
                                             shape=len(e_true_bins) - 1,
                                             testval=0.5,
                                             mu=2,
                                             sd=50,
                                             lower=0.0001)
        # expected_counts = pm.HalfFlat('expected_counts', shape=len(e_true_bins) - 1, testval=10)
        # c = expected_counts / area_scaling
        mu_s = forward_fold(expected_counts,
                            observations,
                            fit_range=fit_range,
                            area_scaling=area_scaling)

        if model_type == 'wstat':
            print('Building profiled likelihood model')
            mu_b = pm.Deterministic(
                'mu_b', calc_mu_b(mu_s, on_data, off_data, exposure_ratio))
        else:
            print('Building full likelihood model')
            mu_b = pm.HalfFlat('mu_b', shape=len(off_data))
            # mu_b = pm.Lognormal('mu_b', shape=len(off_data), sd=5)

        pm.Poisson('background', mu=mu_b + 1E-5, observed=off_data)
        pm.Poisson('signal',
                   mu=mu_s + exposure_ratio * mu_b + 1E-5,
                   observed=on_data)

    print('--' * 30)
    print('Model debug information:')
    for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

    print('--' * 30)
    print('Sampling likelihood:')
    with model:
        trace = pm.sample(n_samples,
                          cores=n_cores,
                          tune=n_tune,
                          init=init,
                          seed=[seed] * n_cores)

    trace_output = os.path.join(output_dir, 'traces')
    print(f'Saving traces to {trace_output}')
    with model:
        pm.save_trace(trace, trace_output, overwrite=True)

    print('--' * 30)
    print('Plotting result')
    # print(area_scaling)

    fig, [ax1, ax2] = plt.subplots(2,
                                   1,
                                   figsize=(10, 7),
                                   sharex=True,
                                   gridspec_kw={'height_ratios': [3, 1]})

    plot_unfolding_result(trace,
                          bins=e_true_bins,
                          area_scaling=area_scaling,
                          fit_range=fit_range,
                          ax=ax1)
    plot_excees(excess, bins=config['e_reco_bins'], ax=ax2)
    plt.savefig(os.path.join(output_dir, 'result.pdf'))

    print('--' * 30)
    print('Plotting Diagnostics')
    print(pm.summary(trace).round(2))
    # plt.figure()
    # pm.traceplot(trace)
    # plt.savefig(os.path.join(output_dir, 'traces.pdf'))

    plt.figure()
    pm.energyplot(trace)
    plt.savefig(os.path.join(output_dir, 'energy.pdf'))

    # try:
    #     plt.figure()
    #     pm.autocorrplot(trace, burn=n_tune)
    #     plt.savefig(os.path.join(output_dir, 'autocorr.pdf'))
    # except:
    #     print('Could not plot autocorrelation')

    plt.figure()
    pm.forestplot(trace)
    plt.savefig(os.path.join(output_dir, 'forest.pdf'))

    p = os.path.join(output_dir, 'num_samples.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_samples}}}')

    p = os.path.join(output_dir, 'num_chains.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_cores}}}')

    p = os.path.join(output_dir, 'num_tune.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_tune}}}')
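The three file-writing blocks above (also present in the earlier example) emit siunitx \num{} macros for a LaTeX report. A small hedged helper that removes the repetition; write_tex_number is a name of my choosing:

import os

def write_tex_number(output_dir, name, value):
    # writes e.g. \num{5000} to <output_dir>/<name>.txt for \input in LaTeX
    with open(os.path.join(output_dir, f'{name}.txt'), 'w') as f:
        f.write(f'\\num{{{value}}}')

# write_tex_number(output_dir, 'num_samples', n_samples)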
Example #8
    # step = pm.Metropolis()
    trace3 = pm.sample(5000, start=start, tune=2000)
chain3 = trace3[2000:]
varnames1 = ['beta', 'beta1', 'beta2', 'beta3', 'beta4', 'beta5']
pm.traceplot(chain3, varnames1)
plt.show()
print(pm.df_summary(trace3, varnames1))
varnames1 = ['sigma', 'mu_a', 'sigma_a', 'theta1', 'beta12']
pm.traceplot(chain3, varnames1)
plt.show()
print(pm.df_summary(trace3, varnames1))
# # plot the autocorrelation curves
pm.autocorrplot(chain3)
plt.show()

pm.energyplot(chain3)
plt.show()

post_beta = chain3['beta']         # shape (n_draws, 3): y has three rows of data
post_beta1 = chain3['beta'][:, 2]  # all draws of the third component (the right way to read it)
post_beta2 = chain3['beta'][1]     # the second draw of all components (this reading should also work)
#
#
# # plot pairwise correlations between parameters
# tracedf = pm.trace_to_dataframe(trace3, varnames=['beta1', 'beta2', 'beta3', 'beta4', 'beta5'])
# sns.pairplot(tracedf)
# plt.show()
# ======================================================================
# Model comparison and posterior analysis
# ======================================================================
# Waic = pm.compare([traces_ols_glm, trace1], [mdl_ols_glm, pooled_model], ic='WAIC')
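On the comparison sketched in the comment above: the WAIC scores that pm.compare ranks can also be computed one model at a time with pm.waic, as done elsewhere on this page. A self-contained hedged toy (models and data are made up):

import numpy as np
import pymc3 as pm

rng = np.random.default_rng(1)
x = np.linspace(0., 1., 50)
y = 2. * x + rng.normal(0., 0.3, size=50)

with pm.Model() as m_const:                  # intercept-only model
    mu = pm.Normal('mu', 0., 10.)
    sd = pm.HalfNormal('sd', 1.)
    pm.Normal('y', mu=mu, sd=sd, observed=y)
    t_const = pm.sample(500, tune=500, cores=1)

with pm.Model() as m_line:                   # linear model
    a = pm.Normal('a', 0., 10.)
    b = pm.Normal('b', 0., 10.)
    sd = pm.HalfNormal('sd', 1.)
    pm.Normal('y', mu=a + b * x, sd=sd, observed=y)
    t_line = pm.sample(500, tune=500, cores=1)

print(pm.waic(trace=t_const, model=m_const))  # compare the two WAIC estimates
print(pm.waic(trace=t_line, model=m_line))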
Example #9
    home_points = pm.Poisson('home_points',
                             mu=home_theta,
                             observed=observed_home_score)
    away_points = pm.Poisson('away_points',
                             mu=away_theta,
                             observed=observed_away_score)

with model:
    trace = pm.sample(1000, tune=1000, cores=3)
    pm.traceplot(trace)

# check for convergence?
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, legend=False,
               figsize=(6, 4)).set_title("BFMI = {}\nGelman-Rubin = {}".format(
                   bfmi, max_gr)))

# results
df_hpd = pd.DataFrame(pm.stats.hpd(trace['atts']),
                      columns=['hpd_low', 'hpd_high'],
                      index=teams.team.values)
df_median = pd.DataFrame(pm.stats.quantiles(trace['atts'])[50],
                         columns=['hpd_median'],
                         index=teams.team.values)
df_hpd = df_hpd.join(df_median)
df_hpd['relative_lower'] = df_hpd.hpd_median - df_hpd.hpd_low
df_hpd['relative_upper'] = df_hpd.hpd_high - df_hpd.hpd_median
df_hpd = df_hpd.sort_values(by='hpd_median')
df_hpd = df_hpd.reset_index()
df_hpd['x'] = df_hpd.index + .5
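The relative_lower/relative_upper columns built above are shaped for matplotlib's asymmetric yerr, giving a hand-rolled forest-style plot of team attack strength. A hedged sketch (note that reset_index left the team names in the 'index' column, assuming the index had no name):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 4))
ax.errorbar(df_hpd.x, df_hpd.hpd_median,
            yerr=df_hpd[['relative_lower', 'relative_upper']].values.T,
            fmt='o')
ax.set_xticks(df_hpd.x)
ax.set_xticklabels(df_hpd['index'], rotation=90)
ax.set_ylabel('posterior attack strength (atts)')
plt.tight_layout()
plt.show()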
Example #10
# ================================================================================
# Return a B-spline basis element B(x | t[0], ..., t[k+1])
xx = np.linspace(1, 15, Num)
b = sp.interpolate.BSpline.basis_element(knots[1:])
print(b)
fig, ax = plt.subplots()
x = np.linspace(0, 12, 200)
ax.plot(x, b(x), 'g', lw=3)
ax.grid(True)
plt.show()

pm.traceplot(trace_1)
plt.show()

ax = pm.energyplot(trace_1)
bfmi = pm.bfmi(trace_1)
ax.set_title(f"BFMI = {bfmi:.2f}")
plt.show()
varnames2 = ['δ', 'δB', 'δC']
tmp0 = pm.df_summary(trace_1, varnames2)
print(tmp0)

# ================================================================================
Bx_.set_value(basis_funcs(xs_yearA.get_value()))
# comparison model for the algorithm check: replace the first-order regression with a Gaussian random walk
with pm.Model() as model_3:
    # define priors
    alpha3 = pm.HalfCauchy('alpha3', 10., testval=1.15)

    beta0 = pm.GaussianRandomWalk('beta0', sd=1, shape=Num_5)
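The GaussianRandomWalk prior that the snippet ends on can be inspected with prior predictive draws; each draw is a cumulative sum of normal steps, one value per point. A short hedged sketch (the shape here is made up; the source uses Num_5):

import pymc3 as pm

with pm.Model():
    beta0 = pm.GaussianRandomWalk('beta0', sd=1., shape=20)
    prior = pm.sample_prior_predictive(100)

print(prior['beta0'].shape)   # (100, 20): 100 prior walks of length 20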