def run(df):
    # Build an integer index for each team so it can be used to index the
    # per-team attack/defence parameters.
    teams = df.home_team.unique()
    teams = pd.DataFrame(teams, columns=['team'])
    teams['i'] = teams.index

    df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
    df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

    observed_home_goals = df.home_score.values
    observed_away_goals = df.away_score.values
    home_team = df.i_home.values
    away_team = df.i_away.values
    num_teams = len(df.i_home.drop_duplicates())
    num_games = len(home_team)

    # Starting points for the team-level parameters, derived from the mean scores.
    g = df.groupby('i_away')
    att_starting_points = np.log(g.away_score.mean())
    g = df.groupby('i_home')
    def_starting_points = -np.log(g.away_score.mean())

    with pm.Model() as model:
        # global model parameters
        home = pm.Flat('home')
        sd_att = pm.HalfStudentT('sd_att', nu=3, sigma=2.5)
        sd_def = pm.HalfStudentT('sd_def', nu=3, sigma=2.5)
        intercept = pm.Flat('intercept')

        # team-specific model parameters
        atts_star = pm.Normal('atts_star', mu=0, sigma=sd_att, shape=num_teams)
        defs_star = pm.Normal('defs_star', mu=0, sigma=sd_def, shape=num_teams)

        # Centre the team effects so attack/defence strengths are identifiable.
        atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
        defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
        home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
        away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

        # likelihood of observed data
        home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
        away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)

        trace = pm.sample(1000, tune=1000, cores=3)

    pm.traceplot(trace, var_names=['intercept', 'home', 'sd_att', 'sd_def'])

    # Convergence diagnostics: BFMI and Gelman-Rubin statistic shown on the energy plot.
    bfmi = pm.bfmi(trace)
    max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
    (pm.energyplot(trace, legend=False, figsize=(6, 4))
       .set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)))
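# Hypothetical usage sketch for run() above (not part of the original snippet): a tiny
# made-up fixture with the column names the function expects (home_team, away_team,
# home_score, away_score). Every team appears as a home team at least once, because the
# team index is built from the home_team column only. The scores are purely illustrative.
import pandas as pd

toy_games = pd.DataFrame({
    'home_team': ['Wales', 'France', 'Ireland', 'Italy', 'England', 'Scotland'],
    'away_team': ['Italy', 'England', 'Scotland', 'France', 'Wales', 'Ireland'],
    'home_score': [23, 26, 16, 18, 21, 10],
    'away_score': [15, 24, 9, 30, 16, 22],
})
run(toy_games)  # builds the hierarchical Poisson model and draws 1000 samples per chain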
def main(input_dir, output_dir, dataset, model_type, n_samples, n_tune, target_accept, n_cores, seed, init, profile):
    '''Fit a log-parabola model to DATASET.

    Parameters
    ----------
    input_dir : str
        input directory containing subdirectories for each instrument with DL3 data
    output_dir : str
        where to save the results (traces and two plots)
    dataset : str
        telescope name
    model_type : str
        whether to use the profiled likelihood ('wstat' or 'profile') or not ('full')
    n_samples : int
        number of samples to draw
    n_tune : int
        number of tuning steps
    target_accept : float
        target acceptance fraction for the pymc sampler
    n_cores : int
        number of cpu cores to use
    seed : int
        random seed
    init : str
        pymc init string
    profile : bool
        whether to output debugging/profiling information to the console

    Raises
    ------
    NotImplementedError
        This does not yet work on the joint dataset.
    '''
    np.random.seed(seed)

    if dataset == 'joint':
        # TODO: need to calculate mu_b for each observation independently.
        raise NotImplementedError('This is not implemented for the joint dataset yet.')
        # observations, lo, hi = load_joint_spectrum_observation(input_dir)
    else:
        p = os.path.join(input_dir, dataset)
        observations, lo, hi = load_spectrum_observations(p)

    prepare_output(output_dir)

    # TODO: this has to happen for every observation independently
    exposure_ratio = observations[0].alpha[0]
    # print(exposure_ratio)
    on_data, off_data = get_observed_counts(observations)

    integrator = init_integrators(observations)

    print('On Data')
    display_data(on_data)
    print('Off Data')
    display_data(off_data)
    print('--' * 30)
    print(f'Fitting data for {dataset} in {len(observations)} observations.')
    print(f'Using {len(on_data)} bins with {on_data.sum()} counts in on region and {off_data.sum()} counts in off region.')
    print(f'Fit range is: {(lo, hi) * u.TeV}.')

    model = pm.Model(theano_config={'compute_test_value': 'ignore'})

    with model:
        # amplitude = pm.TruncatedNormal('amplitude', mu=4, sd=1, lower=0.01, testval=4)
        # alpha = pm.TruncatedNormal('alpha', mu=2.5, sd=1, lower=0.00, testval=2.5)
        # beta = pm.TruncatedNormal('beta', mu=0.5, sd=0.5, lower=0.00000, testval=0.5)
        amplitude = pm.HalfFlat('amplitude', testval=4)
        alpha = pm.HalfFlat('alpha', testval=2.5)
        beta = pm.HalfFlat('beta', testval=0.5)

        mu_s = forward_fold_log_parabola_symbolic(integrator, amplitude, alpha, beta, observations)
        # mu_s = forward_fold_log_parabola_analytic(amplitude, alpha, beta, observations)

        if model_type == 'wstat':
            print('Building profiled likelihood model')
            mu_b = pm.Deterministic('mu_b', calc_mu_b(mu_s, on_data, off_data, exposure_ratio))
        else:
            print('Building full likelihood model')
            mu_b = pm.HalfFlat('mu_b', shape=len(off_data))
            pm.Poisson('background', mu=mu_b, observed=off_data, shape=len(off_data))

        pm.Poisson('signal', mu=mu_s + exposure_ratio * mu_b, observed=on_data, shape=len(on_data))

    print('--' * 30)
    print('Model debug information:')
    for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

    if profile:
        model.profile(model.logpt).summary()

    print(model.check_test_point())

    print('--' * 30)
    print('Plotting landscape:')
    fig, _ = plot_landscape(model, off_data)
    fig.savefig(os.path.join(output_dir, 'landscape.pdf'))

    print('--' * 30)
    print('Printing graphs:')
    theano.printing.pydotprint(mu_s, outfile=os.path.join(output_dir, 'graph_mu_s.pdf'), format='pdf', var_with_name_simple=True)
    theano.printing.pydotprint(mu_s + exposure_ratio * mu_b, outfile=os.path.join(output_dir, 'graph_n_on.pdf'), format='pdf', var_with_name_simple=True)

    print('--' * 30)
    print('Sampling likelihood:')
    with model:
        trace = pm.sample(n_samples, cores=n_cores, tune=n_tune, init=init, seed=[seed] * n_cores)

    print('--' * 30)
    print(f'Fit results for {dataset}')
    print(trace['amplitude'].mean(), trace['alpha'].mean(), trace['beta'].mean())
    print(np.median(trace['amplitude']), np.median(trace['alpha']), np.median(trace['beta']))

    print('--' * 30)
    # print('Plotting traces')
    # plt.figure()
    # varnames = ['amplitude', 'alpha', 'beta'] if model_type != 'full' else ['amplitude', 'alpha', 'beta', 'mu_b']
    # pm.traceplot(trace, varnames=varnames)
    # plt.savefig(os.path.join(output_dir, 'traces.pdf'))

    # Write the sampler settings to small text files containing \num{} macros,
    # so they can be \input into a LaTeX document.
    p = os.path.join(output_dir, 'num_samples.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_samples}}}')

    p = os.path.join(output_dir, 'num_chains.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_cores}}}')

    p = os.path.join(output_dir, 'num_tune.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_tune}}}')

    plt.figure()
    pm.energyplot(trace)
    plt.savefig(os.path.join(output_dir, 'energy.pdf'))

    # plt.figure()
    # pm.autocorrplot(trace, burn=n_tune)
    # plt.savefig(os.path.join(output_dir, 'autocorr.pdf'))

    plt.figure()
    pm.forestplot(trace, varnames=['amplitude', 'alpha', 'beta'])
    plt.savefig(os.path.join(output_dir, 'forest.pdf'))

    trace_output = os.path.join(output_dir, 'traces')
    print(f'Saving traces to {trace_output}')
    with model:
        pm.save_trace(trace, trace_output, overwrite=True)
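# Hedged sketch (not the original implementation): the helper calc_mu_b used in the
# 'wstat' branches above is not shown in these snippets. Under the usual wstat convention
# (off_data ~ Poisson(mu_b), on_data ~ Poisson(mu_s + alpha * mu_b), with alpha the
# exposure ratio), the background that maximises the likelihood for fixed mu_s has the
# closed-form quadratic solution below. The function name is hypothetical.
import theano.tensor as tt


def calc_mu_b_sketch(mu_s, on_data, off_data, exposure_ratio):
    """Profiled per-bin background estimate (assumed wstat closed-form solution)."""
    alpha = exposure_ratio
    c = alpha * (on_data + off_data) - (alpha + 1) * mu_s
    d = tt.sqrt(c**2 + 4 * alpha * (alpha + 1) * off_data * mu_s)
    return (c + d) / (2 * alpha * (alpha + 1))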
# This fragment continues the model definition inside the previously created unpooled_model.
with unpooled_model:
    u = pm.Normal('u', 0, 0.0001)
    mu = pm.Deterministic('mu', tt.exp(beta + beta1 * x_shared + u))
    # Observed_pred = pm.Weibull("Observed_pred", alpha=mu, beta=sigma, shape=elec_faults.shape)  # posterior predictive
    Observed = pm.Weibull("Observed", alpha=sigma, beta=mu, observed=y_shared)  # observed values

    start = pm.find_MAP()
    # step = pm.Metropolis([switchpoint])
    trace2 = pm.sample(3000, start=start)

chain2 = trace2[1000:]
varnames2 = ['beta', 'early_rate', 'late_rate', 'sigma', 'u']
pm.traceplot(chain2)
plt.show()
pm.energyplot(trace2)
plt.show()
# # plot the autocorrelation curves
# pm.autocorrplot(chain2, varnames2)
# plt.show()

print(pm.dic(trace2, unpooled_model))

# x_shared.set_value([6, 6, 7])
# x_shared1.set_value([20, 40, 40])
# y_shared.set_value([0, 0, 0])

# Replace the first six year values with 2..7, update the shared covariate,
# and draw posterior predictive samples for the new inputs.
elec_year1 = np.delete(elec_year, np.s_[:6])
elec_year1 = np.append([2, 3, 4, 5, 6, 7], elec_year1)
x_shared.set_value(elec_year1)
with unpooled_model:
    trace3 = pm.sample(3000)
    post_pred = pm.sample_ppc(trace3)
# This fragment continues the model definition inside the previously created partial_model.
with partial_model:
    δ = pm.Normal('δ', 0, sd=(δ_1 * δ_1))
    # δ = pm.Normal('δ', 0, sd=20)  # use this prior for δ instead if the model converges poorly
    theta1 = pm.Deterministic('theta1', a0 + (Δ_a).cumsum())
    theta = Bx_.dot(theta1) + δ
    Observed = pm.Normal('Observed', mu=theta, sd=sigma, observed=elec_faults)  # observed values

    # start = pm.find_MAP()
    step1 = pm.Slice([tau1, a_0])
    trace2 = pm.sample(1000, tune=500, step=step1)

chain2 = trace2
varnames1 = ['a0', 'δ', 'sigma', 'tau1']
pm.plot_posterior(chain2, varnames1, kde_plot=True)
plt.show()

pm.energyplot(chain2)  # energy plot: the more the two distributions overlap, the better the model
plt.show()

# plot the autocorrelation curves
varnames1 = ['a0', 'δ', 'sigma', 'tau1']
pm.autocorrplot(chain2, varnames1)
plt.show()

print(pm.df_summary(chain2, varnames1))
print(pm.waic(trace=trace2, model=partial_model))

# ======================================================================
# Posterior analysis:
# compare the posterior against the original data
# ======================================================================
# Bx_.set_value([7,8] , [5,6])
with partial_model:
with pm.Model() as model:
    σ_a = pm.HalfCauchy('σ_a', 5.)
    a0 = pm.Normal('a0', 0., 10.)
    Δ_a = pm.Normal('Δ_a', 0., 1., shape=N_MODEL_KNOTS)
    # cumsum turns the increments into a random walk over the spline coefficients
    a = pm.Deterministic('a', a0 + (σ_a * Δ_a).cumsum())
    σ = pm.HalfCauchy('σ', 5.)
    obs = pm.Normal('obs', Bx_.dot(a), σ, observed=y)

with model:
    trace = pm.sample(nuts_kwargs={'target_accept': 0.95})

pm.energyplot(trace)
plt.show()

varnames1 = ['σ_a', 'a0', 'Δ_a', 'σ']
pm.traceplot(trace, varnames1)
plt.show()

# Posterior analysis: switch the design matrix to the plotting grid before
# drawing posterior predictive samples, so pp_trace['obs'] matches x_plot.
Bx_.set_value(basis_funcs(x_plot))
with model:
    pp_trace = pm.sample_ppc(trace, 1000)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x_plot, spline(x_plot), c='k', label="True function")
low, high = np.percentile(pp_trace['obs'], [25, 75], axis=0)
ax.fill_between(x_plot, low, high, color=red, alpha=0.5)
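# Hedged sketch (assumed helpers, not shown in the original snippets): basis_funcs and the
# shared design matrix Bx_ used above could be built along these lines with scipy B-splines
# and a theano shared variable, so the matrix can be swapped out for prediction later.
# N_MODEL_KNOTS, the knot range and the toy inputs here are illustrative assumptions.
import numpy as np
import scipy.interpolate as si
import theano

N_MODEL_KNOTS = 20                 # must match the shape used for Δ_a above
degree = 3
inner = np.linspace(0., 1., N_MODEL_KNOTS - degree + 1)
# full knot vector with repeated boundary knots, giving exactly N_MODEL_KNOTS basis functions
t = np.concatenate([[inner[0]] * degree, inner, [inner[-1]] * degree])


def basis_funcs(x):
    """Evaluate each cubic B-spline basis function at x; returns a (len(x), n_basis) matrix."""
    n_basis = len(t) - degree - 1
    basis = np.empty((len(x), n_basis))
    for i in range(n_basis):
        coeffs = np.zeros(n_basis)
        coeffs[i] = 1.0                                  # pick out the i-th basis function
        basis[:, i] = si.splev(x, (t, coeffs, degree))
    return basis


x = np.random.uniform(0., 1., size=100)                  # stand-in for the observed inputs
Bx_ = theano.shared(basis_funcs(x))                      # shared so set_value() can swap grids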
# Sampling inside the previously created partial_model.
with partial_model:
    start = pm.find_MAP()
    # step = pm.Metropolis()
    # trace2 = pm.sample(nuts_kwargs={'target_accept': 0.95})
    trace2 = pm.sample(3000, tune=1000)

chain2 = trace2
varnames1 = ['σ_a', 'σ_aB', 'theta1', 'theta1B']
pm.traceplot(chain2, varnames1)
plt.show()

varnames1 = ['a0', 'sigma', 'δ', 'δB', 'Δ_a', 'Δ_aB']
pm.traceplot(chain2, varnames1)
plt.show()

plt.plot(trace2['step_size_bar'])
plt.show()

pm.energyplot(chain2)
plt.show()

# plot the autocorrelation curves
varnames1 = ['σ_a', 'a0', 'δ', 'σ_aB', 'Δ_a', 'δB']
pm.autocorrplot(chain2, varnames1)
plt.show()

# Posterior analysis
with partial_model:
    pp_trace = pm.sample_ppc(trace2, 1000)

fig, ax = plt.subplots(figsize=(8, 6))
j, k1 = 0, 6
x_plot = np.linspace(1, k1, Num // 7)  # the number of points must be an integer
def main(input_dir, config_file, output_dir, dataset, model_type, tau, n_samples, n_tune, target_accept, n_cores, seed, init):
    prepare_output(output_dir)

    config = load_config(config_file, dataset)
    fit_range = config['fit_range']
    path = os.path.join(input_dir, dataset)
    observations = load_data(path, config)
    e_true_bins = config['e_true_bins']
    e_reco_bins = config['e_reco_bins']

    # exposure_ratio = observations[0].alpha
    # from IPython import embed; embed()
    on_data, off_data, excess, exposure_ratio = get_observed_counts(observations, fit_range=fit_range, bins=e_reco_bins)
    print(f'Exposure ratio {exposure_ratio}')
    # from IPython import embed; embed()

    # print(f'On Data {on_data.shape}\n')
    # display_data(on_data)
    # print(f'\n\n Off Data {off_data.shape}\n')
    # display_data(off_data)
    print(f'Excess {excess.shape} \n')
    idx = np.searchsorted(e_reco_bins.to_value(u.TeV), fit_range.to_value(u.TeV))
    lo, up = idx[0], idx[1] + 1
    indices = np.argwhere(excess > 5)
    display_data(excess, mark_indices=indices, low_index=lo, high_index=up)

    print('--' * 30)
    print(f'Unfolding data for: {dataset.upper()}.')
    # print(f"IRF with {len(config['e_true_bins']) - 1, len(config['e_reco_bins']) - 1}")
    print(f'Using {len(on_data)} bins with {on_data.sum()} counts in on region and {off_data.sum()} counts in off region.')

    area_scaling = 1
    print(observations[0].aeff.data.data.to_value(u.km**2).astype(np.float32) * area_scaling)

    model = pm.Model(theano_config={'compute_test_value': 'ignore'})
    with model:
        # mu_b = pm.TruncatedNormal('mu_b', shape=len(off_data), sd=5, mu=off_data, lower=0.01)
        # expected_counts = pm.Lognormal('expected_counts', shape=len(config['e_true_bins']) - 1, testval=10, sd=1)
        expected_counts = pm.TruncatedNormal('expected_counts', shape=len(e_true_bins) - 1, testval=0.5, mu=2, sd=50, lower=0.0001)
        # expected_counts = pm.HalfFlat('expected_counts', shape=len(e_true_bins) - 1, testval=10)
        # c = expected_counts / area_scaling
        mu_s = forward_fold(expected_counts, observations, fit_range=fit_range, area_scaling=area_scaling)

        if model_type == 'wstat':
            print('Building profiled likelihood model')
            mu_b = pm.Deterministic('mu_b', calc_mu_b(mu_s, on_data, off_data, exposure_ratio))
        else:
            print('Building full likelihood model')
            mu_b = pm.HalfFlat('mu_b', shape=len(off_data))
            # mu_b = pm.Lognormal('mu_b', shape=len(off_data), sd=5)
            pm.Poisson('background', mu=mu_b + 1E-5, observed=off_data)

        pm.Poisson('signal', mu=mu_s + exposure_ratio * mu_b + 1E-5, observed=on_data)

    print('--' * 30)
    print('Model debug information:')
    for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

    print('--' * 30)
    print('Sampling likelihood:')
    with model:
        trace = pm.sample(n_samples, cores=n_cores, tune=n_tune, init=init, seed=[seed] * n_cores)

    trace_output = os.path.join(output_dir, 'traces')
    print(f'Saving traces to {trace_output}')
    with model:
        pm.save_trace(trace, trace_output, overwrite=True)

    print('--' * 30)
    print('Plotting result')
    # print(area_scaling)
    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(10, 7), sharex=True, gridspec_kw={'height_ratios': [3, 1]})
    plot_unfolding_result(trace, bins=e_true_bins, area_scaling=area_scaling, fit_range=fit_range, ax=ax1)
    plot_excees(excess, bins=config['e_reco_bins'], ax=ax2)
    plt.savefig(os.path.join(output_dir, 'result.pdf'))

    print('--' * 30)
    print('Plotting Diagnostics')
    print(pm.summary(trace).round(2))

    # plt.figure()
    # pm.traceplot(trace)
    # plt.savefig(os.path.join(output_dir, 'traces.pdf'))

    plt.figure()
    pm.energyplot(trace)
    plt.savefig(os.path.join(output_dir, 'energy.pdf'))
    # try:
    #     plt.figure()
    #     pm.autocorrplot(trace, burn=n_tune)
    #     plt.savefig(os.path.join(output_dir, 'autocorr.pdf'))
    # except:
    #     print('Could not plot autocorrelation')

    plt.figure()
    pm.forestplot(trace)
    plt.savefig(os.path.join(output_dir, 'forest.pdf'))

    p = os.path.join(output_dir, 'num_samples.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_samples}}}')

    p = os.path.join(output_dir, 'num_chains.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_cores}}}')

    p = os.path.join(output_dir, 'num_tune.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_tune}}}')
# step = pm.Metropolis()
trace3 = pm.sample(5000, start=start, tune=2000)

chain3 = trace3[2000:]
varnames1 = ['beta', 'beta1', 'beta2', 'beta3', 'beta4', 'beta5']
pm.traceplot(chain3, varnames1)
plt.show()
print(pm.df_summary(trace3, varnames1))

varnames1 = ['sigma', 'mu_a', 'sigma_a', 'theta1', 'beta12']
pm.traceplot(chain3, varnames1)
plt.show()
print(pm.df_summary(trace3, varnames1))

# plot the autocorrelation curves
pm.autocorrplot(chain3)
plt.show()
pm.energyplot(chain3)
plt.show()

post_beta = chain3['beta']          # beta has three rows of data
post_beta1 = chain3['beta'][:, 2]   # indexing a single column like this works
post_beta2 = chain3['beta'][1]      # this way of indexing should work as well

# # plot the correlations between parameters
# tracedf = pm.trace_to_dataframe(trace3, varnames=['beta1', 'beta2', 'beta3', 'beta4', 'beta5'])
# sns.pairplot(tracedf)
# plt.show()

# ======================================================================
# Model comparison and posterior analysis
# ======================================================================
# Waic = pm.compare([traces_ols_glm, trace1], [mdl_ols_glm, pooled_model], ic='WAIC')
# Likelihood of the observed scores, added to the model defined earlier.
with model:
    home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_score)
    away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_score)

with model:
    trace = pm.sample(1000, tune=1000, cores=3)

pm.traceplot(trace)

# check for convergence
bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
(pm.energyplot(trace, legend=False, figsize=(6, 4))
   .set_title("BFMI = {}\nGelman-Rubin = {}".format(bfmi, max_gr)))

# results
df_hpd = pd.DataFrame(pm.stats.hpd(trace['atts']),
                      columns=['hpd_low', 'hpd_high'],
                      index=teams.team.values)
df_median = pd.DataFrame(pm.stats.quantiles(trace['atts'])[50],
                         columns=['hpd_median'],
                         index=teams.team.values)
df_hpd = df_hpd.join(df_median)
df_hpd['relative_lower'] = df_hpd.hpd_median - df_hpd.hpd_low
df_hpd['relative_upper'] = df_hpd.hpd_high - df_hpd.hpd_median
df_hpd = df_hpd.sort_values(by='hpd_median')
df_hpd = df_hpd.reset_index()
df_hpd['x'] = df_hpd.index + .5
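# Hedged sketch (assumed continuation, not original code): one way to visualise the
# per-team attack strengths from the df_hpd frame built above. After reset_index() the
# team names end up in the column named 'index'.
fig, axs = plt.subplots(figsize=(10, 4))
axs.errorbar(df_hpd.x, df_hpd.hpd_median,
             yerr=(df_hpd[['relative_lower', 'relative_upper']].values).T,
             fmt='o')  # posterior median with HPD interval per team
axs.set_title('HPD of attack strength, by team')
axs.set_xlabel('Team')
axs.set_ylabel('Posterior attack strength')
axs.set_xticks(df_hpd.index + .5)
axs.set_xticklabels(df_hpd['index'].values, rotation=45)
plt.show()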
# ================================================================================
# Return a B-spline basis element B(x | t[0], ..., t[k+1])
xx = np.linspace(1, 15, Num)
b = sp.interpolate.BSpline.basis_element(knots[1:])
print(b)

fig, ax = plt.subplots()
x = np.linspace(0, 12, 200)
ax.plot(x, b(x), 'g', lw=3)
ax.grid(True)
plt.show()

pm.traceplot(trace_1)
plt.show()

ax = pm.energyplot(trace_1)
bfmi = pm.bfmi(trace_1)
ax.set_title(f"BFMI = {bfmi:.2f}")
plt.show()

varnames2 = ['δ', 'δB', 'δC']
tmp0 = pm.df_summary(trace_1, varnames2)
print(tmp0)

# ================================================================================
Bx_.set_value(basis_funcs(xs_yearA.get_value()))

# Comparison model: replace the first-order regression with a Gaussian random walk.
with pm.Model() as model_3:
    # define priors
    alpha3 = pm.HalfCauchy('alpha3', 10., testval=1.15)
    beta0 = pm.GaussianRandomWalk('beta0', sd=1, shape=Num_5)