def plot_ind(self, var_names: Union[str, List[str]] = None,
             show_density: bool = True, credible_interval: float = 0.94):
    """Plot individual posterior distributions using ArviZ.

    Parameters
    ----------
    var_names
        Parameter(s) to plot. If not specified, all model parameters are shown.
    show_density
        Whether to show the density. Defaults to True.
    credible_interval
        Credible interval to plot. Defaults to 0.94.
    """
    if var_names is None:
        var_names = list(self.parameters_desc)

    if show_density:
        kind = 'ridgeplot'
    else:
        kind = 'forestplot'

    az.plot_forest(self.fit,
                   kind=kind,
                   var_names=var_names,
                   credible_interval=credible_interval,
                   combined=True,
                   colors='gray',
                   ridgeplot_alpha=0.8)
    plt.show()
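# A minimal usage sketch, assuming `model` is an already-fitted instance of the
# class defining plot_ind above (the instance name and the 'alpha' parameter are
# hypothetical, for illustration only).
model.plot_ind()  # ridgeplot of every parameter in parameters_desc
model.plot_ind(var_names='alpha', show_density=False,
               credible_interval=0.89)  # classic forest plot of one parameter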
def forest(self, query='opsin=="chr2" & delay_length==60'):
    trace_post_query = utils.query_posterior(trace=self.trace,
                                             posterior=self.posterior,
                                             query=query) if query else \
        self.trace.posterior['mu_per_condition']
    az.plot_forest(trace_post_query, combined=True, kind='ridgeplot',
                   ridgeplot_alpha=.5)
def run(n_samples=1000):
    model = build_model()
    with model:
        trace = pm.sample(draws=n_samples, tune=1000, target_accept=0.99)
    az.plot_trace(trace)
    az.plot_forest(trace)
def plot_model_comparison_CIs(model_res_dict):
    var_names = [
        'remr_lnVR', 'rema_lnVR', 'fema_lnVR', 'rema_lnCVR', 'fema_lnCVR'
    ]
    data = [
        az.convert_to_dataset(
            {model: np.exp(model_res_dict[model].posterior.mu.values)})
        for model in var_names
    ]
    _ = az.plot_forest(data,
                       combined=True,
                       hdi_prob=0.95,
                       quartiles=True,
                       colors='black',
                       figsize=(10, 4),
                       var_names=var_names,
                       model_names=len(var_names) * [''])
    plt.xlim(0.78, 1.23)
    plt.title('95% HDI for meta-analytic direct effect $e^\\mu$')
    plt.grid()
    plt.savefig(os.path.join(parent_dir_name,
                             'output/hdi_model_comparison.tiff'),
                format='tiff',
                dpi=500,
                bbox_inches="tight")
def plot_forest(trace, variable, var_name=None):
    fig, ax = az.plot_forest(trace, var_names=variable,
                             credible_interval=0.95)
    ax[0].set_title('')
    ax[0].set_title('95% credible intervals', size=15, loc="left")
    ax[0].spines['left'].set_visible(True)
    if var_name is not None:
        ax[0].set_yticklabels(var_name)
    ax[0].tick_params()
    return fig, ax
def plot_forest(self):
    if not (self.mcmc_ and self.data_):
        raise AttributeError('Object needs to be fit first.')
    else:
        _ = az.plot_forest(  # NOQA
            self.data_,
            var_names=['mu', 'sigma', 'log_nu'],
            credible_interval=0.95,
            figsize=(10, 10))
        plt.show()
def hdi_param(m_idata, model_type, prior_level):
    fig, ax = plt.subplots(figsize=(10, 7))
    az.plot_forest(
        m_idata,
        var_names=["alpha", "beta", "sigma"],
        combined=True,  # combine chains
        kind='ridgeplot',  # show the full distributions instead of the default forest plot
        ridgeplot_truncate=False,  # do show the tails
        hdi_prob=.8,  # 80% HDI
        ridgeplot_alpha=0.5,  # transparency
        ridgeplot_quantiles=[0.5],  # mark the median
        ax=ax  # add to our axis
    )
    fig.suptitle("Python/pyMC3: HDI intervals for parameters")
    fig.tight_layout()
    plt.savefig(f"../plots_python/{model_type}_{prior_level}_HDI_param.jpeg",
                dpi=300)
def analyze_post(post, method):
    print_summary(post, 0.95, False)

    fig, ax = plt.subplots()
    az.plot_forest(post, hdi_prob=0.95, figsize=(10, 4), ax=ax)
    plt.title(method)
    pml.savefig(f'multicollinear_forest_plot_{method}.pdf')
    plt.show()

    # post = m6_1.sample_posterior(random.PRNGKey(1), p6_1, (1000,))
    fig, ax = plt.subplots()
    az.plot_pair(post, var_names=["br", "bl"],
                 scatter_kwargs={"alpha": 0.1}, ax=ax)
    plt.title(method)  # set the title before saving so it appears in the file
    pml.savefig(f'multicollinear_joint_post_{method}.pdf')
    plt.show()

    sum_blbr = post["bl"] + post["br"]
    fig, ax = plt.subplots()
    az.plot_kde(sum_blbr, label="sum of bl and br", ax=ax)
    plt.title(method)
    pml.savefig(f'multicollinear_sum_post_{method}.pdf')
    plt.show()
def plot_param_diagnostics(mod, incl_noise_params=False, incl_trend_params=False,
                           incl_smooth_params=False, which='trace', **kwargs):
    """
    Parameters
    ----------
    mod : orbit model object
    which : str, {'density', 'trace', 'pair', 'autocorr', 'posterior', 'forest'}
        Type of diagnostic plot to draw.
    incl_noise_params : bool
        Whether to plot noise parameters; default False.
    incl_trend_params : bool
        Whether to plot trend parameters; default False.
    incl_smooth_params : bool
        Whether to plot smoothing parameters; default False.
    **kwargs :
        Other parameters passed to the arviz plotting functions.

    Returns
    -------
    matplotlib axes object
    """
    posterior_samples = get_arviz_plot_dict(
        mod,
        incl_noise_params=incl_noise_params,
        incl_trend_params=incl_trend_params,
        incl_smooth_params=incl_smooth_params)

    if which == "trace":
        axes = az.plot_trace(posterior_samples, **kwargs)
    elif which == "density":
        axes = az.plot_density(posterior_samples, **kwargs)
    elif which == "posterior":
        axes = az.plot_posterior(posterior_samples, **kwargs)
    elif which == "pair":
        axes = az.plot_pair(posterior_samples, **kwargs)
    elif which == "autocorr":
        axes = az.plot_autocorr(posterior_samples, **kwargs)
    elif which == "forest":
        axes = az.plot_forest(posterior_samples, **kwargs)
    else:
        raise Exception(
            "please use one of 'trace', 'density', 'posterior', 'pair', "
            "'autocorr', 'forest' for `which`.")

    return axes
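# A hedged usage sketch, assuming `mod` is an already-fitted orbit model object
# as described in the docstring; the keyword values below are illustrative only.
axes = plot_param_diagnostics(mod, which='forest', incl_trend_params=True)
axes = plot_param_diagnostics(mod, which='trace')  # default diagnostic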
def plot_model_comparison_CIs(model_res_dict):
    fig, ax = plt.subplots(nrows=1)
    datasets = [
        az.convert_to_dataset({
            drug_class: np.exp(model_res_dict[drug_class].posterior.mu.values)
        }) for drug_class in DRUG_CLASSES
    ]
    _ = az.plot_forest(datasets,
                       combined=True,
                       credible_interval=0.95,
                       quartiles=True,
                       colors='black',
                       var_names=DRUG_CLASSES,
                       model_names=['', '', '', ''],
                       ax=ax)
    ax.set_title('95% HDI $e^\\mu$')
    plt.tight_layout()
    plt.savefig(os.path.join(parent_dir_name,
                             'output/hdi_drug_class_comparison.tiff'),
                format='tiff',
                dpi=500,
                bbox_inches="tight")
    return plt
def run_model(month=7, n_samples=1000, interp_type='ncs', binary=True,
              spike=0.9, hdi_prob=0.95, zero_inf=0.7):
    # preprocessing
    binary_str = 'binary' if binary else 'nonbinary'
    df = pd.read_csv('../data/' + interp_type + '-pop-deaths-and-' +
                     binary_str + '-mandates.csv', index_col=0)
    df = df.rename(columns={
        "Age Group": "Age_Group",
        "COVID-19 Deaths": "covid_19_deaths"
    })
    test_df = df[df["Month"] == month]
    sex = np.array(test_df["Sex"])
    mandates = test_df.iloc[:, -4:]  # takes all 4 mandate columns that currently exist
    age = test_df["Age_Group"]
    covid_deaths = test_df["covid_19_deaths"]
    population = test_df["Population"] / 1000000  # population in units of millions
    n = len(test_df["Age_Group"].unique())  # should decrease by 1 after proper age filtering
    age_data = pd.get_dummies(test_df["Age_Group"]).drop("Under 1 year", axis=1)
    sex_data = pd.get_dummies(test_df["Sex"], drop_first=True)

    # run the model
    with pm.Model() as model:
        # spike and slab prior
        tau = pm.InverseGamma('tau', alpha=20, beta=20)
        xi = pm.Bernoulli('xi', p=spike, shape=len(mandates.columns))
        beta_mandates = pm.MvNormal('beta_mandate', mu=0,
                                    cov=tau * np.eye(len(mandates.columns)),
                                    shape=len(mandates.columns))

        # age prior
        mu_age_mean = np.linspace(-5, 5, len(age_data.columns))
        cov = pm.HalfNormal('cov', sigma=2)
        mu_age = pm.MvNormal('mu_age', mu=mu_age_mean,
                             cov=np.identity(len(age_data.columns)),
                             shape=(1, 10))
        beta_age = pm.MvNormal('beta_age', mu=mu_age,
                               cov=(cov**2) * np.identity(10),
                               shape=(1, 10))

        # sex prior
        mu_sex = pm.Normal('mu_sex', mu=0, sigma=1)
        sigma_sex = pm.HalfNormal('sigma_sex', sigma=2)
        beta_sex = pm.Normal('beta_sex', mu=mu_sex, sigma=sigma_sex)

        # intercept prior
        mu_intercept = pm.Normal('mu_intercept', mu=0, sigma=1)
        sigma_intercept = pm.HalfNormal('sigma_intercept', sigma=2)
        beta_intercept = pm.Normal('beta_intercept', mu=mu_intercept,
                                   sigma=sigma_intercept)

        # mean setup for likelihood
        mandates = np.array(mandates).astype(theano.config.floatX)
        population = np.array(population).astype(theano.config.floatX)
        sex = np.array(sex_data).astype(theano.config.floatX)
        age = np.array(age_data).astype(theano.config.floatX)
        w_mandates = theano.shared(mandates, 'w_mandate')
        w_sex = theano.shared(sex, 'w_sex')
        w_age = theano.shared(age, 'w_age')
        mean = beta_intercept + pm.math.matrix_dot(w_mandates, xi * beta_mandates) \
            + pm.math.matrix_dot(w_sex, beta_sex).T \
            + pm.math.matrix_dot(w_age, beta_age.T).T

        # likelihood
        obs = pm.ZeroInflatedPoisson('y_obs', psi=zero_inf,
                                     theta=population * tt.exp(mean),
                                     observed=covid_deaths)
        # obs = pm.Normal('crap', mu=mean, sigma=3, observed=covid_deaths)

        # sample from posterior
        trace = pm.sample(n_samples, tune=n_samples,
                          nuts={'target_accept': 0.98})

    # posterior HDIs
    mandates = test_df.iloc[:, -4:]
    x = az.summary(trace, var_names=["beta_mandate"], hdi_prob=hdi_prob)
    x.index = mandates.columns
    x.to_csv('../images/posteriors/mandate_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')
    x = az.summary(trace, var_names=["beta_sex"], hdi_prob=hdi_prob)
    x.index = sex_data.columns
    x.to_csv('../images/posteriors/sex_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')
    x = az.summary(trace, var_names=["beta_age"], hdi_prob=hdi_prob)
    x.index = age_data.columns
    x.to_csv('../images/posteriors/age_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')
    x = az.summary(trace, var_names=["beta_intercept"], hdi_prob=hdi_prob)
    x.to_csv('../images/posteriors/intercept_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')

    # posterior distributions
    ax = az.plot_forest(trace, 'ridgeplot', var_names=["beta_intercept"],
                        combined=True, hdi_prob=0.99999)
    ax[0].set_title(r'Posterior Distribution of $\beta_0$')
    plt.savefig('../images/posteriors/intercept_posteriors_' + interp_type +
                '_' + binary_str + '.png')
    ax = az.plot_forest(trace, 'ridgeplot', var_names=["beta_age"],
                        combined=True, hdi_prob=0.99999)
    ax[0].set_yticklabels(reversed(age_data.columns))
    ax[0].set_title(r'Posterior Distribution of $\beta_{age}$')
    plt.savefig('../images/posteriors/age_posteriors_' + interp_type + '_' +
                binary_str + '.png')
    ax = az.plot_forest(trace, 'ridgeplot', var_names=["beta_sex"],
                        combined=True, hdi_prob=0.99999)
    ax[0].set_yticklabels(reversed(sex_data.columns))
    ax[0].set_title(r'Posterior Distribution of $\beta_{sex}$')
    plt.savefig('../images/posteriors/sex_posteriors_' + interp_type + '_' +
                binary_str + '.png')
    ax = az.plot_forest(trace, 'ridgeplot', var_names=["beta_mandate"],
                        combined=True, hdi_prob=0.99999)
    ax[0].set_yticklabels(reversed(mandates.columns))
    ax[0].set_title(r'Posterior Distribution of $\beta_{mandate}$')
    plt.savefig('../images/posteriors/mandate_posteriors_' + interp_type + '_' +
                binary_str + '.png')

    # ESS plots
    ax = az.plot_ess(trace, var_names=["beta_intercept"])
    ax.set_title(r'$\beta_0$ ESS')
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_interceptESS.png')
    ax = az.plot_ess(trace, var_names=["beta_age"])
    ax[0, 0].set_title(r'$\beta_{age[1-4]}$ ESS', fontsize=18)
    ax[0, 1].set_title(r'$\beta_{age[15-24]}$ ESS', fontsize=18)
    ax[0, 2].set_title(r'$\beta_{age[25-34]}$ ESS', fontsize=18)
    ax[1, 0].set_title(r'$\beta_{age[35-44]}$ ESS', fontsize=18)
    ax[1, 1].set_title(r'$\beta_{age[45-54]}$ ESS', fontsize=18)
    ax[1, 2].set_title(r'$\beta_{age[5-14]}$ ESS', fontsize=18)
    ax[2, 0].set_title(r'$\beta_{age[55-64]}$ ESS', fontsize=18)
    ax[2, 1].set_title(r'$\beta_{age[65-74]}$ ESS', fontsize=18)
    ax[2, 2].set_title(r'$\beta_{age[75-84]}$ ESS', fontsize=18)
    ax[3, 0].set_title(r'$\beta_{age[85+]}$ ESS', fontsize=18)
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_ageESS.png')
    ax = az.plot_ess(trace, var_names=["beta_sex"])
    ax.set_title(r'$\beta_{sex}$ ESS')
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_sexESS.png')
    ax = az.plot_ess(trace, var_names=["beta_mandate"])
    ax[0].set_title(r'$\beta_{mandate[April]}$ ESS', fontsize=18)
    ax[1].set_title(r'$\beta_{mandate[May]}$ ESS', fontsize=18)
    ax[2].set_title(r'$\beta_{mandate[June]}$ ESS', fontsize=18)
    ax[3].set_title(r'$\beta_{mandate[July]}$ ESS', fontsize=18)
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_mandateESS.png')

    # posterior predictive checking
    with model:
        ppc = pm.sample_posterior_predictive(trace, var_names=["y_obs"])
    az.plot_ppc(az.from_pymc3(posterior_predictive=ppc, model=model))
    plt.savefig('../images/posterior_predictive/' + interp_type + '_' +
                binary_str + '.png')

    # return trace so that the user can work with the posterior data directly
    return trace
def conduct_bayesian(observations_file_path, mu_init, beta_init):
    df = pd.read_csv(observations_file_path)
    # Get list of unique damage state values:
    ds_list = df['DS Number'].unique()
    for ds in range(0, len(ds_list)):
        df_sub = df.loc[df['DS Number'] == ds_list[ds]]
        xj = np.array(df_sub['demand'])
        zj = np.array(df_sub['fail'])
        nj = np.array(df_sub['total'])
        mu_ds = mu_init[ds]
        beta_ds = beta_init[ds]
        with pm.Model() as model:
            # Set up the prior:
            mu = pm.Normal('mu', mu_ds, 2.71)
            beta = pm.Normal('beta', beta_ds, 0.03)

            # Define the fragility function equation:
            def normal_cdf(mu, beta, xj):
                """Compute the lognormal fragility: the normal CDF evaluated at (log(xj) - mu) / beta."""
                return 0.5 * (1 + tt.erf((tt.log(xj) - mu) / (beta * tt.sqrt(2))))

            # Define the likelihood:
            # like = pm.Binomial('like', p=p, observed=zj, n=nj)
            like = pm.Binomial('like', p=normal_cdf(mu, beta, xj),
                               observed=zj, n=nj)
            for RV in model.basic_RVs:
                print(RV.name, RV.logp(model.test_point))
            # Determine the posterior
            trace = pm.sample(2000, cores=1, return_inferencedata=True)

            # Posterior predictive checks are a great way to validate the model:
            # generate data from the model using parameters drawn from the posterior.
            ppc = pm.sample_posterior_predictive(
                trace, var_names=['mu', 'beta', 'like'])

        # Calculate failure probabilities using the samples:
        im = np.arange(70, 200, 5)
        pf_ppc = []
        for i in range(0, len(ppc['mu'])):
            y = pf(im, ppc['mu'][i], ppc['beta'][i])
            pf_ppc.append(y)

        # Plot the HPD:
        _, ax = plt.subplots()
        az.plot_hdi(im, pf_ppc, fill_kwargs={
            'alpha': 0.2,
            'color': 'blue',
            'label': 'bounds of prediction: 94% HPD'
        })

        # Calculate and plot the mean outcome:
        pf_mean = pf(im, ppc['mu'].mean(), ppc['beta'].mean())
        ax.plot(im, pf_mean, label='mean of prediction', color='r',
                linestyle='dashed')

        # Plot the mean of the simulation-based fragility:
        pf_sim = pf(im, mu_ds, beta_ds)
        ax.plot(im, pf_sim, label='simulation-based', color='k')

        # Plot the observations:
        ax.scatter(xj, zj / nj, color='r', marker='^', label='observations')
        ax.legend()
        plt.show()

        # Compare the priors of the parameters with the updated distributions:
        new_mu_mean, new_mu_std = norm.fit(ppc['mu'])
        plt.hist(ppc['mu'], bins=25, density=True, alpha=0.4, color='b')
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p_prior = norm.pdf(x, mu_ds, 2.71)
        p_new = norm.pdf(x, new_mu_mean, new_mu_std)
        plt.plot(x, p_prior, 'k', linewidth=2, label='prior distribution')
        plt.plot(x, p_new, 'r', linewidth=2, label='updated distribution',
                 linestyle='dashed')
        # Note: az.plot_violin(trace, var_names=['mu']) can be helpful for seeing
        # the distribution of parameter values.

        # Plot the posterior distributions of each RV
        fig, ax = plt.subplots()
        az.plot_trace(trace, chain_prop={'color': ['blue', 'red']})
        az.plot_posterior(trace)
        az.plot_forest(trace, var_names=['mu', 'beta'])
        plt.show()
        print(az.summary(trace))
def main(args):
    print("Loading data...")
    teams, df = load_data()
    nt = len(teams)
    train = df[df["split"] == "train"]

    print("Starting inference...")
    with pm.Model() as model:
        # priors
        alpha = pm.Normal("alpha", mu=0, sigma=1)
        sd_att = pm.HalfStudentT("sd_att", nu=3, sigma=2.5)
        sd_def = pm.HalfStudentT("sd_def", nu=3, sigma=2.5)
        home = pm.Normal("home", mu=0, sigma=1)  # home advantage

        # team-specific model parameters
        attack = pm.Normal("attack", mu=0, sigma=sd_att, shape=nt)
        defend = pm.Normal("defend", mu=0, sigma=sd_def, shape=nt)

        # data
        home_id = pm.Data("home_data", train["Home_id"])
        away_id = pm.Data("away_data", train["Away_id"])

        # likelihood
        theta1 = tt.exp(alpha + home + attack[home_id] - defend[away_id])
        theta2 = tt.exp(alpha + attack[away_id] - defend[home_id])

        pm.Poisson("s1", mu=theta1, observed=train["score1"])
        pm.Poisson("s2", mu=theta2, observed=train["score2"])

    with model:
        fit = pm.sample(
            draws=args.num_samples,
            tune=args.num_warmup,
            chains=args.num_chains,
            cores=args.num_cores,
            random_seed=args.rng_seed,
        )

    print("Analyse posterior...")
    az.plot_forest(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )
    az.plot_trace(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=fit["attack"].mean(axis=0),
        attacksd=fit["attack"].std(axis=0),
        defend=fit["defend"].mean(axis=0),
        defendsd=fit["defend"].std(axis=0),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )
    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    with model:
        pm.set_data({"home_data": predict["Home_id"]})
        pm.set_data({"away_data": predict["Away_id"]})
        predicted_score = pm.sample_posterior_predictive(
            fit, var_names=["s1", "s2"], random_seed=1)

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=predicted_score["s1"].mean(axis=0).round(),
        score1error=predicted_score["s1"].std(axis=0),
        score2=predicted_score["s2"].mean(axis=0).round(),
        score2error=predicted_score["s2"].std(axis=0),
    )
    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"]))

    print(score_table(df))
    print(score_table(predicted_full))
# models. Here we'll just use the [arviz](https://arviz-devs.github.io/arviz/index.html)
# package to explore the credible intervals of each of the covariance and
# variance parameters, although it makes available a much wider set of tools
# for analysis.
import arviz as az

# Collect the observation error covariance parameters
az_obs_cov = az.convert_to_inference_data({
    ('Var[%s]' % mod.endog_names[i] if i == j else
     'Cov[%s, %s]' % (mod.endog_names[i], mod.endog_names[j])):
    store_obs_cov[nburn + 1:, i, j]
    for i in range(mod.k_endog) for j in range(i, mod.k_endog)
})

# Plot the credible intervals
az.plot_forest(az_obs_cov, figsize=(8, 7))

# Collect the state innovation variance parameters
az_state_cov = az.convert_to_inference_data({
    r'$\sigma^2$[%s]' % mod.state_names[i]: store_state_cov[nburn + 1:, i]
    for i in range(mod.k_states)
})

# Plot the credible intervals
az.plot_forest(az_state_cov, figsize=(8, 7))

# ### Appendix: performance
#
# Finally, we run a few simple tests to compare the performance of the KFS
# and CFA simulation smoothers by using the `%timeit` Jupyter notebook
# magic.
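# The same InferenceData objects can also be fed to other ArviZ utilities.
# A small supplementary sketch (reusing `az_obs_cov` from above, and assuming
# an ArviZ version in which `az.summary` accepts `hdi_prob`):
print(az.summary(az_obs_cov, hdi_prob=0.90))  # posterior means and 90% HDIs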
""" Ridgeplot ========= _thumb: .8, .5 """ import matplotlib.pyplot as plt import arviz as az az.style.use("arviz-darkgrid") rugby_data = az.load_arviz_data("rugby") axes = az.plot_forest( rugby_data, kind="ridgeplot", var_names=["defs"], linewidth=4, combined=True, ridgeplot_overlap=1.5, colors="blue", figsize=(9, 4), ) axes[0].set_title("Relative defensive strength\nof Six Nation rugby teams") plt.show()
def main(args):
    print("Loading data...")
    teams, df = load_data()
    train = df[df["split"] == "train"]

    print("Starting inference...")
    rng_key = random.PRNGKey(args.rng_seed)
    mcmc = run_inference(
        model,
        train["Home_id"].values,
        train["Away_id"].values,
        train["score1"].values,
        train["score2"].values,
        rng_key,
        args,
    )

    fit = az.from_numpyro(mcmc)
    print("Analyse posterior...")
    az.plot_forest(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )
    az.plot_trace(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    fit = mcmc.get_samples()

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=fit["attack"].mean(axis=0),
        attacksd=fit["attack"].std(axis=0),
        defend=fit["defend"].mean(axis=0),
        defendsd=fit["defend"].std(axis=0),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )
    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    predictive = Predictive(model, fit, return_sites=["s1", "s2"])
    predicted_score = predictive(
        random.PRNGKey(0),
        home_id=predict["Home_id"].values,
        away_id=predict["Away_id"].values,
    )

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=predicted_score["s1"].mean(axis=0).round(),
        score1error=predicted_score["s1"].std(axis=0),
        score2=predicted_score["s2"].mean(axis=0).round(),
        score2error=predicted_score["s2"].std(axis=0),
    )
    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"]))

    print(score_table(df))
    print(score_table(predicted_full))
prior = pm.sample_prior_predictive(samples=30)
posterior_1 = pm.sample()
posterior_pred_1 = pm.sample_posterior_predictive(posterior_1)
pm.traceplot(posterior_1);

data = az.from_pymc3(trace=posterior_1, prior=prior,
                     posterior_predictive=posterior_pred_1)
data

az.style.use('arviz-darkgrid')
fig, axes = az.plot_forest(data, kind='ridgeplot', combined=False,
                           ridgeplot_overlap=2, colors='white',
                           figsize=(10, 3))
axes[0].set_title('model_1 posterior parameter distributions');

df = pd.DataFrame()
df = df.assign(alpha=pd.Series(prior['alpha']),
               sigma=pd.Series(prior['sigma']),
               beta=pd.Series(prior['beta']))
df.head()

priors = pd.DataFrame()
for i in range(df.shape[0]):
    priors['prior_' + str(i)] = df.loc[i, 'alpha'] + df.loc[i, 'beta'] * d['area']
beta_mean = np.mean(betas)
hyper_mean = alpha_mean / (alpha_mean + beta_mean)
print('hyper mean')
print(hyper_mean)

hyper_mean2 = np.mean(alphas / (alphas + betas))
print(hyper_mean2)

mle = G_samples / N_samples
pooled_mle = np.sum(G_samples) / np.sum(N_samples)
print('pooled mle')
print(pooled_mle)

axes = az.plot_forest(trace_h, var_names='θ', hdi_prob=0.95,
                      combined=False, colors='cycle')
y_lims = axes[0].get_ylim()
#axes[0].vlines(post_hyper_mean, *y_lims)
pml.savefig('hbayes_binom_covid_forest.png', dpi=300)

fig, axs = plt.subplots(4, 1, figsize=(8, 8))
axs = np.reshape(axs, 4)
xs = np.arange(J)

ax = axs[0]
ax.bar(xs, G_samples)
ax.set_ylim(0, 5)
ax.set_title('number of cases (truncated at 5)')

ax = axs[1]
ax.bar(xs, N_samples)
def main():
    parser = argparse.ArgumentParser(
        description='Train PMF on CSV-formatted count matrix')
    parser.add_argument('-f', '--csv-file', nargs='?', type=str,
                        help="Enter the CSV file")
    parser.add_argument('-e', '--epoch', nargs='?', type=int, default=300,
                        help='Enter Epoch value. Default: 300')
    parser.add_argument('-d', '--dimension', nargs='?', type=int, default=2,
                        help='Enter embedding dimension. Default: 2')
    parser.add_argument('-b', '--batch-size', nargs='?', type=int, default=5000,
                        help='Enter batch size. Default: 5000')
    parser.add_argument('-lr', '--learning-rate', nargs='?', type=float,
                        default=0.01, help='Enter float. Default: 0.01')
    parser.add_argument('-c', '--clip-value', nargs='?', type=float, default=3.,
                        help='Gradient clip value. Default: 3.0')
    parser.add_argument('-lt', '--log-transform', help='Log-transform?',
                        action='store_true')
    parser.add_argument('-rn', '--row-normalize',
                        help='Row normalize based on counts?',
                        action='store_true')
    args = parser.parse_args(sys.argv[1:])

    if args.csv_file is None:
        sys.exit("You need to specify a csv file")
    elif not os.path.exists(args.csv_file):
        sys.exit("File doesn't exist")
    else:
        _FILENAME = args.csv_file

    _BATCH_SIZE = args.batch_size
    _LOG_TRANSFORM = args.log_transform
    _EPOCH_NUMBER = args.epoch
    _DIMENSION = args.dimension
    _LEARNING_RATE = args.learning_rate
    _ROW_NORMALIZE = args.row_normalize
    _CLIP_VALUE = args.clip_value

    with open(_FILENAME) as f:
        csv_file = csv.reader(f)
        columns = len(next(csv_file))

    csv_data0 = tf.data.experimental.CsvDataset(_FILENAME,
                                                [tf.float64] * columns)
    csv_data0 = csv_data0.enumerate()
    csv_data = csv_data0.map(
        lambda j, *x: {
            'indices': j,
            'counts': tf.squeeze(tf.stack(x, axis=-1))
        })

    # Grab a batch to compute statistics
    colsums = []
    batch_sizes = []
    N = 0
    for batch in iter(csv_data.batch(_BATCH_SIZE, drop_remainder=False)):
        colsums += [tf.reduce_sum(batch['counts'], axis=0, keepdims=True)]
        N += batch['counts'].shape[0]
    colsums = tf.add_n(colsums)
    colmeans = colsums / N
    rowmean = tf.reduce_sum(colmeans)

    if _ROW_NORMALIZE:
        csv_data = csv_data0.map(
            lambda j, *x: {
                'indices': j,
                'counts': tf.squeeze(tf.stack(x, axis=-1)),
                'normalization': tf.reduce_max([tf.reduce_sum(x), 1.]) / rowmean
            })

    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=True)
    csv_data_batched = csv_data_batched.prefetch(tf.data.experimental.AUTOTUNE)

    factor = PoissonMatrixFactorization(
        csv_data_batched, latent_dim=_DIMENSION, strategy=None,
        scale_columns=True, log_transform=_LOG_TRANSFORM,
        column_norms=colmeans,
        u_tau_scale=1.0 / np.sqrt(columns * N),
        dtype=tf.float64)
    factor.calibrate_advi(
        num_epochs=_EPOCH_NUMBER, rel_tol=1e-4,
        clip_value=_CLIP_VALUE, learning_rate=_LEARNING_RATE)

    print("Saving the encoding matrix")
    filename = f"{_FILENAME}_{_DIMENSION}D_encoding"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"
    with open(filename, "w") as f:
        writer = csv.writer(f)
        encoding = factor.encoding_matrix().numpy().T
        for row in range(encoding.shape[0]):
            writer.writerow(encoding[row, :])

    print("Saving the trained model object")
    filename = f"{_FILENAME}_{_DIMENSION}D_model"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pkl"
    factor.save(filename)

    print("Saving figure with the encodings")
    fig, ax = plt.subplots(1, 2, figsize=(14, 8))
    D = factor.feature_dim
    pcm = ax[0].imshow(
        factor.encoding_matrix().numpy()[::-1, :], vmin=0, cmap="Blues")
    ax[0].set_yticks(np.arange(factor.feature_dim))
    ax[0].set_yticklabels(np.arange(factor.feature_dim))
    ax[0].set_ylabel("item")
    ax[0].set_xlabel("factor dimension")
    ax[0].set_xticks(np.arange(_DIMENSION))
    ax[0].set_xticklabels(np.arange(_DIMENSION))

    surrogate_samples = factor.surrogate_distribution.sample(250)
    if 's' in surrogate_samples.keys():
        weights = surrogate_samples['s'] / \
            tf.reduce_sum(surrogate_samples['s'], -2, keepdims=True)
        intercept_data = az.convert_to_inference_data({
            r"": (
                tf.squeeze(surrogate_samples['w']) * weights[:, -1, :]
                * factor.eta_i
            ).numpy().T})
    else:
        intercept_data = az.convert_to_inference_data({
            r"": (
                tf.squeeze(surrogate_samples['w']) * factor.eta_i
            ).numpy().T})
    fig.colorbar(pcm, ax=ax[0], orientation="vertical")

    az.plot_forest(intercept_data, ax=ax[1])
    ax[1].set_xlabel("background rate")
    ax[1].set_ylim((-0.014, .466))
    ax[1].set_title("65% and 95% CI")
    ax[1].axvline(1.0, linestyle='dashed', color="black")

    filename = f"{_FILENAME}_{_DIMENSION}D_encoding_"
    filename += f"lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pdf"
    plt.savefig(filename, bbox_inches='tight')

    print("Generating representations")
    filename = f"{_FILENAME}_{_DIMENSION}D_representation"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"
    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=False)
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for record in iter(csv_data_batched):
            # 'counts' matches the key produced by the dataset map above
            z = factor.encode(tf.cast(record['counts'], factor.dtype)).numpy()
            if _ROW_NORMALIZE:
                z *= (record['normalization'].numpy())[:, np.newaxis]
            ind = record['indices'].numpy()
            for row in range(z.shape[0]):
                writer.writerow(np.concatenate([[ind[row]], z[row, :]]))
    # set the priors on scale and df
    sigma = pm.HalfCauchy("sigma", 5)
    df = pm.Exponential("df", 1 / 30)

    # specify the likelihood of the data
    y_obs = pm.StudentT("y_obs", mu=alpha_temp[idx] + beta[idx] * x_centered,
                        sd=sigma, nu=df, observed=y_m)

    # inference step
    trace_unp = pm.sample(2000)

# -------------- analyse the posterior -------------------------------------- #
with unpooled_model:
    az.plot_forest(trace_unp, var_names=["alpha", "beta"], combined=True)

# ---------------- specify a hierarchical probabilistic model --------------- #
with pm.Model() as hierarchical_model:
    # specify a set of hyper-priors
    alpha_m_temp = pm.Normal("alpha_m_temp", mu=0, sd=10)
    alpha_s_temp = pm.HalfNormal("alpha_s_temp", sd=10)
    beta_m = pm.Normal("beta_m", mu=0, sd=10)
    beta_s = pm.HalfNormal("beta_s", sd=10)

    # set the priors on parameters
    alpha_temp = pm.Normal("alpha_temp", mu=alpha_m_temp, sd=alpha_s_temp,
                           shape=M)
    beta = pm.Normal("beta", mu=beta_m, sd=beta_s, shape=M)
    β = pm.Normal('β', mu=0, sd=10, shape=M)
    ϵ = pm.HalfCauchy('ϵ', 5)
    ν = pm.Exponential('ν', 1/30)

    y_pred = pm.StudentT('y_pred', mu=α_tmp[idx] + β[idx] * x_centered,
                         sd=ϵ, nu=ν, observed=y_m)

    α = pm.Deterministic('α', α_tmp - β * x_m.mean())

    trace_up = pm.sample(2000)


# In[28]:

az.plot_forest(trace_up, var_names=['α', 'β'], combined=True)
plt.savefig('B11197_03_14.png', dpi=300)

# <img src='B11197_03_15.png' width="700">

# In[29]:

with pm.Model() as hierarchical_model:
    # hyper-priors
    α_μ_tmp = pm.Normal('α_μ_tmp', mu=0, sd=10)
    α_σ_tmp = pm.HalfNormal('α_σ_tmp', 10)
    β_μ = pm.Normal('β_μ', mu=0, sd=10)
    β_σ = pm.HalfNormal('β_σ', sd=10)
    mu = pm.Normal('mu', mu=0, sd=10, shape=groups)
    sigma = pm.HalfNormal('sigma', sd=10, shape=groups)

    y = pm.Normal('y', mu=mu[idx], sd=sigma[idx], observed=diff)

    trace_cs_nh = pm.sample(1000)


# In[37]:

with pm.Model() as cs_h:
    # hyper-priors
    mu_mu = pm.Normal('mu_mu', mu=0, sd=10)
    sigma_mu = pm.HalfNormal('sigma_mu', 10)

    # priors
    mu = pm.Normal('mu', mu=mu_mu, sd=sigma_mu, shape=groups)
    sigma = pm.HalfNormal('sigma', sd=10, shape=groups)

    y = pm.Normal('y', mu=mu[idx], sd=sigma[idx], observed=diff)

    trace_cs_h = pm.sample(1000)


# In[38]:

_, axes = az.plot_forest([trace_cs_nh, trace_cs_h],
                         model_names=['n_h', 'h'],
                         var_names='mu', combined=False, colors='cycle')
y_lims = axes[0].get_ylim()
axes[0].vlines(trace_cs_h['mu_mu'].mean(), *y_lims)
plt.savefig('B11197_02_22.png', dpi=300)
ax.axvline(height.mean())
ax.set(title='Posterior predictive of the mean', xlabel='mean(x)',
       ylabel='Frequency')

_, ax = plt.subplots(figsize=(12, 6))
ax.hist([h.mean() for h in no_collinear_ppc['h']])
ax.axvline(height.mean())
ax.set(title='Posterior predictive of the mean', xlabel='mean(x)',
       ylabel='Frequency')

# Plot posterior density for models
#%%
az.plot_density([trace_collinear, trace_no_collinear],
                data_labels=['collinear', 'no collinear'],
                var_names=['br'],
                shade=0.1)

# Compare plots
#%%
az.plot_forest([trace_collinear, trace_no_collinear],
               model_names=['collinear', 'no collinear'],
               var_names=['br', 'a', 'sigma'])

#%%
az.plot_forest([trace_collinear, trace_no_collinear],
               model_names=['collinear', 'no collinear'],
               var_names=['br', 'a', 'sigma'],
               kind='ridgeplot')
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=2, shape=len(x_n))

    mu = alpha + pm.math.dot(x_1, beta)
    theta = pm.Deterministic('theta', 1 / (1 + pm.math.exp(-mu)))
    bd = pm.Deterministic('bd', -alpha / beta[1] - beta[0] / beta[1] * x_1[:, 0])

    yl = pm.Bernoulli('yl', p=theta, observed=y_1)

    trace_1 = pm.sample(2000)


# In[13]:

varnames = ['alpha', 'beta']
az.plot_forest(trace_1, var_names=varnames)


# In[14]:

idx = np.argsort(x_1[:, 0])
bd = trace_1['bd'].mean(0)[idx]
plt.scatter(x_1[:, 0], x_1[:, 1], c=[f'C{x}' for x in y_0])
plt.plot(x_1[:, 0][idx], bd, color='k')
az.plot_hpd(x_1[:, 0], trace_1['bd'], color='k')
plt.xlabel(x_n[0])
plt.ylabel(x_n[1])
plt.savefig('B11197_04_05.png', dpi=300)


# ## Interpreting the coefficients of a logistic regression
mu = pm.MvNormal("ab_dept", mu=tt.stack([a, bm]), chol=chol, shape=(Ndept, 2)) a_dept = pm.Deterministic("a_dept", mu[:, 0]) bm_dept = pm.Deterministic("bm_dept", mu[:, 1]) p = pm.math.invlogit(mu[Dept_id, 0] + mu[Dept_id, 1] * d_ad["male"]) admit = pm.Binomial("admit", p=p, n=d_ad.applications, observed=d_ad.admit) trace_13_3 = pm.sample(5000, tune=1000) # %% az.plot_forest(trace_13_3, var_names=["bm_dept", "a_dept"], credible_interval=0.89) # %% with pm.Model() as m_13_4: a = pm.Normal("a", 0, 10) sigma_dept = pm.HalfCauchy("sigma_dept", 2) a_dept = pm.Normal("a_dept", a, sigma_dept, shape=Ndept) p = pm.math.invlogit(a_dept[Dept_id]) admit = pm.Binomial("admit", p=p, n=d_ad.applications, observed=d_ad.admit) trace_13_4 = pm.sample(4500, tune=500) comp_df = az.compare({ "m13_2": trace_13_2, "m13_3": trace_13_3,
az.summary(trace_h)

J = len(N_samples)
post_mean = np.zeros(J)
samples = trace_h['θ']
post_mean = np.mean(samples, axis=0)

post_hyper_mean = trace_h['μ'].mean()

mle = G_samples / N_samples
pooled_mle = np.sum(G_samples) / np.sum(N_samples)

axes = az.plot_forest(
    trace_h, var_names='θ', combined=False, colors='cycle')
y_lims = axes[0].get_ylim()
axes[0].vlines(post_hyper_mean, *y_lims)

axes = az.plot_forest(
    trace_h, var_names='θ', combined=True, colors='cycle', kind='ridgeplot')

# Show posterior over hyperparameters
fig, ax = plt.subplots(1, 1)
x = np.linspace(0, 1, 100)
for i in np.random.randint(0, len(trace_h), size=100):
    u = trace_h['μ'][i]
    k = trace_h['κ'][i]
def make_plots(self, run_params, run_data=None, energy_data=None,
               runs_np=True, out_dir=None):
    """Create trace + KDE plots of lattice observables and energy data."""
    type_str = 'figures_np' if runs_np else 'figures_tf'
    figs_dir = os.path.join(self._log_dir, type_str)
    fig_dir = os.path.join(figs_dir, run_params['run_str'])
    io.check_else_make_dir(fig_dir)

    dataset = None
    energy_dataset = None
    try:
        fname, title_str = self._plot_setup(run_params)
    except FileNotFoundError:
        return dataset, energy_dataset

    tp_fname = f'{fname}_traceplot'
    pp_fname = f'{fname}_posterior'
    rp_fname = f'{fname}_ridgeplot'

    dataset = self.build_dataset(run_data, run_params)
    tp_out_file = os.path.join(fig_dir, f'{tp_fname}.pdf')
    pp_out_file = os.path.join(fig_dir, f'{pp_fname}.pdf')

    var_names = ['tunneling_rate', 'plaqs_diffs']
    if hasattr(dataset, 'dx'):
        var_names.append('dx')
    var_names.extend(['accept_prob', 'charges_squared', 'charges'])

    # initialize the optional output paths so they are defined when out_dir is None
    tp_out_file1 = None
    pp_out_file1 = None
    if out_dir is not None:
        io.check_else_make_dir(out_dir)
        tp_out_file1 = os.path.join(out_dir, f'{tp_fname}.pdf')
        pp_out_file1 = os.path.join(out_dir, f'{pp_fname}.pdf')

    ###################################################
    # Create traceplot + posterior plot of observables
    ###################################################
    self._plot_trace(dataset, tp_out_file, var_names=var_names,
                     out_file1=tp_out_file1)
    self._plot_posterior(dataset, pp_out_file, var_names=var_names,
                         out_file1=pp_out_file1)

    # * * * * * * * * * * * * * * * * *
    # Create ridgeplot of plaq diffs  *
    # * * * * * * * * * * * * * * * * *
    rp_out_file = os.path.join(fig_dir, f'{rp_fname}.pdf')
    _ = az.plot_forest(dataset,
                       kind='ridgeplot',
                       var_names=['plaqs_diffs'],
                       ridgeplot_alpha=0.4,
                       ridgeplot_overlap=0.1,
                       combined=False)
    fig = plt.gcf()
    fig.suptitle(title_str, fontsize='x-large', y=1.025)
    self._savefig(fig, rp_out_file)
    if out_dir is not None:
        rp_out_file1 = os.path.join(out_dir, f'{rp_fname}.pdf')
        self._savefig(fig, rp_out_file1)

    # * * * * * * * * * * * * * * * * * * * * * * * * * *
    # Create traceplot + posterior plot of energy data  *
    # * * * * * * * * * * * * * * * * * * * * * * * * * *
    if energy_data is not None:
        energy_dataset = self.energy_plots(energy_data, run_params,
                                           fname, out_dir=out_dir)

    return dataset, energy_dataset
def main(args):
    print("Loading data...")
    teams, df = load_data()
    train = df[df["split"] == "train"]
    nt = len(teams)

    print("Starting inference...")
    mcmc = run_inference(
        num_chains=args.num_chains,
        num_results=args.num_samples,
        num_burnin_steps=args.num_warmup,
        nt=nt,
    )

    samples = dict(
        zip(
            ["alpha", "home", "sd_att", "sd_def", "attack", "defend"],
            [np.swapaxes(sample, 0, 1) for sample in mcmc],
        )
    )
    fit = az.from_dict(samples)

    print("Analyse posterior...")
    az.plot_forest(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )
    az.plot_trace(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=samples["attack"].mean(axis=(0, 1)),
        attacksd=samples["attack"].std(axis=(0, 1)),
        defend=samples["defend"].mean(axis=(0, 1)),
        defendsd=samples["defend"].std(axis=(0, 1)),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )
    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    theta1 = (
        samples["alpha"].flatten()[..., np.newaxis]
        + samples["home"].flatten()[..., np.newaxis]
        + tf.gather(
            samples["attack"].reshape(-1, samples["attack"].shape[-1]),
            predict["Home_id"],
            axis=-1,
        )
        - tf.gather(
            samples["defend"].reshape(-1, samples["defend"].shape[-1]),
            predict["Away_id"],
            axis=-1,
        )
    )
    theta2 = (
        samples["alpha"].flatten()[..., np.newaxis]
        + tf.gather(
            samples["attack"].reshape(-1, samples["attack"].shape[-1]),
            predict["Away_id"],
            axis=-1,
        )
        - tf.gather(
            samples["defend"].reshape(-1, samples["defend"].shape[-1]),
            predict["Home_id"],
            axis=-1,
        )
    )
    s1 = np.array(tfd.Poisson(log_rate=theta1).sample())
    s2 = np.array(tfd.Poisson(log_rate=theta2).sample())

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=s1.mean(axis=0).round(),
        score1error=s1.std(axis=0),
        score2=s2.mean(axis=0).round(),
        score2error=s2.std(axis=0),
    )
    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"])
    )

    print(score_table(df))
    print(score_table(predicted_full))
from collections import defaultdict

import arviz as az

# https://github.com/probml/pmtk3/blob/master/demos/cancerRatesEb.m
data_y = np.array([0, 0, 2, 0, 1, 1, 0, 2, 1, 3, 0, 1, 1, 1, 54, 0, 0, 1, 3, 0])
data_n = np.array([1083, 855, 3461, 657, 1208, 1025, 527, 1668, 583, 582,
                   917, 857, 680, 917, 53637, 874, 395, 581, 588, 383])
N = len(data_n)

# We put a prior on the mean and precision (kappa) of the Beta distribution,
# instead of on the alpha and beta parameters
with pm.Model() as model_h:
    mu = pm.Beta('mu', 1., 1.)
    kappa = pm.HalfNormal('kappa', 500)
    alpha = pm.Deterministic('alpha', mu * kappa)
    beta = pm.Deterministic('beta', (1.0 - mu) * kappa)
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=N)
    y = pm.Binomial('y', p=theta, observed=data_y, n=data_n)

np.random.seed(0)
with model_h:
    trace_h = pm.sample(1000, chains=4)

az.summary(trace_h).round(4)

az.plot_forest(trace_h, var_names=["theta"], combined=True,
               credible_interval=0.95)
az.plot_forest(trace_h, var_names=["theta"], combined=True, kind='ridgeplot')
# https://github.com/probml/pmtk3/blob/master/demos/cancerRatesEb.m
data_y = np.array(
    [0, 0, 2, 0, 1, 1, 0, 2, 1, 3, 0, 1, 1, 1, 54, 0, 0, 1, 3, 0])
data_n = np.array([
    1083, 855, 3461, 657, 1208, 1025, 527, 1668, 583, 582, 917, 857, 680,
    917, 53637, 874, 395, 581, 588, 383
])
N = len(data_n)

# We put a prior on the mean and precision (kappa) of the Beta distribution,
# instead of on the alpha and beta parameters
with pm.Model() as model_h:
    mu = pm.Beta('mu', 1., 1.)
    kappa = pm.HalfNormal('kappa', 500)
    alpha = pm.Deterministic('alpha', mu * kappa)
    beta = pm.Deterministic('beta', (1.0 - mu) * kappa)
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=N)
    y = pm.Binomial('y', p=theta, observed=data_y, n=data_n)

np.random.seed(0)
with model_h:
    trace_h = pm.sample(1000, chains=2, cores=1)

az.summary(trace_h).round(4)

az.plot_forest(trace_h, var_names=["theta"], combined=True, hdi_prob=0.95)
az.plot_forest(trace_h, var_names=["theta"], combined=True, kind='ridgeplot')
plt.show()