def plot_ppc_and_score(trace, data, ax=None, title='PPC', paras=None):
    """Plot a posterior predictive check with the LOO score and observed data.

    Posterior predictive draws of ``y`` are averaged per (SOA, PROBE_SALIENT)
    cell, drawn as 95% HDI bands with their mean line, and overlaid with the
    observed proportion of ``PROBE_FIRST_RESPONSE`` and its error bars.

    Parameters
    ----------
    trace
        Posterior samples accepted by ``pm.sample_posterior_predictive``,
        ``az.loo`` and ``az.summary``.
    data : pandas.DataFrame
        Must provide the columns ``SOA_IN_FRAMES``, ``SOA_IN_MS``,
        ``PROBE_SALIENT`` (0/1) and ``PROBE_FIRST_RESPONSE``.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure is created when omitted.
    title : str
        Axes title.
    paras : iterable of str, optional
        Variable names whose posterior mean and 95% HDI are written onto the
        axes. Only the first two elements of each variable can be coloured
        ('k' and 'g').
    """
    # Sample PPC
    ppc_trace = pm.sample_posterior_predictive(trace=trace, var_names=['y'])

    # Calculate LOO score
    loo_text = "LOO = %.2f" % az.loo(trace).loo

    # Aggregate binary responses: one column per (SOA, salience) cell, ordered
    # by SOA and then salience 0 before 1.
    aggregated = []
    for soa in sorted(set(data.SOA_IN_FRAMES)):
        at_soa = data.SOA_IN_FRAMES == soa
        for salient in (0, 1):
            trials = at_soa & (data.PROBE_SALIENT == salient)
            aggregated.append(ppc_trace['y'][:, trials].mean(axis=1))
    ppc_y = np.array(aggregated).T

    # Prepare axes if none provided
    if ax is None:
        _, ax = plt.subplots()

    # Get SOAs and condition mask from data.
    # NOTE(review): the columns above are ordered by SOA_IN_FRAMES while the
    # mask is ordered by SOA_IN_MS; this assumes both sort the same way.
    SOAs = sorted(set(data['SOA_IN_MS']))
    cells = data.groupby(['SOA_IN_MS', 'PROBE_SALIENT'])
    cond = cells['PROBE_SALIENT'].min().values

    # (salience level, colour, horizontal offset of the error bars)
    styles = ((0, 'k', -0.5), (1, 'g', 0.5))

    # Plot the predictive bands first, then their mean lines.
    for level, colour, _ in styles:
        az.plot_hdi(y=ppc_y[:, cond == level], x=SOAs, color=colour, ax=ax,
                    hdi_prob=0.95, fill_kwargs={'alpha': 0.23})
    for level, colour, _ in styles:
        ax.plot(SOAs, np.mean(ppc_y[:, cond == level], axis=0), color=colour)

    # Observed proportions. The response column is selected before
    # aggregating so that non-numeric columns in `data` cannot break mean().
    responses = cells['PROBE_FIRST_RESPONSE']
    pf_mean = responses.mean()
    pf_count = responses.sum()
    pf_obs = responses.count()
    # prop_ci is defined elsewhere; its (lower, upper) result is turned into
    # distances from the mean, as ax.errorbar expects.
    pf_ci = abs(np.array(prop_ci(pf_count.values, pf_obs.values)) - pf_mean.values)

    for level, colour, shift in styles:
        observed = pf_mean.values[level::2]
        ax.plot(SOAs, observed, colour + '.')
        ax.errorbar(np.array(SOAs) + shift, observed, pf_ci[:, level::2],
                    fmt='none', color=colour, alpha=0.5)

    ax.axvline(0, linestyle='dashed')
    ax.axhline(0.5, linestyle='dashed')
    ax.text(-20, 0, loo_text)

    if paras is not None:
        for i, varname in enumerate(paras):
            stats = az.summary(trace, var_names=[varname], hdi_prob=.95)
            for j, s in enumerate(stats['mean']):
                text = r'$' + varname + r'$: %.2f [%.2f, %.2f]'
                # The summary is indexed by variable label, so the j-th row
                # has to be addressed positionally.
                text = text % (s, stats['hdi_2.5%'].iloc[j], stats['hdi_97.5%'].iloc[j])
                posx, posy = .1 + .5 - (1 - j) * .5, 0.95 - (.05 * i) - ((1 - j) * .5)
                ax.text(posx, posy, text, transform=ax.transAxes, color=['k', 'g'][j])

    ax.set_title(title)
def make_plot(trace):
    """Draw the training data, the fitted logistic curve and the decision boundary.

    ``trace`` must expose the sampled variables ``'θ'`` and ``'bd'``; the
    predictor ``x_c`` and ``plot_training_data`` come from the enclosing module.
    """
    plot_training_data()

    # Logistic curve: posterior mean of θ against the sorted predictor,
    # together with its HDI band.
    order = np.argsort(x_c)
    mean_theta = trace['θ'].mean(axis=0)
    plt.plot(x_c[order], mean_theta[order], color='C2', lw=3)
    az.plot_hdi(x_c, trace['θ'], color='C2')

    # Decision boundary: posterior mean as a vertical line, HDI as a shaded band.
    plt.vlines(trace['bd'].mean(), 0, 1, color='k')
    boundary_hdi = az.hdi(trace['bd'])
    plt.fill_betweenx([0, 1], boundary_hdi[0], boundary_hdi[1], color='k', alpha=0.5)
def conduct_bayesian(observations_file_path, mu_init, beta_init):
    """Update fragility parameters per damage state from observed failures.

    For every distinct value of the ``'DS Number'`` column a PyMC3 model with
    Normal priors on ``mu`` and ``beta`` and a Binomial likelihood is sampled,
    and the prediction band, the parameter update and the MCMC diagnostics
    are plotted. A summary of the trace is printed.

    :param observations_file_path: CSV file with the columns ``'DS Number'``,
        ``'demand'``, ``'fail'`` and ``'total'``.
    :param mu_init: Prior means for ``mu``, indexed by the position of the
        damage state in order of first appearance in the file.
    :param beta_init: Prior means for ``beta``, indexed the same way.
    """
    df = pd.read_csv(observations_file_path)

    # Prior standard deviations, shared by the model and the prior plot.
    mu_prior_std = 2.71
    beta_prior_std = 0.03

    def normal_cdf(mu, beta, xj):
        """Return the normal CDF of ``log(xj)`` with mean ``mu`` and std ``beta``."""
        return 0.5 * (1 + tt.erf((tt.log(xj) - mu) / (beta * tt.sqrt(2))))

    # Iterate over the unique damage state values:
    for ds, ds_value in enumerate(df['DS Number'].unique()):
        df_sub = df.loc[df['DS Number'] == ds_value]
        xj = np.array(df_sub['demand'])
        zj = np.array(df_sub['fail'])
        nj = np.array(df_sub['total'])
        mu_ds = mu_init[ds]
        beta_ds = beta_init[ds]

        with pm.Model() as model:
            # Set up the prior:
            mu = pm.Normal('mu', mu_ds, mu_prior_std)
            beta = pm.Normal('beta', beta_ds, beta_prior_std)
            # Define likelihood:
            like = pm.Binomial('like', p=normal_cdf(mu, beta, xj), observed=zj, n=nj)
            # Initial check of the log-probabilities at the test point:
            for RV in model.basic_RVs:
                print(RV.name, RV.logp(model.test_point))
            # Determine the posterior
            trace = pm.sample(2000, cores=1, return_inferencedata=True)
            # Posterior predictive check: generate data from the model using
            # parameters drawn from the posterior.
            ppc = pm.sample_posterior_predictive(
                trace, var_names=['mu', 'beta', 'like'])

        # Calculate failure probabilities using samples (pf is defined elsewhere):
        im = np.arange(70, 200, 5)
        pf_ppc = [pf(im, mu_i, beta_i) for mu_i, beta_i in zip(ppc['mu'], ppc['beta'])]

        # Plot the HPD:
        _, ax = plt.subplots()
        az.plot_hdi(im, pf_ppc, ax=ax,
                    fill_kwargs={
                        'alpha': 0.2,
                        'color': 'blue',
                        'label': 'bounds of prediction: 94% HPD'
                    })
        # Calculate and plot the mean outcome:
        pf_mean = pf(im, ppc['mu'].mean(), ppc['beta'].mean())
        ax.plot(im, pf_mean, label='mean of prediction', color='r', linestyle='dashed')
        # Plot the mean of the simulation-based fragility:
        pf_sim = pf(im, mu_ds, beta_ds)
        ax.plot(im, pf_sim, label='simulation-based', color='k')
        # Plot the observations:
        ax.scatter(xj, zj / nj, color='r', marker='^', label='observations')
        ax.legend()
        plt.show()

        # Difference between the prior of mu and its updated distribution:
        new_mu_mean, new_mu_std = norm.fit(ppc['mu'])
        plt.hist(ppc['mu'], bins=25, density=True, alpha=0.4, color='b')
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p_prior = norm.pdf(x, mu_ds, mu_prior_std)
        p_new = norm.pdf(x, new_mu_mean, new_mu_std)
        plt.plot(x, p_prior, 'k', linewidth=2, label='prior distribution')
        plt.plot(x, p_new, 'r', linewidth=2, label='updated distribution',
                 linestyle='dashed')
        # Without a legend the two labelled curves cannot be told apart.
        plt.legend()

        # Note: az.plot_violin(trace, var_names=['mu']) can be helpful for
        # seeing the distribution of parameter values.
        # Plot the posterior distributions of each RV. ArviZ creates its own
        # figures, so no empty figure is opened beforehand.
        az.plot_trace(trace, chain_prop={'color': ['blue', 'red']})
        az.plot_posterior(trace)
        az.plot_forest(trace, var_names=['mu', 'beta'])
        plt.show()
        print(az.summary(trace))
# Bayesian logistic regression of iris species (setosa vs. versicolor) on the
# centred first feature column (labelled 'sepal_length' on the plot).
df = sns.load_dataset('iris')
iris = df.query("species == ('setosa', 'versicolor')")
y = pd.Categorical(iris['species']).codes
x = iris[iris.columns[:-1]].values
x = x[:, 0] - x[:, 0].mean()
print(x)

with pm.Model() as model:
    alpha = pm.Normal('alpha', 0, 10)
    beta = pm.Normal('beta', 0, 10)
    mu = alpha + pm.math.dot(x, beta)
    p = pm.Deterministic('p', pm.math.sigmoid(mu))
    y_lik = pm.Bernoulli('y_lik', p=p, observed=y)
    # Decision boundary: the x value at which p == 0.5.
    b = pm.Deterministic('b', -alpha / beta)
    trace_m = pm.sample(draws=1000, cores=1, chains=3, random_seed=1)
    pp = pm.sample_posterior_predictive(trace_m)

_, ax = plt.subplots(figsize=(12, 8))
xs = np.linspace(x.min(), x.max(), 1000)
theta = trace_m['p'].mean(axis=0)

# Logistic curve at the posterior means of alpha and beta.
# seaborn >= 0.12 only accepts x/y as keyword arguments.
mean_curve = 1 / (1 + np.exp(-(trace_m['alpha'].mean(axis=0)
                               + trace_m['beta'].mean(axis=0) * xs)))
sns.lineplot(x=xs, y=mean_curve, ax=ax)

# Decision boundary with its 98% HDI, plus the HDI band of p.
plt.vlines(trace_m['b'].mean(axis=0), 0, 1)
az.plot_hdi(x, trace_m['p'], ax=ax)
hdi = az.hdi(trace_m['b'], hdi_prob=0.98)
plt.fill_betweenx([0, 1], hdi[0], hdi[1], color='k', alpha=0.5)

sns.scatterplot(x=x, y=y, ax=ax)
plt.xlabel('sepal_length')
plt.show()
from sklearn import preprocessing
import seaborn as sns
import arviz as az
from sklearn.preprocessing import scale

lb = preprocessing.LabelBinarizer()

# Two-class subset of iris: class codes in y, all feature columns in x.
df = sns.load_dataset('iris')
print(df.head())
iris = df.query("species == ('setosa', 'versicolor')")
y = pd.Categorical(iris['species']).codes
x = iris[iris.columns[:-1]].values

# Logistic regression on the first two feature columns.
with pm.Model() as Model:
    alpha = pm.Normal('alpha', mu=0, sigma=100)
    beta = pm.Normal('beta', mu=0, sigma=2, shape=2)
    mu = alpha + pm.math.dot(x[:, :2], beta)
    p = pm.Deterministic('p', pm.math.sigmoid(mu))
    # Decision boundary expressed as the second feature in terms of the first.
    db = pm.Deterministic(
        'db',
        -(alpha / beta[1]) - (beta[0] / beta[1]) * x[:, 0],
    )
    pm.Bernoulli('p-lik', p=p, observed=y)
    trace_m = pm.sample(2000, cores=1)
    #pp = pm.sample_posterior_predictive(trace_m)

# Scatter of the two features, the mean boundary and its HDI band.
_, ax = plt.subplots(figsize=(12, 8))
theta = trace_m['db'].mean(axis=0)
ax.scatter(x[:, 0], x[:, 1], c=[f'C{k}' for k in y])
ix = np.argsort(x[:, 0])
ax.plot(x[ix, 0], theta[ix])
az.plot_hdi(x[:, 0], trace_m['db'], color='k', ax=ax)
plt.show()
y = pm.Data('y', y_obs) theta_0 = pm.Normal('intercept', mu=0, sigma=2) theta_1 = pm.Normal('coefx', mu=0, sigma=2) theta_2 = pm.Normal('coefxSqd', mu=0, sigma=2) theta = pm.Deterministic('theta', theta_0 + theta_1*xs + theta_2*xs**2) sigma = pm.HalfCauchy('sigma', 100) y_lik = pm.Normal('y_lik', mu=theta, sigma=sigma, observed=y) trace_linear = pm.sample(tune=2000, chains=1, cores=1) pp_samples = pm.sample_posterior_predictive(trace=trace_linear, random_seed=123) y_pred = pp_samples['y_lik'].mean(axis=0) _, axi = plt.subplots(1, 4, figsize=(8, 5)) sns.scatterplot(x, y_obs, ax=axi[0]).set_title("Data") sns.lineplot(x, y_pred, ax=axi[0]) az.plot_hdi(x, trace_linear['theta'], hdi_prob=0.98, ax=axi[0], color='gray') az.plot_posterior(trace_linear, var_names=['intercept', 'coefx'], ax=axi[1]) az.plot_posterior(trace_linear, var_names=['coefx'], ax=axi[2]) az.plot_posterior(trace_linear, var_names=['coefxSqd'], ax=axi[3]) plt.show() with linear_Model: pm.set_data({'xs': [1, 5.6, 4]}) y_test = pm.sample_posterior_predictive(trace=trace_linear) print(y_test['y_lik'].mean(axis=0)) print(1 + 3.2 * 1 + 4 * 1**2)
"""
Plot HDI
========

_thumb: .8, .8
"""
import bokeh.plotting as bkp
import numpy as np

import arviz as az

# Synthetic data: a straight line plus 200 noisy replicates of it.
x_data = np.random.normal(loc=0, scale=1, size=100)
y_data = 2 + x_data * 0.5
y_data_rep = np.random.normal(loc=y_data, scale=0.5, size=(200, 100))

# HDI band of the replicates, then the noise-free line drawn over sorted x.
ax = az.plot_hdi(x_data, y_data_rep, color="red", backend="bokeh", show=False)
x_data_sorted = np.sort(x_data)
ax.line(x_data_sorted, 2 + x_data_sorted * 0.5, line_color="black", line_width=3)

if az.rcParams["plot.bokeh.show"]:
    bkp.show(ax)
bd = pm.Deterministic('bd', -α / β) # decision boundary yl = pm.Bernoulli('yl', p=θ, observed=y_0) trace_0 = pm.sample(1000) varnames = ['α', 'β', 'bd'] az.summary(trace_0, varnames) theta = trace_0['θ'].mean(axis=0) idx = np.argsort(x_c) plt.figure() # plot logistic curve plt.plot(x_c[idx], theta[idx], color='C2', lw=3) az.plot_hdi(x_c, trace_0['θ'], color='C2') # plot decision boundary plt.vlines(trace_0['bd'].mean(), 0, 1, color='k') bd_hpd = az.hdi(trace_0['bd']) plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5) # plot jittered data plt.scatter(x_c, np.random.normal(y_0, 0.02), marker='.', color=[f'C{x}' for x in y_0]) plt.xlabel(x_n) plt.ylabel('p(y=1)', rotation=0) # use original scale for xticks
y_ = pm.Bernoulli('y', p=pm.math.sigmoid(f), observed=space_flu) trace_space_flu = pm.sample(1000, chains=1, compute_convergence_checks=False) X_new = np.linspace(0, 80, 200)[:, None] with model_space_flu: f_pred = gp.conditional('f_pred', X_new) pred_samples = pm.sample_posterior_predictive(trace_space_flu, var_names=['f_pred'], samples=1000) _, ax = plt.subplots(figsize=(10, 6)) fp = logistic(pred_samples['f_pred']) fp_mean = np.nanmean(fp, 0) ax.scatter(age, np.random.normal(space_flu, 0.02), marker='.', color=[f'C{ci}' for ci in space_flu]) ax.plot(X_new[:, 0], fp_mean, 'C2', lw=3) az.plot_hdi(X_new[:, 0], fp, color='C2') ax.set_yticks([0, 1]) ax.set_yticklabels(['healthy', 'sick']) ax.set_xlabel('age') pml.savefig('gp_classify_spaceflu.pdf', dpi=300)
def plot_hdi(t, y, n_idx, m_idata, model_type, prior_level, kind="all", hdi_prob=(.95, .8)):
    """Plot the data, the mean prediction and two nested HDI bands, then save it.

    Parameters
    ----------
    t, y : array-like
        Observed time points and outcomes, drawn as a scatter.
    n_idx : int
        Retained for backward compatibility; the number of rows of the
        reshaped predictions is now derived from the data itself.
    m_idata : InferenceData
        Must contain ``posterior_predictive`` (kinds "full" and "fixed") or
        ``predictions`` (kind "predictions").
    model_type, prior_level : str
        Only used to build the output file name.
    kind : {"full", "fixed", "predictions"}
        Which quantity to plot. The default ``"all"`` is not an implemented
        kind and is rejected.
    hdi_prob : tuple of float
        ``(high, low)`` probabilities of the wide and the narrow band.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the implemented options.
    """
    # Fail early with a clear message instead of an unbound-local error later.
    if kind not in ("full", "fixed", "predictions"):
        raise ValueError(
            f"kind={kind!r} is not supported; use 'full', 'fixed' or 'predictions'"
        )

    # unpack tuple & get unique t.
    high, low = hdi_prob
    t_unique = np.unique(t)
    n_time = len(t_unique)

    if kind == "fixed":
        # should be the same as just posterior here I think.
        ppc = m_idata.posterior_predictive
        alpha = ppc.alpha.values
        beta = ppc.beta.values
        # One row per draw, one column per unique time point.
        outcome = (alpha + beta * t_unique[:, None]).T
        y_mean = outcome.mean(axis=0)
    else:
        group = m_idata.posterior_predictive if kind == "full" else m_idata.predictions
        # Average over the first axis, then flatten everything but time.
        y_pred = group["y_pred"].mean(axis=0).values
        y_mean = y_pred.mean(axis=(0, 1))
        # -1 lets numpy infer the row count, so the number of draws is no
        # longer tied to 4000.
        outcome = y_pred.reshape((-1, n_time))

    # set up plot
    fig, ax = plt.subplots(figsize=(10, 7))
    # plot data
    ax.scatter(t, y, color="darkorange", alpha=0.5)
    # plot mean
    ax.plot(t_unique, y_mean, color="darkorange")
    # plot lower interval
    az.plot_hdi(t_unique, outcome, ax=ax,
                fill_kwargs={'alpha': 0.4, "label": f"{low*100}% HPD intervals"},
                hdi_prob=low)
    # plot higher interval
    az.plot_hdi(t_unique, outcome, ax=ax,
                fill_kwargs={'alpha': 0.3, "label": f"{high*100}% HDI intervals"},
                hdi_prob=high)
    # add legend, title and formatting.
    ax.legend()
    fig.suptitle(f"Python/pyMC3: Prediction Intervals ({kind})")
    fig.tight_layout()
    plt.savefig(f"../plots_python/{model_type}_{prior_level}_HDI_{kind}.jpeg", dpi=300)
) # plot mean ax.plot( t_unique, y_mean, color = "darkorange" # aesthetics ) # set HDI intervals high, low = (.95, .8) # plot lower interval az.plot_hdi( t_unique, outcome, ax = ax, fill_kwargs= {'alpha': 0.4, "label": "80% HPD intervals"}, hdi_prob = low) # plot higher interval az.plot_hdi( t_unique, outcome, ax = ax, fill_kwargs = {'alpha': 0.3, "label": "95% HDI intervals"}, hdi_prob = high) # add legend, title and tight layout. ax.legend() fig.suptitle("Python/pyMC3: Prediction Intervals (fixed)") fig.tight_layout()
"""
Plot HDI
========

_thumb: .8, .8
"""
import matplotlib.pyplot as plt
import numpy as np

import arviz as az

az.style.use("arviz-darkgrid")

# Synthetic data: a straight line plus 200 noisy replicates of it.
x_data = np.random.normal(loc=0, scale=1, size=100)
y_data = 2 + x_data * 0.5
y_data_rep = np.random.normal(loc=y_data, scale=0.5, size=(200, 100))

# The noise-free line, then the HDI band of the replicates with dashed edges.
plt.plot(x_data, y_data, "C6")
az.plot_hdi(x_data, y_data_rep, color="k", plot_kwargs={"ls": "--"})

plt.show()
μ = α + pm.math.dot(x_1, β) θ = pm.Deterministic('θ', 1 / (1 + pm.math.exp(-μ))) bd = pm.Deterministic('bd', -α / β[1] - β[0] / β[1] * x_1[:, 0]) yl = pm.Bernoulli('yl', p=θ, observed=y_1) trace_1 = pm.sample(2000, cores=1, chains=2) varnames = ['α', 'β'] #az.plot_forest(trace_1, var_names=varnames); idx = np.argsort(x_1[:, 0]) bd = trace_1['bd'].mean(0)[idx] plt.figure() plt.scatter(x_1[:, 0], x_1[:, 1], c=[f'C{x}' for x in y_1]) plt.plot(x_1[:, 0][idx], bd, color='k') az.plot_hdi(x_1[:, 0], trace_1['bd'], color='k') plt.xlabel(x_n[0]) plt.ylabel(x_n[1]) plt.tight_layout() if unbalanced: pml.savefig('logreg_iris_bayes_2d_unbalanced.pdf', dpi=300) else: pml.savefig('logreg_iris_bayes_2d.pdf', dpi=300) plt.show()
def conduct_bayesian_norm(xj, zj, nj, mu_init, beta_init, draws, target_accept, plot_flag=True):
    """
    A function to conduct Bayesian updating of fragility models.

    (Optional): Produce MCMC-related plots (trace). Default: True

    Notes:
    Here intensity measure is the wind speed: A normalizing factor (max wind
    speed) is used to improve numerical stability.
    Prior distributions are designated according to the assumption that the
    Bayesian analysis will utilize wind fragility functions from HAZUS (see
    priors for mu and beta). See De Bruijn et al. (2020) for more details.

    De Bruijn J. et al. (2020). "Using rapid damage observations from social
    media for Bayesian updating of hurricane vulnerability functions: A case
    study of Hurricane Dorian." Nat. Hazards Earth Syst. Sci. Discuss.
    [preprint], https://doi.org/10.5194/nhess-2020-282.

    The likelihood function is modeled using a Binomial distribution; see
    Lallemant et al. (2015) for more details.

    :param xj: An array or list of observed intensity measure values for the
        damage measure.
    :param zj: An array or list of failure observations (number of failed
        buildings/components) for the given damage and intensity measure.
    :param nj: An array or list of the total # of buildings for the given
        damage measure and intensity measure.
    :param mu_init: The mean value of the prior distribution for the
        logarithmic mean.
    :param beta_init: The mean value of the prior distribution for the
        logarithmic std. dev.
    :param draws: The number of MCMC draws passed to ``pm.sample``.
    :param target_accept: The ``target_accept`` value passed to ``pm.sample``.
    :param plot_flag: (Optional) Produce the trace plot for the MCMC and
        updated distributions for parameters.
    :return: df_summary: A one-row DataFrame with one column per
        (parameter, summary statistic) pair, named ``<parameter><statistic>``,
        plus the ``draws`` and ``target_accept`` that were used.
    """
    # The inputs are documented as "array or list"; the arithmetic below
    # needs arrays.
    xj = np.asarray(xj, dtype=float)
    zj = np.asarray(zj)
    nj = np.asarray(nj)

    # Plots divide wind speeds by this factor and label the axis in m/s.
    # NOTE(review): presumably the inputs are in mph (1 m/s = 2.237 mph) - confirm.
    speed_unit_factor = 2.237

    # Step 1: Normalize the intensity measure:
    norm_analysis = True
    if norm_analysis:
        norm_factor = max(xj)
        xj = xj / norm_factor
        mu_init = mu_init / norm_factor
        mu_std_dev = 15 / norm_factor
    else:
        # A unit factor keeps every later re-scaling a no-op.
        norm_factor = 1.0
        mu_std_dev = 15
    beta_std_dev = 0.03

    # Step 2: Build the Bayesian model in PyMC3:
    with pm.Model() as model:
        # Step 3a: Set up the prior
        # Normal distributions bounded below at zero are used for both
        # fragility parameters. The parameters for theta1 are normalized for
        # compatibility with the intensity measure values.
        # See De Bruijn et al. for more information regarding the
        # initialization of prior distributions.
        BoundedNormal = pm.Bound(pm.Normal, lower=0.0)
        theta1 = BoundedNormal('theta1', mu_init, mu_std_dev)
        theta2 = BoundedNormal('theta2', beta_init, beta_std_dev)

        # Step 3b: Set up the likelihood function:
        # The likelihood in this model is represented via a Binomial
        # distribution. See Lallemant et al. (2015) for MLE derivation.
        def normal_cdf(theta1, theta2, xj):
            """Return the standard normal CDF of ``log(xj / theta1) / theta2``."""
            return 0.5 * (1 + tt.erf(
                (tt.log(xj / theta1)) / (theta2 * tt.sqrt(2))))

        like = pm.Binomial('like',
                           p=normal_cdf(theta1, theta2, xj),
                           observed=zj,
                           n=nj)

        # Step 3c: Determine the posterior
        # Note: can manually change number of cores if more computational
        # power is available.
        trace_idata = pm.sample(draws,
                                cores=1,
                                return_inferencedata=True,
                                random_seed=72,
                                target_accept=target_accept)

        # (Optional): Plotting the trace and autocorrelation:
        if plot_flag:
            from matplotlib import rcParams
            rcParams['font.family'] = "Times New Roman"
            rcParams.update({'font.size': 16})
            az.plot_trace(trace_idata, chain_prop={'color': ['blue', 'red']})
            # Plot the autocorrelation to check convergence:
            ax_corr = az.plot_autocorr(trace_idata, combined=True)
            ax_corr[0].set_title(r'$\theta_1$')
            ax_corr[1].set_title(r'$\theta_2$')
            ax_corr[0].set_ylabel('Autocorrelation')
            ax_corr[0].set_xlabel('Lag')
            ax_corr[1].set_xlabel('Lag')

        # Step 4: Generate summary statistics for the MCMC:
        print('Summary statistics for the MCMC:')
        print(az.summary(trace_idata))
        df = az.summary(trace_idata, hdi_prob=0.95)

        # Step 5: Sample from the posterior:
        ppc = pm.sample_posterior_predictive(
            trace_idata, var_names=['theta1', 'theta2', 'like'])

    # Re-scale values for the logarithmic mean. A single .loc assignment is
    # used because chained indexing (df['mean']['theta1'] = ...) is not
    # guaranteed to write into df.
    df.loc['theta1', 'mean'] = df.loc['theta1', 'mean'] * norm_factor
    df.loc['theta1', 'sd'] = df.loc['theta1', 'sd'] * norm_factor
    ppc['theta1'] = ppc['theta1'] * norm_factor

    # Export analysis results (mean and sd of theta1 are already re-scaled,
    # its HDI bounds are re-scaled here):
    summary_dict = {}
    for row in df.index:
        for col in df.columns:
            new_key = row + col
            new_val = df.loc[row, col]
            if 'hdi' in new_key and 'theta1' in new_key:
                new_val = new_val * norm_factor
            summary_dict[new_key] = new_val
    # Export MCMC details:
    summary_dict.update({'draws': draws, 'target_accept': target_accept})
    df_summary = pd.DataFrame(summary_dict, index=[0], dtype='object')

    # Sample statistics taken directly from the posterior to create figures:
    updated_values = {
        'theta1': {
            'mean': ppc['theta1'].mean(),
            'std dev': np.std(ppc['theta1'])
        },
        'theta2': {
            'mean': ppc['theta2'].mean(),
            'std dev': np.std(ppc['theta2'])
        }
    }

    if plot_flag:
        # Plot prior and updated distributions for parameters:
        # theta1
        fig, ax = plt.subplots()
        ax.hist(ppc['theta1'] / speed_unit_factor,
                bins=25,
                density=True,
                alpha=0.4,
                color='cornflowerblue',
                label='posterior samples')
        ax.set_xlim(50 / speed_unit_factor, 200 / speed_unit_factor)
        xmin, xmax = ax.get_xlim()
        x = np.linspace(xmin, xmax, 100)
        p_prior = norm.pdf(x,
                           mu_init * norm_factor / speed_unit_factor,
                           mu_std_dev * norm_factor / speed_unit_factor)
        p_new = norm.pdf(x,
                         updated_values['theta1']['mean'] / speed_unit_factor,
                         updated_values['theta1']['std dev'] / speed_unit_factor)
        ax.plot(x, p_prior, 'k', linewidth=2, label='prior')
        ax.plot(x, p_new, 'r', linewidth=2, label='updated', linestyle='dashed')
        ax.set_title('Prior and updated distributions for ' + r'$\theta_1$')
        ax.set_xlabel(r'$\theta_1$')
        ax.set_ylabel('Probability')
        ax.legend()
        plt.show()

        # theta2
        fig2, ax2 = plt.subplots()
        ax2.hist(ppc['theta2'],
                 bins=25,
                 density=True,
                 alpha=0.4,
                 color='cornflowerblue',
                 label='posterior samples')
        ax2.set_xlim(0, 0.3)
        xmin2, xmax2 = ax2.get_xlim()
        x2 = np.linspace(xmin2, xmax2, 100)
        # Use the same std. dev. the prior was actually built with.
        p_prior2 = norm.pdf(x2, beta_init, beta_std_dev)
        p_new2 = norm.pdf(x2, updated_values['theta2']['mean'],
                          updated_values['theta2']['std dev'])
        ax2.plot(x2, p_prior2, 'k', linewidth=2, label='prior')
        ax2.plot(x2, p_new2, 'r', linewidth=2, label='updated', linestyle='dashed')
        ax2.set_title('Prior and updated distributions for ' + r'$\theta_2$')
        ax2.set_xlabel(r'$\theta_2$')
        ax2.set_ylabel('Probability')
        ax2.legend()
        plt.show()

        # Calculate failure probabilities for prior, updated
        # (pf is defined elsewhere):
        im = np.arange(70, 200, 2)
        # Mean of simulation-based fragility:
        pf_sim = pf(im, mu_init * norm_factor, beta_init)
        # Mean of updated fragility:
        pf_mean = pf(im, ppc['theta1'].mean(), ppc['theta2'].mean())
        # Entire distribution of pfs using posterior samples:
        pf_ppc = [pf(im, t1, t2) for t1, t2 in zip(ppc['theta1'], ppc['theta2'])]

        # Plot the credible intervals, mean outcome of prediction, mean of
        # simulation-based:
        fig3, ax3 = plt.subplots()
        ax3.set_clip_on(False)
        ax3.set_ylim(0, 1.2)
        ax3.spines['right'].set_visible(False)
        ax3.spines['top'].set_visible(False)
        az.plot_hdi(im / speed_unit_factor,
                    pf_ppc,
                    hdi_prob=0.95,
                    ax=ax3,
                    fill_kwargs={
                        'alpha': 0.1,
                        'color': 'paleturquoise',
                        'label': '95% credible interval'
                    })
        ax3.plot(im / speed_unit_factor,
                 pf_mean,
                 label='mean of prediction',
                 color='r',
                 linestyle='dashed')
        ax3.plot(im / speed_unit_factor,
                 pf_sim,
                 label='mean of simulation-based',
                 color='k')
        # Plot the observations at their original (de-normalized) intensity:
        ax3.scatter(xj * norm_factor / speed_unit_factor,
                    zj / nj,
                    color='darkviolet',
                    label='observations',
                    zorder=5,
                    s=70)
        ax3.set_xlabel('Wind Speed [m/s]')
        ax3.set_ylabel('Probability of Failure')
        ax3.legend()
        plt.show()

    return df_summary
def plot_dependence(
    idata,
    X=None,
    Y=None,
    kind="pdp",
    xs_interval="linear",
    xs_values=None,
    var_idx=None,
    var_discrete=None,
    samples=50,
    instances=10,
    random_seed=None,
    sharey=True,
    rug=True,
    smooth=True,
    indices=None,
    grid="long",
    color="C0",
    color_mean="C0",
    alpha=0.1,
    figsize=None,
    smooth_kwargs=None,
    ax=None,
):
    """
    Partial dependence or individual conditional expectation plot

    Parameters
    ----------
    idata: InferenceData
        InferenceData containing a collection of BART_trees in sample_stats group
    X : array-like
        The covariate matrix.
    Y : array-like
        The response vector. When it is a named pandas Series the name is used in the y label.
    kind : str
        Whether to plot a partial dependence plot ("pdp") or an individual conditional expectation
        plot ("ice"). Defaults to pdp.
    xs_interval : str
        Method used to compute the values X used to evaluate the predicted function. "linear",
        evenly spaced values in the range of X. "quantiles", the evaluation is done at the specified
        quantiles of X. "insample", the evaluation is done at the values of X.
        For discrete variables these options are omitted.
    xs_values : int or list
        Values of X used to evaluate the predicted function. If ``xs_interval="linear"`` number of
        points in the evenly spaced grid. If ``xs_interval="quantiles"`` quantile or sequence of
        quantiles to compute, which must be between 0 and 1 inclusive.
        Ignored when ``xs_interval="insample"``.
    var_idx : list
        List of the indices of the covariate for which to compute the pdp or ice.
    var_discrete : list
        List of the indices of the covariate treated as discrete.
    samples : int
        Number of posterior samples used in the predictions. Defaults to 50
    instances : int
        Number of instances of X to plot. Only relevant if ice ``kind="ice"`` plots.
    random_seed : int
        random_seed used to sample from the posterior and to pick the instances of an ice plot.
        Defaults to None.
    sharey : bool
        Controls sharing of properties among y-axes. Defaults to True.
    rug : bool
        Whether to include a rugplot. Defaults to True.
    smooth : bool
        If True the result will be smoothed by first computing a linear interpolation of the data
        over a regular grid and then applying the Savitzky-Golay filter to the interpolated data.
        Defaults to True.
    indices : list
        Currently ignored: it is always replaced by the full list of covariate indices.
    grid : str or tuple
        How to arrange the subplots. Defaults to "long", one subplot below the other.
        Other options are "wide", one subplot next to each other or a tuple indicating the number of
        rows and columns.
    color : matplotlib valid color
        Color used to plot the pdp or ice. Defaults to "C0"
    color_mean : matplotlib valid color
        Color used to plot the mean pdp or ice. Defaults to "C0",
    alpha : float
        Transparency level, should in the interval [0, 1].
    figsize : tuple
        Figure size. If None it will be defined automatically.
    smooth_kwargs : dict
        Additional keywords modifying the Savitzky-Golay filter.
        See scipy.signal.savgol_filter() for details. The dictionary is not modified.
    ax : axes
        Matplotlib axes.

    Returns
    -------
    axes: matplotlib axes

    Raises
    ------
    ValueError
        If ``kind``, ``xs_interval`` or ``grid`` is not one of the supported options.
    """
    if kind not in ["pdp", "ice"]:
        raise ValueError(f"kind={kind} is not suported. Available option are 'pdp' or 'ice'")

    if xs_interval not in ["insample", "linear", "quantiles"]:
        raise ValueError(
            f"""{xs_interval} is not suported.
                          Available option are 'insample', 'linear' or 'quantiles'"""
        )

    # An unknown grid would otherwise surface as a NameError once plotting starts.
    if ax is None and not (isinstance(grid, tuple) or grid in ("long", "wide")):
        raise ValueError(f"grid={grid} is not supported. Use 'long', 'wide' or a tuple")

    rng = RandomState(seed=random_seed)

    if isinstance(X, pd.DataFrame):
        X_names = list(X.columns)
        X = X.values
    else:
        X_names = []

    # Only a Series carries a ``name``; a DataFrame does not.
    if isinstance(Y, pd.Series) and Y.name is not None:
        Y_label = f"Predicted {Y.name}"
    else:
        Y_label = "Predicted Y"

    num_covariates = X.shape[1]
    indices = list(range(num_covariates))

    if var_idx is None:
        var_idx = indices
    if var_discrete is None:
        var_discrete = []

    if X_names:
        X_labels = [X_names[idx] for idx in var_idx]
    else:
        X_labels = [f"X_{idx}" for idx in var_idx]

    if xs_interval == "linear" and xs_values is None:
        xs_values = 10
    if xs_interval == "quantiles" and xs_values is None:
        xs_values = [0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95]

    if kind == "ice":
        # Draw from the seeded generator so that random_seed makes the plot reproducible.
        instances = rng.choice(range(X.shape[0]), replace=False, size=instances)

    # Work on a copy so the caller's dictionary is left untouched.
    smooth_kwargs = {} if smooth_kwargs is None else dict(smooth_kwargs)
    smooth_kwargs.setdefault("window_length", 55)
    smooth_kwargs.setdefault("polyorder", 2)

    new_Y = []
    new_X_target = []
    y_mins = []

    new_X = np.zeros_like(X)
    idx_s = list(range(X.shape[0]))
    for i in var_idx:
        # Indices of every covariate except the one under study.
        indices_mi = indices[:]
        indices_mi.pop(i)
        y_pred = []
        if kind == "pdp":
            if i in var_discrete:
                new_X_i = np.unique(X[:, i])
            else:
                if xs_interval == "linear":
                    new_X_i = np.linspace(np.nanmin(X[:, i]), np.nanmax(X[:, i]), xs_values)
                elif xs_interval == "quantiles":
                    new_X_i = np.quantile(X[:, i], q=xs_values)
                elif xs_interval == "insample":
                    new_X_i = X[:, i]

            # Fix covariate i at each evaluation value, keep the others as observed.
            new_X[:, indices_mi] = X[:, indices_mi]
            for x_i in new_X_i:
                new_X[:, i] = x_i
                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 1))
            new_X_target.append(new_X_i)
        else:
            # Fix the other covariates at one instance's values, keep covariate i as observed.
            for instance in instances:
                new_X = X[idx_s]
                new_X[:, indices_mi] = X[:, indices_mi][instance]
                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 0))
            new_X_target.append(new_X[:, i])
        y_mins.append(np.min(y_pred))
        # After the transpose the last axis of a pdp entry runs over the evaluation values,
        # and the first axis of an ice entry runs over the observations.
        new_Y.append(np.array(y_pred).T)

    if ax is None:
        if grid == "long":
            fig, axes = plt.subplots(len(var_idx), sharey=sharey, figsize=figsize)
        elif grid == "wide":
            fig, axes = plt.subplots(1, len(var_idx), sharey=sharey, figsize=figsize)
        else:
            fig, axes = plt.subplots(grid[0], grid[1], sharey=sharey, figsize=figsize)
        axes = np.ravel(axes)
    else:
        axes = [ax]
        fig = ax.get_figure()

    for i, axi in enumerate(axes):
        if i >= len(var_idx):
            # More panels than variables: hide the spare ones.
            axi.set_axis_off()
            fig.delaxes(axi)
            continue

        var = var_idx[i]
        if var in var_discrete:
            if kind == "pdp":
                y_means = new_Y[i].mean(0)
                hdi = az.hdi(new_Y[i])
                axi.errorbar(
                    new_X_target[i],
                    y_means,
                    (y_means - hdi[:, 0], hdi[:, 1] - y_means),
                    fmt=".",
                    color=color,
                )
            else:
                axi.plot(new_X_target[i], new_Y[i], ".", color=color, alpha=alpha)
                axi.plot(new_X_target[i], new_Y[i].mean(1), "o", color=color_mean)
            axi.set_xticks(new_X_target[i])
        elif smooth:
            x_data = np.linspace(np.nanmin(new_X_target[i]), np.nanmax(new_X_target[i]), 200)
            x_data[0] = (x_data[0] + x_data[1]) / 2
            if kind == "pdp":
                interp = griddata(new_X_target[i], new_Y[i].mean(0), x_data)
            else:
                interp = griddata(new_X_target[i], new_Y[i], x_data)

            y_data = savgol_filter(interp, axis=0, **smooth_kwargs)

            if kind == "pdp":
                az.plot_hdi(
                    new_X_target[i], new_Y[i], color=color, fill_kwargs={"alpha": alpha}, ax=axi
                )
                axi.plot(x_data, y_data, color=color_mean)
            else:
                axi.plot(x_data, y_data.mean(1), color=color_mean)
                axi.plot(x_data, y_data, color=color, alpha=alpha)
        else:
            idx = np.argsort(new_X_target[i])
            if kind == "pdp":
                az.plot_hdi(
                    new_X_target[i],
                    new_Y[i],
                    smooth=smooth,
                    color=color,
                    fill_kwargs={"alpha": alpha},
                    ax=axi,
                )
                # Average over the samples first, then order the evaluation points.
                axi.plot(new_X_target[i][idx], new_Y[i].mean(0)[idx], color=color_mean)
            else:
                axi.plot(new_X_target[i][idx], new_Y[i][idx], color=color, alpha=alpha)
                axi.plot(new_X_target[i][idx], new_Y[i][idx].mean(1), color=color_mean)

        if rug:
            lb = np.min(y_mins)
            # A float array keeps the rug height exact when X holds integers.
            axi.plot(X[:, var], np.full_like(X[:, var], lb, dtype=float), "k|")

        axi.set_xlabel(X_labels[i])

    fig.text(-0.05, 0.5, Y_label, va="center", rotation="vertical", fontsize=15)
    return axes
# Posterior incidence band of the time-varying SLAPIR model against the
# reported daily cases.
t_eval = np.arange(0, 180)
inference_data = az.from_cmdstan('../results/outputs/*.csv')
posterior_vars = inference_data.posterior.data_vars

# Index ranges the random (chain, sample) pairs are drawn from.
chains = list(range(18))
samples = list(range(20000))

incidence = []
for _ in range(5000):
    # Pick one posterior draw at random and solve the model with it.
    chain = np.random.choice(chains)
    sample = np.random.choice(samples)
    beta_start = posterior_vars['beta_start'][chain, sample].data
    beta_end = posterior_vars['beta_end'][chain, sample].data
    k = posterior_vars['k'][chain, sample].data
    seir = TimeVaryingSLAPIR(
        t_eval=t_eval,
        beta_start=beta_start,
        beta_end=beta_end,
        k=k,
        m=90,
        init=init,
    )
    # Row 18 of the solution is what gets plotted as incidence.
    incidence.append(seir.jit_solve().y[18, :])

az.plot_hdi(t_eval, incidence, hdi_prob=0.95)
plt.plot(np.arange(0, 180), daily_cases.newcountconfirmed[20:200])
plt.xlabel('Time (days)')
plt.ylabel('Incidence')
plt.tight_layout()
#plt.show()
plt.savefig('../results/plots/model_fit.pdf')
import arviz as az
from scipy import stats as st
import matplotlib.pyplot as plt
import numpy as np
import graphviz

# Simulated data: a straight line with Gaussian noise.
n = 50
theta_0 = 2
theta_1 = 0.5
xs = st.uniform(0, 30).rvs(n)
y_true = theta_0 + theta_1 * xs
noise = st.norm(0, 0.5).rvs(n)
y_obs = y_true + noise

# Bayesian linear regression; from here on theta_0 and theta_1 refer to the
# model's random variables, not the true values above.
with pm.Model() as BRegression:
    theta_0 = pm.Normal('theta_0', mu=0, sigma=10)
    theta_1 = pm.Normal('theta_1', mu=0, sigma=10)
    sigma = pm.HalfCauchy('sigma', 10)
    mu = pm.Deterministic('mu', theta_0 + theta_1 * xs)
    pm.Normal('y_lik', mu=mu, sigma=sigma, observed=y_obs)
    model_trace = pm.sample(draws=5000, tune=2000, cores=1, chains=4)
    pp = pm.sample_posterior_predictive(trace=model_trace)

# Mean posterior prediction, 98% HDI of mu and the observations.
_, axi = plt.subplots(figsize=(12, 5))
predicted_mean = pp['y_lik'].mean(axis=0)
axi.plot(xs, predicted_mean, c='k')
az.plot_hdi(xs, model_trace['mu'], hdi_prob=0.98, ax=axi, color='gray')
axi.scatter(xs, y_obs)
plt.ylabel('y_observed', rotation=0, labelpad=30)

az.plot_posterior(model_trace, var_names=['theta_0', 'theta_1'])
plt.show()
#pm.model_to_graphviz(BRegression).view()