def model_effect(query_var, trace, X, filename='model_effect.svg'):
    # Variables that do not change
    steady_vars = list(X.columns)
    steady_vars.remove(query_var)

    # Linear model that estimates a grade based on the value of the query
    # variable and one sample from the trace
    def lm(value, sample):
        # Prediction is the estimate given a value of the query variable
        prediction = sample['Intercept'] + sample[query_var] * value

        # Each non-query variable is assumed to be at its median value
        for var in steady_vars:
            # Multiply the weight by the median value of the variable
            prediction += sample[var] * X[var].median()

        return prediction

    # Find the minimum and maximum values for the range of the query var
    var_min = X[query_var].min()
    var_max = X[query_var].max()

    # Plot the estimated grade versus the range of the query variable
    pm.plot_posterior_predictive_glm(trace,
                                     eval=np.linspace(var_min, var_max, 100),
                                     lm=lm, samples=100, color='blue',
                                     alpha=0.4, lw=2)

    # Plot formatting (labels translated from Swedish; meritvärde = merit rating)
    plt.xlabel('%s' % query_var, size=16)
    plt.ylabel('Merit rating', size=16)
    plt.title("Correlation of merit rating vs %s" % query_var, size=18)
    plt.savefig(filename, format='svg')
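# Hedged usage sketch for model_effect above, on synthetic data. The column
# names, the 'grade' response, and the fitted formula are hypothetical, not
# from the original snippet; sizes are kept small so the sketch runs quickly.
import numpy as np
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
X = pd.DataFrame({'absences': rng.randint(0, 20, 100),
                  'studytime': rng.randint(1, 5, 100)})
data = X.assign(grade=200 + 5 * X['studytime'] - 2 * X['absences']
                      + rng.normal(0, 5, 100))

with pm.Model():
    # Default GLM: normal likelihood, one coefficient per formula term,
    # with the intercept named 'Intercept' as model_effect expects
    pm.GLM.from_formula('grade ~ absences + studytime', data)
    trace = pm.sample(500, cores=1, progressbar=False)

model_effect('studytime', trace, X, filename='studytime_effect.svg')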
def logistic_bayesian(x, y, number_samples):
    '''Generates a Bayesian linear regression (despite the function's name)
    with uninformative priors: sigma is Half-Cauchy, intercept and
    coefficient are standard normal.

    Returns a dict with the model (usable for comparison), the trace,
    and the main plots.'''
    # Set container
    result = dict.fromkeys(["model", "trace", "posterior_pred", "plot_param",
                            "plot_data", "plot_uncertainty"], None)

    with pm.Model() as model:
        # Define priors
        sigma = pm.HalfCauchy('sigma', beta=10, testval=1.)
        intercept = pm.Normal('Intercept', 0, sigma=1)
        beta_1 = pm.Normal('x', 0, sigma=1)

        # Define likelihood
        likelihood = pm.Normal('y', intercept + beta_1 * x,
                               sigma=sigma, observed=y)

        # Inference: draw posterior samples using NUTS sampling
        trace = pm.sample(number_samples, cores=2)
        result["trace"] = trace

        posterior_pred = pm.sample_posterior_predictive(trace)
        result["posterior_pred"] = posterior_pred

    result["model"] = model

    plot_param = plt.figure(figsize=(7, 7))
    pm.traceplot(trace[100:])
    plt.tight_layout()
    result["plot_param"] = plot_param

    plot_data = plt.figure(figsize=(7, 7))
    plt.plot(x, y, 'x', label='data')
    pm.plot_posterior_predictive_glm(trace, samples=3000,
                                     label='posterior predictive regression lines',
                                     eval=np.linspace(0, 250, 50))
    plt.title('Posterior predictive regression lines')
    plt.legend(loc=0)
    plt.xlabel('x')
    plt.ylabel('y')
    result["plot_data"] = plot_data

    return result
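# Hedged usage sketch for the function above on synthetic linear data. The
# eval range hard-coded inside the function assumes x spans roughly 0-250,
# so the fake data is generated on that scale; all numbers are illustrative.
import numpy as np

rng = np.random.RandomState(1)
x_demo = rng.uniform(0, 250, 80)
y_demo = 0.5 + 0.8 * x_demo + rng.normal(0, 10, 80)

out = logistic_bayesian(x_demo, y_demo, number_samples=1000)
out["plot_data"]  # figure with the data and posterior predictive lines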
def model_effect(query_var, trace, X):
    '''
    Given a response with multiple variables, show the effect of changing
    one variable while keeping all others at their median.

    Params: variable name, trace, and data
    '''
    # Variables that do not change
    steady_vars = list(X.columns)
    steady_vars.remove(query_var)

    def lm(value, sample):
        '''
        Estimate (as a linear model) the response based on the value of
        the query variable and one sample from the trace.
        '''
        # Prediction is the estimate given a value of the query variable
        prediction = sample['Intercept'] + sample[query_var] * value

        # Each non-query variable is assumed to be at the median value
        for var in steady_vars:
            # Multiply the weight by the median value of the variable
            prediction += sample[var] * X[var].median()

        return prediction

    plt.figure(figsize=(6, 6))

    # Find the minimum and maximum values for the range of the query var
    var_min = X[query_var].min()
    var_max = X[query_var].max()

    # Plot the estimated grade versus the range of the query variable
    pm.plot_posterior_predictive_glm(trace,
                                     eval=np.linspace(var_min, var_max, 100),
                                     lm=lm, samples=100, color='blue',
                                     alpha=0.4, lw=2)

    # Plot formatting
    plt.xlabel('%s' % query_var, size=16)
    plt.ylabel('Grade', size=16)
    plt.title("Posterior of Grade vs %s" % query_var, size=18)
    plt.show()
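# Hedged usage sketch: sweep model_effect over every predictor in turn,
# assuming X and trace come from a fit like the example after the first
# model_effect variant above (each call opens its own figure).
for col in X.columns:
    model_effect(col, trace, X)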
def posteriorPlots(self, final_col_list, trace):
    X_train_copy = self.X_train.drop(columns='G3')

    for i in final_col_list:
        # Variables that do not change
        constant_vars = list(X_train_copy.columns)
        constant_vars.remove(i)

        # Linear model that estimates a grade based on the value of the
        # query variable and one sample from the trace
        def lm(value, sample):
            # Prediction is the estimate given a value of the query variable
            prediction = sample['Intercept'] + sample[i] * value

            # Each non-query variable is assumed to be at the median value
            for var in constant_vars:
                # Multiply the weight by the median value of the variable
                prediction += sample[var] * X_train_copy[var].median()

            return prediction

        plt.figure(figsize=(7, 7))

        # Find the minimum and maximum values for the range of variable i
        var_min = X_train_copy[i].min()
        var_max = X_train_copy[i].max()

        pm.plot_posterior_predictive_glm(
            trace,
            samples=300,
            eval=np.linspace(var_min, var_max, 100),
            lm=lm,
            label='posterior predictive regression lines',
            lw=3.,
            c='r')

        # Plot formatting
        plt.xlabel('%s' % i, size=16)
        plt.ylabel('Grade', size=16)
        plt.title("Posterior of Grade vs %s" % i, size=18)
        plt.show()
size = 100
true_intercept = 1
true_slope = 2

x = np.linspace(0, 1, size)
# y = a + b*x
true_regression_line = true_intercept + true_slope * x
# add noise
y = true_regression_line + np.random.normal(scale=.5, size=size)

# Add outliers
x_out = np.append(x, [.1, .15, .2])
y_out = np.append(y, [8, 6, 9])

data = dict(x=x_out, y=y_out)

with pm.Model() as model:
    family = pm.glm.families.StudentT()
    pm.glm.GLM.from_formula('y ~ x', data, family=family)
    trace = pm.sample(2000, cores=2)

plt.figure(figsize=(7, 5))
plt.plot(x_out, y_out, 'x', label='data')
pm.plot_posterior_predictive_glm(trace, samples=100,
                                 label='posterior predictive regression lines')
plt.plot(x, true_regression_line, label='true regression line', lw=3., c='y')
plt.legend()
plt.show()
y_obs = pm.Normal('yhat',
                  mu=(pm.Normal('intercept', mu=0, sd=10)
                      + pm.Normal('theta', mu=0, sd=10) * X),
                  sd=std,
                  observed=y.values)

trace = pm.sample(1000)

pm.traceplot(trace)
plt.show()

sns.lmplot('Duration', 'Calories', df, fit_reg=False)
pm.plot_posterior_predictive_glm(
    trace,
    samples=100,
    eval=base,
    linewidth=.3,
    color='r',
    alpha=0.8,
    label='Bayesian Posterior Predictive',
    lm=lambda x, sample: sample['intercept'] + sample['theta'] * x)
plt.plot(base, yhat, color='black', linestyle='dashed',
         label='Ordinary Least Squares')
plt.legend()
plt.show()

# Prediction
x = 20
ols_yhat = predict_ols(x, intercept, theta)
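# Roughly what pm.plot_posterior_predictive_glm does: pick random draws from
# the trace and plot lm(eval, sample) for each. A minimal hand-rolled sketch
# under the same 'intercept'/'theta' naming as above; styling is arbitrary.
import numpy as np
import matplotlib.pyplot as plt

def plot_ppc_lines_by_hand(trace, eval_pts, n=100):
    for i in np.random.randint(0, len(trace), n):
        point = trace[int(i)]  # one posterior draw as a dict-like point
        plt.plot(eval_pts,
                 point['intercept'] + point['theta'] * eval_pts,
                 color='r', alpha=0.1, lw=0.3)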
def create_data(x, intercept, gradient, mu, sigma):
    # NOTE: reconstructed signature -- this fragment began mid-function;
    # the parameter names follow the call site below.
    y = intercept + gradient * x + np.random.normal(loc=mu, scale=sigma,
                                                    size=x.size)
    return y


if __name__ == '__main__':
    args = parser.parse_args()
    true_intercept, true_gradient, mu_data, sigma_data = \
        args.intercept, args.gradient, args.mu, args.sigma

    x = np.linspace(0, 10, 100)
    data = create_data(x, true_intercept, true_gradient, mu_data, sigma_data)

    with pm.Model() as poly_model:
        intercept = pm.Uniform('Intercept', -10, 10)
        x_coeff = pm.Uniform('x', -10, 10)
        sigma = pm.Uniform('sigma', 0, 20)
        y_obs = pm.Normal('y_obs', mu=intercept + x_coeff * x,
                          sigma=sigma, observed=data)
        trace = pm.sample(args.samples, cores=4)

    print(pm.summary(trace).round(2))
    pm.traceplot(trace)
    plt.show()

    plt.plot(x, data, 'x')
    pm.plot_posterior_predictive_glm(trace, eval=x)
    plt.show()
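# Hedged reconstruction: 'parser' is used above but never defined in this
# fragment. A minimal argparse setup consistent with the attributes accessed
# (intercept, gradient, mu, sigma, samples); the defaults are hypothetical.
import argparse

parser = argparse.ArgumentParser(description='Bayesian linear regression demo')
parser.add_argument('--intercept', type=float, default=1.0)
parser.add_argument('--gradient', type=float, default=2.0)
parser.add_argument('--mu', type=float, default=0.0)
parser.add_argument('--sigma', type=float, default=1.0)
parser.add_argument('--samples', type=int, default=2000)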
data = dict(x=x_out, y=y_out)

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, xlabel="x", ylabel="y",
                     title="Generated data and underlying model")
ax.plot(x_out, y_out, "x", label="sampled data")
ax.plot(x, true_regression_line, label="true regression line", lw=2.0)
plt.legend(loc=0)
# plt.show()

with pm.Model() as model:
    pm.glm.GLM.from_formula("y ~ x", data)
    trace = pm.sample(2000, tune=2000, cores=16)

plt.figure(figsize=(7, 5))
plt.plot(x_out, y_out, "x", label="data")
pm.plot_posterior_predictive_glm(trace, samples=100,
                                 label="posterior predictive regression lines")
plt.plot(x, true_regression_line, label="true regression line", lw=3.0, c="y")
plt.legend(loc=0)

with pm.Model() as model_robust:
    family = pm.glm.families.StudentT()
    pm.glm.GLM.from_formula("y ~ x", data, family=family)
    trace_robust = pm.sample(2000, tune=2000, cores=16)

plt.figure(figsize=(7, 5))
plt.plot(x_out, y_out, "x")
pm.plot_posterior_predictive_glm(trace_robust,
                                 label="posterior predictive regression lines")
plt.plot(x, true_regression_line, label="true regression line", lw=3.0, c="y")
plt.legend()
plt.show()
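# Hedged follow-up to the two fits above: compare the slope estimates
# numerically. var_names is the arviz-style keyword in recent pymc3 releases
# (older ones used varnames); the expectation that the Student-t slope sits
# closer to the true value holds for outlier data like this, not in general.
print(pm.summary(trace, var_names=['x']).round(2))
print(pm.summary(trace_robust, var_names=['x']).round(2))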
%matplotlib inline
from scipy.stats import norm, uniform  # frozen distributions used below

plt.rcParams["figure.figsize"] = (10, 5)
np.random.seed(42)

# Prepare the data
x = uniform(0, 20).rvs(30)
eps = norm(0, 4).rvs(30)
y = 11 + 3*x + eps

# Add a single outlier
x = np.append(x, 20)
y = np.append(y, 11 + 8*20)

# Sampling
with pm.Model() as model:
    b_0 = pm.Normal("b_0", mu=0, sd=10)
    b_1 = pm.Normal("b_1", mu=0, sd=2)
    e = pm.HalfCauchy("e", 2)

    mu = pm.Deterministic("mu", b_0 + b_1*x)
    Y = pm.Normal("Y", mu=mu, sd=e, observed=y)

    trace = pm.sample(10000)

# Plotting
plt.scatter(x, y)
pm.plot_posterior_predictive_glm(
    trace=trace,
    samples=200,
    eval=np.linspace(0, 20, 100),
    lm=lambda x, sample: sample["b_0"] + sample["b_1"]*x,
    alpha=.1,
    color="red"
)
plt.savefig("./results/4-15-posterior-hpd-outlier.png")
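# Hedged follow-up: the Deterministic 'mu' above stores one regression mean
# per draw, so a credible band can be drawn directly from its samples. The
# 89% percentile levels are illustrative, not from the original snippet.
mu_samples = trace["mu"]                          # shape (draws, len(x))
lo, hi = np.percentile(mu_samples, [5.5, 94.5], axis=0)
order = np.argsort(x)
plt.fill_between(x[order], lo[order], hi[order], color="red", alpha=0.2)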
                      observed=y)

    # Inference: draw 3000 posterior samples using NUTS sampling
    trace = pm.sample(3000, cores=2, tune=2000)

# %% slideshow={"slide_type": "slide"}
axes = pm.traceplot(trace[100:], figsize=(12, 7))

# %% slideshow={"slide_type": "slide"}
plt.figure(figsize=(7, 7))
plt.plot(x, y, 'rx', label='data')
generating_fun = lambda x, sample: sample['Intercept'] + sample['slope'] * x
pm.plot_posterior_predictive_glm(trace, lm=generating_fun, samples=100,
                                 label='posterior predictive regression lines')
plt.plot(x, true_regression_line, label='true regression line', lw=3., c='y')
plt.title('Posterior predictive regression lines')
plt.legend(loc=0)
plt.xlabel('x')
plt.ylabel('y')

# %% [markdown] slideshow={"slide_type": "slide"}
# ## Well, that seemed complicated -- but worth it?
#
# Advantages:
#
# 1. Estimate of $\sigma$
#
with model as linear_model:
    I = pc.Normal('I', mu=0, sd=10)
    S = pc.Normal('S', mu=0, sd=10)
    SD = pc.HalfNormal('SD', sd=10)
    mean = I + S * X.iloc[0:20, 0].values
    y_bayes_pred = pc.Normal('y_bayes_pred', mu=mean, sd=SD,
                             observed=Y.iloc[0:20, 0].values)
    step = pc.NUTS()
    linear_trace = pc.sample(100, step)

pc.plot_posterior_predictive_glm(
    linear_trace,
    samples=200,
    eval=test,
    color='green',
    alpha=0.7,
    linewidth=1,
    lm=lambda x, sample: sample['I'] + sample['S'] * x,
    label='Bayesian Linear Regression')
plt.scatter(X[0], Y, color='pink')
plt.plot(X[0], y_pred, label='MLE Linear Regression', color='orange')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend(loc='best')
plt.show()

bayes_prediction = linear_trace['I'] + linear_trace['S'] * 0.8158
sns.kdeplot(bayes_prediction, label='Bayesian Estimate')
plt.vlines(x=0.8158 * slope + intercept, color='orange',
def linear_bayesian_flat_prior(x, y, number_samples=3000):
    '''Generates a Bayesian linear regression with half-flat priors and a
    Gaussian likelihood.

    Returns a dict with the model (usable for comparison), the trace,
    and the main plots.'''
    # Set container
    result = dict.fromkeys([
        "model", "trace", "posterior_pred", "plot_param", "plot_data",
        "plot_uncertainty"
    ], None)

    try:
        with pm.Model() as model:
            # Define priors
            # sigma = pm.HalfFlat('sigma')
            # intercept = pm.distributions.continuous.HalfFlat('Intercept')
            # beta_1 = pm.distributions.continuous.HalfFlat('x')
            sigma = pm.HalfNormal('sigma', sigma=0.4)
            intercept = pm.Bound(pm.HalfFlat, lower=100, upper=1000)('Intercept')
            beta_1 = pm.Bound(pm.HalfFlat, upper=100)('beta_1')

            # Define likelihood
            likelihood = pm.Normal('y', mu=intercept + beta_1 * x,
                                   sigma=sigma, observed=y)

            # Inference: draw posterior samples using NUTS sampling
            trace = pm.sample(number_samples, cores=2)
            result["trace"] = trace

            posterior_pred = pm.sample_posterior_predictive(trace)
            result["posterior_pred"] = posterior_pred

        result["model"] = model

        plot_param = plt.figure(figsize=(7, 7))
        pm.traceplot(trace)
        plt.tight_layout()
        result["plot_param"] = plot_param

        plot_data = plt.figure(figsize=(7, 7))
        plt.plot(x, y, label='data')
        pm.plot_posterior_predictive_glm(
            trace,
            samples=number_samples,
            label='posterior predictive regression lines',
            eval=np.linspace(0, np.max(x), 50),
            lm=lambda x, trace: trace['Intercept'] + trace['beta_1'] * x)
        plt.title('Posterior predictive regression lines')
        plt.legend(loc=0)
        plt.xlabel('x')
        plt.ylabel('y')
        result["plot_data"] = plot_data

        return result
    except RuntimeError:
        return np.NaN
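# Hedged setup for the cells below: x_trial and y_trial are not defined in
# this fragment, so here is hypothetical synthetic data on a scale that
# respects the bounded priors above (intercept in [100, 1000], slope < 100,
# noise near the HalfNormal(0.4) sigma prior).
import numpy as np

rng = np.random.RandomState(2)
x_trial = np.sort(rng.uniform(0, 10, 60))
y_trial = 150 + 20 * x_trial + rng.normal(0, 0.4, 60)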
#%%
a = linear_bayesian_flat_prior(x=x_trial, y=y_trial)

#%%
trace = a['trace']
plt.plot(x_trial, y_trial)
pm.plot_posterior_predictive_glm(
    trace,
    samples=6000,
    label='posterior predictive regression lines',
    eval=np.linspace(0, np.max(x_trial), 50),
    lm=lambda x, trace: trace['Intercept'] + trace['beta_1'] * x)
fig, ax = plt.subplots(figsize=(7, 7))
ax.errorbar(df['x'].values, df['y'].values, fmt='ro',
            yerr=df['y_error'].values, xerr=df['x_error'].values,
            ecolor='black')

# %%
with pm.Model() as model_robust:
    family = pm.glm.families.StudentT()
    pm.glm.GLM.from_formula('y ~ x', df, family=family)
    trace_robust = pm.sample(40000, cores=2)

# %% {"scrolled": false}
pm.plot_trace(trace_robust)

# %%
fig = plt.figure(figsize=(10, 7))
pm.plot_posterior_predictive_glm(trace_robust,
                                 label='posterior predictive regression lines')
ax = fig.axes[0]
ax.errorbar(df['x'].values, df['y'].values, fmt='ro',
            yerr=df['y_error'].values, xerr=df['x_error'].values,
            ecolor='black')

# %%
                      random_seed=42, progressbar=True)
    return trace


if __name__ == "__main__":
    beta_0 = 1.0
    beta_1 = 2.0
    N = 200
    eps_sigma_sq = 0.5

    df = simulate_linear_data(N, beta_0, beta_1, eps_sigma_sq)
    sns.lmplot(x="x", y="y", data=df, size=10)
    plt.xlim(0.0, 1.0)

    trace = glm_mcmc_inference(df, iterations=5000)
    pm.traceplot(trace[500:])
    plt.show()

    sns.lmplot(x="x", y="y", data=df, size=10, fit_reg=False)
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 4.0)

    pm.plot_posterior_predictive_glm(trace, samples=100)
    x = np.linspace(0, 1, N)
    y = beta_0 + beta_1 * x
    plt.plot(x, y, label="True Regression Line", lw=3., c="green")
    plt.legend(loc=0)
    plt.show()
def create_data(x, a, b, c, mu, sigma):
    # NOTE: reconstructed signature -- this fragment began mid-function;
    # the parameter names follow the call site below.
    x = np.asarray(x)
    y = a * x**2 + b * x + c + np.random.normal(loc=mu, scale=sigma,
                                                size=x.size)
    return y


if __name__ == '__main__':
    args = parser.parse_args()
    true_a, true_b, true_c, mu_data, sigma_data = \
        args.a, args.b, args.c, args.mu, args.sigma

    x = np.linspace(0, 1, 1000)
    data = create_data(x, true_a, true_b, true_c, mu_data, sigma_data)

    with pm.Model() as poly_model:
        a = pm.Uniform('a', -10, 10)
        b = pm.Uniform('b', -10, 10)
        c = pm.Uniform('c', -10, 10)
        sigma = pm.Uniform('sigma', 0, 20)
        y_obs = pm.Normal('y_obs', mu=a * x**2 + b * x + c,
                          sigma=sigma, observed=data)
        trace = pm.sample(args.samples, cores=4)

    print(pm.summary(trace, credible_interval=0.95).round(2))
    # pm.plot_posterior(trace)
    # pm.pairplot(trace)
    plt.show()

    plt.plot(x, data, 'x')
    pm.plot_posterior_predictive_glm(
        trace,
        lm=lambda x, sample: sample['a'] * x**2 + sample['b'] * x + sample['c'],
        eval=x)
    plt.show()
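# Hedged follow-up: a single point-estimate curve from the posterior means,
# drawn next to the sampled quadratics above; purely illustrative.
a_m, b_m, c_m = trace['a'].mean(), trace['b'].mean(), trace['c'].mean()
plt.plot(x, data, 'x')
plt.plot(x, a_m * x**2 + b_m * x + c_m, 'k--', lw=2,
         label='posterior-mean fit')
plt.legend(loc=0)
plt.show()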
    beta_1 = pm.Normal('x', 0, sigma=1)

    # Define likelihood
    likelihood = pm.Normal('y', mu=intercept + beta_1 * x,
                           sigma=sigma, observed=y)

    # Inference: draw 3000 posterior samples using NUTS sampling
    trace = pm.sample(3000, cores=2)

    posterior_pred = pm.sample_posterior_predictive(trace)

#%%
plt.figure(figsize=(7, 7))
pm.traceplot(trace[100:])
plt.tight_layout()

#%%
plt.figure(figsize=(7, 7))
plt.plot(x, y, 'x', label='data')
pm.plot_posterior_predictive_glm(trace, samples=3000,
                                 label='posterior predictive regression lines',
                                 eval=np.linspace(0, 250, 50))
plt.title('Posterior predictive regression lines')
plt.legend(loc=0)
plt.xlabel('x')
plt.ylabel('y')
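#%%
# Hedged sketch: posterior_pred above is sampled but never used; its 'y'
# draws (keyed by the observed variable's name) can give a predictive band
# around the regression lines. The 95% percentile levels are illustrative.
y_draws = posterior_pred['y']                    # shape (draws, len(y))
lo, hi = np.percentile(y_draws, [2.5, 97.5], axis=0)
order = np.argsort(np.asarray(x))
plt.fill_between(np.asarray(x)[order], lo[order], hi[order], alpha=0.2)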
fig.suptitle(f'Linear regression from scratch\n'
             f'f(x) = y[0] + x*y[1] | true values y[0] = {y[0]}, y[1] = {y[1]}')
ax[0, 0].set(ylabel='y[0]', title='y[0] chain')
ax[1, 0].set(ylabel='y[1]', xlabel='Iteration', title='y[1] chain')
ax[0, 1].set(ylabel='probability', title='y[0] posterior')
ax[1, 1].set(ylabel='probability', title='y[1] posterior')
bx.set(xlabel='x', ylabel='f(x)')
bx.legend()

if run_pymc3:
    # Plot the pymc3 results for confirmation.
    # Use the built-in traceplot functionality to visualize the posteriors.
    lines = [('y0', {}, [y[0]]), ('y1', {}, [y[1]])]  # This API is very cumbersome!
    pymc3.traceplot(pymc_chain[100:], lines=lines)

    # Now make a plot, similar to the from-scratch plot above, of the
    # family of lines picked from the posterior.
    plt.figure(figsize=(7, 7))
    plt.plot(x, y_obs_noise, 'x', label='True+noise')
    pymc3.plot_posterior_predictive_glm(
        pymc_chain, samples=100, eval=x,
        lm=lambda x, sample: sample['y0'] + sample['y1']*x,
        label='posterior predictive regression lines')
    plt.plot(x, y[0] + y[1]*x, label='True', lw=3., c='y')
    plt.title('Posterior predictive regression lines')
    plt.legend(loc=0)
    plt.xlabel('x')
    plt.ylabel('y')

    # Lastly, make a pairplot, i.e. a corner plot
    pymc3.plot_joint(pymc_chain, figsize=(5, 5), kind="hexbin")

plt.show()
# badly initialized. In the case of a complex model that is hard for NUTS,
# Metropolis, while faster, will have a very low effective sample size or not
# converge properly at all. A better approach is to instead try to improve
# initialization of NUTS, or reparameterize the model.

# Plot the posterior distribution of our parameters and the individual
# samples we drew.
plt.figure(figsize=(7, 7))
pm.traceplot(trace[:])
# Why two lines/colors for each distribution even though I specified 1 core/chain?

# Plot regression lines
plt.figure(figsize=(7, 7))
plt.plot(x, y, 'x', label='data')
pm.plot_posterior_predictive_glm(
    trace,
    samples=250,
    alpha=.25,
    lm=lambda x, sample: sample['intercept'] + sample['x_coeff'] * x,
    label='posterior predictive regression lines')
# The lm argument above specifies the linear model, and uses the same
# variable names assigned in the priors above.
# plt.plot(x, true_regression_line, label='true regression line', lw=1., c='y')
plt.title('Posterior predictive regression lines')
plt.legend(loc=0)
plt.xlabel('x')
plt.ylabel('y')

# Get credible intervals
pm.forestplot(trace)

with pm.Model() as model:
    # Model specifications in PyMC3 are wrapped in a with-statement
    # Define priors
# Posterior distribution
linear_trace = pm.sample(1000, step)

pm.traceplot(linear_trace, figsize=(12, 12))
# plt.show()

pm.plot_posterior(linear_trace, figsize=(12, 10), text_size=20)
# plt.show()

pm.forestplot(linear_trace)
# plt.show()

plt.figure(figsize=(8, 8))
pm.plot_posterior_predictive_glm(
    linear_trace,
    samples=100,
    eval=np.linspace(2, 30, 100),
    linewidth=1,
    color='red',
    alpha=0.8,
    label='Bayesian Posterior Fits',
    lm=lambda x, sample: sample['Intercept'] + sample['slope'] * x)
plt.scatter(X['Duration'], y.values, s=12, alpha=0.8, c='blue',
            label='Observations')
plt.plot(X['Duration'], by_hand_coefs[0] + X['Duration'] * by_hand_coefs[1],
         'k--', label='OLS Fit', linewidth=1.4)
    with basic_model:
        pm.GLM.from_formula('y ~ x', df)
        start = pm.find_MAP()
        step = pm.NUTS()  # Use the No-U-Turn Sampler
        trace = pm.sample(iterations, step, start,
                          random_seed=42, progressbar=True)  # Calculate the trace
    return trace


if __name__ == "__main__":
    df = pd.read_csv('LifeExpectancy.csv', names=['y', 'x'])
    sns.lmplot(x='x', y='y', data=df, size=10, ci=None)
    plt.show()

    trace = glm_mcmc_inference(df, iterations=5000)
    pm.traceplot(trace[500:])
    plt.show()

    # Plot a sample of posterior regression lines
    sns.lmplot(x='x', y='y', data=df, ci=None, size=10, fit_reg=True)
    pm.plot_posterior_predictive_glm(
        trace, samples=100, c='lightgreen',
        label='Posterior predictive regression lines')
    plt.legend(loc=0)
    plt.show()