def main():
    """Infer a single site's conversion rate from simulated Bernoulli data.

    Simulates N Bernoulli observations with a known rate, builds a
    Uniform-prior / Bernoulli-likelihood PyMC model, samples it with MCMC and
    plots a histogram of the trace of ``p`` with the true rate marked.
    Prints the raw draws and summary numbers to stdout; returns nothing.
    """
    # The parameters are the bounds of the Uniform.
    p = pm.Uniform('p', lower=0, upper=1)

    # set constants
    p_true = 0.05  # remember, this is unknown.
    N = 1500

    # sample N Bernoulli random variables from Ber(0.05).
    # each random variable has a 0.05 chance of being a 1.
    # this is the data-generation step
    occurrences = pm.rbernoulli(p_true, N)

    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.
    print(occurrences)
    print(occurrences.sum())

    # Occurrences.mean is equal to n/N.
    print("What is the observed frequency in Group A? %.4f" % occurrences.mean())
    print("Does this equal the true frequency? %s" % (occurrences.mean() == p_true))

    # include the observations, which are Bernoulli
    obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

    # To be explained in chapter 3
    mcmc = pm.MCMC([p, obs])
    mcmc.sample(18000, 1000)

    plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
    plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
    # NOTE(review): recent matplotlib releases dropped hist's `normed` keyword
    # in favour of `density` -- confirm against the installed version.
    plt.hist(mcmc.trace("p")[:], bins=25, histtype="stepfilled", normed=True)
    plt.legend()
    plt.show()
def ages_and_data(N_exam, f_samp, correction_factor_array, age_lims):
    """Called by pred_samps. Simulates ages of survey participants and data given f.

    Parameters
    ----------
    N_exam : sequence
        Number of individuals to simulate at each location (indexed like f_samp).
    f_samp : sequence
        One entry per location; each entry is ravelled and passed through
        ``pm.invlogit``.
    correction_factor_array : 2-d array
        Indexed as ``[age, sample]``; one column is picked at random per location.
    age_lims : sequence
        Per location, a pair whose first two items are the inclusive lower and
        upper age bounds.

    Returns
    -------
    (A, pos, age_distribution) : three lists with one entry per location --
        the simulated ages, the simulated Bernoulli outcomes, and the
        normalised age distribution used to draw the ages.

    Reads the module-level ``S_trace`` but never modifies it.
    """
    N_samp = len(f_samp)
    N_age_samps = correction_factor_array.shape[1]

    # Get samples for the age distribution at the observation points.
    age_distribution = []
    for i in range(N_samp):
        lower, upper = age_lims[i][0], age_lims[i][1]
        # Take an explicit float copy: slicing an ndarray yields a view, and
        # normalising that view in place would overwrite the stored trace.
        dist = np.array(
            S_trace[np.random.randint(S_trace.shape[0]), 0, lower:upper + 1],
            dtype=float)
        dist /= np.sum(dist)
        age_distribution.append(dist)

    # Draw age for each individual, draw an age-correction profile for each location,
    # compute probability of positive for each individual, see how many individuals are
    # positive.
    A = []
    pos = []
    for s in range(N_samp):
        ages = np.array(
            pm.rcategorical(age_distribution[s], size=N_exam[s]),
            dtype=int) + age_lims[s][0]
        A.append(ages)
        profile = correction_factor_array[:, np.random.randint(N_age_samps)]
        P_samp = pm.invlogit(f_samp[s].ravel()) * profile[ages]
        pos.append(pm.rbernoulli(P_samp))

    return A, pos, age_distribution
def main():
    """Infer a single site's conversion rate from simulated Bernoulli data.

    Simulates N Bernoulli observations with a known rate, builds a
    Uniform-prior / Bernoulli-likelihood PyMC model, samples it with MCMC and
    plots a histogram of the trace of ``p`` with the true rate marked.
    Prints the raw draws and summary numbers to stdout; returns nothing.
    """
    # The parameters are the bounds of the Uniform.
    p = pm.Uniform('p', lower=0, upper=1)

    # set constants
    p_true = 0.05  # remember, this is unknown.
    N = 1500

    # sample N Bernoulli random variables from Ber(0.05).
    # each random variable has a 0.05 chance of being a 1.
    # this is the data-generation step
    occurrences = pm.rbernoulli(p_true, N)

    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.
    print(occurrences)
    print(occurrences.sum())

    # Occurrences.mean is equal to n/N.
    print("What is the observed frequency in Group A? %.4f" % occurrences.mean())
    print("Does this equal the true frequency? %s" % (occurrences.mean() == p_true))

    # include the observations, which are Bernoulli
    obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

    # To be explained in chapter 3
    mcmc = pm.MCMC([p, obs])
    mcmc.sample(18000, 1000)

    plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
    plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
    # NOTE(review): recent matplotlib releases dropped hist's `normed` keyword
    # in favour of `density` -- confirm against the installed version.
    plt.hist(mcmc.trace("p")[:], bins=25, histtype="stepfilled", normed=True)
    plt.legend()
    plt.show()
def dice(data=None):
    """Build a Bernoulli model of die-roll outcomes and return its pieces.

    When ``data`` is omitted, 100 draws from Bernoulli(1/6) are simulated and
    used as the observations. The function's local namespace is returned as a
    dict, so the local names below are part of the result and must not be
    renamed.
    """
    x = data if data is not None else [pymc.rbernoulli(1.0 / 6.0) for i in range(0, 100)]

    # Uniform prior on the success probability, observed Bernoulli likelihood.
    prob = pymc.Uniform('prob', lower=0, upper=1)
    d = pymc.Bernoulli('d', p=prob, value=x, observed=True)

    return locals()
def DoSamplingXPrior(self, w, a, M=None):
    """Draw one sample of x from its prior.

    Each of the xLen entries is independently kept with probability ``w``
    (otherwise it is zero); kept entries take the value ``-a * log(1 - U)``
    with ``U`` uniform on [0, 1), i.e. the non-negative half of a Laplace
    inverse CDF with scale ``a``.

    Parameters:
        w: probability, in [0, 1], that an entry is non-zero.
        a: scale of the non-zero part.
        M: length of x. If None, the size of the last element of
           ``self.xSeq`` is used.

    Returns:
        Array of shape (xLen, 1).

    Raises:
        UnboundLocalError: if M is None and the length cannot be read from
            ``self.xSeq``.
        AssertionError: if w is outside [0, 1].
    """
    if M is not None:
        xLen = M
    else:
        try:
            xLast = self.xSeq[-1]
            xLen = xLast.size
        except Exception:
            # Exception (rather than a bare except) so that KeyboardInterrupt
            # and SystemExit are not converted into UnboundLocalError.
            raise UnboundLocalError('Unable to determine length of x')

    assert (w >= 0) and (w <= 1), __file__ + ': DoSamplingXPrior: w is out of bounds'

    # True with probability w: marks the entries that stay non-zero.
    bIsNonzero = pymc.rbernoulli(w, (xLen, 1))

    def FLaplaceDistInv(x):
        # Inverse CDF of a zero-mean Laplace distribution with scale a.
        return -a * np.sign(x - 0.5) * np.log(1 - 2 * np.abs(x - 0.5))

    # Feeding only values in [0.5, 1) restricts the draw to the non-negative half.
    plazeSample = FLaplaceDistInv(0.5 + 0.5 * np.random.uniform(size=(xLen, 1)))

    xSample = bIsNonzero * plazeSample
    return xSample
def Main():
    """Simulate Bernoulli data, fit a Uniform/Bernoulli model by MCMC, plot it.

    Prints the simulated draws, their count and their sum, then hands the
    sampler and the true rate to ``plot_posteriors``.
    """
    # Prior first: the order of the PyMC calls is kept as-is.
    p = pm.Uniform('p', lower=0, upper=1)

    # True parameters, known here only because the data are simulated.
    true_rate = 0.05
    n_draws = 1_500

    # Fake data generated from the true parameters.
    draws = pm.rbernoulli(true_rate, n_draws)
    print(draws)
    print(len(draws), draws.sum())

    # Observation random variable tied to the simulated data.
    obs = pm.Bernoulli("obs", p, value=draws, observed=True)

    # Solve using MCMC.
    sampler = pm.MCMC([p, obs])
    sampler.sample(18_000, 1_000)

    plot_posteriors(sampler, true_rate)
def trees_to_diagnostics(brt_evaluator, fname, species_name, n_pseudopresences, n_pseudoabsences, config_filename):
    """
    Takes the BRT evaluator and sees how well it does at predicting the training dataset.

    Reads ``anopheles-caches/<fname>``, evaluates ``brt_evaluator`` on the
    columns named in ``brt_evaluator.nice_tree_dict``, prints the fraction of
    records classified correctly at a 0.5 threshold, and writes three files
    into the result directory for ``config_filename``:
    ``simple-diagnostics.txt``, ``roc.pdf`` and ``roc.csv``.
    Returns nothing.
    """
    from diagnostics import simple_assessments, roc, plot_roc_

    din = csv2rec(os.path.join('anopheles-caches', fname))
    found = din.found
    din = dict([(k, din[k]) for k in brt_evaluator.nice_tree_dict])
    probs = pm.flib.invlogit(brt_evaluator(din))

    # `1 - found` instead of `True - found`: same 0/1 values, but also valid
    # when `found` is a boolean array.
    n_correct = ((probs > .5) * found + (probs < .5) * (1 - found)).sum()
    print('Species %s: fraction %f correctly classified.' % (species_name, n_correct / float(len(probs))))

    result_dirname = get_result_dir(config_filename)

    resdict = {}
    for f in simple_assessments:
        resdict[f.__name__] = f(probs > .5, found)

    # 10000 simulated presence/absence vectors drawn from the predicted probabilities.
    pstack = np.array([pm.rbernoulli(probs) for i in range(10000)])
    fp, tp, AUC = roc(pstack, found)
    resdict['AUC'] = AUC

    # Context manager so the report is flushed and closed even on error.
    with open(os.path.join(result_dirname, 'simple-diagnostics.txt'), 'w') as fout:
        fout.write('presences: %i\n' % (found.sum() - n_pseudopresences))
        fout.write('pseudopresences: %i\n' % n_pseudopresences)
        fout.write('pseudoabsences: %i\n' % n_pseudoabsences)
        for k in resdict.items():
            fout.write('%s: %s\n' % k)

    import pylab as pl
    pl.clf()
    plot_roc_(fp, tp, AUC)
    pl.savefig(os.path.join(result_dirname, 'roc.pdf'))

    r = np.rec.fromarrays([fp, tp], names='false,true')
    rec2csv(r, os.path.join(result_dirname, 'roc.csv'))
def ages_and_data(N_exam, f_samp, correction_factor_array, age_lims):
    """Called by pred_samps. Simulates ages of survey participants and data given f.

    Parameters
    ----------
    N_exam : sequence
        Number of individuals to simulate at each location (indexed like f_samp).
    f_samp : sequence
        One entry per location; each entry is ravelled and passed through
        ``pm.invlogit``.
    correction_factor_array : 2-d array
        Indexed as ``[age, sample]``; one column is picked at random per location.
    age_lims : sequence
        Per location, a pair whose first two items are the inclusive lower and
        upper age bounds.

    Returns
    -------
    (A, pos, age_distribution) : three lists with one entry per location --
        the simulated ages, the simulated Bernoulli outcomes, and the
        normalised age distribution used to draw the ages.

    Reads the module-level ``S_trace`` but never modifies it.
    """
    N_samp = len(f_samp)
    N_age_samps = correction_factor_array.shape[1]

    # Get samples for the age distribution at the observation points.
    age_distribution = []
    for i in range(N_samp):
        lower, upper = age_lims[i][0], age_lims[i][1]
        # Take an explicit float copy: slicing an ndarray yields a view, and
        # normalising that view in place would overwrite the stored trace.
        dist = np.array(
            S_trace[np.random.randint(S_trace.shape[0]), 0, lower:upper + 1],
            dtype=float)
        dist /= np.sum(dist)
        age_distribution.append(dist)

    # Draw age for each individual, draw an age-correction profile for each location,
    # compute probability of positive for each individual, see how many individuals are
    # positive.
    A = []
    pos = []
    for s in range(N_samp):
        ages = np.array(
            pm.rcategorical(age_distribution[s], size=N_exam[s]),
            dtype=int) + age_lims[s][0]
        A.append(ages)
        profile = correction_factor_array[:, np.random.randint(N_age_samps)]
        P_samp = pm.invlogit(f_samp[s].ravel()) * profile[ages]
        pos.append(pm.rbernoulli(P_samp))

    return A, pos, age_distribution
#-------------------------------------------- # main code if __name__ == "__main__": VERBOSE = 0 # measure times start_time = time.time() #----------------------------------- # prior probability p = pm.Uniform("p", lower=0, upper=1) # initialize constants p_true = 0.5 N = 1500 # sample N Bernoulli random variables frpm Ber(0.05) # Each random variable has a 0.05 chance of being a 1. # This is the data=generation step. occurrences = pm.rbernoulli(p_true, N) print("numbers of 1 = {0}".format(occurrences.sum())) #----------------------------------- # observed frequency print("What is the observed frequency in Group A? %.4f" % occurrences.mean()) print("Does the observed frequency equal the true frequency? {0}".format( occurrences.mean() == p_true)) #----------------------------------- # apply Bayesian method # Include the observations, which are Bernoulli. obs = pm.Bernoulli("obs", p, value=occurrences, observed=True) # to be explained in Chapter 3 mcmc = pm.MCMC([p, obs]) mcmc.sample(20000, 1000)
# # p_A_true and p_B_true are the **true observed values** of A and B. # # We are now simulating values from a bernoulli distribution with the values of **p** as A_true and B_true # In[22]: # true value of p_A and p_B (unknown) p_A_true = 0.05 #Probability of click through rates in set up A p_B_true = 0.04 #Probability of click through rates in set up B # number of users visiting page A and B N_A = 1500 N_B = 700 occurrences_A = pm.rbernoulli(p_A_true, N_A) occurrences_B = pm.rbernoulli(p_B_true, N_B) print('Observed frequency for A:', sum(occurrences_A)) print('Observed frequency for B:', sum(occurrences_B)) # Now we will define our prior distributions, which is a Uniform distribution. This implies that we don't have any prior imformation. We also capture the difference between the probabilities of A and B # In[18]: p_A = pm.Uniform('p_A', lower=0, upper=1) p_B = pm.Uniform('p_B', lower=0, upper=1) @pm.deterministic def delta(p_A=p_A, p_B=p_B):
from matplotlib import pyplot
from pylab import savefig

# A/B testing of users purchasing from two different website frontends.
# This part models a single frontend: simulate purchases with a known rate,
# then infer that rate with a Uniform prior and a Bernoulli likelihood.

# Assign prior distributions to unknowns. Let's assume `p` is uniform over [0, 1],
# since we have no strong conviction about it.
p = pm.Uniform('p', lower=0, upper=1)

# set constants
p_true = 0.05  # this is actually unknown, we are just testing
N = 15000  # users

# Sample N Bernoulli random variables from Ber(0.05);
# each random variable has a 0.05 chance of being a 1.
# This is the data-generation step.
occurrences = pm.rbernoulli(p_true, N)

print(occurrences)
print(occurrences.sum())  # True == 1; False == 0
print(occurrences.sum()/N)  # True == 1; False == 0

print("What is the observed frequency in Group A? %.4f" % occurrences.mean())
# NOTE(review): the message below says "observed frequency" but the comparison
# is against p_true (the true frequency) -- confirm the wording.
print("Does this equal the observed frequency? %s" % (occurrences.mean() == p_true))

# combine observations into the pymc observed variable:
obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

# then run the inference algorithm (MCMC sampling)
mcmc = pm.MCMC([p, obs])
mcmc.sample(18000, 1000)
plot_roc_(*results['roc']) elif k in ['producer_accuracy','consumer_accuracy']: pl.hist(results[k][:,0]) pl.title('%s, false') pl.figure() pl.hist(results[k][:,1]) pl.title('%s, true'%k) else: pl.hist(results[k]) pl.title(k) plot_validation = compose(plot_validation_, validate) if __name__ == '__main__': import pymc as pm import pylab as pl n = 1000 a = pm.rbernoulli(.7,size=n).astype('bool') p = pm.rbernoulli(.7,size=n).astype('bool') for s in simple_assessments: print s.__name__, s(p, a) ps = pm.rbernoulli(.7,size=(100,n)).astype('bool') for i in xrange(100): if np.random.random()<.05: ps[i,:]=a pl.clf() fp,tp,auc = roc(ps,a) plot_roc(ps, a)
# simple AB: simulate Bernoulli data for one version, fit its rate with MCMC
# and plot a histogram of the sampled values.
import matplotlib.pyplot as plt
import pymc as pm

p = pm.Uniform('p', lower = 0, upper = 1)

# setting constants
p_true = 0.05
N = 1500

# sampling N bern(0.05)
data = pm.rbernoulli(p_true, N)

print(data)
print(data.sum())
print("\nmean: ", data.mean())
# Exact float comparison of the sample mean with the true rate.
print("\nObserved prop equal p_true. ", data.mean() == p_true)

obs = pm.Bernoulli("obs", p, value = data , observed = True)

# yay a quick mcmc
mcmc = pm.MCMC([p, obs])
mcmc.sample(20000, 1000)

plt.title("possible values for the true effectiveness of version A")
# The subscript must sit inside the math delimiters: "$p_A$", not "$p_$A".
plt.vlines(p_true, 0, 90, linestyle = '--', label = "true $p_A$ (not known)")
plt.hist(
    mcmc.trace("p")[:] ,
    bins = 35,
    histtype = "stepfilled",
    normed = True)
import pymc as pm
import matplotlib.pyplot as plt

# Two-group A/B example: simulate Bernoulli data for A and B with known
# rates, then infer p_A, p_B and their difference with MCMC.

# True rates (known here only because the data are simulated).
true_p_A = 0.05
true_p_B = 0.04

# Sample sizes of the two groups (deliberately unequal).
N_A = 1500
N_B = 750

A_data = pm.rbernoulli(true_p_A, N_A)
B_data = pm.rbernoulli(true_p_B, N_B)

print("Mean of A: " , A_data.mean())
print("Mean of B: " , B_data.mean())

# priors
p_A = pm.Uniform("p_A", 0, 1)
p_B = pm.Uniform("p_B", 0, 1)


@pm.deterministic
def delta(p_A = p_A , p_B = p_B):
    # Difference between the two rates; this is the quantity of interest.
    return p_A - p_B


# Observed Bernoulli likelihoods, one per group.
data_A = pm.Bernoulli("data_A", p_A , value = A_data, observed = True)
data_B = pm.Bernoulli("data_B", p_B , value = B_data, observed = True)

# mcmc
mcmc = pm.MCMC([p_A, p_B, delta, data_A, data_B])
mcmc.sample(25000, 5000)

# Full trace of p_A as an array.
p_A_samples = mcmc.trace("p_A")[:]
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    """Fit an age-specific rate model and score it on a random hold-out.

    Loads prevalence data, optionally replaces it with synthetic values,
    withholds a random subset of rows (each row is chosen with probability
    0.25) by setting their standard error to inf and effective sample size
    to 0, fits the model, and computes quality metrics on the withheld rows.

    Parameters:
        rate_type: passed through to ``dismod3.data_model.data_model``.
        data_type: one of 'epilepsy' (data used as loaded), 'schiz' (data read
            from a CSV file), or 'binom', 'poisson', 'normal', 'log_normal'
            (values replaced by synthetic draws around the data mean).
        replicate: integer offset added to the random seed.

    Returns:
        The fitted model, with ``model.test`` holding the hold-out rows and
        results populated through ``data_simulation``.

    Raises:
        TypeError: if ``data_type`` is not one of the values listed above.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')

    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass

    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')

    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N

    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N*mu, size=len(data.index)) / N

    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125*mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))

    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma*mu
        data['value'] = pl.exp(mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))

    else:
        # Call syntax is valid on both Python 2 and Python 3.
        raise TypeError('Unknown data type "%s"' % data_type)

    # sample prevalence data: boolean mask of the rows to hold out
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    # NOTE(review): the usual binomial standard error is sqrt(p*(1-p)/n); here
    # the division by n sits outside the sqrt -- confirm this is intended.
    data['standard_error'] = pl.sqrt(data['value']*(1-data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    # Blank out the hold-out rows with a single indexing step (same form as
    # the assignments above) instead of chained data[col][mask] assignment,
    # which may write to a temporary copy.
    data.ix[i_test, 'standard_error'] = pl.inf
    data.ix[i_test, 'effective_sample_size'] = 0.

    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0,100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p',
        'all', 'total', 'all',
        None, None, None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out (summary statistics computed once)
    pred_stats = model.vars['p']['p_pred'].stats()
    data['mu_pred'] = pred_stats['mean']
    data['lb_pred'] = pred_stats['95% HPD interval'][:,0]
    data['ub_pred'] = pred_stats['95% HPD interval'][:,1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
# Draw four artificial SMS datasets, one per row of a 4x1 grid.
for i in range(4):
    # Subplot positions are 1-based, so the loop index needs the +1.
    plt.subplot( 4, 1, i + 1 )
    plot_artificial_sms_dataset()
plt.show()

#
# A/B testing
#

# A only
p = pm.Uniform( 'p', lower=0, upper=1 )

p_true = 0.05
N = 1500

occurrences = pm.rbernoulli( p_true, N )

# The value of this expression is discarded when run as a script.
occurrences.sum()

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("observed frequency: %.4f" % occurrences.mean())

# run inference alg
obs = pm.Bernoulli( "obs", p, value=occurrences, observed=True )

mcmc = pm.MCMC( [p, obs] )
mcmc.sample( 18000, 1000 )

plt.title( 'Posterior Dist of $p_A$: the true effectiveness of site A' )
plt.vlines( p_true, 0, 90, linestyles="--", label="true $p_A$ (unknown)" )
plt.hist( mcmc.trace( "p" )[:], bins=25, histtype="stepfilled", normed=True )
plt.legend()
plt.show()
import pymc as pm
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt

# Single-site example: simulate Bernoulli data with a known rate, infer the
# rate with MCMC and plot a histogram of the sampled values.
p = pm.Uniform('p', lower=0, upper=1)

p_ture = 0.05  # unknown in reality
N = 1500

# sample N Bernoulli random variables from Ber(0.05)
# each random variable has a 0.05 chance of being a 1.
# this is the data-generation step
occurrences = pm.rbernoulli(p_ture, N)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(occurrences)
print(occurrences.sum())

print("What is the observed frequency in Group A? %.4f" % occurrences.mean())
print("Does this equal the true frequency? %s" % (occurrences.mean() == p_ture))

# include the observations, which are Bernoulli
obs = pm.Bernoulli('obs', p, value=occurrences, observed=True)

mcmc = pm.MCMC([p, obs])
mcmc.sample(18000, 1000)

figsize(12.5, 4)
plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
plt.vlines(p_ture, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
plt.hist(mcmc.trace('p')[:], bins=25, histtype="stepfilled", normed=True)
plt.legend()
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    """Fit an age-specific rate model and score it on a random hold-out.

    Loads prevalence data, optionally replaces it with synthetic values,
    withholds a random subset of rows (each row is chosen with probability
    0.25) by setting their standard error to inf and effective sample size
    to 0, fits the model, and computes quality metrics on the withheld rows.

    Parameters:
        rate_type: passed through to ``dismod3.data_model.data_model``.
        data_type: one of 'epilepsy' (data used as loaded), 'schiz' (data read
            from a CSV file), or 'binom', 'poisson', 'normal', 'log_normal'
            (values replaced by synthetic draws around the data mean).
        replicate: integer offset added to the random seed.

    Returns:
        The fitted model, with ``model.test`` holding the hold-out rows and
        results populated through ``data_simulation``.

    Raises:
        TypeError: if ``data_type`` is not one of the values listed above.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')

    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass

    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')

    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N

    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N * mu, size=len(data.index)) / N

    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125 * mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))

    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma * mu
        data['value'] = pl.exp(
            mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))

    else:
        # Call syntax is valid on both Python 2 and Python 3.
        raise TypeError('Unknown data type "%s"' % data_type)

    # sample prevalence data: boolean mask of the rows to hold out
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    # NOTE(review): the usual binomial standard error is sqrt(p*(1-p)/n); here
    # the division by n sits outside the sqrt -- confirm this is intended.
    data['standard_error'] = pl.sqrt(
        data['value'] * (1 - data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    # Blank out the hold-out rows with a single indexing step (same form as
    # the assignments above) instead of chained data[col][mask] assignment,
    # which may write to a temporary copy.
    data.ix[i_test, 'standard_error'] = pl.inf
    data.ix[i_test, 'effective_sample_size'] = 0.

    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p',
        'all', 'total', 'all',
        None, None, None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out (summary statistics computed once)
    pred_stats = model.vars['p']['p_pred'].stats()
    data['mu_pred'] = pred_stats['mean']
    data['lb_pred'] = pred_stats['95% HPD interval'][:, 0]
    data['ub_pred'] = pred_stats['95% HPD interval'][:, 1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
figsize(12, 4) #-------------------------------------------- # main code if __name__ == "__main__": VERBOSE = 0 # measure times start_time = time.time() #----------------------------------- # initialize constants true_p_A = 0.05 true_p_B = 0.04 N_A = 1500 N_B = 750 # Generate some observations. occurrences_A = pm.rbernoulli(true_p_A, N_A) occurrences_B = pm.rbernoulli(true_p_B, N_B) print("numbers of Site A = {0}".format(occurrences_A.sum())) print("numbers of Site B = {0}".format(occurrences_B.sum())) #----------------------------------- # observed frequency print("What is the observed frequency in Group A? %.4f" % occurrences_A.mean()) print("What is the observed frequency in Group B? %.4f" % occurrences_B.mean()) # Set up the PyMC model, Again assume Uniform priors for p_A and p_B. p_A = pm.Uniform("p_A", 0, 1) p_B = pm.Uniform("p_B", 0, 1) #Define the deterministic delta function. This is our unknown of interest. @pm.deterministic def delta(p_A=p_A, p_B=p_B):
import pymc as pm
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt

# Single-site example: simulate Bernoulli data with a known rate, infer the
# rate with MCMC and plot a histogram of the sampled values.
p = pm.Uniform('p', lower=0, upper=1)

p_ture = 0.05  # unknown in reality
N = 1500

# sample N Bernoulli random variables from Ber(0.05)
# each random variable has a 0.05 chance of being a 1.
# this is the data-generation step
occurrences = pm.rbernoulli(p_ture, N)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(occurrences)
print(occurrences.sum())

print("What is the observed frequency in Group A? %.4f" % occurrences.mean())
print("Does this equal the true frequency? %s" % (occurrences.mean() == p_ture))

# include the observations, which are Bernoulli
obs = pm.Bernoulli('obs', p, value=occurrences, observed=True)

mcmc = pm.MCMC([p, obs])
mcmc.sample(18000, 1000)

figsize(12.5, 4)
plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
plt.vlines(p_ture, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
plt.hist(mcmc.trace('p')[:], bins=25, histtype="stepfilled", normed=True)
plt.legend()
# ---------------------------------------------------------------------------------------------------------------------- # Define random variables for probs of A and B p_A = pm.Uniform('p_A', lower=0, upper=1) p_B = pm.Uniform('p_B', lower=0, upper=1) # Define true parameters for experimental purposes p_true_A = 0.05 p_true_B = 0.04 # Note: Unequal sample sizes are valid in bayesian analysis N_A = 1_500 N_B = 750 # Generate fake Data using parameters observations_A = pm.rbernoulli(p_true_A, N_A) observations_B = pm.rbernoulli(p_true_B, N_B) # Define observation random variavle obs_A = pm.Bernoulli("obs_A", p_A, value=observations_A, observed=True) obs_B = pm.Bernoulli("obs_B", p_B, value=observations_B, observed=True) # ---------------------------------------------------------------------------------------------------------------------- # Define Functions # ---------------------------------------------------------------------------------------------------------------------- @pm.deterministic def delta(p_A=p_A, p_B=p_B): return p_A - p_B
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    """Simulate data with covariate-dependent dispersion, fit, and compare.

    Generates N rows whose dispersion is
    ``delta_true * exp(Z . zeta_true)`` for random binary covariates Z, fits
    the data model, shows diagnostic plots, prints the zeta and delta
    estimates with summary quality metrics, and collects the results.

    Parameters:
        N: number of simulated data rows.
        delta_true: baseline dispersion used in the simulation.
        pi_true: true rate assigned to every simulated row.
        zeta_true: true dispersion-covariate effects, one per covariate
            column. The list default is only read, never mutated, here.

    Returns:
        The model, with ``input_data``, ``zeta``, ``delta`` and ``results``
        attached.
    """
    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # Binary covariates (as floats) and the per-row dispersion they imply.
    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d' % i] = Z[:, i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    # Summary statistics are computed once per node and reused.
    pred_stats = model.vars['p']['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    zeta_stats = model.vars['p']['zeta'].stats()
    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = zeta_stats['mean']
    model.zeta['sigma_pred'] = zeta_stats['standard deviation']
    add_quality_metrics(model.zeta)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('\nzeta')
    print(model.zeta)

    # delta is estimated as exp(eta), summarised over the eta trace.
    exp_eta = pl.exp(model.vars['p']['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = exp_eta.mean()
    model.delta['sigma_pred'] = exp_eta.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
        model.zeta.dropna()['covered?'].mean()))

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    return model
from __future__ import print_function import matplotlib.pyplot as plt import numpy as np import pymc as pm p_a = 0.05 p_b = 0.04 N_a = 1500 N_b = 7500 data_a = pm.rbernoulli(p_a, N_a) data_b = pm.rbernoulli(p_b, N_b) p_a = pm.Uniform("p_a", 0, 1) obs_a = pm.Bernoulli("obs_a", p_a, value=data_a, observed=True) p_b = pm.Uniform("p_b", 0, 1) obs_b = pm.Bernoulli("obs_b", p_b, value=data_b, observed=True) @pm.deterministic def delta(p_a=p_a, p_b=p_b): return p_a - p_b mcmc = pm.MCMC([p_a, obs_a, p_b, obs_b, delta]) mcmc.sample(50000, 1000) ''' print ( "mean:" + str(data.mean()) \ + " true:" + str(p_t) \ + " form data:" + str( mcmc.trace("p")[:].mean() )
def generate(self, N_A, N_B):
    # Draw N_A and N_B Bernoulli observations for groups A and B from the
    # rates stored on the instance.
    # NOTE(review): the attribute casing differs (true_pA vs true_PB) --
    # confirm both attributes exist on the class.
    # NOTE(review): neither result is stored or returned in the visible code
    # -- confirm whether the method continues beyond this point.
    obsA = pymc.rbernoulli(self.true_pA, N_A)
    obsB = pymc.rbernoulli(self.true_PB, N_B)
This is an example of using Bayesian A/B testing """ import pymc as pm #these two quantities are unknown to us. true_p_A = 0.05 true_p_B = 0.04 #notice the unequal sample sizes -- no problem in Bayesian analysis. N_A = 1500 N_B = 1000 #generate data observations_A = pm.rbernoulli(true_p_A, N_A) observations_B = pm.rbernoulli(true_p_B, N_B) #set up the pymc model. Again assume Uniform priors for p_A and p_B p_A = pm.Uniform("p_A", 0, 1) p_B = pm.Uniform("p_B", 0, 1) #define the deterministic delta function. This is our unknown of interest. @pm.deterministic def delta(p_A=p_A, p_B=p_B): return p_A - p_B
def main():
    """Run a two-site Bayesian A/B test on simulated data and plot the result.

    Simulates Bernoulli observations for sites A and B with known rates, fits
    Uniform-prior / Bernoulli-likelihood models for both plus their
    difference ``delta``, draws three stacked histograms (p_A, p_B, delta)
    and prints the fraction of delta samples below and above zero.
    Returns nothing.
    """
    # these two quantities are unknown to us.
    true_p_A = 0.05
    true_p_B = 0.04

    # notice the unequal sample sizes -- no problem in Bayesian analysis.
    N_A = 1500
    N_B = 750

    # generate some observations
    observations_A = pm.rbernoulli(true_p_A, N_A)
    observations_B = pm.rbernoulli(true_p_B, N_B)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(observations_A.mean())
    print(observations_B.mean())

    # Set up the pymc model. Again assume Uniform priors for p_A and p_B.
    p_A = pm.Uniform("p_A", 0, 1)
    p_B = pm.Uniform("p_B", 0, 1)

    # Define the deterministic delta function. This is our unknown of interest.
    @pm.deterministic
    def delta(p_A=p_A, p_B=p_B):
        return p_A - p_B

    # Set of observations, in this case we have two observation datasets.
    obs_A = pm.Bernoulli("obs_A", p_A, value=observations_A, observed=True)
    obs_B = pm.Bernoulli("obs_B", p_B, value=observations_B, observed=True)

    # To be explained in chapter 3.
    mcmc = pm.MCMC([p_A, p_B, delta, obs_A, obs_B])
    mcmc.sample(20000, 1000)

    p_A_samples = mcmc.trace("p_A")[:]
    p_B_samples = mcmc.trace("p_B")[:]
    delta_samples = mcmc.trace("delta")[:]

    # histogram of posteriors
    plt.subplot(311)
    plt.xlim(0, .1)
    plt.hist(p_A_samples, histtype='stepfilled', bins=25, alpha=0.85,
             label="posterior of $p_A$", color="#A60628", normed=True)
    plt.vlines(true_p_A, 0, 80, linestyle="--", label="true $p_A$ (unknown)")
    plt.legend(loc="upper right")
    plt.title("Posterior distributions of $p_A$, $p_B$, and delta unknowns")

    plt.subplot(312)
    plt.xlim(0, .1)
    plt.hist(p_B_samples, histtype='stepfilled', bins=25, alpha=0.85,
             label="posterior of $p_B$", color="#467821", normed=True)
    plt.vlines(true_p_B, 0, 80, linestyle="--", label="true $p_B$ (unknown)")
    plt.legend(loc="upper right")

    plt.subplot(313)
    plt.hist(delta_samples, histtype='stepfilled', bins=30, alpha=0.85,
             label="posterior of delta", color="#7A68A6", normed=True)
    plt.vlines(true_p_A - true_p_B, 0, 60, linestyle="--",
               label="true delta (unknown)")
    plt.vlines(0, 0, 60, color="black", alpha=0.2)
    plt.legend(loc="upper right")

    plt.show()

    # Count the number of samples less than 0, i.e. the area under the curve
    # before 0, represent the probability that site A is worse than site B.
    print("Probability site A is WORSE than site B: %.3f" %
          (delta_samples < 0).mean())

    print("Probability site A is BETTER than site B: %.3f" %
          (delta_samples > 0).mean())
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    """Simulate data with covariate-dependent dispersion, fit, and compare.

    Generates N rows whose dispersion is
    ``delta_true * exp(Z . zeta_true)`` for random binary covariates Z, fits
    the data model, shows diagnostic plots, prints the zeta and delta
    estimates with summary quality metrics, and collects the results.

    Parameters:
        N: number of simulated data rows.
        delta_true: baseline dispersion used in the simulation.
        pi_true: true rate assigned to every simulated row.
        zeta_true: true dispersion-covariate effects, one per covariate
            column. The list default is only read, never mutated, here.

    Returns:
        The model, with ``input_data``, ``zeta``, ``delta`` and ``results``
        attached.
    """
    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # Binary covariates (as floats) and the per-row dispersion they imply.
    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d' % i] = Z[:, i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=5,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    # Summary statistics are computed once per node and reused.
    pred_stats = model.vars['p']['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    zeta_stats = model.vars['p']['zeta'].stats()
    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = zeta_stats['mean']
    model.zeta['sigma_pred'] = zeta_stats['standard deviation']
    add_quality_metrics(model.zeta)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('\nzeta')
    print(model.zeta)

    # delta is estimated as exp(eta), summarised over the eta trace.
    exp_eta = pl.exp(model.vars['p']['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = exp_eta.mean()
    model.delta['sigma_pred'] = exp_eta.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
        model.zeta.dropna()['covered?'].mean()))

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    return model
# The value of this expression is discarded when run as a script.
foos.value

# Histogram of 10000 draws from data_generator.
samples = [data_generator.random() for _ in xrange(10000)]
plt.hist(samples, bins = 70, normed = True, histtype = "stepfilled")
plt.title("FooBar")
plt.xlim(0, 8)
plt.show()

# AB testing example ---------------
# Preliminary -- just A
# Uniform prior on the rate, Bernoulli likelihood on simulated data.
p = pm.Uniform("p", lower = 0, upper = 1)

p_true = 0.05
N = 1500

occurrences = pm.rbernoulli(p_true, N)

obs = pm.Bernoulli("obs", p, value = occurrences, observed = True)

# Here the sampler is built from an explicit pm.Model.
model = pm.Model([p, obs])
mcmc = pm.MCMC(model)
mcmc.sample(18000, 1000)
# NOTE: rebinds `samples`, replacing the data_generator draws above.
samples = mcmc.trace("p")[:]

# AB
# True rates and sample sizes for the two-group comparison.
true_p_A = 0.05
true_p_B = 0.04
n_A = 1500
n_B = 750
"""
This is an example of using Bayesian A/B testing.

Simulates Bernoulli observations for two groups with known rates and defines
Uniform priors for each rate plus a deterministic node for their difference.
"""

import pymc as pm

# these two quantities are unknown to us.
true_p_A = 0.05
true_p_B = 0.04

# notice the unequal sample sizes -- no problem in Bayesian analysis.
N_A = 1500
N_B = 1000

# generate data
observations_A = pm.rbernoulli(true_p_A, N_A)
observations_B = pm.rbernoulli(true_p_B, N_B)

# set up the pymc model. Again assume Uniform priors for p_A and p_B
p_A = pm.Uniform("p_A", 0, 1)
p_B = pm.Uniform("p_B", 0, 1)


# define the deterministic delta function. This is our unknown of interest.
@pm.deterministic
def delta(p_A=p_A, p_B=p_B):
    # Difference between the two rates (A minus B).
    return p_A - p_B
def DoSamplingSpecificXConditionedAll(self, w, a, ind, fitErrExcludingInd, varLast, bLogDebug=False):
    """Draw one sample for element `ind`, conditioned on everything else.

    The result is 0 with probability (1 - wInd); otherwise it is a draw from
    NumericalHelper.RandomNonnegativeNormal(muInd, etaIndSquared).  wInd,
    muInd and etaIndSquared are derived below from w, a, varLast,
    fitErrExcludingInd and column `ind` of self._h.

    Args:
        w: Must be >= 0; (1 - w) appears in the denominator of wInd.
        a: Must be > 0; divides both w and etaIndSquared.
        ind: Index into self._hNormSquared and column index into self._h.
        fitErrExcludingInd: Array multiplied element-wise with self._h[:, ind]
            and summed.  NOTE(review): presumably the fit residual computed
            without element `ind` -- confirm against the caller.
        varLast: Must be >= 0; divided by self._hNormSquared[ind].
        bLogDebug: When true, log one debug line with the intermediate values.

    Returns:
        The sampled value; 0 is also returned when the sampler raises or
        yields a negative value (both cases are logged as errors).

    Raises:
        AssertionError: on invalid inputs, non-positive etaIndSquared, or a
            negative uInd.  Asserts are kept (not converted to ValueError)
            so the exception type seen by callers does not change.
        ValueError: if the computed wInd lies outside [0, 1].
    """
    assert (a > 0) and (w >= 0) and (varLast >= 0), \
        __file__ + ': DoSamplingSpecificXConditionedAll: invalid inputs'

    etaIndSquared = varLast / self._hNormSquared[ind]
    assert (etaIndSquared > 0), __file__ + ': etaIndSquared is not strictly positive'

    dotProduct = np.sum(fitErrExcludingInd * self._h[:, ind])
    muIndComponents = (dotProduct / self._hNormSquared[ind], -etaIndSquared / a)
    muInd = sum(muIndComponents)

    # When y is big, calculating uInd is a challenge since there are numerical
    # issues.  We're trying to multiply a very small number (which equals 0 due
    # to finite floating point representation) and a very large number, which
    # is prone to returning 0.  Hence the mpmath arithmetic below.
    y = -muInd / PlazeGibbsSamplerReconstructor.CONST_SQRT_2 / math.sqrt(etaIndSquared)
    uInd = (w / a) * \
        mpmath.sqrt(etaIndSquared) * PlazeGibbsSamplerReconstructor.CONST_SQRT_HALF_PI * mpmath.erfc(y) * \
        mpmath.exp(y * y)
    uIndFloat = float(uInd)  # Convert to an ordinary Python float

    assert (uIndFloat >= 0), __file__ + ': uIndFloat is negative: ' + str(uIndFloat) + \
        ' w=' + str(w) + \
        ' a=' + str(a) + \
        ' etaIndSquared=' + str(etaIndSquared) + \
        ' y=' + str(y) + \
        ' muIndComp=(' + str(muIndComponents[0]) + ',' + str(muIndComponents[1]) + ')' + \
        ' dotProduct=' + str(dotProduct) + \
        ' hNormSquared=' + str(self._hNormSquared[ind])

    # An infinite uInd would give inf/inf below, so its weight is set to 1 directly.
    if uIndFloat == float('inf'):
        wInd = 1
    else:
        wInd = uIndFloat / (uIndFloat + (1 - w))

    if (wInd < 0) or (wInd > 1):
        raise ValueError('uInd is {0} and wInd is {1}'.format(uInd, wInd))

    if pymc.rbernoulli(wInd):
        # With probability wInd, generate a sample from a truncated Gaussian r.v. (support (0,Inf))
        try:
            xSample = NumericalHelper.RandomNonnegativeNormal(muInd, etaIndSquared)
        except Exception:
            # Best effort: log and fall back to 0.  `except Exception` (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
            fmtString = "Caught exception at {0}: NNN({1}, {2}). Intm. calc.: {3}, {4}, {5}. Exception: {6}"
            msg = fmtString.format(ind, muInd, etaIndSquared,
                                   muIndComponents[0], muIndComponents[1],
                                   varLast, sys.exc_info()[0])
            logging.error(msg)
            xSample = 0  # XXX: Due to a numerical problem
        else:
            # Check the value of xSample
            if xSample < 0:
                fmtString = "Invalid xSample at {0}: NNN({1}, {2}) ~> sample {3}. Intm. calc.: {4}, {5}, {6}"
                logging.error(fmtString.format(ind, muInd, etaIndSquared, xSample,
                                               muIndComponents[0], muIndComponents[1],
                                               varLast))
                # Deliberately not raising here; the sample is clamped instead.
                xSample = 0  # XXX: Also due to a numerical problem, but no exception raised
    else:
        # With probability (1-wInd), generate 0
        xSample = 0

    if bLogDebug:
        fmtString = '  {0}/{1}: {2:.5e}, {3:.5f}={4:.5f}-{5:.5f}, {6:.5e}, {7:.5e}: {8:.5e}'
        logging.debug(fmtString.format(self._samplerIter, ind, etaIndSquared,
                                       muInd, muIndComponents[0], -muIndComponents[1],
                                       uIndFloat, wInd, xSample))

    return xSample
import pymc as pm
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# A-B test: single-group warm-up with a Uniform(0, 1) prior on p.
p = pm.Uniform('p', lower=0, upper=1)

p_true = 0.05  # Unknown in practice; used here only to simulate data.
N = 1500

# Ber(0.05) simulation: N draws, then print the draws, their sum and their mean.
occur = pm.rbernoulli(p_true, N)
print(occur)
print(occur.sum())
print(occur.mean())

# Attach the simulated draws to the model as observed Bernoulli data.
obs = pm.Bernoulli("obs", p, value=occur, observed=True)

# Sample with 20000 iterations, the first 1000 treated as burn-in.
mcmc = pm.MCMC([p, obs])
mcmc.sample(iter=20000, burn=1000)

# Histogram of the trace of p, with the simulation's p_true marked.
plt.figure(figsize=(12.5, 4))
plt.vlines(p_true, 0, 90, linestyle='--', label="real $p_A$ unknown value")
# NOTE(review): `normed` was removed from Matplotlib's hist() in 3.1 in favour
# of `density`; kept here to match whatever Matplotlib this script targets.
plt.hist(mcmc.trace("p")[:], bins=25, histtype="stepfilled", normed=True)
plt.legend()
plt.show()

# Constants for the two-group comparison.
true_p_a = 0.05
true_p_b = 0.04
n_a = 1500
n_b = 750