Example #1
def main():
    # The parameters are the bounds of the Uniform.
    p = pm.Uniform('p', lower=0, upper=1)

    # set constants
    p_true = 0.05  # remember, this is unknown.
    N = 1500

    # sample N Bernoulli random variables from Ber(0.05).
    # each random variable has a 0.05 chance of being a 1.
    # this is the data-generation step
    occurrences = pm.rbernoulli(p_true, N)

    print occurrences
    print occurrences.sum()

    # Occurrences.mean is equal to n/N.
    print "What is the observed frequency in Group A? %.4f" % occurrences.mean()
    print "Does this equal the true frequency? %s" % (occurrences.mean() == p_true)

    # include the observations, which are Bernoulli
    obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

    # To be explained in chapter 3
    mcmc = pm.MCMC([p, obs])
    mcmc.sample(18000, 1000)

    plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
    plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
    plt.hist(mcmc.trace("p")[:], bins=25, histtype="stepfilled", normed=True)
    plt.legend()
    plt.show()
Example #2
def ages_and_data(N_exam, f_samp, correction_factor_array, age_lims):
    """Called by pred_samps. Simulates ages of survey participants and data given f."""

    N_samp = len(f_samp)
    N_age_samps = correction_factor_array.shape[1]

    # Get samples for the age distribution at the observation points.
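    # (S_trace is not defined in this snippet; it appears to be a module-level array of
    # posterior samples for the age distribution, indexed here as [sample, 0, age].)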
    age_distribution = []
    for i in xrange(N_samp):
        l = age_lims[i]
        age_distribution.append(S_trace[np.random.randint(S_trace.shape[0]), 0,
                                        l[0]:l[1] + 1])
        age_distribution[-1] /= np.sum(age_distribution[-1])

    # Draw age for each individual, draw an age-correction profile for each location,
    # compute probability of positive for each individual, see how many individuals are
    # positive.
    A = []
    pos = []
    for s in xrange(N_samp):
        A.append(
            np.array(pm.rcategorical(age_distribution[s], size=N_exam[s]),
                     dtype=int) + age_lims[s][0])
        P_samp = pm.invlogit(f_samp[s].ravel(
        )) * correction_factor_array[:, np.random.randint(N_age_samps)][A[-1]]
        pos.append(pm.rbernoulli(P_samp))

    return A, pos, age_distribution
Example #3
def main():
    # The parameters are the bounds of the Uniform.
    p = pm.Uniform('p', lower=0, upper=1)

    # set constants
    p_true = 0.05  # remember, this is unknown.
    N = 1500

    # sample N Bernoulli random variables from Ber(0.05).
    # each random variable has a 0.05 chance of being a 1.
    # this is the data-generation step
    occurrences = pm.rbernoulli(p_true, N)

    print occurrences
    print occurrences.sum()

    # Occurrences.mean is equal to n/N.
    print "What is the observed frequency in Group A? %.4f" % occurrences.mean(
    )
    print "Does this equal the true frequency? %s" % (occurrences.mean()
                                                      == p_true)

    # include the observations, which are Bernoulli
    obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

    # To be explained in chapter 3
    mcmc = pm.MCMC([p, obs])
    mcmc.sample(18000, 1000)

    plt.title(
        "Posterior distribution of $p_A$, the true effectiveness of site A")
    plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
    plt.hist(mcmc.trace("p")[:], bins=25, histtype="stepfilled", normed=True)
    plt.legend()
    plt.show()
Example #4
def dice(data=None):

    if data is None:
        x = [pymc.rbernoulli(1.0 / 6.0) for i in range(0, 100)]
    else:
        x = data

    prob = pymc.Uniform('prob', lower=0, upper=1)

    d = pymc.Bernoulli('d', p=prob, value=x, observed=True)

    return locals()
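One possible way to drive this model factory (a sketch added here, not part of the scraped example; it relies only on the PyMC 2.x API already used above):

model = pymc.MCMC(dice())              # dice() returns locals(), which MCMC accepts as a node dictionary
model.sample(iter=10000, burn=1000)
print(model.trace('prob')[:].mean())   # should land near 1/6 for the simulated rolls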
Example #5
def dice(data=None):

    if data is None:
        x = [pymc.rbernoulli(1.0 / 6.0) for i in range(0, 100)]
    else:
        x = data

    prob = pymc.Uniform('prob', lower=0, upper=1)

    d = pymc.Bernoulli('d', p=prob, value=x, observed=True)

    return locals()
Example #6
 def DoSamplingXPrior(self, w, a, M=None):
     if M is not None:
         xLen = M
     else:
         try:
             xLast = self.xSeq[-1]
             xLen = xLast.size
         except:
             raise UnboundLocalError('Unable to determine length of x')
     assert (w >= 0) and (w <= 1), __file__ + ': DoSamplingXPrior: w is out of bounds'
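     # Note: despite the name, bIsZero == 1 keeps the positive half-Laplace draw below,
     # so each entry of xSample is non-zero with probability w and exactly zero otherwise.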
     bIsZero = pymc.rbernoulli(w, (xLen, 1))
     FLaplaceDistInv = lambda x: (-a * np.sign(x - 0.5) * np.log(1 - 2 * np.abs(x - 0.5)))
     plazeSample = FLaplaceDistInv(0.5 + 0.5 * np.random.uniform(size=(xLen, 1))) 
     xSample = bIsZero * plazeSample
     return xSample
Example #7
def Main():
    
    p = pm.Uniform('p', lower=0, upper=1)

    # Define true parameters for experimental purposes
    p_true = 0.05
    N = 1_500
    
    # Generate fake Data using parameters
    occurrences = pm.rbernoulli(p_true, N)
    
    print(occurrences)
    print(len(occurrences), occurrences.sum())
    
    # Define the observation random variable
    obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)
    
    # Solve using MCMC
    mcmc = pm.MCMC([p, obs])
    mcmc.sample(18_000, 1_000)
    
    plot_posteriors(mcmc, p_true)
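plot_posteriors is not defined in this snippet. A minimal sketch of what such a helper might look like (the name and signature come from the call above; the body is an assumption modelled on the other plotting examples in this listing):

import matplotlib.pyplot as plt

def plot_posteriors(mcmc, p_true):
    # hypothetical helper: histogram the posterior samples of p and mark the true value
    p_samples = mcmc.trace("p")[:]
    plt.hist(p_samples, bins=25, histtype="stepfilled", normed=True)
    plt.vlines(p_true, 0, 90, linestyle="--", label="true $p$ (unknown)")
    plt.legend()
    plt.show()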
Example #8
def trees_to_diagnostics(brt_evaluator, fname, species_name, n_pseudopresences, n_pseudoabsences, config_filename):
    """
    Takes the BRT evaluator and sees how well it does at predicting the training dataset.
    """

    from diagnostics import simple_assessments, roc, plot_roc_

    din = csv2rec(os.path.join('anopheles-caches',fname))
    found = din.found
    din = dict([(k,din[k]) for k in brt_evaluator.nice_tree_dict.iterkeys()])
    probs = pm.flib.invlogit(brt_evaluator(din))

    print 'Species %s: fraction %f correctly classified.'%(species_name, ((probs>.5)*found+(probs<.5)*(True-found)).sum()/float(len(probs)))

    result_dirname = get_result_dir(config_filename)
    
    resdict = {}
    for f in simple_assessments:
        resdict[f.__name__] = f(probs>.5, found)

    pstack = np.array([pm.rbernoulli(probs) for i in xrange(10000)])
    fp, tp, AUC = roc(pstack, found)
    resdict['AUC'] = AUC
    
    fout=file(os.path.join(result_dirname,'simple-diagnostics.txt'),'w')
    fout.write('presences: %i\n'%(found.sum()-n_pseudopresences))
    fout.write('pseudopresences: %i\n'%n_pseudopresences)
    fout.write('pseudoabsences: %i\n'%n_pseudoabsences)
    for k in resdict.iteritems():
        fout.write('%s: %s\n'%k)
    
    import pylab as pl
    pl.clf()
    plot_roc_(fp,tp,AUC)
    pl.savefig(os.path.join(result_dirname,'roc.pdf'))
    
    r = np.rec.fromarrays([fp,tp],names='false,true')
    rec2csv(r,os.path.join(result_dirname,'roc.csv'))
Example #9
def ages_and_data(N_exam, f_samp, correction_factor_array, age_lims):
    """Called by pred_samps. Simulates ages of survey participants and data given f."""
    
    N_samp = len(f_samp)
    N_age_samps = correction_factor_array.shape[1]
    
    # Get samples for the age distribution at the observation points.
    age_distribution = []
    for i in xrange(N_samp):
        l = age_lims[i]
        age_distribution.append(S_trace[np.random.randint(S_trace.shape[0]),0,l[0]:l[1]+1])
        age_distribution[-1] /= np.sum(age_distribution[-1])
    
    # Draw age for each individual, draw an age-correction profile for each location,
    # compute probability of positive for each individual, see how many individuals are
    # positive.
    A = []
    pos = []
    for s in xrange(N_samp):
        A.append(np.array(pm.rcategorical(age_distribution[s], size=N_exam[s]),dtype=int) + age_lims[s][0])
        P_samp = pm.invlogit(f_samp[s].ravel())*correction_factor_array[:,np.random.randint(N_age_samps)][A[-1]]
        pos.append(pm.rbernoulli(P_samp))
    
    return A, pos, age_distribution
Example #10
#--------------------------------------------
# main code
if __name__ == "__main__":
    VERBOSE = 0
    # measure times
    start_time = time.time()
    #-----------------------------------
    # prior probability
    p = pm.Uniform("p", lower=0, upper=1)
    # initialize constants
    p_true = 0.5
    N = 1500
    # sample N Bernoulli random variables from Ber(p_true).
    # Each random variable has a p_true chance of being a 1.
    # This is the data-generation step.
    occurrences = pm.rbernoulli(p_true, N)
    print("numbers of 1 = {0}".format(occurrences.sum()))
    #-----------------------------------
    # observed frequency
    print("What is the observed frequency in Group A? %.4f" %
          occurrences.mean())
    print("Does the observed frequency equal the true frequency? {0}".format(
        occurrences.mean() == p_true))
    #-----------------------------------
    # apply Bayesian method
    # Include the observations, which are Bernoulli.
    obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

    # to be explained in Chapter 3
    mcmc = pm.MCMC([p, obs])
    mcmc.sample(20000, 1000)
Example #11
#
# p_A_true and p_B_true are the **true** (unknown) conversion probabilities of A and B.
#
# We now simulate observations from a Bernoulli distribution, using p_A_true and p_B_true as the values of **p**.

# In[22]:

# true value of p_A and p_B (unknown)
p_A_true = 0.05  #Probability of click through rates in set up A
p_B_true = 0.04  #Probability of click through rates in set up B

# number of users visiting page A and B
N_A = 1500
N_B = 700

occurrences_A = pm.rbernoulli(p_A_true, N_A)
occurrences_B = pm.rbernoulli(p_B_true, N_B)

print('Observed successes for A:', sum(occurrences_A))
print('Observed successes for B:', sum(occurrences_B))

# Now we define our prior distributions, which are Uniform. This implies that we have no prior information. We also capture the difference between the probabilities of A and B.

# In[18]:

p_A = pm.Uniform('p_A', lower=0, upper=1)
p_B = pm.Uniform('p_B', lower=0, upper=1)


@pm.deterministic
def delta(p_A=p_A, p_B=p_B):
    return p_A - p_B
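The notebook cell is truncated here in the listing. A plausible continuation (added here, mirroring the complete A/B example further down in this listing) ties the simulated observations to the priors and samples:

obs_A = pm.Bernoulli("obs_A", p_A, value=occurrences_A, observed=True)
obs_B = pm.Bernoulli("obs_B", p_B, value=occurrences_B, observed=True)

mcmc = pm.MCMC([p_A, p_B, delta, obs_A, obs_B])
mcmc.sample(20000, 1000)

delta_samples = mcmc.trace("delta")[:]
print("Probability that A converts better than B: %.3f" % (delta_samples > 0).mean())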
Example #12
from matplotlib import pyplot
from pylab import savefig

# A/B testing of users purchasing from two different website frontends
# assign prior distribs to unknowns. Let's assume `p` is uniform over [0, 1],
# since we have no strong conviction about it.
p = pm.Uniform('p', lower=0, upper=1)

# set constants
p_true = 0.05  # this is actually unknown, we are just testing
N = 15000  # users

# sample n bernoulli random variables from Ber(0.05)
# each random var has a 0.05 chance of being a 1!
# this is the data-generation step
occurrences = pm.rbernoulli(p_true, N)

print(occurrences)
print(occurrences.sum())  # True == 1; False == 0
print(occurrences.sum()/N)  # True == 1; False == 0

print("What is the observed frequency in Group A? %.4f" % occurrences.mean())
print("Does this equal the observed frequency? %s" % (occurrences.mean() == p_true))

# combine observations into the pymc observed variable:
obs = pm.Bernoulli("obs", p, value=occurrences, observed=True)

# then run the inference algorithm (MCMC: Markov chain Monte Carlo)
mcmc = pm.MCMC([p, obs])
mcmc.sample(18000, 1000)
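The snippet stops right after sampling. A short added sketch (assuming the mcmc object built above, plus numpy) of how the trace could be summarized numerically:

import numpy as np

p_trace = mcmc.trace("p")[:]
print("posterior mean of p: %.4f" % p_trace.mean())
print("95%% credible interval: (%.4f, %.4f)" % tuple(np.percentile(p_trace, [2.5, 97.5])))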
Example #13
            plot_roc_(*results['roc'])
        elif k in ['producer_accuracy','consumer_accuracy']:
            pl.hist(results[k][:,0])
            pl.title('%s, false'%k)
            pl.figure()
            pl.hist(results[k][:,1])
            pl.title('%s, true'%k)
        else:
            pl.hist(results[k])
            pl.title(k)
            
plot_validation = compose(plot_validation_, validate)
    
    
if __name__ == '__main__':
    import pymc as pm
    import pylab as pl
    n = 1000
    a = pm.rbernoulli(.7,size=n).astype('bool')
    p = pm.rbernoulli(.7,size=n).astype('bool')
    
    for s in simple_assessments:
        print s.__name__, s(p, a)
    
    ps = pm.rbernoulli(.7,size=(100,n)).astype('bool')
    for i in xrange(100):
        if np.random.random()<.05:
            ps[i,:]=a
    pl.clf()
    fp,tp,auc = roc(ps,a)
    plot_roc(ps, a)
Example #14
# simple AB
import matplotlib.pyplot as plt
import pymc as pm

p = pm.Uniform('p', lower = 0, upper = 1)

# setting constants

p_true = 0.05
N = 1500

# sampling N bern(0.05)

data = pm.rbernoulli(p_true, N)

print(data)
print(data.sum())

print("\nmean: ", data.mean())
print("\nObserved prop equal p_true.  ",  data.mean() == p_true)

obs = pm.Bernoulli("obs", p, value = data , observed = True)

# yay a quick mcmc

mcmc = pm.MCMC([p, obs])
mcmc.sample(20000, 1000)

plt.title("possible values for the true effectiveness of version A")
plt.vlines(p_true, 0, 90, linestyle = '--', label = "true $p_A$ (not known)")
plt.hist( mcmc.trace("p")[:] , bins = 35, histtype = "stepfilled", normed = True)
Example #15
import pymc as pm
import matplotlib.pyplot as plt

true_p_A = 0.05
true_p_B = 0.04

N_A = 1500
N_B = 750

A_data = pm.rbernoulli(true_p_A, N_A)
B_data = pm.rbernoulli(true_p_B, N_B)

print("Mean of A: " , A_data.mean())
print("Mean of B: " , B_data.mean())

# priors
p_A = pm.Uniform("p_A", 0, 1)
p_B = pm.Uniform("p_B", 0, 1)

@pm.deterministic
def delta(p_A = p_A , p_B = p_B):
	return p_A - p_B

data_A = pm.Bernoulli("data_A", p_A , value = A_data, observed = True)
data_B = pm.Bernoulli("data_B", p_B , value = B_data, observed = True)

# mcmc
mcmc = pm.MCMC([p_A, p_B, delta, data_A, data_B])
mcmc.sample(25000, 5000)

p_A_samples = mcmc.trace("p_A")[:]
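The example ends after pulling the p_A trace. A plausible continuation (an addition, following the pattern of the fuller A/B examples in this listing) pulls the remaining traces and compares the two variants:

p_B_samples = mcmc.trace("p_B")[:]
delta_samples = mcmc.trace("delta")[:]

# fraction of posterior mass with p_A > p_B
print("Probability site A is BETTER than site B: %.3f" % (delta_samples > 0).mean())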
Example #16
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)
    
    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')

    #data = data.ix[:20, :]
    
    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass

    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')
    
    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N

    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N*mu, size=len(data.index)) / N

    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125*mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))

    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma*mu
        data['value'] = pl.exp(mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))

    else:
        raise TypeError, 'Unknown data type "%s"' % data_type

    # sample prevalence data
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])
    
    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    data['standard_error'] = pl.sqrt(data['value']*(1-data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    data['standard_error'][i_test] = pl.inf
    data['effective_sample_size'][i_test] = 0.

    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data


    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0,100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p',
        'all', 'total', 'all',
        None, None, None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)
    
    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out
    data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    data['lb_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:,0]
    data['ub_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:,1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)


    return model
Example #17
for i in range(4):
    plt.subplot( 4, 1, i + 1 )
    plot_artificial_sms_dataset()
plt.show()

#
# A/B testing
#

#
# A only
p = pm.Uniform( 'p', lower=0, upper=1 )

p_true = 0.05
N = 1500
occurrences = pm.rbernoulli( p_true, N )
occurrences.sum()

print "observed frequency: %.4f" % occurrences.mean()

# run inference alg
obs = pm.Bernoulli( "obs", p, value=occurrences, observed=True )

mcmc = pm.MCMC( [p, obs] )
mcmc.sample( 18000, 1000 )

plt.title( 'Posterior Dist of $p_A$: the true effectiveness of site A' )
plt.vlines( p_true, 0, 90, linestyles="--", label="true $p_A$ (unknown)" )
plt.hist( mcmc.trace( "p" )[:], bins=25, histtype="stepfilled", normed=True )
plt.legend()
plt.show()
Example #18
import pymc as pm
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt

p = pm.Uniform('p', lower=0, upper=1)

p_true = 0.05  # unknown in reality
N = 1500

# sample N Bernoulli random variables from Ber(0.05)
# each random variable has a 0.05 chance of being a 1.
# this is the data-generation step
occurrences = pm.rbernoulli(p_true, N)

print occurrences
print occurrences.sum()

print "What is the observed frequency in Group A? %.4f" % occurrences.mean()
print "Does this equal the true frequency? %s" % (occurrences.mean() == p_true)

# include the observations, which are Bernoulli
obs = pm.Bernoulli('obs', p, value=occurrences, observed=True)

mcmc = pm.MCMC([p, obs])
mcmc.sample(18000, 1000)

figsize(12.5, 4)
plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
plt.hist(mcmc.trace('p')[:], bins=25, histtype="stepfilled", normed=True)
plt.legend()
Example #19
def validate_rate_model(rate_type='neg_binom',
                        data_type='epilepsy',
                        replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')

    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass

    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')

    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N

    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N * mu, size=len(data.index)) / N

    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125 * mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))

    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma * mu
        data['value'] = pl.exp(
            mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))

    else:
        raise TypeError, 'Unknown data type "%s"' % data_type

    # sample prevalence data
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    data['standard_error'] = pl.sqrt(
        data['value'] * (1 - data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    data['standard_error'][i_test] = pl.inf
    data['effective_sample_size'][i_test] = 0.

    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p',
        model,
        'p',
        'all',
        'total',
        'all',
        None,
        None,
        None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out
    data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    data['lb_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:,
                                                                            0]
    data['ub_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:,
                                                                            1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
Example #20
figsize(12, 4)

#--------------------------------------------
# main code
if __name__ == "__main__":
    VERBOSE = 0
    # measure times
    start_time = time.time()
    #-----------------------------------
    # initialize constants
    true_p_A = 0.05
    true_p_B = 0.04
    N_A = 1500
    N_B = 750
    # Generate some observations.
    occurrences_A = pm.rbernoulli(true_p_A, N_A)
    occurrences_B = pm.rbernoulli(true_p_B, N_B)
    print("numbers of Site A = {0}".format(occurrences_A.sum()))
    print("numbers of Site B = {0}".format(occurrences_B.sum()))
    #-----------------------------------
    # observed frequency
    print("What is the observed frequency in Group A? %.4f" %
          occurrences_A.mean())
    print("What is the observed frequency in Group B? %.4f" %
          occurrences_B.mean())
    # Set up the PyMC model, Again assume Uniform priors for p_A and p_B.
    p_A = pm.Uniform("p_A", 0, 1)
    p_B = pm.Uniform("p_B", 0, 1)
    #Define the deterministic delta function. This is our unknown of interest.
    @pm.deterministic
    def delta(p_A=p_A, p_B=p_B):
        return p_A - p_B
Example #21
import pymc as pm
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt

p = pm.Uniform('p', lower=0, upper=1)

p_true = 0.05 # unknown in reality
N = 1500

# sample N Bernoulli random variables from Ber(0.05)
# each random variable has a 0.05 chance of being a 1.
# this is the data-generation step
occurrences = pm.rbernoulli(p_true, N)

print occurrences
print occurrences.sum()

print "What is the observed frequency in Group A? %.4f" % occurrences.mean()
print "Does this equal the true frequency? %s" % (occurrences.mean() == p_true)

# include the observations, which are Bernoulli
obs = pm.Bernoulli('obs', p, value=occurrences, observed=True)

mcmc = pm.MCMC([p,obs])
mcmc.sample(18000, 1000)

figsize(12.5, 4)
plt.title("Posterior distribution of $p_A$, the true effectiveness of site A")
plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$ (unknown)")
plt.hist(mcmc.trace('p')[:], bins=25, histtype="stepfilled", normed=True)
plt.legend()
Example #22
# ----------------------------------------------------------------------------------------------------------------------

# Define random variables for probs of A and B
p_A = pm.Uniform('p_A', lower=0, upper=1)
p_B = pm.Uniform('p_B', lower=0, upper=1)

# Define true parameters for experimental purposes
p_true_A = 0.05
p_true_B = 0.04

# Note: Unequal sample sizes are valid in Bayesian analysis
N_A = 1_500
N_B = 750

# Generate fake Data using parameters
observations_A = pm.rbernoulli(p_true_A, N_A)
observations_B = pm.rbernoulli(p_true_B, N_B)

# Define the observation random variables
obs_A = pm.Bernoulli("obs_A", p_A, value=observations_A, observed=True)
obs_B = pm.Bernoulli("obs_B", p_B, value=observations_B, observed=True)

# ----------------------------------------------------------------------------------------------------------------------
# Define Functions
# ----------------------------------------------------------------------------------------------------------------------


@pm.deterministic
def delta(p_A=p_A, p_B=p_B):
    return p_A - p_B
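The listing cuts this example off after the delta definition. A plausible continuation (added here, mirroring the complete A/B example elsewhere in this listing) would be:

mcmc = pm.MCMC([p_A, p_B, delta, obs_A, obs_B])
mcmc.sample(20_000, 1_000)

delta_samples = mcmc.trace("delta")[:]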
Example #23
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d'%i] = Z[:,i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta*n*p) / n


    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()


    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)


    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true
    
    model.zeta['mu_pred'] = model.vars['p']['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = model.vars['p']['zeta'].stats()['standard deviation']
    add_quality_metrics(model.zeta)

    print '\nzeta'
    print model.zeta
    
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
                                                           model.zeta.dropna()['covered?'].mean())


    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    return model
Example #24
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import pymc as pm

p_a = 0.05
p_b = 0.04
N_a = 1500
N_b = 7500

data_a = pm.rbernoulli(p_a, N_a)
data_b = pm.rbernoulli(p_b, N_b)

p_a = pm.Uniform("p_a", 0, 1)
obs_a = pm.Bernoulli("obs_a", p_a, value=data_a, observed=True)

p_b = pm.Uniform("p_b", 0, 1)
obs_b = pm.Bernoulli("obs_b", p_b, value=data_b, observed=True)


@pm.deterministic
def delta(p_a=p_a, p_b=p_b):
    return p_a - p_b


mcmc = pm.MCMC([p_a, obs_a, p_b, obs_b, delta])
mcmc.sample(50000, 1000)
'''
print ( "mean:" + str(data.mean()) \
      + " true:" + str(p_t) \
      + " form data:" + str( mcmc.trace("p")[:].mean() )  
Example #25
 def generate(self, N_A, N_B):
     obsA = pymc.rbernoulli(self.true_pA, N_A)
     obsB = pymc.rbernoulli(self.true_PB, N_B)
Example #26
"""
This is an example of using Bayesian A/B testing

"""

import pymc as pm

#these two quantities are unknown to us.
true_p_A = 0.05
true_p_B = 0.04

#notice the unequal sample sizes -- no problem in Bayesian analysis.
N_A = 1500
N_B = 1000

#generate data
observations_A = pm.rbernoulli(true_p_A, N_A)
observations_B = pm.rbernoulli(true_p_B, N_B)

#set up the pymc model. Again assume Uniform priors for p_A and p_B

p_A = pm.Uniform("p_A", 0, 1)
p_B = pm.Uniform("p_B", 0, 1)

#define the deterministic delta function. This is our unknown of interest.


@pm.deterministic
def delta(p_A=p_A, p_B=p_B):
    return p_A - p_B

Example #27
def main():
    # these two quantities are unknown to us.
    true_p_A = 0.05
    true_p_B = 0.04

    # notice the unequal sample sizes -- no problem in Bayesian analysis.
    N_A = 1500
    N_B = 750

    # generate some observations
    observations_A = pm.rbernoulli(true_p_A, N_A)
    observations_B = pm.rbernoulli(true_p_B, N_B)

    print observations_A.mean()
    print observations_B.mean()

    # Set up the pymc model. Again assume Uniform priors for p_A and p_B.
    p_A = pm.Uniform("p_A", 0, 1)
    p_B = pm.Uniform("p_B", 0, 1)

    # Define the deterministic delta function. This is our unknown of interest.
    @pm.deterministic
    def delta(p_A=p_A, p_B=p_B):
        return p_A - p_B

    # Set of observations, in this case we have two observation datasets.
    obs_A = pm.Bernoulli("obs_A", p_A, value=observations_A, observed=True)
    obs_B = pm.Bernoulli("obs_B", p_B, value=observations_B, observed=True)

    # To be explained in chapter 3.
    mcmc = pm.MCMC([p_A, p_B, delta, obs_A, obs_B])
    mcmc.sample(20000, 1000)

    p_A_samples = mcmc.trace("p_A")[:]
    p_B_samples = mcmc.trace("p_B")[:]
    delta_samples = mcmc.trace("delta")[:]

    # histogram of posteriors
    ax = plt.subplot(311)
    plt.xlim(0, .1)
    plt.hist(p_A_samples, histtype='stepfilled', bins=25, alpha=0.85,
             label="posterior of $p_A$", color="#A60628", normed=True)
    plt.vlines(true_p_A, 0, 80, linestyle="--", label="true $p_A$ (unknown)")
    plt.legend(loc="upper right")
    plt.title("Posterior distributions of $p_A$, $p_B$, and delta unknowns")

    ax = plt.subplot(312)
    plt.xlim(0, .1)
    plt.hist(p_B_samples, histtype='stepfilled', bins=25, alpha=0.85,
             label="posterior of $p_B$", color="#467821", normed=True)
    plt.vlines(true_p_B, 0, 80, linestyle="--", label="true $p_B$ (unknown)")
    plt.legend(loc="upper right")

    ax = plt.subplot(313)
    plt.hist(delta_samples, histtype='stepfilled', bins=30, alpha=0.85,
             label="posterior of delta", color="#7A68A6", normed=True)
    plt.vlines(true_p_A - true_p_B, 0, 60, linestyle="--",
               label="true delta (unknown)")
    plt.vlines(0, 0, 60, color="black", alpha=0.2)
    plt.legend(loc="upper right")
    plt.show()

    # Count the number of samples less than 0, i.e. the area under the curve
    # before 0, which represents the probability that site A is worse than site B.
    print "Probability site A is WORSE than site B: %.3f" % \
        (delta_samples < 0).mean()

    print "Probability site A is BETTER than site B: %.3f" % \
        (delta_samples > 0).mean()
Example #28
def validate_covariate_model_dispersion(N=1000,
                                        delta_true=.15,
                                        pi_true=.01,
                                        zeta_true=[.5, -.5, 0.]):
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d' % i] = Z[:, i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=5,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    add_quality_metrics(model.input_data)

    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true

    model.zeta['mu_pred'] = model.vars['p']['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = model.vars['p']['zeta'].stats(
    )['standard deviation']
    add_quality_metrics(model.zeta)

    print '\nzeta'
    print model.zeta

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
        model.zeta.dropna()['covered?'].mean())

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    return model
Example #29
 def generate(self, N_A, N_B):
     obsA = pymc.rbernoulli(self.true_pA, N_A)
     obsB = pymc.rbernoulli(self.true_PB, N_B)
Example #30
foos.value

samples = [data_generator.random() for _ in xrange(10000)]

plt.hist(samples, bins = 70, normed = True, histtype = "stepfilled")
plt.title("FooBar")
plt.xlim(0, 8)
plt.show()

# AB testing example ---------------
# Preliminary -- just A
p = pm.Uniform("p", lower = 0, upper = 1)
p_true = 0.05
N = 1500

occurrences = pm.rbernoulli(p_true, N)
obs = pm.Bernoulli("obs", p, value = occurrences, observed = True)
model = pm.Model([p, obs])

mcmc = pm.MCMC(model)
mcmc.sample(18000, 1000)

samples = mcmc.trace("p")[:]

# AB

true_p_A = 0.05
true_p_B = 0.04
n_A = 1500
n_B = 750
Example #31
"""
This is an example of using Bayesian A/B testing
"""

import pymc as pm

# these two quantities are unknown to us.
true_p_A = 0.05
true_p_B = 0.04

# notice the unequal sample sizes -- no problem in Bayesian analysis.
N_A = 1500
N_B = 1000

# generate data
observations_A = pm.rbernoulli(true_p_A, N_A)
observations_B = pm.rbernoulli(true_p_B, N_B)


# set up the pymc model. Again assume Uniform priors for p_A and p_B
p_A = pm.Uniform("p_A", 0, 1)
p_B = pm.Uniform("p_B", 0, 1)


# define the deterministic delta function. This is our unknown of interest.

@pm.deterministic
def delta(p_A=p_A, p_B=p_B):
    return p_A - p_B

Example #32
    def DoSamplingSpecificXConditionedAll(self, w, a, ind, fitErrExcludingInd, varLast, bLogDebug=False):
        assert (a > 0) and (w >= 0) and (varLast >= 0), __file__ + ': DoSamplingSpecificXConditionedAll: invalid inputs'                        
        
        etaIndSquared = varLast / self._hNormSquared[ind]       
        assert (etaIndSquared > 0), __file__ + ': etaIndSquared is not strictly positive'     
        dotProduct = np.sum(fitErrExcludingInd * self._h[:, ind])
        muIndComponents = (dotProduct / self._hNormSquared[ind], -etaIndSquared / a)
        muInd = sum(muIndComponents)
        
        """
        When y is big, calculating uInd is a challenge since there are numerical issues. We're 
        trying to multiply a very small number (which equals 0 due to finite floating point
        representation) and a very large number, which is prone to returning 0.        
        """  
        y = -muInd / PlazeGibbsSamplerReconstructor.CONST_SQRT_2 / math.sqrt(etaIndSquared)
                        
        uInd = (w / a) * \
            mpmath.sqrt(etaIndSquared) * PlazeGibbsSamplerReconstructor.CONST_SQRT_HALF_PI * mpmath.erfc(y) * \
            mpmath.exp(y * y)
                                               
        uIndFloat = float(uInd) # Convert to an ordinary Python float
        assert (uIndFloat >= 0), __file__ + ': uIndFloat is negative: ' + str(uIndFloat) + \
            ' w=' + str(w) + \
            ' a=' + str(a) + \
            ' etaIndSquared=' + str(etaIndSquared) + \
            ' y=' + str(y) + \
            ' muIndComp=(' + str(muIndComponents[0]) + ',' + str(muIndComponents[1]) + ')' + \
            ' dotProduct=' + str(dotProduct) + \
            ' hNormSquared=' + str(self._hNormSquared[ind])
        
        if uIndFloat == float('inf'):
            wInd = 1
        else:
            wInd = uIndFloat / (uIndFloat + (1 - w))            
                                                                                                                
        if ((wInd < 0) or (wInd > 1)):
            raise ValueError('uInd is {0} and wInd is {1}'.format(uInd, wInd))                                    
        
        if pymc.rbernoulli(wInd):
            # With probability wInd, generate a sample from a truncated Gaussian r.v. (support (0,Inf))
#             xSample = pymc.rtruncated_normal(muInd, 1/etaIndSquared, a=0)[0]
            try:            
                xSample = NumericalHelper.RandomNonnegativeNormal(muInd, etaIndSquared)
            except:
                fmtString = "Caught exception at {0}: NNN({1}, {2}). Intm. calc.: {3}, {4}, {5}. Exception: {6}"
                msg = fmtString.format(ind,
                                       muInd, etaIndSquared,                                                
                                       muIndComponents[0],
                                       muIndComponents[1],
                                       varLast,
                                       sys.exc_info()[0])
                logging.error(msg)
                xSample = 0; # XXX: Due to a numerical problem
            else:
                # Check the value of xSample 
                if (xSample < 0):
                    fmtString = "Invalid xSample at {0}: NNN({1}, {2}) ~> sample {3}. Intm. calc.: {4}, {5}, {6}"
                    logging.error(fmtString.format(ind,
                                                   muInd, etaIndSquared, xSample,                                               
                                                   muIndComponents[0],
                                                   muIndComponents[1],
                                                   varLast))
                    # Don't throw an exception
    #                raise ValueError('xSample cannot be negative')
                    xSample = 0; # XXX: Also due to a numerical problem, but no exception raised   
        else:
            # With probability (1-wInd), generate 0
            xSample = 0
        
        if bLogDebug:
            fmtString = '      {0}/{1}: {2:.5e}, {3:.5f}={4:.5f}-{5:.5f}, {6:.5e}, {7:.5e}: {8:.5e}'
            logging.debug(fmtString.format(self._samplerIter,
                                           ind, 
                                           etaIndSquared, 
                                           muInd, 
                                           muIndComponents[0],
                                           -muIndComponents[1],
                                           uIndFloat, 
                                           wInd, 
                                           xSample))
            
        return xSample
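The docstring inside the sampler explains why uInd is evaluated with mpmath. A standalone illustration (added here, not from the original code) of the double-precision failure it works around:

import math
import numpy as np
import mpmath

y = 40.0

# naive double precision: erfc(y) underflows to 0.0 while exp(y*y) overflows to inf,
# so the product comes out as nan instead of the true, finite value
naive = np.exp(y * y) * math.erfc(y)

# arbitrary-precision evaluation, as used in DoSamplingSpecificXConditionedAll above
stable = float(mpmath.erfc(y) * mpmath.exp(y * y))   # roughly 0.0141

print(naive, stable)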
Example #33
import pymc as pm
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# A-B test
p = pm.Uniform('p', lower=0, upper=1)
p_true = 0.05  # Unknown
N = 1500

# Ber(0.05) simulation
occur = pm.rbernoulli(p_true, N)
print(occur)
print(occur.sum())
print(occur.mean())

obs = pm.Bernoulli("obs", p, value=occur, observed=True)
mcmc = pm.MCMC([p, obs])
mcmc.sample(20000, 1000)

plt.figure(figsize=(12.5, 4))
plt.vlines(p_true, 0, 90, linestyle='--', label="true $p_A$ (unknown)")
plt.hist(mcmc.trace("p")[:], bins=25, histtype="stepfilled", normed=True)
plt.legend()
plt.show()

true_p_a = 0.05
true_p_b = 0.04
n_a = 1500
n_b = 750