예제 #1
0
# Simulate one data set from the generative Stan program.
N = 1
simu_data = {'N': N}

simu_model = stan_utility.compile_model('simulate_data.stan')
# A single iteration of a single chain with the fixed-parameter algorithm
# and a pinned seed, so the simulated data are reproducible.
simu = simu_model.sampling(
    data=simu_data,
    algorithm="Fixed_param",
    iter=1,
    chains=1,
    seed=4838282,
)

# Store N together with the flattened simulated 'y' draws as an R dump file.
data = {'N': N, 'y': simu.extract()['y'].flatten()}
pystan.stan_rdump(data, 'simulation.data.R')

# Read the dumped data back in and fit the model to it; Hamiltonian
# Monte Carlo is used to estimate posterior expectation values.
input_data = pystan.read_rdump('simulation.data.R')

model = stan_utility.compile_model('fit_data.stan')
fit = model.sampling(seed=4938483, data=input_data)

# Run every sampler diagnostic check on the fit.
stan_utility.check_all_diagnostics(fit)

# The diagnostics do not look good, so split the draws into non-divergent
# and divergent sets in order to compare the two and see what is going on.
nondiv_params, div_params = stan_utility.partition_div(fit)

plot.scatter(nondiv_params['mu'],
             [math.log(x) for x in nondiv_params['sigma']],
             color=mid_highlight,
예제 #2
0
import stan_utility
import pystan
import numpy as np
# Compile the Stan program under an explicit model name.
model = stan_utility.compile_model('gbm_sum.stan', model_name='test_gbm2')

data = pystan.read_rdump('alpha_data.R')

# Add an energy grid to the data: N_gen_spectra points spaced evenly in
# log10 between 10**0 and 10**5.
N_gen_spectra = 100
model_energy = np.logspace(0, 5, N_gen_spectra)
data['N_gen_spectra'] = N_gen_spectra
data['model_energy'] = model_energy

# `sampling(iter=...)` receives warmup plus post-warmup iterations, so the
# two counts are kept separate and summed. The post-warmup count used to be
# called `iter`, which shadowed the built-in iter() for the rest of the
# module.
warmup = 1000
n_post_warmup = 100

total = warmup + n_post_warmup

chains = 8

# One worker per chain.
fit = model.sampling(
    data=data,
    iter=total,
    warmup=warmup,
    chains=chains,
    n_jobs=chains,
    control=dict(max_treedepth=13,
    #             adapt_delta=0.9
    ),
    seed=1234)

# Persist the fit to HDF5.
stan_utility.stanfit_to_hdf5(fit, '/data/jburgess/stan_fits/gbm_stan_fit_small.h5')
def combine_bucket(bucket_path, cat):
    """Summarize posterior sample CSVs into one brand table and one UPC table.

    The contents of ``bucket_path`` are copied into the current working
    directory with ``gsutil`` unless that directory already holds five or
    more entries. Every ``*.csv`` file found there is then paired with the
    R dump file that shares its stem (``<stem>.R``). For each pair the 10%
    quantile, mean and 90% quantile are computed for every column whose name
    starts with ``brand`` and, per unique UPC id, for every ``upc*`` column.
    The results are written to ``<cat>_brands.csv`` and ``<cat>_upcs.csv``
    in the current working directory.

    Args:
        bucket_path: Location passed to ``gsutil cp``; it is interpolated
            into a shell command, so only pass trusted values.
        cat: Category label stored in the ``thg_category`` column and used
            as the prefix of both output file names.

    Raises:
        FileNotFoundError: If a sample CSV or its matching R dump file
            cannot be opened.
    """

    ## Helper functions to compute 10% - 90% range for posterior
    def low_quantile(x):
        return x.quantile(0.1)

    def high_quantile(x):
        return x.quantile(0.9)

    def summary_columns(id_column, params):
        # Column order mirrors the column-major flattening of the summary
        # frame below: low, mean, high for each parameter in turn.
        columns = [id_column, 'thg_category']
        for p in params:
            columns.append('{}_low'.format(p))
            columns.append('{}_mean'.format(p))
            columns.append('{}_high'.format(p))
        return columns

    summary_stats = [low_quantile, 'mean', high_quantile]
    upc_columns = None
    brand_columns = None
    brand_rows = []
    upc_rows = []

    ## Download all csv files in bucket path
    download_cmd = 'gsutil -m cp  "{}/*" .'.format(bucket_path)
    # Skip the download when the directory already looks populated.
    if len(os.listdir()) < 5:
        sp.call(download_cmd, shell=True)
    csvfiles = [f for f in os.listdir() if f.endswith('.csv')]
    nfiles = len(csvfiles)

    for file_index, csv_name in enumerate(csvfiles):
        ## Match up sample file with input data
        filestem = os.path.splitext(csv_name)[0]
        print("{}/{} Opening {}".format(file_index, nfiles, csv_name))
        try:
            samples = pd.read_csv(csv_name)
        except FileNotFoundError:
            print("Can't load csv {}".format(csv_name))
            raise
        try:
            model_info = pystan.read_rdump(filestem + '.R')
        except FileNotFoundError:
            print('Can not find corresponding R dump file')
            raise
        upc_ids = model_info['upc_id']
        brand_id = model_info['brand_id'][0]  # Brand id is constant.
        u_upc_ids = pd.Series(upc_ids).unique()
        brand_params = [p for p in samples.columns if p.startswith('brand')]
        # Strip the '.<index>' suffix and drop duplicates, keeping the order
        # in which the base names first appear.
        upc_params = list(OrderedDict.fromkeys(
            u.split('.')[0] for u in samples.columns if u.startswith('upc')))

        brand_samples = samples[brand_params]

        print('Summarizing brand...')
        brand_summary = brand_samples.agg(summary_stats)
        print('Brand summary created!')
        brand_summary_list = brand_summary.values.flatten(order='F').tolist()
        print('Adding brand row')
        brand_row = [brand_id, cat, *brand_summary_list]
        if brand_columns is None:
            # Column names are taken from the first file processed.
            brand_columns = summary_columns('brand_id', brand_params)
        brand_rows.append(brand_row)

        ## Build upc row per upc
        print('Adding upc rows')
        for upc_index, u in enumerate(u_upc_ids):
            # Sample columns are suffixed with a 1-based index.
            # NOTE(review): this assumes the k-th unique upc id corresponds
            # to column suffix k -- confirm against the model's indexing.
            this_upc_params = ['{}.{}'.format(p, upc_index + 1)
                               for p in upc_params]
            print('this upc_params: {}'.format(str(this_upc_params)))
            upc_samples = samples[this_upc_params]
            print('Summarizing upc')
            upc_summary = upc_samples.agg(summary_stats)
            print('Upc {} summarized'.format(u))
            upc_summary_list = upc_summary.values.flatten(order='F').tolist()
            upc_row = [u, cat, *upc_summary_list]
            if upc_columns is None:
                upc_columns = summary_columns('upc_id', upc_params)
            upc_rows.append(upc_row)

    ## Roll into dataframe and write out
    print('Combining rows...')
    brands_df = pd.DataFrame(brand_rows, columns=brand_columns)
    upcs_df = pd.DataFrame(upc_rows, columns=upc_columns)
    print('Writing out...')
    brands_df.to_csv('{}_brands.csv'.format(cat))
    upcs_df.to_csv('{}_upcs.csv'.format(cat))
    print('All complete!')
예제 #4
0
############################################################
# Create data
############################################################

model = stan_utility.compile_model('generate_data.stan')
fit = model.sampling(seed=194838, algorithm='Fixed_param', iter=1, chains=1)

# Extract the draws once and reuse them for both fields; the original
# called fit.extract() separately for each field.
simulated = fit.extract()
data = dict(N=simulated['N'].astype(numpy.int64),
            x_obs=simulated['x_obs'][0, :])  # first row of x_obs only

pystan.stan_rdump(data, 'selection.data.R')

############################################################
# Fit model
############################################################

# Read the simulated data back from the R dump file just written.
data = pystan.read_rdump('selection.data.R')

model = stan_utility.compile_model('selection.stan')
fit = model.sampling(data=data, chains=4, seed=4938483,
                     control=dict(adapt_delta=0.9, max_treedepth=12))

# Check diagnostics
stan_utility.check_all_diagnostics(fit)

# Default visual summaries
params = fit.extract()

# Plot marginal posteriors on a 2x2 grid of axes
f, axarr = plot.subplots(2, 2)
# Top row: put the x-axis ticks on the bottom edge.
for a in axarr[0,:]:
    a.xaxis.set_ticks_position('bottom')
예제 #5
0
# Hide the y axis of the current axes, then display the figure.
# NOTE(review): `plot` is presumably matplotlib.pyplot imported under that
# alias, and the figure being finished here is built above this excerpt --
# confirm.
plot.gca().axes.get_yaxis().set_visible(False)

plot.show()

# Pull elements 2 and 3 out of every entry of ensemble_output (defined
# outside this excerpt). Going by the variable names these are posterior
# z-scores and posterior shrinkages -- confirm against the code that
# builds ensemble_output.
z_scores = [x[2] for x in ensemble_output]
shrinkages = [x[3] for x in ensemble_output]

# Scatter z-score against shrinkage, with x limited to [0, 1] and
# y limited to [-5, 5].
plot.scatter(shrinkages, z_scores, color=dark, alpha=0.2)
plot.gca().set_xlabel("Posterior Shrinkage")
plot.gca().set_xlim(0, 1)
plot.gca().set_ylabel("Posterior z-Score")
plot.gca().set_ylim(-5, 5)

plot.show()

# Load the data from the R dump file and fit fit_data_ppc.stan to it
# with a fixed seed.
data = pystan.read_rdump('workflow.data.R')

model = stan_utility.compile_model('fit_data_ppc.stan')
fit = model.sampling(data=data, seed=4838282)
# Run every sampler diagnostic check on the fit.
stan_utility.check_all_diagnostics(fit)

params = fit.extract()

# Histogram of the extracted 'lambda' draws in 25 bins, with the y axis
# hidden.
plot.hist(params['lambda'], bins=25, color=dark, ec=dark_highlight)
plot.gca().set_xlabel("lambda")
plot.gca().axes.get_yaxis().set_visible(False)
plot.show()
# Bin edges at -0.5, 0.5, ..., max_y + 0.5: B + 1 edges give B unit-width
# bins, each centred on one of the integers 0..max_y.
max_y = 40
B = max_y + 1

bins = [b - 0.5 for b in range(B + 1)]