import math

import pystan
import stan_utility
import matplotlib.pyplot as plot

# Color for the scatter plot below; the hex value is an assumption, taken
# from the palette these scripts usually define.
mid_highlight = "#A25050"

# Simulate a single observation from the generative model
N = 1
simu_data = dict(N=N)

simu_model = stan_utility.compile_model('simulate_data.stan')
simu = simu_model.sampling(data=simu_data, iter=1, chains=1, seed=4838282,
                           algorithm="Fixed_param")

data = dict(N=N, y=simu.extract()['y'].flatten())
pystan.stan_rdump(data, 'simulation.data.R')

# Now we can read that data back in and use Hamiltonian
# Monte Carlo to estimate posterior expectation values
input_data = pystan.read_rdump('simulation.data.R')

model = stan_utility.compile_model('fit_data.stan')
fit = model.sampling(data=input_data, seed=4938483)

# Check diagnostics
stan_utility.check_all_diagnostics(fit)

# That doesn't look good. Let's investigate the divergent
# samples in the context of the non-divergent samples to
# see what's going on.
nondiv_params, div_params = stan_utility.partition_div(fit)

# The original call was truncated here; the alpha argument and closing
# parenthesis are assumptions.
plot.scatter(nondiv_params['mu'],
             [math.log(x) for x in nondiv_params['sigma']],
             color=mid_highlight, alpha=0.05)
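# A plausible continuation, sketched rather than recovered: overlay the
# divergent samples on the non-divergent scatter so any pathological
# concentration stands out. The color name and its value are assumptions.
green = "#00FF00"  # assumed highlight color for divergent samples

plot.scatter(div_params['mu'],
             [math.log(x) for x in div_params['sigma']],
             color=green, alpha=0.5)
plot.gca().set_xlabel("mu")
plot.gca().set_ylabel("log(sigma)")
plot.show()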
import stan_utility
import pystan
import numpy as np

model = stan_utility.compile_model('gbm_sum.stan', model_name='test_gbm2')
data = pystan.read_rdump('alpha_data.R')

# Grid of energies at which the fit will generate model spectra
N_gen_spectra = 100
model_energy = np.logspace(0, 5, N_gen_spectra)

data['N_gen_spectra'] = N_gen_spectra
data['model_energy'] = model_energy

warmup = 1000
iter = 100
total = warmup + iter
chains = 8

fit = model.sampling(data=data,
                     iter=total,
                     warmup=warmup,
                     chains=chains,
                     n_jobs=chains,
                     control=dict(max_treedepth=13,
                                  # adapt_delta=0.9
                                  ),
                     seed=1234)

stan_utility.stanfit_to_hdf5(fit, '/data/jburgess/stan_fits/gbm_stan_fit_small.h5')
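# A minimal follow-up sketch, assuming only that stanfit_to_hdf5 wrote a
# standard HDF5 file; its internal layout is not documented here, so we
# just list the top-level keys instead of assuming dataset names.
import h5py

with h5py.File('/data/jburgess/stan_fits/gbm_stan_fit_small.h5', 'r') as f:
    print(list(f.keys()))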
import os
import subprocess as sp
from collections import OrderedDict

import pandas as pd
import pystan


def combine_bucket(bucket_path, cat):
    ## Download all csv files in bucket path
    CATEGORY = cat
    upc_columns = None
    brand_columns = None

    DL_CMD = 'gsutil -m cp "{}/*" .'.format(bucket_path)
    oldfiles = os.listdir()
    if len(oldfiles) < 5:
        sp.call(DL_CMD, shell=True)

    files = os.listdir()
    csvfiles = [f for f in files if f.endswith('.csv')]
    nfiles = len(csvfiles)

    brand_rows = []
    upc_rows = []

    for i, c in enumerate(csvfiles):
        ## Match up sample file with input data
        filestem = os.path.splitext(c)[0]
        print("{}/{} Opening {}".format(i, nfiles, c))
        try:
            samples = pd.read_csv(c)
        except FileNotFoundError as e:
            print("Can't load csv {}".format(c))
            raise e
        try:
            model_info = pystan.read_rdump(filestem + '.R')
        except FileNotFoundError as e:
            print('Can not find corresponding R dump file')
            raise e

        upc_ids = model_info['upc_id']
        brand_id = model_info['brand_id'][0]  # Brand id is constant.
        obs_ids = model_info['obs_id']
        u_upc_ids = pd.Series(upc_ids).unique()

        brand_params = [p for p in samples.columns if p.startswith('brand')]
        upc_params = [u.split('.')[0] for u in list(samples.columns)
                      if u.startswith('upc')]
        # Order-preserving removal of list duplicates
        upc_params = list(OrderedDict((x, True) for x in upc_params).keys())

        brand_samples = samples[brand_params]

        ## Helper functions to compute 10% - 90% range for posterior
        def low_quantile(x):
            return x.quantile(0.1)

        def high_quantile(x):
            return x.quantile(0.9)

        print('Summarizing brand...')
        brand_summary = brand_samples.agg([low_quantile, 'mean', high_quantile])
        print('Brand summary created!')
        brand_summary_list = brand_summary.values.flatten(order='F').tolist()

        print('Adding brand row')
        brand_row = [brand_id, CATEGORY, *brand_summary_list]
        if brand_columns is None:
            param_columns = ['brand_id', 'thg_category']
            for p in brand_params:
                param_columns.append('{}_low'.format(p))
                param_columns.append('{}_mean'.format(p))
                param_columns.append('{}_high'.format(p))
            brand_columns = param_columns
        brand_rows.append(brand_row)

        ## Build one upc row per upc
        print('Adding upc rows')
        for j, u in enumerate(u_upc_ids):
            this_upc_params = ['{}.{}'.format(p, j + 1) for p in upc_params]
            print('this upc_params: {}'.format(str(this_upc_params)))
            upc_samples = samples[this_upc_params]
            print('Summarizing upc')
            upc_summary = upc_samples.agg([low_quantile, 'mean', high_quantile])
            print('Upc {} summarized'.format(u))
            upc_summary_list = upc_summary.values.flatten(order='F').tolist()
            upc_row = [u, CATEGORY, *upc_summary_list]
            if upc_columns is None:
                cols = ['upc_id', 'thg_category']
                for p in upc_params:
                    cols.append('{}_low'.format(p))
                    cols.append('{}_mean'.format(p))
                    cols.append('{}_high'.format(p))
                upc_columns = cols
            upc_rows.append(upc_row)

    ## Roll into dataframe and write out
    print('Combining rows...')
    brands_df = pd.DataFrame(brand_rows, columns=brand_columns)
    upcs_df = pd.DataFrame(upc_rows, columns=upc_columns)

    print('Writing out...')
    brands_df.to_csv('{}_brands.csv'.format(CATEGORY))
    upcs_df.to_csv('{}_upcs.csv'.format(CATEGORY))
    print('All complete!')
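# Hypothetical usage of combine_bucket: the bucket path and category label
# below are placeholders, not values from the original script. The function
# downloads the bucket's CSV/R-dump pairs into the working directory and
# writes <category>_brands.csv and <category>_upcs.csv summaries.
combine_bucket('gs://example-bucket/posterior-samples', 'example_category')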
import numpy

import pystan
import stan_utility
import matplotlib.pyplot as plot

############################################################
# Create data
############################################################

model = stan_utility.compile_model('generate_data.stan')
fit = model.sampling(seed=194838, algorithm='Fixed_param',
                     iter=1, chains=1)

data = dict(N=fit.extract()['N'].astype(numpy.int64),
            x_obs=fit.extract()['x_obs'][0, :])
pystan.stan_rdump(data, 'selection.data.R')

############################################################
# Fit model
############################################################

data = pystan.read_rdump('selection.data.R')

model = stan_utility.compile_model('selection.stan')
fit = model.sampling(data=data, chains=4, seed=4938483,
                     control=dict(adapt_delta=0.9, max_treedepth=12))

# Check diagnostics
stan_utility.check_all_diagnostics(fit)

# Default visual summaries
params = fit.extract()

# Plot marginal posteriors
f, axarr = plot.subplots(2, 2)

for a in axarr[0, :]:
    a.xaxis.set_ticks_position('bottom')
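# The script breaks off while styling the subplot grid; this continuation
# is an illustrative sketch only. It takes parameter names from the fit
# itself rather than assuming what selection.stan declares.
scalar_names = [k for k, v in params.items() if v.ndim == 1][:4]
for a, name in zip(axarr.flatten(), scalar_names):
    a.hist(params[name], bins=25)
    a.set_xlabel(name)
    a.get_yaxis().set_visible(False)
plot.show()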
import pystan
import stan_utility
import matplotlib.pyplot as plot

# This excerpt continues a longer script: `ensemble_output` and the color
# names `dark` and `dark_highlight` are defined upstream of it.

plot.gca().axes.get_yaxis().set_visible(False)
plot.show()

z_scores = [x[2] for x in ensemble_output]
shrinkages = [x[3] for x in ensemble_output]

plot.scatter(shrinkages, z_scores, color=dark, alpha=0.2)
plot.gca().set_xlabel("Posterior Shrinkage")
plot.gca().set_xlim(0, 1)
plot.gca().set_ylabel("Posterior z-Score")
plot.gca().set_ylim(-5, 5)
plot.show()

data = pystan.read_rdump('workflow.data.R')
model = stan_utility.compile_model('fit_data_ppc.stan')
fit = model.sampling(data=data, seed=4838282)

stan_utility.check_all_diagnostics(fit)

params = fit.extract()

plot.hist(params['lambda'], bins=25, color=dark, ec=dark_highlight)
plot.gca().set_xlabel("lambda")
plot.gca().axes.get_yaxis().set_visible(False)
plot.show()

# Bin edges for a posterior predictive check over observed counts
max_y = 40
B = max_y + 1
bins = [b - 0.5 for b in range(B + 1)]
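# Sketch (not from the original script) of the posterior predictive check
# these bin edges typically feed. 'y_ppc' is an assumed name for the
# replicated data in fit_data_ppc.stan's generated quantities.
import numpy

counts = numpy.asarray([numpy.histogram(draw, bins=bins)[0]
                        for draw in params['y_ppc']])

# 10%-90% band of per-bin counts across posterior draws, with the median
lower = numpy.percentile(counts, 10, axis=0)
upper = numpy.percentile(counts, 90, axis=0)
centers = numpy.arange(B)

plot.fill_between(centers, lower, upper, alpha=0.3)
plot.plot(centers, numpy.median(counts, axis=0))
plot.gca().set_xlabel("y")
plot.gca().set_ylabel("Posterior predictive counts")
plot.show()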