def _run_cmdstan(
    self, runset: RunSet, idx: int = 0, pbar: List[Any] = None
) -> None:
    """
    Run one CmdStan command, save its console output, record its returncode.

    Spawns ``runset.cmds[idx]`` as a subprocess and waits for it to finish.
    Captured stdout, followed by the marker ``ERROR`` and captured stderr
    (when there is any), is written to ``runset.console_files[idx]``.
    Finally the process return code is stored on the runset.

    :param runset: RunSet providing the commands, console file paths and
        return code slots.
    :param idx: Zero-based index of the command to run.
    :param pbar: Progress bar object(s); when truthy, the process output is
        first consumed by ``self._read_progress`` and the result is prepended
        to the remaining stdout.
    """
    cmd = runset.cmds[idx]
    self._logger.info('start chain %u', idx + 1)
    self._logger.debug('sampling: %s', cmd)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=os.environ,
    )
    if pbar:
        stdout_pbar = self._read_progress(proc, pbar, idx)
    stdout, stderr = proc.communicate()
    if pbar:
        stdout = stdout_pbar + stdout
    transcript_file = runset.console_files[idx]
    self._logger.info('finish chain %u', idx + 1)
    # Console output is not guaranteed to be valid UTF-8.  Escaping bad bytes
    # (instead of raising UnicodeDecodeError) keeps the transcript and makes
    # sure the return code below is always recorded.
    with open(transcript_file, 'w+') as transcript:
        if stdout:
            transcript.write(
                stdout.decode('utf-8', errors='backslashreplace')
            )
        if stderr:
            transcript.write('ERROR')
            transcript.write(
                stderr.decode('utf-8', errors='backslashreplace')
            )
    runset._set_retcode(idx, proc.returncode)
def test_instantiate(self):
    """Build a CmdStanVB from saved variational output and check its contents."""
    vb_dir = os.path.join(DATAFILES_PATH, 'variational')
    model = CmdStanModel(
        stan_file=os.path.join(vb_dir, 'eta_should_be_big.stan')
    )
    cmdstan_args = CmdStanArgs(
        model_name=model.name,
        model_exe=model.exe_file,
        chain_ids=None,
        data={},
        method_args=VariationalArgs(algorithm='meanfield'),
    )
    runset = RunSet(args=cmdstan_args, chains=1)
    # point the run set at an existing output file instead of running CmdStan
    runset._csv_files = [os.path.join(vb_dir, 'eta_big_output.csv')]
    variational = CmdStanVB(runset)

    description = repr(variational)
    for fragment in (
        'CmdStanVB: model=eta_should_be_big',
        'method=variational',
    ):
        self.assertIn(fragment, description)

    self.assertEqual(
        variational.column_names,
        ('lp__', 'log_p__', 'log_g__', 'mu[1]', 'mu[2]'),
    )
    for name, estimate in (('mu[1]', 31.0299), ('mu[2]', 28.8141)):
        self.assertAlmostEqual(
            variational.variational_params_dict[name], estimate, places=2
        )
    self.assertEqual(variational.variational_sample.shape, (1000, 5))
def _run_cmdstan(
    self, runset: RunSet, idx: int = 0, pbar: Any = None
) -> None:
    """
    Run one CmdStan command, save its console output, record its returncode.

    Spawns ``runset.cmds[idx]`` as a subprocess and waits for it to finish.
    Non-empty stdout is written to ``runset.stdout_files[idx]`` and non-empty
    stderr to ``runset.stderr_files[idx]``; finally the process return code
    is stored on the runset.

    :param runset: RunSet providing the commands, output file paths and
        return code slots.
    :param idx: Zero-based index of the command to run.
    :param pbar: Progress bar object; when truthy, the process output is
        first consumed by ``self._read_progress`` and the result is prepended
        to the remaining stdout.
    """
    cmd = runset.cmds[idx]
    self._logger.info('start chain %u', idx + 1)
    self._logger.debug(
        'threads: %s', str(os.environ.get('STAN_NUM_THREADS'))
    )
    self._logger.debug('sampling: %s', cmd)
    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ
    )
    if pbar:
        stdout_pbar = self._read_progress(proc, pbar, idx)
    stdout, stderr = proc.communicate()
    if pbar:
        stdout = stdout_pbar + stdout
    self._logger.info('finish chain %u', idx + 1)
    # Console output is not guaranteed to be valid UTF-8.  Escaping bad bytes
    # (instead of raising UnicodeDecodeError) keeps the output and makes sure
    # the return code below is always recorded.
    if stdout:
        with open(runset.stdout_files[idx], 'w+') as fd:
            fd.write(stdout.decode('utf-8', errors='backslashreplace'))
    if stderr:
        with open(runset.stderr_files[idx], 'w+') as fd:
            fd.write(stderr.decode('utf-8', errors='backslashreplace'))
    runset._set_retcode(idx, proc.returncode)
def test_validate_big_run(self):
    """Validate a two-chain fit assembled from the large ICAR output file."""
    big_csv = os.path.join(
        DATAFILES_PATH, 'runset-big', 'output_icar_nyc-1.csv'
    )
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2],
        seed=12345,
        output_dir=DATAFILES_PATH,
        method_args=SamplerArgs(iter_warmup=1500, iter_sampling=1000),
    )
    runset = RunSet(args=cmdstan_args, chains=2)
    # the same csv file stands in for both chains
    runset._csv_files = [big_csv, big_csv]
    fit = CmdStanMCMC(runset)

    phi_names = ['phi[{}]'.format(n) for n in range(1, 2096)]
    expected_columns = tuple(SAMPLER_STATE + phi_names)

    self.assertEqual(fit.num_draws_sampling, 1000)
    self.assertEqual(fit.column_names, expected_columns)
    self.assertEqual(fit.metric_type, 'diag_e')
    self.assertEqual(fit.step_size.shape, (2, ))
    self.assertEqual(fit.metric.shape, (2, 2095))
    self.assertEqual((1000, 2, 2102), fit.draws().shape)

    phi_draws = fit.draws_pd(params=['phi'])
    self.assertEqual((2000, 2095), phi_draws.shape)
    with self.assertRaisesRegex(ValueError, r'unknown parameter: gamma'):
        fit.draws_pd(params=['gamma'])
def test_variables(self):
    """
    Check ``stan_variables()`` on a fit built from saved Lotka-Volterra output.

    Verifies the number of draws, the recorded variable dimensions and the
    shapes of the arrays returned for variables ``z`` and ``theta``.
    """
    # construct fit using existing sampler output
    exe = os.path.join(DATAFILES_PATH, 'lotka-volterra' + EXTENSION)
    jdata = os.path.join(DATAFILES_PATH, 'lotka-volterra.data.json')
    sampler_args = SamplerArgs(iter_sampling=20)
    cmdstan_args = CmdStanArgs(
        model_name='lotka-volterra',
        model_exe=exe,
        chain_ids=[1],
        seed=12345,
        data=jdata,
        output_dir=DATAFILES_PATH,
        method_args=sampler_args,
    )
    runset = RunSet(args=cmdstan_args, chains=1)
    runset._csv_files = [os.path.join(DATAFILES_PATH, 'lotka-volterra.csv')]
    runset._set_retcode(0, 0)
    fit = CmdStanMCMC(runset)

    self.assertEqual(20, fit.num_draws)
    self.assertEqual(8, len(fit._stan_variable_dims))
    # assertIn reports both operands on failure, unlike assertTrue(x in y)
    self.assertIn('z', fit._stan_variable_dims)
    self.assertEqual(fit._stan_variable_dims['z'], (20, 2))

    # named stan_vars so the builtin vars() is not shadowed
    stan_vars = fit.stan_variables()
    self.assertEqual(len(stan_vars), len(fit._stan_variable_dims))
    self.assertIn('z', stan_vars)
    self.assertEqual(stan_vars['z'].shape, (20, 20, 2))
    self.assertIn('theta', stan_vars)
    self.assertEqual(stan_vars['theta'].shape, (20, 4))
def test_save_latent_dynamics(self):
    """Diagnostic files land in the temp dir unless an output dir is given."""
    shared_kwargs = dict(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
        save_latent_dynamics=True,
    )

    # no output_dir: diagnostic files go to the temporary directory
    tmp_runset = RunSet(args=CmdStanArgs(**shared_kwargs), chains=4)
    self.assertIn(_TMPDIR, tmp_runset.diagnostic_files[0])

    # explicit output_dir: diagnostic files go there instead
    here = os.path.abspath('.')
    cwd_runset = RunSet(
        args=CmdStanArgs(output_dir=here, **shared_kwargs), chains=4
    )
    self.assertIn(here, cwd_runset.diagnostic_files[0])
def test_diagnose_divergences(self):
    """diagnose() reports the treedepth warning for the depth-8 sample."""
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1],
        output_dir=DATAFILES_PATH,
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=1)
    runset._csv_files = [
        os.path.join(
            DATAFILES_PATH, 'diagnose-good', 'corr_gauss_depth8-1.csv'
        )
    ]
    fit = CmdStanMCMC(runset)
    # TODO - use cmdstan test files instead
    expected = (
        'Checking sampler transitions treedepth.\n'
        '424 of 1000 (42%) transitions hit the maximum '
        'treedepth limit of 8, or 2^8 leapfrog steps.\n'
        'Trajectories that are prematurely terminated '
        'due to this limit will result in slow exploration.\n'
        'For optimal performance, increase this limit.'
    )
    # normalize Windows line endings before comparing
    report = fit.diagnose().replace('\r\n', '\n')
    self.assertIn(expected, report)
def test_good(self):
    """InferenceMetadata built from a good Bernoulli csv exposes its config."""
    # construct fit using existing sampler output
    good_dir = os.path.join(DATAFILES_PATH, 'runset-good')
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        seed=12345,
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        output_dir=DATAFILES_PATH,
        method_args=SamplerArgs(
            iter_sampling=100, max_treedepth=11, adapt_delta=0.95
        ),
    )
    runset = RunSet(args=cmdstan_args)
    runset._csv_files = [
        os.path.join(good_dir, 'bern-{}.csv'.format(chain))
        for chain in (1, 2, 3, 4)
    ]
    # mark every chain as having finished successfully
    for chain_idx in range(len(runset._retcodes)):
        runset._set_retcode(chain_idx, 0)
    # chain_idx still holds the last index, so the last csv file is scanned
    config = check_sampler_csv(
        path=runset.csv_files[chain_idx],
        is_fixed_param=False,
        iter_sampling=100,
        iter_warmup=1000,
        save_warmup=False,
        thin=1,
    )
    metadata = InferenceMetadata(config)

    self.assertEqual(
        'Metadata:\n{}\n'.format(config), '{}'.format(metadata)
    )
    self.assertEqual(config, metadata.cmdstan_config)

    hmc_vars = {
        'lp__',
        'accept_stat__',
        'stepsize__',
        'treedepth__',
        'n_leapfrog__',
        'divergent__',
        'energy__',
    }
    self.assertEqual(hmc_vars, metadata.sampler_vars_cols.keys())

    bern_model_vars = {'theta'}
    self.assertEqual(bern_model_vars, metadata.stan_vars_dims.keys())
    self.assertEqual((), metadata.stan_vars_dims['theta'])
    self.assertEqual(bern_model_vars, metadata.stan_vars_cols.keys())
    self.assertEqual((7, ), metadata.stan_vars_cols['theta'])
def test_validate_big_run(self):
    """Validate a two-chain fit of the large ICAR output via get_drawset()."""
    big_csv = os.path.join(
        DATAFILES_PATH, 'runset-big', 'output_icar_nyc-1.csv'
    )
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2],
        seed=12345,
        output_dir=DATAFILES_PATH,
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=2)
    # the same csv file stands in for both chains
    runset._csv_files = [big_csv, big_csv]
    fit = CmdStanMCMC(runset)
    fit._validate_csv_files()

    expected_columns = [
        'lp__',
        'accept_stat__',
        'stepsize__',
        'treedepth__',
        'n_leapfrog__',
        'divergent__',
        'energy__',
    ] + ['phi.{}'.format(n) for n in range(1, 2096)]
    self.assertEqual(fit.columns, len(expected_columns))
    self.assertEqual(fit.column_names, tuple(expected_columns))
    self.assertEqual(fit.metric_type, 'diag_e')
    self.assertEqual(fit.stepsize.shape, (2, ))
    self.assertEqual(fit.metric.shape, (2, 2095))
    self.assertEqual((1000, 2, 2102), fit.sample.shape)

    # requested parameters -> expected number of drawset columns
    selections = (
        (['phi'], 2095),
        (['phi.1'], 1),
        (['phi.1', 'phi.10', 'phi.100'], 3),
        (['phi.2095'], 1),
    )
    for params, n_cols in selections:
        drawset = fit.get_drawset(params=params)
        self.assertEqual((2000, n_cols), drawset.shape)

    # names that match no column must be rejected
    for unknown in ('phi.2096', 'ph'):
        with self.assertRaises(Exception):
            fit.get_drawset(params=[unknown])
def test_commands(self):
    """Each chain's command line carries that chain's id."""
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=4)
    # check the first and the last chain
    for position, chain_id in ((0, 1), (3, 4)):
        self.assertIn('id={}'.format(chain_id), runset.cmd(position))
def test_set_variational_attrs(self):
    """CmdStanVB starts empty and is populated by _set_variational_attrs()."""
    vb_dir = os.path.join(datafiles_path, 'variational')
    model = CmdStanModel(
        stan_file=os.path.join(vb_dir, 'eta_should_be_big.stan')
    )
    cmdstan_args = CmdStanArgs(
        model_name=model.name,
        model_exe=model.exe_file,
        chain_ids=None,
        data={},
        method_args=VariationalArgs(algorithm='meanfield'),
    )
    vi = CmdStanVB(RunSet(args=cmdstan_args, chains=1))

    description = repr(vi)
    self.assertIn('CmdStanVB: model=eta_should_be_big', description)
    self.assertIn('method=variational', description)

    # check CmdStanVB.__init__ state
    self.assertEqual(vi._column_names, ())
    self.assertEqual(vi._variational_mean, {})
    self.assertEqual(vi._variational_sample, None)

    # process csv file, check attrs
    vi._set_variational_attrs(os.path.join(vb_dir, 'eta_big_output.csv'))
    self.assertEqual(
        vi.column_names, ('lp__', 'log_p__', 'log_g__', 'mu.1', 'mu.2')
    )
    for name, estimate in (('mu.1', 31.0299), ('mu.2', 28.8141)):
        self.assertAlmostEqual(
            vi.variational_params_dict[name], estimate, places=2
        )
    self.assertEqual(vi.variational_sample.shape, (1000, 5))
def test_ctor_checks(self):
    """RunSet rejects zero chains and a chain_ids list of the wrong length."""
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[11, 12, 13, 14],
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
    )
    invalid_ctor_kwargs = (
        {'chains': 0},
        {'chains': 4, 'chain_ids': [1, 2, 3]},
    )
    for bad_kwargs in invalid_ctor_kwargs:
        with self.assertRaises(ValueError):
            RunSet(args=cmdstan_args, **bad_kwargs)
def test_set_mle_attrs(self):
    """CmdStanMLE starts empty and is populated by _set_mle_attrs()."""
    opt_dir = os.path.join(datafiles_path, 'optimize')
    model = CmdStanModel(stan_file=os.path.join(opt_dir, 'rosenbrock.stan'))
    cmdstan_args = CmdStanArgs(
        model_name=model.name,
        model_exe=model.exe_file,
        chain_ids=None,
        data={},
        method_args=OptimizeArgs(algorithm='Newton'),
    )
    mle = CmdStanMLE(RunSet(args=cmdstan_args, chains=1))

    description = repr(mle)
    self.assertIn('CmdStanMLE: model=rosenbrock', description)
    self.assertIn('method=optimize', description)

    # nothing has been parsed yet
    self.assertEqual(mle._column_names, ())
    self.assertEqual(mle._mle, {})

    # parse the saved optimizer output and check the estimates
    mle._set_mle_attrs(os.path.join(opt_dir, 'rosenbrock_mle.csv'))
    self.assertEqual(mle.column_names, ('lp__', 'x', 'y'))
    for name in ('x', 'y'):
        self.assertAlmostEqual(mle.optimized_params_dict[name], 1, places=3)
def test_validate_good_run(self):
    """A good four-chain Bernoulli run validates, summarizes and diagnoses."""
    # construct fit using existing sampler output
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(datafiles_path, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        seed=12345,
        data=os.path.join(datafiles_path, 'bernoulli.data.json'),
        output_basename=os.path.join(goodfiles_path, 'bern'),
        method_args=SamplerArgs(
            sampling_iters=100, max_treedepth=11, adapt_delta=0.95
        ),
    )
    runset = RunSet(args=cmdstan_args, chains=4)
    self.assertEqual(4, runset.chains)

    # mark every chain as having finished successfully
    for chain_idx in range(len(runset._retcodes)):
        runset._set_retcode(chain_idx, 0)
    self.assertTrue(runset._check_retcodes())

    fit = CmdStanMCMC(runset)
    fit._validate_csv_files()
    self.assertEqual(100, fit.draws)
    self.assertEqual(8, len(fit.column_names))
    self.assertEqual('lp__', fit.column_names[0])

    drawset = fit.get_drawset()
    expected_shape = (fit.runset.chains * fit.draws, len(fit.column_names))
    self.assertEqual(drawset.shape, expected_shape)

    # summary() only has to complete without raising
    fit.summary()
    self.assertTrue(True)

    # TODO - use cmdstan test files instead
    expected_lines = (
        'Checking sampler transitions treedepth.',
        'Treedepth satisfactory for all transitions.',
        '\nChecking sampler transitions for divergences.',
        'No divergent transitions found.',
        '\nChecking E-BFMI - sampler transitions HMC potential energy.',
        'E-BFMI satisfactory for all transitions.',
        '\nEffective sample size satisfactory.',
    )
    # normalize Windows line endings before comparing
    report = fit.diagnose().replace('\r\n', '\n')
    self.assertIn('\n'.join(expected_lines), report)
def test_chain_ids(self):
    """Custom chain ids show up in both the command and the csv file name."""
    custom_ids = [11, 12, 13, 14]
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=custom_ids,
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=4, chain_ids=custom_ids)
    # check the first and the last chain
    for position, chain_id in ((0, 11), (3, 14)):
        self.assertIn('id={}'.format(chain_id), runset.cmd(position))
        self.assertIn(
            '_{}.csv'.format(chain_id), runset._csv_files[position]
        )
class Data:
    """Namespace holding a CmdStanMCMC object built from existing csv files.

    The class body runs once, when the class is defined: it builds placeholder
    CmdStan arguments for chain ids 1-4, wraps them in a RunSet, redirects the
    RunSet to ``filepaths`` and then validates and assembles the sample.
    The result is available as ``Data.obj``.
    """

    # Model and executable names are placeholders; no CmdStan run is started
    # in this block.
    args = CmdStanArgs(
        "dummy.stan", "dummy.exe", list(range(1, 5)), method_args=SamplerArgs()
    )
    runset_obj = RunSet(args)
    # NOTE(review): `filepaths` is not defined in this block - presumably it
    # is supplied by the enclosing scope; confirm against the caller.
    runset_obj._csv_files = filepaths  # pylint: disable=protected-access
    obj = CmdStanMCMC(runset_obj)
    # Validate the csv files first, then assemble the draws from them.
    obj._validate_csv_files()  # pylint: disable=protected-access
    obj._assemble_sample()  # pylint: disable=protected-access
def test_validate_summary_sig_figs(self):
    """summary() honours sig_figs and rejects out-of-range values."""
    # construct CmdStanMCMC from logistic model output, config
    cmdstan_args = CmdStanArgs(
        model_name='logistic',
        model_exe=os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        seed=12345,
        data=os.path.join(DATAFILES_PATH, 'logistic.data.R'),
        output_dir=DATAFILES_PATH,
        sig_figs=17,
        method_args=SamplerArgs(iter_sampling=100),
    )
    runset = RunSet(args=cmdstan_args)
    runset._csv_files = [
        os.path.join(DATAFILES_PATH, 'logistic_output_{}.csv'.format(chain))
        for chain in (1, 2, 3, 4)
    ]
    # mark every chain as having finished successfully
    for chain_idx in range(len(runset._retcodes)):
        runset._set_retcode(chain_idx, 0)
    fit = CmdStanMCMC(runset)

    def beta1_text(summary):
        # Render the summary cell at row 1, column 0 with 18 significant digits.
        return format(summary.iloc[1, 0], '.18g')

    self.assertTrue(beta1_text(fit.summary()).startswith('1.3'))

    if cmdstan_version_at(2, 25):
        high_precision = beta1_text(fit.summary(sig_figs=17))
        self.assertTrue(high_precision.startswith('1.345767078273'))

        medium_precision = beta1_text(fit.summary(sig_figs=10))
        self.assertTrue(medium_precision.startswith('1.34576707'))

    for invalid_sig_figs in (20, -1):
        with self.assertRaises(ValueError):
            fit.summary(sig_figs=invalid_sig_figs)
def test_variables_3d(self):
    """
    Check variable extraction for scalar, vector and 3-d array variables.

    Builds a fit from saved ``multidim_vars`` output and verifies the recorded
    dimensions plus the shapes returned by ``stan_variable()`` and
    ``stan_variables()`` for ``y_rep``, ``beta`` and ``frac_60``.
    """
    # construct fit using existing sampler output
    exe = os.path.join(DATAFILES_PATH, 'multidim_vars' + EXTENSION)
    jdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
    sampler_args = SamplerArgs(iter_sampling=20)
    cmdstan_args = CmdStanArgs(
        model_name='multidim_vars',
        model_exe=exe,
        chain_ids=[1],
        seed=12345,
        data=jdata,
        output_dir=DATAFILES_PATH,
        method_args=sampler_args,
    )
    runset = RunSet(args=cmdstan_args, chains=1)
    runset._csv_files = [os.path.join(DATAFILES_PATH, 'multidim_vars.csv')]
    runset._set_retcode(0, 0)
    fit = CmdStanMCMC(runset)

    self.assertEqual(20, fit.num_draws_sampling)
    self.assertEqual(3, len(fit.stan_vars_dims))
    # assertIn reports both operands on failure, unlike assertTrue(x in y)
    self.assertIn('y_rep', fit.stan_vars_dims)
    self.assertEqual(fit.stan_vars_dims['y_rep'], (5, 4, 3))

    # one variable at a time
    var_y_rep = fit.stan_variable(name='y_rep')
    self.assertEqual(var_y_rep.shape, (20, 5, 4, 3))
    var_beta = fit.stan_variable(name='beta')
    self.assertEqual(var_beta.shape, (20, 2))
    var_frac_60 = fit.stan_variable(name='frac_60')
    self.assertEqual(var_frac_60.shape, (20, ))

    # all variables at once; named stan_vars so the builtin vars() is not
    # shadowed
    stan_vars = fit.stan_variables()
    self.assertEqual(len(stan_vars), len(fit.stan_vars_dims))
    self.assertIn('y_rep', stan_vars)
    self.assertEqual(stan_vars['y_rep'].shape, (20, 5, 4, 3))
    self.assertIn('beta', stan_vars)
    self.assertEqual(stan_vars['beta'].shape, (20, 2))
    self.assertIn('frac_60', stan_vars)
    self.assertEqual(stan_vars['frac_60'].shape, (20, ))
def test_output_filenames(self):
    """Default csv file names contain the model name and the chain id."""
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=4)
    # (expected fragment, index of the csv file it must appear in)
    expectations = (('bernoulli-', 0), ('-1-', 0), ('-4-', 3))
    for fragment, position in expectations:
        self.assertIn(fragment, runset._csv_files[position])
def test_check_repr(self):
    """repr() of a fresh RunSet lists chains, method, retcodes and files."""
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],  # default
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=4)
    description = repr(runset)
    expected_fragments = (
        'RunSet: chains=4',
        'method=sample',
        'retcodes=[-1, -1, -1, -1]',
        'csv_file',
        'console_msgs',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, description)
    self.assertNotIn('diagnostics_file', description)
def test_get_err_msgs(self):
    """_get_err_msgs() surfaces the exception text of failed chains."""
    cmdstan_args = CmdStanArgs(
        model_name='logistic',
        model_exe=os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION),
        chain_ids=[1, 2, 3],
        data=os.path.join(DATAFILES_PATH, 'logistic.data.R'),
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=3)
    # fake three failed chains whose stdout files are saved transcripts
    for chain_idx in range(3):
        runset._set_retcode(chain_idx, 70)
        transcript_name = 'chain-{}-missing-data-stdout.txt'.format(
            chain_idx + 1
        )
        runset._stdout_files[chain_idx] = os.path.join(
            DATAFILES_PATH, transcript_name
        )
    combined = '\n\t'.join(runset._get_err_msgs())
    self.assertIn('Exception', combined)
def test_metadata(self):
    """
    Check the metadata exposed by a fit built from saved logistic output.

    Covers chain bookkeeping, draw counts, column names, the sampler
    configuration and the split between sampler and Stan variables.
    """
    # construct CmdStanMCMC from logistic model output, config
    exe = os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION)
    rdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
    sampler_args = SamplerArgs(iter_sampling=100)
    cmdstan_args = CmdStanArgs(
        model_name='logistic',
        model_exe=exe,
        chain_ids=[1, 2, 3, 4],
        seed=12345,
        data=rdata,
        output_dir=DATAFILES_PATH,
        sig_figs=17,
        method_args=sampler_args,
    )
    runset = RunSet(args=cmdstan_args)
    runset._csv_files = [
        os.path.join(DATAFILES_PATH, 'logistic_output_1.csv'),
        os.path.join(DATAFILES_PATH, 'logistic_output_2.csv'),
        os.path.join(DATAFILES_PATH, 'logistic_output_3.csv'),
        os.path.join(DATAFILES_PATH, 'logistic_output_4.csv'),
    ]
    # mark every chain as having finished successfully
    retcodes = runset._retcodes
    for i in range(len(retcodes)):
        runset._set_retcode(i, 0)
    fit = CmdStanMCMC(runset)

    col_names = (
        'lp__',
        'accept_stat__',
        'stepsize__',
        'treedepth__',
        'n_leapfrog__',
        'divergent__',
        'energy__',
        'beta[1]',
        'beta[2]',
    )
    self.assertEqual(fit.chains, 4)
    self.assertEqual(fit.chain_ids, [1, 2, 3, 4])
    self.assertEqual(fit.num_draws_warmup, 1000)
    self.assertEqual(fit.num_draws_sampling, 100)
    self.assertEqual(fit.column_names, col_names)
    self.assertEqual(fit.num_unconstrained_params, 2)
    self.assertEqual(fit.metric_type, 'diag_e')

    self.assertEqual(fit.sampler_config['num_samples'], 100)
    self.assertEqual(fit.sampler_config['thin'], 1)
    self.assertEqual(fit.sampler_config['algorithm'], 'hmc')
    self.assertEqual(fit.sampler_config['metric'], 'diag_e')
    self.assertAlmostEqual(fit.sampler_config['delta'], 0.80)

    # assertIn / assertNotIn report both operands on failure, unlike
    # assertTrue(x in y)
    self.assertIn('n_leapfrog__', fit.sampler_vars_cols)
    self.assertIn('energy__', fit.sampler_vars_cols)
    self.assertNotIn('beta', fit.sampler_vars_cols)

    self.assertNotIn('energy__', fit.stan_vars_dims)
    self.assertIn('beta', fit.stan_vars_dims)
    self.assertIn('beta', fit.stan_vars_cols)
    self.assertEqual(fit.stan_vars_dims['beta'], (2, ))
    self.assertEqual(fit.stan_vars_cols['beta'], (7, 8))
def test_validate_good_run(self):
    """A good Bernoulli run yields draws, summaries and clean diagnostics."""
    # construct fit using existing sampler output
    good_dir = os.path.join(DATAFILES_PATH, 'runset-good')
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        seed=12345,
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        output_dir=DATAFILES_PATH,
        method_args=SamplerArgs(
            iter_sampling=100, max_treedepth=11, adapt_delta=0.95
        ),
    )
    runset = RunSet(args=cmdstan_args)
    runset._csv_files = [
        os.path.join(good_dir, 'bern-{}.csv'.format(chain))
        for chain in (1, 2, 3, 4)
    ]
    self.assertEqual(4, runset.chains)

    # mark every chain as having finished successfully
    for chain_idx in range(len(runset._retcodes)):
        runset._set_retcode(chain_idx, 0)
    self.assertTrue(runset._check_retcodes())

    fit = CmdStanMCMC(runset)
    self.assertEqual(100, fit.num_draws)
    self.assertEqual(len(BERNOULLI_COLS), len(fit.column_names))
    self.assertEqual('lp__', fit.column_names[0])

    drawset = fit.get_drawset()
    self.assertEqual(
        drawset.shape,
        (fit.runset.chains * fit.num_draws, len(fit.column_names)),
    )

    # default percentiles
    summary = fit.summary()
    for present in ('5%', '50%', '95%'):
        self.assertIn(present, list(summary.columns))
    for absent in ('1%', '99%'):
        self.assertNotIn(absent, list(summary.columns))

    # user-specified percentiles replace the defaults
    summary = fit.summary(percentiles=[1, 45, 99])
    for present in ('1%', '45%', '99%'):
        self.assertIn(present, list(summary.columns))
    for absent in ('5%', '50%', '95%'):
        self.assertNotIn(absent, list(summary.columns))

    # invalid percentile specifications
    for invalid in ([], [-1]):
        with self.assertRaises(ValueError):
            fit.summary(percentiles=invalid)

    diagnostics = fit.diagnose()
    for message in (
        'Treedepth satisfactory for all transitions.',
        'No divergent transitions found.',
        'E-BFMI satisfactory for all transitions.',
        'Effective sample size satisfactory.',
    ):
        self.assertIn(message, diagnostics)
def test_validate_bad_run(self):
    """Bad transcripts and malformed csv files are each rejected."""
    exe = os.path.join(datafiles_path, 'bernoulli' + EXTENSION)
    jdata = os.path.join(datafiles_path, 'bernoulli.data.json')
    sampler_args = SamplerArgs(
        sampling_iters=100, max_treedepth=11, adapt_delta=0.95
    )

    def make_runset(basename):
        # Build a 4-chain RunSet whose output files use the given basename.
        bad_args = CmdStanArgs(
            model_name='bernoulli',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=jdata,
            output_basename=os.path.join(badfiles_path, basename),
            method_args=sampler_args,
        )
        return RunSet(args=bad_args, chains=4)

    # some chains had errors
    runset = make_runset('bad-transcript-bern')
    with self.assertRaisesRegex(Exception, 'Exception'):
        runset._check_console_msgs()

    malformed_csv_cases = (
        # csv file headers inconsistent
        ('bad-hdr-bern', 'header mismatch'),
        # bad draws
        ('bad-draws-bern', 'draws'),
        # mismatch - column headers, draws
        ('bad-cols-bern', 'bad draw'),
    )
    for basename, error_pattern in malformed_csv_cases:
        runset = make_runset(basename)
        # pretend every chain exited cleanly so only the csv check can fail
        for chain_idx in range(len(runset._retcodes)):
            runset._set_retcode(chain_idx, 0)
        self.assertTrue(runset._check_retcodes())
        fit = CmdStanMCMC(runset)
        with self.assertRaisesRegex(ValueError, error_pattern):
            fit._validate_csv_files()
def variational(
    self,
    data: Union[Dict, str] = None,
    seed: int = None,
    inits: float = None,
    output_dir: str = None,
    save_diagnostics: bool = False,
    algorithm: str = None,
    iter: int = None,
    grad_samples: int = None,
    elbo_samples: int = None,
    eta: Real = None,
    adapt_engaged: bool = True,
    adapt_iter: int = None,
    tol_rel_obj: Real = None,
    eval_elbo: int = None,
    output_samples: int = None,
    require_converged: bool = True,
) -> CmdStanVB:
    """
    Run CmdStan's variational inference algorithm to approximate
    the posterior distribution of the model conditioned on the data.

    This function validates the specified configuration, composes a call to
    the CmdStan ``variational`` method, spawns one subprocess to run the
    algorithm and waits for it to run to completion.
    Unspecified arguments are not included in the call to CmdStan, i.e.,
    those arguments will have CmdStan default values.

    The ``CmdStanVB`` object records the command, the return code,
    and the paths to the variational method output csv and console files.
    The output files are written either to a specified output directory
    or to a temporary directory which is deleted upon session exit.

    Output filenames correspond to the template
    '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the file suffix which is
    either '.csv' for the CmdStan output or '.txt' for
    the console messages, e.g. 'bernoulli-201912081451-1.csv'.
    Output files written to the temporary directory contain an additional
    8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'.

    :param data: Values for all data variables in the model, specified
        either as a dictionary with entries matching the data variables,
        or as the path of a data file in JSON or Rdump format.

    :param seed: The seed for random number generator. Must be an integer
        between 0 and 2^32 - 1. If unspecified,
        ``numpy.random.RandomState()`` is used to generate a seed.

    :param inits: Specifies how the algorithm initializes parameter values.
        Initialization is uniform random on a range centered on 0 with
        default range of 2. Specifying a single number n > 0 changes
        the initialization range to [-n, n].

    :param output_dir: Name of the directory to which CmdStan output
        files are written. If unspecified, output files will be written
        to a temporary directory which is deleted upon session exit.

    :param save_diagnostics: Whether or not to save diagnostics.
        If True, csv output files are written to an output file with
        filename template '<model_name>-<YYYYMMDDHHMM>-diagnostic-<chain_id>',
        e.g. 'bernoulli-201912081451-diagnostic-1.csv'.

    :param algorithm: Algorithm to use. One of: 'meanfield', 'fullrank'.

    :param iter: Maximum number of ADVI iterations.

    :param grad_samples: Number of MC draws for computing the gradient.

    :param elbo_samples: Number of MC draws for estimate of ELBO.

    :param eta: Stepsize scaling parameter.

    :param adapt_engaged: Whether eta adaptation is engaged.

    :param adapt_iter: Number of iterations for eta adaptation.

    :param tol_rel_obj: Relative tolerance parameter for convergence.

    :param eval_elbo: Number of iterations between ELBO evaluations.

    :param output_samples: Number of approximate posterior output draws
        to save.

    :param require_converged: Whether or not to raise an error if Stan
        reports that "The algorithm may not have converged". When False,
        the condition is logged as a warning instead.

    :return: CmdStanVB object

    :raises RuntimeError: If ``require_converged`` is True and the console
        output contains the non-convergence message, or if the run set
        reports a failed return code.
    """
    variational_args = VariationalArgs(
        algorithm=algorithm,
        iter=iter,
        grad_samples=grad_samples,
        elbo_samples=elbo_samples,
        eta=eta,
        adapt_engaged=adapt_engaged,
        adapt_iter=adapt_iter,
        tol_rel_obj=tol_rel_obj,
        eval_elbo=eval_elbo,
        output_samples=output_samples,
    )

    # The run happens inside the context manager so the data / inits it
    # yields are available for the whole duration of the subprocess.
    with MaybeDictToFilePath(data, inits) as (_data, _inits):
        args = CmdStanArgs(
            self._name,
            self._exe_file,
            chain_ids=None,
            data=_data,
            seed=seed,
            inits=_inits,
            output_dir=output_dir,
            save_diagnostics=save_diagnostics,
            method_args=variational_args,
        )

        dummy_chain_id = 0
        runset = RunSet(args=args, chains=1)
        self._run_cmdstan(runset, dummy_chain_id)

    # treat failure to converge as failure
    transcript_file = runset.stdout_files[dummy_chain_id]
    with open(transcript_file, 'r') as transcript:
        contents = transcript.read()
    # A plain substring test is enough for this fixed message; the trailing
    # period is left out so that everything the former regular expression
    # (whose final '.' matched any character) flagged is still flagged.
    converged = 'The algorithm may not have converged' not in contents
    if not converged:
        if require_converged:
            raise RuntimeError('The algorithm may not have converged.')
        # the caller opted out of the error: do not drop the message silently
        self._logger.warning('The algorithm may not have converged.')
    if not runset._check_retcodes():
        msg = 'Error during variational inference.\n{}'.format(
            runset.get_err_msgs()
        )
        raise RuntimeError(msg)
    # pylint: disable=invalid-name
    vb = CmdStanVB(runset)
    return vb
def generate_quantities(
    self,
    data: Union[Dict, str] = None,
    mcmc_sample: Union[CmdStanMCMC, List[str]] = None,
    seed: int = None,
    gq_output_dir: str = None,
) -> CmdStanGQ:
    """
    Run CmdStan's ``generate_quantities`` method, which evaluates the
    generated quantities block of the model for every draw of an
    existing sample.

    The existing sample is given either as a ``CmdStanMCMC`` object or as
    a list of stan-csv files.  One CmdStan process is run per chain of
    that sample.  The returned ``CmdStanGQ`` object is built from the
    resulting ``RunSet`` together with the draws of the input sample.

    Output files are written to ``gq_output_dir`` when it is given,
    otherwise to a temporary directory which is deleted upon session
    exit.  Output filenames follow the template
    '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the suffix '.csv' for
    the CmdStan output or '.txt' for the console messages, e.g.
    'bernoulli-201912081451-1.csv'.  Files written to the temporary
    directory carry an additional 8-character random string, e.g.
    'bernoulli-201912081451-1-5nm6as7u.csv'.

    :param data: Values for all data variables in the model, specified
        either as a dictionary with entries matching the data variables,
        or as the path of a data file in JSON or Rdump format.

    :param mcmc_sample: Either a ``CmdStanMCMC`` object returned by the
        ``sample`` method or a non-empty list of stan-csv files generated
        by fitting the model to the data using any Stan interface.

    :param seed: The seed for random number generator. Must be an integer
        between 0 and 2^32 - 1. If unspecified,
        ``numpy.random.RandomState()`` is used to generate a seed which
        will be used for all chains.
        *NOTE: Specifying the seed will guarantee the same result for
        multiple invocations of this method with the same inputs.
        However this will not reproduce results from the sample method
        given the same inputs because the RNG will be in a different
        state.*

    :param gq_output_dir: Name of the directory in which the CmdStan
        output files are saved. If unspecified, files will be written to
        a temporary directory which is deleted upon session exit.

    :return: CmdStanGQ object

    :raises ValueError: if ``mcmc_sample`` is neither a ``CmdStanMCMC``
        object nor a non-empty list, or if the listed csv files cannot be
        assembled into a sample.
    :raises RuntimeError: if any of the CmdStan runs reports a failure.
    """
    if isinstance(mcmc_sample, CmdStanMCMC):
        csv_paths = mcmc_sample.runset.csv_files
        draws = mcmc_sample.draws_pd()
        chains = mcmc_sample.chains
        chain_ids = mcmc_sample.chain_ids
    elif isinstance(mcmc_sample, list):
        if not mcmc_sample:
            raise ValueError('MCMC sample cannot be empty list')
        csv_paths = mcmc_sample
        draws = None
        chains = len(csv_paths)
        chain_ids = list(range(1, chains + 1))
    else:
        raise ValueError('MCMC sample must be either CmdStanMCMC object'
                         ' or list of paths to sample csv_files.')

    if draws is None:
        # Only csv paths are available: rebuild the sample from the files.
        try:
            # The sampler configuration is taken from the first csv file;
            # a failed scan is retried with the second argument set to True.
            try:
                config = scan_sampler_csv(csv_paths[0])
            except ValueError:
                config = scan_sampler_csv(csv_paths[0], True)
            n_sampling = (
                int(config['num_samples'])
                if 'num_samples' in config else None
            )
            n_warmup = (
                int(config['num_warmup'])
                if 'num_warmup' in config else None
            )
            thinning = int(config['thin']) if 'thin' in config else None
            sample_args = CmdStanArgs(
                self._name,
                self._exe_file,
                chain_ids=chain_ids,
                method_args=SamplerArgs(
                    iter_sampling=n_sampling,
                    iter_warmup=n_warmup,
                    thin=thinning,
                ),
            )
            sample_runset = RunSet(
                args=sample_args, chains=chains, chain_ids=chain_ids)
            sample_runset._csv_files = csv_paths
            draws = CmdStanMCMC(sample_runset).draws_pd()
        except ValueError as exc:
            raise ValueError('Invalid mcmc_sample, error:\n\t{}\n\t'
                             ' while processing files\n\t{}'.format(
                                 repr(exc),
                                 '\n\t'.join(csv_paths))) from exc

    gq_args = GenerateQuantitiesArgs(csv_files=csv_paths)
    gq_args.validate(chains)

    with MaybeDictToFilePath(data, None) as (_data, _inits):
        run_args = CmdStanArgs(
            self._name,
            self._exe_file,
            chain_ids=chain_ids,
            data=_data,
            seed=seed,
            output_dir=gq_output_dir,
            method_args=gq_args,
        )
        runset = RunSet(args=run_args, chains=chains, chain_ids=chain_ids)

        # Worker count: cpu_count() - 2, capped at the number of chains
        # and never fewer than one.
        workers = max(min(cpu_count() - 2, chains), 1)
        with ThreadPoolExecutor(max_workers=workers) as executor:
            for chain_idx in range(chains):
                executor.submit(self._run_cmdstan, runset, chain_idx)

        if not runset._check_retcodes():
            raise RuntimeError(
                'Error during generate_quantities.\n{}'.format(
                    runset.get_err_msgs()))
        quantities = CmdStanGQ(runset=runset, mcmc_sample=draws)
    return quantities
def test_check_retcodes(self):
    """
    A fresh RunSet reports -1 for every chain; ``_check_retcodes`` only
    succeeds once every chain's return code has been set to 0.
    """
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],  # default
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        method_args=SamplerArgs(),
    )
    runset = RunSet(args=cmdstan_args, chains=4)

    n_chains = len(runset._retcodes)
    self.assertEqual(4, n_chains)
    for chain in range(n_chains):
        self.assertEqual(-1, runset._retcode(chain))

    # Mark only the first chain as finished successfully.
    runset._set_retcode(0, 0)
    self.assertEqual(0, runset._retcode(0))
    remaining = range(1, n_chains)
    for chain in remaining:
        self.assertEqual(-1, runset._retcode(chain))
    self.assertFalse(runset._check_retcodes())

    # Once the rest are marked as well, the check passes.
    for chain in remaining:
        runset._set_retcode(chain, 0)
    self.assertTrue(runset._check_retcodes())
def optimize(
    self,
    data: Union[Dict, str] = None,
    seed: int = None,
    inits: Union[Dict, float, str] = None,
    output_dir: str = None,
    algorithm: str = None,
    init_alpha: float = None,
    iter: int = None,
) -> CmdStanMLE:
    """
    Run the specified CmdStan optimize algorithm to produce a
    penalized maximum likelihood estimate of the model parameters.

    The configuration is assembled into a call to the CmdStan
    ``optimize`` method, which is run once via ``_run_cmdstan``.
    Arguments left unspecified are not included in the call to CmdStan,
    i.e., they take CmdStan's default values.

    The ``CmdStanMLE`` object records the command, the return code,
    and the paths to the optimize method output csv and console files.

    Output files are written either to the specified output directory
    or to a temporary directory which is deleted upon session exit.
    Output filenames correspond to the template
    '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the file suffix which
    is either '.csv' for the CmdStan output or '.txt' for the console
    messages, e.g. 'bernoulli-201912081451-1.csv'.  Output files written
    to the temporary directory contain an additional 8-character random
    string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'.

    :param data: Values for all data variables in the model, specified
        either as a dictionary with entries matching the data variables,
        or as the path of a data file in JSON or Rdump format.

    :param seed: The seed for random number generator. Must be an integer
        between 0 and 2^32 - 1. If unspecified,
        ``numpy.random.RandomState()`` is used to generate a seed.

    :param inits: Specifies how parameter values are initialized.
        Initialization is either uniform random on a range centered
        on 0, exactly 0, or a dictionary or file of initial values for
        some or all parameters in the model.  The default initialization
        behavior will initialize all parameter values on range [-2, 2]
        on the *unconstrained* support.  If the expected parameter
        values are too far from this range, this option may improve
        estimation.  The following value types are allowed:

        * Single number, n > 0 - initialization range is [-n, n].
        * 0 - all parameters are initialized to 0.
        * dictionary - pairs parameter name : initial value.
        * string - pathname to a JSON or Rdump data file.

    :param output_dir: Name of the directory to which CmdStan output
        files are written. If unspecified, output files will be written
        to a temporary directory which is deleted upon session exit.

    :param algorithm: Algorithm to use. One of: 'BFGS', 'LBFGS', 'Newton'

    :param init_alpha: Line search step size for first iteration

    :param iter: Total number of iterations

    :return: CmdStanMLE object

    :raises RuntimeError: if the optimizer run reports a failure.
    """
    method_args = OptimizeArgs(
        algorithm=algorithm,
        init_alpha=init_alpha,
        iter=iter,
    )

    with MaybeDictToFilePath(data, inits) as (_data, _inits):
        cmdstan_args = CmdStanArgs(
            self._name,
            self._exe_file,
            chain_ids=None,
            data=_data,
            seed=seed,
            inits=_inits,
            output_dir=output_dir,
            save_diagnostics=False,
            method_args=method_args,
        )
        # The single optimizer run occupies slot 0 of a one-chain RunSet.
        runset = RunSet(args=cmdstan_args, chains=1)
        self._run_cmdstan(runset, 0)

    if runset._check_retcodes():
        return CmdStanMLE(runset)
    raise RuntimeError(
        'Error during optimization.\n{}'.format(runset.get_err_msgs()))
def test_validate_bad_run(self):
    """
    Exercise error reporting on a RunSet whose return codes are all 0
    but whose stderr/csv files come from the 'runset-bad' fixtures.
    """
    def in_bad_dir(*names):
        # Full paths of fixture files under DATAFILES_PATH/runset-bad.
        return [
            os.path.join(DATAFILES_PATH, 'runset-bad', name)
            for name in names
        ]

    # some chains had errors
    cmdstan_args = CmdStanArgs(
        model_name='bernoulli',
        model_exe=os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION),
        chain_ids=[1, 2, 3, 4],
        seed=12345,
        data=os.path.join(DATAFILES_PATH, 'bernoulli.data.json'),
        output_dir=DATAFILES_PATH,
        method_args=SamplerArgs(max_treedepth=11, adapt_delta=0.95),
    )
    runset = RunSet(args=cmdstan_args, chains=4)
    for chain in range(4):
        runset._set_retcode(chain, 0)
    self.assertTrue(runset._check_retcodes())

    # errors reported
    runset._stderr_files = in_bad_dir(
        'bad-transcript-bern-1.txt',
        'bad-transcript-bern-2.txt',
        'bad-transcript-bern-3.txt',
        'bad-transcript-bern-4.txt',
    )
    self.assertEqual(len(runset._get_err_msgs()), 4)

    # Each case pairs a set of malformed csv files with the regex that
    # the ValueError raised by CmdStanMCMC must match.
    bad_csv_cases = (
        # csv file headers inconsistent
        (
            (
                'bad-hdr-bern-1.csv',
                'bad-hdr-bern-2.csv',
                'bad-hdr-bern-3.csv',
                'bad-hdr-bern-4.csv',
            ),
            'header mismatch',
        ),
        # bad draws
        (
            (
                'bad-draws-bern-1.csv',
                'bad-draws-bern-2.csv',
                'bad-draws-bern-3.csv',
                'bad-draws-bern-4.csv',
            ),
            'draws',
        ),
        # mismatch - column headers, draws
        (
            (
                'bad-cols-bern-1.csv',
                'bad-cols-bern-2.csv',
                'bad-cols-bern-3.csv',
                'bad-cols-bern-4.csv',
            ),
            'bad draw, expecting 9 items, found 8',
        ),
    )
    for csv_names, expected_regex in bad_csv_cases:
        runset._csv_files = in_bad_dir(*csv_names)
        with self.assertRaisesRegex(ValueError, expected_regex):
            CmdStanMCMC(runset)
def sample(
    self,
    data: Union[Dict, str] = None,
    chains: Union[int, None] = None,
    parallel_chains: Union[int, None] = None,
    threads_per_chain: Union[int, None] = None,
    seed: Union[int, List[int]] = None,
    chain_ids: Union[int, List[int]] = None,
    inits: Union[Dict, float, str, List[str]] = None,
    iter_warmup: int = None,
    iter_sampling: int = None,
    save_warmup: bool = False,
    thin: int = None,
    max_treedepth: float = None,
    metric: Union[str, List[str]] = None,
    step_size: Union[float, List[float]] = None,
    adapt_engaged: bool = True,
    adapt_delta: float = None,
    adapt_init_phase: int = None,
    adapt_metric_window: int = None,
    adapt_step_size: int = None,
    fixed_param: bool = False,
    output_dir: str = None,
    save_diagnostics: bool = False,
    show_progress: Union[bool, str] = False,
    validate_csv: bool = True,
) -> CmdStanMCMC:
    """
    Run one or more chains of the NUTS sampler to produce a set of draws
    from the posterior distribution of a model conditioned on some data.

    This function validates the specified configuration, composes a call
    to the CmdStan ``sample`` method and spawns one subprocess per chain
    to run the sampler and waits for all chains to run to completion.
    Unspecified arguments are not included in the call to CmdStan, i.e.,
    those arguments will have CmdStan default values.

    For each chain, the ``CmdStanMCMC`` object records the command,
    the return code, the sampler output file paths, and the corresponding
    console outputs, if any.

    Output files are written either to the specified output directory
    or to a temporary directory which is deleted upon session exit.
    Output filenames correspond to the template
    '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the file suffix which
    is either '.csv' for the CmdStan output or '.txt' for the console
    messages, e.g. 'bernoulli-201912081451-1.csv'.  Output files written
    to the temporary directory contain an additional 8-character random
    string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'.

    Side effects: the environment variable ``STAN_NUM_THREADS`` is set to
    ``threads_per_chain``; while progress bars are shown, propagation of
    this object's logger is switched off and it is switched back on
    afterwards, also when an error is raised.

    :param data: Values for all data variables in the model, specified
        either as a dictionary with entries matching the data variables,
        or as the path of a data file in JSON or Rdump format.

    :param chains: Number of sampler chains, must be a positive integer.
        Defaults to 1 when ``fixed_param`` is set, otherwise to 4.

    :param parallel_chains: Number of processes to run in parallel. Must
        be a positive integer.  Defaults to ``cpu_count()`` capped at the
        number of chains; a larger request is reduced to the number of
        chains.

    :param threads_per_chain: The number of threads to use in
        parallelized sections within an MCMC chain (e.g., when using the
        Stan functions ``reduce_sum()`` or ``map_rect()``).  This will
        only have an effect if the model was compiled with threading
        support.  The total number of threads used will be
        ``parallel_chains * threads_per_chain``.  Defaults to 1.

    :param seed: The seed for random number generator. Must be an integer
        between 0 and 2^32 - 1. If unspecified,
        ``numpy.random.RandomState()`` is used to generate a seed which
        will be used for all chains. When the same seed is used across
        all chains, the chain-id is used to advance the RNG to avoid
        dependent samples.

    :param chain_ids: The offset for the random number generator, either
        an integer or a list of unique per-chain offsets.  If
        unspecified, chain ids are numbered sequentially starting from 1.

    :param inits: Specifies how the sampler initializes parameter values.
        Initialization is either uniform random on a range centered
        on 0, exactly 0, or a dictionary or file of initial values for
        some or all parameters in the model.  The default initialization
        behavior will initialize all parameter values on range [-2, 2]
        on the *unconstrained* support.  If the expected parameter
        values are too far from this range, this option may improve
        adaptation.  The following value types are allowed:

        * Single number n > 0 - initialization range is [-n, n].
        * 0 - all parameters are initialized to 0.
        * dictionary - pairs parameter name : initial value.
        * string - pathname to a JSON or Rdump data file.
        * list of strings - per-chain pathname to data file.

    :param iter_warmup: Number of warmup iterations for each chain.

    :param iter_sampling: Number of draws from the posterior for each
        chain.

    :param save_warmup: When ``True``, sampler saves warmup draws as part
        of the Stan csv output file.

    :param thin: Period between saved samples.

    :param max_treedepth: Maximum depth of trees evaluated by NUTS
        sampler per iteration.

    :param metric: Specification of the mass matrix, either as a
        vector consisting of the diagonal elements of the covariance
        matrix ('diag' or 'diag_e') or the full covariance matrix
        ('dense' or 'dense_e').

        If the value of the metric argument is a string other than
        'diag', 'diag_e', 'dense', or 'dense_e', it must be
        a valid filepath to a JSON or Rdump file which contains an entry
        'inv_metric' whose value is either the diagonal vector or
        the full covariance matrix.

        If the value of the metric argument is a list of paths, its
        length must match the number of chains and all paths must be
        unique.

    :param step_size: Initial stepsize for HMC sampler.  The value is
        either a single number or a list of numbers which will be used
        as the global or per-chain initial step size, respectively.
        The length of the list of step sizes must match the number of
        chains.

    :param adapt_engaged: When True, adapt stepsize and metric.

    :param adapt_delta: Adaptation target Metropolis acceptance rate.
        The default value is 0.8.  Increasing this value, which must be
        strictly less than 1, causes adaptation to use smaller step sizes
        which improves the effective sample size, but may increase the
        time per iteration.

    :param adapt_init_phase: Iterations for initial phase of adaptation
        during which step size is adjusted so that the chain converges
        towards the typical set.

    :param adapt_metric_window: The second phase of adaptation tunes
        the metric and stepsize in a series of intervals.  This parameter
        specifies the number of iterations used for the first tuning
        interval; window size increases for each subsequent interval.

    :param adapt_step_size: Number of iterations given over to adjusting
        the step size given the tuned metric during the final phase of
        adaptation.

    :param fixed_param: When ``True``, call CmdStan with argument
        ``algorithm=fixed_param`` which runs the sampler without
        updating the Markov Chain, thus the values of all parameters and
        transformed parameters are constant across all draws and
        only those values in the generated quantities block that are
        produced by RNG functions may change.  This provides
        a way to use Stan programs to generate simulated data via the
        generated quantities block.  This option must be used when the
        parameters block is empty.  Default value is ``False``.

    :param output_dir: Name of the directory to which CmdStan output
        files are written. If unspecified, output files will be written
        to a temporary directory which is deleted upon session exit.

    :param save_diagnostics: Whether or not to save diagnostics. If True,
        csv output files are written to an output file with filename
        template '<model_name>-<YYYYMMDDHHMM>-diagnostic-<chain_id>',
        e.g. 'bernoulli-201912081451-diagnostic-1.csv'.

    :param show_progress: Use tqdm progress bar to show sampling
        progress.  If show_progress=='notebook' use tqdm_notebook
        (needs nodejs for jupyter).  If tqdm is not installed a warning
        is logged and sampling proceeds without progress bars.

    :param validate_csv: If ``False``, skip scan of sample csv output
        file.  When sample is large or disk i/o is slow, will speed up
        processing.  Default is ``True`` - sample csv files are scanned
        for completeness and consistency.

    :return: CmdStanMCMC object

    :raises ValueError: if ``chains``, ``chain_ids``, ``parallel_chains``
        or ``threads_per_chain`` is out of range.
    :raises RuntimeError: if any chain reports a failure.
    """
    if chains is None:
        chains = 1 if fixed_param else 4
    if chains < 1:
        raise ValueError(
            'Chains must be a positive integer value, found {}.'.format(
                chains))

    if chain_ids is None:
        chain_ids = [x + 1 for x in range(chains)]
    elif isinstance(chain_ids, int):
        if chain_ids < 1:
            raise ValueError(
                'Chain_id must be a positive integer value,'
                ' found {}.'.format(chain_ids))
        # A single integer is the id of the first chain; the remaining
        # chains are numbered consecutively from it.
        chain_ids = [chain_ids + i for i in range(chains)]
    else:
        if not len(chain_ids) == chains:
            raise ValueError(
                'Chain_ids must correspond to number of chains'
                ' specified {} chains, found {} chain_ids.'.format(
                    chains, len(chain_ids)))
        for chain_id in chain_ids:
            if chain_id < 0:
                raise ValueError(
                    'Chain_id must be a non-negative integer value,'
                    ' found {}.'.format(chain_id))

    if parallel_chains is None:
        parallel_chains = max(min(cpu_count(), chains), 1)
    elif parallel_chains > chains:
        self._logger.info(
            'Requesting %u parallel_chains for %u chains,'
            ' running all chains in parallel.',
            parallel_chains,
            chains,
        )
        parallel_chains = chains
    elif parallel_chains < 1:
        raise ValueError(
            'Argument parallel_chains must be a positive integer value, '
            'found {}.'.format(parallel_chains))

    if threads_per_chain is None:
        threads_per_chain = 1
    if threads_per_chain < 1:
        raise ValueError(
            'Argument threads_per_chain must be a positive integer value, '
            'found {}.'.format(threads_per_chain))
    self._logger.debug('total threads: %u',
                       parallel_chains * threads_per_chain)
    os.environ['STAN_NUM_THREADS'] = str(threads_per_chain)

    refresh = None
    if show_progress:
        try:
            import tqdm
        except ImportError:
            self._logger.warning(
                ('Package tqdm not installed, cannot show progress '
                 'information. Please install tqdm with '
                 "'pip install tqdm'"))
            show_progress = False

    if show_progress:
        # Logger propagation is switched off while progress bars are
        # displayed; the ``finally`` below guarantees it is re-enabled.
        self._logger.propagate = False
    try:
        # The bar factory and the dynamic_ncols flag are the same for
        # every chain, so they are resolved once, before the chain loop.
        tqdm_pbar = None
        dynamic_ncols = False
        if show_progress:
            tqdm_pbar = tqdm.tqdm
            if (isinstance(show_progress, str)
                    and show_progress.lower() == 'notebook'):
                try:
                    tqdm_pbar = tqdm.tqdm_notebook
                except (ImportError, AttributeError):
                    # A missing module attribute surfaces as
                    # AttributeError, so both are treated as
                    # "notebook bar unavailable".
                    msg = (
                        'Cannot import tqdm.tqdm_notebook.\n'
                        'Functionality is only supported on the '
                        'Jupyter Notebook and compatible platforms'
                        '.\nPlease follow the instructions in '
                        'https://github.com/tqdm/tqdm/issues/394#'
                        'issuecomment-384743637 and remember to '
                        'stop & start your jupyter server.')
                    self._logger.warning(msg)
                    tqdm_pbar = tqdm.tqdm
            # enable dynamic_ncols for advanced users
            # currently hidden feature
            dynamic_ncols = os.environ.get(
                'TQDM_DYNAMIC_NCOLS', 'False'
            ).lower() not in ['0', 'false']

        # TODO: issue 49: inits can be initialization function
        sampler_args = SamplerArgs(
            iter_warmup=iter_warmup,
            iter_sampling=iter_sampling,
            save_warmup=save_warmup,
            thin=thin,
            max_treedepth=max_treedepth,
            metric=metric,
            step_size=step_size,
            adapt_engaged=adapt_engaged,
            adapt_delta=adapt_delta,
            adapt_init_phase=adapt_init_phase,
            adapt_metric_window=adapt_metric_window,
            adapt_step_size=adapt_step_size,
            fixed_param=fixed_param,
        )
        with MaybeDictToFilePath(data, inits) as (_data, _inits):
            args = CmdStanArgs(
                self._name,
                self._exe_file,
                chain_ids=chain_ids,
                data=_data,
                seed=seed,
                inits=_inits,
                output_dir=output_dir,
                save_diagnostics=save_diagnostics,
                method_args=sampler_args,
                refresh=refresh,
                logger=self._logger,
            )
            runset = RunSet(args=args, chains=chains, chain_ids=chain_ids)

            all_pbars = []
            try:
                with ThreadPoolExecutor(
                        max_workers=parallel_chains) as executor:
                    for i in range(chains):
                        pbar = None
                        if show_progress:
                            pbar = tqdm_pbar(
                                desc='Chain {} - warmup'.format(i + 1),
                                position=i,
                                total=1,  # Will set total from Stan's output
                                dynamic_ncols=dynamic_ncols,
                            )
                            all_pbars.append(pbar)
                        executor.submit(self._run_cmdstan, runset, i, pbar)
            finally:
                # Close every bar that was created, even if launching a
                # chain failed part-way through.
                for pbar in all_pbars:
                    pbar.close()

            if show_progress:
                # re-enable logger for console before the run is checked
                self._logger.propagate = True
            if not runset._check_retcodes():
                msg = 'Error during sampling.\n{}'.format(
                    runset.get_err_msgs())
                raise RuntimeError(msg)
            mcmc = CmdStanMCMC(runset, validate_csv, logger=self._logger)
    finally:
        if show_progress:
            # Also reached on error paths, so a failure never leaves the
            # logger muted.
            self._logger.propagate = True
    return mcmc