def sampling(self, data=None, pars=None, chains=4, iter=2000, warmup=None,
             thin=1, seed=None, init='random', sample_file=None,
             diagnostic_file=None, verbose=False, algorithm=None,
             control=None, n_jobs=1, **kwargs):
    """Draw samples from the model.

    Parameters
    ----------
    data : dict, optional
        A Python dictionary providing the data for the model. Variables
        for Stan are stored in the dictionary as expected. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values; see Notes. ``None``
        is replaced by an empty dict.

    pars : list of string, optional
        A list of strings indicating parameters of interest. By default
        all parameters specified in the model will be stored. Every name
        must be a parameter of the model.

    chains : int, optional
        Positive integer specifying number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer specifying how many iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer specifying number of warmup (aka burn-in)
        iterations. As `warmup` also specifies the number of iterations
        used for step-size adaption, warmup samples should not be used
        for inference.

    thin : int, 1 by default
        Positive integer specifying the period for saving samples.

    seed : int or np.random.RandomState, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chain's seeds are generated from the first chain's to prevent
        dependency among random number streams. By default, seed is
        ``random.randint(0, MAX_UINT)``.

    algorithm : {"NUTS", "HMC"}, optional
        One of algorithms that are implemented in Stan such as the
        No-U-Turn sampler (NUTS, Hoffman and Gelman 2011) and static HMC.
        ``None`` selects NUTS.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen:

        - 0 or '0' initializes all to be zero on the unconstrained
          support;
        - 'random' generates random initial values;
        - list of size equal to the number of chains (`chains`), where
          the list contains a dict with initial parameter values;
        - function returning a dict with initial parameter values. The
          function may take an optional argument `chain_id`.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and
        other saved quantities will be written. If not provided, no
        samples will be written. If the folder given is not writable, a
        temporary directory will be used. When there are multiple
        chains, an underscore and chain number are appended to the file
        name. By default do not write samples to file.

    diagnostic_file : str, optional
        File name indicating where diagnostic data for all parameters
        should be written. Not supported yet: any value other than
        ``None`` raises ``NotImplementedError``.

    verbose : boolean, False by default
        Indicates whether intermediate output should be piped to the
        console. This output may be useful for debugging.

    control : dict, optional
        A dictionary of parameters to control the sampler's behavior.
        Default values are used if control is not specified. The
        following are adaptation parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.65
        - `adapt_kappa` : float, positive, default 0.75
        - `adapt_t0` : float, positive, default 10

        In addition, the algorithm HMC (called 'static HMC' in Stan) and
        NUTS share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        In addition, depending on which algorithm is used, different
        parameters can be set as in Stan for sampling. For the algorithm
        HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    n_jobs : int, 1 by default
        Sample in parallel. If -1 all CPUs are used. If 1 is given, no
        parallel computing code is used at all, which is useful for
        debugging.

    Returns
    -------
    fit : StanFit4<model_name>
        Instance containing the fitted results.

    Raises
    ------
    NotImplementedError
        If `diagnostic_file` is not ``None``.
    ValueError
        If `algorithm` is not a supported name, if `pars` contains a
        name that is not a model parameter, or if `chains` is less than
        one.

    Other parameters
    ----------------
    chain_id : int, optional
        `chain_id` can be a vector to specify the chain_id for all
        chains or an integer. For the former case, they should be
        unique. For the latter, the sequence of integers starting from
        the given `chain_id` are used for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the
        initial values are simulated from [-`init_r`, `init_r`] rather
        than using the default interval (see the manual of Stan).

    test_grad : bool, optional
        When truthy, the returned fit has ``mode`` set to 1 instead
        of 0.

    append_samples : bool, optional

    refresh : int, optional
        Argument `refresh` can be used to control how to indicate the
        progress during sampling (i.e. show the progress every
        ``refresh`` iterations). By default, `refresh` is
        `max(iter/10, 1)`.

    Examples
    --------
    >>> from pystan import StanModel
    >>> m = StanModel(model_code='parameters {real y;} model {y ~ normal(0,1);}')
    >>> m.sampling(iter=100)

    """
    # NOTE: in this function, iter masks iter() the python function.
    # If this ever turns out to be a problem just add:
    # iter_ = iter
    # del iter  # now builtins.iter is available
    if diagnostic_file is not None:
        raise NotImplementedError("diagnostic_file not supported yet")
    if data is None:
        data = {}
    if warmup is None:
        warmup = int(iter // 2)
    algorithms = ("NUTS", "HMC")  # , "Metropolis")
    algorithm = "NUTS" if algorithm is None else algorithm
    if algorithm not in algorithms:
        raise ValueError("Algorithm must be one of {}".format(algorithms))

    fit = self.fit_class(data)
    m_pars = fit._get_param_names()
    p_dims = fit._get_param_dims()

    if pars is not None and len(pars) > 0:
        # Collect the offending names directly. The previous code indexed a
        # string array with the names themselves, which raised IndexError
        # before the intended ValueError could be built.
        unmatched = [p for p in pars if p not in m_pars]
        if unmatched:
            msg = "No parameter(s): {}; sampling not done."
            raise ValueError(msg.format(', '.join(str(p) for p in unmatched)))

    if chains < 1:
        raise ValueError("The number of chains is less than one; sampling "
                         "not done.")

    args_list = pystan.misc._config_argss(chains=chains, iter=iter,
                                          warmup=warmup, thin=thin,
                                          init=init, seed=seed,
                                          sample_file=sample_file,
                                          diagnostic_file=diagnostic_file,
                                          algorithm=algorithm,
                                          control=control, **kwargs)

    # number of samples saved after thinning
    warmup2 = 1 + (warmup - 1) // thin
    n_kept = 1 + (iter - warmup - 1) // thin
    n_save = n_kept + warmup2

    if n_jobs == -1:
        # Pool(processes=None) sizes the pool from the machine's CPU count.
        n_jobs = None

    assert len(args_list) == chains
    call_sampler_args = izip(itertools.repeat(data), args_list)
    call_sampler_star = self.module._call_sampler_star
    if n_jobs is None or n_jobs > 1:
        pool = multiprocessing.Pool(processes=n_jobs)
        try:
            # in Python 3.3 and higher one could use pool.starmap
            ret_and_samples = pool.map(call_sampler_star, call_sampler_args)
        finally:
            # Always release the worker processes, even if a chain failed.
            pool.close()
            pool.join()
    else:
        ret_and_samples = [call_sampler_star(a) for a in call_sampler_args]
    samples = [smpl for _, smpl in ret_and_samples]

    inits_used = pystan.misc._organize_inits([s['inits'] for s in samples],
                                             m_pars, p_dims)

    # One permutation of the kept draws per chain, all drawn from a single
    # generator seeded with the first chain's seed.
    random_state = np.random.RandomState(args_list[0]['seed'])
    perm_lst = [random_state.permutation(int(n_kept)) for _ in range(chains)]
    fnames_oi = fit._get_param_fnames_oi()
    n_flatnames = len(fnames_oi)
    fit.sim = {'samples': samples,
               # rstan has this; name clashes with 'chains' in samples[0]['chains']
               'chains': len(samples),
               'iter': iter,
               'warmup': warmup,
               'thin': thin,
               'n_save': [n_save] * chains,
               'warmup2': [warmup2] * chains,
               'permutation': perm_lst,
               'pars_oi': fit._get_param_names_oi(),
               'dims_oi': fit._get_param_dims(),
               'fnames_oi': fnames_oi,
               'n_flatnames': n_flatnames}
    fit.model_name = self.model_name
    fit.model_pars = m_pars
    fit.par_dims = p_dims
    fit.mode = 0 if not kwargs.get('test_grad') else 1
    fit.inits = inits_used
    fit.stan_args = args_list
    fit.stanmodel = self
    fit.date = datetime.datetime.now()
    return fit
def sampling(self, data=None, pars=None, chains=4, iter=2000, warmup=None,
             thin=1, seed=None, init='random', sample_file=None,
             diagnostic_file=None, verbose=False, algorithm=None,
             control=None, n_jobs=-1, **kwargs):
    """Draw samples from the model.

    Parameters
    ----------
    data : dict, optional
        A Python dictionary providing the data for the model. Variables
        for Stan are stored in the dictionary as expected. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values; see Notes. ``None``
        is replaced by an empty dict.

    pars : string or list of string, optional
        Parameters of interest; a single string is treated as a
        one-element list. By default all parameters specified in the
        model will be stored. Every name must be a parameter of the
        model.

    chains : int, optional
        Positive integer specifying number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer specifying how many iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer specifying number of warmup (aka burn-in)
        iterations. As `warmup` also specifies the number of iterations
        used for step-size adaption, warmup samples should not be used
        for inference.

    thin : int, 1 by default
        Positive integer specifying the period for saving samples.

    seed : int or np.random.RandomState, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chain's seeds are generated from the first chain's to prevent
        dependency among random number streams. By default, seed is
        ``random.randint(0, MAX_UINT)``.

    algorithm : {"NUTS", "HMC", "Fixed_param"}, optional
        One of algorithms that are implemented in Stan such as the
        No-U-Turn sampler (NUTS, Hoffman and Gelman 2011), static HMC,
        or ``Fixed_param``. ``None`` selects NUTS.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen:

        - 0 or '0' initializes all to be zero on the unconstrained
          support;
        - 'random' generates random initial values;
        - list of size equal to the number of chains (`chains`), where
          the list contains a dict with initial parameter values;
        - function returning a dict with initial parameter values. The
          function may take an optional argument `chain_id`.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and
        other saved quantities will be written. If not provided, no
        samples will be written. If the folder given is not writable, a
        temporary directory will be used. When there are multiple
        chains, an underscore and chain number are appended to the file
        name. By default do not write samples to file.

    diagnostic_file : str, optional
        File name indicating where diagnostic data for all parameters
        should be written. Not supported yet: any value other than
        ``None`` raises ``NotImplementedError``.

    verbose : boolean, False by default
        Indicates whether intermediate output should be piped to the
        console. This output may be useful for debugging.

    control : dict, optional
        A dictionary of parameters to control the sampler's behavior.
        Default values are used if control is not specified. The
        following are adaptation parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool, default True
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.8
        - `adapt_kappa` : float, positive, default 0.75
        - `adapt_t0` : float, positive, default 10

        In addition, the algorithm HMC (called 'static HMC' in Stan) and
        NUTS share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        In addition, depending on which algorithm is used, different
        parameters can be set as in Stan for sampling. For the algorithm
        HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    n_jobs : int, optional
        Sample in parallel. If -1 all CPUs are used. If 1, no parallel
        computing code is used at all, which is useful for debugging.
        ``None`` is treated as -1, and a single chain always runs with
        ``n_jobs=1``.

    Returns
    -------
    fit : StanFit4Model
        Instance containing the fitted results.

    Raises
    ------
    NotImplementedError
        If `diagnostic_file` is not ``None``.
    ValueError
        If `algorithm` is not a supported name, if `pars` contains a
        name that is not a model parameter, if `chains` is less than
        one, or if an unrecognized keyword argument is passed.

    Other parameters
    ----------------
    chain_id : int or iterable of int, optional
        `chain_id` can be a vector to specify the chain_id for all
        chains or an integer. For the former case, they should be
        unique. For the latter, the sequence of integers starting from
        the given `chain_id` are used for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the
        initial values are simulated from [-`init_r`, `init_r`] rather
        than using the default interval (see the manual of Stan).

    test_grad : bool, optional
        When truthy, the returned fit has ``mode`` set to 1 instead
        of 0.

    append_samples : bool, optional

    refresh : int, optional
        Argument `refresh` can be used to control how to indicate the
        progress during sampling (i.e. show the progress every
        ``refresh`` iterations). By default, `refresh` is
        `max(iter/10, 1)`.

    Examples
    --------
    >>> from pystan import StanModel
    >>> m = StanModel(model_code='parameters {real y;} model {y ~ normal(0,1);}')
    >>> m.sampling(iter=100)

    """
    # NOTE: in this function, iter masks iter() the python function.
    # If this ever turns out to be a problem just add:
    # iter_ = iter
    # del iter  # now builtins.iter is available
    if diagnostic_file is not None:
        raise NotImplementedError("diagnostic_file not supported yet")
    if data is None:
        data = {}
    if warmup is None:
        warmup = int(iter // 2)
    algorithms = ("NUTS", "HMC", "Fixed_param")  # , "Metropolis")
    algorithm = "NUTS" if algorithm is None else algorithm
    if algorithm not in algorithms:
        raise ValueError("Algorithm must be one of {}".format(algorithms))

    fit = self.fit_class(data)
    m_pars = fit._get_param_names()
    p_dims = fit._get_param_dims()

    if isinstance(pars, string_types):
        pars = [pars]
    if pars is not None and len(pars) > 0:
        # Implementation note: this does not set the params_oi for the
        # instances of stan_fit which actually make the calls to
        # call_sampler. This is because we need separate instances of
        # stan_fit in each thread/process. So update_param_oi needs to
        # be called in every stan_fit instance.
        fit._update_param_oi(pars)
        # Collect the offending names directly. The previous code indexed a
        # string array with the names themselves, which raised IndexError
        # before the intended ValueError could be built.
        unmatched = [p for p in pars if p not in m_pars]
        if unmatched:
            msg = "No parameter(s): {}; sampling not done."
            raise ValueError(msg.format(', '.join(str(p) for p in unmatched)))
    else:
        pars = m_pars

    if chains < 1:
        raise ValueError("The number of chains is less than one; sampling "
                         "not done.")

    # check that arguments in kwargs are valid
    valid_args = {"chain_id", "init_r", "test_grad", "append_samples",
                  "refresh", "control"}
    for arg in kwargs:
        if arg not in valid_args:
            raise ValueError("Parameter `{}` is not recognized.".format(arg))

    args_list = pystan.misc._config_argss(chains=chains, iter=iter,
                                          warmup=warmup, thin=thin,
                                          init=init, seed=seed,
                                          sample_file=sample_file,
                                          diagnostic_file=diagnostic_file,
                                          algorithm=algorithm,
                                          control=control, **kwargs)

    # number of samples saved after thinning
    warmup2 = 1 + (warmup - 1) // thin
    n_kept = 1 + (iter - warmup - 1) // thin
    n_save = n_kept + warmup2

    if n_jobs is None:
        n_jobs = -1

    # disable multiprocessing if we only have a single chain
    if chains == 1:
        n_jobs = 1

    assert len(args_list) == chains
    call_sampler_args = izip(itertools.repeat(data), args_list,
                             itertools.repeat(pars))
    call_sampler_star = self.module._call_sampler_star
    ret_and_samples = _map_parallel(call_sampler_star, call_sampler_args,
                                    n_jobs)
    samples = [smpl for _, smpl in ret_and_samples]

    # _organize_inits strips out lp__ (RStan does it in this method)
    inits_used = pystan.misc._organize_inits([s['inits'] for s in samples],
                                             m_pars, p_dims)

    # One permutation of the kept draws per chain, all drawn from a single
    # generator seeded with the first chain's seed.
    random_state = np.random.RandomState(args_list[0]['seed'])
    perm_lst = [random_state.permutation(int(n_kept)) for _ in range(chains)]
    fnames_oi = fit._get_param_fnames_oi()
    n_flatnames = len(fnames_oi)
    fit.sim = {
        'samples': samples,
        # rstan has this; name clashes with 'chains' in samples[0]['chains']
        'chains': len(samples),
        'iter': iter,
        'warmup': warmup,
        'thin': thin,
        'n_save': [n_save] * chains,
        'warmup2': [warmup2] * chains,
        'permutation': perm_lst,
        'pars_oi': fit._get_param_names_oi(),
        'dims_oi': fit._get_param_dims_oi(),
        'fnames_oi': fnames_oi,
        'n_flatnames': n_flatnames
    }
    fit.model_name = self.model_name
    fit.model_pars = m_pars
    fit.par_dims = p_dims
    fit.mode = 0 if not kwargs.get('test_grad') else 1
    fit.inits = inits_used
    fit.stan_args = args_list
    fit.stanmodel = self
    fit.date = datetime.datetime.now()
    return fit
def sampling(self, data=None, pars=None, chains=4, iter=2000, warmup=None,
             thin=1, seed=None, init='random', sample_file=None,
             diagnostic_file=None, verbose=False, algorithm=None,
             control=None, n_jobs=-1, **kwargs):
    """Draw samples from the model.

    Parameters
    ----------
    data : dict, optional
        A Python dictionary providing the data for the model. Variables
        for Stan are stored in the dictionary as expected. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values; see Notes. ``None``
        is replaced by an empty dict.

    pars : string or list of string, optional
        Parameters of interest; a single string is treated as a
        one-element list. By default all parameters specified in the
        model will be stored. Every name must be a parameter of the
        model.

    chains : int, optional
        Positive integer specifying number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer specifying how many iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer specifying number of warmup (aka burn-in)
        iterations. As `warmup` also specifies the number of iterations
        used for step-size adaption, warmup samples should not be used
        for inference. `warmup=0` forced if `algorithm="Fixed_param"`.

    thin : int, 1 by default
        Positive integer specifying the period for saving samples.

    seed : int or np.random.RandomState, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chain's seeds are generated from the first chain's to prevent
        dependency among random number streams. By default, seed is
        ``random.randint(0, MAX_UINT)``.

    algorithm : {"NUTS", "HMC", "Fixed_param"}, optional
        One of algorithms that are implemented in Stan such as the
        No-U-Turn sampler (NUTS, Hoffman and Gelman 2011), static HMC,
        or ``Fixed_param``. Default is NUTS.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen:

        - 0 or '0' initializes all to be zero on the unconstrained
          support;
        - 'random' generates random initial values;
        - list of size equal to the number of chains (`chains`), where
          the list contains a dict with initial parameter values;
        - function returning a dict with initial parameter values. The
          function may take an optional argument `chain_id`.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and
        other saved quantities will be written. If not provided, no
        samples will be written. If the folder given is not writable, a
        temporary directory will be used. When there are multiple
        chains, an underscore and chain number are appended to the file
        name. By default do not write samples to file.

    diagnostic_file : str, optional
        Not supported yet: any value other than ``None`` raises
        ``NotImplementedError``.

    verbose : boolean, False by default
        Indicates whether intermediate output should be piped to the
        console. This output may be useful for debugging.

    control : dict, optional
        A dictionary of parameters to control the sampler's behavior.
        Default values are used if control is not specified. The
        following are adaptation parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool, default True
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.8
        - `adapt_kappa` : float, positive, default 0.75
        - `adapt_t0` : float, positive, default 10

        In addition, the algorithm HMC (called 'static HMC' in Stan) and
        NUTS share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        In addition, depending on which algorithm is used, different
        parameters can be set as in Stan for sampling. For the algorithm
        HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    n_jobs : int, optional
        Sample in parallel. If -1 all CPUs are used. If 1, no parallel
        computing code is used at all, which is useful for debugging.
        ``None`` is treated as -1, and a single chain always runs with
        ``n_jobs=1``.

    Returns
    -------
    fit : StanFit4Model
        Instance containing the fitted results.

    Raises
    ------
    NotImplementedError
        If `diagnostic_file` is not ``None``.
    ValueError
        If `iter`, `thin` or `warmup` is not an integer, if `algorithm`
        is not a supported name, if `pars` contains a name that is not a
        model parameter, if `chains` is less than one, or if an
        unrecognized keyword argument is passed.

    Other parameters
    ----------------
    chain_id : int or iterable of int, optional
        `chain_id` can be a vector to specify the chain_id for all
        chains or an integer. For the former case, they should be
        unique. For the latter, the sequence of integers starting from
        the given `chain_id` are used for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the
        initial values are simulated from [-`init_r`, `init_r`] rather
        than using the default interval (see the manual of Stan).

    test_grad : bool, optional
        If `test_grad` is ``True``, Stan will not do any sampling.
        Instead, the gradient calculation is tested and printed out and
        the fitted StanFit4Model object is in test gradient mode. By
        default, it is ``False``.

    append_samples : bool, optional

    refresh : int, optional
        Argument `refresh` can be used to control how to indicate the
        progress during sampling (i.e. show the progress every
        ``refresh`` iterations). By default, `refresh` is
        `max(iter/10, 1)`.

    check_hmc_diagnostics : bool, optional
        After sampling run `pystan.diagnostics.check_hmc_diagnostics`
        function. When omitted the diagnostics run, but the checks for
        n_eff and rhat are skipped if the flat parameter count is higher
        than 1000; pass ``check_hmc_diagnostics=True`` to run all of
        them regardless. Nothing is run when it is false, when the
        algorithm is ``Fixed_param``, or in test gradient mode.

    Examples
    --------
    >>> from pystan import StanModel
    >>> m = StanModel(model_code='parameters {real y;} model {y ~ normal(0,1);}')
    >>> m.sampling(iter=100)

    """
    # NOTE: in this function, iter masks iter() the python function.
    # If this ever turns out to be a problem just add:
    # iter_ = iter
    # del iter  # now builtins.iter is available
    if diagnostic_file is not None:
        raise NotImplementedError("diagnostic_file not supported yet")
    if data is None:
        data = {}
    if warmup is None:
        warmup = int(iter // 2)
    if not all(isinstance(arg, numbers.Integral) for arg in (iter, thin, warmup)):
        raise ValueError('only integer values allowed as `iter`, `thin`, and `warmup`.')
    algorithms = ("NUTS", "HMC", "Fixed_param")  # , "Metropolis")
    algorithm = "NUTS" if algorithm is None else algorithm
    if algorithm not in algorithms:
        raise ValueError("Algorithm must be one of {}".format(algorithms))
    if algorithm == "Fixed_param":
        if warmup > 0:
            logger.warning("`warmup=0` forced with `algorithm=\"Fixed_param\"`.")
            warmup = 0

    seed = pystan.misc._check_seed(seed)
    fit = self.fit_class(data, seed)

    m_pars = fit._get_param_names()
    p_dims = fit._get_param_dims()

    if isinstance(pars, string_types):
        pars = [pars]
    if pars is not None and len(pars) > 0:
        # Implementation note: this does not set the params_oi for the
        # instances of stan_fit which actually make the calls to
        # call_sampler. This is because we need separate instances of
        # stan_fit in each thread/process. So update_param_oi needs to
        # be called in every stan_fit instance.
        fit._update_param_oi(pars)
        # Plain membership test; avoids the deprecated ``np.in1d``.
        unmatched = [p for p in pars if p not in m_pars]
        if unmatched:
            msg = "No parameter(s): {}; sampling not done."
            raise ValueError(msg.format(', '.join(str(p) for p in unmatched)))
    else:
        pars = m_pars

    if chains < 1:
        raise ValueError("The number of chains is less than one; sampling "
                         "not done.")

    # Popped before validation so it is neither rejected as unknown nor
    # forwarded to the argument configuration below.
    check_hmc_diagnostics = kwargs.pop('check_hmc_diagnostics', None)
    # check that arguments in kwargs are valid
    valid_args = {"chain_id", "init_r", "test_grad", "append_samples",
                  "refresh", "control"}
    for arg in kwargs:
        if arg not in valid_args:
            raise ValueError("Parameter `{}` is not recognized.".format(arg))

    args_list = pystan.misc._config_argss(chains=chains, iter=iter,
                                          warmup=warmup, thin=thin,
                                          init=init, seed=seed,
                                          sample_file=sample_file,
                                          diagnostic_file=diagnostic_file,
                                          algorithm=algorithm,
                                          control=control, **kwargs)

    # number of samples saved after thinning
    warmup2 = 1 + (warmup - 1) // thin
    n_kept = 1 + (iter - warmup - 1) // thin
    n_save = n_kept + warmup2

    if n_jobs is None:
        n_jobs = -1

    # disable multiprocessing if we only have a single chain
    if chains == 1:
        n_jobs = 1

    assert len(args_list) == chains
    call_sampler_args = izip(itertools.repeat(data), args_list,
                             itertools.repeat(pars))
    call_sampler_star = self.module._call_sampler_star
    ret_and_samples = _map_parallel(call_sampler_star, call_sampler_args,
                                    n_jobs)
    samples = [smpl for _, smpl in ret_and_samples]

    # _organize_inits strips out lp__ (RStan does it in this method)
    inits_used = pystan.misc._organize_inits([s['inits'] for s in samples],
                                             m_pars, p_dims)

    # One permutation of the kept draws per chain, all drawn from a single
    # generator seeded with the first chain's seed.
    random_state = np.random.RandomState(args_list[0]['seed'])
    perm_lst = [random_state.permutation(int(n_kept)) for _ in range(chains)]
    fnames_oi = fit._get_param_fnames_oi()
    n_flatnames = len(fnames_oi)
    fit.sim = {'samples': samples,
               # rstan has this; name clashes with 'chains' in samples[0]['chains']
               'chains': len(samples),
               'iter': iter,
               'warmup': warmup,
               'thin': thin,
               'n_save': [n_save] * chains,
               'warmup2': [warmup2] * chains,
               'permutation': perm_lst,
               'pars_oi': fit._get_param_names_oi(),
               'dims_oi': fit._get_param_dims_oi(),
               'fnames_oi': fnames_oi,
               'n_flatnames': n_flatnames}
    fit.model_name = self.model_name
    fit.model_pars = m_pars
    fit.par_dims = p_dims
    fit.mode = 0 if not kwargs.get('test_grad') else 1
    fit.inits = inits_used
    fit.stan_args = args_list
    fit.stanmodel = self
    fit.date = datetime.datetime.now()

    # If problems are found in the fit, this will print diagnostic
    # messages. Diagnostics only apply to the HMC-family samplers and are
    # meaningless in test-gradient mode (mode 1).
    diagnosable = algorithm in ("NUTS", "HMC") and fit.mode != 1
    if diagnosable and check_hmc_diagnostics is None:
        if n_flatnames > 1000:
            msg = ("Maximum (flat) parameter count (1000) exceeded: "
                   "skipping diagnostic tests for n_eff and Rhat.\n"
                   "To run all diagnostics call pystan.check_hmc_diagnostics(fit)")
            logger.warning(msg)
            checks = ["divergence", "treedepth", "energy"]
            pystan.diagnostics.check_hmc_diagnostics(fit, checks=checks)  # noqa
        else:
            pystan.diagnostics.check_hmc_diagnostics(fit)  # noqa
    elif diagnosable and check_hmc_diagnostics:
        # Explicit opt-in: run every check regardless of parameter count.
        pystan.diagnostics.check_hmc_diagnostics(fit)  # noqa

    return fit