Пример #1
0
 def test_get_err_msgs(self):
     exe = os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION)
     rdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
     sampler_args = SamplerArgs()
     cmdstan_args = CmdStanArgs(
         model_name='logistic',
         model_exe=exe,
         chain_ids=[1, 2, 3],
         data=rdata,
         method_args=sampler_args,
     )
     runset = RunSet(args=cmdstan_args, chains=3)
     for i in range(3):
         runset._set_retcode(i, 70)
         stdout_file = 'chain-' + str(i + 1) + '-missing-data-stdout.txt'
         path = os.path.join(DATAFILES_PATH, stdout_file)
         runset._stdout_files[i] = path
     errs = '\n\t'.join(runset._get_err_msgs())
     self.assertIn('Exception', errs)
Пример #2
0
    def test_validate_bad_run(self):
        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
        sampler_args = SamplerArgs(max_treedepth=11, adapt_delta=0.95)

        # some chains had errors
        cmdstan_args = CmdStanArgs(
            model_name='bernoulli',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=jdata,
            output_dir=DATAFILES_PATH,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args, chains=4)
        for i in range(4):
            runset._set_retcode(i, 0)
        self.assertTrue(runset._check_retcodes())

        # errors reported
        runset._stderr_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-1.txt'),
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-2.txt'),
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-3.txt'),
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-4.txt'),
        ]
        self.assertEqual(len(runset._get_err_msgs()), 4)

        # csv file headers inconsistent
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-4.csv'),
        ]
        with self.assertRaisesRegex(ValueError, 'header mismatch'):
            CmdStanMCMC(runset)

        # bad draws
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-4.csv'),
        ]
        with self.assertRaisesRegex(ValueError, 'draws'):
            CmdStanMCMC(runset)

        # mismatch - column headers, draws
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-4.csv'),
        ]
        with self.assertRaisesRegex(ValueError,
                                    'bad draw, expecting 9 items, found 8'):
            CmdStanMCMC(runset)
Пример #3
0
    def sample(
        self,
        data: Union[Dict, str] = None,
        chains: Union[int, None] = None,
        cores: Union[int, None] = None,
        seed: Union[int, List[int]] = None,
        chain_ids: Union[int, List[int]] = None,
        inits: Union[Dict, float, str, List[str]] = None,
        iter_warmup: int = None,
        iter_sampling: int = None,
        save_warmup: bool = False,
        thin: int = None,
        max_treedepth: float = None,
        metric: Union[str, List[str]] = None,
        step_size: Union[float, List[float]] = None,
        adapt_engaged: bool = True,
        adapt_delta: float = None,
        adapt_init_phase: int = None,
        adapt_metric_window: int = None,
        adapt_step_size: int = None,
        fixed_param: bool = False,
        output_dir: str = None,
        save_diagnostics: bool = False,
        show_progress: Union[bool, str] = False,
    ) -> CmdStanMCMC:
        """
        Run or more chains of the NUTS sampler to produce a set of draws
        from the posterior distribution of a model conditioned on some data.

        This function validates the specified configuration, composes a call to
        the CmdStan ``sample`` method and spawns one subprocess per chain to run
        the sampler and waits for all chains to run to completion.
        Unspecified arguments are not included in the call to CmdStan, i.e.,
        those arguments will have CmdStan default values.

        For each chain, the ``CmdStanMCMC`` object records the command,
        the return code, the sampler output file paths, and the corresponding
        console outputs, if any. The output files are written either to a
        specified output directory or to a temporary directory which is deleted
        upon session exit.

        The output filenames are composed of the model name, a timestamp
        in the form 'YYYYMMDDhhmm' and the chain id, plus the corresponding
        filetype suffix, either '.csv' for the CmdStan output or '.txt' for
        the console messages, e.g. `bernoulli-201912081451-1.csv`. Output files
        written to the temporary directory contain an additional 8-character
        random string, e.g. `bernoulli-201912081451-1-5nm6as7u.csv`.


        :param data: Values for all data variables in the model, specified
            either as a dictionary with entries matching the data variables,
            or as the path of a data file in JSON or Rdump format.

        :param chains: Number of sampler chains, should be > 1.

        :param cores: Number of processes to run in parallel. Must be an
            integer between 1 and the number of CPUs in the system.
            If none then set automatically to chains but no more
            than total_cpu_count - 2

        :param seed: The seed for random number generator. Must be an integer
            between 0 and 2^32 - 1. If unspecified,
            ``numpy.random.RandomState()``
            is used to generate a seed which will be used for all chains.
            When the same seed is used across all chains,
            the chain-id is used to advance the RNG to avoid dependent samples.

        :param chain_ids: The offset for the random number generator, either
            an integer or a list of unique per-chain offsets.  If unspecified,
            chain ids are numbered sequentially starting from 1.

        :param inits: Specifies how the sampler initializes parameter values.
            Initialization is either uniform random on a range centered on 0,
            exactly 0, or a dictionary or file of initial values for some or all
            parameters in the model.  The default initialization behavior will
            initialize all parameter values on range [-2, 2] on the
            *unconstrained* support.  If the expected parameter values are
            too far from this range, this option may improve adaptation.
            The following value types are allowed:

            * Single number n > 0 - initialization range is [-n, n].
            * 0 - all parameters are initialized to 0.
            * dictionary - pairs parameter name : initial value.
            * string - pathname to a JSON or Rdump data file.
            * list of strings - per-chain pathname to data file.

        :param iter_warmup: Number of warmup iterations for each chain.

        :param iter_sampling: Number of draws from the posterior for each
            chain.

        :param save_warmup: When True, sampler saves warmup draws as part of
            the Stan csv output file.

        :param thin: Period between saved samples.

        :param max_treedepth: Maximum depth of trees evaluated by NUTS sampler
            per iteration.

        :param metric: Specification of the mass matrix, either as a
            vector consisting of the diagonal elements of the covariance
            matrix (``diag`` or ``diag_e``) or the full covariance matrix
            (``dense`` or ``dense_e``).

            If the value of the metric argument is a string other than
            ``diag``, ``diag_e``, ``dense``, or ``dense_e``, it must be
            a valid filepath to a JSON or Rdump file which contains an entry
            ``inv_metric`` whose value is either the diagonal vector or
            the full covariance matrix.

            If the value of the metric argument is a list of paths, its
            length must match the number of chains and all paths must be
            unique.

        :param step_size: Initial stepsize for HMC sampler.  The value is either
            a single number or a list of numbers which will be used as the
            global or per-chain initial step_size, respectively.
            The length of the list of step sizes must match the number of
            chains.

        :param adapt_engaged: When True, adapt stepsize and metric.

        :param adapt_delta: Adaptation target Metropolis acceptance rate.
            The default value is 0.8.  Increasing this value, which must be
            strictly less than 1, causes adaptation to use smaller step sizes.
            It improves the effective sample size, but may increase the time
            per iteration.

        :param adapt_init_phase: Iterations for initial phase of adaptation
            during which step size is adjusted so that the chain converges
            towards the typical set.

        :param adapt_metric_window: The second phase of adaptation tunes
            the metric and stepsize in a series of intervals.  This parameter
            specifies the number of iterations used for the first tuning
            interval; window size increases for each subsequent interval.

        :param adapt_step_size: Number of iterations given over to adjusting
            the step size given the tuned metric during the final phase of
            adaptation.

        :param fixed_param: When True, call CmdStan with argument
            ``algorithm=fixed_param`` which runs the sampler without
            updating the Markov Chain, thus the values of all parameters and
            transformed parameters are constant across all draws and
            only those values in the generated quantities block that are
            produced by RNG functions may change.  This provides
            a way to use Stan programs to generate simulated data via the
            generated quantities block.  This option must be used when the
            parameters block is empty.  Default value is False.

        :param output_dir: Name of the directory to with the  CmdStan output
            files are written. If unspecified, output files will be written
            to a temporary directory which is deleted upon session exit.

        :param save_diagnostics: Whether or not to save diagnostics. If True,
            csv output files are written to
            `<basename>-diagnostic-<chain_id>.csv.`, where `<basename>`
            is set with `csv_basename`.

        :param show_progress: Use tqdm progress bar to show sampling progress.
            If show_progress=='notebook' use tqdm_notebook
            (needs nodejs for jupyter).

        :return: CmdStanMCMC object
        """

        if chains is None:
            if fixed_param:
                chains = 1
            else:
                chains = 4
        if chains < 1:
            raise ValueError(
                'Chains must be a positive integer value, found {}.'.format(
                    chains))

        if chain_ids is None:
            chain_ids = [x + 1 for x in range(chains)]
        else:
            if isinstance(chain_ids, int):
                if chain_ids < 0:
                    raise ValueError(
                        'Chain_id must be a non-negative integer value,'
                        ' found {}.'.format(chain_ids))
                chain_ids = [chain_ids + i + 1 for i in range(chains)]
            else:
                if not len(chain_ids) == chains:
                    raise ValueError(
                        'Chain_ids must correspond to number of chains'
                        ' specified {} chains, found {} chain_ids.'.format(
                            chains, len(chain_ids)))
                for chain_id in chain_ids:
                    if chain_id < 0:
                        raise ValueError(
                            'Chain_id must be a non-negative integer value,'
                            ' found {}.'.format(chain_id))

        cores_avail = cpu_count()
        if cores is None:
            cores = max(min(cores_avail - 2, chains), 1)
        if cores < 1:
            raise ValueError(
                'Argument cores must be a positive integer value, '
                'found {}.'.format(cores))
        if cores > cores_avail:
            self._logger.warning('Requested %u cores, only %u available.',
                                 cores, cpu_count())
            cores = cores_avail

        refresh = None
        if show_progress:
            try:
                import tqdm

                self._logger.propagate = False
            except ImportError:
                self._logger.warning(
                    ('Package tqdm not installed, cannot show progress '
                     'information. Please install tqdm with '
                     "'pip install tqdm'"))
                show_progress = False

        # TODO:  issue 49: inits can be initialization function

        sampler_args = SamplerArgs(
            iter_warmup=iter_warmup,
            iter_sampling=iter_sampling,
            save_warmup=save_warmup,
            thin=thin,
            max_treedepth=max_treedepth,
            metric=metric,
            step_size=step_size,
            adapt_engaged=adapt_engaged,
            adapt_delta=adapt_delta,
            adapt_init_phase=adapt_init_phase,
            adapt_metric_window=adapt_metric_window,
            adapt_step_size=adapt_step_size,
            fixed_param=fixed_param,
        )
        with MaybeDictToFilePath(data, inits) as (_data, _inits):
            args = CmdStanArgs(
                self._name,
                self._exe_file,
                chain_ids=chain_ids,
                data=_data,
                seed=seed,
                inits=_inits,
                output_dir=output_dir,
                save_diagnostics=save_diagnostics,
                method_args=sampler_args,
                refresh=refresh,
                logger=self._logger,
            )
            runset = RunSet(args=args, chains=chains)
            pbar = None
            all_pbars = []

            with ThreadPoolExecutor(max_workers=cores) as executor:
                for i in range(chains):
                    if show_progress:
                        if (isinstance(show_progress, str)
                                and show_progress.lower() == 'notebook'):
                            try:
                                tqdm_pbar = tqdm.tqdm_notebook
                            except ImportError:
                                msg = (
                                    'Cannot import tqdm.tqdm_notebook.\n'
                                    'Functionality is only supported on the '
                                    'Jupyter Notebook and compatible platforms'
                                    '.\nPlease follow the instructions in '
                                    'https://github.com/tqdm/tqdm/issues/394#'
                                    'issuecomment-384743637 and remember to '
                                    'stop & start your jupyter server.')
                                self._logger.warning(msg)
                                tqdm_pbar = tqdm.tqdm
                        else:
                            tqdm_pbar = tqdm.tqdm
                        # enable dynamic_ncols for advanced users
                        # currently hidden feature
                        dynamic_ncols = os.environ.get('TQDM_DYNAMIC_NCOLS',
                                                       'False')
                        if dynamic_ncols.lower() in ['0', 'false']:
                            dynamic_ncols = False
                        else:
                            dynamic_ncols = True

                        pbar = tqdm_pbar(
                            desc='Chain {} - warmup'.format(i + 1),
                            position=i,
                            total=1,  # Will set total from Stan's output
                            dynamic_ncols=dynamic_ncols,
                        )
                        all_pbars.append(pbar)

                    executor.submit(self._run_cmdstan, runset, i, pbar)

            # Closing all progress bars
            for pbar in all_pbars:
                pbar.close()
            if show_progress:
                # re-enable logger for console
                self._logger.propagate = True

            err_msg = 'Error during sampling.\n'
            if not runset._check_retcodes():
                for i in range(chains):
                    if runset._retcode(i) != 0:
                        err_msg = '{}chain {} returned error code {}\n'.format(
                            err_msg, i + 1, runset._retcode(i))
                console_errs = runset._get_err_msgs()
                if len(console_errs) > 0:
                    err_msg = '{}{}'.format(err_msg, ''.join(console_errs))
                raise RuntimeError(err_msg)

            mcmc = CmdStanMCMC(runset)
        return mcmc