예제 #1
0
파일: api.py 프로젝트: maverickg/pystan
def stan(file=None, model_name="anon_model", model_code=None, fit=None,
         data=None, pars=None, chains=4, iter=2000, warmup=None, thin=1,
         init="random", seed=None, algorithm=None, control=None, sample_file=None,
         diagnostic_file=None, save_dso=True, verbose=False, boost_lib=None,
         eigen_lib=None, **kwargs):
    """Compile a Stan model if needed and draw samples from it.

    When `fit` is None a ``StanModel`` is built from `file` or
    `model_code`; otherwise the model attached to `fit` is reused. The
    model's ``sampling`` method is then called and its result returned.

    Parameters
    ----------

    file : string {'filename', file-like object}
        The model code must be supplied through one of `file` or
        `model_code`.

        When `file` is a string it is expected to be the name of a file
        containing the Stan model specification.

        When `file` is a file object it must have a 'read' method
        (file-like object), which is called to fetch the Stan model
        specification.

    charset : string, optional
        Charset used for decoding when bytes or files are provided.
        'utf-8' by default. This is not a named parameter of this
        function; when supplied it travels through `**kwargs`.

    model_code : string
        The Stan model specification as a string. The model may instead
        be provided through `file`.

    model_name : string, optional
        Name for the model, 'anon_model' by default. However, if `file`
        is a filename, the filename will be used to provide a name.

    fit : StanFit instance
        A StanFit produced by a previous fit, None by default. When given,
        the compiled model associated with it (``fit.stanmodel``) is
        reused and recompilation is avoided.

    data : dict
        Data for the model as a Python dictionary: the keys are the Stan
        variable names and the values are their associated values. Stan
        only accepts certain kinds of values. None (the default) is
        replaced by an empty dict.

    pars : list of string, optional
        Names of the parameters of interest. By default all parameters
        specified in the model will be stored.

    chains : int, optional
        Positive integer giving the number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer giving the number of iterations for each chain,
        warmup included.

    warmup : int, iter//2 by default
        Positive integer giving the number of warmup (aka burn-in)
        iterations. Because `warmup` also sets the number of iterations
        used for stepsize adaptation, warmup samples should not be used
        for inference.

    thin : int, optional
        Positive integer giving the period for saving samples.
        Default is 1.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        How initial parameter values are chosen:

        - 0 or '0' initializes all to be zero on the unconstrained
          support;
        - 'random' generates random initial values;
        - a list of size equal to the number of chains (`chains`), where
          the list contains a dict with initial parameter values;
        - a function returning a dict with initial parameter values. The
          function may take an optional argument `chain_id`.

    seed : int, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chains' seeds are generated from the first chain's to prevent
        dependency among random number streams. By default, seed is
        ``random.randint(0, MAX_UINT)``. The value is converted with
        ``int`` before it is handed to ``sampling``.

    algorithm : {"NUTS", "HMC"}, optional
        One of the algorithms implemented in Stan, such as the No-U-Turn
        sampler (NUTS, Hoffman and Gelman 2011) and static HMC.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and other
        saved quantities will be written. If the folder given is not
        writable, a temporary directory will be used. When there are
        multiple chains, an underscore and chain number are appended to
        the file name. By default samples are not written to file.

    diagnostic_file : string, optional
        File name specifying where diagnostic information should be
        written. By default no diagnostic information is recorded.

    boost_lib : string, optional
        Path to a version of the Boost C++ library to use instead of the
        one supplied with PyStan.

    eigen_lib : string, optional
        Path to a version of the Eigen C++ library to use instead of the
        one supplied with PyStan.

    save_dso : boolean, optional
        Whether the dynamic shared object (DSO) compiled from C++ code
        will be saved for use in a future Python session. True by
        default.

    verbose : boolean, optional
        Whether intermediate output should be piped to the console. This
        output may be useful for debugging. False by default.

    control : dict, optional
        Parameters controlling the sampler's behavior. Default values are
        used if control is not specified. The following are adaptation
        parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.65
        - `adapt_kappa` : float, default 0.75
        - `adapt_t0`    : float, positive, default 10

        In addition, the algorithm HMC (called 'static HMC' in Stan) and
        NUTS share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        Further parameters depend on the algorithm and can be set as in
        Stan for sampling. For the algorithm HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    Returns
    -------

    fit : StanFit instance

    Other parameters
    ----------------

    Extra keyword arguments are forwarded unchanged to ``StanModel``
    (only when a model is compiled) and to ``sampling``.

    chain_id : int, optional
        `chain_id` can be a vector to specify the chain_id for all chains
        or an integer. In the former case, the ids should be unique. In
        the latter, the sequence of integers starting from the given
        `chain_id` is used for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the
        initial values are simulated from [-`init_r`, `init_r`] rather
        than using the default interval (see the manual of Stan).

    test_grad : bool, optional

    append_samples : bool, optional

    refresh : int, optional
        Controls how progress is indicated during sampling (i.e. show the
        progress every `refresh` iterations). By default, `refresh` is
        `max(iter/10, 1)`.

    """
    # NOTE: thin wrapper only; error handling happens in the functions
    # called below.
    data = {} if data is None else data
    warmup = int(iter // 2) if warmup is None else warmup
    if seed is None:
        seed = random.randint(0, MAX_UINT)
    seed = int(seed)

    if fit is None:
        model = StanModel(file=file, model_name=model_name,
                          model_code=model_code, boost_lib=boost_lib,
                          eigen_lib=eigen_lib, save_dso=save_dso,
                          verbose=verbose, **kwargs)
    else:
        # Reuse the already compiled model of the previous fit.
        model = fit.stanmodel

    # The leading arguments are passed by position, in this exact order.
    positional = (data, pars, chains, iter, warmup, thin, seed, init)
    options = dict(sample_file=sample_file, diagnostic_file=diagnostic_file,
                   verbose=verbose, algorithm=algorithm, control=control)
    options.update(kwargs)
    return model.sampling(*positional, **options)
예제 #2
0
파일: api.py 프로젝트: jrings/pystan
def stan(file=None, model_name="anon_model", model_code=None, fit=None,
         data=None, pars=None, chains=4, iter=2000, warmup=None, thin=1,
         init="random", seed=None, algorithm=None, control=None, sample_file=None,
         diagnostic_file=None, save_dso=True, verbose=False, boost_lib=None,
         eigen_lib=None, n_jobs=-1, **kwargs):
    """Fit a model using Stan.

    When `fit` is None a ``StanModel`` is built from `file` or
    `model_code`; otherwise the model attached to `fit` is reused. The
    model's ``sampling`` method is then called and its result returned.
    Unrecognized extra keyword arguments are rejected before any model is
    built.

    Parameters
    ----------

    file : string {'filename', file-like object}
        Model code must be found via one of the following parameters:
        `file` or `model_code`.

        If `file` is a filename, the string passed as an argument is
        expected to be a filename containing the Stan model specification.

        If `file` is a file object, the object passed must have a 'read'
        method (file-like object) that is called to fetch the Stan model
        specification.

    charset : string, optional
        If bytes or files are provided, this charset is used to decode.
        'utf-8' by default.
        NOTE(review): `charset` is neither a named parameter of this
        function nor one of the accepted extra keywords, so passing it
        here raises ``ValueError``.

    model_code : string
        A string containing the Stan model specification. Alternatively,
        the model may be provided with the parameter `file`.

    model_name : string, optional
        A string naming the model, 'anon_model' by default. However, if
        `file` is a filename, then the filename will be used to provide a
        name.

    fit : StanFit instance
        An instance of StanFit derived from a previous fit, None by
        default. If `fit` is not None, the compiled model associated
        with a previous fit is reused and recompilation is avoided.

    data : dict
        A Python dictionary providing the data for the model. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values. None (the default) is
        replaced by an empty dict.

    pars : list of string, optional
        A list of strings indicating parameters of interest. By default
        all parameters specified in the model will be stored.

    chains : int, optional
        Positive integer specifying number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer specifying how many iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer specifying number of warmup (aka burn-in)
        iterations. As `warmup` also specifies the number of iterations
        used for stepsize adaptation, warmup samples should not be used
        for inference.

    thin : int, optional
        Positive integer specifying the period for saving samples.
        Default is 1.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen:
        - 0 or '0' initializes all to be zero on the unconstrained support.
        - 'random' generates random initial values. An optional parameter
            `init_r` controls the range of randomly generated initial values
            for parameters in terms of their unconstrained support;
        - list of size equal to the number of chains (`chains`), where the
            list contains a dict with initial parameter values;
        - function returning a dict with initial parameter values. The
            function may take an optional argument `chain_id`.

    seed : int or np.random.RandomState, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chains' seeds are generated from the first chain's to prevent
        dependency among random number streams. This wrapper passes
        `seed` through unchanged; when it is None the choice is left to
        ``sampling`` (documented default: ``random.randint(0, MAX_UINT)``).

    algorithm : {"NUTS", "HMC", "Fixed_param"}, optional
        One of algorithms that are implemented in Stan such as the
        No-U-Turn sampler (NUTS, Hoffman and Gelman 2011) and static HMC.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and other
        saved quantities will be written. If not provided, no samples
        will be written. If the folder given is not writable, a temporary
        directory will be used. When there are multiple chains, an
        underscore and chain number are appended to the file name. By
        default do not write samples to file.

    diagnostic_file : string, optional
        File name specifying where diagnostic information should be
        written. By default no diagnostic information is recorded.

    boost_lib : string, optional
        The path to a version of the Boost C++ library to use instead of
        the one supplied with PyStan.

    eigen_lib : string, optional
        The path to a version of the Eigen C++ library to use instead of
        the one supplied with PyStan.

    save_dso : boolean, optional
        Indicates whether the dynamic shared object (DSO) compiled from
        C++ code will be saved for use in a future Python session. True by
        default.

    verbose : boolean, optional
        Indicates whether intermediate output should be piped to the
        console. This output may be useful for debugging. False by
        default.

    control : dict, optional
        A dictionary of parameters to control the sampler's behavior.
        Default values are used if control is not specified.  The
        following are adaptation parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.8
        - `adapt_kappa` : float, default 0.75
        - `adapt_t0`    : float, positive, default 10
        - `adapt_init_buffer` : int, positive, defaults to 75
        - `adapt_term_buffer` : int, positive, defaults to 50
        - `adapt_window` : int, positive, defaults to 25

        In addition, the algorithm HMC (called 'static HMC' in Stan) and
        NUTS share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        In addition, depending on which algorithm is used, different
        parameters can be set as in Stan for sampling. For the algorithm
        HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    n_jobs : int, optional
        Sample in parallel. If -1 all CPUs are used. If 1, no parallel
        computing code is used at all, which is useful for debugging.

    Returns
    -------

    fit : StanFit instance

    Raises
    ------

    ValueError
        If `**kwargs` contains a name other than `chain_id`, `init_r`,
        `test_grad`, `append_samples`, `refresh`, `control` or
        `obfuscate_model_name`. The check runs before a model is built.

    Other parameters
    ----------------

    chain_id : int, optional
        `chain_id` can be a vector to specify the chain_id for all chains
        or an integer. For the former case, they should be unique. For
        the latter, the sequence of integers starting from the given
        `chain_id` are used for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the
        initial values are simulated from [-`init_r`, `init_r`] rather
        than using the default interval (see the manual of (Cmd)Stan).

    test_grad : bool, optional

    append_samples : bool, optional

    refresh : int, optional
        Argument `refresh` can be used to control how to indicate the
        progress during sampling (i.e. show the progress every `refresh`
        iterations). By default, `refresh` is `max(iter/10, 1)`.

    Examples
    --------
    >>> from pystan import stan
    >>> import numpy as np
    >>> model_code = '''
    ... parameters {
    ...   real y[2];
    ... }
    ... model {
    ...   y[1] ~ normal(0, 1);
    ...   y[2] ~ double_exponential(0, 2);
    ... }'''
    >>> fit1 = stan(model_code=model_code, iter=10)
    >>> print(fit1)
    >>> excode = '''
    ... transformed data {
    ...     real y[20];
    ...     y[1] <- 0.5796;  y[2]  <- 0.2276;   y[3] <- -0.2959;
    ...     y[4] <- -0.3742; y[5]  <- 0.3885;   y[6] <- -2.1585;
    ...     y[7] <- 0.7111;  y[8]  <- 1.4424;   y[9] <- 2.5430;
    ...     y[10] <- 0.3746; y[11] <- 0.4773;   y[12] <- 0.1803;
    ...     y[13] <- 0.5215; y[14] <- -1.6044;  y[15] <- -0.6703;
    ...     y[16] <- 0.9459; y[17] <- -0.382;   y[18] <- 0.7619;
    ...     y[19] <- 0.1006; y[20] <- -1.7461;
    ... }
    ... parameters {
    ...     real mu;
    ...     real<lower=0, upper=10> sigma;
    ...     vector[2] z[3];
    ...     real<lower=0> alpha;
    ... }
    ... model {
    ...     y ~ normal(mu, sigma);
    ...     for (i in 1:3)
    ...     z[i] ~ normal(0, 1);
    ...     alpha ~ exponential(2);
    ... }'''
    >>>
    >>> def initfun1():
    ...     return dict(mu=1, sigma=4, z=np.random.normal(size=(3, 2)), alpha=1)
    >>> exfit0 = stan(model_code=excode, init=initfun1)
    >>> def initfun2(chain_id=1):
    ...     return dict(mu=1, sigma=4, z=np.random.normal(size=(3, 2)), alpha=1 + chain_id)
    >>> exfit1 = stan(model_code=excode, init=initfun2)
    """
    # NOTE: this is a thin wrapper for other functions. Error handling occurs
    # elsewhere.
    if data is None:
        data = {}
    if warmup is None:
        warmup = int(iter // 2)

    # Check the extra keyword arguments *before* building a model, so a
    # misspelled option is reported immediately rather than after the
    # StanModel has been constructed.
    valid_args = {"chain_id", "init_r", "test_grad", "append_samples",
                  "refresh", "control", "obfuscate_model_name"}
    for arg in kwargs:
        if arg not in valid_args:
            raise ValueError("Parameter `{}` is not recognized.".format(arg))

    if fit is not None:
        # Reuse the compiled model of the previous fit.
        m = fit.stanmodel
    else:
        m = StanModel(file=file, model_name=model_name, model_code=model_code,
                      boost_lib=boost_lib, eigen_lib=eigen_lib,
                      save_dso=save_dso, verbose=verbose)

    fit = m.sampling(data, pars=pars, chains=chains, iter=iter,
                     warmup=warmup, thin=thin, seed=seed, init=init,
                     sample_file=sample_file, diagnostic_file=diagnostic_file,
                     verbose=verbose, algorithm=algorithm, control=control,
                     n_jobs=n_jobs, **kwargs)
    return fit
예제 #3
0
파일: api.py 프로젝트: shoyer/pystan
def stan(file=None,
         model_name="anon_model",
         model_code=None,
         fit=None,
         data=None,
         pars=None,
         chains=4,
         iter=2000,
         warmup=None,
         thin=1,
         init="random",
         seed=None,
         algorithm=None,
         control=None,
         sample_file=None,
         diagnostic_file=None,
         save_dso=True,
         verbose=False,
         boost_lib=None,
         eigen_lib=None,
         **kwargs):
    """Sample from a Stan model, compiling it first unless `fit` is given.

    With `fit` set, ``fit.stanmodel`` is reused; otherwise a ``StanModel``
    is created from `file` or `model_code`. The result of the model's
    ``sampling`` method is returned.

    Parameters
    ----------

    file : string {'filename', file-like object}
        Model code must be found via one of the following parameters:
        `file` or `model_code`.

        A filename is expected to name a file containing the Stan model
        specification.

        A file object must have a 'read' method (file-like object) that
        is called to fetch the Stan model specification.

    charset : string, optional
        Charset used to decode when bytes or files are provided. 'utf-8'
        by default. It is not a named parameter of this function; when
        supplied it is carried in `**kwargs`.

    model_code : string
        String containing the Stan model specification. Alternatively
        the model may be provided with the parameter `file`.

    model_name : string, optional
        String naming the model, 'anon_model' by default. However, if
        `file` is a filename, then the filename will be used to provide a
        name.

    fit : StanFit instance
        StanFit derived from a previous fit, None by default. If not
        None, the compiled model associated with that fit is reused and
        recompilation is avoided.

    data : dict
        Python dictionary providing the data for the model. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values. None (the default)
        is replaced by an empty dict.

    pars : list of string, optional
        Strings indicating the parameters of interest. By default all
        parameters specified in the model will be stored.

    chains : int, optional
        Positive integer, the number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer, the number of iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer, the number of warmup (aka burn-in) iterations.
        As `warmup` also specifies the number of iterations used for
        stepsize adaptation, warmup samples should not be used for
        inference.

    thin : int, optional
        Positive integer, the period for saving samples. Default is 1.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen:

        - 0 or '0': all initialized to zero on the unconstrained support;
        - 'random': random initial values are generated;
        - list of size equal to the number of chains (`chains`), where
          the list contains a dict with initial parameter values;
        - function returning a dict with initial parameter values; it
          may take an optional argument `chain_id`.

    seed : int, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chains' seeds are generated from the first chain's to prevent
        dependency among random number streams. By default, seed is
        ``random.randint(0, MAX_UINT)``. The value is passed through
        ``int`` before being given to ``sampling``.

    algorithm : {"NUTS", "HMC"}, optional
        One of the algorithms implemented in Stan, such as the No-U-Turn
        sampler (NUTS, Hoffman and Gelman 2011) and static HMC.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and other
        saved quantities will be written. If not provided, no samples
        will be written. If the folder given is not writable, a temporary
        directory will be used. When there are multiple chains, an
        underscore and chain number are appended to the file name.

    diagnostic_file : string, optional
        File name specifying where diagnostic information should be
        written. By default no diagnostic information is recorded.

    boost_lib : string, optional
        Path to a version of the Boost C++ library to use instead of the
        one supplied with PyStan.

    eigen_lib : string, optional
        Path to a version of the Eigen C++ library to use instead of the
        one supplied with PyStan.

    save_dso : boolean, optional
        Indicates whether the dynamic shared object (DSO) compiled from
        C++ code will be saved for use in a future Python session. True
        by default.

    verbose : boolean, optional
        Indicates whether intermediate output should be piped to the
        console. This output may be useful for debugging. False by
        default.

    control : dict, optional
        Dictionary of parameters to control the sampler's behavior.
        Default values are used if control is not specified. The
        following are adaptation parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.65
        - `adapt_kappa` : float, default 0.75
        - `adapt_t0`    : float, positive, default 10

        The algorithm HMC (called 'static HMC' in Stan) and NUTS also
        share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        Depending on which algorithm is used, different parameters can
        be set as in Stan for sampling. For the algorithm HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    Returns
    -------

    fit : StanFit instance

    Other parameters
    ----------------

    Extra keyword arguments are handed on unchanged to ``StanModel``
    (only when a model is compiled) and to ``sampling``.

    chain_id : int, optional
        `chain_id` can be a vector to specify the chain_id for all chains
        or an integer. For the former case, they should be unique. For
        the latter, the sequence of integers starting from the given
        `chain_id` is used for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the
        initial values are simulated from [-`init_r`, `init_r`] rather
        than using the default interval (see the manual of Stan).

    test_grad : bool, optional

    append_samples : bool, optional

    refresh : int, optional
        Argument `refresh` controls how progress is indicated during
        sampling (i.e. show the progress every `refresh` iterations).
        By default, `refresh` is `max(iter/10, 1)`.

    """
    # NOTE: nothing but a thin wrapper; error handling is done by the
    # functions called here.
    data = {} if data is None else data
    if warmup is None:
        warmup = int(iter // 2)
    seed = int(random.randint(0, MAX_UINT) if seed is None else seed)

    # Prefer the compiled model of an earlier fit; compile only without one.
    model = fit.stanmodel if fit is not None else StanModel(
        file=file,
        model_name=model_name,
        model_code=model_code,
        boost_lib=boost_lib,
        eigen_lib=eigen_lib,
        save_dso=save_dso,
        verbose=verbose,
        **kwargs)

    return model.sampling(
        data, pars, chains, iter, warmup, thin, seed, init,
        sample_file=sample_file,
        diagnostic_file=diagnostic_file,
        verbose=verbose,
        algorithm=algorithm,
        control=control,
        **kwargs)
예제 #4
0
파일: api.py 프로젝트: psarka/pystan
def stan(file=None,
         model_name="anon_model",
         model_code=None,
         fit=None,
         data=None,
         pars=None,
         chains=4,
         iter=2000,
         warmup=None,
         thin=1,
         init="random",
         seed=None,
         algorithm=None,
         control=None,
         sample_file=None,
         diagnostic_file=None,
         save_dso=True,
         verbose=False,
         boost_lib=None,
         eigen_lib=None,
         n_jobs=-1,
         **kwargs):
    """Fit a model using Stan.

    This is a thin convenience wrapper: it obtains a compiled model (either
    from a previous `fit` or by constructing a new `StanModel`) and then
    calls that model's `sampling` method.

    Parameters
    ----------

    file : string {'filename', file-like object}
        Model code must be found via one of the following parameters: `file`
        or `model_code`.

        If `file` is a filename, the string passed as an argument is expected
        to be a filename containing the Stan model specification.

        If `file` is a file object, the object passed must have a 'read' method
        (file-like object) that is called to fetch the Stan model specification.

    model_code : string
        A string containing the Stan model specification. Alternatively,
        the model may be provided with the parameter `file`.

    model_name : string, optional
        A string naming the model. If none is provided 'anon_model' is
        the default. However, if `file` is a filename, then the filename
        will be used to provide a name. 'anon_model' by default.

    fit : StanFit instance
        An instance of StanFit derived from a previous fit, None by
        default. If `fit` is not None, the compiled model associated
        with a previous fit (``fit.stanmodel``) is reused and recompilation
        is avoided.

    data : dict
        A Python dictionary providing the data for the model. Variables
        for Stan are stored in the dictionary as expected. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values. An empty dict is used
        when `data` is None.

    pars : list of string, optional
        A list of strings indicating parameters of interest. By default
        all parameters specified in the model will be stored.

    chains : int, optional
        Positive integer specifying number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer specifying how many iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer specifying number of warmup (aka burn-in) iterations.
        As `warmup` also specifies the number of iterations used for stepsize
        adaptation, warmup samples should not be used for inference.

    thin : int, optional
        Positive integer specifying the period for saving samples.
        Default is 1.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen:
        - 0 or '0' initializes all to be zero on the unconstrained support.
        - 'random' generates random initial values. An optional parameter
            `init_r` controls the range of randomly generated initial values
            for parameters in terms of their unconstrained support;
        - list of size equal to the number of chains (`chains`), where the
            list contains a dict with initial parameter values;
        - function returning a dict with initial parameter values. The
            function may take an optional argument `chain_id`.

    seed : int or np.random.RandomState, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chain's seeds are generated from the first chain's to prevent
        dependency among random number streams. When omitted, ``None`` is
        forwarded unchanged to the model's `sampling` method, which is then
        responsible for choosing a seed.

    algorithm : {"NUTS", "HMC", "Fixed_param"}, optional
        One of the algorithms that are implemented in Stan such as the
        No-U-Turn sampler (NUTS, Hoffman and Gelman 2011) and static HMC.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and other
        saved quantities will be written. If not provided, no samples
        will be written. If the folder given is not writable, a temporary
        directory will be used. When there are multiple chains, an underscore
        and chain number are appended to the file name. By default do not
        write samples to file.

    diagnostic_file : string, optional
        File name specifying where diagnostic information should be written.
        By default no diagnostic information is recorded.

    boost_lib : string, optional
        The path to a version of the Boost C++ library to use instead of
        the one supplied with PyStan.

    eigen_lib : string, optional
        The path to a version of the Eigen C++ library to use instead of
        the one supplied with PyStan.

    save_dso : boolean, optional
        Indicates whether the dynamic shared object (DSO) compiled from
        C++ code will be saved for use in a future Python session. True by
        default.

    verbose : boolean, optional
        Indicates whether intermediate output should be piped to the console.
        This output may be useful for debugging. False by default.

    control : dict, optional
        A dictionary of parameters to control the sampler's behavior. Default
        values are used if control is not specified.  The following are
        adaptation parameters for sampling algorithms.

        These are parameters used in Stan with similar names:

        - `adapt_engaged` : bool
        - `adapt_gamma` : float, positive, default 0.05
        - `adapt_delta` : float, between 0 and 1, default 0.8
        - `adapt_kappa` : float, positive, default 0.75
        - `adapt_t0`    : float, positive, default 10
        - `adapt_init_buffer` : int, positive, defaults to 75
        - `adapt_term_buffer` : int, positive, defaults to 50
        - `adapt_window` : int, positive, defaults to 25

        In addition, the algorithm HMC (called 'static HMC' in Stan) and NUTS
        share the following parameters:

        - `stepsize`: float, positive
        - `stepsize_jitter`: float, between 0 and 1
        - `metric` : str, {"unit_e", "diag_e", "dense_e"}

        In addition, depending on which algorithm is used, different parameters
        can be set as in Stan for sampling. For the algorithm HMC we can set

        - `int_time`: float, positive

        For algorithm NUTS, we can set

        - `max_treedepth` : int, positive

    n_jobs : int, optional
        Sample in parallel. If -1 all CPUs are used. If 1, no parallel
        computing code is used at all, which is useful for debugging.

    Returns
    -------

    fit : StanFit instance

    Raises
    ------

    ValueError
        If `kwargs` contains a name that is not one of the recognized
        extra parameters listed below. The check runs before any model is
        compiled, so a misspelled option fails immediately.

    Other parameters
    ----------------

    All of the following are forwarded to the model's `sampling` method.

    chain_id : int, optional
        `chain_id` can be a vector to specify the chain_id for all chains or
        an integer. For the former case, they should be unique. For the latter,
        the sequence of integers starting from the given `chain_id` are used
        for all chains.

    init_r : float, optional
        `init_r` is only valid if `init` == "random". In this case, the initial
        values are simulated from [-`init_r`, `init_r`] rather than using the
        default interval (see the manual of (Cmd)Stan).

    test_grad : bool, optional

    append_samples : bool, optional

    refresh : int, optional
        Argument `refresh` can be used to control how to indicate the progress
        during sampling (i.e. show the progress every `refresh` iterations).
        By default, `refresh` is `max(iter/10, 1)`.

    obfuscate_model_name : optional
        Accepted by the keyword check and forwarded like the others.

    Examples
    --------
    >>> from pystan import stan
    >>> import numpy as np
    >>> model_code = '''
    ... parameters {
    ...   real y[2];
    ... }
    ... model {
    ...   y[1] ~ normal(0, 1);
    ...   y[2] ~ double_exponential(0, 2);
    ... }'''
    >>> fit1 = stan(model_code=model_code, iter=10)
    >>> print(fit1)
    >>> excode = '''
    ... transformed data {
    ...     real y[20];
    ...     y[1] <- 0.5796;  y[2]  <- 0.2276;   y[3] <- -0.2959;
    ...     y[4] <- -0.3742; y[5]  <- 0.3885;   y[6] <- -2.1585;
    ...     y[7] <- 0.7111;  y[8]  <- 1.4424;   y[9] <- 2.5430;
    ...     y[10] <- 0.3746; y[11] <- 0.4773;   y[12] <- 0.1803;
    ...     y[13] <- 0.5215; y[14] <- -1.6044;  y[15] <- -0.6703;
    ...     y[16] <- 0.9459; y[17] <- -0.382;   y[18] <- 0.7619;
    ...     y[19] <- 0.1006; y[20] <- -1.7461;
    ... }
    ... parameters {
    ...     real mu;
    ...     real<lower=0, upper=10> sigma;
    ...     vector[2] z[3];
    ...     real<lower=0> alpha;
    ... }
    ... model {
    ...     y ~ normal(mu, sigma);
    ...     for (i in 1:3)
    ...     z[i] ~ normal(0, 1);
    ...     alpha ~ exponential(2);
    ... }'''
    >>>
    >>> def initfun1():
    ...     return dict(mu=1, sigma=4, z=np.random.normal(size=(3, 2)), alpha=1)
    >>> exfit0 = stan(model_code=excode, init=initfun1)
    >>> def initfun2(chain_id=1):
    ...     return dict(mu=1, sigma=4, z=np.random.normal(size=(3, 2)), alpha=1 + chain_id)
    >>> exfit1 = stan(model_code=excode, init=initfun2)
    """
    # NOTE: this is a thin wrapper for other functions. Error handling occurs
    # elsewhere, except for the keyword check below.

    # Validate the extra keyword arguments *before* building the model:
    # compiling a Stan model is slow, so a typo in an option name should be
    # reported immediately rather than after the compilation has finished.
    # NOTE(review): `control` is a named parameter and therefore can never
    # show up in `kwargs`; `obfuscate_model_name` is forwarded to `sampling`
    # rather than to `StanModel` -- confirm that this is intended.
    valid_args = {
        "chain_id", "init_r", "test_grad", "append_samples", "refresh",
        "control", "obfuscate_model_name"
    }
    for arg in kwargs:
        if arg not in valid_args:
            raise ValueError("Parameter `{}` is not recognized.".format(arg))

    if data is None:
        data = {}
    if warmup is None:
        # Half of the iterations are used for warmup unless told otherwise.
        warmup = int(iter // 2)

    if fit is not None:
        # Reuse the already compiled model of a previous fit.
        m = fit.stanmodel
    else:
        m = StanModel(file=file,
                      model_name=model_name,
                      model_code=model_code,
                      boost_lib=boost_lib,
                      eigen_lib=eigen_lib,
                      save_dso=save_dso,
                      verbose=verbose)

    fit = m.sampling(data,
                     pars=pars,
                     chains=chains,
                     iter=iter,
                     warmup=warmup,
                     thin=thin,
                     seed=seed,
                     init=init,
                     sample_file=sample_file,
                     diagnostic_file=diagnostic_file,
                     verbose=verbose,
                     algorithm=algorithm,
                     control=control,
                     n_jobs=n_jobs,
                     **kwargs)
    return fit
예제 #5
0
def stan(file=None, model_name="anon_model", model_code=None, fit=None,
         data=None, pars=None, chains=4, iter=2000, warmup=None, thin=1,
         init="random", seed=None, sample_file=None,
         diagnostic_file=None, save_dso=True, verbose=False, boost_lib=None,
         eigen_lib=None, **kwargs):
    """Fit a model using Stan.

    This is a thin convenience wrapper: it obtains a compiled model (either
    from a previous `fit` or by constructing a new `StanModel`) and then
    calls that model's `sampling` method.

    Parameters
    ----------

    file : string {'filename', file-like object}
        Model code must be found via one of the following parameters: `file`
        or `model_code`.

        If `file` is a filename, the string passed as an argument is expected
        to be a filename containing the Stan model specification.

        If `file` is a file object, the object passed must have a 'read' method
        (file-like object) that is called to fetch the Stan model specification.

    charset : string, optional
        If bytes or files are provided, this charset is used to decode. 'utf-8'
        by default. This is not a named parameter of this function; if given
        it travels through `**kwargs` (see below).

    model_code : string
        A string containing the Stan model specification. Alternatively,
        the model may be provided with the parameter `file`.

    model_name : string, optional
        A string naming the model. If none is provided 'anon_model' is
        the default. However, if `file` is a filename, then the filename
        will be used to provide a name. 'anon_model' by default.

    fit : StanFit instance
        An instance of StanFit derived from a previous fit, None by
        default. If `fit` is not None, the compiled model associated
        with a previous fit (``fit.stanmodel``) is reused and recompilation
        is avoided.

    data : dict
        A Python dictionary providing the data for the model. Variables
        for Stan are stored in the dictionary as expected. Variable
        names are the keys and the values are their associated values.
        Stan only accepts certain kinds of values. An empty dict is used
        when `data` is None.

    pars : list of string, optional
        A list of strings indicating parameters of interest. By default
        all parameters specified in the model will be stored.

    chains : int, optional
        Positive integer specifying number of chains. 4 by default.

    iter : int, 2000 by default
        Positive integer specifying how many iterations for each chain
        including warmup.

    warmup : int, iter//2 by default
        Positive integer specifying number of warmup (aka burn-in) iterations.
        As `warmup` also specifies the number of iterations used for step-size
        adaptation, warmup samples should not be used for inference.

    thin : int, optional
        Positive integer specifying the period for saving samples.
        Default is 1.

    init : {0, '0', 'random', function returning dict, list of dict}, optional
        Specifies how initial parameter values are chosen: 0 or '0'
        initializes all to be zero on the unconstrained support; 'random'
        generates random initial values; list of size equal to the number
        of chains (`chains`), where the list contains a dict with initial
        parameter values; function returning a dict with initial parameter
        values. The function may take an optional argument `chain_id`.

    seed : int, optional
        The seed, a positive integer for random number generation. Only
        one seed is needed when multiple chains are used, as the other
        chain's seeds are generated from the first chain's to prevent
        dependency among random number streams. When omitted (None), a new
        seed is drawn with ``random.randint(0, MAX_UINT)`` on every call.

    sample_file : string, optional
        File name specifying where samples for *all* parameters and other
        saved quantities would be written. Writing samples is not implemented
        in this version: any value other than None raises
        `NotImplementedError`.

    diagnostic_file : string, optional
        File name specifying where diagnostic information should be written.
        By default no diagnostic information is recorded.

    boost_lib : string, optional
        The path to a version of the Boost C++ library to use instead of
        the one supplied with PyStan.

    eigen_lib : string, optional
        The path to a version of the Eigen C++ library to use instead of
        the one supplied with PyStan.

    save_dso : boolean, optional
        Indicates whether the dynamic shared object (DSO) compiled from
        C++ code will be saved for use in a future Python session. True by
        default.

    verbose : boolean, optional
        Indicates whether intermediate output should be piped to the console.
        This output may be useful for debugging. False by default.

    **kwargs
        Extra keyword arguments. They are forwarded to the `StanModel`
        constructor (only when a model is compiled here, i.e. `fit` is None)
        and, in every case, to the model's `sampling` method.

    Returns
    -------

    fit : StanFit instance

    Raises
    ------

    NotImplementedError
        If `sample_file` is not None.

    """
    # NOTE: this is a thin wrapper for other functions. Error handling occurs
    # elsewhere.

    # Fail fast: there is no point in compiling a model (slow) only to
    # discover afterwards that the requested output option is unsupported.
    if sample_file is not None:
        raise NotImplementedError("`sample_file` is not supported")

    if data is None:
        data = {}
    if warmup is None:
        # Half of the iterations are used for warmup unless told otherwise.
        warmup = int(iter // 2)
    if seed is None:
        # Draw the seed per call. A `random.randint(...)` expression used as
        # the parameter default would be evaluated only once, at function
        # definition time, making every default-seeded fit in a process
        # share the same seed.
        seed = random.randint(0, MAX_UINT)

    if fit is not None:
        # Reuse the already compiled model of a previous fit.
        m = fit.stanmodel
    else:
        m = StanModel(file=file, model_name=model_name, model_code=model_code,
                      boost_lib=boost_lib, eigen_lib=eigen_lib,
                      save_dso=save_dso, verbose=verbose, **kwargs)

    fit = m.sampling(data, pars, chains, iter, warmup, thin, seed, init,
                     sample_file=sample_file, diagnostic_file=diagnostic_file,
                     verbose=verbose, **kwargs)
    return fit
예제 #6
0
def predictLogisticGrowth(logGrowthModel: StanModel,
                          regionTrainIndex: int = None,
                          regionName: str = None,
                          confirmedCases=None,
                          target='confirmed',
                          subGroup='casesGlobal',
                          nSamples=N_SAMPLES,
                          nChains=N_CHAINS,
                          nDaysPredict=N_DAYS_PREDICT,
                          minCasesFilter=MIN_CASES_FILTER,
                          minNumberDaysWithCases=MIN_NUMBER_DAYS_WITH_CASES,
                          predictionsPercentiles=PREDICTIONS_PERCENTILES,
                          randomSeed=2020,
                          **kwargs):
    """Fit the logistic growth model to one region and predict ahead.

    The region is chosen either by name (`regionName`) or, when no name is
    given, by rank via `regionTrainIndex`.

    Parameters
    ----------
    logGrowthModel: A compiled pystan model
    regionTrainIndex: Order countries from highest to lowest, and train the ith region.
        Only used when `regionName` is None; converted with ``int()``.
    regionName: Overwrites regionTrainIndex as the region to train
    confirmedCases: A dataframe of countries as columns, and total number of cases as a time series
        (see covidvu.vujson.parseCSSE). When None, it is loaded with
        ``parseCSSE(target, ...)[subGroup]``.
    target: string in ['confirmed', 'deaths', 'recovered']
    subGroup: A key in the output of covidvu.pipeline.vujson.parseCSSE
    nSamples: Number of samples per chain of MCMC (passed as ``iter``)
    nChains: Number of independent chains MCMC
    nDaysPredict: Number of days ahead to predict
    minCasesFilter: Only observations strictly greater than this value are used for training
    minNumberDaysWithCases: Minimum number of observations above minCasesFilter
    predictionsPercentiles: Bayesian confidence intervals to evaluate
    randomSeed: Seed for stan sampler
    kwargs: Optional named arguments. ``maxTreedepth`` sets the sampler's
        ``max_treedepth`` (default MAX_TREEDEPTH). ``siteData``,
        ``jhCSSEFileConfirmed``, ``jhCSSEFileDeaths``,
        ``jhCSSEFileConfirmedDeprecated``, ``jhCSSEFileDeathsDeprecated`` and
        ``jsCSSEReportPath`` are passed to covidvu.pipeline.vujson.parseCSSE
        when `confirmedCases` is None.

    Returns
    -------
    None when fewer than `minNumberDaysWithCases` observations exceed
    `minCasesFilter`. Otherwise a dict with the keys:

    regionTS: All data for the queried region
    predictionsMeanTS: Posterior mean prediction
    predictionsPercentilesTS: Posterior percentiles
    trace: Posterior samples, the result of ``fit.to_dataframe()``
    regionTSClean: Data used for training
    regionName: Name of the region that was trained
    t: Time index (0, 1, 2, ...) of the training observations

    Raises
    ------
    TypeError: If neither `regionName` nor `regionTrainIndex` is given, or
        if `regionName` is not a string.
    """
    # Validate the region selection before doing any expensive work (data
    # parsing, MCMC sampling). Explicit raises are used instead of `assert`
    # so the checks survive `python -O`.
    if regionName is None:
        if regionTrainIndex is None:
            raise TypeError(
                'either regionName or regionTrainIndex must be provided')
    elif not isinstance(regionName, str):
        raise TypeError('regionName must be a str, got {}'.format(
            type(regionName).__name__))

    maxTreeDepth = kwargs.get('maxTreedepth', MAX_TREEDEPTH)

    if confirmedCases is None:
        confirmedCases = parseCSSE(
            target,
            siteData=kwargs.get('siteData', SITE_DATA),
            jhCSSEFileConfirmed=kwargs.get('jhCSSEFileConfirmed',
                                           JH_CSSE_FILE_CONFIRMED),
            jhCSSEFileDeaths=kwargs.get('jhCSSEFileDeaths',
                                        JH_CSSE_FILE_DEATHS),
            jhCSSEFileConfirmedDeprecated=kwargs.get(
                'jhCSSEFileConfirmedDeprecated',
                JH_CSSE_FILE_CONFIRMED_DEPRECATED),
            jhCSSEFileDeathsDeprecated=kwargs.get(
                'jhCSSEFileDeathsDeprecated', JH_CSSE_FILE_DEATHS_DEPRECATED),
            jsCSSEReportPath=kwargs.get('jsCSSEReportPath',
                                        JH_CSSE_REPORT_PATH),
        )[subGroup]

    if regionName is None:
        regionName = _getCountryToTrain(int(regionTrainIndex), confirmedCases)

    regionTS = confirmedCases[regionName]

    # Keep only the observations above the case threshold; give up when
    # there is not enough data left to train on.
    regionTSClean = regionTS[regionTS > minCasesFilter]
    if regionTSClean.shape[0] < minNumberDaysWithCases:
        return None

    regionTSClean.index = pd.to_datetime(regionTSClean.index)

    # The model is fitted on log(cases + 1) against a plain 0..n-1 time axis.
    t = np.arange(regionTSClean.shape[0])
    regionTSCleanLog = np.log(regionTSClean.values + 1)

    logisticGrowthData = {
        'nDays': regionTSClean.shape[0],
        't': list(t),
        'casesLog': list(regionTSCleanLog)
    }

    fit = logGrowthModel.sampling(data=logisticGrowthData,
                                  iter=nSamples,
                                  chains=nChains,
                                  seed=randomSeed,
                                  control={'max_treedepth': maxTreeDepth})

    trace = fit.to_dataframe()

    predictionsMean, predictionsPercentilesTS = _getPredictionsFromPosteriorSamples(
        t,
        trace,
        nDaysPredict,
        predictionsPercentiles,
    )

    predictionsMeanTS, predictionsPercentilesTS = _castPredictionsAsTS(
        regionTSClean,
        nDaysPredict,
        predictionsMean,
        predictionsPercentilesTS,
    )

    regionTS.index = pd.to_datetime(regionTS.index)
    prediction = {
        'regionTS': regionTS,
        'predictionsMeanTS': predictionsMeanTS,
        'predictionsPercentilesTS': predictionsPercentilesTS,
        'trace': trace,
        'regionTSClean': regionTSClean,
        'regionName': regionName,
        't': t,
    }

    return prediction
예제 #7
0
def predictLogisticGrowth(logGrowthModel: StanModel,
                          regionName,
                          target                 = 'confirmed',
                          regionType             = 'country',
                          nSamples               = N_SAMPLES,
                          nChains                = N_CHAINS,
                          nDaysPredict           = N_DAYS_PREDICT,
                          minCasesFilter         = MIN_CASES_FILTER,
                          minNumberDaysWithCases = MIN_NUMBER_DAYS_WITH_CASES,
                          predictionsPercentiles = PREDICTIONS_PERCENTILES,
                          randomSeed             = 2020,
                          databasePath           = DATABASE_PATH,
                          maxTreeDepth           = MAX_TREEDEPTH,
                          ):
    """Fit the logistic growth model to one named region and predict ahead.

    Parameters
    ----------
    logGrowthModel: A compiled pystan model
    regionName: Name of the region to train, which must be a country or US state in Cryostation
    target: 'confirmed' or 'deaths'
    regionType: 'country' or 'stateUS'
    nSamples: Number of samples per chain of MCMC (passed as ``iter``)
    nChains: Number of independent chains MCMC
    nDaysPredict: Number of days ahead to predict
    minCasesFilter: Training starts at the first observation strictly greater than this value
    minNumberDaysWithCases: Minimum number of observations from that point onwards
    predictionsPercentiles: Bayesian confidence intervals to evaluate
    randomSeed: Seed for stan sampler
    databasePath: Path to virustrack.db
    maxTreeDepth: max_treedepth for pystan

    Returns
    -------
    None when the region has no `target` series, when no observation
    exceeds `minCasesFilter`, or when fewer than `minNumberDaysWithCases`
    observations remain. Otherwise a dict with the keys:

    regionTS: All data for the queried region
    predictionsMeanTS: Posterior mean prediction
    predictionsPercentilesTS: Posterior percentiles
    trace: Posterior samples, the result of ``fit.to_dataframe()``
    regionTSClean: Data used for training
    regionName: Name of the region that was trained
    t: Days elapsed since the first training observation

    Raises
    ------
    NotImplementedError: If `regionType` is neither 'country' nor 'stateUS'.
    """

    with Cryostation(databasePath) as storage:
        # Fetch the region's record once instead of once for the membership
        # test and again for the read.
        if regionType == 'country':
            regionRecord = storage[regionName]
        elif regionType == 'stateUS':
            regionRecord = storage['US']['provinces'][regionName]
        else:
            raise NotImplementedError(
                'unsupported regionType: {!r}'.format(regionType))

        if target not in regionRecord.keys():
            return None
        regionTS = pd.Series(regionRecord[target])

    regionTS.index = pd.to_datetime(regionTS.index)
    regionTS.sort_index(inplace=True)

    # Locate the first observation above the threshold. `argmax` on an
    # all-False mask returns 0, which would silently select the *entire*
    # series for training, so the "nothing exceeds the threshold" case has to
    # be handled explicitly. Working on the underlying array keeps the result
    # positional, which is what `.iloc` below needs.
    aboveThreshold = (regionTS > minCasesFilter).values
    if not aboveThreshold.any():
        return None
    minIndex = aboveThreshold.argmax()

    regionTSClean = regionTS.iloc[minIndex:]
    if regionTSClean.shape[0] < minNumberDaysWithCases:
        return None

    # Cumulative number of days since the first training observation; the
    # first difference is missing and therefore counted as 0.
    t = regionTSClean.index.to_series().diff().map(lambda d: d.days).fillna(0).cumsum().values
    # The model is fitted on log(cases + 1).
    regionTSCleanLog = np.log(regionTSClean.values + 1)

    logisticGrowthData = {'nDays': regionTSClean.shape[0],
                          't': list(t),
                          'casesLog': list(regionTSCleanLog)
                          }

    fit = logGrowthModel.sampling(data=logisticGrowthData, iter=nSamples, chains=nChains, seed=randomSeed,
                                  control={'max_treedepth':maxTreeDepth}
                                  )

    trace = fit.to_dataframe()

    predictionsMean, predictionsPercentilesTS =  _getPredictionsFromPosteriorSamples(t,
                                                                                     trace,
                                                                                     nDaysPredict,
                                                                                     predictionsPercentiles,
                                                                                     )

    predictionsMeanTS, predictionsPercentilesTS = _castPredictionsAsTS(regionTSClean,
                                                                       nDaysPredict,
                                                                       predictionsMean,
                                                                       predictionsPercentilesTS,
                                                                       )

    prediction = {
        'regionTS':                 regionTS,
        'predictionsMeanTS':        predictionsMeanTS,
        'predictionsPercentilesTS': predictionsPercentilesTS,
        'trace':                    trace,
        'regionTSClean':            regionTSClean,
        'regionName':               regionName,
        't':                        t,
    }

    return prediction