Example #1
    def _get_y(self, x, y):
        """Get y, even when x is a DataGenerator and y is None"""
        if y is not None:
            return y
        else:
            y_true = [d for _, d in make_generator(x, y, test=True)]
            return np.concatenate(to_numpy(y_true), axis=0)
Example #2
def test_make_generator():

    # Create some data
    x = np.random.randn(100, 3)
    w = np.random.randn(3, 1)
    b = np.random.randn()
    y = x @ w + b

    # Should return an ArrayDataGenerator
    dg = make_generator(x, y)
    assert isinstance(dg, ArrayDataGenerator)

    # Should just return what passed if passed an ArrayDataGenerator
    dg = ArrayDataGenerator(x, y, batch_size=5)
    dg_out = make_generator(dg)
    assert isinstance(dg_out, ArrayDataGenerator)
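
For reference, a minimal sketch of how the returned generator might be consumed, reassembling the full targets batch by batch the way ``_get_y`` does in Example #1 (the ``probflow.data`` import path and order-preserving iteration are assumptions here):

import numpy as np
from probflow.data import ArrayDataGenerator

x = np.random.randn(100, 3)
y = np.random.randn(100, 1)

# Iterate (x, y) batches and stitch the targets back together
dg = ArrayDataGenerator(x, y, batch_size=5)
y_true = np.concatenate([y_batch for _, y_batch in dg], axis=0)
assert y_true.shape == (100, 1)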
Example #3
    def _sample(self, x, func, ed=None, axis=1):
        """Sample from the model"""
        samples = []
        for x_data, y_data in make_generator(x, test=True):
            if x_data is None:
                samples += [func(self())]
            else:
                samples += [func(self(O.expand_dims(x_data, ed)))]
        return np.concatenate(to_numpy(samples), axis=axis)
Example #4
    def __init__(self, metric, x, y=None, verbose=True):

        # Store metric
        self.metric_fn = get_metric_fn(metric)

        # Store validation data
        self.data = make_generator(x, y)

        # Store metrics and epochs
        self.current_metric = np.nan
        self.current_epoch = 0
        self.metrics = []
        self.epochs = []
        self.verbose = verbose
Example #5
    def __init__(self, metric, x, y=None, verbose=False):

        # Store metric
        self.metric_fn = get_metric_fn(metric)
        if isinstance(metric, str):
            self.metric_name = metric
        else:
            self.metric_name = self.metric_fn.__name__

        # Store validation data
        self.data = make_generator(x, y)

        # Store metrics and epochs
        self.current_metric = np.nan
        self.current_epoch = 0
        self.metrics = []
        self.epochs = []
        self.verbose = verbose
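
Only the constructor is shown in these two callback examples. A hypothetical sketch of how such a callback could update its stored state once per epoch follows; the ``on_epoch_end`` hook name and the ``c.model = self`` assignment come from ``fit`` in Example #9, but the body below is an assumption, not probflow's actual implementation:

    def on_epoch_end(self):
        # Hypothetical: score the stored validation data with the metric
        self.current_epoch += 1
        self.current_metric = self.model.metric(self.metric_fn, self.data)
        self.epochs += [self.current_epoch]
        self.metrics += [self.current_metric]
        if self.verbose:
            print(self.metric_name, self.current_metric)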
Example #6
    def _intervals(self, fn, x, side, ci=0.95, n=1000, batch_size=None):
        """Compute intervals on some type of sample"""

        # Compute in batches?
        if batch_size is not None:
            intervals = [
                self._intervals(fn, x_data, side, ci=ci, n=n)
                for x_data, y_data in make_generator(
                    x, test=True, batch_size=batch_size
                )
            ]
            return (np.concatenate(e, axis=0) for e in zip(*intervals))

        # No batching (or this is a batch)
        samples = fn(x, n=n)
        if side == "lower":
            return np.percentile(samples, 100 * (1.0 - ci), axis=0)
        elif side == "upper":
            return np.percentile(samples, 100 * ci, axis=0)
        else:
            lb = 100 * (1.0 - ci) / 2.0
            prcs = np.percentile(samples, [lb, 100.0 - lb], axis=0)
            return prcs[0, ...], prcs[1, ...]
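
The two-sided branch simply splits the tail mass equally between the two ends. The percentile arithmetic in isolation (plain NumPy, with ``samples`` standing in for the draws from ``fn``):

import numpy as np

samples = np.random.randn(1000, 1)  # stand-in for fn(x, n=1000)

ci = 0.95
lb = 100 * (1.0 - ci) / 2.0  # 2.5, i.e. keep the central 95%
lower, upper = np.percentile(samples, [lb, 100.0 - lb], axis=0)
assert np.all(lower < upper)  # roughly (-1.96, 1.96) for N(0, 1) draws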
Example #7
File: model.py Project: stnkl/probflow
    def log_prob(
        self,
        x,
        y=None,
        individually=True,
        distribution=False,
        n=1000,
        batch_size=None,
    ):
        """Compute the log probability of `y` given the model

        TODO: Docs...


        Parameters
        ----------
        x : |ndarray| or |DataFrame| or |Series| or Tensor
            Independent variable values of the dataset to evaluate (aka the
            "features").
        y : |ndarray| or |DataFrame| or |Series| or Tensor
            Dependent variable values of the dataset to evaluate (aka the
            "target").
        individually : bool
            If ``individually`` is True, returns log probability for each
            sample individually, so return shape is ``(x.shape[0], ?)``.
            If ``individually`` is False, returns sum of all log probabilities,
            so return shape is ``(1, ?)``.
        distribution : bool
            If ``distribution`` is True, returns log probability posterior
            distribution (``n`` samples from the model),
            so return shape is ``(?, n)``.
            If ``distribution`` is False, returns log posterior probabilities
            using the maximum a posteriori estimate for each parameter,
            so the return shape is ``(?, 1)``.
        n : int
            Number of samples to draw for each distribution if
            ``distribution=True``.
        batch_size : None or int
            Compute using batches of this many datapoints.  Default is
            ``None`` (i.e., do not use batching).

        Returns
        -------
        log_probs : |ndarray|
            Log probabilities. Shape is determined by ``individually``,
            ``distribution``, and ``n`` kwargs.
        """

        # Get a distribution of samples
        if distribution:
            with Sampling(n=1, flipout=False):
                probs = []
                for i in range(n):
                    t_probs = []
                    for x_data, y_data in make_generator(
                        x, y, batch_size=batch_size
                    ):
                        if x_data is None:
                            t_probs += [self().log_prob(y_data)]
                        else:
                            t_probs += [self(x_data).log_prob(y_data)]
                    probs += [np.concatenate(to_numpy(t_probs), axis=0)]
            probs = np.stack(to_numpy(probs), axis=probs[0].ndim)

        # Use MAP estimates
        else:
            probs = []
            for x_data, y_data in make_generator(x, y, batch_size=batch_size):
                if x_data is None:
                    probs += [self().log_prob(y_data)]
                else:
                    probs += [self(x_data).log_prob(y_data)]
            probs = np.concatenate(to_numpy(probs), axis=0)

        # Return log prob of each sample or sum of log probs
        if individually:
            return probs
        else:
            return np.sum(probs, axis=0)
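
A usage sketch (``model``, ``x``, and ``y`` assumed from the tests elsewhere on this page); the return shapes follow the docstring above:

# Per-datapoint log probabilities at the MAP estimate: shape (N, ...)
lp = model.log_prob(x, y)

# A single summed log probability for the whole dataset
lp_sum = model.log_prob(x, y, individually=False)

# n posterior samples of the log prob per datapoint: shape (N, ..., n)
lp_dist = model.log_prob(x, y, distribution=True, n=100)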
Example #8
File: model.py Project: stnkl/probflow
    def metric(self, metric, x, y=None, batch_size=None):
        """Compute a metric of model performance

        TODO: docs

        TODO: note that this doesn't work w/ generative models


        Parameters
        ----------
        metric : str or callable
            Metric to evaluate.  Available metrics:

            * 'lp': log likelihood sum
            * 'log_prob': log likelihood sum
            * 'accuracy': accuracy
            * 'acc': accuracy
            * 'mean_squared_error': mean squared error
            * 'mse': mean squared error
            * 'sum_squared_error': sum squared error
            * 'sse': sum squared error
            * 'mean_absolute_error': mean absolute error
            * 'mae': mean absolute error
            * 'r_squared': coefficient of determination
            * 'r2': coefficient of determination
            * 'recall': true positive rate
            * 'sensitivity': true positive rate
            * 'true_positive_rate': true positive rate
            * 'tpr': true positive rate
            * 'specificity': true negative rate
            * 'selectivity': true negative rate
            * 'true_negative_rate': true negative rate
            * 'tnr': true negative rate
            * 'precision': precision
            * 'f1_score': F-measure
            * 'f1': F-measure
            * callable: a function which takes (y_true, y_pred)

        x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
            Independent variable values of the dataset to evaluate (aka the
            "features").  Or a |DataGenerator| to generate both x and y.
        y : |ndarray| or |DataFrame| or |Series| or Tensor
            Dependent variable values of the dataset to evaluate (aka the
            "target").
        batch_size : None or int
            Compute using batches of this many datapoints.  Default is
            ``None`` (i.e., do not use batching).

        Returns
        -------
        TODO
        """

        # Get true values and predictions
        y_true = []
        y_pred = []
        for x_data, y_data in make_generator(
            x, y, test=True, batch_size=batch_size
        ):
            y_true += [y_data]
            y_pred += [self(x_data).mean()]
        y_true = np.concatenate(to_numpy(y_true), axis=0)
        y_pred = np.concatenate(to_numpy(y_pred), axis=0)

        # Compute metric between true values and predictions
        metric_fn = get_metric_fn(metric)
        return metric_fn(y_true, y_pred)
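
A usage sketch under the same assumptions; per the docstring, ``metric`` can be a named string or any ``(y_true, y_pred)`` callable:

# Built-in metric by name
mse = model.metric("mse", x, y)

# Any callable taking (y_true, y_pred)
mae = model.metric(lambda t, p: np.abs(t - p).mean(), x, y)

# Evaluate in batches of 50 datapoints
mse = model.metric("mse", x, y, batch_size=50)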
Example #9
File: model.py Project: stnkl/probflow
    def fit(
        self,
        x,
        y=None,
        batch_size: int = 128,
        epochs: int = 200,
        shuffle: bool = False,
        optimizer=None,
        optimizer_kwargs: dict = {},
        lr: float = None,
        flipout: bool = True,
        num_workers: int = None,
        callbacks: List[BaseCallback] = [],
        eager: bool = False,
        n_mc: int = 1,
    ):
        r"""Fit the model to data

        TODO


        Parameters
        ----------
        x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
            Independent variable values (or, if fitting a generative model,
            the dependent variable values).  Should be of shape (Nsamples,...)
        y : |None| or |ndarray| or |DataFrame| or |Series|
            Dependent variable values (or, if fitting a generative model,
            ``None``). Should be of shape (Nsamples,...).  Default = ``None``
        batch_size : int
            Number of samples to use per minibatch.
            Default = ``128``
        epochs : int
            Number of epochs to train the model.
            Default = ``200``
        shuffle : bool
            Whether to shuffle the data each epoch.  Note that this is
            ignored if ``x`` is a |DataGenerator|.
            Default = ``False``
        optimizer : |None| or a backend-specific optimizer
            What optimizer to use for optimizing the variational posterior
            distributions' variables.  When the backend is |TensorFlow| the
            default is to use Adam (``tf.keras.optimizers.Adam``).  When the
            backend is |PyTorch| the default is likewise Adam
            (``torch.optim.Adam``).
        optimizer_kwargs : dict
            Keyword arguments to pass to the optimizer.
            Default is an empty dict.
        lr : float
            Learning rate for the optimizer.
            Note that the learning rate can be updated during training using
            the set_learning_rate method.
            Default is :math:`\exp (- \log_{10} (N_p N_b))`, where :math:`N_p`
            is the number of parameters in the model, and :math:`N_b` is the
            number of samples per batch (``batch_size``).
        flipout : bool
            Whether to use flipout during training where possible
            Default = True
        num_workers : None or int > 0
            Number of parallel processes to run for loading the data.  If
            ``None``, will not use parallel processes.  If an integer, will use
            a process pool with that many processes.  Note that this parameter
            is ignored if a |DataGenerator| is passed as ``x``.  Default = None
        callbacks : List[BaseCallback]
            List of callbacks to run while training the model.  Default is
            ``[]``, i.e. no callbacks.
        eager : bool
            Whether to use eager execution.  If False, will use ``tf.function``
            (for TensorFlow) or tracing (for PyTorch) to optimize the model
            fitting.  Note that even if eager=True, you can still use eager
            execution when using the model after it is fit.  Default = False
        n_mc : int
            Number of monte carlo samples to take from the variational
            posteriors per minibatch.  The default is to just take one per
            batch.  Using a smaller number of MC samples is faster, but using a
            greater number of MC samples will decrease the variance of the
            gradients, leading to more stable parameter optimization.


        Example
        -------

        See the user guide section on :doc:`/user_guide/fitting`.
        """

        # Determine a somewhat reasonable learning rate if none was passed
        if lr is not None:
            self._learning_rate = lr
        elif self._learning_rate is None:
            default_lr = np.exp(-np.log10(self.n_parameters * batch_size))
            self._learning_rate = default_lr

        # Create DataGenerator from input data if not already
        self._data = make_generator(
            x,
            y,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

        # Use default optimizer if none specified
        if optimizer is None and self._optimizer is None:
            if get_backend() == "pytorch":
                import torch

                self._optimizer = torch.optim.Adam(
                    self.trainable_variables,
                    lr=self._learning_rate,
                    **optimizer_kwargs
                )
            else:
                import tensorflow as tf

                self._optimizer = tf.keras.optimizers.Adam(
                    lambda: self._learning_rate, **optimizer_kwargs
                )

        # Use eager if input type is dataframe or series
        eager_types = (pd.DataFrame, pd.Series)
        if any(isinstance(e, eager_types) for e in self._data.get_batch(0)):
            eager = True

        # Create a function to perform one training step
        if get_backend() == "pytorch":
            self._train_fn = self._train_step_pytorch(
                self._data.n_samples, flipout, eager=eager, n_mc=n_mc
            )
        else:
            self._train_fn = self._train_step_tensorflow(
                self._data.n_samples, flipout, eager=eager, n_mc=n_mc
            )

        # Assign model param to callbacks
        for c in callbacks:
            c.model = self

        # Run callbacks at start of training
        self._is_training = True
        for c in callbacks:
            c.on_train_start()

        # Fit the model!
        for i in range(int(epochs)):

            # Stop training early?
            if not self._is_training:
                break

            # Run callbacks at start of epoch
            self._current_elbo = 0.0
            self._data.on_epoch_start()
            for c in callbacks:
                c.on_epoch_start()

            # Update gradients for each batch
            for x_data, y_data in self._data:
                self.train_step(x_data, y_data)

            # Run callbacks at end of epoch
            self._data.on_epoch_end()
            for c in callbacks:
                c.on_epoch_end()

        # Run callbacks at end of training
        self._is_training = False
        for c in callbacks:
            c.on_train_end()
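
A usage sketch (``MyModel``, ``x``, and ``y`` as in the tests below).  Note the default learning rate: a model with, say, 7 parameters and ``batch_size=128`` would get :math:`\exp(-\log_{10}(7 \cdot 128)) \approx 0.052`.  The ``MonitorMetric`` callback name and the ``x_val``/``y_val`` arrays here are assumptions, mirroring the constructor shown in Examples #4 and #5:

model = MyModel()

# Explicit learning rate and batch size
model.fit(x, y, batch_size=50, epochs=100, lr=0.01)

# Default lr, with a metric-monitoring callback on held-out data
model.fit(x, y, callbacks=[MonitorMetric("mse", x_val, y_val)])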
Example #10
def test_ContinuousModel(plot):
    """Tests probflow.models.ContinuousModel"""
    class MyModel(ContinuousModel):
        def __init__(self):
            self.weight = Parameter([5, 1], name='Weight')
            self.bias = Parameter([1, 1], name='Bias')
            self.std = ScaleParameter([1, 1], name='Std')

        def __call__(self, x):
            return Normal(x @ self.weight() + self.bias(), self.std())

    # Instantiate the model
    model = MyModel()

    # Data
    x = np.random.randn(100, 5).astype('float32')
    w = np.random.randn(5, 1).astype('float32')
    y = x @ w + 1

    # Fit the model
    model.fit(x, y, batch_size=50, epochs=100, lr=0.01)

    # predictive intervals
    lb, ub = model.predictive_interval(x[:22, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 22
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 22
    assert ub.shape[1] == 1

    # predictive intervals lower ci
    llb = model.predictive_interval(x[:22, :], side='lower')
    assert isinstance(llb, np.ndarray)
    assert llb.ndim == 2
    assert llb.shape[0] == 22
    assert llb.shape[1] == 1
    assert np.all(llb <= ub)

    # predictive intervals upper ci
    uub = model.predictive_interval(x[:22, :], side='upper')
    assert isinstance(uub, np.ndarray)
    assert uub.ndim == 2
    assert uub.shape[0] == 22
    assert uub.shape[1] == 1
    assert np.all(uub >= lb)
    assert np.all(uub >= llb)

    # aleatoric intervals
    lb, ub = model.aleatoric_interval(x[:23, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 23
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 23
    assert ub.shape[1] == 1

    # epistemic intervals
    lb, ub = model.epistemic_interval(x[:24, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 24
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 24
    assert ub.shape[1] == 1

    # posterior predictive plot with one sample
    model.pred_dist_plot(x[:1, :])
    if plot:
        plt.title('Should be one dist on one subfig')
        plt.show()

    # posterior predictive plot with one sample, showing ci
    model.pred_dist_plot(x[:1, :], ci=0.95, style='hist')
    if plot:
        plt.title('Should be one dist on one subfig, w/ ci=0.95')
        plt.show()

    # posterior predictive plot with two samples
    model.pred_dist_plot(x[:2, :])
    if plot:
        plt.title('Should be two dists on one subfig')
        plt.show()

    # posterior predictive plot with two samples, two subfigs
    model.pred_dist_plot(x[:2, :], individually=True)
    if plot:
        plt.title('Should be two dists on two subfigs')
        plt.show()

    # posterior predictive plot with six samples, 6 subfigs, 2 cols
    model.pred_dist_plot(x[:6, :], individually=True, cols=2)
    if plot:
        plt.title('Should be 6 dists, 6 subfigs, 2 cols')
        plt.show()

    # predictive prc
    prcs = model.predictive_prc(x[:7, :], y[:7, :])
    assert isinstance(prcs, np.ndarray)
    assert prcs.ndim == 2
    assert prcs.shape[0] == 7
    assert prcs.shape[1] == 1

    with pytest.raises(TypeError):
        prcs = model.predictive_prc(x[:7, :], None)

    # predictive distribution covered for each sample
    cov = model.pred_dist_covered(x[:11, :], y[:11, :])
    assert isinstance(cov, np.ndarray)
    assert cov.ndim == 2
    assert cov.shape[0] == 11
    assert cov.shape[1] == 1

    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, n=-1)
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, ci=-0.1)
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, ci=1.1)

    # predictive distribution covered for each sample
    cov = model.pred_dist_coverage(x[:11, :], y[:11, :])
    assert isinstance(cov, (float, np.floating))

    # plot coverage by
    xo, co = model.coverage_by(x[:, :1], x, y)
    assert isinstance(xo, np.ndarray)
    assert isinstance(co, np.ndarray)
    if plot:
        plt.title('should be coverage by plot')
        plt.show()

    # r squared
    r2 = model.r_squared(x, y, n=21)
    assert isinstance(r2, np.ndarray)
    assert r2.ndim == 2
    assert r2.shape[0] == 21
    assert r2.shape[1] == 1

    # r squared with an ArrayDataGenerator
    dg = make_generator(x, y)
    r2 = model.r_squared(dg, n=22)
    assert isinstance(r2, np.ndarray)
    assert r2.ndim == 2
    assert r2.shape[0] == 22
    assert r2.shape[1] == 1

    # plot the r2 dist
    model.r_squared_plot(x, y, style='hist')
    if plot:
        plt.title('should be r2 dist')
        plt.show()

    # residuals
    res = model.residuals(x, y)
    assert isinstance(res, np.ndarray)
    assert res.ndim == 2
    assert res.shape[0] == 100
    assert res.shape[1] == 1

    # plot the distribution of residuals
    model.residuals_plot(x, y)
    if plot:
        plt.title('should be residuals dist')
        plt.show()
Example #11
def test_ContinuousModel(plot):
    """Tests probflow.models.ContinuousModel"""
    class MyModel(ContinuousModel):
        def __init__(self):
            self.weight = Parameter([5, 1], name="Weight")
            self.bias = Parameter([1, 1], name="Bias")
            self.std = ScaleParameter([1, 1], name="Std")

        def __call__(self, x):
            return Normal(x @ self.weight() + self.bias(), self.std())

    # Instantiate the model
    model = MyModel()

    # Data
    x = np.random.randn(100, 5).astype("float32")
    w = np.random.randn(5, 1).astype("float32")
    y = x @ w + 1

    # Fit the model
    model.fit(x, y, batch_size=50, epochs=100, lr=0.01)

    # predictive intervals
    lb, ub = model.predictive_interval(x[:22, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 22
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 22
    assert ub.shape[1] == 1

    # predictive intervals lower ci
    llb = model.predictive_interval(x[:22, :], side="lower")
    assert isinstance(llb, np.ndarray)
    assert llb.ndim == 2
    assert llb.shape[0] == 22
    assert llb.shape[1] == 1
    assert np.all(llb <= ub)

    # predictive intervals upper ci
    uub = model.predictive_interval(x[:22, :], side="upper")
    assert isinstance(uub, np.ndarray)
    assert uub.ndim == 2
    assert uub.shape[0] == 22
    assert uub.shape[1] == 1
    assert np.all(uub >= lb)
    assert np.all(uub >= llb)

    # predictive intervals with batching
    lb, ub = model.predictive_interval(x[:21, :], batch_size=7)
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 21
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 21
    assert ub.shape[1] == 1

    # aleatoric intervals
    lb, ub = model.aleatoric_interval(x[:23, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 23
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 23
    assert ub.shape[1] == 1

    # epistemic intervals
    lb, ub = model.epistemic_interval(x[:24, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 24
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 24
    assert ub.shape[1] == 1

    # posterior predictive plot with one sample
    model.pred_dist_plot(x[:1, :])
    if plot:
        plt.title("Should be one dist on one subfig")
        plt.show()

    # posterior predictive plot with one sample, showing ci
    model.pred_dist_plot(x[:1, :], ci=0.95, style="hist")
    if plot:
        plt.title("Should be one dist on one subfig, w/ ci=0.95")
        plt.show()

    # posterior predictive plot with two samples
    model.pred_dist_plot(x[:2, :])
    if plot:
        plt.title("Should be two dists on one subfig")
        plt.show()

    # posterior predictive plot with two samples, two subfigs
    model.pred_dist_plot(x[:2, :], individually=True)
    if plot:
        plt.title("Should be two dists on two subfigs")
        plt.show()

    # posterior predictive plot with six samples, 6 subfigs, 2 cols
    model.pred_dist_plot(x[:6, :], individually=True, cols=2)
    if plot:
        plt.title("Should be 6 dists, 6 subfigs, 2 cols")
        plt.show()

    # predictive prc
    prcs = model.predictive_prc(x[:7, :], y[:7, :])
    assert isinstance(prcs, np.ndarray)
    assert prcs.ndim == 2
    assert prcs.shape[0] == 7
    assert prcs.shape[1] == 1

    with pytest.raises(TypeError):
        prcs = model.predictive_prc(x[:7, :], None)

    # predictive distribution covered for each sample
    cov = model.pred_dist_covered(x[:11, :], y[:11, :])
    assert isinstance(cov, np.ndarray)
    assert cov.ndim == 2
    assert cov.shape[0] == 11
    assert cov.shape[1] == 1

    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, n=-1)
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, ci=-0.1)
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, ci=1.1)

    # predictive distribution covered for each sample
    cov = model.pred_dist_coverage(x[:11, :], y[:11, :])
    assert isinstance(cov, (float, np.floating))

    # plot coverage by
    xo, co = model.coverage_by(x[:, :1], x, y)
    assert isinstance(xo, np.ndarray)
    assert isinstance(co, np.ndarray)
    if plot:
        plt.title("should be coverage by plot")
        plt.show()

    # r squared
    r2 = model.r_squared(x, y, n=21)
    assert isinstance(r2, np.ndarray)
    assert r2.ndim == 2
    assert r2.shape[0] == 21
    assert r2.shape[1] == 1

    # r squared with an ArrayDataGenerator
    dg = make_generator(x, y)
    r2 = model.r_squared(dg, n=22)
    assert isinstance(r2, np.ndarray)
    assert r2.ndim == 2
    assert r2.shape[0] == 22
    assert r2.shape[1] == 1

    # plot the r2 dist
    model.r_squared_plot(x, y, style="hist")
    if plot:
        plt.title("should be r2 dist")
        plt.show()

    # residuals
    res = model.residuals(x, y)
    assert isinstance(res, np.ndarray)
    assert res.ndim == 2
    assert res.shape[0] == 100
    assert res.shape[1] == 1

    # plot the distribution of residuals
    model.residuals_plot(x, y)
    if plot:
        plt.title("should be residuals dist")
        plt.show()

    # calibration curve
    p, p_hat = model.calibration_curve(x[:90, :], y[:90, :], resolution=11)
    assert isinstance(p, np.ndarray)
    assert isinstance(p_hat, np.ndarray)
    assert p.ndim == 1
    assert p.shape[0] == 11
    assert p_hat.ndim == 1
    assert p_hat.shape[0] == 11
    assert np.all(p >= 0)
    assert np.all(p <= 1)
    assert np.all(p_hat >= 0)
    assert np.all(p_hat <= 1)

    # calibration curve (with batching)
    p, p_hat = model.calibration_curve(x[:90, :],
                                       y[:90, :],
                                       resolution=11,
                                       batch_size=30)
    assert isinstance(p, np.ndarray)
    assert isinstance(p_hat, np.ndarray)
    assert p.ndim == 1
    assert p.shape[0] == 11
    assert p_hat.ndim == 1
    assert p_hat.shape[0] == 11
    assert np.all(p >= 0)
    assert np.all(p <= 1)
    assert np.all(p_hat >= 0)
    assert np.all(p_hat <= 1)

    # calibration curve
    model.calibration_curve_plot(x, y, resolution=11)
    if plot:
        plt.title("should be calibration curve")
        plt.show()

    # calibration curve (with batching)
    model.calibration_curve_plot(x, y, resolution=11, batch_size=25)
    if plot:
        plt.title("should be calibration curve (with batching)")
        plt.show()

    # calibration metrics: msce
    msce = model.calibration_metric("msce",
                                    x[:90, :],
                                    y[:90, :],
                                    resolution=11)
    assert isinstance(msce, float)
    assert msce >= 0
    assert msce <= 1

    # calibration metrics: rmsce
    rmsce = model.calibration_metric("rmsce",
                                     x[:90, :],
                                     y[:90, :],
                                     resolution=11)
    assert isinstance(rmsce, float)
    assert rmsce >= 0
    assert rmsce <= 1

    # calibration metrics: mace
    mace = model.calibration_metric("mace",
                                    x[:90, :],
                                    y[:90, :],
                                    resolution=11)
    assert isinstance(mace, float)
    assert mace >= 0
    assert mace <= 1

    # calibration metrics: ma
    ma = model.calibration_metric("ma", x[:90, :], y[:90, :], resolution=11)
    assert isinstance(ma, float)
    assert ma >= 0
    assert ma <= 1

    # should raise value error on invalid metric name
    with pytest.raises(ValueError):
        ma = model.calibration_metric("lala",
                                      x[:90, :],
                                      y[:90, :],
                                      resolution=11)

    # calibration metrics: list of em
    metrics = model.calibration_metric(["mace", "ma"],
                                       x[:90, :],
                                       y[:90, :],
                                       resolution=11)
    assert isinstance(metrics, dict)
    assert len(metrics) == 2
    assert "mace" in metrics
    assert "ma" in metrics
    assert metrics["mace"] >= 0
    assert metrics["mace"] <= 1
    assert metrics["ma"] >= 0
    assert metrics["ma"] <= 1

    # calibration metric with batching
    msce = model.calibration_metric("msce",
                                    x[:90, :],
                                    y[:90, :],
                                    resolution=11,
                                    batch_size=30)
    assert isinstance(msce, (float, np.floating))
    assert msce >= 0
    assert msce <= 1

    # sharpness
    sha = model.sharpness(x[:90, :])
    assert isinstance(sha, (float, np.floating))
    assert sha >= 0

    # sharpness w/ batching
    sha = model.sharpness(x[:90, :], batch_size=30)
    assert isinstance(sha, (float, np.floating))
    assert sha >= 0

    # dispersion metric: cv
    dm = model.dispersion_metric("cv", x[:90, :])
    assert isinstance(dm, (float, np.floating))
    assert dm >= 0

    # dispersion metric: qcd
    dm = model.dispersion_metric("qcd", x[:90, :])
    assert isinstance(dm, (float, np.floating))
    assert dm >= 0

    # dispersion metric w/ batching
    dm = model.dispersion_metric("cv", x[:90, :], batch_size=30)
    assert isinstance(dm, (float, np.floating))
    assert dm >= 0

    # should raise value error on invalid metric name
    with pytest.raises(ValueError):
        dm = model.dispersion_metric("lala", x[:90, :])

    # dispersion metrics: list of em
    metrics = model.dispersion_metric(["cv", "qcd"], x[:90, :])
    assert isinstance(metrics, dict)
    assert len(metrics) == 2
    assert "cv" in metrics
    assert "qcd" in metrics
    assert metrics["cv"] >= 0
    assert metrics["qcd"] >= 0
Example #12
    def predictive_prc(self, x, y=None, n=1000, batch_size=None):
        r"""Compute the percentile of each observation along the posterior
        predictive distribution.

        TODO: Docs...  Returns a percentile between 0 and 1

        Parameters
        ----------
        x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
            Independent variable values of the dataset to evaluate (aka the
            "features").  Or a |DataGenerator| for both x and y.
        y : |ndarray| or |DataFrame| or |Series| or Tensor
            Dependent variable values of the dataset to evaluate (aka the
            "target").
        n : int
            Number of samples to draw from the model given ``x``.
            Default = 1000
        batch_size : None or int
            Compute using batches of this many datapoints.  Default is
            ``None`` (i.e., do not use batching).

        Returns
        -------
        prcs : |ndarray| of float between 0 and 1
        """

        # Need both x and y data
        if y is None and not isinstance(x, DataGenerator):
            raise TypeError("need both x and y to compute predictive prc")

        # Compute in batches?
        if batch_size is not None:
            return np.concatenate(
                [
                    self.predictive_prc(x_data, y_data, n=n)
                    for x_data, y_data in make_generator(
                        x, y, batch_size=batch_size
                    )
                ],
                axis=0,
            )

        # Sample from the predictive distribution
        samples = self.predictive_sample(x, n=n, batch_size=batch_size)

        # Dependent variable must be scalar
        if samples.ndim > 2 and any(e > 1 for e in samples.shape[2:]):
            raise NotImplementedError(
                "only scalar dependent variables are supported")

        # Reshape
        Ns = samples.shape[0]
        N = samples.shape[1]
        samples = samples.reshape([Ns, N])
        y = self._get_y(x, y).reshape([1, N])

        # Percentiles of true y data along predictive distribution
        prcs = np.argmax(np.sort(samples, axis=0) > y, axis=0) / Ns

        # Argmax returns 0 when all samples are less than true value!
        prcs[np.reshape(np.max(samples, axis=0) < y, [N])] = 1.0

        # Return percentiles
        return prcs.reshape([N, 1])
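
A usage sketch (``model``, ``x``, and ``y`` as in the tests above); for a well-calibrated model the percentiles should be roughly uniform on [0, 1]:

prcs = model.predictive_prc(x, y, n=1000)
assert prcs.shape == (x.shape[0], 1)
assert np.all((prcs >= 0) & (prcs <= 1))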