def _get_y(self, x, y):
    """Get y, even when x is a DataGenerator and y is None"""
    if y is not None:
        return y
    else:
        y_true = [d for _, d in make_generator(x, y, test=True)]
        return np.concatenate(to_numpy(y_true), axis=0)
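
# Minimal sketch of the y-is-None branch above: walk the generator and
# stitch the per-batch targets back into one array. This assumes (as the
# tests below confirm) that make_generator yields (x_batch, y_batch) pairs.
x_sketch = np.random.randn(100, 3)
y_sketch = np.random.randn(100, 1)
y_batches = [d for _, d in make_generator(x_sketch, y_sketch, test=True)]
assert np.concatenate(y_batches, axis=0).shape == (100, 1)
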
def test_make_generator():

    # Create some data
    x = np.random.randn(100, 3)
    w = np.random.randn(3, 1)
    b = np.random.randn()
    y = x @ w + b

    # Should return an ArrayDataGenerator
    dg = make_generator(x, y)
    assert isinstance(dg, ArrayDataGenerator)

    # Should just return what was passed if given an ArrayDataGenerator
    dg = ArrayDataGenerator(x, y, batch_size=5)
    dg_out = make_generator(dg)
    assert isinstance(dg_out, ArrayDataGenerator)
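
# Usage sketch for the generator itself: iterating an ArrayDataGenerator
# yields (x_batch, y_batch) pairs batched along the first axis. The batch
# count below is an assumption based on batch_size=25 over 100 rows.
dg_sketch = ArrayDataGenerator(
    np.random.randn(100, 3), np.random.randn(100, 1), batch_size=25
)
assert sum(1 for _ in dg_sketch) == 4  # 100 rows / 25 per batch
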
def _sample(self, x, func, ed=None, axis=1):
    """Sample from the model"""
    samples = []
    for x_data, y_data in make_generator(x, test=True):
        if x_data is None:
            samples += [func(self())]
        else:
            samples += [func(self(O.expand_dims(x_data, ed)))]
    return np.concatenate(to_numpy(samples), axis=axis)
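
# Why axis=1 is the default above: per-batch draws typically have shape
# (n_samples, batch_size, ...), so concatenating along axis=1 stitches the
# batches back together while keeping the Monte Carlo axis first. The
# shapes here are illustrative assumptions, not taken from the source.
draws_a = np.random.randn(60, 32, 1)  # 60 draws for a batch of 32
draws_b = np.random.randn(60, 18, 1)  # 60 draws for a batch of 18
assert np.concatenate([draws_a, draws_b], axis=1).shape == (60, 50, 1)
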
def __init__(self, metric, x, y=None, verbose=False):

    # Store metric
    self.metric_fn = get_metric_fn(metric)
    if isinstance(metric, str):
        self.metric_name = metric
    else:
        self.metric_name = self.metric_fn.__name__

    # Store validation data
    self.data = make_generator(x, y)

    # Store metrics and epochs
    self.current_metric = np.nan
    self.current_epoch = 0
    self.metrics = []
    self.epochs = []
    self.verbose = verbose
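
# Hypothetical usage of this initializer, assuming it belongs to probflow's
# MonitorMetric callback (the enclosing class name is not shown in this
# snippet): track a validation metric at the end of each training epoch.
from probflow.callbacks import MonitorMetric  # assumed import location

x_val = np.random.randn(20, 5).astype("float32")
y_val = np.random.randn(20, 1).astype("float32")
monitor = MonitorMetric("mae", x_val, y_val)
# model.fit(x_train, y_train, callbacks=[monitor])
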
def _intervals(self, fn, x, side, ci=0.95, n=1000, batch_size=None):
    """Compute intervals on some type of sample"""

    # Compute in batches?
    if batch_size is not None:
        intervals = [
            self._intervals(fn, x_data, side, ci=ci, n=n)
            for x_data, y_data in make_generator(
                x, test=True, batch_size=batch_size
            )
        ]
        return (np.concatenate(e, axis=0) for e in zip(*intervals))

    # No batching (or this is a batch)
    samples = fn(x, n=n)
    if side == "lower":
        return np.percentile(samples, 100 * (1.0 - ci), axis=0)
    elif side == "upper":
        return np.percentile(samples, 100 * ci, axis=0)
    else:
        lb = 100 * (1.0 - ci) / 2.0
        prcs = np.percentile(samples, [lb, 100.0 - lb], axis=0)
        return prcs[0, ...], prcs[1, ...]
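
# The two-sided branch above, in isolation: a 95% interval trims 2.5% from
# each tail of the sample distribution. Pure-numpy check (shapes are
# illustrative):
interval_samples = np.random.randn(1000, 4)  # 1000 draws, 4 datapoints
lo_sketch, hi_sketch = np.percentile(interval_samples, [2.5, 97.5], axis=0)
assert np.all(lo_sketch < hi_sketch)
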
def log_prob(
    self,
    x,
    y=None,
    individually=True,
    distribution=False,
    n=1000,
    batch_size=None,
):
    """Compute the log probability of `y` given the model

    TODO: Docs...

    Parameters
    ----------
    x : |ndarray| or |DataFrame| or |Series| or Tensor
        Independent variable values of the dataset to evaluate (aka the
        "features").
    y : |ndarray| or |DataFrame| or |Series| or Tensor
        Dependent variable values of the dataset to evaluate (aka the
        "target").
    individually : bool
        If ``individually`` is True, returns log probability for each
        sample individually, so return shape is ``(x.shape[0], ?)``.
        If ``individually`` is False, returns sum of all log
        probabilities, so return shape is ``(1, ?)``.
    distribution : bool
        If ``distribution`` is True, returns log probability posterior
        distribution (``n`` samples from the model), so return shape is
        ``(?, n)``.  If ``distribution`` is False, returns log posterior
        probabilities using the maximum a posteriori estimate for each
        parameter, so the return shape is ``(?, 1)``.
    n : int
        Number of samples to draw for each distribution if
        ``distribution=True``.
    batch_size : None or int
        Compute using batches of this many datapoints.  Default is
        `None` (i.e., do not use batching).

    Returns
    -------
    log_probs : |ndarray|
        Log probabilities.  Shape is determined by ``individually``,
        ``distribution``, and ``n`` kwargs.
    """

    # Get a distribution of samples
    if distribution:
        with Sampling(n=1, flipout=False):
            probs = []
            for i in range(n):
                t_probs = []
                for x_data, y_data in make_generator(
                    x, y, batch_size=batch_size
                ):
                    if x_data is None:
                        t_probs += [self().log_prob(y_data)]
                    else:
                        t_probs += [self(x_data).log_prob(y_data)]
                probs += [np.concatenate(to_numpy(t_probs), axis=0)]
        probs = np.stack(to_numpy(probs), axis=probs[0].ndim)

    # Use MAP estimates
    else:
        probs = []
        for x_data, y_data in make_generator(x, y, batch_size=batch_size):
            if x_data is None:
                probs += [self().log_prob(y_data)]
            else:
                probs += [self(x_data).log_prob(y_data)]
        probs = np.concatenate(to_numpy(probs), axis=0)

    # Return log prob of each sample or sum of log probs
    if individually:
        return probs
    else:
        return np.sum(probs, axis=0)
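
# Why the distribution branch stacks along a new trailing axis: each of
# the n passes produces one (N, ...) array of log probs, and np.stack with
# axis=probs[0].ndim appends the draw axis last. Numpy sketch with
# illustrative shapes:
lp_draws = [np.random.randn(8, 1) for _ in range(5)]  # 5 posterior draws
assert np.stack(lp_draws, axis=lp_draws[0].ndim).shape == (8, 1, 5)
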
def metric(self, metric, x, y=None, batch_size=None):
    """Compute a metric of model performance

    TODO: docs

    TODO: note that this doesn't work w/ generative models

    Parameters
    ----------
    metric : str or callable
        Metric to evaluate.  Available metrics:

        * 'lp': log likelihood sum
        * 'log_prob': log likelihood sum
        * 'accuracy': accuracy
        * 'acc': accuracy
        * 'mean_squared_error': mean squared error
        * 'mse': mean squared error
        * 'sum_squared_error': sum squared error
        * 'sse': sum squared error
        * 'mean_absolute_error': mean absolute error
        * 'mae': mean absolute error
        * 'r_squared': coefficient of determination
        * 'r2': coefficient of determination
        * 'recall': true positive rate
        * 'sensitivity': true positive rate
        * 'true_positive_rate': true positive rate
        * 'tpr': true positive rate
        * 'specificity': true negative rate
        * 'selectivity': true negative rate
        * 'true_negative_rate': true negative rate
        * 'tnr': true negative rate
        * 'precision': precision
        * 'f1_score': F-measure
        * 'f1': F-measure
        * callable: a function which takes (y_true, y_pred)
    x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
        Independent variable values of the dataset to evaluate (aka the
        "features").  Or a |DataGenerator| to generate both x and y.
    y : |ndarray| or |DataFrame| or |Series| or Tensor
        Dependent variable values of the dataset to evaluate (aka the
        "target").
    batch_size : None or int
        Compute using batches of this many datapoints.  Default is
        `None` (i.e., do not use batching).

    Returns
    -------
    TODO
    """

    # Get true values and predictions
    y_true = []
    y_pred = []
    for x_data, y_data in make_generator(
        x, y, test=True, batch_size=batch_size
    ):
        y_true += [y_data]
        y_pred += [self(x_data).mean()]
    y_true = np.concatenate(to_numpy(y_true), axis=0)
    y_pred = np.concatenate(to_numpy(y_pred), axis=0)

    # Compute metric between true values and predictions
    metric_fn = get_metric_fn(metric)
    return metric_fn(y_true, y_pred)
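
# The metric kwarg also accepts any callable taking (y_true, y_pred), so a
# custom function can stand in for the built-ins. A hand-rolled MSE, for
# instance, should match the built-in "mse" on the same data (the `model`
# in the commented call is a hypothetical fitted regression model):
def my_mse(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))

# model.metric(my_mse, x, y)  # equivalent to model.metric("mse", x, y)
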
def fit(
    self,
    x,
    y=None,
    batch_size: int = 128,
    epochs: int = 200,
    shuffle: bool = False,
    optimizer=None,
    optimizer_kwargs: dict = {},
    lr: float = None,
    flipout: bool = True,
    num_workers: int = None,
    callbacks: List[BaseCallback] = [],
    eager: bool = False,
    n_mc: int = 1,
):
    r"""Fit the model to data

    TODO

    Parameters
    ----------
    x : |ndarray| or |DataFrame| or |Series| or |DataGenerator|
        Independent variable values (or, if fitting a generative model,
        the dependent variable values).  Should be of shape
        (Nsamples, ...)
    y : |None| or |ndarray| or |DataFrame| or |Series|
        Dependent variable values (or, if fitting a generative model,
        ``None``).  Should be of shape (Nsamples, ...).
        Default = ``None``
    batch_size : int
        Number of samples to use per minibatch.
        Default = ``128``
    epochs : int
        Number of epochs to train the model.
        Default = ``200``
    shuffle : bool
        Whether to shuffle the data each epoch.  Note that this is
        ignored if ``x`` is a |DataGenerator|.
        Default = ``False``
    optimizer : |None| or a backend-specific optimizer
        What optimizer to use for optimizing the variational posterior
        distributions' variables.  When the backend is |TensorFlow|,
        the default is to use Adam (``tf.keras.optimizers.Adam``).
        When the backend is |PyTorch|, the default is to use Adam
        (``torch.optim.Adam``).
    optimizer_kwargs : dict
        Keyword arguments to pass to the optimizer.
        Default is an empty dict.
    lr : float
        Learning rate for the optimizer.  Note that the learning rate
        can be updated during training using the set_learning_rate
        method.
        Default is :math:`\exp(-\log_{10}(N_p N_b))`, where :math:`N_p`
        is the number of parameters in the model, and :math:`N_b` is
        the number of samples per batch (``batch_size``).
    flipout : bool
        Whether to use flipout during training where possible.
        Default = ``True``
    num_workers : None or int > 0
        Number of parallel processes to run for loading the data.  If
        ``None``, will not use parallel processes.  If an integer, will
        use a process pool with that many processes.  Note that this
        parameter is ignored if a |DataGenerator| is passed as ``x``.
        Default = ``None``
    callbacks : List[BaseCallback]
        List of callbacks to run while training the model.
        Default is ``[]``, i.e. no callbacks.
    eager : bool
        Whether to use eager execution.  If ``False``, will use
        ``tf.function`` (for TensorFlow) or tracing (for PyTorch) to
        optimize the model fitting.  Note that even if ``eager`` is
        ``False``, you can still use eager execution when using the
        model after it is fit.
        Default = ``False``
    n_mc : int
        Number of monte carlo samples to take from the variational
        posteriors per minibatch.  The default is to just take one per
        batch.  Using a smaller number of MC samples is faster, but
        using a greater number of MC samples will decrease the variance
        of the gradients, leading to more stable parameter
        optimization.

    Example
    -------

    See the user guide section on :doc:`/user_guide/fitting`.
""" # Determine a somewhat reasonable learning rate if none was passed if lr is not None: self._learning_rate = lr elif self._learning_rate is None: default_lr = np.exp(-np.log10(self.n_parameters * batch_size)) self._learning_rate = default_lr # Create DataGenerator from input data if not already self._data = make_generator( x, y, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, ) # Use default optimizer if none specified if optimizer is None and self._optimizer is None: if get_backend() == "pytorch": import torch self._optimizer = torch.optim.Adam( self.trainable_variables, lr=self._learning_rate, **optimizer_kwargs ) else: import tensorflow as tf self._optimizer = tf.keras.optimizers.Adam( lambda: self._learning_rate, **optimizer_kwargs ) # Use eager if input type is dataframe or series eager_types = (pd.DataFrame, pd.Series) if any(isinstance(e, eager_types) for e in self._data.get_batch(0)): eager = True # Create a function to perform one training step if get_backend() == "pytorch": self._train_fn = self._train_step_pytorch( self._data.n_samples, flipout, eager=eager, n_mc=n_mc ) else: self._train_fn = self._train_step_tensorflow( self._data.n_samples, flipout, eager=eager, n_mc=n_mc ) # Assign model param to callbacks for c in callbacks: c.model = self # Run callbacks at start of training self._is_training = True for c in callbacks: c.on_train_start() # Fit the model! for i in range(int(epochs)): # Stop training early? if not self._is_training: break # Run callbacks at start of epoch self._current_elbo = 0.0 self._data.on_epoch_start() for c in callbacks: c.on_epoch_start() # Update gradients for each batch for x_data, y_data in self._data: self.train_step(x_data, y_data) # Run callbacks at end of epoch self._data.on_epoch_end() for c in callbacks: c.on_epoch_end() # Run callbacks at end of training self._is_training = False for c in callbacks: c.on_train_end()
def test_ContinuousModel(plot):
    """Tests probflow.models.ContinuousModel"""

    class MyModel(ContinuousModel):
        def __init__(self):
            self.weight = Parameter([5, 1], name="Weight")
            self.bias = Parameter([1, 1], name="Bias")
            self.std = ScaleParameter([1, 1], name="Std")

        def __call__(self, x):
            return Normal(x @ self.weight() + self.bias(), self.std())

    # Instantiate the model
    model = MyModel()

    # Data
    x = np.random.randn(100, 5).astype("float32")
    w = np.random.randn(5, 1).astype("float32")
    y = x @ w + 1

    # Fit the model
    model.fit(x, y, batch_size=50, epochs=100, lr=0.01)

    # predictive intervals
    lb, ub = model.predictive_interval(x[:22, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 22
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 22
    assert ub.shape[1] == 1

    # predictive intervals lower ci
    llb = model.predictive_interval(x[:22, :], side="lower")
    assert isinstance(llb, np.ndarray)
    assert llb.ndim == 2
    assert llb.shape[0] == 22
    assert llb.shape[1] == 1
    assert np.all(llb <= ub)

    # predictive intervals upper ci
    uub = model.predictive_interval(x[:22, :], side="upper")
    assert isinstance(uub, np.ndarray)
    assert uub.ndim == 2
    assert uub.shape[0] == 22
    assert uub.shape[1] == 1
    assert np.all(uub >= lb)
    assert np.all(uub >= llb)

    # predictive intervals with batching
    lb, ub = model.predictive_interval(x[:21, :], batch_size=7)
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 21
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 21
    assert ub.shape[1] == 1

    # aleatoric intervals
    lb, ub = model.aleatoric_interval(x[:23, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 23
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 23
    assert ub.shape[1] == 1

    # epistemic intervals
    lb, ub = model.epistemic_interval(x[:24, :])
    assert isinstance(lb, np.ndarray)
    assert isinstance(ub, np.ndarray)
    assert lb.ndim == 2
    assert lb.shape[0] == 24
    assert lb.shape[1] == 1
    assert ub.ndim == 2
    assert ub.shape[0] == 24
    assert ub.shape[1] == 1

    # posterior predictive plot with one sample
    model.pred_dist_plot(x[:1, :])
    if plot:
        plt.title("Should be one dist on one subfig")
        plt.show()

    # posterior predictive plot with one sample, showing ci
    model.pred_dist_plot(x[:1, :], ci=0.95, style="hist")
    if plot:
        plt.title("Should be one dist on one subfig, w/ ci=0.95")
        plt.show()

    # posterior predictive plot with two samples
    model.pred_dist_plot(x[:2, :])
    if plot:
        plt.title("Should be two dists on one subfig")
        plt.show()

    # posterior predictive plot with two samples, two subfigs
    model.pred_dist_plot(x[:2, :], individually=True)
    if plot:
        plt.title("Should be two dists on two subfigs")
        plt.show()

    # posterior predictive plot with six samples, 6 subfigs, 2 cols
    model.pred_dist_plot(x[:6, :], individually=True, cols=2)
    if plot:
        plt.title("Should be 6 dists, 6 subfigs, 2 cols")
        plt.show()

    # predictive prc
    prcs = model.predictive_prc(x[:7, :], y[:7, :])
    assert isinstance(prcs, np.ndarray)
    assert prcs.ndim == 2
    assert prcs.shape[0] == 7
    assert prcs.shape[1] == 1
    with pytest.raises(TypeError):
        prcs = model.predictive_prc(x[:7, :], None)

    # predictive distribution covered for each sample
    cov = model.pred_dist_covered(x[:11, :], y[:11, :])
    assert isinstance(cov, np.ndarray)
    assert cov.ndim == 2
    assert cov.shape[0] == 11
    assert cov.shape[1] == 1
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, n=-1)
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, ci=-0.1)
    with pytest.raises(ValueError):
        cov = model.pred_dist_covered(x, y, ci=1.1)

    # proportion of the predictive distribution covered
    cov = model.pred_dist_coverage(x[:11, :], y[:11, :])
    assert isinstance(cov, (float, np.floating))

    # plot coverage by
    xo, co = model.coverage_by(x[:, :1], x, y)
    assert isinstance(xo, np.ndarray)
    assert isinstance(co, np.ndarray)
    if plot:
        plt.title("should be coverage by plot")
        plt.show()

    # r squared
    r2 = model.r_squared(x, y, n=21)
    assert isinstance(r2, np.ndarray)
    assert r2.ndim == 2
    assert r2.shape[0] == 21
    assert r2.shape[1] == 1

    # r squared with an ArrayDataGenerator
    dg = make_generator(x, y)
    r2 = model.r_squared(dg, n=22)
    assert isinstance(r2, np.ndarray)
    assert r2.ndim == 2
    assert r2.shape[0] == 22
    assert r2.shape[1] == 1

    # plot the r2 dist
    model.r_squared_plot(x, y, style="hist")
    if plot:
        plt.title("should be r2 dist")
        plt.show()

    # residuals
    res = model.residuals(x, y)
    assert isinstance(res, np.ndarray)
    assert res.ndim == 2
    assert res.shape[0] == 100
    assert res.shape[1] == 1

    # plot the distribution of residuals
    model.residuals_plot(x, y)
    if plot:
        plt.title("should be residuals dist")
        plt.show()

    # calibration curve
    p, p_hat = model.calibration_curve(x[:90, :], y[:90, :], resolution=11)
    assert isinstance(p, np.ndarray)
    assert isinstance(p_hat, np.ndarray)
    assert p.ndim == 1
    assert p.shape[0] == 11
    assert p_hat.ndim == 1
    assert p_hat.shape[0] == 11
    assert np.all(p >= 0)
    assert np.all(p <= 1)
    assert np.all(p_hat >= 0)
    assert np.all(p_hat <= 1)

    # calibration curve (with batching)
    p, p_hat = model.calibration_curve(
        x[:90, :], y[:90, :], resolution=11, batch_size=30
    )
    assert isinstance(p, np.ndarray)
    assert isinstance(p_hat, np.ndarray)
    assert p.ndim == 1
    assert p.shape[0] == 11
    assert p_hat.ndim == 1
    assert p_hat.shape[0] == 11
    assert np.all(p >= 0)
    assert np.all(p <= 1)
    assert np.all(p_hat >= 0)
    assert np.all(p_hat <= 1)

    # calibration curve plot
    model.calibration_curve_plot(x, y, resolution=11)
    if plot:
        plt.title("should be calibration curve")
        plt.show()

    # calibration curve plot (with batching)
    model.calibration_curve_plot(x, y, resolution=11, batch_size=25)
    if plot:
        plt.title("should be calibration curve (with batching)")
        plt.show()

    # calibration metrics: msce
    msce = model.calibration_metric(
        "msce", x[:90, :], y[:90, :], resolution=11
    )
    assert isinstance(msce, float)
    assert msce >= 0
    assert msce <= 1

    # calibration metrics: rmsce
    rmsce = model.calibration_metric(
        "rmsce", x[:90, :], y[:90, :], resolution=11
    )
    assert isinstance(rmsce, float)
    assert rmsce >= 0
    assert rmsce <= 1

    # calibration metrics: mace
    mace = model.calibration_metric(
        "mace", x[:90, :], y[:90, :], resolution=11
    )
    assert isinstance(mace, float)
    assert mace >= 0
    assert mace <= 1

    # calibration metrics: ma
    ma = model.calibration_metric("ma", x[:90, :], y[:90, :], resolution=11)
    assert isinstance(ma, float)
    assert ma >= 0
    assert ma <= 1

    # should raise value error on invalid metric name
    with pytest.raises(ValueError):
        ma = model.calibration_metric(
            "lala", x[:90, :], y[:90, :], resolution=11
        )

    # calibration metrics: list of them
    metrics = model.calibration_metric(
        ["mace", "ma"], x[:90, :], y[:90, :], resolution=11
    )
    assert isinstance(metrics, dict)
    assert len(metrics) == 2
    assert "mace" in metrics
    assert "ma" in metrics
    assert metrics["mace"] >= 0
    assert metrics["mace"] <= 1
    assert metrics["ma"] >= 0
    assert metrics["ma"] <= 1

    # calibration metric with batching
    msce = model.calibration_metric(
        "msce", x[:90, :], y[:90, :], resolution=11, batch_size=30
    )
    assert isinstance(msce, (float, np.floating))
    assert msce >= 0
    assert msce <= 1

    # sharpness
    sha = model.sharpness(x[:90, :])
    assert isinstance(sha, (float, np.floating))
    assert sha >= 0

    # sharpness w/ batching
    sha = model.sharpness(x[:90, :], batch_size=30)
    assert isinstance(sha, (float, np.floating))
    assert sha >= 0

    # dispersion metric: cv
    dm = model.dispersion_metric("cv", x[:90, :])
    assert isinstance(dm, (float, np.floating))
    assert dm >= 0

    # dispersion metric: qcd
    dm = model.dispersion_metric("qcd", x[:90, :])
    assert isinstance(dm, (float, np.floating))
    assert dm >= 0

    # dispersion metric w/ batching
    dm = model.dispersion_metric("cv", x[:90, :], batch_size=30)
    assert isinstance(dm, (float, np.floating))
    assert dm >= 0

    # should raise value error on invalid metric name
    with pytest.raises(ValueError):
        dm = model.dispersion_metric("lala", x[:90, :])

    # dispersion metrics: list of them
    metrics = model.dispersion_metric(["cv", "qcd"], x[:90, :])
    assert isinstance(metrics, dict)
    assert len(metrics) == 2
    assert "cv" in metrics
    assert "qcd" in metrics
    assert metrics["cv"] >= 0
    assert metrics["qcd"] >= 0
def predictive_prc(self, x, y=None, n=1000, batch_size=None):
    r"""Compute the percentile of each observation along the posterior
    predictive distribution.

    TODO: Docs...  Returns a percentile between 0 and 1

    Parameters
    ----------
    x : |ndarray| or |DataFrame| or |Series| or Tensor or |DataGenerator|
        Independent variable values of the dataset to evaluate (aka the
        "features").  Or a |DataGenerator| for both x and y.
    y : |ndarray| or |DataFrame| or |Series| or Tensor
        Dependent variable values of the dataset to evaluate (aka the
        "target").
    n : int
        Number of samples to draw from the model given ``x``.
        Default = 1000
    batch_size : None or int
        Compute using batches of this many datapoints.  Default is
        `None` (i.e., do not use batching).

    Returns
    -------
    prcs : |ndarray| of float between 0 and 1
    """

    # Need both x and y data
    if y is None and not isinstance(x, DataGenerator):
        raise TypeError("need both x and y to compute predictive prc")

    # Compute in batches?
    if batch_size is not None:
        return np.concatenate(
            [
                self.predictive_prc(x_data, y_data, n=n)
                for x_data, y_data in make_generator(
                    x, y, batch_size=batch_size
                )
            ],
            axis=0,
        )

    # Sample from the predictive distribution
    samples = self.predictive_sample(x, n=n, batch_size=batch_size)

    # Dependent variable must be scalar
    if samples.ndim > 2 and any(e > 1 for e in samples.shape[2:]):
        raise NotImplementedError(
            "only scalar dependent variables are supported"
        )

    # Reshape
    Ns = samples.shape[0]
    N = samples.shape[1]
    samples = samples.reshape([Ns, N])
    y = self._get_y(x, y).reshape([1, N])

    # Percentiles of true y data along predictive distribution
    prcs = np.argmax(np.sort(samples, axis=0) > y, axis=0) / Ns

    # Argmax returns 0 when all samples are less than the true value!
    prcs[np.reshape(np.max(samples, axis=0) < y, [N])] = 1.0

    # Return percentiles
    return prcs.reshape([N, 1])
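
# The percentile trick above, in isolation: sort the Ns draws per
# datapoint, find the first draw exceeding the observed y, and divide by
# Ns. Rows where every draw falls below y would argmax to 0, hence the
# explicit fix-up to 1.0. Minimal numpy check with three observations
# sitting below, inside, and above the sample distribution:
Ns_sketch = 1000
sorted_draws = np.sort(np.random.randn(Ns_sketch, 3), axis=0)
y_obs = np.array([[-10.0, 0.0, 10.0]])
prcs_sketch = np.argmax(sorted_draws > y_obs, axis=0) / Ns_sketch
prcs_sketch[(sorted_draws.max(axis=0) < y_obs).ravel()] = 1.0
# prcs_sketch is now approximately [0.0, 0.5, 1.0]
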