Example #1
import numpy as np
from scipy.special import gamma, kv
from sklearn.cluster import KMeans

MACHINE_EPSILON = np.finfo(np.double).eps

# l1_cross_differences is assumed to be provided by the surrounding skgp
# package: given X, it returns the componentwise differences D between all
# pairs of rows of X together with the corresponding index pairs ij (see its
# use in fit below).


class LocalLengthScalesCorrelation(object):
    """ Non-stationary correlation model based on local smoothness estimates.

    This non-stationary correlation model internally learns point estimates of
    local smoothness using a second-level Gaussian Process. For this, it
    selects a subset of the training data and learns length scales at these
    specific points. The length scales are generalized to other locations by
    the second-level Gaussian Process. Furthermore, global (isotropic or
    anisotropic) length scales are learned for both the top-level GP and the
    length-scale GP.

    The correlation model is based on the family of (stationary) Matern
    kernels. The parameter nu of the Matern kernels (governing the smoothness
    of the GP prior) can either be set or learned jointly with the remaining
    parameters.

    Parameters
    ----------
    isotropic : bool, default=True
        Whether the global length-scales of the top-level GP are isotropic or
        anisotropic

    nu : float, default=1.5
        The parameter nu of the Matern kernels (governing the smoothness
        of the GP prior). If None, nu is learned along with the other
        hyperparameters.

    l_isotropic : bool, default=True
        Whether the global length-scales of the length-scale GP are isotropic
        or anisotropic

    l_samples : int, default=10
        How many datapoints from the training data are selected as support
        points for learning the length-scale GP

    prior_b : float, default=inf
        The variance of the log-normal prior distribution on the length scales.
        If set to infinity, the distribution is assumed to be uniform.

    X_ : array_like, shape=(l_samples, n_features), optional
        The support points at which the length scales are learned. If None,
        they are selected from the training data when fit is called.

    .. seealso::

       "Nonstationary Gaussian Process Regression using Point Estimates of Local
       Smoothness", Christian Plagemann, Kristian Kersting, and Wolfram Burgard,
       ECML 2008
    """

    def __init__(self, isotropic=True, nu=1.5, l_isotropic=True, l_samples=10,
                 prior_b=np.inf, X_=None):
        self.isotropic = isotropic
        self.nu = nu
        self.l_isotropic = l_isotropic
        self.l_samples = l_samples
        self.prior_b = prior_b
        self.X_ = X_
        if self.X_ is not None:
            assert self.X_.shape[0] == self.l_samples

    def fit(self, X, nugget=10. * MACHINE_EPSILON):
        """ Fits the correlation model for training data X

        Parameters
        ----------
        X : array_like, shape=(n_samples, n_features)
            An array of training datapoints at which observations were made,
            i.e., where the outputs y are known
        nugget : double or ndarray, optional
            The Gaussian Process nugget parameter.
            The nugget is added to the diagonal of the assumed training
            covariance; in this way it acts as a Tikhonov regularization in
            the problem.  In the special case of the squared exponential
            correlation function, the nugget mathematically represents the
            variance of the input values. Default assumes a nugget close to
            machine precision for the sake of robustness
            (nugget = 10. * MACHINE_EPSILON).
        """
        self.X = X
        self.nugget = nugget
        self.n_samples = X.shape[0]
        self.n_dims = X.shape[1]

        # Determine how many entries in theta belong to the different
        # categories (used later for parsing theta)
        self.theta_gp_size = 1 if self.isotropic else self.n_dims
        self.theta_l_size = 1 if self.l_isotropic else self.n_dims
        self.nu_size = 1 if self.nu is None else 0
        self.theta_size = self.theta_gp_size + self.theta_l_size \
            + self.l_samples + self.nu_size

        # Compute the componentwise differences D between all pairs of
        # training points, together with the corresponding index pairs ij
        # (used later when evaluating the auto-correlation).
        self.D, self.ij = l1_cross_differences(self.X)

        if self.X_ is None:
            # Select a subset of X for which length scales are optimized.
            # Generalization of length scales to other datapoints is achieved
            # by means of a separate Gaussian Process (gp_l).
            if self.X.shape[0] >= self.l_samples:
                kmeans = KMeans(n_clusters=self.l_samples)
                self.X_ = kmeans.fit(self.X).cluster_centers_
            else:  # Fallback to select centers using sampling with replacement
                self.X_ = self.X[np.random.choice(np.arange(self.X.shape[0]),
                                                  self.l_samples)]

        return self

    def __call__(self, theta, X=None):
        """ Compute correlation for given correlation parameter(s) theta.

        Parameters
        ----------
        theta : array_like
            An array giving the autocorrelation parameter(s).

        X : array_like, shape=(n_eval, n_features)
            An array containing the n_eval query points whose correlation with
            the training datapoints shall be computed. If None, autocorrelation
            of the training datapoints is computed instead.

        Returns
        -------
        r : array_like, shape=(n_eval, n_samples) if X is not None,
                              (n_samples, n_samples) otherwise
            An array containing the values of the correlation model.
        """
        # Parse theta into its components
        theta_gp, theta_l, length_scales, nu = self._parse_theta(theta)

        # Train length-scale Gaussian Process
        from skgp.estimators import GaussianProcess
        self.gp_l = \
            GaussianProcess(corr="matern_1.5",
                            theta0=theta_l).fit(self.X_,
                                                np.log10(length_scales))
        l_train = 10**self.gp_l.predict(self.X)

        # Prepare distances and length-scale information for all pairs of
        # datapoints whose correlation shall be computed
        if X is not None:
            # Get pairwise componentwise L1-differences to the input training
            # set
            d = X[:, np.newaxis, :] - self.X[np.newaxis, :, :]
            d = d.reshape((-1, X.shape[1]))
            # Predict length scales for query datapoints
            l_query = 10**self.gp_l.predict(X)
            l = np.transpose([np.tile(l_train, len(l_query)),
                              np.repeat(l_query, len(l_train))])
        else:
            # No external datapoints given; auto-correlation of training set
            # is used instead
            d = self.D
            l = l_train[self.ij]

        # Compute the general (non-stationary) Matern kernel
        if d.ndim > 1 and theta_gp.size == d.shape[1]:
            activation = np.sum(theta_gp.reshape(1, d.shape[1]) * d ** 2,
                                axis=1)
        else:
            activation = theta_gp[0] * np.sum(d ** 2, axis=1)
        tmp = 0.5 * (l ** 2).sum(1)  # mean of the two squared length scales
        # Scaled distance passed to the Bessel function; floored to avoid
        # numerical problems for coincident points
        tmp2 = np.maximum(2 * np.sqrt(nu * activation / tmp), 1e-5)
        # Non-stationary Matern correlation (cf. Plagemann et al., 2008)
        r = np.sqrt(l[:, 0]) * np.sqrt(l[:, 1]) / (gamma(nu) * 2 ** (nu - 1))
        r /= np.sqrt(tmp)
        r *= tmp2 ** nu * kv(nu, tmp2)

        # Convert correlations to 2d matrix
        if X is not None:
            return r.reshape(-1, self.n_samples)
        else:  # exploit symmetry of auto-correlation
            R = np.eye(self.n_samples) * (1. + self.nugget)
            R[self.ij[:, 0], self.ij[:, 1]] = r
            R[self.ij[:, 1], self.ij[:, 0]] = r
            return R

    def log_prior(self, theta):
        """ Returns the (log) prior probability of parameters theta.

        The prior is assumed to be uniform over the parameter space except for
        the length-scales dimensions. These are assumed to be log-normal
        distributed with mean 0 and variance self.prior_b. If
        self.prior_b is np.inf, the log length-scales are assumed to be
        uniformly distributed as well.

        NOTE: The returned quantity is an improper prior as its integral over
              the parameter space is not equal to 1.

        Parameters
        ----------
        theta : array_like
            An array giving the autocorrelation parameter(s).

        Returns
        -------
        log_p : float
            The log prior probability of the parameters theta. Note that this
            is an improper prior that does not integrate to 1.
        """
        if self.prior_b == np.inf:
            return 0.0
        _, _, length_scales, _ = self._parse_theta(theta)

        squared_dist = (np.log10(length_scales)**2).sum()
        return -squared_dist / self.prior_b

    def _parse_theta(self, theta):
        """ Parse parameter vector theta into its components.

        Parameters
        ----------
        theta : array_like
            An array containing all hyperparameters.

        Returns
        -------
        theta_gp : array_like
            An array containing the hyperparameters of the main GP.
        theta_l : array_like
            An array containing the hyperparameters of the length-scale GP.
        length_scales : array_like
            An array containing the length-scales for the length-scale GP.
        nu : float
            The parameter nu controlling the smoothness of the Matern kernel.
        """
        theta = np.asarray(theta, dtype=float)

        assert (theta.size == self.theta_size), \
            "theta does not have the expected size (expected: %d, " \
            "actual size %d). Expected: %d entries for main GP, " \
            "%d entries for length-scale GP, %d entries containing the "\
            "length scales, and %d entries for nu." \
            % (self.theta_size, theta.size, self.theta_gp_size,
               self.theta_l_size, self.l_samples, self.nu_size)

        # Split theta in its components
        theta_gp = theta[:self.theta_gp_size]
        theta_l = \
            theta[self.theta_gp_size:][:self.theta_l_size]
        length_scales = \
            theta[self.theta_gp_size+self.theta_l_size:][:self.l_samples]
        nu = self.nu if self.nu is not None else theta[-1]

        return theta_gp, theta_l, length_scales, nu

    @classmethod
    def create(cls, dims, isotropic=True, theta0=1e-1,
               thetaL=None, thetaU=None,
               l_isotropic=True, theta_l_0=1e-1,
               theta_l_L=None, theta_l_U=None,
               l_samples=20, l_0=1.0, l_L=None, l_U=None,
               nu_0=1.5, nu_L=None, nu_U=None, prior_b=np.inf,
               *args, **kwargs):
        """ Factory method for creating non-stationary correlation models.

        .. note:: In addition to returning an instance of the correlation
                  model, the specification of the search space for the
                  hyperparameters theta of the Gaussian process is returned.
                  This includes the start point of the search (theta0) as well
                  as the lower and upper boundaries thetaL and thetaU for the
                  values of theta.
        """
        theta0 = [theta0] * (1 if isotropic else dims)
        thetaL = [thetaL] * (1 if isotropic else dims)
        thetaU = [thetaU] * (1 if isotropic else dims)

        theta0 += [theta_l_0] * (1 if l_isotropic else dims)
        thetaL += [theta_l_L] * (1 if l_isotropic else dims)
        thetaU += [theta_l_U] * (1 if l_isotropic else dims)

        theta0 += [l_0] * l_samples
        thetaL += [l_L] * l_samples
        thetaU += [l_U] * l_samples

        if nu_L is not None:
            theta0 += [nu_0]
            thetaL += [nu_L]
            thetaU += [nu_U]

        corr = cls(isotropic=isotropic,
                   nu=None if nu_L is not None else nu_0,
                   l_isotropic=l_isotropic, l_samples=l_samples,
                   prior_b=prior_b)

        return corr, theta0, thetaL, thetaU
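# ---------------------------------------------------------------------------
# A minimal usage sketch for the class above (not part of the original
# example). It builds the correlation model and the corresponding search
# space via the create() factory and plugs both into skgp's GaussianProcess,
# whose corr/theta0/thetaL/thetaU arguments are used in the same way in the
# examples below. The toy target g is made up purely for illustration.
from skgp.estimators import GaussianProcess


def g(X):
    return np.sin(3 * X[:, 0]) + 0.5 * X[:, 0]


X_demo = np.random.random((50, 1)) * 4 - 2
y_demo = g(X_demo)

corr, theta0, thetaL, thetaU = LocalLengthScalesCorrelation.create(
    dims=1, theta0=1e-1, thetaL=1e-3, thetaU=1e1,
    theta_l_0=1e-1, theta_l_L=1e-3, theta_l_U=1e1,
    l_samples=10, l_0=1.0, l_L=0.1, l_U=10.0)

gp = GaussianProcess(corr=corr, theta0=theta0, thetaL=thetaL, thetaU=thetaU)
gp.fit(X_demo, y_demo)
print("Learned hyperparameters:", gp.theta_)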
Example #2
#----------------------------------------------------------------------
# Actual test data
X = np.random.random(50)[:, None] * 4 - 2

# Observations
y = f(X).ravel()

# Mesh the input space for evaluations of the real function, the prediction and
# its MSE
x = np.atleast_2d(np.linspace(-2, 2, 1000)).T

# Instantiate one Gaussian Process model for the stationary Matern kernel and
# one for the non-stationary one
gp_stationary = \
    GaussianProcess(corr='matern_1.5', theta0=1e0, thetaL=1e-2, thetaU=1e+2,
                    random_start=100)
gp_non_stationary = \
    GaussianProcess(corr=NonStationaryCorrelation(),
                    theta0=1e0, thetaL=1e-2, thetaU=1e+2,
                    random_start=100)

# Fit to data using Maximum Likelihood Estimation of the parameters
gp_stationary.fit(X, y)
gp_non_stationary.fit(X, y)
print("Theta:\n\tStationary: {:.3f} \t Non-stationary: {:.3f}"
      .format(gp_stationary.theta_[0], gp_non_stationary.theta_[0]))
print("Posterior probability (negative, average, log):\n\t"
      "Stationary: {:.5f} \t Non-stationary: {:.5f}"
      .format(gp_stationary.posterior_function_value_,
              gp_non_stationary.posterior_function_value_))
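# A hedged continuation sketch (not part of the original excerpt): plot the
# mean predictions of both fitted models over the mesh x defined above.
# matplotlib is an assumption here; the later examples use plt the same way.
import matplotlib.pyplot as plt

plt.figure()
plt.plot(x, f(x), 'r:', label='true function f(x)')
plt.plot(X, y, 'r.', markersize=8, label='observations')
plt.plot(x, gp_stationary.predict(x), 'b-', label='stationary GP')
plt.plot(x, gp_non_stationary.predict(x), 'g-', label='non-stationary GP')
plt.legend(loc='best')
plt.show()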
Example #4

Xtrain = np.random.random((200, 4)) * 2 - 1
ytrain = f(Xtrain)

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
labels = {
    1: "Isotropic",
    4: "Automatic Relevance Determination",
    8: "Factor Analysis"
}
for i, n in enumerate(labels.keys()):
    train_sizes, train_scores, test_scores = \
        learning_curve(GaussianProcess(corr='squared_exponential',
                                       theta0=[1.0] * n, thetaL=[1e-4] * n,
                                       thetaU=[1e2] * n),
                       Xtrain, ytrain, scoring="mean_squared_error",
                       cv=10, n_jobs=4)
    test_scores = -test_scores  # Scores correspond to negative MSE
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_min = np.min(test_scores, axis=1)
    test_scores_max = np.max(test_scores, axis=1)

    plt.plot(train_sizes, test_scores_mean, label=labels[n], color=colors[i])
    plt.fill_between(train_sizes,
                     test_scores_min,
                     test_scores_max,
                     alpha=0.2,
                     color=colors[i])
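
# The excerpt ends after the loop; a hedged sketch of how the figure might be
# finished (axis labels are illustrative, not taken from the original file):
plt.xlabel("Number of training examples")
plt.ylabel("Mean squared error")
plt.legend(loc="best")
plt.show()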
Example #5
    by ARD. Furthermore, the function values at x in R^3 and at
    x + \alpha (1, 2, 0) + \beta (1, 0, 2) are identical for all x and
    all alpha and beta. This can be exploited by FAD.
    """
    return np.tanh(2 * X[:, 0] - X[:, 1] - X[:, 2])


Xtrain = np.random.random((100, 6)) * 2 - 1
ytrain = f(Xtrain)

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
labels = {True: "Bayesian GP", False: "Standard GP"}
for i, bayesian in enumerate(labels.keys()):
    model = GaussianProcess(corr='squared_exponential',
                            theta0=[1.0] * 12,
                            thetaL=[1e-4] * 12,
                            thetaU=[1e2] * 12)
    if bayesian:
        model = BayesianGaussianProcess(model,
                                        n_posterior_samples=25,
                                        n_burnin=250,
                                        n_sampling_steps=25)

    train_sizes, train_scores, test_scores = \
        learning_curve(model, Xtrain, ytrain, scoring="mean_squared_error",
                       cv=10, n_jobs=1)
    test_scores = -test_scores  # Scores correspond to negative MSE
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_min = np.min(test_scores, axis=1)
    test_scores_max = np.max(test_scores, axis=1)
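
    # The excerpt stops before any plotting; a hedged sketch that mirrors the
    # plotting step of Example #4 (not part of the original file):
    plt.plot(train_sizes, test_scores_mean, label=labels[bayesian],
             color=colors[i])
    plt.fill_between(train_sizes, test_scores_min, test_scores_max,
                     alpha=0.2, color=colors[i])

plt.xlabel("Number of training examples")
plt.ylabel("Mean squared error")
plt.legend(loc="best")
plt.show()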