def test_nystroem_callable():
    # Test Nystroem on a callable.
    rnd = np.random.RandomState(42)
    n_samples = 10
    X = rnd.uniform(size=(n_samples, 4))

    def logging_histogram_kernel(x, y, log):
        """Histogram kernel that writes to a log."""
        log.append(1)
        return np.minimum(x, y).sum()

    kernel_log = []
    X = list(X)     # test input validation
    Nystroem(kernel=logging_histogram_kernel,
             n_components=(n_samples - 1),
             kernel_params={'log': kernel_log}).fit(X)
    assert_equal(len(kernel_log), n_samples * (n_samples - 1) / 2)

    def linear_kernel(X, Y):
        return np.dot(X, Y.T)

    # if degree, gamma or coef0 is passed with a callable kernel, a ValueError is raised
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({'gamma': 1}, {'coef0': 1}, {'degree': 2})
    for param in params:
        ny = Nystroem(kernel=linear_kernel, **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(X)
def test_nystroem_poly_kernel_params():
    """Non-regression: Nystroem should pass other parameters beside gamma."""
    rnd = np.random.RandomState(37)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=3.1, coef0=.1)
    nystroem = Nystroem(kernel="polynomial", n_components=X.shape[0],
                        degree=3.1, coef0=.1)
    X_transformed = nystroem.fit_transform(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)
Example #3
class SparseKernelClassifier(CDClassifier):
    def __init__(self, mode='exact', kernel='rbf', gamma=1e-3, C=1, alpha=1,
                 n_components=500, n_jobs=1, verbose=False):
        self.mode = mode
        self.kernel = kernel
        self.gamma = gamma
        self.C = C
        self.alpha = alpha
        self.n_components = n_components
        self.n_jobs = n_jobs
        self.verbose = verbose
        super(SparseKernelClassifier, self).__init__(
            C=C,
            alpha=alpha,
            loss='squared_hinge',
            penalty='l1',
            multiclass=False,
            debiasing=True,
            Cd=C,
            warm_debiasing=True,
            n_jobs=n_jobs,
            verbose=False,
        )

    def fit(self, X, y):
        if self.mode == 'exact':
            K = pairwise_kernels(
                X,
                metric=self.kernel,
                filter_params=True,
                gamma=self.gamma
            )
            self.X_train_ = X
        else:
            self.kernel_sampler_ = Nystroem(
                kernel=self.kernel,
                gamma=self.gamma,
                n_components=self.n_components
            )
            K = self.kernel_sampler_.fit_transform(X)
        super(SparseKernelClassifier, self).fit(K, y)
        return self

    def decision_function(self, X):
        if self.mode == 'exact':
            K = pairwise_kernels(
                X, self.X_train_,
                metric=self.kernel,
                filter_params=True,
                gamma=self.gamma
            )
        else:
            K = self.kernel_sampler_.transform(X)
        return super(SparseKernelClassifier, self).decision_function(K)
def test_nystroem_vs_sklearn():
    np.random.seed(42)
    X = np.random.randn(100, 5)

    kernel = Nystroem(kernel='linear', random_state=42)
    kernelR = NystroemR(kernel='linear', random_state=42)

    y1 = kernel.fit_transform([X])[0]
    y2 = kernelR.fit_transform(X)

    assert_array_almost_equal(y1, y2)
Example #5
class WeightedSparseKernelClassifier(LinearSVC):
    def __init__(
            self, mode='exact', kernel='rbf', gamma=1e-3, C=1,
            multi_class='ovr', class_weight='auto', n_components=5000,
            verbose=False
    ):
        self.mode = mode
        self.kernel = kernel
        self.gamma = gamma
        self.C = C
        self.multi_class = multi_class
        self.class_weight = class_weight
        self.n_components = n_components
        self.verbose = verbose

        super(WeightedSparseKernelClassifier, self).__init__(
            C=C,
            loss='squared_hinge',
            penalty='l1',
            dual=False,
            verbose=verbose
        )

    def fit(self, X, y):
        if self.mode == 'exact':
            K = pairwise_kernels(
                X,
                metric=self.kernel,
                filter_params=True,
                gamma=self.gamma
            )
            self.X_train_ = X
        else:
            self.kernel_sampler_ = Nystroem(
                kernel=self.kernel,
                gamma=self.gamma,
                n_components=self.n_components
            )
            K = self.kernel_sampler_.fit_transform(X)
        return super(WeightedSparseKernelClassifier, self).fit(K, y)

    def decision_function(self, X):
        if self.mode == 'exact':
            K = pairwise_kernels(
                X, self.X_train_,
                metric=self.kernel,
                filter_params=True,
                gamma=self.gamma
            )
        else:
            K = self.kernel_sampler_.transform(X)
        return super(WeightedSparseKernelClassifier, self).decision_function(K)
Example #6
    def ApplyNystroemOnKernelMatrix(x, kernelFn, nComponents):
        """
        Given a data matrix (each row is an observation, each column is a variable) and a kernel function,
        compute the Nystroem approximation of its uncentered Kernel matrix.

        :param x: numpy matrix. Data matrix.
        :param kernelFn: callable function. Returned by calling KernelSelector().
        :param nComponents: integer. Number of ranks retained in Nystroem method.
        :return
            numpy matrix.
        """
        nystroem = Nystroem(kernelFn, n_components=nComponents)
        return np.matrix(nystroem.fit_transform(x))
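
# A minimal usage sketch (not part of the original source; it assumes the
# method above is exposed as a plain/static function). Any callable that takes
# two sample vectors and returns a scalar works as kernelFn; `rbf_like` below
# is a hypothetical stand-in for what KernelSelector() would return.
import numpy as np

def rbf_like(x, y, gamma=0.5):
    return np.exp(-gamma * np.sum((x - y) ** 2))

x_demo = np.random.RandomState(0).rand(30, 3)
G = ApplyNystroemOnKernelMatrix(x_demo, rbf_like, nComponents=10)
# G * G.T (matrix product, since G is an np.matrix) approximates the
# 30 x 30 uncentered kernel matrix of rbf_like on x_demo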
def test_nystroem_singular_kernel():
    # test that nystroem works with singular kernel matrix
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    X = np.vstack([X] * 2)  # duplicate samples

    gamma = 100
    N = Nystroem(gamma=gamma, n_components=X.shape[0]).fit(X)
    X_transformed = N.transform(X)

    K = rbf_kernel(X, gamma=gamma)

    assert_array_almost_equal(K, np.dot(X_transformed, X_transformed.T))
    assert_true(np.all(np.isfinite(X_transformed)))
Example #8
    def gram_Nystroem(self, x, nComponents):
        """
        Nystroem approximation of the kernel matrix given data. No centering.

        :type x: 2d array, with size n * p
        :param x: data matrix for the covariates belonging to the same group, associated
                  with the given matrix.

        :type nComponents: int
        :param nComponents: number of rank to retain

        :return: approximated kernel matrix with reduced rank, with size n * nComponents
        """
        nystroem = Nystroem(self.fn, n_components=nComponents)
        return nystroem.fit_transform(x)
Example #9
    def fit(self, X, Y, weights=None, context_transform=True):
        """ Trains policy by weighted maximum likelihood.

        .. note:: This call changes this policy (self)

        Parameters
        ----------
        X: array-like, shape (n_samples, context_dims)
            Context vectors

        Y: array-like, shape (n_samples, weight_dims)
            Low-level policy parameter vectors

        weights: array-like, shape (n_samples,)
            Weights of individual samples (should depend on the obtained
            reward)
        """
        # Kernel approximation
        self.nystroem = Nystroem(
            kernel=self.kernel,
            gamma=self.gamma,
            coef0=self.coef0,
            n_components=np.minimum(X.shape[0], self.n_components),
            random_state=self.random_state,
        )
        self.X = self.nystroem.fit_transform(X)
        if self.bias:
            self.X = np.hstack((self.X, np.ones((self.X.shape[0], 1))))
        if self.normalize:
            self.X /= np.abs(self.X).sum(1)[:, None]

        # Standard ridge regression
        ridge = Ridge(alpha=self.alpha, fit_intercept=False)
        ridge.fit(self.X, Y, weights)
        self.W = ridge.coef_
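
    # A minimal sketch of the matching prediction step (not in the original
    # excerpt; it assumes the same bias/normalize preprocessing flags used in fit):
    def predict(self, contexts):
        phi = self.nystroem.transform(contexts)
        if self.bias:
            phi = np.hstack((phi, np.ones((phi.shape[0], 1))))
        if self.normalize:
            phi /= np.abs(phi).sum(1)[:, None]
        # Ridge stores coef_ as (n_targets, n_features), hence the transpose
        return phi.dot(self.W.T)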
def test_nystroem_default_parameters():
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(10, 4))

    # rbf kernel should behave as gamma=None by default
    # aka gamma = 1 / n_features
    nystroem = Nystroem(n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = rbf_kernel(X, gamma=None)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)

    # chi2 kernel should behave as gamma=1 by default
    nystroem = Nystroem(kernel='chi2', n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = chi2_kernel(X, gamma=1)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)
def test_nystrom_approximation():
    # some basic tests
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 4))

    # With n_components = n_samples this is exact
    X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
    K = rbf_kernel(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    trans = Nystroem(n_components=2, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 2))

    # test callable kernel
    linear_kernel = lambda X, Y: np.dot(X, Y.T)
    trans = Nystroem(n_components=2, kernel=linear_kernel, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 2))
def test_lndmrk_nystroem_approximation():
    np.random.seed(42)
    X = np.random.randn(100, 5)

    u = np.arange(X.shape[0])[5::1]
    v = np.arange(X.shape[0])[::1][:u.shape[0]]
    lndmrks = X[np.unique((u, v))]

    kernel = LandmarkNystroem(kernel='rbf', random_state=42)
    kernelR = NystroemR(kernel='rbf', random_state=42)

    y1_1 = kernel.fit_transform([X])[0]
    kernel.landmarks = lndmrks
    y1_2 = kernel.fit_transform([X])[0]

    y2 = kernelR.fit_transform(X)

    assert_array_almost_equal(y2, y1_1)

    assert not all((np.abs(y2 - y1_2) > 1E-6).flatten())
Example #13
class NystromScikit:

    """
        Nystrom implementation based on the scikit-learn Nystroem wrapper.
        The main difference is in the selection of inducing inputs.
    """

    def __init__(self, rank=10, random_state=42):
        """
        :param rank: (``int``) Maximal decomposition rank.

        :param random_state: (``int``) Random generator seed.
        """
        self.trained = False
        self.rank = rank
        self.random_state = random_state


    def fit(self, K, y):
        """
        Fit approximation to the kernel function / matrix.

        :param K: (``numpy.ndarray``) or (``Kinterface``). The kernel to be approximated with G.

        :param y: (``numpy.ndarray``) Class labels :math:`y_i \in {-1, 1}` or regression targets.
        """
        assert isinstance(K, Kinterface)

        self.n           = K.shape[0]
        kernel           = lambda x, y: K.kernel(x, y, **K.kernel_args)
        self.model       = Nystroem(kernel=kernel,
                                    n_components=self.rank,
                                    random_state=self.random_state)

        self.model.fit(K.data, y)
        self.active_set_ = list(self.model.component_indices_[:self.rank])
        assert len(set(self.active_set_)) == len(self.active_set_) == self.rank
        R = self.model.normalization_
        self.G = K[:, self.active_set_].dot(R)
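        # G is the (n, rank) Nystrom factor: G.dot(G.T) approximates the full kernel matrix K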
        self.trained = True
def test_nystroem_approximation():
    # some basic tests
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 4))

    # With n_components = n_samples this is exact
    X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
    K = rbf_kernel(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    trans = Nystroem(n_components=2, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 2))

    # test callable kernel
    def linear_kernel(X, Y):
        return np.dot(X, Y.T)
    trans = Nystroem(n_components=2, kernel=linear_kernel, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 2))

    # test that available kernels fit and transform
    kernels_available = kernel_metrics()
    for kern in kernels_available:
        trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
        X_transformed = trans.fit(X).transform(X)
        assert_equal(X_transformed.shape, (X.shape[0], 2))
Example #15
class LocalitySensitiveHash():

    def __init__(self, r=0.1, num_functions=50, dimensionality=128, gamma=1):
        self.feature_map_LSH = DiscreteLocalitySensitiveHash(r, num_functions, dimensionality)
        self.feature_map_nystroem = Nystroem(kernel='rbf', gamma=gamma, n_components=dimensionality)

    def set_params(self, r=0.1, num_functions=50, dimensionality=128, gamma=1):
        self.feature_map_LSH = DiscreteLocalitySensitiveHash(r, num_functions, dimensionality)
        self.feature_map_nystroem = Nystroem(kernel='rbf', gamma=gamma, n_components=dimensionality)

    def transform(self, data_matrix):
        data_matrix_dense = self.feature_map_nystroem.fit_transform(data_matrix)
        return self.feature_map_LSH.transform(data_matrix_dense)
Example #16
class LSH():

    def __init__(self, r=0.1, num_functions=50, dimensionality=128, gamma=1):
        self.feature_map_LSH = discreteLSH(r, num_functions, dimensionality)
        self.feature_map_nystroem = Nystroem(kernel='rbf', gamma=gamma, n_components=dimensionality)

    def set_params(self, r=0.1, num_functions=50, dimensionality=128, gamma=1):
        self.feature_map_LSH = discreteLSH(r, num_functions, dimensionality)
        self.feature_map_nystroem = Nystroem(kernel='rbf', gamma=gamma, n_components=dimensionality)

    def transform(self, X):
        Xl = self.feature_map_nystroem.fit_transform(X)
        return self.feature_map_LSH.transform(Xl)
Example #17
    def __init__(self, model, bounds, n_components, seed):
        self.gp = model
        self.bounds = bounds
        self.n_components = n_components
        self.rng = np.random.RandomState(seed)

        self.X_space = self.rng.uniform(self.bounds[:, 0], self.bounds[:, 1],
                                        (1000, self.bounds.shape[0]))

        assert self.gp.X_fit_.shape[1] == self.X_space.shape[1]

        self.kernel = self.gp.kernel_
        self.nystr = Nystroem(
            n_components=min(self.n_components, self.X_space.shape[0]),
            kernel='precomputed', random_state=self.rng)
        self.nystr.fit(self.kernel(self.X_space))
Example #18
 def fit(self, X, y):
     if self.mode == 'exact':
         K = pairwise_kernels(
             X,
             metric=self.kernel,
             filter_params=True,
             gamma=self.gamma
         )
         self.X_train_ = X
     else:
         self.kernel_sampler_ = Nystroem(
             kernel=self.kernel,
             gamma=self.gamma,
             n_components=self.n_components
         )
         K = self.kernel_sampler_.fit_transform(X)
     return super(WeightedSparseKernelClassifier, self).fit(K, y)
Example #19
    def fit(self, K, y):
        """
        Fit approximation to the kernel function / matrix.

        :param K: (``numpy.ndarray``) or (``Kinterface``). The kernel to be approximated with G.

        :param y: (``numpy.ndarray``) Class labels :math:`y_i \in {-1, 1}` or regression targets.
        """
        assert isinstance(K, Kinterface)

        self.n           = K.shape[0]
        kernel           = lambda x, y: K.kernel(x, y, **K.kernel_args)
        self.model       = Nystroem(kernel=kernel,
                                    n_components=self.rank,
                                    random_state=self.random_state)

        self.model.fit(K.data, y)
        self.active_set_ = list(self.model.component_indices_[:self.rank])
        assert len(set(self.active_set_)) == len(self.active_set_) == self.rank
        R = self.model.normalization_
        self.G = K[:, self.active_set_].dot(R)
        self.trained = True
Example #20
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.990909090909091
exported_pipeline = make_pipeline(
    Nystroem(gamma=0.1, kernel="poly", n_components=9),
    MultinomialNB(alpha=0.01, fit_prior=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #21
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=229, stratify=y)

## grid search
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'laplacian']
aucs = np.zeros((5, 4, 5))

count = 1
for i in range(len(Cs)):
    for j in range(len(gammas)):
        for k in range(len(kernels)):
            print(count)
            clf = LinearSVC(penalty='l2', C=Cs[i], class_weight='balanced', random_state=229, max_iter=3000, tol=1e-3)
            feature_map = Nystroem(kernel=kernels[k], gamma=gammas[j], random_state=229, n_components=300)
            train_transformed = feature_map.fit_transform(x_train)
            aucs[i, j, k] = np.mean(cross_val_score(clf, train_transformed, y_train, scoring='roc_auc', cv=5))
            count += 1

print(aucs)

# pick the best hyperparameters from the grid-search AUCs
i_best, j_best, k_best = np.unravel_index(np.argmax(aucs), aucs.shape)
C_best = Cs[i_best]
print("Optimal lambda:", 1 / C_best)

gamma_best = gammas[j_best]
print("Optimal gamma:", gamma_best)

clf = LinearSVC(penalty='l2', C=C_best, class_weight='balanced', random_state=229, max_iter=5000, tol=1e-3)
feature_map = Nystroem(kernel='laplacian', gamma=gamma_best, random_state=229, n_components=300)
train_transformed = feature_map.fit_transform(x_train)
Example #22
def compare(dataset_name, gamma, task='classification'):
    data_train, data_test, targets_train, targets_test = data_preparation(dataset_name)
    
    if task == 'classification':
        rbf_svm = SVC(gamma=gamma, kernel='rbf')
        linear_svm = LinearSVC()
    if task == 'regression':
        rbf_svm = SVR(gamma=gamma, kernel='rbf')
        linear_svm = LinearSVR()

    # baseline1: SVM with RBF kernel
    starttime = time()
    rbf_svm.fit(data_train, targets_train)
    rbf_svm_time = time() - starttime
    rbf_svm_score = rbf_svm.score(data_test, targets_test)

    # baseline2: Linear SVM
    starttime = time()
    linear_svm.fit(data_train, targets_train)
    linear_svm_time = time() - starttime
    linear_svm_score = linear_svm.score(data_test, targets_test)

    # Approximation methods for random features and linear SVM
    rff = RFF(gamma=gamma)
    rff_sc = RFF_sincos(gamma=gamma)
    nystroem = Nystroem(gamma=gamma)

    if task == 'classification':
        rff_approx = Pipeline([('feature_map', rff), ('svm', LinearSVC())])
        rff_sc_approx = Pipeline([('feature_map', rff_sc), ('svm', LinearSVC())])
        nystroem_approx = Pipeline([('feature_map', nystroem), ('svm', LinearSVC())])
    if task == 'regression':
        rff_approx = Pipeline([('feature_map', rff), ('svm', LinearSVR())])
        rff_sc_approx = Pipeline([('feature_map', rff_sc), ('svm', LinearSVR())])
        nystroem_approx = Pipeline([('feature_map', nystroem), ('svm', LinearSVR())])
    
    rff_scores = []
    rff_times = []

    rff_sc_scores = []
    rff_sc_times = []

    nystroem_scores = []
    nystroem_times = []

    if data_test.shape[0] > 5000:
        component_nums = np.arange(50, 1000, 50)
    else:
        component_nums = int(data_test.shape[0]/30) * np.arange(1, 10)

    for n in component_nums:
        rff_approx.set_params(feature_map__n_components=n)
        starttime = time()
        rff_approx.fit(data_train, targets_train)
        rff_times.append(time() - starttime)
        rff_score = rff_approx.score(data_test, targets_test)
        rff_scores.append(rff_score)

        rff_sc_approx.set_params(feature_map__n_components=n)
        starttime = time()
        rff_sc_approx.fit(data_train, targets_train)
        rff_sc_times.append(time() - starttime)
        rff_sc_score = rff_sc_approx.score(data_test, targets_test)
        rff_sc_scores.append(rff_sc_score)

        nystroem_approx.set_params(feature_map__n_components=n)
        starttime = time()
        nystroem_approx.fit(data_train, targets_train)
        nystroem_times.append(time() - starttime)
        nystroem_score = nystroem_approx.score(data_test, targets_test)
        nystroem_scores.append(nystroem_score)

    plot_comparison(dataset_name,
                    rbf_svm_score, rbf_svm_time,
                    linear_svm_score, linear_svm_time,
                    rff_scores, rff_times,
                    rff_sc_scores, rff_sc_times,
                    nystroem_scores, nystroem_times,
                    component_nums, task)
Example #23
from params import ts_depths, n_fea, gamma, np, sp
from numpy import linalg as la  # la.norm is used by expkern below
class Whitener:
    def __init__(self,X):
        self.Xmean = X.mean(0)
        self.Xstd = X.std(0)
    def whiten(self,Z):
        return (Z-self.Xmean)/self.Xstd
    def unwhiten(self,Zw):
        return Zw*self.Xstd + self.Xmean

def expkern(x,y):
    return np.exp(-gamma*la.norm(x-y))

wh = Whitener(ts_depths)
ts_depths_w = wh.whiten(ts_depths)
xx = np.linspace(ts_depths_w.min(),ts_depths_w.max(),n_fea)[:,np.newaxis]
rbf_tr = Nystroem(expkern, n_components=n_fea)  # gamma is captured inside expkern; passing it alongside a callable kernel raises a ValueError
#rbf_tr = Nystroem(gamma=gamma,n_components=n_fea)
#class rbf_transformer:
#    def __init__(self,X,gamma):
#        self.X = X
#        self.gamma = gamma
#    def transform(self,xx):
#        return rbf_kernel(xx,self.X) 

#rbf_tr = rbf_transformer(xx,gamma)
#rbf_tr.fit(x)
rbf_tr.fit(xx)
ts_depths_tr = rbf_tr.transform(ts_depths_w)

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    RandomForestRegressor(
        # We tuned the hyperparameters of the RandomForestRegressor to get a good
        # enough predictive performance for a restricted execution time.
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    make_pipeline(
        Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)
    ),
    KNeighborsRegressor(n_neighbors=15),
]
score_iterative_imputer = pd.DataFrame()
# iterative imputer is sensible to the tolerance and
# dependent on the estimator used internally.
# we tuned the tolerance to keep this example run with limited computational
# resources while not changing the results too much compared to keeping the
# stricter default value for the tolerance parameter.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        IterativeImputer(
            random_state=0, estimator=impute_estimator, max_iter=25, tol=tol
        ),
                                              line=dict(color='black',
                                                        width=1)))

fig.append_trace(projection_fourier_1, 1, 2)
fig.append_trace(projection_fourier_2, 1, 2)

fig['layout']['xaxis2'].update(title='1st principal component',
                               zeroline=False,
                               showgrid=False)
fig['layout']['yaxis2'].update(title='2nd component',
                               zeroline=False,
                               showgrid=False)

## Nystroem

nystroem = Nystroem(gamma=gamma, random_state=1)
nystroem.fit(X_pca)
X_nystroem_pca = nystroem.transform(X_pca)

projection_nystroem_1 = go.Scatter(x=X_nystroem_pca[reds, 0],
                                   y=X_nystroem_pca[reds, 1],
                                   mode='markers',
                                   showlegend=False,
                                   marker=dict(color='red',
                                               line=dict(color='black',
                                                         width=1)))
projection_nystroem_2 = go.Scatter(x=X_nystroem_pca[blues, 0],
                                   y=X_nystroem_pca[blues, 1],
                                   mode='markers',
                                   showlegend=False,
                                   marker=dict(color='blue',
Example #26
def test_sklearn_benchmarks(ray_start_cluster_2_nodes):
    ESTIMATORS = {
        "CART":
        DecisionTreeClassifier(),
        "ExtraTrees":
        ExtraTreesClassifier(n_estimators=10),
        "RandomForest":
        RandomForestClassifier(),
        "Nystroem-SVM":
        make_pipeline(Nystroem(gamma=0.015, n_components=1000),
                      LinearSVC(C=1)),
        "SampledRBF-SVM":
        make_pipeline(RBFSampler(gamma=0.015, n_components=1000),
                      LinearSVC(C=1)),
        "LogisticRegression-SAG":
        LogisticRegression(solver="sag", tol=1e-1, C=1e4),
        "LogisticRegression-SAGA":
        LogisticRegression(solver="saga", tol=1e-1, C=1e4),
        "MultilayerPerceptron":
        MLPClassifier(hidden_layer_sizes=(32, 32),
                      max_iter=100,
                      alpha=1e-4,
                      solver="sgd",
                      learning_rate_init=0.2,
                      momentum=0.9,
                      verbose=1,
                      tol=1e-2,
                      random_state=1),
        "MLP-adam":
        MLPClassifier(hidden_layer_sizes=(32, 32),
                      max_iter=100,
                      alpha=1e-4,
                      solver="adam",
                      learning_rate_init=0.001,
                      verbose=1,
                      tol=1e-2,
                      random_state=1)
    }
    # Load dataset.
    print("Loading dataset...")
    unnormalized_X_train, y_train = pickle.load(
        open(
            os.path.join(os.path.dirname(__file__),
                         "mnist_784_100_samples.pkl"), "rb"))
    # Normalize features.
    X_train = unnormalized_X_train / 255

    register_ray()
    train_time = {}
    random_seed = 0
    # Use two workers per classifier.
    num_jobs = 2
    with joblib.parallel_backend("ray"):
        for name in sorted(ESTIMATORS.keys()):
            print("Training %s ... " % name, end="")
            estimator = ESTIMATORS[name]
            estimator_params = estimator.get_params()
            estimator.set_params(
                **{
                    p: random_seed
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=num_jobs)
            time_start = time.time()
            estimator.fit(X_train, y_train)
            train_time[name] = time.time() - time_start
            print("training", name, "took", train_time[name], "seconds")
Example #27
# interaction that we would like to model could be the impact of the rain that
# might not be the same during the working days and the week-ends and holidays
# for instance.
#
# To model all such interactions, we could either use a polynomial expansion on
# all marginal features at once, after their spline-based expansion. However,
# this would create a quadratic number of features which can cause overfitting
# and computational tractability issues.
#
# Alternatively, we can use the Nyström method to compute an approximate
# polynomial kernel expansion. Let us try the latter:
from sklearn.kernel_approximation import Nystroem

cyclic_spline_poly_pipeline = make_pipeline(
    cyclic_spline_transformer,
    Nystroem(kernel="poly", degree=2, n_components=300, random_state=0),
    RidgeCV(alphas=alphas),
)
evaluate(cyclic_spline_poly_pipeline, X, y, cv=ts_cv)

# %%
#
# We observe that this model can almost rival the performance of the gradient
# boosted trees with an average error around 6% of the maximum demand.
#
# Note that while the final step of this pipeline is a linear regression model,
# the intermediate steps such as the spline feature extraction and the Nyström
# kernel approximation are highly non-linear. As a result the compound pipeline
# is much more expressive than a simple linear regression model with raw features.
#
# For the sake of completeness, we also evaluate the combination of one-hot
Example #28
import numpy as np
from params import *
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.ensemble import RandomTreesEmbedding

fabric = np.genfromtxt(fabric_file, delimiter=' ',
                       skip_header=True).astype('float32')

depths = fabric[:, 0].reshape(fabric[:, 0].size, 1)
depths_sd = depths.std()
depths_mean = depths.mean()
depths_norm = (depths - depths_mean) / depths_sd

transformer = Nystroem(kernel='rbf', gamma=gamma, n_components=n_fea)
depths_tr = transformer.fit_transform(depths_norm).astype('float32')
n_fea = depths_tr.shape[1]
wais_cs = fabric[:, 1:4]
depths_test = np.linspace(-1, 1, 1000).reshape(1000, 1)
depths_test_tr = transformer.transform(depths_test).astype('float32')
Example #29
def predefined_ops():
    '''Return a dict of user-defined, non-default operator instances.
    '''
    clean = {
        'clean':
        Cleaner(dtype_filter='not_datetime',
                na1='null',
                na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),

        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'IsolationForest',
                            'contamination': 0.1
                        }),
        'inlierLocal':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'LocalOutlierFactor',
                            'contamination': 0.1
                        }),
        'inlierEllip':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'EllipticEnvelope',
                            'contamination': 0.1
                        }),
        'inlierOsvm':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'OneClassSVM',
                            'contamination': 0.1
                        }),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1,
                          booster='gbtree',
                          max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),

        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    imp = {
        "impXGB":
        XGBClassifier(n_jobs=-1,
                      booster='gbtree',
                      max_depth=2,
                      n_estimators=50),
        "impRF":
        ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
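
# A minimal usage sketch (hypothetical, not from the original source): look up
# preconfigured operator instances by name and chain them into a pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

ops = predefined_ops()
pipe = make_pipeline(ops['stdscale'], ops['Nys'], LogisticRegression(max_iter=1000))
# pipe.fit(X, y) scales the features, maps them through the Nystroem
# approximation ('Nys'), and fits a logistic regression on the result.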
Example #30
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=75)

# Average CV score on the training set was:0.8098183166481275
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.8500000000000001, kernel="linear", n_components=7),
    GradientBoostingClassifier(learning_rate=0.5, max_depth=3, max_features=0.5, min_samples_leaf=4, min_samples_split=6, n_estimators=100, subsample=0.5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #31
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=45)

# Average CV score on the training set was:0.7680533926585095
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.25, kernel="polynomial", n_components=9),
    ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=5, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    RandomForestRegressor(
        # We tuned the hyperparameters of the RandomForestRegressor to get a good
        # enough predictive performance for a restricted execution time.
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    make_pipeline(Nystroem(kernel="polynomial", degree=2, random_state=0),
                  Ridge(alpha=1e3)),
    KNeighborsRegressor(n_neighbors=15),
]
score_iterative_imputer = pd.DataFrame()
# iterative imputer is sensible to the tolerance and
# dependent on the estimator used internally.
# we tuned the tolerance to keep this example run with limited computational
# resources while not changing the results too much compared to keeping the
# stricter default value for the tolerance parameter.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        IterativeImputer(random_state=0,
                         estimator=impute_estimator,
                         max_iter=25,
Example #33
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=27)

# Average CV score on the training set was:0.7882832777159806
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.5, kernel="linear", n_components=8),
    ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.9000000000000001, min_samples_leaf=3, min_samples_split=6, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #34
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9917144075724378
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy),
               Nystroem(gamma=0.4, kernel="chi2", n_components=6)),
    GaussianProcessRegressor(kernel=Matern(length_scale=2.9000000000000004,
                                           nu=1.5),
                             n_restarts_optimizer=155,
                             normalize_y=True))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #35
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.01,
                                             max_depth=1,
                                             min_child_weight=16,
                                             n_estimators=100,
                                             nthread=1,
                                             subsample=0.55)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.9,
                                            learning_rate=0.001,
                                            loss="ls",
                                            max_depth=2,
                                            max_features=0.7500000000000001,
                                            min_samples_leaf=12,
                                            min_samples_split=17,
                                            n_estimators=100,
                                            subsample=1.0)),
    Nystroem(gamma=0.25, kernel="laplacian", n_components=10),
    GradientBoostingRegressor(alpha=0.95,
                              learning_rate=0.1,
                              loss="lad",
                              max_depth=2,
                              max_features=0.8500000000000001,
                              min_samples_leaf=19,
                              min_samples_split=7,
                              n_estimators=100,
                              subsample=0.6500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #36
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.8028858267395129
exported_pipeline = make_pipeline(
    Nystroem(gamma=0.7000000000000001, kernel="linear", n_components=6),
    XGBClassifier(learning_rate=1.0, max_depth=8, min_child_weight=4, n_estimators=100, nthread=1, subsample=0.45)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #37
 kcca.fit(ktrain,Ytrain)
 XtrainT = kcca.transform(ktrain)
 XtestT = kcca.transform(ktest)
 kccaScores = np.zeros((2, len(nComponents)))  # np.alen is deprecated/removed in recent NumPy
 for i,n in enumerate(nComponents):   
     kccaScores[:,i] = util.classify(XtrainT[:,0:n],XtestT[:,0:n],labelsTrain,labelsTest)
 
 #%% Subsampling methods
 kpls = PLSRegression(n_components=150)
 nComponents = np.arange(173,2173,100)
 
 # Nystroem method
 elapTimeNys = np.zeros(np.shape(nComponents))
 kplsScoresNys = np.zeros((2,3))
 for i,n in enumerate(nComponents):
     nys = Nystroem(n_components=n,gamma=gamma)
     nys.fit(Xtrain)
     ktrain = nys.transform(Xtrain)
     ktest = nys.transform(Xtest)
     startTime = timeit.default_timer()
     kpls.fit(ktrain,Ytrain)
     elapTimeNys[i] = timeit.default_timer() - startTime
     XtrainT = kpls.transform(ktrain)
     XtestT = kpls.transform(ktest)
     
     if n==573:
         kplsScoresNys[:,0] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest)
     elif n==1073:
         kplsScoresNys[:,1] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest)
     elif n==1573:
         kplsScoresNys[:,2] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest)
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluation Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input data set.

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :param n_sample: The number of sample to take into account in the reconstruction (can't be too large)

    :return:
    """

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute a heuristic gamma as the mean euclidean distance between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample,
                                               U_centroids,
                                               metric,
                                               centroids_norm,
                                               samples_norm,
                                               gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop -
                              nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(
        x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = None
    log_memory_usage(
        "Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample,
                                     uniform_sample_norm,
                                     gamma=gamma)
    log_memory_usage(
        "Memory after SVD computation in uniform part of make_nystrom_evaluation"
    )

    nystrom_embedding_uniform = nystrom_transformation(sample,
                                                       uniform_sample,
                                                       metric_uniform,
                                                       uniform_sample_norm,
                                                       samples_norm,
                                                       gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel_special = special_rbf_kernel(sample,
                                             sample,
                                             gamma,
                                             norm_X=samples_norm,
                                             norm_Y=samples_norm)
    real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")

    #################################
    # Sklearn based Nystrom uniform #
    #################################

    sklearn_nystrom = Nystroem(gamma=gamma,
                               n_components=uniform_sample.shape[0])
    sklearn_nystrom = sklearn_nystrom.fit(uniform_sample)
    sklearn_transfo = sklearn_nystrom.transform(sample)
    kernel_sklearn_nys = sklearn_transfo @ sklearn_transfo.T

    ################################################################

    ####################
    # Error evaluation #
    ####################

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(
        nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train,
                                                           U_centroids,
                                                           metric,
                                                           centroids_norm,
                                                           None,
                                                           gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test,
                                                          U_centroids,
                                                          metric,
                                                          centroids_norm,
                                                          None,
                                                          gamma=gamma)

        linear_svc_clf = LinearSVC()
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding,
                                                    y_test)
        time_classification_stop = time.process_time()

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform":
        sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)
Example #39
 def __init__(self, r=0.1, num_functions=50, dimensionality=128, gamma=1):
     self.feature_map_LSH = discreteLSH(r, num_functions, dimensionality)
     self.feature_map_nystroem = Nystroem(kernel='rbf', gamma=gamma, n_components=dimensionality)
Example #40
 def set_params(self, r=0.1, num_functions=50, dimensionality=128, gamma=1):
     self.feature_map_LSH = DiscreteLocalitySensitiveHash(r, num_functions, dimensionality)
     self.feature_map_nystroem = Nystroem(kernel='rbf', gamma=gamma, n_components=dimensionality)
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from fully_connected.utils import treshhold_labels, normalize_data, load_monolithic

if __name__ == '__main__':
    train, dev, test = load_monolithic('data_monolithic_mfcc.pkl')

    X_train, S_train, Y_train = train
    X_dev, S_dev, Y_dev = dev
    X_test, S_test, Y_test = test
    X_train, X_dev, X_test = normalize_data(X_train, X_dev, X_test)
    Y_train, Y_dev, Y_test = treshhold_labels(Y_train, Y_dev, Y_test, .25)

    # rbf_feature = RBFSampler(gamma=1, n_components=800, random_state=1)
    rbf_feature = Nystroem(gamma=1, n_components=200, random_state=1)
    print('transform features')
    X_train_features = rbf_feature.fit_transform(X_train)
    X_dev_features = rbf_feature.transform(X_dev)
    print('finish')
    clf = SGDClassifier(max_iter=400, loss='log', n_jobs=-1, random_state=1,
                        alpha=0.00000001, tol=1e-9, early_stopping=False,
                        verbose=1, n_iter_no_change=40)

    clf.fit(X_train_features, Y_train)
    print('=== Training Set Performance ===')
    print(clf.score(X_train_features, Y_train))
    print(confusion_matrix(Y_train, clf.predict(X_train_features)))
    print(roc_auc_score(Y_train, clf.predict_proba(X_train_features)[:, 1]))
    print('=== Dev Set Performance ===')
    print(clf.score(X_dev_features, Y_dev))
Example #42
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=47)

# Average CV score on the training set was:0.7484686688913607
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.7000000000000001, kernel="polynomial", n_components=8),
    RandomForestClassifier(bootstrap=False,
                           criterion="entropy",
                           max_features=0.45,
                           min_samples_leaf=6,
                           min_samples_split=4,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)




start_time = time.time() 
RFF = RBFSampler(gamma=1,n_components= int(num_of_samples*sampling_percentage))
V = RFF.fit_transform(X)
RFF_estimated_kernel = V.dot(V.T)
print("--- RFF Time : %s seconds ---" % (time.time() - start_time))




start_time = time.time() 
N = Nystroem(gamma=1,n_components= int(num_of_samples*sampling_percentage))
V = N.fit_transform(X)
estimated_kernel = V.dot(V.T)
print("--- Nystrom Time : %s seconds ---" % (time.time() - start_time))


start_time = time.time() 
real_kernel = sklearn.metrics.pairwise.rbf_kernel(X, gamma=1)
print("--- Real Time : %s seconds ---" % (time.time() - start_time))



print(estimated_kernel[0:5, 0:5])
print('\n\n')
print(real_kernel[0:5, 0:5])
print('\n\n')
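

# Optional follow-up (not in the original snippet): quantify how close each
# approximation is to the exact kernel via a relative Frobenius-norm error
# (assumes numpy is available as np, as elsewhere in this script).
rff_error = np.linalg.norm(RFF_estimated_kernel - real_kernel) / np.linalg.norm(real_kernel)
nystroem_error = np.linalg.norm(estimated_kernel - real_kernel) / np.linalg.norm(real_kernel)
print("RFF relative error:", rff_error)
print("Nystroem relative error:", nystroem_error)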
Example #44
from zipfile import ZipFile

# Load Data
train_data = pd.read_csv(ZipFile('data/processed/train.zip').open('train.csv'))
X_train = train_data.drop([
    'Category', '2010-2012', '0600-1759', 'PdDistrict_TARAVAL',
    'Patrol_Division', 'Polar_Rho', 'Polar_Phi', 'X_R30', 'Y_R30', 'X_R60',
    'Y_R60', 'XY_PCA1', 'XY_PCA2'
],
                          axis=1)
category = pd.factorize(train_data['Category'], sort=True)
y_train = category[0]

# RBF Kernel Approximation (Nystroem Approximation)
feature_map_nystroem = Nystroem(gamma=1e-4,
                                n_components=1000,
                                random_state=2019)

# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=2019)
calibrated_LinearSVC_SGD = CalibratedClassifierCV(
    base_estimator=SGDClassifier(), method='sigmoid', cv=2)

parameters = {
    'calibratedclassifiercv__base_estimator__alpha': [1e-4, 1e-3, 1e-2],
    'calibratedclassifiercv__base_estimator__class_weight': [None, 'balanced'],
    'calibratedclassifiercv__base_estimator__fit_intercept': [True, False],
    'calibratedclassifiercv__base_estimator__max_iter': [100000],
    'calibratedclassifiercv__base_estimator__random_state': [2019]
}
pipe = make_pipeline(StandardScaler(), feature_map_nystroem,
                     calibrated_LinearSVC_SGD)
Example #45
    def _dpp_sel(self, X_, y=None):
        """
        DPP only relies on X. 
        
        We will condition the sampling based on:
        *  `self.coef_info['cols']`
        
        After sampling it will go ahead and then perform grouped wilcoxon selection.
        """
        X = np.array(X_)
        cols_to_index = [
            idx for idx, x in enumerate(X_.columns)
            if x in self.coef_info['cols']
        ]
        unseen_cols_to_index = [
            idx for idx, x in enumerate(X_.columns)
            if x not in self.coef_info['cols']
        ]
        if X.shape[0] < 1000:
            feat_dist = rbf_kernel(X.T)
        else:
            feat_dist = Nystroem().fit_transform(X.T)
        #feat_dist = np.nan_to_num(feat_dist)
        unseen_kernel = feat_dist[
            unseen_cols_to_index, :][:, unseen_cols_to_index]
        #print(unseen_kernel.shape)
        self._dpp_estimate_k(unseen_kernel)
        k = self.dpp_k['pca']  # - len(self.coef_info['cols'])
        """
        if k < 1:
            # this means k is possibly negative, reevaluate k based only on new incoming feats!
            self.unseen_only = True            
            #k = max(self._dpp_estimate_k(unseen_kernel), int(unseen_kernel.shape[0] * 0.5)+1)            
            k = unseen_kernel.shape[0]
            #print("Unseen only")
            #print(k)
        """
        feat_index = []
        while len(feat_index) == 0:
            if len(self.coef_info['cols']) == 0:
                feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
            else:
                feat_index = sample_conditional_dpp(feat_dist,
                                                    cols_to_index,
                                                    k=k)
            feat_index = [x for x in feat_index if x is not None]

        # select features using an entropy measure
        # how can we order features from most to least relevant first?
        # we could do it using an F-test? Or otherwise - presume DPP selects the best one first
        """
        feat_entropy = []
        excl_entropy = []
        X_sel = X[:, feat_index]
        
        for idx, feat in enumerate(X_sel.T):
            if len(feat_entropy) == 0:
                feat_entropy.append(idx)
                continue
            if entropy(X_sel[:, feat_entropy]) > entropy(X_sel[:, feat_entropy+[idx]]):
                feat_entropy.append(idx)
            else:
                excl_entropy.append(idx)
        """
        # iterate over feat_index to determine
        # information on wilcoxon test
        # as the feat index are already "ordered" as that is how DPP would
        # perform the sampling - we will do the single pass in the same
        # way it was approached in the OGFS
        # feat index will have all previous sampled columns as well...

        if not self.unseen_only:
            feat_check = []
            excl_check = []
            X_sel = X[:, feat_index]

            for idx, feat in enumerate(X_sel.T):
                if len(feat_check) == 0:
                    feat_check.append(idx)
                    continue
                wilcoxon_pval = wilcoxon_group(X_sel[:, feat_check], feat)
                #print("\tWilcoxon: {}".format(wilcoxon_pval))
                if wilcoxon_pval < self.intragroup_alpha:
                    feat_check.append(idx)
                else:
                    excl_check.append(idx)
            index_to_col = [
                col for idx, col in enumerate(X_.columns) if idx in feat_check
            ]
        else:
            # if we are considering unseen only, we will simply let the regulariser
            # act on it, sim. to grafting.
            index_to_col = [
                col for idx, col in enumerate(X_.columns) if idx in feat_index
            ]
        self.unseen_only = False  # perhaps add more conditions around unseen - i.e. once unseen condition kicks in, it remains active?
        self.coef_info['cols'] = list(
            set(self.coef_info['cols'] + index_to_col))
        col_rem = X_.columns.difference(self.coef_info['cols'])
        # update column exclusion...
        self.coef_info['excluded_cols'] = [
            x for x in self.coef_info['excluded_cols']
            if x not in self.coef_info['cols']
        ]
        self.add_column_exclusion(col_rem)
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, PolynomialFeatures
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: 1.0
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    Nystroem(gamma=0.05, kernel="poly", n_components=7),
    OneHotEncoder(minimum_fraction=0.05, sparse=False), GaussianNB())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #47
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=1)

# Average CV score on the training set was: 0.7259130434782607
exported_pipeline = make_pipeline(
    Nystroem(gamma=1.0, kernel="cosine", n_components=7),
    RandomForestClassifier(bootstrap=False,
                           criterion="gini",
                           max_features=0.15000000000000002,
                           min_samples_leaf=13,
                           min_samples_split=12,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
pipeline_optimizer.fit(yval_est, yval)
print(pipeline_optimizer.score(yval_est, yval))
pipeline_optimizer.export('tpot_exported_pipeline.py')

#%%

exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            PolynomialFeatures(degree=2,
                               include_bias=False,
                               interaction_only=False),
            StackingEstimator(
                estimator=RandomForestRegressor(bootstrap=True,
                                                max_features=0.25,
                                                min_samples_leaf=17,
                                                min_samples_split=8,
                                                n_estimators=100)),
            StackingEstimator(estimator=AdaBoostRegressor(
                learning_rate=1.0, loss="exponential", n_estimators=100)),
            Nystroem(gamma=0.1, kernel="additive_chi2", n_components=7)),
        FunctionTransformer(copy)), ElasticNetCV(l1_ratio=0.9, tol=0.01))
exported_pipeline.fit(yval_est, yval)
results = exported_pipeline.predict(predictions)

submission = pd.DataFrame(index=Test.index,
                          columns=['seg_id', 'time_to_failure'])
submission['seg_id'] = Test['seg_id'].values
submission['time_to_failure'] = results
submission.to_csv('submission.csv', index=False)
Example #49
    def _nystrom(self, ind, X):
        # 'Nystrom'      : 'f(0.01,0.5)',
        return Nystroem(n_components=self._ncomp(ind, X)).fit_transform(X)
Example #50
# full dataset classification
from sklearn import pipeline, svm
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.model_selection import train_test_split

X_data = images / 255.0
Y = targets

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_data, Y, test_size=0.15, random_state=42)

# Create a classifier: a support vector classifier
kernel_svm = svm.SVC(gamma=.2)
linear_svm = svm.LinearSVC()

# create pipeline from kernel approximation
# and linear svm
feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
feature_map_nystroem = Nystroem(gamma=.2, random_state=1)

fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
                                        ("svm", svm.LinearSVC())])

nystroem_approx_svm = pipeline.Pipeline([("feature_map", feature_map_nystroem),
                                        ("svm", svm.LinearSVC())])

# fit and predict using linear and kernel svm:

import datetime as dt
# We learn the digits on train part

kernel_svm_start_time = dt.datetime.now()
print('Start kernel svm learning at {}'.format(str(kernel_svm_start_time)))
kernel_svm.fit(X_train, y_train)
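The excerpt stops after fitting the exact-kernel SVC; a sketch of how the two approximate-kernel pipelines might be fitted and compared on the held-out split, reusing the objects defined above:

kernel_svm_time = dt.datetime.now() - kernel_svm_start_time
print('Exact kernel SVM: score=%.4f, fit time=%s'
      % (kernel_svm.score(X_test, y_test), kernel_svm_time))

for name, approx_svm in [('Fourier', fourier_approx_svm),
                         ('Nystroem', nystroem_approx_svm)]:
    start = dt.datetime.now()
    approx_svm.fit(X_train, y_train)
    elapsed = dt.datetime.now() - start
    print('%s approximation SVM: score=%.4f, fit time=%s'
          % (name, approx_svm.score(X_test, y_test), elapsed))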
Example #51
import numpy as np
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import Ridge
from sklearn.utils import check_random_state


class KernelRegressionPolicy(UpperLevelPolicy):
    """Linear policy in approximated kernel space.

    A linear policy in kernel space is learned. In order to keep computation
    and risk of overfitting limited, a low-dimensional approximation of the
    kernel space is used, which is determined by the Nystroem approximation.
    Thus, an explicit feature map is learned based on the training data. This
    has the advantage compared to predefined feature maps that the features
    are adaptive.

    Parameters
    ----------
    weight_dims: int
        dimensionality of weight vector of lower-level policy

    context_dims: int
        dimensionality of context vector

    kernel : string or callable (default: "rbf")
        Kernel map to be approximated. A callable should accept two
        arguments and the keyword arguments passed to this object as
        kernel_params, and should return a floating point number.

    gamma : float (default: None)
        Gamma parameter for the RBF, polynomial, exponential chi2 and sigmoid
        kernels. Interpretation of the default value is left to the kernel;
        see the documentation for sklearn.metrics.pairwise. Ignored by
        other kernels.

    coef0 : float (default: 1.5)
        The coef0 parameter for the kernels. Interpretation of the value
        is left to the kernel; see the documentation for
        sklearn.metrics.pairwise. Ignored by other kernels.

    n_components: int (default: 20)
        The number of components used in the Nystroem approximation of the
        kernel

    covariance_scale: float (default: 1.0)
        the covariance is initialized to numpy.eye(weight_dims) *
        covariance_scale.

    alpha: float (default: 0.0)
        Controlling the L2-regularization in the ridge regression for
        learning of the policy's weights

    bias: bool (default: True)
        Whether a constant bias dimension is added to the approximated kernel
        space. This allows learning offsets more easily.

    normalize: bool (default: True)
        Whether the activations in the approximated kernel space are normalized.
        This should improve generalization beyond the boundaries of the
        observed context space.

    random_state : optional, int
        Seed for the random number generator.

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2014/11/20
    """

    def __init__(
        self,
        weight_dims,
        context_dims,
        kernel="rbf",
        gamma=None,
        coef0=1.5,
        n_components=20,
        covariance_scale=1.0,
        alpha=0.0,
        bias=True,
        normalize=True,
        random_state=None,
    ):
        self.weight_dims = weight_dims
        self.context_dims = context_dims
        self.kernel = kernel
        self.gamma = gamma
        self.coef0 = coef0
        self.n_components = n_components
        self.alpha = alpha
        self.bias = bias
        self.normalize = normalize

        self.Sigma = np.eye(weight_dims) * covariance_scale

        self.random_state = check_random_state(random_state)

    def __call__(self, context, explore=True):
        """Evaluates policy for given contexts.

        Samples weight vector from distribution if explore is true, otherwise
        return the distribution's mean (which depends on the context).

        Parameters
        ----------
        contexts: array-like, [n_contexts, context_dims]
            context vector

        explore: bool
            if true, weight vector is sampled from distribution. otherwise the
            distribution's mean is returned
        """
        X = self.nystroem.transform(context)
        if self.bias:
            X = np.hstack((X, np.ones((X.shape[0], 1))))
        if self.normalize:
            X /= np.abs(X).sum(1)[:, None]

        mean = np.dot(X, self.W.T)
        if not explore:
            return mean[0]
        else:
            sample_func = lambda x: self.random_state.multivariate_normal(x, self.Sigma, size=[1])[0]
            samples = np.apply_along_axis(sample_func, 1, mean)[0]
            return samples

    def fit(self, X, Y, weights=None, context_transform=True):
        """ Trains policy by weighted maximum likelihood.

        .. note:: This call changes this policy (self)

        Parameters
        ----------
        X: array-like, shape (n_samples, context_dims)
            Context vectors

        Y: array-like, shape (n_samples, weight_dims)
            Low-level policy parameter vectors

        weights: array-like, shape (n_samples,)
            Weights of individual samples (should depend on the obtained
            reward)
        """
        # Kernel approximation
        self.nystroem = Nystroem(
            kernel=self.kernel,
            gamma=self.gamma,
            coef0=self.coef0,
            n_components=np.minimum(X.shape[0], self.n_components),
            random_state=self.random_state,
        )
        self.X = self.nystroem.fit_transform(X)
        if self.bias:
            self.X = np.hstack((self.X, np.ones((self.X.shape[0], 1))))
        if self.normalize:
            self.X /= np.abs(self.X).sum(1)[:, None]

        # Standard ridge regression
        ridge = Ridge(alpha=self.alpha, fit_intercept=False)
        ridge.fit(self.X, Y, weights)
        self.W = ridge.coef_
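A minimal usage sketch for this policy (hypothetical dimensions and random data, only to show the call pattern):

rng = np.random.RandomState(0)
contexts = rng.uniform(-1, 1, (50, 2))            # n_samples x context_dims
weights = np.hstack((contexts, contexts ** 2))    # n_samples x weight_dims
sample_weights = np.ones(len(contexts))

policy = KernelRegressionPolicy(weight_dims=4, context_dims=2,
                                n_components=10, alpha=1e-5, random_state=0)
policy.fit(contexts, weights, sample_weights)
mean_weights = policy(contexts[:1], explore=False)    # deterministic mean for one context
sampled_weights = policy(contexts[:1], explore=True)  # sample from the policy distribution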
Example #52
class Polykernel(Model):
    def __init__(self,
                 model=None,
                 cv=False,
                 multi_threaded=True,
                 upsample=True):
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.X_preds = None
        self.svm_models = None

        self.upsample = upsample
        self.model = model
        self.cv = (cv > 1) * cv
        self.multi_threaded = multi_threaded
        if model is None:
            #self.model = LogisticRegression(n_jobs=1, penalty='l1', random_state=1337, max_iter=10 ** 3, solver='saga')
            # self.model = SVC(kernel='poly', degree=2, random_state=1337)
            self.model = SGDClassifier(power_t=0.4,
                                       n_jobs=1,
                                       max_iter=10**5,
                                       learning_rate="invscaling",
                                       eta0=1,
                                       penalty='l1',  # SGDClassifier expects lowercase penalty names
                                       n_iter_no_change=10,
                                       random_state=1337)
        from sklearn.kernel_approximation import Nystroem
        self.feature_map_nystroem = Nystroem(kernel='poly',
                                             degree=2,
                                             gamma=.2,
                                             random_state=1337,
                                             n_components=300)

    def train(self, X, y):
        self.X_train = self.feature_map_nystroem.fit_transform(X)
        self.y_train = y
        self.model.fit(self.X_train, y)

    def predict(self, X_test):
        # use transform(), not fit_transform(): the map must stay fitted on the training data
        self.X_test = self.feature_map_nystroem.transform(X_test)
        self.X_preds = self.model.predict(self.X_test)
        return self.X_preds

    def get_importances(self):
        return [0] * self.X_train.shape[1]

    def __call__(self, X, y):
        if self.cv:
            return self.cross_validate(X, y)
        else:
            return self.single_run(X, y)

    def set_weight(self, weight):
        self.model.class_weight = weight

    def get_all_importances(self):
        if self.svm_models is None:  # not running CV but running the model 1 time
            return [[0] * self.X_train.shape[1]]

        coeffs = []
        for m in self.svm_models:
            coeffs.append(m.model.coef_[0])
        return coeffs
import itertools
import sys

import pandas as pd
from sklearn.cluster import FeatureAgglomeration
from sklearn.decomposition import FastICA, PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import (RFE, SelectFromModel, SelectFwe,
                                        SelectKBest, SelectPercentile,
                                        VarianceThreshold)
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.preprocessing import (Binarizer, MaxAbsScaler, MinMaxScaler,
                                   Normalizer, PolynomialFeatures,
                                   RobustScaler, StandardScaler)

dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
    RFE(estimator=ExtraTreesClassifier(n_estimators=100))
]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip',
                         sep='\t').sample(frac=1.,
                                          replace=False,
                                          random_state=42)
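The excerpt ends after loading the data; one plausible continuation (a sketch, not the original script) pairs each preprocessor with a simple downstream classifier and cross-validates the combination. The 'class' label column is an assumption about the input file:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

labels = input_data['class'].values          # assumed label column name
features = input_data.drop('class', axis=1).values

for preprocessor in preprocessor_list:
    pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
    scores = cross_val_score(pipe, features, labels, cv=5)
    print('%s: %.3f' % (preprocessor.__class__.__name__, scores.mean()))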
Example #54
import numpy as np
from sklearn.kernel_approximation import Nystroem


class ParametricModelApproximation(object):
    """Approximate a Gaussian Process by a parametric model.

    Approximating a Gaussian Process by a parametric model can be useful if
    one has to evaluate a sample function from the GP repeatedly or on many
    evaluation points as this would become computationally very expensive
    with a GP.

    Parameters
    ----------
    model : GaussianProcessRegressor
        The Gaussian Process which is to be approximated

    bounds: list of pair of floats
        The boundaries of the data space. This is used when determining the
        features of the parametric approximation (they are centered at random
        points in the data space)

    n_components: int
        The number of features/parameters of the parametric model

    seed: int
        The seed of the random number generator
    """
    def __init__(self, model, bounds, n_components, seed):
        self.gp = model
        self.bounds = bounds
        self.n_components = n_components
        self.rng = np.random.RandomState(seed)

        self.X_space = self.rng.uniform(self.bounds[:, 0], self.bounds[:, 1],
                                        (1000, self.bounds.shape[0]))

        assert self.gp.X_fit_.shape[1] == self.X_space.shape[1]

        self.kernel = self.gp.kernel_
        self.nystr = Nystroem(
            n_components=min(self.n_components, self.X_space.shape[0]),
            kernel='precomputed', random_state=self.rng)
        self.nystr.fit(self.kernel(self.X_space))

    def determine_coefs(self, X_query=None, y_query_samples=None, n_samples=1):
        """ Determine coefficients of parametric model.

        Simulate an evaluation at X_query with outcomes y_query_samples.
        Determine coefficients of parametric model the updated GP.

        Parameters
        ----------
        X_query : ndarray-like, default: None
            The query point at which an additional evaluation is simulated.
            If None, a parametric approximation of the unmodified GP is
            returned.

        y_query_samples: ndarray-like, default: None
            The possible outcomes of a query at X_query.

        n_samples: int
            The number of independent samples of model coefficients from the
            Bayesian posterior over model coefficients
        """
        if X_query is not None:
            X_query = np.asarray(X_query)
            X_queried = np.vstack((self.gp.X_fit_, X_query))
        else:
            X_queried = self.gp.X_fit_
            y_queried = self.gp.y_fit_

        Phi = self.nystr.transform(self.kernel(self.X_space, X_queried))
        A = Phi.T.dot(Phi) + self.gp.alpha * np.eye(Phi.shape[1])
        A_inv = np.linalg.inv(A)

        cov = self.gp.alpha * A_inv

        coefs = \
            np.empty((n_samples, self.n_components, y_query_samples.shape[0]))
        for i in range(y_query_samples.shape[0]): # XXX: Vectorize
            y_queried = np.hstack((self.gp.y_fit_, y_query_samples[i]))
            mean = A_inv.dot(Phi.T).dot(y_queried)
            coefs[:, :, i] = self.rng.multivariate_normal(mean, cov, n_samples)
        return np.array(coefs)

    def __call__(self, X, coefs):
        """ Evaluate parametric model at X for the given sampled coefficients.

        Parameters
        ----------
        X : ndarray-like
            The points at which the parametric model is to be evaluated

        coefs: ndarray-like
            The coefficients of the parametric model.
        """
        X = np.atleast_2d(X)

        Phi = self.nystr.transform(self.kernel(self.X_space, X))
        f = Phi.dot(coefs)
        return f
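The idea behind this class, independent of its exact GP interface, can be sketched directly: map inputs to explicit Nystroem features and fit a Bayesian linear model whose posterior weight samples give cheap approximate sample functions. A standalone sketch on toy data (not a call into the class above):

import numpy as np
from sklearn.kernel_approximation import Nystroem

rng = np.random.RandomState(0)
X = rng.uniform(0, 1, (30, 1))
y = np.sin(6 * X).ravel() + 0.1 * rng.randn(30)
alpha = 0.1  # observation noise / regularization strength

nystr = Nystroem(kernel='rbf', gamma=10.0, n_components=15, random_state=0)
Phi = nystr.fit_transform(X)                       # explicit feature map
A = Phi.T.dot(Phi) + alpha * np.eye(Phi.shape[1])
A_inv = np.linalg.inv(A)
mean = A_inv.dot(Phi.T).dot(y)                     # posterior mean of the weights
cov = alpha * A_inv                                # posterior covariance of the weights

coefs = rng.multivariate_normal(mean, cov, size=5)    # 5 approximate sample functions
X_eval = np.linspace(0, 1, 20)[:, None]
f_samples = nystr.transform(X_eval).dot(coefs.T)      # evaluate them cheaply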
Example #55
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test


from sklearn.dummy import DummyClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

ESTIMATORS = {
    "dummy":
    DummyClassifier(),
    'CART':
    DecisionTreeClassifier(),
    'ExtraTrees':
    ExtraTreesClassifier(n_estimators=100),
    'RandomForest':
    RandomForestClassifier(n_estimators=100),
    'Nystroem-SVM':
    make_pipeline(Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
    'SampledRBF-SVM':
    make_pipeline(RBFSampler(gamma=0.015, n_components=1000),
                  LinearSVC(C=100)),
    'LogisticRegression-SAG':
    LogisticRegression(solver='sag', tol=1e-1, C=1e4),
    'LogisticRegression-SAGA':
    LogisticRegression(solver='saga', tol=1e-1, C=1e4),
    'MultilayerPerceptron':
    MLPClassifier(hidden_layer_sizes=(100, 100),
                  max_iter=400,
                  alpha=1e-4,
                  solver='sgd',
                  learning_rate_init=0.2,
                  momentum=0.9,
                  verbose=1),
}