def test_polynomial_count_sketch(X, Y, gamma, degree, coef0):
    # test that PolynomialCountSketch approximates the polynomial
    # kernel on random data

    # compute exact kernel
    kernel = polynomial_kernel(X, Y, gamma=gamma, degree=degree, coef0=coef0)

    # approximate kernel mapping
    ps_transform = PolynomialCountSketch(
        n_components=5000, gamma=gamma, coef0=coef0, degree=degree, random_state=42
    )
    X_trans = ps_transform.fit_transform(X)
    Y_trans = ps_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.05  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close
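# A hedged, standalone sketch of the approximation the test above checks:
# compare the exact polynomial kernel with the CountSketch approximation on
# small random data. The shapes and parameter values here are illustrative
# assumptions, not the actual sklearn test fixtures.
import numpy as np
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.kernel_approximation import PolynomialCountSketch

rng = np.random.RandomState(42)
X_demo = rng.random_sample(size=(300, 50))
Y_demo = rng.random_sample(size=(300, 50))

exact = polynomial_kernel(X_demo, Y_demo, gamma=1.0, degree=2, coef0=0)
ps = PolynomialCountSketch(n_components=5000, gamma=1.0, degree=2, coef0=0,
                           random_state=42)
approx = ps.fit_transform(X_demo) @ ps.transform(Y_demo).T

print(np.abs(exact - approx).mean())  # small mean absolute error expected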
def test_polynomial_count_sketch_dense_sparse(gamma, degree, coef0):
    """Check that PolynomialCountSketch results are the same
    for dense and sparse input.
    """
    ps_dense = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_dense = ps_dense.fit_transform(X)
    Yt_dense = ps_dense.transform(Y)

    ps_sparse = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_sparse = ps_sparse.fit_transform(csr_matrix(X))
    Yt_sparse = ps_sparse.transform(csr_matrix(Y))

    assert_allclose(Xt_dense, Xt_sparse)
    assert_allclose(Yt_dense, Yt_sparse)
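# A hedged, self-contained sketch of the dense-vs-sparse equivalence checked
# above; the data shape and parameters are illustrative assumptions.
import numpy as np
from numpy.testing import assert_allclose
from scipy.sparse import csr_matrix
from sklearn.kernel_approximation import PolynomialCountSketch

X_demo = np.random.RandomState(0).rand(20, 10)

ps_a = PolynomialCountSketch(n_components=50, degree=2, random_state=42)
ps_b = PolynomialCountSketch(n_components=50, degree=2, random_state=42)

# Same random_state => identical sketch, whether the input is dense or sparse.
assert_allclose(ps_a.fit_transform(X_demo),
                ps_b.fit_transform(csr_matrix(X_demo)))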
def test_polynomial_count_sketch_raises_if_degree_lower_than_one(degree):
    with pytest.raises(ValueError, match=f'degree={degree} should be >=1.'):
        ps_transform = PolynomialCountSketch(degree=degree)
        ps_transform.fit(X, Y)
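# A hedged sketch of how the invalid-degree test above might be parametrized;
# the degree values and the tiny random input are illustrative assumptions.
import numpy as np
import pytest
from sklearn.kernel_approximation import PolynomialCountSketch


@pytest.mark.parametrize("degree", [-1, 0])
def test_degree_below_one_raises(degree):
    X_demo = np.random.RandomState(0).rand(10, 5)
    with pytest.raises(ValueError):
        PolynomialCountSketch(degree=degree).fit(X_demo)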
lsvm_score = 100 * lsvm.score(X_test, y_test)

# Evaluate kernelized SVM
ksvm = SVC(kernel="poly", degree=2, gamma=1.0).fit(X_train, y_train)
ksvm_score = 100 * ksvm.score(X_test, y_test)

# Evaluate PolynomialCountSketch + LinearSVM
ps_svm_scores = []
n_runs = 5

# To compensate for the stochasticity of the method, we make n_runs runs
for k in out_dims:
    score_avg = 0
    for _ in range(n_runs):
        ps_svm = Pipeline([
            ("PS", PolynomialCountSketch(degree=2, n_components=k)),
            ("SVM", LinearSVC()),
        ])
        score_avg += ps_svm.fit(X_train, y_train).score(X_test, y_test)
    ps_svm_scores.append(100 * score_avg / n_runs)

# Evaluate Nystroem + LinearSVM
ny_svm_scores = []
n_runs = 5

for k in out_dims:
    score_avg = 0
    for _ in range(n_runs):
        ny_svm = Pipeline([
            (
                "NY",
# features (precisely, 54^4). Thanks to :class:`PolynomialCountSketch`, we can
# condense most of the discriminative information of that feature space into a
# much more compact representation. We repeat the experiment several times to
# compensate for the stochastic nature of :class:`PolynomialCountSketch`.

n_runs = 3
for n_components in [250, 500, 1000, 2000]:
    ps_lsvm_time = 0
    ps_lsvm_score = 0
    for _ in range(n_runs):
        pipeline = Pipeline(
            steps=[
                (
                    "kernel_approximator",
                    PolynomialCountSketch(n_components=n_components, degree=4),
                ),
                ("linear_classifier", LinearSVC()),
            ]
        )

        start = time.time()
        pipeline.fit(X_train, y_train)
        ps_lsvm_time += time.time() - start
        ps_lsvm_score += 100 * pipeline.score(X_test, y_test)

    ps_lsvm_time /= n_runs
    ps_lsvm_score /= n_runs

    results[f"LSVM + PS({n_components})"] = {
        "time": ps_lsvm_time,
        "score": ps_lsvm_score,
# -------------------------------------------------------
# The new :class:`~sklearn.kernel_approximation.PolynomialCountSketch`
# approximates a polynomial expansion of a feature space when used with linear
# models, but uses much less memory than
# :class:`~sklearn.preprocessing.PolynomialFeatures`.

from sklearn.datasets import fetch_covtype
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.linear_model import LogisticRegression

X, y = fetch_covtype(return_X_y=True)
pipe = make_pipeline(
    MinMaxScaler(),
    PolynomialCountSketch(degree=2, n_components=300),
    LogisticRegression(max_iter=1000),
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=5000, test_size=10000, random_state=42
)
pipe.fit(X_train, y_train).score(X_test, y_test)

##############################################################################
# For comparison, here is the score of a linear baseline for the same data:

linear_baseline = make_pipeline(
    MinMaxScaler(), LogisticRegression(max_iter=1000)
)
linear_baseline.fit(X_train, y_train).score(X_test, y_test)
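# A hedged sketch contrasting the output sizes of the two approaches mentioned
# above: PolynomialFeatures materializes every monomial, while
# PolynomialCountSketch keeps only n_components sketched features. The demo
# data and component count here are illustrative assumptions.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_approximation import PolynomialCountSketch

X_demo = np.random.RandomState(0).rand(100, 54)

exact = PolynomialFeatures(degree=2).fit_transform(X_demo)
sketched = PolynomialCountSketch(
    degree=2, n_components=300, random_state=0
).fit_transform(X_demo)

print(exact.shape)     # (100, 1540): all degree<=2 monomials of 54 features
print(sketched.shape)  # (100, 300): fixed-size approximation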
is_all_zero = np.all(X_array == 0)
if is_all_zero:
    print('Array is all zeros')
else:
    print('Array is good')

choice_length = np.count_nonzero(~np.isnan(labels))
X, y = shuffle(X_array, labels)
X = X[:choice_length]
y = y[:choice_length].fillna(0)

scaler = MinMaxScaler(feature_range=(-1, 1))
mm = make_pipeline(MinMaxScaler(), Normalizer())
X = mm.fit_transform(X)

rbf_feature = RBFSampler(gamma=1.5, random_state=10)
ps = PolynomialCountSketch(degree=11, random_state=1)
X_rbf_features = rbf_feature.fit_transform(X)
X_poly_features = ps.fit_transform(X)

# We want to get a t-SNE embedding with 2 dimensions
n_components = 2
tsne = TSNE(n_components)
tsne_result = tsne.fit_transform(X_rbf_features)

locationFileName = os.path.join(
    figuresDestination,
    str(sorted(symbols)[symbolIdx]) + '_idx_' + str(idx) + 'date_'
    + str(dateIdx) + '_' + str(labelName) + '_tsne_rbf_kernelised.png')
fashion_scatter(tsne_result, y, locationFileName)

fig = plt.figure(figsize=(16, 9))