def test_2d_to_1d():
    min_x, max_x, gradient, intercept, n_samples = 10, 30, 1.5, 20, 300
    X_1 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    X_2_exact = gradient * X_1 + intercept
    flat_noise = np.random.standard_normal(size=(n_samples, 1))
    X_2_noise = flat_noise * (10 - X_1) * (X_1 - 30) / 40
    X_2 = X_2_exact + X_2_noise
    X_train = np.concatenate([X_1, X_2], axis=1)

    pca_model = PCA(n_components=1)
    pca_model.fit(features=X_train)
    assert pca_model.n_components_ == 1
    assert pca_model.n_samples_ == n_samples
    assert pca_model.n_features_ == 2
    assert pca_model.explained_variance_.shape == (1, )
    assert pca_model.explained_variance_[0] >= 80
    assert pca_model.explained_variance_ratio_.shape == (1, )
    assert pca_model.explained_variance_ratio_[0] >= 0.98
    assert pca_model.components_.shape == (1, 2)

    # The first principal component should be a unit vector pointing along
    # the generating line, so the ratio of its entries approximates gradient
    principal = pca_model.components_[0]
    assert principal[1] / principal[0] == pytest.approx(gradient, abs=0.2)
    assert principal[1]**2 + principal[0]**2 == pytest.approx(1)

    X_transform = pca_model.transform(features=X_train)
    assert X_transform.shape == (n_samples, 1)
    assert np.mean(X_transform) == pytest.approx(0)
def pca_and_pair_plot():
    from endochrone.stats.scaling import FeatureScaling
    from endochrone.decomposition import PCA
    global x

    if test_fit:
        # TODO: make this part of PCA
        # We need to remove features with no variation, otherwise scaling &
        # PCA will fail
        features_to_keep = [i for i, feat in enumerate(x.T)
                            if len(np.unique(feat)) > 1]
        x = x[:, features_to_keep]

        # first we rescale to stop large floats dominating categories
        scale_model = FeatureScaling(method='z_score')
        scaled_x = scale_model.fit_and_transform(features=x)

        N = 3
        pcam_min = PCA(n_components=N)
        pcam_min.fit(features=scaled_x)
        pca_x = pcam_min.transform(features=scaled_x)

        labels = list(range(N)) + ['species']
        df = pd.DataFrame(np.hstack([pca_x, y[:, np.newaxis]]),
                          columns=labels)
        sns.pairplot(df, hue='species', height=1.5)
        plt.show()
def test_zero_components_specified():
    min_x, max_x, gradient, intercept, n_samples = 10, 30, 4, 20, 300
    X_1 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    X_2_exact = gradient * X_1 + intercept
    flat_noise = np.random.standard_normal(size=(n_samples, 1))
    X_2_noise = flat_noise * (10 - X_1) * (X_1 - 30) / 40
    X_2 = X_2_exact + X_2_noise
    X_train = np.concatenate([X_1, X_2], axis=1)

    # With no n_components specified, PCA should keep every feature
    pca_model = PCA()
    pca_model.fit(features=X_train)
    assert pca_model.n_components_ == 2
    assert pca_model.n_samples_ == n_samples
    assert pca_model.n_features_ == 2
    assert pca_model.explained_variance_.shape == (2, )
    assert pca_model.explained_variance_ratio_.shape == (2, )
    assert sum(pca_model.explained_variance_ratio_) == pytest.approx(1)
    assert pca_model.components_.shape == (2, 2)
def iris_pca():
    from mpl_toolkits import mplot3d  # noqa: F401
    from endochrone.decomposition import PCA

    fig = plt.figure(facecolor="w", figsize=(14, 7))
    i_target = iris['target']

    # 2 component reduction
    pcam_2 = PCA(n_components=2)
    red_i_data_2 = pcam_2.fit_and_transform(features=i_data)
    var_sum_2 = np.abs(np.sum(pcam_2.explained_variance_ratio_))
    title_2 = '%s components, capturing %.4f%% variation' % (2, var_sum_2*100)
    X_2 = red_i_data_2[:, 0]
    Y_2 = red_i_data_2[:, 1]
    ax_2 = fig.add_subplot(1, 2, 1, title=title_2)
    ax_2.scatter(X_2, Y_2, c=i_target, s=3, marker='d', cmap='cool')

    # Compare to 3 component reduction
    pcam_3 = PCA(n_components=3)
    red_i_data_3 = pcam_3.fit_and_transform(features=i_data)
    var_sum_3 = np.abs(np.sum(pcam_3.explained_variance_ratio_))
    title_3 = '%s components, capturing %.4f%% variation' % (3, var_sum_3*100)
    X_3 = red_i_data_3[:, 0]
    Y_3 = red_i_data_3[:, 1]
    Z_3 = red_i_data_3[:, 2]
    ax_3 = fig.add_subplot(1, 2, 2, projection='3d', title=title_3)
    ax_3.scatter3D(X_3, Y_3, Z_3, c=i_target, s=3, marker='d', cmap='cool')

    plt.show()
def test_6d_to_2d():
    n_samples = 300
    min_x, max_x, gradient_1, gradient_2, intercept = 10, 30, 1.5, 0.6, 20
    X_1 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    X_2 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    noise = 5 * np.random.standard_normal(size=(n_samples, 1))
    X_3 = gradient_1 * X_1 + intercept + noise
    X_4 = gradient_2 * X_2 + intercept + noise
    X_5 = X_3 + X_2
    X_6 = X_4 + X_1
    X_train = np.concatenate([X_1, X_2, X_3, X_4, X_5, X_6], axis=1)

    pca_model = PCA(n_components=2)
    pca_model.fit(features=X_train)
    assert pca_model.n_components_ == 2
    assert pca_model.n_samples_ == n_samples
    assert pca_model.n_features_ == 6
    assert pca_model.explained_variance_.shape == (2, )
    assert pca_model.explained_variance_ratio_.shape == (2, )
    assert np.abs(np.sum(pca_model.explained_variance_ratio_)) > 0.92
    assert pca_model.components_.shape == (2, 6)
def test_accuracy_of_inversion():
    n_features, n_samples = 6, 300
    X_train = np.random.rand(n_samples, n_features)
    pcam = PCA(n_components=n_features)
    pcam.fit(features=X_train)
    act = pcam.reverse(features=pcam.transform(features=X_train))
    assert np.all(act == pytest.approx(X_train))
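# For context: the projection and inversion that test_accuracy_of_inversion
# exercises can be sketched in plain NumPy via the covariance
# eigendecomposition. This is an illustrative sketch only, not the endochrone
# implementation; the function name is made up for this example and numpy is
# assumed to be imported as np, as elsewhere in these files.
def pca_roundtrip_sketch(X, n_components):
    # Centre the data and diagonalise its covariance matrix
    mean = X.mean(axis=0)
    centred = X - mean
    eigvals, eigvecs = np.linalg.eigh(np.cov(centred, rowvar=False))
    # eigh returns eigenvalues in ascending order; keep the top eigenvectors
    components = eigvecs[:, ::-1][:, :n_components]

    transformed = centred @ components            # project onto components
    restored = transformed @ components.T + mean  # map back to feature space

    # When n_components == n_features the components form an orthonormal
    # basis, so the round trip is exact up to floating point, which is what
    # the assertion above relies on.
    return transformed, restored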
def with_pca():
    from endochrone.stats.scaling import FeatureScaling
    from endochrone.decomposition import PCA
    global x

    if test_fit:
        # TODO: make this part of PCA
        # We need to remove features with no variation, otherwise scaling &
        # PCA will fail
        features_to_keep = [i for i, feat in enumerate(x.T)
                            if len(np.unique(feat)) > 1]
        x = x[:, features_to_keep]

        # first we rescale to stop large floats dominating categories
        scale_model = FeatureScaling(method='z_score')
        scaled_x = scale_model.fit_and_transform(features=x)

        # Then build a test PCA model so we can figure out how many
        # components we want
        pcam_test = PCA()
        pcam_test.fit(features=scaled_x)
        cutoff = 0.97  # i.e. we want to retain this fraction of variance
        # TODO: want to be able to do this with a single PCA model
        # +1 because argmax gives the 0-based index of the first cumulative
        # ratio above the cutoff
        n_comp = np.argmax(
            np.cumsum(pcam_test.explained_variance_ratio_) > cutoff) + 1

        # Optionally see how our variance increases as n_comp increases
        show_pca_variances = False
        if show_pca_variances:
            plt.plot(range(54),
                     np.cumsum(pcam_test.explained_variance_ratio_))
            plt.show()

        # Transform according to the above PCA
        pcam_forest = PCA(n_components=n_comp)
        pca_x = pcam_forest.fit_and_transform(features=scaled_x)
        xtrain, xtest, ytrain, ytest = train_test_split(pca_x, y)

        # Now build our RF model and fit it
        t0 = time.process_time()
        forest_model = RandomForest(64, max_tree_depth=15)
        forest_model.fit(xtrain, ytrain, debug=True)
        t1 = time.process_time()
        print('train time: ', t1 - t0)
        # print(forest_model.trees[0])

        ypred = forest_model.predict(xtest)
        metrics = mcm(ytest, ypred)
        print('\nmetrics for test set')
        print('macro_precision', metrics.macro_precision)
        print('macro_recall', metrics.macro_recall)
        print('macro_f1', metrics.macro_f1_score)
        print('micro_f1', metrics.micro_f1_score)

        train_pred = forest_model.predict(xtrain)
        metrics = mcm(ytrain, train_pred)
        print('\nmetrics for training set')
        print('macro_precision', metrics.macro_precision)
        print('macro_recall', metrics.macro_recall)
        print('macro_f1', metrics.macro_f1_score)
        print('micro_f1', metrics.micro_f1_score)

    """Example output:
gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(digits.target[i]),
            transform=ax.transAxes, color='g')
plt.show()

Xtrain, Xtest, Ytrain, Ytest = train_test_split(digits.data, digits.target)

# Test run to figure out how many components we should keep
# TODO should be able to do this with a single model
pcam_test = PCA()
pcam_test.fit(features=Xtrain)
cutoff = 0.97  # i.e. we want to retain this fraction of variance
# +1 because argmax gives the 0-based index of the first cumulative ratio
# above the cutoff
n_comp = np.argmax(
    np.cumsum(pcam_test.explained_variance_ratio_) > cutoff) + 1

# Now reduce our features with this PCA model
pcam = PCA(n_components=n_comp)
pca_Xtrain = pcam.fit_and_transform(features=Xtrain)
pca_Xtest = pcam.transform(features=Xtest)

# Try KNN to see if we're any good at classifying
knn_model = KNearest()
knn_model.fit(features=pca_Xtrain, targets=Ytrain)
ypred = knn_model.predict(features=pca_Xtest)
acc = accuracy_score(Ytest, ypred) * 100
print("cut-off: %s \nn_comp: %s \naccuracy: %0.4f%%" % (cutoff, n_comp, acc))
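# Both with_pca and the digits example above recompute n_comp from the
# cumulative explained-variance ratio. A small helper along these lines could
# factor that out; this is a hypothetical sketch, not part of the endochrone
# API, and assumes numpy is imported as np as in the examples above.
def n_components_for_variance(explained_variance_ratio, cutoff=0.97):
    # Smallest number of components whose cumulative ratio exceeds cutoff
    cumulative = np.cumsum(explained_variance_ratio)
    if cumulative[-1] <= cutoff:
        return len(cumulative)  # even all components don't reach the cutoff
    # argmax returns the 0-based index of the first True entry, hence the +1
    return int(np.argmax(cumulative > cutoff)) + 1

# e.g. n_comp = n_components_for_variance(pcam_test.explained_variance_ratio_)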