Example #1
def test_2d_to_1d():
    min_x, max_x, gradient, intercept, n_samples = 10, 30, 1.5, 20, 300
    X_1 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    X_2_exact = gradient * X_1 + intercept
    flat_noise = np.random.standard_normal(size=(n_samples, 1))
    X_2_noise = flat_noise * (10 - X_1) * (X_1 - 30) / 40
    X_2 = X_2_exact + X_2_noise
    X_train = np.concatenate([X_1, X_2], axis=1)

    pca_model = PCA(n_components=1)
    pca_model.fit(features=X_train)

    assert pca_model.n_components_ == 1
    assert pca_model.n_samples_ == n_samples
    assert pca_model.n_features_ == 2
    assert pca_model.explained_variance_.shape == (1, )
    assert pca_model.explained_variance_[0] >= 80
    assert pca_model.explained_variance_ratio_.shape == (1, )
    assert pca_model.explained_variance_ratio_[0] >= 0.98
    assert pca_model.components_.shape == (1, 2)

    principal = pca_model.components_[0]
    assert principal[1] / principal[0] == pytest.approx(gradient, abs=0.2)
    assert principal[1]**2 + principal[0]**2 == pytest.approx(1)

    X_transform = pca_model.transform(features=X_train)
    assert X_transform.shape == (n_samples, 1)
    assert np.mean(X_transform) == pytest.approx(0)
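
The assertions above check that the leading component recovers the line's gradient and has unit length. For reference, here is a minimal numpy-only sketch (not the endochrone API) that extracts the same direction from the covariance matrix of similar synthetic data:

import numpy as np

# Minimal sketch: recover the leading principal direction of noisy 2-D data
# via an eigendecomposition of its covariance matrix.
rng = np.random.default_rng(0)
gradient, intercept, n_samples = 1.5, 20, 300
x1 = rng.uniform(10, 30, n_samples)
x2 = gradient * x1 + intercept + rng.standard_normal(n_samples)
X = np.column_stack([x1, x2])

cov = np.cov(X, rowvar=False)           # 2x2 covariance matrix
eigvals, eigvecs = np.linalg.eigh(cov)  # eigenvalues in ascending order
direction = eigvecs[:, -1]              # eigenvector of the largest eigenvalue

print(direction[1] / direction[0])      # close to the gradient (1.5)
print(eigvals[-1] / eigvals.sum())      # explained variance ratio of PC1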
Example #2
def pca_and_pair_plot():
    from endochrone.stats.scaling import FeatureScaling
    from endochrone.decomposition import PCA

    global x
    if test_fit:
        # TODO: make this part of PCA
        # We need to remove features with no variation, otherwise scaling & PCA
        # will fail
        features_to_keep = [
            i for i, feat in enumerate(x.T) if len(np.unique(feat)) > 1
        ]
        x = x[:, features_to_keep]

    # first we rescale to stop large floats dominating categories
    scale_model = FeatureScaling(method='z_score')
    scaled_x = scale_model.fit_and_transform(features=x)

    N = 3
    pcam_min = PCA(n_components=N)
    pcam_min.fit(features=scaled_x)
    pca_x = pcam_min.transform(features=scaled_x)
    labels = (list(range(N)) + ['species'])
    df = pd.DataFrame(np.hstack([pca_x, y[:, np.newaxis]]), columns=labels)
    sns.pairplot(df, hue='species', height=1.5)
    plt.show()
Example #3
def test_zero_components_specified():
    min_x, max_x, gradient, intercept, n_samples = 10, 30, 4, 20, 300
    X_1 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    X_2_exact = gradient * X_1 + intercept
    flat_noise = np.random.standard_normal(size=(n_samples, 1))
    X_2_noise = flat_noise * (10 - X_1) * (X_1 - 30) / 40
    X_2 = X_2_exact + X_2_noise
    X_train = np.concatenate([X_1, X_2], axis=1)

    pca_model = PCA()
    pca_model.fit(features=X_train)

    assert pca_model.n_components_ == 2
    assert pca_model.n_samples_ == n_samples
    assert pca_model.n_features_ == 2
    assert pca_model.explained_variance_.shape == (2, )
    assert pca_model.explained_variance_ratio_.shape == (2, )
    assert sum(pca_model.explained_variance_ratio_) == pytest.approx(1)
    assert pca_model.components_.shape == (2, 2)
Example #4
def iris_pca():
    from mpl_toolkits import mplot3d  # noqa: F401
    from endochrone.decomposition import PCA

    fig = plt.figure(facecolor="w", figsize=(14, 7))
    i_target = iris['target']

    pcam_2 = PCA(n_components=2)
    red_i_data_2 = pcam_2.fit_and_transform(features=i_data)
    var_sum_2 = np.abs(np.sum(pcam_2.explained_variance_ratio_))
    title_2 = '%s components, capturing %.4f%% variation' % (2, var_sum_2*100)

    X_2 = red_i_data_2[:, 0]
    Y_2 = red_i_data_2[:, 1]
    ax_2 = fig.add_subplot(1, 2, 1, title=title_2)
    ax_2.scatter(X_2, Y_2, c=i_target, s=3, marker='d', cmap='cool')

    # Compare to 3 component reduction
    pcam_3 = PCA(n_components=3)
    red_i_data_3 = pcam_3.fit_and_transform(features=i_data)
    var_sum_3 = np.abs(np.sum(pcam_3.explained_variance_ratio_))
    title_3 = '%s components, capturing %.4f%% variation' % (3, var_sum_3*100)

    X_3 = red_i_data_3[:, 0]
    Y_3 = red_i_data_3[:, 1]
    Z_3 = red_i_data_3[:, 2]
    ax_3 = fig.add_subplot(1, 2, 2, projection='3d', title=title_3)
    ax_3.scatter3D(X_3, Y_3, Z_3, c=i_target, s=3, marker='d', cmap='cool')

    plt.show()
Example #5
def test_6d_to_2d():
    n_samples = 300
    min_x, max_x, gradient_1, gradient_2, intercept = 10, 30, 1.5, 0.6, 20
    X_1 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    X_2 = np.random.uniform(min_x, max_x, n_samples)[:, np.newaxis]
    noise = 5 * np.random.standard_normal(size=(n_samples, 1))
    X_3 = gradient_1 * X_1 + intercept + noise
    X_4 = gradient_2 * X_2 + intercept + noise
    X_5 = X_3 + X_2
    X_6 = X_4 + X_1
    X_train = np.concatenate([X_1, X_2, X_3, X_4, X_5, X_6], axis=1)

    pca_model = PCA(n_components=2)
    pca_model.fit(features=X_train)

    assert pca_model.n_components_ == 2
    assert pca_model.n_samples_ == n_samples
    assert pca_model.n_features_ == 6
    assert pca_model.explained_variance_.shape == (2, )
    assert pca_model.explained_variance_ratio_.shape == (2, )
    assert np.abs(np.sum(pca_model.explained_variance_ratio_)) > 0.92
    assert pca_model.components_.shape == (2, 6)
Example #6
def test_accuracy_of_inversion():
    n_features, n_samples = 6, 300
    X_train = np.random.rand(n_samples, n_features)

    pcam = PCA(n_components=n_features)
    pcam.fit(features=X_train)

    act = pcam.reverse(features=pcam.transform(features=X_train))
    assert np.all(act == pytest.approx(X_train))
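
The round trip above works because all n_features components are kept, so the projection loses nothing. A small numpy sketch of the same idea (the endochrone internals may differ):

import numpy as np

# Sketch: with every component retained, the PCA transform is an orthogonal
# change of basis, so projecting and then reversing recovers the data exactly.
rng = np.random.default_rng(0)
X = rng.random((300, 6))

mean = X.mean(axis=0)
_, _, Vt = np.linalg.svd(X - mean, full_matrices=False)
W = Vt                                   # all 6 components, shape (6, 6)

transformed = (X - mean) @ W.T           # forward transform
reconstructed = transformed @ W + mean   # inverse transform
print(np.allclose(reconstructed, X))     # True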
Example #7
def with_pca():
    from endochrone.stats.scaling import FeatureScaling
    from endochrone.decomposition import PCA

    global x
    if test_fit:
        # TODO: make this part of PCA
        # We need to remove features with no variation, otherwise scaling & PCA
        # will fail
        features_to_keep = [
            i for i, feat in enumerate(x.T) if len(np.unique(feat)) > 1
        ]
        x = x[:, features_to_keep]

    # first we rescale to stop large floats dominating categories
    scale_model = FeatureScaling(method='z_score')
    scaled_x = scale_model.fit_and_transform(features=x)

    # Then build a test PCA model so we can figure out how many components we
    # want
    pcam_test = PCA()
    pcam_test.fit(features=scaled_x)
    cutoff = 0.97  # i.e. we want to retain this % of variance
    # TODO: want to be able to do this with a single PCA model
    # +1: argmax gives the index of the first component that crosses the cutoff
    cum_var = np.cumsum(pcam_test.explained_variance_ratio_)
    n_comp = np.argmax(cum_var > cutoff) + 1

    # Optionally see how our variance increases as n_comp increases
    show_pca_variances = False
    if show_pca_variances:
        plt.plot(range(len(cum_var)), cum_var)
        plt.show()

    # Transform according to the above PCA
    pcam_forest = PCA(n_components=n_comp)
    pca_x = pcam_forest.fit_and_transform(features=scaled_x)

    xtrain, xtest, ytrain, ytest = train_test_split(pca_x, y)

    # Now build our RF model and fit it
    t0 = time.process_time()
    forest_model = RandomForest(64, max_tree_depth=15)
    forest_model.fit(xtrain, ytrain, debug=True)
    t1 = time.process_time()
    print('train time: ', t1 - t0)

    # print(forest_model.trees[0])
    ypred = forest_model.predict(xtest)
    metrics = mcm(ytest, ypred)
    print('\nmetrics for test set')
    print('macro_precision', metrics.macro_precision)
    print('macro_recall', metrics.macro_recall)
    print('macro_f1', metrics.macro_f1_score)
    print('micro_f1', metrics.micro_f1_score)

    train_pred = forest_model.predict(xtrain)
    metrics = mcm(ytrain, train_pred)
    print('\nmetrics for training set')
    print('macro_precision', metrics.macro_precision)
    print('macro_recall', metrics.macro_recall)
    print('macro_f1', metrics.macro_f1_score)
    print('micro_f1', metrics.micro_f1_score)
    """Example output:
Example #8
    # NOTE: the original snippet begins mid-call; the plt.subplots(...) opening
    # below is an assumed reconstruction (grid shape and figsize are guesses).
    fig, axes = plt.subplots(8, 8, figsize=(6, 6),
                             subplot_kw=dict(xticks=[], yticks=[]),
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))

    for i, ax in enumerate(axes.flat):
        ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
        ax.text(0.05,
                0.05,
                str(digits.target[i]),
                transform=ax.transAxes,
                color='g')
    plt.show()

Xtrain, Xtest, Ytrain, Ytest = train_test_split(digits.data, digits.target)

# Test run to figure out how many components we should keep
# TODO should be able to do this with a single model
pcam_test = PCA()
pcam_test.fit(features=Xtrain)
cutoff = 0.97  # i.e. we want to retain this % of variance
# +1: argmax gives the index of the first component that crosses the cutoff
n_comp = np.argmax(np.cumsum(pcam_test.explained_variance_ratio_) > cutoff) + 1

# Now reduce our features with this PCA model
pcam = PCA(n_components=n_comp)
pca_Xtrain = pcam.fit_and_transform(features=Xtrain)
pca_Xtest = pcam.transform(features=Xtest)

# Try KNN to see if we're any good at classifying
knn_model = KNearest()
knn_model.fit(features=pca_Xtrain, targets=Ytrain)
ypred = knn_model.predict(features=pca_Xtest)
acc = accuracy_score(Ytest, ypred) * 100
print("cut-off: %s \nn_comp: %s \naccuracy: %0.4f%%" % (cutoff, n_comp, acc))