Example #1
from mlxtend.data import iris_data
from mlxtend.plotting import plot_pca_correlation_graph
from sklearn.decomposition import PCA


def test_pass_pca_corr_pca_out():
    X, y = iris_data()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    eigen = pca.explained_variance_

    plot_pca_correlation_graph(X,
                               variables_names=['1', '2', '3', '4'],
                               X_pca=X_pca,
                               explained_variance=eigen)
Example #2
def test_no_X_PCA_but_explained_variance():
    with pytest.raises(ValueError,
                       match='If `explained variance` is not None, the '
                       '`X_pca` values should not be `None`.'):

        X, y = iris_data()
        pca = PCA(n_components=2)
        pca.fit(X)
        eigen = pca.explained_variance_

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=None,
                                   explained_variance=eigen)
Example #3
def test_X_PCA_but_no_explained_variance():
    with pytest.raises(
            ValueError,
            match='If `X_pca` is not None, the `explained variance` '
            'values should not be `None`.'):

        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=None)
Example #4
def test_not_enough_components():
    s = (
        'Number of principal components must match the number of eigenvalues. Got 2 != 1'
    )
    with pytest.raises(ValueError, match=s):

        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        eigen = pca.explained_variance_

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=eigen[:-1])
Example #5
def pca(x, dataset, mod):
    """Reduce the numeric columns of ``x`` with PCA and plot a correlation circle.

    :param x: input DataFrame
    :param dataset: dataset name ('kidney-disease' or 'bank-note')
    :param mod: per-column type tags ('numeric' or 'modal'), aligned with x.columns
    :return: DataFrame/array of principal-component scores (with the modal
        columns re-attached for 'kidney-disease')
    """
    # Split the columns into numeric and modal (categorical) groups.
    tags = np.array(mod)
    xn = x.loc[:, tags == 'numeric']
    xm = x.loc[:, tags == 'modal']
    if dataset == 'kidney-disease':
        n = 10
        pca_instance = PCA(n_components=n)
        xp = pca_instance.fit_transform(xn)
        # explained_variance_ratio_ is relative to the total variance of the
        # data, so its sum is the true share captured by the n components.
        var = pca_instance.explained_variance_ratio_.sum() * 100
        print('The {} principal components are responsible for {:.2f}% of the variance'
              .format(n, var))
        feature_names = list(xn.columns)
        figure, correlation_matrix = plot_pca_correlation_graph(
            xn, feature_names, figure_axis_size=10)
        plt.savefig('Output/Images/PCA_for_dataset_{}'.format(dataset))
        plt.close()
        xp = pd.DataFrame(data=xp, index=xn.index)
        xp = pd.concat([xp, xm], axis=1)
        return xp
    elif dataset == 'bank-note':
        n = 2
        pca_instance = PCA(n_components=n)
        xp = pca_instance.fit_transform(x)
        var = pca_instance.explained_variance_ratio_.sum() * 100
        print('The {} principal components are responsible for {:.2f}% of the variance'
              .format(n, var))
        feature_names = ['variance', 'skewness', 'curtosis', 'entropy']
        figure, correlation_matrix = plot_pca_correlation_graph(
            x, feature_names, figure_axis_size=10)
        plt.savefig('Output/Images/PCA_for_dataset_{}'.format(dataset))
        plt.close()
        return xp
Example #6
g = sns.lmplot(x='PC1',
               y='PC2',
               hue='Case of flush',
               data=ml_data,
               fit_reg=False,
               scatter=True)
plt.show()

# Correlation circle
features_name = [
    'Blue 1-1', 'Blue 1-2', 'Green 1-1', 'Green 1-2', 'Red 1-1', 'Red 1-2',
    'Blue 2-1', 'Blue 2-2', 'Green 2-1', 'Green 2-2', 'Red 2-1', 'Red 2-2',
    'Case of flush'
]
fig, correlation_matrix = plot_pca_correlation_graph(x_norm,
                                                     features_name,
                                                     dimensions=(1, 2, 3, 4))
plt.show()

# K-means

# Elbow method (SSE = sum of squared errors)

sse = []
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_norm)
    sse.append(kmeans.inertia_)

kl = KneeLocator(range(1, 15), sse, curve='convex', direction='decreasing')
nbr_cluster = kl.elbow
Example #7
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from mlxtend.plotting import plot_pca_correlation_graph

#%% load and standardize data
iris = datasets.load_iris()
X = iris.data
X_std = StandardScaler().fit_transform(X)

#%% specify feature names
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']

#%%
# calculate the correlation matrix and
# create a correlation graph
fig, cor_mat = plot_pca_correlation_graph(X_std,
                                          feature_names,
                                          dimensions=(1, 2),
                                          figure_axis_size=10)

#%%
# show the numbers of the correlation
# matrix for the 4 features
print(cor_mat)
# console output:
#                  Dim 1     Dim 2
# sepal length -0.890169 -0.360830
# sepal width   0.460143 -0.882716
# petal length -0.991555 -0.023415
# petal width  -0.964979 -0.064000

Example #8
def test_pass_pca_corr():
    X, y = iris_data()
    plot_pca_correlation_graph(X, variables_names=['1', '2', '3', '4'])
Example #9
    ax.scatter(Ch[0], Ch[1], s=120)
    ax.scatter(Dh[0], Dh[1], s=160)
    ax.scatter(Eh[0], Eh[1], s=200)
    ax.scatter(Fh[0], Fh[1], s=240)
    ax.scatter(Gh[0], Gh[1], s=270)
    ax.legend([('class ' + str(k + 1)) for k in range(7)])
    plt.title(
        'Projection by PCA of the zoo on the first 2 principal components')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.show()

    ## PCA circle (we just use a library for this part)

    figure, correlation_matrix = plot_pca_correlation_graph(
        Y_norm, feature_names, dimensions=(1, 2), figure_axis_size=10)
    plt.show()

## MDS
if choice in (0, 2, 3):
    Yh_norm = Y_norm.T
    if choice in (0, 3):
        # correlation between the type and each initial variable
        metric = []
        for k in range(DIMENTION):
            metric.append(abs(np.corrcoef(Yh_norm[k], Type)[0][1]))

        # reweight the variables: this encourages separation between types
        Y_norm = Y_norm * metric

    S = np.dot(Y_norm, Y_norm.T)