def test_whitening():
    pca = PCA(n_components=2)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 3.9, diagonals_sum

    pca = PCA(n_components=2, whitening=True)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 2.0, diagonals_sum
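The expected sums (3.9 unwhitened, 2.0 whitened) point to a standardized four-feature dataset; a minimal sketch of the shared fixture these test snippets appear to rely on, assuming mlxtend's iris helpers:

# Hedged sketch of the shared test fixture (an assumption inferred from the
# expected values): standardized iris data, four features, ~4 total variance.
import numpy as np
from numpy.testing import assert_almost_equal, assert_allclose
from mlxtend.data import iris_data
from mlxtend.preprocessing import standardize
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA

X, y = iris_data()
X_std = standardize(X)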
Example #3
    def run_pipeline(self, train, test):

        X = train[self.test_features]
        y = train[self.target_variable]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

        max_score = 0
        n_states = 3  # note: defined but not used below
        self.best_pipeline = None

        for i in range(2,25):
            pipe_pca = make_pipeline(StandardScaler(),
                            PrincipalComponentAnalysis(n_components=i),
                            #mix.GaussianMixture (n_components=3, random_state=7),
                            KNeighborsRegressor(n_neighbors=3),
                            )
            
            pipe_pca.fit(X_train, y_train)
            score = pipe_pca.score(X_test, y_test)
            future_score = pipe_pca.score(test[self.test_features], test[self.target_variable])
            
            if score>max_score:
                self.best_pipeline = pipe_pca
                max_score = score
                print(i)
                print('Transf. training accuracy: %.2f%%' % (pipe_pca.score(X_train, y_train)*100))
                print('Transf. test accuracy: %.2f%%' % (pipe_pca.score(X_test, y_test)*100))
                print('Future test accuracy: %.2f%%' % (future_score*100))
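A self-contained sketch of the same component-count scan, swapping in scikit-learn's PCA for mlxtend's PrincipalComponentAnalysis and using synthetic data so it runs as-is:

# Runnable sketch of the scan above: try several PCA widths in a pipeline and
# keep the best test score (R^2, since KNeighborsRegressor.score returns R^2).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=300, n_features=30, noise=10.0, random_state=7)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

best_score, best_pipe = -np.inf, None
for n in range(2, 25):
    pipe = make_pipeline(StandardScaler(), PCA(n_components=n),
                         KNeighborsRegressor(n_neighbors=3))
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    if score > best_score:
        best_score, best_pipe = score, pipe

print(best_pipe.named_steps['pca'].n_components, round(best_score, 3))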
Example #4
    def run_pipeline(self, train, test):

        X = train[['return']+self.test_features]
        y = train[self.target_variable]
        """
        self.bins     = np.linspace(train['return'].min(), train['return'].max(), 3)
        y = np.digitize(y, self.bins)

        y_future_test = np.digitize(test[self.target_variable], self.bins)
        """

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

        max_score = -np.inf
        self.best_pipeline = None

        for pca_n_components in range(2,25):
            for i in range(20):

                shuffle(self.test_features)
                
                this_features = self.test_features[0:self.k_features]

                pipe_pca = make_pipeline(StandardScaler(),
                                PrincipalComponentAnalysis(n_components=pca_n_components),
                                #mix.GaussianMixture (n_components=3, random_state=7),
                                KNeighborsRegressor(n_neighbors=self.k_neighbors, weights='distance'),
                                )
                
                pipe_pca.fit(X_train[['return'] + this_features], y_train)

                score = pipe_pca.score(X_test[['return'] + this_features], y_test)

                test['state'] = pipe_pca.predict(test[['return'] + this_features])
                test['next_change'] = test['return'].shift(-1)
                correl = test[['state', 'next_change']].dropna().corr()['state']['next_change']
                
                if score > max_score and correl > 0:

                    self.training_score = pipe_pca.score(X_train[['return'] + this_features], y_train) * 100
                    self.testing_score = pipe_pca.score(X_test[['return'] + this_features], y_test) * 100

                    self.future_testing_score = pipe_pca.score(test[['return'] + this_features], test[self.target_variable]) * 100
                    #print(self.training_score)
                    self.pca_n_components = pca_n_components
                    self.best_pipeline = pipe_pca
                    self.found_best_features = ['return'] + this_features
                    max_score = score
                    #print(i)
                    #print('Transf. training accuracy: %.2f%%' % (self.training_score))
                    print('Transf. test accuracy: %.2f%%' % (self.testing_score))
                    print('Future test accuracy: %.2f%%' % (self.future_testing_score))
                    input()  # pauses after each improvement; remove for unattended runs
def test_pca_on_uncentered_data():
    pca1 = PCA(solver='svd')
    pca1.fit(X)

    pca2 = PCA(solver='eigen')
    pca2.fit(X)
    assert_almost_equal(pca1.e_vals_normalized_, pca2.e_vals_normalized_)
def test_evals():

    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)
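These expected eigenvalues make sense for standardized data: each z-scored column has unit variance, so the four eigenvalues sum to roughly 4 (the check below assumes the fixture sketched near the top of this page):

pca = PCA(solver='eigen')
pca.fit(X_std)
print(np.sum(pca.e_vals_))  # ~4.0: total variance of four unit-variance columns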
Example #9
def test_evals():

    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)

    expected = [2.93035378, 0.92740362, 0.14834223, 0.02074601]
    assert_almost_equal(pca.e_vals_, expected, decimal=5)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, expected, decimal=5)
Example #10
    def get_model(self):

        self.pipe_pca = make_pipeline(
            StandardScaler(), PrincipalComponentAnalysis(n_components=3),
            GaussianHMM(n_components=3, covariance_type='full',
                        random_state=7))

        self.pipe_pca.fit(self.train[['return'] + self.features])
        model = self.pipe_pca.steps[2][1]

        results = []
        for i in range(3):
            result = [i, model.means_[i][0], np.diag(model.covars_[i])[0]]
            results.append(result)

        results = pd.DataFrame(results)
        results.columns = ['state', 'train_mean', 'train_var']
        self.results = results.set_index('state')

        self.get_renamed_states()
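A runnable sketch of the state-summary step in get_model on synthetic returns, assuming hmmlearn's GaussianHMM (the column names mirror the method above):

import numpy as np
import pandas as pd
from hmmlearn.hmm import GaussianHMM

rng = np.random.default_rng(7)
returns = rng.normal(0.0, 0.01, size=(500, 1))  # stand-in for the 'return' column

model = GaussianHMM(n_components=3, covariance_type='full', random_state=7)
model.fit(returns)

# per-state mean and variance of the first feature, as in get_model above
summary = pd.DataFrame(
    [[i, model.means_[i][0], np.diag(model.covars_[i])[0]] for i in range(3)],
    columns=['state', 'train_mean', 'train_var']).set_index('state')
print(summary)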
def test_loadings():

    expect = np.array([[0.9, -0.4, -0.3, 0.], [-0.5, -0.9, 0.1, -0.],
                       [1., -0., 0.1, -0.1], [1., -0.1, 0.2, 0.1]])

    pca = PCA(solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)

    expect = np.array([[-0.9, -0.4, 0.3, 0.], [0.4, -0.9, -0.1, -0.],
                       [-1., -0., -0.1, -0.1], [-1., -0.1, -0.2, 0.1]])

    pca = PCA(solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)
Example #12
def get_trained_pipelines(train):
    train_dfs = np.array_split(train, n_subsets)
    int_name = 0
    pipelines = []
    for train_subset in train_dfs:
        try:
            pipe_pca = make_pipeline(StandardScaler(),
                                     PrincipalComponentAnalysis(n_components=n_components),
                                     GMMHMM(n_components=n_components, covariance_type='full', n_iter=150, random_state=7),
                                     )
            pipe_pca.fit(train_subset[features])
            train['state'] = pipe_pca.predict(train[features])
            results = pd.DataFrame(train.groupby(by=['state'])['return'].mean().sort_values())
            results['new_state'] = list(range(n_components))
            results.columns = ['mean', 'new_state']
            results = results.reset_index()
            results['name'] = int_name
            int_name = int_name + 1
            pipelines.append([pipe_pca, results])
        except Exception as e:
            # HMM fitting can fail to converge on small subsets; skip those
            #print('make trained pipelines exception', e)
            pass

    return pipelines
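A hedged usage sketch of the returned list: each entry pairs a fitted pipeline with its state-ranking table, which can map raw HMM states onto return-ranked states (`train`, `test`, and `features` assumed in scope, as in the function above):

pipelines = get_trained_pipelines(train)
for pipe, results in pipelines:
    raw_states = pipe.predict(test[features])
    # map raw HMM state ids to return-ranked ids via the lookup table
    mapping = dict(zip(results['state'], results['new_state']))
    test['state_%d' % results['name'].iloc[0]] = [mapping[s] for s in raw_states]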
Example #14
def test_fail_array_fit():
    # X[1] is 1D; fit is expected to reject non-2D input
    # (wrapped in pytest.raises so the test passes; requires `import pytest`)
    pca = PCA(n_components=2)
    with pytest.raises(ValueError):
        pca.fit(X[1])
Example #15
def test_fail_array_transform():
    # transform should likewise reject 1D input (requires `import pytest`)
    pca = PCA(n_components=2)
    pca.fit(X)
    with pytest.raises(ValueError):
        pca.transform(X[1])
def test_eigen_vs_svd():
    pca = PCA(n_components=2, solver='eigen')
    eigen_res = pca.fit(X).transform(X)
    pca = PCA(n_components=2, solver='svd')
    svd_res = pca.fit(X).transform(X)
    assert_allclose(np.absolute(eigen_res), np.absolute(svd_res), atol=0.0001)
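The absolute values are needed because eigenvectors are only defined up to sign, so the two solvers can return components flipped by -1; a quick numpy illustration:

import numpy as np

A = np.array([[2.0, 0.5], [0.5, 1.0]])
w, v = np.linalg.eigh(A)
# v and -v are equally valid eigenvector matrices for the same eigenvalues
assert np.allclose(A @ v, v * w) and np.allclose(A @ -v, -v * w)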
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X)
    res = pca.fit(X).transform(X)
    assert_almost_equal(pca.e_vals_, [2.93, 0.93, 0.15, 0.02], decimal=2)
def test_default_2components():
    pca = PCA(n_components=2)
    res = pca.fit(X).transform(X)
    assert res.shape[1] == 2
Example #19
def test_default_components():
    pca = PCA(n_components=0)
    pca.fit(X)
    res = pca.fit(X).transform(X)
def plot_pca_correlation_graph(X,
                               variables_names,
                               dimensions=(1, 2),
                               figure_axis_size=6,
                               X_pca=None):
    """
    Compute the PCA for X and plots the Correlation graph

    Parameters
    ----------
    X : 2d array like.
        The columns represent the different variables and the rows are the
         samples of thos variables
    variables_names : array like
        Name of the columns (the variables) of X
    dimensions: tuple with two elements.
        dimensions to be plot (x,y)
    X_pca : optional. if not provided, compute PCA independently
    figure_axis_size :
         size of the final frame. The figure created is a square with length
         and width equal to figure_axis_size.
    Returns
    ----------
        matplotlib_figure , correlation_matrix
    """
    X = np.array(X)
    X = X - X.mean(axis=0)
    n_comp = max(dimensions)

    if X_pca is None:
        pca = PrincipalComponentAnalysis(n_components=n_comp)
        pca.fit(X)
        X_pca = pca.transform(X)

    corrs = create_correlation_table(
        X_pca, X, ['Dim ' + str(i + 1) for i in range(n_comp)],
        variables_names)
    # note: `pca` is only defined when X_pca was computed above, so passing a
    # precomputed X_pca to this version fails here (the variant further down
    # fixes this with an `explained_variance` argument)
    tot = sum(pca.e_vals_)
    explained_var_ratio = [(i / tot) * 100 for i in pca.e_vals_]

    # Plotting circle
    fig_res = plt.figure(figsize=(figure_axis_size, figure_axis_size))
    plt.Circle((0, 0), radius=1, color='k', fill=False)
    circle1 = plt.Circle((0, 0), radius=1, color='k', fill=False)
    fig = plt.gcf()
    fig.gca().add_artist(circle1)

    # Plotting arrows
    texts = []
    for name, row in corrs.iterrows():
        x = row['Dim ' + str(dimensions[0])]
        y = row['Dim ' + str(dimensions[1])]
        plt.arrow(0.0,
                  0.0,
                  x,
                  y,
                  color='k',
                  length_includes_head=True,
                  head_width=.05)

        plt.plot([0.0, x], [0.0, y], 'k-')
        texts.append(plt.text(x, y, name, fontsize=2 * figure_axis_size))
    # Plotting vertical lines
    plt.plot([-1.1, 1.1], [0, 0], 'k--')
    plt.plot([0, 0], [-1.1, 1.1], 'k--')

    # Adjusting text
    adjust_text(texts)
    # Setting limits and title
    plt.xlim((-1.1, 1.1))
    plt.ylim((-1.1, 1.1))
    plt.title("Correlation Circle", fontsize=figure_axis_size * 3)

    plt.xlabel("Dim " + str(dimensions[0]) + " (%s%%)" %
               str(explained_var_ratio[dimensions[0] - 1])[:4].lstrip("0."),
               fontsize=figure_axis_size * 2)
    plt.ylabel("Dim " + str(dimensions[1]) + " (%s%%)" %
               str(explained_var_ratio[dimensions[1] - 1])[:4].lstrip("0."),
               fontsize=figure_axis_size * 2)
    return fig_res, corrs
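A hedged usage sketch of the function above on the iris data used elsewhere on this page (feature names illustrative):

from mlxtend.data import iris_data

X_iris, y_iris = iris_data()
names = ['sepal length', 'sepal width', 'petal length', 'petal width']
fig, corrs = plot_pca_correlation_graph(X_iris, names, dimensions=(1, 2))
fig.savefig('correlation_circle.png')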
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert math.isclose(np.sum(pca.e_vals_normalized_), 1.)
    assert math.isclose(np.sum(pca.e_vals_normalized_ < 0.), 0, abs_tol=1e-10)
Example #22
    #scores = cross_val_score(dt, X_pca[nonmissings,:], xtrain[nonmissings,i],scoring='neg_mean_absolute_error',cv=5,verbose=5)
    #print scores

np.savetxt('../../contest_data/xtest_tree_imputed.csv', xtrain, delimiter=',')

from sklearn.naive_bayes import MultinomialNB
gnb = MultinomialNB()
scores = cross_val_score(gnb,
                         xtrain,
                         ytrain,
                         scoring='f1_micro',
                         cv=5,
                         verbose=5)
print(scores.mean())

pca = PCA(n_components=300)
xtrain_pca = pca.fit(xtrain[:, 500:]).transform(xtrain[:, 500:])
xtest_pca = pca.transform(xtest[:, 500:])  # reuse the fit; refitting on the same slice was redundant
#imputing test data (`lin` is a regressor defined earlier in the original script)
for i in range(500):
    print(i)
    train_missings = np.isnan(xtrain[:, i])
    train_nonmissings = ~train_missings
    test_missings = np.isnan(xtest[:, i])
    test_nonmissings = ~test_missings
    xtest[test_missings,
          i] = lin.fit(xtrain_pca[train_nonmissings, :],
                       xtrain[train_nonmissings,
                              i]).predict(xtest_pca[test_missings, :])

np.savetxt('../../contest_data/xtest_linear_imputed.csv', xtest, delimiter=',')
Example #23
from sklearn.ensemble import ExtraTreesClassifier
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt

X = np.genfromtxt('../../contest_data/xtrain_linear_imputed.csv', delimiter=',')
y = np.genfromtxt('../../contest_data/train.csv', delimiter=',')[1:,-1]




pca = PCA(n_components=1000)
X_pca = pca.fit(X).transform(X)
et = ExtraTreesClassifier(n_estimators=1000, max_depth=None, random_state=0, verbose=0)
scores = cross_val_score(et, X_pca, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())


et = ExtraTreesClassifier(n_estimators=300, max_depth=None, random_state=0, verbose=1)
scores = cross_val_score(et, X, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())
'''
components=1000,estimators=1000 gives 32.6% f1
'''





pca = PCA(n_components=1000)
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert np.sum(pca.e_vals_normalized_) == 1.
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0
def test_default_components():
    pca = PCA()
    res = pca.fit(X_std).transform(X_std)
    assert res.shape[1] == 4
Example #26
#Blue half moon
plt.scatter(X[y==1, 0], X[y==1, 1], # Start and peak/trough of each 'moon'.
            color='blue', marker='^', alpha=0.5)

plt.xlabel('x coordinate')
plt.ylabel('y coordinate')

#plt.show()
plt.savefig('../figs/tutorial/mlxtendex1_1.png')
plt.close()
# Moons are linearly inseparable, so standard linear PCA will fail to accurately represent the data in 1D space.

#Use PCA for dimensionality reduction

#specify number of components in PCA
pca = PCA(n_components=2)
#Transform X in accordance with 2-component PCA
X_pca = pca.fit(X).transform(X)

# Red half moon
plt.scatter(X_pca[y==0, 0], X_pca[y==0, 1], # Start and peak/trough of each 'moon'.
            color='red', marker='o', alpha=0.5)

#Blue half moon
plt.scatter(X_pca[y==1, 0], X_pca[y==1, 1], # Start and peak/trough of each 'moon'.
            color='blue', marker='^', alpha=0.5)

plt.xlabel('PC1')
plt.ylabel('PC2')

#plt.show()
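As the comment above notes, linear PCA cannot unfold the moons; a hedged sketch of the kernelized alternative, assuming mlxtend's RBFKernelPCA (gamma is illustrative, not tuned):

from sklearn.datasets import make_moons
from mlxtend.feature_extraction import RBFKernelPCA
import matplotlib.pyplot as plt

X_m, y_m = make_moons(n_samples=100, random_state=123)
kpca = RBFKernelPCA(gamma=15.0, n_components=2)
kpca.fit(X_m)
X_kpca = kpca.X_projected_  # projected training data

plt.scatter(X_kpca[y_m==0, 0], X_kpca[y_m==0, 1], color='red', marker='o', alpha=0.5)
plt.scatter(X_kpca[y_m==1, 0], X_kpca[y_m==1, 1], color='blue', marker='^', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig('../figs/tutorial/mlxtendex1_kpca.png')  # path mirrors the snippet above
plt.close()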
    def run_pipeline(self, production=False):
        self.pipeline_failed = True
        self.max_score = -np.inf
        self.max_correl = -np.inf
        
        # create pipeline
        pipe_pca = make_pipeline(StandardScaler(),
                                PrincipalComponentAnalysis(n_components=self.pca_n_components),
                                #GMMHMM(n_components=3, covariance_type='full'))
                                GaussianHMM(n_components=3, covariance_type='full'))
        exp_num = 0
        if self.run_type == 'find_features':  # underscore, matching the checks below
            print('finding features')
        while exp_num < self.n_experiments:


            train = self.clean_train.copy()
            test = self.clean_test.copy()
            means = []
            stddevs = []
            scores = []
            correls = []

            if self.run_type == 'find_features':
                # choose features
                shuffle(self.starting_features)
                test_cols = ['return'] + self.starting_features[0:self.k_features]
                
                if 'stoch' not in str(test_cols):
                    continue
                
            elif self.run_type == 'production' or self.run_type == 'rolling_test':
                test_cols = self.features_found

            
            # test features on training dataset
            pipe_pca.fit(train[ test_cols ])
            try:
                self.train_score = np.exp(pipe_pca.score(train[test_cols]) / len(train)) * 100
            except Exception:
                self.train_score = None
            train['state'] = pipe_pca.predict(train[test_cols])
            train = self.rename_states(train)
            if train is None:
                continue
            criteria_check = self.check_criteria(train)
            if criteria_check == False:
                continue

            # get the correlation between state and next day percent changes
            train['next_day'] = train['close'].shift(-1) / train['close'] - 1
            train_means = train.dropna().groupby(by='state')[['return', 'next_day']].mean()*100
            train_correl = train_means.corr()
            self.train_correl = train_correl['return']['next_day']


            # do the same for the test data
            pipe_pca.fit(test[ test_cols ])
            try:
                self.test_score = np.exp(pipe_pca.score(test[test_cols]) / len(test)) * 100
            except Exception:
                self.test_score = None
            test['state'] = pipe_pca.predict(test[test_cols])
            test = self.rename_states(test)

            if self.run_type == 'production':
                self.new_predictions = test.tail(30)
                return
            
            
            if self.run_type == 'find_features':
                if test is None:
                    continue
                criteria_check = self.check_criteria(test)
                if criteria_check == False:
                    continue

            # get the correlation between state and next day percent changes
            test['next_day'] = test['close'].shift(-1) / test['close'] - 1
            test_means = test.dropna().groupby(by='state')[['return', 'next_day']].mean()*100
            test_correl = test_means.corr()
            self.test_correl = test_correl['return']['next_day']

            exp_num = exp_num + 1
            
            if ( self.train_correl > self.max_correl and self.test_correl>0 ) or self.run_type == 'rolling_test':
                
                self.train_predicted = train
                self.test_predicted = test
                self.features_found = test_cols

                self.train_means = train_means
                self.test_means = test_means
                

                #print('model found on experiment number', exp_num)
                #print(self.features_found)
                
                self.max_correl = self.train_correl
                self.pipeline_failed = False
Example #33
def plot_pca_correlation_graph(X,
                               variables_names,
                               dimensions=(1, 2),
                               figure_axis_size=6,
                               X_pca=None,
                               explained_variance=None):
    """
    Compute the PCA for X and plots the Correlation graph

    Parameters
    ----------
    X : 2d array like.
        The columns represent the different variables and the rows are the
         samples of those variables

    variables_names : array like
        Names of the columns (the variables) of X

    dimensions: tuple with two elements.
        dimensions to be plotted (x, y)

    figure_axis_size :
         size of the final frame. The figure created is a square with length
         and width equal to figure_axis_size.

    X_pca : np.ndarray, shape = [n_samples, n_components].
        Optional.
        `X_pca` is the matrix of the transformed components from X.
        If not provided, the function computes PCA automatically using
        mlxtend.feature_extraction.PrincipalComponentAnalysis
        Expected `n_components >= max(dimensions)`

    explained_variance : 1 dimensional np.ndarray, length = n_components
        Optional.
        `explained_variance` are the eigenvalues from the diagonalized
        covariance matrix of the PCA transformation.
        If not provided, the function computes PCA independently.
        Expected `n_components == X.shape[1]`

    Returns
    ----------
        matplotlib_figure, correlation_matrix

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/plotting/plot_pca_correlation_graph/

    """

    X = np.array(X)
    X = X - X.mean(axis=0)
    n_comp = max(dimensions)

    if (X_pca is None) and (explained_variance is None):
        pca = PrincipalComponentAnalysis(n_components=n_comp)
        pca.fit(X)
        X_pca = pca.transform(X)
        explained_variance = pca.e_vals_

    elif (X_pca is not None) and (explained_variance is None):
        raise ValueError("If `X_pca` is not None, the `explained variance`"
                         " values should not be `None`.")

    elif (X_pca is None) and (explained_variance is not None):
        raise ValueError("If `explained variance` is not None, the `X_pca`"
                         " values should not be `None`.")

    elif (X_pca is not None) and (explained_variance is not None):
        if X_pca.shape[1] != len(explained_variance):
            raise ValueError(f"Number of principal components must "
                             f"match the number "
                             f"of eigenvalues. Got "
                             f"{X_pca.shape[1]} "
                             f"!= "
                             f"{len(explained_variance)}")

    if X_pca.shape[1] < n_comp:
        raise ValueError(f"Input array `X_pca` contains fewer principal"
                         f" components than expected based on `dimensions`."
                         f" Got {X_pca.shape[1]} components in X_pca, expected"
                         f" at least `max(dimensions)={n_comp}`.")
    if len(explained_variance) < n_comp:
        raise ValueError(f"Input array `explained_variance` contains fewer"
                         f" elements than expected. Got"
                         f" {len(explained_variance)} elements, expected"
                         f" `X.shape[1]={X.shape[1]}`.")

    corrs = create_correlation_table(
        X_pca, X, ['Dim ' + str(i + 1) for i in range(n_comp)],
        variables_names)
    tot = sum(X.var(0)) * X.shape[0] / (X.shape[0] - 1)
    explained_var_ratio = [(i / tot) * 100 for i in explained_variance]

    # Plotting circle
    fig_res = plt.figure(figsize=(figure_axis_size, figure_axis_size))
    plt.Circle((0, 0), radius=1, color='k', fill=False)
    circle1 = plt.Circle((0, 0), radius=1, color='k', fill=False)
    fig = plt.gcf()
    fig.gca().add_artist(circle1)

    # Plotting arrows
    texts = []
    for name, row in corrs.iterrows():
        x = row['Dim ' + str(dimensions[0])]
        y = row['Dim ' + str(dimensions[1])]
        plt.arrow(0.0,
                  0.0,
                  x,
                  y,
                  color='k',
                  length_includes_head=True,
                  head_width=.05)

        plt.plot([0.0, x], [0.0, y], 'k-')
        texts.append(plt.text(x, y, name, fontsize=2 * figure_axis_size))
    # Plotting vertical lines
    plt.plot([-1.1, 1.1], [0, 0], 'k--')
    plt.plot([0, 0], [-1.1, 1.1], 'k--')

    # Adjusting text
    adjust_text(texts)
    # Setting limits and title
    plt.xlim((-1.1, 1.1))
    plt.ylim((-1.1, 1.1))
    plt.title("Correlation Circle", fontsize=figure_axis_size * 3)

    plt.xlabel("Dim " + str(dimensions[0]) +
               " (%s%%)" % str(explained_var_ratio[dimensions[0] - 1])[:4],
               fontsize=figure_axis_size * 2)
    plt.ylabel("Dim " + str(dimensions[1]) +
               " (%s%%)" % str(explained_var_ratio[dimensions[1] - 1])[:4],
               fontsize=figure_axis_size * 2)
    return fig_res, corrs
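A hedged usage sketch of the precomputed-PCA path added in this version; the validation above requires X_pca to carry one column per eigenvalue, so the PCA is fitted with all components (synthetic data, names illustrative):

import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 4))
Xc = X_demo - X_demo.mean(axis=0)

pca = PrincipalComponentAnalysis(n_components=Xc.shape[1])
pca.fit(Xc)
fig, corrs = plot_pca_correlation_graph(
    X_demo, ['v1', 'v2', 'v3', 'v4'], dimensions=(1, 2),
    X_pca=pca.transform(Xc), explained_variance=pca.e_vals_)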
Example #35
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert_almost_equal(np.sum(pca.e_vals_normalized_), 1.)
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0
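Unlike the exact `== 1.` comparison in the earlier variant of this test, this version compares within tolerance; summed floating-point ratios rarely hit 1.0 exactly, as a quick illustration shows:

import numpy as np

vals = np.array([0.1, 0.2, 0.3, 0.4])
print(vals.sum() == 1.0)            # may be False: rounding error accumulates
print(np.isclose(vals.sum(), 1.0))  # tolerance-based check is robust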