# Nose-style generator test: weighted PCA should recover the clean principal
# components when noisy entries are down-weighted in proportion to their noise
# level. ESTIMATORS, KWDS, and assert_columns_allclose_upto_sign are defined
# elsewhere in the test module.
def test_outlier_weights():
    rand = np.random.RandomState(0)
    X = rand.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=1000)
    pca = PCA(2).fit(X)

    def check_results(Estimator, n_outliers, noise_level, rtol):
        i = rand.randint(0, 100, size=n_outliers)
        j = rand.randint(0, 2, size=n_outliers)
        X2 = X.copy()
        X2[i, j] += noise_level * rand.randn(n_outliers)
        W2 = np.ones_like(X2)
        W2[i, j] = 1. / noise_level
        pca2 = Estimator(2, **KWDS[Estimator]).fit(X2, weights=W2)
        assert_columns_allclose_upto_sign(pca.components_.T,
                                          pca2.components_.T, rtol=rtol)

    for (n_outliers, noise_level, rtol) in [(1, 20, 1E-3), (10, 10, 3E-2)]:
        for Estimator in ESTIMATORS:
            yield check_results, Estimator, n_outliers, noise_level, rtol
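# A minimal standalone sketch of the behavior this test exercises (assumes the
# wpca package is available; the corrupted index and noise scale below are
# illustrative): corrupt one entry, weight it inversely to its noise scale,
# and WPCA should recover nearly the same components as a fit on clean data.
import numpy as np
from wpca import WPCA

rand = np.random.RandomState(42)
X_demo = rand.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=500)

X_noisy = X_demo.copy()
X_noisy[0, 0] += 50.0        # corrupt a single entry with large noise

W = np.ones_like(X_noisy)
W[0, 0] = 1.0 / 50.0         # down-weight it inversely to the noise scale

clean = WPCA(n_components=2).fit(X_demo)
weighted = WPCA(n_components=2).fit(X_noisy, weights=W)

# The recovered directions should agree up to a sign flip.
print(np.abs(clean.components_))
print(np.abs(weighted.components_))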
w0.shape, w.shape, df[weights.values.ravel() != 0].shape, df.shape


# In[9]:

from wpca import WPCA as PCA


# In[10]:

from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# Note: Imputer was removed in scikit-learn 0.22; on newer versions use
# sklearn.impute.SimpleImputer instead.
from sklearn.preprocessing import Imputer, StandardScaler

imp = Imputer(strategy='mean')
scl = StandardScaler()
pca = PCA()

pipeline = Pipeline([
    ('imp', imp),
    ('scl', scl),
    ('pca', pca),
])

scaler_pipeline = Pipeline([
    ('imp', imp),
    ('scl', scl),
])

data_pca = pipeline.fit_transform(df)
# imp and scl are shared with `pipeline` and were already fitted above,
# so transform (rather than fit_transform) is safe here.
_scaled = scaler_pipeline.transform(df)


# ### Explained variance
#
# How much of the variance in the data is explained by each successive component?
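# One way to answer this from the fitted pipeline above (a sketch; assumes
# WPCA exposes the sklearn-style explained_variance_ratio_ attribute on the
# `pca` step fitted by pipeline.fit_transform):
for k, r in enumerate(pca.explained_variance_ratio_[:5], start=1):
    print('PC-%d explains %.1f%% of the variance' % (k, 100 * r))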
print('NumPy covariance matrix: \n%s' % np.cov(X_std.T))

# Perform eigendecomposition on the covariance matrix
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

# Make (eigenvalue, eigenvector) pairs, sort them from high to low,
# then visually confirm the list is sorted by decreasing eigenvalue
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

# Variable loadings
pca = PCA(n_components=2)
pca.fit_transform(df1)
ident = np.identity(df1.shape[1])
coef = pca.transform(ident)
loadings = pd.DataFrame(coef, columns=['PC-1', 'PC-2'], index=df1.columns)
print(loadings.head(10))

# Print the explained-variance ratios for the first two principal components
print(pca.explained_variance_ratio_)

# Explained variance
pca = PCA().fit(X_std)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
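# Quick cross-check (a sketch, reusing X_std and the sorted eig_pairs from
# above): the rows of a plain sklearn PCA's components_ should match the top
# covariance eigenvectors up to a sign flip.
from sklearn.decomposition import PCA as SkPCA

sk = SkPCA(n_components=2).fit(X_std)
top_vecs = np.array([vec for _, vec in eig_pairs[:2]])

# Compare elementwise magnitudes, which are invariant to a whole-vector sign flip.
print(np.allclose(np.abs(sk.components_), np.abs(top_vecs)))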