def test_pca_on_uncentered_data():
    pca1 = PCA(solver='svd')
    pca1.fit(X)
    pca2 = PCA(solver='eigen')
    pca2.fit(X)
    assert_almost_equal(pca1.e_vals_normalized_,
                        pca2.e_vals_normalized_)

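# Added sketch: the tests in this file reference X and X_std fixtures
# and several helpers that are not shown in this excerpt. A plausible
# setup, assuming the Iris dataset and the helpers used by mlxtend's
# own test suite (this is an assumption, not the original header):
import math

import numpy as np
from numpy.testing import assert_almost_equal, assert_allclose
from mlxtend.data import iris_data
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from mlxtend.utils import assert_raises

X, y = iris_data()
# Standardize each feature to zero mean and unit variance.
X_std = (X - X.mean(axis=0)) / X.std(axis=0)
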
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)

def test_whitening():
    pca = PCA(n_components=2)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 3.9, diagonals_sum

    pca = PCA(n_components=2, whitening=True)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 2.0, diagonals_sum

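# Added sketch of what the whitening test checks: whitening rescales
# each principal component to unit variance, so the covariance of the
# transformed data is (up to sampling error) the identity and its
# diagonal sums to n_components (2.0 here); without whitening the
# diagonal sums to the retained eigenvalues (~3.9 for the first two
# Iris components). One common implementation, given projected
# `scores` and matching eigenvalues `e_vals`; whether mlxtend scales
# exactly this way is an assumption:
def whiten_scores(scores, e_vals):
    # Divide each component by the square root of its eigenvalue so
    # that every component ends up with unit variance.
    return scores / np.sqrt(e_vals)
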
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)
    expected = [2.93035378, 0.92740362, 0.14834223, 0.02074601]
    assert_almost_equal(pca.e_vals_, expected, decimal=5)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, expected, decimal=5)

def test_loadings():
    expect = np.array([[0.9, -0.4, -0.3, 0.],
                       [-0.5, -0.9, 0.1, -0.],
                       [1., -0., 0.1, -0.1],
                       [1., -0.1, 0.2, 0.1]])
    pca = PCA(solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)

    expect = np.array([[-0.9, -0.4, 0.3, 0.],
                       [0.4, -0.9, -0.1, -0.],
                       [-1., -0., -0.1, -0.1],
                       [-1., -0.1, -0.2, 0.1]])
    pca = PCA(solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)

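# Added sketch: loadings are conventionally the eigenvectors scaled by
# the square roots of their eigenvalues, i.e. the correlations between
# the standardized input variables and the principal components. That
# mlxtend computes loadings_ this way is an assumption inferred from
# the expected values above; the sign flips between the two expected
# arrays reflect the sign indeterminacy of eigenvectors.
def loadings_from_eigenpairs(e_vecs, e_vals):
    # Column i of the result is eigenvector i scaled by sqrt(e_vals[i]).
    return e_vecs * np.sqrt(e_vals)
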
def extract_features(extraction_type, n_components):
    if extraction_type == 'pca':
        return PCA(n_components=n_components)
    elif extraction_type == 'lda':
        return LDA(n_discriminants=n_components)
    else:
        print("Input a valid method (pca or lda)\n")

def extract_features(tipo, n):
    if tipo == 'pca':
        return PCA(n_components=n)
    elif tipo == 'lda':
        return LDA(n_discriminants=n)
    else:
        print("Input a valid method (pca or lda)\n")

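# Added usage sketch for the helper above (assumes X and y are already
# loaded; the helper returns an unfitted extractor):
ext = extract_features('pca', 2)
X_reduced = ext.fit(X).transform(X)
# For 'lda' the fit step also needs class labels: ext.fit(X, y)
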
def test_fail_array_transform():
    # Transforming a 1D array should raise; wrapped in assert_raises
    # (the style used in test_fail_array_dimension below) so the test
    # passes when the error is raised.
    pca = PCA(n_components=2)
    pca.fit(X)
    assert_raises(ValueError,
                  'X must be a 2D array. Try X[:, numpy.newaxis]',
                  pca.transform,
                  X[1])

def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert math.isclose(np.sum(pca.e_vals_normalized_), 1.)
    # The comparison yields an integer count of negative entries,
    # so it should be exactly zero.
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0

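# Added sketch of the invariant these tests check: e_vals_normalized_
# is presumably each eigenvalue divided by the eigenvalue total (the
# explained-variance ratio), hence non-negative and summing to 1:
def normalized_evals(e_vals):
    e_vals = np.asarray(e_vals, dtype=float)
    return e_vals / e_vals.sum()
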
def test_default_components():
    pca = PCA()
    res = pca.fit(X_std).transform(X_std)
    assert res.shape[1] == 4

def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    res = pca.fit(X).transform(X)
    assert_almost_equal(pca.e_vals_, [2.93, 0.93, 0.15, 0.02], decimal=2)

def test_fail_array_fit():
    # Fitting a 1D array should raise; wrapped in assert_raises so the
    # test passes when the error is raised. The error message here is
    # assumed to match the transform case.
    pca = PCA(n_components=2)
    assert_raises(ValueError,
                  'X must be a 2D array. Try X[:, numpy.newaxis]',
                  pca.fit,
                  X[1])

def test_default_2components():
    pca = PCA(n_components=2)
    res = pca.fit(X).transform(X)
    assert res.shape[1] == 2

def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    # Exact float equality is fragile; compare within tolerance.
    assert np.isclose(np.sum(pca.e_vals_normalized_), 1.)
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0

                        bootstrap_features=False,
                        oob_score=False,
                        warm_start=False,
                        n_jobs=1,
                        random_state=0,
                        verbose=1)
scores = cross_val_score(bag, X, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())
'''
trees  estims  f1
300    10      33.4
300    20
150    50
'''
pca = PCA(n_components=1000)
X_pca = pca.fit(X).transform(X)
et = ExtraTreesClassifier(n_estimators=300,
                          max_depth=None,
                          random_state=0,
                          verbose=1)
bag = BaggingClassifier(base_estimator=et,
                        n_estimators=20,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        oob_score=False,
                        warm_start=False,
                        n_jobs=1,
# extratrees
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

et = ExtraTreesClassifier(n_estimators=300, max_depth=None,
                          random_state=0, verbose=5)
scores = cross_val_score(et, X, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())  # 32%, max_depth=None, n_est=300

# kernel PCA
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

pca = PCA(n_components=700)
X_pca = pca.fit(X).transform(X)
et = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                          random_state=0, verbose=5)
scores = cross_val_score(et, X_pca, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())

from mlxtend.feature_extraction import RBFKernelPCA as KPCA

kpca = KPCA(gamma=1.0, n_components=700)
X_kpca = kpca.fit(X).transform(X)
et = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                          random_state=0, verbose=5)
# Score the kernel-PCA features (not X_pca from the linear run above).
scores = cross_val_score(et, X_kpca, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())

                   color='blue',
                   marker='^',  # triangle marker
                   alpha=0.5,
                   )
plt.title('BMI vs glucose by sex')
plt.ylabel('Serum glucose concentration')
plt.xlabel('BMI')
plt.legend([sex1, sex2], ['Sex 1', 'Sex 2'])
#plt.show()
plt.savefig('../../figs/bivariate/subsetkpca1_2')
plt.close()
"""

# PCA only takes a 2D array. Look into how to deal with that later.
pca = PCA(n_components=2)  # 2-component linear PCA
X_pca = pca.fit(X).transform(X)
#print(X_pca)

'''
# Graph after PCA: generate the plot from the transformed matrix.
for i in range(len(X)):
    if sex[i] == 1:  # Sex 1: glucose vs BMI
        sex1 = plt.scatter(X_pca[i][0],  # BMI
                           X_pca[i][1],  # glucose
                           color='red',
                           marker='o',  # circle marker
                           alpha=0.5,
                           )
# Blue half moon
plt.scatter(X[y == 1, 0], X[y == 1, 1],
            # Start and peak/trough of each 'moon'.
            color='blue', marker='^', alpha=0.5)
plt.xlabel('x coordinate')
plt.ylabel('y coordinate')
#plt.show()
plt.savefig('../figs/tutorial/mlxtendex1_1.png')
plt.close()

# The moons are linearly inseparable, so standard linear PCA will fail
# to accurately represent the data in 1D space.

# Use PCA for dimensionality reduction: specify the number of
# components, then transform X accordingly.
pca = PCA(n_components=2)
X_pca = pca.fit(X).transform(X)

# Red half moon
plt.scatter(X_pca[y == 0, 0], X_pca[y == 0, 1],
            # Start and peak/trough of each 'moon'.
            color='red', marker='o', alpha=0.5)
# Blue half moon
plt.scatter(X_pca[y == 1, 0], X_pca[y == 1, 1],
            # Start and peak/trough of each 'moon'.
            color='blue', marker='^', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
#plt.show()

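# Added sketch: the natural follow-up for the inseparable moons is
# kernel PCA. This assumes mlxtend's RBFKernelPCA, imported the same
# way as elsewhere in this repo; the gamma value is illustrative, not
# from the original script.
from mlxtend.feature_extraction import RBFKernelPCA as KPCA

kpca = KPCA(gamma=15.0, n_components=2)
X_kpca = kpca.fit(X).transform(X)  # moons become linearly separable
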
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert_almost_equal(np.sum(pca.e_vals_normalized_), 1.)
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0

def test_eigen_vs_svd():
    pca = PCA(n_components=2, solver='eigen')
    eigen_res = pca.fit(X).transform(X)
    pca = PCA(n_components=2, solver='svd')
    svd_res = pca.fit(X).transform(X)
    assert_allclose(np.absolute(eigen_res),
                    np.absolute(svd_res),
                    atol=0.0001)

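# Added note: absolute values are compared above because an
# eigenvector is only defined up to sign, so the eigen and SVD solvers
# may return components flipped by -1. A minimal pure-NumPy
# illustration (not part of the original tests):
def demo_sign_indeterminacy():
    A = np.array([[2.0, 1.0],
                  [1.0, 2.0]])
    w, v = np.linalg.eigh(A)
    # v[:, 0] and -v[:, 0] both satisfy the eigenvector equation.
    assert np.allclose(A @ v[:, 0], w[0] * v[:, 0])
    assert np.allclose(A @ (-v[:, 0]), w[0] * (-v[:, 0]))
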
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier

X = np.genfromtxt('../../contest_data/xtrain_linear_imputed.csv',
                  delimiter=',')
print('loaded X')
y = np.genfromtxt('../../contest_data/train.csv', delimiter=',')[1:, -1]
print('loaded y')

pca = PCA(n_components=300)
X_pca = pca.fit(X).transform(X)

et = ExtraTreesClassifier(n_estimators=1000, max_depth=None,
                          random_state=0, verbose=0)
svc = SVC(C=1, gamma='auto', verbose=0)
#dt = DecisionTreeClassifier(min_samples_leaf=5, random_state=0)
rf = RandomForestClassifier(n_estimators=1000, max_depth=None,
                            random_state=0, verbose=0)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[et, svc, rf],
                          use_probas=True,
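# Added sketch: the StackingClassifier call above is cut off in the
# source. A hypothetical completion for illustration only; the
# meta_classifier choice and the evaluation step are assumptions, not
# the original code (lr is the LogisticRegression defined above, and
# cross_val_score would need to be imported):
#
# sclf = StackingClassifier(classifiers=[et, svc, rf],
#                           use_probas=True,
#                           meta_classifier=lr)
# scores = cross_val_score(sclf, X_pca, y, scoring='f1_micro', cv=5)
# print(scores.mean())
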
def test_fail_array_dimension():
    pca = PCA(n_components=2)
    assert_raises(ValueError,
                  'X must be a 2D array. Try X[:, numpy.newaxis]',
                  pca.transform,
                  X[1])

def test_default_components():
    # Smoke test: n_components=0 should still allow fit and transform.
    pca = PCA(n_components=0)
    res = pca.fit(X).transform(X)