def test_whitening():
    pca = PCA(n_components=2)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 3.9, diagonals_sum

    pca = PCA(n_components=2, whitening=True)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 2.0, diagonals_sum
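# NOTE: the tests in this file refer to module-level fixtures `X` and `X_std`
# that are defined elsewhere. A minimal sketch of a plausible setup, assuming
# the Iris data (the expected eigenvalues used below, roughly
# [2.93, 0.93, 0.15, 0.02], match standardized Iris); the original module may
# define these differently:
import math
import numpy as np
import pytest
from numpy.testing import assert_almost_equal, assert_allclose
from mlxtend.data import iris_data
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA

X, y = iris_data()
X_std = (X - X.mean(axis=0)) / X.std(axis=0)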
def run_pipeline(self, train, test):
    X = train[self.test_features]
    y = train[self.target_variable]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=7)
    max_score = 0
    n_states = 3
    self.best_pipeline = None
    for i in range(2, 25):
        pipe_pca = make_pipeline(StandardScaler(),
                                 PrincipalComponentAnalysis(n_components=i),
                                 # mix.GaussianMixture(n_components=3, random_state=7),
                                 KNeighborsRegressor(n_neighbors=3),
                                 )
        pipe_pca.fit(X_train, y_train)
        # note: for a regressor, score() returns R^2, not accuracy
        score = pipe_pca.score(X_test, y_test)
        future_score = pipe_pca.score(test[self.test_features],
                                      test[self.target_variable])
        if score > max_score:
            self.best_pipeline = pipe_pca
            max_score = score
            print(i)
            print('Transf. training score (R^2): %.2f%%' % (pipe_pca.score(X_train, y_train) * 100))
            print('Transf. test score (R^2): %.2f%%' % (pipe_pca.score(X_test, y_test) * 100))
            print('Future test score (R^2): %.2f%%' % (future_score * 100))
def run_pipeline(self, train, test):
    X = train[['return'] + self.test_features]
    y = train[self.target_variable]
    """
    self.bins = np.linspace(train['return'].min(), train['return'].max(), 3)
    y = np.digitize(y, self.bins)
    y_future_test = np.digitize(test[self.target_variable], self.bins)
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=7)
    max_score = -np.inf
    self.best_pipeline = None
    for pca_n_components in range(2, 25):
        for i in range(20):
            # try a random subset of k_features candidate features
            shuffle(self.test_features)
            this_features = self.test_features[0:self.k_features]
            pipe_pca = make_pipeline(StandardScaler(),
                                     PrincipalComponentAnalysis(n_components=pca_n_components),
                                     # mix.GaussianMixture(n_components=3, random_state=7),
                                     KNeighborsRegressor(n_neighbors=self.k_neighbors,
                                                         weights='distance'),
                                     )
            pipe_pca.fit(X_train[['return'] + this_features], y_train)
            score = pipe_pca.score(X_test[['return'] + this_features], y_test)

            test['state'] = pipe_pca.predict(test[['return'] + this_features])
            test['next_change'] = test['return'].shift(-1)
            correl = test[['state', 'next_change']].dropna().corr()['state']['next_change']

            if score > max_score and correl > 0:
                self.training_score = pipe_pca.score(X_train[['return'] + this_features], y_train) * 100
                self.testing_score = pipe_pca.score(X_test[['return'] + this_features], y_test) * 100
                self.future_testing_score = pipe_pca.score(test[['return'] + this_features],
                                                           test[self.target_variable]) * 100
                # print(self.training_score)
                self.pca_n_components = pca_n_components
                self.best_pipeline = pipe_pca
                self.found_best_features = ['return'] + this_features
                max_score = score
                # print(i)
                # print('Transf. training score (R^2): %.2f%%' % (self.training_score))
                print('Transf. test score (R^2): %.2f%%' % (self.testing_score))
                print('Future test score (R^2): %.2f%%' % (self.future_testing_score))
                input()
def test_pca_on_uncentered_data():
    pca1 = PCA(solver='svd')
    pca1.fit(X)

    pca2 = PCA(solver='eigen')
    pca2.fit(X)

    assert_almost_equal(pca1.e_vals_normalized_, pca2.e_vals_normalized_)
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)
    expected = [2.93035378, 0.92740362, 0.14834223, 0.02074601]
    assert_almost_equal(pca.e_vals_, expected, decimal=5)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, expected, decimal=5)
def get_model(self):
    self.pipe_pca = make_pipeline(
        StandardScaler(),
        PrincipalComponentAnalysis(n_components=3),
        GaussianHMM(n_components=3, covariance_type='full', random_state=7))
    self.pipe_pca.fit(self.train[['return'] + self.features])

    model = self.pipe_pca.steps[2][1]
    results = []
    for i in range(3):
        # mean and variance of the first PCA component for each hidden state
        result = [i, model.means_[i][0], np.diag(model.covars_[i])[0]]
        results.append(result)
    results = pd.DataFrame(results)
    results.columns = ['state', 'train_mean', 'train_var']
    self.results = results.set_index('state')
    self.get_renamed_states()
def test_loadings():
    expect = np.array([[0.9, -0.4, -0.3, 0.],
                       [-0.5, -0.9, 0.1, -0.],
                       [1., -0., 0.1, -0.1],
                       [1., -0.1, 0.2, 0.1]])

    pca = PCA(solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)

    expect = np.array([[-0.9, -0.4, 0.3, 0.],
                       [0.4, -0.9, -0.1, -0.],
                       [-1., -0., -0.1, -0.1],
                       [-1., -0.1, -0.2, 0.1]])

    pca = PCA(solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)
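# The loadings relate each original variable to each component. A small
# sketch of the usual relationship, assuming (as mlxtend's docs describe)
# that the loadings are the eigenvectors scaled by the square roots of the
# eigenvalues; uses the `X_std` fixture above:
pca_demo = PCA(solver='eigen')
pca_demo.fit(X_std)
manual_loadings = pca_demo.e_vecs_ * np.sqrt(pca_demo.e_vals_)
np.testing.assert_almost_equal(pca_demo.loadings_, manual_loadings)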
def get_trained_pipelines(train):
    train_dfs = np.array_split(train, n_subsets)
    int_name = 0
    pipelines = []
    for train_subset in train_dfs:
        try:
            pipe_pca = make_pipeline(StandardScaler(),
                                     PrincipalComponentAnalysis(n_components=n_components),
                                     GMMHMM(n_components=n_components,
                                            covariance_type='full', n_iter=150,
                                            random_state=7),
                                     )
            pipe_pca.fit(train_subset[features])
            train['state'] = pipe_pca.predict(train[features])
            # rank states by mean return so the state labels are comparable
            # across the independently trained pipelines
            results = pd.DataFrame(train.groupby(by=['state'])['return'].mean().sort_values())
            results['new_state'] = list(range(n_components))
            results.columns = ['mean', 'new_state']
            results = results.reset_index()
            results['name'] = int_name
            int_name = int_name + 1
            pipelines.append([pipe_pca, results])
        except Exception as e:
            # print('make trained pipelines exception', e)
            pass
    return pipelines
def test_fail_array_fit():
    # a single 1D sample is not valid input; fit should raise a ValueError
    pca = PCA(n_components=2)
    with pytest.raises(ValueError):
        pca.fit(X[1])
def test_fail_array_transform():
    # transform should likewise reject a single 1D sample
    pca = PCA(n_components=2)
    pca.fit(X)
    with pytest.raises(ValueError):
        exp = pca.transform(X[1])
def test_eigen_vs_svd():
    pca = PCA(n_components=2, solver='eigen')
    eigen_res = pca.fit(X).transform(X)

    pca = PCA(n_components=2, solver='svd')
    svd_res = pca.fit(X).transform(X)

    # eigenvectors are sign-indeterminate, so compare absolute values
    assert_allclose(np.absolute(eigen_res), np.absolute(svd_res), atol=0.0001)
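# Why the absolute values: each principal axis is only defined up to sign, so
# the two solvers may return components flipped relative to each other. A
# minimal sketch (not part of the test suite) that aligns the signs per
# column before comparing; assumes the `X` fixture above and nonzero entries
# in the first row:
eigen = PCA(n_components=2, solver='eigen').fit(X).transform(X)
svd = PCA(n_components=2, solver='svd').fit(X).transform(X)
flip = np.sign(eigen[0]) * np.sign(svd[0])  # per-column sign correction
np.testing.assert_allclose(eigen, svd * flip, atol=1e-4)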
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    res = pca.fit(X).transform(X)
    assert_almost_equal(pca.e_vals_, [2.93, 0.93, 0.15, 0.02], decimal=2)
def test_default_2components():
    pca = PCA(n_components=2)
    res = pca.fit(X).transform(X)
    assert res.shape[1] == 2
def test_default_components():
    pca = PCA(n_components=0)
    pca.fit(X)
    res = pca.fit(X).transform(X)
def plot_pca_correlation_graph(X, variables_names, dimensions=(1, 2),
                               figure_axis_size=6, X_pca=None):
    """
    Compute the PCA for X and plot the correlation graph.

    Parameters
    ----------
    X : 2d array like.
        The columns represent the different variables and the rows are the
        samples of those variables
    variables_names : array like
        Name of the columns (the variables) of X
    dimensions : tuple with two elements.
        Dimensions to be plotted (x, y)
    X_pca : optional.
        If not provided, compute PCA independently
    figure_axis_size : size of the final frame.
        The figure created is a square with length and width equal to
        figure_axis_size.

    Returns
    ----------
    matplotlib_figure, correlation_matrix
    """
    X = np.array(X)
    X = X - X.mean(axis=0)
    n_comp = max(dimensions)
    if X_pca is None:
        pca = PrincipalComponentAnalysis(n_components=n_comp)
        pca.fit(X)
        X_pca = pca.transform(X)
    # note: if `X_pca` is passed in, `pca` is undefined when computing the
    # explained-variance ratio below; a later revision of this function adds
    # an `explained_variance` parameter to handle that case

    corrs = create_correlation_table(X_pca, X,
                                     ['Dim ' + str(i + 1) for i in range(n_comp)],
                                     variables_names)
    tot = sum(pca.e_vals_)
    explained_var_ratio = [(i / tot) * 100 for i in pca.e_vals_]

    # Plotting circle
    fig_res = plt.figure(figsize=(figure_axis_size, figure_axis_size))
    plt.Circle((0, 0), radius=1, color='k', fill=False)
    circle1 = plt.Circle((0, 0), radius=1, color='k', fill=False)
    fig = plt.gcf()
    fig.gca().add_artist(circle1)

    # Plotting arrows
    texts = []
    for name, row in corrs.iterrows():
        x = row['Dim ' + str(dimensions[0])]
        y = row['Dim ' + str(dimensions[1])]
        plt.arrow(0.0, 0.0, x, y, color='k', length_includes_head=True,
                  head_width=.05)
        plt.plot([0.0, x], [0.0, y], 'k-')
        texts.append(plt.text(x, y, name, fontsize=2 * figure_axis_size))

    # Plotting vertical lines
    plt.plot([-1.1, 1.1], [0, 0], 'k--')
    plt.plot([0, 0], [-1.1, 1.1], 'k--')

    # Adjusting text
    adjust_text(texts)

    # Setting limits and title
    plt.xlim((-1.1, 1.1))
    plt.ylim((-1.1, 1.1))
    plt.title("Correlation Circle", fontsize=figure_axis_size * 3)
    plt.xlabel("Dim " + str(dimensions[0]) + " (%s%%)" %
               str(explained_var_ratio[dimensions[0] - 1])[:4].lstrip("0."),
               fontsize=figure_axis_size * 2)
    plt.ylabel("Dim " + str(dimensions[1]) + " (%s%%)" %
               str(explained_var_ratio[dimensions[1] - 1])[:4].lstrip("0."),
               fontsize=figure_axis_size * 2)
    return fig_res, corrs
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert math.isclose(np.sum(pca.e_vals_normalized_), 1.)
    assert math.isclose(np.sum(pca.e_vals_normalized_ < 0.), 0, abs_tol=1e-10)
# scores = cross_val_score(dt, X_pca[nonmissings, :], xtrain[nonmissings, i],
#                          scoring='neg_mean_absolute_error', cv=5, verbose=5)
# print(scores)
np.savetxt('../../contest_data/xtest_tree_imputed.csv', xtrain, delimiter=',')

from sklearn.naive_bayes import MultinomialNB

gnb = MultinomialNB()
scores = cross_val_score(gnb, xtrain, ytrain, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())

pca = PCA(n_components=300)
xtrain_pca = pca.fit(xtrain[:, 500:]).transform(xtrain[:, 500:])
# note: the PCA is refit on the training block before transforming the test
# block, so both projections use the same fitted components
xtest_pca = pca.fit(xtrain[:, 500:]).transform(xtest[:, 500:])

# imputing test data; `lin` is a regressor defined earlier in the original
# script (assumed to be something like sklearn's LinearRegression)
for i in range(500):
    print(i)
    train_missings = np.isnan(xtrain[:, i])
    train_nonmissings = ~train_missings
    test_missings = np.isnan(xtest[:, i])
    test_nonmissings = ~test_missings
    xtest[test_missings, i] = lin.fit(xtrain_pca[train_nonmissings, :],
                                      xtrain[train_nonmissings, i]).predict(xtest_pca[test_missings, :])

np.savetxt('../../contest_data/xtest_linear_imputed.csv', xtest, delimiter=',')
from sklearn.ensemble import ExtraTreesClassifier
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt

X = np.genfromtxt('../../contest_data/xtrain_linear_imputed.csv', delimiter=',')
y = np.genfromtxt('../../contest_data/train.csv', delimiter=',')[1:, -1]

pca = PCA(n_components=1000)
X_pca = pca.fit(X).transform(X)

et = ExtraTreesClassifier(n_estimators=1000, max_depth=None, random_state=0, verbose=0)
scores = cross_val_score(et, X_pca, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())

et = ExtraTreesClassifier(n_estimators=300, max_depth=None, random_state=0, verbose=1)
scores = cross_val_score(et, X, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())

'''
components=1000, estimators=1000 gives 32.6% f1
'''

pca = PCA(n_components=1000)
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert np.sum(pca.e_vals_normalized_) == 1.
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0
def test_default_components():
    pca = PCA()
    res = pca.fit(X_std).transform(X_std)
    assert res.shape[1] == 4
# Blue half moon
plt.scatter(X[y == 1, 0], X[y == 1, 1],  # start and peak/trough of each 'moon'
            color='blue', marker='^', alpha=0.5)
plt.xlabel('x coordinate')
plt.ylabel('y coordinate')
# plt.show()
plt.savefig('../figs/tutorial/mlxtendex1_1.png')
plt.close()

# The moons are linearly inseparable, so standard linear PCA will fail to
# accurately represent the data in 1D space.

# Use PCA for dimensionality reduction; specify the number of components
pca = PCA(n_components=2)
# Transform X in accordance with the 2-component PCA
X_pca = pca.fit(X).transform(X)

# Red half moon
plt.scatter(X_pca[y == 0, 0], X_pca[y == 0, 1],  # start and peak/trough of each 'moon'
            color='red', marker='o', alpha=0.5)
# Blue half moon
plt.scatter(X_pca[y == 1, 0], X_pca[y == 1, 1],  # start and peak/trough of each 'moon'
            color='blue', marker='^', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
# plt.show()
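# Since linear PCA cannot untangle the moons, a kernelized PCA is the usual
# next step. A minimal sketch using scikit-learn's KernelPCA (not part of the
# original tutorial; gamma=15 is an assumed, illustrative value):
from sklearn.decomposition import KernelPCA

kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
X_kpca = kpca.fit_transform(X)  # the classes become linearly separable along PC1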
def run_pipeline(self, production=False):
    self.pipeline_failed = True
    self.max_score = -np.inf
    self.max_correl = -np.inf

    # create pipeline
    pipe_pca = make_pipeline(StandardScaler(),
                             PrincipalComponentAnalysis(n_components=self.pca_n_components),
                             # GMMHMM(n_components=3, covariance_type='full'))
                             GaussianHMM(n_components=3, covariance_type='full'))

    exp_num = 0
    if self.run_type == 'find_features':
        print('finding features')
    while exp_num < self.n_experiments:
        train = self.clean_train.copy()
        test = self.clean_test.copy()
        means = []
        stddevs = []
        scores = []
        correls = []
        if self.run_type == 'find_features':
            # choose features
            shuffle(self.starting_features)
            test_cols = ['return'] + self.starting_features[0:self.k_features]
            if 'stoch' not in str(test_cols):
                continue
        elif self.run_type == 'production' or self.run_type == 'rolling_test':
            test_cols = self.features_found

        # test features on training dataset
        pipe_pca.fit(train[test_cols])
        try:
            # score() returns the total log-likelihood, so exp(score / n) is
            # the geometric-mean per-sample likelihood
            self.train_score = np.exp(pipe_pca.score(train[test_cols]) / len(train)) * 100
        except Exception:
            self.train_score = None
        train['state'] = pipe_pca.predict(train[test_cols])
        train = self.rename_states(train)
        if train is None:
            continue
        criteria_check = self.check_criteria(train)
        if criteria_check == False:
            continue

        # get the correlation between state and next-day percent changes
        train['next_day'] = train['close'].shift(-1) / train['close'] - 1
        train_means = train.dropna().groupby(by='state')[['return', 'next_day']].mean() * 100
        train_correl = train_means.corr()
        self.train_correl = train_correl['return']['next_day']

        # do the same for the test data
        pipe_pca.fit(test[test_cols])
        try:
            self.test_score = np.exp(pipe_pca.score(test[test_cols]) / len(test)) * 100
        except Exception:
            self.test_score = None
        test['state'] = pipe_pca.predict(test[test_cols])
        test = self.rename_states(test)

        if self.run_type == 'production':
            self.new_predictions = test.tail(30)
            return

        if self.run_type == 'find_features':
            if test is None:
                continue
            criteria_check = self.check_criteria(test)
            if criteria_check == False:
                continue

        # get the correlation between state and next-day percent changes
        test['next_day'] = test['close'].shift(-1) / test['close'] - 1
        test_means = test.dropna().groupby(by='state')[['return', 'next_day']].mean() * 100
        test_correl = test_means.corr()
        self.test_correl = test_correl['return']['next_day']

        exp_num = exp_num + 1
        if (self.train_correl > self.max_correl and self.test_correl > 0) or self.run_type == 'rolling_test':
            self.train_predicted = train
            self.test_predicted = test
            self.features_found = test_cols
            self.train_means = train_means
            self.test_means = test_means
            # print('model found on experiment number', exp_num)
            # print(self.features_found)
            self.max_correl = self.train_correl
            self.pipeline_failed = False
def plot_pca_correlation_graph(X, variables_names, dimensions=(1, 2),
                               figure_axis_size=6, X_pca=None,
                               explained_variance=None):
    """
    Compute the PCA for X and plot the correlation graph.

    Parameters
    ----------
    X : 2d array like.
        The columns represent the different variables and the rows are the
        samples of those variables
    variables_names : array like
        Name of the columns (the variables) of X
    dimensions : tuple with two elements.
        Dimensions to be plotted (x, y)
    figure_axis_size : size of the final frame.
        The figure created is a square with length and width equal to
        figure_axis_size.
    X_pca : np.ndarray, shape = [n_samples, n_components]. Optional.
        `X_pca` is the matrix of the transformed components from X.
        If not provided, the function computes PCA automatically using
        mlxtend.feature_extraction.PrincipalComponentAnalysis
        Expected `n_components >= max(dimensions)`
    explained_variance : 1 dimensional np.ndarray, length = n_components
        Optional. `explained_variance` are the eigenvalues from the
        diagonalized covariance matrix on the PCA transformation.
        If not provided, the function computes PCA independently.
        Expected `n_components == X.shape[1]`

    Returns
    ----------
    matplotlib_figure, correlation_matrix

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/plotting/plot_pca_correlation_graph/

    """
    X = np.array(X)
    X = X - X.mean(axis=0)
    n_comp = max(dimensions)

    if (X_pca is None) and (explained_variance is None):
        pca = PrincipalComponentAnalysis(n_components=n_comp)
        pca.fit(X)
        X_pca = pca.transform(X)
        explained_variance = pca.e_vals_
    elif (X_pca is not None) and (explained_variance is None):
        raise ValueError("If `X_pca` is not None, the `explained_variance`"
                         " values should not be `None`.")
    elif (X_pca is None) and (explained_variance is not None):
        raise ValueError("If `explained_variance` is not None, the `X_pca`"
                         " values should not be `None`.")
    elif (X_pca is not None) and (explained_variance is not None):
        if X_pca.shape[1] != len(explained_variance):
            raise ValueError(f"Number of principal components must match the"
                             f" number of eigenvalues. Got"
                             f" {X_pca.shape[1]} != {len(explained_variance)}")
        if X_pca.shape[1] < n_comp:
            raise ValueError(f"Input array `X_pca` contains fewer principal"
                             f" components than expected based on"
                             f" `dimensions`. Got {X_pca.shape[1]} components"
                             f" in X_pca, expected at least"
                             f" `max(dimensions)={n_comp}`.")
        if len(explained_variance) < n_comp:
            raise ValueError(f"Input array `explained_variance` contains"
                             f" fewer elements than expected. Got"
                             f" {len(explained_variance)} elements, expected"
                             f" `X.shape[1]={X.shape[1]}`.")

    corrs = create_correlation_table(X_pca, X,
                                     ['Dim ' + str(i + 1) for i in range(n_comp)],
                                     variables_names)
    tot = sum(X.var(0)) * X.shape[0] / (X.shape[0] - 1)
    explained_var_ratio = [(i / tot) * 100 for i in explained_variance]

    # Plotting circle
    fig_res = plt.figure(figsize=(figure_axis_size, figure_axis_size))
    plt.Circle((0, 0), radius=1, color='k', fill=False)
    circle1 = plt.Circle((0, 0), radius=1, color='k', fill=False)
    fig = plt.gcf()
    fig.gca().add_artist(circle1)

    # Plotting arrows
    texts = []
    for name, row in corrs.iterrows():
        x = row['Dim ' + str(dimensions[0])]
        y = row['Dim ' + str(dimensions[1])]
        plt.arrow(0.0, 0.0, x, y, color='k', length_includes_head=True,
                  head_width=.05)
        plt.plot([0.0, x], [0.0, y], 'k-')
        texts.append(plt.text(x, y, name, fontsize=2 * figure_axis_size))

    # Plotting vertical lines
    plt.plot([-1.1, 1.1], [0, 0], 'k--')
    plt.plot([0, 0], [-1.1, 1.1], 'k--')

    # Adjusting text
    adjust_text(texts)

    # Setting limits and title
    plt.xlim((-1.1, 1.1))
    plt.ylim((-1.1, 1.1))
    plt.title("Correlation Circle", fontsize=figure_axis_size * 3)
    plt.xlabel("Dim " + str(dimensions[0]) + " (%s%%)" %
               str(explained_var_ratio[dimensions[0] - 1])[:4],
               fontsize=figure_axis_size * 2)
    plt.ylabel("Dim " + str(dimensions[1]) + " (%s%%)" %
               str(explained_var_ratio[dimensions[1] - 1])[:4],
               fontsize=figure_axis_size * 2)
    return fig_res, corrs
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert_almost_equal(np.sum(pca.e_vals_normalized_), 1.)
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0