def fit_base_model(classifiers, fully, dummyY, trainx, testx): """ Takes a list of classifiers and/or PLS regression and does dimension reduction by returning the predictions of the classifiers or first two scores of the PLS regression on bootstrapped subsamples of the data.""" trainProbs = [] testProbs = [] iterations = 0 for clf in classifiers: for i in range(clf[1]): iterations += 1 print(iterations) print(clf[0]) train_rows = np.random.choice(trainx.shape[0], round(trainx.shape[0] * base_prop), True) oob_rows = list(set(range(trainx.shape[0])) - set(train_rows)) print(len(train_rows)) print(len(oob_rows)) x = trainx[train_rows, :] if clf[0] == 'PLS': y = dummyY[train_rows, :] mod = PLSRegression().fit(x, y) trainscores = mod.transform(trainx) testscores = mod.transform(testx) trainProbs.append(trainscores[:, 0]) trainProbs.append(trainscores[:, 1]) testProbs.append(testscores[:, 0]) testProbs.append(testscores[:, 1]) else: y = fully[train_rows] print('\t Fitting model...') mod = clf[0].fit(x, y) print('\t Predicting training results...') tpreds = mod.predict_proba(trainx) trainProbs.append(list(tpreds[:, 1])) print('\t Predicting test results...') testProbs.append(list(mod.predict_proba(testx)[:, 1])) print('\t OOB score: ' + str(log_loss(fully[oob_rows], tpreds[oob_rows, :]))) return trainProbs, testProbs
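# --- Usage sketch (not from the original source) for fit_base_model above ---
# It assumes `classifiers` is a list of (estimator-or-'PLS', n_bootstrap_fits) tuples,
# that `base_prop` is a module-level bootstrap fraction used inside the function, and
# that numpy/sklearn (PLSRegression, log_loss) are imported there; data is synthetic.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

base_prop = 0.8  # hypothetical bootstrap fraction referenced inside fit_base_model

rng = np.random.RandomState(0)
trainx = rng.rand(200, 20)
testx = rng.rand(50, 20)
fully = rng.randint(0, 2, 200)          # binary labels
dummyY = np.eye(2)[fully]               # one-hot targets used by the 'PLS' branch

classifiers = [(RandomForestClassifier(n_estimators=50, random_state=0), 2),
               (LogisticRegression(max_iter=1000), 2),
               ('PLS', 1)]

train_meta, test_meta = fit_base_model(classifiers, fully, dummyY, trainx, testx)
print(len(train_meta), len(test_meta))  # one meta-feature per classifier fit, two per PLS fit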
def do_pls(X, Y): pls2 = PLSRegression(n_components=2) pls2.fit(X,Y) out = pls2.transform(X) print(out) print(out.shape) plt.title("PLS2") plt.xlabel("PL1") plt.ylabel("PL2") plt.grid(); plt.scatter(out[:, 0], out[:, 1], c=Y, cmap='viridis') plt.savefig('pls.png', dpi=125)
def pls_approach(): from sklearn.cross_decomposition import PLSRegression (X, Y), cities = pull_xy_data() pls = PLSRegression() pls.fit(X, Y) plsX, plsY = pls.transform(X, Y) plot(plsX, cities, ["Lat01", "Lat02", "Lat03"], ellipse_sigma=1) return "OK What Now?"
def hacerPLS(X, Y):
    pls_wild_b = PLSRegression(n_components=9)
    pls_wild_b.fit(X, Y)
    Z = pls_wild_b.transform(X)
    scores = list()
    scores_std = list()
    n_features = np.shape(X)[1]
    X, X_test_tot, Y, Y_test_tot = cross_validation.train_test_split(
        X, Y, test_size=0.5, random_state=0)
    N = np.shape(X)[0]
    # Select the number of components by cross-validation
    for num_comp in range(n_features):
        kf = KFold(N, n_folds=10)
        aux_scores = list()
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
            if num_comp == 0:
                # Baseline: predict the mean of the test fold
                y_pred = np.mean(y_test)
                y_pred = y_pred * np.ones(np.shape(y_test))
                aux_scores.append(metrics.mean_squared_error(y_test, y_pred))
            else:
                pls_foo = PLSRegression(n_components=num_comp)
                pls_foo.fit(X_train, y_train)
                y_pred = pls_foo.predict(X_test)
                # obtain the score
                this_score = metrics.mean_squared_error(y_test, y_pred)
                aux_scores.append(this_score)
        scores.append(np.mean(aux_scores))
        scores_std.append(np.std(aux_scores))
    plt.plot(scores)
    plt.xlabel('Components')
    plt.ylabel("$MSE$")
    plt.title("Animals PLS")
    plt.show()
    # Refit with the CV-selected number of components (index 0 is the mean baseline)
    num_comp = max(np.argmin(scores), 1)
    pls_pred = PLSRegression(n_components=num_comp)
    pls_pred.fit(X, Y)
    y_pred_test = pls_pred.predict(X_test_tot)
    print "MSE test = " + str(metrics.mean_squared_error(Y_test_tot, y_pred_test))
def reduce_PLS(dataframe): PLS_file="data/pls_structure.pickle" selectedcolumn=[x for x in dataframe.columns if x not in ["id","click","device_id","device_ip"]] X=np.array(dataframe[selectedcolumn]) y=np.array(dataframe["click"]) if os.path.exists(PLS_file): stand_PLS=pickle.load(open(PLS_file,'rb')) print "PLS structure is loaded." else: stand_PLS=PLSRegression(n_components=10,scale=True) stand_PLS.fit(X, y[:,np.newaxis]) stand_PLS.y_scores_=None stand_PLS.x_scores_=None pickle.dump(stand_PLS,open(PLS_file,"wb")) print "PLS transform structure is stored." T=stand_PLS.transform(X) print "PLS transformation is performed." return T
class PLS_method(DR_Technique): r"""Partial Least Squares dimension reduced subspace This computes reduced subspace by: (1) standardizing x and y (1) applying 2-blocks regression PLS2 over x and y Example: >>> DR_model=PLS_method(0,[0,1]) >>> DR_model.calculate(train_x,train_y) """ def __init__(self, dim_DR, orig_range): r"""Args: dim_DR: number of dimensions to reduce to orig_range: the bounds of the original subspace """ if dim_DR != 0 and isinstance(dim_DR, int): super().__init__('PLS', dim_DR, orig_range) else: raise ValueError( 'dim_DR cannot equal 0 for PLS or is not an integer') #need to save mean and std for later encode/decode self.x_mean = None self.x_std = None self.y_mean = None self.y_std = None def calculate(self, train_x, train_y): ###assumes train_x or train_y is not standardized ############################################ #Step 1: calc params for later use # ############################################ self.x_mean = train_x.mean(axis=0) self.x_std = train_x.std(axis=0) self.x_std[self.x_std == 0.0] = 1.0 self.y_mean = train_y.mean(axis=0) self.y_std = train_y.std(axis=0) self.y_std[self.y_std == 0.0] = 1.0 ############################################ #Step 2: Create instance and fit PLS # ############################################ self.Model = PLSRegression(n_components=self.dim_DR) #automatically standardizes everything for us self.Model.fit(train_x, train_y) ############################################ #Step 3: Determine reduced subspace bounds # ############################################ DR = self.Model.transform(train_x) #will standardize for us self.DR_range = np.c_[np.min(DR, axis=0) * 1.1, np.max(DR, axis=0) * 1.1] #pad a bit return print('PLS model created') def Decode_X(self, DR_input): ##################################################### # Convert DR-> orig # ##################################################### xhat_n = np.dot(DR_input, self.Model.x_rotations_.T) #convert back to original domain xhat = (xhat_n * self.x_std) + self.x_mean #verify and correct domain boundaries return self.Enforce_Bounds(xhat) def Encode_X(self, x_set): ##################################################### # Convert orig->DR # ##################################################### #standardize N(0,1) X_0 = np.divide(x_set - self.x_mean, self.x_std) #convert to DR return np.dot(X_0, self.Model.x_weights_) def Pred_Y(self, DR_set): ##################################################### # predict Y from DR # ##################################################### #Y=TQ'+F yhat_n = np.dot(DR_set, self.Model.y_loadings_.T) if len(yhat_n) == 1: yhat_n = yhat_n[:, None] #destandardize return (yhat_n * self.y_std) + self.y_mean
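# --- Usage sketch (illustrative, not part of the original project) for PLS_method above ---
# It assumes the DR_Technique base class (providing Enforce_Bounds and the attributes set
# in its __init__) is importable from the same project; the data here is synthetic.
import numpy as np

rng = np.random.RandomState(0)
train_x = rng.uniform(-1.0, 1.0, size=(100, 5))
train_y = train_x[:, :1] + 0.1 * rng.randn(100, 1)

orig_range = np.array([[-1.0, 1.0]] * 5)   # bounds of the original 5-D space
DR_model = PLS_method(2, orig_range)       # dim_DR must be a non-zero integer
DR_model.calculate(train_x, train_y)       # fits PLSRegression and sets DR_range

z = DR_model.Encode_X(train_x[:5])         # original -> reduced coordinates
x_back = DR_model.Decode_X(z)              # reduced -> original, bounds enforced
y_hat = DR_model.Pred_Y(z)                 # predict y directly from reduced coordinates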
plt.xlim(1,np.amax(nComponents)) plt.title('PLS Cannonical accuracy') plt.xlabel('Number of components') plt.ylabel('accuracy') plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right') plt.grid(True) if (0): #%% PLS Regression nComponents = np.arange(1,nClasses+1) plsRegScores = np.zeros((5,np.alen(nComponents))) for i,n in enumerate(nComponents): plsReg = PLSRegression(n_components=n) plsReg.fit(Xtrain,Ytrain) XtrainT = plsReg.transform(Xtrain) XtestT = plsReg.transform(Xtest) plsRegScores[:,i] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest) plsReg = PLSRegression(n_components=2) plsReg.fit(Xtrain,Ytrain) xt = plsReg.transform(Xtrain) fig = plt.figure() util.plotData(fig,xt,labelsTrain,classColors) plt.title('First 2 components of projected data') #%% Plot accuracies for PLSSVD plt.figure() for i in range (5):
class MspmPartialLeastSquares:
    """
    This module is to construct a partial least squares (PLS) model for feature analysis.

    Parameters
    ----------
    x (n_samples, n_features) – The training input samples
    y (n_samples, n_targets) – The training target samples
    n_components – The number of feature scores
    preprocess (default = True) – the preprocessing of the data

    Attributes
    ----------
    pls - model of PLS

    Example
    -------
    >>> from sklearn.datasets import load_iris
    >>> from pypm.models.mspm_partial_least_squares import MspmPartialLeastSquares
    >>> data = load_iris()
    >>> x = data.data
    array([[5.1, 3.5, 1.4, 0.2]...
    >>> y = data.target
    array([0, 0, 0, 0, 0, 0, 0...
    >>> PLS_model = MspmPartialLeastSquares(x, y, 3)
    >>> PLS_model.construct_pls_model()
    >>> Features = PLS_model.extract_pls_feature(x)
    array([[-2.26393268e+00, 1.74075256e-01, 3.62141834e-01]...
    >>> Prediction = PLS_model.pls_predict(x)
    array([[-8.05094197e-02]...
    """

    def __init__(self, x, y, n_components, preprocess=True):
        self.x = x
        self.y = y
        self.preprocess = preprocess
        self.n_components = n_components
        if self.preprocess:
            self.Xscaler = preprocessing.StandardScaler().fit(self.x)
            self.x = self.Xscaler.transform(self.x)

    def construct_pls_model(self):
        """
        Function to construct a PLS model.
        """
        self.pls = PLSRegression(self.n_components)
        self.pls.fit(self.x, self.y)

    def extract_pls_feature(self, x_test):
        """
        Function to extract the PLS feature of given data using the well-trained PLS model.

        Parameters
        ----------
        x_test (_, n_features) - The testing samples
        """
        if self.preprocess:
            x_test = self.Xscaler.transform(x_test)
        return self.pls.transform(x_test)

    def pls_predict(self, x_test):
        if self.preprocess:
            x_test = self.Xscaler.transform(x_test)
        return self.pls.predict(x_test)
        plt.ylabel('1st component')
    elif i == 1:
        plt.ylabel('2nd component')
    else:
        plt.ylabel('3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
    print "inside the loop"

# Select the number of components using CV
#%%
## PLSR
pls_wild_b = PLSRegression(n_components=3)
pls_wild_b.fit(X_train_prepro, Y_train)
X_train_pls_proj = pls_wild_b.transform(X_train_prepro)
print("loadings")
for i in range(pls_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]),
            pls_wild_b.x_loadings_[:, i])
    if i == 0:
        plt.ylabel('PLS 1st component')
    elif i == 1:
        plt.ylabel('PLS 2nd component')
    else:
        plt.ylabel('PLS 3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
from sklearn.lda import LDA lda = LDA() lda.fit(Xtrain,Ytrain) LDA_centroids = lda.means_ # Centroids of the classes (n_class, n_features) Xtrain = lda.transform(Xtrain) Xtest = lda.transform(Xtest) # Linear PLS if (FE_PLS == 1): pls2 = PLSRegression(n_components=n_comp) pls2.fit(Xtrain,Ytrain_m) pls2 Xtrain = pls2.transform(Xtrain) Xtest = pls2.transform(Xtest) # Kernel PLS if (FE_kPLS == 1): d = pair.pairwise_distances(Xtrain,Xtrain) aux = np.triu(d) sigma = np.sqrt(np.mean(np.power(aux[aux!=0],2)*0.5)) gamma = 1/(2*sigma**2) ktrain = pair.rbf_kernel(Xtrain,Xtrain,gamma) ktest = pair.rbf_kernel(Xtest,Xtrain,gamma) kcent = KernelCenterer()
var_index = data.columns.values.tolist() # vector of class responses associated with data resp = load_data.getResponseMatrix1D() resp2 = load_data.getResponseMatrix2D() #### Create object to normalize and un-normalize data norm_trans = pre.StandardScaler().fit(d) data_norm = norm_trans.transform(d) #### Train OPLS opls = OPLS(2, resp2).fit(data_norm, resp) #### Train PLS for comparison pls = PLS(2).fit(data_norm, resp) pls.rotated_data = pls.transform(data_norm) pls.responses = resp2 #### Figures opls.plotProjectionScatterMultiClass(2, labels=["Healthy", "Not Healthy"]) OPLS.plotProjectionScatterMultiClass(pls, 2, labels=["Healthy", "Not Healthy"]) plt.figure() plt.plot(opls.analysis.coef_[:, 0]**2) #plt.plot(opls.analysis.coef_[:,1]**2) plt.title("OPLS Weights") plt.figure() plt.plot(pls.coef_[:, 0]**2) #plt.plot(pls.x_weights_[:,1]**2) plt.title("PLS Weights")
pca = PCA(n_components=j, random_state=np.random.RandomState(0))
pca.fit(x)
x3 = pca.transform(x)
string = "pca_"
pca_column_name = [string + str(i) for i in range(x3.shape[1])]
reduced_df = pd.DataFrame(pca.components_, columns=x.columns, index=pca_column_name)
sig_features = list(set(reduced_df.idxmax(axis=1).values))
print sig_features
df_final = x[sig_features]
pca_df = reduced_df[sig_features]

plsca = PLSRegression(n_components=j)
plsca.fit(x, y)
x_pls = plsca.transform(x)
string = "pls_"
x_pls_column_name = [string + str(i) for i in range(x_pls.shape[1])]
plsca_df = pd.DataFrame(plsca.x_weights_)
plsca_trans = plsca_df.transpose()
x_pls_reduced_df = pd.DataFrame(plsca_trans.values, columns=x.columns,
                                index=x_pls_column_name)
pls_sig_features = list(set(x_pls_reduced_df.idxmax(axis=1).values))
print pls_sig_features

df_trans.reset_index(['CUSTOMER_KEY'], inplace=True)
pls_final = pd.concat([df_trans[pls_sig_features], df_trans['CUSTOMER_KEY']], axis=1)
y.reset_index(['CUSTOMER_KEY'], inplace=True)
df2 = pd.concat([y, pls_final], axis=1)
df2.set_index('CUSTOMER_KEY', inplace=True)
plt.title("PCA") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') lda = LinearDiscriminantAnalysis(n_components=2).fit(x, y) Y = lda.transform(x) ax = fig.add_subplot(243) plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Spectral) plt.title("lda") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') pls = PLSRegression(n_components=2).fit(x, y) Y = pls.transform(x) ax = fig.add_subplot(244) plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Spectral) plt.title("%s" % "PLS") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') Y = manifold.MDS(n_components=2).fit_transform(x) ax = fig.add_subplot(246) plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Spectral) plt.title("mds") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight')
def pls_thing(scenario_data, xcols, ycols, titlestr): #PLS Summary Stats pls = PLSRegression(n_components=3) pls.fit(scenario_data[xcols], scenario_data[ycols]) k = 0 transformed_x_full = pls.transform(scenario_data[xcols]) y = scenario_data[ycols] results = pd.DataFrame(columns=('Case Label', 'Explained Variance Ratio', 'RegressionCoefs', 'Regression R^2', 'SpearmanCorr', 'SpearmanPvalue', 'Loadings', 'X Weights', 'X Loadings', 'X Scores')) if type(titlestr) == type([]): titlestr = ' '.join(titlestr) #Linear fits for each individual component for c in range(np.shape(pls.x_weights_)[1]): x_transformed_1pc = transformed_x_full[:, k].reshape(-1, 1) lr = linear_model.LinearRegression(fit_intercept=True, normalize=True) lr.fit(x_transformed_1pc, y) print('Regression Coefs', lr.coef_) print('R^2', lr.score(x_transformed_1pc, y)) print('Spearman: ', scipy.stats.spearmanr(x_transformed_1pc, y)) print('Component: ', c) results.loc[len(results)] = np.nan results.loc[len(results) - 1, 'Case Label'] = titlestr + ' Component ' + str(k) # results.loc[len(results)-1,'Explained Variance Ratio'] = pls.explained_variance_ratio_[k] results.set_value(len(results) - 1, 'RegressionCoefs', lr.coef_) results.loc[len(results) - 1, 'Regression R^2'] = lr.score(x_transformed_1pc, y) results.loc[len(results) - 1, 'SpearmanCorr'] = scipy.stats.spearmanr( x_transformed_1pc, y)[0] results.loc[len(results) - 1, 'SpearmanPvalue'] = scipy.stats.spearmanr( x_transformed_1pc, y)[1] results.set_value(len(results) - 1, 'X Weights', pls.x_weights_[:, k]) results.set_value( len(results) - 1, 'X Loadings', pls.x_loadings_[:, k]) results.set_value(len(results) - 1, 'X Scores', pls.x_scores_[:, k]) plt.plot(x_transformed_1pc, y, '*') plt.xlabel('Component ' + str(k)) plt.ylabel('Performance') plt.title('PLS ' + titlestr) plt.show() k += 1 print(results) fig = plt.figure() ax = fig.add_subplot(111) ax.set_title("PLS PC0 vs PC1 vs Performance " + ' '.join(cs), fontsize=14) ax.set_xlabel("PC0", fontsize=12) ax.set_ylabel("PC1", fontsize=12) ax.scatter(transformed_x_full[:, 0], transformed_x_full[:, 1], s=100, c=y, marker='*', cmap=cm.bwr) plt.show() fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(transformed_x_full[:, 0], transformed_x_full[:, 1], transformed_x_full[:, 2], s=100, c=y, marker='*', cmap=cm.bwr) ax.set_title("PLS PC0 vs PC1 vs PC2 vs Performance " + ' '.join(cs), fontsize=14) ax.set_xlabel("PC0", fontsize=12) ax.set_ylabel("PC1", fontsize=12) ax.set_zlabel("PC2", fontsize=12) plt.show() print(results) return results
pcr.fit(X_train, y_train) pca = pcr.named_steps["pca"] # retrieve the PCA step of the pipeline pls = PLSRegression(n_components=1) pls.fit(X_train, y_train) fig, axes = plt.subplots(1, 2, figsize=(10, 3)) axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth") axes[0].scatter( pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions" ) axes[0].set( xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA" ) axes[0].legend() axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth") axes[1].scatter( pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions" ) axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS") axes[1].legend() plt.tight_layout() plt.show() # %% # As expected, the unsupervised PCA transformation of PCR has dropped the # second component, i.e. the direction with the lowest variance, despite # it being the most predictive direction. This is because PCA is a completely # unsupervised transformation, and results in the projected data having a low # predictive power on the target. #
from sklearn.cross_decomposition import PLSRegression from sklearn.decomposition import PCA, TruncatedSVD pca = PCA(n_components=8) pca_feats = [3, 5, 10, 14, 18, 19, 22, 23, 25, 26, 27] train_pca_df = pd.DataFrame([]) test_pca_df = pd.DataFrame([]) for feat in pca_feats: feat_label = "F" + str(feat) train_pca_df[feat_label] = train_features[feat_label] test_pca_df[feat_label] = test_features[feat_label] pls = PLSRegression(n_components=8) # This works good for the log reg model pls.fit(train_pca_df, train_y) train_feats_pls = pd.DataFrame(pls.transform(train_pca_df), index=train_features.index) test_feats_pls = pd.DataFrame(pls.transform(test_pca_df), index=test_features.index) #%% Replace pca feats with new feats for feat in pca_feats: feat_label = "F" + str(feat) train_features = train_features.drop([feat_label], axis=1) test_features = test_features.drop([feat_label], axis=1) train_features = pd.concat([train_features, train_feats_pls], axis=1) test_features = pd.concat([test_features, test_feats_pls], axis=1) #%% Logistic Regression on the initial features lr = LogisticRegression()
def plot_pcr_vs_pls(): rng = np.random.RandomState(0) n_samples = 500 cov = [[3, 3], [3, 4]] X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples) pca = PCA(n_components=2).fit(X) plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples') for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)): comp = comp * var # scale component by its variance explanation power plt.plot([0, comp[0]], [0, comp[1]], label=f"Component {i}", linewidth=5, color=f"C{i + 2}") plt.gca().set(aspect='equal', title="2-dimensional dataset with principal components", xlabel='first feature', ylabel='second feature') plt.legend() plt.show() y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2 fig, axes = plt.subplots(1, 2, figsize=(10, 3)) axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3) axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y') axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3) axes[1].set(xlabel='Projected data onto second PCA component', ylabel='y') plt.tight_layout() plt.show() X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression()) pcr.fit(X_train, y_train) pca = pcr.named_steps['pca'] # retrieve the PCA step of the pipeline pls = PLSRegression(n_components=1) pls.fit(X_train, y_train) fig, axes = plt.subplots(1, 2, figsize=(10, 3)) axes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth') axes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3, label='predictions') axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y', title='PCR / PCA') axes[0].legend() axes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth') axes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3, label='predictions') axes[1].set(xlabel='Projected data onto first PLS component', ylabel='y', title='PLS') axes[1].legend() plt.tight_layout() plt.show() print(f"PCR r-squared {pcr.score(X_test, y_test):.3f}") print(f"PLS r-squared {pls.score(X_test, y_test):.3f}") pca_2 = make_pipeline(PCA(n_components=2), LinearRegression()) pca_2.fit(X_train, y_train) print(f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}")
class PLS: def __init__(self, params): self.name = "pls" self.model = PLSRegression(n_components=params['n_components']) self.target_col = None def _format_data(self, data_map): if self.target_col is None: raise ValueError("Target col is None!") order = sorted(list(data_map.keys())) X = {k:data_map[k] for k in order if k != self.target_col} inputs = np.concatenate([X[k] for k in X], axis=1) return inputs def fit(self, train_map, target_col, valid_fraction=0.2, use_cv=False): print("Formatting data") self.target_col = target_col y = train_map[target_col].values X = self._format_data(train_map) splitpoint = int(y.shape[0]*(1-valid_fraction)) y_valid, X_valid = y[splitpoint:], X[splitpoint:] y, X = y[:splitpoint], X[:splitpoint] if use_cv: print('Running grid search') param_dist = self.get_hyperparam_ranges() self.model = select.GridSearchCV(self.model, param_grid=param_dist, cv=3, n_jobs=2) print("Fitting PLS") self.model.fit(X, y) if valid_fraction != 0: print("Scoring on validation data") r2 = self.model.score(X_valid, y_valid) print("R2 for PLS:", r2) return r2 else: print("No validation data") return 0.0 def predict(self, data_map): X = self._format_data(data_map) return self.model.predict(X) def get_latents(self, data_map): X = self._format_data(data_map) return self.model.transform(X) def get_save_name(self, model_folder): return os.path.join(model_folder, self.name+".joblib") def save(self, model_folder): name = self.get_save_name(model_folder) joblib.dump(self.model, name) def load(self, model_folder): name = self.get_save_name(model_folder) self.model = joblib.load(name) @classmethod def get_hyperparam_ranges(cls): params = {'n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 40]} return params
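# --- Usage sketch (not from the original source) for the PLS wrapper class above ---
# The block names, shapes and the pandas/numpy imports are illustrative; the wrapper
# expects a dict of equally indexed 2-D blocks plus a target column.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_map = {
    "block_a": pd.DataFrame(rng.rand(500, 4)),
    "block_b": pd.DataFrame(rng.rand(500, 3)),
    "target":  pd.DataFrame(rng.rand(500, 1)),
}

model = PLS({"n_components": 3})
r2 = model.fit(train_map, target_col="target", valid_fraction=0.2, use_cv=False)
latents = model.get_latents(train_map)   # PLS scores of the concatenated input blocks
preds = model.predict(train_map)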
def plsda(df, a, b, n_components=2, mean_center=False, scale=True, **kwargs):
    """
    Partial Least Squares Discriminant Analysis, based on `sklearn.cross_decomposition.PLSRegression`

    Performs a binary group partial least squares discriminant analysis (PLS-DA) on the
    supplied dataframe, selecting the first ``n_components``.

    Sample groups are defined by the selectors ``a`` and ``b`` which are used to select
    columns from the supplied dataframe. The resulting model is applied to the entire
    dataset, projecting non-selected samples into the same space.

    For more information on PLS regression and the algorithm used, see the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html>`_.

    :param df: Pandas ``DataFrame`` to perform the analysis on
    :param a: Column selector for group a
    :param b: Column selector for group b
    :param n_components: ``int`` number of components to select
    :param mean_center: ``bool`` mean center the data before performing PLS regression
    :param scale: ``bool`` scale the data within the PLS regression
    :param kwargs: additional keyword arguments to `sklearn.cross_decomposition.PLSRegression`
    :return: scores ``DataFrame`` of PLSDA scores n_components x n_samples
             weights ``DataFrame`` of PLSDA weights n_variables x n_components
             loadings ``DataFrame`` of PLSDA loadings n_variables x n_components
    """
    assert sklearn, 'This library depends on scikit-learn (sklearn) to perform PLS-DA'
    from sklearn.cross_decomposition import PLSRegression

    df = df.copy()

    # We have to zero fill, nan errors in PLSRegression
    df[np.isnan(df)] = 0

    if mean_center:
        mean = np.mean(df.values, axis=0)
        df = df - mean

    sxa, _ = df.columns.get_loc_level(a)
    sxb, _ = df.columns.get_loc_level(b)

    dfa = df.iloc[:, sxa]
    dfb = df.iloc[:, sxb]

    dff = pd.concat([dfa, dfb], axis=1)
    y = np.ones(dff.shape[1])
    y[np.arange(dfa.shape[1])] = 0

    plsr = PLSRegression(n_components=n_components, scale=scale, **kwargs)
    plsr.fit(dff.values.T, y)

    # Apply the generated model to the original data
    x_scores = plsr.transform(df.values.T)

    scores = pd.DataFrame(x_scores.T)
    scores.index = [
        'Latent Variable %d' % (n + 1) for n in range(0, scores.shape[0])
    ]
    scores.columns = df.columns

    weights = pd.DataFrame(plsr.x_weights_)
    weights.index = df.index
    weights.columns = [
        'Weights on Latent Variable %d' % (n + 1) for n in range(0, weights.shape[1])
    ]

    loadings = pd.DataFrame(plsr.x_loadings_)
    loadings.index = df.index
    loadings.columns = [
        'Loadings on Latent Variable %d' % (n + 1) for n in range(0, loadings.shape[1])
    ]

    return scores, weights, loadings
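# --- Hypothetical call of plsda() above (not from the original source) ---
# plsda() expects variables as rows and samples as columns, with a column MultiIndex
# whose first level carries the group labels that the selectors `a` and `b` pick out.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
cols = pd.MultiIndex.from_tuples(
    [("Control", i) for i in range(10)] + [("Treated", i) for i in range(10)],
    names=["Group", "Replicate"])
df_demo = pd.DataFrame(rng.rand(50, 20), columns=cols)   # 50 variables x 20 samples

scores, weights, loadings = plsda(df_demo, "Control", "Treated", n_components=2)
print(scores.shape, weights.shape, loadings.shape)       # (2, 20), (50, 2), (50, 2)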
    x3 = lda.transform(x[index_test])
    model.fit(x2, y[index_train])
    predict = model.predict(x3)
    accuracy_lda = metrics.accuracy_score(y[index_test], predict)
    cv_lda[count2] = 1 - accuracy_lda
    count2 += 1
lda_score[count] = cv_lda.mean()
lda_std[count] = cv_lda.std()

# pls
cv_pls = np.zeros(times)
count2 = 0
for train, test in kf.split(index):
    index_train = index[train]
    index_test = index[test]
    pls = PLSRegression(n_components=i).fit(x[index_train], y[index_train])
    x2 = pls.transform(x[index_train])
    x3 = pls.transform(x[index_test])
    model.fit(x2, y[index_train])
    predict = model.predict(x3)
    accuracy_pls = metrics.accuracy_score(y[index_test], predict)
    cv_pls[count2] = 1 - accuracy_pls
    count2 += 1
pls_score[count] = cv_pls.mean()
pls_std[count] = cv_pls.std()

print 'Dimension: %d' % i
print 'Classification error rate after PCA dimensionality reduction: %0.2f (+/- %0.2f)' % (cv_pca.mean(), cv_pca.std() * 2)
print 'Classification error rate after MDS dimensionality reduction: %0.2f (+/- %0.2f)' % (cv_MDS.mean(), cv_MDS.std() * 2)
print 'Classification error rate after Isomap dimensionality reduction: %0.2f (+/- %0.2f)' % (cv_Isomap.mean(), cv_Isomap.std() * 2)
print 'Classification error rate after LDA dimensionality reduction: %0.2f (+/- %0.2f)' % (cv_lda.mean(), cv_lda.std() * 2)
        pc1 = tmp[:, i]
        pc2 = tmp[:, j]
        plt.scatter(pc1, pc2)
        plt.xlabel("PLS Component " + str(i + 1))
        plt.ylabel("PLS Component " + str(j + 1))
        plt.show()


##################### MAIN CODE #####################
#### Load data into numpy array
# Keep pandas just for convenience right now
data = load_data.loadDataPandas('../data/SCLC_study_output_filtered_2.csv')
d = data.to_numpy()
var_index = data.columns.values.tolist()

# vector of class responses associated with data
resp = load_data.getResponseMatrix2D()

#### Create object to normalize and un-normalize data
norm_trans = pre.StandardScaler().fit(d)
data_norm = norm_trans.transform(d)
#data_norm, norm_trans = pre.mean_center(d)  # In-built preprocessing method - TBD

#### Fit a Partial Least Squares model
pls = PLS().fit(data_norm, resp)
pls_trans = pls.transform(data_norm)
plotProjectionScatterMultiClass(pls_trans, resp, 2)
pls_components = range(1, 18) cv_pls = np.array([]) for m in pls_components: pls = PLSRegression(n_components=m) foo = np.transpose(college_train_x.get_values()) transformed_college_train_x = pls.fit_transform(college_train_x, college_train_y)[0] lrm = LinearRegression() pls_this_rmse = rmse_cv(LinearRegression(), transformed_college_train_x, college_train_y).mean() cv_pls = np.append(cv_pls, pls_this_rmse) min_m = pls_components[np.argmin(cv_pls)] cv_pls = pd.Series(cv_pls, index=pls_components) cv_pls.plot(title="PLSRegression Cross Validation") plt.xlabel("Number of Components (M)") plt.ylabel("Root Mean Square Error") if show_plots_flag: plt.show() best_pls = PLSRegression(n_components=min_m) transformed_college_train_x = best_pls.fit_transform(college_train_x, college_train_y)[0] transformed_college_test_x = best_pls.transform(college_test_x) lrm = LinearRegression() lrm.fit(transformed_college_train_x, college_train_y) print "\nPLSRegression Regression test RMSE (M = " + str(min_m) + ")" print rmse(lrm, transformed_college_test_x, college_test_y)
data_x += a #Split the feature vector for sample in a: for i in range(subset): fc7_x[i].append(sample[i * offset:(i + 1) * offset]) #Create the labels refering to the selected data data_y += [k] * len(a) #With PLS the results improve in accuracy and computational time pls = PLSRegression(n_components=10, scale=True) for i in range(subset): pls.fit(fc7_x[i], data_y) fc7_x[i] = pls.transform(fc7_x[i]) fc7_X_train = [None] * subset fc7_X_test = [None] * subset fc7_y_train = [None] * subset fc7_y_test = [None] * subset #Generate train/test splits for all subsets for i in range(subset): fc7_X_train[i], fc7_X_test[i], fc7_y_train[i], fc7_y_test[ i] = train_test_split(fc7_x[i], data_y, test_size=0.33, random_state=42) #Create parameters to choose in the grid search
LDA_centroids = lda.means_ # Centroids of the classes (n_class, n_features) Xtrain_LDA = lda.transform(Xtrain) Xtest_LDA = lda.transform(Xtest) # PLS if (FE_PLS == 1): from sklearn.cross_decomposition import PLSSVD,PLSCanonical,PLSRegression pls = PLSRegression(n_components = n_comp) pls.fit(Xtrain,Ytrain_m) PLS_weights = pls.x_weights_.T Xtrain_PLS = pls.transform(Xtrain) Xtest_PLS = pls.transform(Xtest) Xtrain = Xtrain_LDA Xtest = Xtest_LDA ####################################################################### ####################################################################### ## FEATURE SELECTION ####################################################################### ####################################################################### from sklearn.ensemble import ExtraTreesClassifier from sklearn import random_projection from sklearn.svm import SVC from sklearn.feature_selection import RFE
#print "yp_t_not ", yp_t_not.shape pls.fit(Xp_t,yp_t_not.astype(int)) yp_new = pls.predict(Xp_t, copy=True) yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int) yp_t = yp_t.astype(int) #print y_new,y_pred, y_t error = ((yp_t - yp_pred) ** 2).sum() print "PLS Training error " , float(error)/yp_t.shape[0] yp_new = pls.predict(Xp_v, copy=True) yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int) #print y_new, y_pred, y_v #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0] error = ((yp_v - yp_pred) ** 2).sum() print "PLS Validation error " , float(error)/yp_v.shape[0] X_new = pls.transform(X) rf = RandomForestClassifier(n_estimators=500, max_depth=None, max_features=int(math.sqrt(n_components)), min_samples_split=100, random_state=144, n_jobs=4) #print "shapes ", X_new.shape, y.shape #print X_new,y X_t, X_v, y_t, y_v = tts(X_new,yd,train_size=0.85) rf.fit(X_t, y_t) print "Random Forest Classifier: ", rf.get_params() print "Covariance Classifier Training score: ", rf.score(X_t, y_t) print "Covariance Classifier Validation score: ", rf.score(X_v, y_v) #print "Class prob: ", zip(rf.predict_proba(X_v), y_v) sample_weights = rf.predict_proba(pls.transform(Xp_t))[:,1] print sample_weights.shape sample_weights = abs(sample_weights-0.5)
plt.show()

# CCA
from sklearn.cross_decomposition import CCA
cca = CCA(n_components=2)
cca.fit(X, Y)
X_cca = cca.transform(X)
plt.plot(X_cca[0:50, 0], X_cca[0:50, 1], 'o', label='setosa')
plt.plot(X_cca[50:100, 0], X_cca[50:100, 1], 'o', label='versicolor')
plt.plot(X_cca[100:150, 0], X_cca[100:150, 1], 'o', label='virginica')
plt.xlim([-8, 9])
plt.ylim([-4, 4])
plt.title('CCA')
plt.legend(loc='lower right')
plt.show()

# PLS
from sklearn.cross_decomposition import PLSRegression
pls2 = PLSRegression(n_components=2)
pls2.fit(X, Y)
X_pls = pls2.transform(X)
plt.plot(X_pls[0:50, 0], X_pls[0:50, 1], 'o', label='setosa')
plt.plot(X_pls[50:100, 0], X_pls[50:100, 1], 'o', label='versicolor')
plt.plot(X_pls[100:150, 0], X_pls[100:150, 1], 'o', label='virginica')
plt.xlim([-3, 3])
plt.ylim([-1, 1])
plt.title('PLS')
plt.legend(loc='lower right')
plt.show()
original_dataset = pd.read_csv(settings.TRAIN_FILE) target = FeatureColumnsExtractor(settings.TARGET).fit_transform( original_dataset).apply(lambda x: np.sqrt(x)) feature_union = get_feature_union() dataset = feature_union.fit_transform(original_dataset, target) var_thresh = VarianceThreshold(threshold=0.02) dataset = var_thresh.fit_transform(dataset) high_corr = HighCorrelationFilter(threshold=0.82) dataset = high_corr.fit_transform(dataset) n_components = 17 pls = PLSRegression(n_components=n_components) pls.fit(dataset, target) dataset_ = pls.transform(dataset) estimators = get_estimation_pipeline() estimators.fit(dataset_, target) original_test_set = pd.read_csv(settings.TEST_FILE) # test_set = get_preprocessing_pipeline().fit_transform(original_test_set) test_set = feature_union.transform(original_test_set) test_set = var_thresh.transform(test_set) test_set = high_corr.transform(test_set) test_set_ = pls.transform(test_set) predictions = estimators.predict(test_set_) output = pd.DataFrame({
    def __fit__(self, correctors, predictors, observations, n_jobs=-1, *args, **kwargs):
        '''Computes the correction and prediction parameters that best fit the observations
        according to the Partial Least Squares method.

        Parameters:

            - correctors: NxC (2-dimensional) matrix, representing the covariates, i.e., features
              that (may) explain a part of the observational data in which we are not interested,
              where C is the number of correctors and N the number of elements for each corrector.

            - predictors: NxR (2-dimensional) matrix, representing the predictors, i.e., features
              to be used to try to explain/predict the observations (experimental data), where R
              is the number of predictors and N the number of elements for each predictor (the
              latter is ensured to be the same as that in the 'correctors' argument).

            - observations: NxM (2-dimensional) matrix, representing the observational data, i.e.,
              values obtained by measuring the variables of interest, whose behaviour is wanted to
              be explained by the correctors and predictors, where M is the number of variables and
              N the number of observations for each variable (the latter is ensured to be the same
              as those in the 'correctors' and the 'predictors' arguments).

            - num_threads: integer (default -1), indicating the number of threads to be used by the
              algorithm. If set to -1, all CPUs are used. This will only provide speed-up for M > 1
              and sufficiently large problems.

        Returns:

            - Correction parameters: (num_comp+2)*CxM (3-dimensional) matrix, representing the
              parameters that best fit the correctors to the observations for each variable, where
              M is the number of variables (same as that in the 'observations' argument) and C is
              the number of correction parameters for each variable (same as the number of
              correctors).

            - Regression parameters: ((num_comp+2)*R + 2)xM (3-dimensional) matrix, representing
              the parameters that best fit the predictors to the corrected observations for each
              variable, where M is the number of variables (same as that in the 'observations'
              argument) and R is the number of prediction parameters for each variable (same as
              the number of predictors).
The first dimension correspond to (x_rotations, coef, x_mean, y_mean, num_components) ''' # All-at-once approach pls_corr = PLSRegression(n_components=self.num_components_corr, scale=False) pls_pred = PLSRegression(n_components=self.num_components_pred, scale=False) M = observations.shape[1] R = predictors.shape[1] if correctors.size != 0: cparams = np.zeros((R * (self.num_components_pred + 2) + 3, M)) for n in range(M): if np.std(observations[:, n]) == 0: continue pls_corr.fit(correctors, observations[:, n]) observations[:, n] = observations[:, n] - np.dot( pls_corr.transform(correctors), pls_corr.y_loadings_.T) cparams[:R * self.num_components_corr, n] = pls_corr.x_rotations_.reshape((-1, )) cparams[R * self.num_components_corr:R * (self.num_components_corr + 1), n] = pls_corr.coef_.reshape((-1, )) cparams[R * (self.num_components_corr + 1):-2, n] = pls_corr.x_mean_.reshape((-1, )) cparams[-3, n] = pls_corr.y_mean_.reshape((-1, )) cparams[-2, n] = correctors.shape[1] cparams[-1, n] = self.num_components_corr cparams = np.concatenate((pls_corr.x_rotations_[np.newaxis], pls_corr.y_loadings_[np.newaxis], pls_corr.x_mean_[np.newaxis], pls_corr.y_mean_[np.newaxis]), axis=0) else: cparams = np.asarray([[]]) if predictors.size != 0: pparams = np.zeros( ((R + 1) * (self.num_components_pred + 1) + R + 2, M)) for n in range(M): if np.std(observations[:, n]) == 0: pparams[-3, n] = np.mean(observations[:, n]).reshape( (-1, )) continue pls_pred.fit(predictors, observations[:, n]) pparams[:R * self.num_components_pred, n] = pls_pred.x_rotations_.reshape((-1, )) pparams[R * self.num_components_pred:(R + 1) * self.num_components_pred, n] = pls_pred.y_rotations_.reshape((-1, )) pparams[(R + 1) * self.num_components_pred:(R + 1) * self.num_components_pred + R, n] = pls_pred.coef_.reshape((-1, )) pparams[(R + 1) * self.num_components_pred + R:-3, n] = pls_pred.x_mean_.reshape((-1, )) pparams[-3, n] = pls_pred.y_mean_.reshape((-1, )) pparams[-2, n] = R pparams[-1, n] = self.num_components_pred else: pparams = np.asarray([[]]) return (cparams, pparams)
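# --- Standalone sketch (not the project's own API) of the correct-then-predict idea the
# docstring above describes: a first PLS regresses the correctors out of each observation
# column, then a second PLS fits the predictors to the corrected signal. Shapes and names
# below are illustrative.
import numpy as np
from sklearn.cross_decomposition import PLSRegression

rng = np.random.RandomState(0)
N, C, R, M = 120, 2, 3, 5
correctors = rng.randn(N, C)      # nuisance covariates, e.g. age and sex
predictors = rng.randn(N, R)      # features of interest
observations = rng.randn(N, M)    # one column per measured variable

corrected = observations.copy()
for m in range(M):
    pls_corr = PLSRegression(n_components=1, scale=False)
    pls_corr.fit(correctors, corrected[:, m])
    corrected[:, m] -= pls_corr.predict(correctors).ravel()   # remove corrector effect

pls_pred = PLSRegression(n_components=2, scale=False)
pls_pred.fit(predictors, corrected)                            # fit predictors on residuals
print(pls_pred.score(predictors, corrected))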
import numpy as np from sklearn.cross_decomposition import PLSRegression from sklearn.datasets import make_classification from pls_gpu import PLSGPU import time if __name__ == '__main__': np.random.seed(12227) X, y = make_classification(n_samples=10000, n_features=3000, n_classes=2, n_clusters_per_class=1) pls = PLSRegression(n_components=10) pls.fit(X, y) start = time.time() pls.transform(X) end = time.time() print('Projection time PLS [{:.4f}]'.format(end-start)) pls_gpu = PLSGPU(pls, batch_size=X.shape[0]) start = time.time() pls_gpu.transform(X) end = time.time() print('Projection time PLSGPU [{:.4f}]'.format(end - start))
xt,yt = plscan.fit_transform(dataTrain,Ytrain) fig = plt.figure() util.plotData(fig,xt,labelsTrain,classColors) u = plscan.x_weights_ plt.quiver(u[0,0],u[1,0],color='k',edgecolor='k',lw=1,scale=0.1,figure=fig) plt.quiver(-u[1,0],u[0,0],color='k',edgecolor='k',lw=1,scale=0.4,figure=fig) #%% PLS2 lda = LDA() nComponents = np.arange(1,nFeatures,8) pls2Scores = np.zeros((2,np.alen(nComponents))) for i,n in enumerate(nComponents): pls2 = PLSRegression(n_components=n) pls2.fit(dataTrain,Ytrain) dataTrainT = pls2.transform(dataTrain) dataTestT = pls2.transform(dataTest) pls2Scores[:,i] = util.classify(dataTrainT,dataTestT,labelsTrain,labelsTest) pls2 = PLSRegression(n_components=2) xtPLS,yt = pls2.fit_transform(dataTrain,Ytrain) uPLS = pls2.x_weights_ #%% Canonical Correlation Analysis nComponents = np.arange(1,nClasses+1) cca = CCA(n_components=nClasses) cca.fit(dataTrain,Ytrain) dataTrainT = cca.transform(dataTrain) dataTestT = cca.transform(dataTest) ccaScores = np.zeros((2,np.alen(nComponents)))
# In[136]:

# Split data to train and test on 50-50 ratio
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=None)


# In[137]:

pls = PLSRegression(n_components=27)
pls.fit(X_train, y_train)
X_pls = pls.fit_transform(X_train, y_train)
x2 = pls.transform(x)


# In[138]:

x2 = pd.DataFrame(x2)
print(x2)
#x2 = NormalizeData(x2)
#print(X_pls)
#two_arrays = X_pls
#datapls = np.hstack(two_arrays)
#np.savetxt('lungcancerpls111.csv', datapls, delimiter=',')


# In[139]:
#print "yp_t_not ", yp_t_not.shape pls.fit(Xp_t, yp_t_not.astype(int)) yp_new = pls.predict(Xp_t, copy=True) yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int) yp_t = yp_t.astype(int) #print y_new,y_pred, y_t error = ((yp_t - yp_pred)**2).sum() print "PLS Training error ", float(error) / yp_t.shape[0] yp_new = pls.predict(Xp_v, copy=True) yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int) #print y_new, y_pred, y_v #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0] error = ((yp_v - yp_pred)**2).sum() print "PLS Validation error ", float(error) / yp_v.shape[0] X_new = pls.transform(X) rf = RandomForestClassifier(n_estimators=500, max_depth=None, max_features=int( math.sqrt(n_components)), min_samples_split=100, random_state=144, n_jobs=4) #print "shapes ", X_new.shape, y.shape #print X_new,y X_t, X_v, y_t, y_v = tts(X_new, yd, train_size=0.85) rf.fit(X_t, y_t) print "Random Forest Classifier: ", rf.get_params() print "Covariance Classifier Training score: ", rf.score(X_t, y_t) print "Covariance Classifier Validation score: ", rf.score(
class metamodel(): def __init__(self, X, y, bounds=None, testfunction=None, reg=None, name='', testPoints=None, MLEP=True, normtype='std', Lambda=0.01, PLS=False, PLS_order=2, **kwargs): self.X_orig = copy.deepcopy(X) self.y_orig = copy.deepcopy(y) self.X = copy.deepcopy(X) self.y = copy.deepcopy(y) self.testfunction = testfunction self.flag_penal = MLEP self.bounds = bounds self.name = name self.n = self.X.shape[0] # Nr points self.k = self.X.shape[1] # nr dimensions self.non_feasible_mc = None self.feasible_mc = None self.feasible_y_mc = None self.non_feasible_y_mc = None self.non_feasible = None self.feasible = None self.feasible_y = None self.non_feasible_y = None self.Lambda = 0 self.sigma = 0 self.normtype = normtype # std if normalized st std is one, else normalized on interval [0, 1] self.normRange = [] self.ynormRange = [] self.normalizeData() # normalizes the input data! self.PLS = PLS self.pls2 = None self.PLS_order = PLS_order if self.PLS_order > self.X_orig.shape[1]: print('Higher PLS than dimension of problem') raise (ValueError) # lower so that it fits to at least a 3**dim grid! if self.n > 3**self.PLS_order: self.PLS_order = PLS_order else: self.PLS_order = int(np.floor(np.log(self.n) / np.log(3))) if self.PLS: # Compute all directions, reduction is done in later step! self.pls2 = PLSRegression(n_components=self.PLS_order) # if self.k == 1: # self.pls2 = PLSRegression(n_components=1) # elif self.k == 2: # self.pls2 = PLSRegression(n_components=2) # elif self.k > 2: # self.pls2 = PLSRegression(n_components=3) # else: # raise ValueError self.pls2.fit(self.X, self.y) self.X = self.pls2.transform(self.X) # self.X = self.PLS_trans(self.X) try: self.k = self.X.shape[1] except: self.k = 1 self.X = self.X.reshape(-1, 1) self.theta = np.ones(self.k) self.pl = np.ones(self.k) * 2. self.sp = sp(self.k) self.reg = reg # self.updateData() # self.updateModel() self.thetamin = 1 self.thetamax = 15 self.pmin = 1.7 self.pmax = 2.3 self.pl = np.ones(self.k) * 2 self.Lambda_min = 0.01 #1e-2 self.Lambda_max = 0.1 self.Lambda = Lambda #0.1 #0.03 # regression order def PLS_trans(self, X): # The PLS - regression computes a new basis in which the bm = self.pls2.x_rotations_ # full rotation try: Xt = np.linalg.solve(bm, X.T).T except: print(traceback.format_exc()) # Pick out only first two components of this vector if np.isscalar(X[0]): raise (ValueError) Xt = Xt[:, :self.PLS_order] # don't work for pointwise data return Xt def PLS_inv_rot(self, X): bm = self.pls2.x_rotations_ # full rotation Xr = np.dot(bm, X.T).T return Xr def normX(self, X): ''' :param X: An array of points (self.k long) in physical world units :return X: An array normed to our model range of [0,1] for each dimension ''' scalar = False if np.isscalar(X[0]): X = [X] scalar = True X_norm = np.ones(np.shape(X)) * np.nan for i, row in enumerate(X): # for every row for j, elem in enumerate(row): # for every element in every row if self.normtype == 'std': # with standard deviation one! 
X_norm[i, j] = (elem - self.normRange[j][0]) / self.normRange[j][1] else: # in interval [0,1] X_norm[i, j] = (elem - self.normRange[j][0]) / float( self.normRange[j][1] - self.normRange[j][0]) if scalar: # unpack [X_norm] = X_norm return X_norm else: return X_norm def inversenormX(self, X): ''' :param X: An array of points (with self.k elem) in normalized model units :return X : An array of real world units ''' scalar = False if np.isscalar(X[0]): X = [X] scalar = True X_inv = np.ones(np.shape(X)) * np.nan for i, row in enumerate(X): # for every row for j, elem in enumerate(row): # for every element in every row if self.normtype == 'std': X_inv[i, j] = self.normRange[j][0] + elem * self.normRange[j][ 1] # x = mu + u*std(X) else: X_inv[i, j] = (elem * float(self.normRange[j][1] - self.normRange[j][0]) ) + self.normRange[j][0] if scalar: # unpack [X_inv] = X_inv return X_inv else: return X_inv def normy(self, y): ''' :param y: An array of observed values in real-world units :return y: A normalized array of model units in the range of [0,1] ''' if self.normtype == 'std': return (y - self.ynormRange[0] ) / self.ynormRange[1] # u = (x-mu)/std(X) else: return (y - self.ynormRange[0]) / (self.ynormRange[1] - self.ynormRange[0]) def inversenormy(self, y): ''' :param y: A normalized array of model units in the range of [0,1] :return: An array of observed values in real-world units ''' if self.normtype == 'std': return self.ynormRange[0] + y * self.ynormRange[ 1] # x = mu + u * std(X) else: return ( y * (self.ynormRange[1] - self.ynormRange[0])) + self.ynormRange[0] def normalizeData(self): ''' This function is called when the initial data in the model is set. We find the max and min of each dimension and norm that axis to a range of [0,1] ''' # lower and upper bound of data. for i in range(self.X.shape[1] ): # self.k can be smth different if PLS is used! if self.normtype == 'std': self.normRange.append([ np.mean(self.X[:, i]), np.std(self.X[:, i], dtype=np.float64) ]) else: # determine the intervals self.normRange.append([min(self.X[:, i]), max(self.X[:, i])]) # Normalize data self.X = self.normX(self.X) if self.normtype == 'std': self.ynormRange.append(np.mean(self.y)) self.ynormRange.append(np.std(self.y, dtype=np.float64)) else: # determine the intervals self.ynormRange.append(min(self.y)) self.ynormRange.append(max(self.y)) for i in range(self.n): self.y[i] = self.normy(self.y[i]) def animate(): if animate: def init(): ax.set_xlim([0, 1]) ax.set_ylim([0, 1]) ax.set_zlim([0, 250]) ax.plot_wireframe(X, Y, Z, rstride=3, cstride=3, label='Metamodel') ax.scatter(spx, spy, self.inversenormy(self.y), color='k', label='Experiments') ax.legend(prop={'size': 20}) if self.testfunction is not None: ax.plot_surface(X, Y, ZT, rstride=3, cstride=3, alpha=0.5, cmap='jet') ax.set_xlabel('$X_1$') ax.set_ylabel('$X_2$') ax.set_zlabel('$\mathbf{G}(X_1, X_2)$') # ax.legend() return fig, def animate(i): ax.view_init(elev=10., azim=i) return fig, # Animate anim = animation.FuncAnimation(fig, animate, init_func=init, frames=360, interval=20, blit=True) # Save anim.save( r'C:\Users\pettlind\Dropbox\KTH\PhD\Article2\animate\animation.mp4', fps=30, extra_args=['-vcodec', 'libx264']) raise NotImplementedError() def plot(self, fig=None, ax=None, labels=False, show=True, animate=False, only_points=False, name=None, PF=False, bounds=None): ''' This function plots 2D and 3D models :param labels: :param show: If True, the plots are displayed at the end of this call. 
If False, plt.show() should be called outside this function :return: https://stackoverflow.com/questions/13316397/matplotlib-animation-no-moviewriters-available ''' if self.X_orig.shape[1] == 1000: # DESTROYED! dim = self.X_orig.shape[1] # Multisubplot! def comp(x1, x2, x0, bounds): ''' compute variation in only two variables at the time. Input: x1 - index first variable x2 - index second variable bounds - bounds for all variable x0 - nominal value''' x = np.linspace(bounds[x1][0], bounds[x1][1], num=20) y = np.linspace(bounds[x2][0], bounds[x2][1], num=20) # Normalize wrong place! # for iter, xp, yp in zip(range(0,len(x)),x,y): # x[iter], y[iter] = self.normX(np.array([xp, yp])) X, Y = np.meshgrid(x, y) modeldata = np.asarray([np.ravel(X), np.ravel(Y)]).T # 2d up to here pos = np.linspace(0, 9, 10) bol1 = pos == x1 bol2 = pos == x2 # np.logical_or(pos == x1, pos == x2) modeldata_upd = np.ones((modeldata.shape[0], 10)) * np.nan test_data = copy.copy(modeldata_upd) for ii, xa in enumerate(modeldata): temp = copy.copy(x0) temp[bol1] = xa[0] temp[bol2] = xa[1] modeldata_upd[ii] = copy.copy(temp) test_data[ii] = copy.copy(temp) # prediction # zs =self.predict(self.PLS_trans(self.normX(modeldata_upd))) zs = self.predict(self.pls2.transform( self.normX(modeldata_upd)), norm=False) Z = zs.reshape(X.shape) # non-normed zt = self.testfunction(test_data) ZT = zt.reshape(X.shape) return Z, ZT # specs_fix = np.asarray([{'type': 'surface'}]*5*5).reshape(5, 5).tolist() # fig = make_subplots(rows=5, cols=5, specs = specs_fix) fig = plt.figure() fig, axs = plt.subplots(dim - 1, dim - 1, sharex='col', sharey='row') # Plot x = np.linspace(0, 1, num=20) y = np.linspace(0, 1, num=20) X, Y = np.meshgrid(x, y) bounds = np.asarray(bounds) x0 = 0.5 * bounds[:, 0] + 0.5 * bounds[:, 1] num_mat = np.linspace(0, (dim - 1)**2 - 1, (dim - 1)**2).reshape(dim - 1, dim - 1) num_v = [] for i in range(1, dim): for j in range(0, i): Z, ZT = comp(i, j, x0, bounds) num_v.append(num_mat[i - 1, j]) # ax = fig.add_subplot(dim - 1, dim - 1, numb)#, projection='3d') # ax.contourf(X, Y, Z, rstride=3, cstride=3, label='Metamodel') # ax.plot_surface(X, Y, ZT, rstride=3, cstride=3, alpha=0.5, cmap='jet') # contour_levels = 10 try: contour_levels = np.linspace(180, 360, 11) CS = axs[i - 1, j].contour(X, Y, -Z, contour_levels, colors='k', linestyles='solid', zorder=2) # Change contour levels so that they match int 180-340! # contour_levels = CS.levels # delta = np.abs(contour_levels[0]-contour_levels[1]) # contour_levels = np.insert(contour_levels, 0, contour_levels[0]-delta) # contour_levels = np.append(contour_levels, contour_levels[-1]+delta) CT = axs[i - 1, j].contourf(X, Y, -ZT, contour_levels, cmap='cividis', zorder=1) # ax.plot_surface(X, Y, ZT, ) axs[i - 1, j].axis('off') except: pdb.set_trace() # Add colorbar # Remove empty subplots for i in range(0, num_mat.size): if not (i == np.asarray(num_v)).any(): axs.flat[i].set_visible(False) # remove these # axes = fig.get_axes()[0] fig.colorbar(CT, ax=axs.flat) # Set common x and y labels for ax in axs.flat: ax.set(xlabel='x-label', ylabel='y-label') # Hide x labels and tick labels for top plots and y ticks for right plots. 
for ax in axs.flat: ax.label_outer() if show: plt.show() elif self.k == 2: if fig is None: fig = plt.figure(figsize=(8, 6)) # samplePoints = list(zip(*self.inversenormX(self.X_orig))) # lists of list of every coordiante # Create a set of data to plot plotgrid = 50 if bounds is None: x = np.linspace(min(self.X[:, 0]), max(self.X[:, 0]), num=plotgrid) y = np.linspace(min(self.X[:, 1]), max(self.X[:, 1]), num=plotgrid) nor = False else: # boundries x = np.linspace(bounds[0][0], bounds[0][1], num=plotgrid) y = np.linspace(bounds[1][0], bounds[1][1], num=plotgrid) # Normalize for iter, xp, yp in zip(range(0, len(x)), x, y): x[iter], y[iter] = self.normX(np.array([xp, yp])) nor = False X, Y = np.meshgrid(x, y) if not only_points: # compute the true values at all points! modeldata = np.asarray([np.ravel(X), np.ravel(Y)]).T zs = np.array( [self.predict(data, norm=nor) for data in modeldata]) Z = zs.reshape(X.shape) # non-normed if self.testfunction is not None and self.X_orig.shape[1] == 2: testdata = np.array(list(zip(np.ravel(X), np.ravel(Y)))) if self.PLS: # rotate according to PLS if True testdata = self.PLS_inv_rot(testdata) zt = self.testfunction(self.inversenormX(testdata)) ZT = zt.reshape(X.shape) if ax is None: # ax = fig.add_subplot(111, projection='3d') # ax = Axes3D(fig) matplotlib.rcParams['font.family'] = "Times New Roman" plt.style.use('seaborn-bright') # ax = fig.add_subplot(212, projection='3d') # fig = plt.gcf() #ax = fig.gca(projection='3d') fig2 = plt.figure(figsize=(8, 6)) ax2 = Axes3D(fig2) # ax2.set_xlim([0, 1]) # ax2.set_ylim([0, 1]) # ax2.set_zlim([0, 250]) ax2.scatter(self.X[:, 0], self.X[:, 1], self.inversenormy(self.y), color='k', label='Experiments') if PF: if self.feasible is not None: # ax2.scatter(self.feasible[:, 0], self.feasible[:, 1], self.feasible_y, color='g', marker="o", label='Feasible model') ax2.scatter(self.non_feasible[:, 0], self.non_feasible[:, 1], self.non_feasible_y, color='r', marker="o", label='Non Feasible model') if self.feasible_mc is not None: # Monte Carlo ax2.scatter(self.feasible_mc[:, 0], self.feasible_mc[:, 1], self.feasible_y_mc, color='g', marker='s', label='Feasible mc') ax2.scatter(self.non_feasible_mc[:, 0], self.non_feasible_mc[:, 1], self.non_feasible_y_mc, color='r', marker='s', label='Non Feasible mc') if not only_points: ax2.plot_wireframe(X, Y, Z, rstride=3, cstride=3, label='Metamodel') if self.testfunction is not None and self.X_orig.shape[ 1] == 2: ax2.plot_surface(X, Y, ZT, rstride=3, cstride=3, alpha=0.5, cmap='jet') ax2.legend(prop={'size': 20}) ax2.set_xlabel('$X_1$') ax2.set_ylabel('$X_2$') ax2.set_zlabel('$\mathbf{G}(X_1, X_2)$') my_path = os.path.abspath('.') plt.savefig(my_path + '\\img\\' + name + '.png', format='png', dpi=1000) if show: plt.show() else: pass # pylab.title(self.reg) # ax.legend(['Approx fun.', 'True fun.'], loc="upper right") # ax.legend(['Approx fun.', 'True fun.'], loc="upper right") # Now add the legend with some customizations. 
# legend = ax.legend(loc='upper center', shadow=True) # legend = ax.legend(loc='upper center', shadow=True) elif self.k == 1: if fig is None: fig = plt.figure(figsize=(8, 6)) # Create a set of data to plot plotgrid = 50 if plot_int is None: x_vec = np.linspace(self.normRange[0][0], self.normRange[0][1], num=plotgrid) else: xmin, xmax = plot_int x_vec = np.linspace(xmin, xmax, num=plotgrid) # Predict based on the optimized results y = np.array([ self.predict(np.array(x).reshape(1, )) for x in np.ravel(x_vec) ]) plt.plot(x, y, 'ro') def pf(self, mu, coe, MC_num, bounds=[], MC=False, PF=False, threshold=0): ''' Computes Pf Input: mu - vector of mean values coe - coefficient of determination MC_num - number of mc samples MC - Bool, if pure MC is to be done at the surface ''' # Sample points on the surface using MC # X = sp(k=self.X_orig.shape[1]).MC(int(MC_num)) samples = [] for m, c, bound in zip(mu, coe, bounds): vec = np.random.normal(m, m * c, int(MC_num)) if len(bound) > 0: # TRUNCATE! vec[vec < bound[0]] = bound[0] vec[vec > bound[1]] = bound[1] samples.append(vec) samples = np.asarray(samples).T nor = True if self.PLS: mtest = self.pls2.transform(self.normX( samples)) # apply dimension reduction to the training data. else: mtest = self.normX(samples) if PF: f_vec = np.asarray([self.predict(xs, norm=False) for xs in mtest]).reshape(mtest.shape[0]) self.feasible = mtest[f_vec > threshold] self.non_feasible = mtest[f_vec < threshold] self.feasible_y = f_vec[f_vec > threshold] self.non_feasible_y = f_vec[f_vec < threshold] self.Pf = sum(f_vec < threshold) / float(MC_num) if MC: f_mc = np.asarray(self.testfunction(samples)).reshape( mtest.shape[0]) self.Mc = sum(f_mc < threshold) / float(MC_num) self.feasible_mc = mtest[f_mc > threshold] self.non_feasible_mc = mtest[f_mc < threshold] self.feasible_y_mc = f_mc[f_mc > threshold] self.non_feasible_y_mc = f_mc[f_mc < threshold] if np.isnan(f_mc).any(): # Left a sanity check here! print('Probably wrong input into aircraft function!') raise ValueError() def RRMSE_R2(self, k, bounds, n=500): ''' This function calculates the mean relative MSE metric of the model by evaluating MSE at a number of points and the Coefficient of determiniation. :param n: Points to Sample, the number of points to sample the mean squared error at. Ignored if the points argument is specified :param points: an array of points to sample the model at :return: the mean value of MSE and the standard deviation of the MSE points ''' inside = 0 den = 0 SS_tot = 0 SS_res = 0 f_vec = np.zeros((n, )) y_vec = np.zeros((n, )) # # # # nd = n ** (1 / k) # xi = [] # nump = int(np.floor(nd)) # if nump < 3: # nump = 3 # marrays = np.asarray([np.linspace(0,1,nump) for i in range(k)]) # Do instead LHS - with 100*input samples ? marrays = sp(k=k).rlh(n) mravel = np.ones(marrays.shape) * np.nan # Scale for i in range(k): mravel[:, i] = bounds[i][0] + (bounds[i][1] - bounds[i][0]) * marrays[:, i] # All points # mravel = [] # for items in product(*marrays): # mravel.append(items) mtest = copy.deepcopy(mravel) if self.PLS: mtest = self.pls2.transform(self.normX( mravel)) # apply dimension reduction on the training data. 
f_vec = np.asarray([self.predict(xs, norm=False) for xs in mtest]) y_vec = self.testfunction(mravel) y_bar = np.sum(y_vec) / n**2 # https://en.wikipedia.org/wiki/Root-mean-square_deviation for f_i, y_i in zip(f_vec, y_vec): inside += (f_i - y_i)**2 SS_tot += (y_i - y_bar)**2 # den += y_i # https://www.sciencedirect.com/science/article/pii/S1364032115013258?via%3Dihub # https://stats.stackexchange.com/questions/260615/what-is-the-difference-between-rrmse-and-rmsre?rq=1 # https://en.wikipedia.org/wiki/Coefficient_of_determination RMSD = np.sqrt(inside / n**2) R_sq = 1 - inside / SS_tot if RMSD < 0: # or RMSD > 1: # or R_sq > 1: # R_sq can be less than zero! - fits data worse than horizontal line. raise ValueError('Something of with error estimate!') pdb.set_trace() return R_sq, RMSD # In percentage!
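# --- Side note (illustrative, not from the original code) on the PLS projection used by
# PLS_trans / PLS_inv_rot in the metamodel above: transform() projects onto x_rotations_,
# and scikit-learn's inverse_transform() rebuilds X from those scores via x_loadings_
# (only approximately, and close to exact when all components are kept).
import numpy as np
from sklearn.cross_decomposition import PLSRegression

rng = np.random.RandomState(0)
X = rng.rand(60, 4)
y = X @ rng.rand(4) + 0.05 * rng.randn(60)

pls2 = PLSRegression(n_components=4)   # keep all components so little information is lost
pls2.fit(X, y)

T = pls2.transform(X)                  # scores: the reduced coordinates
X_rec = pls2.inverse_transform(T)      # reconstruction from the scores
print(np.abs(X_rec - X).max())         # small reconstruction error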
class PLS_NIPALS(BaseModel):
    """Partial least-squares regression using the NIPALS algorithm.

    Parameters
    ----------
    n_components : int, (default 2)
        Number of components to keep.

    Methods
    -------
    train : Fit model to data.
    test : Apply model to test data.
    evaluate : Evaluate model.
    calc_bootci : Calculate bootstrap intervals for plot_featureimportance.
    plot_featureimportance : Plot coefficient and Variable Importance in Projection (VIP).
    plot_permutation_test : Perform a permutation test and plot.
    """

    parametric = True
    # bootlist = ["model.vip_", "model.coef_"]  # list of metrics to bootstrap
    # bootlist = ["model.vip_", "model.coef_", "model.x_loadings_", "model.x_scores_", "Y_pred", "model.pctvar_", "model.y_loadings_"]  # list of metrics to bootstrap
    bootlist = [
        "model.vip_", "model.coef_", "model.x_loadings_", "model.x_scores_",
        "Y_pred", "model.pctvar_", "model.y_loadings_", "model.metrics"
    ]

    def __init__(self, n_components=2):
        self.model = PLSRegression(n_components=n_components)  # Should change this to an empty model
        self.n_component = n_components
        self.k = n_components
        self.__name__ = 'cimcb.model.PLS_NIPALS'
        self.__params__ = {'n_components': n_components}

    def set_params(self, params):
        self.__init__(**params)

    def train(self, X, Y):
        """Fit the PLS model, save additional stats (as attributes) and return Y predicted values.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Predictor variables, where n_samples is the number of samples and n_features is the number of predictors.
        Y : array-like, shape = [n_samples, 1]
            Response variables, where n_samples is the number of samples.

        Returns
        -------
        y_pred_train : array-like, shape = [n_samples, 1]
            Predicted y score for samples.
        """
        # Error check
        # X, Y = self.input_check(X, Y)

        # Fit model
        self.model.fit(X, Y)

        # Calculate vip, pctvar and flatten coef_ for future use
        # meanX = np.mean(X, axis=0)
        # X0 = X - meanX
        # self.model.pctvar_ = sum(abs(self.model.x_loadings_) ** 2) / sum(sum(abs(X0) ** 2)) * 100
        # self.model.vip_ = vip(self.model)
        # self.model.coef_ = self.model.coef_.flatten()
        y_pred_train = self.model.predict(X).flatten()

        # Percentage of variance in Y explained by each component
        self.model.pctvar_ = []
        for i in range(self.n_component):
            Y_pred = np.dot(self.model.x_scores_[:, i].reshape(-1, 1),
                            self.model.y_loadings_[:, i].reshape(-1, 1).T) * Y.std(axis=0, ddof=1) + Y.mean(axis=0)
            explainedvar = r2_score(Y, Y_pred) * 100
            self.model.pctvar_.append(explainedvar)
        self.model.pctvar_ = np.array(self.model.pctvar_)

        # T = self.model.x_scores_
        # W = self.model.x_weights_
        # Q = self.model.y_loadings_
        # w0, w1 = W.shape
        # s = np.sum(T ** 2, axis=0) * np.sum(Q ** 2, axis=0)
        # s_sum = np.sum(s, axis=0)
        # w_norm = np.array([(W[:, i] / np.linalg.norm(W[:, i])) for i in range(w1)])
        # self.model.vip_ = np.sqrt(w0 * np.sum(s * w_norm.T ** 2, axis=1) / s_sum)

        # Variable Importance in Projection (VIP)
        t = self.model.x_scores_
        w = self.model.x_weights_
        q = self.model.y_loadings_
        p, h = w.shape
        vips = np.zeros((p, ))
        s = np.diag(t.T @ t @ q.T @ q).reshape(h, -1)
        total_s = np.sum(s)
        for i in range(p):
            weight = np.array([(w[i, j] / np.linalg.norm(w[:, j]))**2 for j in range(h)])
            vips[i] = np.sqrt(p * (s.T @ weight) / total_s)
        self.model.vip_ = vips

        # Calculate and return Y predicted value
        y_pred_train = self.model.predict(X).flatten()
        self.model.coef_ = self.model.coef_.flatten()
        self.model.y_loadings_ = self.model.y_weights_
        self.model.x_scores = t
        self.Y_pred = y_pred_train  # Y_pred vs. Y_pred_train
        self.Y_true = Y
        self.X = X
        self.Y = Y  # Y vs. Y_true

        self.metrics_key = []
        self.model.eval_metrics_ = []
        bm = binary_evaluation(Y, y_pred_train)
        for key, value in bm.items():
            self.model.eval_metrics_.append(value)
            self.metrics_key.append(key)

        return y_pred_train

    def test(self, X, Y=None):
        """Calculate and return Y predicted value.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Test variables, where n_samples is the number of samples and n_features is the number of predictors.

        Returns
        -------
        y_pred_test : array-like, shape = [n_samples, 1]
            Predicted y score for samples.
        """
        # Convert X to a numpy array if it is a DataFrame or Series
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = np.array(X)

        # Overwrite x_scores_ from model.fit using the test X (or use model.x_scores_test_?)
        self.model.x_scores_ = self.model.transform(X)

        # Calculate and return Y predicted value
        y_pred_test = self.model.predict(X).flatten()
        self.Y_pred = y_pred_test

        if Y is not None:
            self.metrics_key = []
            self.model.eval_metrics_ = []
            bm = binary_evaluation(Y, y_pred_test)
            for key, value in bm.items():
                self.model.eval_metrics_.append(value)
                self.metrics_key.append(key)
            self.model.eval_metrics_ = np.array(self.model.eval_metrics_)

        return y_pred_test
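A possible train/test round trip for the class above, assuming the cimcb-style helpers it depends on (BaseModel, binary_evaluation) and the numpy/pandas/sklearn imports are in scope; the data here is random and purely illustrative:

import numpy as np

rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(100, 20)), rng.normal(size=(40, 20))
Y_train, Y_test = rng.integers(0, 2, 100), rng.integers(0, 2, 40)

model = PLS_NIPALS(n_components=2)
y_pred_train = model.train(X_train, Y_train)  # fits and returns flattened training predictions
y_pred_test = model.test(X_test, Y_test)      # transforms, predicts and stores eval metrics

print(model.model.pctvar_)  # % of Y variance explained per component
print(model.model.vip_)     # VIP score per predictor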
y = dataset["target"]

# Center each feature and scale its variance to be unitary
X = preprocessing.scale(X)

# Total variance summed over the (scaled) columns
print(numpy.var(X, 0).sum())

# Now use PCA with 3 components
pca = PCA(3)
X2 = pca.fit_transform(X)
print(numpy.var(X2, 0).sum())

# And PLS with 3 components
pls = PLSRegression(3)
pls.fit(X, y)
X2 = pls.transform(X)
print(numpy.var(X2, 0).sum())

# Make predictions using an SVM with PCA and PLS
pca_error = 0
pls_error = 0
n_folds = 10
svc = LinearSVC()

for train_inds, test_inds in KFold(X.shape[0], n_folds=n_folds):
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]

    # Use PCA and then classify using an SVM
    X_train2 = pca.fit_transform(X_train)
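The KFold call above uses the pre-0.18 scikit-learn cross-validation API; with current versions the equivalent split would look like this sketch (same variable names as above):

from sklearn.model_selection import KFold

kf = KFold(n_splits=n_folds)
for train_inds, test_inds in kf.split(X):
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]
    # ... same PCA/PLS projection and LinearSVC comparison as above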
class PLSClassifier(BaseEstimator, ClassifierMixin):
    __name__ = 'MultiLayeredPLS'

    def __init__(self, estimator=None, n_iter=1500, eps=1e-6, n_comp=10, mode='regression'):
        warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
        self.n_iter = n_iter
        self.eps = eps
        self.n_comp = n_comp
        self.mode = mode
        self.estimator = estimator
        self.estimator_ = None
        self.pls = None

    def fit(self, X, y):
        # if X is not np.array or y is not np.array:
        #     print('x and y must be of type np.array')
        #     raise ValueError
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must have the same number of samples')

        if self.estimator is None:
            self.estimator_ = LinearRegression()
        else:
            self.estimator_ = sklearn.base.clone(self.estimator)

        # Encode the two classes as -1 / +1 targets for the regression
        self.classes_, target = np.unique(y, return_inverse=True)
        target[target == 0] = -1

        if self.mode == 'canonical':
            self.pls = PLSCanonical(n_components=self.n_comp, scale=True, max_iter=self.n_iter, tol=self.eps)
        elif self.mode == 'regression':
            self.pls = PLSRegression(n_components=self.n_comp, scale=True, max_iter=self.n_iter, tol=self.eps)
        else:
            raise ValueError("mode must be 'canonical' or 'regression'")

        proj_x, proj_y = self.pls.fit_transform(X, target)
        self.estimator_.fit(proj_x, target)
        return self

    def predict_value(self, x):
        resp = self.decision_function(x)
        if resp.ndim == 1:
            ans = np.zeros(resp.shape, dtype=np.int32)
            ans[resp > 0] = self.classes_[1]
            ans[resp <= 0] = self.classes_[0]
        else:
            ans = self.classes_[np.argmax(resp, axis=1)]
        return ans

    def predict_confidence(self, x):
        resp = self.decision_function(x)
        return resp[0]

    def decision_function(self, x):
        x = np.array(x).reshape((1, -1))
        proj = self.pls.transform(x)
        resp = self.estimator_.predict(proj)
        return resp

    def predict_proba(self, x):
        resp = self.decision_function(x)
        # Clip the regression response to [-1, 1], then rescale it to [0, 1]
        resp = np.clip(resp, -1.0, 1.0)
        resp = (resp + 1.0) / 2.0
        # resp = np.exp(resp)
        # for r in range(len(resp)):
        #     resp[r] /= np.sum(resp[r])
        return resp
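A minimal usage sketch for the classifier above, assuming the imports its file relies on (numpy, warnings, sklearn, LinearRegression, PLSRegression, PLSCanonical) are in scope; note that decision_function reshapes its input to a single row, so samples are predicted one at a time. The data here is random and purely illustrative:

import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(60, 15))
y = (X[:, 0] + 0.1 * rng.normal(size=60) > 0).astype(int)

clf = PLSClassifier(n_comp=5, mode='regression').fit(X, y)
preds = np.array([clf.predict_value(row)[0] for row in X])
print('training accuracy:', np.mean(preds == y))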
features = []
temp = []
for data in MA_data:
    # Pick the top-ranked features according to ReconRank
    for i in range(1, numFeatures + 1):
        temp = np.append(temp, data[np.where(ReconRank == i)[0][0]])
    if np.shape(features)[0] == 0:
        features = temp
        temp = []
    else:
        features = np.vstack([features, temp])
        temp = []

# PLS dimension reduction
pls2 = PLSRegression(n_components=n_components)
pls2.fit(features, MA_label)
XScore = pls2.transform(features)
# XScore = features

# LDA classification
kf = KFold(n_splits=5)
kf.get_n_splits(XScore)
mean_acc = 0
for train_index, test_index in kf.split(XScore):
    X_train, X_test = XScore[train_index], XScore[test_index]
    y_train, y_test = MA_label[train_index], MA_label[test_index]
    clf = LDA()
    clf.fit(X_train, y_train)
    Y_predict = clf.predict(X_test)
    for i in range(len(Y_predict)):
        print("Y_Predict {} - Y_Test {}".format(Y_predict[i], y_test[i]))
    acc = accuracy_score(Y_predict, y_test)
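In the snippet above the PLS projection is fitted on all samples before the folds are split, so the test folds leak into the projection. A sketch that refits PLS inside each fold instead (reusing features, MA_label and n_components from the snippet above) would be:

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

fold_accs = []
for train_index, test_index in KFold(n_splits=5).split(features):
    # Fit the PLS projection on the training fold only
    pls_cv = PLSRegression(n_components=n_components)
    pls_cv.fit(features[train_index], MA_label[train_index])
    clf = LDA()
    clf.fit(pls_cv.transform(features[train_index]), MA_label[train_index])
    y_pred = clf.predict(pls_cv.transform(features[test_index]))
    fold_accs.append(accuracy_score(MA_label[test_index], y_pred))
print("mean CV accuracy: {:.3f}".format(np.mean(fold_accs)))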
plt.xlim(1, np.amax(nComponents))
plt.title('PLS Canonical accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):  # disabled block
    #%% PLS Regression
    nComponents = np.arange(1, nClasses + 1)
    plsRegScores = np.zeros((5, len(nComponents)))  # np.alen is deprecated; len is equivalent here
    for i, n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain, Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)

    plsReg = PLSRegression(n_components=2)
    plsReg.fit(Xtrain, Ytrain)
    xt = plsReg.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

    #%% Plot accuracies for PLS regression
    plt.figure()
    for i in range(5):
        plt.plot(nComponents, plsRegScores[i, :], lw=3)