def pls_learning(num_components, num_vars):
    data_full = pd.read_csv('mango_flouro_rows.csv')
    data_full = data_full[data_full['position'] == 'pos 2']
    y_name = 'Total Chlorophyll (ug/ml)'
    # y_name = 'Chlorophyll b (ug/ml)'
    y_data = data_full[y_name]
    x_data = data_full[x_data_columns]
    x_data = get_best_pls_variables(x_data, y_data, num_components, num_vars)
    # group-aware splits so readings from the same leaf never span train and test
    cv = GroupShuffleSplit(n_splits=100, test_size=0.3, random_state=0)
    group_splitter = data_full['Leaf number']
    estimator = PLS(num_components)
    title = "Learning curve {} components".format(num_components)
    plot_learning_curve(estimator, title, x_data, y_data,
                        cv=cv, group=group_splitter)
    plt.show()
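# Usage sketch (assumptions: the CSV and the module-level globals such as
# x_data_columns, PLS, and plot_learning_curve are in scope): plot the
# learning curve for a 3-component PLS on the 10 strongest wavelengths.
pls_learning(num_components=3, num_vars=10)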
def get_all_regrs():
    regrs = {
        "Linear regression": linear_model.LinearRegression(),
        # "Perceptron": linear_model.Perceptron(),
        "Lars": linear_model.Lars(),
        "Lasso": linear_model.LassoCV(max_iter=5000),
        # "Passive Aggressive": linear_model.PassiveAggressiveRegressor(),
        "PLS": PLS(n_components=3),
        "Random Forest": ensemble.RandomForestRegressor(),
        "Gradient Boost": ensemble.GradientBoostingRegressor(),
        "Extra Trees": ensemble.ExtraTreesRegressor(max_depth=2),
        # note: scikit-learn >= 1.2 renames base_estimator to estimator
        "Ada Boost": ensemble.AdaBoostRegressor(
            base_estimator=tree.DecisionTreeRegressor(max_depth=2),
            n_estimators=250),
        "Gaussian Process": gaussian_process.GaussianProcessRegressor(),
        # "Isotonic": isotonic.IsotonicRegression(),
        "Kernel Ridge": kernel_ridge.KernelRidge(),
        "Ridge CV": linear_model.RidgeCV(),
        # "Exp transform": TransformedTargetRegressor(regressor=PLS(n_components=3),
        #                                             func=np.exp, inverse_func=np.log),
        # "Log transform": TransformedTargetRegressor(regressor=PLS(n_components=3),
        #                                             func=np.log, inverse_func=np.exp),
        # "Inv transform": TransformedTargetRegressor(regressor=PLS(n_components=3),
        #                                             func=invert, inverse_func=invert),
        # "Log regressor": linear_model.LogisticRegressionCV(),
        "ML Perceptron": neural_network.MLPRegressor(max_iter=50000,
                                                     hidden_layer_sizes=(5, 5)),
        "Linear SVR": linear_svc,
        "RBF SVR": svm.SVR(kernel='rbf'),
        "Poly SVR": svm.SVR(kernel='poly'),
        # "Sigmoid SVR": svm.SVR(kernel='sigmoid'),
        "Bayesian Ridge": linear_model.BayesianRidge(),
        "Huber": linear_model.HuberRegressor(),
        # "Poisson": linear_model.PoissonRegressor(),
        "K-neighbors": neighbors.KNeighborsRegressor(),
        # "Radius Neighbors": neighbors.RadiusNeighborsRegressor()
    }
    return regrs
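# Usage sketch: cross-validate every regressor in the dict on synthetic data
# (assumptions: the data here is a stand-in for the real spectra, and the
# module-level `linear_svc` must exist for the dict to build at all).
import numpy as np
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(60, 6))
y_demo = X_demo @ rng.normal(size=6) + rng.normal(scale=0.1, size=60)

for name, regr in get_all_regrs().items():
    scores = cross_val_score(regr, X_demo, y_demo, cv=5,
                             scoring='neg_mean_absolute_error')
    print("{:16s} MAE: {:.3f}".format(name, -scores.mean()))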
def get_best_pls_variables(x, y, num_pls_components, num_variables):
    """Keep the num_variables columns with the largest absolute PLS coefficients."""
    x_scaled_np = StandardScaler().fit_transform(x)
    x_scaled = pd.DataFrame(x_scaled_np, columns=x.columns)
    pls = PLS(num_pls_components)
    pls.fit(x_scaled, y)
    # sort columns by absolute coefficient, largest first
    # (assumes coef_ shaped (n_features, n_targets), i.e. older scikit-learn)
    sorted_coeff = np.argsort(np.abs(pls.coef_[:, 0]))
    sorted_coeff = np.flip(sorted_coeff)
    columns_to_keep = x.columns[sorted_coeff[:num_variables]]
    print(columns_to_keep)
    return x_scaled[columns_to_keep]
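# Usage sketch with synthetic data (assumptions: run in this module so
# StandardScaler, PLS, np, and pd are in scope; the column names below are
# illustrative, not from the source). Keeps the 3 strongest of 10 wavelengths.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
x_demo = pd.DataFrame(rng.normal(size=(40, 10)),
                      columns=["{} nm".format(450 + 10 * i) for i in range(10)])
y_demo = 2.0 * x_demo["450 nm"] - x_demo["500 nm"] + rng.normal(scale=0.1, size=40)
x_best = get_best_pls_variables(x_demo, y_demo, num_pls_components=2,
                                num_variables=3)
print(x_best.shape)  # (40, 3)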
def transform(X, factors, get_model, method, y=None):
    if method == "raw" or method is None:
        return X
    if not factors or factors == "full":
        factors = np.prod(X.shape[1:])
        if method == "lda":
            factors -= 1
    if not isinstance(method, str):
        raise RuntimeError("Please supply a method name (pca, lda, ica, cca, pls)")
    method = method.lower()
    if method == "pca":
        from sklearn.decomposition import PCA
        model = PCA(n_components=factors, whiten=True)
    elif method == "lda":
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
        model = LDA(n_components=factors)
    elif method == "ica":
        from sklearn.decomposition import FastICA as ICA
        model = ICA(n_components=factors)
    elif method == "cca":
        from sklearn.cross_decomposition import CCA
        model = CCA(n_components=factors)
    elif method == "pls":
        from sklearn.cross_decomposition import PLSRegression as PLS
        model = PLS(n_components=factors)
        # dummy-code class labels so PLS can regress on them; guard against
        # y=None here so the friendlier RuntimeError below is reached instead
        if y is not None and str(y.dtype)[:3] not in ("flo", "int"):
            y = dummycode(y, get_translator=False)
    else:
        raise ValueError("Method {} unrecognized!".format(method))
    X = rtm(X)
    if method in ("lda", "cca", "pls"):
        if y is None:
            raise RuntimeError("y must be supplied for {}!".format(method))
        latent = model.fit_transform(X, y)
    else:
        if y is not None:
            warnings.warn("y supplied for {}. Ignoring!".format(method))
        latent = model.fit_transform(X)
    if isinstance(latent, tuple):
        latent = latent[0]
    if get_model:
        return latent, model
    return latent
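# Usage sketch for the unsupervised PCA path (assumptions: run inside this
# module so the `rtm` flattening helper is in scope; the data is synthetic).
import numpy as np

X_demo = np.random.default_rng(2).normal(size=(50, 8))
latent, pca_model = transform(X_demo, factors=3, get_model=True, method="pca")
print(latent.shape)  # (50, 3)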
def plot_param_learning_curves():
    x_data, _y, full_data = data_get.get_data('as7262 mango', average=False)
    pls = PLS(n_components=6)
    print(full_data.columns)
    currents = full_data['LED current'].unique()
    times = full_data['integration time'].unique()
    print(currents, times)
    print(full_data['saturation check'].unique())
    figure, axes = plt.subplots(len(currents), len(times),
                                figsize=(9, 12), constrained_layout=True)
    figure.suptitle("Parameter scan of new AS7262 Mango data")
    # figure.suptitle("Gradient Boosting Regressor fit\nAS7262 Betel data")
    current_i = 0
    time_i = 0
    for current in currents:
        for time in times:
            X, Y = data_get.get_data("as7262 mango", integration_time=time,
                                     led_current=current, return_type="XY")
            X = StandardScaler().fit_transform(X)
            X = PolynomialFeatures().fit_transform(X)
            Y = Y['Total Chlorophyll (µg/mg)']
            title = str(time * 2.8) + " ms " + current
            print(title)
            plot_learning_curve(pls, title, X, Y, cv=cv,
                                ax=axes[current_i][time_i], ylim=[-0.3, -0.1])
            time_i += 1
        time_i = 0
        current_i += 1
def fit(self, data, resp):
    dof = data.shape[1]
    n = data.shape[0]
    ortho_comp = dof - self.comp
    W = np.ndarray(shape=(dof, ortho_comp))
    P = np.ndarray(shape=(dof, ortho_comp))
    T = np.ndarray(shape=(n, ortho_comp))
    # Start with a weight vector proportional to X'y
    w = np.transpose(np.matmul(np.transpose(resp), data) / lin.norm(resp))
    for i in range(ortho_comp):
        # PLS scores and loadings for the current (deflated) data
        t = np.matmul(data, w) / np.matmul(np.transpose(w), w)
        p = np.transpose(
            np.matmul(np.transpose(t), data) / np.matmul(np.transpose(t), t))
        # Orthogonal component: the part of p orthogonal to w
        w_ortho = p - ((np.matmul(np.transpose(w), p)
                        / np.matmul(np.transpose(w), w)) * w)
        w_ortho = w_ortho / lin.norm(w_ortho)
        t_ortho = np.matmul(data, w_ortho) / np.matmul(
            np.transpose(w_ortho), w_ortho)
        p_ortho = np.transpose(
            np.matmul(np.transpose(t_ortho), data)
            / np.matmul(np.transpose(t_ortho), t_ortho))
        # Deflate: remove the response-orthogonal variation from the data
        data = data - np.matmul(t_ortho, np.transpose(p_ortho))
        W[:, i] = np.reshape(w_ortho, (dof,))
        P[:, i] = np.reshape(p_ortho, (dof,))
        T[:, i] = np.reshape(t_ortho, (n,))
    self.data_p = data
    self.data_o = np.matmul(T, np.transpose(P))
    self.W_o = W
    self.T_o = T
    self.P_o = P
    # Build a PLS regression on the orthogonally filtered data
    pls = PLS(n_components=self.comp).fit(self.data_p, resp)
    self.analysis = pls
    self.rotated_data = pls.transform(self.data_p)
    return self
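# The loop above is an O-PLS filter: it strips variation in the data that is
# orthogonal to the response before fitting an ordinary PLS. A self-contained
# sketch of a single deflation step (synthetic data; names here are
# illustrative, not the class API):
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 8))
y = rng.normal(size=(30, 1))
w = (y.T @ X / np.linalg.norm(y)).T          # predictive weight vector
t = X @ w / (w.T @ w)                        # scores
p = (t.T @ X / (t.T @ t)).T                  # loadings
w_ortho = p - (w.T @ p / (w.T @ w)) * w      # loading part orthogonal to w
w_ortho /= np.linalg.norm(w_ortho)
print((w.T @ w_ortho).item())                # ~0: orthogonal to predictive direction
t_ortho = X @ w_ortho
p_ortho = (t_ortho.T @ X / (t_ortho.T @ t_ortho)).T
X_deflated = X - t_ortho @ p_ortho.T         # orthogonal variation removed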
def _pls_regression_train(table, feature_cols, label_cols, n_components=2,
                          scale=True, max_iter=500, tol=1e-6):
    pls_model = PLS(n_components=n_components, scale=scale,
                    max_iter=max_iter, tol=tol)
    _, features = check_col_type(table, feature_cols)
    _, labels = check_col_type(table, label_cols)
    pls_model.fit(features, labels)
    predict = pls_model.predict(features)
    _mean_absolute_error = mean_absolute_error(labels, predict)
    _mean_squared_error = mean_squared_error(labels, predict)
    _r2_score = r2_score(labels, predict)
    # pd.DataFrame.from_items was removed in pandas 1.0; a plain dict
    # preserves column order on Python 3.7+
    result_table = pd.DataFrame({
        'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
        'Score': [_mean_absolute_error, _mean_squared_error, _r2_score]
    })
    label_name = {
        'n_components': 'Number of components',
        'scale': 'Scale',
        'max_iter': 'Max iteration',
        'tol': 'Tolerance'
    }
    get_param = pls_model.get_params()
    param_table = pd.DataFrame({
        'Parameter': list(label_name.values()),
        'Value': [get_param[x] for x in label_name.keys()]
    })
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### PLS Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))
    model = _model_dict('pls_regression_model')
    model['feature_cols'] = feature_cols
    model['label'] = label_cols
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['max_iter'] = max_iter
    model['tol'] = tol
    model['pls_model'] = pls_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
res_data = res_data[y_name]
x_data_column = data["550 nm"]

# single-channel exponential model fitted with scipy's curve_fit
def model(x, a, b, c):
    return a * np.exp(-b * x) + c

fit_values, _ = curve_fit(model, x_data_column, y_data, maxfev=10**6)
y_fit_model = model(x_data_column, *fit_values)
res_model = y_data - y_fit_model

# make residues from PLS
pls = PLS(n_components=3)
pls_model = pls.fit(x_data, y_data)
y_fit_pls = pls.predict(x_data)
# predict() returns an (n, 1) array; flatten it before subtracting the Series
res_pls = y_data - y_fit_pls.T[0]

# overlay the two residual histograms (continuation and labels are assumed;
# the original snippet was truncated after the first argument list)
plt.hist(res_model, alpha=0.5, label="exponential fit")
plt.hist(res_pls, alpha=0.5, label="PLS")
plt.legend()
plt.show()
crossval = KFold(n_splits=5)
cv_2 = crossval.split(data_norm, resp)
cv_accuracy = np.zeros(5)
models = list()
vip = np.zeros((5, 19))
for i, (train, test) in enumerate(cv_2):
    train = np.array(train)
    test = np.array(test)
    train_data = data_norm[train, :]
    train_resp = resp[train, :]
    test_data = data_norm[test, :]
    test_resp = resp[test, :]
    models.append(PLS().fit(train_data, train_resp))
    pls_comps = models[i].transform(train_data)
    testing = models[i].predict(test_data)
    # RMSE on the held-out fold
    cv_accuracy[i] = np.sqrt(np.mean((testing - test_resp) ** 2))
    vip[i, :] = getVIP(models[i])
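# getVIP is not defined in this snippet; a minimal sketch of the standard
# VIP (variable importance in projection) computation for a fitted
# scikit-learn PLSRegression might look like this (formula from the PLS
# literature, not from the source):
import numpy as np

def getVIP(pls):
    t = pls.x_scores_    # (n_samples, n_components)
    w = pls.x_weights_   # (n_features, n_components)
    q = pls.y_loadings_  # (n_targets, n_components)
    p, a = w.shape
    # y-variance explained by each component
    ss = np.sum(t ** 2, axis=0) * np.sum(q ** 2, axis=0)
    w_norm = w / np.linalg.norm(w, axis=0)
    return np.sqrt(p * (w_norm ** 2 @ ss) / ss.sum())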
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
# x_data = data[data_columns]
y_columns = ['Total Chlorophyll (ug/ml)',
             'Chlorophyll a (ug/ml)',
             'Chlorophyll b (ug/ml)']
invert_y = False
# x_data = np.log(x_data)
conditions = "Partial Least Squares"
# modeler = PLS(n_components=1)
modeler_name = "Partial Least Squares"
# model 1/y and invert the predictions back (see the sketch after this fragment)
modeler = TransformedTargetRegressor(regressor=PLS(n_components=3),
                                     func=np.reciprocal,
                                     inverse_func=np.reciprocal)


def plot_learning_curve(estimator, axis, X, y, ylim=None, cv=None,
                        group=None, scorer=make_scorer(mean_absolute_error),
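# Self-contained sketch of what the reciprocal-transform wrapper above does:
# fit PLS on 1/y, then apply 1/yhat to the predictions (synthetic data and
# coefficients assumed, not from the source).
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.cross_decomposition import PLSRegression

rng = np.random.default_rng(3)
X_demo = rng.uniform(1, 2, size=(50, 4))
y_demo = 1.0 / (X_demo @ np.array([0.5, 1.0, 0.2, 0.3]))  # reciprocal-linear target

ttr = TransformedTargetRegressor(regressor=PLSRegression(n_components=3),
                                 func=np.reciprocal,
                                 inverse_func=np.reciprocal)
ttr.fit(X_demo, y_demo)
print(ttr.predict(X_demo[:3]))  # close to y_demo[:3]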
pc1 = tmp[:, i]
pc2 = tmp[:, j]
plt.scatter(pc1, pc2)
plt.xlabel("PLS Component " + str(i + 1))
plt.ylabel("PLS Component " + str(j + 1))
plt.show()


##################### MAIN CODE #####################
# Load data into a numpy array; keep pandas just for convenience right now
data = load_data.loadDataPandas('../data/SCLC_study_output_filtered_2.csv')
d = data.to_numpy()
var_index = data.columns.values.tolist()
# vector of class responses associated with the data
resp = load_data.getResponseMatrix2D()

# Create an object to normalize and un-normalize the data
norm_trans = pre.StandardScaler().fit(d)
data_norm = norm_trans.transform(d)
# data_norm, norm_trans = pre.mean_center(d)  # in-built preprocessing method - TBD

# Fit a Partial Least Squares model
pls = PLS().fit(data_norm, resp)
pls_trans = pls.transform(data_norm)
plotProjectionScatterMultiClass(pls_trans, resp, 2)
# "Ridge alpha 0.01", "Ridge alpha 0.001", # "Ridge alpha 0.0001", "Ridge alpha 0.00001", # "Lars", "Guassian Regression", # "Gradient Boosting", # "SVR", "LinearSVR", "NuSVR", # "LogisticRegression", "LinearRegression", "SGDRegressor", # "ElasticNet", "ARDRegression", "BayesianRidge", # "HuberRegressor", "RANSACRegressor", "TheilSenRegressor", # "PassiveAggressiveRegressor", # "AdaBoostRegressor", "BaggingRegressor", "GradientBoostingRegressor", # "RandomForestRegressor", "ExtraTreesRegressor", # "Kernel Ridge" # ] models_to_test = [ PLS(n_components=1), PLS(n_components=2), PLS(n_components=3), PLS(n_components=4), Lasso(alpha=5), Lasso(alpha=2), Lasso(alpha=1), Lasso(alpha=0.2), LassoLars(alpha=1), LassoLars(alpha=0.1), LassoLars(alpha=0.01), LassoLars(alpha=0.001), LassoLars(alpha=0.0003), Ridge(alpha=0.01, max_iter=5000), Ridge(alpha=0.001, max_iter=5000), Ridge(alpha=0.0001, max_iter=5000),