[ DecisionTreeRegressor(), { "criterion": ["mse", "friedman_mse"], "splitter": ["best", "random"], "min_samples_split": [x for x in range(2, 6)] # generates a list [2,3,4,5] } ], [ GradientBoostingRegressor(), { "loss": ["ls", "lad", "huber", "quantile"] } ], [GaussianProcessRegressor(), {}], [PLSRegression(), {}], [AdaBoostRegressor(), {}] ] # Dataset train_X = [[5, 3], [9, 1], [8, 6], [5, 4]] train_Y = [28, 810, 214, 19] pred_X = [7, 3] # Train each model individually using grid search for model in models: regressor = model[0] param_grid = model[1] model = GridSearchCV(regressor, param_grid)
# (name, slice) pairs over the frequency axis, in the order the band averages
# are concatenated for the 'cannonical' clustering.
_CANONICAL_BANDS = (
    ('theta', slice(0, 4)),
    ('alpha', slice(4, 8)),
    ('beta', slice(8, 26)),
    ('gamma', slice(26, None)),
)

# Bands selectable with freq_clustering='single_band'.
_SINGLE_BANDS = dict(_CANONICAL_BANDS, topgamma=slice(41, 51))


def _reduce_frequency_axis(stim, freq_clustering, bin_size, band, groups,
                           pair_index):
    """Reduce the last (frequency) axis of a time-first window.

    ``stim`` has time on axis 0 and frequency bins on the last axis. Every
    branch operates independently on each time sample, so axis 0 is preserved.
    """
    if freq_clustering == 'cannonical':
        # Average within each canonical band, then lay the bands side by side.
        return np.concatenate(
            [np.mean(stim[:, :, sl], axis=-1) for _, sl in _CANONICAL_BANDS],
            axis=-1)
    if freq_clustering == 'equal':
        # Drop the lowest bin so the remaining count is not prime, then
        # average consecutive runs of `bin_size` bins and flatten.
        stim = stim[..., 1:]
        stim = np.mean(
            stim.reshape((stim.shape[0], stim.shape[1], -1, bin_size)),
            axis=-1)
        return stim.reshape((stim.shape[0], stim.shape[1] * stim.shape[2]))
    if freq_clustering == 'random':
        # Same bin drop as 'equal'; `groups` holds the shuffled bin indices.
        stim = stim[..., 1:]
        return np.concatenate(
            [np.mean(stim[:, :, group], axis=-1) for group in groups],
            axis=-1)
    if freq_clustering == 'single_band':
        sl = _SINGLE_BANDS.get(band)
        # 'all' (or any unrecognised band) keeps every frequency bin.
        return stim if sl is None else stim[:, :, sl]
    if freq_clustering == 'pairwise':
        return stim[:, :, pair_index]
    # Unrecognised clustering: pass the window through unchanged.
    return stim


def CCA_across_patients(data_files, alg='cca', freq_clustering='cannonical',
                        bin_size=10, window_size=500, post_shift=0,
                        pre_shift=0, band='alpha', pair=(1, 1)):
    """Cross-validate a CCA/PLS mapping from pre- to post-stimulus ERSP.

    For every ERSP reference in every HDF5 file, a window ending at
    ``1000 - pre_shift`` and a window starting at ``1000 + post_shift`` are
    cut out, reduced along the frequency axis according to
    ``freq_clustering`` and flattened into one feature row each.

    Parameters
    ----------
    data_files : iterable of str
        HDF5 files holding references at ``['cfg_PAINT_cond']['ChanERSP']``;
        each referenced dataset is read into a (250, 51, 95) array.
    alg : {'cca', 'pls'}
        Model to cross-validate (5 folds).
    freq_clustering : str
        'cannonical', 'equal', 'random', 'single_band' or 'pairwise'; any
        other value leaves all frequency bins untouched.
    bin_size : int
        Bins per group for 'equal' and 'random'. One of the 51 bins is
        dropped first, so it must divide 50.
    window_size, post_shift, pre_shift : number
        Given in ms per the original author's note; each is divided by the
        sampling factor of 10 to obtain a sample count.
    band : str
        Band name used when ``freq_clustering == 'single_band'``.
    pair : tuple of int
        (pre, post) frequency-bin indices for ``'pairwise'``.

    Returns
    -------
    tuple of float
        ``(mean test score, mean train score)`` over the 5 folds, using each
        estimator's own ``score`` method.

    Raises
    ------
    ValueError
        If ``alg`` is unknown, no recording is found, or NaN padding leaves
        no usable window.
    """
    if alg not in ('cca', 'pls'):
        # Fail before the expensive file pass rather than returning None.
        raise ValueError("alg must be 'cca' or 'pls', got %r" % (alg,))

    # Convert the ms arguments into samples.
    samp_factor = 10
    max_window = int(window_size / samp_factor)
    pre_shift = pre_shift / samp_factor
    post_shift = post_shift / samp_factor

    # These bounds do not depend on the recording, so compute them once.
    pre_window_end = int(1000 / samp_factor - pre_shift)
    post_window_start = int(1000 / samp_factor + post_shift)
    total_samples = int(2500 / samp_factor)

    windows = []
    pre_windows = []
    post_windows = []

    for data_file in data_files:
        with h5py.File(data_file, 'r') as f:
            # ERSP time series references
            ERSP_refs = f['cfg_PAINT_cond']['ChanERSP']
            for i in range(ERSP_refs.size):
                # Read at 64 bit precision; rows are narrowed to 32 bit below.
                ERSP = np.zeros((250, 51, 95), dtype=np.float64)
                f[ERSP_refs[i][0]].read_direct(ERSP)

                # Find the longest leading/trailing NaN padding so the
                # windows can stay clear of it.
                leading_max = 0
                trailing_max = 0
                for j in range(51):
                    for k in range(95):
                        lead, trail = count_leading_trailing_true(
                            np.isnan(ERSP[:, j, k]))
                        leading_max = max(leading_max, int(lead))
                        trailing_max = max(trailing_max, int(trail))

                # Per-recording window. The requested size is NOT overwritten
                # here; all recordings are trimmed to a common length later.
                window = int(min(
                    max_window,
                    pre_window_end - leading_max,
                    total_samples - trailing_max - post_window_start))

                pre_stim = ERSP[pre_window_end - window:pre_window_end, :, :]
                post_stim = ERSP[post_window_start:
                                 post_window_start + window, :, :]

                # Re-arrange axes so that frequency bins are last
                pre_stim = np.swapaxes(pre_stim, 1, 2)
                post_stim = np.swapaxes(post_stim, 1, 2)

                groups = None
                if freq_clustering == 'random':
                    # One shuffle per recording, shared by pre and post.
                    n_bins = pre_stim.shape[-1] - 1
                    idxs = np.arange(n_bins)
                    np.random.shuffle(idxs)
                    groups = np.split(idxs, int(n_bins / bin_size))

                pre_stim = _reduce_frequency_axis(
                    pre_stim, freq_clustering, bin_size, band, groups,
                    pair[0])
                post_stim = _reduce_frequency_axis(
                    post_stim, freq_clustering, bin_size, band, groups,
                    pair[1])

                # Copy as float32 so the full ERSP buffer can be released.
                windows.append(window)
                pre_windows.append(np.array(pre_stim, dtype=np.float32))
                post_windows.append(np.array(post_stim, dtype=np.float32))

    if not windows:
        raise ValueError('no ERSP recordings found in data_files')

    # Trim every recording to the shortest window so all rows have the same
    # width: keep the samples nearest the pre/post boundary on both sides.
    common = min(windows)
    if common < 1:
        raise ValueError('NaN padding leaves no usable window')
    pre_stim_feature_vector = np.concatenate(
        [p[max(p.shape[0] - common, 0):].reshape((1, -1))
         for p in pre_windows])
    post_stim_feature_vector = np.concatenate(
        [q[:common].reshape((1, -1)) for q in post_windows])

    # Cross-validate the pre -> post mapping.
    if alg == 'cca':
        corrmodel = CCA(n_components=1)
        crsval = cross_validate(corrmodel,
                                pre_stim_feature_vector,
                                post_stim_feature_vector,
                                cv=5,
                                return_train_score=True)
        return np.mean(crsval['test_score']), np.mean(crsval['train_score'])

    # alg == 'pls': manually cross-validate
    corrmodel = PLSRegression()
    folds = KFold(n_splits=5)
    test_scores = []
    train_scores = []
    for train_index, test_index in folds.split(pre_stim_feature_vector,
                                               post_stim_feature_vector):
        corrmodel.fit(pre_stim_feature_vector[train_index],
                      post_stim_feature_vector[train_index])
        test_scores.append(
            corrmodel.score(pre_stim_feature_vector[test_index],
                            post_stim_feature_vector[test_index]))
        train_scores.append(
            corrmodel.score(pre_stim_feature_vector[train_index],
                            post_stim_feature_vector[train_index]))
    return np.mean(test_scores), np.mean(train_scores)
from sklearn.gaussian_process.kernels import ConstantKernel as C # sklearn NO random forest KAIKI lr = LinearRegression() dtr = DecisionTreeRegressor() rfr = RandomForestRegressor() rte = RandomTreesEmbedding() mr = MLPRegressor(max_iter=1000) omp = OrthogonalMatchingPursuit() ran = RANSACRegressor() tsr = TheilSenRegressor(random_state=42) br = BayesianRidge(n_iter=300, tol=0.001) bgm = BayesianGaussianMixture() knr = KNeighborsRegressor(n_neighbors=5) rnr = RadiusNeighborsRegressor(radius=1.0) pls = PLSRegression(n_components=1) gnb = GaussianNB() mnb = MultinomialNB() svl = SVR(kernel='linear') svr = SVR() las = Lasso() en = ElasticNet() rr = Ridge() kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) estimators = { 'LR ': lr, 'DTR': dtr, 'RFR': rfr, 'OMP': omp,
# Hide the axis ticks of the current figure and display it.
plt.xticks(())
plt.yticks(())
plt.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000  # number of samples
q = 3  # number of response columns
p = 10  # number of predictor columns
X = np.random.normal(size=n * p).reshape((n, p))
# B has shape (p, q): every column is [1, 2, 0, ..., 0], so each response
# depends only on the first two predictors.
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise (plus a constant offset of 5)
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
# NOTE(review): the orientation of coef_ may differ between scikit-learn
# releases - confirm against the installed version before comparing to B.
print("Estimated B")
print(np.round(pls2.coef_, 1))
# The prediction is computed but its result is discarded.
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
# y = X1 + 2*X2 + noise + 5, a single response vector of length n
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
GBDT_params={'learning_rate':[0.1,0.1,0.1,0.1],'maxdepth':[2,3,2,2],'n_estimators':[100,100,100,100]}#XGBOOST与GBDT相同 此处共用 ENANN_params = {'max_iter': [100, 100, 200, 300], 'p': [0.3, 0.5, 0.7, 0.5]} DFN_params = {'learning_rate':[0.1, 0.1, 0.1, 0.001], 'batch': [300, 400, 300, 400]} LSTM_params = {'learning_rate':[1e-4, 1e-5, 1e-4, 1e-6], 'depth': [2, 2, 1, 2], 'hidden_number': [256]*4} RNN_params = {'learning_rate':[0.1, 0.1, 0.1, 0.001], 'depth': [1, 1, 2, 1], 'hidden_number': [256]*4} #**********************2.全样本3/12/24/36个月滑动窗口函数运行**********************************# path = r'..\DataBase\factor'#96项因子所在路径 factorname = [x[1:-4] for x in os.listdir(path)] riskfree, timeseries, factor, timeseries2, index = datatransfrom(path)[0], datatransfrom(path)[1], datatransfrom(path)[2], datatransfrom2(path)[0], datatransfrom2(path)[1] for i in range(4): i= 0 output(window[i],LinearRegression(),'OLS'+str(window[i]),riskfree[i], timeseries) FC(window[i], riskfree[i], timeseries, 96,'FC') output(window[i], PLSRegression(PLS_params[i]), 'PLS' + str(window[i]), riskfree[i], timeseries) output(window[i],Lasso(alpha=lasso_params[i]),'Lasso'+ str(window[i]), riskfree[i], timeseries) output(window[i],Ridge(alpha=ridge_params[i]),'Ridge'+str(window[i]),riskfree[i], timeseries) output(window[i],ElasticNet(alpha= elasticnet_params['alpha'] [i],l1_ratio= elasticnet_params['l1_ratio'][i]),'ElasticNet'+str(window[i]),riskfree[i], timeseries) output(window[i],SVR(kernel=SVR_params['kernel'][i],gamma= SVR_params ['gamma'][i],C= SVR_params ['C'][i] ),'SVR'+str(window[i]),riskfree[i], timeseries) output(window[i], GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]), 'GBDT' + str(window[i]),riskfree[i], timeseries) output(window[i], XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]), 'XGBOOST' + 
str(window[i]), riskfree[i], timeseries) output(window[i], ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]), 'ENANN' + str(window[i]), riskfree[i], timeseries) output(window[i], DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]), 'DFN' + str(window[i]), riskfree[i], timeseries) output2(window[i], rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571, lr=LSTM_params['learning_rate'][i]), 'LSTM'+ str(window[i]) ,riskfree[i], timeseries2) output2(window[i], rm.lstmmodule(96, RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571, lr=RNN_params['learning_rate'][i], ntype='RNN'), 'RNN'+ str(window[i]), riskfree[i], timeseries2) modellist = [DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]), ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]), XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]), GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]), PLSRegression(PLS_params[i]),
# %% ###### GRIDSEARCH ####### #### Scalers scalers = [RobustScaler(), StandardScaler(), MinMaxScaler()] ### Regressions #cvR=SKF(10).split(feats, data_train['gender']) #a=list(cvR) alphas = np.arange(0.001, 10, 0.005) lasso = LassoCV(alphas=alphas, fit_intercept=True, max_iter=100000) ridge = RidgeCV(alphas=alphas, fit_intercept=True) pls = PLSRegression(n_components=10, scale=False, max_iter=1000) gbr = GradientBoostingRegressor(loss='lad', alpha=0.7) SVR = SVR(kernel='linear', C=3) GPR = GaussianProcessRegressor(normalize_y=True, n_restarts_optimizer=50, kernel=RBF()) regressors = [lasso, ridge] # %% #### Pipeline #### #cv=SKF(10).split(feats, data_train['site']) cv = KF(10, shuffle=True) pipe = Pipeline([('scale', StandardScaler()), ('regress', lasso)])
def stacklearning(self):
    """Run the stacking / model-comparison experiments and print scores.

    Exploratory routine: it builds many candidate regressors and stacked
    ensembles, cross-validates some of them, fits a few on the full data
    and prints RMSE / correlation figures. Nothing is returned.

    It reads names that are not defined here (``X``, ``y``, ``X2``, ``y2``,
    ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``extractDf``,
    ``changeList``, ``myScoreFunc``, ``calcACC``, ``calcRMSE``, ``calcCorr``,
    ``preprocess`` and the ``ext*`` feature extractors), so they must exist
    in an enclosing scope. Numeric comments next to a model are scores
    recorded by the original author.

    Two local names were changed from the original: the SGDRegressor is
    bound to ``sgd_reg`` instead of ``sgd`` (assigning ``sgd`` made it local
    to the whole function, so the earlier ``sgd.FMRegression`` call raised
    UnboundLocalError), and the all-features pipeline is bound to ``all_rf``
    instead of shadowing the builtin ``all``.
    """

    class sparseNorm(BaseEstimator, TransformerMixin):
        """Stateless transformer: normalise the rows of a sparse copy of X."""

        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            from sklearn import preprocessing
            Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
            return Y

    # Factorisation machine; the second configuration replaces the first.
    # `sgd` here must resolve to the outer-scope name (see docstring).
    fm = sgd.FMRegression(
        n_iter=4743,
        init_stdev=0.1,
        rank=100,
        l2_reg_w=0,
        l2_reg_V=0,
        step_size=0.1,
    )
    fm = sgd.FMRegression(
        n_iter=9943,
        init_stdev=0.1,
        rank=219,
        l2_reg_w=0,
        l2_reg_V=0.06454,
        step_size=0.1,
    )
    pipe = make_pipeline(sparseNorm(), fm)
    calcACC(pipe, X=X2)

    # Base learners with hand-picked hyper-parameters.
    xgb = xgboost.XGBRegressor(n_estimators=100,
                               max_depth=7,
                               gamma=0,
                               colsample_bytree=0.1)
    lgbm = LGBMRegressor(boosting_type='gbdt',
                         num_leaves=367,
                         learning_rate=0.06,
                         feature_fraction=0.14,
                         max_depth=28,
                         min_data_in_leaf=8)
    rgf = RGFRegressor(max_leaf=1211,
                       algorithm="RGF",
                       test_interval=100,
                       loss="LS",
                       verbose=False,
                       l2=0.93,
                       min_samples_leaf=2)
    rf = RandomForestRegressor(max_depth=20,
                               random_state=0,
                               n_estimators=56,
                               min_samples_split=2,
                               max_features=0.21)
    # The tuned forest above is immediately replaced by a default one.
    rf = RandomForestRegressor()
    ext = ExtraTreesRegressor(n_estimators=384,
                              max_features=2228,
                              min_samples_split=0.01,
                              max_depth=856,
                              min_samples_leaf=1)
    svr = SVR(gamma=9.5367431640625e-07, epsilon=0.0009765625, C=2048.0)

    #test combination
    # One random-forest pipeline per feature-set extractor.
    desNew = make_pipeline(extdescriptorNew(), rf)
    morNew = make_pipeline(extMorganNew(), rf)
    kotNew = make_pipeline(extklekotaTothNew(), rf)
    macNew = make_pipeline(extMACCSNew(), rf)
    desMac = make_pipeline(extDescriptorMACCS(), rf)
    morMac = make_pipeline(extMorganMACCS(), rf)
    kotMac = make_pipeline(extKlekotaTothMACCS(), rf)
    morKotNew = make_pipeline(extMorganKlekotaTothNew(), rf)
    des = make_pipeline(extOnlyDescriptor(), rf)
    mor = make_pipeline(extOnlyMorgan(), rf)
    kot = make_pipeline(extOnlyklekotaToth(), rf)
    mac = make_pipeline(extOnlyMACCS(), rf)
    all_rf = make_pipeline(extAll(), rf)
    allwithoutNew = make_pipeline(extAllwithoutNew(), rf)
    allwithoutMaccs = make_pipeline(extAllwithoutMaccs(), rf)
    allwithoutDes = make_pipeline(extAllwithoutDescriptor(), rf)

    testDic = {
        "Desc+New": desNew,
        "Mor+New": morNew,
        "kot+New": kotNew,
        "MACCS+New": macNew,
        "Des+MAC": desMac,
        "Morgan+Maccs": morMac,
        "Kot+MACCS": kotMac,
        "mor+kot+New": morKotNew,
        "descriptor": des,
        "morgan": mor,
        "kot": kot,
        "MACCS": mac,
        "All": all_rf,
        "All without "
        "new": allwithoutNew,
        "All without MACCS": allwithoutMaccs,
        "All without Des": allwithoutDes
    }

    #10fold
    cv = KFold(n_splits=10, shuffle=True, random_state=0)

    #Fingerprinttest
    # Cross-validate every feature-set pipeline and record mean RMSE and
    # mean correlation per name.
    resultDic = {}
    resultDic2 = {}
    for name, model in testDic.items():
        #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
        #calcACC(model,X=X,y=y2,name=name)

        Scores = cross_validate(model, X2, y2, cv=cv, scoring=myScoreFunc)
        RMSETmp = Scores['test_RMSE'].mean()
        CORRTmP = Scores['test_Correlation coefficient'].mean()
        resultDic.update({name: [RMSETmp, CORRTmP]})
        print(name, RMSETmp, CORRTmP)

    #stacking
    alldata = make_pipeline(extAll())
    # random forest
    #1.1546 0.70905
    stack = StackingRegressor(regressors=[alldata],
                              meta_regressor=rf,
                              verbose=1)

    # Light Gradient boosting
    # 1.160732 0.703776
    testmodel = StackingRegressor(regressors=[alldata],
                                  meta_regressor=lgbm,
                                  verbose=1)
    # XGboost
    # 1.1839805 0.689571
    testmodel = StackingRegressor(regressors=[alldata],
                                  meta_regressor=xgb,
                                  verbose=1)
    # Regularized greedily forest
    # 1.17050 0.6992
    testmodel = StackingRegressor(regressors=[alldata],
                                  meta_regressor=rgf,
                                  verbose=1)

    #pls 22.808047774809697 0.6410026452910016 i=4
    for i in np.arange(3, 11, 1):
        pls = PLSRegression(n_components=i)
        testmodel = StackingRegressor(regressors=[alldata],
                                      meta_regressor=pls,
                                      verbose=0)
        calcACC(testmodel)
    pls = PLSRegression(n_components=4)

    #SVR
    svr = SVR(
        gamma=9.5367431640625 / 10000000,
        C=1559.4918100725592,
        epsilon=0.0009765625,
    )
    svr = SVR(kernel='rbf',
              gamma=9.5367431640625e-07,
              epsilon=0.0009765625,
              C=2048.0)

    testmodel = StackingRegressor(regressors=[alldata],
                                  meta_regressor=svr,
                                  verbose=1)
    # Note: the bare SVR is scored here, not the stacked `testmodel`.
    calcACC(svr)

    #Extratree 1.157420824123527 0.7061010221224269
    testmodel = StackingRegressor(regressors=[alldata],
                                  meta_regressor=ext,
                                  verbose=1)
    calcACC(testmodel)

    #k-NN
    nbrs = KNeighborsRegressor(3)

    ##Linear regressions
    #Stochastic Gradient Descent
    # Deliberately not named `sgd`: that would shadow the outer `sgd` used
    # for FMRegression at the top of this function.
    sgd_reg = SGDRegressor(max_iter=1000)
    # Ridge
    for i in [1, 10, 100, 1000]:
        ridge = Ridge(alpha=i)
        calcACC(ridge)
    ridge = Ridge(alpha=45.50940042350705)
    calcACC(ridge)
    # multiple linear
    lin = make_pipeline(forlinear(), LinearRegression(n_jobs=-1))
    calcACC(lin)

    #stacking
    #0.69
    testmodel = StackingRegressor(regressors=[alldata, nbrs, all_rf],
                                  meta_regressor=rf,
                                  verbose=1)
    #1.1532 0.70926
    testmodel = StackingRegressor(
        regressors=[alldata, nbrs, all_rf, xgb, lgbm, rgf],
        meta_regressor=rf,
        verbose=1)
    #1.16420 0.7041
    testmodel = StackingRegressor(regressors=[alldata, alldata, all_rf],
                                  meta_regressor=rf,
                                  verbose=1)
    #1.16379 0.7044
    stack1 = StackingRegressor(
        regressors=[alldata, nbrs, all_rf, xgb, lgbm, rgf],
        meta_regressor=rf,
        verbose=1)
    testmodel = StackingRegressor(regressors=[alldata, stack1, stack1],
                                  meta_regressor=rf,
                                  verbose=1)
    #1.1535496740699531 0.7108839199109559
    pcaFeature = make_pipeline(extPCA())
    testmodel = StackingRegressor(
        regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
        meta_regressor=rf,
        verbose=1)
    #1.181801005432221 0.6889745579620922
    testmodel = StackingRegressor(
        regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
        meta_regressor=lgbm,
        verbose=1)
    #0.70613
    testmodel = StackingRegressor(
        regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
        meta_regressor=xgb,
        verbose=1)
    #0.71641717
    testmodel = StackingRegressor(
        regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
        meta_regressor=rf,
        verbose=1)
    #0.7146922
    testmodel = StackingRegressor(regressors=[
        pcaFeature, alldata, nbrs, ridge, rf, xgb, lgbm, rgf, ext
    ],
                                  meta_regressor=rf,
                                  verbose=1)

    #new features
    pcaFeature = make_pipeline(extPCA())

    #old
    pipe1 = make_pipeline(extMACCS(), rf)
    pipe2 = make_pipeline(extMorgan(), rf)
    pipe3 = make_pipeline(extDescriptor(), rf)

    pipe4 = make_pipeline(extPCA(), rgf)
    pipe7 = make_pipeline(extDescriptor(), rgf)
    pipe8 = make_pipeline(extDescriptor(), rgf)

    # From here on several learners are rebound to simpler configurations.
    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto', kernel='linear')

    pls = PLSRegression(n_components=4)

    extMACCSdata = make_pipeline(extMACCS())

    nbrsPipe = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    ave = extAverage()
    withoutdesc = make_pipeline(extMACCS())

    meta = RandomForestRegressor(max_depth=20,
                                 random_state=0,
                                 n_estimators=400)
    #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

    #0.70
    stack = StackingRegressor(
        regressors=[pipe1, pipe2, pipe3, xgb, lgbm, rgf, rf],
        meta_regressor=ave,
        verbose=1)

    #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

    #0.69######################
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rf,
                               verbose=1)
    #0.70
    stack2 = StackingRegressor(
        regressors=[stack1, alldata, rgf, lgbm, xgb],
        meta_regressor=rf,
        verbose=1)

    #0.71
    stack3 = StackingRegressor(regressors=[stack2, pipe1],
                               meta_regressor=ave,
                               verbose=1)
    ###########################
    ###########################
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rf,
                               verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, withoutdesc, lgbm, rgf],
                               meta_regressor=rf,
                               verbose=1)
    stack3 = StackingRegressor(regressors=[stack2, pipe1, xgb],
                               meta_regressor=ave,
                               verbose=1)

    ###########################

    #stackingwithknn
    # stack3 keeps a reference to the previous stack2; only the names
    # stack1 / stack2 are rebound here.
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rf,
                               verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, nbrs, pipe1],
                               meta_regressor=rf,
                               verbose=1)

    #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

    cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    # The bare `...mean()**(1 / 2)` expressions below are evaluated but their
    # values are discarded (interactive-session leftovers).
    St1Scores = cross_validate(stack1, X, y, cv=cv)
    St1Scores['test_score'].mean()**(1 / 2)

    St2Scores = cross_validate(stack2, X, y, cv=cv)
    St2Scores['test_score'].mean()**(1 / 2)

    St3Scores = cross_validate(stack3, X, y, cv=cv)
    St3Scores['test_score'].mean()**(1 / 2)

    stackScore = cross_validate(stack, X, y, cv=cv)
    stackScore['test_score'].mean()**(1 / 2)

    lgbmScores = cross_validate(lgbm, X, y, cv=cv)
    lgbmScores['test_score'].mean()**(1 / 2)

    rgfScores = cross_validate(rgf, X, y, cv=cv)
    rgfScores['test_score'].mean()**(1 / 2)

    RFScores = cross_validate(rf, X, y, cv=cv)
    RFScores['test_score'].mean()**(1 / 2)

    scores = cross_validate(stack2, X, y, cv=cv)
    scores['test_score'].mean()**(1 / 2)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" %
          (scores['test_score'].mean(), scores['test_score'].std(),
           'stacking'))

    # Fit the final stack on all data and report train/test figures.
    stack3.fit(X, y)
    y_pred = stack3.predict(X_train)
    y_val = stack3.predict(X_test)
    #stack3.score(X_train, y_train)
    exX = preprocess(extractDf, changeList)
    # Predictions are raised to the power of ten before being listed.
    valy = (10**(stack3.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    # Each of the following overwrites `valy` with another model's output.
    stack1.fit(X, y)
    valy = (10**(stack1.predict(exX))).tolist()

    sgd_reg.fit(X, y)
    valy = (10**(sgd_reg.predict(exX))).tolist()

    rgfpipe = make_pipeline(extMACCS(), rf)
    rgf.fit(X, y)
    valy = (10**(rgf.predict(exX))).tolist()

    nbrs.fit(X, y)
    valy = (10**(nbrs.predict(exX))).tolist()

    pipe = make_pipeline(extMACCS(), rf)
    pipe.fit(X, y)
    valy = (10**(pipe.predict(exX))).tolist()

    rf.fit(X, y)
    y_pred = rf.predict(X_train)
    y_val = rf.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10**(rf.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

    lgbm.fit(X, y)
    #y_pred = pipe1.predict(X_train)
    #y_val = pipe1.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10**(lgbm.predict(exX))).tolist()
    # NOTE(review): y_pred / y_val still hold the random-forest predictions
    # from above, so these four lines repeat the forest's figures.
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
def pls_cal(dbfile,maskfile,outpath,which_elem,testfold,nc,normtype=1,mincomp=0,maxcomp=100,plstype='mlpy',keepfile=None,removefile=None,cal_dir=None,masterlist_file=None,compfile=None,name_sub_file=None,foldfile=None,nfolds=7,seed=None,n_bag=None,skscale=False,n_boost=None,max_samples=0.1,n_elems=9): plstype_string=plstype if n_bag!=None: plstype_string=plstype+'_bag' if n_boost!=None: plstype_string=plstype+'_boost' if skscale==True: plstype_string=plstype+'_scale' print 'Reading database' sys.stdout.flush() spectra,comps,spect_index,names,labels,wvl=ccam.read_db(dbfile,compcheck=True,n_elems=n_elems) oxides=labels[2:] compindex=numpy.where(oxides==which_elem)[0] print 'Choosing spectra' which_removed=outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_removed.csv' spectra,names,spect_index,comps=ccam.choose_spectra(spectra,spect_index,names,comps,compindex,mincomp=mincomp,maxcomp=maxcomp,keepfile=keepfile,removefile=removefile,which_removed=which_removed) print 'Masking spectra' spectra,wvl=ccam.mask(spectra,wvl,maskfile) print 'Normalizing spectra' spectra=ccam.normalize(spectra,wvl,normtype=normtype) print 'Assigning Folds' if foldfile!=None: #if a fold file is specified, use it folds=ccam.folds(foldfile,names) else: #otherwise, define random folds folds=ccam.random_folds(names,nfolds,seed=seed) names_nofold=names[(folds==0)] spect_index_nofold=spect_index[(folds==0)] #write a file containing the samples not assigned to folds with open(which_removed,'ab') as writefile: writer=csv.writer(writefile,delimiter=',',) for i in range(len(names_nofold)): writer.writerow([names_nofold[i],spect_index_nofold[i],'No Fold']) #remove spectra that are not assigned to any fold spectra=spectra[(folds!=0),:] spect_index=spect_index[(folds!=0)] names=names[(folds!=0)] comps=comps[(folds!=0),:] folds=folds[(folds!=0)] print 'Defining Training and Test Sets' spectra_train=spectra[(folds!=testfold)] 
spect_index_train=spect_index[(folds!=testfold)] names_train=names[(folds!=testfold)] comps_train=comps[(folds!=testfold),compindex] folds_train=folds[(folds!=testfold)] folds_train_unique=numpy.unique(folds_train) spectra_test=spectra[(folds==testfold)] spect_index_test=spect_index[(folds==testfold)] names_test=names[(folds==testfold)] comps_test=comps[(folds==testfold),compindex] folds_test=folds[(folds==testfold)] print 'Do Leave One Label Out (LOLO) cross validation with all folds but the test set' #define array to hold cross validation predictions and RMSEs train_predict_cv=numpy.zeros((len(names_train),nc)) RMSECV=numpy.zeros(nc) for i in folds_train_unique: print 'Holding out fold #'+str(i) if skscale==False: #mean center those spectra left in #X_cv_in1,X_cv_in_mean1=meancenter.ccam_meancenter(spectra_train[(folds_train!=i),:]) X_cv_in,X_cv_in_mean=ccam.meancenter(spectra_train[(folds_train!=i),:]) #and those left out X_cv_out=ccam.meancenter(spectra_train[(folds_train==i),:],X_mean=X_cv_in_mean)[0] #mean center compositions left in Y_cv_in,Y_cv_in_mean=ccam.meancenter(comps_train[(folds_train!=i)]) if skscale==True: X_cv_in=spectra_train[(folds_train!=i),:] X_cv_out=spectra_train[(folds_train==i),:] Y_cv_in=comps_train[(folds_train!=i)] Y_cv_in_mean=0 #step through each number of components for j in range(1,nc+1): print 'Training Model for '+str(j)+' components' #train the model if plstype=='mlpy': PLS1model=ccam.mlpy_pls.PLS(j) PLS1model.learn(X_cv_in,Y_cv_in) #predict the samples held out train_predict_cv[(folds_train==i),j-1]=PLS1model.pred(X_cv_out)+Y_cv_in_mean if plstype=='sklearn': PLS1model=PLSRegression(n_components=j,scale=skscale) if n_bag==None and n_boost==None: PLS1model.fit(X_cv_in,Y_cv_in) train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1model.predict(X_cv_out)+Y_cv_in_mean) if n_bag!=None: PLS1bagged=ensemble.BaggingRegressor(PLS1model,n_estimators=n_bag,max_samples=max_samples,verbose=1) PLS1bagged.fit(X_cv_in,Y_cv_in) 
train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1bagged.predict(X_cv_out)+Y_cv_in_mean) if n_boost!=None: PLS1boosted=ensemble.AdaBoostRegressor(PLS1model,n_estimators=n_boost) PLS1boosted.fit(X_cv_in,Y_cv_in) train_predict_cv[(folds_train==i),j-1]=numpy.squeeze(PLS1boosted.predict(X_cv_out)+Y_cv_in_mean) #calculate RMSECV for i in range(0,nc): sqerr=(train_predict_cv[:,i]-comps_train)**2.0 RMSECV[i]=numpy.sqrt(numpy.mean(sqerr)) #mean center full model if skscale==False: X,X_mean=ccam.meancenter(spectra_train) X_test=ccam.meancenter(spectra_test,X_mean=X_mean)[0] X_all=ccam.meancenter(spectra,X_mean=X_mean)[0] Y,Y_mean=ccam.meancenter(comps_train) if skscale==True: X=spectra_train X_test=spectra_test X_all=spectra Y=comps_train Y_mean=0 #create arrays for results and RMSEs trainset_results=numpy.zeros((len(names_train),nc)) testset_results=numpy.zeros((len(names_test),nc)) results=numpy.zeros((len(names),nc)) RMSEP=numpy.zeros(nc) RMSEC=numpy.zeros(nc) beta=numpy.zeros((len(X[0,:]),nc)) Q_res=numpy.zeros((len(X[:,0]),nc)) T2=numpy.zeros((len(X[:,0]),nc)) [a,evals,b]=numpy.linalg.svd(numpy.cov(numpy.dot(X,X.transpose()))) evals=numpy.diag(evals**2) if cal_dir!=None: print 'Reading cal target data' cal_data,cal_wvl,cal_filelist=ccam.read_ccs(cal_dir) cal_data,cal_wvl=ccam.mask(cal_data,cal_wvl,maskfile) cal_data=ccam.normalize(cal_data,cal_wvl,normtype=normtype) if skscale==True: cal_data_centered=cal_data if skscale==False: cal_data_centered=ccam.meancenter(cal_data,X_mean=X_mean)[0] RMSEP_cal=numpy.zeros(nc) RMSEP_cal_good=numpy.zeros(nc) RMSEP_KGAMEDS=numpy.zeros(nc) RMSEP_MACUSANITE=numpy.zeros(nc) RMSEP_NAU2HIS=numpy.zeros(nc) RMSEP_NAU2LOS=numpy.zeros(nc) RMSEP_NAU2MEDS=numpy.zeros(nc) RMSEP_NORITE=numpy.zeros(nc) RMSEP_PICRITE=numpy.zeros(nc) RMSEP_SHERGOTTITE=numpy.zeros(nc) targets,dists,amps,nshots=ccam.target_lookup(cal_filelist,masterlist_file,name_sub_file) target_comps=ccam.target_comp_lookup(targets,compfile,which_elem) 
cal_results=numpy.zeros((len(targets),nc)) model_list=[] #Now step through each # of components with the full model for j in range(1,nc+1): print 'Training full model for '+str(j)+' components' if plstype=='mlpy': PLS1model=ccam.mlpy_pls.PLS(j) PLS1model.learn(X,Y) beta[:,j-1]=PLS1model.beta() model_list.append([PLS1model]) trainset_results[:,j-1]=PLS1model.pred(X)+Y_mean testset_results[:,j-1]=PLS1model.pred(X_test)+Y_mean results[:,j-1]=PLS1model.pred(X_all)+Y_mean if cal_dir != None: comps_copy=copy.copy(target_comps) # if skscale==True: # cal_results[:,j-1]=PLS1model.pred(cal_data) # if skscale==False: cal_results[:,j-1]=PLS1model.pred(cal_data_centered)+Y_mean RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results) if plstype=='sklearn': PLS1model=PLSRegression(n_components=j,scale=skscale) if n_bag==None and n_boost==None: PLS1model.fit(X,Y) T=PLS1model.x_scores_ #There's probably a more efficient way to calculate T2... 
for k in range(len(X[:,0])): T2[k,j-1]=numpy.dot(T[k,:],numpy.dot(numpy.linalg.inv(numpy.dot(T.transpose(),T)),T[k,:])) E=X-numpy.dot(PLS1model.x_scores_,PLS1model.x_loadings_.transpose()) Q_res[:,j-1]=numpy.dot(E,E.transpose()).diagonal() trainset_results[:,j-1]=numpy.squeeze(PLS1model.predict(X)+Y_mean) testset_results[:,j-1]=numpy.squeeze(PLS1model.predict(X_test)+Y_mean) results[:,j-1]=numpy.squeeze(PLS1model.predict(X_all)+Y_mean) beta[:,j-1]=numpy.squeeze(PLS1model.coefs) model_list.append([PLS1model]) if cal_dir != None: comps_copy=copy.copy(target_comps) cal_results[:,j-1]=numpy.squeeze(PLS1model.predict(cal_data_centered)+Y_mean) RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results) if n_bag!=None: PLS1bagged=ensemble.BaggingRegressor(PLS1model,n_estimators=n_bag,max_samples=max_samples,verbose=1) PLS1bagged.fit(X,Y) trainset_results[:,j-1]=numpy.squeeze(PLS1bagged.predict(X)+Y_mean) testset_results[:,j-1]=numpy.squeeze(PLS1bagged.predict(X_test)+Y_mean) results[:,j-1]=numpy.squeeze(PLS1bagged.predict(X_all)+Y_mean) beta[:,j-1]=None model_list.append([PLS1bagged]) if cal_dir != None: comps_copy=copy.copy(target_comps) cal_results[:,j-1]=numpy.squeeze(PLS1bagged.predict(cal_data_centered)+Y_mean) RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results) if n_boost!=None: PLS1boosted=ensemble.AdaBoostRegressor(PLS1model,n_estimators=n_boost) PLS1boosted.fit(X,Y) trainset_results[:,j-1]=numpy.squeeze(PLS1boosted.predict(X)+Y_mean) testset_results[:,j-1]=numpy.squeeze(PLS1boosted.predict(X_test)+Y_mean) 
results[:,j-1]=numpy.squeeze(PLS1boosted.predict(X_all)+Y_mean) beta[:,j-1]=None model_list.append([PLS1boosted]) if cal_dir != None: comps_copy=copy.copy(target_comps) cal_results[:,j-1]=numpy.squeeze(PLS1boosted.predict(cal_data_centered)+Y_mean) RMSEP_KGAMEDS[j-1],RMSEP_MACUSANITE[j-1],RMSEP_NAU2HIS[j-1],RMSEP_NAU2LOS[j-1],RMSEP_NAU2MEDS[j-1],RMSEP_NORITE[j-1],RMSEP_PICRITE[j-1],RMSEP_SHERGOTTITE[j-1],RMSEP_cal_good[j-1]=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results) RMSEC[j-1]=numpy.sqrt(numpy.mean((trainset_results[:,j-1]-comps_train)**2.0)) RMSEP[j-1]=numpy.sqrt(numpy.mean((testset_results[:,j-1]-comps_test)**2.0)) with open(outpath+which_elem+'_'+plstype_string+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'.pkl','wb') as picklefile: pickle.dump(model_list,picklefile) #if cal_dir is specified, read cal target data and calculate RMSEs if cal_dir!=None: n_good_cal=numpy.sum(numpy.array([RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE])[:,0]!=0) print n_good_cal RMSEP_cal=(RMSEP_KGAMEDS+RMSEP_MACUSANITE+RMSEP_NAU2HIS+RMSEP_NAU2LOS+RMSEP_NAU2MEDS+RMSEP_NORITE+RMSEP_PICRITE+RMSEP_SHERGOTTITE)/n_good_cal RMSEP_single_cals=[RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE,RMSEP_cal] with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_caltargets_predict.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=['File','Target','Laser Energy','True_Comp'] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(targets)): row=[cal_filelist[i],targets[i],amps[i],target_comps[i]] row.extend(cal_results[i,:]) writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEP_caltargets.csv','wb') as writefile: 
writer=csv.writer(writefile,delimiter=',') writer.writerow(['NC','RMSEP Cal Targets (wt.%)']) for i in range(0,nc): writer.writerow([i+1,RMSEP_cal[i]]) ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot_cal.png',RMSEP_cals=RMSEP_single_cals) ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot_cal_good.png',RMSEP_good=RMSEP_cal_good) # plot RMSEs ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSE_plot.png') #Write output info to files with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_Q_res.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=["Sample","Spectrum","Fold","True Comp"] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(names_train)): row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]] row.extend(Q_res[i,:]) writer.writerow(row) with open(outpath+which_elem+'_'+str(mincomp)+'-'+str(maxcomp)+'_quartiles.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=[which_elem] writer.writerow(row) row=['Min',numpy.percentile(comps[:,compindex],0)] writer.writerow(row) row=['1st Quartile',numpy.percentile(comps[:,compindex],25)] writer.writerow(row) row=['Median',numpy.percentile(comps[:,compindex],50)] writer.writerow(row) row=['3rd Quartile',numpy.percentile(comps[:,compindex],75)] writer.writerow(row) row=['Max',numpy.percentile(comps[:,compindex],100)] writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_HotellingT2.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') 
row=["Sample","Spectrum","Fold","True Comp"] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(names_train)): row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]] row.extend(T2[i,:]) writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSECV.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') writer.writerow(['NC','RMSECV (wt.%)']) for i in range(0,nc): writer.writerow([i+1,RMSECV[i]]) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEC.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') writer.writerow(['NC','RMSEC (wt.%)']) for i in range(0,nc): writer.writerow([i+1,RMSEC[i]]) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_RMSEP.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') writer.writerow(['NC','RMSEP (wt.%)']) for i in range(0,nc): writer.writerow([i+1,RMSEP[i]]) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_cv_predict.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=['Sample','Spectrum','Fold','True_Comp'] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(names_train)): row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]] row.extend(train_predict_cv[i,:]) writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_train_predict.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=['Sample','Spectrum','Fold','True_Comp'] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(names_train)): row=[names_train[i],spect_index_train[i],folds_train[i],comps_train[i]] row.extend(trainset_results[i,:]) 
writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_test_predict.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=['Sample','Spectrum','Fold','True_Comp'] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(names_test)): row=[names_test[i],spect_index_test[i],folds_test[i],comps_test[i]] row.extend(testset_results[i,:]) writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_all_predict.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=['Sample','Spectrum','Fold','True_Comp'] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(names)): row=[names[i],spect_index[i],folds[i],comps[i,compindex]] row.extend(results[i,:]) writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_beta_coeffs.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') row=['wvl'] row.extend(range(1,nc+1)) writer.writerow(row) for i in range(0,len(wvl)): row=[wvl[i]] row.extend(beta[i,:]) writer.writerow(row) if skscale==False: with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_meancenters.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') writer.writerow([which_elem+' mean',Y_mean]) for i in range(0,len(wvl)): row=[wvl[i],X_mean[i]] writer.writerow(row) with open(outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+str(mincomp)+'-'+str(maxcomp)+'_inputinfo.csv','wb') as writefile: writer=csv.writer(writefile,delimiter=',') writer.writerow(['Spectral database =',dbfile]) writer.writerow(['Spectra Kept =',keepfile]) writer.writerow(['Spectra Removed =',which_removed]) writer.writerow(['Fold Definition =',foldfile]) writer.writerow(['Test Fold 
=',maskfile]) writer.writerow(['Mask File =',maskfile]) writer.writerow(['Algorithm =',plstype_string]) writer.writerow(['# of components =',nc]) writer.writerow(['Normalization Type =',normtype]) writer.writerow(['Composition Min. =',mincomp]) writer.writerow(['Composition Max. =',maxcomp])
def get_r2(x_learn, x_valid, y_learn, y_valid, regressor='pls'):
    """
    Tune a regressor on the learning set and score it on the validation set.

    With regressor='pls', the number of PLS components (1, 2, ..., 9) is chosen
    by leave-one-out cross-validation on the learning set: for every candidate
    the held-out predictions of all LOO splits are collected and their MSE is
    computed; the candidate with the lowest MSE wins (first one on ties).
    This follows Cadet et al.:
        https://doi.org/10.1186/s12859-018-2407-8
        https://doi.org/10.1038/s41598-018-35033-y

    With 'pls_cv', 'rf', 'svr' or 'mlp', hyperparameters are tuned by a 5-fold
    GridSearchCV over a fixed grid instead.

    The selected model is then fitted on the full learning set and used to
    predict the validation set.

    Returns the tuple
        (r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params)
    where nrmse is the RMSE divided by the sample standard deviation (ddof=1)
    of y_valid, regressor is the lower-cased option string, and best_params is
    the dict of selected hyperparameters.

    Raises SystemError when the regressor option is not one of the above.
    """
    regressor = regressor.lower()

    # Hyperparameter grids of the GridSearchCV-tuned options.
    param_grids = {
        'pls_cv': {
            'n_components': list(np.arange(1, 10)),  # n_comp = 1, 2,..., 9
        },
        'rf': {
            # similar parameter grid as Xu et al., https://doi.org/10.1021/acs.jcim.0c00073
            'random_state': [42],  # fixed seed
            'n_estimators': [100, 250, 500, 1000],  # number of trees in the forest
            'max_features': ['auto', 'sqrt', 'log2'],  # features considered per split
        },
        'svr': {
            # similar parameter grid as Xu et al.
            'C': [2 ** exponent for exponent in range(0, 13, 2)],  # 2^0, 2^2, ..., 2^12
            'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001],
        },
        'mlp': {
            # feedforward network with a single hidden layer
            'hidden_layer_sizes': list(range(1, 12)),  # hidden layer of 1, 2, ..., 11 units
            'activation': ['relu'],  # rectified linear unit
            'solver': ['adam', 'lbfgs'],
            'learning_rate': ['constant'],  # rate given by 'learning_rate_init'
            'learning_rate_init': [0.001, 0.01, 0.1],
            'max_iter': [1000, 200],
            'random_state': [42],
        },
    }

    if regressor == 'pls':
        loo_mse_per_candidate = []
        for n_comp in range(1, 10):  # n_comp = 1, 2,..., 9
            pls = PLSRegression(n_components=n_comp)
            held_out_true = []
            held_out_pred = []
            for train_idx, test_idx in LeaveOneOut().split(x_learn):
                held_out_true.extend(y_learn[k] for k in test_idx)
                pls.fit(
                    [x_learn[j] for j in train_idx],
                    [y_learn[j] for j in train_idx]
                )
                held_out = [x_learn[k] for k in test_idx]
                held_out_pred.append(pls.predict(held_out)[0][0])
            loo_mse_per_candidate.append(
                mean_squared_error(held_out_true, held_out_pred)
            )
        # Candidates start at one component, hence the +1 on the position.
        best_n_components = np.argmin(np.array(loo_mse_per_candidate)) + 1
        best_params = {'n_components': best_n_components}
        regressor_ = PLSRegression(n_components=best_params.get('n_components'))
    elif regressor in param_grids:
        estimator_classes = {
            'pls_cv': PLSRegression,
            'rf': RandomForestRegressor,
            'svr': SVR,
            'mlp': MLPRegressor,
        }
        # iid is redundant in future scikit-learn versions
        regressor_ = GridSearchCV(
            estimator_classes[regressor](),
            param_grid=param_grids[regressor], iid=False, cv=5
        )
    else:
        raise SystemError("Did not find specified regression model as valid option. See '--help' for valid "
                          "regression model options.")

    regressor_.fit(x_learn, y_learn)  # fit model
    if regressor != 'pls':
        # grid search: take over the parameters it selected
        best_params = regressor_.best_params_

    # predict validation entries with the fitted model
    y_pred = [float(y_p) for y_p in regressor_.predict(x_valid)]

    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    nrmse = rmse / np.std(y_valid, ddof=1)

    # ranks for Spearman's rank correlation
    y_val_rank = np.array(y_valid).argsort().argsort()
    y_pred_rank = np.array(y_pred).argsort().argsort()

    with warnings.catch_warnings():
        # A constant array, e.g. [2, 2, 2, 2], has zero variance and triggers
        # a divide-by-zero RuntimeWarning inside corrcoef; silence it.
        warnings.simplefilter("ignore")
        pearson_r = np.corrcoef(y_valid, y_pred)[0][1]
        spearman_rho = np.corrcoef(y_val_rank, y_pred_rank)[0][1]

    return r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params
def plot_processing_results(figure_name, num_PCs, training_error=0,
                            train_conc=False, mixture_error=0,
                            mix_conc=False):
    """Compare PCA and PLS preprocessing variants over a sweep of 25 settings.

    For each value ``c`` of a log-spaced sweep (10**-3 .. 10**1) a training
    set and a mixture set are generated with ``get_training_data`` and
    ``get_mixture``. Seven models are fitted on the training set and scored
    on the mixture set with ``PCA.get_r2`` / ``PCA.get_rmse``:

    * PCA unprocessed, centered, centered + scaled, scaled, and scaled with
      ``scale_type='std'``;
    * PLS unscaled and PLS scaled.

    Three figures are produced: the RMSE averaged over axis 1 (saved as
    ``~/Downloads/<figure_name>.jpg``), and the R^2 and RMSE of column
    index 2 (both shown interactively).

    Parameters
    ----------
    figure_name : str
        Base name (without extension) of the saved figure.
    num_PCs : int
        Number of components passed to every PCA and PLS model.
    training_error, train_conc
        Forwarded to ``get_training_data`` (second positional argument and
        ``conc_error``).
    mixture_error, mix_conc
        Forwarded to ``get_mixture`` (second positional argument and
        ``conc_error``).

    Notes
    -----
    The legend labels are raw strings: ``\\s`` is not a valid escape
    sequence, so the non-raw spelling triggers a warning on current Python
    versions. The label text itself is unchanged.
    """
    # (result key, keyword arguments of the PCA constructor), in fit order.
    pca_variants = [
        ('raw', {'center': False, 'scale': False, 'add_constant': True}),
        ('centered', {'center': True, 'scale': False}),
        ('cscaled', {'center': True, 'scale': True}),
        ('scaled', {'center': False, 'scale': True, 'add_constant': True}),
        ('scaled_std', {'center': False, 'scale': True, 'scale_type': 'std',
                        'add_constant': True}),
    ]
    # (result key, value of the PLS ``scale`` flag), in fit order.
    pls_variants = [('pls', False), ('pls_scaled', True)]

    r2 = {}
    rmse = {}
    for key, _ in pca_variants + pls_variants:
        r2[key] = []
        rmse[key] = []

    crange = 10**np.linspace(-3, 1, num=25, endpoint=True)
    cratio = crange * 4**4 / 4  # x axis of every figure
    print(cratio)

    for c in crange:
        intensities, concentrations = get_training_data(
            c, training_error, conc_error=train_conc)
        mixture_int, mixture_conc = get_mixture(
            c, mixture_error, conc_error=mix_conc)

        for key, pca_kwargs in pca_variants:
            pca = PCA(intensities, concentrations, **pca_kwargs)
            pca.get_PCs_and_regressors(num_PCs)
            predicted = pca.predict(mixture_int)
            r2[key].append(PCA.get_r2(mixture_conc, predicted))
            rmse[key].append(PCA.get_rmse(mixture_conc, predicted))

        for key, scale in pls_variants:
            pls = PLSRegression(n_components=num_PCs, scale=scale,
                                max_iter=500, tol=1e-06, copy=True,
                                center=True)
            pls.fit(intensities, concentrations)
            predicted = pls.predict(mixture_int)
            r2[key].append(PCA.get_r2(mixture_conc, predicted))
            rmse[key].append(PCA.get_rmse(mixture_conc, predicted))

    for key in list(r2):
        r2[key] = np.array(r2[key])
        rmse[key] = np.array(rmse[key])

    pca_labels = [
        'PCA unprocessed',
        'PCA centered',
        r'PCA $\sigma^{2}$ scaled',
        r'PCA centered and $\sigma^{2}$ scaled',
        r'PCA $\sigma$ scalled',
    ]
    pls_scaled_label = r'PLS centered and $\sigma$ scaled'
    # The RMSE figures carry a trailing blank in the PLS label; kept as is.
    rmse_labels = pca_labels + ['PLS centered ', pls_scaled_label]
    r2_labels = pca_labels + ['PLS centered', pls_scaled_label]

    def draw_curves(scores, select, pls_color_as_keyword):
        """Plot the seven curves in legend order; ``select`` reduces each
        score array to one value per sweep step."""
        plt.plot(cratio, select(scores['raw']), 'blue')
        plt.plot(cratio, select(scores['centered']), 'og')
        plt.plot(cratio, select(scores['scaled']), ':r')
        plt.plot(cratio, select(scores['cscaled']), ':g')
        plt.plot(cratio, select(scores['scaled_std']), '^r')
        if pls_color_as_keyword:
            plt.plot(cratio, select(scores['pls']), color='orange')
        else:
            plt.plot(cratio, select(scores['pls']), 'orange')
        plt.plot(cratio, select(scores['pls_scaled']), marker='s',
                 color='orange', linewidth=0)

    def mean_over_columns(values):
        return np.mean(values, axis=1)

    def third_column(values):
        return values[:, 2]

    # Figure 1: average RMSE, written to the user's Downloads folder.
    plt.figure(1, figsize=(3.5, 3.5))
    Figure_folder = os.path.join(os.path.expanduser("~"), 'Downloads')
    draw_curves(rmse, mean_over_columns, True)
    plt.legend(rmse_labels, loc=4)
    plt.xlabel('Max(squared elements)/max(linear elements)')
    plt.xscale('log')
    plt.yscale('log')
    plt.ylim([6 * 10**-4, 4])
    figure_file = os.path.join(Figure_folder, figure_name + '.jpg')
    plt.savefig(figure_file, format='jpg')
    plt.close()

    # Figure 2: R^2 of column index 2.
    plt.figure(2)
    draw_curves(r2, third_column, False)
    plt.legend(r2_labels)
    plt.xscale('log')
    plt.xlabel('Max(squared elements)/max(linear elements)')
    plt.ylabel('Average R$^{2}$')
    plt.show()

    # Figure 3: RMSE of column index 2.
    plt.figure(3)
    draw_curves(rmse, third_column, True)
    plt.legend(rmse_labels)
    plt.xlabel('Max(squared elements)/max(linear elements)')
    plt.ylabel('Average RMSE')
    plt.xscale('log')
    plt.show()
# Scree plot: cumulative variance explained against the number of latent
# vectors, with one tick per latent vector up to rank(X).
x_axis = np.arange(1, np.linalg.matrix_rank(X) + 1)
plt.scatter(x_axis, cummulative_variance_explained)
plt.plot(x_axis, cummulative_variance_explained)
plt.title("Scree Plot")
plt.xlabel("Number of latent vectors used")
plt.ylabel("Percentage of variance explained")
plt.xticks(x_axis, x_axis)
plt.yticks()
plt.show()

# compare to sklearn package results to verify accuracy
import numpy as np
np.set_printoptions(threshold=np.inf)  # print arrays in full
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

X = [[1, 5, 10], [2, 4, 8], [3, 4, 8], [4, 5, 10]]
y = [41, 49, 69, 65]

X = StandardScaler().fit_transform(X)  # population stdev
# StandardScaler only accepts 2-D input: scale y as a single column, then
# flatten it again so y stays one-dimensional for the regression below.
y = StandardScaler().fit_transform(
    np.asarray(y, dtype=float).reshape(-1, 1)).ravel()  # population stdev

pls1 = PLSRegression(n_components=2)
scores = pls1.fit_transform(X, y)
T = pls1.x_scores_
W = pls1.x_weights_
P = pls1.y_loadings_
y_pred = pls1.predict(X)
import pandas as pd
#from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import scipy
import scipy.io as sio
from sklearn.cross_decomposition import PLSRegression

# Column 0 of each matrix is the target, columns 1..619 are the features.
train = sio.loadmat('Train.mat')
test = sio.loadmat('Dev.mat')
X_train = train["final1"][:, 1:620]
y_train = train["final1"][:, 0]
X_test = test["final"][:, 1:620]
y_test = test["final"][:, 0]

pls2 = PLSRegression(n_components=11)
pls2.fit(X_train, y_train)
XX = [[1, 2, 2], [2, 3, 4]]
y_pred = pls2.predict(X_test)

# Bucket every prediction into one of three classes:
#   below 7 -> 1, below 20 -> 2, otherwise -> 3.
# The number of rows comes from the predictions themselves instead of a
# hard-coded 54, so a Dev set of any size is labelled completely.
res = [None] * len(y_pred)
for i, predicted_value in enumerate(y_pred):
    if predicted_value < 7:
        res[i] = 1
    elif predicted_value < 20:
        res[i] = 2
    else:
        res[i] = 3
def prediction(x_calib, y_calib, x_valid, y_valid, plot_components=False):
    """Select a PLS component count on the validation data and report the fit.

    PLSRegression models with 1 to 29 components are fitted on the
    calibration data and scored by mean squared error on the validation
    data, with a progress percentage written to stdout. The component count
    with the lowest MSE is then refitted, the validation prediction is
    timed, and R2, MSE and an accuracy figure are printed. The accuracy is
    the share of samples whose prediction and reference decode to the same
    label through a LabelBinarizer fitted on ``y_valid``.

    Parameters
    ----------
    x_calib, y_calib
        Calibration features and targets used for fitting.
    x_valid, y_valid
        Validation features and targets used for selection and scoring.
    plot_components : bool
        When exactly ``True``, plot MSE against component count and wait
        for the user to press enter.

    Returns
    -------
    None
        All results are printed.
    """
    mse = []
    component = np.arange(1, 30)
    for i in component:
        pls = PLSRegression(n_components=i)
        pls.fit(x_calib, y_calib)
        y_pred = pls.predict(x_valid)
        mse.append(mean_squared_error(y_valid, y_pred))

        # Progress indicator; reaches 100% on the last (29th) candidate.
        comp = 100 * (i + 1) / 30
        stdout.write("\r%d%% completed" % comp)
        stdout.flush()
    stdout.write("\n")

    # Position of the lowest MSE; candidates start at 1, hence the +1 below.
    msemin = np.argmin(mse)
    print("Suggested number of components: ", msemin + 1)
    stdout.write("\n")

    if plot_components is True:
        with plt.style.context(('ggplot')):
            plt.plot(component, np.array(mse), '-v', color='blue', mfc='blue')
            plt.plot(component[msemin], np.array(mse)[msemin], 'P', ms=10,
                     mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(xmin=-1)
        plt.show(block=False)
        # raw_input only exists on Python 2; fall back to input on Python 3
        # so the pause does not fail with a NameError.
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        _ = read_line("Press [enter] to continue.")

    # Refit with the selected number of components and time the prediction.
    pls = PLSRegression(n_components=msemin + 1)
    pls.fit(x_calib, y_calib)
    startTime = time.time()
    y_pred = pls.predict(x_valid)
    endTime = time.time()
    print('Time elapsed: %s seconds' % (endTime - startTime))

    lb = preprocessing.LabelBinarizer()
    score_p = r2_score(y_valid, y_pred)
    mse_p = mean_squared_error(y_valid, y_pred)
    lb.fit_transform(y_valid)
    print('R2: %5.3f' % score_p)
    print('MSE: %5.3f' % mse_p)

    # Decode predictions and references back to labels and count matches.
    pr = lb.inverse_transform(y_pred)
    ac = lb.inverse_transform(y_valid)
    matches = 0
    for predicted_label, actual_label in zip(pr, ac):
        if np.array_equal(predicted_label, actual_label):
            matches += 1
    print('Accuracy: ' + str((float(matches) / float(len(pr))) * 100) + '%')
len(db_comps_trainset[0, :])]) test_result = numpy.zeros( [nc, len(db_comps_testset[:]), len(db_comps_testset[0, :])]) RMSECV = numpy.zeros( [nc, len(db_ox_list), len(numpy.unique(db_folds_trainset))]) #loop through each number of components jj = numpy.array([0, 2, 3, 4, 5, 6]) for k in range(1, nc + 1): #calculate full training model for the current number of components print k PLS1 = PLSRegression(n_components=k) x_train = numpy.transpose(db_spectra_trainset) x_train_mean = numpy.mean(x_train, axis=0) x_train_meancenter = x_train - numpy.tile(x_train_mean, (x_train.shape[0], 1)) y_train = numpy.transpose(db_comps_trainset[i, :]) y_train_mean = numpy.mean(y_train) y_train_meancenter = y_train - y_train_mean x_test = numpy.transpose(db_spectra_testset) x_test_meancenter = x_test - numpy.tile(x_train_mean, (x_test.shape[0], 1)) y_test = numpy.transpose(db_comps_testset[i, :])
def make_plots(m, data, colors, names, groundtruth=None, waves=None,
               sample_size=10, ux=0, remove_mean=False, log_x=False,
               ylim=None, res_out='', title=None):
    """Write diagnostic figures for model ``m`` and a PLS baseline.

    Figures are saved as PNG files under ``res_out``:

    * ``comp_train.png`` / ``comp_valid.png``: true compositions against
      the predictions of ``m`` and of a PLS regression, for ``sample_size``
      randomly drawn samples; the legend shows the ``KL`` score of each.
    * ``genspectra_train.png`` / ``genspectra_valid.png``: measured spectra
      against spectra generated by ``m`` from the compositions.
    * ``endmembers_dist_<name>.png`` (only if ``m.variational``) and
      ``endmembers_mean_<name>.png``: spectra generated for the rows of
      ``y_corners`` (one row per column of ``data['y']`` plus a zero row).
    * ``endmembers_mean_with_groundtruth_<name>.png`` (only if
      ``groundtruth`` is given), with the ``L2`` score in the title.
    * ``recon_train.png`` / ``recon_valid.png``: spectra against their
      reconstruction by ``m``.
    * ``nuisance_train_<i>.png`` / ``nuisance_valid_<i>.png`` (only if
      ``m.model_type`` is 1 or 2): the columns of ``m.getZ2(...)``.

    Parameters
    ----------
    m
        Model exposing ``predict``, ``generate``, ``model_type`` and
        ``variational`` (and ``getZ2`` for model types 1 and 2).
    data : mapping
        Must provide the keys ``'X'``, ``'y'``, ``'X_valid'``,
        ``'y_valid'``, ``'X_'`` and ``'_y'``.
    colors, names
        Per-endmember plot colours and labels; ``names`` also labels the
        x ticks of the composition plots.
    groundtruth
        Optional reference endmember spectra; only the first
        ``len(waves)`` channels of each row are used.
    waves
        x values for the spectra; defaults to ``arange(n_channels)``.
    sample_size : int
        Number of samples drawn (with replacement) for each plot.
    ux, remove_mean
        ``ux`` is added to every spectrum before plotting when
        ``remove_mean`` is true.
    log_x : bool
        When true, spectra are passed through ``np.exp`` before plotting.
    ylim
        Optional y limits forced on the spectra plots.
    res_out : str
        Output directory prefix.
    title
        Accepted for compatibility; not used by this function.
    """
    inds_sup_train = np.random.choice(data['X'].shape[0], size=sample_size)
    inds_sup_valid = np.random.choice(data['X_valid'].shape[0],
                                      size=sample_size)
    inds_train_x = np.random.choice(data['X_'].shape[0], size=sample_size)
    # Not used below; kept so the random stream and the check that
    # data['_y'] exists stay the same.
    inds_train_y = np.random.choice(data['_y'].shape[0], size=sample_size)

    # Append the remainder (1 - row sum) as an extra composition column.
    y = np.hstack([data['y'], 1 - data['y'].sum(axis=1, keepdims=True)])
    y_valid = np.hstack(
        [data['y_valid'], 1 - data['y_valid'].sum(axis=1, keepdims=True)])
    y_corners = np.vstack((np.eye(data['y'].shape[1]),
                           np.zeros(data['y'].shape[1]))).astype('float32')

    if waves is None:
        waves = np.arange(data['X'].shape[1])
    _ux = ux if remove_mean else 0
    if log_x:
        f = lambda x: np.exp(x)
    else:
        f = lambda x: x
    force_ylim = ylim is not None

    # PLS baseline, spectra -> composition; predictions are renormalised so
    # every row sums to one.
    pls_XY = PLSRegression(n_components=8, scale=False)
    pls_XY.fit(data['X'], y)
    pred_train_pls = pls_XY.predict(data['X'])
    pred_train_pls = (pred_train_pls.T / np.sum(pred_train_pls, axis=1)).T
    pred_valid_pls = pls_XY.predict(data['X_valid'])
    pred_valid_pls = (pred_valid_pls.T / np.sum(pred_valid_pls, axis=1)).T
    score_pred_train_pls = KL(pred_train_pls, y)
    score_pred_valid_pls = KL(pred_valid_pls, y_valid)

    # PLS baseline, composition -> spectra. These scores are computed but
    # not shown in any figure.
    pls_YX = PLSRegression(n_components=min(8, y.shape[1]), scale=False)
    pls_YX.fit(y, data['X'])
    gen_train_pls = pls_YX.predict(y)
    gen_valid_pls = pls_YX.predict(y_valid)
    score_gen_train_pls = L2(gen_train_pls, data['X'])
    score_gen_valid_pls = L2(gen_valid_pls, data['X_valid'])

    # Model predictions, with the remainder column appended as for y.
    pred_train = m.predict(x=data['X'], deterministic=True)
    pred_train = np.hstack(
        [pred_train, 1 - pred_train.sum(axis=1, keepdims=True)])
    score_pred_train = KL(pred_train, y)
    pred_valid = m.predict(x=data['X_valid'], deterministic=True)
    pred_valid = np.hstack(
        [pred_valid, 1 - pred_valid.sum(axis=1, keepdims=True)])
    score_pred_valid = KL(pred_valid, y_valid)

    if m.model_type in [1, 2]:
        # These model types take a second latent z2; generate with z2 fixed
        # to its column-wise mean over the respective data set.
        z2_train = m.getZ2(x=data['X'], y=data['y'], deterministic=True)
        z2_valid = m.getZ2(x=data['X_valid'], y=data['y_valid'],
                           deterministic=True)
        z2_train_mean = z2_train.mean(axis=0)
        z2_valid_mean = z2_valid.mean(axis=0)
        z2_gen_train = z2_train_mean * np.ones_like(z2_train).astype('float32')
        z2_gen_valid = z2_valid_mean * np.ones_like(z2_valid).astype('float32')
        z2_gen_endmembers = z2_train_mean * np.ones(
            (y_corners.shape[0], z2_train.shape[1])).astype('float32')
        gen_train = f(_ux + m.generate(y=data['y'][inds_sup_train],
                                       z2=z2_gen_train[inds_sup_train],
                                       deterministic=True))
        gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],
                                       z2=z2_gen_valid[inds_sup_valid],
                                       deterministic=True))
        endmembers = f(_ux + m.generate(y=y_corners, z2=z2_gen_endmembers,
                                        deterministic=True))
        if m.variational:
            # sample_size stochastic draws per endmember
            endmembers_dists = []
            for idx_c, c in enumerate(y_corners):
                endmembers_dist = [
                    f(_ux + m.generate(
                        y=np.atleast_2d(c),
                        z2=z2_gen_endmembers[idx_c:idx_c + 1],
                        deterministic=False)).squeeze()
                    for _ in range(sample_size)
                ]
                endmembers_dists += [np.asarray(endmembers_dist)]
    else:
        gen_train = f(_ux + m.generate(y=data['y'][inds_sup_train],
                                       deterministic=True))
        gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],
                                       deterministic=True))
        endmembers = f(_ux + m.generate(y=y_corners, deterministic=True))
        if m.variational:
            # sample_size stochastic draws per endmember
            endmembers_dists = []
            for idx_c, c in enumerate(y_corners):
                endmembers_dist = [
                    f(_ux + m.generate(y=np.atleast_2d(c),
                                       deterministic=False)).squeeze()
                    for _ in range(sample_size)
                ]
                endmembers_dists += [np.asarray(endmembers_dist)]

    recon_train = f(_ux + m.generate(x=data['X_'][inds_train_x],
                                     deterministic=True))
    recon_sup_valid = f(_ux + m.generate(x=data['X_valid'][inds_sup_valid],
                                         deterministic=True))

    fs = 24       # title / axis label font size
    fs_tick = 18  # tick label font size
    p = 100       # fractions are plotted as percentages

    def _plot_composition(truth, model_pred, pls_pred, inds, model_score,
                          pls_score, heading, filename):
        """Plot true vs. predicted compositions of the sampled rows."""
        # The first sampled row is drawn separately so that each series
        # gets exactly one legend entry.
        plt.plot(p * truth[inds][0], 'k', lw=2, label='Ground Truth')
        ssdgm_label = 'SSDGM ({:.3f})'.format(model_score)
        plt.plot(p * model_pred[inds][0], 'r-.', lw=2, label=ssdgm_label)
        pls_label = 'PLS ({:.3f})'.format(pls_score)
        plt.plot(p * pls_pred[inds][0], 'b-.', lw=2, label=pls_label)
        plt.plot(p * truth[inds].T, 'k', lw=2)
        plt.plot(p * model_pred[inds].T, 'r-.', lw=2)
        plt.plot(p * pls_pred[inds].T, 'b-.', lw=2)
        plt.title(heading, fontsize=fs)
        plt.ylabel('Composition (%)', fontsize=fs)
        ax = plt.gca()
        ax.set_ylim((0, 1 * p))
        ax.set_xticks(np.arange(y.shape[1]))
        ax.set_xticklabels(names, fontsize=fs)
        # top must be a bool: the string 'off' is truthy and would switch
        # the top ticks on.
        ax.tick_params(axis='x', direction='out', top=False, length=10,
                       labelsize=fs_tick)
        lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # bbox_extra_artists keeps the outside legend inside the tight
        # bounding box.
        plt.savefig(res_out + filename, bbox_extra_artists=[lgd],
                    bbox_inches='tight')
        plt.close()

    def _plot_spectra(reference, produced, heading, filename):
        """Plot reference spectra (black) against produced ones (red)."""
        plt.plot(waves, f(_ux + reference).T, 'k')
        plt.plot(waves, produced.T, 'r-.')
        plt.title(heading, fontsize=fs)
        plt.xlabel('Channels', fontsize=fs)
        plt.ylabel('Intensities', fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs_tick)
        if force_ylim:
            plt.gca().set_ylim(ylim)
        plt.savefig(res_out + filename)
        plt.close()

    def _finish_endmember_figure(heading, filename):
        """Label, add the outside legend, save and close the figure."""
        plt.title(heading, fontsize=fs)
        plt.xlabel('Channels', fontsize=fs)
        plt.ylabel('Intensities', fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs_tick)
        lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        if force_ylim:
            plt.gca().set_ylim(ylim)
        plt.savefig(res_out + filename, bbox_extra_artists=[lgd],
                    bbox_inches='tight')
        plt.close()

    _plot_composition(y, pred_train, pred_train_pls, inds_sup_train,
                      score_pred_train, score_pred_train_pls,
                      'Predicting Composition - Training Error',
                      '/comp_train.png')
    _plot_composition(y_valid, pred_valid, pred_valid_pls, inds_sup_valid,
                      score_pred_valid, score_pred_valid_pls,
                      'Predicting Composition - Validation Error',
                      '/comp_valid.png')

    _plot_spectra(data['X'][inds_sup_train], gen_train,
                  'Generating Spectra - Training Error',
                  '/genspectra_train.png')
    _plot_spectra(data['X_valid'][inds_sup_valid], gen_valid,
                  'Generating Spectra - Validation Error',
                  '/genspectra_valid.png')

    if m.variational:
        for endmember, dist, color, name in zip(endmembers, endmembers_dists,
                                                colors, names):
            plt.plot(waves, endmember, color=color, lw=2, label=name)
            plt.plot(waves, dist.T, '-.', color=color, lw=1)
            _finish_endmember_figure(
                'Generating ' + name + ' with Distribution',
                '/endmembers_dist_' + name + '.png')

    for endmember, color, name in zip(endmembers, colors, names):
        plt.plot(waves, endmember, color=color, lw=2, label=name)
        _finish_endmember_figure('Generating ' + name,
                                 '/endmembers_mean_' + name + '.png')

    if groundtruth is not None:
        # Overall score; computed but not shown in any figure.
        score_gen_endmembers = L2(endmembers, groundtruth[:, :len(waves)])
        for endmember, gt, color, name in zip(endmembers, groundtruth, colors,
                                              names):
            plt.plot(waves, endmember, color=color, lw=2, label=name)
            plt.plot(waves, gt[:len(waves)], color=color, lw=6, alpha=0.4)
            score_gen_endmember = L2(endmember, gt[:len(waves)])
            _finish_endmember_figure(
                'Generating ' + name +
                ' with Ground Truth ({:.3f})'.format(score_gen_endmember),
                '/endmembers_mean_with_groundtruth_' + name + '.png')

    _plot_spectra(data['X_'][inds_train_x], recon_train,
                  'Reconstructing Spectra - Training Error',
                  '/recon_train.png')
    _plot_spectra(data['X_valid'][inds_sup_valid], recon_sup_valid,
                  'Reconstructing Spectra - Validation Error',
                  '/recon_valid.png')

    if m.model_type in [1, 2]:
        # need to use vertical lines to denote edges of datasets
        # write dataset i in middle of range on xlabel
        for i in range(z2_train.shape[1]):
            plt.plot(z2_train[:, i], 'r-.')
            plt.title('Nuisance Variable ' + str(i) + ' - Training',
                      fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            plt.savefig(res_out + '/nuisance_train_' + str(i) + '.png')
            plt.close()

            plt.plot(z2_valid[:, i], 'r-.')
            ax = plt.gca()
            # Remember the data-driven limits so the marker lines below do
            # not rescale the axis (kept in its own name so the ylim
            # argument is not overwritten).
            data_ylim = ax.get_ylim()
            # should make this general if possible
            plt.plot([1866, 1866], [-5, 5], 'k--')
            plt.plot([1866 + 1742, 1866 + 1742], [-5, 5], 'k--')
            ax.set_ylim(data_ylim)
            plt.title('Nuisance Variable ' + str(i) + ' - Validation',
                      fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            plt.savefig(res_out + '/nuisance_valid_' + str(i) + '.png')
            plt.close()
def save_model(path, aaindex_r2_list, learning_set, validation_set, threshold=5, regressor='pls',
               no_fft=False, train_on_all=False):
    """
    Save the best ``threshold`` models as pickle files and report their CV performance.

    For each of the first ``threshold`` entries of ``aaindex_r2_list`` the
    regressor is rebuilt from the stored hyperparameters, evaluated by 5-fold
    cross-validation on all data (learning + validation set), plotted, refit
    and dumped with ``pickle.dump``.

    Parameters
    ----------
    path : str
        Base directory; pickles are written to ``<path>/Pickles/<index name>``.
    aaindex_r2_list : sequence
        Ranked entries; element 0 of an entry is the encoding index file name
        (its last four characters are stripped for output names) and element 7
        is the dict of regressor hyperparameters.
    learning_set, validation_set
        Passed unchanged to ``XY`` together with ``full_path(idx)``.
    threshold : int
        Maximum number of entries to process.
    regressor : str
        One of 'pls', 'pls_cv', 'rf', 'svr', 'mlp' (case-insensitive).
    no_fft : bool
        If True, the third value returned by ``XY.get_x_and_y()`` is used as
        feature matrix instead of the first one.
    train_on_all : bool
        If True, the stored model is fit on learning + validation data,
        otherwise on the learning set only.

    Returns
    -------
    tuple
        Always the empty tuple.

    Raises
    ------
    SystemError
        If ``regressor`` is not one of the supported options.

    Side effects
    ------------
    Creates 'CV_performance' and 'Pickles' in the current working directory,
    (re)writes 'CV_performance/_CV_Results.txt' and stores one scatter plot
    per processed model inside 'CV_performance'.
    """
    regressor = regressor.lower()
    results_file = 'CV_performance/_CV_Results.txt'

    os.makedirs('CV_performance', exist_ok=True)
    os.makedirs('Pickles', exist_ok=True)

    # Mode 'w' truncates a results file left over from a previous run.
    with open(results_file, 'w') as f:
        f.write('5-fold cross-validated performance of top models for validation set across all data.\n\n')
        if no_fft:
            f.write("No FFT used in this model construction, performance represents"
                    " model accuracies on raw encoded sequence data.\n\n")

    for t in range(threshold):
        # Only the list lookup may end the loop early (fewer than `threshold`
        # entries available); IndexErrors raised by the actual work below are
        # no longer swallowed silently.
        try:
            idx = aaindex_r2_list[t][0]
            parameter = aaindex_r2_list[t][7]
        except IndexError:
            break

        # Estimating the CV performance of the n_component-fitted model on all data
        xy_learn = XY(full_path(idx), learning_set)
        xy_test = XY(full_path(idx), validation_set)
        if no_fft is False:
            x_test, y_test, _ = xy_test.get_x_and_y()
            x_learn, y_learn, _ = xy_learn.get_x_and_y()
        else:
            _, y_test, x_test = xy_test.get_x_and_y()
            _, y_learn, x_learn = xy_learn.get_x_and_y()

        x = np.concatenate([x_learn, x_test])
        y = np.concatenate([y_learn, y_test])

        regressor_ = _build_regressor(regressor, parameter)

        # perform 5-fold cross-validation on all data (on X and Y)
        n_samples = 5
        y_test_total, y_predicted_total = cross_validation(x, y, regressor_, n_samples)

        r_squared = r2_score(y_test_total, y_predicted_total)
        rmse = np.sqrt(mean_squared_error(y_test_total, y_predicted_total))
        stddev = np.std(y_test_total, ddof=1)
        nrmse = rmse / stddev
        pearson_r = np.corrcoef(y_test_total, y_predicted_total)[0][1]
        # ranks for Spearman correlation
        y_test_total_rank = np.array(y_test_total).argsort().argsort()
        y_predicted_total_rank = np.array(y_predicted_total).argsort().argsort()
        spearman_rho = np.corrcoef(y_test_total_rank, y_predicted_total_rank)[0][1]

        with open(results_file, 'a') as f:
            f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format(
                regressor.upper(), parameter, idx[:-4]))
            f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};'
                    ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho))

        _, ax = plt.subplots()
        ax.scatter(y_test_total, y_predicted_total, marker='o', s=20, linewidths=0.5, edgecolor='black')
        ax.plot([min(y_test_total) - 1, max(y_test_total) + 1],
                [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=2)
        ax.legend([
            '$R^2$ = {}\nRMSE = {}\nNRMSE = {}\nPearson\'s $r$ = {}\nSpearman\'s '.format(
                round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3))
            + r'$\rho$ = {}'.format(str(round(spearman_rho, 3)))
        ])
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.savefig('CV_performance/' + idx[:-4] + '_' + str(n_samples) + '-fold-CV.png', dpi=250)
        plt.close('all')

        if train_on_all:
            # fit on all available data (learning + validation set; FFT or noFFT is defined already above)
            regressor_.fit(x, y)
        else:
            # fit (only) on full learning set (FFT or noFFT is defined already above)
            regressor_.fit(x_learn, y_learn)

        # The pickle goes below `path`, which need not be the working directory
        # in which 'Pickles' was created above, so make sure the target exists.
        os.makedirs(os.path.join(path, 'Pickles'), exist_ok=True)
        with open(os.path.join(path, 'Pickles/' + idx[:-4]), 'wb') as pickle_file:
            pickle.dump(regressor_, pickle_file)

    return ()


def _build_regressor(regressor, parameter):
    """Instantiate the regressor named by `regressor` from the hyperparameter dict `parameter`."""
    if regressor == 'pls' or regressor == 'pls_cv':
        # n_components according to lowest MSE for validation set
        return PLSRegression(n_components=parameter.get('n_components'))
    if regressor == 'rf':
        return RandomForestRegressor(
            random_state=parameter.get('random_state'),
            n_estimators=parameter.get('n_estimators'),
            max_features=parameter.get('max_features')
        )
    if regressor == 'svr':
        return SVR(C=parameter.get('C'), gamma=parameter.get('gamma'))
    if regressor == 'mlp':
        return MLPRegressor(
            hidden_layer_sizes=parameter.get('hidden_layer_sizes'),
            activation=parameter.get('activation'),
            solver=parameter.get('solver'),
            learning_rate=parameter.get('learning_rate'),
            learning_rate_init=parameter.get('learning_rate_init'),
            max_iter=parameter.get('max_iter'),
            random_state=parameter.get('random_state')
        )
    raise SystemError("Did not find specified regression model as valid option. "
                      "See '--help' for valid regression model options.")
def prediction(X_calib, Y_calib, X_valid, Y_valid, plot_components=False):
    """Pick the PLS component count with the lowest validation MSE and report the fit.

    PLS models with 1 to 39 components are fit on the calibration data and
    scored on the validation data. The model with the smallest MSE is refit,
    its figures of merit (R2, MSE, SEP, RPD, bias) are printed, and a
    predicted-vs-measured plot is shown.

    Parameters
    ----------
    X_calib, Y_calib : calibration predictors and response.
    X_valid, Y_valid : validation predictors and response.
    plot_components : bool
        When exactly ``True``, also plot the MSE against the component count.
    """
    candidates = np.arange(1, 40)
    errors = []
    for n_comp in candidates:
        trial = PLSRegression(n_components=n_comp)
        trial.fit(X_calib, Y_calib)
        errors.append(mean_squared_error(Y_valid, trial.predict(X_valid)))

        # Trick to update status on the same line
        percent = 100 * (n_comp + 1) / 40
        stdout.write("\r%d%% completed" % percent)
        stdout.flush()
    stdout.write("\n")

    # Position of the minimum MSE (0-based, hence the +1 when reporting)
    best = np.argmin(errors)
    print("Suggested number of components: ", best + 1)
    stdout.write("\n")

    if plot_components is True:
        with plt.style.context('ggplot'):
            plt.plot(candidates, np.array(errors), '-v', color='blue', mfc='blue')
            plt.plot(candidates[best], np.array(errors)[best], 'P', ms=10, mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(xmin=-1)
        plt.show()

    # Refit with the suggested number of components
    final_model = PLSRegression(n_components=best + 1)
    final_model.fit(X_calib, Y_calib)
    Y_pred = final_model.predict(X_valid)

    # Figures of merit
    score_p = r2_score(Y_valid, Y_pred)
    mse_p = mean_squared_error(Y_valid, Y_pred)
    residuals = Y_pred[:, 0] - Y_valid
    sep = np.std(residuals)
    rpd = np.std(Y_valid) / sep
    bias = np.mean(residuals)

    print('R2: %5.3f' % score_p)
    print('MSE: %5.3f' % mse_p)
    print('SEP: %5.3f' % sep)
    print('RPD: %5.3f' % rpd)
    print('Bias: %5.3f' % bias)

    # Regression plot annotated with the figures of merit
    rangey = max(Y_valid) - min(Y_valid)
    rangex = max(Y_pred) - min(Y_pred)
    z = np.polyfit(Y_valid, Y_pred, 1)

    annotations = (
        (0.1, 'R$^{2}=$ %5.3f' % score_p),
        (0.15, 'MSE: %5.3f' % mse_p),
        (0.2, 'SEP: %5.3f' % sep),
        (0.25, 'RPD: %5.3f' % rpd),
        (0.3, 'Bias: %5.3f' % bias),
    )
    with plt.style.context('ggplot'):
        fig, ax = plt.subplots(figsize=(9, 5))
        ax.scatter(Y_pred, Y_valid, c='red', edgecolors='k')
        ax.plot(z[1] + z[0] * Y_valid, Y_valid, c='blue', linewidth=1)
        ax.plot(Y_valid, Y_valid, color='green', linewidth=1)
        plt.xlabel('Predicted')
        plt.ylabel('Measured')
        plt.title('Prediction')

        # Print the scores on the plot, stacked below the top-left corner
        for drop, label in annotations:
            plt.text(min(Y_pred) + 0.05 * rangex, max(Y_valid) - drop * rangey, label)
        plt.show()
def regressionVector(X, Y, numLV):
    """Fit a PLS model with `numLV` components and return column 0 of |coef_|.

    NOTE(review): the orientation of ``coef_`` is decided by the installed
    scikit-learn release -- confirm that column 0 is the intended slice.
    """
    pls_model = PLSRegression(n_components=numLV)
    pls_model.fit(X, Y)
    magnitudes = abs(pls_model.coef_)
    return magnitudes[:, 0]
# Accumulators; nothing in this fragment appends to them (presumably filled
# further down the loop body -- verify against the rest of the script).
r2 = []
nRMSE = []
maps = []

# Bootstrap: `Boots` iterations, each drawing N sample positions with replacement.
for i in tqdm(range(Boots)):
    # draw N indices with replacement (the bootstrap sample) ...
    idx = np.random.choice(N, N, replace=True)
    # ... and keep the indices that were never drawn as the hold-out set
    idx2 = list(set(range(N)) - set(idx))
    # select samples using idx (label-based `.loc` lookup on x)
    x_train = np.array(x.loc[idx, :])
    x_val = np.array(x.loc[idx2, :])
    y_train = np.array(y[idx])
    y_val = np.array(y[idx2])
    # PLSR model with the previously chosen number of components
    trainPLSR = PLSRegression(n_components=bestComp)
    trainPLSR.fit(x_train, y_train)
    # predict the hold-out samples
    predictt = trainPLSR.predict(x_val)
    predictt = unlist(predictt)
    # predict to map
    mapp = trainPLSR.predict(r_data)
    mapp = unlist(mapp)
    # backtransform maps to the 2-D shape of the first band of `img`
    mapp = mapp.reshape(img[:, :, 0].shape)
    # get accuracies: squared Pearson correlation of predicted vs. held-out y
    R2 = (np.corrcoef(predictt, y_val)[0, 1])**2
# autoscaling if do_autoscaling: autoscaled_Xtrain = (Xtrain - Xtrain.mean(axis=0)) / Xtrain.std(axis=0, ddof=1) autoscaled_ytrain = (ytrain - ytrain.mean()) / ytrain.std(ddof=1) autoscaled_Xtest = (Xtest - Xtrain.mean(axis=0)) / Xtrain.std(axis=0, ddof=1) else: autoscaled_Xtrain = Xtrain.copy() autoscaled_ytrain = ytrain.copy() autoscaled_Xtest = Xtest.copy() if regression_method_flag == 1: # Ordinary Least Squares regression_model = LinearRegression() elif regression_method_flag == 2: # Partial Least Squares with constant component regression_model = PLSRegression(n_components=pls_component_number) elif regression_method_flag == 3: # Partial Least Squares pls_components = np.arange( 1, min( np.linalg.matrix_rank(autoscaled_Xtrain) + 1, max_pls_component_number + 1), 1) r2all = list() r2cvall = list() for pls_component in pls_components: pls_model_in_cv = PLSRegression(n_components=pls_component) pls_model_in_cv.fit(autoscaled_Xtrain, autoscaled_ytrain) calculated_y_in_cv = np.ndarray.flatten( pls_model_in_cv.predict(autoscaled_Xtrain)) estimated_y_in_cv = np.ndarray.flatten( model_selection.cross_val_predict(pls_model_in_cv,
def optimise_pls_cv(X, y, n_comp, plot_components=True):
    """Choose the PLS component count by 10-fold CV and refit with the best one.

    Models with 1..n_comp components are scored by the mean squared error of
    their 10-fold cross-validated predictions. The count with the lowest MSE
    is used for a final model that is fit on all of (X, y).

    Parameters
    ----------
    X, y : predictors and response.
    n_comp : int
        Largest number of components to try.
    plot_components : bool
        When exactly ``True``, plot the CV MSE against the component count.

    Returns
    -------
    tuple
        ``(pls_opt, mse, y_c, y_cv)``: the fitted final model, the list of CV
        MSE values per component count, the calibration predictions and the
        cross-validated predictions of the final model.
    """
    component = np.arange(1, n_comp + 1)
    mse = []
    for step in range(1, n_comp + 1):
        candidate = PLSRegression(n_components=step)
        cv_prediction = cross_val_predict(candidate, X, y, cv=10)
        mse.append(mean_squared_error(y, cv_prediction))

        # Trick to update status on the same line
        stdout.write("\r%d%% completed" % (100 * (step) / n_comp))
        stdout.flush()
    stdout.write("\n")

    # Position of the minimum MSE (0-based, hence the +1 when reporting)
    best = np.argmin(mse)
    print("Suggested number of components: ", best + 1)
    stdout.write("\n")

    if plot_components is True:
        with plt.style.context('ggplot'):
            plt.plot(component, np.array(mse), '-v', color='blue', mfc='blue')
            plt.plot(component[best], np.array(mse)[best], 'P', ms=10, mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(left=-1)
        plt.show()

    # Final model with the optimal number of components, fit to the entire dataset
    pls_opt = PLSRegression(n_components=best + 1)
    pls_opt.fit(X, y)
    y_c = pls_opt.predict(X)

    # Cross-validated predictions of the final model
    y_cv = cross_val_predict(pls_opt, X, y, cv=10)

    # R2 and MSE for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)

    for line in ('R2 calib: %5.3f' % score_c,
                 'R2 CV: %5.3f' % score_cv,
                 'MSE calib: %5.3f' % mse_c,
                 'MSE CV: %5.3f' % mse_cv):
        print(line)

    # A predicted-vs-measured regression plot used to be sketched here in
    # commented-out form; it is intentionally not drawn.
    return pls_opt, mse, y_c, y_cv
def final_train(x, y, x_test, y_test, out_list, mn, age_group_all):
    """Refit model family `mn` on (x, y) with aggregated hyperparameters and score it.

    For the tuned families the hyperparameter is the median of the values
    stored in ``out_list`` and ``best_score`` is the mean of the stored scores.

    Parameters
    ----------
    x, y : training data and targets.
    x_test, y_test : data passed to ``predict`` for the final evaluation.
    out_list : sequence
        Per-run results; ``item[5]`` is read as the score and ``item[6]`` as
        the dict of best parameters ('C', 'n_estimators', 'n_components' or
        'alpha', depending on `mn`). Not used for 'RVM' and 'COMB'.
    mn : str
        One of 'LAD', 'RFR', 'PLSR', 'RR', 'RVM', 'COMB'.
    age_group_all : iterable
        Groups of target values; only used by 'COMB', which fits one LAD grid
        search per group.

    Returns
    -------
    tuple
        ``(model, best_score, pred_var)`` where ``pred_var`` is whatever
        ``predict`` returns for the test data.

    Raises
    ------
    ValueError
        If `mn` is not a supported model name.
    """
    # Third-party estimators are imported inside the branch that uses them.
    # A function-local import makes the name local to the *whole* function, so
    # importing LAD / RandomForestRegressor only in the 'COMB' branch (as
    # before) made the 'LAD' and 'RFR' branches fail with UnboundLocalError.
    if mn == 'LAD':
        from mord import LAD
        print(out_list)
        C_list, score_list = zip(*[(item[6]['C'], item[5]) for item in out_list])
        C_final = np.median(C_list)
        best_score = np.mean(score_list)
        print('in final LAD')
        print('para', C_list, C_final, 'score', score_list, best_score)
        model = LAD(epsilon=0.0, tol=0.0001, C=C_final, loss='epsilon_insensitive',
                    fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0,
                    random_state=None, max_iter=10000)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        n_est_list, score_list = zip(*[(item[6]['n_estimators'], item[5]) for item in out_list])
        n_est = int(np.median(n_est_list))
        best_score = np.mean(score_list)
        print('in final RFR')
        print('n_est_list', n_est_list, n_est, 'score', score_list, best_score)
        # The default criterion is the squared error; the explicit 'mse'
        # spelling is rejected by recent scikit-learn releases.
        rfr = RandomForestRegressor()
        params = {"n_estimators": [n_est]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'PLSR':
        n_comp_list, score_list = zip(*[(item[6]['n_components'], item[5]) for item in out_list])
        n_comp = int(np.median(n_comp_list))
        best_score = np.mean(score_list)
        print('in final PLSR')
        print('n_comp_list', n_comp_list, n_comp, 'score', score_list, best_score)
        pls_reg = PLSRegression()
        params = {'n_components': [n_comp]}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RR':
        from sklearn.linear_model import Ridge
        alpha_list, score_list = zip(*[(item[6]['alpha'], item[5]) for item in out_list])
        # alpha is a continuous regularisation strength; truncating it with
        # int() (as was done before) collapses every value below 1 to 0.
        alpha_final = float(np.median(alpha_list))
        best_score = np.mean(score_list)
        print('in final RR')
        print('n_comp_list', alpha_list, alpha_final, 'score', score_list, best_score)
        ridge = Ridge()
        params = {'alpha': [alpha_final]}
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'RVM':
        from skrvm import RVR
        print('in final RVM')
        model = RVR(kernel='linear')
        model.fit(x, y)
        best_score = 0
        pred_var = predict(mn, model, x_test, y_test)

    elif mn == 'COMB':
        from mord import LAD
        from sklearn.ensemble import RandomForestRegressor
        print('IN COMB')
        group_lad = dict()
        print('shapes', x.shape, y.shape)

        # Stage 1a: one LAD grid search on all training samples.
        lad1 = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive', fit_intercept=True,
                   intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1, param_grid=params, cv=5,
                                 scoring='neg_mean_absolute_error', verbose=0)
        broad_lad.fit(x, y)

        # Stage 1b: one LAD grid search per age group.
        for ages in age_group_all:
            # for every age in the age group collect the training data by
            # getting the indices (grouped by age, in order of appearance)
            idx_grp = [idx for item in ages for idx, val in enumerate(y) if val == item]
            key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
            x_samples_train = x[idx_grp]
            y_samples_train = y[idx_grp]
            lad2 = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive', fit_intercept=True,
                       intercept_scaling=1.0, dual=True, verbose=0, random_state=None,
                       max_iter=10000)
            params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
            specific_lad = GridSearchCV(lad2, param_grid=params2, cv=5,
                                        scoring='neg_mean_absolute_error', verbose=0)
            specific_lad.fit(x_samples_train, y_samples_train)
            group_lad[key_age_grp] = specific_lad

        # Stage 2: random forest on the stage-1 predictions.
        pred_all = make_predictions(x, broad_lad, group_lad)
        rfr = RandomForestRegressor()
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(pred_all, y)
        print("[INFO] RFR grid search best parameters: {}".format(model.best_params_))
        best_score = model.best_score_
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(mn, model, pred_all_test, y_test)

    else:
        # Previously this fell through to the return statement and failed with
        # an UnboundLocalError on `pred_var`.
        raise ValueError('unknown model name: {!r}'.format(mn))

    return model, best_score, pred_var
import os
import copy
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn import metrics
import pickle

# NOTE(review): hard-coded, machine-specific working directory; importing or
# running this script on another machine fails here.
os.chdir("D:/11. Programming/ML/01. FabWideSimulation6/")

# PLS model shared by the script: 6 latent components, no internal scaling.
pls = PLSRegression(n_components=6, scale=False, max_iter=500, copy=True)
lamda_PLS = 0.1

# Two-element target vector.
Tgt = np.array([0, 50])

# Constant matrices suffixed `_p1`: A_p1 and d_p1 are 2x2; C_p1 is 6x2 after
# the transpose. NOTE(review): presumably the gain, drift and covariate
# coefficients of a "process 1" simulation -- confirm against the model
# equations.
A_p1 = np.array([[0.5, -0.2], [0.25, 0.15]])
d_p1 = np.array([[0.1, 0], [0.05, 0]])
C_p1 = np.transpose(
    np.array([[0, 0.5, 0.05, 0, 0.15, 0], [0.085, 0, 0.025, 0.2, 0, 0]]))

# Scaled 2x2 identity matrices and the plain 2x2 identity.
L1 = 0.55 * np.identity(2)
L2 = 0.75 * np.identity(2)
I = np.identity(2)

N = 120
DoE_Queue = []


def sampling_up():
    """Draw one 2-element sample: N(0.4, var=0.2) and N(0.6, var=0.2).

    `np.random.normal` takes the standard deviation, hence the square root of
    0.2. Uses NumPy's global random state.
    """
    u1_p1 = np.random.normal(0.4, np.sqrt(0.2))
    u2_p1 = np.random.normal(0.6, np.sqrt(0.2))
    u_p1 = np.array([u1_p1, u2_p1])
    return u_p1
def train(m, x_train, y_train, x_test, y_test):
    """Fit the model family `m` on the training data and evaluate it.

    Most families are tuned with a 5-fold ``GridSearchCV``; 'RVM' and 'DTR'
    are fit directly. 'COMB' is a two-stage model: LAD grid searches (one on
    all samples plus one per age group) whose predictions feed a random
    forest grid search.

    Parameters
    ----------
    m : str
        One of 'LAD', 'MCLog', 'LogAT', 'LinearSVC', 'RFC', 'Lasso', 'RFR',
        'RR', 'PLSR', 'RVM', 'DTR', 'COMB'.
    x_train, y_train : training data and targets.
    x_test, y_test : data passed to ``predict`` for the test evaluation.

    Returns
    -------
    tuple
        ``(model, best_score, best_params, pred_var, train_var)``.
        ``pred_var`` / ``train_var`` are whatever ``predict`` returns for the
        test / training data. ``best_score`` and ``best_params`` come from the
        grid search (the second-stage search for 'COMB') and are ``0`` for the
        untuned 'RVM' and 'DTR' models. For 'COMB', ``model`` is the list
        ``[broad_lad, group_lad, model_2]``. An unknown `m` prints
        'unknown model' and yields ``([], 0, 0, {}, {})``.
    """
    print('training', m)
    model = []
    pred_var = {}
    train_var = {}

    # Keyword arguments shared by every LAD estimator built below.
    lad_kwargs = dict(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive', fit_intercept=True,
                      intercept_scaling=1.0, dual=True, verbose=0, random_state=None,
                      max_iter=10000)

    if m == 'LAD':
        from mord import LAD
        lad = LAD(**lad_kwargs)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(lad, param_grid=params, cv=5,
                             scoring='neg_mean_absolute_error', verbose=0)
        # LAD is fit on targets rounded to whole numbers
        y_train = y_train.astype(float).round()
        y_train = y_train.astype(int)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LAD grid search best parameters: {}".format(model.best_params_))

    elif m == 'MCLog':
        from sklearn.linear_model import LogisticRegression
        mcl = LogisticRegression(multi_class='multinomial', max_iter=10000,
                                 solver='newton-cg', fit_intercept=True)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(mcl, param_grid=params, cv=5,
                             scoring='neg_mean_absolute_error', verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] MCLog grid search best parameters: {}".format(model.best_params_))

    elif m == 'LogAT':
        # takes quite some time
        from mord import LogisticAT
        lat = LogisticAT()
        params = {"alpha": np.linspace(0, 1, 5)}
        model = GridSearchCV(lat, param_grid=params, cv=5,
                             scoring='neg_mean_absolute_error', verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LogAT grid search best parameters: {}".format(model.best_params_))

    elif m == 'LinearSVC':
        from sklearn.svm import LinearSVC
        svm = LinearSVC()
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LinearSVC grid search best parameters: {}".format(model.best_params_))

    elif m == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        params = {"n_estimators": [10, 100, 500, 1000]}
        model = GridSearchCV(rfc, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFC grid search best parameters: {}".format(model.best_params_))

    elif m == 'Lasso':
        from sklearn.linear_model import Lasso
        lasso = Lasso()
        params = {"alpha": [10]}
        model = GridSearchCV(lasso, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        # The message used to say "RFR" (copy-paste slip).
        print("[INFO] Lasso grid search best parameters: {}".format(model.best_params_))

    elif m == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        # The default criterion is the squared error; the explicit 'mse'
        # spelling is rejected by recent scikit-learn releases.
        rfr = RandomForestRegressor()
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(model.best_params_))

    elif m == 'RR':
        from sklearn.linear_model import Ridge
        ridge = Ridge()
        params = {
            'alpha': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        }
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        print("[INFO] Ridge Regression grid search best parameters: {}".format(
            model.best_params_))
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        pls_reg = PLSRegression()
        params = {'n_components': list(range(1, 21))}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        print("[INFO] PLS Regression grid search best parameters: {}".format(
            model.best_params_))
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'RVM':
        from skrvm import RVR
        print('in RVM')
        model = RVR(kernel='linear')
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        # fit with default settings, no hyperparameter search
        model = DecisionTreeRegressor()
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'COMB':
        from sklearn.ensemble import RandomForestRegressor
        from mord import LAD
        from group_pred import create_age_groups
        print('IN COMB')
        group_lad = dict()
        print('shapes', x_train.shape, y_train.shape)

        # Stage 1a: one LAD grid search on all samples (rounded targets).
        lad1 = LAD(**lad_kwargs)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1, param_grid=params, cv=5,
                                 scoring='neg_mean_absolute_error', verbose=0)
        y_train_r = y_train.astype(float).round()
        y_train_r = y_train_r.astype(int)
        broad_lad.fit(x_train, y_train_r)

        # Stage 1b: one LAD grid search per age group with more than 5 samples.
        age_group_all = create_age_groups(y_train_r, 10, 5)
        for ages in age_group_all:
            # for every age in the age group collect the training data by
            # getting the indices (grouped by age, in order of appearance)
            idx_grp = [idx for item in ages
                       for idx, val in enumerate(y_train_r) if val == item]
            print('group info', ages, len(idx_grp))
            if len(idx_grp) > 5:
                key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
                x_samples_train = x_train[idx_grp]
                y_samples_train = y_train_r[idx_grp]
                lad2 = LAD(**lad_kwargs)
                params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
                specific_lad = GridSearchCV(lad2, param_grid=params2, cv=5,
                                            scoring='neg_mean_absolute_error', verbose=0)
                specific_lad.fit(x_samples_train, y_samples_train)
                group_lad[key_age_grp] = specific_lad
        print('len_groups', len(group_lad))

        # Stage 2: random forest on the stage-1 predictions, fit against the
        # original (unrounded) targets.
        pred_all = make_predictions(x_train, broad_lad, group_lad)
        rfr = RandomForestRegressor()
        params = {"n_estimators": [500]}
        model_2 = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model_2.fit(pred_all, y_train)
        train_var = predict(m, model_2, pred_all, y_train)
        print("[INFO] RFR grid search best parameters: {}".format(model_2.best_params_))
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(m, model_2, pred_all_test, y_test)
        model = [broad_lad, group_lad, model_2]

    else:
        print('unknown model')
        # Nothing was fit: hand back the empty placeholders instead of failing
        # on an unbound `train_var` as before.
        return model, 0, 0, pred_var, train_var

    # `m == 'RVM' or 'DTR'` was always true, so the grid-search score and
    # parameters were never returned; test membership instead.
    if m in ('RVM', 'DTR'):
        return model, 0, 0, pred_var, train_var
    if m == 'COMB':
        return model, model_2.best_score_, model_2.best_params_, pred_var, train_var
    return model, model.best_score_, model.best_params_, pred_var, train_var
# オートスケーリング autoscaled_x_train = (x_train - x_train.mean()) / x_train.std() autoscaled_y_train = (y_train - y_train.mean()) / y_train.std() autoscaled_x_test = (x_test - x_train.mean()) / x_train.std() if method_name == 'pls': # CV による成分数の最適化 components = [] # 空の list の変数を作成して、成分数をこの変数に追加していきます同じく成分数をこの変数に追加 r2_in_cv_all = [] # 空の list の変数を作成して、成分数ごとのクロスバリデーション後の r2 をこの変数に追加 for component in range( 1, min(np.linalg.matrix_rank(autoscaled_x_train), max_number_of_principal_components) + 1): # PLS model = PLSRegression(n_components=component) # PLS モデルの宣言 estimated_y_in_cv = pd.DataFrame( cross_val_predict( model, autoscaled_x_train, autoscaled_y_train, cv=fold_number)) # クロスバリデーション推定値の計算し、DataFrame型に変換 estimated_y_in_cv = estimated_y_in_cv * y_train.std() + y_train.mean( ) # スケールをもとに戻す r2_in_cv = metrics.r2_score(y_train, estimated_y_in_cv) # r2 を計算 print(component, r2_in_cv) # 成分数と r2 を表示 r2_in_cv_all.append(r2_in_cv) # r2 を追加 components.append(component) # 成分数を追加 # 成分数ごとの CV 後の r2 をプロットし、CV 後のr2が最大のときを最適成分数に optimal_component_number = sample_functions.plot_and_selection_of_hyperparameter( components, r2_in_cv_all, 'number of components', 'cross-validated r2') print('\nCV で最適化された成分数 :', optimal_component_number)
# Fill empty values in the dataset by carrying the last valid observation
# forward. `ffill()` is the supported spelling; `fillna(method='ffill')` is
# deprecated in recent pandas releases.
# NOTE(review): leading NaNs (nothing to carry forward) stay NaN.
X = X.ffill()
# print("..........................")
# print(X.isnull().any())

from sklearn.model_selection import train_test_split

# Random 80/20 split (no fixed random_state, so the split changes per run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Work on the underlying NumPy arrays from here on.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

plsr = PLSRegression(n_components=2, scale=True, max_iter=500, tol=1e-06, copy=True)
# plsr = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# plsr
# model = plsr.fit(X_train, y_train)

# forest
# The target is flattened to 1-D before fitting.
model = plsr.fit(X_train, y_train.ravel())
pred = model.predict(X_test)

# accuracy
# print(model.score(X_test, y_test))
def _write_component_table(path, id_header, id_columns, values, nc):
    """Write a CSV of identifying columns followed by one value column per component count (1..nc)."""
    header = list(id_header)
    header.extend(range(1, nc + 1))
    with open(path, 'w', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(header)
        for ids, vals in zip(zip(*id_columns), values):
            row = list(ids)
            row.extend(vals)
            writer.writerow(row)


def _write_rmse_table(path, label, values):
    """Write a two-column CSV: number of components (starting at 1) and the matching RMSE value."""
    with open(path, 'w', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['NC', label])
        for n_comp, value in enumerate(values, start=1):
            writer.writerow([n_comp, value])


def pls_cal(dbfile,maskfile,outpath,which_elem,nc,normtype=1,mincomp=0,maxcomp=100,keepfile=None,removefile=None,cal_dir=None,masterlist_file=None,compfile=None,name_sub_file=None,testsetfile=None,nfolds=5,seed=None,skscale=False,max_samples=0.1,n_elems=9):
    """Train PLS models for one element and write predictions and diagnostics to disk.

    The spectra are read with ``ccam.read_db``, filtered with
    ``ccam.choose_spectra``, masked and normalized. An optional test set is
    split off by sample name, the remaining samples are assigned to folds,
    and for every number of components from 1 to ``nc`` a PLSRegression
    model is cross-validated and then refit on the whole training set.
    RMSECV/RMSEC/RMSEP, Hotelling T2, Q residuals, regression coefficients,
    predictions, a pickle of the fitted models and the RMSE plots are
    written under file names that start with ``outpath + which_elem``.

    Parameters
    ----------
    dbfile : passed to ``ccam.read_db``.
    maskfile : passed to ``ccam.mask``.
    outpath : string prepended (by plain concatenation) to every output name.
    which_elem : label looked up in ``labels[2:]`` to select the composition column.
    nc : largest number of PLS components to fit.
    normtype : passed to ``ccam.normalize``.
    mincomp, maxcomp : passed to ``ccam.choose_spectra`` and ``cal_rmses``;
        also part of the output file names.
    keepfile, removefile : passed to ``ccam.choose_spectra``.
    cal_dir : if not None, read with ``ccam.read_ccs`` and predicted with
        every model; extra cal-target CSVs and plots are written.
    masterlist_file, name_sub_file : passed to ``ccam.target_lookup``.
    compfile : passed to ``ccam.target_comp_lookup``.
    testsetfile : headerless CSV whose first column lists the sample names
        to hold out. When None, no samples are held out and RMSEP is NaN.
    nfolds : number of cross-validation folds.
    seed, max_samples : accepted for interface compatibility; not used here.
    skscale : if true, PLSRegression scales the data itself and no mean
        centering is done here; otherwise ``ccam.meancenter`` is applied.
    n_elems : passed to ``ccam.read_db``.

    Returns
    -------
    None. All results are written to files.
    """
    plstype_string = 'sklearn_scale' if skscale else 'sklearn'

    # Pieces shared by the output file names.
    comp_range = str(mincomp) + '-' + str(maxcomp)
    base = outpath + which_elem + '_'
    prefix = base + plstype_string + '_nc' + str(nc) + '_norm' + str(normtype) + '_' + comp_range
    short_prefix = base + comp_range

    print('Reading database')
    sys.stdout.flush()
    spectra, comps, spect_index, names, labels, wvl = ccam.read_db(dbfile, compcheck=True, n_elems=n_elems)
    oxides = labels[2:]
    compindex = numpy.where(oxides == which_elem)[0]

    print('Choosing spectra')
    which_removed = prefix + '_removed.csv'
    spectra, names, spect_index, comps = ccam.choose_spectra(
        spectra, spect_index, names, comps, compindex,
        mincomp=mincomp, maxcomp=maxcomp, keepfile=keepfile,
        removefile=removefile, which_removed=which_removed)

    print('Masking spectra')
    spectra, wvl = ccam.mask(spectra, wvl, maskfile)

    print('Normalizing spectra')
    spectra = ccam.normalize(spectra, wvl, normtype=normtype)

    print('Removing Test Set')
    if testsetfile is not None:
        data = pandas.read_csv(testsetfile, header=None)
        testnames = numpy.array(data.iloc[:, 0])
        testind = numpy.in1d(names, testnames)
    else:
        # No test-set file: keep every sample for training instead of
        # failing later on undefined train/test variables.
        testind = numpy.zeros(len(names), dtype=bool)
    trainind = ~testind
    has_test = bool(testind.any())

    names_test = names[testind]
    spectra_test = spectra[testind]
    spect_index_test = spect_index[testind]
    comps_test = comps[testind, compindex]

    # One histogram cell per fold plus one for the test set. The grid is at
    # least the original 2x3 and grows when nfolds > 5.
    ncols = 3
    nrows = max(2, -(-(nfolds + 1) // ncols))
    # Scalar bounds (rather than 1-element arrays) for the histogram range.
    hist_range = [float(numpy.min(comps[:, compindex])), float(numpy.max(comps[:, compindex]))]

    if has_test:
        # Subplot indices start at 1; index 0 is rejected by current
        # matplotlib. The test set goes in the last cell, folds use 1..nfolds.
        plot.subplot(nrows, ncols, nrows * ncols)
        plot.hist(comps_test, bins=20, range=hist_range)
        plot.xlabel(which_elem + ' wt.%', fontsize=20)
        plot.ylabel('# of samples', fontsize=20)
        plot.title('Test Set', fontsize=23)

    # numpy.where builds labels wide enough for 'Train'; zeros_like(names)
    # would truncate them to the width of the name dtype.
    traintest = numpy.where(testind, 'Test', 'Train')
    names_train = names[trainind]
    spectra_train = spectra[trainind]
    spect_index_train = spect_index[trainind]
    comps_train = comps[trainind, compindex]

    print('Assigning Folds')
    # Sort the unique sample names by composition and deal them out to
    # folds 1..nfolds in turn, so all spectra of a sample share a fold.
    names_unique, uniqueindex = numpy.unique(names_train, return_index=True)
    comps_unique_train = comps_train[uniqueindex]
    names_unique_sorted = names_unique[comps_unique_train.argsort()]
    folds_train = numpy.zeros(len(names_train))
    for i, sample_name in enumerate(names_unique_sorted):
        fold = i % nfolds + 1
        print(sample_name)
        print(fold)
        folds_train[numpy.in1d(names_train, sample_name)] = fold

    # The mask has one entry per training spectrum, so it must index the
    # training arrays (indexing the full arrays fails once a test set has
    # been removed).
    unassigned = folds_train == 0
    names_nofold = names_train[unassigned]
    spect_index_nofold = spect_index_train[unassigned]
    # Append the samples not assigned to folds to the removed-spectra file.
    # csv.writer needs a text-mode file on Python 3.
    with open(which_removed, 'a', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        for nofold_name, nofold_index in zip(names_nofold, spect_index_nofold):
            writer.writerow([nofold_name, nofold_index, 'No Fold'])

    # Remove spectra that are not assigned to any fold.
    assigned = ~unassigned
    spectra_train = spectra_train[assigned, :]
    spect_index_train = spect_index_train[assigned]
    names_train = names_train[assigned]
    comps_train = comps_train[assigned]
    folds_train = folds_train[assigned]

    print('Do Leave One Label Out (LOLO) cross validation with all folds but the test set')
    # Arrays to hold cross validation predictions and RMSEs.
    train_predict_cv = numpy.zeros((len(names_train), nc))
    RMSECV = numpy.zeros(nc)

    for i in range(1, nfolds + 1):
        held_out = folds_train == i
        kept = ~held_out

        plot.subplot(nrows, ncols, i)
        plot.hist(comps_train[held_out], bins=20, range=hist_range)
        plot.xlabel(which_elem + ' wt.%')
        plot.ylabel('# of samples')
        plot.title('Fold ' + str(i))

        print('Holding out fold #' + str(i))
        if skscale:
            X_cv_in = spectra_train[kept, :]
            X_cv_out = spectra_train[held_out, :]
            Y_cv_in = comps_train[kept]
            Y_cv_in_mean = 0
        else:
            # Mean center the spectra left in, and apply the same mean to
            # those left out.
            X_cv_in, X_cv_in_mean = ccam.meancenter(spectra_train[kept, :])
            X_cv_out = ccam.meancenter(spectra_train[held_out, :], X_mean=X_cv_in_mean)[0]
            # Mean center the compositions left in.
            Y_cv_in, Y_cv_in_mean = ccam.meancenter(comps_train[kept])

        # Step through each number of components.
        for j in range(1, nc + 1):
            print('Training Model for ' + str(j) + ' components')
            PLS1model = PLSRegression(n_components=j, scale=skscale)
            PLS1model.fit(X_cv_in, Y_cv_in)
            train_predict_cv[held_out, j - 1] = numpy.squeeze(PLS1model.predict(X_cv_out) + Y_cv_in_mean)

    plot.tight_layout()
    fig = plot.gcf()
    fig.savefig(short_prefix + '_fold_hist.png', dpi=600)
    fig.clf()

    # Calculate RMSECV.
    for i in range(nc):
        sqerr = (train_predict_cv[:, i] - comps_train) ** 2.0
        RMSECV[i] = numpy.sqrt(numpy.mean(sqerr))

    # Mean center the full model (unless sklearn does the scaling).
    if skscale:
        X = spectra_train
        X_test = spectra_test
        X_all = spectra
        Y = comps_train
        Y_mean = 0
    else:
        X, X_mean = ccam.meancenter(spectra_train)
        X_test = ccam.meancenter(spectra_test, X_mean=X_mean)[0] if has_test else spectra_test
        X_all = ccam.meancenter(spectra, X_mean=X_mean)[0]
        Y, Y_mean = ccam.meancenter(comps_train)

    # Arrays for results and RMSEs.
    n_train, n_features = X.shape[0], X.shape[1]
    trainset_results = numpy.zeros((len(names_train), nc))
    testset_results = numpy.zeros((len(names_test), nc))
    results = numpy.zeros((len(names), nc))
    # Without a test set there is nothing to predict; report NaN.
    RMSEP = numpy.zeros(nc) if has_test else numpy.full(nc, numpy.nan)
    RMSEC = numpy.zeros(nc)
    beta = numpy.zeros((n_features, nc))
    Q_res = numpy.zeros((n_train, nc))
    T2 = numpy.zeros((n_train, nc))
    # NOTE: the original also ran an SVD of cov(X X^T) here; its result was
    # never used, so that computation has been dropped.

    # Set up variables for the cal target calculation.
    if cal_dir is not None:
        print('Reading cal target data')
        cal_data, cal_wvl, cal_filelist = ccam.read_ccs(cal_dir)
        cal_data, cal_wvl = ccam.mask(cal_data, cal_wvl, maskfile)
        cal_data = ccam.normalize(cal_data, cal_wvl, normtype=normtype)
        if skscale:
            cal_data_centered = cal_data
        else:
            cal_data_centered = ccam.meancenter(cal_data, X_mean=X_mean)[0]

        # One row per cal target, in the order returned by cal_rmses:
        # KGAMEDS, MACUSANITE, NAU2HIS, NAU2LOS, NAU2MEDS, NORITE, PICRITE,
        # SHERGOTTITE.
        cal_target_rmsep = numpy.zeros((8, nc))
        RMSEP_cal_good = numpy.zeros(nc)
        targets, dists, amps, nshots = ccam.target_lookup(cal_filelist, masterlist_file, name_sub_file)
        target_comps = ccam.target_comp_lookup(targets, compfile, which_elem)
        cal_results = numpy.zeros((len(targets), nc))

    model_list = []
    # Now step through each # of components with the full model.
    for j in range(1, nc + 1):
        print('Training full model for ' + str(j) + ' components')
        PLS1model = PLSRegression(n_components=j, scale=skscale)
        PLS1model.fit(X, Y)

        # Hotelling T2 per sample: t (T'T)^-1 t'. The inverse is computed
        # once per model instead of once per sample (same values up to
        # floating-point rounding).
        T = PLS1model.x_scores_
        TtT_inv = numpy.linalg.inv(numpy.dot(T.transpose(), T))
        T2[:, j - 1] = numpy.sum(numpy.dot(T, TtT_inv) * T, axis=1)

        # Q residual per sample: squared norm of the row of X left after
        # the scores/loadings reconstruction. Row-wise sum of squares equals
        # diag(E E') without building the n x n product.
        E = X - numpy.dot(PLS1model.x_scores_, PLS1model.x_loadings_.transpose())
        Q_res[:, j - 1] = numpy.sum(E * E, axis=1)

        trainset_results[:, j - 1] = numpy.squeeze(PLS1model.predict(X) + Y_mean)
        if has_test:
            testset_results[:, j - 1] = numpy.squeeze(PLS1model.predict(X_test) + Y_mean)
        results[:, j - 1] = numpy.squeeze(PLS1model.predict(X_all) + Y_mean)

        # Older scikit-learn exposed the coefficients as `coefs`; current
        # releases only provide `coef_`. Prefer the legacy attribute when it
        # exists so results on old installs are unchanged.
        coefs = getattr(PLS1model, 'coefs', None)
        if coefs is None:
            coefs = PLS1model.coef_
        beta[:, j - 1] = numpy.squeeze(coefs)
        model_list.append([PLS1model])

        if cal_dir is not None:
            cal_results[:, j - 1] = numpy.squeeze(PLS1model.predict(cal_data_centered) + Y_mean)
            cal_rmse_values = cal_rmses(targets, nc, target_comps, j, cal_data_centered, Y_mean, mincomp, maxcomp, cal_results)
            cal_target_rmsep[:, j - 1] = cal_rmse_values[:8]
            RMSEP_cal_good[j - 1] = cal_rmse_values[8]

        RMSEC[j - 1] = numpy.sqrt(numpy.mean((trainset_results[:, j - 1] - comps_train) ** 2.0))
        if has_test:
            RMSEP[j - 1] = numpy.sqrt(numpy.mean((testset_results[:, j - 1] - comps_test) ** 2.0))

    # Pickle the PLS models (this file name has no '_nc' part).
    with open(base + plstype_string + '_norm' + str(normtype) + '_' + comp_range + '.pkl', 'wb') as picklefile:
        pickle.dump(model_list, picklefile)

    if cal_dir is not None:
        # Targets whose one-component RMSEP is non-zero count as present.
        n_good_cal = numpy.sum(cal_target_rmsep[:, 0] != 0)
        print(n_good_cal)
        # Built-in sum adds the rows one after another, left to right.
        RMSEP_cal = sum(cal_target_rmsep) / n_good_cal
        RMSEP_single_cals = list(cal_target_rmsep) + [RMSEP_cal]

        _write_component_table(
            prefix + '_caltargets_predict.csv',
            ['File', 'Target', 'Laser Energy', 'True_Comp'],
            [cal_filelist, targets, amps, target_comps],
            cal_results, nc)
        _write_rmse_table(prefix + '_RMSEP_caltargets.csv', 'RMSEP Cal Targets (wt.%)', RMSEP_cal)

        ccam.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs', prefix + '_RMSE_plot_cal.png', RMSEP_cals=RMSEP_single_cals)
        ccam.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs', prefix + '_RMSE_plot_cal_good.png', RMSEP_good=RMSEP_cal_good)

    # Plot RMSEs.
    ccam.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs', prefix + '_RMSE_plot.png')

    # Write output info to files.
    train_id_columns = [names_train, spect_index_train, folds_train, comps_train]

    _write_component_table(prefix + '_Q_res.csv', ["Sample", "Spectrum", "Fold", "True Comp"], train_id_columns, Q_res, nc)

    with open(short_prefix + '_quartiles.csv', 'w', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow([which_elem])
        elem_comps = comps[:, compindex]
        for label, pct in (('Min', 0), ('1st Quartile', 25), ('Median', 50), ('3rd Quartile', 75), ('Max', 100)):
            writer.writerow([label, numpy.percentile(elem_comps, pct)])

    _write_component_table(prefix + '_HotellingT2.csv', ["Sample", "Spectrum", "Fold", "True Comp"], train_id_columns, T2, nc)

    _write_rmse_table(prefix + '_RMSECV.csv', 'RMSECV (wt.%)', RMSECV)
    _write_rmse_table(prefix + '_RMSEC.csv', 'RMSEC (wt.%)', RMSEC)
    _write_rmse_table(prefix + '_RMSEP.csv', 'RMSEP (wt.%)', RMSEP)

    _write_component_table(prefix + '_cv_predict.csv', ['Sample', 'Spectrum', 'Fold', 'True_Comp'], train_id_columns, train_predict_cv, nc)
    _write_component_table(prefix + '_train_predict.csv', ['Sample', 'Spectrum', 'Fold', 'True_Comp'], train_id_columns, trainset_results, nc)
    _write_component_table(
        prefix + '_test_predict.csv',
        ['Sample', 'Spectrum', 'True_Comp'],
        [names_test, spect_index_test, comps_test],
        testset_results, nc)
    # comps[:, compindex] keeps its second axis, so each 'True_Comp' cell is
    # written as a one-element array, exactly as the original did.
    _write_component_table(
        prefix + '_all_predict.csv',
        ['Sample', 'Spectrum', 'Set', 'True_Comp'],
        [names, spect_index, traintest, comps[:, compindex]],
        results, nc)
    _write_component_table(prefix + '_beta_coeffs.csv', ['wvl'], [wvl], beta, nc)

    if not skscale:
        with open(prefix + '_meancenters.csv', 'w', newline='') as writefile:
            writer = csv.writer(writefile, delimiter=',')
            writer.writerow([which_elem + ' mean', Y_mean])
            for i in range(len(wvl)):
                writer.writerow([wvl[i], X_mean[i]])

    with open(prefix + '_inputinfo.csv', 'w', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['Spectral database =', dbfile])
        writer.writerow(['Spectra Kept =', keepfile])
        writer.writerow(['Spectra Removed =', which_removed])
        writer.writerow(['Test Set File =', testsetfile])
        writer.writerow(['Mask File =', maskfile])
        writer.writerow(['Algorithm =', plstype_string])
        writer.writerow(['# of components =', nc])
        writer.writerow(['Normalization Type =', normtype])
        writer.writerow(['Composition Min. =', mincomp])
        writer.writerow(['Composition Max. =', maxcomp])
# Earlier linear-regression baseline, kept for reference:
# regr = LinearRegression()
# regr.fit(X_reduced,y)
# ypc=regr.predict(X_reduced)
# print('The R2 score is: ', r2_score(ypc,y))

# Load the comma-separated table (skipping its header row) and force every
# value to float.
result = np.genfromtxt(
    "final_pca_reduced_data_with_label.txt",
    dtype=None,
    delimiter=',',
    skip_header=1,
)
result = result.astype('float')
print(result.shape)

# The first two columns are the model inputs; the third column is the target.
X_reduced, y = result[:, :2], result[:, 2]

# Fit a PLS regression with the library's default settings.
pls = PLSRegression()
pls.fit(X_reduced, y)
# y_test_pred=pls.predict(X_test)
# print('The R2 score is: ', r2_score(y_test_pred,Y_test))

print('Starting gathering testing samples')
testfilescount = 6
testfilesnames = [
    '2013-06',
    '2014-07',
    '2015-08',
    '2013-01',
    '2014-12',
    '2015-02',
]

# Accumulators for the metrics, filled in by later code.
final_mae, final_mse, final_r2 = [], [], []
def __init__(self, A, d, C, seed):
    """Create the PLS model, seed NumPy's global RNG and store A, d and C.

    A, d, C: stored unchanged on the instance under the same names.
    seed: passed to ``np.random.seed``.
    """
    pls_settings = {
        'n_components': 6,
        'scale': False,
        'max_iter': 50000,
        'copy': True,
    }
    self.pls = PLSRegression(**pls_settings)
    # Seeding affects the process-wide NumPy random state, not just this object.
    np.random.seed(seed)
    self.A, self.d, self.C = A, d, C
def bgap_pred_all_features(slef):
    """Compare regressors on the band-gap data with leave-one-out cross-validation.

    Reads the tab-separated file ``path + '/ML/data/dft_data_with_features.csv'``
    (``path`` is a module-level name defined outside this function), drops
    the identifier and oxygen-related columns listed below, and uses every
    remaining column except 'Ehull' and 'Bandgap' as a feature, with
    'Bandgap' as the target.

    For 20 train/test splits (``random_state`` 0..19, 20% held out) each
    scaled pipeline is scored by leave-one-out CV on the training part.
    The per-split MSE and MAE are printed, followed by a summary per model:
    mean RMSE, mean MAE and the standard deviation of the MAE over the
    splits.

    The first parameter keeps its original spelling (``slef``, presumably a
    typo for ``self``) so the interface is unchanged; it is not used.

    Returns None; all results go to standard output.
    """
    # Local import: scores both metrics from a single fit per CV split.
    from sklearn.model_selection import cross_validate

    df = pd.read_csv(path + '/ML/data/dft_data_with_features.csv', sep='\t')
    df = df.drop([
        'StructuredFormula', 'A1', 'A1_frac', 'A2', 'A2_frac', 'B1',
        'B1_frac', 'B2', 'B2_frac', 'O', 'O_frac', 'atom_numO', 'mend_numO',
        'atomic_rO', 'O_X', 'M_O', 'V_O', 'therm_con_O', 'polarizability_O',
        'lattice_const_O', 'Row_O', 'Group_O', 'nO', 'rO'
    ], axis=1)
    df_x = df.drop(['Ehull', 'Bandgap'], axis=1)
    target = df[['Bandgap']].values.ravel()

    # Every model is preceded by standardization. PLS stays left out, as in
    # the original pipeline list where it was commented out:
    # ('PLS', PLSRegression)
    estimators = [
        ('DT', DecisionTreeRegressor),
        ('SVR', SVR),
        ('KNN', KNeighborsRegressor),
        ('RAND', RandomForestRegressor),
        ('GBR', GradientBoostingRegressor),
    ]
    # cross_validate clones the estimator for every fit, so the unfitted
    # pipelines can be built once and reused across splits.
    pipelines = [
        (name, Pipeline([('Scaler', StandardScaler()), (name, estimator())]))
        for name, estimator in estimators
    ]

    # Result lists are keyed by exactly the models that run, so the summary
    # below never averages an empty list.
    rmse_by_model = {name: [] for name, _ in pipelines}
    mae_by_model = {name: [] for name, _ in pipelines}
    scoring = ('neg_mean_squared_error', 'neg_mean_absolute_error')

    for i in range(20):
        X_train, _, y_train, _ = train_test_split(
            df_x, target, test_size=0.2, random_state=i)

        for name, model in pipelines:
            # cv = KFold(n_splits=10, random_state=10)
            scores = cross_validate(
                model, X_train, y_train, cv=LeaveOneOut(), scoring=scoring)
            cv_results_mse = scores['test_neg_mean_squared_error']
            cv_results_mae = scores['test_neg_mean_absolute_error']

            print("%s: MSE %f (%f)" % (name, cv_results_mse.mean(),
                                       cv_results_mse.std()))
            print("%s: MAE %f (%f)" % (name, cv_results_mae.mean(),
                                       cv_results_mae.std()))

            # The scorers return negated errors; flip the sign back.
            rmse_by_model[name].append(np.sqrt(-1 * cv_results_mse.mean()))
            mae_by_model[name].append(-1 * cv_results_mae.mean())

    print('\n')
    # The label says LOO because the splitter is LeaveOneOut, not 10-fold.
    for name, _ in pipelines:
        mae_values = np.array(mae_by_model[name])
        print('%s LOO CV RMSE: %.3f MAE: %.3f (%.3f)' %
              (name, np.array(rmse_by_model[name]).mean(), mae_values.mean(),
               mae_values.std()))