def pls_cv(self, ncomp_range=range(1, 21), plot=False, verbose=False,
           osc_params=(10, 1)):
    # Separating X from Y for PLS
    X = self.df[self.freqs].to_numpy()
    Y = self.df[self.y_name].to_numpy().reshape(-1, 1)
    sample_std = np.std(self.df[self.y_name])
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")
    # Array for storing CV errors
    cv_RMSE_all = np.zeros([len(folds), len(ncomp_range)])
    i = 0
    for train, val in folds:
        # If OSC preprocessing is specified
        if len(osc_params) == 2:
            osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
            osc.fit(X[train], Y[train])
            X_train_osc = osc.X_osc
            X_val_osc = osc.transform(X[val])
        j = 0
        for ncomp in ncomp_range:
            pls = PLSRegression(n_components=ncomp, scale=False)
            if len(osc_params) == 2:
                pls.fit(X_train_osc, Y[train])
                cv_RMSE_all[i, j] = metrics.mean_squared_error(
                    Y[val], pls.predict(X_val_osc))**0.5
            else:
                pls.fit(X[train], Y[train])
                cv_RMSE_all[i, j] = metrics.mean_squared_error(
                    Y[val], pls.predict(X[val]))**0.5
            j = j + 1
        i = i + 1
    # Printing and plotting CV results
    cv_RMSE_ncomp = np.mean(cv_RMSE_all, axis=0)
    cv_RPD_ncomp = sample_std / cv_RMSE_ncomp
    if plot:
        fig = plt.figure(figsize=(12, 8))
        plt.gca().xaxis.grid(True)
        plt.xticks(ncomp_range)
        plt.ylabel("RPD")
        plt.xlabel("Number of components")
        plt.plot(ncomp_range, cv_RPD_ncomp)
    # Best model
    rpd_best = max(cv_RPD_ncomp)
    ncomp_best = ncomp_range[cv_RMSE_ncomp.argmin()]
    if verbose:
        print("Best RMSE: ", cv_RMSE_ncomp.min())
        print("Best RPD: ", rpd_best)
        print("Number of latent components: ", ncomp_best)
    return (ncomp_best, rpd_best)
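# --- Standalone sketch (illustration only, not part of the class) ---
# The loop in pls_cv is plain scikit-learn under the hood. On synthetic data,
# the measurement-day CV ("MD") reduces to LeaveOneGroupOut with the day as
# the group; everything below uses public sklearn/numpy APIs and made-up data.
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import LeaveOneGroupOut
from sklearn import metrics

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(60, 50))                  # 60 spectra, 50 frequencies
y_demo = X_demo[:, :5].sum(axis=1) + rng.normal(scale=0.1, size=60)
days = np.repeat([0, 1, 2], 20)                     # three measurement days

rmse = []
for train, val in LeaveOneGroupOut().split(X_demo, y_demo, groups=days):
    pls = PLSRegression(n_components=5, scale=False)
    pls.fit(X_demo[train], y_demo[train])
    rmse.append(metrics.mean_squared_error(y_demo[val],
                                           pls.predict(X_demo[val]))**0.5)
# RPD = sample std / mean CV RMSE, as in pls_cv above
print("RPD:", np.std(y_demo) / np.mean(rmse))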
def fssregression_cv(self, inner_cv="kfold", inner_cv_param=5, maxvar=2,
                     verbose=False, osc_params=(10, 1)):
    # inner CV can be "kfold" or "none"
    # Separating X from Y
    X = self.df[self.freqs]
    Y = self.df[self.y_name]
    # List of the variables selected in each fold
    best_vars = []
    reg = FSSRegression(inner_cv, inner_cv_param, maxvar)
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")
    i = 0
    # Array for CV errors
    cv_RMSE_all = np.zeros([len(folds)])
    for train, val in folds:
        # If OSC preprocessing is specified
        if len(osc_params) == 2:
            osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
            # FSSR needs column names, so it uses pandas, but OSC uses numpy arrays
            osc.fit(X.iloc[train].to_numpy(),
                    Y.iloc[train].to_numpy().reshape(-1, 1))
            X_train_osc = pd.DataFrame(data=osc.X_osc, columns=self.freqs)
            X_val_osc = pd.DataFrame(data=osc.transform(X.iloc[val].to_numpy()),
                                     columns=self.freqs)
            # Fit and predict
            reg.fit(X_train_osc, Y.iloc[train])
            cv_RMSE_all[i] = metrics.mean_squared_error(
                Y.iloc[val], reg.predict(X_val_osc))**0.5
            best_vars.append(reg.bestvar)
        else:
            reg.fit(X.iloc[train], Y.iloc[train])
            cv_RMSE_all[i] = metrics.mean_squared_error(
                Y.iloc[val], reg.predict(X.iloc[val]))**0.5
            best_vars.append(reg.bestvar)
        i = i + 1
    cv_RMSE = np.mean(cv_RMSE_all)
    rpd = np.std(self.df[self.y_name]) / cv_RMSE
    if verbose:
        print("RMSE: ", cv_RMSE)
        print("RPD: ", rpd)
        print("Selected freqs: ", best_vars)
        k = 0
        for day in self.df[self.date_name].unique():
            print("Date: {0}, Measurements: {1:.0f}, RMSE: {2:.2f}, selected vars: {3}".format(
                np.datetime_as_string(day, unit='D'),
                sum(self.df[self.date_name] == day),
                cv_RMSE_all[k],
                len(best_vars[k])))
            k = k + 1
    return rpd
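# Usage sketch (hypothetical: the host class and FSSRegression are defined
# elsewhere in this project, so the call below is an assumption about how an
# instance `model` would be driven, not a tested invocation):
#
#     rpd = model.fssregression_cv(inner_cv="kfold", inner_cv_param=5,
#                                  maxvar=3, verbose=True, osc_params=(10, 1))
#
# Note that the per-day breakdown printed in verbose mode pairs fold k with the
# k-th unique date, so it is only meaningful with cval="MD" (one fold per day).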
def __init__(self):
    super(LongWebW, self).__init__()
    self.grid = QGridLayout(self)
    self.osc = OSC()
    self.osc.clear()
    self.view_list = [LongWebView(), LongWebView()]
    self.view_list[0].setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Preferred)
    self.view_list[1].setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Preferred)
    self.onData_list = [[], []]   # [[(cur_set, onset), ...], [(cur_set, onset), ...]]
    self.offData_list = [[], []]  # [[(cur_set, offset), ...], [(cur_set, offset), ...]]
    self.onOff_list = [[], []]
    self.grid.addWidget(self.view_list[0], 0, 0, 1, 1)
    self.grid.addWidget(self.view_list[1], 1, 0, 1, 1)
    self.setMinimumWidth(1200)
    self.setFixedHeight(700)
    self.cur_notes_set = set()
    self.lock = threading.RLock()
    self.idx = 0
    self.auto = False
    self.renewParserT("molihua.abc")
def ipls_cv(self, version="basic", nint_list=[8, 16, 32], ncomp_range=range(1, 10),
            inner_cv="kfold", inner_cv_param=5, verbose=True, osc_params=(10, 1)):
    # Separating X from Y; kept as pandas objects because IPLS needs column names
    X = self.df[self.freqs]
    Y = self.df[self.y_name]
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")
    # Array for CV errors
    cv_RMSE_all = np.zeros([len(folds), len(ncomp_range), len(nint_list)])
    i = 0
    for train, val in folds:
        # If OSC preprocessing is specified
        if len(osc_params) == 2:
            osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
            # IPLS needs column names, so it uses pandas, but OSC uses numpy arrays
            osc.fit(X.iloc[train].to_numpy(),
                    Y.iloc[train].to_numpy().reshape(-1, 1))
            X_train_osc = pd.DataFrame(data=osc.X_osc, columns=self.freqs)
            X_val_osc = pd.DataFrame(data=osc.transform(X.iloc[val].to_numpy()),
                                     columns=self.freqs)
        j = 0
        for ncomp in ncomp_range:
            k = 0
            for nint in nint_list:
                ipls_obj = IntervalPLSRegression(ncomp=ncomp, nint=nint,
                                                 cv_type=inner_cv,
                                                 cv_param=inner_cv_param)
                if len(osc_params) == 2:
                    ipls_obj.fit(X_train_osc, Y.iloc[train])
                    cv_RMSE_all[i, j, k] = metrics.mean_squared_error(
                        Y.iloc[val], ipls_obj.predict(X_val_osc))**0.5
                else:
                    ipls_obj.fit(X.iloc[train], Y.iloc[train])
                    cv_RMSE_all[i, j, k] = metrics.mean_squared_error(
                        Y.iloc[val], ipls_obj.predict(X.iloc[val]))**0.5
                k = k + 1
            j = j + 1
        i = i + 1
    cv_RMSE = np.mean(cv_RMSE_all, axis=0)
    RMSE_best = np.amin(cv_RMSE)
    rpd_best = np.std(self.df[self.y_name]) / RMSE_best
    # Best model
    ncomp_best = ncomp_range[np.where(cv_RMSE == RMSE_best)[0][0]]
    nint_best = nint_list[np.where(cv_RMSE == RMSE_best)[1][0]]
    if verbose:
        print("Best RMSE: ", RMSE_best)
        print("Best RPD: ", rpd_best)
        print("Number of components:", ncomp_best)
        print("Number of intervals:", nint_best)
    return (ncomp_best, nint_best, rpd_best)
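# --- Standalone sketch of the interval-PLS idea (illustration only) ---
# IntervalPLSRegression is a project class not shown in this file; the core of
# the method is to split the frequency axis into nint contiguous intervals,
# fit a PLS model per interval, and keep the best interval. The simplified,
# self-contained version below scores intervals in-sample, whereas the real
# class uses the inner CV (inner_cv / inner_cv_param) for that selection.
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn import metrics

rng = np.random.default_rng(1)
X_demo = rng.normal(size=(40, 64))
y_demo = X_demo[:, 8:16].sum(axis=1) + rng.normal(scale=0.1, size=40)

nint, ncomp = 8, 3
best_rmse, best_interval = np.inf, None
for cols in np.array_split(np.arange(X_demo.shape[1]), nint):
    pls = PLSRegression(n_components=ncomp, scale=False)
    pls.fit(X_demo[:, cols], y_demo)
    rmse = metrics.mean_squared_error(y_demo, pls.predict(X_demo[:, cols]))**0.5
    if rmse < best_rmse:
        best_rmse, best_interval = rmse, cols
print("best interval columns:", best_interval[0], "-", best_interval[-1])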
def mcw_pls_cv(self, ncomp_range=range(1, 21), sig_start=0.1, optimization="grid",
               plot=False, verbose=True, osc_params=(10, 1)):
    # Separating X from Y for PLS
    # X needs to be converted from a pandas df to a numpy array
    X = self.df[self.freqs].to_numpy()
    # Y needs to be converted from a pandas series to a numpy array and
    # reshaped from (N,) to (N, 1)
    Y = self.df[self.y_name].to_numpy().reshape(-1, 1)
    sample_std = np.std(self.df[self.y_name])
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")

    if optimization == "grid":
        # Create a search vector from the starting value for the grid search
        sig_list = np.linspace(sig_start / 10, sig_start * 10, 30)
        rpd_best_all = 0
        non_improve = 0
        repeat = True
        while repeat:
            # Array for storing CV errors
            cv_RMSE_all = np.zeros([len(folds), len(ncomp_range), len(sig_list)])
            i = 0
            for train, val in folds:
                # If OSC preprocessing is specified
                if len(osc_params) == 2:
                    osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
                    osc.fit(X[train], Y[train])
                    X_train_osc = osc.X_osc
                    X_val_osc = osc.transform(X[val])
                j = 0
                for ncomp in ncomp_range:
                    k = 0
                    for sig in sig_list:
                        pls = mcw_pls_sklearn(n_components=ncomp, max_iter=30,
                                              R_initial=None, scale_sigma2=sig)
                        if len(osc_params) == 2:
                            pls.fit(X_train_osc, Y[train])
                            cv_RMSE_all[i, j, k] = metrics.mean_squared_error(
                                Y[val], pls.predict(X_val_osc))**0.5
                        else:
                            pls.fit(X[train], Y[train])
                            cv_RMSE_all[i, j, k] = metrics.mean_squared_error(
                                Y[val], pls.predict(X[val]))**0.5
                        k = k + 1
                    j = j + 1
                i = i + 1
            cv_RMSE_ncomp_sigs = np.mean(cv_RMSE_all, axis=0)
            # Best model of this iteration
            ncomp_best = ncomp_range[np.where(
                cv_RMSE_ncomp_sigs == np.amin(cv_RMSE_ncomp_sigs))[0][0]]
            sig_best = sig_list[np.where(
                cv_RMSE_ncomp_sigs == np.amin(cv_RMSE_ncomp_sigs))[1][0]]
            rpd_best = sample_std / np.amin(cv_RMSE_ncomp_sigs)
            if verbose:
                print("Best RMSE: ", np.amin(cv_RMSE_ncomp_sigs))
                print("Best RPD: ", rpd_best)
                print("Number of latent components: ", ncomp_best)
                print("Best sigma: ", sig_best)
            # Check against all-time best
            if rpd_best > rpd_best_all:
                ncomp_best_all = ncomp_best
                sig_best_all = sig_best
                rpd_best_all = rpd_best
            else:
                # Increase counter if there is no improvement
                non_improve = non_improve + 1
            repeat = False
            # Check whether the best sigma is inside the interquantile range
            if sig_best < np.quantile(sig_list, 0.2) or sig_best > np.quantile(sig_list, 0.8):
                # If not, move the search interval based on the magnitude of the best value
                scale = math.floor(math.log10(sig_best)) - 1
                lower = sig_best - (10**scale) * 5
                upper = sig_best + (10**scale) * 5
                # If the best value sits at an extreme of the interval, expand it a lot that way
                if min(sig_list) == sig_best:
                    lower = sig_best / 2
                elif max(sig_list) == sig_best:
                    upper = sig_best * 2
                # Create new search vector
                sig_list = np.linspace(lower, upper, 10)
                # Repeat evaluation
                repeat = True
            # Terminate early if there has been no improvement in 10 iterations
            if non_improve > 10:
                repeat = False
                print("No improvement, terminating early.")
            if repeat:
                print("new iteration")
        # Set final values to all-time best
        ncomp_best = ncomp_best_all
        sig_best = sig_best_all
        rpd_best = rpd_best_all

    elif optimization == "simple":
        # In "simple" mode sig_start is expected to be a list of sigma values;
        # wrap a scalar so that len() and iteration below work either way.
        # (OSC preprocessing is not applied in this mode.)
        sig_list = sig_start if hasattr(sig_start, "__len__") else [sig_start]
        # Array for storing CV errors
        cv_RMSE_all = np.zeros([len(folds), len(ncomp_range), len(sig_list)])
        i = 0
        for ncomp in ncomp_range:
            j = 0
            for sig in sig_list:
                pls = mcw_pls_sklearn(n_components=ncomp, max_iter=30,
                                      R_initial=None, scale_sigma2=sig)
                k = 0
                for train, val in folds:
                    pls.fit(X[train], Y[train])
                    cv_RMSE_all[k, i, j] = metrics.mean_squared_error(
                        Y[val], pls.predict(X[val]))**0.5
                    k = k + 1
                j = j + 1
            i = i + 1
        # Printing and plotting CV results
        cv_RMSE_ncomp_sigs = np.mean(cv_RMSE_all, axis=0)
        if plot:
            cv_RPD_ncomp_sigs = sample_std / cv_RMSE_ncomp_sigs
            fig = plt.figure(figsize=(10, 5))
            ax = plt.axes(projection="3d")
            # Cartesian indexing (x, y) transposes matrix indexing (i, j)
            x, y = np.meshgrid(list(sig_list), list(ncomp_range))
            z = cv_RPD_ncomp_sigs
            ls = LightSource(270, 45)
            rgb = ls.shade(z, cmap=cm.gist_earth, vert_exag=0.1, blend_mode='soft')
            surf = ax.plot_surface(x, y, z, rstride=1, cstride=1, facecolors=rgb,
                                   linewidth=0, antialiased=False, shade=False)
            plt.show()
        # Best model
        ncomp_best = ncomp_range[np.where(
            cv_RMSE_ncomp_sigs == np.amin(cv_RMSE_ncomp_sigs))[0][0]]
        sig_best = sig_list[np.where(
            cv_RMSE_ncomp_sigs == np.amin(cv_RMSE_ncomp_sigs))[1][0]]
        rpd_best = sample_std / np.amin(cv_RMSE_ncomp_sigs)
        print("Best RMSE: ", np.amin(cv_RMSE_ncomp_sigs))
        print("Best RPD: ", rpd_best)
        print("Number of latent components: ", ncomp_best)
        print("Best sigma: ", sig_best)
    return (ncomp_best, sig_best, rpd_best)
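# --- Standalone sketch of the interval-moving rule used in the grid search ---
# (illustration only; refine_interval is a hypothetical helper, not project
# code). When the best sigma falls outside the 0.2/0.8 quantiles of the current
# search vector, the interval is re-centred on it with a width based on its
# order of magnitude; at either extreme it is expanded instead.
import math
import numpy as np

def refine_interval(search_vec, best):
    if np.quantile(search_vec, 0.2) <= best <= np.quantile(search_vec, 0.8):
        return None  # best value is comfortably inside: stop refining
    scale = math.floor(math.log10(best)) - 1
    lower, upper = best - (10**scale) * 5, best + (10**scale) * 5
    if best == search_vec.min():
        lower = best / 2
    elif best == search_vec.max():
        upper = best * 2
    return np.linspace(lower, upper, 10)

print(refine_interval(np.linspace(0.01, 1.0, 30), 0.01))  # expands downwards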
def svr_cv(self, gam_start=0.001, c_start=100, eps_start=0.1,
           optimization="grid", gridscale=5, non_improve_lim=10, verbose=False,
           osc_params=None):
    # Separating X from Y
    X = self.df[self.freqs].to_numpy()
    Y = self.df[self.y_name].to_numpy().reshape(-1, 1)
    sample_std = np.std(self.df[self.y_name])
    # Guard against the default osc_params=None (len(None) would raise)
    use_osc = osc_params is not None and len(osc_params) == 2
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")

    if optimization == "none":
        cv_RMSE = np.zeros(len(folds))
        # Only use RBF kernels, and standardize the data
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('support vector regression',
                          SVR(kernel="rbf", gamma=gam_start, C=c_start,
                              epsilon=eps_start))])
        l = 0
        for train, val in folds:
            # SVR expects a 1-D target, hence the ravel
            pipe.fit(X[train], Y[train].ravel())
            cv_RMSE[l] = metrics.mean_squared_error(
                Y[val], pipe.predict(X[val]))**0.5
            l = l + 1
        gam_best = gam_start
        c_best = c_start
        eps_best = eps_start
        rpd_best = sample_std / np.mean(cv_RMSE)

    elif optimization == "grid":
        # Create search vectors from the starting values for the grid search
        gam_list = np.linspace(gam_start / gridscale, gam_start * gridscale, 10)
        c_list = np.linspace(c_start / gridscale, c_start * gridscale, 10)
        eps_list = np.linspace(eps_start / gridscale, eps_start * gridscale, 10)
        # Collect the search vectors in a list of ndarrays to keep the code tidy
        param_lists = [gam_list, c_list, eps_list]
        param_best = np.zeros(3)
        rpd_best_all = 0
        non_improve = 0
        repeat = True
        while repeat:
            # Array for storing CV errors
            cv_RMSE_all = np.zeros([len(folds), len(gam_list), len(c_list),
                                    len(eps_list)])
            # Keep the CV iteration outermost to save time when using OSC
            i = 0
            for train, val in folds:
                # If OSC preprocessing is specified
                if use_osc:
                    osc = OSC(nicomp=osc_params[0], ncomp=osc_params[1])
                    osc.fit(X[train], Y[train])
                    X_train_osc = osc.X_osc
                    X_val_osc = osc.transform(X[val])
                j = 0
                for gam in param_lists[0]:
                    k = 0
                    for c in param_lists[1]:
                        l = 0
                        for eps in param_lists[2]:
                            pipe = Pipeline([('scaler', StandardScaler()),
                                             ('support vector regression',
                                              SVR(kernel="rbf", gamma=gam, C=c,
                                                  epsilon=eps))])
                            if use_osc:
                                pipe.fit(X_train_osc, Y[train].ravel())
                                cv_RMSE_all[i, j, k, l] = metrics.mean_squared_error(
                                    Y[val], pipe.predict(X_val_osc))**0.5
                            else:
                                pipe.fit(X[train], Y[train].ravel())
                                cv_RMSE_all[i, j, k, l] = metrics.mean_squared_error(
                                    Y[val], pipe.predict(X[val]))**0.5
                            l = l + 1
                        k = k + 1
                    j = j + 1
                i = i + 1
            cv_RMSE = np.mean(cv_RMSE_all, axis=0)
            # Best model of this iteration
            param_best[0] = param_lists[0][np.where(cv_RMSE == np.amin(cv_RMSE))[0][0]]
            param_best[1] = param_lists[1][np.where(cv_RMSE == np.amin(cv_RMSE))[1][0]]
            param_best[2] = param_lists[2][np.where(cv_RMSE == np.amin(cv_RMSE))[2][0]]
            rpd_best = sample_std / np.amin(cv_RMSE)
            # Check against all-time best
            if rpd_best > rpd_best_all:
                param_best_all = param_best.copy()
                rpd_best_all = rpd_best
            else:
                # Increase counter if there is no improvement
                non_improve = non_improve + 1
            if verbose:
                print("Best RMSE: ", np.amin(cv_RMSE))
                print("Best RPD: ", rpd_best)
                print("Gamma: ", param_best[0])
                print("C: ", param_best[1])
                print("Epsilon: ", param_best[2])
            repeat = False
            for index, p in enumerate(param_best):
                # Check whether the best value is inside the interquantile range
                if p < np.quantile(param_lists[index], 0.2) or p > np.quantile(param_lists[index], 0.8):
                    # If not, move the search interval based on the magnitude of the best value
                    scale = math.floor(math.log10(p)) - 1
                    lower = p - (10**scale) * 5
                    upper = p + (10**scale) * 5
                    # If the best value sits at an extreme of the interval, expand it a lot that way
                    if min(param_lists[index]) == p:
                        lower = min(param_lists[index]) / 2
                    elif max(param_lists[index]) == p:
                        upper = max(param_lists[index]) * 2
                    # Create new search vector
                    param_lists[index] = np.linspace(lower, upper, 10)
                    # Repeat evaluation
                    repeat = True
            # Terminate early if there has been no improvement for too long
            if non_improve > non_improve_lim:
                repeat = False
                print("No improvement, terminating early.")
            if repeat:
                print("new iteration")
        # Set final values to all-time best
        gam_best = param_best_all[0]
        c_best = param_best_all[1]
        eps_best = param_best_all[2]
        rpd_best = rpd_best_all

    # Simulated annealing
    elif optimization == "sa":
        # Number of cycles
        cycles = 100
        # Trials per cycle
        trials = 100
        # Number of accepted solutions
        n_accepted = 0.0
        # Probability of accepting a worse solution at the start
        p_start = 0.3
        # Probability of accepting a worse solution at the end
        p_end = 0.001
        # Initial temperature
        t_start = -1.0 / math.log(p_start)
        # Final temperature
        t_end = -1.0 / math.log(p_end)
        # Use geometric temperature reduction
        frac = (t_end / t_start)**(1.0 / (cycles - 1.0))
        # Starting values
        t = t_start
        dE_mean = 0.0
        gam = gam_start
        c = c_start
        eps = eps_start
        # Calculate starting cost
        cv_RMSE = np.zeros(len(folds))
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('support vector regression',
                          SVR(kernel="rbf", gamma=gam, C=c, epsilon=eps))])
        L = 0
        for train, val in folds:
            pipe.fit(X[train], Y[train].ravel())
            cv_RMSE[L] = metrics.mean_squared_error(
                Y[val], pipe.predict(X[val]))**0.5
            L = L + 1
        cost = np.mean(cv_RMSE)
        rpd = sample_std / cost
        print("starting RPD:", rpd)
        # Current (accepted) result
        gam_old = gam
        c_old = c
        eps_old = eps
        cost_old = cost
        rpd_old = rpd
        # All-time best result
        gam_best = gam
        c_best = c
        eps_best = eps
        cost_best = cost
        rpd_best = rpd
        for i in range(cycles):
            if verbose and i % 10 == 0 and i > 0:
                print('Cycle: ', i, ' with Temperature: ', t)
                print('RPD=', rpd_old, 'Gamma=', gam_old,
                      ', C=', c_old, ', epsilon=', eps_old)
            for j in range(trials):
                # Generate new trial points
                gam = gam_old + (random.random() - 0.5) * 2 / 1000
                c = c_old + (random.random() - 0.5) * 2 * 10
                eps = eps_old + (random.random() - 0.5) * 2 / 100
                # Enforce lower bounds
                gam = max(gam, 0.0000001)
                c = max(c, 0.0000001)
                eps = max(eps, 0)
                # Calculate cost
                cv_RMSE = np.zeros(len(folds))
                pipe = Pipeline([('scaler', StandardScaler()),
                                 ('support vector regression',
                                  SVR(kernel="rbf", gamma=gam, C=c, epsilon=eps))])
                L = 0
                for train, val in folds:
                    pipe.fit(X[train], Y[train].ravel())
                    cv_RMSE[L] = metrics.mean_squared_error(
                        Y[val], pipe.predict(X[val]))**0.5
                    L = L + 1
                cost = np.mean(cv_RMSE)
                rpd = sample_std / cost
                dE = cost - cost_old
                # If the new cost is higher
                if dE > 0:
                    if (i == 0 and j == 0):
                        dE_mean = dE
                    # Probability of accepting the worse point
                    p = math.exp(-dE / (dE_mean * t))
                    # Determine whether to accept it
                    accept = random.random() < p
                else:
                    # New cost is lower, automatically accept
                    accept = True
                # Check whether the cost is lower than the all-time best
                if cost < cost_best:
                    # If new best, store the parameters, cost and RPD
                    gam_best = gam
                    c_best = c
                    eps_best = eps
                    cost_best = cost
                    rpd_best = rpd
                if accept:
                    # Update the accepted parameters, cost and RPD
                    gam_old = gam
                    c_old = c
                    eps_old = eps
                    cost_old = cost
                    rpd_old = rpd
                    # Increment the number of accepted solutions
                    n_accepted = n_accepted + 1
                    # Update the mean energy change
                    dE_mean = (dE_mean * (n_accepted - 1) + abs(dE)) / n_accepted
            # Lower the temperature for the next cycle
            t = frac * t
    else:
        raise InputError("Invalid optimization strategy!")
    # Return the best setting found
    return (gam_best, c_best, eps_best, rpd_best)
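# --- Standalone sketch of the simulated-annealing acceptance rule above ---
# (illustration only, with a toy 1-D cost instead of the SVR CV loop). Worse
# moves are accepted with p = exp(-dE / (dE_mean * t)) and the temperature is
# cooled geometrically, as in the "sa" branch of svr_cv; the running average
# of dE here is a crude stand-in for the exact bookkeeping above.
import math
import random

def toy_cost(x):
    return (x - 3.0)**2            # toy objective with its minimum at x = 3

random.seed(0)
t_start = -1.0 / math.log(0.3)     # p_start = 0.3
t_end = -1.0 / math.log(0.001)     # p_end = 0.001
cycles = 50
frac = (t_end / t_start)**(1.0 / (cycles - 1.0))

t, x_old, c_old, dE_mean = t_start, 0.0, toy_cost(0.0), 1.0
for i in range(cycles):
    x = x_old + (random.random() - 0.5) * 2        # trial move
    dE = toy_cost(x) - c_old
    if dE > 0:
        accept = random.random() < math.exp(-dE / (dE_mean * t))
    else:
        accept = True
    if accept:
        x_old, c_old = x, toy_cost(x)
        dE_mean = (dE_mean + abs(dE)) / 2          # crude running average
    t = frac * t                                   # geometric cooling
print("found x close to", round(x_old, 2))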
def osc_cv(self, nicomp_range=range(10, 130, 10), ncomp_range=range(1, 5),
           epsilon=10e-6, max_iters=20, model="pls",
           model_parameter_range=range(1, 11)):
    # Separating X from Y
    # X needs to be converted from a pandas df to a numpy array
    X = self.df[self.freqs].to_numpy()
    # Y needs to be converted from a pandas series to a numpy array and
    # reshaped from (N,) to (N, 1)
    Y = self.df[self.y_name].to_numpy().reshape(-1, 1)
    # CV based on measurement day
    if self.cval == "MD":
        cv = LeaveOneGroupOut()
        folds = list(cv.split(X=X, y=Y, groups=self.df[self.date_name]))
    # k-fold CV
    elif self.cval == "kfold":
        cv = KFold(n_splits=self.cval_param)
        folds = list(cv.split(X))
    else:
        raise InputError("Invalid CV type!")
    # Matrix of CV values for all possible parameter combinations
    cv_RMSE_all = np.zeros([len(folds), len(model_parameter_range),
                            len(nicomp_range), len(ncomp_range)])
    i = 0
    # Possible internal component counts for OSC
    for nicomp in nicomp_range:
        j = 0
        # Possible removed component counts for OSC
        for ncomp in ncomp_range:
            k = 0
            for train, val in folds:
                # Train OSC
                osc_obj = OSC("SWosc", nicomp, ncomp, epsilon, max_iters)
                X_osc_train, W, P, mu_x = osc_obj.fit(X[train], Y[train])
                # Apply OSC to the validation set; the data are mean centered.
                # Alternatively the training set's mean can be used, if you
                # think it is a better estimate, by passing mean="training"
                X_osc_val = osc_obj.transform(X[val], mean="estimate")
                l = 0
                # Possible model parameter values for PLS
                for param in model_parameter_range:
                    # Set up the PLS model
                    pls = PLSRegression(param, scale=False)
                    # Train PLS
                    pls.fit(X_osc_train, Y[train])
                    # Predict with PLS and calculate the error
                    cv_RMSE_all[k, l, i, j] = metrics.mean_squared_error(
                        Y[val], pls.predict(X_osc_val))**0.5
                    l = l + 1
                k = k + 1
            j = j + 1
        i = i + 1
    # Calculate mean performance across the folds
    cv_RMSE_mean = np.mean(cv_RMSE_all, axis=0)
    # For every OSC parameter combination, take the worst (maximum) RMSE
    # across the PLS parameter settings
    cv_RMSE = np.amax(cv_RMSE_mean, axis=0)
    cv_RPD = np.std(self.df[self.y_name]) / cv_RMSE
    fig = plt.figure(figsize=(10, 5))
    ax = plt.axes(projection="3d")
    # Cartesian indexing (x, y) transposes matrix indexing (i, j)
    x, y = np.meshgrid(list(ncomp_range), list(nicomp_range))
    z = cv_RPD
    ls = LightSource(200, 45)
    rgb = ls.shade(z, cmap=cm.gist_earth, vert_exag=0.1, blend_mode='soft')
    surf = ax.plot_surface(x, y, z, rstride=1, cstride=1, facecolors=rgb,
                           linewidth=0, antialiased=False, shade=False)
    plt.show()
    # Best model
    print("Best RMSE: ", np.amin(cv_RMSE))
    print("Best RPD: ", np.std(self.df[self.y_name]) / np.amin(cv_RMSE))
    print("Number of internal components: ",
          nicomp_range[np.where(cv_RMSE == np.amin(cv_RMSE))[0][0]])
    print("Number of removed components: ",
          ncomp_range[np.where(cv_RMSE == np.amin(cv_RMSE))[1][0]])
    return cv_RMSE
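# --- Standalone sketch (illustration only): recovering the best parameter pair
# from the 2-D RMSE grid returned by osc_cv. np.where on the minimum gives the
# (nicomp, ncomp) indices, as in the print-out above; np.unravel_index is the
# idiomatic one-liner for the same lookup. The RMSE grid below is random toy data.
import numpy as np

nicomp_range, ncomp_range = range(10, 130, 10), range(1, 5)
cv_RMSE_demo = np.random.default_rng(2).uniform(
    0.5, 2.0, (len(nicomp_range), len(ncomp_range)))
i, j = np.unravel_index(np.argmin(cv_RMSE_demo), cv_RMSE_demo.shape)
print("internal components:", list(nicomp_range)[i],
      "removed components:", list(ncomp_range)[j])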