Example #1
 def pls_cv(self,ncomp_range=range(1,21),plot=False,verbose=False,
            osc_params=(10,1)):
     # Separating X from Y for PLS
     X=self.df[self.freqs].to_numpy()
     Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
     sample_std=np.std(self.df[self.y_name])
     
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")
     
     # Array for storing CV errors
     cv_RMSE_all=np.zeros([len(folds),len(ncomp_range)])
     i=0
     for train, val in folds:
         # If an OSC pre-processing model is specified; osc_params must be a
         # 2-tuple (nicomp, ncomp), so pass e.g. an empty tuple to skip OSC
         if len(osc_params)==2:
             osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
             osc.fit(X[train], Y[train])
             X_train_osc=osc.X_osc
             X_val_osc=osc.transform(X[val])
         j=0
         for ncomp in ncomp_range:
             pls = PLSRegression(n_components=ncomp,scale=False)
             if len(osc_params)==2:
                 pls.fit(X_train_osc, Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                     Y[val], pls.predict(X_val_osc))**0.5
             else:
                 pls.fit(X[train], Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                         Y[val], pls.predict(X[val]))**0.5
             j=j+1
         i=i+1
     # Printing and plotting CV results
     cv_RMSE_ncomp=np.mean(cv_RMSE_all,axis=0)
     cv_RPD_ncomp=sample_std/cv_RMSE_ncomp
     if plot:
         fig = plt.figure(figsize=(12,8))
         plt.gca().xaxis.grid(True)
         plt.xticks(ncomp_range)
         plt.ylabel("RPD")
         plt.xlabel("Number of components")
         plt.plot(ncomp_range,cv_RPD_ncomp)
     # Best model
     rpd_best=max(cv_RPD_ncomp)
     ncomp_best=ncomp_range[cv_RMSE_ncomp.argmin()]
     if verbose:
         print("Best RMSE: ",min(cv_RMSE_ncomp))
         print("Best RPD: ",rpd_best)
         print("Number of latent components: ",ncomp_best)
     return (ncomp_best,rpd_best)
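
Each *_cv method in these examples builds its folds the same way before anything model-specific happens: leave-one-group-out over measurement days, or plain k-fold. Below is a minimal, self-contained sketch of that shared pattern; the data and the `dates` labels are synthetic stand-ins for self.df, not taken from the original class.

    import numpy as np
    from sklearn.model_selection import KFold, LeaveOneGroupOut

    # Synthetic stand-ins for self.df[self.freqs] and self.df[self.y_name]
    rng = np.random.default_rng(0)
    X = rng.normal(size=(30, 5))
    Y = rng.normal(size=(30, 1))
    # Hypothetical measurement-day labels: three days, ten samples each
    dates = np.repeat(["2020-01-01", "2020-01-02", "2020-01-03"], 10)

    cval = "MD"  # or "kfold"
    if cval == "MD":
        # One fold per measurement day: validate on a day never seen in training
        folds = list(LeaveOneGroupOut().split(X=X, y=Y, groups=dates))
    else:
        folds = list(KFold(n_splits=5).split(X))

    for train, val in folds:
        print(len(train), "train /", len(val), "validation samples")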
Example #2
 def fssregression_cv(self,inner_cv="kfold",inner_cv_param=5,maxvar=2,verbose=False,
                      osc_params=(10,1)):
     # inner_cv can be "kfold" or "none"
     # Separating X from Y
     X=self.df[self.freqs]
     Y=self.df[self.y_name]
     
     # Create list for selected variables
     best_vars=[]
     reg = FSSRegression(inner_cv,inner_cv_param,maxvar)
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")  
     i=0
     #Array for cv values
     cv_RMSE_all=np.zeros([len(folds)])
     for train,val in folds:
         # If OSC model specified
         if len(osc_params)==2:
             osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
             # FSSR needs column names, so it uses pandas, but osc uses numpy arrays
             osc.fit(X.iloc[train].to_numpy(), Y.iloc[train].to_numpy().reshape(-1,1))
             X_train_osc=pd.DataFrame(data=osc.X_osc,columns=self.freqs)
             X_val_osc=pd.DataFrame(data=osc.transform(X.iloc[val].to_numpy()),columns=self.freqs)
             # Fit and predict
             reg.fit(X_train_osc, Y.iloc[train])
             cv_RMSE_all[i]=metrics.mean_squared_error(
                     Y.iloc[val], reg.predict(X_val_osc))**0.5
             best_vars.append(reg.bestvar) 
         else:
             reg.fit(X.iloc[train], Y.iloc[train])
             cv_RMSE_all[i]=metrics.mean_squared_error(
                     Y.iloc[val], reg.predict(X.iloc[val]))**0.5
             best_vars.append(reg.bestvar)        
         i=i+1
     cv_RMSE=np.mean(cv_RMSE_all)
     rpd=np.std(self.df[self.y_name])/cv_RMSE
     if verbose:
         print("RMSE: ",cv_RMSE)
         print("RPD: ",rpd)
         print("Selected freqs: ",best_vars)
         # Per-day breakdown; assumes measurement-day CV (one fold per day)
         k=0
         for day in self.df[self.date_name].unique():
             print("Date: {0}, Measurements: {1:.0f}, RMSE: {2:.2f}, number of selected vars: {3}"
                   .format(
                       np.datetime_as_string(day,unit='D'),
                       sum(self.df[self.date_name]==day),
                       cv_RMSE_all[k],
                       len(best_vars[k])))
             k=k+1
           
     return rpd
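
The comment in the OSC branch above is worth unpacking: FSSRegression selects variables by column name, so arrays coming back from OSC have to be wrapped in a DataFrame again. A small sketch of that round-trip, with a made-up `freqs` list standing in for self.freqs and a trivial centering step standing in for OSC:

    import numpy as np
    import pandas as pd

    freqs = ["f1", "f2", "f3"]  # hypothetical frequency column names
    X = pd.DataFrame(np.random.rand(4, 3), columns=freqs)

    # Stand-in for OSC: any correction that consumes and returns bare ndarrays
    X_arr = X.to_numpy() - X.to_numpy().mean(axis=0)

    # Re-wrap the corrected array so column-name-based selection still works
    X_corrected = pd.DataFrame(data=X_arr, columns=freqs)
    print(X_corrected[["f1", "f3"]])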
Example #3
 def __init__(self):
     super(LongWebW, self).__init__()
     self.grid = QGridLayout(self)
     self.osc = OSC()
     self.osc.clear()
     self.view_list = [LongWebView(), LongWebView()] 
     self.view_list[0].setSizePolicy(QSizePolicy.MinimumExpanding,QSizePolicy.Preferred)
     self.view_list[1].setSizePolicy(QSizePolicy.MinimumExpanding,QSizePolicy.Preferred)
     self.onData_list = [[], []] # [[(cur_set, onset), ...], [(cur_set, onset), ...]]
     self.offData_list = [[], []] # [[(cur_set, offset), ...], [(cur_set, offset), ...]]
     self.onOff_list = [[], []]
     self.grid.addWidget(self.view_list[0], 0,0,1,1)
     self.grid.addWidget(self.view_list[1], 1,0,1,1)
     self.setMinimumWidth(1200)
     self.setFixedHeight(700)
     self.cur_notes_set = set()
     self.lock = threading.RLock()
     self.idx = 0
     self.auto = False
     self.renewParserT("molihua.abc")
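
For context, the layout calls above follow the standard Qt grid signature addWidget(widget, row, column, rowSpan, columnSpan). A stripped-down, runnable sketch of the same two-row arrangement, assuming PyQt5 (the binding is a guess) and using QLabel placeholders instead of LongWebView:

    import sys
    from PyQt5.QtWidgets import (QApplication, QGridLayout, QLabel,
                                 QSizePolicy, QWidget)

    app = QApplication(sys.argv)
    w = QWidget()
    grid = QGridLayout(w)
    views = [QLabel("top view"), QLabel("bottom view")]  # LongWebView stand-ins
    for v in views:
        v.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Preferred)
    grid.addWidget(views[0], 0, 0, 1, 1)  # row 0, col 0, spanning 1 row x 1 col
    grid.addWidget(views[1], 1, 0, 1, 1)  # row 1, directly below
    w.show()
    sys.exit(app.exec_())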
Example #4
    def ipls_cv(self,version="basic",nint_list=[8,16,32],ncomp_range=range(1,10),
                inner_cv="kfold",inner_cv_param=5,verbose=True,
                osc_params=(10,1)):
               
        X=self.df[self.freqs]
        Y=self.df[self.y_name]

        # CV based on measurement day
        if self.cval=="MD":
            cv = LeaveOneGroupOut()
            folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
        # kfold CV
        elif self.cval=="kfold":
            cv = KFold(n_splits=self.cval_param)
            folds=list(cv.split(X))
        else:
            raise InputError("Invalid CV type!") 
            
        #Array for cv values
        cv_RMSE_all=np.zeros([len(folds),len(ncomp_range),len(nint_list)])
        i=0
        for train,val in folds:
            # If OSC model specified
            if len(osc_params)==2:
                osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
                # IPLS needs column names, so it uses pandas, but osc uses numpy arrays
                osc.fit(X.iloc[train].to_numpy(), Y.iloc[train].to_numpy().reshape(-1,1))
                X_train_osc=pd.DataFrame(data=osc.X_osc,columns=self.freqs)
                X_val_osc=pd.DataFrame(data=osc.transform(X.iloc[val].to_numpy()),columns=self.freqs)
            j=0
            for ncomp in ncomp_range:
                k=0
                for nint in nint_list:
                    ipls_obj=IntervalPLSRegression(ncomp=ncomp,nint=nint,
                                                       cv_type=inner_cv,cv_param=inner_cv_param)
                    if len(osc_params)==2:
                        ipls_obj.fit(X_train_osc, Y.iloc[train])
                        cv_RMSE_all[i,j,k]=metrics.mean_squared_error(
                                 Y.iloc[val], ipls_obj.predict(X_val_osc))**0.5
                    else:
                        ipls_obj.fit(X.iloc[train], Y.iloc[train])
                        cv_RMSE_all[i,j,k]=metrics.mean_squared_error(
                                 Y.iloc[val], ipls_obj.predict(X.iloc[val]))**0.5
                    k=k+1
                j=j+1
            i=i+1
        cv_RMSE=np.mean(cv_RMSE_all,axis=0)
        RMSE_best=np.amin(cv_RMSE)
        
        rpd_best=np.std(self.df[self.y_name])/RMSE_best
        # Best model: unravel the grid minimum into (ncomp, nint) indices
        idx_best=np.unravel_index(cv_RMSE.argmin(), cv_RMSE.shape)
        ncomp_best=ncomp_range[idx_best[0]]
        nint_best=nint_list[idx_best[1]]
        if verbose:
            print("Best RMSE: ",RMSE_best)
            print("Best RPD: ",rpd_best)
            print("Number of components:",ncomp_best)
            print("Number of intervals:",nint_best)
        return (ncomp_best,nint_best,rpd_best)
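
The tail of ipls_cv (average over the fold axis, then map the grid minimum back to parameter values) is easy to sanity-check in isolation; a sketch on a synthetic error cube:

    import numpy as np

    ncomp_range = range(1, 10)
    nint_list = [8, 16, 32]
    rng = np.random.default_rng(1)
    # folds x components x intervals, matching the shape of cv_RMSE_all above
    cv_RMSE_all = rng.uniform(0.5, 2.0, size=(5, len(ncomp_range), len(nint_list)))

    cv_RMSE = cv_RMSE_all.mean(axis=0)  # average out the fold axis
    best = np.unravel_index(cv_RMSE.argmin(), cv_RMSE.shape)
    print("best ncomp:", ncomp_range[best[0]])
    print("best nint:", nint_list[best[1]])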
Example #5
    def mcw_pls_cv(self,ncomp_range=range(1,21),sig_start=0.1,optimization="grid",
                   plot=False,verbose=True,
                   osc_params=(10,1)):
        # Separating X from Y for PLS
        # Needs to be converted to numpy array from pandas df
        X=self.df[self.freqs].to_numpy()
        # Y need to be converted to numpy array from pandas series and reshaped to (N,1) from (N,)
        Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
        sample_std=np.std(self.df[self.y_name])
        
        # CV based on measurement day
        if self.cval=="MD":
            cv = LeaveOneGroupOut()
            folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
        # kfold CV
        elif self.cval=="kfold":
            cv = KFold(n_splits=self.cval_param)
            folds=list(cv.split(X))
        else:
            raise InputError("Invalid CV type!")  
        
        if optimization=="grid":
            # Create a search vector from starting values for gridsearch
            sig_list=np.linspace(sig_start/10,sig_start*10,30)

            rpd_best_all=0
            non_improve=0
            
            repeat=True
            while repeat:
                # Array for storing CV errors
                cv_RMSE_all=np.zeros([len(folds),len(ncomp_range),len(sig_list)])
                i=0
                for train,val in folds:
                    # If OSC model specified
                    if len(osc_params)==2:
                        osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
                        osc.fit(X[train], Y[train])
                        X_train_osc=osc.X_osc
                        X_val_osc=osc.transform(X[val])
                    j=0
                    for ncomp in ncomp_range:
                        k=0
                        for sig in sig_list:
                            if len(osc_params)==2:
                                pls = mcw_pls_sklearn(n_components=ncomp, max_iter=30, R_initial=None, scale_sigma2=sig)
                                pls.fit(X_train_osc, Y[train])
                                cv_RMSE_all[i,j,k]=metrics.mean_squared_error(
                                        Y[val], pls.predict(X_val_osc))**0.5
                            else:        
                                pls = mcw_pls_sklearn(n_components=ncomp, max_iter=30, R_initial=None, scale_sigma2=sig)
                                pls.fit(X[train], Y[train])
                                cv_RMSE_all[i,j,k]=metrics.mean_squared_error(
                                        Y[val], pls.predict(X[val]))**0.5
                            k=k+1
                        j=j+1
                    i=i+1

                cv_RMSE_ncomp_sigs=np.mean(cv_RMSE_all,axis=0)

                # Best model
                ncomp_best=ncomp_range[np.where(
                        cv_RMSE_ncomp_sigs==np.amin(cv_RMSE_ncomp_sigs))[0][0]]
                sig_best=sig_list[np.where(
                        cv_RMSE_ncomp_sigs==np.amin(cv_RMSE_ncomp_sigs))[1][0]]
                rpd_best=sample_std/np.amin(cv_RMSE_ncomp_sigs)
                if verbose:
                    print("Best RMSE: ",np.amin(cv_RMSE_ncomp_sigs))
                    print("Best RPD: ",rpd_best)
                    print("Number of latent components: ",ncomp_best)
                    print("Best sigma: ",sig_best)
           
                # Check against all time best
                if rpd_best>rpd_best_all:
                    ncomp_best_all = ncomp_best
                    sig_best_all = sig_best
                    rpd_best_all= rpd_best
                else:
                    # Increase counter if there is no improvement
                    non_improve=non_improve+1
                repeat=False
                # Check if best value is in IQ range
                if sig_best<np.quantile(sig_list,0.2) or sig_best>np.quantile(sig_list,0.8):
                    # If not, move the search interval based on the magnitude of the best value
                    scale=math.floor(math.log10(sig_best))-1
                    lower=sig_best-(10**scale)*5
                    upper=sig_best+(10**scale)*5
                    # If the best value sits at either end of the interval,
                    # expand the interval further in that direction
                    if min(sig_list)==sig_best:
                        lower=sig_best/2
                    elif max(sig_list)==sig_best:
                        upper=sig_best*2
                    # Create new search vector
                    sig_list=np.linspace(lower,upper,10)
                    # Repeat evaluation
                    repeat=True
                # Terminate early if no improvements in 10 iterations        
                if non_improve>10:
                    repeat=False
                    print("No improvement, terminate early.")
                if repeat:
                    print("new iteration")
            # Set final values to all time best
            ncomp_best=ncomp_best_all
            sig_best=sig_best_all
            rpd_best=rpd_best_all
        

        elif optimization=="simple":

            # Array for storing CV errors
            sig_list=sig_start
            cv_RMSE_all=np.zeros([len(folds),len(ncomp_range),len(sig_list)])
            
            i=0
            for ncomp in ncomp_range:
                j=0
                for sig in sig_list:
                    pls = mcw_pls_sklearn(n_components=ncomp, max_iter=30, R_initial=None, scale_sigma2=sig)
                    k=0
                    for train,val in folds:
                        pls.fit(X[train], Y[train])
                        cv_RMSE_all[k,i,j]=metrics.mean_squared_error(
                                Y[val], pls.predict(X[val]))**0.5
                        k=k+1
                    j=j+1
                i=i+1
            
            # Printing and plotting CV results
            cv_RMSE_ncomp_sigs=np.mean(cv_RMSE_all,axis=0)
            if plot:
                cv_RPD_ncomp_sigs=sample_std/cv_RMSE_ncomp_sigs
                fig = plt.figure(figsize=(10,5))
                ax = plt.axes(projection="3d")
                # Cartesian indexing (x,y) transposes matrix indexing (i,j)
                x, y = np.meshgrid(list(sig_list),list(ncomp_range))
                z=cv_RPD_ncomp_sigs
                ls = LightSource(270, 45)
                rgb = ls.shade(z, cmap=cm.gist_earth, vert_exag=0.1, blend_mode='soft')
                surf = ax.plot_surface(x, y, z, rstride=1, cstride=1, facecolors=rgb,
                                       linewidth=0, antialiased=False, shade=False)
    
                plt.show()
            # Best model
            ncomp_best=ncomp_range[np.where(
                    cv_RMSE_ncomp_sigs==np.amin(cv_RMSE_ncomp_sigs))[0][0]]
            sig_best=sig_list[np.where(
                    cv_RMSE_ncomp_sigs==np.amin(cv_RMSE_ncomp_sigs))[1][0]]
            rpd_best=sample_std/np.amin(cv_RMSE_ncomp_sigs)
            print("Best RMSE: ",np.amin(cv_RMSE_ncomp_sigs))
            print("Best RPD: ",rpd_best)
            print("Number of latent components: ",ncomp_best)
            print("Best sigma: ",sig_best)
    
        return (ncomp_best,sig_best,rpd_best)
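
The grid branch above keeps re-centering its sigma search vector whenever the optimum falls into the outer quintiles of the current grid. That logic is self-contained enough to sketch on its own; the function name and return convention are invented here:

    import math
    import numpy as np

    def recenter_grid(grid, best):
        # Keep the grid if the best value lies in the central 20-80% band
        if np.quantile(grid, 0.2) <= best <= np.quantile(grid, 0.8):
            return grid, False
        # Otherwise choose a step size from the magnitude of the best value
        scale = math.floor(math.log10(best)) - 1
        lower = best - (10**scale) * 5
        upper = best + (10**scale) * 5
        # If the best value sits at an edge, stretch the interval that way
        if best == grid.min():
            lower = best / 2
        elif best == grid.max():
            upper = best * 2
        return np.linspace(lower, upper, 10), True

    grid = np.linspace(0.01, 1.0, 30)
    print(recenter_grid(grid, grid[0]))  # an edge optimum triggers a move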
Example #6
 def svr_cv(self,gam_start=0.001,
            c_start=100,
            eps_start=0.1,
            optimization="grid",gridscale=5,non_improve_lim=10,verbose=False,
            osc_params=None):
     # Separating X from Y for PLS
     X=self.df[self.freqs].to_numpy()
     Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
     sample_std=np.std(self.df[self.y_name])
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")
     
     if optimization=="none":
         cv_RMSE=np.zeros(len(folds))
         # Only use RBF kernels, also standardize data
         pipe = Pipeline([('scaler', StandardScaler()),
                                          ('support vector regression',
                                           SVR(kernel="rbf",gamma=gam_start,C=c_start,epsilon=eps_start))])
          l=0
          for train, val in folds:
              pipe.fit(X[train], Y[train])
              cv_RMSE[l]=metrics.mean_squared_error(
                      Y[val], pipe.predict(X[val]))**0.5
              l=l+1
          # Final values (computed once, after all folds)
          gam_best=gam_start
          c_best=c_start
          eps_best=eps_start
          rpd_best=sample_std/np.mean(cv_RMSE)
             
     elif optimization=="grid":
         # Create a search vector from starting values for gridsearch
         gam_list=np.linspace(gam_start/gridscale,gam_start*gridscale,10)
         c_list=np.linspace(c_start/gridscale,c_start*gridscale,10)
         eps_list=np.linspace(eps_start/gridscale,eps_start*gridscale,10)
         
          # Collect the parameter search vectors in a list,
          # which helps keep the code tidy
         param_lists=[gam_list,c_list,eps_list]
         param_best=np.zeros(3)
         rpd_best_all=0
         non_improve=0
         
         repeat=True
         while repeat:
             # Array for storing CV errors
             cv_RMSE_all=np.zeros([len(folds),len(gam_list),len(c_list),len(eps_list)])
             # Put the CV iteration outside to save time when using OSC
             i=0
             for train, val in folds:
                  # If OSC model specified (guard: osc_params defaults to None)
                  if osc_params is not None and len(osc_params)==2:
                     osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
                     osc.fit(X[train], Y[train])
                     X_train_osc=osc.X_osc
                     X_val_osc=osc.transform(X[val])
                 j=0
                 for gam in param_lists[0]:
                     k=0
                     for c in param_lists[1]:
                         l=0
                         for eps in param_lists[2]: 
                             pipe = Pipeline([('scaler', StandardScaler()),
                                              ('support vector regression', SVR(kernel="rbf",gamma=gam,C=c,epsilon=eps))])
                              if osc_params is not None and len(osc_params)==2:
                                 pipe.fit(X_train_osc, Y[train])
                                 cv_RMSE_all[i,j,k,l]=metrics.mean_squared_error(
                                     Y[val], pipe.predict(X_val_osc))**0.5
                             else:
                                 pipe.fit(X[train], Y[train])
                                 cv_RMSE_all[i,j,k,l]=metrics.mean_squared_error(
                                         Y[val], pipe.predict(X[val]))**0.5
                             l=l+1
                         k=k+1
                     j=j+1
                 i=i+1
             cv_RMSE=np.mean(cv_RMSE_all,axis=0)
        
             # Best model
             param_best[0]=param_lists[0][np.where(
                     cv_RMSE==np.amin(cv_RMSE))[0][0]]
             param_best[1]=param_lists[1][np.where(
                     cv_RMSE==np.amin(cv_RMSE))[1][0]]
             param_best[2]=param_lists[2][np.where(
                     cv_RMSE==np.amin(cv_RMSE))[2][0]]
             rpd_best=sample_std/np.amin(cv_RMSE)
             # Check against all time best
             if rpd_best>rpd_best_all:
                 param_best_all = param_best.copy()
                 rpd_best_all=rpd_best
             else:
                 # Increase counter if there is no improvement
                 non_improve=non_improve+1
              if verbose:
                 print("Best RMSE: ",np.amin(cv_RMSE))
                 print("Best RPD: ",rpd_best)
                 print("Gamma: ",param_best[0])
                 print("C: ",param_best[1])
                 print("Epsilon: ",param_best[2])
             repeat=False
             for index,p in enumerate(param_best):
                 # Check if best value is in IQ range
                 if p<np.quantile(param_lists[index],0.2) or p>np.quantile(param_lists[index],0.8):
                     # If not, move the search interval based on the magnitude of the best value
                     scale=math.floor(math.log10(p))-1
                     lower=p-(10**scale)*5
                     upper=p+(10**scale)*5
                     # If best value is at the extreme of the interval expand it by a lot that way
                     if min(param_lists[index])==p:
                         lower=min(param_lists[index])/2
                     elif max(param_lists[index])==p:
                         upper=max(param_lists[index])*2
                     # Create new search vector
                     param_lists[index]=np.linspace(lower,upper,10)
                     # Repeat evaluation
                     repeat=True
              # Terminate early after non_improve_lim refreshes without improvement
             if non_improve>non_improve_lim:
                 repeat=False
                 print("No improvement, terminate early.")
             if repeat:
                 print("new iteration")
         # Set final values to all time best
         gam_best=param_best_all[0]
         c_best=param_best_all[1]
         eps_best=param_best_all[2]
         rpd_best=rpd_best_all
    
     # Simulated annealing
     elif optimization=="sa":
             # Number of cycles
             cycles = 100
             # Trials per cycle
             trials = 100
             # Number of accepted solutions
             n_accepted = 0.0
             # Probability of accepting worse solution at the start
             p_start = 0.3
             # Probability of accepting worse solution at the end
             p_end = 0.001
             # Initial temperature
             t_start = -1.0/math.log(p_start)
             # Final temperature
             t_end = -1.0/math.log(p_end)
             # Use geometric temp reduction
             frac = (t_end/t_start)**(1.0/(cycles-1.0))
             # Starting values
             t=t_start
             dE_mean = 0.0
             gam=gam_start
             c=c_start
             eps=eps_start
             # Calculate starting cost
             cv_RMSE=np.zeros(len(folds))
             pipe = Pipeline([('scaler', StandardScaler()),
                                  ('support vector regression',
                                   SVR(kernel="rbf",gamma=gam,C=c,epsilon=eps))])
              L=0
              for train, val in folds:
                  pipe.fit(X[train], Y[train])
                  cv_RMSE[L]=metrics.mean_squared_error(
                          Y[val], pipe.predict(X[val]))**0.5
                  L=L+1
             cost=np.mean(cv_RMSE)
             rpd=sample_std/cost
             print("starting RPD:",rpd)
             # Best results
             gam_old = gam
             c_old = c
             eps_old = eps
             cost_old=cost
             rpd_old=rpd
             # All time best result
             gam_best = gam
             c_best = c
             eps_best = eps
             cost_best=cost
             rpd_best = rpd
             for i in range(cycles):
                 if verbose and i%10==0 and i>0:
                     print('Cycle: ', i ,' with Temperature: ', t)
                     print('RPD=',rpd_old,'Gamma='  ,gam_old,', C=' ,c_old,', epsilon=',eps_old)
                 for j in range(trials):
                     # Generate new trial points
                     gam = gam_old + (random.random()-0.5)*2/1000
                     c = c_old + (random.random()-0.5)*2*10
                     eps = eps_old + (random.random()-0.5)*2/100
                     # Enforce lower bounds
                     gam = max(gam,0.0000001)
                     c = max(c,0.0000001)
                     eps = max(eps,0)
                     # Calculate cost
                     cv_RMSE=np.zeros(len(folds))
                     pipe = Pipeline([('scaler', StandardScaler()),
                                          ('support vector regression',
                                           SVR(kernel="rbf",gamma=gam,C=c,epsilon=eps))])
                      L=0
                      for train, val in folds:
                          pipe.fit(X[train], Y[train])
                          cv_RMSE[L]=metrics.mean_squared_error(
                                  Y[val], pipe.predict(X[val]))**0.5
                          L=L+1
                     cost=np.mean(cv_RMSE)
                     rpd=sample_std/cost
                     dE = cost-cost_old
                     # If new cost is higher
                     if dE > 0:
                         if (i==0 and j==0): dE_mean = dE
                         # Generate probability of acceptance
                         p = math.exp(-dE/(dE_mean * t))
                         # Determine whether to accept worse point
                         if (random.random()<p):
                             accept = True
                         else:
                             accept = False
                     else:
                         # New cost is lower, automatically accept
                         accept = True
                         # Check if cost is lower than all time best
                         if cost<cost_best:
                             # If new best, store the parameters, cost and RPD
                             gam_best=gam
                             c_best=c
                             eps_best=eps
                             cost_best=cost
                             rpd_best=rpd
                     if accept==True:
                         # Update parameters, cost and RPD
                         gam_old = gam
                         c_old = c
                         eps_old = eps
                         cost_old=cost
                         rpd_old=rpd
                         # Increment number of accepted solutions
                         n_accepted = n_accepted + 1
                         # Update energy change
                         dE_mean = (dE_mean * (n_accepted-1) +  abs(dE)) / n_accepted
                  # Lower the temperature for the next cycle
                  t = frac * t
              # The best settings found are returned at the end of the method
     else:
         raise InputError("Invalid optimization strategy!")
     return (gam_best,c_best,eps_best,rpd_best)
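
The "sa" branch is a textbook simulated-annealing loop: geometric cooling between two target acceptance probabilities, plus a Metropolis-style test on the cost increase normalized by its running mean. A compact sketch of just that rule, minimizing a toy 1-D function; all names are local to this sketch:

    import math
    import random

    def anneal(cost_fn, x0, step=0.5, cycles=100, trials=50,
               p_start=0.3, p_end=0.001):
        # Temperatures chosen so a "typical" uphill move is accepted with
        # probability p_start in the first cycle and p_end in the last
        t = -1.0 / math.log(p_start)
        t_end = -1.0 / math.log(p_end)
        frac = (t_end / t) ** (1.0 / (cycles - 1.0))  # geometric cooling factor
        x_old, cost_old = x0, cost_fn(x0)
        x_best, cost_best = x_old, cost_old
        dE_mean, n_accepted = 0.0, 0
        for i in range(cycles):
            for j in range(trials):
                x = x_old + (random.random() - 0.5) * 2 * step  # trial point
                cost = cost_fn(x)
                dE = cost - cost_old
                if dE > 0:
                    if i == 0 and j == 0:
                        dE_mean = dE  # initialize the normalizer
                    # Metropolis rule on the normalized cost increase
                    accept = random.random() < math.exp(-dE / (dE_mean * t))
                else:
                    accept = True  # downhill moves are always accepted
                    if cost < cost_best:
                        x_best, cost_best = x, cost
                if accept:
                    x_old, cost_old = x, cost
                    n_accepted += 1
                    # Update the running mean of |dE| over accepted moves
                    dE_mean = (dE_mean * (n_accepted - 1) + abs(dE)) / n_accepted
            t *= frac  # cool down
        return x_best, cost_best

    print(anneal(lambda x: (x - 3.0) ** 2, x0=0.0))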
Example #7
    def osc_cv(self,nicomp_range=range(10,130,10),ncomp_range=range(1,5),epsilon = 10e-6,
               max_iters = 20,model="pls",model_parameter_range=range(1,11)):
        # Separating X from Y for PLS
        # Needs to be converted to numpy array from pandas df
        X=self.df[self.freqs].to_numpy()
        # Y need to be converted to numpy array from pandas series and reshaped to (N,1) from (N,)
        Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
        # CV based on measurement day
        if self.cval=="MD":
            cv = LeaveOneGroupOut()
            folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
        # kfold CV
        elif self.cval=="kfold":
            cv = KFold(n_splits=self.cval_param)
            folds=list(cv.split(X))
        else:
            raise InputError("Invalid CV type!")

        #Matrix for cv values for all the possible parameter combinations
        cv_RMSE_all=np.zeros([len(folds),len(model_parameter_range),len(nicomp_range),len(ncomp_range)])
        i=0
        #possible internal component values for osc
        for nicomp in nicomp_range:
            j=0
            #possible removed component values for osc
            for ncomp in ncomp_range:    
                k=0
                for train, val in folds:
                    # train osc
                    osc_obj=OSC("SWosc",nicomp,ncomp,epsilon, max_iters)
                    X_osc_train, W,P,mu_x=osc_obj.fit(X[train],Y[train])
                    # apply osc on validation set
                    # mean-center the data; pass mean="training" instead to use
                    # the training set's mean if you consider it a better estimate
                    X_osc_val=osc_obj.transform(X[val],mean="estimate")
                    l=0                           
                    # possible model parameter values for PLS
                    for param in model_parameter_range:                        
                        #setup pls model
                        pls = PLSRegression(param,scale=False)
                        #train pls
                        pls.fit(X_osc_train, Y[train])
                        #predict with pls and calculate error
                        cv_RMSE_all[k,l,i,j]=metrics.mean_squared_error(
                                Y[val], pls.predict(X_osc_val))**0.5
                        l=l+1
                    k=k+1
                j=j+1
            i=i+1
            
        # Calculate mean performance across the folds
        cv_RMSE_mean=np.mean(cv_RMSE_all,axis=0)
        # For each OSC parameter combination take the worst-case (maximum)
        # mean RMSE across the PLS component counts (a minimax criterion)
        cv_RMSE=np.amax(cv_RMSE_mean, axis=0)
        cv_RPD=np.std(self.df[self.y_name])/cv_RMSE
        fig = plt.figure(figsize=(10,5))
        ax = plt.axes(projection="3d")
        # Cartesian indexing (x,y) transposes matrix indexing (i,j)
        x, y = np.meshgrid(list(ncomp_range),list(nicomp_range))
        z=cv_RPD
        ls = LightSource(200, 45)
        rgb = ls.shade(z, cmap=cm.gist_earth, vert_exag=0.1, blend_mode='soft')
        surf = ax.plot_surface(x, y, z, rstride=1, cstride=1, facecolors=rgb,
                               linewidth=0, antialiased=False, shade=False)
        plt.show()
        # Best model
        print("Best RMSE: ",np.amin(cv_RMSE))
        print("Best RPD: ",np.std(self.df[self.y_name])/np.amin(cv_RMSE))
        print("Number of internal components: ",nicomp_range[np.where(
                cv_RMSE==np.amin(cv_RMSE))[0][0]])
        print("Number of removed components: ",ncomp_range[np.where(
                cv_RMSE==np.amin(cv_RMSE))[1][0]])
        return cv_RMSE
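
The shaded-surface plot used here and in mcw_pls_cv follows a standard matplotlib recipe; a stand-alone version over a synthetic RPD grid (mind the meshgrid transpose noted in the comments):

    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib import cm
    from matplotlib.colors import LightSource

    nicomp_range = range(10, 130, 10)
    ncomp_range = range(1, 5)
    rng = np.random.default_rng(2)
    # Synthetic RPD grid with shape (len(nicomp_range), len(ncomp_range))
    cv_RPD = 2.0 + rng.random((len(nicomp_range), len(ncomp_range)))

    fig = plt.figure(figsize=(10, 5))
    ax = plt.axes(projection="3d")
    # Cartesian indexing (x, y) transposes matrix indexing (i, j),
    # so x varies along the second array axis
    x, y = np.meshgrid(list(ncomp_range), list(nicomp_range))
    ls = LightSource(200, 45)
    rgb = ls.shade(cv_RPD, cmap=cm.gist_earth, vert_exag=0.1, blend_mode='soft')
    ax.plot_surface(x, y, cv_RPD, rstride=1, cstride=1, facecolors=rgb,
                    linewidth=0, antialiased=False, shade=False)
    ax.set_xlabel("removed components")
    ax.set_ylabel("internal components")
    ax.set_zlabel("RPD")
    plt.show()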