def rfcScores(self, Xn, y, cv=5, param_name='max_depth',
              estimatorsRange=(10, 11, 1), paramRange=(1, 10, 1),
              trainW=1, testW=2, title='Random Forest classifier',
              clfArg=None, plot=False):
    """Grid-evaluate a Random Forest classifier and score each configuration.

    Runs sklearn's ``validation_curve`` for every combination of split
    criterion ('gini' and 'entropy') and ``n_estimators`` value, sweeping
    ``param_name`` (default ``max_depth``) over ``paramRange``.

    Parameters
    ----------
    Xn, y : array-like
        Feature matrix and target vector passed straight to
        ``validation_curve``.
    cv : int
        Number of cross-validation folds.
    param_name : str
        RFC parameter swept by the validation curve.
    estimatorsRange, paramRange : tuple(start, stop, step)
        ``np.arange``-style ranges for ``n_estimators`` and for
        ``param_name`` respectively (``stop`` is exclusive).
    trainW, testW : number
        Weights used downstream to compute
        ``weighted_score = (test_score*testW + train_score*trainW) / (testW + trainW)``.
    title : str
        Base label prepended to each per-configuration title.
    clfArg : dict or None
        Extra keyword arguments forwarded to the RFC constructor.
        ``None`` (the default) means no extra arguments.
    plot : bool
        Forwarded to ``self.plotTrainTest`` to visualize score selection.

    Returns
    -------
    Whatever ``self.scoreModelListDf`` produces from the collected
    per-configuration score dicts (presumably a DataFrame — TODO confirm).
    """
    # Avoid the mutable-default-argument pitfall: the original signature
    # used clfArg={}, which is shared across calls and could leak state.
    if clfArg is None:
        clfArg = {}
    clf = RFC(**clfArg)
    model_scores = list()
    param_range = np.arange(paramRange[0], paramRange[1], paramRange[2])
    e_range = np.arange(estimatorsRange[0], estimatorsRange[1],
                        estimatorsRange[2])
    criterions = ['gini', 'entropy']
    for criterion in criterions:
        # Mutate the single clf instance; validation_curve clones it
        # internally, so reusing one estimator object is safe.
        clf.criterion = criterion
        for e in e_range:
            clf.n_estimators = e
            dtitle = (title + ". Criterion: " + criterion +
                      ". Estimators: " + str(e))
            train_sc, test_sc = validation_curve(
                clf, Xn, y, param_name=param_name,
                param_range=param_range, cv=cv)
            # plotTrainTest picks the best param value (highest test
            # score) and returns its score fields as a dict.
            param_score = self.plotTrainTest(
                train_sc, test_sc, param_range,
                t=dtitle, xlabel=param_name, plot=plot)
            scoreDic = {'model': dtitle, 'param_name': param_name}
            scoreDic.update(param_score)
            # copy() so later iterations can't alias the stored dict.
            model_scores.append(scoreDic.copy())
    return self.scoreModelListDf(model_scores, trainW=trainW, testW=testW)
"""Train a Random Forest classifier on datas.csv and print its confusion matrix."""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Load the dataset, then slice out the feature columns and the target
# column (column positions are dataset-specific).
datas = pd.read_csv("datas.csv")
x = datas.iloc[:, 3:-3].values
y = datas.iloc[:, -2].values

# Hold out 10% of the rows for evaluation; random_state=0 makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=0)

# Single-tree forest (n_estimators=1) using the entropy split criterion
# (the alternative is 'gini'), with tree depth capped at 100.
rfc = RandomForestClassifier(
    max_depth=100,
    criterion="entropy",
    n_estimators=1,
)
rfc.fit(x_train, y_train)

# Evaluate on the held-out rows and report the confusion matrix.
y_pred = rfc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("RFC")
print(cm)