# Assumes the surrounding module provides: numpy as np,
# sklearn.model_selection.validation_curve, and
# sklearn.tree.DecisionTreeClassifier imported as DTC.
def dtcScores(self, Xn, y, cv=5, param_name='max_depth', paramRange=(1, 10, 1),
              trainW=1, testW=2, title='Decision Tree classifier', clfArg={},
              plot=False, run_classifier=False):
    """
    Run validation_curve with a Decision Tree classifier (DTC) and return the best
    param value based on the highest test score.

    cv is the number of cross-validation folds. The default parameter to optimize
    is max_depth. paramRange=(a, b, c) is the range over which param_name is
    evaluated: a is the start value, b the end value, c the step.

    Once the best param value is found, its test_score and train_score are combined
    into a weighted score:
        weighted_score = (test_score*testW + train_score*trainW) / (testW + trainW)
    trainW and testW are the weights used in that formula.

    clfArg is a dictionary of extra keyword arguments passed to the DTC. Set
    plot=True to see how the best score is selected. Scores are computed for both
    DTC criteria, 'gini' and 'entropy'.
    """
    clf = DTC(**clfArg)
    model_scores = list()
    param_range = np.arange(paramRange[0], paramRange[1], paramRange[2])
    criterions = ['gini', 'entropy']
    for criterion in criterions:
        dtitle = title + " " + criterion
        clf.criterion = criterion
        train_sc, test_sc = validation_curve(clf, Xn, y, param_name=param_name,
                                             param_range=param_range, cv=cv)
        param_score = self.plotTrainTest(train_sc, test_sc, param_range, t=dtitle,
                                         xlabel=param_name, plot=plot)
        scoreDic = {'model': dtitle, 'param_name': param_name}
        scoreDic.update(param_score)
        model_scores.append(scoreDic.copy())
        if run_classifier == True:
            return train_sc[:], test_sc[:]
    return self.scoreModelListDf(model_scores, trainW=trainW, testW=testW)
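# A minimal, self-contained sketch of the validation_curve workflow that dtcScores
# wraps, including the weighted-score formula from the docstring. The iris dataset
# and the weight values below are illustrative assumptions, not from the source.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier as DTC

Xn, y = load_iris(return_X_y=True)
param_range = np.arange(1, 10, 1)
train_sc, test_sc = validation_curve(DTC(criterion='gini'), Xn, y,
                                     param_name='max_depth',
                                     param_range=param_range, cv=5)
# Pick the depth with the highest mean test score, then blend train/test scores
# the same way dtcScores does.
mean_train = train_sc.mean(axis=1)
mean_test = test_sc.mean(axis=1)
best = mean_test.argmax()
trainW, testW = 1, 2
weighted_score = (mean_test[best]*testW + mean_train[best]*trainW) / (testW + trainW)
print("best max_depth:", param_range[best], "weighted score:", weighted_score)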
def train_decision_tree_model(self, X_train, X_test, y_train, y_test):
    # Start training
    clf = DecisionTreeClassifier(random_state=0)
    clf.criterion = 'gini'
    print("decision tree criterion:", clf.criterion)
    clf.fit(X_train, y_train)
    # n_features_in_ replaces the n_features_ attribute removed in scikit-learn 1.2
    print("total features:", clf.n_features_in_)
    print("total train samples:", len(X_train))
    print("total classes:", clf.n_classes_)
    print("feature_importances:", clf.feature_importances_)
    # Evaluate classification accuracy on the test set (around 96%)
    print("score of test data:", clf.score(X_test, y_test))
    return clf
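# Usage sketch for the method above. Assumptions: iris is stand-in data, and
# `trainer` is a hypothetical instance of whatever class defines
# train_decision_tree_model; neither comes from the source.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)
clf = trainer.train_decision_tree_model(X_train, X_test, y_train, y_test)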
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

data = pd.read_csv(r"C:\Users\Naren\Desktop\test_data.csv")
data_x = data[["age", "it_exp"]]
data_y = data["going_to_die"]
data_x  # display the feature columns (notebook-style output)

cl = DecisionTreeClassifier(max_depth=2, random_state=0)
cl.fit(data_x, data_y)
cl.predict([[100, 20]])
# This will visualize the DecisionTree structure constructed for the training tuples
# (in a plain script, call matplotlib.pyplot.show() afterwards to display the figure)
tree.plot_tree(cl)

# ===================================================================================
cl.criterion = "entropy"
cl.fit(data_x, data_y)
cl.predict([[100, 20]])
# This will visualize the DecisionTree structure constructed for the training tuples
tree.plot_tree(cl)
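# Optional follow-up: a plain-text rendering of the same fitted tree via
# sklearn.tree.export_text, handy when no plotting backend is available. The
# feature_names list simply reuses the two columns selected above.
from sklearn.tree import export_text

print(export_text(cl, feature_names=["age", "it_exp"]))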