def dtcScores(self,Xn,y,cv=5,param_name='max_depth',paramRange=(1,10,1),trainW=1,testW=2,
		title='Decision Tree classifier',clfArg=None,plot=False,
		run_classifier=False):
		"""
		Run validation_curve with a Decision Tree classifier (DTC) once for
		each of the criteria 'gini' and 'entropy', and collect the scores.

		Parameters
		----------
		Xn, y : data and targets, passed unchanged to validation_curve.
		cv : cross-validation setting forwarded to validation_curve (default 5).
		param_name : name of the DTC parameter to sweep (default 'max_depth').
		paramRange : (a, b, c) fed to np.arange(a, b, c): start a, stop b
			(exclusive), step c.
		trainW, testW : weights forwarded to self.scoreModelListDf. Per the
			original documentation they are used to compute
			weighted_score=(test_score*testW+train_score*trainW)/(testW+trainW).
		title : prefix of the per-criterion model label, "<title> <criterion>".
		clfArg : optional dict of extra keyword arguments for the DTC
			constructor. None (the default) means no extra arguments.
		plot : forwarded to self.plotTrainTest; per the original documentation,
			set it to True to see how the best score is collected.
		run_classifier : when truthy, return the raw validation-curve scores
			instead of the summary.

		Returns
		-------
		If run_classifier is truthy: the tuple (train_scores, test_scores)
		produced for the LAST criterion evaluated ('entropy') only.
		Otherwise: whatever self.scoreModelListDf returns for the list holding
		one score dictionary per criterion.
		"""
		# A None default replaces the shared mutable {} default argument.
		clf=DTC(**(clfArg or {}))
		param_range=np.arange(paramRange[0],paramRange[1],paramRange[2])
		model_scores=[]
		for criterion in ('gini','entropy'):
			dtitle=title+" "+criterion
			clf.criterion=criterion
			train_sc,test_sc=validation_curve(clf,Xn,y,param_name=param_name,
				param_range=param_range,cv=cv)
			# plotTrainTest yields the entries merged into this criterion's record
			# (presumably the best parameter value and its scores - confirm there).
			param_score=self.plotTrainTest(train_sc,test_sc,param_range,t=dtitle,
				xlabel=param_name,plot=plot)
			scoreDic={'model':dtitle,'param_name':param_name}
			scoreDic.update(param_score)
			model_scores.append(scoreDic)
		if run_classifier:
			# Only the scores of the final loop iteration are still bound here.
			return train_sc,test_sc
		return self.scoreModelListDf(model_scores,trainW=trainW,testW=testW)
Exemplo n.º 2
0
    def train_decision_tree_model(self, X_train, X_test, y_train, y_test):
        """Fit a gini decision tree on the training split and print diagnostics.

        Prints the criterion, the number of features, the number of training
        samples, the number of classes and the feature importances, then the
        value of ``clf.score`` on the test split.

        Args:
            X_train: training features, passed to ``fit``.
            X_test: test features, passed to ``score``.
            y_train: training targets, passed to ``fit``.
            y_test: test targets, passed to ``score``.

        Returns:
            The fitted ``DecisionTreeClassifier``.
        """
        # Start training.
        clf = DecisionTreeClassifier(criterion='gini', random_state=0)
        print("decision tree criterion:",clf.criterion)
        clf.fit(X_train,y_train)
        # ``n_features_`` was deprecated in scikit-learn 1.0 and removed in 1.2;
        # read ``n_features_in_`` and fall back only on releases that lack it.
        n_features = getattr(clf, "n_features_in_", None)
        if n_features is None:
            n_features = clf.n_features_
        print("total features:",n_features)
        print("total train samples:",len(X_train))
        print("total classes:",clf.n_classes_)

        print("feature_importances:",clf.feature_importances_)
        # Original author's note: test-set classification accuracy is about 96%.
        print("score of test data:",clf.score(X_test, y_test))
        return clf
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# Load the sample data set, then split out the target and the two predictors.
data = pd.read_csv(r"C:\Users\Naren\Desktop\test_data.csv")
data_y = data["going_to_die"]
data_x = data[["age", "it_exp"]]
data_x  # bare expression: echoes the frame in a notebook, no effect in a script

# First pass: depth-2 tree using the estimator's default split criterion.
cl = DecisionTreeClassifier(random_state=0, max_depth=2).fit(data_x, data_y)
cl.predict([[100, 20]])

# Visualize the decision tree structure built from the training tuples.
tree.plot_tree(cl)

# Second pass: switch the same estimator to the entropy criterion and refit.
cl.set_params(criterion="entropy").fit(data_x, data_y)
cl.predict([[100, 20]])

# Visualize the structure of the refitted tree.
tree.plot_tree(cl)