def test_plot_tree(pyplot):
    # mostly smoke tests
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    feature_names = ['first feat', 'sepal_width']
    nodes = plot_tree(clf, feature_names=feature_names)
    assert len(nodes) == 3
    assert nodes[0].get_text() == ("first feat <= 0.0\nentropy = 0.5\n"
                                   "samples = 6\nvalue = [3, 3]")
    assert nodes[1].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [3, 0]"
    assert nodes[2].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [0, 3]"
# Plot the decision boundary
plt.subplot(2, 3, pairidx + 1)

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

plt.xlabel(iris.feature_names[pair[0]])
plt.ylabel(iris.feature_names[pair[1]])

# Plot the training points
for i, color in zip(range(n_classes), plot_colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                cmap=plt.cm.RdYlBu, edgecolor='black', s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right', borderpad=0, handletextpad=0)
plt.axis("tight")

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# print(train_X)
# print(train_Y)
# print(test_X)
# print(test_Y)

from sklearn import tree
from sklearn.metrics import classification_report, plot_confusion_matrix

clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=40, random_state=0)
clf.fit(X_train, y_train)
y_pre = clf.predict(X_test)
# pre_train_Y = clf.predict(train_X)

tree.plot_tree(clf)
print(clf.score(X_test, y_test))
print(classification_report(y_test, y_pre, target_names=None))
# print(classification_report(train_Y, pre_train_Y, target_names=None))

np.set_printoptions(precision=2)

# Plot non-normalized and normalized confusion matrices
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(clf, X_test, y_test,
                                 display_labels=None,
                                 cmap=plt.cm.Reds,
                                 normalize=normalize)
    disp.ax_.set_title(title)
"""Graph the actual tree""" !pip install -q graphviz import graphviz from matplotlib import pyplot as plt from sklearn import datasets from sklearn.tree import DecisionTreeClassifier from sklearn import tree fn=["age","gender","height","weight","ap_hi","ap_lo","cholesterol","gluc","smoke","alco","active","BMI","obesity_level"] cn=['Positive', 'Negative'] fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (30,30), dpi=300) tree.plot_tree(dt, feature_names = fn, class_names=cn, filled = True); """Import Naive Bayes Classifier and fit the data""" from sklearn.naive_bayes import GaussianNB model = GaussianNB() model.fit(X, Y); """Make a prediction""" y_pred = model.predict(X_test) y_pred """View the confusion matrix for the model"""
## Plot ROC for the Decision Tree method (with YellowBrick)
## Manual plotting
plt.style.use('seaborn-poster')
figure = plt.figure(figsize=(10, 6))
ax = plt.subplot(1, 1, 1)
plot_roc(clf, X_val, y_val, "Decision Tree (test)", ax)
plot_roc(clf, X_train, y_train, "Decision Tree (train)", ax)
plt.legend(loc='lower right', fontsize=18)
plt.tight_layout()

## Model visualization
from sklearn.tree import plot_tree

plt.figure(figsize=[20, 10])
plot_tree(clf, filled=True, feature_names=feature_names, label='root', fontsize=14)
plt.show()

## 2.0 Model selection: grid search for Naive Bayes (tune parameters)
from sklearn.naive_bayes import GaussianNB

parameters = {'var_smoothing': [0, 1e-9, 1e-8]}
gnb = GaussianNB()
NBgridsearch = GridSearchCV(gnb, parameters, cv=15, scoring='roc_auc',
                            return_train_score=True, error_score=0.0, n_jobs=-1)
NBgridsearch.fit(X_train, y_train)
NBgridsearch.best_params_
NBgridsearch.best_score_
waveform_features = waveform_data.iloc[:, 0:21]
print(waveform_features)
waveform_labels = waveform_data.iloc[:, 21]
print(waveform_labels)

X_train, X_test, y_train, y_test = train_test_split(waveform_features, waveform_labels,
                                                    test_size=0.33, random_state=42)

tree_classifier = DecisionTreeClassifier(max_depth=5, random_state=1)
tree_classifier.fit(X=X_train, y=y_train)
y_prediction = tree_classifier.predict(X=X_test)

plt.figure(figsize=(75, 10), dpi=200)
tree.plot_tree(decision_tree=tree_classifier, rounded=True, filled=True, fontsize=12)
plt.savefig("Waveform_Decision_Tree")

print(classification_report(y_true=y_test, y_pred=y_prediction))

scores = cross_val_score(estimator=tree_classifier, X=X_train, y=y_train,
                         scoring='accuracy', cv=10)
print("Scores: ", scores)
print("Mean Scores: ", scores.mean())

y_prediction2 = tree_classifier.predict(X=X_train)
print(classification_report(y_true=y_train, y_pred=y_prediction2))
                             min_impurity_split=None, class_weight=None, presort=False)

criterion: how to split the samples. Options: 'gini' (the Gini impurity) or 'entropy' (the information gain)
splitter: strategy used to choose the split at each node. Options: 'best' / 'random'
max_depth: maximum depth of the tree. None => nodes are expanded until all leaves are pure,
           or until all leaves contain fewer than min_samples_split samples
min_samples_split: minimum number of samples required to split an internal node
min_samples_leaf: minimum number of samples required to be at a leaf node
min_weight_fraction_leaf: minimum weighted fraction of the total sum of sample weights required at a leaf node
max_features: number of features to consider when looking for the best split: int/float/'auto'/'sqrt'/'log2'/None
max_leaf_nodes: grow a tree with at most this many leaves in best-first fashion; None => unlimited
min_impurity_decrease: a node will be split if the split induces a decrease of the impurity
                       greater than or equal to this value
min_impurity_split: a node will be split if its impurity is above this threshold
class_weight: weights associated with the classes
presort: whether to presort the data to speed up the search for the best splits
'''

iris = load_iris()
X1, y1 = iris.data, iris.target
clf1 = tree.DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=3)
clf1.fit(X1[:-1], y1[:-1])                    # fit the model
print(clf1.apply([X1[-1]]))                   # index of the leaf the sample is predicted to land in
print(clf1.decision_path([X1[-1]]))           # decision path of the sample through the tree
print(clf1.feature_importances_)              # importance of each feature
print(clf1.get_depth(), clf1.get_n_leaves())  # depth of the tree and number of leaves
print(clf1.predict([X1[-1]]), clf1.predict_proba([X1[-1]]))  # predicted class and class probabilities

tree.plot_tree(clf1)                          # draw the tree
r = tree.export.export_text(clf1, feature_names=iris['feature_names'])
print(r)
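# A hedged aside (not part of the snippet above): the parameter notes describe pre-pruning
# knobs such as min_samples_leaf and min_impurity_decrease. A minimal sketch comparing a
# fully grown tree against a pre-pruned one on the iris data; the parameter values are
# illustrative only.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y=True)

full_tree = DecisionTreeClassifier(random_state=0).fit(X_iris, y_iris)
pruned_tree = DecisionTreeClassifier(min_samples_leaf=5,
                                     min_impurity_decrease=0.01,
                                     random_state=0).fit(X_iris, y_iris)

print(full_tree.get_n_leaves(), full_tree.get_depth())      # larger, deeper tree
print(pruned_tree.get_n_leaves(), pruned_tree.get_depth())  # fewer leaves, shallower tree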
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)
classification_report(y_test, y_pred)

from sklearn.tree import plot_tree

plot_tree(model)
########################################################################################################################################
# to interpret the results.

# %%
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

max_leaf_nodes = 3
tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=0)
tree.fit(X, y)

# %% [markdown]
# `plot_tree` will allow us to visually check the set of rules learnt by
# the decision tree.

# %%
_ = plot_tree(tree)

# %%
# plot the decision function learned by the tree
plot_tree_decision_function(tree, X, y)

# %% [markdown]
# By allowing only 3 leaves in the tree, we get rules similar to the ones we
# designed by hand:
#
# * persons younger than 28.5 years old (X[0] < 28.5) are predicted to be in the
#   class earning `<= 50K`;
# * persons older than 28.5 and working less than 40.5 hours per week (X[1] <= 40.5)
#   are predicted to be in the class earning `<= 50K`, while persons working more
#   than 40.5 hours per week are predicted to be in the class earning `> 50K`.
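# %% [markdown]
# A hedged aside (not part of the original notebook): `export_text` prints the same
# rules in textual form, which can be easier to compare against the hand-designed
# thresholds. The feature names "age" and "hours-per-week" are assumed here for the
# two columns of `X`.

# %%
from sklearn.tree import export_text

print(export_text(tree, feature_names=["age", "hours-per-week"]))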
test = test.drop([target] + extraExcludes, axis=1)

varSelected = ['TicketMeanT0mainWithDummies', 'Sex0mainWithDummies']
X = train[varSelected].fillna(0)
y = train_y

clf = DecisionTreeClassifier(max_leaf_nodes=1000, random_state=0, max_depth=100)

# Train Decision Tree Classifier
clf = clf.fit(X, y)

# Predict the response for the test dataset
y_pred = clf.predict(X)
test_pred = clf.predict(test[varSelected].fillna(0))

train['predicted'] = y_pred
test['predicted'] = test_pred

score_test = metrics.roc_auc_score(test_y, test_pred)  # y_true first, then the scores
score_train = metrics.accuracy_score(y, train['predicted'])
print(score_train, score_test)

from sklearn import tree
import matplotlib.pyplot as plt

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(clf, feature_names=varSelected, class_names='TARGET', filled=True)
fig.savefig(baseLoc + 'imagename.png')
tree_clf.fit(X_train, Y_train)

# In[15]:
Y_pred = tree_clf.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)

# percentage of correct predictions
perc_correct = np.diag(cm).sum() / len(X_test)
perc_correct

# In[19]:
# our tree predicted 64% of the test set correctly
# let's take a look at the tree
from sklearn.tree import plot_tree

plot_tree(tree_clf, max_depth=3, filled=True)

# In[20]:
# now it's time to try to improve our tree
# with something called bagging
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier()
bag_clf.fit(X_train, Y_train.ravel())

# In[21]:
Ybag_pred = bag_clf.predict(X_test)
cm_bag = confusion_matrix(Y_test, Ybag_pred)
# percentage of correct predictions
# Plot the decision boundary
plt.subplot(2, 3, pairidx + 1)

x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
cs = plt.contourf(xx, yy, z, cmap=plt.cm.RdYlBu)

plt.xlabel(iris.feature_names[pair[0]])
plt.ylabel(iris.feature_names[pair[1]])

# Plot the training points
for i, color in zip(range(n_classes), plot_colors):
    idx = np.where(y == i)
    plt.scatter(x[idx, 0], x[idx, 1], c=color, label=iris.target_names[i],
                cmap=plt.cm.RdYlBu, edgecolor='black', s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right')
plt.axis("tight")

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.savefig('D:/python/czpython/save_decision_tree.png')  # only one figure can be saved this way
plt.show()
from sklearn import tree
import pandas as pd
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

data = pd.read_csv('decision_tree_sample.csv')
feature_cols = ['cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'modelyear']

Y = data.mpg.to_numpy()
data.drop(['mpg', 'maker'], axis=1, inplace=True)
X = data.to_numpy()

# Create the model
model = tree.DecisionTreeClassifier(criterion='gini')

# Train the model
model.fit(X, Y)

plt.figure(figsize=(30, 10))
a = plot_tree(model,
              feature_names=feature_cols,
              class_names=['Bad', 'OK', 'Good'],
              filled=True,
              rounded=True,
              fontsize=5)
b = 1
plot_decision_regions(X_combined, y_combined,
                      classifier=tree_model,
                      test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('images/03_20.png', dpi=300)
plt.show()

tree.plot_tree(tree_model)
# plt.savefig('images/03_21_1.pdf')
plt.show()

dot_data = export_graphviz(tree_model,
                           filled=True,
                           rounded=True,
                           class_names=['Setosa', 'Versicolor', 'Virginica'],
                           feature_names=['petal length',
# Result interpretation:
# A low MSE means that the predicted values closely match the actual values.
# RMSE describes how accurately the model fits the response. A low RMSE is an
# indicator of a "good fit", i.e. the predictions are very close to the real values.
# (A small worked MSE/RMSE example follows after this snippet.)

# In[ ]:

# Decision Tree
from sklearn import tree

tree_obj = tree.DecisionTreeClassifier()
tree1 = tree_obj.fit(train, train_labels)

# Visualization of the tree
fig, ax = mlt.pyplot.subplots(figsize=(40, 40))
tree.plot_tree(tree_obj, feature_names=liver_data.columns, class_names=True,
               filled=True, max_depth=4, fontsize=15)
mlt.pyplot.savefig('tree.png')  # save before show(), otherwise the saved figure is blank
mlt.pyplot.show()

# In[ ]:

# Prediction
dec_prediction = tree1.predict(test)
comp2 = pd.DataFrame({'Actual': test_labels, 'Predicted': dec_prediction})
comp3 = comp2.head(25)
print('The comparison between test_labels and the prediction labels:\n {}'.format(comp3))
comp3.plot(kind='bar', figsize=(12, 10))
mlt.pyplot.grid(which='major', linestyle='-', linewidth='0.5', color='black')
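# A hedged aside, referenced from the MSE/RMSE remarks above: a minimal sketch of how
# those two numbers are computed. The y_true / y_predicted arrays are illustrative
# placeholders, not values from this notebook.
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([3.0, 2.5, 4.0, 5.5])
y_predicted = np.array([2.8, 2.7, 4.2, 5.0])

mse = mean_squared_error(y_true, y_predicted)  # mean of squared errors
rmse = np.sqrt(mse)                            # same units as the response
print(mse, rmse)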
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):

    orig_cols = list(X.names)

    import os
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import OneHotEncoder, LabelEncoder
    from collections import Counter
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn import tree
    import matplotlib.pyplot as plt

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up model
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)

        clf = DecisionTreeClassifier(random_state=42,
                                     max_depth=self.params["tree_depth"])
        self.is_classifier = True
    else:
        clf = DecisionTreeRegressor(random_state=42,
                                    max_depth=self.params["tree_depth"])
        self.is_classifier = False

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # Change all float32 values to float64
    for ii in range(len(X_datatypes)):
        if X_datatypes[ii] == 'float32':
            X = X.astype({orig_cols[ii]: np.float64})
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # List the categorical and numerical features
    self.X_categorical = [
        orig_cols[col_count] for col_count in range(len(orig_cols))
        if (X_datatypes[col_count] == 'category') or (X_datatypes[col_count] == 'object')
    ]
    self.X_numeric = [item for item in orig_cols if item not in self.X_categorical]

    # Find the levels and mode for each categorical feature
    # for use in the test set
    self.train_levels = {}
    self.train_mode = {}
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One hot encode the categorical features
    # and replace missing values with a "Missing" category
    if len(self.X_categorical) > 0:
        X.loc[:, self.X_categorical] = X[self.X_categorical].fillna("Missing").copy()
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        self.encoded_categories = list(
            self.enc.get_feature_names(input_features=self.X_categorical))

        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat([X[self.X_numeric],
                       pd.DataFrame(X_enc, columns=self.encoded_categories)], axis=1)

    # Replace missing values with a missing value code
    if len(self.X_numeric) > 0:
        X.loc[:, self.X_numeric] = X[self.X_numeric].fillna(-999).copy()

    # Fit the decision tree
    clf.fit(X, y)

    if self.is_classifier:
        yy = clf.predict_proba(X)
        p = np.round_(yy[:, 1], 5)
    else:
        yy = clf.predict(X)
        p = np.round_(yy, 5)
    self.leaf_categories = list(set(p))

    # Fit linear or logistic models to each leaf node
    model_array = {}
    equation_log = []
    for cat in self.leaf_categories:
        if self.is_classifier:
            if (np.mean(y[p == cat]) < 1) and (np.mean(y[p == cat]) > 0):
                lm = LogisticRegression(random_state=42)
                lm.fit(X[p == cat], y[p == cat])
                model_array[cat] = lm
                equation_log.append(
                    [[int(round((1 - cat) * sum(p == cat))),
                      int(round(cat * sum(p == cat)))],
                     sum(p == cat), lm.intercept_[0]] + list(lm.coef_[0]))
            else:
                loggerinfo(logger, "No leaf fit")
                model_array[cat] = "dt"
        else:
            try:
                lm = LinearRegression()
                lm.fit(X[p == cat], y[p == cat])
                model_array[cat] = lm
                equation_log.append(
                    [cat, sum(p == cat), lm.intercept_] + list(lm.coef_))
            except Exception:
                loggerinfo(logger, "No leaf fit")
                model_array[cat] = "dt"

    # Save the leaf models
    pd.DataFrame(equation_log,
                 columns=['leaf value', 'number of samples', 'intercept'] + list(X.columns)
                 ).to_csv(os.path.join(tmp_folder, 'Leaf_model_coef.csv'))

    # Plot the decision tree
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8), dpi=1600)
    tree.plot_tree(clf, feature_names=list(X.columns))
    fig.savefig(os.path.join(tmp_folder, 'Decision_tree_plot.png'))

    importances = clf.feature_importances_
    loggerinfo(logger, str(importances))

    self.mean_target = np.array(sum(y) / len(y))

    model = [clf, model_array]

    # Set model properties
    self.set_model_properties(model=model,
                              features=list(X.columns),
                              importances=importances,
                              iterations=self.params['n_estimators'])
                        value=max_d, step=1, format=None, key="ad")

modelo = DecisionTreeClassifier(max_depth=max_d)
result = classificação(modelo, target, preditores, df_treino, df_teste)
modelo = result["modelo"]
lista_modelos.append(result)

st.header("Visualização da árvore")
# plt.figure(figsize=(12, 8))
plot_tree(modelo, feature_names=preditores, filled=True,
          class_names=["0", "1", "2", "3", "4"])
st.pyplot()

# Feature importance
st.header("Relevância das variáveis")
# modelo.feature_importances_
importancia = pd.Series(modelo.feature_importances_, index=preditores)
importancia = importancia.sort_values(ascending=True)
importancia.plot(kind="barh")
st.pyplot()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3490, random_state=101)

# In[49]:
c = tree.DecisionTreeClassifier()
c = c.fit(X_train, Y_train)

# In[50]:
get_ipython().run_line_magic('matplotlib', 'inline')
tree.plot_tree(c)

# In[51]:
Y_pred = c.predict(X_test)

# In[52]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
print(X_train.shape)
print(y_train.shape)
print("\r\n")
print(X_test.shape)
print(y_test.shape)

from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train, y_train)

from sklearn.tree import plot_tree

plot_tree(tree, feature_names=cancer.feature_names, fontsize=7)

from sklearn.metrics import accuracy_score

y_pred_train = tree.predict(X_train)
print("Train Set Accuracy : ", accuracy_score(y_train, y_pred_train))
y_pred_test = tree.predict(X_test)
print("Test Set Accuracy : ", accuracy_score(y_test, y_pred_test))

# GINI IMPURITY
tree_gin_d1 = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=0)
tree_gin_d1.fit(X_train, y_train)
y_pred_train_gin_d1 = tree_gin_d1.predict(X_train)
y_pred_test_gin_d1 = tree_gin_d1.predict(X_test)
# Making the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(75 / 80)  # Accuracy = 93.75%

"""
# Rescaling my independent variables:
X_test = sc.inverse_transform(X_test)
"""

# Decision tree visualization -----------------
from sklearn import tree

# Simple decision tree
# tree.plot_tree(classifier)  # the image is quite blurred

# Let's try to make the decision tree more interpretable by adding class names and fill colors.
# cn = ['0', '1']
# tree.plot_tree(classifier, class_names=cn, filled=True)
# Although the tree now shows class names and the leaves are colored, the view is still blurred.

# Let's create a blank figure of the desired size with matplotlib and place our decision tree there.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
cn = ['0', '1']
tree.plot_tree(classifier, class_names=cn, filled=True)

# If you want to save the figure, use the savefig method of the returned figure object.
fig.savefig('imagename2.png')
X['Embarked'] = [0 if str(name) == "nan" else ord(name) for name in X['Embarked']]
X['Cabin'] = [0 if str(el) == "nan" else len(str(el)) for el in X['Cabin']]
X['Ticket'] = [0 if str(el) == "nan" else len(str(el)) for el in X['Ticket']]


def sumASCII(name):
    accum = 0
    for i in name:
        accum += ord(i)
    return accum


X['Name'] = [0 if str(name) == "nan" else sumASCII(str(name)) for name in X['Name']]
X['Sex'] = [0 if sex == 'male' else 1 for sex in X['Sex']]

# Just give up on these for now
# X = X.drop(['Ticket', 'Cabin', 'Embarked'], axis=1)
X = X.fillna(0)

# Train the model
clf = clf.fit(X, Y)

# Run predictions on the data using the model. They should mostly conform to the training values Y
y_pred = clf.predict(X)

fig = plt.figure()
_ = tree.plot_tree(clf, feature_names=X.columns, filled=True)

returnVal = metrics.precision_recall_fscore_support(Y, y_pred, average="macro")
print(returnVal)
fig.savefig("tree.png")
plt.show()
print("Train index: \n", train_index) print("Test index: \n", test_index) i = i + 1 for train_index, test_index in kf_10.split(X): #print(train_index, test_index) X_train, X_test = X.loc[train_index], X.loc[test_index] y_train, y_test = y[train_index], y[test_index] model_rf = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=42) model_rf.fit(X_train, y_train) # training model/classifier y_pred = model_rf.predict(X_test) # melakukan prediksi print("confusion matrix: \n", confusion_matrix(y_test, y_pred)) print(classification_report(y_test, y_pred)) print("=======================================================") fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=700) tree.plot_tree(model_rf.estimators_[0], filled=True) fig.savefig('rf_individualtree.png') # export model to create API pickle.dump(model_rf, open('model.pkl', 'wb')) cek = [[31, 5.2, 6.8, 10.9, 4.2, 33]] print("model rf") print(model_rf.predict(cek))
# plt.axis("tight") # # plt.figure() # clf = DecisionTreeClassifier().fit(iris.data, iris.target) # plot_tree(clf, filled=True) # plt.show() iris = load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) estimator = DecisionTreeClassifier(max_leaf_nodes=6, random_state=0) plt.figure() estimator.fit(X_train, y_train) plot_tree(estimator, filled=True) plt.show() from sklearn.tree import _tree def tree_to_code(tree, feature_names, target_names): tree_ = tree.tree_ conditions = [] # classes = tree.classes_ feature_name = [ feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature ] target_name = [
model.fit(X_train, y_train)
print('Optimal parameters:', model.best_params_)

y_test_hat = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]

print('Model evaluation (Optimal Hyperparameters)')
print('Accuracy:')
print(metrics.accuracy_score(y_test, y_test_hat))
print('Classification report:')
print(metrics.classification_report(y_test, y_test_hat))

print('Confusion matrix (Optimal Hyperparameters)')
cm = ConfusionMatrix(y_test, y_test_hat)
print(cm)
cm.print_stats()
ax = cm.plot(backend='seaborn', annot=True, fmt='g')
ax.set_title('Confusion Matrix (Optimal Hyperparameters)')
plt.show()

print('ROC curve (Optimal Hyperparameters)')
plot_roc_curve(y_test, y_test_prob)

tree = model.best_estimator_
plot_tree(tree, filled=True)
plt.show()
ax.plot(X_test, y_pred, linewidth=4, label="Decision tree")
_ = plt.legend()

# %% [markdown]
# We see that the decision tree model does not assume an a priori distribution
# for the data, and we do not end up with a straight line regressing flipper
# length onto body mass.
#
# Instead, we observe that the predictions of the tree are piecewise constant.
# Indeed, our feature space was split into two partitions. We can check the
# tree structure to see what threshold was found during training.

# %%
from sklearn.tree import plot_tree

_ = plot_tree(tree, feature_names=data_columns)

# %% [markdown]
# The threshold for our feature (flipper length) is 202.5 mm. The predicted
# values on each side of the split are two constants: 3683.50 g and 5023.62 g.
# These values correspond to the mean values of the training samples in each
# partition.
#
# In classification, we saw that increasing the depth of the tree allowed us to
# get a more complex decision boundary. We can check the effect of increasing the
# depth of a decision tree in a regression setting.

# %%
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
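# %% [markdown]
# A hedged aside (not part of the original notebook): the thresholds and leaf means
# can also be read without a figure. `export_text` prints the splits of the deeper
# regression tree fitted just above, assuming `tree` and `data_columns` as defined
# in the previous cells.

# %%
from sklearn.tree import export_text

print(export_text(tree, feature_names=list(data_columns)))
# The same information is available programmatically: tree.tree_.threshold holds the
# split thresholds and tree.tree_.value the per-leaf mean predictions.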
Y = data.values[:, 15]
# xyz = data.values[180:181, :13]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

# In[88]:
clf_gini = DecisionTreeClassifier(criterion="gini", random_state=100,
                                  max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)

from sklearn import tree

clf = tree.DecisionTreeClassifier()

# Once trained, you can plot the tree with the plot_tree function:
%matplotlib inline
tree.plot_tree(clf_gini)

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(125, 200))
_ = tree.plot_tree(clf_gini, filled=True)

# In[92]:
y_pred = clf_gini.predict(X_test)
# y_pred
print("\nAccuracy score ::: ", accuracy_score(y_test, y_pred) * 100)
if __name__ == '__main__':
    with open('lenses.txt', 'r') as fr:                                   # load the file
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]    # parse the file
    lenses_target = []                                                    # extract the class of each record into a list
    for each in lenses:
        lenses_target.append(each[-1])
    print(lenses_target)

    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']         # feature labels
    lenses_list = []                                                      # temporary list holding the lenses data
    lenses_dict = {}                                                      # dict holding the lenses data, used to build the DataFrame
    for each_label in lensesLabels:                                       # extract the values and build the dict
        for each in lenses:
            lenses_list.append(each[lensesLabels.index(each_label)])
        lenses_dict[each_label] = lenses_list
        lenses_list = []
    # print(lenses_dict)                                                  # print the dict
    lenses_pd = pd.DataFrame(lenses_dict)                                 # build the pandas DataFrame
    # print(lenses_pd)                                                    # print the DataFrame

    le = LabelEncoder()                                                   # create a LabelEncoder object for encoding
    for col in lenses_pd.columns:                                         # encode each column
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    # print(lenses_pd)                                                    # print the encoded data

    clf = tree.DecisionTreeClassifier(max_depth=4)                        # create the DecisionTreeClassifier
    # clf = clf.fit(lenses_pd.values.tolist(), lenses_target)             # build the decision tree from the data
    tree.plot_tree(clf.fit(lenses_pd.values.tolist(), lenses_target), filled=True)
    plt.show()
X_test.shape

# In[186]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier.fit(X_train, Y_train)

# In[187]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Use the pandas apply method to numerically encode our attrition target variable

# In[188]:
plt.figure(figsize=(10, 11))
plot_tree(classifier)
plt.show()

# In[189]:
classifier.score(X_train, Y_train)

# In[190]:
classifier.score(X_test, Y_test)

# In[ ]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.25)

model = KNeighborsClassifier()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
df = pd.DataFrame({'actual': ytest, 'ypred': ypred})
df
model.score(xtest, ytest)

mod = GaussianNB()
mod.fit(xtrain, ytrain)
mod.predict(xtest)
mod.score(xtest, ytest)

mod1 = SVC()
mod1.fit(xtrain, ytrain)
mod1.predict(xtest)
mod1.score(xtest, ytest)

mod2 = DecisionTreeClassifier(max_depth=2)
mod2.fit(xtrain, ytrain)
mod2.predict(xtest)
mod2.score(xtest, ytest)

mod3 = RandomForestClassifier(n_estimators=700)
mod3.fit(xtrain, ytrain)
mod3.predict(xtest)
mod3.score(xtest, ytest)

plot_tree(mod2.fit(xtrain, ytrain))
x = tree_data.iloc[:, :5]
x
y = tree_data.iloc[:, 5]
y

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

dt = DecisionTreeClassifier()
model = dt.fit(x, y)

pred = model.predict(x)
acc = accuracy_score(y, pred)
acc  # 1.0

tree.plot_tree(model)  #@@12

x_col = x.columns
x_col[2]  # 'income'
x_col[1]  # 'age'

export_graphviz(model, out_file='smoking_tree_graph.dot', feature_names=x_col)  #@@13

dt = DecisionTreeClassifier(criterion='entropy')
model = dt.fit(x, y)
pred = model.predict(x)
acc = accuracy_score(y, pred)
acc
dtc = DecisionTreeClassifier()  # model
dtc.fit(X_train, y_train)       # fit

# Predictions on the test data
y_pred = dtc.predict(X_test)    # predict
dtc.score(X_test, y_test)       # first evaluation, based on accuracy

# Confusion matrix and metrics
confusion_matrice_tree = confusion_matrix(y_test, y_pred)
print("Arbre de décision")
print_metrics(y_test, y_pred)

# Visualizing the tree (method 3)
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(dtc, filled=True, max_depth=5, fontsize=10)

"""5.2. Random Forest"""

from sklearn.ensemble import RandomForestClassifier

# Classification with a random forest
rf = RandomForestClassifier(max_depth=10)  # build the model
rf.fit(X_train, y_train)                   # fit / training

# Predictions on the test data
y_pred = rf.predict(X_test)                    # predict: boolean True/False
y_pred_proba = rf.predict_proba(X_test)[:, 1]  # predict: probability of being True/False, between 0 and 1
# The predicted class probability is the fraction of samples of the same class in a leaf.

# Confusion matrix and metrics
RandomForestClassifier(...)

print(clf.predict([[0, 0, 0, 0]]))

out = clf.predict(X)
correct = np.where(out == y)
acc = len(correct[0]) / len(y) * 100
print(acc)

count = 0
for i in range(n_estimators):
    count += clf.estimators_[i].tree_.node_count
    print(clf.estimators_[i].tree_.node_count)
print(count)

a = clf.estimators_[0]
# plt.figure(10, 10)
tree.plot_tree(a)

count = 0
for i in range(n_estimators):
    count += clf.estimators_[i].tree_.node_count
    print(clf.estimators_[i].tree_.node_count)
print(count)

#%%
# save_name = 'rf9_nf_cv1.p'
save_name = '3c_rf6_nf_norm_k2_cv1.p'
result_dir = 'results/'
rf_model = joblib.load(result_dir + save_name, mmap_mode='r')
# model = pickle.load(open(result_dir+'train_'+save_name+'_best.pth', 'rb'))
#