def update_tree(change):
    """Refit the tree on the currently toggled features and redraw the plot.

    The ``change`` argument is accepted but never read. Everything else
    (``output``, ``feature_buttons``, ``clf``, the train/validation arrays,
    ``eval`` ...) is taken from the enclosing scope.
    """
    with output:
        # One boolean per feature button; used as a column mask below.
        selected = [toggle.value for toggle in feature_buttons]
        selected_names = np.array(feature_names)[selected]

        clf.fit(X_train[:, selected], y_train)
        if eval:
            # Score the refitted model on the validation split.
            predictions = clf.predict(X_valid[:, selected])
            accuracy = accuracy_score(y_valid, predictions)

        plotter = dtreeplt(
            model=clf,
            feature_names=selected_names,
            target_names=target_names,
            X=X_train,
            y=y_train,
        )
        # Drop the previous drawing before rendering the new tree.
        clear_output()
        figure = plotter.view()
        if eval:
            title = f'Accuracy(Hold Out 9:1): {accuracy * 100:.3f}%'
            figure.suptitle(title, x=0, fontsize=20)
        plt.tight_layout()
        plt.show()
""" # Fitting Decision Tree Classification to the Training set from sklearn.tree import DecisionTreeClassifier classifier = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=3, min_samples_leaf=5) clf = classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred) print(cm) #Accuracy = 91% """ #Rescaling my independent variables: X_test = sc.inverse_transform(X_test) """ # Decision Tree visualization from dtreeplt import dtreeplt dtree = dtreeplt(model=classifier, feature_names=X_train, target_names=y_train) fig = dtree.view() fig #if you want save figure, use savefig method in returned figure object. fig.savefig('output_2.png')
def main():
    """Compare scikit-learn's decision tree with the hand-built tree.

    Reads ``mlAssignment2Dataset.csv`` (last column = class label), then:

    1. Repeats a shuffled 2/3 : 1/3 train/test split ``training_number``
       times, printing the accuracy of both trees for every run and the
       mean / standard deviation at the end.
    2. Saves a plot of the last scikit-learn tree to ``output.png`` and
       renders the last hand-built tree via ``create_tree_image``.
    3. Builds learning-curve data on growing prefixes of the last training
       split and appends it to ``mlAssignment2LearningCurve.csv``.
    """
    # Load the assignment dataset with pandas; the first row holds the headers.
    df = pd.read_csv('mlAssignment2Dataset.csv', header=0, delimiter=",")

    # Number of shuffled train/test repetitions.
    training_number = 10

    # Every column except the last one is a feature; the last is the class.
    feature_names = df.columns[:-1].tolist()

    average_accuracy = []
    clf_accuracy = []
    learning_curve_accuracy = []
    clf_learning_curve_accuracy = []

    for x in range(training_number):
        # Randomly shuffle the rows and split them 2/3 train, 1/3 test.
        df = df.sample(frac=1)
        train_split = int((len(df) * 2) / 3)
        training = df.values[:train_split]
        # The test set starts AT train_split: starting one row earlier would
        # evaluate on a row the models were trained on.
        test = df.values[train_split:]

        # Fit and score the scikit-learn tree.
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(training[:, :-1], training[:, -1])
        test_predictions = clf.predict(test[:, :-1])
        clf_accuracy.append(
            metrics.accuracy_score(test[:, -1], test_predictions) * 100)
        print("\nScikit CART: test %d accuracy: %f%%" % (x + 1, clf_accuracy[-1]))

        # Score the hand-built tree on the same split.
        accuracy = test_tree(build_tree(training), test)
        average_accuracy.append(accuracy)
        print("Our tree: Test %d has accuracy %d%%" % (x + 1, accuracy))

    print("\nAverage accuracy for scikit tree after 10 runs is {:.2f}%, +/- {:.2f}%".format(mean(clf_accuracy), stdev(clf_accuracy)))
    print("Average accuracy after 10 runs is {:.2f}%, +/- {:.2f}%".format(numpy.mean(average_accuracy), stdev(average_accuracy)))

    # Plot the scikit-learn tree from the last run. clf.classes_ is the
    # model's own (sorted) class order, so leaf labels line up with the
    # predictions; order-of-appearance labels from the frame might not.
    dtree = dtreeplt(model=clf, feature_names=feature_names,
                     target_names=clf.classes_)
    fig = dtree.view()
    fig.savefig('output.png')

    # Create a diagram of the hand-built tree from the last run.
    final_string = cart_tree(build_tree(training), None)
    create_tree_image(final_string)

    # Learning curves: train on the first 2, 4, 6, ... rows of the last
    # training split and score both trees on the last test split.
    for instances in range(2, len(training), 2):
        subset = training[:instances]
        accuracy = test_tree(build_tree(subset), test)
        learning_curve_accuracy.append(accuracy)

        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(subset[:, :-1], subset[:, -1])
        test_predictions = clf.predict(test[:, :-1])
        clf_learning_curve_accuracy.append(
            metrics.accuracy_score(test[:, -1], test_predictions) * 100)

    print(clf_learning_curve_accuracy)
    print(learning_curve_accuracy)

    # Append both curves to the file used for charting; the context manager
    # closes the file even if a write fails.
    with open("mlAssignment2LearningCurve.csv", 'a+') as curve_file:
        curve_file.write(str(learning_curve_accuracy))
        curve_file.write(str(clf_learning_curve_accuracy))
import pandas as pd
from dtreeplt import dtreeplt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the data: columns 0-3 are used as features, column 4 as the class.
df = pd.read_csv('/home/deepak/analytics/Iris_Dataset.csv')
# View top 5 rows (only displays anything in an interactive session).
df.head()

feature_columns = [0, 1, 2, 3]
X = df.iloc[:, feature_columns].values
y = df.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=2)

model = DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(X_train, y_train)

# dtreeplt wants label sequences here, not the training arrays themselves:
# one name per feature column, and the class labels in the fitted model's
# own order (model.classes_).
dtree = dtreeplt(
    model=model,
    feature_names=df.columns[feature_columns].tolist(),
    target_names=model.classes_,
)
fig = dtree.view()
# view() returns a figure object; use its savefig method to write it to disk.
fig.savefig('Iris_output.png')
# Target variable: the 'Churn' column. X is prepared earlier in the script.
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# First model: tree with no depth limit.
tree = DecisionTreeClassifier(criterion='gini', random_state=0)
tree.fit(X_train, y_train)

# Training accuracy: how well the tree built from the training data
# separates that same training data.
train_accuracy = tree.score(X_train, y_train)
print('Accuracy for train {:.3f}'.format(train_accuracy))
test_accuracy = tree.score(X_test, y_test)
print('Accuracy for test {:.3f}'.format(test_accuracy))

# feature_names takes the input-variable names, target_names the names of
# the target classes.
# NOTE(review): presumably the names are matched to classes by position -
# confirm ['yes', 'no'] follows the order of tree.classes_.
dtree = dtreeplt(
    model=tree,
    feature_names=X_train.columns,
    target_names=['yes', 'no'],
)
fig = dtree.view()
fig

# In[7]:

# Second model: same settings, depth capped at 4.
tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=0)
tree.fit(X_train, y_train)

# Training accuracy of the depth-limited tree, then its test accuracy.
train_accuracy = tree.score(X_train, y_train)
print('Accuracy for train {:.3f}'.format(train_accuracy))
test_accuracy = tree.score(X_test, y_test)
print('Accuracy for test {:.3f}'.format(test_accuracy))
# Package: https://pypi.org/project/dtreeplt/
# Source:  https://github.com/nekoumei/dtreeplt
# dtreeplt needs a trained model plus feature_names and target_names.
# This example uses the iris dataset bundled with scikit-learn.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from dtreeplt import dtreeplt

iris = load_iris()
# fit() returns the estimator itself, so construction and training chain.
model = DecisionTreeClassifier().fit(iris.data, iris.target)

dtree = dtreeplt(
    model=model,
    feature_names=iris.feature_names,
    target_names=iris.target_names,
)
fig = dtree.view()
# view() returns a figure object; call its savefig method to save the plot.
fig.savefig('output_test_Community.png')

# Exploring the data
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Store the load_iris() dataset in `iris`.
# To check the dataset type:       type(load_iris())
# To list the dataset attributes:  dir(load_iris())
iris = load_iris()
# np.c_ concatenates arrays column-wise
features[:, 11] = data['petroR50_r'] / data['petroR90_r'] # concentration in z filter features[:, 12] = data['petroR50_z'] / data['petroR90_z'] return features, targets if __name__ == '__main__': data = np.load('galaxy_catalogue.npy') #filename="decision_tree.jpg" # predicted_class, actual_class = dtc_predict_actual(data) training_set, testing_set = splitdata_train_test(data, 0.7) train_features, train_targets = generate_features_targets(training_set) test_features, test_targets = generate_features_targets(testing_set) dtc = DecisionTreeClassifier() dtc.fit(train_features, train_targets) #dot_data = export_graphviz(dtc, out_file=None,feature_names=['u - g', 'g - r', 'r - i', 'i - z','ecc','m4_u','m4_g','m4_r','m4_i','m4_z','conc1','conc2','conc3']) #graph = pydotplus.graph_from_dot_data(dot_data) #graph.write_jpg("decision_tree.jpg") #predictions= dtc.predict(test_features) dtree = dtreeplt(model=dtc, feature_names=[ 'u - g', 'g - r', 'r - i', 'i - z', 'ecc', 'm4_u', 'm4_g', 'm4_r', 'm4_i', 'm4_z', 'flux_u', 'flux_r', 'flux_z' ], target_names=['merger', 'elliptical', 'spiral'], filled=True) fig = dtree.view() fig.savefig('DTC.png')
x1 = market2[[ 'Age', 'Gender', 'OwnHome', 'Married', 'Location', 'Salary', 'Children' ]] y1 = market2['Spent'] x1_dummy = pd.get_dummies(x1, drop_first=True) X_train, X_test, y_train, y_test = train_test_split(x1_dummy, y1, test_size=0.2, random_state=None) tree = DecisionTreeClassifier(criterion='gini', random_state=0) tree.fit(X_train, y_train) dtree = dtreeplt( model=tree, feature_names=X_train.columns, target_names=['1', '0'], ) fig = dtree.view() # #### (c) (b)에서 만들어진 의사결정나무 모형의 성과를 5겹 교차검증으로 측정하고, 5겹 교차검증의 정확도 평균값을 출력하라. (5점) # # In[229]: from sklearn.model_selection import cross_val_score import sklearn.metrics as sm scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'] for i in scoring: #tree1=DecisionTreeClassifier(criterion='entropy')
# Draw the model's decision regions over the data.
plot_decision_regions(
    X=X,
    # np.integer is an abstract scalar class; using it as a dtype is
    # deprecated in NumPy. np.int_ is the concrete type it resolved to.
    y=y.astype(np.int_),
    clf=model,
    legend=None,  # legend=2
    zoom_factor=3.0)

# Update plot object with X/Y axis labels and Figure Title
# NOTE(review): the axes are labelled 'x1'/'x2' here while the tree below
# names the same two features 'x0'/'x1' - confirm which naming is intended.
plt.xlabel('x1')
plt.ylabel('x2')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.title('Solution Boundary', size=16)
plt.show()

from dtreeplt import dtreeplt
#plt.style.use('presentation')
dtree = dtreeplt(
    model=model,
    feature_names=['x0', 'x1'],
    # scikit-learn keeps class labels sorted (see model.classes_), so the
    # names are listed in ascending label order, not dataset order.
    # Presumably dtreeplt indexes this list by class position - confirm.
    target_names=[0, 1]
)
fig = dtree.view()
fig.set_facecolor("lightslategray")
# view() returns a figure object; use its savefig method to save the plot.
fig.savefig('output_quiz.png')
max_leaf_nodes=5, max_depth=2, min_samples_leaf=50) # X_train=X_train.reshape(-1,1) clf_entropy.fit(X_train, Y_train) Y_pred = clf_entropy.predict(X_test) print('Accuracy Score is:::', accuracy_score(Y_test, Y_pred) * 100) df1 = df del df1['not.fully.paid'] features = list(df1.columns) from IPython.display import Image from sklearn.externals.six import StringIO import pydotplus # dot_data=StringIO() # features=list(df1.columns) # tree.export_graphviz(clf_entropy,out_file=dot_data,rounded=True,filled=True,feature_names=features,impurity=False) # graph=pydotplus.graph_from_dot_data(dot_data.getvalue()) # Image(graph.create_png()) from dtreeplt import dtreeplt dtree = dtreeplt(model=df1, feature_names=features) fig = dtree.view() # df1=df.reshape(-1,1) # dot_file=open("pt.dot",'w') # sd=tree.export_graphviz(clf_entropy,out_file=dot_file,feature_names=df1.columns)