def test_graphviz_toy():
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=1,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    out = StringIO()
    export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()
    contents2 = "digraph Tree {\n" \
                "0 [label=\"X[0] <= 0.0000\\ngini = 0.5\\n" \
                "samples = 6\", shape=\"box\"] ;\n" \
                "1 [label=\"gini = 0.0000\\nsamples = 3\\n" \
                "value = [ 3. 0.]\", shape=\"box\"] ;\n" \
                "0 -> 1 ;\n" \
                "2 [label=\"gini = 0.0000\\nsamples = 3\\n" \
                "value = [ 0. 3.]\", shape=\"box\"] ;\n" \
                "0 -> 2 ;\n" \
                "}"
    assert_equal(contents1, contents2)

    # Test with feature_names
    out = StringIO()
    export_graphviz(clf, out_file=out,
                    feature_names=["feature0", "feature1"])
    contents1 = out.getvalue()
    contents2 = "digraph Tree {\n" \
                "0 [label=\"feature0 <= 0.0000\\ngini = 0.5\\n" \
                "samples = 6\", shape=\"box\"] ;\n" \
                "1 [label=\"gini = 0.0000\\nsamples = 3\\n" \
                "value = [ 3. 0.]\", shape=\"box\"] ;\n" \
                "0 -> 1 ;\n" \
                "2 [label=\"gini = 0.0000\\nsamples = 3\\n" \
                "value = [ 0. 3.]\", shape=\"box\"] ;\n" \
                "0 -> 2 ;\n" \
                "}"
    assert_equal(contents1, contents2)

    # Test max_depth
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0)
    contents1 = out.getvalue()
    contents2 = "digraph Tree {\n" \
                "0 [label=\"X[0] <= 0.0000\\ngini = 0.5\\n" \
                "samples = 6\", shape=\"box\"] ;\n" \
                "1 [label=\"(...)\", shape=\"box\"] ;\n" \
                "0 -> 1 ;\n" \
                "2 [label=\"(...)\", shape=\"box\"] ;\n" \
                "0 -> 2 ;\n" \
                "}"
    assert_equal(contents1, contents2)
def test_graphviz_toy(): """Check correctness of graphviz output on a toy dataset.""" clf = tree.DecisionTreeClassifier(max_depth=3, min_samples_split=1) clf.fit(X, y) # test export code out = StringIO() tree.export_graphviz(clf, out_file=out) contents1 = out.getvalue() tree_toy = StringIO( "digraph Tree {\n" "0 [label=\"X[0] <= 0.0000\\nerror = 0.5" "\\nsamples = 6\\nvalue = [ 3. 3.]\", shape=\"box\"] ;\n" "1 [label=\"error = 0.0000\\nsamples = 3\\n" "value = [ 3. 0.]\", shape=\"box\"] ;\n" "0 -> 1 ;\n" "2 [label=\"error = 0.0000\\nsamples = 3\\n" "value = [ 0. 3.]\", shape=\"box\"] ;\n" "0 -> 2 ;\n" "}") contents2 = tree_toy.getvalue() assert contents1 == contents2, \ "graphviz output test failed\n: %s != %s" % (contents1, contents2) # test with feature_names out = StringIO() out = tree.export_graphviz(clf, out_file=out, feature_names=["feature1", ""]) contents1 = out.getvalue() tree_toy = StringIO( "digraph Tree {\n" "0 [label=\"feature1 <= 0.0000\\nerror = 0.5" "\\nsamples = 6\\nvalue = [ 3. 3.]\", shape=\"box\"] ;\n" "1 [label=\"error = 0.0000\\nsamples = 3\\n" "value = [ 3. 0.]\", shape=\"box\"] ;\n" "0 -> 1 ;\n" "2 [label=\"error = 0.0000\\nsamples = 3\\n" "value = [ 0. 3.]\", shape=\"box\"] ;\n" "0 -> 2 ;\n" "}") contents2 = tree_toy.getvalue() assert contents1 == contents2, \ "graphviz output test failed\n: %s != %s" % (contents1, contents2) # test improperly formed feature_names out = StringIO() assert_raises(IndexError, tree.export_graphviz, clf, out, feature_names=[])
def visualize_tree(clf, outname, headers):
    from sklearn.externals.six import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, feature_names=list(headers))
    graph = pydot.graph_from_dot_data(
        dot_data.getvalue().decode('latin1').encode('utf8'))
    graph.write_pdf(outname)
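# --- Added example: a minimal usage sketch for the visualize_tree helper
# above. Everything here is illustrative: it assumes scikit-learn, pydot and
# Graphviz are installed, and uses the iris dataset as stand-in data. Note
# that the .decode('latin1').encode('utf8') round-trip in the helper only
# works on Python 2 strings, and that pydot >= 1.2 returns a list from
# graph_from_dot_data, in which case the helper would need
# graph[0].write_pdf(outname).
from sklearn import datasets, tree

iris = datasets.load_iris()
demo_clf = tree.DecisionTreeClassifier(max_depth=2)
demo_clf.fit(iris.data, iris.target)
visualize_tree(demo_clf, "iris_demo.pdf", iris.feature_names)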
def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train and classify using a Decision Tree, and print the decision tree."""
    decisionTree = DecisionTreeClassifier()
    model = decisionTree.fit(trainData, trainTargets)

    # Create graph description of the Decision Tree
    dot_data = StringIO()
    # export_graphviz(model, out_file=dot_data, max_depth=5)
    print("Feature names:", featureNames)
    export_graphviz(model, out_file=dot_data, feature_names=featureNames,
                    max_depth=5)
    export_graphviz(model, out_file="DecisionTree.dot",
                    feature_names=featureNames, max_depth=5)
    # with open("DecisionTree.dot", 'r') as dotFile:
    #     dotFile.write(exportFile)

    # Create PDF from dot
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # path = "/Users/konstantin/Documents/University/Bachelorthesis/paper/src/DecisionTree.dot"
    # graph = pydot.graph_from_dot_file(path)
    # graph.write_pdf("DecisionTree.pdf")

    classification = [model.predict(d)[0] for d in testData]

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
def decision_tree(train_features, train_labels, test_features, test_labels,
                  feature_names):
    regressor = tree.DecisionTreeRegressor()
    regressor.fit(train_features, train_labels)
    test_results = cap_results(regressor.predict(test_features))
    train_results = cap_results(regressor.predict(train_features))
    print("test result", metrics.mean_squared_error(test_labels, test_results))
    print("test r2", metrics.r2_score(test_labels, test_results))
    print("train result", metrics.mean_squared_error(train_labels, train_results))
    print("train r2", metrics.r2_score(train_labels, train_results))
    # print("importances")
    # temp = []
    # for index, val in enumerate(regressor.feature_importances_):
    #     if val > 0.001:
    #         temp.append((index, val))
    # print(sorted(temp, key=lambda x: x[1]))

    # graph stuff
    dot_data = StringIO()
    # class_names only applies to classifiers; a DecisionTreeRegressor has no
    # meaningful classes_, so it is omitted here
    tree.export_graphviz(regressor, out_file=dot_data,
                         special_characters=True,
                         impurity=False,
                         feature_names=feature_names)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("tree.pdf")
    return (test_results, train_results)
def tree3():
    global final_html
    global df, df_train, df_test, test_train_created, origin_df
    chi_key = list()
    init_style_string = template.style_string
    if request.method == 'POST':
        Listkey1 = list(MultiDict(request.form).values())
        Listkey2 = MultiDict(request.form)
        DV_tree = Listkey2.get('DV')
        df1 = df
        for key1 in Listkey1:
            if key1 != "Build Tree" and key1 != DV_tree:
                chi_key.append(key1)
        df1 = df.loc[:, chi_key]
        df2 = df1.values
        Y = df[DV_tree]
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(df2, Y.values)
        dot_data = StringIO()
        tree.export_graphviz(clf, out_file=dot_data)
        k = dot_data.getvalue()
        left_px = 600
        width_px = 150
        top_px = 50
        height_px = 309
        s = build_tree_html(k, init_style_string, left_px, width_px,
                            top_px, height_px)
        temp_df = df[0:15]
        t = """</div><div style="width:600px; height:700px; position: absolute; top: 20px; left:500px;"><br> Decision Tree result <br>"""
        final_html = template.s1 + t + k + "<br><br></div>" + temp_df.to_html()
        return final_html
    return 'helloo'
def drawDecisionTree(dt, filename, featureNames, classNames):
    dot_data = StringIO()
    print(featureNames)
    print(classNames)
    tree.export_graphviz(dt, out_file=dot_data,
                         feature_names=featureNames,
                         class_names=classNames,
                         rounded=True,
                         special_characters=True,
                         filled=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(filename)
def run_DT_model_2(df, criteria_col):
    # run the tree for various 0/1 labels (e.g.: high value or not)
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    from sklearn.externals.six import StringIO
    from IPython.display import Image
    import pydotplus

    print('criteria_col = ', criteria_col)
    tree_col = [criteria_col, 'Frequency', 'LTV', 'period_no_use',
                'AverageTimeToOrder', 'late_by_collection',
                'late_by_delivery', 'tickets', 'recleaned_orders',
                'cancalled_orders', 'voucher_used']
    df_train_ = df
    # df_train_tree = df_train_[tree_col]
    tree_data = df_train_[tree_col]
    tree_data = tree_data.dropna()
    tree_train, tree_test = train_test_split(tree_data, test_size=0.2,
                                             random_state=200,
                                             stratify=tree_data[criteria_col])
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(tree_train.iloc[:, 1:], tree_train[criteria_col])
    print(clf.score(tree_test.iloc[:, 1:], tree_test[criteria_col]))

    # confusion matrix
    print(confusion_matrix(tree_test[criteria_col],
                           clf.predict(tree_test.iloc[:, 1:])))

    # visualize the tree
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=tree_col[1:],
                         filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png()), tree_train, tree_test
def main(): if (len(sys.argv) < 2): print("One Argument Required; Training Set") return X_train, Y_train = ParseTraining(sys.argv[1]) #X_test, Y_test = ParseTraining(sys.argv[2]) #X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=99) #X_train, X_test, Y_train, Y_test = X, X, Y, Y #clf = tree.DecisionTreeClassifier() clf = tree.DecisionTreeClassifier(max_depth=6) #clf = OneVsRestClassifier(SVC(kernel="linear", C=0.025)) #clf = RandomForestClassifier(max_depth=6, n_estimators=10, max_features=1) #clf = SVC(kernel="linear", C=0.025) #clf = AdaBoostClassifier() #clf = SVC(gamma=2, C=1) clf = clf.fit(X_train, Y_train) #feature_names = ["partAvg", "recavg", "latency", "ReadRate"] feature_names = ["partConf", "recAvg", "latency", "ReadRate", "homeconf"] #feature_names = ["partAvg", "recAvg", "recVar", "ReadRate"] #feature_names = ["partAvg", "recAvg", "recVar"] #feature_names = ["recAvg", "recVar", "Read"] #feature_names = ["partAvg", "recVar"] ##class_names = ["Partition", "OCC", "2PL"] #class_names = ["OCC", "2PL"] class_names = ["Partition", "No Partition"] dot_data = StringIO() tree.export_graphviz(clf, out_file=dot_data, feature_names=feature_names, class_names=class_names, filled=True, rounded=True, special_characters=True) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph.write_png("partition.png")
def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0,
                 visualizeTree=False):
    """Classifies data using CART."""
    # initialize return values so the except-path return cannot hit
    # unbound variables
    accuracyRate, probabilities, timing, predicted = 0.0, [], 0.0, []
    try:
        # Perform classification
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion,
                                                     max_depth=maxDepth)
        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, maxDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()
        # Compare the predicted and ground truth and append result to list
        accuracyRate = round(metrics.accuracy_score(predicted, yte), 2)
        # Also append the probability estimates
        probs = cartClassifier.predict_proba(Xte)
        probabilities.append(probs)
        timing = endTime - startTime  # Keep track of performance
        if visualizeTree:
            # Visualize the tree
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            prettyPrint("Saving learned CART to \"tritonTree_%s.pdf\"" % getTimestamp(), "debug")
            graph.write_pdf("tree_%s.pdf" % getTimestamp())
    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")
    return accuracyRate, timing, probabilities, predicted
def mainTree():
    # the backslash continuations keep the literal on one logical string;
    # re.sub strips the leading spaces/tabs they introduce
    header = re.sub(' |\t', '', 'id|gender|age|height|edu|salary|nation|car|house|body|face|hair|\
        smoke|drink|child|parent|bmi|where0|where1|\
        marriage0|marriage1|look0|look1|where2').split('|')
    MaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/transed_M.txt',
                           names=header, sep='|')
    FemaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/cluster_female.txt',
                             names=header + ['class'], sep='|')
    matches = matchDict('/home/idanan/jiayuan/code/resources/lovers_ids.txt')
    FemaleData['id'] = FemaleData['id'].map(partial(match, matches=matches))
    FemaleClass = FemaleData[['id', 'class']]
    newMaleData = concatData(MaleData, FemaleClass)
    MaleArrays = scaleData(newMaleData, ['id', 'gender'])
    pca = factors(MaleArrays[:, :-1], 17)
    print('PCA explained variance:', sum(pca.explained_variance_ratio_))
    pcaMaleArray = pca.transform(MaleArrays[:, :-1])
    MaleArrays = np.c_[pcaMaleArray, MaleArrays]
    trainData, testData = departData(MaleArrays, 0.9)
    trainModel = decisionModel(trainData)
    dot_data = StringIO()
    tree.export_graphviz(trainModel, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("/home/idanan/jiayuan/code/resources/marriage.pdf")
    rate = test(trainModel, testData)
    print('Decision Model true rate', rate)
def tree2():
    global final_html
    global df, origin_df
    chi_key = list()
    firstkey = ""
    init_style_string = """<p style="position: absolute; font-size: 12px; top: <top>px; width: <width>px; height: <height>px; left:<left>px; text-align: center;">tree_text_here</p>"""
    if request.method == 'POST':
        Listkey1 = list(MultiDict(request.form).values())
        Listkey2 = MultiDict(request.form)
        DV_tree = Listkey2.get('DV')
        df1 = df
        for key1 in Listkey1:
            if key1 != "Build Tree" and key1 != DV_tree:
                chi_key.append(key1)
        df1 = df.loc[:, chi_key]
        df2 = df1.values
        temp_count = 0
        Y = df[DV_tree]
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(df2, Y.values)
        dot_data = StringIO()
        tree.export_graphviz(clf, out_file=dot_data)
        k = dot_data.getvalue()
        k1 = k.split(";")
        left_px = 600
        width_px = 150
        top_px = 50
        height_px = 309
        s = build_tree_html(k, init_style_string, left_px, width_px,
                            top_px, height_px)
        temp_df = df[0:15]
        t = """</div><div style="float:right;"><br> Decision Tree result <br>"""
        final_html = template.s1 + t + k + "</div><br><br><br>" + temp_df.to_html()
        return final_html
    return 'helloo'
def generate_plot(clf):
    print("\nGenerating plot...")
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("weather_forecast.pdf")
    print("Plot generated!")
def visualize_tree(dtree):
    dot_data = StringIO()
    tree.export_graphviz(dtree, out_file=dot_data,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    display(Image(graph.create_png()))
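# --- Added note: pydot changed graph_from_dot_data to return a *list* of
# graphs in version 1.2, so helpers like visualize_tree above can fail with an
# AttributeError on newer installs. A small compatibility shim (the name
# dot_to_graph is my own):
def dot_to_graph(dot_source):
    """Return a single pydot graph regardless of the installed pydot version."""
    result = pydot.graph_from_dot_data(dot_source)
    # pydot >= 1.2 returns a list of graphs; older versions return one graph
    return result[0] if isinstance(result, list) else result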
def train_network(self):
    """Pure virtual method for training the network."""
    db_query = self._database_session.query(PregameHitterGameEntry)
    mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(db_query, 0.8)
    X_train, Y_train = self.get_stochastic_batch(mlb_training_data,
                                                 self.SIZE_TRAINING_BATCH)
    self._decision_tree.fit(X_train, Y_train)
    dot_data = StringIO()
    tree.export_graphviz(self._decision_tree, out_file=dot_data,
                         feature_names=PregameHitterGameEntry.get_input_vector_labels())
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("hitter_tree.pdf")
    x_test_actual = list()
    y_test_actual = list()
    for data in mlb_evaluation_data:
        try:
            postgame_entry = self._database_session.query(PostgameHitterGameEntry).filter(
                PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                PostgameHitterGameEntry.game_date == data.game_date).one()
            y_test_actual.append([postgame_entry.actual_draftkings_points])
            x_test_actual.append(data.to_input_vector())
        except NoResultFound:
            print("Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id)
            continue
    self._database_session.close()
def create_tree(X, Y):
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(X, Y)

    from IPython.display import Image
    import pydotplus

    dot_data = StringIO()
    # tree.export_graphviz(clf, out_file=dot_data)
    # feature_names = ['Gender', 'Age']
    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35",
                     "36-50", "55+"]
    target_names = []
    for i in range(1, len(Y) + 1):
        target_names.append("Ad #" + str(i))
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=target_names,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("Tree.pdf")
    return clf
def dt_graph(treeest, cv, scores, features, labels, featnames, outfile):
    '''Retrains the tree estimator using the fold with the best results
    from the cross-validation process. Prints out a graph pdf file of
    that estimator.'''
    # Hacky way to get the training data for the best fold
    bestfold = np.argmax(scores)
    cnt = 0
    for train, _ in cv:
        # Only do stuff when you've got the training indices for the best fold
        if cnt == bestfold:
            # Fit
            treeest.fit(features[train], labels[train])
            # Get the dot file
            dot_data = StringIO()
            tree.export_graphviz(treeest, out_file=dot_data,
                                 feature_names=featnames)
            # Convert the dot file to a graph
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            graph.write_pdf(outfile)
            return
        else:
            cnt += 1
    print("You should never see this text from dt_graph!")
    return
def decisionTree():
    iris = load_iris()
    clf = tree.DecisionTreeClassifier(criterion='gini',
                                      splitter='best',
                                      max_depth=None,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      min_weight_fraction_leaf=0.0,
                                      max_features=None,
                                      random_state=None,
                                      max_leaf_nodes=None,
                                      class_weight=None)
    clf = clf.fit(iris.data, iris.target)
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,
                         filled=False, rounded=True,
                         special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("iris.pdf")
def export_tree(clf, filename, feature_names=None, max_depth=None):
    from sklearn.externals.six import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         max_depth=max_depth)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(filename)
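# --- Added example: a usage sketch for export_tree above, assuming `tree` is
# sklearn.tree and using the iris dataset as stand-in data.
from sklearn import datasets, tree

iris = datasets.load_iris()
clf_demo = tree.DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
export_tree(clf_demo, "iris_tree.pdf", feature_names=iris.feature_names,
            max_depth=2)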
def createGraph(clf):
    # write the raw dot source to disk ...
    with open("portScan.dot", 'w') as f:
        f = tree.export_graphviz(clf, out_file=f)
    # ... and render a PDF from a second, in-memory export
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("portScan.pdf")
def printPdf(clf, dataTrain):
    from sklearn.externals.six import StringIO
    import pydot

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf('sentiment.pdf')
    print(dataTrain.data[0])
def printTreePDF(self, path='./tree.pdf'):
    if self.clf is None:
        raise NameError('Tree was not created!')
    else:
        dot_data = StringIO()
        tree.export_graphviz(self.clf, out_file=dot_data)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf(path)
def view(classifier):
    """
    Renders a graph representation of classifier, and saves it to
    "MyTree.pdf" in the same folder as the executing script.
    """
    tree_dot = StringIO()
    tree.export_graphviz(classifier, out_file=tree_dot)
    graph = pydot.graph_from_dot_data(tree_dot.getvalue())
    graph.write_pdf("MyTree.pdf")
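# --- Added alternative: if pydot is unavailable, the same PDF can be produced
# by writing the dot source to disk and calling the Graphviz command-line tool
# directly; this sketch assumes the `dot` executable is on PATH (the helper
# name is my own).
import subprocess
from io import StringIO

from sklearn import tree

def view_with_dot_cli(classifier, pdf_path="MyTree.pdf"):
    tree_dot = StringIO()
    tree.export_graphviz(classifier, out_file=tree_dot)
    with open("MyTree.dot", "w") as f:
        f.write(tree_dot.getvalue())
    # render the dot source to PDF, which is what graph.write_pdf does internally
    subprocess.check_call(["dot", "-Tpdf", "MyTree.dot", "-o", pdf_path])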
def save_tree_png(self, store):
    import pydot
    from sklearn.externals.six import StringIO

    dot_data = StringIO()
    tree.export_graphviz(self.clf, out_file=dot_data,
                         feature_names=self.feature_names)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
    with open(store.dataset_path + '/tree.png', 'wb') as f:
        f.write(graph.create_png())
def export(self, fpath):
    """
    Export the decision tree as a PDF file

    :return: None
    """
    dot_data = StringIO()
    tree.export_graphviz(self.model, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(fpath)
def tree_vis(clf):
    # fn = ''.join([random.choice(string.ascii_lowercase + string.digits) for _ in range(10)])
    fn = 'tree'
    fn = 'data/trees/{0}.png'.format(fn)
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(fn)
    return Image(filename=fn)
def train_decision_tree_elite_status_classifier():
    """Trains and validates a decision tree model for predicting users' Elite status."""
    model = train_and_validate_elite_status_classifier(DecisionTreeClassifier,
                                                       DECISION_TREE_USER_ATTRIBUTES)

    # Output tree representation showing decision rules
    dot_data = StringIO()
    tree.export_graphviz(model, out_file=dot_data, class_names=True, filled=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf('analysis/analysis_results/decision_tree.pdf')
def drawDecisionTree(classIndex):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(preference, y[classIndex])
    dot_data = StringIO()
    # change it: class_names = cnames[classIndex]
    tree.export_graphviz(clf, out_file=dot_data, feature_names=fname,
                         filled=True, rounded=True, special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    filename = "decisionTree_" + str(classIndex) + ".pdf"
    graph.write_pdf(filename)
def __plotTree(clf, name):
    tree.export_graphviz(clf, out_file=outputdir + name)
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(outputdir + name + '.pdf')
    os.remove(outputdir + name)

# plot utilities
def createTreePdf(self):
    try:
        import pydot
    except ImportError:
        return
    dot_data = StringIO()
    tree.export_graphviz(self.getClf(), out_file=dot_data,
                         feature_names=self.featureNames)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("DT" + "-".join(self.classNames) + ".pdf")
                          csRoot, depth, criteria)

    # create the directories if they do not exist
    for dir in list([csRoot, dotPath, graphPath, csvPredictionPath]):
        dirName = os.path.dirname(dir)
        if not os.path.exists(dirName):
            os.makedirs(dirName)

    # create the csv with the predictions
    dfPredictions = dfCs.copy()
    dfPredictions = dfPredictions.iloc[y_test.index, ]
    dfPredictions[classLabel] = y_pred
    dfPredictions.to_csv(csvPredictionPath)

    dotFile = pydotplus.graph_from_dot_data(dot_data.getvalue()).to_string()
    # graphPath = ('{0}_ga.png' if applyPreProcessingWGA else "{0}.png")

    # regular expressions that strip unwanted elements from the decision tree
    # and improve the visualization
    import re
    dotFile = re.sub('(style="rounded")',
                     ' style="filled, rounded", fillcolor="#FFFFFF"', dotFile)
    dotFile = re.sub('(samples = [0-9]+<br\/>)', '', dotFile)
    dotFile = re.sub('(value = \[[0-9]+, [0-9]+\]<br\/>)', '', dotFile)
    dotFile = re.sub('(<br\/>class = [0-9])', '', dotFile)
    dotFile = re.sub('(<class = 1>)',
                     '<<b>smelly code</b>>, fillcolor="#e68743"', dotFile)
    dotFile = re.sub('(class = 0)', 'not smelly code', dotFile)
    # RE to modify the node size
def main():
    # Building Phase
    data = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = tarin_using_entropy(X_train, X_test, y_train)

    # Visualizing tree using Gini Index
    dot_data = StringIO()
    export_graphviz(clf_gini, out_file=dot_data, filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('gini_graph.png')
    Image(graph.create_png())
    print('\n')

    # Operational Phase
    print("Results Using Gini Index:")
    print("\n")

    # Prediction using gini
    y_pred_gini = prediction(X_test, clf_gini)

    # Test instance predictions
    print("\n")
    test1_set = [1, 1, 1, 1]
    print("Test instance 1: ", test1_set)
    test1 = clf_gini.predict([test1_set])
    print("Predicted label: ", test1)
    print("Actual label: B")
    print('\n')
    test2_set = [1, 3, 2, 3]
    print("Test instance 2: ", test2_set)
    test2 = clf_gini.predict([test2_set])
    print("Predicted label: ", test2)
    print("Actual label: R")
    print('\n')
    test3_set = [5, 4, 5, 1]
    print("Test instance 3: ", test3_set)
    test3 = clf_gini.predict([test3_set])
    print("Predicted label: ", test3)
    print("Actual label: L")
    print('\n')
    test7_set = [1, 4, 1, 4]
    print("Test instance 4: ", test7_set)
    test7 = clf_gini.predict([test7_set])
    print("Predicted label: ", test7)
    print("Actual label: B")
    print('\n')

    cal_accuracy(y_test, y_pred_gini)

    # Visualizing tree using Entropy
    dot_data = StringIO()
    export_graphviz(clf_entropy, out_file=dot_data, filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('entropy_graph.png')
    Image(graph.create_png())

    print("Results Using Entropy:")
    print('\n')

    # Prediction using entropy (note: these test instances now use
    # clf_entropy; the original used clf_gini here, apparently by mistake)
    y_pred_entropy = prediction(X_test, clf_entropy)
    print('\n')
    test4_set = [1, 1, 1, 1]
    print("Test instance 1: ", test4_set)
    test4 = clf_entropy.predict([test4_set])
    print("Predicted label: ", test4)
    print("Actual label: B")
    print('\n')
    test5_set = [1, 3, 2, 3]
    print("Test instance 2: ", test5_set)
    test5 = clf_entropy.predict([test5_set])
    print("Predicted label: ", test5)
    print("Actual label: R")
    print('\n')
    test6_set = [5, 4, 5, 1]
    print("Test instance 3: ", test6_set)
    test6 = clf_entropy.predict([test6_set])
    print("Predicted label: ", test6)
    print("Actual label: L")
    print('\n')
    test8_set = [1, 4, 1, 4]
    print("Test instance 4: ", test8_set)
    test8 = clf_entropy.predict([test8_set])
    print("Predicted label: ", test8)
    print("Actual label: B")
    print('\n')

    cal_accuracy(y_test, y_pred_entropy)
def _decision_tree_regression_train(table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
                                    criterion='mse', splitter='best',
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=None, random_state=None,
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None, presort=False,
                                    sample_weight=None, check_input=True,
                                    X_idx_sorted=None):
    regressor = DecisionTreeRegressor(criterion, splitter, max_depth,
                                      min_samples_split, min_samples_leaf,
                                      min_weight_fraction_leaf, max_features,
                                      random_state, max_leaf_nodes,
                                      min_impurity_decrease,
                                      min_impurity_split, presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight, check_input, X_idx_sorted)

    from sklearn.externals.six import StringIO
    from sklearn.tree import export_graphviz
    import pydotplus

    dot_data = StringIO()
    export_graphviz(regressor, out_file=dot_data,
                    feature_names=feature_cols,
                    filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    from brightics.common.report import png2MD
    fig_tree = png2MD(graph.create_png())

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices],
             color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
validation_confusion_tree = confusion_matrix(Y2, tree_val_predictions)
print("Confusion Matrix: Decision Tree")
print(validation_confusion_tree)
print("")

validation_confusion_knn = confusion_matrix(Y2, knn_val_predictions)
print("Confusion Matrix: KNN")
print(validation_confusion_knn)
print("")

# Here we can get a visual of the Decision Tree by copying the output and
# pasting into https://dreampuf.github.io/GraphvizOnline/
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydot

plot = StringIO()
export_graphviz(tree, out_file=plot, filled=True, rounded=True,
                special_characters=True)
print(plot.getvalue())

# Save the model using Pickle
import pickle
with open('knn_pickle', 'wb') as knn_model:
    pickle.dump(knn, knn_model)
def plotTree(treeName, tree, featureNames):
    treePic_dot = StringIO()
    export_graphviz(tree, out_file=treePic_dot,
                    feature_names=featureNames,
                    filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(treePic_dot.getvalue())
    Image(graph.create_png())
    graph.write_png(treeName + '.png')
def _decision_tree_classification_train(table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
                                        criterion='gini', splitter='best',
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None, random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        class_weight=None, presort=False,
                                        sample_weight=None, check_input=True,
                                        X_idx_sorted=None):
    feature_names, features = check_col_type(table, feature_cols)
    y_train = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    class_labels = sorted(set(y_train))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError("Number of class weights should match number of labels.")
        else:
            class_weight = {class_labels[i]: class_weight[i]
                            for i in range(len(class_labels))}

    classifier = DecisionTreeClassifier(criterion, splitter, max_depth,
                                        min_samples_split, min_samples_leaf,
                                        min_weight_fraction_leaf, max_features,
                                        random_state, max_leaf_nodes,
                                        min_impurity_decrease,
                                        min_impurity_split, class_weight,
                                        presort)
    classifier.fit(features, table[label_col],
                   sample_weight, check_input, X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus

        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data,
                        feature_names=feature_names,
                        class_names=classifier.classes_.astype(np.str),
                        filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ "
                    "and install it to your computer.")

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_names)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices],
             color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
lenses_list = []
# print(lenses_dict)  # print the dictionary
lenses_pd = pd.DataFrame(lenses_dict)  # build a pandas.DataFrame
# print(lenses_pd)  # print the DataFrame
le = LabelEncoder()  # create a LabelEncoder object, used to encode the columns
for col in lenses_pd.columns:  # encode each column
    lenses_pd[col] = le.fit_transform(lenses_pd[col])
# print(lenses_pd)  # print the encoded data

clf = tree.DecisionTreeClassifier(max_depth=6)  # create the DecisionTreeClassifier
clf = clf.fit(lenses_pd.values.tolist(), lenses_target)  # build the decision tree from the data
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,  # draw the decision tree
                     feature_names=lenses_pd.keys(),
                     class_names=clf.classes_,
                     filled=True, rounded=True,
                     special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# the replace() below fixes garbled Chinese characters in the rendered tree
graph = pydotplus.graph_from_dot_data(dot_data.getvalue().replace(
    'helvetica', '"Microsoft YaHei"'))
print(dot_data.getvalue())
graph.write_pdf("tree.pdf")  # save the rendered tree as a PDF
print(clf.predict([[1, 1, 1, 0]]))  # predict
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.externals.six import StringIO

train = pd.read_excel("py_tree_learn.xls", "Sheet1")
# print(train)
_ = train.fillna(9999, inplace=True)
# print(train)
train_data = train.iloc[:, :-1]
train_target = train.iloc[:, -1]
# print(train_data)
train_data_1 = train_data.values
train_target_1 = train_target.values
# print(train_data_1)

clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=6)
clf = clf.fit(train_data_1, train_target_1)
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
print(dot_data.getvalue())

# new_train = (train['PENSION_FUND_STATUS'])
# print(new_train)
# print(new_train.value_counts())
# print(pd.crosstab(train.PENSION_FUND_STATUS, train.target_new, margins=True))
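# --- Added alternative: since the snippet above only dumps the raw dot source
# to the console, scikit-learn >= 0.21 offers export_text, which prints a
# plain-text rendering of the fitted tree with no Graphviz dependency; a
# minimal sketch, assuming the column names of train_data are strings.
from sklearn.tree import export_text

print(export_text(clf, feature_names=list(train_data.columns)))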
# Graphing trees
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
# The following packages must be installed first (shell commands, not Python):
#   pip install graphviz
#   pip install pydotplus
#   pip install pyparsing
import pydotplus as pypl

# Graphing 5 leaves node
dot_data5 = StringIO()
export_graphviz(classifier5, out_file=dot_data5, filled=True, rounded=True,
                special_characters=True, class_names=['0', '1'])
graph = pypl.graph_from_dot_data(dot_data5.getvalue())
Image(graph.create_png())
graph.write_png('5leavesNode.png')

# Graphing 15 leaves node
dot_data15 = StringIO()
export_graphviz(classifier15, out_file=dot_data15, filled=True, rounded=True,
                special_characters=True, class_names=['0', '1'])
graph = pypl.graph_from_dot_data(dot_data15.getvalue())
Image(graph.create_png())
graph.write_png('15leavesNode.png')

# Graphing 25 leaves node
dot_data25 = StringIO()
clf_gini.fit(X_train, y_train)

# Decision Tree with Information Entropy
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                     max_depth=7, min_samples_leaf=5)
clf_entropy.fit(X_train, y_train)

# evaluate algorithm
y_pred = clf_gini.predict(X_test)
y_pred_en = clf_entropy.predict(X_test)

# Accuracy
print("Accuracy for gini", accuracy_score(y_test, y_pred) * 100)
print("Accuracy for entropy", accuracy_score(y_test, y_pred_en) * 100)

import graphviz
list(X)
tree.export_graphviz(clf_gini, out_file='tree.dot')

from sklearn.externals.six import StringIO
import pydot

dot_data = StringIO()
tree.export_graphviz(clf_gini, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("Elastomer.pdf")

# use a fresh buffer here: reusing the old one would leave the gini dot
# source in front of the entropy tree's
dot_data = StringIO()
tree.export_graphviz(clf_entropy, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("Elastomer_Entropy.pdf")
training_data = np.array(training_data)
training_class = np.array(training_class)
test_data = np.array(test_data)
test_class = np.array(test_class)

# building the classifier (the option random_state=RandomState(130) makes the
# algorithm deterministic)
clf = tree.DecisionTreeClassifier(criterion='gini',
                                  random_state=RandomState(130))
clf = clf.fit(training_data, training_class)

# print the decision tree in a pdf file
from sklearn.externals.six import StringIO
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris.pdf")

# the following code evaluates the decision tree on the test set and computes
# a confidence interval for the accuracy. You should create a list a, where
# a[i]=1 if the ith record test_data[i] has been classified correctly and 0
# otherwise. Remember, a.append(1) adds one more element to the list with
# value = 1.
a = []
pre_class = clf.predict(test_data)
for i in range(0, len(test_data)):
    if test_class[i] == pre_class[i]:
        a.append(1)
    else:
        a.append(0)

# The following code computes a confidence interval for the accuracy. The
# first argument is the confidence,
def show_tree(clf):
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("titanic_tree.pdf")
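# --- Added alternative: on scikit-learn >= 0.21 the same figure can be drawn
# without Graphviz or pydotplus at all, via the matplotlib-based plot_tree; a
# minimal sketch (the helper name is my own).
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

def show_tree_mpl(clf, out_path="titanic_tree.pdf"):
    fig, ax = plt.subplots(figsize=(12, 8))
    plot_tree(clf, filled=True, rounded=True, ax=ax)
    fig.savefig(out_path)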
    confusion_matrix(y_validacao, y_predicao_validacao)))
print("Confusion matrix for the test set:\n {}".format(
    confusion_matrix(y_teste, y_predicao_teste)))

# I have not managed to generate a *.png here
arquivo_dot = StringIO()
tree.export_graphviz(modeloAD,
                     out_file=arquivo_dot,
                     node_ids=True,
                     feature_names=['Sangue', 'Da a luz', 'Pode voar', 'Mora na agua'],
                     class_names=['SIM', 'NAO'],
                     filled=True)
arvore = pdp.graph_from_dot_data(arquivo_dot.getvalue())

lista_edge = []
for edge in arvore.get_edge_list():
    lista_edge.append(edge.get_source())

nodes = arvore.get_node_list()
for node in nodes:
    if node.get_name() == '0':
        node.set_fillcolor('#F19C99')
    elif node.get_name() not in lista_edge:
        node.set_fillcolor('#E1D5E7')
    else:
        node.set_fillcolor('#D5E8D4')

arvore.write_png("arvore_mamifero.png")
def printTree(clf):
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph.write_png('tree.png')
# Train model
kyphosis_features = kyphosis.columns[1:]
kyphosis_dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=None,
                                         min_samples_split=2,
                                         min_samples_leaf=1)
kyphosis_dt_clf = kyphosis_dt_clf.fit(kyphosis_train[kyphosis_features],
                                      kyphosis_train['Kyphosis'])

# Print a string representation of the tree.
# If you have graphviz (www.graphviz.org) installed, you can write a pdf
# visualization using graph.write_pdf(filename)
kyphosis_dt_data = StringIO()
tree.export_graphviz(kyphosis_dt_clf, out_file=kyphosis_dt_data)
kyphosis_dt_graph = pydotplus.parser.parse_dot_data(kyphosis_dt_data.getvalue())
print(kyphosis_dt_graph.to_string())

# Predict classes of test set and evaluate
kyphosis_dt_pred = kyphosis_dt_clf.predict(kyphosis_test[kyphosis_features])
kyphosis_dt_cm = metrics.confusion_matrix(kyphosis_test['Kyphosis'],
                                          kyphosis_dt_pred,
                                          labels=['absent', 'present'])
print(kyphosis_dt_cm)
kyphosis_dt_acc = metrics.accuracy_score(kyphosis_test['Kyphosis'],
                                         kyphosis_dt_pred)
kyphosis_dt_prec = metrics.precision_score(kyphosis_test['Kyphosis'],
                                           kyphosis_dt_pred,
                                           pos_label='absent')
kyphosis_dt_rec = metrics.recall_score(kyphosis_test['Kyphosis'],
tree_mod = DecisionTreeRegressor(max_depth=5)
tree_mod.fit(x_train[final_columns], y_train)
y_1_pred = tree_mod.predict(x_test[final_columns])

x1 = np.asanyarray(y_1_pred)
x2 = np.asanyarray(y_test)
this_rmse = np.sqrt(np.mean(np.square(x1 - x2)))

from sklearn import tree
tree.export_graphviz(tree_mod, out_file='tree.dot')  # produces dot file

import pydot
# start from an empty buffer: StringIO('tree.dot') would pre-fill the buffer
# with the literal text "tree.dot" rather than open the file
dotfile = StringIO()
tree.export_graphviz(tree_mod, out_file=dotfile)
pydot.graph_from_dot_data(dotfile.getvalue()).write_png("dtree2.png")

# ############# Random Forest modelling ###########################
fr_regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
fr_regr.fit(x_train[final_columns], y_train)
y_1_pred = fr_regr.predict(x_test[final_columns])

x1 = np.asanyarray(y_1_pred)
x2 = np.asanyarray(y_test)
this_rmse_rf = np.sqrt(np.mean(np.square(x1 - x2)))
# #########################################################################################
X = pd.DataFrame(data, columns=feature_names)
y = pd.Categorical.from_codes(target, target_names)
X.head()
y = pd.get_dummies(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=1)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

dot_data = StringIO()
export_graphviz(dt, out_file=dot_data, feature_names=feature_names,
                class_names=target_names)
(graph, ) = graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# graph.write_png("crack.png")

y_pred = dt.predict(X_test)
sets = np.array(y_test).argmax(axis=1)
predictions = np.array(y_pred).argmax(axis=1)
matrix = confusion_matrix(sets, predictions)
print(matrix)

tp1 = matrix[0][0]
fp1 = matrix[1][0] + matrix[2][0]
fn1 = matrix[0][1] + matrix[0][2]
tn1 = matrix[1][1] + matrix[2][2]
prec1 = tp1 / (tp1 + fp1)
recall1 = tp1 / (tp1 + fn1)
f11 = 2 * ((prec1 * recall1) / (prec1 + recall1))
def fit_population_cv(population, target_column_name, identifier_column_name,
                      table_name, folds=3, parameters=None):
    """
    The internal wrapper of GridSearchCV fit algorithm for DecisionTree of scikit-learn.

    :param population: the population data whose functional type is 'table'
    :type population: dict

    :param target_column_name: the name of the attribute providing the class
        label of the observed subject. Must match one of the available
        population attributes.
    :type target_column_name: str

    :param identifier_column_name: the name of the attribute identifying each
        observed subject. Must match one of the available population attributes.
    :type identifier_column_name: str

    :param parameters: a dictionary containing the list of values to be tested
        that were parsed
    :type parameters: dict

    :param table_name: name of the table to create
    :type table_name: str

    :param folds: number of folds used for the cross validation
    :type folds: int

    :raises IkatsException: error occurred.
    """
    if table_name is None or re.match('^[a-zA-Z0-9-_]+$', table_name) is None:
        raise ValueError("Error in table name")

    LOGGER.info("Starting Decision Tree CV Fit with scikit-learn")

    # To avoid having a dict as default arg of a function
    if parameters is None:
        parameters = {'max_depth': None, 'class_weight': False}

    try:
        desc_population = population.get('table_desc', None)
        LOGGER.info("with Population table_desc= %s", desc_population)

        # 1/ prepare the learning set
        #
        feature_vectors, target, class_names, column_names = split_population(
            population, target_column_name, identifier_column_name)

        # 2/ prepare the DecisionTree and CrossValidation procedure
        #
        mdl = tree.DecisionTreeClassifier()
        gcv = GridSearchCV(mdl, param_grid=parameters, cv=folds)
        gcv.fit(X=feature_vectors, y=target)
        LOGGER.info(" ... finished fitting the Decision Tree CV to data")

        LOGGER.info(" - Exporting Decision Tree CV to dot format")
        dot_io = StringIO()
        tree.export_graphviz(gcv.best_estimator_, out_file=dot_io,
                             feature_names=column_names,
                             class_names=class_names,
                             filled=True, label='all')
        dot = dot_io.getvalue()
        LOGGER.info(" ... finished exporting the Decision Tree CV to dot format")

        # Formatting the result dictionary to an IKATS table
        formatted_results = _fill_table_cv_results(gcv.cv_results_)

        best_params = gcv.best_params_
        best_params['balancing'] = best_params.pop('class_weight')
        best_params['max_depth'] = 0 if best_params['max_depth'] is None \
            else best_params['max_depth']
        best_params['balancing'] = best_params['balancing'] is not None
        formatted_best_params = json.dumps(best_params)

        LOGGER.info("... ended Decision Tree CV Fit with scikit-learn")

        # Save the table
        description = "Result of Decision Tree Cross Validation operator"
        formatted_results['table_desc']['name'] = table_name
        formatted_results['table_desc']['desc'] = description
        IkatsApi.table.create(data=formatted_results)

        return gcv.best_estimator_, dot, formatted_best_params, table_name
    except IkatsException:
        raise
    except Exception:
        msg = "Unexpected error: fit_population(..., {}, {}, {})"
        raise IkatsException(msg.format(target_column_name,
                                        identifier_column_name, parameters))
def run_decision_tree(training_features, training_labels, test_features,
                      test_labels, passed_parameters=None, headings=None):
    """
    Classifies the data using sklearn's decision tree.
    Does not natively support pruning, so max_depth is being used.

    Parameters
    ----------
        training_data: data used to train the classifier. For each row,
            item 0 assumed to be the label
        test_data: data used to test the classifier. For each row,
            item 0 assumed to be the label
        max_depth: maximum tree depth to be applied (will simulate pruning)

    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """
    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()

    # set up parameters for the classifier
    if passed_parameters is None:
        parameters = {'max_depth': None}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2,
                      random_state=0)

    # plot the validation curves
    for param in parameters:
        if is_number(parameters[param][0]):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    # set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv,
                              param_grid=parameters)

    # fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    # show the best result
    estimator = tree.DecisionTreeClassifier(
        max_depth=classifier.best_estimator_.max_depth,
        criterion=classifier.best_estimator_.criterion)
    estimator.fit(training_features, training_labels)

    # plot the learning curve
    title = 'Learning Curves \n(Decision Tree, max depth=%i)' % classifier.best_estimator_.max_depth
    plot_learning_curve(estimator, title, training_features, training_labels,
                        cv=cv)
    pylab.savefig(os.path.join(results_location,
                               'Learning Curves - Decision Tree.png'))
    # plt.show()

    # save the visualization of the decision tree; only use the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator, out_file=tree_data, max_depth=5,
                         feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))

    time_3 = time.time()

    # output time stats
    # time 1 -> time 2 is optimization time
    # time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    # output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
def parse_tree(file_path, outputfile):
    """
    :param file_path: Filepath of the tree !! Use same python version as for
        training and saving
    :param outputfile: Where do we save the treefile we output
    :return:
    """
    with open(file_path, 'rb') as tree_file:
        tree_model = pickle.load(tree_file)

    feature_names = tree_model.extract_features_names()
    decision_model = tree_model.classifier
    dot_data = StringIO()
    export_graphviz(decision_model, out_file=dot_data,
                    filled=True, rounded=True,
                    special_characters=True, label='all',
                    class_names=decision_model.classes_.astype(str),
                    impurity=False,
                    feature_names=feature_names)

    input_label = ['class = %d' % i
                   for i in range(tree_model.max_quality_change * 2 + 1)]
    output_label = ['Action : ' + ''.join([low] * i)
                    for i in range(tree_model.max_quality_change, 0, -1)]
    output_label += ['Action : ' + same]
    output_label += ['Action : ' + ''.join([up] * i)
                     for i in range(1, tree_model.max_quality_change + 1)]
    class_label_mapper = {in_l: out_l
                          for in_l, out_l in zip(input_label, output_label)}

    input_label = ['_switch_%d' % i
                   for i in range(-tree_model.max_quality_change,
                                  tree_model.max_quality_change + 1)]
    output_label = ['Switch : ' + ''.join([low] * i)
                    for i in range(tree_model.max_quality_change, 0, -1)]
    output_label += ['Switch : ' + same]
    output_label += ['Switch : ' + ''.join([up] * i)
                     for i in range(1, tree_model.max_quality_change + 1)]
    switch_mapper = [(in_l, out_l)
                     for in_l, out_l in zip(input_label, output_label)]

    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    for n in graph.get_nodes():
        label = n.get_label()
        if label:
            label_parsed = label[1:-1].split('<br/>')
            if len(label_parsed) == 3:
                sample_size, values, class_label = label[1:-1].split('<br/>')
            else:
                feature_name, sample_size, values, class_label = label[1:-1].split('<br/>')
            certainty = min(float(sample_size.split('=')[-1].strip())
                            / MAX_N_SAMPLES_CONFIDENCE, 1.0)
            label_parsed = parse_node_label(label, class_label_mapper,
                                            switch_mapper)
            n.set_label(label_parsed)
            reference_color = np.array(reference_colormap(1.0))
            reference_color[-1] = certainty
            reference_color *= reference_color
            n.set_fillcolor(convert_rgba2hex((reference_color * 255).astype(int)))

    png_binary = graph.create_png()
    with open(outputfile, 'wb') as outpng:
        outpng.write(png_binary)
def test_graphviz_toy():
    # Check correctness of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=1,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    out = StringIO()
    export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test with feature_names
    out = StringIO()
    export_graphviz(clf, out_file=out,
                    feature_names=["feature0", "feature1"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test with class_names
    out = StringIO()
    export_graphviz(clf, out_file=out, class_names=["yes", "no"])
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = yes"] ;\n' \
                '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n' \
                'class = yes"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n' \
                'class = no"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False,
                    proportion=True, special_characters=True, rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                '0 [label=<X<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>' \
                'value = [0.5, 0.5]>, fillcolor="#e5813900"] ;\n' \
                '1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, ' \
                'fillcolor="#399de5ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test max_depth
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, class_names=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box] ;\n' \
                '0 [label="X[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n' \
                'value = [3, 3]\\nclass = y[0]"] ;\n' \
                '1 [label="(...)"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)"] ;\n' \
                '0 -> 2 ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test max_depth with plot_options
    out = StringIO()
    export_graphviz(clf, out_file=out, max_depth=0, filled=True,
                    node_ids=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="node #0\\nX[0] <= 0.0\\ngini = 0.5\\n' \
                'samples = 6\\nvalue = [3, 3]", fillcolor="#e5813900"] ;\n' \
                '1 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 1 ;\n' \
                '2 [label="(...)", fillcolor="#C0C0C0"] ;\n' \
                '0 -> 2 ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test multi-output with weighted samples
    clf = DecisionTreeClassifier(max_depth=2,
                                 min_samples_split=1,
                                 criterion="gini",
                                 random_state=2)
    clf = clf.fit(X, y2, sample_weight=w)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, impurity=False)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled", color="black"] ;\n' \
                '0 [label="X[0] <= 0.0\\nsamples = 6\\n' \
                'value = [[3.0, 1.5, 0.0]\\n' \
                '[3.0, 1.0, 0.5]]", fillcolor="#e5813900"] ;\n' \
                '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n' \
                '[3, 0, 0]]", fillcolor="#e58139ff"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="X[0] <= 1.5\\nsamples = 3\\n' \
                'value = [[0.0, 1.5, 0.0]\\n' \
                '[0.0, 1.0, 0.5]]", fillcolor="#e5813986"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="False"] ;\n' \
                '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n' \
                '[0, 1, 0]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 3 ;\n' \
                '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n' \
                '[0.0, 0.0, 0.5]]", fillcolor="#e58139ff"] ;\n' \
                '2 -> 4 ;\n' \
                '}'
    assert_equal(contents1, contents2)

    # Test regression output with plot_options
    clf = DecisionTreeRegressor(max_depth=3,
                                min_samples_split=1,
                                criterion="mse",
                                random_state=2)
    clf.fit(X, y)

    out = StringIO()
    export_graphviz(clf, out_file=out, filled=True, leaves_parallel=True,
                    rotate=True, rounded=True)
    contents1 = out.getvalue()
    contents2 = 'digraph Tree {\n' \
                'node [shape=box, style="filled, rounded", color="black", ' \
                'fontname=helvetica] ;\n' \
                'graph [ranksep=equally, splines=polyline] ;\n' \
                'edge [fontname=helvetica] ;\n' \
                'rankdir=LR ;\n' \
                '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \
                'value = 0.0", fillcolor="#e5813980"] ;\n' \
                '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \
                'fillcolor="#e5813900"] ;\n' \
                '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \
                'headlabel="True"] ;\n' \
                '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \
                'fillcolor="#e58139ff"] ;\n' \
                '0 -> 2 [labeldistance=2.5, labelangle=45, ' \
                'headlabel="False"] ;\n' \
                '{rank=same ; 0} ;\n' \
                '{rank=same ; 1; 2} ;\n' \
                '}'
    assert_equal(contents1, contents2)
def writeTree(treeModel, namesList, filename):
    # utility function that plots a decision tree and saves to file
    dot_data = StringIO()
    export_graphviz(treeModel, out_file=dot_data, feature_names=namesList)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(filename)
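# --- Added variant: since scikit-learn 0.20, export_graphviz returns the dot
# source as a string when out_file=None, which removes the StringIO buffer
# entirely; a sketch of the same helper under that assumption, on pydot >= 1.2
# (which returns a list of graphs). The helper name is my own.
def writeTree_nobuffer(treeModel, namesList, filename):
    dot_source = export_graphviz(treeModel, out_file=None,
                                 feature_names=namesList)
    graph = pydot.graph_from_dot_data(dot_source)[0]
    graph.write_pdf(filename)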
def decision_tree(df_sub, df_train, df_test):
    '''
    wine = pd.read_csv('wine_data.csv', names=["Cultivator", "Alcohol", "Malic_Acid", "Ash", "Alcalinity_of_Ash", "Magnesium", "Total_Phenols", "Falvanoids", "Nonflavanoid_phenols", "Proanthocyanins", "Color_intensity", "Hue", "OD280", "Proline"])
    # Look at the data
    wine.head()
    wine.describe().transpose()
    X = wine.drop('Cultivator', axis=1)
    y = wine['Cultivator']
    '''
    df_submission = df_sub
    df_train = df_train
    df_test = df_test
    drop_list = ['Survived', 'Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId', 'Embarked']
    drop_list2 = ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId', 'Embarked']
    X_train = df_train.drop(drop_list, axis=1)
    X_test = df_test.drop(drop_list2, axis=1)
    y_train = df_train['Survived']
    y_test = df_submission['Survived']
    print(X_train.head())
    print(X_test.head())
    '''
    col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
    pima = pd.read_csv("pima-indians-diabetes.csv", header=None, names=col_names)
    pima.head()
    # split dataset in features and target variable
    feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
    X = pima[feature_cols]
    y = pima.label
    '''

    # Create Decision Tree classifier object
    clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10)

    # Train Decision Tree Classifier
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    print('Training Accuracy: ', metrics.accuracy_score(y_train, y_pred))
    y_pred = clf.predict(X_test)

    # Model Accuracy, how often is the classifier correct?
    print("Test Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print(pd.crosstab(y_test, y_pred, rownames=['True'],
                      colnames=['Predicted'], margins=True))

    # PREP DATA FOR LOOPING
    df_X_train = pd.DataFrame(X_train)
    df_X_test = pd.DataFrame(X_test)
    df_y_train = pd.DataFrame(y_train)
    df_y_test = pd.DataFrame(y_test)
    X = df_X_train.append(df_X_test)
    y = df_y_train.append(df_y_test)

    # LEARNING CURVE: LOOP FOR DIFFERENT TRAINING SIZES
    # n_range = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
    n_range = [0.01, 0.02, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 0.98, 0.99]
    scores = {}
    scores_list = []
    train_scores_list = []
    for n_size in n_range:
        print('n_range', n_size)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=n_size,
                                                            random_state=10)  # random_state=4
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        scores_list.append(metrics.accuracy_score(y_test, y_pred))
        # Train scores, for learning curves
        y_pred_train = clf.predict(X_train)
        train_scores_list.append(metrics.accuracy_score(y_train, y_pred_train))
    print("TRAINING SIZE")
    print('scores_lis', scores_list)
    a = ['0.01', '0.02', '0.1', '0.25', '0.4', '0.5', '0.6', '0.75', '0.9', '0.98', '0.99']
    plt.plot(a, scores_list, a, train_scores_list)  # plt.plot(n_range, scores_list, n_range, train_scores_list)
    plt.title('Test Accuracy v. Train Accuracy, Decision Trees (Titanic)')
    plt.legend(['Test', 'Train'])
    plt.xlabel('Test Split')
    plt.ylabel('Accuracy')
    plt.ylim((0, 1.0))
    plt.savefig('Titanic/mlp_Titanic_testSize.png')
    plt.show()

    # LEARNING CURVE: LOOP FOR DIFFERENT MAX DEPTHS
    l_range = [1, 5, 10, 15, 20]
    # scores = {}
    learning_list = []
    time_list = []
    for l_rate in l_range:
        print('l_rate', l_rate)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.3,
                                                            random_state=10)  # random_state=4
        clf = DecisionTreeClassifier(max_depth=l_rate, min_samples_leaf=10)
        start_time = time.time()
        clf.fit(X_train, y_train)
        end_time = time.time()
        elapsed = end_time - start_time
        time_list.append(elapsed)
        y_pred = clf.predict(X_test)
        learning_list.append(metrics.accuracy_score(y_test, y_pred))
    print('LEARNING RATE')
    print('scores_lis', learning_list)
    plt.plot(['1', '5', '10', '15', '20'], learning_list)  # plt.plot(l_range, learning_list)
    plt.title('Decision Tree Accuracy at Varying Max Depths (Titanic)')
    plt.xlabel('Max Depth')
    plt.ylabel('Testing Accuracy')
    plt.ylim((0, 1.0))
    plt.savefig('Titanic/dt_Titanic_learningRate.png')
    plt.show()

    plt.plot(['1', '5', '10', '15', '20'], time_list)
    plt.xlabel('Max Depth')
    plt.ylabel('Training Time (sec)')
    plt.title('Decision Tree Training Time at Varying Max Depths (Titanic)')
    # plt.ylim((0, 0.01))
    plt.savefig('Titanic/dt_Titanic_trainingTime.png')
    plt.show()

    import graphviz
    '''
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("iris")
    '''
    from sklearn.externals.six import StringIO
    from IPython.display import Image
    import pydotplus

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         filled=True, rounded=True,
                         special_characters=True,
                         # feature_names=feature_cols,
                         # class_names=['0', '1'],
                         )
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('Titanic/decisionTree_Titanic.png')
    Image(graph.create_png())
    print('after graphviz')

    # Section: Optimizing Decision Tree Performance
    '''
    # Create Decision Tree Classifier Object
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)
    # min_samples_leaf can be set to 5%, max_leaf_nodes can also be set
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    '''
    '''
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data,
                    filled=True, rounded=True,
                    special_characters=True,
                    feature_names=feature_cols,
                    class_names=['0', '1'])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('diabetes.png')
    Image(graph.create_png())
    '''
    # raise NotImplementedError
    return
def classify(**args):
    """
    Main method that prepares the dataset, builds the model, executes training
    and displays results.

    :param args: keyword arguments passed from cli parser
    """
    # only allow print-outs if execution has no repetitions
    allow_print = args['repetitions'] == 1
    # determine classification targets and parameters to construct datasets properly
    cls_target, cls_str = set_classification_targets(args['cls_choice'])
    d = prepare_dataset(
        0,  # any synthetic
        cls_target,
        args['batch_size'],
        train_shuffle_repeat=False,
        categorical_labels=False)

    print('\n\tTask: Classify «{}» using «{}» with DecisionTreeClassifier\n'.format(
        cls_str, d['data_str']))
    print_dataset_info(d)

    model = DecisionTreeClassifier(class_weight='balanced')

    # empty train data generator into list, then train. Careful with RAM
    train_data = [
        sample for batch in tqdm(
            d['train_data'], total=d['train_steps'], desc='prep_train')
        for sample in batch[0]
    ]
    model.fit(train_data, d['train_labels'])
    del train_data

    # predict on testset and calculate classification report and confusion matrix for diagnosis
    d = prepare_dataset(
        2,  # any handheld
        cls_target,
        args['batch_size'],
        train_shuffle_repeat=False,
        categorical_labels=False)

    test_data = [
        sample for batch in tqdm(
            d['test_data'], total=d['test_steps'], desc='prep_test')
        for sample in batch[0]
    ]
    print_dataset_info(d)
    pred = model.predict(test_data)
    del test_data

    if allow_print:
        # visualise decision tree, from
        # datacamp.com/community/tutorials/decision-tree-classification-python
        dot_data = StringIO()
        export_graphviz(model, out_file=dot_data, filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf('img/decision_tree.pdf')

    diagnose_output(d['test_labels'], pred, d['classes_trans'])
    return balanced_accuracy_score(d['test_labels'], pred)
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]

# 2. Train a classifier
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)

# 3. Predict labels for the new flowers
print(test_target)             # [0, 1, 2]
print(clf.predict(test_data))  # spits out the same labels [0, 1, 2]

# 4. Visualize the tree
from io import StringIO  # sklearn.externals.six is gone in modern scikit-learn

dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True, impurity=False)
print("should export the tree.dot")

import graphviz as gp
graph = gp.Source(dot_data.getvalue())
graph.render("iris", view=True)
os.environ["PATH"] += os.pathsep + 'c:/Program Files (x86)/Graphviz2.38/bin/' #%% graph1 = Source(tree.export_graphviz(clf, out_file=None, class_names= ['0', '1'] , filled = True)) display(SVG(graph1.pipe(format='svg'))) #change labels names graph2 = Source( tree.export_graphviz(clf, out_file=None, feature_names=X.columns, filled=True, class_names=['NoDiabetis','Diabetis'])) graph2 #change max_depth : 1 to 4 Source(tree.export_graphviz(clf, out_file=None, max_depth=1, feature_names=X.columns, class_names=['NonDB','DB'], label='all', filled=True, leaves_parallel=True, impurity=True, node_ids=True, proportion=True, rotate=True, rounded=True, special_characters=False, precision=1)) #https://stackoverflow.com/questions/27817994/visualizing-decision-tree-in-scikit-learn # This is for saving image in file system #https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html import pydotplus dotfile = StringIO() tree.export_graphviz(clf, out_file=dotfile, filled=True, feature_names=X.columns, class_names=['NoDiabetis','Diabetis']) pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png("E:/graphs/dtree2.png") #True should be returned. goto location and see the file #%%% Create Decision Tree classifer object #change max_depth at the time of creation and method #criterio= entropy, gini clf3 = DecisionTreeClassifier(criterion="entropy", max_depth=3) # Train Decision Tree Classifer clf3 = clf3.fit(X_train,y_train) #Visualise Source(tree.export_graphviz(clf3, out_file=None, class_names= ['0', '1'] , filled = True, feature_names=X.columns,node_ids=True)) #display(SVG(graph3b.pipe(format='svg'))) X_train[0:1] #Class:1 : glucose > 127, glucose < 158, bmi, age, #Predict the response for test dataset y_pred3 = clf3.predict(X_test)
def show_tree(self):
    """Return a PNG of the tree."""
    assert self.clf

    try:
        import pydotplus as pydot
    except ImportError:
        import pydot  # dirty hack for read the docs

    dot_data = StringIO()
    tree.export_graphviz(self.clf, out_file=dot_data,
                         feature_names=self.feature_names)
    dot_data = dot_data.getvalue()
    graphs = pydot.graph_from_dot_data(dot_data)
    # pydot returns a list of graphs; pydotplus returns a single graph
    graph = graphs[0] if isinstance(graphs, list) else graphs
    img = graph.create_png()
    return img


# if __name__ == '__main__':
#     from test import test_utilities
#     import matplotlib.pyplot as plt
#
#     ema_logging.log_to_stderr(ema_logging.INFO)
#
#     def scarcity_classify(outcomes):
#         outcome = outcomes['relative market price']
#         change = np.abs(outcome[:, 1::] - outcome[:, 0:-1])
#
#         neg_change = np.min(change, axis=1)
#         pos_change = np.max(change, axis=1)
#
#         logical = (neg_change > -0.6) & (pos_change > 0.6)
#
#         classes = np.zeros(outcome.shape[0])
#         classes[logical] = 1
#
#         return classes
#
#     results = test_utilities.load_scarcity_data()
#
#     cart = setup_cart(results, scarcity_classify)
#     cart.build_tree()
#
#     print(cart.boxes_to_dataframe())
#     print(cart.stats_to_dataframe())
#     cart.display_boxes(together=True)
#
#     img = cart.show_tree()
#
#     import matplotlib.image as mpimg
#
#     # treat the dot output string as an image file
#     sio = StringIO()
#     sio.write(img)
#     sio.seek(0)
#     img = mpimg.imread(sio)
#
#     # plot the image
#     imgplot = plt.imshow(img, aspect='equal')
#
#     plt.show()
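The commented-out driver above pushes the PNG bytes through a StringIO, which only works on Python 2; under Python 3 the binary equivalent is BytesIO. A minimal display sketch, assuming an object exposing show_tree() as defined above (the cart variable is borrowed from the commented-out driver and is an assumption here):

from io import BytesIO

import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# create_png() yields bytes, so wrap them in BytesIO before imread;
# `cart` is assumed to be set up as in the commented-out driver above
img = mpimg.imread(BytesIO(cart.show_tree()), format='png')
plt.imshow(img, aspect='equal')
plt.axis('off')
plt.show()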
def draw_tree(model, name):
    # Export the fitted model to DOT, then render the graph to "<name>.pdf"
    dot_data = StringIO()
    _tree.export_graphviz(model, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(name + ".pdf")
def run_model(df, vectorizer, classifier):
    # load data
    x = df['Cleaned'].values
    y = df['Class'].values

    # keep the original string arguments around, since the variables below
    # are rebound to fitted objects and the later string comparisons would
    # otherwise always be False
    vectorizer_name = vectorizer
    classifier_name = classifier

    # split dataset into training and test sets, with 80:20 split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1000, stratify=y)

    if vectorizer_name == "count":
        vectorizer = CountVectorizer()
    if vectorizer_name == "tfidf":
        vectorizer = TfidfVectorizer()
    vectorizer.fit(x_train)
    X_train = vectorizer.transform(x_train)
    X_test = vectorizer.transform(x_test)

    if classifier_name == "naive_bayes":
        classifier = MultinomialNB()

    if classifier_name == "decision_tree":
        # manual search tried, but default hyperparameters were best
        classifier = DecisionTreeClassifier()

    if classifier_name == "random_forest":
        clf = RandomForestClassifier()  # default n_estimators=100
        # define random search space based on decision tree depth
        hyp = {
            "n_estimators": [50, 100, 150, 200],   # number of trees in the forest
            "max_depth": [40, 50, None],           # max depth of tree
            "max_features": [10, 20, 'sqrt', None],
            "min_samples_split": randint(2, 11),   # must be >= 2 for sklearn
            "bootstrap": [True, False],            # to use bagging or not
            "criterion": ["gini", "entropy"],      # gini impurity or information gain
        }
        # random search over 5-fold cross validation (stratified k-fold by default)
        random_search = RandomizedSearchCV(clf, hyp, random_state=1, n_iter=100,
                                           cv=5, verbose=1, n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)

        n_estimators = search_result.best_estimator_.get_params()['n_estimators']
        max_depth = search_result.best_estimator_.get_params()['max_depth']
        max_features = search_result.best_estimator_.get_params()['max_features']
        min_samples_split = search_result.best_estimator_.get_params()['min_samples_split']
        bootstrap = search_result.best_estimator_.get_params()['bootstrap']
        criterion = search_result.best_estimator_.get_params()['criterion']
        print("Random search results: ")
        print("Best n_estimators: ", n_estimators)
        print("Best max_depth: ", max_depth)
        print("Best max_features:", max_features)
        print("Best min_samples_split:", min_samples_split)
        print("Best bootstrap:", bootstrap)
        print("Best criterion:", criterion)

        # set the classifier to the one with best hyperparameters from random search
        classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            min_samples_split=min_samples_split,
            bootstrap=bootstrap,
            criterion=criterion)

    if classifier_name == "logistic_regression":
        # by a manual search the lbfgs solver showed best results; the number of
        # max iterations is increased to let lbfgs converge.
        # compare loss functions over 5-fold cross validation
        ovr_clf = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)
        ovr_score = cross_val_score(ovr_clf, X_train, y_train, cv=5).mean()
        mce_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
        mce_score = cross_val_score(mce_clf, X_train, y_train, cv=5).mean()
        # choose the better performing hyperparameters
        if ovr_score > mce_score:
            classifier = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)
        else:
            classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

    if classifier_name == "linear_svm":
        clf = svm.LinearSVC(max_iter=1000)
        hyp = {
            "loss": ['hinge', 'squared_hinge'],
            "multi_class": ['ovr', 'crammer_singer'],
        }
        random_search = RandomizedSearchCV(clf, hyp, random_state=1, n_iter=20,
                                           cv=5, verbose=1, n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)
        loss = search_result.best_estimator_.get_params()['loss']
        multi_class = search_result.best_estimator_.get_params()['multi_class']
        print("Best loss: ", loss)
        print("Best multi_class:", multi_class)
        classifier = svm.LinearSVC(loss=loss, multi_class=multi_class, max_iter=1000)

    if classifier_name == "nonlinear_svm":
        clf = svm.SVC()
        hyp = {
            "gamma": ['auto', 'scale'],
            "kernel": ['poly', 'rbf', 'sigmoid'],
        }
        random_search = RandomizedSearchCV(clf, hyp, random_state=1, n_iter=20,
                                           cv=5, verbose=1, n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)
        gamma = search_result.best_estimator_.get_params()['gamma']
        kernel = search_result.best_estimator_.get_params()['kernel']
        print("Best gamma: ", gamma)
        print("Best kernel:", kernel)
        classifier = svm.SVC(gamma=gamma, kernel=kernel)

    if classifier_name == "knn":
        classifier = KNeighborsClassifier(n_neighbors=5)  # change k-value as needed

    if classifier_name == "mlp":
        clf = MLPClassifier()
        hyp = {
            "hidden_layer_sizes": [(64,), (64, 64), (64, 64, 64),
                                   (128,), (128, 128), (128, 128, 128),
                                   (256, 256, 256), (512, 512, 512)]
        }
        grid_search = GridSearchCV(clf, hyp, cv=5)
        search_result = grid_search.fit(X_train, y_train)
        hidden_layer_sizes = search_result.best_estimator_.get_params()['hidden_layer_sizes']
        print("Best hidden layer size:", hidden_layer_sizes)
        classifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                                   verbose=True)  # uses reLU, adam by default

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # print metrics
    print("\nClassification report summary:")
    print(classification_report(y_test, y_pred,
                                labels=[i + 1 for i in range(20)], digits=3))
    print("Accuracy:", classifier.score(X_test, y_test))
    print("Macro-F1:", f1_score(y_test, y_pred, average='macro'))

    # if decision tree or random forest, generate a plot of the tree
    if classifier_name == "decision_tree" or classifier_name == "random_forest":
        # print the 5 most important tokens
        swapped_vocab = {value: key for key, value in vectorizer.vocabulary_.items()}
        print("5 most important tokens: ")
        for i in np.argsort(classifier.feature_importances_)[-5:][::-1]:
            print(swapped_vocab[i])

        from io import StringIO  # sklearn.externals.six is gone in modern scikit-learn

        from sklearn.tree import export_graphviz
        import pydotplus

        dot_data = StringIO()
        if classifier_name == "decision_tree":
            export_graphviz(classifier, out_file=dot_data, filled=True,
                            rounded=True, special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            graph.write_pdf("decision_tree.pdf")
        else:
            # export a random one of the trees in the forest (indices are 0-based)
            tree_idx = random.randint(0, len(classifier.estimators_) - 1)
            export_graphviz(classifier.estimators_[tree_idx], out_file=dot_data,
                            filled=True, rounded=True, special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            graph.write_pdf("random_forest.pdf")

    # if logistic regression, plot most important terms
    if classifier_name == "logistic_regression":
        plot_lr_coef(classifier, vectorizer)

    # get confusion matrix for plot
    cm = confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

    return vectorizer, classifier, cm
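A hypothetical invocation of run_model; per the column accesses at the top of the function, the dataframe must provide a 'Cleaned' text column and a 'Class' label column, and the option strings below are just one valid combination:

# Hypothetical call; df and the option choices are illustrative
vectorizer, classifier, cm = run_model(df, "tfidf", "decision_tree")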
iris = load_iris()

df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                  columns=iris['feature_names'] + ['target'])
# df['label'] = df.target.replace(dict(enumerate(iris.target_names)))
print(df.head())       # check the top rows
print(iris.feature_names)
print(iris.target_names)
print(df.describe())   # check the difference between min and max values

x = iris['data']
y = iris['target']
iris_df = pd.DataFrame(x, columns=iris['feature_names'])
print(iris_df.head())

x, y = shuffle(x, y, random_state=0)  # random shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33,
                                                    random_state=42)

# Train the classifier, then check its accuracy
classifier = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))  # shown in console

dot_data = StringIO()
tree.export_graphviz(classifier, out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True, impurity=False, proportion=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())  # plot the graph
graph[0].write_pdf("iris3.pdf")  # run the file to produce the PDF