def test_graph_with_shapefiles(self):
    """Render a graph that uses custom shape files and compare it with Graphviz.

    The DOT file and the PNG shape files are read from
    ``TEST_DIR/from-past-to-future``. The SHA-256 digest of the JPEG produced
    through pydotplus must equal the value returned by
    ``self._render_with_graphviz`` for the same DOT file.
    """
    shapefile_dir = os.path.join(TEST_DIR, 'from-past-to-future')
    dot_file = os.path.join(shapefile_dir, 'from-past-to-future.dot')

    pngs = [
        os.path.join(shapefile_dir, fname)
        for fname in os.listdir(shapefile_dir)
        if fname.endswith('.png')
    ]

    # The context manager closes the handle even if read() raises.
    with open(dot_file, 'rt') as f:
        graph_data = f.read()

    g = pydotplus.graph_from_dot_data(graph_data)
    g.set_shape_files(pngs)

    jpe_data = g.create(format='jpe')
    hexdigest = sha256(jpe_data).hexdigest()
    hexdigest_original = self._render_with_graphviz(dot_file)

    self.assertEqual(hexdigest, hexdigest_original)
def run_DT_model_2(df, criteria_col):
    """Fit a decision tree that predicts ``criteria_col`` and render it.

    Parameters
    ----------
    df : DataFrame
        Must contain ``criteria_col`` and every feature column listed in
        ``tree_col`` below. Rows with missing values in those columns are
        dropped.
    criteria_col : str
        Name of the label column (per the original comment, a 0/1 label such
        as "high value or not").

    Returns
    -------
    tuple
        ``(Image of the rendered tree PNG, training frame, test frame)``.
    """
    from sklearn.metrics import confusion_matrix
    # ``sklearn.cross_validation`` was removed from scikit-learn; prefer its
    # replacement and fall back only on very old releases.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    # ``sklearn.externals.six`` was removed as well; ``io.StringIO`` is the
    # equivalent on the Python 3-only releases that lack it.
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        from io import StringIO
    from IPython.display import Image
    import pydotplus

    print('criteria_col = ', criteria_col)

    # The label column comes first so that ``iloc[:, 1:]`` selects the features.
    tree_col = [criteria_col, 'Frequency', 'LTV', 'period_no_use',
                'AverageTimeToOrder', 'late_by_collection',
                'late_by_delivery', 'tickets', 'recleaned_orders',
                'cancalled_orders', 'voucher_used']

    tree_data = df[tree_col].dropna()
    tree_train, tree_test = train_test_split(
        tree_data,
        test_size=0.2,
        random_state=200,
        stratify=tree_data[criteria_col])

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(tree_train.iloc[:, 1:], tree_train[criteria_col])
    print(clf.score(tree_test.iloc[:, 1:], tree_test[criteria_col]))

    # confusion matrix on the held-out split
    print(confusion_matrix(tree_test[criteria_col],
                           clf.predict(tree_test.iloc[:, 1:])))

    # visualize the tree
    dot_data = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_data,
                         feature_names=tree_col[1:],
                         filled=True,
                         rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png()), tree_train, tree_test
def test_attribute_with_implicit_value(self):
    """An edge attribute written without a value must still be parsed."""
    d = 'digraph {\na -> b[label="hi", decorate];\n}'
    g = pydotplus.graph_from_dot_data(d)
    attrs = g.get_edges()[0].get_attributes()

    # assertIn reports the actual attributes on failure instead of
    # "False != True".
    self.assertIn('decorate', attrs)
def train_network(self):
    """Fit the decision tree on hitter data and export it to ``hitter_tree.pdf``.

    Queries all ``PregameHitterGameEntry`` rows, splits them with
    ``self.get_train_eval_data(db_query, 0.8)``, fits ``self._decision_tree``
    on one batch of ``self.SIZE_TRAINING_BATCH`` entries and writes the fitted
    tree as a PDF. It then looks up the postgame entry of every evaluation row
    and finally closes the database session.
    """
    db_query = self._database_session.query(PregameHitterGameEntry)
    mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(db_query, 0.8)
    X_train, Y_train = self.get_stochastic_batch(mlb_training_data,
                                                 self.SIZE_TRAINING_BATCH)
    self._decision_tree.fit(X_train, Y_train)

    dot_data = StringIO()
    tree.export_graphviz(self._decision_tree,
                         out_file=dot_data,
                         feature_names=PregameHitterGameEntry.get_input_vector_labels())
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("hitter_tree.pdf")

    # NOTE(review): these two lists are filled but not used afterwards in the
    # code visible here -- confirm whether an evaluation step is missing.
    x_test_actual = list()
    y_test_actual = list()
    for data in mlb_evaluation_data:
        try:
            postgame_entry = self._database_session.query(PostgameHitterGameEntry).filter(
                PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                PostgameHitterGameEntry.game_date == data.game_date).one()
            y_test_actual.append([postgame_entry.actual_draftkings_points])
            x_test_actual.append(data.to_input_vector())
        except NoResultFound:
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print("Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id)
            continue

    self._database_session.close()
def create_tree(X, Y):
    """Fit an entropy-based decision tree and write it to ``Tree.pdf``.

    Parameters
    ----------
    X : training samples passed to ``DecisionTreeClassifier.fit``; the eight
        columns are labelled with ``feature_names`` below.
    Y : training targets; must support ``len()``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(X, Y)

    # Only pydotplus is needed here; the previously imported (and unused)
    # IPython Image made the function fail when IPython was not installed.
    import pydotplus

    dot_data = StringIO()
    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]

    # NOTE(review): this builds one name per *sample* in Y, while
    # export_graphviz expects one name per class -- confirm the intent.
    target_names = ["Ad #" + str(i) for i in range(1, len(Y) + 1)]

    tree.export_graphviz(
        clf,
        out_file=dot_data,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("Tree.pdf")
    return clf
def line_dot(self, code):
    """
    %dot CODE - render code as Graphviz image

    This line magic will render the Graphviz CODE, and render
    it as an image.

    Example:
        %dot graph A { a->b };
    """
    # Both platform branches imported the same module, so a single import is
    # enough. Only a failed import is translated into the install hint; any
    # other error now surfaces unchanged instead of being masked.
    try:
        import pydotplus as pydot
    except ImportError:
        raise Exception("You need to install pydot")

    graph = pydot.graph_from_dot_data(str(code))
    svg = graph.create_svg()
    # create_svg() may return bytes; HTML() is given text.
    if hasattr(svg, "decode"):
        svg = svg.decode("utf-8")
    html = HTML(svg)
    self.kernel.Display(html)
def cell_dot(self):
    """
    %%dot - render contents of cell as Graphviz image

    This cell magic will send the cell to the browser as
    HTML.

    Example:
        %%dot

        graph A { a->b };
    """
    # Only a failed import is translated into the install hint; a bare
    # ``except`` would also hide unrelated errors (even KeyboardInterrupt).
    try:
        if os.name == 'nt':
            import pydotplus as pydot
        else:
            import pydot
    except ImportError:
        raise Exception("You need to install pydot")

    graph = pydot.graph_from_dot_data(str(self.code))
    svg = graph.create_svg()
    # create_svg() may return bytes; HTML() is given text.
    if hasattr(svg, "decode"):
        svg = svg.decode("utf-8")
    html = HTML(svg)
    self.kernel.Display(html)
    # The cell content was DOT source, so it is flagged as not to be evaluated.
    self.evaluate = False
def DecisionTree(self, dados):
    """Train a decision tree on project data and render it with Graphviz.

    Parameters
    ----------
    dados : 2-D array indexable as ``dados[:, k]``
        Columns 4, 5, 11, 12, 19, 20, 28, 29, 8, 16, 24 and 32 are the
        features (in that order); column 33 is converted to ``float`` and
        binned into the labels ``'class1'`` .. ``'class8'``.

    Returns
    -------
    The ``Image`` built from the PNG rendering of the labelled tree
    (previously the object was created and discarded). ``projetos.pdf`` is
    written as a side effect.
    """
    # list() is required on Python 3, where zip() is lazy and np.array()
    # would otherwise wrap the iterator in a 0-d object array.
    database = np.array(list(zip(dados[:, 4], dados[:, 5], dados[:, 11], dados[:, 12],
                                 dados[:, 19], dados[:, 20], dados[:, 28], dados[:, 29],
                                 dados[:, 8], dados[:, 16], dados[:, 24], dados[:, 32])))
    feature_names = ('implementacao_estimado', 'implementacao_real', 'correcao_est',
                     'correcao_real', 'teste_est', 'teste_real', 'elaboracao_estimado',
                     'elaboracao_real', 'perfil_imple', 'perfil_cor', 'perfil_teste',
                     'perfil_elab')

    # Inclusive upper bound of class1..class7; anything above the last bound
    # (or not comparable, e.g. NaN) falls into class8.
    upper_bounds = (1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5)
    kind = []
    for dado in dados[:, 33]:
        value = float(dado)
        for position, bound in enumerate(upper_bounds, 1):
            if value <= bound:
                kind.append('class%d' % position)
                break
        else:
            kind.append('class8')
    target = np.array(kind)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(database, target)

    # Plain rendering, saved to disk. (The former write-then-delete of
    # "projetos.dot" had no lasting effect other than removing a file of
    # that name, so it was dropped.)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('projetos.pdf')

    # Labelled rendering. class_names must hold one entry per class in the
    # classifier's own order, not one entry per training sample.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=feature_names,
                                    class_names=list(clf.classes_),
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())
def test_multiple_graphs(self):
    """DOT text holding two graphs must parse into both, in source order."""
    dot_source = 'graph A { a->b };\ngraph B {c->d}'
    parsed = pydotplus.graph_from_dot_data(dot_source)

    self.assertEqual(len(parsed), 2)

    names = []
    for graph in parsed:
        names.append(graph.get_name())
    self.assertEqual(names, ['A', 'B'])
def export_graph(self, clf, labels, file_name):
    """Write the tree ``clf`` to ``<file_name>.pdf``.

    ``labels`` is passed to Graphviz export as the class names; feature names
    come from ``self.__features_name``.
    """
    buffer = StringIO()
    export_options = dict(
        out_file=buffer,
        feature_names=self.__features_name,
        class_names=labels,
        filled=True,
        rounded=True,
        impurity=False,
    )
    tree.export_graphviz(clf, **export_options)

    rendered = pydotplus.graph_from_dot_data(buffer.getvalue())
    pdf_path = '{}.pdf'.format(file_name)
    rendered.write_pdf(pdf_path)
def local_causality_graph(model, kind="full", goal=None):
    """Compute a Local Causality Graph with ``pint-lcg`` and return it as a graph.

    Parameters
    ----------
    model : passed to ``_run_tool`` as ``input_model``.
    kind : one of ``"verbose"``, ``"trimmed"``, ``"saturated"``, ``"worth"``
        or ``"full"``.
    goal : goal specification appended to the tool arguments; required for
        every ``kind`` other than ``"full"``.

    Returns
    -------
    The result of ``nx.nx_pydot.from_pydot`` applied to the parsed DOT output.

    Raises
    ------
    AssertionError
        If ``kind`` is not one of the supported values.
    ValueError
        If ``goal`` is ``None`` while ``kind`` is not ``"full"``.
    """
    # The list used to contain "verbose," (stray comma inside the string),
    # which made the documented "verbose" kind fail this check.
    assert kind in ["verbose", "trimmed", "saturated", "worth", "full"]
    if kind != "full" and goal is None:
        raise ValueError("goal cannot be None with %s LCG" % kind)

    # "-o -" makes the tool write the graph to stdout.
    args = ["-t", kind, "-o", "-"]
    if goal:
        args.append(goal)
    cp = _run_tool("pint-lcg", *args, input_model=model)
    g = pydotplus.graph_from_dot_data(cp.stdout.decode())
    return nx.nx_pydot.from_pydot(g)
def graph_decision_tree(model, class_names):
    """Export ``model`` to a PDF named ``model<class_names[1]>.pdf``.

    Feature labels are taken from the module-level ``features``.
    """
    dot_buffer = StringIO()
    export_options = dict(
        out_file=dot_buffer,
        feature_names=features,
        class_names=class_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    tree.export_graphviz(model, **export_options)

    rendered = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
    pdf_name = "model" + class_names[1] + ".pdf"
    rendered.write_pdf(pdf_name)
def show_tree(self):
    '''Return the fitted tree ``self.clf`` rendered as PNG bytes.'''
    assert self.clf

    try:
        import pydotplus as pydot
    except ImportError:
        import pydot  # dirty hack for read the docs

    dot_data = StringIO()
    tree.export_graphviz(self.clf, out_file=dot_data,
                         feature_names=self.feature_names)

    # The DOT text is passed as-is; encoding it to ASCII bytes first is not
    # needed by either parser.
    graph = pydot.graph_from_dot_data(dot_data.getvalue())

    # pydotplus returns a single graph for single-graph input, whereas
    # pydot >= 1.2 (the fallback import) always returns a list.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]

    img = graph.create_png()
    return img
def visualize_tree(clf, feature_names, class_names, output_file, method='pdf'):
    """Render a fitted decision tree with Graphviz.

    Parameters
    ----------
    clf : fitted tree estimator accepted by ``tree.export_graphviz``.
    feature_names : labels for the input features.
    class_names : labels for the target classes.
    output_file : path without extension; ``".pdf"`` is appended.
    method : ``'pdf'`` writes ``output_file + ".pdf"``; ``'inline'`` builds an
        ``Image`` from the PNG rendering; any other value only builds the graph.

    Returns
    -------
    The pydotplus graph.
    """
    dot_data = StringIO()
    # Use the caller's names; the function previously ignored both
    # parameters and always read them from the global ``iris`` dataset.
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=class_names,
                         filled=True, rounded=True,
                         special_characters=True,
                         impurity=False)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

    if method == 'pdf':
        graph.write_pdf(output_file + ".pdf")
    elif method == 'inline':
        # NOTE(review): the Image is created but neither displayed nor
        # returned, so nothing is shown from inside a function -- confirm
        # whether an explicit display call is wanted here.
        Image(graph.create_png())

    return graph
def train(self, training_set, training_target, fea_index):
    """Fit a class-balanced entropy tree, then save a PDF rendering and the model.

    Parameters
    ----------
    training_set : training samples.
    training_target : training labels.
    fea_index : indices into the module-level ``attr_list`` giving the name
        of each feature column.

    Side effects: writes ``output/tree-vis.pdf`` and ``output/CART.pkl``.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy",
                                      min_samples_split=30,
                                      class_weight="balanced")
    clf = clf.fit(training_set, training_target)

    # Take the names from the classifier so they are in the order in which it
    # indexes classes. Sorting the stringified labels instead can produce a
    # different order (e.g. '10' sorts before '2').
    class_names = [str(label) for label in clf.classes_]
    feature_names = [attr_list[i] for i in fea_index]

    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=feature_names,
                                    class_names=class_names,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("output/tree-vis.pdf")

    joblib.dump(clf, 'output/CART.pkl')
def render_output_pydot(self, dotdata, **kwargs):
    """Renders the image using pydot

    ``kwargs['outputfile']`` is the destination path. The output format is
    taken from its extension when that is one of the known Graphviz formats,
    otherwise ``'raw'`` is used.

    Raises ``CommandError`` when pydot is unavailable or returns no graph.
    """
    if not HAS_PYDOT:
        raise CommandError("You need to install pydot python module")

    graph = pydot.graph_from_dot_data(dotdata)
    if not graph:
        raise CommandError("pydot returned an error")
    # Newer pydot releases return a list of graphs; render the first one.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]

    output_file = kwargs['outputfile']
    formats = ['bmp', 'canon', 'cmap', 'cmapx', 'cmapx_np', 'dot', 'dia', 'emf',
               'em', 'fplus', 'eps', 'fig', 'gd', 'gd2', 'gif', 'gv', 'imap',
               'imap_np', 'ismap', 'jpe', 'jpeg', 'jpg', 'metafile', 'pdf',
               'pic', 'plain', 'plain-ext', 'png', 'pov', 'ps', 'ps2', 'svg',
               'svgz', 'tif', 'tiff', 'tk', 'vml', 'vmlz', 'vrml', 'wbmp', 'xdot']
    # Text after the last '.', or the whole name when there is no '.'.
    ext = output_file[output_file.rfind('.') + 1:]
    output_format = ext if ext in formats else 'raw'
    graph.write(output_file, format=output_format)
def train_tree_classifer(features, labels, model_output_path):
    """
    train_tree_classifer will train a DecisionTree and write it out to a pdf file

    features: 2D array of each input feature for each sample
    labels: array of string labels classifying each sample
    model_output_path: path for storing the trained tree model

    The best tree found by the grid search is also rendered to
    ``best_tree.pdf``, and a confusion matrix and classification report for
    the held-out 20% are printed.
    """
    # save 20% of data for performance evaluation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        features, labels, test_size=0.2)

    param = [
        {
            "max_depth": [None, 10, 100, 1000, 10000]
        }
    ]

    dtree = tree.DecisionTreeClassifier(random_state=0)

    # 10-fold cross validation; up to 20 jobs, as each fold and each
    # parameter set can be trained in parallel
    clf = grid_search.GridSearchCV(dtree, param, cv=10, n_jobs=20, verbose=3)
    clf.fit(X_train, y_train)

    # Saving needs the target *directory* to exist. Testing the file path
    # itself meant a model could only ever be written over an existing file.
    output_dir = os.path.dirname(os.path.abspath(model_output_path))
    if os.path.isdir(output_dir):
        joblib.dump(clf.best_estimator_, model_output_path)
    else:
        print("Cannot save trained tree model to {0}.".format(model_output_path))

    dot_data = tree.export_graphviz(clf.best_estimator_, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('best_tree.pdf')

    print("\nBest parameters set:")
    print(clf.best_params_)

    y_predict = clf.predict(X_test)

    labels = sorted(set(labels))
    print("\nConfusion matrix:")
    print("Labels: {0}\n".format(",".join(labels)))
    print(confusion_matrix(y_test, y_predict, labels=labels))

    print("\nClassification report:")
    print(classification_report(y_test, y_predict))
def show_pdf(clf):
    '''
    Visualise the result by writing the decision tree structure to a file:
    http://sklearn.lzjqsdd.com/modules/tree.html

    Error seen on Mac:
        pydotplus.graphviz.InvocationException: GraphViz's executables not found
    Fix:
        sudo brew install graphviz
    Reference for the export code:
        http://www.jianshu.com/p/59b510bafb4d
    '''
    import pydotplus
    # ``sklearn.externals.six`` was removed from scikit-learn; ``io.StringIO``
    # is the equivalent on the Python 3-only releases that lack it.
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        from io import StringIO

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("output/3.DecisionTree/tree.pdf")
def export_tree_pdf(self, filename=None):
    """Return the tree as DOT text and optionally save it as a PDF.

    Nodes are visited in level order starting at ``self.__root``. A node whose
    ``index`` is still ``-1`` is assigned the running counter as its index.

    Parameters
    ----------
    filename : path of the PDF to write, or ``None`` (default) to only build
        the DOT text. pydotplus is imported only when a file is requested.

    Returns
    -------
    str : the DOT description of the tree.
    """
    from collections import deque

    node_template = "\n{} [label=\"Feature to split upon : X[{}]\\nOutput at this node : {}\" ];"
    edge_template = "\n{} -> {} [ headlabel=\"Feature value = {}\"]; "

    # Fragments are collected and joined once instead of growing one string.
    parts = ['''digraph Tree {
node [shape=box] ;''']

    queue = deque()
    r = self.__root
    queue.append(r)
    count = 0
    if r.index == -1:
        r.index = count
    parts.append(node_template.format(count, r.data, r.output))

    # Level-order traversal of the tree (using a queue)
    while queue:
        node = queue.popleft()
        for value in node.children:
            child = node.children[value]
            count += 1
            if child.index == -1:
                child.index = count

            # Child node, then the edge from its parent
            parts.append(node_template.format(child.index, child.data, child.output))
            parts.append(edge_template.format(node.index, child.index, value))
            queue.append(child)

    parts.append("\n}")
    dot_data = "".join(parts)

    if filename is not None:
        import pydotplus
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf(filename)

    return dot_data
def read_dot(path):
    """Return a NetworkX MultiGraph or MultiDiGraph from a dot file on path.

    Parameters
    ----------
    path : file handle
       The body calls ``path.read()``.
       NOTE(review): the original documentation also lists "filename";
       that would have to be handled by a decorator not visible here.

    Returns
    -------
    G : NetworkX multigraph
       A MultiGraph or MultiDiGraph.

    Notes
    -----
    Use G = nx.Graph(read_dot(path)) to return a Graph instead of a MultiGraph.
    """
    import pydotplus
    dot_text = path.read()
    return from_pydot(pydotplus.graph_from_dot_data(dot_text))
def draw_DecTree(DecTree, feat_names=None, cla_names=None):
    """Render ``DecTree`` to ``dot_data.pdf`` with node ids shown.

    ``feat_names`` and ``cla_names`` are forwarded to Graphviz export as the
    feature and class names.
    """
    buffer = StringIO()
    export_options = dict(
        out_file=buffer,
        feature_names=feat_names,
        class_names=cla_names,
        node_ids=True,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    tree.export_graphviz(DecTree, **export_options)

    rendered = pydotplus.graph_from_dot_data(buffer.getvalue())
    rendered.write_pdf('dot_data.pdf')
def pydot_layout(G, prog='neato', root=None, **kwds):
    """Create node positions using Pydot and Graphviz.

    Returns a dictionary of positions keyed by node. Nodes for which Graphviz
    reported no position are left out. Returns None (after printing debugging
    hints) when Graphviz produced no output.

    Examples
    --------
    >>> G = nx.complete_graph(4)
    >>> pos = nx.nx_pydot.pydot_layout(G)
    >>> pos = nx.nx_pydot.pydot_layout(G, prog='dot')
    """
    import pydotplus
    P = to_pydot(G)
    if root is not None:
        P.set("root", make_str(root))

    D = P.create_dot(prog=prog)

    # Truthiness covers both "" and b"": the previous ``D == ""`` comparison
    # could never match a bytes result.
    if not D:  # no data returned
        print("Graphviz layout with %s failed" % (prog))
        print()
        print("To debug what happened try:")
        print("P=pydot_from_networkx(G)")
        print("P.write_dot(\"file.dot\")")
        print("And then run %s on file.dot" % (prog))
        return

    Q = pydotplus.graph_from_dot_data(D)

    node_pos = {}
    for n in G.nodes():
        pydot_node = pydotplus.Node(make_str(n)).get_name()
        node = Q.get_node(pydot_node)

        if isinstance(node, list):
            node = node[0]
        pos = node.get_pos()
        # Check for a missing position *before* slicing; slicing None raised
        # TypeError and made the old ``!= None`` test unreachable.
        if pos is not None:
            # strip leading and trailing double quotes
            xx, yy = pos[1:-1].split(",")
            node_pos[n] = (float(xx), float(yy))
    return node_pos
def _create_graph(self, **kwargs):
    """Export ``self._clf`` to DOT and parse it with pydot.

    ``kwargs`` are forwarded to ``tree.export_graphviz``. On scikit-learn
    >= 0.17, ``filled``, ``rounded`` and ``special_characters`` default to
    ``True`` unless given. Feature and class names always come from
    ``self._get_names(**kwargs)``.

    Returns the value of ``pydot.graph_from_dot_data``.
    """
    clf = self._clf
    dot_data = StringIO()

    feature_names, class_names = self._get_names(**kwargs)

    # Work on a copy with the explicitly passed keywords removed. Forwarding
    # them both explicitly and through ``**kwargs`` raised
    # "got multiple values for keyword argument" whenever a caller set them.
    options = dict(kwargs)
    options.pop("feature_names", None)
    options.pop("class_names", None)

    if StrictVersion(sklearn.__version__) >= StrictVersion('0.17'):
        options.setdefault("filled", True)
        options.setdefault("rounded", True)
        options.setdefault("special_characters", True)
        tree.export_graphviz(clf, out_file=dot_data,
                             feature_names=feature_names,
                             class_names=class_names,
                             **options)
    else:
        tree.export_graphviz(clf, out_file=dot_data,
                             feature_names=feature_names,
                             **options)
    return pydot.graph_from_dot_data(dot_data.getvalue())
def render_output_pydot(self, dotdata, **kwargs):
    """Renders model data as image using pydot

    The destination is ``kwargs['outputfile']``; its extension selects the
    output format when it is a known one, otherwise ``'raw'`` is used.
    """
    if not HAS_PYDOT:
        raise CommandError("You need to install pydot python module")

    parsed = pydot.graph_from_dot_data(dotdata)
    if not parsed:
        raise CommandError("pydot returned an error")

    # pydot may hand back several graphs; only the first one is rendered.
    if isinstance(parsed, (list, tuple)):
        if len(parsed) > 1:
            sys.stderr.write("Found more then one graph, rendering only the first one.\n")
        parsed = parsed[0]

    target = kwargs['outputfile']
    known_formats = [
        'bmp', 'canon', 'cmap', 'cmapx', 'cmapx_np', 'dot', 'dia', 'emf',
        'em', 'fplus', 'eps', 'fig', 'gd', 'gd2', 'gif', 'gv', 'imap',
        'imap_np', 'ismap', 'jpe', 'jpeg', 'jpg', 'metafile', 'pdf', 'pic',
        'plain', 'plain-ext', 'png', 'pov', 'ps', 'ps2', 'svg', 'svgz',
        'tif', 'tiff', 'tk', 'vml', 'vmlz', 'vrml', 'wbmp', 'xdot',
    ]

    # Text after the last '.', or the whole name when there is no '.'.
    extension = target.rpartition('.')[2]
    if extension in known_formats:
        chosen = extension
    else:
        chosen = 'raw'
    parsed.write(target, format=chosen)
def decision_trees(clf, drawing_param, nFeatures=None, show_trees=True, show_importance=True, showfig=True):
    """Plot the tree(s) of ``clf`` and report feature importances.

    ``clf`` may be a single tree or an ensemble exposing ``estimators_``.
    ``drawing_param`` is forwarded to ``tree.export_graphviz`` and must
    contain ``'feature_names'``. The feature ranking is always printed; the
    tree images and the importance bar chart are controlled by
    ``show_trees`` / ``show_importance``, and ``showfig`` decides whether
    ``plt.show()`` is called.
    """
    if hasattr(clf, 'estimators_'):
        clfs = clf.estimators_
    else:
        clfs = [clf]

    if show_trees:
        for tree_number, estimator in enumerate(clfs):
            # DOT text -> graph -> PNG bytes
            dot_buffer = StringIO()
            tree.export_graphviz(estimator, out_file=dot_buffer, rounded=True,
                                 filled=True, **drawing_param)
            graph = pydot.graph_from_dot_data(dot_buffer.getvalue())
            png_bytes = graph.create_png(prog='dot')

            # Read the PNG bytes back as an image array and plot it
            img = mpimg.imread(io.BytesIO(png_bytes))
            fig = plt.figure('Decision tree %s' % tree_number)
            plt.imshow(img, aspect='equal')
            fig.tight_layout()

        if showfig:
            plt.show()

    importances = clf.feature_importances_
    ranked = np.argsort(importances)[::-1]
    names = drawing_param['feature_names']

    # Print the feature ranking, most important first
    print("Feature ranking:")
    for rank, feature in enumerate(ranked, 1):
        print("%d. feature %d = %f [%s]" % (rank, feature, importances[feature], names[feature]))

    if show_importance:
        per_tree_importances = [tree_.feature_importances_ for tree_ in clfs]
        axis = hbar(per_tree_importances,
                    figtitle='Feature importance',
                    nBars=nFeatures,
                    yticknames=drawing_param['feature_names'],
                    xlabel='Gini importance',
                    sort='descend')[0]
        axis.figure.tight_layout()

        if showfig:
            plt.show()
def run_model(self, max_depth=3, criterion='entropy', do_plot=True):
    """Fit a decision tree, export it as a PDF and return train/test scores.

    Parameters
    ----------
    max_depth : maximum depth of the tree.
    criterion : split criterion; 'gini' for the Gini impurity or 'entropy'
        for the information gain.
    do_plot : when true, also plot the learning curve and the decision
        boundaries.

    Returns
    -------
    tuple : ``(train_score, test_score)`` as returned by ``score()``.
    """
    # Honour the ``criterion`` argument; it used to be ignored in favour of a
    # hard-coded 'entropy'.
    clf = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=0)
    clf.fit(self.x_train, self.y_train)

    # export a graphical representation of the tree
    dot_data = io.StringIO()
    export_graphviz(clf, out_file=dot_data, feature_names=self.x_col_names)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    fn = './output/' + self.data_label + '_graph.pdf'
    graph.write_pdf(fn)

    # check model accuracy
    train_score = clf.score(self.x_train, self.y_train)
    print('Training score is', train_score)
    test_score = clf.score(self.x_test, self.y_test)
    print('Test score is', test_score)

    if do_plot:
        self.__plot_learning_curve(clf)
        self.__plot_decision_boundaries(clf)

    return train_score, test_score
def tree_classifier():
    """Create an HPS classifier using the alpha-beta.

    Reads the pickled fit table from ``DF_Dir``, fits a depth-4 decision tree
    on the four alpha/beta columns against ``HPS_level`` and saves the tree as
    ``hpf_tree_classifier.pdf`` in ``Fig_Dir``.
    """
    fn_fit = os.path.join(DF_Dir, 'fit_constant_step_size_01_bounded.pkl')
    fit = pd.read_pickle(fn_fit)
    print('Using data from', fn_fit)

    X = fit[['0_alpha', '0_beta', '1_alpha', '1_beta']].values
    y = fit['HPS_level'].values

    clf = tree.DecisionTreeClassifier(max_depth=4)
    clf.fit(X, y)

    # ``sklearn.externals.six`` was removed from scikit-learn; ``io.StringIO``
    # is the equivalent on the Python 3-only releases that lack it.
    try:
        from sklearn.externals.six import StringIO
    except ImportError:
        from io import StringIO
    import pydotplus as pydot

    dot_data = StringIO()
    feature_names = ['a0', 'b0', 'a1', 'b1']
    target_names = ['low', 'medium', 'high']
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=target_names,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    fn = os.path.join(Fig_Dir, 'hpf_tree_classifier.pdf')
    graph.write_pdf(fn)
    print('Tree saved as', fn)
def test():
    """Train a tree on iris with three samples held out and export ``iris.pdf``.

    Samples 0, 50 and 100 are removed from the training data; their true
    targets and the tree's predictions for them are printed.
    """
    # training data
    test_idx = [0, 50, 100]
    train_target = np.delete(iris.target, test_idx)
    train_data = np.delete(iris.data, test_idx, axis=0)

    # testing data
    test_target = iris.target[test_idx]
    test_data = iris.data[test_idx]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_data, train_target)

    # Parenthesised single-argument form prints the same on Python 2 and is
    # valid syntax on Python 3.
    print(test_target)
    print(clf.predict(test_data))

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("iris.pdf")
def show_tree(self, mplfig=True, format='png'):
    '''return a png of the tree

    Parameters
    ----------
    mplfig : bool, optional
             if true (default) returns a matplotlib figure with the tree,
             otherwise, it returns the output as bytes. Only applies to
             the 'png' format; 'svg' output is always returned as-is.
    format : {'png', 'svg'}, default 'png'
             Gives a format of the output.

    Raises
    ------
    TypeError
        if format is neither 'png' nor 'svg'

    '''
    assert self.clf

    try:
        import pydotplus as pydot
    except ImportError:
        import pydot  # dirty hack for read the docs

    dot_data = StringIO()
    tree.export_graphviz(self.clf, out_file=dot_data,
                         feature_names=self.feature_names)
    dot_data = dot_data.getvalue()
    graph = pydot.graph_from_dot_data(dot_data)

    # pydotplus returns a single graph for single-graph input, whereas
    # pydot >= 1.2 (the fallback import) always returns a list.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]

    if format == 'png':
        img = graph.create_png()
        if mplfig:
            fig, ax = plt.subplots()
            ax.imshow(mpimg.imread(io.BytesIO(img)))
            ax.axis('off')
            return fig
    elif format == 'svg':
        img = graph.create_svg()
    else:
        raise TypeError('''format must be in {'png', 'svg'}''')

    return img
def test_unicode_ids(self):
    """Quoted non-ASCII ids must be kept by the graph and by a re-parse of it."""
    first_id = '"aánñoöüé€"'
    second_id = '"îôø®çßΩ"'

    built = pydotplus.Dot()
    built.set_charset('latin1')
    for node_id in (first_id, second_id):
        built.add_node(pydotplus.Node(node_id))
    built.add_edge(pydotplus.Edge(first_id, second_id))

    def check_ids(graph):
        # Both nodes and both edge endpoints must carry the exact ids.
        self.assertEqual(graph.get_node(first_id)[0].get_name(), first_id)
        self.assertEqual(graph.get_node(second_id)[0].get_name(), second_id)
        self.assertEqual(graph.get_edges()[0].get_source(), first_id)
        self.assertEqual(graph.get_edges()[0].get_destination(), second_id)

    check_ids(built)

    # Round trip through the DOT text representation.
    reparsed = pydotplus.graph_from_dot_data(built.to_string())
    check_ids(reparsed)
# Accuracy on the test split of the classifier bound to `clf` earlier in the
# script (its definition is above the region visible here).
pred = clf.predict(iris_test.iloc[:, 0:4])
print(accuracy_score(iris_test['type'].values.codes, pred))
#%%
# Cell: train a CART (Classification and Regression Tree) model.
# The first four columns are used as features and the category codes of the
# 'type' column as the target.
''' CART(Classification and Regression Tree) 학습 '''
clf = tree.DecisionTreeClassifier()
clf.fit(iris_train.iloc[:, 0:4], iris_train['type'].values.codes)
# Accuracy on the training split, then on the test split.
pred = clf.predict(iris_train.iloc[:, 0:4])
print(accuracy_score(iris_train['type'].values.codes, pred))
pred = clf.predict(iris_test.iloc[:, 0:4])
print(accuracy_score(iris_test['type'].values.codes, pred))
#%%
# Cell: visualise the tree. The string below lists the conda commands that
# install pydotplus and graphviz.
''' 트리 시각화 !conda install -c conda-forge -y pydotplus !conda install -y graphviz '''
import pydotplus
from IPython.display import Image
# out_file=None makes export_graphviz return the DOT text as a string.
# NOTE(review): feature_names receives *all* columns of `iris`, while the
# model is fitted on the first four only -- confirm the extra name is harmless.
tree_dot = tree.export_graphviz(clf, out_file=None, feature_names=iris.columns.values, class_names=iris['type'].values.categories.values)
tree_graph = pydotplus.graph_from_dot_data(tree_dot)
# Last expression of the cell: the PNG rendering wrapped in an IPython Image.
Image(tree_graph.create_png())
def run_model(df, vectorizer, classifier):
    """Vectorize the text, train the requested classifier and report metrics.

    Parameters
    ----------
    df : DataFrame with a 'Cleaned' column (input text) and a 'Class' column
        (labels).
    vectorizer : "count" or "tfidf"; any other value is used as the
        vectorizer object itself.
    classifier : one of "naive_bayes", "decision_tree", "random_forest",
        "logistic_regression", "linear_svm", "nonlinear_svm", "knn", "mlp";
        any other value is used as the estimator object itself.

    Returns
    -------
    tuple : ``(fitted vectorizer, fitted classifier, confusion matrix)``.

    Side effects: prints search results and metrics; for the tree-based
    models writes ``decision_tree.pdf`` / ``random_forest.pdf``; for logistic
    regression calls ``plot_lr_coef``.
    """
    # `classifier` is rebound to the estimator below, so keep the requested
    # name: the post-training steps need it. Comparing the fitted estimator
    # with a string was always false, which silently disabled the tree
    # export and the coefficient plot.
    classifier_name = classifier

    # load data
    x = df['Cleaned'].values
    y = df['Class'].values

    # split dataset into training and test sets, with 80:20 split
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1000,
                                                        stratify=y)

    if vectorizer == "count":
        vectorizer = CountVectorizer()
    elif vectorizer == "tfidf":
        vectorizer = TfidfVectorizer()

    vectorizer.fit(x_train)
    X_train = vectorizer.transform(x_train)
    X_test = vectorizer.transform(x_test)

    if classifier_name == "naive_bayes":
        classifier = MultinomialNB()

    elif classifier_name == "decision_tree":
        # manual search tried, but default hyperparameters were best
        classifier = DecisionTreeClassifier()

    elif classifier_name == "random_forest":
        clf = RandomForestClassifier()  # default n_estimators=100
        # define random search space based on decision tree depth
        hyp = {
            "n_estimators": [50, 100, 150, 200],  # number of trees in the forest
            "max_depth": [40, 50, None],  # max depth of tree
            "max_features": [10, 20, 'sqrt', None],
            # Integers 2..10: an integer min_samples_split of 1 is rejected
            # by scikit-learn.
            "min_samples_split": randint(2, 11),
            "bootstrap": [True, False],  # to use bagging or not
            "criterion": ["gini", "entropy"]  # gini impurity or information gain
        }
        # random search over 5-fold cross validation (stratified k-fold by default)
        random_search = RandomizedSearchCV(clf,
                                           hyp,
                                           random_state=1,
                                           n_iter=100,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)
        best_params = search_result.best_estimator_.get_params()
        n_estimators = best_params['n_estimators']
        max_depth = best_params['max_depth']
        max_features = best_params['max_features']
        min_samples_split = best_params['min_samples_split']
        bootstrap = best_params['bootstrap']
        criterion = best_params['criterion']
        print("Random search results: ")
        print("Best n_estimators: ", n_estimators)
        print("Best max_depth: ", max_depth)
        print("Best max_features:", max_features)
        # This value was previously printed under the "max_features" label.
        print("Best min_samples_split:", min_samples_split)
        print("Best bootstrap:", bootstrap)
        print("Best criterion:", criterion)
        # set the classifier to the one with best hyperparameters from random search
        classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            min_samples_split=min_samples_split,
            bootstrap=bootstrap,
            criterion=criterion)

    elif classifier_name == "logistic_regression":
        # by a manual search the lbfgs solver showed best results
        # number of max iterations is increased to allow lbfgs solver to converge
        # compare loss functions over 5-fold cross validation
        ovr_clf = LogisticRegression(multi_class='ovr',
                                     solver='lbfgs',
                                     max_iter=1000)
        ovr_score = cross_val_score(ovr_clf, X_train, y_train, cv=5).mean()
        mce_clf = LogisticRegression(multi_class='multinomial',
                                     solver='lbfgs',
                                     max_iter=1000)
        mce_score = cross_val_score(mce_clf, X_train, y_train, cv=5).mean()
        # choose the better performing hyperparameters
        if ovr_score > mce_score:
            classifier = LogisticRegression(multi_class='ovr',
                                            solver='lbfgs',
                                            max_iter=1000)
        else:
            classifier = LogisticRegression(multi_class='multinomial',
                                            solver='lbfgs',
                                            max_iter=1000)

    elif classifier_name == "linear_svm":
        clf = svm.LinearSVC(max_iter=1000)
        hyp = {
            "loss": ['hinge', 'squared_hinge'],
            "multi_class": ['ovr', 'crammer_singer']
        }
        random_search = RandomizedSearchCV(clf,
                                           hyp,
                                           random_state=1,
                                           n_iter=20,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)
        best_params = search_result.best_estimator_.get_params()
        loss = best_params['loss']
        multi_class = best_params['multi_class']
        print("Best loss: ", loss)
        print("Best multi_class:", multi_class)
        classifier = svm.LinearSVC(loss=loss,
                                   multi_class=multi_class,
                                   max_iter=1000)

    elif classifier_name == "nonlinear_svm":
        clf = svm.SVC()
        hyp = {
            "gamma": ['auto', 'scale'],
            "kernel": ['poly', 'rbf', 'sigmoid']
        }
        random_search = RandomizedSearchCV(clf,
                                           hyp,
                                           random_state=1,
                                           n_iter=20,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)
        best_params = search_result.best_estimator_.get_params()
        gamma = best_params['gamma']
        kernel = best_params['kernel']
        print("Best gamma: ", gamma)
        print("Best kernel:", kernel)
        classifier = svm.SVC(gamma=gamma, kernel=kernel)

    elif classifier_name == "knn":
        classifier = KNeighborsClassifier(n_neighbors=5)  # change k-value as needed

    elif classifier_name == "mlp":
        clf = MLPClassifier()
        hyp = {
            # (128, 128, 128) used to be written "(128.128, 128)", i.e. a
            # float layer size.
            "hidden_layer_sizes": [(64, ), (64, 64), (64, 64, 64), (128, ),
                                   (128, 128), (128, 128, 128),
                                   (256, 256, 256), (512, 512, 512)]
        }
        grid_search = GridSearchCV(clf, hyp, cv=5)
        search_result = grid_search.fit(X_train, y_train)
        hidden_layer_sizes = search_result.best_estimator_.get_params()['hidden_layer_sizes']
        print("Best hidden layer size:", hidden_layer_sizes)
        classifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                                   verbose=True)  # uses reLU, adam by default

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # print metrics
    print("\nClassification report summary:")
    print(
        classification_report(y_test,
                              y_pred,
                              labels=list(range(1, 21)),
                              digits=3))
    print("Accuracy:", classifier.score(X_test, y_test))
    print("Macro-F1:", f1_score(y_test, y_pred, average='macro'))

    # if decision tree or random forest, generates plot of tree
    if classifier_name in ("decision_tree", "random_forest"):
        # print 5 most important tokens:
        swapped_vocab = {
            index: token
            for token, index in vectorizer.vocabulary_.items()
        }
        print("5 most important tokens: ")
        for i in np.argsort(classifier.feature_importances_)[-5:][::-1]:
            print(swapped_vocab[i])

        # ``sklearn.externals.six`` was removed from scikit-learn;
        # ``io.StringIO`` is the equivalent on releases that lack it.
        try:
            from sklearn.externals.six import StringIO
        except ImportError:
            from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        if classifier_name == "decision_tree":
            export_graphviz(classifier,
                            out_file=dot_data,
                            filled=True,
                            rounded=True,
                            special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            graph.write_pdf("decision_tree.pdf")
        else:
            # Pick a random tree of the forest. random.choice works for any
            # forest size; the former randint(1, 101) index skipped tree 0
            # and could run past the end of the list.
            export_graphviz(random.choice(classifier.estimators_),
                            out_file=dot_data,
                            filled=True,
                            rounded=True,
                            special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            graph.write_pdf("random_forest.pdf")

    # if logistic regression, plot most important terms
    if classifier_name == "logistic_regression":
        plot_lr_coef(classifier, vectorizer)

    # get confusion matrix for plot
    cm = confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

    return vectorizer, classifier, cm
def show_tree(self):
    '''Return the fitted tree ``self.clf`` rendered as PNG bytes.'''
    assert self.clf

    try:
        import pydotplus as pydot
    except ImportError:
        import pydot  # dirty hack for read the docs

    dot_data = StringIO()
    tree.export_graphviz(self.clf, out_file=dot_data,
                         feature_names=self.feature_names)
    dot_data = dot_data.getvalue()
    graph = pydot.graph_from_dot_data(dot_data)

    # pydot >= 1.2 returns a list, but the preferred pydotplus returns a
    # single graph for single-graph input. Indexing unconditionally with [0]
    # therefore failed whenever pydotplus was installed.
    if isinstance(graph, (list, tuple)):
        graph = graph[0]

    img = graph.create_png()
    return img

# if __name__ == '__main__':
#     from test import test_utilities
#     import matplotlib.pyplot as plt
#
#     ema_logging.log_to_stderr(ema_logging.INFO)
#
#     def scarcity_classify(outcomes):
#         outcome = outcomes['relative market price']
#         change = np.abs(outcome[:, 1::]-outcome[:, 0:-1])
#
#         neg_change = np.min(change, axis=1)
#         pos_change = np.max(change, axis=1)
#
#         logical = (neg_change > -0.6) & (pos_change > 0.6)
#
#         classes = np.zeros(outcome.shape[0])
#         classes[logical] = 1
#
#         return classes
#
#     results = test_utilities.load_scarcity_data()
#
#     cart = setup_cart(results, scarcity_classify)
#     cart.build_tree()
#
#     print(cart.boxes_to_dataframe())
#     print(cart.stats_to_dataframe())
#     cart.display_boxes(together=True)
#
#     img = cart.show_tree()
#
#     import matplotlib.pyplot as plt
#     import matplotlib.image as mpimg
#
#     # treat the dot output string as an image file
#     sio = StringIO()
#     sio.write(img)
#     sio.seek(0)
#     img = mpimg.imread(sio)
#
#     # plot the image
#     imgplot = plt.imshow(img, aspect='equal')
#
#     plt.show()
def draw_tree(model, name):
    """Export ``model`` with Graphviz and save it as ``<name>.pdf``."""
    buffer = StringIO()
    _tree.export_graphviz(model, out_file=buffer)

    dot_text = buffer.getvalue()
    rendered = pydotplus.graph_from_dot_data(dot_text)

    pdf_path = name + ".pdf"
    rendered.write_pdf(pdf_path)
def plot_tree(tree: Tree, max_depth: int, iter: int):
    """
    Render a single decision tree with Graphviz and display it with pyplot.

    The tree is walked with ``traversal`` to collect (parent, child) node
    pairs. These are turned into dot source, written to
    ``results/NO.<iter>_tree.png`` and shown in figure 1.

    :param tree: the decision tree to draw; its ``root_node`` is traversed
    :param max_depth: only pairs whose parent has ``deep < max_depth`` are drawn
    :param iter: number of this tree, used in the file name and plot title
    :return: None
    """
    ellipse = '[width=1,height=0.5,color=lemonchiffon,style=filled,shape=ellipse,label=\"id:'
    leaf_box = 'p[width=1,height=0.5,color=lightskyblue,style=filled,shape=box,label=\"'

    def true_indices(flags):
        # Positions whose flag is exactly True (same test the labels always used).
        return [i for i in range(len(flags)) if flags[i] is True]

    res = []
    # Collect parent/child pairs (traversal is level-order; traversal_preorder
    # is the pre-order alternative).
    traversal(tree.root_node, res)

    # Nodes in first-seen order; a node's position is its name in the dot graph.
    ordered = []
    for pair in res:
        for member in (pair[0], pair[1]):
            if member not in ordered:
                ordered.append(member)

    edge_lines = []
    node_lines = []
    # Emit the pairs level by level so the dot source is grouped by depth.
    for depth in range(max_depth):
        for nodepair in res:
            if nodepair[0].deep != depth:
                continue
            # p and c are the parent and child of this pair.
            p, c = nodepair[0], nodepair[1]
            child_ids = true_indices(c.data_index)
            has_samples = len(child_ids) > 0
            pname = str(ordered.index(p))
            cname = str(ordered.index(c))

            if has_samples:
                edge_lines.append(
                    pname + '->' + cname + '[label=\"' + str(p.split_feature)
                    + ('<' if p.left_child == c else '>=')
                    + str(p.split_value) + '\"]' + ';\n')

            node_lines.append(
                pname + ellipse + str(true_indices(p.data_index)) + '\"];\n')
            if has_samples:
                node_lines.append(cname + ellipse + str(child_ids) + '\"];\n')

            if c.is_leaf and has_samples:
                # Leaves get an extra dotted edge to a box showing the prediction.
                edge_lines.append(cname + '->' + cname + 'p[style=dotted];\n')
                node_lines.append(
                    cname + leaf_box + "{:.4f}".format(c.predict_value) + '\"];\n')

    # Built after the loops so the graph exists even when max_depth is 0.
    dot = 'digraph g {\n' + ''.join(edge_lines) + ''.join(node_lines) + '}'
    graph = pdp.graph_from_dot_data(dot)

    # Save the picture, then show it with pyplot.
    png_path = 'results/NO.{}_tree.png'.format(iter)
    graph.write_png(png_path)
    img = Image.open(png_path)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    img = img.resize((1024, 700), Image.LANCZOS)
    plt.ion()
    plt.figure(1, figsize=(30, 20))
    plt.axis('off')
    plt.title('NO.{} tree'.format(iter))
    plt.rcParams['figure.figsize'] = (30.0, 20.0)
    plt.imshow(img)
    plt.pause(0.01)
def _applicant_rows(applicants):
    """Turn applicants into (features, program labels, shortlist labels)."""
    X, Y, Z = [], [], []
    for a in applicants:
        if a.apply_for == "mphil":
            Y.append("0")
        elif a.apply_for == "phd":
            Y.append("1")
        else:
            Y.append("2")
        Z.append(a.shortlisted)
        X.append([a.toefl, a.gpa_ug / a.gpa_ug_scale, a.papers,
                  "1" if a.major_ug == 'CS' else "0"])
    return X, Y, Z


def graph(request):
    """Train two decision trees on applicants and return an HTML page showing them.

    On GET, the first part of ``Applicants2016`` trains one tree for the
    applied-for program and one for the shortlist flag. Both trees are
    written as PNGs under ``static/mining1`` next to this file, and the
    shortlist tree's accuracy on the second part is printed.
    On POST, the result of ``_error_response`` is returned.
    """
    if request.method == 'GET':
        length = len(Applicants2016.objects.all())
        # Slice bounds must be integers: `/` yields a float on Python 3.
        half = length // 2
        # NOTE(review): the row at index half - 1 is in neither the training
        # slice nor the evaluation slice - confirm whether that is intended.
        X, Y, Z = _applicant_rows(Applicants2016.objects.all()[:(half - 1)])

        clf = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=1)
        clf2 = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=1)
        feature_names = ['toefl', 'gpa_ug_scale', 'papers', 'major_ug']
        class_names = ['mphil', 'phd', 'either']
        class2_names = ['not shortlisted', 'shortlisted']
        clf = clf.fit(X, Y)
        clf2 = clf2.fit(X, Z)

        f = tree.export_graphviz(clf, out_file=None,
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 filled=True, rounded=True,
                                 special_characters=True)
        f2 = tree.export_graphviz(clf2, out_file=None,
                                  feature_names=feature_names,
                                  class_names=class2_names,
                                  filled=True, rounded=True,
                                  special_characters=True)
        program_graph = pydotplus.graph_from_dot_data(f)
        shortlist_graph = pydotplus.graph_from_dot_data(f2)

        current_dir = os.path.dirname(os.path.abspath(__file__))
        static = os.path.join(current_dir, 'static', 'mining1')
        program_graph.write_png(os.path.join(static, "tree.png"))
        shortlist_graph.write_png(os.path.join(static, "tree2.png"))
        html = "<p style='text-align:center;'>Decision Tree for program</p><img src='/static/mining1/tree.png'><p style='text-align:center;'>Decision Tree for shortlisted interview</p><img src='/static/mining1/tree2.png'>"

        # Evaluate the shortlist tree on the second half.
        X, Y, Z = _applicant_rows(Applicants2016.objects.all()[half:length])
        result = clf2.predict(X)
        total = len(result)
        correct = sum(1 for predicted, actual in zip(result, Z)
                      if predicted == actual)
        if total:
            # float() keeps this a true ratio on Python 2 as well.
            print(float(correct) / total)
        return HttpResponse(html)
    elif request.method == 'POST':
        return _error_response(request, "POST not allowed")
elif listFromLine[2] == 'yes': returnMat[index, 2] = 1 if listFromLine[3] == 'reduced': returnMat[index, 3] = 0 elif listFromLine[3] == 'normal': returnMat[index, 3] = 1 #returnMat[index,:] = listFromLine[0:3] if listFromLine[-1] == 'no lenses': classLabelVector.append(3) elif listFromLine[-1] == 'soft': classLabelVector.append(2) elif listFromLine[-1] == 'hard': classLabelVector.append(1) index = index + 1 return returnMat, classLabelVector X, y = dataload() print X, y # 训练模型,限制树的最大深度4 clf = tree.DecisionTreeClassifier(criterion='entropy') #拟合模型 clf = clf.fit(X, y) dot_data = tree.export_graphviz(clf, out_file=None) graph = pydotplus.graph_from_dot_data(dot_data) graph.write_pdf("iris.pdf")
# Instantiate Model clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=5, min_samples_split=2, random_state=100, splitter='best') # Fit Model clf_gini.fit(X_train, y_train) # Generate Prediction clf_gini_pred = clf_gini.predict(X_test) # Calculate Accuracy Score '''Accuracy = ratio of correctly predicted target values vs all values''' clf_gini_accuracy = round(accuracy_score(y_test, clf_gini_pred) * 100, 2) print(clf_gini_accuracy) # Vizualize Tree ------------------------------------ os.chdir(r'/home/ccirelli2/Desktop/repositories/Scikit_Learn/output') dot_data = StringIO() export_graphviz(clf_gini, out_file=dot_data, filled=True, rounded=True, special_characters=True) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) graph.write_png('balance_scale.png') Image(graph.create_png())
def show_tree(clf):
    """Export *clf* to Graphviz dot source and save it as ``titanic_tree.pdf``."""
    buffer = StringIO()
    export_graphviz(clf, out_file=buffer)
    dot_source = buffer.getvalue()

    rendered = pydotplus.graph_from_dot_data(dot_source)
    rendered.write_pdf("titanic_tree.pdf")
def ML_tree(X_train, X_test, Y_train, Y_test):
    """Fit a decision tree, log its details to "Ml data.txt" and plot its ROC curve.

    A ``DecisionTreeClassifier`` is fitted on ``X_train``/``Y_train``. Its
    feature importances and a pseudo-code rendering of the tree are printed
    and appended to "Ml data.txt"; the tree is saved as a PDF and the ROC
    curve (computed on ``X_test``/``Y_test``) is saved under ``parent_dir``
    and shown.
    """
    fileML = open("Ml data.txt", "a+")  # append mode

    def tree_to_code(tree, feature_names):
        """Print the fitted tree as nested if/else pseudo-code and append it to fileML."""
        tree_ = tree.tree_
        # Feature name used at each node; leaf nodes get a placeholder.
        feature_name = [
            feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
            for i in tree_.feature
        ]
        print("def tree({}):\n".format(", ".join(feature_names)))
        fileML.write("def tree({}):\n".format(", ".join(feature_names)))

        def recurse(node, depth):
            """Emit the subtree rooted at *node*, indented by *depth*."""
            indent = " " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                # Internal node: emit the split, then both branches.
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print("{}if {} <= {}:\n".format(indent, name, threshold))
                fileML.write("{}if {} <= {}:\n".format(
                    indent, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                print("{}else: # if {} > {}\n".format(
                    indent, name, threshold))
                fileML.write("{}else: # if {} > {}\n".format(
                    indent, name, threshold))
                recurse(tree_.children_right[node], depth + 1)
            else:
                # Leaf node: emit the stored value.
                print("{}return {}\n".format(indent, tree_.value[node]))
                fileML.write("{}return {}\n".format(
                    indent, tree_.value[node]))

        recurse(0, 1)
        fileML.write("\n")

    # clf = DecisionTreeClassifier().fit(iris.data, iris.target)
    # NOTE(review): `tree` is not a local of ML_tree, so this relies on a
    # module-level name, and it runs before `clf` is fitted below - confirm
    # what is meant to be plotted here.
    plot_tree(tree, filled=True)
    plt.show()
    clf = DecisionTreeClassifier(random_state=0,
                                 max_depth=10,
                                 min_samples_leaf=1)
    result = clf.fit(X_train, Y_train)
    probs = clf.predict_proba(X_test)
    # Second probability column is used as the score for the ROC curve.
    preds = probs[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(Y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)
    #---------print----------------------------------------------
    # ----------tree----------------
    print(clf.feature_importances_)
    fileML.write("DTW Decision Tree Classifier feature_importances: \n")
    # One entry per feature: column name, importance, and 2**importance.
    for i in range(len(clf.feature_importances_)):
        print(X_train.columns[i])
        fileML.write("{0}: ".format(X_train.columns[i]))
        print(clf.feature_importances_[i])
        fileML.write("{0} \n".format(clf.feature_importances_[i]))
        print("2^coeff: " + str(2.0**(clf.feature_importances_[i])))
        fileML.write("2^coeff: {0} \n".format(
            str(2.0**(clf.feature_importances_[i]))))
    fileML.write("\n")
    # Render the fitted tree to a PDF via Graphviz.
    dot_data = six.StringIO()
    export_graphviz(clf,
                    out_file=dot_data,
                    feature_names=X_train.columns,
                    filled=True,
                    rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("DTW Decision Tree Classifier.pdf")
    # ----------graph---------------------
    # method I: plt
    plt.title('DTW Receiver Operating Characteristic-Decision Tree')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    path = os.path.join(parent_dir, "Decision Tree Classifier")
    plt.savefig(path)
    plt.show()
    print('finished')
    #----------function---------
    # NOTE(review): these names are hard-coded while the tree was fitted on
    # X_train.columns - confirm that they match in number and order.
    tree_to_code(clf, [
        "Malicious", "Day count Mean", "Day count STD", 'max Day count',
        "Size", "DTW 15-Malicious(%)", "DTW 10-Malicious(%)",
        "DTW 5-Malicious(%)", "Prevalence", "Peaks", "Sharp peaks"
    ])
    fileML.write("----------------------------------------------\n")
    # NOTE(review): the file is closed only on this path; an exception
    # raised above leaves it open.
    fileML.close()
def _fit_and_export(X, y, feature_names, png_name, test_size, split_seed,
                    criterion, max_depth, tree_seed):
    """Fit a tree on one feature pair, print its test accuracy, save its PNG."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split_seed)
    dt_clf = DecisionTreeClassifier(criterion=criterion,
                                    max_depth=max_depth,
                                    random_state=tree_seed)
    dt_clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, dt_clf.predict(X_test))
    print("{:.3f}".format(acc))
    dot_data = export_graphviz(dt_clf, out_file=None,
                               feature_names=feature_names)
    pydotplus.graph_from_dot_data(dot_data).write_png(png_name)


def main():
    """Train iris decision trees on sepal and on petal features.

    Reads five values from standard input, in this order: test size,
    split random state, split criterion, maximum depth, tree random state.
    For each feature pair the test accuracy is printed with three decimals
    and the tree is written to ``tree1.png`` (sepal) / ``tree2.png`` (petal).
    """
    # Put your code below
    test_size_input = float(input())
    random_state_input = int(input())
    criterion_input = input()
    deepth_input = int(input())
    random_state_tree_input = int(input())

    # The dataset is only read, never modified, so one load serves both runs.
    iris = datasets.load_iris()
    y = iris.target
    runs = [
        # use 'sepal length (cm)' and 'sepal width (cm)' as features
        (iris.data[:, 0:2],
         ['sepal length (cm)', 'sepal width (cm)'], 'tree1.png'),
        # use 'petal length (cm)' and 'petal width (cm)' as features
        (iris.data[:, 2:4],
         ['petal length (cm)', 'petal width (cm)'], 'tree2.png'),
    ]
    for X, feature_names, png_name in runs:
        _fit_and_export(X, y, feature_names, png_name,
                        test_size_input, random_state_input,
                        criterion_input, deepth_input,
                        random_state_tree_input)
confusion_matrix(y_validacao, y_predicao_validacao))) print("Matriz de confusão da teste :\n {}".format( confusion_matrix(y_teste, y_predicao_teste))) # Não estou conseguindo gerar um *.png arquivo_dot = StringIO() tree.export_graphviz( modeloAD, out_file=arquivo_dot, node_ids=True, feature_names=['Sangue', 'Da a luz', 'Pode voar', 'Mora na agua'], class_names=['SIM', 'NAO'], filled=True) arvore = pdp.graph_from_dot_data(arquivo_dot.getvalue()) lista_edge = [] for edge in arvore.get_edge_list(): lista_edge.append(edge.get_source()) nodes = arvore.get_node_list() for node in nodes: if node.get_name() == '0': node.set_fillcolor('#F19C99') elif node.get_name() not in lista_edge: node.set_fillcolor('#E1D5E7') else: node.set_fillcolor('#D5E8D4') arvore.write_png("arvore_mamifero.png")
def to_pdf(self, filename, dic_var=None):
    """Render the dot source from ``self.to_dot(dic_var)`` into a PDF at *filename*."""
    dot_source = self.to_dot(dic_var)
    rendered = pydotplus.graph_from_dot_data(dot_source)
    rendered.write_pdf(filename)
def plot_multi(trees: dict, max_depth: int, iter: int):
    """
    Render one decision tree per class and show them together in a grid.

    Each tree is walked with ``traversal``, turned into dot source and
    written to ``results/NO.<iter>_<class>_tree.png``. The images are then
    laid out three per row in figure 1, which is saved to
    ``results/NO.<iter>_tree.png``.

    :param trees: mapping of class index to the tree fitted for that class
    :param max_depth: only pairs whose parent has ``deep < max_depth`` are drawn
    :param iter: iteration number, used in file names and plot titles
    :return: None
    """
    ellipse = '[width=1,height=0.5,color=lemonchiffon,style=filled,shape=ellipse,label=\"id:'
    leaf_box = 'p[width=1,height=0.5,color=lightskyblue,style=filled,shape=box,label=\"'

    def true_indices(flags):
        # Positions whose flag is exactly True (same test the labels always used).
        return [i for i in range(len(flags)) if flags[i] is True]

    for class_index in trees.keys():
        res = []
        traversal(trees[class_index].root_node, res)

        # Nodes in first-seen order; a node's position is its dot name.
        ordered = []
        for pair in res:
            for member in (pair[0], pair[1]):
                if member not in ordered:
                    ordered.append(member)

        edge_lines = []
        node_lines = []
        # Emit the pairs level by level so the dot source is grouped by depth.
        for depth in range(max_depth):
            for nodepair in res:
                if nodepair[0].deep != depth:
                    continue
                p, c = nodepair[0], nodepair[1]
                child_ids = true_indices(c.data_index)
                has_samples = len(child_ids) > 0
                pname = str(ordered.index(p))
                cname = str(ordered.index(c))

                if has_samples:
                    edge_lines.append(
                        pname + '->' + cname + '[label=\"'
                        + str(p.split_feature)
                        + ('<' if p.left_child == c else '>=')
                        + str(p.split_value) + '\"]' + ';\n')

                node_lines.append(
                    pname + ellipse + str(true_indices(p.data_index))
                    + '\"];\n')
                if has_samples:
                    node_lines.append(
                        cname + ellipse + str(child_ids) + '\"];\n')

                if c.is_leaf and has_samples:
                    # Leaves get a dotted edge to a box showing the prediction.
                    edge_lines.append(
                        cname + '->' + cname + 'p[style=dotted];\n')
                    node_lines.append(
                        cname + leaf_box
                        + "{:.4f}".format(c.predict_value) + '\"];\n')

        # Render once per class, after all levels were collected, instead of
        # re-rendering the same file for every depth level.
        dot = 'digraph g {\n' + ''.join(edge_lines) + ''.join(node_lines) + '}'
        graph = pdp.graph_from_dot_data(dot)
        graph.write_png('results/NO.{}_{}_tree.png'.format(iter, class_index))

    # Show every class image in one figure, three per row.
    plt.ion()
    plt.figure(1, figsize=(30, 20))
    plt.axis('off')
    plt.title('NO.{} iter '.format(iter))
    class_num = len(trees.keys())
    rows = (class_num + 2) // 3  # ceiling of class_num / 3
    for index, class_index in enumerate(trees.keys()):
        plt.subplot(rows, 3, index + 1)
        img = Image.open('results/NO.{}_{}_tree.png'.format(iter, class_index))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        img = img.resize((1024, 700), Image.LANCZOS)
        plt.axis('off')
        plt.title('NO.{}_class {}'.format(iter, class_index))
        plt.rcParams['figure.figsize'] = (30.0, 20.0)
        plt.imshow(img)
    plt.savefig('results/NO.{}_tree.png'.format(iter))
    plt.pause(0.01)
ha='right', fontsize=20) plt.ylabel('True label', fontsize=20) plt.xlabel('Predicted label', fontsize=20) plt.tight_layout() plt.show() # display decision tree dot_data = tree.export_graphviz(clf_gini, filled=True, rounded=True, class_names='survived', feature_names=tt.iloc[:, 0:].columns, out_file=None) graph = graph_from_dot_data(dot_data) graph.write_pdf("decision_tree_gini.pdf") webbrowser.open_new(r'decision_tree_gini.pdf') dot_data = tree.export_graphviz(clf_entropy, filled=True, rounded=True, class_names='survived', feature_names=tt.iloc[:, 0:].columns, out_file=None) graph = graph_from_dot_data(dot_data) graph.write_pdf("decision_tree_entropy.pdf") webbrowser.open_new(r'decision_tree_entropy.pdf') #%%-----------------------------------------------------------------------
lenses_list = [] # print(lenses_dict) #打印字典信息 lenses_pd = pd.DataFrame(lenses_dict) #生成pandas.DataFrame # print(lenses_pd) #打印pandas.DataFrame le = LabelEncoder() #创建LabelEncoder()对象,用于序列化 for col in lenses_pd.columns: #序列化 lenses_pd[col] = le.fit_transform(lenses_pd[col]) # print(lenses_pd) #打印编码信息 clf = tree.DecisionTreeClassifier( max_depth=6) #创建DecisionTreeClassifier()类 clf = clf.fit(lenses_pd.values.tolist(), lenses_target) #使用数据,构建决策树 dot_data = StringIO() tree.export_graphviz( clf, out_file=dot_data, #绘制决策树 feature_names=lenses_pd.keys(), class_names=clf.classes_, filled=True, rounded=True, special_characters=True) # graph = pydotplus.graph_from_dot_data(dot_data.getvalue()); #下面这列解决中文乱码 graph = pydotplus.graph_from_dot_data(dot_data.getvalue().replace( 'helvetica', '"Microsoft YaHei"')) print(dot_data.getvalue()) graph.write_pdf("tree.pdf") #保存绘制好的决策树,以PDF的形式存储。 print(clf.predict([[1, 1, 1, 0]])) #预测
def autolayout(self):
    """
    Using graphviz to layout the current graph to 'dot' layout.

    Every visible instance widget becomes a rectangular node sized from its
    preferred size, every visible connection becomes an edge of minimum
    length 2. The positions computed by graphviz are applied to the
    widgets, then the view is updated and the layout saved.
    """
    # Create an empty graph
    graph_viz = Digraph(engine='dot')

    # For all instances, add a node, with its size.
    for widget_instance in self.widget_instances:
        assert isinstance(widget_instance, InstanceWidget)
        if widget_instance.hidden:
            continue
        size = widget_instance.preferredSize()
        assert isinstance(size, QtCore.QSizeF)
        graph_viz.node(widget_instance.name,
                       width=str(size.width() / 72.0),
                       height=str(size.height() / 72.0),
                       shape="rect")

    # For all (not hidden) connections, connect the source and destination
    # widgets with minimum length of 2 inches
    for connection in self.connection_widgets:
        assert isinstance(connection, ConnectionWidget)
        if not connection.hidden:
            graph_viz.edge(connection.source_instance_widget.name,
                           connection.dest_instance_widget.name,
                           minlen=str(2))

    # Generate / Render graph into 'dot' format
    raw_dot_data = graph_viz.pipe('dot')
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print("Graphviz rendering... the following is the dot file from graphviz")
    print(raw_dot_data)

    # Read dot format (using pydotplus)
    dot_data = pydotplus.graph_from_dot_data(raw_dot_data)

    # Get graphviz height from the bounding box attribute
    height = 0
    for attribute_dict in dot_data.get_graph_defaults():
        if not isinstance(attribute_dict, dict):
            continue
        # .get(): an attribute dict without a 'bb' entry is skipped, not a KeyError.
        bounding_box = attribute_dict.get('bb')
        if bounding_box is not None:
            rectangle = Common.extract_numbers(bounding_box)
            height = rectangle[1][1] - rectangle[0][1]

    # For all instances, apply new position to widgets.
    for instance_widget in self.widget_instances:
        assert isinstance(instance_widget, InstanceWidget)
        if instance_widget.hidden:
            continue

        # Get the node representing this instance, and get its attributes
        instance_name = instance_widget.name
        node_list = dot_data.get_node(instance_name)
        if len(node_list) < 1:
            # Name may be quoted due to special characters
            node_list = dot_data.get_node('"%s"' % instance_name)
        assert len(node_list) == 1  # Should only be one node
        assert isinstance(node_list[0], pydotplus.Node)
        node_attributes_dict = node_list[0].get_attributes()

        # Extract position of the node
        node_position_list = Common.extract_numbers(node_attributes_dict['pos'])
        # `==`, not `is`: identity of small ints is an implementation detail.
        assert len(node_position_list) == 1  # Should only be one position
        node_position = node_position_list[0]
        # The y coordinate is measured from the other edge of the bounding
        # box, so it is taken relative to the graph height.
        self.reposition_instance_widget(
            instance_widget,
            x_pos=node_position[0],
            y_pos=math.fabs(height - node_position[1]))

    self.update_view()
    self.save_layout_to_file()
# Load the first eight columns of the data file, then drop every row that
# contains a NaN in any of them.
car_data = np.genfromtxt("auto-mpg-modified.data", usecols=range(8))
car_data = car_data[~np.isnan(car_data).any(axis=1)]
# Assign MPG to y and all other attributes to x
# (column 0 becomes the label, columns 1-7 the features)
data = car_data[:, 1:]
labels = car_data[:, 0]
# Uncomment to add some noise to the data
#noise = np.random.normal(0, 10, len(data))
#data[:,5] += noise.astype(int)
# Fit a depth-3 decision tree using the entropy criterion.
dt = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
dt = dt.fit(data, labels)
# Display names for the seven feature columns and the three classes.
attributes = [
    "CYLYNDERS", "DISPLACEMENT", "HORSEPOWER", "WEIGHT", "ACCELERATION",
    "MODEL_YEAR", "ORIGIN"
]
class_labels = ["BAD", "OK", "GOOD"]
# Export the tree as dot source and render it to dtree.png.
out = StringIO()
tree.export_graphviz(dt,
                     out_file=out,
                     feature_names=attributes,
                     class_names=class_labels,
                     filled=True,
                     impurity=False)
pydotplus.graph_from_dot_data(out.getvalue()).write_png("dtree.png")
def hr_modeling(features, label, tree_vis=False, ann=False):
    """Fit a list of classifiers and print accuracy, recall and F1 for each.

    Every model is fitted on the training split and scored on the training,
    validation and test splits. With ``tree_vis`` the decision-tree models
    are also exported to 'dt_tree.pdf'; with ``ann`` a small Keras network
    is trained and scored the same way.
    """
    # Split the original data into training, validation and test sets (6:2:2):
    # 20% is held out first, then 25% of the remainder.
    f_v = features.values
    f_n = features.columns.values
    l_v = label.values
    x_tt, x_validation, y_tt, y_validation = train_test_split(f_v,
                                                              l_v,
                                                              test_size=0.2)
    x_train, x_test, y_train, y_test = train_test_split(x_tt,
                                                        y_tt,
                                                        test_size=0.25)
    # (display name, estimator) pairs to evaluate.
    models = []
    models.append(('KNN', KNeighborsClassifier(n_neighbors=3)))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('DecisionTreeGini', DecisionTreeClassifier()))
    models.append(
        ('DecisionTreeEntropy', DecisionTreeClassifier(criterion='entropy')))
    models.append(('SVM Classifier', SVC(C=1000)))
    models.append(('RandomForest',
                   RandomForestClassifier(n_estimators=81,
                                          max_features=None)))
    models.append(('Adaboost', AdaBoostClassifier()))
    models.append(('LogisticRegression',
                   LogisticRegression(penalty='l2', C=1.0, tol=1e-10)))
    models.append(
        ('GBDT', GradientBoostingClassifier(max_depth=6, n_estimators=100)))
    for clf_name, clf in models:
        clf.fit(x_train, y_train)
        xy_list = [(x_train, y_train), (x_validation, y_validation),
                   (x_test, y_test)]
        for i in range(len(xy_list)):
            x_part = xy_list[i][0]
            y_part = xy_list[i][1]
            y_pred = clf.predict(x_part)
            print(i)
            # Score the model on the training, validation and test sets in turn
            # (printed index: 0 = train, 1 = validation, 2 = test).
            print(clf_name, '-ACC:', accuracy_score(y_part, y_pred))
            print(clf_name, '-REC:', recall_score(y_part, y_pred))
            print(clf_name, '-F1:', f1_score(y_part, y_pred))
        # Decision trees can additionally be rendered as a graph.
        # NOTE(review): both DecisionTree models write to the same
        # 'dt_tree.pdf', so the later one overwrites the earlier one.
        if clf_name.startswith('DecisionTree') and tree_vis:
            dot_data = export_graphviz(clf,
                                       out_file=None,
                                       feature_names=f_n,
                                       class_names=['NL', 'L'],
                                       filled=True,
                                       rounded=True,
                                       special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data)
            graph.write_pdf('dt_tree.pdf')
    if ann:
        # One hidden layer of 50 sigmoid units, two-way softmax output.
        ann_model = Sequential()
        ann_model.add(Dense(50, input_dim=len(f_v[0])))
        ann_model.add(Activation('sigmoid'))
        ann_model.add(Dense(2))
        ann_model.add(Activation('softmax'))
        sgd = SGD(lr=0.1)
        ann_model.compile(optimizer=sgd, loss='mean_squared_error')
        # Labels are one-hot encoded: 1 -> [0, 1], anything else -> [1, 0].
        ann_model.fit(x=x_train,
                      y=np.array([[0, 1] if i == 1 else [1, 0]
                                  for i in y_train]),
                      nb_epoch=15000,
                      batch_size=8999)
        xy_list = [(x_train, y_train), (x_validation, y_validation),
                   (x_test, y_test)]
        for i in range(len(xy_list)):
            x_part = xy_list[i][0]
            y_part = xy_list[i][1]
            y_pred = ann_model.predict_classes(x_part)
            print(i)
            print('ANN', '-ACC:', accuracy_score(y_part, y_pred))
            print('ANN', '-REC:', recall_score(y_part, y_pred))
            print('ANN', '-F1:', f1_score(y_part, y_pred))
def randomforest_predict():
    """Grid-search a random forest on the housing data and report the result.

    Reads "data/housing.data", tunes a ``RandomForestRegressor`` with a
    3-fold grid search, renders one of its trees to "result/rf_reg.png",
    prints and plots the feature importances, plots predictions against the
    held-out labels, prints the RMSE over the full data set, and writes the
    test predictions to "result/price_predict_randomforest.csv" and a
    comparison plot to "result/price_predict_random_forest.png".
    """
    warnings.filterwarnings('ignore')
    df_data = pd.read_csv("data/housing.data", delim_whitespace=True)
    X = df_data.drop(["MEDV"], axis=1)
    y = df_data["MEDV"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.15,
                                                        random_state=128)
    param_grid = {
        'n_estimators': [5, 10, 20, 50, 100, 200],  # tree number
        'max_depth': [3, 5, 7],  # max depth
        'max_features': [0.6, 0.7, 0.8, 1],  # max features
    }
    grid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, cv=3)
    grid.fit(X_train, y_train)
    print("best_params", grid.best_params_)
    rf_reg = grid.best_estimator_
    print(rf_reg)

    # Render one member tree of the forest.
    estimator = rf_reg.estimators_[3]
    dot_data = tree.export_graphviz(estimator,
                                    out_file=None,
                                    filled=True,
                                    rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png("result/rf_reg.png")

    # Feature importances, most important first.
    feature_names = X.columns
    feature_importances = rf_reg.feature_importances_
    indices = np.argsort(feature_importances)[::-1]
    for index in indices:
        print("feature %s (%f)" %
              (feature_names[index], feature_importances[index]))
    plt.figure(figsize=(16, 8))
    plt.title("feature importance of random forest")
    plt.bar(range(len(feature_importances)),
            feature_importances[indices],
            color='b')
    plt.xticks(range(len(feature_importances)),
               np.array(feature_names)[indices],
               color='b')
    plt.show()

    # Predict once and reuse the result everywhere below.
    y_predict = rf_reg.predict(X_test)

    rst = pd.DataFrame({"label": y_test, "prediction": y_predict})
    print(rst.head())
    rst['label'].plot(style='k.', figsize=(15, 5))
    rst['prediction'].plot(style='r.')
    plt.legend(fontsize=15, markerscale=3)
    plt.tick_params(labelsize=25)
    plt.grid()
    plt.show()

    # RMSE over the full data set (training and test rows together).
    MSE = metrics.mean_squared_error(y, rf_reg.predict(X))
    print(np.sqrt(MSE))

    submission = pd.DataFrame({"prediction": y_predict})
    submission.to_csv("result/price_predict_randomforest.csv")

    # pandas no longer supports multi-dimensional indexing on a Series
    # (``series[:, np.newaxis]``); plain 1-D arrays draw the same two lines.
    x_data = np.arange(len(y_test))
    plt.plot(x_data, np.asarray(y_test), label='Price')
    plt.plot(x_data, y_predict, label='Predict price')
    plt.xlabel('Entity')
    plt.ylabel('Price')
    plt.title('Price prediction (random forest)')
    plt.legend()
    plt.savefig('result/price_predict_random_forest.png')
    plt.show()
[1, 0, 1, 2, 'yes'], [1, 0, 1, 2, 'yes'], [2, 0, 1, 2, 'yes'], [2, 0, 1, 1, 'yes'], [2, 1, 0, 1, 'yes'], [2, 1, 0, 2, 'yes'], [2, 0, 0, 0, 'no'] ] y_label = "isOk" x_labels = ["age", "work", "house", "credit"] return data, x_labels, y_label if __name__ == '__main__': data, x_labels, y_label = createDataSet() x, y = [], [] for i in data: y.append(i[-1]) x.append(i[:-1]) train_x = df(x, columns=x_labels) train_y = df(y, columns=[y_label]) clf = tree.DecisionTreeClassifier( criterion="entropy", max_depth=5) # 创建DecisionTreeClassifier()类 clf = clf.fit(x, y) # 使用数据,构建决策树 reg_dot_data = tree.export_graphviz(clf, out_file=None, feature_names=train_x.keys(), class_names=clf.classes_) # 决策树可视化函数 reg_graph = pydotplus.graph_from_dot_data(reg_dot_data) reg_graph.write_png('tree.png') # 保存为图片
def test_16_Tree(): strOf_FuncName = "test_16_Tree" '''################### step : 1 opening, vars ###################''' print() print ("[%s:%d] starting : %s (time=%s)" % ( os.path.basename(os.path.basename(libs.thisfile())) , libs.linenum() , strOf_FuncName , libs.get_TimeLabel_Now() ) ) print() #ref https://www.devdungeon.com/content/python-import-syspath-and-pythonpath-tutorial#toc-13 print ("[%s:%d] sys.path ==>" % ( os.path.basename(os.path.basename(libs.thisfile())) , libs.linenum() ) ) print(sys.path) print() #ref https://www.devdungeon.com/content/python-import-syspath-and-pythonpath-tutorial#toc-13 print ("[%s:%d] os.environ[\"PATH\"] ==>" % ( os.path.basename(os.path.basename(libs.thisfile())) , libs.linenum() ) ) print(os.environ["PATH"]) '''################### step : 2 data : load ###################''' df = pandas.read_csv("shows.csv") # #debug # print ("[%s:%d] df ==> " # % # (os.path.basename(libs.thisfile()), libs.linenum()) # ) # # print(df) '''################### step : 2 : 2 data : mapping ###################''' d = {'UK': 0, 'USA': 1, 'N': 2} df['Nationality'] = df['Nationality'].map(d) d = {'YES': 1, 'NO': 0} df['Go'] = df['Go'].map(d) # #debug # print ("[%s:%d] df (mapped) ==> " # % # (os.path.basename(libs.thisfile()), libs.linenum()) # ) # # print(df) '''################### step : 2 : 3 data : feature, target ###################''' features = ['Age', 'Experience', 'Rank', 'Nationality'] X = df[features] y = df['Go'] # #debug # print ("[%s:%d] feature, column ==> " # % # (os.path.basename(libs.thisfile()), libs.linenum()) # ) # # print(X) # print(y) '''################### step : 3 tree ###################''' dtree = DecisionTreeClassifier() dtree = dtree.fit(X, y) # #debug # print ("[%s:%d] dtree ==> " # % # (os.path.basename(libs.thisfile()), libs.linenum()) # ) # # print(dtree) #debug:20210418_170812 data = tree.export_graphviz(dtree, out_file=None, feature_names=features) # #debug # print ("[%s:%d] data ==> " # % # 
(os.path.basename(libs.thisfile()), libs.linenum()) # ) # # print(data) #mark:20210503_164821 graph = pydotplus.graph_from_dot_data(data) # #debug # print ("[%s:%d] graph ==> " # % # (os.path.basename(libs.thisfile()), libs.linenum()) # ) # # print(graph) '''################### step : 4 graph ###################''' strOf_Time_Label = libs.get_TimeLabel_Now() dpath_PlotImage = "./data/s-10" # dpath_PlotImage = "./data/s-9" fname_PlotImage = "mydecisiontree.%s.png" % (strOf_Time_Label) # fname_PlotImage = "plot_image_%s" % (strOf_Time_Label) fpath_PlotImage = os.path.join(dpath_PlotImage, fname_PlotImage) graph.write_png(fpath_PlotImage) # graph.write_png('mydecisiontree.png') #debug print ("[%s:%d] graph.write_png ==> %s" % ( os.path.basename(libs.thisfile()), libs.linenum() , fpath_PlotImage ) ) img=pltimg.imread(fpath_PlotImage) # img=pltimg.imread('mydecisiontree.png') imgplot = plt.imshow(img) # plt.show() '''################### step : 5 predict ###################''' litOf_Predict_Conditions = [40, 10, 6, 1] # litOf_Predict_Conditions = [40, 10, 7, 1] #debug print ("[%s:%d] litOf_Predict_Conditions ==>" % ( os.path.basename(libs.thisfile()), libs.linenum() ) ) print(litOf_Predict_Conditions) print(dtree.predict([litOf_Predict_Conditions])) # print(dtree.predict([[40, 10, 7, 1]])) '''################### step : 2 prep ###################''' '''###################
def createDataSet(dict):
    """Train a decision tree from a CSV file and save the model and its renderings.

    The CSV at ``dict['in_file_path']`` has a header row; the last column
    is the label and all other columns are features, which are one-hot
    encoded with ``DictVectorizer``. The remaining entries of ``dict`` are
    passed to ``tree.DecisionTreeClassifier``. Next to
    ``dict['out_file_path']`` (the dumped model) a ``.dot``, ``.pdf`` and
    ``.png`` file with the same base name are written.

    :param dict: configuration mapping with the keys read below
    :return: None
    """
    # Read everything up front so the file is closed before training starts.
    with open(dict['in_file_path'], newline='') as allElectronicsData:
        reader = csv.reader(allElectronicsData)
        headers = next(reader)
        rows = list(reader)
    print("headers\n", headers)

    featureList = []
    labelList = []
    for row in rows:
        labelList.append(row[-1])
        featureList.append(
            {headers[i]: row[i] for i in range(len(row) - 1)})
    print("labelList\n", labelList)

    vec = DictVectorizer()
    dummyX = vec.fit_transform(featureList).toarray()
    feature_names = vec.get_feature_names()
    print('show vector name\n', feature_names)
    print("dummyX\n", dummyX)
    # Labels are used as-is (they are not binarized).
    dummyY = labelList
    print("dummyY\n", dummyY)
    print(dict)

    clf = tree.DecisionTreeClassifier(
        criterion=dict['criterion'],
        splitter=dict['splitter'],
        max_depth=dict['max_depth'],
        min_samples_split=dict['min_samples_split'],
        min_samples_leaf=dict['min_samples_leaf'],
        min_weight_fraction_leaf=dict['min_weight_fraction_leaf'],
        max_features=dict['max_features'],
        random_state=dict['random_state'],
        max_leaf_nodes=dict['max_leaf_nodes'],
        min_impurity_decrease=dict['min_impurity_decrease'],
        min_impurity_split=dict['min_impurity_split'],
        class_weight=dict['class_weight'],
        presort=dict['presort'],
        ccp_alpha=dict['ccp_alpha'])
    print(clf)
    clf.fit(dummyX, dummyY)
    print("training score : %.3f " % (clf.score(dummyX, dummyY)))

    import pydotplus
    from six import StringIO

    model_path = dict['out_file_path']
    model_parent_path = os.path.split(model_path)[0]
    # A bare file name has an empty parent, and os.makedirs('') raises.
    if model_parent_path:
        os.makedirs(model_parent_path, exist_ok=True)
    base_path = os.path.splitext(model_path)[0]
    pdf_path = base_path + ".pdf"
    dot_path = base_path + ".dot"
    png_path = base_path + ".png"

    # Plain dot source on disk.
    with open(dot_path, 'w') as dot_file:
        tree.export_graphviz(clf,
                             out_file=dot_file,
                             class_names=clf.classes_,
                             feature_names=feature_names)

    # Styled rendering as PDF.
    dot_data = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_data,
                         feature_names=feature_names,
                         class_names=clf.classes_,
                         filled=True,
                         rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(pdf_path)

    joblib.dump(clf, model_path)

    # Matplotlib rendering as PNG.
    plt.figure()
    plot_tree(clf,
              filled=True,
              feature_names=feature_names,
              class_names=clf.classes_)
    plt.savefig(png_path)
def _decision_tree_classification_train(
        table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini', splitter='best', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
        max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, class_weight=None, presort=False,
        sample_weight=None, check_input=True, X_idx_sorted=None):
    """Fit a ``DecisionTreeClassifier`` on ``table`` and build its model/report dict.

    Parameters
    ----------
    table :
        Data container indexed by column name (``table[label_col]``);
        ``feature_cols`` are resolved through ``check_col_type``.
    feature_cols : sequence
        Names of the feature columns.
    label_col :
        Name of the label column. A target that ``type_of_target`` classifies
        as ``'continuous'`` is reported through ``raise_error('0718', ...)``.
    class_weight : sequence or None
        One weight per distinct label, given in the sorted order of the
        labels; it is converted to a ``{label: weight}`` mapping.
    criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
    min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
    min_impurity_decrease, min_impurity_split, presort :
        Forwarded to ``DecisionTreeClassifier``.
    sample_weight, check_input, X_idx_sorted :
        Forwarded to ``DecisionTreeClassifier.fit``.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds the fitted classifier, its
        fitted attributes, a feature-importance table and the markdown report
        under ``'_repr_brtc_'``.

    Raises
    ------
    ValueError
        If ``class_weight`` is given and its length differs from the number
        of distinct labels.
    """
    feature_names, features = check_col_type(table, feature_cols)
    y_train = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    class_labels = sorted(set(y_train))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        class_weight = {
            class_labels[i]: class_weight[i]
            for i in range(len(class_labels))
        }

    # Keyword arguments: identical meaning to the former positional call, but
    # independent of the parameter order of the installed scikit-learn.
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(features, y_train,
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best effort: it needs pydotplus and a Graphviz
    # install. Only ordinary errors are turned into the hint text, so
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        from sklearn.tree import export_graphviz
        import pydotplus
        from brightics.common.repr import png2MD

        # out_file=None makes export_graphviz return the DOT source directly,
        # so no StringIO buffer (formerly sklearn.externals.six) is needed.
        # The builtin ``str`` replaces the removed ``np.str`` alias.
        dot_data = export_graphviz(
            classifier, out_file=None,
            feature_names=feature_names,
            class_names=classifier.classes_.astype(str),
            filled=True, rounded=True,
            special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart, least important feature at the bottom
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_names)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices],
             color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        # Value label at the tip of each bar.
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    # Clear the shared pyplot figure so the next report starts empty.
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]]
         for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
# Train a decision tree on the Iris data set, print its evaluation metrics
# and render the fitted tree to 'Iris.png'.
import numpy as np
import pandas as pd  # required by pd.read_csv below
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pydotplus as pdp
from IPython.display import Image

url = "https://raw.githubusercontent.com/venky14/Machine-Learning-with-Iris-Dataset/master/Iris.csv"
# Column names of the CSV; not passed to read_csv, which takes the names from
# the file's own header row.
cols = ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

# Despite the variable name, this frame holds the Iris data loaded from `url`.
pima = pd.read_csv(url)
print(pima.head())

feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
x = pima[feature_cols]
y = pima.Species

# 70/30 split; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

clf = DecisionTreeClassifier()
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

result = confusion_matrix(y_test, y_pred)
print("confusion matrix")
print(result)

result1 = classification_report(y_test, y_pred)
print("classification report")
print(result1)

result2 = accuracy_score(y_test, y_pred)
print("accuracy: ", result2)

# out_file=None returns the DOT source as a string instead of writing a file.
dot_data = export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True,
                           feature_names=feature_cols,
                           class_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
graph = pdp.graph_from_dot_data(dot_data)
graph.write_png('Iris.png')
# Builds an IPython image of the tree; presumably meant as the last
# expression of a notebook cell so that it is displayed -- TODO confirm.
Image(graph.create_png())
min_samples_leaf=9, min_samples_split=6, min_weight_fraction_leaf=0.0, presort=False, random_state=42, splitter='best') clf1 = clf1.fit(X1, Y1) dot_data1 = tree.export_graphviz( clf1, out_file=None, feature_names=[pos_names[x] for x in features_train1], class_names=['ctrl', 'expr'], filled=True, rounded=True, special_characters=True) graph1 = pydotplus.graph_from_dot_data(dot_data1) graph1.write_pdf("tree-dataset-1.pdf") clf2 = tree.DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=5, min_samples_split=4, min_weight_fraction_leaf=0.0, presort=False, random_state=42, splitter='best') clf2 = clf2.fit(X2, Y2)
'green', 'dark', 'dark', 'dark', 'green', 'white', 'white', 'green', 'white', 'dark', 'white', 'green'], 'root': ['fully rolled', 'fully rolled', 'fully rolled', 'fully rolled', 'fully rolled', 'slightly rolled ','slightly rolled ', 'slightly rolled ', 'slightly rolled ', 'straight', 'slightly rolled ', 'fully rolled', 'slightly rolled ', 'slightly rolled ','slightly rolled ', 'fully rolled', 'fully rolled'], 'response': ['boom', 'low', 'boom', 'low', 'boom', 'boom', 'boom', 'boom', 'low', 'clear', 'clear', 'boom', 'boom', 'low', 'boom', 'boom', 'low'], 'texture': ['clear'] * 6 + ['slightly paste', 'clear', 'slightly paste', 'clear', 'paste', 'paste', 'slightly paste', 'slightly paste', 'clear', 'paste', 'slightly paste'], 'navel': ['dent'] * 5 + ['slightly dent'] * 4 + ['flat'] * 3 + ['dent'] * 2 + \ ['slightly dent', 'flat', 'slightly dent'], 'touch': ['hard slip'] * 5 + ['soft sticky ', 'soft sticky ', 'hard slip', 'hard slip', 'soft sticky ', 'hard slip', 'soft sticky ', 'hard slip', 'hard slip', 'soft sticky ', 'hard slip', 'hard slip'], 'good': ['good'] * 8 + ['bad'] * 9}) test_data = test_data[[ 'color', 'root', 'response', 'texture', 'navel', 'touch', 'good' ]] ## print(test_data.show()) X, Y = test_data[:'touch'], test_data['good'] mytree = DecisionTreeClassifier() mytree.fit(X, Y) import pydotplus graph = pydotplus.graph_from_dot_data(mytree.export_graphviz()) graph.write_pdf(r'C:\Users\JacksonWoo\Desktop\boston.pdf') print(mytree) print(mytree.predict(X)) print( mytree.predict( SeriesSet([['red', 'red', 'clear', 'None', 'None', 'soft sticky']])))
def automate(wordfile):
    """Cross-validate and visualise a decision tree for each consecutive word pair.

    Words are read one per line and consumed two at a time (1st with 2nd,
    3rd with 4th, ...). For every pair the sentence examples of both words
    are turned into 16-dimensional feature rows, a ``DecisionTreeClassifier``
    is scored with 10-fold cross-validation (accuracy, precision, recall,
    F1), the averages are printed, the fitted tree is written to
    ``tree_test.png`` and a feature-importance bar chart is shown. The
    per-pair averages are printed as lists at the end.

    Parameters
    ----------
    wordfile :
        Not consulted by the body.
        NOTE(review): the word list path is taken from the module-level
        ``args.wordfile`` and the sentence data from ``args.filename``;
        confirm with callers before switching to this argument.

    Returns
    -------
    None
    """
    n_folds = 10  # used both as ``cv`` and as the divisor of the averages

    accuracy = []
    precision = []
    recall = []
    fscore = []

    # A separate handle name keeps the ``wordfile`` parameter intact.
    with open(args.wordfile, encoding='utf8') as word_handle:
        words = [line.rstrip() for line in word_handle]

    if len(words) % 2:
        # Previously the loop ran into an IndexError on the missing partner
        # after all complete pairs had been processed and before the summary.
        print("Ignoring unpaired trailing word:", words[-1])

    feature_names = [
        "FO V", "FO H", "PH V", "PH H", "NF V", "NF H", "AF V", "AF H",
        "SMO V", "SMO H", "KVK V2", "KK V2", "NHM V", "NHM H", "NH V", "NH H"
    ]
    colors = ('turquoise', 'orange')

    for target_word_1, target_word_2 in zip(words[0::2], words[1::2]):
        # The datafile contains all sentence examples of the category in
        # consideration. Everything derived from it is computed while the
        # file is still open, so lazily reading helpers keep working.
        with open(args.filename, encoding='utf8') as data_file:
            sent = preprocess(data_file)
            sentences = tuple_sent(sent)
            sent_list_1 = sent_list(sentences, target_word_1)
            sent_list_2 = sent_list(sentences, target_word_2)

        # One data matrix per target word, stacked in a single call instead
        # of growing the array row by row.
        # NOTE(review): the leading all-zero seed row stays in each matrix and
        # is therefore part of the training data of its class; kept as-is so
        # the scores do not change.
        data_matrix_1 = np.vstack(
            [np.zeros([1, 16])]
            + [feature_extraction(s, target_word_1) for s in sent_list_1])
        data_matrix_2 = np.vstack(
            [np.zeros([1, 16])]
            + [feature_extraction(s, target_word_2) for s in sent_list_2])

        # Class 0 = first word, class 1 = second word.
        target = np.concatenate(
            (np.zeros(data_matrix_1.shape[0]),
             np.ones(data_matrix_2.shape[0])),
            axis=0)
        # A unified data matrix containing all examples for both candidates
        data = np.concatenate((data_matrix_1, data_matrix_2), axis=0)

        clf = tree.DecisionTreeClassifier()
        cross_val_accuracy_scores = cross_val_score(clf, data, target,
                                                    cv=n_folds)
        cross_val_precision_scores = cross_val_score(clf, data, target,
                                                     cv=n_folds,
                                                     scoring="precision")
        cross_val_recall_scores = cross_val_score(clf, data, target,
                                                  cv=n_folds,
                                                  scoring="recall")
        cross_val_f1_scores = cross_val_score(clf, data, target,
                                              cv=n_folds, scoring="f1")

        print("Results for", target_word_1, "and", target_word_2)
        # Average over all cross val scores
        sumav = sum(cross_val_accuracy_scores) / n_folds
        print("Average accuracy:", sumav)
        sumav2 = sum(cross_val_precision_scores) / n_folds
        print("Average precision:", sumav2)
        sumav3 = sum(cross_val_recall_scores) / n_folds
        print("Average recall:", sumav3)
        sumav4 = sum(cross_val_f1_scores) / n_folds
        print("Average F1:", sumav4)

        accuracy.append(sumav)
        precision.append(sumav2)
        recall.append(sumav3)
        fscore.append(sumav4)

        # Visualize data
        clf.fit(data, target)
        dot_data = tree.export_graphviz(clf, feature_names=feature_names,
                                        out_file=None, filled=True,
                                        rounded=True)
        graph = pydotplus.graph_from_dot_data(dot_data)

        # Group child node ids by parent, then colour the lower-numbered
        # child with colors[0] and the higher-numbered one with colors[1].
        edges = collections.defaultdict(list)
        for edge in graph.get_edge_list():
            edges[edge.get_source()].append(int(edge.get_destination()))
        for source in edges:
            edges[source].sort()
            for i in range(2):
                dest = graph.get_node(str(edges[source][i]))[0]
                dest.set_fillcolor(colors[i])

        # Outputs a visualized tree graph; the same file name is reused for
        # every pair, so only the last pair's tree remains on disk.
        graph.write_png('tree_test.png')

        feature_importance = clf.feature_importances_
        y_pos = np.arange(len(feature_importance))
        plt.barh(y_pos, feature_importance)
        plt.title("Mikilvægi þátta fyrir " + str(target_word_1) + " and " +
                  str(target_word_2))
        plt.yticks(y_pos, feature_names)
        # Outputs a bar chart showing the importance of each feature
        plt.show()

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", fscore)
def main():
    """Fit a decision tree on the dermatology data and write it to 'tree_test.png'.

    Reads 'dermatologyWOnan.data', names its 35 columns, saves the renamed
    table as 'dermData.csv', trains a ``DecisionTreeClassifier`` on all
    columns except the last ('label') and renders the tree, colouring the two
    children of every split node differently.
    """
    # Import data and create decision tree
    column_names = [
        'erythema', 'scaling', 'definite borders', 'itching',
        'koebner phenomenon', 'polygonal papules', 'follicular papules',
        'oral mucosal involvement', 'knee and elbow involvement',
        'scalp involvement', 'family history', 'melanin incontinence',
        'eosinophils in the infiltrate', 'PNL infiltrate',
        'fibrosis of the papillary dermis', 'exocytosis', 'acanthosis',
        'hyperkeratosis', 'parakeratosis', 'clubbing of the rete ridges',
        'elongation of the rete ridges',
        'thinning of the suprapapillary epidermis', 'spongiform pustule',
        'munro microabcess', 'focal hypergranulosis',
        'disappearance of the granular layer',
        'vacuolisation and damage of basal layer', 'spongiosis',
        'saw-tooth appearance of retes', 'follicular horn plug',
        'perifollicular parakeratosis',
        'inflammatory monoluclear inflitrate', 'band-like infiltrate', 'age',
        'label'
    ]

    # NOTE(review): read_csv uses the first line of the file as a header,
    # which is then overwritten below. If 'dermatologyWOnan.data' has no
    # header row, its first sample is lost -- confirm against the file.
    dermData = pd.read_csv('dermatologyWOnan.data')
    dermData.columns = column_names
    dermData.to_csv('dermData.csv')

    target = dermData['label']
    # select all but the last column as data
    data = dermData.iloc[:, :-1]
    # Same names as the data columns: everything except the trailing 'label'.
    data_feature_names = column_names[:-1]

    dtree = DecisionTreeClassifier()
    dtree.fit(data, target)

    # Plot decision tree
    dot_data = export_graphviz(dtree, out_file=None,
                               feature_names=data_feature_names,
                               filled=True, rounded=True, precision=2,
                               special_characters=True)
    graph = pdot.graph_from_dot_data(dot_data)

    # The palette was only present as a commented-out line while the loop
    # below still indexes it; defining it here avoids the NameError.
    colors = ('turquoise', 'orange')

    # Group child node ids by parent, then colour the lower-numbered child
    # with colors[0] and the higher-numbered one with colors[1].
    edges = collections.defaultdict(list)
    for edge in graph.get_edge_list():
        edges[edge.get_source()].append(int(edge.get_destination()))
    for source in edges:
        edges[source].sort()
        for i in range(2):
            dest = graph.get_node(str(edges[source][i]))[0]
            dest.set_fillcolor(colors[i])

    graph.write_png('tree_test.png')
# Hand-picked feature vectors together with the label each is known to carry.
_SAMPLE_INSTANCES = (
    ([1, 1, 1, 1], "B"),
    ([1, 3, 2, 3], "R"),
    ([5, 4, 5, 1], "L"),
    ([1, 4, 1, 4], "B"),
)


def _export_tree_png(clf, png_path):
    """Render ``clf`` through Graphviz, write it to ``png_path`` and return an IPython image."""
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(png_path)
    return Image(graph.create_png())


def _report_sample_predictions(clf):
    """Print the prediction of ``clf`` for every sample instance beside its known label."""
    for number, (features, actual) in enumerate(_SAMPLE_INSTANCES, start=1):
        print("Test instance %d: " % number, features)
        print("Predicted label: ", clf.predict([features]))
        print("Actual label: " + actual)
        print('\n')


def main():
    """Train gini and entropy decision trees, draw both and report their results.

    For each classifier the tree is written to a PNG ('gini_graph.png' /
    'entropy_graph.png'), the test split is predicted, four hand-picked
    instances are predicted and printed next to their known labels, and
    ``cal_accuracy`` is called on the test predictions.
    """
    # Building Phase
    data = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = tarin_using_entropy(X_train, X_test, y_train)

    # Visualizing tree using Gini Index
    _export_tree_png(clf_gini, 'gini_graph.png')
    print('\n')

    # Operational Phase
    print("Results Using Gini Index:")
    print("\n")
    # Prediction using gini
    y_pred_gini = prediction(X_test, clf_gini)
    # Test instance predictions
    print("\n")
    _report_sample_predictions(clf_gini)
    cal_accuracy(y_test, y_pred_gini)

    # Visualizing tree using Entropy
    _export_tree_png(clf_entropy, 'entropy_graph.png')
    print("Results Using Entropy:")
    print('\n')
    # Prediction using entropy
    y_pred_entropy = prediction(X_test, clf_entropy)
    print('\n')
    # The sample instances are predicted with the entropy tree here; this
    # section used to query clf_gini again and so repeated the gini results.
    _report_sample_predictions(clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)