def test_graph_with_shapefiles(self):

        shapefile_dir = os.path.join(TEST_DIR, 'from-past-to-future')
        dot_file = os.path.join(shapefile_dir, 'from-past-to-future.dot')

        pngs = [
            os.path.join(shapefile_dir, fname)
            for fname in os.listdir(shapefile_dir)
            if fname.endswith('.png')
        ]

        with open(dot_file, 'rt') as f:
            graph_data = f.read()

        g = pydotplus.graph_from_dot_data(graph_data)

        g.set_shape_files(pngs)

        jpe_data = g.create(format='jpe')

        hexdigest = sha256(jpe_data).hexdigest()

        hexdigest_original = self._render_with_graphviz(dot_file)

        self.assertEqual(hexdigest, hexdigest_original)
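A minimal sketch of the same shapefile workflow outside the test harness, using only pydotplus calls that appear above (graph_from_dot_data, set_shape_files, create); the DOT source and image path are hypothetical:

import pydotplus

dot_source = 'digraph G { a [shapefile="node_a.png"]; a -> b; }'
graph = pydotplus.graph_from_dot_data(dot_source)
graph.set_shape_files(["node_a.png"])      # hypothetical PNG sitting next to the script
image_bytes = graph.create(format="png")   # same call as create(format='jpe') in the test above
with open("shapefile_demo.png", "wb") as out:
    out.write(image_bytes)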
Example #2
def run_DT_model_2(df, criteria_col):
    # run the tree for various 0/1 labels (e.g. high value or not)
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    from sklearn.externals.six import StringIO
    from IPython.display import Image  
    import pydotplus
    print ('criteria_col  =  ', criteria_col)
    tree_col = [criteria_col,'Frequency', 'LTV', 'period_no_use','AverageTimeToOrder',
          'late_by_collection', 'late_by_delivery', 'tickets', 'recleaned_orders',
         'cancalled_orders', 'voucher_used']
    df_train_ = df 
    #df_train_tree = df_train_[tree_col]
    tree_data = df_train_[tree_col]
    tree_data = tree_data.dropna()
    tree_train, tree_test = train_test_split(tree_data,
                                           test_size=0.2, 
                                           random_state=200,
                                           stratify=tree_data[criteria_col])
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(tree_train.iloc[:,1:], tree_train[criteria_col])
    print (clf.score(tree_test.iloc[:,1:], tree_test[criteria_col]))
    # confusion matrix 
    print (confusion_matrix(tree_test[criteria_col], clf.predict(tree_test.iloc[:,1:])))
    # visualize the tree 
    dot_data = StringIO()
    tree.export_graphviz(clf,
                       out_file=dot_data,
                       feature_names=tree_col[1:],
                       filled=True, 
                       rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png()), tree_train, tree_test
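A hypothetical call sketch for run_DT_model_2, assuming df is a pandas DataFrame that contains a 0/1 column named 'high_value' plus the behavioural columns listed in tree_col:

img, tree_train, tree_test = run_DT_model_2(df, 'high_value')
img  # in a notebook, this displays the rendered decision-tree PNG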
    def test_attribute_with_implicit_value(self):

        d = 'digraph {\na -> b[label="hi", decorate];\n}'
        g = pydotplus.graph_from_dot_data(d)
        attrs = g.get_edges()[0].get_attributes()

        self.assertEqual('decorate' in attrs, True)
    def train_network(self):
        """ Pure virtual method for training the network
        """
        db_query = self._database_session.query(PregameHitterGameEntry)
        mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(db_query, 0.8)
        X_train, Y_train = self.get_stochastic_batch(mlb_training_data, self.SIZE_TRAINING_BATCH)
        self._decision_tree.fit(X_train, Y_train)
        dot_data = StringIO()
        tree.export_graphviz(self._decision_tree, out_file=dot_data,
                             feature_names=PregameHitterGameEntry.get_input_vector_labels())
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf("hitter_tree.pdf")
        x_test_actual = list()
        y_test_actual = list()
        for data in mlb_evaluation_data:
            try:
                postgame_entry = self._database_session.query(PostgameHitterGameEntry).filter(PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                                                                                              PostgameHitterGameEntry.game_date == data.game_date).one()
                y_test_actual.append([postgame_entry.actual_draftkings_points])
                x_test_actual.append(data.to_input_vector())
            except NoResultFound:
                print "Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id
                continue

        self._database_session.close()
Example #5
File: DecTree.py Project: M-Anwar/ARGEL
def create_tree(X, Y):
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(X, Y)

    from IPython.display import Image
    import pydotplus

    dot_data = StringIO()
    # tree.export_graphviz(clf, out_file=dot_data)
    # feature_names = ['Gender', 'Age']
    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]
    target_names = []

    for i in range(1, len(Y) + 1):
        target_names.append("Ad #" + str(i))

    tree.export_graphviz(
        clf,
        out_file=dot_data,
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("Tree.pdf")

    return clf
Example #6
    def line_dot(self, code):
        """
        %dot CODE - render code as Graphviz image

        This line magic will render the Graphviz CODE and display
        it as an image.

        Example:
            %dot graph A { a->b };

        """
        try:
            if os.name == 'nt':
                import pydotplus as pydot
            else:
                import pydotplus as pydot
                #import pydot
        except:
            raise Exception("You need to install pydot")
        graph = pydot.graph_from_dot_data(str(code))
        svg = graph.create_svg()
        if hasattr(svg, "decode"):
            svg = svg.decode("utf-8")
        html = HTML(svg)
        self.kernel.Display(html)
Example #7
    def cell_dot(self):
        """
        %%dot - render contents of cell as Graphviz image

        This cell magic will send the cell to the browser as
        HTML.

        Example:
            %%dot

            graph A { a->b };
        """
        try:
            if os.name == 'nt':
                import pydotplus as pydot
            else:
                import pydot
        except:
            raise Exception("You need to install pydot")
        graph = pydot.graph_from_dot_data(str(self.code))
        svg = graph.create_svg()
        if hasattr(svg, "decode"):
            svg = svg.decode("utf-8")
        html = HTML(svg)
        self.kernel.Display(html)
        self.evaluate = False
Example #8
    def DecisionTree(self, dados):

        database = np.array(zip(dados[:, 4], dados[:, 5], dados[:, 11], dados[:, 12], dados[:, 19], dados[:, 20],
                               dados[:, 28], dados[:, 29], dados[:, 8], dados[:, 16], dados[:, 24], dados[:, 32]))
        class_names = ('implementacao_estimado','implementacao_real','correcao_est','correcao_real','teste_est',
                 'teste_real','elaboracao_estimado', 'elaboracao_real','perfil_imple','perfil_cor','perfil_teste',
                 'perfil_elab')
        kind = []
        for dado in dados[:, 33]:
            if(float(dado) <= 1.5):
                kind.append('class1')
            elif (float(dado) <= 2.0):
                kind.append('class2')
            elif (float(dado) <= 2.5):
                kind.append('class3')
            elif (float(dado) <= 3.0):
                kind.append('class4')
            elif (float(dado) <= 3.5):
                kind.append('class5')
            elif (float(dado) <= 4.0):
                kind.append('class6')
            elif (float(dado) <= 4.5):
                kind.append('class7')
            else:
                kind.append('class8')

        target = np.array(kind)

        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(database, target)

        with open("projetos.dot", 'w') as f:
            f = tree.export_graphviz(clf, out_file=f)

        os.unlink('projetos.dot')

        dot_data = tree.export_graphviz(clf, out_file=None)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf('projetos.pdf')

        dot_data = tree.export_graphviz(clf, out_file=None,
                                             feature_names=class_names,
                                             class_names=target,
                                             filled=True, rounded=True,
                                             special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        Image(graph.create_png())
    def test_multiple_graphs(self):

        graph_data = 'graph A { a->b };\ngraph B {c->d}'

        graphs = pydotplus.graph_from_dot_data(graph_data)

        self.assertEqual(len(graphs), 2)

        self.assertEqual([g.get_name() for g in graphs], ['A', 'B'])
Example #10
 def export_graph(self, clf, labels, file_name):
     dot_data = StringIO()
     tree.export_graphviz(clf,
                          out_file=dot_data,
                          feature_names=self.__features_name,
                          class_names=labels,
                          filled=True, rounded=True,
                          impurity=False)
     graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
     graph.write_pdf('{}.pdf'.format(file_name))
Example #11
def local_causality_graph(model, kind="full", goal=None):
    assert kind in ["verbose,","trimmed","saturated","worth","full"]
    if kind != "full" and goal is None:
        raise ValueError("goal cannot be None with %s LCG" % kind)
    args = ["-t", kind, "-o", "-"]
    if goal:
        args.append(goal)
    cp = _run_tool("pint-lcg", *args, input_model=model)
    g = pydotplus.graph_from_dot_data(cp.stdout.decode())
    return nx.nx_pydot.from_pydot(g)
Example #12
def graph_decision_tree(model, class_names):
    
    model_dot = StringIO() 
    tree.export_graphviz(model, out_file=model_dot,
                         feature_names=features,
                         class_names=class_names,
                         filled=True, rounded=True,  
                         special_characters=True) 
    graph = pydotplus.graph_from_dot_data(model_dot.getvalue()) 
    graph.write_pdf("model"+class_names[1]+".pdf")
Example #13
    def show_tree(self):
        '''return a png of the tree'''
        assert self.clf
        try:
            import pydotplus as pydot
        except ImportError:
            import pydot # dirty hack for read the docs

        dot_data = StringIO() 
        tree.export_graphviz(self.clf, out_file=dot_data, 
                             feature_names=self.feature_names) 
        graph = pydot.graph_from_dot_data(dot_data.getvalue().encode('ascii'))  # @UndefinedVariable
        img = graph.create_png()
        return img
def visualize_tree(clf, feature_names, class_names, output_file,
                   method='pdf'):
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=class_names,
                         filled=True, rounded=True,
                         special_characters=True,
                         impurity=False)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    if method == 'pdf':
        graph.write_pdf(output_file + ".pdf")
    elif method == 'inline':
        Image(graph.create_png())

    return graph
Example #15
    def train(self, training_set, training_target, fea_index):

        clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
        clf = clf.fit(training_set, training_target)

        class_names = np.unique([str(i) for i in training_target])
        feature_names = [attr_list[i] for i in fea_index]

        dot_data = tree.export_graphviz(clf, out_file=None,
                                        feature_names=feature_names,
                                        class_names=class_names,
                                        filled=True, rounded=True,
                                        special_characters=True)

        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("output/tree-vis.pdf")
        joblib.dump(clf, 'output/CART.pkl')
    def render_output_pydot(self, dotdata, **kwargs):
        """Renders the image using pydot"""
        if not HAS_PYDOT:
            raise CommandError("You need to install pydot python module")

        graph = pydot.graph_from_dot_data(dotdata)
        if not graph:
            raise CommandError("pydot returned an error")
        output_file = kwargs['outputfile']
        formats = ['bmp', 'canon', 'cmap', 'cmapx', 'cmapx_np', 'dot', 'dia', 'emf',
                   'em', 'fplus', 'eps', 'fig', 'gd', 'gd2', 'gif', 'gv', 'imap',
                   'imap_np', 'ismap', 'jpe', 'jpeg', 'jpg', 'metafile', 'pdf',
                   'pic', 'plain', 'plain-ext', 'png', 'pov', 'ps', 'ps2', 'svg',
                   'svgz', 'tif', 'tiff', 'tk', 'vml', 'vmlz', 'vrml', 'wbmp', 'xdot']
        ext = output_file[output_file.rfind('.') + 1:]
        format = ext if ext in formats else 'raw'
        graph.write(output_file, format=format)
def train_tree_classifer(features, labels, model_output_path):
    """
    train_tree_classifer will train a DecisionTree and write it out to a pdf file

    features: 2D array of each input feature for each sample
    labels: array of string labels classifying each sample
    model_output_path: path for storing the trained tree model
    """
    # save 20% of data for performance evaluation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.2)

    param = [
        {
            "max_depth": [None, 10, 100, 1000, 10000]
        }
    ]

    dtree = tree.DecisionTreeClassifier(random_state=0)

    # 10-fold cross-validation; n_jobs workers so each fold and parameter set can be trained in parallel
    clf = grid_search.GridSearchCV(dtree, param,
            cv=10, n_jobs=20, verbose=3)

    clf.fit(X_train, y_train)

    if os.path.exists(model_output_path):
        joblib.dump(clf.best_estimator_, model_output_path)
    else:
        print("Cannot save trained tree model to {0}.".format(model_output_path))

    dot_data = tree.export_graphviz(clf.best_estimator_, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('best_tree.pdf')

    print("\nBest parameters set:")
    print(clf.best_params_)

    y_predict=clf.predict(X_test)

    labels=sorted(list(set(labels)))
    print("\nConfusion matrix:")
    print("Labels: {0}\n".format(",".join(labels)))
    print(confusion_matrix(y_test, y_predict, labels=labels))

    print("\nClassification report:")
    print(classification_report(y_test, y_predict))
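A usage sketch under stated assumptions: synthetic features and labels stand in for real data, the deprecated sklearn.cross_validation / grid_search modules imported above are still importable, and the output path is hypothetical:

import numpy as np

features = np.random.rand(200, 5)                     # 200 samples, 5 numeric features
labels = np.random.choice(['cat', 'dog'], size=200)   # string labels, as the docstring describes
train_tree_classifer(features, labels, 'models/tree.pkl')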
Example #18
def show_pdf(clf):
    '''
    Visualize the output.
    Write the decision tree structure to a file: http://sklearn.lzjqsdd.com/modules/tree.html

    Error on Mac: pydotplus.graphviz.InvocationException: GraphViz's executables not found
    Fix: sudo brew install graphviz
    Reference for writing the file: http://www.jianshu.com/p/59b510bafb4d
    '''
    # with open("testResult/tree.dot", 'w') as f:
    #     from sklearn.externals.six import StringIO
    #     tree.export_graphviz(clf, out_file=f)

    import pydotplus
    from sklearn.externals.six import StringIO
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("output/3.DecisionTree/tree.pdf")
Example #19
    def export_tree_pdf(self,filename=None):
        # returns the tree as dot data
        # if filename is specified, the function will also save a pdf file in the
        # current directory containing a visual representation of the tree
        import pydotplus
        from collections import deque
        
        dot_data = '''digraph Tree {
node [shape=box] ;'''
        
        queue = deque()
        
        r = self.__root
        queue.append(r)
        count = 0
        if r.index == -1:
            r.index = count
        
        dot_data = dot_data + "\n{} [label=\"Feature to split upon : X[{}]\\nOutput at this node : {}\" ];".format(count,r.data,r.output) 
        
        # Doing LEVEL ORDER traversal in the tree (using a queue)
        while len(queue) != 0 :
            node = queue.popleft()
            for i in node.children:
                count+=1
                if(node.children[i].index==-1):
                    node.children[i].index = count
                
                # Creating child node
                dot_data = dot_data + "\n{} [label=\"Feature to split upon : X[{}]\\nOutput at this node : {}\" ];".format(node.children[i].index,node.children[i].data,node.children[i].output) 
                # Connecting parent node with child
                dot_data = dot_data + "\n{} -> {} [ headlabel=\"Feature value = {}\"]; ".format(node.index,node.children[i].index,i)
                # Adding child node to queue
                queue.append(node.children[i])
        
        dot_data = dot_data + "\n}"

        if filename is not None:
            graph = pydotplus.graph_from_dot_data(dot_data)
            graph.write_pdf(filename)    
        
        return dot_data
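A hypothetical usage sketch, assuming my_tree is a fitted instance of the custom decision-tree class this method belongs to:

import pydotplus

dot_src = my_tree.export_tree_pdf(filename='custom_tree.pdf')  # writes the PDF and returns the DOT text
graph = pydotplus.graph_from_dot_data(dot_src)                 # the returned DOT can be re-rendered
graph.write_png('custom_tree.png')                             # e.g. as a PNG instead of a PDF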
Example #20
def read_dot(path):
    """Return a NetworkX MultiGraph or MultiDiGraph from a dot file on path.

    Parameters
    ----------
    path : filename or file handle

    Returns
    -------
    G : NetworkX multigraph
        A MultiGraph or MultiDiGraph.

    Notes
    -----
    Use G = nx.Graph(read_dot(path)) to return a Graph instead of a MultiGraph.
    """
    import pydotplus
    data = path.read()
    P = pydotplus.graph_from_dot_data(data)
    return from_pydot(P)
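A small usage sketch; it assumes the from_pydot helper imported elsewhere in the module is networkx's converter, and it passes an in-memory handle because the function only calls path.read():

import io

dot_text = 'digraph G { a -> b; b -> c; }'
G = read_dot(io.StringIO(dot_text))   # MultiDiGraph with edges a->b and b->c
print(sorted(G.edges()))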
Example #21
def draw_DecTree(DecTree, feat_names=None, cla_names=None):
	# from sklearn.externals.six import StringIO  
	# import pydotplus 
	# dot_data = StringIO() 
	# tree.export_graphviz(DecTre, out_file=dot_data) 
	# graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
	# graph.write_pdf("iris.pdf") 

	dot_data = StringIO()
	# tree.export_graphviz(DecTree, out_file=dot_data)
	tree.export_graphviz(DecTree, out_file=dot_data,  
	                     feature_names=feat_names,  
	                     class_names=cla_names, 
	                     node_ids=True, 
	                     filled=True, rounded=True,  
	                     special_characters=True)  
	graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
	graph.write_pdf('dot_data.pdf') 

	# return Image(graph.create_png())
Example #22
def pydot_layout(G,prog='neato',root=None, **kwds):
    """Create node positions using Pydot and Graphviz.

    Returns a dictionary of positions keyed by node.

    Examples
    --------
    >>> G = nx.complete_graph(4)
    >>> pos = nx.nx_pydot.pydot_layout(G)
    >>> pos = nx.nx_pydot.pydot_layout(G, prog='dot')
    """
    import pydotplus
    P=to_pydot(G)
    if root is not None :
        P.set("root",make_str(root))

    D=P.create_dot(prog=prog)

    if D=="":  # no data returned
        print("Graphviz layout with %s failed"%(prog))
        print()
        print("To debug what happened try:")
        print("P=pydot_from_networkx(G)")
        print("P.write_dot(\"file.dot\")")
        print("And then run %s on file.dot"%(prog))
        return

    Q=pydotplus.graph_from_dot_data(D)

    node_pos={}
    for n in G.nodes():
        pydot_node = pydotplus.Node(make_str(n)).get_name()
        node=Q.get_node(pydot_node)

        if isinstance(node,list):
            node=node[0]
        pos=node.get_pos()[1:-1] # strip leading and trailing double quotes
        if pos != None:
            xx,yy=pos.split(",")
            node_pos[n]=(float(xx),float(yy))
    return node_pos
Example #23
 def _create_graph(self, **kwargs):
     clf = self._clf
     dot_data = StringIO()
     feature_names, class_names = self._get_names(**kwargs)
 
     if StrictVersion(sklearn.__version__) >= StrictVersion('0.17'):
         tree.export_graphviz(clf,
                              out_file=dot_data,  
                              feature_names=feature_names,
                              class_names=class_names,  
                              filled=kwargs.get("filled", True),
                              rounded=kwargs.get("rounded", True),  
                              special_characters=kwargs.get("special_characters", True),
                              **kwargs)
     else:
         tree.export_graphviz(clf,
                              out_file=dot_data,  
                              feature_names=feature_names,
                              **kwargs)
 
     return pydot.graph_from_dot_data(dot_data.getvalue())
Example #24
    def render_output_pydot(self, dotdata, **kwargs):
        """Renders model data as image using pydot"""
        if not HAS_PYDOT:
            raise CommandError("You need to install pydot python module")

        graph = pydot.graph_from_dot_data(dotdata)
        if not graph:
            raise CommandError("pydot returned an error")
        if isinstance(graph, (list, tuple)):
            if len(graph) > 1:
                sys.stderr.write("Found more then one graph, rendering only the first one.\n")
            graph = graph[0]

        output_file = kwargs['outputfile']
        formats = ['bmp', 'canon', 'cmap', 'cmapx', 'cmapx_np', 'dot', 'dia', 'emf',
                   'em', 'fplus', 'eps', 'fig', 'gd', 'gd2', 'gif', 'gv', 'imap',
                   'imap_np', 'ismap', 'jpe', 'jpeg', 'jpg', 'metafile', 'pdf',
                   'pic', 'plain', 'plain-ext', 'png', 'pov', 'ps', 'ps2', 'svg',
                   'svgz', 'tif', 'tiff', 'tk', 'vml', 'vmlz', 'vrml', 'wbmp', 'xdot']
        ext = output_file[output_file.rfind('.') + 1:]
        format = ext if ext in formats else 'raw'
        graph.write(output_file, format=format)
Example #25
def decision_trees(clf, drawing_param, nFeatures=None, show_trees=True, show_importance=True, showfig=True):

    clfs = clf.estimators_ if hasattr(clf, 'estimators_') else [clf]

    if show_trees:
        for i, estimator in enumerate(clfs):
            # create graph
            dot_data = StringIO()
            tree.export_graphviz(estimator, out_file=dot_data, rounded=True, filled=True, **drawing_param)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())

            # create image from graph
            png_str = graph.create_png(prog='dot')
            sio = io.BytesIO()
            sio.write(png_str)
            sio.seek(0)
            img = mpimg.imread(sio)

            # plot the image
            fig = plt.figure('Decision tree %s' % i)
            plt.imshow(img, aspect='equal')
            fig.tight_layout()
            if showfig:
                plt.show()

    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    print("Feature ranking:")
    for i, f in enumerate(indices):
        print("%d. feature %d = %f [%s]" % (i + 1, f, importances[f], drawing_param['feature_names'][f]))

    if show_importance:
        axis = hbar([tree_.feature_importances_ for tree_ in clfs], figtitle='Feature importance', nBars=nFeatures,
                          yticknames=drawing_param['feature_names'], xlabel='Gini importance', sort='descend')[0]
        axis.figure.tight_layout()
        if showfig:
            plt.show()
Example #26
 def run_model(self, max_depth=3, criterion='entropy', do_plot=True):
     
     # Supported criteria for tree are gini for the Gini impurity and entropy for the information gain.
     tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=0)
     tree.fit(self.x_train, self.y_train)
     
     # export a graphical representation of the tree
     dot_data = io.StringIO()
     export_graphviz(tree,
                     out_file=dot_data,
                     feature_names=self.x_col_names)
     graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
     fn = './output/' + self.data_label + '_graph.pdf'
     graph.write_pdf(fn)
     
     # check model accuracy
     '''
     y_train_pred = tree.predict(self.x_train)
     train_acc = accuracy_score(self.y_train, y_train_pred)
     print('Training accuracy score is', train_acc)
     
     y_test_pred = tree.predict(self.x_test)
     test_acc = accuracy_score(self.y_test, y_test_pred)
     print('Test accuracy score is', test_acc)
     '''
     # no difference from above
     train_score = tree.score(self.x_train, self.y_train)
     print('Training score is', train_score)
     
     test_score = tree.score(self.x_test, self.y_test)
     print('Test score is', test_score)
     
     if do_plot:
         self.__plot_learning_curve(tree)
         self.__plot_decision_boundaries(tree)
     
     return train_score, test_score
Example #27
def tree_classifier():
    """Create an HPS classifier using the alpha-beta."""
    fn_fit = os.path.join(DF_Dir, 'fit_constant_step_size_01_bounded.pkl')
    fit = pd.read_pickle(fn_fit)
    print('Using data from', fn_fit)
    X = fit[['0_alpha', '0_beta', '1_alpha', '1_beta']].values
    y = fit['HPS_level'].values

    clf = tree.DecisionTreeClassifier(max_depth=4)
    clf.fit(X, y)
    from sklearn.externals.six import StringIO
    import pydotplus as pydot
    dot_data = StringIO()
    feature_names = ['a0', 'b0', 'a1', 'b1']
    target_names = ['low', 'medium', 'high']
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=target_names,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    fn = os.path.join(Fig_Dir, 'hpf_tree_classifier.pdf')
    graph.write_pdf(fn)
    print('Tree saved as', fn)
def test():
	#training data
	test_idx = [0, 50, 100]
	train_target = np.delete(iris.target, test_idx)
	train_data = np.delete(iris.data, test_idx, axis=0)
	
	# testing data
	test_target = iris.target[test_idx]
	test_data = iris.data[test_idx]
	
	clf = tree.DecisionTreeClassifier()
	clf = clf.fit(train_data, train_target)
	
	print(test_target)
	print(clf.predict(test_data))

	dot_data = StringIO() 
	tree.export_graphviz(clf, out_file=dot_data,  
		feature_names=iris.feature_names,  
		class_names=iris.target_names,  
		filled=True, rounded=True,  
		special_characters=True)  
	graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
	graph.write_pdf("iris.pdf")
Example #29
    def show_tree(self, mplfig=True, format='png'):
        '''return a png of the tree

        Parameters
        ----------
        mplfig : bool, optional
                 if true (default) returns a matplotlib figure with the tree,
                 otherwise, it returns the output as bytes
        format : {'png', 'svg'}, default 'png'
                 Gives a format of the output.

        '''
        assert self.clf
        try:
            import pydotplus as pydot
        except ImportError:
            import pydot  # dirty hack for read the docs

        dot_data = StringIO()
        tree.export_graphviz(self.clf, out_file=dot_data,
                             feature_names=self.feature_names)
        dot_data = dot_data.getvalue()  # .encode('ascii') # @UndefinedVariable
        graph = pydot.graph_from_dot_data(dot_data)
        if format == 'png':
            img = graph.create_png()
            if mplfig:
                fig, ax = plt.subplots()
                ax.imshow(mpimg.imread(io.BytesIO(img)))
                ax.axis('off')
                return fig
        elif format == 'svg':
            img = graph.create_svg()
        else:
            raise TypeError('''format must be in {'png', 'svg'}''')

        return img
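A hypothetical usage sketch, assuming cart is an instance of the surrounding class with a fitted clf and feature_names already set:

fig = cart.show_tree(mplfig=True)                       # matplotlib Figure wrapping the rendered PNG
fig.savefig('cart_tree.png', dpi=200)

svg_bytes = cart.show_tree(mplfig=False, format='svg')  # raw SVG output instead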
Example #30
    def test_unicode_ids(self):

        node1 = '"aánñoöüé€"'
        node2 = '"îôø®çßΩ"'

        g = pydotplus.Dot()
        g.set_charset('latin1')
        g.add_node(pydotplus.Node(node1))
        g.add_node(pydotplus.Node(node2))
        g.add_edge(pydotplus.Edge(node1, node2))

        self.assertEqual(g.get_node(node1)[0].get_name(), node1)
        self.assertEqual(g.get_node(node2)[0].get_name(), node2)

        self.assertEqual(g.get_edges()[0].get_source(), node1)
        self.assertEqual(g.get_edges()[0].get_destination(), node2)

        g2 = pydotplus.graph_from_dot_data(g.to_string())

        self.assertEqual(g2.get_node(node1)[0].get_name(), node1)
        self.assertEqual(g2.get_node(node2)[0].get_name(), node2)

        self.assertEqual(g2.get_edges()[0].get_source(), node1)
        self.assertEqual(g2.get_edges()[0].get_destination(), node2)
Example #31
pred = clf.predict(iris_test.iloc[:, 0:4])
print(accuracy_score(iris_test['type'].values.codes, pred))

#%%
'''
CART (Classification and Regression Tree) training
'''
clf = tree.DecisionTreeClassifier()
clf.fit(iris_train.iloc[:, 0:4], iris_train['type'].values.codes)

pred = clf.predict(iris_train.iloc[:, 0:4])
print(accuracy_score(iris_train['type'].values.codes, pred))

pred = clf.predict(iris_test.iloc[:, 0:4])
print(accuracy_score(iris_test['type'].values.codes, pred))

#%%
'''
Tree visualization

!conda install -c conda-forge -y pydotplus
!conda install -y graphviz
'''
import pydotplus
from IPython.display import Image

tree_dot = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris.columns.values,
                                class_names=iris['type'].values.categories.values)
tree_graph = pydotplus.graph_from_dot_data(tree_dot)
Image(tree_graph.create_png())
def run_model(df, vectorizer, classifier):
    # load data
    x = df['Cleaned'].values
    y = df['Class'].values

    # split dataset into training and test sets, with 80:20 split
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1000,
                                                        stratify=y)

    if vectorizer == "count":
        vectorizer = CountVectorizer()

    if vectorizer == "tfidf":
        vectorizer = TfidfVectorizer()

    vectorizer.fit(x_train)

    X_train = vectorizer.transform(x_train)
    X_test = vectorizer.transform(x_test)

    if classifier == "naive_bayes":
        classifier = MultinomialNB()

    if classifier == "decision_tree":
        classifier = DecisionTreeClassifier(
        )  # manual search tried, but default hyperparameters were best

    if classifier == "random_forest":
        clf = RandomForestClassifier()  # default n_estimators=100

        # define random search space based on decision tree depth
        hyp = {
            "n_estimators": [50, 100, 150,
                             200],  # number of trees in the forest
            "max_depth": [40, 50, None],  # max depth of tree
            "max_features": [10, 20, 'sqrt', None],
            "min_samples_split": randint(1, 11),
            "bootstrap": [True, False],  # to use bagging or not
            "criterion": ["gini", "entropy"]
        }  # gini impurity or information gain

        # random search over 5-fold cross validation (stratified k-fold by default)
        random_search = RandomizedSearchCV(clf,
                                           hyp,
                                           random_state=1,
                                           n_iter=100,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)

        n_estimators = search_result.best_estimator_.get_params(
        )['n_estimators']
        max_depth = search_result.best_estimator_.get_params()['max_depth']
        max_features = search_result.best_estimator_.get_params(
        )['max_features']
        min_samples_split = search_result.best_estimator_.get_params(
        )['min_samples_split']
        bootstrap = search_result.best_estimator_.get_params()['bootstrap']
        criterion = search_result.best_estimator_.get_params()['criterion']

        print("Random search results: ")
        print("Best n_estimators: ", n_estimators)
        print("Best max_depth: ", max_depth)
        print("Best max_features:", max_features)
        print("Best max_features:", min_samples_split)
        print("Best bootstrap:", bootstrap)
        print("Best criterion:", criterion)

        # set the classifier to the one with best hyperparameters from random search
        classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            min_samples_split=min_samples_split,
            bootstrap=bootstrap,
            criterion=criterion)

    if classifier == "logistic_regression":
        # by a manual search the lbfgs solver showed best results
        # number of max iterations is increased to allow lbfgs solver to converge
        # compare loss functions over 5-fold cross validation
        ovr_clf = LogisticRegression(multi_class='ovr',
                                     solver='lbfgs',
                                     max_iter=1000)
        ovr_score = cross_val_score(ovr_clf, X_train, y_train, cv=5).mean()

        mce_clf = LogisticRegression(multi_class='multinomial',
                                     solver='lbfgs',
                                     max_iter=1000)
        mce_score = cross_val_score(mce_clf, X_train, y_train, cv=5).mean()

        # choose the better performing hyperparameters
        if (ovr_score > mce_score):
            classifier = LogisticRegression(multi_class='ovr',
                                            solver='lbfgs',
                                            max_iter=1000)
        else:
            classifier = LogisticRegression(multi_class='multinomial',
                                            solver='lbfgs',
                                            max_iter=1000)

    if classifier == "linear_svm":
        clf = svm.LinearSVC(max_iter=1000)

        hyp = {
            "loss": ['hinge', 'squared_hinge'],
            "multi_class": ['ovr', 'crammer_singer']
        }

        random_search = RandomizedSearchCV(clf,
                                           hyp,
                                           random_state=1,
                                           n_iter=20,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)

        loss = search_result.best_estimator_.get_params()['loss']
        multi_class = search_result.best_estimator_.get_params()['multi_class']

        print("Best loss: ", loss)
        print("Best multi_class:", multi_class)

        classifier = svm.LinearSVC(loss=loss,
                                   multi_class=multi_class,
                                   max_iter=1000)

    if classifier == "nonlinear_svm":
        clf = svm.SVC()
        hyp = {
            "gamma": ['auto', 'scale'],
            "kernel": ['poly', 'rbf', 'sigmoid']
        }

        random_search = RandomizedSearchCV(clf,
                                           hyp,
                                           random_state=1,
                                           n_iter=20,
                                           cv=5,
                                           verbose=1,
                                           n_jobs=-1)
        search_result = random_search.fit(X_train, y_train)

        gamma = search_result.best_estimator_.get_params()['gamma']
        kernel = search_result.best_estimator_.get_params()['kernel']

        print("Best gamma: ", gamma)
        print("Best kernel:", kernel)

        classifier = svm.SVC(gamma=gamma, kernel=kernel)

    if classifier == "knn":
        classifier = KNeighborsClassifier(
            n_neighbors=5)  # change k-value as needed

    if classifier == "mlp":
        clf = MLPClassifier()
        hyp = {
            "hidden_layer_sizes": [(64, ), (64, 64), (64, 64, 64), (128, ),
                                   (128, 128), (128, 128, 128), (256, 256, 256),
                                   (512, 512, 512)]
        }

        grid_search = GridSearchCV(clf, hyp, cv=5)
        search_result = grid_search.fit(X_train, y_train)

        hidden_layer_sizes = search_result.best_estimator_.get_params(
        )['hidden_layer_sizes']

        print("Best hidden layer size:", hidden_layer_sizes)

        classifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                                   verbose=True)  # uses reLU, adam by default

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # print metrics
    print("\nClassification report summary:")
    print(
        classification_report(y_test,
                              y_pred,
                              labels=[i + 1 for i in range(20)],
                              digits=3))

    print("Accuracy:", classifier.score(X_test, y_test))
    print("Macro-F1:", f1_score(y_test, y_pred, average='macro'))

    # if decision tree or random forest, generates plot of tree
    if classifier == "decision_tree" or classifier == "random_forest":

        # print 5 most important tokens:
        swapped_vocab = dict([
            (value, key) for key, value in vectorizer.vocabulary_.items()
        ])
        print("5 most important tokens: ")
        for i in np.argsort(classifier.feature_importances_)[-5:][::-1]:
            print(swapped_vocab[i])

        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus

        dot_data = StringIO()

        if classifier == "decision_tree":
            export_graphviz(classifier,
                            out_file=dot_data,
                            filled=True,
                            rounded=True,
                            special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

            graph.write_pdf("decision_tree.pdf")

        else:
            # pick a random tree out of the fitted forest
            export_graphviz(classifier.estimators_[random.randrange(len(classifier.estimators_))],
                            out_file=dot_data,
                            filled=True,
                            rounded=True,
                            special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

            graph.write_pdf("random_forest.pdf")

    # if logistic regression, plot most important terms
    if classifier == "logistic_regression":
        plot_lr_coef(classifier, vectorizer)

    # get confusion matrix for plot
    cm = confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

    return vectorizer, classifier, cm
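A call sketch under assumptions: df is a pandas DataFrame with the 'Cleaned' (text) and 'Class' (label) columns the function reads, and the vectorizers, classifiers and train_test_split imported elsewhere in the module are in scope:

vectorizer, classifier, cm = run_model(df, vectorizer="tfidf", classifier="naive_bayes")
print(cm)  # confusion matrix returned for later plotting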
Example #33
    def show_tree(self):
        '''return a png of the tree'''
        assert self.clf
        try:
            import pydotplus as pydot
        except ImportError:
            import pydot # dirty hack for read the docs

        dot_data = StringIO() 
        tree.export_graphviz(self.clf, out_file=dot_data, 
                             feature_names=self.feature_names) 
        dot_data = dot_data.getvalue()#.encode('ascii') # @UndefinedVariable
        graph = pydot.graph_from_dot_data(dot_data)[0]  
        img = graph.create_png()
        return img

       
# if __name__ == '__main__':
#     from test import test_utilities
#     import matplotlib.pyplot as plt
# 
#     ema_logging.log_to_stderr(ema_logging.INFO)
# 
#     def scarcity_classify(outcomes):
#         outcome = outcomes['relative market price']
#         change = np.abs(outcome[:, 1::]-outcome[:, 0:-1])
#         
#         neg_change = np.min(change, axis=1)
#         pos_change = np.max(change, axis=1)
#         
#         logical = (neg_change > -0.6) & (pos_change > 0.6)
#         
#         classes = np.zeros(outcome.shape[0])
#         classes[logical] = 1
#         
#         return classes
#  
#     results = test_utilities.load_scarcity_data()
#     
#     cart = setup_cart(results, scarcity_classify)
#     cart.build_tree()
#     
#     print(cart.boxes_to_dataframe())
#     print(cart.stats_to_dataframe())
#     cart.display_boxes(together=True)
#     
#     img = cart.show_tree()
#      
#     import matplotlib.pyplot as plt
#     import matplotlib.image as mpimg
#   
#     # treat the dot output string as an image file
#     sio = StringIO()
#     sio.write(img)
#     sio.seek(0)
#     img = mpimg.imread(sio)
#       
#     # plot the image
#     imgplot = plt.imshow(img, aspect='equal')
#       
#     plt.show()
def draw_tree(model, name):
    dot_data = StringIO()
    _tree.export_graphviz(model, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(name + ".pdf")
Example #35
def plot_tree(tree: Tree, max_depth: int, iter: int):
    """
           展示单棵决策树
    :param tree: 生成的决策树
    :param max_depth: 决策树的最大深度
    :param iter: 第几棵决策树
    :return:
    """
    root = tree.root_node
    res = []
    # Collect the parent/child relationships of the tree nodes by traversal; traversal (level order) or traversal_preorder (pre-order) can be used
    traversal(root, res)

    # Collect all nodes
    nodes = {}
    index = 0
    for i in res:
        p, c = i[0], i[1]
        if p not in nodes.values():
            nodes[index] = p
            index = index + 1
        if c not in nodes.values():
            nodes[index] = c
            index = index + 1

    # Render the decision tree using DOT syntax
    edges = ''
    node = ''
    # Emit the nodes level by level
    for depth in range(max_depth):
        for nodepair in res:
            if nodepair[0].deep == depth:
                # p and c are the parent and child nodes of the pair
                p, c = nodepair[0], nodepair[1]
                l = len([i for i in range(len(c.data_index)) if c.data_index[i] is True])
                pname = str(list(nodes.keys())[list(nodes.values()).index(p)])
                cname = str(list(nodes.keys())[list(nodes.values()).index(c)])
                if l > 0:
                    edges = edges + pname + '->' + cname + '[label=\"' + str(p.split_feature) + (
                        '<' if p.left_child == c else '>=') + str(p.split_value) + '\"]' + ';\n'

                node = node + pname + '[width=1,height=0.5,color=lemonchiffon,style=filled,shape=ellipse,label=\"id:' + str(
                    [i for i in range(len(p.data_index)) if p.data_index[i] is True]) + '\"];\n' + \
                       (cname + '[width=1,height=0.5,color=lemonchiffon,style=filled,shape=ellipse,label=\"id:' + str(
                           [i for i in range(len(c.data_index)) if
                            c.data_index[i] is True]) + '\"];\n' if l > 0 else '')
                if c.is_leaf and l > 0:
                    edges = edges + cname + '->' + cname + 'p[style=dotted];\n'
                    node = node + cname + 'p[width=1,height=0.5,color=lightskyblue,style=filled,shape=box,label=\"' + str(
                        "{:.4f}".format(c.predict_value)) + '\"];\n'
            else:
                continue
        dot = '''digraph g {\n''' + edges + node + '''}'''
        graph = pdp.graph_from_dot_data(dot)
        # Save the image and display it with pyplot
        graph.write_png('results/NO.{}_tree.png'.format(iter))
        img = Image.open('results/NO.{}_tree.png'.format(iter))
        img = img.resize((1024, 700), Image.ANTIALIAS)
        plt.ion()
        plt.figure(1, figsize=(30, 20))
        plt.axis('off')
        plt.title('NO.{} tree'.format(iter))
        plt.rcParams['figure.figsize'] = (30.0, 20.0)
        plt.imshow(img)
        plt.pause(0.01)
Example #36
def graph(request):
    if request.method == 'GET':
        length = len(Applicants2016.objects.all())
        applicants = Applicants2016.objects.all()[:(length // 2 - 1)]
        clf = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=1)
        clf2 = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=1)
        Y = []
        X = []
        Z = []
        for a in applicants:
            if a.apply_for == "mphil":
                Y.append("0")
            elif a.apply_for == "phd":
                Y.append("1")
            else:
                Y.append("2")
            Z.append(a.shortlisted)
            X.append([a.toefl, a.gpa_ug / a.gpa_ug_scale, a.papers])
            if a.major_ug == 'CS':
                X[-1].append("1")
            else:
                X[-1].append("0")
        feature_names = ['toefl', 'gpa_ug_scale', 'papers', 'major_ug']
        class_names = ['mphil', 'phd', 'either']
        class2_names = ['not shortlisted', 'shortlisted']
        clf = clf.fit(X, Y)
        clf2 = clf2.fit(X, Z)
        f = tree.export_graphviz(clf,
                                 out_file=None,
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 filled=True,
                                 rounded=True,
                                 special_characters=True)
        f2 = tree.export_graphviz(clf2,
                                  out_file=None,
                                  feature_names=feature_names,
                                  class_names=class2_names,
                                  filled=True,
                                  rounded=True,
                                  special_characters=True)
        graph = pydotplus.graph_from_dot_data(f)
        graph2 = pydotplus.graph_from_dot_data(f2)
        current_dir = os.path.dirname(os.path.abspath(__file__))
        static = os.path.join(current_dir, 'static', 'mining1')
        graph.write_png(os.path.join(static, "tree.png"))
        graph2.write_png(os.path.join(static, "tree2.png"))
        #image_data1=open('tree.png',"rb").read()
        #image_data2=open('tree2.png',"rb").read()
        html = "<p style='text-align:center;'>Decision Tree for program</p><img src='/static/mining1/tree.png'><p style='text-align:center;'>Decision Tree for shortlisted interview</p><img src='/static/mining1/tree2.png'>"

        applicants2 = Applicants2016.objects.all()[(length // 2):length]
        Y = []
        X = []
        Z = []
        for a in applicants2:
            if a.apply_for == "mphil":
                Y.append("0")
            elif a.apply_for == "phd":
                Y.append("1")
            else:
                Y.append("2")
            Z.append(a.shortlisted)
            X.append([a.toefl, a.gpa_ug / a.gpa_ug_scale, a.papers])
            if a.major_ug == 'CS':
                X[-1].append("1")
            else:
                X[-1].append("0")
        result = clf2.predict(X)
        i = 0
        accuracy = 0
        for r in result:
            if r == Z[i]:
                accuracy = accuracy + 1
            i = i + 1
        print(accuracy / i)

        #return HttpResponse(image_data1,content_type="image/png")
        return HttpResponse(html)
    elif request.method == 'POST':
        return _error_response(request, "POST not allowed")
Example #37
        elif listFromLine[2] == 'yes':
            returnMat[index, 2] = 1
        if listFromLine[3] == 'reduced':
            returnMat[index, 3] = 0
        elif listFromLine[3] == 'normal':
            returnMat[index, 3] = 1

        #returnMat[index,:] = listFromLine[0:3]
        if listFromLine[-1] == 'no lenses':
            classLabelVector.append(3)
        elif listFromLine[-1] == 'soft':
            classLabelVector.append(2)
        elif listFromLine[-1] == 'hard':
            classLabelVector.append(1)
        index = index + 1
    return returnMat, classLabelVector


X, y = dataload()

print(X, y)

# Train the model (entropy criterion)
clf = tree.DecisionTreeClassifier(criterion='entropy')
# Fit the model
clf = clf.fit(X, y)

dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("iris.pdf")
# Instantiate Model
clf_gini = DecisionTreeClassifier(criterion='gini',
                                  max_depth=3,
                                  min_samples_leaf=5,
                                  min_samples_split=2,
                                  random_state=100,
                                  splitter='best')
# Fit Model
clf_gini.fit(X_train, y_train)

# Generate Prediction
clf_gini_pred = clf_gini.predict(X_test)

# Calculate Accuracy Score
'''Accuracy = ratio of correctly predicted target values vs all values'''
clf_gini_accuracy = round(accuracy_score(y_test, clf_gini_pred) * 100, 2)
print(clf_gini_accuracy)

# Visualize Tree ------------------------------------
os.chdir(r'/home/ccirelli2/Desktop/repositories/Scikit_Learn/output')
dot_data = StringIO()
export_graphviz(clf_gini,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('balance_scale.png')
Image(graph.create_png())
def show_tree(clf):
	dot_data = StringIO()
	export_graphviz(clf, out_file=dot_data)
	graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
	graph.write_pdf("titanic_tree.pdf")
Example #40
    def ML_tree(X_train, X_test, Y_train, Y_test):
        fileML = open("Ml data.txt", "a+")  # append mode

        def tree_to_code(tree, feature_names):
            tree_ = tree.tree_
            feature_name = [
                feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
                for i in tree_.feature
            ]

            print("def tree({}):\n".format(", ".join(feature_names)))
            fileML.write("def tree({}):\n".format(", ".join(feature_names)))

            def recurse(node, depth):
                indent = "  " * depth
                if tree_.feature[node] != _tree.TREE_UNDEFINED:
                    name = feature_name[node]
                    threshold = tree_.threshold[node]
                    print("{}if {} <= {}:\n".format(indent, name, threshold))
                    fileML.write("{}if {} <= {}:\n".format(
                        indent, name, threshold))
                    recurse(tree_.children_left[node], depth + 1)
                    print("{}else:  # if {} > {}\n".format(
                        indent, name, threshold))
                    fileML.write("{}else:  # if {} > {}\n".format(
                        indent, name, threshold))
                    recurse(tree_.children_right[node], depth + 1)
                else:
                    print("{}return {}\n".format(indent, tree_.value[node]))
                    fileML.write("{}return {}\n".format(
                        indent, tree_.value[node]))

            recurse(0, 1)
            fileML.write("\n")
            # clf = DecisionTreeClassifier().fit(iris.data, iris.target)
            plot_tree(tree, filled=True)
            plt.show()

        clf = DecisionTreeClassifier(random_state=0,
                                     max_depth=10,
                                     min_samples_leaf=1)
        result = clf.fit(X_train, Y_train)

        probs = clf.predict_proba(X_test)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(Y_test, preds)
        roc_auc = metrics.auc(fpr, tpr)

        #---------print----------------------------------------------
        # ----------tree----------------
        print(clf.feature_importances_)
        fileML.write("DTW Decision Tree Classifier feature_importances: \n")
        for i in range(len(clf.feature_importances_)):
            print(X_train.columns[i])
            fileML.write("{0}: ".format(X_train.columns[i]))
            print(clf.feature_importances_[i])
            fileML.write("{0} \n".format(clf.feature_importances_[i]))
            print("2^coeff: " + str(2.0**(clf.feature_importances_[i])))
            fileML.write("2^coeff: {0} \n".format(
                str(2.0**(clf.feature_importances_[i]))))

        fileML.write("\n")

        dot_data = six.StringIO()
        export_graphviz(clf,
                        out_file=dot_data,
                        feature_names=X_train.columns,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf("DTW Decision Tree Classifier.pdf")

        # ----------graph---------------------
        # method I: plt
        plt.title('DTW Receiver Operating Characteristic-Decision Tree')
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        path = os.path.join(parent_dir, "Decision Tree Classifier")
        plt.savefig(path)
        plt.show()
        print('finished')

        #----------function---------
        tree_to_code(clf, [
            "Malicious", "Day count Mean", "Day count STD", 'max Day count',
            "Size", "DTW 15-Malicious(%)", "DTW 10-Malicious(%)",
            "DTW 5-Malicious(%)", "Prevalence", "Peaks", "Sharp peaks"
        ])

        fileML.write("----------------------------------------------\n")
        fileML.close()
def main():
    # Put your code below
    
    test_size_input = float(input())
    random_state_input = int(input())
    criterion_input = input()
    deepth_input = int(input())
    random_state_tree_input = int(input())
    
    #read data
    iris = datasets.load_iris()
    
    #clip data
    iris.feature_names
    
    # use 'sepal length (cm)' and 'sepal width (cm)' as features
    X = iris.data[:,0:2]
    #print(X)
    y = iris.target
    #print(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size_input,random_state = random_state_input) 
    
    #create decision tree classifier learning
    dt_clf = DecisionTreeClassifier(criterion = criterion_input, max_depth = deepth_input, random_state = random_state_tree_input)
    dt_clf.fit(X_train,y_train)
    y_pred = dt_clf.predict(X_test)
    
    #accuracy_score
    acc = accuracy_score(y_test, y_pred)
    print("{:.3f}".format(acc))
    
    dot_data=export_graphviz(dt_clf, out_file=None, feature_names=['sepal length (cm)','sepal width (cm)'])
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('tree1.png')
    Image(graph.create_png())
    
    #read data
    iris = datasets.load_iris()
    
    #clip data
    iris.feature_names
    
    # use 'petal length (cm)' and 'petal width (cm)' as features
    X = iris.data[:,2:4]
    #print(X)
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size_input,random_state = random_state_input) 
    
    #create decision tree classifier learning
    dt_clf = DecisionTreeClassifier(criterion = criterion_input, max_depth = deepth_input, random_state = random_state_tree_input)
    dt_clf.fit(X_train,y_train)
    y_pred = dt_clf.predict(X_test)
    
    #accuracy_score
    acc = accuracy_score(y_test, y_pred)
    print("{:.3f}".format(acc))
    
    dot_data=export_graphviz(dt_clf, out_file=None, feature_names=['petal length (cm)','petal width (cm)'])
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png('tree2.png')
    Image(graph.create_png())
Example #42
    confusion_matrix(y_validacao, y_predicao_validacao)))
print("Matriz de confusão da teste :\n {}".format(
    confusion_matrix(y_teste, y_predicao_teste)))

# Not managing to generate a *.png
arquivo_dot = StringIO()

tree.export_graphviz(
    modeloAD,
    out_file=arquivo_dot,
    node_ids=True,
    feature_names=['Sangue', 'Da a luz', 'Pode voar', 'Mora na agua'],
    class_names=['SIM', 'NAO'],
    filled=True)

arvore = pdp.graph_from_dot_data(arquivo_dot.getvalue())

lista_edge = []
for edge in arvore.get_edge_list():
    lista_edge.append(edge.get_source())

nodes = arvore.get_node_list()
for node in nodes:
    if node.get_name() == '0':
        node.set_fillcolor('#F19C99')
    elif node.get_name() not in lista_edge:
        node.set_fillcolor('#E1D5E7')
    else:
        node.set_fillcolor('#D5E8D4')

arvore.write_png("arvore_mamifero.png")
Example #43
 def to_pdf(self, filename, dic_var=None):
     graph = pydotplus.graph_from_dot_data(self.to_dot(dic_var))
     graph.write_pdf(filename)
Example #44
def plot_multi(trees: dict, max_depth: int, iter: int):
    trees_traversal = {}
    trees_nodes = {}
    for class_index in trees.keys():
        tree = trees[class_index]
        res = []
        root = tree.root_node
        traversal(root, res)
        trees_traversal[class_index] = res
        # Collect all nodes
        nodes = {}
        index = 0
        for i in res:
            p, c = i[0], i[1]
            if p not in nodes.values():
                nodes[index] = p
                index = index + 1
            if c not in nodes.values():
                nodes[index] = c
                index = index + 1
        trees_nodes[class_index] = nodes
        # render the decision trees below using DOT syntax
    trees_edges = {}
    trees_node = {}
    for class_index in trees.keys():
        trees_node[class_index] = ''
        trees_edges[class_index] = ''
    for depth in range(max_depth):
        for class_index in trees.keys():
            for nodepair in trees_traversal[class_index]:
                if nodepair[0].deep == depth:
                    p, c = nodepair[0], nodepair[1]
                    l = len([i for i in range(len(c.data_index)) if c.data_index[i] is True])
                    pname = str(list(trees_nodes[class_index].keys())[list(trees_nodes[class_index].values()).index(p)])
                    cname = str(list(trees_nodes[class_index].keys())[list(trees_nodes[class_index].values()).index(c)])
                    if l > 0:
                        trees_edges[class_index] = trees_edges[class_index] + pname + '->' + cname + '[label=\"' + str(
                            p.split_feature) + (
                                                       '<' if p.left_child == c else '>=') + str(
                            p.split_value) + '\"]' + ';\n'

                        trees_node[class_index] = trees_node[
                                                      class_index] + pname + '[width=1,height=0.5,color=lemonchiffon,style=filled,shape=ellipse,label=\"id:' + str(
                            [i for i in range(len(p.data_index)) if p.data_index[i] is True]) + '\"];\n' + \
                                                  (
                                                      cname + '[width=1,height=0.5,color=lemonchiffon,style=filled,shape=ellipse,label=\"id:' + str(
                                                          [i for i in range(len(c.data_index)) if
                                                           c.data_index[i] is True]) + '\"];\n' if l > 0 else '')
                    if c.is_leaf and l > 0:
                        trees_edges[class_index] = trees_edges[
                                                       class_index] + cname + '->' + cname + 'p[style=dotted];\n'
                        trees_node[class_index] = trees_node[
                                                      class_index] + cname + 'p[width=1,height=0.5,color=lightskyblue,style=filled,shape=box,label=\"' + str(
                            "{:.4f}".format(c.predict_value)) + '\"];\n'
                else:
                    continue
            dot = '''digraph g {\n''' + trees_edges[class_index] + trees_node[class_index] + '''}'''
            graph = pdp.graph_from_dot_data(dot)
            # save the image; display it with pyplot below
            graph.write_png('results/NO.{}_{}_tree.png'.format(iter, class_index))
        plt.ion()
        plt.figure(1, figsize=(30, 20))
        plt.axis('off')
        plt.title('NO.{} iter '.format(iter))
        class_num = len(trees.keys())
        if class_num / 3 - int(class_num / 3) < 0.000001:
            rows = int(class_num / 3)
        else:
            rows = int(class_num / 3) + 1
        for class_index in trees.keys():
            index = list(trees.keys()).index(class_index)
            plt.subplot(rows, 3, index + 1)
            img = Image.open('results/NO.{}_{}_tree.png'.format(iter, class_index))
            img = img.resize((1024, 700), Image.ANTIALIAS)
            plt.axis('off')
            plt.title('NO.{}_class {}'.format(iter, class_index))
            plt.rcParams['figure.figsize'] = (30.0, 20.0)
            plt.imshow(img)
        plt.savefig('results/NO.{}_tree.png'.format(iter))
        plt.pause(0.01)
                        ha='right',
                        fontsize=20)
plt.ylabel('True label', fontsize=20)
plt.xlabel('Predicted label', fontsize=20)
plt.tight_layout()
plt.show()

# display decision tree
dot_data = tree.export_graphviz(clf_gini,
                                filled=True,
                                rounded=True,
                                class_names='survived',
                                feature_names=tt.iloc[:, 0:].columns,
                                out_file=None)

graph = graph_from_dot_data(dot_data)
graph.write_pdf("decision_tree_gini.pdf")
webbrowser.open_new(r'decision_tree_gini.pdf')

dot_data = tree.export_graphviz(clf_entropy,
                                filled=True,
                                rounded=True,
                                class_names='survived',
                                feature_names=tt.iloc[:, 0:].columns,
                                out_file=None)

graph = graph_from_dot_data(dot_data)
graph.write_pdf("decision_tree_entropy.pdf")
webbrowser.open_new(r'decision_tree_entropy.pdf')
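# Note: class_names expects a sequence of label names; a bare string such as
# 'survived' is iterated character by character by export_graphviz. A likely
# intended form (label names assumed here) would be:
#     class_names=['not survived', 'survived']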
#%%-----------------------------------------------------------------------
        lenses_list = []
    # print(lenses_dict)  # print the dictionary
    lenses_pd = pd.DataFrame(lenses_dict)  # build a pandas.DataFrame
    # print(lenses_pd)  # print the DataFrame
    le = LabelEncoder()  # create a LabelEncoder for encoding the categorical values
    for col in lenses_pd.columns:  # encode each column
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    # print(lenses_pd)  # print the encoded data

    clf = tree.DecisionTreeClassifier(
        max_depth=6)  # create the DecisionTreeClassifier
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)  # fit the decision tree on the data

    dot_data = StringIO()
    tree.export_graphviz(
        clf,
        out_file=dot_data,  # render the decision tree
        feature_names=lenses_pd.keys(),
        class_names=clf.classes_,
        filled=True,
        rounded=True,
        special_characters=True)
    # graph = pydotplus.graph_from_dot_data(dot_data.getvalue());
    # the line below fixes garbled Chinese characters by switching to a CJK font
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue().replace(
        'helvetica', '"Microsoft YaHei"'))
    print(dot_data.getvalue())
    graph.write_pdf("tree.pdf")  #保存绘制好的决策树,以PDF的形式存储。

    print(clf.predict([[1, 1, 1, 0]]))  # predict on a new sample
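    # An alternative to patching the DOT text above (a sketch): set a CJK-capable
    # font directly on the pydotplus nodes and edges instead of replacing
    # 'helvetica' in the generated source.
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    for node in graph.get_node_list():
        node.set_fontname('Microsoft YaHei')
    for edge in graph.get_edge_list():
        edge.set_fontname('Microsoft YaHei')
    graph.write_pdf("tree_yahei.pdf")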
    def autolayout(self):
        """
        Using graphviz to layout the current graph to 'dot' layout.
        """

        # Create an empty graph
        graph_viz = Digraph(engine='dot')

        # For all instances, add a node with its size.
        for widget_instance in self.widget_instances:
            assert isinstance(widget_instance, InstanceWidget)

            if widget_instance.hidden:
                continue

            size = widget_instance.preferredSize()
            assert isinstance(size, QtCore.QSizeF)

            graph_viz.node(widget_instance.name, width=str(size.width() / 72.0),
                           height=str(size.height() / 72.0), shape="rect")

        # For all (not hidden) connections, connect the source and destination widgets with a minimum length of 2 inches
        for connection in self.connection_widgets:
            assert isinstance(connection, ConnectionWidget)
            if not connection.hidden:
                graph_viz.edge(connection.source_instance_widget.name, connection.dest_instance_widget.name,
                               minlen=str(2))

        # Generate / Render graph into 'dot' format
        raw_dot_data = graph_viz.pipe('dot').decode('utf-8')  # pipe() returns bytes; decode for pydotplus
        print("Graphviz rendering... the following is the dot file from graphviz")
        print(raw_dot_data)

        # Read dot format (using pydotplus)
        dot_data = pydotplus.graph_from_dot_data(raw_dot_data)

        # Get graphviz height
        graph_attributes = dot_data.get_graph_defaults()

        height = 0

        for attribute_dict in graph_attributes:
            if not isinstance(attribute_dict, dict):
                continue

            if attribute_dict.get('bb') is not None:
                rectangle = Common.extract_numbers(attribute_dict['bb'])
                height = rectangle[1][1] - rectangle[0][1]

        # For all instances, apply new position to widgets.

        for instance_widget in self.widget_instances:
            assert isinstance(instance_widget, InstanceWidget)

            if instance_widget.hidden:
                continue

            # Get instance's name
            instance_name = instance_widget.name

            # Get the node representing this instance, and get its attributes
            node_list = dot_data.get_node(instance_name)

            if len(node_list) < 1:
                # Name may be quoted due to special characters
                quoted_name = '"%s"' % instance_name
                node_list = dot_data.get_node(quoted_name)

            assert len(node_list) == 1  # Should only be one node
            assert isinstance(node_list[0], pydotplus.Node)
            node_attributes_dict = node_list[0].get_attributes()

            # Extract position of the node
            node_position_list = Common.extract_numbers(node_attributes_dict['pos'])
            assert len(node_position_list) == 1  # Should only be one position
            node_position = node_position_list[0]

            self.reposition_instance_widget(instance_widget, x_pos=node_position[0],
                                            y_pos=math.fabs(height - node_position[1]))

        self.update_view()
        self.save_layout_to_file()
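# A standalone sketch of the same layout round-trip used above (assumes the
# graphviz binaries are installed): lay a graph out with the 'dot' engine,
# re-parse the result with pydotplus, and read back each node's 'pos' attribute.
import pydotplus
from graphviz import Digraph

g = Digraph(engine='dot')
g.node('a', shape='rect')
g.node('b', shape='rect')
g.edge('a', 'b', minlen='2')

laid_out = pydotplus.graph_from_dot_data(g.pipe('dot').decode('utf-8'))
for node in laid_out.get_node_list():
    pos = node.get_attributes().get('pos')
    if pos:
        x, y = (float(v) for v in pos.strip('"').split(','))
        print(node.get_name(), x, y)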
示例#48
0
car_data = np.genfromtxt("auto-mpg-modified.data", usecols=range(8))
car_data = car_data[~np.isnan(car_data).any(axis=1)]

# Assign MPG to y and all other attributes to x
data = car_data[:, 1:]
labels = car_data[:, 0]

# Uncomment to add some noise to the data
#noise = np.random.normal(0, 10, len(data))
#data[:,5] += noise.astype(int)

dt = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
dt = dt.fit(data, labels)

attributes = [
    "CYLYNDERS", "DISPLACEMENT", "HORSEPOWER", "WEIGHT", "ACCELERATION",
    "MODEL_YEAR", "ORIGIN"
]
class_labels = ["BAD", "OK", "GOOD"]

out = StringIO()
tree.export_graphviz(dt,
                     out_file=out,
                     feature_names=attributes,
                     class_names=class_labels,
                     filled=True,
                     impurity=False)

pydotplus.graph_from_dot_data(out.getvalue()).write_png("dtree.png")
示例#49
0
def hr_modeling(features, label, tree_vis=False, ann=False):
    # split the original dataset into train/validation/test sets with a 6:2:2 ratio
    f_v = features.values
    f_n = features.columns.values
    l_v = label.values
    x_tt, x_validation, y_tt, y_validation = train_test_split(f_v,
                                                              l_v,
                                                              test_size=0.2)
    x_train, x_test, y_train, y_test = train_test_split(x_tt,
                                                        y_tt,
                                                        test_size=0.25)

    models = []
    models.append(('KNN', KNeighborsClassifier(n_neighbors=3)))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('DecisionTreeGini', DecisionTreeClassifier()))
    models.append(
        ('DecisionTreeEntropy', DecisionTreeClassifier(criterion='entropy')))
    models.append(('SVM Classifier', SVC(C=1000)))
    models.append(('RandomForest',
                   RandomForestClassifier(n_estimators=81, max_features=None)))
    models.append(('Adaboost', AdaBoostClassifier()))
    models.append(('LogisticRegression',
                   LogisticRegression(penalty='l2', C=1.0, tol=1e-10)))
    models.append(
        ('GBDT', GradientBoostingClassifier(max_depth=6, n_estimators=100)))
    for clf_name, clf in models:
        clf.fit(x_train, y_train)
        xy_list = [(x_train, y_train), (x_validation, y_validation),
                   (x_test, y_test)]
        for i in range(len(xy_list)):
            x_part = xy_list[i][0]
            y_part = xy_list[i][1]
            y_pred = clf.predict(x_part)
            print(i)  # evaluate the model on the training, validation, and test sets in turn
            print(clf_name, '-ACC:', accuracy_score(y_part, y_pred))
            print(clf_name, '-REC:', recall_score(y_part, y_pred))
            print(clf_name, '-F1:', f1_score(y_part, y_pred))

        # decision trees can additionally be rendered as a graph
        if clf_name.startswith('DecisionTree') and tree_vis:
            dot_data = export_graphviz(clf,
                                       out_file=None,
                                       feature_names=f_n,
                                       class_names=['NL', 'L'],
                                       filled=True,
                                       rounded=True,
                                       special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data)
            graph.write_pdf('dt_tree.pdf')

    if ann:
        ann_model = Sequential()
        ann_model.add(Dense(50, input_dim=len(f_v[0])))
        ann_model.add(Activation('sigmoid'))
        ann_model.add(Dense(2))
        ann_model.add(Activation('softmax'))
        sgd = SGD(lr=0.1)
        ann_model.compile(optimizer=sgd, loss='mean_squared_error')
        ann_model.fit(x=x_train,
                      y=np.array([[0, 1] if i == 1 else [1, 0]
                                  for i in y_train]),
                      nb_epoch=15000,
                      batch_size=8999)
        xy_list = [(x_train, y_train), (x_validation, y_validation),
                   (x_test, y_test)]
        for i in range(len(xy_list)):
            x_part = xy_list[i][0]
            y_part = xy_list[i][1]
            y_pred = ann_model.predict_classes(x_part)
            print(i)
            print('ANN', '-ACC:', accuracy_score(y_part, y_pred))
            print('ANN', '-REC:', recall_score(y_part, y_pred))
            print('ANN', '-F1:', f1_score(y_part, y_pred))
示例#50
0
def randomforest_predict():
    warnings.filterwarnings('ignore')

    df_data = pd.read_csv("data/housing.data", delim_whitespace=True)
    X = df_data.drop(["MEDV"], axis=1)
    y = df_data["MEDV"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=128)

    param_grid = {
        'n_estimators': [5, 10, 20, 50, 100, 200],  # tree number
        'max_depth': [3, 5, 7],  # max depth
        'max_features': [0.6, 0.7, 0.8, 1]  # max features
    }

    rf = RandomForestRegressor()
    grid = GridSearchCV(rf, param_grid=param_grid, cv=3)
    grid.fit(X_train, y_train)
    print("best_params", grid.best_params_)

    rf_reg = grid.best_estimator_
    print(rf_reg)

    estimator = rf_reg.estimators_[3]
    dot_data = tree.export_graphviz(estimator, out_file=None, filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png("result/rf_reg.png")

    feature_names = X.columns
    feature_importances = rf_reg.feature_importances_
    indices = np.argsort(feature_importances)[::-1]
    for index in indices:
        print("feature %s (%f)" % (feature_names[index], feature_importances[index]))

    plt.figure(figsize=(16, 8))
    plt.title("feature importance of random forest")
    plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
    plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
    plt.show()

    rst = {"label": y_test, "prediction": rf_reg.predict(X_test)}
    rst = pd.DataFrame(rst)
    print(rst.head())

    rst['label'].plot(style='k.', figsize=(15, 5))
    rst['prediction'].plot(style='r.')
    plt.legend(fontsize=15, markerscale=3)
    plt.tick_params(labelsize=25)
    plt.grid()
    plt.show()

    MSE = metrics.mean_squared_error(y, rf_reg.predict(X))
    print(np.sqrt(MSE))
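    # Note: the MSE above is computed over the full dataset, which includes the
    # training rows. A held-out check (a sketch, same names as above) would be:
    test_mse = metrics.mean_squared_error(y_test, rf_reg.predict(X_test))
    print("test RMSE:", np.sqrt(test_mse))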

    submission = {"prediction": rf_reg.predict(X_test)}
    submission = pd.DataFrame(submission)
    submission.to_csv("result/price_predict_randomforest.csv")

    y_predict = rf_reg.predict(X_test)
    x_data = np.arange(len(y_test))[:, np.newaxis]
    y_test_data = y_test.values[:, np.newaxis]
    y_predict_data = y_predict[:, np.newaxis]
    plt.plot(x_data, y_test_data, label='Price')
    plt.plot(x_data, y_predict_data, label='Predict price')
    plt.xlabel('Entity')
    plt.ylabel('Price')
    plt.title('Price prediction (random forest)')
    plt.legend()
    plt.savefig('result/price_predict_random_forest.png')
    plt.show()
        [1, 0, 1, 2, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [2, 0, 1, 2, 'yes'],
        [2, 0, 1, 1, 'yes'],
        [2, 1, 0, 1, 'yes'],
        [2, 1, 0, 2, 'yes'],
        [2, 0, 0, 0, 'no']
    ]
    y_label = "isOk"
    x_labels = ["age", "work", "house", "credit"]
    return data, x_labels, y_label


if __name__ == '__main__':
    data, x_labels, y_label = createDataSet()
    x, y = [], []
    for i in data:
        y.append(i[-1])
        x.append(i[:-1])
    train_x = df(x, columns=x_labels)
    train_y = df(y, columns=[y_label])
    clf = tree.DecisionTreeClassifier(
        criterion="entropy", max_depth=5)  # 创建DecisionTreeClassifier()类
    clf = clf.fit(x, y)  # 使用数据,构建决策树
    reg_dot_data = tree.export_graphviz(clf,
                                        out_file=None,
                                        feature_names=train_x.keys(),
                                        class_names=clf.classes_)  # export the tree for visualization
    reg_graph = pydotplus.graph_from_dot_data(reg_dot_data)
    reg_graph.write_png('tree.png')  # save the tree as an image
示例#52
0
def test_16_Tree():

    strOf_FuncName = "test_16_Tree"

    '''###################
        step : 1
            opening, vars
    ###################'''
    print()
    
    print ("[%s:%d] starting : %s (time=%s)" % (
                os.path.basename(os.path.basename(libs.thisfile()))
                , libs.linenum()
                , strOf_FuncName
                , libs.get_TimeLabel_Now()
                )
    )

    print()
    
    #ref https://www.devdungeon.com/content/python-import-syspath-and-pythonpath-tutorial#toc-13
    print ("[%s:%d] sys.path ==>" % (
                os.path.basename(os.path.basename(libs.thisfile()))
                , libs.linenum()
                )
    )
    
    print(sys.path)
    
    print()
    
    #ref https://www.devdungeon.com/content/python-import-syspath-and-pythonpath-tutorial#toc-13
    print ("[%s:%d] os.environ[\"PATH\"] ==>" % (
                os.path.basename(os.path.basename(libs.thisfile()))
                , libs.linenum()
                )
    )
    
    print(os.environ["PATH"])
    
    '''###################
        step : 2
            data : load
    ###################'''
    df = pandas.read_csv("shows.csv")

#     #debug
#     print ("[%s:%d] df ==> " 
#            % 
#             (os.path.basename(libs.thisfile()), libs.linenum())
#            )
#        
#     print(df)

    '''###################
        step : 2 : 2
            data : mapping
    ###################'''
    d = {'UK': 0, 'USA': 1, 'N': 2}
    df['Nationality'] = df['Nationality'].map(d)
    d = {'YES': 1, 'NO': 0}
    df['Go'] = df['Go'].map(d)

#     #debug
#     print ("[%s:%d] df (mapped) ==> " 
#            % 
#             (os.path.basename(libs.thisfile()), libs.linenum())
#            )
#        
#     print(df)

    '''###################
        step : 2 : 3
            data : feature, target
    ###################'''
    features = ['Age', 'Experience', 'Rank', 'Nationality']
    
    X = df[features]
    y = df['Go']    
    
#     #debug
#     print ("[%s:%d] feature, column ==> " 
#            % 
#             (os.path.basename(libs.thisfile()), libs.linenum())
#            )
#        
#     print(X)
#     print(y)
    
    '''###################
        step : 3
            tree
    ###################'''
    dtree = DecisionTreeClassifier()
    dtree = dtree.fit(X, y)    
    
#     #debug
#     print ("[%s:%d] dtree ==> " 
#            % 
#             (os.path.basename(libs.thisfile()), libs.linenum())
#            )
#        
#     print(dtree)
    
    #debug:20210418_170812
    data = tree.export_graphviz(dtree, out_file=None, feature_names=features)
    
#     #debug
#     print ("[%s:%d] data ==> " 
#            % 
#             (os.path.basename(libs.thisfile()), libs.linenum())
#            )
#        
#     print(data)

    #mark:20210503_164821
    graph = pydotplus.graph_from_dot_data(data)

#     #debug
#     print ("[%s:%d] graph ==> " 
#            % 
#             (os.path.basename(libs.thisfile()), libs.linenum())
#            )
#         
#     print(graph)    

    '''###################
        step : 4
            graph
    ###################'''
    strOf_Time_Label = libs.get_TimeLabel_Now()
    
    dpath_PlotImage = "./data/s-10"
#     dpath_PlotImage = "./data/s-9"
    fname_PlotImage = "mydecisiontree.%s.png" % (strOf_Time_Label)
#     fname_PlotImage = "plot_image_%s" % (strOf_Time_Label)
     
    fpath_PlotImage = os.path.join(dpath_PlotImage, fname_PlotImage)
    
    graph.write_png(fpath_PlotImage)
#     graph.write_png('mydecisiontree.png')

    #debug
    print ("[%s:%d] graph.write_png ==> %s" 
           % 
            (
             os.path.basename(libs.thisfile()), libs.linenum()
             , fpath_PlotImage
             )
           )
       
    
    img=pltimg.imread(fpath_PlotImage)
#     img=pltimg.imread('mydecisiontree.png')
    imgplot = plt.imshow(img)
#     plt.show() 
     
    '''###################
        step : 5
            predict
    ###################'''
    litOf_Predict_Conditions    = [40, 10, 6, 1]
#     litOf_Predict_Conditions    = [40, 10, 7, 1]
    
    #debug
    print ("[%s:%d] litOf_Predict_Conditions ==>" 
           % 
            (
             os.path.basename(libs.thisfile()), libs.linenum()
             
             )
           )
    
    print(litOf_Predict_Conditions)
    print(dtree.predict([litOf_Predict_Conditions])) 
#     print(dtree.predict([[40, 10, 7, 1]])) 
    
    
    
    '''###################
        step : 2
            prep
    ###################'''
    
    '''###################
示例#53
0
def createDataSet(dict):
    allElectronicsData = open(dict['in_file_path'])
    reader = csv.reader(allElectronicsData)
    headers = next(reader)

    print("headers\n",headers)
    featureList = []

    labelList = []

    for row in [rows for rows in reader]:
        labelList.append(row[len(row) - 1])
        rowDict = {}
        for i in range(0, len(row) - 1):
            rowDict[headers[i]] = row[i]
        featureList.append(rowDict)

    print("labelList\n",labelList)
    vec = DictVectorizer()
    dummyX = vec.fit_transform(featureList).toarray()
    print('show vector name\n',vec.get_feature_names())

    print("dummyX\n",dummyX)
    # convert the labels to 0/1 form (binarization is left disabled below)
    #lb = preprocessing.LabelBinarizer()
    #dummyY = lb.fit_transform(labelList)
    dummyY=labelList
    print("dummyY\n",dummyY)
    print(dict)
    clf = tree.DecisionTreeClassifier(
                 criterion=dict['criterion'],
                 splitter=dict['splitter'],
                 max_depth=dict['max_depth'],
                 min_samples_split=dict['min_samples_split'],
                 min_samples_leaf=dict['min_samples_leaf'],
                 min_weight_fraction_leaf=dict['min_weight_fraction_leaf'],
                 max_features=dict['max_features'],
                 random_state=dict['random_state'],
                 max_leaf_nodes=dict['max_leaf_nodes'],
                 min_impurity_decrease=dict['min_impurity_decrease'],
                 min_impurity_split=dict['min_impurity_split'],
                 class_weight=dict['class_weight'],
                 presort=dict['presort'],
                 ccp_alpha=dict['ccp_alpha'])
    print(clf)
    clf.fit(dummyX, dummyY)
    print("training score : %.3f " % (clf.score(dummyX, dummyY)))
    import pydotplus
    from six import StringIO
    dot_data = StringIO()
    model_path=dict['out_file_path']
    model_parent_path=os.path.split(model_path)[0]
    if not os.path.exists(model_parent_path):
        os.makedirs(model_parent_path)
    pdf_path=os.path.splitext(model_path)[0]+".pdf"
    dot_path=os.path.splitext(model_path)[0]+".dot"
    with open(dot_path, 'w') as f :
        f = tree.export_graphviz(clf, out_file = f, class_names=clf.classes_,
                feature_names = vec.get_feature_names())
    tree.export_graphviz(clf, out_file=dot_data,feature_names = vec.get_feature_names(),
                         class_names=clf.classes_,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(pdf_path)

    joblib.dump(clf, model_path)
    plt.figure()
    plot_tree(clf, filled=True,feature_names = vec.get_feature_names(),class_names=clf.classes_)
    png_path=os.path.splitext(model_path)[0]+".png"
    plt.savefig(png_path)
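    # A minimal usage sketch for the artifacts saved above (the feature values
    # here are hypothetical; the real ones depend on the input CSV): reload the
    # model with joblib and vectorize a new sample with the fitted DictVectorizer.
    loaded_clf = joblib.load(model_path)
    sample = dict(zip(headers[:-1], ['youth', 'high', 'no', 'fair']))  # hypothetical row
    print(loaded_clf.predict(vec.transform([sample]).toarray()))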
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):

    feature_names, features = check_col_type(table, feature_cols)
    y_train = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')

    class_labels = sorted(set(y_train))
    if class_weight is not None:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        else:
            class_weight = {
                class_labels[i]: class_weight[i]
                for i in range(len(class_labels))
            }

    classifier = DecisionTreeClassifier(
        criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
        min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
        min_impurity_decrease, min_impurity_split, class_weight, presort)
    classifier.fit(features, table[label_col], sample_weight, check_input,
                   X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_names,
                        class_names=classifier.classes_.astype(str),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_names)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()
    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]]
         for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
示例#55
0
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pydotplus as pdp
from IPython.display import Image
url="https://raw.githubusercontent.com/venky14/Machine-Learning-with-Iris-Dataset/master/Iris.csv"
cols=['Id','SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']
pima=pd.read_csv(url)
print(pima.head())
feature_cols=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']
x=pima[feature_cols]
y=pima.Species
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)
clf=DecisionTreeClassifier()
clf=clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
result=confusion_matrix(y_test,y_pred)
print("confusion matrix")
print(result)
result1=classification_report(y_test,y_pred)
print("classification report")
print(result1)
result2=accuracy_score(y_test,y_pred)
print("accuracy: ",result2)
dot_data=export_graphviz(clf,out_file=None,filled=True,rounded=True,special_characters=True,feature_names=feature_cols,class_names=['Iris-setosa','Iris-versicolor','Iris-virginica'])
graph=pdp.graph_from_dot_data(dot_data) 
graph.write_png('Iris.png')
Image(graph.create_png())
示例#56
0
                                   min_samples_leaf=9,
                                   min_samples_split=6,
                                   min_weight_fraction_leaf=0.0,
                                   presort=False,
                                   random_state=42,
                                   splitter='best')
clf1 = clf1.fit(X1, Y1)
dot_data1 = tree.export_graphviz(
    clf1,
    out_file=None,
    feature_names=[pos_names[x] for x in features_train1],
    class_names=['ctrl', 'expr'],
    filled=True,
    rounded=True,
    special_characters=True)
graph1 = pydotplus.graph_from_dot_data(dot_data1)
graph1.write_pdf("tree-dataset-1.pdf")

clf2 = tree.DecisionTreeClassifier(class_weight=None,
                                   criterion='entropy',
                                   max_depth=3,
                                   max_features=None,
                                   max_leaf_nodes=None,
                                   min_impurity_split=1e-07,
                                   min_samples_leaf=5,
                                   min_samples_split=4,
                                   min_weight_fraction_leaf=0.0,
                                   presort=False,
                                   random_state=42,
                                   splitter='best')
clf2 = clf2.fit(X2, Y2)
示例#57
0
                  'green', 'dark', 'dark', 'dark', 'green',
                  'white', 'white', 'green', 'white', 'dark', 'white', 'green'],
        'root': ['fully rolled', 'fully rolled', 'fully rolled', 'fully rolled', 'fully rolled', 'slightly rolled ','slightly rolled ', 'slightly rolled ',
                 'slightly rolled ', 'straight', 'slightly rolled ', 'fully rolled', 'slightly rolled ', 'slightly rolled ','slightly rolled ', 'fully rolled', 'fully rolled'],

        'response': ['boom', 'low', 'boom', 'low', 'boom', 'boom', 'boom', 'boom',
                 'low', 'clear', 'clear', 'boom', 'boom', 'low', 'boom', 'boom', 'low'],
        'texture': ['clear'] * 6 + ['slightly paste', 'clear', 'slightly paste', 'clear', 'paste', 'paste',
                 'slightly paste', 'slightly paste', 'clear', 'paste', 'slightly paste'],
        'navel': ['dent'] * 5 + ['slightly dent'] * 4 + ['flat'] * 3 + ['dent'] * 2 + \
                ['slightly dent', 'flat', 'slightly dent'],
        'touch': ['hard slip'] * 5 + ['soft sticky ', 'soft sticky ', 'hard slip', 'hard slip', 'soft sticky ', 'hard slip',
                 'soft sticky ', 'hard slip', 'hard slip', 'soft sticky ', 'hard slip', 'hard slip'],
        'good': ['good'] * 8 + ['bad'] * 9})
    test_data = test_data[[
        'color', 'root', 'response', 'texture', 'navel', 'touch', 'good'
    ]]
    ##    print(test_data.show())
    X, Y = test_data[:'touch'], test_data['good']
    mytree = DecisionTreeClassifier()
    mytree.fit(X, Y)
    import pydotplus
    graph = pydotplus.graph_from_dot_data(mytree.export_graphviz())
    graph.write_pdf(r'C:\Users\JacksonWoo\Desktop\boston.pdf')
    print(mytree)
    print(mytree.predict(X))
    print(
        mytree.predict(
            SeriesSet([['red', 'red', 'clear', 'None', 'None',
                        'soft sticky']])))
示例#58
0
def automate(wordfile):
    words = []
    accuracy = []
    precision = []
    recall = []
    fscore = []

    with open(args.wordfile, encoding='utf8') as wordfile:
        for word in wordfile:
            words.append(word.rstrip())
    count = 0
    while len(words) > count:
        target_word_1 = words[count]
        count += 1
        target_word_2 = words[count]
        count += 1

        with open(
                args.filename, encoding='utf8'
        ) as data_file:  # The datafile contains all sentence examples of the category in consideration
            sent = preprocess(data_file)
            sentences = tuple_sent(sent)
            sent_list_1 = sent_list(sentences, target_word_1)
            sent_list_2 = sent_list(sentences, target_word_2)
            data_matrix_1 = np.zeros([1, 16])
            data_matrix_2 = np.zeros([1, 16])
            for i in sent_list_1:
                target_vector = feature_extraction(i, target_word_1)
                data_matrix_1 = np.vstack(
                    (data_matrix_1, target_vector)
                )  # Create a data matrix for all sentence examples of the target words
            for i in sent_list_2:
                target_vector = feature_extraction(i, target_word_2)
                data_matrix_2 = np.vstack((data_matrix_2, target_vector))
            N1, D1 = data_matrix_1.shape
            N2, D2 = data_matrix_2.shape
            target1 = np.zeros(N1)
            target2 = np.ones(N2)
            target = np.concatenate((target1, target2), axis=0)
            data = np.concatenate(
                (data_matrix_1, data_matrix_2), axis=0
            )  # A unified data matrix containing all examples for both candidates
            feature_names = [
                "FO V", "FO H", "PH V", "PH H", "NF V", "NF H", "AF V", "AF H",
                "SMO V", "SMO H", "KVK V2", "KK V2", "NHM V", "NHM H", "NH V",
                "NH H"
            ]

        clf = tree.DecisionTreeClassifier()

        cross_val_accuracy_scores = cross_val_score(clf, data, target, cv=10)
        cross_val_precision_scores = cross_val_score(clf,
                                                     data,
                                                     target,
                                                     cv=10,
                                                     scoring="precision")
        cross_val_recall_scores = cross_val_score(clf,
                                                  data,
                                                  target,
                                                  cv=10,
                                                  scoring="recall")
        cross_val_f1_scores = cross_val_score(clf,
                                              data,
                                              target,
                                              cv=10,
                                              scoring="f1")
        print("Results for", target_word_1, "and", target_word_2)
        #print("Cross validation accuracy scores:", cross_val_accuracy_scores) # All cross val scores individually
        sumav = sum(cross_val_accuracy_scores
                    ) / 10  # Average over all cross val scores
        print("Average accuracy:", sumav)
        #print("Cross validation precision scores:", cross_val_precision_scores)
        sumav2 = sum(cross_val_precision_scores) / 10
        print("Average precision:", sumav2)
        #print("Cross validation recall scores:", cross_val_recall_scores)
        sumav3 = sum(cross_val_recall_scores) / 10
        print("Average recall:", sumav3)
        #print("Cross validation F1 scores:", cross_val_f1_scores)
        sumav4 = sum(cross_val_f1_scores) / 10
        print("Average F1:", sumav4)

        accuracy.append(sumav)
        precision.append(sumav2)
        recall.append(sumav3)
        fscore.append(sumav4)

        #Visualize data
        clf.fit(data, target)
        dot_data = tree.export_graphviz(clf,
                                        feature_names=feature_names,
                                        out_file=None,
                                        filled=True,
                                        rounded=True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        colors = ('turquoise', 'orange')
        edges = collections.defaultdict(list)
        for edge in graph.get_edge_list():
            edges[edge.get_source()].append(int(edge.get_destination()))
        for edge in edges:
            edges[edge].sort()
            for i in range(2):
                dest = graph.get_node(str(edges[edge][i]))[0]
                dest.set_fillcolor(colors[i])
        graph.write_png('tree_test.png')  # Outputs a visualized tree graph
        target_names = []
        target_names.append(target_word_1)
        target_names.append(target_word_2)
        feature_importance = clf.feature_importances_
        y_pos = np.arange(len(feature_importance))
        plt.barh(y_pos, feature_importance)
        plt.title("Mikilvægi þátta fyrir " + str(target_word_1) + " and " +
                  str(target_word_2))
        plt.yticks(y_pos, feature_names)
        plt.show(
        )  # Outputs a bar chart showing the importance of each feature

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", fscore)
示例#59
0
def main():
    # Import data and create decision tree
    #dermatology=datasets.load_dermatology()
    dermData = pd.read_csv('dermatologyWOnan.data')
    dermData.columns = [
        'erythema', 'scaling', 'definite borders', 'itching',
        'koebner phenomenon', 'polygonal papules', 'follicular papules',
        'oral mucosal involvement', 'knee and elbow involvement',
        'scalp involvement', 'family history', 'melanin incontinence',
        'eosinophils in the infiltrate', 'PNL infiltrate',
        'fibrosis of the papillary dermis', 'exocytosis', 'acanthosis',
        'hyperkeratosis', 'parakeratosis', 'clubbing of the rete ridges',
        'elongation of the rete ridges',
        'thinning of the suprapapillary epidermis', 'spongiform pustule',
        'munro microabcess', 'focal hypergranulosis',
        'disappearance of the granular layer',
        'vacuolisation and damage of basal layer', 'spongiosis',
        'saw-tooth appearance of retes', 'follicular horn plug',
        'perifollicular parakeratosis', 'inflammatory monoluclear inflitrate',
        'band-like infiltrate', 'age', 'label'
    ]
    dermData.to_csv('dermData.csv')

    target = dermData[
        'label']  #provided your csv has header row, and the label column is named "Label"

    #select all but the last column as data
    data = dermData.iloc[:, :-1]
    data_feature_names = [
        'erythema', 'scaling', 'definite borders', 'itching',
        'koebner phenomenon', 'polygonal papules', 'follicular papules',
        'oral mucosal involvement', 'knee and elbow involvement',
        'scalp involvement', 'family history', 'melanin incontinence',
        'eosinophils in the infiltrate', 'PNL infiltrate',
        'fibrosis of the papillary dermis', 'exocytosis', 'acanthosis',
        'hyperkeratosis', 'parakeratosis', 'clubbing of the rete ridges',
        'elongation of the rete ridges',
        'thinning of the suprapapillary epidermis', 'spongiform pustule',
        'munro microabcess', 'focal hypergranulosis',
        'disappearance of the granular layer',
        'vacuolisation and damage of basal layer', 'spongiosis',
        'saw-tooth appearance of retes', 'follicular horn plug',
        'perifollicular parakeratosis', 'inflammatory monoluclear inflitrate',
        'band-like infiltrate', 'age'
    ]

    #df=pd.DataFrame(dermatology.data, columns=dermatology.names)

    dtree = DecisionTreeClassifier()
    dtree.fit(data, target)

    # Plot decision tree
    #dot_data = StringIO()
    dot_data = export_graphviz(dtree,
                               out_file=None,
                               feature_names=data_feature_names,
                               filled=True,
                               rounded=True,
                               precision=2,
                               special_characters=True)

    #export_graphviz(dtree, out_file='tree_test.dot', feature_names = iris.feature_names,
    #            class_names = iris.target_names,
    #            rounded = True,  precision = 2, filled = True)

    graph = pdot.graph_from_dot_data(dot_data)
    colors = ('turquoise', 'orange')
    edges = collections.defaultdict(list)

    for edge in graph.get_edge_list():
        edges[edge.get_source()].append(int(edge.get_destination()))

    for edge in edges:
        edges[edge].sort()
        for i in range(2):
            dest = graph.get_node(str(edges[edge][i]))[0]
            dest.set_fillcolor(colors[i])

    #graphviz.render('dot', 'png', 'test-output/holy-grenade.gv')
    graph.write_png('tree_test.png')
示例#60
0
def main(): 
	
	# Building Phase 
	data = importdata() 
	X, Y, X_train, X_test, y_train, y_test = splitdataset(data) 
	clf_gini = train_using_gini(X_train, X_test, y_train) 
	clf_entropy = tarin_using_entropy(X_train, X_test, y_train)

        #Visualizing tree using Gini Index
	dot_data = StringIO()
	export_graphviz(clf_gini, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
	graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
	graph.write_png('gini_graph.png')
	Image(graph.create_png())

	
	print('\n')
        # Operational Phase 
	print("Results Using Gini Index:") 
	print ("\n")
	
	# Prediction using gini 
	y_pred_gini = prediction(X_test, clf_gini)
        # Test instance predictions
	print("\n")
	
	test1_set =[1,1,1,1]
	print ("Test instance 1:  ", test1_set) 
	test1 = clf_gini.predict([test1_set])
	print("Predicted label: ", test1)
	print("Actual label: B")
	print('\n')
        
	test2_set =[1,3,2,3]
	print ("Test instance 2:  ", test2_set) 
	test2 = clf_gini.predict([test2_set])
	print("Predicted label: ", test2)
	print("Actual label: R")
	print('\n')

	test3_set = [5,4,5,1]
	print ("Test instance 3:  ", test3_set) 
	test3 = clf_gini.predict([test3_set])
	print("Predicted label: ", test3)
	print("Actual label: L")
	print('\n')

	test7_set = [1,4,1,4]
	print ("Test instance 4:  ", test7_set) 
	test7 = clf_gini.predict([test7_set])
	print("Predicted label: ", test7)
	print("Actual label: B")
	print('\n')
        
	
	cal_accuracy(y_test, y_pred_gini)

        #Visualizing tree using Entropy
	dot_data = StringIO()
	export_graphviz(clf_entropy, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
	graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
	graph.write_png('entropy_graph.png')
	Image(graph.create_png())
        
	print("Results Using Entropy:")

	print('\n')
	# Prediction using entropy 
	y_pred_entropy = prediction(X_test, clf_entropy)
	print('\n')

	
	test4_set =[1,1,1,1]
	print ("Test instance 1:  ", test4_set) 
	test4 = clf_entropy.predict([test4_set])
	print("Predicted label: ", test4)
	print("Actual label: B")
	print('\n')
        
	test5_set =[1,3,2,3]
	print ("Test instance 2:  ", test5_set) 
	test5 = clf_entropy.predict([test5_set])
	print("Predicted label: ", test5)
	print("Actual label: R")
	print('\n')

	test6_set = [5,4,5,1]
	print ("Test instance 3:  ", test6_set) 
	test6 = clf_entropy.predict([test6_set])
	print("Predicted label: ", test6)
	print("Actual label: L")
	print('\n')

	test8_set = [1,4,1,4]
	print ("Test instance 4:  ", test8_set) 
	test8 = clf_entropy.predict([test8_set])
	print("Predicted label: ", test8)
	print("Actual label: B")
	print('\n')
	
	
	cal_accuracy(y_test, y_pred_entropy)