def Binning(self, col, x, y):
        # Tune depth and split size with a small grid search.
        tdc = DecisionTreeClassifier()
        params = {'max_depth': [2, 3, 4], 'min_samples_split': [2, 3, 5, 10]}
        grid = GridSearchCV(tdc, param_grid=params, scoring='accuracy')
        grid.fit(x, y)
        max_depth = grid.best_params_['max_depth']
        min_samples_split = grid.best_params_['min_samples_split']

        # Weight the classes only when the target uses the 'Y'/'N' labels;
        # for other label sets the binning split points may need adjusting.
        kwargs = {}
        if "Y" in set(y):
            kwargs['class_weight'] = {"Y": 1.0, "N": 5.6}
        tdcs = DecisionTreeClassifier(max_depth=max_depth,
                                      min_samples_split=min_samples_split,
                                      **kwargs)
        tdcs.fit(x, y)
        tree_rules = export_text(tdcs, feature_names=list(x.columns))
        return tree_rules, max_depth, min_samples_split
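
The grid-search-then-refit pattern above can be exercised on its own; a minimal sketch on toy data (the column name, labels, and class weights are illustrative, and cv=2 keeps the tiny sample valid for stratified folds):

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_text

x = pd.DataFrame({"age": [22, 35, 47, 51, 62, 29, 44, 58]})
y = ["N", "N", "Y", "Y", "Y", "N", "Y", "Y"]

grid = GridSearchCV(DecisionTreeClassifier(),
                    param_grid={'max_depth': [2, 3, 4],
                                'min_samples_split': [2, 3, 5, 10]},
                    scoring='accuracy', cv=2)
grid.fit(x, y)
best = DecisionTreeClassifier(**grid.best_params_,
                              class_weight={"Y": 1.0, "N": 5.6}).fit(x, y)
print(export_text(best, feature_names=list(x.columns)))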
def generate_templates(training_data):
    x = []
    y = []

    for graph, cluster_id in training_data:
        for u, edges in enumerate(graph):
            for v, features in edges:
                x.append(features)
                y.append(1 if cluster_id[u] == cluster_id[v] else 0)

    x = np.array(x)
    y = np.array(y)
    print(x)
    print(y)

    estimator = DecisionTreeClassifier(random_state=0, max_depth=3)
    estimator = estimator.fit(x, y)
    print(export_text(estimator))

    n_nodes = estimator.tree_.node_count
    children_left = estimator.tree_.children_left
    children_right = estimator.tree_.children_right
    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    templates = []

    def dfs(node, template_prefix, is_root=False):
        if template_prefix != []:
            templates.append(template_prefix)

        left, right = children_left[node], children_right[node]
        if left != -1:
            question = Question(feature[node], threshold[node], False)
            if is_root:
                dfs(left, [])
            dfs(left, template_prefix.copy() + [question])

        if right != -1:
            question = Question(feature[node], threshold[node], True)
            if is_root:
                dfs(right, [])
            dfs(right, template_prefix.copy() + [question])

    dfs(0, [], is_root=True)
    print("Feature induction templates:")
    for template in templates:
        print(", ".join([str(question) for question in template]))

    # Re-print the tree with readable feature names.
    labels = [f.__name__ for f in basic_features]
    print(export_text(estimator, feature_names=labels))

    return templates
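
generate_templates relies on a Question class defined elsewhere in the project; a minimal hypothetical stand-in, consistent with how Question is constructed and printed above:

class Question:
    """Hypothetical stand-in for the Question class assumed above."""

    def __init__(self, feature, threshold, greater):
        self.feature = feature      # feature index from the fitted tree
        self.threshold = threshold  # split threshold at that node
        self.greater = greater      # False: left branch (<=), True: right branch (>)

    def __str__(self):
        op = ">" if self.greater else "<="
        return "feature_{} {} {:.2f}".format(self.feature, op, self.threshold)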
Example #3
def scikit_cart(features, X_data, Y_data):

    # Fit a CART classifier (default criterion: 'gini').
    clf = tree.DecisionTreeClassifier().fit(X_data, Y_data)

    Y_pred = clf.predict(X_data)

    if len(features) != 1:
        dt_tree = export_text(clf, feature_names=features)
        print('Decision Tree:')
        print(dt_tree)


#     outputFile = open('graph.pdf', 'w')
#     treeDotFormat = tree.export_graphviz(clf, out_file=outputFile, feature_names = features, filled=True)
#     print('Tree DOT Format:', treeDotFormat)
#     outputFile.close()
#     s = Source.from_file('graph.pdf')
#     s.view()

    synz_fml = tree_to_phi(clf, features)

    #     print('***Synthesize Formula:',synz_fml)
    #     print('***Feature List:', features)
    #     print('***Feature List Size', len(features))

    logging.info('Synthesized formula: %s' % (synz_fml))
    logging.info("Accuracy:%f" % (accuracy_score(Y_data, Y_pred)))

    return synz_fml
Example #4
def write_random_forest_to_txt_file(model,
                                    folder_path,
                                    file_name,
                                    feature_names=None):
    '''
        This method goes through the estimators of the random forest model and
        writes the set of rules of each estimator to a text file.
        Args:
            model: the sklearn random forest model
            folder_path: path to the folder to store the text files in (including last slash)
            file_name: name that should be given to each file (without any ending)
            feature_names: a list of names of the features that go into the random forest
    '''

    for idx, estimator in enumerate(model.estimators_):
        s = export_text(estimator, feature_names=feature_names)
        file_path = folder_path + file_name + '_estimator{}.txt'.format(idx)
        with open(file_path, 'w') as f:
            f.write(
                'Decision Tree - Estimator {}:\n--------------------------\n\n'
                .format(idx))
            f.write(s)
            f.write('\n\nFeature Importance:\n--------------------------\n\n')
            for i, importance in enumerate(estimator.feature_importances_):
                name = feature_names[i] if feature_names else 'feature_{}'.format(i)
                f.write('{}: {:.2f} %\n'.format(name, importance * 100))
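
A possible usage sketch for the function above, fitting a small forest on the iris data (the folder and file names are illustrative):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
rf = RandomForestClassifier(n_estimators=3, random_state=0).fit(iris.data, iris.target)
write_random_forest_to_txt_file(rf, './', 'iris_forest',
                                feature_names=list(iris.feature_names))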
Example #5
def dtr_sip(x_train_, y_train_, x_test_, y_test_):
    dtr = tree.DecisionTreeRegressor(random_state=0, max_depth=5)
    dtr = dtr.fit(x_train_, y_train_)
    predict = dtr.predict(x_test_)
    print(list(x_train_.columns))
    print(tree.export_text(dtr))
    print("Test accuracy: %f" % (dtr.score(x_test_, y_test_)))
    pd.DataFrame(predict).to_csv(store_path + '\\dtr_result.csv', index=False, sep=',')
    count = Counter(predict * y_test_ > 0)
    accuracy = count[True] / (count[True] + count[False])
    print(count)
    print('accuracy:', accuracy)
    plt.figure(figsize=(20, 10))
    plt.scatter(x_test_.index, predict - y_test_, s=5, c='red', marker='o',
                label='decision tree:'+str(round(accuracy, 2)))

    plt.axhline(c='black')
    plt.legend()
    plt.title('Prediction Error of Decision Trees Regression')
    plt.xlabel("Date")
    plt.ylabel("Prediction Error")
    plt.savefig(store_path + r".\dt.png", dpi=600, bbox_inches='tight')
    plt.show()

    with open(store_path + '\\dtr.dot', 'w') as f:
        f = tree.export_graphviz(dtr,  out_file=f,
                                 filled=True, class_names=True, proportion=True, rounded=True)
Example #6
def plotTreeText(plotTree, fileName):
	textTree = export_text(plotTree)
	#Printing kinda explodes.
	#print(textTree)
	text_file = open(str(fileName)+".txt", "w")			
	text_file.write(textTree)
	text_file.close()
Example #7
def decision_tree_analysis(X, Y, feature_names):
    '''
    Outputs a decision tree based on input X and output Y.

    Parameters:
    -----------
    X: decision tree input
    Y: decision tree output
    feature_names: names of all neurons in the inputs.
    '''
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=1)
    # Decision Tree
    decisionTree = DecisionTreeClassifier()
    decisionTree = decisionTree.fit(X_train, Y_train)

    Y_pred = decisionTree.predict(X_test)

    # print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
    text_representation = tree.export_text(decisionTree,
                                           feature_names=feature_names)
    text_representation = text_representation.replace('<= 0.50', '== FALSE')
    text_representation = text_representation.replace('>  0.50', '== TRUE')
    print(text_representation)
    return decisionTree
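
The two string replacements assume binary 0/1 features, for which a '<= 0.50' split reads as a FALSE/TRUE test. An illustrative call on synthetic boolean data:

import numpy as np

rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(200, 3))              # binary "neuron" activations
Y = ((X[:, 0] == 1) & (X[:, 1] == 0)).astype(int)  # fires on n0 AND NOT n1
decision_tree_analysis(X, Y, ['n0', 'n1', 'n2'])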
def CARTT(dataset, month, a=12, b=1, c=2):

    dataset = normalize(dataset)
    mre_list = []
    sa_list = []
    for train, test in df_split(dataset, month):
        train_input = train.iloc[:, :-1]
        train_actual_effort = train.iloc[:, -1]
        test_input = test.iloc[:, :-1]
        test_actual_effort = test.iloc[:, -1]
        # max_depth: [1:12], min_samples_leaf: [1:12], min_samples_split: [2:21]

        model = DecisionTreeRegressor(max_depth=a,
                                      min_samples_leaf=b,
                                      min_samples_split=c)
        model.fit(train_input, train_actual_effort)
        test_predict_effort = model.predict(test_input)
        test_predict_Y = test_predict_effort
        test_actual_Y = test_actual_effort.values

        mre_list.append(mre_calc(test_predict_Y,
                                 test_actual_Y))  ######### for MRE
        sa_list.append(
            sa_calc(test_predict_Y, test_actual_Y,
                    train_actual_effort))  ######### for SA

    r = export_text(model, feature_names=list(train_input.columns.values))
    # print(r)
    # Keep the unique feature names that actually appear in the printed rules.
    feature_used = [c for c in train_input.columns if c in r]
    # print(feature_used)
    # tree.plot_tree(model, feature_names=list(train_input.columns.values))
    # plt.show()

    return mre_list, sa_list, feature_used
Example #9
def create_treemap(t):
    string = tree.export_text(t, show_weights=True, spacing=1)
    feature_label_stack = list()
    labels = list()
    values = list()
    while string.find("feature") != -1:
        string, feature_label = extract_feature_label(string)
        feature_label_stack = clean_feature_label_stack(
            feature_label_stack, feature_label)
        if (string.find("weights") <
                string.find("feature")) or (string.find("feature") == -1):
            label, value = extract_feature_properties(string)
            labels.append(
                generate_label(feature_label_stack) + feature_label + label)
            values.append(value)
        else:
            feature_label_stack.insert(0, feature_label)
    sorted_labels = list()
    sizes = list()
    while len(labels) != 0:
        maximum = max(values)
        i = len(labels)
        while maximum in values:
            index = values.index(maximum)
            sorted_labels.append(labels[index][labels[index].find("\n") + 1:])
            sizes.append(pow(i, 1.6))
            values.pop(index)
            labels.pop(index)
    palette = sns.light_palette(color=(210, 90, 60),
                                input="husl",
                                n_colors=len(sorted_labels))
    palette.reverse()
    squarify.plot(sizes=sizes, label=sorted_labels, color=palette)
    plt.axis('off')
    plt.show()
Example #10
def print_decision_tree(original_data, klaster_indeksi, columns):
    print("*** Stablo odlucivanja ***\n")
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(original_data, klaster_indeksi)

    text_tree = tree.export_text(clf, feature_names=list(columns))
    print(text_tree)
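
print_decision_tree is meant to explain cluster assignments; a sketch pairing it with KMeans on the iris data (the clustering setup is illustrative):

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
cluster_ids = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(iris.data)
print_decision_tree(iris.data, cluster_ids, iris.data.columns)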
def tree_regressor(df, components, n_sfd, n_non_sfd, r_seed):
    '''
    Decision Tree fit. Performs random sampling within each class to create a
    balanced training data set.

    Returns the success rate for the training and test data sets, along with
    the tree rules from each fit.
    '''
    df_non_sfd = df[df['seafood_meal'] == 0].sample(n=n_non_sfd,
                                                    random_state=r_seed)
    df_sfd = df[df['seafood_meal'] == 1].sample(n=n_sfd, random_state=r_seed)
    df = pd.concat([df_non_sfd, df_sfd])
    df_x = df[components]
    df_y = df['seafood_meal']
    X_train, X_test, y_train, y_test = train_test_split(df_x,
                                                        df_y,
                                                        test_size=0.2,
                                                        random_state=r_seed)
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)
    y_pred_tree = decision_tree.predict(X_test)
    y_pred_tree_train = decision_tree.predict(X_train)
    score = accuracy_score(y_test, y_pred_tree)
    score_train = accuracy_score(y_train, y_pred_tree_train)
    tree_rules = export_text(decision_tree,
                             feature_names=list(X_train.columns))
    return score, score_train, tree_rules
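
An illustrative call on synthetic data shaped to the function's contract (a 0/1 'seafood_meal' column plus numeric component columns; names and distributions are made up):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df = pd.DataFrame({"price": rng.normal(10, 3, 500),
                   "protein": rng.normal(20, 5, 500)})
df["seafood_meal"] = (df["price"] > 10).astype(int)
score, score_train, rules = tree_regressor(df, ["price", "protein"],
                                           n_sfd=100, n_non_sfd=100, r_seed=0)
print(rules)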
Example #12
def train(x, y, feature_names, outfile):
    """Train the classifier

  Args:
    x: list of values for feature variables
    y: list of target values
    feature_names: Names of feature variables
    outfile: file name to write graphviz export to
  Returns:
    the trained classifier
  """
    clf = tree.DecisionTreeClassifier(random_state=0,
                                      max_depth=2,
                                      criterion='gini',
                                      min_samples_split=3)
    clf = clf.fit(x, y)
    score = clf.score(x, y)
    logging.info(f'Classifier score: {score}\n')
    y_pred = clf.predict(x)
    print("Training results")
    print(classification_report(y, y_pred))
    dot_data = tree.export_graphviz(clf, filled=True, rounded=True)
    graph = graphviz.Source(dot_data)
    class_names = ['Not relevant', 'Relevant']
    r = tree.export_text(clf, feature_names=feature_names)
    print(r)
    tree.plot_tree(clf,
                   feature_names=feature_names,
                   class_names=class_names,
                   filled=True,
                   impurity=False)
    #plt.show()
    plt.savefig(outfile, dpi=160)
    return clf
    def print_decision_tree(self):
        """
        Print decision tree rules
        :return: 
        """
        rules_text = export_text(self.sklearn_classifier, max_depth=100)
        # Vocabulary for replacement in the data which contains
        # feature numbers only
        vocab = self.count_vectorizer.vocabulary_
        vocabulary = dict((feature, word) for word, feature in vocab.items())
        rules = rules_text.split("\n")
        lines = []
        for rule in rules:
            if "feature_" in rule:
                word_id_str = re.sub(".*feature_([0-9]+).*", r"\1", rule)
                word_id = int(word_id_str)
                word = vocabulary.get(word_id, "UNK")
                rule = rule.replace("feature_{}".format(word_id_str), word)
            lines.append(rule)

        with open(os.path.join(self.model_folder_path, "decision_rules.txt"),
                  'w',
                  encoding='utf-8') as out:
            for line in lines:
                out.write(line + '\n')
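
On recent scikit-learn (>= 1.0) the same word mapping can be obtained without regex post-processing, by passing the vectorizer's feature names straight to export_text; a self-contained sketch with a toy corpus and illustrative labels:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, export_text

texts = ["good movie", "bad movie", "good plot", "bad plot"]
labels = [1, 0, 1, 0]
vec = CountVectorizer()
X = vec.fit_transform(texts)
clf = DecisionTreeClassifier(random_state=0).fit(X, labels)
print(export_text(clf, feature_names=list(vec.get_feature_names_out())))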
Example #14
def get_decision_tree_classifier(train_data, name, file):
    print(f'Creating decision tree classifiers on {name}\'s {file} file...')
    train_data = train_data.drop(columns=['date', 'trading_time', 'source', 'text'])
    # Separate the target from the features so the label does not leak into X.
    y = train_data.pop('sentiment').values
    X_train, X_test, y_train, y_test = train_test_split(train_data,
                                                        y,
                                                        test_size=0.33)
    dtc = DecisionTreeClassifier(criterion='entropy',
                                 max_features='sqrt',  # the 'auto' alias is gone in newer sklearn
                                 max_depth=5,
                                 random_state=0)
    print("Decision Tree classifier")
    pred = dtc.fit(X_train, y_train)
    predictions = pred.predict(X_test)
    text_representation = tree.export_text(dtc)
    with open(f'decision_tree_{file}_{name}.log', 'w') as fout:
        fout.write(text_representation)
    feature_names = list(train_data.columns.values)
    fig = plt.figure(figsize=(15, 10))
    plot_tree(dtc,
              feature_names=feature_names,
              class_names=["FALSE", "TRUE"],
              filled=True,
              fontsize=12)
    plt.title(f'{file} Decision Tree for {name}')
    plt.savefig(f'decision_tree_{file}_{name}.png')
    fig = plt.figure(figsize=(15, 10))
    con_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_counts = ['{0: 0.0f}'.format(value) for value in con_mat.flatten()]
    group_percentages = [
        '{0: .2f}'.format(value)
        for value in con_mat.flatten() / np.sum(con_mat)
    ]
    labels = [
        f'{v1}\n{v2}\n{v3}'
        for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
    ]
    labels = np.asarray(labels).reshape(2, 2)
    sns.heatmap(con_mat, annot=labels, fmt='', cmap='Blues')
    plt.title(f'{file} Confusion Matrix for {name}')
    plt.savefig(f'confusion_matrix_{file}_{name}.png')
    fig = plt.figure(figsize=(15, 10))
    class_rpt = pd.DataFrame(
        classification_report(y_test, predictions, digits=2, output_dict=True))
    class_rpt.style.background_gradient(
        cmap='Blues',  # must be a registered colormap name
        subset=pd.IndexSlice['0':'9', :'f1-score']).set_properties(
            **{
                'text-align': 'center',
                'font-size': '30px'
            })
    sns.heatmap(class_rpt.iloc[:-1, :].T, annot=True)
    plt.title(f'{file} Classification Report for {name}')
    plt.savefig(f'classification_report_{file}_{name}.png')
Example #15
def teste_gera_sklearn_tree():
    # ts = pd.read_csv("PLA_dump/cifar10_images_binary.txt", header=None, nrows=5000)
    labels = pd.read_csv("PLA_dump/cifar10_images_binary_labels.txt",
                         header=None,
                         nrows=3000)

    tt = []
    content = np.genfromtxt("PLA_dump/cifar10_images_binary.txt",
                            max_rows=3000,
                            dtype=str)
    for idx in range(0, content.shape[0]):
        # Split each binary string into individual 0/1 integer features.
        tt.append([int(c) for c in content[idx]])
    ts = pd.DataFrame(tt)

    # Definition of the classifier
    clf = DecisionTreeClassifier(
        random_state=9856230,
        criterion='gini',
        max_depth=18,
    )

    clf.fit(ts, labels.values.ravel())

    with open("tree_test2.tree", "w") as arquivo:
        arquivo.write(tree.export_text(clf, max_depth=1000))
def tree_txt_result(output_classification_result):
    text_representation = tree.export_text(output_classification_result)

    # with open("decistion_tree.log", "w") as fout:
    # fout.write(text_representation)

    return text_representation
Example #17
def modeling(df, model: Model):
    """
    Dividimos los datos en un 70% para entrenamiento y 30% para testeo,
    aparte separamos las variables de las etiquetas:
        - En X guardamos las variables
        - En Y las etiquetas
    """
    x_train, x_test, y_train, y_test = train_test_split(df.drop(['relevant'], axis=1), df['relevant'], test_size=0.3,
                                                        random_state=1)

    # Initialize the model
    md = model()

    # Train it
    md.fit(x_train, y_train)

    # Inspect the tree that was built, in the Decision Tree case
    if model is Model.decision_tree:
        text_representation = tree.export_text(md)
        with open("./decistion_tree.log", "w") as fout:
            fout.write(text_representation)

        dot_data = tree.export_graphviz(md, out_file=None, feature_names=x_test.columns,
                                        class_names=['Predicted relevant', 'Predicted no relevant'], filled=True,
                                        rounded=True, special_characters=True)

        graph = graphviz.Source(dot_data, filename='./tree', format='png')
        # graph.render()

    # Make predictions
    y_pred = md.predict(x_test)

    # Compare results
    compare_results(y_test, y_pred)
Example #18
def _de_novo_pred(X, y, feature_names, classifier='decision_tree'):
    if classifier == 'random_forest':
        log.info('Using extremely random forest')
        clf = ExtraTreesClassifier(n_estimators=250,
                                   bootstrap=True,
                                   oob_score=True)
        clf.fit(X, y)
        log.debug('Feature importance:')
        log.debug(
            format_feature_importances(feature_names,
                                       clf.feature_importances_))
        pred = clf.oob_decision_function_[:, 1]
        # in the unlikely event dt1_pred contains NaNs
        # (can happen when n_estimators is not big enough)
        pred[np.isnan(pred)] = 0
        pred = pred >= 0.5

    else:
        clf = DecisionTreeClassifier(
            max_depth=5,
            min_samples_split=100,
            min_impurity_decrease=0.005,
        )
        clf.fit(X, y)
        log.debug('Tree structure:')
        log.debug(export_text(clf, feature_names=feature_names))
        pred = clf.predict(X)
    return pred.astype(int)
Example #19
    def show_tree_structure_in_terminal(self, clf):
        """
        Prints the tree structure in the console.
        :param clf: instance of classifier returned by the _predict method
        :return: None; prints the tree structure
        """
        r = export_text(clf, feature_names=self.feature_names)
        print(r)
Example #20
def export_decision_tree(decision_tree, feature_names):
    export_decision_tree_to_file(export_text(decision_tree, feature_names=feature_names))
    export_graphviz(decision_tree,
                    out_file='decision_tree.dot',
                    feature_names=feature_names,
                    rounded=True,
                    precision=1)
    convert_dot_to_svg()
Example #21
    def write_DT(DT, outfile, labels, generate_sorted_file=True):
        """
        Graphs the DT, writes the DT as text, and generates 'DT_sorted.txt' file
        that ranks features in descending order.

        Parameters
        ----------
        DT : sklearn.tree.DecisionTreeClassifier
            Decision tree of interest.
        outfile : str
            path to files to be generated.
        labels : list
            feature names used when exporting the tree.
        generate_sorted_file : bool, optional
            Whether to generate a file ranking the tree's features.
            The default is True.
        """

        # Make graph
        dot_data = tree.export_graphviz(DT,
                                        out_file=None,
                                        feature_names=labels,
                                        class_names=["benign", "malicous"],
                                        filled=True,
                                        rounded=True,
                                        special_characters=True,
                                        precision=6)
        graph = graphviz.Source(dot_data)
        graph.render("{}.gv".format(outfile), view=False)

        # Write DT as text
        with open("{}.txt".format(outfile), "w") as text_file:
            text_file.write(
                tree.export_text(
                    DT,
                    feature_names=list(labels),
                    show_weights=True,
                ))

        features = []

        if generate_sorted_file:
            # write DT_sorted.txt
            with open("{}_sorted.txt".format(outfile), "w") as DT_sorted:
                # Read text tree for best k
                with open("{}.txt".format(outfile), "r") as text_tree:
                    for line in text_tree:
                        if "class: " in line:
                            continue
                        else:
                            # string containing attribute test
                            attribute_test = line.strip().split("|--- ")[-1]
                            # extract feature from attribute test
                            feature = attribute_test.split("<")[0].split(
                                ">")[0].strip()

                            if feature not in features:
                                features.append(feature)
                                DT_sorted.write(feature + "\n")
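
A possible usage sketch for write_DT on toy data, assuming it is reachable as a plain function and that the Graphviz binaries are installed (the render step needs them); the output path is illustrative:

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

data = load_breast_cancer()
dt = DecisionTreeClassifier(max_depth=3, random_state=0).fit(data.data, data.target)
write_DT(dt, "dt_output", list(data.feature_names))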
    def print_tree(self, feature_names: list = None):
        """
        Print textual representation of the decision tree.

        Args:
            feature_names (list, optional): List of feature names to use. Defaults to None.
        """

        print(export_text(self.clf, feature_names=feature_names))
Example #23
    def as_text(self):
        """
        Return the tree in a string format.
        """

        if not self.is_fitted:
            self.fit()
        text = export_text(self.surrogate_explainer.estimator_, feature_names=list(self.feature_names))
        return text
Example #24
    def fit_predict_model(self):
        if "Random" not in self.model_name:
            self._fit_optimised_model()
        self.model.fit(self.X_train, self.y_train)
        self.y_preds = self.model.predict(self.X_test)
        if "Random" not in self.model_name:
            self.model_struct = export_text(self.model,
                                            feature_names=self.feature_names)
        else:
            self.model_struct = "Random Forest Model\n"
    def runC(cls, clf):
        # Graph
        tree.plot_tree(clf)
        plt.show()
        # Text
        dataFrame = pandas.read_csv('iris.csv')
        features = list(dataFrame.columns[:4])
        r = export_text(clf, feature_names=features)
        print(r)
Example #26
def print_tree(estimator, max_depth=6, **kwargs):
    """
    Print the first Decision Tree from a Random Forest.
    :param estimator: Sklearn ensemble estimator.
    """
    s = export_text(estimator.estimators_[0],
                    max_depth=max_depth,
                    feature_names=signal_names_,
                    **kwargs)
    print(s)
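
An illustrative call, assuming everything lives in one module so that print_tree can see the signal_names_ global (the stand-in definition below is not from the source):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
signal_names_ = list(iris.feature_names)  # stand-in for the module-level global
rf = RandomForestClassifier(n_estimators=5, random_state=0).fit(iris.data, iris.target)
print_tree(rf, max_depth=3)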
Example #27
def _dt_rules(clf, df_mat):
    """
    Function to transform the printed structure of a DT into the set of rules
    derived from the paths to the terminal nodes.
    It also includes the length of each of those rules,
    as well as the prediction associated with it (value of that terminal node).

    Parameters
    ----------
    clf : sklearn.tree.DecisionTreeClassifier
        Fitted decision tree whose rules are extracted.
    df_mat : pandas.DataFrame
        Samples (one row per observation) used to trace the decision paths.

    Returns
    -------
    df_rules : pandas.DataFrame
        One row per unique rule, with its prediction and rule length.

    """
    r = export_text(clf, feature_names=list(df_mat.columns))

    list_splits = r.split("|---")
    list_splits = [x.replace("|", "") for x in list_splits]
    list_splits = [x.replace("class: -1", "") for x in list_splits]
    list_splits = [x.replace("class: 1", "") for x in list_splits]
    list_splits = [x.strip() for x in list_splits]
    df_splits = pd.DataFrame({"levels": list_splits})
    df_splits = (df_splits[df_splits["levels"] != ""].reset_index(
        drop=True).reset_index())
    df_splits["index"] += 1

    df_rules = pd.DataFrame()

    for _, point in df_mat.iterrows():
        node_indices = clf.decision_path(point.values.reshape(1, -1))
        node_indices = pd.DataFrame(node_indices.toarray().T).reset_index()
        node_indices = node_indices.merge(df_splits)
        node_indices = node_indices[node_indices[0] == 1]
        rule = " & ".join(node_indices["levels"])
        dct_aux = {
            "rule": rule,
            "prediction": clf.predict(point.values.reshape(1, -1)),
            "len_rule": len(node_indices),
        }

        # pandas removed DataFrame.append in 2.0, so build the frame with concat.
        df_rules = pd.concat([df_rules, pd.DataFrame(dct_aux, index=[0])])
    df_rules = df_rules.drop_duplicates()

    return df_rules
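
An illustrative call with a toy two-feature set whose -1/1 labels match the "class: -1" / "class: 1" strings the parser strips:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

df_mat = pd.DataFrame({"x1": [0, 1, 2, 3, 4, 5], "x2": [5, 4, 3, 2, 1, 0]})
y = [-1, -1, -1, 1, 1, 1]
clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(df_mat, y)
print(_dt_rules(clf, df_mat))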
Example #28
def decisionTree(df, playlist_name, random_state, training_size):  # Rohan
    print("Decision Tree")
    X = pd.DataFrame(df,
                     columns=['danceability', 'energy', 'tempo', 'valence'])
    Y = pd.DataFrame(df, columns=['genre'])
    decisionTree = tree.DecisionTreeClassifier(random_state=random_state)
    songs = Y.size
    # Number of songs used for training, per the requested proportion.
    trainingSongs = round(songs * training_size)

    decisionTree = decisionTree.fit(X[0:trainingSongs], Y[0:trainingSongs])
    prediction = decisionTree.predict(X[trainingSongs:])
    prediction_df = pd.DataFrame(prediction)
    # Align the predictions with the held-out rows they belong to.
    df.loc[df.index[trainingSongs:], 'Predicted Genre'] = prediction

    print(pd.DataFrame(df, columns=['name', 'genre', 'Predicted Genre']))

    # https://mljar.com/blog/visualize-decision-tree/

    text_representation = tree.export_text(decisionTree)
    print(text_representation)

    accuracy = accuracy_score(Y[trainingSongs:], prediction_df, normalize=True)
    accuracyText = 'Accuracy: ' + str(accuracy)
    print(accuracyText)
    f1 = f1_score(Y[trainingSongs:], prediction_df, average='weighted')
    f1Text = 'F1: ' + str(f1)
    print(f1Text)
    recall = recall_score(Y[trainingSongs:], prediction_df, average='weighted')
    recallText = 'Recall: ' + str(recall)
    print(recallText)
    precision = precision_score(Y[trainingSongs:],
                                prediction_df,
                                average='weighted')
    precisionText = 'Precision: ' + str(precision)
    print(precisionText)

    class_names = decisionTree.classes_
    feature_names = X.columns

    fig = plt.figure(figsize=(50, 40))
    title = str(playlist_name) + "_decision_tree_" + str(random_state)
    titleLine2 = accuracyText + ", " + precisionText + ", " + f1Text
    titleLine3 = "Training Data Proportion: " + str(training_size)
    # Recall is not pertinent to the scope of this problem
    fig.suptitle(title + '\n' + titleLine2 + '\n' + titleLine3, fontsize=72)
    _ = tree.plot_tree(decisionTree,
                       feature_names=feature_names,
                       class_names=class_names,
                       filled=True)
    fig.savefig(
        str(playlist_name) + "_decision_tree_" + str(random_state) + "_" +
        str(training_size) + ".png")

    return df
Example #29
    def visualize(self):
        print("##MSG: visualizing ...")
        classes = None

        # image
        dot_data = StringIO()
        #tree.export_graphviz(self.model, out_file=dot_data,feature_names=self.xNames)
        dot = tree.export_graphviz(self.model,
                                   feature_names=self.xNames,
                                   class_names=classes)

        # Force-convert the dot format
        # TODO: handle the field-name mapping later, or implement this as a tree method
        newdot = []
        pre_pattern = r'^[0-9]* \[label="'
        suf_pattern = r'"] ;$'
        for line in dot.split('\n'):
            if re.match(pre_pattern, line):
                lbl = re.sub(suf_pattern, '', re.sub(pre_pattern, '', line))
                lbldata = lbl.split("\\n")
                if len(lbldata) == 5:
                    lblval = lbldata[0].split(' ')
                    lblval0 = lblval[0].split("_")
                    if len(lblval0) > 1:
                        if lblval[-2] == "<=":
                            nn = "_".join(lblval0[0:-1])
                            newlabel = 'label="%s == %s\\\\n%s\\\\n%s\\\\n%s\\\\n%s"] ;' % (
                                nn, lblval0[-1], lbldata[1], lbldata[2],
                                lbldata[3], lbldata[4])
                            newdot.append(
                                re.sub(r'label=".*"] ;', newlabel, line))

                        else:
                            nn = "_".join(lblval0[0:-1])
                            newlabel = 'label="%s != %s\\\\n%s\\\\n%s\\\\n%s\\\\n%s"] ;' % (
                                nn, lblval0[-1], lbldata[1], lbldata[2],
                                lbldata[3], lbldata[4])
                            newdot.append(
                                re.sub(r'label=".*"] ;', newlabel, line))
                    else:
                        newdot.append(line)

                else:
                    newdot.append(line)

            else:
                newdot.append(line)

        newdotstr = '\n'.join(newdot)
        self.tree_chart = pydotplus.graph_from_dot_data(newdotstr)

        # text
        self.tree_text = tree.export_text(self.model,
                                          feature_names=self.xNames,
                                          show_weights=True)
def decision_tree(dataset,
                  full_train: bool = False,
                  print_tree=False,
                  plot=False):

    predictors = dataset.drop(columns='y')
    target = dataset['y']

    # Split the dataset into train and test sets,
    # and separate the predictors (x) from the targets (y)
    x_train, x_test, y_train, y_test = None, None, None, None
    if full_train:
        x_train = predictors
        y_train = target
        x_test = predictors
        y_test = target
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            predictors,
            target,
            random_state=123,
        )

    # Define the decision tree parameters
    modelo = DecisionTreeClassifier(random_state=123)

    # Train the decision tree
    modelo.fit(x_train, y_train)

    # Print the corresponding decision tree to the console
    if print_tree:
        text_representation = export_text(modelo)
        print(text_representation)

    # Plot the decision tree; this blocks the script until the window is closed
    if plot:
        fig = plt.figure()
        #_ = plot_tree(modelo, feature_names=dataset.feature_names, class_names=dataset.target_names, filled=True)
        _ = plot_tree(modelo, filled=True)
        plt.show()

    # Use the trained model to predict
    predicciones = modelo.predict(X=x_test)

    # Put the expected values into a workable format
    esperados = y_test.tolist()

    matches = 0
    for i, r in enumerate(predicciones):
        if r == esperados[i]:
            matches += 1

    print(str(matches) + ' / ' + str(len(predicciones)))
Example #31
def test_export_text():
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- class: 1
    """).lstrip()

    assert export_text(clf) == expected_report
    # testing that leaves at level 1 are not truncated
    assert export_text(clf, max_depth=0) == expected_report
    # testing that a larger max_depth leaves the tree untouched
    assert export_text(clf, max_depth=10) == expected_report

    expected_report = dedent("""
    |--- b <= 0.00
    |   |--- class: -1
    |--- b >  0.00
    |   |--- class: 1
    """).lstrip()
    assert export_text(clf, feature_names=['a', 'b']) == expected_report

    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- weights: [3.00, 0.00] class: -1
    |--- feature_1 >  0.00
    |   |--- weights: [0.00, 3.00] class: 1
    """).lstrip()
    assert export_text(clf, show_weights=True) == expected_report

    expected_report = dedent("""
    |- feature_1 <= 0.00
    | |- class: -1
    |- feature_1 >  0.00
    | |- class: 1
    """).lstrip()
    assert export_text(clf, spacing=1) == expected_report

    X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]]
    y_l = [-1, -1, -1, 1, 1, 1, 2]
    clf = DecisionTreeClassifier(max_depth=4, random_state=0)
    clf.fit(X_l, y_l)
    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- truncated branch of depth 2
    """).lstrip()
    assert export_text(clf, max_depth=0) == expected_report

    X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]

    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_mo, y_mo)

    expected_report = dedent("""
    |--- feature_1 <= 0.0
    |   |--- value: [-1.0, -1.0]
    |--- feature_1 >  0.0
    |   |--- value: [1.0, 1.0]
    """).lstrip()
    assert export_text(reg, decimals=1) == expected_report
    assert export_text(reg, decimals=1, show_weights=True) == expected_report