def Binning(self, col, x, y):
    """Fit a small decision tree on ``x``/``y`` and return its rules as text.

    A grid search over ``max_depth`` and ``min_samples_split`` (scored by
    accuracy) picks the tree size; a final tree is then fitted with the
    selected values and rendered with ``export_text``.

    Args:
        col: Not used by the implementation; kept so existing callers keep
            working.
        x: Feature table; must expose ``.columns`` (it is used for the
            feature names in the exported rules).
        y: Target labels.

    Returns:
        Tuple ``(tree_rules, max_depth, min_samples_split)`` where
        ``tree_rules`` is the text produced by ``export_text``.
    """
    params = {'max_depth': [2, 3, 4], 'min_samples_split': [2, 3, 5, 10]}
    search = GridSearchCV(DecisionTreeClassifier(), param_grid=params,
                          scoring='accuracy')
    search.fit(x, y)
    max_depth = search.best_params_['max_depth']
    min_samples_split = search.best_params_['min_samples_split']

    # Targets labelled "Y"/"N" get the "N" class up-weighted; for any other
    # label set class_weight=None keeps the estimator's default behaviour.
    class_weight = {"Y": 1.0, "N": 5.6} if "Y" in set(y) else None

    tdcs = DecisionTreeClassifier(max_depth=max_depth,
                                  min_samples_split=min_samples_split,
                                  class_weight=class_weight)
    tdcs.fit(x, y)
    tree_rules = export_text(tdcs, feature_names=list(x.columns))
    return tree_rules, max_depth, min_samples_split
def generate_templates(training_data):
    """Learn feature-induction templates from a depth-3 decision tree.

    Every edge of every training graph becomes one sample: its feature
    vector is the input and the label is 1 when both endpoints share a
    cluster id, else 0. Each root-to-node path of the fitted tree (and, for
    the root's children, each path that skips the root question) is emitted
    as a template, i.e. a list of ``Question`` objects.

    Args:
        training_data: Iterable of ``(graph, cluster_id)`` pairs. ``graph``
            is indexed by source node and yields ``(target, features)``
            pairs; ``cluster_id`` is indexable by node.

    Returns:
        List of templates (lists of ``Question``) in depth-first order.
    """
    x = []
    y = []
    for graph, cluster_id in training_data:
        for u, edges in enumerate(graph):
            for v, features in edges:
                x.append(features)
                y.append(1 if cluster_id[u] == cluster_id[v] else 0)
    x = np.array(x)
    y = np.array(y)
    print(x)
    print(y)

    estimator = DecisionTreeClassifier(random_state=0, max_depth=3)
    estimator = estimator.fit(x, y)
    print(export_text(estimator))

    children_left = estimator.tree_.children_left
    children_right = estimator.tree_.children_right
    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    templates = []

    def dfs(node, template_prefix, is_root=False):
        # Every non-empty prefix reached during the walk is a template.
        if template_prefix:
            templates.append(template_prefix)
        left, right = children_left[node], children_right[node]
        if left != -1:
            question = Question(feature[node], threshold[node], False)
            if is_root:
                # Also explore the subtree without the root question.
                dfs(left, [])
            dfs(left, template_prefix + [question])
        if right != -1:
            question = Question(feature[node], threshold[node], True)
            if is_root:
                dfs(right, [])
            dfs(right, template_prefix + [question])

    dfs(0, [], is_root=True)

    print("Feature induction templates:")
    for template in templates:
        print(", ".join([str(question) for question in template]))

    labels = [f.__name__ for f in basic_features]
    # feature_names is keyword-only in current scikit-learn releases.
    print(export_text(estimator, feature_names=labels))
    return templates
def scikit_cart(features, X_data, Y_data):
    """Fit a CART classifier and convert it to a formula via ``tree_to_phi``.

    The tree is fitted once on the full data set; its training accuracy and
    the synthesized formula are logged at INFO level. When more than one
    feature is given the textual tree is also printed.

    Args:
        features: Feature names, one per column of ``X_data``.
        X_data: Training samples.
        Y_data: Training labels.

    Returns:
        Whatever ``tree_to_phi(clf, features)`` returns for the fitted tree.
    """
    # A single fit is enough; an extra throw-away fit only costs time.
    clf = tree.DecisionTreeClassifier().fit(X_data, Y_data)
    Y_pred = clf.predict(X_data)

    if len(features) != 1:
        dt_tree = export_text(clf, feature_names=features)
        print('Decision Tree:')
        print(dt_tree)

    synz_fml = tree_to_phi(clf, features)
    # Lazy %-args: no formatting cost when INFO is disabled, and a tuple
    # valued formula cannot be mistaken for multiple format arguments.
    logging.info('Synthesized formula: %s', synz_fml)
    logging.info("Accuracy:%f", accuracy_score(Y_data, Y_pred))
    return synz_fml
def write_random_forest_to_txt_file(model, folder_path, file_name, feature_names=None):
    '''
    Write the rules and feature importances of every estimator of a random
    forest to its own text file.

    One file ``<folder_path><file_name>_estimator<idx>.txt`` is written per
    estimator in ``model.estimators_``.

    Args:
        model: the fitted sklearn random forest model
        folder_path: path to the folder to store the text files in
            (including last slash)
        file_name: name that should be given to the files (without any ending)
        feature_names: a list of names of the features that go into the
            random forest; when None, ``feature_<i>`` is used, matching the
            default naming of ``export_text``
    '''
    for idx, estimator in enumerate(model.estimators_):
        rules = export_text(estimator, feature_names=feature_names)
        importances = estimator.feature_importances_

        # The importance listing needs a name per feature even when the
        # caller did not supply any.
        if feature_names is None:
            names = ['feature_{}'.format(i) for i in range(len(importances))]
        else:
            names = feature_names

        file_path = folder_path + file_name + '_estimator{}.txt'.format(idx)
        with open(file_path, 'w') as f:
            f.write(
                'Decision Tree - Estimator {}:\n--------------------------\n\n'
                .format(idx))
            f.write(rules)
            f.write('\n\nFeature Importance:\n--------------------------\n\n')
            for i, importance in enumerate(importances):
                f.write('{}: {:.2f} %\n'.format(names[i], importance * 100))
def dtr_sip(x_train_, y_train_, x_test_, y_test_):
    """Fit a depth-5 regression tree, report it, and save its artefacts.

    Prints the training columns, the textual tree and the test score, writes
    the predictions to ``dtr_result.csv``, computes the share of predictions
    whose sign agrees with the target, saves and shows a scatter plot of the
    prediction error, and exports the tree to ``dtr.dot``. All files are
    placed under the module-level ``store_path``.

    Args:
        x_train_: Training features; must expose ``.columns``.
        y_train_: Training targets.
        x_test_: Test features; ``.index`` is used as the plot's x axis.
        y_test_: Test targets.
    """
    regressor = tree.DecisionTreeRegressor(random_state=0, max_depth=5)
    regressor = regressor.fit(x_train_, y_train_)
    predicted = regressor.predict(x_test_)

    print([x_train_.columns])
    print(tree.export_text(regressor))
    print("测试精度:%f" % (regressor.score(x_test_, y_test_)))
    pd.DataFrame(predicted).to_csv(store_path + '\\dtr_result.csv',
                                   index=False, sep=',')

    # A prediction counts as a hit when it has the same sign as the target.
    sign_hits = Counter(predicted * y_test_ > 0)
    hits, misses = sign_hits[True], sign_hits[False]
    accuracy = hits / (hits + misses)
    print(sign_hits)
    print('accuracy:', accuracy)

    plt.figure(figsize=(20, 10))
    plt.scatter(x_test_.index, predicted - y_test_, s=5, c='red', marker='o',
                label=f'decision tree:{round(accuracy, 2)}')
    plt.axhline(c='black')
    plt.legend()
    plt.title('Prediction Error of Decision Trees Regression')
    plt.xlabel("Date")
    plt.ylabel("Prediction Error")
    plt.savefig(store_path + r".\dt.png", dpi=600, bbox_inches='tight')
    plt.show()

    with open(store_path + '\\dtr.dot', 'w') as dot_file:
        tree.export_graphviz(dtr := regressor, out_file=dot_file, filled=True,
                             class_names=True, proportion=True, rounded=True)
def plotTreeText(plotTree, fileName):
    """Write the textual form of a fitted tree to ``<fileName>.txt``.

    Args:
        plotTree: Fitted decision tree accepted by ``export_text``.
        fileName: Output path without extension; converted with ``str()``.
    """
    textTree = export_text(plotTree)
    # Printing the tree is deliberately skipped: the output can be huge.
    # The context manager closes the file even if the write fails.
    with open(str(fileName) + ".txt", "w") as text_file:
        text_file.write(textTree)
def decision_tree_analysis(X, Y, feature_names):
    '''
    Fit a decision tree on a 70 % training split of X/Y and print it.

    In the printed tree, tests against the threshold 0.50 are rewritten as
    boolean comparisons (``== FALSE`` for the "<=" branch, ``== TRUE`` for
    the ">" branch).

    Parameters:
    -----------
    X: decision tree input
    Y: decision tree output
    feature_names: names of all neurons in the inputs.

    Returns:
    --------
    The fitted DecisionTreeClassifier.
    '''
    # The 30 % hold-out is not evaluated here; the split is kept so the tree
    # is trained on the same rows as before.
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.3,
                                              random_state=1)

    decisionTree = DecisionTreeClassifier().fit(X_train, Y_train)

    text_representation = tree.export_text(decisionTree,
                                           feature_names=feature_names)
    text_representation = text_representation.replace('<= 0.50', '== FALSE')
    # export_text renders the right branch as ">" followed by TWO spaces;
    # the single-space form is still handled for safety.
    text_representation = text_representation.replace('>  0.50', '== TRUE')
    text_representation = text_representation.replace('> 0.50', '== TRUE')
    print(text_representation)
    return decisionTree
def CARTT(dataset, month, a=12, b=1, c=2):
    """Evaluate a CART regressor over the folds produced by ``df_split``.

    The data set is normalized, then for each ``(train, test)`` fold a
    ``DecisionTreeRegressor`` is fitted on all but the last column (inputs)
    against the last column (target) and scored with ``mre_calc`` and
    ``sa_calc``.

    Args:
        dataset: Table whose last column is the target.
        month: Passed through to ``df_split``.
        a: ``max_depth`` of the tree (tuning range noted upstream: 1..12).
        b: ``min_samples_leaf`` (1..12).
        c: ``min_samples_split`` (2..21).

    Returns:
        Tuple ``(mre_list, sa_list, feature_used)``. The two lists hold one
        score per fold; ``feature_used`` comes from the last fold and is an
        empty list when there were no folds.
    """
    dataset = normalize(dataset)
    mre_list = []
    sa_list = []
    # Defined up front so an empty split does not leave the name unbound.
    feature_used = []

    for train, test in df_split(dataset, month):
        train_input = train.iloc[:, :-1]
        train_actual_effort = train.iloc[:, -1]
        test_input = test.iloc[:, :-1]
        test_actual_effort = test.iloc[:, -1]

        model = DecisionTreeRegressor(max_depth=a, min_samples_leaf=b,
                                      min_samples_split=c)
        model.fit(train_input, train_actual_effort)

        test_predict_Y = model.predict(test_input)
        test_actual_Y = test_actual_effort.values

        mre_list.append(mre_calc(test_predict_Y, test_actual_Y))
        sa_list.append(
            sa_calc(test_predict_Y, test_actual_Y, train_actual_effort))

        # NOTE(review): the [1] index presumably expects an export_text
        # variant returning (text, used_features). scikit-learn's stock
        # export_text returns a str, where [1] is a single character —
        # confirm which export_text is in scope.
        r = export_text(model,
                        feature_names=list(train_input.columns.values))[1]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        feature_used = list(dict.fromkeys(r))

    return mre_list, sa_list, feature_used
def create_treemap(t):
    """Draw a squarify treemap of the leaves of a fitted decision tree.

    The tree is exported as text (with weights, spacing 1) and consumed
    piece by piece with the module helpers ``extract_feature_label``,
    ``clean_feature_label_stack``, ``extract_feature_properties`` and
    ``generate_label``. Leaves are then ordered by descending value and
    plotted, with tile sizes derived from the rank.

    Args:
        t: Fitted decision tree accepted by ``tree.export_text``.
    """
    text = tree.export_text(t, show_weights=True, spacing=1)
    feature_label_stack = []
    labels = []
    values = []

    while "feature" in text:
        text, feature_label = extract_feature_label(text)
        feature_label_stack = clean_feature_label_stack(
            feature_label_stack, feature_label)

        next_feature = text.find("feature")
        next_weights = text.find("weights")
        # A leaf follows when "weights" comes before the next "feature",
        # or when no further "feature" remains at all.
        if next_weights < next_feature or next_feature == -1:
            label, value = extract_feature_properties(text)
            labels.append(
                generate_label(feature_label_stack) + feature_label + label)
            values.append(value)
        else:
            feature_label_stack.insert(0, feature_label)

    sorted_labels = []
    sizes = []
    while labels:
        largest = max(values)
        # All leaves tied at the current maximum share one size, based on
        # how many leaves were still pending when the tie group started.
        remaining = len(labels)
        while largest in values:
            position = values.index(largest)
            leaf_label = labels[position]
            sorted_labels.append(leaf_label[leaf_label.find("\n") + 1:])
            sizes.append(remaining ** 1.6)
            values.pop(position)
            labels.pop(position)

    palette = sns.light_palette(color=(210, 90, 60), input="husl",
                                n_colors=len(sorted_labels))
    palette.reverse()
    squarify.plot(sizes=sizes, label=sorted_labels, color=palette)
    plt.axis('off')
    plt.show()
def print_decision_tree(original_data, klaster_indeksi, columns):
    """Fit a decision tree on the data and print its rules.

    Args:
        original_data: Training samples.
        klaster_indeksi: Target label per sample (the parameter name
            suggests cluster indices — confirm with the caller).
        columns: Feature names, one per column of ``original_data``.
    """
    print("*** Stablo odlucivanja ***\n")
    fitted = tree.DecisionTreeClassifier().fit(original_data, klaster_indeksi)
    print(tree.export_text(fitted, feature_names=list(columns)))
def tree_regressor(df, components, n_sfd, n_non_sfd, r_seed):
    '''
    Fit a decision tree classifier on a class-balanced sample.

    Rows are sampled separately for ``seafood_meal == 0`` and
    ``seafood_meal == 1``, concatenated, split 80/20 into train and test,
    and a DecisionTreeClassifier is fitted on the training part.

    Args:
        df: table containing the ``components`` columns and ``seafood_meal``
        components: names of the feature columns
        n_sfd: number of rows to sample with ``seafood_meal == 1``
        n_non_sfd: number of rows to sample with ``seafood_meal == 0``
        r_seed: random state used for both the sampling and the split

    Returns:
        (test accuracy, train accuracy, tree rules as text)
    '''
    label = 'seafood_meal'
    negatives = df[df[label] == 0].sample(n=n_non_sfd, random_state=r_seed)
    positives = df[df[label] == 1].sample(n=n_sfd, random_state=r_seed)
    balanced = pd.concat([negatives, positives])

    X_train, X_test, y_train, y_test = train_test_split(
        balanced[components], balanced[label],
        test_size=0.2, random_state=r_seed)

    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)

    test_predictions = decision_tree.predict(X_test)
    train_predictions = decision_tree.predict(X_train)
    score = accuracy_score(y_test, test_predictions)
    score_train = accuracy_score(y_train, train_predictions)

    tree_rules = export_text(decision_tree,
                             feature_names=list(X_train.columns))
    return score, score_train, tree_rules
def train(x, y, feature_names, outfile):
    """Train the classifier and report on its training performance.

    The training score is logged, a classification report and the textual
    tree are printed, and a matplotlib rendering of the tree is saved.

    Args:
        x: list of values for feature variables
        y: list of target values
        feature_names: Names of feature variables
        outfile: file name the matplotlib tree plot is saved to

    Returns:
        the trained classifier
    """
    clf = tree.DecisionTreeClassifier(random_state=0,
                                      max_depth=2,
                                      criterion='gini',
                                      min_samples_split=3)
    clf = clf.fit(x, y)

    score = clf.score(x, y)
    logging.info('Classifier score: %s\n', score)

    y_pred = clf.predict(x)
    print("Training results")
    print(classification_report(y, y_pred))

    class_names = ['Not relevant', 'Relevant']
    print(tree.export_text(clf, feature_names=feature_names))

    tree.plot_tree(clf,
                   feature_names=feature_names,
                   class_names=class_names,
                   filled=True,
                   impurity=False)
    plt.savefig(outfile, dpi=160)
    return clf
def print_decision_tree(self):
    """
    Write the decision tree rules to ``decision_rules.txt``.

    The exported rules name features as ``feature_<id>``; each id is
    replaced by the matching word of the count vectorizer's vocabulary
    ("UNK" when the id is not in the vocabulary). The file is written to
    ``self.model_folder_path``.
    :return:
    """
    rules_text = export_text(self.sklearn_classifier, max_depth=100)

    # vocabulary_ maps word -> feature id; the reverse lookup is needed.
    id_to_word = {
        feature: word
        for word, feature in self.count_vectorizer.vocabulary_.items()
    }

    def _with_word(rule):
        if "feature_" not in rule:
            return rule
        word_id_str = re.sub(".*feature_([0-9]+).*", r"\1", rule)
        word = id_to_word.get(int(word_id_str), "UNK")
        return rule.replace("feature_{}".format(word_id_str), word)

    lines = [_with_word(rule) for rule in rules_text.split("\n")]

    target = os.path.join(self.model_folder_path, "decision_rules.txt")
    with open(target, 'w', encoding='utf-8') as out:
        out.writelines(line + '\n' for line in lines)
def get_decision_tree_classifier(train_data, name, file):
    """Train a sentiment decision tree and save its reports as files.

    Drops the ``date``, ``trading_time``, ``source`` and ``text`` columns,
    uses ``sentiment`` as the target and every remaining column as a
    feature, fits an entropy tree of depth 5 on a 67/33 split and writes:

    * ``decision_tree_<file>_<name>.log`` – textual tree
    * ``decision_tree_<file>_<name>.png`` – plotted tree
    * ``confusion_matrix_<file>_<name>.png`` – annotated confusion matrix
    * ``classification_report_<file>_<name>.png`` – report heatmap

    Args:
        train_data: DataFrame holding the columns named above.
        name: Label used in messages, titles and file names.
        file: Label used in messages, titles and file names.
    """
    print(f"Creating decision tree classifiers on {name}'s {file} file...")

    features = train_data.drop(
        ['date', 'trading_time', 'source', 'text'], axis=1)
    # The target must not stay among the features, otherwise the tree can
    # simply read the answer from its input.
    y = features.pop('sentiment').values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        y,
                                                        test_size=0.33)

    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no
    # longer accepted by current scikit-learn releases.
    dtc = DecisionTreeClassifier(criterion='entropy',
                                 max_features='sqrt',
                                 max_depth=5,
                                 random_state=0)
    print("Decision Tree classifier")
    dtc.fit(X_train, y_train)
    predictions = dtc.predict(X_test)

    text_representation = tree.export_text(dtc)
    with open(f'decision_tree_{file}_{name}.log', 'w') as fout:
        fout.write(text_representation)

    feature_names = list(features.columns.values)
    plt.figure(figsize=(15, 10))
    plot_tree(dtc,
              feature_names=feature_names,
              class_names=["FALSE", "TRUE"],
              filled=True,
              fontsize=12)
    plt.title(f'{file} Decision Tree for {name}')
    plt.savefig(f'decision_tree_{file}_{name}.png')

    plt.figure(figsize=(15, 10))
    con_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
    # The 2x2 annotation layout below assumes a binary target.
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_counts = ['{0: 0.0f}'.format(value) for value in con_mat.flatten()]
    group_percentages = [
        '{0: .2f}'.format(value)
        for value in con_mat.flatten() / np.sum(con_mat)
    ]
    labels = [
        f'{v1}\n{v2}\n{v3}'
        for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
    ]
    labels = np.asarray(labels).reshape(2, 2)
    sns.heatmap(con_mat, annot=labels, fmt='', cmap='Blues')
    plt.title(f'{file} Confusion Matrix for {name}')
    plt.savefig(f'confusion_matrix_{file}_{name}.png')

    plt.figure(figsize=(15, 10))
    # classification_report expects (y_true, y_pred) in that order;
    # swapping them exchanges precision and recall.
    class_rpt = pd.DataFrame(
        classification_report(y_test, predictions, digits=2,
                              output_dict=True))
    sns.heatmap(class_rpt.iloc[:-1, :].T, annot=True)
    plt.title(f'{file} Classification Report for {name}')
    plt.savefig(f'classification_report_{file}_{name}.png')
def teste_gera_sklearn_tree():
    """Fit a depth-18 tree on the binary CIFAR-10 dump and save it as text.

    Reads up to 3000 labels and 3000 sample rows from ``PLA_dump/``, turns
    every row into a list of its elements (one feature per element), fits a
    gini decision tree and writes its textual form to ``tree_test2.tree``.
    """
    labels = pd.read_csv("PLA_dump/cifar10_images_binary_labels.txt",
                         header=None,
                         nrows=3000)
    content = np.genfromtxt("PLA_dump/cifar10_images_binary.txt",
                            max_rows=3000,
                            dtype=str)
    # list(row) yields one feature per element of the row.
    samples = pd.DataFrame([list(row) for row in content])

    # Definition of the classifier
    clf = DecisionTreeClassifier(
        random_state=9856230,
        criterion='gini',
        max_depth=18,
    )
    clf.fit(samples, labels)

    with open("tree_test2.tree", "w") as arquivo:
        arquivo.write(tree.export_text(clf, max_depth=1000))
def tree_txt_result(output_classification_result):
    """Return the textual representation of a fitted decision tree.

    Args:
        output_classification_result: Fitted tree accepted by
            ``tree.export_text``.

    Returns:
        The rules of the tree as a string.
    """
    return tree.export_text(output_classification_result)
def modeling(df, model: Model):
    """
    Train ``model`` on a 70/30 split of ``df`` and compare its predictions.

    The data is split into 70% for training and 30% for testing, with the
    variables separated from the labels:
        - X holds the variables (every column except ``relevant``)
        - Y holds the labels (the ``relevant`` column)

    For ``Model.decision_tree`` the fitted tree is additionally written to
    ``./decistion_tree.log`` and a graphviz source object is built from it.
    """
    features = df.drop(['relevant'], axis=1)
    target = df['relevant']
    x_train, x_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.3,
                                                        random_state=1)

    # Instantiate the model, then train it.
    md = model()
    md.fit(x_train, y_train)

    # For a decision tree, dump the tree that was built.
    if model is Model.decision_tree:
        text_representation = tree.export_text(md)
        with open("./decistion_tree.log", "w") as fout:
            fout.write(text_representation)

        # NOTE(review): class_names are matched to the sorted class labels;
        # confirm that this order fits the values of 'relevant'.
        dot_data = tree.export_graphviz(
            md,
            out_file=None,
            feature_names=x_test.columns,
            class_names=['Predicted relevant', 'Predicted no relevant'],
            filled=True,
            rounded=True,
            special_characters=True)
        # Built but not rendered; rendering is opt-in via graph.render().
        graph = graphviz.Source(dot_data, filename='./tree', format='png')

    # Predict on the hold-out set and compare against the true labels.
    y_pred = md.predict(x_test)
    compare_results(y_test, y_pred)
def _de_novo_pred(X, y, feature_names, classifier='decision_tree'):
    """Fit a classifier on ``X``/``y`` and return 0/1 predictions for ``X``.

    Args:
        X: Training samples.
        y: Training labels.
        feature_names: Names used when logging importances / the tree.
        classifier: ``'random_forest'`` selects an ExtraTreesClassifier
            whose out-of-bag estimates are thresholded at 0.5; any other
            value selects a small DecisionTreeClassifier that predicts on
            its own training data.

    Returns:
        Integer array of predictions, one per sample.
    """
    if classifier != 'random_forest':
        clf = DecisionTreeClassifier(
            max_depth=5,
            min_samples_split=100,
            min_impurity_decrease=0.005,
        )
        clf.fit(X, y)
        log.debug('Tree structure:')
        log.debug(export_text(clf, feature_names=feature_names))
        return clf.predict(X).astype(int)

    log.info('Using extremely random forest')
    clf = ExtraTreesClassifier(n_estimators=250,
                               bootstrap=True,
                               oob_score=True)
    clf.fit(X, y)
    log.debug('Feature importance:')
    log.debug(
        format_feature_importances(feature_names, clf.feature_importances_))

    oob_positive = clf.oob_decision_function_[:, 1]
    # In the unlikely event the out-of-bag estimate contains NaNs (can
    # happen when n_estimators is not big enough), treat them as 0.
    oob_positive[np.isnan(oob_positive)] = 0
    return (oob_positive >= 0.5).astype(int)
def show_tree_structure_in_terminal(self, clf):
    """
    Print the rules of a fitted tree to the console.

    Feature names are taken from ``self.feature_names``.
    :param clf: fitted classifier instance to render
    :return: None; the tree structure is printed
    """
    print(export_text(clf, feature_names=self.feature_names))
def export_decision_tree(decision_tree, feature_names):
    """Export a fitted tree as text, as a DOT file and as SVG.

    The textual rules are handed to ``export_decision_tree_to_file``, the
    DOT description is written to ``decision_tree.dot`` and
    ``convert_dot_to_svg`` is then invoked.

    Args:
        decision_tree: Fitted tree accepted by the sklearn exporters.
        feature_names: Feature names, one per input column.
    """
    # feature_names is keyword-only in current scikit-learn releases.
    text = export_text(decision_tree, feature_names=feature_names)
    export_decision_tree_to_file(text)
    export_graphviz(decision_tree,
                    out_file='decision_tree.dot',
                    feature_names=feature_names,
                    rounded=True,
                    precision=1)
    convert_dot_to_svg()
def write_DT(DT, outfile, labels, generate_sorted_file=True):
    """
    Graphs the DT, writes the DT as text, and optionally generates
    '<outfile>_sorted.txt', which lists the tree's features in the order in
    which they first appear in the text export.

    Parameters
    ----------
    DT : sklearn.tree.DecisionTreeClassifier
        Decision tree of interest.
    outfile : str
        path prefix of the files to be generated ('<outfile>.gv',
        '<outfile>.txt', '<outfile>_sorted.txt').
    labels : sequence of str
        feature names, one per input column of the tree.
    generate_sorted_file : bool, optional
        Whether to generate the file listing the tree's features.
        The default is True.
    """
    # Make graph
    dot_data = tree.export_graphviz(DT,
                                    out_file=None,
                                    feature_names=labels,
                                    class_names=["benign", "malicous"],
                                    filled=True,
                                    rounded=True,
                                    special_characters=True,
                                    precision=6)
    graph = graphviz.Source(dot_data)
    graph.render("{}.gv".format(outfile), view=False)

    # Write DT as text
    text_tree = tree.export_text(
        DT,
        feature_names=list(labels),
        show_weights=True,
    )
    with open("{}.txt".format(outfile), "w") as text_file:
        text_file.write(text_tree)

    if not generate_sorted_file:
        return

    # Collect features in first-seen order from the text already in memory.
    features = []
    seen = set()
    for line in text_tree.splitlines():
        if "class: " in line:
            # leaf line, carries no attribute test
            continue
        # string containing attribute test
        attribute_test = line.strip().split("|--- ")[-1]
        if attribute_test.startswith("truncated branch"):
            # marker emitted for subtrees beyond export_text's depth limit
            continue
        # extract feature from attribute test
        feature = attribute_test.split("<")[0].split(">")[0].strip()
        if feature not in seen:
            seen.add(feature)
            features.append(feature)

    with open("{}_sorted.txt".format(outfile), "w") as DT_sorted:
        DT_sorted.writelines(feature + "\n" for feature in features)
def print_tree(self, feature_names: list = None):
    """
    Print the rules of ``self.clf`` as text.

    Args:
        feature_names (list, optional): Names to show for the features.
            Defaults to None, in which case ``export_text`` picks its own
            default names.
    """
    rendered = export_text(self.clf, feature_names=feature_names)
    print(rendered)
def as_text(self):
    """
    Return the surrogate tree in a string format.

    ``self.fit()`` is called first when ``self.is_fitted`` is false.
    """
    if not self.is_fitted:
        self.fit()
    return export_text(self.surrogate_explainer.estimator_,
                       feature_names=list(self.feature_names))
def fit_predict_model(self):
    """Fit ``self.model``, predict on the test set and record its structure.

    Models whose name does not contain "Random" are first prepared with
    ``self._fit_optimised_model()`` and afterwards described by their
    ``export_text`` rules; for the others a fixed placeholder is stored.

    Sets ``self.y_preds`` and ``self.model_struct``.
    """
    if "Random" not in self.model_name:
        self._fit_optimised_model()

    self.model.fit(self.X_train, self.y_train)
    self.y_preds = self.model.predict(self.X_test)

    if "Random" in self.model_name:
        self.model_struct = "Random Forest Model\n"
    else:
        self.model_struct = export_text(self.model,
                                        feature_names=self.feature_names)
def runC(cls, clf):
    """Show a fitted tree as a plot and print it as text.

    The feature names for the text output are the first four column names
    of ``iris.csv``.

    Args:
        clf: Fitted decision tree.
    """
    # Graph
    tree.plot_tree(clf)
    plt.show()

    # Text
    header = pandas.read_csv('iris.csv').columns
    print(export_text(clf, feature_names=list(header[:4])))
def print_tree(estimator, max_depth=6, **kwargs):
    """
    Print the first Decision Tree from a Random Forest.

    Feature names come from the module-level ``signal_names_``.

    :param estimator: Sklearn ensemble estimator.
    :param max_depth: Maximum depth of the printed representation.
    :param kwargs: Further keyword arguments forwarded to ``export_text``.
    """
    first_tree = estimator.estimators_[0]
    print(export_text(first_tree,
                      max_depth=max_depth,
                      feature_names=signal_names_,
                      **kwargs))
def _dt_rules(clf, df_mat):
    """
    Transform the printed structure of a DT into the set of rules derived
    from the decision path of every row of ``df_mat``. Each rule comes with
    its length and the prediction associated with it.

    Parameters
    ----------
    clf : fitted decision tree
        Must support ``decision_path`` and ``predict``.
    df_mat : pandas.DataFrame
        Samples to derive rules for; its columns name the features.

    Returns
    -------
    df_rules : pandas.DataFrame
        De-duplicated rows with columns ``rule``, ``prediction`` and
        ``len_rule``; empty when ``df_mat`` has no rows.
    """
    r = export_text(clf, feature_names=list(df_mat.columns))

    # One entry per printed split: strip tree drawing and class labels.
    list_splits = r.split("|---")
    list_splits = [x.replace("|", "") for x in list_splits]
    list_splits = [x.replace("class: -1", "") for x in list_splits]
    list_splits = [x.replace("class: 1", "") for x in list_splits]
    list_splits = [x.strip() for x in list_splits]

    df_splits = pd.DataFrame({"levels": list_splits})
    df_splits = (df_splits[df_splits["levels"] != ""].reset_index(
        drop=True).reset_index())
    # NOTE(review): this 1-based position is later matched against node
    # ids from decision_path — confirm the two numberings agree.
    df_splits["index"] += 1

    # Collect one frame per sample and concatenate once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    rule_frames = []
    for _, point in df_mat.iterrows():
        sample = point.values.reshape(1, -1)
        node_indices = clf.decision_path(sample)
        node_indices = pd.DataFrame(node_indices.toarray().T).reset_index()
        node_indices = node_indices.merge(df_splits)
        node_indices = node_indices[node_indices[0] == 1]

        dct_aux = {
            "rule": " & ".join(node_indices["levels"]),
            "prediction": clf.predict(sample),
            "len_rule": len(node_indices),
        }
        rule_frames.append(pd.DataFrame(dct_aux, index=[0]))

    df_rules = pd.concat(rule_frames) if rule_frames else pd.DataFrame()
    return df_rules.drop_duplicates()
def decisionTree(df, playlist_name, random_state, training_size):
    """Train a genre decision tree on the first rows of ``df`` and report.

    The first ``round(len * training_size)`` rows are used for training and
    the remaining rows for prediction. Predictions are stored in a new
    ``'Predicted Genre'`` column (NaN for the training rows), metrics are
    printed, and the plotted tree is saved as
    ``<playlist_name>_decision_tree_<random_state>_<training_size>.png``.

    Args:
        df: DataFrame with columns ``danceability``, ``energy``, ``tempo``,
            ``valence``, ``genre`` and ``name``. It is modified in place.
        playlist_name: Used in the figure title and file name.
        random_state: Seed for the classifier.
        training_size: Fraction of the rows used for training.

    Returns:
        ``df`` with the added ``'Predicted Genre'`` column.
    """
    print("Decision Tree")
    X = pd.DataFrame(df, columns=['danceability', 'energy', 'tempo', 'valence'])
    Y = pd.DataFrame(df, columns=['genre'])

    clf = tree.DecisionTreeClassifier(random_state=random_state)

    songs = Y.size
    # Honour the requested proportion; the figure title reports it.
    trainingSongs = round(songs * training_size)

    clf = clf.fit(X[0:trainingSongs], Y[0:trainingSongs])
    prediction = clf.predict(X[trainingSongs:])
    y_test = Y[trainingSongs:]

    # Pad by position so each prediction lands on the row it was made for;
    # training rows have no prediction.
    df['Predicted Genre'] = [float('nan')] * trainingSongs + list(prediction)
    print(pd.DataFrame(df, columns=['name', 'genre', 'Predicted Genre']))

    # https://mljar.com/blog/visualize-decision-tree/
    text_representation = tree.export_text(clf)
    print(text_representation)

    accuracy = accuracy_score(y_test, prediction, normalize=True)
    accuracyText = 'Accuracy: ' + str(accuracy)
    print(accuracyText)

    f1 = f1_score(y_test, prediction, average='weighted')
    f1Text = 'F1: ' + str(f1)
    print(f1Text)

    recall = recall_score(y_test, prediction, average='weighted')
    recallText = 'Recall: ' + str(recall)
    print(recallText)

    precision = precision_score(y_test, prediction, average='weighted')
    precisionText = 'Precision: ' + str(precision)
    print(precisionText)

    class_names = clf.classes_
    feature_names = X.columns
    fig = plt.figure(figsize=(50, 40))
    title = str(playlist_name) + "_decision_tree_" + str(random_state)
    # Recall is not pertinent to the scope of this problem
    titleLine2 = accuracyText + ", " + precisionText + ", " + f1Text
    titleLine3 = "Training Data Proportion: " + str(training_size)
    fig.suptitle(title + '\n' + titleLine2 + '\n' + titleLine3, fontsize=72)
    tree.plot_tree(clf,
                   feature_names=feature_names,
                   class_names=class_names,
                   filled=True)
    fig.savefig(
        str(playlist_name) + "_decision_tree_" + str(random_state) + "_" +
        str(training_size) + ".png")
    return df
def visualize(self):
    """Build the chart and text representations of ``self.model``.

    The DOT export of the tree is post-processed: for node labels with
    five parts whose first token contains an underscore, the token is
    split at its last underscore into name and value, and the threshold
    test is rewritten as ``name == value`` (for "<=" tests) or
    ``name != value`` (otherwise). The result is parsed with pydotplus.

    Sets ``self.tree_chart`` (pydotplus graph) and ``self.tree_text``
    (``export_text`` output with weights).
    """
    print("##MSG: visualizing ...")
    classes = None

    # image
    dot = tree.export_graphviz(self.model,
                               feature_names=self.xNames,
                               class_names=classes)

    # Forced rewrite of the DOT labels.
    # TODO: handle item names later, or implement this as a tree method.
    pre_pattern = r'^[0-9]* \[label="'
    suf_pattern = r'"] ;$'
    eq_template = 'label="%s == %s\\\\n%s\\\\n%s\\\\n%s\\\\n%s"] ;'
    ne_template = 'label="%s != %s\\\\n%s\\\\n%s\\\\n%s\\\\n%s"] ;'

    def relabel(line):
        # Only node definitions carry a label that can be rewritten.
        if not re.match(pre_pattern, line):
            return line
        lbl = re.sub(suf_pattern, '', re.sub(pre_pattern, '', line))
        lbldata = lbl.split("\\n")
        if len(lbldata) != 5:
            return line
        lblval = lbldata[0].split(' ')
        lblval0 = lblval[0].split("_")
        if len(lblval0) <= 1:
            return line
        template = eq_template if lblval[-2] == "<=" else ne_template
        nn = "_".join(lblval0[0:-1])
        newlable = template % (nn, lblval0[-1], lbldata[1], lbldata[2],
                               lbldata[3], lbldata[4])
        return re.sub(r'label=".*"] ;', newlable, line)

    newdotstr = '\n'.join(relabel(line) for line in dot.split('\n'))
    self.tree_chart = pydotplus.graph_from_dot_data(newdotstr)

    # text
    self.tree_text = tree.export_text(self.model,
                                      feature_names=self.xNames,
                                      show_weights=True)
def decision_tree(dataset,
                  full_train: bool = False,
                  print_tree=False,
                  plot=False):
    """Fit a decision tree on ``dataset`` and print its hit count.

    Args:
        dataset: DataFrame whose ``y`` column is the target; all other
            columns are predictors.
        full_train: When true, train and evaluate on the whole data set;
            otherwise use ``train_test_split`` with ``random_state=123``.
        print_tree: Print the textual tree to the console.
        plot: Show a plot of the tree; this blocks the script until the
            window is closed.

    Prints ``"<matches> / <number of predictions>"``.
    """
    predictors = dataset.drop(columns='y')
    target = dataset['y']

    # Separate the data into train and test sets, predictors (x) apart
    # from the expected results (y).
    if full_train:
        x_train = x_test = predictors
        y_train = y_test = target
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            predictors,
            target,
            random_state=123,
        )

    # Define and train the decision tree.
    modelo = DecisionTreeClassifier(random_state=123)
    modelo.fit(x_train, y_train)

    # Render the fitted tree on the console.
    if print_tree:
        print(export_text(modelo))

    # Plot the tree; execution pauses until the window is closed.
    if plot:
        plt.figure()
        plot_tree(modelo, filled=True)
        plt.show()

    # Predict with the trained model and count the exact matches.
    predicciones = modelo.predict(X=x_test)
    esperados = y_test.tolist()
    matches = sum(
        1 for predicho, esperado in zip(predicciones, esperados)
        if predicho == esperado)
    print(f'{matches} / {len(predicciones)}')
def test_export_text():
    """Check ``export_text`` output for the module-level ``X``/``y`` data.

    Covers default output, ``max_depth`` truncation, custom feature names,
    ``show_weights``, ``spacing`` and a multi-output regressor.

    The expected reports follow export_text's layout exactly: nested levels
    are prefixed with "|" plus ``spacing`` blanks, and the right branch is
    printed as ">" followed by two spaces.
    """
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- class: 1
    """).lstrip()

    assert export_text(clf) == expected_report
    # testing that leaves at level 1 are not truncated
    assert export_text(clf, max_depth=0) == expected_report
    # testing that the rest of the tree is truncated
    assert export_text(clf, max_depth=10) == expected_report

    expected_report = dedent("""
    |--- b <= 0.00
    |   |--- class: -1
    |--- b >  0.00
    |   |--- class: 1
    """).lstrip()
    assert export_text(clf, feature_names=['a', 'b']) == expected_report

    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- weights: [3.00, 0.00] class: -1
    |--- feature_1 >  0.00
    |   |--- weights: [0.00, 3.00] class: 1
    """).lstrip()
    assert export_text(clf, show_weights=True) == expected_report

    expected_report = dedent("""
    |- feature_1 <= 0.00
    | |- class: -1
    |- feature_1 >  0.00
    | |- class: 1
    """).lstrip()
    assert export_text(clf, spacing=1) == expected_report

    X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]]
    y_l = [-1, -1, -1, 1, 1, 1, 2]
    clf = DecisionTreeClassifier(max_depth=4, random_state=0)
    clf.fit(X_l, y_l)
    expected_report = dedent("""
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- truncated branch of depth 2
    """).lstrip()
    assert export_text(clf, max_depth=0) == expected_report

    X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]
    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_mo, y_mo)
    expected_report = dedent("""
    |--- feature_1 <= 0.0
    |   |--- value: [-1.0, -1.0]
    |--- feature_1 >  0.0
    |   |--- value: [1.0, 1.0]
    """).lstrip()
    assert export_text(reg, decimals=1) == expected_report
    # show_weights has no effect on regressors
    assert export_text(reg, decimals=1, show_weights=True) == expected_report