def compute_tree_ensemble_accuracy(trees, X_test, y_test):
    """Accuracy of a weighted tree ensemble: sum the trees' weighted class
    probabilities and take the argmax as the predicted label."""
    weights_list = prepare_uniform_weights(2, len(trees))
    weights_norm = normalize_weights(weights_list)
    out_weight = None
    for tree_id, tree in enumerate(trees):
        # pred has shape (n_samples, n_classes)
        pred = tree.predict_proba(X_test)
        weighted = weights_norm[tree_id] * torch.tensor(pred, dtype=torch.float)
        out_weight = weighted if out_weight is None else out_weight + weighted
    _, pred_label = torch.max(out_weight.data, 1)
    correct_num = (pred_label == torch.LongTensor(y_test)).sum().item()
    total = len(list(y_test))
    acc = correct_num / total
    return acc
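# A minimal usage sketch for compute_tree_ensemble_accuracy, assuming torch
# and the prepare_uniform_weights/normalize_weights helpers used above are
# importable; the dataset, split, and tree depths are illustrative only.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
trees = [DecisionTreeClassifier(max_depth=d, random_state=0).fit(X_tr, y_tr)
         for d in (2, 4, 6)]
print(compute_tree_ensemble_accuracy(trees, X_te, list(y_te)))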
def best_prediction_threshold(tree, X_test, y_test, lower_bound, gap):
    '''
    Find the best threshold for turning predicted probabilities into class
    labels, judged by overall accuracy.

    Inputs:
        tree: fitted decision tree classifier
        X_test: dataframe of features used for testing
        y_test: series of labels used for testing
        lower_bound: smallest threshold to try
        gap: step between candidate thresholds

    Returns:
        best_threshold: threshold that maximizes overall accuracy
        best_accuracy: accuracy over all cases at that threshold
        current_acc0: accuracy on test cases labelled 0
        current_acc1: accuracy on test cases labelled 1
    '''
    gn, g0n, g1n = len(y_test), list(y_test).count(0), list(y_test).count(1)
    y_predp = tree.predict_proba(X_test)[:, 1]
    best_threshold, best_accuracy = lower_bound, 0
    current_acc0, current_acc1 = 0, 0
    for threshold in np.arange(lower_bound, 1, gap):
        # Build a fresh boolean mask each pass instead of overwriting the
        # probabilities (the original reassigned y_predp, so every iteration
        # after the first thresholded booleans rather than probabilities).
        mask = pd.Series(y_predp > threshold)
        y_pred = pd.Series([0] * len(mask))
        y_pred.loc[mask] = 1
        evaluated = list(zip(y_pred, y_test))
        acc_all, acc_0, acc_1 = accuracy_calculation(gn, g0n, g1n, evaluated)
        if acc_all > best_accuracy:
            best_accuracy = acc_all
            current_acc0, current_acc1 = acc_0, acc_1
            best_threshold = threshold
    return best_threshold, best_accuracy, current_acc0, current_acc1
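# A vectorized sketch of the same threshold sweep (pure numpy; the helper
# name is hypothetical). It assumes plain 0/1 labels and avoids building a
# fresh Series for every candidate threshold.
import numpy as np

def best_threshold_vectorized(probs, labels, lower_bound, gap):
    thresholds = np.arange(lower_bound, 1, gap)
    # preds[i] holds the 0/1 predictions at thresholds[i].
    preds = probs[None, :] > thresholds[:, None]   # (n_thresholds, n_samples)
    accs = (preds == np.asarray(labels)[None, :]).mean(axis=1)
    best = np.argmax(accs)
    return thresholds[best], accs[best]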
def yaraified_rf_prediction(rf, tree_thres, percent_match, X):
    # NOTE: percent_match is currently unused.
    results = []
    for tree in rf.estimators_:
        results.append(list(tree.predict_proba(X)[:, -1] > tree_thres))
    results = np.array(results)
    # Fraction of trees whose positive-class probability exceeds the threshold.
    results = results.transpose().sum(axis=1) / len(rf.estimators_)
    return results
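# Hypothetical usage with a scikit-learn random forest. Since percent_match
# is ignored by the function as written, None is passed for it here.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
rf = RandomForestClassifier(n_estimators=25, random_state=0).fit(X, y)
votes = yaraified_rf_prediction(rf, 0.5, None, X[:5])  # fraction of trees voting positive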
def tree_prediction(tree, input_data):
    # Run tree prediction.
    prediction = tree.predict_proba(input_data)
    # Get the order of the output classes.
    classes = tree.classes_
    output = {}
    # Map each probability to its output class.
    for p, c in zip(prediction[0], classes):
        output[c] = p
    return output
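# Hypothetical usage: per-class probabilities for a single sample, returned
# as a dict keyed by class label.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
print(tree_prediction(clf, iris.data[:1]))  # e.g. {0: 1.0, 1: 0.0, 2: 0.0}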
def fedboost(trees, args, net_dataidx_map, X_train, y_train, X_test, y_test,
             task_type):
    for party_id in range(args.n_parties):
        dataidxs = net_dataidx_map[party_id]
        X_train_local = X_train[dataidxs]
        y_train_local = y_train[dataidxs]
        current_pred = np.zeros((len(y_train_local), 2))
        ensemble_tree_ids = np.zeros(args.n_ensemble_models, dtype=int)
        isselected = np.zeros(len(trees), dtype=int)
        for final_tree_id in range(args.n_ensemble_models):
            temp_loss = float("inf")
            temp_tree_id = -1
            # Greedily pick the unselected tree that minimizes the log loss.
            for tree_id, tree in enumerate(trees):
                if isselected[tree_id] == 1:
                    continue
                if task_type == "binary_cls":
                    temp_pred = current_pred + tree.predict_proba(X_train_local)
                    current_pred_norm = preprocessing.normalize(temp_pred,
                                                                axis=1,
                                                                norm='l1')
                    current_loss = metrics.log_loss(y_train_local,
                                                    current_pred_norm)
                    # Penalize this party's own local trees.
                    if tree_id in range(party_id * args.n_local_models,
                                        (party_id + 1) * args.n_local_models):
                        current_loss += args.lambda_boost
                    if current_loss < temp_loss:
                        temp_loss = current_loss
                        temp_tree_id = tree_id
                elif task_type == "reg":
                    print("not supported yet!")
                    exit(1)
            ensemble_tree_ids[final_tree_id] = temp_tree_id
            current_pred += args.lr * trees[temp_tree_id].predict_proba(
                X_train_local)
            isselected[temp_tree_id] = 1
        ens_acc = compute_tree_ensemble_accuracy(
            [trees[i] for i in ensemble_tree_ids], X_test, y_test)
        logger.info("In party %d" % party_id)
        logger.info("Selected trees %s" %
                    " ".join(str(e) for e in ensemble_tree_ids))
        logger.info("Boost acc: %f" % ens_acc)
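# Hypothetical driver sketch showing the attributes `args` must carry and
# the shape of net_dataidx_map; every value below is illustrative only.
from types import SimpleNamespace
import numpy as np

args = SimpleNamespace(n_parties=2, n_local_models=5, n_ensemble_models=4,
                       lambda_boost=0.1, lr=0.5)
# party id -> indices of that party's local training samples
net_dataidx_map = {0: np.arange(0, 100), 1: np.arange(100, 200)}
# `trees` would hold all parties' local trees (n_parties * n_local_models):
# fedboost(trees, args, net_dataidx_map, X_train, y_train, X_test, y_test,
#          task_type="binary_cls")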
def produce_probabilities(trees, X):
    # Collect each tree's positive-class probability, one column per tree.
    probas = []
    for tree in trees:
        probas.append([p[1] for p in tree.predict_proba(X)])
    return np.array(probas).transpose()
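# Minimal sketch: the result has shape (n_samples, n_trees), one column of
# positive-class probabilities per tree; the fitted trees are illustrative.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, random_state=0)
fitted = [DecisionTreeClassifier(max_depth=d, random_state=0).fit(X, y)
          for d in (2, 3)]
print(produce_probabilities(fitted, X).shape)  # (100, 2)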
# Class 0: Iris-Setosa
# Class 1: Iris-Versicolor
# Class 2: Iris-Virginica
print("Target values")
print(iris.target[0:150])

# Create the tree
arbol = DecisionTreeClassifier(max_depth=5, random_state=100)  # number of levels: 5
arbol.fit(iris.data, iris.target)  # train the tree

# Get predictions
print("PREDICTIONS ----------------------------")
print(arbol.predict(iris.data[100:140]))

# tree.plot_tree(arbol)
r = export_text(arbol, feature_names=iris['feature_names'])
print(r)

# To see the probabilities we can use the predict_proba method.
print(arbol.predict_proba(iris.data[47:53]))
# The first class (Setosa) is the first column, the second class the
# second, and so on. This is the result:
# [[1.         0.         0.        ]
#  [1.         0.         0.        ]
#  [1.         0.         0.        ]
#  [0.         0.90740741 0.09259259]
#  [0.         0.90740741 0.09259259]
#  [0.         0.90740741 0.09259259]]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data, target)  # keep the name clf to avoid shadowing the sklearn tree module

regiao = 'Grande Florianópolis'
localizacao = 'URBANA'
serie = '8º ano'

regiao_enc = encoders['regiao'].transform([regiao])[0]
localizacao_enc = encoders['localizacao'].transform([localizacao])[0]
serie_enc = encoders['serie'].transform([serie])[0]

prediction_enc = clf.predict([[regiao_enc, localizacao_enc, serie_enc]])
# inverse_transform expects an array-like, not a bare int.
prediction = encoders['status'].inverse_transform([int(prediction_enc[0])])[0]
proba = clf.predict_proba([[regiao_enc, localizacao_enc, serie_enc]])[0]

# Output
print('\nClassifier accuracy: ' + str(clf.score(data, target)))
print('\nThe prediction returned: ' + prediction)
if prediction == 'Suficiente':
    print('The student meets the Bolsa Família prerequisites.')
else:
    print('The student does not meet the Bolsa Família prerequisites '
          'and should be removed from the program.')

print('\nWeight of each variable:')
for i in range(len(default_csv[0]) - 1):
    print('\t' + default_csv[0][i] + ': ' + str(clf.feature_importances_[i]))
# Naive Bayes (58%)
nb = MultinomialNB()
grid = GridSearchCV(nb, {}, cv=5)
grid.fit(X_small, y)

# SVM
clf = svm.SVC()
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [.000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000],
    'C': [float(x) for x in range(1, 50)],
    'degree': range(1, 20),
}
grid = GridSearchCV(clf, param_grid, cv=5)
grid.fit(X_small, y)

smoothie_tree.predict(c[smoothie_features])
# .ix was removed from pandas; use .loc and reshape the single row to 2-D.
tree.predict_proba(np.array(c.loc[1, smoothie_features]).reshape(1, -1))

# Write a program that takes in a time and returns the advantageous areas
# (zips) of SF and the disadvantageous areas (zips).
SAN_FRANCISCO_ZIP_CODES = [
    94102, 94109, 94123, 94117, 94134, 94112, 94124, 94121, 94133, 94116,
    94115, 94110, 94127, 94114, 94107, 94132, 94122, 94103, 94105, 94104,
    94108, 94118, 94158, 94111, 94131, 94130, 94014, 94129, 94015,
]
# [94014, 94015] were not present in the SF zip codes parsed for the application.

time = datetime.now()
WEEKEND_DAYS = ['Saturday', 'Sunday']
EARLY_MORNING = [5, 6, 7]
LATE_MORNING = [8, 9, 10]
EARLY_AFTERNOON = [11, 12, 13]
LATE_AFTERNOON = [14, 15, 16]
EARLY_EVENING = [17, 18, 19]
Y1B

# Regression tree
clfR2 = tree.DecisionTreeRegressor()
clfR2 = clfR2.fit(X1A, Y1A)
clfR2.predict([[26, 2]])  # expected salary is 30
# Note: regressors have no predict_proba; it is a classifier-only method.

# Classification tree
clfC2 = tree.DecisionTreeClassifier()
clfC2 = clfC2.fit(X1A, Y1B)

Y1B1 = np.array(pd.get_dummies(data['mnc']))
Y1B1
clfC2 = clfC2.fit(X1A, Y1B1)
clfC2.predict([[26, 2]])
data
clfC2.predict(X1A)
clfC2.predict_proba(X1A)
data

IVs = ['age', 'experience']
DV1 = ['salary']
DV2 = ['mnc']

# Plotting trees
# plot_tree(decision_tree, max_depth=None, feature_names=None,
#           class_names=None, label='all', filled=False, impurity=True,
#           node_ids=False, proportion=False, rounded=False, precision=3,
#           ax=None, fontsize=None)

# Regression tree (class names not available)
tree.plot_tree(clfR2)
tree.plot_tree(clfR2, feature_names=IVs, class_names=DV1, filled=True,
               node_ids=True, proportion=True, fontsize=10)

# Classification tree
tree.plot_tree(clfC2)
ytrain = trainM[:, 0]
Xtrain = trainM[:, 1:]
ytest = testM[:, 0]
Xtest = testM[:, 1:]

# Grid search over several parameter settings.
# ('min_impurity_split' was deprecated and later removed from scikit-learn,
# so it is left out of the grid.)
param_grid = {
    'max_depth': np.arange(3, 10),
    'min_samples_leaf': np.arange(1, 5),
    'min_samples_split': np.arange(3, 10)
}
# GridSearchCV now lives in sklearn.model_selection, not the old
# sklearn.grid_search module.
tree = GridSearchCV(DecisionTreeClassifier(), param_grid)
tree.fit(Xtrain, ytrain)
tree_preds = tree.predict_proba(Xtest)[:, 1]
print("Best accuracy possible and best parameters to achieve them")
print(tree.best_score_, tree.best_params_)

# Calculate weights for the imbalanced data.
class_weight = class_weight.compute_class_weight('balanced', np.unique(ytrain),
                                                 ytrain)
cw = {
    1: class_weight[0],
    2: class_weight[1],
    3: class_weight[2],
    4: class_weight[3],
    5: class_weight[4],
    6: class_weight[5],
def predict_proba(self, X_test):
    # Average the positive-class probability over all bagged trees.
    Y_pred = np.zeros(len(X_test))
    for tree in self.tree_bags:
        Y_pred += tree.predict_proba(X_test)[:, 1]
    return 1.0 * Y_pred / self.n_tree
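# Self-contained sketch of the kind of bagging wrapper the method above
# belongs to; the class name TreeBag and its attributes are hypothetical.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class TreeBag:
    def __init__(self, n_tree=10):
        self.n_tree = n_tree
        self.tree_bags = []

    def fit(self, X, y):
        rng = np.random.default_rng(0)
        for _ in range(self.n_tree):
            idx = rng.integers(0, len(X), size=len(X))  # bootstrap sample
            self.tree_bags.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
        return self

    def predict_proba(self, X_test):
        # Same averaging as above: mean positive-class probability.
        return np.mean([t.predict_proba(X_test)[:, 1]
                        for t in self.tree_bags], axis=0)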
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree

neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train)
p_n = neigh.predict_proba(X_valid)

svm = SVC(probability=True)
svm.fit(X_train, y_train)
p_svm = svm.predict_proba(X_valid)

# Note: this rebinds the name `tree`, shadowing the sklearn module imported above.
tree = tree.DecisionTreeClassifier()
tree.fit(X_train, y_train)
p_tree = tree.predict_proba(X_valid)

##############################################
### CNN ##
##import keras
##from keras.datasets import mnist
##from keras.models import Sequential
##from keras.layers import Dense, Dropout, Flatten
##from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
##from keras import backend as K
##from keras.callbacks import ReduceLROnPlateau, EarlyStopping
##
##batch_size = 128
##num_classes = len(classes)
##epochs = 20
def obtain_predicted_probabilities(tree, test_df, xcol):
    '''
    Obtain predicted probabilities of success for the test data, given a
    tree classifier and a list of the x columns.
    '''
    return tree.predict_proba(test_df[xcol])[:, 1]
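# Hypothetical usage with a small pandas frame; the column names and the
# 'success' label are illustrative only.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

xcol = ['f1', 'f2']
train_df = pd.DataFrame({'f1': [0, 1, 2, 3], 'f2': [1, 0, 1, 0],
                         'success': [0, 0, 1, 1]})
clf = DecisionTreeClassifier().fit(train_df[xcol], train_df['success'])
probs = obtain_predicted_probabilities(clf, train_df, xcol)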
scores = []
f1 = []
for i in classifiers:
    print(i)
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    cm = metrics.confusion_matrix(y_test, y_pred)
    scores.append(float(cm[0][0] + cm[1][1]) / cm.sum())
    f1.append(skl.metrics.f1_score(y_test, y_pred))
results = pd.DataFrame(list(zip(scores, f1)))

tree = DecisionTreeClassifier(criterion='gini', max_depth=15)
tree.fit(X_train, y_train)
# Use predict for hard labels; the confusion matrix cannot take the
# probabilities that predict_proba returns.
y_pred = tree.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
float(cm[0][0] + cm[1][1]) / cm.sum()
skl.metrics.f1_score(y_test, y_pred)

# Probabilities for the ROC curve.
y_pred_proba = tree.predict_proba(X_test)
y_pred_proba[:, 1]

from sklearn.metrics import roc_curve
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_proba[:, 1])
len(tpr1)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(fpr1, tpr1, lw=1, label='ROC curve')
plt.axis([-0.05, 1.05, -0.05, 1.05])
plt.title("ROC Curve")
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Chance')
def train_a_student_tree(trees, public_data, public_data_label, n_classes,
                         stu_model, gamma, filter_query, threshold=None,
                         n_partition=None, apply_consistency=False,
                         is_final_student=False):
    vote_counts = np.zeros((len(public_data_label), n_classes))
    for tree_id, tree in enumerate(trees):
        y_pred = tree.predict(public_data)
        y_prob = tree.predict_proba(public_data)
        if is_final_student and apply_consistency:
            # Only count a partition's votes when all of its trees agree.
            if tree_id % n_partition == 0:
                votes_base = y_pred
                votes_flag = np.ones(len(y_pred), dtype=int)
            else:
                for i, y in enumerate(y_pred):
                    if votes_flag[i]:
                        if int(y) != votes_base[i]:
                            votes_flag[i] = 0
                    if (tree_id % n_partition) == (n_partition - 1) and votes_flag[i]:
                        vote_counts[i][int(y)] += n_partition
        else:
            for i, y in enumerate(y_pred):
                if threshold is not None:
                    # Only count votes where the tree is confident; compare
                    # the predicted class's probability, not the whole row.
                    if y_prob[i].max() >= threshold:
                        vote_counts[i][int(y)] += 1
                else:
                    vote_counts[i][int(y)] += 1
    vote_counts_origin = copy.deepcopy(vote_counts).astype("int")
    if gamma != 0:
        # Add Laplace noise to the vote counts for differential privacy.
        for i in range(vote_counts.shape[0]):
            vote_counts[i] += np.random.laplace(loc=0.0,
                                                scale=float(1.0 / gamma),
                                                size=vote_counts.shape[1])
    final_pred = np.argmax(vote_counts, axis=1)
    logger.info("Labeling acc %f" % ((final_pred == public_data_label).sum() /
                                     len(public_data_label)))
    if filter_query:
        # Keep only queries whose top-2 vote counts differ by more than 2.
        confident_query_idx = []
        for idx, row in enumerate(vote_counts_origin):
            top2_counts = row[np.argsort(row)[-2:]]
            if top2_counts[1] - top2_counts[0] > 2:
                confident_query_idx.append(idx)
        logger.info("len confident query idx: %d" % len(confident_query_idx))
        public_data = public_data[confident_query_idx]
        final_pred = [final_pred[i] for i in confident_query_idx]
    stu_model.fit(public_data, final_pred)

    # Statistics on how close the top two vote counts are.
    top1_class_counts = np.zeros(500)
    top2_class_counts = np.zeros(500)
    top_diff_counts = np.zeros(500)
    top2_counts_differ_one = 0
    for row in vote_counts_origin:
        top2_counts = row[np.argsort(row)[-2:]]
        if top2_counts[1] - top2_counts[0] <= 1:
            top2_counts_differ_one += 1
        top_diff_counts[top2_counts[1] - top2_counts[0]] += 1
        top1_class_counts[top2_counts[1]] += 1
        top2_class_counts[top2_counts[0]] += 1
    return top2_counts_differ_one, vote_counts_origin
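# Hypothetical PATE-style usage: teacher trees trained on disjoint private
# shards vote on public data, the votes get Laplace noise of scale 1/gamma,
# and a student tree is fit on the noisy plurality labels. Assumes numpy,
# copy, and the module-level `logger` used above are available; all data
# and parameters below are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=600, random_state=0)
X_priv, X_pub, y_priv, y_pub = train_test_split(X, y, test_size=0.5,
                                                random_state=0)
# One teacher per disjoint shard of the private data.
shards = np.array_split(np.arange(len(y_priv)), 10)
teachers = [DecisionTreeClassifier(random_state=0).fit(X_priv[s], y_priv[s])
            for s in shards]
student = DecisionTreeClassifier(random_state=0)
train_a_student_tree(teachers, X_pub, y_pub, n_classes=2, stu_model=student,
                     gamma=0.2, filter_query=False)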
            yticklabels=['Non-default', 'Default'])
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title("Confusion Matrix - Decision Tree")

# In[55]:

print(classification_report(y_test, y_pred_DT))

# In[56]:

import os

# In[57]:

y_pred_proba_DT = tree.predict_proba(X_test)[:, 1]
fpr_DT, tpr_DT, _ = metrics.roc_curve(y_test, y_pred_proba_DT)
auc_DT = metrics.roc_auc_score(y_test, y_pred_proba_DT)

# In[61]:

plt.figure(figsize=(10, 7))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_DT, tpr_DT, label="Decision Tree, auc=" + str(round(auc_DT, 2)))
plt.legend(loc=4, title='Models', facecolor='white')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
pre_score = lr.predict_proba(X_test)[:, 1]
print("Logistic regression model parameters: {parameter}".format(parameter=lr))
auto_model_analysis("logistic_regression", Y_train, train_pred, Y_test,
                    test_pred, pre_score)

# In[17]:

# Decision tree (note: this rebinds `tree`, shadowing the imported module)
from sklearn import tree
tree = tree.DecisionTreeClassifier(criterion="entropy", splitter='best',
                                   max_depth=4)
tree.fit(X_train, Y_train)
train_pred = tree.predict(X_train)
test_pred = tree.predict(X_test)
pre_score = tree.predict_proba(X_test)[:, 1]
print("Decision tree model parameters: {parameter}".format(parameter=tree))
auto_model_analysis("decision tree", Y_train, train_pred, Y_test, test_pred,
                    pre_score)

# In[18]:

# Neural network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='sgd', activation='tanh',
                    hidden_layer_sizes=(7, 28, 56, 2), alpha=0.05,
                    learning_rate_init=0.02, max_iter=10000)
mlp.fit(X_train, Y_train)
def export_decision_path2(random_tree, x, out_file=None, feature_names=None,
                          label='all', special_characters=False,
                          node_ids=False, rounded=True, proportion=False,
                          impurity=True, class_names=None):
    # LEFT/RIGHT are module-level branch markers used by recurse().

    def recurse(the_tree, node_id):
        if node_id == _tree.TREE_LEAF:
            raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
        left_child = the_tree.children_left[node_id]
        right_child = the_tree.children_right[node_id]
        if left_child != _tree.TREE_LEAF:
            child2parent[left_child] = (LEFT, node_id)
            child2parent[right_child] = (RIGHT, node_id)
            recurse(the_tree, left_child)
            recurse(the_tree, right_child)
        else:
            leafs.append(node_id)

    def node_to_str(tree, node, criterion):
        node_id = node[1]
        node_pos = node[0]
        # Generate the node content string
        if tree.n_outputs == 1:
            value = tree.value[node_id][0, :]
        else:
            value = tree.value[node_id]
        # Should labels be shown?
        labels = (label == 'root' and node_id == 0) or label == 'all'
        # PostScript compatibility for special characters
        if special_characters:
            # '&gt;' added so characters[6] (the '>' comparison) exists in
            # HTML labels; the original six-element list would raise an
            # IndexError below.
            characters = ['#', '<SUB>', '</SUB>', '≤', '<br/>', '>', '&gt;']
            node_string = '<'
        else:
            characters = ['#', '[', ']', '<=', '\\n', '"', '>']
            node_string = '"'
        # Write node ID
        if node_ids:
            if labels:
                node_string += 'node '
            node_string += characters[0] + str(node_id) + characters[4]
        # Write decision criteria
        if tree.children_left[node_id] != _tree.TREE_LEAF:
            # Always write node decision criteria, except for leaves
            if feature_names is not None:
                feature = feature_names[tree.feature[node_id]]
            else:
                feature = "X%s%s%s" % (characters[1], tree.feature[node_id],
                                       characters[2])
            node_string += '%s %s %s%s' % (
                feature,
                characters[3] if node_pos == 1 else characters[6],
                round(tree.threshold[node_id], 4), characters[4])
        # Write impurity
        if impurity:
            if isinstance(criterion, _criterion.FriedmanMSE):
                criterion = "friedman_mse"
            elif not isinstance(criterion, six.string_types):
                criterion = "impurity"
            if labels:
                node_string += '%s = ' % criterion
            node_string += (str(round(tree.impurity[node_id], 4)) +
                            characters[4])
        # Write node sample count
        if labels:
            node_string += 'samples = '
        if proportion:
            percent = (100. * tree.n_node_samples[node_id] /
                       float(tree.n_node_samples[0]))
            node_string += (str(round(percent, 1)) + '%' + characters[4])
        else:
            node_string += (str(tree.n_node_samples[node_id]) + characters[4])
        # Write node class distribution / regression value
        if proportion and tree.n_classes[0] != 1:
            # For classification this will show the proportion of samples
            value = value / tree.weighted_n_node_samples[node_id]
        if labels:
            node_string += 'value = '
        if tree.n_classes[0] == 1:
            # Regression
            value_text = np.around(value, 4)
        elif proportion:
            # Classification
            value_text = np.around(value, 2)
        elif np.all(np.equal(np.mod(value, 1), 0)):
            # Classification without floating-point weights
            value_text = value.astype(int)
        else:
            # Classification with floating-point weights
            value_text = np.around(value, 4)
        # Strip whitespace
        value_text = str(value_text.astype('S32')).replace("b'", "'")
        value_text = value_text.replace("' '", ", ").replace("'", "")
        if tree.n_classes[0] == 1 and tree.n_outputs == 1:
            value_text = value_text.replace("[", "").replace("]", "")
        value_text = value_text.replace("\n ", characters[4])
        node_string += value_text + characters[4]
        # Write node majority class
        if (class_names is not None and tree.n_classes[0] != 1
                and tree.n_outputs == 1):
            # Only done for single-output classification trees
            if labels:
                node_string += 'class = '
            if class_names is not True:
                class_name = class_names[np.argmax(value)]
            else:
                class_name = "y%s%s%s" % (characters[1], np.argmax(value),
                                          characters[2])
            node_string += class_name
        # Clean up any trailing newlines
        if node_string[-2:] == '\\n':
            node_string = node_string[:-2]
        if node_string[-5:] == '<br/>':
            node_string = node_string[:-5]
        return node_string + characters[5]

    # Open the output file
    return_string = False
    own_file = False
    if isinstance(out_file, six.string_types):
        if six.PY3:
            out_file = open(out_file, "w", encoding="utf-8")
        else:
            out_file = open(out_file, "wb")
        own_file = True
    if out_file is None:
        return_string = True
        out_file = six.StringIO()

    out_file.write('digraph Decision_path {\n')
    out_file.write('rankdir = LR;\n')
    out_file.write('node [shape=box];\n')

    # Rank the forest's trees by their positive-class probability for x.
    scores = []
    for tree in random_tree.estimators_:
        scores.append((tree, float(tree.predict_proba(x)[:, 1])))

    iter = 0
    for each in sorted(scores, key=lambda s: s[1], reverse=True)[0:10]:
        child2parent = {}
        leafs = []
        recurse(each[0].tree_, 0)
        # Find the leaf that x actually reaches: the last node on its
        # decision path.
        leaf_of_path = -1
        path = each[0].decision_path(x)[0].todense()[0, :].tolist()[0]
        idx = len(path) - 1
        while True:
            if path[idx] == 1:
                leaf_of_path = idx
                break
            idx -= 1
        for leaf in leafs:
            if leaf != leaf_of_path:
                continue
            # Walk from the leaf back to the root, then reverse.
            path = []
            cur_node = (0, leaf)
            while True:
                path.append(cur_node)
                if cur_node[1] == 0:
                    break
                cur_node = child2parent[cur_node[1]]
            path.reverse()
            for node in path:
                out_file.write('f%dt%d [label=%s];\n' %
                               (iter, node[1],
                                node_to_str(each[0].tree_, node,
                                            each[0].criterion)))
                if node[1] != 0:
                    out_file.write('f%dt%d -> f%dt%d;\n' %
                                   (iter, child2parent[node[1]][1], iter,
                                    node[1]))
        iter += 1
    out_file.write('}')
    if return_string:
        return out_file.getvalue()
    if own_file:
        out_file.close()
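# Hypothetical usage: write the decision paths of the 10 trees most
# confident in the positive class for one sample as DOT text. Assumes the
# module-level six/_tree/_criterion imports this function relies on, and
# that LEFT equals 1 so left branches render with '<=' (the markers below
# are illustrative).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

LEFT, RIGHT = 1, 0  # assumed branch markers used by recurse()
X, y = make_classification(n_samples=300, random_state=0)
rf = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)
dot_source = export_decision_path2(rf, X[:1])  # returns DOT text when out_file is None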