print() # Explore Target NAMES in DataSet print("===IRIS DATA TARGET NAMES===") print(irisData.target_names) # 1. Create the Model model = tree.DecisionTreeClassifier() # 2. Training the Model model.fit(irisData.data, irisData.target) # Lets Test The Model with a Sample Input # inputData = [5.5, 2.3, 4.0, 1.3] # predictedTarget = model.predict([inputData]) # print(predictedTarget) inputData1 = [5.5, 2.3, 4.0, 1.3] inputData2 = [5.43, 3.90, 1.15, 0.32] predictedTargets = model.predict([inputData1, inputData2]) print(predictedTargets) import graphviz data = tree.export_graphviz(model, out_file=None) graph = graphviz.Source(data) graph.render("IRIS DATA SET DECISION TREE") graph.view() # Train the DecisionTreeClassifier with DataSet as in Session40 # Try looking for APIs to convert the string dataset into numbered dataset
def gplearn_procedure(equation_id,
                      no_samples=1000,
                      input_range=(-1, 1),
                      save_path=None,
                      save=True,
                      load=True,
                      func_set=('add', 'sub', 'mul', 'div', 'log', 'sqrt',
                                'cos', 'tan', 'sin', 'pow', 'exp'),
                      verbose=1):
    """
    Use gplearn to attempt to predict the equation form of 'equation_id'.

    Renders a graphviz image to images/gplearn/ and returns the predicted
    program, its R^2 score and the time taken by the fit.

    Parameters
    ----------
    equation_id : string
        The ID of an equation in the dataset. Must be a valid one.

    no_samples : int
        The number of samples you want fed in to the algorithm. Capped at
        the number of rows left after dropping NaNs.

    input_range : tuple(float, float)
        The minimum and maximum values of all input parameters.

    save_path : string path
        The path to where you wish to save this dataframe.

    save : boolean
        Saves file to save_path iff True.

    load : boolean
        If True then looks for the file in save_path and loads it
        preemptively if it is there.

    func_set : iterable of strings
        Names of functions / operations to consider. Any iterable works;
        the default is a tuple so it cannot be mutated between calls.
        Current options include
        'add' : addition, arity=2.
        'sub' : subtraction, arity=2.
        'mul' : multiplication, arity=2.
        'div' : protected division where a denominator near-zero
            returns 1., arity=2.
        'sqrt' : protected square root where the absolute value of the
            argument is used, arity=1.
        'log' : protected log where the absolute value of the argument is
            used and a near-zero argument returns 0., arity=1.
        'abs' : absolute value, arity=1.
        'neg' : negative, arity=1.
        'inv' : protected inverse where a near-zero argument returns 0.,
            arity=1.
        'max' : maximum, arity=2.
        'min' : minimum, arity=2.
        'sin' : sine (radians), arity=1.
        'cos' : cosine (radians), arity=1.
        'tan' : tangent (radians), arity=1.
        'exp' : exponential (self defined), arity=1.
        'pow' : power (self defined), arity=2.
        'pi' : constant (self defined), arity=0.
        Unrecognized names are skipped with a warning.

    verbose : int
        Controls how much is printed, 0 is quietest.

    Returns
    -------
    program, float, float
        The fitted gplearn program, the R^2 score on the full dataset and
        the fit time in seconds. If the dataset cannot be created, the
        traceback is printed and ``('', 0, 0)`` is returned instead.
    """
    try:
        df = create_dataset(equation_id,
                            no_samples=no_samples,
                            input_range=input_range,
                            save_path=save_path,
                            save=save,
                            load=load).dropna()
        X = df.drop('target', axis=1)
        y = df['target']
    except Exception:
        # Best effort: a broken equation must not abort a whole batch run,
        # but KeyboardInterrupt / SystemExit are allowed to propagate.
        traceback.print_exc()
        print(f"Error on equation {equation_id} skipping")
        return '', 0, 0

    # dropna() may have removed rows, so never ask for more than we have.
    no_samples = min(no_samples, len(y))

    default_func_set = ('add', 'sub', 'mul', 'div', 'log', 'sqrt', 'cos',
                        'tan', 'sin', 'abs', 'neg', 'inv', 'max', 'min')
    final_func_set = []
    for func in func_set:
        if func in default_func_set:
            # Built-in gplearn function: pass the name through.
            final_func_set.append(func)
        elif func == "pow":
            final_func_set.append(make_function(power, func, 2))
        elif func == "exp":
            final_func_set.append(make_function(exponent, func, 1))
        elif func == "pi":
            final_func_set.append(make_function(pi, func, 0))
        else:
            warnings.warn(f"{func} is an unrecognized function, skipping it")

    est_gp = SymbolicRegressor(population_size=5000,
                               generations=10,
                               stopping_criteria=0.01,
                               p_crossover=0.7,
                               p_subtree_mutation=0.1,
                               p_hoist_mutation=0.05,
                               p_point_mutation=0.1,
                               max_samples=0.9,
                               function_set=final_func_set,
                               verbose=verbose,
                               parsimony_coefficient=0.01,
                               random_state=0)

    # Only the fit itself is timed.
    start = time.time()
    est_gp.fit(X[:no_samples], y[:no_samples])
    end = time.time()

    dot_data = est_gp._program.export_graphviz()
    graph = graphviz.Source(dot_data)
    graph.render(f'images/gplearn/{equation_id}_estimate',
                 format='png',
                 cleanup=True)

    # The score is computed on the full (NaN-free) dataset, not just the
    # first no_samples rows used for fitting.
    return est_gp._program, est_gp.score(X, y), end - start
def image_path(fig_id):
    """Return the path of figure file ``fig_id`` inside the chapter's image directory."""
    return os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id)


iris = load_iris()
X = iris.data[:, 2:]  # keep only the last two feature columns
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

# Visualization
# Resolve the DOT path once so the file is read back from (and rendered next
# to) exactly the location it was written to, whatever PROJECT_ROOT_DIR and
# CHAPTER_ID are set to.
dot_path = image_path("iris_tree.dot")
export_graphviz(tree_clf,
                out_file=dot_path,
                feature_names=iris.feature_names[2:],
                class_names=iris.target_names,
                rounded=True,
                filled=True)

with open(dot_path) as f:
    dot_graph = f.read()
dot = graphviz.Source(dot_graph)
dot.format = 'png'
dot.render(filename='iris_tree', directory=os.path.dirname(dot_path))

# Predict the class probabilities and the class
tree_clf.predict_proba([[5, 1.5]])  # array([[0. , 0.90740741, 0.09259259]])
tree_clf.predict([[5, 1.5]])  # array([1])
clf = DecisionTreeClassifier(max_depth=8) # 参数max_depth设置树最大深度 # 交叉验证,评价分类器性能,此处选择的评分标准是ROC曲线下的AUC值,对应AUC更大的分类器效果更好 scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc') print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(np.mean(scores), np.std(scores))) clf = clf.fit(X_train, y_train) dot_data = tree.export_graphviz( clf, out_file=None, # doctest: +SKIP feature_names=features.head(0).columns.values.tolist(), # doctest: +SKIP #class_names=["MORE THAN", "NO MORE THAN"], # doctest: +SKIP filled=True, rounded=True, # doctest: +SKIP special_characters=True) # doctest: +SKIP graph = graphviz.Source(dot_data) # doctest: +SKIP graph # In[19]: from sklearn.learning_curve import learning_curve def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=5, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
a = plot_tree(dcs_tree, feature_names=header_names, class_names=target_names, filled=True, rounded=True, fontsize=14) plt.show()'''
# The text above is the tail of a triple-quoted string that opens before this
# fragment (disabled matplotlib plotting). Live code resumes here.
from sklearn.tree import export_graphviz
import graphviz
import pydot
import pyparsing

# Export the fitted tree to DOT and open it with Graphviz.
# NOTE(review): export_graphviz is called without out_file; this relies on the
# default returning the DOT text -- confirm for the scikit-learn version used.
plot_desc_tree = graphviz.Source(export_graphviz(dcs_tree_fit))
plot_desc_tree.view()
#plot_tree.render('dtree_render_'+clf_name,view=True)
from IPython.core.display import display
#Image(filename='decision_tree.png')
# The triple-quoted string opened below is not closed within this fragment;
# it wraps a disabled pydot/StringIO export path.
''' try: from StringIO import StringIO except ImportError: from io import StringIO dot_data = StringIO() plot_tree.export_graphviz(dcs_tree_fit, out_file=dot_data) plot_tree = pydot.graph_from_dot_data(dot_data.getvalue())
# Encode the wind-speed column as integers. The `le_cielo` encoder object is
# reused here; fit_transform refits it on this column.
inputs['velocidadViento_n'] = le_cielo.fit_transform(inputs['velocidadViento'])
print(inputs)

# Keep only the encoded (`*_n`) columns by dropping the original text columns.
inputs_n = inputs.drop(['cielo', 'ambiente', 'humedo', 'velocidadViento'], axis= 'columns')

# Get the indicator columns for each element of the encoded columns.
# NOTE(review): one_hot_data is only printed; the tree below is trained on
# inputs_n, not on this frame.
one_hot_data = pd.get_dummies(inputs[['cielo_n', 'ambiente_n', 'humedo_n', 'velocidadViento_n']])
print(one_hot_data)

# Train the tree with fit(), passing the training data and the outputs.
decisionTree = decisionTree.fit(inputs_n,juegoTennis['juego'])
# Accuracy on the training data itself.
print(decisionTree.score(inputs_n,juegoTennis['juego']))

# Use predict() to ask the trained tree for a possible answer to our problem.
decision = decisionTree.predict([[1,0,1,1]])

# Small check so that a message can be shown on screen.
# `decision` is the array returned by predict() for the single sample above.
if decision == 'si':
    print(' hoy SI se juega')
else:
    print(' hoy NO se juega')

# This part is still under development.
# The DOT text is written to 'juego.dot' and then read back for display.
dot_data = tree.export_graphviz(decisionTree, out_file='juego.dot',feature_names=list(inputs_n), class_names=['Not_Play', 'Play'], rounded=True, filled=True)
with open('juego.dot') as f:
    dot_graph=f.read()
graphviz.Source(dot_graph).view()
def knowledge_dist(num_trees, num_nearneigh, num_bins):
    """Train several classifiers and distil a random forest into a decision tree.

    Reads a CSV from a hard-coded path, trains a random forest, a binary
    decision tree, a linear SVC and a k-NN classifier, then bins the random
    forest's predicted probabilities into ``num_bins`` histogram bins and
    trains a second ("distilled") decision tree on those bin numbers.
    Accuracies, classification reports and confusion matrices are printed,
    and both decision trees are rendered to PNG files on the author's Desktop.

    Parameters
    ----------
    num_trees : int
        ``n_estimators`` for the random forest.
    num_nearneigh : int
        ``n_neighbors`` for the k-NN classifier.
    num_bins : int
        Number of histogram bins used to discretise the forest probabilities;
        also used as the threshold base when binarizing the distilled tree's
        predictions.

    Returns
    -------
    tuple
        ``(a, b)``, the return values of the nested ``accuracies()`` and
        ``classreports()`` helpers. Both end in a bare ``return``, so this is
        ``(None, None)``; the results are only printed.
    """
    #Read Data Set file
    data = pd.read_csv("/Users/oscaraguilar/Desktop/dataset2_clean.csv", sep=",", header=0)
    # Feature labels used only when plotting the trees.
    attributes = [
        'Season', 'Age @ Analysis', 'Childish diseases', 'Accident or trauma',
        'Surgical intervention', 'High fevers',
        'Frequency of alcohol consumption', 'Smoking habit',
        'Number of hours spent sitting per day'
    ]
    #Split in feature data and target data
    columns = data.shape[1]
    # Features: every column except the first and the last; target: last column.
    x = data.values[:, 1:columns - 1]
    y = data.values[:, -1]
    x = np.array(x, dtype=float)
    #Split in training, testing and validation sets
    #Generating train and test data sets, 80% for training and 20% for testing
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
    #Splitting train set in training and validation sets, 75% for
    #training and 25% for testing
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=10)
    #Building of classifiers
    clf1 = RandomForestClassifier(n_estimators=num_trees, criterion='gini', random_state=10)  #Binary Class
    clf2 = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=10, min_impurity_decrease=0.0001)  #Binary Class
    clf3 = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=10, min_impurity_decrease=0.0001)  #Multi Class
    clf4 = SVC(kernel='linear')
    clf5 = KNeighborsClassifier(n_neighbors=num_nearneigh)
    #Training Classifiers for binary and multiclass classification problems
    # clf3 is trained later, on the binned probabilities.
    clf1 = clf1.fit(x_train, y_train)
    clf2 = clf2.fit(x_train, y_train)
    clf4 = clf4.fit(x_train, y_train)
    clf5 = clf5.fit(x_train, y_train)
    #Get probabilities
    # Computed on ALL rows of x (training, validation and test rows alike).
    prob = clf1.predict_proba(x)
    #Convert to data frame
    df = pd.DataFrame(prob)
    #Drop '1' class probabilities
    p1 = df.drop(1, axis=1)  # drops the column labelled 1
    #Convert to numpy array
    p2 = np.array(p1, dtype=float)
    #Binning process
    hist, bin_edges = np.histogram(p2, bins=num_bins)
    #Retrieve the bin number of each probability
    bin_number = np.digitize(p2, bin_edges)
    #Create new data set for multiclass classification
    #prob_dataset=np.concatenate((x,bin_number), axis=1)
    #print(prob_dataset)
    #Generating train and test data sets, 70% for training and 30% for testing
    x_train2, x_test2, y_train2, y_test2 = train_test_split(x, bin_number, test_size=0.3, random_state=10)
    #Training multiclass classification decision tree
    clf3 = clf3.fit(x_train2, y_train2)
    #Tuning Testing with test sets
    # NOTE: despite the heading, the binary models are evaluated on x_val here;
    # only clf3 uses its own test split (x_test2).
    rf = clf1.predict(x_val)
    dt = clf2.predict(x_val)
    dt_multi = clf3.predict(x_test2)
    svm = clf4.predict(x_val)
    knn = clf5.predict(x_val)
    #Final testing with validation set (data never seen by the classifiers)
    # NOTE: the code uses x_test (the 20% hold-out) for this final evaluation.
    rf_val = clf1.predict(x_test)
    dt_val = clf2.predict(x_test)
    dt_multi_test = clf3.predict(x_test)
    svm_val = clf4.predict(x_test)
    knn_val = clf5.predict(x_test)
    #Binarize clf3 output into 2 classes (binary)
    # Bin numbers at or above num_bins / 2 map to class 0, the rest to class 1.
    binarize = np.where(dt_multi_test >= (num_bins / 2), 0, 1)

    #Accuracies and classification reports
    def accuracies():
        # Prints tuning/final accuracies and 10-fold CV means; returns None.
        k_fold = KFold(n_splits=10)
        a1 = accuracy_score(y_val, rf) * 100
        a2 = accuracy_score(y_val, dt) * 100
        a3 = accuracy_score(y_test2, dt_multi) * 100
        a4 = accuracy_score(y_val, svm) * 100
        a5 = accuracy_score(y_val, knn) * 100
        a6 = accuracy_score(y_test, rf_val) * 100
        a7 = accuracy_score(y_test, dt_val) * 100
        a8 = accuracy_score(y_test, binarize) * 100
        a9 = accuracy_score(y_test, svm_val) * 100
        a10 = accuracy_score(y_test, knn_val) * 100
        print('Accuracies:')
        print("Tuning Accuracy random forest = %s" % str(a1))
        print("Final Accuracy random forest = %s" % str(a6))
        score_1 = cross_val_score(clf1, x, y, cv=k_fold, n_jobs=-1)
        print('Average random forest accuracy: {} %'.format(
            np.mean(score_1) * 100))
        print('----------------------------------')
        print("Tuning Accuracy binary decision tree = %s" % str(a2))
        print("Final Accuracy binary decision tree = %s" % str(a7))
        score_2 = cross_val_score(clf2, x, y, cv=k_fold, n_jobs=-1)
        print('Average binary decision tree accuracy: {} %'.format(
            np.mean(score_2) * 100))
        print('----------------------------------')
        print("Tuning Accuracy multi class decision tree = %s" % str(a3))
        print("Final Accuracy multi class decision tree = %s" % str(a8))
        # NOTE(review): clf3 is cross-validated against the binary target y
        # here, not against the bin numbers it was trained on -- confirm intent.
        score_3 = cross_val_score(clf3, x, y, cv=k_fold, n_jobs=-1)
        print('Average multi class decision tree accuracy: {} %'.format(
            np.mean(score_3) * 100))
        print('----------------------------------')
        print("Tuning Accuracy binary SVM = %s" % str(a4))
        print("Final Accuracy binary SVM = %s" % str(a9))
        score_4 = cross_val_score(clf4, x, y, cv=k_fold, n_jobs=-1)
        print('Average SVM accuracy: {} %'.format(np.mean(score_4) * 100))
        print('----------------------------------')
        # NOTE(review): the "(10NN)" label is fixed text; the actual neighbour
        # count is num_nearneigh.
        print("Tuning Accuracy binary KNN (10NN) = %s" % str(a5))
        print("Final Accuracy binary KNN (10NN) = %s" % str(a10))
        score_5 = cross_val_score(clf5, x, y, cv=k_fold, n_jobs=-1)
        print('Average KNN accuracy: {} %'.format(np.mean(score_5) * 100))
        print('----------------------------------')
        return

    def classreports():
        # Prints a classification report and confusion matrix per model,
        # all computed on the x_test predictions; returns None.
        classes = ['0-Healthy', '1-Unhealthy']
        print('Class reports and confusion matrices')
        print('RANDOM FOREST')
        print(classification_report(y_test, rf_val, target_names=classes))
        print(confusion_matrix(y_test, rf_val))
        print('----------------------------------')
        print('BINARY DECISION TREE')
        print(classification_report(y_test, dt_val, target_names=classes))
        print(confusion_matrix(y_test, dt_val))
        print('----------------------------------')
        print('BINARIZED MULTICLASS DECISION TREE')
        print(classification_report(y_test, binarize, target_names=classes))
        print(confusion_matrix(y_test, binarize))
        print('----------------------------------')
        print('SVM')
        print(classification_report(y_test, svm_val, target_names=classes))
        print(confusion_matrix(y_test, svm_val))
        print('----------------------------------')
        print('K-NEAREST NEIGHBOURS')
        print(classification_report(y_test, knn_val, target_names=classes))
        print(confusion_matrix(y_test, knn_val))
        return

    #Plot Distilled and binary DTs
    classes2 = ['0', '1']
    # NOTE(review): fixed list of 11 labels regardless of num_bins -- confirm
    # it matches the number of classes clf3 actually learned.
    classes3 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    dot_data2 = tree.export_graphviz(clf2, out_file=None, feature_names=attributes, class_names=classes2, filled=True, rounded=True)
    graph2 = graphviz.Source(dot_data2)
    graph2.render("Binary DT", directory='/Users/oscaraguilar/Desktop', format='png')
    dot_data = tree.export_graphviz(clf3, out_file=None, feature_names=attributes, class_names=classes3, filled=True, rounded=True)
    graph = graphviz.Source(dot_data)
    graph.render("Distilled tree", directory='/Users/oscaraguilar/Desktop', format='png')
    #Plot confusion matrices
    #cm= confusion_matrix(y_test,binarize)
    #plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
    #classNames = ['Healthy','Unhealthy']
    #plt.title('DISTILLED DECISION TREE CONFUSION MATRIX')
    #plt.ylabel('True label')
    #plt.xlabel('Predicted label')
    #tick_marks = np.arange(len(classNames))
    #plt.xticks(tick_marks, classNames, rotation=45)
    #plt.yticks(tick_marks, classNames)
    #s = [['TN','FP'], ['FN', 'TP']]
    #for i in range(2):
    #for j in range(2):
    #plt.text(j,i, str(s[i][j])+" = "+str(cm[i][j]))
    #plt.savefig('/Users/oscaraguilar/Desktop')
    # Both helpers only print; a and b are None.
    a = accuracies()
    b = classreports()
    return a, b
def render(self, out_file, format='pdf', view=True, feature_names=None, filled=True, leaves_parallel=True, rotate=False, rounded=True, special_characters=False, precision=3):
    """
    Render the tree to a file via Graphviz.

    The tree is first exported to DOT text with ``self.export_graphviz``
    (as a string, no intermediate file requested), and that text is then
    handed to ``graphviz.Source(...).render``.

    Parameters
    ----------
    out_file : file name to save to

    format : string, optional, default 'pdf'
        The file format to render to; must be supported by graphviz.

    view : bool, optional, default True
        Whether to open the rendered result with the default application.

    feature_names : list of strings, optional, default None
        Names of each of the features.

    filled : bool, optional, default True
        When set to ``True``, paint nodes to indicate majority class for
        classification, extremity of values for regression, or purity of
        node for multi-output.

    leaves_parallel : bool, optional, default True
        When set to ``True``, draw all leaf nodes at the bottom of the tree.

    rotate : bool, optional, default False
        When set to ``True``, orient tree left to right rather than top-down.

    rounded : bool, optional, default True
        When set to ``True``, draw node boxes with rounded corners and use
        Helvetica fonts instead of Times-Roman.

    special_characters : bool, optional, default False
        When set to ``False``, ignore special characters for PostScript
        compatibility.

    precision : int, optional, default 3
        Number of digits of precision for floating point in the values of
        impurity, threshold and value attributes of each node.
    """
    # out_file=None: we want the DOT output as a string; only the final
    # rendered file is written.
    export_options = dict(
        out_file=None,
        feature_names=feature_names,
        filled=filled,
        leaves_parallel=leaves_parallel,
        rotate=rotate,
        rounded=rounded,
        special_characters=special_characters,
        precision=precision,
    )
    dot_text = self.export_graphviz(**export_options)
    rendered_source = graphviz.Source(dot_text)
    rendered_source.render(out_file, format=format, view=view)
# Held-out labels and samples selected by test_idx.
test_target = iris.target[test_idx]
##
test_data = iris.data[test_idx]

# 2. Train a classifier
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)

# 3. Predict label for new flower
print(test_target)  # [0,1,2]
print(clf.predict(test_data))  # spits out the same labels [0,1,2]

# 4. Visualize the tree
##
# `sklearn.externals.six` no longer ships with scikit-learn, so take StringIO
# from the standard library instead (same Python 2 / Python 3 choice that six
# made internally).
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3
import pydot

# Collect the DOT text in an in-memory buffer rather than a file on disk.
dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     impurity=False)
print("should export the tree.dot")

import graphviz as gp

graph = gp.Source(dot_data.getvalue())
graph.render("iris", view=True)
def visualise_tree(trained_tree):
    """Export ``trained_tree`` to DOT text and render it with Graphviz as "oxo"."""
    # out_file=None asks export_graphviz for the DOT source as a string.
    graphviz.Source(tree.export_graphviz(trained_tree, out_file=None)).render("oxo")
y - We can now use the `.fit()` method to train our model using the `X` and `y` data model.fit(X, y) - Now we've used data to learn a model - Let's take a look at the model we made! - The code below displays our model structure for us (like the tree we made ourselves earlier) import graphviz from sklearn.tree import export_graphviz dot_data = export_graphviz(model) graphviz.Source(export_graphviz(model, out_file=None, feature_names=X.columns, class_names=["blue", "red"], impurity=False)) - We can better visualize what's going on by actually plotting our data and the model's "decision boundaries" - The code below does just this - It uses some code I wrote myself, located in the `code` folder on Canvas - I'm also using the plotting library `altair` to make this plot (which you may not have seen). It makes very nice plots but requires some wrangling to get data into a suitable format for use with the package. **You do not need to learn to use Altair in this course**; all your plotting for this course may be done in `matplotlib` import altair as alt # altair is a plotting library import sys sys.path.append('code/') from model_plotting import plot_model, plot_regression_model, plot_tree_grid # these are some custom plotting scripts I made plot_model(X, y, model)
def train(object_name, data_dir, output_dir, train_type, classifier_type, learned_model=None, debug=False):
    """Fit (or reuse) models on recorded transition data and dump them to disk.

    NOTE(review): this block was recovered from whitespace-collapsed source;
    the nesting below is a reconstruction -- verify it against the original.
    The code is Python 2 (print statements).

    Data is loaded with ``load_data_file(object_name, data_dir)`` and indexed
    by action id; each record is a dict read through keys such as
    ``'init_joint_values'``, ``'init_gripper'``, ``'init_object'``,
    ``'reward'`` and ``'index'``.

    Behaviour depends on ``train_type``:

    * ``'gripper_status'`` -- binary classifier, label 1 for records of
      ``CLOSE_ACTION_ID`` and 0 for every other action.
    * ``'pick_success_probability'`` -- binary classifier on records of
      ``PICK_ACTION_ID``, label 1 when ``reward > 0``.
    * any value containing ``'next_state'`` -- one model per (action,
      prediction index). The value is split on ``'_'``; a part containing
      ``'action'`` or ``'pred'`` is split on ``'-'`` and everything after the
      first item selects the actions / prediction indices.

    ``classifier_type`` selects the estimator (``'DTC'``, ``'ridge'``,
    ``'SVR'``, ``'DTR'``, ``'DTRM'``, ``'Earth'``, ``'AdaLinear'``; anything
    else falls back to logistic / linear regression). When ``learned_model``
    is given, it is used instead of fitting. With ``debug`` set, mismatching
    samples are printed.

    Models are pickled with ``joblib.dump`` and summarised through
    ``write_config_in_file`` under ``output_dir``; decision trees are also
    rendered with graphviz when it can be imported.

    Returns the fitted model for the two classifier train types, a nested
    dict ``ans[action][p]`` for ``next_state``, or ``None`` when
    ``train_type`` matches none of the branches.
    """
    from sklearn import linear_model, tree
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostRegressor
    # pyearth is only needed (and only imported) for the 'Earth' model type.
    if classifier_type == 'Earth':
        from pyearth import Earth
    import numpy as np
    # graphviz is optional: without it the tree rendering step is skipped.
    have_graphviz = True
    try:
        import graphviz
    except:
        have_graphviz = False
    ans = None
    saso_data = load_data_file(object_name, data_dir)
    if train_type == 'gripper_status':
        action_str = 'gs'
        actions = range(CLOSE_ACTION_ID + 1)
        x = []
        y = []
        x_index = []
        for action in actions:
            for sasor in saso_data[action]:
                #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
                x_entry = sasor['next_joint_values']
                # Features: joint values, gripper pose, object pose, then the
                # object-minus-gripper offset for the first two pose entries.
                x_entry = x_entry + sasor['next_gripper'] + sasor['next_object']
                x_entry.append(sasor['next_object'][0] - sasor['next_gripper'][0])
                x_entry.append(sasor['next_object'][1] - sasor['next_gripper'][1])
                x.append(x_entry)
                x_index.append(sasor['index'])
                if action == CLOSE_ACTION_ID:
                    y.append(1)
                else:
                    y.append(0)  #gripper open
    if train_type == 'pick_success_probability':
        action_str = repr(PICK_ACTION_ID)
        x = []
        y = []
        x_index = []
        for sasor in saso_data[PICK_ACTION_ID]:
            #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
            x_entry = sasor['init_joint_values']
            # Only the first three gripper/object pose entries are used here.
            x_entry = x_entry + sasor['init_gripper'][0:3] + sasor[
                'init_object'][0:3]
            x_entry.append(sasor['init_object'][0] - sasor['init_gripper'][0])
            x_entry.append(sasor['init_object'][1] - sasor['init_gripper'][1])
            x.append(x_entry)
            x_index.append(sasor['index'])
            # A positive reward is treated as a successful pick.
            if sasor['reward'] > 0:
                y.append(1)
            else:
                y.append(0)
    if train_type in ['pick_success_probability', 'gripper_status']:
        if learned_model is not None:
            logistic = learned_model
        else:
            print classifier_type
            if classifier_type == 'DTC':
                logistic = DecisionTreeClassifier(criterion='entropy')
            else:
                logistic = linear_model.LogisticRegression(max_iter=400, C=1.0)
            logistic.fit(x, y)
            # NOTE(review): joblib is not imported inside this function;
            # presumably it is imported at module level -- confirm.
            joblib.dump(
                logistic,
                output_dir + '/' + classifier_type + '-' + action_str + '.pkl')
        ans = logistic
        print logistic.score(x, y)
        print logistic.get_params()
        print len(x)
        if classifier_type != 'DTC':
            # Linear model: export coefficients and intercept.
            print logistic.coef_
            print logistic.intercept_
            yaml_out = {}
            yaml_out['coef'] = logistic.coef_.tolist()[0]
            yaml_out['intercept'] = logistic.intercept_.tolist()[0]
            write_config_in_file(
                output_dir + '/' + classifier_type + '-' + action_str + ".yaml",
                yaml_out)
        else:
            print logistic.feature_importances_
            #feature_names=['t1','t2', 'j1', 'j2']
            feature_names = [
                'j1', 'j2'
            ]  #Touch not required when object coordinates are known
            feature_names = feature_names + [
                'gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'
            ][0:3]
            feature_names = feature_names + [
                'ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'
            ][0:3]
            feature_names = feature_names + ['xrel', 'yrel']
            if have_graphviz:
                dot_data = tree.export_graphviz(logistic,
                                                out_file=None,
                                                feature_names=feature_names,
                                                filled=True)
                graph = graphviz.Source(dot_data)
                graph.render(output_dir + '/' + classifier_type + '-' +
                             action_str)
            # NOTE(review): unlike the next_state branch below, these tree
            # arrays are stored without .tolist() -- confirm that
            # write_config_in_file handles them.
            yaml_out = {}
            yaml_out["max_depth"] = logistic.tree_.max_depth
            yaml_out["values"] = logistic.tree_.value
            yaml_out['n_nodes'] = logistic.tree_.node_count
            yaml_out['children_left'] = logistic.tree_.children_left
            yaml_out['children_right'] = logistic.tree_.children_right
            yaml_out['feature'] = logistic.tree_.feature
            yaml_out['threshold'] = logistic.tree_.threshold
            write_config_in_file(
                output_dir + '/' + classifier_type + '-' + action_str + ".yaml",
                yaml_out)
        if debug:
            # Print every training sample the model gets wrong.
            for i in range(0, len(x)):
                y_bar = logistic.predict([x[i]])
                if y_bar != y[i]:
                    print x_index[i]
                    print x[i]
                    print y[i]
                    print logistic.predict_proba([x[i]])
                    if classifier_type != 'DTC':
                        print logistic.decision_function([x[i]])
                        # Recompute the probability by hand from the linear
                        # score: 1 / (1 + exp(-(w . x + b))).
                        prob = (np.dot(logistic.coef_[0], x[i]) +
                                logistic.intercept_[0])
                        print prob
                        prob *= -1
                        prob = np.exp(prob)
                        prob += 1
                        prob = np.reciprocal(prob)
                        print prob
    if 'next_state' in train_type:
        actions = range(10)
        # predictions can be 18, 7 for gripper pose, 7 for object pose
        # 2 for joint values
        # 2 for touch values
        predictions = range(NUM_PREDICTIONS)
        # Optional overrides encoded in train_type.
        train_type_array = train_type.split('_')
        for s in train_type_array:
            if 'action' in s:
                actions = s.split('-')[1:]
            if 'pred' in s:
                predictions = s.split('-')[1:]
        ans = {}
        for action_ in actions:
            action = int(action_)
            x = []
            y = []
            y_c = []
            l_reg = []
            l_reg_c = []
            x_index = []
            # One target list and one model slot per prediction index.
            for i in range(0, NUM_PREDICTIONS):
                y.append([])
                y_c.append([])
                l_reg.append('')
                l_reg_c.append('')
            for sasor in saso_data[action]:
                if sasor['reward'] > -999:  #discard invalid states
                    x_entry = sasor['init_joint_values']
                    x_entry = x_entry + sasor['init_gripper'][0:3] + sasor[
                        'init_object'][0:3]
                    x_entry.append(sasor['init_object'][0] -
                                   sasor['init_gripper'][0])
                    x_entry.append(sasor['init_object'][1] -
                                   sasor['init_gripper'][1])
                    x.append(x_entry)
                    x_index.append(sasor['index'])
                    for p_ in predictions:
                        p = int(p_)
                        # y holds the value to predict; y_c holds is_correct()
                        # applied to that value and the default value.
                        y[p].append(get_prediction_value(sasor, p))
                        y_default = get_default_value(sasor, p)
                        y_c[p].append(is_correct(p, y[p][-1], y_default))
                        """ try: check_array(x) check_array(y[p]) except: print x[-1] print y[p][-1] print sasor['index'] assert(0==1) """
            print len(x)
            ans[action] = {}
            for p_ in predictions:
                p = int(p_)
                if learned_model is not None:
                    l_reg[p] = learned_model[action][p]
                else:
                    if classifier_type == 'ridge':
                        l_reg[p] = linear_model.Ridge(alpha=0.5, normalize=True)
                    elif classifier_type == 'SVR':
                        l_reg[p] = SVR(epsilon=0.2)
                    elif classifier_type in ['DTR', 'DTRM']:
                        l_reg[p] = DecisionTreeRegressor()
                    elif classifier_type == 'DTC':
                        l_reg[p] = DecisionTreeClassifier()
                    elif classifier_type == 'Earth':
                        l_reg[p] = Earth()
                    elif classifier_type == 'AdaLinear':
                        l_reg[p] = AdaBoostRegressor(
                            linear_model.LinearRegression())
                    else:
                        l_reg[p] = linear_model.LinearRegression()
                    # 'DTRM' fits all prediction targets at once (multi-output);
                    # 'DTC' fits the is_correct() labels; the rest fit target p.
                    if classifier_type == 'DTRM':
                        l_reg[p].fit(x, np.transpose(np.array(y)))
                    elif classifier_type == 'DTC':
                        l_reg[p].fit(x, y_c[p])
                    else:
                        l_reg[p].fit(x, y[p])
                    joblib.dump(
                        l_reg[p], output_dir + '/' + classifier_type + "-" +
                        repr(action) + "-" + repr(p) + '.pkl')
                ans[action][p] = l_reg[p]
                if classifier_type == 'DTRM':
                    print repr(action) + " " + repr(p) + " " + repr(
                        l_reg[p].score(x, np.transpose(np.array(y))))
                elif classifier_type == 'DTC':
                    print repr(action) + " " + repr(p) + " " + repr(
                        l_reg[p].score(x, y_c[p]))
                else:
                    print repr(action) + " " + repr(p) + " " + repr(
                        l_reg[p].score(x, y[p]))
                print l_reg[p].get_params()
                if classifier_type not in [
                        'SVR', 'DTR', 'DTRM', 'AdaLinear', 'DTC'
                ]:
                    print l_reg[p].coef_
                if classifier_type not in [
                        'DTR', 'DTRM', 'AdaLinear', 'DTC', 'Earth'
                ]:
                    print l_reg[p].intercept_
                if classifier_type in ['Earth']:
                    for j in range(0, len(x)):
                        predict_earth(l_reg[p], x[j])
                    print l_reg[p].summary()
                # Export artefacts only for freshly trained models.
                if learned_model is None:
                    if classifier_type in ['DTR', 'DTRM', 'AdaLinear', 'DTC']:
                        print l_reg[p].feature_importances_
                        feature_names = ['j1', 'j2']
                        feature_names = feature_names + [
                            'gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'
                        ][0:3]
                        feature_names = feature_names + [
                            'ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'
                        ][0:3]
                        feature_names = feature_names + ['xrel', 'yrel']
                        if have_graphviz:
                            dot_data = tree.export_graphviz(
                                l_reg[p],
                                out_file=None,
                                feature_names=feature_names,
                                filled=True)
                            graph = graphviz.Source(dot_data)
                            graph.render(output_dir + '/' + classifier_type +
                                         "-" + repr(action) + "-" + repr(p))
                        yaml_out = {}
                        yaml_out['max_depth'] = l_reg[p].tree_.max_depth
                        yaml_out["values"] = l_reg[p].tree_.value.tolist()
                        yaml_out['n_nodes'] = l_reg[p].tree_.node_count
                        yaml_out['children_left'] = l_reg[
                            p].tree_.children_left.tolist()
                        yaml_out['children_right'] = l_reg[
                            p].tree_.children_right.tolist()
                        yaml_out['feature'] = l_reg[p].tree_.feature.tolist()
                        yaml_out['threshold'] = l_reg[
                            p].tree_.threshold.tolist()
                        write_config_in_file(
                            output_dir + '/' + classifier_type + "-" +
                            repr(action) + "-" + repr(p) + ".yaml", yaml_out)
                    if classifier_type in ['Earth']:
                        yaml_out = get_yaml_earth(l_reg[p])
                        write_config_in_file(
                            output_dir + '/' + classifier_type + "-" +
                            repr(action) + "-" + repr(p) + ".yaml", yaml_out)
                if classifier_type == 'DTRM':
                    # The multi-output model covers every prediction index, so
                    # print one sample and stop after the first p.
                    i = 0
                    y_bar = l_reg[p].predict([x[i]])
                    print x_index[i]
                    print x[i]
                    y_t = np.transpose(np.array(y))
                    print repr(y_t[i]) + ' Prediction ' + repr(y_bar)
                    break
                if debug:
                    # Print every sample whose prediction is judged incorrect.
                    for i in range(0, len(x)):
                        y_bar = l_reg[p].predict([x[i]])
                        if classifier_type == 'DTC':
                            if y_bar != y_c[p][i]:
                                print x_index[i]
                                print x[i]
                                print y_c[p][i]
                                print y[p][i]
                                print l_reg[p].predict_proba([x[i]])
                        else:
                            if is_correct(p, y_bar, y[p][i]) == 0:
                                print x_index[i]
                                print x[i]
                                print repr(
                                    y[p][i]) + ' Prediction ' + repr(y_bar)
    return ans
# Write the fitted tree to "bamboo_tree.dot", labelling features with the
# ingredient column names and classes with the unique cuisine values.
export_graphviz(bamboo_tree, feature_names=list(ingredients.columns.values), out_file="bamboo_tree.dot", class_names=np.unique(cuisines), filled=True, node_ids=True, special_characters=True, impurity=False, label="all", leaves_parallel=False)

# Read the DOT text back so it can be wrapped in a graphviz.Source object.
with open("bamboo_tree.dot") as bamboo_tree_image:
    bamboo_tree_graph = bamboo_tree_image.read()
graphviz.Source(bamboo_tree_graph)  # bare expression: presumably the notebook cell output

# The decision tree learned:
# * If a recipe contains *cumin* and *fish* and **no** *yoghurt*, then it is most likely a **Thai** recipe.
# * If a recipe contains *cumin* but **no** *fish* and **no** *soy_sauce*, then it is most likely an **Indian** recipe.

# You can analyze the remaining branches of the tree to come up with similar rules for determining the cuisine of different recipes.

# Feel free to select another subset of cuisines and build a decision tree of their recipes. You can select some European cuisines and build a decision tree to explore the ingredients that differentiate them.

# # Model Evaluation <a id="4"></a>

# <img src="https://ibm.box.com/shared/static/prc3kksci2a6deks36jpyf4cf4oxh74a.png" width=500>

# To evaluate our model of Asian and Indian cuisines, we will split our dataset into a training set and a test set. We will build the decision tree using the training set. Then, we will test the model on the test set and compare the cuisines that the model predicts to the actual cuisines.
import graphviz
import os

# Fit a shallow tree and report its accuracy on the training data.
m = DecisionTreeClassifier(max_depth=3)
m.fit(Xtrain, ytrain)
m.score(Xtrain, ytrain)

# create string in .dot format
tree = export_graphviz(m,
                       out_file=None,
                       class_names=["0", "1"],
                       feature_names=['Age', 'Sex', "Pclass"],
                       impurity=False,
                       filled=True)
# Close (and therefore flush) the file before the external `dot` process
# below reads it.
with open('titanic.dot', 'w') as dot_file:
    dot_file.write(tree)
graph = graphviz.Source(tree)

# PNG conversion (tested on Ubuntu)
cmd = "dot -Tpng titanic.dot -o tree_graphviz.png"
os.system(cmd)

#### UPLOAD TO KAGGLE ####
predict = pd.read_csv('predict.csv')
# Encode the Sex column as integer codes.
predict["Sex"] = pd.factorize(predict["Sex"])[0]
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which may operate on a temporary copy.
predict["Age"] = predict["Age"].fillna(predict["Age"].mean())
Xpredict = predict[["Age", "Sex", "Pclass"]]
# NOTE(review): predictions come from `rf`, which is not defined in this
# snippet; the tree fitted above is `m` -- confirm which model is intended.
ypredict = rf.predict(Xpredict)
predict.set_index("PassengerId", inplace=True)
predict["Survived"] = ypredict
submission = predict[["Survived"]]
submission.to_csv("submissionrf.csv")
# In[5]: # fit a decision tree from sklearn import tree #import tree dari library sklearn cikampek = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5) #membuat variabel asahan sebagai decisiontree, dengan criterion fungsi mengukur kualitas split cikampek = cikampek.fit(karawang_train_att, karawang_train_pass) #training varibael cikampek dengan data dari variabel karawang. # In[6]: # visualize tree import graphviz #import library graphviz sebagai perangkat lunak visualisasi grafik open source dot_data = tree.export_graphviz(cikampek, out_file=None, label="all", impurity=False, proportion=True, feature_names=list(karawang_train_att), class_names=["fail", "pass"], filled=True, rounded=True) #mengambil data untuk diterjemahkan ke grafik graph = graphviz.Source(dot_data) #membuat variabel graph sebagai grafik yang di ambil dari dot_data graph #memanggil graph # In[7]: # save tree tree.export_graphviz(cikampek, out_file="student-performance.dot", label="all", impurity=False, proportion=True, feature_names=list(karawang_train_att), class_names=["fail", "pass"], filled=True, rounded=True) #save tree sebagai export graphviz ke file student-performance.dot # In[8]: #asahan.score(medan_test_att, medan_test_pass)
def demo(self):
    """Run a sequence of scikit-learn decision-tree demonstrations.

    In order: a two-sample toy classifier, decision surfaces for every pair
    of iris features, a graphviz export of a tree fitted on all iris
    features, a single-output regression on noisy sine data and a
    multi-output regression. Predictions of the toy classifier are printed
    and each figure is shown with ``plt.show()``. Nothing is returned.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    # --- Toy classifier on two samples ---------------------------------
    toy_clf = DecisionTreeClassifier()
    toy_clf.fit([[0, 0], [1, 1]], [0, 1])
    print(toy_clf.predict([[2, 2], [-1, -1], [0, 1]]))

    # --- Decision surfaces for each pair of iris features --------------
    n_classes = 3
    plot_colors = 'ryb'
    plot_step = 0.02

    iris = load_iris()
    feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    for pairidx, pair in enumerate(feature_pairs):
        # Only the two selected features are used for this subplot.
        X = iris.data[:, pair]
        y = iris.target

        clf = DecisionTreeClassifier()
        clf.fit(X, y)

        plt.subplot(2, 3, pairidx + 1)

        # Evaluation grid extending one unit beyond the data on each axis.
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        grid_x = np.arange(x_min, x_max, plot_step)
        grid_y = np.arange(y_min, y_max, plot_step)
        xx, yy = np.meshgrid(grid_x, grid_y)
        plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

        grid_points = np.c_[xx.ravel(), yy.ravel()]
        Z = clf.predict(grid_points).reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

        plt.xlabel(iris.feature_names[pair[0]])
        plt.ylabel(iris.feature_names[pair[1]])

        # Overlay the training points, one colour per class.
        for i, color in zip(range(n_classes), plot_colors):
            idx = np.where(y == i)
            plt.scatter(X[idx, 0], X[idx, 1], c=color,
                        label=iris.target_names[i],
                        cmap=plt.cm.RdYlBu, edgecolors='black', s=15)

    plt.suptitle(
        "Decision surface of a decision tree using paired features")
    plt.legend(loc='lower right', borderpad=0, handletextpad=0)
    plt.show()

    # --- Graphviz export of a tree fitted on all four features ---------
    import graphviz
    from sklearn import tree

    iris = load_iris()
    full_clf = DecisionTreeClassifier()
    full_clf.fit(iris.data, iris.target)
    dot_data = tree.export_graphviz(full_clf, out_file=None)
    graph = graphviz.Source(dot_data)

    # --- Single-output regression on noisy sine data -------------------
    from sklearn.tree import DecisionTreeRegressor

    X = np.sort(5 * np.random.rand(80, 1), axis=0)
    y = np.sin(X).ravel()
    # Perturb every fifth target value.
    y[::5] += 3 * (0.5 - np.random.rand(16))

    shallow = DecisionTreeRegressor(max_depth=2)
    deep = DecisionTreeRegressor(max_depth=5)
    shallow.fit(X, y)
    deep.fit(X, y)

    X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    y_1 = shallow.predict(X_test)
    y_2 = deep.predict(X_test)

    plt.figure()
    plt.scatter(X, y, s=20, edgecolor='black', c='darkorange', label='data')
    plt.plot(X_test, y_1, color='cornflowerblue', label='max_depth=2',
             linewidth=2)
    plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5",
             linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title("Decision Tree Regression")
    plt.legend()
    plt.show()

    # --- Multi-output regression (two targets per sample) --------------
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(1)
    X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)
    sin_target = np.pi * np.sin(X).ravel()
    cos_target = np.pi * np.cos(X).ravel()
    y = np.array([sin_target, cos_target]).T
    y[::5, :] += (0.5 - rng.rand(20, 2))

    regr_1 = DecisionTreeRegressor(max_depth=2)
    regr_2 = DecisionTreeRegressor(max_depth=5)
    regr_3 = DecisionTreeRegressor(max_depth=8)
    for regressor in (regr_1, regr_2, regr_3):
        regressor.fit(X, y)

    X_test = np.arange(-100.0, 100.0, 0.01)[:, np.newaxis]
    y_1 = regr_1.predict(X_test)
    y_2 = regr_2.predict(X_test)
    y_3 = regr_3.predict(X_test)

    plt.figure()
    s = 25
    series = [
        (y, "navy", "data"),
        (y_1, "cornflowerblue", "max_depth=2"),
        (y_2, "red", "max_depth=5"),
        (y_3, "orange", "max_depth=8"),
    ]
    for values, colour, legend_label in series:
        plt.scatter(values[:, 0], values[:, 1], c=colour, s=s,
                    edgecolor="black", label=legend_label)
    plt.xlim([-6, 6])
    plt.ylim([-6, 6])
    plt.xlabel("target 1")
    plt.ylabel("target 2")
    plt.title("Multi-output Decision Tree Regression")
    plt.legend(loc="best")
    plt.show()
min_samples_leaf=15, max_features='sqrt', max_leaf_nodes=12, random_state=0)
# NOTE(review): the line above is the tail of a call whose beginning lies
# outside this snippet; `tree` is presumably the estimator it constructs.
tree.fit(X_train, y_train)

# Write the fitted tree to tree.dot, labelling nodes with the wine feature names.
export_graphviz(tree,
                out_file="tree.dot",
                feature_names=wine.feature_names,
                filled=True,
                rounded=True,
                special_characters=True)  #[writes data into .dot file]

# Read the DOT text back and display it as a graph.
with open("tree.dot") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))

print("Feature importance: \n{}".format(tree.feature_importances_))


def plot_feature_importances_wine(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    One bar is drawn per column of ``wine.data`` and the y ticks are
    labelled with ``wine.feature_names``. The chart is drawn on the current
    matplotlib axes; nothing is returned.
    """
    n_features = wine.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), wine.feature_names)
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature")
    # Leave a margin below the first bar and stop at the last one.
    plt.ylim(-1, n_features)


plot_feature_importances_wine(tree)
# NOTE(review): this snippet starts mid-way through the source; the statements
# before `data_cleaning()` may belong to that function's body - confirm the
# intended nesting before relying on the layout below.
test_features = test_data[features]

# Turn the feature frames into numeric matrices via per-row dicts.
dict_vec = feature_extraction.DictVectorizer(sparse=False)
# 'records' is the orient name pandas accepts; the abbreviation 'record' is
# rejected by current pandas releases.
train_features = dict_vec.fit_transform(
    train_features.to_dict(orient='records'))
print(dict_vec.feature_names_)
new_features = dict_vec.feature_names_
# Reuse the mapping learned on the training data. Re-fitting on the test set
# could yield different columns than the ones the classifier is trained on.
test_features = dict_vec.transform(
    test_features.to_dict(orient='records'))

data_cleaning()

dtc = tree.DecisionTreeClassifier()
dtc.fit(train_features, train_labels)
predict_labels = dtc.predict(test_features)
print(predict_labels[0:5])
# Apply the format string; previously the literal "%f" was printed.
print("score %f " % dtc.score(train_features, train_labels))

# k-fold cross-validation
print("k 折交叉验证准确率:%f" % np.mean(
    cross_val_score(dtc, train_features, train_labels, cv=10)))

# Export the fitted tree with the vectorised feature names and open it.
graph_data = tree.export_graphviz(dtc,
                                  out_file=None,
                                  feature_names=new_features)
graph = graphviz.Source(graph_data)
graph.view(filename='titanic_classifier', quiet_view=False)
import os
import sys

import graphviz

import core


def usage(script_name):
    """Return the one-line command-line usage message for *script_name*."""
    return f"Usage: {script_name} <outline_filepath>"


if __name__ == "__main__":
    try:
        outline_filepath = sys.argv[1]
    except IndexError:
        print(usage(sys.argv[0]), file=sys.stderr)
        sys.exit(1)

    # Strip trailing whitespace (including the newline) from every line.
    with open(outline_filepath) as outline_file:
        lines = map(str.rstrip, outline_file.readlines())
    source = core.edges_to_dot(core.lines_to_edges(lines))

    # Drop only the final extension. Splitting on the first "." produced an
    # empty stem for paths such as "./outline.txt" or "docs.v2/outline.txt".
    graphviz_file = os.path.splitext(outline_filepath)[0]
    dot = graphviz.Source(source)

    # Optional second argument selects the output format; default is PDF.
    output_format = sys.argv[2] if len(sys.argv) > 2 else "pdf"
    dot.render(graphviz_file, format=output_format)
def create_stromcek(strat_store, strategy):
    """Tune, train, render and report a decision tree for one strategy.

    Parameters
    ----------
    strat_store : dict-like
        Must provide ``"train"`` and ``"valid"`` data frames containing the
        columns ``name``, ``address``, ``date_of_birth`` and ``class``. The
        classification report is written back under ``"report"``.
    strategy : str
        Suffix for the rendered graph file (``"strom_" + strategy``).

    Rows with missing values are dropped from both frames. Hyper-parameters
    are chosen by a 10-fold grid search, then a fresh tree is fitted with the
    best parameters and evaluated on the validation frame.
    """
    import graphviz

    data_train = strat_store["train"].dropna()
    data_valid = strat_store["valid"].dropna()
    train_labels = data_train['class']
    valid_labels = data_valid['class']

    # Identifier-like columns and the label itself are not used as features.
    columns = ['name', 'address', 'date_of_birth', 'class']
    df_train_class = pd.get_dummies(data_train.drop(columns, axis=1))
    df_valid_class = pd.get_dummies(data_valid.drop(columns, axis=1))

    # Give the validation frame exactly the training columns, in the same
    # order; dummy columns absent from the validation data are filled with 0.
    missing_cols = set(df_train_class.columns) - set(df_valid_class.columns)
    for c in missing_cols:
        df_valid_class[c] = 0
    df_valid_class = df_valid_class[df_train_class.columns]

    # NOTE(review): the max_features grid is hard-coded up to 76; confirm it
    # matches the number of dummy columns actually produced.
    parameters = {
        'criterion': ('gini', 'entropy'),
        'splitter': ('best', 'random'),
        'max_depth': range(2, 20),
        'max_features': range(1, 77, 5)
    }
    optimization = GridSearchCV(tree.DecisionTreeClassifier(),
                                parameters,
                                cv=10)
    optimization.fit(df_train_class, train_labels)
    params = optimization.best_params_

    clf = tree.DecisionTreeClassifier(criterion=params['criterion'],
                                      max_depth=params['max_depth'],
                                      max_features=params['max_features'],
                                      splitter=params['splitter'])
    clf = clf.fit(df_train_class, train_labels)
    predicted_labels = clf.predict(df_valid_class)

    # export_graphviz expects class names in ascending order of the class
    # labels. Derive them from the fitted estimator instead of the previous
    # hard-coded ["1", "0"], which contradicted the report's ["0", "1"].
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=list(df_train_class.columns),
        class_names=[str(label) for label in clf.classes_],
        filled=True,
        rounded=True,
    )
    graph = graphviz.Source(dot_data)
    graph.render("strom_" + strategy)

    strat_store["report"] = classification_report(valid_labels,
                                                  predicted_labels,
                                                  target_names=["0", "1"])
# Heat map of the pairwise feature correlations
import matplotlib.pyplot as plt
import seaborn as sns

# %matplotlib inline
df.corr()
plt.figure(figsize=(10, 10))
float_corr = df.astype("float").corr()
sns.heatmap(float_corr, cmap="OrRd", annot=True)

"""class sklearn.tree.DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)"""

# Export the fitted classifier as DOT text and wrap it for display.
from sklearn.tree import export_graphviz
import graphviz

g = export_graphviz(
    clf,
    feature_names=iris["feature_names"],
    class_names=iris["target_names"],
    filled=True,
)
graphviz.Source(g)

"""Gini index or Gini impurity measures the degree or probability of a particular variable being wrongly classified when it is randomly chosen. But what is actually meant by ‘impurity’? If all the elements belong to a single class, then it can be called pure. The degree of Gini index varies between 0 and 1, where 0 denotes that all elements belong to a certain class or if there exists only one class, and 1 denotes that the elements are randomly distributed across various classes. A Gini Index of 0.5 denotes equally distributed elements into some classes. where pi is the probability of an object being classified to a particular class. """

# Compare predictions with the true test labels side by side.
pre = clf.predict(x_test)
print(list(pre))
print(list(y_test))

from sklearn.metrics import accuracy_score

fresh_predictions = clf.predict(x_test)
accuracy_score(fresh_predictions, y_test)

# search sklearn metrics
from sklearn.metrics import confusion_matrix

test_confusion = confusion_matrix(y_test, pre)
pd.DataFrame(test_confusion)
    predict_labels(clf, X_test_imp, y_test)))
# NOTE(review): the line above is the tail of a statement that begins outside
# this snippet.

# Use k-fold cross validation to evaluate model
# (10 folds, scored with f1_scorer)
scores = cross_val_score(clf, X_all_imp, y_all, cv=10, scoring=f1_scorer)
print(scores)

# Column labels of X_all, used to label the tree nodes.
feat_names = list(X_all.columns)

#Visualize tree
# The DOT text is written to pstemp_tree.dot and read back below.
# NOTE(review): export_graphviz is documented to return None when out_file is
# given, so tree_data1 is presumably None and only used in commented-out code.
tree_data1 = tree.export_graphviz(clf,
                                  out_file='pstemp_tree.dot',
                                  feature_names=feat_names,
                                  class_names=['No PSTEMP', 'PSTEMP'],
                                  leaves_parallel=True,
                                  filled=True)
with open("pstemp_tree.dot") as f:
    dot_graph = f.read()
#print(dot_graph)

# Wrap the DOT text for rendering as PNG with the `dot` engine.
graph = gv.Source(
    source=dot_graph,
    filename='pstemp_tree',
    directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets',
    format='png',
    engine='dot')
#graph = gv.Source(tree_data1, filename='pstemp_tree', directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets', format='pdf')

# Render; filename and directory repeat the values given to gv.Source above.
graph.render(
    filename='pstemp_tree',
    directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets')
#graph.render(filename='pstemp_tree', directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets', view=True)
from sklearn import tree

# Labels shared by every labelled drawing of the classifier below.
feature_labels = ['Var', 'Skew', ' Kur', 'Ent']
class_labels = ['Org', 'Fake']

# Matplotlib drawings: unlabelled, labelled, then labelled and cut at depth 2.
tree.plot_tree(decision_tree=clsModel)
tree.plot_tree(decision_tree=clsModel,
               feature_names=feature_labels,
               class_names=class_labels,
               fontsize=12)
#not a good way to draw graphs.. other methods to be experimented
tree.plot_tree(decision_tree=clsModel,
               max_depth=2,
               feature_names=feature_labels,
               class_names=class_labels,
               fontsize=12)

# Graphviz drawings: default export, then a styled export cut at depth 3.
Source(tree.export_graphviz(clsModel))
dot_data1 = tree.export_graphviz(clsModel,
                                 max_depth=3,
                                 out_file=None,
                                 filled=True,
                                 rounded=True,
                                 special_characters=True,
                                 feature_names=feature_labels,
                                 class_names=class_labels)

#check the folder location after installing the graphviz
import os

graphviz_bin_dir = 'c:/Program Files (x86)/Graphviz2.38/bin/'
os.environ["PATH"] += os.pathsep + graphviz_bin_dir

import graphviz
from subprocess import call

# Convert tree.dot to a 600 dpi PNG with the `dot` executable.
dot_command = ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600']
call(dot_command)

graph1 = graphviz.Source(dot_data1)
graph1

#%% Regression Tree - Predict Petrol Consumption on other parameters
#Predict Numerical value based on IV
#os.listdir('E:/analytics/projects/pyanalytics/data')
#data2 = pd.read_csv('E:/analytics/projects/pyanalytics/data/petrol_consumption.csv')
petrol_csv_url = 'https://raw.githubusercontent.com/DUanalytics/pyAnalytics/master/data/petrol_consumption.csv'
data2 = pd.read_csv(petrol_csv_url)
data2.head()
data2.shape
data2.columns
#%% [markdown] ## Counting Trees # *Combinatorics* is a branch of mathematics that is about counting things. # Today we will count a kind of diagram called a *tree*. # # Although this may seem a simple idea, it turns out to be important in many # situations in both mathematics and computer science. #%% [markdown] ### Growing a tree # To grow a tree, start with a dot that we call the "root". # This is the simplest tree. #%% graphviz.Source(draw_dot(G, path='G0.png')) #%% [markdown] # You grow a tree by starting with this simple tree and adding dots and edges. # There are some rules: # - The root gets left alone. # - Every new dot you add gets connected to a dot that's already in the tree by an edge. # - You always add dots in pairs, with each of the new dots connected to the same dot that's already in the tree. # Trees always follow these rules: # - A tree has a root dot at the bottom and either zero or two lines connected to it. # - Every dot in a tree except for the root has either three lines connected to it, or one. #%% [markdown] # Here is what happens when we add two new dots to the top of our simple tree: # We get a tree with 3 dots and 2 lines.
def _evaluate_sklearn_tree(clf, train_data, train_target, test_data,
                           test_target, labels, render_name):
    """Fit *clf*, render it to *render_name* and print its test-set reports."""
    clf = clf.fit(train_data, train_target)
    dot_data = tree.export_graphviz(clf)
    graphviz.Source(dot_data).render(render_name)
    result = clf.predict(test_data)
    print(result)
    print(metrics.classification_report(test_target, result))
    print(metrics.confusion_matrix(test_target, result, labels=labels))


def _evaluate_id3(id3, train_data, train_target, attributes, classes,
                  test_data, test_target, render_name):
    """Fit the ID3 tree, render it to *render_name* and print its reports."""
    id3.fit(train_data, train_target, attributes, classes)
    id3.make_dot_data().render(render_name)
    result = id3.predict(test_data)
    print("Predicted" + str(result))
    print(metrics.classification_report(test_target, result))
    print(metrics.confusion_matrix(test_target, result))


def _bucket_pixel(value):
    """Map a pixel intensity to 'd' (< 5), 'g' (< 10) or 'l' (otherwise)."""
    if value < 5:
        return 'd'
    if value < 10:
        return 'g'
    return 'l'


def main():
    """Compare scikit-learn decision trees with the project's ID3 tree.

    The digits data set is split 70/30 in its stored order. Two scikit-learn
    trees and three ID3 trees (toy data, raw digits, bucketed digits) are
    trained; each one is rendered with graphviz and its predictions,
    classification report and confusion matrix are printed.
    """
    digits = datasets.load_digits()
    data_digits = digits.data
    target = digits.target

    # First 70% of the samples train, the rest test (no shuffling).
    sep = int(len(data_digits) * 0.7)
    train_data, test_data = data_digits[:sep], data_digits[sep:]
    train_target, test_target = target[:sep], target[sep:]

    print("------------------- SciKitLearn Tree -------------------")
    _evaluate_sklearn_tree(tree.DecisionTreeClassifier(),
                           train_data, train_target,
                           test_data, test_target,
                           digits.target_names, 'ScikitLearnTree')

    print("------------------- SciKitLearn Tree2 -------------------")
    # Rendered under its own name; it used to overwrite the first tree's
    # 'ScikitLearnTree' output.
    _evaluate_sklearn_tree(tree.DecisionTreeClassifier(min_samples_leaf=3),
                           train_data, train_target,
                           test_data, test_target,
                           digits.target_names, 'ScikitLearnTree2')

    print("------------------- My tree with ToyData -------------------")
    attributes, classes, data, target, data2, target2 = td.ToyData().get_data()
    _evaluate_id3(ID3.ID3DecisionTreeClassifier(toy=1),
                  data, target, attributes, classes,
                  data2, target2, "testTree_toyData")

    classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    print("------------------- My tree with digits -------------------")
    print(len(train_data))
    # NOTE(review): the value list starts at 1; confirm whether 0 can occur
    # in the digit data and should be listed as well.
    att_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    # All 64 attributes share the same list of possible values.
    attributes = {i: att_values for i in range(64)}
    _evaluate_id3(ID3.ID3DecisionTreeClassifier(toy=0),
                  train_data, train_target, attributes, classes,
                  test_data, test_target, "testTree_digits")

    print("------------------- My tree with d-g-l -------------------")
    att_values = ['d', 'g', 'l']
    attributes = {i: att_values for i in range(64)}
    # Replace each value by its bucket label.
    data = [[_bucket_pixel(d) for d in item] for item in data_digits]
    train_data, test_data = data[:sep], data[sep:]
    _evaluate_id3(ID3.ID3DecisionTreeClassifier(toy=0),
                  train_data, train_target, attributes, classes,
                  test_data, test_target, "testTree_digits_dgl")
def gv(s):
    """Wrap the DOT statements in *s* in a left-to-right ``digraph G``."""
    header = 'digraph G{ rankdir="LR"'
    footer = '; }'
    dot_source = header + s + footer
    return graphviz.Source(dot_source)


def get_image_files_sorted(path, recurse=True, folders=None):
    """Return ``get_image_files(path, recurse, folders)`` after ``.sorted()``."""
    image_files = get_image_files(path, recurse, folders)
    return image_files.sorted()
# Support-vector classifier: create the estimator, train it on the training
# data and classes, predict the test items and report the accuracy in percent.
modelo = SVC()
modelo.fit(treino_x, treino_y)
previsoes = modelo.predict(teste_x)
acuracia = 100 * accuracy_score(teste_y, previsoes)
print("A acurácia foi %.2f%%" % acuracia)

"""# DecisionTreeClassifier"""

# Decision tree limited to depth 3, trained on the raw feature sets.
modelo = DecisionTreeClassifier(max_depth=3)
modelo.fit(raw_treino_x, treino_y)
previsoes = modelo.predict(raw_teste_x)
acuracia = 100 * accuracy_score(teste_y, previsoes)
print("A acurácia foi %.2f%%" % acuracia)

# Column names, shown in the tree nodes.
features = x.columns

# DOT text of the tree: kept in memory (out_file=None), colour-filled nodes
# with rounded borders, and labels for the 0 / 1 classes.
opcoes_exportacao = {
    "out_file": None,
    "filled": True,
    "rounded": True,
    "feature_names": features,
    "class_names": ["não", "sim"],
}
dot_data = export_graphviz(modelo, **opcoes_exportacao)

# Build the graph and display it.
grafico = graphviz.Source(dot_data)
grafico
def draw_tree(t, df, size=10, ratio=0.6, precision=0, **kwargs):
    """Return a ``graphviz.Source`` drawing of the fitted tree *t*.

    *df* supplies the feature names through its columns. *size* and *ratio*
    are inserted as graph attributes right after the ``Tree {`` header of
    the exported DOT text. *precision* and any extra keyword arguments are
    passed on to ``export_graphviz``.
    """
    export_options = dict(
        out_file=None,
        feature_names=df.columns,
        filled=True,
        rounded=True,
        special_characters=True,
        rotate=False,
        precision=precision,
    )
    dot_source = export_graphviz(t, **export_options, **kwargs)

    # Rewrite the graph header so it also carries the size and ratio.
    sized_header = f'Tree {{ size={size}; ratio={ratio}'
    sized_source = re.sub('Tree {', sized_header, dot_source)
    return graphviz.Source(sized_source)
from __future__ import division import numpy as np import graphviz import matplotlib.pyplot as plt from sklearn.datasets import load_iris from sklearn.tree import DecisionTreeClassifier from sklearn import tree from sklearn.model_selection import train_test_split if __name__ == "__main__": iris = load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) clf = DecisionTreeClassifier() clf.fit(X_train, y_train) score = clf.score(X_test, y_test) print "score : %s" % score dot_data = tree.export_graphviz(clf, out_file=None, feature_names=iris.feature_names, class_names=iris.target_names, filled=True, rounded=True, special_characters=True) graph = graphviz.Source(dot_data) graph.render("iris")
def overwatch():
    """Train a decision tree on Overwatch match data and report accuracy.

    Reads ``./overwatch/Overwatch.csv``, keeps the team-stack, role and
    leaver columns as features and the match result as target, reduces each
    column to two categories, fits a depth-limited decision tree on 67% of
    the rows, renders the tree to ``test-output/overwatch`` and prints the
    accuracy on the remaining rows.
    """
    column_names = ['Team Stack', 'Role', 'Leaver', 'Mode', 'Map', 'Result']
    features = ['Team Stack', 'Role', 'Leaver']
    classes = ['Loss', 'Win']

    # Because explicit names are passed, the file's own title row is read as
    # data and dropped here; the values are compared as strings below.
    data = pd.read_csv("./overwatch/Overwatch.csv", names=column_names)
    data = data.iloc[1:]

    # Work on an independent copy so the assignments below modify `data`
    # itself rather than a temporary selection.
    data = data[['Team Stack', 'Role', 'Leaver', 'Result']].copy()

    # Missing stack sizes take the (rounded) column mean of 1. Use the string
    # '1' so they are binarised to 0 below like every other solo entry; the
    # integer 1 used before escaped that mapping and counted as "stacked".
    data['Team Stack'] = data['Team Stack'].fillna('1')

    # Keep only Support and Tank: drop Offense/Defense rows (and rows without
    # a role) so the role feature is binary.
    dropped_role = data['Role'].str.contains("Offense|Defense", na=True)
    data = data[~dropped_role].copy()

    # Team Stack to binary: 1 -> 0, 2-5 -> 1.
    solo = data['Team Stack'] == '1'
    stacked = data['Team Stack'].isin(['2', '3', '4', '5'])
    data.loc[solo, 'Team Stack'] = 0
    data.loc[stacked, 'Team Stack'] = 1

    # A leaver on the enemy team is treated like no leaver, which leaves two
    # categories once the column is encoded.
    data.loc[data['Leaver'] == 'Enemy team', 'Leaver'] = 'No'

    # Match result to binary: a draw counts as a loss.
    data.loc[data['Result'] == 'Draw', 'Result'] = 'Loss'

    # TODO: switch to one-hot encoding; the categories are not ordinal.
    # Encode each column's categories as integers.
    LEncoder = LabelEncoder()
    data = data.apply(LEncoder.fit_transform)

    # Features are the first three columns; the target is a 1-D column, which
    # is the shape the estimator expects for a single output.
    overwatchData = data.iloc[:, [0, 1, 2]]
    overwatchTarget = data.iloc[:, 3]

    X_train, X_test, Y_train, Y_test = train_test_split(overwatchData,
                                                        overwatchTarget,
                                                        test_size=0.33,
                                                        random_state=42)

    # max_depth restricts the model so it doesn't grow complex and overfit,
    # similar to pruning.
    classifier = tree.DecisionTreeClassifier(max_depth=5)
    classifier = classifier.fit(X_train, Y_train)

    dot_data = tree.export_graphviz(classifier,
                                    out_file=None,
                                    max_depth=4,
                                    impurity=False,
                                    proportion=True,
                                    feature_names=features,
                                    class_names=classes,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render('test-output/overwatch')

    predictions = classifier.predict(X_test)
    print("Overwatch: Accuracy is ",
          round(accuracy_score(Y_test, predictions) * 100), '%')