Example #1
print()

# Explore the target names in the dataset
print("===IRIS DATA TARGET NAMES===")
print(irisData.target_names)

# 1. Create the Model
model = tree.DecisionTreeClassifier()

# 2. Training the Model
model.fit(irisData.data, irisData.target)

# Let's test the model with a sample input
# inputData = [5.5, 2.3, 4.0, 1.3]
# predictedTarget = model.predict([inputData])
# print(predictedTarget)

inputData1 = [5.5, 2.3, 4.0, 1.3]
inputData2 = [5.43, 3.90, 1.15, 0.32]

predictedTargets = model.predict([inputData1, inputData2])
print(predictedTargets)

import graphviz
data = tree.export_graphviz(model, out_file=None)
graph = graphviz.Source(data)
graph.render("IRIS DATA SET DECISION TREE")
graph.view()

# Train the DecisionTreeClassifier with DataSet as in Session40
# Try looking for APIs to convert the string dataset into a numeric one
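# One such API is sklearn's LabelEncoder; a minimal sketch (the column values
# below are made up for illustration):
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded = le.fit_transform(["sunny", "rain", "sunny", "overcast"])
print(encoded)      # [2 1 2 0] -- classes are sorted alphabetically
print(le.classes_)  # ['overcast' 'rain' 'sunny']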
def gplearn_procedure(equation_id,
                      no_samples=1000,
                      input_range=(-1, 1),
                      save_path=None,
                      save=True,
                      load=True,
                      func_set=[
                          'add', 'sub', 'mul', 'div', 'log', 'sqrt', 'cos',
                          'tan', 'sin', 'pow', 'exp'
                      ],
                      verbose=1):
    """
    Uses gplearn to attempt to predict the equation form of 'equation_id'
    Renders a graphviz image to images/gplearn/
    returns predicted equation, R^2 score and time taken
    
    Parameters
    ----------
    equation_id : string
        The ID of an equation in the dataset. Must be a valid one

    no_samples : int 
        The number of samples you want fed in to the algorithm

    input_range: tuple(float, float)
        The minimum and maximum values of all input parameters
    save_path: string path
        The path where you wish to save this dataframe
    save: boolean
        Saves file to save_path iff True
    load: boolean
        If True, looks for a file at save_path and loads it preemptively if present

    func_set : list
        List of strings, i.e. names of functions to include / operations to consider;
        current options include
        ‘add’ : addition, arity=2.
        ‘sub’ : subtraction, arity=2.
        ‘mul’ : multiplication, arity=2.
        ‘div’ : protected division where a denominator near-zero returns 1., arity=2.
        ‘sqrt’ : protected square root where the absolute value of the argument is used, arity=1.
        ‘log’ : protected log where the absolute value of the argument is used and a near-zero argument returns 0., arity=1.
        ‘abs’ : absolute value, arity=1.
        ‘neg’ : negative, arity=1.
        ‘inv’ : protected inverse where a near-zero argument returns 0., arity=1.
        ‘max’ : maximum, arity=2.
        ‘min’ : minimum, arity=2.
        ‘sin’ : sine (radians), arity=1.
        ‘cos’ : cosine (radians), arity=1.
        ‘tan’ : tangent (radians), arity=1.

        'exp' : exponential (self defined), arity=1
        'pow' : power (self defined), arity=2

    verbose : int
        controls how much is printed; 0 is quietest

    Returns
    -------
    string, float, float
    """
    try:
        df = create_dataset(equation_id,
                            no_samples=no_samples,
                            input_range=input_range,
                            save_path=save_path,
                            save=save,
                            load=load).dropna()
        X = df.drop('target', axis=1)
        y = df['target']
    except Exception:
        traceback.print_exc()
        print(f"Error on equation {equation_id}, skipping")
        return '', 0, 0
    no_samples = min(no_samples, len(y))

    default_func_set = ('add', 'sub', 'mul', 'div', 'log', 'sqrt', 'cos',
                        'tan', 'sin', 'abs', 'neg', 'inv', 'max', 'min')
    final_func_set = []
    for func in func_set:
        if func in default_func_set:
            final_func_set.append(func)
        else:
            if func == "pow":
                final_func_set.append(make_function(power, func, 2))
            elif func == "exp":
                final_func_set.append(make_function(exponent, func, 1))
            elif func == "pi":
                final_func_set.append(make_function(pi, func, 0))
            else:
                warnings.warn(
                    f"{func} is an unrecognized function, skipping it")

    est_gp = SymbolicRegressor(population_size=5000,
                               generations=10,
                               stopping_criteria=0.01,
                               p_crossover=0.7,
                               p_subtree_mutation=0.1,
                               p_hoist_mutation=0.05,
                               p_point_mutation=0.1,
                               max_samples=0.9,
                               function_set=final_func_set,
                               verbose=verbose,
                               parsimony_coefficient=0.01,
                               random_state=0)

    start = time.time()
    hist = est_gp.fit(X[:no_samples], y[:no_samples])
    end = time.time()
    #print(est_gp._program)
    dot_data = est_gp._program.export_graphviz()
    graph = graphviz.Source(dot_data)
    graph.render(f'images/gplearn/{equation_id}_estimate',
                 format='png',
                 cleanup=True)
    return est_gp._program, est_gp.score(X, y), end - start
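# The custom callables `power` and `exponent` handed to make_function above are
# not shown in this snippet (the zero-arity `pi` helper would be analogous).
# A minimal sketch of what they might look like, vectorized and "protected"
# against overflow as gplearn expects:
import numpy as np

def power(x1, x2):
    # protected power: use |x1| as the base, zero out non-finite or huge results
    with np.errstate(over='ignore', invalid='ignore'):
        result = np.power(np.abs(x1), x2)
    return np.where(np.isfinite(result) & (np.abs(result) < 1e6), result, 0.0)

def exponent(x1):
    # protected exp: clip the argument so np.exp cannot overflow
    return np.exp(np.clip(x1, -100.0, 100.0))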
Example #3
def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id)


iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

# Visualization
export_graphviz(tree_clf,
                out_file=image_path("iris_tree.dot"),
                feature_names=iris.feature_names[2:],
                class_names=iris.target_names,
                rounded=True,
                filled=True)

with open("images/decision_trees/iris_tree.dot") as f:
    dot_graph = f.read()
dot = graphviz.Source(dot_graph)
dot.format = 'png'
dot.render(filename='iris_tree', directory='images/decision_trees/')

# Predict the class and the class probabilities
tree_clf.predict_proba([[5, 1.5]])  # array([[0.        , 0.90740741, 0.09259259]])
tree_clf.predict([[5, 1.5]])  # array([1])
Example #4
clf = DecisionTreeClassifier(max_depth=8)  # max_depth sets the maximum depth of the tree

# Cross-validate to evaluate classifier performance; the scoring metric here is the AUC under the ROC curve, and a larger AUC means a better classifier
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(np.mean(scores),
                                                       np.std(scores)))
clf = clf.fit(X_train, y_train)
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=features.head(0).columns.values.tolist(),
    #class_names=["MORE THAN", "NO MORE THAN"],
    filled=True,
    rounded=True,
    special_characters=True)
graph = graphviz.Source(dot_data)
graph

# In[19]:

from sklearn.model_selection import learning_curve  # sklearn.learning_curve was removed in newer versions


def plot_learning_curve(estimator,
                        X,
                        y,
                        ylim=(0, 1.1),
                        cv=5,
                        n_jobs=-1,
                        train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    # (function body omitted in the source snippet)
    ...
a = plot_tree(dcs_tree, 
              feature_names=header_names, 
              class_names=target_names, 
              filled=True, 
              rounded=True, 
              fontsize=14)

plt.show()

from sklearn.tree import export_graphviz
import graphviz

import pydot
import pyparsing

plot_desc_tree = graphviz.Source(export_graphviz(dcs_tree_fit))
plot_desc_tree.view()
#plot_tree.render('dtree_render_'+clf_name,view=True)

from IPython.core.display import display

#Image(filename='decision_tree.png')
# Alternative route via StringIO + pydot. Note: export_graphviz (imported
# above) does the export; pydot.graph_from_dot_data returns a list of graphs.
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

dot_data = StringIO()
export_graphviz(dcs_tree_fit, out_file=dot_data)
(pydot_graph,) = pydot.graph_from_dot_data(dot_data.getvalue())
Example #6
# note: this reuses the 'cielo' encoder; a separate LabelEncoder per column would be cleaner
inputs['velocidadViento_n'] = le_cielo.fit_transform(inputs['velocidadViento'])

print(inputs)

inputs_n = inputs.drop(['cielo', 'ambiente', 'humedo', 'velocidadViento'], axis= 'columns')
# build one-hot encodings for each element in the columns
one_hot_data = pd.get_dummies(inputs[['cielo_n', 'ambiente_n', 'humedo_n', 'velocidadViento_n']])

print(one_hot_data)

# train the tree with fit(), passing in the training data and the outputs
decisionTree = decisionTree.fit(inputs_n,juegoTennis['juego'])
print(decisionTree.score(inputs_n,juegoTennis['juego']))

# use predict() to ask the trained tree for a possible answer to our problem
decision = decisionTree.predict([[1,0,1,1]])

# simple check so we can print a message on screen
if decision == 'si':
    print('YES, we play today')
else:
    print('NO, we do not play today')

# this part is still under development
tree.export_graphviz(decisionTree, out_file='juego.dot', feature_names=list(inputs_n), class_names=['Not_Play', 'Play'], rounded=True, filled=True)

with open('juego.dot') as f:
    dot_graph=f.read()
graphviz.Source(dot_graph).view()
 
Example #7
def knowledge_dist(num_trees, num_nearneigh, num_bins):
    #Read Data Set file
    data = pd.read_csv("/Users/oscaraguilar/Desktop/dataset2_clean.csv",
                       sep=",",
                       header=0)
    attributes = [
        'Season', 'Age @ Analysis', 'Childish diseases', 'Accident or trauma',
        'Surgical intervention', 'High fevers',
        'Frequency of alcohol consumption', 'Smoking habit',
        'Number of hours spent sitting per day'
    ]

    #Split in feature data and target data
    columns = data.shape[1]
    x = data.values[:, 1:columns - 1]
    y = data.values[:, -1]
    x = np.array(x, dtype=float)

    #Split in training, testing and validation sets
    #Generating train and test data sets, 80% for training and 20% for testing
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=10)
    #Splitting the train set into training and validation sets, 75% for
    #training and 25% for validation
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.25,
                                                      random_state=10)

    #Building of classifiers
    clf1 = RandomForestClassifier(n_estimators=num_trees,
                                  criterion='gini',
                                  random_state=10)  #Binary Class
    clf2 = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  random_state=10,
                                  min_impurity_decrease=0.0001)  #Binary Class
    clf3 = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  random_state=10,
                                  min_impurity_decrease=0.0001)  #Multi Class
    clf4 = SVC(kernel='linear')
    clf5 = KNeighborsClassifier(n_neighbors=num_nearneigh)

    #Training Classifiers for binary and multiclass classification problems
    clf1 = clf1.fit(x_train, y_train)
    clf2 = clf2.fit(x_train, y_train)
    clf4 = clf4.fit(x_train, y_train)
    clf5 = clf5.fit(x_train, y_train)

    #Get probabilities
    prob = clf1.predict_proba(x)
    #Convert to data frame
    df = pd.DataFrame(prob)
    #Drop '1' class probabilities
    p1 = df.drop(1, axis=1)
    #Convert to numpy array
    p2 = np.array(p1, dtype=float)
    #Binning process
    hist, bin_edges = np.histogram(p2, bins=num_bins)
    #Retrieve the bin number of each probability
    bin_number = np.digitize(p2, bin_edges)
    #Create new data set for multiclass classification
    #prob_dataset=np.concatenate((x,bin_number), axis=1)
    #print(prob_dataset)

    #Generating train and test data sets, 70% for training and 30% for testing
    x_train2, x_test2, y_train2, y_test2 = train_test_split(x,
                                                            bin_number,
                                                            test_size=0.3,
                                                            random_state=10)
    #Training multiclass classification decision tree
    clf3 = clf3.fit(x_train2, y_train2)

    #Tuning: predictions on the validation set (dt_multi uses its own test split)
    rf = clf1.predict(x_val)
    dt = clf2.predict(x_val)
    dt_multi = clf3.predict(x_test2)
    svm = clf4.predict(x_val)
    knn = clf5.predict(x_val)

    #Final testing on the held-out test set (data never seen by the classifiers)
    rf_val = clf1.predict(x_test)
    dt_val = clf2.predict(x_test)
    dt_multi_test = clf3.predict(x_test)
    svm_val = clf4.predict(x_test)
    knn_val = clf5.predict(x_test)

    #Binarize clf3 output into 2 classes (binary)
    binarize = np.where(dt_multi_test >= (num_bins / 2), 0, 1)

    #Accuracies and classification reports
    def accuracies():
        k_fold = KFold(n_splits=10)
        a1 = accuracy_score(y_val, rf) * 100
        a2 = accuracy_score(y_val, dt) * 100
        a3 = accuracy_score(y_test2, dt_multi) * 100
        a4 = accuracy_score(y_val, svm) * 100
        a5 = accuracy_score(y_val, knn) * 100
        a6 = accuracy_score(y_test, rf_val) * 100
        a7 = accuracy_score(y_test, dt_val) * 100
        a8 = accuracy_score(y_test, binarize) * 100
        a9 = accuracy_score(y_test, svm_val) * 100
        a10 = accuracy_score(y_test, knn_val) * 100

        print('Accuracies:')
        print("Tuning Accuracy random forest = %s" % str(a1))
        print("Final Accuracy random forest = %s" % str(a6))
        score_1 = cross_val_score(clf1, x, y, cv=k_fold, n_jobs=-1)
        print('Average random forest accuracy: {} %'.format(
            np.mean(score_1) * 100))
        print('----------------------------------')
        print("Tuning Accuracy binary decision tree = %s" % str(a2))
        print("Final Accuracy binary decision tree = %s" % str(a7))
        score_2 = cross_val_score(clf2, x, y, cv=k_fold, n_jobs=-1)
        print('Average binary decision tree accuracy: {} %'.format(
            np.mean(score_2) * 100))
        print('----------------------------------')
        print("Tuning Accuracy multi class decision tree = %s" % str(a3))
        print("Final Accuracy multi class decision tree = %s" % str(a8))
        score_3 = cross_val_score(clf3, x, y, cv=k_fold, n_jobs=-1)
        print('Average multi class decision tree accuracy: {} %'.format(
            np.mean(score_3) * 100))
        print('----------------------------------')
        print("Tuning Accuracy binary SVM = %s" % str(a4))
        print("Final Accuracy binary SVM = %s" % str(a9))
        score_4 = cross_val_score(clf4, x, y, cv=k_fold, n_jobs=-1)
        print('Average SVM accuracy: {} %'.format(np.mean(score_4) * 100))
        print('----------------------------------')
        print("Tuning Accuracy binary KNN (10NN) = %s" % str(a5))
        print("Final Accuracy binary KNN (10NN) = %s" % str(a10))
        score_5 = cross_val_score(clf5, x, y, cv=k_fold, n_jobs=-1)
        print('Average KNN accuracy: {} %'.format(np.mean(score_5) * 100))
        print('----------------------------------')
        return

    def classreports():
        classes = ['0-Healthy', '1-Unhealthy']
        print('Class reports and confusion matrices')
        print('RANDOM FOREST')
        print(classification_report(y_test, rf_val, target_names=classes))
        print(confusion_matrix(y_test, rf_val))
        print('----------------------------------')
        print('BINARY DECISION TREE')
        print(classification_report(y_test, dt_val, target_names=classes))
        print(confusion_matrix(y_test, dt_val))
        print('----------------------------------')
        print('BINARIZED MULTICLASS DECISION TREE')
        print(classification_report(y_test, binarize, target_names=classes))
        print(confusion_matrix(y_test, binarize))
        print('----------------------------------')
        print('SVM')
        print(classification_report(y_test, svm_val, target_names=classes))
        print(confusion_matrix(y_test, svm_val))
        print('----------------------------------')
        print('K-NEAREST NEIGHBOURS')
        print(classification_report(y_test, knn_val, target_names=classes))
        print(confusion_matrix(y_test, knn_val))
        return

    #Plot Distilled and binary DTs
    classes2 = ['0', '1']
    classes3 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

    dot_data2 = tree.export_graphviz(clf2,
                                     out_file=None,
                                     feature_names=attributes,
                                     class_names=classes2,
                                     filled=True,
                                     rounded=True)

    graph2 = graphviz.Source(dot_data2)
    graph2.render("Binary DT",
                  directory='/Users/oscaraguilar/Desktop',
                  format='png')

    dot_data = tree.export_graphviz(clf3,
                                    out_file=None,
                                    feature_names=attributes,
                                    class_names=classes3,
                                    filled=True,
                                    rounded=True)

    graph = graphviz.Source(dot_data)
    graph.render("Distilled tree",
                 directory='/Users/oscaraguilar/Desktop',
                 format='png')

    #Plot confusion matrices
    #cm= confusion_matrix(y_test,binarize)
    #plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
    #classNames = ['Healthy','Unhealthy']
    #plt.title('DISTILLED DECISION TREE CONFUSION MATRIX')
    #plt.ylabel('True label')
    #plt.xlabel('Predicted label')
    #tick_marks = np.arange(len(classNames))
    #plt.xticks(tick_marks, classNames, rotation=45)
    #plt.yticks(tick_marks, classNames)
    #s = [['TN','FP'], ['FN', 'TP']]
    #for i in range(2):
    #for j in range(2):
    #plt.text(j,i, str(s[i][j])+" = "+str(cm[i][j]))
    #plt.savefig('/Users/oscaraguilar/Desktop')

    a = accuracies()
    b = classreports()
    return a, b
Example #8
    def render(self,
               out_file,
               format='pdf',
               view=True,
               feature_names=None,
               filled=True,
               leaves_parallel=True,
               rotate=False,
               rounded=True,
               special_characters=False,
               precision=3):
        """
        Render the tree to a file

        Parameters
        ----------
        out_file : file name to save to

        format : string, optional, default 'pdf'
            The file format to render to; must be supported by graphviz

        view : bool, optional, default True
            Whether to open the rendered result with the default application.

        feature_names : list of strings, optional, default None
            Names of each of the features.

        filled : bool, optional, default True
            When set to ``True``, paint nodes to indicate majority class for
            classification, extremity of values for regression, or purity of node
            for multi-output.

        leaves_parallel : bool, optional, default True
            When set to ``True``, draw all leaf nodes at the bottom of the tree.

        rotate : bool, optional, default False
            When set to ``True``, orient tree left to right rather than top-down.

        rounded : bool, optional, default True
            When set to ``True``, draw node boxes with rounded corners and use
            Helvetica fonts instead of Times-Roman.

        special_characters : bool, optional, default False
            When set to ``False``, ignore special characters for PostScript
            compatibility.

        precision : int, optional, default 3
            Number of digits of precision for floating point in the values of
            impurity, threshold and value attributes of each node.
        """
        dot_source = self.export_graphviz(
            out_file=None,  # we want the output as a string; only write the final file
            feature_names=feature_names,
            filled=filled,
            leaves_parallel=leaves_parallel,
            rotate=rotate,
            rounded=rounded,
            special_characters=special_characters,
            precision=precision)
        graphviz.Source(dot_source).render(out_file, format=format, view=view)
Example #9
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]

# 2. Train a classifier
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)

# 3. Predict label for new flower
print(test_target)  # [0,1,2]
print(clf.predict(test_data))  # spits out the same labels [0,1,2]

# 4. Visualize the tree
from io import StringIO  # sklearn.externals.six was removed from newer sklearn versions
import pydot
dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     impurity=False)

print("should export the tree.dot")

import graphviz as gp
graph = gp.Source(dot_data.getvalue())
graph.render("iris", view=True)
Example #10
def visualise_tree(trained_tree):
    dot_data = tree.export_graphviz(trained_tree, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("oxo")
Example #11
y

- We can now use the `.fit()` method to train our model using the `X` and `y` data

model.fit(X, y)

- Now we've used data to learn a model
- Let's take a look at the model we made!
- The code below prints out our model structure for us (like the tree we made ourselves earlier)

import graphviz
from sklearn.tree import export_graphviz
dot_data = export_graphviz(model)
graphviz.Source(export_graphviz(model,
                                out_file=None,
                                feature_names=X.columns,
                                class_names=["blue", "red"],
                                impurity=False))

- We can better visualize what's going on by actually plotting our data and the model "decision boundaries"
- The code below does just this
- It's using some code I have made myself located in the code folder on Canvas
- I'm also using the plotting library `altair` to make this plot (which you may not have seen). It makes very nice plots but requires some wrangling to get data into a suitable format for use with the package. **You do not need to learn to use Altair in this course**, all your plotting for this course may be done in `matplotlib`

import altair as alt # altair is a plotting library
import sys
sys.path.append('code/')
from model_plotting import plot_model, plot_regression_model, plot_tree_grid # these are some custom plotting scripts I made

plot_model(X, y, model)
def train(object_name,
          data_dir,
          output_dir,
          train_type,
          classifier_type,
          learned_model=None,
          debug=False):
    from sklearn import linear_model, tree
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostRegressor
    if classifier_type == 'Earth':
        from pyearth import Earth
    import numpy as np
    have_graphviz = True
    try:
        import graphviz
    except ImportError:
        have_graphviz = False
    ans = None
    saso_data = load_data_file(object_name, data_dir)
    if train_type == 'gripper_status':
        action_str = 'gs'
        actions = range(CLOSE_ACTION_ID + 1)
        x = []
        y = []
        x_index = []
        for action in actions:
            for sasor in saso_data[action]:
                #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
                x_entry = sasor['next_joint_values']
                x_entry = x_entry + sasor['next_gripper'] + sasor['next_object']
                x_entry.append(sasor['next_object'][0] -
                               sasor['next_gripper'][0])
                x_entry.append(sasor['next_object'][1] -
                               sasor['next_gripper'][1])
                x.append(x_entry)
                x_index.append(sasor['index'])
                if action == CLOSE_ACTION_ID:
                    y.append(1)
                else:
                    y.append(0)  #gripper open
    if train_type == 'pick_success_probability':
        action_str = repr(PICK_ACTION_ID)
        x = []
        y = []
        x_index = []
        for sasor in saso_data[PICK_ACTION_ID]:
            #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
            x_entry = sasor['init_joint_values']
            x_entry = x_entry + sasor['init_gripper'][0:3] + sasor[
                'init_object'][0:3]
            x_entry.append(sasor['init_object'][0] - sasor['init_gripper'][0])
            x_entry.append(sasor['init_object'][1] - sasor['init_gripper'][1])
            x.append(x_entry)
            x_index.append(sasor['index'])
            if sasor['reward'] > 0:
                y.append(1)
            else:
                y.append(0)
    if train_type in ['pick_success_probability', 'gripper_status']:
        if learned_model is not None:
            logistic = learned_model
        else:
            print(classifier_type)
            if classifier_type == 'DTC':
                logistic = DecisionTreeClassifier(criterion='entropy')
            else:
                logistic = linear_model.LogisticRegression(max_iter=400, C=1.0)
            logistic.fit(x, y)
            joblib.dump(
                logistic,
                output_dir + '/' + classifier_type + '-' + action_str + '.pkl')
        ans = logistic
        print(logistic.score(x, y))
        print(logistic.get_params())
        print(len(x))
        if classifier_type != 'DTC':
            print(logistic.coef_)
            print(logistic.intercept_)
            yaml_out = {}
            yaml_out['coef'] = logistic.coef_.tolist()[0]
            yaml_out['intercept'] = logistic.intercept_.tolist()[0]
            write_config_in_file(
                output_dir + '/' + classifier_type + '-' + action_str +
                ".yaml", yaml_out)
        else:
            print(logistic.feature_importances_)

            #feature_names=['t1','t2', 'j1', 'j2']
            feature_names = [
                'j1', 'j2'
            ]  #Touch not required when object coordinates are known
            feature_names = feature_names + [
                'gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'
            ][0:3]
            feature_names = feature_names + [
                'ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'
            ][0:3]
            feature_names = feature_names + ['xrel', 'yrel']
            if have_graphviz:
                dot_data = tree.export_graphviz(logistic,
                                                out_file=None,
                                                feature_names=feature_names,
                                                filled=True)
                graph = graphviz.Source(dot_data)
                graph.render(output_dir + '/' + classifier_type + '-' +
                             action_str)
            yaml_out = {}
            yaml_out["max_depth"] = logistic.tree_.max_depth
            yaml_out["values"] = logistic.tree_.value
            yaml_out['n_nodes'] = logistic.tree_.node_count
            yaml_out['children_left'] = logistic.tree_.children_left
            yaml_out['children_right'] = logistic.tree_.children_right
            yaml_out['feature'] = logistic.tree_.feature
            yaml_out['threshold'] = logistic.tree_.threshold
            write_config_in_file(
                output_dir + '/' + classifier_type + '-' + action_str +
                ".yaml", yaml_out)
        if debug:
            for i in range(0, len(x)):
                y_bar = logistic.predict([x[i]])
                if y_bar != y[i]:
                    print(x_index[i])
                    print(x[i])
                    print(y[i])
                    print(logistic.predict_proba([x[i]]))
                    if classifier_type != 'DTC':
                        print(logistic.decision_function([x[i]]))
                        prob = (np.dot(logistic.coef_[0], x[i]) +
                                logistic.intercept_[0])
                        print(prob)
                        prob *= -1
                        prob = np.exp(prob)
                        prob += 1
                        prob = np.reciprocal(prob)
                        print(prob)
    if 'next_state' in train_type:
        actions = range(10)

        #  predictions can be 18: 7 for gripper pose, 7 for object pose,
        # 2 for joint values,
        # 2 for touch values
        predictions = range(NUM_PREDICTIONS)

        train_type_array = train_type.split('_')
        for s in train_type_array:
            if 'action' in s:
                actions = s.split('-')[1:]
            if 'pred' in s:
                predictions = s.split('-')[1:]
        ans = {}
        for action_ in actions:
            action = int(action_)
            x = []
            y = []
            y_c = []
            l_reg = []
            l_reg_c = []
            x_index = []
            for i in range(0, NUM_PREDICTIONS):
                y.append([])
                y_c.append([])
                l_reg.append('')
                l_reg_c.append('')
            for sasor in saso_data[action]:
                if sasor['reward'] > -999:  #discard invalid states
                    x_entry = sasor['init_joint_values']
                    x_entry = x_entry + sasor['init_gripper'][0:3] + sasor[
                        'init_object'][0:3]
                    x_entry.append(sasor['init_object'][0] -
                                   sasor['init_gripper'][0])
                    x_entry.append(sasor['init_object'][1] -
                                   sasor['init_gripper'][1])
                    x.append(x_entry)
                    x_index.append(sasor['index'])
                    for p_ in predictions:
                        p = int(p_)
                        y[p].append(get_prediction_value(sasor, p))
                        y_default = get_default_value(sasor, p)
                        y_c[p].append(is_correct(p, y[p][-1], y_default))
                        """
                        try:
                            check_array(x)
                            check_array(y[p])
                        except:
                            print x[-1]
                            print y[p][-1]
                            print sasor['index']
                            assert(0==1)
                        """

            print(len(x))
            ans[action] = {}

            for p_ in predictions:
                p = int(p_)
                if learned_model is not None:
                    l_reg[p] = learned_model[action][p]
                else:
                    if classifier_type == 'ridge':
                        l_reg[p] = linear_model.Ridge(alpha=0.5,
                                                      normalize=True)
                    elif classifier_type == 'SVR':
                        l_reg[p] = SVR(epsilon=0.2)
                    elif classifier_type in ['DTR', 'DTRM']:
                        l_reg[p] = DecisionTreeRegressor()
                    elif classifier_type == 'DTC':
                        l_reg[p] = DecisionTreeClassifier()
                    elif classifier_type == 'Earth':
                        l_reg[p] = Earth()
                    elif classifier_type == 'AdaLinear':
                        l_reg[p] = AdaBoostRegressor(
                            linear_model.LinearRegression())
                    else:
                        l_reg[p] = linear_model.LinearRegression()
                    if classifier_type == 'DTRM':
                        l_reg[p].fit(x, np.transpose(np.array(y)))
                    elif classifier_type == 'DTC':
                        l_reg[p].fit(x, y_c[p])
                    else:
                        l_reg[p].fit(x, y[p])
                    joblib.dump(
                        l_reg[p], output_dir + '/' + classifier_type + "-" +
                        repr(action) + "-" + repr(p) + '.pkl')
                ans[action][p] = l_reg[p]

                if classifier_type == 'DTRM':
                    print(repr(action) + " " + repr(p) + " " +
                          repr(l_reg[p].score(x, np.transpose(np.array(y)))))
                elif classifier_type == 'DTC':
                    print(repr(action) + " " + repr(p) + " " +
                          repr(l_reg[p].score(x, y_c[p])))
                else:
                    print(repr(action) + " " + repr(p) + " " +
                          repr(l_reg[p].score(x, y[p])))
                print(l_reg[p].get_params())
                if classifier_type not in [
                        'SVR', 'DTR', 'DTRM', 'AdaLinear', 'DTC'
                ]:
                    print(l_reg[p].coef_)
                if classifier_type not in [
                        'DTR', 'DTRM', 'AdaLinear', 'DTC', 'Earth'
                ]:
                    print(l_reg[p].intercept_)
                if classifier_type in ['Earth']:
                    for j in range(0, len(x)):
                        predict_earth(l_reg[p], x[j])
                    print(l_reg[p].summary())
                if learned_model is None:
                    if classifier_type in ['DTR', 'DTRM', 'AdaLinear', 'DTC']:

                        print(l_reg[p].feature_importances_)

                        feature_names = ['j1', 'j2']
                        feature_names = feature_names + [
                            'gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'
                        ][0:3]
                        feature_names = feature_names + [
                            'ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'
                        ][0:3]
                        feature_names = feature_names + ['xrel', 'yrel']
                        if have_graphviz:
                            dot_data = tree.export_graphviz(
                                l_reg[p],
                                out_file=None,
                                feature_names=feature_names,
                                filled=True)
                            graph = graphviz.Source(dot_data)
                            graph.render(output_dir + '/' + classifier_type +
                                         "-" + repr(action) + "-" + repr(p))
                        yaml_out = {}
                        yaml_out['max_depth'] = l_reg[p].tree_.max_depth
                        yaml_out["values"] = l_reg[p].tree_.value.tolist()
                        yaml_out['n_nodes'] = l_reg[p].tree_.node_count
                        yaml_out['children_left'] = l_reg[
                            p].tree_.children_left.tolist()
                        yaml_out['children_right'] = l_reg[
                            p].tree_.children_right.tolist()
                        yaml_out['feature'] = l_reg[p].tree_.feature.tolist()
                        yaml_out['threshold'] = l_reg[
                            p].tree_.threshold.tolist()
                        write_config_in_file(
                            output_dir + '/' + classifier_type + "-" +
                            repr(action) + "-" + repr(p) + ".yaml", yaml_out)
                    if classifier_type in ['Earth']:
                        yaml_out = get_yaml_earth(l_reg[p])
                        write_config_in_file(
                            output_dir + '/' + classifier_type + "-" +
                            repr(action) + "-" + repr(p) + ".yaml", yaml_out)

                if classifier_type == 'DTRM':
                    i = 0
                    y_bar = l_reg[p].predict([x[i]])
                    print(x_index[i])
                    print(x[i])
                    y_t = np.transpose(np.array(y))
                    print(repr(y_t[i]) + ' Prediction ' + repr(y_bar))
                    break
                if debug:
                    for i in range(0, len(x)):
                        y_bar = l_reg[p].predict([x[i]])
                        if classifier_type == 'DTC':
                            if y_bar != y_c[p][i]:
                                print(x_index[i])
                                print(x[i])
                                print(y_c[p][i])
                                print(y[p][i])
                                print(l_reg[p].predict_proba([x[i]]))
                        else:
                            if is_correct(p, y_bar, y[p][i]) == 0:
                                print(x_index[i])
                                print(x[i])
                                print(repr(y[p][i]) + ' Prediction ' + repr(y_bar))

    return ans

export_graphviz(bamboo_tree,
                feature_names=list(ingredients.columns.values),
                out_file="bamboo_tree.dot",
                class_names=np.unique(cuisines),
                filled=True,
                node_ids=True,
                special_characters=True,
                impurity=False,
                label="all",
                leaves_parallel=False)

with open("bamboo_tree.dot") as bamboo_tree_image:
    bamboo_tree_graph = bamboo_tree_image.read()
graphviz.Source(bamboo_tree_graph)


# The decision tree learned:
# * If a recipe contains *cumin* and *fish* and **no** *yoghurt*, then it is most likely a **Thai** recipe.
# * If a recipe contains *cumin* but **no** *fish* and **no** *soy_sauce*, then it is most likely an **Indian** recipe.

# You can analyze the remaining branches of the tree to come up with similar rules for determining the cuisine of different recipes. 

# Feel free to select another subset of cuisines and build a decision tree of their recipes. You can select some European cuisines and build a decision tree to explore the ingredients that differentiate them.
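# Rules like the ones above can also be read off programmatically; a small
# sketch using sklearn's export_text on the same fitted bamboo_tree:
from sklearn.tree import export_text

rules = export_text(bamboo_tree, feature_names=list(ingredients.columns.values))
print(rules)  # indented if/else view of the learned splits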

# # Model Evaluation <a id="4"></a>

# <img src="https://ibm.box.com/shared/static/prc3kksci2a6deks36jpyf4cf4oxh74a.png" width=500>

# To evaluate our model of Asian and Indian cuisines, we will split our dataset into a training set and a test set. We will build the decision tree using the training set. Then, we will test the model on the test set and compare the cuisines that the model predicts to the actual cuisines. 
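# A minimal sketch of that evaluation, reusing the notebook's names
# (ingredients, cuisines); the test fraction, random_state and max_depth
# below are assumptions:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    ingredients, cuisines, test_size=0.3, random_state=42)
eval_tree = DecisionTreeClassifier(max_depth=3)
eval_tree.fit(X_train, y_train)
print("test accuracy:", eval_tree.score(X_test, y_test))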
Example #14
import graphviz
import os

m = DecisionTreeClassifier(max_depth=3)
m.fit(Xtrain, ytrain)
m.score(Xtrain, ytrain)

# create string in .dot format
tree = export_graphviz(m, out_file=None, 
                class_names=["0", "1"],
                feature_names=['Age', 'Sex', "Pclass"],
                impurity=False,
                filled=True)
with open('titanic.dot', 'w') as f:
    f.write(tree)

graph = graphviz.Source(tree)
# PNG conversion (tested on Ubuntu)
cmd = "dot -Tpng titanic.dot -o tree_graphviz.png"
os.system(cmd)


#### UPLOAD TO KAGGLE ####
predict = pd.read_csv('predict.csv')
predict["Sex"] = pd.factorize(predict["Sex"])[0]
predict['Age'].fillna(predict["Age"].mean(), inplace = True)
Xpredict = predict[["Age", "Sex", "Pclass"]]
ypredict = rf.predict(Xpredict)  # rf: a random-forest model fitted elsewhere in the notebook
predict.set_index("PassengerId", inplace = True)
predict["Survived"] = ypredict
submission = predict[["Survived"]]
submission.to_csv("submissionrf.csv")
Example #15
# In[5]:

# fit a decision tree 
from sklearn import tree  # import tree from the sklearn library
cikampek = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)  # create cikampek as a decision tree; criterion is the function that measures split quality
cikampek = cikampek.fit(karawang_train_att, karawang_train_pass)  # train cikampek on the data from the karawang variables


# In[6]:

# visualize tree
import graphviz  # import graphviz, an open-source graph-visualization library
dot_data = tree.export_graphviz(cikampek, out_file=None, label="all", impurity=False, proportion=True,
                                feature_names=list(karawang_train_att), class_names=["fail", "pass"],
                                filled=True, rounded=True)  # export the tree data to be drawn as a graph
graph = graphviz.Source(dot_data)  # build the graph object from dot_data
graph  # display the graph


# In[7]:

# save tree
tree.export_graphviz(cikampek, out_file="student-performance.dot", label="all", impurity=False, proportion=True,
                     feature_names=list(karawang_train_att), class_names=["fail", "pass"], 
                     filled=True, rounded=True)  # save the tree by exporting graphviz output to student-performance.dot


# In[8]:

#asahan.score(medan_test_att, medan_test_pass)
Example #16
    def demo(self):
        import numpy as np
        import matplotlib.pyplot as plt
        from sklearn.datasets import load_iris
        from sklearn.tree import DecisionTreeClassifier
        X = [[0, 0], [1, 1]]
        y = [0, 1]
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        print(clf.predict([[2, 2], [-1, -1], [0, 1]]))

        # Parameters
        n_classes = 3
        plot_colors = 'ryb'
        plot_step = 0.02

        # Load data
        iris = load_iris()

        for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3],
                                        [2, 3]]):
            # We only take the two corresponding features
            X = iris.data[:, pair]
            y = iris.target

            # Train
            clf = DecisionTreeClassifier()
            clf.fit(X, y)

            # Plot the decision boundary
            plt.subplot(2, 3, pairidx + 1)

            x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
            y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                                 np.arange(y_min, y_max, plot_step))

            plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

            plt.xlabel(iris.feature_names[pair[0]])
            plt.ylabel(iris.feature_names[pair[1]])

            # Plot the training points
            for i, color in zip(range(n_classes), plot_colors):
                idx = np.where(y == i)
                plt.scatter(X[idx, 0],
                            X[idx, 1],
                            c=color,
                            label=iris.target_names[i],
                            cmap=plt.cm.RdYlBu,
                            edgecolors='black',
                            s=15)

        plt.suptitle(
            "Decision surface of a decision tree using paired features")
        plt.legend(loc='lower right', borderpad=0, handletextpad=0)
        plt.show()

        import graphviz
        from sklearn import tree
        iris = load_iris()
        clf = DecisionTreeClassifier()
        clf.fit(iris.data, iris.target)
        dot_data = tree.export_graphviz(clf, out_file=None)
        graph = graphviz.Source(dot_data)

        from sklearn.tree import DecisionTreeRegressor
        # Create a random dataset
        X = np.sort(5 * np.random.rand(80, 1), axis=0)
        y = np.sin(X).ravel()
        # Add noise
        y[::5] += 3 * (0.5 - np.random.rand(16))

        # Fit regression model
        regr_1 = DecisionTreeRegressor(max_depth=2)
        regr_2 = DecisionTreeRegressor(max_depth=5)
        regr_1.fit(X, y)
        regr_2.fit(X, y)

        # Predict
        X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
        y_1 = regr_1.predict(X_test)
        y_2 = regr_2.predict(X_test)

        # Plot the results
        plt.figure()
        plt.scatter(X,
                    y,
                    s=20,
                    edgecolor='black',
                    c='darkorange',
                    label='data')
        plt.plot(X_test,
                 y_1,
                 color='cornflowerblue',
                 label='max_depth=2',
                 linewidth=2)
        plt.plot(X_test,
                 y_2,
                 color="yellowgreen",
                 label="max_depth=5",
                 linewidth=2)
        plt.xlabel("data")
        plt.ylabel("target")
        plt.title("Decision Tree Regression")
        plt.legend()
        plt.show()

        import numpy as np
        import matplotlib.pyplot as plt
        from sklearn.tree import DecisionTreeRegressor
        # Create a random dataset
        rng = np.random.RandomState(1)
        X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)
        y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
        y[::5, :] += (0.5 - rng.rand(20, 2))

        # Fit regression model
        regr_1 = DecisionTreeRegressor(max_depth=2)
        regr_2 = DecisionTreeRegressor(max_depth=5)
        regr_3 = DecisionTreeRegressor(max_depth=8)
        regr_1.fit(X, y)
        regr_2.fit(X, y)
        regr_3.fit(X, y)

        # Predict
        X_test = np.arange(-100.0, 100.0, 0.01)[:, np.newaxis]
        y_1 = regr_1.predict(X_test)
        y_2 = regr_2.predict(X_test)
        y_3 = regr_3.predict(X_test)

        # Plot the results
        plt.figure()
        s = 25
        plt.scatter(y[:, 0],
                    y[:, 1],
                    c="navy",
                    s=s,
                    edgecolor="black",
                    label="data")
        plt.scatter(y_1[:, 0],
                    y_1[:, 1],
                    c="cornflowerblue",
                    s=s,
                    edgecolor="black",
                    label="max_depth=2")
        plt.scatter(y_2[:, 0],
                    y_2[:, 1],
                    c="red",
                    s=s,
                    edgecolor="black",
                    label="max_depth=5")
        plt.scatter(y_3[:, 0],
                    y_3[:, 1],
                    c="orange",
                    s=s,
                    edgecolor="black",
                    label="max_depth=8")
        plt.xlim([-6, 6])
        plt.ylim([-6, 6])
        plt.xlabel("target 1")
        plt.ylabel("target 2")
        plt.title("Multi-output Decision Tree Regression")
        plt.legend(loc="best")
        plt.show()
# (the head of this snippet is truncated in the source; it presumably began
# with a constructor call along these lines)
tree = DecisionTreeClassifier(min_samples_leaf=15,
                              max_features='sqrt',
                              max_leaf_nodes=12,
                              random_state=0)
tree.fit(X_train, y_train)

export_graphviz(tree,
                out_file="tree.dot",
                feature_names=wine.feature_names,
                filled=True,
                rounded=True,
                special_characters=True)  # writes the tree into the .dot file

with open("tree.dot") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))

print("Feature importance: \n{}".format(tree.feature_importances_))


def plot_feature_importances_wine(model):
    n_features = wine.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), wine.feature_names)
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)


plot_feature_importances_wine(tree)
Example #18
    test_features = test_data[features]
    dict_vec = feature_extraction.DictVectorizer(sparse=False)
    train_features = dict_vec.fit_transform(
        train_features.to_dict(orient='records'))
    print(dict_vec.feature_names_)
    new_features = dict_vec.feature_names_

    # use transform (not fit_transform) so the test features share the training vocabulary
    test_features = dict_vec.transform(
        test_features.to_dict(orient='records'))


data_cleaning()

dtc = tree.DecisionTreeClassifier()
dtc.fit(train_features, train_labels)

predict_labels = dtc.predict(test_features)
print(predict_labels[0:5])
print("score %f ", dtc.score(train_features, train_labels))

# k-fold cross-validation
print("k-fold cross-validation accuracy: %f" %
      np.mean(cross_val_score(dtc, train_features, train_labels, cv=10)))

graph_data = tree.export_graphviz(dtc,
                                  out_file=None,
                                  feature_names=new_features)

graph = graphviz.Source(graph_data)
graph.view(filename='titanic_classifier', quiet_view=False)
Example #19
import sys

import graphviz

import core


def usage(script_name):
    return f"Usage: {script_name} <outline_filepath>"


if __name__ == "__main__":
    try:
        outline_filepath = sys.argv[1]
    except IndexError:
        print(usage(sys.argv[0]), file=sys.stderr)
        sys.exit(1)

    with open(outline_filepath) as outline_file:
        lines = map(lambda s: s.rstrip(), outline_file.readlines())
        source = core.edges_to_dot(core.lines_to_edges(lines))
        graphviz_file = outline_filepath.split(".")[0]
        dot = graphviz.Source(source)
        format = sys.argv[2] if len(sys.argv) > 2 else "pdf"
        dot.render(graphviz_file, format=format)
Example #20
def create_stromcek(strat_store, strategy):
    data_train = strat_store["train"].dropna()
    train_labels = data_train['class']

    data_valid = strat_store["valid"].dropna()
    valid_labels = data_valid['class']

    columns = ['name', 'address', 'date_of_birth', 'class']

    df_train_class = data_train.drop(columns, axis=1)
    df_train_class = pd.get_dummies(df_train_class)

    df_valid_class = data_valid.drop(columns, axis=1)
    df_valid_class = pd.get_dummies(df_valid_class)

    missing_cols = set(df_train_class.columns) - set(df_valid_class.columns)

    for c in missing_cols:
        df_valid_class[c] = 0

    df_valid_class = df_valid_class[df_train_class.columns]

    clf = tree.DecisionTreeClassifier()

    parameters = {
        'criterion': ('gini', 'entropy'),
        'splitter': ('best', 'random'),
        'max_depth': range(2, 20),
        'max_features': range(1, 77, 5)
    }
    optimization = GridSearchCV(clf, parameters, cv=10)

    vysledok = optimization.fit(df_train_class, train_labels)
    vysledok

    params = optimization.best_params_

    clf = tree.DecisionTreeClassifier(criterion=params['criterion'],
                                      max_depth=params['max_depth'],
                                      max_features=params['max_features'],
                                      splitter=params['splitter'])

    clf = clf.fit(df_train_class, train_labels)

    predicted_labels = clf.predict(df_valid_class)
    basic_acc = metrics.accuracy_score(valid_labels, predicted_labels)
    basic_acc

    import graphviz
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=df_train_class.columns,
        class_names=["1", "0"],
        filled=True,
        rounded=True,
    )
    graph = graphviz.Source(dot_data)
    graph.render("strom_" + strategy)

    strat_store["report"] = classification_report(valid_labels,
                                                  predicted_labels,
                                                  target_names=["0", "1"])
# heat map
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(df.astype("float").corr(), cmap="OrRd", annot=True)
"""class sklearn.tree.DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)"""

from sklearn.tree import export_graphviz
import graphviz
g = export_graphviz(clf,
                    feature_names=iris["feature_names"],
                    class_names=iris["target_names"],
                    filled=True)
graphviz.Source(g)
"""Gini index or Gini impurity measures the degree or probability of a particular variable being wrongly classified when it is randomly chosen. But what is actually meant by ‘impurity’? If all the elements belong to a single class, then it can be called pure. The degree of Gini index varies between 0 and 1, where 0 denotes that all elements belong to a certain class or if there exists only one class, and 1 denotes that the elements are randomly distributed across various classes. A Gini Index of 0.5 denotes equally distributed elements into some classes.


where pi  is the probability of an object being classified to a particular class.
"""

pre = clf.predict(x_test)
print(list(pre))
print(list(y_test))
from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(x_test), y_test)
# search sklearn metrics

from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_test, pre))
# (the start of this snippet is truncated in the source)
    predict_labels(clf, X_test_imp, y_test)))

# Use k-fold cross validation to evaluate model
scores = cross_val_score(clf, X_all_imp, y_all, cv=10, scoring=f1_scorer)
print(scores)

feat_names = list(X_all.columns)

#Visualize tree
tree_data1 = tree.export_graphviz(clf,
                                  out_file='pstemp_tree.dot',
                                  feature_names=feat_names,
                                  class_names=['No PSTEMP', 'PSTEMP'],
                                  leaves_parallel=True,
                                  filled=True)
with open("pstemp_tree.dot") as f:
    dot_graph = f.read()
#print(dot_graph)
graph = gv.Source(
    source=dot_graph,
    filename='pstemp_tree',
    directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets',
    format='png',
    engine='dot')

#graph = gv.Source(tree_data1, filename='pstemp_tree', directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets', format='pdf')
graph.render(
    filename='pstemp_tree',
    directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets')
#graph.render(filename='pstemp_tree', directory='C:/Users/david_000/Google Drive/HSLS_2009_v3_0_Stata_Datasets', view=True)
Example #23
from sklearn import tree
tree.plot_tree(decision_tree=clsModel)
tree.plot_tree(decision_tree=clsModel, feature_names=['Var', 'Skew', ' Kur',  'Ent'], class_names=['Org','Fake'], fontsize=12)
# not a great way to draw graphs; other methods are worth trying
tree.plot_tree(decision_tree=clsModel, max_depth=2, feature_names=['Var', 'Skew', ' Kur',  'Ent'], class_names=['Org','Fake'], fontsize=12)

Source(tree.export_graphviz(clsModel))

dot_data1 = tree.export_graphviz(clsModel, max_depth=3, out_file=None, filled=True, rounded=True,  special_characters=True, feature_names=['Var', 'Skew', ' Kur',  'Ent'], class_names=['Org','Fake'])  
#check the folder location after installing the graphviz
import os
os.environ["PATH"] += os.pathsep + 'c:/Program Files (x86)/Graphviz2.38/bin/'
import graphviz 
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])  # assumes a tree.dot exported earlier
graph1 = graphviz.Source(dot_data1)  
graph1 





#%% Regression Tree - Predict Petrol Consumption on other parameters
#Predict a numerical value from the independent variables
#os.listdir('E:/analytics/projects/pyanalytics/data')

#data2 = pd.read_csv('E:/analytics/projects/pyanalytics/data/petrol_consumption.csv')
data2 = pd.read_csv('https://raw.githubusercontent.com/DUanalytics/pyAnalytics/master/data/petrol_consumption.csv')
data2.head()
data2.shape
data2.columns
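#A plausible continuation (the snippet stops at data exploration): fit a
#DecisionTreeRegressor to predict consumption from the other columns. The
#'Petrol_Consumption' column name is an assumption about this CSV.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X2 = data2.drop('Petrol_Consumption', axis=1)
y2 = data2['Petrol_Consumption']
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=0)
reg = DecisionTreeRegressor(random_state=0)
reg.fit(X2_train, y2_train)
print(reg.score(X2_test, y2_test))  # R^2 on the held-out rows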
Example #24
#%% [markdown]
## Counting Trees
# *Combinatorics* is a branch of mathematics that is about counting things.
# Today we will count a kind of diagram called a *tree*.
#
# Although this may seem a simple idea, it turns out to be important in many
# situations in both mathematics and computer science.

#%% [markdown]
### Growing a tree
# To grow a tree, start with a dot that we call the "root".
# This is the simplest tree.

#%%
graphviz.Source(draw_dot(G, path='G0.png'))

#%% [markdown]
# You grow a tree by starting with this simple tree and adding dots and edges.
# There are some rules:
# - The root gets left alone.
# - Every new dot you add gets connected to a dot that's already in the tree by an edge.
# - You always add dots in pairs, with each of the new dots connected to the same dot that's already in the tree.

# Trees always follow these rules:
# - A tree has a root dot at the bottom and either zero or two lines connected to it.
# - Every dot in a tree except for the root has either three lines connected to it, or one.

#%% [markdown]
# Here is what happens when we add two new dots to the top of our simple tree:
# We get a tree with 3 dots and 2 lines.
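#%% [markdown]
# As a concrete sketch, the same 3-dot tree can be drawn with graphviz
# directly, without the notebook's draw_dot helper (node names here are ad hoc):

#%%
g = graphviz.Graph()
g.attr(rankdir='BT')  # root at the bottom, growth upward
g.edge('root', 'a')   # the two new dots are added as a pair,
g.edge('root', 'b')   # both connected to the root
g.render('G1', format='png')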
Example #25
def main():
    digits = datasets.load_digits()
    data_digits = digits.data
    target = digits.target
    sep = int(len(data_digits) * 0.7)
    train_data = data_digits[:sep]
    test_data = data_digits[sep:]
    train_target = target[:sep]
    test_target = target[sep:]

    print("------------------- SciKitLearn Tree -------------------")
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_data, train_target)
    dot_data = tree.export_graphviz(clf)
    graph = graphviz.Source(dot_data)
    graph.render('ScikitLearnTree')
    result = clf.predict(test_data)
    print(result)
    report = metrics.classification_report(test_target, result)
    print(report)
    report = metrics.confusion_matrix(test_target,
                                      result,
                                      labels=digits.target_names)
    print(report)

    print("------------------- SciKitLearn Tree2 -------------------")
    clf = tree.DecisionTreeClassifier(min_samples_leaf=3)
    clf = clf.fit(train_data, train_target)
    dot_data = tree.export_graphviz(clf)
    graph = graphviz.Source(dot_data)
    graph.render('ScikitLearnTree2')  # separate file so the first render isn't overwritten
    result = clf.predict(test_data)
    print(result)
    report = metrics.classification_report(test_target, result)
    print(report)
    report = metrics.confusion_matrix(test_target,
                                      result,
                                      labels=digits.target_names)
    print(report)

    print("------------------- My tree with ToyData -------------------")
    attributes, classes, data, target, data2, target2 = td.ToyData().get_data()
    id3 = ID3.ID3DecisionTreeClassifier(toy=1)
    myTree = id3.fit(data, target, attributes, classes)
    plot = id3.make_dot_data()
    plot.render("testTree_toyData")
    #pdb.set_trace()
    result = id3.predict(data2)
    print("Predicted" + str(result))
    report = metrics.classification_report(target2, result)
    print(report)
    report = metrics.confusion_matrix(target2, result)
    print(report)
    classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    print("------------------- My tree with digits -------------------")
    print(len(train_data))
    id3 = ID3.ID3DecisionTreeClassifier(toy=0)
    att_values = list(range(1, 17))
    attributes = {i: att_values for i in range(64)}  # one attribute per pixel
    myTree = id3.fit(train_data, train_target, attributes, classes)
    plot = id3.make_dot_data()
    plot.render("testTree_digits")
    #pdb.set_trace()
    result = id3.predict(test_data)
    print("Predicted" + str(result))
    report = metrics.classification_report(test_target, result)
    print(report)
    report = metrics.confusion_matrix(test_target, result)
    print(report)

    print("------------------- My tree with d-g-l -------------------")
    id3 = ID3.ID3DecisionTreeClassifier(toy=0)
    att_values = ['d', 'g', 'l']
    attributes = {i: att_values for i in range(64)}  # one attribute per pixel
    data = []
    for item in data_digits:
        row = []
        for d in item:
            if d < 5:
                row.append('d')
            elif d < 10:
                row.append('g')
            else:
                row.append('l')
        data.append(row)
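    # A vectorized equivalent of the loop above, as a sketch (assuming
    # numpy is imported as np):
    # bins = np.digitize(data_digits, [5, 10])   # <5 -> 0, 5-9 -> 1, >=10 -> 2
    # data = np.array(['d', 'g', 'l'])[bins].tolist()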
    train_data = data[:sep]
    test_data = data[sep:]
    #pdb.set_trace()
    myTree = id3.fit(train_data, train_target, attributes, classes)
    plot = id3.make_dot_data()
    plot.render("testTree_digits_dgl")
    result = id3.predict(test_data)
    print("Predicted" + str(result))
    report = metrics.classification_report(test_target, result)
    print(report)
    report = metrics.confusion_matrix(test_target, result)
    print(report)
Example #26
def gv(s): return graphviz.Source('digraph G{ rankdir="LR"' + s + '; }')
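# Usage sketch for the gv helper above: the string is spliced into the body
# of a left-to-right DOT digraph, so edge statements can be passed directly:
# gv('A -> B; B -> C')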

def get_image_files_sorted(path, recurse=True, folders=None): return get_image_files(path, recurse, folders).sorted()
Example #27

modelo = SVC()  # instantiate the estimator

modelo.fit(
    treino_x,
    treino_y)  # fit the estimator on the training data and its class labels

previsoes = modelo.predict(teste_x)  # predict the class of each test item

acuracia = accuracy_score(teste_y, previsoes) * 100  # compute the accuracy
print("Accuracy was %.2f%%" % acuracia)
"""# DecisionTreeClassifier"""

modelo = DecisionTreeClassifier(max_depth=3)  # tree model with max depth 3
modelo.fit(raw_treino_x, treino_y)
previsoes = modelo.predict(raw_teste_x)

acuracia = accuracy_score(teste_y, previsoes) * 100
print("A acurácia foi %.2f%%" % acuracia)

features = x.columns  # column names to label the tree nodes
dot_data = export_graphviz(
    modelo,  # the fitted model
    out_file=None,  # return the DOT source instead of writing a file
    filled=True,  # fill nodes with colors
    rounded=True,  # rounded node borders
    feature_names=features,  # feature names
    class_names=["não", "sim"]  # class labels for targets 0 and 1
)  # DOT source used to draw the graph

grafico = graphviz.Source(dot_data)  # build the graph
grafico
Example #28
import re
import graphviz
from sklearn.tree import export_graphviz

def draw_tree(t, df, size=10, ratio=0.6, precision=0, **kwargs):
    """Render a fitted sklearn tree `t` with graphviz, sized for notebook display."""
    s = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True, rounded=True,
                        special_characters=True, rotate=False, precision=precision, **kwargs)
    return graphviz.Source(re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', s))
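# Usage sketch, assuming a fitted tree `clf` and the DataFrame `X_train`
# it was trained on (both names are illustrative):
# draw_tree(clf, X_train, size=8, ratio=0.7)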
Example #29
import numpy as np
import graphviz
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split


if __name__ == "__main__":
    iris = load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("score : %s" % score)

    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=iris.feature_names,
                                    class_names=iris.target_names,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render("iris")
Example #30
def overwatch():
    column_names = ['Team Stack', 'Role', 'Leaver', 'Mode', 'Map', 'Result']
    features = ['Team Stack', 'Role', 'Leaver']
    classes = ['Loss', 'Win']
    data = pd.read_csv("./overwatch/Overwatch.csv", names=column_names)
    data = data.iloc[1:]  # Drop the first row (it repeats the column titles)

    # Preprocess the rest of the data
    data = data[['Team Stack', 'Role', 'Leaver', 'Result']]

    # Take care of missing data:
    # round(data[['Team Stack']].drop([1,2,3,4]).mean(), 1) gives 1.3, so 1 is
    # the fill value; fill with the string '1' so the dtype matches the CSV values
    data[['Team Stack']] = data[['Team Stack']].fillna('1')

    # Keep only Support and Tank rows (restricting to two roles keeps the
    # class encoding simple)
    data = data[data['Role'].str.contains("Offense") == False]
    data = data[data['Role'].str.contains("Defense") == False]

    # Change Team Stack to binary (1 -> 0, 2-5 -> 1); .loc avoids pandas'
    # chained-assignment pitfall
    data.loc[data['Team Stack'] == '1', 'Team Stack'] = 0
    data.loc[data['Team Stack'].isin(['2', '3', '4', '5']), 'Team Stack'] = 1

    # Make the Leaver column binary (a leaver on the enemy team doesn't
    # really hurt our team), so it collapses to yes/no once encoded
    data.loc[data['Leaver'] == 'Enemy team', 'Leaver'] = 'No'

    # Make the match result binary (a draw counts as a loss)
    data.loc[data['Result'] == 'Draw', 'Result'] = 'Loss'

    # TODO: switch to one-hot encoding; LabelEncoder imposes an arbitrary
    # numeric order on the categories

    # Turn the categorical data into numeric codes
    LEncoder = LabelEncoder()
    data = data.apply(LEncoder.fit_transform)
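    # A sketch of the one-hot alternative flagged in the TODO above, using
    # pd.get_dummies (column names are the ones already used in this function):
    # encoded = pd.get_dummies(data[['Role', 'Leaver']])
    # data = pd.concat([data.drop(['Role', 'Leaver'], axis=1), encoded], axis=1)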

    # Split the frame into feature values and the target
    overwatchData = data.iloc[:, [0, 1, 2]]
    overwatchTarget = data.iloc[:, 3]  # 1-D Series, as sklearn expects for y

    # max_depth restricts the model so it doesn't grow complex and overfit (similar to pruning)
    X_train, X_test, Y_train, Y_test = train_test_split(overwatchData,
                                                        overwatchTarget,
                                                        test_size=0.33,
                                                        random_state=42)

    # Train the decision tree model
    classifier = tree.DecisionTreeClassifier(max_depth=5)

    classifier = classifier.fit(X_train, Y_train)

    dot_data = tree.export_graphviz(classifier,
                                    out_file=None,
                                    max_depth=4,
                                    impurity=False,
                                    proportion=True,
                                    feature_names=features,
                                    class_names=classes,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render('test-output/overwatch')

    predictions = classifier.predict(X_test)
    print("Overwatch: Accuracy is ",
          round(accuracy_score(Y_test, predictions) * 100), '%')