def regions_to_tree_improved(self,
                                 features_df,
                                 labels_df,
                                 regions,
                                 features,
                                 feature_mins,
                                 feature_maxs,
                                 max_samples=1):

        lines = self.find_lines(regions, features, feature_mins, feature_maxs)
        lines_keys = [key for key in lines.keys() if len(lines[key]) > 0] if lines else []
        if not lines or len(lines_keys) == 0:
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        random_label = np.random.choice(lines_keys)
        random_value = np.random.choice(lines[random_label])
        data = DataFrame(features_df)
        data['cat'] = labels_df
        best_split_node = DecisionTree(
            data=data,
            label=random_label,
            value=random_value,
            left=DecisionTree(data=data[data[random_label] <= random_value]),
            right=DecisionTree(data=data[data[random_label] > random_value]))
        node = DecisionTree(label=best_split_node.label,
                            value=best_split_node.value,
                            data=best_split_node.data)

        feature_mins_right = feature_mins.copy()
        feature_mins_right[node.label] = node.value
        feature_maxs_left = feature_maxs.copy()
        feature_maxs_left[node.label] = node.value
        regions_left = []
        regions_right = []
        for region in regions:
            if region[best_split_node.label][0] < best_split_node.value:
                regions_left.append(region)
            else:
                regions_right.append(region)
        if len(best_split_node.left.data) >= max_samples and len(
                best_split_node.right.data) >= max_samples:
            node.left = self.regions_to_tree_improved(
                best_split_node.left.data.drop('cat', axis=1),
                best_split_node.left.data[['cat']], regions_left, features,
                feature_mins, feature_maxs_left)
            node.right = self.regions_to_tree_improved(
                best_split_node.right.data.drop('cat', axis=1),
                best_split_node.right.data[['cat']], regions_right, features,
                feature_mins_right, feature_maxs)

        else:
            node.label = str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
            node.value = None

        return node
    def divide_data(self, data, feature, value):
        """
        Divide the data into two subsets using pandas boolean indexing.
        :param data: the dataframe to divide
        :param feature: the column of the dataframe to split on
        :param value: the threshold used for the split
        :return: an initialised DecisionTree node holding the two subsets as children
        """
        return DecisionTree(left=DecisionTree(data=data[data[feature] <= value]),
                            right=DecisionTree(data=data[data[feature] > value]),
                            label=feature,
                            data=data,
                            value=value)
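For illustration, a minimal usage sketch of divide_data (the `builder` instance and the toy DataFrame below are assumptions, not part of the original example):

import pandas as pd

df = pd.DataFrame({'age': [21, 35, 48, 52], 'cat': [0, 0, 1, 1]})
split = builder.divide_data(df, 'age', 40)  # builder: hypothetical object exposing divide_data
print(len(split.left.data), len(split.right.data))  # rows with age <= 40 vs. age > 40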
Example #3
def erreurs(taux_app, data, prof_max):

    erreurs_train = []
    erreurs_test = []
    x = [i for i in range(2, prof_max)]
    dataTrain, dataTest = partition(taux_app, data)
    train_x, train_y = x_y(dataTrain)
    test_x, test_y = x_y(dataTest)

    for i in range(2, prof_max):
        dt = DecisionTree()
        dt.max_depth = i
        dt.min_samples_split = 2
        dt.fit(train_x, train_y)
        erreurs_train.append(1 - dt.score(train_x, train_y))
        erreurs_test.append(1 - dt.score(test_x, test_y))

    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(x, erreurs_train)
    plt.plot(x, erreurs_test)
    plt.ylabel('erreur en fonction de la profondeur, taux app : ' +
               str(taux_app))
    plt.legend(['app', 'test'], loc='upper left')
    plt.savefig(str(taux_app) + "erreurs.png")
    plt.show()
Example #4
def computeDecisionTreeCrossValidation(args, dict_algorithms):
    if (args.debug):
        print("Running decision tree...", end='')
    model = DecisionTree(args)
    dict_algorithms["decision_tree"] = model.computeCrossValidation()
    if (args.debug):
        print("ok!")
Example #5
    def train(self, D, X, Y):
        # Train num_trees decision trees, each on a random subset of the
        # features (a fraction max_X of X) and a random fraction
        # (max_samples) of the rows of D.
        for i in range(self.num_trees):
            x = random.sample(X, int(len(X) * self.max_X))
            d = D.sample(frac=self.max_samples)
            tree = DecisionTree()
            tree.train(d, x, Y)
            self.trees.append(tree)
Example #6
    def __init__(self):
        self.forest_tree = {}
        self.test_list = []
        self.tree = DecisionTree()
        self.sample = BalanceSample()
        self.file_name = open("/Users/homelink/storein/rent.txt", "r")
        self.datas = []
Example #7
    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(min_samples_split=self.min_samples_split,
                                max_depth=self.max_depth,
                                n_features=self.n_feature)
            # Train each tree on a bootstrap sample of the data
            x_sample, y_sample = bootstrap_sample(X, y)
            tree.fit(x_sample, y_sample)
            self.trees.append(tree)
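bootstrap_sample is not shown in this snippet; a minimal sketch of what such a helper usually looks like (an assumption, not the original project's code):

import numpy as np

def bootstrap_sample(X, y):
    # Draw len(X) row indices with replacement and return the resampled data.
    n_samples = X.shape[0]
    idx = np.random.choice(n_samples, size=n_samples, replace=True)
    return X[idx], y[idx]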
Example #8
    def train(self, data, labels):
        self.trees = []
        for i in range(self.ITERATIONS):
            # Bootstrap sample: draw len(data) indices with replacement
            inds = np.random.choice(np.arange(len(data)), len(data))
            self.trees.append(
                DecisionTree(min_leaf=self.MIN_LEAF,
                             m=self.M,
                             max_depth=self.MAX_DEPTH))
            self.trees[-1].train(data[inds], labels[inds])
Example #9
File: TME1.py  Project: mtrazzi/ARF
def predict(depth, x, y, x_test, y_test):
    dt = DecisionTree()
    dt.max_depth = depth  # set the maximum depth of the tree
    dt.min_samples_split = 2  # minimum number of examples required to split a node
    dt.fit(x, y)
    dt.predict(x_test[:5, :])
    score = dt.score(x_test, y_test)
    print(score)
    return (score)
Example #10
File: tme1.py  Project: keyber/ARF
def scoreTrainTest(f: float):
    assert (f > 0 and f <= 1)
    l = int(tot * f)
    scoresTrain = []
    scoresTest = []
    for depth in profondeurs:
        dt = DecisionTree(depth)
        dt.fit(datax[:l], datay[:l])
        scoresTrain.append(dt.score(datax[:l], datay[:l]))
        scoresTest.append(dt.score(datax[l:], datay[l:]))
    return scoresTrain, scoresTest
Example #11
    def construct_tree(self, training_feature_vectors, labels, current_depth=0):
        # First find the best split feature
        feature, type = self.find_split_feature(training_feature_vectors.copy(), labels.copy())

        # Can be removed later
        if len(labels) == 0:
            return DecisionTree(label=self.default, value=None, data=None)

        data = DataFrame(training_feature_vectors.copy())
        data['cat'] = labels

        # Only pre-pruning enabled at this moment (QUEST already has very nice trees)
        if feature is None or len(data) == 0 or len(training_feature_vectors.index) <= self.max_nr_nodes \
                or len(np.unique(data['cat'])) == 1 or self.all_feature_vectors_equal(training_feature_vectors)\
                or current_depth >= self.max_depth:
            # Create leaf with label most occurring class
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)

        # If we don't need to pre-prune, we calculate the best possible splitting point for the best split feature
        split_point = self.find_best_split_point(data.copy(), feature, type)

        if split_point is None or math.isnan(split_point):
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)


        # Divide the data using this best split feature and value and call recursively
        split_node = self.divide_data(data.copy(), feature, split_point)
        if len(split_node.left.data) == 0 or len(split_node.right.data) == 0:
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)
        node = DecisionTree(label=split_node.label, value=split_node.value, data=split_node.data)
        node.left = self.construct_tree(split_node.left.data.drop('cat', axis=1),
                                        split_node.left.data[['cat']], current_depth+1)
        node.right = self.construct_tree(split_node.right.data.drop('cat', axis=1),
                                         split_node.right.data[['cat']], current_depth+1)

        return node
Example #12
    def fit(self, datanum, ans):
        for _ in range(self.num_tree):
            x_train, _, y_train, _ = train_test_split(datanum,
                                                      ans,
                                                      test_size=1.0 -
                                                      self.sample_data_rate)

            tree = DecisionTree(x_train,
                                y_train,
                                rand_features=self.sample_features)
            tree.fit()

            self.trees.append(tree)
Example #13
def main():

    word_list = create_words()

    data, labels = readfile(word_list)
    tree = DecisionTree()
    data_train, data_test, labels_train, labels_test = \
        train_test_split(data, labels, test_size=test_size, random_state=42)

    ## calls our tree algorithm and prediction method ##
    tree.train(data_train, labels_train)
    labels_pred = tree.predict(data_test)
    compute_accuracy(labels_test, labels_pred)
Example #14
File: tme1.py  Project: keyber/ARF
def scoreTrain():
    scores = []
    for depth in profondeurs:
        dt = DecisionTree(depth)
        dt.fit(datax, datay)
        #dt.predict(datax[:5, :])
        scores.append(dt.score(datax, datay))
        # Draw the tree to a pdf file if pydot is installed.
        #dt.to_pdf("/tmp/test_tree.pdf", fields)
        # Otherwise use http://www.webgraphviz.com/
        #print(dt.to_dot(fields))
        # or print it in the console
        #print(dt.print_tree(fields))
    return scores
Example #15
    def decision_tree_learning(examples, attributes, parent_examples=()):
        if len(examples) == 0:
            return plurality_value(parent_examples)
        elif same_classification(examples):
            return DecisionLeaf(examples[0][target])
        elif len(attributes) == 0:
            return plurality_value(examples)
        elif percent_error(examples) < error_threshold:
            return plurality_value(examples)
        else:
            a = importance(attributes, examples)
            tree = DecisionTree(a, dataset.attrnames[a])
            for (val_i, exs_i) in split_by(a, examples):
                subtree = decision_tree_learning(exs_i, removeall(a, attributes), examples)
                tree.add(val_i, subtree)
            return tree
Example #16
def handle_train(argv):
    examples = process_file(argv[2], training=True)
    tree = None
    if argv[4] == "dt":
        tree = DecisionTree()
    else:
        tree = Adaboost()
    tree.define_positive_class(lambda x: x.classification == 'en')
    tree.define_classes(processing.classes)
    tree.define_attributes(processing.attr_definitions)
    tree.load_examples(examples)
    tree.generate(tree.examples)
    with open(argv[3], "w") as f:
        f.write(argv[4] + "\n")
        f.write(str(tree.tree))
    tree.print()
Example #17
def handle_predict(argv):
    hypothesis = None
    model = None
    with open(argv[3], "r") as f:
        # DONT DO THIS ITS INSECURE. IM INSANE
        model = f.readline().strip('\n')
        hypothesis = f.readline()
    hypothesis = literal_eval(hypothesis)
    tree = None
    tree = DecisionTree()
    tree.define_positive_class(lambda x: x.classification == 'en')
    tree.define_classes(processing.classes)
    tree.define_attributes(processing.attr_definitions)
    examples = process_file(argv[4], training=False)
    examples = tree.create_examples(examples)
    return tree.classify(examples, hypothesis)
Example #18
def _convert_to_tree(dt, features):
    """Convert a sklearn object to a `decisiontree.decisiontree` object"""
    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold
    classes = dt.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    for i in range(n_nodes):

        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]

        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]

        if is_leaves[i]:
            decision_trees[i].label = dt.classes_[np.argmax(
                dt.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
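A minimal usage sketch, assuming a fitted scikit-learn classifier (the feature names and training arrays below are placeholders, not part of the original example):

from sklearn.tree import DecisionTreeClassifier

feature_names = ['f0', 'f1']  # placeholder feature names, one per column of X_train
clf = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)  # X_train, y_train assumed to exist
converted = _convert_to_tree(clf, feature_names)
print(converted.label, converted.value)  # split feature and threshold of the root node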
Example #19
def handle_predict(argv):
    hypothesis = None
    model = None
    with open(argv[2], "r") as f:
        model = f.readline().strip('\n')
        hypothesis = f.readline()
    hypothesis = literal_eval(hypothesis)
    tree = None
    if model == "dt":
        tree = DecisionTree()
    else:
        tree = Adaboost()
    tree.define_positive_class(lambda x: x.classification == 'en')
    tree.define_classes(processing.classes)
    tree.define_attributes(processing.attr_definitions)
    examples = process_file(argv[3], training=False)
    examples = tree.create_examples(examples)
    for classification in tree.classify(examples, hypothesis):
        print(classification)
Example #20
def handle_train(argv, size=None, depth=None):
    examples = process_file(argv[2], training=True)
    tree = None
    tree = DecisionTree()
    tree.define_positive_class(lambda x: x.classification == 'en')
    tree.define_classes(processing.classes)
    tree.define_attributes(processing.attr_definitions)
    if size:
        tree.load_examples(examples[:size])
    else:
        tree.load_examples(examples)
    if depth is not None:
        tree.generate(tree.examples, depth)
    else:
        tree.generate(tree.examples)
    with open(argv[3], "w") as f:
        f.write("dt" + "\n")
        f.write(str(tree.tree))
    tree.print()
Example #21
    def decision_tree_learning(examples, attributes, m, parent_examples=()):
        if len(examples) == 0:
            return majority_value(parent_examples)
        elif same_classification(examples):
            return DecisionLeaf(examples[0][target])
        elif len(attributes) == 0:
            return majority_value(examples)
        elif misclass_error(examples) < m:
            return majority_value(examples)
        else:
            A = pick_attribute(attributes, examples)
            tree = DecisionTree(A, dataset.attrnames[A])
            nonlocal internal_nodes
            internal_nodes += 1
            for (val_i, exs_i) in split(A, examples):
                subtree = decision_tree_learning(exs_i,
                                                 removeall(A, attributes), m,
                                                 examples)
                tree.add(val_i, subtree)
            return tree
Example #22
def scores_selon_prof(taux_app, data, prof_max):
    scores = []
    x = [i for i in range(2, prof_max)]
    dataTrain, dataTest = partition(taux_app, data)
    train_x, train_y = x_y(dataTrain)
    test_x, test_y = x_y(dataTest)

    for i in range(2, prof_max):
        dt = DecisionTree()
        dt.max_depth = i
        dt.min_samples_split = 2
        dt.fit(train_x, train_y)
        scores.append(dt.score(test_x, test_y))

    import matplotlib.pyplot as plt
    plt.plot(x, scores)
    plt.ylabel('score en fonction de la profondeur, taux app : ' +
               str(taux_app))
    plt.savefig(str(taux_app) + "scores.png")
    plt.show()
Example #23
    def fit(self, X, y):
        """Build multiple trees based on training data.

        Args:
            X (numpy array): sample in shape [n x d], where n is
            number of samples and d is number of features.
            y (numpy array): sample labels in shape [n].
        """
        n, d = X.shape
        for i in range(self.tree_num):
            # draws random subset of features
            features = np.random.choice(d, self.fc, replace=False)
            tree = DecisionTree(self.max_depth, self.min_improv,
                                self.eval_func)
            samples = np.random.choice(n, n, replace=True)
            X_train = X[:, features][samples, ]
            y_train = y[samples]
            tree.fit(X_train, y_train)

            self.features[i] = features
            self.trees[i] = tree
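The snippet defines only fit; for illustration, a sketch of how a matching predict method could aggregate the per-tree votes (an assumption, not part of the original class; it presumes integer class labels and that each stored tree's predict accepts the feature-subset matrix it was trained on):

    def predict(self, X):
        # Each tree votes using the feature subset it was trained on;
        # the majority vote per sample is returned.
        votes = np.array([self.trees[i].predict(X[:, self.features[i]])
                          for i in range(self.tree_num)])
        return np.array([np.bincount(votes[:, j].astype(int)).argmax()
                         for j in range(votes.shape[1])])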
Example #24
File: tme1.py  Project: keyber/ARF
def scoreCross(n=5):
    """fait la moyenne sur n tests
    taille test = tot/n"""
    assert (type(n) == int)
    scoresTrain = []
    scoresTest = []
    for depth in profondeurs:
        sTrain = 0
        sTest = 0
        for i in range(n):
            start = tot * i // n
            end = tot * (i + 1) // n
            dt = DecisionTree(depth)
            xtrain = np.vstack((datax[:start], datax[end:]))
            ytrain = np.hstack((datay[:start], datay[end:]))
            dt.fit(xtrain, ytrain)
            sTrain += dt.score(xtrain, ytrain)
            sTest += dt.score(datax[start:end], datay[start:end])
        scoresTrain.append(sTrain / n)
        scoresTest.append(sTest / n)
    return scoresTrain, scoresTest
Example #25
def partitionnement_test(datax, datay, rp, rdm, couleur):
    # rp: the proportion of the data used for training.
    # rdm: a boolean; if True, the split proportion is drawn at random.
    dt = DecisionTree()
    dt.min_samples_split = 2
    if rdm:
        rp = random.uniform(0, 1)
    # indiceap holds the indices of datax used for training, indicet those used for testing.
    # indiceap is drawn at random, without replacement, as a proportion rp of datax.
    indiceap = np.random.choice(np.arange(len(datax)), int(rp * len(datax)), replace=False)
    indicet = []
    for i in range(0, len(datax)):
        if i not in indiceap:
            indicet.append(i)
    testy = np.zeros((len(indicet)), int)
    apprentissagey = np.zeros((len(indiceap)), int)

    testx = np.delete(datax, indiceap, axis=0)
    apprentissagex = np.delete(datax, indicet, axis=0)

    for i in range(0, len(indiceap)):
        apprentissagey[i] = datay[indiceap[i]]
    for i in range(0, len(indicet)):
        testy[i] = datay[indicet[i]]

    l_scoretest = []
    l_scoreapprentissage = []
    # Try tree depths from 2 to 19 with a step of 3 to keep the runtime reasonable.
    for i in range(2, 20, 3):
        dt.max_depth = i
        dt.fit(apprentissagex, apprentissagey)
        dt.predict(apprentissagex[:5, :])
        l_scoretest.append(1 - dt.score(testx, testy))
        l_scoreapprentissage.append(1 - dt.score(apprentissagex, apprentissagey))
    plt.plot(range(2, 20, 3), l_scoretest, couleur + '--',
             range(2, 20, 3), l_scoreapprentissage, couleur)
    plt.show()
Example #26
File: tme1.py  Project: ykrmm/ARF
def partitionnement_test(datax, datay, rp,
                         rdm):  # rp: the proportion of the data used for training.

    dt = DecisionTree()
    dt.min_samples_split = 2
    if rdm:
        rp = random.uniform(0, 1)
    indiceap = np.random.choice(np.arange(len(datax)),
                                int(rp * len(datax)),
                                replace=False)
    indicet = []
    for i in range(0, len(datax)):
        if i not in indiceap:
            indicet.append(i)
    testy = np.zeros((len(indicet)), int)
    apprentissagey = np.zeros((len(indiceap)), int)

    testx = np.delete(datax, indiceap, axis=0)

    apprentissagex = np.delete(datax, indicet, axis=0)

    for i in range(0, len(indiceap)):
        apprentissagey[i] = datay[indiceap[i]]
    for i in range(0, len(indicet)):
        testy[i] = datay[indicet[i]]

    l_scoretest = []
    l_scoreapprentissage = []

    for i in range(2, 20, 3):
        dt.max_depth = i
        dt.fit(apprentissagex, apprentissagey)
        dt.predict(apprentissagex[:5, :])
        l_scoretest.append(1 - dt.score(testx, testy))
        l_scoreapprentissage.append(1 -
                                    dt.score(apprentissagex, apprentissagey))
    plt.plot(range(2, 20, 3), l_scoretest, 'r--', range(2, 20, 3),
             l_scoreapprentissage, 'b--')
    plt.show()
    plt.close()
Example #27
    def decision_tree_from_text(self, lines):

        dt = DecisionTree()

        if '<=' in lines[0] or '>' in lines[0]:
            # Intermediate node
            node_name = lines[0].split(':')[0].lstrip()
            label, value = lines[0].split(':')[1].split('<=')
            label = ' '.join(label.lstrip().rstrip().split('.'))
            value = value.lstrip().split()[0]
            dt.label = label
            dt.value = float(value)
            dt.left = self.decision_tree_from_text(lines[1:])
            counter = 1
            while lines[counter].split(':')[0].lstrip() != node_name:
                counter += 1
            dt.right = self.decision_tree_from_text(lines[counter + 1:])
        else:
            # Terminal node
            dt.label = int(eval(lines[0].split(':')[1].lstrip()))

        return dt
Example #28
def validation_croisee(n, taux_app, data, prof_max):
    data_app, _ = partition(taux_app, data)
    erreurs_moy_app = []
    borders = np.linspace(0, len(data_app), n + 1, dtype=int)

    for depth in range(1, prof_max):
        print(depth)
        erreurs_test = []
        for i in range(n):
            data_test = data_app[borders[i]:borders[i + 1]]
            if len(data_app[0:borders[i]]) > 0:
                data_train = np.concatenate(
                    (data_app[0:borders[i]],
                     data_app[borders[i + 1]:len(data_app)]))
            else:
                data_train = data_app[borders[i + 1]:len(data_app)]

            train_x, train_y = x_y(data_train)
            test_x, test_y = x_y(data_test)

            dt = DecisionTree()
            dt.max_depth = depth
            dt.min_samples_split = 2
            dt.fit(train_x, train_y)
            erreurs_test.append(1 - dt.score(test_x, test_y))
        print(erreurs_moy_app)
        erreurs_moy_app.append((1 / n) * np.array(erreurs_test).sum())

    x = [i for i in range(1, prof_max)]
    fig = plt.figure()
    plt.plot(x, erreurs_moy_app)
    plt.xlabel(
        'Erreur moyenne en fonction de la prof avec VC avec taux app de : ' +
        str(taux_app))
    plt.legend(['app'], loc='upper left')
    plt.savefig(str(taux_app) + "erreursVC.png")
    #plt.show()
    return
Example #29
def apprentissage(datax, datay, prop):
    ax = datax[:int(np.floor(prop * len(datax)))]  # training data
    ay = datay[:int(np.floor(prop * len(datax)))]

    tx = datax[int(np.floor(prop * len(datax))):]  # test data
    ty = datay[int(np.floor(prop * len(datax))):]

    ascore = np.zeros(9)
    tscore = np.zeros(9)

    for d in range(1, 28, 3):
        print("apprentissage : prop = " + str(prop) + " depth = " + str(d))
        dt = DecisionTree()
        dt.max_depth = d  # set the maximum depth of the tree
        dt.min_samples_split = 2  # minimum number of examples required to split a node
        dt.fit(ax, ay)
        ascore[int(np.floor(d / 3))] = 1 - dt.score(ax, ay)
        tscore[int(np.floor(d / 3))] = 1 - dt.score(tx, ty)
    plt.plot(range(1, 28, 3), ascore)
    plt.plot(range(1, 28, 3), tscore)
    plt.legend(["Apprentissage", "Test"])
    plt.title("Proportion : " + str(prop))
    plt.show()
Example #30
# Difference between the entropies for each attribute.
# This is what is called the information gain.
igain = ent - ent_cond
print("\nGains d'information par attribut\n", np.around(igain, decimals=4))
print("Genre où la différence est maximale,", fields[igain.argmax()])  # => 17
print("- la différence en question étant de",
      np.around(max(igain), decimals=4))  # => 0.0607
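
# For reference, a minimal sketch of how entropy and conditional entropy of this kind
# can be computed with numpy (the helpers below are illustrative assumptions, not the
# TME's own code; the usage on datax / datay is shown commented out):

import numpy as np

def entropy(labels):
    # Shannon entropy (base 2) of a discrete label vector.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def conditional_entropy(attribute, labels):
    # H(labels | attribute): entropy of the labels within each attribute value,
    # weighted by the frequency of that value.
    values, counts = np.unique(attribute, return_counts=True)
    weights = counts / counts.sum()
    return sum(w * entropy(labels[attribute == v]) for v, w in zip(values, weights))

# Hypothetical usage with a binary attribute matrix datax and labels datay:
# ent = entropy(datay)
# ent_cond = np.array([conditional_entropy(datax[:, j], datay) for j in range(datax.shape[1])])
# igain = ent - ent_cond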

#/////////////////////////////////////////////////////////////////////////////////////////////////// </getting started with IMDb> ////

#///////////////////////////////////////////////////////////////////////////////////////////// <preliminary experiments> ////

# ··· The DecisionTree object is already implemented in the provided source code.

dt = DecisionTree()
# Maximum depth of the decision tree.
dt.max_depth = 5
# Minimum number of examples required to split a node.
dt.min_samples_split = 2

# Training and prediction.
dt.fit(datax, datay)
dt.predict(datax[:5, :])
print('Depth: {} - Score: {}'.format(dt.max_depth, dt.score(datax, datay)))

# Draw the tree to a pdf file if pydot is installed.
# filename = 'imdb_tree_d{}_s{}.pdf'.format(dt.max_depth, dt.min_samples_split)
# dt.to_pdf(filename, fields)

# If pydot is not installed, use http://www.webgraphviz.com/,