Example #1
    def _orange_dt_to_my_dt(self, orange_dt_root):
        # Leaf node: wrap the predicted class in a DecisionTree leaf
        if orange_dt_root.node_type == Orange.classification.tree.C45Node.Leaf:
            return decisiontree.DecisionTree(left=None, right=None,
                                             label=str(int(orange_dt_root.leaf)),
                                             data=None, value=None)
        else:
            # Internal node: copy the tested feature and cut point, then recurse
            dt = decisiontree.DecisionTree(label=orange_dt_root.tested.name,
                                           data=None, value=orange_dt_root.cut)
            dt.left = self._orange_dt_to_my_dt(orange_dt_root.branch[0])
            dt.right = self._orange_dt_to_my_dt(orange_dt_root.branch[1])
            return dt
Example #2
def trainEnsemble(data, numTrees, predictionIndex):
    ensemble = []
    for tree_index in range(numTrees):
        train, test = bootstrap(data)
        # print("treino: "+ str(len(train)) + "\n" + str(train))

        root = dt.DecisionTree(predictedIndex=predictionIndex)
        root.makeRootNode(train, data)
        root.induce(root.data, root.listOfAttr)

        ensemble += [root]

    return ensemble
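
The `bootstrap` helper used above is not shown; a minimal sketch under common bagging assumptions (the function name and the `(train, test)` return shape come from the call site, everything else is assumed) might be:

import random

def bootstrap(data):
    # Hypothetical implementation: `data` is assumed to be a list of samples.
    # Draw len(data) samples with replacement for the training set and keep
    # the never-drawn (out-of-bag) samples as the test set.
    indices = [random.randrange(len(data)) for _ in range(len(data))]
    train = [data[i] for i in indices]
    drawn = set(indices)
    test = [data[i] for i in range(len(data)) if i not in drawn]
    return train, test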
Example #3
def convert_to_tree(classifier, features):
    """
    Converts the DecisionTreeClassifier from sklearn (adapted CART) to DecisionTree from decisiontree.py

    :param classifier: the trained classifier
    :param features: the features used in the classifier
    :return: a DecisionTree from decisiontree.py
    """
    n_nodes = classifier.tree_.node_count
    children_left = classifier.tree_.children_left
    children_right = classifier.tree_.children_right
    feature = classifier.tree_.feature
    threshold = classifier.tree_.threshold
    classes = classifier.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = decisiontree.DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    for i in range(n_nodes):
        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]

        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]

        if is_leaves[i]:
            decision_trees[i].label = classes[np.argmax(
                classifier.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]
    return decision_trees[0]
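
A possible usage sketch, not taken from the original project: train a scikit-learn `DecisionTreeClassifier` and convert it with `convert_to_tree`; the iris dataset and `max_depth=3` are arbitrary choices for illustration.

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)

# Feature names are passed so internal nodes get readable labels.
root = convert_to_tree(clf, list(iris.feature_names))
print(root.label, root.value)  # the root split: feature name and threshold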
Example #4
def trainDecisionTree(pkgDict, settings=None):
    totalCorrectness, trainBatch, count = 0, [], 0
    for key in pkgDict.keys():
        trainBatch.append(pkgDict[key][TRAIN])
    trainBatch = [data for sublist in trainBatch for data in sublist]
    discretizedDataset, bins = discretize(trainBatch, settings)
    decisionTree = d.DecisionTree(settings, bins)
    root = decisionTree.train(discretizedDataset)
    decisionTree.generateGraphviz(root)
    for key in pkgDict.keys():
        for sample in pkgDict[key][TEST]:
            if key == decisionTree.classify(root,sample):
                totalCorrectness += 1
            count += 1
    return totalCorrectness / count
Example #5
    def _convert_to_tree(self):
        """Convert a sklearn object to a `decisiontree.decisiontree` object"""
        n_nodes = self.dt.tree_.node_count
        children_left = self.dt.tree_.children_left
        children_right = self.dt.tree_.children_right
        feature = self.dt.tree_.feature
        threshold = self.dt.tree_.threshold
        classes = self.dt.classes_

        # The tree structure can be traversed to compute various properties such
        # as the depth of each node and whether or not it is a leaf.
        node_depth = np.zeros(shape=n_nodes)
        decision_trees = [None] * n_nodes
        for i in range(n_nodes):
            decision_trees[i] = decisiontree.DecisionTree()
        is_leaves = np.zeros(shape=n_nodes, dtype=bool)
        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            node_depth[node_id] = parent_depth + 1

            # If we have a test node
            if children_left[node_id] != children_right[node_id]:
                stack.append((children_left[node_id], parent_depth + 1))
                stack.append((children_right[node_id], parent_depth + 1))
            else:
                is_leaves[node_id] = True

        for i in range(n_nodes):

            if children_left[i] > 0:
                decision_trees[i].left = decision_trees[children_left[i]]

            if children_right[i] > 0:
                decision_trees[i].right = decision_trees[children_right[i]]

            if is_leaves[i]:
                decision_trees[i].label = self.dt.classes_[np.argmax(
                    self.dt.tree_.value[i][0])]
                decision_trees[i].value = None
            else:
                decision_trees[i].label = self.features[feature[i]]
                decision_trees[i].value = threshold[i]

        return decision_trees[0]
Example #6
    def _decision_tree_from_text(self, lines):
        dt = decisiontree.DecisionTree()

        if '<=' in lines[0] or '>' in lines[0]:
            # Intermediate node
            node_name = lines[0].split(':')[0].lstrip()
            label, value = lines[0].split(':')[1].split('<=')
            label = ' '.join(label.strip().split('.'))
            value = value.lstrip().split()[0]
            dt.label = label
            dt.value = float(value)
            dt.left = self._decision_tree_from_text(lines[1:])
            # The right branch starts at the next line that repeats this node's
            # name (the "> value" counterpart of the test parsed above).
            counter = 1
            while lines[counter].split(':')[0].lstrip() != node_name:
                counter += 1
            dt.right = self._decision_tree_from_text(lines[counter + 1:])
        else:
            # Terminal node
            dt.label = int(eval(lines[0].split(':')[1].lstrip()))

        return dt
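
For illustration only, here is a hypothetical input that this parser accepts (the real exporter's text format may differ): each line is `name: test`, a leaf carries just its class after the colon, and the right branch begins at the next line that repeats the node name.

lines = [
    '1: petal.width <= 0.8',   # internal node: split on "petal width" at 0.8
    '2: 0',                    # left leaf: class 0
    '1: petal.width > 0.8',    # the matching "> value" line closes the left subtree
    '3: 1',                    # right leaf: class 1
]
# self._decision_tree_from_text(lines) would yield a tree whose root tests
# "petal width" <= 0.8, with leaf labels 0 and 1.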
Example #7
def build_dt_from_ensemble(decision_trees,
                           data,
                           class_label,
                           tests,
                           prior_entropy,
                           prior_tests={},
                           min_nr_samples=1,
                           calc_fracs_from_ensemble=False):
    """
    Given an ensemble of decision trees, build a single decision tree using estimates from the ensemble

    :param decision_trees: the ensemble of decision trees
    :param data: the training data frame
    :param class_label: the column with the class labels
    :param tests: all possible tests (calculated from the ensemble)
    :param prior_entropy: recursive parameter to calculate information gain
    :param prior_tests: the tests that are already picked for our final decision tree
    :param min_nr_samples: pre-prune condition, the data must be larger than this parameter
    :param calc_fracs_from_ensemble: if True, the fractions are estimated from the ensemble instead of the data
    :return: a single decision tree, calculated using information from the ensemble
    """
    # Pre-pruning conditions:
    #   - if the length of data is <= min_nr_samples
    #   - when we have no tests left
    #   - when there is only 1 unique class in the data left
    # print len(data), len(tests), np.unique(data[class_label].values)
    if len(data) > min_nr_samples and len(tests) > 0 and len(
            np.unique(data[class_label].values)) > 1:
        max_ig = 0
        best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = [
            None
        ] * 4
        best_dt = decisiontree.DecisionTree()
        # Find the test that results in the maximum information gain
        for test in tests:
            pos_avg_probs, neg_avg_probs, pos_fraction, neg_fraction = {}, {}, 0.0, 0.0
            for dt in decision_trees:
                pos_prob_dict = calculate_prob_dict(dt, test[0], test[1],
                                                    prior_tests, False)
                neg_prob_dict = calculate_prob_dict(dt, test[0], test[1],
                                                    prior_tests, True)

                if not any(math.isnan(x)
                           for x in pos_prob_dict.values()) and not any(
                               math.isnan(x) for x in neg_prob_dict.values()):
                    pos_avg_probs = add_reduce_by_key(
                        pos_avg_probs,
                        calculate_prob_dict(dt, test[0], test[1], prior_tests,
                                            False))
                    neg_avg_probs = add_reduce_by_key(
                        neg_avg_probs,
                        calculate_prob_dict(dt, test[0], test[1], prior_tests,
                                            True))

                if calc_fracs_from_ensemble and len(data) > 0:
                    pos_fraction += float(
                        len(dt.data[dt.data[test[0]] <= test[1]])) / len(
                            dt.data)
                    neg_fraction += float(
                        len(dt.data[dt.data[test[0]] > test[1]])) / len(
                            dt.data)

            for key in pos_avg_probs:
                pos_avg_probs[key] /= len(decision_trees)
            for key in neg_avg_probs:
                neg_avg_probs[key] /= len(decision_trees)

            if calc_fracs_from_ensemble:
                pos_fraction /= float(len(decision_trees))
                neg_fraction /= float(len(decision_trees))

            pos_entropy = calculate_entropy(
                np.divide(list(pos_avg_probs.values()), len(decision_trees)))
            neg_entropy = calculate_entropy(
                np.divide(list(neg_avg_probs.values()), len(decision_trees)))

            pos_data = data[data[test[0]] <= test[1]].copy()
            neg_data = data[data[test[0]] > test[1]].copy()

            if not calc_fracs_from_ensemble:
                pos_fraction = float(len(pos_data)) / float(len(data))
                neg_fraction = float(len(neg_data)) / float(len(data))

            weighted_entropy = pos_fraction * pos_entropy + neg_fraction * neg_entropy
            information_gain = prior_entropy - weighted_entropy

            if information_gain > max_ig and len(pos_data) > 0 and len(
                    neg_data) > 0:
                max_ig, best_dt.label, best_dt.value = information_gain, test[
                    0], test[1]
                best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = pos_data, neg_data, pos_entropy, neg_entropy

        # print max_ig
        if max_ig == 0:  # If we can't find a test that results in an information gain, we can pre-prune
            return decisiontree.DecisionTree(value=None,
                                             label=get_most_occurring_class(
                                                 data, class_label))

        # Update some variables and do recursive calls
        left_prior_tests = prior_tests.copy()
        left_prior_tests.update({(best_dt.label, best_dt.value): True})
        new_tests = tests.copy()
        new_tests.remove((best_dt.label, best_dt.value))
        best_dt.left = build_dt_from_ensemble(decision_trees, best_pos_data,
                                              class_label, new_tests,
                                              best_pos_entropy,
                                              left_prior_tests, min_nr_samples)

        right_prior_tests = prior_tests.copy()
        right_prior_tests.update({(best_dt.label, best_dt.value): False})
        best_dt.right = build_dt_from_ensemble(decision_trees, best_neg_data,
                                               class_label, new_tests,
                                               best_neg_entropy,
                                               right_prior_tests,
                                               min_nr_samples)

        # print (best_dt.label, best_dt.value)
        return best_dt
    else:
        # print '?????'
        return decisiontree.DecisionTree(value=None,
                                         label=get_most_occurring_class(
                                             data, class_label))
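
A minimal call sketch, not from the source project: it reuses `convert_to_tree` from Example #3 to turn a random forest into an ensemble of `decisiontree.DecisionTree` objects; the `_collect_tests` helper, the pandas/iris setup, and the column name `'label'` are assumptions for illustration.

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target

rf = RandomForestClassifier(n_estimators=10).fit(iris.data, iris.target)
# Glue code (assumed): convert each sklearn estimator with convert_to_tree from Example #3.
ensemble = [convert_to_tree(est, list(iris.feature_names)) for est in rf.estimators_]

def _collect_tests(tree, tests):
    # Hypothetical helper: internal nodes carry a numeric `value`, leaves carry None.
    if tree is None or tree.value is None:
        return
    tests.add((tree.label, tree.value))
    _collect_tests(tree.left, tests)
    _collect_tests(tree.right, tests)

tests = set()
for tree in ensemble:
    _collect_tests(tree, tests)

freqs = df['label'].value_counts(normalize=True).values
prior_entropy = -np.sum(freqs * np.log2(freqs))

merged = build_dt_from_ensemble(ensemble, df, 'label', tests, prior_entropy)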
Example #8
def testInduce(data):
    root = dt.DecisionTree()
    root.makeRootNode(data, data)
    root.induce(root.data, root.listOfAttr)
    root._print(0)
    return root
Example #9
inst_oneR.begin_oneR()
list_rulesOneR = inst_oneR.get_final_rules()
for rule in list_rulesOneR:
    print "set of rules: "
    print rule

inst_logR = log_regression.LogisticRegression(data_reader.copy(), class_values)
print """
----------------
	"""
print("Logistic Regression algorithm...")
inst_logR.begin_alg()
param_classif = inst_logR.get_param()
print param_classif

inst_dtree = decisiontree.DecisionTree(data_reader.copy(), class_values)
print """
----------------
	"""
print("Decision Tree Classifier algorithm...")
inst_dtree.begin_alg()
param_classif = inst_dtree.get_param()
print param_classif

inst_svm = svm.SVM(data_reader.copy(), class_values)
print """
----------------
	"""
print("Support Vector Machines(SVM) algorithm...")
inst_svm.begin_alg()
param_weight = inst_svm.get_param()
Example #10
def build_dt_from_ensemble(decision_trees,
                           data,
                           class_label,
                           tests,
                           prior_entropy,
                           prior_tests={},
                           min_nr_samples=1,
                           calc_fracs_from_ensemble=False):
    """
    Given an ensemble of decision trees, build a single decision tree using estimates from the ensemble

    **Params**
    ----------
     - `decision_trees` (list of `decisiontree.DecisionTree` objects): the ensemble of decision trees to be merged

     - `data` (pandas DataFrame): the data frame with training data

     - `class_label` (string): the column identifier for the column with class labels in the data

     - `tests` (set of tuples): all possible tests (extracted from the ensemble)

     - `prior_entropy` (float): recursive parameter to calculate information gain

     - `prior_tests` (set of tuples): the tests that are already picked for our final decision tree

     - `min_nr_samples` (int): pre-prune condition, stop searching if number of samples is smaller or equal than threshold

     - `calc_fracs_from_ensemble` (boolean): if `True`, the different probabilities are calculated using the ensemble. Else, the data is used

    **Returns**
    -----------
        a single decision tree, calculated using information from the ensemble
    """
    # Pre-pruning conditions:
    #   - if the length of data is <= min_nr_samples
    #   - when we have no tests left
    #   - when there is only 1 unique class in the data left
    # print len(data), len(tests), np.unique(data[class_label].values)
    if len(data) > min_nr_samples and len(tests) > 0 and len(
            np.unique(data[class_label].values)) > 1:
        max_ig = 0
        best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = [
            None
        ] * 4
        best_dt = decisiontree.DecisionTree()
        # Find the test that results in the maximum information gain
        for test in tests:
            pos_avg_probs, neg_avg_probs, pos_fraction, neg_fraction = {}, {}, 0.0, 0.0
            for dt in decision_trees:
                pos_prob_dict = _calculate_prob_dict(dt, test[0], test[1],
                                                     prior_tests, False)
                neg_prob_dict = _calculate_prob_dict(dt, test[0], test[1],
                                                     prior_tests, True)

                if not any(math.isnan(x)
                           for x in pos_prob_dict.values()) and not any(
                               math.isnan(x) for x in neg_prob_dict.values()):
                    pos_avg_probs = _add_reduce_by_key(
                        pos_avg_probs,
                        _calculate_prob_dict(dt, test[0], test[1], prior_tests,
                                             False))
                    neg_avg_probs = _add_reduce_by_key(
                        neg_avg_probs,
                        _calculate_prob_dict(dt, test[0], test[1], prior_tests,
                                             True))

                if calc_fracs_from_ensemble and len(data) > 0:
                    pos_fraction += float(
                        len(dt.data[dt.data[test[0]] <= test[1]])) / len(
                            dt.data)
                    neg_fraction += float(
                        len(dt.data[dt.data[test[0]] > test[1]])) / len(
                            dt.data)

            for key in pos_avg_probs:
                pos_avg_probs[key] /= len(decision_trees)
            for key in neg_avg_probs:
                neg_avg_probs[key] /= len(decision_trees)

            if calc_fracs_from_ensemble:
                pos_fraction /= float(len(decision_trees))
                neg_fraction /= float(len(decision_trees))

            pos_entropy = _calculate_entropy(
                np.divide(list(pos_avg_probs.values()), len(decision_trees)))
            neg_entropy = _calculate_entropy(
                np.divide(list(neg_avg_probs.values()), len(decision_trees)))

            pos_data = data[data[test[0]] <= test[1]].copy()
            neg_data = data[data[test[0]] > test[1]].copy()

            if not calc_fracs_from_ensemble:
                pos_fraction = float(len(pos_data)) / float(len(data))
                neg_fraction = float(len(neg_data)) / float(len(data))

            weighted_entropy = pos_fraction * pos_entropy + neg_fraction * neg_entropy
            information_gain = prior_entropy - weighted_entropy

            if information_gain > max_ig and len(pos_data) > 0 and len(
                    neg_data) > 0:
                max_ig, best_dt.label, best_dt.value = information_gain, test[
                    0], test[1]
                best_pos_data, best_neg_data, best_pos_entropy, best_neg_entropy = pos_data, neg_data, pos_entropy, neg_entropy

        # print max_ig
        if max_ig == 0:  # If we can't find a test that results in an information gain, we can pre-prune
            return decisiontree.DecisionTree(value=None,
                                             label=_get_most_occurring_class(
                                                 data, class_label))

        # Update some variables and do recursive calls
        left_prior_tests = prior_tests.copy()
        left_prior_tests.update({(best_dt.label, best_dt.value): True})
        new_tests = tests.copy()
        new_tests.remove((best_dt.label, best_dt.value))
        best_dt.left = build_dt_from_ensemble(decision_trees, best_pos_data,
                                              class_label, new_tests,
                                              best_pos_entropy,
                                              left_prior_tests, min_nr_samples)

        right_prior_tests = prior_tests.copy()
        right_prior_tests.update({(best_dt.label, best_dt.value): False})
        best_dt.right = build_dt_from_ensemble(decision_trees, best_neg_data,
                                               class_label, new_tests,
                                               best_neg_entropy,
                                               right_prior_tests,
                                               min_nr_samples)

        return best_dt
    else:
        return decisiontree.DecisionTree(value=None,
                                         label=_get_most_occurring_class(
                                             data, class_label))
Example #11
    with open(join(tDir, 'testData.csv'), 'r') as dataFile:
        with open(join(tDir, 'testLabels.csv'), 'r') as labelFile:
            print(tDir + ':', tree.testFile(dataFile, labelFile))


if 'load' in args:
    filename = args['load'][0]
    try:
        with open(filename, 'rb') as treeFile:
            tree = decisiontree.load(treeFile)
    except Exception:
        print(filename, 'could not be loaded!')
        quit()
else:
    print('\nTRAINING\n')
    tree = decisiontree.DecisionTree()
    if 'train' in args:
        for tDir in args['train']:
            trainTree(tDir)
    else:
        tree = decisiontree.DecisionTree()
        trainTree()
        saveTree()
        if 'test' not in args:
            print('\nTESTING\n')
            testTree()

if 'save_tree' in args:
    filename = args['save_tree'][0]
    if filename == 'def':
        saveTree()
Example #12
list_rulesOneR = inst_oneR.get_final_rules()
for rule in list_rulesOneR:
    print "set of rules: "
    print rule
OneRvalidation_acc = inst_oneR.get_average_acc()

inst_logR = log_regression.LogisticRegression(data_reader.copy(), class_values)
print """
----------------
	"""
print("Logistic Regression algorithm...")
inst_logR.begin_alg()
param_classif = inst_logR.get_param()
#print param_classif

inst_dtree = decisiontree.DecisionTree(data_reader.copy(), class_values,
                                       feature_selection_function, KBest)
print """
----------------
	"""
print("Decision Tree Classifier algorithm...")
inst_dtree.begin_alg()
param_classif = inst_dtree.get_param()
#print param_classif

inst_svm = svm.SVM(data_reader.copy(), class_values)
print """
----------------
	"""
print("Support Vector Machines(SVM) algorithm...")
inst_svm.begin_alg()
param_weight = inst_svm.get_param()
Example #13
    def convertToTree(self, verbose=False):
        # Using those arrays, we can parse the tree structure:
        # label = the name of the feature the node splits on
        # value = the value of the feature at which the node splits
        # and build our own DecisionTree.

        n_nodes = self.dt.tree_.node_count
        children_left = self.dt.tree_.children_left
        children_right = self.dt.tree_.children_right
        feature = self.dt.tree_.feature
        threshold = self.dt.tree_.threshold
        classes = self.dt.classes_

        # The tree structure can be traversed to compute various properties such
        # as the depth of each node and whether or not it is a leaf.
        node_depth = np.zeros(shape=n_nodes)
        decision_trees = [None] * n_nodes
        for i in range(n_nodes):
            decision_trees[i] = decisiontree.DecisionTree()
        is_leaves = np.zeros(shape=n_nodes, dtype=bool)
        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            node_depth[node_id] = parent_depth + 1

            # If we have a test node
            if children_left[node_id] != children_right[node_id]:
                stack.append((children_left[node_id], parent_depth + 1))
                stack.append((children_right[node_id], parent_depth + 1))
            else:
                is_leaves[node_id] = True
        if verbose:
            print("The binary tree structure has %s nodes and has "
                  "the following tree structure:" % n_nodes)

        for i in range(n_nodes):

            if children_left[i] > 0:
                decision_trees[i].left = decision_trees[children_left[i]]

            if children_right[i] > 0:
                decision_trees[i].right = decision_trees[children_right[i]]

            if is_leaves[i]:
                # decision_trees[i].label = self.dt.classes_[self.dt.tree_.value[i][0][1]]
                decision_trees[i].label = self.dt.classes_[np.argmax(
                    self.dt.tree_.value[i][0])]
                decision_trees[i].value = None
                # if verbose:
                #     print(bcolors.OKBLUE + "%snode=%s leaf node." % (node_depth[i] * "\t", i)) + bcolors.ENDC
            else:
                decision_trees[i].label = self.features[feature[i]]
                decision_trees[i].value = threshold[i]

                # if verbose:
                #     print("%snode=%s test node: go to node %s if %s %s <= %s %s else to "
                #       "node %s."
                #       % (node_depth[i] * "\t",
                #          i,
                #          children_left[i],
                #          bcolors.BOLD,
                #          self.features[feature[i]],
                #          threshold[i],
                #          bcolors.ENDC,
                #          children_right[i],
                #          ))
        return decision_trees[0]
Example #14
File: server.py  Project: LiaoWenyun/wfa
def tree():
    x = decisiontree.DecisionTree()
    mytree = x.createDecisionTreeModel()

    return str(mytree)
Example #15
for i in range(28):
    #print("entropy vect ",i,dt.entropy(datax[i]))
    entropy_cond = dtree.entropy_cond(
        [datay[datax[:, i] == 1], datay[datax[:, i] == 0]])
    print("Entropie conditionelle ", i, ", ", fields[i], " : ", entropy_cond)
    print("Différence : ", entropy - entropy_cond)

#If the difference between the entropy and the conditional entropy is 0,
#the attribute does not reduce the entropy at all,
#so its "information gain" is zero.
#For the first partition, the best attribute is "drama", because it has
#the largest difference between the entropy and the conditional entropy;
#in other words, it is the attribute that reduces the entropy the most.
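
# A self-contained sketch of the quantity discussed above (assumed helper
# names; the course's dtree.entropy / dtree.entropy_cond may differ in detail):
# the information gain of attribute i is H(Y) - H(Y | X_i), where the
# conditional entropy weights the entropy of each partition by its relative size.
import numpy as np

def _entropy(y):
    # Shannon entropy of a label vector, in bits.
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def _entropy_cond(partitions):
    # Size-weighted average entropy of the label partitions.
    n = sum(len(part) for part in partitions)
    return sum(len(part) / n * _entropy(part) for part in partitions)

# e.g. gain of attribute i:
#   gain_i = _entropy(datay) - _entropy_cond([datay[datax[:, i] == 1],
#                                             datay[datax[:, i] == 0]])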

for i in range(0, depth):
    dt = dtree.DecisionTree()
    dt.max_depth = i  # set the maximum depth of the tree to i
    dt.min_samples_split = 2  # minimum number of samples required to split a node
    dt.fit(datax, datay)
    dt.predict(datax[:5, :])
    print("Score for depth " + str(i) + " : " +
          str(dt.score(datax, datay)))
    # draw the tree to a pdf file if pydot is installed
    #dt.to_pdf("./tmp/testtree_deep_" + str(i) + ".pdf", fields)
    # otherwise use http://www.webgraphviz.com/
    #dt.to_dot(fields)
    # or print it in the console
    #print(dt.print_tree(fields))

###### Question 1.4