Example #1
    def train(self, D, X, Y):
        for i in range(self.num_trees):
            x = random.sample(X, int(len(X) * self.max_X))
            d = D.sample(frac=self.max_samples)
            tree = DecisionTree()
            tree.train(d, x, Y)
            self.trees.append(tree)
Example #2
def computeDecisionTreeCrossValidation(args, dict_algorithms):
    if args.debug:
        print("Running decision tree...", end='')
    model = DecisionTree(args)
    dict_algorithms["decision_tree"] = model.computeCrossValidation()
    if args.debug:
        print("ok!")
Example #3
    def __init__(self):
        self.forest_tree = {}
        self.test_list = []
        self.tree = DecisionTree()
        self.sample = BalanceSample()
        self.file_name = open("/Users/homelink/storein/rent.txt", "r")
        self.datas = []
Example #4
    def regions_to_tree_improved(self,
                                 features_df,
                                 labels_df,
                                 regions,
                                 features,
                                 feature_mins,
                                 feature_maxs,
                                 max_samples=1):

        lines = self.find_lines(regions, features, feature_mins, feature_maxs)
        # Guard before touching lines.keys(): find_lines may return None or {}.
        if lines is None or len(lines) == 0:
            lines_keys = []
        else:
            lines_keys = [key for key in lines.keys() if len(lines[key]) > 0]
        if len(lines_keys) == 0:
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        random_label = np.random.choice(lines_keys)
        random_value = np.random.choice(lines[random_label])
        data = DataFrame(features_df)
        data['cat'] = labels_df
        best_split_node = DecisionTree(
            data=data,
            label=random_label,
            value=random_value,
            left=DecisionTree(data=data[data[random_label] <= random_value]),
            right=DecisionTree(data=data[data[random_label] > random_value]))
        node = DecisionTree(label=best_split_node.label,
                            value=best_split_node.value,
                            data=best_split_node.data)

        feature_mins_right = feature_mins.copy()
        feature_mins_right[node.label] = node.value
        feature_maxs_left = feature_maxs.copy()
        feature_maxs_left[node.label] = node.value
        regions_left = []
        regions_right = []
        for region in regions:
            if region[best_split_node.label][0] < best_split_node.value:
                regions_left.append(region)
            else:
                regions_right.append(region)
        if len(best_split_node.left.data) >= max_samples and len(
                best_split_node.right.data) >= max_samples:
            node.left = self.regions_to_tree_improved(
                best_split_node.left.data.drop('cat', axis=1),
                best_split_node.left.data[['cat']], regions_left, features,
                feature_mins, feature_maxs_left)
            node.right = self.regions_to_tree_improved(
                best_split_node.right.data.drop('cat', axis=1),
                best_split_node.right.data[['cat']], regions_right, features,
                feature_mins_right, feature_maxs)

        else:
            node.label = str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
            node.value = None

        return node
Example #5
    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(min_samples_split=self.min_samples_split,
                                max_depth=self.max_depth,
                                n_features=self.n_feature)
            x_sample, y_sample = bootstrap_sample(X, y)
            tree.fit(x_sample, y_sample)
            self.trees.append(tree)
Example #6
File: TME1.py Project: mtrazzi/ARF
def predict(depth, x, y, x_test, y_test):
    dt = DecisionTree()
    dt.max_depth = depth  # set the maximum depth of the tree
    dt.min_samples_split = 2  # minimum number of samples required to split a node
    dt.fit(x, y)
    dt.predict(x_test[:5, :])
    score = dt.score(x_test, y_test)
    print(score)
    return score
Example #7
def main():

    word_list = create_words()

    data, labels = readfile(word_list)
    tree = DecisionTree()
    data_train, data_test, labels_train, labels_test = \
        train_test_split(data, labels, test_size=test_size, random_state=42)

    ## calls our tree algorithm and prediction method ##
    tree.train(data_train, labels_train)
    labels_pred = tree.predict(data_test)
    compute_accuracy(labels_test, labels_pred)
Example #8
    def fit(self, datanum, ans):
        for _ in range(self.num_tree):
            x_train, _, y_train, _ = train_test_split(datanum,
                                                      ans,
                                                      test_size=1.0 -
                                                      self.sample_data_rate)

            tree = DecisionTree(x_train,
                                y_train,
                                rand_features=self.sample_features)
            tree.fit()

            self.trees.append(tree)
Example #9
def erreurs(taux_app, data, prof_max):

    erreurs_train = []
    erreurs_test = []
    x = [i for i in range(2, prof_max)]
    dataTrain, dataTest = partition(taux_app, data)
    train_x, train_y = x_y(dataTrain)
    test_x, test_y = x_y(dataTest)

    for i in range(2, prof_max):
        dt = DecisionTree()
        dt.max_depth = i
        dt.min_samples_split = 2
        dt.fit(train_x, train_y)
        erreurs_train.append(1 - dt.score(train_x, train_y))
        erreurs_test.append(1 - dt.score(test_x, test_y))

    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(x, erreurs_train)
    plt.plot(x, erreurs_test)
    plt.ylabel('erreur en fonction de la profondeur, taux app : ' +
               str(taux_app))
    plt.legend(['app', 'test'], loc='upper left')
    plt.savefig(str(taux_app) + "erreurs.png")
    plt.show()
Example #10
    def divide_data(self, data, feature, value):
        """
        Divide the data into two subsets using pandas.
        :param data: the dataframe to divide
        :param feature: the dataframe column to split on
        :param value: the threshold used for the split
        :return: an initialised decision tree node
        """
        # print(data[feature], feature, value)
        return DecisionTree(left=DecisionTree(data=data[data[feature] <= value]),
                            right=DecisionTree(data=data[data[feature] > value]),
                            label=feature,
                            data=data,
                            value=value)
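The split itself is plain pandas boolean masking. A minimal, self-contained sketch of just that step; the toy column name and threshold are assumptions, not part of the original example:

from pandas import DataFrame

# toy frame: split the rows around a threshold, exactly as divide_data does
data = DataFrame({'age': [22, 35, 47, 51], 'cat': [0, 0, 1, 1]})
feature, value = 'age', 40
left = data[data[feature] <= value]   # rows at or below the threshold
right = data[data[feature] > value]   # rows strictly above it
print(len(left), len(right))  # 2 2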
Example #11
File: tme1.py Project: keyber/ARF
def scoreTrain():
    scores = []
    for depth in profondeurs:
        dt = DecisionTree(depth)
        dt.fit(datax, datay)
        #dt.predict(datax[:5, :])
        scores.append(dt.score(datax, datay))
        # draws the tree to a PDF file if pydot is installed
        #dt.to_pdf("/tmp/test_tree.pdf", fields)
        # otherwise use http://www.webgraphviz.com/
        #print(dt.to_dot(fields))
        # or in the console
        #print(dt.print_tree(fields))
    return scores
Example #12
def part5():
    """We take 2 features with high class correlation to show decision boundaries
    for breast cancer: Uniformity_of_Cell_Shape and Uniformity_of_Cell_Size"""
    # Dataset 1
    # KNN
    df = reduce_df(cancer_df, 3)
    cancer_features, cancer_labels = Preprocessing.get_labels_features(df)

    KNN.plot_decision_bound(
        cancer_features,
        cancer_labels,
        df.keys()[1],
        df.keys()[2],
        KNN,
        k=cancer_k,
    )
    # Decision Tree
    DecisionTree.plot_decision_bound(
        cancer_features,
        cancer_labels,
        df.keys()[1],
        df.keys()[2],
        DecisionTree,
        max_depth=cancer_d,
    )

    # Dataset 2
    # KNN
    df = reduce_df(hepatitis_df, 3)
    hepatitis_features, hepatitis_labels = Preprocessing.get_labels_features(
        df)

    KNN.plot_decision_bound(
        hepatitis_features,
        hepatitis_labels,
        df.keys()[1],
        df.keys()[2],
        KNN,
        k=hepatitis_k,
    )
    # Decision Tree
    DecisionTree.plot_decision_bound(
        hepatitis_features,
        hepatitis_labels,
        df.keys()[1],
        df.keys()[2],
        DecisionTree,
        max_depth=hepatitis_d,
    )
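plot_decision_bound's body is not shown in this example. A generic, hedged sketch of how such plots are usually produced (dense meshgrid, predict at every grid point, filled contour), assuming only a fitted model with a predict method; every name below is illustrative:

import numpy as np
import matplotlib.pyplot as plt

def plot_boundary_sketch(model, X, y):
    # evaluate the model over a dense grid spanning the two features
    xx, yy = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 200),
                         np.linspace(X[:, 1].min(), X[:, 1].max(), 200))
    zz = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, zz, alpha=0.3)       # shaded decision regions
    plt.scatter(X[:, 0], X[:, 1], c=y, s=15)  # training points on top
    plt.show()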
Example #13
    def decision_tree_learning(examples, attributes, parent_examples=()):
        if len(examples) == 0:
            return plurality_value(parent_examples)
        elif same_classification(examples):
            return DecisionLeaf(examples[0][target])
        elif len(attributes) == 0:
            return plurality_value(examples)
        elif percent_error(examples) < error_threshold:
            return plurality_value(examples)
        else:
            a = importance(attributes, examples)
            tree = DecisionTree(a, dataset.attrnames[a])
            for (val_i, exs_i) in split_by(a, examples):
                subtree = decision_tree_learning(exs_i, removeall(a, attributes), examples)
                tree.add(val_i, subtree)
            return tree
Example #14
def handle_predict(argv):
    hypothesis = None
    model = None
    with open(argv[3], "r") as f:
        # DONT DO THIS ITS INSECURE. IM INSANE
        model = f.readline().strip('\n')
        hypothesis = f.readline()
    hypothesis = literal_eval(hypothesis)
    tree = DecisionTree()
    tree.define_positive_class(lambda x: x.classification == 'en')
    tree.define_classes(processing.classes)
    tree.define_attributes(processing.attr_definitions)
    examples = process_file(argv[4], training=False)
    examples = tree.create_examples(examples)
    return tree.classify(examples, hypothesis)
Example #15
    def train(self, data, labels):
        self.trees = []
        for i in range(self.ITERATIONS):
            inds = np.random.choice(np.arange(len(data)), len(data))
            self.trees.append(
                DecisionTree(min_leaf=self.MIN_LEAF,
                             m=self.M,
                             max_depth=self.MAX_DEPTH))
            self.trees[-1].train(data[inds], labels[inds])
Example #16
    def decision_tree_learning(examples, attributes, m, parent_examples=()):
        if len(examples) == 0:
            return majority_value(parent_examples)
        elif same_classification(examples):
            return DecisionLeaf(examples[0][target])
        elif len(attributes) == 0:
            return majority_value(examples)
        elif misclass_error(examples) < m:
            return majority_value(examples)
        else:
            A = pick_attribute(attributes, examples)
            tree = DecisionTree(A, dataset.attrnames[A])
            nonlocal internal_nodes
            internal_nodes += 1
            for (val_i, exs_i) in split(A, examples):
                subtree = decision_tree_learning(exs_i, removeall(A, attributes),
                                                 m, examples)
                tree.add(val_i, subtree)
            return tree
Example #17
def part4():
    hepatitis_p = KNN.tune_knn_p(hepatitis_df)
    cancer_p = KNN.tune_knn_p(cancer_df)
    print("\nThe ideal P for hepatitis minkowski distance function:",
          hepatitis_p)
    print("The ideal P for breast cancer minkowski distance function:",
          cancer_p)

    hepatitis_cf = DecisionTree.tune_costfn(X_train_h, X_test_h, y_train_h,
                                            y_test_h, hepatitis_d)
    print(
        "\nThe most accurate cost function for hepatitis dataset:",
        hepatitis_cf.__name__,
    )
    cancer_cf = DecisionTree.tune_costfn(X_train_c, X_test_c, y_train_c,
                                         y_test_c, cancer_d)
    print(
        "\nThe most accurate cost function for breast cancer  dataset:",
        cancer_cf.__name__,
    )
Example #18
    def fit(self, X, y):
        """Build multiple trees based on training data.

        Args:
            X (numpy array): sample in shape [n x d], where n is
            number of samples and d is number of features.
            y (numpy array): sample labels in shape [n].
        """
        n, d = X.shape
        for i in range(self.tree_num):
            # draws random subset of features
            features = np.random.choice(d, self.fc, replace=False)
            tree = DecisionTree(self.max_depth, self.min_improv,
                                self.eval_func)
            samples = np.random.choice(n, n, replace=True)
            X_train = X[:, features][samples, ]
            y_train = y[samples]
            tree.fit(X_train, y_train)

            self.features[i] = features
            self.trees[i] = tree
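The two np.random.choice calls above implement the usual random-forest recipe: a bootstrap sample of the rows plus a random feature subset per tree. A minimal standalone sketch of just that sampling step, with made-up sizes:

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((100, 8))                      # 100 samples, 8 features
fc = 3                                        # features per tree, as in self.fc
features = rng.choice(8, fc, replace=False)   # column subset, no repeats
samples = rng.choice(100, 100, replace=True)  # bootstrap of the rows
X_train = X[:, features][samples]
print(X_train.shape)  # (100, 3)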
Example #19
File: tme1.py Project: ykrmm/ARF
def partitionnement_test(datax, datay, rp, rdm):  # rp: the proportion used for the training set

    dt = DecisionTree()
    dt.min_samples_split = 2
    if rdm:
        rp = random.uniform(0, 1)
    indiceap = np.random.choice(np.arange(len(datax)),
                                int(rp * len(datax)),
                                replace=False)
    indicet = []
    for i in range(0, len(datax)):
        if i not in indiceap:
            indicet.append(i)
    testy = np.zeros((len(indicet)), int)
    apprentissagey = np.zeros((len(indiceap)), int)

    testx = np.delete(datax, indiceap, axis=0)

    apprentissagex = np.delete(datax, indicet, axis=0)

    for i in range(0, len(indiceap)):
        apprentissagey[i] = datay[indiceap[i]]
    for i in range(0, len(indicet)):
        testy[i] = datay[indicet[i]]

    l_scoretest = []
    l_scoreapprentissage = []

    for i in range(2, 20, 3):
        dt.max_depth = i
        dt.fit(apprentissagex, apprentissagey)
        dt.predict(apprentissagex[:5, :])
        l_scoretest.append(1 - dt.score(testx, testy))
        l_scoreapprentissage.append(1 -
                                    dt.score(apprentissagex, apprentissagey))
    plt.plot(range(2, 20, 3), l_scoretest, 'r--', range(2, 20, 3),
             l_scoreapprentissage, 'b--')
    plt.show()
    plt.close()
Example #20
def partitionnement_test(datax, datay, rp, rdm, couleur):
    # rp: the proportion used for the training set.
    # rdm: a boolean deciding whether the split proportion is drawn at random.
    dt = DecisionTree()
    dt.min_samples_split = 2
    if rdm:
        rp = random.uniform(0, 1)
    # indiceap: the indices of datax used for training; indicet: those used for testing.
    # indiceap is drawn at random from datax with proportion rp, without replacement.
    indiceap = np.random.choice(np.arange(len(datax)), int(rp * len(datax)), replace=False)
    indicet = []
    for i in range(0, len(datax)):
        if i not in indiceap:
            indicet.append(i)
    testy = np.zeros((len(indicet)), int)
    apprentissagey = np.zeros((len(indiceap)), int)

    testx = np.delete(datax, indiceap, axis=0)
    apprentissagex = np.delete(datax, indicet, axis=0)

    for i in range(0, len(indiceap)):
        apprentissagey[i] = datay[indiceap[i]]
    for i in range(0, len(indicet)):
        testy[i] = datay[indicet[i]]

    l_scoretest = []
    l_scoreapprentissage = []
    # Try tree depths in steps of 3 to keep the computation time reasonable.
    for i in range(2, 20, 3):
        dt.max_depth = i
        dt.fit(apprentissagex, apprentissagey)
        dt.predict(apprentissagex[:5, :])
        l_scoretest.append(1 - dt.score(testx, testy))
        l_scoreapprentissage.append(1 - dt.score(apprentissagex, apprentissagey))
    plt.plot(range(2, 20, 3), l_scoretest, couleur + '--',
             range(2, 20, 3), l_scoreapprentissage, couleur)
    plt.show()
Example #21
def part3():
    hepatitis_d = DecisionTree.tune_tree_depth(X_train_h,
                                               X_test_h,
                                               y_train_h,
                                               y_test_h,
                                               training=True)
    cancer_d = DecisionTree.tune_tree_depth(X_train_c,
                                            X_test_c,
                                            y_train_c,
                                            y_test_c,
                                            training=True)
    print("\nThe ideal depth for hepatitis is (based on train accuracy):",
          hepatitis_d)
    print("The ideal depth for breast cancer is (based on train accuracy):",
          cancer_d)

    hepatitis_d = DecisionTree.tune_tree_depth(X_train_h, X_test_h, y_train_h,
                                               y_test_h)
    cancer_d = DecisionTree.tune_tree_depth(X_train_c, X_test_c, y_train_c,
                                            y_test_c)
    print("\nThe ideal depth for hepatitis is (based on test accuracy):",
          hepatitis_d)
    print("The ideal depth for breast cancer is (based on test accuracy):",
          cancer_d)
Example #22
def _convert_to_tree(dt, features):
    """Convert a sklearn object to a `decisiontree.decisiontree` object"""
    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold
    classes = dt.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    for i in range(n_nodes):

        if children_left[i] > 0:
            decision_trees[i].left = decision_trees[children_left[i]]

        if children_right[i] > 0:
            decision_trees[i].right = decision_trees[children_right[i]]

        if is_leaves[i]:
            decision_trees[i].label = dt.classes_[np.argmax(
                dt.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
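A hedged usage sketch: the toy data and feature names below are assumptions; any fitted sklearn decision tree exposing tree_ and classes_ should work with _convert_to_tree.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

X = np.random.rand(50, 2)
y = (X[:, 0] > 0.5).astype(int)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
root = _convert_to_tree(clf, ['feature_0', 'feature_1'])  # root of the converted tree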
Example #23
File: tme1.py Project: keyber/ARF
def scoreTrainTest(f: float):
    assert 0 < f <= 1
    l = int(tot * f)
    scoresTrain = []
    scoresTest = []
    for depth in profondeurs:
        dt = DecisionTree(depth)
        dt.fit(datax[:l], datay[:l])
        scoresTrain.append(dt.score(datax[:l], datay[:l]))
        scoresTest.append(dt.score(datax[l:], datay[l:]))
    return scoresTrain, scoresTest
Example #24
    def construct_tree(self, training_feature_vectors, labels, current_depth=0):
        # First find the best split feature and its type
        feature, feature_type = self.find_split_feature(training_feature_vectors.copy(), labels.copy())

        # Can be removed later
        if len(labels) == 0:
            return DecisionTree(label=self.default, value=None, data=None)

        data = DataFrame(training_feature_vectors.copy())
        data['cat'] = labels

        # Only pre-pruning enabled at this moment (QUEST already has very nice trees)
        if feature is None or len(data) == 0 or len(training_feature_vectors.index) <= self.max_nr_nodes \
                or len(np.unique(data['cat'])) == 1 or self.all_feature_vectors_equal(training_feature_vectors)\
                or current_depth >= self.max_depth:
            # Create leaf with label most occurring class
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)

        # If we don't need to pre-prune, we calculate the best possible splitting point for the best split feature
        split_point = self.find_best_split_point(data.copy(), feature, feature_type)

        if split_point is None or math.isnan(split_point):
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)


        # Divide the data using this best split feature and value and call recursively
        split_node = self.divide_data(data.copy(), feature, split_point)
        if len(split_node.left.data) == 0 or len(split_node.right.data) == 0:
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)
        node = DecisionTree(label=split_node.label, value=split_node.value, data=split_node.data)
        node.left = self.construct_tree(split_node.left.data.drop('cat', axis=1),
                                        split_node.left.data[['cat']], current_depth+1)
        node.right = self.construct_tree(split_node.right.data.drop('cat', axis=1),
                                         split_node.right.data[['cat']], current_depth+1)

        return node
Example #25
def scores_selon_prof(taux_app, data, prof_max):
    scores = []
    x = [i for i in range(2, prof_max)]
    dataTrain, dataTest = partition(taux_app, data)
    train_x, train_y = x_y(dataTrain)
    test_x, test_y = x_y(dataTest)

    for i in range(2, prof_max):
        dt = DecisionTree()
        dt.max_depth = i
        dt.min_samples_split = 2
        dt.fit(train_x, train_y)
        scores.append(dt.score(test_x, test_y))

    import matplotlib.pyplot as plt
    plt.plot(x, scores)
    plt.ylabel('score en fonction de la profondeur, taux app : ' +
               str(taux_app))
    plt.savefig(str(taux_app) + "scores.png")
    plt.show()
Example #26
    def decision_tree_from_text(self, lines):

        dt = DecisionTree()

        if '<=' in lines[0] or '>' in lines[0]:
            # Intermediate node
            node_name = lines[0].split(':')[0].lstrip()
            label, value = lines[0].split(':')[1].split('<=')
            label = ' '.join(label.lstrip().rstrip().split('.'))
            value = value.lstrip().split()[0]
            dt.label = label
            dt.value = float(value)
            dt.left = self.decision_tree_from_text(lines[1:])
            counter = 1
            while lines[counter].split(':')[0].lstrip() != node_name:
                counter += 1
            dt.right = self.decision_tree_from_text(lines[counter + 1:])
        else:
            # Terminal node
            dt.label = int(float(lines[0].split(':')[1].lstrip()))

        return dt
Example #27
def validation_croisee(n, taux_app, data, prof_max):
    data_app, _ = partition(taux_app, data)
    erreurs_moy_app = []
    borders = np.linspace(0, len(data_app), n + 1, dtype=int)

    for depth in range(1, prof_max):
        print(depth)
        erreurs_test = []
        for i in range(n):
            data_test = data_app[borders[i]:borders[i + 1]]
            if len(data_app[0:borders[i]]) > 0:
                data_train = np.concatenate(
                    (data_app[0:borders[i]],
                     data_app[borders[i + 1]:len(data_app)]))
            else:
                data_train = data_app[borders[i + 1]:len(data_app)]

            train_x, train_y = x_y(data_train)
            test_x, test_y = x_y(data_test)

            dt = DecisionTree()
            dt.max_depth = depth
            dt.min_samples_split = 2
            dt.fit(train_x, train_y)
            erreurs_test.append(1 - dt.score(test_x, test_y))
        print(erreurs_moy_app)
        erreurs_moy_app.append((1 / n) * np.array(erreurs_test).sum())

    x = [i for i in range(1, prof_max)]
    fig = plt.figure()
    plt.plot(x, erreurs_moy_app)
    plt.xlabel(
        'Erreur moyenne en fonction de la prof avec VC avec taux app de : ' +
        str(taux_app))
    plt.legend(['app'], loc='upper left')
    plt.savefig(str(taux_app) + "erreursVC.png")
    #plt.show()
    return
Example #28
def apprentissage(datax, datay, prop):
    ax = datax[:int(np.floor(prop * len(datax)))]  # training data
    ay = datay[:int(np.floor(prop * len(datax)))]

    tx = datax[int(np.floor(prop * len(datax))):]  # test data
    ty = datay[int(np.floor(prop * len(datax))):]

    ascore = np.zeros(9)
    tscore = np.zeros(9)

    for d in range(1, 28, 3):
        print("apprentissage : prop = " + str(prop) + " depth = " + str(d))
        dt = DecisionTree()
        dt.max_depth = d  # set the maximum depth of the tree
        dt.min_samples_split = 2  # minimum number of samples required to split a node
        dt.fit(ax, ay)
        ascore[int(np.floor(d / 3))] = 1 - dt.score(ax, ay)
        tscore[int(np.floor(d / 3))] = 1 - dt.score(tx, ty)
    plt.plot(range(1, 28, 3), ascore)
    plt.plot(range(1, 28, 3), tscore)
    plt.legend(["Apprentissage", "Test"])
    plt.title("Proportion : " + str(prop))
    plt.show()
Example #29
File: tme1.py Project: keyber/ARF
def scoreCross(n=5):
    """Averages over n runs; each test set has size tot/n."""
    assert isinstance(n, int)
    scoresTrain = []
    scoresTest = []
    for depth in profondeurs:
        sTrain = 0
        sTest = 0
        for i in range(n):
            start = tot * i // n
            end = tot * (i + 1) // n
            dt = DecisionTree(depth)
            xtrain = np.vstack((datax[:start], datax[end:]))
            ytrain = np.hstack((datay[:start], datay[end:]))
            dt.fit(xtrain, ytrain)
            sTrain += dt.score(xtrain, ytrain)
            sTest += dt.score(datax[start:end], datay[start:end])
        scoresTrain.append(sTrain / n)
        scoresTest.append(sTest / n)
    return scoresTrain, scoresTest
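The start/end arithmetic above tiles the data exactly: fold i covers indices [tot*i//n, tot*(i+1)//n), so fold sizes differ by at most one. A tiny standalone check, with an assumed tot:

tot, n = 103, 5
for i in range(n):
    start, end = tot * i // n, tot * (i + 1) // n
    print(i, start, end, end - start)  # contiguous folds, sizes 20 or 21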
Example #30
class Decision(object):
    def __init__(self):
        self.forest_tree = {}
        self.test_list = []
        self.tree = DecisionTree()
        self.sample = BalanceSample()
        self.file_name = open("/Users/homelink/storein/rent.txt", "r")
        self.datas = []

    def generateSample(self):
        positive = []
        negative = []
        count = 0
        for data in self.file_name.readlines():
            rent_dic = json.loads(data)
            self.datas.append([
                rent_dic["business_area"], rent_dic["area"], rent_dic["width"],
                rent_dic["face"], rent_dic["structure"], rent_dic["height"],
                rent_dic["day_rent_per_centare"], rent_dic["tenancy"],
                rent_dic["transfer_fee"], rent_dic["licence"],
                rent_dic["water"], rent_dic["power"], rent_dic["fire"],
                rent_dic["wind"], rent_dic["gas"], rent_dic["industry"],
                rent_dic["is_rent"]
            ])

            if rent_dic["is_rent"] == "True":
                positive.append(count)
            else:
                negative.append(count)
            count = count + 1
        data = self.sample.over_sample(positive, negative, self.datas)
        self.datas = self.datas + data

    def run(self):
        self.generateSample()
        tree = self.tree.buildtree(self.datas, self.tree.giniimpurity_2)
        # prune(tree,0.1)
        self.tree.printtree(tree)

    def frequence(self, trade):
        trade_dic = {}
        trade_area = []
        for i in trade:
            if i in trade_dic.keys():
                trade_dic[i] = trade_dic[i] + 1
            else:
                trade_dic[i] = 1
        max_count = max(trade_dic.values())  # highest occurrence count
        for key, value in trade_dic.items():
            if value == max_count:
                trade_area.append(key)
        return trade_area

    def accuracy(self):
        true_index = 0
        false_index = 0
        test_list = []
        test = open("/Users/homelink/dianping/test.txt", "r")
        for line in test.readlines():
            test_list = test_list + list(eval(line))

        for i in range(len(test_list)):
            test = test_list[i]
            result_true = test[5]
            trade_store = []

            for tree in self.forest_tree.values():
                result = classify([
                    test[0],
                    int(test[1]),
                    float(test[2]),
                    float(test[3]),
                    float(test[4])
                ], tree)

                max_value = 0
                for label, value in result.items():
                    if value > max_value:
                        max_value = value
                        trade = label
                trade_store.append(trade)

            option_result = self.frequence(trade_store)

            if result_true in option_result:
                true_index = true_index + 1
            else:
                false_index = false_index + 1

        return true_index / len(test_list)