def run_id3(data, test_data, metric, tree_depth, data_percents,
            train_data_percents):
    id3 = Id3.Id3(metric)
    print("\n--- Using Tree level " + str(tree_depth) + " ---")
    id3.fit(data.examples, data.attributes, None, data.labels, 0, tree_depth)

    correct_results = 0
    for example in test_data.examples:
        if example.get_label() == id3.predict(example):
            correct_results += 1

    percentage = float(correct_results) / float(len(test_data.examples))
    if data_percents is not None:
        data_percents.append(percentage)

    print("Test Error: " + "%.16f" % (1.0 - percentage))

    correct_results = 0
    for example in data.examples:
        if example.get_label() == id3.predict(example):
            correct_results += 1

    percentage = float(correct_results) / float(len(data.examples))
    if train_data_percents is not None:
        train_data_percents.append(percentage)

    print("Training Error: " + "%.16f" % (1.0 - percentage))
    max_height = id3.max_height
    id3.reset_max_height()

    return max_height
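

# --- Hypothetical usage sketch (not part of the original listing) ---
# run_id3 appends the test/train accuracies it computes to the two lists it
# is handed, so one plausible way to build error-vs-depth curves is a depth
# sweep like this. run_depth_sweep and its default depth are assumptions;
# the data classes and paths mirror forest_bagged_cross_comparison below.
def run_depth_sweep(max_depth=6):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = BankData.Data()
    data.initialize_data_from_file(dir_path + '/../../Data/bank/train.csv',
                                   False)
    test_data = BankData.Data()
    test_data.initialize_data_from_file(dir_path + '/../../Data/bank/test.csv',
                                        False)

    test_accuracies, train_accuracies = [], []
    for depth in range(1, max_depth + 1):
        run_id3(data, test_data, 'information_gain', depth,
                test_accuracies, train_accuracies)
    return test_accuracies, train_accuracies
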
def forest_bagged_cross_comparison():
    # Train data
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = BankData.Data()
    data.initialize_data_from_file(dir_path + '/../../Data/bank/train.csv',
                                   False)

    # Test data
    test_data = BankData.Data()
    test_data.initialize_data_from_file(dir_path + '/../../Data/bank/test.csv',
                                        False)

    random_forests = []
    full_trees = []

    size = int(
        input(
            "Please enter a number for the cardinality of the set of random attributes:\n"
        ))

    counter = 1
    toolbar_width = 100
    print("Building trees")
    sys.stdout.write("Progress: [%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    for i in range(0, 100):
        # sample 1000 features uniformly without replacement
        examples = get_samples(data)
        forest = RandomForests.RandomForests(t_value=100,
                                             features=examples,
                                             attributes=data.attributes,
                                             size=size)
        forest.fit(print_status_bar=False)
        random_forests.append(forest)
        id3 = Id3.Id3(metric='information_gain')
        # fit() builds the tree in place, so store the fitted Id3 object
        # rather than fit()'s return value.
        id3.fit(examples, data.attributes, None, data.labels, 0, float("inf"))
        full_trees.append(id3)
        sys.stdout.write('\r')
        sys.stdout.flush()
        sys.stdout.write('Progress: [%s' % ('#' * counter))
        sys.stdout.write('%s]' % (' ' * (toolbar_width - counter)))
        sys.stdout.flush()
        counter += 1

    print("\nCalculating squared mean error of full trees.")
    full_trees_results = get_squared_mean_error_np(data, full_trees, False)
    print("\nMean Squared Error for the full trees is: " + "%.16f" %
          (full_trees_results[0] + full_trees_results[1]))
    print("Bias was %s, Variance was %s" %
          (full_trees_results[0], full_trees_results[1]))
    print("Calculating squared mean error for bagged trees.")
    random_forest_results = get_squared_mean_error_np(data, random_forests,
                                                      True)
    print("\nMean Squared Error for the bagged trees is: " + "%.16f" %
          (random_forest_results[0] + random_forest_results[1]))
    print("Bias was %s, Variance was %s" %
          (random_forest_results[0], random_forest_results[1]))
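

# --- Hypothetical sketch of get_squared_mean_error_np (assumption) ---
# The real helper is defined elsewhere in the repository; the printouts above
# add its two return values to report a mean squared error, so it is assumed
# to return the average squared bias and the average prediction variance over
# the examples. A common predict(example) interface on each predictor is
# assumed, which is why the is_forest flag is unused in this sketch.
def get_squared_mean_error_np_sketch(data, predictors, is_forest):
    biases = []
    variances = []
    for example in data.examples:
        # Every predictor's numeric prediction for this example.
        predictions = np.array(
            [float(p.predict(example)) for p in predictors])
        label = float(example.get_label())
        # Squared bias of the average prediction, sample variance of the
        # predictions across predictors.
        biases.append((np.mean(predictions) - label) ** 2)
        variances.append(np.var(predictions, ddof=1))
    return np.mean(biases), np.mean(variances)
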
    def fit(self, print_status=False):
        """
        train Adaboost

        :param print_status: set to True if a status printout is desired
        :type print_status: boolean
        :return: None
        :rtype: None
        """
        if print_status:
            print("Building AdaBoost trees")
            sys.stdout.write("Progress: 0 / %s" % self.t_value)
            sys.stdout.flush()

        for i in range(0, self.t_value):
            id3 = Id3.Id3(metric='weighted_information_gain')
            id3.fit(features=self.features,
                    attributes=self.attributes,
                    prev_value=None,
                    label_set=(-1, 1),
                    current_depth=0,
                    max_depth=1)
            self.h_classifiers[i] = id3
            # Get predictions
            for i_, feature in enumerate(self.features):
                self.h_predictions[i_] = float(id3.predict(feature))

            epsilon = self.get_epsilon()
            self.alphas[i] = 0.5 * np.log((1.0 - epsilon) / epsilon)
            self.update_dt(self.alphas[i])

            # Update weights
            for j, feature in enumerate(self.features):
                feature.set_weight(self.dt[j])

            if print_status:
                sys.stdout.write("\rProgress: %s / %s" % (i + 1, self.t_value))
                sys.stdout.flush()

        if print_status:
            sys.stdout.write('\n')
            sys.stdout.flush()
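
    # --- Hypothetical sketches of get_epsilon and update_dt (assumptions) ---
    # Neither helper appears in this listing. With labels in {-1, +1}, the
    # per-example distribution self.dt, and the cached self.h_predictions,
    # the standard AdaBoost forms would look roughly like the following.
    def get_epsilon_sketch(self):
        # Weighted error: total weight of the examples the current stump
        # misclassifies.
        return sum(self.dt[j]
                   for j, feature in enumerate(self.features)
                   if float(feature.get_label()) != self.h_predictions[j])

    def update_dt_sketch(self, alpha):
        # D_{t+1}(j) is proportional to D_t(j) * exp(-alpha * y_j * h_t(x_j)),
        # renormalized so the weights sum to one.
        for j, feature in enumerate(self.features):
            self.dt[j] *= np.exp(-alpha * float(feature.get_label()) *
                                 self.h_predictions[j])
        total = sum(self.dt)
        self.dt = [weight / total for weight in self.dt]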
Example #4
    def fit(self, print_status_bar):
        """
        train random forests

        :param print_status_bar: set to True if a status printout is desired
        :type print_status_bar: boolean
        :return: None
        :rtype: None
        """
        counter = 1
        toolbar_width = 100
        factor = int(self.t_value / toolbar_width)
        factor = max(factor, 1)
        if print_status_bar:
            print("Building Bagging Trees")
            sys.stdout.write("Progress: [%s]" % (" " * toolbar_width))
            sys.stdout.flush()

        for i in range(0, self.t_value):
            # Get bootstrap example
            bootstrap_sample = get_bootstrap_sample(self.features)
            id3 = Id3.Id3(metric='information_gain')
            id3.fit(features=bootstrap_sample,
                    attributes=self.attributes,
                    prev_value=None,
                    label_set=(-1, 1),
                    current_depth=0,
                    max_depth=float("inf"),
                    rand_attribute_size=self.size)
            self.forest.append(id3)

            if i % factor == 0 and print_status_bar:
                sys.stdout.write('\r')
                sys.stdout.flush()
                sys.stdout.write('Progress: [%s' % ('#' * counter))
                sys.stdout.write('%s]' % (' ' * (toolbar_width - counter)))
                sys.stdout.flush()
                counter += 1

        if print_status_bar:
            print("")
Example #5
def run_cross_comparison():
    print_error_calculation_status = False
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = CreditDefaultData.Data()
    data.initialize_data_from_file(dir_path + '/../../Data/credit/credit.csv')

    iterates = [1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 90, 100]
    it_sum = float(np.sum(iterates))

    ada_boost_train_error = []
    ada_boost_test_error = []
    bagging_trees_train_error = []
    bagging_trees_test_error = []
    random_forests_train_error = []
    random_forests_test_error = []

    ada_trees = AdaBoost.Adaboost(features=data.train_examples,
                                  attributes=data.attributes,
                                  t_value=100)
    ada_trees.fit(print_status=True)
    bag_trees = BaggingTrees.BaggingTrees(features=data.train_examples,
                                          attributes=data.attributes,
                                          t_value=100,
                                          attribute_factor=2)
    bag_trees.fit(print_status_bar=True)
    r_forest = RandomForests.RandomForests(features=data.train_examples,
                                           attributes=data.attributes,
                                           t_value=100,
                                           size=4)
    r_forest.fit(print_status_bar=True)
    print("Tree construction complete. Calculating Data.")

    toolbar_width = 100
    if print_error_calculation_status:
        sys.stdout.write("Progress: [%s]" % (" " * toolbar_width))
        sys.stdout.flush()

    for i in range(0, len(iterates)):
        ada_boost_train_error.append(get_error(data.train_examples, ada_trees))
        ada_boost_test_error.append(get_error(data.test_examples, ada_trees))

        bagging_trees_train_error.append(
            get_error(data.train_examples, bag_trees))
        bagging_trees_test_error.append(
            get_error(data.test_examples, bag_trees))

        random_forests_train_error.append(
            get_error(data.train_examples, r_forest))
        random_forests_test_error.append(
            get_error(data.test_examples, r_forest))

        if print_error_calculation_status:
            counter = int(
                (float(np.sum(iterates[:i + 1])) / it_sum) * toolbar_width)
            sys.stdout.write('\r')
            sys.stdout.flush()
            sys.stdout.write('Progress: [%s' % ('#' * counter))
            sys.stdout.write('%s]' % (' ' * (toolbar_width - counter)))
            sys.stdout.flush()

    print("")
    plt.plot(iterates, ada_boost_train_error, label='Ada Train')
    plt.plot(iterates, ada_boost_test_error, label='Ada Test')
    plt.plot(iterates,
             bagging_trees_train_error,
             label='Bag Train',
             marker='<')
    plt.plot(iterates, bagging_trees_test_error, label='Bag Test', marker='<')
    plt.plot(iterates,
             random_forests_train_error,
             label='Forest Train',
             marker='o')
    plt.plot(iterates,
             random_forests_test_error,
             label='Forest Test',
             marker='o')
    plt.legend(loc='best')
    plt.show()

    print("Now running statistics on a fully developed decision tree.")
    id3 = Id3.Id3(metric='information_gain')
    id3.fit(data.train_examples, data.attributes, None, data.labels, 0,
            float("inf"))
    dec_tree_train_error = get_error(data.train_examples, id3)
    dec_tree_test_error = get_error(data.test_examples, id3)
    print("Train error for full decision tree is %.16f" % dec_tree_train_error)
    print("Test error for full decision tree is %.16f" % dec_tree_test_error)