Example #1
def test_forest():

    # load the data
    train_set = pd.read_csv('./data_set/seeds.csv')
    data_set = np.array(train_set)

    X = data_set[:, :-1]
    y = data_set[:, -1]

    train_X, test_X, train_y, y_true = train_test_split(X,
                                                        y,
                                                        test_size=1 / 3.,
                                                        random_state=7)
    # build the model
    rf_model = RandomForestClassifier(n_estimators=3,
                                      criterion='gini',
                                      max_features='sqrt',
                                      max_depth=20)

    rf_model.fit(train_X, train_y)  # build the ensemble of decision trees

    print('rf_model.predict...begin...')
    pre_result = rf_model.predict(test_X)
    print('Predicted labels for the test data:')
    print(pre_result)
    print('True labels:')
    print(y_true)
    print('Prediction accuracy on the test data:')
    print(accuracy_score(y_true, pre_result))
Example #2
def grid_search_RF():
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3,  random_state=0)

    trees_num_list = [16, 32, 64, 128]  # candidate numbers of decision trees in the random forest
    bootstrap_list = [0.1, 0.3, 0.5, 0.7, 0.9]   # candidate ratios of the bootstrap sample size to the original data size

    best_acc = 0
    best_trees_num = None
    best_bootstrap = None
    with tqdm(total=len(trees_num_list)*len(bootstrap_list), desc='Progress') as pbar:
        for trees_num in trees_num_list:
            for bootstrap in bootstrap_list:
                random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=5, bootstrap=bootstrap)
                random_forest.fit(X_train, y_train)
                acc = random_forest.accuracy_score(X_test, y_test)
                if acc > best_acc:
                    best_acc = acc
                    best_trees_num = trees_num
                    best_bootstrap = bootstrap
                pbar.update(1)
    
    print('best acc : {:.4f}    best trees_num : {}     best bootstrap : {}'.format(best_acc, best_trees_num, best_bootstrap))

    return best_trees_num, best_bootstrap
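
For comparison, the same sweep can be written with scikit-learn's GridSearchCV; a minimal sketch, assuming scikit-learn's RandomForestClassifier (its n_estimators and max_samples parameters roughly play the roles of the custom trees_num and bootstrap arguments above):

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

def grid_search_RF_sklearn():
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    param_grid = {
        'n_estimators': [16, 32, 64, 128],          # number of trees
        'max_samples': [0.1, 0.3, 0.5, 0.7, 0.9],   # bootstrap sample fraction
    }
    search = GridSearchCV(RandomForestClassifier(max_depth=5), param_grid, cv=3)
    search.fit(X_train, y_train)
    print('best params : {}    test accuracy : {:.4f}'.format(
        search.best_params_, search.score(X_test, y_test)))
    return search.best_params_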
Example #3
def compare_performance(trees_num, max_depth, bootstrap):

    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3,  random_state=0)

    # decision tree
    print('##### Decision tree performance #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    dt_lr_start = time.time()  # record the training start time
    decision_tree.fit(X_train, y_train)
    dt_lr_time = time.time() - dt_lr_start  # training time
    dt_est_start = time.time()  # record the inference start time
    y_est = decision_tree.predict(X_test)
    dt_est_time = time.time() - dt_est_start  # inference time
    print('training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(dt_lr_time, dt_est_time))
    dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
    dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

    # random forest
    print('##### Random forest performance #####')
    random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    rf_lr_start = time.time()  # record the training start time
    random_forest.fit(X_train, y_train)
    rf_lr_time = time.time() - rf_lr_start  # training time
    rf_est_start = time.time()  # record the inference start time
    y_est = random_forest.predict(X_test)
    rf_est_time = time.time() - rf_est_start  # inference time
    print('training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(rf_lr_time, rf_est_time))
    rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
    rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))
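
The start/stop timing pattern above is repeated verbatim for both models; a small helper (hypothetical, not part of the original code) can factor it out, using time.perf_counter, which is better suited than time.time for measuring short intervals:

import time

def timed(fn, *args):
    # run fn(*args) and return (result, elapsed seconds)
    start = time.perf_counter()
    result = fn(*args)
    return result, time.perf_counter() - start

# usage sketch:
#   _, rf_lr_time = timed(random_forest.fit, X_train, y_train)
#   y_est, rf_est_time = timed(random_forest.predict, X_test)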
Example #4
def test_rf_classification():
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    print(X.shape, y.shape)
    train_X, train_y, test_X, test_y = split_train_test(X, y)
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(train_X, train_y)
    preds = clf.predict(test_X)
    accuracy = cal_accuracy(test_y, preds)
    print('accuracy:', accuracy)
Example #5
def compare_depth():

    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3,  random_state=0)

    # find the best hyperparameters for the random forest
    trees_num, bootstrap = grid_search_RF()

    # vary the decision tree depth limit and compare accuracy
    depth_list = list(range(21))  # try depth limits from 0 to 20

    dt_train_acc_list = []
    dt_test_acc_list = []
    rf_train_acc_list = []
    rf_test_acc_list = []

    for depth in tqdm(depth_list):
        print('***** max_depth = {} *****'.format(depth))
        # decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        decision_tree.fit(X_train, y_train)
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        # append the accuracies to the lists
        dt_train_acc_list.append(dt_train_accuracy)
        dt_test_acc_list.append(dt_test_accuracy)
        print('Decision tree       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

        # random forest
        random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=depth, bootstrap=bootstrap)
        random_forest.fit(X_train, y_train)
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        # append the accuracies to the lists
        rf_train_acc_list.append(rf_train_accuracy)
        rf_test_acc_list.append(rf_test_accuracy)
        print('Random forest       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

    # draw the graph
    plt.plot(depth_list, dt_train_acc_list, label='Decision Tree - train accuracy', color='r')
    plt.plot(depth_list, dt_test_acc_list, label='Decision Tree - test accuracy', color='g')
    plt.plot(depth_list, rf_train_acc_list, label='Random Forest - train accuracy', color='y')
    plt.plot(depth_list, rf_test_acc_list, label='Random Forest - test accuracy', color='b')

    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.xlim(0, 20)
    plt.xticks(np.arange(0, 21, 2))
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.title('Max Depth of Decision Trees and Accuracy')
    # save the figure (assumes the figures/mnist directory already exists)
    plt.savefig('figures/mnist/max_depth_&_accuracy.png')
Example #6
def train_and_predict(x_train, y_train, x_test, x_val, y_val):
    """ Interface to train and test the new/improved decision tree.
    
    This function is an interface for training and testing the new/improved
    decision tree classifier. 

    x_train and y_train should be used to train the classifier, while
    x_test should be used to test it.
    x_val and y_val may optionally be used as the validation dataset;
    they can be ignored if no validation dataset is needed.

    Args:
    x_train (numpy.ndarray): Training instances, numpy array of shape (N, K) 
                       N is the number of instances
                       K is the number of attributes
    y_train (numpy.ndarray): Class labels, numpy array of shape (N, )
                       Each element in y_train is a str
    x_test (numpy.ndarray): Test instances, numpy array of shape (M, K) 
                            M is the number of test instances
                            K is the number of attributes
    x_val (numpy.ndarray): Validation instances, numpy array of shape (L, K) 
                       L is the number of validation instances
                       K is the number of attributes
    y_val (numpy.ndarray): Class labels of validation set, numpy array of shape (L, )
    """

    #######################################################################
    #                 ** TASK 4.2: COMPLETE THIS FUNCTION **
    #######################################################################
    # train the new classifier
    forest = RandomForestClassifier()
    # Forest is trained on the best hyperparameter set
    forest.update_hyperparameters(feature_sel=True,
                                  cross_val=False,
                                  max_tree_depth=13,
                                  min_sample_size=2,
                                  num_trees=20)

    forest.fit(x_train, y_train)

    # make predictions on x_test using the trained forest
    predictions = forest.predict(x_test)

    # return the (M, ) numpy array of predicted labels
    return predictions
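
A minimal usage sketch for the interface above; the toy arrays are hypothetical and only illustrate the documented shapes: (N, K) training instances, (N,) str labels, and an (M, K) test set:

import numpy as np

# hypothetical toy data matching the documented shapes
x_train = np.array([[5.1, 3.5], [4.9, 3.0], [6.2, 3.4], [5.9, 3.0]])
y_train = np.array(['A', 'A', 'B', 'B'])
x_test = np.array([[5.0, 3.4], [6.0, 3.1]])

# the validation set is optional, so the training data is reused as a placeholder
predictions = train_and_predict(x_train, y_train, x_test, x_train, y_train)
print(predictions)  # numpy array of str labels, shape (M,)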
Example #7
def main():

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'],
                                                        dataset['target'],
                                                        test_size=0.3,
                                                        random_state=0)

    # examine depth limits of 1 to 3, plus the unrestricted case
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))
        # measure accuracy, training/inference time, and generalization using all features
        # decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        dt_lr_start = time.time()  # record the training start time
        decision_tree.fit(X_train, y_train)
        dt_lr_time = time.time() - dt_lr_start  # training time
        dt_est_start = time.time()  # record the inference start time
        y_est = decision_tree.predict(X_test)
        dt_est_time = time.time() - dt_est_start  # inference time
        print('Decision tree       training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(
            dt_lr_time, dt_est_time))
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        print('Decision tree       train accuracy : {:.4f}     test_accuracy : {:.4f}'.
              format(dt_train_accuracy, dt_test_accuracy))

        # random forest
        random_forest = RandomForestClassifier(max_depth=depth)
        rf_lr_start = time.time()  # record the training start time
        random_forest.fit(X_train, y_train)
        rf_lr_time = time.time() - rf_lr_start  # training time
        rf_est_start = time.time()  # record the inference start time
        y_est = random_forest.predict(X_test)
        rf_est_time = time.time() - rf_est_start  # inference time
        print('Random forest       training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.
              format(rf_lr_time, rf_est_time))
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        print(
            'Random forest       train accuracy : {:.4f}     test_accuracy : {:.4f}'
            .format(rf_train_accuracy, rf_test_accuracy))

        # restrict to two features (petal length, petal width) and visualize in 2D
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
Example #8
    def random_forest_classification(self, train_data):

        train_X = train_data[:, :-1]
        train_y = train_data[:, -1]

        print("train_X ......type.....")
        print(type(train_X))

        self.k_class_ = list(set(train_y))  #

        rf_model = RandomForestClassifier(n_estimators=10,
                                          criterion='gini',
                                          max_features='sqrt',
                                          max_depth=20)

        rf_model.fit(train_X, train_y)
        # save the model
        self.rf_model = rf_model
        save_model_rf_model(rf_model)
Example #9
    def test_random_forest_fit_predict(self):
        model = RandomForestClassifier(n_estimators=100)

        # XOR-labelled data: an 11-sample pattern tiled 6 times (66 samples in total)
        features = np.tile(np.array([
            [0, 0],
            [0, 0],
            [0, 1],
            [0, 1],
            [0, 1],
            [1, 0],
            [1, 0],
            [1, 0],
            [1, 1],
            [1, 1],
            [1, 1],
        ]), (6, 1))
        labels = np.tile(np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]), 6)

        model.fit(features, labels)
        """
        for tree in model._models:
            print("=================================")
            from pprint import pprint
            pprint(tree._node)

        for tree in model._models:
            print(tree.predict(np.array([[0, 0], [0, 1], [1, 0], [1, 1]])))
        """

        predictions = model.predict(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))

        self.assertEqual(predictions.tolist(), [0, 1, 1, 0])
Example #10
print(f"10 tree Random Forest Validation Accuracy, ",
      f"with feature selection and sampling: {acc_val_4}")

results = np.asarray([acc_val_1, acc_val_2, acc_val_3, acc_val_4])
print(f"Best result was achieved with setup {np.argmax(results) + 1}")

"""
# 4.2.4 starts here
# Trees start following the best tree model from improvement 1
forest.update_hyperparameters(feature_sel=True,
                              cross_val=False,
                              max_tree_depth=13,
                              min_sample_size=3,
                              num_trees=10)

forest.fit(x_train, y_train)
pred_val = forest.predict(x_val)
acc_val_5 = metrics.accuracy(pred_val, y_val)

print(f"10 Best Tree Random Forest Validation Accuracy, ",
      f"with feature selection and sampling: {acc_val_5}")

# start by tuning the trees used
param_space = {
    "max_tree_depth": [x for x in range(13, 15)],
    "min_sample_size": [y for y in range(2, 4)],
    "num_trees": [10, 20]
}
best_param = metrics.grid_search(forest,
                                 x_train,
                                 y_train,