示例#1
0
def compare_performance(trees_num, max_depth, bootstrap):
    """Benchmark a decision tree against a random forest on the digits dataset.

    Loads the digits dataset, holds out 30% of it as a test split
    (``random_state=0``) and, for each model, prints the training time, the
    inference time on the test split, and the train/test accuracy.

    Args:
        trees_num: Forwarded to ``RandomForestClassifier(trees_num=...)``.
        max_depth: Forwarded as ``max_depth`` to both classifiers.
        bootstrap: Forwarded to ``RandomForestClassifier(bootstrap=...)``.

    Returns:
        None. All results are printed.
    """
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    def report(model):
        # One shared routine so both models are timed and scored identically.
        # perf_counter() is monotonic and high-resolution, which makes it the
        # appropriate clock for measuring short intervals.
        fit_start = time.perf_counter()
        model.fit(X_train, y_train)
        fit_time = time.perf_counter() - fit_start

        predict_start = time.perf_counter()
        model.predict(X_test)  # result discarded: only the latency matters here
        predict_time = time.perf_counter() - predict_start

        print('学習時間 : {:.6f} [sec]     推論時間 : {:.6f} [sec]'.format(fit_time, predict_time))
        train_accuracy = model.accuracy_score(X_train, y_train)
        test_accuracy = model.accuracy_score(X_test, y_test)
        print('train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(train_accuracy, test_accuracy))

    # Decision tree
    print('##### 決定木の性能  #####')
    report(DecisionTreeClassifier(max_depth=max_depth))

    # Random forest
    print('##### ランダムフォレストの性能 #####')
    report(RandomForestClassifier(trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap))
示例#2
0
def main():
    """Train a decision tree from a text file, then answer queries interactively.

    Reads the file named by the command-line arguments, fits a
    ``DecisionTreeClassifier`` on it, prints the fitted tree, and then
    repeatedly prompts for a sample and prints the prediction for it.

    The loop ends cleanly when the user presses ctrl-c (as the on-screen
    instructions promise) or when standard input is closed.
    """
    args = get_cmd_ln_arguments()

    # The row count returned by the parser is not needed here.
    num_columns, _num_rows, training_data = parse_txt_file(args.filename)
    feature_names = get_feature_names(num_columns)

    decision_tree = DecisionTreeClassifier()
    print('Training a decision tree classifier...')
    decision_tree.fit(feature_names, training_data)
    print('Decision tree classifier trained:')
    decision_tree.print()

    print()
    print(
        'Entering a loop to query the decision tree. Press ctrl-c at anytime to exit.'
    )

    try:
        while True:
            sample = input(
                'Enter a sample ({} numbers separated by a space): '.format(
                    num_columns))
            try:
                sample = line_to_int_list(sample)
            except ValueError:
                print(
                    'Input was not {} numbers separated by a space. Please try again. '
                    .format(num_columns))
                continue
            # NOTE(review): only parse errors are rejected above; the number of
            # values is not checked against num_columns -- confirm predict()
            # copes with a wrong-length sample.
            prediction = decision_tree.predict(sample)
            print('Prediction: {}'.format(prediction))
    except (KeyboardInterrupt, EOFError):
        # ctrl-c is the advertised way out and EOF means no more input can
        # arrive; finish the prompt line and return instead of dumping a
        # traceback.
        print()
示例#3
0
def test():
    """Fit a gini decision tree on a bagged sample and print its test accuracy.

    Reads ``./data_set/iris_1_3.csv``, splits off one third of the rows as a
    test set (``random_state=6``), draws a training subset with
    ``sampling_bagging``, fits the tree on that subset, and prints the true
    labels, the predicted labels and the accuracy as a percentage.
    """
    frame = pd.read_csv('./data_set/iris_1_3.csv', header=0)
    samples = np.array(frame, dtype='float')

    # The last column is split off as the target; the rest are the features.
    features, labels = samples[:, :-1], samples[:, -1]

    X_train, X_test, y_train, y_true = train_test_split(
        features, labels, test_size=1 / 3., random_state=6)

    classifier = DecisionTreeClassifier(criterion='gini')

    # Sample with replacement: the drawn rows train the tree, the out-of-bag
    # rows are meant for validation (they are returned but not used below).
    X_subset, y_subset, out_of_bag_data = sampling_bagging(X_train, y_train)
    classifier.fit(X_subset, y_subset)

    # Placeholder: tuning the tree with the out-of-bag data would go here.

    print('y_true : ', y_true.tolist())
    predicted = classifier.predict(X_test)
    print('pre_lab: ', predicted.tolist())
    # print('test_data\n', np.column_stack((X_test, y_true)))
    print('The accuracy was ', 100 * accuracy_score(y_true, predicted), '% on the test ')
示例#4
0
def result():
    """Classify the current request's JSON payload with a stored decision tree.

    The payload is wrapped in ``DiagnosisRequest`` and converted to an array.
    The ``DecisionTree`` record with the highest ``id`` supplies the tree used
    to build the classifier. The first predicted value is returned as
    ``{'result': ...}`` via ``jsonify``.
    """
    features = DiagnosisRequest(request.get_json()).to_np_array()

    # Descending id order, so first() yields the record with the largest id.
    # NOTE(review): the record is used without a None check -- confirm that at
    # least one tree is always stored before this handler can be reached.
    latest_record = DecisionTree.query.order_by(DecisionTree.id.desc()).first()
    classifier = DecisionTreeClassifier(initial_tree=latest_record.tree)

    labels = classifier.predict(features)
    return jsonify({'result': labels.tolist()[0]})
示例#5
0
def visualize(model, max_depth=None):
    """Plot a classifier's decision regions over two iris features and save them.

    Only columns 2 and 3 of the iris data are used. The classifier is fitted
    on all samples, evaluated on a regular grid covering the data range, and
    the resulting regions are drawn together with the samples themselves.

    Args:
        model: ``'decision_tree'`` selects ``DecisionTreeClassifier``; any
            other value selects ``RandomForestClassifier``. The value is also
            embedded in the output file name.
        max_depth: Passed to the classifier as ``max_depth`` when it is not
            None; when None the classifier is built with no arguments.

    Returns:
        None. The figure is written under ``figures/iris/`` and then closed.
    """
    iris = datasets.load_iris()
    petals = iris['data'][:, 2:]
    labels = iris['target']

    # With no explicit depth the classifier is built with its own defaults.
    # The iris dataset is small (few samples, few classes), so an unlimited
    # depth does not cost much computation time.
    classifier_cls = (DecisionTreeClassifier if model == 'decision_tree'
                      else RandomForestClassifier)
    depth_kwargs = {} if max_depth is None else {'max_depth': max_depth}
    clf = classifier_cls(**depth_kwargs)
    clf.fit(petals, labels)

    # Plot range: data range widened by 1 on each side, clamped at 0 below.
    petal_x, petal_y = petals[:, 0], petals[:, 1]
    x_min = max(0, petal_x.min() - 1)
    y_min = max(0, petal_y.min() - 1)
    x_max = petal_x.max() + 1
    y_max = petal_y.max() + 1

    # Regular grid of coordinates spanning that range.
    step = 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Classify every grid point and draw the regions as filled contours.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid_points)
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Overlay the samples, one scatter call per class.
    for class_id in np.unique(labels):
        members = labels == class_id
        plt.scatter(petal_x[members], petal_y[members])

    axis_names = iris['feature_names']
    plt.xlabel(axis_names[2])
    plt.ylabel(axis_names[3])

    if max_depth is None:
        title = 'Max Depth : No Limitation'
        out_path = 'figures/iris/{}_no_limit.png'.format(model)
    else:
        title = 'Max Depth : ' + str(max_depth)
        out_path = 'figures/iris/{}_depth_{}.png'.format(model, max_depth)
    plt.title(title)
    plt.savefig(out_path)

    plt.close()
示例#6
0
 def test_predict(self):
     """Fit on the fixture data and check three hand-picked predictions."""
     classifier = DecisionTreeClassifier()
     classifier.fit(self.X_train, self.y_train, self.feature_names)

     # Each case is (raw sample, expected label). The first value is the
     # outlook and the third the humidity; the other two columns are
     # presumably temperature and wind -- confirm against the fixture.
     cases = (
         # sunny outlook + high humidity -> don't play
         (['sunny', 'hot', 'high', False], False),
         # sunny outlook + normal humidity -> play
         (['sunny', 'hot', 'normal', False], True),
         # overcast outlook -> play
         (['overcast', 'hot', 'high', False], True),
     )
     for raw_sample, expected in cases:
         predicted = classifier.predict(np.array(raw_sample))
         assert predicted == expected
    def test_decision_tree_classifier_numerical_split_hard(self):
        """Check that the tree learns labels alternating along one numeric feature.

        The first feature takes the values 0..11 (each row appears twice) and
        the second feature is always 0. Rows with an even first feature are
        labelled 1, rows with an odd first feature are labelled 0.
        """
        model = DecisionTreeClassifier()

        # 24 training rows: [0, 0], [0, 0], [1, 0], [1, 0], ..., [11, 0], [11, 0]
        features = np.array(
            [[value, 0] for value in range(12) for _ in range(2)])
        # Matching labels: 1, 1, 0, 0 repeated for each even/odd pair of values.
        labels = np.array([1, 1, 0, 0] * 6)

        model.fit(features, labels)
        # Debugging aid: pprint(model._node) shows the fitted tree.

        queries = np.array([[value, 0] for value in range(12)])
        predictions = model.predict(queries)

        self.assertEqual(predictions.tolist(), [1, 0] * 6)
示例#8
0
def main():
    """Compare a decision tree and a random forest on iris at several depths.

    Splits the iris dataset 70/30 (``random_state=0``). For each depth limit
    in ``[1, 2, 3, None]`` it trains both classifiers on all features, prints
    their training time, inference time and train/test accuracy, and then
    calls ``visualize`` for both models with the same depth limit.
    """
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'],
                                                        dataset['target'],
                                                        test_size=0.3,
                                                        random_state=0)

    # (classifier class, timing line template, accuracy line template)
    benchmarks = (
        (DecisionTreeClassifier,
         '決定木       学習時間 : {:.6f} [sec]     推論時間 : {:.6f} [sec]',
         '決定木       train accuracy : {:.4f}     test_accuracy : {:.4f}'),
        (RandomForestClassifier,
         'ランダムフォレスト       学習時間 : {:.6f} [sec]     推論時間 : {:.6f} [sec]',
         'ランダムフォレスト       train accuracy : {:.4f}     test_accuracy : {:.4f}'),
    )

    # Depth limits of 1 to 3, plus None for the unlimited case.
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))

        # Measure speed and accuracy with all features in use.
        for model_cls, timing_fmt, accuracy_fmt in benchmarks:
            # Built inside the loop so each model is created just before it
            # is trained, as in a straight-line version of this code.
            model = model_cls(max_depth=depth)

            # perf_counter() is monotonic and high-resolution, which makes it
            # the appropriate clock for measuring short intervals.
            fit_start = time.perf_counter()
            model.fit(X_train, y_train)
            fit_time = time.perf_counter() - fit_start

            predict_start = time.perf_counter()
            model.predict(X_test)  # result discarded: only the latency matters
            predict_time = time.perf_counter() - predict_start
            print(timing_fmt.format(fit_time, predict_time))

            train_accuracy = model.accuracy_score(X_train, y_train)
            test_accuracy = model.accuracy_score(X_test, y_test)
            print(accuracy_fmt.format(train_accuracy, test_accuracy))

        # Restrict to two features (petal length, petal width) and plot in 2D.
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
    def _compare_sklearn_dataset(self, dataset):
        """Assert that the project's tree predicts the same labels as scikit-learn's.

        Both ``DecisionTreeClassifierSklearn`` and ``DecisionTreeClassifier``
        are fitted on the full dataset and asked to predict that same data;
        the two prediction lists must be equal.

        Args:
            dataset: Object exposing ``.data`` (features) and ``.target``
                (labels). When None, the iris dataset is loaded instead.
        """
        # Previously the argument was always replaced by load_iris(), so every
        # caller silently tested iris whatever it passed in. Iris is now only
        # the fallback.
        if dataset is None:
            dataset = load_iris()

        features = dataset.data
        labels = dataset.target

        model_sklearn = DecisionTreeClassifierSklearn()
        model_sklearn.fit(features, labels)
        predictions_sklearn = model_sklearn.predict(features)

        model = DecisionTreeClassifier()
        model.fit(features, labels)
        predictions = model.predict(features)

        self.assertEqual(predictions.tolist(), predictions_sklearn.tolist())
    def test_decision_tree_classifier_fit(self):
        """Check that the tree fits XOR-labelled data with repeated rows."""
        model = DecisionTreeClassifier()

        # XOR problem: the label is 1 exactly when the two binary features
        # differ. [0, 0] appears twice in the training data, every other
        # corner three times.
        corners = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
        xor_labels = [0, 1, 1, 0]
        repeats = [2, 3, 3, 3]
        features = np.repeat(corners, repeats, axis=0)
        labels = np.repeat(xor_labels, repeats)

        model.fit(features, labels)
        # Debugging aid: pprint(model._node) shows the fitted tree.

        predictions = model.predict(corners)

        self.assertEqual(predictions.tolist(), xor_labels)
示例#11
0
        random.shuffle(image_list)
        X = []
        y = [0] * len(image_list)

        for i in range(0, len(image_list)):
            image = image_list[i]
            if image in dorsal_features:
                y[i] = 0
            else:
                y[i] = 1
            X.append(label_folder_features[image])
        X = np.array(X)
        y = np.array(y)
        decisiontree.fit(X, y)
        for image_id, feature in unlabelled_features.items():
            val = decisiontree.predict([feature])
            # print(image_id, val)
            if val[0] == 0:
                result[image_id] = 'dorsal'
            else:
                result[image_id] = 'palmar'

    elif classifier == 'SVM':
        dorsal_images = list(dorsal_features.keys())
        palmar_images = list(palmar_features.keys())
        image_list = dorsal_images
        image_list.extend(palmar_images)
        random.shuffle(image_list)
        X = []
        y = [0] * len(image_list)