def test():
    data_train = pd.read_csv('./data_set/iris_1_3.csv', header=0)
    train_data = np.array(data_train, 'float')
    X = train_data[:, :-1]
    y = train_data[:, -1]
    X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=1 / 3., random_state=6)
    d_tree = DecisionTreeClassifier(criterion='gini')
    # Sample with replacement: train the tree on the bootstrap sample and validate on the out-of-bag data
    X_subset, y_subset, out_of_bag_data = sampling_bagging(X_train, y_train)
    d_tree.fit(X_subset, y_subset)
    # Use the out-of-bag data to tune the tree
    print('y_true : ', y_true.tolist())
    pre_lab = d_tree.predict(X_test)
    print('pre_lab: ', pre_lab.tolist())
    # print('test_data\n', np.column_stack((X_test, y_true)))
    print('The accuracy was ', 100 * accuracy_score(y_true, pre_lab), '% on the test set')
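# A minimal sketch of the sampling_bagging helper used above -- an assumption,
# since its definition is not shown here. It draws a bootstrap sample (with
# replacement) and returns the rows that were never drawn as out-of-bag data.
import numpy as np

def sampling_bagging(X, y):
    n = len(y)
    idx = np.random.randint(0, n, size=n)  # indices drawn with replacement
    oob_mask = np.ones(n, dtype=bool)
    oob_mask[idx] = False                  # rows never drawn are out-of-bag
    out_of_bag_data = np.column_stack((X[oob_mask], y[oob_mask]))
    return X[idx], y[idx], out_of_bag_data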
def compare_performance(trees_num, max_depth, bootstrap):
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)

    # Decision tree
    print('##### Decision tree performance #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    dt_lr_start = time.time()  # record training start time
    decision_tree.fit(X_train, y_train)
    dt_lr_time = time.time() - dt_lr_start  # training time
    dt_est_start = time.time()  # record inference start time
    y_est = decision_tree.predict(X_test)
    dt_est_time = time.time() - dt_est_start  # inference time
    print('Training time : {:.6f} [sec]  Inference time : {:.6f} [sec]'.format(dt_lr_time, dt_est_time))
    dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
    dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}  test accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

    # Random forest
    print('##### Random forest performance #####')
    random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    rf_lr_start = time.time()  # record training start time
    random_forest.fit(X_train, y_train)
    rf_lr_time = time.time() - rf_lr_start  # training time
    rf_est_start = time.time()  # record inference start time
    y_est = random_forest.predict(X_test)
    rf_est_time = time.time() - rf_est_start  # inference time
    print('Training time : {:.6f} [sec]  Inference time : {:.6f} [sec]'.format(rf_lr_time, rf_est_time))
    rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
    rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}  test accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))
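# The custom classifiers above expose an accuracy_score(X, y) method (unlike
# sklearn's score()). A minimal sketch of what such a method might look like --
# an assumption, since the class bodies are not shown here:
import numpy as np

def accuracy_score(self, X, y):
    y_pred = self.predict(X)
    return np.mean(y_pred == y)  # fraction of correctly classified samples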
def main():
    args = get_cmd_ln_arguments()
    num_columns, num_rows, training_data = parse_txt_file(args.filename)
    feature_names = get_feature_names(num_columns)
    decision_tree = DecisionTreeClassifier()
    print('Training a decision tree classifier...')
    decision_tree.fit(feature_names, training_data)
    print('Decision tree classifier trained:')
    decision_tree.print()
    print()
    print('Entering a loop to query the decision tree. Press ctrl-c at any time to exit.')
    while True:
        sample = input('Enter a sample ({} numbers separated by a space): '.format(num_columns))
        try:
            sample = line_to_int_list(sample)
        except ValueError:
            print('Input was not {} numbers separated by a space. Please try again.'.format(num_columns))
            continue
        prediction = decision_tree.predict(sample)
        print('Prediction: {}'.format(prediction))
def test_fit(self):
    clf = DecisionTreeClassifier()
    clf.fit(self.X_train, self.y_train, self.feature_names)

    # verify the decision tree looks like this
    #
    #               feature:
    #               outlook
    #              /   |   \
    #             /    |    \
    #      rainy /  overcast \ sunny
    #           /      |      \
    #          /       |       \
    #     feature:   class:   feature:
    #      windy      True    humidity
    #      /   \               /    \
    # False/    \True     high/      \normal
    #     /      \           /        \
    #  class:   class:    class:    class:
    #   True    False     False     True

    assert clf.root.feature == 'outlook'
    rainy_node = clf.root.children_by_attribute['rainy']
    overcast_node = clf.root.children_by_attribute['overcast']
    sunny_node = clf.root.children_by_attribute['sunny']
    assert rainy_node.feature == 'windy'
    assert overcast_node.classification == True
    assert sunny_node.feature == 'humidity'
    assert rainy_node.children_by_attribute['False'].classification == True
    assert rainy_node.children_by_attribute['True'].classification == False
    assert sunny_node.children_by_attribute['high'].classification == False
    assert sunny_node.children_by_attribute['normal'].classification == True
def test_benchmark_numerical():
    print('\n** Numerical benchmark **')
    df = pd.read_csv(DATA_PATH + 'dados_benchmark_v2.csv', sep=';')
    dt = DecisionTreeClassifier(
        target_attribute='Joga',
        n_random_attributes=4
    )
    dt.fit(df)
    dt.print_tree()
def main():
    curr_dir = os.path.dirname(__file__)
    csv_file = os.path.join(curr_dir, 'data/play.csv')
    df = pd.read_csv(csv_file, index_col='Dia')
    X, y = df.loc[:, df.columns != 'Jogar'], df['Jogar']
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    print(clf.rules())
def test_decision_tree_classifier_numerical_split(self):
    model = DecisionTreeClassifier()
    # feature[0] of 3 or 1 -> label 1
    # feature[0] of 2 or 0 -> label 0
    features = np.array([[3, 0], [3, 0], [3, 0], [2, 0], [2, 0], [2, 0],
                         [1, 0], [1, 0], [1, 0], [0, 0], [0, 0]])
    labels = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0])
    model.fit(features, labels)
def __generate_trees(self, df, n_baggings):
    '''
    Given 'n' baggings, generate one tree model for each of them
    '''
    # iterate over the bagging index sets by position (iterating over the
    # collection itself would yield the index sets, not usable keys)
    for i in range(len(n_baggings)):
        clf = DecisionTreeClassifier(
            self.target_attribute,
            self.n_random_attributes
        )
        clf.fit(df.iloc[n_baggings[i], :])
        self.trees[i] = clf
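# A hypothetical sketch of how the n_baggings argument could be built -- a list
# of row-index arrays, one bootstrap sample per tree. This helper is an
# assumption, not part of the original code.
import numpy as np

def generate_baggings(df, n_trees):
    n_rows = len(df)
    return [np.random.randint(0, n_rows, size=n_rows)  # indices drawn with replacement
            for _ in range(n_trees)]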
def test_decision_tree_classifier_exhaustive_categorical_split(self):
    model = DecisionTreeClassifier(categorical_feature_indeces=[0, 1])
    # feature[0] of 3 or 1 -> label 1
    # feature[0] of 2 or 0 -> label 0 or 2
    features = np.array([[3, 0], [3, 0], [3, 0], [2, 0], [2, 0], [2, 0],
                         [1, 0], [1, 0], [1, 0], [0, 0], [0, 1]])
    labels = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2])
    model.fit(features, labels)
def compare_depth():
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)

    # Find the best hyperparameters for the random forest
    trees_num, bootstrap = grid_search_RF()

    # Vary the depth limit of the decision trees and measure accuracy
    depth_list = [i for i in range(21)]  # examine depth limits from 0 to 20
    dt_train_acc_list = []
    dt_test_acc_list = []
    rf_train_acc_list = []
    rf_test_acc_list = []
    for depth in tqdm(depth_list):
        print('***** max_depth = {} *****'.format(depth))
        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        decision_tree.fit(X_train, y_train)
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        # Append the accuracies to the lists
        dt_train_acc_list.append(dt_train_accuracy)
        dt_test_acc_list.append(dt_test_accuracy)
        print('Decision tree  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))
        # Random forest
        random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=depth, bootstrap=bootstrap)
        random_forest.fit(X_train, y_train)
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        # Append the accuracies to the lists
        rf_train_acc_list.append(rf_train_accuracy)
        rf_test_acc_list.append(rf_test_accuracy)
        print('Random forest  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

    # Draw the graph
    plt.plot(depth_list, dt_train_acc_list, label='Decision Tree - train accuracy', color='r')
    plt.plot(depth_list, dt_test_acc_list, label='Decision Tree - test accuracy', color='g')
    plt.plot(depth_list, rf_train_acc_list, label='Random Forest - train accuracy', color='y')
    plt.plot(depth_list, rf_test_acc_list, label='Random Forest - test accuracy', color='b')
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.xlim(0, 20)
    plt.xticks(np.arange(0, 21, 2))
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.title('Max Depth of Decision Trees and Accuracy')
    # Save the graph
    plt.savefig('figures/mnist/max_depth_&_accuracy.png')
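# A hypothetical sketch of the grid_search_RF helper called above -- an
# assumption, since its definition is not shown. It tries a small grid of
# trees_num / bootstrap values (the candidate values here are made up) and
# returns the pair with the best test accuracy.
from sklearn import datasets
from sklearn.model_selection import train_test_split

def grid_search_RF():
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)
    best = (None, None)
    best_acc = -1.0
    for trees_num in [5, 10, 20]:          # candidate forest sizes (assumed grid)
        for bootstrap in [0.5, 0.8, 1.0]:  # candidate bootstrap ratios (assumed grid)
            rf = RandomForestClassifier(trees_num=trees_num, max_depth=None, bootstrap=bootstrap)
            rf.fit(X_train, y_train)
            acc = rf.accuracy_score(X_test, y_test)
            if acc > best_acc:
                best_acc = acc
                best = (trees_num, bootstrap)
    return best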
def visualize(model, max_depth=None):
    iris_dataset = datasets.load_iris()
    petal_features = iris_dataset['data'][:, 2:]
    targets = iris_dataset['target']

    if max_depth is None:
        # Do not limit the maximum depth of the trees.
        # For the iris dataset the sample and class counts are small, so
        # training stays fast even without a depth limit.
        if model == 'decision_tree':
            clf = DecisionTreeClassifier()
        else:
            clf = RandomForestClassifier()
    else:
        if model == 'decision_tree':
            clf = DecisionTreeClassifier(max_depth=max_depth)
        else:
            clf = RandomForestClassifier(max_depth=max_depth)
    clf.fit(petal_features, targets)

    # Compute the data range +-1
    x_min = max(0, petal_features[:, 0].min() - 1)
    y_min = max(0, petal_features[:, 1].min() - 1)
    x_max = petal_features[:, 0].max() + 1
    y_max = petal_features[:, 1].max() + 1

    # Build mesh coordinates over the range of the training data
    grid_interval = 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_interval),
                         np.arange(y_min, y_max, grid_interval))

    # Classify every mesh point with the trained model
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Plot the classification result for each point as a decision region
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Also plot the data points
    for c in np.unique(targets):
        plt.scatter(petal_features[targets == c, 0],
                    petal_features[targets == c, 1])
    feature_names = iris_dataset['feature_names']
    plt.xlabel(feature_names[2])
    plt.ylabel(feature_names[3])
    if max_depth is None:
        plt.title('Max Depth : No Limitation')
        plt.savefig('figures/iris/{}_no_limit.png'.format(model))
    else:
        plt.title('Max Depth : ' + str(max_depth))
        plt.savefig('figures/iris/{}_depth_{}.png'.format(model, max_depth))
    plt.close()
def test_predict(self):
    clf = DecisionTreeClassifier()
    clf.fit(self.X_train, self.y_train, self.feature_names)
    expected_for_x = [
        (np.array(['sunny', 'hot', 'high', False]), False),    # sunny outlook + high humidity -> don't play
        (np.array(['sunny', 'hot', 'normal', False]), True),   # sunny outlook + normal humidity -> play
        (np.array(['overcast', 'hot', 'high', False]), True),  # overcast outlook -> play
    ]
    for x, expected in expected_for_x:
        output = clf.predict(x)
        assert output == expected
def fit(self, X, y):
    self.forest = []
    N = len(y)
    N_sub_data = int(N * self.bootstrap)
    for i in range(self.trees_num):
        self.shuffle(X, y)
        X_sub = X[:N_sub_data]
        y_sub = y[:N_sub_data]
        decision_tree = DecisionTreeClassifier(self.features_num, self.max_depth)
        decision_tree.fit(X_sub, y_sub)
        # Append the trained decision tree to the forest
        self.forest.append(decision_tree)
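# A minimal sketch of the predict() that would pair with the fit() above,
# taking a majority vote across the trees in self.forest. This is an
# assumption about the class, not the original implementation.
import numpy as np

def predict(self, X):
    # collect each tree's predictions: shape (trees_num, n_samples)
    votes = np.array([tree.predict(X) for tree in self.forest])
    # majority vote per sample
    return np.array([np.bincount(votes[:, j].astype(int)).argmax()
                     for j in range(votes.shape[1])])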
def test_decision_tree_classifier_numerical_split_hard(self):
    model = DecisionTreeClassifier()
    # even feature[0] values -> label 1
    # odd feature[0] values -> label 0
    features = np.array([[0, 0], [0, 0], [1, 0], [1, 0], [2, 0], [2, 0],
                         [3, 0], [3, 0], [4, 0], [4, 0], [5, 0], [5, 0],
                         [6, 0], [6, 0], [7, 0], [7, 0], [8, 0], [8, 0],
                         [9, 0], [9, 0], [10, 0], [10, 0], [11, 0], [11, 0]])
    labels = np.array([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
                       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0])
    model.fit(features, labels)
    """
    print("test_decision_tree_classifier_numerical_split_hard")
    from pprint import pprint
    pprint(model._node)
    """
    predictions = model.predict(
        np.array([[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0],
                  [6, 0], [7, 0], [8, 0], [9, 0], [10, 0], [11, 0]]))
    self.assertEqual(predictions.tolist(),
                     [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
def main():
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)

    # Examine depth limits of 1 to 3 as well as the unlimited case
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))
        # Measure accuracy, training time, inference time, and generalization
        # when all features are used

        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        dt_lr_start = time.time()  # record training start time
        decision_tree.fit(X_train, y_train)
        dt_lr_time = time.time() - dt_lr_start  # training time
        dt_est_start = time.time()  # record inference start time
        y_est = decision_tree.predict(X_test)
        dt_est_time = time.time() - dt_est_start  # inference time
        print('Decision tree  training time : {:.6f} [sec]  inference time : {:.6f} [sec]'.format(dt_lr_time, dt_est_time))
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        print('Decision tree  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

        # Random forest
        random_forest = RandomForestClassifier(max_depth=depth)
        rf_lr_start = time.time()  # record training start time
        random_forest.fit(X_train, y_train)
        rf_lr_time = time.time() - rf_lr_start  # training time
        rf_est_start = time.time()  # record inference start time
        y_est = random_forest.predict(X_test)
        rf_est_time = time.time() - rf_est_start  # inference time
        print('Random forest  training time : {:.6f} [sec]  inference time : {:.6f} [sec]'.format(rf_lr_time, rf_est_time))
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        print('Random forest  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

        # Restrict to two features (petal length, petal width) and visualize in 2D
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
def _compare_sklearn_dataset(self, dataset):
    # use the dataset passed in by the caller
    features = dataset.data
    labels = dataset.target

    model_sklearn = DecisionTreeClassifierSklearn()
    model_sklearn.fit(features, labels)
    predictions_sklearn = model_sklearn.predict(features)

    model = DecisionTreeClassifier()
    model.fit(features, labels)
    predictions = model.predict(features)

    self.assertEqual(predictions.tolist(), predictions_sklearn.tolist())
def test_decision_tree_classifier_fit(self):
    model = DecisionTreeClassifier()
    # XOR problem
    features = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 1], [1, 0],
                         [1, 0], [1, 0], [1, 1], [1, 1], [1, 1]])
    labels = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0])
    model.fit(features, labels)
    """
    from pprint import pprint
    pprint(model._node)
    """
    predictions = model.predict(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))
    self.assertEqual(predictions.tolist(), [0, 1, 1, 0])
def fit(self, X, y):
    # print('===', sys._getframe().f_code.co_filename, sys._getframe().f_code.co_name, sys._getframe().f_lineno, "===")
    self.forest = []
    self.n_calss_num = len(set(y))  # number of classes
    self.n_calss = list(set(y))     # set of class labels
    for i in range(self.n_estimators):
        # Sample randomly with replacement; self.bootstrap is the fraction of
        # the training set drawn for each tree
        X_subset, y_subset = sampling_with_reset(X, y, self.bootstrap)
        tree = DecisionTreeClassifier(self.max_features, self.criterion,
                                      self.max_depth, self.min_samples_split,
                                      self.min_impurity_split)
        # Print the tree's label
        print('tree_' + str(i))
        tree.fit(X_subset, y_subset)
        self.forest.append(tree)  # collection of trees
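# A minimal sketch of the sampling_with_reset helper used above -- an
# assumption, since its definition is not shown. It draws a fraction
# `bootstrap` of the rows with replacement.
import numpy as np

def sampling_with_reset(X, y, bootstrap):
    n = int(len(y) * bootstrap)
    idx = np.random.randint(0, len(y), size=n)  # indices drawn with replacement
    return X[idx], y[idx]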
from sklearn import datasets
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in modern sklearn
from decision_tree import DecisionTreeClassifier

iris = datasets.load_iris()
X, Y = iris.data, iris.target

clf = DecisionTreeClassifier()
clf.fit(X, Y)
print(cross_val_score(clf, X, Y))
clf.draw_tree('decision_tree_example.png')
from sklearn.datasets import load_iris
from decision_tree import DecisionTreeClassifier
from sklearn import tree

# load the iris dataset
dataset = load_iris()

# set X and y variables
X, y = dataset.data, dataset.target
print(':::::::::::::::::::::::::::::::::::::::::::::')
print(f'APPROPRIATE X, y DATATYPES: {type(X)}')
print(':::::::::::::::::::::::::::::::::::::::::::::')

# create a new instance of the DecisionTreeClassifier object
clf = DecisionTreeClassifier(max_depth=5)

# call the fit method on that object
clf.fit(X, y)

print('')
print(':::::::::::::PREDICTIONS:::::::::::::::::::::')
print('')
print(':::::::::::::::::::::::::::::::::::::::::::::')
inputs = [[1, 1.5, 5, 1.5]]
print(f'INPUTS: {inputs}')
print(f'OUR MODEL PREDICTION: {clf.predict(inputs)}')

clf2 = tree.DecisionTreeClassifier(max_depth=5)
clf2.fit(X, y)
print(f'SCIKITLEARN MODEL PREDICTION: {clf2.predict(inputs)}')
print(':::::::::::::::::::::::::::::::::::::::::::::')
        image_list = dorsal_images
        image_list.extend(palmar_images)
        random.shuffle(image_list)
        X = []
        y = [0] * len(image_list)
        for i in range(0, len(image_list)):
            image = image_list[i]
            if image in dorsal_features:
                y[i] = 0
            else:
                y[i] = 1
            X.append(label_folder_features[image])
        X = np.array(X)
        y = np.array(y)
        decisiontree.fit(X, y)
        for image_id, feature in unlabelled_features.items():
            val = decisiontree.predict([feature])
            # print(image_id, val)
            if val[0] == 0:
                result[image_id] = 'dorsal'
            else:
                result[image_id] = 'palmar'
    elif classifier == 'SVM':
        dorsal_images = list(dorsal_features.keys())
        palmar_images = list(palmar_features.keys())
        image_list = dorsal_images
        image_list.extend(palmar_images)
        random.shuffle(image_list)
        X = []