def _benchmark_classifier(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` and print its training/inference time and accuracy.

    Args:
        clf: Classifier exposing ``fit``, ``predict`` and ``accuracy_score``.
        X_train: Training samples passed to ``fit``.
        X_test: Held-out samples passed to ``predict``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
    """
    # perf_counter() is monotonic and high resolution, so short intervals are
    # neither rounded to zero nor skewed by system clock adjustments.
    fit_start = time.perf_counter()
    clf.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    clf.predict(X_test)  # only the elapsed time matters; predictions are discarded
    predict_time = time.perf_counter() - predict_start
    print('学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]'.format(
        fit_time, predict_time))

    train_accuracy = clf.accuracy_score(X_train, y_train)
    test_accuracy = clf.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f} test_accuracy : {:.4f}'.format(
        train_accuracy, test_accuracy))


def compare_performance(trees_num, max_depth, bootstrap):
    """Compare a decision tree and a random forest on the digits dataset.

    The dataset returned by ``datasets.load_digits()`` is split 70/30 with a
    fixed ``random_state`` of 0. Each model is then fitted, timed and scored,
    and the results are printed to stdout.

    Args:
        trees_num: Forwarded to ``RandomForestClassifier(trees_num=...)``.
        max_depth: Forwarded to both classifiers as ``max_depth``.
        bootstrap: Forwarded to ``RandomForestClassifier(bootstrap=...)``.
    """
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # Decision tree
    print('##### 決定木の性能 #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    _benchmark_classifier(decision_tree, X_train, X_test, y_train, y_test)

    # Random forest
    print('##### ランダムフォレストの性能 #####')
    random_forest = RandomForestClassifier(
        trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    _benchmark_classifier(random_forest, X_train, X_test, y_train, y_test)
def main():
    """Train a decision tree from a text file and answer queries interactively.

    The training data is read from the file named by the command-line
    arguments. After the tree is trained and printed, the function loops
    forever, reading one sample per line from stdin and printing the
    prediction. Lines for which ``line_to_int_list`` raises ``ValueError``
    are rejected with a retry message.
    """
    cli_args = get_cmd_ln_arguments()
    # The row count is returned by the parser but is not needed here.
    num_columns, _num_rows, training_data = parse_txt_file(cli_args.filename)
    feature_names = get_feature_names(num_columns)

    classifier = DecisionTreeClassifier()
    print('Training a decision tree classifier...')
    classifier.fit(feature_names, training_data)

    print('Decision tree classifier trained:')
    classifier.print()
    print()
    print(
        'Entering a loop to query the decision tree. Press ctrl-c at anytime to exit.'
    )

    # Both messages depend only on num_columns, so build them once.
    prompt = 'Enter a sample ({} numbers separated by a space): '.format(
        num_columns)
    retry_message = (
        'Input was not {} numbers separated by a space. Please try again. '
        .format(num_columns))

    while True:
        raw_line = input(prompt)
        try:
            parsed_sample = line_to_int_list(raw_line)
        except ValueError:
            print(retry_message)
            continue
        print('Prediction: {}'.format(classifier.predict(parsed_sample)))
def test():
    """Train a gini decision tree on a bagged iris sample and print accuracy.

    Reads ``./data_set/iris_1_3.csv`` (first row is the header), treats the
    last column as the label, holds out one third of the rows for testing,
    fits the tree on the subset returned by ``sampling_bagging`` and prints
    the true labels, the predicted labels and the accuracy percentage.
    """
    frame = pd.read_csv('./data_set/iris_1_3.csv', header=0)
    samples = np.array(frame, dtype='float')

    # Every column but the last is a feature; the last column is the label.
    X = samples[:, :-1]
    y = samples[:, -1]
    X_train, X_test, y_train, y_true = train_test_split(
        X, y, test_size=1 / 3., random_state=6)

    tree = DecisionTreeClassifier(criterion='gini')

    # Sample with replacement: the sampled rows train the tree. The
    # out-of-bag rows are returned as well but are not used in this function.
    X_bag, y_bag, _out_of_bag = sampling_bagging(X_train, y_train)
    tree.fit(X_bag, y_bag)

    print('y_true : ', y_true.tolist())
    predicted = tree.predict(X_test)
    print('pre_lab: ', predicted.tolist())

    accuracy_percent = 100 * accuracy_score(y_true, predicted)
    print('The accuracy was ', accuracy_percent, '% on the test ')
def result():
    """Classify the diagnosis data in the request's JSON body.

    The JSON payload is wrapped in ``DiagnosisRequest`` and converted to an
    array. The ``DecisionTree`` row with the highest ``id`` supplies the
    stored tree, which seeds a ``DecisionTreeClassifier``.

    Returns:
        The ``jsonify`` response for ``{'result': <first prediction>}``.
    """
    x_test = DiagnosisRequest(request.get_json()).to_np_array()

    # Ordering by id descending and taking the first row picks the newest entry.
    latest_model = DecisionTree.query.order_by(DecisionTree.id.desc()).first()
    classifier = DecisionTreeClassifier(initial_tree=latest_model.tree)

    predictions = classifier.predict(x_test)
    return jsonify({'result': predictions.tolist()[0]})
def visualize(model, max_depth=None):
    """Plot the decision regions of a classifier on the iris petal features.

    Only feature columns 2 and 3 of the iris data are used. The fitted
    model's predictions over a grid are drawn as filled contours, the samples
    are scattered on top, and the figure is saved under ``figures/iris/``.

    Args:
        model: ``'decision_tree'`` selects ``DecisionTreeClassifier``; any
            other value selects ``RandomForestClassifier``. Also used in the
            output file name.
        max_depth: Depth limit passed to the classifier. When ``None`` the
            classifier is built with its own defaults.
    """
    iris = datasets.load_iris()
    petal_features = iris['data'][:, 2:]
    targets = iris['target']

    if model == 'decision_tree':
        classifier_cls = DecisionTreeClassifier
    else:
        classifier_cls = RandomForestClassifier

    # With no limit given, leave the depth unrestricted. The iris dataset has
    # few samples and classes, so training stays fast even without a limit.
    if max_depth is None:
        clf = classifier_cls()
    else:
        clf = classifier_cls(max_depth=max_depth)
    clf.fit(petal_features, targets)

    # Extend the observed range by 1 on each side, clamping the lower bound at 0.
    first_axis = petal_features[:, 0]
    second_axis = petal_features[:, 1]
    x_min = max(0, first_axis.min() - 1)
    y_min = max(0, second_axis.min() - 1)
    x_max = first_axis.max() + 1
    y_max = second_axis.max() + 1

    # Build a mesh of coordinates covering that range.
    grid_interval = 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_interval),
                         np.arange(y_min, y_max, grid_interval))

    # Classify every mesh point with the trained model.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid_points)

    # Draw the predicted class of each mesh point.
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Overlay the samples, one scatter call per class.
    for label in np.unique(targets):
        members = targets == label
        plt.scatter(petal_features[members, 0], petal_features[members, 1])

    axis_names = iris['feature_names']
    plt.xlabel(axis_names[2])
    plt.ylabel(axis_names[3])

    if max_depth is None:
        title = 'Max Depth : No Limitation'
        output_path = 'figures/iris/{}_no_limit.png'.format(model)
    else:
        title = 'Max Depth : ' + str(max_depth)
        output_path = 'figures/iris/{}_depth_{}.png'.format(model, max_depth)
    plt.title(title)
    plt.savefig(output_path)
    plt.close()
def test_predict(self):
    """Check predictions for three hand-picked samples after fitting."""
    clf = DecisionTreeClassifier()
    clf.fit(self.X_train, self.y_train, self.feature_names)

    # Each case pairs one sample's raw feature values with the expected output.
    cases = [
        # sunny outlook + high humidity -> don't play
        (['sunny', 'hot', 'high', False], False),
        # sunny outlook + normal humidity -> play
        (['sunny', 'hot', 'normal', False], True),
        # overcast outlook -> play
        (['overcast', 'hot', 'high', False], True),
    ]

    for raw_sample, expected in cases:
        output = clf.predict(np.array(raw_sample))
        assert output == expected
def test_decision_tree_classifier_numerical_split_hard(self):
    """Fit on labels that alternate with a numeric feature and recover them.

    The first feature takes each value 0..11 twice and the second feature is
    always 0. Even first-feature values are labelled 1 and odd values 0.
    """
    model = DecisionTreeClassifier()

    values = range(12)
    # Two identical rows per value, matching the alternating labels below.
    features = np.array([[value, 0] for value in values for _ in range(2)])
    labels = np.array([1, 1, 0, 0] * 6)
    model.fit(features, labels)

    # To debug, inspect the fitted tree with pprint(model._node).

    queries = np.array([[value, 0] for value in values])
    predictions = model.predict(queries)
    self.assertEqual(predictions.tolist(), [1, 0] * 6)
def _report_model_performance(label, clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` and print its timings and accuracy, prefixed by ``label``.

    Args:
        label: Text printed at the start of each report line.
        clf: Classifier exposing ``fit``, ``predict`` and ``accuracy_score``.
        X_train: Training samples passed to ``fit``.
        X_test: Held-out samples passed to ``predict``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
    """
    # perf_counter() is monotonic and high resolution. Fitting the small iris
    # dataset is fast enough that a coarse wall clock could report zero.
    fit_start = time.perf_counter()
    clf.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    clf.predict(X_test)  # only the elapsed time matters; predictions are discarded
    predict_time = time.perf_counter() - predict_start
    print('{} 学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]'.format(
        label, fit_time, predict_time))

    train_accuracy = clf.accuracy_score(X_train, y_train)
    test_accuracy = clf.accuracy_score(X_test, y_test)
    print('{} train accuracy : {:.4f} test_accuracy : {:.4f}'.format(
        label, train_accuracy, test_accuracy))


def main():
    """Benchmark and visualize tree models on iris for several depth limits.

    The iris dataset is split 70/30 with ``random_state=0``. For each depth
    limit in 1, 2, 3 and ``None`` (no limit), a decision tree and a random
    forest are trained on all features, timed and scored. Both models are
    then visualized by calling ``visualize`` with the same depth.
    """
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # Depth limits 1 to 3, plus None for an unrestricted tree.
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))

        # Accuracy, training time and inference time using all features.
        _report_model_performance(
            '決定木', DecisionTreeClassifier(max_depth=depth),
            X_train, X_test, y_train, y_test)
        _report_model_performance(
            'ランダムフォレスト', RandomForestClassifier(max_depth=depth),
            X_train, X_test, y_train, y_test)

        # Restrict to the two petal features for a two-dimensional plot.
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
def _compare_sklearn_dataset(self, dataset):
    """Assert that this project's tree matches the sklearn tree on ``dataset``.

    Both classifiers are fitted on the full dataset and asked to predict the
    same samples; the two prediction lists must be equal.

    Args:
        dataset: Object exposing ``data`` and ``target`` attributes. The
            previous implementation overwrote this argument with
            ``load_iris()``, so it was silently ignored.
            NOTE(review): callers are not visible here. To stay compatible
            with them, ``None`` falls back to ``load_iris()`` and a callable
            is treated as a loader and invoked. Confirm against the call sites.
    """
    if dataset is None:
        dataset = load_iris()
    elif callable(dataset):
        dataset = dataset()

    features = dataset.data
    labels = dataset.target

    model_sklearn = DecisionTreeClassifierSklearn()
    model_sklearn.fit(features, labels)
    predictions_sklearn = model_sklearn.predict(features)

    model = DecisionTreeClassifier()
    model.fit(features, labels)
    predictions = model.predict(features)

    self.assertEqual(predictions.tolist(), predictions_sklearn.tolist())
def test_decision_tree_classifier_fit(self):
    """Fit on repeated XOR samples and check the four input combinations."""
    model = DecisionTreeClassifier()

    # XOR problem: (point, label, number of repetitions in the training set).
    xor_table = [
        ([0, 0], 0, 2),
        ([0, 1], 1, 3),
        ([1, 0], 1, 3),
        ([1, 1], 0, 3),
    ]
    features = np.array(
        [point for point, _, repeats in xor_table for _ in range(repeats)])
    labels = np.array(
        [label for _, label, repeats in xor_table for _ in range(repeats)])
    model.fit(features, labels)

    # To debug, inspect the fitted tree with pprint(model._node).

    queries = np.array([point for point, _, _ in xor_table])
    expected = [label for _, label, _ in xor_table]
    predictions = model.predict(queries)
    self.assertEqual(predictions.tolist(), expected)
random.shuffle(image_list) X = [] y = [0] * len(image_list) for i in range(0, len(image_list)): image = image_list[i] if image in dorsal_features: y[i] = 0 else: y[i] = 1 X.append(label_folder_features[image]) X = np.array(X) y = np.array(y) decisiontree.fit(X, y) for image_id, feature in unlabelled_features.items(): val = decisiontree.predict([feature]) # print(image_id, val) if val[0] == 0: result[image_id] = 'dorsal' else: result[image_id] = 'palmar' elif classifier == 'SVM': dorsal_images = list(dorsal_features.keys()) palmar_images = list(palmar_features.keys()) image_list = dorsal_images image_list.extend(palmar_images) random.shuffle(image_list) X = [] y = [0] * len(image_list)