def test():
    data_train = pd.read_csv('./data_set/iris_1_3.csv', header=0)
    train_data = np.array(data_train, 'float')
    X = train_data[:, :-1]
    y = train_data[:, -1]
    X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=1 / 3., random_state=6)
    d_tree = DecisionTreeClassifier(criterion='gini')
    # Sample with replacement: train the tree on the bootstrap sample and validate on the out-of-bag data
    X_subset, y_subset, out_of_bag_data = sampling_bagging(X_train, y_train)
    d_tree.fit(X_subset, y_subset)
    # Use the out-of-bag data to tune the tree
    print('y_true : ', y_true.tolist())
    pre_lab = d_tree.predict(X_test)
    print('pre_lab: ', pre_lab.tolist())
    # print('test_data\n', np.column_stack((X_test, y_true)))
    print('The accuracy was ', 100 * accuracy_score(y_true, pre_lab), '% on the test set')
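# A minimal sketch of the sampling_bagging helper used above -- an assumption,
# since its definition is not shown here. It draws a bootstrap sample (with
# replacement) and returns the rows that were never drawn as out-of-bag data.
import numpy as np

def sampling_bagging(X, y):
    n = len(y)
    idx = np.random.randint(0, n, size=n)  # indices drawn with replacement
    oob_mask = np.ones(n, dtype=bool)
    oob_mask[idx] = False                  # rows never drawn are out-of-bag
    out_of_bag_data = np.column_stack((X[oob_mask], y[oob_mask]))
    return X[idx], y[idx], out_of_bag_data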
def compare_performance(trees_num, max_depth, bootstrap):
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)

    # Decision tree
    print('##### Decision tree performance #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    dt_lr_start = time.time()  # record training start time
    decision_tree.fit(X_train, y_train)
    dt_lr_time = time.time() - dt_lr_start  # training time
    dt_est_start = time.time()  # record inference start time
    y_est = decision_tree.predict(X_test)
    dt_est_time = time.time() - dt_est_start  # inference time
    print('Training time : {:.6f} [sec]  Inference time : {:.6f} [sec]'.format(dt_lr_time, dt_est_time))
    dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
    dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}  test accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

    # Random forest
    print('##### Random forest performance #####')
    random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    rf_lr_start = time.time()  # record training start time
    random_forest.fit(X_train, y_train)
    rf_lr_time = time.time() - rf_lr_start  # training time
    rf_est_start = time.time()  # record inference start time
    y_est = random_forest.predict(X_test)
    rf_est_time = time.time() - rf_est_start  # inference time
    print('Training time : {:.6f} [sec]  Inference time : {:.6f} [sec]'.format(rf_lr_time, rf_est_time))
    rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
    rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}  test accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))
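# The custom classifiers above expose an accuracy_score(X, y) method (unlike
# sklearn's score()). A minimal sketch of what such a method might look like --
# an assumption, since the class bodies are not shown here:
import numpy as np

def accuracy_score(self, X, y):
    y_pred = self.predict(X)
    return np.mean(y_pred == y)  # fraction of correctly classified samples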
def main():
    args = get_cmd_ln_arguments()
    num_columns, num_rows, training_data = parse_txt_file(args.filename)
    feature_names = get_feature_names(num_columns)
    decision_tree = DecisionTreeClassifier()
    print('Training a decision tree classifier...')
    decision_tree.fit(feature_names, training_data)
    print('Decision tree classifier trained:')
    decision_tree.print()
    print()
    print('Entering a loop to query the decision tree. Press ctrl-c at any time to exit.')
    while True:
        sample = input('Enter a sample ({} numbers separated by a space): '.format(num_columns))
        try:
            sample = line_to_int_list(sample)
        except ValueError:
            print('Input was not {} numbers separated by a space. Please try again.'.format(num_columns))
            continue
        prediction = decision_tree.predict(sample)
        print('Prediction: {}'.format(prediction))
def test_fit(self):
    clf = DecisionTreeClassifier()
    clf.fit(self.X_train, self.y_train, self.feature_names)

    # verify the decision tree looks like this
    #
    #               feature:
    #               outlook
    #              /   |   \
    #             /    |    \
    #      rainy /  overcast \ sunny
    #           /      |      \
    #          /       |       \
    #     feature:   class:   feature:
    #      windy      True    humidity
    #      /   \               /    \
    # False/    \True     high/      \normal
    #     /      \           /        \
    #  class:   class:    class:    class:
    #   True    False     False     True

    assert clf.root.feature == 'outlook'
    rainy_node = clf.root.children_by_attribute['rainy']
    overcast_node = clf.root.children_by_attribute['overcast']
    sunny_node = clf.root.children_by_attribute['sunny']
    assert rainy_node.feature == 'windy'
    assert overcast_node.classification == True
    assert sunny_node.feature == 'humidity'
    assert rainy_node.children_by_attribute['False'].classification == True
    assert rainy_node.children_by_attribute['True'].classification == False
    assert sunny_node.children_by_attribute['high'].classification == False
    assert sunny_node.children_by_attribute['normal'].classification == True
def test_benchmark_numerical():
    print('\n** Numerical benchmark **')
    df = pd.read_csv(DATA_PATH + 'dados_benchmark_v2.csv', sep=';')
    dt = DecisionTreeClassifier(
        target_attribute='Joga',
        n_random_attributes=4
    )
    dt.fit(df)
    dt.print_tree()
def main():
    curr_dir = os.path.dirname(__file__)
    csv_file = os.path.join(curr_dir, 'data/play.csv')
    df = pd.read_csv(csv_file, index_col='Dia')
    X, y = df.loc[:, df.columns != 'Jogar'], df['Jogar']
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    print(clf.rules())
def test_decision_tree_classifier_numerical_split(self):
    model = DecisionTreeClassifier()
    # feature[0] of 3 or 1 -> label 1
    # feature[0] of 2 or 0 -> label 0
    features = np.array([[3, 0], [3, 0], [3, 0], [2, 0], [2, 0], [2, 0],
                         [1, 0], [1, 0], [1, 0], [0, 0], [0, 0]])
    labels = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0])
    model.fit(features, labels)
def __generate_trees(self, df, n_baggings):
    '''
    Given 'n' baggings, generate one tree model for each of them
    '''
    # iterate over the bagging index sets by position (iterating over the
    # collection itself would yield the index sets, not usable keys)
    for i in range(len(n_baggings)):
        clf = DecisionTreeClassifier(
            self.target_attribute,
            self.n_random_attributes
        )
        clf.fit(df.iloc[n_baggings[i], :])
        self.trees[i] = clf
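# A hypothetical sketch of how the n_baggings argument could be built -- a list
# of row-index arrays, one bootstrap sample per tree. This helper is an
# assumption, not part of the original code.
import numpy as np

def generate_baggings(df, n_trees):
    n_rows = len(df)
    return [np.random.randint(0, n_rows, size=n_rows)  # indices drawn with replacement
            for _ in range(n_trees)]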
def test_decision_tree_classifier_exhaustive_categorical_split(self):
    model = DecisionTreeClassifier(categorical_feature_indeces=[0, 1])
    # feature[0] of 3 or 1 -> label 1
    # feature[0] of 2 or 0 -> label 0 or 2
    features = np.array([[3, 0], [3, 0], [3, 0], [2, 0], [2, 0], [2, 0],
                         [1, 0], [1, 0], [1, 0], [0, 0], [0, 1]])
    labels = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2])
    model.fit(features, labels)
def compare_depth():
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)

    # Find the best hyperparameters for the random forest
    trees_num, bootstrap = grid_search_RF()

    # Vary the depth limit of the decision trees and measure accuracy
    depth_list = [i for i in range(21)]  # examine depth limits from 0 to 20
    dt_train_acc_list = []
    dt_test_acc_list = []
    rf_train_acc_list = []
    rf_test_acc_list = []
    for depth in tqdm(depth_list):
        print('***** max_depth = {} *****'.format(depth))
        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        decision_tree.fit(X_train, y_train)
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        # Append the accuracies to the lists
        dt_train_acc_list.append(dt_train_accuracy)
        dt_test_acc_list.append(dt_test_accuracy)
        print('Decision tree  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))
        # Random forest
        random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=depth, bootstrap=bootstrap)
        random_forest.fit(X_train, y_train)
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        # Append the accuracies to the lists
        rf_train_acc_list.append(rf_train_accuracy)
        rf_test_acc_list.append(rf_test_accuracy)
        print('Random forest  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

    # Draw the graph
    plt.plot(depth_list, dt_train_acc_list, label='Decision Tree - train accuracy', color='r')
    plt.plot(depth_list, dt_test_acc_list, label='Decision Tree - test accuracy', color='g')
    plt.plot(depth_list, rf_train_acc_list, label='Random Forest - train accuracy', color='y')
    plt.plot(depth_list, rf_test_acc_list, label='Random Forest - test accuracy', color='b')
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.xlim(0, 20)
    plt.xticks(np.arange(0, 21, 2))
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.title('Max Depth of Decision Trees and Accuracy')
    # Save the graph
    plt.savefig('figures/mnist/max_depth_&_accuracy.png')
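# A hypothetical sketch of the grid_search_RF helper called above -- an
# assumption, since its definition is not shown. It tries a small grid of
# trees_num / bootstrap values (the candidate values here are made up) and
# returns the pair with the best test accuracy.
from sklearn import datasets
from sklearn.model_selection import train_test_split

def grid_search_RF():
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)
    best = (None, None)
    best_acc = -1.0
    for trees_num in [5, 10, 20]:          # candidate forest sizes (assumed grid)
        for bootstrap in [0.5, 0.8, 1.0]:  # candidate bootstrap ratios (assumed grid)
            rf = RandomForestClassifier(trees_num=trees_num, max_depth=None, bootstrap=bootstrap)
            rf.fit(X_train, y_train)
            acc = rf.accuracy_score(X_test, y_test)
            if acc > best_acc:
                best_acc = acc
                best = (trees_num, bootstrap)
    return best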
def visualize(model, max_depth=None):
    iris_dataset = datasets.load_iris()
    petal_features = iris_dataset['data'][:, 2:]
    targets = iris_dataset['target']

    if max_depth is None:
        # Do not limit the maximum depth of the trees.
        # For the iris dataset the sample and class counts are small, so
        # training stays fast even without a depth limit.
        if model == 'decision_tree':
            clf = DecisionTreeClassifier()
        else:
            clf = RandomForestClassifier()
    else:
        if model == 'decision_tree':
            clf = DecisionTreeClassifier(max_depth=max_depth)
        else:
            clf = RandomForestClassifier(max_depth=max_depth)
    clf.fit(petal_features, targets)

    # Compute the data range +-1
    x_min = max(0, petal_features[:, 0].min() - 1)
    y_min = max(0, petal_features[:, 1].min() - 1)
    x_max = petal_features[:, 0].max() + 1
    y_max = petal_features[:, 1].max() + 1

    # Build mesh coordinates over the range of the training data
    grid_interval = 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_interval),
                         np.arange(y_min, y_max, grid_interval))

    # Classify every mesh point with the trained model
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Plot the classification result for each point as a decision region
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Also plot the data points
    for c in np.unique(targets):
        plt.scatter(petal_features[targets == c, 0],
                    petal_features[targets == c, 1])
    feature_names = iris_dataset['feature_names']
    plt.xlabel(feature_names[2])
    plt.ylabel(feature_names[3])
    if max_depth is None:
        plt.title('Max Depth : No Limitation')
        plt.savefig('figures/iris/{}_no_limit.png'.format(model))
    else:
        plt.title('Max Depth : ' + str(max_depth))
        plt.savefig('figures/iris/{}_depth_{}.png'.format(model, max_depth))
    plt.close()
def test_predict(self):
    clf = DecisionTreeClassifier()
    clf.fit(self.X_train, self.y_train, self.feature_names)
    expected_for_x = [
        (np.array(['sunny', 'hot', 'high', False]), False),    # sunny outlook + high humidity -> don't play
        (np.array(['sunny', 'hot', 'normal', False]), True),   # sunny outlook + normal humidity -> play
        (np.array(['overcast', 'hot', 'high', False]), True),  # overcast outlook -> play
    ]
    for x, expected in expected_for_x:
        output = clf.predict(x)
        assert output == expected
def fit(self, X, y):
    self.forest = []
    N = len(y)
    N_sub_data = int(N * self.bootstrap)
    for i in range(self.trees_num):
        self.shuffle(X, y)
        X_sub = X[:N_sub_data]
        y_sub = y[:N_sub_data]
        decision_tree = DecisionTreeClassifier(self.features_num, self.max_depth)
        decision_tree.fit(X_sub, y_sub)
        # Append the trained decision tree to the forest
        self.forest.append(decision_tree)
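# A minimal sketch of the predict() that would pair with the fit() above,
# taking a majority vote across the trees in self.forest. This is an
# assumption about the class, not the original implementation.
import numpy as np

def predict(self, X):
    # collect each tree's predictions: shape (trees_num, n_samples)
    votes = np.array([tree.predict(X) for tree in self.forest])
    # majority vote per sample
    return np.array([np.bincount(votes[:, j].astype(int)).argmax()
                     for j in range(votes.shape[1])])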
def test_decision_tree_classifier_numerical_split_hard(self):
    model = DecisionTreeClassifier()
    # even feature[0] values -> label 1
    # odd feature[0] values -> label 0
    features = np.array([[0, 0], [0, 0], [1, 0], [1, 0], [2, 0], [2, 0],
                         [3, 0], [3, 0], [4, 0], [4, 0], [5, 0], [5, 0],
                         [6, 0], [6, 0], [7, 0], [7, 0], [8, 0], [8, 0],
                         [9, 0], [9, 0], [10, 0], [10, 0], [11, 0], [11, 0]])
    labels = np.array([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
                       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0])
    model.fit(features, labels)
    """
    print("test_decision_tree_classifier_numerical_split_hard")
    from pprint import pprint
    pprint(model._node)
    """
    predictions = model.predict(
        np.array([[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0],
                  [6, 0], [7, 0], [8, 0], [9, 0], [10, 0], [11, 0]]))
    self.assertEqual(predictions.tolist(),
                     [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
def main():
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'],
                                                        test_size=0.3, random_state=0)

    # Examine depth limits of 1 to 3 as well as the unlimited case
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))
        # Measure accuracy, training time, inference time, and generalization
        # when all features are used

        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        dt_lr_start = time.time()  # record training start time
        decision_tree.fit(X_train, y_train)
        dt_lr_time = time.time() - dt_lr_start  # training time
        dt_est_start = time.time()  # record inference start time
        y_est = decision_tree.predict(X_test)
        dt_est_time = time.time() - dt_est_start  # inference time
        print('Decision tree  training time : {:.6f} [sec]  inference time : {:.6f} [sec]'.format(dt_lr_time, dt_est_time))
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        print('Decision tree  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

        # Random forest
        random_forest = RandomForestClassifier(max_depth=depth)
        rf_lr_start = time.time()  # record training start time
        random_forest.fit(X_train, y_train)
        rf_lr_time = time.time() - rf_lr_start  # training time
        rf_est_start = time.time()  # record inference start time
        y_est = random_forest.predict(X_test)
        rf_est_time = time.time() - rf_est_start  # inference time
        print('Random forest  training time : {:.6f} [sec]  inference time : {:.6f} [sec]'.format(rf_lr_time, rf_est_time))
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        print('Random forest  train accuracy : {:.4f}  test accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

        # Restrict to two features (petal length, petal width) and visualize in 2D
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
def _compare_sklearn_dataset(self, dataset):
    # use the dataset passed in by the caller
    features = dataset.data
    labels = dataset.target

    model_sklearn = DecisionTreeClassifierSklearn()
    model_sklearn.fit(features, labels)
    predictions_sklearn = model_sklearn.predict(features)

    model = DecisionTreeClassifier()
    model.fit(features, labels)
    predictions = model.predict(features)

    self.assertEqual(predictions.tolist(), predictions_sklearn.tolist())
def test_decision_tree_classifier_fit(self):
    model = DecisionTreeClassifier()
    # XOR problem
    features = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 1], [1, 0],
                         [1, 0], [1, 0], [1, 1], [1, 1], [1, 1]])
    labels = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0])
    model.fit(features, labels)
    """
    from pprint import pprint
    pprint(model._node)
    """
    predictions = model.predict(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))
    self.assertEqual(predictions.tolist(), [0, 1, 1, 0])
def fit(self, X, y):
    # print('===', sys._getframe().f_code.co_filename, sys._getframe().f_code.co_name, sys._getframe().f_lineno, "===")
    self.forest = []
    self.n_calss_num = len(set(y))  # number of classes
    self.n_calss = list(set(y))     # set of class labels
    for i in range(self.n_estimators):
        # Sample randomly with replacement; self.bootstrap is the fraction of
        # the training set drawn for each tree
        X_subset, y_subset = sampling_with_reset(X, y, self.bootstrap)
        tree = DecisionTreeClassifier(self.max_features, self.criterion,
                                      self.max_depth, self.min_samples_split,
                                      self.min_impurity_split)
        # Print the tree's label
        print('tree_' + str(i))
        tree.fit(X_subset, y_subset)
        self.forest.append(tree)  # collection of trees
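# A minimal sketch of the sampling_with_reset helper used above -- an
# assumption, since its definition is not shown. It draws a fraction
# `bootstrap` of the rows with replacement.
import numpy as np

def sampling_with_reset(X, y, bootstrap):
    n = int(len(y) * bootstrap)
    idx = np.random.randint(0, len(y), size=n)  # indices drawn with replacement
    return X[idx], y[idx]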
from sklearn import datasets
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in modern sklearn
from decision_tree import DecisionTreeClassifier

iris = datasets.load_iris()
X, Y = iris.data, iris.target

clf = DecisionTreeClassifier()
clf.fit(X, Y)
print(cross_val_score(clf, X, Y))
clf.draw_tree('decision_tree_example.png')
from sklearn.datasets import load_iris
from decision_tree import DecisionTreeClassifier
from sklearn import tree

# load the iris dataset
dataset = load_iris()

# set X and y variables
X, y = dataset.data, dataset.target
print(':::::::::::::::::::::::::::::::::::::::::::::')
print(f'APPROPRIATE X, y DATATYPES: {type(X)}')
print(':::::::::::::::::::::::::::::::::::::::::::::')

# create a new instance of the DecisionTreeClassifier object
clf = DecisionTreeClassifier(max_depth=5)

# call the fit method on that object
clf.fit(X, y)

print('')
print(':::::::::::::PREDICTIONS:::::::::::::::::::::')
print('')
print(':::::::::::::::::::::::::::::::::::::::::::::')
inputs = [[1, 1.5, 5, 1.5]]
print(f'INPUTS: {inputs}')
print(f'OUR MODEL PREDICTION: {clf.predict(inputs)}')

clf2 = tree.DecisionTreeClassifier(max_depth=5)
clf2.fit(X, y)
print(f'SCIKITLEARN MODEL PREDICTION: {clf2.predict(inputs)}')
print(':::::::::::::::::::::::::::::::::::::::::::::')
        image_list = dorsal_images
        image_list.extend(palmar_images)
        random.shuffle(image_list)
        X = []
        y = [0] * len(image_list)
        for i in range(0, len(image_list)):
            image = image_list[i]
            if image in dorsal_features:
                y[i] = 0
            else:
                y[i] = 1
            X.append(label_folder_features[image])
        X = np.array(X)
        y = np.array(y)
        decisiontree.fit(X, y)
        for image_id, feature in unlabelled_features.items():
            val = decisiontree.predict([feature])
            # print(image_id, val)
            if val[0] == 0:
                result[image_id] = 'dorsal'
            else:
                result[image_id] = 'palmar'
    elif classifier == 'SVM':
        dorsal_images = list(dorsal_features.keys())
        palmar_images = list(palmar_features.keys())
        image_list = dorsal_images
        image_list.extend(palmar_images)
        random.shuffle(image_list)
        X = []