def preprocess_and_save_1st_data(dataset_folder_path, output_path, aug=True):
    """ Preprocess Training and Validation Data """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    u = util()

    # Load training data ======================================================
    true_data, true_label = u.load_1st_data(
        os.path.join(dataset_folder_path, "crop_true"), ["0", "1", "2"])
    false_data, false_label = u.load_1st_data(
        os.path.join(dataset_folder_path, "crop_false"), ["3"])
    data = np.append(true_data, false_data, axis=0)
    label = np.append(true_label, false_label, axis=0)
    pickle.dump((data, label),
                open(os.path.join(output_path, 'raw_data_label.p'), 'wb'),
                protocol=4)

    print("[Before] data: ", np.shape(data))

    # Preprocess training & validation data
    train_data = np.empty((0, 360, 201, 1), float)
    train_label = np.empty((0, 1), int)
    test_data = np.empty((0, 360, 201, 1), float)
    test_label = np.empty((0, 1), int)

    idx = list(range(data.shape[0]))
    random.shuffle(idx)
    train_sample_num = int(0.7 * len(idx))

    # Split the shuffled indices into a 70/30 train/test partition
    # (the slice starts at 0 so the first shuffled sample is not dropped).
    train_data = np.append(train_data, data[idx[:train_sample_num]], axis=0)
    train_label = np.append(train_label, label[idx[:train_sample_num]], axis=0)
    test_data = np.append(test_data, data[idx[train_sample_num:]], axis=0)
    test_label = np.append(test_label, label[idx[train_sample_num:]], axis=0)

    if aug:
        train_data, train_label = u._augmentation(train_data, train_label)

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])

    # One-hot encode
    train_label = preprc.one_hot_encode(train_label, dim=2)
    test_label = preprc.one_hot_encode(test_label, dim=2)

    print("[After] train_data shape: ", np.shape(train_data))
    print("[After] train_label shape: ", np.shape(train_label))
    print("[After] test_data shape: ", np.shape(test_data))
    print("[After] test_label shape: ", np.shape(test_label))

    # Save training data
    pickle.dump((train_data, train_label),
                open(os.path.join(output_path, 'preprocess_train.p'), 'wb'),
                protocol=4)
    pickle.dump((test_data, test_label),
                open(os.path.join(output_path, 'preprocess_test.p'), 'wb'),
                protocol=4)
def preprocess_and_save_data(dataset_folder_path, output_path, label_type, aug=True):
    """ Preprocess Training and Validation Data """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    u = util()

    # Load training data ======================================================
    data, label = u.load_data(os.path.join(dataset_folder_path, "crop_true"),
                              label_type)
    pickle.dump((data, label),
                open(os.path.join(output_path, 'raw_data_label.p'), 'wb'),
                protocol=4)

    # Preprocess training & validation data
    train_data = np.empty((0, 360, 201, 1), float)
    train_label = np.empty((0, 1), int)
    test_data = np.empty((0, 360, 201, 1), float)
    test_label = np.empty((0, 1), int)

    for curr_label_type in label_type:
        print("[Before] {} data: ".format(curr_label_type),
              np.shape(data[curr_label_type]))
        idx = list(range(data[curr_label_type].shape[0]))
        random.shuffle(idx)
        train_sample_num = int(0.7 * len(idx))

        if aug:
            # Class "0" is augmented lightly (level=1) and class "1" more
            # heavily (level=5) to balance the two classes.
            if curr_label_type == "0":
                aug_train_data, aug_train_label = u._augmentation(
                    data[curr_label_type][idx[:train_sample_num]],
                    label[curr_label_type][idx[:train_sample_num]],
                    level=1)
            if curr_label_type == "1":
                aug_train_data, aug_train_label = u._augmentation(
                    data[curr_label_type][idx[:train_sample_num]],
                    label[curr_label_type][idx[:train_sample_num]],
                    level=5)
            print("[After] {} data: ".format(curr_label_type),
                  np.shape(aug_train_data))
            train_data = np.append(train_data, aug_train_data, axis=0)
            train_label = np.append(train_label, aug_train_label, axis=0)

        if curr_label_type == "0":
            test_data = np.append(test_data,
                                  data[curr_label_type][idx[-216:]], axis=0)
            test_label = np.append(test_label,
                                   label[curr_label_type][idx[-216:]], axis=0)
        if curr_label_type == "1":
            test_data = np.append(
                test_data, data[curr_label_type][idx[train_sample_num:]], axis=0)
            test_label = np.append(
                test_label, label[curr_label_type][idx[train_sample_num:]], axis=0)

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])

    # One-hot encode
    train_label = preprc.one_hot_encode(train_label, dim=2)
    test_label = preprc.one_hot_encode(test_label, dim=2)

    print("[After] train_data shape: ", np.shape(train_data))
    print("[After] train_label shape: ", np.shape(train_label))
    print("[After] test_data shape: ", np.shape(test_data))
    print("[After] test_label shape: ", np.shape(test_label))

    # Save training data
    pickle.dump((train_data, train_label),
                open(os.path.join(output_path, 'preprocess_train.p'), 'wb'),
                protocol=4)
    pickle.dump((test_data, test_label),
                open(os.path.join(output_path, 'preprocess_test.p'), 'wb'),
                protocol=4)
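# Usage sketch (added for illustration, not part of the original script): the
# "./dataset" and "./preprocessed*" paths below are hypothetical placeholders,
# and the ["0", "1"] label_type argument is an assumption based on the classes
# handled above. It only uses os/pickle, which this module already relies on.
if __name__ == "__main__":
    preprocess_and_save_1st_data("./dataset", "./preprocessed_1st", aug=True)
    preprocess_and_save_data("./dataset", "./preprocessed", ["0", "1"], aug=True)

    # Reload one of the pickles written above to confirm the saved shapes.
    with open(os.path.join("./preprocessed", "preprocess_train.p"), "rb") as f:
        train_data, train_label = pickle.load(f)
    print("Loaded train set:", train_data.shape, train_label.shape)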
from preprocess.make_lower_case import *
from preprocess.eliminate_stop_words import *
from preprocess.replace_negation_words import *
from preprocess.tokenization import *
from preprocess.one_hot_encode import *
from preprocess.embed_200 import *
from preprocess.spellingcheck import *
from preprocess.extract_redundant_words import *

# Instantiate the preprocessing step objects.
make_lower_case = make_lower_case(0, "make_lower_case", 1)
eliminate_stop_words = eliminate_stop_words(-5, "eliminate_stop_words", 2)
replace_negation_words = replace_negation_words(5, "replace_negation_words", 3)
tokenization = tokenization(0, "tokenization", 4)
one_hot_encode = one_hot_encode(-100, "one_hot_encode", 5)
spellingcheck = spellingcheck(50, "spellingcheck", 6)
embed_200 = embed_200(0, "embed_200", 7)
def do_knn(df):
    ###################### Data preparation ####################
    # df = read_mushroom_data()
    df2, df2_columns = preprocess.one_hot_encode(df)
    df2 = pd.DataFrame(df2, columns=df2_columns)

    # print("number of samples: ", df.shape[0])
    # print("number of attributes: ", df.shape[1])
    # print(
    #     "\nValues classified as 'Missing' for stalk-root: ",
    #     (df.iloc[:, 11] == "?").sum(),
    # )
    # print("\nNumber of samples: ", df2.shape[0])
    # print("Number of attributes: ", df2.shape[1])
    # print(
    #     "\nRemaining missing values across all attributes and samples: ",
    #     df2.isnull().sum().sum(),
    # )
    # print("\nMinimum value across all attributes and samples: ", df2.min().min())
    # print("Maximum value across all attributes and samples: ", df2.max().max())
    # print(
    #     "\nMinimum fraction of '1'-s across all attributes: {:.5f}".format(
    #         df2.mean().min()
    #     )
    # )
    # print(
    #     "Maximum fraction of '1'-s across all attributes: {:.5f}".format(
    #         df2.mean().max()
    #     )
    # )

    kf = KFold(n_splits=5, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors=100)
    accuracies = []
    best_predictions = pd.DataFrame([], columns=df2_columns)
    best_test_labels = pd.DataFrame([], columns=df2_columns)
    class_columns, feature_columns = dataset_utility.get_split_column_names(
        df2, get_class_column_names())

    for i in range(5):
        result = next(kf.split(df2), None)

        # Define poisonous as 1 and edible as 0 for the target
        x = df2.iloc[:, 2:]
        y = df2.iloc[:, 1]
        x_train = x.iloc[result[0]]
        x_test = x.iloc[result[1]]
        y_train = y.iloc[result[0]]
        y_test = y.iloc[result[1]]

        # Reduce the dimensionality of the features from 113 columns to two
        # principal components; the PCA projection summarizes the correlations
        # (or lack thereof) among all features in a 2-D space for plotting.
        pca = PCA(n_components=2).fit(x_train)
        x_train = pca.transform(x_train)
        x_test = pca.transform(x_test)

        plt.figure(dpi=120)
        plt.scatter(
            x_train[y_train.values == 0, 0],
            x_train[y_train.values == 0, 1],
            label="Edible",
            alpha=0.5,
            s=2,
        )
        plt.scatter(
            x_train[y_train.values == 1, 0],
            x_train[y_train.values == 1, 1],
            label="Poisonous",
            alpha=0.5,
            s=2,
        )
        plt.title("Mushroom Data Set\nFirst Two Principal Components")
        plt.legend(frameon=True)
        plt.xlabel("PC 1")
        plt.ylabel("PC 2")
        plt.gca().set_aspect("equal")
        plt.savefig("knn.png")

        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        # print("y_pred: ", y_pred)

        acc = accuracy_score(y_pred, y_test)
        # print("Accuracy:", acc)
        accuracies.append(acc)
        if acc >= max(accuracies):
            best_predictions = y_pred
            best_test_labels = y_test

    print("K-fold results: ", accuracies)
    print("Mean accuracy: ", np.mean(accuracies))
    # print("Best prediction: ", best_prediction)
    # print(metrics.confusion_matrix(best_predictions, best_test_labels))

    cm_labels = ["edible", "poisonous"]
    df_cm = pd.DataFrame(
        metrics.confusion_matrix(best_predictions, best_test_labels,
                                 labels=[0.0, 1.0]),
        index=cm_labels,
        columns=cm_labels,
    )
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True, fmt="g")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig("knn-cm.png")
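# Usage sketch (illustrative, not from the original file): assumes a local
# "mushrooms.csv" and the dataset_utility helper used elsewhere in this repo.
# do_knn() then runs the PCA + KNN pipeline and writes knn.png and knn-cm.png.
if __name__ == "__main__":
    mushrooms = pd.read_csv("mushrooms.csv",
                            names=dataset_utility.get_column_names())
    do_knn(mushrooms)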
    return class_columns


def insert_class_columns(dataset):
    dt = dataset.copy()
    for column in get_class_column_names():
        dt.insert(df_enc.columns.get_loc(column), column, float("NaN"))
    return dt


df = pd.read_csv("mushrooms.csv", names=dataset_utility.get_column_names())

df_enc, df_enc_columns = preprocess.one_hot_encode(df)
df_enc = pd.DataFrame(df_enc, columns=df_enc_columns)

train, missing = preprocess.extract_missing(df_enc)
train = pd.DataFrame(train, columns=df_enc_columns)
missing = pd.DataFrame(missing, columns=df_enc_columns)

train = train.reset_index(drop=True)
missing = missing.reset_index(drop=True)

class_columns, feature_columns = dataset_utility.get_split_column_names(
    train, get_class_column_names())

missing = preprocess.remove_class_columns(missing, class_columns)
def pls_inspection(X: np.ndarray, Y: np.ndarray, n_comps: int):
    n_classes = len(np.unique(Y))
    Y_encoded = one_hot_encode(Y)

    model = cross_decomposition.PLSRegression(n_components=n_comps, scale=False)
    model.fit(X, Y_encoded)

    # Extract information
    scores = model.x_scores_
    loadings = model.x_loadings_
    var_scores = np.var(scores, axis=0)
    var_X = np.sum(np.var(X, axis=0))
    var_ratios = var_scores / var_X
    cum_var_ratios = np.cumsum(var_ratios)

    # Colormap
    cmap = plt.cm.jet
    cmaplist = [cmap(i) for i in range(cmap.N)]
    cmap = mpl.colors.LinearSegmentedColormap.from_list(
        'Custom map', cmaplist, cmap.N)
    bounds = np.linspace(0, n_classes, n_classes + 1)
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    # Explained variance plot
    plt.figure(num=3, figsize=(8, 6))
    plt.plot(np.pad(cum_var_ratios, (1, 0), 'constant'))
    plt.title('Explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Cumulative explained variance')
    plt.xlim((0, n_comps))
    plt.ylim((0, 1))

    # Loadings plots
    plt.figure(num=4, figsize=(8, 6))
    plt.plot(loadings[:, 0])
    plt.title('PC1 loadings')

    plt.figure(num=5, figsize=(8, 6))
    plt.plot(loadings[:, 1])
    plt.title('PC2 loadings')

    plt.figure(num=6, figsize=(8, 6))
    plt.plot(loadings[:, 2])
    plt.title('PC3 loadings')

    # 2D scores plot
    plt.figure(num=7, figsize=(8, 6))
    scat = plt.scatter(scores[:, 0], scores[:, 1], c=Y, s=2, cmap=cmap,
                       norm=norm)
    cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cb.set_label('Classes')
    plt.title('Scores plot (PLS)')
    plt.xlabel('PC1 ({:.2f}% explained variance)'.format(var_ratios[0] * 100))
    plt.ylabel('PC2 ({:.2f}% explained variance)'.format(var_ratios[1] * 100))

    # 3D scores plot
    fig = plt.figure(num=8, figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    scat = ax.scatter(scores[:, 0], scores[:, 1], scores[:, 2], c=Y, s=2,
                      cmap=cmap, norm=norm)
    cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cb.set_label('Classes')
    ax.set_title('Scores plot (PLS)')
    ax.set_xlabel('PC1 ({:.2f}% explained variance)'.format(var_ratios[0] * 100))
    ax.set_ylabel('PC2 ({:.2f}% explained variance)'.format(var_ratios[1] * 100))
    ax.set_zlabel('PC3 ({:.2f}% explained variance)'.format(var_ratios[2] * 100))

    plt.show()
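# Usage sketch (illustrative only): exercises pls_inspection() on small
# synthetic data so the plots can be checked without a real dataset. Assumes
# numpy is imported as np, as in the function above.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(300, 10))      # 300 samples, 10 features
    Y_demo = rng.integers(0, 3, size=300)    # 3 synthetic classes
    pls_inspection(X_demo, Y_demo, n_comps=3)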
def preprocess_and_save_data(SVHN_dataset_folder_path, output_path, rm_class,
                             aug_enable=False, reshape_enable=False):
    """ Preprocess Training and Validation Data """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Load training data ======================================================
    train_data, train_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                            "train_32x32.mat")

    # Preprocess training & validation data
    if aug_enable:
        train_data_ud = preprc.vertical_flip(train_data)
        train_data_lr = preprc.horizontal_flip(train_data)
        train_data = np.concatenate((train_data, train_data_ud, train_data_lr))
        train_label = np.concatenate((train_label, train_label, train_label))
    if reshape_enable:
        train_data = preprc.reshape_image(train_data, (64, 64, 3))

    print("[Training data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] train_data shape: ", np.shape(train_data))
    print("\t[Before] train_label shape: ", np.shape(train_label))
    idx = np.squeeze(train_label != rm_class)
    train_data = train_data[idx]
    train_label = train_label[idx]

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    train_label = preprc.one_hot_encode(train_label)
    print("\t[After] train_data shape: ", np.shape(train_data))
    print("\t[After] train_label shape: ", np.shape(train_label))

    # Save training data
    pickle.dump((train_data, train_label),
                open(os.path.join(output_path,
                                  'preprocess_train_{}.p'.format(rm_class)),
                     'wb'),
                protocol=4)

    # Load Testing data =======================================================
    test_data, test_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                          "test_32x32.mat")
    if reshape_enable:
        test_data = preprc.reshape_image(test_data, (64, 64, 3))

    print("[Testing data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] test_data shape: ", np.shape(test_data))
    print("\t[Before] test_label shape: ", np.shape(test_label))
    idx = np.squeeze(test_label != rm_class)
    test_data_rm = test_data[idx]
    test_label_rm = test_label[idx]
    print("\t[After] test_data shape: ", np.shape(test_data_rm))
    print("\t[After] test_label shape: ", np.shape(test_label_rm))

    # Preprocess testing data
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])
    test_label = preprc.one_hot_encode(test_label)
    test_data_rm, _, _ = preprc.normalize(test_data_rm, mean=[], std=[])
    test_label_rm = preprc.one_hot_encode(test_label_rm)

    # Save original test data
    pickle.dump((np.array(test_data), np.array(test_label)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    # Save test data with the removed class filtered out
    pickle.dump((np.array(test_data_rm), np.array(test_label_rm)),
                open(os.path.join(output_path,
                                  'preprocess_test_{}.p'.format(rm_class)),
                     'wb'),
                protocol=4)
def preprocess_and_save_single_class_data(SVHN_dataset_folder_path, output_path,
                                          aug_enable=False, reshape_enable=False):
    """ Preprocess Training and Validation Data """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Load training data ======================================================
    train_data, train_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                            "train_32x32.mat")

    # Preprocess training & validation data
    if aug_enable:
        train_data_ud = preprc.vertical_flip(train_data)
        train_data_lr = preprc.horizontal_flip(train_data)
        train_data = np.concatenate((train_data, train_data_ud, train_data_lr))
        train_label = np.concatenate((train_label, train_label, train_label))
    if reshape_enable:
        train_data = preprc.reshape_image(train_data, (64, 64, 3))

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    train_label = preprc.one_hot_encode(train_label)

    for reserved_class in range(10):
        print("[Training data] Extracting No.{} Class...".format(reserved_class))
        curr_features = train_data[train_label[:, reserved_class] == 1]
        curr_labels = train_label[train_label[:, reserved_class] == 1]
        print("\t[Class {}] feature shape: ".format(reserved_class),
              np.shape(curr_features))
        print(np.min(curr_features))
        print(np.max(curr_features))

        # Save training data
        pickle.dump(
            (curr_features, curr_labels),
            open(os.path.join(output_path,
                              'pr_train_class_{}.p'.format(reserved_class)),
                 'wb'))

    # Load Testing data =======================================================
    test_data, test_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                          "test_32x32.mat")
    if reshape_enable:
        test_data = preprc.reshape_image(test_data, (64, 64, 3))

    # Preprocess testing data
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])
    test_label = preprc.one_hot_encode(test_label)

    # Save original test data
    pickle.dump((np.array(test_data), np.array(test_label)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    for reserved_class in range(10):
        print("[Testing data] Extracting No.{} Class...".format(reserved_class))
        curr_features = test_data[test_label[:, reserved_class] == 1]
        curr_labels = test_label[test_label[:, reserved_class] == 1]
        print("\t[After] feature shape: ", np.shape(curr_features))
        print(np.min(curr_features))
        print(np.max(curr_features))

        # Save test data
        pickle.dump(
            (np.array(curr_features), np.array(curr_labels)),
            open(os.path.join(output_path,
                              'pr_test_class_{}.p'.format(reserved_class)),
                 'wb'))
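# Usage sketch (illustrative, hypothetical paths): runs both SVHN preprocessing
# routines defined above and reloads one of the per-class pickles. "./SVHN" and
# "./svhn_out" are placeholder directories, not paths from the original code.
if __name__ == "__main__":
    preprocess_and_save_data("./SVHN", "./svhn_out", rm_class=0)
    preprocess_and_save_single_class_data("./SVHN", "./svhn_out")

    with open(os.path.join("./svhn_out", "pr_train_class_1.p"), "rb") as f:
        class1_data, class1_labels = pickle.load(f)
    print("Class 1 training subset:", class1_data.shape, class1_labels.shape)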
def preprocess_and_save_data(cifar10_dataset_folder_path, output_path, rm_class,
                             aug_enable, reshape_enable):
    """ Preprocess Training and Validation Data """
    n_batches = 5
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    features = []
    labels = []
    for batch_i in range(1, n_batches + 1):
        curr_features, curr_labels = load_cfar10_batch(
            cifar10_dataset_folder_path, batch_i)
        if len(features) == 0:
            features = curr_features
            labels = curr_labels
        else:
            features = np.concatenate((features, curr_features))
            labels = np.concatenate((labels, curr_labels))

    # Preprocess training & validation data
    if aug_enable:
        features_ud = preprc.vertical_flip(features)
        features_lr = preprc.horizontal_flip(features)
        features_rot90 = preprc.rot90(features)
        features_rot270 = preprc.rot270(features)
        features = np.concatenate((features, features_ud, features_lr,
                                   features_rot90, features_rot270))
        labels = np.concatenate((labels, labels, labels, labels, labels))
    if reshape_enable:
        features = preprc.reshape_image(features, (64, 64, 3))

    features, _, _ = preprc.normalize(features, mean=mean, std=std)
    labels = preprc.one_hot_encode(labels)

    print("[Training data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] feature shape: ", np.shape(features))
    print("\t[Before] label shape: ", np.shape(labels))
    count = 0
    remove_class = []
    for i in range(len(features)):
        if labels[i, rm_class] == 1:
            count = count + 1
            remove_class.append(i)
    print("\tCount: {}".format(count))
    features = np.delete(features, remove_class, axis=0)
    labels = np.delete(labels, remove_class, axis=0)
    print("\t[After] feature shape: ", np.shape(features))
    print("\t[After] label shape: ", np.shape(labels))

    # Save training data
    pickle.dump((features, labels),
                open(os.path.join(output_path,
                                  'preprocess_train_{}.p'.format(rm_class)),
                     'wb'),
                protocol=4)

    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')

    # Load the test data
    test_features = batch['data'].reshape(
        (len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = batch['labels']

    if reshape_enable:
        test_features = preprc.reshape_image(test_features, (64, 64, 3))

    # Preprocess testing data
    test_features, _, _ = preprc.normalize(test_features, mean=mean, std=std)
    test_labels = preprc.one_hot_encode(test_labels)

    # Save original test data
    pickle.dump((np.array(test_features), np.array(test_labels)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    print("[Testing data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] feature shape: ", np.shape(test_features))
    print("\t[Before] label shape: ", np.shape(test_labels))
    count = 0
    remove_class = []
    for i in range(len(test_features)):
        if test_labels[i, rm_class] == 1:
            count = count + 1
            remove_class.append(i)
    print("\tCount: {}".format(count))
    test_features = np.delete(test_features, remove_class, axis=0)
    test_labels = np.delete(test_labels, remove_class, axis=0)
    print("\t[After] feature shape: ", np.shape(test_features))
    print("\t[After] label shape: ", np.shape(test_labels))

    # Save test data with the removed class filtered out
    pickle.dump((np.array(test_features), np.array(test_labels)),
                open(os.path.join(output_path,
                                  'preprocess_test_{}.p'.format(rm_class)),
                     'wb'),
                protocol=4)
def preprocess_and_save_single_class_data(cifar10_dataset_folder_path,
                                          output_path, aug_enable,
                                          reshape_enable):
    """ Preprocess Training and Validation Data """
    n_batches = 5
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    features = []
    labels = []
    for batch_i in range(1, n_batches + 1):
        curr_features, curr_labels = load_cfar10_batch(
            cifar10_dataset_folder_path, batch_i)
        if len(features) == 0:
            features = curr_features
            labels = curr_labels
        else:
            features = np.concatenate((features, curr_features))
            labels = np.concatenate((labels, curr_labels))

    # Preprocess training & validation data
    if aug_enable:
        features_ud = preprc.vertical_flip(features)
        features_lr = preprc.horizontal_flip(features)
        features_rot90 = preprc.rot90(features)
        features_rot270 = preprc.rot270(features)
        features = np.concatenate((features, features_ud, features_lr,
                                   features_rot90, features_rot270))
        labels = np.concatenate((labels, labels, labels, labels, labels))
    if reshape_enable:
        features = preprc.reshape_image(features, (64, 64, 3))

    features, _, _ = preprc.normalize(features, mean=mean, std=std)
    labels = preprc.one_hot_encode(labels)

    for reserved_class in range(10):
        print("[Training data] Extracting No.{} Class...".format(reserved_class))
        curr_features = features[labels[:, reserved_class] == 1]
        curr_labels = labels[labels[:, reserved_class] == 1]
        print("\t[Class {}] feature shape: ".format(reserved_class),
              np.shape(curr_features))

        # Save training data
        pickle.dump(
            (curr_features, curr_labels),
            open(os.path.join(output_path,
                              'pr_train_class_{}.p'.format(reserved_class)),
                 'wb'))

    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')

    # Load the test data
    test_features = batch['data'].reshape(
        (len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = batch['labels']

    if reshape_enable:
        test_features = preprc.reshape_image(test_features, (64, 64, 3))

    # Preprocess testing data
    test_features, _, _ = preprc.normalize(test_features, mean=mean, std=std)
    test_labels = preprc.one_hot_encode(test_labels)

    # Save original test data
    pickle.dump((np.array(test_features), np.array(test_labels)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    for reserved_class in range(10):
        print("[Testing data] Extracting No.{} Class...".format(reserved_class))
        curr_features = test_features[test_labels[:, reserved_class] == 1]
        curr_labels = test_labels[test_labels[:, reserved_class] == 1]
        print("\t[After] feature shape: ", np.shape(curr_features))

        # Save test data
        pickle.dump(
            (np.array(curr_features), np.array(curr_labels)),
            open(os.path.join(output_path,
                              'pr_test_class_{}.p'.format(reserved_class)),
                 'wb'))
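# Usage sketch (illustrative, hypothetical paths): the CIFAR-10 folder name
# mirrors the standard "cifar-10-batches-py" extraction and "./cifar_out" is a
# placeholder. It reloads the untouched test split saved by either routine above.
if __name__ == "__main__":
    preprocess_and_save_data("cifar-10-batches-py", "./cifar_out",
                             rm_class=3, aug_enable=False, reshape_enable=False)
    preprocess_and_save_single_class_data("cifar-10-batches-py", "./cifar_out",
                                          aug_enable=False, reshape_enable=False)

    with open(os.path.join("./cifar_out", "test.p"), "rb") as f:
        test_features, test_labels = pickle.load(f)
    print("Full test set:", test_features.shape, test_labels.shape)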
def do_decision_tree(df):
    df_enc, df_enc_columns = preprocess.one_hot_encode(df)
    df_enc = pd.DataFrame(df_enc, columns=df_enc_columns)

    # ---KFold cross validation---
    k_fold_splits = 5
    kf = KFold(n_splits=k_fold_splits, shuffle=True)
    accuracies = []
    best_predictions = pd.DataFrame([], columns=get_class_column_names())
    best_test_labels = pd.DataFrame([], columns=get_class_column_names())
    dt = DecisionTreeClassifier(random_state=0, max_depth=4, min_samples_leaf=5)

    class_columns, feature_columns = dataset_utility.get_split_column_names(
        df_enc, get_class_column_names())
    features, labels = preprocess.split_features_labels(df_enc, class_columns)

    for i in range(k_fold_splits):
        result = next(kf.split(df_enc), None)
        train_features = features.iloc[result[0]]
        test_features = features.iloc[result[1]]
        train_labels = labels.iloc[result[0]]
        test_labels = labels.iloc[result[1]]

        # ---Decision Tree---
        dt.fit(train_features, train_labels)
        predictions = dt.predict(test_features)

        accuracy = metrics.accuracy_score(predictions, test_labels)
        accuracies.append(accuracy)
        if accuracy >= max(accuracies):
            best_predictions = pd.DataFrame(predictions,
                                            columns=get_class_column_names())
            best_test_labels = pd.DataFrame(test_labels,
                                            columns=get_class_column_names())

    # best_predictions = pd.DataFrame(predictions, columns=get_class_column_names())
    best_predictions = best_predictions.idxmax(axis=1)
    best_test_labels = best_test_labels.idxmax(axis=1)

    # cm_labels was not defined in this function; use the class column names so
    # the axis labels match the labels= argument passed to confusion_matrix.
    cm_labels = get_class_column_names()
    df_cm = pd.DataFrame(
        metrics.confusion_matrix(best_predictions, best_test_labels,
                                 labels=get_class_column_names()),
        index=cm_labels,
        columns=cm_labels,
    )
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True, fmt="g")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig("decision-tree-cm.png")

    print("K-fold results: ", accuracies)
    print("Mean accuracy: ", np.mean(accuracies))

    fig = plt.figure(figsize=(25, 20))
    _ = tree.plot_tree(
        dt,
        feature_names=feature_columns,
        class_names=class_columns,
        filled=True,
        rounded=True,
    )
    fig.savefig("decision_tree.png")
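# Usage sketch (illustrative): mirrors the KNN driver above and assumes the same
# local "mushrooms.csv" and dataset_utility helpers. do_decision_tree() writes
# decision-tree-cm.png and decision_tree.png as side effects.
if __name__ == "__main__":
    mushrooms = pd.read_csv("mushrooms.csv",
                            names=dataset_utility.get_column_names())
    do_decision_tree(mushrooms)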