Example No. 1
def preprocess_and_save_1st_data(dataset_folder_path, output_path, aug=True):
    """
    Preprocess Training and Validation Data
    """

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    u = util()

    # Load training data ======================================================
    true_data, true_label = u.load_1st_data(
        os.path.join(dataset_folder_path, "crop_true"), ["0", "1", "2"])
    false_data, false_label = u.load_1st_data(
        os.path.join(dataset_folder_path, "crop_false"), ["3"])
    data = np.append(true_data, false_data, axis=0)
    label = np.append(true_label, false_label, axis=0)
    pickle.dump((data, label),
                open(os.path.join(output_path, 'raw_data_label.p'), 'wb'),
                protocol=4)

    print("[Before] data: ", np.shape(data))

    # Preprocess training & validation data
    train_data = np.empty((0, 360, 201, 1), float)
    train_label = np.empty((0, 1), int)
    test_data = np.empty((0, 360, 201, 1), float)
    test_label = np.empty((0, 1), int)

    idx = list(range(data.shape[0]))
    random.shuffle(idx)
    train_sample_num = int(0.7 * len(idx))

    # Use the full first 70% of the shuffled indices; idx[1:...] would silently drop one sample.
    train_data = np.append(train_data, data[idx[:train_sample_num]], axis=0)
    train_label = np.append(train_label,
                            label[idx[:train_sample_num]],
                            axis=0)

    test_data = np.append(test_data, data[idx[train_sample_num:]], axis=0)
    test_label = np.append(test_label, label[idx[train_sample_num:]], axis=0)

    if aug:
        train_data, train_label = u._augmentation(train_data, train_label)

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])

    # One-hot encode
    train_label = preprc.one_hot_encode(train_label, dim=2)
    test_label = preprc.one_hot_encode(test_label, dim=2)

    print("[After] train_data shape: ", np.shape(train_data))
    print("[After] train_label shape: ", np.shape(train_label))
    print("[After] test_data shape: ", np.shape(test_data))
    print("[After] test_label shape: ", np.shape(test_label))

    # Save training data
    pickle.dump((train_data, train_label),
                open(os.path.join(output_path, 'preprocess_train.p'), 'wb'),
                protocol=4)
    pickle.dump((test_data, test_label),
                open(os.path.join(output_path, 'preprocess_test.p'), 'wb'),
                protocol=4)
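
A minimal invocation sketch (hypothetical paths; the loaders above expect crop_true/ and crop_false/ subfolders under the dataset root):

preprocess_and_save_1st_data("./dataset", "./preprocessed", aug=True)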
Example No. 2
def preprocess_and_save_data(dataset_folder_path,
                             output_path,
                             label_type,
                             aug=True):
    """
    Preprocess Training and Validation Data
    """

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    u = util()

    # Load training data ======================================================
    data, label = u.load_data(os.path.join(dataset_folder_path, "crop_true"),
                              label_type)
    pickle.dump((data, label),
                open(os.path.join(output_path, 'raw_data_label.p'), 'wb'),
                protocol=4)

    # Preprocess training & validation data
    train_data = np.empty((0, 360, 201, 1), float)
    train_label = np.empty((0, 1), int)
    test_data = np.empty((0, 360, 201, 1), float)
    test_label = np.empty((0, 1), int)
    for curr_label_type in label_type:
        print("[Before] {} data: ".format(curr_label_type),
              np.shape(data[curr_label_type]))

        idx = list(range(data[curr_label_type].shape[0]))
        random.shuffle(idx)
        train_sample_num = int(0.7 * len(idx))

        # Take the full first 70% of the shuffled indices as the training slice;
        # idx[1:...] would silently drop one sample.
        curr_train_data = data[curr_label_type][idx[:train_sample_num]]
        curr_train_label = label[curr_label_type][idx[:train_sample_num]]

        if aug:
            # Class "0" gets light augmentation (level=1); class "1" heavier (level=5).
            if curr_label_type == "0":
                curr_train_data, curr_train_label = u._augmentation(
                    curr_train_data, curr_train_label, level=1)
            elif curr_label_type == "1":
                curr_train_data, curr_train_label = u._augmentation(
                    curr_train_data, curr_train_label, level=5)

            print("[After] {} data: ".format(curr_label_type),
                  np.shape(curr_train_data))

        train_data = np.append(train_data, curr_train_data, axis=0)
        train_label = np.append(train_label, curr_train_label, axis=0)

        if curr_label_type == "0":
            # Class "0": hold out a fixed slice of 216 shuffled samples for testing.
            test_data = np.append(test_data,
                                  data[curr_label_type][idx[-216:]],
                                  axis=0)
            test_label = np.append(test_label,
                                   label[curr_label_type][idx[-216:]],
                                   axis=0)

        if curr_label_type == "1":
            test_data = np.append(
                test_data,
                data[curr_label_type][idx[train_sample_num:]],
                axis=0)
            test_label = np.append(
                test_label,
                label[curr_label_type][idx[train_sample_num:]],
                axis=0)

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])

    # One-hot encode
    train_label = preprc.one_hot_encode(train_label, dim=2)
    test_label = preprc.one_hot_encode(test_label, dim=2)

    print("[After] train_data shape: ", np.shape(train_data))
    print("[After] train_label shape: ", np.shape(train_label))
    print("[After] test_data shape: ", np.shape(test_data))
    print("[After] test_label shape: ", np.shape(test_label))

    # Save training data
    pickle.dump((train_data, train_label),
                open(os.path.join(output_path, 'preprocess_train.p'), 'wb'),
                protocol=4)
    pickle.dump((test_data, test_label),
                open(os.path.join(output_path, 'preprocess_test.p'), 'wb'),
                protocol=4)
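
A minimal invocation sketch (hypothetical paths; label_type lists the class labels, here the two handled by the augmentation branch):

preprocess_and_save_data("./dataset", "./preprocessed", label_type=["0", "1"], aug=True)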
Example No. 3
from preprocess.make_lower_case import *
from preprocess.eliminate_stop_words import *
from preprocess.replace_negation_words import *
from preprocess.tokenization import *
from preprocess.one_hot_encode import *
from preprocess.embed_200 import *
from preprocess.spellingcheck import *
from preprocess.extract_redundant_words import *

# Instantiate each preprocessing step; the arguments appear to be
# (weight, step name, pipeline position). Note that each call rebinds the
# imported class name to its configured instance.
make_lower_case = make_lower_case(0, "make_lower_case", 1)
eliminate_stop_words = eliminate_stop_words(-5, "eliminate_stop_words", 2)
replace_negation_words = replace_negation_words(5, "replace_negation_words", 3)
tokenization = tokenization(0, "tokenization", 4)
one_hot_encode = one_hot_encode(-100, "one_hot_encode", 5)
spellingcheck = spellingcheck(50, "spellingcheck", 6)
embed_200 = embed_200(0, "embed_200", 7)
Example No. 4
def do_knn(df):
    ###################### Data preparation ####################
    # df = read_mushroom_data()

    df2, df2_columns = preprocess.one_hot_encode(df)
    df2 = pd.DataFrame(df2, columns=df2_columns)

    # print("number of samples: ", df.shape[0])
    # print("number of attributes: ", df.shape[1])
    # print(
    #     "\nValues classified as 'Missing' for stalk-root: ",
    #     (df.iloc[:, 11] == "?").sum(),
    # )

    # print("\nNumber of samples: ", df2.shape[0])
    # print("Number of attributes: ", df2.shape[1])
    # print(
    #     "\nRemaining missing values across all attributes and samples: ",
    #     df2.isnull().sum().sum(),
    # )
    # print("\nMinimum value across all attributes and samples: ", df2.min().min())
    # print("Maximum value across all attributes and samples: ", df2.max().max())
    # print(
    #     "\nMinimum fraction of '1'-s across all attributes: {:.5f}".format(
    #         df2.mean().min()
    #     )
    # )
    # print(
    #     "Maximum fraction of '1'-s across all attributes: {:.5f}".format(
    #         df2.mean().max()
    #     )
    # )

    kf = KFold(n_splits=5, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors=100)
    accuracies = []
    best_predictions = pd.DataFrame([], columns=df2_columns)
    best_test_labels = pd.DataFrame([], columns=df2_columns)

    class_columns, feature_columns = dataset_utility.get_split_column_names(
        df2, get_class_column_names())

    # Define poisonous as 1 and edible as 0 for the target
    x = df2.iloc[:, 2:]
    y = df2.iloc[:, 1]

    # Iterate over the actual folds; calling next(kf.split(...)) inside the
    # loop would re-create the generator and only ever yield its first split.
    for train_index, test_index in kf.split(df2):
        x_train = x.iloc[train_index]
        x_test = x.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        pca = PCA(n_components=2).fit(x_train)

        # Reduce the dimensionality of the features from 113 to two principal components.
        # A PCA plot projects the correlations (or lack thereof) among all features onto a 2-D plane.
        x_train = pca.transform(x_train)
        x_test = pca.transform(x_test)

        plt.figure(dpi=120)
        plt.scatter(
            x_train[y_train.values == 0, 0],
            x_train[y_train.values == 0, 1],
            label="Edible",
            alpha=0.5,
            s=2,
        )
        plt.scatter(
            x_train[y_train.values == 1, 0],
            x_train[y_train.values == 1, 1],
            label="Poisonous",
            alpha=0.5,
            s=2,
        )
        plt.title("Mushroom Data Set\nFirst Two Principal Components")
        plt.legend(frameon=1)
        plt.xlabel("PC 1")
        plt.ylabel("PC 2")
        plt.gca().set_aspect("equal")
        plt.savefig("knn.png")

        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)

        # print("y_pred: ", y_pred)

        acc = accuracy_score(y_pred, y_test)
        # print("Accuracy:", acc)

        accuracies.append(acc)

        if acc >= max(accuracies):
            best_predictions = y_pred
            best_test_labels = y_test

    print("K-fold results: ", accuracies)
    print("Mean accuracy: ", np.mean(accuracies))
    # print("Best prediction: ", best_prediction)

    # print(metrics.confusion_matrix(best_predictions, best_test_labels))

    cm_labels = ["edible", "poisonous"]

    # confusion_matrix expects (y_true, y_pred), so rows are the actual
    # classes and columns the predictions, matching the axis labels below.
    df_cm = pd.DataFrame(
        metrics.confusion_matrix(best_test_labels,
                                 best_predictions,
                                 labels=[0.0, 1.0]),
        index=cm_labels,
        columns=cm_labels,
    )

    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True, fmt="g")

    plt.xlabel("Predicted")
    plt.ylabel("Actual")

    plt.savefig("knn-cm.png")
    return class_columns


def insert_class_columns(dataset):

    dt = dataset.copy()

    # Column positions are looked up in the module-level df_enc defined below,
    # so the inserted NaN class columns line up with the encoded frame.
    for column in get_class_column_names():
        dt.insert(df_enc.columns.get_loc(column), column, float("NaN"))

    return dt


df = pd.read_csv("mushrooms.csv", names=dataset_utility.get_column_names())

df_enc, df_enc_columns = preprocess.one_hot_encode(df)

df_enc = pd.DataFrame(df_enc, columns=df_enc_columns)

train, missing = preprocess.extract_missing(df_enc)

train = pd.DataFrame(train, columns=df_enc_columns)
missing = pd.DataFrame(missing, columns=df_enc_columns)

train = train.reset_index(drop=True)
missing = missing.reset_index(drop=True)

class_columns, feature_columns = dataset_utility.get_split_column_names(
    train, get_class_column_names())

missing = preprocess.remove_class_columns(missing, class_columns)
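
With the encoded frame prepared, a hypothetical driver call (assuming the original script runs the classifier on the raw frame loaded above):

class_columns = do_knn(df)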
Example No. 6
def pls_inspection(X: np.ndarray, Y: np.ndarray, n_comps: int):
    n_classes = len(np.unique(Y))
    Y_encoded = one_hot_encode(Y)
    model = cross_decomposition.PLSRegression(n_components=n_comps,
                                              scale=False)
    model.fit(X, Y_encoded)

    # Extract information
    scores = model.x_scores_
    loadings = model.x_loadings_
    var_scores = np.var(scores, axis=0)
    var_X = np.sum(np.var(X, axis=0))
    var_ratios = var_scores / var_X
    cum_var_ratios = np.cumsum(var_ratios)

    # Colormap
    cmap = plt.cm.jet
    cmaplist = [cmap(i) for i in range(cmap.N)]
    cmap = mpl.colors.LinearSegmentedColormap.from_list(
        'Custom map', cmaplist, cmap.N)
    bounds = np.linspace(0, n_classes, n_classes + 1)
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    # Explained variance plot
    plt.figure(num=3, figsize=(8, 6))
    plt.plot(np.pad(cum_var_ratios, (1, 0), 'constant'))
    plt.title('Explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Cumulative explained variance')
    plt.xlim((0, n_comps))
    plt.ylim((0, 1))

    # Loadings plot
    plt.figure(num=4, figsize=(8, 6))
    plt.plot(loadings[:, 0])
    plt.title('PC1 loadings')

    plt.figure(num=5, figsize=(8, 6))
    plt.plot(loadings[:, 1])
    plt.title('PC2 loadings')

    plt.figure(num=6, figsize=(8, 6))
    plt.plot(loadings[:, 2])
    plt.title('PC3 loadings')

    # 2D scores plot
    plt.figure(num=7, figsize=(8, 6))
    scat = plt.scatter(scores[:, 0],
                       scores[:, 1],
                       c=Y,
                       s=2,
                       cmap=cmap,
                       norm=norm)
    cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cb.set_label('Classes')
    plt.title('Scores plot (PLS)')
    plt.xlabel('PC1 ({:.2f}% explained variance)'.format(var_ratios[0] * 100))
    plt.ylabel('PC2 ({:.2f}% explained variance)'.format(var_ratios[1] * 100))

    # 3D scores plot
    fig = plt.figure(num=8, figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    scat = ax.scatter(scores[:, 0],
                      scores[:, 1],
                      scores[:, 2],
                      c=Y,
                      s=2,
                      cmap=cmap,
                      norm=norm)
    cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cb.set_label('Classes')
    ax.set_title('Scores plot (PLS)')
    ax.set_xlabel('PC1 ({:.2f}% explained variance)'.format(var_ratios[0] *
                                                            100))
    ax.set_ylabel('PC2 ({:.2f}% explained variance)'.format(var_ratios[1] *
                                                            100))
    ax.set_zlabel('PC3 ({:.2f}% explained variance)'.format(var_ratios[2] *
                                                            100))

    plt.show()
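
A synthetic smoke test (hypothetical data, assuming the repo-local one_hot_encode helper is importable; it only exercises the plots):

import numpy as np
rng = np.random.default_rng(0)
X = rng.normal(size=(120, 30))
Y = rng.integers(0, 3, size=120)
pls_inspection(X, Y, n_comps=5)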
Example No. 7
def preprocess_and_save_data(SVHN_dataset_folder_path,
                             output_path,
                             rm_class,
                             aug_enable=False,
                             reshape_enable=False):
    """
    Preprocess Training and Validation Data
    """

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Load training data ======================================================
    train_data, train_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                            "train_32x32.mat")

    # Preprocess training & validation data
    if aug_enable:
        train_data_ud = preprc.vertical_flip(train_data)
        train_data_lr = preprc.horizontal_flip(train_data)
        train_data = np.concatenate((train_data, train_data_ud, train_data_lr))
        train_label = np.concatenate((train_label, train_label, train_label))

    if reshape_enable:
        train_data = preprc.reshape_image(train_data, (64, 64, 3))

    print("[Training data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] train_data shape: ", np.shape(train_data))
    print("\t[Before] train_label shape: ", np.shape(train_label))

    idx = np.squeeze(train_label != rm_class)
    train_data = train_data[idx]
    train_label = train_label[idx]

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    train_label = preprc.one_hot_encode(train_label)

    print("\t[After] train_data shape: ", np.shape(train_data))
    print("\t[After] train_label shape: ", np.shape(train_label))

    # Save training data
    pickle.dump((train_data, train_label),
                open(
                    os.path.join(output_path,
                                 'preprocess_train_{}.p'.format(rm_class)),
                    'wb'),
                protocol=4)

    # Load Testing data =======================================================
    test_data, test_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                          "test_32x32.mat")

    if reshape_enable:
        test_data = preprc.reshape_image(test_data, (64, 64, 3))

    print("[Testing data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] test_data shape: ", np.shape(test_data))
    print("\t[Before] test_label shape: ", np.shape(test_label))

    idx = np.squeeze(test_label != rm_class)
    test_data_rm = test_data[idx]
    test_label_rm = test_label[idx]

    print("\t[After] test_data shape: ", np.shape(test_data_rm))
    print("\t[After] test_label shape: ", np.shape(test_label_rm))

    # Preprocess training & validation data
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])
    test_label = preprc.one_hot_encode(test_label)
    test_data_rm, _, _ = preprc.normalize(test_data_rm, mean=[], std=[])
    test_label_rm = preprc.one_hot_encode(test_label_rm)

    # Save original test data
    pickle.dump((np.array(test_data), np.array(test_label)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    # Save test data
    pickle.dump((np.array(test_data_rm), np.array(test_label_rm)),
                open(
                    os.path.join(output_path,
                                 'preprocess_test_{}.p'.format(rm_class)),
                    'wb'),
                protocol=4)
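
A minimal invocation sketch (hypothetical paths; the folder must contain train_32x32.mat and test_32x32.mat):

preprocess_and_save_data("./SVHN", "./preprocessed", rm_class=0)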
Example No. 8
def preprocess_and_save_single_class_data(SVHN_dataset_folder_path,
                                          output_path,
                                          aug_enable=False,
                                          reshape_enable=False):
    """
    Preprocess Training and Validation Data
    """

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Load training data ======================================================
    train_data, train_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                            "train_32x32.mat")

    # Preprocess training & validation data
    if aug_enable:
        train_data_ud = preprc.vertical_flip(train_data)
        train_data_lr = preprc.horizontal_flip(train_data)
        train_data = np.concatenate((train_data, train_data_ud, train_data_lr))
        train_label = np.concatenate((train_label, train_label, train_label))

    if reshape_enable:
        train_data = preprc.reshape_image(train_data, (64, 64, 3))

    train_data, _, _ = preprc.normalize(train_data, mean=[], std=[])
    train_label = preprc.one_hot_encode(train_label)

    for reserved_class in range(10):

        print(
            "[Training data] Extracting No.{} Class...".format(reserved_class))

        curr_features = train_data[train_label[:, reserved_class] == 1]
        curr_labels = train_label[train_label[:, reserved_class] == 1]

        print("\t[Class {}] feature shape: ".format(reserved_class),
              np.shape(curr_features))
        print(np.min(curr_features))
        print(np.max(curr_features))
        # Save training data
        pickle.dump(
            (curr_features, curr_labels),
            open(
                os.path.join(output_path,
                             'pr_train_class_{}.p'.format(reserved_class)),
                'wb'))

    # Load Testing data =======================================================
    test_data, test_label = load_SVHN_mat(SVHN_dataset_folder_path,
                                          "test_32x32.mat")

    if reshape_enable:
        test_data = preprc.reshape_image(test_data, (64, 64, 3))

    # Preprocess training & validation data
    test_data, _, _ = preprc.normalize(test_data, mean=[], std=[])
    test_label = preprc.one_hot_encode(test_label)

    # Save original test data
    pickle.dump((np.array(test_data), np.array(test_label)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    for reserved_class in range(10):

        print(
            "[Testing data] Extracting No.{} Class...".format(reserved_class))

        curr_features = test_data[test_label[:, reserved_class] == 1]
        curr_labels = test_label[test_label[:, reserved_class] == 1]

        print("\t[After] feature shape: ", np.shape(curr_features))
        print(np.min(curr_features))
        print(np.max(curr_features))
        # Save test data
        pickle.dump(
            (np.array(curr_features), np.array(curr_labels)),
            open(
                os.path.join(output_path,
                             'pr_test_class_{}.p'.format(reserved_class)),
                'wb'))
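
A minimal invocation sketch (hypothetical paths):

preprocess_and_save_single_class_data("./SVHN", "./preprocessed")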
Example No. 9
def preprocess_and_save_data(cifar10_dataset_folder_path, output_path,
                             rm_class, aug_enable, reshape_enable):
    """
    Preprocess Training and Validation Data
    """
    n_batches = 5

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    features = []
    labels = []
    for batch_i in range(1, n_batches + 1):
        curr_features, curr_labels = load_cfar10_batch(
            cifar10_dataset_folder_path, batch_i)

        if len(features) == 0:
            features = curr_features
            labels = curr_labels
        else:
            features = np.concatenate((features, curr_features))
            labels = np.concatenate((labels, curr_labels))

    # Preprocess training & validation data
    if aug_enable:
        features_ud = preprc.vertical_flip(features)
        features_lr = preprc.horizontal_flip(features)
        features_rot90 = preprc.rot90(features)
        features_rot270 = preprc.rot270(features)
        features = np.concatenate((features, features_ud, features_lr,
                                   features_rot90, features_rot270))
        labels = np.concatenate((labels, labels, labels, labels, labels))

    if reshape_enable:
        features = preprc.reshape_image(features, (64, 64, 3))

    features, _, _ = preprc.normalize(features, mean=mean, std=std)
    labels = preprc.one_hot_encode(labels)

    print("[Training data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] feature shape: ", np.shape(features))
    print("\t[Before] label shape: ", np.shape(labels))
    # Collect the indices of every sample in the removed class, then drop them.
    remove_class = np.where(labels[:, rm_class] == 1)[0]
    print("\tCount: {}".format(len(remove_class)))
    features = np.delete(features, remove_class, axis=0)
    labels = np.delete(labels, remove_class, axis=0)

    print("\t[After] feature shape: ", np.shape(features))
    print("\t[After] label shape: ", np.shape(labels))

    # Save training data
    pickle.dump((features, labels),
                open(
                    os.path.join(output_path,
                                 'preprocess_train_{}.p'.format(rm_class)),
                    'wb'),
                protocol=4)

    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')

    # load the test data
    test_features = batch['data'].reshape(
        (len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = batch['labels']

    if reshape_enable:
        test_features = preprc.reshape_image(test_features, (64, 64, 3))

    # Preprocess training & validation data
    test_features, _, _ = preprc.normalize(test_features, mean=mean, std=std)
    test_labels = preprc.one_hot_encode(test_labels)

    # Save original test data
    pickle.dump((np.array(test_features), np.array(test_labels)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    print("[Testing data] Removing No.{} Class...".format(rm_class))
    print("\t[Before] feature shape: ", np.shape(test_features))
    print("\t[Before] label shape: ", np.shape(test_labels))
    remove_class = np.where(test_labels[:, rm_class] == 1)[0]
    print("\tCount: {}".format(len(remove_class)))
    test_features = np.delete(test_features, remove_class, axis=0)
    test_labels = np.delete(test_labels, remove_class, axis=0)

    print("\t[After] feature shape: ", np.shape(test_features))
    print("\t[After] label shape: ", np.shape(test_labels))

    # Save test data
    pickle.dump((np.array(test_features), np.array(test_labels)),
                open(
                    os.path.join(output_path,
                                 'preprocess_test_{}.p'.format(rm_class)),
                    'wb'),
                protocol=4)
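
A minimal invocation sketch (hypothetical paths; mean and std are assumed to be defined at module level, as the normalize calls above imply):

preprocess_and_save_data("./cifar-10-batches-py", "./preprocessed", rm_class=0, aug_enable=False, reshape_enable=False)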
Example No. 10
def preprocess_and_save_single_class_data(cifar10_dataset_folder_path,
                                          output_path, aug_enable,
                                          reshape_enable):
    """
    Preprocess Training and Validation Data
    """
    n_batches = 5

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    features = []
    labels = []
    for batch_i in range(1, n_batches + 1):
        curr_features, curr_labels = load_cfar10_batch(
            cifar10_dataset_folder_path, batch_i)

        if len(features) == 0:
            features = curr_features
            labels = curr_labels
        else:
            features = np.concatenate((features, curr_features))
            labels = np.concatenate((labels, curr_labels))

    # Preprocess training & validation data
    if aug_enable:
        features_ud = preprc.vertical_flip(features)
        features_lr = preprc.horizontal_flip(features)
        features_rot90 = preprc.rot90(features)
        features_rot270 = preprc.rot270(features)
        features = np.concatenate((features, features_ud, features_lr,
                                   features_rot90, features_rot270))
        labels = np.concatenate((labels, labels, labels, labels, labels))

    if reshape_enable:
        features = preprc.reshape_image(features, (64, 64, 3))

    features, _, _ = preprc.normalize(features, mean=mean, std=std)
    labels = preprc.one_hot_encode(labels)

    for reserved_class in range(10):

        print(
            "[Training data] Extracting No.{} Class...".format(reserved_class))

        curr_features = features[labels[:, reserved_class] == 1]
        curr_labels = labels[labels[:, reserved_class] == 1]

        print("\t[Class {}] feature shape: ".format(reserved_class),
              np.shape(curr_features))

        # Save training data
        pickle.dump(
            (curr_features, curr_labels),
            open(
                os.path.join(output_path,
                             'pr_train_class_{}.p'.format(reserved_class)),
                'wb'))

    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')

    # load the test data
    test_features = batch['data'].reshape(
        (len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = batch['labels']

    if reshape_enable:
        test_features = preprc.reshape_image(test_features, (64, 64, 3))

    # Preprocess training & validation data
    test_features, _, _ = preprc.normalize(test_features, mean=mean, std=std)
    test_labels = preprc.one_hot_encode(test_labels)

    # Save original test data
    pickle.dump((np.array(test_features), np.array(test_labels)),
                open(os.path.join(output_path, 'test.p'), 'wb'))

    for reserved_class in range(10):

        print(
            "[Testing data] Extracting No.{} Class...".format(reserved_class))

        curr_features = test_features[test_labels[:, reserved_class] == 1]
        curr_labels = test_labels[test_labels[:, reserved_class] == 1]

        print("\t[After] feature shape: ", np.shape(curr_features))

        # Save test data
        pickle.dump(
            (np.array(curr_features), np.array(curr_labels)),
            open(
                os.path.join(output_path,
                             'pr_test_class_{}.p'.format(reserved_class)),
                'wb'))
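
A minimal invocation sketch (hypothetical paths):

preprocess_and_save_single_class_data("./cifar-10-batches-py", "./preprocessed", aug_enable=False, reshape_enable=False)
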
def do_decision_tree(df):
    df_enc, df_enc_columns = preprocess.one_hot_encode(df)

    df_enc = pd.DataFrame(df_enc, columns=df_enc_columns)

    # ---KFold cross validation---

    k_fold_splits = 5

    kf = KFold(n_splits=k_fold_splits, shuffle=True)

    accuracies = []

    best_predictions = pd.DataFrame([], columns=get_class_column_names())
    best_test_labels = pd.DataFrame([], columns=get_class_column_names())

    dt = DecisionTreeClassifier(random_state=0,
                                max_depth=4,
                                min_samples_leaf=5)

    class_columns, feature_columns = dataset_utility.get_split_column_names(
        df_enc, get_class_column_names())

    features, labels = preprocess.split_features_labels(df_enc, class_columns)

    # Iterate over the actual folds; calling next(kf.split(...)) inside the
    # loop would re-create the generator and only ever yield its first split.
    for train_index, test_index in kf.split(df_enc):

        train_features = features.iloc[train_index]
        test_features = features.iloc[test_index]
        train_labels = labels.iloc[train_index]
        test_labels = labels.iloc[test_index]

        # ---Decision Tree----
        dt.fit(train_features, train_labels)

        predictions = dt.predict(test_features)

        accuracy = metrics.accuracy_score(predictions, test_labels)

        accuracies.append(accuracy)

        if accuracy >= max(accuracies):
            best_predictions = pd.DataFrame(predictions,
                                            columns=get_class_column_names())
            best_test_labels = pd.DataFrame(test_labels,
                                            columns=get_class_column_names())

    # best_predictions = pd.DataFrame(predictions, columns=get_class_column_names())

    best_predictions = best_predictions.idxmax(axis=1)
    best_test_labels = best_test_labels.idxmax(axis=1)

    # Class names used to label the confusion-matrix axes; rows are the
    # actual classes and columns the predictions, matching the labels below.
    cm_labels = get_class_column_names()

    df_cm = pd.DataFrame(
        metrics.confusion_matrix(best_test_labels,
                                 best_predictions,
                                 labels=cm_labels),
        index=cm_labels,
        columns=cm_labels,
    )

    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True, fmt="g")

    plt.xlabel("Predicted")
    plt.ylabel("Actual")

    plt.savefig("decision-tree-cm.png")

    print("K-fold results: ", accuracies)
    print("Mean accuracy: ", np.mean(accuracies))

    fig = plt.figure(figsize=(25, 20))
    _ = tree.plot_tree(
        dt,
        feature_names=feature_columns,
        class_names=class_columns,
        filled=True,
        rounded=True,
    )
    fig.savefig("decision_tree.png")
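
A hypothetical call, assuming df is the mushroom frame read earlier in the script:

do_decision_tree(df)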