示例#1
0
def run_training_binary(Xs, ys, cfg):
    """Train one binary classifier, score it on every split and save the outputs.

    Args:
        Xs: Indexable holding the train, valid and test inputs at positions
            0, 1 and 2. Each split is itself indexable; its first three
            entries are reported as the angiography cube, the structure OCT
            cube and the B scan.
        ys: Indexable holding the labels of the three splits in the same
            order. The last entry is used as the test ground truth.
        cfg: Configuration object. It supplies the architecture name and the
            training hyper-parameters, and is updated in place with
            ``history``, ``vec_acc``, ``y_test_pred_prob``, ``y_test_true``,
            ``y_test_pred`` and ``test_acc_full`` before being saved.

    Raises:
        ValueError: If ``cfg.num_classes`` is not 2. The check happens after
            training, so the model and the history plots already exist.
    """
    # Report the shape of every input and label array, split by split
    X_train, y_train = Xs[0], ys[0]
    print("\nx_train Angiography cube shape: {}".format(X_train[0].shape))
    print("x_train Structure OCT cube shape: {}".format(X_train[1].shape))
    print("x_train B scan shape: {}".format(X_train[2].shape))
    print("y_train onehot shape: {}".format(y_train.shape))

    X_valid, y_valid = Xs[1], ys[1]
    print("\nx_valid Angiography cube shape: {}".format(X_valid[0].shape))
    print("x_valid Structure OCT cube shape: {}".format(X_valid[1].shape))
    print("x_valid B scan shape: {}".format(X_valid[2].shape))
    print("y_valid onehot shape: {}".format(y_valid.shape))

    X_test, y_test = Xs[2], ys[2]
    print("\nx_test Angiography cube shape: {}".format(X_test[0].shape))
    print("x_test Structure OCT cube shape: {}".format(X_test[1].shape))
    print("x_test B scan shape: {}".format(X_test[2].shape))
    print("y_test onehot shape: {}".format(y_test.shape))

    # Build the network and fit it; the whole validation set goes through
    # in a single batch
    model = get_model(cfg.str_arch, cfg)
    callbacks = get_callbacks(cfg)

    fit_result = model.fit(
        X_train,
        y_train,
        batch_size=cfg.batch_size,
        epochs=cfg.n_epoch,
        verbose=2,
        callbacks=callbacks,
        validation_data=(X_valid, y_valid),
        shuffle=False,
        validation_batch_size=X_valid[0].shape[0],
    )
    cfg.history = fit_result.history

    # Persist the trained network
    save_model(model, cfg, overwrite=True, save_format='tf')

    # Training curves
    plot_training_loss(fit_result, cfg, save=True)
    plot_training_acc(fit_result, cfg, save=True)

    # Score train, valid and test in that order; entry 1 of each score is
    # what gets reported as the accuracy
    split_scores = [
        model.evaluate(X_split, y_split, callbacks=callbacks, verbose=0)
        for X_split, y_split in ((X_train, y_train),
                                 (X_valid, y_valid),
                                 (X_test, y_test))
    ]
    train_set_score, valid_set_score, test_set_score = split_scores

    print("\nTrain set accuracy: {}".format(train_set_score[1]))
    print("Valid set accuracy: {}".format(valid_set_score[1]))
    print("Test set accuracy: {}".format(test_set_score[1]))

    cfg.vec_acc = [score[1] for score in split_scores]

    if cfg.num_classes != 2:
        raise ValueError("The number of classes should be two")

    y_true = ys[-1]
    prob_positive = model.predict(X_test)

    # Hard labels: threshold a flattened copy of the model output at 0.5
    y_pred = prob_positive.copy().reshape(-1)
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0

    # Two-column matrix [1 - p, p], i.e. the output is treated as the
    # probability of class 1
    cfg.y_test_pred_prob = np.concatenate((1 - prob_positive, prob_positive),
                                          axis=1)

    # Show the ground truth next to the prediction
    print('Test set: ground truth')

    print(y_true)

    print('Test set: prediction')

    print(y_pred)

    # Patient IDs of the misclassified test samples.
    # (0, 1): true label 0 predicted as 1; (1, 0): true label 1 predicted
    # as 0. Per the original author's notes the ids refer to
    # cfg.vec_str_labels, and for feature labels (1, 0) is the false
    # negative query: the patient has the feature but the network predicts
    # it is absent.
    for id_true, id_predicted in ((0, 1), (1, 0)):
        print(
            get_patient_id_by_label(y_true,
                                    y_pred,
                                    true_label_id=id_true,
                                    predicted_label_id=id_predicted,
                                    cfg=cfg))

    cfg.y_test_true = y_true
    cfg.y_test_pred = y_pred

    # Test accuracy recomputed from the thresholded labels
    n_correct = np.sum(y_true == y_pred)
    test_acc = n_correct / len(y_true)
    print("\nTest accuracy: {}".format(test_acc))
    cfg.test_acc_full = test_acc

    # Raw and normalised confusion matrices
    plot_raw_conf_matrix(y_true, y_pred, cfg, save=True)
    plot_norm_conf_matrix(y_true, y_pred, cfg, save=True)

    # cfg carries both the configuration and the results
    save_cfg(cfg, overwrite=True)

    # .mat export of the binary results
    save_mat(cfg, overwrite=True)
def run_repeated_training_binary(cfg_in):
    """Repeat the binary train/evaluate cycle and save the pooled test results.

    For each of ``cfg_in.n_repeats`` iterations a deep copy of ``cfg_in`` is
    used to load the preprocessed data, regenerate the labels, split the
    data, train a fresh model and predict on the test split. The per-repeat
    results are then concatenated, stored on ``cfg_in`` and written to disk
    together with the confusion matrices.

    Args:
        cfg_in: Configuration object. It is read for ``n_repeats``,
            ``vec_idx_patient``, ``d_preproc``, ``str_feature``,
            ``str_arch`` and ``d_model``. After the loop it is modified in
            place: it is passed through the split/data preparation helpers
            and receives the accuracy lists, the pooled and per-repeat
            predictions, and the output paths (``str_model``, ``f_model``,
            ``p_figure``, ``p_cfg``).

    Raises:
        ValueError: If ``cfg_in.n_repeats`` is smaller than one, if a
            per-repeat configuration does not have ``num_classes == 2``, or
            if the pooled result arrays differ in length.
    """
    # Fail before any work is done: with zero repeats there is nothing to
    # concatenate further down and numpy would raise a far less helpful
    # error.
    if cfg_in.n_repeats < 1:
        raise ValueError("cfg_in.n_repeats should be at least one")

    # create the empty lists for holding all the data
    vec_train_acc = []
    vec_valid_acc = []
    vec_test_acc = []

    vec_y_true = []
    vec_y_pred = []
    vec_y_pred_prob = []

    mat_vec_idx_absolute_test = []

    # set the random seed for all the splits; the seed is read from a
    # throw-away copy so that cfg_in itself is left untouched at this point
    temp = initialize_config_split(copy.deepcopy(cfg_in))
    np.random.seed(temp.random_seed)

    # initiate the repeated training
    for i in range(cfg_in.n_repeats):
        # every repeat works on its own copy of the configuration
        cfg = copy.deepcopy(cfg_in)
        print("\n\nIteration: {}".format(i + 1))

        # now load the preprocessed data and the label
        # NOTE: pickle can execute arbitrary code while loading; only point
        # d_preproc at files written by this project's own preprocessing.
        f_data_handle = "preproc_data_{}_{}.pkl".format(
            cfg.vec_idx_patient[0], cfg.vec_idx_patient[1])
        f_data = cfg.d_preproc / f_data_handle
        with open(str(f_data), 'rb') as handle:
            X = pickle.load(handle)

        y = generate_labels(cfg.str_feature, cfg, bool_append_csv_to_cfg=True)

        # now prepare data for training
        cfg = initialize_config_split(cfg)
        X, y = correct_data_label(X, y, cfg)
        Xs, ys = prepare_data_for_train(X, y, cfg)

        # report the shapes of the train / valid / test splits
        for idx_split, str_split in enumerate(("train", "valid", "test")):
            X_split = Xs[idx_split]
            print("\nx_{} Angiography cube shape: {}".format(
                str_split, X_split[0].shape))
            print("x_{} Structure OCT cube shape: {}".format(
                str_split, X_split[1].shape))
            print("x_{} B scan shape: {}".format(str_split,
                                                 X_split[2].shape))
            print("y_{} onehot shape: {}".format(str_split,
                                                 ys[idx_split].shape))

        # finally set the training parameters
        cfg = initialize_config_training(cfg, bool_debug=cfg.bool_debug)
        model = get_model(cfg.str_arch, cfg)
        callbacks = get_callbacks(cfg)

        # the training history is not used here, so the return value of
        # fit() is deliberately not kept
        model.fit(Xs[0],
                  ys[0],
                  batch_size=cfg.batch_size,
                  epochs=cfg.n_epoch,
                  verbose=2,
                  callbacks=callbacks,
                  validation_data=(Xs[1], ys[1]),
                  shuffle=False,
                  validation_batch_size=Xs[1][0].shape[0])

        # Now perform prediction
        train_set_score = model.evaluate(Xs[0],
                                         ys[0],
                                         callbacks=callbacks,
                                         verbose=0)
        valid_set_score = model.evaluate(Xs[1],
                                         ys[1],
                                         callbacks=callbacks,
                                         verbose=0)
        test_set_score = model.evaluate(Xs[2],
                                        ys[2],
                                        callbacks=callbacks,
                                        verbose=0)

        # entry 1 of each score is what is recorded as the accuracy
        vec_train_acc.append(train_set_score[1])
        vec_valid_acc.append(valid_set_score[1])
        vec_test_acc.append(test_set_score[1])

        if cfg.num_classes != 2:
            raise ValueError("The number of classes should be two")

        y_true = ys[-1]
        # despite the historical "logits" naming, the model output is used
        # as the probability of class 1 (thresholded at 0.5, complemented
        # with 1 - p)
        y_pred_prob_positive = model.predict(Xs[2])

        y_pred = y_pred_prob_positive.copy()
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        y_pred = y_pred.reshape(-1)

        # two-column matrix of class probabilities: [1 - p, p]
        y_pred_prob_both = np.concatenate(
            [1 - y_pred_prob_positive, y_pred_prob_positive], axis=1)

        vec_y_true.append(y_true)
        vec_y_pred.append(y_pred)
        vec_y_pred_prob.append(y_pred_prob_both)
        mat_vec_idx_absolute_test.append(cfg.vec_idx_absolute[2])

        # drop the references to this repeat's splits before the next load
        Xs = []
        ys = []

    print('\n' + '=' * 80)
    print("Average train set accuracy: {} + ".format(np.mean(vec_train_acc)),
          np.std(vec_train_acc))
    print("Average valid set accuracy: {} + ".format(np.mean(vec_valid_acc)),
          np.std(vec_valid_acc))
    print("Average test set accuracy: {} + ".format(np.mean(vec_test_acc)),
          np.std(vec_test_acc))

    # now concatenate the results together
    y_true = np.concatenate(vec_y_true, axis=0)
    y_pred = np.concatenate(vec_y_pred, axis=0)
    y_pred_prob = np.concatenate(vec_y_pred_prob, axis=0)
    vec_idx_absolute_test = np.concatenate(mat_vec_idx_absolute_test, axis=0)

    # sanity check
    if not y_true.shape[0] == y_pred.shape[0] == y_pred_prob.shape[
            0] == vec_idx_absolute_test.shape[0]:
        raise ValueError("These should be equal")

    # print the overall test set accuracy
    print("\nOverall test set accuracy: {}".format(
        np.sum(y_true == y_pred) / len(y_true)))

    # now load the preprocessed data and the label
    f_data_handle = "preproc_data_{}_{}.pkl".format(cfg_in.vec_idx_patient[0],
                                                    cfg_in.vec_idx_patient[1])
    f_data = cfg_in.d_preproc / f_data_handle
    with open(str(f_data), 'rb') as handle:
        X = pickle.load(handle)

    y = generate_labels(cfg_in.str_feature,
                        cfg_in,
                        bool_append_csv_to_cfg=True)

    # Run the preparation once more on cfg_in itself. The returned splits
    # are discarded, so the call is made only for what the helpers record on
    # cfg_in (presumably the split bookkeeping needed when saving - confirm
    # against prepare_data_for_train).
    cfg_in = initialize_config_split(cfg_in)
    X, y = correct_data_label(X, y, cfg_in)
    _, _ = prepare_data_for_train(X, y, cfg_in)
    X, y = None, None  # release the loaded data

    # append the hyperparameters to the cfg structure
    cfg_in.vec_train_acc = vec_train_acc
    cfg_in.vec_valid_acc = vec_valid_acc
    cfg_in.vec_test_acc = vec_test_acc

    cfg_in.y_test_true = y_true
    cfg_in.y_test_pred = y_pred
    cfg_in.y_test_pred_prob = y_pred_prob
    cfg_in.vec_idx_absolute_test = vec_idx_absolute_test

    cfg_in.vec_y_true = vec_y_true
    cfg_in.vec_y_pred = vec_y_pred
    cfg_in.vec_y_pred_prob = vec_y_pred_prob
    cfg_in.mat_vec_idx_absolute_test = mat_vec_idx_absolute_test

    # now get the configuration right for saving the output: a time-stamped
    # folder under d_model/<architecture>/ holds the figures and the cfg
    cfg_in.str_model = cfg_in.str_arch
    cfg_in.f_model = "{}_{}".format(cfg_in.str_arch,
                                    time.strftime("%Y%m%d_%H%M%S"))
    cfg_in.p_figure = cfg_in.d_model / cfg_in.str_model / cfg_in.f_model
    cfg_in.p_figure.mkdir(parents=True, exist_ok=True)
    cfg_in.p_cfg = cfg_in.p_figure

    # plot the confusion matrices
    plot_raw_conf_matrix(y_true,
                         y_pred,
                         cfg_in,
                         save=True,
                         f_figure=cfg_in.str_arch)
    plot_norm_conf_matrix(y_true,
                          y_pred,
                          cfg_in,
                          save=True,
                          f_figure=cfg_in.str_arch)

    # save the cfg, which contains configurations and results
    save_cfg(cfg_in, overwrite=True)

    # save the mat file, which contains the binary results
    save_mat(cfg_in, overwrite=True)
示例#3
0
def run_training_cv(vec_Xs, vec_ys, cfg):
    """Train and evaluate one model per cross-validation fold and save the results.

    For every fold a fresh model is built, fitted, saved and scored on the
    train, valid and test splits. The test predictions of all folds are
    pooled and reordered by the absolute sample indices in
    ``cfg.vec_idx_absolute``; the validation predictions are pooled in fold
    order without reordering.

    Args:
        vec_Xs: Sequence with one entry per fold. Each entry holds the
            train, valid and test inputs at positions 0, 1 and 2, and every
            split holds at least four arrays (reported as angiography cube,
            structure OCT cube, B scan and 3D B scan).
        vec_ys: Sequence with one entry per fold holding the labels of the
            three splits in the same order; the last entry of a fold is used
            as its test ground truth.
        cfg: Configuration object. It is updated in place with ``vec_acc``,
            ``y_test_true``, ``y_test_pred``, ``y_test_pred_prob``,
            ``y_valid_true``, ``y_valid_pred``, ``y_valid_pred_prob``,
            ``test_acc_full`` and ``vec_history``.

    Returns:
        The same ``cfg`` object that was passed in.

    Raises:
        ValueError: If no fold is supplied, or if the pooled test labels,
            predictions, probabilities and absolute indices differ in
            length.
    """
    n_folds = len(vec_Xs)
    if n_folds == 0:
        # without this check numpy fails later with a generic
        # "need at least one array to concatenate" error
        raise ValueError("At least one cross-validation fold is required")

    vec_history = []

    # find the aggregate results on the entire dataset
    vec_y_true = []
    vec_y_pred = []
    vec_y_pred_prob = []

    # also declare empty list for the validation set
    vec_y_valid_true = []
    vec_y_valid_pred = []
    vec_y_valid_pred_prob = []

    for idx_fold, Xs in enumerate(vec_Xs):
        print('\n\nFold: {}\n'.format(idx_fold))
        ys = vec_ys[idx_fold]

        # report the shapes of the train / valid / test splits
        for idx_split, str_split in enumerate(("train", "valid", "test")):
            X_split = Xs[idx_split]
            print("\nx_{} Angiography cube shape: {}".format(
                str_split, X_split[0].shape))
            print("x_{} Structure OCT cube shape: {}".format(
                str_split, X_split[1].shape))
            print("x_{} B scan shape: {}".format(str_split,
                                                 X_split[2].shape))
            print("x_{} 3D B scan shape: {}".format(str_split,
                                                    X_split[3].shape))
            print("y_{} onehot shape: {}".format(str_split,
                                                 ys[idx_split].shape))

        # Get and train model
        model_curr = get_model(cfg.str_arch, cfg)
        callbacks_curr = get_callbacks(cfg)

        h = model_curr.fit(Xs[0],
                           ys[0],
                           batch_size=cfg.batch_size,
                           epochs=cfg.n_epoch,
                           verbose=2,
                           callbacks=callbacks_curr,
                           validation_data=(Xs[1], ys[1]),
                           shuffle=False,
                           validation_batch_size=Xs[1][0].shape[0])
        vec_history.append(h.history)

        # save trained models
        save_model(model_curr,
                   cfg,
                   overwrite=True,
                   save_format='tf',
                   idx_cv_fold=idx_fold)

        # plotting training history
        # NOTE(review): no fold index is passed here, so whether each fold
        # gets its own figure depends on the plot helpers - confirm.
        plot_training_loss(h, cfg, save=True)
        plot_training_acc(h, cfg, save=True)

        # Now perform prediction
        train_set_score = model_curr.evaluate(Xs[0],
                                              ys[0],
                                              callbacks=callbacks_curr,
                                              verbose=0)
        valid_set_score = model_curr.evaluate(Xs[1],
                                              ys[1],
                                              callbacks=callbacks_curr,
                                              verbose=0)
        test_set_score = model_curr.evaluate(Xs[2],
                                             ys[2],
                                             callbacks=callbacks_curr,
                                             verbose=0)

        print("\nTrain set accuracy: {}".format(train_set_score[1]))
        print("Valid set accuracy: {}".format(valid_set_score[1]))
        print("Test set accuracy: {}".format(test_set_score[1]))

        # overwritten on every fold: after the loop this holds the
        # accuracies of the last fold only
        cfg.vec_acc = [
            train_set_score[1], valid_set_score[1], test_set_score[1]
        ]

        # Run inference exactly once per split (test first, then valid) so
        # that the hard labels and the stored probabilities always come
        # from the same forward pass.
        y_test_output = model_curr.predict(Xs[2])
        y_valid_output = model_curr.predict(Xs[1])

        if cfg.num_classes == 2:
            # labels are used as they are; the single model output is
            # thresholded and expanded to two class probabilities
            y_true = ys[-1]
            y_pred, y_pred_prob = _binary_labels_and_probs(y_test_output)

            y_valid_true = ys[1]
            y_valid_pred, y_valid_pred_prob = _binary_labels_and_probs(
                y_valid_output)

        else:
            # labels and model outputs have one column per class; the
            # class id is the arg-max along axis 1
            y_true = np.argmax(ys[-1], axis=1)
            y_pred = np.argmax(y_test_output, axis=1)
            y_pred_prob = y_test_output

            y_valid_true = np.argmax(ys[1], axis=1)
            y_valid_pred = np.argmax(y_valid_output, axis=1)
            y_valid_pred_prob = y_valid_output

        # plot the confusion matrices
        plot_raw_conf_matrix(y_true, y_pred, cfg, save=True)
        plot_norm_conf_matrix(y_true, y_pred, cfg, save=True)

        # now append the results to a list
        vec_y_true.append(y_true)
        vec_y_pred.append(y_pred)
        vec_y_pred_prob.append(y_pred_prob)

        # append the results from the validation set also
        vec_y_valid_true.append(y_valid_true)
        vec_y_valid_pred.append(y_valid_pred)
        vec_y_valid_pred_prob.append(y_valid_pred_prob)

    # Now we are outside of the loop
    y_true_unsorted_all = np.concatenate(vec_y_true, axis=-1)
    y_pred_unsorted_all = np.concatenate(vec_y_pred, axis=-1)
    y_pred_prob_unsorted_all = np.concatenate(vec_y_pred_prob, axis=0)

    # Now obtain the correct indices: the last entry of each fold's
    # cfg.vec_idx_absolute holds the absolute indices of its test samples
    vec_idx_absolute_test_all = np.concatenate(
        [cfg.vec_idx_absolute[idx_fold][-1] for idx_fold in range(n_folds)],
        -1)

    # The arg-sort below is only a valid permutation of the pooled results
    # when every array has one entry per test sample; a mismatch would
    # otherwise silently drop samples or fail with an index error.
    if not y_true_unsorted_all.shape[0] == y_pred_unsorted_all.shape[
            0] == y_pred_prob_unsorted_all.shape[
                0] == vec_idx_absolute_test_all.shape[0]:
        raise ValueError(
            "Pooled test labels, predictions, probabilities and absolute "
            "indices should all have the same length")

    # Now get all the test set data, ordered by absolute sample index
    idx_permutation_sort = np.argsort(vec_idx_absolute_test_all)

    y_true_all = y_true_unsorted_all[idx_permutation_sort]
    y_pred_all = y_pred_unsorted_all[idx_permutation_sort]
    y_pred_prob_all = y_pred_prob_unsorted_all[idx_permutation_sort, ...]

    cfg.y_test_true = y_true_all
    cfg.y_test_pred = y_pred_all
    cfg.y_test_pred_prob = y_pred_prob_all

    # also generate the data for the validation set predictions; these are
    # kept in fold order and are not reordered
    y_valid_true_unsorted_all = np.concatenate(vec_y_valid_true, axis=-1)
    y_valid_pred_unsorted_all = np.concatenate(vec_y_valid_pred, axis=-1)
    y_valid_pred_prob_unsorted_all = np.concatenate(vec_y_valid_pred_prob,
                                                    axis=0)

    cfg.y_valid_true = y_valid_true_unsorted_all
    cfg.y_valid_pred = y_valid_pred_unsorted_all
    cfg.y_valid_pred_prob = y_valid_pred_prob_unsorted_all

    test_acc_full = np.sum(y_true_all == y_pred_all) / len(y_true_all)
    print("\nOverall accuracy: {}".format(test_acc_full))
    cfg.test_acc_full = test_acc_full

    # Print out the patient IDs corresponding to the query.
    # Per the original author's notes, true_label_id and predicted_label_id
    # refer to cfg.vec_str_labels. For the 'disease' label, for example,
    #   get_patient_id_by_label(y_true_all, y_pred_all, true_label_id=0,
    #                           predicted_label_id=2, cfg=cfg)
    # would list the normal/healthy patients falsely classified as NV AMD,
    # and several such queries can be printed one after another.
    # For feature labels the classes are recast to two, so the false
    # negatives (patient has the feature but the network predicts it is not
    # present) are obtained with true_label_id=1, predicted_label_id=0:
    print(
        get_patient_id_by_label(y_true_all,
                                y_pred_all,
                                true_label_id=1,
                                predicted_label_id=0,
                                cfg=cfg))

    # Plot and save the final result
    plot_raw_conf_matrix(y_true_all, y_pred_all, cfg, save=True, cv_all=True)
    plot_norm_conf_matrix(y_true_all, y_pred_all, cfg, save=True, cv_all=True)

    # append final training history
    cfg.vec_history = vec_history

    # save the output as a csv file also
    save_csv(y_true_all, y_pred_all, cfg)

    # save the cfg, which contains configurations and results
    save_cfg(cfg, overwrite=True)

    # save the mat file, which contains all useful output information
    save_mat(cfg, overwrite=True, bool_save_valid=True)

    return cfg


def _binary_labels_and_probs(prob_positive):
    """Turn the single-output predictions of a binary model into labels and class probabilities.

    Args:
        prob_positive: 2-D array of model outputs (the concatenation below
            runs along axis 1). The values are treated as the probability
            of class 1.

    Returns:
        A tuple ``(labels, probs)``: ``labels`` is a flattened copy of the
        input with values >= 0.5 set to 1 and values < 0.5 set to 0;
        ``probs`` is ``[1 - prob_positive, prob_positive]`` joined along
        axis 1.
    """
    # work on a copy so the raw model output stays available for probs
    labels = prob_positive.copy()
    labels[labels >= 0.5] = 1
    labels[labels < 0.5] = 0
    labels = labels.reshape(-1)

    probs = np.concatenate([1 - prob_positive, prob_positive], axis=1)
    return labels, probs