Пример #1
0
def MLP(M, labels, seed, split=0.8, hidden_layer_size_nb=50):
    """
    linear ridge algorithm for input M and output labels
    Inputs:
        M : matrix m*n where each row is a different example and the columns are composed of the features
        labels : vector m*1 where each row is the correponding class of the row of M
        seed : random seed to do the split between test/validation/training
        split: number between 0 and 1. Split between training and testing set. Default : 0.8
    Ouputs:
        roc_auc_rf_train: AUC score on the train set
        roc_auc_rf_val: AUC score on the validation set
        roc_auc_rf: AUC score on the test set
    """
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M_str)
    M_train_val, M_val, M_test, labels_train_val, labels_val, labels_test = preprocessing_dataset.split_train_val_test(
        M_float, seed, labels, nb_val=3, split=0.8)

    X_train = M_train_val
    Y_train = labels_train_val
    X_test = M_test
    Y_train = np.reshape(Y_train, (Y_train.shape[0], ))
    # Create our imputer to replace missing values with the mean e.g.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    # Impute each test item, then predict
    X_test_imp = imp.transform(X_test)
    X_val_imp = imp.transform(M_val)

    clf = MLPClassifier(solver='adam',
                        alpha=1e-5,
                        hidden_layer_sizes=(hidden_layer_size_nb),
                        random_state=1)
    clf = clf.fit(X_train_imp, Y_train)

    y_pred_proba_train = clf.predict_proba(X_train_imp)[:, 1]
    y_pred_proba_train = np.reshape(y_pred_proba_train,
                                    (y_pred_proba_train.shape[0], 1))

    fpr_svm, tpr_svm, thresholds_train = roc_curve(Y_train, y_pred_proba_train)
    roc_auc_train = auc(fpr_svm, tpr_svm)

    y_pred_proba_val = clf.predict_proba(X_val_imp)[:, 1]
    y_pred_proba_val = np.reshape(y_pred_proba_val,
                                  (y_pred_proba_val.shape[0], 1))

    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_val,
                                                   y_pred_proba_val)
    roc_auc_val = auc(fpr_svm, tpr_svm)
    y_pred_proba = clf.predict_proba(X_test_imp)[:, 1]
    y_pred_proba = np.reshape(y_pred_proba, (y_pred_proba.shape[0], 1))

    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_test, y_pred_proba)
    roc_auc_test = auc(fpr_svm, tpr_svm)
    print('MLP: train set: %0.5f, validation: %0.5f, test set: %0.5f' %
          (roc_auc_train, roc_auc_val, roc_auc_test))
    return roc_auc_train, roc_auc_val, roc_auc_test
Пример #2
0
def svm(M, labels, seed, split=0.8):
    """
    linear SVM algorithm for input M and output labels
    Inputs:
        M : matrix m*n where each row is a different example and the columns are composed of the features
        labels : vector m*1 where each row is the correponding class of the row of M
        seed : random seed to do the split between test/validation/training
        split: number between 0 and 1. Split between training and testing set. Default : 0.8
    Ouputs:
        roc_auc_svm_train: AUC score on the train set
        roc_auc_svm_val: AUC score on the validation set
        roc_auc_svm: AUC score on the test set
    """
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M_str)
    M_train_val, M_val, M_test, labels_train_val, labels_val, labels_test = preprocessing_dataset.split_train_val_test(
        M_float, seed, labels, nb_val=3, split=0.8)
    X_train = M_train_val
    Y_train = labels_train_val
    X_test = M_test
    Y_train = np.reshape(Y_train, (Y_train.shape[0], ))
    # Create our imputer to replace missing values with the mean e.g.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    clf = SVC(kernel='linear', probability=True)
    clf = clf.fit(X_train_imp, Y_train)

    # Impute each test item, then predict
    X_test_imp = imp.transform(X_test)
    X_val_imp = imp.transform(M_val)

    y_pred = clf.predict_proba(X_train_imp)[:, 1]
    y_pred = np.reshape(y_pred, (y_pred.shape[0], 1))
    fpr_svm, tpr_svm, thresholds_train = roc_curve(Y_train, y_pred)
    roc_auc_svm_train = auc(fpr_svm, tpr_svm)

    y_pred_val = clf.predict_proba(X_val_imp)[:, 1]
    y_pred_val = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_val, y_pred_val)
    roc_auc_svm_val = auc(fpr_svm, tpr_svm)

    y_pred_proba = clf.predict_proba(X_test_imp)[:, 1]
    y_pred_proba = np.reshape(y_pred_proba, (y_pred_proba.shape[0], 1))

    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_test, y_pred_proba)
    roc_auc_svm = auc(fpr_svm, tpr_svm)
    print('linear SVM: train set: %0.5f, val set: %0.5f, test set: %0.5f' %
          (roc_auc_svm_train, roc_auc_svm_val, roc_auc_svm))
    return roc_auc_svm_train, roc_auc_svm_val, roc_auc_svm
Пример #3
0
def linear_ridge(M, labels, seed, split=0.8):
    """
    linear ridge algorithm for input M and output labels
    Inputs:
        M : matrix m*n where each row is a different example and the columns are composed of the features
        labels : vector m*1 where each row is the correponding class of the row of M
        seed : random seed to do the split between test/validation/training
        split: number between 0 and 1. Split between training and testing set. Default : 0.8
    Ouputs:
        roc_auc_train: AUC score on the train set
        roc_auc_val: AUC score on the validation set
        roc_auc_test: AUC score on the test set
    """
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M_str)
    M_train_val, M_val, M_test, labels_train_val, labels_val, labels_test = preprocessing_dataset.split_train_val_test(
        M_float, seed, labels, nb_val=3, split=0.8)
    X_train = M_train_val
    Y_train = labels_train_val
    X_test = M_test
    Y_train = np.reshape(Y_train, (Y_train.shape[0], ))
    # Create our imputer to replace missing values with the mean e.g.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    clf = RidgeClassifier()
    clf = clf.fit(X_train_imp, Y_train)

    # Impute each test item, then predict
    X_test_imp = imp.transform(X_test)
    X_val_imp = imp.transform(M_val)

    # Compute the accuracy
    lin_acc = clf.score(X_test_imp, labels_test)
    # Compute the AUC
    pred_train = clf.decision_function(X_train_imp)
    pred = clf.decision_function(X_test_imp)
    pred_val = clf.decision_function(X_val_imp)

    fpr_svm, tpr_svm, thresholds_train = roc_curve(Y_train, pred_train)
    roc_auc_train = auc(fpr_svm, tpr_svm)
    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_val, pred_val)
    roc_auc_val = auc(fpr_svm, tpr_svm)
    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_test, pred)
    roc_auc_test = auc(fpr_svm, tpr_svm)
    print(
        'linear ridge: train set: %0.5f, validation: %0.5f, test set: %0.5f' %
        (roc_auc_train, roc_auc_val, roc_auc_test))
    return roc_auc_train, roc_auc_val, roc_auc_test
Пример #4
0
mask_age, _, _ = read_tadpole.load_csv_no_header(path_mask_X1)
mask_sex, _, _ = read_tadpole.load_csv_no_header(path_mask_X2)
mask_agesex, _, _ = read_tadpole.load_csv_no_header(path_mask_X3)
mask_nosignificance, _, _ = read_tadpole.load_csv_no_header(path_mask_X4)

A1, _, _ = read_tadpole.load_csv_no_header(path_dataset_affinity_matrix_age)
A2, _, _ = read_tadpole.load_csv_no_header(path_dataset_affinity_matrix_sex)
A3, _, _ = read_tadpole.load_csv_no_header(path_dataset_affinity_matrix_agesex)
A_age = preprocessing_dataset.str_to_float(A1)
A_sex = preprocessing_dataset.str_to_float(A2)
A_sexage = preprocessing_dataset.str_to_float(A3)
labels, _, _ = read_tadpole.load_csv_no_header(labels_path)

labels = preprocessing_dataset.str_to_float(labels)

M_float = preprocessing_dataset.preprocessing_nan_normalization(M_str)

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(M_float)

# Impute our data, then train
M_float_imp = imp.transform(M_float)

M = M_float_imp

A_age = preprocessing_dataset.normalize_adj(A_age)
A_sex = preprocessing_dataset.normalize_adj(A_sex)
A_sexage = preprocessing_dataset.normalize_adj(A_sexage)

mask_age = preprocessing_dataset.str_to_float(mask_age)
mask_sex = preprocessing_dataset.str_to_float(mask_sex)