def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data, params, subject_IDs):
    """Run one cross-validation fold: ridge baseline followed by GCN training.

    train_ind   : indices of the training samples
    test_ind    : indices of the test samples
    val_ind     : indices of the validation samples
    graph_feat  : population graph computed from phenotypic measures
                  (num_subjects x num_subjects)
    features    : feature vectors (num_subjects x num_features)
    y           : ground truth labels (num_subjects x 1)
    y_data      : ground truth labels, different representation
                  (num_subjects x 2)
    params      : dictionary of GCN parameters; 'num_training' and
                  'num_features' are read here, the whole dict is forwarded
                  to Train.run_training
    subject_IDs : list of subject IDs

    returns:

    test_acc  : number of correctly classified test samples using GCNs
    test_auc  : area under curve over the test samples using GCNs
    lin_acc   : number of correctly classified test samples using the
                linear classifier
    lin_auc   : area under curve over the test samples using the linear
                classifier
    fold_size : number of test samples
    """
    print(len(train_ind))

    n_test = len(test_ind)

    # Optionally restrict the labelled training set, then reduce the feature
    # dimensionality using only those labelled samples.
    labeled_ind = Reader.site_percentage(train_ind, params['num_training'], subject_IDs)
    x_data = Reader.feature_selection(features, y, labeled_ind, params['num_features'])

    # Square, symmetric matrix of pairwise correlation distances.
    pairwise = distance.squareform(distance.pdist(x_data, metric='correlation'))

    # Gaussian kernel on the distances, with the mean distance as bandwidth,
    # then weight the phenotypic graph by this feature affinity.
    sigma = np.mean(pairwise)
    sparse_graph = np.exp(-pairwise**2 / (2 * sigma**2))
    final_graph = graph_feat * sparse_graph

    # Linear (ridge) baseline.
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())

    x_test = x_data[test_ind, :]
    lin_acc = clf.score(x_test, y[test_ind].ravel())
    pred = clf.decision_function(x_test)
    # Labels are shifted down by one before scoring
    # (presumably they are coded 1/2 -- confirm against the loader).
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs.
    gcn_features = sparse.coo_matrix(x_data).tolil()
    test_acc, test_auc = Train.run_training(
        final_graph, gcn_features, y_data, train_ind, val_ind, test_ind, params)

    print(test_acc)

    # Report counts of correctly classified samples rather than fractions.
    gcn_correct = int(round(test_acc * n_test))
    lin_correct = int(round(lin_acc * n_test))

    return gcn_correct, test_auc, lin_correct, lin_auc, n_test
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data, params, subject_IDs, pathToSave, i, subject_labels, idx):
    """Run one fold (ridge baseline + GCN) and persist the fold's results.

    train_ind      : indices of the training samples
    test_ind       : indices of the test samples
    val_ind        : indices of the validation samples
    graph_feat     : population graph computed from phenotypic measures
                     (num_subjects x num_subjects); only its first
                     dimension is read here (stored as 'folds')
    features       : feature vectors (num_subjects x num_features)
    y              : ground truth labels (num_subjects x 1)
    y_data         : ground truth labels, different representation
                     (NOTE(review): described elsewhere as num_subjects x 2,
                     but the baseline below binarizes against three
                     classes -- confirm the expected width)
    params         : dictionary of GCN parameters, forwarded to
                     Train.run_training
    subject_IDs    : list of subject IDs (unused in this function)
    pathToSave     : string prefix of the output location; 'excel/' is
                     appended by plain concatenation, so it is expected to
                     end with a path separator
    i              : value forwarded unchanged to Train.run_training
                     (presumably the fold/run index -- confirm)
    subject_labels : unused in this function
    idx            : forwarded to Reader.get_affinity

    returns:

    test_acc  : number of correctly classified test samples using GCNs
    test_auc  : area under curve over the test samples using GCNs
    lin_acc   : number of correctly classified test samples using the
                linear classifier
    lin_auc   : area under curve over the test samples using the linear
                classifier
    fold_size : number of test samples wrapped as [[n]] (shape kept for
                existing callers)
    n_test    : number of test samples as a plain int

    Side effects: resets the default TensorFlow graph and flag parser,
    writes "x_data.csv" to the working directory, writes a .mat and an
    .xlsx file under pathToSave + 'excel/', and appends the fold's
    DataFrame to the module-level `prediction` list. Relies on the
    module-level names `FLAGS`, `trial` and `prediction`.
    """
    print(len(train_ind))

    # Start from a clean TF graph and flag parser so repeated folds do not
    # collide on already-defined flags.
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    num_nodes = np.size(graph_feat, 0)
    n_test = len(test_ind)

    # Round once, vectorised. The previous element-wise loops reused the
    # name `i` as loop index, which clobbered the `i` argument before it
    # was handed to Train.run_training.
    x_data = np.round(features.astype(float), 4)

    # The CSV dump intentionally contains the values *before* NaNs are
    # zeroed, as it always did.
    np.savetxt("x_data.csv", x_data, delimiter=',')
    x_data[np.isnan(x_data)] = 0

    # Pairwise correlation distances -> Gaussian affinity with the mean
    # distance as bandwidth.
    dist = distance.squareform(distance.pdist(x_data, metric='correlation'))
    sigma = np.mean(dist)
    sparse_graph = np.exp(-dist ** 2 / (2 * sigma ** 2))

    graph = Reader.get_affinity(sparse_graph, idx)

    # Linear (ridge) baseline.
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())

    x_test = x_data[test_ind, :]
    lin_acc = clf.score(x_test, y[test_ind].ravel())
    pred = clf.decision_function(x_test)
    y_one_hot = label_binarize(y[test_ind], classes=np.arange(3))
    lin_auc = sklearn.metrics.roc_auc_score(y_one_hot, pred)

    # Classification with GCNs.
    test_acc, test_auc, weights, confusion = Train.run_training(
        graph, sparse.coo_matrix(x_data).tolil(), y_data,
        train_ind, val_ind, test_ind, params, pathToSave, i)

    scores_lin_ = np.sum(lin_acc)
    scores_auc_lin_ = np.mean(lin_auc)
    # round() rather than truncation: accuracy * n can land a hair below
    # the true integer count because of floating-point error.
    scores_acc_ = int(round(np.sum(test_acc) * n_test))
    scores_auc_ = np.mean(test_auc)

    pathToSave2 = pathToSave + 'excel/'
    if not os.path.exists(pathToSave2):
        os.makedirs(pathToSave2)
    result_name = 'ABIDE_classification.mat'

    mat_payload = {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                   'acc': scores_acc_, 'auc': scores_auc_,
                   'folds': num_nodes}
    frame_columns = {'scores_acc': [scores_acc_],
                     'scores_auc': [scores_auc_],
                     'scores_lin': [scores_lin_],
                     'scores_auc_lin': [scores_auc_lin_]}

    if FLAGS.model == 'gcn_cheby':
        # The Chebyshev model additionally exports its first three weights.
        # NOTE(review): the weights go into the DataFrame unwrapped, so
        # pandas needs them to be length-compatible with the one-row
        # columns -- confirm what Train.run_training returns.
        for k in range(3):
            key = 'weights_' + str(k)
            mat_payload[key] = weights[k]
            frame_columns[key] = weights[k]
    frame_columns['confusion_matrix'] = [confusion]

    # `trial` is a module-level name, not a parameter of this function.
    sio.savemat(pathToSave2 + str(trial) + result_name, mat_payload)
    df = pd.DataFrame(frame_columns)
    prediction.append(df)

    # One workbook per fold, named after the first test index. The context
    # manager closes the writer; ExcelWriter.save() no longer exists in
    # current pandas.
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx',
                        engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # Return counts of correctly classified samples instead of fractions.
    lin_acc = int(round(lin_acc * n_test))
    test_acc = int(round(test_acc * n_test))

    # fold_size has always been returned doubly wrapped; keep that shape so
    # existing callers keep working.
    fold_size = [[n_test]]

    return test_acc, test_auc, lin_acc, lin_auc, fold_size, n_test
def train_fold(train_ind, test_ind, val_ind, graph_feat, graph_feat2, features, y, y_data, idx, lr, params, subject_IDs, pathToSave, i):
    """Run one fold with two population graphs and persist the results.

    train_ind   : indices of the training samples
    test_ind    : indices of the test samples
    val_ind     : indices of the validation samples
    graph_feat  : first population graph (num_subjects x num_subjects);
                  annotated "Gender" where it is combined below
    graph_feat2 : second population graph; annotated "Age" where it is
                  combined below
    features    : feature vectors (num_subjects x num_features)
    y           : ground truth labels (num_subjects x 1)
    y_data      : ground truth labels, different representation
                  (num_subjects x 2)
    idx         : forwarded to Train.run_training
    lr          : forwarded to Train.run_training (presumably the learning
                  rate -- confirm)
    params      : dictionary of GCN parameters; 'num_features' is read here
    subject_IDs : list of subject IDs (unused in this function)
    pathToSave  : string prefix of the output location; 'excel/' is
                  appended by plain concatenation
    i           : forwarded to Train.run_training (presumably the fold/run
                  index -- confirm)

    returns:

    test_acc  : number of correctly classified test samples using GCNs
    test_auc  : area under curve over the test samples using GCNs
    lin_acc   : number of correctly classified test samples using the
                linear classifier
    lin_auc   : area under curve over the test samples using the linear
                classifier
    fold_size : number of test samples wrapped as [[n]] (shape kept for
                existing callers)

    Side effects: resets the default TensorFlow graph and flag parser,
    writes a .mat and an .xlsx file under pathToSave + 'excel/', and
    appends the fold's DataFrame to the module-level `prediction` list.
    Relies on the module-level names `trial` and `prediction`.
    """
    # Start from a clean TF graph and flag parser so repeated folds do not
    # collide on already-defined flags.
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    print(len(train_ind))

    n_test = len(test_ind)

    # Use the full training set as the labelled subset.
    # NOTE(review): this call goes through lowercase `reader` with two
    # arguments while everything else uses `Reader` -- confirm that a
    # `reader` name with this signature really exists in the module.
    labeled_ind = reader.site_percentage(train_ind, 1.0)

    # Feature selection / dimensionality reduction.
    x_data = Reader.feature_selection(features, y, labeled_ind, params['num_features'])

    # Pairwise correlation distances -> Gaussian affinity with the mean
    # distance as bandwidth.
    dist = distance.squareform(distance.pdist(x_data, metric='correlation'))
    sigma = np.mean(dist)
    sparse_graph = np.exp(-dist ** 2 / (2 * sigma ** 2))

    # NOTE(review): hard-coded value, only stored under 'folds' in the .mat
    # output -- confirm it matches the dataset in use.
    num_nodes = 662

    final_graph = graph_feat * sparse_graph    # Gender
    final_graph2 = graph_feat2 * sparse_graph  # Age

    # Linear (ridge) baseline.
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())

    x_test = x_data[test_ind, :]
    lin_acc = clf.score(x_test, y[test_ind].ravel())
    pred = clf.decision_function(x_test)
    # Labels are shifted down by one before scoring
    # (presumably they are coded 1/2 -- confirm against the loader).
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs.
    test_acc, test_auc, weights = Train.run_training(
        final_graph, final_graph2, sparse.coo_matrix(x_data).tolil(), y_data,
        train_ind, val_ind, test_ind, idx, lr, params, pathToSave, i)

    weights_0 = weights[0]
    weights_1 = weights[1]

    # The persisted scores are the raw fractions, not sample counts.
    scores_lin_ = np.sum(lin_acc)
    scores_auc_lin_ = np.mean(lin_auc)
    scores_acc_ = np.sum(test_acc)
    scores_auc_ = np.mean(test_auc)

    pathToSave2 = pathToSave + 'excel/'
    if not os.path.exists(pathToSave2):
        os.makedirs(pathToSave2)
    result_name = 'ABIDE_classification.mat'

    # `trial` is a module-level name, not a parameter of this function.
    sio.savemat(pathToSave2 + str(trial) + result_name,
                {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                 'acc': scores_acc_, 'auc': scores_auc_,
                 'folds': num_nodes,
                 'weights_0': weights_0, 'weights_1': weights_1})

    # NOTE(review): the weights go into the DataFrame unwrapped, so pandas
    # needs them to be length-compatible with the one-row columns.
    df = pd.DataFrame({'scores_acc': [scores_acc_],
                       'scores_auc': [scores_auc_],
                       'scores_lin': [scores_lin_],
                       'scores_auc_lin': [scores_auc_lin_],
                       'weights_0': weights_0,
                       'weights_1': weights_1})
    prediction.append(df)

    # One workbook per fold, named after the first test index. The context
    # manager closes the writer; ExcelWriter.save() no longer exists in
    # current pandas.
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx',
                        engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # Return counts of correctly classified samples instead of fractions.
    # The conversion must happen exactly once: previously test_acc was
    # scaled by the fold size a second time just before the return.
    test_acc = int(round(test_acc * n_test))
    lin_acc = int(round(lin_acc * n_test))

    # fold_size has always been returned doubly wrapped; keep that shape so
    # existing callers keep working.
    fold_size = [[n_test]]

    return test_acc, test_auc, lin_acc, lin_auc, fold_size
def train_fold(cv, train_ind, test_ind, val_ind, graph_feat, features, y, y_data, params, subject_IDs, cur_time):
    """Run one cross-validation fold with lasso feature selection and a GCN.

    cv          : fold identifier; used in the log line and forwarded to
                  the feature selector and to Train.run_training
    train_ind   : indices of the training samples
    test_ind    : indices of the test samples
    val_ind     : indices of the validation samples
    graph_feat  : population graph computed from phenotypic measures
                  (num_subjects x num_subjects)
    features    : feature vectors (num_subjects x num_features)
    y           : ground truth labels (num_subjects x 1)
    y_data      : ground truth labels, different representation
                  (num_subjects x 2)
    params      : dictionary of GCN parameters, forwarded to
                  Train.run_training
    subject_IDs : list of subject IDs (unused in this function)
    cur_time    : forwarded to Train.run_training

    returns the six values produced by Train.run_training, in order:

    test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab
    """
    # Feature selection / dimensionality reduction, fitted on the training
    # indices only. Alternatives that were tried here and left disabled:
    # raw features, Reader.feature_selection, ttest, ElasticNet and the
    # bagging-based ttest / ElasticNet / lasso selectors.
    x_data = Reader.lasso_feature_selection(features, y, train_ind, cv)

    print('fold: ' + str(cv) + ', shape: ', np.shape(x_data))

    # Square, symmetric matrix of pairwise correlation distances.
    condensed = distance.pdist(x_data, metric='correlation')
    pairwise = distance.squareform(condensed)

    # Gaussian kernel on the distances (bandwidth = mean distance), then
    # weight the phenotypic graph by this feature affinity.
    bandwidth = np.mean(pairwise)
    affinity = np.exp(-pairwise**2 / (2 * bandwidth**2))
    final_graph = graph_feat * affinity

    # Classification by GCNs. BrainNetCNN, MLP and SVM classifiers were
    # tried at this point as well and are currently disabled.
    gcn_features = sparse.coo_matrix(x_data).tolil()
    outcome = Train.run_training(
        cv, final_graph, gcn_features, y_data,
        train_ind, val_ind, test_ind, params, cur_time)
    test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab = outcome

    return test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab