from sklearn import svm, cross_validation, metrics
from pyriemann.classification import TSclassifier, MDM

# set directories
data_dir = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/'
kernel_dir = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/kernels/'
timecourse_dir = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC_Unsmooth_TimeCourse/'

# flat (raveled) indices of the strictly-lower triangle of a 90 x 90 matrix,
# used to pull the unique edges out of a raveled symmetric connectivity matrix
lotril_ind = np.ravel_multi_index(np.tril_indices(90, k=-1), (90, 90))

# sparse inverse covariance matrices generated from raw timecourses:
# one row per subject, each row a raveled 90 x 90 matrix (8100 values)
timecourse_connectivity_data = np.genfromtxt(timecourse_dir + 'sparse_inverse_covariance_data.csv', delimiter=',')
timecourse_files = pd.read_csv(timecourse_dir + 'sparse_inverse_covariance_files.csv').T.index.values

# subject IDs are encoded in the file names: second '_'-separated token of the
# basename, with the 4-character extension stripped
timecourse_IDs = [int(f.split('/')[-1].split('_')[1][0:-4]) for f in timecourse_files]

labels = np.array([utils.load_labels(data_dir), ])[0]
connectivity_files = glob.glob(data_dir + '*.txt')
# subject ID = first three characters of the connectivity file basename
connectivity_IDs = sorted(int(f.split('/')[-1][0:3]) for f in connectivity_files)

# boolean masks selecting the subjects present in BOTH data sets; build each
# set once so membership tests are O(1) instead of an O(n) list scan per ID
timecourse_ID_set = set(timecourse_IDs)
connectivity_ID_set = set(connectivity_IDs)
connectivity_in_timecourse = np.array([ID in timecourse_ID_set for ID in connectivity_IDs])
timecourse_in_connectivity = np.array([ID in connectivity_ID_set for ID in timecourse_IDs])

# keep only the labels/rows for the common subjects
labels = labels[connectivity_in_timecourse]
timecourse_connectivity_data = timecourse_connectivity_data[timecourse_in_connectivity, :]

# take the matrix log of each subject's 90 x 90 matrix, then re-ravel each
# result back to a row of 8100 values
timecourse_log_data = np.squeeze(np.array([np.reshape(la.logm(np.reshape(row, (90, 90))), (1, 8100)) for row in timecourse_connectivity_data]))

# pull out lower triangles
timecourse_log_data_lotril = timecourse_log_data[:, lotril_ind]
timecourse_connectivity_data_lotril = timecourse_connectivity_data[:, lotril_ind]
# read in connectivity data and labels - for original connectivity matrices
#connectivity_data = utils.load_connectivity_data(data_dir)
#labels = np.array([utils.load_labels(data_dir), ])

# for sparse inverse covariance matrices generated from raw timecourses:
# one row per subject, each row a raveled 90 x 90 matrix
connectivity_data = np.genfromtxt(timecourse_dir + 'sparse_inverse_covariance_data.csv', delimiter=',')
timecourse_files = pd.read_csv(timecourse_dir + 'sparse_inverse_covariance_files.csv').T.index.values

# subject IDs are encoded in the file names: second '_'-separated token of the
# basename, with the 4-character extension stripped
timecourse_IDs = [int(f.split('/')[-1].split('_')[1][0:-4]) for f in timecourse_files]

edge_data = connectivity_data
labels = np.array([utils.load_labels(data_dir), ])[0]
connectivity_files = glob.glob(data_dir + '*.txt')
# subject ID = first three characters of the connectivity file basename
connectivity_IDs = sorted(int(f.split('/')[-1][0:3]) for f in connectivity_files)

# boolean masks selecting the subjects present in BOTH data sets; build each
# set once so membership tests are O(1) instead of an O(n) list scan per ID
timecourse_ID_set = set(timecourse_IDs)
connectivity_ID_set = set(connectivity_IDs)
connectivity_in_timecourse = np.array([ID in timecourse_ID_set for ID in connectivity_IDs])
timecourse_in_connectivity = np.array([ID in connectivity_ID_set for ID in timecourse_IDs])

# keep only labels/rows for the common subjects; labels become a column vector
labels = labels[connectivity_in_timecourse]
labels = np.expand_dims(labels, axis=1)
edge_data = edge_data[timecourse_in_connectivity, :]

# sanity check
# pull out lower triangles
# Matusita affinity between the covariance matrices: a similarity measure that
# equals 1 when the two matrices are identical
def Matusita_kernel(cov_1, cov_2):
    """Return the Matusita affinity between two p x p covariance matrices.

    Computed from determinants: 2^(p/2) * |C1|^(1/4) * |C2|^(1/4) / |C1+C2|^(1/2).
    """
    p = np.shape(cov_1)[0]
    det_1 = la.det(cov_1)
    det_2 = la.det(cov_2)
    det_sum = la.det(cov_1 + cov_2)
    return ((2 ** (p / 2.0)) * (det_1 ** 0.25) * (det_2 ** 0.25)) / (det_sum ** 0.5)

# timecourse data connectivity matrices: one row per subject, each row a
# raveled 90 x 90 sparse inverse covariance matrix.  Parse the CSV once and
# copy it, instead of reading the same large file from disk twice.
timecourse_connectivity_data = np.genfromtxt(timecourse_dir + 'sparse_inverse_covariance_data.csv', delimiter=',')
connectivity_data = timecourse_connectivity_data.copy()
timecourse_files = pd.read_csv(timecourse_dir + 'sparse_inverse_covariance_files.csv').T.index.values

# subject IDs are encoded in the file names: second '_'-separated token of the
# basename, with the 4-character extension stripped
timecourse_IDs = [int(f.split('/')[-1].split('_')[1][0:-4]) for f in timecourse_files]

labels = np.array([utils.load_labels(data_dir), ])[0]
connectivity_files = glob.glob(data_dir + '*.txt')
# subject ID = first three characters of the connectivity file basename
connectivity_IDs = sorted(int(f.split('/')[-1][0:3]) for f in connectivity_files)

# boolean masks selecting the subjects present in BOTH data sets; build each
# set once so membership tests are O(1) instead of an O(n) list scan per ID
timecourse_ID_set = set(timecourse_IDs)
connectivity_ID_set = set(connectivity_IDs)
connectivity_in_timecourse = np.array([ID in timecourse_ID_set for ID in connectivity_IDs])
timecourse_in_connectivity = np.array([ID in connectivity_ID_set for ID in timecourse_IDs])

labels = labels[connectivity_in_timecourse]
timecourse_connectivity_data = timecourse_connectivity_data[timecourse_in_connectivity, :]
# NOTE(review): hard-codes 100 subjects remaining after the mask -- confirm
timecourse_connectivity_matrices = np.reshape(timecourse_connectivity_data, (100, 90, 90))

# original connectivity matrices
#connectivity_data, connectivity_files = utils.load_connectivity_data('/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/')
#connectivity_data = connectivity_data[connectivity_in_timecourse, :]
#connectivity_matrices = np.reshape(connectivity_data, (100, 90, 90))
# put the sparse inverse covariance data in a DataFrame, one row per subject
dataset_1_cov = pd.DataFrame(data=dataset_1_cov_data)
dataset_1_cov['file'] = dataset_1_cov_files.index

# convert format of file name so they can be matched:
# basename's last '_'-separated token, zero-padded to 7 characters
dataset_1_cov['file'] = dataset_1_cov['file'].apply(
    lambda x: x.split('/')[-1].split('_')[-1].zfill(7))

# import and process full dataset 1 files list and metadata to get labels
# import original dataset 1 data, files
dataset_1_data, dataset_1_files = utils.load_connectivity_data(
    '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/')

# import dataset 1 labels
dataset_1_labels = utils.load_labels(
    '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/')

# put labels alongside files in a DF
dataset_1_metadata = pd.DataFrame(columns=['file', 'label'])
dataset_1_metadata['file'] = dataset_1_files
dataset_1_metadata['label'] = dataset_1_labels

# convert format of file name so they can be matched
# NOTE(review): no .zfill(7) here, unlike the covariance side above --
# presumably these basenames are already zero-padded; verify the two 'file'
# columns really align before trusting the merge
dataset_1_metadata['file'] = dataset_1_metadata['file'].apply(
    lambda x: x.split('/')[-1].split('_')[-1])

# join the DFs to match labels with sparse inverse cov data
dataset_1_cov = dataset_1_cov.merge(dataset_1_metadata, how='inner', on='file')

# extract the data (first 8100 columns = raveled 90 x 90 matrix);
# .values replaces DataFrame.as_matrix(), which is deprecated and removed in
# modern pandas, and returns the same ndarray
dataset_1_cov_data = dataset_1_cov.iloc[:, 0:8100].values
covariance_subject_ids = map(lambda string: int(string.split('_')[-1][0:-4]), covariance_files[0]) # find indices of common elements in BOTH lists common_correlation_indices = [ i for i, item in enumerate(correlation_subject_ids) if item in set(covariance_subject_ids) ] common_covariance_indices = [ i for i, item in enumerate(covariance_subject_ids) if item in set(correlation_subject_ids) ] # read in the labels labels = np.array([ utils.load_labels(correlation_dir + 'matrix_unsmooth/'), ]) # take only labels for subjects common between covariance and correlation sets. # initially labels are for the same subjects as correlation data so use common_correlation_indices labels = labels[0, common_correlation_indices] print len(labels) # read in the sparse inverse covariance data and cut it down to only include common subjects sparse_inverse_cov_data = np.genfromtxt( sparse_inverse_cov_dir + 'OAS_data.csv', delimiter=',')[common_covariance_indices, :] # map lower triangles of connectivities to an array sparse_cov_edge_data = np.apply_along_axis( lambda x: x[np.ravel_multi_index(np.tril_indices(90, k=-1), (90, 90))], 1,
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel

# set directories
data_dir_1 = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/'
data_dir_2 = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC2/matrix_unsmooth/'
kernel_dir = '/home/jonyoung/IoP_data/Data/connectivity_data/kernels/'

# number of subjects
n = 333

# read in connectivity data and labels for both scanner sites, then stack them
connectivity_data_1 = utils.load_connectivity_data(data_dir_1)
labels_1 = np.array([
    utils.load_labels(data_dir_1),
])
connectivity_data_2 = utils.load_connectivity_data(data_dir_2)
labels_2 = np.array([
    utils.load_labels(data_dir_2),
])
connectivity_data = np.vstack((connectivity_data_1, connectivity_data_2))
labels = np.hstack((labels_1, labels_2))

# set negative connectivities to 0: vectorised np.maximum replaces the per-row
# apply_along_axis + Python list comprehension, producing identical values in
# a single C-level pass
edge_data = np.maximum(connectivity_data, 0)

# normalise the edge data
import connectivity_utils as utils # set directories data_dir_1 = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/' data_dir_2 = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC2/matrix_unsmooth/' kernel_dir = '/home/jonyoung/IoP_data/Data/connectivity_data/kernels/' # include negatively weighted edges or not include_negative_weights = True # standardise data with a z-transform standardise_data = False # read in connectivity data and labels connectivity_data_1 = utils.load_connectivity_data(data_dir_1, standardise_data) labels_1 = np.array([utils.load_labels(data_dir_1), ]) connectivity_data_2 = utils.load_connectivity_data(data_dir_2, standardise_data) labels_2 = np.array([utils.load_labels(data_dir_2), ]) connectivity_data = np.vstack((connectivity_data_1, connectivity_data_2)) labels = np.hstack((labels_1, labels_2)) # save connectivity data np.savetxt(kernel_dir + 'connectivity_data.csv', connectivity_data, delimiter = ',') # map lower triangles of connectivities to an array edge_data = np.apply_along_axis(lambda x: x[np.ravel_multi_index(np.tril_indices(90, k=-1), (90, 90))], 1, connectivity_data) if not include_negative_weights :
# subject IDs encoded in correlation file names: first three characters of the basename
correlation_subject_ids = [int(s.split('/')[-1][0:3]) for s in correlation_files]

# read in and process the list of files for which we have sparse covariance data
# NOTE(review): binary mode 'rb' is the Python 2 csv idiom; Python 3 needs
# mode 'r' with newline='' -- confirm target interpreter before changing
with open(sparse_inverse_cov_dir + 'sparse_inverse_covariance_files.csv', 'rb') as f:
    reader = csv.reader(f)
    covariance_files = list(reader)

# convert to list of subject numbers (last '_' token, extension stripped)
covariance_subject_ids = [int(s.split('_')[-1][0:-4]) for s in covariance_files[0]]

# find indices of common elements in BOTH lists; build each set ONCE so the
# membership test is O(1) per element -- testing against set(...) built inside
# the comprehension made this accidentally O(n^2)
covariance_id_set = set(covariance_subject_ids)
correlation_id_set = set(correlation_subject_ids)
common_correlation_indices = [i for i, item in enumerate(correlation_subject_ids) if item in covariance_id_set]
common_covariance_indices = [i for i, item in enumerate(covariance_subject_ids) if item in correlation_id_set]

# read in the labels
labels = np.array([utils.load_labels(correlation_dir + 'matrix_unsmooth/'), ])

# take only labels for subjects common between covariance and correlation sets.
# initially labels are for the same subjects as correlation data so use common_correlation_indices
labels = labels[0, common_correlation_indices]
# parenthesised so the statement is valid in both Python 2 and Python 3
print(len(labels))

# read in the sparse inverse covariance data and cut it down to only include common subjects
sparse_inverse_cov_data = np.genfromtxt(sparse_inverse_cov_dir + 'OAS_data.csv', delimiter=',')[common_covariance_indices, :]

# map lower triangles of connectivities to an array: compute the raveled
# strictly-lower-triangle indices ONCE and fancy-index, instead of recomputing
# them inside apply_along_axis for every row -- identical result
lotril_ind = np.ravel_multi_index(np.tril_indices(90, k=-1), (90, 90))
sparse_cov_edge_data = sparse_inverse_cov_data[:, lotril_ind]

# optional - remove negative correlations
#sparse_cov_edge_data = np.apply_along_axis(lambda x: [0 if element < 0 else element for element in x], 1, sparse_cov_edge_data)
from scipy.linalg import logm

# set directories
data_dir_1 = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC1/matrix_unsmooth/'
data_dir_2 = '/home/jonyoung/IoP_data/Data/connectivity_data/KCL_SC2/matrix_unsmooth/'
kernel_dir = '/home/jonyoung/IoP_data/Data/connectivity_data/kernels/'

# include negatively weighted edges or not
include_negative_weights = True

# standardise data with a z-transform
standardise_data = True

# read in connectivity data and labels
# NOTE(review): load_connectivity_data is unpacked into (data, files) here but
# two lines below only a single value is taken -- confirm the helper's return
connectivity_data_1, connectivity_files = utils.load_connectivity_data(data_dir_1, standardise_data)
labels_1 = np.array([utils.load_labels(data_dir_1), ])
connectivity_data_2 = utils.load_connectivity_data(data_dir_2, standardise_data)
labels_2 = np.array([utils.load_labels(data_dir_2), ])

# only dataset 1 connectivity is used; the stacked version is kept for reference
#connectivity_data = np.vstack((connectivity_data_1, connectivity_data_2))
connectivity_data = connectivity_data_1
# NOTE(review): labels still stack BOTH datasets while the data above is
# dataset 1 only -- confirm this size mismatch is intended
labels = np.hstack((labels_1, labels_2))

edge_data = connectivity_data
if not include_negative_weights :
    # set negative connectivities to 0: vectorised np.maximum replaces the
    # per-row apply_along_axis + list comprehension with identical values
    edge_data = np.maximum(edge_data, 0)