Example #1
def load_data(cache):
    if not (cache):
        BMI = pd.io.parsers.read_csv(os.path.join(DATA_PATH, "BMI.csv"),
                                     index_col=0).as_matrix()

        # Dataframe
        COFOUND = [
            "Subject", "Gender de Feuil2", "ImagingCentreCity", "tiv_gaser",
            "mean_pds"
        ]
        df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                 "1534bmi-vincent2.csv"),
                                    index_col=0)
        df = df[COFOUND]

        # Conversion dummy coding
        design_mat = utils.make_design_matrix(df,
                                              regressors=COFOUND).as_matrix()

        # Keep only subjects for which we have all data, and remove the first
        # column (subject_id) from the numpy array design_mat
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH, "subjects_id.csv"),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)
        design_mat = design_mat[np.in1d(design_mat[:, 0], subjects_id)]
        # Drop the subject_id column
        design_mat = np.delete(design_mat, 0, 1)

        # Images
        h5file = tables.openFile(IMAGES_FILE)
        masked_images = bmi_utils.read_array(
            h5file, "/standard_mask/residualized_images_gender_center_TIV_pds"
        )  #images already masked
        print "Data loaded"

        # Concatenate images with the covariates (gender, imaging city centre,
        # tiv_gaser and mean pds status) so as to act as though BMI had been
        # residualized
        X = np.concatenate((design_mat, masked_images), axis=1)
        z = BMI
        np.save(os.path.join(SHARED_DIR, "X.npy"), X)
        np.save(os.path.join(SHARED_DIR, "z.npy"), z)
        h5file.close()
        print "Data saved"
    else:
        X = np.load(os.path.join(SHARED_DIR, "X.npy"))
        z = np.load(os.path.join(SHARED_DIR, "z.npy"))
        print "Data read from cache"
    return X, z
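
The loader above follows a simple cache pattern: a first call with cache=False rebuilds X and z from the raw CSV/HDF5 files and saves them under SHARED_DIR, while later calls with cache=True simply reload the saved arrays. A minimal, hypothetical driver sketch (not part of the original example; it assumes the module-level constants DATA_PATH, CLINIC_DATA_PATH, SHARED_DIR, IMAGES_FILE and the helper modules utils and bmi_utils are set up as in the original script):

# Hypothetical driver code, not part of the original example
if __name__ == "__main__":
    # First run: build X and z from the raw files and cache them as .npy
    X, z = load_data(cache=False)
    print X.shape, z.shape

    # Later runs: skip the expensive I/O and reload the cached arrays
    X, z = load_data(cache=True)

Example #2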
def load_residualized_bmi_data(cache):
    if not (cache):
        # BMI
        BMI = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                     index_col=0).as_matrix()

        # Dataframe
        COFOUND = [
            'Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser', 'mean_pds'
        ]

        df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                 'population.csv'),
                                    index_col=0)
        df = df[COFOUND]

        # Keep only subjects for which we have all data
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH, 'subjects_id.csv'),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)

        clinic_data = df.loc[subjects_id]

        # Conversion dummy coding
        covar = utils.make_design_matrix(clinic_data,
                                         regressors=COFOUND).as_matrix()

        # Concatenate BMI and covariates
        # (gender, imaging city centre, tiv_gaser and mean pds status)
        design_mat = np.hstack((covar, BMI))

        # Images
        h5file = tables.openFile(IMAGES_FILE)
        masked_images = bmi_utils.read_array(
            h5file, '/standard_mask/residualized_images_gender_center_TIV_pds')
        print "Images loaded"

        X = design_mat
        # Center & scale X
        skl = StandardScaler()
        X = skl.fit_transform(X)
        Y = masked_images

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)
        h5file.close()
        print "Data saved"
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print "Data read from cache"
    return X, Y
Example #3
def load_residualized_bmi_data(cache):
    if not (cache):

        BMI = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                     index_col=0).as_matrix()

        # Dataframe
        COFOUNDS = [
            'Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser', 'mean_pds'
        ]

        df = pd.io.parsers.read_csv(os.path.join(SHFJ_DATA_PATH,
                                                 '1534bmi-vincent2.csv'),
                                    index_col=0)
        df = df[COFOUNDS]

        # Keep only subjects for whom we have all neuroimaging and genetic data
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH, 'subjects_id.csv'),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)

        clinic_data = df.loc[subjects_id]

        # Conversion dummy coding
        covar = utils.make_design_matrix(clinic_data,
                                         regressors=COFOUNDS).as_matrix()

        # Load images that have already been masked
        h5file = tables.openFile(IMAGES_FILE)
        masked_images = bmi_utils.read_array(
            h5file, '/standard_mask/residualized_images_gender_center_TIV_pds')
        print "Data loaded - Processing"

        # Concatenate images and covariates
        # (gender, imaging city centre, tiv_gaser and mean pds status)
        # so as to act as though BMI had been residualized
        X = np.hstack((covar, masked_images))
        z = BMI

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'z.npy'), z)

        h5file.close()

        print "Data saved"
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        z = np.load(os.path.join(SHARED_DIR, 'z.npy'))
        print "Data read from cache"
    return X, z
Example #4
def load_residualized_bmi_data(cache):
    if not(cache):
#        SNPs = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'SNPs.csv'),
#                                      dtype='float64',
#                                      index_col=0).as_matrix()
        BMI = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                     index_col=0).as_matrix()

        # Dataframe
        df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                 'normal_group.csv'),
                                    index_col=0)

        COFOUND = ['Gender de Feuil2',
                   'ImagingCentreCity',
                   'tiv_gaser',
                   'mean_pds']

        df = df[COFOUND]

        # Conversion dummy coding
        covar = utils.make_design_matrix(df, regressors=COFOUND).as_matrix()

        # Images
        h5file = tables.openFile(IMAGES_FILE)
        images_file = bmi_utils.read_array(
            h5file,
            "/standard_mask/residualized_images_gender_center_TIV_pds")
        # images already masked

        masked_images = images_file[???????????, :]

        print "Data loaded - Processing"

        z = BMI
        # Concatenate images and covariates
        # (gender, imaging city centre, tiv_gaser and mean pds status)
        # so as to act as though BMI had been residualized.
        X_res = np.hstack((covar, masked_images))

        np.save(os.path.join(SHARED_DIR, "X_res.npy"), X_res)
        np.save(os.path.join(SHARED_DIR, "z.npy"), z)

        h5file.close()
        print "Data saved"
    else:
        X_res = np.load(os.path.join(SHARED_DIR, "X_res.npy"))
        z = np.load(os.path.join(SHARED_DIR, "z.npy"))
        print "Data read from cache"
    return X_res, z
Example #5
def load_residualized_bmi_data(cache):
    if not(cache):
        # BMI
        BMI_df = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                     sep=',',
                                     index_col=0)

        # Sulci features
        labels = np.genfromtxt(os.path.join(QC_PATH, 'sulci_df_qc.csv'),
                                dtype=None,
                                delimiter=',',
                                skip_header=1,
                                usecols=0).tolist()

        sulci_index = pd.Index(labels)

        # Sulci features
        sulci_df_qc = pd.io.parsers.read_csv(os.path.join(QC_PATH,
                                                          'sulci_df_qc.csv'),
#                              usecols=[???],
                              sep=',')

        # Set the new dataframe index: subjects ID in the right format
        sulci_df_qc = sulci_df_qc.set_index(sulci_index)

        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                          'population.csv'),
                                             index_col=0)

        # Add one more confound, since sulci follow a power law
        clinical_df['tiv2'] = pow(clinical_df['tiv_gaser'], 2)

        clinical_cofounds = ['Gender de Feuil2',
                             'ImagingCentreCity',
                             'tiv_gaser',
                             'tiv2',
                             'mean_pds']

        clinical_df = clinical_df[clinical_cofounds]

        # Consider subjects for whom we have neuroimaging and genetic data
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH,
                                                 'subjects_id.csv'),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)

        # Get the intersection of indices of subjects for whom we have
        # neuroimaging and genetic data, but also sulci features
        subjects_index = np.intersect1d(subjects_id, sulci_df_qc.index.values)

        # Check whether all these subjects are actually stored into the qc
        # dataframe
        sulci_data = sulci_df_qc.loc[subjects_index]

        # Keep only subjects for which we have ALL data (neuroimaging,
        # genetic data and sulci features)
        clinical_data = clinical_df.loc[subjects_index]
        BMI = BMI_df.loc[subjects_index]

        # Conversion dummy coding
        covar = utils.make_design_matrix(clinical_data,
                                    regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Center & scale sulci_data
        sulci_data = skl.fit_transform(sulci_data)
        print "Sulci_data loaded"

        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((sulci_data.shape[0], 1))

        # Concatenate sulci data, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, sulci_data))

        X = design_mat
        z = BMI

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'z.npy'), z)

        print "Data saved"
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        z = np.load(os.path.join(SHARED_DIR, 'z.npy'))
        print "Data read from cache"
    return X, z
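
The design matrix built above already carries an explicit column of ones (the "constant regressor" meant to mimic the fit intercept), so a downstream linear model would normally be fitted without its own intercept term. A hypothetical illustration of that convention (not part of the original project; it only assumes scikit-learn's LinearRegression as the estimator):

# Hypothetical downstream fit, not part of the original example
from sklearn.linear_model import LinearRegression

X, z = load_residualized_bmi_data(cache=False)
# fit_intercept=False because X already contains the constant column
model = LinearRegression(fit_intercept=False)
model.fit(X, z)
print model.coef_.shape

Example #6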
def load_SNPs_bmi_data(cache):
    if not (cache):
        # BMI
        BMI_df = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                        sep=',',
                                        index_col=0)

        # Sulci maximal depth
        sulci_depthMax_df = pd.io.parsers.read_csv(os.path.join(
            QC_PATH, 'sulci_depthMax_df.csv'),
                                                   sep=',',
                                                   index_col=0)

        # SNPs
        SNPs_df = pd.io.parsers.read_csv(os.path.join(
            DATA_PATH, 'BMI_associated_SNPs_measures.csv'),
                                         index_col=0)

        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(
            CLINIC_DATA_PATH, 'population.csv'),
                                             index_col=0)

        # Confounds of no interest
        clinical_cofounds = [
            'Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser', 'mean_pds'
        ]

        clinical_df = clinical_df[clinical_cofounds]

        # Get the intersection of indices of subjects for whom we have
        # neuroimaging and genetic data, but also robustly segmented sulci
        subjects_intercept = np.intersect1d(SNPs_df.index.values,
                                            BMI_df.index.values)
        subjects_id = np.intersect1d(subjects_intercept,
                                     sulci_depthMax_df.index.values)

        # Keep only subjects for which we have ALL data (neuroimaging,
        # genetic data and sulci features)
        clinical_data = clinical_df.loc[subjects_id]
        BMI = BMI_df.loc[subjects_id]
        sulci_data = sulci_depthMax_df.loc[subjects_id]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((sulci_data.shape[0], 1))

        # Concatenate sulci data, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, sulci_data))

        X = design_mat
        z = BMI

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'z.npy'), z)

        print 'Data saved.'
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        z = np.load(os.path.join(SHARED_DIR, 'z.npy'))
        print 'Data read from cache.'
    return X, z
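Example #7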
def load_residualized_bmi_data(cache):
    if not (cache):

        # BMI
        BMI_df = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                        sep=',',
                                        index_col=0)

        # Sulci features
        sulci_df_qc = pd.io.parsers.read_csv(os.path.join(
            QC_PATH, 'sulci_df_qc.csv'),
                                             sep=',',
                                             index_col=0)

        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(
            CLINIC_DATA_PATH, 'clinical_data_norm-ob_groups.csv'),
                                             index_col=0)

        # Add one more confound, since sulci follow a power law
        clinical_df['tiv2'] = pow(clinical_df['tiv_gaser'], 2)

        clinical_cofounds = [
            'Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser', 'tiv2',
            'mean_pds'
        ]

        clinical_df = clinical_df[clinical_cofounds]

        # Get the intersection of indices of subjects for whom we have
        # neuroimaging and genetic data, but also sulci features
        subjects_index = np.intersect1d(clinical_df.index.values,
                                        sulci_df_qc.index.values)

        # Check whether all these subjects are actually stored into the qc
        # dataframe
        sulci_data = sulci_df_qc.loc[subjects_index]

        # Keep only subjects for which we have ALL data (neuroimaging,
        # genetic data and sulci features)
        clinical_data = clinical_df.loc[subjects_index]
        BMI = BMI_df.loc[subjects_index]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Center & scale sulci_data
        sulci_data = skl.fit_transform(sulci_data)
        print "sulci_data loaded"
        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((sulci_data.shape[0], 1))

        # Concatenate BMI, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, BMI))

        X = design_mat
        Y = sulci_data

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)

        print "Data saved"
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print "Data read from cache"
    return X, Y, sulci_df_qc
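Example #8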
def load_residualized_bmi_data(cache):
    if not(cache):

        # BMI
        BMI_df = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                     sep=',',
                                     index_col=0)

        # Sulci features
        sulci_df_qc = pd.io.parsers.read_csv(os.path.join(QC_PATH,
                                                          'sulci_df_qc.csv'),
                                              sep=',',
                                              index_col=0)

        # Extract only sulci depthMax among sulci features
        sulci_feature_colnames = []
        for sulcus_feature in sulci_df_qc.columns.tolist():
            if (sulcus_feature.find('depthMax') != -1):
                sulci_feature_colnames.append(sulcus_feature)

        sulci_depthMax_df = sulci_df_qc[sulci_feature_colnames]

        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                          'population.csv'),
                                             index_col=0)

        clinical_cofounds = ['Gender de Feuil2',
                             'ImagingCentreCity',
                             'tiv_gaser',
                             'mean_pds']

        clinical_df = clinical_df[clinical_cofounds]

        # Consider subjects for whom we have neuroimaging and genetic data
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH,
                                                 'subjects_id.csv'),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)

        # Get the intersection of indices of subjects for whom we have
        # neuroimaging and genetic data, but also sulci features
        subjects_index = np.intersect1d(subjects_id,
                                        sulci_depthMax_df.index.values)

        # Check whether all these subjects are actually stored into the qc
        # dataframe
        sulci_data = sulci_depthMax_df.loc[subjects_index]

        # Keep only subjects for which we have ALL data (neuroimaging,
        # genetic data and sulci features)
        clinical_data = clinical_df.loc[subjects_index]
        BMI = BMI_df.loc[subjects_index]

        # Conversion dummy coding
        covar = utils.make_design_matrix(clinical_data,
                                    regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Center & scale sulci_data
        sulci_data = skl.fit_transform(sulci_data)
        print "Sulci data loaded."
        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((sulci_data.shape[0], 1))

        # Concatenate BMI, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, BMI))

        X = design_mat
        Y = sulci_data

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)

        print "Data saved."
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print "Data read from cache."
    return X, Y, sulci_depthMax_df
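Example #9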
def load_fMRI_SNPs_bmi_data(cache):
    if not (cache):
        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(
            CLINIC_DATA_PATH, 'population.csv'),
                                             index_col=0)

        # Confounds of no interest
        clinical_cofounds = [
            'Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser', 'mean_pds'
        ]

        clinical_df = clinical_df[clinical_cofounds]

        # SNPs
        SNPs_df = pd.io.parsers.read_csv(os.path.join(
            DATA_PATH, 'BMI_associated_SNPs_measures.csv'),
                                         index_col=0)

        # fMRI left motor tasks
        masked_images = np.load(
            os.path.join(GCA_motor_left_PATH, 'GCA_motor_left_images.npy'))

        # List of all subjects who had an fMRI examination
        fMRI_subjects = pd.io.parsers.read_csv(os.path.join(
            GCA_motor_left_PATH, 'subjects_id_left_motor_fMRI.csv'),
                                               index_col=0)

        # Get the intersection of indices of subjects for whom we have both
        # genetic data and fMRI examination
        subjects_intercept = np.intersect1d(SNPs_df.index.values,
                                            fMRI_subjects.index.values)
        subjects_id = np.intersect1d(subjects_intercept,
                                     clinical_df.index.values).tolist()

        # Keep only subjects for whom we have both genetic data and fMRI
        # examination
        clinical_data = clinical_df.loc[subjects_id]
        SNPs = SNPs_df.loc[subjects_id]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((SNPs.shape[0], 1))

        # Concatenate SNPs, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, SNPs))

        X = design_mat
        Y = masked_images

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)

        print 'Data saved.'
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print 'Data read from cache.'
    return X, Y
Example #10
def load_sulci_SNPs_data(cache):
    if not (cache):

        # BMI
        BMI_df = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                        sep=',',
                                        index_col=0)

        # SNPs
        SNPs_df = pd.io.parsers.read_csv(os.path.join(
            DATA_PATH, 'BMI_associated_SNPs_measures.csv'),
                                         index_col=0)

        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(
            CLINIC_DATA_PATH, 'population.csv'),
                                             index_col=0)

        # Confounds
        clinical_cofounds = [
            'Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser', 'mean_pds'
        ]

        clinical_df = clinical_df[clinical_cofounds]

        # Get the intersection of indices of subjects for whom we have
        # neuroimaging and genetic data
        subjects_id = np.intersect1d(SNPs_df.index.values, BMI_df.index.values)

        # Check whether all these subjects are actually stored into both
        # dataframes
        SNPs = SNPs_df.loc[subjects_id]
        BMI = BMI_df.loc[subjects_id]
        clinical_data = clinical_df.loc[subjects_id]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)
        print 'BMI loaded.'

        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((BMI.shape[0], 1))

        # Concatenate BMI, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, BMI))

        X = design_mat
        Y = SNPs

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)
        print 'Data saved.'
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print 'Data read from cache.'
    return X, Y, SNPs_df
Example #11
#####################################################

# Construct EPAC workflow
pipeline = epac.Pipe(MULMStats(), ClusterStats())

# 1st model: most of the covariates
MODEL = [
    'group_sub_ctl', 'Gender', 'pds', 'Age', 'ImagingCentreCity',
    'Scanner_Type', 'vol_GM', 'vol_WM', 'vol_CSF', 'TIV', 'GM_on_TIV',
    'WM_on_TIV', 'CSF_on_TIV', 'VSF', 'tristesse', 'irritabilite', 'anhedonie',
    'total_symptoms_dep'
]
MODEL_OUT = os.path.join(OUT_DIR, "all-covariates")
if not os.path.exists(MODEL_OUT):
    os.makedirs(MODEL_OUT)
design_mat = utils.make_design_matrix(df, regressors=MODEL).as_matrix()
Y = masked_images
contrast = numpy.zeros(design_mat.shape[1])
contrast[0] = 1
contrast[1] = -1
isnan = numpy.isnan(design_mat)
if isnan.any():
    bad_subject_ind = numpy.where(isnan)[0]
    print "Removing subject", bad_subject_ind
    design_mat = numpy.delete(design_mat, bad_subject_ind, axis=0)
    Y = numpy.delete(Y, bad_subject_ind, axis=0)

pipeline_res = pipeline.run(design_matrix=design_mat,
                            Y=Y,
                            mask=mask,
                            contrast=contrast,
Example #12
def load_residualized_bmi_data(cache):
    if not(cache):
        # BMI
        BMI_df = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                        sep=',',
                                        index_col=0)

        # Freesurfer
        labels = np.genfromtxt(os.path.join(FREESURFER_PATH,
                                    'IMAGEN_Freesurfer_data_29juil2014.csv'),
                                dtype=None,
                                delimiter=',',
                                skip_header=1,
                                usecols=1)

        subject_labels = []
        for i, s in enumerate(labels):
            subject_labels.append(int(s[25:]))

        freesurfer_index = pd.Index(subject_labels)

        # Freesurfer's spreadsheet from IMAGEN database
        freesurfer_df = pd.io.parsers.read_csv(os.path.join(FREESURFER_PATH,
                                    'IMAGEN_Freesurfer_data_29juil2014.csv'),
                                        sep=',',
                                        usecols=['lhCortexVol',
                                                 'rhCortexVol',
                                                 'CortexVol',
                                                 'SubCortGrayVol',
                                                 'TotalGrayVol',
                                                 'SupraTentorialVol',
                                                 'lhCorticalWhiteMatterVol',
                                                 'rhCorticalWhiteMatterVol',
                                                 'CorticalWhiteMatterVol'])

        # Set the new dataframe index: subjects ID in the right format
        freesurfer_df = freesurfer_df.set_index(freesurfer_index)

        # Dataframe for picking out only the clinical confounds of no interest
        clinical_df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                          'population.csv'),
                                             index_col=0)

        # Confounds
        clinical_cofounds = ['Gender de Feuil2', 'ImagingCentreCity',
                             'tiv_gaser', 'mean_pds']

        clinical_df = clinical_df[clinical_cofounds]

        # Consider subjects for which we have neuroimaging and genetic data
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH,
                                                 'subjects_id.csv'),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)

        freesurfer_data = freesurfer_df.loc[subjects_id]

        # Drop rows that have any NaN values
        freesurfer_data = freesurfer_data.dropna()

        # Get indices of subjects for which we have both neuroimaging and
        # genetic data, but also Freesurfer subcortical features
        index = freesurfer_data.index

        # Keep only subjects for which we have ALL data (neuroimaging,
        # genetic data, subcortical features)
        clinical_data = clinical_df.loc[index]
        BMI = BMI_df.loc[index]

        # Conversion dummy coding
        covar = utils.make_design_matrix(clinical_data,
                                    regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale freesurfer_data
        freesurfer_data = skl.fit_transform(freesurfer_data)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Constant regressor to mimic the fit intercept
        constant_regressor = np.ones((freesurfer_data.shape[0], 1))

        # Concatenate Freesurfer data, constant regressor and covariates
        design_mat = np.hstack((cov, constant_regressor, freesurfer_data))

        X = design_mat
        z = BMI

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'z.npy'), z)

        print "Data saved"
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        z = np.load(os.path.join(SHARED_DIR, 'z.npy'))
        print "Data read from cache"
    return X, z