def save_train_validation_ids(filename, data_path):
    """Split patient study dirs under *data_path* into train/validation id lists.

    Fold 0 of get_cross_validation_indices is used as the validation fold.
    The resulting dict {'train': [...], 'valid': [...]} is pickled to
    *filename* and also returned.
    """
    # Sort patient dirs numerically by the patient id embedded in the path.
    patient_dirs = sorted(glob.glob(data_path + "/*/study/"),
                          key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))
    dirs_indices = range(0, len(patient_dirs))

    valid_dirs_indices = get_cross_validation_indices(indices=dirs_indices, validation_index=0)
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))

    train_patient_dirs = [utils.get_patient_id(patient_dirs[idx]) for idx in train_patient_indices]
    validation_patient_dirs = [utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices]

    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    # fixed: Python 2 print statement -> print() (matches the other copy of
    # this function later in the file)
    print('train-valid patients split saved to', filename)
    return d
# ---- 示例#2 (example #2) — separator left over from the scraped snippet collection ----
    def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
                 slice2roi_path=None, full_batch=False, random=True, infinite=True, min_slices=5, **kwargs):
        """Index sax/2ch/4ch slice pickles per patient under *data_path*.

        Only patients with more than *min_slices* sax slices are kept; for
        those, optional 2ch/4ch view paths are recorded (None when absent).
        """
        # Build the patient study paths from explicit ids, or by globbing.
        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2sax_slice_paths = defaultdict(list)
        self.pid2ch2_path, self.pid2ch4_path = {}, {}
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            # Sort sax slices numerically by slice number.
            spaths = sorted(glob.glob(p + '/sax_*.pkl'),
                            key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            if len(spaths) > min_slices:
                self.pid2sax_slice_paths[pid] = spaths

                # 2ch/4ch views are optional; store None when missing.
                ch2_path = glob.glob(p + '/2ch_*.pkl')
                self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
                ch4_path = glob.glob(p + '/4ch_*.pkl')
                self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

        self.patient_ids = self.pid2sax_slice_paths.keys()
        self.nsamples = len(self.patient_ids)

        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size  # fixed: was assigned twice in the original
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
# ---- 示例#3 (example #3) — separator left over from the scraped snippet collection ----
    def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
                 slice2roi_path=None, full_batch=False, random=True, infinite=False, view='sax',
                 data_prep_fun=data.transform_norm_rescale, **kwargs):
        """Collect every slice pickle of the given *view* and map each back to its patient."""
        if patient_ids:
            self.patient_paths = [data_path + '/%s/study/' % pid for pid in patient_ids]
        else:
            self.patient_paths = glob.glob(data_path + '/*/study/')

        # One sorted slice list per patient, flattened into a single list.
        per_patient = [sorted(glob.glob(p + '/%s_*.pkl' % view)) for p in self.patient_paths]
        self.slice_paths = list(itertools.chain(*per_patient))
        self.slicepath2pid = {path: int(utils.get_patient_id(path))
                              for path in self.slice_paths}

        self.nsamples = len(self.slice_paths)
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
# ---- 示例#4 (example #4) — separator left over from the scraped snippet collection ----
def get_patient_data(patient_data_path):
    """Load every slice pickle of one patient directory, sorted by slice number."""
    spaths = sorted(glob.glob(patient_data_path + '/*.pkl'),
                    key=lambda x: int(re.search(r'/\w*_(\d+)*\.pkl$', x).group(1)))
    pid = utils.get_patient_id(patient_data_path)
    patient_data = []
    for slice_path in spaths:
        sid = utils.get_slice_id(slice_path)
        meta = data.read_metadata(slice_path)
        payload = data.read_slice(slice_path)
        patient_data.append({'data': payload, 'metadata': meta,
                             'slice_id': sid, 'patient_id': pid})
    return patient_data
# ---- 示例#5 (example #5) — separator left over from the scraped snippet collection ----
    def split_train_data(self, paths, ratio=None):
        '''
        Split data file paths into train and validation sets.

        The validation set is built per category, pulling in whole patients
        at a time so all brains of one patient land on the same side of the
        split.  Returns (train_paths, validation_paths), both shuffled.
        '''
        if ratio is None:
            ratio = 0.05 / (1 - 1.0 / len(self.fold_itens))

        # Get data files.
        brains_data = Files("")
        brains_data.paths = paths
        brains_data = brains_data.get_file_names()
        total_brains = len(brains_data)
        # np.int was removed from NumPy; the builtin truncates identically.
        validation_size = int(total_brains * ratio)

        # Get total of brains by category.
        brains_by_patient = group_brains_by_patient_id(brains_data)
        validation_paths = []
        train_paths = []
        brains_by_category = group_brains_by_category(brains_data)
        statistic = {}

        # Per-category validation quota, proportional to the category's
        # share of the data.
        for label in brains_by_category:
            statistic[label] = np.round(
                len(brains_by_category[label]) * validation_size * 1.0 /
                total_brains)

        # Create train and validation set: move whole patients into the
        # validation set until each category's quota is exhausted.
        for label in brains_by_category:
            index_brains = 0

            while statistic[label] > 0:
                patient_id = get_patient_id(
                    brains_by_category[label][index_brains])
                brains_patient = brains_by_patient[patient_id]

                for brain_patient in brains_patient:
                    validation_paths.append(brain_patient)
                    statistic[get_category(brain_patient)] -= 1

                index_brains += 1

        for brain in brains_data:
            if brain not in validation_paths:
                train_paths.append(brain)

        # Shuffle in place.  The original shuffled throwaway np.array copies
        # (np.random.shuffle(np.array(lst))), which left the returned lists
        # in their construction order.
        np.random.shuffle(validation_paths)
        np.random.shuffle(train_paths)

        return train_paths, validation_paths
def save_train_validation_ids(filename, data_path):
    """Build the train/validation patient-id split and pickle it to *filename*."""
    # Patient dirs ordered numerically by the id embedded in the path.
    patient_dirs = sorted(glob.glob(data_path + "/*/study/"),
                          key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))
    dirs_indices = list(range(0, len(patient_dirs)))

    valid_dirs_indices = get_cross_validation_indices(indices=dirs_indices,
                                                      validation_index=0)
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))

    split = {
        'train': [utils.get_patient_id(patient_dirs[i]) for i in train_patient_indices],
        'valid': [utils.get_patient_id(patient_dirs[i]) for i in valid_dirs_indices],
    }
    utils.save_pkl(split, filename)
    print('train-valid patients split saved to', filename)
    return split
# ---- 示例#7 (example #7) — separator left over from the scraped snippet collection ----
    def __init__(self,
                 data_path,
                 batch_size,
                 transform_params,
                 patient_ids=None,
                 labels_path=None,
                 slice2roi_path=None,
                 full_batch=False,
                 random=True,
                 infinite=True,
                 min_slices=0,
                 data_prep_fun=data.transform_norm_rescale,
                 **kwargs):
        """Index per-patient sax slice pickles and record the maximum slice count."""
        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2slice_paths = defaultdict(list)
        nslices = []
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            spaths = sorted(
                glob.glob(p + '/sax_*.pkl'),
                key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            # consider patients only with min_slices
            if len(spaths) > min_slices:
                self.pid2slice_paths[pid] = spaths
                nslices.append(len(spaths))

        # take max number of slices; fail with a clear message instead of
        # np.max's opaque zero-size-array ValueError when nothing matched
        if not nslices:
            raise ValueError('no patients with more than %d sax slices under %s'
                             % (min_slices, data_path))
        self.nslices = int(np.max(nslices))

        self.patient_ids = self.pid2slice_paths.keys()
        self.nsamples = len(self.patient_ids)

        self.data_path = data_path
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size  # fixed: was assigned twice in the original
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(
            slice2roi_path) if slice2roi_path else None
# ---- 示例#8 (example #8) — separator left over from the scraped snippet collection ----
    def __init__(self,
                 data_path,
                 batch_size,
                 transform_params,
                 patient_ids=None,
                 labels_path=None,
                 slice2roi_path=None,
                 full_batch=False,
                 random=True,
                 infinite=True,
                 min_slices=5,
                 **kwargs):
        """Index sax slice pickles per patient, plus optional 2ch/4ch view paths."""
        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2sax_slice_paths = defaultdict(list)
        self.pid2ch2_path, self.pid2ch4_path = {}, {}
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            # Sort sax slices numerically by slice number.
            spaths = sorted(
                glob.glob(p + '/sax_*.pkl'),
                key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            if len(spaths) > min_slices:
                self.pid2sax_slice_paths[pid] = spaths

                # 2ch/4ch views are optional; store None when missing.
                ch2_path = glob.glob(p + '/2ch_*.pkl')
                self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
                ch4_path = glob.glob(p + '/4ch_*.pkl')
                self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

        self.patient_ids = self.pid2sax_slice_paths.keys()
        self.nsamples = len(self.patient_ids)

        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size  # fixed: was assigned twice in the original
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.slice2roi = utils.load_pkl(
            slice2roi_path) if slice2roi_path else None
# ---- 示例#9 (example #9) — separator left over from the scraped snippet collection ----
def get_patient_data(patient_data_path):
    """Load every slice pickle of one patient directory, sorted by slice number."""
    spaths = sorted(
        glob.glob(patient_data_path + '/*.pkl'),
        key=lambda x: int(re.search(r'/\w*_(\d+)*\.pkl$', x).group(1)))
    pid = utils.get_patient_id(patient_data_path)
    patient_data = []
    for slice_path in spaths:
        sid = utils.get_slice_id(slice_path)
        meta = data.read_metadata(slice_path)
        payload = data.read_slice(slice_path)
        patient_data.append({
            'data': payload,
            'metadata': meta,
            'slice_id': sid,
            'patient_id': pid
        })
    return patient_data
# ---- 示例#10 (example #10) — separator left over from the scraped snippet collection ----
    def __init__(self,
                 data_path,
                 batch_size,
                 transform_params,
                 patient_ids=None,
                 labels_path=None,
                 slice2roi_path=None,
                 full_batch=False,
                 random=True,
                 infinite=False,
                 view='sax',
                 data_prep_fun=data.transform_norm_rescale,
                 **kwargs):
        """Collect every slice pickle of the given *view* and map each back to its patient."""
        if patient_ids:
            self.patient_paths = [data_path + '/%s/study/' % pid
                                  for pid in patient_ids]
        else:
            self.patient_paths = glob.glob(data_path + '/*/study/')

        # One sorted slice list per patient, flattened into a single list.
        per_patient = [sorted(glob.glob(p + '/%s_*.pkl' % view))
                       for p in self.patient_paths]
        self.slice_paths = list(itertools.chain(*per_patient))
        self.slicepath2pid = {path: int(utils.get_patient_id(path))
                              for path in self.slice_paths}

        self.nsamples = len(self.slice_paths)
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
# ---- 示例#11 (example #11) — separator left over from the scraped snippet collection ----
    def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
                 slice2roi_path=None, full_batch=False, random=True, infinite=True, min_slices=0,
                 data_prep_fun=data.transform_norm_rescale,
                 **kwargs):
        """Index per-patient sax slice pickles and record the maximum slice count."""
        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2slice_paths = defaultdict(list)
        nslices = []
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            spaths = sorted(glob.glob(p + '/sax_*.pkl'),
                            key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            # consider patients only with min_slices
            if len(spaths) > min_slices:
                self.pid2slice_paths[pid] = spaths
                nslices.append(len(spaths))

        # take max number of slices (NOTE: np.max raises ValueError when no
        # patient passed the min_slices filter)
        self.nslices = int(np.max(nslices))

        self.patient_ids = self.pid2slice_paths.keys()
        self.nsamples = len(self.patient_ids)

        self.data_path = data_path
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size  # fixed: was assigned twice in the original
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
    def fit(self, X, y, paths, ratio=0.9, iterations=0):
        '''
        Compute ANOVA feature scores over repeated patient-level subsamples.

        On each iteration the rows are permuted, a patient-stratified
        subsample is drawn and SelectKBest/f_classif scores are computed;
        self.scores_ keeps the element-wise maximum across iterations.
        '''
        if iterations != 0:
            self.iterations = iterations

        self.labels_ = np.unique(y)
        brains_by_patient = group_brains_by_patient_id(paths)
        paths = np.array([get_patient_id(paths[i]) for i in range(len(paths))])
        self.scores_ = X[0] * 0.0

        for i in range(self.iterations):
            selection = SelectKBest(f_classif, k=X.shape[1])
            # fixed: np.random.shuffle(range(...)) fails on Python 3 (range
            # objects are immutable); draw a permutation of the indices.
            randomize = np.random.permutation(len(X))
            X = X[randomize]
            y = y[randomize]
            paths = paths[randomize]

            X_temp, y_temp = self.get_subsample_by_patient(
                X=X,
                y=y,
                patients=list(brains_by_patient.keys()),
                paths=paths,
                ratio=ratio)
            selection.fit(X_temp, y_temp)
            scores = selection.scores_
            # f_classif can produce NaN/inf for degenerate features; zero them
            # so they never win the running maximum.
            scores[np.where(np.logical_or(np.isnan(scores),
                                          np.isinf(scores)))] = 0.0
            self.scores_ = np.max(np.vstack((scores, self.scores_)), axis=0)

            del scores
            del X_temp
            del y_temp

            # fixed: Python 2 print statement -> print()
            print(i, np.mean(self.scores_))
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))

    train_patient_dirs = [utils.get_patient_id(patient_dirs[idx]) for idx in train_patient_indices]
    validation_patient_dirs = [utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices]

    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    print 'train-valid patients split saved to', filename
    return d


if __name__ == '__main__':
    global_data_path = '/data/dsb15_pkl/pkl_train'

    # fixed: save_train_validation_ids takes (filename, data_path) but was
    # called with only the data path; an output filename is supplied here —
    # TODO confirm the intended destination path.
    p = save_train_validation_ids('train_valid_split.pkl', global_data_path)
    # fixed: Python 2 print statements -> print() (trailing-comma form maps
    # to end=' ')
    print('TRAIN')
    for path in p['train']:
        print(utils.get_patient_id(path), end=' ')

    print('\nVALID')
    valid_ids = []
    for path in p['valid']:
        valid_ids.append(utils.get_patient_id(path))
        print(utils.get_patient_id(path), end=' ')

    valid_ids1 = []
    g = glob.glob('/data/dsb15_pkl/pkl_splitted/valid/*/study/')
    for path in g:
        valid_ids1.append(utils.get_patient_id(path))
    print(set(valid_ids) == set(valid_ids1))
    ]
    validation_patient_dirs = [
        utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices
    ]

    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    print('train-valid patients split saved to', filename)
    return d


if __name__ == '__main__':
    global_data_path = '/data/dsb15_pkl/pkl_train'

    # fixed: save_train_validation_ids takes (filename, data_path) but was
    # called with only the data path; an output filename is supplied here —
    # TODO confirm the intended destination path.
    p = save_train_validation_ids('train_valid_split.pkl', global_data_path)
    print('TRAIN')
    for path in p['train']:
        print(utils.get_patient_id(path), end=' ')

    print('\nVALID')
    valid_ids = []
    for path in p['valid']:
        valid_ids.append(utils.get_patient_id(path))
        print(utils.get_patient_id(path), end=' ')

    valid_ids1 = []
    g = glob.glob('/data/dsb15_pkl/pkl_splitted/valid/*/study/')
    for path in g:
        valid_ids1.append(utils.get_patient_id(path))
    print(set(valid_ids) == set(valid_ids1))
    ]
    validation_patient_dirs = [
        utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices
    ]

    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    print 'train-valid patients split saved to', filename
    return d


if __name__ == '__main__':
    global_data_path = '/data/dsb15_pkl/pkl_train'

    # fixed: save_train_validation_ids takes (filename, data_path) but was
    # called with only the data path; an output filename is supplied here —
    # TODO confirm the intended destination path.
    p = save_train_validation_ids('train_valid_split.pkl', global_data_path)
    # fixed: Python 2 print statements -> print() (trailing-comma form maps
    # to end=' ')
    print('TRAIN')
    for path in p['train']:
        print(utils.get_patient_id(path), end=' ')

    print('\nVALID')
    valid_ids = []
    for path in p['valid']:
        valid_ids.append(utils.get_patient_id(path))
        print(utils.get_patient_id(path), end=' ')

    valid_ids1 = []
    g = glob.glob('/data/dsb15_pkl/pkl_splitted/valid/*/study/')
    for path in g:
        valid_ids1.append(utils.get_patient_id(path))
    print(set(valid_ids) == set(valid_ids1))
 def __call__(self, tup):
     """Return True when no patient filter is set, or when *tup* belongs to the filtered patient."""
     # `is None` instead of `== None`; `or` short-circuits exactly like the
     # original `True if ... else ...` expression.
     return self.patient_id is None or self.patient_id == utils.get_patient_id(tup)
 def __call__(self, tup):
     """Return True when no patient filter is set, or when *tup* belongs to the filtered patient."""
     # `is None` instead of `== None`; `or` short-circuits exactly like the
     # original `True if ... else ...` expression.
     return self.patient_id is None or self.patient_id == utils.get_patient_id(tup)