def prepare_data(input_folder, output_file, mode, size, target_resolution):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    assert mode in ['2D', '3D'], 'Unknown mode: %s' % mode
    if mode == '2D' and not len(size) == 2:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '3D' and not len(size) == 3:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '2D' and not len(target_resolution) == 2:
        raise AssertionError('Inadequate number of target resolution parameters')
    if mode == '3D' and not len(target_resolution) == 3:
        raise AssertionError('Inadequate number of target resolution parameters')

    hdf5_file = h5py.File(output_file, "w")

    diag_list = {'test': [], 'train': []}
    height_list = {'test': [], 'train': []}
    weight_list = {'test': [], 'train': []}
    patient_id_list = {'test': [], 'train': []}
    cardiac_phase_list = {'test': [], 'train': []}

    file_list = {'test': [], 'train': []}
    num_slices = {'test': 0, 'train': 0}

    logging.info('Counting files and parsing meta data...')

    for folder in os.listdir(input_folder):

        folder_path = os.path.join(input_folder, folder)

        if os.path.isdir(folder_path):

            # Every fifth patient goes into the test set
            train_test = 'test' if (int(folder[-3:]) % 5 == 0) else 'train'

            infos = {}
            for line in open(os.path.join(folder_path, 'Info.cfg')):
                label, value = line.split(':')
                infos[label] = value.rstrip('\n').lstrip(' ')

            patient_id = folder.lstrip('patient')

            for file in glob.glob(os.path.join(folder_path, 'patient???_frame??.nii.gz')):

                file_list[train_test].append(file)

                # diag_list[train_test].append(diagnosis_to_int(infos['Group']))
                diag_list[train_test].append(diagnosis_dict[infos['Group']])
                weight_list[train_test].append(infos['Weight'])
                height_list[train_test].append(infos['Height'])

                patient_id_list[train_test].append(patient_id)

                systole_frame = int(infos['ES'])
                diastole_frame = int(infos['ED'])

                file_base = file.split('.')[0]
                frame = int(file_base.split('frame')[-1])
                if frame == systole_frame:
                    cardiac_phase_list[train_test].append(1)  # 1 == systole
                elif frame == diastole_frame:
                    cardiac_phase_list[train_test].append(2)  # 2 == diastole
                else:
                    cardiac_phase_list[train_test].append(0)  # 0 == other phase

                nifty_img = nib.load(file)
                num_slices[train_test] += nifty_img.shape[2]

    # Write the small datasets
    for tt in ['test', 'train']:
        hdf5_file.create_dataset('diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray(weight_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('height_%s' % tt, data=np.asarray(height_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('patient_id_%s' % tt, data=np.asarray(patient_id_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('cardiac_phase_%s' % tt, data=np.asarray(cardiac_phase_list[tt], dtype=np.uint8))

    if mode == '3D':
        nx, ny, nz_max = size
        n_train = len(file_list['train'])
        n_test = len(file_list['test'])
    elif mode == '2D':
        nx, ny = size
        n_test = num_slices['test']
        n_train = num_slices['train']
    else:
        raise AssertionError('Wrong mode setting. This should never happen.')

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train'], [n_test, n_train]):
        data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)
        data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size), dtype=np.uint8)

    mask_list = {'test': [], 'train': []}
    img_list = {'test': [], 'train': []}

    logging.info('Parsing image files')

    for train_test in ['test', 'train']:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:

            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            file_base = file.split('.nii.gz')[0]
            file_mask = file_base + '_gt.nii.gz'

            img_dat = utils.load_nii(file)
            mask_dat = utils.load_nii(file_mask)

            img = img_dat[0].copy()
            mask = mask_dat[0].copy()

            img = image_utils.normalise_image(img)

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            ### PROCESSING LOOP FOR 3D DATA ################################
            if mode == '3D':

                scale_vector = [pixel_size[0] / target_resolution[0],
                                pixel_size[1] / target_resolution[1],
                                pixel_size[2] / target_resolution[2]]

                img_scaled = transform.rescale(img,
                                               scale_vector,
                                               order=1,
                                               preserve_range=True,
                                               multichannel=False,
                                               mode='constant')
                mask_scaled = transform.rescale(mask,
                                                scale_vector,
                                                order=0,
                                                preserve_range=True,
                                                multichannel=False,
                                                mode='constant')

                slice_vol = np.zeros((nx, ny, nz_max), dtype=np.float32)
                mask_vol = np.zeros((nx, ny, nz_max), dtype=np.uint8)

                nz_curr = img_scaled.shape[2]
                stack_from = (nz_max - nz_curr) // 2

                if stack_from < 0:
                    raise AssertionError('nz_max is too small for the chosen through-plane resolution. '
                                         'Consider changing the size or the target resolution in the through-plane.')

                for zz in range(nz_curr):

                    slice_rescaled = img_scaled[:, :, zz]
                    mask_rescaled = mask_scaled[:, :, zz]

                    slice_cropped = crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                    mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                    slice_vol[:, :, stack_from] = slice_cropped
                    mask_vol[:, :, stack_from] = mask_cropped

                    stack_from += 1

                img_list[train_test].append(slice_vol)
                mask_list[train_test].append(mask_vol)

                write_buffer += 1

                if write_buffer >= MAX_WRITE_BUFFER:

                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)

                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

            ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ###################
            elif mode == '2D':

                scale_vector = [pixel_size[0] / target_resolution[0],
                                pixel_size[1] / target_resolution[1]]

                for zz in range(img.shape[2]):

                    slice_img = np.squeeze(img[:, :, zz])
                    slice_rescaled = transform.rescale(slice_img,
                                                       scale_vector,
                                                       order=1,
                                                       preserve_range=True,
                                                       multichannel=False,
                                                       mode='constant')

                    slice_mask = np.squeeze(mask[:, :, zz])
                    mask_rescaled = transform.rescale(slice_mask,
                                                      scale_vector,
                                                      order=0,
                                                      preserve_range=True,
                                                      multichannel=False,
                                                      mode='constant')

                    slice_cropped = crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                    mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                    img_list[train_test].append(slice_cropped)
                    mask_list[train_test].append(mask_cropped)

                    write_buffer += 1

                    # Writing needs to happen inside the loop over the slices
                    if write_buffer >= MAX_WRITE_BUFFER:

                        counter_to = counter_from + write_buffer
                        _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
                        _release_tmp_memory(img_list, mask_list, train_test)

                        # reset stuff for next iteration
                        counter_from = counter_to
                        write_buffer = 0

        # after file loop: Write the remaining data
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # After test train loop:
    hdf5_file.close()
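# A minimal sketch of the `crop_or_pad_slice_to_size` helper called above,
# assuming the intended behaviour is a centre crop (when a rescaled slice is
# larger than the target grid) or a symmetric zero-pad (when it is smaller).
# The real helper lives elsewhere in this repo; the arithmetic below mirrors
# the cropping code used in score_data further down. Assumes numpy is imported
# as np, as elsewhere in this file.
def _crop_or_pad_slice_to_size_sketch(slice_2d, nx, ny):

    x, y = slice_2d.shape

    x_s = (x - nx) // 2
    y_s = (y - ny) // 2
    x_c = (nx - x) // 2
    y_c = (ny - y) // 2

    if x > nx and y > ny:
        # larger than the target in both dimensions: centre crop
        return slice_2d[x_s:x_s + nx, y_s:y_s + ny]

    slice_cropped = np.zeros((nx, ny), dtype=slice_2d.dtype)
    if x <= nx and y > ny:
        # pad rows, crop columns
        slice_cropped[x_c:x_c + x, :] = slice_2d[:, y_s:y_s + ny]
    elif x > nx and y <= ny:
        # crop rows, pad columns
        slice_cropped[:, y_c:y_c + y] = slice_2d[x_s:x_s + nx, :]
    else:
        # smaller in both dimensions: pad all around
        slice_cropped[x_c:x_c + x, y_c:y_c + y] = slice_2d

    return slice_cropped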
def prepare_data(input_folder, preproc_folder, idx_start, idx_end):

    images = []
    affines = []
    patnames = []
    masks = []

    # read the filenames which have segmentations available
    filenames = sorted(glob.glob(input_folder + '*_seg.nii'))
    logging.info('Number of images in the dataset that have ground truth annotations: %s' % str(len(filenames)))

    # iterate through all indices
    for idx in range(len(filenames)):

        # only consider images within the indices requested
        if (idx < idx_start) or (idx >= idx_end):
            # logging.info('skipping subject: %d' % idx)
            continue

        logging.info('==============================================')

        # get the name of the ground truth annotation for this subject
        filename_seg = filenames[idx]
        filename_img = filename_seg[:-8] + '.nii.gz'
        _patname = filename_seg[filename_seg[:-1].rfind('/') + 1:-8]

        if _patname == 'IXI014-HH-1236-T2':  # this subject has very poor resolution - 256x256x28
            continue

        # read the image
        logging.info('reading image: %s' % _patname)
        _img_data, _img_affine, _img_header = utils.load_nii(filename_img)

        # make all the images the same size by appending zero slices to facilitate stacking
        # most images are of the size 256*256*130
        if _img_data.shape[2] != 130:
            num_zero_slices = 130 - _img_data.shape[2]
            zero_slices = np.zeros((_img_data.shape[0], _img_data.shape[1], num_zero_slices))
            _img_data = np.concatenate((_img_data, zero_slices), axis=-1)

        # normalise the image
        _img_data = image_utils.normalise_image(_img_data, norm_type='div_by_max')

        # save the pre-processed image
        utils.makefolder(preproc_folder + _patname)
        savepath = preproc_folder + _patname + '/preprocessed_image.nii'
        utils.save_nii(savepath, _img_data, _img_affine)

        # append to the list of all images, affines and patient names
        images.append(_img_data)
        affines.append(_img_affine)
        patnames.append(_patname)

        # read the segmentation mask (already grouped)
        _seg_data, _seg_affine, _seg_header = utils.load_nii(filename_seg)

        # make all the masks the same size by appending zero slices to facilitate stacking
        # most images are of the size 256*256*130
        if _seg_data.shape[2] != 130:
            num_zero_slices = 130 - _seg_data.shape[2]
            zero_slices = np.zeros((_seg_data.shape[0], _seg_data.shape[1], num_zero_slices))
            _seg_data = np.concatenate((_seg_data, zero_slices), axis=-1)

        # save the pre-processed segmentation ground truth
        utils.makefolder(preproc_folder + _patname)
        savepath = preproc_folder + _patname + '/preprocessed_gt15.nii'
        utils.save_nii(savepath, _seg_data, _seg_affine)

        # append to the list of all masks
        masks.append(_seg_data)

    # convert the lists to arrays
    images = np.array(images)
    affines = np.array(affines)
    patnames = np.array(patnames)
    masks = np.array(masks, dtype='uint8')

    # merge along the y-axis to get a stack of x-z slices, for the images as well as the masks
    images = images.swapaxes(1, 2)
    images = images.reshape(-1, images.shape[2], images.shape[3])
    masks = masks.swapaxes(1, 2)
    masks = masks.reshape(-1, masks.shape[2], masks.shape[3])

    # save the processed images and masks so that they can be directly read the next time
    # make appropriate filenames according to the requested indices of training, validation and test images
    logging.info('Saving pre-processed files...')
    config_details = 'from%dto%d_' % (idx_start, idx_end)
    filepath_images = preproc_folder + config_details + 'images.npy'
    filepath_masks = preproc_folder + config_details + 'annotations15.npy'
    filepath_affine = preproc_folder + config_details + 'affines.npy'
    filepath_patnames = preproc_folder + config_details + 'patnames.npy'

    np.save(filepath_images, images)
    np.save(filepath_masks, masks)
    np.save(filepath_affine, affines)
    np.save(filepath_patnames, patnames)

    return images, masks, affines, patnames
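# Hypothetical invocation of the function above; the folder paths are
# placeholders, not from the original repo. Pre-processes subjects 0..19 and
# returns the stacked x-z slices ready for training.
if __name__ == '__main__':
    images, masks, affines, patnames = prepare_data(
        input_folder='/data/ixi/raw/',        # placeholder; must contain *_seg.nii files
        preproc_folder='/data/ixi/preproc/',  # placeholder output folder
        idx_start=0,
        idx_end=20)
    print(images.shape, masks.shape)  # e.g. (n_subjects * 256, 256, 130)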
def score_data(input_folder, output_folder, model_path, exp_config, do_postprocessing=False, gt_exists=True, evaluate_all=False, use_iter=None):

    nx, ny = exp_config.image_size[:2]
    batch_size = 1
    num_channels = exp_config.nlabels

    image_tensor_shape = [batch_size] + list(exp_config.image_size) + [1]
    images_pl = tf.placeholder(tf.float32, shape=image_tensor_shape, name='images')

    mask_pl, softmax_pl = model.predict(images_pl, exp_config)
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    evaluate_test_set = not gt_exists

    with tf.Session() as sess:

        sess.run(init)

        if not use_iter:
            checkpoint_path = utils.get_latest_model_checkpoint_path(model_path, 'model_best_dice.ckpt')
        else:
            checkpoint_path = os.path.join(model_path, 'model.ckpt-%d' % use_iter)

        saver.restore(sess, checkpoint_path)

        init_iteration = int(checkpoint_path.split('/')[-1].split('-')[-1])

        total_time = 0
        total_volumes = 0

        for folder in os.listdir(input_folder):

            folder_path = os.path.join(input_folder, folder)

            if os.path.isdir(folder_path):

                if evaluate_test_set or evaluate_all:
                    train_test = 'test'  # always test
                else:
                    train_test = 'test' if (int(folder[-3:]) % 5 == 0) else 'train'

                if train_test == 'test':

                    infos = {}
                    for line in open(os.path.join(folder_path, 'Info.cfg')):
                        label, value = line.split(':')
                        infos[label] = value.rstrip('\n').lstrip(' ')

                    patient_id = folder.lstrip('patient')
                    ED_frame = int(infos['ED'])
                    ES_frame = int(infos['ES'])

                    for file in glob.glob(os.path.join(folder_path, 'patient???_frame??.nii.gz')):

                        logging.info(' ----- Doing image: -------------------------')
                        logging.info('Doing: %s' % file)
                        logging.info(' --------------------------------------------')

                        file_base = file.split('.nii.gz')[0]
                        frame = int(file_base.split('frame')[-1])

                        img_dat = utils.load_nii(file)
                        img = img_dat[0].copy()
                        img = image_utils.normalise_image(img)

                        if gt_exists:
                            file_mask = file_base + '_gt.nii.gz'
                            mask_dat = utils.load_nii(file_mask)
                            mask = mask_dat[0]

                        start_time = time.time()

                        if exp_config.data_mode == '2D':

                            pixel_size = (img_dat[2].structarr['pixdim'][1],
                                          img_dat[2].structarr['pixdim'][2])
                            scale_vector = (pixel_size[0] / exp_config.target_resolution[0],
                                            pixel_size[1] / exp_config.target_resolution[1])

                            predictions = []

                            for zz in range(img.shape[2]):

                                slice_img = np.squeeze(img[:, :, zz])
                                slice_rescaled = transform.rescale(slice_img,
                                                                   scale_vector,
                                                                   order=1,
                                                                   preserve_range=True,
                                                                   multichannel=False,
                                                                   mode='constant')

                                x, y = slice_rescaled.shape

                                x_s = (x - nx) // 2
                                y_s = (y - ny) // 2
                                x_c = (nx - x) // 2
                                y_c = (ny - y) // 2

                                # Crop section of image for prediction
                                if x > nx and y > ny:
                                    slice_cropped = slice_rescaled[x_s:x_s + nx, y_s:y_s + ny]
                                else:
                                    slice_cropped = np.zeros((nx, ny))
                                    if x <= nx and y > ny:
                                        slice_cropped[x_c:x_c + x, :] = slice_rescaled[:, y_s:y_s + ny]
                                    elif x > nx and y <= ny:
                                        slice_cropped[:, y_c:y_c + y] = slice_rescaled[x_s:x_s + nx, :]
                                    else:
                                        slice_cropped[x_c:x_c + x, y_c:y_c + y] = slice_rescaled[:, :]

                                # GET PREDICTION
                                network_input = np.float32(np.tile(np.reshape(slice_cropped, (nx, ny, 1)),
                                                                   (batch_size, 1, 1, 1)))
                                mask_out, logits_out = sess.run([mask_pl, softmax_pl],
                                                                feed_dict={images_pl: network_input})
                                prediction_cropped = np.squeeze(logits_out[0, ...])

                                # ASSEMBLE BACK THE SLICES
                                slice_predictions = np.zeros((x, y, num_channels))

                                # insert cropped region into original image again
                                if x > nx and y > ny:
                                    slice_predictions[x_s:x_s + nx, y_s:y_s + ny, :] = prediction_cropped
                                else:
                                    if x <= nx and y > ny:
                                        slice_predictions[:, y_s:y_s + ny, :] = prediction_cropped[x_c:x_c + x, :, :]
                                    elif x > nx and y <= ny:
                                        slice_predictions[x_s:x_s + nx, :, :] = prediction_cropped[:, y_c:y_c + y, :]
                                    else:
                                        slice_predictions[:, :, :] = prediction_cropped[x_c:x_c + x, y_c:y_c + y, :]

                                # RESCALING ON THE LOGITS
                                if gt_exists:
                                    prediction = transform.resize(slice_predictions,
                                                                  (mask.shape[0], mask.shape[1], num_channels),
                                                                  order=1,
                                                                  preserve_range=True,
                                                                  mode='constant')
                                else:
                                    # This can occasionally lead to wrong volume size, therefore if gt_exists
                                    # we use the gt mask size for resizing.
                                    prediction = transform.rescale(slice_predictions,
                                                                   (1.0 / scale_vector[0], 1.0 / scale_vector[1], 1),
                                                                   order=1,
                                                                   preserve_range=True,
                                                                   multichannel=False,
                                                                   mode='constant')

                                    # prediction = transform.resize(slice_predictions,
                                    #                               (mask.shape[0], mask.shape[1], num_channels),
                                    #                               order=1,
                                    #                               preserve_range=True,
                                    #                               mode='constant')

                                prediction = np.uint8(np.argmax(prediction, axis=-1))
                                predictions.append(prediction)

                            prediction_arr = np.transpose(np.asarray(predictions, dtype=np.uint8), (1, 2, 0))

                        elif exp_config.data_mode == '3D':

                            pixel_size = (img_dat[2].structarr['pixdim'][1],
                                          img_dat[2].structarr['pixdim'][2],
                                          img_dat[2].structarr['pixdim'][3])

                            scale_vector = (pixel_size[0] / exp_config.target_resolution[0],
                                            pixel_size[1] / exp_config.target_resolution[1],
                                            pixel_size[2] / exp_config.target_resolution[2])

                            vol_scaled = transform.rescale(img,
                                                           scale_vector,
                                                           order=1,
                                                           preserve_range=True,
                                                           multichannel=False,
                                                           mode='constant')

                            nz_max = exp_config.image_size[2]
                            slice_vol = np.zeros((nx, ny, nz_max), dtype=np.float32)

                            nz_curr = vol_scaled.shape[2]
                            stack_from = (nz_max - nz_curr) // 2
                            stack_counter = stack_from

                            x, y, z = vol_scaled.shape

                            x_s = (x - nx) // 2
                            y_s = (y - ny) // 2
                            x_c = (nx - x) // 2
                            y_c = (ny - y) // 2

                            for zz in range(nz_curr):

                                slice_rescaled = vol_scaled[:, :, zz]

                                if x > nx and y > ny:
                                    slice_cropped = slice_rescaled[x_s:x_s + nx, y_s:y_s + ny]
                                else:
                                    slice_cropped = np.zeros((nx, ny))
                                    if x <= nx and y > ny:
                                        slice_cropped[x_c:x_c + x, :] = slice_rescaled[:, y_s:y_s + ny]
                                    elif x > nx and y <= ny:
                                        slice_cropped[:, y_c:y_c + y] = slice_rescaled[x_s:x_s + nx, :]
                                    else:
                                        slice_cropped[x_c:x_c + x, y_c:y_c + y] = slice_rescaled[:, :]

                                slice_vol[:, :, stack_counter] = slice_cropped
                                stack_counter += 1

                            stack_to = stack_counter

                            network_input = np.float32(np.reshape(slice_vol, (1, nx, ny, nz_max, 1)))

                            start_time = time.time()
                            mask_out, logits_out = sess.run([mask_pl, softmax_pl],
                                                            feed_dict={images_pl: network_input})
                            logging.info('Classified 3D: %f secs' % (time.time() - start_time))

                            prediction_nzs = logits_out[0, :, :, stack_from:stack_to, ...]  # non-zero-slices

                            if prediction_nzs.shape[2] != nz_curr:
                                raise ValueError('sizes mismatch')

                            # ASSEMBLE BACK THE SLICES
                            prediction_scaled = np.zeros(list(vol_scaled.shape) + [num_channels])  # last dim is for logits classes

                            # insert cropped region into original image again
                            if x > nx and y > ny:
                                prediction_scaled[x_s:x_s + nx, y_s:y_s + ny, :, :] = prediction_nzs
                            else:
                                if x <= nx and y > ny:
                                    prediction_scaled[:, y_s:y_s + ny, :, :] = prediction_nzs[x_c:x_c + x, :, :, :]
                                elif x > nx and y <= ny:
                                    prediction_scaled[x_s:x_s + nx, :, :, :] = prediction_nzs[:, y_c:y_c + y, :, :]
                                else:
                                    prediction_scaled[:, :, :, :] = prediction_nzs[x_c:x_c + x, y_c:y_c + y, :, :]

                            logging.info('Prediction_scaled mean %f' % (np.mean(prediction_scaled)))

                            # NOTE: the 3D branch resizes to the ground-truth shape and therefore
                            # implicitly assumes gt_exists=True.
                            prediction = transform.resize(prediction_scaled,
                                                          (mask.shape[0], mask.shape[1], mask.shape[2], num_channels),
                                                          order=1,
                                                          preserve_range=True,
                                                          mode='constant')
                            prediction = np.argmax(prediction, axis=-1)
                            prediction_arr = np.asarray(prediction, dtype=np.uint8)

                        # This is the same for 2D and 3D again
                        if do_postprocessing:
                            prediction_arr = image_utils.keep_largest_connected_components(prediction_arr)

                        elapsed_time = time.time() - start_time
                        total_time += elapsed_time
                        total_volumes += 1

                        logging.info('Evaluation of volume took %f secs.' % elapsed_time)

                        if frame == ED_frame:
                            frame_suffix = '_ED'
                        elif frame == ES_frame:
                            frame_suffix = '_ES'
                        else:
                            raise ValueError("Frame doesn't correspond to ED or ES. frame = %d, ED = %d, ES = %d" %
                                             (frame, ED_frame, ES_frame))

                        # Save predicted mask
                        out_file_name = os.path.join(output_folder, 'prediction',
                                                     'patient' + patient_id + frame_suffix + '.nii.gz')
                        if gt_exists:
                            out_affine = mask_dat[1]
                            out_header = mask_dat[2]
                        else:
                            out_affine = img_dat[1]
                            out_header = img_dat[2]

                        logging.info('saving to: %s' % out_file_name)
                        utils.save_nii(out_file_name, prediction_arr, out_affine, out_header)

                        # Save image data to the same folder for convenience
                        image_file_name = os.path.join(output_folder, 'image',
                                                       'patient' + patient_id + frame_suffix + '.nii.gz')
                        logging.info('saving to: %s' % image_file_name)
                        utils.save_nii(image_file_name, img_dat[0], out_affine, out_header)

                        if gt_exists:

                            # Save GT image
                            gt_file_name = os.path.join(output_folder, 'ground_truth',
                                                        'patient' + patient_id + frame_suffix + '.nii.gz')
                            logging.info('saving to: %s' % gt_file_name)
                            utils.save_nii(gt_file_name, mask, out_affine, out_header)

                            # Save difference mask between predictions and ground truth
                            difference_mask = np.where(np.abs(prediction_arr - mask) > 0, [1], [0])
                            difference_mask = np.asarray(difference_mask, dtype=np.uint8)
                            diff_file_name = os.path.join(output_folder, 'difference',
                                                          'patient' + patient_id + frame_suffix + '.nii.gz')
                            logging.info('saving to: %s' % diff_file_name)
                            utils.save_nii(diff_file_name, difference_mask, out_affine, out_header)

        logging.info('Average time per volume: %f' % (total_time / total_volumes))

    return init_iteration
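# A hedged usage sketch for score_data. The experiment config module and all
# paths below are placeholders (assumptions, not from the source); any object
# exposing image_size, nlabels, data_mode and target_resolution attributes
# would work as exp_config.
if __name__ == '__main__':
    from experiments import unet2D_config as exp_config  # hypothetical config module

    init_iteration = score_data(input_folder='/data/acdc/train/',    # placeholder path
                                output_folder='/output/acdc_eval/',  # placeholder path
                                model_path='/models/unet2D/',        # placeholder path
                                exp_config=exp_config,
                                do_postprocessing=True,
                                gt_exists=True)
    print('Evaluated checkpoint from iteration %d' % init_iteration)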
def prepare_data(input_folder, output_file, size, target_resolution, labels_list, rescale_to_one, offset=None, image_postfix='.nii.gz'):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    csv_summary_file = os.path.join(input_folder, 'summary_alldata.csv')
    summary = pd.read_csv(csv_summary_file)
    summary = summary.loc[summary['image_exists'] == True]
    summary = summary.loc[~(summary['diagnosis_3cat'] == 'unknown')]  # Don't use images with unknown diagnosis

    # Get list of unique rids
    rids = summary.rid.unique()

    # Get initial diagnosis for rough stratification
    diagnoses = []
    for rid in rids:
        diagnoses.append(summary.loc[summary['rid'] == rid]['diagnosis_3cat'].values[0])

    train_and_val_rids, test_rids, train_and_val_diagnoses, _ = train_test_split(
        rids, diagnoses, test_size=0.2, stratify=diagnoses)
    train_rids, val_rids = train_test_split(
        train_and_val_rids, test_size=0.2, stratify=train_and_val_diagnoses)

    print(len(train_rids), len(test_rids), len(val_rids))

    # n_images_train = len(summary.loc[summary['rid'].isin(train_rids)])
    # n_images_test = len(summary.loc[summary['rid'].isin(test_rids)])
    # n_images_val = len(summary.loc[summary['rid'].isin(val_rids)])

    hdf5_file = h5py.File(output_file, "w")

    diag_list = {'test': [], 'train': [], 'val': []}
    weight_list = {'test': [], 'train': [], 'val': []}
    age_list = {'test': [], 'train': [], 'val': []}
    gender_list = {'test': [], 'train': [], 'val': []}
    rid_list = {'test': [], 'train': [], 'val': []}
    viscode_list = {'test': [], 'train': [], 'val': []}
    adas13_list = {'test': [], 'train': [], 'val': []}
    mmse_list = {'test': [], 'train': [], 'val': []}
    field_strength_list = {'test': [], 'train': [], 'val': []}

    file_list = {'test': [], 'train': [], 'val': []}

    logging.info('Counting files and parsing meta data...')

    for train_test, set_rids in zip(['train', 'test', 'val'], [train_rids, test_rids, val_rids]):

        for ii, row in summary.iterrows():

            rid = row['rid']
            if rid not in set_rids:
                continue

            diagnosis_str = row['diagnosis_3cat']
            diagnosis = diagnosis_dict[diagnosis_str]

            if diagnosis not in labels_list:
                continue

            rid_list[train_test].append(rid)
            diag_list[train_test].append(diagnosis)

            viscode = row['viscode']
            viscode_list[train_test].append(viscode_dict[viscode])

            weight_list[train_test].append(row['weight'])
            age_list[train_test].append(row['age'])
            gender_list[train_test].append(gender_dict[row['gender']])

            adas13_list[train_test].append(fix_nan_and_unknown(row['adas13'], target_data_format=np.float32))
            mmse_list[train_test].append(fix_nan_and_unknown(row['mmse'], target_data_format=np.uint8))

            field_strength = row['field_strength']
            field_strength_list[train_test].append(field_strength)

            phase = row['phase']

            file_name = 'rid_%s/%s_%sT_%s_rid%s_%s%s' % (str(rid).zfill(4),
                                                         phase.lower(),
                                                         str(field_strength),
                                                         diagnosis_str,
                                                         str(rid).zfill(4),
                                                         viscode,
                                                         image_postfix)
            file_list[train_test].append(os.path.join(input_folder, file_name))

    # Write the small datasets
    for tt in ['test', 'train', 'val']:
        hdf5_file.create_dataset('rid_%s' % tt, data=np.asarray(rid_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('viscode_%s' % tt, data=np.asarray(viscode_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('age_%s' % tt, data=np.asarray(age_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray(weight_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('gender_%s' % tt, data=np.asarray(gender_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('adas13_%s' % tt, data=np.asarray(adas13_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('mmse_%s' % tt, data=np.asarray(mmse_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('field_strength_%s' % tt, data=np.asarray(field_strength_list[tt], dtype=np.float16))

    n_train = len(file_list['train'])
    n_test = len(file_list['test'])
    n_val = len(file_list['val'])

    # assert n_train == n_images_train, 'Mismatch in data sizes, %d not == %d' % (n_train, n_images_train)
    # assert n_test == n_images_test, 'Mismatch in data sizes, %d not == %d' % (n_test, n_images_test)
    # assert n_val == n_images_val, 'Mismatch in data sizes, %d not == %d' % (n_val, n_images_val)

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train', 'val'], [n_test, n_train, n_val]):
        data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)

    img_list = {'test': [], 'train': [], 'val': []}

    logging.info('Parsing image files')

    for train_test in ['test', 'train', 'val']:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:

            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            img_dat = utils.load_nii(file)
            img = img_dat[0].copy()

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1],
                            pixel_size[2] / target_resolution[2]]

            img_scaled = transform.rescale(img,
                                           scale_vector,
                                           order=1,
                                           preserve_range=True,
                                           multichannel=False,
                                           mode='constant')

            img_resized = crop_or_pad_slice_to_size(img_scaled, size, offset=offset)

            if rescale_to_one:
                img_resized = image_utils.map_image_to_intensity_range(img_resized, -1, 1)
            else:
                img_resized = image_utils.normalise_image(img_resized)

            ### DEBUGGING ############################################
            # utils.create_and_save_nii(img_resized, 'debug.nii.gz')
            # exit()
            ##########################################################

            img_list[train_test].append(img_resized)

            write_buffer += 1

            if write_buffer >= MAX_WRITE_BUFFER:

                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to)
                _release_tmp_memory(img_list, train_test)

                # reset stuff for next iteration
                counter_from = counter_to
                write_buffer = 0

        # after file loop: Write the remaining data
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to)
        _release_tmp_memory(img_list, train_test)

    # After test train loop:
    hdf5_file.close()
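# A minimal sketch of the `fix_nan_and_unknown` helper used above, assuming it
# maps NaN and 'unknown' metadata entries to sentinel values and casts the rest
# to the requested dtype. The real helper (and its exact signature) lives
# elsewhere in this repo; the sentinel defaults here are illustrative only.
def _fix_nan_and_unknown_sketch(value, target_data_format=np.float32, nan_val=-1, unknown_val=-2):
    # 'unknown' entries come through as strings in the summary csv (assumption)
    if isinstance(value, str) and value.strip().lower() == 'unknown':
        return target_data_format(unknown_val)
    # NaN entries indicate a missing measurement
    if isinstance(value, float) and np.isnan(value):
        return target_data_format(nan_val)
    return target_data_format(value)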
def prepare_data(input_folder, output_file, size, target_resolution, labels_list, rescale_to_one, image_postfix='.nii.gz'):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    csv_summary_file = os.path.join(input_folder, 'summary_screening.csv')
    summary = pd.read_csv(csv_summary_file)
    summary = summary.loc[summary['image_exists'] == True]

    train_and_val_cases, test_cases = train_test_split(
        summary, test_size=0.2, stratify=summary['diagnosis_3cat'])
    train_cases, val_cases = train_test_split(
        train_and_val_cases, test_size=0.2, stratify=train_and_val_cases['diagnosis_3cat'])

    hdf5_file = h5py.File(output_file, "w")

    diag_list = {'test': [], 'train': [], 'val': []}
    weight_list = {'test': [], 'train': [], 'val': []}
    age_list = {'test': [], 'train': [], 'val': []}
    gender_list = {'test': [], 'train': [], 'val': []}
    rid_list = {'test': [], 'train': [], 'val': []}
    confidence_list = {'test': [], 'train': [], 'val': []}
    adas13_list = {'test': [], 'train': [], 'val': []}
    mmse_list = {'test': [], 'train': [], 'val': []}
    field_strength_list = {'test': [], 'train': [], 'val': []}

    file_list = {'test': [], 'train': [], 'val': []}

    logging.info('Counting files and parsing meta data...')

    for train_test, sum_df in zip(['train', 'test', 'val'], [train_cases, test_cases, val_cases]):

        for ii, row in sum_df.iterrows():

            diagnosis_str = row['diagnosis_3cat']
            diagnosis = diagnosis_dict[diagnosis_str]

            if diagnosis not in labels_list:
                continue

            diag_list[train_test].append(diagnosis)

            rid = row['rid']
            rid_list[train_test].append(rid)

            confidence = fix_nan_and_unknown(np.float16, row['confidence'], nan_val=255, unknown_val=254)
            confidence_list[train_test].append(confidence)

            weight_list[train_test].append(row['weight'])
            age_list[train_test].append(row['age'])
            gender_list[train_test].append(gender_dict[row['gender']])
            adas13_list[train_test].append(row['adas13'])
            mmse_list[train_test].append(row['mmse'])

            field_strength = row['field_strength']
            field_strength_list[train_test].append(field_strength)

            phase = row['phase']

            file_name = '%s_%sT_%s_rid%s%s' % (phase.lower(),
                                               str(field_strength),
                                               diagnosis_str,
                                               str(rid).zfill(4),
                                               image_postfix)
            file_list[train_test].append(os.path.join(input_folder, file_name))

    # Write the small datasets
    for tt in ['test', 'train', 'val']:
        hdf5_file.create_dataset('rid_%s' % tt, data=np.asarray(rid_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('confidence_%s' % tt, data=np.asarray(confidence_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('age_%s' % tt, data=np.asarray(age_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray(weight_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('gender_%s' % tt, data=np.asarray(gender_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('adas13_%s' % tt, data=np.asarray(adas13_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('mmse_%s' % tt, data=np.asarray(mmse_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('field_strength_%s' % tt, data=np.asarray(field_strength_list[tt], dtype=np.float16))

    n_train = len(file_list['train'])
    n_test = len(file_list['test'])
    n_val = len(file_list['val'])

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train', 'val'], [n_test, n_train, n_val]):
        data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)

    img_list = {'test': [], 'train': [], 'val': []}

    logging.info('Parsing image files')

    for train_test in ['test', 'train', 'val']:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:

            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            img_dat = utils.load_nii(file)
            img = img_dat[0].copy()

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1],
                            pixel_size[2] / target_resolution[2]]

            img_scaled = transform.rescale(img,
                                           scale_vector,
                                           order=1,
                                           preserve_range=True,
                                           multichannel=False,
                                           mode='constant')

            if rescale_to_one:
                img_scaled = image_utils.map_image_to_intensity_range(img_scaled, -1, 1)
            else:
                img_scaled = image_utils.normalise_image(img_scaled)

            img_resized = crop_or_pad_slice_to_size(img_scaled, size)

            img_list[train_test].append(img_resized)

            write_buffer += 1

            if write_buffer >= MAX_WRITE_BUFFER:

                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to)
                _release_tmp_memory(img_list, train_test)

                # reset stuff for next iteration
                counter_from = counter_to
                write_buffer = 0

        # after file loop: Write the remaining data
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to)
        _release_tmp_memory(img_list, train_test)

    # After test train loop:
    hdf5_file.close()
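# Minimal sketches of the buffered-write helpers called above, assuming the
# behaviour implied by their call sites: flush the accumulated images into rows
# [counter_from, counter_to) of the hdf5 dataset, then drop the Python-side
# buffer so memory stays bounded at MAX_WRITE_BUFFER images. The real helpers
# live elsewhere in this repo; these names carry a _sketch suffix deliberately.
import gc

def _write_range_to_hdf5_sketch(hdf5_data, train_test, img_list, counter_from, counter_to):
    logging.info('Writing data from %d to %d' % (counter_from, counter_to))
    img_arr = np.asarray(img_list[train_test], dtype=np.float32)
    hdf5_data['images_%s' % train_test][counter_from:counter_to, ...] = img_arr

def _release_tmp_memory_sketch(img_list, train_test):
    img_list[train_test].clear()
    gc.collect()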
def prepare_data(input_folder, output_file, mode, size, target_resolution, split_test_train=True):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    assert mode in ['2D'], 'Unknown mode: %s' % mode
    if mode == '2D' and not len(size) == 2:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '2D' and not len(target_resolution) == 2:
        raise AssertionError('Inadequate number of target resolution parameters')

    hdf5_file = h5py.File(output_file, "w")

    file_list = {'test': [], 'train': []}
    num_slices = {'test': 0, 'train': 0}

    logging.info('Counting files and parsing meta data...')

    cptImage = 0
    for file in glob.glob(os.path.join(input_folder, '*_image.nii.gz')):

        if split_test_train:
            train_test = 'test' if (cptImage % 5 == 1) else 'train'  # aiming for an 80/20 train/test split
        else:
            train_test = 'train'

        file_list[train_test].append(file)
        cptImage += 1

        nifty_img = nib.load(file)
        num_slices[train_test] += nifty_img.shape[2]

    # Write the small datasets
    nx, ny = size
    n_test = num_slices['test']
    n_train = num_slices['train']

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train'], [n_test, n_train]):
        if num_points > 0:
            data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)
            data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size), dtype=np.uint8)

    mask_list = {'test': [], 'train': []}
    img_list = {'test': [], 'train': []}

    logging.info('Parsing image files')

    train_test_range = ['test', 'train'] if split_test_train else ['train']
    logging.info('split_test_train:')
    logging.info(split_test_train)

    for train_test in train_test_range:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:

            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            file_base = file.split('.nii.gz')[0]
            file_mask = file_base + '_gt.nii.gz'

            img_dat = utils.load_nii(file)
            mask_dat = utils.load_nii(file_mask)

            img = img_dat[0].copy()
            mask = mask_dat[0].copy()

            img = image_utils.normalise_image(img)

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ###################
            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1]]

            for zz in range(img.shape[2]):

                slice_img = np.squeeze(img[:, :, zz])
                slice_rescaled = transform.rescale(slice_img,
                                                   scale_vector,
                                                   order=1,
                                                   preserve_range=True,
                                                   # multichannel=False,
                                                   mode='constant')

                slice_mask = np.squeeze(mask[:, :, zz])
                mask_rescaled = transform.rescale(slice_mask,
                                                  scale_vector,
                                                  order=0,
                                                  preserve_range=True,
                                                  # multichannel=False,
                                                  mode='constant')

                slice_cropped = crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                img_list[train_test].append(slice_cropped)
                mask_list[train_test].append(mask_cropped)

                write_buffer += 1

                # Writing needs to happen inside the loop over the slices
                if write_buffer >= MAX_WRITE_BUFFER:

                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)

                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

        # after file loop: Write the remaining data
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # After test train loop:
    hdf5_file.close()
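# Hypothetical invocation of the 2D-only prepare_data above. All paths and the
# target geometry are placeholders (the 212x212 grid at ~1.37 mm resolution is
# only an illustrative choice, not prescribed by the source). With
# split_test_train=True, every fifth image is diverted to the test split.
if __name__ == '__main__':
    prepare_data(input_folder='/data/mydataset/',             # placeholder; contains *_image.nii.gz (+ *_image_gt.nii.gz)
                 output_file='/data/mydataset/data_2D.hdf5',  # placeholder
                 mode='2D',
                 size=(212, 212),
                 target_resolution=(1.37, 1.37),
                 split_test_train=True)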