def __init__(self, loader_params, derivatives, path_output):
    """Store the loader settings, build the dataframe and save it as a CSV file.

    Args:
        loader_params (dict): Loader parameters. The keys read here are
            ``path_data``, ``target_suffix``, ``roi_params`` (its ``suffix``
            entry), ``extensions``, ``contrast_params`` (its ``contrast_lst``
            entry) and, when present, ``bids_config``.
        derivatives: Stored unchanged on ``self.derivatives``.
        path_output (str): Folder in which ``bids_dataframe.csv`` is written.
    """
    # TODO: when integrating in pipeline, remove format_path_data here (done before in main)
    self.paths_data = imed_utils.format_path_data(loader_params['path_data'])

    # Optional entry: None when the key is absent.
    self.bids_config = loader_params.get('bids_config')

    # Work on a copy so that the caller's loader parameters are never modified.
    suffixes = copy.deepcopy(loader_params['target_suffix'])
    if any(isinstance(entry, list) for entry in suffixes):
        # Flatten one level of nesting. Plain strings are kept whole: chaining
        # over a mixed list would split them into single characters.
        flattened = []
        for entry in suffixes:
            if isinstance(entry, list):
                flattened.extend(entry)
            else:
                flattened.append(entry)
        suffixes = flattened
    self.target_suffix = suffixes

    # The ROI suffix, when defined, is looked up like any other target suffix.
    self.roi_suffix = loader_params['roi_params']['suffix']
    if self.roi_suffix is not None:
        self.target_suffix.append(self.roi_suffix)

    self.extensions = loader_params['extensions']
    self.contrast_lst = loader_params["contrast_params"]["contrast_lst"]
    self.derivatives = derivatives

    # Start from an empty dataframe, let create_bids_dataframe() fill it, then save it.
    self.df = pd.DataFrame()
    self.create_bids_dataframe()
    self.save(os.path.join(path_output, "bids_dataframe.csv"))
def create_dataset_and_ivadomed_version_log(context):
    """Write ``version_info.log`` into the output folder.

    The log contains the ivadomed version string, the commit reported for each
    dataset folder, the operating system, the CPU count and a dump of every
    key/value pair of ``context``.

    Args:
        context (dict): Configuration dictionary. ``context['loader_parameters']['path_data']``
            (a string or a list of strings) and ``context['path_output']`` are read here.

    Raises:
        Exception: If the log file cannot be opened for writing.
    """
    path_data = context['loader_parameters']['path_data']

    ivadomed_version = imed_utils._version_string()
    datasets_version = []

    if isinstance(path_data, str):
        datasets_version = [imed_utils.__get_commit(path_to_git_folder=path_data)]
    elif isinstance(path_data, list):
        datasets_version = [imed_utils.__get_commit(path_to_git_folder=dataset) for dataset in path_data]

    log_file = os.path.join(context['path_output'], 'version_info.log')

    try:
        log = open(log_file, "w")
    except OSError as err:
        print("OS error: {0}".format(err))
        raise Exception(
            "Have you selected a log folder, and do you have write permissions for that folder?") from err

    # The context manager closes the file even if one of the writes below fails.
    with log:
        # IVADOMED
        log.write('IVADOMED TOOLBOX\n----------------\n(' + ivadomed_version + ')')

        # DATASETS
        path_data = imed_utils.format_path_data(path_data)
        log.write('\n\n\nDATASET VERSION\n---------------\n')
        log.write('The following BIDS dataset(s) were used for training.\n')

        for i_dataset, dataset_path in enumerate(path_data):
            version = datasets_version[i_dataset]
            if version not in ['', '?!?']:
                log.write(str(i_dataset + 1) + '. ' + dataset_path + ' - Dataset Annex version: ' + version + '\n')
            else:
                log.write(str(i_dataset + 1) + '. ' + dataset_path + ' - Dataset is not Annexed.\n')

        # SYSTEM INFO
        log.write('\n\nSYSTEM INFO\n-------------\n')
        platform_running = sys.platform
        # 'darwin' must be tested before the Windows names because it contains 'win'.
        # Substring tests replace str.find(), whose return value (-1 when absent,
        # 0 for a match at the start) cannot be used directly as a boolean.
        if 'darwin' in platform_running:
            os_running = 'osx'
        elif 'linux' in platform_running:
            os_running = 'linux'
        elif 'win32' in platform_running or 'win64' in platform_running:
            os_running = 'windows'
        else:
            os_running = 'NA'

        log.write('OS: ' + os_running + ' (' + platform.platform() + ')\n')

        # Display number of CPU cores
        log.write('CPU cores: Available: {}\n\n\n\n\n'.format(multiprocessing.cpu_count()))

        # USER INPUTS
        log.write('CONFIG INPUTS\n-------------\n')
        for key, value in context.items():
            # str() makes sure numbers and nested structures can be written
            log.write(str(key) + ': ' + str(value) + '\n')
def run_command(context, n_gif=0, thr_increment=None, resume_training=False):
    """Run the command selected in the configuration (train, test or segment).

    Training, testing / evaluation and segmentation are all started from this function; every
    process parameter comes from ``context``.

    Args:
        context (dict): Dictionary containing all parameters that are needed for a given process. See
            :doc:`configuration_file` for more details.
        n_gif (int): Generates a GIF during training if larger than zero, one frame per epoch for a given slice.
            The parameter indicates the number of 2D slices used to generate GIFs, one GIF per slice. A GIF shows
            predictions of a given slice from the validation sub-dataset. They are saved within the output path.
        thr_increment (float): When set, a threshold analysis is run with the saved best model on the training +
            validation sub-datasets to find the optimal binarization threshold. The value is the increment
            between 0 and 1 used during the ROC analysis (e.g. 0.1). The threshold found is written into the
            ``postprocessing`` section of the saved configuration.
        resume_training (bool): Forwarded to the training routine to resume from a previously saved training state.

    Returns:
        tuple or None:
            * "train" command: ``(best_training_dice, best_training_loss, best_validation_dice,
              best_validation_loss)``.
            * "test" command: ``(df_results, pred_metrics)``, i.e. the evaluation results followed by the
              prediction metrics returned by the testing routine.
            * "segment" command: None.
    """

    def uses_film(params):
        # FiLM is active when at least one entry of 'film_layers' is truthy
        return 'film_layers' in params and any(params['film_layers'])

    command = copy.deepcopy(context["command"])
    path_output = set_output_path(context)

    # Log the version of the ivadomed software and of the annexed dataset(s), if present
    create_dataset_and_ivadomed_version_log(context)

    cuda_available, device = imed_utils.define_device(context['gpu_ids'][0])

    # BACKWARDS COMPATIBILITY: a string path is converted to a list here so that it propagates to all functions
    loader_section = context['loader_parameters']
    loader_section['path_data'] = imed_utils.format_path_data(loader_section['path_data'])

    loader_params = set_loader_params(context, command == "train")

    # One set of transformation parameters per sub-dataset
    train_tf, valid_tf, test_tf = imed_transforms.get_subdatasets_transforms(context["transformation"])

    model_params, loader_params = set_model_params(context, loader_params)

    # "segment" uses all participants of the data path, hence no split and nothing else to prepare
    if command == 'segment':
        run_segment_command(context, model_params)
        return

    train_lst, valid_lst, test_lst = imed_loader_utils.get_subdatasets_subjects_list(
        context["split_dataset"],
        context['loader_parameters']['path_data'],
        path_output,
        context["loader_parameters"]['subject_selection'])

    # TESTING PARAMS
    # With aleatoric uncertainty the training transformations are applied at test time
    uncertainty = context['uncertainty']
    transformation_dict = train_tf if uncertainty['aleatoric'] and uncertainty['n_it'] > 0 else test_tf
    undo_transforms = imed_transforms.UndoCompose(
        imed_transforms.Compose(transformation_dict, requires_undo=True))

    testing_params = copy.deepcopy(context["training_parameters"])
    testing_params.update({
        'uncertainty': context["uncertainty"],
        'target_suffix': loader_params["target_suffix"],
        'undo_transforms': undo_transforms,
        'slice_axis': loader_params['slice_axis'],
    })

    # Display the transformations of the sub-datasets used by the current command
    transforms_to_display = {
        "train": [(train_tf, "training"), (valid_tf, "validation")],
        "test": [(transformation_dict, "testing")],
    }
    for tf_params, subset_name in transforms_to_display.get(command, []):
        imed_utils.display_selected_transfoms(tf_params, dataset_type=[subset_name])

    # Check if multiple raters
    check_multiple_raters(command != "train", loader_params)

    if command == 'train':
        ds_valid = get_dataset(loader_params, valid_lst, valid_tf, cuda_available, device, 'validation')
        ds_train = get_dataset(loader_params, train_lst, train_tf, cuda_available, device, 'training')
        metric_fns = imed_metrics.get_metric_fns(ds_train.task)

        # If FiLM, normalize data
        train_onehotencoder = None
        if uses_film(model_params):
            model_params, ds_train, ds_valid, train_onehotencoder = film_normalize_data(
                context, model_params, ds_train, ds_valid, path_output)

        # Model directory
        create_path_model(context, model_params, ds_train, path_output, train_onehotencoder)
        save_config_file(context, path_output)

        # RUN TRAINING
        best_training_dice, best_training_loss, best_validation_dice, best_validation_loss = \
            imed_training.train(model_params=model_params,
                                dataset_train=ds_train,
                                dataset_val=ds_valid,
                                training_params=context["training_parameters"],
                                path_output=path_output,
                                device=device,
                                cuda_available=cuda_available,
                                metric_fns=metric_fns,
                                n_gif=n_gif,
                                resume_training=resume_training,
                                debugging=context["debugging"])

    if thr_increment:
        # The validation dataset is already loaded when the command is "train"
        if command != 'train':
            ds_valid = get_dataset(loader_params, valid_lst, valid_tf, cuda_available, device, 'validation')
        # The training dataset is (re)loaded with the validation transformations: no data augmentation
        ds_train = get_dataset(loader_params, train_lst, valid_tf, cuda_available, device, 'training')

        # Choice of optimisation metric
        if model_params["name"] in imed_utils.CLASSIFIER_LIST:
            metric = "recall_specificity"
        else:
            metric = "dice"

        # Run analysis
        thr = imed_testing.threshold_analysis(model_path=os.path.join(path_output, "best_model.pt"),
                                              ds_lst=[ds_train, ds_valid],
                                              model_params=model_params,
                                              testing_params=testing_params,
                                              metric=metric,
                                              increment=thr_increment,
                                              fname_out=os.path.join(path_output, "roc.png"),
                                              cuda_available=cuda_available)

        # Update threshold in config file
        context["postprocessing"]["binarize_prediction"] = {"thr": thr}
        save_config_file(context, path_output)

    if command == 'train':
        return best_training_dice, best_training_loss, best_validation_dice, best_validation_loss

    if command == 'test':
        # LOAD DATASET: loader parameters overridden with the testing-specific entries
        test_loader_kwargs = dict(loader_params)
        test_loader_kwargs.update({
            'data_list': test_lst,
            'transforms_params': transformation_dict,
            'dataset_type': 'testing',
            'requires_undo': True,
        })
        ds_test = imed_loader.load_dataset(**test_loader_kwargs, device=device, cuda_available=cuda_available)

        metric_fns = imed_metrics.get_metric_fns(ds_test.task)

        if uses_film(model_params):
            ds_test, model_params = update_film_model_params(context, ds_test, model_params, path_output)

        # RUN INFERENCE
        pred_metrics = imed_testing.test(model_params=model_params,
                                         dataset_test=ds_test,
                                         testing_params=testing_params,
                                         path_output=path_output,
                                         device=device,
                                         cuda_available=cuda_available,
                                         metric_fns=metric_fns,
                                         postprocessing=context['postprocessing'])

        # RUN EVALUATION
        df_results = imed_evaluation.evaluate(path_data=loader_params['path_data'],
                                              path_output=path_output,
                                              target_suffix=loader_params["target_suffix"],
                                              eval_params=context["evaluation_parameters"])
        return df_results, pred_metrics
def run_segment_command(context, model_params):
    """Segment every selected subject with the packaged model and save the predictions.

    The ``postprocessing`` section of ``context`` is first written into the JSON configuration of the
    packaged model. Predictions are saved as NIfTI files in ``<path_output>/pred_masks``.

    In multichannel mode the images of one subject are gathered, in the order of the testing contrasts,
    into a single model input. A subject for which the number of gathered images differs from the number
    of requested contrasts is skipped with a warning.

    Args:
        context (dict): Configuration dictionary. The keys read here are ``loader_parameters``,
            ``path_output``, ``model_name``, ``postprocessing`` and ``gpu_ids``.
        model_params (dict): Model parameters. ``film_layers`` and ``metadata`` decide whether a
            metadata value read from the merged participants dataframe is passed to the model.
    """
    path_data = imed_utils.format_path_data(context["loader_parameters"]["path_data"])
    bids_ds = [bids.BIDS(bids_folder) for bids_folder in path_data]

    # Get the merged df from all dataset paths
    df = imed_loader_utils.merge_bids_datasets(path_data)
    # A set gives constant-time membership tests in the filter below
    selected_ids = set(df['participant_id'].tolist())

    # Append subjects from all BIDS datasets into a list
    bids_subjects = [
        s for dataset in bids_ds for s in dataset.get_subjects()
        if s.record["subject_id"] in selected_ids
    ]

    # Add postprocessing to packaged model
    path_model = os.path.join(context['path_output'], context['model_name'])
    path_model_config = os.path.join(path_model, context['model_name'] + ".json")
    model_config = imed_config_manager.load_json(path_model_config)
    model_config['postprocessing'] = context['postprocessing']
    with open(path_model_config, 'w') as fp:
        json.dump(model_config, fp, indent=4)

    pred_path = os.path.join(context['path_output'], "pred_masks")
    options = None
    for subject, fname_img in _iter_segment_inputs(bids_subjects, context['loader_parameters']):
        if 'film_layers' in model_params and any(model_params['film_layers']) and model_params['metadata']:
            subj_id = subject.record['subject_id']
            metadata = df[df['participant_id'] == subj_id][model_params['metadata']].values[0]
            options = {'metadata': metadata}

        pred_list, target_list = imed_inference.segment_volume(path_model,
                                                               fname_images=fname_img,
                                                               gpu_id=context['gpu_ids'][0],
                                                               options=options)
        if not os.path.exists(pred_path):
            os.makedirs(pred_path)

        for pred, target in zip(pred_list, target_list):
            filename = subject.record['subject_id'] + "_" + subject.record['modality'] + target + "_pred" + \
                       ".nii.gz"
            nib.save(pred, os.path.join(pred_path, filename))


def _iter_segment_inputs(bids_subjects, loader_parameters):
    """Yield ``(subject, fname_img)`` pairs, one per call to the segmentation model.

    ``bids_subjects`` is never modified: removing entries from it while looping over it, as was done
    before, shifts the remaining entries and makes the loop skip subjects.
    """
    if not loader_parameters['multichannel']:
        for subject in bids_subjects:
            yield subject, [subject.record['absolute_path']]
        return

    contrasts = loader_parameters['contrast_params']['testing']

    # Group the records per subject, keeping the order in which subjects first appear
    subject_order = []
    records_by_subject = {}
    for entry in bids_subjects:
        subject_id = entry.record['subject_id']
        if subject_id not in records_by_subject:
            records_by_subject[subject_id] = []
            subject_order.append(subject_id)
        records_by_subject[subject_id].append(entry)

    for subject_id in subject_order:
        records = records_by_subject[subject_id]
        fname_img = []
        provided_contrasts = []
        # Keep contrast order
        for contrast in contrasts:
            for entry in records:
                if entry.record['modality'] == contrast:
                    provided_contrasts.append(contrast)
                    fname_img.append(entry.record['absolute_path'])

        if len(fname_img) != len(contrasts):
            logger.warning("Missing contrast for subject {}. {} were provided but {} are required. Skipping "
                           "subject.".format(subject_id, provided_contrasts, contrasts))
            continue

        # The output file is named after the first record of the subject that is one of the inputs
        representative = next((entry for entry in records if entry.record['modality'] in contrasts),
                              records[0])
        yield representative, fname_img
def __init__(self, bids_df, path_data, subject_lst, target_suffix, contrast_lst, path_hdf5,
             contrast_balance=None, slice_axis=2, metadata_choice=False, slice_filter_fn=None,
             roi_params=None, transform=None, object_detection_params=None, soft_gt=False):
    """Pair each selected image with its derivatives and convert the result to an HDF5 file.

    Args:
        bids_df: Object exposing the dataset dataframe as ``bids_df.df`` (columns ``filename``,
            ``participant_id``, ``suffix`` and ``path`` are read) and the helpers
            ``get_subject_fnames()``, ``get_deriv_fnames()`` and ``get_derivatives()``.
        path_data (str or list): Dataset folder(s); only passed through ``format_path_data``.
        subject_lst (list): Filenames to load. Participant ids are accepted for backward compatibility.
        target_suffix (list): Suffixes of the target derivatives, one entry per target.
        contrast_lst (list): Contrasts forwarded to the bounding-box loader.
        path_hdf5 (str): Path of the HDF5 file to write.
        contrast_balance (dict): Maximum fraction of subjects to keep for each listed contrast.
            None (default) disables balancing.
        slice_axis (int): Stored on the instance and in the HDF5 attributes.
        metadata_choice: ``'mri_params'`` enables the MRI parameter check; the value is also stored
            in the HDF5 attributes.
        slice_filter_fn: Stored on the instance.
        roi_params (dict): Its ``suffix`` entry selects the ROI derivative. None (default) means no ROI.
        transform: Pair unpacked into ``self.prepro_transforms`` and ``self.transform``.
        object_detection_params: Forwarded to the bounding-box loader.
        soft_gt (bool): Stored on the instance.
    """
    print("Starting conversion")

    path_data = imed_utils.format_path_data(path_data)

    # The signature defaults are None: treat them as "no balancing" and "no ROI" instead of
    # failing later on `.keys()` or `["suffix"]`
    if contrast_balance is None:
        contrast_balance = {}
    if roi_params is None:
        roi_params = {"suffix": None, "slice_filter_roi": None}
    roi_suffix = roi_params["suffix"]

    # Sort subject_lst and create a sub-dataframe from bids_df containing only subjects from subject_lst
    subject_lst = sorted(subject_lst)
    df_subjects = bids_df.df[bids_df.df['filename'].isin(subject_lst)]
    # Backward compatibility for subject_lst containing participant_ids instead of filenames
    if df_subjects.empty:
        df_subjects = bids_df.df[bids_df.df['participant_id'].isin(subject_lst)]
        subject_lst = sorted(df_subjects['filename'].to_list())

    self.soft_gt = soft_gt
    self.dt = h5py.special_dtype(vlen=str)
    self.path_hdf5 = path_hdf5
    list_patients = []

    self.filename_pairs = []

    if metadata_choice == 'mri_params':
        self.metadata = {"FlipAngle": [], "RepetitionTime": [], "EchoTime": [], "Manufacturer": []}

    self.prepro_transforms, self.transform = transform

    # Number of subjects for each contrast of contrast_balance. `.get(True, 0)` keeps a contrast that is
    # absent from this subset at zero instead of raising KeyError.
    tot = {contrast: df_subjects['suffix'].str.fullmatch(contrast).value_counts().get(True, 0)
           for contrast in contrast_balance}

    # Counter that helps to balance the contrasts
    c = {contrast: 0 for contrast in contrast_balance}

    # Get all subjects path from bids_df for bounding box
    get_all_subj_path = bids_df.df[bids_df.df['filename'].str.contains(
        '|'.join(bids_df.get_subject_fnames()))]['path'].to_list()

    # Load bounding box from list of path
    self.has_bounding_box = True
    bounding_box_dict = imed_obj_detect.load_bounding_boxes(object_detection_params,
                                                            get_all_subj_path,
                                                            slice_axis,
                                                            contrast_lst)

    # Get all derivatives filenames from bids_df
    all_deriv = bids_df.get_deriv_fnames()

    for subject in tqdm(subject_lst, desc="Loading dataset"):
        df_sub = df_subjects.loc[df_subjects['filename'] == subject]

        # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
        contrast = df_sub['suffix'].values[0]
        if contrast in contrast_balance:
            c[contrast] = c[contrast] + 1
            if c[contrast] / tot[contrast] > contrast_balance[contrast]:
                continue

        target_filename, roi_filename = [None] * len(target_suffix), None

        # An empty pattern would match every row of the dataframe and pair this subject with the
        # derivatives of other subjects, so it is handled as "no derivative".
        deriv_pattern = '|'.join(bids_df.get_derivatives(subject, all_deriv))
        if deriv_pattern:
            derivatives = bids_df.df[bids_df.df['filename'].str.contains(deriv_pattern)]['path'].to_list()
        else:
            derivatives = []

        for deriv in derivatives:
            for idx, suffix in enumerate(target_suffix):
                if suffix in deriv:
                    target_filename[idx] = deriv
            if roi_suffix is not None and roi_suffix in deriv:
                roi_filename = [deriv]

        # Skip subjects without any target, or without the ROI when one is requested
        if (not any(target_filename)) or (roi_suffix is not None and roi_filename is None):
            continue

        metadata = df_sub.to_dict(orient='records')[0]
        metadata['contrast'] = contrast

        if len(bounding_box_dict):
            # Take only one bounding box for cropping
            metadata['bounding_box'] = bounding_box_dict[str(df_sub['path'].values[0])][0]

        if metadata_choice == 'mri_params':
            # A list (not a generator) so that the check is called for every parameter
            if not all([imed_film.check_isMRIparam(m, metadata, subject, self.metadata)
                        for m in self.metadata.keys()]):
                continue

        # Get subj_id (prefix filename without modality suffix and extension)
        subj_id = re.sub(r'_' + contrast + '.*', '', subject)

        self.filename_pairs.append((subj_id, [df_sub['path'].values[0]],
                                    target_filename, roi_filename, [metadata]))
        list_patients.append(subj_id)

    self.slice_axis = slice_axis
    self.slice_filter_fn = slice_filter_fn

    # Update HDF5 metadata
    with h5py.File(self.path_hdf5, "w") as hdf5_file:
        hdf5_file.attrs.create('patients_id', list(set(list_patients)), dtype=self.dt)
        hdf5_file.attrs['slice_axis'] = slice_axis

        hdf5_file.attrs['slice_filter_fn'] = [('filter_empty_input', True), ('filter_empty_mask', False)]
        hdf5_file.attrs['metadata_choice'] = metadata_choice

    # Save images into HDF5 file
    self._load_filenames()
    print("Files loaded.")
def __init__(self, bids_df, path_data, subject_lst, target_suffix, contrast_params, slice_axis=2, cache=True,
             transform=None, metadata_choice=False, slice_filter_fn=None, roi_params=None, multichannel=False,
             object_detection_params=None, task="segmentation", soft_gt=False):
    """Build the (image, target, ROI, metadata) tuples of the selected subjects and initialise the parent.

    Args:
        bids_df: Object exposing the dataset dataframe as ``bids_df.df`` (columns ``filename``,
            ``participant_id``, ``suffix`` and ``path`` are read) and the helpers
            ``get_subject_fnames()``, ``get_deriv_fnames()`` and ``get_derivatives()``.
        path_data (str or list): Dataset folder(s); only passed through ``format_path_data``.
        subject_lst (list): Filenames to load. Participant ids are accepted for backward compatibility.
        target_suffix (list): One entry per target: a string (single rater) or a list of strings
            (several raters for the same class).
        contrast_params (dict): ``balance`` maps a contrast to the maximum fraction of its subjects to
            keep; ``contrast_lst`` gives the channel order in multichannel mode.
        slice_axis, cache, transform, slice_filter_fn, task: Forwarded to the parent constructor.
        metadata_choice: ``'mri_params'`` enables the MRI parameter check. Any other truthy value except
            ``'contrasts'`` is taken as the name of a dataframe column to add to the metadata.
        roi_params (dict): Its ``suffix`` entry selects the ROI derivative. None means no ROI.
        multichannel (bool): Gather all contrasts of a subject into one entry.
        object_detection_params: Forwarded to the bounding-box loader.
        soft_gt (bool): Forwarded to the parent constructor.

    Raises:
        ValueError: If ``metadata_choice`` names a column that is not in the dataframe.
        Exception: If no subject is selected.
    """
    path_data = imed_utils.format_path_data(path_data)
    self.roi_params = roi_params if roi_params is not None else {"suffix": None, "slice_filter_roi": None}
    roi_suffix = self.roi_params["suffix"]
    self.soft_gt = soft_gt
    self.filename_pairs = []
    if metadata_choice == 'mri_params':
        self.metadata = {"FlipAngle": [], "RepetitionTime": [], "EchoTime": [], "Manufacturer": []}

    # Sort subject_lst and create a sub-dataframe from bids_df containing only subjects from subject_lst
    subject_lst = sorted(subject_lst)
    df_subjects = bids_df.df[bids_df.df['filename'].isin(subject_lst)]
    # Backward compatibility for subject_lst containing participant_ids instead of filenames
    if df_subjects.empty:
        df_subjects = bids_df.df[bids_df.df['participant_id'].isin(subject_lst)]
        subject_lst = sorted(df_subjects['filename'].to_list())

    balance = contrast_params["balance"]
    # Number of subjects for each balanced contrast. `.get(True, 0)` keeps a contrast that is absent
    # from this subset at zero instead of raising KeyError.
    tot = {contrast: df_subjects['suffix'].str.fullmatch(contrast).value_counts().get(True, 0)
           for contrast in balance}
    # Counter that helps to balance the contrasts
    c = {contrast: 0 for contrast in balance}

    # Subject ids (filename without modality suffix and extension). They are sorted so that the order
    # of the multichannel entries does not depend on string hashing.
    subject_ids = set()
    for subject in subject_lst:
        suffix = df_subjects.loc[df_subjects['filename'] == subject]['suffix'].values[0]
        subject_ids.add(re.sub(r'_' + suffix + '.*', '', subject))
    subject_ids = sorted(subject_ids)

    # Create multichannel_subjects dictionary for each subject_id
    multichannel_subjects = {}
    if multichannel:
        num_contrast = len(contrast_params["contrast_lst"])
        # Channel index of each contrast
        idx_dict = {contrast: idx for idx, contrast in enumerate(contrast_params["contrast_lst"])}
        multichannel_subjects = {subject: {"absolute_paths": [None] * num_contrast,
                                           "deriv_path": None,
                                           "roi_filename": None,
                                           "metadata": [None] * num_contrast}
                                 for subject in subject_ids}

    # Get all subjects path from bids_df for bounding box
    get_all_subj_path = bids_df.df[bids_df.df['filename']
                                   .str.contains('|'.join(bids_df.get_subject_fnames()))]['path'].to_list()

    # Load bounding box from list of path
    bounding_box_dict = imed_obj_detect.load_bounding_boxes(object_detection_params,
                                                            get_all_subj_path,
                                                            slice_axis,
                                                            contrast_params["contrast_lst"])

    # Get all derivatives filenames from bids_df
    all_deriv = bids_df.get_deriv_fnames()

    # Create filename_pairs
    for subject in tqdm(subject_lst, desc="Loading dataset"):
        df_sub = df_subjects.loc[df_subjects['filename'] == subject]

        # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
        contrast = df_sub['suffix'].values[0]
        if contrast in balance:
            c[contrast] = c[contrast] + 1
            if c[contrast] / tot[contrast] > balance[contrast]:
                continue

        roi_filename = None
        if isinstance(target_suffix[0], str):
            target_filename = [None] * len(target_suffix)
        else:
            target_filename = [[] for _ in range(len(target_suffix))]

        # An empty pattern would match every row of the dataframe and pair this subject with the
        # derivatives of other subjects, so it is handled as "no derivative".
        deriv_pattern = '|'.join(bids_df.get_derivatives(subject, all_deriv))
        if deriv_pattern:
            derivatives = bids_df.df[bids_df.df['filename'].str.contains(deriv_pattern)]['path'].to_list()
        else:
            derivatives = []

        for deriv in derivatives:
            for idx, suffix_list in enumerate(target_suffix):
                # If suffix_list is a string, then only one rater annotation per class is available.
                # Otherwise, multiple raters segmented the same class.
                if isinstance(suffix_list, list):
                    for suffix in suffix_list:
                        if suffix in deriv:
                            target_filename[idx].append(deriv)
                elif suffix_list in deriv:
                    target_filename[idx] = deriv
            if roi_suffix is not None and roi_suffix in deriv:
                roi_filename = [deriv]

        # Skip subjects without any target, or without the ROI when one is requested
        if (not any(target_filename)) or (roi_suffix is not None and roi_filename is None):
            continue

        metadata = df_sub.to_dict(orient='records')[0]
        metadata['contrast'] = contrast

        if len(bounding_box_dict):
            # Take only one bounding box for cropping
            metadata['bounding_box'] = bounding_box_dict[str(df_sub['path'].values[0])][0]

        if metadata_choice == 'mri_params':
            # A list (not a generator) so that the check is called for every parameter
            if not all([imed_film.check_isMRIparam(m, metadata, subject, self.metadata)
                        for m in self.metadata.keys()]):
                continue
        elif metadata_choice and metadata_choice != 'contrasts':
            # add custom data to metadata
            if metadata_choice not in df_sub.columns:
                raise ValueError("The following metadata cannot be found: {}. "
                                 "Invalid metadata choice.".format(metadata_choice))
            metadata[metadata_choice] = df_sub[metadata_choice].values[0]
            # Create metadata dict for OHE: each distinct value of the column gets an index
            data_lst = sorted(set(bids_df.df[metadata_choice].dropna().values))
            metadata['metadata_dict'] = {data: idx for idx, data in enumerate(data_lst)}

        # Fill multichannel dictionary
        # subj_id is the filename without modality suffix and extension
        if multichannel:
            idx = idx_dict[contrast]
            subj_id = re.sub(r'_' + contrast + '.*', '', subject)
            entry = multichannel_subjects[subj_id]
            entry["absolute_paths"][idx] = df_sub['path'].values[0]
            entry["deriv_path"] = target_filename
            entry["metadata"][idx] = metadata
            if roi_filename:
                entry["roi_filename"] = roi_filename
        else:
            self.filename_pairs.append(([df_sub['path'].values[0]],
                                        target_filename, roi_filename, [metadata]))

    if multichannel:
        # Only subjects for which every contrast was found are kept
        for subject in multichannel_subjects.values():
            if None not in subject["absolute_paths"]:
                self.filename_pairs.append((subject["absolute_paths"], subject["deriv_path"],
                                            subject["roi_filename"], subject["metadata"]))

    if not self.filename_pairs:
        raise Exception('No subjects were selected - check selection of parameters on config.json (e.g. center selected + target_suffix)')

    super().__init__(self.filename_pairs, slice_axis, cache, transform, slice_filter_fn, task, self.roi_params,
                     self.soft_gt)
def __init__(self, path_data, subject_lst, target_suffix, contrast_lst, path_hdf5,
             contrast_balance=None, slice_axis=2, metadata_choice=False, slice_filter_fn=None,
             roi_params=None, transform=None, object_detection_params=None, soft_gt=False):
    """Pair each selected BIDS image with its label derivatives and convert the result to an HDF5 file.

    Args:
        path_data (str or list): BIDS folder(s) to read.
        subject_lst (list): Subject ids to load.
        target_suffix (list): Suffixes of the target derivatives, one entry per target.
        contrast_lst (list): Contrasts (modalities) to load.
        path_hdf5 (str): Path of the HDF5 file to write.
        contrast_balance (dict): Maximum fraction of subjects to keep for each listed contrast.
            None (default) disables balancing.
        slice_axis (int): Stored on the instance and in the HDF5 attributes.
        metadata_choice: ``'mri_params'`` enables the MRI parameter check; the value is also stored
            in the HDF5 attributes.
        slice_filter_fn: Stored on the instance.
        roi_params (dict): Its ``suffix`` entry selects the ROI derivative. None (default) means no ROI.
        transform: Pair unpacked into ``self.prepro_transforms`` and ``self.transform``.
        object_detection_params: Forwarded to the bounding-box loader.
        soft_gt (bool): Stored on the instance.
    """
    print("Starting conversion")

    # The signature defaults are None: treat them as "no balancing" and "no ROI" instead of
    # failing later on `.keys()` or `["suffix"]`
    if contrast_balance is None:
        contrast_balance = {}
    if roi_params is None:
        roi_params = {"suffix": None, "slice_filter_roi": None}
    roi_suffix = roi_params["suffix"]

    # Getting all patients id
    path_data = imed_utils.format_path_data(path_data)
    self.bids_ds = [bids.BIDS(bids_folder) for bids_folder in path_data]

    # Subjects of all BIDS datasets, gathered in a new list so that the list returned by a dataset
    # is never extended in place
    get_subjects_all = []
    for dataset in self.bids_ds:
        get_subjects_all.extend(dataset.get_subjects())

    # Keep only the requested subjects
    wanted_ids = set(subject_lst)
    bids_subjects = [s for s in get_subjects_all if s.record["subject_id"] in wanted_ids]

    self.soft_gt = soft_gt
    self.dt = h5py.special_dtype(vlen=str)
    self.path_hdf5 = path_hdf5
    list_patients = []

    self.filename_pairs = []

    if metadata_choice == 'mri_params':
        self.metadata = {"FlipAngle": [], "RepetitionTime": [], "EchoTime": [], "Manufacturer": []}

    self.prepro_transforms, self.transform = transform

    # Number of subjects for each contrast of contrast_balance
    tot = {contrast: len([s for s in bids_subjects if s.record["modality"] == contrast])
           for contrast in contrast_balance}

    # Counter that helps to balance the contrasts
    c = {contrast: 0 for contrast in contrast_balance}

    self.has_bounding_box = True
    bounding_box_dict = imed_obj_detect.load_bounding_boxes(object_detection_params,
                                                            get_subjects_all,
                                                            slice_axis,
                                                            contrast_lst)

    for subject in tqdm(bids_subjects, desc="Loading dataset"):
        modality = subject.record["modality"]
        if modality not in contrast_lst:
            continue

        # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
        if modality in contrast_balance:
            c[modality] = c[modality] + 1
            if c[modality] / tot[modality] > contrast_balance[modality]:
                continue

        if not subject.has_derivative("labels"):
            print("Subject without derivative, skipping.")
            continue
        derivatives = subject.get_derivatives("labels")

        target_filename, roi_filename = [None] * len(target_suffix), None

        for deriv in derivatives:
            for idx, suffix in enumerate(target_suffix):
                if deriv.endswith(modality + suffix + ".nii.gz"):
                    target_filename[idx] = deriv
            if roi_suffix is not None and deriv.endswith(modality + roi_suffix + ".nii.gz"):
                roi_filename = [deriv]

        # Skip subjects without any target, or without the ROI when one is requested
        if (not any(target_filename)) or (roi_suffix is not None and roi_filename is None):
            continue

        if not subject.has_metadata():
            print("Subject without metadata.")
            metadata = {}
        else:
            metadata = subject.metadata()
        # The contrast is recorded for every subject, including those without a metadata file
        metadata['contrast'] = modality

        if metadata_choice == 'mri_params':
            # A list (not a generator) so that the check is called for every parameter
            if not all([imed_film.check_isMRIparam(m, metadata) for m in self.metadata.keys()]):
                continue

        if len(bounding_box_dict):
            # Take only one bounding box for cropping
            metadata['bounding_box'] = bounding_box_dict[str(subject.record["absolute_path"])][0]

        self.filename_pairs.append((subject.record["subject_id"], [subject.record["absolute_path"]],
                                    target_filename, roi_filename, [metadata]))
        list_patients.append(subject.record["subject_id"])

    self.slice_axis = slice_axis
    self.slice_filter_fn = slice_filter_fn

    # Update HDF5 metadata
    with h5py.File(self.path_hdf5, "w") as hdf5_file:
        hdf5_file.attrs.create('patients_id', list(set(list_patients)), dtype=self.dt)
        hdf5_file.attrs['slice_axis'] = slice_axis

        hdf5_file.attrs['slice_filter_fn'] = [('filter_empty_input', True), ('filter_empty_mask', False)]
        hdf5_file.attrs['metadata_choice'] = metadata_choice

    # Save images into HDF5 file
    self._load_filenames()
    print("Files loaded.")
def merge_bids_datasets(path_data):
    """Read the participants.tsv from several BIDS folders and merge them into a single dataframe.

    A ``path_output`` column holding the folder each row was read from is added as the last column.
    When a ``participant_id`` appears more than once, only its first row is kept: the assumption is
    that two datasets containing the same subject contain identical sessions, so only the files of
    the first folder are used. Missing values are replaced with the string "-".

    Args:
        path_data (list) or (str): BIDS folders paths

    Returns:
        df: dataframe with merged subjects and columns

    Raises:
        Exception: If no dataset folder is given.
    """
    path_data = imed_utils.format_path_data(path_data)

    if len(path_data) == 0:
        raise Exception("No dataset folder selected")

    def read_participants(folder):
        # Everything is read as string to avoid a TypeError when merging within the same column
        table = pd.read_table(os.path.join(folder, 'participants.tsv'), encoding="ISO-8859-1").astype(str)
        # Record which dataset the subjects belong to (this will be used later for loading)
        table['path_output'] = folder
        return table

    if len(path_data) == 1:
        # read participants.tsv as pandas dataframe
        df = bids.BIDS(path_data[0]).participants.content
        # Record which dataset the subjects belong to (this will be used later for loading)
        df['path_output'] = path_data[0]
    else:
        # Merge multiple .tsv files into the same dataframe (the outer merge keeps non-overlapping fields)
        df = read_participants(path_data[0])
        for folder in path_data[1:]:
            df = pd.merge(left=df, right=read_participants(folder), how='outer')

    # Keep the first row of each participant_id. Unlike a loop over positions, this does not
    # depend on the dataframe having a default integer index.
    df = df.drop_duplicates(subset='participant_id', keep='first')

    # Rearrange the bids paths to be last column of the dataframe
    cols = [col for col in df.columns if col != "path_output"] + ["path_output"]
    df = df[cols]

    # Substitute NaNs with string: "-". This helps with metadata selection
    df = df.fillna("-")

    return df