def test_train_classification_rad_3d():
    application_data = '3d_rad_segmentation'
    # read and initialize parameters for specific data dimension
    parameters = parseConfig(testingDir + '/config_classification.yaml',
                             version_check=False)
    parameters['patch_size'] = patch_size['3D']
    parameters['psize'] = patch_size['3D']
    parameters['model']['dimension'] = 3
    parameters['model']['class_list'] = [0, 1]
    # read and parse csv
    training_data, headers = parseTrainingCSV(
        inputDir + '/train_3d_rad_classification.csv')
    # loop through selected models and train for single epoch
    for model in all_models_regression:
        parameters['model']['architecture'] = model
        shutil.rmtree(outputDir)  # overwrite previous results
        Path(outputDir).mkdir(parents=True, exist_ok=True)
        TrainingManager(dataframe=training_data,
                        headers=headers,
                        outputDir=outputDir,
                        parameters=parameters,
                        device='cpu',
                        reset_prev=True)
    print('passed')
def test_train_segmentation_rad_3d():
    print('Starting 3D Rad segmentation tests')
    application_data = '3d_rad_segmentation'
    parameters = parseConfig(inputDir + '/' + application_data + '/sample_training.yaml',
                             version_check=False)
    parameters['modality'] = 'rad'
    training_data, headers = parseTrainingCSV(
        inputDir + '/train_' + application_data + '.csv')
    for model in all_models_segmentation:
        parameters['model']['architecture'] = model
        currentOutputDir = os.path.join(outputDir, application_data + '_' + model)
        Path(currentOutputDir).mkdir(parents=True, exist_ok=True)
        TrainingManager(dataframe=training_data,
                        headers=headers,
                        outputDir=currentOutputDir,
                        parameters=parameters,
                        device='cpu')
    print('passed')
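# `parseConfig` returns a plain Python dict. A minimal sketch of the keys the
# tests in this file rely on is given below; the name
# `_example_parsed_parameters` and all concrete values are illustrative
# assumptions, not the contents of the repository's sample YAML files.
_example_parsed_parameters = {
    'patch_size': [32, 32, 32],       # spatial size of the extracted patches
    'modality': 'rad',                # radiology ('rad') vs. histology data
    'model': {
        'dimension': 3,               # 2 for 2D data, 3 for 3D data
        'architecture': 'unet',       # overwritten inside each test loop
        'class_list': [0, 1],         # segmentation/classification classes
    },
}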
def test_classification_rad_3d():
    application_data = '3d_rad_segmentation'
    parameters = parseConfig(
        inputDir + '/' + application_data + '/sample_training_classification.yaml',
        version_check=False)
    training_data, headers = parseTrainingCSV(
        inputDir + '/train_3d_rad_classification.csv')
    for model in all_models_regression:
        parameters['model']['architecture'] = model
        shutil.rmtree(outputDir)  # overwrite previous results
        Path(outputDir).mkdir(parents=True, exist_ok=True)
        TrainingManager(dataframe=training_data,
                        headers=headers,
                        outputDir=outputDir,
                        parameters=parameters,
                        device='cpu',
                        reset_prev=True)
    print('passed')
def test_train_segmentation_rad_2d():
    print('Starting 2D Rad segmentation tests')
    application_data = '2d_rad_segmentation'
    parameters = parseConfig(inputDir + '/' + application_data + '/sample_training.yaml',
                             version_check=False)
    parameters['modality'] = 'rad'
    training_data, headers = parseTrainingCSV(
        inputDir + '/train_' + application_data + '.csv')
    for model in all_models_segmentation:
        parameters['model']['architecture'] = model
        shutil.rmtree(outputDir)  # overwrite previous results
        Path(outputDir).mkdir(parents=True, exist_ok=True)
        TrainingManager(dataframe=training_data,
                        headers=headers,
                        outputDir=outputDir,
                        parameters=parameters,
                        device='cpu',
                        reset_prev=True)
    print('passed')
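# The tests above repeat the same reset-and-train loop over a list of
# architectures. A hypothetical helper capturing that shared pattern is
# sketched below; `_train_each_architecture` is not part of GaNDLF's test
# suite and simply reuses the module-level fixtures (outputDir,
# TrainingManager, etc.) that the tests already assume.
def _train_each_architecture(architectures, training_data, headers, parameters):
    for model in architectures:
        parameters['model']['architecture'] = model
        # start every architecture from a clean output directory
        shutil.rmtree(outputDir, ignore_errors=True)
        Path(outputDir).mkdir(parents=True, exist_ok=True)
        TrainingManager(dataframe=training_data,
                        headers=headers,
                        outputDir=outputDir,
                        parameters=parameters,
                        device='cpu',
                        reset_prev=True)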
def preprocess_and_save(data_csv, config_file, output_dir,
                        label_pad_mode="constant", applyaugs=False):
    """
    This function performs preprocessing based on the parameters provided and saves the output.

    Args:
        data_csv (str): The CSV file of the training data.
        config_file (str): The YAML file of the training configuration.
        output_dir (str): The output directory.
        label_pad_mode (str): The padding strategy for the label. Defaults to "constant".
        applyaugs (bool): Whether data augmentation is to be applied before saving the image. Defaults to False.

    Raises:
        ValueError: Parameter check from previous run.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # read the csv
    # don't care if the dataframe gets shuffled or not
    dataframe, headers = parseTrainingCSV(data_csv, train=False)
    parameters = parseConfig(config_file)

    # save the parameters so that the same compute doesn't happen once again
    parameter_file = os.path.join(output_dir, "parameters.pkl")
    if os.path.exists(parameter_file):
        parameters_prev = pickle.load(open(parameter_file, "rb"))
        if parameters != parameters_prev:
            raise ValueError(
                "The parameters are not the same as the ones stored in the previous run, please re-check."
            )
    else:
        with open(parameter_file, "wb") as handle:
            pickle.dump(parameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    parameters = populate_header_in_parameters(parameters, headers)

    data_for_processing = ImagesFromDataFrame(
        dataframe, parameters, train=applyaugs, apply_zero_crop=True, loader_type="full"
    )

    dataloader_for_processing = DataLoader(
        data_for_processing,
        batch_size=1,
        pin_memory=False,
    )

    # initialize a new dict for the preprocessed data
    base_df = get_dataframe(data_csv)
    # ensure csv only contains lower case columns
    base_df.columns = base_df.columns.str.lower()
    # only store the column names
    output_columns_to_write = base_df.to_dict()
    for key in output_columns_to_write.keys():
        output_columns_to_write[key] = []

    # keep a record of the keys which contain only images
    keys_with_images = parameters["headers"]["channelHeaders"]
    keys_with_images = [str(x) for x in keys_with_images]

    ## to-do
    # use dataloader_for_processing to loop through all images
    # if padding is enabled, ensure that it gets applied to the images
    # save the images to disk, but keep a record that these images are preprocessed.
    # create new csv that contains new files.

    # give warning if label sampler is present but number of patches to extract is > 1
    if (
        (parameters["patch_sampler"] == "label")
        or (isinstance(parameters["patch_sampler"], dict))
    ) and parameters["q_samples_per_volume"] > 1:
        print(
            "[WARNING] Label sampling has been enabled but q_samples_per_volume > 1; this has been known to cause issues, so q_samples_per_volume will be hard-coded to 1 during preprocessing. Please contact GaNDLF developers for more information",
            file=sys.stderr,
            flush=True,
        )

    for _, subject in enumerate(
        tqdm(dataloader_for_processing, desc="Looping over data")
    ):
        # initialize the current_output_dir
        current_output_dir = os.path.join(output_dir, str(subject["subject_id"][0]))
        Path(current_output_dir).mkdir(parents=True, exist_ok=True)

        output_columns_to_write["subjectid"].append(subject["subject_id"][0])

        subject_dict_to_write, subject_process = {}, {}

        # start constructing the torchio.Subject object
        for channel in parameters["headers"]["channelHeaders"]:
            # the "squeeze" is needed because the dataloader automatically
            # constructs a 5D tensor with batch_size as the first dimension,
            # but the constructor needs a 4D tensor
            subject_process[str(channel)] = torchio.Image(
                tensor=subject[str(channel)]["data"].squeeze(0),
                type=torchio.INTENSITY,
                path=subject[str(channel)]["path"],
            )
        if parameters["headers"]["labelHeader"] is not None:
            subject_process["label"] = torchio.Image(
                tensor=subject["label"]["data"].squeeze(0),
                type=torchio.LABEL,
                path=subject["label"]["path"],
            )
        subject_dict_to_write = torchio.Subject(subject_process)

        # apply a different padding mode to image and label (so that label information is not duplicated)
        if (parameters["patch_sampler"] == "label") or (
            isinstance(parameters["patch_sampler"], dict)
        ):
            # get the padding size from the patch_size
            psize_pad = list(
                np.asarray(np.ceil(np.divide(parameters["patch_size"], 2)), dtype=int)
            )
            # initialize the padder for images
            padder = torchio.transforms.Pad(
                psize_pad, padding_mode="symmetric", include=keys_with_images
            )
            subject_dict_to_write = padder(subject_dict_to_write)

            if parameters["headers"]["labelHeader"] is not None:
                # initialize the padder for label
                padder_label = torchio.transforms.Pad(
                    psize_pad, padding_mode=label_pad_mode, include="label"
                )
                subject_dict_to_write = padder_label(subject_dict_to_write)

                sampler = torchio.data.LabelSampler(parameters["patch_size"])
                generator = sampler(subject_dict_to_write, num_patches=1)
                for patch in generator:
                    for channel in parameters["headers"]["channelHeaders"]:
                        subject_dict_to_write[str(channel)] = patch[str(channel)]
                    subject_dict_to_write["label"] = patch["label"]

        # write new images
        common_ext = get_filename_extension_sanitized(subject["1"]["path"][0])
        # in cases where the original image has a file format that does not support
        # RGB floats, use the "vtk" format
        if common_ext in [".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"]:
            common_ext = ".vtk"

        if subject["1"]["path"][0] != "":
            image_for_info_copy = sitk.ReadImage(subject["1"]["path"][0])
        else:
            image_for_info_copy = subject_dict_to_write["1"].as_sitk()
        correct_spacing_for_info_copy = subject["spacing"][0].tolist()

        for channel in parameters["headers"]["channelHeaders"]:
            image_file = Path(
                os.path.join(
                    current_output_dir,
                    subject["subject_id"][0] + "_" + str(channel) + common_ext,
                )
            ).as_posix()
            output_columns_to_write["channel_" + str(channel - 1)].append(image_file)
            image_to_write = subject_dict_to_write[str(channel)].as_sitk()
            image_to_write.SetOrigin(image_for_info_copy.GetOrigin())
            image_to_write.SetDirection(image_for_info_copy.GetDirection())
            image_to_write.SetSpacing(correct_spacing_for_info_copy)
            if not os.path.isfile(image_file):
                try:
                    sitk.WriteImage(image_to_write, image_file)
                except IOError:
                    raise IOError(
                        "Could not write image file: {}. Make sure that the file is not open and try again.".format(
                            image_file
                        )
                    )

        # now try to write the label
        if "label" in subject_dict_to_write:
            image_file = Path(
                os.path.join(
                    current_output_dir,
                    subject["subject_id"][0] + "_label" + common_ext,
                )
            ).as_posix()
            output_columns_to_write["label"].append(image_file)
            image_to_write = subject_dict_to_write["label"].as_sitk()
            image_to_write.SetOrigin(image_for_info_copy.GetOrigin())
            image_to_write.SetDirection(image_for_info_copy.GetDirection())
            image_to_write.SetSpacing(correct_spacing_for_info_copy)
            if not os.path.isfile(image_file):
                try:
                    sitk.WriteImage(image_to_write, image_file)
                except IOError:
                    raise IOError(
                        "Could not write image file: {}. Make sure that the file is not open and try again.".format(
                            image_file
                        )
                    )

        # ensure prediction headers are getting saved, as well
        if len(parameters["headers"]["predictionHeaders"]) > 1:
            for key in parameters["headers"]["predictionHeaders"]:
                output_columns_to_write["valuetopredict_" + str(key)].append(
                    str(subject["value_" + str(key)].numpy()[0])
                )
        elif len(parameters["headers"]["predictionHeaders"]) == 1:
            output_columns_to_write["valuetopredict"].append(
                str(subject["value_0"].numpy()[0])
            )

    path_for_csv = Path(os.path.join(output_dir, "data_processed.csv")).as_posix()
    print("Writing final csv for subsequent training: ", path_for_csv)
    pd.DataFrame.from_dict(data=output_columns_to_write).to_csv(
        path_for_csv, header=True, index=False
    )
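# A minimal usage sketch for `preprocess_and_save`; the three paths below are
# placeholders, not files shipped with the repository. Wrapping the call in a
# function keeps it from running at import time.
def _example_preprocess_run():
    preprocess_and_save(
        data_csv="train_3d_rad_segmentation.csv",   # placeholder CSV path
        config_file="sample_training.yaml",         # placeholder YAML path
        output_dir="./preprocessed_output",         # placeholder output dir
        label_pad_mode="constant",
        applyaugs=False,
    )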
def main_run(data_csv, config_file, output_dir, train_mode, device, resume, reset):
    """
    Main function that runs the training and inference.

    Args:
        data_csv (str): The CSV file of the training data.
        config_file (str): The YAML file of the training configuration.
        output_dir (str): The output directory.
        train_mode (bool): Whether to train or infer.
        device (str): The device type.
        resume (bool): Whether the previous run will be resumed or not.
        reset (bool): Whether the previous run will be reset or not.

    Raises:
        ValueError: Parameter check from previous run.
    """
    file_data_full = data_csv
    model_parameters = config_file
    parameters = parseConfig(model_parameters)

    # in case the data being passed is already processed, check if the previous
    # parameters exist; if they do, compare the two and, if they are the same,
    # ensure no preprocessing is done again
    model_parameters_prev = os.path.join(os.path.dirname(output_dir), "parameters.pkl")

    if train_mode:
        if not reset or not resume:
            print(
                "Trying to resume training without changing any parameters from previous run.",
                flush=True,
            )
            if os.path.exists(model_parameters_prev):
                parameters_prev = pickle.load(open(model_parameters_prev, "rb"))
                if parameters != parameters_prev:
                    raise ValueError(
                        "The parameters are not the same as the ones stored in the previous run, please re-check."
                    )

    parameters["output_dir"] = output_dir

    if "-1" in device:
        device = "cpu"

    if train_mode:  # train mode
        Path(parameters["output_dir"]).mkdir(parents=True, exist_ok=True)

    # parse training CSV
    if "," in file_data_full:
        # training and validation pre-split
        data_full = None
        both_csvs = file_data_full.split(",")
        data_train, headers_train = parseTrainingCSV(both_csvs[0], train=train_mode)
        data_validation, headers_validation = parseTrainingCSV(
            both_csvs[1], train=train_mode
        )

        if headers_train != headers_validation:
            sys.exit(
                "The training and validation CSVs do not have the same header information."
            )

        parameters = populate_header_in_parameters(parameters, headers_train)
        # if we are here, it is assumed that the user wants to do training
        if train_mode:
            TrainingManager_split(
                dataframe_train=data_train,
                dataframe_validation=data_validation,
                outputDir=parameters["output_dir"],
                parameters=parameters,
                device=device,
                resume=resume,
                reset=reset,
            )
    else:
        data_full, headers = parseTrainingCSV(file_data_full, train=train_mode)
        parameters = populate_header_in_parameters(parameters, headers)
        if train_mode:
            TrainingManager(
                dataframe=data_full,
                outputDir=parameters["output_dir"],
                parameters=parameters,
                device=device,
                resume=resume,
                reset=reset,
            )
        else:
            InferenceManager(
                dataframe=data_full,
                outputDir=parameters["output_dir"],
                parameters=parameters,
                device=device,
            )
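# A sketch of how `main_run` could be driven from the command line; GaNDLF
# ships its own entry-point scripts, so the argument names below are
# illustrative assumptions rather than the project's actual CLI.
if __name__ == "__main__":
    import argparse

    _parser = argparse.ArgumentParser(description="Run GaNDLF training or inference.")
    _parser.add_argument("--data_csv", required=True, help="data CSV (or 'train.csv,val.csv' for a pre-split)")
    _parser.add_argument("--config_file", required=True, help="YAML training configuration")
    _parser.add_argument("--output_dir", required=True, help="output directory")
    _parser.add_argument("--train", action="store_true", help="train if set, otherwise infer")
    _parser.add_argument("--device", default="cpu", help="'cpu', or a CUDA device string")
    _parser.add_argument("--resume", action="store_true", help="resume the previous run")
    _parser.add_argument("--reset", action="store_true", help="reset the previous run")
    _args = _parser.parse_args()

    main_run(
        data_csv=_args.data_csv,
        config_file=_args.config_file,
        output_dir=_args.output_dir,
        train_mode=_args.train,
        device=_args.device,
        resume=_args.resume,
        reset=_args.reset,
    )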