def _test_load_images_from_channels(metadata: Any,
                                    image_channel: Any,
                                    ground_truth_channel: Any,
                                    mask_channel: Any) -> None:
    """
    Test if images are loaded as expected from channels.

    Builds a PatientDatasetSource with 2 copies of image_channel and 4 copies of
    ground_truth_channel, loads it via io_util.load_images_from_dataset_source,
    and checks the shapes and contents of the loaded sample.

    :param metadata: Patient metadata to attach to the dataset source.
    :param image_channel: Path-like image channel, duplicated twice. May be falsy to skip image checks.
    :param ground_truth_channel: Path-like ground truth channel, duplicated four times.
    :param mask_channel: Path-like mask channel, or falsy to skip the mask check.
    """
    sample = io_util.load_images_from_dataset_source(
        PatientDatasetSource(
            metadata=metadata,
            image_channels=[image_channel] * 2,
            ground_truth_channels=[ground_truth_channel] * 4,
            mask_channel=mask_channel
        )
    )
    if image_channel:
        image_with_header = io_util.load_image(image_channel)
        # The two identical image channels are stacked along a new leading axis.
        assert list(sample.image.shape) == [2] + list(image_with_header.image.shape)
        assert all([np.array_equal(x, image_with_header.image) for x in sample.image])  # type: ignore
        # The mask and label checks compare against image_with_header, so they must
        # stay inside this branch: previously they sat outside it and raised a
        # NameError whenever image_channel was falsy but mask/ground-truth were set.
        if mask_channel:
            assert np.array_equal(sample.mask, image_with_header.image)
        if ground_truth_channel:
            # Labels carry an extra background channel at index 0, hence 4 + 1 = 5.
            assert list(sample.labels.shape) == [5] + list(image_with_header.image.shape)
            assert np.all(sample.labels[0] == 0) and np.all(sample.labels[1:] == 1)
def load_dataset_sources(dataframe: pd.DataFrame,
                         local_dataset_root_folder: Path,
                         image_channels: List[str],
                         ground_truth_channels: List[str],
                         mask_channel: Optional[str],
                         allow_incomplete_labels: bool = False
                         ) -> Dict[str, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary, that contains absolute file paths
    separated for image channels, ground truth channels, and mask channels.

    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided.
        If true, ground truth files are optional. Default value is false.
    :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource.
    """
    expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER}
    # Validate the CSV file: all required columns must be present.
    actual_headers = list(dataframe)
    if not expected_headers.issubset(actual_headers):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(expected_headers, actual_headers))

    # Calculate unique data points, first, and last data point
    unique_ids: List[str] = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(local_dataset_root_folder))

    def get_paths_for_channel_ids(patient_id: str,
                                  channels: List[str],
                                  allow_incomplete_labels_flag: bool) -> List[Optional[Path]]:
        # Resolve each channel name to an absolute file path for the given patient.
        # patient_id is passed explicitly; the original closed over the loop
        # variable defined further below, which was fragile late binding.
        if len(set(channels)) < len(channels):
            raise ValueError(f"ids have duplicated entries: {channels}")
        rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id]
        # converts channels to paths and makes second sanity check for channel data
        paths, failed_channel_info = convert_channels_to_file_paths(channels,
                                                                    rows,
                                                                    local_dataset_root_folder,
                                                                    patient_id,
                                                                    allow_incomplete_labels_flag)
        if failed_channel_info:
            raise ValueError(failed_channel_info)
        return paths

    def get_mask_channel_or_default(patient_id: str) -> Optional[Path]:
        # The mask is optional: a missing channel name, or no resolved path, yields None.
        if mask_channel is None:
            return None
        paths = get_paths_for_channel_ids(patient_id,
                                          channels=[mask_channel],
                                          allow_incomplete_labels_flag=allow_incomplete_labels)
        if len(paths) == 0:
            return None
        return paths[0]

    dataset_sources = {}
    for patient_id in unique_ids:
        metadata = PatientMetadata.from_dataframe(dataframe, patient_id)
        dataset_sources[patient_id] = PatientDatasetSource(
            metadata=metadata,
            image_channels=get_paths_for_channel_ids(patient_id,
                                                     channels=image_channels,  # type: ignore
                                                     allow_incomplete_labels_flag=False),
            mask_channel=get_mask_channel_or_default(patient_id),
            ground_truth_channels=get_paths_for_channel_ids(patient_id,
                                                            channels=ground_truth_channels,  # type: ignore
                                                            allow_incomplete_labels_flag=allow_incomplete_labels),
            allow_incomplete_labels=allow_incomplete_labels)
    return dataset_sources
def load_dataset_sources(dataframe: pd.DataFrame,
                         local_dataset_root_folder: Path,
                         image_channels: List[str],
                         ground_truth_channels: List[str],
                         mask_channel: Optional[str]) -> Dict[int, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary, that contains absolute file paths
    separated for image channels, ground truth channels, and mask channels.

    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource.
    """
    expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER}
    # Validate the CSV file: all required columns must be present.
    actual_headers = list(dataframe)
    if not expected_headers.issubset(actual_headers):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(expected_headers, actual_headers))

    # Calculate unique data points, first, and last data point
    unique_ids = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(local_dataset_root_folder))

    def get_paths_for_channel_ids(patient_id: Any, channels: List[str]) -> List[Path]:
        # Resolve each channel name to an absolute, existing file path for the
        # given patient. patient_id is passed explicitly; the original closed over
        # the loop variable defined further below, which was fragile late binding.
        if len(set(channels)) < len(channels):
            raise ValueError(f"ids have duplicated entries: {channels}")
        paths: List[Path] = []
        rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id]
        for channel_id in channels:
            # Each (patient, channel) pair must map to exactly one CSV row.
            row = rows.loc[rows[CSV_CHANNEL_HEADER] == channel_id]
            if len(row) == 0:
                raise ValueError(f"Patient {patient_id} does not have channel '{channel_id}'")
            elif len(row) > 1:
                raise ValueError(f"Patient {patient_id} has more than one entry for channel '{channel_id}'")
            image_path = local_dataset_root_folder / row[CSV_PATH_HEADER].values[0]
            if not image_path.is_file():
                raise ValueError(f"The dataset references a file that does not exist: {image_path}")
            paths.append(image_path)
        return paths

    def get_mask_channel_or_default(patient_id: Any) -> Optional[Path]:
        # The mask is optional; resolve it only when a channel name was supplied.
        if mask_channel is None:
            return None
        else:
            return get_paths_for_channel_ids(patient_id, channels=[mask_channel])[0]

    dataset_sources = {}
    for patient_id in unique_ids:
        metadata = PatientMetadata.from_dataframe(dataframe, patient_id)
        dataset_sources[patient_id] = PatientDatasetSource(
            metadata=metadata,
            image_channels=get_paths_for_channel_ids(patient_id, channels=image_channels),  # type: ignore
            mask_channel=get_mask_channel_or_default(patient_id),
            ground_truth_channels=get_paths_for_channel_ids(patient_id, channels=ground_truth_channels)  # type: ignore
        )
    return dataset_sources