# Example 1
def _test_load_images_from_channels(
        metadata: Any,
        image_channel: Any,
        ground_truth_channel: Any,
        mask_channel: Any) -> None:
    """
    Check that images, mask, and labels are loaded as expected from the given channels.

    Builds a PatientDatasetSource with the image channel duplicated twice and the
    ground truth channel repeated four times, loads it, and verifies shapes and
    contents against a directly-loaded reference image.
    """
    source = PatientDatasetSource(
        metadata=metadata,
        image_channels=[image_channel, image_channel],
        ground_truth_channels=[ground_truth_channel] * 4,
        mask_channel=mask_channel
    )
    sample = io_util.load_images_from_dataset_source(source)
    if not image_channel:
        return
    reference = io_util.load_image(image_channel)
    reference_shape = list(reference.image.shape)
    # Two identical image channels were requested, so the stacked image has a leading dim of 2.
    assert list(sample.image.shape) == [2] + reference_shape
    assert all(np.array_equal(channel, reference.image) for channel in sample.image)  # type: ignore
    if mask_channel:
        assert np.array_equal(sample.mask, reference.image)
    if ground_truth_channel:
        # Four ground truth channels plus one extra channel (presumably background — TODO confirm).
        assert list(sample.labels.shape) == [5] + reference_shape
        assert np.all(sample.labels[0] == 0) and np.all(sample.labels[1:] == 1)
def load_dataset_sources(
        dataframe: pd.DataFrame,
        local_dataset_root_folder: Path,
        image_channels: List[str],
        ground_truth_channels: List[str],
        mask_channel: Optional[str],
        allow_incomplete_labels: bool = False
) -> Dict[str, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary, with absolute file paths
    separated into image channels, ground truth channels, and mask channels.
    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided. If true, ground
                                    truth files are optional. Default value is false.
    :return: A dictionary mapping from a subject ID to a PatientDatasetSource.
    """
    required_columns = {
        CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER
    }
    # Validate the CSV structure before doing any per-patient work.
    present_columns = list(dataframe)
    if not required_columns.issubset(present_columns):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(required_columns, present_columns))

    # Sorted, de-duplicated subject IDs drive the per-patient loop below.
    subject_ids: List[str] = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(
            local_dataset_root_folder))

    def _channel_paths(subject: str,
                       channels: List[str],
                       allow_incomplete: bool) -> List[Optional[Path]]:
        # Resolves the given channel names to absolute file paths for one subject.
        # Raises if any channel name is duplicated or resolution reports a failure.
        if len(channels) != len(set(channels)):
            raise ValueError(f"ids have duplicated entries: {channels}")
        subject_rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == subject]
        # Converts channels to paths; second sanity check happens inside the helper.
        resolved, failure_message = convert_channels_to_file_paths(
            channels, subject_rows, local_dataset_root_folder, subject,
            allow_incomplete)
        if failure_message:
            raise ValueError(failure_message)
        return resolved

    def _mask_path(subject: str) -> Optional[Path]:
        # The mask channel is optional: no channel configured, or no path resolved,
        # both yield None.
        if mask_channel is None:
            return None
        resolved = _channel_paths(subject, [mask_channel], allow_incomplete_labels)
        return resolved[0] if resolved else None

    sources: Dict[str, PatientDatasetSource] = {}
    for subject in subject_ids:
        sources[subject] = PatientDatasetSource(
            metadata=PatientMetadata.from_dataframe(dataframe, subject),
            image_channels=_channel_paths(
                subject, image_channels, False),  # type: ignore
            mask_channel=_mask_path(subject),
            ground_truth_channels=_channel_paths(
                subject, ground_truth_channels, allow_incomplete_labels),  # type: ignore
            allow_incomplete_labels=allow_incomplete_labels)

    return sources
def load_dataset_sources(
        dataframe: pd.DataFrame, local_dataset_root_folder: Path,
        image_channels: List[str], ground_truth_channels: List[str],
        mask_channel: Optional[str]) -> Dict[int, PatientDatasetSource]:
    """
    Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file.
    The dataframe contains per-patient per-channel image information, relative to a root directory.
    This method converts that into a per-patient dictionary, with absolute file paths
    separated into image channels, ground truth channels, and mask channels.
    :param dataframe: A dataframe read directly from a dataset CSV file.
    :param local_dataset_root_folder: The root folder that contains all images.
    :param image_channels: The names of the image channels that should be used in the result.
    :param ground_truth_channels: The names of the ground truth channels that should be used in the result.
    :param mask_channel: The name of the mask channel that should be used in the result. This can be None.
    :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource.
    """
    required_columns = {
        CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER
    }
    # Validate the CSV structure before doing any per-patient work.
    present_columns = list(dataframe)
    if not required_columns.issubset(present_columns):
        raise ValueError(
            "The dataset CSV file should contain at least these columns: {}, but got: {}"
            .format(required_columns, present_columns))

    # Sorted, de-duplicated subject IDs drive the per-patient loop below.
    subject_ids = sorted(pd.unique(dataframe[CSV_SUBJECT_HEADER]))
    if not local_dataset_root_folder.is_dir():
        raise ValueError("The dataset root folder does not exist: {}".format(
            local_dataset_root_folder))

    def _resolve_channels(subject: Any, channels: List[str]) -> List[Path]:
        # Maps each channel name to the absolute path of its file for one subject.
        # Raises if a channel is duplicated, missing, ambiguous, or its file is absent.
        if len(channels) != len(set(channels)):
            raise ValueError(f"ids have duplicated entries: {channels}")

        subject_rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == subject]
        resolved: List[Path] = []
        for channel in channels:
            matches = subject_rows.loc[subject_rows[CSV_CHANNEL_HEADER] == channel]
            if len(matches) == 0:
                raise ValueError(
                    f"Patient {subject} does not have channel '{channel}'"
                )
            if len(matches) > 1:
                raise ValueError(
                    f"Patient {subject} has more than one entry for channel '{channel}'"
                )
            file_path = local_dataset_root_folder / matches[
                CSV_PATH_HEADER].values[0]
            if not file_path.is_file():
                raise ValueError(
                    f"The dataset references a file that does not exist: {file_path}"
                )
            resolved.append(file_path)
        return resolved

    def _mask_path(subject: Any) -> Optional[Path]:
        # The mask channel is optional; return None when none was configured.
        if mask_channel is None:
            return None
        return _resolve_channels(subject, [mask_channel])[0]

    sources: Dict[int, PatientDatasetSource] = {}
    for subject in subject_ids:
        sources[subject] = PatientDatasetSource(
            metadata=PatientMetadata.from_dataframe(dataframe, subject),
            image_channels=_resolve_channels(subject, image_channels),  # type: ignore
            mask_channel=_mask_path(subject),
            ground_truth_channels=_resolve_channels(subject, ground_truth_channels)  # type: ignore
        )

    return sources