def test_load_single_item_6() -> None:
    """
    Test loading of different channels for different categorical features.
    """
    frame = pd.read_csv(
        StringIO("""subject,path,channel,cat1,cat2,scalar1,label
S1,foo1.nii,week1,True,True,1.2,True
S1,foo2.nii,week2,False,False,1.2,True
S1,foo2.nii,week3,False,True,1.3,True
"""),
        sep=",",
        dtype=str)
    # Fit the one-hot encoder on the same dataframe that is loaded below.
    encoder = CategoricalToOneHotEncoder.create_from_dataframe(dataframe=frame,
                                                               columns=["cat1", "cat2"])
    item: ScalarDataSource = load_single_data_source(
        frame,
        subject_id="S1",
        image_channels=["week1"],
        image_file_column="path",
        label_channels=["week1"],
        label_value_column="label",
        numerical_columns=["scalar1"],
        non_image_feature_channels={"scalar1": ["week3"],
                                    "cat1": ["week1", "week2"],
                                    "cat2": ["week3"]},
        categorical_data_encoder=encoder,
        channel_column="channel")
    # cat1 at week1/week2 and cat2 at week3, each one-hot encoded over two values.
    expected = torch.tensor([0, 1, 1, 0, 0, 1])
    assert torch.all(item.categorical_non_image_features == expected)
def test_load_single_item_1() -> None:
    """
    Test if we can create a classificationItem from the rows for a single subject,
    including NaN scalar and categorical values.
    """
    # Three channel rows for one subject: two image channels and a "label" channel.
    # Some cells are deliberately empty so that they load as NaN.
    csv_string = StringIO("""subject,channel,path,value,scalar1,scalar2,categorical1,categorical2
S1,image1,foo1.nii,,2.1,2.2,True,False
S1,image2,foo2.nii,,3.1,,True,False
S1,label,,True,1.1,1.2,,False
""")
    df = pd.read_csv(csv_string, sep=",", dtype=str)
    numerical_columns = ["scalar2", "scalar1"]
    categorical_columns = ["categorical1", "categorical2"]
    non_image_feature_channels = _get_non_image_dict(["label", "image2"],
                                                     ["scalar2", "scalar1"],
                                                     ["categorical1", "categorical2"])
    item: ScalarDataSource = load_single_data_source(df,
                                                     subject_id="S1",
                                                     # Provide values in a different order from the file!
                                                     image_channels=["image2", "image1"],
                                                     image_file_column="path",
                                                     label_channels=["label"],
                                                     label_value_column="value",
                                                     non_image_feature_channels=non_image_feature_channels,
                                                     # Provide values in a different order from the file!
                                                     numerical_columns=numerical_columns,
                                                     categorical_data_encoder=CategoricalToOneHotEncoder.create_from_dataframe(
                                                         dataframe=df,
                                                         columns=categorical_columns
                                                     ),
                                                     channel_column="channel")
    # Image files must come back in the requested channel order, not file order.
    assert item.channel_files[0] == "foo2.nii"
    assert item.channel_files[1] == "foo1.nii"
    assert item.label == torch.tensor([1.0])
    assert item.label.dtype == torch.float32
    # Numerical features appear per column in the requested order (scalar2 then
    # scalar1), each over channels [label, image2]. scalar2 is empty in the
    # image2 row, hence NaN at index 1.
    assert item.numerical_non_image_features[0] == 1.2
    assert item.numerical_non_image_features[2] == 1.1
    assert item.numerical_non_image_features[3] == 3.1
    assert math.isnan(item.numerical_non_image_features[1].item())
    # categorical1 is empty in the label row, so its encoding is all-NaN;
    # the remaining categorical entries encode to 1.0.
    assert np.all(np.isnan(item.categorical_non_image_features[0].numpy()))
    assert item.categorical_non_image_features[1:].tolist() == [1.0, 1.0, 1.0]
    assert item.numerical_non_image_features.dtype == torch.float32
    # With no numerical columns and no non-image channels, the numerical
    # feature tensor must come back empty rather than None.
    item_no_scalars: ScalarDataSource = load_single_data_source(df,
                                                                subject_id="S1",
                                                                # Provide values in a different order from the file!
                                                                image_channels=["image2", "image1"],
                                                                image_file_column="path",
                                                                label_channels=["label"],
                                                                label_value_column="value",
                                                                non_image_feature_channels={},
                                                                numerical_columns=[],
                                                                channel_column="channel")
    assert item_no_scalars.numerical_non_image_features.shape == (0,)
def pre_process_dataset_dataframe(self) -> None:
    """
    Cleans and filters the dataset dataframe in place, and refreshes the
    one-hot encoder for the configured categorical columns.
    """
    assert self.dataset_data_frame is not None
    # Empty cells in numeric columns read back as NaN; we want '' instead.
    filled = self.dataset_data_frame.fillna('')
    self.dataset_data_frame = self.filter_dataframe(filled)
    if self.categorical_columns:
        # Re-fit the one-hot encoder on the freshly filtered dataframe.
        from InnerEye.ML.utils.dataset_util import CategoricalToOneHotEncoder
        self.categorical_feature_encoder = CategoricalToOneHotEncoder.create_from_dataframe(
            dataframe=self.dataset_data_frame,
            columns=self.categorical_columns)
def __init__(self,
             use_combined_model: bool = False,
             imaging_feature_type: ImagingFeatureType = ImagingFeatureType.Image,
             combine_hidden_states: bool = False,
             use_encoder_layer_norm: bool = False,
             sequence_target_positions: Optional[List[int]] = None,
             use_mean_teacher_model: bool = False,
             **kwargs: Any) -> None:
    """
    Configuration for a small sequence classification model used in tests.

    :param use_combined_model: If True, the model also consumes images (file column "image").
    :param imaging_feature_type: Which imaging feature type to use; stored on the config.
    :param combine_hidden_states: Stored on the config as ``combine_hidden_state``.
    :param use_encoder_layer_norm: Stored on the config.
    :param sequence_target_positions: Sequence positions to predict on; defaults to [2].
    :param use_mean_teacher_model: If True, enables mean teacher training (alpha 0.999).
    :param kwargs: Additional arguments forwarded to the superclass constructor.
    """
    num_epochs = 3
    # Mean teacher training is switched on by providing a non-None alpha value.
    mean_teacher_alpha = 0.999 if use_mean_teacher_model else None
    sequence_target_positions = [2] if sequence_target_positions is None else sequence_target_positions
    # The image file column is only wired up for the combined (image + features) model.
    image_column = "image" if use_combined_model else None
    # Fit the one-hot encoder for the single categorical column on the mock dataset.
    categorical_feature_encoder = CategoricalToOneHotEncoder.create_from_dataframe(
        dataframe=_get_mock_sequence_dataset(), columns=["cat1"])
    super().__init__(
        local_dataset=full_ml_test_data_path("sequence_data_for_classification"),
        temperature_scaling_config=TemperatureScalingConfig(),
        label_value_column="label",
        numerical_columns=["numerical1", "numerical2"],
        categorical_columns=["cat1"],
        categorical_feature_encoder=categorical_feature_encoder,
        sequence_column="seqColumn",
        sequence_target_positions=sequence_target_positions,
        image_file_column=image_column,
        loss_type=ScalarLoss.WeightedCrossEntropyWithLogits,
        num_epochs=num_epochs,
        num_dataload_workers=0,
        train_batch_size=3,
        l_rate=1e-1,
        load_segmentation=True,
        use_mixed_precision=True,
        label_smoothing_eps=0.05,
        drop_last_batch_in_training=True,
        mean_teacher_alpha=mean_teacher_alpha,
        # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
        max_num_gpus=1,
        **kwargs)
    self.use_combined_model = use_combined_model
    self.imaging_feature_type = imaging_feature_type
    # NOTE(review): attribute name drops the trailing "s" of the parameter
    # (combine_hidden_state vs combine_hidden_states) — confirm readers use this spelling.
    self.combine_hidden_state = combine_hidden_states
    self.use_encoder_layer_norm = use_encoder_layer_norm
def test_load_single_item_7() -> None:
    """
    Test loading of different channels for different categorical features.
    Case where one column value is invalid.
    """
    # Fit the encoder on a dataframe that only contains valid category values.
    valid_df = pd.read_csv(StringIO("""subject,path,channel,cat1,cat2,label
S1,foo1.nii,week1,True,True,True
S1,foo2.nii,week2,False,False,True
S1,foo2.nii,week3,False,,True
"""), sep=",", dtype=str)
    encoder = CategoricalToOneHotEncoder.create_from_dataframe(
        dataframe=valid_df,
        columns=["cat1", "cat2"])
    # Now load a dataframe where cat1 has a value ("houhou") the encoder never saw.
    invalid_df = pd.read_csv(StringIO("""subject,path,channel,cat1,cat2,label
S1,foo1.nii,week1,True,True,True
S1,foo2.nii,week2,houhou,False,False
S1,foo2.nii,week3,False,,True
"""), sep=",", dtype=str)
    item: ScalarDataSource = load_single_data_source(
        invalid_df,
        subject_id="S1",
        image_channels=["week1"],
        image_file_column="path",
        label_channels=["week1"],
        label_value_column="label",
        non_image_feature_channels={"cat1": ["week1", "week2"],
                                    "cat2": ["week3"]},
        categorical_data_encoder=encoder,
        channel_column="channel")
    features = item.categorical_non_image_features
    # cat1 at week1 is valid and one-hot encodes to [0, 1].
    assert torch.all(features[0:2] == torch.tensor([0, 1]))
    # cat1 at week2 holds the unseen value, so its encoding is NaN.
    assert torch.all(torch.isnan(features[2:4]))
    # cat2 at week3 is empty, so its encoding is NaN as well.
    assert torch.all(torch.isnan(features[4:6]))
def test_visualization_with_scalar_model(use_non_imaging_features: bool,
                                         imaging_feature_type: ImagingFeatureType,
                                         encode_channels_jointly: bool,
                                         test_output_dirs: OutputFolderForTests) -> None:
    """
    Runs the GradCam visualization end-to-end on a small scalar model, and checks
    the shapes of the generated saliency maps and the non-imaging plot labels.
    """
    dataset_contents = """subject,channel,path,label,numerical1,numerical2,categorical1,categorical2
S1,week0,scan1.npy,,1,10,Male,Val1
S1,week1,scan2.npy,True,2,20,Female,Val2
S2,week0,scan3.npy,,3,30,Female,Val3
S2,week1,scan4.npy,False,4,40,Female,Val1
S3,week0,scan1.npy,,5,50,Male,Val2
S3,week1,scan3.npy,True,6,60,Male,Val2
"""
    dataset_dataframe = pd.read_csv(StringIO(dataset_contents), dtype=str)
    numerical_columns = ["numerical1", "numerical2"] if use_non_imaging_features else []
    categorical_columns = ["categorical1", "categorical2"] if use_non_imaging_features else []
    non_image_feature_channels = get_non_image_features_dict(default_channels=["week1", "week0"],
                                                             specific_channels={"categorical2": ["week1"]}) \
        if use_non_imaging_features else {}
    config = ImageEncoder(
        local_dataset=Path(),
        encode_channels_jointly=encode_channels_jointly,
        should_validate=False,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        imaging_feature_type=imaging_feature_type,
        non_image_feature_channels=non_image_feature_channels,
        categorical_feature_encoder=CategoricalToOneHotEncoder.create_from_dataframe(
            dataframe=dataset_dataframe, columns=categorical_columns)
    )
    dataloader = ScalarDataset(config, data_frame=dataset_dataframe) \
        .as_data_loader(shuffle=False, batch_size=2)
    config.set_output_to(test_output_dirs.root_dir)
    config.num_epochs = 1
    model = create_model_with_temperature_scaling(config)
    visualizer = VisualizationMaps(model, config)
    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](images=np.random.uniform(0, 1, (6, 64, 60)),
                                                      segmentations=np.random.randint(0, 2, (6, 64, 60)))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats',
                    return_value=image_and_seg):
        batch = next(iter(dataloader))
        if config.use_gpu:
            device = visualizer.grad_cam.device
            batch = transfer_batch_to_device(batch, device)
            visualizer.grad_cam.model = visualizer.grad_cam.model.to(device)
        model_inputs_and_labels = get_scalar_model_inputs_and_labels(model,
                                                                     target_indices=[],
                                                                     sample=batch)
    number_channels = len(config.image_channels)
    number_subjects = len(model_inputs_and_labels.subject_ids)
    guided_grad_cams, grad_cams, pseudo_cam_non_img, probas = visualizer.generate(
        model_inputs_and_labels.model_inputs)
    # Segmentations double the number of per-channel maps in the guided GradCam output.
    if imaging_feature_type == ImagingFeatureType.ImageAndSegmentation:
        assert guided_grad_cams.shape[:2] == (number_subjects, number_channels * 2)
    else:
        assert guided_grad_cams.shape[:2] == (number_subjects, number_channels)
    # BUGFIX: the original "assert x == a if cond else b" parsed as
    # "assert ((x == a) if cond else b)", so when encode_channels_jointly was
    # False the assert evaluated a non-empty tuple and passed vacuously.
    # Bind the expected shape first, then assert the comparison.
    expected_grad_cam_shape = (number_subjects, 1) if encode_channels_jointly \
        else (number_subjects, number_channels)
    assert grad_cams.shape[:2] == expected_grad_cam_shape
    if use_non_imaging_features:
        non_image_features = config.numerical_columns + config.categorical_columns
        non_imaging_plot_labels = visualizer._get_non_imaging_plot_labels(model_inputs_and_labels.data_item,
                                                                          non_image_features,
                                                                          index=0)
        # Labels follow the per-feature channel configuration: both weeks for the
        # default features, only week1 for categorical2.
        assert non_imaging_plot_labels == ['numerical1_week1',
                                           'numerical1_week0',
                                           'numerical2_week1',
                                           'numerical2_week0',
                                           'categorical1_week1',
                                           'categorical1_week0',
                                           'categorical2_week1']
        assert pseudo_cam_non_img.shape == (number_subjects, 1, len(non_imaging_plot_labels))
def test_one_hot_encoder_with_infinite_values() -> None:
    """An infinite categorical value must encode to an all-NaN vector."""
    frame = pd.DataFrame(columns=["categorical"])
    frame["categorical"] = ["F", "M", np.inf]
    one_hot = CategoricalToOneHotEncoder.create_from_dataframe(frame, ["categorical"])
    encoded = one_hot.encode({"categorical": np.inf})
    assert np.isnan(encoded).all()