# Exemplo n.º 1
# 0
def test_standardize_features_when_singleton(is_sequence: bool) -> None:
    """
    Check that feature standardization copes with datasets that hold only
    a single entry: the std becomes NaN, and standardizing must leave the
    features unchanged (like for constant features).
    """
    numerical_features = torch.ones((1, 3))
    categorical_features = torch.tensor([[0, 1, 1], [1, 0, 0]])
    # Both data source flavors take exactly the same constructor arguments.
    common_args = dict(
        metadata=GeneralSampleMetadata(id="foo"),
        numerical_non_image_features=numerical_features,
        categorical_non_image_features=categorical_features,
        label=torch.tensor([]),
        channel_files=[])
    item: Union[SequenceDataSource, ScalarDataSource]
    sources: Union[ListOfSequences, List[ScalarDataSource]]
    if is_sequence:
        item = SequenceDataSource(**common_args)
        sources = [ClassificationItemSequence(id="foo", items=[item])]
    else:
        item = ScalarDataSource(**common_args)
        sources = [item]
    mean_std = FeatureStatistics.from_data_sources(sources)

    assert_tensors_equal(mean_std.mean, numerical_features)
    # Standard deviation of a single element is undefined, hence NaN.
    assert torch.all(torch.isnan(mean_std.std))
    # Applying such a standardization must leave the sequences unchanged
    # (similar to features that are constant).
    standardized_sources = mean_std.standardize(sources)
    standardized_item = (standardized_sources[0].items[0] if is_sequence
                         else standardized_sources[0])
    assert_tensors_equal(standardized_item.numerical_non_image_features,
                         numerical_features)
    assert_tensors_equal(standardized_item.categorical_non_image_features,
                         categorical_features)
def test_predict_ensemble(batch_size: int) -> None:
    """
    Check that an ensemble of scalar inference pipelines averages the
    sigmoid outputs of its member models.
    """
    def _load_constant_model(value: float):
        # Build and load a model that always outputs `value`.
        config = ConstantScalarConfig(value)
        model_and_info = ModelAndInfo(config=config,
                                      model_execution_mode=ModelExecutionMode.TEST,
                                      is_mean_teacher=False,
                                      checkpoint_path=None)
        assert model_and_info.try_create_model_load_from_checkpoint_and_adjust()
        return model_and_info.model, config

    model_returns_0, config_returns_0 = _load_constant_model(0.)
    model_returns_1, config_returns_1 = _load_constant_model(1.)

    # Three pipelines wrap the model returning 0, two wrap the one returning 1.
    pipelines = [ScalarInferencePipeline(model_returns_0, config_returns_0, 0, i)
                 for i in range(3)]
    pipelines += [ScalarInferencePipeline(model_returns_1, config_returns_1, 0, i)
                  for i in range(3, 5)]
    ensemble_pipeline = ScalarEnsemblePipeline(pipelines, config_returns_0,
                                               EnsembleAggregationType.Average)
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": torch.zeros((batch_size, 1)),
            "images": torch.zeros(((batch_size, 1) + config_returns_0.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}

    results = ensemble_pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.equal(results.labels, torch.zeros((batch_size, 1)))
    # 3 models return 0, 2 return 1, so predicted should be ((sigmoid(0)*3)+(sigmoid(1)*2))/5
    assert torch.allclose(results.model_outputs, torch.full((batch_size, 1), 0.592423431))
# Exemplo n.º 3
# 0
def test_predict_non_ensemble(batch_size: int, empty_labels: bool) -> None:
    """
    A single-model scalar inference pipeline must return sigmoid(1) for
    every subject when the underlying model always outputs 1.
    """
    config = ConstantScalarConfig(1.)
    model_and_info = ModelAndInfo(config=config,
                                  model_execution_mode=ModelExecutionMode.TEST,
                                  checkpoint_path=None)
    assert model_and_info.try_create_model_load_from_checkpoint_and_adjust()

    pipeline = ScalarInferencePipeline(model_and_info.model, config, 0, 0)
    # With empty_labels, all labels are NaN; otherwise all zero.
    if empty_labels:
        actual_labels = torch.zeros((batch_size, 1)) * np.nan
    else:
        actual_labels = torch.zeros((batch_size, 1))
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": actual_labels,
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}

    results = pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.allclose(results.labels, actual_labels, equal_nan=True)
    # The model always returns 1, so predicted should be sigmoid(1)
    assert torch.allclose(results.model_outputs,
                          torch.full((batch_size, 1), 0.731058578))
# Exemplo n.º 4
# 0
def test_predict_non_ensemble(batch_size: int, empty_labels: bool) -> None:
    """
    A single Lightning-based pipeline must produce sigmoid(1) posteriors
    for every subject when the model always outputs 1.
    """
    config = ConstantScalarConfig(1.)
    model = create_lightning_model(config, set_optimizer_and_scheduler=False)
    assert isinstance(model, ScalarLightning)

    pipeline = ScalarInferencePipeline(model, config, 0)
    # With empty_labels, all labels are NaN; otherwise all zero.
    if empty_labels:
        actual_labels = torch.zeros((batch_size, 1)) * np.nan
    else:
        actual_labels = torch.zeros((batch_size, 1))
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": actual_labels,
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}

    results = pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.allclose(results.labels, actual_labels, equal_nan=True)
    # The model always returns 1, so predicted should be sigmoid(1)
    assert torch.allclose(results.posteriors,
                          torch.full((batch_size, 1), 0.731058578))
# Exemplo n.º 5
# 0
 def _create(features: List) -> SequenceDataSource:
     """Wrap the given feature list in a minimal SequenceDataSource."""
     numerical = torch.tensor(features).float()
     return SequenceDataSource(metadata=GeneralSampleMetadata(id="foo"),
                               channel_files=[],
                               label=torch.tensor([]),
                               categorical_non_image_features=torch.tensor([]),
                               numerical_non_image_features=numerical)
def test_predict_non_ensemble(batch_size: int, empty_labels: bool) -> None:
    """
    A pipeline around a ScalarOnesModel that outputs 1 must predict
    sigmoid(1) for every subject in the batch.
    """
    config = ClassificationModelForTesting()
    model: Any = ScalarOnesModel(config.expected_image_size_zyx, 1.)
    update_model_for_multiple_gpus(ModelAndInfo(model),
                                   args=config,
                                   execution_mode=ModelExecutionMode.TEST)
    pipeline = ScalarInferencePipeline(model, config, 0, 0)
    # With empty_labels, all labels are NaN; otherwise all zero.
    if empty_labels:
        actual_labels = torch.zeros((batch_size, 1)) * np.nan
    else:
        actual_labels = torch.zeros((batch_size, 1))
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": actual_labels,
            "images": torch.zeros(((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}

    results = pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.allclose(results.labels, actual_labels, equal_nan=True)
    # The model always returns 1, so predicted should be sigmoid(1)
    assert torch.allclose(results.model_outputs,
                          torch.full((batch_size, 1), 0.731058578))
 def _create(pos: int) -> ScalarDataSource:
     """Build a minimal ScalarDataSource at the given sequence position."""
     empty = torch.empty(0)
     return ScalarDataSource(
         metadata=GeneralSampleMetadata(id="", sequence_position=pos),
         categorical_non_image_features=empty,
         label=empty,
         numerical_non_image_features=empty,
         channel_files=[])
# Exemplo n.º 8
# 0
 def _create(features: List) -> torch.Tensor:
     """
     Build a ScalarItem whose numerical and categorical features are both
     the given list, and return its combined non-imaging feature tensor.
     """
     numerical = torch.tensor(features).float()
     categorical = torch.tensor(features).float()
     item = ScalarItem(segmentations=torch.empty(0),
                       metadata=GeneralSampleMetadata(id="foo"),
                       images=torch.tensor([]),
                       label=torch.tensor([]),
                       categorical_non_image_features=categorical,
                       numerical_non_image_features=numerical)
     return item.get_all_non_imaging_features()
# Exemplo n.º 9
# 0
 def _create(id: str, sequence_position: int, file: Optional[str],
             metadata: str) -> SequenceDataSource:
     """Build a SequenceDataSource with empty features and the given metadata."""
     sample_metadata = GeneralSampleMetadata(id=id,
                                             sequence_position=sequence_position,
                                             props={"M": metadata})
     return SequenceDataSource(channel_files=[file],
                               numerical_non_image_features=torch.tensor([]),
                               categorical_non_image_features=torch.tensor([]),
                               label=torch.tensor([]),
                               metadata=sample_metadata)
# Exemplo n.º 10
# 0
def test_item_is_valid(channel_files: List[Optional[str]],
                       numerical_features: torch.Tensor,
                       categorical_features: torch.Tensor,
                       is_valid: bool) -> None:
    """Check is_valid() on a ScalarDataSource built from the given parts."""
    source = ScalarDataSource(
        channel_files=channel_files,
        numerical_non_image_features=numerical_features,
        categorical_non_image_features=categorical_features,
        label=torch.empty(0),
        metadata=GeneralSampleMetadata(id="foo"))
    assert source.is_valid() == is_valid
# Exemplo n.º 11
# 0
def test_standardize_features() -> None:
    """
    Test that non-image features can be standardized to mean 0, std 1,
    and that constant features and infinite values are handled gracefully.
    """
    set_random_seed(1234)
    expected_mean = torch.tensor([[123, 2, 3], [4, 5, 6]])
    expected_std = torch.tensor([[0, 2, 3], [3, 4, 4]])
    feature_size = (2, 3)
    sequences: List[ClassificationItemSequence] = []
    # Build 1000 sequences of random length (3 to 5 items each).
    for s in range(1000):
        items = []
        seq_length = torch.randint(low=3, high=6, size=(1, )).item()
        for i in range(seq_length):  # type: ignore
            # All features are random Gaussian, apart from feature 0 which is constant.
            # Normalization must be able to deal with constant features when dividing by standard deviation.
            features = torch.randn(size=feature_size, dtype=torch.float32
                                   ) * expected_std + expected_mean
            # Randomly put some infinite values in the vector
            features[s % 2, s %
                     3] = np.inf if torch.rand(1) > 0.9 else features[s % 2,
                                                                      s % 3]
            features[0, 0] = expected_mean[0, 0]
            item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                              numerical_non_image_features=features,
                              categorical_non_image_features=features,
                              label=torch.tensor([]),
                              images=torch.tensor([]),
                              segmentations=torch.tensor([]))
            items.append(item)
        sequences.append(ClassificationItemSequence(id="foo", items=items))
    mean_std = FeatureStatistics.from_data_sources(sequences)
    assert mean_std.mean.shape == feature_size
    assert mean_std.std.shape == feature_size

    # Sample statistics should be close to the generating distribution.
    assert_tensors_equal(mean_std.mean, expected_mean, 0.07)
    assert_tensors_equal(mean_std.std, expected_std, 0.07)

    # After normalization, mean should be 0, and std should be 1.
    standardized_seq = mean_std.standardize(sequences)
    mean_std_from_standardized = FeatureStatistics.from_data_sources(
        standardized_seq)
    # After normalization, the mean should be 0, apart from the constant feature, which should be left untouched,
    # hence its mean is the original feature value.
    expected_mean_from_standardized = torch.zeros(feature_size)
    expected_mean_from_standardized[0, 0] = expected_mean[0, 0]
    expected_std_from_standardized = torch.ones(feature_size)
    expected_std_from_standardized[0, 0] = 0.0
    assert_tensors_equal(mean_std_from_standardized.mean,
                         expected_mean_from_standardized,
                         abs=1e-5)
    assert_tensors_equal(mean_std_from_standardized.std,
                         expected_std_from_standardized,
                         abs=1e-5)
# Exemplo n.º 12
# 0
def _create_scalar_items(length: int,
                         label_value: float = 1.0) -> List[ScalarItem]:
    """
    Create `length` ScalarItems with empty features and the given label,
    with sequence positions 0 .. length-1.
    """
    items = []
    for position in range(length):
        metadata = GeneralSampleMetadata(id="foo", sequence_position=position)
        items.append(ScalarItem(metadata=metadata,
                                numerical_non_image_features=torch.tensor([]),
                                categorical_non_image_features=torch.tensor([]),
                                label=torch.tensor([label_value]),
                                images=torch.tensor([]),
                                segmentations=torch.tensor([])))
    return items
# Exemplo n.º 13
# 0
def _create_item(id: str,
                 sequence_position: int,
                 metadata: str,
                 label: Optional[float] = None) -> SequenceDataSource:
    """
    Build a SequenceDataSource with empty features and the given metadata.

    :param id: The subject identifier.
    :param sequence_position: The position of the item in its sequence.
    :param metadata: Value stored under the "M" key of the metadata props.
    :param label: Optional label value; if None, the label tensor is empty.
    :return: The constructed SequenceDataSource.
    """
    # Use an identity check against None: a label of 0.0 is a valid value and
    # must not be treated as "no label" (a bare `if label` would drop it).
    label_tensor = torch.tensor([label]) if label is not None else torch.tensor([])
    return SequenceDataSource(
        channel_files=["foo"],
        numerical_non_image_features=torch.tensor([]),
        categorical_non_image_features=torch.tensor([]),
        label=label_tensor,
        metadata=GeneralSampleMetadata(id=id,
                                       sequence_position=sequence_position,
                                       props={"M": metadata}))
# Exemplo n.º 14
# 0
def test_predict_ensemble(batch_size: int) -> None:
    """
    An ensemble of Lightning-based pipelines must average the sigmoid
    outputs of its member models.
    """
    def _constant_model(value: float):
        # Create a Lightning model that always predicts `value`.
        config = ConstantScalarConfig(value)
        model = create_lightning_model(config, set_optimizer_and_scheduler=False)
        assert isinstance(model, ScalarLightning)
        return model, config

    model_returns_0, config_returns_0 = _constant_model(0.)
    model_returns_1, config_returns_1 = _constant_model(1.)

    # Three pipelines wrap the model returning 0, two wrap the one returning 1.
    pipelines = [ScalarInferencePipeline(model_returns_0, config_returns_0, i)
                 for i in range(3)]
    pipelines += [ScalarInferencePipeline(model_returns_1, config_returns_1, i)
                  for i in range(3, 5)]
    ensemble_pipeline = ScalarEnsemblePipeline(pipelines, config_returns_0,
                                               EnsembleAggregationType.Average)
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": torch.zeros((batch_size, 1)),
            "images": torch.zeros(
                ((batch_size, 1) + config_returns_0.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}

    results = ensemble_pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.equal(results.labels, torch.zeros((batch_size, 1)))
    # 3 models return 0, 2 return 1, so predicted should be ((sigmoid(0)*3)+(sigmoid(1)*2))/5
    assert torch.allclose(results.posteriors, torch.full((batch_size, 1), 0.592423431))
def test_multi_segmentation_encoder() -> None:
    """
    Test output shapes of MultiSegmentationEncoder for joint and per-channel
    encoding, and the conversion of a ScalarItem's segmentations into the
    one-hot encoded model input tensor.
    """
    scan_size = (25, 33, 65)
    batch_size = 3
    num_image_channels = 2
    # Case 1: all image channels are encoded jointly through one encoder.
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=True)
    x = torch.ones((batch_size, num_image_channels *
                    HDF5_NUM_SEGMENTATION_CLASSES) + scan_size)
    y = encoder.encode_and_aggregate(x)
    final_output_channels = _expected_output_channels(
        num_image_channels * HDF5_NUM_SEGMENTATION_CLASSES)
    assert y.size() == (batch_size, final_output_channels, 1, 1, 1)
    full_output = encoder(x)
    assert full_output.size() == (batch_size, 1)
    # Case 2: each image channel is encoded separately.
    encoder = MultiSegmentationEncoder(num_image_channels=num_image_channels,
                                       encode_channels_jointly=False)
    x = torch.ones((batch_size, num_image_channels *
                    HDF5_NUM_SEGMENTATION_CLASSES) + scan_size)
    y = encoder.encode_and_aggregate(x)
    final_output_channels = _expected_output_channels(
        HDF5_NUM_SEGMENTATION_CLASSES)
    # Each image channel generates 7 features, we concatenate those 7 features for the 2 image channels
    assert y.size() == (batch_size, final_output_channels * 2, 1, 1, 1)
    full_output = encoder(x)
    assert full_output.size() == (batch_size, 1)
    # Test that the encoder can correctly convert from a scalar data item to the one-hot encoded model input tensor
    scalar_item = ScalarItem(metadata=GeneralSampleMetadata(id="foo"),
                             label=torch.empty(1),
                             numerical_non_image_features=torch.empty(1),
                             categorical_non_image_features=torch.empty(1),
                             images=torch.empty(1),
                             segmentations=torch.ones(
                                 (batch_size, num_image_channels, *scan_size)))
    input_tensors = encoder.get_input_tensors(scalar_item)
    assert len(input_tensors) == 1
    assert input_tensors[0].shape == (batch_size,
                                      HDF5_NUM_SEGMENTATION_CLASSES *
                                      num_image_channels, *scan_size)
def test_predict_ensemble(batch_size: int) -> None:
    """
    An ensemble of pipelines around ScalarOnesModel instances must average
    the sigmoid outputs of its member models.
    """
    config = ClassificationModelForTesting()
    model_returns_0: Any = ScalarOnesModel(config.expected_image_size_zyx, 0.)
    model_returns_1: Any = ScalarOnesModel(config.expected_image_size_zyx, 1.)
    model_and_opt_0 = update_model_for_multiple_gpus(
        ModelAndInfo(model_returns_0),
        args=config,
        execution_mode=ModelExecutionMode.TEST)
    model_returns_0 = model_and_opt_0.model
    model_and_opt_1 = update_model_for_multiple_gpus(
        ModelAndInfo(model_returns_1),
        args=config,
        execution_mode=ModelExecutionMode.TEST)
    model_returns_1 = model_and_opt_1.model
    # Three pipelines wrap the model returning 0, two wrap the one returning 1.
    pipelines = [ScalarInferencePipeline(model_returns_0, config, 0, i)
                 for i in range(3)]
    pipelines += [ScalarInferencePipeline(model_returns_1, config, 0, i)
                  for i in range(3, 5)]
    ensemble_pipeline = ScalarEnsemblePipeline(pipelines, config,
                                               EnsembleAggregationType.Average)
    data = {"metadata": [GeneralSampleMetadata(id='2')] * batch_size,
            "label": torch.zeros((batch_size, 1)),
            "images": torch.zeros(
                ((batch_size, 1) + config.expected_image_size_zyx)),
            "numerical_non_image_features": torch.tensor([]),
            "categorical_non_image_features": torch.tensor([]),
            "segmentations": torch.tensor([])}

    results = ensemble_pipeline.predict(data)
    assert results.subject_ids == ['2'] * batch_size
    assert torch.equal(results.labels, torch.zeros((batch_size, 1)))
    # 3 models return 0, 2 return 1, so predicted should be ((sigmoid(0)*3)+(sigmoid(1)*2))/5
    assert torch.allclose(results.model_outputs, torch.full((batch_size, 1), 0.592423431))
# Exemplo n.º 17
# 0
def load_single_data_source(subject_rows: pd.DataFrame,
                            subject_id: str,
                            label_value_column: str,
                            channel_column: str,
                            image_channels: Optional[List[str]] = None,
                            image_file_column: Optional[str] = None,
                            label_channels: Optional[List[str]] = None,
                            transform_labels: Union[Callable, List[Callable]] = LabelTransformation.identity,
                            non_image_feature_channels: Optional[Dict] = None,
                            numerical_columns: Optional[List[str]] = None,
                            categorical_data_encoder: Optional[CategoricalToOneHotEncoder] = None,
                            metadata_columns: Optional[Set[str]] = None,
                            is_classification_dataset: bool = True,
                            sequence_position_numeric: Optional[int] = None) -> T:
    """
    Converts a set of dataset rows for a single subject to a ScalarDataSource or SequenceDataSource
    instance, which contains the labels, the non-image features, and the paths to the image files.

    :param subject_rows: All dataset rows that belong to the same subject.
    :param subject_id: The identifier of the subject that is being processed.
    :param label_value_column: The column that contains the value for the label scalar or vector.
    :param channel_column: The name of the column that contains the row identifier ("channels").
    :param image_channels: The names of all channels (stored in the CSV_CHANNEL_HEADER column of the dataframe)
    that are expected to be loaded from disk later because they are large images.
    :param image_file_column: The name of the column that contains the image file names.
    :param label_channels: The name of the channel where the label scalar or vector is read from.
    :param transform_labels: A label transformation or a list of label transformations to apply to the labels.
    If a list is provided, the transformations are applied in order from left to right.
    :param non_image_feature_channels: A dictionary of the names of all channels where additional scalar
    values should be read from. The keys should map each feature to its channels.
    :param numerical_columns: The names of all columns where additional scalar values should be read from.
    :param categorical_data_encoder: Encoding scheme for categorical data.
    :param metadata_columns: A set of columns that will be added to the item metadata as key/value pairs.
    :param is_classification_dataset: If the current dataset is classification or not.
    :param sequence_position_numeric: Numeric position of the data source in a data sequence. Assumed to be
    a non-sequential dataset item if None provided (default).
    :return: A SequenceDataSource if sequence_position_numeric is given, otherwise a ScalarDataSource.
    """

    def _get_row_for_channel(channel: Optional[str]) -> Dict[str, str]:
        """Return the single dataset row of this subject that matches the given channel."""
        return _get_single_channel_row(subject_rows, channel, subject_id, channel_column)

    def _get_label_as_tensor(channel: Optional[str]) -> torch.Tensor:
        """Read the label value from the given channel's row and return it as a 1-element float tensor."""
        extract_fn = extract_label_classification if is_classification_dataset else extract_label_regression
        label_row = _get_row_for_channel(channel)
        label_string = label_row[label_value_column]
        return torch.tensor([extract_fn(label_string=label_string, sample_id=subject_id)],
                            dtype=torch.float)

    def _apply_label_transforms(labels: Any) -> Any:
        """
        Apply the label transformation(s) in order, from left to right.
        """
        if isinstance(transform_labels, List):
            for transform in transform_labels:
                labels = transform(labels)
            label = labels
        else:
            label = transform_labels(labels)
        return label

    def create_none_list(x: Optional[List]) -> List:
        """Return [None] for a missing or empty list, otherwise the list itself."""
        return [None] if x is None or len(x) == 0 else x

    def get_none_list_from_dict(non_image_channels: Dict[str, List[str]], feature: str) -> Sequence[Optional[str]]:
        """
        Return either the list of channels for a given column or, if None was passed as
        numerical channels (i.e. there are no channels to be specified), return [None].
        :param non_image_channels: Dict mapping feature names to their channels.
        :param feature: Feature name for which to return the channels.
        :return: List of channels for the given feature.
        """
        if non_image_channels == {}:
            return [None]
        else:
            return non_image_channels[feature]

    def is_empty(x: Optional[List]) -> bool:
        """Return True if the list is None or has no elements."""
        return x is None or len(x) == 0

    def none_if_missing_in_csv(x: Any) -> Optional[str]:
        # If the CSV contains missing values they turn into NaN here, but mark them as None rather.
        return None if isinstance(x, float) and np.isnan(x) else x

    subject_rows = subject_rows.fillna('')
    # Read one label tensor per label channel (or a single one if no channels given),
    # then apply the configured label transformation(s).
    labels = []
    if label_channels:
        for channel in label_channels:
            labels.append(_get_label_as_tensor(channel))
    else:
        labels.append(_get_label_as_tensor(None))

    label = _apply_label_transforms(labels)

    # Item metadata is read from the row of the first label channel.
    channel_for_metadata = label_channels[0] if label_channels else None
    label_row = _get_row_for_channel(channel_for_metadata)
    metadata = GeneralSampleMetadata(id=subject_id, props={key: none_if_missing_in_csv(label_row[key])
                                                           for key in metadata_columns or set()})

    # Collect the image file path for each requested image channel (paths are
    # stored for later loading; the images themselves are not read here).
    image_files = []
    if image_file_column:
        for image_channel in create_none_list(image_channels):
            # Alternative: restrict rows to given channels first, then read out the relevant columns.
            file_path = _get_row_for_channel(image_channel)[image_file_column]
            image_files.append(none_if_missing_in_csv(file_path))

    numerical_columns = numerical_columns or []
    categorical_columns = categorical_data_encoder.get_supported_dataset_column_names() if categorical_data_encoder \
        else []
    _feature_columns = numerical_columns + categorical_columns

    if not non_image_feature_channels:
        non_image_feature_channels = {}

    # Gather the non-image feature values, numerical ones into a flat list and
    # categorical ones into a per-column dict for later encoding.
    numerical = []
    categorical = {}
    if not is_empty(_feature_columns):
        for column in _feature_columns:
            # For sequence items the channel is the numeric sequence position itself.
            list_channels: Sequence[Optional[str]] = [str(sequence_position_numeric)] \
                if sequence_position_numeric is not None else get_none_list_from_dict(non_image_feature_channels,
                                                                                      column)
            numerical_col, categorical_col = [], []
            for channel in list_channels:  # type: ignore
                row = _get_row_for_channel(channel)
                prefix = f"Channel {channel}, column {column}"
                if column in numerical_columns:
                    numerical_col.append(_string_to_float(row[column], error_message_prefix=prefix))
                else:
                    categorical_col.append(row[column])
            if column in numerical_columns:
                numerical.extend(numerical_col)
            else:
                categorical[column] = categorical_col

    categorical_non_image_features = categorical_data_encoder.encode(categorical) \
        if categorical_data_encoder else torch.tensor(list(categorical.values()))

    # Build the appropriate data source type: SequenceDataSource for sequential
    # items, ScalarDataSource otherwise.
    datasource: Union[SequenceDataSource, ScalarDataSource]
    if sequence_position_numeric is not None:
        metadata.sequence_position = sequence_position_numeric
        datasource = SequenceDataSource(
            label=label,
            channel_files=image_files,
            numerical_non_image_features=torch.tensor(numerical).float(),
            categorical_non_image_features=categorical_non_image_features.float(),
            metadata=metadata
        )
        return datasource  # type: ignore

    datasource = ScalarDataSource(
        label=label,
        channel_files=image_files,
        numerical_non_image_features=torch.tensor(numerical).float(),
        categorical_non_image_features=categorical_non_image_features.float(),
        metadata=metadata
    )
    return datasource  # type: ignore