Example #1
    def __init__(self,
                 dataset: np.ndarray,
                 ground_truth: Optional[np.ndarray] = None,
                 categorical_indices: Optional[np.ndarray] = None,
                 int_to_float: bool = True) -> None:
        """
        Constructs an ``Augmentation`` abstract class.
        """
        # pylint: disable=too-many-locals
        assert _validate_input(dataset,
                               ground_truth=ground_truth,
                               categorical_indices=categorical_indices,
                               int_to_float=int_to_float), 'Invalid input.'

        self.dataset = dataset
        self.data_points_number = dataset.shape[0]
        self.is_structured = fuav.is_structured_array(dataset)

        self.ground_truth = ground_truth

        # Sort out column indices
        indices = fuat.indices_by_type(dataset)
        num_indices = set(indices[0])
        cat_indices = set(indices[1])
        all_indices = num_indices.union(cat_indices)

        if categorical_indices is None:
            categorical_indices = cat_indices
            numerical_indices = num_indices
        else:
            if cat_indices.difference(categorical_indices):
                msg = ('Some of the string-based columns in the input dataset '
                       'were not selected as categorical features via the '
                       'categorical_indices parameter. String-based columns '
                       'cannot be treated as numerical features, therefore '
                       'they will also be treated as categorical features '
                       '(in addition to the ones selected with the '
                       'categorical_indices parameter).')
                warnings.warn(msg, UserWarning)
                categorical_indices = cat_indices.union(categorical_indices)
            numerical_indices = all_indices.difference(categorical_indices)

        self.categorical_indices = sorted(list(categorical_indices))
        self.numerical_indices = sorted(list(numerical_indices))
        self.features_number = len(all_indices)

        # Sort out the dtype of the sampled array.
        ntype = np.dtype(np.float64) if int_to_float else np.dtype(np.int64)
        if self.is_structured:
            sample_dtype = []
            for column_name in self.dataset.dtype.names:
                if column_name in self.numerical_indices:
                    new_dtype = fuat.generalise_dtype(
                        self.dataset.dtype[column_name], ntype)
                    sample_dtype.append((column_name, new_dtype))
                elif column_name in self.categorical_indices:
                    sample_dtype.append(
                        (column_name, self.dataset.dtype[column_name]))
                else:
                    assert False, 'Unknown column name.'  # pragma: nocover
        else:
            if fuav.is_numerical_array(self.dataset):
                sample_dtype = fuat.generalise_dtype(self.dataset.dtype, ntype)
            else:
                sample_dtype = self.dataset.dtype
        self.sample_dtype = sample_dtype
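
The dtype handling at the end of this constructor can be tried in isolation. The sketch below is illustrative only: the toy structured array, its column names ('age', 'tag') and the assumption that ``fuat`` stands for ``fatf.utils.array.tools`` are not part of the example above.

import numpy as np
import fatf.utils.array.tools as fuat

# A tiny structured array with one numerical and one string (categorical)
# column -- hypothetical data used only to show the dtype generalisation.
toy_data = np.array([(1, 'a'), (2, 'bb')],
                    dtype=[('age', np.int64), ('tag', 'U2')])

ntype = np.dtype(np.float64)  # the int_to_float=True branch
sample_dtype = []
for column_name in toy_data.dtype.names:
    if column_name == 'age':  # numerical column: generalise with float64
        new_dtype = fuat.generalise_dtype(toy_data.dtype[column_name], ntype)
        sample_dtype.append((column_name, new_dtype))
    else:                     # categorical column: keep its original dtype
        sample_dtype.append((column_name, toy_data.dtype[column_name]))

print(sample_dtype)  # [('age', dtype('float64')), ('tag', dtype('<U2'))]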
Example #2
def _interpolate_array(
        dataset: np.ndarray,
        feature_index: Union[int, str],  # yapf: disable
        treat_as_categorical: bool,
        steps_number: Union[int, None]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generates a 3-D array with interpolated values for the selected feature.

    If the selected feature is numerical, the interpolated values are a
    numerical array with evenly spaced numbers between the minimum and the
    maximum value in that column. Otherwise, when the feature is categorical,
    the interpolated values are all the unique elements of that column.

    To get the interpolation the original 2-D dataset is stacked on top of
    itself the number of times equal to the number of desired interpolation
    samples. Then, for every copy of that dataset the selected feature is fixed
    to consecutive values of the interpolated array (the same value for the
    whole copy of the dataset).

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset based on which interpolation will be done.
    feature_index : Union[integer, string]
        An index of the feature column in the input dataset for which the
        interpolation will be computed.
    treat_as_categorical : boolean
        Whether to treat the selected feature as categorical or numerical.
    steps_number : Union[integer, None]
        The number of evenly spaced samples between the minimum and the maximum
        value of the selected feature for which the model's prediction will be
        evaluated. This parameter applies only to numerical features; for
        categorical features it is ignored regardless of whether it is a
        number or ``None``.

    Returns
    -------
    interpolated_data : numpy.ndarray
        Numpy array of shape (n_samples, steps_number, n_features) -- where
        (n_samples, n_features) is the shape of the input ``dataset`` --
        holding the input ``dataset`` augmented with the interpolated values.
    interpolated_values : numpy.ndarray
        A 1-dimensional array of shape (steps_number, ) holding the
        interpolated values. If a numerical column is selected this will be a
        series of uniformly distributed ``steps_number`` values between the
        minimum and the maximum value of that column. For categorical (textual)
        columns it will hold all the unique values from that column.
    """
    assert isinstance(dataset, np.ndarray), 'Dataset -> numpy array.'
    assert isinstance(feature_index, (int, str)), 'Feature index -> str/ int.'
    assert isinstance(treat_as_categorical, bool), 'As categorical -> bool.'
    assert steps_number is None or isinstance(steps_number, int), \
        'Steps number -> None/ int.'

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        column = dataset[feature_index]
    else:
        column = dataset[:, feature_index]

    if treat_as_categorical:
        interpolated_values = np.unique(column)
        interpolated_values.sort()
        # Ignoring steps number -- not needed for categorical.
        steps_number = interpolated_values.shape[0]
    else:
        assert isinstance(steps_number, int), 'Steps number must be an int.'
        interpolated_values = np.linspace(column.min(), column.max(),
                                          steps_number)

        # Give float type to this column if it is a structured array
        if (is_structured
                and dataset.dtype[feature_index] != interpolated_values.dtype):
            new_types = []
            for name in dataset.dtype.names:
                if name == feature_index:
                    dtype = fuat.generalise_dtype(interpolated_values.dtype,
                                                  dataset.dtype[name])
                    new_types.append((name, dtype))
                else:
                    new_types.append((name, dataset.dtype[name]))
            dataset = dataset.astype(new_types)
        elif not is_structured and dataset.dtype != interpolated_values.dtype:
            dtype = fuat.generalise_dtype(interpolated_values.dtype,
                                          dataset.dtype)
            dataset = dataset.astype(dtype)

    interpolated_data = np.repeat(dataset[:, np.newaxis], steps_number, axis=1)
    assert len(interpolated_values) == steps_number, 'Required for broadcast.'
    if is_structured:
        for idx in range(steps_number):
            # Broadcast the new value.
            interpolated_data[:, idx][feature_index] = interpolated_values[idx]
    else:
        # Broadcast the new vector.
        interpolated_data[:, :, feature_index] = interpolated_values

    return interpolated_data, interpolated_values
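
A quick way to see the shapes promised by the docstring is to call the function on a toy array. This is a minimal sketch that assumes ``_interpolate_array`` as defined above is in scope; the data is made up for illustration.

import numpy as np

toy = np.array([[0.0, 10.0],
                [1.0, 20.0],
                [2.0, 30.0]])  # 3 samples, 2 features

data_3d, values = _interpolate_array(
    toy, feature_index=1, treat_as_categorical=False, steps_number=5)

print(values)         # [10. 15. 20. 25. 30.] -- 5 evenly spaced values
print(data_3d.shape)  # (3, 5, 2) == (n_samples, steps_number, n_features)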
def test_generalise_dtype():
    """
    Tests :func:`fatf.utils.array.tools.generalise_dtype`.
    """
    error_msg = 'The {} dtype is not one of the base types (strings/numbers).'
    with pytest.raises(ValueError) as exin:
        fuat.generalise_dtype(np.dtype(np.datetime64), np.dtype(np.datetime64))
    assert str(exin.value) == error_msg.format('first')

    with pytest.raises(ValueError) as exin:
        fuat.generalise_dtype(np.dtype(np.float64), np.dtype(np.datetime64))
    assert str(exin.value) == error_msg.format('second')

    dtype_int = np.dtype(int)
    dtype_int32 = np.dtype(np.int32)
    dtype_int64 = np.dtype(np.int64)
    dtype_float = np.dtype(float)
    dtype_float16 = np.dtype(np.float16)
    dtype_float32 = np.dtype(np.float32)
    dtype_float64 = np.dtype(np.float64)
    dtype_str = np.dtype(str)
    dtype_str4 = np.dtype('U4')
    dtype_str11 = np.dtype('U11')
    dtype_str16 = np.dtype('U16')
    dtype_str21 = np.dtype('U21')
    dtype_str32 = np.dtype('U32')

    assert dtype_int64 is fuat.generalise_dtype(dtype_int, dtype_int32)
    assert dtype_int64 is fuat.generalise_dtype(dtype_int, dtype_int64)
    assert dtype_int64 is fuat.generalise_dtype(dtype_int32, dtype_int64)
    assert dtype_int64 is fuat.generalise_dtype(dtype_int, dtype_int)

    assert dtype_float64 is fuat.generalise_dtype(dtype_float, dtype_float)
    assert dtype_float64 is fuat.generalise_dtype(dtype_float64, dtype_float)
    assert dtype_float64 is fuat.generalise_dtype(dtype_int, dtype_float32)
    assert dtype_float64 is fuat.generalise_dtype(dtype_int32, dtype_float32)
    assert dtype_float32 is fuat.generalise_dtype(dtype_float32, dtype_float16)

    assert dtype_str4 is fuat.generalise_dtype(dtype_str, dtype_str4)
    assert dtype_str21 is fuat.generalise_dtype(dtype_str21, dtype_str4)

    assert dtype_str16 == fuat.generalise_dtype(dtype_str11, dtype_str16)
    assert dtype_str11 == fuat.generalise_dtype(dtype_int32, dtype_str4)
    assert dtype_str21 == fuat.generalise_dtype(dtype_int64, dtype_str4)
    assert dtype_str32 == fuat.generalise_dtype(dtype_float32, dtype_str4)
    assert dtype_str32 == fuat.generalise_dtype(dtype_float64, dtype_str16)
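
The string widths asserted above come from the number of characters numpy reserves for rendering each numerical type as text (11 for int32, 21 for int64, 32 for 64-bit floats). A rough cross-check with plain numpy promotion -- assuming it mirrors ``generalise_dtype`` for these base types, which is an assumption rather than something the test states -- yields the same widths.

import numpy as np

print(np.promote_types(np.int32, np.dtype('U4')))     # <U11
print(np.promote_types(np.int64, np.dtype('U4')))     # <U21
print(np.promote_types(np.float64, np.dtype('U16')))  # <U32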