Пример #1
0
def test_encode_util(values, expected):
    """Exercise ``_encode`` in its three calling modes.

    ``values`` and ``expected`` are supplied by the caller (presumably a
    pytest parametrization that is not visible here). Whatever the input,
    the integer codes are expected to be ``[1, 0, 2, 0, 2]``.
    """
    expected_codes = np.array([1, 0, 2, 0, 2])

    # Mode 1: no encoding requested, only the unique values are returned.
    found = _encode(values)
    assert_array_equal(found, expected)

    # Mode 2: encoding requested, uniques discovered from the data.
    found, codes = _encode(values, encode=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)

    # Mode 3: encoding requested against the uniques found above.
    _ignored, codes = _encode(values, found, encode=True)
    assert_array_equal(codes, expected_codes)
Пример #2
0
def test_encode_util(values, expected):
    """Verify the uniques and integer codes produced by ``_encode``.

    The fixture arguments come from outside this function; the test only
    pins that the uniques equal ``expected`` and that the codes equal
    ``[1, 0, 2, 0, 2]`` whether or not the uniques are passed back in.
    """
    target_codes = np.array([1, 0, 2, 0, 2])

    # Uniques only.
    categories = _encode(values)
    assert_array_equal(categories, expected)

    # Uniques plus codes, with the uniques inferred from ``values``.
    categories, labels = _encode(values, encode=True)
    assert_array_equal(categories, expected)
    assert_array_equal(labels, target_codes)

    # Codes again, this time encoding against the known uniques.
    _unused, labels = _encode(values, categories, encode=True)
    assert_array_equal(labels, target_codes)
Пример #3
0
    def transform(self, y):
        """Transform labels to normalized encoding.

        Seen labels are encoded with a value between 0 and n_classes-1.
        If ``self.fill_unseen_labels`` is truthy, labels that are not in
        ``self.classes_`` are encoded with ``self.fill_encoded_label_value``,
        or with ``len(self.classes_)`` when that attribute is ``None``.
        Otherwise the encoding is delegated to ``_encode`` with
        ``self.classes_`` as the known uniques.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Label values.

        Returns
        -------
        y_encoded : array-like of shape [n_samples]
                    Encoded label values.
        """
        check_is_fitted(self, "classes_")
        y = column_or_1d(y, warn=True)

        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return np.array([])

        if not self.fill_unseen_labels:
            _, y_encoded = _encode(y, uniques=self.classes_, encode=True)
            return y_encoded

        # ``mask`` is True where the label is one of ``self.classes_``.
        _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
        # np.searchsorted needs a sorted array; ``self.classes_`` is assumed
        # to be sorted here. Positions computed for unseen labels are
        # meaningless and are overwritten just below.
        y_encoded = np.searchsorted(self.classes_, y)

        # Compare against None explicitly: a plain ``or`` would discard a
        # deliberately configured fill value of 0.
        # NOTE(review): assumes None is the "not configured" sentinel for
        # ``fill_encoded_label_value`` - confirm against ``__init__``.
        fill_encoded_label_value = self.fill_encoded_label_value
        if fill_encoded_label_value is None:
            fill_encoded_label_value = len(self.classes_)
        y_encoded[~mask] = fill_encoded_label_value

        return y_encoded
Пример #4
0
    def _transform(self, X, handle_unknown='error'):
        """Encode every column of ``X`` as integer category codes.

        Parameters
        ----------
        X : array-like
            Input data; it is passed through ``self._check_X`` and then
            treated as a 2-D array with one feature per column.
        handle_unknown : str, default='error'
            With ``'error'``, a ``ValueError`` is raised as soon as a column
            contains a value that is not in ``self.categories_[i]``. With any
            other value, such entries are flagged in the returned mask and
            encoded as if they were the column's first category.

        Returns
        -------
        X_int : ndarray of int, same shape as the checked ``X``
            Integer codes produced by ``_encode`` for each column.
        X_mask : ndarray of bool, same shape as the checked ``X``
            ``False`` where the original value was an unknown category.

        Raises
        ------
        ValueError
            If unknown categories are found and ``handle_unknown`` is
            ``'error'``.
        """
        X = self._check_X(X)

        _, n_features = X.shape
        # Builtin ``int`` / ``bool`` replace the ``np.int`` / ``np.bool``
        # aliases, which were removed in NumPy 1.24; the dtypes are the same.
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            Xi = X[:, i]
            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                     return_mask=True)

            if not np.all(valid_mask):
                if handle_unknown == 'error':
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` so that the
                    # caller can tell them apart from genuine matches.
                    X_mask[:, i] = valid_mask
                    # Copy first so the caller's data is not modified.
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            _, encoded = _encode(Xi, self.categories_[i], encode=True)
            X_int[:, i] = encoded

        return X_int, X_mask
Пример #5
0
    def _fit(self, X, handle_unknown='error'):
        """Determine the categories of each feature and store them.

        When ``self._categories`` is ``'auto'`` the categories of a column
        are whatever ``_encode`` returns for it. Otherwise the user-supplied
        categories are validated and used as given.

        Parameters
        ----------
        X : array-like
            Input data; passed through ``self._check_X`` and then treated
            as a 2-D array with one feature per column.
        handle_unknown : str, default='error'
            With ``'error'`` and explicit categories, a column holding a
            value outside its category list raises ``ValueError``.

        Raises
        ------
        ValueError
            If numerical categories are unsorted, if the number of category
            lists differs from the number of features, or if unknown
            categories are found while ``handle_unknown`` is ``'error'``.

        Notes
        -----
        The result is stored in ``self.categories_`` (one entry per
        feature); nothing is returned.
        """
        X = self._check_X(X)
        _, n_features = X.shape

        explicit_categories = self._categories != 'auto'
        if explicit_categories:
            # Sortedness is only enforced for non-object (numerical) input.
            if X.dtype != object:
                for given in self._categories:
                    in_order = np.all(np.sort(given) == np.array(given))
                    if not in_order:
                        raise ValueError("Unsorted categories are not "
                                         "supported for numerical categories")
            if len(self._categories) != n_features:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self.categories_ = []

        for col in range(n_features):
            column = X[:, col]

            if self._categories == 'auto':
                # Categories are learned from the data itself.
                self.categories_.append(_encode(column))
                continue

            known = np.array(self._categories[col], dtype=X.dtype)
            if handle_unknown == 'error':
                unknown = _encode_check_unknown(column, known)
                if unknown:
                    raise ValueError(
                        "Found unknown categories {0} in column {1}"
                        " during fit".format(unknown, col))
            self.categories_.append(known)
Пример #6
0
    def fit_transform(self, y):
        """Fit the label encoder and return the encoded labels.

        ``fill_unseen_labels=True`` has no effect here because every label
        in ``y`` is, by construction, seen during fitting.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Label values.

        Returns
        -------
        y_encoded : array-like of shape [n_samples]
                    Encoded label values.
        """
        y = column_or_1d(y, warn=True)
        preset_labels = self._check_labels_and_sort()

        if preset_labels:
            # Encode against the labels returned by the helper.
            fitted = _encode(y, uniques=preset_labels, encode=True)
        else:
            # No usable preset labels: let ``_encode`` derive them from ``y``.
            fitted = _encode(y, encode=True)

        self.classes_, y_encoded = fitted
        return y_encoded
Пример #7
0
    def fit(self, y):
        """Fit the label encoder.

        ``self.classes_`` is set to the result of
        ``self._check_labels_and_sort()`` when that result is truthy, and
        to ``_encode(y)`` otherwise.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Label values.

        Returns
        -------
        self : RobustLabelEncoder.
        """
        y = column_or_1d(y, warn=True)
        preset_labels = self._check_labels_and_sort()
        self.classes_ = preset_labels if preset_labels else _encode(y)
        return self
Пример #8
0
def test_encode_check_unknown():
    """Check how ``_encode`` honours its ``check_unknown`` parameter."""
    unseen_msg = 'y contains previously unseen labels'

    # Numeric data in which the value 4 is not among the known uniques.
    known = np.array([1, 2, 3])
    observed = np.array([1, 2, 3, 4])

    # With the check enabled, the unseen value must raise.
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(observed, known, encode=True, check_unknown=True)

    # With the check disabled, the same call must not raise.
    _encode(observed, known, encode=True, check_unknown=False)

    # For object dtype the parameter is ignored: the unseen value 'd'
    # raises even though the check was switched off.
    known = np.array(['a', 'b', 'c'], dtype=object)
    observed = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(observed, known, encode=True, check_unknown=False)
Пример #9
0
def test_encode_check_unknown():
    """Pin the behaviour of ``_encode(..., check_unknown=...)``.

    Numeric input raises on unseen values only when the check is on;
    object-dtype input raises regardless of the flag.
    """
    error_pattern = 'y contains previously unseen labels'

    numeric_uniques = np.array([1, 2, 3])
    numeric_values = np.array([1, 2, 3, 4])

    # check_unknown=True: the extra value 4 is reported as an error.
    with pytest.raises(ValueError, match=error_pattern):
        _encode(numeric_values, numeric_uniques, encode=True,
                check_unknown=True)

    # check_unknown=False: no error is raised for numeric input.
    _encode(numeric_values, numeric_uniques, encode=True,
            check_unknown=False)

    object_uniques = np.array(['a', 'b', 'c'], dtype=object)
    object_values = np.array(['a', 'b', 'c', 'd'], dtype=object)

    # Object dtype ignores the flag, so the unseen 'd' still raises.
    with pytest.raises(ValueError, match=error_pattern):
        _encode(object_values, object_uniques, encode=True,
                check_unknown=False)
# ## Using LSTMs

# In[891]:

max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

# Label order shared by the train and test encodings. Keeping it in one
# place guarantees both splits are encoded against the same uniques.
# NOTE(review): this order is not sorted. If `_encode` locates labels with a
# binary search for non-object dtypes, the resulting codes would be wrong -
# confirm the dtype of y_train / y_test and the `_encode` implementation.
LABEL_ORDER = (
    'half-true', 'mostly-true', 'false',
    'true', 'barely-true', 'pants-fire',
)

print('Loading data...')
# Encoding Training Labels
y_train = column_or_1d(y_train, warn=True)
# A fresh array is built for each call so `classes_` never aliases the
# shared constant.
classes_, encoded_values = _encode(y_train,
                                   uniques=np.array(LABEL_ORDER),
                                   encode=True)
y_train = encoded_values

# Encoding Testing Labels
y_test = column_or_1d(y_test, warn=True)
classes_, encoded_values = _encode(y_test,
                                   uniques=np.array(LABEL_ORDER),
                                   encode=True)
y_test = encoded_values