Пример #1
0
def test_pickle_index():
    """Round-trip a named GenericIndex through pickle and compare contents."""
    original = GenericIndex(np.arange(10), name="a")
    restored = pickle.loads(pickle.dumps(original))
    # TODO: Index objects do not yet support reductions such as `all`, so the
    # element-wise comparison is reduced through `.values` for now.
    matches = original == restored
    assert matches.values.all()
Пример #2
0
def test_onehot_generic_index():
    """One-hot encode a column whose Series carries a GenericIndex.

    Checks both the set of produced column names and that every indicator
    column matches the corresponding boolean mask of the raw values.
    """
    n_rows = 33
    n_cats = 4
    np.random.seed(0)
    # Draw order is significant for reproducibility: index labels first,
    # then the categorical values.
    labels = np.random.randint(low=0, high=100, size=n_rows)
    values = np.random.randint(low=0, high=n_cats, size=n_rows)

    df = DataFrame()
    df["fo"] = Series(values, index=GenericIndex(labels))
    categories = df.fo.unique()
    out = df.one_hot_encoding(
        "fo", cats=categories, prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    for cat in range(n_cats):
        encoded = getattr(out, "fo_{}".format(cat)).to_array()
        np.testing.assert_array_equal(values == cat, encoded)
Пример #3
0
def test_pickle_index():
    """Check that a named GenericIndex survives a pickle round trip."""
    source = GenericIndex(np.arange(10), name="a")
    payload = pickle.dumps(source)
    restored = pickle.loads(payload)
    # Relies on the index `==` of this library version yielding a value
    # usable directly in a boolean context.
    assert source == restored
Пример #4
0
    def inverse_transform(self, X):
        """
        Map one-hot encoded data back to its original categories.

        Each fitted feature owns a contiguous group of columns in ``X``; the
        position of the largest entry within that group selects the category.
        When ``handle_unknown`` is ``'ignore'``, rows whose group is entirely
        zero are decoded as ``None``. When a category was dropped instead,
        rows whose group sums to zero are decoded as that dropped category.

        The return type follows the input type recorded on this estimator:
        array input yields a cupy array, otherwise a DataFrame is returned.
        If the array conversion raises ``ValueError``, a warning is emitted
        and the DataFrame is returned instead.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : cudf.DataFrame or cupy.ndarray
            Inverse transformed array.
        """
        self._check_is_fitted()
        if cp.sparse.issparse(X):
            # cupy.sparse 7.x has no argmax. After upgrading cupy to 8.x,
            # extend the condition with `and not cp.sparse.issparsecsc(X)`
            # and convert with `X = X.tocsc()` rather than densifying.
            X = X.toarray()

        result = DataFrame(columns=self._encoders.keys())
        drop_requested = self.drop is not None
        col_offset = 0  # first column of the current feature's group in X

        for feature, encoder in self._encoders.items():
            categories = encoder.classes_

            if drop_requested:
                # Build the mask of dropped categories before anything else.
                drop_positions = Series(self.drop_idx_[feature])
                is_dropped = Series(categories).isin(
                    categories[drop_positions])
                if len(categories) == 1:
                    # A lone category is simply repeated for every row; the
                    # column offset is left untouched for this feature.
                    only_category = GenericIndex(categories[0])
                    result[feature] = Series(
                        only_category.repeat(X.shape[0]))
                    continue
                categories = categories[~is_dropped]

            width = len(categories)
            stop = col_offset + width
            block = X[:, col_offset:stop]
            winners = cp.argmax(block, axis=1)
            decoded = Series(categories.iloc[winners]).reset_index(drop=True)

            if self.handle_unknown == 'ignore':
                # Rows with no active column carry an unknown category.
                has_active = block.any(axis=1)
                decoded.iloc[~has_active] = None
            elif drop_requested:
                # With a drop configured, handle_unknown is necessarily
                # 'error', so an all-zero row can only mean the dropped
                # category was present.
                all_zero = cp.asarray(block.sum(axis=1) == 0).flatten()
                if all_zero.any():
                    dropped_value = encoder.inverse_transform(
                        Series(self.drop_idx_[feature]))[0]
                    decoded[all_zero] = dropped_value

            result[feature] = decoded
            col_offset = stop

        if self.input_type != 'array':
            return result

        try:
            return cp.asarray(result.as_gpu_matrix())
        except ValueError:
            warnings.warn("The input one hot encoding contains rows with "
                          "unknown categories. Arrays do not support null "
                          "values. Returning output as a DataFrame "
                          "instead.")
            return result
Пример #5
0
def test_pickle_index():
    """Pickle round trip for a GenericIndex built from a device array."""
    device_values = rmm.to_device(np.arange(10))
    source = GenericIndex(device_values)
    restored = pickle.loads(pickle.dumps(source))
    # Relies on the index `==` of this library version yielding a value
    # usable directly in a boolean context.
    assert source == restored