Exemplo n.º 1
0
    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.

        Validates the ``handle_unknown``, ``hashing_dim`` and ``categories``
        settings, then stores one array of categories per column of X in
        ``self.categories_``. Also sets ``self.random_state_``.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.
        y : ignored
            Not used by this method.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``handle_unknown`` is not 'error' or 'ignore'; if
            ``hashing_dim`` is neither None nor an int; if ``categories`` is
            an unrecognised string, contains an unsorted list of categories,
            or has fewer entries than X has columns; or if
            ``handle_unknown='error'`` and a column of X holds a value that
            is missing from the categories supplied for that column.
        """
        X = self._check_X(X)

        # The offending value is wrapped in a 1-tuple before formatting so
        # that a tuple-valued setting is printed in the message rather than
        # unpacked by the % operator (which would raise TypeError).
        if self.handle_unknown not in ["error", "ignore"]:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % (self.handle_unknown,))

        if (self.hashing_dim is not None
                and not isinstance(self.hashing_dim, int)):
            raise ValueError("value '%r' was specified for hashing_dim, "
                             "which has invalid type, expected None or "
                             "int." % (self.hashing_dim,))

        # Decide once whether `categories` names a strategy or is an explicit
        # per-column list. Testing the type (instead of comparing with `==`)
        # keeps array-valued `categories` away from elementwise comparisons
        # against strings.
        named_strategies = ("auto", "most_frequent", "k-means")
        uses_named_strategy = isinstance(self.categories, str)
        if uses_named_strategy:
            if self.categories not in named_strategies:
                raise ValueError(
                    "categories should be one of %r or a list with one "
                    "sorted list of categories per feature, got %r"
                    % (named_strategies, self.categories))
        else:
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")

        # Unpacking (rather than indexing) keeps the requirement that X is
        # two-dimensional.
        _, n_features = X.shape

        # Fail early with a clear message instead of an IndexError halfway
        # through the loop below. Surplus entries are tolerated, as before.
        if not uses_named_strategy and len(self.categories) < n_features:
            raise ValueError(
                "Shape mismatch: categories has %d entries but X has %d "
                "features" % (len(self.categories), n_features))

        self.categories_ = list()
        self.random_state_ = check_random_state(self.random_state)

        for i in range(n_features):
            Xi = X[:, i]
            if not uses_named_strategy:
                # User-supplied categories: optionally verify that every
                # value seen in this column is one of them.
                if self.handle_unknown == "error":
                    valid_mask = np.isin(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                self.categories_.append(
                    np.array(self.categories[i], dtype=object))
            elif self.categories == "auto":
                # Every distinct value of the column becomes a category.
                self.categories_.append(np.unique(Xi))
            elif self.categories == "most_frequent":
                self.categories_.append(self.get_most_frequent(Xi))
            else:
                # "k-means": prototypes are derived from the distinct values,
                # with each value's number of occurrences passed as its
                # sample weight.
                uniques, count = np.unique(Xi, return_counts=True)
                self.categories_.append(
                    get_kmeans_prototypes(
                        uniques,
                        self.n_prototypes,
                        sample_weight=count,
                        random_state=self.random_state_,
                    ))
        return self
Exemplo n.º 2
0
def test_kmeans_protoypes():
    """Ask for three prototypes from three distinct strings.

    The returned prototypes must equal the input strings, up to ordering.
    """
    words = np.array(['cbbba', 'baaac', 'accc'])
    prototypes = get_kmeans_prototypes(words, 3)

    # Sort both sides so the comparison does not depend on return order.
    expected = np.sort(words)
    actual = np.sort(prototypes)
    assert np.array_equal(actual, expected)