def fit(self, X, y=None):
    """Fit the CategoricalEncoder to X.

    Validates the encoder's parameters, then determines the categories
    of every column of X according to ``self.categories``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to determine the categories of each feature.
    y : ignored
        Not referenced by this method.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``handle_unknown`` is neither 'error' nor 'ignore'; if
        ``hashing_dim`` is neither None nor an int; if user-supplied
        categories are not sorted; or if ``handle_unknown == 'error'``
        and a column of X holds values absent from the user-supplied
        categories for that column.

    Notes
    -----
    Sets ``self.categories_`` (one entry per column of X) and
    ``self.random_state_``.
    """
    X = self._check_X(X)

    if self.handle_unknown not in ["error", "ignore"]:
        template = ("handle_unknown should be either 'error' or "
                    "'ignore', got %s")
        # Wrap in a 1-tuple: a bare tuple value would be unpacked as
        # format arguments and raise TypeError instead of ValueError.
        raise ValueError(template % (self.handle_unknown,))

    if (self.hashing_dim is not None) and (
            not isinstance(self.hashing_dim, int)):
        # Same 1-tuple wrapping as above so any offending value is shown.
        raise ValueError("value '%r' was specified for hashing_dim, "
                         "which has invalid type, expected None or "
                         "int." % (self.hashing_dim,))

    if self.categories not in ["auto", "most_frequent", "k-means"]:
        # User-supplied categories: each per-column list must be sorted.
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet "
                                 "supported")

    # Unpacking also enforces that X is two-dimensional.
    n_samples, n_features = X.shape

    self.categories_ = list()
    self.random_state_ = check_random_state(self.random_state)

    for i in range(n_features):
        Xi = X[:, i]
        if self.categories == "auto":
            self.categories_.append(np.unique(Xi))
        elif self.categories == "most_frequent":
            self.categories_.append(self.get_most_frequent(Xi))
        elif self.categories == "k-means":
            # Each distinct value is passed once, weighted by how often
            # it occurs in the column.
            uniques, count = np.unique(Xi, return_counts=True)
            self.categories_.append(
                get_kmeans_prototypes(
                    uniques,
                    self.n_prototypes,
                    sample_weight=count,
                    random_state=self.random_state_,
                ))
        else:
            if self.handle_unknown == "error":
                # np.isin replaces np.in1d, deprecated since NumPy 2.0;
                # the result is identical for the 1-D column Xi.
                valid_mask = np.isin(Xi, self.categories[i])
                if not np.all(valid_mask):
                    diff = np.unique(Xi[~valid_mask])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during fit".format(diff, i))
                    raise ValueError(msg)
            self.categories_.append(
                np.array(self.categories[i], dtype=object))

    return self
def test_kmeans_protoypes():
    """Requesting 3 prototypes from 3 strings must give back those strings.

    The comparison is order-insensitive: both sides are sorted first.
    """
    words = np.array(['cbbba', 'baaac', 'accc'])
    prototypes = get_kmeans_prototypes(words, 3)
    expected = np.sort(words)
    assert np.array_equal(np.sort(prototypes), expected)