def silhouette_samples(X, labels, verbose=False, cuda=False): if not cuda: return s_sil(X, labels) X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr']) le = LabelEncoder() labels = le.fit_transform(labels) n_samples = len(labels) label_freqs = np.bincount(labels) check_number_of_labels(len(le.classes_), n_samples) reduce_func = functools.partial(_silhouette_reduce, labels=labels, label_freqs=label_freqs) results = zip(*pairwise_distances_chunked_cuda(X, reduce_func=reduce_func, verbose=verbose)) intra_clust_dists, inter_clust_dists = results intra_clust_dists = np.concatenate(intra_clust_dists) inter_clust_dists = np.concatenate(inter_clust_dists) denom = (label_freqs - 1).take(labels, mode='clip') with np.errstate(divide="ignore", invalid="ignore"): intra_clust_dists /= denom sil_samples = inter_clust_dists - intra_clust_dists with np.errstate(divide="ignore", invalid="ignore"): sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) # nan values are for clusters of size 1, and should be 0 return np.nan_to_num(sil_samples)
def test_label_encoder(values, classes, unknown): # Test LabelEncoder's transform, fit_transform and # inverse_transform methods le = LabelEncoder() le.fit(values) assert_array_equal(le.classes_, classes) assert_array_equal(le.transform(values), [1, 0, 2, 0, 2]) assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values) le = LabelEncoder() ret = le.fit_transform(values) assert_array_equal(ret, [1, 0, 2, 0, 2]) with pytest.raises(ValueError, match="unseen labels"): le.transform(unknown)