def get_cluster_tree(self):
    """Build one ``bcluster`` node per subcluster of the Birch hierarchy.

    Wraps ``self.Birch_clusterer`` with ``birch_hierarchy_wrapper``, stores
    the resulting tree on ``self.htree`` and, for each of the first
    ``n_clusters`` entries of the flattened tree, creates a ``bcluster`` node
    carrying its cluster id, depth, parent node, size, data point labels,
    centroid, the values returned by ``calculate_d1`` / ``calculate_d2`` and
    a threshold derived from ``self.outlier_threshold``.

    Returns
    -------
    tuple
        ``(cluster_tree, max_depth)``. ``cluster_tree`` is a dict mapping the
        position of a subcluster in the flattened tree to its ``bcluster``
        node (the same dict is stored on ``self.cluster_tree``);
        ``max_depth`` is the largest ``current_depth`` seen, or 0 when there
        are no subclusters.

    Raises
    ------
    KeyError
        If a subcluster's parent has not been registered in the mapping yet
        when the subcluster itself is processed.
    """
    self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)

    # Flatten the tree once instead of once per iteration.
    # NOTE(review): assumes flatten() has no side effects and returns the
    # same ordering on every call - confirm against its implementation.
    flat_tree = self.htree.flatten()

    clusters = {}
    max_depth = 0
    for i in range(n_clusters):
        sub_cluster = flat_tree[i]
        node = bcluster()
        node.set_cluster_id(sub_cluster['cluster_id'])

        depth = sub_cluster.current_depth
        node.set_depth(depth)
        max_depth = max(max_depth, depth)

        if depth == 0:
            # Depth 0 is treated as having no parent.
            node.set_parent()
        else:
            # Parents are looked up by cluster id, so they must already have
            # been stored by an earlier iteration.
            node.set_parent(clusters[sub_cluster.parent['cluster_id']])

        node.set_size(sub_cluster['cluster_size'])

        # Positional row indices of every sample under this subcluster.
        data_points = sub_cluster['document_id_accumulated']
        # Select the member rows once and reuse them for labels and centroid.
        # NOTE(review): assumes self.data is a pandas DataFrame - confirm.
        members = self.data.iloc[data_points]
        node.set_data_points(members.index.values.tolist())

        centroid = members.mean(axis=0).values
        node.set_centroid(centroid)

        d1, d1_v = self.calculate_d1(centroid, data_points)
        d2 = self.calculate_d2(centroid, data_points, d1_v)
        node.add_d1(d1)
        node.add_d2(d2)
        node.calculate_threshold(self.outlier_threshold)

        clusters[i] = node

    self.cluster_tree = clusters
    return self.cluster_tree, max_depth
def test_birch_example_reproducibility(example_id):
    """Check that the Birch hierarchy example gives the same tree for seed 42."""
    seeded_rng = np.random.RandomState(42)
    samples, _ = make_blobs(n_samples=1000, n_features=10,
                            random_state=seeded_rng)

    birch = Birch(threshold=0.9, branching_factor=20,
                  compute_sample_indices=True)
    birch.fit(samples)
    # assert len(birch.root_.subclusters_[1].child_.subclusters_) == 3

    tree, reported_size = birch_hierarchy_wrapper(birch)
    assert tree.tree_size == reported_size

    # The seed matches the one used in the birch hierarchy example.
    assert tree.tree_size == 78

    picked = tree.flatten()[example_id]
    if example_id == 34:
        # Original note: this holds in both cases, but the check fails on
        # Circle CI for the other example_id, hence the restriction to 34.
        assert picked.current_depth == 1
        assert len(picked.children) == 3

    # Cluster ids must follow the order of the flattened tree.
    observed_ids = [node['cluster_id'] for node in tree.flatten()]
    expected_ids = np.arange(tree.tree_size)
    assert_array_equal(observed_ids, expected_ids)
def test_birch_clusterig_single_nodes():
    """Check that the hierarchy built on the LSI fixture has no single-child node."""
    here = os.path.dirname(__file__)
    fixture_path = os.path.join(here, '..', 'data', 'ds_lsi_birch', 'data.npy')
    X = np.load(fixture_path)

    birch = Birch(n_clusters=None, threshold=0.1, branching_factor=5,
                  compute_labels=False, compute_sample_indices=True)
    birch.fit(X)

    tree, reported_size = birch_hierarchy_wrapper(birch)

    # Attach the similarity values to every node of the tree.
    for node in tree.flatten():
        inertia, doc_similarity = centroid_similarity(
            X, node['document_id_accumulated'])
        node['document_similarity'] = doc_similarity
        node['cluster_similarity'] = inertia

    assert tree.tree_size == reported_size

    total_docs = 0
    for node in tree.flatten():
        total_docs += len(node['document_id'])
        # Bare attribute reads kept from the original; presumably a smoke
        # check that these accessors do not raise.
        node.current_depth
        node.document_id_accumulated

    n_rows = X.shape[0]
    assert total_docs == len(tree['document_id_accumulated'])
    assert total_docs == n_rows
    assert tree.document_count == n_rows

    # No cluster may have exactly one child.
    assert not any(len(node.children) == 1 for node in tree.flatten())
def test_birch_make_hierarchy(dataset, optimal_sampling):
    """Build a Birch hierarchy on a dataset and check its document bookkeeping.

    Parameters
    ----------
    dataset : str
        ``'random'`` for a seeded 1000 x 100 random matrix, or
        ``'birch_hierarchical'`` for the ``ds_lsi_birch/data.npy`` fixture.
    optimal_sampling : bool
        When true, additionally check ``compute_optimal_sampling`` results
        for several ``min_similarity`` / ``min_coverage`` settings.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    if dataset == 'random':
        np.random.seed(9999)
        X = np.random.rand(1000, 100)
        # NOTE(review): the return value is discarded. If `normalize` is
        # scikit-learn's (which returns a new array by default), X stays
        # unnormalized. Left unchanged because altering it would change the
        # data the assertions below run on - confirm the intent.
        normalize(X)
        branching_factor = 10
    elif dataset == 'birch_hierarchical':
        basename = os.path.dirname(__file__)
        X = np.load(
            os.path.join(basename, '..', 'data', 'ds_lsi_birch', 'data.npy'))
        branching_factor = 2
    else:
        # Fail with a clear message instead of a NameError on `X` below.
        raise ValueError('Unknown dataset: {!r}'.format(dataset))

    mod = Birch(n_clusters=None, threshold=0.1,
                branching_factor=branching_factor, compute_labels=False,
                compute_sample_indices=True)
    mod.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(mod)

    # Attach the similarity values to every node of the tree.
    for row in htree.flatten():
        inertia, S_sim = centroid_similarity(X,
                                             row['document_id_accumulated'])
        row['document_similarity'] = S_sim
        row['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    doc_count = 0
    for el in htree.flatten():
        doc_count += len(el['document_id'])
        # Bare attribute reads; presumably a smoke check that these
        # accessors do not raise.
        el.current_depth
        el.document_id_accumulated
    assert doc_count == len(htree['document_id_accumulated'])
    assert doc_count == X.shape[0]
    assert htree.document_count == X.shape[0]

    if optimal_sampling:
        s_samples_1 = compute_optimal_sampling(htree, min_similarity=0.85,
                                               min_coverage=0.9)

        # Every sampled row carries exactly one document.
        for row in s_samples_1:
            assert len(row['document_similarity']) == 1
            assert len(row['document_id_accumulated']) == 1

        s_samples_2 = compute_optimal_sampling(htree, min_similarity=0.85,
                                               min_coverage=0.2)
        s_samples_3 = compute_optimal_sampling(htree, min_similarity=0.9,
                                               min_coverage=0.9)

        # Lower coverage gives fewer samples; higher similarity gives more.
        assert len(s_samples_1) > len(s_samples_2)
        assert len(s_samples_1) < len(s_samples_3)
def __init__(self, model, metric='cosine'):
    """Store the model and its validated hierarchy wrapper.

    Parameters
    ----------
    model : object
        Clustering model handed to ``birch_hierarchy_wrapper`` with
        ``validate=True``.
    metric : str, default 'cosine'
        Stored unchanged on ``self.metric_``.
    """
    self.model = model
    wrapped = birch_hierarchy_wrapper(model, validate=True)
    # The wrapper yields the tree and the number of clusters it found.
    self.htree, self._n_clusters = wrapped
    self.metric_ = metric
# Generate 1000 samples with 10 features and fit a Birch model on them.
X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

cluster_model = Birch(threshold=0.9, branching_factor=20,
                      compute_sample_indices=True)
cluster_model.fit(X)

###############################################################################
#
# Next we wrap each subcluster in the cluster hierarchy
# (``cluster_model.root_``) with the
# :class:`~freediscovery.cluster.BirchSubcluster` class
# that allows easier manipulation of the hierarchical tree.

# The second value returned by the wrapper is not needed here.
htree, _ = birch_hierarchy_wrapper(cluster_model)
print('Total number of subclusters:', htree.tree_size)

###############################################################################
#
# Visualizing the hierarchy
# -------------------------
# We can now visualize the cluster hierarchy:

htree.display_tree()

###############################################################################
#
# We have a hierarchy 2 levels deep, with 78 sub-clusters and a total
# of 1000 samples.
#
item = json.loads(line) item_array = numpy.array([float(item["X"]), float(item["Y"])]) X = numpy.vstack((X, item_array)) except: print("Bad value in file") print(len(X), " values in data") cluster_model = Birch(threshold=0.9, branching_factor=20, compute_sample_indices=True, n_clusters=5) cluster_model.fit(X) htree, _ = birch_hierarchy_wrapper(cluster_model) #print('Total number of subclusters:', htree.tree_size) print(cluster_model.labels_) print(len(cluster_model.labels_)) color = {1: "red", 2: "green", 3: "blue", 4: "black", 5: "white", 0: "yellow"} counter = 0 for point in X: plot.scatter(point[0], point[1], 10, color[cluster_model.labels_[counter]]) counter += 1 plot.title("BIRCH on Set 1") plot.show()
def test_birch_hierarchy_validation():
    """The wrapper must raise ``ValueError`` for a string argument."""
    not_a_model = "some other object"
    with pytest.raises(ValueError):
        birch_hierarchy_wrapper(not_a_model)
def test_birch_hierarchy_fitted():
    """The wrapper must raise ``NotFittedError`` for a Birch model never fitted."""
    # Built outside the ``raises`` block so only the wrapper call is checked.
    unfitted = Birch()
    with pytest.raises(NotFittedError):
        birch_hierarchy_wrapper(unfitted)