예제 #1
0
 def get_cluster_tree(self):
     """Build a ``bcluster`` node for every subcluster of the Birch hierarchy.

     ``self.Birch_clusterer`` is wrapped with ``birch_hierarchy_wrapper``
     (the result is stored in ``self.htree``) and the flattened hierarchy
     is walked in order.  For each subcluster a ``bcluster`` node is filled
     with its id, depth, parent node, size, data point names, centroid,
     the d1/d2 values and the outlier threshold.

     Returns
     -------
     tuple
         ``(cluster_tree, max_depth)`` where ``cluster_tree`` is a dict
         mapping the flattened position of each subcluster to its node
         (also stored in ``self.cluster_tree``) and ``max_depth`` is the
         largest ``current_depth`` encountered (0 when there are none).
     """
     self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
     # Flatten once instead of re-flattening the whole tree per iteration.
     flat_tree = self.htree.flatten()
     clusters = {}
     max_depth = 0
     # Assign up front so the attribute exists even when there are no
     # subclusters; the dict is filled in place by the loop below.
     self.cluster_tree = clusters
     for i in range(n_clusters):
         sub_cluster = flat_tree[i]
         depth = sub_cluster.current_depth
         data_points = sub_cluster['document_id_accumulated']

         node = bcluster()
         node.set_cluster_id(sub_cluster['cluster_id'])
         node.set_depth(depth)
         max_depth = max(max_depth, depth)

         if depth == 0:
             # Root level: no parent node to attach.
             node.set_parent()
         else:
             # NOTE(review): nodes are stored by flattened position but
             # looked up by the parent's cluster_id; this assumes both
             # numberings coincide and that parents precede children.
             node.set_parent(clusters[sub_cluster.parent['cluster_id']])

         node.set_size(sub_cluster['cluster_size'])

         # Translate positional row indices into the index labels of
         # self.data (presumably a pandas DataFrame -- confirm).
         selected_rows = self.data.iloc[data_points]
         node.set_data_points(selected_rows.index.values.tolist())

         centroid = self.data.iloc[data_points, :].mean(axis=0).values
         node.set_centroid(centroid)

         d1, d1_v = self.calculate_d1(centroid, data_points)
         d2 = self.calculate_d2(centroid, data_points, d1_v)
         node.add_d1(d1)
         node.add_d2(d2)
         node.calculate_threshold(self.outlier_threshold)

         clusters[i] = node
     return self.cluster_tree, max_depth
예제 #2
0
def test_birch_example_reproducibility(example_id):
    """Check that the Birch hierarchy example is reproducible for a fixed seed."""
    random_state = np.random.RandomState(42)
    X, y = make_blobs(n_samples=1000,
                      n_features=10,
                      random_state=random_state)

    birch = Birch(threshold=0.9,
                  branching_factor=20,
                  compute_sample_indices=True)
    birch.fit(X)
    # Disabled check kept from the original:
    # assert len(birch.root_.subclusters_[1].child_.subclusters_) == 3

    htree, n_subclusters = birch_hierarchy_wrapper(birch)

    assert htree.tree_size == n_subclusters

    # same random seed as in the birch hierarchy example
    assert htree.tree_size == 78

    flat_nodes = htree.flatten()
    selected = flat_nodes[example_id]
    if example_id == 34:
        # this is true in both cases, but example_id fails on circle ci
        assert selected.current_depth == 1
        assert len(selected.children) == 3

    # Cluster ids must follow the flattened order: 0, 1, ..., tree_size - 1.
    cluster_ids = [node['cluster_id'] for node in flat_nodes]
    assert_array_equal(cluster_ids, np.arange(htree.tree_size))
예제 #3
0
def test_birch_clusterig_single_nodes():
    """Check the wrapped Birch hierarchy has no cluster with exactly one child."""
    data_path = os.path.join(os.path.dirname(__file__),
                             '..', 'data', 'ds_lsi_birch', 'data.npy')
    X = np.load(data_path)
    branching_factor = 5

    birch = Birch(n_clusters=None,
                  threshold=0.1,
                  branching_factor=branching_factor,
                  compute_labels=False,
                  compute_sample_indices=True)
    birch.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(birch)

    # Attach the similarity measures to every node of the hierarchy.
    for node in htree.flatten():
        inertia, S_sim = centroid_similarity(X,
                                             node['document_id_accumulated'])
        node['document_similarity'] = S_sim
        node['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    n_documents = 0
    for node in htree.flatten():
        n_documents += len(node['document_id'])
        # Bare attribute reads: they only verify the attributes resolve.
        node.current_depth
        node.document_id_accumulated
    assert n_documents == len(htree['document_id_accumulated'])
    assert n_documents == X.shape[0]
    assert htree.document_count == X.shape[0]

    # make sure that we have no clusters with a single child
    single_child_flags = [len(node.children) == 1
                          for node in htree.flatten()]
    assert sum(single_child_flags) == 0
예제 #4
0
def test_birch_make_hierarchy(dataset, optimal_sampling):
    """Build a Birch hierarchy on a dataset and check its document bookkeeping.

    Parameters
    ----------
    dataset : str
        ``'random'`` for a seeded random matrix of shape (1000, 100), or
        ``'birch_hierarchical'`` for the ``ds_lsi_birch`` array loaded from
        the data directory next to this test module.
    optimal_sampling : bool
        When true, also run ``compute_optimal_sampling`` with several
        similarity / coverage settings and compare the result sizes.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    if dataset == 'random':
        np.random.seed(9999)

        X = np.random.rand(1000, 100)
        # NOTE(review): the return value is discarded. If this is
        # sklearn.preprocessing.normalize, it returns a normalized copy by
        # default, so X may be left unchanged here -- confirm whether
        # ``X = normalize(X)`` was intended before changing the test data.
        normalize(X)
        branching_factor = 10
    elif dataset == 'birch_hierarchical':
        basename = os.path.dirname(__file__)
        X = np.load(
            os.path.join(basename, '..', 'data', 'ds_lsi_birch', 'data.npy'))
        branching_factor = 2
    else:
        # Fail clearly instead of hitting an unbound X further down.
        raise ValueError('Unknown dataset: {!r}'.format(dataset))

    mod = Birch(n_clusters=None,
                threshold=0.1,
                branching_factor=branching_factor,
                compute_labels=False,
                compute_sample_indices=True)
    mod.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(mod)

    # Attach the similarity measures to every node of the hierarchy.
    for row in htree.flatten():
        inertia, S_sim = centroid_similarity(X, row['document_id_accumulated'])
        row['document_similarity'] = S_sim
        row['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    doc_count = 0
    for el in htree.flatten():
        doc_count += len(el['document_id'])
        # Bare attribute reads: they only verify the attributes resolve.
        el.current_depth
        el.document_id_accumulated
    assert doc_count == len(htree['document_id_accumulated'])
    assert doc_count == X.shape[0]
    assert htree.document_count == X.shape[0]

    if optimal_sampling:
        s_samples_1 = compute_optimal_sampling(htree,
                                               min_similarity=0.85,
                                               min_coverage=0.9)

        for row in s_samples_1:
            assert len(row['document_similarity']) == 1
            assert len(row['document_id_accumulated']) == 1
        s_samples_2 = compute_optimal_sampling(htree,
                                               min_similarity=0.85,
                                               min_coverage=0.2)
        s_samples_3 = compute_optimal_sampling(htree,
                                               min_similarity=0.9,
                                               min_coverage=0.9)

        # Lower coverage yields fewer samples; higher similarity yields more.
        assert len(s_samples_1) > len(s_samples_2)
        assert len(s_samples_1) < len(s_samples_3)
예제 #5
0
 def __init__(self, model, metric='cosine'):
     """Store the model, its wrapped hierarchy and the metric name.

     ``birch_hierarchy_wrapper`` is called with ``validate=True``; the
     hierarchy it returns is kept in ``htree`` and the accompanying count
     in ``_n_clusters``.
     """
     self.model = model
     hierarchy, subcluster_count = birch_hierarchy_wrapper(
         model, validate=True)
     self.htree = hierarchy
     self._n_clusters = subcluster_count
     self.metric_ = metric
예제 #6
0
# Generate the sample data with ``make_blobs`` (1000 samples, 10 features)
# and fit a Birch model on it.
# NOTE(review): ``rng`` is not defined in this excerpt -- presumably a seeded
# random state created earlier in the script; confirm.
X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

cluster_model = Birch(threshold=0.9,
                      branching_factor=20,
                      compute_sample_indices=True)
cluster_model.fit(X)

###############################################################################
#
# Next we wrap each subcluster in the cluster hierarchy
# (``cluster_model.root_``) with the
# :class:`~freediscovery.cluster.BirchSubcluster` class
# that allows easier manipulation of the hierarchical tree.

htree, _ = birch_hierarchy_wrapper(cluster_model)
print('Total number of subclusters:', htree.tree_size)

###############################################################################
#
# Visualizing the hierarchy
# -------------------------
# We can now visualize the cluster hierarchy:

htree.display_tree()

###############################################################################
#
# We have a hierarchy 2 levels deep, with 78 sub-clusters and a total
# of 1000 samples.
#
예제 #7
0
        item = json.loads(line)
        item_array = numpy.array([float(item["X"]), float(item["Y"])])
        X = numpy.vstack((X, item_array))
except:
    print("Bad value in file")

# Report how many rows ended up in X.
print(len(X), " values in data")

cluster_model = Birch(threshold=0.9,
                      branching_factor=20,
                      compute_sample_indices=True,
                      n_clusters=5)

cluster_model.fit(X)

# The wrapped hierarchy is only referenced by the disabled print below.
htree, _ = birch_hierarchy_wrapper(cluster_model)
#print('Total number of subclusters:', htree.tree_size)

print(cluster_model.labels_)

print(len(cluster_model.labels_))

# Colour used for each cluster label.
color = {1: "red", 2: "green", 3: "blue", 4: "black", 5: "white", 0: "yellow"}

# Pair every point with its label directly instead of keeping a manual
# index counter in step with the loop.
for point, label in zip(X, cluster_model.labels_):
    plot.scatter(point[0], point[1], 10, color[label])

plot.title("BIRCH on Set 1")
plot.show()
예제 #8
0
def test_birch_hierarchy_validation():
    """``birch_hierarchy_wrapper`` must raise ValueError for a plain string."""
    not_a_model = "some other object"
    with pytest.raises(ValueError):
        birch_hierarchy_wrapper(not_a_model)
예제 #9
0
def test_birch_hierarchy_fitted():
    """Wrapping a Birch model that was never fitted must raise NotFittedError."""
    unfitted_model = Birch()
    with pytest.raises(NotFittedError):
        birch_hierarchy_wrapper(unfitted_model)