Exemplo n.º 1
0
class TestParis(unittest.TestCase):
    """Unit tests for the ``Paris`` estimator on two small reference graphs."""

    def setUp(self):
        """Create a fresh estimator and load the graphs shared by the tests."""
        self.paris = Paris()
        self.house_graph = house_graph()
        self.karate_club_graph = karate_club_graph()

    def test_unknown_types(self):
        """``fit`` raises TypeError for the inputs below."""
        # Adjacency built without an explicit sparse format.
        default_format_identity = identity(1)
        self.assertRaises(TypeError, self.paris.fit, default_format_identity)

        # CSR adjacency, but an integer passed as node weights.
        csr_identity = identity(2, format='csr')
        self.assertRaises(
            TypeError, self.paris.fit, csr_identity, node_weights=1)

    def test_unknown_options(self):
        """``fit`` raises ValueError for an unrecognised node-weight name."""
        csr_identity = identity(2, format='csr')
        self.assertRaises(
            ValueError, self.paris.fit, csr_identity, node_weights='unknown')

    def test_house_graph(self):
        """House graph: 4 dendrogram rows and the expected sorted labels."""
        self.paris.fit(self.house_graph)

        n_rows = self.paris.dendrogram_.shape[0]
        self.assertEqual(n_rows, 4)

        predicted = self.paris.predict(sorted_clusters=True)
        expected = np.array([0, 0, 1, 1, 0])
        self.assertTrue(np.array_equal(predicted, expected))

    def test_karate_club_graph(self):
        """Karate club graph: 33 dendrogram rows, largest predicted label 1."""
        self.paris.fit(self.karate_club_graph)

        n_rows = self.paris.dendrogram_.shape[0]
        self.assertEqual(n_rows, 33)

        predicted = self.paris.predict()
        self.assertEqual(np.max(predicted), 1)
Exemplo n.º 2
0
    def fit_paris(self):
        """Fit a PARIS hierarchical clusterer on ``self.adj`` (the kNN graph).

        The clusterer comes from the scikit-network library. PARIS is
        hierarchical, so fitting produces a dendrogram rather than flat
        clusters; the dendrogram is stored on ``self.dendrogram`` and is meant
        to be cut in a later step.

        Reference: "Hierarchical Graph Clustering using Node Pair Sampling",
        Bonald et al., https://arxiv.org/abs/1806.01664
        """
        if self.verbose:
            print('fitting PARIS hierarchical clustering')

        clusterer = Paris()
        clusterer.fit(self.adj)

        # Keep only the dendrogram; the estimator itself is not retained.
        self.dendrogram = clusterer.dendrogram_
class ParisClusterer(object):
    """Cluster the rows of a feature matrix with PARIS on a k-NN graph.

    Intended call order: ``buildAdjacency()`` -> ``fit()`` ->
    ``balanced_cut(max_cluster_size)``. The final flat cluster labels are
    stored on ``labels_``.
    """

    def __init__(self, featureMatrix):
        """Store the feature matrix; its first axis indexes the instances."""
        self.featureMatrix = featureMatrix

    def buildAdjacency(self, type='pynndescent', nn=50, metric='dice'):
        """Build the weighted, directed nearest-neighbour adjacency matrix.

        Sets ``self.n`` / ``self.d`` (neighbour indices and distances returned
        by ``NNDescent.neighbor_graph``) and ``self.wdAdj`` (CSR float matrix
        whose entry ``[i, j]`` is ``1 - distance(i, j)`` for each neighbour
        ``j`` of instance ``i``).

        Args:
            type: Nearest-neighbour backend. Only ``'pynndescent'`` is
                supported. (The name shadows the builtin but is kept so that
                existing keyword callers continue to work.)
            nn: Number of neighbours requested from ``NNDescent``.
            metric: Distance metric name passed to ``NNDescent``.

        Raises:
            ValueError: If ``type`` is not a supported backend. Previously
                this case failed later with a ``NameError``.
        """
        if type != 'pynndescent':
            raise ValueError(
                "Unsupported nearest-neighbour backend: {!r}".format(type))

        print('Building nearest neighbor graph (slowest step)...')
        nn_index = NNDescent(self.featureMatrix,
                             n_neighbors=nn,
                             metric=metric)
        n, d = nn_index.neighbor_graph
        self.n = n
        self.d = d
        print('Done')

        print('Building weighted, directed adjacency matrix...')
        n_instances = self.featureMatrix.shape[0]
        wdAdj = sparse.dok_matrix((n_instances, n_instances), dtype=float)
        # The row index comes from the enumeration rather than from
        # neighbours[0]: a point is not guaranteed to be listed as its own
        # first neighbour (e.g. duplicate rows tie at distance 0), and using
        # neighbours[0] would then attribute the edges to the wrong instance.
        rows = enumerate(tqdm(zip(n, d), total=n_instances))
        for instanceIndex, (neighbours, distances) in rows:
            for neighbourIndex, distance in zip(neighbours, distances):
                if neighbourIndex == instanceIndex:
                    continue  # no self-loops
                if neighbourIndex < 0:
                    # NOTE(review): negative indices are presumably
                    # "no neighbour found" placeholders - confirm against
                    # pynndescent. Indexing with them would wrap around to
                    # the last columns, so they are skipped.
                    continue
                wdAdj[instanceIndex,
                      neighbourIndex] += 1 - distance  # similarity = 1-distance
        self.wdAdj = sparse.csr_matrix(wdAdj).astype(float)

    def fit(self):
        """Fit PARIS (numba engine) on ``self.wdAdj``.

        ``buildAdjacency`` must have been called first so that ``self.wdAdj``
        exists. The fitted estimator is kept on ``self.paris``.
        """
        self.paris = Paris(engine='numba')
        self.paris.fit(self.wdAdj)

    def balanced_cut(self, max_cluster_size):
        """Cut the dendrogram into clusters of at most ``max_cluster_size``.

        Walks the merges of ``self.paris.dendrogram_`` in order. Row ``t``
        holds, in its first two columns, the ids of the two clusters merged
        at step ``t``; the merged cluster receives id ``n_nodes + t``. When a
        merge would exceed ``max_cluster_size``, each child that is itself
        within the limit is emitted as a final cluster. If no merge ever
        exceeds the limit, every node keeps label 0 (a single cluster).

        Args:
            max_cluster_size: Largest allowed cluster size (inclusive, >= 1).

        Raises:
            ValueError: If ``max_cluster_size`` is smaller than 1.

        Side effects:
            Sets ``self.labels_`` to an integer array with one label per node.
        """
        if max_cluster_size < 1:
            raise ValueError(
                "max_cluster_size must be >= 1, got {!r}".format(
                    max_cluster_size))

        dendrogram = self.paris.dendrogram_
        n_nodes = dendrogram.shape[0] + 1
        labels = np.zeros(n_nodes, dtype=int)
        # Leaf membership of every cluster that has not been merged yet.
        members = {node: [node] for node in range(n_nodes)}
        completed_clusters = []

        for t in range(n_nodes - 1):
            left = members.pop(int(dendrogram[t][0]))
            right = members.pop(int(dendrogram[t][1]))
            if len(left) + len(right) > max_cluster_size:
                # Children already above the limit had their own sub-clusters
                # emitted earlier. The comparison is inclusive: a child of
                # exactly max_cluster_size is a valid cluster and must be
                # emitted, otherwise its nodes would silently stay at label 0.
                completed_clusters.extend(
                    child for child in (left, right)
                    if len(child) <= max_cluster_size)
            members[n_nodes + t] = left + right

        for count, indices in enumerate(completed_clusters):
            labels[indices] = count

        self.labels_ = labels
Exemplo n.º 4
0
class TestMetrics(unittest.TestCase):
    """Checks the dendrogram quality metrics on the karate club graph."""

    def setUp(self):
        """Create the estimator and load the graph used by the test."""
        self.paris = Paris()
        self.karate_club_graph = karate_club_graph()

    def test_karate_club_graph(self):
        """Normalized TSD is ~0.65 and normalized Dasgupta cost is ~0.33."""
        graph = self.karate_club_graph
        tree = self.paris.fit(graph).dendrogram_

        divergence = tree_sampling_divergence(graph, tree, normalized=True)
        self.assertAlmostEqual(divergence, .65, places=2)

        cost = dasgupta_cost(graph, tree, normalized=True)
        self.assertAlmostEqual(cost, .33, places=2)
Exemplo n.º 5
0
class TestParis(unittest.TestCase):
    """Tests ``Paris`` and ``BiParis`` with the python and numba engines."""

    def setUp(self):
        """Build python-engine estimators, plus numba ones when available.

        Without numba, requesting the numba engine is expected to raise
        ValueError, and no numba estimators are created.
        """
        self.paris = Paris(engine='python')
        self.biparis = BiParis(engine='python')
        if not is_numba_available:
            with self.assertRaises(ValueError):
                Paris(engine='numba')
            return
        self.paris_numba = Paris(engine='numba')
        self.biparis_numba = BiParis(engine='numba')

    def _check_house_clustering(self, estimator, graph):
        """Fit ``estimator`` on the house graph and verify rows and labels."""
        estimator.fit(graph)
        self.assertEqual(estimator.dendrogram_.shape[0], 4)
        labels = straight_cut(estimator.dendrogram_, sorted_clusters=True)
        expected = np.array([0, 0, 1, 1, 0])
        self.assertTrue(np.array_equal(labels, expected))

    # noinspection PyTypeChecker
    def test_unknown_types(self):
        """``fit`` raises TypeError for ``sparse.identity(1)``."""
        with self.assertRaises(TypeError):
            self.paris.fit(sparse.identity(1))

    def test_undirected(self):
        """House graph on each available engine, then karate club (python)."""
        house_graph = house()
        if is_numba_available:
            self._check_house_clustering(self.paris_numba, house_graph)
        self._check_house_clustering(self.paris, house_graph)

        karate_club_graph = karate_club()
        self.paris.fit(karate_club_graph)
        n_rows = self.paris.dendrogram_.shape[0]
        self.assertEqual(n_rows, 33)
        labels = straight_cut(self.paris.dendrogram_)
        self.assertEqual(np.max(labels), 1)

    def test_bipartite(self):
        """Star Wars villains graph: dendrogram shape is (6, 4)."""
        star_wars_graph = star_wars_villains()

        self.biparis.fit(star_wars_graph)
        self.assertEqual(self.biparis.dendrogram_.shape, (6, 4))

        if is_numba_available:
            self.biparis_numba.fit(star_wars_graph)
            self.assertEqual(self.biparis_numba.dendrogram_.shape, (6, 4))