class TestParis(unittest.TestCase):
    """Unit tests for the ``Paris`` hierarchical clustering estimator."""

    def setUp(self):
        """Create a fresh estimator and the two reference graphs for each test."""
        self.paris = Paris()
        self.house_graph = house_graph()
        self.karate_club_graph = karate_club_graph()

    def test_unknown_types(self):
        """``fit`` is expected to raise ``TypeError`` on unsupported argument types."""
        # Adjacency built without an explicit sparse format.
        with self.assertRaises(TypeError):
            self.paris.fit(identity(1))
        # ``node_weights`` given as a bare number.
        with self.assertRaises(TypeError):
            self.paris.fit(identity(2, format='csr'), node_weights=1)

    def test_unknown_options(self):
        """``fit`` is expected to raise ``ValueError`` on an unrecognised weight option."""
        with self.assertRaises(ValueError):
            self.paris.fit(identity(2, format='csr'), node_weights='unknown')

    def test_house_graph(self):
        """House graph: 4 dendrogram rows and a fixed sorted labelling."""
        model = self.paris
        model.fit(self.house_graph)
        n_merges = model.dendrogram_.shape[0]
        self.assertEqual(n_merges, 4)
        predicted = model.predict(sorted_clusters=True)
        expected = np.array([0, 0, 1, 1, 0])
        self.assertTrue(np.array_equal(predicted, expected))

    def test_karate_club_graph(self):
        """Karate club graph: 33 dendrogram rows; default prediction uses labels 0 and 1 at most."""
        model = self.paris
        model.fit(self.karate_club_graph)
        n_merges = model.dendrogram_.shape[0]
        self.assertEqual(n_merges, 33)
        predicted = model.predict()
        self.assertEqual(np.max(predicted), 1)
def fit_paris(self):
    """Fit a PARIS hierarchical clusterer on ``self.adj`` and keep its dendrogram.

    The clusterer comes from the scikit-network library. PARIS is a
    hierarchical method, so the outcome stored in ``self.dendrogram`` is a
    dendrogram rather than flat cluster labels; cutting it into clusters is
    left to a later step.

    Reference: Bonald et al., "Hierarchical Graph Clustering using Node Pair
    Sampling", https://arxiv.org/abs/1806.01664
    """
    if self.verbose:
        print('fitting PARIS hierarchical clustering')

    clusterer = Paris()
    clusterer.fit(self.adj)
    self.dendrogram = clusterer.dendrogram_
class ParisClusterer(object):
    """Cluster the rows of a feature matrix with PARIS on a nearest-neighbour graph.

    Intended call order: ``buildAdjacency()`` -> ``fit()`` -> ``balanced_cut()``.
    Each step stores its result on the instance for the next one
    (``wdAdj``, ``paris``, ``labels_`` respectively).
    """

    def __init__(self, featureMatrix):
        """Store the feature matrix; no computation happens here.

        Args:
            featureMatrix: 2-D array-like passed to ``NNDescent``; its
                ``shape[0]`` is used as the number of graph nodes.
        """
        self.featureMatrix = featureMatrix

    def buildAdjacency(self, type='pynndescent', nn=50, metric='dice'):
        """Build a weighted, directed nearest-neighbour adjacency matrix.

        The result is stored in ``self.wdAdj`` as a float CSR matrix. The raw
        neighbour indices and distances are kept in ``self.n`` and ``self.d``.

        Args:
            type: Nearest-neighbour backend. Only ``'pynndescent'`` is
                supported. (The name shadows the builtin but is kept for
                backward compatibility with keyword callers.)
            nn: Number of neighbours requested from the index.
            metric: Distance metric name passed to ``NNDescent``.

        Raises:
            ValueError: If ``type`` is not a supported backend.
        """
        # Fail early with a clear message instead of hitting unbound locals
        # further down when an unsupported backend is requested.
        if type != 'pynndescent':
            raise ValueError(
                "Unsupported nearest neighbour backend: {!r} "
                "(expected 'pynndescent')".format(type))

        print('Building nearest neighbor graph (slowest step)...')
        nn_index = NNDescent(self.featureMatrix, n_neighbors=nn, metric=metric)
        n, d = nn_index.neighbor_graph
        self.n = n
        self.d = d
        print('Done')

        print('Building weighted, directed adjacency matrix...')
        n_instances = self.featureMatrix.shape[0]
        wdAdj = sparse.dok_matrix((n_instances, n_instances), dtype=float)
        for neighbours, distances in tqdm(zip(n, d)):
            # NOTE(review): assumes the first neighbour of every row is the
            # instance itself - confirm this holds for the chosen metric.
            instanceIndex = neighbours[0]
            for neighbourIndex, distance in zip(neighbours[1:], distances[1:]):
                # Edge weight is a similarity: 1 - distance.
                wdAdj[instanceIndex, neighbourIndex] += 1 - distance
        self.wdAdj = sparse.csr_matrix(wdAdj).astype(float)

    def fit(self):
        """Fit PARIS on ``self.wdAdj``; the fitted model is kept in ``self.paris``."""
        self.paris = Paris(engine='numba')
        self.paris.fit(self.wdAdj)

    def balanced_cut(self, max_cluster_size):
        """Cut the fitted dendrogram into clusters of at most ``max_cluster_size`` nodes.

        The dendrogram is replayed merge by merge. Whenever a merge would
        produce a cluster larger than ``max_cluster_size``, each of its two
        children that still fits within the limit is frozen as a final
        cluster. Labels are written to ``self.labels_`` (one int per node,
        numbered in the order clusters were frozen). If the whole tree fits
        within the limit, no cluster is frozen and every node keeps label 0.

        Args:
            max_cluster_size: Maximum number of nodes allowed in one cluster.
        """
        dendrogram = self.paris.dendrogram_
        n_nodes = dendrogram.shape[0] + 1
        labels = np.zeros(n_nodes, dtype=int)
        # Maps a live dendrogram node id to the leaf indices it contains.
        cluster = {node: [node] for node in range(n_nodes)}
        completed_clusters = []

        for t in range(n_nodes - 1):
            left = cluster.pop(int(dendrogram[t][0]))
            right = cluster.pop(int(dendrogram[t][1]))
            if len(left) + len(right) > max_cluster_size:
                for clust in (left, right):
                    # ``<=`` (not ``<``): a child of exactly the maximum size
                    # is still a valid cluster. Skipping it would leave its
                    # nodes on the default label 0, silently merging them
                    # with the first frozen cluster. Children already above
                    # the limit had their own children frozen earlier.
                    if len(clust) <= max_cluster_size:
                        completed_clusters.append(clust)
            # Row t of the dendrogram creates node id n_nodes + t.
            cluster[n_nodes + t] = left + right

        for count, indices in enumerate(completed_clusters):
            labels[indices] = count
        self.labels_ = labels
class TestMetrics(unittest.TestCase):
    """Checks hierarchy quality metrics on a PARIS dendrogram of the karate club graph."""

    def setUp(self):
        """Create the estimator and the reference graph used by the test."""
        self.paris = Paris()
        self.karate_club_graph = karate_club_graph()

    def test_karate_club_graph(self):
        """Normalized TSD should be about 0.65 and normalized Dasgupta cost about 0.33."""
        graph = self.karate_club_graph
        tree = self.paris.fit(graph).dendrogram_

        divergence = tree_sampling_divergence(graph, tree, normalized=True)
        self.assertAlmostEqual(divergence, .65, places=2)

        cost = dasgupta_cost(graph, tree, normalized=True)
        self.assertAlmostEqual(cost, .33, places=2)
class TestParis(unittest.TestCase):
    """Tests ``Paris`` and ``BiParis`` with the python engine, and numba when available."""

    def setUp(self):
        """Build python-engine estimators, plus numba ones if numba is available.

        Without numba, requesting the numba engine is expected to raise
        ``ValueError``.
        """
        self.paris = Paris(engine='python')
        self.biparis = BiParis(engine='python')
        if not is_numba_available:
            with self.assertRaises(ValueError):
                Paris(engine='numba')
            return
        self.paris_numba = Paris(engine='numba')
        self.biparis_numba = BiParis(engine='numba')

    # noinspection PyTypeChecker
    def test_unknown_types(self):
        """``fit`` is expected to raise ``TypeError`` for ``sparse.identity(1)``."""
        with self.assertRaises(TypeError):
            self.paris.fit(sparse.identity(1))

    def _check_house(self, model, graph):
        """Fit ``model`` on the house graph and verify dendrogram size and sorted labels."""
        model.fit(graph)
        self.assertEqual(model.dendrogram_.shape[0], 4)
        predicted = straight_cut(model.dendrogram_, sorted_clusters=True)
        expected = np.array([0, 0, 1, 1, 0])
        self.assertTrue(np.array_equal(predicted, expected))

    # noinspection DuplicatedCode
    def test_undirected(self):
        """House graph on every available engine, then karate club on the python engine."""
        house_graph = house()
        if is_numba_available:
            self._check_house(self.paris_numba, house_graph)
        self._check_house(self.paris, house_graph)

        karate_club_graph = karate_club()
        self.paris.fit(karate_club_graph)
        self.assertEqual(self.paris.dendrogram_.shape[0], 33)
        predicted = straight_cut(self.paris.dendrogram_)
        self.assertEqual(np.max(predicted), 1)

    def test_bipartite(self):
        """``BiParis`` on the star-wars graph yields a (6, 4) dendrogram on each engine."""
        star_wars_graph = star_wars_villains()
        # Python engine first, then numba if present (same order as the checks ran before).
        models = [self.biparis]
        if is_numba_available:
            models.append(self.biparis_numba)
        for model in models:
            model.fit(star_wars_graph)
            dendrogram = model.dendrogram_
            self.assertEqual(dendrogram.shape, (6, 4))