def test_options(self):
    """Check the effect of the main Louvain options on the karate club graph."""
    adjacency = karate_club()

    def n_clusters(algo):
        # Fit the algorithm and count the distinct labels it produces.
        return len(set(algo.fit_transform(adjacency)))

    # higher resolution -> more clusters
    self.assertEqual(n_clusters(Louvain(resolution=2)), 7)
    # loose aggregation tolerance
    self.assertEqual(n_clusters(Louvain(resolution=2, tol_aggregation=0.1)), 12)
    # node shuffling with a fixed seed
    self.assertEqual(
        n_clusters(Louvain(resolution=2, shuffle_nodes=True, random_state=42)), 9)
    # aggregate graph: one node per cluster
    algo = Louvain(return_aggregate=True)
    n_labels = len(set(algo.fit_transform(adjacency)))
    self.assertEqual(algo.adjacency_.shape, (n_labels, n_labels))
    # limited aggregation steps, clusters left unsorted: must simply run
    Louvain(n_aggregations=1, sort_clusters=False).fit(adjacency)
def test_options_with_64_bit(self):
    """Run the same option checks with the sparse index forced to 64 bits."""
    adjacency = karate_club()
    # force a 64-bit index to exercise the int64 code path
    adjacency.indices = adjacency.indices.astype(np.int64)
    adjacency.indptr = adjacency.indptr.astype(np.int64)

    # (algorithm, expected number of clusters) pairs
    cases = [
        (Louvain(resolution=2), 7),
        (Louvain(resolution=2, tol_aggregation=0.1), 12),
        (Louvain(resolution=2, shuffle_nodes=True, random_state=42), 9),
    ]
    for algo, expected in cases:
        self.assertEqual(len(set(algo.fit_transform(adjacency))), expected)

    # aggregate graph: one node per cluster
    algo = Louvain(return_aggregate=True)
    labels = algo.fit_transform(adjacency)
    n_labels = len(set(labels))
    self.assertEqual(algo.aggregate_.shape, (n_labels, n_labels))
    # limited aggregation steps, clusters left unsorted: must simply run
    Louvain(n_aggregations=1, sort_clusters=False).fit(adjacency)
    # labels themselves must be 64-bit integers
    self.assertEqual(labels.dtype, np.int64)
def test_modularity(self):
    """Dugue and Newman modularities must agree on the karate club graph."""
    adjacency = karate_club()
    labels_dugue = Louvain(modularity='dugue').fit_transform(adjacency)
    labels_newman = Louvain(modularity='newman').fit_transform(adjacency)
    self.assertTrue((labels_dugue == labels_newman).all())
    # Potts modularity: only check that fitting runs without error.
    Louvain(modularity='potts').fit_transform(adjacency)
def test_bilouvain(self):
    """Bipartite fit must match a fit on the undirected expansion of the graph."""
    biadjacency = star_wars()
    adjacency = bipartite2undirected(biadjacency)
    algo = Louvain(modularity='newman')
    # labels from the full (expanded) undirected graph
    labels_full = algo.fit_transform(adjacency)
    # labels from the bipartite fit, rows then columns
    algo.fit(biadjacency)
    labels_bipartite = np.concatenate((algo.labels_row_, algo.labels_col_))
    self.assertTrue((labels_full == labels_bipartite).all())
def louvain(cls, g, labels):
    """Relabel a 3-d label volume with the Louvain communities of the RAG of g.

    Each voxel value v is replaced by the community assigned to node v - 1
    of the region adjacency graph (labels are 1-based, communities 0-based).

    Args:
        g: graph-like object accepted by cls.ragToAdjacencyMatrix.
        labels: 3-d array of 1-based region labels.

    Returns:
        A new array of the same shape where each voxel holds its community.
    """
    adjacency = cls.ragToAdjacencyMatrix(g, 'similarity')
    communities = Louvain().fit_transform(adjacency)
    relabeled = labels.copy()
    depth, height, width = labels.shape
    for k in range(depth):
        for j in range(height):
            for i in range(width):
                # shift by one: region label v maps to graph node v - 1
                relabeled[k, j, i] = communities[labels[k, j, i] - 1]
    return relabeled
def make_structure_louvain_W2V(
    keywords,
    words_vectors,
    tree,
    gismo,
    root=True,
    depth=3,
):
    """Recursively build a tree structure with the Louvain clustering method.

    Args:
        keywords: list of words to cluster at this level.
        words_vectors: word-embedding matrix, one row per keyword.
        tree: the empty node that will be filled in.
        gismo: gismo object providing the embedding and the word ranking.
        root: True on the first call only, to fill the root node itself.
        depth: maximum remaining recursion depth.

    Returns:
        None, it fills in the empty node that is given at first, recursively.
    """
    # At the root, every word belongs to the (single) cluster.
    if root:
        tree.members = keywords
        tree.centroid = sum([
            gismo.embedding.query_projection(member)[0]
            for member in tree.members
        ])
        # Title: the 10 best-ranked features that belong to this node.
        tree.title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in tree.members
        ][:10])
    if depth == 0 or len(tree.members) == 1:
        return None

    # Build the word affinity matrix by reversing the distances: every
    # stored entry becomes max(distance) - distance, hence non-negative.
    words_adjacency = building_distances_matrix(words_vectors)
    max_vector = np.ones(np.shape(
        words_adjacency.data)) * np.max(words_adjacency)
    words_adjacency.data = max_vector - words_adjacency.data
    if words_adjacency.data.sum() == 0:
        # All words are at maximal distance: nothing to cluster.
        return None

    # Louvain clustering; labels are integers 0..n_clusters-1.
    louvain = Louvain()
    labels = louvain.fit_transform(words_adjacency)
    labels_unique = np.unique(labels)
    if len(labels_unique) == 1:
        return None

    # One child node per cluster.
    children = [Node() for _ in labels_unique]
    children_members_indexes = [[] for _ in children]
    print(labels_unique)  # debug output
    print(keywords)  # debug output
    for label in labels_unique:
        # Fill in the members of each child.
        children_members_indexes[label] = np.where(labels == label)[0].tolist()
        try:
            words = [
                keywords[word_index]
                for word_index in children_members_indexes[label]
            ]
        except Exception:  # was a bare except; at least spare BaseException
            print("plantage avec les mots clef : ", keywords,
                  " et les étiquettes : ", labels_unique)
            return None
        children[label].members = words
        children[label].centroid = sum(
            [gismo.embedding.query_projection(word)[0] for word in words])
        # Bug fix: the title belongs to the child, not the parent — the
        # original overwrote tree.title on every iteration and left the
        # children untitled, unlike make_structure_louvain_gismo_embedding.
        children[label].title = " ".join([
            gismo.embedding.features[i] for i in gismo.diteration.y_order
            if gismo.embedding.features[i] in words
        ][:10])
    tree.children = children

    # Recurse on every child with its own subset of word vectors.
    for (label, child) in enumerate(tree.children):
        make_structure_louvain_W2V(
            keywords=child.members,
            words_vectors=words_vectors[children_members_indexes[label], :],
            gismo=gismo,
            tree=child,
            root=False,
            depth=depth - 1)
def make_structure_louvain_gismo_embedding(gismo,
                                           tree,
                                           keywords_indexes,
                                           root=True,
                                           depth=3):
    """Recursively build a tree structure with the Louvain clustering method.

    Args:
        gismo: the gismo built from the dataset.
        tree: the empty node that will be filled in.
        keywords_indexes: indexes of the features clustered at this level.
        root: True on the first call only, to fill the root node itself.
        depth: maximum remaining recursion depth.

    Returns:
        None, it fills in the empty node that is given at first, recursively.
    """
    features = gismo.embedding.features
    # At the root, every word belongs to the (single) cluster.
    if root:
        tree.members = [features[index] for index in keywords_indexes]
        tree.centroid = sum(
            gismo.embedding.query_projection(member)[0]
            for member in tree.members)
        # Title: the 10 best-ranked features that belong to this node.
        tree.title = " ".join([
            features[i] for i in gismo.diteration.y_order
            if features[i] in tree.members
        ][:10])
    if depth == 0 or len(tree.members) == 1:
        return None

    # Cluster the members by cosine similarity of their embedding rows.
    words_vectors = gismo.embedding.y[keywords_indexes, :]
    words_adjacency = cosine_similarity(words_vectors, dense_output=False)
    labels = Louvain().fit_transform(words_adjacency)
    labels_unique = np.unique(labels)

    # One child node per cluster.
    children = [Node() for _ in labels_unique]
    for label in labels_unique:
        # Members of this child: features whose row received this label.
        words_indexes = keywords_indexes[np.where(labels == label)]
        words = [features[word_index] for word_index in words_indexes]
        child = children[label]
        child.members = words
        child.centroid = sum(
            gismo.embedding.query_projection(word)[0] for word in words)
        child.title = " ".join([
            features[i] for i in gismo.diteration.y_order
            if features[i] in words
        ][:10])
    tree.children = children

    # Recurse on every child with its own feature subset.
    for child in tree.children:
        make_structure_louvain_gismo_embedding(
            gismo,
            child,
            np.array([features.index(word) for word in child.members]),
            root=False,
            depth=depth - 1)
class TestLouvainClustering(unittest.TestCase):
    """Unit tests for the Louvain and BiLouvain clustering algorithms."""

    def setUp(self):
        """Build the shared python-engine fixtures; numba ones when available."""
        self.louvain = Louvain(engine='python')
        self.bilouvain = BiLouvain(engine='python')
        if is_numba_available:
            self.louvain_numba = Louvain(engine='numba')
            self.bilouvain_numba = BiLouvain(engine='numba')
        else:
            # Requesting the numba engine without numba must raise.
            with self.assertRaises(ValueError):
                Louvain(engine='numba')

    def test_unknown_types(self):
        """Non-CSR sparse input must be rejected with a TypeError."""
        with self.assertRaises(TypeError):
            self.louvain.fit(sparse.identity(1))

    def test_single_node_graph(self):
        """A single-node graph yields the single label 0."""
        self.assertEqual(
            self.louvain.fit_transform(sparse.identity(1, format='csr')), [0])

    def test_simple_graph(self):
        """A 10-node directed graph, made undirected, gets one label per node."""
        self.simple_directed_graph = simple_directed_graph()
        self.louvain.fit(directed2undirected(self.simple_directed_graph))
        self.assertEqual(len(self.louvain.labels_), 10)

    def test_undirected(self):
        """Karate club: check label shapes and expected modularity values."""
        self.louvain_high_resolution = Louvain(engine='python', resolution=2)
        self.louvain_null_resolution = Louvain(engine='python', resolution=0)
        self.karate_club = karate_club()
        self.louvain.fit(self.karate_club)
        labels = self.louvain.labels_
        self.assertEqual(labels.shape, (34, ))
        # default resolution: modularity close to the known optimum 0.42
        self.assertAlmostEqual(modularity(self.karate_club, labels), 0.42, 2)
        if is_numba_available:
            # numba engine must match the python engine's result
            self.louvain_numba.fit(self.karate_club)
            labels = self.louvain_numba.labels_
            self.assertEqual(labels.shape, (34, ))
            self.assertAlmostEqual(modularity(self.karate_club, labels), 0.42, 2)
        # higher resolution: smaller clusters, lower modularity
        self.louvain_high_resolution.fit(self.karate_club)
        labels = self.louvain_high_resolution.labels_
        self.assertEqual(labels.shape, (34, ))
        self.assertAlmostEqual(modularity(self.karate_club, labels), 0.34, 2)
        # zero resolution: everything collapses into a single cluster
        self.louvain_null_resolution.fit(self.karate_club)
        labels = self.louvain_null_resolution.labels_
        self.assertEqual(labels.shape, (34, ))
        self.assertEqual(len(set(self.louvain_null_resolution.labels_)), 1)

    def test_directed(self):
        """Painters graph: directed Louvain and BiLouvain label shapes."""
        self.painters = painters(return_labels=False)
        self.louvain.fit(self.painters)
        labels = self.louvain.labels_
        self.assertEqual(labels.shape, (14, ))
        self.assertAlmostEqual(modularity(self.painters, labels), 0.32, 2)
        # BiLouvain treats the square adjacency as a bipartite graph.
        self.bilouvain.fit(self.painters)
        n1, n2 = self.painters.shape
        row_labels = self.bilouvain.row_labels_
        col_labels = self.bilouvain.col_labels_
        self.assertEqual(row_labels.shape, (n1, ))
        self.assertEqual(col_labels.shape, (n2, ))

    def test_bipartite(self):
        """Star-wars bipartite graph: 4 row labels and 3 column labels."""
        star_wars_graph = star_wars_villains()
        self.bilouvain.fit(star_wars_graph)
        row_labels = self.bilouvain.row_labels_
        col_labels = self.bilouvain.col_labels_
        self.assertEqual(row_labels.shape, (4, ))
        self.assertEqual(col_labels.shape, (3, ))
        if is_numba_available:
            # numba engine must produce labels of the same shapes
            self.bilouvain_numba.fit(star_wars_graph)
            row_labels = self.bilouvain_numba.row_labels_
            col_labels = self.bilouvain_numba.col_labels_
            self.assertEqual(row_labels.shape, (4, ))
            self.assertEqual(col_labels.shape, (3, ))

    def test_shuffling(self):
        """Node shuffling with different seeds must not change node 1's label."""
        self.louvain_shuffle_first = Louvain(engine='python',
                                             shuffle_nodes=True,
                                             random_state=0)
        self.louvain_shuffle_second = Louvain(engine='python',
                                              shuffle_nodes=True,
                                              random_state=123)
        self.bow_tie = bow_tie()
        self.louvain_shuffle_first.fit(self.bow_tie)
        self.assertEqual(self.louvain_shuffle_first.labels_[1], 1)
        self.louvain_shuffle_second.fit(self.bow_tie)
        self.assertEqual(self.louvain_shuffle_second.labels_[1], 1)
# Organization-level ranking: average each centrality per organization,
# rank organizations, then combine the ranks into a single score.
# NOTE(review): katzrank_org and pagerank_org are built earlier in the
# script (not visible here) — presumably the same groupby/rank pipeline.
katzrank_org = katzrank_org.to_frame()
indegreerank_org = data.groupby('Organization')['indegreerank'].mean()
indegreerank_org.to_csv('indegreerank_org.csv')
# rank organizations by mean in-degree rank, best first
indegreerank_org = indegreerank_org.rank(ascending=False)
indegreerank_org = indegreerank_org.to_frame()
# combined score: mean of the pagerank, katz and in-degree ranks
org_rank = pagerank_org.join(katzrank_org)
org_rank = org_rank.join(indegreerank_org)
org_rank = org_rank.mean(axis=1)
org_rank.to_csv('org_rank.csv')

##### pattern detection: Louvain communities and their modularity
adjacency = nx.adjacency_matrix(G)
louvain = Louvain()
labels = louvain.fit_transform(adjacency)
labels_unique, counts = np.unique(labels, return_counts=True)
# modularity of the Louvain partition (the reference value)
optimal_modularity = modularity(adjacency, labels)

##### modularity of the attribute-induced partitions, for comparison
# partition by organization membership
organization = network_data['Organization']
organization = organization.to_numpy()
organization_label = pd.factorize(organization)[0]
organization_modularity = modularity(adjacency, organization_label)
# partition by the 'hireable' flag
hireable = network_data['hireable']
hireable = hireable.to_numpy()
hireable_label = pd.factorize(hireable)[0]
hireable_modularity = modularity(adjacency, hireable_label)
# Dump the adjacency triplets to CSV.
# NOTE(review): adj appears to be [rows, cols, weights] (three parallel
# lists, written as three CSV rows) — confirm against where adj is built.
with open(out_csv_path, 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerows(adj)

### In addition
leng = len(utgs)
# sparse weighted adjacency from the (data, (row, col)) triplets
network = sp.sparse.csr_matrix((adj[2], (adj[0], adj[1])),
                               shape=(leng, leng))
#print(network.get_shape())

# modularity opt for community detection
logger.info('Louvain alg with optimization level = ' + str(opt_par))
opt_lev = opt_par  # 0.001
# same tolerance used for both the aggregation and optimization phases
louvain = Louvain(random_state=0,
                  tol_aggregation=opt_lev,
                  tol_optimization=opt_lev)
out = louvain.fit_transform(network)
clusters, n_out = cu.get_clusters(out)
n_groups = len(clusters)
logger.info('Number of clusters: ' + str(n_groups))

### REPRESENTATIVES CHOICE ###
# evaluating the (weighted) degree of each utg: each edge contributes
# its weight to both endpoints
deg = np.zeros(leng, dtype=np.uint32)
for i in range(len(adj[0])):
    deg[adj[0][i]] += adj[2][i]
    deg[adj[1][i]] += adj[2][i]

# create representatives based on deg
#max_length_grouped = 10000