def remove_self_loops(sparse_graph: 'gust.SparseGraph') -> 'gust.SparseGraph':
    """Return the graph without self loops.

    Self loops are the nonzero diagonal entries of the adjacency matrix.
    When the graph has none, the input object itself is returned.
    Otherwise a new SparseGraph is built with a fresh adjacency matrix and
    filtered edge attributes; all other components are passed through from
    the input graph.
    """
    diagonal = sparse_graph.adj_matrix.diagonal()
    num_self_loops = (~np.isclose(diagonal, 0)).sum()
    if num_self_loops <= 0:
        return sparse_graph

    # Work on a LIL copy so that clearing the diagonal leaves the input
    # graph's adjacency matrix untouched.
    loop_free_adj = sparse_graph.adj_matrix.copy().tolil()
    loop_free_adj.setdiag(0)
    loop_free_adj = loop_free_adj.tocsr()

    edge_attr_matrix = None
    if sparse_graph.edge_attr_matrix is not None:
        # Keep only the attribute rows of edges whose endpoints differ.
        endpoints = sparse_graph.get_edgeid_to_idx_array()
        is_not_loop = endpoints[:, 0] != endpoints[:, 1]
        surviving_edges = np.flatnonzero(is_not_loop)
        edge_attr_matrix = sparse_graph._edge_attr_matrix[surviving_edges]

    warnings.warn("{0} self loops removed".format(num_self_loops))

    return gust.SparseGraph(
        loop_free_adj,
        sparse_graph.attr_matrix,
        edge_attr_matrix,
        sparse_graph.labels,
        sparse_graph.node_names,
        sparse_graph.attr_names,
        sparse_graph.edge_attr_names,
        sparse_graph.class_names,
        sparse_graph.metadata)
def test_load_dataset_pkl():
    """A pickled graph can be loaded with or without the '.pkl' extension.

    The temporary pickle is removed in a ``finally`` clause so that a
    failing assertion does not leave ``test.pkl`` behind for other tests.
    """
    spgraph = gust.SparseGraph(
        sp.csr_matrix(np.arange(16).reshape(4, 4) > 3, dtype=np.float32))
    pkl_path = test_path / "test.pkl"
    try:
        gust.io.save_to_pickle(pkl_path, spgraph)

        # Load by bare dataset name.
        spgraph2 = gust.load_dataset("test", test_path)
        assert (spgraph.adj_matrix - spgraph2.adj_matrix).nnz == 0

        # Load by explicit file name.
        spgraph2 = gust.load_dataset("test.pkl", test_path)
        assert (spgraph.adj_matrix - spgraph2.adj_matrix).nnz == 0
    finally:
        # The file may be missing if saving itself failed.
        if os.path.exists(pkl_path):
            os.remove(pkl_path)
def test_create_subgraph(self):
    """create_subgraph returns the expected adjacency for nodes 0, 2, 3."""
    original = gust.SparseGraph(self.A.copy())
    subgraph = gust.create_subgraph(original, nodes_to_keep=[0, 2, 3])

    # The input graph must not have been modified in place.
    assert np.allclose(self.A.A, original.adj_matrix.A)

    expected_adj = sp.csr_matrix(
        np.array([[1., 0.5, 0.],
                  [0.5, 1., 0.],
                  [0., 1., 0.]]))
    assert np.allclose(expected_adj.A, subgraph.adj_matrix.A)
def test_remove_self_loops(self):
    """remove_self_loops yields a zero diagonal and leaves its input intact."""
    original = gust.SparseGraph(self.A.copy())
    without_loops = gust.remove_self_loops(original)

    # The input graph must not have been modified in place.
    assert np.allclose(self.A.A, original.adj_matrix.A)

    expected_adj = sp.csr_matrix(
        np.array([[0., 0., 0.5, 0., 0.],
                  [0., 0., 1., 0., 1.],
                  [0.5, 0., 0., 0., 0.],
                  [0., 0., 1., 0., 2.],
                  [0., 1., 0., 0., 0.]]))
    assert np.allclose(expected_adj.A, without_loops.adj_matrix.A)
def test_largest_connected_components():
    """largest_connected_components keeps the first three nodes of the fixture."""
    full_adj = sp.csr_matrix(
        np.array([[1., 0., 0.5, 0., 0.],
                  [0., 1., 1., 0., 0.],
                  [0.5, 0., 1., 0., 0.],
                  [0., 0., 0., 0., 2.],
                  [0., 0., 0., 0., 0.]]))
    original = gust.SparseGraph(full_adj.copy())
    largest = gust.largest_connected_components(original)

    # The input graph must not have been modified in place.
    assert np.allclose(full_adj.A, original.adj_matrix.A)

    expected_adj = sp.csr_matrix(
        np.array([[1., 0., 0.5],
                  [0., 1., 1.],
                  [0.5, 0., 1.]]))
    assert np.allclose(expected_adj.A, largest.adj_matrix.A)
def test_load_dataset_multiple():
    """Loading by bare name warns when several files match that name.

    Both temporary files are removed in a ``finally`` clause so that a
    failing assertion does not leave them behind for other tests.
    """
    spgraph = gust.SparseGraph(
        sp.csr_matrix(np.arange(16).reshape(4, 4) > 3, dtype=np.float32))
    pkl_path = test_path / "test.pkl"
    gz_path = test_path / "test.pkl.gz"
    try:
        gust.io.save_to_pickle(pkl_path, spgraph)
        gust.io.save_to_pickle(gz_path, spgraph, compression=True)

        with pytest.warns(UserWarning):
            spgraph2 = gust.load_dataset("test", test_path)
        assert (spgraph.adj_matrix - spgraph2.adj_matrix).nnz == 0
    finally:
        # Either file may be missing if saving itself failed.
        for path in (pkl_path, gz_path):
            if os.path.exists(path):
                os.remove(path)
def test_create_subgraph_edgeattrs(self):
    """create_subgraph keeps the edge attributes of the surviving edges."""
    edge_attrs = np.arange(10)
    original = gust.SparseGraph(self.A, edge_attr_matrix=edge_attrs)
    subgraph = gust.create_subgraph(original, nodes_to_keep=[0, 2, 3])

    # The input graph's edge attributes must not have been modified in place.
    assert np.allclose(original.edge_attr_matrix, edge_attrs)

    expected_adj = sp.csr_matrix(
        np.array([[1., 0.5, 0.],
                  [0.5, 1., 0.],
                  [0., 1., 0.]]))
    expected_edge_attrs = np.array([0, 1, 5, 6, 7])
    assert np.allclose(expected_adj.A, subgraph.adj_matrix.A)
    assert np.allclose(subgraph.edge_attr_matrix, expected_edge_attrs)
def test_remove_self_loops_edgeattrs(self):
    """remove_self_loops drops the edge attributes of the removed loops."""
    edge_attrs = np.arange(10)
    original = gust.SparseGraph(self.A, edge_attr_matrix=edge_attrs)
    without_loops = gust.remove_self_loops(original)

    # The input graph's edge attributes must not have been modified in place.
    assert np.allclose(original.edge_attr_matrix, edge_attrs)

    expected_adj = sp.csr_matrix(
        np.array([[0., 0., 0.5, 0., 0.],
                  [0., 0., 1., 0., 1.],
                  [0.5, 0., 0., 0., 0.],
                  [0., 0., 1., 0., 2.],
                  [0., 1., 0., 0., 0.]]))
    expected_edge_attrs = np.array([1, 3, 4, 5, 7, 8, 9])
    assert np.allclose(expected_adj.A, without_loops.adj_matrix.A)
    assert np.allclose(without_loops.edge_attr_matrix, expected_edge_attrs)
def test_largest_connected_components_edgeattrs():
    """largest_connected_components keeps the matching edge attributes."""
    full_adj = sp.csr_matrix(
        np.array([[1., 0., 0.5, 0., 0.],
                  [0., 1., 0., 0.5, 0.],
                  [0., 0., 1., 0., 0.],
                  [0., 0., 0., 0., 2.],
                  [0., 0., 0., 0., 0.]]))
    edge_attrs = np.arange(6)
    original = gust.SparseGraph(full_adj, edge_attr_matrix=edge_attrs)
    largest = gust.largest_connected_components(original)

    # The input graph's edge attributes must not have been modified in place.
    assert np.allclose(original.edge_attr_matrix, edge_attrs)

    expected_adj = sp.csr_matrix(
        np.array([[1., 0.5, 0.],
                  [0., 0., 2.],
                  [0., 0., 0.]]))
    expected_edge_attrs = np.array([2, 3, 5])
    assert np.allclose(expected_adj.A, largest.adj_matrix.A)
    assert np.allclose(largest.edge_attr_matrix, expected_edge_attrs)
def pca_on_attributes(sparse_graph: 'gust.SparseGraph',
                      n_components: Union[int, float]) -> 'gust.SparseGraph':
    """Perform PCA on attributes.

    If the attribute matrix is sparse, it is converted to dense and a
    warning is raised.

    Parameters
    ----------
    sparse_graph
        Input graph.
    n_components
        If int, number of components to keep.
        If float, fraction of variance to preserve.

    Returns
    -------
    gust.SparseGraph
        Graph with converted attributes. The attribute names are dropped
        (passed as None); all other components are taken over from the
        input graph.

    Raises
    ------
    ValueError
        If the given graph has no attribute matrix.

    """
    if sparse_graph.attr_matrix is None:
        raise ValueError("The given SparseGraph is not attributed.")

    if sp.issparse(sparse_graph.attr_matrix):
        warnings.warn(
            "Attribute matrix is converted to dense when performing PCA")
        # toarray() gives a plain ndarray; todense() would give np.matrix,
        # which scikit-learn estimators do not accept.
        attr_matrix = sparse_graph.attr_matrix.toarray()
    else:
        attr_matrix = sparse_graph.attr_matrix

    pca = PCA(n_components=n_components)
    attr_matrix = pca.fit_transform(attr_matrix)

    return gust.SparseGraph(sparse_graph.adj_matrix,
                            attr_matrix,
                            sparse_graph.edge_attr_matrix,
                            sparse_graph.labels,
                            sparse_graph.node_names,
                            None,
                            sparse_graph.edge_attr_names,
                            sparse_graph.class_names,
                            sparse_graph.metadata)
# Drop nodes without any edge weight: a node is kept when the sum of its
# column sum and row sum in A is positive.
# NOTE(review): `.A1` assumes A is a scipy sparse matrix (whose sums are
# np.matrix) -- confirm against the code that builds A.
not_isolated = ((A.sum(0).A1 + A.sum(1).A1) > 0)
A = A[not_isolated][:, not_isolated]
# Apply the same boolean mask to the per-node inputs X and z.
X = X[not_isolated]
z = z[not_isolated]
nodes_to_keep = np.where(not_isolated)[0]

# Rebuild the index -> node map so that its keys are the new consecutive
# indices 0..len(nodes_to_keep)-1 of the surviving nodes.
new_idx = 0
new_i2n = {}
for old_idx in nodes_to_keep:
    new_i2n[new_idx] = i2n[old_idx]
    new_idx += 1
i2n = new_i2n

# Vectorize features
if attr_type == 'tfidf':
    vectorizer = TfidfVectorizer(min_df=10, stop_words='english')
elif attr_type == 'binary':
    vectorizer = CountVectorizer(min_df=10, stop_words='english',
                                 binary=True)
elif attr_type == 'count':
    vectorizer = CountVectorizer(min_df=10, stop_words='english',
                                 binary=False)
else:
    raise ValueError("Unknown attr_type.")
X_vec = vectorizer.fit_transform(X)

# Invert the vectorizer vocabulary (term -> column) into column -> term.
i2a = {v: k for (k, v) in vectorizer.vocabulary_.items()}
# Class index -> class name.
i2c = {0: 'Databases',
       1: 'Artificial_Intelligence',
       2: 'Computer_Vision',
       3: 'Data_Mining'}

# NOTE(review): the idx_to_* keyword names differ from the node_names /
# attr_names / class_names keywords used by other SparseGraph callers --
# verify against the SparseGraph constructor this script targets.
G = gust.SparseGraph(A, attr_matrix=X_vec, labels=z, idx_to_node=i2n,
                     idx_to_attr=i2a, idx_to_class=i2c)
gust.io.save_to_npz('dblp', G)
def test_sparsegraph_to_from_networkx(self, sparse):
    """A SparseGraph survives a round trip through NetworkX.

    The round trip is checked for the fixture adjacency ``self.A`` and for
    a second adjacency defined here (``A_sym``), each with its own edge
    attributes. ``sparse`` selects whether the converted graph stores node
    and edge attributes as sparse matrices. Attribute columns and class
    indices may come back in a different order, so they are matched by
    name.
    """
    # Set up original graph
    node_attrs = sp.csr_matrix(
        np.array([[0., 3., 2.],
                  [0., 0., 4.],
                  [1., 1., 0.],
                  [0., 0., 1.],
                  [0., 2., 0.]]))
    attr_names = np.array(['a', 'b', 'c'])
    edge_attrs = np.array([[0, 1.], [1, 0.], [2, 1.], [3, 0.], [1, 0.],
                           [5, 4.], [6, 0.], [7, 3.], [8, 2.], [9, 0.3]])
    edge_attr_names = np.array(['ae', 'be'])
    labels = np.array([0, 1, 1, 0, 2])
    class_names = np.array(['in', 'between', 'out'])
    A_sym = sp.csr_matrix(
        np.array([[1., 0., 0.5, 0., 0.],
                  [0., 1., 1., 0., 1.],
                  [0.5, 1., 0., 0., 0.],
                  [0., 0., 0., 0., 2.],
                  [0., 1., 0., 2., 0.]]))
    edge_attrs_sym = np.array([[0, 1.], [1, 0.], [2, 1.], [3, 0.], [8, 2.],
                               [1, 0.], [3, 0.], [7, 3.], [8, 2.], [7, 3.]])

    for A, edge_a in ((self.A, edge_attrs), (A_sym, edge_attrs_sym)):
        spA = gust.SparseGraph(A,
                               attr_matrix=node_attrs,
                               edge_attr_matrix=edge_a,
                               attr_names=attr_names,
                               edge_attr_names=edge_attr_names,
                               labels=labels,
                               class_names=class_names)

        # Convert to NetworkX and back
        nx_graph = gust.sparsegraph_to_networkx(spA)
        spB = gust.networkx_to_sparsegraph(nx_graph,
                                           label_name='label',
                                           sparse_node_attrs=sparse,
                                           sparse_edge_attrs=sparse)

        # Check adjacency matrix
        assert np.allclose(spA.adj_matrix.A, spB.adj_matrix.A)

        # Check node attributes
        assert len(spA.attr_names) == len(spB.attr_names)
        for iold, attr in enumerate(spA.attr_names):
            assert len(np.where(spB.attr_names == attr)[0]) == 1
            inew = np.where(spB.attr_names == attr)[0][0]
            if sparse:
                assert (spA.attr_matrix[:, iold]
                        != spB.attr_matrix[:, inew]).nnz == 0
            else:
                assert np.allclose(spA.attr_matrix.A[:, iold],
                                   spB.attr_matrix[:, inew])

        # Check edge attributes
        assert len(spA.edge_attr_names) == len(spB.edge_attr_names)
        for iold, attr in enumerate(spA.edge_attr_names):
            assert len(np.where(spB.edge_attr_names == attr)[0]) == 1
            inew = np.where(spB.edge_attr_names == attr)[0][0]
            if sparse:
                assert np.allclose(spA.edge_attr_matrix[:, iold],
                                   spB.edge_attr_matrix.A[:, inew])
            else:
                assert np.allclose(spA.edge_attr_matrix[:, iold],
                                   spB.edge_attr_matrix[:, inew])

        # Check labels and class names
        assert len(spA.class_names) == len(spB.class_names)
        class_mapping = {}
        for iold, label in enumerate(spA.class_names):
            assert len(np.where(spB.class_names == label)[0]) == 1
            class_mapping[iold] = np.where(spB.class_names == label)[0][0]
        assert len(spA.labels) == len(spB.labels)
        # This comparison must be asserted: a bare all(...) expression is
        # evaluated and discarded, so the labels would never be checked.
        assert all(class_mapping[old_label] == spB.labels[inode]
                   for inode, old_label in enumerate(spA.labels))
def test_sparsegraph_to_from_networkx_simple(self):
    """Round-tripping through NetworkX preserves the adjacency matrix."""
    original = gust.SparseGraph(self.A)
    as_networkx = gust.sparsegraph_to_networkx(original)
    restored = gust.networkx_to_sparsegraph(as_networkx)
    assert np.allclose(original.adj_matrix.A, restored.adj_matrix.A)
def networkx_to_sparsegraph(
        nx_graph: Union[nx.Graph, nx.DiGraph],
        label_name: str = None,
        sparse_node_attrs: bool = True,
        sparse_edge_attrs: bool = True) -> 'gust.SparseGraph':
    """Convert NetworkX graph to gust SparseGraph.

    Node and edge attributes need to be numeric. Missing entries are
    interpreted as 0. Labels can be any object. If non-numeric they are
    interpreted as categorical and enumerated.

    If all (node or edge) attribute names are integers, they are used
    directly as column indices of the respective attribute matrix and no
    attribute names are stored. Otherwise each name is assigned its own
    column and the names are returned. The edge attribute 'weight' is not
    treated as an edge attribute.

    Parameters
    ----------
    nx_graph
        Graph to convert.
    label_name
        Name of the node attribute that holds the labels.
        If None, no labels are extracted.
    sparse_node_attrs
        If True, the node attribute matrix is returned as a sparse CSR
        matrix, otherwise as a dense array.
    sparse_edge_attrs
        If True, the edge attribute matrix is returned as a sparse CSR
        matrix, otherwise as a dense array.

    Returns
    -------
    gust.SparseGraph
        Converted graph.

    Raises
    ------
    ValueError
        If `label_name` is given but no node has such an attribute, or if
        a node or edge attribute value is not a number.

    """
    # Extract node names
    int_names = all(isinstance(node, int) for node in nx_graph.nodes)
    node_names = None if int_names else np.array(nx_graph.nodes)

    nx_graph = nx.convert_node_labels_to_integers(nx_graph)
    num_nodes = nx_graph.number_of_nodes()

    # Extract adjacency matrix
    adj_matrix = nx.adjacency_matrix(nx_graph)

    # Collect all node attribute names
    attrs = set()
    for _, node_data in nx_graph.nodes().data():
        attrs.update(node_data.keys())

    # Initialize labels and remove them from the attribute names
    if label_name is None:
        labels = None
    else:
        if label_name not in attrs:
            raise ValueError(
                "No attribute with label name '{}' found.".format(label_name))
        attrs.remove(label_name)
        labels = [0 for _ in range(num_nodes)]

    attr_matrix = None
    attr_names = None
    attr_mapping = None
    if len(attrs) > 0:
        if all(isinstance(attr, int) for attr in attrs):
            # Integer names index the columns directly, so the matrix has
            # to be wide enough for the largest one.
            num_attrs = max(attrs) + 1
        else:
            # Build the mapping from the original keys; the numpy array of
            # names may coerce them (e.g. mixed int/str keys become str).
            attr_list = list(attrs)
            attr_names = np.array(attr_list)
            attr_mapping = {k: i for i, k in enumerate(attr_list)}
            num_attrs = len(attr_list)

        # Initialize attribute matrix
        if sparse_node_attrs:
            attr_matrix = sp.lil_matrix((num_nodes, num_attrs),
                                        dtype=np.float32)
        else:
            attr_matrix = np.zeros((num_nodes, num_attrs), dtype=np.float32)

    # Fill label and attribute matrices
    for inode, node_attrs in nx_graph.nodes.data():
        for key, val in node_attrs.items():
            if key == label_name:
                labels[inode] = val
                continue
            if not isinstance(val, Number):
                if node_names is None:
                    raise ValueError(
                        "Node {} has attribute '{}' with value '{}', which is not a number."
                        .format(inode, key, val))
                else:
                    raise ValueError(
                        "Node '{}' has attribute '{}' with value '{}', which is not a number."
                        .format(node_names[inode], key, val))
            if attr_mapping is None:
                attr_matrix[inode, key] = val
            else:
                attr_matrix[inode, attr_mapping[key]] = val
    if attr_matrix is not None and sparse_node_attrs:
        attr_matrix = attr_matrix.tocsr()

    # Convert labels to integers
    if labels is None:
        class_names = None
    else:
        try:
            labels = np.array(labels, dtype=np.float32)
            class_names = None
        except ValueError:
            # Non-numeric labels: enumerate the distinct values.
            class_names = np.unique(labels)
            class_mapping = {k: i for i, k in enumerate(class_names)}
            labels_int = np.empty(num_nodes, dtype=np.float32)
            for inode, label in enumerate(labels):
                labels_int[inode] = class_mapping[label]
            labels = labels_int

    # Collect all edge attribute names
    edge_attrs = set()
    for _, _, edge_data in nx_graph.edges().data():
        edge_attrs.update(edge_data.keys())
    edge_attrs.discard('weight')

    edge_attr_matrix = None
    edge_attr_names = None
    edge_attr_mapping = None
    if len(edge_attrs) > 0:
        if all(isinstance(attr, int) for attr in edge_attrs):
            # Integer names index the columns directly (see node attributes).
            num_edge_attrs = max(edge_attrs) + 1
        else:
            edge_attr_list = list(edge_attrs)
            edge_attr_names = np.array(edge_attr_list)
            edge_attr_mapping = {k: i for i, k in enumerate(edge_attr_list)}
            num_edge_attrs = len(edge_attr_list)

        # Initialize edge attribute matrix
        if sparse_edge_attrs:
            edge_attr_matrix = sp.lil_matrix(
                (adj_matrix.nnz, num_edge_attrs), dtype=np.float32)
        else:
            edge_attr_matrix = np.zeros((adj_matrix.nnz, num_edge_attrs),
                                        dtype=np.float32)

    # Fill edge attribute matrix
    if edge_attr_matrix is not None:
        # Same sparsity pattern as the adjacency matrix, but entry (i, j)
        # holds the position of that edge among the stored entries, which
        # is its row in the edge attribute matrix.
        edgeid_mat = sp.csr_matrix(
            (np.arange(adj_matrix.nnz), adj_matrix.indices,
             adj_matrix.indptr),
            shape=adj_matrix.shape)
        is_directed = nx_graph.is_directed()
        for i, j, edge_data in nx_graph.edges.data():
            for key, val in edge_data.items():
                if key == 'weight':
                    continue
                if not isinstance(val, Number):
                    if node_names is None:
                        raise ValueError(
                            "Edge {}->{} has attribute '{}' with value '{}', which is not a number."
                            .format(i, j, key, val))
                    else:
                        raise ValueError(
                            "Edge '{}'->'{}' has attribute '{}' with value '{}', which is not a number."
                            .format(node_names[i], node_names[j], key, val))
                # The edge mapping decides the column, not the node mapping.
                if edge_attr_mapping is None:
                    new_key = key
                else:
                    new_key = edge_attr_mapping[key]
                edge_attr_matrix[edgeid_mat[i, j], new_key] = val
                if not is_directed:
                    # Undirected edges are stored in both directions.
                    edge_attr_matrix[edgeid_mat[j, i], new_key] = val
        if sparse_edge_attrs:
            edge_attr_matrix = edge_attr_matrix.tocsr()

    return gust.SparseGraph(adj_matrix=adj_matrix,
                            attr_matrix=attr_matrix,
                            edge_attr_matrix=edge_attr_matrix,
                            labels=labels,
                            node_names=node_names,
                            attr_names=attr_names,
                            edge_attr_names=edge_attr_names,
                            class_names=class_names,
                            metadata=None)
def create_subgraph(sparse_graph: 'gust.SparseGraph',
                    _sentinel: None = None,
                    nodes_to_remove: np.ndarray = None,
                    nodes_to_keep: np.ndarray = None) -> 'gust.SparseGraph':
    """Create a graph with the specified subset of nodes.

    Exactly one of (nodes_to_remove, nodes_to_keep) should be provided,
    while the other stays None. Note that to avoid confusion, it is
    required to pass node indices as named arguments to this function.

    The subgraph partially points to the old graph's data.

    Parameters
    ----------
    sparse_graph
        Input graph.
    _sentinel
        Internal, to prevent passing positional arguments. Do not use.
    nodes_to_remove
        Indices of nodes that have to be removed.
    nodes_to_keep
        Indices of nodes that have to be kept.

    Returns
    -------
    gust.SparseGraph
        Graph with specified nodes removed. The kept nodes are ordered by
        ascending index.

    Raises
    ------
    ValueError
        If a positional argument is passed after `sparse_graph`, or if not
        exactly one of `nodes_to_remove` and `nodes_to_keep` is provided.

    """
    # Check that arguments are passed correctly
    if _sentinel is not None:
        raise ValueError("Only call `create_subgraph` with named arguments',"
                         " (nodes_to_remove=...) or (nodes_to_keep=...).")
    if nodes_to_remove is None and nodes_to_keep is None:
        raise ValueError(
            "Either nodes_to_remove or nodes_to_keep must be provided.")
    if nodes_to_remove is not None and nodes_to_keep is not None:
        raise ValueError(
            "Only one of nodes_to_remove or nodes_to_keep must be provided.")

    if nodes_to_remove is not None:
        # Set membership keeps this linear in the number of nodes instead
        # of scanning nodes_to_remove once per node.
        removed = set(np.asarray(nodes_to_remove).ravel().tolist())
        nodes_to_keep = [
            i for i in range(sparse_graph.num_nodes()) if i not in removed
        ]
    else:
        nodes_to_keep = sorted(nodes_to_keep)

    adj_matrix = sparse_graph.adj_matrix[nodes_to_keep][:, nodes_to_keep]

    if sparse_graph.attr_matrix is None:
        attr_matrix = None
    else:
        attr_matrix = sparse_graph.attr_matrix[nodes_to_keep]

    if sparse_graph.edge_attr_matrix is None:
        edge_attr_matrix = None
    else:
        # An edge survives only if both of its endpoints are kept.
        old_idx = sparse_graph.get_edgeid_to_idx_array()
        keep_edge_idx = np.where(
            np.all(np.isin(old_idx, nodes_to_keep), axis=1))[0]
        edge_attr_matrix = sparse_graph.edge_attr_matrix[keep_edge_idx]

    if sparse_graph.labels is None:
        labels = None
    else:
        labels = sparse_graph.labels[nodes_to_keep]

    if sparse_graph.node_names is None:
        node_names = None
    else:
        node_names = sparse_graph.node_names[nodes_to_keep]

    # TODO: add warnings / logging
    # print("Resulting subgraph with N = {0}, E = {1}"
    #       .format(sparse_graph.num_nodes(), sparse_graph.num_edges()))
    return gust.SparseGraph(adj_matrix,
                            attr_matrix,
                            edge_attr_matrix,
                            labels,
                            node_names,
                            sparse_graph.attr_names,
                            sparse_graph.edge_attr_names,
                            sparse_graph.class_names,
                            sparse_graph.metadata)
# Restrict the adjacency matrix and the per-node inputs X and z to the
# nodes selected by keep_mask (keep_mask is defined before this fragment).
A = A[keep_mask][:, keep_mask]
X = X[keep_mask]
z = z[keep_mask]

# convert i2n map
# Keys become the new consecutive indices 0..len(nodes_to_keep)-1.
# NOTE(review): presumably nodes_to_keep lists the indices selected by
# keep_mask -- confirm against the code above this fragment.
new_idx = 0
new_i2n = {}
for old_idx in nodes_to_keep:
    new_i2n[new_idx] = i2n[old_idx]
    new_idx += 1
i2n = new_i2n

# Rename class labels
# The sorted distinct labels are mapped to 0..k-1.
remap_z = dict(zip(np.unique(z), range(len(np.unique(z)))))
z = np.vectorize(remap_z.get)(z)

# Vectorize attributes
# Note that since we use min_df=10 you will get different results
# if you perform vectorization before or after filtering nodes from the network (e.g. singletons)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=10, stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# Invert the vectorizer vocabulary (term -> column) into column -> term.
i2a = {v: k for (k, v) in tfidf.vocabulary_.items()}

# NOTE(review): the idx_to_* keyword names differ from the node_names /
# attr_names keywords used by other SparseGraph callers -- verify against
# the SparseGraph constructor this script targets.
G = gust.SparseGraph(A, attr_matrix=X_tfidf, labels=z, idx_to_node=i2n,
                     idx_to_attr=i2a)
gust.io.save_to_npz('citeseer_m10', G)