예제 #1
0
def remove_self_loops(sparse_graph: 'gust.SparseGraph') -> 'gust.SparseGraph':
    """Return a graph without self loops (non-zero adjacency diagonal entries).

    When the graph has no self loops the input object itself is returned.
    Otherwise a new SparseGraph is built from a modified copy of the
    adjacency matrix (and filtered edge attributes, if any); all remaining
    fields are passed through from the input graph unchanged. A warning
    reporting the number of removed self loops is emitted in that case.

    """
    on_diagonal = np.logical_not(
        np.isclose(sparse_graph.adj_matrix.diagonal(), 0))
    loop_count = on_diagonal.sum()
    if loop_count == 0:
        return sparse_graph

    # Work on a LIL copy so the input adjacency matrix is left untouched.
    stripped = sparse_graph.adj_matrix.copy().tolil()
    stripped.setdiag(0)
    stripped = stripped.tocsr()

    kept_edge_attrs = None
    if sparse_graph.edge_attr_matrix is not None:
        # Keep only the edges whose two endpoints differ.
        endpoints = sparse_graph.get_edgeid_to_idx_array()
        non_loop_ids = np.flatnonzero(endpoints[:, 0] != endpoints[:, 1])
        kept_edge_attrs = sparse_graph._edge_attr_matrix[non_loop_ids]

    warnings.warn("{0} self loops removed".format(loop_count))
    return gust.SparseGraph(
        stripped,
        sparse_graph.attr_matrix,
        kept_edge_attrs,
        sparse_graph.labels,
        sparse_graph.node_names,
        sparse_graph.attr_names,
        sparse_graph.edge_attr_names,
        sparse_graph.class_names,
        sparse_graph.metadata)
예제 #2
0
def test_load_dataset_pkl():
    """Round-trip a small graph through a pickle file via ``gust.load_dataset``.

    The dataset is loaded twice, once by its bare name (``"test"``) and once
    with the explicit ``.pkl`` extension; both loads must reproduce the
    original adjacency matrix.
    """
    pkl_path = test_path / "test.pkl"
    spgraph = gust.SparseGraph(
        sp.csr_matrix(np.arange(16).reshape(4, 4) > 3, dtype=np.float32))
    gust.io.save_to_pickle(pkl_path, spgraph)
    try:
        for name in ("test", "test.pkl"):
            spgraph2 = gust.load_dataset(name, test_path)
            assert (spgraph.adj_matrix - spgraph2.adj_matrix).nnz == 0
    finally:
        # Remove the file even when an assertion fails so that a stale
        # pickle is not left behind in the shared test directory.
        os.remove(pkl_path)
 def test_create_subgraph(self):
     """Check that ``create_subgraph`` keeps the requested nodes without mutating its input."""
     spA = gust.SparseGraph(self.A.copy())
     keep = [0, 2, 3]
     spB = gust.create_subgraph(spA, nodes_to_keep=keep)
     # Check that changes are not done in-place.
     # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
     # SciPy releases no longer provide on sparse matrices.
     assert np.allclose(self.A.toarray(), spA.adj_matrix.toarray())
     expected = np.array([[1., 0.5, 0.], [0.5, 1., 0.], [0., 1., 0.]])
     assert np.allclose(expected, spB.adj_matrix.toarray())
 def test_remove_self_loops(self):
     """Check that ``remove_self_loops`` zeroes the diagonal without mutating its input."""
     spA = gust.SparseGraph(self.A.copy())
     spB = gust.remove_self_loops(spA)
     # Check that changes are not done in-place.
     # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
     # SciPy releases no longer provide on sparse matrices.
     assert np.allclose(self.A.toarray(), spA.adj_matrix.toarray())
     expected = np.array([[0., 0., 0.5, 0., 0.], [0., 0., 1., 0., 1.],
                          [0.5, 0., 0., 0., 0.], [0., 0., 1., 0., 2.],
                          [0., 1., 0., 0., 0.]])
     assert np.allclose(expected, spB.adj_matrix.toarray())
def test_largest_connected_components():
    """Check that ``largest_connected_components`` returns the expected subgraph.

    The input graph must not be modified by the call.
    """
    A = sp.csr_matrix(
        np.array([[1., 0., 0.5, 0., 0.], [0., 1., 1., 0., 0.],
                  [0.5, 0., 1., 0., 0.], [0., 0., 0., 0., 2.],
                  [0., 0., 0., 0., 0.]]))
    spA = gust.SparseGraph(A.copy())
    spB = gust.largest_connected_components(spA)
    # Check that changes are not done in-place.
    # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
    # SciPy releases no longer provide on sparse matrices.
    assert np.allclose(A.toarray(), spA.adj_matrix.toarray())
    expected = np.array([[1., 0., 0.5], [0., 1., 1.], [0.5, 0., 1.]])
    assert np.allclose(expected, spB.adj_matrix.toarray())
예제 #6
0
def test_load_dataset_multiple():
    """Load a dataset by bare name when both ``.pkl`` and ``.pkl.gz`` files exist.

    ``gust.load_dataset`` is expected to emit a ``UserWarning`` in that
    situation and still return a graph with the original adjacency matrix.
    """
    pkl_path = test_path / "test.pkl"
    gz_path = test_path / "test.pkl.gz"
    spgraph = gust.SparseGraph(
        sp.csr_matrix(np.arange(16).reshape(4, 4) > 3, dtype=np.float32))
    gust.io.save_to_pickle(pkl_path, spgraph)
    gust.io.save_to_pickle(gz_path, spgraph, compression=True)
    try:
        with pytest.warns(UserWarning):
            spgraph2 = gust.load_dataset("test", test_path)
        assert (spgraph.adj_matrix - spgraph2.adj_matrix).nnz == 0
    finally:
        # Remove both files even when the test fails so that stale files
        # are not left behind in the shared test directory.
        os.remove(pkl_path)
        os.remove(gz_path)
 def test_create_subgraph_edgeattrs(self):
     """Check that ``create_subgraph`` filters edge attributes together with the edges."""
     edge_attrs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
     spA = gust.SparseGraph(self.A, edge_attr_matrix=edge_attrs)
     keep = [0, 2, 3]
     spB = gust.create_subgraph(spA, nodes_to_keep=keep)
     # Check that changes are not done in-place
     assert np.allclose(spA.edge_attr_matrix, edge_attrs)
     expected_adj = np.array([[1., 0.5, 0.], [0.5, 1., 0.], [0., 1., 0.]])
     expected_edge_attrs = np.array([0, 1, 5, 6, 7])
     # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
     # SciPy releases no longer provide on sparse matrices.
     assert np.allclose(expected_adj, spB.adj_matrix.toarray())
     assert np.allclose(spB.edge_attr_matrix, expected_edge_attrs)
 def test_remove_self_loops_edgeattrs(self):
     """Check that ``remove_self_loops`` drops the edge attributes of removed self loops."""
     edge_attrs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
     spA = gust.SparseGraph(self.A, edge_attr_matrix=edge_attrs)
     spB = gust.remove_self_loops(spA)
     # Check that changes are not done in-place
     assert np.allclose(spA.edge_attr_matrix, edge_attrs)
     expected_adj = np.array([[0., 0., 0.5, 0., 0.], [0., 0., 1., 0., 1.],
                              [0.5, 0., 0., 0., 0.], [0., 0., 1., 0., 2.],
                              [0., 1., 0., 0., 0.]])
     expected_edge_attrs = np.array([1, 3, 4, 5, 7, 8, 9])
     # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
     # SciPy releases no longer provide on sparse matrices.
     assert np.allclose(expected_adj, spB.adj_matrix.toarray())
     assert np.allclose(spB.edge_attr_matrix, expected_edge_attrs)
def test_largest_connected_components_edgeattrs():
    """Check that ``largest_connected_components`` keeps matching edge attributes.

    The input graph's edge attributes must not be modified by the call.
    """
    A = sp.csr_matrix(
        np.array([[1., 0., 0.5, 0., 0.], [0., 1., 0., 0.5, 0.],
                  [0., 0., 1., 0., 0.], [0., 0., 0., 0., 2.],
                  [0., 0., 0., 0., 0.]]))
    edge_attrs = np.array([0, 1, 2, 3, 4, 5])
    spA = gust.SparseGraph(A, edge_attr_matrix=edge_attrs)
    spB = gust.largest_connected_components(spA)
    # Check that changes are not done in-place
    assert np.allclose(spA.edge_attr_matrix, edge_attrs)
    expected_adj = np.array([[1., 0.5, 0.], [0., 0., 2.], [0., 0., 0.]])
    expected_edge_attrs = np.array([2, 3, 5])
    # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
    # SciPy releases no longer provide on sparse matrices.
    assert np.allclose(expected_adj, spB.adj_matrix.toarray())
    assert np.allclose(spB.edge_attr_matrix, expected_edge_attrs)
예제 #10
0
def pca_on_attributes(sparse_graph: 'gust.SparseGraph',
                      n_components: Union[int, float]) -> 'gust.SparseGraph':
    """Perform PCA on attributes.

    If the attribute matrix is sparse, it is converted to dense and a warning is raised.

    Parameters
    ----------
    sparse_graph
        Input graph.
    n_components
        If int, number of components to keep.
        If float, fraction of variance to preserve.

    Returns
    -------
    gust.SparseGraph
        Graph with converted attributes. The attribute names are dropped
        (set to None); every other field is passed through from the input.

    Raises
    ------
    ValueError
        If the graph has no attribute matrix.

    """
    if sparse_graph.attr_matrix is None:
        raise ValueError("The given SparseGraph is not attributed.")

    if sp.issparse(sparse_graph.attr_matrix):
        warnings.warn(
            "Attribute matrix is converted to dense when performing PCA")
        # toarray() yields a plain ndarray; todense() would yield an
        # np.matrix, which recent scikit-learn versions refuse as input.
        attr_matrix = sparse_graph.attr_matrix.toarray()
    else:
        attr_matrix = sparse_graph.attr_matrix

    pca = PCA(n_components=n_components)
    attr_matrix = pca.fit_transform(attr_matrix)
    # The principal components no longer correspond to the original
    # attributes, so no attribute names are passed on.
    return gust.SparseGraph(sparse_graph.adj_matrix, attr_matrix,
                            sparse_graph.edge_attr_matrix, sparse_graph.labels,
                            sparse_graph.node_names, None,
                            sparse_graph.edge_attr_names,
                            sparse_graph.class_names, sparse_graph.metadata)
예제 #11
0
# Mask of nodes to keep: a node survives when its column sum plus its row
# sum in A is positive (assumes edge weights are non-negative - TODO confirm).
not_isolated = ((A.sum(0).A1 + A.sum(1).A1) > 0)

# Apply the same node filter to the adjacency matrix, the inputs to the
# vectorizer (X) and the labels (z).
A = A[not_isolated][:, not_isolated]
X = X[not_isolated]
z = z[not_isolated]
nodes_to_keep = np.where(not_isolated)[0]
# Rebuild the index -> node map so that the surviving nodes are numbered
# consecutively, in the order of their old indices.
new_idx = 0
new_i2n = {}
for old_idx in nodes_to_keep:
    new_i2n[new_idx] = i2n[old_idx]
    new_idx += 1
i2n = new_i2n

# Vectorize features
# attr_type selects the vectorizer: TF-IDF weights, binary indicators or raw counts.
if attr_type == 'tfidf':
    vectorizer = TfidfVectorizer(min_df=10, stop_words='english')
elif attr_type == 'binary':
    vectorizer = CountVectorizer(min_df=10, stop_words='english', binary=True)
elif attr_type == 'count':
    vectorizer = CountVectorizer(min_df=10, stop_words='english', binary=False)
else:
    raise ValueError("Unknown attr_type.")

X_vec = vectorizer.fit_transform(X)
# Invert the vectorizer's vocabulary mapping to get index -> attribute.
i2a = {v: k for (k, v) in vectorizer.vocabulary_.items()}
# Hard-coded class index -> class name map.
i2c = {0: 'Databases', 1: 'Artificial_Intelligence', 2: 'Computer_Vision', 3: 'Data_Mining'}

# Assemble the graph and save it under the name 'dblp'.
G = gust.SparseGraph(A, attr_matrix=X_vec, labels=z,
                     idx_to_node=i2n, idx_to_attr=i2a, idx_to_class=i2c)
gust.io.save_to_npz('dblp', G)
    def test_sparsegraph_to_from_networkx(self, sparse):
        """Round-trip an attributed graph through NetworkX and compare all fields.

        Two graphs are tested: ``self.A`` and a second, symmetric adjacency
        matrix, each with its own edge attributes. ``sparse`` is forwarded as
        both ``sparse_node_attrs`` and ``sparse_edge_attrs`` of
        ``networkx_to_sparsegraph``. Attribute columns and class names may
        come back in a different order, so they are matched by name.
        """

        # Set up original graph
        node_attrs = sp.csr_matrix(
            np.array([[0., 3., 2.], [0., 0., 4.], [1., 1., 0.], [0., 0., 1.],
                      [0., 2., 0.]]))
        attr_names = np.array(['a', 'b', 'c'])
        edge_attrs = np.array([[0, 1.], [1, 0.], [2, 1.], [3, 0.], [1, 0.],
                               [5, 4.], [6, 0.], [7, 3.], [8, 2.], [9, 0.3]])
        edge_attr_names = np.array(['ae', 'be'])
        labels = np.array([0, 1, 1, 0, 2])
        class_names = np.array(['in', 'between', 'out'])
        A_sym = sp.csr_matrix(
            np.array([[1., 0., 0.5, 0., 0.], [0., 1., 1., 0., 1.],
                      [0.5, 1., 0., 0., 0.], [0., 0., 0., 0., 2.],
                      [0., 1., 0., 2., 0.]]))
        edge_attrs_sym = np.array([[0, 1.], [1, 0.], [2, 1.], [3, 0.], [8, 2.],
                                   [1, 0.], [3, 0.], [7, 3.], [8, 2.],
                                   [7, 3.]])
        for i in range(2):
            A = self.A if i == 0 else A_sym
            edge_a = edge_attrs if i == 0 else edge_attrs_sym
            spA = gust.SparseGraph(A,
                                   attr_matrix=node_attrs,
                                   edge_attr_matrix=edge_a,
                                   attr_names=attr_names,
                                   edge_attr_names=edge_attr_names,
                                   labels=labels,
                                   class_names=class_names)

            # Convert to NetworkX and back
            nx_graph = gust.sparsegraph_to_networkx(spA)
            spB = gust.networkx_to_sparsegraph(nx_graph,
                                               label_name='label',
                                               sparse_node_attrs=sparse,
                                               sparse_edge_attrs=sparse)

            # Check adjacency matrix (toarray() instead of the ``.A``
            # shortcut, which newer SciPy releases no longer provide)
            assert np.allclose(spA.adj_matrix.toarray(),
                               spB.adj_matrix.toarray())

            # Check node attributes
            assert len(spA.attr_names) == len(spB.attr_names)
            for iold, attr in enumerate(spA.attr_names):
                assert len(np.where(spB.attr_names == attr)[0]) == 1
                inew = np.where(spB.attr_names == attr)[0][0]
                if sparse:
                    assert (spA.attr_matrix[:, iold] !=
                            spB.attr_matrix[:, inew]).nnz == 0
                else:
                    assert np.allclose(spA.attr_matrix.toarray()[:, iold],
                                       spB.attr_matrix[:, inew])

            # Check edge attributes
            assert len(spA.edge_attr_names) == len(spB.edge_attr_names)
            for iold, attr in enumerate(spA.edge_attr_names):
                assert len(np.where(spB.edge_attr_names == attr)[0]) == 1
                inew = np.where(spB.edge_attr_names == attr)[0][0]
                if sparse:
                    assert np.allclose(
                        spA.edge_attr_matrix[:, iold],
                        spB.edge_attr_matrix.toarray()[:, inew])
                else:
                    assert np.allclose(spA.edge_attr_matrix[:, iold],
                                       spB.edge_attr_matrix[:, inew])

            # Check labels and class names
            assert len(spA.class_names) == len(spB.class_names)
            class_mapping = {}
            for iold, label in enumerate(spA.class_names):
                assert len(np.where(spB.class_names == label)[0]) == 1
                class_mapping[iold] = np.where(spB.class_names == label)[0][0]
            assert len(spA.labels) == len(spB.labels)
            # The result of all() has to be asserted; as a bare expression
            # it would be discarded and the labels never verified.
            assert all(class_mapping[old_label] == spB.labels[inode]
                       for inode, old_label in enumerate(spA.labels))
 def test_sparsegraph_to_from_networkx_simple(self):
     """Round-trip a graph without attributes through NetworkX and compare adjacency."""
     spA = gust.SparseGraph(self.A)
     nx_graph = gust.sparsegraph_to_networkx(spA)
     spB = gust.networkx_to_sparsegraph(nx_graph)
     # ``toarray()`` is used instead of the ``.A`` shortcut, which newer
     # SciPy releases no longer provide on sparse matrices.
     assert np.allclose(spA.adj_matrix.toarray(), spB.adj_matrix.toarray())
예제 #14
0
def _index_attr_names(attr_keys):
    """Return ``(names, mapping, num_columns)`` for a non-empty set of attribute keys.

    If every key is an integer, the keys are used directly as column
    indices: ``names`` and ``mapping`` are None and the column count is the
    largest key plus one. Otherwise ``names`` is an array of the keys and
    ``mapping`` maps each name to its column index.
    """
    if all(isinstance(key, int) for key in attr_keys):
        # NOTE(review): negative integer keys would index from the end of
        # the matrix; they are not handled specially here.
        return None, None, max(attr_keys) + 1
    names = np.array(list(attr_keys))
    mapping = {name: col for col, name in enumerate(names)}
    return names, mapping, len(names)


def networkx_to_sparsegraph(
        nx_graph: Union[nx.Graph, nx.DiGraph],
        label_name: str = None,
        sparse_node_attrs: bool = True,
        sparse_edge_attrs: bool = True) -> 'gust.SparseGraph':
    """Convert NetworkX graph to gust SparseGraph.

    Node and edge attributes need to be numeric.
    Missing entries are interpreted as 0.
    Labels can be any object. If non-numeric they are interpreted as
    categorical and enumerated.

    The edge attribute ``'weight'`` is not stored as an edge attribute; the
    adjacency matrix is taken from ``nx.adjacency_matrix``.

    Parameters
    ----------
    nx_graph
        Graph to convert.
    label_name
        Name of the node attribute holding the labels. If None, the
        resulting graph has no labels.
    sparse_node_attrs
        If True, the node attribute matrix is returned as a CSR matrix,
        otherwise as a dense array.
    sparse_edge_attrs
        If True, the edge attribute matrix is returned as a CSR matrix,
        otherwise as a dense array.

    Returns
    -------
    gust.SparseGraph
        Converted graph.

    Raises
    ------
    ValueError
        If ``label_name`` is given but no node carries that attribute, or
        if a node or edge attribute value is not a number.

    """
    # Extract node names
    # NOTE(review): integer node names are used directly as row indices
    # below; this presumably expects them to be 0..N-1 in iteration order.
    int_names = all(isinstance(node, int) for node in nx_graph.nodes)
    if int_names:
        node_names = None
    else:
        node_names = np.array(nx_graph.nodes)
        nx_graph = nx.convert_node_labels_to_integers(nx_graph)

    # Extract adjacency matrix
    adj_matrix = nx.adjacency_matrix(nx_graph)

    # Collect all node attribute names
    attrs = set()
    for _, node_data in nx_graph.nodes().data():
        attrs.update(node_data.keys())

    # Initialize labels and remove them from the attribute names
    if label_name is None:
        labels = None
    else:
        if label_name not in attrs:
            raise ValueError(
                "No attribute with label name '{}' found.".format(label_name))
        attrs.remove(label_name)
        labels = [0 for _ in range(nx_graph.number_of_nodes())]

    # Initialize the node attribute matrix (attribute names are only saved
    # when they are not all integers)
    attr_mapping = None
    if len(attrs) > 0:
        attr_names, attr_mapping, num_attrs = _index_attr_names(attrs)
        attr_shape = (nx_graph.number_of_nodes(), num_attrs)
        if sparse_node_attrs:
            attr_matrix = sp.lil_matrix(attr_shape, dtype=np.float32)
        else:
            attr_matrix = np.zeros(attr_shape, dtype=np.float32)
    else:
        attr_matrix = None
        attr_names = None

    # Fill label and attribute matrices
    for inode, node_attrs in nx_graph.nodes.data():
        for key, val in node_attrs.items():
            if key == label_name:
                labels[inode] = val
            else:
                if not isinstance(val, Number):
                    if node_names is None:
                        raise ValueError(
                            "Node {} has attribute '{}' with value '{}', which is not a number."
                            .format(inode, key, val))
                    else:
                        raise ValueError(
                            "Node '{}' has attribute '{}' with value '{}', which is not a number."
                            .format(node_names[inode], key, val))
                column = key if attr_mapping is None else attr_mapping[key]
                attr_matrix[inode, column] = val
    if attr_matrix is not None and sparse_node_attrs:
        attr_matrix = attr_matrix.tocsr()

    # Convert labels to integers
    if labels is None:
        class_names = None
    else:
        try:
            labels = np.array(labels, dtype=np.float32)
            class_names = None
        except ValueError:
            # Non-numeric labels: enumerate the distinct values and store
            # the index of each node's class instead.
            class_names = np.unique(labels)
            class_mapping = {k: i for i, k in enumerate(class_names)}
            labels_int = np.empty(nx_graph.number_of_nodes(), dtype=np.float32)
            for inode, label in enumerate(labels):
                labels_int[inode] = class_mapping[label]
            labels = labels_int

    # Collect all edge attribute names
    edge_attr_keys = set()
    for _, _, edge_data in nx_graph.edges().data():
        edge_attr_keys.update(edge_data.keys())
    edge_attr_keys.discard('weight')

    # Initialize the edge attribute matrix (attribute names are only saved
    # when they are not all integers)
    edge_attr_mapping = None
    if len(edge_attr_keys) > 0:
        edge_attr_names, edge_attr_mapping, num_edge_attrs = (
            _index_attr_names(edge_attr_keys))
        edge_attr_shape = (adj_matrix.nnz, num_edge_attrs)
        if sparse_edge_attrs:
            edge_attr_matrix = sp.lil_matrix(edge_attr_shape, dtype=np.float32)
        else:
            edge_attr_matrix = np.zeros(edge_attr_shape, dtype=np.float32)
    else:
        edge_attr_matrix = None
        edge_attr_names = None

    # Fill edge attribute matrix
    # edgeid_mat[i, j] is the position of edge (i, j) in the CSR storage
    # order of adj_matrix, which is the row index into edge_attr_matrix.
    edgeid_mat = sp.csr_matrix(
        (np.arange(adj_matrix.nnz), adj_matrix.indices, adj_matrix.indptr),
        shape=adj_matrix.shape)
    for i, j, edge_data in nx_graph.edges.data():
        for key, val in edge_data.items():
            if key != 'weight':
                if not isinstance(val, Number):
                    if node_names is None:
                        raise ValueError(
                            "Edge {}->{} has attribute '{}' with value '{}', which is not a number."
                            .format(i, j, key, val))
                    else:
                        raise ValueError(
                            "Edge '{}'->'{}' has attribute '{}' with value '{}', which is not a number."
                            .format(node_names[i], node_names[j], key, val))
                # The edge mapping (not the node attribute mapping) decides
                # how an edge attribute key is turned into a column.
                new_key = (key if edge_attr_mapping is None
                           else edge_attr_mapping[key])
                edge_attr_matrix[edgeid_mat[i, j], new_key] = val
                if not nx_graph.is_directed():
                    # Undirected edges are stored in both directions.
                    edge_attr_matrix[edgeid_mat[j, i], new_key] = val
    if edge_attr_matrix is not None and sparse_edge_attrs:
        edge_attr_matrix = edge_attr_matrix.tocsr()

    return gust.SparseGraph(adj_matrix=adj_matrix,
                            attr_matrix=attr_matrix,
                            edge_attr_matrix=edge_attr_matrix,
                            labels=labels,
                            node_names=node_names,
                            attr_names=attr_names,
                            edge_attr_names=edge_attr_names,
                            class_names=class_names,
                            metadata=None)
예제 #15
0
def create_subgraph(sparse_graph: 'gust.SparseGraph',
                    _sentinel: None = None,
                    nodes_to_remove: np.ndarray = None,
                    nodes_to_keep: np.ndarray = None) -> 'gust.SparseGraph':
    """Create a graph with the specified subset of nodes.

    Exactly one of (nodes_to_remove, nodes_to_keep) should be provided, while the other stays None.
    Note that to avoid confusion, it is required to pass node indices as named arguments to this function.

    The subgraph partially points to the old graph's data.

    Parameters
    ----------
    sparse_graph
        Input graph.
    _sentinel
        Internal, to prevent passing positional arguments. Do not use.
    nodes_to_remove
        Indices of nodes that have to removed.
    nodes_to_keep
        Indices of nodes that have to be kept. They are used in sorted
        order, regardless of the order in which they are given.

    Returns
    -------
    gust.SparseGraph
        Graph with specified nodes removed.

    Raises
    ------
    ValueError
        If a positional argument is passed after ``sparse_graph``, or if
        not exactly one of ``nodes_to_remove`` / ``nodes_to_keep`` is given.

    """
    # Check that arguments are passed correctly
    if _sentinel is not None:
        raise ValueError("Only call `create_subgraph` with named arguments,"
                         " (nodes_to_remove=...) or (nodes_to_keep=...).")
    if nodes_to_remove is None and nodes_to_keep is None:
        raise ValueError(
            "Either nodes_to_remove or nodes_to_keep must be provided.")
    if nodes_to_remove is not None and nodes_to_keep is not None:
        raise ValueError(
            "Only one of nodes_to_remove or nodes_to_keep must be provided.")

    if nodes_to_remove is not None:
        # A set gives constant-time membership tests; scanning the
        # list/array for every node would be quadratic.
        removed = set(np.asarray(nodes_to_remove).ravel().tolist())
        nodes_to_keep = [
            i for i in range(sparse_graph.num_nodes()) if i not in removed
        ]
    else:
        nodes_to_keep = sorted(nodes_to_keep)

    adj_matrix = sparse_graph.adj_matrix[nodes_to_keep][:, nodes_to_keep]
    if sparse_graph.attr_matrix is None:
        attr_matrix = None
    else:
        attr_matrix = sparse_graph.attr_matrix[nodes_to_keep]
    if sparse_graph.edge_attr_matrix is None:
        edge_attr_matrix = None
    else:
        # Keep the attributes of those edges whose endpoints both survive.
        old_idx = sparse_graph.get_edgeid_to_idx_array()
        keep_edge_idx = np.where(
            np.all(np.isin(old_idx, nodes_to_keep), axis=1))[0]
        edge_attr_matrix = sparse_graph.edge_attr_matrix[keep_edge_idx]
    if sparse_graph.labels is None:
        labels = None
    else:
        labels = sparse_graph.labels[nodes_to_keep]
    if sparse_graph.node_names is None:
        node_names = None
    else:
        node_names = sparse_graph.node_names[nodes_to_keep]
    # TODO: add warnings / logging
    # Attribute names, class names and metadata are shared with the input.
    return gust.SparseGraph(adj_matrix, attr_matrix, edge_attr_matrix, labels,
                            node_names, sparse_graph.attr_names,
                            sparse_graph.edge_attr_names,
                            sparse_graph.class_names, sparse_graph.metadata)
예제 #16
0
# Restrict the adjacency matrix, the inputs to the vectorizer (X) and the
# labels (z) to the nodes selected by keep_mask.
A = A[keep_mask][:, keep_mask]
X = X[keep_mask]
z = z[keep_mask]
# convert i2n map
# New indices are assigned consecutively in the iteration order of
# nodes_to_keep (presumably the indices selected by keep_mask - TODO confirm).
new_idx = 0
new_i2n = {}
for old_idx in nodes_to_keep:
    new_i2n[new_idx] = i2n[old_idx]
    new_idx += 1
i2n = new_i2n

# Rename class labels
# The distinct remaining labels are mapped onto 0..k-1 in sorted order.
remap_z = dict(zip(np.unique(z), range(len(np.unique(z)))))
z = np.vectorize(remap_z.get)(z)

# Vectorize attributes

# Note that since we use min_df=10 you will get different results
# if you perform vectorization before or after filtering nodes from the network (e.g. singletons)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=10, stop_words='english')
X_tfidf = tfidf.fit_transform(X)
# Invert the vectorizer's vocabulary mapping to get index -> attribute.
i2a = {v: k for (k, v) in tfidf.vocabulary_.items()}

# Assemble the graph and save it under the name 'citeseer_m10'.
G = gust.SparseGraph(A,
                     attr_matrix=X_tfidf,
                     labels=z,
                     idx_to_node=i2n,
                     idx_to_attr=i2a)
gust.io.save_to_npz('citeseer_m10', G)