Example #1
def sorensen_coefficient(G, ebunch=None):
    """

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        graph should be undirected, where an undirected edge is represented by
        a directed edge in each direction. The adjacency list will be computed
        if not already present.
    ebunch : cudf.DataFrame, optional (default=None)
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the Sorensen coefficient is computed for the
        given vertex pairs.  If vertex pairs are not provided, the current
        implementation computes the Sorensen coefficient for all pairs of
        adjacent vertices in the graph.

    Returns
    -------
    df  : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        (first, second) containing the Sorensen weights. The ordering is
        relative to the adjacency list, or that given by the specified vertex
        pairs.

        df['source'] : cudf.Series
            The source vertex ID (will be identical to first if specified)
        df['destination'] : cudf.Series
            The destination vertex ID (will be identical to second if
            specified)
        df['sorensen_coeff'] : cudf.Series
            The computed Sorensen coefficient between the source and
            destination vertices

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.sorensen_coefficient(G)

    """
    vertex_pair = None

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if isNx is True and ebunch is not None:
        vertex_pair = cudf.DataFrame(ebunch)

    df = sorensen(G, vertex_pair)

    if isNx is True:
        df = df_edge_score_to_dictionary(df,
                                         k="sorensen_coeff",
                                         src="source",
                                         dst="destination")

    return df
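
# A minimal usage sketch (assumes cugraph/cudf are imported and `G` was built
# from karate.csv as in the docstring example above): the result exposes the
# 'source', 'destination' and 'sorensen_coeff' columns documented in Returns.
sorensen_df = cugraph.sorensen_coefficient(G)
print(sorensen_df.sort_values('sorensen_coeff', ascending=False).head())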
Example #2
def test_modularity_clustering_nx(graph_file, partitions):
    # Read in the graph and get a cugraph object
    csv_data = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)

    nxG = nx.from_pandas_edgelist(
            csv_data,
            source="0",
            target="1",
            edge_attr="weight",
            create_using=nx.DiGraph(),
        )
    assert nx.is_directed(nxG) is True
    assert nx.is_weighted(nxG) is True

    cuG, isNx = ensure_cugraph_obj_for_nx(nxG)
    assert cugraph.is_directed(cuG) is True
    assert cugraph.is_weighted(cuG) is True

    # Get the modularity score for partitioning versus random assignment
    cu_score = cugraph_call(cuG, partitions)
    rand_score = random_call(cuG, partitions)

    # Assert that the partitioning has better modularity than the random
    # assignment
    assert cu_score > rand_score
Example #3
def core_number(G):
    """
    Compute the core numbers for the nodes of the graph G. A k-core of a graph
    is a maximal subgraph that contains nodes of degree k or more.
    A node has a core number of k if it belongs to a k-core but not to a
    (k+1)-core.
    This call does not support a graph with self-loops and parallel
    edges.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph should contain undirected edges where undirected edges are
        represented as directed edges in both directions. While this graph
        can contain edge weights, they don't participate in the calculation
        of the core numbers.

    Returns
    -------
    df : cudf.DataFrame or python dictionary (if the input is a NetworkX graph)
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding core number values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['core_number'] : cudf.Series
            Contains the core number of vertices

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> cn = cugraph.core_number(G)

    """

    G, isNx = ensure_cugraph_obj_for_nx(G)

    df = core_number_wrapper.core_number(G)

    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    if isNx is True:
        df = df_score_to_dictionary(df, 'core_number')

    return df
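
# A small follow-up sketch (assumes `G` from the docstring example above): the
# returned frame maps each vertex to its core number, so the vertices that
# belong to a k-core for a chosen k can be selected with a boolean mask.
cn = cugraph.core_number(G)
k = 3  # illustrative threshold
print(cn[cn['core_number'] >= k])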
Example #4
def minimum_spanning_tree(
    G, weight=None, algorithm="boruvka", ignore_nan=False
):
    """
    Returns a minimum spanning tree (MST) or forest (MSF) on an undirected
    graph

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        cuGraph graph descriptor with connectivity information.

    weight : string, optional (default=None)
        Defaults to the weights in the graph. If the graph edges do not have a
        weight attribute, a default weight of 1 will be used.

    algorithm : string, optional (default='boruvka')
        The parallel algorithm to use when finding a minimum spanning tree.

    ignore_nan : bool, optional (default=False)

    Returns
    -------
    G_mst : cuGraph.Graph or networkx.Graph
        A graph descriptor with a minimum spanning tree or forest.
        The networkx graph will not have all attributes copied over

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'netscience.csv', delimiter='\t',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> # cugraph.minimum_spanning_tree(G)

    """
    # FIXME: Uncomment out the above example

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if isNx is True:
        mst = _minimum_spanning_tree_subgraph(G)
        return cugraph_to_nx(mst)
    else:
        return _minimum_spanning_tree_subgraph(G)
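
# A hedged usage sketch (assumes `G` was built from netscience.csv as in the
# docstring example above, and that view_edge_list() is available on the
# returned cugraph.Graph): inspect the edges of the MST/MSF.
G_mst = cugraph.minimum_spanning_tree(G)
print(G_mst.view_edge_list().head())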
Example #5
def overlap_coefficient(G, ebunch=None):
    """
    NetworkX similar API.  See 'jaccard' for a description

    """
    vertex_pair = None

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if isNx is True and ebunch is not None:
        vertex_pair = cudf.DataFrame(ebunch)

    df = overlap(G, vertex_pair)

    if isNx is True:
        df = df_edge_score_to_dictionary(df,
                                         k="overlap_coeff",
                                         src="source",
                                         dst="destination")

    return df
Example #6
def triangles(G):
    """
    Compute the number of triangles (cycles of length three) in the
    input graph.

    Unlike NetworkX, this algorithm simply returns the total number of
    triangles and not the number per vertex.

    Parameters
    ----------
    G : cugraph.graph or networkx.Graph
        cuGraph graph descriptor; should contain the connectivity information
        (edge weights are not used in this algorithm).

    Returns
    -------
    count : int64
        A 64 bit integer whose value gives the number of triangles in the
        graph.

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv',
    ...                     delimiter = ' ',
    ...                     dtype=['int32', 'int32', 'float32'],
    ...                     header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> count = cugraph.triangles(G)

    """

    G, _ = ensure_cugraph_obj_for_nx(G)

    if type(G) is not Graph:
        raise Exception("input graph must be undirected")

    result = triangle_count_wrapper.triangles(G)

    return result
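
# A hedged cross-check sketch (assumes networkx is importable and `G`/`gdf`
# come from the docstring example above): NetworkX reports per-vertex triangle
# counts, which can be aggregated for a rough comparison with the cuGraph
# total.
import networkx as nx
nxG = nx.from_pandas_edgelist(gdf.to_pandas(), source='0', target='1')
print(cugraph.triangles(G), sum(nx.triangles(nxG).values()))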
Example #7
def k_truss(G, k):
    """
    Returns the K-Truss subgraph of a graph for a specific k.

    NOTE: this function is currently not available on CUDA 11.4 systems.

    The k-truss of a graph is a subgraph where each edge is part of at least
    (k−2) triangles. K-trusses are used for finding tightly knit groups of
    vertices in a graph. A k-truss is a relaxation of a k-clique in the graph
    and was defined in [1]. Finding cliques is computationally demanding and
    finding the maximal k-clique is known to be NP-Hard.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        cuGraph graph descriptor with connectivity information. k-trusses are
        defined only for undirected graphs, as they are based on undirected
        triangles in a graph.

    k : int
        The desired k to be used for extracting the k-truss subgraph.

    Returns
    -------
    G_truss : cuGraph.Graph or networkx.Graph
        A cugraph graph descriptor with the k-truss subgraph for the given k.
        The networkx graph will NOT have all attributes copied over
    """

    _ensure_compatible_cuda_version()

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if isNx is True:
        k_sub = ktruss_subgraph(G, k)
        S = cugraph_to_nx(k_sub)
        return S
    else:
        return ktruss_subgraph(G, k)
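
# A usage sketch (assumes cugraph/cudf are imported and an undirected graph `G`
# was built from karate.csv as in the other docstring examples in this module,
# on a CUDA version where k_truss is available): extract the 4-truss and
# compare edge counts against the original graph.
G_truss = cugraph.k_truss(G, k=4)
print(G.number_of_edges(), G_truss.number_of_edges())
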
def betweenness_centrality(
    G,
    k=None,
    normalized=True,
    weight=None,
    endpoints=False,
    seed=None,
    result_dtype=np.float64,
):
    """
    Compute the betweenness centrality for all vertices of the graph G.
    Betweenness centrality is a measure of the number of shortest paths that
    pass through a vertex.  A vertex with a high betweenness centrality score
    has more paths passing through it and is therefore believed to be more
    important.

    To improve performance, rather than doing an all-pairs shortest path,
    a sample of k starting vertices can be used.

    cuGraph does not currently support the 'endpoints' and 'weight' parameters
    as seen in the corresponding NetworkX call.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (Graph(directed=True)) or undirected.
        Weights in the graph are ignored, the current implementation uses
        BFS traversals. Use weight parameter if weights need to be considered
        (currently not supported)

    k : int or list or None, optional (default=None)
        If k is not None, use k node samples to estimate betweenness.  Higher
        values give better approximation.  If k is a list, use the content
        of the list for estimation: the list should contain vertex
        identifiers. If k is None (the default), all the vertices are used
        to estimate betweenness.  Vertices obtained through sampling or
        defined as a list will be used as sources for traversals inside the
        algorithm.

    normalized : bool, optional
        Default is True.
        If true, the betweenness values are normalized by
        2 / ((n - 1) * (n - 2)) for undirected Graphs, and
        1 / ((n - 1) * (n - 2)) for directed Graphs
        where n is the number of nodes in G.
        Normalization will ensure that values are in [0, 1],
        this normalization scales for the highest possible value where one
        node is crossed by every single shortest path.

    weight : cudf.DataFrame, optional (default=None)
        Specifies the weights to be used for each edge.
        Should contain a mapping between
        edges and weights.
        (Not Supported)

    endpoints : bool, optional (default=False)
        If true, include the endpoints in the shortest path counts.
        (Not Supported)

    seed : optional
        if k is specified and k is an integer, use seed to initialize the
        random number generator.
        Using None as seed relies on random.seed() behavior: using current
        system time
        If k is either None or list: seed parameter is ignored

    result_dtype : np.float32 or np.float64, optional, default=np.float64
        Indicate the data type of the betweenness centrality scores

    Returns
    -------
    df : cudf.DataFrame or Dictionary if using NetworkX
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding betweenness centrality values.
        Please note that the resulting 'vertex' column might not be
        in ascending order.  The dictionary contains the same information,
        mapping each vertex identifier to its betweenness centrality score.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['betweenness_centrality'] : cudf.Series
            Contains the betweenness centrality of vertices

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> bc = cugraph.betweenness_centrality(G)

    """
    # vertices is intended to be a cuDF series that contains a sampling of
    # k vertices out of the graph.
    #
    # NOTE: cuDF doesn't currently support sampling, but there is a python
    # workaround.

    if weight is not None:
        raise NotImplementedError("weighted implementation of betweenness "
                                  "centrality not currently supported")

    if result_dtype not in [np.float32, np.float64]:
        raise TypeError("result type can only be np.float32 or np.float64")

    G, isNx = ensure_cugraph_obj_for_nx(G)

    vertices = _initialize_vertices(G, k, seed)

    df = betweenness_centrality_wrapper.betweenness_centrality(
        G, normalized, endpoints, weight, vertices, result_dtype)

    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    if isNx is True:
        # avoid shadowing the built-in `dict`
        return df_score_to_dictionary(df, 'betweenness_centrality')
    else:
        return df
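
# A usage sketch (assumes `G` from the docstring example above): approximate
# the scores from k sampled sources with a fixed seed, then look at the top
# ranked vertices.
bc = cugraph.betweenness_centrality(G, k=10, seed=42)
print(bc.sort_values('betweenness_centrality', ascending=False).head())
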
def edge_betweenness_centrality(G,
                                k=None,
                                normalized=True,
                                weight=None,
                                seed=None,
                                result_dtype=np.float64):
    """
    Compute the edge betweenness centrality for all edges of the graph G.
    Betweenness centrality is a measure of the number of shortest paths
    that pass over an edge.  An edge with a high betweenness centrality
    score has more paths passing over it and is therefore believed to be
    more important.

    To improve performance, rather than doing an all-pair shortest path,
    a sample of k starting vertices can be used.

    cuGraph does not currently support the 'weight' parameter
    as seen in the corresponding NetworkX call.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (Graph(directed=True)) or undirected.
        Weights in the graph are ignored, the current implementation uses
        BFS traversals. Use weight parameter if weights need to be considered
        (currently not supported)

    k : int or list or None, optional (default=None)
        If k is not None, use k node samples to estimate betweenness.  Higher
        values give better approximation.
        If k is a list, use the content of the list for estimation: the list
        should contain vertex identifiers.
        Vertices obtained through sampling or defined as a list will be used as
        sources for traversals inside the algorithm.

    normalized : bool, optional (default=True)
        Default is True.
        If true, the betweenness values are normalized by
        2 / (n * (n - 1)) for undirected Graphs, and
        1 / (n * (n - 1)) for directed Graphs
        where n is the number of nodes in G.
        Normalization will ensure that values are in [0, 1],
        this normalization scales for the highest possible value where one
        edge is crossed by every single shortest path.

    weight : cudf.DataFrame, optional (default=None)
        Specifies the weights to be used for each edge.
        Should contain a mapping between
        edges and weights.
        (Not Supported)

    seed : optional (default=None)
        if k is specified and k is an integer, use seed to initialize the
        random number generator.
        Using None as seed relies on random.seed() behavior: using current
        system time
        If k is either None or list: seed parameter is ignored

    result_dtype : np.float32 or np.float64, optional (default=np.float64)
        Indicates the data type of the betweenness centrality scores.
        Using double automatically switches the implementation to "default".

    Returns
    -------
    df : cudf.DataFrame or Dictionary if using NetworkX
        GPU data frame containing three cudf.Series of size E: the vertex
        identifiers of the sources, the vertex identifiers of the destinations
        and the corresponding betweenness centrality values.
        Please note that the resulting 'src' and 'dst' columns might not be
        in ascending order.

        df['src'] : cudf.Series
            Contains the vertex identifiers of the source of each edge

        df['dst'] : cudf.Series
            Contains the vertex identifiers of the destination of each edge

        df['edge_betweenness_centrality'] : cudf.Series
            Contains the betweenness centrality of edges

        When using undirected graphs, 'src' and 'dst' only contain elements
        such that 'src' < 'dst', which might differ from NetworkX and the
        user's input. Namely, edge (1 -> 0) is transformed into (0 -> 1) but
        contains the betweenness centrality of edge (1 -> 0).


    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> ebc = cugraph.edge_betweenness_centrality(G)

    """
    if weight is not None:
        raise NotImplementedError("weighted implementation of betweenness "
                                  "centrality not currently supported")
    if result_dtype not in [np.float32, np.float64]:
        raise TypeError("result type can only be np.float32 or np.float64")

    G, isNx = ensure_cugraph_obj_for_nx(G)
    vertices = _initialize_vertices(G, k, seed)

    df = edge_betweenness_centrality_wrapper.edge_betweenness_centrality(
        G, normalized, weight, vertices, result_dtype)

    if G.renumbered:
        df = G.unrenumber(df, "src")
        df = G.unrenumber(df, "dst")

    if G.is_directed() is False:
        # select the lower triangle of the df based on src/dst vertex value
        lower_triangle = df['src'] >= df['dst']
        # swap the src and dst vertices for the lower triangle only. Because
        # this is a symmetrized graph, this operation results in a df with
        # multiple src/dst entries.
        df['src'][lower_triangle], df['dst'][lower_triangle] = \
            df['dst'][lower_triangle], df['src'][lower_triangle]
        # overwrite the df with the sum of the values for all alike src/dst
        # vertex pairs, resulting in half the edges of the original df from the
        # symmetrized graph.
        df = df.groupby(by=["src", "dst"]).sum().reset_index()

    if isNx is True:
        return df_edge_score_to_dictionary(df, 'betweenness_centrality')
    else:
        return df
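
# A usage sketch (assumes `G` from the docstring example above): approximate
# edge betweenness from k sampled sources and inspect the 'src'/'dst' pairs
# described in the Returns section along with their centrality scores.
ebc = cugraph.edge_betweenness_centrality(G, k=10, seed=42)
print(ebc.head())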
Example #10
def k_core(G, k=None, core_number=None):
    """
    Compute the k-core of the graph G based on the out degree of its nodes. A
    k-core of a graph is a maximal subgraph that contains nodes of degree k or
    more. This call does not support a graph with self-loops and parallel
    edges.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        cuGraph graph descriptor with connectivity information. The graph
        should contain undirected edges where undirected edges are represented
        as directed edges in both directions. While this graph can contain edge
        weights, they don't participate in the calculation of the k-core.

    k : int, optional (default=None)
        Order of the core. This value must not be negative. If set to None, the
        main core is returned.

    core_number : cudf.DataFrame, optional (default=None)
        Precomputed core number of the nodes of the graph G containing two
        cudf.Series of size V: the vertex identifiers and the corresponding
        core number values. If set to None, the core numbers of the nodes are
        calculated internally.

        core_number['vertex'] : cudf.Series
            Contains the vertex identifiers
        core_number['values'] : cudf.Series
            Contains the core number of vertices

    Returns
    -------
    KCoreGraph : cuGraph.Graph
        K Core of the input graph

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> KCoreGraph = cugraph.k_core(G)

    """

    G, isNx = ensure_cugraph_obj_for_nx(G)

    mytype = type(G)
    KCoreGraph = mytype()

    if mytype is not Graph:
        raise Exception("directed graph not supported")

    if core_number is not None:
        if G.renumbered is True:
            if len(G.renumber_map.implementation.col_names) > 1:
                cols = core_number.columns[:-1].to_list()
            else:
                cols = 'vertex'
            core_number = G.add_internal_vertex_id(core_number, 'vertex', cols)

    else:
        core_number = core_number_wrapper.core_number(G)
        core_number = core_number.rename(columns={"core_number": "values"},
                                         copy=False)

    if k is None:
        k = core_number["values"].max()

    k_core_df = k_core_wrapper.k_core(G, k, core_number)

    # default column names; unrenumbering may replace them with the original
    # external column names
    src_names = "src"
    dst_names = "dst"

    if G.renumbered:
        k_core_df, src_names = G.unrenumber(k_core_df,
                                            "src",
                                            get_column_names=True)
        k_core_df, dst_names = G.unrenumber(k_core_df,
                                            "dst",
                                            get_column_names=True)

    if G.edgelist.weights:
        KCoreGraph.from_cudf_edgelist(k_core_df,
                                      source=src_names,
                                      destination=dst_names,
                                      edge_attr="weight")
    else:
        KCoreGraph.from_cudf_edgelist(
            k_core_df,
            source=src_names,
            destination=dst_names,
        )

    if isNx is True:
        KCoreGraph = cugraph_to_nx(KCoreGraph)

    return KCoreGraph
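
# A sketch of passing precomputed core numbers (assumes `G` from the docstring
# example above). As in the body of k_core above, the core-number column must
# be renamed to 'values' before being passed back in.
cn = cugraph.core_number(G)
cn = cn.rename(columns={"core_number": "values"})
KCoreGraph = cugraph.k_core(G, k=2, core_number=cn)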
Example #11
def louvain(G, max_iter=100, resolution=1.):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method

    It uses the Louvain method described in:

    VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
    community hierarchies in large networks, J Stat Mech P10008 (2008),
    http://arxiv.org/abs/0803.0476

    Parameters
    ----------
    G : cugraph.Graph or NetworkX Graph
        The graph descriptor should contain the connectivity information
        and weights. The adjacency list will be computed if not already
        present.

    max_iter : integer, optional (default=100)
        This controls the maximum number of levels/iterations of the Louvain
        algorithm. When specified the algorithm will terminate after no more
        than the specified number of iterations. No error occurs when the
        algorithm terminates early in this manner.

    resolution: float/double, optional (default=1.0)
        Called gamma in the modularity formula, this changes the size
        of the communities.  Higher resolutions lead to more, smaller
        communities; lower resolutions lead to fewer, larger communities.
        Defaults to 1.

    Returns
    -------
    parts : cudf.DataFrame
        GPU data frame of size V containing two columns: the vertex id and the
        partition id it is assigned to.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices

    modularity_score : float
        a floating point number containing the global modularity score of the
        partitioning.

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv',
    ...                   delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> parts, modularity_score = cugraph.louvain(G)

    """

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if type(G) is not Graph:
        raise Exception("input graph must be undirected")

    parts, modularity_score = louvain_wrapper.louvain(G, max_iter, resolution)

    if G.renumbered:
        parts = G.unrenumber(parts, "vertex")

    if isNx is True:
        parts = df_score_to_dictionary(parts, "partition")

    return parts, modularity_score
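
# A usage sketch (assumes `G` was built with edge weights, e.g. edge_attr='2',
# since Louvain uses them): a higher resolution typically yields more, smaller
# communities; the community count can be read off the 'partition' column.
parts, modularity_score = cugraph.louvain(G, resolution=1.5)
print(parts['partition'].nunique(), modularity_score)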
Example #12
def spectralBalancedCutClustering(
    G,
    num_clusters,
    num_eigen_vects=2,
    evs_tolerance=0.00001,
    evs_max_iter=100,
    kmean_tolerance=0.00001,
    kmean_max_iter=100,
):
    """
    Compute a clustering/partitioning of the given graph using the spectral
    balanced cut method.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        Graph descriptor

    num_clusters : integer
        Specifies the number of clusters to find, must be greater than 1

    num_eigen_vects : integer, optional
        Specifies the number of eigenvectors to use. Must be less than or
        equal to num_clusters. Default is 2.

    evs_tolerance: float, optional
        Specifies the tolerance to use in the eigensolver.
        Default is 0.00001

    evs_max_iter: integer, optional
        Specifies the maximum number of iterations for the eigensolver.
        Default is 100

    kmean_tolerance: float, optional
        Specifies the tolerance to use in the k-means solver.
        Default is 0.00001

    kmean_max_iter: integer, optional
        Specifies the maximum number of iterations for the k-means solver.
        Default is 100

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding cluster assignments.

        df['vertex'] : cudf.Series
            contains the vertex identifiers
        df['cluster'] : cudf.Series
            contains the cluster assignments

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv',
    ...                   delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> df = cugraph.spectralBalancedCutClustering(G, 5)

    """

    # Error checking in C++ code

    G, isNx = ensure_cugraph_obj_for_nx(G)

    df = spectral_clustering_wrapper.spectralBalancedCutClustering(
        G,
        num_clusters,
        num_eigen_vects,
        evs_tolerance,
        evs_max_iter,
        kmean_tolerance,
        kmean_max_iter,
    )

    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    if isNx is True:
        df = df_score_to_dictionary(df, "cluster")

    return df
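
# A usage sketch (assumes `G` from the docstring example above): the size of
# each of the requested clusters can be read from the 'cluster' column.
df = cugraph.spectralBalancedCutClustering(G, 5)
print(df['cluster'].value_counts())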
Example #13
def force_atlas2(
    input_graph,
    max_iter=500,
    pos_list=None,
    outbound_attraction_distribution=True,
    lin_log_mode=False,
    prevent_overlapping=False,
    edge_weight_influence=1.0,
    jitter_tolerance=1.0,
    barnes_hut_optimize=True,
    barnes_hut_theta=0.5,
    scaling_ratio=2.0,
    strong_gravity_mode=False,
    gravity=1.0,
    verbose=False,
    callback=None,
):

    """
        ForceAtlas2 is a continuous graph layout algorithm for handy network
        visualization.

        NOTE: Peak memory allocation occurs at 30*V.

        Parameters
        ----------
        input_graph : cugraph.Graph
            cuGraph graph descriptor with connectivity information.
            Edge weights, if present, should be single or double precision
            floating point values.

        max_iter : integer, optional (default=500)
            This controls the maximum number of levels/iterations of the Force
            Atlas algorithm. When specified the algorithm will terminate after
            no more than the specified number of iterations.
            No error occurs when the algorithm terminates in this manner.
            Good short-term quality can be achieved with 50-100 iterations.
            Above 1000 iterations is discouraged.

        pos_list: cudf.DataFrame, optional (default=None)
            Data frame with initial vertex positions containing two columns:
            'x' and 'y' positions.

        outbound_attraction_distribution: bool, optional (default=True)
            Distributes attraction along outbound edges.
            Hubs attract less and thus are pushed to the borders.

        lin_log_mode: bool, optional (default=False)
            Switch Force Atlas model from lin-lin to lin-log.
            Makes clusters more tight.

        prevent_overlapping: bool, optional (default=False)
            Prevent nodes from overlapping.

        edge_weight_influence: float, optional (default=1.0)
            How much influence you give to the edges weight.
            0 is “no influence” and 1 is “normal”.

        jitter_tolerance: float, optional (default=1.0)
            How much swinging you allow. Above 1 discouraged.
            Lower gives less speed and more precision.

        barnes_hut_optimize: bool, optional (default=True)
            Whether to use the Barnes Hut approximation or the slower
            exact version.

        barnes_hut_theta: float, optional (default=0.5)
            Float between 0 and 1. Tradeoff for speed (1) vs
            accuracy (0) for Barnes Hut only.

        scaling_ratio: float, optional (default=2.0)
            How much repulsion you want. More makes a more sparse graph.
            Switching from regular mode to LinLog mode needs a readjustment
            of the scaling parameter.

        strong_gravity_mode: bool, optional (default=False)
            Sets a force that attracts the nodes that are distant from the
            center more. It is so strong that it can sometimes dominate other
            forces.

        gravity : float, optional (default=1.0)
            Attracts nodes to the center. Prevents islands from drifting away.

        verbose: bool, optional (default=False)
            Output convergence info at each iteration.

        callback: GraphBasedDimRedCallback, optional (default=None)
            An instance of GraphBasedDimRedCallback class to intercept
            the internal state of positions while they are being trained.

            Example of callback usage:

                from cugraph.internals import GraphBasedDimRedCallback

                class CustomCallback(GraphBasedDimRedCallback):
                    def on_preprocess_end(self, positions):
                        print(positions.copy_to_host())

                    def on_epoch_end(self, positions):
                        print(positions.copy_to_host())

                    def on_train_end(self, positions):
                        print(positions.copy_to_host())

        Returns
        -------
        pos : cudf.DataFrame
            GPU data frame of size V containing three columns:
            the vertex identifiers and the x and y positions.
    """
    input_graph, isNx = ensure_cugraph_obj_for_nx(input_graph)

    if pos_list is not None:
        if input_graph.renumbered is True:
            if input_graph.vertex_column_size() > 1:
                cols = pos_list.columns[:-2].to_list()
            else:
                cols = 'vertex'
            pos_list = input_graph.add_internal_vertex_id(pos_list,
                                                          "vertex",
                                                          cols)

    if prevent_overlapping:
        raise Exception("Feature not supported")

    if input_graph.is_directed():
        input_graph = input_graph.to_undirected()

    pos = force_atlas2_wrapper.force_atlas2(
        input_graph,
        max_iter=max_iter,
        pos_list=pos_list,
        outbound_attraction_distribution=outbound_attraction_distribution,
        lin_log_mode=lin_log_mode,
        prevent_overlapping=prevent_overlapping,
        edge_weight_influence=edge_weight_influence,
        jitter_tolerance=jitter_tolerance,
        barnes_hut_optimize=barnes_hut_optimize,
        barnes_hut_theta=barnes_hut_theta,
        scaling_ratio=scaling_ratio,
        strong_gravity_mode=strong_gravity_mode,
        gravity=gravity,
        verbose=verbose,
        callback=callback,
    )

    if input_graph.renumbered:
        pos = input_graph.unrenumber(pos, "vertex")

    return pos
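
# A usage sketch (assumes cugraph/cudf are imported and `G` was built from
# karate.csv as in the other docstring examples in this module; the 'x'/'y'
# column names follow the pos_list description above): run a short layout and
# inspect the vertex coordinates.
pos = cugraph.force_atlas2(G, max_iter=100)
print(pos[['vertex', 'x', 'y']].head())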
Example #14
def hits(G, max_iter=100, tol=1.0e-5, nstart=None, normalized=True):
    """
    Compute HITS hubs and authorities values for each vertex

    The HITS algorithm computes two numbers for each node.  The authorities
    score estimates the node value based on incoming links.  The hubs score
    estimates the node value based on outgoing links.

    The cuGraph implementation of HITS is a wrapper around the gunrock
    implementation of HITS.

    Note that the gunrock implementation uses a 2-norm, while networkx
    uses a 1-norm.  The raw scores will be different, but the rank ordering
    should be comparable with networkx.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm).
        The adjacency list will be computed if not already present.

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
        The gunrock implementation does not currently support tolerance,
        so this will in fact be the number of iterations the HITS algorithm
        executes.

    tol : float, optional (default=1.0e-5)
        Set the tolerance of the approximation; this parameter should be a
        small magnitude value.  This parameter is not currently supported.

    nstart : cudf.Dataframe, optional (default=None)
        Not currently supported

    normalized : bool, optional (default=True)
        Not currently supported, always used as True

    Returns
    -------
    HubsAndAuthorities : cudf.DataFrame
        GPU data frame containing three cudf.Series of size V: the vertex
        identifiers and the corresponding hubs values and the corresponding
        authorities values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['hubs'] : cudf.Series
            Contains the hubs score
        df['authorities'] : cudf.Series
            Contains the authorities score


    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> hits = cugraph.hits(G, max_iter = 50)

    """

    G, isNx = ensure_cugraph_obj_for_nx(G)

    df = hits_wrapper.hits(G, max_iter, tol)

    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    if isNx is True:
        d1 = df_score_to_dictionary(df[["vertex", "hubs"]], "hubs")
        d2 = df_score_to_dictionary(df[["vertex", "authorities"]],
                                    "authorities")
        df = (d1, d2)

    return df
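
# A usage sketch (assumes `G` from the docstring example above): rank vertices
# by their hub and authority scores using the columns listed in the Returns
# section.
hits_df = cugraph.hits(G, max_iter=50)
print(hits_df.sort_values('hubs', ascending=False).head())
print(hits_df.sort_values('authorities', ascending=False).head())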
Example #15
File: ecg.py  Project: rapidsai/cugraph
def ecg(input_graph, min_weight=0.05, ensemble_size=16, weight=None):
    """
    Compute the Ensemble Clustering for Graphs (ECG) partition of the input
    graph. ECG runs truncated Louvain on an ensemble of permutations of the
    input graph, then uses the ensemble partitions to determine weights for
    the input graph. The final result is found by running full Louvain on
    the input graph using the determined weights.

    See https://arxiv.org/abs/1809.05578 for further information.

    Parameters
    ----------
    input_graph : cugraph.Graph or NetworkX Graph
        The graph descriptor should contain the connectivity information
        and weights. The adjacency list will be computed if not already
        present.

    min_weight : float, optional (default=0.05)
        The minimum value to assign as an edge weight in the ECG algorithm.
        It should be a value in the range [0, 1], usually left as the default
        value of 0.05.

    ensemble_size : integer, optional (default=16)
        The number of graph permutations to use for the ensemble.
        The default value is 16, larger values may produce higher quality
        partitions for some graphs.

    weight : str, optional (default=None)
        This parameter is here for NetworkX compatibility and
        represents which NetworkX data column represents Edge weights.

    Returns
    -------
    parts : cudf.DataFrame or python dictionary
        GPU data frame of size V containing two columns, the vertex id and
        the partition id it is assigned to.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> parts = cugraph.ecg(G)

    """

    input_graph, isNx = ensure_cugraph_obj_for_nx(input_graph, weight)

    parts = ecg_wrapper.ecg(input_graph, min_weight, ensemble_size)

    if input_graph.renumbered:
        parts = input_graph.unrenumber(parts, "vertex")

    if isNx is True:
        return df_score_to_dictionary(parts, 'partition')
    else:
        return parts
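
# A usage sketch (assumes `G` was built with edge_attr='2' as in the docstring
# example above, since ECG relies on edge weights): count the communities
# found by the ensemble.
parts = cugraph.ecg(G, min_weight=0.05, ensemble_size=16)
print(parts['partition'].nunique())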
Example #16
def random_walks(G,
                 start_vertices,
                 max_depth=None,
                 use_padding=False):
    """
    Compute random walks for each node in 'start_vertices'.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (DiGraph) or undirected (Graph).
        Weights in the graph are ignored.
        Use weight parameter if weights need to be considered
        (currently not supported)

    start_vertices : int or list or cudf.Series or cudf.DataFrame
        A single node or a list or a cudf.Series of nodes from which to run
        the random walks. In case of multi-column vertices it should be
        a cudf.DataFrame

    max_depth : int, optional (default=None)
        The maximum depth of the random walks

    use_padding : bool, optional (default=False)
        If True, padded paths are returned else coalesced paths are returned.

    Returns
    -------
    vertex_paths : cudf.Series or cudf.DataFrame
        Series containing the vertices of edges/paths in the random walk.

    edge_weight_paths: cudf.Series
        Series containing the edge weights of edges represented by the
        returned vertex_paths

    sizes: int
        The path size in case of coalesced paths.

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> _, _, _ = cugraph.random_walks(G, [0, 1, 2], 3)

    """
    if max_depth is None:
        raise TypeError("must specify a 'max_depth'")

    # FIXME: supporting Nx types should mean having a return type that better
    # matches Nx expectations (eg. data on the CPU, possibly using a different
    # data struct like a dictionary, etc.). The 2nd value is ignored here,
    # which is typically named isNx and used to convert the return type.
    # Consider a different return type if Nx types are passed in.
    G, _ = ensure_cugraph_obj_for_nx(G)

    if isinstance(start_vertices, int):
        start_vertices = [start_vertices]

    if isinstance(start_vertices, list):
        start_vertices = cudf.Series(start_vertices)

    if G.renumbered is True:
        if isinstance(start_vertices, cudf.DataFrame):
            start_vertices = G.lookup_internal_vertex_id(
                start_vertices,
                start_vertices.columns)
        else:
            start_vertices = G.lookup_internal_vertex_id(start_vertices)

    vertex_set, edge_set, sizes = random_walks_wrapper.random_walks(
        G, start_vertices, max_depth, use_padding)

    if G.renumbered:
        df_ = cudf.DataFrame()
        df_['vertex_set'] = vertex_set
        df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True)
        vertex_set = cudf.Series(df_['vertex_set'])

    if use_padding:
        edge_set_sz = (max_depth-1)*len(start_vertices)
        return vertex_set, edge_set[:edge_set_sz], sizes

    vertex_set_sz = sizes.sum()
    edge_set_sz = vertex_set_sz - len(start_vertices)
    return vertex_set[:vertex_set_sz], edge_set[:edge_set_sz], sizes
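
# A usage sketch (assumes `G` from the docstring example above): run walks of
# depth 4 from three start vertices and request padded paths. The layout of
# the returned series follows the Returns section above.
starts = [0, 1, 2]
vertex_paths, edge_weights, sizes = cugraph.random_walks(
    G, starts, max_depth=4, use_padding=True)
print(vertex_paths.head(), edge_weights.head())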
Example #17
def subgraph(G, vertices):
    """
    Compute a subgraph of the existing graph including only the specified
    vertices.  This algorithm works with both directed and undirected graphs
    and does not actually traverse the edges; instead it simply pulls out any
    edges whose two endpoints are both contained in the vertices list.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor

    vertices : cudf.Series or cudf.DataFrame
        Specifies the vertices of the induced subgraph. For multi-column
        vertices, vertices should be provided as a cudf.DataFrame

    Returns
    -------
    Sg : cugraph.Graph
        A graph object containing the subgraph induced by the given vertex set.

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv',
    ...                     delimiter = ' ',
    ...                     dtype=['int32', 'int32', 'float32'],
    ...                     header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> verts = np.zeros(3, dtype=np.int32)
    >>> verts[0] = 0
    >>> verts[1] = 1
    >>> verts[2] = 2
    >>> sverts = cudf.Series(verts)
    >>> Sg = cugraph.subgraph(G, sverts)

    """

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if G.renumbered:
        if isinstance(vertices, cudf.DataFrame):
            vertices = G.lookup_internal_vertex_id(vertices, vertices.columns)
        else:
            vertices = G.lookup_internal_vertex_id(vertices)

    result_graph = type(G)()

    df = subgraph_extraction_wrapper.subgraph(G, vertices)
    src_names = "src"
    dst_names = "dst"

    if G.renumbered:
        df, src_names = G.unrenumber(df, src_names, get_column_names=True)
        df, dst_names = G.unrenumber(df, dst_names, get_column_names=True)

    if G.edgelist.weights:
        result_graph.from_cudf_edgelist(df,
                                        source=src_names,
                                        destination=dst_names,
                                        edge_attr="weight")
    else:
        result_graph.from_cudf_edgelist(df,
                                        source=src_names,
                                        destination=dst_names)

    if isNx is True:
        result_graph = cugraph_to_nx(result_graph)

    return result_graph
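
# A quick follow-up sketch (continuing the docstring example above, where
# `sverts` holds vertices 0, 1 and 2): the induced subgraph only contains
# edges whose two endpoints are both in the given vertex set.
Sg = cugraph.subgraph(G, sverts)
print(Sg.number_of_vertices(), Sg.number_of_edges())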
Example #18
def analyzeClustering_edge_cut(G, n_clusters, clustering,
                               vertex_col_name='vertex',
                               cluster_col_name='cluster'):
    """
    Compute the edge cut score for a partitioning/clustering.
    The assumption is that “clustering” is the result of a call
    to a clustering algorithm (e.g. spectralBalancedCutClustering) and
    contains columns named “vertex” and “cluster”.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor

    n_clusters : integer
        Specifies the number of clusters in the given clustering

    clustering : cudf.DataFrame
        The cluster assignment to analyze.

    vertex_col_name : str, optional (default='vertex')
        The name of the column in the clustering dataframe identifying
        the external vertex id

    cluster_col_name : str, optional (default='cluster')
        The name of the column in the clustering dataframe identifying
        the cluster id

    Returns
    -------
    score : float
        The computed edge cut score

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv',
    ...                   delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None)
    >>> df = cugraph.spectralBalancedCutClustering(G, 5)
    >>> score = cugraph.analyzeClustering_edge_cut(G, 5, df)

    """
    if type(vertex_col_name) is list:
        if not all(isinstance(name, str) for name in vertex_col_name):
            raise Exception("vertex_col_name must be list of string")
    elif type(vertex_col_name) is not str:
        raise Exception("vertex_col_name must be a string")

    if type(cluster_col_name) is not str:
        raise Exception("cluster_col_name must be a string")

    G, isNx = ensure_cugraph_obj_for_nx(G)

    if G.renumbered:
        clustering = G.add_internal_vertex_id(clustering,
                                              'vertex',
                                              vertex_col_name,
                                              drop=True)

    clustering = clustering.sort_values('vertex').reset_index(drop=True)

    score = spectral_clustering_wrapper.analyzeClustering_edge_cut(
        G, n_clusters, clustering[cluster_col_name]
    )

    return score
Example #19
def pagerank(
    G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-5, nstart=None,
    weight=None, dangling=None
):
    """
    Find the PageRank score for every vertex in a graph. cuGraph computes an
    approximation of the Pagerank eigenvector using the power method. The
    number of iterations depends on the properties of the network itself; it
    increases when the tolerance decreases and/or alpha increases toward the
    limiting value of 1. The user is free to use default values or to provide
    inputs for the initial guess, tolerance and maximum number of iterations.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list.
        The transposed adjacency list will be computed if not already present.

    alpha : float, optional (default=0.85)
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to “teleport” to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.

    personalization : cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned. This can
        be used to limit the execution time and do an early exit before the
        solver reaches the convergence tolerance.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 100.

    tol : float, optional (default=1e-05)
        Set the tolerance of the approximation; this parameter should be a
        small magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.

    nstart : cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the initial guess for pagerank.

        nstart['vertex'] : cudf.Series
            Subset of vertices of graph for initial guess for pagerank values
        nstart['values'] : cudf.Series
            Pagerank values for vertices

    weight: str, optional (default=None)
        The attribute column to be used as edge weights if Graph is a NetworkX
        Graph. This parameter is here for NetworkX compatibility and is ignored
        in case of a cugraph.Graph

    dangling : dict, optional (default=None)
        This parameter is here for NetworkX compatibility and ignored

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score


    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> pr = cugraph.pagerank(G, alpha = 0.85, max_iter = 500, tol = 1.0e-05)

    """

    G, isNx = ensure_cugraph_obj_for_nx(G, weight)

    if personalization is not None:
        if not isinstance(personalization, cudf.DataFrame):
            raise NotImplementedError(
                "personalization other than a cudf dataframe "
                "currently not supported"
            )
        if G.renumbered is True:
            if len(G.renumber_map.implementation.col_names) > 1:
                cols = personalization.columns[:-1].to_list()
            else:
                cols = 'vertex'
            personalization = G.add_internal_vertex_id(
                personalization, "vertex", cols
            )

    if nstart is not None:
        if G.renumbered is True:
            if len(G.renumber_map.implementation.col_names) > 1:
                cols = nstart.columns[:-1].to_list()
            else:
                cols = 'vertex'
            nstart = G.add_internal_vertex_id(
                nstart, "vertex", cols
            )

    df = pagerank_wrapper.pagerank(
        G, alpha, personalization, max_iter, tol, nstart
    )

    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    if isNx is True:
        return df_score_to_dictionary(df, 'pagerank')
    else:
        return df
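
# A personalization sketch (assumes `G` from the docstring example above): the
# personalization frame uses the 'vertex'/'values' columns documented in the
# Parameters section; dtypes are cast here to match the int32/float32 graph
# loaded from karate.csv.
personalization = cudf.DataFrame({
    'vertex': cudf.Series([0, 33], dtype='int32'),
    'values': cudf.Series([0.5, 0.5], dtype='float32'),
})
pr = cugraph.pagerank(G, alpha=0.85, personalization=personalization)
print(pr.sort_values('pagerank', ascending=False).head())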
Example #20
def leiden(G, max_iter=100, resolution=1.):
    """
    Compute the modularity optimizing partition of the input graph using the
    Leiden algorithm

    It uses the Leiden method described in:

    Traag, V. A., Waltman, L., & van Eck, N. J. (2019). From Louvain to Leiden:
    guaranteeing well-connected communities. Scientific reports, 9(1), 5233.
    doi: 10.1038/s41598-019-41695-z

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor of type Graph

        The adjacency list will be computed if not already present.

    max_iter : integer, optional (default=100)
        This controls the maximum number of levels/iterations of the Leiden
        algorithm. When specified the algorithm will terminate after no more
        than the specified number of iterations. No error occurs when the
        algorithm terminates early in this manner.

    resolution: float/double, optional (default=1.0)
        Called gamma in the modularity formula, this changes the size
        of the communities.  Higher resolutions lead to more, smaller
        communities; lower resolutions lead to fewer, larger communities.
        Defaults to 1.

    Returns
    -------
    parts : cudf.DataFrame
        GPU data frame of size V containing two columns: the vertex id and the
        partition id it is assigned to.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices

    modularity_score : float
        a floating point number containing the global modularity score of the
        partitioning.

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv',
    ...                   delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> parts, modularity_score = cugraph.leiden(G)

    """
    G, isNx = ensure_cugraph_obj_for_nx(G)

    if type(G) is not Graph:
        raise Exception(f"input graph must be undirected was {type(G)}")

    parts, modularity_score = leiden_wrapper.leiden(G, max_iter, resolution)

    if G.renumbered:
        parts = G.unrenumber(parts, "vertex")

    if isNx is True:
        parts = df_score_to_dictionary(parts, "partition")

    return parts, modularity_score