예제 #1
0
def bfs(graph,
        start,
        return_distances=False):
    """
    Run a multi-GPU breadth first traversal and report, for every vertex,
    the vertex it was reached from (and optionally its distance from the
    starting vertex).
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor holding the connectivity information as a
        dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Vertex at which the breadth-first search begins; only the component
        reachable from this vertex is traversed. When the graph is
        renumbered the value is translated to its internal vertex id before
        the workers are invoked.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex (Only if return_distances is True)

        df['predecessor'] gives the vertex it was
        reached from in the traversal. For renumbered graphs, null
        predecessors left after un-renumbering are replaced with -1.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    # shuffle() also returns the partition row/column sizes; they are not
    # needed by this algorithm.
    shuffled = shuffle(graph, transposed=False)
    ddf, num_verts, _, _, vertex_partition_offsets = shuffled
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        # Translate the caller-facing vertex id into the internal id space.
        external_start = cudf.Series([start], dtype='int32')
        internal_start = graph.lookup_internal_vertex_id(external_start)
        start = internal_start.compute().iloc[0]

    # Launch one task per worker, pinned to the worker owning the partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(client.submit(call_bfs,
                                     Comms.get_session_id(),
                                     parts,
                                     num_verts,
                                     num_edges,
                                     vertex_partition_offsets,
                                     start,
                                     return_distances,
                                     workers=[worker]))
    wait(futures)
    ddf = dask_cudf.from_delayed(futures)

    if not graph.renumbered:
        return ddf

    # Map both id columns back to the caller's original vertex ids.
    ddf = graph.unrenumber(ddf, 'vertex')
    ddf = graph.unrenumber(ddf, 'predecessor')
    ddf["predecessor"] = ddf["predecessor"].fillna(-1)
    return ddf
예제 #2
0
def katz_centrality(input_graph,
                    alpha=None,
                    beta=None,
                    max_iter=100,
                    tol=1.0e-5,
                    nstart=None,
                    normalized=True):
    """
    Compute the Katz centrality for the nodes of the graph G using multiple
    GPUs.

    Parameters
    ----------
    input_graph : cuGraph.Graph
        cuGraph graph descriptor with connectivity information. The graph can
        contain either directed (DiGraph) or undirected edges (Graph).
    alpha : float, optional, default=None
        Attenuation factor. If alpha is not specified then
        it is internally calculated as 1/(degree_max) where degree_max is the
        maximum out degree.
        NOTE : The maximum acceptable value of alpha for convergence
        alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue
        of the graph.
        Since lambda_max is always lesser than or equal to degree_max for a
        graph, alpha_max will always be greater than or equal to
        (1/degree_max). Therefore, setting alpha to (1/degree_max) will
        guarantee that it will never exceed alpha_max thus in turn fulfilling
        the requirement for convergence.
    beta : None
        A weight scalar - currently Not Supported. The value is forwarded
        unchanged to the per-worker call.
    max_iter : int, optional, default=100
        The maximum number of iterations before an answer is returned. This can
        be used to limit the execution time and do an early exit before the
        solver reaches the convergence tolerance.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 100.
    tol : float, optional, default=1.0e-5
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0e-6.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 1e-2 and 1e-6 are
        acceptable.
    nstart : dask_cudf.Dataframe, optional, default=None
        Initial guess for katz centrality. Currently NOT used: this function
        always passes ``None`` to the workers, and emits a ``UserWarning``
        if a value is supplied.
    normalized : bool, optional, default=True
        If True normalize the resulting katz centrality values

    Returns
    -------
    katz_centrality : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding katz centrality values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['katz_centrality'] : dask_cudf.Series
            Contains the katz centrality of vertices

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> pr = dcg.katz_centrality(dg)
    >>> Comms.destroy()
    """

    if nstart is not None:
        # The initial guess is not wired through yet; tell the caller rather
        # than silently dropping their input.
        import warnings
        warnings.warn("nstart is currently ignored by the multi-GPU "
                      "katz_centrality implementation",
                      stacklevel=2)
        nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    # shuffle() also returns the partition row/column sizes; they are not
    # needed by this algorithm.
    (ddf, num_verts, _, _,
     vertex_partition_offsets) = shuffle(input_graph, transposed=True)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # One task per worker, pinned to the worker that owns the partitions.
    result = [
        client.submit(call_katz_centrality,
                      Comms.get_session_id(),
                      parts,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      alpha,
                      beta,
                      max_iter,
                      tol,
                      nstart,
                      normalized,
                      workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)
    if input_graph.renumbered:
        # Map internal vertex ids back to the caller's original ids.
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
예제 #3
0
def sssp(graph, source):
    """
    Compute, on multiple GPUs, the shortest-path distance and predecessor of
    every vertex relative to the given source vertex. The distances column
    holds the distance from the source to each vertex and the predecessors
    column holds each vertex's predecessor on its shortest path. Vertices that
    are unreachable will have a distance of infinity denoted by the maximum
    value of the data type and the predecessor set as -1. The source vertex's
    predecessor is also set to -1.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe.
        Undirected Graph not currently supported.
    source : Integer
        Specify source vertex. When the graph is renumbered the value is
        translated to its internal vertex id before the workers are invoked.

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex

        df['predecessor'] gives the vertex id it was
        reached from in the traversal. For renumbered graphs, null
        predecessors left after un-renumbering are replaced with -1.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.sssp(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    # shuffle() also returns the partition row/column sizes; they are not
    # needed by this algorithm.
    shuffled = shuffle(graph, transposed=False)
    ddf, num_verts, _, _, vertex_partition_offsets = shuffled
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        # Translate the caller-facing vertex id into the internal id space.
        external_source = cudf.Series([source], dtype='int32')
        internal_source = graph.lookup_internal_vertex_id(external_source)
        source = internal_source.compute().iloc[0]

    # Launch one task per worker, pinned to the worker owning the partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(client.submit(call_sssp,
                                     Comms.get_session_id(),
                                     parts,
                                     num_verts,
                                     num_edges,
                                     vertex_partition_offsets,
                                     source,
                                     workers=[worker]))
    wait(futures)
    ddf = dask_cudf.from_delayed(futures)

    if not graph.renumbered:
        return ddf

    # Map both id columns back to the caller's original vertex ids.
    ddf = graph.unrenumber(ddf, 'vertex')
    ddf = graph.unrenumber(ddf, 'predecessor')
    ddf["predecessor"] = ddf["predecessor"].fillna(-1)
    return ddf
예제 #4
0
def louvain(input_graph, max_iter=100, resolution=1.0):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Parameters
    ----------
    input_graph : cugraph graph object
        Graph whose edge list is renumbered, shuffled and distributed across
        the workers. Inputs are assumed to be symmetric DiGraphs (the
        undirected-graph check is currently disabled, see FIXME below).
    max_iter : integer, optional, default=100
        Forwarded unchanged to the per-worker Louvain call (presumably the
        iteration cap - TODO confirm against call_louvain).
    resolution : float, optional, default=1.0
        Forwarded unchanged to the per-worker Louvain call (presumably the
        modularity resolution - TODO confirm against call_louvain).

    Returns
    -------
    (ddf, mod_score) : tuple
        ddf is a dask_cudf.DataFrame assembled from the first element of each
        worker's result, with its 'vertex' column un-renumbered when the
        graph is renumbered. mod_score is the second element of the first
        worker's result.

    Raises
    ------
    NotImplementedError
        If the CUDA version is lower than 10.2.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # MG Louvain currently requires CUDA 10.2 or higher.
    # FIXME: remove this check once RAPIDS drops support for CUDA < 10.2
    cuda_too_old = is_cuda_version_less_than((10, 2))
    if cuda_too_old:
        raise NotImplementedError("Multi-GPU Louvain is not implemented for "
                                  "this version of CUDA. Ensure CUDA version "
                                  "10.2 or higher is installed.")

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")
    client = default_client()

    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    edges_sorted_by_degree = True

    # shuffle() also returns the partition row/column sizes; they are not
    # needed here.
    shuffled = shuffle(input_graph, transposed=False)
    ddf, num_verts, _, _, vertex_partition_offsets = shuffled

    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # Launch one task per worker, pinned to the worker owning the partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(client.submit(call_louvain,
                                     Comms.get_session_id(),
                                     parts,
                                     num_verts,
                                     num_edges,
                                     vertex_partition_offsets,
                                     edges_sorted_by_degree,
                                     max_iter,
                                     resolution,
                                     workers=[worker]))

    wait(futures)

    # Each future holds a (DataFrame, mod_score) tuple; split it apart with
    # follow-up client.submit calls that index into the tuple.
    # FIXME: look into an alternate way (not returning a tuples, accessing
    # tuples differently, etc.) since multiple client.submit() calls may not be
    # optimal.
    df_futures = []
    for fut in futures:
        df_futures.append(client.submit(op.getitem, fut, 0))
    mod_score_futures = []
    for fut in futures:
        mod_score_futures.append(client.submit(op.getitem, fut, 1))

    ddf = dask_cudf.from_delayed(df_futures)
    # Each worker should have computed the same mod_score
    mod_score = mod_score_futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        ddf = input_graph.unrenumber(ddf, "vertex")

    return (ddf, mod_score)
예제 #5
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe(edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float, optional, default=0.85
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe, optional, default=None
        GPU Dataframe containing the personalization information.
        When given, both columns are null-checked, the 'vertex' column is
        mapped to internal vertex ids if the graph is renumbered, and the
        data is distributed so each worker receives its first partition.
        NOTE(review): the data is handed to get_distributed_data, so a
        distributed dataframe may be required - confirm against callers.
        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int, optional, default=100
        The maximum number of iterations before an answer is returned.
        NOTE(review): earlier documentation stated that a value lower or
        equal to 0 makes cuGraph fall back to a default of 30; that
        cannot be confirmed from this function.
    tol : float, optional, default=1.0e-5
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. This function always passes ``None`` to
        the workers, and emits a ``UserWarning`` if a value is supplied.

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    if nstart is not None:
        # The initial guess is not wired through yet; tell the caller rather
        # than silently dropping their input.
        import warnings
        warnings.warn("nstart is currently ignored by the multi-GPU "
                      "pagerank implementation",
                      stacklevel=2)
        nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    # shuffle() also returns the partition row/column sizes; they are not
    # needed by this algorithm.
    (ddf, num_verts, _, _,
     vertex_partition_offsets) = shuffle(input_graph, transposed=True)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # Distributed personalization data, or None when none was supplied.
    p_data = None
    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered:
            # Personalization vertices must live in the same internal id
            # space as the renumbered edge list.
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
        p_data = get_distributed_data(personalization)

    def _personalization_for(worker):
        # Each worker gets the first personalization partition it owns, or
        # None when personalization is not in use.
        if p_data is None:
            return None
        return p_data.worker_to_parts[worker][0]

    # One task per worker, pinned to the worker that owns the partitions.
    result = [
        client.submit(call_pagerank,
                      Comms.get_session_id(),
                      parts,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      alpha,
                      max_iter,
                      tol,
                      _personalization_for(worker),
                      nstart,
                      workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)
    if input_graph.renumbered:
        # Map internal vertex ids back to the caller's original ids.
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
예제 #6
0
파일: louvain.py 프로젝트: wphicks/cugraph
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Parameters
    ----------
    input_graph : cugraph graph object
        Graph whose edge list is renumbered, shuffled and distributed across
        the workers. Inputs are assumed to be symmetric DiGraphs (the
        undirected-graph check is currently disabled, see FIXME below).
    max_iter : integer, optional, default=100
        Forwarded unchanged to the per-worker Louvain call (presumably the
        iteration cap - TODO confirm against call_louvain).
    resolution : float, optional, default=1.0
        Forwarded unchanged to the per-worker Louvain call (presumably the
        modularity resolution - TODO confirm against call_louvain).
    load_balance : bool, optional, default=True
        Accepted for interface compatibility; currently not used by this
        function.

    Returns
    -------
    (parts, modularity_score) : tuple
        The result computed by the rank-0 worker. ``parts`` has its 'vertex'
        column un-renumbered when the graph is renumbered.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: import here to prevent circular import: cugraph->louvain
    # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()
    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    sorted_by_degree = True
    (ddf, num_verts, partition_row_size, partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # One task per worker, keyed by the worker's rank so the rank-0 result
    # can be picked out below.
    result = {
        data.worker_info[worker]["rank"]: client.submit(
            call_louvain,
            Comms.get_session_id(),
            parts,
            num_verts,
            num_edges,
            partition_row_size,
            partition_col_size,
            vertex_partition_offsets,
            sorted_by_degree,
            max_iter,
            resolution,
            workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    }

    wait(result)

    # Only the rank-0 worker's (parts, modularity_score) tuple is returned.
    (parts, modularity_score) = result[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score