Exemplo n.º 1
0
def overlap(input_graph, vertex_pair=None):
    """
    Compute the Overlap Coefficient between each pair of vertices connected by
    an edge, or between arbitrary pairs of vertices specified by the user.
    Overlap Coefficient is defined between two sets as the ratio of the volume
    of their intersection divided by the smaller of their two volumes. In the
    context of graphs, the neighborhood of a vertex is seen as a set. The
    Overlap Coefficient weight of each edge represents the strength of
    connection between vertices based on the relative similarity of their
    neighbors. If first is specified but second is not, or vice versa, an
    exception will be thrown.

    Parameters
    ----------
    graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        adjacency list will be computed if not already present.
    vertex_pair : cudf.DataFrame
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the overlap coefficient is computed for the
        given vertex pairs, else, it is computed for all vertex pairs.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        (first, second) containing the Overlap coefficients. The ordering is
        relative to the adjacency list, or that given by the specified vertex
        pairs.

        df['source'] : cudf.Series
            The source vertex ID (will be identical to first if specified).
        df['destination'] : cudf.Series
            The destination vertex ID (will be identical to second if
            specified).
        df['overlap_coeff'] : cudf.Series
            The computed Overlap coefficient between the source and destination
            vertices.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.overlap(G)
    """

    if (type(vertex_pair) == cudf.DataFrame):
        null_check(vertex_pair[vertex_pair.columns[0]])
        null_check(vertex_pair[vertex_pair.columns[1]])
    elif vertex_pair is None:
        pass
    else:
        raise ValueError("vertex_pair must be a cudf dataframe")

    df = overlap_wrapper.overlap(input_graph, None, vertex_pair)

    return df
Exemplo n.º 2
0
def subgraph(G, vertices):
    """
    Compute a subgraph of the existing graph including only the specified
    vertices.  This algorithm works for both directed and undirected graphs,
    it does not actually traverse the edges, simply pulls out any edges that
    are incident on vertices that are both contained in the vertices list.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor
    vertices : cudf.Series
        Specifies the vertices of the induced subgraph

    Returns
    -------
    Sg : cugraph.Graph
        A graph object containing the subgraph induced by the given vertex set.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv',
                          delimiter = ' ',
                          dtype=['int32', 'int32', 'float32'],
                          header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> verts = numpy.zeros(3, dtype=numpy.int32)
    >>> verts[0] = 0
    >>> verts[1] = 1
    >>> verts[2] = 2
    >>> sverts = cudf.Series(verts)
    >>> Sg = cugraph.subgraph(G, sverts)
    """

    null_check(vertices)

    if G.renumbered:
        vertices = G.lookup_internal_vertex_id(vertices)

    result_graph = type(G)()

    df = subgraph_extraction_wrapper.subgraph(G, vertices)

    if G.renumbered:
        df = G.unrenumber(df, "src")
        df = G.unrenumber(df, "dst")

    if G.edgelist.weights:
        result_graph.from_cudf_edgelist(df,
                                        source="src",
                                        destination="dst",
                                        edge_attr="weight")
    else:
        result_graph.from_cudf_edgelist(df, source="src", destination="dst")

    return result_graph
Exemplo n.º 3
0
def renumber(source_col, dest_col):
    """
    Take a (potentially sparse) set of source and destination vertex ids and
    renumber the vertices to create a dense set of vertex ids using all values
    contiguously from 0 to the number of unique vertices - 1.

    Input columns can be either int64 or int32.  The output will be mapped to
    int32, since many of the cugraph functions are limited to int32. If the
    number of unique values in source_col and dest_col > 2^31-1 then this
    function will return an error.

    Return from this call will be three cudf Series - the renumbered
    source_col, the renumbered dest_col and a numbering map that maps the new
    ids to the original ids.

    Parameters
    ----------
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    numbering_map : cudf.Series
        This cudf.Series wraps a gdf column of size V (V: number of vertices).
        The gdf column contains a numbering map that maps the new ids to the
        original ids.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> source_col, dest_col, numbering_map = cugraph.renumber(sources,
    >>>                                                        destinations)
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(source_col, dest_col, None)
    """
    csg.null_check(source_col)
    csg.null_check(dest_col)

    (source_col, dest_col,
     numbering_map) = graph_new_wrapper.renumber(source_col, dest_col)

    return source_col, dest_col, numbering_map
Exemplo n.º 4
0
def subgraph(G, vertices):
    """
    Compute a subgraph of the existing graph including only the specified
    vertices.  This algorithm works for both directed and undirected graphs,
    it does not actually traverse the edges, simply pulls out any edges that
    are incident on vertices that are both contained in the vertices list.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor
    vertices : cudf.Series
        Specifies the vertices of the induced subgraph

    Returns
    -------
    Sg : cugraph.Graph
        A graph object containing the subgraph induced by the given vertex set.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(sources, destinations, None)
    >>> verts = numpy.zeros(3, dtype=numpy.int32)
    >>> verts[0] = 0
    >>> verts[1] = 1
    >>> verts[2] = 2
    >>> sverts = cudf.Series(verts)
    >>> Sg = cugraph.subgraph(G, sverts)
    """

    null_check(vertices)

    result_graph = Graph()

    subgraph_extraction_wrapper.subgraph(G.graph_ptr, vertices,
                                         result_graph.graph_ptr)

    return result_graph
Exemplo n.º 5
0
def symmetrize(source_col, dest_col, value_col=None):
    """
    Take a COO set of source destination pairs along with associated values and
    create a new COO set of source destination pairs along with values where
    all edges exist in both directions.

    Return from this call will be a COO stored as two cudf Series - the
    symmetrized source column and the symmetrized dest column, along with
    an optional cudf Series containing the associated values (only if the
    values are passed in).

    Parameters
    ----------
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series (optional)
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        For this function the values can be any type, they are not
        examined, just copied.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> values = cudf.Series(M['2'])
    >>> src, dst, val = cugraph.symmetrize(sources, destinations, values)
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(src, dst, val)
    """
    csg.null_check(source_col)
    csg.null_check(dest_col)

    input_df = cudf.DataFrame({"source": source_col, "destination": dest_col})

    if value_col is not None:
        csg.null_check(value_col)
        input_df.insert(len(input_df.columns), "value", value_col)

    output_df = symmetrize_df(input_df, "source", "destination")

    if value_col is not None:
        return (
            output_df["source"],
            output_df["destination"],
            output_df["value"],
        )

    return output_df["source"], output_df["destination"]
Exemplo n.º 6
0
def traveling_salesperson(
    pos_list,
    restarts=100000,
    beam_search=True,
    k=4,
    nstart=None,
    verbose=False,
):
    """
    Finds an approximate solution to the traveling salesperson problem (TSP).
    cuGraph computes an approximation of the TSP problem using hill climbing
    optimization.

    The current implementation does not support a weighted graph.

    Parameters
    ----------
    pos_list: cudf.DataFrame
        Data frame with initial vertex positions containing three columns:
        'vertex' ids and 'x', 'y' positions.
    restarts: int
        Number of starts to try. The more restarts, the better the solution
        will be approximated. The number of restarts depends on the problem
        size and should be kept low for instances above 2k cities.
    beam_search: bool
        Specify if the initial solution should use KNN for an approximation
        solution.
    k: int
        Beam width to use in the search.
    nstart: int
        Vertex id to use as starting position.
    verbose: bool
        Logs configuration and iterative improvement.

    Returns
    -------
    route : cudf.Series
        cudf.Series of size V containing the ordered list of vertices
        than needs to be visited.
    """

    if not isinstance(pos_list, cudf.DataFrame):
        raise TypeError("Instance should be cudf.DataFrame")

    null_check(pos_list['vertex'])
    null_check(pos_list['x'])
    null_check(pos_list['y'])

    if nstart is not None and not pos_list[pos_list['vertex'] == nstart].index:
        raise ValueError("nstart should be in vertex ids")

    route, cost = traveling_salesperson_wrapper.traveling_salesperson(
        pos_list, restarts, beam_search, k, nstart, verbose)
    return route, cost
Exemplo n.º 7
0
def symmetrize(source_col, dest_col, value_col=None, multi=False,
               symmetrize=True):
    """
    Take a COO set of source destination pairs along with associated values
    stored in a single GPU or distributed
    create a new COO set of source destination pairs along with values where
    all edges exist in both directions.

    Return from this call will be a COO stored as two cudf Series or
    dask_cudf.Series -the symmetrized source column and the symmetrized dest
    column, along with
    an optional cudf Series containing the associated values (only if the
    values are passed in).

    Parameters
    ----------
    source_col : cudf.Series or dask_cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series or dask_cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series or dask_cudf.Series (optional)
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        For this function the values can be any type, they are not
        examined, just copied.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> values = cudf.Series(M['2'])
    >>> src, dst, val = cugraph.symmetrize(sources, destinations, values)
    """

    input_df = None
    weight_name = None
    if type(source_col) is dask_cudf.Series:
        # FIXME convoluted way of just wrapping dask cudf Series in a ddf
        input_df = source_col.to_frame()
        input_df = input_df.rename(columns={source_col.name: "source"})
        input_df["destination"] = dest_col
    else:
        input_df = cudf.DataFrame(
            {"source": source_col, "destination": dest_col}
        )
        csg.null_check(source_col)
        csg.null_check(dest_col)
    if value_col is not None:
        weight_name = "value"
        input_df.insert(len(input_df.columns), "value", value_col)
    output_df = None
    if type(source_col) is dask_cudf.Series:
        output_df = symmetrize_ddf(
            input_df, "source", "destination", weight_name
        ).persist()
    else:
        output_df = symmetrize_df(input_df, "source", "destination", multi,
                                  symmetrize)

    if value_col is not None:
        return (
            output_df["source"],
            output_df["destination"],
            output_df["value"],
        )
    return output_df["source"], output_df["destination"]
Exemplo n.º 8
0
def jaccard(input_graph, vertex_pair=None):
    """
    Compute the Jaccard similarity between each pair of vertices connected by
    an edge, or between arbitrary pairs of vertices specified by the user.
    Jaccard similarity is defined between two sets as the ratio of the volume
    of their intersection divided by the volume of their union. In the context
    of graphs, the neighborhood of a vertex is seen as a set. The Jaccard
    similarity weight of each edge represents the strength of connection
    between vertices based on the relative similarity of their neighbors. If
    first is specified but second is not, or vice versa, an exception will be
    thrown.

    NOTE: If the vertex_pair parameter is not specified then the behavior
    of cugraph.jaccard is different from the behavior of
    networkx.jaccard_coefficient.

    cugraph.jaccard, in the absence of a specified vertex pair list, will
    use the edges of the graph to construct a vertex pair list and will
    return the jaccard coefficient for those vertex pairs.

    networkx.jaccard_coefficient, in the absence of a specified vertex
    pair list, will return an upper triangular dense matrix, excluding
    the diagonal as well as vertex pairs that are directly connected
    by an edge in the graph, of jaccard coefficients.  Technically, networkx
    returns a lazy iterator across this upper triangular matrix where
    the actual jaccard coefficient is computed when the iterator is
    dereferenced.  Computing a dense matrix of results is not feasible
    if the number of vertices in the graph is large (100,000 vertices
    would result in 4.9 billion values in that iterator).

    If your graph is small enough (or you have enough memory and patience)
    you can get the interesting (non-zero) values that are part of the networkx
    solution by doing the following:

    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> pairs = cugraph.get_two_hop_neighbors(G)
    >>> df = cugraph.jaccard(G, pairs)

    But please remember that cugraph will fill the dataframe with the entire
    solution you request, so you'll need enough memory to store the 2-hop
    neighborhood dataframe.


    Parameters
    ----------
    graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        graph should be undirected where an undirected edge is represented by a
        directed edge in both direction. The adjacency list will be computed if
        not already present.
    vertex_pair : cudf.DataFrame
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the jaccard coefficient is computed for the
        given vertex pairs.  If the vertex_pair is not provided then the
        current implementation computes the jaccard coefficient for all
        adjacent vertices in the graph.

    Returns
    -------
    df  : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        (first, second) containing the Jaccard weights. The ordering is
        relative to the adjacency list, or that given by the specified vertex
        pairs.

        df['source'] : cudf.Series
            The source vertex ID (will be identical to first if specified)
        df['destination'] : cudf.Series
            The destination vertex ID (will be identical to second if
            specified)
        df['jaccard_coeff'] : cudf.Series
            The computed Jaccard coefficient between the source and destination
            vertices

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.jaccard(G)
    """
    if type(input_graph) is not Graph:
        raise Exception("input graph must be undirected")

    # FIXME: Add support for multi-column vertices
    if type(vertex_pair) == cudf.DataFrame:
        for col in vertex_pair.columns:
            null_check(vertex_pair[col])
            if input_graph.renumbered:
                vertex_pair = input_graph.add_internal_vertex_id(
                    vertex_pair, col, col)

    elif vertex_pair is None:
        pass
    else:
        raise ValueError("vertex_pair must be a cudf dataframe")

    df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair)

    if input_graph.renumbered:
        df = input_graph.unrenumber(df, "source")
        df = input_graph.unrenumber(df, "destination")

    return df
Exemplo n.º 9
0
def jaccard(input_graph, first=None, second=None):
    """
    Compute the Jaccard similarity between each pair of vertices connected by
    an edge, or between arbitrary pairs of vertices specified by the user.
    Jaccard similarity is defined between two sets as the ratio of the volume
    of their intersection divided by the volume of their union. In the context
    of graphs, the neighborhood of a vertex is seen as a set. The Jaccard
    similarity weight of each edge represents the strength of connection
    between vertices based on the relative similarity of their neighbors. If
    first is specified but second is not, or vice versa, an exception will be
    thrown.

    Parameters
    ----------
    graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        graph should be undirected where an undirected edge is represented by a
        directed edge in both direction. The adjacency list will be computed if
        not already present.
    first : cudf.Series
        Specifies the first vertices of each pair of vertices to compute for,
        must be specified along with second.
    second : cudf.Series
        Specifies the second vertices of each pair of vertices to compute for,
        must be specified along with first.

    Returns
    -------
    df  : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        (first, second) containing the Jaccard weights. The ordering is
        relative to the adjacency list, or that given by the specified vertex
        pairs.

        df['source'] : cudf.Series
            The source vertex ID (will be identical to first if specified)
        df['destination'] : cudf.Series
            The destination vertex ID (will be identical to second if
            specified)
        df['jaccard_coeff'] : cudf.Series
            The computed Jaccard coefficient between the source and destination
            vertices

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(sources, destinations, None)
    >>> df = cugraph.jaccard(G)
    """

    if (type(first) == cudf.Series and type(second) == cudf.Series):
        null_check(first)
        null_check(second)
    elif first is None and second is None:
        pass
    else:
        raise ValueError("Specify first and second or neither")

    df = jaccard_wrapper.jaccard(input_graph.graph_ptr, first, second)

    return df
Exemplo n.º 10
0
def force_atlas2(input_graph,
                 max_iter=500,
                 pos_list=None,
                 outbound_attraction_distribution=True,
                 lin_log_mode=False,
                 prevent_overlapping=False,
                 edge_weight_influence=1.0,
                 jitter_tolerance=1.0,
                 barnes_hut_optimize=True,
                 barnes_hut_theta=0.5,
                 scaling_ratio=2.0,
                 strong_gravity_mode=False,
                 gravity=1.0,
                 verbose=False,
                 callback=None):
    """
        ForceAtlas2 is a continuous graph layout algorithm for handy network
        visualization.

        NOTE: Peak memory allocation occurs at 17*V.

        Parameters
        ----------
        input_graph : cugraph.Graph
            cuGraph graph descriptor with connectivity information.
            Edge weights, if present, should be single or double precision
            floating point values.
        max_iter : integer
            This controls the maximum number of levels/iterations of the Force
            Atlas algorithm. When specified the algorithm will terminate after
            no more than the specified number of iterations.
            No error occurs when the algorithm terminates early in this manner.
            Good short-term quality can be achieved with 50-100 iterations.
            Above 1000 iterations is discouraged.
        pos_list: cudf.DataFrame
            Data frame with initial vertex positions containing two columns:
            'x' and 'y' positions.
        outbound_attraction_distribution: bool
            Distributes attraction along outbound edges.
            Hubs attract less and thus are pushed to the borders.
        lin_log_mode: bool
            Switch Force Atlas model from lin-lin to lin-log.
            Makes clusters more tight.
        prevent_overlapping: bool
            Prevent nodes to overlap.
        edge_weight_influence: float
            How much influence you give to the edges weight.
            0 is “no influence” and 1 is “normal”.
        jitter_tolerance: float
            How much swinging you allow. Above 1 discouraged.
            Lower gives less speed and more precision.
        barnes_hut_theta: float
            Float between 0 and 1. Tradeoff for speed (1) vs
            accuracy (0) for Barnes Hut only.
        scaling_ratio: float
            How much repulsion you want. More makes a more sparse graph.
            Switching from regular mode to LinLog mode needs a readjustment
            of the scaling parameter.
        gravity : float
            Attracts nodes to the center. Prevents islands from drifting away.
        verbose: bool
            Output convergence info at each interation.
        callback: GraphBasedDimRedCallback
            An instance of GraphBasedDimRedCallback class to intercept
            the internal state of positions while they are being trained.
            Example of callback usage:
                from cugraph.layout import GraphBasedDimRedCallback
                    class CustomCallback(GraphBasedDimRedCallback):
                        def on_preprocess_end(self, positions):
                            print(positions.copy_to_host())
                        def on_train_end(self, positions):
                            print(positions.copy_to_host())
                        def on_train_end(self, positions):
                            print(positions.copy_to_host())

        Returns
        -------
        pos : cudf.DataFrame
            GPU data frame of size V containing three columns:
            the vertex identifiers and the x and y positions.
    """

    if pos_list is not None:
        null_check(pos_list['vertex'])
        null_check(pos_list['x'])
        null_check(pos_list['y'])

    if prevent_overlapping:
        raise Exception("Feature not supported")

    if input_graph.is_directed():
        input_graph = input_graph.to_undirected()

    pos = force_atlas2_wrapper.force_atlas2(
        input_graph,
        max_iter=max_iter,
        pos_list=pos_list,
        outbound_attraction_distribution=outbound_attraction_distribution,
        lin_log_mode=lin_log_mode,
        prevent_overlapping=prevent_overlapping,
        edge_weight_influence=edge_weight_influence,
        jitter_tolerance=jitter_tolerance,
        barnes_hut_optimize=barnes_hut_optimize,
        barnes_hut_theta=barnes_hut_theta,
        scaling_ratio=scaling_ratio,
        strong_gravity_mode=strong_gravity_mode,
        gravity=gravity,
        verbose=verbose,
        callback=callback)
    return pos
Exemplo n.º 11
0
def pagerank(G,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank score for every vertex in a graph. cuGraph computes an
    approximation of the Pagerank eigenvector using the power method. The
    number of iterations depends on the properties of the network itself; it
    increases when the tolerance descreases and/or alpha increases toward the
    limiting value of 1. The user is free to use default values or to provide
    inputs for the initial guess, tolerance and maximum number of iterations.

    Parameters
    ----------
    graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm).
        The transposed adjacency list will be computed if not already present.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to “teleport” to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices

    max_iter : int
        The maximum number of iterations before an answer is returned. This can
        be used to limit the execution time and do an early exit before the
        solver reaches the convergence tolerance.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 100.
    tolerance : float
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : cudf.Dataframe
        GPU Dataframe containing the initial guess for pagerank.

        nstart['vertex'] : cudf.Series
            Subset of vertices of graph for initial guess for pagerank values
        nstart['values'] : cudf.Series
            Pagerank values for vertices

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score


    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> pr = cugraph.pagerank(G, alpha = 0.85, max_iter = 500, tol = 1.0e-05)
    """

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if G.renumbered is True:
            personalization = G.add_internal_vertex_id(personalization,
                                                       "vertex", "vertex")

    if nstart is not None:
        if G.renumbered is True:
            nstart = G.add_internal_vertex_id(nstart, "vertex", "vertex")

    df = pagerank_wrapper.pagerank(G, alpha, personalization, max_iter, tol,
                                   nstart)

    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    return df
Exemplo n.º 12
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None,
             load_balance=True):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as  dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe(edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to “teleport” to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
    tolerance : float
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        initial guess for pagerank
    load_balance : bool
        Set as True to perform load_balancing after global sorting of
        dask-cudf DataFrame. This ensures that the data is uniformly
        distributed among multiple GPUs to avoid over-loading.

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    nstart = None

    client = default_client()

    if (input_graph.local_data is not None
            and input_graph.local_data['by'] == 'dst'):
        data = input_graph.local_data['data']
    else:
        data = get_local_data(input_graph, by='dst', load_balance=load_balance)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex").compute()

    result = dict([(data.worker_info[wf[0]]["rank"],
                    client.submit(call_pagerank,
                                  Comms.get_session_id(),
                                  wf[1],
                                  data.local_data,
                                  alpha,
                                  max_iter,
                                  tol,
                                  personalization,
                                  nstart,
                                  workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])
    wait(result)

    if input_graph.renumbered:
        return input_graph.unrenumber(result[0].result(), 'vertex').compute()

    return result[0].result()
Exemplo n.º 13
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as  dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe(edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to “teleport” to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.
        Currently not supported.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
    tolerance : float
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        initial guess for pagerank

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> ... Init a DASK Cluster
    >>    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    """
    from cugraph.structure.graph import null_check

    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)

    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
        p_data = get_distributed_data(personalization)

        result = [
            client.submit(call_pagerank,
                          Comms.get_session_id(),
                          wf[1],
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          alpha,
                          max_iter,
                          tol,
                          p_data.worker_to_parts[wf[0]][0],
                          nstart,
                          workers=[wf[0]])
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
    else:
        result = [
            client.submit(call_pagerank,
                          Comms.get_session_id(),
                          wf[1],
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          alpha,
                          max_iter,
                          tol,
                          personalization,
                          nstart,
                          workers=[wf[0]])
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
Exemplo n.º 14
0
def jaccard_w(input_graph, weights, vertex_pair=None):
    """
    Compute the weighted Jaccard similarity between each pair of vertices
    connected by an edge, or between arbitrary pairs of vertices specified by
    the user. Jaccard similarity is defined between two sets as the ratio of
    the volume of their intersection divided by the volume of their union. In
    the context of graphs, the neighborhood of a vertex is seen as a set. The
    Jaccard similarity weight of each edge represents the strength of
    connection between vertices based on the relative similarity of their
    neighbors. If first is specified but second is not, or vice versa, an
    exception will be thrown.

    Parameters
    ----------
    graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        adjacency list will be computed if not already present.

    weights : cudf.Series
        Specifies the weights to be used for each vertex.

    vertex_pair : cudf.DataFrame
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the jaccard coefficient is computed for the
        given vertex pairs, else, it is computed for all vertex pairs.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        (first, second) containing the Jaccard weights. The ordering is
        relative to the adjacency list, or that given by the specified vertex
        pairs.

        df['source'] : cudf.Series
            The source vertex ID
        df['destination'] : cudf.Series
            The destination vertex ID
        df['jaccard_coeff'] : cudf.Series
            The computed weighted Jaccard coefficient between the source and
            destination vertices.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> weights = cudf.Series(numpy.ones(
    >>>     max(sources.max(),destinations.max())+1, dtype=numpy.float32))
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(sources, destinations, None)
    >>> df = cugraph.jaccard_w(G, weights)
    """
    if type(input_graph) is not Graph:
        raise Exception("input graph must be undirected")

    if (type(vertex_pair) == cudf.DataFrame):
        null_check(vertex_pair[vertex_pair.columns[0]])
        null_check(vertex_pair[vertex_pair.columns[1]])
    elif vertex_pair is None:
        pass
    else:
        raise ValueError("vertex_pair must be a cudf dataframe")

    df = jaccard_wrapper.jaccard(input_graph, weights, vertex_pair)

    return df