Example #1
def test_rearrange(shuffle, get):
    df = pd.DataFrame({'x': np.random.random(10)})
    ddf = dd.from_pandas(df, npartitions=4)
    ddf2 = ddf.assign(y=ddf.x % 4)

    result = rearrange_by_column(ddf2, 'y', max_branch=32, shuffle=shuffle)
    assert result.npartitions == ddf.npartitions
    assert set(ddf.dask).issubset(result.dask)

    # Every value in exactly one partition
    a = result.compute(get=get)
    parts = get(result.dask, result._keys())
    for i in a.y.drop_duplicates():
        assert sum(i in set(part.y) for part in parts) == 1
Example #2
def test_rearrange(shuffle, scheduler):
    df = pd.DataFrame({"x": np.random.random(10)})
    ddf = dd.from_pandas(df, npartitions=4)
    ddf2 = ddf.assign(_partitions=ddf.x % 4)

    result = rearrange_by_column(ddf2, "_partitions", max_branch=32, shuffle=shuffle)
    assert result.npartitions == ddf.npartitions
    assert set(ddf.dask).issubset(result.dask)

    # Every value in exactly one partition
    a = result.compute(scheduler=scheduler)
    get = dask.base.get_scheduler(scheduler=scheduler)
    parts = get(result.dask, result.__dask_keys__())

    for i in a._partitions.drop_duplicates():
        assert sum(i in set(part._partitions) for part in parts) == 1
Example #3
def sort_values(
    df,
    by,
    max_branch=None,
    divisions=None,
    set_divisions=False,
    ignore_index=False,
):
    """ Sort by the given list/tuple of column names.
    """
    npartitions = df.npartitions
    if isinstance(by, tuple):
        by = list(by)
    elif not isinstance(by, list):
        by = [by]

    # Step 1 - Calculate new divisions (if necessary)
    if divisions is None:
        divisions = quantile_divisions(df, by, npartitions)

    # Step 2 - Perform repartitioning shuffle
    meta = df._meta._constructor_sliced([0])
    if not isinstance(divisions, (gd.Series, gd.DataFrame)):
        dtype = df[by[0]].dtype
        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)

    partitions = df[by].map_partitions(
        _set_partitions_pre, divisions=divisions, meta=meta
    )

    df2 = df.assign(_partitions=partitions)
    df3 = rearrange_by_column(
        df2,
        "_partitions",
        max_branch=max_branch,
        npartitions=len(divisions) - 1,
        shuffle="tasks",
        ignore_index=ignore_index,
    ).drop(columns=["_partitions"])
    df3.divisions = (None,) * (df3.npartitions + 1)

    # Step 3 - Return final sorted df
    df4 = df3.map_partitions(M.sort_values, by)
    if not isinstance(divisions, gd.DataFrame) and set_divisions:
        # Can't have multi-column divisions elsewhere in dask (yet)
        df4.divisions = methods.tolist(divisions)
    return df4
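A minimal usage sketch of the sort_values helper above, assuming a dask_cudf/cudf environment and that the module-level helpers it relies on (quantile_divisions, _set_partitions_pre, rearrange_by_column) are importable; the column names and data are purely illustrative.

import cudf
import dask_cudf

# Small illustrative frame, split into two partitions
gdf = cudf.DataFrame({"a": [3, 1, 4, 1, 5, 9, 2, 6], "b": list(range(8))})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# Sort on "a"; set_divisions=True records the computed boundaries on the
# result so downstream operations can rely on the sorted partitioning
sorted_ddf = sort_values(ddf, by="a", set_divisions=True)
print(sorted_ddf.compute())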
Example #4
def test_rearrange_disk_cleanup_with_exception():
    # ensure temporary files are cleaned up when there's an internal exception.

    with mock.patch("dask.dataframe.shuffle.shuffle_group_3", new=mock_shuffle_group_3):
        df = pd.DataFrame({"x": np.random.random(10)})
        ddf = dd.from_pandas(df, npartitions=4)
        ddf2 = ddf.assign(_partitions=ddf.x % 4)

        tmpdir = tempfile.mkdtemp()

        with dask.config.set(temporary_directory=str(tmpdir)):
            with pytest.raises(ValueError, match="Mock exception!"):
                result = rearrange_by_column(
                    ddf2, "_partitions", max_branch=32, shuffle="disk"
                )
                result.compute(scheduler="processes")

    assert len(os.listdir(tmpdir)) == 0
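mock_shuffle_group_3 is not shown in this listing; a minimal hypothetical stand-in, consistent with the pytest.raises(ValueError, match="Mock exception!") expectation above, could look like the following (the signature is assumed to mirror dask.dataframe.shuffle.shuffle_group_3):

def mock_shuffle_group_3(df, col, npartitions, p):
    # Fail on purpose so the disk shuffle exercises its cleanup path
    raise ValueError("Mock exception!")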
Example #5
def shuffle(dg, transposed=False, prows=None, pcols=None):
    """
    Shuffles the renumbered input distributed graph edgelist into ngpu
    partitions. The number of processes/gpus P = prows*pcols. The 2D
    partitioning divides the matrix into P*pcols rectangular partitions
    as per vertex partitioning performed in renumbering, and then shuffles
    these partitions into P gpus.
    """

    ddf = dg.edgelist.edgelist_df
    ngpus = get_n_workers()
    if prows is None and pcols is None:
        prows, pcols = get_2D_div(ngpus)
    else:
        if prows is not None and pcols is not None:
            if ngpus != prows * pcols:
                raise Exception(
                    'prows*pcols should be equal to the number of processes')
        elif prows is not None:
            if ngpus % prows != 0:
                raise Exception(
                    'prows must be a factor of the number of processes')
            pcols = int(ngpus / prows)
        elif pcols is not None:
            if ngpus % pcols != 0:
                raise Exception(
                    'pcols must be a factor of the number of processes')
            prows = int(ngpus / pcols)

    renumber_vertex_count = (
        dg.renumber_map.implementation.ddf.map_partitions(len).compute()
    )
    renumber_vertex_cumsum = renumber_vertex_count.cumsum()
    src_dtype = ddf['src'].dtype
    dst_dtype = ddf['dst'].dtype

    vertex_row_partitions = cudf.Series([0], dtype=src_dtype)
    vertex_row_partitions = vertex_row_partitions.append(cudf.Series(
        renumber_vertex_cumsum, dtype=src_dtype))
    num_verts = vertex_row_partitions.iloc[-1]
    vertex_col_partitions = []
    for i in range(pcols + 1):
        vertex_col_partitions.append(vertex_row_partitions.iloc[i*prows])
    vertex_col_partitions = cudf.Series(vertex_col_partitions, dtype=dst_dtype)

    meta = ddf._meta._constructor_sliced([0])
    partitions = ddf.map_partitions(
        _set_partitions_pre,
        vertex_row_partitions=vertex_row_partitions,
        vertex_col_partitions=vertex_col_partitions, prows=prows,
        pcols=pcols, transposed=transposed, meta=meta)
    ddf2 = ddf.assign(_partitions=partitions)
    ddf3 = rearrange_by_column(
        ddf2,
        "_partitions",
        max_branch=None,
        npartitions=ngpus,
        shuffle="tasks",
        ignore_index=True,
    ).drop(columns=["_partitions"])

    return ddf3, num_verts, vertex_row_partitions
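As a standalone illustration of the docstring's P = prows*pcols layout and of how vertex_col_partitions is derived above, consider 8 workers arranged as a 2 x 4 grid (all numbers are made up for the example):

# Hypothetical grid: 8 workers as 2 rows x 4 columns
ngpus, prows, pcols = 8, 2, 4
assert prows * pcols == ngpus  # the same consistency check shuffle() enforces

# Pretend each of the 8 row partitions holds 100 vertices
vertex_row_partitions = [100 * i for i in range(ngpus + 1)]  # [0, 100, ..., 800]

# Column boundaries are every prows-th row boundary, as in the loop above
vertex_col_partitions = [vertex_row_partitions[i * prows] for i in range(pcols + 1)]
print(vertex_col_partitions)  # [0, 200, 400, 600, 800]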
Example #6
def rearrange_by_hash(df,
                      columns,
                      npartitions,
                      max_branch=None,
                      ignore_index=True):
    if npartitions and npartitions != df.npartitions:
        # Use main-line dask for new npartitions
        meta = df._meta._constructor_sliced([0])
        partitions = df[columns].map_partitions(set_partitions_hash,
                                                columns,
                                                npartitions,
                                                meta=meta)
        # Note: Dask will use a shallow copy for assign
        df2 = df.assign(_partitions=partitions)
        return rearrange_by_column(
            df2,
            "_partitions",
            shuffle="tasks",
            max_branch=max_branch,
            npartitions=npartitions,
            ignore_index=ignore_index,
        )

    n = df.npartitions
    if max_branch is False:
        stages = 1
    else:
        max_branch = max_branch or 32
        stages = int(math.ceil(math.log(n) / math.log(max_branch)))

    if stages > 1:
        k = int(math.ceil(n**(1 / stages)))
    else:
        k = n

    if isinstance(columns, str):
        columns = [columns]
    elif isinstance(columns, tuple):
        columns = list(columns)

    groups = []
    splits = []
    combines = []

    inputs = [
        tuple(digit(i, j, k) for j in range(stages)) for i in range(k**stages)
    ]

    token = tokenize(df, columns, max_branch)

    start = {("shuffle-combine-" + token, 0, inp):
             (df._name, i) if i < df.npartitions else df._meta
             for i, inp in enumerate(inputs)}

    for stage in range(1, stages + 1):
        group = {  # Convert partition into dict of dataframe pieces
            ("shuffle-group-" + token, stage, inp): (
                _shuffle_group,
                ("shuffle-combine-" + token, stage - 1, inp),
                columns,
                stage - 1,
                k,
                n,
                ignore_index,
            )
            for inp in inputs
        }

        split = {  # Get out each individual dataframe piece from the dicts
            ("shuffle-split-" + token, stage, i, inp): (
                getitem,
                ("shuffle-group-" + token, stage, inp),
                i,
            )
            for i in range(k) for inp in inputs
        }

        combine = {  # concatenate those pieces together, with their friends
            ("shuffle-combine-" + token, stage, inp): (
                _concat,
                [(
                    "shuffle-split-" + token,
                    stage,
                    inp[stage - 1],
                    insert(inp, stage - 1, j),
                ) for j in range(k)],
                ignore_index,
            )
            for inp in inputs
        }
        groups.append(group)
        splits.append(split)
        combines.append(combine)

    end = {("shuffle-" + token, i): ("shuffle-combine-" + token, stages, inp)
           for i, inp in enumerate(inputs)}

    dsk = toolz.merge(start, end, *(groups + splits + combines))
    graph = HighLevelGraph.from_collections("shuffle-" + token,
                                            dsk,
                                            dependencies=[df])
    df2 = df.__class__(graph, "shuffle-" + token, df, df.divisions)
    df2.divisions = (None, ) * (df.npartitions + 1)

    return df2
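A worked example of the staging arithmetic used above, with illustrative numbers only: for n = 1000 input partitions and max_branch = 32, the shuffle needs ceil(log(1000)/log(32)) = 2 stages, each with branching factor k = ceil(1000**(1/2)) = 32.

import math

n, max_branch = 1000, 32  # illustrative; rearrange_by_hash takes these from the frame
stages = int(math.ceil(math.log(n) / math.log(max_branch)))  # 2
k = int(math.ceil(n ** (1 / stages))) if stages > 1 else n   # 32
print(stages, k)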
Example #7
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain the edge list as a dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as a dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph is not currently supported.

    alpha : float, optional (default=0.85)
        The damping factor alpha represents the probability of following an
        outgoing edge; the standard value is 0.85.
        Thus, 1.0-alpha is the probability to “teleport” to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.

    personalization : cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the personalization information.
        Currently not supported.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
        If this value is lower than or equal to 0, cuGraph will use the
        default value, which is 30.

    tol : float, optional (default=1.0e-5)
        Set the tolerance of the approximation; this parameter should be a
        small magnitude value.
        The lower the tolerance, the better the approximation. If this value
        is 0.0f, cuGraph will use the default value, which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.

    nstart : not supported
        initial guess for pagerank

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # pr = dcg.pagerank(dg)

    """
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)

    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    if personalization is not None:
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")

        # Function to assign partition id to personalization dataframe
        def _set_partitions_pre(s, divisions):
            # Each vertex id maps to the partition whose offset range contains it
            partitions = divisions.searchsorted(s, side="right") - 1
            # Ids equal to the last offset are folded into the final partition
            # instead of spilling past the end
            partitions[divisions.tail(1).searchsorted(
                s, side="right").astype("bool")] = (len(divisions) - 2)
            return partitions

        # Assign partition id column as per vertex_partition_offsets
        df = personalization
        by = ['vertex']
        meta = df._meta._constructor_sliced([0])
        divisions = vertex_partition_offsets
        partitions = df[by].map_partitions(_set_partitions_pre,
                                           divisions=divisions,
                                           meta=meta)

        df2 = df.assign(_partitions=partitions)

        # Shuffle personalization values according to the partition id
        df3 = rearrange_by_column(
            df2,
            "_partitions",
            max_branch=None,
            npartitions=len(divisions) - 1,
            shuffle="tasks",
            ignore_index=False,
        ).drop(columns=["_partitions"])

        p_data = get_distributed_data(df3)

        result = [
            client.submit(call_pagerank,
                          Comms.get_session_id(),
                          wf[1],
                          src_col_name,
                          dst_col_name,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          input_graph.aggregate_segment_offsets,
                          alpha,
                          max_iter,
                          tol,
                          p_data.worker_to_parts[wf[0]][0],
                          nstart,
                          workers=[wf[0]])
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
    else:
        result = [
            client.submit(call_pagerank,
                          Comms.get_session_id(),
                          wf[1],
                          src_col_name,
                          dst_col_name,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          input_graph.aggregate_segment_offsets,
                          alpha,
                          max_iter,
                          tol,
                          personalization,
                          nstart,
                          workers=[wf[0]])
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
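The inner _set_partitions_pre above can be checked in isolation. Below is a small sketch of the same rule re-done with pandas for illustration (the offsets and vertex ids are made up); it shows that an id equal to the last offset is folded into the final partition rather than falling out of range.

import pandas as pd

# Offsets [0, 4, 8, 12] describe three partitions
divisions = pd.Series([0, 4, 8, 12])
s = pd.Series([0, 3, 4, 11, 12])

partitions = divisions.searchsorted(s, side="right") - 1
partitions[divisions.tail(1).searchsorted(
    s, side="right").astype("bool")] = len(divisions) - 2
print(partitions)  # [0 0 1 2 2]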
Example #8
def shuffle(dg, transposed=False):
    """
    Shuffles the renumbered input distributed graph edgelist into ngpu
    partitions. The number of processes/gpus P = prows*pcols. The 2D
    partitioning divides the matrix into P*pcols rectangular partitions
    as per vertex partitioning performed in renumbering, and then shuffles
    these partitions into P gpus.

    Parameters
    ----------
    transposed : bool, optional (default=False)
    """

    ddf = dg.edgelist.edgelist_df
    ngpus = Comms.get_n_workers()
    prows, pcols, partition_type = Comms.get_2D_partition()

    renumber_vertex_count = (
        dg.renumber_map.implementation.ddf.map_partitions(len).compute()
    )
    renumber_vertex_cumsum = renumber_vertex_count.cumsum()

    if transposed:
        row_dtype = ddf['dst'].dtype
        col_dtype = ddf['src'].dtype
    else:
        row_dtype = ddf['src'].dtype
        col_dtype = ddf['dst'].dtype

    vertex_partition_offsets = cudf.Series([0], dtype=row_dtype)
    vertex_partition_offsets = vertex_partition_offsets.append(
        cudf.Series(renumber_vertex_cumsum, dtype=row_dtype))
    num_verts = vertex_partition_offsets.iloc[-1]
    if partition_type == 1:
        vertex_row_partitions = []
        for i in range(prows + 1):
            vertex_row_partitions.append(vertex_partition_offsets.iloc[i *
                                                                       pcols])
        vertex_row_partitions = cudf.Series(vertex_row_partitions,
                                            dtype=row_dtype)
    else:
        vertex_row_partitions = vertex_partition_offsets
    vertex_col_partitions = []
    for i in range(pcols + 1):
        vertex_col_partitions.append(vertex_partition_offsets.iloc[i * prows])
    vertex_col_partitions = cudf.Series(vertex_col_partitions, dtype=col_dtype)

    meta = ddf._meta._constructor_sliced([0])
    partitions = ddf.map_partitions(
        _set_partitions_pre,
        vertex_row_partitions=vertex_row_partitions,
        vertex_col_partitions=vertex_col_partitions,
        prows=prows,
        pcols=pcols,
        transposed=transposed,
        partition_type=partition_type,
        meta=meta)
    ddf2 = ddf.assign(_partitions=partitions)
    ddf3 = rearrange_by_column(
        ddf2,
        "_partitions",
        max_branch=None,
        npartitions=ngpus,
        shuffle="tasks",
        ignore_index=True,
    ).drop(columns=["_partitions"])

    partition_row_size = pcols
    partition_col_size = prows

    return (ddf3, num_verts, partition_row_size, partition_col_size,
            vertex_partition_offsets)
Example #9
def sort_values(
    df,
    by,
    max_branch=None,
    divisions=None,
    set_divisions=False,
    ignore_index=False,
    ascending=True,
    na_position="last",
    sort_function=None,
    sort_function_kwargs=None,
):
    """Sort by the given list/tuple of column names."""
    if not isinstance(ascending, bool):
        raise ValueError("ascending must be either True or False")
    if na_position not in ("first", "last"):
        raise ValueError("na_position must be either 'first' or 'last'")

    npartitions = df.npartitions
    if isinstance(by, tuple):
        by = list(by)
    elif not isinstance(by, list):
        by = [by]

    # parse custom sort function / kwargs if provided
    sort_kwargs = {
        "by": by,
        "ascending": ascending,
        "na_position": na_position,
    }
    if sort_function is None:
        sort_function = M.sort_values
    if sort_function_kwargs is not None:
        sort_kwargs.update(sort_function_kwargs)

    # handle single partition case
    if npartitions == 1:
        return df.map_partitions(sort_function, **sort_kwargs)

    # Step 1 - Calculate new divisions (if necessary)
    if divisions is None:
        divisions = quantile_divisions(df, by, npartitions)

    # Step 2 - Perform repartitioning shuffle
    meta = df._meta._constructor_sliced([0])
    if not isinstance(divisions, (gd.Series, gd.DataFrame)):
        dtype = df[by[0]].dtype
        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)

    partitions = df[by].map_partitions(
        _set_partitions_pre,
        divisions=divisions,
        ascending=ascending,
        na_position=na_position,
        meta=meta,
    )

    df2 = df.assign(_partitions=partitions)
    df3 = rearrange_by_column(
        df2,
        "_partitions",
        max_branch=max_branch,
        npartitions=len(divisions) - 1,
        shuffle="tasks",
        ignore_index=ignore_index,
    ).drop(columns=["_partitions"])
    df3.divisions = (None, ) * (df3.npartitions + 1)

    # Step 3 - Return final sorted df
    df4 = df3.map_partitions(sort_function, **sort_kwargs)
    if not isinstance(divisions, gd.DataFrame) and set_divisions:
        # Can't have multi-column divisions elsewhere in dask (yet)
        df4.divisions = methods.tolist(divisions)

    return df4
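A hedged usage sketch of the sort_function / sort_function_kwargs hook introduced in this version, again assuming a dask_cudf environment and the module-level helpers referenced above; the data and keyword arguments are illustrative only.

import cudf
import dask_cudf

ddf = dask_cudf.from_cudf(
    cudf.DataFrame({"key": [2, 4, 1, 3], "val": [10, 20, 30, 40]}),
    npartitions=2,
)

# Default path: per-partition cudf DataFrame.sort_values, descending
out = sort_values(ddf, by=["key"], ascending=False)

# Custom path: any callable taking the partition as its first argument works;
# extra kwargs are merged into by/ascending/na_position before the call
out2 = sort_values(
    ddf,
    by=["key"],
    sort_function=lambda part, **kw: part.sort_values(**kw),
    sort_function_kwargs={"ignore_index": True},
)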