Example #1
0
def test_symmetrize_df(graph_file):
    """Symmetrizing an edge list must preserve every original edge."""
    gc.collect()

    edges = utils.read_csv_file(graph_file + '.csv')
    symmetrized = cugraph.symmetrize_df(edges, '0', '1')

    # Every (src, dst, weight) triple from the input must appear in the output.
    compare(edges['0'], edges['1'], edges['2'],
            symmetrized['0'], symmetrized['1'], symmetrized['2'])
Example #2
0
def test_symmetrize_df(graph_file):
    """Check that symmetrization keeps all edges of the input edge list."""
    gc.collect()

    input_df = utils.read_csv_file(graph_file)
    output_df = cugraph.symmetrize_df(input_df, "0", "1")

    # The symmetrized frame must contain each original (src, dst, weight).
    compare(input_df["0"], input_df["1"], input_df["2"],
            output_df["0"], output_df["1"], output_df["2"])
Example #3
0
def test_symmetrize_bad_weights():
    """Symmetrize an edge list whose opposing edges carry unequal weights.

    Edge (0, 1) has weight 1.0 while edge (1, 0) has weight 2.0, so the
    symmetrized result cannot simply mirror the input.
    """
    edge_data = {
        'src': [0, 0, 0, 0, 1, 2],
        'dst': [1, 2, 3, 4, 0, 3],
        'val': [1.0, 1.0, 1.0, 1.0, 2.0, 1.0],
    }
    host_df = pd.DataFrame(edge_data)

    gdf = cudf.DataFrame.from_pandas(host_df[['src', 'dst', 'val']])
    sym = cugraph.symmetrize_df(gdf, 'src', 'dst')

    compare(gdf['src'], gdf['dst'], gdf['val'],
            sym['src'], sym['dst'], sym['val'])
Example #4
0
def test_symmetrize_df(managed, pool, graph_file):
    """Symmetrize under each RMM allocator configuration (managed/pool)."""
    gc.collect()

    # Re-create the RMM memory pool with the parametrized settings.
    rmm.reinitialize(managed_memory=managed, pool_allocator=pool)
    assert rmm.is_initialized()

    edges = utils.read_csv_file(graph_file + '.csv')
    symmetrized = cugraph.symmetrize_df(edges, '0', '1')

    # All input edges must survive symmetrization.
    compare(edges['0'], edges['1'], edges['2'],
            symmetrized['0'], symmetrized['1'], symmetrized['2'])
Example #5
0
def test_symmetrize_bad_weights():
    """Symmetrization must cope with mismatched weights on opposing edges.

    Edge (0, 1) carries weight 1.0 but its reverse (1, 0) carries 2.0.
    """
    frame = pd.DataFrame({
        "src": [0, 0, 0, 0, 1, 2],
        "dst": [1, 2, 3, 4, 0, 3],
        "val": [1.0, 1.0, 1.0, 1.0, 2.0, 1.0],
    })

    gdf = cudf.DataFrame.from_pandas(frame[["src", "dst", "val"]])
    sym = cugraph.symmetrize_df(gdf, "src", "dst")

    compare(
        gdf["src"],
        gdf["dst"],
        gdf["val"],
        sym["src"],
        sym["dst"],
        sym["val"],
    )
Example #6
0
def _finalize_communities(parts, min_size, tic, Q=None):
    """Relabel detected partitions by size and print a run summary.

    Parameters
    ----------
    parts : cudf.DataFrame
        Output of a cugraph community-detection call; must have
        'vertex' and 'partition' columns.
    min_size : int
        Minimum cluster size passed through to ``sort_by_size``.
    tic : float
        Start timestamp of the overall run, used for the elapsed-time report.
    Q : float, optional
        Modularity score; when None (e.g. ECG) no modularity line is printed.

    Returns
    -------
    communities : cupy.ndarray
        Size-sorted community labels, aligned to vertex order.
    """
    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)

    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...",
          flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if Q is not None:
        print(f"Modularity: {Q}", flush=True)

    return communities


def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Clusters

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use.
        Default is 'louvain'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        Default is 'jaccard'.
    min_size: int
        Minimum cluster size.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.
    Returns
    -------
    communities: cudf.DataFrame
        Community labels.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float
        Modularity score for detected communities.
        Q is not returned if community='ecg' is used.
    """

    tic = time.time()
    # Go!

    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)

    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)

    else:
        # Any unrecognized similarity falls back to jaccard (original behavior).
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)

    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    # Symmetrize the similarity edge list and rebuild the graph with the
    # similarity coefficient as the edge weight.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    if community == "louvain":

        print("Running Louvain modularity optimization...", flush=True)

        parts, Q = cugraph.louvain(G, max_iter=1000)
        communities = _finalize_communities(parts, min_size, tic, Q)

        return communities, G, Q

    elif community == "leiden":

        print("Running Leiden modularity optimization...", flush=True)

        parts, Q = cugraph.leiden(G, max_iter=1000)
        communities = _finalize_communities(parts, min_size, tic, Q)

        return communities, G, Q

    elif community == "ecg":

        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        # ECG produces no modularity score, so Q is reported as None.
        communities = _finalize_communities(parts, min_size, tic)

        return communities, G, None

    # Insert any community/clustering method...
    elif community == "your favorite method":
        pass