예제 #1
0
def test_extract_subgraph_specific_query(property_graph_instance):
    """
    Extract a directed subgraph holding only the 'transactions' edges with
    merchant_id 4 and a time later than 1639085000, weighted by "card_num".

    The result is expected to be a single edge (2 vertices, 1 edge).
    """
    pG = property_graph_instance

    edge_query = ("(_TYPE_=='transactions') & "
                  "(merchant_id==4) & "
                  "(time>1639085000)")
    G = pG.extract_subgraph(selection=pG.select_edges(edge_query),
                            create_using=DiGraph_inst,
                            edge_weight_property="card_num")

    expected_edgelist = cudf.DataFrame({
        "src": [89216],
        "dst": [4],
        "weights": [8832]
    })

    # Map the internal vertex ids back to the original ids, one column at a
    # time, before comparing against the expected values.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #2
0
def test_enable_batch_edgelist_replication(graph_file, directed, dask_client):
    """
    After enable_batch(), every entry of G.batch_edgelists must resolve to a
    DataFrame equal to the graph's own edgelist DataFrame.
    """
    gc.collect()
    G = utils.generate_cugraph_graph_from_file(graph_file, directed)
    G.enable_batch()

    reference_df = G.edgelist.edgelist_df
    per_worker = G.batch_edgelists
    for worker_key in per_worker:
        # Each entry is resolved with .result() before being compared.
        assert_frame_equal(reference_df, per_worker[worker_key].result())
예제 #3
0
def test_basic_assert_frame_equal(
    rdtype,
    rname,
    index,
    check_exact,
    check_dtype,
    check_names,
    check_like,
    mismatch,
):
    """
    Check that assert_frame_equal on cudf DataFrames agrees with
    pd.testing.assert_frame_equal on the equivalent pandas DataFrames.

    The left frame always has index [1, 2, 3] and columns "a" (int8),
    "b" (int16) and "c" (int64); when ``mismatch`` is true, column "c" holds
    different values from the shared data. The right frame is built on
    ``index`` with one column per (dtype, name) pair from ``rdtype`` and
    ``rname``.

    If the pandas comparison raises, the cudf comparison must raise the same
    exception type; otherwise the cudf comparison must pass.
    """
    data = [1, 2, 1]
    p_left = pd.DataFrame(index=[1, 2, 3])
    p_left["a"] = np.array(data, dtype="int8")
    p_left["b"] = np.array(data, dtype="int16")
    if mismatch:
        p_left["c"] = np.array([1, 2, 3], dtype="int64")
    else:
        p_left["c"] = np.array(data, dtype="int64")

    p_right = pd.DataFrame(index=index)
    for dtype, name in zip(rdtype, rname):
        p_right[name] = np.array(data, dtype=dtype)

    left = cudf.from_pandas(p_left)
    right = cudf.from_pandas(p_right)

    # The same comparison options are used for the pandas and cudf checks.
    check_kwargs = {
        "check_exact": check_exact,
        "check_dtype": check_dtype,
        "check_names": check_names,
        "check_like": check_like,
    }

    kind = None
    try:
        pd.testing.assert_frame_equal(p_left, p_right, **check_kwargs)
    except Exception as e:
        # Only ordinary exceptions define the expected outcome;
        # KeyboardInterrupt / SystemExit must propagate instead of being
        # recorded as the "expected" failure type.
        kind = type(e)

    if kind is None:
        assert_frame_equal(left, right, **check_kwargs)
    else:
        with pytest.raises(kind):
            assert_frame_equal(left, right, **check_kwargs)
예제 #4
0
def test_replicate_cudf_dataframe_no_weights(input_data_path, dask_client):
    """
    Read a space-delimited two-column (src, dst) int32 edge list, replicate it
    with replication.replicate_cudf_dataframe, and check that every replicated
    result equals the original DataFrame.
    """
    gc.collect()
    column_names = ["src", "dst"]
    column_dtypes = ["int32", "int32"]
    df = cudf.read_csv(input_data_path,
                       delimiter=" ",
                       names=column_names,
                       dtype=column_dtypes)

    worker_to_futures = replication.replicate_cudf_dataframe(df)
    for worker_key in worker_to_futures:
        # Each entry is resolved with .result() before being compared.
        assert_frame_equal(df, worker_to_futures[worker_key].result())
예제 #5
0
def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
    """
    Compare cudf.read_orc against pyarrow's ORC reader for the same file and
    column selection. The test is skipped when opening the file with pyarrow
    raises ArrowIOError.
    """
    path = datadir / inputfile
    try:
        orcfile = pa.orc.ORCFile(path)
    except pa.ArrowIOError as e:
        pytest.skip(".orc file is not found: %s" % e)

    expected_pdf = orcfile.read(columns=columns).to_pandas()

    reader_options = {
        "engine": engine,
        "columns": columns,
        "use_index": use_index,
    }
    actual_gdf = cudf.read_orc(path, **reader_options)

    assert_frame_equal(cudf.from_pandas(expected_pdf),
                       actual_gdf,
                       check_categorical=False)
예제 #6
0
def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
    """
    Round-trip the reference ORC file through DataFrame.to_orc using the given
    ``statistics`` setting and check that the data read back with pyarrow
    matches what was written.

    The test is skipped when opening the reference file raises an exception
    whose class is named "ArrowIOError"; any other error is re-raised.
    """
    reference_file = "TestOrcFile.demo-12-zlib.orc"
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        # The exception is matched by class name rather than by class object.
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else would leave `orcfile` unbound and surface later as a
        # misleading UnboundLocalError, so propagate the real failure.
        raise

    expect = cudf.from_pandas(orcfile.read().to_pandas())
    expect.to_orc(gdf_fname.strpath, statistics=stats_freq)
    got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas())

    assert_frame_equal(expect, got)
예제 #7
0
def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
    """
    Round-trip the selected columns of a reference ORC file through
    DataFrame.to_orc with the given ``compression`` and check that the data
    read back with pyarrow matches what was written.

    The test is skipped when opening the reference file raises an exception
    whose class is named "ArrowIOError"; any other error is re-raised.
    """
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        # The exception is matched by class name rather than by class object.
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else would leave `orcfile` unbound and surface later as a
        # misleading UnboundLocalError, so propagate the real failure.
        raise

    expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas())
    expect.to_orc(gdf_fname.strpath, compression=compression)
    got = cudf.from_pandas(
        pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas())

    assert_frame_equal(expect, got)
예제 #8
0
def test_extract_subgraph_edge_prop_condition_only(property_graph_instance):
    """
    Extract a directed subgraph from an edge-only selection (all
    'transactions' edges) and compare its edge list, sorted by "src", with
    the (src, dst) pairs taken from dataset1["transactions"].
    """
    pG = property_graph_instance

    G = pG.extract_subgraph(
        selection=pG.select_edges("_TYPE_=='transactions'"),
        create_using=DiGraph_inst)

    # The DataFrame rows are the last item of the dataset entry; the first
    # two fields of each row are used as the edge endpoints.
    rows = dataset1["transactions"][-1]
    srcs, dsts = zip(*((row[0], row[1]) for row in rows))
    expected_edgelist = cudf.DataFrame({"src": srcs, "dst": dsts})
    expected_edgelist = expected_edgelist.sort_values(by="src",
                                                      ignore_index=True)

    # Map the internal vertex ids back to the original ids, then sort the
    # same way as the expected frame so rows line up.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)
    actual_edgelist = actual_edgelist.sort_values(by="src", ignore_index=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #9
0
def test_extract_subgraph_graph_without_vert_props():
    """
    Verify that subgraph extraction works on a PropertyGraph that was given
    edge data only (no vertex properties were added).
    """
    from cugraph.experimental import PropertyGraph

    transactions = dataset1["transactions"]
    relationships = dataset1["relationships"]

    transactions_df = cudf.DataFrame(columns=transactions[0],
                                     data=transactions[1])
    relationships_df = cudf.DataFrame(columns=relationships[0],
                                      data=relationships[1])

    pG = PropertyGraph()
    pG.add_edge_data(transactions_df,
                     type_name="transactions",
                     vertex_id_columns=("user_id", "merchant_id"),
                     property_columns=None)
    pG.add_edge_data(relationships_df,
                     type_name="relationships",
                     vertex_id_columns=("user_id_1", "user_id_2"),
                     property_columns=None)

    # Keep only the edges whose source is 89216, weighted by
    # "relationship_type" with 0 passed as the default edge weight.
    selection = pG.select_edges("_SRC_ == 89216")
    G = pG.extract_subgraph(selection=selection,
                            create_using=DiGraph_inst,
                            edge_weight_property="relationship_type",
                            default_edge_weight=0)

    expected_edgelist = cudf.DataFrame({
        "src": [89216, 89216, 89216],
        "dst": [4, 89021, 32431],
        "weights": [0, 9, 9]
    })

    # Map the internal vertex ids back to the original ids.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #10
0
def test_select_vertices_from_previous_selection(property_graph_instance):
    """
    Verify that vertices belonging to several types at once (the intersection
    of type A and type B) can be selected by chaining select_vertices calls
    through from_previous_selection.
    """
    pG = property_graph_instance

    # Referrals between taxpayers who are also users (a single edge is
    # expected).
    taxpayers = pG.select_vertices("_TYPE_ == 'taxpayers'")
    selection = pG.select_vertices("_TYPE_ == 'users'",
                                   from_previous_selection=taxpayers)
    selection += pG.select_edges("_TYPE_ == 'referrals'")
    G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection)

    expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]})

    # Map the internal vertex ids back to the original ids.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #11
0
def test_extract_subgraph_vertex_edge_prop_condition(property_graph_instance):
    """
    Combine a vertex selection (user_location 47906 or 78750) with an edge
    selection ('referrals' edges) and extract a directed subgraph weighted by
    "stars". A single edge 78634 -> 32431 with weight 4 is expected.
    """
    pG = property_graph_instance

    vertex_query = ("(user_location==47906) | "
                    "(user_location==78750)")
    selection = pG.select_vertices(vertex_query)
    selection += pG.select_edges("_TYPE_=='referrals'")
    G = pG.extract_subgraph(selection=selection,
                            create_using=DiGraph_inst,
                            edge_weight_property="stars")

    expected_edgelist = cudf.DataFrame({
        "src": [78634],
        "dst": [32431],
        "weights": [4]
    })

    # Map the internal vertex ids back to the original ids.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #12
0
def test_extract_subgraph_vertex_prop_condition_only(property_graph_instance):
    """
    Extract a directed subgraph from a vertex-only selection ('taxpayers'
    vertices with amount < 100), weighted by "stars". A single edge
    89021 -> 78634 with weight 4 is expected.
    """
    pG = property_graph_instance

    vertex_query = "(_TYPE_=='taxpayers') & (amount<100)"
    G = pG.extract_subgraph(selection=pG.select_vertices(vertex_query),
                            create_using=DiGraph_inst,
                            edge_weight_property="stars")

    expected_edgelist = cudf.DataFrame({
        "src": [89021],
        "dst": [78634],
        "weights": [4]
    })

    # Map the internal vertex ids back to the original ids.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)

    assert G.is_directed()
    # check_like=True makes the comparison ignore column/index ordering
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #13
0
def test_chunked_orc_writer(datadir, tmpdir, reference_file, columns,
                            compression):
    """
    Write the same table twice through ORCWriter and check that the file read
    back with pyarrow equals the source data concatenated with itself.

    The test is skipped when opening the reference file raises an exception
    whose class is named "ArrowIOError"; any other error is re-raised.
    """
    pdf_fname = datadir / reference_file
    gdf_fname = tmpdir.join("chunked_gdf.orc")

    try:
        orcfile = pa.orc.ORCFile(pdf_fname)
    except Exception as excpr:
        # The exception is matched by class name rather than by class object.
        if type(excpr).__name__ == "ArrowIOError":
            pytest.skip(".orc file is not found")
        # Anything else would leave `orcfile` unbound and surface later as a
        # misleading UnboundLocalError, so propagate the real failure.
        raise

    pdf = orcfile.read(columns=columns).to_pandas()
    gdf = cudf.from_pandas(pdf)
    # Two identical chunks are written, so the expected result is the source
    # frame stacked on itself with a fresh index.
    expect = pd.concat([pdf, pdf]).reset_index(drop=True)

    writer = ORCWriter(gdf_fname, compression=compression)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()
    assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))
예제 #14
0
def test_extract_subgraph_default_edge_weight(property_graph_instance):
    """
    Verify that default_edge_weight is used as the weight of edges whose
    weight property ("volume") is missing.
    """
    pG = property_graph_instance

    G = pG.extract_subgraph(
        create_using=DiGraph_inst,
        selection=pG.select_edges("_TYPE_=='transactions'"),
        edge_weight_property="volume",
        default_edge_weight=99)

    # The DataFrame rows are the last item of the dataset entry; the first
    # three fields of each row are used as src, dst and weight.
    rows = dataset1["transactions"][-1]
    srcs, dsts, raw_weights = zip(*((row[0], row[1], row[2])
                                    for row in rows))

    # Swap the (first) None weight for the expected default. Tuples are
    # immutable, so the replacement goes through a list.
    patched = list(raw_weights)
    patched[raw_weights.index(None)] = 99.
    weights = tuple(patched)

    expected_edgelist = cudf.DataFrame({
        "src": srcs,
        "dst": dsts,
        "weights": weights
    })
    expected_edgelist = expected_edgelist.sort_values(by="src",
                                                      ignore_index=True)

    # Map the internal vertex ids back to the original ids, then sort the
    # same way as the expected frame so rows line up.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)
    actual_edgelist = actual_edgelist.sort_values(by="src", ignore_index=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)
예제 #15
0
def test_edge_props_to_graph(property_graph_instance):
    """
    Run a more complex query directly against the property DataFrames, then
    build the corresponding graph with edge_props_to_graph().

    The query keeps the 'referrals' edges whose endpoints are both 'users'
    and 'taxpayers'; a single edge is expected.
    """
    pG = property_graph_instance

    def has_both_types(group):
        # True when the group holds a 'taxpayers' row and a 'users' row.
        types = group["_TYPE_"]
        return (types == "taxpayers").any() and (types == "users").any()

    # Vertices that are both users and taxpayers.
    per_vertex = pG._vertex_prop_dataframe.groupby("_VERTEX_")\
                                          .apply(has_both_types)
    both_verts = per_vertex[per_vertex].keys()  # only the True entries

    # 'referrals' edge properties whose src and dst are both such vertices.
    edge_df = pG._edge_prop_dataframe
    referral_rows = edge_df[edge_df["_TYPE_"] == "referrals"]
    src_matches = referral_rows["_SRC_"].isin(both_verts)
    dst_matches = referral_rows["_DST_"].isin(both_verts)
    keep = (src_matches & dst_matches)
    edge_props = edge_df.loc[keep.index[keep]]

    G = pG.edge_props_to_graph(edge_props, create_using=DiGraph_inst)

    expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]})

    # Map the internal vertex ids back to the original ids.
    actual_edgelist = G.edgelist.edgelist_df
    for column in ("src", "dst"):
        actual_edgelist = G.unrenumber(actual_edgelist,
                                       column,
                                       preserve_order=True)

    assert G.is_directed()
    assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True)