示例#1
0
def dist_merge():
    """Run a distributed left merge of two randomly filled frames on "first".

    Creates a distributed ``CylonEnv`` over MPI, builds two ``gcy.DataFrame``
    objects whose "first" and "second" columns each come from
    ``cp.random.randint(100, 110, 5)``, prints both inputs and the merged
    result, then finalizes the environment.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _random_frame():
        # Column order ("first", then "second") also fixes the order of the
        # random draws.
        return gcy.DataFrame({
            'first': cp.random.randint(100, 110, 5),
            'second': cp.random.randint(100, 110, 5),
        })

    left = _random_frame()
    right = _random_frame()
    print(left)
    print(right)

    # Every merge option is spelled out explicitly, including those left unset.
    merge_kwargs = {
        "right": right,
        "on": "first",
        "how": "left",
        "left_on": None,
        "right_on": None,
        "left_index": False,
        "right_index": False,
        "env": env,
    }
    merged = left.merge(**merge_kwargs)
    print("distributed joined df:\n", merged)
    env.finalize()
示例#2
0
def test_shuffle():
    """Check shuffles keyed on "state_id" and "population" against saved files.

    Each rank reads its own ``cities_a_<rank>.csv``, shuffles it once per key
    through the distributed environment, and compares the row-sorted result
    with the row-sorted contents of the matching per-rank file under
    ``data/output``.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonContext Initialized: My rank: ", env.rank)

    rank = env.rank
    input_file = f"data/input/cities_a_{rank}.csv"
    str_shuffle_file = f"data/output/shuffle_str_cities_a_{rank}.csv"
    int_shuffle_file = f"data/output/shuffle_int_cities_a_{rank}.csv"

    def _ordered(frame):
        # Sort on all three columns so the comparison ignores row order.
        return frame.sort_values(by=["state_id", "city", "population"],
                                 ignore_index=True)

    source = gcy.DataFrame.from_cudf(cudf.read_csv(input_file))

    # The two shuffles run in a fixed order: "state_id" first, then "population".
    str_shuffled = source.shuffle(on="state_id", ignore_index=True, env=env)
    str_actual = _ordered(str_shuffled.to_cudf())

    int_shuffled = source.shuffle(on="population", ignore_index=True, env=env)
    int_actual = _ordered(int_shuffled.to_cudf())

    str_expected = _ordered(cudf.read_csv(str_shuffle_file))
    int_expected = _ordered(cudf.read_csv(int_shuffle_file))

    assert str_actual.equals(str_expected), \
        "String based Shuffled DataFrame and DataFrame from file are not equal"
    assert int_actual.equals(int_expected), \
        "Integer based Shuffled DataFrame and DataFrame from file are not equal"
示例#3
0
def dist_drop_duplicates():
    """Concatenate two per-rank city frames, drop duplicates, compare with the saved union.

    Prints both inputs, then prints the rank followed by " equal" or
    " not equal" depending on whether the sorted, de-duplicated concatenation
    matches the sorted contents of ``union_cities_<rank>.csv``. Finalizes the
    environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    union_path = f"data/output/union_cities_{rank}.csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    print("df1: \n", df1)
    print("df2: \n", df2)

    combined = gcy.concat([df1, df2], env=env)
    deduplicated = combined.drop_duplicates(ignore_index=True, env=env)
    actual = deduplicated.to_cudf().sort_values(by=["city", "state_id"],
                                                ignore_index=True)

    expected = cudf.read_csv(union_path).sort_values(by=["city", "state_id"],
                                                     ignore_index=True)

    # Report the comparison instead of asserting.
    verdict = " equal" if actual.equals(expected) else " not equal"
    print(env.rank, verdict)

    env.finalize()
示例#4
0
def test_drop_duplicates():
    """Verify that concat followed by drop_duplicates reproduces the saved union.

    The two per-rank input frames are concatenated and de-duplicated through
    the distributed environment; the sorted result must equal the sorted
    contents of ``union_cities_<rank>.csv``.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    union_path = f"data/output/union_cities_{rank}.csv"

    sort_columns = ["city", "state_id"]

    frames = [
        gcy.DataFrame.from_cudf(cudf.read_csv(first_path)),
        gcy.DataFrame.from_cudf(cudf.read_csv(second_path)),
    ]

    combined = gcy.concat(frames, env=env)
    deduplicated = combined.drop_duplicates(ignore_index=True, env=env)
    actual = deduplicated.to_cudf().sort_values(by=sort_columns,
                                                ignore_index=True)

    expected = cudf.read_csv(union_path).sort_values(by=sort_columns,
                                                     ignore_index=True)

    assert actual.equals(expected), \
        "Duplicates dropped DataFrame and the DataFrame from file are not equal"
示例#5
0
def test_diff():
    """Verify both directions of the distributed set difference against saved files.

    Computes ``df1 - df2`` and ``df2 - df1`` for the per-rank city inputs and
    asserts that each sorted result equals the sorted contents of the
    corresponding ``diff_*_<rank>.csv`` file.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    first_diff_path = f"data/output/diff_df1-df2_{rank}.csv"
    second_diff_path = f"data/output/diff_df2-df1_{rank}.csv"

    def _ordered(frame):
        # Fixed row order so equality does not depend on distribution order.
        return frame.sort_values(by=["city", "state_id"], ignore_index=True)

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    # Both differences are computed before any conversion or sorting.
    diff1 = df1.set_difference(other=df2, env=env)
    diff2 = df2.set_difference(other=df1, env=env)

    actual_first = _ordered(diff1.to_cudf())
    actual_second = _ordered(diff2.to_cudf())

    expected_first = _ordered(cudf.read_csv(first_diff_path))
    expected_second = _ordered(cudf.read_csv(second_diff_path))

    assert actual_first.equals(expected_first), \
        "First Difference DataFrame and the DataFrame from file are not equal"
    assert actual_second.equals(expected_second), \
        "Second Difference DataFrame and the DataFrame from file are not equal"
示例#6
0
def dist_union():
    """Compute the distributed set union of two per-rank city frames and report the check.

    Prints both inputs, then prints the rank followed by " equal" or
    " not equal" depending on whether the saved, sorted
    ``union_cities_<rank>.csv`` equals the sorted union. Finalizes the
    environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    union_path = f"data/output/union_cities_{rank}.csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    print("df1: \n", df1)
    print("df2: \n", df2)
    union_frame = df1.set_union(other=df2, env=env)

    sort_columns = ["city", "state_id"]
    actual = union_frame.to_cudf().sort_values(by=sort_columns,
                                               ignore_index=True)
    expected = cudf.read_csv(union_path).sort_values(by=sort_columns,
                                                     ignore_index=True)

    # The saved frame is the receiver of equals().
    verdict = " equal" if expected.equals(actual) else " not equal"
    print(env.rank, verdict)
    env.finalize()
示例#7
0
def dist_diff():
    """Compute both distributed set differences of two city frames and report each check.

    Prints both inputs, computes ``df1 - df2`` and ``df2 - df1``, and for each
    direction prints the rank followed by " equal" or " not equal" after
    comparing with the sorted contents of the saved ``diff_*_<rank>.csv``
    file. Finalizes the environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    first_diff_path = f"data/output/diff_df1-df2_{rank}.csv"
    second_diff_path = f"data/output/diff_df2-df1_{rank}.csv"

    def _ordered(frame):
        return frame.sort_values(by=["city", "state_id"], ignore_index=True)

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    print("df1: \n", df1)
    print("df2: \n", df2)
    diff1 = df1.set_difference(other=df2, env=env)
    diff2 = df2.set_difference(other=df1, env=env)

    actual_first = _ordered(diff1.to_cudf())
    actual_second = _ordered(diff2.to_cudf())

    expected_first = _ordered(cudf.read_csv(first_diff_path))
    expected_second = _ordered(cudf.read_csv(second_diff_path))

    # One report line per direction; the saved frame is the receiver of equals().
    first_verdict = " equal" if expected_first.equals(actual_first) else " not equal"
    print(env.rank, first_verdict)
    second_verdict = " equal" if expected_second.equals(actual_second) else " not equal"
    print(env.rank, second_verdict)

    env.finalize()
示例#8
0
def drop_cuplicates():
    """Drop duplicate rows from a randomly filled frame and print the outcome.

    Builds a frame whose "first" and "second" columns come from
    ``cp.random.randint(100, 110, 20)``, prints it, runs a distributed
    ``drop_duplicates`` and prints either the result or, when the result is
    falsy, the input frame. Finalizes the environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    columns = {
        'first': cp.random.randint(100, 110, 20),
        'second': cp.random.randint(100, 110, 20),
    }
    df1 = gcy.DataFrame(columns)
    print("df1: \n", df1)

    df2 = df1.drop_duplicates(ignore_index=True, env=env)
    # NOTE(review): this relies on the truthiness of the returned frame; how
    # gcy.DataFrame evaluates in a boolean context is not visible here -- confirm.
    shown = df2 if df2 else df1
    print("duplicates dropped: \n", shown)
    env.finalize()
示例#9
0
def gen_join_test_data():
    """Generate the per-rank reference file for the distributed inner join.

    Reads both per-rank city inputs with "state_id" as the index column, joins
    them with ``how="inner"`` through the distributed environment, and writes
    the result to ``data/output/join_cities_<rank>.csv``. Finalizes the
    environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    input_file1 = f"data/input/cities_a_{rank}.csv"
    input_file2 = f"data/input/cities_b_{rank}.csv"
    join_file = f"data/output/join_cities_{rank}.csv"

    left_source = cudf.read_csv(input_file1, index_col="state_id")
    left = gcy.DataFrame.from_cudf(left_source)
    right_source = cudf.read_csv(input_file2, index_col="state_id")
    right = gcy.DataFrame.from_cudf(right_source)

    joined = left.join(other=right, how="inner", env=env)
    # Written with to_csv defaults (no index=False here).
    joined.to_cudf().to_csv(join_file)

    print(env.rank, " written join_file to the file: ", join_file)
    env.finalize()
示例#10
0
def dist_join():
    """Join two randomly filled frames through the distributed environment and print the result.

    Each frame has "first" and "second" columns produced by
    ``cp.random.rand(10)``. Both inputs and the joined frame are printed, and
    the environment is finalized at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _random_frame():
        # "first" is drawn before "second", matching the dict literal order.
        return gcy.DataFrame({
            'first': cp.random.rand(10),
            'second': cp.random.rand(10),
        })

    left = _random_frame()
    right = _random_frame()
    print(left)
    print(right)

    # join() is called with only `other` and `env`; all other options keep
    # the library defaults.
    joined = left.join(other=right, env=env)
    print("distributed joined df:\n", joined)
    env.finalize()
示例#11
0
def dist_concat():
    """Concatenate two randomly filled frames with join="inner" and print the result.

    The two frames declare the same columns in opposite order
    ("first", "second" versus "second", "first"). Both inputs and the
    concatenated frame are printed, and the environment is finalized at the
    end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    first_columns = {
        'first': cp.random.randint(0, 10, 5),
        'second': cp.random.randint(100, 110, 5),
    }
    # Declared in reverse column order on purpose.
    second_columns = {
        'second': cp.random.randint(100, 110, 5),
        'first': cp.random.randint(0, 10, 5),
    }
    df1 = gcy.DataFrame(first_columns)
    df2 = gcy.DataFrame(second_columns)
    print(df1)
    print(df2)

    concatenated = gcy.concat([df1, df2], join="inner", env=env)
    print("distributed concated df:\n", concatenated)
    env.finalize()
def gen_union_files():
    """Generate the per-rank reference file for the distributed set union.

    Reads both per-rank city inputs, prints them, computes their set union
    through the distributed environment, and writes it without the index to
    ``data/output/union_cities_<rank>.csv``. Finalizes the environment at the
    end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    union_path = f"data/output/union_cities_{rank}.csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    print("df1: \n", df1)
    print("df2: \n", df2)

    union_frame = df1.set_union(other=df2, env=env)
    union_as_cudf = union_frame.to_cudf()
    union_as_cudf.to_csv(union_path, index=False)
    print(env.rank, " written unionFile to the file: ", union_path)

    env.finalize()
def gen_concat_files():
    """Generate the per-rank reference file for the distributed concatenation.

    Reads both per-rank city inputs, prints them, concatenates them through
    the distributed environment, and writes the result without the index to
    ``data/output/concat_cities_<rank>.csv``. Finalizes the environment at the
    end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    concat_path = f"data/output/concat_cities_{rank}.csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    print("df1: \n", df1)
    print("df2: \n", df2)

    concatenated = gcy.concat([df1, df2], env=env)
    concatenated_as_cudf = concatenated.to_cudf()
    concatenated_as_cudf.to_csv(concat_path, index=False)
    print(env.rank, " written concatFile to the file: ", concat_path)

    env.finalize()
def gen_groupby_test_data():
    """Generate the per-rank reference files for the distributed group-by sum and max.

    Reads the per-rank ``cities_a`` input, keeps only the "state_id" and
    "population" columns, groups by "state_id" through the distributed
    environment, and writes the ``sum()`` and ``max()`` results to their
    respective files under ``data/output``. Finalizes the environment at the
    end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    input_path = f"data/input/cities_a_{rank}.csv"
    sum_path = f"data/output/groupby_sum_cities_a_{rank}.csv"
    max_path = f"data/output/groupby_max_cities_a_{rank}.csv"

    full_frame = gcy.DataFrame.from_cudf(cudf.read_csv(input_path))
    df1 = full_frame[["state_id", "population"]]

    print("df1: \n", df1)
    grouped = df1.groupby("state_id", env=env)

    # Each aggregate is computed and written before the next one starts.
    sum_result = grouped.sum()
    sum_result.to_csv(sum_path)
    max_result = grouped.max()
    max_result.to_csv(max_path)

    print(env.rank, " written gbyFile1 to the file: ", sum_path)
    print(env.rank, " written gbyFile2 to the file: ", max_path)
    env.finalize()
示例#15
0
def dist_groupby():
    """Demonstrate distributed group-by aggregations on small hand-written frames.

    Rank 0 uses one three-row frame; every other rank uses a second three-row
    frame. The function then prints the results of several aggregations:
    group-by on column "a" (sum, max, sum of column "b" only, mean, size),
    group-by on columns "a" and "b", group-by on the index via ``level``, and
    group-by on a two-column projection. Finalizes the environment at the end.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    if env.rank == 0:
        df = gcy.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [1, 3, 5]})
    else:
        # Previously only rank 1 received a frame, so `df` was unbound on any
        # rank >= 2 and the first groupby raised. All non-zero ranks now share
        # the second frame; output for ranks 0 and 1 is unchanged.
        df = gcy.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 4], 'c': [2, 4, 6]})
    print("df on rank " + str(env.rank) + ": \n", df)

    gby = df.groupby("a", env=env)
    print("df grouped-by on column 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'b' only: \n",
        gby["b"].sum())
    print("performed 'mean' on the same groupby object: \n", gby.mean())
    print("sizes of each group: \n", gby.size())

    gby = df.groupby(["a", "b"], env=env)
    print("df grouped-by on columns a and b, performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # groupby on index column with "level" parameter
    df1 = df.set_index("a")
    gby = df1.groupby(level="a", env=env)
    print("df grouped-by on index 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # If the original dataframe has many columns and we only want to group
    # some of them, project the needed columns into a new dataframe first and
    # run the groupby on that projection.
    df2 = df[["a", "b"]]
    print("two columns projected dataframe:\n", df2)
    gby = df2.groupby("a", env=env)
    print("grouped-by on column 'a' of projected df, performed 'sum': \n",
          gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    env.finalize()
示例#16
0
def test_intersect():
    """Verify the distributed set intersection of two city frames against the saved file.

    The sorted intersection of the per-rank inputs must equal the sorted
    contents of ``intersect_cities_<rank>.csv``.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    intersect_path = f"data/output/intersect_cities_{rank}.csv"

    sort_columns = ["city", "state_id"]

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    intersection = df1.set_intersect(other=df2, env=env)
    actual = intersection.to_cudf().sort_values(by=sort_columns,
                                                ignore_index=True)

    expected = cudf.read_csv(intersect_path).sort_values(by=sort_columns,
                                                         ignore_index=True)

    assert actual.equals(expected), \
        "Intersect DataFrame and the DataFrame from file are not equal"
示例#17
0
def dist_union():
    """Show the distributed set union of two small rank-dependent frames.

    Both frames carry "weight" values offset by the rank; they differ in one
    "age" value and in column declaration order. The union is printed twice:
    once with default options and once with ``keep_duplicates=True`` and
    ``ignore_index=True``. Finalizes the environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    first_columns = {
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 55],
    }
    # Same columns, declared in the opposite order.
    second_columns = {
        'age': [44, 66],
        'weight': [60 + env.rank, 80 + env.rank],
    }
    df1 = gcy.DataFrame(first_columns)
    df2 = gcy.DataFrame(second_columns)
    print(df1)
    print(df2)

    plain_union = df1.set_union(other=df2, env=env)
    print("distributed set union:\n", plain_union)

    union_with_duplicates = df1.set_union(other=df2,
                                          keep_duplicates=True,
                                          ignore_index=True,
                                          env=env)
    print("distributed set union with duplicates:\n", union_with_duplicates)
    env.finalize()
示例#18
0
def dist_intersection():
    """Show the distributed set intersection of two small rank-dependent frames.

    Both frames carry "weight" values offset by the rank and differ in one
    "age" value. The intersection is printed twice: once over all columns and
    once restricted to ``subset=["age"]``. Finalizes the environment at the
    end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    first_columns = {
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 55],
    }
    second_columns = {
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 66],
    }
    df1 = gcy.DataFrame(first_columns)
    df2 = gcy.DataFrame(second_columns)
    print(df1)
    print(df2)

    full_intersection = df1.set_intersect(other=df2, env=env)
    print("distributed set intersection:\n", full_intersection)

    age_intersection = df1.set_intersect(other=df2, subset=["age"], env=env)
    print("distributed set intersection with a subset of columns:\n",
          age_intersection)
    env.finalize()
示例#19
0
def test_concat():
    """Verify the distributed concatenation of two city frames against the saved file.

    The sorted concatenation of the per-rank inputs must equal the sorted
    contents of ``concat_cities_<rank>.csv``.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    concat_path = f"data/output/concat_cities_{rank}.csv"

    sort_columns = ["city", "state_id"]

    frames = [
        gcy.DataFrame.from_cudf(cudf.read_csv(first_path)),
        gcy.DataFrame.from_cudf(cudf.read_csv(second_path)),
    ]

    concatenated = gcy.concat(frames, env=env)
    actual = concatenated.to_cudf().sort_values(by=sort_columns,
                                                ignore_index=True)

    expected = cudf.read_csv(concat_path).sort_values(by=sort_columns,
                                                      ignore_index=True)

    assert actual.equals(expected), \
        "Concatanated DataFrame and the DataFrame from file are not equal"
def gen_shuffle_test_data():
    """Generate the per-rank reference files for the distributed shuffle tests.

    Reads the per-rank ``cities_a`` input, shuffles it once on "state_id" and
    once on "population" through the distributed environment, and writes each
    result without the index to its file under ``data/output``. Finalizes the
    environment at the end.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    input_file = "data/input/cities_a_" + str(env.rank) + ".csv"
    str_shuffle_file = "data/output/shuffle_str_cities_a_" + str(
        env.rank) + ".csv"
    int_shuffle_file = "data/output/shuffle_int_cities_a_" + str(
        env.rank) + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(input_file))

    str_shuffled = df1.shuffle(on="state_id", ignore_index=True, env=env)
    str_shuffled.to_cudf().to_csv(str_shuffle_file, index=False)

    int_shuffled = df1.shuffle(on="population", ignore_index=True, env=env)
    int_shuffled.to_cudf().to_csv(int_shuffle_file, index=False)

    # The messages used to say "gbyFile1"/"gbyFile2", copied from the groupby
    # generator; they now name the shuffle outputs that are actually written.
    print(env.rank, " written str_shuffled to the file: ", str_shuffle_file)
    print(env.rank, " written int_shuffled to the file: ", int_shuffle_file)
    env.finalize()
def gen_diff_files():
    """Generate the per-rank reference files for both distributed set differences.

    Reads both per-rank city inputs, prints them, computes ``df1 - df2`` and
    ``df2 - df1`` through the distributed environment, and writes each result
    without the index to its ``diff_*_<rank>.csv`` file. Finalizes the
    environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    first_diff_path = f"data/output/diff_df1-df2_{rank}.csv"
    second_diff_path = f"data/output/diff_df2-df1_{rank}.csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    print("df1: \n", df1)
    print("df2: \n", df2)

    # Both differences are computed before either file is written.
    first_minus_second = df1.set_difference(other=df2, env=env)
    second_minus_first = df2.set_difference(other=df1, env=env)

    first_minus_second.to_cudf().to_csv(first_diff_path, index=False)
    second_minus_first.to_cudf().to_csv(second_diff_path, index=False)
    print(env.rank, " written diff1 to the file: ", first_diff_path)
    print(env.rank, " written diff2 to the file: ", second_diff_path)
    env.finalize()
示例#22
0
def test_groupby():
    """Verify the distributed group-by sum and max against saved per-rank files.

    Reads the per-rank ``cities_a`` input, keeps the "state_id" and
    "population" columns, groups by "state_id" through the distributed
    environment, and asserts that the index-sorted ``sum()`` and ``max()``
    results equal the index-sorted saved files (read with "state_id" as the
    index column).
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonContext Initialized: My rank: ", env.rank)

    inputFile = "data/input/cities_a_" + str(env.rank) + ".csv"
    gbyFile1 = "data/output/groupby_sum_cities_a_" + str(env.rank) + ".csv"
    gbyFile2 = "data/output/groupby_max_cities_a_" + str(env.rank) + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(inputFile))
    df1 = df1[["state_id", "population"]]
    gby = df1.groupby("state_id", env=env)
    sum_df = gby.sum().sort_index()
    max_df = gby.max().sort_index()

    saved_sum_df = cudf.read_csv(gbyFile1, index_col="state_id").sort_index()
    saved_max_df = cudf.read_csv(gbyFile2, index_col="state_id").sort_index()

    assert sum_df.equals(
        saved_sum_df
    ), "Groupbyed Sum DataFrame and DataFrame from file are not equal"
    # The message used to read "Maz"; the aggregation under test is max().
    assert max_df.equals(
        saved_max_df
    ), "Groupbyed Max DataFrame and DataFrame from file are not equal"
示例#23
0
def dist_diff():
    """Show distributed set differences of two small rank-dependent frames.

    Both frames carry "weight" values offset by the rank and differ in one
    "age" value. Prints the inputs, then ``df1 - df2``, ``df2 - df1``, and
    ``df2 - df1`` restricted to ``subset=["age"]``. Finalizes the environment
    at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    first_columns = {
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 55],
    }
    second_columns = {
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 66],
    }
    df1 = gcy.DataFrame(first_columns)
    df2 = gcy.DataFrame(second_columns)
    print("df1: \n", df1)
    print("df2: \n", df2)

    first_minus_second = df1.set_difference(other=df2, env=env)
    print("df1 distributed set difference df2:\n", first_minus_second)

    second_minus_first = df2.set_difference(other=df1, env=env)
    print("df2 distributed set difference df1:\n", second_minus_first)

    # NOTE: the mirrored call df1.set_difference(df2, subset=["age"], env=env)
    # is deliberately not executed; it was already disabled and the reason is
    # not recorded here.
    second_minus_first_on_age = df2.set_difference(df1, subset=["age"], env=env)
    print("df2 distributed set difference df1 on subset=['age']: \n",
          second_minus_first_on_age)
    env.finalize()
示例#24
0
def test_join():
    """Verify the distributed inner join of two city frames against the saved file.

    Both inputs and the saved result are read with "state_id" as the index
    column. The joined frame and the saved frame are sorted on the four value
    columns, and must have the same length and be equal.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonContext Initialized: My rank: ", env.rank)

    rank = env.rank
    input_file1 = f"data/input/cities_a_{rank}.csv"
    input_file2 = f"data/input/cities_b_{rank}.csv"
    join_file = f"data/output/join_cities_{rank}.csv"

    def _ordered(frame):
        # No ignore_index here: the index is kept through the sort.
        return frame.sort_values(
            by=["cityl", "populationl", "cityr", "populationr"])

    left = gcy.DataFrame.from_cudf(
        cudf.read_csv(input_file1, index_col="state_id"))
    right = gcy.DataFrame.from_cudf(
        cudf.read_csv(input_file2, index_col="state_id"))

    joined = left.join(other=right, how="inner", env=env)
    actual = _ordered(joined.to_cudf())

    expected = _ordered(cudf.read_csv(join_file, index_col="state_id"))

    assert len(actual) == len(expected)
    assert actual.equals(expected), \
        "Joined DataFrame and DataFrame from file are not equal"
示例#25
0
def dist_concat():
    """Concatenate two per-rank city frames and report whether the result matches the saved file.

    Prints the rank followed by " equal" or " not equal" depending on whether
    the sorted concatenation equals the sorted contents of
    ``concat_cities_<rank>.csv``. Finalizes the environment at the end.
    """
    env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = env.rank
    first_path = f"data/input/cities_a_{rank}.csv"
    second_path = f"data/input/cities_b_{rank}.csv"
    concat_path = f"data/output/concat_cities_{rank}.csv"

    sort_columns = ["city", "state_id"]

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    concatenated = gcy.concat([df1, df2], env=env)

    # Sort both sides the same way before comparing.
    actual = concatenated.to_cudf().sort_values(by=sort_columns,
                                                ignore_index=True)
    expected = cudf.read_csv(concat_path).sort_values(by=sort_columns,
                                                      ignore_index=True)

    verdict = " equal" if actual.equals(expected) else " not equal"
    print(env.rank, verdict)
    env.finalize()
示例#26
0
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

import cupy as cp
import pycylon as cy
import pygcylon as gcy

# Script: shuffle a randomly filled frame on its "first" column across ranks.
env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
print("CylonContext Initialized: My rank: ", env.rank)

# The bounds passed to randint are offset by 100 per rank.
start = 100 * env.rank
df = gcy.DataFrame({
    column: cp.random.randint(start, start + 10, 10)
    for column in ('first', 'second')
})
print("initial df from rank: ", env.rank, "\n", df)

shuffledDF = df.shuffle(on="first", ignore_index=True, env=env)
print("shuffled df from rank: ", env.rank, "\n", shuffledDF)

env.finalize()
# env.rank is read once more after finalize() for the closing message.
print("after finalize from the rank:", env.rank)