Пример #1
0
def test_dataframe_join_mismatch_cats(how):
    """Check an index join when the categorical keys have different categories.

    Two frames are built whose ``join_col`` columns are categorical but hold
    different sets of values.  The same join is run through pygdf and through
    pandas and the results are compared.

    Parameters
    ----------
    how
        Join type forwarded to both ``join`` calls (its values are supplied
        by the caller; the parametrization is not visible here).
    """
    pdf1 = pd.DataFrame({"join_col": ["a", "b", "c", "d", "e"],
                         "data_col_left": [10, 20, 30, 40, 50]})
    pdf2 = pd.DataFrame({"join_col": ["c", "e", "f"],
                         "data_col_right": [6, 7, 8]})

    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")

    # Build the GPU frames while join_col is still a regular column, then
    # move it into the index on both the GPU and the pandas side.
    gdf1 = DataFrame.from_pandas(pdf1).set_index("join_col")
    gdf2 = DataFrame.from_pandas(pdf2).set_index("join_col")
    pdf1 = pdf1.set_index("join_col")
    pdf2 = pdf2.set_index("join_col")

    join_gdf = gdf1.join(gdf2, how=how, sort=True)
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    expect = join_pdf.fillna(-1)  # note: pygdf join doesn't mask NA

    # fillna leaves float columns behind; cast back so dtypes line up.
    expect["data_col_right"] = expect["data_col_right"].astype(np.int64)
    expect["data_col_left"] = expect["data_col_left"].astype(np.int64)

    # pd.testing is the public home of the assertion helpers;
    # pd.util.testing is deprecated and absent from pandas >= 2.0.
    pd.testing.assert_frame_equal(got, expect, check_names=False,
                                  check_index_type=False,
                                  # For inner joins, pandas returns
                                  # weird categories.
                                  check_categorical=how != 'inner')
    assert list(got.index) == list(expect.index)
Пример #2
0
def test_dataframe_setitem_from_masked_object():
    """Verify that NaN / None inputs end up as nulls in Series and DataFrame.

    The same 100-element float array with 20 NaN entries is fed in through
    a host array, a pandas frame and a device array; a Python list holding
    two ``None`` values is assigned as a column of an empty frame.
    """
    values = np.random.randn(100)
    nan_positions = np.zeros(100, dtype=bool)
    nan_positions[:20] = True
    np.random.shuffle(nan_positions)
    values[nan_positions] = np.nan

    # Series built directly from the host array.
    from_host = Series(values)
    assert from_host.has_null_mask
    assert from_host.null_count == 20

    # Column of a frame converted from pandas.
    converted = DataFrame.from_pandas(pd.DataFrame({'a': values}))
    assert converted['a'].has_null_mask
    assert converted['a'].null_count == 20

    # Series built from a device copy of the array.
    from_device = Series(cuda.to_device(values))
    assert from_device.has_null_mask
    assert from_device.null_count == 20

    # Column assigned from a list that contains None.
    frame = DataFrame()
    frame['lst'] = [1, 2, None, 4, 5, 6, None, 8, 9]
    assert frame['lst'].has_null_mask
    assert frame['lst'].null_count == 2
Пример #3
0
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """Sorting column 'a' of a sliced GPU frame must match pandas.

    ``nelem`` is the number of random rows; ``sliceobj`` is applied to both
    the pandas and the GPU frame before sorting.
    """
    np.random.seed(0)
    pdf = pd.DataFrame()
    pdf['a'] = np.random.random(nelem)
    gdf = DataFrame.from_pandas(pdf)

    expected = pdf[sliceobj]['a'].sort_values()
    actual = gdf[sliceobj]['a'].sort_values().to_pandas()
    assert (actual == expected).all()
Пример #4
0
def test_dataframe_nsmallest_sliced(counts, sliceobj):
    """``nsmallest`` on a sliced GPU frame must match pandas.

    ``counts`` is a ``(nelem, n)`` pair: the number of random rows and the
    number of smallest rows requested, ordered by column 'a'.
    """
    nelem, n = counts
    np.random.seed(0)
    pdf = pd.DataFrame()
    for column in ('a', 'b'):
        pdf[column] = np.random.random(nelem)
    gdf = DataFrame.from_pandas(pdf)

    expected = pdf[sliceobj].nsmallest(n, 'a')
    actual = gdf[sliceobj].nsmallest(n, 'a').to_pandas()
    # Reduce over rows, then over columns.
    assert (actual == expected).all().all()
Пример #5
0
def test_from_pandas_with_index():
    """Check that ``DataFrame.from_pandas`` carries over a custom index.

    The pandas frame gets a non-default integer index (4, 3, 2, 1) and a
    float column containing a missing value; columns, index and the full
    round trip through ``to_pandas`` are compared against the source frame.
    """
    pdf = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [0.1, 0.2, None, 0.3]})
    pdf = pdf.set_index(np.asarray([4, 3, 2, 1]))
    df = DataFrame.from_pandas(pdf)

    # Check columns
    np.testing.assert_array_equal(df.a.to_array(), pdf.a)
    np.testing.assert_array_equal(df.b.to_array(), pdf.b)
    # Check index
    np.testing.assert_array_equal(df.index.values, pdf.index.values)
    # Check again using pandas testing tool on frames.  pd.testing is the
    # public location; pd.util.testing is deprecated and gone in pandas 2.
    pd.testing.assert_frame_equal(df.to_pandas(), pdf)
Пример #6
0
    def to_gdf(self):
        """Access proc data as pygdf data frame(s) (GPU data frames).

        The data is first obtained through ``self.to_df()``.  If that is a
        single pandas DataFrame, it is converted and returned directly.
        Otherwise it is treated as a pandas Series of tables and each entry
        is converted individually.

        Returns:
            A pygdf DataFrame for a single table; a pandas Series (object
            dtype, same labels as the input) of pygdf DataFrames for
            multiple tables; ``None`` if pygdf is not installed.
        """
        try:
            table_data = self.to_df()
            if isinstance(table_data, pd.DataFrame):
                return DataFrame.from_pandas(table_data)
            # Explicit object dtype: the container holds data frames, and an
            # untyped empty Series has a version-dependent default dtype
            # (and triggers a warning on some pandas releases).
            gpu_df_series = pd.Series(dtype=object)
            for table_name in table_data.index:
                gpu_df_series[table_name] = DataFrame.from_pandas(
                    table_data[table_name])
            return gpu_df_series
        except NameError:
            # ``DataFrame`` is unbound when the pygdf import did not succeed.
            print('Pygdf not installed.')
            return None
Пример #7
0
def test_from_pandas_ex1():
    """Convert a small pandas frame with a missing float and compare values.

    Column 'b' holds ``None`` at position 2, which must come back as NaN on
    both sides.
    """
    source = {'a': [0, 1, 2, 3], 'b': [0.1, 0.2, None, 0.3]}
    pdf = pd.DataFrame(source)
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)

    a_values = df['a'].to_array()
    b_values = df['b'].to_array()
    assert np.all(a_values == pdf['a'])

    # NaN never equals NaN, so position 2 is the only expected mismatch.
    same = b_values == pdf['b']
    assert np.all(same == [True, True, False, True])
    assert np.isnan(b_values[2])
    assert np.isnan(pdf['b'][2])
Пример #8
0
def test_dataframe_append_empty():
    """Assigning a scalar as a new column must broadcast to every row.

    The same scalar column is added to a GPU frame and to its pandas source,
    and the two frames are then compared.
    """
    keys = [k for k in range(1, 5) for _ in range(3)]   # 1,1,1,2,2,2,...
    values = list(range(1, 13))                          # 1..12
    pdf = pd.DataFrame({"key": keys, "value": values})
    gdf = DataFrame.from_pandas(pdf)

    pdf['newcol'] = 100
    gdf['newcol'] = 100

    nrows = len(pdf)
    assert len(gdf['newcol']) == nrows
    assert len(pdf['newcol']) == nrows
    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
Пример #9
0
def test_from_pandas():
    """Column names, dtypes and lengths must survive ``from_pandas``."""
    pdf = pd.DataFrame()
    pdf['a'] = np.arange(10, dtype=np.int32)
    pdf['b'] = np.arange(10, 20, dtype=np.float64)

    gdf = DataFrame.from_pandas(pdf)

    assert tuple(gdf.columns) == tuple(pdf.columns)

    names = ('a', 'b')
    # dtypes first, then lengths, one column at a time.
    for name in names:
        assert gdf[name].dtype == pdf[name].dtype
    for name in names:
        assert len(gdf[name]) == len(pdf[name])
Пример #10
0
def test_from_pandas_ex1():
    """Round-trip a frame with one missing float through ``from_pandas``.

    The missing entry in column 'b' (position 2) is expected to be NaN in
    both the converted and the original data.
    """
    pdf = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)

    got_a = df['a'].to_array()
    assert np.all(got_a == pdf['a'])

    got_b = df['b'].to_array()
    elementwise = got_b == pdf['b']
    # (nan == nan) is False, hence the single False at index 2.
    expected_matches = [True, True, False, True]
    assert np.all(elementwise == expected_matches)
    assert np.isnan(got_b[2])
    assert np.isnan(pdf['b'][2])
Пример #11
0
def test_from_pandas():
    """``from_pandas`` must keep column names, dtypes and lengths."""
    pdf = pd.DataFrame()
    pdf['a'] = np.arange(10, dtype=np.int32)
    pdf['b'] = np.arange(10, 20, dtype=np.float64)

    converted = DataFrame.from_pandas(pdf)
    assert tuple(converted.columns) == tuple(pdf.columns)

    col_a, col_b = converted['a'], converted['b']

    # Element types are preserved per column.
    assert col_a.dtype == pdf['a'].dtype
    assert col_b.dtype == pdf['b'].dtype

    # Row counts are preserved per column.
    assert len(col_a) == len(pdf['a'])
    assert len(col_b) == len(pdf['b'])
Пример #12
0
def test_dataframe_column_name_indexing():
    """Columns must be addressable by string and by integer names.

    First a string-named and an integer-named column are set and read back.
    Then, for a frame with mixed string/integer column names, every
    non-empty combination of columns is selected by list and compared with
    the same selection in pandas.
    """
    gdf = DataFrame()
    data = np.asarray(range(10), dtype=np.int32)
    column_keys = ('a', 1)
    for key in column_keys:
        gdf[key] = data
    for key in column_keys:
        np.testing.assert_equal(gdf[key].to_array(),
                                np.asarray(range(10), dtype=np.int32))

    nelem = 10
    pdf = pd.DataFrame()
    pdf['key1'] = np.random.randint(0, 5, nelem)
    pdf['key2'] = np.random.randint(0, 3, nelem)
    pdf[1] = np.arange(1, 1 + nelem)
    pdf[2] = np.random.random(nelem)
    gdf = DataFrame.from_pandas(pdf)

    ncols = len(pdf.columns)
    for size in range(1, ncols + 1):
        for subset in combinations(pdf.columns, size):
            selected = list(subset)
            assert pdf[selected].equals(gdf[selected].to_pandas())
Пример #13
0
def test_query_splitted_combine():
    """Querying two halves and concatenating must equal querying the whole."""
    np.random.seed(0)
    xs = np.random.randint(0, 5, size=10)
    ys = np.random.normal(size=10)
    gdf = DataFrame.from_pandas(pd.DataFrame({'x': xs, 'y': ys}))

    expr = 'x > 2'

    # Query each half of the GPU frame separately, then stitch the results.
    first_half = gdf[:5].query(expr)
    second_half = gdf[5:].query(expr)
    got = pygdf.concat([first_half, second_half]).to_pandas()

    # Reference: the same query over the unsplit frame.
    expect = gdf.query(expr).to_pandas()
    assert_frame_equal(got, expect)
Пример #14
0
def test_issue_165():
    """Datetime equality must work against three kinds of scalar.

    A six-day date column is compared, via ``query`` and via ``==``, with
    the first date expressed as ``datetime.datetime``, ``pd.Timestamp`` and
    ``np.datetime64``.  Each result is checked against pandas and must be
    non-empty.
    """
    start_date = dt.datetime.strptime("2000-10-21", '%Y-%m-%d')

    df_pandas = pd.DataFrame()
    df_pandas["dates"] = [start_date + dt.timedelta(days=x) for x in range(6)]
    df_pandas["num"] = [1, 2, 3, 4, 5, 6]
    df_pygdf = DataFrame.from_pandas(df_pandas)

    # The query strings below refer to these values through "@name", so the
    # names must remain plain local variables of this function, and every
    # query() call must be issued directly from this function body.
    start_date_ts = pd.Timestamp(start_date)
    start_date_np = np.datetime64(start_date_ts, 'ns')

    scenarios = (
        ("dates==@start_date", start_date),
        ("dates==@start_date_ts", start_date_ts),
        ("dates==@start_date_np", start_date_np),
    )
    for expr, scalar in scenarios:
        # Filtering through query().
        queried = df_pygdf.query(expr)
        reference = df_pandas.query(expr)
        assert_frame_equal(reference, queried.to_pandas())
        assert len(queried) > 0

        # Boolean mask through the == operator.
        gpu_mask = (df_pygdf.dates == scalar).to_pandas()
        cpu_mask = df_pandas.dates == scalar
        assert_series_equal(gpu_mask, cpu_mask, check_names=False)
        assert gpu_mask.sum() > 0