def test_dataframe_join_mismatch_cats(how):
    """Join two categorical-indexed frames whose category sets differ
    and compare the pygdf result against the pandas result.

    ``how`` is the join strategy ('left', 'inner', ...) supplied by the
    test parametrization.
    """
    pdf1 = pd.DataFrame({"join_col": ["a", "b", "c", "d", "e"],
                         "data_col_left": [10, 20, 30, 40, 50]})
    pdf2 = pd.DataFrame({"join_col": ["c", "e", "f"],
                         "data_col_right": [6, 7, 8]})
    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")

    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)
    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")

    pdf1 = pdf1.set_index('join_col')
    pdf2 = pdf2.set_index('join_col')
    join_gdf = gdf1.join(gdf2, how=how, sort=True)
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    expect = join_pdf.fillna(-1)  # note: pygdf join doesn't mask NA

    # pandas upcasts the data columns to float to hold NaN; after the
    # fillna above, cast back to int64 to match the pygdf output.
    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)
    # FIX: pd.util.testing is deprecated (removed in pandas 2.0); use
    # pd.testing, matching the other tests in this file.
    pd.testing.assert_frame_equal(got, expect, check_names=False,
                                  check_index_type=False,
                                  # For inner joins, pandas returns
                                  # weird categories.
                                  check_categorical=how != 'inner')
    assert list(got.index) == list(expect.index)
def test_dataframe_setitem_from_masked_object():
    """Null masks are built correctly no matter how a column is
    constructed: from a NaN-bearing ndarray, via from_pandas, from a
    device array, or from a Python list containing None."""
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    # From a host ndarray containing NaNs.
    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    # Through from_pandas on a frame wrapping the same array.
    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    # From a device array holding the same values.
    gpu_ary = cuda.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    # From a plain Python list with None entries.
    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """sort_values on a column taken from a sliced frame matches
    pandas. ``nelem`` and ``sliceobj`` come from the parametrization."""
    np.random.seed(0)
    pdf = pd.DataFrame({'a': np.random.random(nelem)})
    expect = pdf[sliceobj]['a'].sort_values()

    gdf = DataFrame.from_pandas(pdf)
    got = gdf[sliceobj]['a'].sort_values()

    assert (got.to_pandas() == expect).all()
def test_dataframe_nsmallest_sliced(counts, sliceobj):
    """nsmallest on a sliced frame matches pandas. ``counts`` packs
    (nelem, n); ``sliceobj`` comes from the parametrization."""
    nelem, n = counts
    np.random.seed(0)
    # Column 'a' is drawn before 'b', preserving the RNG stream the
    # original sequential assignment produced.
    pdf = pd.DataFrame({'a': np.random.random(nelem),
                        'b': np.random.random(nelem)})
    expect = pdf[sliceobj].nsmallest(n, 'a')

    gdf = DataFrame.from_pandas(pdf)
    got = gdf[sliceobj].nsmallest(n, 'a')

    assert (got.to_pandas() == expect).all().all()
def test_from_pandas_with_index():
    """from_pandas preserves column data (including a NaN) and a
    custom, non-default index."""
    pdf = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [0.1, 0.2, None, 0.3]})
    pdf = pdf.set_index(np.asarray([4, 3, 2, 1]))
    df = DataFrame.from_pandas(pdf)
    # Check columns
    np.testing.assert_array_equal(df.a.to_array(), pdf.a)
    np.testing.assert_array_equal(df.b.to_array(), pdf.b)
    # Check index
    np.testing.assert_array_equal(df.index.values, pdf.index.values)
    # Check again using pandas testing tool on frames.
    # FIX: pd.util.testing is deprecated (removed in pandas 2.0); use
    # pd.testing, matching the other tests in this file.
    pd.testing.assert_frame_equal(df.to_pandas(), pdf)
def to_gdf(self):
    """Access proc data as pygdf data frame (GPU data frame).

    If the UDF input data is a single table then a pygdf data frame is
    returned. If it is multiple tables then a Pandas Series whose
    elements are pygdf data frames is returned.

    Returns:
        Pygdf DataFrame if single table, Pandas Series of pygdf
        DataFrames if multiple tables. None if pygdf is not installed.
    """
    try:
        table_data = self.to_df()
        if isinstance(table_data, pd.DataFrame):
            return DataFrame.from_pandas(table_data)
        # Multiple tables: wrap each one in a GPU frame, keyed by
        # table name. FIX: pass dtype=object explicitly — bare
        # pd.Series() is deprecated in modern pandas, and the values
        # stored here are DataFrames (object dtype) anyway.
        gpu_df_series = pd.Series(dtype=object)
        for table_name in table_data.index:
            current_gpu_df = DataFrame.from_pandas(table_data[table_name])
            gpu_df_series[table_name] = current_gpu_df
        return gpu_df_series
    except NameError:
        # DataFrame is an undefined name when pygdf failed to import.
        print('Pygdf not installed.')
        return None
def test_from_pandas_ex1():
    """from_pandas keeps column order and data; a NaN survives the
    round trip but compares unequal to itself (nan == nan is False)."""
    pdf = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)
    assert np.all(df['a'].to_array() == pdf['a'])

    # Index 2 is NaN on both sides, so elementwise equality is False
    # there and True everywhere else.
    expected = [True, True, False, True]
    matches = df['b'].to_array() == pdf['b']
    assert np.all(matches == expected)
    assert np.isnan(df['b'].to_array()[2])
    assert np.isnan(pdf['b'][2])
def test_dataframe_append_empty():
    """Assigning a scalar to a brand-new column broadcasts it across
    every row, matching pandas behavior."""
    pdf = pd.DataFrame({
        "key": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
        "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    })
    gdf = DataFrame.from_pandas(pdf)

    # Scalar assignment on both sides; the new column must span the
    # full frame length.
    pdf['newcol'] = 100
    gdf['newcol'] = 100

    assert len(gdf['newcol']) == len(pdf)
    assert len(pdf['newcol']) == len(pdf)
    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_from_pandas():
    """from_pandas preserves column names, dtypes and lengths."""
    pdf = pd.DataFrame()
    pdf['a'] = np.arange(10, dtype=np.int32)
    pdf['b'] = np.arange(10, 20, dtype=np.float64)
    df = DataFrame.from_pandas(pdf)

    assert tuple(df.columns) == tuple(pdf.columns)
    for col in ('a', 'b'):
        assert df[col].dtype == pdf[col].dtype
        assert len(df[col]) == len(pdf[col])
def test_from_pandas_ex1():
    """Round-trip a NaN-bearing frame through from_pandas.

    NOTE(review): a function with this exact name is defined earlier
    in this file; this later definition shadows it, so only one of the
    two actually runs under pytest. Consider renaming one of them.
    """
    pdf = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)
    assert tuple(df.columns) == tuple(pdf.columns)
    assert np.all(df['a'].to_array() == pdf['a'])
    matches = df['b'].to_array() == pdf['b']
    # The comparison at index 2 is False because nan == nan is False.
    assert np.all(matches == [True, True, False, True])
    assert np.isnan(df['b'].to_array()[2])
    assert np.isnan(pdf['b'][2])
def test_from_pandas():
    """from_pandas copies columns along with their dtypes.

    NOTE(review): a function with this exact name is defined earlier
    in this file; this later definition shadows it, so only one of the
    two actually runs under pytest. Consider renaming one of them.
    """
    pdf = pd.DataFrame()
    pdf['a'] = np.arange(10, dtype=np.int32)
    pdf['b'] = np.arange(10, 20, dtype=np.float64)
    df = DataFrame.from_pandas(pdf)

    assert tuple(df.columns) == tuple(pdf.columns)
    assert df['a'].dtype == pdf['a'].dtype
    assert df['b'].dtype == pdf['b'].dtype
    assert len(df['a']) == len(pdf['a'])
    assert len(df['b']) == len(pdf['b'])
def test_dataframe_column_name_indexing():
    """Columns may be labeled with non-string keys (ints) and must be
    retrievable both individually and in multi-column selections."""
    df = DataFrame()
    data = np.asarray(range(10), dtype=np.int32)
    df['a'] = data
    df[1] = data
    np.testing.assert_equal(df['a'].to_array(),
                            np.asarray(range(10), dtype=np.int32))
    np.testing.assert_equal(df[1].to_array(),
                            np.asarray(range(10), dtype=np.int32))

    pdf = pd.DataFrame()
    nelem = 10
    pdf['key1'] = np.random.randint(0, 5, nelem)
    pdf['key2'] = np.random.randint(0, 3, nelem)
    pdf[1] = np.arange(1, 1 + nelem)
    pdf[2] = np.random.random(nelem)
    df = DataFrame.from_pandas(pdf)

    # Every non-empty subset of columns must select identically on
    # both sides, regardless of label type mix.
    for size in range(1, len(pdf.columns) + 1):
        for subset in combinations(pdf.columns, size):
            assert pdf[list(subset)].equals(df[list(subset)].to_pandas())
def test_query_splitted_combine():
    """Querying two halves of a frame and concatenating the results
    equals querying the whole frame in one shot."""
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10),
                       'y': np.random.normal(size=10)})
    gdf = DataFrame.from_pandas(df)

    expr = 'x > 2'

    # Run the query on each half separately, then stitch together.
    first_half = gdf[:5].query(expr)
    second_half = gdf[5:].query(expr)
    got = pygdf.concat([first_half, second_half]).to_pandas()

    # Reference: the same query over the unsplit frame.
    expect = gdf.query(expr).to_pandas()
    assert_frame_equal(got, expect)
def test_issue_165():
    """Regression test for issue 165: query() and == comparisons must
    work when the reference datetime is given as a Python
    datetime.datetime, a pd.Timestamp, or a np.datetime64.

    NOTE: the query strings reference locals via '@name', so the
    variable names below must not be changed independently of the
    strings.
    """
    df_pandas = pd.DataFrame()
    start_date = dt.datetime.strptime("2000-10-21", '%Y-%m-%d')
    data = [(start_date + dt.timedelta(days=x)) for x in range(6)]
    df_pandas["dates"] = data
    df_pandas["num"] = [1, 2, 3, 4, 5, 6]
    df_pygdf = DataFrame.from_pandas(df_pandas)

    # Reference value as a Python datetime.datetime.
    base = df_pandas.query("dates==@start_date")
    test = df_pygdf.query("dates==@start_date")
    assert_frame_equal(base, test.to_pandas())
    assert len(test) > 0  # guard: query must actually match rows
    mask = df_pygdf.dates == start_date
    base_mask = df_pandas.dates == start_date
    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0

    # Reference value as a pandas Timestamp.
    start_date_ts = pd.Timestamp(start_date)
    test = df_pygdf.query("dates==@start_date_ts")
    base = df_pandas.query("dates==@start_date_ts")
    assert_frame_equal(base, test.to_pandas())
    assert len(test) > 0
    mask = df_pygdf.dates == start_date_ts
    base_mask = df_pandas.dates == start_date_ts
    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0

    # Reference value as a numpy datetime64 with ns resolution.
    start_date_np = np.datetime64(start_date_ts, 'ns')
    test = df_pygdf.query("dates==@start_date_np")
    base = df_pandas.query("dates==@start_date_np")
    assert_frame_equal(base, test.to_pandas())
    assert len(test) > 0
    mask = df_pygdf.dates == start_date_np
    base_mask = df_pandas.dates == start_date_np
    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0