def test_get_dummies(data):
    """Compare ``cudf.get_dummies`` with ``pd.get_dummies`` on one column.

    A single-column frame ``x`` is built from ``data`` with both the file's
    ``DataFrame`` and ``pd.DataFrame``. Both are encoded with
    ``prefix="test"``, and the cudf output is checked against the pandas
    output (dtypes ignored) twice: once with the default dtype and once with
    an explicit ``dtype=np.uint8``.
    """
    gdf = DataFrame({"x": data})
    pdf = pd.DataFrame({"x": data})
    encoded_expected = pd.get_dummies(pdf, prefix="test")

    # First pass uses the default dtype, second pass requests uint8.
    for extra_kwargs in ({}, {"dtype": np.uint8}):
        encoded_actual = cudf.get_dummies(gdf, prefix="test", **extra_kwargs)
        utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
def test_get_dummies(data, index):
    """Compare ``cudf.get_dummies`` with ``pd.get_dummies`` using an index.

    Builds a one-column frame ``x`` from ``data`` with the given ``index`` in
    both libraries, encodes with ``prefix="test"``, and compares the cudf
    output against pandas for the default dtype and for ``dtype=np.uint8``.
    Dtypes are only compared when ``data`` is non-empty.
    """
    gdf = DataFrame({"x": data}, index=index)
    pdf = pd.DataFrame({"x": data}, index=index)
    encoded_expected = pd.get_dummies(pdf, prefix="test")

    # Empty input skips the dtype comparison; everything else enforces it.
    compare_dtypes = len(data) != 0

    for extra_kwargs in ({}, {"dtype": np.uint8}):
        encoded_actual = cudf.get_dummies(gdf, prefix="test", **extra_kwargs)
        utils.assert_eq(
            encoded_expected,
            encoded_actual,
            check_dtype=compare_dtypes,
        )
def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na):
    """Check ``dummy_na`` handling of ``cudf.get_dummies`` against pandas.

    The column ``a`` holds ``[0, 1, NaN]``. When both ``dummy_na`` and
    ``nan_as_null`` are truthy, the cudf result's ``a_null`` column is
    renamed to ``a_nan`` and the columns are reordered to the pandas layout
    before the comparison.
    """
    host_frame = pd.DataFrame({"a": [0, 1, np.nan]})
    device_frame = DataFrame.from_pandas(host_frame, nan_as_null=nan_as_null)

    expected = pd.get_dummies(host_frame, dummy_na=dummy_na, columns=["a"])
    got = cudf.get_dummies(device_frame, dummy_na=dummy_na, columns=["a"])

    if dummy_na and nan_as_null:
        # Align the cudf column label and column order with pandas.
        renamed = got.rename(columns={"a_null": "a_nan"})
        got = renamed[expected.columns]

    utils.assert_eq(expected, got)
def test_onehot_get_dummies_multicol(n_cols):
    """Compare ``get_dummies`` across libraries for several columns.

    Up to ``n_cols`` columns, named with successive lowercase letters, each
    hold ``np.arange(5)``. Both frames are encoded with ``prefix="test"`` and
    compared with dtypes ignored.
    """
    n_categories = 5
    # zip() caps the column count at the alphabet length, like the pairing
    # of letters with generated arrays would.
    data = {
        letter: np.arange(n_categories)
        for letter, _ in zip(ascii_lowercase, range(n_cols))
    }

    gdf = cudf.DataFrame(data)
    pdf = pd.DataFrame(data)

    encoded_expected = pd.get_dummies(pdf, prefix="test")
    encoded_actual = cudf.get_dummies(gdf, prefix="test")

    utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
def test_get_dummies_array_like(data, prefix_sep, prefix, dtype):
    """Compare ``get_dummies`` across libraries for array-like input.

    ``data`` is passed straight to ``cudf.get_dummies``. For the pandas call
    it is first converted with ``to_pandas()`` when it is a ``cudf.Series``
    or ``cudf.BaseIndex``, and used unchanged otherwise. Both calls receive
    the same ``prefix``, ``prefix_sep`` and ``dtype``.
    """
    encode_kwargs = {"prefix": prefix, "prefix_sep": prefix_sep, "dtype": dtype}

    cudf_encoded = cudf.get_dummies(data, **encode_kwargs)

    if isinstance(data, (cudf.Series, cudf.BaseIndex)):
        host_data = data.to_pandas()
    else:
        host_data = data
    pandas_encoded = pd.get_dummies(host_data, **encode_kwargs)

    utils.assert_eq(cudf_encoded, pandas_encoded)
def test_get_dummies_with_nan():
    """Check that NaN and null get separate dummy columns in a DataFrame.

    Column ``a`` holds ``[1, 2, NaN, None]`` with ``nan_as_null=False``.
    Encoding with ``dummy_na=True`` is expected to yield the uint8 columns
    ``a_1.0``, ``a_2.0``, ``a_nan`` and ``a_null``, one hot row each.
    """
    values = cudf.Series([1, 2, np.nan, None], nan_as_null=False)
    df = cudf.DataFrame({"a": values})

    expected_columns = {
        "a_1.0": [1, 0, 0, 0],
        "a_2.0": [0, 1, 0, 0],
        "a_nan": [0, 0, 1, 0],
        "a_null": [0, 0, 0, 1],
    }
    expected = cudf.DataFrame(expected_columns, dtype="uint8")

    actual = cudf.get_dummies(df, dummy_na=True, columns=["a"])

    utils.assert_eq(expected, actual)
def test_get_dummies_array_like_with_nan():
    """Check that NaN and null get separate dummy columns for a Series.

    The series ``[0.1, 2, 3, None, NaN]`` (``nan_as_null=False``) is encoded
    with ``dummy_na=True``, ``prefix="a"`` and ``prefix_sep="_"``. The
    expected uint8 frame lists ``a_null`` first, then the value columns,
    then ``a_nan``.
    """
    ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False)

    expected_columns = {
        "a_null": [0, 0, 0, 1, 0],
        "a_0.1": [1, 0, 0, 0, 0],
        "a_2.0": [0, 1, 0, 0, 0],
        "a_3.0": [0, 0, 1, 0, 0],
        "a_nan": [0, 0, 0, 0, 1],
    }
    expected = cudf.DataFrame(expected_columns, dtype="uint8")

    actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")

    utils.assert_eq(expected, actual)
def test_get_dummies_prefix_sep(prefix, prefix_sep):
    """Compare ``prefix``/``prefix_sep`` handling between cudf and pandas.

    A three-column string frame is encoded by both libraries with the same
    ``prefix`` and ``prefix_sep``; results are compared with dtypes ignored.
    """
    data = {
        "first": ["1", "2", "3"],
        "second": ["abc", "def", "ghi"],
        "third": ["ji", "ji", "ji"],
    }
    naming_kwargs = {"prefix": prefix, "prefix_sep": prefix_sep}

    gdf = DataFrame(data)
    pdf = pd.DataFrame(data)

    encoded_expected = pd.get_dummies(pdf, **naming_kwargs)
    encoded_actual = cudf.get_dummies(gdf, **naming_kwargs)

    utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
def one_hot_encoder(self, dummy_nas=None):
    """
    One-hot encode the columns listed in ``self.ohe_feats``.

    ``self.output_df`` is replaced by the result of ``cudf.get_dummies``
    applied to it. By default NAs are not given their own dummy column.

    Params:
        - dummy_nas = True/False (None, the default, is treated as False);
          forwarded to ``get_dummies`` as ``dummy_na`` to indicate whether
          NAs get their own dummy column
    """
    # bool() maps both None and an explicit False to False. Previously any
    # non-None value, including False, switched NA dummies on.
    self.output_df = cudf.get_dummies(
        self.output_df,
        columns=self.ohe_feats,
        dummy_na=bool(dummy_nas),
    )
def basic_feature_engineering(train, test, gpu=False):
    """
    Process a train and test set as per the basic feature engineering example.

    Steps: drop columns whose missing percentage (as reported by
    ``see_percent_missing_values``, defined elsewhere) is >= 40, split off
    ``TARGET``, one-hot encode the bool/object/category columns of the
    combined frame, fill remaining missing values, and split the combined
    frame back into train and test rows.

    Args:
        train (dataframe): the training dataframe (should include TARGET)
        test (dataframe): the testing dataframe
        gpu (boolean): whether to use cudf (True) or pandas (False)

    Returns:
        train (dataframe): the processed train frame
        test (dataframe): the processed test frame
        train_target: the TARGET column taken from the training frame
    """
    if gpu:
        import cudf as dd
    else:
        import pandas as dd

    app_train_mis_values = see_percent_missing_values(train)
    df_app_train_miss_values = dd.DataFrame({
        'columns': app_train_mis_values.index,
        'missing percent': app_train_mis_values.values
    })

    sparse_columns = df_app_train_miss_values[
        df_app_train_miss_values['missing percent'] >= 40
    ]['columns']
    # Branch on `gpu` rather than on a type check against the global `cudf`
    # name: the local import above only binds `dd`, so the pandas path must
    # not need cudf at all.
    if gpu:
        drop_columns = sparse_columns.to_arrow().to_pylist()
    else:
        drop_columns = sparse_columns.tolist()

    train = train.drop(drop_columns, axis=1)
    test = test.drop(drop_columns, axis=1)

    train_target = train['TARGET']
    train = train.drop('TARGET', axis=1)

    # Remember where the train rows end so the combined frame can be split
    # back for any input size, not just one specific dataset.
    n_train = len(train)

    # here we will use a basic dummy treatment
    # we merged the dataframes first because when we dummify
    # we could have some columns only in train or only in test.
    # Merging first will prevent this
    unified = dd.concat([train, test])

    dummy_cols = unified.select_dtypes(['bool', 'O', 'category']).columns.tolist()
    unified = dd.get_dummies(unified, columns=dummy_cols, dtype='int64')

    # XGB for pandas does not like Int64
    for col in unified.select_dtypes('Int64').columns.tolist():
        unified[col] = unified[col].fillna(int(unified[col].mean()))
        unified[col] = unified[col].astype('int64')

    # Zero-fill every column that still contains missing values.
    has_missing = unified.isna().any()
    missing_index = has_missing[has_missing].index
    if gpu:
        missing_cols = missing_index.to_arrow().to_pylist()
    else:
        # A pandas Index has no to_arrow(); convert it directly.
        missing_cols = missing_index.tolist()
    for col in missing_cols:
        unified[col] = unified[col].fillna(0)

    train = unified[0:n_train]
    test = unified[n_train:]

    return train, test, train_target
def ohe_gpu():
    """One-hot encode the ``grade`` column with cudf and discard the result.

    Copies ``loan_pdf['grade']`` (``loan_pdf`` comes from the enclosing
    scope) into a fresh ``cudf.DataFrame`` and runs ``cudf.get_dummies`` on
    it. Nothing is returned.
    """
    grade_frame = cudf.DataFrame()
    grade_frame['grade'] = loan_pdf['grade']
    # The encoded frame is intentionally not kept or returned.
    cudf.get_dummies(grade_frame)