def test_onehot_categories(as_array):
    """Encode with user-supplied categories and compare to a fixed result.

    The supplied categories contain values ('c' and 1) that do not occur in
    the input, so the expected dense output has three columns per feature.

    Parameters
    ----------
    as_array
        When truthy, both the input and the categories are converted with
        ``_from_df_to_cupy``; the categories are additionally transposed.
    """
    expected = cp.array([[1., 0., 0., 1., 0., 0.],
                         [0., 1., 0., 0., 0., 1.]])

    data = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    cats = DataFrame({'chars': ['a', 'b', 'c'], 'int': [0, 1, 2]})
    if as_array:
        data = _from_df_to_cupy(data)
        cats = _from_df_to_cupy(cats).transpose()

    encoder = OneHotEncoder(categories=cats, sparse=False)
    encoded = encoder.fit_transform(data)
    cp.testing.assert_array_equal(encoded, expected)
def test_onehot_inverse_transform(drop, as_array):
    """Round-trip data through fit_transform and inverse_transform.

    Parameters
    ----------
    drop
        Value forwarded to ``OneHotEncoder(drop=...)``; passed through
        ``_convert_drop`` first when ``as_array`` is truthy.
    as_array
        When truthy, the input frame is converted with ``_from_df_to_cupy``.
    """
    data = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    drop_spec = drop
    if as_array:
        data = _from_df_to_cupy(data)
        drop_spec = _convert_drop(drop_spec)

    encoder = OneHotEncoder(drop=drop_spec)
    recovered = encoder.inverse_transform(encoder.fit_transform(data))
    assert_inverse_equal(recovered, data)
def test_onehot_category_specific_cases():
    """Fit dense encoders on two-column frames in both column orders.

    The test only checks that ``fit_transform`` completes; no output is
    inspected.
    """
    # See this for reasoning: https://github.com/rapidsai/cuml/issues/2690
    # All of these cases use sparse=False, where
    # test_onehot_category_class_count uses sparse=True

    # Factories, so every frame receives freshly built column values.
    column_values = {
        "low_cardinality_column": lambda: ["A"] * 200 + ["B"] * 56,
        "high_cardinality_column": lambda: cp.linspace(0, 255, 256),
    }
    column_orders = (
        # Two columns, low cardinality before high
        ("low_cardinality_column", "high_cardinality_column"),
        # Two columns, high cardinality before low (used to fail)
        ("high_cardinality_column", "low_cardinality_column"),
    )

    for order in column_orders:
        frame = DataFrame()
        for column in order:
            frame[column] = column_values[column]()
        dense_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
        dense_encoder.fit_transform(frame)
def test_onehot_category_class_count(total_classes: int):
    """Check the encoded column count for several frame layouts.

    Parameters
    ----------
    total_classes
        Number of rows (and of distinct values in the high-cardinality
        column). NOTE(review): the first case builds ``total_classes - 200``
        "B" labels, so this presumably expects values above 200 -- confirm
        against the parametrization.
    """
    # See this for reasoning: https://github.com/rapidsai/cuml/issues/2690
    # All tests use sparse=True to avoid memory errors
    sparse_encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

    # ==== Two columns: one high cardinality, one with labels A and B ====
    frame = DataFrame()
    frame["high_cardinality_column"] = cp.linspace(
        0, total_classes - 1, total_classes)
    labels = ["A"] * 200 + ["B"] * (total_classes - 200)
    frame["low_cardinality_column"] = labels
    n_encoded = sparse_encoder.fit_transform(frame).shape[1]
    assert (n_encoded == total_classes + 2)

    # ==== Three columns: one high cardinality, two constant ====
    frame = DataFrame()
    frame["high_cardinality_column"] = cp.linspace(
        0, total_classes - 1, total_classes)
    frame["low_cardinality_column"] = ["A"] * total_classes
    frame["med_cardinality_column"] = ["B"] * total_classes
    n_encoded = sparse_encoder.fit_transform(frame).shape[1]
    assert (n_encoded == total_classes + 2)

    # ==== N columns (even split of classes across columns) ====
    for n_columns in [3, 10, 100]:
        per_column = int(math.ceil(total_classes / float(n_columns))) + 1
        frame = DataFrame()
        for col_idx in range(n_columns):
            # Each column covers its own contiguous range of values.
            first = col_idx * per_column
            last = ((col_idx + 1) * per_column) - 1
            frame[str(col_idx)] = cp.linspace(first, last, per_column)
        n_encoded = sparse_encoder.fit_transform(frame).shape[1]
        assert (n_encoded == per_column * n_columns)
def test_onehot_vs_skonehot(as_array):
    """Compare sparse OneHotEncoder output with SkOneHotEncoder output.

    Parameters
    ----------
    as_array
        When truthy, the input is converted with ``_from_df_to_cupy`` and
        the reference encoder receives ``cp.asnumpy`` of that array.
    """
    data = DataFrame({'gender': ['M', 'F', 'F'], 'int': [1, 3, 2]})
    host_data = from_df_to_array(data)
    if as_array:
        data = _from_df_to_cupy(data)
        host_data = cp.asnumpy(data)

    encoded = OneHotEncoder(sparse=True).fit_transform(data)
    expected = SkOneHotEncoder(sparse=True).fit_transform(host_data)
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
def test_onehot_drop_idx_first(as_array):
    """Check ``drop='first'`` against SkOneHotEncoder, then round-trip.

    Parameters
    ----------
    as_array
        When truthy, the input is converted with ``_from_df_to_cupy`` and
        the reference encoder receives ``cp.asnumpy`` of that array.
    """
    host_rows = [['c', 2, 'a'], ['b', 2, 'b']]
    data = DataFrame({'chars': ['c', 'b'],
                      'int': [2, 2],
                      'letters': ['a', 'b']})
    if as_array:
        data = _from_df_to_cupy(data)
        host_rows = cp.asnumpy(data)

    # Both encoders are configured identically.
    options = {'sparse': False, 'drop': 'first', 'categories': 'auto'}
    encoder = OneHotEncoder(**options)
    reference_encoder = SkOneHotEncoder(**options)

    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_rows)
    cp.testing.assert_array_equal(encoded, expected)

    recovered = encoder.inverse_transform(encoded)
    assert_inverse_equal(recovered, data)
def test_onehot_random_inputs(drop, sparse, n_samples, as_array):
    """Compare against SkOneHotEncoder on generated inputs, then round-trip.

    Parameters
    ----------
    drop
        Forwarded unchanged to both encoders.
    sparse
        Forwarded to both encoders; when truthy the outputs are densified
        with ``toarray()`` before comparison.
    n_samples, as_array
        Forwarded to ``generate_inputs_from_categories``.
    """
    data, host_data = generate_inputs_from_categories(n_samples=n_samples,
                                                      as_array=as_array)

    options = {'sparse': sparse, 'drop': drop, 'categories': 'auto'}
    encoder = OneHotEncoder(**options)
    reference_encoder = SkOneHotEncoder(**options)

    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_data)

    if sparse:
        actual_dense, expected_dense = encoded.toarray(), expected.toarray()
    else:
        actual_dense, expected_dense = encoded, expected
    cp.testing.assert_array_equal(actual_dense, expected_dense)

    recovered = encoder.inverse_transform(encoded)
    assert_inverse_equal(recovered, data)
def test_onehot_sparse_drop(as_array):
    """Compare sparse output with per-feature drop against SkOneHotEncoder.

    Parameters
    ----------
    as_array
        When truthy, the input is converted with ``_from_df_to_cupy``, the
        reference input becomes ``cp.asnumpy`` of that array, and both
        encoders receive the drop spec produced by ``_convert_drop``.
    """
    data = DataFrame({'g': ['M', 'F', 'F'],
                      'i': [1, 3, 2],
                      'l': [5, 5, 6]})
    host_data = from_df_to_array(data)

    # Dict form for the encoder under test, list form for the reference.
    drop_spec = {'g': 'F', 'i': 3, 'l': 6}
    reference_drop = ['F', 3, 6]

    if as_array:
        data = _from_df_to_cupy(data)
        host_data = cp.asnumpy(data)
        drop_spec = _convert_drop(drop_spec)
        reference_drop = drop_spec

    encoder = OneHotEncoder(sparse=True, drop=drop_spec, categories='auto')
    reference_encoder = SkOneHotEncoder(sparse=True, drop=reference_drop,
                                        categories='auto')

    encoded = encoder.fit_transform(data)
    expected = reference_encoder.fit_transform(host_data)
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
def test_onehot_drop_one_of_each(as_array):
    """Drop one category per feature, compare to SkOneHotEncoder, round-trip.

    Parameters
    ----------
    as_array
        When truthy, the input is converted with ``_from_df_to_cupy``, the
        reference input becomes ``cp.asnumpy`` of that array, and both
        encoders receive the drop spec produced by ``_convert_drop``.
    """
    X = DataFrame({'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']})
    # Dict form for the encoder under test, list form for the reference.
    drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
    X_ary = from_df_to_array(X)
    drop_ary = ['b', 2, 'b']
    if as_array:
        X = _from_df_to_cupy(X)
        X_ary = cp.asnumpy(X)
        drop = drop_ary = _convert_drop(drop)

    enc = OneHotEncoder(sparse=False, drop=drop, categories='auto')
    ohe = enc.fit_transform(X)
    ref = SkOneHotEncoder(sparse=False, drop=drop_ary,
                          categories='auto').fit_transform(X_ary)
    cp.testing.assert_array_equal(ohe, ref)

    inv = enc.inverse_transform(ohe)
    assert_inverse_equal(inv, X)