Пример #1
0
def test_onehot_categories(as_array):
    """Encode with explicitly supplied categories and check the dense result.

    The supplied categories contain one value per column ('c' and 1) that
    never occurs in the fitted data, so the expected matrix carries an
    all-zero column for each of them.
    """
    samples = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    known_categories = DataFrame({'chars': ['a', 'b', 'c'],
                                  'int': [0, 1, 2]})
    if as_array:
        samples = _from_df_to_cupy(samples)
        # The array form is transposed before being handed to the encoder.
        known_categories = _from_df_to_cupy(known_categories).transpose()

    encoder = OneHotEncoder(categories=known_categories, sparse=False)
    expected = cp.array([
        [1., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 1.],
    ])
    encoded = encoder.fit_transform(samples)
    cp.testing.assert_array_equal(encoded, expected)
Пример #2
0
def test_onehot_inverse_transform(drop, as_array):
    """Round-trip a small input through encoding and inverse_transform.

    `drop` is forwarded to the encoder; when `as_array` is set, both the
    input and `drop` are converted first.
    """
    original = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    if as_array:
        original = _from_df_to_cupy(original)
        drop = _convert_drop(drop)

    encoder = OneHotEncoder(drop=drop)
    recovered = encoder.inverse_transform(encoder.fit_transform(original))

    assert_inverse_equal(recovered, original)
Пример #3
0
def test_onehot_category_specific_cases():
    """Fit a two-column frame in both column orders with dense output.

    See this for reasoning: https://github.com/rapidsai/cuml/issues/2690

    All of these cases use sparse=False, where
    test_onehot_category_class_count uses sparse=True.
    """
    # Builders are called once per case so each frame gets fresh column data.
    column_builders = {
        "low_cardinality_column": lambda: ["A"] * 200 + ["B"] * 56,
        "high_cardinality_column": lambda: cp.linspace(0, 255, 256),
    }

    column_orders = (
        # Two columns, low cardinality before high.
        ("low_cardinality_column", "high_cardinality_column"),
        # Two columns, high cardinality before low (used to fail).
        ("high_cardinality_column", "low_cardinality_column"),
    )

    for ordering in column_orders:
        frame = DataFrame()
        for column_name in ordering:
            frame[column_name] = column_builders[column_name]()

        # Only checks that fitting and transforming completes.
        OneHotEncoder(handle_unknown="ignore",
                      sparse=False).fit_transform(frame)
Пример #4
0
def test_onehot_category_class_count(total_classes: int):
    """Check the number of encoded columns for several input layouts.

    See this for reasoning: https://github.com/rapidsai/cuml/issues/2690
    All cases use sparse=True to avoid memory errors.

    A single encoder instance is re-fitted for every layout.
    """
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

    # ---- Two columns: `total_classes` distinct values plus "A"/"B" ----
    two_cols = DataFrame()
    two_cols["high_cardinality_column"] = cp.linspace(
        0, total_classes - 1, total_classes)
    two_cols["low_cardinality_column"] = (
        ["A"] * 200 + ["B"] * (total_classes - 200))

    encoded_width = encoder.fit_transform(two_cols).shape[1]
    assert encoded_width == total_classes + 2

    # ---- Three columns: one high-cardinality, two constant columns ----
    three_cols = DataFrame()
    three_cols["high_cardinality_column"] = cp.linspace(
        0, total_classes - 1, total_classes)
    three_cols["low_cardinality_column"] = ["A"] * total_classes
    three_cols["med_cardinality_column"] = ["B"] * total_classes

    encoded_width = encoder.fit_transform(three_cols).shape[1]
    assert encoded_width == total_classes + 2

    # ---- N columns, classes split evenly across them ----
    for column_count in (3, 10, 100):
        classes_per_column = int(
            math.ceil(total_classes / float(column_count))) + 1
        frame = DataFrame()

        for column_idx in range(column_count):
            # Each column gets its own contiguous range of values.
            first_value = column_idx * classes_per_column
            last_value = ((column_idx + 1) * classes_per_column) - 1
            frame[str(column_idx)] = cp.linspace(
                first_value, last_value, classes_per_column)

        encoded_width = encoder.fit_transform(frame).shape[1]
        assert encoded_width == classes_per_column * column_count
Пример #5
0
def test_onehot_vs_skonehot(as_array):
    """Compare sparse output of OneHotEncoder against SkOneHotEncoder."""
    device_input = DataFrame({'gender': ['M', 'F', 'F'], 'int': [1, 3, 2]})
    host_input = from_df_to_array(device_input)
    if as_array:
        # Array mode: feed a CuPy array to the encoder under test and its
        # NumPy copy to the reference encoder.
        device_input = _from_df_to_cupy(device_input)
        host_input = cp.asnumpy(device_input)

    actual = OneHotEncoder(sparse=True).fit_transform(device_input)
    expected = SkOneHotEncoder(sparse=True).fit_transform(host_input)

    # Both results are sparse; densify before comparing.
    cp.testing.assert_array_equal(actual.toarray(), expected.toarray())
Пример #6
0
def test_onehot_drop_idx_first(as_array):
    """Check drop='first' against SkOneHotEncoder and the inverse round-trip.
    """
    reference_input = [['c', 2, 'a'], ['b', 2, 'b']]
    samples = DataFrame({'chars': ['c', 'b'],
                         'int': [2, 2],
                         'letters': ['a', 'b']})
    if as_array:
        samples = _from_df_to_cupy(samples)
        reference_input = cp.asnumpy(samples)

    # Same configuration for the encoder under test and the reference one.
    encoder_kwargs = dict(sparse=False, drop='first', categories='auto')
    encoder = OneHotEncoder(**encoder_kwargs)
    reference_encoder = SkOneHotEncoder(**encoder_kwargs)

    encoded = encoder.fit_transform(samples)
    expected = reference_encoder.fit_transform(reference_input)
    cp.testing.assert_array_equal(encoded, expected)

    restored = encoder.inverse_transform(encoded)
    assert_inverse_equal(restored, samples)
Пример #7
0
def test_onehot_random_inputs(drop, sparse, n_samples, as_array):
    """Compare against SkOneHotEncoder on generated inputs, then round-trip.

    Inputs come from `generate_inputs_from_categories`; `drop` and `sparse`
    are passed unchanged to both encoders.
    """
    samples, reference_input = generate_inputs_from_categories(
        n_samples=n_samples, as_array=as_array)

    encoder = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    reference_encoder = SkOneHotEncoder(sparse=sparse, drop=drop,
                                        categories='auto')
    encoded = encoder.fit_transform(samples)
    expected = reference_encoder.fit_transform(reference_input)

    # Sparse results are densified before comparison.
    lhs, rhs = ((encoded.toarray(), expected.toarray()) if sparse
                else (encoded, expected))
    cp.testing.assert_array_equal(lhs, rhs)

    restored = encoder.inverse_transform(encoded)
    assert_inverse_equal(restored, samples)
Пример #8
0
def test_onehot_sparse_drop(as_array):
    """Check sparse encoding with per-column drop values.

    The encoder under test receives the drop values as a dict keyed by
    column name and the reference `SkOneHotEncoder` as a list; in array
    mode both receive the output of `_convert_drop`.
    """
    samples = DataFrame({'g': ['M', 'F', 'F'],
                         'i': [1, 3, 2],
                         'l': [5, 5, 6]})
    drop_spec = {'g': 'F', 'i': 3, 'l': 6}

    reference_input = from_df_to_array(samples)
    reference_drop = ['F', 3, 6]
    if as_array:
        samples = _from_df_to_cupy(samples)
        reference_input = cp.asnumpy(samples)
        converted_drop = _convert_drop(drop_spec)
        drop_spec = reference_drop = converted_drop

    encoder = OneHotEncoder(sparse=True, drop=drop_spec, categories='auto')
    reference_encoder = SkOneHotEncoder(sparse=True, drop=reference_drop,
                                        categories='auto')
    encoded = encoder.fit_transform(samples)
    expected = reference_encoder.fit_transform(reference_input)
    cp.testing.assert_array_equal(encoded.toarray(), expected.toarray())
Пример #9
0
def test_onehot_drop_one_of_each(as_array):
    """Drop one category per column and compare with SkOneHotEncoder.

    The encoder under test receives the drop values as a dict keyed by
    column name and the reference encoder as a list; in array mode both
    receive the output of `_convert_drop`. The dense result is also
    round-tripped through `inverse_transform`.
    """
    X = DataFrame({'chars': ['c', 'b'], 'int': [2, 2], 'letters': ['a', 'b']})
    drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
    X_ary = from_df_to_array(X)
    drop_ary = ['b', 2, 'b']
    if as_array:
        X = _from_df_to_cupy(X)
        X_ary = cp.asnumpy(X)
        drop = drop_ary = _convert_drop(drop)

    enc = OneHotEncoder(sparse=False, drop=drop, categories='auto')
    ohe = enc.fit_transform(X)
    ref = SkOneHotEncoder(sparse=False, drop=drop_ary,
                          categories='auto').fit_transform(X_ary)
    cp.testing.assert_array_equal(ohe, ref)
    inv = enc.inverse_transform(ohe)
    assert_inverse_equal(inv, X)