예제 #1
0
def test_labelencoder_unseen():
    """Transforming a label that was absent at fit time must raise KeyError.

    The encoder is fitted on values drawn from ``range(10)``, so ``-1`` can
    never be among the fitted labels.
    """
    fitted_values = cudf.Series(np.random.choice(10, (10, )))
    encoder = LabelEncoder()
    encoder = encoder.fit(fitted_values)
    assert encoder._fitted

    unseen = cudf.Series([-1])
    with pytest.raises(KeyError):
        encoder.transform(unseen)
예제 #2
0
def test_labelencoder_unfitted():
    """Calling `.transform()` on a never-fitted encoder must raise
    NotFittedError.
    """
    encoder = LabelEncoder()
    # A freshly constructed encoder must not report itself as fitted.
    assert not encoder._fitted

    values = cudf.Series(np.random.choice(10, (10, )))
    with pytest.raises(NotFittedError):
        encoder.transform(values)
예제 #3
0
def test_unfitted_inverse_transform():
    """ Try calling `.inverse_transform()` without fitting first

    The encoder is constructed but never fitted, so decoding any series of
    ordinals is expected to fail.
    """
    df = cudf.Series(np.random.choice(10, (10, )))
    le = LabelEncoder()
    assert not le._fitted

    # Exercise the method this test is named after; the previous body called
    # `.transform()`, leaving the unfitted `.inverse_transform()` path
    # untested.
    # NOTE(review): RuntimeError is kept from the original expectation;
    # confirm it matches what the installed LabelEncoder raises when unfitted.
    with pytest.raises(RuntimeError):
        le.inverse_transform(df)
예제 #4
0
def test_masked_encode():
    """Check that filter-then-encode agrees with encode-then-filter.

    Rows with ``filter_col == 1`` are encoded once after being selected and
    once as part of the whole frame; the resulting ``cat_col`` codes for
    those rows must be equal.
    """
    frame = cudf.DataFrame({
        "filter_col": [1, 1, 2, 3, 1, 1, 1, 1, 6, 5],
        "cat_col": ['a', 'b', 'c', 'd', 'a', 'a', 'a', 'c', 'b', 'c']
    })

    # Path 1: select the rows first, then encode only those rows.
    masked = frame[frame["filter_col"] == 1]
    masked_codes = LabelEncoder().fit_transform(masked["cat_col"])
    masked["cat_col"] = masked_codes

    # Path 2: encode the whole column, then select the same rows.
    all_codes = LabelEncoder().fit_transform(frame["cat_col"])
    frame["cat_col"] = all_codes
    frame = frame[frame["filter_col"] == 1]

    codes_match = masked["cat_col"] == frame["cat_col"]
    assert codes_match.all()
예제 #5
0
def test_labelencoder_transform(length, cardinality):
    """Fit on a full series, then encode only its first half.

    The pairwise-equality pattern of the encoded half must match the
    pairwise-equality pattern of the original half.
    """
    values = cudf.Series(np.random.choice(cardinality, (length, )))
    encoder = LabelEncoder().fit(values)
    assert encoder._fitted

    half = values.shape[0] // 2
    first_half = values.iloc[0:half]
    codes = encoder.transform(first_half)

    original_mat = _df_to_similarity_mat(first_half)
    codes_mat = _df_to_similarity_mat(codes)

    # Two entries share a code exactly when they share an original value.
    codes_equal = codes_mat == codes_mat.T
    original_equal = original_mat == original_mat.T
    assert (codes_equal == original_equal).all()
예제 #6
0
def test_labelencoder_fit_transform(length, cardinality):
    """Encode a whole series with `fit_transform` and compare equality
    patterns of the encoded and original values.
    """
    values = cudf.Series(np.random.choice(cardinality, (length, )))
    encoder = LabelEncoder()
    codes = encoder.fit_transform(values)

    original_mat = _df_to_similarity_mat(values)
    codes_mat = _df_to_similarity_mat(codes)

    # Two entries share a code exactly when they share an original value.
    codes_equal = codes_mat == codes_mat.T
    original_equal = original_mat == original_mat.T
    assert (codes_equal == original_equal).all()
예제 #7
0
def test_masked_encode():
    """Encoding a filtered view must match encoding a freshly built frame.

    The rows with ``filter_col == 1`` are selected from the DataFrame and
    encoded; the same rows are also rebuilt from the plain Python lists into
    a new DataFrame and encoded. Both encodings must be identical.
    """
    int_values = [3, 1, 1, 2, 1, 1, 1, 1, 6, 5]
    cat_values = ['a', 'd', 'b', 'c', 'd', 'd', 'd', 'c', 'b', 'c']
    df = cudf.DataFrame({"filter_col": int_values,
                         "cat_col": cat_values})

    df_filter = df[df["filter_col"] == 1]
    df_filter["cat_col"] = LabelEncoder().fit_transform(df_filter["cat_col"])

    # Rebuild the filtered rows on the host, pairing each int with its label.
    kept_rows = [(number, label)
                 for number, label in zip(int_values, cat_values)
                 if number == 1]
    filtered_int_values = [number for number, _ in kept_rows]
    filtered_cat_values = [label for _, label in kept_rows]
    df_test = cudf.DataFrame({"filter_col": filtered_int_values,
                              "cat_col": filtered_cat_values})
    df_test["cat_col"] = LabelEncoder().fit_transform(df_test["cat_col"])

    codes_match = df_filter["cat_col"].values == df_test["cat_col"].values
    assert codes_match.all()
예제 #8
0
def test_empty_input(empty, ord_label):
    """Fit an encoder on empty input and check its behaviour.

    After fitting on `empty`, `inverse_transform(ord_label)` must raise a
    ValueError about unseen labels, and `fit_transform(empty)` must return
    an empty result.
    """
    # Fitting on empty input still marks the encoder as fitted.
    encoder = LabelEncoder()
    encoder.fit(empty)
    assert encoder._fitted is True

    # Decoding must fail with the "unseen label" ValueError.
    with pytest.raises(ValueError, match='y contains previously unseen label'):
        encoder.inverse_transform(ord_label)

    # fit_transform() on a fresh encoder returns an empty result.
    encoder = LabelEncoder()
    result = encoder.fit_transform(empty)
    assert encoder._fitted is True
    assert len(result) == 0
예제 #9
0
def test_inverse_transform(orig_label, ord_label, expected_reverted,
                           bad_ord_label, use_fit_transform):
    """Check `inverse_transform` on both valid and invalid ordinals.

    The encoder is fitted on `orig_label` via `fit_transform` or `fit`,
    depending on `use_fit_transform`. Decoding `ord_label` must reproduce
    `expected_reverted`, and decoding `bad_ord_label` must raise a
    ValueError about unseen labels.
    """
    encoder = LabelEncoder()
    fit_method = encoder.fit_transform if use_fit_transform else encoder.fit
    fit_method(orig_label)
    assert encoder._fitted is True

    # Decoded labels must have the expected length and match element-wise.
    restored = encoder.inverse_transform(ord_label)
    assert len(restored) == len(expected_reverted)
    matching = restored[restored == expected_reverted]
    assert len(restored) == len(matching)

    # Ordinals outside the fitted range must raise ValueError.
    with pytest.raises(ValueError, match='y contains previously unseen label'):
        encoder.inverse_transform(bad_ord_label)
예제 #10
0
def test_labelencoder_fit_transform_cupy_numpy(length, cardinality, dtype):
    """ Try encoding a cupy array, or a numpy array when `dtype == 'numpy'`

    The pairwise-equality pattern of the encoded values must match the
    pairwise-equality pattern of the input values.
    """
    x = cp.random.choice(cardinality, (length,))
    if dtype == 'numpy':
        # `.get()` copies the cupy array to a host numpy array.
        x = x.get()
    encoded = LabelEncoder().fit_transform(x)

    x_arr = _array_to_similarity_mat(x)
    encoded_arr = _array_to_similarity_mat(encoded.values)
    if dtype == 'numpy':
        # Bring the encoded matrix to the host so both sides of the final
        # comparison are numpy arrays.
        encoded_arr = encoded_arr.get()
    # Compare the similarity matrix with its own transpose on both sides.
    # The previous code compared the raw `x` against `x_arr.T`, which depended
    # on implicit broadcasting against the helper's output shape.
    assert ((encoded_arr == encoded_arr.T) == (x_arr == x_arr.T)).all()