def test_labelencoder_unfitted(client):
    """
    Calling `.transform()` on an encoder that was never fitted must raise
    `NotFittedError`.

    The input is a dask_cudf series of 10 random integers drawn from
    `range(10)`, split into one partition per entry of
    `client.has_what()`.
    """
    values = cudf.Series(np.random.choice(10, (10, )))
    n_parts = len(client.has_what())
    ddf = dask_cudf.from_cudf(values, npartitions=n_parts)

    encoder = LabelEncoder()

    with pytest.raises(NotFittedError):
        encoder.transform(ddf).compute()
def test_unfitted_inverse_transform(client):
    """
    Calling `.inverse_transform()` on an encoder that was never fitted
    must raise `NotFittedError`.

    The input is a dask_cudf series of 10 random integers drawn from
    `range(10)`, split into one partition per entry of
    `client.has_what()`.
    """
    tmp = cudf.Series(np.random.choice(10, (10, )))
    df = dask_cudf.from_cudf(tmp, npartitions=len(client.has_what()))
    le = LabelEncoder()

    with pytest.raises(NotFittedError):
        # Exercise inverse_transform (the method this test is named for),
        # not transform, which test_labelencoder_unfitted already covers.
        # `.compute()` makes sure the error is surfaced even if the check
        # were deferred to graph execution.
        le.inverse_transform(df).compute()
def test_labelencoder_unseen(client):
    """
    Encoding values that were not present during fitting must raise
    `KeyError`.

    The encoder is fitted on 10 random integers drawn from `range(10)`,
    then asked to transform `[-100, -120]`, which can never be part of
    the fitted data.
    """
    n_parts = len(client.has_what())
    df = dask_cudf.from_cudf(cudf.Series(np.random.choice(10, (10, ))),
                             npartitions=n_parts)
    le = LabelEncoder().fit(df)
    assert le._fitted

    # Build the unseen input outside the `raises` block so that only the
    # transform itself can satisfy the expected KeyError; a KeyError from
    # fixture construction should fail the test, not pass it.
    unseen = dask_cudf.from_cudf(cudf.Series([-100, -120]),
                                 npartitions=n_parts)

    with pytest.raises(KeyError):
        le.transform(unseen).compute()
def test_labelencoder_transform(length, cardinality, client):
    """
    Fit an encoder on a random series, encode that same series, and
    check that the encoding preserves which elements are equal to which.

    The input holds `length` random integers drawn from
    `range(cardinality)`, split into one partition per entry of
    `client.has_what()`.
    """
    source = cudf.Series(np.random.choice(cardinality, (length, )))
    n_parts = len(client.has_what())
    ddf = dask_cudf.from_cudf(source, npartitions=n_parts)

    encoder = LabelEncoder().fit(ddf)
    assert encoder._fitted

    labels = encoder.transform(ddf)

    # NOTE(review): `_arr_to_similarity_mat` is defined elsewhere; it
    # presumably expands a 1-D array into a square matrix so that
    # `mat == mat.T` gives pairwise element equality -- confirm.
    original_sim = _arr_to_similarity_mat(ddf.compute().to_numpy())
    labels_host = cp.asnumpy(labels.compute().to_numpy())
    encoded_sim = _arr_to_similarity_mat(labels_host)

    # Two inputs must share a label exactly when they share a value.
    same_label = encoded_sim == encoded_sim.T
    same_value = original_sim == original_sim.T
    assert (same_label == same_value).all()