示例#1
0
def test_gap_encoder(hashing, init, analyzer, add_words, n_samples=70):
    """Check output shape, topic normalisation and seed reproducibility.

    The encoder is fitted on the first ``n_samples`` documents of the
    20 newsgroups training subset, with the ``hashing``, ``init``,
    ``analyzer`` and ``add_words`` settings forwarded to ``GapEncoder``.
    """
    n_topics = 10
    corpus = fetch_20newsgroups(subset='train')['data'][:n_samples]
    # Settings shared by the two encoders built below.
    common = dict(n_components=n_topics, hashing=hashing, init=init,
                  analyzer=analyzer, add_words=add_words, random_state=42)

    # Output shape: one row per sample, one column per component.
    first = GapEncoder(rescale_W=True, **common)
    first.fit(corpus)
    encoded = first.transform(corpus)
    assert encoded.shape == (n_samples, n_topics), str(encoded.shape)
    # The activations of the first sample must all be distinct values.
    assert len(set(encoded[0])) == n_topics

    # Each row of W_ must have an L1 norm of one.
    row_norms = np.abs(first.W_).sum(axis=1)
    np.testing.assert_array_almost_equal(row_norms, np.ones(n_topics))

    # A second encoder built with the same seed must give the same output.
    second = GapEncoder(**common)
    second.fit(corpus)
    np.testing.assert_array_equal(encoded, second.transform(corpus))
def _test_possibilities(
    X,
    expected_transformers_df,
    expected_transformers_2,
    expected_transformers_np_no_cast,
    expected_transformers_series,
    expected_transformers_plain,
    expected_transformers_np_cast,
):
    """
    Fit the SuperVectorizer on several representations of ``X`` and compare
    the resulting ``transformers`` with the expected ones.

    Each ``expected_*`` argument is handed to ``check_same_transformers``
    after fitting on, respectively: ``X`` itself, ``X`` with a default
    vectorizer, ``X.to_numpy()``, the ``'cat1'`` column of ``X``,
    ``X`` cast to ``object`` and that cast frame converted to numpy.
    """
    def make_vectorizer():
        # Low cardinality threshold, StandardScaler for the numeric columns.
        return SuperVectorizer(
            cardinality_threshold=4,
            # we must have n_samples = 5 >= n_components
            high_card_cat_transformer=GapEncoder(n_components=2),
            numerical_transformer=StandardScaler(),
        )

    def fit_and_check(vectorizer, data, expected):
        # Fit on `data`, then compare the transformers that were assigned.
        vectorizer.fit_transform(data)
        check_same_transformers(expected, vectorizer.transformers)

    base = make_vectorizer()
    # Warning: order-dependent
    fit_and_check(base, X, expected_transformers_df)

    # Default values: higher cardinality threshold, no numeric transformer.
    fit_and_check(SuperVectorizer(), X, expected_transformers_2)

    # Numpy input: column indices take the place of the column names.
    fit_and_check(base, X.to_numpy(), expected_transformers_np_no_cast)

    # A single pandas Series.
    fit_and_check(base, X['cat1'], expected_transformers_series)

    # Casting of values, on a fresh vectorizer with the same settings.
    caster = make_vectorizer()
    as_object = X.astype('object')
    fit_and_check(caster, as_object, expected_transformers_plain)
    fit_and_check(caster, as_object.to_numpy(), expected_transformers_np_cast)
示例#3
0
def test_analyzer(init1, analyzer1, analyzer2):
    """
    Test that the output differs when the analyzer is 'word' or 'char'.

    One encoder is fitted with ``analyzer1`` and another with ``analyzer2``,
    both using the ``init1`` initialisation and the same seed, on the same
    two-column input. The test passes when the two encodings are not equal.
    """
    add_words = False
    n_samples = 70
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    # Duplicate the text column to get a (n_samples, 2) input.
    X = np.array([X_txt, X_txt]).T
    n_components = 10
    # Test first analyzer output:
    encoder = GapEncoder(n_components=n_components,
                         # Use the `init1` argument: the initialisation was
                         # previously hard-coded, so the argument was ignored.
                         init=init1,
                         analyzer=analyzer1,
                         add_words=add_words,
                         random_state=42,
                         rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)

    # Test the other analyzer output:
    encoder = GapEncoder(n_components=n_components,
                         init=init1,
                         analyzer=analyzer2,
                         add_words=add_words,
                         random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)

    # Test inequality between the two analyzers' outputs:
    # assert_array_equal must raise an AssertionError for the test to pass.
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, y,
                             y2)
示例#4
0
def test_overflow_error():
    """Fit a GapEncoder while numpy raises on overflow and division errors.

    The input is a single column of 8000 random integers converted to
    strings. The test passes when ``fit`` completes without numpy raising
    a floating-point error.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(1e5, 1e6, size=(8000, 1)).astype(str)
    enc = GapEncoder(n_components=2, batch_size=1, min_iter=1, max_iter=1,
                     random_state=0)
    # np.errstate restores the previous error handling on exit, whereas
    # np.seterr would leave 'raise' active for the rest of the process.
    with np.errstate(over='raise', divide='raise'):
        enc.fit(X)
示例#5
0
def test_get_feature_names(n_samples=70):
    """Check that ``get_feature_names`` returns one label per component.

    A default ``GapEncoder`` is fitted on the first ``n_samples`` documents
    of the 20 newsgroups training subset.
    """
    corpus = fetch_20newsgroups(subset='train')['data'][:n_samples]
    encoder = GapEncoder()
    encoder.fit(corpus)
    # The number of labels must match the number of components.
    n_labels = len(encoder.get_feature_names())
    assert n_labels == encoder.n_components
示例#6
0
    def __init__(
        self,
        *,
        cardinality_threshold: int = 40,
        low_card_cat_transformer: Optional[Union[BaseEstimator,
                                                 str]] = OneHotEncoder(),
        high_card_cat_transformer: Optional[Union[BaseEstimator,
                                                  str]] = GapEncoder(
                                                      n_components=30),
        numerical_transformer: Optional[Union[BaseEstimator, str]] = None,
        datetime_transformer: Optional[Union[BaseEstimator, str]] = None,
        auto_cast: bool = True,
        impute_missing: str = 'auto',
        # Following parameters are inherited from ColumnTransformer
        remainder: str = 'passthrough',
        sparse_threshold: float = 0.3,
        n_jobs: Optional[int] = None,
        transformer_weights=None,
        verbose: bool = False,
    ):
        """
        Record the configuration of the vectorizer.

        All arguments are keyword-only. Each one is stored unchanged on the
        instance, under an attribute of the same name; no validation or
        other processing happens here. The parent constructor is called
        with an empty list of transformers.

        NOTE(review): the default ``OneHotEncoder()`` and
        ``GapEncoder(n_components=30)`` objects are created once, when the
        function is defined, so every instance relying on the defaults
        holds a reference to the same two objects. Presumably they are
        cloned before being fitted -- confirm against the fitting code.
        """
        # The parent starts with no transformers.
        super().__init__(transformers=[])

        # Parameters specific to this class.
        self.cardinality_threshold = cardinality_threshold
        self.low_card_cat_transformer = low_card_cat_transformer
        self.high_card_cat_transformer = high_card_cat_transformer
        self.numerical_transformer = numerical_transformer
        self.datetime_transformer = datetime_transformer
        self.auto_cast = auto_cast
        self.impute_missing = impute_missing

        # Parameters inherited from ColumnTransformer (see the signature).
        self.remainder = remainder
        self.sparse_threshold = sparse_threshold
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose
示例#7
0
def test_missing_values(missing):
    """Check how GapEncoder reacts to NaN entries for a ``handle_missing``.

    * ``'zero_impute'``: ``fit_transform`` and ``partial_fit`` must run.
    * ``'error'``: ``fit_transform`` must raise a ``ValueError`` about
      missing values.
    * anything else: ``fit_transform`` must raise a ``ValueError`` about the
      invalid option (the expected message is written for ``'aaa'``).
    """
    rows = np.array(
        [['alice', 'bob'], ['bob', 'alice'], ['bob', np.nan],
         ['alice', 'charlie'], [np.nan, 'alice']],
        dtype=object,
    )
    encoder = GapEncoder(handle_missing=missing, n_components=3)

    if missing == 'zero_impute':
        # Both fitting entry points have to accept the missing values.
        encoder.fit_transform(rows)
        encoder.partial_fit(rows)
        return

    # Every other option must fail; only the expected message differs.
    if missing == 'error':
        expected_message = r'Input data contains missing values.'
    else:
        expected_message = (r"handle_missing should be either "
                            r"'error' or 'zero_impute', got 'aaa'")
    with pytest.raises(ValueError, match=expected_message):
        encoder.fit_transform(rows)
示例#8
0
def test_score(n_samples=70):
    """Check that duplicating a column doubles the encoder's score.

    The same encoder object is fitted and scored first on a one-column
    array, then on that column stacked twice side by side.
    """
    docs = fetch_20newsgroups(subset='train')['data'][:n_samples]
    single_col = np.array(docs)[:, np.newaxis]
    double_col = np.hstack((single_col, single_col))
    encoder = GapEncoder(random_state=42)

    encoder.fit(single_col)
    single_score = encoder.score(single_col)

    encoder.fit(double_col)
    double_score = encoder.score(double_col)

    # Two identical columns must give exactly twice the single-column score.
    assert 2 * single_score == double_score
示例#9
0
def test_partial_fit(n_samples=70):
    """Compare a one-batch, one-iteration ``fit`` with ``partial_fit``.

    Both encoders use the same seed and the same ``n_samples`` documents;
    their encodings are expected to be almost equal.
    """
    corpus = fetch_20newsgroups(subset='train')['data'][:n_samples]

    # Regular fit restricted to one iteration over a single batch.
    batch_encoder = GapEncoder(random_state=42, batch_size=n_samples,
                               max_iter=1)
    fit_encoding = batch_encoder.fit_transform(corpus)

    # Same data passed through partial_fit, then transformed.
    incremental_encoder = GapEncoder(random_state=42)
    incremental_encoder.partial_fit(corpus)
    partial_encoding = incremental_encoder.transform(corpus)

    np.testing.assert_almost_equal(fit_encoding, partial_encoding)
示例#10
0
def test_get_feature_names_out(n_samples=70):
    """Check the labels from ``get_feature_names`` / ``get_feature_names_out``.

    The encoder is fitted on a two-column array (the same text column
    twice). For the labels returned by each method, the count and the
    prefixes added through ``col_names`` are verified.
    """
    docs = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([docs, docs]).T
    encoder = GapEncoder(random_state=42)
    encoder.fit(X)

    label_sets = (encoder.get_feature_names(),
                  encoder.get_feature_names_out())
    for labels in label_sets:
        # One label per component and per input column.
        assert len(labels) == encoder.n_components * X.shape[1]
        # Automatic column names are prefixed with 'col<index>: '.
        auto_prefixed = encoder.get_feature_names_out(col_names='auto')
        assert auto_prefixed[0] == 'col0: ' + labels[0]
        # Explicit column names are used as the prefix.
        custom_prefixed = encoder.get_feature_names_out(
            col_names=['abc', 'def'])
        assert custom_prefixed[0] == 'abc: ' + labels[0]
示例#11
0
def test_input_type():
    """Check that a numpy array and a list give the same encoding.

    The same two strings are encoded twice, each time with a fresh
    ``GapEncoder`` built with identical settings.
    """
    names = ['alice', 'bob']
    # First the numpy array, then the plain list.
    encodings = [
        GapEncoder(n_components=2, random_state=42).fit_transform(data)
        for data in (np.array(names), names)
    ]
    from_array, from_list = encodings
    np.testing.assert_array_equal(from_array, from_list)
def test_with_arrays():
    """
    Check that the SuperVectorizer accepts a numpy array and a list of lists.

    The same vectorizer is fitted on both inputs in turn, and must end up
    with the same index-based column assignment each time.
    """
    vectorizer = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    # Columns are identified by their position in the input.
    expected = {
        'numeric': [0, 1],
        'low_card_cat': [2, 4],
        'high_card_cat': [3, 5],
    }

    for build_input in (_get_numpy_array, _get_list_of_lists):
        data = build_input()
        vectorizer.fit_transform(data)
        check_same_transformers(expected, vectorizer.transformers)
示例#13
0
# Position Title' column, as this column contains 400 different entries:
import numpy as np
np.unique(y)

# %%
# We will now experiment with encoders specially made for handling
# dirty columns
from dirty_cat import SimilarityEncoder, TargetEncoder, MinHashEncoder,\
    GapEncoder

encoders = {
    'one-hot': one_hot,
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'minhash': MinHashEncoder(n_components=100),
    'gap': GapEncoder(n_components=100),
}

# %%
# We now loop over the different encoding methods,
# instantiate a new |Pipeline| each time, fit it
# and store the returned cross-validation score:

from sklearn.model_selection import cross_val_score

all_scores = dict()

for name, method in encoders.items():
    encoder = make_column_transformer(
        (one_hot, ['gender', 'department_name', 'assignment_category']),
        ('passthrough', ['year_first_hired']),
print(employee_salaries.description)

###############################################################################
# Now, we retrieve the dirty column to encode:
dirty_column = 'employee_position_title'
X_dirty = employee_salaries.X[[dirty_column]]
print(X_dirty.head(), end='\n\n')
print(f'Number of dirty entries = {len(X_dirty)}')

###############################################################################
# Encoding dirty job titles
# -------------------------
#
# We first create an instance of the GapEncoder with n_components=10:
from dirty_cat import GapEncoder
enc = GapEncoder(n_components=10, random_state=42)

###############################################################################
# Then we fit the model on the dirty categorical data and transform it to
# obtain encoded vectors of size 10:
X_enc = enc.fit_transform(X_dirty)
print(f'Shape of encoded vectors = {X_enc.shape}')

###############################################################################
# Interpreting encoded vectors
# ----------------------------
#
# The GapEncoder can be understood as a continuous encoding on a set of latent
# topics estimated from the data. The latent topics are built by
# capturing combinations of substrings that frequently co-occur, and encoded
# vectors correspond to their activations.
示例#15
0
def test_input_type():
    """Check that equivalent containers give identical encodings.

    Compared pairs: a one-column numpy array against a list of lists, and a
    two-column numpy array against a pandas DataFrame built from it. Every
    encoding comes from a fresh ``GapEncoder`` with the same settings.
    """
    def encode(data):
        # New encoder for each input, always with the same seed.
        return GapEncoder(n_components=2, random_state=42).fit_transform(data)

    # One column: numpy array vs. list of lists.
    one_col = [['alice'], ['bob']]
    np.testing.assert_array_equal(encode(np.array(one_col)), encode(one_col))

    # Two columns: numpy array vs. pandas DataFrame.
    two_cols = np.array([['alice', 'charlie'], ['bob', 'delta']])
    np.testing.assert_array_equal(encode(two_cols),
                                  encode(pd.DataFrame(two_cols)))
示例#16
0
def test_super_vectorizer():
    """Check which columns the SuperVectorizer gives to each transformer.

    A small DataFrame is fitted as-is, as a numpy array, as a single Series
    and after a cast to ``object``; after each fit the ``transformers``
    attribute is compared with the expected column assignment.
    """
    def build_vectorizer(**extra):
        # Low cardinality threshold, StandardScaler for the numeric columns.
        return SuperVectorizer(
            cardinality_threshold=3,
            # we must have n_samples = 5 >= n_components
            high_card_str_transformer=GapEncoder(n_components=2),
            high_card_cat_transformer=GapEncoder(n_components=2),
            numerical_transformer=StandardScaler(),
            **extra
        )

    def fit_and_check(vectorizer, data, expected):
        # Fit on `data`, then compare the transformers that were assigned.
        vectorizer.fit_transform(data)
        check_same_transformers(expected, vectorizer.transformers)

    # Create a simple DataFrame
    frame = _get_dataframe()
    # With numpy input, column indices take the place of the column names.
    by_index = {
        'numeric': [0, 1],
        'low_card_str': [2, 4],
        'high_card_str': [3, 5],
    }

    base = build_vectorizer()
    # Warning: order-dependent
    fit_and_check(base, frame, {
        'numeric': ['int', 'float'],
        'low_card_str': ['str1'],
        'high_card_str': ['str2'],
        'low_card_cat': ['cat1'],
        'high_card_cat': ['cat2'],
    })

    # Default values: higher cardinality threshold, no numeric transformer.
    fit_and_check(SuperVectorizer(), frame, {
        'low_card_str': ['str1', 'str2'],
        'low_card_cat': ['cat1', 'cat2'],
    })

    # Numpy array built from the DataFrame.
    fit_and_check(base, frame.to_numpy(), by_index)

    # A single pandas Series.
    fit_and_check(base, frame['cat1'], {
        'low_card_cat': ['cat1'],
    })

    # Casting of values, on a fresh vectorizer with auto_cast set explicitly.
    caster = build_vectorizer(auto_cast=True)
    as_object = frame.astype('object')
    # With pandas
    fit_and_check(caster, as_object, {
        'high_card_str': ['str2', 'cat2'],
        'low_card_str': ['str1', 'cat1'],
        'numeric': ['int', 'float']
    })
    # With numpy
    fit_and_check(caster, as_object.to_numpy(), by_index)