def test_gap_encoder(hashing, init, analyzer, add_words, n_samples=70):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_samples]
    n_components = 10
    # Test the output shape
    encoder = GapEncoder(
        n_components=n_components, hashing=hashing, init=init,
        analyzer=analyzer, add_words=add_words, random_state=42,
        rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)
    assert y.shape == (n_samples, n_components), str(y.shape)
    assert len(set(y[0])) == n_components

    # Test the L1 norm of the topics W
    l1_norm_W = np.abs(encoder.W_).sum(axis=1)
    np.testing.assert_array_almost_equal(l1_norm_W, np.ones(n_components))

    # Test that the same random seed returns the same output
    encoder = GapEncoder(
        n_components=n_components, hashing=hashing, init=init,
        analyzer=analyzer, add_words=add_words, random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)
    np.testing.assert_array_equal(y, y2)

def _test_possibilities(
    X,
    expected_transformers_df,
    expected_transformers_2,
    expected_transformers_np_no_cast,
    expected_transformers_series,
    expected_transformers_plain,
    expected_transformers_np_cast,
):
    """
    Run a battery of tests on the SuperVectorizer.
    The expected transformers are passed as arguments;
    they are usually lists or dictionaries.
    """
    # Test with low cardinality and a StandardScaler for the numeric columns
    vectorizer_base = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    # Warning: order-dependent
    vectorizer_base.fit_transform(X)
    check_same_transformers(expected_transformers_df,
                            vectorizer_base.transformers)

    # Test with a higher cardinality threshold and no numeric transformer
    vectorizer_default = SuperVectorizer()  # Using default values
    vectorizer_default.fit_transform(X)
    check_same_transformers(expected_transformers_2,
                            vectorizer_default.transformers)

    # Test with a numpy array
    arr = X.to_numpy()
    # Instead of the column names, we'll have the column indices.
    vectorizer_base.fit_transform(arr)
    check_same_transformers(expected_transformers_np_no_cast,
                            vectorizer_base.transformers)

    # Test with a pandas Series
    vectorizer_base.fit_transform(X['cat1'])
    check_same_transformers(expected_transformers_series,
                            vectorizer_base.transformers)

    # Test casting values
    vectorizer_cast = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    X_str = X.astype('object')
    # With pandas
    vectorizer_cast.fit_transform(X_str)
    check_same_transformers(expected_transformers_plain,
                            vectorizer_cast.transformers)
    # With numpy
    vectorizer_cast.fit_transform(X_str.to_numpy())
    check_same_transformers(expected_transformers_np_cast,
                            vectorizer_cast.transformers)

def test_analyzer(init1, analyzer1, analyzer2):
    """
    Test that the outputs differ when the analyzer is 'word' vs. 'char'.
    If they do, no error is raised.
    """
    add_words = False
    n_samples = 70
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([X_txt, X_txt]).T
    n_components = 10
    # Test the first analyzer's output:
    encoder = GapEncoder(
        n_components=n_components, init=init1, analyzer=analyzer1,
        add_words=add_words, random_state=42, rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)

    # Test the other analyzer's output:
    encoder = GapEncoder(
        n_components=n_components, init=init1, analyzer=analyzer2,
        add_words=add_words, random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)

    # Check the inequality between the 'word' and 'char' analyzer outputs:
    np.testing.assert_raises(
        AssertionError, np.testing.assert_array_equal, y, y2)

def test_overflow_error():
    np.seterr(over='raise', divide='raise')
    r = np.random.RandomState(0)
    X = r.randint(100_000, 1_000_000, size=(8000, 1)).astype(str)
    enc = GapEncoder(
        n_components=2, batch_size=1, min_iter=1, max_iter=1, random_state=0)
    enc.fit(X)

def test_get_feature_names(n_samples=70):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_samples]
    enc = GapEncoder()
    enc.fit(X)
    topic_labels = enc.get_feature_names()
    # Check the number of labels: one per component
    assert len(topic_labels) == enc.n_components

def __init__(
        self, *,
        cardinality_threshold: int = 40,
        low_card_cat_transformer: Optional[Union[BaseEstimator, str]] = OneHotEncoder(),
        high_card_cat_transformer: Optional[Union[BaseEstimator, str]] = GapEncoder(n_components=30),
        numerical_transformer: Optional[Union[BaseEstimator, str]] = None,
        datetime_transformer: Optional[Union[BaseEstimator, str]] = None,
        auto_cast: bool = True,
        impute_missing: str = 'auto',
        # The following parameters are inherited from ColumnTransformer
        remainder: str = 'passthrough',
        sparse_threshold: float = 0.3,
        n_jobs: int = None,
        transformer_weights=None,
        verbose: bool = False,
):
    super().__init__(transformers=[])
    self.cardinality_threshold = cardinality_threshold
    self.low_card_cat_transformer = low_card_cat_transformer
    self.high_card_cat_transformer = high_card_cat_transformer
    self.numerical_transformer = numerical_transformer
    self.datetime_transformer = datetime_transformer
    self.auto_cast = auto_cast
    self.impute_missing = impute_missing
    self.remainder = remainder
    self.sparse_threshold = sparse_threshold
    self.n_jobs = n_jobs
    self.transformer_weights = transformer_weights
    self.verbose = verbose

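# A minimal usage sketch of the constructor above (illustrative, not from the
# library's documentation): the toy DataFrame and its column names are
# assumptions. With cardinality_threshold=4, 'city' (3 unique values) is
# routed to the low-cardinality OneHotEncoder, 'job' (5 unique values) to the
# high-cardinality GapEncoder, and 'age' to the StandardScaler.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from dirty_cat import SuperVectorizer, GapEncoder

df = pd.DataFrame({
    'city': ['Paris', 'London', 'Paris', 'Rome', 'London'],
    'job': ['senior dev', 'sr. developer', 'intern', 'CEO', 'cto'],
    'age': [34, 28, 22, 51, 40],
})
sv = SuperVectorizer(
    cardinality_threshold=4,
    high_card_cat_transformer=GapEncoder(n_components=2),
    numerical_transformer=StandardScaler(),
)
X_trans = sv.fit_transform(df)  # a single encoded matrix for all columns
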
def test_missing_values(missing):
    observations = [
        ['alice', 'bob'],
        ['bob', 'alice'],
        ['bob', np.nan],
        ['alice', 'charlie'],
        [np.nan, 'alice'],
    ]
    observations = np.array(observations, dtype=object)
    enc = GapEncoder(handle_missing=missing, n_components=3)
    if missing == 'error':
        with pytest.raises(ValueError,
                           match=r'Input data contains missing values.'):
            enc.fit_transform(observations)
    elif missing == 'zero_impute':
        enc.fit_transform(observations)
        enc.partial_fit(observations)
    else:
        with pytest.raises(ValueError,
                           match=r"handle_missing should be either "
                                 r"'error' or 'zero_impute', got 'aaa'"):
            enc.fit_transform(observations)

def test_score(n_samples=70):
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X1 = np.array(X_txt)[:, None]
    X2 = np.hstack([X1, X1])
    enc = GapEncoder(random_state=42)
    enc.fit(X1)
    score_X1 = enc.score(X1)
    enc.fit(X2)
    score_X2 = enc.score(X2)
    # Check that the score of two identical columns is twice the score
    # of a single column, i.e. that scores add up across columns
    assert score_X1 * 2 == score_X2

def test_partial_fit(n_samples=70):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_samples]
    # Gap encoder with fit on one batch
    enc = GapEncoder(random_state=42, batch_size=n_samples, max_iter=1)
    X_enc = enc.fit_transform(X)
    # Gap encoder with partial fit
    enc = GapEncoder(random_state=42)
    enc.partial_fit(X)
    X_enc_partial = enc.transform(X)
    # Check that the encoded vectors are the same
    np.testing.assert_almost_equal(X_enc, X_enc_partial)

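# A minimal sketch (not part of the test suite) of how partial_fit supports
# batch-wise fitting, e.g. for data that does not fit in memory; the batch
# size of 20 is an arbitrary choice for illustration:
X_txt = fetch_20newsgroups(subset='train')['data'][:60]
enc = GapEncoder(n_components=10, random_state=42)
for start in range(0, len(X_txt), 20):
    # Each call updates the fitted topics with a new batch of samples
    enc.partial_fit(X_txt[start:start + 20])
X_enc = enc.transform(X_txt)
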
def test_get_feature_names_out(n_samples=70):
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    X = np.array([X_txt, X_txt]).T
    enc = GapEncoder(random_state=42)
    enc.fit(X)
    for topic_labels in [enc.get_feature_names(),
                         enc.get_feature_names_out()]:
        # Check the number of labels: n_components per column
        assert len(topic_labels) == enc.n_components * X.shape[1]
    # Test different values of the col_names parameter
    topic_labels_2 = enc.get_feature_names_out(col_names='auto')
    assert topic_labels_2[0] == 'col0: ' + topic_labels[0]
    topic_labels_3 = enc.get_feature_names_out(col_names=['abc', 'def'])
    assert topic_labels_3[0] == 'abc: ' + topic_labels[0]

def test_input_type():
    # Numpy array
    X = np.array(['alice', 'bob'])
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_array = enc.fit_transform(X)
    # List
    X = ['alice', 'bob']
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_list = enc.fit_transform(X)
    # Check that the encoded vectors are the same
    np.testing.assert_array_equal(X_enc_array, X_enc_list)

def test_with_arrays():
    """
    Check that the SuperVectorizer works if we input
    a list of lists or a numpy array.
    """
    expected_transformers = {
        'numeric': [0, 1],
        'low_card_cat': [2, 4],
        'high_card_cat': [3, 5],
    }
    vectorizer = SuperVectorizer(
        cardinality_threshold=4,
        # we must have n_samples = 5 >= n_components
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )

    X = _get_numpy_array()
    vectorizer.fit_transform(X)
    check_same_transformers(expected_transformers, vectorizer.transformers)

    X = _get_list_of_lists()
    vectorizer.fit_transform(X)
    check_same_transformers(expected_transformers, vectorizer.transformers)

# Position Title' column, as this column contains 400 different entries:
import numpy as np

np.unique(y)

# %%
# We will now experiment with encoders specially made for handling
# dirty columns
from dirty_cat import (SimilarityEncoder, TargetEncoder,
                       MinHashEncoder, GapEncoder)

encoders = {
    'one-hot': one_hot,
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'minhash': MinHashEncoder(n_components=100),
    'gap': GapEncoder(n_components=100),
}

# %%
# We now loop over the different encoding methods,
# instantiate a new |Pipeline| each time, fit it,
# and store the returned cross-validation score:
from sklearn.model_selection import cross_val_score

all_scores = dict()
for name, method in encoders.items():
    encoder = make_column_transformer(
        (one_hot, ['gender', 'department_name', 'assignment_category']),
        ('passthrough', ['year_first_hired']),
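        # -- The continuation below is a sketch, not the original example's
        # exact code: the dirty column name is taken from the dataset used
        # here, and remainder='drop' and the gradient-boosting estimator
        # are illustrative, assumed choices.
        (method, ['employee_position_title']),
        remainder='drop')
    # Assumed imports for this sketch:
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import HistGradientBoostingRegressor
    pipeline = make_pipeline(encoder, HistGradientBoostingRegressor())
    all_scores[name] = cross_val_score(pipeline, X, y)
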
print(employee_salaries.description)

###############################################################################
# Now, we retrieve the dirty column to encode:

dirty_column = 'employee_position_title'
X_dirty = employee_salaries.X[[dirty_column]]
print(X_dirty.head(), end='\n\n')
print(f'Number of dirty entries = {len(X_dirty)}')

###############################################################################
# Encoding dirty job titles
# -------------------------
#
# We first create an instance of the GapEncoder with n_components=10:

from dirty_cat import GapEncoder

enc = GapEncoder(n_components=10, random_state=42)

###############################################################################
# Then we fit the model on the dirty categorical data and transform it to
# obtain encoded vectors of size 10:

X_enc = enc.fit_transform(X_dirty)
print(f'Shape of encoded vectors = {X_enc.shape}')

###############################################################################
# Interpreting encoded vectors
# ----------------------------
#
# The GapEncoder can be understood as a continuous encoding on a set of latent
# topics estimated from the data. The latent topics are built by
# capturing combinations of substrings that frequently co-occur, and the
# encoded vectors correspond to their activations.
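
###############################################################################
# As a minimal sketch of how to inspect these topics (reusing the encoder
# fitted above), ``get_feature_names`` returns one human-readable label per
# latent topic, which we can simply print:

topic_labels = enc.get_feature_names()
for k, label in enumerate(topic_labels):
    print(f'Topic {k}: {label}')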
def test_input_type():
    # Numpy array with one column
    X = np.array([['alice'], ['bob']])
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_array = enc.fit_transform(X)
    # List
    X2 = [['alice'], ['bob']]
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_list = enc.fit_transform(X2)
    # Check that the encoded vectors are the same
    np.testing.assert_array_equal(X_enc_array, X_enc_list)

    # Numpy array with two columns
    X = np.array([['alice', 'charlie'], ['bob', 'delta']])
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_array = enc.fit_transform(X)
    # Pandas dataframe with two columns
    df = pd.DataFrame(X)
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_df = enc.fit_transform(df)
    # Check that the encoded vectors are the same
    np.testing.assert_array_equal(X_enc_array, X_enc_df)

def test_super_vectorizer():
    # Create a simple DataFrame
    X = _get_dataframe()

    # Test with low cardinality and a StandardScaler for the numeric columns
    vectorizer_base = SuperVectorizer(
        cardinality_threshold=3,
        # we must have n_samples = 5 >= n_components
        high_card_str_transformer=GapEncoder(n_components=2),
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    # Warning: order-dependent
    expected_transformers_df = {
        'numeric': ['int', 'float'],
        'low_card_str': ['str1'],
        'high_card_str': ['str2'],
        'low_card_cat': ['cat1'],
        'high_card_cat': ['cat2'],
    }
    vectorizer_base.fit_transform(X)
    check_same_transformers(expected_transformers_df,
                            vectorizer_base.transformers)

    # Test with a higher cardinality threshold and no numeric transformer
    vectorizer_default = SuperVectorizer()  # Using default values
    expected_transformers_2 = {
        'low_card_str': ['str1', 'str2'],
        'low_card_cat': ['cat1', 'cat2'],
    }
    vectorizer_default.fit_transform(X)
    check_same_transformers(expected_transformers_2,
                            vectorizer_default.transformers)

    # Test with a numpy array
    arr = X.to_numpy()
    # Instead of the column names, we'll have the column indices.
    expected_transformers_np = {
        'numeric': [0, 1],
        'low_card_str': [2, 4],
        'high_card_str': [3, 5],
    }
    vectorizer_base.fit_transform(arr)
    check_same_transformers(expected_transformers_np,
                            vectorizer_base.transformers)

    # Test with a pandas Series
    expected_transformers_series = {
        'low_card_cat': ['cat1'],
    }
    vectorizer_base.fit_transform(X['cat1'])
    check_same_transformers(expected_transformers_series,
                            vectorizer_base.transformers)

    # Test casting values
    vectorizer_cast = SuperVectorizer(
        cardinality_threshold=3,
        auto_cast=True,
        # we must have n_samples = 5 >= n_components
        high_card_str_transformer=GapEncoder(n_components=2),
        high_card_cat_transformer=GapEncoder(n_components=2),
        numerical_transformer=StandardScaler(),
    )
    X_str = X.astype('object')
    expected_transformers_plain = {
        'high_card_str': ['str2', 'cat2'],
        'low_card_str': ['str1', 'cat1'],
        'numeric': ['int', 'float'],
    }
    # With pandas
    vectorizer_cast.fit_transform(X_str)
    check_same_transformers(expected_transformers_plain,
                            vectorizer_cast.transformers)
    # With numpy
    vectorizer_cast.fit_transform(X_str.to_numpy())
    check_same_transformers(expected_transformers_np,
                            vectorizer_cast.transformers)