def test_pipeline_feature_generator(generator_helper, data_helper):
    """Fit a full multi-generator pipeline on the multi-feature dataset and
    verify inferred metadata plus transformed values for every feature family."""
    # Given
    input_data = data_helper.generate_multi_feature_full()

    toy_vectorizer = CountVectorizer(min_df=2,
                                     ngram_range=(1, 3),
                                     max_features=10,
                                     dtype=np.uint8)
    text_ngram_feature_generator = TextNgramFeatureGenerator(
        vectorizer=toy_vectorizer)
    # Necessary in test to avoid CI non-deterministically pruning ngram counts.
    text_ngram_feature_generator.max_memory_ratio = None

    generator = PipelineFeatureGenerator(generators=[[
        IdentityFeatureGenerator(
            infer_features_in_args=dict(valid_raw_types=[R_INT, R_FLOAT])),
        CategoryFeatureGenerator(),
        DatetimeFeatureGenerator(),
        TextSpecialFeatureGenerator(),
        text_ngram_feature_generator,
    ]])

    # Metadata expected on the pipeline's input side.
    expected_feature_metadata_in_full = {
        ('category', ()): ['cat'],
        ('datetime', ()): ['datetime'],
        ('float', ()): ['float'],
        ('int', ()): ['int'],
        ('object', ()): ['obj'],
        ('object', ('datetime_as_object', )): ['datetime_as_object'],
        ('object', ('text', )): ['text'],
    }
    # Metadata expected on the pipeline's output side.
    expected_feature_metadata_full = {
        ('category', ()): ['obj', 'cat'],
        ('float', ()): ['float'],
        ('int', ()): ['int'],
        ('int', ('binned', 'text_special')): [
            'text.char_count',
            'text.word_count',
            'text.lower_ratio',
            'text.special_ratio',
            'text.symbol_ratio. ',
        ],
        ('int', ('datetime_as_int', )): ['datetime', 'datetime_as_object'],
        ('int', ('text_ngram', )): [
            '__nlp__.breaks',
            '__nlp__.end',
            '__nlp__.end of',
            '__nlp__.end of the',
            '__nlp__.of',
            '__nlp__.sentence',
            '__nlp__.sentence breaks',
            '__nlp__.the',
            '__nlp__.the end',
            '__nlp__.world',
            '__nlp__._total_',
        ],
    }

    expected_output_data_feat_datetime = [
        1533140820000000000,
        -9223372036854775808,
        -9223372036854775808,
        1524238620000000000,
        1524238620000000000,
        -5364662400000000000,
        7289654340000000000,
        1597475520000000000,
        1608257520000000000,
    ]
    expected_output_data_feat_lower_ratio = [3, 2, 0, 3, 3, 3, 3, 3, 1]
    expected_output_data_feat_total = [1, 3, 0, 0, 7, 1, 3, 7, 3]

    # When
    output_data = generator_helper.fit_transform_assert(
        input_data=input_data,
        generator=generator,
        expected_feature_metadata_in_full=expected_feature_metadata_in_full,
        expected_feature_metadata_full=expected_feature_metadata_full,
    )

    # int and float checks: identity generator passes them through untouched.
    assert output_data['int'].equals(input_data['int'])
    assert output_data['float'].equals(input_data['float'])

    # object and category checks
    assert list(output_data['obj'].values) == [1, 2, 1, 4, 4, 4, 3, 0, 0]
    assert list(
        output_data['cat'].values) == [0, 1, 0, 3, 3, 3, 2, np.nan, np.nan]

    # datetime checks: both datetime columns must encode identically.
    assert list(output_data['datetime'].values) == list(
        output_data['datetime_as_object'].values)
    assert expected_output_data_feat_datetime == list(
        output_data['datetime'].values)

    # text_special checks
    assert expected_output_data_feat_lower_ratio == list(
        output_data['text.lower_ratio'].values)

    # text_ngram checks
    assert expected_output_data_feat_total == list(
        output_data['__nlp__._total_'].values)
def test_category_feature_generator(generator_helper, data_helper):
    """Exercise CategoryFeatureGenerator under several configurations and
    verify the resulting categories, values, and codes for each one."""
    # Given
    input_data = data_helper.generate_multi_feature_standard()
    category_input_data = input_data[['obj', 'cat']].astype('category')

    # One generator per configuration under test; the last one
    # (minimize_memory=False) should reproduce the plain astype('category').
    generators = [
        CategoryFeatureGenerator(),
        CategoryFeatureGenerator(maximum_num_cat=2),
        CategoryFeatureGenerator(minimum_cat_count=3),
        CategoryFeatureGenerator(cat_order='count'),
        CategoryFeatureGenerator(minimize_memory=False),
    ]

    expected_feature_metadata_in_full = {
        ('object', ()): ['obj'],
        ('category', ()): ['cat'],
    }
    expected_feature_metadata_full = {('category', ()): ['obj', 'cat']}

    # Expectations for the first four generators, index-aligned.
    expected_cat_categories_lst = [
        [0, 1, 2, 3],
        [0, 1],
        [0],
        [0, 1, 2, 3],
    ]
    expected_cat_values_lst = [
        [0, 1, 0, 3, 3, 3, 2, np.nan, np.nan],
        [0, np.nan, 0, 1, 1, 1, np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan, 0, 0, 0, np.nan, np.nan, np.nan],
        [2, 1, 2, 3, 3, 3, 0, np.nan, np.nan],
    ]
    expected_cat_codes_lst = [
        [0, 1, 0, 3, 3, 3, 2, -1, -1],
        [0, -1, 0, 1, 1, 1, -1, -1, -1],
        [-1, -1, -1, 0, 0, 0, -1, -1, -1],
        [2, 1, 2, 3, 3, 3, 0, -1, -1],
    ]

    # When
    output_datas = [
        generator_helper.fit_transform_assert(
            input_data=input_data,
            generator=gen,
            expected_feature_metadata_in_full=expected_feature_metadata_in_full,
            expected_feature_metadata_full=expected_feature_metadata_full,
        ) for gen in generators
    ]

    # Therefore
    assert category_input_data.equals(output_datas[4])
    for output_data, expected_categories, expected_values, expected_codes in zip(
            output_datas[:4], expected_cat_categories_lst,
            expected_cat_values_lst, expected_cat_codes_lst):
        for col in ['obj', 'cat']:
            assert output_data[col].dtype.name == 'category'
            assert list(
                output_data[col].cat.categories) == expected_categories
            assert list(output_data[col]) == expected_values
            assert list(output_data[col].cat.codes) == expected_codes
def test_pipeline_feature_generator_removal_advanced(generator_helper,
                                                     data_helper):
    """Verify that a second pipeline stage restricted to category features
    causes every non-category input feature to be marked as unused."""
    # Given
    input_data = data_helper.generate_multi_feature_full()

    toy_vectorizer = CountVectorizer(min_df=2,
                                     ngram_range=(1, 3),
                                     max_features=10,
                                     dtype=np.uint8)
    text_ngram_feature_generator = TextNgramFeatureGenerator(
        vectorizer=toy_vectorizer)
    # Necessary in test to avoid CI non-deterministically pruning ngram counts.
    text_ngram_feature_generator.max_memory_ratio = None

    # Stage 1 produces many feature types; stage 2 keeps only categories,
    # so everything that cannot reach stage 2's output is pruned.
    generator = PipelineFeatureGenerator(generators=[
        [
            IdentityFeatureGenerator(
                infer_features_in_args=dict(valid_raw_types=[R_INT, R_FLOAT])),
            CategoryFeatureGenerator(),
            DatetimeFeatureGenerator(),
            TextSpecialFeatureGenerator(),
            text_ngram_feature_generator,
        ],
        [
            IdentityFeatureGenerator(
                infer_features_in_args=dict(valid_raw_types=[R_CATEGORY]))
        ],
    ])

    expected_feature_metadata_in_full = {
        ('category', ()): ['cat'],
        ('object', ()): ['obj'],
    }
    expected_feature_metadata_full = {('category', ()): ['obj', 'cat']}
    expected_feature_metadata_in_unused_full = {
        'datetime': ('datetime', ()),
        'datetime_as_object': ('object', ('datetime_as_object', )),
        'float': ('float', ()),
        'int': ('int', ()),
        'text': ('object', ('text', )),
    }

    # When
    output_data = generator_helper.fit_transform_assert(
        input_data=input_data,
        generator=generator,
        expected_feature_metadata_in_full=expected_feature_metadata_in_full,
        expected_feature_metadata_full=expected_feature_metadata_full,
    )
    feature_metadata_in_unused_full = generator._feature_metadata_in_unused.to_dict(
    )

    # object and category checks
    assert list(output_data['obj'].values) == [1, 2, 1, 4, 4, 4, 3, 0, 0]
    assert list(
        output_data['cat'].values) == [0, 1, 0, 3, 3, 3, 2, np.nan, np.nan]
    assert feature_metadata_in_unused_full == expected_feature_metadata_in_unused_full
# identity_feature_generator.fit(X=X) # This is identical to fit_transform, just without returning X_identity_out # Because IdentityFeatureGenerator simply passes the data along, nothing changed. assert X_transform.equals(X) identity_feature_generator = IdentityFeatureGenerator(features_in=['age', 'workclass']) # Limit the valid input to only 'age' and 'workclass' features. X_transform = identity_feature_generator.fit_transform(X=X, verbosity=3) print(X_transform.head(5)) # Now the output only contains the two features we declared in the input arguments to the generator, acting as a feature filter. from autogluon.tabular.features import R_INT identity_feature_generator = IdentityFeatureGenerator(infer_features_in_args={'valid_raw_types': [R_INT]}, verbosity=3) # Limit the valid input to only integer features. X_transform = identity_feature_generator.fit_transform(X=X) print(X_transform.head(5)) # Now the output only contains the int type features, acting as a type filter. # Our data contains object features at present, but this is not valid input to models, so lets convert them to category types. category_feature_generator = CategoryFeatureGenerator(verbosity=3) X_transform = category_feature_generator.fit_transform(X=X) print(X_transform.head(5)) # Note that the int features were automatically filtered out of this output. This is due to the defaults of CategoryFeatureGenerator which does not handle features other than objects and categories. ##################################### # Create a custom feature generator # ##################################### from pandas import DataFrame from autogluon.tabular.features import AbstractFeatureGenerator # Feature generator to add k to all values of integer features. class PlusKFeatureGenerator(AbstractFeatureGenerator): def __init__(self, k, **kwargs): super().__init__(**kwargs)