def test_features_raw_data_no_target(data_classification_balanced, feature_descriptor):
    """Testing if raw_data(drop_target=True) returns the input features without the Target column."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)

    # The expectation is X without its "Date" column; the actual frame is
    # re-ordered to the same columns so that .equals() compares like with like.
    expected_df = X.drop(["Date"], axis=1)
    raw = features.raw_data(drop_target=True)
    actual_df = raw[expected_df.columns]

    assert actual_df.equals(expected_df)
def test_features_create_mapped_dataframe(data_classification_balanced, feature_descriptor, expected_raw_mapping):
    """Testing if ._create_mapped_dataframe() returns the data with values replaced according to the mapping."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)

    # Expected frame: features and target side by side, minus "Date", with the
    # raw mapping applied through DataFrame.replace.
    combined = pd.concat([X, y], axis=1)
    expected_df = combined.drop(["Date"], axis=1).replace(expected_raw_mapping)

    mapped = features._create_mapped_dataframe()
    actual_df = mapped[expected_df.columns]

    assert actual_df.equals(expected_df)
def test_features_numerical_features_no_target(
    feature_list, target, expected, data_classification_balanced, feature_descriptor
):
    """Testing if .numerical_features(drop_target=True) returns the expected names for a given feature list and target."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)

    # Override the internal state so the method is exercised against the
    # supplied feature list / target rather than the ones derived from the data.
    features._numerical_features = feature_list
    features.target = target

    assert features.numerical_features(drop_target=True) == expected
def test_features_data(data_classification_balanced, feature_descriptor, expected_raw_mapping):
    """Testing if .data(drop_target=True) returns the mapped dataframe without the Target column."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)

    # Expected frame: X minus "Date", with the raw mapping applied.
    without_date = X.drop(["Date"], axis=1)
    expected_df = without_date.replace(expected_raw_mapping)

    mapped = features.data(drop_target=True)
    actual_df = mapped[expected_df.columns]

    assert actual_df.equals(expected_df)
def test_features_create_mapping(data_classification_balanced, feature_descriptor, expected_mapping):
    """Testing if .mapping() returns the correct mapping dictionary.

    The expectation is the ``expected_mapping`` fixture extended with ``None``
    entries for the "Height" and "Price" features.
    """
    X, y = data_classification_balanced
    f = Features(X, y, feature_descriptor)

    # Build the expectation as a new dict: assigning into ``expected_mapping``
    # directly would mutate the fixture object for anything else sharing it.
    expected = {**expected_mapping, "Height": None, "Price": None}

    actual = f.mapping()

    assert actual == expected
def test_features_create_raw_dataframe(data_classification_balanced, feature_descriptor):
    """Testing if ._create_raw_dataframe() returns the input data (X and y joined, without the "Date" column)."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)

    combined = pd.concat([X, y], axis=1)
    expected_df = combined.drop(["Date"], axis=1)

    # Align the column order to the expectation before comparing.
    raw = features._create_raw_dataframe()
    actual_df = raw[expected_df.columns]

    assert actual_df.equals(expected_df)
def test_features_raw_data_excluded_transformed(data_classification_balanced, feature_descriptor, transformed_columns):
    """Testing if raw_data(exclude_transformed=True) leaves out the transformed columns."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor, transformed_columns)

    expected_df = X.drop(["Date"], axis=1)
    # A falsy transformed_columns means there is nothing further to remove.
    if transformed_columns:
        expected_df = expected_df.drop(transformed_columns, axis=1)

    raw = features.raw_data(exclude_transformed=True)
    actual_df = raw[expected_df.columns]

    assert actual_df.equals(expected_df)
def test_features_numerical_features_exclude_transformed(
    data_classification_balanced, feature_descriptor, transformed_features
):
    """Testing if .numerical_features(exclude_transformed=True) omits the transformed columns."""
    all_numerical = ["Height", "Price"]
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor, transformed_features)

    actual_result = features.numerical_features(exclude_transformed=True)

    # With no transformed features the full list is expected unchanged.
    expected_result = (
        [name for name in all_numerical if name not in transformed_features]
        if transformed_features
        else all_numerical
    )

    assert actual_result == expected_result
def test_features_categorical_features_exclude_transformed(
    data_classification_balanced, feature_descriptor, transformed_features
):
    """Testing if .categorical_features(exclude_transformed=True) omits the transformed columns."""
    all_categorical = ["AgeGroup", "bool", "Product", "Sex", "Target"]
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor, transformed_features)

    actual_result = features.categorical_features(exclude_transformed=True)

    # With no transformed features the full list is expected unchanged.
    expected_result = (
        [name for name in all_categorical if name not in transformed_features]
        if transformed_features
        else all_categorical
    )

    assert actual_result == expected_result
def test_features_analyze_features_transformed_features(
    data_classification_balanced, feature_descriptor, transformed_features
):
    """Testing if ._analyze_features() sets the transformed flag according to transformed_features.

    Every feature named in ``transformed_features`` must come back with a
    truthy ``transformed`` attribute, and every other feature with a falsy one.
    A falsy ``transformed_features`` (e.g. ``None``) means no feature is
    expected to be flagged.
    """
    X, y = data_classification_balanced
    f = Features(X, y, feature_descriptor, transformed_features)
    f.original_dataframe = pd.concat([X, y], axis=1)  # original_dataframe needs to be set up

    # Normalise None to an empty tuple: ``name in None`` would raise TypeError
    # and fail the test for a reason unrelated to the behaviour under test.
    expected_transformed = transformed_features or ()

    actual = f._analyze_features(feature_descriptor)

    for name, feature in actual.items():
        if name in expected_transformed:
            assert feature.transformed, f"{name!r} should be flagged as transformed"
        else:
            assert not feature.transformed, f"{name!r} should not be flagged as transformed"
def test_features_create_descriptions(data_classification_balanced, feature_descriptions, feature_descriptor):
    """Testing if .descriptions() returns the correct descriptions dictionary.

    Features present in ``feature_descriptions`` are expected to carry the
    description stored under the ``FeatureDescriptor._description`` key;
    "AgeGroup" is expected to get the ``Features._description_not_available``
    placeholder instead.
    """
    placeholder = Features._description_not_available
    description_key = FeatureDescriptor._description

    described = ["Sex", "Height", "Product", "Price", "bool", "Target"]
    expected_descriptions = {
        feat: feature_descriptions[feat][description_key] for feat in described
    }
    expected_descriptions["AgeGroup"] = placeholder

    X, y = data_classification_balanced
    f = Features(X, y, feature_descriptor)

    actual_descriptions = f.descriptions()

    assert actual_descriptions == expected_descriptions
def test_features_create_raw_dataframe_preserving_index(data_classification_balanced, feature_descriptor):
    """Testing if ._create_raw_dataframe() preserves the index of the input data.

    The input is re-indexed to start at 100; the resulting frame must equal
    the re-indexed expectation and must differ from the frame built with the
    original index.
    """
    X, y = data_classification_balanced

    # Snapshot built with the original index: the result must NOT equal this.
    not_expected_df = pd.concat([X, y], axis=1).drop(["Date"], axis=1)

    # Re-index copies rather than the fixture objects themselves, so the new
    # index does not leak into anything else that shares the fixture data.
    X = X.copy()
    y = y.copy()
    length = X.shape[0]
    new_ind = list(range(100, length + 100))
    X.index = new_ind
    y.index = new_ind

    f = Features(X, y, feature_descriptor)

    expected_df = pd.concat([X, y], axis=1).drop(["Date"], axis=1)
    expected_df.index = new_ind  # make the expected index explicit

    cols = expected_df.columns
    actual_df = f._create_raw_dataframe()[cols]

    assert not actual_df.equals(not_expected_df)
    assert actual_df.equals(expected_df)
def test_features_features_list_no_target(
    data_classification_balanced, feature_descriptor_type, feature_descriptor, feature_descriptor_forced_categories
):
    """Testing if .features(drop_target=True) returns the feature names without the Target.

    ``feature_descriptor_type`` selects which descriptor fixture is used:
    "normal" -> ``feature_descriptor``, "forced" ->
    ``feature_descriptor_forced_categories``.

    Raises:
        ValueError: if ``feature_descriptor_type`` is neither "normal" nor "forced".
    """
    expected = ["AgeGroup", "bool", "Height", "Price", "Product", "Sex"]
    X, y = data_classification_balanced

    # couldn't find a way to incorporate fixtures into @pytest.mark.parametrize
    descriptors = {
        "normal": feature_descriptor,
        "forced": feature_descriptor_forced_categories,
    }
    try:
        fd = descriptors[feature_descriptor_type]
    except KeyError:
        # A bare ``raise`` here would only produce "No active exception to
        # reraise"; name the offending value instead.
        raise ValueError(
            f"Unknown feature_descriptor_type: {feature_descriptor_type!r}"
        ) from None

    f = Features(X, y, fd)

    actual = f.features(drop_target=True)

    assert actual == expected
def test_features_create_categorical_features(
    data_classification_balanced, feature_descriptor_type, expected, feature_descriptor,
    feature_descriptor_forced_categories
):
    """Testing if ._create_categorical_features() returns the expected values for the chosen descriptor.

    ``feature_descriptor_type`` selects which descriptor fixture is used:
    "normal" -> ``feature_descriptor``, "forced" ->
    ``feature_descriptor_forced_categories``.

    Raises:
        ValueError: if ``feature_descriptor_type`` is neither "normal" nor "forced".
    """
    X, y = data_classification_balanced

    # couldn't find a way to incorporate fixtures into @pytest.mark.parametrize
    descriptors = {
        "normal": feature_descriptor,
        "forced": feature_descriptor_forced_categories,
    }
    try:
        fd = descriptors[feature_descriptor_type]
    except KeyError:
        # A bare ``raise`` here would only produce "No active exception to
        # reraise"; name the offending value instead.
        raise ValueError(
            f"Unknown feature_descriptor_type: {feature_descriptor_type!r}"
        ) from None

    f = Features(X, y, fd)

    actual = f._create_categorical_features()

    assert actual == expected
def test_analyzer_numeric_describe_no_numerical_features(
    data_classification_balanced
):
    """Testing if numerical_describe_df() returns None when the data has no numerical columns."""
    X, y = data_classification_balanced

    # Remove both numerical columns so only non-numerical data is left.
    X = X.drop(["Price", "Height"], axis=1)

    analyzer = Analyzer(Features(X, y, None))

    assert analyzer.numerical_describe_df() is None
def test_features_impute_column_type(data_classification_balanced, column_name, expected_type):
    """Testing if ._impute_column_type() returns the right type marker for a column.

    ``expected_type`` is one of "categorical", "numerical" or "date" and is
    translated to the corresponding ``Features`` attribute (``_categorical``,
    ``_numerical``, ``_date``) before comparing.

    Raises:
        ValueError: if ``expected_type`` is not one of the three supported names.
    """
    X = data_classification_balanced[0]
    y = data_classification_balanced[1]
    df = pd.concat([X, y], axis=1)

    f = Features(X, y)

    type_markers = {
        "categorical": f._categorical,
        "numerical": f._numerical,
        "date": f._date,
    }
    try:
        expected = type_markers[expected_type]
    except KeyError:
        # A bare ``raise`` here would only produce "No active exception to
        # reraise"; name the offending value instead.
        raise ValueError(f"Unknown expected_type: {expected_type!r}") from None

    actual = f._impute_column_type(df[column_name])

    assert actual == expected
def test_features_analyze_features(data_classification_balanced, feature_descriptor):
    """Testing if ._analyze_features() returns a dict holding the right feature class for every column."""
    expected_types = {
        "Sex": CategoricalFeature,
        "AgeGroup": CategoricalFeature,
        "Height": NumericalFeature,
        "Product": CategoricalFeature,
        "Price": NumericalFeature,
        "bool": CategoricalFeature,
        "Target": CategoricalFeature,
    }

    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)
    features.original_dataframe = pd.concat([X, y], axis=1)  # original_dataframe needs to be set up

    analyzed = features._analyze_features(feature_descriptor)

    assert isinstance(analyzed, dict)
    for name, feature_cls in expected_types.items():
        assert isinstance(analyzed[name], feature_cls)
def test_analyzer_categorical_describe_no_categorical_features(
    data_classification_balanced
):
    """Testing if categorical_describe_df() returns None when the data has no categorical columns."""
    # It is easier to pick the numerical columns than to drop every
    # categorical one: "Price" becomes the single-column X, "Height" the y.
    data = data_classification_balanced[0]
    X = data[["Price"]]
    y = data["Height"]

    analyzer = Analyzer(Features(X, y, None))

    assert analyzer.categorical_describe_df() is None
def fixture_features(data_classification_balanced, feature_descriptor):
    """Build a Features object from the data_classification_balanced test data."""
    features_df, target = data_classification_balanced
    return Features(features_df, target, feature_descriptor)
def fixture_features_multiclass(data_multiclass, feature_descriptor):
    """Build a Features object from the data_multiclass test data."""
    features_df, target = data_multiclass
    return Features(features_df, target, feature_descriptor)
def test_features_unused_features(data_classification_balanced, feature_descriptor):
    """Testing if .unused_features() reports "Date" as the only unused feature."""
    X, y = data_classification_balanced
    features = Features(X, y, feature_descriptor)

    unused = features.unused_features()

    assert unused == ["Date"]