def test_fit_numeric_dataset():
    """Check that IREP fitted on CREDIT_DF with seed 42 learns the reference ruleset.

    Only IREP is asserted here: the RIPPER instance that used to be created
    was never fitted or checked, so it has been dropped as an unused local.
    """
    irep = IREP(random_state=42)
    irep.fit(
        CREDIT_DF,
        class_feat=CREDIT_CLASS_FEAT,
        pos_class=CREDIT_POS_CLASS,
    )
    assert irep.ruleset_ == CREDIT_IREP_RULESET_42
def test_verbosity():
    """Fit both learners with verbosity=5 and expect the reference rulesets."""
    verbose_models = (
        (IREP(random_state=42, verbosity=5), IREP_RULESET_42),
        (RIPPER(random_state=42, verbosity=5), RIP_RULESET_42),
    )
    # Fit and check one learner at a time, IREP first.
    for model, expected_ruleset in verbose_models:
        model.fit(DF, class_feat=CLASS_FEAT, pos_class=POS_CLASS)
        assert model.ruleset_ == expected_ruleset
def test_fit_X_y_np():
    """Fit from separate X and y inputs (no class_feat) and expect the reference rulesets.

    NOTE(review): despite the ``_np`` suffix, the inputs passed are
    ``X_DF`` / ``Y_DF`` -- confirm whether numpy inputs were intended.
    """
    irep_model = IREP(random_state=42)
    ripper_model = RIPPER(random_state=42)
    fit_kwargs = {"y": Y_DF, "pos_class": POS_CLASS}

    irep_model.fit(X_DF, **fit_kwargs)
    assert irep_model.ruleset_ == IREP_RULESET_42

    ripper_model.fit(X_DF, **fit_kwargs)
    assert ripper_model.ruleset_ == RIP_RULESET_42
def test_fit_Xy_df():
    """Fit from a single DataFrame holding features and class column together."""
    learners = (
        (IREP(random_state=42), IREP_RULESET_42),
        (RIPPER(random_state=42), RIP_RULESET_42),
    )
    for learner, reference in learners:
        learner.fit(DF, class_feat=CLASS_FEAT, pos_class=POS_CLASS)
        assert learner.ruleset_ == reference
def test_fit_Xy_np():
    """Fit from the combined XY_NP input, identifying the class via NP_CLASS_FEAT.

    The reference rulesets are passed through ``feat_to_num_rs`` before being
    compared with what each learner produced.
    """
    irep_model = IREP(random_state=42)
    ripper_model = RIPPER(random_state=42)

    irep_model.fit(XY_NP, y=None, class_feat=NP_CLASS_FEAT, pos_class=POS_CLASS)
    expected_irep = feat_to_num_rs(IREP_RULESET_42)
    assert irep_model.ruleset_ == expected_irep

    ripper_model.fit(XY_NP, y=None, class_feat=NP_CLASS_FEAT, pos_class=POS_CLASS)
    expected_rip = feat_to_num_rs(RIP_RULESET_42)
    assert ripper_model.ruleset_ == expected_rip
def test_df_isnt_modified():
    """Fitting must leave the DataFrame handed to ``fit`` unchanged.

    For each learner a fresh copy of ``credit.csv`` is loaded, a working copy
    is fitted on, and the working copy is compared with the pristine frame.
    """
    for estimator_cls in (IREP, RIPPER):
        old_df = pd.read_csv("credit.csv")
        df = old_df.copy()
        model = estimator_cls(random_state=42)
        # Fit on `df` itself: fitting on a different frame would make the
        # comparison below pass no matter what `fit` does to its input.
        model.fit(df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
        assert df.equals(old_df)
def test_fit_discrete_dataset():
    """Fit both learners on an integer-valued version of the credit data.

    Every float column of CREDIT_DF is reduced to ``int(x % 10)`` and the
    class column is re-attached. Each learner must then produce a ruleset
    that is neither universal nor null.
    """
    irep = IREP(random_state=0, n_discretize_bins=11)
    rip = RIPPER(random_state=0, n_discretize_bins=11)

    # Map element-wise one column at a time with Series.map rather than
    # DataFrame.applymap, which is deprecated in pandas >= 2.1.
    discrete_df = CREDIT_DF.select_dtypes(float).apply(
        lambda col: col.map(lambda x: int(x % 10))
    )
    discrete_df[CREDIT_CLASS_FEAT] = CREDIT_DF[CREDIT_CLASS_FEAT]

    irep.fit(discrete_df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
    # Separate asserts so a failure says which condition was violated.
    assert not irep.ruleset_.isuniversal()
    assert not irep.ruleset_.isnull()

    rip.fit(discrete_df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
    assert not rip.ruleset_.isuniversal()
    assert not rip.ruleset_.isnull()
def test_random_state():
    """Repeated fits with the same random_state must yield identical rulesets.

    The check runs three fits per learner, first on the party dataset and
    then on the credit dataset.
    """
    datasets = (
        (DF, CLASS_FEAT, POS_CLASS),  # Party dataset
        (CREDIT_DF, CREDIT_CLASS_FEAT, CREDIT_POS_CLASS),  # Credit dataset
    )
    for data, class_feat, pos_class in datasets:
        irep_rulesets = []
        rip_rulesets = []
        for _ in range(3):
            irep = IREP(random_state=72)
            rip = RIPPER(random_state=72)
            irep.fit(data, class_feat=class_feat, pos_class=pos_class)
            rip.fit(data, class_feat=class_feat, pos_class=pos_class)
            irep_rulesets.append(irep.ruleset_)
            rip_rulesets.append(rip.ruleset_)

        # Every run is compared against the first one.
        first_irep = irep_rulesets[0]
        first_rip = rip_rulesets[0]
        assert all(rs == first_irep for rs in irep_rulesets)
        assert all(rs == first_rip for rs in rip_rulesets)
def test_fit_boolean_dataset():
    """Fit both learners on the party data with features recoded as small ints.

    Every non-class column is mapped "y" -> 0, "n" -> 1, anything else -> 2.
    Each learner must then produce a ruleset that is neither universal nor
    null.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    def tobool(x):
        """Recode a single cell: "y" -> 0, "n" -> 1, everything else -> 2."""
        if x == "y":
            return 0
        elif x == "n":
            return 1
        else:
            return 2

    bool_df = DF.copy()
    for col in bool_df.drop("Party", axis=1).columns:
        bool_df[col] = bool_df[col].map(tobool)

    irep.fit(bool_df, class_feat="Party", pos_class="democrat")
    assert not irep.ruleset_.isuniversal()
    assert not irep.ruleset_.isnull()

    # The RIPPER instance was previously created but never exercised; run
    # the same sanity checks on it so both learners are covered.
    rip.fit(bool_df, class_feat="Party", pos_class="democrat")
    assert not rip.ruleset_.isuniversal()
    assert not rip.ruleset_.isnull()
def test_df_isnt_modified():
    """Check that model fitting has no side effects on the input DataFrame.

    A pristine copy of ``credit.csv`` is kept next to the working copy that
    is passed to ``fit``; afterwards the two must still be equal. The check
    is done once for IREP and once for RIPPER, each on freshly loaded data.
    """
    # df shouldn't be affected by side-effects during model fitting
    old_df = pd.read_csv("credit.csv")
    df = old_df.copy()
    irep = IREP(random_state=42)
    # Fit on `df` (not on some other frame), otherwise the equality check
    # below cannot detect any mutation performed by `fit`.
    irep.fit(df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
    assert df.equals(old_df)

    old_df = pd.read_csv("credit.csv")
    df = old_df.copy()
    rip = RIPPER(random_state=42)
    rip.fit(df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
    assert df.equals(old_df)
def test_infer_pos_class():
    """Fit without an explicit pos_class on a 0/1-encoded class column.

    The class column is recoded so that "democrat" becomes 1 and every other
    value becomes 0. ``fit`` is then called with ``class_feat`` only, and the
    learned rulesets must equal the reference ones.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    def encode_label(label):
        # 1 for the "democrat" label, 0 for anything else.
        if label == "democrat":
            return 1
        return 0

    infer_df = DF.copy()
    infer_df[CLASS_FEAT] = infer_df[CLASS_FEAT].map(encode_label)

    irep.fit(infer_df, class_feat=CLASS_FEAT)
    assert irep.ruleset_ == IREP_RULESET_42

    rip.fit(infer_df, class_feat=CLASS_FEAT)
    assert rip.ruleset_ == RIP_RULESET_42
def test_fit_XY_rename_columns():
    """Fit from numpy inputs while supplying column names via feature_names.

    Two input layouts are covered: the combined XY_NP input, and separate
    X_NP / Y_NP inputs. In both cases the learned rulesets must equal the
    reference rulesets.
    """
    # With xy
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)
    for model, expected in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
        model.fit(
            XY_NP,
            y=None,
            class_feat=CLASS_FEAT,
            pos_class=POS_CLASS,
            feature_names=DF.columns,
        )
        assert model.ruleset_ == expected

    # With x_y
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)
    for model, expected in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
        model.fit(
            X_NP,
            y=Y_NP,
            class_feat=CLASS_FEAT,
            pos_class=POS_CLASS,
            feature_names=DF.drop(CLASS_FEAT, axis=1).columns,
        )
        assert model.ruleset_ == expected
def test_deprecated_bin_transformer():
    """Predict with a model whose ``bin_transformer_`` is a plain dict.

    After fitting, each learner's ``bin_transformer_`` is overwritten with a
    dict mapping a column name to a list of ``(low, high)`` tuples, and
    ``predict`` is called on the training frame. The call must succeed and
    return one prediction per row.
    """
    deprecated_bin_transformer = {
        "A11": [(0, 1), (1, 2), (2, 4), (4, 8), (8, 17), (17, 67)],
        "A15": [
            (0, 1),
            (1, 9),
            (10, 105),
            (108, 351),
            (351, 1004),
            (1058, 4607),
            (4700, 100000),
        ],
        "A3": [
            (0.0, 0.415),
            (0.415, 0.79),
            (0.79, 1.375),
            (1.375, 2.04),
            (2.04, 3.04),
            (3.04, 4.71),
            (4.75, 7.04),
            (7.08, 10.665),
            (10.75, 14.585),
            (14.79, 28.0),
        ],
        "A8": [
            (0.0, 0.04),
            (0.04, 0.165),
            (0.165, 0.335),
            (0.335, 0.71),
            (0.75, 1.25),
            (1.25, 1.835),
            (1.835, 2.79),
            (3.0, 5.04),
            (5.085, 13.0),
            (13.5, 28.5),
        ],
    }
    df = pd.read_csv("credit.csv")

    for estimator_cls in (IREP, RIPPER):
        model = estimator_cls()
        model.fit(df, class_feat="Class", pos_class="+")
        # Swap in the dict-style transformer only after fitting.
        model.bin_transformer_ = deprecated_bin_transformer
        preds = model.predict(df)
        # Previously the result was discarded; at least check its length.
        assert len(preds) == len(df)
def test_same_inputs_give_same_results():
    """The same data supplied through different input layouts must agree.

    For each random_state in 0..2, both learners are fitted five times, once
    per layout: full DataFrame; X/y DataFrames with and without class_feat;
    combined numpy input with feature_names; separate numpy X/y with
    feature_names. All rulesets from one learner must equal the first.
    """
    for random_state in range(3):
        # (positional data argument, keyword arguments) for every layout.
        fit_calls = (
            (DF, {"class_feat": CLASS_FEAT, "pos_class": POS_CLASS}),
            (
                X_DF,
                {"y": Y_DF, "class_feat": CLASS_FEAT, "pos_class": POS_CLASS},
            ),
            (X_DF, {"y": Y_DF, "pos_class": POS_CLASS}),
            (
                XY_NP,
                {
                    "y": None,
                    "class_feat": CLASS_FEAT,
                    "pos_class": POS_CLASS,
                    "feature_names": DF.columns,
                },
            ),
            (
                X_NP,
                {
                    "y": Y_NP,
                    "class_feat": CLASS_FEAT,
                    "pos_class": POS_CLASS,
                    "feature_names": DF.drop(CLASS_FEAT, axis=1).columns,
                },
            ),
        )

        irep_res = []
        rip_res = []
        for data, fit_kwargs in fit_calls:
            # Fresh learners for every layout so no state carries over.
            irep = IREP(random_state=random_state)
            rip = RIPPER(random_state=random_state)

            irep.fit(data, **fit_kwargs)
            irep_res.append(irep.ruleset_)

            rip.fit(data, **fit_kwargs)
            rip_res.append(rip.ruleset_)

        assert all(res == irep_res[0] for res in irep_res)
        assert all(res == rip_res[0] for res in rip_res)
def test_use_initial_model():
    """Fit starting from an initial model given as a str, an IREP, or a RIPPER.

    Whatever form the initial model takes, IREP and RIPPER (random_state=1)
    must each arrive at their expected ruleset. The model objects used as
    starting points must still hold the initial ruleset afterwards.
    """
    initial_model = "[[A9=t ^ A10=t]]"
    expected_irep = ruleset_fromstr(
        """[[A9=t ^ A10=t] V [A9=t ^ A7=h] V [A9=t ^ A4=u ^ A7=v]] """
    )
    expected_rip = ruleset_fromstr(
        """[[A9=t ^ A10=t] V [A9=t ^ A7=h] V [A9=t ^ A4=u ^ A14=0 ^ A15=0-0] V [A9=t ^ A6=w]] """
    )

    def fit_and_check(starting_point):
        # Fit a fresh IREP, then a fresh RIPPER, from the given starting point.
        irep = IREP(random_state=1)
        irep.fit(
            credit_df,
            class_feat='Class',
            pos_class='+',
            initial_model=starting_point,
        )
        assert irep.ruleset_ == expected_irep

        rip = RIPPER(random_state=1)
        rip.fit(
            credit_df,
            class_feat='Class',
            pos_class='+',
            initial_model=starting_point,
        )
        assert rip.ruleset_ == expected_rip

    # From str
    fit_and_check(initial_model)

    # From IREP
    initial_irep_model = IREP()
    initial_irep_model.init_ruleset(initial_model)
    fit_and_check(initial_irep_model)

    # From RIP
    initial_rip_model = RIPPER()
    initial_rip_model.init_ruleset(initial_model)
    fit_and_check(initial_rip_model)

    # No side-effects
    assert initial_irep_model.ruleset_ == ruleset_fromstr(initial_model)
    assert initial_rip_model.ruleset_ == ruleset_fromstr(initial_model)
from copy import deepcopy import os import pytest import pandas as pd from wittgenstein.irep import IREP from wittgenstein.ripper import RIPPER from wittgenstein.base import Ruleset, ruleset_fromstr, rule_fromstr DF = pd.read_csv("mushroom.csv") original_ruleset_str = "[[Odor=f] V [Gill-size=n] V [Spore-print-color=r] V [Odor=m]]" original_ruleset = ruleset_fromstr(original_ruleset_str) original_rules = original_ruleset.rules original_irep = IREP(random_state=42) original_irep.fit(DF, class_feat="Poisonous/Edible", pos_class="p") # Ensure setup works assert original_ruleset == original_irep.ruleset_ credit_df = pd.read_csv("credit.csv") credit_class_feat = "Class" credit_pos_class = "+" credit_rip = RIPPER(random_state=42, verbosity=0) credit_rip.fit(credit_df, class_feat="Class", pos_class="+") credit_original_ruleset = ruleset_fromstr( "[[A9=t ^ A10=t ^ A4=u ^ A1=b ^ A11=7-16] V \ [A9=t ^ A10=t ^ A4=u ^ A11=3-7] V \ [A9=t ^ A10=t ^ A14=0] V \ [A9=t ^ A10=t] V \ [A9=t ^ A7=h ^ A6=q]]"
def test_initruleset():
    """Check ``init_ruleset`` with no argument, a Ruleset, and a ruleset string.

    The comparisons used to be bare expression statements whose result was
    discarded, so the test could never fail; each one is now asserted.
    """
    # No argument: expect an empty Ruleset.
    irep = IREP(random_state=42)
    irep.init_ruleset()
    assert irep.ruleset_ == Ruleset()

    # From a Ruleset object.
    irep = IREP(random_state=42)
    irep.init_ruleset(original_ruleset)
    assert irep.ruleset_ == original_ruleset

    # From the string form of the same ruleset.
    irep = IREP(random_state=42)
    irep.init_ruleset(original_ruleset_str)
    assert irep.ruleset_ == original_ruleset