Exemplo n.º 1
0
def test_fit_numeric_dataset():
    """IREP fit on ``CREDIT_DF`` with seed 42 reproduces the stored ruleset.

    The learned ruleset is compared for equality against the
    ``CREDIT_IREP_RULESET_42`` fixture.
    """
    irep = IREP(random_state=42)

    irep.fit(
        CREDIT_DF, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS,
    )
    assert irep.ruleset_ == CREDIT_IREP_RULESET_42
Exemplo n.º 2
0
def test_verbosity():
    """Fitting with ``verbosity=5`` still yields the seed-42 reference rulesets."""
    noisy_irep = IREP(random_state=42, verbosity=5)
    noisy_rip = RIPPER(random_state=42, verbosity=5)

    # Each learner is fit and checked in turn: IREP first, then RIPPER.
    checks = ((noisy_irep, IREP_RULESET_42), (noisy_rip, RIP_RULESET_42))
    for model, expected in checks:
        model.fit(DF, class_feat=CLASS_FEAT, pos_class=POS_CLASS)
        assert model.ruleset_ == expected
Exemplo n.º 3
0
def test_fit_X_y_np():
    """Fitting with separate ``X`` and ``y`` matches the seed-42 reference rulesets.

    NOTE(review): despite the ``_np`` suffix, this fits on ``X_DF``/``Y_DF``
    rather than NumPy arrays -- confirm whether that is intended.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    for model, expected in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
        model.fit(X_DF, y=Y_DF, pos_class=POS_CLASS)
        assert model.ruleset_ == expected
Exemplo n.º 4
0
def test_fit_Xy_df():
    """Fitting on ``DF`` with ``class_feat`` given matches the seed-42 rulesets."""
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    fit_kwargs = {"class_feat": CLASS_FEAT, "pos_class": POS_CLASS}
    for model, expected in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
        model.fit(DF, **fit_kwargs)
        assert model.ruleset_ == expected
Exemplo n.º 5
0
def test_fit_Xy_np():
    """Fitting on ``XY_NP`` matches the reference rulesets after ``feat_to_num_rs``.

    The class column is identified by ``NP_CLASS_FEAT`` and ``y`` is passed
    explicitly as ``None``.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    for model, named_ruleset in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
        model.fit(XY_NP, y=None, class_feat=NP_CLASS_FEAT, pos_class=POS_CLASS)
        # Convert the reference ruleset only after fitting, as each check needs it.
        assert model.ruleset_ == feat_to_num_rs(named_ruleset)
Exemplo n.º 6
0
def test_df_isnt_modified():
    """Fitting must not mutate the DataFrame that is handed to ``fit``.

    For each learner, a pristine frame is read from ``credit.csv``, a copy of
    it is used for fitting, and the copy is then compared with the pristine
    frame.
    """
    for model_cls in (IREP, RIPPER):
        old_df = pd.read_csv("credit.csv")
        df = old_df.copy()
        model = model_cls(random_state=42)
        # Fit on the very frame that is compared below; fitting on any other
        # frame would make the equality check vacuous.
        model.fit(df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
        assert df.equals(old_df)
Exemplo n.º 7
0
def test_fit_discrete_dataset():
    """Both learners produce a non-trivial ruleset on integer-valued features.

    The float columns of ``CREDIT_DF`` are mapped to ``int(x % 10)`` and the
    class column is re-attached before fitting. The learned ruleset must be
    neither universal nor null.
    """
    models = (
        IREP(random_state=0, n_discretize_bins=11),
        RIPPER(random_state=0, n_discretize_bins=11),
    )

    float_columns = CREDIT_DF.select_dtypes(float)
    discrete_df = float_columns.applymap(lambda value: int(value % 10))
    discrete_df[CREDIT_CLASS_FEAT] = CREDIT_DF[CREDIT_CLASS_FEAT]

    for model in models:
        model.fit(
            discrete_df, class_feat=CREDIT_CLASS_FEAT, pos_class=CREDIT_POS_CLASS
        )
        assert not (model.ruleset_.isuniversal() or model.ruleset_.isnull())
Exemplo n.º 8
0
def test_random_state():
    """Repeated fits with the same ``random_state`` give identical rulesets.

    Each dataset is fit three times with fresh IREP and RIPPER instances
    seeded with 72; every resulting ruleset must equal the first one.
    """
    datasets = (
        (DF, CLASS_FEAT, POS_CLASS),  # Party dataset
        (CREDIT_DF, CREDIT_CLASS_FEAT, CREDIT_POS_CLASS),  # Credit dataset
    )

    for data, class_feat, pos_class in datasets:
        irep_rulesets, rip_rulesets = [], []
        for _ in range(3):
            irep = IREP(random_state=72)
            rip = RIPPER(random_state=72)
            irep.fit(data, class_feat=class_feat, pos_class=pos_class)
            rip.fit(data, class_feat=class_feat, pos_class=pos_class)
            irep_rulesets.append(irep.ruleset_)
            rip_rulesets.append(rip.ruleset_)
        assert all(rs == irep_rulesets[0] for rs in irep_rulesets)
        assert all(rs == rip_rulesets[0] for rs in rip_rulesets)
Exemplo n.º 9
0
def test_fit_boolean_dataset():
    """Both learners produce a non-trivial ruleset on integer-coded features.

    Every feature column of ``DF`` (all columns except ``"Party"``) is recoded
    with ``tobool`` before fitting. The learned ruleset must be neither
    universal nor null.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    def tobool(x):
        # Despite the name, this yields integer codes rather than booleans:
        # "y" -> 0, "n" -> 1, anything else -> 2.
        if x == "y":
            return 0
        elif x == "n":
            return 1
        else:
            return 2

    bool_df = DF.copy()
    for col in bool_df.drop("Party", axis=1).columns:
        bool_df[col] = bool_df[col].map(tobool)

    irep.fit(bool_df, class_feat="Party", pos_class="democrat")
    assert not (irep.ruleset_.isuniversal()) and not (irep.ruleset_.isnull())
    # Exercise the RIPPER instance as well, so both learners are covered.
    rip.fit(bool_df, class_feat="Party", pos_class="democrat")
    assert not (rip.ruleset_.isuniversal()) and not (rip.ruleset_.isnull())
Exemplo n.º 10
0
def test_df_isnt_modified():
    """The DataFrame passed to ``fit`` shouldn't be changed by model fitting.

    A pristine frame is read from ``credit.csv`` and copied; the copy is used
    for fitting and afterwards compared against the pristine frame. This is
    done once for IREP and once for RIPPER.
    """
    old_df = pd.read_csv("credit.csv")
    df = old_df.copy()
    irep = IREP(random_state=42)
    # Fit on ``df`` itself: the equality check below is only meaningful for
    # the frame that actually went through ``fit``.
    irep.fit(df,
             class_feat=CREDIT_CLASS_FEAT,
             pos_class=CREDIT_POS_CLASS)
    assert df.equals(old_df)

    old_df = pd.read_csv("credit.csv")
    df = old_df.copy()
    rip = RIPPER(random_state=42)
    rip.fit(df,
            class_feat=CREDIT_CLASS_FEAT,
            pos_class=CREDIT_POS_CLASS)
    assert df.equals(old_df)
Exemplo n.º 11
0
def test_infer_pos_class():
    """Fitting without ``pos_class`` on a 0/1 target matches the seed-42 rulesets.

    The class column of a copy of ``DF`` is recoded so that ``"democrat"``
    becomes 1 and every other value becomes 0; ``fit`` is then called with
    only ``class_feat``.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    def encode(label):
        return 1 if label == "democrat" else 0

    infer_df = DF.copy()
    infer_df[CLASS_FEAT] = infer_df[CLASS_FEAT].map(encode)

    for model, expected in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
        model.fit(infer_df, class_feat=CLASS_FEAT)
        assert model.ruleset_ == expected
Exemplo n.º 12
0
def test_fit_XY_rename_columns():
    """Fitting on NumPy input with ``feature_names`` matches the seed-42 rulesets.

    Two input layouts are covered: a combined array (``XY_NP`` with
    ``y=None``) named by all of ``DF``'s columns, and separate ``X_NP`` /
    ``Y_NP`` arrays named by ``DF``'s columns without the class column.
    """

    def check(X, y, column_names):
        # Fresh seed-42 learners per layout; names are recomputed for each fit.
        irep = IREP(random_state=42)
        rip = RIPPER(random_state=42)
        for model, expected in ((irep, IREP_RULESET_42), (rip, RIP_RULESET_42)):
            model.fit(
                X,
                y=y,
                class_feat=CLASS_FEAT,
                pos_class=POS_CLASS,
                feature_names=column_names(),
            )
            assert model.ruleset_ == expected

    # With xy
    check(XY_NP, None, lambda: DF.columns)

    # With x_y
    check(X_NP, Y_NP, lambda: DF.drop(CLASS_FEAT, axis=1).columns)
Exemplo n.º 13
0
def test_deprecated_bin_transformer():
    """``predict`` runs when ``bin_transformer_`` is a dict of (low, high) tuples.

    Smoke test only: each learner is fit on ``credit.csv``, its
    ``bin_transformer_`` is overwritten with the dict below, and ``predict``
    is called. The predictions themselves are not checked.
    """
    a11_bins = [(0, 1), (1, 2), (2, 4), (4, 8), (8, 17), (17, 67)]
    a15_bins = [
        (0, 1),
        (1, 9),
        (10, 105),
        (108, 351),
        (351, 1004),
        (1058, 4607),
        (4700, 100000),
    ]
    a3_bins = [
        (0.0, 0.415),
        (0.415, 0.79),
        (0.79, 1.375),
        (1.375, 2.04),
        (2.04, 3.04),
        (3.04, 4.71),
        (4.75, 7.04),
        (7.08, 10.665),
        (10.75, 14.585),
        (14.79, 28.0),
    ]
    a8_bins = [
        (0.0, 0.04),
        (0.04, 0.165),
        (0.165, 0.335),
        (0.335, 0.71),
        (0.75, 1.25),
        (1.25, 1.835),
        (1.835, 2.79),
        (3.0, 5.04),
        (5.085, 13.0),
        (13.5, 28.5),
    ]
    deprecated_bin_transformer = {
        "A11": a11_bins,
        "A15": a15_bins,
        "A3": a3_bins,
        "A8": a8_bins,
    }

    df = pd.read_csv("credit.csv")
    for model_cls in (IREP, RIPPER):
        model = model_cls()
        model.fit(df, class_feat="Class", pos_class="+")
        model.bin_transformer_ = deprecated_bin_transformer
        preds = model.predict(df)
Exemplo n.º 14
0
def test_same_inputs_give_same_results():
    """Equivalent ways of passing the same data to ``fit`` give equal rulesets.

    For each seed in ``range(3)``, fresh IREP and RIPPER instances are fit
    once per input layout listed in ``fit_variants``; every resulting ruleset
    must equal the first one obtained for that learner and seed.
    """
    # Each entry builds (positional args, keyword args) for one ``fit`` call.
    # Factories are used so the arguments are rebuilt for every single fit.
    fit_variants = (
        lambda: ((DF,), {"class_feat": CLASS_FEAT, "pos_class": POS_CLASS}),
        lambda: (
            (X_DF,),
            {"y": Y_DF, "class_feat": CLASS_FEAT, "pos_class": POS_CLASS},
        ),
        lambda: ((X_DF,), {"y": Y_DF, "pos_class": POS_CLASS}),
        lambda: (
            (XY_NP,),
            {
                "y": None,
                "class_feat": CLASS_FEAT,
                "pos_class": POS_CLASS,
                "feature_names": DF.columns,
            },
        ),
        lambda: (
            (X_NP,),
            {
                "y": Y_NP,
                "class_feat": CLASS_FEAT,
                "pos_class": POS_CLASS,
                "feature_names": DF.drop(CLASS_FEAT, axis=1).columns,
            },
        ),
    )

    for random_state in range(3):
        irep_res = []
        rip_res = []

        for build_call in fit_variants:
            irep = IREP(random_state=random_state)
            rip = RIPPER(random_state=random_state)
            for model, collected in ((irep, irep_res), (rip, rip_res)):
                args, kwargs = build_call()
                model.fit(*args, **kwargs)
                collected.append(model.ruleset_)

        irep_matches = [res == irep_res[0] for res in irep_res]
        assert all(irep_matches)
        rip_matches = [res == rip_res[0] for res in rip_res]
        assert all(rip_matches)
Exemplo n.º 15
0
def test_use_initial_model():
    """``fit(initial_model=...)`` accepts a string, an IREP, or a RIPPER seed.

    For each kind of seed, IREP and RIPPER (both ``random_state=1``) are fit
    on ``credit_df`` and compared with the expected rulesets below. Finally,
    the seed models' own rulesets are checked to still equal the initial
    ruleset string.
    """
    initial_model = "[[A9=t ^ A10=t]]"
    expected_irep = ruleset_fromstr(
        """[[A9=t ^ A10=t] V
        [A9=t ^ A7=h] V
        [A9=t ^ A4=u ^ A7=v]]
        """
    )
    expected_rip = ruleset_fromstr(
        """[[A9=t ^ A10=t] V
        [A9=t ^ A7=h] V
        [A9=t ^ A4=u ^ A14=0 ^ A15=0-0] V
        [A9=t ^ A6=w]]
        """
    )

    def fit_and_compare(seed):
        # IREP is built, fit and checked before RIPPER is constructed.
        for model_cls, expected in ((IREP, expected_irep), (RIPPER, expected_rip)):
            model = model_cls(random_state=1)
            model.fit(
                credit_df, class_feat="Class", pos_class="+", initial_model=seed
            )
            assert model.ruleset_ == expected

    # From str
    fit_and_compare(initial_model)

    # From IREP
    initial_irep_model = IREP()
    initial_irep_model.init_ruleset(initial_model)
    fit_and_compare(initial_irep_model)

    # From RIP
    initial_rip_model = RIPPER()
    initial_rip_model.init_ruleset(initial_model)
    fit_and_compare(initial_rip_model)

    # No side-effects
    for seed_model in (initial_irep_model, initial_rip_model):
        assert seed_model.ruleset_ == ruleset_fromstr(initial_model)
Exemplo n.º 16
0
from copy import deepcopy
import os

import pytest
import pandas as pd

from wittgenstein.irep import IREP
from wittgenstein.ripper import RIPPER
from wittgenstein.base import Ruleset, ruleset_fromstr, rule_fromstr

# Module-level fixtures built at import time from "mushroom.csv": a reference
# ruleset parsed from its string form, its rules, and an IREP model fit with
# random_state=42.
DF = pd.read_csv("mushroom.csv")
original_ruleset_str = "[[Odor=f] V [Gill-size=n] V [Spore-print-color=r] V [Odor=m]]"
original_ruleset = ruleset_fromstr(original_ruleset_str)
original_rules = original_ruleset.rules
original_irep = IREP(random_state=42)
original_irep.fit(DF, class_feat="Poisonous/Edible", pos_class="p")
# Ensure setup works: the fitted model must reproduce the reference ruleset,
# otherwise importing this module fails.
assert original_ruleset == original_irep.ruleset_


# Fixtures built from "credit.csv": the frame, its class column name and
# positive label, and a RIPPER model fit with random_state=42.
credit_df = pd.read_csv("credit.csv")
credit_class_feat = "Class"
credit_pos_class = "+"
credit_rip = RIPPER(random_state=42, verbosity=0)
# NOTE(review): the literals below duplicate credit_class_feat and
# credit_pos_class defined just above.
credit_rip.fit(credit_df, class_feat="Class", pos_class="+")
credit_original_ruleset = ruleset_fromstr(
    "[[A9=t ^ A10=t ^ A4=u ^ A1=b ^ A11=7-16] V \
    [A9=t ^ A10=t ^ A4=u ^ A11=3-7] V \
    [A9=t ^ A10=t ^ A14=0] V \
    [A9=t ^ A10=t] V \
    [A9=t ^ A7=h ^ A6=q]]"
Exemplo n.º 17
0
def test_initruleset():
    """``init_ruleset`` sets ``ruleset_`` from nothing, a Ruleset, or a string.

    Covers three cases: no argument (expects an empty ``Ruleset()``), an
    existing ruleset object, and the string form of that ruleset.
    """
    irep = IREP(random_state=42)
    irep.init_ruleset()
    # Each comparison must be asserted; a bare ``==`` expression is evaluated
    # and discarded, so it could never fail the test.
    assert irep.ruleset_ == Ruleset()

    irep = IREP(random_state=42)
    irep.init_ruleset(original_ruleset)
    assert irep.ruleset_ == original_ruleset

    irep = IREP(random_state=42)
    irep.init_ruleset(original_ruleset_str)
    assert irep.ruleset_ == original_ruleset