Example No. 1
def test_random_split_df_shuffled_split_are_same():
    idx1, idx2 = random_split(FULL_DF.index,
                              0.66,
                              res_type=set,
                              random_state=42)
    df1, df2 = df_shuffled_split(FULL_DF, split_size=0.66, random_state=42)
    assert (set(df1.index), set(df2.index)) == (idx1, idx2)
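
The two split helpers are expected to produce identical partitions when given the same fraction and seed. A minimal standalone sketch (assuming both helpers live in wittgenstein's base_functions module, as Example No. 5 suggests; the toy DataFrame is made up here):

import pandas as pd
from wittgenstein.base_functions import random_split, df_shuffled_split

toy_df = pd.DataFrame({"a": range(10), "b": range(10, 20)})
idx1, idx2 = random_split(toy_df.index, 0.66, res_type=set, random_state=42)
df1, df2 = df_shuffled_split(toy_df, split_size=0.66, random_state=42)
# Same seed, same fraction: both helpers should partition the index identically.
assert (set(df1.index), set(df2.index)) == (idx1, idx2)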
Example No. 2
def test_stacking():
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    df = DF.copy()
    numeric_cols = df.select_dtypes("number").columns
    categorical_cols = [
        col for col in df.columns
        if col not in numeric_cols and col != CLASS_FEAT
    ]
    dum_df = pd.get_dummies(df[categorical_cols])
    for col in numeric_cols:
        dum_df[col] = df[col]
    dum_df[CLASS_FEAT] = df[CLASS_FEAT]
    sktrain, sktest = df_shuffled_split(dum_df, random_state=42)
    sktrain_x, sktrain_y = sktrain.drop(CLASS_FEAT, axis=1), sktrain[CLASS_FEAT]
    sktest_x, sktest_y = sktest.drop(CLASS_FEAT, axis=1), sktest[CLASS_FEAT]

    lone_tree = DecisionTreeClassifier(random_state=42)
    lone_tree.fit(sktrain_x, sktrain_y)
    lone_tree_score = lone_tree.score(sktest_x, sktest_y)
    # print('lone_tree_score',lone_tree_score)

    irep_tree = DecisionTreeClassifier(random_state=42)
    irep_stack_estimators = [("irep", irep), ("tree", irep_tree)]
    irep_stack = StackingClassifier(estimators=irep_stack_estimators,
                                    final_estimator=LogisticRegression())
    irep_stack.fit(sktrain_x, sktrain_y)
    irep_stack_score = irep_stack.score(sktest_x, sktest_y)
    # print('irep_stack_score', irep_stack_score)
    assert irep_stack_score != lone_tree_score

    rip_tree = DecisionTreeClassifier(random_state=42)
    rip_stack_estimators = [("rip", rip), ("tree", rip_tree)]
    rip_stack = StackingClassifier(estimators=rip_stack_estimators,
                                   final_estimator=LogisticRegression())
    rip_stack.fit(sktrain_x, sktrain_y)
    rip_stack_score = rip_stack.score(sktest_x, sktest_y)
    # print('rip_stack_score',rip_stack_score)
    assert rip_stack_score != lone_tree_score
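
The dummy-encoding loop above can be collapsed into a single concat. A sketch under the same assumptions (DF and CLASS_FEAT are the module-level fixtures used by the test; column order may differ from the loop version):

import pandas as pd

numeric_cols = DF.select_dtypes("number").columns
categorical_cols = DF.columns.difference(numeric_cols).difference([CLASS_FEAT])
# One-hot encode the categoricals, then re-attach the numeric and class columns.
dum_df = pd.concat(
    [pd.get_dummies(DF[categorical_cols]), DF[numeric_cols], DF[CLASS_FEAT]],
    axis=1,
)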
Example No. 3
X_DF = DF.drop(CLASS_FEAT, axis=1)
Y_DF = DF[CLASS_FEAT]

XY_NP = DF.values
X_NP = X_DF.values
Y_NP = Y_DF.values
NP_CLASS_FEAT = -1

######

irep = IREP(random_state=42)
rip = RIPPER(random_state=42)

#####

train, test = df_shuffled_split(DF, random_state=42)
test_x, test_y = test.drop(CLASS_FEAT, axis=1), test[CLASS_FEAT]

irep.fit(train, class_feat=CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
rip.fit(train, class_feat=CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
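# (Above: wittgenstein's DataFrame fit takes the whole training frame;
# class_feat names the label column and pos_class says which value counts
# as the positive class.)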

#####


def test_predict():
    irep_preds = irep.predict(test_x)

    assert all(p in (True, False) for p in irep_preds)
    assert any(irep_preds)  # not everything predicted negative
    assert not all(irep_preds)  # not everything predicted positive
    assert sum(irep_preds) == 128
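
Because predict() returns booleans, sum() counts the positive predictions; 128 is simply the expected count for this fixture and seed. A trivial illustration:

preds = [True, False, True, False]
assert sum(preds) == 2  # True counts as 1, so the sum is the positive count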
Example No. 4
def test_shuffled_splits_are_len_7_len_3():
    df1, df2 = df_shuffled_split(FIRST_10_EXAMPLES, 0.7, random_state=None)
    assert (len(df1), len(df2)) == (7, 3)
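
The 7/3 result follows from the split fraction alone: 0.7 of 10 rows goes to the first frame and the remainder to the second, whatever the shuffle seed. A toy check (assuming the df_shuffled_split import sketched under Example No. 1):

import pandas as pd

toy = pd.DataFrame({"x": range(10)})
a, b = df_shuffled_split(toy, 0.7, random_state=0)
assert (len(a), len(b)) == (7, 3)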
Example No. 5
    def _grow_ruleset(self, pos_df, neg_df, initial_model=None):
        """Grow a Ruleset with (optional) pruning."""

        ruleset = self._ruleset_frommodel(initial_model)
        ruleset._set_possible_conds(pos_df, neg_df)

        if self.verbosity >= 2:
            print("growing ruleset...")
            print(f"initial model: {ruleset}")
            print()

        # If not pruning, use all the data for growing
        prune_size = self.prune_size if self.prune_size is not None else 0
        pos_remaining = pos_df.copy()
        neg_remaining = neg_df.copy()
        self.rules = []

        # Stop adding disjunctions if there are no more positive examples to cover
        while len(pos_remaining) > 0:

            # If applicable, check for user-specified early stopping
            if stop_early(ruleset, self.max_rules, self.max_total_conds):
                break

            # Grow-prune split remaining uncovered examples (if applicable)
            pos_growset, pos_pruneset = base_functions.df_shuffled_split(
                pos_remaining, (1 - prune_size),
                random_state=self.random_state)
            neg_growset, neg_pruneset = base_functions.df_shuffled_split(
                neg_remaining, (1 - prune_size),
                random_state=self.random_state)
            if self.verbosity >= 2:
                print(
                    f"pos_growset {len(pos_growset)} pos_pruneset {len(pos_pruneset)}"
                )
                print(
                    f"neg_growset {len(neg_growset)} neg_pruneset {len(neg_pruneset)}"
                )
                if not prune_size:
                    print(f"(pruning is turned off)")

            # Grow Rule
            grown_rule = base_functions.grow_rule(
                pos_growset,
                neg_growset,
                ruleset.possible_conds,
                max_rule_conds=self.max_rule_conds,
                verbosity=self.verbosity,
            )
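            # Note: grow_rule adds conditions greedily (in standard IREP/RIPPER,
            # the condition maximizing FOIL information gain on the growset)
            # until no condition helps or max_rule_conds is reached.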

            # If not pruning, add Rule to Ruleset and drop only the covered positive examples
            if not prune_size:
                ruleset.add(grown_rule)
                if self.verbosity >= 2:
                    print(
                        f"updated ruleset: {ruleset.truncstr(direction='right')}"
                    )
                    print()
                rule_covers_pos = grown_rule.covers(pos_remaining)
                pos_remaining = pos_remaining.drop(rule_covers_pos.index,
                                                   axis=0)
                if self.verbosity >= 3:
                    print(
                        f"examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg"
                    )
                    print()

            # If pruning, prune Rule, assess if it's time to stop, and drop all covered examples
            else:
                pruned_rule = base_functions.prune_rule(
                    grown_rule,
                    _IREP_prune_metric,
                    pos_pruneset,
                    neg_pruneset,
                    verbosity=self.verbosity,
                )

                # Stop if the Rule is bad
                prune_precision = base_functions.precision(
                    pruned_rule, pos_pruneset, neg_pruneset)
                if not prune_precision or prune_precision < 0.50:
                    break
                # Otherwise, add new Rule, remove covered examples, and continue
                else:
                    ruleset.add(pruned_rule)
                    if self.verbosity >= 2:
                        print(
                            f"updated ruleset: {ruleset.truncstr(direction='right')}"
                        )
                        print()
                    pos_remaining, neg_remaining = base_functions.rm_covered(
                        pruned_rule, pos_remaining, neg_remaining)
                    if self.verbosity >= 3:
                        print(
                            f"examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg"
                        )
                        print()

        # Return new ruleset
        return ruleset
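
The early-stop test above is IREP's standard acceptance criterion: a pruned rule is kept only if its precision on the prune set is at least 0.50. Isolated as a sketch (keep_pruned_rule is a hypothetical helper; the counts would come from the rule's coverage of pos_pruneset and neg_pruneset):

def keep_pruned_rule(covered_pos: int, covered_neg: int) -> bool:
    """Return True if the pruned rule is precise enough to keep."""
    covered = covered_pos + covered_neg
    if covered == 0:
        return False  # no prune-set coverage is treated as a failing rule
    return covered_pos / covered >= 0.50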