def test_random_split_df_shuffled_split_are_same():
    """random_split and df_shuffled_split should produce identical partitions
    given the same split size and random_state.
    """
    idx1, idx2 = random_split(FULL_DF.index, 0.66, res_type=set, random_state=42)
    df1, df2 = df_shuffled_split(FULL_DF, split_size=0.66, random_state=42)
    # BUG FIX: the original asserted
    #   (set(FULL_DF.loc[idx1, :].index), set(FULL_DF.loc[idx2, :].index)) == (idx1, idx2)
    # which re-derives idx1/idx2 from themselves (a tautology) and never uses
    # df1/df2 at all. Compare the DataFrame split's indices to the index split.
    assert (set(df1.index), set(df2.index)) == (idx1, idx2)
def test_stacking():
    """IREP and RIPPER should work as base estimators inside sklearn's
    StackingClassifier, and the stacked scores should differ from a lone
    decision tree's score on the same split.
    """
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    # One-hot encode categorical features so the sklearn estimators can
    # consume the data; numeric columns and the class column are carried over.
    df = DF.copy()
    numeric_cols = df.select_dtypes("number").columns
    categorical_cols = [
        col
        for col in df.columns
        if col not in numeric_cols and col != CLASS_FEAT
    ]
    dum_df = pd.get_dummies(df[categorical_cols])
    for col in numeric_cols:
        dum_df[col] = df[col]
    dum_df[CLASS_FEAT] = df[CLASS_FEAT]

    sktrain, sktest = df_shuffled_split(dum_df, random_state=42)
    # BUG FIX: the original read labels from the module-level `train`/`test`
    # DataFrames (the un-dummified split) instead of the local
    # `sktrain`/`sktest`, so X and y came from different DataFrames.
    sktrain_x, sktrain_y = sktrain.drop(CLASS_FEAT, axis=1), sktrain[CLASS_FEAT]
    sktest_x, sktest_y = sktest.drop(CLASS_FEAT, axis=1), sktest[CLASS_FEAT]

    # Baseline: a lone decision tree on the same split.
    lone_tree = DecisionTreeClassifier(random_state=42)
    lone_tree.fit(sktrain_x, sktrain_y)
    lone_tree_score = lone_tree.score(sktest_x, sktest_y)
    # print('lone_tree_score',lone_tree_score)

    # Stack IREP with an SVC under a logistic-regression meta-learner.
    # NOTE(review): despite the name, `irep_tree` is an SVC in the original —
    # preserved here so the tested configuration is unchanged.
    irep_tree = SVC(random_state=42)
    irep_stack_estimators = [("irep", irep), ("tree", irep_tree)]
    irep_stack = StackingClassifier(
        estimators=irep_stack_estimators, final_estimator=LogisticRegression()
    )
    irep_stack.fit(sktrain_x, sktrain_y)
    irep_stack_score = irep_stack.score(sktest_x, sktest_y)
    # print('irep_stack_score', irep_stack_score)
    assert irep_stack_score != lone_tree_score

    # Stack RIPPER with a decision tree.
    rip_tree = DecisionTreeClassifier(random_state=42)
    rip_stack_estimators = [("rip", rip), ("tree", rip_tree)]
    rip_stack = StackingClassifier(
        estimators=rip_stack_estimators, final_estimator=LogisticRegression()
    )
    rip_stack.fit(sktrain_x, sktrain_y)
    rip_stack_score = rip_stack.score(sktest_x, sktest_y)
    # print('rip_stack_score',rip_stack_score)
    assert rip_stack_score != lone_tree_score
# Module-level fixtures shared by the prediction tests.
X_DF = DF.drop(CLASS_FEAT, axis=1)
Y_DF = DF[CLASS_FEAT]
XY_NP = DF.values
X_NP = X_DF.values
Y_NP = Y_DF.values
NP_CLASS_FEAT = -1

######
irep = IREP(random_state=42)
rip = RIPPER(random_state=42)

#####
train, test = df_shuffled_split(DF, random_state=42)
test_x, test_y = test.drop(CLASS_FEAT, axis=1), test[CLASS_FEAT]
irep.fit(train, class_feat=CLASS_FEAT, pos_class=CREDIT_POS_CLASS)
rip.fit(train, class_feat=CLASS_FEAT, pos_class=CREDIT_POS_CLASS)

#####
def test_predict():
    """Fitted IREP predictions are boolean, mixed, and match the expected count."""
    predictions = irep.predict(test_x)
    # Every prediction must be a boolean label.
    assert all(pred in (True, False) for pred in predictions)
    # The model must neither accept nor reject every example.
    assert any(pred != True for pred in predictions)
    assert any(pred != False for pred in predictions)
    # Regression check: this deterministic split yields exactly 128 positives.
    assert sum(predictions) == 128
def test_shuffled_splits_are_len_7_len_3():
    """A 0.7 split of 10 examples yields partitions of 7 and 3 rows,
    regardless of shuffle order (random_state=None).
    """
    first, second = df_shuffled_split(FIRST_10_EXAMPLES, 0.7, random_state=None)
    assert len(first) == 7
    assert len(second) == 3
def _grow_ruleset(self, pos_df, neg_df, initial_model=None):
    """Grow a Ruleset with (optional) pruning.

    Repeatedly grows one Rule at a time on a grow/prune split of the
    still-uncovered examples, adding each acceptable Rule to the Ruleset
    and removing the examples it covers, until no positive examples
    remain, an early-stopping limit is hit, or a pruned Rule's precision
    falls below 0.5.

    Parameters
    ----------
    pos_df : positive-class examples (presumably a pandas DataFrame — TODO confirm)
    neg_df : negative-class examples, same conventions as pos_df
    initial_model : optional seed model passed to self._ruleset_frommodel

    Returns
    -------
    The grown (and, if self.prune_size is set, pruned) Ruleset.
    """
    ruleset = self._ruleset_frommodel(initial_model)
    ruleset._set_possible_conds(pos_df, neg_df)
    if self.verbosity >= 2:
        print("growing ruleset...")
        print(f"initial model: {ruleset}")
        print()
    prune_size = (self.prune_size if self.prune_size is not None else 0
                  )  # If not pruning, use all the data for growing
    # Work on copies so the caller's DataFrames are not mutated by the
    # drop/rm_covered calls below.
    pos_remaining = pos_df.copy()
    neg_remaining = neg_df.copy()
    # NOTE(review): self.rules is reset here but rules are accumulated on
    # `ruleset`, not self.rules — confirm this reset is intentional.
    self.rules = []
    # Stop adding disjunctions if there are no more positive examples to cover
    while len(pos_remaining) > 0:
        # If applicable, check for user-specified early stopping
        # (stop_early is unqualified, unlike the base_functions.* calls —
        # presumably imported at module level; verify.)
        if stop_early(ruleset, self.max_rules, self.max_total_conds):
            break
        # Grow-prune split remaining uncovered examples (if applicable).
        # With prune_size == 0 the growset gets all remaining examples.
        pos_growset, pos_pruneset = base_functions.df_shuffled_split(
            pos_remaining, (1 - prune_size), random_state=self.random_state)
        neg_growset, neg_pruneset = base_functions.df_shuffled_split(
            neg_remaining, (1 - prune_size), random_state=self.random_state)
        if self.verbosity >= 2:
            print(
                f"pos_growset {len(pos_growset)} pos_pruneset {len(pos_pruneset)}"
            )
            print(
                f"neg_growset {len(neg_growset)} neg_pruneset {len(neg_pruneset)}"
            )
            if not prune_size:
                print(f"(pruning is turned off)")
        # Grow Rule
        grown_rule = base_functions.grow_rule(
            pos_growset,
            neg_growset,
            ruleset.possible_conds,
            max_rule_conds=self.max_rule_conds,
            verbosity=self.verbosity,
        )
        # If not pruning, add Rule to Ruleset and drop only the covered positive examples
        if not prune_size:
            ruleset.add(grown_rule)
            if self.verbosity >= 2:
                print(
                    f"updated ruleset: {ruleset.truncstr(direction='right')}"
                )
                print()
            # Only positive examples are removed in the no-pruning branch;
            # neg_remaining is left untouched here.
            rule_covers_pos = grown_rule.covers(pos_remaining)
            pos_remaining = pos_remaining.drop(rule_covers_pos.index, axis=0)
            if self.verbosity >= 3:
                print(
                    f"examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg"
                )
                print()
        # If pruning, prune Rule, assess if it's time to stop, and drop all covered examples
        else:
            pruned_rule = base_functions.prune_rule(
                grown_rule,
                _IREP_prune_metric,
                pos_pruneset,
                neg_pruneset,
                verbosity=self.verbosity,
            )
            # Stop if the Rule is bad: precision on the prune set must be
            # defined (non-falsy) and at least 0.50.
            prune_precision = base_functions.precision(
                pruned_rule, pos_pruneset, neg_pruneset)
            if not prune_precision or prune_precision < 0.50:
                break
            # Otherwise, add new Rule, remove covered examples, and continue
            else:
                ruleset.add(pruned_rule)
                if self.verbosity >= 2:
                    print(
                        f"updated ruleset: {ruleset.truncstr(direction='right')}"
                    )
                    print()
                # rm_covered removes covered examples from both classes here,
                # unlike the positives-only removal in the no-pruning branch.
                pos_remaining, neg_remaining = base_functions.rm_covered(
                    pruned_rule, pos_remaining, neg_remaining)
                if self.verbosity >= 3:
                    print(
                        f"examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg"
                    )
                    print()
    # Return new ruleset
    return ruleset