def test_merge_rule_statistics_of_duplicate(self):
    """Checks that the statistics are updated correctly if a duplicate rule is generated during the
    generalization step in bracid().

    Rule 0 and rule 1 are identical except for their name, so merging the duplicate (rule 1) into the
    original (rule 0) must fold rule 1's bookkeeping entries into rule 0's across all global my_vars stats.
    """
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=1),  # Duplicate
    ]
    orig_idx = 0
    dupl_idx = 1
    # Register both rules so that both hash to the same key in unique_rules
    my_vars.unique_rules = {}
    my_vars.all_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
        my_vars.all_rules[rule.name] = rule
    print("hashes", my_vars.unique_rules)
    # Some random values
    my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
    my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
    my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
    my_vars.closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                        4: Data(rule_id=1, dist=0.13), 5: Data(rule_id=76, dist=3)}
    my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}
    # Delete entries of the rule with ID 1 as the one with ID 0 already exists
    merge_rule_statistics_of_duplicate(rules[orig_idx], rules[dupl_idx])
    # Read: example with ID 0 is seed for the rule with ID 5....
    correct_seed_example_rule = {0: {5}, 10: {0}, 4: {7}}
    # Read: rule with ID 5 has as seed example the one with ID 0...
    correct_seed_rule_example = {5: 0, 0: 10, 7: 4}
    correct_unique_rules = {compute_hashable_key(rules[orig_idx]): {0}}
    correct_all_rules = {0: rules[orig_idx]}
    # extra_rule now also covers the 3 examples to which the 2 deleted rules were closest
    correct_closest_examples_per_rule = {0: {0, 3, 4}, 4: {8}}
    correct_closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                        4: Data(rule_id=0, dist=0.13), 5: Data(rule_id=76, dist=3)}
    correct_covered_by_rule = {2: {3}, 0: {43, 12, 7}}
    self.assertTrue(my_vars.seed_rule_example == correct_seed_rule_example)
    self.assertTrue(my_vars.seed_example_rule == correct_seed_example_rule)
    self.assertTrue(my_vars.unique_rules == correct_unique_rules)
    self.assertTrue(my_vars.all_rules == correct_all_rules)
    self.assertTrue(my_vars.closest_examples_per_rule == correct_closest_examples_per_rule)
    self.assertTrue(my_vars.closest_rule_per_example == correct_closest_rule_per_example)
    self.assertTrue(my_vars.examples_covered_by_rule == correct_covered_by_rule)
def test_evaluate_f1_update_confusion_matrix_not_updated(self):
    """Tests what happens if input has a numeric and a nominal feature and a rule that predicts an example is
    not updated as F1 score doesn't improve.

    After the call, the confusion matrix and the closest-rule-per-example mapping must be unchanged.
    """
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                  name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana"},
                  name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.examples_covered_by_rule = {}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}
    new_rule = pd.Series({"A": "low", "B": (0.5, 0.5), "C": (3, 3), "Class": "banana"}, name=4)
    correct_f1 = 2 * 1 * 0.5 / 1.5
    f1 = evaluate_f1_update_confusion_matrix(df, new_rule, class_col_name, lookup, min_max, classes)
    # Expected to be identical to the initial mapping: the new rule doesn't improve F1
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    self.assertTrue(f1 == correct_f1)
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id][0] and
                        abs(dist - correct_closest_rule_per_example[example_id][1]) < 0.001)
    correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}
    self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
def test_delete_rule_statistics_collision(self):
    """Deletes a rule that shares its hash with other rules.

    Rules 0 and 1 are duplicates (same hash); both are deleted one after the other, and all global my_vars
    statistics must afterwards only reference the remaining rule (extra_rule, ID 4).
    """
    extra_rule = pd.Series({"A": "high", "B": Bounds(lower=0.1, upper=1), "C": Bounds(lower=1, upper=2),
                            "Class": "apple"}, name=4)
    rules = [
        extra_rule,
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=1),  # Duplicate
    ]
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "apple", "apple", "apple", "apple"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 1,
                    'low': 2,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'apple': 1}),
                            'low': Counter({'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 0.1, "max": 1}, "C": {"min": 1, "max": 3}})
    my_vars.minority_class = "apple"
    # Register all rules; the two duplicates end up under the same hash key
    my_vars.unique_rules = {}
    my_vars.all_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
        my_vars.all_rules[rule.name] = rule
    print("hashes", my_vars.unique_rules)
    # Some random values
    my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
    my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
    my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
    my_vars.closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                        4: Data(rule_id=1, dist=0.13), 5: Data(rule_id=76, dist=3)}
    my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}
    final_rules = {}
    # Delete entries for rules with IDs 0 and 1 from all statistics
    rule1 = rules.pop()
    delete_rule_statistics(df, rule1, rules, final_rules, class_col_name, lookup, min_max, classes)
    rule2 = rules.pop()
    delete_rule_statistics(df, rule2, rules, final_rules, class_col_name, lookup, min_max, classes)
    correct_seed_example_rule = {4: {7}}
    correct_seed_rule_example = {5: 0, 7: 4}
    correct_unique_rules = {compute_hashable_key(extra_rule): {4}}
    correct_all_rules = {4: extra_rule}
    # extra_rule now also covers the 3 examples to which the 2 deleted rules were closest
    correct_closest_examples_per_rule = {4: {8, 0, 3, 4}}
    correct_closest_rule_per_example = {5: Data(rule_id=76, dist=3), 4: Data(rule_id=4, dist=0.25),
                                        0: Data(rule_id=4, dist=0.25),
                                        3: Data(rule_id=4, dist=0.371141975308642)}
    correct_covered_by_rule = {2: {3}}
    self.assertTrue(my_vars.seed_rule_example == correct_seed_rule_example)
    self.assertTrue(my_vars.seed_example_rule == correct_seed_example_rule)
    self.assertTrue(my_vars.unique_rules == correct_unique_rules)
    self.assertTrue(my_vars.all_rules == correct_all_rules)
    self.assertTrue(my_vars.closest_examples_per_rule == correct_closest_examples_per_rule)
    self.assertTrue(my_vars.closest_rule_per_example == correct_closest_rule_per_example)
    self.assertTrue(my_vars.examples_covered_by_rule == correct_covered_by_rule)
def test_find_neighbors_numeric_nominal_stats(self):
    """Tests that global statistics are updated accordingly when finding the k nearest examples of a rule."""
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    # Reset because other tests added data, so if you only run this test it would work, but not if other
    # tests are run prior to that
    my_vars.examples_covered_by_rule = {}
    my_vars.closest_examples_per_rule = {}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                  name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana"},
                  name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}
    # my_vars.all_rules = {0: rule}
    k = 4
    correct = df.iloc[[5, 2, 3, 4]]
    classes = ["apple", "banana"]
    my_vars.minority_class = "banana"
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    correct_covered = {}
    correct_examples_per_rule = {0: {1, 2, 4, 5}, 1: {0, 3}}
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=0, dist=0.09),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=0, dist=0.0006250000000000001)}
    neighbors, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max, classes,
                                            label_type=my_vars.SAME_LABEL_AS_RULE,
                                            only_uncovered_neighbors=False)
    self.assertTrue(neighbors.equals(correct))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
    self.assertTrue(correct_examples_per_rule == my_vars.closest_examples_per_rule)
    # Compare entries elementwise with a tolerance for the float distances
    for example_id, (rule_id, dist) in correct_closest_rule_per_example.items():
        features = my_vars.all_rules[rule_id].size
        self.assertTrue(example_id in my_vars.closest_rule_per_example)
        other_id, other_dist = my_vars.closest_rule_per_example[example_id]
        other_features = my_vars.all_rules[other_id].size
        self.assertTrue(rule_id == other_id)
        self.assertTrue(features == other_features)
        self.assertTrue(abs(dist - other_dist) < 0.0001)
def test_add_all_good_rules(self):
    """Tests that rule set is updated when a generalized rule improves F1.

    Two new rules (IDs 6 and 7) are expected to be added, so latest_rule_id must end up at 7.
    """
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    # Use majority class as minority to have multiple neighbors and see if the function works correctly
    my_vars.minority_class = "banana"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                  name=1),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana"},
                  name=2)  # Current rule to be tested is always at the end
    ]
    test_idx = -1
    my_vars.latest_rule_id = len(rules) - 1
    my_vars.examples_covered_by_rule = {}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[test_idx], 3: rules[2], 4: rules[3], 5: rules[4]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.unique_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
    initial_correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    initial_f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup, min_max, classes)
    correct_confusion_matrix = {my_vars.TP: {2, 5}, my_vars.FP: set(), my_vars.TN: {0, 1},
                                my_vars.FN: {3, 4}}
    correct_rules = 8
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # Make sure confusion matrix, closest rule per example are correct at the beginning
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == initial_correct_closest_rule_per_example[example_id].rule_id and
                        abs(dist - initial_correct_closest_rule_per_example[example_id].dist) < 0.001)
    correct_initial_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(initial_f1 == correct_initial_f1)
    k = 3
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup, min_max,
                                                classes, label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_all_good_rules(df, neighbors, rules[test_idx], rules, initial_f1,
                                                     class_col_name, lookup, min_max, classes)
    self.assertTrue(improved is True)
    print("f1", f1)
    # correct_covered = {2: {0, 1, 2, 3, 4, 5}}
    correct_covered = {6: {0, 1, 2, 4, 5}, 7: {3}}
    correct_confusion_matrix = {my_vars.TP: {2, 3, 4, 5}, my_vars.FP: {0, 1}, my_vars.TN: set(),
                                my_vars.FN: set()}
    # correct_closest_rule_per_example = {
    #     0: Data(rule_id=2, dist=0.0),
    #     1: Data(rule_id=2, dist=0.0),
    #     2: Data(rule_id=2, dist=0.0),
    #     3: Data(rule_id=2, dist=0.0),
    #     4: Data(rule_id=2, dist=0.0),
    #     5: Data(rule_id=2, dist=0.0)}
    correct_closest_rule_per_example = {
        0: Data(rule_id=6, dist=0.0),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=6, dist=0.0),
        3: Data(rule_id=7, dist=0.0),
        4: Data(rule_id=6, dist=0.0),
        5: Data(rule_id=6, dist=0.0)}
    correct_f1 = 0.8
    self.assertTrue(correct_f1 == f1)
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id].rule_id and
                        abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # latest_rule_id must be 7 as 2 new rules were added to the 5 initial rules
    self.assertTrue(len(updated_rules) == correct_rules and my_vars.latest_rule_id == (correct_rules - 1))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
def test_evaluate_f1_temporarily(self):
    """Tests that the global variables won't be updated despite local changes.

    evaluate_f1_temporarily() must return updated local copies of the statistics while leaving the
    my_vars globals (conf_matrix, closest_rule_per_example, closest_examples_per_rule) untouched.
    """
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # Reset as other tests change the data
    my_vars.examples_covered_by_rule = {}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                  name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana"},
                  name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    # Snapshots taken before the call; the globals must still equal them afterwards
    correct_closest_rules = copy.deepcopy(my_vars.closest_rule_per_example)
    correct_closest_examples = copy.deepcopy(my_vars.closest_examples_per_rule)
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5}, my_vars.FN: set()}
    new_rule = pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.0), "C": Bounds(lower=3, upper=3),
                          "Class": "banana"}, name=0)
    correct_f1 = 0.8
    f1, conf_matrix, closest_rules, closest_examples, covered, updated_example_ids = \
        evaluate_f1_temporarily(df, new_rule, new_rule.name, class_col_name, lookup, min_max, classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.0),
        5: Data(rule_id=2, dist=0.67015625)}
    correct_covered = {0: {4}}
    correct_updated_examples = [4]
    self.assertTrue(updated_example_ids == correct_updated_examples)
    self.assertTrue(f1 == correct_f1)
    # Local result is still the same as in test_evaluate_f1_update_confusion_matrix.py
    for example_id in closest_rules:
        rule_id, dist = closest_rules[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id][0] and
                        abs(dist - correct_closest_rule_per_example[example_id][1]) < 0.001)
    self.assertTrue(closest_examples == my_vars.closest_examples_per_rule)
    correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3}, my_vars.TN: {2, 4, 5}, my_vars.FN: set()}
    self.assertTrue(conf_matrix == correct_conf_matrix)
    # But now check that global variables remained unaffected by the changes
    correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5}, my_vars.FN: set()}
    self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
    self.assertTrue(correct_closest_rules == my_vars.closest_rule_per_example)
    self.assertTrue(correct_closest_examples == my_vars.closest_examples_per_rule)
    self.assertTrue(correct_covered == covered)
def test_find_nearest_rule_ties(self):
    """Tests that ties (multiple rules cover an example) are resolved properly."""
    df = pd.DataFrame({"A": ["low", "low", "low"], "B": [1, 1, 2], "C": [1, 2, 3],
                       "Class": ["apple", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # All three rules cover the examples; ties must be broken deterministically
    rules = [
        pd.Series({"A": "low", "B": (1, 2), "C": (1, 3), "Class": "apple"}, name=0),
        pd.Series({"B": (1, 2), "C": (1, 3), "Class": "apple"}, name=1),
        pd.Series({"B": (0, 3), "C": (1, 4), "Class": "apple"}, name=2),
    ]
    # Reset because other tests change the data
    my_vars.closest_examples_per_rule = {}
    my_vars.closest_rule_per_example = {}
    my_vars.examples_covered_by_rule = {}
    my_vars.unique_rules = {}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2]}
    my_vars.conf_matrix = {}
    for example_id, example in df.iterrows():
        rule, dist, was_updated = find_nearest_rule(rules, example, class_col_name, lookup, min_max, classes,
                                                    my_vars.examples_covered_by_rule,
                                                    label_type=my_vars.ALL_LABELS,
                                                    only_uncovered_neighbors=False)
        # print("eid: {} rule:\n{}\ndist: {} updated: {}".format(example_id, rule, dist, was_updated))
        print("eid: {} rule: {} dist: {} updated: {}".format(example_id, rule.name, dist, was_updated))
        self.assertTrue(was_updated is True)
    print("closest rules")
    print(my_vars.closest_rule_per_example)
    # Note: it's permissible that rule 1 covers example 1 (although example 1 is the seed for rule 1)
    # because rule 1 already covers example 0
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.0),
        1: Data(rule_id=1, dist=0.0),
        2: Data(rule_id=1, dist=0.0),
    }
    correct_closest_examples_per_rule = {
        1: {0, 1, 2},
    }
    print(my_vars.closest_rule_per_example)
    print(my_vars.closest_examples_per_rule)
    # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id].rule_id and
                        abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(correct_closest_examples_per_rule == my_vars.closest_examples_per_rule)
def test_find_nearest_rule_no_ties(self):
    """Tests that the nearest rule is found per example assuming no ties."""
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": (1, 1), "C": (3, 3), "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": (1, 1), "C": (2, 2), "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": (4, 4), "C": (1, 1), "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": (1.5, 1.5), "C": (0.5, 0.5), "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": (0.5, 0.5), "C": (3, 3), "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": (0.75, 0.75), "C": (2, 2), "Class": "banana"}, name=5)
    ]
    # Reset because other tests change the data
    my_vars.closest_examples_per_rule = {}
    my_vars.closest_rule_per_example = {}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}, 6: {8}}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}
    my_vars.examples_covered_by_rule = {6: {8}}
    my_vars.unique_rules = {}
    my_vars.conf_matrix = {}
    for example_id, example in df.iterrows():
        rule, dist, was_updated = find_nearest_rule(rules, example, class_col_name, lookup, min_max, classes,
                                                    my_vars.examples_covered_by_rule,
                                                    label_type=my_vars.ALL_LABELS,
                                                    only_uncovered_neighbors=False)
        # print("eid: {} rule:\n{}\ndist: {} updated: {}".format(example_id, rule, dist, was_updated))
        self.assertTrue(was_updated is True)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    correct_closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    print(my_vars.closest_rule_per_example)
    print(correct_closest_rule_per_example)
    # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id].rule_id and
                        abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(correct_closest_examples_per_rule == my_vars.closest_examples_per_rule)
def test_add_one_best_rule_unique(self):
    """Tests that the best rule found by this function is unique and correspondingly updates relevant
    statistics if that's not the case.

    The generalization of rule 0 becomes identical to the existing rule 6, so the duplicate must be
    merged away and all statistics must end up referencing rule 6.
    """
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts (overall and per class) for the nominal feature "A"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # name=6 because this guy already exists in the rules and the new rule with name=0 becomes the same, so
    # it's removed
    correct_generalized_rule = pd.Series({"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=6)
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                  name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana"},
                  name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2.0, upper=3),
                   "Class": "apple"}, name=6),  # same as best rule
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                  name=0)  # Current rule is always at the end of the list
    ]
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules[rule_hash] = {rule.name}
    correct_generalized_rule_hash = compute_hashable_key(correct_generalized_rule)
    my_vars.examples_covered_by_rule = {}
    my_vars.all_rules = {0: rules[test_idx], 1: rules[0], 2: rules[1], 3: rules[2], 4: rules[3], 5: rules[4],
                         6: rules[5]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    # Note that 6: {8} is incorrect and was just added to test if the entries are merged correctly
    my_vars.examples_covered_by_rule = {6: {8}}
    print("rule hashes", my_vars.unique_rules)
    print(correct_generalized_rule_hash)
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625),
        8: Data(rule_id=6, dist=0)  # Fake entry
    }
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5}, my_vars.FN: set()}
    initial_f1 = 0.66666
    k = 3
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup, min_max,
                                                classes, label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(df, neighbors, rules[test_idx], rules, initial_f1,
                                                    class_col_name, lookup, min_max, classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=6, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625),
        8: Data(rule_id=6, dist=0)}
    self.assertTrue(improved is True)
    correct_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    correct_confusion_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                                my_vars.FN: set()}
    # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        # 8 was only added to test something else, since it won't be in the result
        # if example_id != 8:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id].rule_id and
                        abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[5].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # Duplicate rule was deleted so that the last rule now corresponds to the rule with id
    self.assertTrue(len(rules) - 1 == len(updated_rules) and updated_rules[-1].name == 6)
def test_add_one_best_rule_no_update(self):
    """Tests that the rule set is NOT updated when no generalized rule improves F1.

    The initial F1 is deliberately set higher than any achievable generalization
    (0.8 vs. the true 0.6666), so add_one_best_rule() must leave the rule set,
    the confusion matrix, and the closest-rule mapping unchanged.
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Per-feature value counts plus class-conditional counts for nominal feature "A"
    lookup = {
        "A": {
            'high': 2,
            'low': 4,
            my_vars.CONDITIONAL: {
                'high': Counter({'banana': 2}),
                'low': Counter({'banana': 2, 'apple': 2})
            }
        }
    }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({
        "B": {"min": 1, "max": 5},
        "C": {"min": 1, "max": 11}
    })
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                   "C": Bounds(lower=2, upper=2), "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4),
                   "C": Bounds(lower=1, upper=1), "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5),
                   "C": Bounds(lower=0.5, upper=0.5), "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5),
                   "C": Bounds(lower=3, upper=3), "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75),
                   "C": Bounds(lower=2, upper=2), "Class": "banana"}, name=5),
        # Current rule is always at the end of the list
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                   "C": Bounds(lower=3, upper=3), "Class": "apple"}, name=0)
    ]
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    my_vars.all_rules = {
        0: rules[test_idx], 1: rules[1], 2: rules[2],
        3: rules[3], 4: rules[4], 5: rules[0]
    }
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.conf_matrix = {
        my_vars.TP: {0, 1},
        my_vars.FP: set(),
        my_vars.TN: {2, 5},
        my_vars.FN: {3, 4}
    }
    my_vars.examples_covered_by_rule = {}
    # F1 is actually 0.6666, but setting it to 0.8 makes it not update any rule
    initial_f1 = 0.8
    k = 3
    my_vars.unique_rules = {}
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
    neighbors, dists, _ = find_nearest_examples(
        df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(
        df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
        lookup, min_max, classes)
    # Nothing improved, hence every mapping must be exactly as initialized above
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    self.assertTrue(improved is False)
    correct_f1 = initial_f1
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    correct_generalized_rule = pd.Series(
        {"A": "low", "B": (1, 1), "C": (3, 3), "Class": "apple"}, name=0)
    correct_confusion_matrix = {
        my_vars.TP: {0, 1},
        my_vars.FP: set(),
        my_vars.TN: {2, 5},
        my_vars.FN: {3, 4}
    }
    # Make sure confusion matrix, closest rule per example, and rule set were
    # updated with the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id
            and abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
def test_add_one_best_rule_update_stats(self):
    """Tests that rule set is updated when a generalized rule improves F1 and
    also the mapping of closest rule per example changes."""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Per-feature value counts plus class-conditional counts for nominal feature "A"
    lookup = {
        "A": {
            'high': 2,
            'low': 4,
            my_vars.CONDITIONAL: {
                'high': Counter({'banana': 2}),
                'low': Counter({'banana': 2, 'apple': 2})
            }
        }
    }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({
        "B": {"min": 1, "max": 5},
        "C": {"min": 1, "max": 11}
    })
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                   "C": Bounds(lower=2, upper=2), "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4),
                   "C": Bounds(lower=1, upper=1), "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5),
                   "C": Bounds(lower=0.5, upper=0.5), "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5),
                   "C": Bounds(lower=3, upper=3), "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75),
                   "C": Bounds(lower=2, upper=2), "Class": "banana"}, name=5),
        # Current rule is always at the end of the list
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                   "C": Bounds(lower=3, upper=3), "Class": "apple"}, name=0)
    ]
    my_vars.closest_examples_per_rule = {
        0: {4},
        1: {0, 1, 3},  # Change compared to previous test case
        2: {5},
        5: {2}
    }
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        # Change compared to previous test case
        1: Data(rule_id=1, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    # Reset because other tests change the data
    # my_vars.examples_covered_by_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}, 6: {8}}
    my_vars.examples_covered_by_rule = {}
    # NOTE(review): rules[5] is the rule named 0 (the same object as
    # rules[test_idx]); the sibling no-update test maps rule id 5 to rules[0]
    # (named 1) instead — confirm which mapping is intended here.
    my_vars.all_rules = {
        0: rules[test_idx], 1: rules[1], 2: rules[2],
        3: rules[3], 4: rules[4], 5: rules[5]
    }
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.unique_rules = {}
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
    # Actually, correctly it should've been
    # my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}
    # at the start (i.e. F1=0.66666), but to see if it changes, it's changed
    my_vars.conf_matrix = {
        my_vars.TP: {0},
        my_vars.FP: set(),
        my_vars.TN: {1, 2, 5},
        my_vars.FN: {3, 4}
    }
    initial_f1 = 0.1
    k = 3
    neighbors, dists, _ = find_nearest_examples(
        df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(
        df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
        lookup, min_max, classes)
    # Example 1 is now covered (dist 0) by the generalized rule 0
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    correct_closest_examples_per_rule = {
        0: {1, 4},
        1: {0, 3},
        2: {5},
        5: {2}
    }
    correct_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    self.assertTrue(improved is True)
    # Rule 0's "C" bound widened from (3, 3) to (2.0, 3) by the generalization
    correct_generalized_rule = pd.Series(
        {"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=0)
    correct_confusion_matrix = {
        my_vars.TP: {0, 1},
        my_vars.FP: set(),
        my_vars.TN: {2, 5},
        my_vars.FN: {3, 4}
    }
    # Make sure confusion matrix, closest rule per example, and rule set were
    # updated with the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id
            and abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    self.assertTrue(correct_closest_examples_per_rule == my_vars.closest_examples_per_rule)