Exemplo n.º 1
0
    def test_merge_rule_statistics_of_duplicate(self):
        """Checks that the statistics are updated correctly if a duplicate rule is generated during the generalization
        step in bracid().

        Two identical rules with IDs 0 and 1 are registered in the global statistics of my_vars. After calling
        merge_rule_statistics_of_duplicate() with rule 0 as original and rule 1 as duplicate, rule 1 is expected to
        be gone from all statistics and its entries are expected to be listed under rule 0 instead.
        """
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=1),  # Duplicate
        ]
        orig_idx = 0
        dupl_idx = 1

        # Register both rules; identical rules end up under the same hash key
        my_vars.unique_rules = {}
        my_vars.all_rules = {}
        for rule in rules:
            hash_val = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
            my_vars.all_rules[rule.name] = rule

        # Some random values
        my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
        my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
        my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
        my_vars.closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                            4: Data(rule_id=1, dist=0.13), 5: Data(rule_id=76, dist=3)}
        my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}

        # Delete entries of the rule with ID 1 as the one with ID 0 already exists
        merge_rule_statistics_of_duplicate(rules[orig_idx], rules[dupl_idx])

        # Read: example with ID 0 is seed for the rule with ID 5....
        correct_seed_example_rule = {0: {5}, 10: {0}, 4: {7}}
        # Read: rule with ID 5 has as seed example the one with ID 0...
        correct_seed_rule_example = {5: 0, 0: 10, 7: 4}
        correct_unique_rules = {compute_hashable_key(rules[orig_idx]): {0}}
        correct_all_rules = {0: rules[orig_idx]}
        # Rule 0 takes over example 4, which was closest to the duplicate rule 1 before the merge
        correct_closest_examples_per_rule = {0: {0, 3, 4}, 4: {8}}
        correct_closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                            4: Data(rule_id=0, dist=0.13), 5: Data(rule_id=76, dist=3)}
        # Rule 0 also takes over example 7, which was covered by the duplicate rule 1
        correct_covered_by_rule = {2: {3}, 0: {43, 12, 7}}

        # assertEqual (instead of assertTrue(a == b)) reports both values if the comparison fails
        self.assertEqual(my_vars.seed_rule_example, correct_seed_rule_example)
        self.assertEqual(my_vars.seed_example_rule, correct_seed_example_rule)
        self.assertEqual(my_vars.unique_rules, correct_unique_rules)
        self.assertEqual(my_vars.all_rules, correct_all_rules)
        self.assertEqual(my_vars.closest_examples_per_rule, correct_closest_examples_per_rule)
        self.assertEqual(my_vars.closest_rule_per_example, correct_closest_rule_per_example)
        self.assertEqual(my_vars.examples_covered_by_rule, correct_covered_by_rule)
    def test_evaluate_f1_update_confusion_matrix_not_updated(self):
        """Tests what happens if input has a numeric and a nominal feature and a rule that predicts an example is
        not updated as F1 score doesn't improve.

        The global closest rule per example and the global confusion matrix are set up by hand, then
        evaluate_f1_update_confusion_matrix() is called with a rule for example 4. Both global statistics are
        expected to be the same after the call as before.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        my_vars.minority_class = "apple"

        # One rule per example of df, the rule's name is the ID of its example
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
        ]
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}
        new_rule = pd.Series({"A": "low", "B": (0.5, 0.5), "C": (3, 3), "Class": "banana"}, name=4)
        correct_f1 = 2 * 1 * 0.5 / 1.5

        f1 = evaluate_f1_update_confusion_matrix(df, new_rule, class_col_name, lookup, min_max, classes)

        # Same values as before the call because nothing is supposed to be updated
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}

        # F1 is a float, so don't rely on bit-exact equality
        self.assertAlmostEqual(f1, correct_f1)
        for example_id in my_vars.closest_rule_per_example:
            rule_id, dist = my_vars.closest_rule_per_example[example_id]
            correct_rule_id, correct_dist = correct_closest_rule_per_example[example_id]
            # Separate assertions so that a failure tells whether the rule or the distance is wrong
            self.assertEqual(rule_id, correct_rule_id)
            self.assertLess(abs(dist - correct_dist), 0.001)
        self.assertEqual(my_vars.conf_matrix, correct_conf_matrix)
Exemplo n.º 3
0
    def test_delete_rule_statistics_collision(self):
        """Deletes a rule that shares its hash with other rules.

        Rules 0 and 1 are identical, rule 4 (extra_rule) is different. Rules 1 and 0 are deleted one after the
        other with delete_rule_statistics(); afterwards only the entries of rule 4 (and of IDs that don't belong to
        the deleted rules) are expected to remain in the global statistics of my_vars.
        """
        extra_rule = pd.Series({"A": "high", "B": Bounds(lower=0.1, upper=1), "C": Bounds(lower=1, upper=2),
                                "Class": "apple"}, name=4)
        rules = [
            extra_rule,
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=1),  # Duplicate
        ]
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "apple", "apple", "apple", "apple"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 1,
                'low': 2,
                my_vars.CONDITIONAL: {
                    'high': Counter({'apple': 1}),
                    'low': Counter({'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 0.1, "max": 1}, "C": {"min": 1, "max": 3}})
        my_vars.minority_class = "apple"

        # Register all rules; the two identical rules end up under the same hash key
        my_vars.unique_rules = {}
        my_vars.all_rules = {}
        for rule in rules:
            hash_val = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
            my_vars.all_rules[rule.name] = rule

        # Some random values
        my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
        my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
        my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=0, dist=3),
            3: Data(rule_id=0, dist=2),
            4: Data(rule_id=1, dist=0.13),
            5: Data(rule_id=76, dist=3),
        }
        my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}
        final_rules = {}

        # Delete entries for rules with IDs 0 and 1 from all statistics; pop() removes the rule with ID 1 first
        rule1 = rules.pop()
        delete_rule_statistics(df, rule1, rules, final_rules, class_col_name, lookup, min_max, classes)
        rule2 = rules.pop()
        delete_rule_statistics(df, rule2, rules, final_rules, class_col_name, lookup, min_max, classes)

        correct_seed_example_rule = {4: {7}}
        correct_seed_rule_example = {5: 0, 7: 4}
        correct_unique_rules = {compute_hashable_key(extra_rule): {4}}
        correct_all_rules = {4: extra_rule}
        # extra_rule now also covers the 3 examples to which the 2 deleted rules were closest
        correct_closest_examples_per_rule = {4: {8, 0, 3, 4}}
        correct_closest_rule_per_example = {
            5: Data(rule_id=76, dist=3),
            4: Data(rule_id=4, dist=0.25),
            0: Data(rule_id=4, dist=0.25),
            3: Data(rule_id=4, dist=0.371141975308642),
        }
        correct_covered_by_rule = {2: {3}}

        # assertEqual (instead of assertTrue(a == b)) reports both values if the comparison fails
        self.assertEqual(my_vars.seed_rule_example, correct_seed_rule_example)
        self.assertEqual(my_vars.seed_example_rule, correct_seed_example_rule)
        self.assertEqual(my_vars.unique_rules, correct_unique_rules)
        self.assertEqual(my_vars.all_rules, correct_all_rules)
        self.assertEqual(my_vars.closest_examples_per_rule, correct_closest_examples_per_rule)
        self.assertEqual(my_vars.closest_rule_per_example, correct_closest_rule_per_example)
        self.assertEqual(my_vars.examples_covered_by_rule, correct_covered_by_rule)
Exemplo n.º 4
0
    def test_find_neighbors_numeric_nominal_stats(self):
        """Tests that global statistics are updated accordingly.

        find_nearest_examples() is called for a rule with ID 0 and k=4. Besides the returned neighbors, the test
        checks the global statistics in my_vars: the examples covered per rule, the closest examples per rule and
        the closest rule per example.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        # Reset because other tests added data, so if you only run this test it would work, but not if other
        # tests are run prior to that
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
        ]
        my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}
        k = 4
        correct = df.iloc[[5, 2, 3, 4]]

        classes = ["apple", "banana"]
        my_vars.minority_class = "banana"
        # NOTE(review): min_max is keyed by "A" and "B" although "A" holds strings in df; the expected distances
        # below presumably depend on the "B" range only - confirm that the "A" entry is intentional
        min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
        correct_covered = {}
        correct_examples_per_rule = {0: {1, 2, 4, 5}, 1: {0, 3}}
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=0, dist=0.09),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=0, dist=0.0006250000000000001),
        }

        neighbors, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=False)

        self.assertTrue(neighbors.equals(correct))
        # assertEqual (instead of assertTrue(a == b)) reports both values if the comparison fails
        self.assertEqual(my_vars.examples_covered_by_rule, correct_covered)
        self.assertEqual(my_vars.closest_examples_per_rule, correct_examples_per_rule)
        for example_id, (rule_id, dist) in correct_closest_rule_per_example.items():
            features = my_vars.all_rules[rule_id].size
            self.assertIn(example_id, my_vars.closest_rule_per_example)
            other_id, other_dist = my_vars.closest_rule_per_example[example_id]
            other_features = my_vars.all_rules[other_id].size
            self.assertEqual(rule_id, other_id)
            self.assertEqual(features, other_features)
            self.assertLess(abs(dist - other_dist), 0.0001)
Exemplo n.º 5
0
    def test_add_all_good_rules(self):
        """Tests that rule set is updated when a generalized rule improves F1.

        First the confusion matrix and the closest rule per example are initialized with
        evaluate_f1_initialize_confusion_matrix() and verified. Then the neighbors of the rule with ID 2 are
        determined and passed to add_all_good_rules(); the resulting F1 score, confusion matrix, closest rule per
        example, covered examples and number of rules are checked.
        """
        df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                           "C": [3, 2, 1, .5, 3, 2],
                           "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        # Use majority class as minority to have multiple neighbors and see if the function works correctly
        my_vars.minority_class = "banana"
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                      name=1),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2)  # Current rule to be tested is always at the end
        ]
        test_idx = -1
        my_vars.latest_rule_id = len(rules) - 1
        my_vars.examples_covered_by_rule = {}
        # Keys are the rule IDs (= names of the Series), hence the indices differ for the rules with IDs 2-5
        my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[test_idx], 3: rules[2], 4: rules[3], 5: rules[4]}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.unique_rules = {}
        for rule in rules:
            hash_val = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)

        initial_correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)}
        initial_f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup, min_max, classes)
        correct_confusion_matrix = {my_vars.TP: {2, 5}, my_vars.FP: set(), my_vars.TN: {0, 1}, my_vars.FN: {3, 4}}
        correct_rules = 8
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)

        # Make sure confusion matrix, closest rule per example are correct at the beginning
        for example_id in my_vars.closest_rule_per_example:
            rule_id, dist = my_vars.closest_rule_per_example[example_id]
            # Separate assertions so that a failure tells whether the rule or the distance is wrong
            self.assertEqual(rule_id, initial_correct_closest_rule_per_example[example_id].rule_id)
            self.assertLess(abs(dist - initial_correct_closest_rule_per_example[example_id].dist), 0.001)

        correct_initial_f1 = 2 * 0.5 * 1 / 1.5
        # F1 is a float, so don't rely on bit-exact equality
        self.assertAlmostEqual(initial_f1, correct_initial_f1)
        k = 3
        neighbors, _, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_all_good_rules(df, neighbors, rules[test_idx], rules, initial_f1,
                                                         class_col_name, lookup, min_max, classes)
        self.assertIs(improved, True)

        correct_covered = {6: {0, 1, 2, 4, 5}, 7: {3}}
        correct_confusion_matrix = {my_vars.TP: {2, 3, 4, 5}, my_vars.FP: {0, 1}, my_vars.TN: set(), my_vars.FN: set()}
        correct_closest_rule_per_example = {
            0: Data(rule_id=6, dist=0.0),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=6, dist=0.0),
            3: Data(rule_id=7, dist=0.0),
            4: Data(rule_id=6, dist=0.0),
            5: Data(rule_id=6, dist=0.0)
        }
        correct_f1 = 0.8
        self.assertAlmostEqual(f1, correct_f1)
        for example_id in my_vars.closest_rule_per_example:
            rule_id, dist = my_vars.closest_rule_per_example[example_id]
            self.assertEqual(rule_id, correct_closest_rule_per_example[example_id].rule_id)
            self.assertLess(abs(dist - correct_closest_rule_per_example[example_id].dist), 0.001)
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        # latest_rule_id must be 7 as 2 new rules were added to the 6 initial rules (IDs 0-5)
        self.assertEqual(len(updated_rules), correct_rules)
        self.assertEqual(my_vars.latest_rule_id, correct_rules - 1)
        self.assertEqual(my_vars.examples_covered_by_rule, correct_covered)
    def test_evaluate_f1_temporarily(self):
        """Tests that the global variables won't be updated despite local changes.

        evaluate_f1_temporarily() is called with a rule with ID 0. The returned (local) F1 score, confusion matrix,
        closest rules, covered examples and updated example IDs are checked first; afterwards the test verifies
        that the corresponding global statistics in my_vars are still the ones that were set up before the call.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        my_vars.minority_class = "apple"
        # Reset as other tests change the data
        my_vars.examples_covered_by_rule = {}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}

        # One rule per example of df, the rule's name is the ID of its example
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
        ]
        my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4], 5: rules[5]}

        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
        # Snapshots of the global state to detect unwanted modifications after the call
        correct_closest_rules = copy.deepcopy(my_vars.closest_rule_per_example)
        correct_closest_examples = copy.deepcopy(my_vars.closest_examples_per_rule)
        my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5}, my_vars.FN: set()}
        new_rule = pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.0), "C": Bounds(lower=3, upper=3),
                              "Class": "banana"}, name=0)
        correct_f1 = 0.8

        f1, conf_matrix, closest_rules, closest_examples, covered, updated_example_ids = \
            evaluate_f1_temporarily(df, new_rule, new_rule.name, class_col_name, lookup, min_max, classes)

        # Expected local result: only the distance of example 4 changes (to 0.0)
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.0),
            5: Data(rule_id=2, dist=0.67015625),
        }
        correct_covered = {0: {4}}
        correct_updated_examples = [4]
        self.assertEqual(updated_example_ids, correct_updated_examples)
        # F1 is a float, so don't rely on bit-exact equality
        self.assertAlmostEqual(f1, correct_f1)
        for example_id in closest_rules:
            rule_id, dist = closest_rules[example_id]
            correct_rule_id, correct_dist = correct_closest_rule_per_example[example_id]
            # Separate assertions so that a failure tells whether the rule or the distance is wrong
            self.assertEqual(rule_id, correct_rule_id)
            self.assertLess(abs(dist - correct_dist), 0.001)
        self.assertEqual(closest_examples, my_vars.closest_examples_per_rule)
        correct_local_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3}, my_vars.TN: {2, 4, 5}, my_vars.FN: set()}
        self.assertEqual(conf_matrix, correct_local_conf_matrix)

        # But now check that global variables remained unaffected by the changes
        correct_global_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                                      my_vars.FN: set()}
        self.assertEqual(my_vars.conf_matrix, correct_global_conf_matrix)
        self.assertEqual(my_vars.closest_rule_per_example, correct_closest_rules)
        self.assertEqual(my_vars.closest_examples_per_rule, correct_closest_examples)
        self.assertEqual(covered, correct_covered)
Exemplo n.º 7
0
    def test_find_nearest_rule_ties(self):
        """Verifies how find_nearest_rule() resolves ties, i.e. several rules covering the same example"""
        label_col = "Class"
        class_labels = ["apple", "banana"]
        examples = pd.DataFrame({
            "A": ["low", "low", "low"],
            "B": [1, 1, 2],
            "C": [1, 2, 3],
            "Class": ["apple", "banana", "banana"],
        })
        value_stats = {
            "A": {
                "high": 2,
                "low": 4,
                my_vars.CONDITIONAL: {
                    "high": Counter({"banana": 2}),
                    "low": Counter({"banana": 2, "apple": 2}),
                },
            },
        }
        numeric_ranges = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        rules = [
            pd.Series({"A": "low", "B": (1, 2), "C": (1, 3), "Class": "apple"}, name=0),
            pd.Series({"B": (1, 2), "C": (1, 3), "Class": "apple"}, name=1),
            pd.Series({"B": (0, 3), "C": (1, 4), "Class": "apple"}, name=2),
        ]

        # Module-level state is shared between tests, so every structure is set explicitly here
        my_vars.minority_class = "apple"
        my_vars.conf_matrix = {}
        my_vars.unique_rules = {}
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_rule_per_example = {}
        my_vars.closest_examples_per_rule = {}
        # Example i is the seed of rule i and vice versa
        my_vars.seed_rule_example = {idx: idx for idx in range(len(rules))}
        my_vars.seed_example_rule = {idx: {idx} for idx in range(len(rules))}
        my_vars.all_rules = dict(enumerate(rules))

        # Note: it's permissible that rule 1 covers example 1 (although example 1 is the seed for rule 1)
        # because rule 1 already covers example 0
        expected_rule_per_example = {
            0: Data(rule_id=1, dist=0.0),
            1: Data(rule_id=1, dist=0.0),
            2: Data(rule_id=1, dist=0.0),
        }
        expected_examples_per_rule = {1: {0, 1, 2}}

        for example_id, example in examples.iterrows():
            rule, dist, was_updated = find_nearest_rule(
                rules, example, label_col, value_stats, numeric_ranges, class_labels,
                my_vars.examples_covered_by_rule,
                label_type=my_vars.ALL_LABELS,
                only_uncovered_neighbors=False)
            print("eid: {} rule: {} dist: {} updated: {}".format(
                example_id, rule.name, dist, was_updated))
            self.assertIs(was_updated, True)

        print("closest rules")
        print(my_vars.closest_rule_per_example)
        print(my_vars.closest_rule_per_example)
        print(my_vars.closest_examples_per_rule)

        # Each stored entry must name the expected rule at (almost) the expected distance
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = expected_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertEqual(expected_examples_per_rule, my_vars.closest_examples_per_rule)
Exemplo n.º 8
0
    def test_find_nearest_rule_no_ties(self):
        """Verifies that find_nearest_rule() picks the expected nearest rule per example when there are no ties"""
        label_col = "Class"
        class_labels = ["apple", "banana"]
        examples = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, 0.5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        value_stats = {
            "A": {
                "high": 2,
                "low": 4,
                my_vars.CONDITIONAL: {
                    "high": Counter({"banana": 2}),
                    "low": Counter({"banana": 2, "apple": 2}),
                },
            },
        }
        numeric_ranges = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        # One row per rule: (A, bounds of B, bounds of C, class label); the row position is the rule ID
        rule_specs = [
            ("low", (1, 1), (3, 3), "apple"),
            ("low", (1, 1), (2, 2), "apple"),
            ("high", (4, 4), (1, 1), "banana"),
            ("low", (1.5, 1.5), (0.5, 0.5), "banana"),
            ("low", (0.5, 0.5), (3, 3), "banana"),
            ("high", (0.75, 0.75), (2, 2), "banana"),
        ]
        rules = [
            pd.Series({"A": a_val, "B": b_bounds, "C": c_bounds, "Class": label}, name=rule_id)
            for rule_id, (a_val, b_bounds, c_bounds, label) in enumerate(rule_specs)
        ]

        # Module-level state is shared between tests, so every structure is set explicitly here
        my_vars.minority_class = "apple"
        my_vars.closest_examples_per_rule = {}
        my_vars.closest_rule_per_example = {}
        # Example i seeds rule i; additionally example 8 is registered as the seed of rule 6
        seed_example_rule = {idx: {idx} for idx in range(len(rules))}
        seed_example_rule[6] = {8}
        my_vars.seed_example_rule = seed_example_rule
        seed_rule_example = {idx: idx for idx in range(len(rules))}
        seed_rule_example[6] = 8
        my_vars.seed_rule_example = seed_rule_example
        my_vars.all_rules = dict(enumerate(rules))
        my_vars.examples_covered_by_rule = {6: {8}}
        my_vars.unique_rules = {}
        my_vars.conf_matrix = {}

        for _, example in examples.iterrows():
            _, _, was_updated = find_nearest_rule(
                rules, example, label_col, value_stats, numeric_ranges, class_labels,
                my_vars.examples_covered_by_rule,
                label_type=my_vars.ALL_LABELS,
                only_uncovered_neighbors=False)
            self.assertIs(was_updated, True)

        expected_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        expected_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2},
        }
        print(my_vars.closest_rule_per_example)
        print(expected_rule_per_example)

        # Each stored entry must name the expected rule at (almost) the expected distance
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = expected_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertEqual(expected_examples_per_rule, my_vars.closest_examples_per_rule)
Exemplo n.º 9
0
    def test_add_one_best_rule_unique(self):
        """Tests that the best rule found by add_one_best_rule() is unique.

        Rule 0 (last entry of `rules`) is generalized; the expected generalization is identical to the already
        existing rule 6. The test expects the returned rule list to be one entry shorter with rule 6 as its
        last entry, and the closest-rule statistics of example 4 to point to rule 6 instead of rule 0.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2})
                }
            }
        }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11}
        })
        my_vars.minority_class = "apple"
        # name=6 because this guy already exists in the rules and the new rule with name=0 becomes the same, so
        # it's removed
        correct_generalized_rule = pd.Series(
            {"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=6)
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2.0, upper=3),
                       "Class": "apple"}, name=6),  # same as best rule
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0)  # Current rule is always at the end of the list
        ]
        # Reset because other tests change the data - otherwise hashes registered by previously executed tests
        # would still be present and this test would depend on the execution order
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
        correct_generalized_rule_hash = compute_hashable_key(correct_generalized_rule)

        # Keys are the rule names (IDs); the rule with ID 0 is stored at the end of `rules`
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[0],
            2: rules[1],
            3: rules[2],
            4: rules[3],
            5: rules[4],
            6: rules[5]
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2}
        }
        # Note that 6: {8} is incorrect and was just added to test if the entries are merged correctly
        my_vars.examples_covered_by_rule = {6: {8}}
        print("rule hashes", my_vars.unique_rules)
        print(correct_generalized_rule_hash)
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
            8: Data(rule_id=6, dist=0)  # Fake entry
        }
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set()
        }
        initial_f1 = 0.66666
        k = 3
        # Only the neighbors are needed; the remaining return values are ignored
        neighbors, _, _ = find_nearest_examples(
            df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name, lookup, min_max, classes)

        # Same as before the call, except that example 4 is now assigned to rule 6 instead of rule 0
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=6, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
            8: Data(rule_id=6, dist=0)
        }
        self.assertIs(improved, True)
        correct_f1 = 2 * 0.5 * 1 / 1.5
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set()
        }

        # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertTrue(updated_rules[5].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        # Duplicate rule was deleted so that the last rule now corresponds to the rule with ID 6
        self.assertEqual(len(rules) - 1, len(updated_rules))
        self.assertEqual(updated_rules[-1].name, 6)
Exemplo n.º 10
0
    def test_add_one_best_rule_no_update(self):
        """Tests that rule set is not updated when no generalized rule improves F1.

        The initial F1 score passed to add_one_best_rule() is deliberately set higher than the real one, and the
        test expects `improved` to be False, the returned F1 to equal the initial one, and rule 0, the confusion
        matrix and the closest rule per example to be unchanged.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2})
                }
            }
        }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11}
        })
        my_vars.minority_class = "apple"
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0)  # Current rule is always at the end of the list
        ]
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        # Inverse of the mapping above; set explicitly because other tests change the data
        my_vars.closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2}
        }
        # Keys must be the rule names (IDs): rules[0..4] are named 1..5 and the rule with ID 0 is stored last
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[0],
            2: rules[1],
            3: rules[2],
            4: rules[3],
            5: rules[4]
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4}
        }
        my_vars.examples_covered_by_rule = {}
        # F1 is actually 0.6666, but setting it to 0.8 makes it not update any rule
        initial_f1 = 0.8
        k = 3
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)

        # Only the neighbors are needed; the remaining return values are ignored
        neighbors, _, _ = find_nearest_examples(
            df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name, lookup, min_max, classes)

        # Identical to the mapping that was set before the call
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        self.assertIs(improved, False)
        correct_f1 = initial_f1
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        # Rule 0 is expected to keep its original bounds
        correct_generalized_rule = pd.Series(
            {"A": "low", "B": (1, 1), "C": (3, 3), "Class": "apple"}, name=0)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4}
        }
        # Make sure confusion matrix, closest rule per example, and rule set remained unchanged
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        print(rules[test_idx])
        print(correct_generalized_rule)
        print("updated")
        print(updated_rules)
        self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
Exemplo n.º 11
0
    def test_add_one_best_rule_update_stats(self):
        """Tests that rule set is updated when a generalized rule improves F1 and also the mapping of closest rule per
        example changes.

        Example 1 starts out assigned to rule 1 and is expected to be assigned to the generalized rule 0
        afterwards; the confusion matrix and the closest examples per rule are expected to change accordingly.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2})
                }
            }
        }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11}
        })
        my_vars.minority_class = "apple"
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0)  # Current rule is always at the end of the list
        ]
        my_vars.closest_examples_per_rule = {
            0: {4},
            1: {0, 1, 3},  # Change compared to previous test case
            2: {5},
            5: {2}
        }
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=1, dist=0.010000000000000002),  # Change compared to previous test case
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        # Reset because other tests change the data
        my_vars.examples_covered_by_rule = {}
        # Keys must be the rule names (IDs): rules[0..4] are named 1..5 and the rule with ID 0 is stored last
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[0],
            2: rules[1],
            3: rules[2],
            4: rules[3],
            5: rules[4]
        }
        # NOTE(review): the entry 6: 8 has no matching rule in this test; presumably copied from
        # test_add_one_best_rule_unique - confirm whether it is still needed
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)

        # Actually, correctly it should've been
        # my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}
        # at the start (i.e. F1=0.66666), but to see if it changes, it's changed
        my_vars.conf_matrix = {
            my_vars.TP: {0},
            my_vars.FP: set(),
            my_vars.TN: {1, 2, 5},
            my_vars.FN: {3, 4}
        }
        initial_f1 = 0.1
        k = 3
        # Only the neighbors are needed; the remaining return values are ignored
        neighbors, _, _ = find_nearest_examples(
            df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name, lookup, min_max, classes)

        # Example 1 is expected to move from rule 1 to the generalized rule 0
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        correct_closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2}
        }
        correct_f1 = 2 * 0.5 * 1 / 1.5
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        self.assertIs(improved, True)
        correct_generalized_rule = pd.Series(
            {"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=0)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4}
        }
        # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        print(correct_closest_examples_per_rule)
        print(my_vars.closest_examples_per_rule)
        self.assertEqual(correct_closest_examples_per_rule, my_vars.closest_examples_per_rule)