예제 #1
0
 def test_find_neighbors_numeric_nominal_covers(self):
     """Tests that the stats for a newly covered rule are updated (dist = 0).

     Calls ``find_nearest_examples`` for a rule with ``A == "high"`` and
     ``B == (1, 1)`` and verifies the returned neighbors as well as the
     global statistics kept in ``my_vars``: ``examples_covered_by_rule``,
     ``closest_examples_per_rule`` and ``closest_rule_per_example``.
     """
     df = pd.DataFrame({
         "A": ["low", "low", "high", "high", "low", "high"],
         "B": [1, 1, 1, 1, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     class_col_name = "Class"
     lookup = {
         "A": {
             'high': 2,
             'low': 4,
             my_vars.CONDITIONAL: {
                 'high': Counter({'banana': 2}),
                 'low': Counter({'banana': 2, 'apple': 2}),
             },
         },
     }
     rules = [
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=3, upper=3), "Class": "apple"},
                   name=0),
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=2, upper=2), "Class": "apple"},
                   name=1),
         pd.Series({"A": "high", "B": Bounds(lower=4, upper=4),
                    "C": Bounds(lower=1, upper=1), "Class": "banana"},
                   name=2),
         pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=0.5), "Class": "banana"},
                   name=3),
         pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5),
                    "C": Bounds(lower=3, upper=3), "Class": "banana"},
                   name=4),
         pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75),
                    "C": Bounds(lower=2, upper=2), "Class": "banana"},
                   name=5),
     ]
     # Every rule is registered under its own name (0..5, in list order)
     my_vars.all_rules = {r.name: r for r in rules}
     my_vars.closest_rule_per_example = {
         0: (1, 0.010000000000000002),
         1: (0, 0.010000000000000002),
         2: (5, 0.67015625),
         3: (1, 0.038125),
         4: (0, 0.015625),
         5: (2, 0.67015625),
     }
     my_vars.closest_examples_per_rule = {
         0: {1, 4},
         1: {0, 3},
         2: {5},
         5: {2},
     }
     k = 4
     correct = df.iloc[[2, 3, 5, 4]]
     rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
     classes = ["apple", "banana"]
     my_vars.minority_class = "banana"
     min_max = pd.DataFrame({
         "A": {"min": 1, "max": 5},
         "B": {"min": 1, "max": 11},
     })
     # An example could be covered by multiple rules, so example 2 should be
     # covered by rules 0 and 1 at the end
     my_vars.examples_covered_by_rule = {1: {2}}
     correct_covered = {0: {2, 3}, 1: {2}}
     correct_examples_per_rule = {0: {1, 2, 3, 4, 5}, 1: {0}}
     correct_closest_rule_per_example = {
         0: (1, 0.010000000000000002),
         1: (0, 0.010000000000000002),
         2: (0, 0.0),
         3: (0, 0.0),
         4: (0, 0.015625),
         5: (0, 0.0006250000000000001),
     }
     neighbors, _, _ = find_nearest_examples(
         df,
         k,
         rule,
         class_col_name,
         lookup,
         min_max,
         classes,
         label_type=my_vars.SAME_LABEL_AS_RULE,
         only_uncovered_neighbors=False)
     self.assertTrue(neighbors.equals(correct))
     self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
     self.assertEqual(correct_examples_per_rule,
                      my_vars.closest_examples_per_rule)
     for example_id, (rule_id,
                      dist) in correct_closest_rule_per_example.items():
         self.assertIn(example_id, my_vars.closest_rule_per_example)
         other_id, other_dist = my_vars.closest_rule_per_example[example_id]
         self.assertEqual(rule_id, other_id)
         # Distances are floats, so compare with a tolerance
         self.assertLess(abs(dist - other_dist), 0.0001)
예제 #2
0
 def test_find_neighbors_numeric_nominal(self):
     """Tests what happens if input has a numeric and a nominal feature.

     Calls ``find_nearest_examples`` with ``k`` neighbors for a rule with
     ``A == "high"`` and ``B == Bounds(1, 1)`` and checks that exactly ``k``
     rows come back, in the expected order.
     """
     df = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     class_col_name = "Class"
     lookup = {
         "A": {
             'high': 2,
             'low': 4,
             my_vars.CONDITIONAL: {
                 'high': Counter({'banana': 2}),
                 'low': Counter({'banana': 2, 'apple': 2}),
             },
         },
     }
     k = 4
     # Row positions of the expected neighbors in the order they should be
     # returned; for smaller k only the first k of them are expected.
     expected_order = [5, 2, 3, 4]
     correct = df.iloc[expected_order[:k]]
     rule = pd.Series({
         "A": "high",
         "B": Bounds(lower=1, upper=1),
         "Class": "banana"
     })
     classes = ["apple", "banana"]
     min_max = pd.DataFrame({
         "A": {"min": 1, "max": 5},
         "B": {"min": 1, "max": 11},
     })
     # Reset as other tests changed the content of the dictionary
     my_vars.closest_rule_per_example = {}
     neighbors, _, _ = find_nearest_examples(
         df,
         k,
         rule,
         class_col_name,
         lookup,
         min_max,
         classes,
         label_type=my_vars.SAME_LABEL_AS_RULE,
         only_uncovered_neighbors=False)
     # Fail with a clear assertion (not an AttributeError) if nothing is
     # returned
     self.assertIsNotNone(neighbors)
     self.assertEqual(k, neighbors.shape[0])
     self.assertTrue(neighbors.equals(correct))
예제 #3
0
    def test_find_neighbors_numeric_nominal_covered(self):
        """Tests neighbor search on numeric + nominal features when some examples are already covered.

        With ``only_uncovered_neighbors=True`` the examples registered in
        ``my_vars.examples_covered_by_rule`` for the rule must not be
        returned as neighbors.
        """
        k = 4
        label_column = "Class"
        class_labels = ["apple", "banana"]
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        conditional_counts = {
            'high': Counter({'banana': 2}),
            'low': Counter({'banana': 2, 'apple': 2}),
        }
        value_stats = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: conditional_counts,
            }
        }
        my_vars.closest_rule_per_example = {}
        if k >= 4:
            # Examples at indices 2 and 4 are already covered by the rule, so
            # they must not be returned as neighbors
            my_vars.examples_covered_by_rule = {0: {2, 4}}
            expected = df.iloc[[5, 3]]
        elif k == 3:
            expected = df.iloc[[5, 2, 3]]
        elif k == 2:
            expected = df.iloc[[5, 2]]
        elif k == 1:
            expected = df.iloc[[5]]
        else:
            expected = None
        my_vars.all_rules = {}
        candidate = pd.Series(
            {"A": "high", "B": Bounds(lower=1, upper=1), "Class": "banana"},
            name=0)
        feature_ranges = pd.DataFrame({
            "A": {"min": 1, "max": 5},
            "B": {"min": 1, "max": 11},
        })

        result = find_nearest_examples(
            df, k, candidate, label_column, value_stats, feature_ranges,
            class_labels,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        # Only the neighbors (first of the three returned values) are checked
        found, _, _ = result
        self.assertTrue(found.equals(expected))
예제 #4
0
    def test_find_neighbors_numeric_nominal_stats(self):
        """Tests that global statistics are updated accordingly.

        Pre-populates ``my_vars.closest_rule_per_example`` and
        ``my_vars.closest_examples_per_rule``, calls ``find_nearest_examples``
        for rule 0 and checks the returned neighbors together with the
        resulting contents of ``my_vars.examples_covered_by_rule``,
        ``my_vars.closest_examples_per_rule`` and
        ``my_vars.closest_rule_per_example``.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        # Reset because other tests added data, so if you only run this test
        # it would work, but not if other tests are run prior to that
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2},
        }
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                       "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                       "C": Bounds(lower=2, upper=2), "Class": "apple"},
                      name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4),
                       "C": Bounds(lower=1, upper=1), "Class": "banana"},
                      name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5),
                       "C": Bounds(lower=0.5, upper=0.5), "Class": "banana"},
                      name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5),
                       "C": Bounds(lower=3, upper=3), "Class": "banana"},
                      name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75),
                       "C": Bounds(lower=2, upper=2), "Class": "banana"},
                      name=5),
        ]
        # Every rule is registered under its own name (0..5, in list order)
        my_vars.all_rules = {r.name: r for r in rules}
        k = 4
        correct = df.iloc[[5, 2, 3, 4]]

        classes = ["apple", "banana"]
        my_vars.minority_class = "banana"
        min_max = pd.DataFrame({
            "A": {"min": 1, "max": 5},
            "B": {"min": 1, "max": 11},
        })
        correct_covered = {}
        correct_examples_per_rule = {0: {1, 2, 4, 5}, 1: {0, 3}}
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=0, dist=0.09),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=0, dist=0.0006250000000000001),
        }
        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rule,
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=False)
        self.assertTrue(neighbors.equals(correct))
        self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
        self.assertEqual(correct_examples_per_rule,
                         my_vars.closest_examples_per_rule)
        for example_id, (rule_id,
                         dist) in correct_closest_rule_per_example.items():
            self.assertIn(example_id, my_vars.closest_rule_per_example)
            other_id, other_dist = my_vars.closest_rule_per_example[example_id]
            self.assertEqual(rule_id, other_id)
            self.assertEqual(my_vars.all_rules[rule_id].size,
                             my_vars.all_rules[other_id].size)
            # Distances are floats, so compare with a tolerance
            self.assertLess(abs(dist - other_dist), 0.0001)
예제 #5
0
 def test_find_neighbors_numeric_nominal_label_type(self):
     """Tests neighbor search on numeric + nominal features for each ``label_type`` value.

     The same rule is queried three times, with ``my_vars.ALL_LABELS``,
     ``my_vars.SAME_LABEL_AS_RULE`` and ``my_vars.OPPOSITE_LABEL_TO_RULE``,
     and every result is compared against its own expected rows.
     """
     k = 3
     label_column = "Class"
     class_labels = ["apple", "banana"]
     df = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     conditional_counts = {
         'high': Counter({'banana': 2}),
         'low': Counter({'banana': 2, 'apple': 2}),
     }
     value_stats = {
         "A": {
             'high': 2,
             'low': 4,
             my_vars.CONDITIONAL: conditional_counts,
         }
     }
     known_rules = [
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=3, upper=3), "Class": "apple"},
                   name=0),
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=2, upper=2), "Class": "apple"},
                   name=1),
         pd.Series({"A": "high", "B": Bounds(lower=4, upper=4),
                    "C": Bounds(lower=1, upper=1), "Class": "banana"},
                   name=2),
         pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=0.5), "Class": "banana"},
                   name=3),
         pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5),
                    "C": Bounds(lower=3, upper=3), "Class": "banana"},
                   name=4),
         pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75),
                    "C": Bounds(lower=2, upper=2), "Class": "banana"},
                   name=5),
     ]
     # Rule names are 0..5 in list order, so name and position coincide
     my_vars.all_rules = {r.name: r for r in known_rules}
     my_vars.closest_rule_per_example = {}
     my_vars.closest_examples_per_rule = {}
     expected_all = df.iloc[[5, 2, 0]]
     expected_same = df.iloc[[5, 2, 3]]
     expected_opposite = df.iloc[[0, 1]]
     candidate = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"},
                           name=0)
     feature_ranges = pd.DataFrame({
         "A": {"min": 1, "max": 5},
         "B": {"min": 1, "max": 11},
     })

     def nearest_for(label_type):
         # Same query every time; only the label filter varies
         found, _, _ = find_nearest_examples(
             df, k, candidate, label_column, value_stats, feature_ranges,
             class_labels,
             label_type=label_type,
             only_uncovered_neighbors=False)
         return found

     found_all = nearest_for(my_vars.ALL_LABELS)
     found_same = nearest_for(my_vars.SAME_LABEL_AS_RULE)
     found_opposite = nearest_for(my_vars.OPPOSITE_LABEL_TO_RULE)
     print(found_all)
     print(found_same)
     print(found_opposite)
     self.assertTrue(found_all.equals(expected_all))
     self.assertTrue(found_same.equals(expected_same))
     self.assertTrue(found_opposite.equals(expected_opposite))
예제 #6
0
    def test_add_all_good_rules(self):
        """Tests that rule set is updated when a generalized rule improves F1.

        First verifies the initial confusion matrix, closest rule per example
        and F1 score produced by ``evaluate_f1_initialize_confusion_matrix``.
        Then calls ``add_all_good_rules`` for the rule at the end of
        ``rules`` and checks the new F1 score, the closest rule per example,
        the confusion matrix, the size of the rule set, the latest rule id
        and the covered examples.
        """
        df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                           "C": [3, 2, 1, .5, 3, 2],
                           "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        # Use majority class as minority to have multiple neighbors and see if the function works correctly
        my_vars.minority_class = "banana"
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                      name=1),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2)  # Current rule to be tested is always at the end
        ]
        test_idx = -1
        my_vars.latest_rule_id = len(rules) - 1
        my_vars.examples_covered_by_rule = {}
        my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[test_idx], 3: rules[2], 4: rules[3], 5: rules[4]}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.unique_rules = {}
        for rule in rules:
            hash_val = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)

        initial_correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)}
        initial_f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup, min_max, classes)
        correct_confusion_matrix = {my_vars.TP: {2, 5}, my_vars.FP: set(), my_vars.TN: {0, 1}, my_vars.FN: {3, 4}}
        correct_rules = 8
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)

        # Make sure confusion matrix, closest rule per example are correct at the beginning
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = initial_correct_closest_rule_per_example[example_id]
            self.assertEqual(expected.rule_id, rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)

        correct_initial_f1 = 2 * 0.5 * 1 / 1.5
        # F1 scores are floats, so compare approximately instead of with ==
        self.assertAlmostEqual(correct_initial_f1, initial_f1)
        k = 3
        neighbors, _, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_all_good_rules(df, neighbors, rules[test_idx], rules, initial_f1,
                                                         class_col_name, lookup, min_max, classes)
        self.assertIs(improved, True)
        correct_covered = {6: {0, 1, 2, 4, 5}, 7: {3}}
        correct_confusion_matrix = {my_vars.TP: {2, 3, 4, 5}, my_vars.FP: {0, 1}, my_vars.TN: set(), my_vars.FN: set()}
        correct_closest_rule_per_example = {
            0: Data(rule_id=6, dist=0.0),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=6, dist=0.0),
            3: Data(rule_id=7, dist=0.0),
            4: Data(rule_id=6, dist=0.0),
            5: Data(rule_id=6, dist=0.0)
        }
        correct_f1 = 0.8
        self.assertAlmostEqual(correct_f1, f1)
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(expected.rule_id, rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        # latest_rule_id must be 7 as 2 new rules were added to the 6 initial rules (ids 0-5)
        self.assertEqual(correct_rules, len(updated_rules))
        self.assertEqual(correct_rules - 1, my_vars.latest_rule_id)
        self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
예제 #7
0
    def test_add_one_best_rule_unique(self):
        """Tests that the best rule found by this function is unique and correspondingly updates relevant
            statistics if that's not the case.

        The rule set already contains a rule (name=6) that equals the
        expected generalization of the rule under test (name=0). After
        ``add_one_best_rule`` the test expects one rule fewer in the rule set,
        with the last rule being the one named 6, and checks the F1 score,
        the confusion matrix and the closest rule per example.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        my_vars.minority_class = "apple"
        # name=6 because this guy already exists in the rules and the new rule with name=0 becomes the same, so
        # it's removed
        correct_generalized_rule = pd.Series(
            {
                "A": "low",
                "B": (1, 1),
                "C": (2.0, 3),
                "Class": "apple"
            }, name=6)
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                       "C": Bounds(lower=2, upper=2), "Class": "apple"},
                      name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4),
                       "C": Bounds(lower=1, upper=1), "Class": "banana"},
                      name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5),
                       "C": Bounds(lower=0.5, upper=0.5), "Class": "banana"},
                      name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5),
                       "C": Bounds(lower=3, upper=3), "Class": "banana"},
                      name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75),
                       "C": Bounds(lower=2, upper=2), "Class": "banana"},
                      name=5),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                       "C": Bounds(lower=2.0, upper=3), "Class": "apple"},
                      name=6),  # same as best rule
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1),
                       "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0)  # Current rule is always at the end of the list
        ]
        # Start from a clean registry so that hashes left behind by other
        # tests cannot influence the duplicate detection
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)

        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[0],
            2: rules[1],
            3: rules[2],
            4: rules[3],
            5: rules[4],
            6: rules[5]
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
        my_vars.seed_example_rule = {
            0: {0},
            1: {1},
            2: {2},
            3: {3},
            4: {4},
            5: {5}
        }

        my_vars.closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2}
        }
        # Note that 6: {8} is incorrect and was just added to test if the entries are merged correctly
        my_vars.examples_covered_by_rule = {6: {8}}
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
            8: Data(rule_id=6, dist=0)  # Fake entry
        }
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set()
        }
        initial_f1 = 0.66666
        k = 3
        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rules[test_idx],
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
            lookup, min_max, classes)
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=6, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
            8: Data(rule_id=6, dist=0)
        }
        self.assertIs(improved, True)
        correct_f1 = 2 * 0.5 * 1 / 1.5
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set()
        }

        # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(expected.rule_id, rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertTrue(updated_rules[5].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        # Duplicate rule was deleted so that the last rule now corresponds to the rule with id 6
        self.assertEqual(len(rules) - 1, len(updated_rules))
        self.assertEqual(6, updated_rules[-1].name)
예제 #8
0
    def test_add_one_best_rule_no_update(self):
        """Tests that rule set is not updated when no generalized rule improves F1.

        The initial F1 passed to ``add_one_best_rule`` is set to 0.8 although the
        fixture's real F1 is 0.6666, so no generalization can beat it. The test
        then checks that ``improved`` is False, the returned F1 equals the initial
        one, and the rule, confusion matrix and closest-rule-per-example mapping
        are the same as before the call.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class":
            ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        # Value counts of nominal feature "A", overall and per class label
        lookup = \
            {
                "A":
                    {
                        'high': 2,
                        'low': 4,
                        my_vars.CONDITIONAL:
                            {
                                'high':
                                    Counter({
                                        'banana': 2
                                    }),
                                'low':
                                    Counter({
                                        'banana': 2,
                                        'apple': 2
                                    })
                            }
                    }
            }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {
                "min": 1,
                "max": 5
            },
            "C": {
                "min": 1,
                "max": 11
            }
        })
        my_vars.minority_class = "apple"
        rules = [
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=2, upper=2),
                    "Class": "apple"
                },
                name=1),
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=4, upper=4),
                    "C": Bounds(lower=1, upper=1),
                    "Class": "banana"
                },
                name=2),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=0.5),
                    "Class": "banana"
                },
                name=3),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=0.5, upper=0.5),
                    "C": Bounds(lower=3, upper=3),
                    "Class": "banana"
                },
                name=4),
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=0.75, upper=0.75),
                    "C": Bounds(lower=2, upper=2),
                    "Class": "banana"
                },
                name=5),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=3, upper=3),
                    "Class": "apple"
                },
                name=0)  # Current rule is always at the end of the list
        ]
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        # NOTE(review): ids 1-5 are mapped by list position, not by rule name
        # (e.g. rules[1].name == 2 and rules[0].name == 1) - confirm this is intended.
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[1],
            2: rules[2],
            3: rules[3],
            4: rules[4],
            5: rules[0]
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {
            0: {0},
            1: {1},
            2: {2},
            3: {3},
            4: {4},
            5: {5}
        }
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4}
        }
        my_vars.examples_covered_by_rule = {}
        # F1 is actually 0.6666, but setting it to 0.8 makes it not update any rule
        initial_f1 = 0.8
        k = 3
        # Group rule names by the hash of the rule's content
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)

        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rules[test_idx],
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
            lookup, min_max, classes)

        # Expected state equals the state before the call: nothing may change
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        # Written with plain tuples; presumably Bounds compares equal to a tuple
        # with the same values - confirm against its definition.
        correct_generalized_rule = pd.Series(
            {
                "A": "low",
                "B": (1, 1),
                "C": (3, 3),
                "Class": "apple"
            }, name=0)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4}
        }
        correct_f1 = initial_f1

        self.assertIs(improved, False)
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        # Make sure confusion matrix, closest rule per example, and rule set were left untouched
        for example_id, (rule_id, dist) in \
                my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            context = "example {}".format(example_id)
            self.assertEqual(rule_id, expected.rule_id, msg=context)
            self.assertLess(abs(dist - expected.dist), 0.001, msg=context)
        self.assertTrue(
            updated_rules[test_idx].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
예제 #9
0
    def test_add_one_best_rule_update_stats(self):
        """Tests that rule set is updated when a generalized rule improves F1 and also the mapping of closest rule per
        example changes.

        The fixture starts with a low initial F1 (0.1) and a confusion matrix in
        which example 1 is filed under TN instead of TP. After
        ``add_one_best_rule`` the test expects ``improved`` to be True, example 1
        to be closest to rule 0 with distance 0, and the confusion matrix and
        ``my_vars.closest_examples_per_rule`` to reflect that change.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class":
            ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        # Value counts of nominal feature "A", overall and per class label
        lookup = \
            {
                "A":
                    {
                        'high': 2,
                        'low': 4,
                        my_vars.CONDITIONAL:
                            {
                                'high':
                                    Counter({
                                        'banana': 2
                                    }),
                                'low':
                                    Counter({
                                        'banana': 2,
                                        'apple': 2
                                    })
                            }
                    }
            }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {
                "min": 1,
                "max": 5
            },
            "C": {
                "min": 1,
                "max": 11
            }
        })
        my_vars.minority_class = "apple"
        rules = [
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=2, upper=2),
                    "Class": "apple"
                },
                name=1),
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=4, upper=4),
                    "C": Bounds(lower=1, upper=1),
                    "Class": "banana"
                },
                name=2),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=0.5),
                    "Class": "banana"
                },
                name=3),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=0.5, upper=0.5),
                    "C": Bounds(lower=3, upper=3),
                    "Class": "banana"
                },
                name=4),
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=0.75, upper=0.75),
                    "C": Bounds(lower=2, upper=2),
                    "Class": "banana"
                },
                name=5),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=3, upper=3),
                    "Class": "apple"
                },
                name=0)  # Current rule is always at the end of the list
        ]
        my_vars.closest_examples_per_rule = {
            0: {4},
            1: {0, 1, 3},  # Change compared to previous test case
            2: {5},
            5: {2}
        }
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=1, dist=0.010000000000000002
                    ),  # Change compared to previous test case
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        # Reset because other tests change the data
        my_vars.examples_covered_by_rule = {}
        # NOTE(review): ids 1-5 are mapped by list position, not by rule name
        # (e.g. rules[1].name == 2, and rules[5] is the same rule as rules[test_idx])
        # - confirm this is intended.
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[1],
            2: rules[2],
            3: rules[3],
            4: rules[4],
            5: rules[5]
        }
        # NOTE(review): the 6: 8 entry has no matching rule or example in this
        # fixture; it looks carried over from another test - confirm it is needed.
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
        my_vars.seed_example_rule = {
            0: {0},
            1: {1},
            2: {2},
            3: {3},
            4: {4},
            5: {5}
        }
        # Group rule names by the hash of the rule's content
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)

        # The consistent starting state would be TP={0, 1}, FP={}, TN={2, 5}, FN={3, 4}
        # (i.e. F1=0.66666). Example 1 is deliberately filed under TN instead so that
        # the test can observe the confusion matrix being changed by the call.
        my_vars.conf_matrix = {
            my_vars.TP: {0},
            my_vars.FP: set(),
            my_vars.TN: {1, 2, 5},
            my_vars.FN: {3, 4}
        }
        initial_f1 = 0.1
        k = 3
        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rules[test_idx],
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
            lookup, min_max, classes)

        # Example 1 is now expected to be closest to rule 0 at distance 0
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        correct_closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2}
        }
        correct_f1 = 2 * 0.5 * 1 / 1.5
        # Written with plain tuples; presumably Bounds compares equal to a tuple
        # with the same values - confirm against its definition.
        correct_generalized_rule = pd.Series(
            {
                "A": "low",
                "B": (1, 1),
                "C": (2.0, 3),
                "Class": "apple"
            }, name=0)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4}
        }

        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        self.assertIs(improved, True)
        # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
        for example_id, (rule_id, dist) in \
                my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            context = "example {}".format(example_id)
            self.assertEqual(rule_id, expected.rule_id, msg=context)
            self.assertLess(abs(dist - expected.dist), 0.001, msg=context)
        self.assertTrue(
            updated_rules[test_idx].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        self.assertEqual(correct_closest_examples_per_rule,
                         my_vars.closest_examples_per_rule)