示例#1
0
 def test_find_duplicate_rule_id(self):
     """Tests that a duplicate rule is detected properly.

     Two rules with identical conditions but different IDs (7 and 12) are created. Only rule 7 is registered in
     my_vars, so looking up rule 12 is expected to return the ID of rule 7.
     """
     conditions = {
         "A": "low",
         "B": Bounds(lower=0.5, upper=1.5),
         "C": Bounds(lower=0.5, upper=3.0),
         "Class": "banana",
     }
     rules = [
         pd.Series(conditions, name=7),
         pd.Series(conditions, name=12),  # Duplicate
     ]
     duplicate_idx = 1
     # Only the original rule is known to the global bookkeeping
     my_vars.unique_rules = {compute_hashable_key(rules[0]): {7}}
     my_vars.all_rules = {7: rules[0]}
     duplicate_hash = compute_hashable_key(rules[duplicate_idx])
     duplicate_id = find_duplicate_rule_id(rules[duplicate_idx],
                                           duplicate_hash)
     print("duplicate ID:", duplicate_id)
     # assertEqual reports both values on failure, unlike assertTrue(a == b)
     self.assertEqual(duplicate_id, rules[0].name)
示例#2
0
    def test_merge_rule_statistics_of_duplicate(self):
        """Checks that the statistics are updated correctly if a duplicate rule is generated during the generalization
        step in bracid().

        Rule 1 is an exact copy of rule 0. After merging, every statistic that referred to rule 1 is expected to be
        either removed or re-assigned to rule 0.
        """
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=1),  # Duplicate
        ]

        orig_idx = 0
        dupl_idx = 1
        # Start from a clean state and register both rules under their (identical) hash
        my_vars.unique_rules = {}
        my_vars.all_rules = {}
        for rule in rules:
            hash_val = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
            my_vars.all_rules[rule.name] = rule
        print("hashes", my_vars.unique_rules)
        # Some random values
        my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
        my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
        my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
        my_vars.closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                            4: Data(rule_id=1, dist=0.13), 5: Data(rule_id=76, dist=3)}
        my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}

        # Delete entries of the rule with ID 1 as the one with ID 0 already exists
        merge_rule_statistics_of_duplicate(rules[orig_idx], rules[dupl_idx])

        # Read: example with ID 0 is seed for the rule with ID 5 (rule 1 was dropped from the set)...
        correct_seed_example_rule = {0: {5}, 10: {0}, 4: {7}}
        # Read: rule with ID 5 has as seed example the one with ID 0 (the entry of rule 1 is gone)...
        correct_seed_rule_example = {5: 0, 0: 10, 7: 4}
        correct_unique_rules = {compute_hashable_key(rules[orig_idx]): {0}}
        correct_all_rules = {0: rules[orig_idx]}
        # Rule 0 now is also the closest rule of example 4, which used to be closest to the deleted rule 1
        correct_closest_examples_per_rule = {0: {0, 3, 4}, 4: {8}}
        correct_closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                            4: Data(rule_id=0, dist=0.13), 5: Data(rule_id=76, dist=3)}
        # Example 7, previously covered by rule 1, is now covered by rule 0
        correct_covered_by_rule = {2: {3}, 0: {43, 12, 7}}

        # assertEqual prints a diff of the dictionaries on failure
        self.assertEqual(my_vars.seed_rule_example, correct_seed_rule_example)
        self.assertEqual(my_vars.seed_example_rule, correct_seed_example_rule)
        self.assertEqual(my_vars.unique_rules, correct_unique_rules)
        self.assertEqual(my_vars.all_rules, correct_all_rules)
        self.assertEqual(my_vars.closest_examples_per_rule, correct_closest_examples_per_rule)
        self.assertEqual(my_vars.closest_rule_per_example, correct_closest_rule_per_example)
        self.assertEqual(my_vars.examples_covered_by_rule, correct_covered_by_rule)
示例#3
0
 def test_are_duplicates_length(self):
     """Tests that two rules of different lengths can never be duplicates.

     The second rule lacks the nominal feature "A" but is otherwise identical to the first one.
     """
     rules = [
         pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1), "Class": "apple"},
                   name=1),
         pd.Series({"B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1), "Class": "apple"},
                   name=2),
     ]
     duplicate = _are_duplicates(rules[0], rules[1])
     # Identity check, as before: the result must be the bool False, not merely falsy
     self.assertIs(duplicate, False)
示例#4
0
 def test_is_duplicate_false(self):
     """Tests if no duplicate rule is detected.

     The new rule differs from both existing rules in the upper bound of "B", so -1 is expected as result.
     """
     rules = [
         pd.Series({"A": "high", "B": Bounds(lower=1, upper=2), "C": Bounds(lower=1, upper=3), "Class": "apple"},
                   name=0),
         pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1), "Class": "apple"},
                   name=1),
     ]
     new_rule = pd.Series(
         {"A": "high", "B": Bounds(lower=1, upper=3), "C": Bounds(lower=1, upper=3), "Class": "apple"},
         name=2)
     my_vars.all_rules = {0: rules[0], 1: rules[1]}
     rule_id = is_duplicate(new_rule, existing_rule_ids=[0, 1])
     # assertEqual shows the returned ID if the check fails
     self.assertEqual(rule_id, -1)
示例#5
0
 def test_are_duplicates_true(self):
     """Tests that two rules are detected as duplicates if only the rule ID is different in both rules"""
     conditions = {
         "A": "high",
         "B": Bounds(lower=1, upper=1),
         "C": Bounds(lower=1, upper=1),
         "Class": "apple",
     }
     # Same conditions, different rule IDs (= names of the series)
     rules = [
         pd.Series(conditions, name=1),
         pd.Series(conditions, name=2),
     ]
     duplicate = _are_duplicates(rules[0], rules[1])
     # Identity check, as before: the result must be the bool True, not merely truthy
     self.assertIs(duplicate, True)
示例#6
0
 def test_are_duplicates_bounds(self):
     """Tests that no duplicate rules are detected if they are different in a lower or upper boundary value.

     Both rules only differ in the lower bound of "B" (1 vs. 0.8).
     """
     rules = [
         pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1), "Class": "apple"},
                   name=1),
         pd.Series({"A": "high", "B": Bounds(lower=0.8, upper=1), "C": Bounds(lower=1, upper=1), "Class": "apple"},
                   name=2),
     ]
     duplicate = _are_duplicates(rules[0], rules[1])
     # Identity check, as before: the result must be the bool False, not merely falsy
     self.assertIs(duplicate, False)
示例#7
0
 def test_are_duplicates_nominal(self):
     """Tests that no duplicate rules are detected if they are different in a nominal feature.

     Both rules only differ in the nominal feature "A" ("low" vs. "high"); all numeric bounds and the class label
     are identical, so the test cannot pass by accident because of another differing feature.
     """
     rules = [
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                   name=1),
         pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                   name=2),
     ]
     duplicate = _are_duplicates(rules[0], rules[1])
     # Identity check, as before: the result must be the bool False, not merely falsy
     self.assertIs(duplicate, False)
示例#8
0
 def test_train(self):
     """Test with numeric and nominal features.

     train_binary() is called with 4 rules and the expected model maps each rule ID to a Support tuple.
     """
     training_set = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                                  "C": [3, 2, 1, .5, 3, 2],
                                  "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
     # Use majority class as minority to have multiple neighbors and see if the function works correctly
     minority_label = "banana"
     class_col_name = "Class"
     rules = {
         2: pd.Series({"B": Bounds(lower=1.25, upper=4.0), "C": Bounds(lower=0.5, upper=1.5),
                       "Class": "banana"}, name=2),
         6: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                       "Class": "banana"}, name=6),
         5: pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=4.0), "C": Bounds(lower=1.0, upper=2.5),
                       "Class": "banana"}, name=5),
         0: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                       "Class": "apple"}, name=0),
     }
     model = train_binary(rules, training_set, minority_label, class_col_name)
     correct_model = {
         2: Support(minority=1.0, majority=0.0),
         6: Support(minority=0.5, majority=0.5),
         5: Support(minority=1.0, majority=0.0),
         0: Support(minority=0.5, majority=0.5),
     }
     # assertEqual prints a diff of both models on failure
     self.assertEqual(model, correct_model)
示例#9
0
    def test_add_one_best_rule_unique(self):
        """Tests that the best rule found by this function is unique and correspondingly updates relevant
        statistics if that's not the case.

        The expected generalization of rule 0 is identical to the already existing rule 6. Hence, the test expects
        the returned rule list to be one rule shorter, to end with rule 6, and example 4 (previously closest to
        rule 0) to be assigned to rule 6.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        my_vars.minority_class = "apple"
        # name=6 because this guy already exists in the rules and the new rule with name=0 becomes the same, so
        # it's removed
        correct_generalized_rule = pd.Series({"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=6)
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2.0, upper=3),
                       "Class": "apple"}, name=6),  # same as best rule
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),  # Current rule is always at the end of the list
        ]
        # Reset as other tests change the content of the dictionary
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules[rule_hash] = {rule.name}
        correct_generalized_rule_hash = compute_hashable_key(
            correct_generalized_rule)

        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[0],
            2: rules[1],
            3: rules[2],
            4: rules[3],
            5: rules[4],
            6: rules[5],
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
        # Note that 6: {8} is incorrect and was just added to test if the entries are merged correctly
        my_vars.examples_covered_by_rule = {6: {8}}
        print("rule hashes", my_vars.unique_rules)
        print(correct_generalized_rule_hash)
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
            8: Data(rule_id=6, dist=0),  # Fake entry
        }
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set(),
        }
        initial_f1 = 0.66666
        k = 3
        neighbors, dists, _ = find_nearest_examples(
            df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
            label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
            lookup, min_max, classes)

        # Only example 4 changes: it was closest to rule 0, which is replaced by its duplicate, rule 6
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=6, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
            8: Data(rule_id=6, dist=0),
        }
        correct_f1 = 2 * 0.5 * 1 / 1.5
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set(),
        }

        self.assertIs(improved, True)
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too.
        # The fake entry of example 8 is checked as well; it's expected to remain unchanged.
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertTrue(updated_rules[5].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        # Duplicate rule was deleted so that the last rule now corresponds to the rule with ID 6
        self.assertEqual(len(rules) - 1, len(updated_rules))
        self.assertEqual(updated_rules[-1].name, 6)
    def test_evaluate_f1_update_confusion_matrix_not_updated(self):
        """Tests what happens if input has a numeric and a nominal feature and a rule that predicts an example is
        not updated as F1 score doesn't improve.

        After evaluating the new rule, the closest rule per example and the confusion matrix are expected to be
        the same as before the call.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        my_vars.minority_class = "apple"

        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
        ]
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        # Rule IDs 0-5 correspond to the positions in the list
        my_vars.all_rules = dict(enumerate(rules))
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4},
        }
        new_rule = pd.Series({"A": "low", "B": (0.5, 0.5), "C": (3, 3), "Class": "banana"}, name=4)
        correct_f1 = 2 * 1 * 0.5 / 1.5

        f1 = evaluate_f1_update_confusion_matrix(df, new_rule, class_col_name,
                                                 lookup, min_max, classes)

        # Same content as before the call because nothing is supposed to change
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        correct_conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4},
        }
        # F1 is a float, so compare with a tolerance instead of exact equality
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            correct_rule_id, correct_dist = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, correct_rule_id)
            self.assertLess(abs(dist - correct_dist), 0.001)
        self.assertEqual(my_vars.conf_matrix, correct_conf_matrix)
示例#11
0
 def test_add_tags_all_tags(self):
     """Add tags when using nominal and numeric features and assigning noisy, borderline and safe as tags.

     The expected result is the input dataset extended by the column TAG.
     """
     df = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     class_col_name = "Class"
     lookup = {
         "A": {
             'high': 2,
             'low': 4,
             CONDITIONAL: {
                 'high': Counter({'banana': 2}),
                 'low': Counter({'banana': 2, 'apple': 2}),
             },
         },
     }
     correct = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
         TAG: [BORDERLINE, BORDERLINE, SAFE, NOISY, NOISY, BORDERLINE],
     })
     classes = ["apple", "banana"]
     min_max = pd.DataFrame({"C": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
     k = 2
     rules = [
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                    "Class": "apple"}, name=0),
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                    "Class": "apple"}, name=1),
         pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                    "Class": "banana"}, name=2),
         pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                    "Class": "banana"}, name=3),
         pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                    "Class": "banana"}, name=4),
         pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                    "Class": "banana"}, name=5),
     ]
     # Rule IDs 0-5 correspond to the positions in the list
     my_vars.all_rules = dict(enumerate(rules))
     my_vars.closest_rule_per_example = {}
     my_vars.closest_examples_per_rule = {}
     my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
     # NOTE(review): other tests store a set of rule IDs per example here ({0: {0}, ...}) - confirm that plain
     # integers are intended
     my_vars.seed_example_rule = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
     # Note: examples_covered_by_rule implicitly includes the seeds of all rules
     my_vars.examples_covered_by_rule = {}
     tagged = add_tags(df, k, rules, class_col_name, lookup, min_max,
                       classes)
     self.assertTrue(tagged.equals(correct))
示例#12
0
 def test_add_tags_nan(self):
     """Add tags when using nominal and numeric features when all examples contain at least one NaN value.

     The expected result is the input dataset extended by the column TAG.
     """
     # np.nan instead of the alias np.NaN, which was removed in NumPy 2.0
     df = pd.DataFrame({
         "A": [np.nan, np.nan, "high", np.nan, "low", np.nan],
         "B": [np.nan, 1, np.nan, 1.5, np.nan, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     class_col_name = "Class"
     lookup = {
         "A": {
             'high': 1,
             'low': 1,
             CONDITIONAL: {
                 'high': Counter({'banana': 1}),
                 'low': Counter({'banana': 1}),
             },
         },
     }
     correct = pd.DataFrame({
         "A": [np.nan, np.nan, "high", np.nan, "low", np.nan],
         "B": [np.nan, 1, np.nan, 1.5, np.nan, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
         TAG: [BORDERLINE, NOISY, SAFE, SAFE, SAFE, SAFE],
     })
     classes = ["apple", "banana"]
     min_max = pd.DataFrame({"C": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
     rules = [
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                    "Class": "apple"}, name=0),
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                    "Class": "apple"}, name=1),
         pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                    "Class": "banana"}, name=2),
         pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                    "Class": "banana"}, name=3),
         pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                    "Class": "banana"}, name=4),
         pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                    "Class": "banana"}, name=5),
     ]
     k = 3
     # Rule IDs 0-5 correspond to the positions in the list
     my_vars.all_rules = dict(enumerate(rules))
     my_vars.closest_rule_per_example = {}
     my_vars.closest_examples_per_rule = {}
     my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
     # NOTE(review): other tests store a set of rule IDs per example here ({0: {0}, ...}) - confirm that plain
     # integers are intended
     my_vars.seed_example_rule = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
     # Note: examples_covered_by_rule implicitly includes the seeds of all rules
     my_vars.examples_covered_by_rule = {}
     tagged = add_tags(df, k, rules, class_col_name, lookup, min_max,
                       classes)
     # DataFrame.equals() is an exact comparison that treats NaNs in the same location as equal
     self.assertTrue(tagged.equals(correct))
示例#13
0
 def test_find_neighbors_numeric_nominal(self):
     """Tests what happens if input has a numeric and a nominal feature.

     The k nearest examples with the same class label as the rule are requested, regardless of whether they are
     already covered.
     """
     df = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     class_col_name = "Class"
     lookup = {
         "A": {
             'high': 2,
             'low': 4,
             my_vars.CONDITIONAL: {
                 'high': Counter({'banana': 2}),
                 'low': Counter({'banana': 2, 'apple': 2}),
             },
         },
     }
     k = 4
     # Row positions of the expected neighbors (presumably ordered nearest-first); the first k of them are
     # expected. There are only 4 examples labeled "banana", so a larger k yields the same 4 rows.
     expected_positions = [5, 2, 3, 4]
     correct = df.iloc[expected_positions[:k]]
     rule = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "Class": "banana"})
     classes = ["apple", "banana"]
     min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
     # Reset as other tests changed the content of the dictionary
     my_vars.closest_rule_per_example = {}
     neighbors, _, _ = find_nearest_examples(
         df, k, rule, class_col_name, lookup, min_max, classes,
         label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=False)
     # Fail with a clear message instead of an AttributeError if no neighbors were found
     self.assertIsNotNone(neighbors)
     self.assertEqual(neighbors.shape[0], k)
     self.assertTrue(neighbors.equals(correct))
    def test_evaluate_f1_temporarily(self):
        """Tests that the global variables won't be updated despite local changes

        evaluate_f1_temporarily() is called with a replacement for rule 0. The statistics it returns must
        reflect the replacement, while the statistics stored in my_vars must still hold the values that were
        assigned before the call.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        my_vars.minority_class = "apple"
        # Reset as other tests change the data
        my_vars.examples_covered_by_rule = {}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}

        # One rule per example: (A, B, C, Class); numeric bounds collapse to the example's value
        rule_specs = [
            ("low", 1, 3, "apple"),
            ("low", 1, 2, "apple"),
            ("high", 4, 1, "banana"),
            ("low", 1.5, 0.5, "banana"),
            ("low", 0.5, 3, "banana"),
            ("high", 0.75, 2, "banana"),
        ]
        rules = [
            pd.Series(
                {
                    "A": a_val,
                    "B": Bounds(lower=b_val, upper=b_val),
                    "C": Bounds(lower=c_val, upper=c_val),
                    "Class": label,
                },
                name=rule_id)
            for rule_id, (a_val, b_val, c_val, label) in enumerate(rule_specs)
        ]
        my_vars.all_rules = dict(enumerate(rules))

        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
        # Snapshots of the global state used to verify that the call leaves it untouched
        correct_closest_rules = copy.deepcopy(my_vars.closest_rule_per_example)
        correct_closest_examples = copy.deepcopy(my_vars.closest_examples_per_rule)
        my_vars.conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set()
        }
        new_rule = pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=1.0),
                "C": Bounds(lower=3, upper=3),
                "Class": "banana"
            },
            name=0)
        correct_f1 = 0.8

        f1, conf_matrix, closest_rules, closest_examples, covered, updated_example_ids = \
            evaluate_f1_temporarily(df, new_rule, new_rule.name, class_col_name, lookup, min_max, classes)
        # Only example 4 changes: its distance to (the new) rule 0 drops to 0
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.0),
            5: Data(rule_id=2, dist=0.67015625)
        }
        correct_covered = {0: {4}}
        correct_updated_examples = [4]
        self.assertTrue(updated_example_ids == correct_updated_examples)
        # F1 is a float, so compare with a tolerance instead of exact equality
        self.assertAlmostEqual(f1, correct_f1)
        # Local result is still the same as in test_evaluate_f1_update_confusion_matrix.py
        for example_id in closest_rules:
            rule_id, dist = closest_rules[example_id]
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected[0])
            self.assertLess(abs(dist - expected[1]), 0.001)
        self.assertTrue(closest_examples == my_vars.closest_examples_per_rule)
        correct_conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3},
            my_vars.TN: {2, 4, 5},
            my_vars.FN: set()
        }
        self.assertTrue(conf_matrix == correct_conf_matrix)
        # But now check that global variables remained unaffected by the changes
        correct_conf_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: {3, 4},
            my_vars.TN: {2, 5},
            my_vars.FN: set()
        }
        self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
        self.assertTrue(correct_closest_rules == my_vars.closest_rule_per_example)
        self.assertTrue(correct_closest_examples == my_vars.closest_examples_per_rule)
        self.assertTrue(correct_covered == covered)
示例#15
0
    def test_predict_covered(self):
        """Predict the class labels of covered examples

        predict_binary() is run on six unlabeled examples with a model of four rules; the predicted labels
        must match exactly and the prediction confidences must match up to np.allclose() tolerance.
        """
        classes = ["apple", "banana"]
        class_col_name = "Class"
        # The first class is treated as the minority class in this test
        my_vars.minority_class = classes[0]
        test_set = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["", "", "", "", "", ""]
        })
        # Rule 2 has no condition on the nominal feature "A"
        rules = {
            2: pd.Series({"B": Bounds(lower=1.25, upper=4.0), "C": Bounds(lower=0.5, upper=1.5),
                          "Class": "banana"}, name=2),
            6: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                          "Class": "banana"}, name=6),
            5: pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=4.0), "C": Bounds(lower=1.0, upper=2.5),
                          "Class": "banana"}, name=5),
            0: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                          "Class": "apple"}, name=0),
        }
        # Support per rule ID, keyed like the rules above
        model = {
            2: Support(minority=0.75, majority=0.25),
            6: Support(minority=0.2, majority=0.8),
            5: Support(minority=1.0, majority=0.0),
            0: Support(minority=0, majority=1)
        }

        # The two None arguments aren't used in this test
        df = predict_binary(model, test_set, rules, classes, class_col_name, None, None, for_multiclass=False)

        correct = pd.DataFrame({
            my_vars.PREDICTED_LABEL: ["banana", "banana", "apple", "banana", "banana", "apple"],
            my_vars.PREDICTION_CONFIDENCE: [0.9, 0.9, 0.875, 0.683333, 0.9, 1]
        })
        expected_labels = correct[my_vars.PREDICTED_LABEL].values
        predicted_labels = df[my_vars.PREDICTED_LABEL].values
        self.assertTrue(np.array_equal(expected_labels, predicted_labels))
        expected_confidence = correct[my_vars.PREDICTION_CONFIDENCE]
        predicted_confidence = df[my_vars.PREDICTION_CONFIDENCE]
        self.assertTrue(np.allclose(expected_confidence, predicted_confidence))
    def test_evaluate_f1_initialize_confusion_matrix(self):
        """Tests that evaluate_f1_initialize_confusion_matrix() returns the expected F1 and fills the global
        statistics (closest rule per example, closest examples per rule, confusion matrix) in my_vars"""
        df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                           "C": [3, 2, 1, .5, 3, 2],
                           "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        # One rule per example: (A, B, C, Class); numeric bounds collapse to the example's value
        rule_specs = [
            ("low", 1, 3, "apple"),
            ("low", 1, 2, "apple"),
            ("high", 4, 1, "banana"),
            ("low", 1.5, 0.5, "banana"),
            ("low", 0.5, 3, "banana"),
            ("high", 0.75, 2, "banana"),
        ]
        rules = [
            pd.Series({"A": a_val, "B": Bounds(lower=b_val, upper=b_val), "C": Bounds(lower=c_val, upper=c_val),
                       "Class": label}, name=rule_id)
            for rule_id, (a_val, b_val, c_val, label) in enumerate(rule_specs)
        ]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        my_vars.minority_class = "apple"
        # Reset as other tests changed the data
        my_vars.closest_rule_per_example = {}
        my_vars.closest_examples_per_rule = {}
        my_vars.all_rules = dict(enumerate(rules))

        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        # Note: examples_covered_by_rule implicitly includes the seeds of all rules
        my_vars.examples_covered_by_rule = {}

        # Written in the form 2 * a * b / (a + b) with a=1 and b=0.5
        correct_f1 = 2*1*0.5/1.5
        f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup, min_max, classes)
        correct_closest_rule_per_example = {
            0: (1, 0.010000000000000002),
            1: (0, 0.010000000000000002),
            2: (5, 0.67015625),
            3: (1, 0.038125),
            4: (0, 0.015625),
            5: (2, 0.67015625)}
        correct_closest_examples_per_rule = {1: {0, 3}, 0: {1, 4}, 5: {2}, 2: {5}}
        correct_conf_matrix = {'tp': {0, 1}, 'fp': {3, 4}, 'tn': {2, 5}, 'fn': set()}
        # F1 is a float, so compare with a tolerance instead of exact equality
        self.assertAlmostEqual(f1, correct_f1)
        self.assertTrue(correct_closest_examples_per_rule == my_vars.closest_examples_per_rule)
        for example_id in my_vars.closest_rule_per_example:
            rule_id, dist = my_vars.closest_rule_per_example[example_id]
            expected_rule_id, expected_dist = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected_rule_id)
            self.assertLess(abs(dist - expected_dist), 0.001)
        self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
示例#17
0
 def test_find_neighbors_too_few(self):
     """Test that warning is thrown if too few neighbors exist

     The dataset has only 2 examples while k=3 neighbors are requested, so find_nearest_examples() is
     expected to emit a UserWarning.
     """
     k = 3
     class_col_name = "Class"
     classes = ["apple", "banana"]
     dataset = pd.DataFrame({
         "A": [1, 2],
         "B": [1, 2],
         "C": [2, 2],
         "D": ["x", "y"],
         "Class": ["A", "B"]
     })
     rule = pd.Series({
         "A": (0.1, 1),
         "B": Bounds(lower=1, upper=1),
         "C": Bounds(lower=2, upper=2),
         "D": "x",
         "Class": "A"
     })
     min_max = pd.DataFrame({
         "A": {"min": 1, "max": 5},
         "B": {"min": 1, "max": 11},
         "C": {"min": 1, "max": 2},
     })
     conditional_counts = {
         'x': Counter({'A': 1}),
         'y': Counter({'B': 1}),
     }
     lookup = {
         "D": {
             'x': 1,
             'y': 1,
             my_vars.CONDITIONAL: conditional_counts,
         },
     }
     # Context-manager form: the test fails unless the call below triggers a UserWarning
     with self.assertWarns(UserWarning):
         find_nearest_examples(
             dataset,
             k,
             rule,
             class_col_name,
             lookup,
             min_max,
             classes,
             label_type=my_vars.SAME_LABEL_AS_RULE,
             only_uncovered_neighbors=False)
示例#18
0
    def test_delete_rule_statistics_collision(self):
        """Deletes a rule that shares its hash with other rules

        Rules 0 and 1 have identical conditions, rule 4 differs. Both duplicates are removed one after the
        other via delete_rule_statistics(); afterwards the global statistics in my_vars must only refer to
        rule 4 (plus the unrelated entries that were planted before the call).
        """
        extra_rule = pd.Series({"A": "high", "B": Bounds(lower=0.1, upper=1), "C": Bounds(lower=1, upper=2),
                                "Class": "apple"}, name=4)
        # Rule 1 is a duplicate of rule 0; each Series gets its own freshly built conditions
        rules = [extra_rule] + [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=duplicate_id)
            for duplicate_id in (0, 1)
        ]
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "apple", "apple", "apple", "apple"]
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 1,
                'low': 2,
                my_vars.CONDITIONAL: {
                    'high': Counter({'apple': 1}),
                    'low': Counter({'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 0.1, "max": 1},
            "C": {"min": 1, "max": 3},
        })
        my_vars.minority_class = "apple"
        # Register every rule under its hash and its ID
        my_vars.unique_rules = {}
        my_vars.all_rules = {}
        for current in rules:
            rule_key = compute_hashable_key(current)
            ids_with_same_hash = my_vars.unique_rules.setdefault(rule_key, set())
            ids_with_same_hash.add(current.name)
            my_vars.all_rules[current.name] = current
        print("hashes", my_vars.unique_rules)
        # Some random values
        my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
        my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
        my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=0, dist=3),
            3: Data(rule_id=0, dist=2),
            4: Data(rule_id=1, dist=0.13),
            5: Data(rule_id=76, dist=3)
        }
        my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}
        final_rules = {}
        # Delete entries for rules with IDs 1 and 0 (in that order, popped from the end) from all statistics
        for _ in range(2):
            removed_rule = rules.pop()
            delete_rule_statistics(df, removed_rule, rules, final_rules, class_col_name, lookup, min_max,
                                   classes)

        # extra_rule now also covers the 3 examples to which the 2 deleted rules were closest
        expectations = [
            (my_vars.seed_rule_example, {5: 0, 7: 4}),
            (my_vars.seed_example_rule, {4: {7}}),
            (my_vars.unique_rules, {compute_hashable_key(extra_rule): {4}}),
            (my_vars.all_rules, {4: extra_rule}),
            (my_vars.closest_examples_per_rule, {4: {8, 0, 3, 4}}),
            (my_vars.closest_rule_per_example, {
                5: Data(rule_id=76, dist=3),
                4: Data(rule_id=4, dist=0.25),
                0: Data(rule_id=4, dist=0.25),
                3: Data(rule_id=4, dist=0.371141975308642)
            }),
            (my_vars.examples_covered_by_rule, {2: {3}}),
        ]
        for actual, correct in expectations:
            self.assertTrue(actual == correct)
示例#19
0
 def test_bracid_stops(self):
     """Tests that the method stops

     bracid() is run on a small dataset with k=3 and every rule it returns is compared against the
     expected rule stored under the same ID.
     """
     class_col_name = "Class"
     classes = ["apple", "banana"]
     k = 3
     # Use majority class as minority to have multiple neighbors and see if the function works correctly
     minority_label = "banana"
     df = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
     })
     lookup = {
         "A": {
             'high': 2,
             'low': 4,
             my_vars.CONDITIONAL: {
                 'high': Counter({'banana': 2}),
                 'low': Counter({'banana': 2, 'apple': 2}),
             },
         },
     }
     min_max = pd.DataFrame({
         "B": {"min": 1, "max": 5},
         "C": {"min": 1, "max": 11},
     })
     # Expected rules by ID; rules 2 and 4 carry no condition on the nominal feature "A"
     correct_rules = {
         0: pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=3.0),
                       "Class": "apple"}, name=0),
         2: pd.Series({"B": Bounds(lower=1.25, upper=4.0), "C": Bounds(lower=0.5, upper=1.5),
                       "Class": "banana"}, name=2),
         3: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                       "Class": "banana"}, name=3),
         4: pd.Series({"B": Bounds(lower=0.5, upper=0.875), "C": Bounds(lower=2.0, upper=3.0),
                       "Class": "banana"}, name=4),
         5: pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=4.0), "C": Bounds(lower=1.0, upper=2.5),
                       "Class": "banana"}, name=5),
     }
     rules = bracid(df, k, class_col_name, lookup, min_max, classes, minority_label)
     # all() stops at the first mismatch, just like an explicit loop with break
     all_rules_are_equal = all(rules[rule_id].equals(correct_rules[rule_id]) for rule_id in rules)
     self.assertTrue(all_rules_are_equal)
示例#20
0
    def test_extend_rule_mixed(self):
        """Test that a rule containing nominal and numeric features is extended correctly

        extend_rule() is applied with k=3 to the rule whose conditions match example 0. The numeric bounds of
        "B" and "C" are expected to become (0.875, 1.25) and (1.75, 3.05), while the nominal condition on "A"
        and the class label stay unchanged.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3.1, 3.2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        my_vars.minority_class = "apple"
        # Only this rule is passed to extend_rule(), so no further rules are built
        rule = pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=3, upper=3),
                "Class": "apple"
            },
            name=0)

        k = 3
        # Reset from previous test to make sure they don't affect the outcomes of this test
        my_vars.closest_examples_per_rule = {}
        my_vars.closest_rule_per_example = {}
        my_vars.examples_covered_by_rule = {}
        extended_rule = extend_rule(df, k, rule, class_col_name, lookup, min_max, classes)
        correct_rule = pd.Series(
            {
                "A": "low",
                "B": (0.875, 1.25),
                "C": (1.75, 3.05),
                "Class": "apple"
            },
            name=0)
        self.assertTrue(extended_rule.equals(correct_rule))
示例#21
0
    def test_add_one_best_rule_no_update(self):
        """Tests that rule set is not updated when no generalized rule improves F1

        The initial F1 passed to add_one_best_rule() is set higher than any candidate can reach, so the call
        must report no improvement and leave the rule, the F1 score, the closest rule per example and the
        confusion matrix as they were.
        """
        def closest_rule_fixture():
            # Fresh dict per call so the expected values can't be affected by the code under test
            return {
                0: Data(rule_id=1, dist=0.010000000000000002),
                1: Data(rule_id=0, dist=0.010000000000000002),
                2: Data(rule_id=5, dist=0.67015625),
                3: Data(rule_id=1, dist=0.038125),
                4: Data(rule_id=0, dist=0.015625),
                5: Data(rule_id=2, dist=0.67015625)
            }

        def conf_matrix_fixture():
            return {
                my_vars.TP: {0, 1},
                my_vars.FP: set(),
                my_vars.TN: {2, 5},
                my_vars.FN: {3, 4}
            }

        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({'banana': 2}),
                    'low': Counter({'banana': 2, 'apple': 2}),
                },
            },
        }
        test_idx = -1
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        my_vars.minority_class = "apple"
        # (rule ID, A, B, C, Class); the current rule (ID 0) is always at the end of the list
        rule_specs = [
            (1, "low", 1, 2, "apple"),
            (2, "high", 4, 1, "banana"),
            (3, "low", 1.5, 0.5, "banana"),
            (4, "low", 0.5, 3, "banana"),
            (5, "high", 0.75, 2, "banana"),
            (0, "low", 1, 3, "apple"),
        ]
        rules = [
            pd.Series(
                {
                    "A": a_val,
                    "B": Bounds(lower=b_val, upper=b_val),
                    "C": Bounds(lower=c_val, upper=c_val),
                    "Class": label
                },
                name=rule_id)
            for rule_id, a_val, b_val, c_val, label in rule_specs
        ]
        my_vars.closest_rule_per_example = closest_rule_fixture()
        # NOTE(review): apart from ID 0 the keys don't match the names of the mapped rules (e.g. key 1 maps to
        # the rule named 2, key 5 to the rule named 1) - looks unintended, confirm before relying on it
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[1],
            2: rules[2],
            3: rules[3],
            4: rules[4],
            5: rules[0]
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.conf_matrix = conf_matrix_fixture()
        my_vars.examples_covered_by_rule = {}
        # F1 is actually 0.6666, but setting it to 0.8 makes it not update any rule
        initial_f1 = 0.8
        k = 3
        my_vars.unique_rules = {}
        for current in rules:
            my_vars.unique_rules.setdefault(compute_hashable_key(current), set()).add(current.name)

        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rules[test_idx],
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name, lookup, min_max, classes)

        self.assertIs(improved, False)
        correct_f1 = initial_f1
        self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
        # Closest rule per example must still hold the values assigned before the call
        correct_closest_rule_per_example = closest_rule_fixture()
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertTrue(rule_id == expected.rule_id and abs(dist - expected.dist) < 0.001)
        # The current rule must come back with its original bounds
        correct_generalized_rule = pd.Series(
            {
                "A": "low",
                "B": (1, 1),
                "C": (3, 3),
                "Class": "apple"
            }, name=0)
        print(rules[test_idx])
        print(correct_generalized_rule)
        print("updated")
        print(updated_rules)
        self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
        correct_confusion_matrix = conf_matrix_fixture()
        self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
示例#22
0
 def test_extend_rule_no_change(self):
     """Checks that extend_rule() hands back a mixed nominal/numeric rule unchanged.

     The seed rule of example 0 is passed to extend_rule() with k=3. The returned rule is expected to
     hold the same nominal value and the same degenerate bounds as the seed rule, i.e. nothing was
     generalized (the original author attributes this to the rule having no neighbors).
     """
     label_col = "Class"
     labels = ["apple", "banana"]
     examples = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1] * 6,
         "C": [3, 2, 3, 3, 3, 3],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     # Value frequencies of nominal feature "A", overall and per class label
     per_class_counts = {
         'high': Counter({'banana': 2}),
         'low': Counter({'banana': 2, 'apple': 2}),
     }
     lookup = {"A": {'high': 2, 'low': 4, my_vars.CONDITIONAL: per_class_counts}}
     min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})

     # One seed rule per row: (value of A, value of B, value of C, class label); the rule ID is the row position
     rule_table = [
         ("low", 1, 3, "apple"),
         ("low", 1, 2, "apple"),
         ("high", 4, 1, "banana"),
         ("low", 1.5, 0.5, "banana"),
         ("low", 0.5, 3, "banana"),
         ("high", 0.75, 2, "banana"),
     ]
     rules = [
         pd.Series(
             {
                 "A": nominal,
                 "B": Bounds(lower=b_val, upper=b_val),
                 "C": Bounds(lower=c_val, upper=c_val),
                 "Class": label
             },
             name=rule_id)
         for rule_id, (nominal, b_val, c_val, label) in enumerate(rule_table)
     ]

     # Global state that is set explicitly for this test
     my_vars.minority_class = "apple"
     my_vars.closest_examples_per_rule = {}
     my_vars.closest_rule_per_example = {}

     k = 3
     result = extend_rule(examples, k, rules[0], label_col, lookup, min_max, labels)

     expected = pd.Series({"A": "low", "B": (1, 1), "C": (3, 3), "Class": "apple"}, name=0)
     self.assertTrue(result.equals(expected))
示例#23
0
 def test_find_neighbors_numeric_nominal_covers(self):
     """Tests that the global statistics are updated for examples that a rule newly covers (dist = 0).

     find_nearest_examples() is called for a rule on "A" and "B" with k=4 and without filtering covered
     examples. Afterwards the test checks the returned neighbors as well as the contents of
     my_vars.examples_covered_by_rule, my_vars.closest_examples_per_rule and
     my_vars.closest_rule_per_example.
     """
     df = pd.DataFrame({
         "A": ["low", "low", "high", "high", "low", "high"],
         "B": [1, 1, 1, 1, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class":
         ["apple", "apple", "banana", "banana", "banana", "banana"]
     })
     class_col_name = "Class"
     lookup = \
         {
             "A":
                 {
                     'high': 2,
                     'low': 4,
                     my_vars.CONDITIONAL:
                         {
                             'high':
                                 Counter({
                                     'banana': 2
                                 }),
                             'low':
                                 Counter({
                                     'banana': 2,
                                     'apple': 2
                                 })
                         }
                 }
         }
     rules = [
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                    "Class": "apple"}, name=0),
         pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                    "Class": "apple"}, name=1),
         pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                    "Class": "banana"}, name=2),
         pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                    "Class": "banana"}, name=3),
         pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                    "Class": "banana"}, name=4),
         pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                    "Class": "banana"}, name=5)
     ]
     my_vars.all_rules = {
         0: rules[0],
         1: rules[1],
         2: rules[2],
         3: rules[3],
         4: rules[4],
         5: rules[5]
     }
     # State before the call: example ID -> (ID of closest rule, distance to it)
     my_vars.closest_rule_per_example = {
         0: (1, 0.010000000000000002),
         1: (0, 0.010000000000000002),
         2: (5, 0.67015625),
         3: (1, 0.038125),
         4: (0, 0.015625),
         5: (2, 0.67015625)
     }
     # State before the call: rule ID -> IDs of the examples for which it is the closest rule
     my_vars.closest_examples_per_rule = {
         0: {1, 4},
         1: {0, 3},
         2: {5},
         5: {2}
     }
     k = 4
     correct = df.iloc[[2, 3, 5, 4]]
     rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
     classes = ["apple", "banana"]
     my_vars.minority_class = "banana"
     min_max = pd.DataFrame({
         "A": {
             "min": 1,
             "max": 5
         },
         "B": {
             "min": 1,
             "max": 11
         }
     })
     # An example could be covered by multiple rules, so example 2 should be covered by rules 0 and 1 at the end
     my_vars.examples_covered_by_rule = {1: {2}}
     correct_covered = {0: {2, 3}, 1: {2}}
     correct_examples_per_rule = {0: {1, 2, 3, 4, 5}, 1: {0}}
     correct_closest_rule_per_example = {
         0: (1, 0.010000000000000002),
         1: (0, 0.010000000000000002),
         2: (0, 0.0),
         3: (0, 0.0),
         4: (0, 0.015625),
         5: (0, 0.0006250000000000001)
     }
     neighbors, _, _ = find_nearest_examples(
         df,
         k,
         rule,
         class_col_name,
         lookup,
         min_max,
         classes,
         label_type=my_vars.SAME_LABEL_AS_RULE,
         only_uncovered_neighbors=False)
     self.assertTrue(neighbors.equals(correct))
     # assertEqual (instead of assertTrue(a == b)) reports both dictionaries if they differ
     self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
     self.assertEqual(correct_examples_per_rule, my_vars.closest_examples_per_rule)
     for example_id, (rule_id,
                      dist) in correct_closest_rule_per_example.items():
         self.assertIn(example_id, my_vars.closest_rule_per_example)
         other_id, other_dist = my_vars.closest_rule_per_example[example_id]
         self.assertEqual(rule_id, other_id)
         # Distances are floats, so compare them with a tolerance
         self.assertLess(abs(dist - other_dist), 0.0001)
示例#24
0
 def test_find_neighbors_numeric_nominal_label_type(self):
     """Checks the neighbors returned for one rule on a nominal and a numeric feature under each label_type.

     find_nearest_examples() is invoked three times with k=3 and identical arguments except for
     label_type (all labels, same label as the rule, opposite label); each result is compared with the
     expected slice of the dataset.
     """
     label_col = "Class"
     labels = ["apple", "banana"]
     examples = pd.DataFrame({
         "A": ["low", "low", "high", "low", "low", "high"],
         "B": [1, 1, 4, 1.5, 0.5, 0.75],
         "C": [3, 2, 1, .5, 3, 2],
         "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
     })
     # Value frequencies of nominal feature "A", overall and per class label
     per_class_counts = {
         'high': Counter({'banana': 2}),
         'low': Counter({'banana': 2, 'apple': 2}),
     }
     lookup = {"A": {'high': 2, 'low': 4, my_vars.CONDITIONAL: per_class_counts}}
     k = 3

     # One seed rule per row: (value of A, value of B, value of C, class label); the rule ID is the row position
     rule_table = [
         ("low", 1, 3, "apple"),
         ("low", 1, 2, "apple"),
         ("high", 4, 1, "banana"),
         ("low", 1.5, 0.5, "banana"),
         ("low", 0.5, 3, "banana"),
         ("high", 0.75, 2, "banana"),
     ]
     rules = [
         pd.Series(
             {
                 "A": nominal,
                 "B": Bounds(lower=b_val, upper=b_val),
                 "C": Bounds(lower=c_val, upper=c_val),
                 "Class": label
             },
             name=rule_id)
         for rule_id, (nominal, b_val, c_val, label) in enumerate(rule_table)
     ]
     my_vars.all_rules = dict(enumerate(rules))
     my_vars.closest_rule_per_example = {}
     my_vars.closest_examples_per_rule = {}

     query_rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
     min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})

     # (label_type, expected neighbors) in the order in which the lookups are performed
     scenarios = [
         (my_vars.ALL_LABELS, examples.iloc[[5, 2, 0]]),
         (my_vars.SAME_LABEL_AS_RULE, examples.iloc[[5, 2, 3]]),
         (my_vars.OPPOSITE_LABEL_TO_RULE, examples.iloc[[0, 1]]),
     ]
     found = []
     for label_type, _ in scenarios:
         neighbors, _, _ = find_nearest_examples(
             examples, k, query_rule, label_col, lookup, min_max, labels,
             label_type=label_type, only_uncovered_neighbors=False)
         found.append(neighbors)

     # Print all results first, then verify them
     for neighbors in found:
         print(neighbors)
     for neighbors, (_, expected) in zip(found, scenarios):
         self.assertTrue(neighbors.equals(expected))
    def test_add_all_good_rules(self):
        """Tests that rule set is updated when a generalized rule improves F1.

        The confusion matrix and the closest rule per example are first initialized through
        evaluate_f1_initialize_confusion_matrix() and verified. Then add_all_good_rules() is run for the
        rule at the end of the rule list, and the test checks the returned F1 score, the number of rules,
        my_vars.latest_rule_id and the updated global statistics.
        """
        df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75],
                           "C": [3, 2, 1, .5, 3, 2],
                           "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
        class_col_name = "Class"
        lookup = \
            {
                "A":
                    {
                        'high': 2,
                        'low': 4,
                        my_vars.CONDITIONAL:
                            {
                                'high':
                                    Counter({
                                        'banana': 2
                                    }),
                                'low':
                                    Counter({
                                        'banana': 2,
                                        'apple': 2
                                    })
                            }
                    }
            }
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
        # Use majority class as minority to have multiple neighbors and see if the function works correctly
        my_vars.minority_class = "banana"
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple"},
                      name=0),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple"},
                      name=1),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2)  # Current rule to be tested is always at the end
        ]
        test_idx = -1
        my_vars.latest_rule_id = len(rules) - 1
        my_vars.examples_covered_by_rule = {}
        my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[test_idx], 3: rules[2], 4: rules[3], 5: rules[4]}
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        my_vars.unique_rules = {}
        for rule in rules:
            hash_val = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)

        initial_correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)}
        initial_f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup, min_max, classes)
        correct_confusion_matrix = {my_vars.TP: {2, 5}, my_vars.FP: set(), my_vars.TN: {0, 1}, my_vars.FN: {3, 4}}
        correct_rules = 8
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)

        # Make sure confusion matrix, closest rule per example are correct at the beginning
        for example_id in my_vars.closest_rule_per_example:
            rule_id, dist = my_vars.closest_rule_per_example[example_id]
            expected = initial_correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)

        correct_initial_f1 = 2 * 0.5 * 1 / 1.5
        # F1 scores are floats: compare with the project-wide tolerance instead of exact equality
        self.assertLess(abs(correct_initial_f1 - initial_f1), my_vars.PRECISION)
        k = 3
        neighbors, _, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_all_good_rules(df, neighbors, rules[test_idx], rules, initial_f1,
                                                         class_col_name, lookup, min_max, classes)
        self.assertIs(improved, True)
        print("f1", f1)
        correct_covered = {6: {0, 1, 2, 4, 5}, 7: {3}}
        correct_confusion_matrix = {my_vars.TP: {2, 3, 4, 5}, my_vars.FP: {0, 1}, my_vars.TN: set(), my_vars.FN: set()}
        correct_closest_rule_per_example = {
            0: Data(rule_id=6, dist=0.0),
            1: Data(rule_id=6, dist=0.0),
            2: Data(rule_id=6, dist=0.0),
            3: Data(rule_id=7, dist=0.0),
            4: Data(rule_id=6, dist=0.0),
            5: Data(rule_id=6, dist=0.0)
        }
        correct_f1 = 0.8
        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        for example_id in my_vars.closest_rule_per_example:
            rule_id, dist = my_vars.closest_rule_per_example[example_id]
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id)
            self.assertLess(abs(dist - expected.dist), 0.001)
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        # latest_rule_id must be 7 as 2 new rules (IDs 6 and 7) were added to the 6 initial rules (IDs 0-5)
        self.assertEqual(len(updated_rules), correct_rules)
        self.assertEqual(my_vars.latest_rule_id, correct_rules - 1)
        self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
示例#26
0
    def test_find_neighbors_numeric_nominal_covered(self):
        """Tests what happens if input has a numeric and a nominal feature and some examples are already covered
        by the rule.

        With only_uncovered_neighbors=True, the examples registered in my_vars.examples_covered_by_rule for
        the rule are expected to be missing from the returned neighbors.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class":
            ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = \
            {
                "A":
                    {
                        'high': 2,
                        'low': 4,
                        my_vars.CONDITIONAL:
                            {
                                'high':
                                    Counter({
                                        'banana': 2
                                    }),
                                'low':
                                    Counter({
                                        'banana': 2,
                                        'apple': 2
                                    })
                            }
                    }
            }
        k = 4
        my_vars.closest_rule_per_example = {}
        # According to the original author's note, the 4 nearest same-label examples would be 5, 2, 3, 4.
        # Examples at indices 2 and 4 are already covered by the rule, so don't return them as neighbors
        my_vars.examples_covered_by_rule = {0: {2, 4}}
        correct = df.iloc[[5, 3]]
        my_vars.all_rules = {}
        rule = pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=1, upper=1),
                "Class": "banana"
            },
            name=0)
        classes = ["apple", "banana"]
        min_max = pd.DataFrame({
            "A": {
                "min": 1,
                "max": 5
            },
            "B": {
                "min": 1,
                "max": 11
            }
        })

        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rule,
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        self.assertTrue(neighbors.equals(correct))
示例#27
0
    def test_find_neighbors_numeric_nominal_stats(self):
        """Tests that global statistics are updated accordingly.

        find_nearest_examples() is called with k=4 for a rule on "A" and "B" without filtering covered
        examples. Afterwards the test checks the returned neighbors as well as the contents of
        my_vars.examples_covered_by_rule, my_vars.closest_examples_per_rule and
        my_vars.closest_rule_per_example.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, .5, 3, 2],
            "Class":
            ["apple", "apple", "banana", "banana", "banana", "banana"]
        })
        class_col_name = "Class"
        lookup = \
            {
                "A":
                    {
                        'high': 2,
                        'low': 4,
                        my_vars.CONDITIONAL:
                            {
                                'high':
                                    Counter({
                                        'banana': 2
                                    }),
                                'low':
                                    Counter({
                                        'banana': 2,
                                        'apple': 2
                                    })
                            }
                    }
            }
        rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625)
        }
        # Reset because other tests added data, so if you only run this test it would work, but not if other
        # tests are run prior to that
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2}
        }
        rules = [
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=3, upper=3),
                    "Class": "apple"
                },
                name=0),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1, upper=1),
                    "C": Bounds(lower=2, upper=2),
                    "Class": "apple"
                },
                name=1),
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=4, upper=4),
                    "C": Bounds(lower=1, upper=1),
                    "Class": "banana"
                },
                name=2),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=1.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=0.5),
                    "Class": "banana"
                },
                name=3),
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=0.5, upper=0.5),
                    "C": Bounds(lower=3, upper=3),
                    "Class": "banana"
                },
                name=4),
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=0.75, upper=0.75),
                    "C": Bounds(lower=2, upper=2),
                    "Class": "banana"
                },
                name=5)
        ]
        my_vars.all_rules = {
            0: rules[0],
            1: rules[1],
            2: rules[2],
            3: rules[3],
            4: rules[4],
            5: rules[5]
        }
        k = 4
        correct = df.iloc[[5, 2, 3, 4]]

        classes = ["apple", "banana"]
        my_vars.minority_class = "banana"
        min_max = pd.DataFrame({
            "A": {
                "min": 1,
                "max": 5
            },
            "B": {
                "min": 1,
                "max": 11
            }
        })
        correct_covered = {}
        correct_examples_per_rule = {0: {1, 2, 4, 5}, 1: {0, 3}}
        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.010000000000000002),
            2: Data(rule_id=0, dist=0.09),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=0, dist=0.0006250000000000001)
        }
        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rule,
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=False)
        self.assertTrue(neighbors.equals(correct))
        # assertEqual (instead of assertTrue(a == b)) reports both dictionaries if they differ
        self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
        self.assertEqual(correct_examples_per_rule, my_vars.closest_examples_per_rule)
        for example_id, (rule_id,
                         dist) in correct_closest_rule_per_example.items():
            features = my_vars.all_rules[rule_id].size
            self.assertIn(example_id, my_vars.closest_rule_per_example)
            other_id, other_dist = my_vars.closest_rule_per_example[example_id]
            other_features = my_vars.all_rules[other_id].size
            self.assertEqual(rule_id, other_id)
            self.assertEqual(features, other_features)
            # Distances are floats, so compare them with a tolerance
            self.assertLess(abs(dist - other_dist), 0.0001)
示例#28
0
    def test_predict_uncovered(self):
        """Predict the class labels of uncovered examples with handling ties (2 rules are equally distant) for
        example 4, namely rules 0 and 6.

        Besides the predicted labels and confidences, the test checks that predict_binary() leaves
        my_vars.examples_covered_by_rule, my_vars.closest_examples_per_rule and
        my_vars.closest_rule_per_example empty.
        """
        # Assumptions: these are the data for the training set NOT for the test set
        lookup = \
            {
                "A":
                    {
                        'high': 2,
                        'low': 4,
                        my_vars.CONDITIONAL:
                            {
                                'high':
                                    Counter({
                                        'banana': 2
                                    }),
                                'low':
                                    Counter({
                                        'banana': 2,
                                        'apple': 2
                                    })
                            }
                    }
            }
        min_max = pd.DataFrame({
            "B": {
                "min": 1,
                "max": 5
            },
            "C": {
                "min": 1,
                "max": 11
            }
        })

        classes = ["apple", "banana"]
        test_set = pd.DataFrame({
            "A": ["low", "high", "high", "low", "low", "high"],
            "B": [4.1, 6.1, 5.4, 0.15, 0.05, 0.075],
            "C": [0.3, 4, 0.1, .4, 0.3, 5],
            "Class": ["", "", "", "", "", ""]
        })
        class_col_name = "Class"
        # The first entry of classes ("apple") is used as the minority class
        my_vars.minority_class = classes[0]
        my_vars.examples_covered_by_rule = {}
        my_vars.closest_examples_per_rule = {}
        my_vars.closest_rule_per_example = {}
        rules = {
            2:
            pd.Series(
                {
                    "B": Bounds(lower=1.25, upper=4.0),
                    "C": Bounds(lower=0.5, upper=1.5),
                    "Class": "banana"
                },
                name=2),
            6:
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=0.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=3.0),
                    "Class": "banana"
                },
                name=6),
            5:
            pd.Series(
                {
                    "A": "high",
                    "B": Bounds(lower=0.75, upper=4.0),
                    "C": Bounds(lower=1.0, upper=2.5),
                    "Class": "banana"
                },
                name=5),
            0:
            pd.Series(
                {
                    "A": "low",
                    "B": Bounds(lower=0.5, upper=1.5),
                    "C": Bounds(lower=0.5, upper=3.0),
                    "Class": "apple"
                },
                name=0),
        }
        my_vars.all_rules = rules
        # Support values per rule ID that are handed to predict_binary() as the model
        model = {
            2: Support(minority=0.75, majority=0.25),
            6: Support(minority=0.2, majority=0.8),
            5: Support(minority=1.0, majority=0.0),
            0: Support(minority=0, majority=1)
        }
        correct_covered = {}
        correct_examples_per_rule = {}
        correct_rule_per_example = {}

        df = predict_binary(model,
                            test_set,
                            rules,
                            classes,
                            class_col_name,
                            lookup,
                            min_max,
                            for_multiclass=False)
        correct = pd.DataFrame({
            my_vars.PREDICTED_LABEL:
            ["apple", "apple", "apple", "banana", "banana", "apple"],
            my_vars.PREDICTION_CONFIDENCE: [0.75, 1, 0.75, 0.9, 0.9, 1]
        })

        # Test that predictions didn't change internal statistics of the model
        self.assertEqual(correct_covered, my_vars.examples_covered_by_rule)
        self.assertEqual(correct_examples_per_rule, my_vars.closest_examples_per_rule)
        self.assertEqual(correct_rule_per_example, my_vars.closest_rule_per_example)

        self.assertTrue(
            np.array_equal(correct[my_vars.PREDICTED_LABEL].values,
                           df[my_vars.PREDICTED_LABEL].values))
        # Confidences are floats, hence the tolerance-based comparison
        self.assertTrue(
            np.allclose(correct[my_vars.PREDICTION_CONFIDENCE],
                        df[my_vars.PREDICTION_CONFIDENCE]))
示例#29
0
    def test_add_one_best_rule_update_stats(self):
        """Check that add_one_best_rule() updates the rule set and the shared model statistics.

        The rule named 0 (last element of ``rules``) is handed to add_one_best_rule() together with its
        nearest neighbors of the same label. The test expects that:

        * the call reports an improvement and returns the expected F1 score,
        * the last rule in the returned list equals the expected generalized rule,
        * my_vars.conf_matrix, my_vars.closest_rule_per_example and my_vars.closest_examples_per_rule
          reflect that example 1 is now closest to rule 0 instead of rule 1.
        """
        df = pd.DataFrame({
            "A": ["low", "low", "high", "low", "low", "high"],
            "B": [1, 1, 4, 1.5, 0.5, 0.75],
            "C": [3, 2, 1, 0.5, 3, 2],
            "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        })
        class_col_name = "Class"
        classes = ["apple", "banana"]
        # Value counts of the nominal feature "A": overall and per class label
        lookup = {
            "A": {
                "high": 2,
                "low": 4,
                my_vars.CONDITIONAL: {
                    "high": Counter({"banana": 2}),
                    "low": Counter({"banana": 2, "apple": 2}),
                },
            },
        }
        min_max = pd.DataFrame({
            "B": {"min": 1, "max": 5},
            "C": {"min": 1, "max": 11},
        })
        my_vars.minority_class = "apple"

        # Current rule is always at the end of the list, hence it is addressed via index -1
        test_idx = -1
        rules = [
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                       "Class": "apple"}, name=1),
            pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                       "Class": "banana"}, name=2),
            pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                       "Class": "banana"}, name=3),
            pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                       "Class": "banana"}, name=4),
            pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                       "Class": "banana"}, name=5),
            pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                       "Class": "apple"}, name=0),
        ]

        my_vars.closest_examples_per_rule = {
            0: {4},
            1: {0, 1, 3},  # Change compared to previous test case
            2: {5},
            5: {2},
        }
        my_vars.closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=1, dist=0.010000000000000002),  # Change compared to previous test case
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        # Reset because other tests change the data
        my_vars.examples_covered_by_rule = {}
        # NOTE(review): the keys below do not match the .name of the rules they point to (e.g. key 1 maps to
        # rules[1], whose name is 2, and key 5 maps to rules[5], which is the rule named 0). Left unchanged;
        # confirm whether keying by rule.name was intended.
        my_vars.all_rules = {
            0: rules[test_idx],
            1: rules[1],
            2: rules[2],
            3: rules[3],
            4: rules[4],
            5: rules[5],
        }
        my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
        my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
        # Register every rule under its hash so that rules with identical content share one entry
        my_vars.unique_rules = {}
        for rule in rules:
            rule_hash = compute_hashable_key(rule)
            my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)

        # The matrix consistent with the fixture would be
        # {TP: {0, 1}, FP: set(), TN: {2, 5}, FN: {3, 4}} (i.e. F1=0.66666). Example 1 is deliberately placed
        # into TN instead, so that the test can observe the matrix being changed by the call.
        my_vars.conf_matrix = {
            my_vars.TP: {0},
            my_vars.FP: set(),
            my_vars.TN: {1, 2, 5},
            my_vars.FN: {3, 4},
        }
        initial_f1 = 0.1
        k = 3
        # Only the neighbors are needed here; the remaining return values are ignored
        neighbors, _, _ = find_nearest_examples(
            df,
            k,
            rules[test_idx],
            class_col_name,
            lookup,
            min_max,
            classes,
            label_type=my_vars.SAME_LABEL_AS_RULE,
            only_uncovered_neighbors=True)
        improved, updated_rules, f1 = add_one_best_rule(
            df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
            lookup, min_max, classes)

        correct_closest_rule_per_example = {
            0: Data(rule_id=1, dist=0.010000000000000002),
            1: Data(rule_id=0, dist=0.0),
            2: Data(rule_id=5, dist=0.67015625),
            3: Data(rule_id=1, dist=0.038125),
            4: Data(rule_id=0, dist=0.015625),
            5: Data(rule_id=2, dist=0.67015625),
        }
        correct_closest_examples_per_rule = {
            0: {1, 4},
            1: {0, 3},
            2: {5},
            5: {2},
        }
        correct_generalized_rule = pd.Series(
            {"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=0)
        correct_confusion_matrix = {
            my_vars.TP: {0, 1},
            my_vars.FP: set(),
            my_vars.TN: {2, 5},
            my_vars.FN: {3, 4},
        }
        # F1 = 2 * recall * precision / (recall + precision) with recall 2/4 and precision 2/2,
        # as given by the expected confusion matrix above
        correct_f1 = 2 * 0.5 * 1 / 1.5

        self.assertLess(abs(correct_f1 - f1), my_vars.PRECISION)
        self.assertIs(improved, True)

        # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
        for example_id, (rule_id, dist) in my_vars.closest_rule_per_example.items():
            expected = correct_closest_rule_per_example[example_id]
            self.assertEqual(rule_id, expected.rule_id,
                             "wrong closest rule for example {}".format(example_id))
            self.assertLess(abs(dist - expected.dist), 0.001,
                            "wrong distance for example {}".format(example_id))
        self.assertTrue(
            updated_rules[test_idx].equals(correct_generalized_rule))
        self.assertEqual(my_vars.conf_matrix, correct_confusion_matrix)
        self.assertEqual(correct_closest_examples_per_rule,
                         my_vars.closest_examples_per_rule)