def test_simple(self):
        """Check generalization-aware statistics, result caching, and the final score."""
        task = task_dummy(self.df, ps.BinaryTarget('columnC', 1))
        plain_qf = ps.StandardQF(0)
        plain_qf.calculate_constant_statistics(task)

        self.ga_qf.calculate_constant_statistics(task)

        ga_stat = self.ga_qf.calculate_statistics(
            ps.Conjunction([self.A1, self.BA]), self.df)

        self.assertEqual(ga_stat.subgroup_stats,
                         ps.SimplePositivesQF.tpl(3, 2))
        self.assertEqual(ga_stat.generalisation_stats,
                         ps.SimplePositivesQF.tpl(5, 3))

        # Request the statistics again with a freshly built (but equal)
        # description: the cached result must be returned unchanged.
        ga_stat_again = self.ga_qf.calculate_statistics(
            ps.Conjunction([self.A1, self.BA]), self.df)
        self.assertEqual(ga_stat, ga_stat_again)

        score_first = self.ga_qf.evaluate(ps.Conjunction([self.A1, self.BA]),
                                          self.df)
        score_second = self.ga_qf.evaluate(ps.Conjunction([self.A1, self.BA]),
                                           self.df)

        # repeated evaluation must be stable, and match the known value
        self.assertEqual(score_first, score_second)
        self.assertAlmostEqual(score_first, 0.06666666666666)
    def test_DNF(self):
        """DNFs built via constructor, append_or and append_and must agree."""
        A1 = ps.EqualitySelector("A1", 1)
        A2 = ps.EqualitySelector("A2", 1, "AA")
        B1 = ps.EqualitySelector("B1", 1)
        B2 = ps.EqualitySelector("B2", "1")

        # or-appending a selector list equals constructing from that list
        dnf_a = ps.DNF()
        dnf_a.append_or([A1, A2])
        dnf_b = ps.DNF([A1, A2])
        self.assertTrue(dnf_a == dnf_b)

        # a conjunction can be passed whole, and-appended as a list,
        # or and-appended one selector at a time
        dnf_c = ps.DNF(ps.Conjunction([A1, A2]))
        dnf_d = ps.DNF()
        dnf_d.append_and([A1, A2])
        dnf_e = ps.DNF()
        dnf_e.append_and(A1)
        dnf_e.append_and(A2)
        self.assertTrue(dnf_c == dnf_d)
        self.assertTrue(dnf_d == dnf_e)

        dnf_f = ps.DNF([])
        dnf_f.append_and([B1, B2])
        dnf_g = ps.DNF([])
        dnf_g.append_and([A1, A2])
        dnf_g.append_or(ps.Conjunction([B1, B2]))

        self.df = pd.DataFrame.from_dict({
            "A1": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0],  #pylint: disable=attribute-defined-outside-init
            "A2": [0, 1, 1, 1, 2, 2, 2, 0, 0, 0],
            "B1": [0, 0, 0, 0, 1, 1, 1, 0, 1, 1],
            "B2": ["0", "0", "0", "0", "1", "1", "2", "0", "0", "1"]
        })
        # each DNF's cover over the dataframe, row by row
        self.check_dataframe_query(dnf_a, [1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
        self.check_dataframe_query(dnf_c, [0, 1, 1, 0, 0, 0, 0, 0, 0, 0])
        self.check_dataframe_query(dnf_f, [0, 0, 0, 0, 1, 1, 0, 0, 0, 1])
        self.check_dataframe_query(dnf_g, [0, 1, 1, 0, 1, 1, 0, 0, 0, 1])
# Example #3
 def setUp(self):
     """Build the expected top-10 result for the credit-data benchmark.

     Populates ``self.result`` / ``self.qualities`` with the expected
     subgroup descriptions and quality values, and ``self.task`` with the
     discovery task the algorithm under test is run on.
     """
     NS_checking = ps.EqualitySelector("checking_status", b"<0")
     NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
     NS_other_parties = ps.EqualitySelector("other_parties", b"none")
     NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
     NS_job = ps.EqualitySelector("job", b"skilled")
     # expected top-10 subgroup descriptions, best first
     self.result = [ps.Conjunction([NS_checking, NS_foreign_worker]),
                    ps.Conjunction([NS_checking]),
                    ps.Conjunction([NS_checking, NS_other_parties, NS_foreign_worker]),
                    ps.Conjunction([NS_checking, NS_other_parties]),
                    ps.Conjunction([NS_checking, NS_savings_status, NS_foreign_worker]),
                    ps.Conjunction([NS_checking, NS_savings_status]),
                    ps.Conjunction([NS_checking, NS_savings_status, NS_other_parties, NS_foreign_worker]),
                    ps.Conjunction([NS_checking, NS_job, NS_foreign_worker]),
                    ps.Conjunction([NS_checking, NS_savings_status, NS_other_parties]),
                    ps.Conjunction([NS_checking, NS_job]),
                    ]
     # expected qualities, aligned index-by-index with self.result
     self.qualities = [0.055299999999999995,
                       0.05280000000000001,
                       0.052300000000000006,
                       0.05059999999999999,
                       0.04959999999999999,
                       0.048299999999999996,
                       0.04660000000000001,
                       0.04550000000000001,
                       0.0452,
                       0.044399999999999995]
     data = get_credit_data()
     target = ps.BinaryTarget('class', b'bad')
     searchSpace = ps.create_nominal_selectors(data, ignore=['class'])
     self.task = ps.SubgroupDiscoveryTask(data, target, searchSpace, result_set_size=10, depth=5, qf=ps.StandardQF(1.0))
    def test_equality_expressions(self):
        """Equal boolean expressions must compare equal and hash equal."""
        sel_a1 = ps.EqualitySelector("A", 1)
        sel_a2 = ps.EqualitySelector("A", 2, "AA")
        sel_b1 = ps.EqualitySelector("B", 1)

        # two independently built disjunctions over the same selectors
        disj = ps.Disjunction([sel_a1, sel_a2])
        disj_copy = ps.Disjunction([sel_a1, sel_a2])
        self.assertTrue(disj == disj_copy)
        self.assertTrue(hash(disj) == hash(disj_copy))

        # or-appending must match direct construction
        disj_all = ps.Disjunction([sel_a1, sel_a2, sel_b1])
        disj_copy.append_or(sel_b1)
        self.assertTrue(disj_all == disj_copy)
        self.assertTrue(hash(disj_all) == hash(disj_copy))

        # same checks for conjunctions
        conj = ps.Conjunction([sel_a1, sel_a2])
        conj_copy = ps.Conjunction([sel_a1, sel_a2])
        self.assertTrue(conj == conj_copy)
        self.assertTrue(hash(conj) == hash(conj_copy))

        conj_all = ps.Conjunction([sel_a1, sel_a2, sel_b1])
        conj_copy.append_and(sel_b1)
        self.assertTrue(conj_all == conj_copy)
        self.assertTrue(hash(conj_all) == hash(conj_copy))

        # a conjunction never equals a disjunction over the same selectors
        self.assertFalse(conj == disj)
        self.assertFalse(hash(conj) == hash(disj))
    def test_CountTarget2(self):
        """GA score equals the plain score minus the empty-description baseline."""
        df = self.df
        self.ga_qf.calculate_constant_statistics(task_dummy(df, None))

        ga_score = self.ga_qf.evaluate(ps.Conjunction([self.A1, self.BA]), df)

        plain_score = self.qf.evaluate(ps.Conjunction([self.A1, self.BA]), df)
        baseline = self.qf.evaluate(ps.Conjunction([]), df)

        self.assertEqual(ga_score, plain_score - baseline)
# Example #6
    def test_CountTarget1(self):
        """GA score is plain score minus baseline, and repeat evaluation is stable."""
        df = self.df
        target = ps.FITarget()
        self.ga_qf.calculate_constant_statistics(df, target)

        ga_score = self.ga_qf.evaluate(ps.Conjunction([self.A1]), target, df)

        plain_score = self.qf.evaluate(ps.Conjunction([self.A1]), target, df)
        baseline = self.qf.evaluate(ps.Conjunction([]), target, df)
        self.assertEqual(ga_score, plain_score - baseline)

        # evaluating an equal, freshly built description again must yield
        # the same score
        repeat_score = self.ga_qf.evaluate(ps.Conjunction([self.A1]), target, df)
        self.assertEqual(repeat_score, ga_score)
# Example #7
    def execute(self, task):
        """Exhaustively evaluate every conjunction up to ``task.depth``.

        Enumerates all selector combinations of size 1..depth, scores each
        with the task's quality function and returns the best subgroups.
        """
        task.qf.calculate_constant_statistics(task)
        result = []
        candidates = chain.from_iterable(
            combinations(task.search_space, size)
            for size in range(1, task.depth + 1))
        if self.show_progress:
            # tqdm is optional; fall back to a plain iterator if unavailable
            try:
                from tqdm import tqdm

                def n_choose_k(n, k):
                    # number of k-element subsets; 0 when k is out of range
                    try:
                        return factorial(n) // factorial(k) // factorial(n - k)
                    except ValueError:
                        return 0

                n_candidates = sum(
                    n_choose_k(len(task.search_space), size)
                    for size in range(1, task.depth + 1))
                candidates = tqdm(candidates, total=n_candidates)
            except ImportError:
                pass
        for selectors in candidates:
            sg = ps.Conjunction(selectors)
            statistics = task.qf.calculate_statistics(sg, task.data)
            ps.add_if_required(result, sg, task.qf.evaluate(sg, statistics),
                               task)
        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
# Example #8
    def search_internal(self, task, prefix, modification_set, result, bitset):
        """Depth-first search over specializations of ``prefix``.

        ``bitset`` is the boolean cover of the current prefix;
        ``modification_set`` holds the selectors still available for
        extending it.  ``result`` is updated in place and also returned.
        """
        self.num_calls += 1
        sg_size = bitset.sum()
        if sg_size == 0:
            # empty cover: no specialization can be non-empty either
            return result
        target_values_sg = self.target_values[bitset]

        # running means over all cover-prefix sizes 1..sg_size in one pass
        target_values_cs = np.cumsum(target_values_sg)
        sizes = np.arange(1, len(target_values_cs) + 1)
        mean_values_cs = target_values_cs / sizes
        tpl = DFSNumeric.tpl(sizes, mean_values_cs)
        qualities = self.evaluate(None, tpl)
        # best quality over all prefixes, used as an optimistic estimate
        # for pruning this branch
        optimistic_estimate = np.max(qualities)

        if optimistic_estimate <= ps.minimum_required_quality(result, task):
            return result

        sg = ps.Conjunction(copy.copy(prefix))

        # quality of the full subgroup is the last cumulative entry
        quality = qualities[-1]
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                new_bitset = bitset & self.bitsets[sel]
                # shrink the candidate set so each selector combination is
                # visited exactly once
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, new_bitset)
                # remove the sel again
                prefix.pop(-1)
        return result
# Example #9
    def search_internal(self, task, prefix, modification_set, result,
                        use_optimistic_estimates):
        """Recursive depth-first expansion of ``prefix``.

        ``modification_set`` holds the selectors still available for
        extending the prefix; ``result`` is updated in place and returned.
        When ``use_optimistic_estimates`` is set and the quality function is
        bounded, unpromising branches are pruned.
        """
        sg = ps.Conjunction(copy.copy(prefix))

        statistics = task.qf.calculate_statistics(sg, task.data)
        if use_optimistic_estimates and len(
                prefix) < task.depth and isinstance(
                    task.qf, ps.BoundedInterestingnessMeasure):
            optimistic_estimate = task.qf.optimistic_estimate(sg, statistics)
            if not (optimistic_estimate > ps.minimum_required_quality(
                    result, task)):
                # no specialization of this prefix can enter the result set
                return result
        quality = task.qf.evaluate(sg, statistics)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                # shrink the candidate set so each selector combination is
                # visited exactly once
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, use_optimistic_estimates)
                # remove the sel again
                prefix.pop(-1)
        return result
# Example #10
    def execute(self, task):
        """Best-first search: expand descriptions by decreasing optimistic estimate."""
        result = []
        # negated scores turn heapq's min-heap into a max-heap
        queue = [(float("-inf"), ps.Conjunction([]))]
        operator = ps.StaticSpecializationOperator(task.search_space)
        task.qf.calculate_constant_statistics(task)
        while queue:
            neg_quality, parent = heappop(queue)
            if not (-neg_quality > ps.minimum_required_quality(result, task)):
                # the best remaining estimate cannot improve the result set
                break
            for sg in operator.refinements(parent):
                statistics = task.qf.calculate_statistics(sg, task.data)
                ps.add_if_required(result, sg,
                                   task.qf.evaluate(sg, statistics), task)
                optimistic_estimate = task.qf.optimistic_estimate(sg,
                                                                  statistics)

                # only promising, not-yet-maximal descriptions are re-enqueued
                if (len(sg) < task.depth
                        and optimistic_estimate >=
                        ps.minimum_required_quality(result, task)):
                    heappush(queue, (-optimistic_estimate, sg))

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
# Example #11
    def execute(self, task):
        """Run beam search over conjunctive descriptions.

        Keeps a beam of the best candidates per level, specializes every
        beam member with each selector in the search space, and returns
        the top ``task.result_set_size`` subgroups found.

        Raises:
            RuntimeError: if the beam is narrower than the result set.
        """
        # adapt beam width to the result set size if desired
        if self.beam_width_adaptive:
            self.beam_width = task.result_set_size

        # check if beam size is too small for the result set
        if self.beam_width < task.result_set_size:
            raise RuntimeError(
                'Beam width in the beam search algorithm is smaller than the result set size!'
            )

        task.qf.calculate_constant_statistics(task)

        # init: start from the empty description, which covers everything
        beam = [(0, ps.Conjunction([]),
                 task.qf.calculate_statistics(slice(None), task.data))]
        last_beam = None

        depth = 0
        while beam != last_beam and depth < task.depth:
            last_beam = beam.copy()
            for (_, last_sg, _) in last_beam:
                # expand each description at most once across iterations
                if not getattr(last_sg, 'visited', False):
                    setattr(last_sg, 'visited', True)
                    for sel in task.search_space:
                        # create a clone extended by one selector
                        new_selectors = list(last_sg.selectors)
                        if sel not in new_selectors:
                            new_selectors.append(sel)
                            sg = ps.Conjunction(new_selectors)
                            statistics = task.qf.calculate_statistics(
                                sg, task.data)
                            quality = task.qf.evaluate(sg, statistics)
                            ps.add_if_required(beam,
                                               sg,
                                               quality,
                                               task,
                                               check_for_duplicates=True,
                                               statistics=statistics)
            depth += 1

        # Bug fix: sort by quality BEFORE truncating.  The beam may hold up
        # to beam_width (> result_set_size) entries in no particular order,
        # so slicing first could discard high-quality subgroups.
        beam.sort(key=lambda x: x[0], reverse=True)
        result = beam[:task.result_set_size]
        return ps.SubgroupDiscoveryResult(result, task)
# Example #12
def get_max_generalization_mean(data, subgroup, weighting_attribute=None):
    """Return the largest mean statistic over all generalizations of *subgroup*.

    Every subset of the subgroup's selectors (including the empty set) is
    turned into a subgroup and the maximum of their mean statistics
    (base-statistics index 3) is returned; the result is at least 0.
    """
    max_mean = 0
    for selector_subset in ps.powerset(subgroup.subgroup_description.selectors):
        generalization = ps.Subgroup(subgroup.target,
                                     ps.Conjunction(list(selector_subset)))
        subset_mean = generalization.get_base_statistics(
            data, weighting_attribute)[3]
        if subset_mean > max_mean:
            max_mean = subset_mean
    return max_mean
# Example #13
    def get_stats_and_previous_stats(self, subgroup, data):
        """Return (statistics of *subgroup*, best statistics among its direct generalizations)."""
        stats_subgroup = self.qf.calculate_statistics(subgroup, data)
        max_stats = self.stats0
        selectors = subgroup.selectors
        if len(selectors) > 0:
            # each generalization drops exactly one selector from the subgroup
            for parent_selectors in combinations(selectors, len(selectors) - 1):
                parent = ps.Conjunction(list(parent_selectors))
                stats_sg, stats_prev = self.calculate_statistics(parent, data)
                max_stats = self.get_max(max_stats, stats_sg, stats_prev)
        return (stats_subgroup, max_stats)
# Example #14
    def get_qual_and_previous_qual(self, subgroup, target, data):
        """Return (quality of *subgroup*, best quality among its direct generalizations)."""
        q_subgroup = self.qf.evaluate(subgroup, target, data)
        max_q = 0
        selectors = subgroup.selectors
        if len(selectors) > 0:
            # each generalization drops exactly one selector from the subgroup
            for parent_selectors in combinations(selectors, len(selectors) - 1):
                parent = ps.Conjunction(list(parent_selectors))
                q_sg, q_prev = self.calculate_statistics(parent, target, data)
                max_q = max(max_q, q_sg, q_prev)
        return (q_subgroup, max_q)
# Example #15
    def setUp(self):
        """Build the expected result set for the Titanic data with the AreaQF.

        ``self.qualities2`` recomputes each expected quality as
        cover-size * depth and is checked against the hard-coded values.
        """
        NS_cabin = ps.EqualitySelector("Cabin", np.nan)
        NS_embarked = ps.EqualitySelector("Embarked", 'S')
        NS_embarked2 = ps.EqualitySelector("Embarked", 'C')
        NS_male = ps.EqualitySelector("Sex", 'male')
        NS_female = ps.EqualitySelector("Sex", 'female')
        # expected top-10 subgroup descriptions, best first
        self.result = [
            ps.Conjunction([NS_cabin, NS_embarked]),
            ps.Conjunction([NS_cabin, NS_male]),
            ps.Conjunction([NS_embarked, NS_male]),
            ps.Conjunction([NS_cabin]),
            ps.Conjunction([NS_embarked]),
            ps.Conjunction([NS_male]),
            ps.Conjunction([NS_cabin, NS_female]),
            ps.Conjunction([NS_embarked, NS_female]),
            ps.Conjunction([NS_female]),
            ps.Conjunction([NS_cabin, NS_embarked2]),
        ]

        # expected qualities, aligned index-by-index with self.result
        self.qualities = [178, 164, 146, 125, 110, 100, 86, 74, 56, 46]

        data = get_titanic_data()
        # recompute each quality as cover-size * depth straight from the data
        self.qualities2 = [
            np.count_nonzero(conj.covers(data)) * conj.depth
            for conj in self.result
        ]
        # NOTE(review): asserting inside setUp aborts every test in the class
        # when the data changes — consider moving this check into a test.
        self.assertEqual(self.qualities, self.qualities2)
        searchSpace = ps.create_nominal_selectors(data)
        # NOTE(review): FITarget is passed as the class here, while other
        # fixtures instantiate it (ps.FITarget()) — confirm this is intended.
        self.task = ps.SubgroupDiscoveryTask(data,
                                             ps.FITarget,
                                             searchSpace,
                                             result_set_size=10,
                                             depth=2,
                                             qf=ps.AreaQF())
 def setUp(self):
     """Build the expected result set for credit data under a minimum-support constraint.

     Same benchmark family as the unconstrained credit-data fixture, but the
     task carries a ``MinSupportConstraint(200)``, which changes the
     expected top 10.
     """
     NS_checking = ps.EqualitySelector("checking_status", b"<0")
     NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
     NS_other_parties = ps.EqualitySelector("other_parties", b"none")
     NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
     NS_payment_plans = ps.EqualitySelector("other_payment_plans", b"none")
     # expected top-10 subgroup descriptions, best first
     self.result = [
         ps.Conjunction([NS_checking, NS_foreign_worker]),
         ps.Conjunction([NS_checking]),
         ps.Conjunction([NS_checking, NS_other_parties, NS_foreign_worker]),
         ps.Conjunction([NS_checking, NS_other_parties]),
         ps.Conjunction([NS_checking, NS_savings_status,
                         NS_foreign_worker]),
         ps.Conjunction([NS_checking, NS_savings_status]),
         ps.Conjunction([NS_checking, NS_foreign_worker, NS_payment_plans]),
         ps.Conjunction([NS_checking, NS_payment_plans]),
         ps.Conjunction([NS_foreign_worker, NS_savings_status]),
         ps.Conjunction(
             [NS_foreign_worker, NS_other_parties, NS_savings_status]),
     ]
     # expected qualities, aligned index-by-index with self.result
     self.qualities = [
         0.055299999999999995, 0.05280000000000001, 0.052300000000000006,
         0.05059999999999999, 0.04959999999999999, 0.048299999999999996,
         0.0426, 0.04, 0.03869999999999999, 0.03750000000000001
     ]
     data = get_credit_data()
     target = ps.BinaryTarget('class', b'bad')
     searchSpace = ps.create_nominal_selectors(data, ignore=['class'])
     self.task = ps.SubgroupDiscoveryTask(
         data,
         target,
         searchSpace,
         result_set_size=10,
         depth=5,
         qf=ps.StandardQF(1.0),
         constraints=[ps.MinSupportConstraint(200)])
# Example #17
 def calculate_quality_function_for_patterns(self, patterns,
                                             selectors_sorted, arrs):
     """Evaluate the quality function for each mined pattern.

     Returns a list of ``(quality, Conjunction)`` pairs, skipping patterns
     with an empty index set.
     """
     # NOTE(review): ``task`` is not defined in this method — it is neither a
     # parameter nor ``self.task`` — so the body raises NameError as written.
     # Confirm where the task object is supposed to come from.
     out = []
     for indices, gp_params in self.tqdm(
             patterns,
             'computing quality function',
     ):
         if len(indices) > 0:
             selectors = [selectors_sorted[i] for i in indices]
             sg = ps.Conjunction(selectors)
             if self.requires_cover_arr:
                 # NOTE(review): np.all over a list of cover arrays without
                 # ``axis=0`` collapses to a single scalar — verify that an
                 # element-wise AND (``axis=0``) is not intended here.
                 statistics = task.qf.gp_get_params(
                     np.all([arrs[i] for i in indices]), gp_params)
             else:
                 statistics = task.qf.gp_get_params(None, gp_params)
             qual2 = task.qf.evaluate(sg, statistics)
             out.append((qual2, sg))
     return out
# Example #18
    def execute(self, task):
        """Best-first search driven by a data-aware specialization operator."""
        result = []
        # negated scores turn heapq's min-heap into a max-heap
        queue = [(float("-inf"), ps.Conjunction([]))]

        operator = SpecializationOperator(
            data=task.data.drop(['target'], axis=1),
            n_bins=self.n_bins,
            max_features=self.max_features,
            intervals_only=self.intervals_only,
            binning=self.binning,
            specialization=self.specialization,
            search_space=task.search_space)
        task.qf.calculate_constant_statistics(task.data, task.target)
        while queue:
            neg_score, parent = heappop(queue)
            if not -neg_score > ps.minimum_required_quality(result, task):
                # nothing left on the queue can beat the current result set
                break
            for candidate in operator.refinements(parent):
                score = task.qf.evaluate(candidate, task.target, task.data,
                                         None)
                ps.add_if_required(result, candidate, score, task)
                if len(candidate) < task.depth:
                    heappush(queue, (-score, candidate))

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
# Example #19
    def setUp(self):
        """Build the expected top-12 result for credit data with nominal and numeric selectors."""
        NS_checking = ps.EqualitySelector("checking_status", b"<0")
        NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
        NS_other_parties = ps.EqualitySelector("other_parties", b"none")
        NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
        NS_job = ps.EqualitySelector("job", b"skilled")
        NS_dependents = ps.EqualitySelector("num_dependents", 1.0)
        # expected top-12 subgroup descriptions, best first
        self.result = [ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_other_parties, NS_savings_status]),  # AND job=='b'skilled'' AND other_parties=='b'none'' AND savings_status=='b'<100'
                       # 0.113713540226172:    checking_status=='b'<0'' AND foreign_worker=='b'yes'' AND job=='b'skilled'' AND savings_status=='b'<100''
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_savings_status]),
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_job]),  # checking_status=='b'<0'' AND foreign_worker=='b'yes'' AND job=='b'skilled''
                       # checking_status=='b'<0'' AND job=='b'skilled'' AND other_parties=='b'none'' AND savings_status=='b'<100''
                       ps.Conjunction([NS_checking, NS_job, NS_other_parties, NS_savings_status]),
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_other_parties]),
                       ps.Conjunction([NS_checking, NS_job, NS_savings_status]),
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_other_parties, NS_savings_status]),
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_other_parties]),
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_savings_status]),
                       ps.Conjunction([NS_checking, NS_foreign_worker]),
                       ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_dependents, NS_savings_status]),
                       ps.Conjunction([NS_checking, NS_job, NS_other_parties])]

        # expected qualities, aligned index-by-index with self.result
        self.qualities = [0.11457431093955019,
                          0.113713540226172,
                          0.11201325679119281,
                          0.1117538749727658,
                          0.11161046793076415,
                          0.11145710640046322,
                          0.11045259291161472,
                          0.10929088624672183,
                          0.10875519439407161,
                          0.10866138825404954,
                          0.10832735026213287,
                          0.10813405094128754]
        data = get_credit_data()
        target = ps.BinaryTarget('class', b'bad')
        # combine nominal and numeric selectors into one search space
        searchSpace_Nominal = ps.create_nominal_selectors(data, ignore=['class'])
        searchSpace_Numeric = ps.create_numeric_selectors(data, ignore=['class'])
        searchSpace = searchSpace_Nominal + searchSpace_Numeric
        self.task = ps.SubgroupDiscoveryTask(data, target, searchSpace, result_set_size=12, depth=5, qf=ps.StandardQF(0.5))