예제 #1
0
    def execute(self, task):
        """Explore refinements of the empty conjunction in best-first order.

        The frontier is a heap of ``(-optimistic_estimate, description)``
        pairs, so the description with the largest optimistic estimate is
        expanded next. The search stops once the best remaining estimate is
        not greater than ``ps.minimum_required_quality(result, task)``.

        :param task: search task; this method reads ``search_space``, ``qf``,
            ``data`` and ``depth`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the collected
            entries, sorted in descending order of their first element.
        """
        result = []
        # Estimates are stored negated because ``heappop`` yields the smallest
        # entry; the -inf seed therefore reads back as +inf below.
        frontier = [(float("-inf"), ps.Conjunction([]))]
        refiner = ps.StaticSpecializationOperator(task.search_space)
        task.qf.calculate_constant_statistics(task)

        while frontier:
            negated_bound, parent = heappop(frontier)
            bound = -negated_bound
            threshold = ps.minimum_required_quality(result, task)
            if not bound > threshold:
                break

            for child in refiner.refinements(parent):
                stats = task.qf.calculate_statistics(child, task.data)
                child_quality = task.qf.evaluate(child, stats)
                ps.add_if_required(result, child, child_quality, task)
                estimate = task.qf.optimistic_estimate(child, stats)

                # Only keep expanding descriptions that are below the depth
                # limit and whose estimate still reaches the current threshold.
                if len(child) < task.depth:
                    if estimate >= ps.minimum_required_quality(result, task):
                        heappush(frontier, (-estimate, child))

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
예제 #2
0
    def execute(self, task):
        """Prepare sorted data and selector masks, then run the internal search.

        The data is sorted by the target attributes in descending order. The
        values of the first target attribute and one ``covers`` result per
        selector are cached on ``self`` before ``search_internal`` is called.

        :param task: search task; this method reads ``qf``, ``data``,
            ``target`` and ``search_space`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the entries returned
            by ``search_internal``, sorted in descending order of their first
            element.
        """
        qf_is_supported = isinstance(task.qf, ps.StandardQFNumeric)
        if not qf_is_supported:
            warnings.warn(
                "BSD_numeric so far is only implemented for StandardQFNumeric")

        self.pop_size = len(task.data)
        ordered = task.data.sort_values(task.target.get_attributes(),
                                        ascending=False)

        # values of the first target attribute, in sorted-data order
        first_target_column = task.target.get_attributes()[0]
        self.target_values = ordered[first_target_column].to_numpy()

        task.qf.calculate_constant_statistics(task)
        self.evaluate = task.qf.evaluate

        # one coverage result per selector, computed on the sorted data
        self.bitsets = {
            selector: selector.covers(ordered)
            for selector in task.search_space
        }

        # start from a mask that selects every row of the sorted data
        everything = np.ones(len(ordered), dtype=bool)
        result = self.search_internal(task, [], task.search_space, [],
                                      everything)
        result.sort(key=lambda entry: entry[0], reverse=True)

        return ps.SubgroupDiscoveryResult(result, task)
예제 #3
0
    def execute(self, task):
        """Evaluate every selector combination of size 1 to ``task.depth``.

        When ``self.show_progress`` is set and ``tqdm`` can be imported, the
        iteration is wrapped in a progress bar whose total is the number of
        combinations; an ``ImportError`` during that setup is ignored and the
        search runs without a progress bar.

        :param task: search task; this method reads ``qf``, ``search_space``,
            ``depth`` and ``data`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the collected
            entries, sorted in descending order of their first element.
        """
        task.qf.calculate_constant_statistics(task)
        result = []

        sizes = range(1, task.depth + 1)
        candidates = chain.from_iterable(
            combinations(task.search_space, size) for size in sizes)

        if self.show_progress:
            try:
                from tqdm import tqdm

                def n_choose_k(n, k):
                    # factorial raises ValueError for a negative argument
                    # (k > n); there are no such combinations.
                    try:
                        return factorial(n) // factorial(k) // factorial(n - k)
                    except ValueError:
                        return 0

                expected = sum(
                    n_choose_k(len(task.search_space), size)
                    for size in range(1, task.depth + 1))
                candidates = tqdm(candidates, total=expected)
            except ImportError:
                pass

        for selector_tuple in candidates:
            subgroup = ps.Conjunction(selector_tuple)
            stats = task.qf.calculate_statistics(subgroup, task.data)
            score = task.qf.evaluate(subgroup, stats)
            ps.add_if_required(result, subgroup, score, task)

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
예제 #4
0
 def execute(self, task):
     """Run ``search_internal`` from the empty conjunction of a representation.

     The specialization operator is stored on ``self.operator`` and the
     search runs inside the context manager returned by
     ``self.apply_representation``. ``search_internal`` is handed the list
     that is sorted and returned afterwards.

     :param task: search task; this method reads ``search_space``, ``qf`` and
         ``data`` from it.
     :return: ``ps.SubgroupDiscoveryResult`` built from the collected entries,
         sorted in descending order of their first element.
     """
     self.operator = ps.StaticSpecializationOperator(task.search_space)
     task.qf.calculate_constant_statistics(task)
     found = []
     with self.apply_representation(task.data, task.search_space) as rep:
         root = rep.Conjunction([])
         self.search_internal(task, found, root)
     found.sort(key=lambda entry: entry[0], reverse=True)
     return ps.SubgroupDiscoveryResult(found, task)
예제 #5
0
    def execute(self, task):
        """Best-first search over disjunctions built by generalisation.

        The frontier starts with one single-selector disjunction per selector
        and is a heap of ``(-optimistic_estimate, description)`` pairs.
        Counters in ``self.refined`` and ``self.discarded`` are updated per
        description length, and the final entries plus the discard counters
        are printed to stdout.

        :param task: search task; this method reads ``search_space``, ``qf``,
            ``data`` and ``depth`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the collected
            entries, sorted in descending order of their first element.
        """
        result = []
        generaliser = ps.StaticGeneralizationOperator(task.search_space)
        # init the first level: every seed shares the priority -inf, which
        # reads back as +inf once negated in the loop below
        frontier = [(float("-inf"), ps.Disjunction([sel]))
                    for sel in task.search_space]
        task.qf.calculate_constant_statistics(task)

        while frontier:
            negated_priority, description = heappop(frontier)
            priority = -negated_priority
            if priority < ps.minimum_required_quality(result, task):
                break

            stats = task.qf.calculate_statistics(description, task.data)
            quality = task.qf.evaluate(description, stats)
            ps.add_if_required(result,
                               description,
                               quality,
                               task,
                               statistics=stats)

            threshold = ps.minimum_required_quality(result, task)

            # When this exact pair is present in the result, rebuild the
            # frontier keeping only entries whose priority beats the threshold.
            if (quality, description) in result:
                survivors = []
                for kept_priority, kept_description in frontier:
                    if (-kept_priority) > threshold:
                        heappush(survivors, (kept_priority, kept_description))
                frontier = survivors

            estimate = task.qf.optimistic_estimate(description, stats)
            size = len(description)

            # compute refinements and fill the queue
            if size < task.depth and (
                    estimate / self.alpha**(size + 1)
            ) >= ps.minimum_required_quality(result, task):
                self.refined[size] += 1
                for wider in generaliser.refinements(description):
                    heappush(frontier, (-estimate, wider))
            else:
                self.discarded[size] += 1

        result.sort(key=lambda entry: entry[0], reverse=True)
        for entry_quality, entry_description in result:
            print("{} {}".format(entry_quality, entry_description))
        print("discarded " + str(self.discarded))
        return ps.SubgroupDiscoveryResult(result, task)
예제 #6
0
    def execute(self, task):
        """Level-wise search that only extends promising candidates.

        Each level's candidates are evaluated by one of the
        ``get_next_level_candidates*`` methods. A candidate for the next level
        is kept only if every sub-combination of length ``depth`` is among the
        promising candidates of the current level.

        :param task: search task; this method reads ``qf``, ``data``,
            ``search_space`` and ``depth`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the collected
            entries, sorted in descending order of their first element.
        :raises RuntimeWarning: if ``task.qf`` is not a
            ``ps.BoundedInterestingnessMeasure``.
        """
        qf_is_bounded = isinstance(task.qf, ps.BoundedInterestingnessMeasure)
        if not qf_is_bounded:
            raise RuntimeWarning(
                "Quality function is unbounded, long runtime expected")

        task.qf.calculate_constant_statistics(task)

        with self.representation_type(task.data,
                                      task.search_space) as representation:
            combine_selectors = getattr(representation.__class__,
                                        self.combination_name)
            result = []
            # init the first level: one candidate per single selector
            next_level_candidates = [
                combine_selectors([sel]) for sel in task.search_space
            ]

            # level-wise search
            depth = 1
            while next_level_candidates:
                # check sgs from the last level
                if self.use_vectorization:
                    evaluate_level = self.get_next_level_candidates_vectorized
                else:
                    evaluate_level = self.get_next_level_candidates
                promising = evaluate_level(task, result,
                                           next_level_candidates)

                if depth == task.depth:
                    break

                if self.use_repruning:
                    promising = self.reprune_lower_levels(promising, depth)

                unpruned = self.next_level(promising)

                # keep only those selector tuples for which all subsets of
                # length depth (= candidate length - 1) are promising
                promising_keys = {tuple(p) for p in promising}
                survivors = []
                for selectors in unpruned:
                    subsets = combinations(selectors, depth)
                    if all((subset in promising_keys) for subset in subsets):
                        survivors.append(combine_selectors(selectors))
                next_level_candidates = survivors
                depth += 1

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
예제 #7
0
    def execute(self, task):
        """Run a beam search over conjunctions of selectors.

        Starting from the empty conjunction, every not-yet-visited description
        in the beam is extended by each selector it does not already contain.
        The extensions are offered to the beam through
        ``ps.add_if_required``. The loop ends when the beam stops changing or
        ``task.depth`` rounds have been performed.

        :param task: search task; this method reads ``result_set_size``,
            ``qf``, ``data``, ``search_space`` and ``depth`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the
            ``task.result_set_size`` beam entries with the largest first
            element, in descending order.
        :raises RuntimeError: if the beam width is smaller than
            ``task.result_set_size``.
        """
        # adapt beam width to the result set size if desired
        if self.beam_width_adaptive:
            self.beam_width = task.result_set_size

        # check if beam size is too small for result set
        if self.beam_width < task.result_set_size:
            raise RuntimeError(
                'Beam width in the beam search algorithm is smaller than the result set size!'
            )

        task.qf.calculate_constant_statistics(task)

        # init: the beam starts with the empty conjunction at quality 0
        beam = [(0, ps.Conjunction([]),
                 task.qf.calculate_statistics(slice(None), task.data))]
        last_beam = None

        depth = 0
        while beam != last_beam and depth < task.depth:
            # iterate over a snapshot, because add_if_required is handed the
            # live beam while we loop
            last_beam = beam.copy()
            for (_, last_sg, _) in last_beam:
                if not getattr(last_sg, 'visited', False):
                    # mark the description so it is expanded only once
                    setattr(last_sg, 'visited', True)
                    for sel in task.search_space:
                        # create a clone
                        new_selectors = list(last_sg.selectors)
                        if sel not in new_selectors:
                            new_selectors.append(sel)
                            sg = ps.Conjunction(new_selectors)
                            statistics = task.qf.calculate_statistics(
                                sg, task.data)
                            quality = task.qf.evaluate(sg, statistics)
                            ps.add_if_required(beam,
                                               sg,
                                               quality,
                                               task,
                                               check_for_duplicates=True,
                                               statistics=statistics)
            depth += 1

        # The storage order of the beam is decided by ps.add_if_required and
        # is not guaranteed to be best-first, so sort before truncating;
        # slicing first could drop the best entries when the beam holds more
        # than result_set_size items.
        ranked = sorted(beam, key=lambda x: x[0], reverse=True)
        result = ranked[:task.result_set_size]
        return ps.SubgroupDiscoveryResult(result, task)
예제 #8
0
    def execute(self, task):
        """Best-first search driven by a configurable specialization operator.

        The operator is built from ``task.data`` without its ``'target'``
        column and from the binning/specialization settings stored on
        ``self``. The frontier is a heap of ``(-quality, description)`` pairs;
        the evaluated quality itself is used as the expansion priority.

        :param task: search task; this method reads ``data``, ``target``,
            ``search_space``, ``qf`` and ``depth`` from it.
        :return: ``ps.SubgroupDiscoveryResult`` built from the collected
            entries, sorted in descending order of their first element.
        """
        result = []
        # Priorities are stored negated because ``heappop`` yields the
        # smallest entry; the -inf seed therefore reads back as +inf.
        frontier = [(float("-inf"), ps.Conjunction([]))]

        features = task.data.drop(['target'], axis=1)
        refiner = SpecializationOperator(
            data=features,
            n_bins=self.n_bins,
            max_features=self.max_features,
            intervals_only=self.intervals_only,
            binning=self.binning,
            specialization=self.specialization,
            search_space=task.search_space,
        )
        task.qf.calculate_constant_statistics(task.data, task.target)

        while frontier:
            negated_priority, parent = heappop(frontier)
            priority = -negated_priority
            threshold = ps.minimum_required_quality(result, task)
            if not priority > threshold:
                break

            for child in refiner.refinements(parent):
                score = task.qf.evaluate(child, task.target, task.data, None)
                ps.add_if_required(result, child, score, task)
                # descriptions at the depth limit are recorded but not expanded
                if len(child) < task.depth:
                    heappush(frontier, (-score, child))

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
예제 #9
0
 def execute(self, task, use_optimistic_estimates=True):
     """Run ``search_internal`` over the whole search space and rank its output.

     :param task: search task; this method reads ``qf`` and ``search_space``
         from it.
     :param use_optimistic_estimates: forwarded unchanged to
         ``search_internal`` (presumably toggles pruning; confirm there).
     :return: ``ps.SubgroupDiscoveryResult`` built from the entries returned
         by ``search_internal``, sorted in descending order of their first
         element.
     """
     task.qf.calculate_constant_statistics(task)
     # The two empty lists are passed positionally; their roles are defined
     # by ``search_internal`` and are not visible from here.
     found = self.search_internal(task, [], task.search_space, [],
                                  use_optimistic_estimates)
     found.sort(key=lambda entry: entry[0], reverse=True)
     return ps.SubgroupDiscoveryResult(found, task)