예제 #1
0
    def test_refine_empty_subgroup(self):
        """Refining a seed that covers no rows must return a single description."""
        frame = pd.DataFrame({'A': [1]})

        # The only value in column `A` is 1, so this condition matches 0 rows.
        empty_seed = description_factory(Condition('`A` == 2'), frame)

        refined = refine(frame, [], empty_seed)

        self.assertEqual(1, len(refined), 'No refinements added')
예제 #2
0
    def test_refine_duplicate_splits(self):
        """Duplicated split points must not produce duplicated inequalities."""
        frame = pd.DataFrame({'A': [0, 1, 1, 2]})
        seed = description_factory([], frame)

        # Author's note: with 3 equal-width bins the splits (positions 1 and 2)
        # are duplicated for this data.
        refined = refine(frame, [], seed)

        # `A >= 1` and `A <= 1` should each be present once, not twice.
        self.assertEqual(
            3, len(refined), 'Only contains unique inequalities')
예제 #3
0
    def test_refine_boolean(self):
        """A boolean column is refined into one condition per truth value.

        The expected total of 3 descriptions is asserted together with the
        presence of the `== 1` and `== 0` query strings.
        """
        dataset = pd.DataFrame({'A': [True, False]})

        descriptions = refine(dataset, [], description_factory([], dataset))
        queries = [d.to_querystring() for d in descriptions]

        # Expected value first, matching the argument order of the sibling tests.
        self.assertEqual(3, len(descriptions), 'Added 2 conditions')
        # assertIn reports the searched item and the container on failure,
        # unlike comparing a membership test against True.
        self.assertIn('`A` == 1', queries, 'Added condition equal to True')
        self.assertIn('`A` == 0', queries, 'Added condition equal to False')
예제 #4
0
    def test_refine_numeric(self):
        """A numeric column is refined into `<=` / `>=` conditions per split."""
        frame = pd.DataFrame({'A': [1, 2, 3, 4]})
        seed = description_factory([], frame)

        refined = refine(frame, [], seed)
        querystrings = [item.to_querystring() for item in refined]

        self.assertEqual(
            5, len(refined), 'Added 4 conditions (2 * (num_buckets - 1))')

        # Checked in the same order as before: both bounds at 2, then at 3.
        for expected in ('`A` <= 2', '`A` >= 2', '`A` <= 3', '`A` >= 3'):
            self.assertIn(expected, querystrings)
예제 #5
0
    def test_refine_nominal(self):
        """A nominal column yields an `==` and a `!=` condition per category.

        Three categories are expected to give 6 added conditions; the asserted
        total of 7 follows the counting used by the sibling tests, where the
        total is one more than the number of added conditions.
        """
        dataset = pd.DataFrame({'A': ['foo', 'bar', 'lex']})

        descriptions = refine(dataset, [], description_factory([], dataset))
        queries = [d.to_querystring() for d in descriptions]

        # The message previously claimed 4 conditions, contradicting the
        # asserted count (7 - 1 = 6 = 2g with g = 3 categories).
        self.assertEqual(7, len(descriptions), 'Added 6 conditions (2g)')

        # g(i) denotes the i-th category, numbered from 1 as in the messages.
        for index, category in enumerate(('foo', 'bar', 'lex'), start=1):
            self.assertIn(f"`A` == '{category}'", queries,
                          f'Added condition equal to g({index})')
            self.assertIn(f"`A` != '{category}'", queries,
                          f'Added condition not equal to g({index})')
예제 #6
0
    def test_refine_unsupported_type(self):
        """Refining a datetime column is expected to raise NotImplementedError."""
        # No refinement implemented for dates
        date_column = {'A': [datetime.now()]}
        frame = pd.DataFrame(date_column)

        # The seed is built inside the context manager so that an error raised
        # while constructing it is treated the same way as one from refine().
        with self.assertRaises(NotImplementedError):
            refine(frame, [], description_factory([], frame))
예제 #7
0
def beam_search(data, targets, quality_measure: QualityMeasure, options=None):
    """Run a level-wise beam search for subgroup descriptions.

    At every depth, each candidate description is refined, each refinement is
    scored with ``quality_measure`` and filtered through ``satisfies_all``.
    Surviving results are offered to the overall result set and to the beam
    for that depth; the beam's contents become the candidates for the next
    depth. Progress and settings are printed to stdout.

    Args:
        data: Dataset handed to ``get_initial_seed``, ``quality_measure.set_data``
            and ``refine``.
        targets: Forwarded unchanged to ``refine``.
        quality_measure: Object providing ``set_data(data)`` and
            ``calculate(description)``, the latter returning a
            ``(coverage, quality)`` pair.
        options: Optional settings forwarded to ``set_options``. When omitted,
            a fresh empty dict is used for every call.

    Returns:
        The contents of the result set as a list, in reversed iteration order
        (intended to be descending quality; this assumes ``MinPriorityQueue``
        iterates in ascending quality -- confirm against its implementation).
    """
    # A `{}` default would be one dict shared by every call; create a new one
    # per call so nothing done to it by set_options() can leak between calls.
    if options is None:
        options = {}

    timer = Timer()
    timer.start()  # Keep track of execution time for debugging

    set_options(options)

    candidate_queue = Queue(maxsize=0)
    candidate_queue.put(get_initial_seed(data))

    result_set = MinPriorityQueue(max_size=config.RESULT_SET_SIZE)

    # Print settings: every attribute of `config` that is not a dunder name.
    print('Settings:')
    for var in dir(config):
        if var.startswith('__'):
            continue
        print(f'  {var}={getattr(config, var)}')

    print('Setting-up quality measure...')
    quality_measure.set_data(data)

    print('Finding subgroups...')

    for depth in range(config.SEARCH_DEPTH):
        beam = MinPriorityQueue(max_size=config.BEAM_WIDTH)

        # Drain all candidates of the previous depth and refine each of them.
        while not candidate_queue.empty():
            seed = candidate_queue.get().description

            for description in refine(data, targets, seed):
                coverage, quality = quality_measure.calculate(description)

                # Check if the description satisfies the constraints
                if not satisfies_all(description=description,
                                     coverage=coverage,
                                     quality=quality):
                    continue  # Continue with next candidate description

                result = Result(quality=quality, description=description)

                # Check if the description is novel
                if not result_set.contains(result):
                    result_set.put(result)

                # The beam receives the result whether or not it was novel.
                beam.put(result)

        print(f'Best subgroups at depth {depth}:')
        while not beam.empty():
            # NOTE(review): the original author was unsure about get() here;
            # the reference pseudo code uses get_front_element().
            candidate = beam.get()

            rounded_quality = round(candidate.quality, 5)
            query = candidate.description.to_querystring()
            print(f'quality = {rounded_quality}, description = {query}')

            # Beam members seed the refinements of the next depth.
            candidate_queue.put(candidate)

    top_q = list(result_set)
    top_q.reverse()  # Sort by descending quality

    print('Done.')
    print(f'Finished in {round(timer.elapsed_time())} seconds.')

    return top_q