def test_priority_queue_contains(self):
    """Check that ``contains`` matches results by content, not by identity."""
    queue = MinPriorityQueue(max_size=1)
    # The descriptions are built against a frame because they need to be evaluated.
    frame = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})

    # Two independent objects for the same query, then one for a different query.
    stored_desc, twin_desc, other_desc = [
        description_factory(Condition(query), frame)
        for query in ('A == 1', 'A == 1', 'B == 2')
    ]
    # All three results share the same quality; only the description varies.
    stored, twin, other = [
        Result(quality=1, description=desc)
        for desc in (stored_desc, twin_desc, other_desc)
    ]

    queue.put(stored)

    # A distinct object with equal contents must be found ...
    self.assertTrue(queue.contains(twin),
                    'Compares hashed value instead of references')
    # ... while equal quality with a different description must not be.
    self.assertFalse(queue.contains(other),
                     'Hashes quality as well as description')
def test_priority_queue_equal_quality(self):
    """Check how the queue orders two results that share the same quality."""
    queue = MinPriorityQueue(max_size=2)
    # The descriptions are built against a frame because they need to be evaluated.
    frame = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})

    strong_desc = description_factory(Condition('A == 1'), frame)
    # 'B >= 1' is the weaker description: it covers more rows of the frame.
    weak_desc = description_factory(Condition('B >= 1'), frame)

    strong = Result(quality=1, description=strong_desc)
    weak = Result(quality=1, description=weak_desc)  # Same quality, other description

    for entry in (strong, weak):
        queue.put(entry)

    ordered = list(queue)
    # With equal quality, the weaker description is expected to come first.
    self.assertEqual(weak, ordered[0], "The description 'B >= 1' is weaker")
    self.assertEqual(strong, ordered[1], "The description 'A == 1' is stronger")
def test_refine_empty_subgroup(self):
    """Refining a seed that matches no rows must not add any refinement."""
    frame = pd.DataFrame({'A': [1]})
    # Column A only holds the value 1, so this condition matches 0 rows.
    empty_seed = description_factory(Condition('`A` == 2'), frame)

    refined = refine(frame, [], empty_seed)

    self.assertEqual(1, len(refined), 'No refinements added')
def test_refine_duplicate_splits(self):
    """Refinement must not emit the same inequality more than once."""
    # Author's note: with 3 equal-width bins the splits at position 1 and 2
    # coincide for this column, so duplicated splits would be produced.
    frame = pd.DataFrame({'A': [0, 1, 1, 2]})
    seed = description_factory([], frame)

    refined = refine(frame, [], seed)

    # `A >= 1` and `A <= 1` should each appear only once in the output.
    self.assertEqual(3, len(refined), 'Only contains unique inequalities')
def test_priority_queue_overflowing(self):
    """Putting three results into a queue of size two drops the lowest quality."""
    queue = MinPriorityQueue(max_size=2)

    low_desc, medium_desc, high_desc = [
        description_factory(Condition(label)) for label in ('L', 'M', 'H')
    ]
    low = Result(quality=1, description=low_desc)
    medium = Result(quality=2, description=medium_desc)
    high = Result(quality=3, description=high_desc)

    # One more item than the queue can hold.
    for entry in (low, medium, high):
        queue.put(entry)

    self.assertFalse(queue.contains(low), 'Removed low quality item')
    self.assertTrue(queue.contains(medium), 'Contains medium priority item')
    self.assertTrue(queue.contains(high), 'Contains high priority item')
def test_refine_boolean(self):
    """Refining a boolean column yields an equality condition per truth value.

    The result is expected to hold 3 descriptions; presumably that is the
    seed plus the two added conditions (TODO confirm against ``refine``).
    """
    dataset = pd.DataFrame({'A': [True, False]})
    descriptions = refine(dataset, [], description_factory([], dataset))
    queries = [d.to_querystring() for d in descriptions]

    # Expected value first, consistent with the other refine tests.
    self.assertEqual(3, len(descriptions), 'Added 2 conditions')
    # assertIn reports the searched container on failure, unlike comparing
    # the result of an `in` expression against True.
    self.assertIn('`A` == 1', queries, 'Added condition equal to True')
    self.assertIn('`A` == 0', queries, 'Added condition equal to False')
def test_priority_queue_list(self):
    """Converting the queue to a list yields results ordered low to high."""
    queue = MinPriorityQueue(max_size=3)

    desc_c, desc_b, desc_a = [
        description_factory(Condition(label)) for label in ('C', 'B', 'A')
    ]

    # Deliberately inserted in non-ascending quality order, so the expected
    # output order differs from the insertion order.
    for quality, desc in ((3, desc_c), (2, desc_b), (1, desc_a)):
        queue.put(Result(quality=quality, description=desc))

    ordered = list(queue)
    self.assertEqual(len(ordered), 3)

    # The list must be sorted from low to high quality, i.e. it is not a raw
    # dump of the queue's internal storage.
    for position, expected_query in enumerate(('A', 'B', 'C')):
        self.assertEqual(expected_query,
                         ordered[position].description.to_querystring())
def test_refine_numeric(self):
    """Refining a numeric column adds `<=` and `>=` conditions at each split."""
    frame = pd.DataFrame({'A': [1, 2, 3, 4]})
    seed = description_factory([], frame)

    refined = refine(frame, [], seed)
    queries = [item.to_querystring() for item in refined]

    self.assertEqual(5, len(refined),
                     'Added 4 conditions (2 * (num_buckets - 1))')
    # Both inequality directions are expected for the splits at 2 and 3.
    for expected_query in ('`A` <= 2', '`A` >= 2', '`A` <= 3', '`A` >= 3'):
        self.assertIn(expected_query, queries)
def test_refine_nominal(self):
    """Refining a nominal column adds `==` and `!=` conditions per value.

    With g = 3 distinct values, 2g = 6 conditions are expected to be added.
    The asserted total of 7 presumably includes the seed description
    (TODO confirm against ``refine``).
    """
    dataset = pd.DataFrame({'A': ['foo', 'bar', 'lex']})
    descriptions = refine(dataset, [], description_factory([], dataset))
    queries = [d.to_querystring() for d in descriptions]

    # The message previously claimed 4 added conditions, contradicting the
    # expected count for three nominal values.
    self.assertEqual(7, len(descriptions), 'Added 6 conditions (2g)')
    self.assertIn("`A` == 'foo'", queries, 'Added condition equal to g(1)')
    self.assertIn("`A` != 'foo'", queries, 'Added condition not equal to g(1)')
    self.assertIn("`A` == 'bar'", queries, 'Added condition equal to g(2)')
    self.assertIn("`A` != 'bar'", queries, 'Added condition not equal to g(2)')
    self.assertIn("`A` == 'lex'", queries, 'Added condition equal to g(3)')
    self.assertIn("`A` != 'lex'", queries, 'Added condition not equal to g(3)')
def test_calculate(self):
    """``LabelDistribution.calculate`` returns coverage and a non-zero quality."""
    frame = pd.DataFrame({'foo': ['a', 'a', 'b', 'c']})
    subgroup = description_factory(Condition("`foo` == 'b'"), frame)

    # Author's note on the label rankings:
    #   population: [a: 1, b: 2, c: 3]
    #   subgroup:   [a: 2, b: 1, c: 3]
    leaf_b = tree.Node('b')
    leaf_c = tree.Node('c')
    hierarchy = tree.Node('a', children=[leaf_b, leaf_c])

    def constant_gap(x, y):
        # The gap component is always 1, so all subgroups are exceptional.
        return 1

    measure = LabelDistribution(target='foo', tree=hierarchy,
                                gap_func=constant_gap)
    measure.set_data(frame)

    coverage, quality = measure.calculate(subgroup)

    self.assertEqual(1, coverage)
    # The subgroup has a different label ranking, so quality should be non-zero.
    self.assertNotEqual(0, quality)
def test_refine_unsupported_type(self):
    """Refining a column of an unsupported type raises ``NotImplementedError``."""
    # No refinement is implemented for dates.
    timestamp = datetime.now()
    frame = pd.DataFrame({'A': [timestamp]})

    with self.assertRaises(NotImplementedError):
        # The seed is built inside the context, exactly where the error is
        # allowed to surface.
        refine(frame, [], description_factory([], frame))
def get_initial_seed(data: pd.DataFrame) -> Result:
    """Build the starting ``Result`` for the given data.

    Args:
        data: Frame handed to ``description_factory`` when building the
            seed description.

    Returns:
        A ``Result`` with quality ``-1`` whose description is created from
        an empty list of conditions.
    """
    empty_description = description_factory(conditions=[], data=data)
    return Result(quality=-1, description=empty_description)