def test_profile_merge_with_different_options(self):
    """Merge two IntColumn profilers that were built with differing options.

    The first profiler has both ``max`` and ``min`` disabled; the second
    has only ``min`` disabled. Adding them is expected to emit a
    RuntimeWarning mentioning ``max``, leave ``max`` and ``min`` unset on
    the result, and still populate histogram selection, variance and sum.
    """
    # First profiler: max and min are both switched off.
    first_options = IntOptions()
    first_options.max.is_enabled = False
    first_options.min.is_enabled = False
    first_series = pd.Series([2, 4, 6, 8]).apply(str)
    profiler1 = IntColumn("Int", options=first_options)
    profiler1.update(first_series)
    # NOTE(review): match_count is zeroed before merging; the reason is
    # not evident from this test alone.
    profiler1.match_count = 0

    # Second profiler: only min is switched off, so max differs.
    second_options = IntOptions()
    second_options.min.is_enabled = False
    second_series = pd.Series([10, 15]).apply(str)
    profiler2 = IntColumn("Int", options=second_options)
    profiler2.update(second_series)

    # The option mismatch on max must be reported as a warning.
    expected_warning = ("max is disabled because it is not enabled in"
                        " both profiles.")
    with self.assertWarnsRegex(RuntimeWarning, expected_warning):
        merged = profiler1 + profiler2

    # Features enabled on both sides are still merged.
    merged_profile = merged.profile
    self.assertIsNotNone(merged.histogram_selection)
    self.assertIsNotNone(merged_profile['variance'])
    self.assertIsNotNone(merged.sum)

    # Features disabled on at least one side are not calculated.
    self.assertIsNone(merged.max)
    self.assertIsNone(merged.min)
def test_profile_merge_edge_case(self):
    """Exercise IntColumn merging on edge-case inputs.

    Covers a merge where the left profiler's ``match_count`` was zeroed,
    a merge of two profilers updated with empty series, and merges that
    involve small all-integer series.

    NOTE(review): another method with this exact name appears later in
    the visible source. If both live in the same class, the later
    definition replaces this one and this body never runs -- confirm,
    then rename or remove one of them.
    """
    data = [2.0, 12.5, 'not a float', 6.0, 'not a float']
    df = pd.Series(data).apply(str)
    profiler1 = IntColumn(name="Int")
    profiler1.update(df)
    profiler1.match_count = 0

    data2 = [10.0, 3.5, 'not a float', 15.0, 'not a float']
    df2 = pd.Series(data2).apply(str)
    profiler2 = IntColumn(name="Int")
    profiler2.update(df2)

    # With profiler1's match_count zeroed, the merged stddev is expected
    # to equal profiler2's stddev.
    profiler3 = profiler1 + profiler2
    self.assertEqual(profiler3.stddev, profiler2.stddev)

    # test merge with empty data
    df1 = pd.Series([], dtype=object)
    profiler1 = IntColumn("Int")
    profiler1.update(df1)

    df2 = pd.Series([], dtype=object)
    profiler2 = IntColumn("Int")
    profiler2.update(df2)

    profiler = profiler1 + profiler2
    # Identity checks against None, matching the histogram assertion.
    self.assertIsNone(profiler.min)
    self.assertIsNone(profiler.max)
    self.assertIsNone(profiler.histogram_selection)

    # Empty profiler merged with a populated one.
    df3 = pd.Series([2, 3]).apply(str)
    profiler3 = IntColumn("Int")
    profiler3.update(df3)

    profiler = profiler1 + profiler3
    self.assertEqual(profiler.min, 2)
    self.assertEqual(profiler.max, 3)

    # Two populated profilers: min/max span both inputs.
    df4 = pd.Series([4, 5]).apply(str)
    profiler4 = IntColumn("Int")
    profiler4.update(df4)

    profiler = profiler3 + profiler4
    self.assertEqual(profiler.min, 2)
    self.assertEqual(profiler.max, 5)
def test_profile_merge_no_bin_overlap(self):
    """Adding profilers that share no bin method must raise ValueError."""
    left_series = pd.Series([2, 'not an int', 6, 'not an int']).apply(str)
    left = IntColumn("Int")
    left.update(left_series)
    left.match_count = 0

    right_series = pd.Series(
        [10, 'not an int', 15, 'not an int']
    ).apply(str)
    right = IntColumn("Int")
    right.update(right_series)

    # Give each side a distinct bin method name so the two share none.
    left.histogram_bin_method_names = ['No overlap 1']
    right.histogram_bin_method_names = ['No overlap 2']

    expected_message = ('Profiles have no overlapping bin methods '
                        'and therefore cannot be added together.')
    with self.assertRaisesRegex(ValueError, expected_message):
        left + right
def test_profile_merge_edge_case(self):
    """Exercise IntColumn merging on edge-case inputs.

    Covers a merge where the left profiler's ``match_count`` was zeroed,
    merges involving profilers updated with empty series, and merges of
    small integer series, checking min/max, skewness, kurtosis,
    ``num_zeros`` and ``num_negatives`` on the results.
    """
    data = [2.0, 12.5, 'not a float', 6.0, 'not a float']
    df = pd.Series(data).apply(str)
    profiler1 = IntColumn(name="Int")
    profiler1.update(df)
    profiler1.match_count = 0

    data2 = [10.0, 3.5, 'not a float', 15.0, 'not a float']
    df2 = pd.Series(data2).apply(str)
    profiler2 = IntColumn(name="Int")
    profiler2.update(df2)

    # With profiler1's match_count zeroed, the merged stddev is expected
    # to equal profiler2's stddev.
    profiler3 = profiler1 + profiler2
    self.assertEqual(profiler3.stddev, profiler2.stddev)

    # test merge with empty data
    df1 = pd.Series([], dtype=object)
    profiler1 = IntColumn("Int")
    profiler1.update(df1)

    df2 = pd.Series([], dtype=object)
    profiler2 = IntColumn("Int")
    profiler2.update(df2)

    profiler = profiler1 + profiler2
    # Identity checks against None, matching the histogram assertion.
    self.assertIsNone(profiler.min)
    self.assertIsNone(profiler.max)
    self.assertTrue(np.isnan(profiler.skewness))
    self.assertTrue(np.isnan(profiler.kurtosis))
    self.assertIsNone(profiler.histogram_selection)

    # Empty profiler merged with a two-value profiler.
    df3 = pd.Series([2, 3]).apply(str)
    profiler3 = IntColumn("Int")
    profiler3.update(df3)

    profiler = profiler1 + profiler3
    self.assertEqual(profiler.min, 2)
    self.assertEqual(profiler.max, 3)
    self.assertTrue(np.isnan(profiler.skewness))
    self.assertTrue(np.isnan(profiler.kurtosis))
    self.assertEqual(profiler.num_zeros, 0)
    self.assertEqual(profiler.num_negatives, 0)

    # Two populated profilers: combined values are 2, 3, 4, 5.
    df4 = pd.Series([4, 5]).apply(str)
    profiler4 = IntColumn("Int")
    profiler4.update(df4)

    profiler = profiler3 + profiler4
    self.assertEqual(profiler.min, 2)
    self.assertEqual(profiler.max, 5)
    # Skewness is a computed float; compare with tolerance like the
    # kurtosis check below rather than requiring bit-exact zero.
    self.assertAlmostEqual(profiler.skewness, 0)
    self.assertAlmostEqual(profiler.kurtosis, -1.2)
    self.assertEqual(profiler.num_zeros, 0)
    self.assertEqual(profiler.num_negatives, 0)

    # Merge with data containing zeros and a negative value.
    df5 = pd.Series([0, 0, -1]).apply(str)
    profiler5 = IntColumn("Int")
    profiler5.update(df5)

    profiler = profiler4 + profiler5
    self.assertEqual(profiler.min, -1)
    self.assertEqual(profiler.max, 5)
    self.assertEqual(profiler.num_zeros, 2)
    self.assertEqual(profiler.num_negatives, 1)