def test_domain_basic_stats(self):
    """DomainBasicStats must agree with per-variable BasicStats.

    Without ``include_metas`` the stats are expected to cover the domain's
    attributes followed by its class variables; with ``include_metas=True``
    the meta variables are expected to be appended after those.
    """
    domain = self.zoo.domain

    def per_variable_stats(variables):
        # One BasicStats per variable, computed independently on the table.
        return [BasicStats(self.zoo, var) for var in variables]

    attr_stats = per_variable_stats(domain.attributes)
    class_var_stats = per_variable_stats(domain.class_vars)
    meta_stats = per_variable_stats(domain.metas)

    # Default: metas are left out.
    without_metas = DomainBasicStats(self.zoo)
    self.assertStatsEqual(
        without_metas.stats,
        attr_stats + class_var_stats,
    )

    # Opt-in: metas follow attributes and class variables.
    with_metas = DomainBasicStats(self.zoo, include_metas=True)
    self.assertStatsEqual(
        with_metas.stats,
        attr_stats + class_var_stats + meta_stats,
    )
def test_speed(self):
    """Computing BasicStats for many columns must finish within 10 seconds.

    Builds a table from an ``n x m`` array of uniform random numbers
    (n = 10, m = 10000) and computes ``BasicStats(data, i)`` for every
    index ``i`` in ``range(m)``.
    """
    n, m = 10, 10000
    data = Table.from_numpy(None, np.random.rand(n, m))

    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    for i in range(m):
        BasicStats(data, i)
    elapsed = time.perf_counter() - start

    self.assertLess(elapsed, 10.0)
def __call__(self, data, attribute, fixed=None):
    """Create an equal-width discretized variable for ``attribute``.

    The range to split is chosen as follows:

    * if ``fixed`` is truthy, the ``(min, max)`` pair stored under
      ``fixed[attribute.name]`` is used;
    * otherwise, for an ``SqlTable`` the range comes from
      ``BasicStats(data, attribute)``;
    * otherwise the NaN-aware minimum and maximum of the column values
      are used (``X`` of the single-column slice, or ``Y`` when ``X`` is
      empty).

    Args:
        data: table providing ``domain`` and, for non-SQL data, slicing
            as ``data[:, attribute]`` with ``X`` / ``Y`` arrays.
        attribute: variable to discretize; must have a ``name`` when
            ``fixed`` is used and be a valid key for ``data.domain``.
        fixed: optional mapping from attribute name to ``(min, max)``.

    Returns:
        The result of ``Discretizer.create_discretized_var`` for
        ``data.domain[attribute]`` and the computed cut points.
    """
    if fixed:
        low, high = fixed[attribute.name]
        points = self._split_eq_width(low, high)
    elif isinstance(data, SqlTable):
        # Use SQL-side statistics rather than slicing out the column.
        stats = BasicStats(data, attribute)
        points = self._split_eq_width(stats.min, stats.max)
    else:
        column = data[:, attribute]
        # The attribute's values live in X unless X is empty, in which
        # case they are taken from Y.
        values = column.X if column.X.size else column.Y
        if values.size:
            low, high = ut.nanmin(values), ut.nanmax(values)
            points = self._split_eq_width(low, high)
        else:
            # No values at all: there is no range to split, so produce no
            # cut points instead of reducing over an empty array.
            points = []
    return Discretizer.create_discretized_var(data.domain[attribute], points)
def __call__(self, data: Table, attribute, fixed=None):
    """Create an equal-width discretized variable for ``attribute``.

    The range to split is chosen as follows:

    * if ``fixed`` is truthy, the ``(min, max)`` pair stored under
      ``fixed[attribute.name]`` is used;
    * otherwise, for an ``SqlTable`` (or a subclass of it) the range comes
      from ``BasicStats(data, attribute)``;
    * otherwise the NaN-aware minimum and maximum of the column returned
      by ``data.get_column_view(attribute)`` are used; an empty column
      yields no cut points.

    Args:
        data: table providing ``domain`` and ``get_column_view``.
        attribute: variable to discretize; must have a ``name`` when
            ``fixed`` is used and be a valid key for ``data.domain``.
        fixed: optional mapping from attribute name to ``(min, max)``.

    Returns:
        The result of ``Discretizer.create_discretized_var`` for
        ``data.domain[attribute]`` and the computed cut points.
    """
    if fixed:
        mn, mx = fixed[attribute.name]
        points = self._split_eq_width(mn, mx)
    elif isinstance(data, SqlTable):
        # isinstance (not an exact type match) so that SqlTable subclasses
        # also use SQL-side statistics.
        stats = BasicStats(data, attribute)
        points = self._split_eq_width(stats.min, stats.max)
    else:
        # get_column_view returns a pair; only the values are needed here.
        values, _ = data.get_column_view(attribute)
        if values.size:
            mn, mx = ut.nanmin(values), ut.nanmax(values)
            points = self._split_eq_width(mn, mx)
        else:
            # Empty column: nothing to split.
            points = []
    return Discretizer.create_discretized_var(data.domain[attribute], points)
def test_basic_stats(self):
    """BasicStats and DomainBasicStats report correct values on an SqlTable.

    The 'sepal length' column of iris is checked twice: once through a
    standalone BasicStats and once through the entry looked up in
    DomainBasicStats (built with metas included).
    """
    iris = SqlTable(self.conn, self.iris, inspect_values=True)

    def check_sepal_length(column_stats):
        # Expected summary of iris 'sepal length': 150 values, none missing.
        self.assertAlmostEqual(column_stats.min, 4.3)
        self.assertAlmostEqual(column_stats.max, 7.9)
        self.assertAlmostEqual(column_stats.mean, 5.8, places=1)
        self.assertEqual(column_stats.nans, 0)
        self.assertEqual(column_stats.non_nans, 150)

    check_sepal_length(BasicStats(iris, iris.domain['sepal length']))

    domain_stats = DomainBasicStats(iris, include_metas=True)
    # One stats entry per domain variable plus one per meta.
    self.assertEqual(
        len(domain_stats.stats),
        len(iris.domain) + len(iris.domain.metas),
    )
    check_sepal_length(domain_stats['sepal length'])
def test_basic_stats_on_large_data(self):
    """BasicStats stays exact when iris is handled as a large table.

    Intended to run with LARGE_TABLE set to 100, so that iris is treated
    as a large table and sampling is used. Because the table is actually
    small, time-based sampling should return all rows, which is why exact
    values can be asserted.

    NOTE(review): the LARGE_TABLE override is not visible in this method;
    presumably it is applied by a decorator or fixture - confirm.
    """
    iris = SqlTable(self.conn, self.iris, inspect_values=True)

    def check_sepal_length(column_stats):
        # Expected summary of iris 'sepal length': 150 values, none missing.
        self.assertAlmostEqual(column_stats.min, 4.3)
        self.assertAlmostEqual(column_stats.max, 7.9)
        self.assertAlmostEqual(column_stats.mean, 5.8, places=1)
        self.assertEqual(column_stats.nans, 0)
        self.assertEqual(column_stats.non_nans, 150)

    check_sepal_length(BasicStats(iris, iris.domain['sepal length']))

    domain_stats = DomainBasicStats(iris, include_metas=True)
    # One stats entry per domain variable plus one per meta.
    self.assertEqual(
        len(domain_stats.stats),
        len(iris.domain) + len(iris.domain.metas),
    )
    check_sepal_length(domain_stats['sepal length'])