def test_build_already_build(self, calculate_bins_func, add_hist_func):
    """A Histogram already flagged as built must not redo its build work.

    The two extra arguments are presumably mocks injected by patch
    decorators outside this view (TODO confirm); the test only reads
    their ``called`` flag.
    """
    already_built = Histogram()
    already_built.is_build = True  # pretend an earlier build() has run

    already_built.build()

    # Neither collaborator may have been invoked by the repeated build.
    self.assertFalse(add_hist_func.called)
    self.assertFalse(calculate_bins_func.called)
def test_build(self):
    """Building a fresh Histogram fills in its bins and per-column values.

    Two columns are added to a 2-bin Histogram. After ``build()`` the test
    expects three entries in ``bin_list`` (presumably the borders of the
    two bins), one ``hist_dict`` entry per added column, and ``is_build``
    set.
    """
    histogram = Histogram(bins=2)
    source_df = self.create_test_df()

    for column_name in ('value', 'value2'):
        histogram.add_column(source_df.select(F.col(column_name)))

    histogram.build()

    # NOTE(review): test_add_hist_single_value reads `bin_boundaries`
    # rather than `bin_list` -- confirm both attributes exist.
    self.assertEqual(3, len(histogram.bin_list))
    self.assertEqual(2, len(histogram.hist_dict))
    self.assertTrue(histogram.is_build)
def test_add_hist_single_value(self):
    """A column holding one distinct value gets bins centred on that value.

    A 100-row column containing only ``single_column_value`` is added to a
    Histogram with ``nr_bins`` bins. After ``build()`` the test expects:

    * ``nr_bins + 1`` bin boundaries,
    * the lowest boundary at the value minus 0.5,
    * the highest boundary at the value plus 0.5,
    * every row counted in the middle bin.
    """
    single_column_value = 1
    nr_bins = 5
    column_values = [single_column_value] * 100

    test_df = self.sqlCtx.createDataFrame(
        pd.DataFrame({'foo': column_values}))
    hist = Histogram(bins=nr_bins)
    hist.add_column(test_df.select(F.col('foo')))
    hist.build()

    # n bins are delimited by n + 1 boundaries.
    self.assertEqual(nr_bins + 1, len(hist.bin_boundaries))
    self.assertEqual(single_column_value - 0.5, min(hist.bin_boundaries))
    self.assertEqual(single_column_value + 0.5, max(hist.bin_boundaries))

    # Floor division always yields an int index; math.floor(nr_bins / 2)
    # returns a float on Python 2, which cannot index a list.
    middle_bin = nr_bins // 2
    self.assertEqual(len(column_values), hist.hist_dict['foo'][middle_bin])