def test_get_perc_works_as_expected(self):
    """Check ``DataFrameSummary._percent`` against known formatted strings."""
    # Inputs whose expected rendering keeps two decimal places.
    with_decimals = [
        (0.123, '12.30%'),
        (3.1243453, '312.43%'),
        (213.12312, '21,312.31%'),
    ]
    # Inputs whose expected rendering has no decimal part.
    whole_percents = [(0.14, '14%'), (1.300, '130%')]

    # One pass over both groups; the assertion order is unchanged.
    for value, formatted in with_decimals + whole_percents:
        self.assertEqual(DataFrameSummary._percent(value), formatted)
def test_bool1_summary(self):
    """Compare the 'dbool1' column summary with a hand-built Series.

    Both sides are sorted by index label before the comparison, so the
    order of the summary rows does not matter here.
    """
    # NOTE(review): this file contains another definition named
    # ``test_bool1_summary``; if both live in the same class, the later
    # one replaces this one -- confirm which is intended.
    column = self.df['dbool1']
    tallies = column.value_counts()
    non_null = column.count()
    zeros = tallies[0]
    ones = tallies[1]

    # (label, value) pairs keep each summary row next to its expected value.
    rows = [
        ('"0" count', str(zeros)),
        ('"0" perc', DataFrameSummary._percent(zeros / non_null)),
        ('"1" count', str(ones)),
        ('"1" perc', DataFrameSummary._percent(ones / non_null)),
        ('counts', self.size),
        ('uniques', 2),
        ('missing', 0),
        ('missing_perc', '0%'),
        ('types', DataFrameSummary.TYPE_BOOL),
    ]
    expected = pd.Series(
        data=[value for _, value in rows],
        index=[label for label, _ in rows],
        name='dbool1',
        dtype=object,
    ).sort_index()

    actual = self.dfs['dbool1'].sort_index()
    assert_series_equal(actual, expected)
def test_numerics_summary(self):
    """Compare the 'dnumerics1' column summary with a hand-built Series.

    The expected rows are listed in the exact order the summary is
    required to produce, since the comparison is not sorted.
    """
    # NOTE(review): this file contains another definition named
    # ``test_numerics_summary``; if both live in the same class, the later
    # one replaces this one -- confirm which is intended.
    series = self.df['dnumerics1']
    dev_mean, dev_mean_perc = self.dfs._get_deviation_of_mean(series)
    dev_median, dev_median_perc = self.dfs._get_median_absolute_deviation(
        series)

    # Intermediate values that appear in more than one expected row.
    mean = series.mean()
    std = series.std()
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    zero_count = self.size - np.count_nonzero(series)
    # Coefficient of variation is NaN when the mean is falsy (e.g. zero).
    coeff_var = std / mean if mean else np.nan

    rows = [
        ('mean', mean),
        ('std', std),
        ('variance', series.var()),
        ('min', series.min()),
        ('max', series.max()),
        ('5%', series.quantile(0.05)),
        ('25%', q25),
        ('50%', series.quantile(0.5)),
        ('75%', q75),
        ('95%', series.quantile(0.95)),
        ('iqr', q75 - q25),
        ('kurtosis', series.kurt()),
        ('skewness', series.skew()),
        ('sum', series.sum()),
        ('mad', series.mad()),
        ('cv', coeff_var),
        ('zeros_num', zero_count),
        ('zeros_perc', DataFrameSummary._percent(zero_count / self.size)),
        ('deviating_of_mean', dev_mean),
        ('deviating_of_mean_perc', dev_mean_perc),
        ('deviating_of_median', dev_median),
        ('deviating_of_median_perc', dev_median_perc),
        ('top_correlations', 'dnumerics2: 100%'),
        ('counts', self.size),
        ('uniques', self.size),
        ('missing', 0),
        ('missing_perc', '0%'),
        ('types', DataFrameSummary.TYPE_NUMERIC),
    ]
    expected = pd.Series(
        data=[value for _, value in rows],
        index=[label for label, _ in rows],
        name='dnumerics1',
        dtype=object,
    )

    assert_series_equal(self.dfs['dnumerics1'], expected)
def test_bool1_summary(self):
    """Compare the 'dbool1' column summary with a hand-built Series.

    No sorting is applied, so the summary must list its rows in the same
    order as the expected Series built here.
    """
    # NOTE(review): an earlier definition named ``test_bool1_summary``
    # also appears in this file; if both live in the same class, this one
    # replaces it -- confirm which is intended.
    column = self.df['dbool1']
    tallies = column.value_counts()
    non_null = column.count()
    zeros = tallies[0]
    ones = tallies[1]

    # (label, value) pairs keep each summary row next to its expected value.
    rows = [
        ('"0" count', str(zeros)),
        ('"0" perc', DataFrameSummary._percent(zeros / non_null)),
        ('"1" count', str(ones)),
        ('"1" perc', DataFrameSummary._percent(ones / non_null)),
        ('counts', self.size),
        ('uniques', 2),
        ('missing', 0),
        ('missing_perc', '0%'),
        ('types', DataFrameSummary.TYPE_BOOL),
    ]
    expected = pd.Series(
        data=[value for _, value in rows],
        index=[label for label, _ in rows],
        name='dbool1',
        dtype=object,
    )

    assert_series_equal(self.dfs['dbool1'], expected)
def test_numerics_summary(self):
    """Compare the 'dnumerics1' column summary with a hand-built Series.

    The expected rows, including 'mode', are listed in the exact order
    the summary is required to produce, since the comparison is not
    sorted.
    """
    # NOTE(review): an earlier definition named ``test_numerics_summary``
    # also appears in this file; if both live in the same class, this one
    # replaces it -- confirm which is intended.
    series = self.df['dnumerics1']
    dev_mean, dev_mean_perc = self.dfs._get_deviation_of_mean(series)
    dev_median, dev_median_perc = self.dfs._get_median_absolute_deviation(
        series)

    # Intermediate values that appear in more than one expected row.
    mean = series.mean()
    std = series.std()
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    zero_count = self.size - np.count_nonzero(series)
    # Coefficient of variation is NaN when the mean is falsy (e.g. zero).
    coeff_var = std / mean if mean else np.nan

    rows = [
        ('mean', mean),
        ('std', std),
        ('variance', series.var()),
        ('min', series.min()),
        ('max', series.max()),
        # First entry of the values returned by ``Series.mode``.
        ('mode', series.mode()[0]),
        ('5%', series.quantile(0.05)),
        ('25%', q25),
        ('50%', series.quantile(0.5)),
        ('75%', q75),
        ('95%', series.quantile(0.95)),
        ('iqr', q75 - q25),
        ('kurtosis', series.kurt()),
        ('skewness', series.skew()),
        ('sum', series.sum()),
        ('mad', series.mad()),
        ('cv', coeff_var),
        ('zeros_num', zero_count),
        ('zeros_perc', DataFrameSummary._percent(zero_count / self.size)),
        ('deviating_of_mean', dev_mean),
        ('deviating_of_mean_perc', dev_mean_perc),
        ('deviating_of_median', dev_median),
        ('deviating_of_median_perc', dev_median_perc),
        ('top_correlations', 'dnumerics2: 100%'),
        ('counts', self.size),
        ('uniques', self.size),
        ('missing', 0),
        ('missing_perc', '0%'),
        ('types', DataFrameSummary.TYPE_NUMERIC),
    ]
    expected = pd.Series(
        data=[value for _, value in rows],
        index=[label for label, _ in rows],
        name='dnumerics1',
        dtype=object,
    )

    assert_series_equal(self.dfs['dnumerics1'], expected)