def test_compute_hist_multi_columns(self):
    """Check the bin edges and per-column counts for a two-column frame."""
    kdf = ks.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
            "b": [50, 50, 30, 30, 30, 24, 10, 5, 4, 3, 1],
        }
    )

    # Requesting 10 buckets is expected to give 11 evenly spaced edges
    # running from 1 to 50.
    bins = HistogramPlotBase.get_bins(kdf.to_spark(), 10)
    self.assert_eq(pd.Series(np.linspace(1, 50, 11)), pd.Series(bins))

    # (column name, expected count per bucket), in column order.
    expected = [
        ("a", np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1])),
        ("b", np.array([4, 1, 0, 0, 1, 3, 0, 0, 0, 2])),
    ]

    histograms = HistogramPlotBase.compute_hist(kdf, bins)
    for histogram, (column, counts) in zip(histograms, expected):
        self.assert_eq(pd.Series(counts, name=column), histogram, almost=True)
def plot_histogram(data: Union["ks.DataFrame", "ks.Series"], **kwargs):
    """Build a stacked plotly bar figure showing a histogram of ``data``.

    Parameters
    ----------
    data : ks.DataFrame or ks.Series
        Input passed to ``HistogramPlotBase.prepare_hist_data``.
    **kwargs
        Only ``bins`` is read here (default ``10``).

    Returns
    -------
    plotly.graph_objs.Figure
        One ``Bar`` trace per computed histogram series, stacked, with the
        x axis titled "value" and the y axis titled "count".
    """
    import plotly.graph_objs as go

    requested_bins = kwargs.get("bins", 10)
    kdf, bins = HistogramPlotBase.prepare_hist_data(data, requested_bins)
    assert len(bins) > 2, "the number of buckets must be higher than 2."
    histograms = list(HistogramPlotBase.compute_hist(kdf, bins))

    # Round the edges to nine decimals so the hover labels look tidy.
    edges = [float("%.9f" % edge) for edge in bins]
    text_bins = [
        "[%s, %s)" % (lower, upper) for lower, upper in zip(edges[:-1], edges[1:])
    ]
    # The last bucket is closed on the right: swap the trailing ")" for "]".
    text_bins[-1] = text_bins[-1][:-1] + "]"

    # Bars are positioned at the midpoint of each bucket.
    centers = 0.5 * (bins[:-1] + bins[1:])

    bars = []
    for series in histograms:
        series_name = name_like_string(series.name)
        hover = "variable=" + series_name + "<br>value=%{text}<br>count=%{y}"
        bars.append(
            go.Bar(
                x=centers,
                y=series,
                name=series_name,
                text=text_bins,
                hovertemplate=hover,
            )
        )

    fig = go.Figure(data=bars, layout=go.Layout(barmode="stack"))
    fig["layout"]["xaxis"]["title"] = "value"
    fig["layout"]["yaxis"]["title"] = "count"
    return fig
def test_compute_hist_single_column(self):
    """Check the bin edges and counts for one column with a non-unique index."""
    kdf = ks.DataFrame(
        {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50]},
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10],
    )
    column_frame = kdf[["a"]]

    bins = HistogramPlotBase.get_bins(column_frame.to_spark(), 10)
    histogram = HistogramPlotBase.compute_hist(column_frame, bins)[0]

    # Requesting 10 buckets is expected to give 11 evenly spaced edges
    # running from 1 to 50.
    self.assert_eq(pd.Series(np.linspace(1, 50, 11)), pd.Series(bins))

    expected_counts = np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1])
    self.assert_eq(pd.Series(expected_counts, name="a"), histogram, almost=True)
def _make_plot(self):
    """Plot one histogram per column label of ``self.data``.

    The counts come from ``HistogramPlotBase.compute_hist`` and are paired
    positionally with ``self.data._internal.column_labels``.
    """
    # TODO: this logic is similar with KdePlot. Might have to deduplicate it.
    # 'num_colors' requires to calculate `shape` which has to count all.
    # Use 1 for now to save the computation.
    colors = self._get_colors(num_colors=1)
    stacking_id = self._get_stacking_id()
    histograms = HistogramPlotBase.compute_hist(self.data, self.bins)
    column_labels = self.data._internal.column_labels

    for idx, (column_label, counts) in enumerate(zip(column_labels, histograms)):
        ax = self._get_ax(idx)
        kwds = self.kwds.copy()

        # A label with a single element is displayed as that element alone;
        # longer labels are displayed whole.
        if len(column_label) > 1:
            legend_label = pprint_thing(column_label)
        else:
            legend_label = pprint_thing(column_label[0])
        kwds["label"] = legend_label

        style, kwds = self._apply_style_colors(colors, kwds, idx, legend_label)
        if style is not None:
            kwds["style"] = style
        kwds = self._make_plot_keywords(kwds, counts)

        artists = self._plot(
            ax, counts, column_num=idx, stacking_id=stacking_id, **kwds
        )
        self._add_legend_handle(artists[0], legend_label, index=idx)
def _compute_plot_data(self):
    """Replace ``self.data`` and ``self.bins`` with their prepared versions.

    Both values are passed to ``HistogramPlotBase.prepare_hist_data`` and
    the returned pair is stored back on the instance.
    """
    prepared = HistogramPlotBase.prepare_hist_data(self.data, self.bins)
    self.data, self.bins = prepared