def get_histogram(data_id):
    """
    :class:`flask:flask.Flask` route which returns output from numpy.histogram to front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param bins: the number of bins to display in your histogram, options on the front-end are 5, 10, 20, 50
    :returns: JSON {data: bin counts, labels: formatted bin edges, desc: output of load_describe for the column}
              on success, otherwise JSON {error: message, traceback: formatted traceback}
    """
    col = get_str_arg(request, 'col', 'values')
    query = get_str_arg(request, 'query')
    bins = get_int_arg(request, 'bins', 20)
    try:
        data = DATA[data_id]
        if query:
            # NOTE(review): `query` comes straight from the request and is evaluated by
            # DataFrame.query -- confirm this route is only reachable by trusted users.
            data = data.query(query)
        selected_col = find_selected_column(data, col)
        # histogram only the non-null values of the selected column
        data = data[~pd.isnull(data[selected_col])][[selected_col]]
        counts, edges = np.histogram(data, bins=bins)
        # NOTE(review): the result is handed to jsonify unchanged; if load_describe returns a
        # (desc, code) pair rather than a plain dict this needs unpacking -- confirm.
        desc = load_describe(data[selected_col])
        return jsonify(
            data=[json_float(count) for count in counts],
            # every bin edge is emitted, so `labels` has one more entry than `data`
            labels=['{0:.1f}'.format(edge) for edge in edges],
            desc=desc,
        )
    except Exception as e:
        # Exception (not BaseException) so KeyboardInterrupt/SystemExit still propagate
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def build_histogram(data_id, col, query, point_filter):
    """
    Build a bar chart showing a fixed 10-bin histogram of one column.

    The data for ``data_id`` is first filtered by ``query``, then by a second
    query derived from ``point_filter``; null values of ``col`` are dropped
    before the histogram is computed.

    :param data_id: identifier of the data to load
    :param col: name of the column to build the histogram for
    :param query: query string applied to the loaded data
    :param point_filter: filter passed (as a single-item list) to build_group_inputs_filter
    :return: the object returned by bar_builder, with a categorical x-axis and a title set
    """
    base = run_query(
        handle_predefined(data_id),
        query,
        global_state.get_context_variables(data_id),
    )
    point_query, _ = build_group_inputs_filter(base, [point_filter])
    filtered = run_query(base, point_query)
    values = filtered[~pd.isnull(filtered[col])][col]

    counts, edges = np.histogram(values, bins=10)
    # label each bar with the upper edge of its bin
    bin_labels = [json_float(edge, precision=3) for edge in edges[1:]]

    axes = build_axes(
        {
            "data": {"all": {"Frequency": counts, "Bins": bin_labels}},
            "min": {"Frequency": 0},
            "max": {"Frequency": max(counts)},
        },
        "Bins",
        {"type": "single", "data": {}},
    )
    chart_data = {"data": {"all": {"x": bin_labels, "Frequency": counts}}}
    bars = bar_builder(
        chart_data,
        "Bins",
        ["Frequency"],
        axes,
        chart_builder_passthru,
        modal=True,
    )

    layout = bars.figure["layout"]
    # treat the bin labels as discrete categories rather than a numeric axis
    layout["xaxis"]["type"] = "category"
    layout["title"]["text"] = "{} {} ({} {})".format(
        text("Histogram of"), col, len(values), text("data points")
    )
    return bars
def build(self, parent):
    """
    Assemble the histogram payload (labels, counts, optional KDE, describe stats)
    for ``parent``'s selected column.

    :param parent: object exposing ``classifier``, ``data``, ``selected_col`` and ``data_id``
    :return: tuple of (payload dict, result of ``self._build_code``)
    """
    column = parent.selected_col
    if parent.classifier == "D":
        # presumably "D" marks date columns; values are converted in place
        # with json_timestamp before binning -- TODO confirm
        parent.data.loc[:, column] = apply(parent.data[column], json_timestamp)
    series = parent.data[column]

    counts, edges = np.histogram(series, bins=self.bins)
    payload = {
        # the first edge is only the minimum, so labels start at the second edge
        "labels": ["{0:.1f}".format(edge) for edge in edges[1:]],
        "data": [json_float(count) for count in counts],
    }

    kde, kde_code = build_kde(series, edges, column)
    if kde is not None:
        payload["kde"] = kde

    desc, desc_code = load_describe(series)
    dtype_info = global_state.get_dtype_info(parent.data_id, column)
    # copy over skew/kurtosis when the stored dtype info has them
    for stat in ("skew", "kurt"):
        if stat in dtype_info:
            desc[stat] = dtype_info[stat]
    payload["desc"] = desc

    return payload, self._build_code(parent, kde_code, desc_code)
def build_histogram_data(self, series):
    """
    Compute a histogram of ``series`` using ``self.bins`` bins.

    :param series: values to bin
    :return: tuple of (dict with formatted ``labels`` and JSON-safe ``data`` counts,
             the raw bin edges returned by numpy.histogram)
    """
    counts, edges = np.histogram(series, bins=self.bins)
    payload = {
        # the first edge is only the minimum, so labels start at the second edge
        "labels": ["{0:.1f}".format(edge) for edge in edges[1:]],
        "data": [json_float(count) for count in counts],
    }
    return payload, edges
def build_kde(s, hist_labels, selected_col):
    """
    Evaluate a gaussian kernel density estimate of ``s`` at the histogram bin edges.

    :param s: values to fit the KDE on
    :param hist_labels: points (the histogram bin edges) at which the density is evaluated
    :param selected_col: column name, only used in the generated code snippet
    :return: tuple of (list of JSON-safe density values, list of code-snippet lines), or
             ``(None, [])`` when a KDE cannot be computed for the data
    """
    try:
        kde = sts.gaussian_kde(s)
        densities = kde.pdf(hist_labels)
    except (np.linalg.LinAlgError, ValueError):
        # LinAlgError: singular covariance (e.g. constant data).
        # ValueError: scipy rejects datasets with fewer than two elements.
        # Either way there is no KDE to show, so degrade gracefully.
        return None, []

    kde_data = [json_float(k, precision=12) for k in densities]
    code = [
        "import scipy.stats as sts",
        "kde = sts.gaussian_kde(s['{}'])".format(selected_col),
        "kde_data = kde.pdf(np.linspace(labels.min(), labels.max()))",
    ]
    return kde_data, code
def build_histogram_data(self, series):
    """
    Compute a histogram of ``series``, either as a density or as raw counts.

    When ``self.density`` is truthy the histogram is computed with ``density=True``
    and no explicit bin count (numpy's default applies); otherwise ``self.bins``
    bins are used.

    :param series: values to bin
    :return: tuple of (dict with formatted ``labels`` and JSON-safe ``data`` values,
             the raw bin edges returned by numpy.histogram)
    """
    if self.density:
        values, edges = np.histogram(series, density=True)
    else:
        values, edges = np.histogram(series, bins=self.bins)
    payload = {
        # the first edge is only the minimum, so labels start at the second edge
        "labels": ["{0:.1f}".format(edge) for edge in edges[1:]],
        "data": [json_float(value) for value in values],
    }
    return payload, edges
def load_describe(column_series, additional_aggs=None):
    """
    Helper function for grabbing the output from :meth:`pandas:pandas.Series.describe` in a JSON serializable format

    :param column_series: data to describe
    :type column_series: :class:`pandas:pandas.Series`
    :param additional_aggs: optional names of extra aggregations to add; "mode" is handled specially,
                            any other name is called as a method of the same name on the series
    :type additional_aggs: list, optional
    :return: tuple of (JSON serializable dictionary of the output from calling
             :meth:`pandas:pandas.Series.describe` plus ``total_count``, ``missing_pct`` and ``missing_ct``,
             list of code-snippet strings reproducing the statistics)
    """
    col_name = column_series.name
    desc = column_series.describe().to_frame().T
    code = [
        "# main statistics",
        "stats = df['{col}'].describe().to_frame().T".format(col=col_name),
    ]
    for agg in additional_aggs or []:
        if agg == "mode":
            mode = column_series.mode().values
            # Report a mode only when it is unique. Ties and an empty result
            # (e.g. an all-null column, which used to raise IndexError) become NaN.
            desc["mode"] = np.nan if len(mode) != 1 else mode[0]
            code.append(
                (
                    "# mode\n"
                    "mode = df['{col}'].mode().values\n"
                    "stats['mode'] = np.nan if len(mode) != 1 else mode[0]"
                ).format(col=col_name)
            )
        else:
            desc[agg] = getattr(column_series, agg)()
            code.append(
                "# {agg}\nstats['{agg}'] = df['{col}'].{agg}()".format(
                    col=col_name, agg=agg
                )
            )

    desc_f_overrides = {
        "I": lambda f, i, c: f.add_int(i, c, as_string=True),
        "F": lambda f, i, c: f.add_float(i, c, precision=4, as_string=True),
    }
    desc_f = grid_formatter(
        grid_columns(desc), nan_display="nan", overrides=desc_f_overrides
    )
    desc = desc_f.format_dict(next(desc.itertuples(), None))
    if "count" in desc:
        # pandas always returns 'count' as a float and it adds useless decimal points
        desc["count"] = desc["count"].split(".")[0]
    desc["total_count"] = json_int(len(column_series), as_string=True)
    missing_ct = column_series.isnull().sum()
    desc["missing_pct"] = json_float((missing_ct / len(column_series) * 100).round(2))
    desc["missing_ct"] = json_int(missing_ct, as_string=True)
    return desc, code