예제 #1
0
def describe(column):
    """
    Flask route which returns standard details about a single column of the data loaded for the current port
    (as produced by ``load_describe``) to the front-end as JSON

    :param column: name of the column to describe, used as the key into the loaded data
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        uniques: array of formatted unique values, only present when data has <= 100 unique values
        success: True
    } or, when an exception is raised, JSON {
        error: string representation of the exception,
        traceback: formatted traceback
    }

    """
    try:
        data = DATA[get_port()]
        col_data = data[column]
        desc = load_describe(col_data)
        return_data = dict(describe=desc, success=True)
        uniq_vals = col_data.unique()
        # backfill the unique count when load_describe did not provide one
        if 'unique' not in desc:
            desc['unique'] = json_int(len(uniq_vals), as_string=True)
        # only ship the raw unique values when there are few enough of them
        if len(uniq_vals) <= 100:
            uniq_f = find_dtype_formatter(get_dtypes(data)[column])
            return_data['uniques'] = [
                uniq_f(u, nan_display='N/A') for u in uniq_vals
            ]
        return jsonify(return_data)
    except Exception as e:
        # Exception rather than BaseException so KeyboardInterrupt/SystemExit
        # still propagate instead of being turned into a JSON payload
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
예제 #2
0
    def build(self, parent):
        """
        Build the histogram payload for ``parent.selected_col``.

        When ``parent.classifier`` is "D" the selected column is first overwritten in ``parent.data`` with the
        result of applying ``json_timestamp``.  Without a target a single histogram (plus KDE, when one could
        be built) is produced; with a target one histogram is produced per target value.

        :param parent: object exposing ``data``, ``selected_col``, ``classifier`` and ``data_id``
        :return: tuple of (payload dictionary, output of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        kde_code = []
        if self.target is not None:
            return_data = {"targets": [], "labels": list(range(self.bins))}
            format_target = find_dtype_formatter(
                find_dtype(parent.data[self.target]))
            grouped = parent.data[[self.target, col]].groupby(self.target)
            for target_val, target_rows in grouped:
                hist, _ = self.build_histogram_data(target_rows[col])
                hist["target"] = format_target(target_val, as_string=True)
                return_data["targets"].append(hist)
        else:
            return_data, hist_labels = self.build_histogram_data(
                parent.data[col])
            kde, kde_code = build_kde(parent.data[col], hist_labels, col)
            if kde is not None:
                return_data["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        # carry over skew/kurt when the stored dtype info has them
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]

        return_data["desc"] = desc
        code = self._build_code(parent, kde_code, desc_code)
        return return_data, code
예제 #3
0
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about a single column (as produced by
    ``load_describe``) to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: name of the column to describe
    :type column: str
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        uniques: {
            data: array of formatted unique values (the 100 most common ones when there are more than 100),
            top: True when ``data`` was truncated to the most common values, False otherwise
        },
        success: True
    } or, when an exception is raised, JSON {
        error: string representation of the exception,
        traceback: formatted traceback
    }

    """
    try:
        data = DATA[data_id][[column]]
        additional_aggs = None
        dtype = next((dtype_info['dtype'] for dtype_info in DTYPES[data_id]
                      if dtype_info['name'] == column), None)
        # integer & float columns get extra aggregations on top of describe()
        if classify_type(dtype) in ['I', 'F']:
            additional_aggs = [
                'sum', 'median', 'mode', 'var', 'sem', 'skew', 'kurt'
            ]
        desc = load_describe(data[column], additional_aggs=additional_aggs)
        return_data = dict(describe=desc, success=True)
        uniq_vals = data[column].unique()
        # backfill the unique count when load_describe did not provide one
        if 'unique' not in desc:
            desc['unique'] = json_int(len(uniq_vals), as_string=True)
        uniq_f = find_dtype_formatter(get_dtypes(data)[column])
        top = len(uniq_vals) > 100
        if top:  # get top 100 most common values
            uniq_vals = data[column].value_counts().sort_values(
                ascending=False).head(100).index.values
        return_data['uniques'] = dict(data=[uniq_f(u) for u in uniq_vals],
                                      top=top)

        return jsonify(return_data)
    except Exception as e:
        # Exception rather than BaseException so KeyboardInterrupt/SystemExit
        # still propagate instead of being turned into a JSON payload
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
예제 #4
0
    def build(self, parent):
        """
        Build the histogram payload for ``parent.selected_col``.

        When ``parent.classifier`` is "D" the selected column is first overwritten in ``parent.data`` with the
        result of applying ``json_timestamp``.  Without a target a single histogram (plus KDE, when one could
        be built) is produced.  With a target the selected column is cut into ``self.bins`` bins, the bin
        labels are stored in a "bin" column on ``parent.data`` and per-bin counts are produced for every
        target value.

        :param parent: object exposing ``data``, ``selected_col``, ``classifier`` and ``data_id``
        :return: tuple of (payload dictionary, output of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        kde_code = []
        if self.target is not None:
            binned = pd.cut(parent.data[col], bins=self.bins)
            labels = list(map("{}".format, binned.dtype.categories))
            parent.data.loc[:, "bin"] = binned.astype("str")
            return_data = {"targets": [], "labels": labels}
            format_target = find_dtype_formatter(
                find_dtype(parent.data[self.target])
            )
            grouped = parent.data[[self.target, "bin"]].groupby(self.target)
            for target_val, target_rows in grouped:
                # align counts to the full label list so empty bins show up as 0
                per_bin = target_rows["bin"].value_counts().reindex(
                    labels, fill_value=0
                )
                counts = [int(n) for n in per_bin.values]
                return_data["targets"].append(
                    {
                        "target": format_target(target_val, as_string=True),
                        "data": counts,
                    }
                )
        else:
            return_data, hist_labels = self.build_histogram_data(parent.data[col])
            kde, kde_code = build_kde(parent.data[col], hist_labels, col)
            if kde is not None:
                return_data["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        # carry over skew/kurt when the stored dtype info has them
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]

        return_data["desc"] = desc
        code = self._build_code(parent, kde_code, desc_code)
        return return_data, code
예제 #5
0
파일: utils.py 프로젝트: reza1615/dtale
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the filter strings built for each group.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart, when None a sequential 1..N column named "x" is generated
    :type x: str, optional
    :param y: column(s) to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by (used as a list of column names)
    :type group_col: list of strings, optional
    :param group_val: passed through to ``retrieve_chart_data``
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum.  For un-grouped charts the values "raw" and
                        "drop_duplicates" also cause duplicates to be allowed
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when true, return only the prepared dataframe (x column renamed back to its original name)
                       instead of the chart dictionary & code
    :type return_raw: bool, optional
    :param unlimited_data: passed through to ``check_exceptions`` for un-grouped charts
    :type unlimited_data: bool, optional
    :param animate_by: column whose values are used to split the data into animation frames
    :type animate_by: str, optional
    :param kwargs: the optional "z" key names the column to be used as z-axis
    :raises ChartBuildingError: when the grouping produces more than 30 distinct groups
    :return: the prepared dataframe when ``return_raw`` is true, otherwise a tuple of (dictionary with keys "data",
             "min", "max" and, when animating, "frames", list of code snippet strings)
    """
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    x_col = str("x")
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # y columns only take part in sorting when a z-axis is present
    sort_cols = y_cols if len(z_cols) else []
    # grouped charts: one series per combination of group values
    if group_col is not None and len(group_col):
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        # too many groups: fail with a message listing the available group values
        if len(group_vals) > MAX_GROUPS:
            # NOTE(review): dtypes & group_fmts are not used before the raise below
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            # raw callers get the dataframe only, with the original x column name restored
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        # yields (filter string, formatted data) for each group found in df
        def _load_groups(df):
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            # the top-level data is a copy of the last frame's data
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # un-grouped charts: a single series stored under the key "all"
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]

    data = data.rename(columns={x: x_col})
    # keep main_group in sync with the renamed x column
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col,
                                        animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        # raw callers get the dataframe only, with the original x column name restored
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    # a higher data limit applies when a z-axis or animation is in use
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        # the top-level data is a copy of the last frame's data
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
예제 #6
0
파일: utils.py 프로젝트: kennethjhim/dtale
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_type=None,
                     group_val=None,
                     bins_val=None,
                     bin_type=None,
                     agg=None,
                     extended_aggregation=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     cleaners=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the labels built for each group.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart, when None a sequential 1..N column named "x" is generated
    :type x: str, optional
    :param y: column(s) to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by (used as a list of column names)
    :type group_col: list of strings, optional
    :param group_type: when "bins", integer group columns are binned as well (float group columns always are)
    :type group_type: str, optional
    :param group_val: passed through to ``retrieve_chart_data``
    :param bins_val: number of bins to use when binning group columns
    :type bins_val: int, optional
    :param bin_type: "width" bins with :func:`pandas.qcut`, any other value bins with :func:`pandas.cut` over
                     interpolated bin edges
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum.  For un-grouped charts the values "raw" and
                        "drop_duplicates" also cause duplicates to be allowed
    :type agg: str, optional
    :param extended_aggregation: list of aggregation configurations passed through to ``build_agg_data``,
                                 None is treated as an empty list
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when true, return only the prepared dataframe (x column renamed back to its original name)
                       instead of the chart dictionary & code
    :type return_raw: bool, optional
    :param unlimited_data: passed through to ``check_exceptions`` for un-grouped charts
    :type unlimited_data: bool, optional
    :param animate_by: column whose values are used to split the data into animation frames
    :type animate_by: str, optional
    :param cleaners: cleaner names applied (comma-joined, via ``handle_cleaners``) to every string column,
                     None is treated as an empty list
    :type cleaners: list of strings, optional
    :param kwargs: the optional "z" key names the column to be used as z-axis
    :raises ChartBuildingError: when the grouping produces more than ``MAX_GROUPS`` distinct groups
    :return: the prepared dataframe when ``return_raw`` is true, otherwise a tuple of (dictionary with keys "data",
             "min", "max" and, when animating, "frames", list of code snippet strings)
    """
    # None defaults instead of shared mutable list defaults
    extended_aggregation = extended_aggregation or []
    cleaners = cleaners or []
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    if len(cleaners):
        # cleaners only apply to string columns
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))

    x_col = str("x")
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)

    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # y columns only take part in sorting/duplicate checks when a z-axis is present
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols
    # grouped charts: one series per combination of group values
    if group_col is not None and len(group_col):
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            if classifier == "F" or (classifier == "I"
                                     and group_type == "bins"):
                if bin_type == "width":
                    # NOTE(review): qcut is quantile-based; confirm this is the
                    # intended behaviour for the "width" bin type
                    data.loc[:, col] = pd.qcut(data[col],
                                               q=bins_val,
                                               duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(data[col],
                                              bins=equal_freq_bins,
                                              duplicates="drop").astype("str")
                    # the exported snippet must read from chart_data like every
                    # other snippet; "data" is not a name the snippets define
                    code.append((
                        "bins_data = chart_data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col,
                                                     bins=bins_val + 1))

        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")

        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code

        group_vals = data[group_col].drop_duplicates()
        # too many groups: fail with a message listing the available group values
        if len(group_vals) > MAX_GROUPS:
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            # raw callers get the dataframe only, with the original x column name restored
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        # yields (group label, formatted data incl. "_filter_") for each group found in df
        def _load_groups(df):
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            # the top-level data is a copy of the last frame's data
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code

    # un-grouped charts: a single series stored under the key "all"
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    data = data[main_group + final_cols]

    data = data.rename(columns={x: x_col})
    # keep main_group in sync with the renamed x column
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")

    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()

    if return_raw:
        # raw callers get the dataframe only, with the original x column name restored
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = main_group + y_group_cols
    # the "3d_points" limit applies when a z-axis or animation is in use
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        # the top-level data is a copy of the last frame's data
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
예제 #7
0
파일: utils.py 프로젝트: shalevy1/dtale
def build_chart(raw_data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups (joined by "/").

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column(s) to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by (used as a list of column names)
    :type group_col: list of strings, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param kwargs: the optional "z" key names the column to be used as z-axis
    :raises Exception: when the grouping produces more than 15 distinct groups
    :return: tuple of (dictionary with keys "data", "min", "max" and, for grouped charts, "dtypes", list of code
             snippet strings)
    """

    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'), group_col)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = []
    if z_col is not None:
        z_cols = [z_col]
    # grouped charts: one series per combination of group values
    if group_col is not None and len(group_col):
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})")
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            # the snippet runs after the rename snippet above, so it has to group on the
            # renamed x column exactly like the executed code does
            code.append("chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()".format(
                cols="', '".join(group_col + [x_col]), agg=agg
            ))
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)

        data = data.dropna()
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )

        dtypes = get_dtypes(data)
        group_fmt_overrides = {'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')}
        group_fmts = {c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # series key is the formatted group values joined by "/"
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True) for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        ret_data['dtypes'] = {c: classify_type(dtype) for c, dtype in dtypes.items()}
        return ret_data, code
    # un-grouped charts: a single series stored under the key "all"
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    # NOTE(review): positional rename, assumes retrieve_chart_data returns columns ordered x, y..., z - confirm
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    # a higher data limit applies when a z-axis is in use
    check_exceptions(data[dupe_cols].rename(columns={'x': x}), allow_duplicates,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols}
    )
    return ret_data, code
예제 #8
0
def build_chart(data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of the x & y columns.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the '/'-joined values of the groups.

    :param data: dataframe to be used for chart
    :type data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column(s) to be used as y-axis of chart (normalized with `make_list`)
    :type y: list of strings
    :param group_col: columns to group chart data by (concatenated with other column lists, so it must be a list)
    :type group_col: list of str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum.  The special value 'rolling' is only handled when
                no `group_col` is specified and reads `rolling_win` & `rolling_comp` from `kwargs`.
    :type agg: str, optional
    :param allow_duplicates: when False (default) and no `group_col` is specified, duplicate x-values raise an error
    :type allow_duplicates: bool, optional
    :param kwargs: `rolling_win` (window size) & `rolling_comp` (name of the rolling computation) for agg='rolling'
    :return: dictionary with keys 'data', 'min' & 'max'
    :rtype: dict
    :raises Exception: if a charted column is entirely NaN, if there are more than 15 distinct groups, if x contains
                       duplicates (ungrouped, `allow_duplicates` False) or if there are more than 15,000 records
                       (ungrouped)
    """

    def build_formatters(df):
        cols = grid_columns(df)
        data_f = grid_formatter(cols, nan_display=None)
        # range values (min/max) have their floats formatted to 2 decimals
        overrides = {'F': lambda f, i, c: f.add_float(i, c, precision=2)}
        range_f = grid_formatter(cols, overrides=overrides, nan_display=None)
        return data_f, range_f

    def check_all_nan(df, cols):
        for col in cols:
            if df[col].isnull().all():
                raise Exception('All data for column "{}" is NaN!'.format(col))

    def build_ranges(df, range_f, cols):
        # formatted minimums & maximums for each of the formatter's columns that is being charted
        mins = {col: fmt(df[col].min(), None) for _, col, fmt in range_f.fmts if col in cols}
        maxs = {col: fmt(df[col].max(), None) for _, col, fmt in range_f.fmts if col in cols}
        return mins, maxs

    x_col = str('x')
    y_cols = make_list(y)
    if group_col is not None:
        data = data[group_col + [x] + y_cols].sort_values(group_col + [x])
        check_all_nan(data, [x] + y_cols)
        y_cols = [str(y_col) for y_col in y_cols]
        data.columns = group_col + [x_col] + y_cols
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)

        # formatters are built without the group columns since those become the series keys
        data_f, range_f = build_formatters(data[[x_col] + y_cols])
        mins, maxs = build_ranges(data, range_f, [x_col] + y_cols)
        ret_data = dict(data={}, min=mins, max=maxs)
        dtypes = get_dtypes(data)
        group_fmts = {c: find_dtype_formatter(dtypes[c]) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True) for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        return ret_data

    data = data[[x] + y_cols].sort_values(x)
    check_all_nan(data, [x] + y_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols
    if agg is not None:
        if agg == 'rolling':
            window, comp = map(kwargs.get, ['rolling_win', 'rolling_comp'])
            data = data.set_index(x_col).rolling(window=window)
            data = pd.DataFrame({c: getattr(data[c], comp)() for c in y_cols})
            data = data.reset_index()
        else:
            data = data.groupby(x_col)
            data = getattr(data[y_cols], agg)().reset_index()

    # vectorized check rather than iterating the boolean series in python
    if not allow_duplicates and data[x_col].duplicated().any():
        raise Exception('{} contains duplicates, please specify group or additional filtering'.format(x))
    if len(data) > 15000:
        raise Exception('Dataset exceeds 15,000 records, cannot render. Please apply filter...')
    data_f, range_f = build_formatters(data)
    mins, maxs = build_ranges(data, range_f, [x_col] + y_cols)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min=mins,
        max=maxs,
    )
    return ret_data
예제 #9
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of the charted columns.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be filter strings identifying each group.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column(s) to be used as y-axis of chart (normalized with `make_list`)
    :type y: list of strings
    :param group_col: columns to group chart data by (concatenated with other column lists, so it must be a list)
    :type group_col: list of str, optional
    :param group_val: forwarded to `retrieve_chart_data` as `group_val`
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum.  When grouping, 'raw' skips aggregation.
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: if True, return the prepared dataframe (x-axis column restored to its original name)
                       instead of the formatted chart data
    :type return_raw: bool, optional
    :param unlimited_data: forwarded to `check_exceptions` as `unlimited_data` (ungrouped charts only)
    :type unlimited_data: bool, optional
    :param kwargs: `z` (optional z-axis column); the whole dictionary is also handed to `build_agg_data`
    :return: tuple of (chart data dictionary with keys 'data', 'min' & 'max', list of code snippet strings) or,
             when `return_raw` is True, a :class:`pandas:pandas.DataFrame`
    :raises ChartBuildingError: when grouping produces more than 30 distinct groups
    """

    z_col = kwargs.get('z')
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     z_col,
                                     group_col,
                                     group_val=group_val)
    x_col = str('x')
    y_cols = make_list(y)
    z_cols = make_list(z_col)
    if group_col is not None and len(group_col):
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None and agg != 'raw':
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            # the snippet above has already renamed the x-axis column, so the generated groupby must
            # reference the renamed column (mirrors the groupby actually executed here)
            code.append(
                "chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()"
                .format(cols="', '".join(group_col + [x_col]), agg=agg))
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # format the distinct group values so they can be listed alongside the error
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                'Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. '
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                'are listed below:').format(', '.join(group_col), MAX_GROUPS)
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        range_cols = [x_col] + y_cols
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in range_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in range_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmt_overrides = {
            'I':
            lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
        }
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }
        # the classification of a group column does not change between groups
        classifiers = {c: classify_type(dtypes[c]) for c in group_col}
        for grp_key, grp in data.groupby(group_col):
            # each series is keyed by a filter string which identifies its group
            group_filter = ' and '.join(
                group_filter_handler(gc, group_fmts[gc](gv, as_string=True),
                                     classifiers[gc])
                for gv, gc in zip(make_list(grp_key), group_col))
            ret_data['data'][group_filter] = data_f.format_lists(grp)
        return ret_data, code

    # z-axis charts are additionally sorted by their y-axis columns
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(
        cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    # the checked frame gets the original x-axis column name back before validation
    check_exceptions(data[dupe_cols].rename(columns={'x': x}),
                     allow_duplicates or agg == 'raw',
                     unlimited_data=unlimited_data,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    range_cols = [x_col] + y_cols + z_cols
    ret_data = dict(data={str('all'): data_f.format_lists(data)},
                    min={
                        col: fmt(data[col].min(), None)
                        for _, col, fmt in range_f.fmts
                        if col in range_cols
                    },
                    max={
                        col: fmt(data[col].max(), None)
                        for _, col, fmt in range_f.fmts
                        if col in range_cols
                    })
    return ret_data, code