Example #1
0
File: describe.py — Project: unachka/dtale
def load_describe(column_series, additional_aggs=None):
    """
    Helper function for grabbing the output from :meth:`pandas:pandas.Series.describe` in a JSON serializable format

    :param column_series: data to describe
    :type column_series: :class:`pandas:pandas.Series`
    :param additional_aggs: names of extra aggregations to compute on top of describe()
                            (e.g. "mode", "sum", "median", "var", "sem", "skew", "kurt")
    :type additional_aggs: list of str, optional
    :return: tuple of (JSON serializable dictionary of the output from calling
             :meth:`pandas:pandas.Series.describe`, list of code snippets reproducing the calculation)
    """
    desc = column_series.describe().to_frame().T
    code = [
        "# main statistics",
        "stats = df['{col}'].describe().to_frame().T".format(col=column_series.name),
    ]
    if additional_aggs:
        for agg in additional_aggs:
            if agg == "mode":
                mode = column_series.mode().values
                # a mode is only reported when it is unique; Series.mode() returns an
                # empty array for an all-NaN series, which previously raised IndexError
                desc["mode"] = mode[0] if len(mode) == 1 else np.nan
                code.append(
                    (
                        "# mode\n"
                        "mode = df['{col}'].mode().values\n"
                        "stats['mode'] = mode[0] if len(mode) == 1 else np.nan"
                    ).format(col=column_series.name)
                )
                continue
            # any other aggregation is assumed to be a Series method (sum, median, ...)
            desc[agg] = getattr(column_series, agg)()
            code.append(
                "# {agg}\nstats['{agg}'] = df['{col}'].{agg}()".format(
                    col=column_series.name, agg=agg
                )
            )
    # ints rendered as strings, floats rounded to 4 decimals for display
    desc_f_overrides = {
        "I": lambda f, i, c: f.add_int(i, c, as_string=True),
        "F": lambda f, i, c: f.add_float(i, c, precision=4, as_string=True),
    }
    desc_f = grid_formatter(
        grid_columns(desc), nan_display="nan", overrides=desc_f_overrides
    )
    desc = desc_f.format_dict(next(desc.itertuples(), None))
    if "count" in desc:
        # pandas always returns 'count' as a float and it adds useless decimal points
        desc["count"] = desc["count"].split(".")[0]
    desc["total_count"] = json_int(len(column_series), as_string=True)
    missing_ct = column_series.isnull().sum()
    desc["missing_pct"] = json_float((missing_ct / len(column_series) * 100).round(2))
    desc["missing_ct"] = json_int(missing_ct, as_string=True)
    return desc, code
Example #2
0
def describe(column):
    """
    Flask route which returns standard details about column data using :meth:`pandas:pandas.DataFrame.describe` to
    the front-end as JSON

    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }

    """
    try:
        df = DATA[get_port()]
        series = df[column]
        resp = {'describe': load_describe(series), 'success': True}
        unique_values = series.unique()
        if 'unique' not in resp['describe']:
            resp['describe']['unique'] = json_int(len(unique_values),
                                                  as_string=True)
        # only ship the unique values themselves when there are few enough to display
        if len(unique_values) <= 100:
            formatter = find_dtype_formatter(get_dtypes(df)[column])
            resp['uniques'] = [
                formatter(u, nan_display='N/A') for u in unique_values
            ]
        return jsonify(resp)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #3
0
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }

    """
    try:
        df = DATA[data_id][[column]]
        # look up the stored dtype for this column to decide on numeric aggregations
        col_dtype = None
        for dtype_info in DTYPES[data_id]:
            if dtype_info['name'] == column:
                col_dtype = dtype_info['dtype']
                break
        extra_aggs = None
        if classify_type(col_dtype) in ['I', 'F']:
            extra_aggs = [
                'sum', 'median', 'mode', 'var', 'sem', 'skew', 'kurt'
            ]
        resp = {
            'describe': load_describe(df[column], additional_aggs=extra_aggs),
            'success': True,
        }
        unique_values = df[column].unique()
        if 'unique' not in resp['describe']:
            resp['describe']['unique'] = json_int(len(unique_values),
                                                  as_string=True)
        formatter = find_dtype_formatter(get_dtypes(df)[column])
        if len(unique_values) <= 100:
            resp['uniques'] = dict(data=[formatter(u) for u in unique_values],
                                   top=False)
        else:  # get top 100 most common values
            top_values = df[column].value_counts().sort_values(
                ascending=False).head(100).index.values
            resp['uniques'] = dict(data=[formatter(u) for u in top_values],
                                   top=True)

        return jsonify(resp)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #4
0
File: utils.py — Project: reza1615/dtale
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of str, optional
    :param group_val: specific group values to filter the chart data to
    :type group_val: list, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, return the prepared dataframe instead of the serialized chart payload
    :type return_raw: bool, optional
    :param unlimited_data: flag to bypass the data-point limit passed to check_exceptions
    :type unlimited_data: bool, optional
    :param animate_by: column whose values split the data into animation frames
    :type animate_by: str, optional
    :param kwargs: optional keyword 'z' holds the column to be used as z-axis of chart
    :return: tuple of (dict of chart data, list of code snippets reproducing the calculation), or the raw
             dataframe when return_raw is True
    """
    # render integer group labels without thousands separators
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    x_col = str("x")
    # no x-axis specified -> synthesize one from the row position
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # for 3-dimensional (z-axis) charts the y-values also participate in sorting
    sort_cols = y_cols if len(z_cols) else []
    if group_col is not None and len(group_col):
        # --- grouped branch: one output series per group value ---
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        # refuse to build charts with more series than can be read on screen
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            # caller wants the prepared dataframe, not the serialized payload
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        # min/max across all series are used for axis scaling on the front-end
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yield (group filter string, formatted series data) for each group
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    # one filter clause per grouping column
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            # one frame of series data per animate_by value
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            # the last frame doubles as the initial chart state
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # --- ungrouped branch: a single series keyed 'all' ---
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]

    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col,
                                        animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # duplicate x-values (per frame/group) make most chart types unreadable, so
    # reject them unless explicitly allowed; also enforce the data-point cap
    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        # the last frame doubles as the initial chart state
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
Example #5
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_type=None,
                     group_val=None,
                     bins_val=None,
                     bin_type=None,
                     agg=None,
                     extended_aggregation=[],
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     cleaners=[],
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of str, optional
    :param group_type: when "bins", integer grouping columns are binned instead of used as-is
    :type group_type: str, optional
    :param group_val: specific group values to filter the chart data to
    :type group_val: list, optional
    :param bins_val: number of bins to split float/integer grouping columns into
    :type bins_val: int, optional
    :param bin_type: binning strategy ("width" or frequency-based) for grouping columns
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, return the prepared dataframe instead of the serialized chart payload
    :type return_raw: bool, optional
    :param unlimited_data: flag to bypass the data-point limit passed to check_exceptions
    :type unlimited_data: bool, optional
    :param animate_by: column whose values split the data into animation frames
    :type animate_by: str, optional
    :param cleaners: names of string-cleaning operations applied to every string column
    :type cleaners: list of str, optional
    :param kwargs: optional keyword 'z' holds the column to be used as z-axis of chart
    :return: tuple of (dict of chart data, list of code snippets reproducing the calculation), or the raw
             dataframe when return_raw is True
    """
    # NOTE(review): the mutable defaults (extended_aggregation=[], cleaners=[]) are
    # only read, never mutated, within this function, so the shared-default pitfall
    # does not bite here — still worth normalizing in a future behavioral change.
    # render integer group labels without thousands separators
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    cleaners = cleaners or []
    if len(cleaners):
        # apply the requested string cleaners to every string ("S") column
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))

    x_col = str("x")
    # no x-axis specified -> synthesize one from the row position
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)

    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # for 3-dimensional (z-axis) charts the y-values also participate in sorting/duping
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols
    if group_col is not None and len(group_col):
        # --- grouped branch: one output series per group value ---
        # float columns (and int columns when group_type == "bins") are bucketed
        # into bins_val bins so they form a manageable number of groups
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            if classifier == "F" or (classifier == "I"
                                     and group_type == "bins"):
                # NOTE(review): "width" maps to pd.qcut, which is quantile-based
                # (equal-frequency), while the else-branch hand-builds equal-frequency
                # bins via np.interp + pd.cut; the "width" naming looks inverted
                # relative to pandas cut/qcut semantics — confirm against the UI labels.
                if bin_type == "width":
                    data.loc[:, col] = pd.qcut(data[col],
                                               q=bins_val,
                                               duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    # interpolate bin edges so each bin holds roughly npt/bins_val points
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(data[col],
                                              bins=equal_freq_bins,
                                              duplicates="drop").astype("str")
                    code.append((
                        "bins_data = data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col,
                                                     bins=bins_val + 1))

        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")

        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code

        # refuse to build charts with more series than can be read on screen
        # (MAX_GROUPS is a module-level constant in this version)
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            # caller wants the prepared dataframe, not the serialized payload
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        # min/max across all series are used for axis scaling on the front-end
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yield (group label, formatted series data) for each group; the
            # pandas-query filter string rides along under the '_filter_' key
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    # one (filter clause, label) pair per grouping column
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                # 'data' is rebound locally here; the outer dataframe is untouched
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            # one frame of series data per animate_by value
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            # the last frame doubles as the initial chart state
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code

    # --- ungrouped branch: a single series keyed 'all' ---
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    data = data[main_group + final_cols]

    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")

    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()

    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # duplicate x-values make most chart types unreadable, so reject them unless
    # explicitly allowed; the point cap comes from the user's chart settings
    dupe_cols = main_group + y_group_cols
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)

    # min/max across all series are used for axis scaling on the front-end
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        # the last frame doubles as the initial chart state
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
Example #6
0
File: utils.py — Project: shalevy1/dtale
def build_chart(raw_data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of str, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param kwargs: optional keyword 'z' holds the column to be used as z-axis of chart
    :return: tuple of (dict of chart data, list of code snippets reproducing the calculation)
    """

    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'), group_col)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = []
    if z_col is not None:
        z_cols = [z_col]
    if group_col is not None and len(group_col):
        # --- grouped branch: one output series per group value ---
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})")
        if agg is not None:
            # aggregate y-values per (group, x) pair using the requested function
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append("chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()".format(
                cols="', '".join(group_col + [x]), agg=agg
            ))
        # refuse to build charts with more series than can be read on screen
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)

        data = data.dropna()
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        # min/max across all series are used for axis scaling on the front-end
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )

        dtypes = get_dtypes(data)
        # render integer group labels without thousands separators
        group_fmt_overrides = {'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')}
        group_fmts = {c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # composite label: formatted values of each grouping column joined by '/'
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True) for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        ret_data['dtypes'] = {c: classify_type(dtype) for c, dtype in dtypes.items()}
        return ret_data, code
    # --- ungrouped branch: a single series keyed 'all' ---
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    # positional rename: relies on retrieve_chart_data returning columns in x, y..., z order
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    code.append("chart_data = chart_data.dropna()")

    # duplicate x-values make most chart types unreadable, so reject them unless
    # explicitly allowed ('x' here is the renamed x_col)
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}), allow_duplicates,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols}
    )
    return ret_data, code
Example #7
0
def find_coverage():
    """
    Flask route which returns coverage information(counts) for a column grouped by other column(s)

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param filters(deprecated): JSON string from flask.request.args['filters'] with filtering information from group
           drilldown [
        {name: col1, prevFreq: Y, freq: Q, date: YYYY-MM-DD},
        ...
        {name: col1, prevFreq: D, freq: W, date: YYYY-MM-DD},
    ]
    :param group: JSON string from flask.request.args['group'] containing grouping logic in this structure [
        {name: col1} or {name: date_col1, freq: [D,W,M,Q,Y]}
    ]
    :returns: JSON {
        data: {
            [col]: [count1,count2,...,countN],
            labels: [{group_col1: gc1_v1, group_col2: gc2_v1},...,{group_col1: gc1_vN, group_col2: gc2_vN}],
            success: True
    } or {error: 'Exception message', traceback: 'Exception stacktrace', success: False}
    """
    def filter_data(df, req, groups, query=None):
        # Apply the (deprecated) 'filters' drilldown arg on top of the base query.
        # Returns (filtered df, possibly-updated groups, the range query string used).
        filters = get_str_arg(req, 'filters')
        if not filters:
            # 'index == index' is a no-op query used when no base query was supplied
            return df.query(query or 'index == index'), groups, ''
        filters = json.loads(filters)
        # only the last (most recent) drilldown entry is applied
        col, prev_freq, freq, end = map(filters[-1].get,
                                        ['name', 'prevFreq', 'freq', 'date'])
        # derive the start of the slice from the previous frequency's period containing 'end'
        # NOTE(review): assumes DATE_RANGES maps freq codes to callables returning a Timestamp — defined elsewhere
        start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
        range_query = "{col} >= '{start}' and {col} <= '{end}'".format(
            col=col, start=start, end=end)
        logger.info('filtered coverage data to slice: {}'.format(range_query))
        # the drilled-down column switches to the finer frequency; other groups pass through
        updated_groups = [
            dict(name=col, freq=freq) if g['name'] == col else g
            for g in groups
        ]
        return df.query(
            query or
            'index == index').query(range_query), updated_groups, range_query

    try:
        col = get_str_arg(request, 'col')
        groups = get_str_arg(request, 'group')
        if groups:
            groups = json.loads(groups)
        data = DATA[get_port()]
        data, groups, query = filter_data(data,
                                          request,
                                          groups,
                                          query=get_str_arg(request, 'query'))
        # build one grouper per config: date columns are bucketed by period, others used as-is
        grouper = []
        for g_cfg in groups:
            if 'freq' in g_cfg:
                # bucket the date column by the requested frequency, stamped at period end
                freq_grp = data.set_index([g_cfg['name']]).index.to_period(
                    g_cfg['freq']).to_timestamp(how='end')
                freq_grp.name = g_cfg['name']
                grouper.append(freq_grp)
            else:
                grouper.append(data[g_cfg['name']])

        data_groups = data.groupby(grouper)
        group_data = data_groups[col].count()
        if len(groups) > 1:
            # unstack all but the largest level so the widest grouping stays on the index
            unstack_order = enumerate(
                zip(group_data.index.names, group_data.index.levels))
            unstack_order = sorted([(uo[0], uo[1][0], len(uo[1][1]))
                                    for uo in unstack_order],
                                   key=lambda k: k[2])
            for i, n, l in unstack_order[:-1]:
                group_data = group_data.unstack(i)
            group_data = group_data.fillna(0)
            if len(unstack_order[:-1]) > 1:
                # flatten the resulting MultiIndex columns into comma-joined labels
                # NOTE(review): MultiIndex.labels was removed in pandas 0.25 (now .codes) — verify pandas pin
                group_data.columns = [
                    ', '.join([
                        str(group_data.columns.levels[c2[0]][c2[1]])
                        for c2 in enumerate(c)
                    ]) for c in zip(*group_data.columns.labels)
                ]
            else:
                group_data.columns = map(str, group_data.columns.values)

        # refuse to build charts that would have too many points to render
        if len(group_data) > 15000:
            return jsonify(
                dict(error=(
                    'Your grouping created {} groups, chart will not render. '
                    'Try making date columns a higher frequency (W, M, Q, Y)'
                ).format(len(data_groups))))
        if len(groups) == 1:
            data = {col: [json_int(v) for v in group_data.values]}
        else:
            data = dict([(c, [json_int(v) for v in group_data[c].values])
                         for c in group_data.columns])
        # format the group index values as JSON-serializable label dicts
        labels = pd.DataFrame(group_data.index.values,
                              columns=group_data.index.names)
        labels_f_overrides = {
            'D': lambda f, i, c: f.add_date(i, c, fmt='%Y-%m-%d'),
        }
        labels_f = grid_formatter(grid_columns(labels),
                                  overrides=labels_f_overrides)
        labels = labels_f.format_dicts(labels.itertuples())
        return jsonify(data=data, labels=labels, success=True)
    except BaseException as e:
        # route-level catch-all so the client always gets a JSON error payload
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #8
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column(s) to be used as y-axis of chart
    :type y: str or list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_val: specific group values to filter the chart data down to
    :type group_val: list of dicts, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: if True, return the prepared dataframe instead of the JSON-ready payload
    :type return_raw: bool, optional
    :param unlimited_data: if True, skip the row-count limit in check_exceptions
    :type unlimited_data: bool, optional
    :return: tuple of (dict of chart data, list of pandas code strings that built it), or the raw dataframe when
             return_raw is set
    """

    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get('z'),
                                     group_col,
                                     group_val=group_val)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = make_list(z_col)
    if group_col is not None and len(group_col):
        # grouped chart: sort by group then x so each series comes out in x-order
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None and agg != 'raw':
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append(
                "chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()"
                .format(cols="', '".join(group_col + [x]), agg=agg))
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many series would make the chart unreadable; abort and report
            # the available groups (delivered via ChartBuildingError's details arg)
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                'Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. '
                'You can choose specific groups to display from the "Group(s)" dropdown above. The available group(s) '
                'are listed below:').format(', '.join(group_col), MAX_GROUPS)
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
        )

        dtypes = get_dtypes(data)
        # ints formatted with fmt='{}' (no thousands separators) so the values
        # can be embedded in dataframe query strings
        group_fmt_overrides = {
            'I':
            lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
        }
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }
        for group_val, grp in data.groupby(group_col):

            def _group_filter():
                # one query clause per grouping column for this group's values
                for gv, gc in zip(make_list(group_val), group_col):
                    classifier = classify_type(dtypes[gc])
                    yield group_filter_handler(
                        gc, group_fmts[gc](gv, as_string=True), classifier)

            group_filter = ' and '.join(list(_group_filter()))
            ret_data['data'][group_filter] = data_f.format_lists(grp)
        return ret_data, code
    # un-grouped chart: one 'all' series
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(
        cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # duplicate (x, y) points are ambiguous unless explicitly allowed or agg == 'raw'
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}),
                     allow_duplicates or agg == 'raw',
                     unlimited_data=unlimited_data,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(data={str('all'): data_f.format_lists(data)},
                    min={
                        col: fmt(data[col].min(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    },
                    max={
                        col: fmt(data[col].max(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    })
    return ret_data, code