Exemplo n.º 1
0
def generate_html_detail_text(feature_dict: dict, compare_dict: dict,
                              dataframe_report):
    """Render the 'feature_detail_text.html' template for a text feature.

    Args:
        feature_dict: Feature data. Reads ``["detail"]["full_count"]`` and
            ``["base_stats"]["num_values"]``; writes the truncated row list
            to ``["detail"]["detail_count"]``.
        compare_dict: Data of the comparison feature, or ``None`` when there
            is no comparison. Only ``["base_stats"]["num_values"]`` is read.
        dataframe_report: Not used by this function.

    Returns:
        Whatever ``template.render`` returns for the prepared data.
    """
    template = jinja2_env.get_template('feature_detail_text.html')
    layout = config["Layout"]
    has_compare = compare_dict is not None

    # Horizontal layout: the text column is pushed right by one extra
    # "pair_spacing" when a comparison column has to be displayed.
    pair_spacing = layout.getint("pair_spacing")
    col_spacing = layout.getint("col_spacing")
    cols = {}
    text_x = pair_spacing
    if has_compare:
        cols["compare"] = text_x
        text_x += pair_spacing
    text_x += col_spacing
    max_width = layout.getint("detail_text_max_width")
    cols["text"] = text_x
    cols["text_width"] = max_width - text_x
    cols["full_text_width"] = max_width

    # Keep only the first rows of the full list for display.
    max_text_rows = config["Detail_Stats"].getint("detail_max_text_rows")
    detail_list = feature_dict["detail"]["full_count"][:max_text_rows]
    feature_dict["detail"]["detail_count"] = detail_list

    # Clip long names to save memory (the browser handles the display).
    # The row dicts are shared with "full_count", so this clips them there too.
    name_limit = config["Summary_Stats"].getint("text_max_string_len")
    for entry in detail_list:
        entry["name"] = entry["name"][:name_limit]

    # When the list is exactly at the row limit, summarize whatever is not
    # covered by the displayed rows in a trailing "others" row.
    if len(detail_list) == max_text_rows:
        total = feature_dict["base_stats"]["num_values"].number
        shown = sum(entry["count"].number for entry in detail_list)
        others_row = {
            "name": OTHERS_GROUPED.strip(),
            "count": NumWithPercent(total - shown, total),
            "target_stats": None,
            "target_stats_compare": None,
        }
        if has_compare:
            compare_total = compare_dict["base_stats"]["num_values"].number
            compare_shown = sum(
                (entry["count_compare"].number
                 if entry.get("count_compare") else 0)
                for entry in detail_list)
            others_row["count_compare"] = NumWithPercent(
                compare_total - compare_shown, compare_total)
        else:
            others_row["count_compare"] = None

        # Only show the row if it actually accounts for some values.
        others_compare = others_row.get("count_compare")
        if others_row["count"].number > 0 or (others_compare and
                                              others_compare.number > 0):
            detail_list.append(others_row)

    return template.render(feature_dict=feature_dict,
                           compare_dict=compare_dict,
                           cols=cols)
def do_detail_numeric(series: pd.Series, counts: dict, counts_compare: dict,
                      updated_dict: dict):
    """Fill ``updated_dict["detail"]`` with top-value lists for a numeric feature.

    Three lists are produced ("frequent_values", "min_values" and
    "max_values"). Each entry is a tuple
    ``(value, NumWithPercent(count, total), comparison)`` where ``comparison``
    is ``None`` when ``counts_compare`` is ``None``.

    Args:
        series: Not used by this function.
        counts: Must provide ``"value_counts_without_nan"``.
        counts_compare: Same structure for the comparison data, or ``None``.
        updated_dict: Reads ``["base_stats"]["num_values"]`` (and the same
            under ``["compare"]`` when comparing); ``["detail"]`` is replaced.
    """
    detail = {}
    updated_dict["detail"] = detail
    total_num = float(updated_dict["base_stats"]["num_values"])
    num_to_show = config["Detail_Stats"].getint("max_num_numeric_top_values")

    for key in ("frequent_values", "min_values", "max_values"):
        detail[key] = []

    value_counts = counts["value_counts_without_nan"]
    most_frequent = pd.DataFrame(value_counts.head(num_to_show))
    smallest = pd.DataFrame(
        value_counts.sort_index(ascending=True).head(num_to_show))
    largest = pd.DataFrame(
        value_counts.sort_index(ascending=False)).head(num_to_show)

    if counts_compare is None:
        this_compare_count = None
    else:
        this_compare_count = counts_compare["value_counts_without_nan"]
        compare_total_num = float(
            updated_dict["compare"]["base_stats"]["num_values"])

    def get_comparison_num(feature_name):
        # No comparison data at all: "None" marks the absence of a value.
        if this_compare_count is None:
            return None
        try:
            found = this_compare_count.get(feature_name)
        except TypeError:
            # Happens when the source dataset has ints only but the
            # comparison has floats. The key is deliberately not coerced,
            # as that could create false matches.
            found = None
        # Comparison data exists but has no matching value: report 0.
        return NumWithPercent(0 if found is None else found,
                              compare_total_num)

    rows = zip(most_frequent.itertuples(), smallest.itertuples(),
               largest.itertuples())
    for frequent, min_value, max_value in rows:
        # Each itertuples() row is (index value, count).
        for key, entry in (("frequent_values", frequent),
                           ("min_values", min_value),
                           ("max_values", max_value)):
            detail[key].append(
                (entry[0], NumWithPercent(entry[1], total_num),
                 get_comparison_num(entry[0])))
Exemplo n.º 3
0
 def get_comparison_num(feature_name):
     """Return the comparison count for ``feature_name`` as a NumWithPercent.

     Uses ``this_compare_count`` and ``compare_total_num`` from the
     enclosing scope.

     Returns:
         ``None`` when there is no comparison data at all; otherwise a
         ``NumWithPercent`` of the matching count, or of 0 when the
         comparison data has no matching value.
     """
     if this_compare_count is None:
         # "None" is the absence of value: nothing to compare against.
         return None
     try:
         matched = this_compare_count.get(feature_name)
     except TypeError:
         # The lookup can fail when the index types differ (e.g. the source
         # dataset has ints only but the comparison has floats). Treat that
         # as "no match" rather than crashing; the key is not coerced
         # because that could create false matches.
         matched = None
     if matched is None:
         # There is a comparison array but no matching value: insert 0.
         matched = 0
     return NumWithPercent(matched, compare_total_num)
Exemplo n.º 4
0
def add_series_base_stats_to_dict(series: pd.Series, counts: dict, updated_dict: dict) -> dict:
    """Populate ``updated_dict["base_stats"]`` with basic counts for ``series``.

    Also resets ``updated_dict["stats"]`` to an empty dict.

    Args:
        series: The data being summarized; only used to count zero values.
        counts: Must provide ``"num_rows_total"``, ``"num_rows_with_data"``
            and ``"distinct_count_without_nan"``.
        updated_dict: Dictionary to fill in place.

    Returns:
        ``updated_dict``, for convenience (it is also modified in place).
    """
    updated_dict["stats"] = dict()
    base_stats = dict()
    updated_dict["base_stats"] = base_stats

    num_total = counts["num_rows_total"]
    non_nan = counts["num_rows_with_data"]
    # Count the entries equal to zero. Summing the *selected values*
    # (series[series == 0].sum()) would always yield 0.
    num_zeros = int((series == 0).sum())

    base_stats["total_rows"] = num_total
    base_stats["num_values"] = NumWithPercent(non_nan, num_total)
    base_stats["num_missing"] = NumWithPercent(num_total - non_nan, num_total)
    base_stats["num_zeroes"] = NumWithPercent(num_zeros, num_total)
    base_stats["num_distinct"] = NumWithPercent(counts["distinct_count_without_nan"], num_total)
    return updated_dict
Exemplo n.º 5
0
def do_detail_numeric(series: pd.Series, counts: dict, counts_compare: dict,
                      updated_dict: dict):
    """Build the "detail" section (top values) for a numeric feature.

    ``updated_dict["detail"]`` is replaced by a dict holding three lists:
    "frequent_values", "min_values" and "max_values". Every entry is a tuple
    ``(value, NumWithPercent(count, total), comparison)``; ``comparison`` is
    ``None`` when ``counts_compare`` is ``None``.

    Args:
        series: Not used by this function.
        counts: Must provide ``"value_counts_without_nan"``.
        counts_compare: Same structure for the comparison data, or ``None``.
        updated_dict: Reads ``["base_stats"]["num_values"]`` (and the same
            under ``["compare"]`` when comparing).
    """
    updated_dict["detail"] = dict()
    detail = updated_dict["detail"]
    total_num = float(updated_dict["base_stats"]["num_values"])
    num_to_show = config["Detail_Stats"].getint("max_num_numeric_top_values")

    detail["frequent_values"] = list()
    detail["min_values"] = list()
    detail["max_values"] = list()

    value_counts = counts["value_counts_without_nan"]
    frequent_values = pd.DataFrame(value_counts.head(num_to_show))
    min_values = pd.DataFrame(
        value_counts.sort_index(ascending=True).head(num_to_show))
    max_values = pd.DataFrame(
        value_counts.sort_index(ascending=False)).head(num_to_show)

    if counts_compare is not None:
        this_compare_count = counts_compare["value_counts_without_nan"]
        compare_total_num = float(
            updated_dict["compare"]["base_stats"]["num_values"])
    else:
        this_compare_count = None
        compare_total_num = None

    # Defined once (it does not depend on the loop variables).
    def get_comparison_num(feature_name):
        if this_compare_count is None:
            # "None" is the absence of value: no comparison data.
            return None
        try:
            this_comparison = this_compare_count.get(feature_name)
        except TypeError:
            # The lookup can fail when the index types differ (e.g. the
            # source dataset has ints only but the comparison has floats).
            # Treat it as "no match"; coercing the key could create false
            # matches.
            this_comparison = None
        if this_comparison is None:
            # There is a comparison array but no matching value: insert 0.
            this_comparison = 0
        return NumWithPercent(this_comparison, compare_total_num)

    for frequent, min_value, max_value in zip(frequent_values.itertuples(),
                                              min_values.itertuples(),
                                              max_values.itertuples()):
        # Each itertuples() row is (index value, count).
        detail["frequent_values"].append(
            (frequent[0], NumWithPercent(frequent[1], total_num),
             get_comparison_num(frequent[0])))
        detail["min_values"].append(
            (min_value[0], NumWithPercent(min_value[1], total_num),
             get_comparison_num(min_value[0])))
        detail["max_values"].append(
            (max_value[0], NumWithPercent(max_value[1], total_num),
             get_comparison_num(max_value[0])))
Exemplo n.º 6
0
    def summarize_dataframe(self, source: pd.DataFrame, name: str, target_dict: dict, skip: List[str]):
        """Write dataframe-level summary values for ``source`` into ``target_dict``.

        Args:
            source: The dataframe to summarize.
            name: Stored as-is under ``"name"``.
            target_dict: Dictionary filled in place with "name", "num_rows",
                "num_columns", "num_skipped_columns", "memory_total",
                "memory_single_row" and "duplicates".
            skip: Column names to count as skipped.
        """
        num_rows = len(source)
        target_dict["name"] = name
        target_dict["num_rows"] = num_rows
        target_dict["num_columns"] = len(source.columns)
        # Number of columns of `source` that are listed in `skip`.
        target_dict["num_skipped_columns"] = sum(1 for column in source.columns if column in skip)

        target_dict["memory_total"] = source.memory_usage(index=True, deep=True).sum()
        if num_rows > 0:
            target_dict["memory_single_row"] = \
                float(target_dict["memory_total"]) / num_rows
        else:
            # An empty dataframe has no rows to average over; avoid dividing by zero.
            target_dict["memory_single_row"] = 0.0

        target_dict["duplicates"] = NumWithPercent(sum(source.duplicated()), num_rows)
 def get_comparison_num(feature_name):
     """Look up ``feature_name`` in the comparison counts.

     Uses ``this_compare_count`` and ``compare_total_num`` from the
     enclosing scope.

     Returns:
         ``None`` when there is no comparison data; otherwise a
         ``NumWithPercent`` built from the matching count, or from 0 when
         no matching value is found.
     """
     if this_compare_count is None:
         return None

     found = None
     try:
         found = this_compare_count.get(feature_name)
     except TypeError:
         # Workaround for cases where the source dataset has ints only but
         # the comparison has floats. Converting the key to an integer was
         # tried before and dropped: it could create false matches.
         pass

     # A comparison array without a matching value yields 0
     # ("None" is reserved for the absence of comparison data).
     count = 0 if found is None else found
     return NumWithPercent(count, compare_total_num)
def do_detail_text(to_process: FeatureToProcess, updated_dict: dict):
    """Build the per-value count rows for a text feature.

    ``updated_dict["detail"]`` is replaced by a dict whose ``"full_count"``
    list holds one row per entry of
    ``to_process.source_counts["value_counts_without_nan"]``. Each row has
    the keys "name" (HTML-escaped string form of the value), "count",
    "count_compare", "target_stats" and "target_stats_compare"; the last two
    are always ``None`` here.

    Args:
        to_process: Provides ``source_counts`` and ``compare_counts``
            (``None`` when there is no comparison).
        updated_dict: Reads ``["base_stats"]["num_values"]`` (and the same
            under ``["compare"]`` when comparing).
    """
    detail = dict()
    updated_dict["detail"] = detail

    # Compute COUNT stats (i.e. below graph)
    # ----------------------------------------------------------------------------------------------
    detail["full_count"] = []

    num_values = updated_dict["base_stats"]["num_values"].number
    compare_value_counts = None
    num_values_compare = None
    if to_process.compare_counts is not None:
        compare_value_counts = to_process.compare_counts[
            "value_counts_without_nan"]
        num_values_compare = updated_dict["compare"]["base_stats"][
            "num_values"].number

    # Iterate through ALL VALUES and get stats.
    # Series.items() is used because Series.iteritems() no longer exists
    # in recent pandas versions.
    source_value_counts = to_process.source_counts["value_counts_without_nan"]
    for value, count in source_value_counts.items():
        row = {
            "name": html.escape(str(value)),
            "count": NumWithPercent(count, num_values),
            # Defaults to no comparison or target
            "count_compare": None,
            "target_stats": None,
            "target_stats_compare": None,
        }
        # Match on the raw value, not on the escaped display name: escaping
        # rewrites characters such as "&" or "<", so an escaped name would
        # never be found in the comparison index.
        if (compare_value_counts is not None
                and value in compare_value_counts.index):
            row["count_compare"] = NumWithPercent(compare_value_counts[value],
                                                  num_values_compare)

        detail["full_count"].append(row)
Exemplo n.º 9
0
def _compute_target_stats(target_values, target_type):
    """Summarize ``target_values`` according to the target feature type.

    Returns a ``NumWithPercent`` of (sum, count) for a boolean target, of
    (mean, 1.0) for a numeric target, and ``None`` for any other type.
    """
    if target_type == FeatureType.TYPE_BOOL:
        count_values = float(target_values.count())
        count_true = target_values.sum()
        return NumWithPercent(count_true, count_values)
    if target_type == FeatureType.TYPE_NUM:
        return NumWithPercent(target_values.mean(), 1.0)
    return None


def _select_category_target(values, target, category, category_counts):
    """Return the entries of ``target`` whose row in ``values`` is ``category``.

    For the grouped "others" category, selects the rows whose value is not
    one of the keys of ``category_counts``.
    """
    if category == OTHERS_GROUPED:
        return target[~values.isin(category_counts.keys())]
    return target[values == category]


def do_detail_categorical(to_process: FeatureToProcess, updated_dict: dict):
    """Build the per-category detail rows for a categorical feature.

    ``updated_dict["detail"]`` is replaced by a dict with:
      * ``"full_count"``: one row per category returned by
        ``utils.get_clamped_value_counts``, followed by a final "ALL" row
        (``"is_total"`` is ``True`` only on that row);
      * ``"max_range"``: the largest numeric-target mean seen across the
        category rows (source and comparison), starting from 0.

    Each row has the keys "name", "count", "count_compare", "target_stats",
    "target_stats_compare" and "is_total".

    Args:
        to_process: Provides ``source``, ``source_counts``, ``source_target``,
            ``compare``, ``compare_counts``, ``compare_target`` and
            ``predetermined_type_target``.
        updated_dict: Reads ``["base_stats"]["num_values"]`` (and the same
            under ``["compare"]`` when comparing).
    """
    detail = dict()
    updated_dict["detail"] = detail

    # Compute COUNT stats (i.e. below graph)
    # ----------------------------------------------------------------------------------------------
    detail["full_count"] = []

    # To get percentages
    num_values = updated_dict["base_stats"]["num_values"].number
    has_compare = to_process.compare_counts is not None
    if has_compare:
        num_values_compare = updated_dict["compare"]["base_stats"][
            "num_values"].number

    category_counts = utils.get_clamped_value_counts(
        to_process.source_counts["value_counts_without_nan"],
        config["Graphs"].getint("detail_graph_max_categories"))

    # Iterate through ALL VALUES and get stats.
    # Series.items() is used because Series.iteritems() no longer exists
    # in recent pandas versions.
    # NOTE(review): despite its name, max_abs_value tracks the plain maximum
    # (no abs() is applied) -- confirm this is what the display expects.
    max_abs_value = 0
    for name, count in category_counts.items():
        row = {
            "name": name,
            "count": NumWithPercent(count, num_values),
            # Defaults to no comparison or target
            "count_compare": None,
            "target_stats": None,
            "target_stats_compare": None,
            "is_total": None,
        }

        if to_process.source_target is not None:
            # HAS TARGET
            # TODO: OPTIMIZE: CACHE FROM GRAPH?
            target_type = to_process.predetermined_type_target
            selected = _select_category_target(
                to_process.source, to_process.source_target, name,
                category_counts)
            row["target_stats"] = _compute_target_stats(selected, target_type)
            if target_type == FeatureType.TYPE_NUM:
                max_abs_value = max(max_abs_value, row["target_stats"].number)

        if has_compare:
            # HAS COMPARE...
            compare_value_counts = to_process.compare_counts[
                "value_counts_without_nan"]
            if name in compare_value_counts.index:
                # ...and value exists in COMPARE
                row["count_compare"] = NumWithPercent(
                    compare_value_counts[name], num_values_compare)

                if to_process.compare_target is not None:
                    # HAS COMPARE-TARGET
                    # TODO: OPTIMIZE: CACHE FROM GRAPH?
                    target_type = to_process.predetermined_type_target
                    selected = _select_category_target(
                        to_process.compare, to_process.compare_target, name,
                        category_counts)
                    row["target_stats_compare"] = _compute_target_stats(
                        selected, target_type)
                    if target_type == FeatureType.TYPE_NUM:
                        max_abs_value = max(
                            max_abs_value,
                            row["target_stats_compare"].number)

        detail["full_count"].append(row)
    detail["max_range"] = max_abs_value

    # "ALL" row
    # -----------------------------------------------
    row = {
        "name": "ALL",
        "count": NumWithPercent(num_values, num_values),
        # Defaults to no comparison or target
        "count_compare": None,
        "target_stats": None,
        "target_stats_compare": None,
        "is_total": True,
    }

    if to_process.source_target is not None:
        # HAS TARGET
        # TODO: OPTIMIZE: CACHE FROM GRAPH?
        row["target_stats"] = _compute_target_stats(
            to_process.source_target, to_process.predetermined_type_target)

    if has_compare:
        row["count_compare"] = NumWithPercent(num_values_compare,
                                              num_values_compare)
        if to_process.compare_target is not None:
            # HAS COMPARE-TARGET
            # TODO: OPTIMIZE: CACHE FROM GRAPH?
            row["target_stats_compare"] = _compute_target_stats(
                to_process.compare_target,
                to_process.predetermined_type_target)
    detail["full_count"].append(row)
    return