Пример #1
0
def render_real(summary):
    """Build the report components for a real-valued variable.

    Args:
        summary: Mapping with the statistics of one variable. The keys read
            here are ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            the count/percentage fields, the percentile fields (``5%`` ...
            ``95%``), the descriptive statistics, ``histogram_data`` and,
            optionally, ``histogram_bins_bayesian_blocks``.

    Returns:
        The mapping returned by ``render_common(summary)`` with two extra
        entries: ``"top"`` (grid container) and ``"bottom"`` (tabs container).
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def warned(field):
        # True when ``field`` is listed among the variable's warning fields.
        return field in summary["warn_fields"]

    # Variables without negative values get the "non-negative reals" label.
    if summary["min"] >= 0:
        type_label = "Real number (&Ropf;<sub>&ge;0</sub>)"
    else:
        type_label = "Real number (&Ropf;)"

    # Top
    info = VariableInfo(varid, summary["varname"], type_label,
                        summary["warnings"])

    # (label, summary key, formatter name); each row can raise an alert.
    count_rows = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Infinite", "n_infinite", "fmt"),
        ("Infinite (%)", "p_infinite", "fmt_percent"),
    )
    table1 = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": warned(key),
        }
        for label, key, formatter in count_rows
    ])

    # (label, summary key, formatter name, may this row raise an alert?)
    overview_rows = (
        ("Mean", "mean", "fmt", False),
        ("Minimum", "min", "fmt", False),
        ("Maximum", "max", "fmt", False),
        ("Zeros", "n_zeros", "fmt", True),
        ("Zeros (%)", "p_zeros", "fmt_percent", True),
        ("Memory size", "memory_size", "fmt_bytesize", False),
    )
    table2 = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": can_warn and warned(key),
        }
        for label, key, formatter, can_warn in overview_rows
    ])

    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container([info, table1, table2, mini_histo],
                                          sequence_type="grid")

    # Bottom: statistics tables
    quantile_rows = (
        ("Minimum", "min"),
        ("5-th percentile", "5%"),
        ("Q1", "25%"),
        ("median", "50%"),
        ("Q3", "75%"),
        ("95-th percentile", "95%"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range (IQR)", "iqr"),
    )
    quantile_statistics = Table(
        [
            {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
            for label, key in quantile_rows
        ],
        name="Quantile statistics",
    )

    descriptive_rows = (
        ("Standard deviation", "std"),
        ("Coefficient of variation (CV)", "cv"),
        ("Kurtosis", "kurtosis"),
        ("Mean", "mean"),
        ("Median Absolute Deviation (MAD)", "mad"),
        ("Skewness", "skewness"),
        ("Sum", "sum"),
        ("Variance", "variance"),
    )
    descriptive_items = []
    for label, key in descriptive_rows:
        row = {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
        if key == "skewness":
            skewed = warned("skewness")
            # Legacy marker, kept so existing consumers keep working.
            row["class"] = "alert" if skewed else ""
            # Every other row in this module flags warnings through the
            # boolean "alert" key, so expose the skewness warning that way too.
            row["alert"] = skewed
        descriptive_items.append(row)

    descriptive_statistics = Table(descriptive_items,
                                   name="Descriptive statistics")

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram(s)
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, histogram_bins),
            image_format=image_format,
            alt="Histogram",
            caption=
            f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-width histogram is only added when its bins were computed.
    if "histogram_bins_bayesian_blocks" in summary:
        bayesian_bins = summary["histogram_bins_bayesian_blocks"]
        histo_dyn = Image(
            histogram(summary["histogram_data"], summary, bayesian_bins),
            image_format=image_format,
            alt="Histogram",
            caption=
            '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
            .format(fmt_array(bayesian_bins, threshold=5)),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )

        seqs.append(histo_dyn)

    template_variables["bottom"] = Container(
        [
            statistics,
            Container(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
Пример #2
0
def render_count(config: Settings, summary: dict) -> dict:
    """Build the report components for a count variable.

    Every anchor id is prefixed with the variable id, as the other renderers
    in this module do, so that several count variables in one report do not
    produce duplicate HTML ids.

    Args:
        config: Report settings; ``plot.image_format``, ``report.precision``
            and ``n_extreme_obs`` are read here, and the object is forwarded
            to ``render_common``, ``mini_histogram`` and ``histogram``.
        summary: Statistics of one variable. The keys read here are ``varid``,
            ``varname``, ``alerts``, ``description``, the count/percentage
            fields, ``mean``, ``min``, ``max``, ``memory_size`` and
            ``histogram``.

    Returns:
        The mapping returned by ``render_common(config, summary)`` with two
        extra entries: ``"top"`` (grid container) and ``"bottom"`` (tabs
        container).
    """
    varid = summary["varid"]
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format
    precision = config.report.precision

    def plain_row(label, value):
        # Rows of this variable type never raise an alert.
        return {"name": label, "value": value, "alert": False}

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["alerts"],
        summary["description"],
    )

    table1 = Table([
        plain_row("Distinct", fmt(summary["n_distinct"])),
        plain_row("Distinct (%)", fmt_percent(summary["p_distinct"])),
        plain_row("Missing", fmt(summary["n_missing"])),
        plain_row("Missing (%)", fmt_percent(summary["p_missing"])),
    ])

    table2 = Table([
        plain_row("Mean", fmt_numeric(summary["mean"], precision=precision)),
        plain_row("Minimum", fmt_numeric(summary["min"],
                                         precision=precision)),
        plain_row("Maximum", fmt_numeric(summary["max"],
                                         precision=precision)),
        plain_row("Zeros", fmt(summary["n_zeros"])),
        plain_row("Zeros (%)", fmt_percent(summary["p_zeros"])),
        plain_row("Memory size", fmt_bytesize(summary["memory_size"])),
    ])

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container([info, table1, table2, mini_histo],
                                          sequence_type="grid")

    # Bottom
    seqs = [
        Image(
            histogram(config, *summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            # The second element of summary["histogram"] is used as the list
            # of bin edges, hence "edges - 1" bins.
            caption=
            f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name=f"Minimum {config.n_extreme_obs} values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name=f"Maximum {config.n_extreme_obs} values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(seqs,
                      sequence_type="tabs",
                      name="Histogram(s)",
                      anchor_id=f"{varid}histograms"),
            fq,
            evs,
        ],
        sequence_type="tabs",
        # The bare varid is already the anchor passed to VariableInfo above.
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_categorical(summary):
    """Build the report components for a categorical variable.

    Args:
        summary: Mapping with the statistics of one variable. The keys read
            here are ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``value_counts``, ``count``, the count/percentage fields and
            ``memory_size``. When the ``vars.cat.check_composition`` setting
            is enabled, the length statistics, ``length`` and the
            ``category_alias_values`` / ``script_values`` /
            ``block_alias_values`` entries are read as well.

    Returns:
        The mapping returned by ``render_common(summary)`` with two extra
        entries: ``"top"`` (grid sequence) and ``"bottom"`` (tabs sequence).
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    info = VariableInfo(varid, summary["varname"], "Categorical",
                        summary["warnings"])

    # (label, summary key, formatter name); each row can raise an alert.
    alertable_rows = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    table_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in alertable_rows
    ]
    table_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(table_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        length_table = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Max length", "max_length"),
                    ("Mean length", "mean_length"),
                    ("Min length", "min_length"),
                )
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        histogram_bins = 10

        length = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # The image is a histogram of the value lengths; the previous
            # alt text ("Scatter") described a different kind of plot.
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )

        items.append(
            Sequence(
                [length, length_table],
                anchor_id=f"{varid}tbl",
                name="Length",
                sequence_type="grid",
            ))

        n_freq_table_max = config["n_freq_table_max"].get(int)

        # (summary key, tab title, anchor suffix) for each composition tab.
        composition_tabs = (
            ("category_alias_values", "Categories", "category_long_values"),
            ("script_values", "Scripts", "script_values"),
            ("block_alias_values", "Blocks", "block_alias_values"),
        )
        citems = []
        for key, title, anchor in composition_tabs:
            vc = pd.Series(summary[key]).value_counts()
            citems.append(
                FrequencyTable(
                    freq_table(freqtable=vc,
                               n=vc.sum(),
                               max_number_to_print=n_freq_table_max),
                    name=title,
                    anchor_id=f"{varid}{anchor}",
                ))

        items.append(
            Sequence(
                citems,
                name="Characters",
                sequence_type="tabs",
                anchor_id=f"{varid}characters",
            ))

    template_variables["bottom"] = Sequence(items,
                                            sequence_type="tabs",
                                            anchor_id=f"{varid}bottom")

    return template_variables
Пример #4
0
def render_categorical(summary):
    """Build the report components for a categorical variable.

    Which optional sections are produced is driven by the ``vars.cat``
    settings ``redact``, ``words``, ``characters`` and ``length``, and by
    ``plot.pie.max_unique`` for the pie chart.

    Args:
        summary: Mapping with the statistics of one variable (``varid``,
            ``varname``, ``warnings``, ``description``, ``warn_fields``,
            ``value_counts``, ``count``, count/percentage fields,
            ``memory_size`` and, depending on the settings, ``first_rows``
            and ``word_counts``).

    Returns:
        The mapping returned by ``render_common(summary)`` with two extra
        entries: ``"top"`` (grid container) and ``"bottom"`` (tabs container).
    """
    varid = summary["varid"]
    cat_settings = config["vars"]["cat"]
    n_obs_cat = cat_settings["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)
    redact = cat_settings["redact"].get(bool)
    show_words = cat_settings["words"].get(bool)
    show_characters = cat_settings["characters"].get(bool)
    show_length = cat_settings["length"].get(bool)

    template_variables = render_common(summary)

    # --- Top -----------------------------------------------------------------
    info = VariableInfo(
        varid,
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    # (label, summary key, formatter name) for the rows that can be alerted.
    alertable = (
        ("Distinct", "n_distinct", "fmt"),
        ("Distinct (%)", "p_distinct", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    stat_rows = []
    for label, key, formatter in alertable:
        stat_rows.append(
            {
                "name": label,
                "value": summary[key],
                "fmt": formatter,
                "alert": key in summary["warn_fields"],
            }
        )
    stat_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    table = Table(stat_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=redact)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # --- Bottom --------------------------------------------------------------
    frequency_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=redact,
    )

    # Only the first element of the returned pair is used below.
    unique_stats, _ = render_categorical_frequency(summary, varid, image_format)

    overview_items = []

    if show_length:
        length_table, length_histo = render_categorical_length(
            summary, varid, image_format
        )
        overview_items.append(length_table)

    if show_characters:
        overview_table_char, unitab = render_categorical_unicode(summary, varid, redact)
        overview_items.append(overview_table_char)

    overview_items.append(unique_stats)

    # Raw sample values are left out of redacted reports.
    if not redact:
        row_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = []
        for label, value in zip(row_labels, summary["first_rows"]):
            sample_rows.append(
                {"name": label, "value": value, "fmt": "fmt", "alert": False}
            )
        overview_items.append(Table(sample_rows, name="Sample"))

    string_items = [frequency_table]
    if show_length:
        string_items.append(length_histo)

    # A pie chart is added only when enabled (limit > 0) and the number of
    # distinct values does not exceed that limit.
    max_unique = config["plot"]["pie"]["max_unique"].get(int)
    if max_unique > 0 and summary["n_distinct"] <= max_unique:
        pie_image = Image(
            pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"}),
            image_format=image_format,
            alt="Pie chart",
            name="Pie chart",
            anchor_id=f"{varid}pie_chart",
        )
        string_items.append(pie_image)

    overview_tab = Container(
        overview_items,
        name="Overview",
        anchor_id=f"{varid}overview",
        sequence_type="batch_grid",
        batch_size=len(overview_items),
        titles=False,
    )
    categories_tab = Container(
        string_items,
        name="Categories",
        anchor_id=f"{varid}string",
        sequence_type="batch_grid",
        batch_size=len(string_items),
    )
    bottom_items = [overview_tab, categories_tab]

    if show_words:
        word_rows = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        common_words = FrequencyTable(
            word_rows,
            name="Common words",
            anchor_id=f"{varid}cwo",
            redact=redact,
        )
        words_tab = Container(
            [common_words],
            name="Words",
            anchor_id=f"{varid}word",
            sequence_type="grid",
        )
        bottom_items.append(words_tab)

    if show_characters:
        characters_tab = Container(
            [unitab],
            name="Characters",
            anchor_id=f"{varid}characters",
            sequence_type="grid",
        )
        bottom_items.append(characters_tab)

    template_variables["bottom"] = Container(
        bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
Пример #5
0
def render_count(summary):
    """Build the report components for a count variable.

    Every anchor id is prefixed with the variable id, as the other renderers
    in this module do, so that several count variables in one report do not
    produce duplicate HTML ids.

    Args:
        summary: Mapping with the statistics of one variable. The keys read
            here are ``varid``, ``varname``, ``warnings``, the
            count/percentage fields, the quantile and descriptive statistics,
            ``histogram_data``, ``histogram_bins`` and, optionally,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The mapping returned by ``render_common(summary)`` with two extra
        entries: ``"top"`` (grid sequence) and ``"bottom"`` (tabs sequence).
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def static_rows(specs):
        # Rows of this variable type never raise an alert.
        return [
            {
                "name": label,
                "value": summary[key],
                "fmt": formatter,
                "alert": False,
            }
            for label, key, formatter in specs
        ]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
    )

    table1 = Table(static_rows((
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )))

    table2 = Table(static_rows((
        ("Mean", "mean", "fmt"),
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
        ("Zeros", "n_zeros", "fmt"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )))

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary,
                       summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence([info, table1, table2, mini_histo],
                                         sequence_type="grid")

    # TODO: Make sections data structure and render the two groups below as a
    # "Statistics" section. They are prepared here but not rendered yet.
    quantile_statistics = {
        "name": "Quantile statistics",
        "items": static_rows((
            ("Minimum", "min", "fmt_numeric"),
            ("5-th percentile", "quantile_5", "fmt_numeric"),
            ("Q1", "quantile_25", "fmt_numeric"),
            ("median", "quantile_50", "fmt_numeric"),
            ("Q3", "quantile_75", "fmt_numeric"),
            ("95-th percentile", "quantile_95", "fmt_numeric"),
            ("Maximum", "max", "fmt_numeric"),
            ("Range", "range", "fmt_numeric"),
            ("Interquartile range", "iqr", "fmt_numeric"),
        )),
    }

    descriptive_statistics = {
        "name": "Descriptive statistics",
        "items": [
            {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
            for label, key in (
                ("Standard deviation", "std"),
                ("Coefficient of variation", "cv"),
                ("Kurtosis", "kurt"),
                ("Mean", "mean"),
                ("MAD", "mad"),
                ("Skewness", "skew"),
                ("Sum", "sum"),
                ("Variance", "var"),
            )
        ],
    }

    # Bottom
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary,
                      summary["histogram_bins"]),
            image_format=image_format,
            alt="Histogram",
            caption=
            f"<strong>Histogram with fixed size bins</strong> (bins={summary['histogram_bins']})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Sequence(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-width histogram is only added when its bins were computed.
    if "histogram_bins_bayesian_blocks" in summary:
        bayesian_bins = summary["histogram_bins_bayesian_blocks"]
        histo_dyn = Image(
            histogram(summary["histogram_data"], summary, bayesian_bins),
            image_format=image_format,
            alt="Histogram",
            caption=
            '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
            .format(fmt_array(bayesian_bins, threshold=5)),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )

        seqs.append(histo_dyn)

    template_variables["bottom"] = Sequence(
        [
            Sequence(seqs,
                     sequence_type="tabs",
                     name="Histogram(s)",
                     anchor_id=f"{varid}histograms"),
            fq,
            evs,
        ],
        sequence_type="tabs",
        # The bare varid is already the anchor passed to VariableInfo above.
        anchor_id=f"{varid}bottom",
    )

    return template_variables
Пример #6
0
def get_dataset_overview(summary):
    """Build the "Overview" section with dataset-level statistics.

    Args:
        summary: Mapping whose ``"table"`` entry holds the dataset statistics
            (``n_var``, ``n``, missing-cell and duplicate counts/percentages,
            ``memory_size``, ``record_size``) and a ``types`` mapping of
            variable type name to count.

    Returns:
        A grid ``Container`` holding the "Dataset statistics" table and the
        "Variable types" table.
    """
    table_stats = summary["table"]

    # (label, key in summary["table"], formatter name)
    info_spec = (
        ("Number of variables", "n_var", "fmt_numeric"),
        ("Number of observations", "n", "fmt_numeric"),
        ("Missing cells", "n_cells_missing", "fmt_numeric"),
        ("Missing cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
        ("Total size in memory", "memory_size", "fmt_bytesize"),
        ("Average record size in memory", "record_size", "fmt_bytesize"),
    )
    info_rows = []
    for label, key, formatter in info_spec:
        info_rows.append({
            "name": label,
            "value": table_stats[key],
            "fmt": formatter,
        })
    dataset_info = Table(info_rows, name="Dataset statistics")

    # One row per variable type, in the order the mapping yields them.
    type_rows = []
    for type_name, count in table_stats["types"].items():
        type_rows.append({
            "name": type_name,
            "value": count,
            "fmt": "fmt_numeric",
        })
    dataset_types = Table(type_rows, name="Variable types")

    return Container(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
Пример #7
0
def render_boolean(summary):
    """Build the template variables for a boolean variable's report section.

    Starts from ``render_common(summary)`` and fills in two entries:

    * ``"top"``: a grid with the variable header, a statistics table and a
      small frequency table limited to the configured ``vars.bool.n_obs`` rows.
    * ``"bottom"``: a tab container holding the full frequency table taken
      from ``template_variables["freq_table_rows"]``.

    Args:
        summary: Per-variable summary mapping (keys read: ``varid``,
            ``varname``, ``warnings``, ``warn_fields``, ``value_counts``,
            ``n`` and the statistics listed in the table below).

    Returns:
        The template-variables dict with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    max_rows = config["vars"]["bool"]["n_obs"].get(int)

    # Shared variables first, then the rows for the compact frequency table.
    template_variables = render_common(summary)
    compact_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=max_rows,
    )

    header = VariableInfo(
        anchor_id=varid,
        warnings=summary["warnings"],
        var_type="Boolean",
        var_name=summary["varname"],
    )

    # (row label, summary key, formatter name); these rows are flagged when
    # their key is listed in summary["warn_fields"].
    checked_specs = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    statistic_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in checked_specs
    ]
    # Memory size is informational only and is never flagged.
    statistic_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    statistics = Table(statistic_rows)

    compact_table = FrequencyTableSmall(compact_rows)

    template_variables["top"] = Container(
        [header, statistics, compact_table], sequence_type="grid"
    )

    full_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Frequency Table",
        anchor_id=f"{varid}frequency_table",
    )
    template_variables["bottom"] = Container(
        [full_table], sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
Пример #8
0
def render_date(summary):
    """Build the template variables for a date variable's report section.

    Unlike the other renderers visible here, this one starts from an empty
    dict instead of ``render_common`` (see the TODO below).

    Args:
        summary: Per-variable summary mapping (keys read: ``varid``,
            ``varname``, ``warnings``, ``histogram_data``,
            ``histogram_bins`` and the statistics listed in the tables).

    Returns:
        A dict with ``"top"`` (header, two tables and a mini histogram in a
        grid) and ``"bottom"`` (a tab sequence holding the full histogram).
    """
    # TODO: render common?
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    # Top
    header = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Date",
        summary["warnings"],
    )

    # (row label, summary key, formatter name); no row is ever flagged here.
    count_specs = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    counts_table = Table(
        [
            {
                "name": label,
                "value": summary[key],
                "fmt": formatter,
                "alert": False,
            }
            for label, key, formatter in count_specs
        ]
    )

    range_rows = []
    for label, key in (("Minimum", "min"), ("Maximum", "max")):
        range_rows.append(
            {"name": label, "value": summary[key], "fmt": "fmt", "alert": False}
        )
    range_table = Table(range_rows)

    thumbnail = Image(
        mini_histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [header, counts_table, range_table, thumbnail],
        sequence_type="grid",
    )

    # Bottom
    full_histogram = Image(
        histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id="{varid}histogram".format(varid=summary["varid"]),
    )
    template_variables["bottom"] = Sequence(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
Пример #9
0
def render_boolean(config: Settings, summary: dict) -> dict:
    """Build the template variables for a boolean variable's report section.

    Starts from ``render_common(config, summary)`` and sets:

    * ``"top"``: a grid with the variable header, a statistics table and a
      small (never redacted) frequency table.
    * ``"bottom"``: tabs with the "Common Values" frequency table and, when
      ``config.plot.cat_freq.show`` is truthy and ``max_unique`` is positive,
      a category frequency plot.

    Args:
        config: Report settings.
        summary: Per-variable summary mapping.

    Returns:
        The template-variables dict with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    max_rows = config.vars.bool.n_obs
    image_format = config.plot.image_format

    template_variables = render_common(config, summary)

    header = VariableInfo(
        anchor_id=varid,
        alerts=summary["alerts"],
        var_type="Boolean",
        var_name=summary["varname"],
        description=summary["description"],
    )

    # (row label, summary key, formatting function); these rows are flagged
    # when their key is listed in summary["alert_fields"].
    checked_specs = (
        ("Distinct", "n_distinct", fmt),
        ("Distinct (%)", "p_distinct", fmt_percent),
        ("Missing", "n_missing", fmt),
        ("Missing (%)", "p_missing", fmt_percent),
    )
    statistic_rows = [
        {
            "name": label,
            "value": formatter(summary[key]),
            "alert": key in summary["alert_fields"],
        }
        for label, key, formatter in checked_specs
    ]
    statistic_rows.append(
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        }
    )
    statistics = Table(statistic_rows)

    compact_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["n"],
        max_number_to_print=max_rows,
    )
    compact_table = FrequencyTableSmall(compact_rows, redact=False)

    template_variables["top"] = Container(
        [header, statistics, compact_table], sequence_type="grid"
    )

    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}frequency_table",
        redact=False,
    )
    items: List[Renderable] = [common_values]

    plot_settings = config.plot.cat_freq
    plot_enabled = plot_settings.show
    max_unique = plot_settings.max_unique

    if plot_enabled and max_unique > 0:
        frequency_plot = Image(
            cat_frequency_plot(
                config,
                summary["value_counts_without_nan"],
            ),
            image_format=image_format,
            alt="Category Frequency Plot",
            name="Category Frequency Plot",
            anchor_id=f"{varid}cat_frequency_plot",
        )
        items.append(frequency_plot)

    template_variables["bottom"] = Container(
        items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
Пример #10
0
def render_categorical(config: Settings, summary: dict) -> dict:
    """Build the template variables for a categorical variable's section.

    Starts from ``render_common(config, summary)`` and sets:

    * ``"top"``: header, statistics table and a small frequency table.
    * ``"bottom"``: tabs "Overview" and "Categories", plus "Words" and
      "Characters" when the matching ``config.vars.cat`` switches are truthy.

    The optional pieces (length statistics, unicode overview, sample rows,
    category frequency plot) are each gated by a setting read from ``config``.

    Args:
        config: Report settings.
        summary: Per-variable summary mapping.

    Returns:
        The template-variables dict with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    cat_settings = config.vars.cat
    max_rows = cat_settings.n_obs
    image_format = config.plot.image_format
    show_words = cat_settings.words
    show_characters = cat_settings.characters
    show_length = cat_settings.length

    template_variables = render_common(config, summary)

    header = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["alerts"],
        summary["description"],
    )

    # (row label, summary key, formatting function); these rows are flagged
    # when their key is listed in summary["alert_fields"].
    checked_specs = (
        ("Distinct", "n_distinct", fmt),
        ("Distinct (%)", "p_distinct", fmt_percent),
        ("Missing", "n_missing", fmt),
        ("Missing (%)", "p_missing", fmt_percent),
    )
    statistic_rows = [
        {
            "name": label,
            "value": formatter(summary[key]),
            "alert": key in summary["alert_fields"],
        }
        for label, key, formatter in checked_specs
    ]
    statistic_rows.append(
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        }
    )
    statistics = Table(statistic_rows)

    compact_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["count"],
        max_number_to_print=max_rows,
    )
    compact_table = FrequencyTableSmall(
        compact_rows, redact=cat_settings.redact
    )

    template_variables["top"] = Container(
        [header, statistics, compact_table], sequence_type="grid"
    )

    # ------------------------------------------------------------------
    # Bottom section
    # ------------------------------------------------------------------
    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=cat_settings.redact,
    )

    unique_stats = render_categorical_frequency(config, summary, varid)

    # "Overview" tab: optional length and character tables, then the
    # uniqueness statistics, then (unless redacted) a few sample rows.
    overview_items = []

    if show_length:
        length_table, length_histo = render_categorical_length(
            config, summary, varid
        )
        overview_items.append(length_table)

    if show_characters:
        char_overview, unicode_tab = render_categorical_unicode(
            config, summary, varid
        )
        overview_items.append(char_overview)

    overview_items.append(unique_stats)

    if not cat_settings.redact:
        sample_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = []
        for label, value in zip(sample_labels, summary["first_rows"]):
            sample_rows.append(
                {"name": label, "value": fmt(value), "alert": False}
            )
        overview_items.append(Table(sample_rows, name="Sample"))

    # "Categories" tab: frequency table, optional length histogram and
    # optional category frequency plot.
    category_items: List[Renderable] = [common_values]
    if show_length:
        category_items.append(length_histo)

    plot_settings = config.plot.cat_freq
    plot_enabled = plot_settings.show
    max_unique = plot_settings.max_unique

    if plot_enabled and max_unique > 0 and summary["n_distinct"] <= max_unique:
        frequency_plot = Image(
            cat_frequency_plot(
                config,
                summary["value_counts_without_nan"],
            ),
            image_format=image_format,
            alt="Category Frequency Plot",
            name="Category Frequency Plot",
            anchor_id=f"{varid}cat_frequency_plot",
        )
        category_items.append(frequency_plot)

    overview_tab = Container(
        overview_items,
        name="Overview",
        anchor_id=f"{varid}overview",
        sequence_type="batch_grid",
        batch_size=len(overview_items),
        titles=False,
    )
    categories_tab = Container(
        category_items,
        name="Categories",
        anchor_id=f"{varid}string",
        sequence_type="batch_grid",
        batch_size=len(category_items),
    )
    tabs = [overview_tab, categories_tab]

    if show_words:
        word_rows = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        word_table = FrequencyTable(
            word_rows,
            name="Common words",
            anchor_id=f"{varid}cwo",
            redact=cat_settings.redact,
        )
        words_tab = Container(
            [word_table],
            name="Words",
            anchor_id=f"{varid}word",
            sequence_type="grid",
        )
        tabs.append(words_tab)

    if show_characters:
        characters_tab = Container(
            [unicode_tab],
            name="Characters",
            anchor_id=f"{varid}characters",
            sequence_type="grid",
        )
        tabs.append(characters_tab)

    template_variables["bottom"] = Container(
        tabs, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
Пример #11
0
def render_complex(config: Settings, summary: dict) -> dict:
    """Build the template variables for a complex-number variable's section.

    The result starts from an empty dict (``render_common`` is not used
    here) and contains:

    * ``"top"``: header, a counts table, a numeric statistics table and an
      empty ``HTML`` placeholder, laid out as a grid.
    * ``"bottom"``: a tab container with a scatter plot of
      ``summary["scatter_data"]``.

    Args:
        config: Report settings (``plot.image_format`` and
            ``report.precision`` are read).
        summary: Per-variable summary mapping.

    Returns:
        A dict with the ``"top"`` and ``"bottom"`` entries described above.
    """
    varid = summary["varid"]
    template_variables = {}
    image_format = config.plot.image_format

    # Top
    header = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Complex number (&Copf;)",
        summary["alerts"],
        summary["description"],
    )

    # (row label, summary key, formatting function)
    count_specs = (
        ("Distinct", "n_distinct", fmt),
        ("Distinct (%)", "p_distinct", fmt_percent),
        ("Missing", "n_missing", fmt),
        ("Missing (%)", "p_missing", fmt_percent),
        ("Memory size", "memory_size", fmt_bytesize),
    )
    counts_table = Table(
        [
            {"name": label, "value": formatter(summary[key])}
            for label, key, formatter in count_specs
        ]
    )

    # These rows are all rendered with the configured report precision.
    precision_specs = (
        ("Mean", "mean"),
        ("Minimum", "min"),
        ("Maximum", "max"),
        ("Zeros", "n_zeros"),
    )
    numeric_rows = [
        {
            "name": label,
            "value": fmt_numeric(
                summary[key], precision=config.report.precision
            ),
        }
        for label, key in precision_specs
    ]
    numeric_rows.append(
        {"name": "Zeros (%)", "value": fmt_percent(summary["p_zeros"])}
    )
    numeric_table = Table(numeric_rows)

    # Empty element occupying the fourth grid slot.
    filler = HTML("")

    template_variables["top"] = Container(
        [header, counts_table, numeric_table, filler],
        sequence_type="grid",
    )

    # Bottom
    scatter = Image(
        scatter_complex(config, summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    template_variables["bottom"] = Container(
        [scatter], sequence_type="tabs", anchor_id=summary["varid"]
    )

    return template_variables
def render_path(summary):
    """Build the template variables for a path variable's report section.

    Extends the output of ``render_categorical(summary)``:

    * adds one ``"freqtable_<part>"`` entry per path component
      (name, parent, suffix, stem, anchor);
    * relabels the first item of the ``"top"`` container as "Path";
    * appends a "Path" tab (overview table plus one frequency table per
      component) to the ``"bottom"`` container's items.

    Args:
        summary: Per-variable summary mapping.

    Returns:
        The template-variables dict from ``render_categorical``, modified
        in place as described above.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)

    template_variables = render_categorical(summary)

    # One frequency table per path component.
    for part in ("name", "parent", "suffix", "stem", "anchor"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top: overwrite the variable type shown by the first top-level item.
    template_variables["top"].content["items"][0].content["var_type"] = "Path"

    # Bottom: overview table of the path statistics.
    overview_rows = [
        {
            "name": "Common prefix",
            "value": summary["common_prefix"],
            "fmt": "fmt",
            "alert": False,
        }
    ]
    # (row label, summary key) for the numeric uniqueness counts.
    unique_specs = (
        ("Unique stems", "n_stem_unique"),
        ("Unique names", "n_name_unique"),
        ("Unique extensions", "n_suffix_unique"),
        ("Unique directories", "n_parent_unique"),
        ("Unique anchors", "n_anchor_unique"),
    )
    for label, key in unique_specs:
        overview_rows.append(
            {
                "name": label,
                "value": summary[key],
                "fmt": "fmt_numeric",
                "alert": False,
            }
        )
    overview_tab = Container(
        [Table(overview_rows)],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="list",
    )

    # (template_variables key, tab title, anchor suffix)
    frequency_specs = (
        ("freq_table_rows", "Full", "full_frequency"),
        ("freqtable_stem", "Stem", "stem_frequency"),
        ("freqtable_name", "Name", "name_frequency"),
        ("freqtable_suffix", "Extension", "suffix_frequency"),
        ("freqtable_parent", "Parent", "parent_frequency"),
        ("freqtable_anchor", "Anchor", "anchor_frequency"),
    )
    path_items = [overview_tab]
    for rows_key, title, anchor_suffix in frequency_specs:
        path_items.append(
            FrequencyTable(
                template_variables[rows_key],
                name=title,
                anchor_id=f"{varid}{anchor_suffix}",
            )
        )

    path_tab = Container(
        path_items,
        name="Path",
        sequence_type="tabs",
        anchor_id=f"{varid}path",
    )
    template_variables["bottom"].content["items"].append(path_tab)

    return template_variables
def render_categorical(summary):
    """Build the template variables for a categorical variable's section.

    Starts from ``render_common(summary)`` and sets:

    * ``"top"``: header, statistics table and a small frequency table limited
      to the configured ``vars.cat.n_obs`` rows.
    * ``"bottom"``: tabs with the "Common Values" frequency table and, when
      ``vars.cat.check_composition`` is enabled, a "Composition" grid
      (composition and length tables) and a "Length" histogram.

    Args:
        summary: Per-variable summary mapping.

    Returns:
        The template-variables dict with ``"top"`` and ``"bottom"`` set.
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    info = Overview(summary["varid"], summary["varname"], "Categorical",
                    summary["warnings"])

    # Rows whose key is listed in summary["warn_fields"] get the "alert"
    # class; the memory-size row carries no "class" entry at all.
    table = Table([
        {
            "name": "Distinct count",
            "value": summary["n_unique"],
            "fmt": "fmt",
            "class": "alert" if "n_unique" in summary["warn_fields"] else "",
        },
        {
            "name": "Unique (%)",
            "value": summary["p_unique"],
            "fmt": "fmt_percent",
            "class": "alert" if "p_unique" in summary["warn_fields"] else "",
        },
        {
            "name": "Missing",
            "value": summary["n_missing"],
            "fmt": "fmt",
            "class": "alert" if "n_missing" in summary["warn_fields"] else "",
        },
        {
            "name": "Missing (%)",
            "value": summary["p_missing"],
            "fmt": "fmt_percent",
            "class": "alert" if "p_missing" in summary["warn_fields"] else "",
        },
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
        },
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    # Bottom
    items = []
    frequency_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id="{varid}common_values".format(varid=summary["varid"]),
    )

    items.append(frequency_table)

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        composition = Table(
            [
                {
                    "name": "Contains chars",
                    "value": summary["composition"]["chars"],
                    "fmt": "fmt",
                },
                {
                    "name": "Contains digits",
                    "value": summary["composition"]["digits"],
                    "fmt": "fmt",
                },
                {
                    "name": "Contains whitespace",
                    "value": summary["composition"]["spaces"],
                    "fmt": "fmt",
                },
                {
                    "name": "Contains non-words",
                    "value": summary["composition"]["non-words"],
                    "fmt": "fmt",
                },
            ],
            name="Composition",
            anchor_id="{varid}composition".format(varid=summary["varid"]),
        )

        length_table = Table(
            [
                {
                    "name": "Max length",
                    "value": summary["max_length"],
                    "fmt": "fmt_numeric",
                },
                {
                    "name": "Mean length",
                    "value": summary["mean_length"],
                    "fmt": "fmt_numeric",
                },
                {
                    "name": "Min length",
                    "value": summary["min_length"],
                    "fmt": "fmt_numeric",
                },
            ],
            name="Length",
            anchor_id="{varid}lengthstats".format(varid=summary["varid"]),
        )

        tbl = Sequence(
            [composition, length_table],
            anchor_id="{varid}tbl".format(varid=summary["varid"]),
            name="Composition",
            sequence_type="grid",
        )

        items.append(tbl)

        histogram_bins = 10

        # This image is a histogram of summary["length"], so the alt text
        # must say so; it previously read "Scatter" (copy-paste leftover).
        # Kept in its own name instead of rebinding the length table above.
        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            alt="Length histogram",
            name="Length",
            anchor_id="{varid}length".format(varid=summary["varid"]),
        )
        items.append(length_histogram)

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id="{varid}bottom".format(varid=summary["varid"]),
    )

    return template_variables
Пример #14
0
def get_dataset_overview(config: Settings, summary: dict) -> Renderable:
    """Assemble the dataset-level "Overview" section.

    Builds the "Dataset statistics" table from ``summary["table"]`` (the two
    duplicate-row entries are included only when ``"n_duplicates"`` is
    present) and the "Variable types" table from its ``"types"`` mapping.
    All values are formatted here, before being handed to ``Table``.

    Args:
        config: Report settings; ``report.precision`` is used for the
            variable-type counts.
        summary: Mapping with a ``"table"`` entry holding the statistics.

    Returns:
        A ``Container`` named "Overview" (grid layout, anchor
        ``dataset_overview``) wrapping both tables.
    """
    table_stats = summary["table"]

    # (row label, key inside summary["table"], formatting function)
    metric_specs = [
        ("Number of variables", "n_var", fmt_number),
        ("Number of observations", "n", fmt_number),
        ("Missing cells", "n_cells_missing", fmt_number),
        ("Missing cells (%)", "p_cells_missing", fmt_percent),
    ]
    # Duplicate statistics are optional in the summary.
    if "n_duplicates" in table_stats:
        metric_specs += [
            ("Duplicate rows", "n_duplicates", fmt_number),
            ("Duplicate rows (%)", "p_duplicates", fmt_percent),
        ]
    metric_specs += [
        ("Total size in memory", "memory_size", fmt_bytesize),
        ("Average record size in memory", "record_size", fmt_bytesize),
    ]

    table_metrics = [
        {"name": label, "value": formatter(table_stats[key])}
        for label, key, formatter in metric_specs
    ]
    dataset_info = Table(table_metrics, name="Dataset statistics")

    type_rows = []
    for type_name, count in table_stats["types"].items():
        type_rows.append(
            {
                "name": str(type_name),
                "value": fmt_numeric(count, precision=config.report.precision),
            }
        )
    dataset_types = Table(type_rows, name="Variable types")

    return Container(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
Пример #15
0
def render_categorical(summary):
    """Build the template variables for a categorical variable's section.

    Starts from ``render_common(summary)`` and sets:

    * ``"top"``: header, statistics table and a small frequency table.
    * ``"bottom"``: tabs with a "Frequencies" container (common values,
      the output of ``render_categorical_frequency`` and, when the pie
      ``max_unique`` limit allows it, a pie chart), followed by the length
      and unicode sections when their ``vars.cat`` switches are enabled.

    Args:
        summary: Per-variable summary mapping.

    Returns:
        The template-variables dict with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    max_rows = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_common(summary)

    header = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    # (row label, summary key, formatter name); these rows are flagged when
    # their key is listed in summary["warn_fields"].
    checked_specs = (
        ("Distinct", "n_distinct", "fmt"),
        ("Distinct (%)", "p_distinct", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    statistic_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in checked_specs
    ]
    statistic_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    statistics = Table(statistic_rows)

    compact_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=max_rows,
    )
    compact_table = FrequencyTableSmall(compact_rows, redact=redact)

    template_variables["top"] = Container(
        [header, statistics, compact_table], sequence_type="grid"
    )

    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=redact,
    )
    frequency_items = [
        common_values,
        render_categorical_frequency(summary, varid, image_format),
    ]

    # The pie chart is only drawn when the limit is positive and the
    # variable has at most that many distinct values.
    pie_limit = config["plot"]["pie"]["max_unique"].get(int)
    if pie_limit > 0 and summary["n_distinct"] <= pie_limit:
        pie_chart = Image(
            pie_plot(
                summary["value_counts"], legend_kws={"loc": "upper right"}
            ),
            image_format=image_format,
            alt="Chart",
            name="Chart",
            anchor_id=f"{varid}pie_chart",
        )
        frequency_items.append(pie_chart)

    # Bottom
    frequencies_tab = Container(
        frequency_items,
        name="Frequencies",
        anchor_id=f"{varid}frequencies",
        sequence_type="tabs",
    )
    tabs = [frequencies_tab]

    if config["vars"]["cat"]["length"].get(bool):
        tabs.append(render_categorical_length(summary, varid, image_format))

    if config["vars"]["cat"]["unicode"].get(bool):
        tabs.append(render_categorical_unicode(summary, varid, redact))

    template_variables["bottom"] = Container(
        tabs, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
Пример #16
0
def _unicode_frequency_table(counts, limit, name, anchor_id, redact):
    """Wrap ``counts`` in a ``FrequencyTable`` relative to its own total."""
    return FrequencyTable(
        freq_table(
            freqtable=counts,
            n=counts.sum(),
            max_number_to_print=limit,
        ),
        name=name,
        anchor_id=anchor_id,
        redact=redact,
    )


def render_categorical_unicode(summary, varid, redact):
    """Build the "Unicode" tab for a categorical (text) variable.

    The tab holds five sub-containers: an overview table of unicode
    property counts, the most occurring characters, and one group each for
    unicode categories, scripts and blocks. Every group starts with a
    never-redacted table of the group totals, followed by one per-member
    character table that honours ``redact``.

    Args:
        summary: Per-variable summary mapping; the ``*_counts`` entries must
            support ``.sum()`` and the ``*_char_counts`` entries ``.items()``.
        varid: Prefix used for every anchor id generated here.
        redact: Forwarded to the tables that list individual characters.

    Returns:
        A ``Container`` named "Unicode" with tab layout and anchor
        ``f"{varid}unicode"``.
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)

    category_items = [
        _unicode_frequency_table(
            summary["category_alias_counts"],
            n_freq_table_max,
            "Most occurring categories",
            f"{varid}category_long_values",
            False,
        )
    ]
    for raw_name, counts in summary["category_alias_char_counts"].items():
        # Underscores become spaces for display only. The anchor keeps the
        # raw name: reusing the display form would put whitespace in the
        # id, which HTML does not allow.
        display_name = raw_name.replace("_", " ")
        category_items.append(
            _unicode_frequency_table(
                counts,
                n_freq_table_max,
                f"Most frequent {display_name} characters",
                f"{varid}category_alias_values_{raw_name}",
                redact,
            )
        )

    # NOTE(review): script and block names are used verbatim in anchor ids
    # below; if they can contain whitespace they need the same treatment.
    script_items = [
        _unicode_frequency_table(
            summary["script_counts"],
            n_freq_table_max,
            "Most occurring scripts",
            f"{varid}script_values",
            False,
        )
    ]
    for script_name, script_counts in summary["script_char_counts"].items():
        script_items.append(
            _unicode_frequency_table(
                script_counts,
                n_freq_table_max,
                f"Most frequent {script_name} characters",
                f"{varid}script_values_{script_name}",
                redact,
            )
        )

    block_items = [
        _unicode_frequency_table(
            summary["block_alias_counts"],
            n_freq_table_max,
            "Most occurring blocks",
            f"{varid}block_alias_values",
            False,
        )
    ]
    for block_name, block_counts in summary["block_alias_char_counts"].items():
        block_items.append(
            _unicode_frequency_table(
                block_counts,
                n_freq_table_max,
                f"Most frequent {block_name} characters",
                f"{varid}block_alias_values_{block_name}",
                redact,
            )
        )

    overview_table = Table(
        [
            {
                "name": "Unique unicode characters",
                "value": summary["n_characters"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name":
                'Unique unicode categories (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
                "value": summary["n_category"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name":
                'Unique unicode scripts (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
                "value": summary["n_scripts"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name":
                'Unique unicode blocks (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
                "value": summary["n_block_alias"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
        ],
        name="Overview of Unicode Properties",
        caption=
        "The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
    )

    citems = [
        Container(
            [overview_table],
            anchor_id=f"{varid}character_overview",
            name="Overview",
            sequence_type="list",
        ),
        Container(
            [
                _unicode_frequency_table(
                    summary["character_counts"],
                    n_freq_table_max,
                    "Most occurring characters",
                    f"{varid}character_frequency",
                    redact,
                ),
            ],
            name="Characters",
            anchor_id=f"{varid}characters",
            sequence_type="named_list",
        ),
        Container(
            category_items,
            name="Categories",
            anchor_id=f"{varid}categories",
            sequence_type="named_list",
        ),
        Container(
            script_items,
            name="Scripts",
            anchor_id=f"{varid}scripts",
            sequence_type="named_list",
        ),
        Container(
            block_items,
            name="Blocks",
            anchor_id=f"{varid}blocks",
            sequence_type="named_list",
        ),
    ]

    return Container(
        citems,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )
def render_image(summary):
    """Render an image variable: the file rendering plus an "Image" tab.

    Starts from ``render_file(summary)``, relabels the variable type in the
    top element to "Image" and appends an "Image" tab container to the
    bottom items. That tab holds the optional Exif frequency tables (only
    when ``"exif_keys_counts"`` is present in ``summary``) followed by the
    "Dimensions" section.

    Args:
        summary: Mapping with the statistics of the variable. Read here:
            ``varid``, ``n``, ``image_dimensions``, the
            ``{min,median,max}_{width,height,area}`` entries and, when
            available, ``exif_keys_counts`` and ``exif_data``.

    Returns:
        The template variables produced by ``render_file`` with the updated
        ``top`` and ``bottom`` elements.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)

    template_variables = render_file(summary)

    # Top: relabel the variable type shown in the first top item.
    template_variables["top"].content["items"][0].content["var_type"] = "Image"

    # Bottom
    image_items = []

    # Overview grid, one table per measure (all dimension properties are in
    # pixels):
    #   Min width       Min height       Min area
    #   Median width    Median height    Median area
    #   Max width       Max height       Max area
    overview_tables = [
        Table([
            {
                "name": f"{statistic} {measure}",
                "value": summary[f"{statistic.lower()}_{measure}"],
                "fmt": "fmt_numeric",
                "alert": False,
            }
            for statistic in ("Min", "Median", "Max")
        ])
        for measure in ("width", "height", "area")
    ]
    overview = Container(
        overview_tables,
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )

    dimension_counts = FrequencyTable(
        freq_table(
            freqtable=summary["image_dimensions"].value_counts(),
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        ),
        name="Common values",
        anchor_id=f"{varid}image_dimensions_frequency",
    )

    dimension_scatter = Image(
        scatter_series(summary["image_dimensions"]),
        image_format=config["plot"]["image_format"].get(str),
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{varid}image_dimensions_scatter",
    )

    image_shape = Container(
        [overview, dimension_counts, dimension_scatter],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{varid}image_dimensions",
    )

    if "exif_keys_counts" in summary:
        # The first table lists which Exif keys occur; one table per key
        # follows.
        exif_tables = [
            FrequencyTable(
                freq_table(
                    freqtable=pd.Series(summary["exif_keys_counts"]),
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name="Exif keys",
                anchor_id=f"{varid}exif_keys",
            )
        ]
        exif_tables.extend(
            FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name=key,
                anchor_id=f"{varid}_exif_{key}",
            )
            for key, counts in summary["exif_data"].items()
            # The "exif_keys" entry is skipped, as in the key overview above.
            if key != "exif_keys"
        )

        image_items.append(
            Container(
                exif_tables,
                anchor_id=f"{varid}exif_data",
                name="Exif data",
                sequence_type="named_list",
            ))

    image_items.append(image_shape)

    template_variables["bottom"].content["items"].append(
        Container(
            image_items,
            name="Image",
            sequence_type="tabs",
            anchor_id=f"{varid}image",
        ))

    return template_variables
Пример #18
0
def render_count(summary):
    """Render the top and bottom report elements of a count variable.

    The top grid holds the variable info, two statistics tables and a mini
    histogram. The bottom tabs hold the histogram, the common values and
    the extreme values.

    Every anchor id is prefixed with the variable id. The ids end up as
    HTML ids, so a fixed id such as ``"histogram"`` would be emitted once
    per count variable and collide as soon as a report contains two of
    them.

    Args:
        summary: Mapping with the statistics of the variable (``varid``,
            ``varname``, ``warnings``, ``description``, ``histogram`` and
            the numeric statistics read below).

    Returns:
        The template variables from ``render_common`` with ``top`` and
        ``bottom`` set.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            {
                "name": "唯一值计数",
                "value": summary["n_unique"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "唯一值 (%)",
                "value": summary["p_unique"],
                "fmt": "fmt_percent",
                "alert": False,
            },
            {
                "name": "缺失值",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "缺失值比例 (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": False,
            },
        ]
    )

    table2 = Table(
        [
            {
                "name": "均数",
                "value": summary["mean"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "最小值",
                "value": summary["min"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "最大值",
                "value": summary["max"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "零值",
                "value": summary["n_zeros"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "零值 (%)",
                "value": summary["p_zeros"],
                "fmt": "fmt_percent",
                "alert": False,
            },
            {
                "name": "内存占用",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # NOTE: the two statistics sections below are built but not rendered
    # yet; see the TODO further down.
    quantile_statistics = {
        "name": "定性分析",
        "items": [
            {
                "name": "最小值",
                "value": summary["min"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "5-th 百分位",
                "value": summary["quantile_5"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "Q1",
                "value": summary["quantile_25"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "中位数",
                "value": summary["quantile_50"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "Q3",
                "value": summary["quantile_75"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "95-th 百分位",
                "value": summary["quantile_95"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "最大值",
                "value": summary["max"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "区间",
                "value": summary["range"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "四分位距",
                "value": summary["iqr"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
        ],
    }

    descriptive_statistics = {
        "name": "描述性统计",
        "items": [
            {
                "name": "标准差",
                "value": summary["std"],
                "fmt": "fmt_numeric",
            },
            {
                "name": "变异系数",
                "value": summary["cv"],
                "fmt": "fmt_numeric",
            },
            {"name": "峰度", "value": summary["kurt"], "fmt": "fmt_numeric"},
            {"name": "均数", "value": summary["mean"], "fmt": "fmt_numeric"},
            {"name": "MAD", "value": summary["mad"], "fmt": "fmt_numeric"},
            {"name": "偏度", "value": summary["skew"], "fmt": "fmt_numeric"},
            {"name": "积", "value": summary["sum"], "fmt": "fmt_numeric"},
            {"name": "方差", "value": summary["var"], "fmt": "fmt_numeric"},
        ],
    }

    # TODO: Make sections data structure
    # statistics = ItemRenderer(
    #     'statistics',
    #     'Statistics',
    #     'table',
    #     [
    #         quantile_statistics,
    #         descriptive_statistics
    #     ]
    # )

    seqs = [
        Image(
            histogram(*summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="极值",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            # statistics,
            Container(
                seqs,
                sequence_type="tabs",
                name="直方图",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
Пример #19
0
def render_complex(summary):
    """Render the top and bottom report elements of a complex variable.

    The top grid holds the variable info, two statistics tables and an
    empty HTML placeholder. The bottom tabs hold a single scatter plot of
    ``summary["scatter_data"]``.

    Args:
        summary: Mapping with the statistics of the variable.

    Returns:
        A dict with the keys ``"top"`` and ``"bottom"``.
    """
    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    def build_table(spec):
        # spec: (label, summary key, formatter name) triples, in display order.
        return Table([
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in spec
        ])

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (&Copf;)",
        summary["warnings"],
    )

    counts_table = build_table((
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    ))

    values_table = build_table((
        ("Mean", "mean", "fmt"),
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
        ("Zeros", "n_zeros", "fmt"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
    ))

    # Empty element occupying the fourth slot of the top grid.
    placeholder = HTML("")

    top = Container([info, counts_table, values_table, placeholder],
                    sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter],
                       sequence_type="tabs",
                       anchor_id=summary["varid"])

    return {"top": top, "bottom": bottom}
Пример #20
0
def _anchor_fragment(text):
    """Return *text* with each run of whitespace replaced by one underscore.

    HTML ids must not contain whitespace, so display names are normalised
    before they are embedded in an anchor id.
    """
    return "_".join(str(text).split())


def _character_frequency_table(counts, name, anchor_id, max_rows):
    """Build a FrequencyTable with frequencies relative to ``counts.sum()``."""
    return FrequencyTable(
        freq_table(
            freqtable=counts,
            n=counts.sum(),
            max_number_to_print=max_rows,
        ),
        name=name,
        anchor_id=anchor_id,
    )


def render_categorical(summary):
    """Render the top and bottom report elements of a categorical variable.

    The top grid holds the variable info, a statistics table and a small
    frequency table. The bottom tabs always hold the "Common Values"
    table; a "Length" tab and a "Unicode" tab are added when enabled via
    ``config["vars"]["cat"]["length"]`` and
    ``config["vars"]["cat"]["unicode"]`` respectively.

    Args:
        summary: Mapping with the statistics of the variable. The length
            and unicode statistics are only read when the corresponding
            configuration flag is set.

    Returns:
        The template variables from ``render_common`` with ``top`` and
        ``bottom`` set.
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    table = Table([
        {
            "name": "Distinct count",
            "value": summary["n_unique"],
            "fmt": "fmt",
            "alert": "n_unique" in summary["warn_fields"],
        },
        {
            "name": "Unique (%)",
            "value": summary["p_unique"],
            "fmt": "fmt_percent",
            "alert": "p_unique" in summary["warn_fields"],
        },
        {
            "name": "Missing",
            "value": summary["n_missing"],
            "fmt": "fmt",
            "alert": "n_missing" in summary["warn_fields"],
        },
        {
            "name": "Missing (%)",
            "value": summary["p_missing"],
            "fmt": "fmt_percent",
            "alert": "p_missing" in summary["warn_fields"],
        },
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        },
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Container([info, table, fqm],
                                          sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_length = config["vars"]["cat"]["length"].get(bool)
    if check_length:
        length_table = Table(
            [
                {
                    "name": "Max length",
                    "value": summary["max_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
                {
                    "name": "Median length",
                    "value": summary["median_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
                {
                    "name": "Mean length",
                    "value": summary["mean_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
                {
                    "name": "Min length",
                    "value": summary["min_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        histogram_bins = 10

        length = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # The plot is a histogram of value lengths, not a scatter plot.
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )

        length_tab = Container(
            [length, length_table],
            anchor_id=f"{varid}tbl",
            name="Length",
            sequence_type="grid",
        )

        items.append(length_tab)

    check_unicode = config["vars"]["cat"]["unicode"].get(bool)
    if check_unicode:
        n_freq_table_max = config["n_freq_table_max"].get(int)

        # Each property section starts with an overview table, followed by
        # one table of characters per property value.
        category_items = [
            _character_frequency_table(
                summary["category_alias_counts"],
                "Most occurring categories",
                f"{varid}category_long_values",
                n_freq_table_max,
            )
        ]
        for category_alias_name, category_alias_counts in summary[
                "category_alias_char_counts"].items():
            # Underscores become spaces for display only.
            category_label = category_alias_name.replace("_", " ")
            category_items.append(
                _character_frequency_table(
                    category_alias_counts,
                    f"Most frequent {category_label} characters",
                    f"{varid}category_alias_values_"
                    f"{_anchor_fragment(category_label)}",
                    n_freq_table_max,
                ))

        script_items = [
            _character_frequency_table(
                summary["script_counts"],
                "Most occurring scripts",
                f"{varid}script_values",
                n_freq_table_max,
            )
        ]
        for script_name, script_counts in summary[
                "script_char_counts"].items():
            script_items.append(
                _character_frequency_table(
                    script_counts,
                    f"Most frequent {script_name} characters",
                    f"{varid}script_values_{_anchor_fragment(script_name)}",
                    n_freq_table_max,
                ))

        block_items = [
            _character_frequency_table(
                summary["block_alias_counts"],
                "Most occurring blocks",
                f"{varid}block_alias_values",
                n_freq_table_max,
            )
        ]
        for block_name, block_counts in summary[
                "block_alias_char_counts"].items():
            block_items.append(
                _character_frequency_table(
                    block_counts,
                    f"Most frequent {block_name} characters",
                    f"{varid}block_alias_values_"
                    f"{_anchor_fragment(block_name)}",
                    n_freq_table_max,
                ))

        citems = [
            Container(
                [
                    Table(
                        [
                            {
                                "name": "Unique unicode characters",
                                "value": summary["n_characters"],
                                "fmt": "fmt_numeric",
                                "alert": False,
                            },
                            {
                                "name":
                                'Unique unicode categories (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
                                "value": summary["n_category"],
                                "fmt": "fmt_numeric",
                                "alert": False,
                            },
                            {
                                "name":
                                'Unique unicode scripts (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
                                "value": summary["n_scripts"],
                                "fmt": "fmt_numeric",
                                "alert": False,
                            },
                            {
                                "name":
                                'Unique unicode blocks (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
                                "value": summary["n_block_alias"],
                                "fmt": "fmt_numeric",
                                "alert": False,
                            },
                        ],
                        name="Overview of Unicode Properties",
                        caption=
                        "The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
                    ),
                ],
                anchor_id=f"{varid}character_overview",
                name="Overview",
                sequence_type="list",
            ),
            Container(
                [
                    _character_frequency_table(
                        summary["character_counts"],
                        "Most occurring characters",
                        f"{varid}character_frequency",
                        n_freq_table_max,
                    ),
                ],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="named_list",
            ),
            Container(
                category_items,
                name="Categories",
                anchor_id=f"{varid}categories",
                sequence_type="named_list",
            ),
            Container(
                script_items,
                name="Scripts",
                anchor_id=f"{varid}scripts",
                sequence_type="named_list",
            ),
            Container(
                block_items,
                name="Blocks",
                anchor_id=f"{varid}blocks",
                sequence_type="named_list",
            ),
        ]

        characters = Container(
            citems,
            name="Unicode",
            sequence_type="tabs",
            anchor_id=f"{varid}unicode",
        )

        items.append(characters)

    template_variables["bottom"] = Container(items,
                                             sequence_type="tabs",
                                             anchor_id=f"{varid}bottom")

    return template_variables
Пример #21
0
def render_url(summary):
    """Render the top and bottom report elements of a URL variable.

    The bottom tabs hold one frequency table for the full value and one
    per URL part (scheme, netloc, path, query, fragment). The top grid
    holds the variable info, a statistics table and a small frequency
    table. The per-part frequency rows are also stored in the returned
    template variables under ``freqtable_<part>``.

    Args:
        summary: Mapping with the statistics of the variable, including a
            ``<part>_counts`` entry for every URL part.

    Returns:
        The template variables from ``render_common`` with ``top``,
        ``bottom`` and the ``freqtable_<part>`` entries set.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)

    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_common(summary)

    url_parts = ("scheme", "netloc", "path", "query", "fragment")
    for part in url_parts:
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    # Bottom: the full-value table first, then one tab per URL part. The
    # tab name is the capitalised part name.
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Full",
            anchor_id=f"{varid}full_frequency",
            redact=redact,
        )
    ]
    tabs.extend(
        FrequencyTable(
            template_variables[f"freqtable_{part}"],
            name=part.capitalize(),
            anchor_id=f"{varid}{part}_frequency",
            redact=redact,
        )
        for part in url_parts
    )
    template_variables["bottom"] = Container(tabs,
                                             sequence_type="tabs",
                                             name="url stats",
                                             anchor_id=f"{varid}urlstats")

    # Top: element composition
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "URL",
        summary["warnings"],
        summary["description"],
    )

    # Rows whose alert flag depends on the key being listed in warn_fields.
    rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct", "n_distinct", "fmt"),
            ("Distinct (%)", "p_distinct", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    # The memory size never raises an alert.
    rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(rows)

    fqm = FrequencyTableSmall(
        freq_table(
            freqtable=summary["value_counts"],
            n=summary["n"],
            max_number_to_print=n_obs_cat,
        ),
        redact=redact,
    )

    template_variables["top"] = Container([info, table, fqm],
                                          sequence_type="grid")

    return template_variables
Пример #22
0
def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
    """Render the top and bottom report elements of a date variable.

    The top grid holds the variable info, two tables with pre-formatted
    values and a mini histogram. The bottom tabs hold the full histogram.

    Args:
        config: Report settings; ``config.plot.image_format`` is read here
            and ``config`` is passed on to the plotting functions.
        summary: Mapping with the statistics of the variable.

    Returns:
        A dict with the keys ``"top"`` and ``"bottom"``.
    """
    varid = summary["varid"]
    image_format = config.plot.image_format

    def plain_rows(spec):
        # spec: (label, formatter, summary key) triples. Values are
        # formatted here, so the rows carry no "fmt" entry.
        return [
            {"name": label, "value": formatter(summary[key]), "alert": False}
            for label, formatter, key in spec
        ]

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Date",
        summary["alerts"],
        summary["description"],
    )

    table1 = Table(plain_rows((
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
        ("Memory size", fmt_bytesize, "memory_size"),
    )))

    table2 = Table(plain_rows((
        ("Minimum", fmt, "min"),
        ("Maximum", fmt, "max"),
    )))

    hist = summary["histogram"]
    mini_histo = Image(
        mini_histogram(config, hist[0], hist[1], date=True),
        image_format=image_format,
        alt="Mini histogram",
    )

    top = Container([info, table1, table2, mini_histo],
                    sequence_type="grid")

    # Bottom
    full_histogram = Image(
        histogram(config, hist[0], hist[1], date=True),
        image_format=image_format,
        alt="Histogram",
        # The bin count is one less than the length of hist[1].
        caption=
        f"<strong>Histogram with fixed size bins</strong> (bins={len(hist[1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Container(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return {"top": top, "bottom": bottom}
Пример #23
0
def render_categorical_unicode(summary, varid, redact):
    """Build the Unicode overview table and the tabbed Unicode detail section.

    Args:
        summary: mapping holding the character statistics read below. The
            ``*_counts`` entries must expose ``.sum()`` and the
            ``*_char_counts`` entries must expose ``.items()``.
        varid: prefix used for every anchor id created here.
        redact: forwarded as ``redact`` to the per-character frequency
            tables; the three "Most occurring ..." overview tables always
            pass ``redact=False``.

    Returns:
        A ``(overview_table, unicode_tabs)`` tuple: the "Characters and
        Unicode" table and a "tabs" container with the Characters,
        Categories, Scripts and Blocks sections.
    """
    max_rows = config["n_freq_table_max"].get(int)

    def counts_table(counts, title, anchor, hide_values):
        # Every frequency table here passes the counts' own sum as ``n``.
        rows = freq_table(
            freqtable=counts,
            n=counts.sum(),
            max_number_to_print=max_rows,
        )
        return FrequencyTable(
            rows, name=title, anchor_id=anchor, redact=hide_values
        )

    def batch_grid(tables, title, anchor):
        # Grid of per-group tables, laid out three per batch.
        return Container(
            tables,
            name=title,
            sequence_type="batch_grid",
            anchor_id=anchor,
            batch_size=3,
        )

    def named_list(children, title, anchor):
        # One tab of the final "Unicode" container.
        return Container(
            children,
            name=title,
            anchor_id=anchor,
            sequence_type="named_list",
        )

    def most_entries_first(pair):
        # Sort key: groups with more distinct entries come first.
        return -len(pair[1])

    # --- Categories -------------------------------------------------------
    category_overview = counts_table(
        summary["category_alias_counts"],
        "Most occurring categories",
        f"{varid}category_long_values",
        False,
    )

    category_tables = []
    ordered_categories = sorted(
        summary["category_alias_char_counts"].items(), key=most_entries_first
    )
    for alias, alias_counts in ordered_categories:
        label = alias.replace("_", " ")
        # NOTE(review): the anchor id is built from the space-separated
        # label, so it can contain spaces - confirm the templates accept it.
        category_tables.append(
            counts_table(
                alias_counts,
                f"{label}",
                f"{varid}category_alias_values_{label}",
                redact,
            )
        )

    category_items = [
        category_overview,
        batch_grid(
            category_tables,
            "Most frequent character per category",
            f"{varid}categories",
        ),
    ]

    # --- Scripts ----------------------------------------------------------
    script_overview = counts_table(
        summary["script_counts"],
        "Most occurring scripts",
        f"{varid}script_values",
        False,
    )

    script_tables = [
        counts_table(
            per_script,
            f"{script}",
            f"{varid}script_values_{script}",
            redact,
        )
        for script, per_script in sorted(
            summary["script_char_counts"].items(), key=most_entries_first
        )
    ]

    script_items = [
        script_overview,
        batch_grid(
            script_tables,
            "Most frequent character per script",
            f"{varid}scripts",
        ),
    ]

    # --- Blocks (kept in the mapping's own order, not sorted) --------------
    block_overview = counts_table(
        summary["block_alias_counts"],
        "Most occurring blocks",
        f"{varid}block_alias_values",
        False,
    )

    block_tables = [
        counts_table(
            per_block,
            f"{block}",
            f"{varid}block_alias_values_{block}",
            redact,
        )
        for block, per_block in summary["block_alias_char_counts"].items()
    ]

    block_items = [
        block_overview,
        batch_grid(
            block_tables,
            "Most frequent character per block",
            f"{varid}blocks",
        ),
    ]

    # --- Overview table ---------------------------------------------------
    def count_row(label, key):
        return {
            "name": label,
            "value": summary[key],
            "fmt": "fmt_numeric",
            "alert": False,
        }

    def linked_row(label, count, title, url):
        # Raw cell: the count followed by whatever markup ``help`` returns.
        return {
            "name": label,
            "value": f"{count} {help(title=title, url=url)}",
            "fmt": "raw",
            "alert": False,
        }

    overview_rows = [
        count_row("Total characters", "n_characters"),
        count_row("Distinct characters", "n_characters_distinct"),
        linked_row(
            "Distinct categories",
            summary["n_category"],
            "Unicode categories (click for more information)",
            "https://en.wikipedia.org/wiki/Unicode_character_property#General_Category",
        ),
        linked_row(
            "Distinct scripts",
            summary["n_scripts"],
            "Unicode scripts (click for more information)",
            "https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode",
        ),
        linked_row(
            "Distinct blocks",
            summary["n_block_alias"],
            "Unicode blocks (click for more information)",
            "https://en.wikipedia.org/wiki/Unicode_block",
        ),
    ]
    overview_table = Table(
        overview_rows,
        name="Characters and Unicode",
        caption="The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
    )

    # --- Tabs ---------------------------------------------------------------
    character_table = counts_table(
        summary["character_counts"],
        "Most occurring characters",
        f"{varid}character_frequency",
        redact,
    )

    citems = [
        named_list([character_table], "Characters", f"{varid}characters"),
        named_list(category_items, "Categories", f"{varid}categories"),
        named_list(script_items, "Scripts", f"{varid}scripts"),
        named_list(block_items, "Blocks", f"{varid}blocks"),
    ]

    unicode_tabs = Container(
        citems,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )
    return overview_table, unicode_tabs
Пример #24
0
def render_date(summary):
    """Assemble the "top" and "bottom" report sections for a date variable.

    Args:
        summary: mapping with the variable's statistics; the keys read are
            ``varid``, ``varname``, ``warnings``, ``description``,
            ``n_unique``, ``p_unique``, ``n_missing``, ``p_missing``,
            ``memory_size``, ``min``, ``max`` and ``histogram``.

    Returns:
        A dict with exactly two entries: ``"top"`` (grid container) and
        ``"bottom"`` (tabs container holding the histogram).
    """
    varid = summary["varid"]
    # TODO: render common?
    image_format = config["plot"]["image_format"].get(str)

    def plain_rows(spec):
        # Rows carry the raw value plus the name of the formatter to apply;
        # none of them is ever flagged as an alert.
        return [
            {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
            for label, key, formatter in spec
        ]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Date",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        plain_rows(
            [
                ("Distinct count", "n_unique", "fmt"),
                ("Unique (%)", "p_unique", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            ]
        )
    )

    table2 = Table(
        plain_rows(
            [
                ("Minimum", "min", "fmt"),
                ("Maximum", "max", "fmt"),
            ]
        )
    )

    # The same histogram data feeds both the thumbnail and the full plot.
    histogram_data = summary["histogram"]

    mini_histo = Image(
        mini_histogram(*histogram_data, date=True),
        image_format=image_format,
        alt="Mini histogram",
    )

    top = Container([info, table1, table2, mini_histo], sequence_type="grid")

    # Bottom
    full_histogram = Image(
        histogram(*histogram_data, date=True),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(histogram_data[1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Container(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return {"top": top, "bottom": bottom}
Пример #25
0
def render_real(config: Settings, summary: dict) -> dict:
    """Assemble the report sections for a real-valued variable.

    Starts from the mapping returned by ``render_common(config, summary)``
    and stores two extra entries in it: ``"top"`` (info, two summary tables
    and a mini histogram) and ``"bottom"`` (statistics, histogram, common
    values and extreme values as tabs).

    Args:
        config: report settings; ``plot.image_format``, ``report.precision``
            and ``n_extreme_obs`` are read.
        summary: statistics of the variable, including ``alert_fields`` used
            to flag individual rows.

    Returns:
        The template-variable mapping with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format

    def numeric(key):
        # Format summary[key] using the report-wide precision setting.
        return fmt_numeric(summary[key], precision=config.report.precision)

    def flagged(field):
        # True when an alert was raised for this summary field.
        return field in summary["alert_fields"]

    non_negative = summary["min"] >= 0
    name = (
        "Real number (&Ropf;<sub>&ge;0</sub>)"
        if non_negative
        else "Real number (&Ropf;)"
    )

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        name,
        summary["alerts"],
        summary["description"],
    )

    alertable_counts = [
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
        ("Infinite", fmt, "n_infinite"),
        ("Infinite (%)", fmt_percent, "p_infinite"),
    ]
    overview_rows = [
        {"name": label, "value": formatter(summary[field]), "alert": flagged(field)}
        for label, formatter, field in alertable_counts
    ]
    overview_rows.append({"name": "Mean", "value": numeric("mean"), "alert": False})
    table1 = Table(overview_rows)

    table2 = Table([
        {"name": "Minimum", "value": numeric("min"), "alert": False},
        {"name": "Maximum", "value": numeric("max"), "alert": False},
        {
            "name": "Zeros",
            "value": fmt(summary["n_zeros"]),
            "alert": flagged("n_zeros"),
        },
        {
            "name": "Zeros (%)",
            "value": fmt_percent(summary["p_zeros"]),
            "alert": flagged("p_zeros"),
        },
        {"name": "Negative", "value": fmt(summary["n_negative"]), "alert": False},
        {
            "name": "Negative (%)",
            "value": fmt_percent(summary["p_negative"]),
            "alert": False,
        },
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        },
    ])

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom: statistics tab
    quantile_keys = [
        ("Minimum", "min"),
        ("5-th percentile", "5%"),
        ("Q1", "25%"),
        ("median", "50%"),
        ("Q3", "75%"),
        ("95-th percentile", "95%"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range (IQR)", "iqr"),
    ]
    quantile_statistics = Table(
        [{"name": label, "value": numeric(key)} for label, key in quantile_keys],
        name="Quantile statistics",
    )

    descriptive_rows = [
        {"name": "Standard deviation", "value": numeric("std")},
        {"name": "Coefficient of variation (CV)", "value": numeric("cv")},
        {"name": "Kurtosis", "value": numeric("kurtosis")},
        {"name": "Mean", "value": numeric("mean")},
        {"name": "Median Absolute Deviation (MAD)", "value": numeric("mad")},
        {
            "name": "Skewness",
            "value": numeric("skewness"),
            # Unlike the top tables, this row signals an alert via "class".
            "class": "alert" if flagged("skewness") else "",
        },
        {"name": "Sum", "value": numeric("sum")},
        {"name": "Variance", "value": numeric("variance")},
        {"name": "Monotonicity", "value": fmt_monotonic(summary["monotonic"])},
    ]
    descriptive_statistics = Table(descriptive_rows, name="Descriptive statistics")

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tab
    hist = Image(
        histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )

    # Bottom: common values tab
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    # Bottom: extreme values tab (smallest and largest observations)
    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name=f"Minimum {config.n_extreme_obs} values",
        anchor_id=f"{varid}firstn",
        redact=False,
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name=f"Maximum {config.n_extreme_obs} values",
        anchor_id=f"{varid}lastn",
        redact=False,
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [statistics, hist, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
Пример #26
0
def render_url(summary):
    """Assemble the report sections for a URL variable.

    Extends the mapping returned by ``render_common(summary)`` with one
    ``freqtable_<part>`` entry per URL component, a ``"bottom"`` tab sequence
    (full value plus one tab per component) and a ``"top"`` grid (info,
    summary table and a small frequency table).

    Args:
        summary: statistics of the variable; reads ``value_counts``, ``n``,
            ``<part>_counts`` for each URL component, ``varid``, ``varname``,
            ``warnings`` and the count/percentage/memory fields shown in the
            table.

    Returns:
        The template-variable mapping with the entries described above.
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    # TODO: merge with boolean/categorical
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_cat,
    )
    template_variables = render_common(summary)

    url_parts = ("scheme", "netloc", "path", "query", "fragment")
    for part in url_parts:
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    varid = summary["varid"]

    # First tab shows the whole URL ("freq_table_rows" is expected to come
    # from render_common); the remaining tabs show one component each.
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Full",
            anchor_id=f"{varid}full_frequency",
        )
    ]
    for part in url_parts:
        tabs.append(
            FrequencyTable(
                template_variables[f"freqtable_{part}"],
                name=part.capitalize(),
                anchor_id=f"{varid}{part}_frequency",
            )
        )

    template_variables["bottom"] = Sequence(
        tabs,
        sequence_type="tabs",
        name="url stats",
        anchor_id=f"{varid}urlstats",
    )

    # Element composition
    info = VariableInfo(varid, summary["varname"], "URL", summary["warnings"])

    row_spec = [
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    ]
    table = Table([
        {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
        for label, key, formatter in row_spec
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    return template_variables