def render_real(summary):
    """Build the "top" and "bottom" report sections for a real-valued variable.

    Args:
        summary: Mapping of precomputed statistics for a single variable. It
            is passed to ``render_common`` and indexed by the keys used below
            (identification fields, counts/percentages, quantiles, moments,
            the ``monotonic_*`` flags, ``warn_fields`` and ``histogram``).

    Returns:
        The object returned by ``render_common(summary)`` with its ``"top"``
        and ``"bottom"`` entries set to the containers built here.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def flagged(label, key, formatter):
        # Row whose alert flag mirrors membership of `key` in warn_fields.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }

    def unflagged(label, key, formatter):
        # Row that is never highlighted.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    def numeric(label, key):
        # Statistics-tab row; these rows carry no "alert" key.
        return {"name": label, "value": summary[key], "fmt": "fmt_numeric"}

    # The type label advertises non-negativity when the minimum allows it.
    if summary["min"] >= 0:
        type_name = "Real number (&Ropf;<sub>&ge;0</sub>)"
    else:
        type_name = "Real number (&Ropf;)"

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        type_name,
        summary["warnings"],
        summary["description"],
    )

    table1 = Table([
        flagged("Distinct count", "n_unique", "fmt"),
        flagged("Unique (%)", "p_unique", "fmt_percent"),
        flagged("Missing", "n_missing", "fmt"),
        flagged("Missing (%)", "p_missing", "fmt_percent"),
        flagged("Infinite", "n_infinite", "fmt"),
        flagged("Infinite (%)", "p_infinite", "fmt_percent"),
    ])

    table2 = Table([
        unflagged("Mean", "mean", "fmt_numeric"),
        unflagged("Minimum", "min", "fmt_numeric"),
        unflagged("Maximum", "max", "fmt_numeric"),
        flagged("Zeros", "n_zeros", "fmt"),
        flagged("Zeros (%)", "p_zeros", "fmt_percent"),
        unflagged("Memory size", "memory_size", "fmt_bytesize"),
    ])

    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo],
        sequence_type="grid",
    )

    # Bottom: statistics tab
    quantile_statistics = Table(
        [
            numeric("Minimum", "min"),
            numeric("5-th percentile", "5%"),
            numeric("Q1", "25%"),
            numeric("median", "50%"),
            numeric("Q3", "75%"),
            numeric("95-th percentile", "95%"),
            numeric("Maximum", "max"),
            numeric("Range", "range"),
            numeric("Interquartile range (IQR)", "iqr"),
        ],
        name="Quantile statistics",
    )

    # Strict variants are tested first so they win over the weak ones; the
    # generator is lazy, so later flags are only read when earlier ones are
    # falsy (same short-circuiting as an if/elif chain).
    monotonicity_labels = (
        ("monotonic_increase_strict", "Strictly increasing"),
        ("monotonic_decrease_strict", "Strictly decreasing"),
        ("monotonic_increase", "Increasing"),
        ("monotonic_decrease", "Decreasing"),
    )
    monotonicity = next(
        (label for key, label in monotonicity_labels if summary[key]),
        "Not monotonic",
    )

    skewness_alert = "skewness" in summary["warn_fields"]

    descriptive_statistics = Table(
        [
            numeric("Standard deviation", "std"),
            numeric("Coefficient of variation (CV)", "cv"),
            numeric("Kurtosis", "kurtosis"),
            numeric("Mean", "mean"),
            numeric("Median Absolute Deviation (MAD)", "mad"),
            {
                "name": "Skewness",
                "value": summary["skewness"],
                "fmt": "fmt_numeric",
                # "class" is kept for existing consumers; "alert" is added so
                # this row uses the same flag as the alertable rows above.
                "class": "alert" if skewness_alert else "",
                "alert": skewness_alert,
            },
            numeric("Sum", "sum"),
            numeric("Variance", "variance"),
            # Label previously misspelled as "Monotocity".
            {"name": "Monotonicity", "value": monotonicity, "fmt": "fmt"},
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tab. The caption's bin count is derived from the
    # second element of summary["histogram"] (its length minus one).
    hist = Image(
        histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Histogram",
        caption=
        f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )

    # Bottom: frequency tabs, filled from what render_common produced.
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{varid}firstn",
        redact=False,
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{varid}lastn",
        redact=False,
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [statistics, hist, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
# Example #2
# 0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix image
    is built and, for every key except ``"cramers"``, a predictivity image.
    Each image is paired with the HTML description of its coefficient.

    Args:
        summary: dict whose ``"correlations"`` entry maps a correlation key
            (``pearson``, ``spearman``, ``kendall``, ``phi_k`` or
            ``cramers``) to the data handed to the plotting functions.

    Returns:
        A ``Collapse`` wrapping the tabbed diagrams, or ``None`` when
        ``summary["correlations"]`` is empty.

    Raises:
        KeyError: If a correlation key is not one of the five known keys.
    """
    correlation_matrix_items: List[Renderable] = []
    predictivity_items: List[Renderable] = []

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. ")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    phi_k_description = """Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case
    of a bivariate normal input distribution. There is extensive documentation available <a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."""

    cramers_description = """Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association.
    The empirical estimators used for Cramér's V have been proved to be biased, even for large samples.
    We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""

    # key -> (vmin passed to the matrix plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        # NOTE(review): the two diagram anchor ids below are the same for
        # every correlation key, and both grid containers further down reuse
        # `key` as anchor id. Whether that yields duplicate ids in the
        # rendered page depends on templates not visible here -- confirm.
        correlation_matrix_diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt="Correlation matrix",
            anchor_id="correlation_matrix_diagram",
            name="Correlation matrix",
            classes="correlation-diagram",
        )

        # No predictivity diagram is produced for the "cramers" key.
        predictivity_diagram = None
        if key != "cramers":
            predictivity_diagram = Image(
                plot.predictivity(item),
                image_format=image_format,
                alt="Predictivity",
                anchor_id="predictivity_diagram",
                name="Predictivity",
                classes="correlation-diagram",
            )

        if not description:
            # Without a description the bare diagrams become the tab items.
            correlation_matrix_items.append(correlation_matrix_diagram)
            if predictivity_diagram is not None:
                predictivity_items.append(predictivity_diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )

        correlation_matrix_items.append(
            Container(
                [correlation_matrix_diagram, desc],
                anchor_id=key,
                name=name,
                sequence_type="grid",
            ))

        if predictivity_diagram is not None:
            predictivity_items.append(
                Container(
                    [predictivity_diagram, desc],
                    anchor_id=key,
                    name=name,
                    sequence_type="grid",
                ))

    # Nothing to show: return before building containers nobody would use.
    if not correlation_matrix_items:
        return None

    tabs: List[Renderable] = [
        Container(
            correlation_matrix_items,
            sequence_type="tabs",
            name="Correlation matrix",
            anchor_id="correlation_matrix_tab",
        )
    ]

    # The predictivity list stays empty when "cramers" is the only key;
    # in that case the tab is left out instead of being rendered empty.
    if predictivity_items:
        tabs.append(
            Container(
                predictivity_items,
                sequence_type="tabs",
                name="Predictivity",
                anchor_id="predictivity_tab",
            ))

    corr = Container(
        tabs,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(name="Correlations",
                    anchor_id="correlations",
                    button=btn,
                    item=corr)
def render_categorical(config: Settings, summary: dict) -> dict:
    """Assemble the "top" and "bottom" report sections of a categorical variable.

    Args:
        config: Report settings; the ``vars.cat`` options as well as
            ``plot.image_format`` and ``plot.pie.max_unique`` are read.
        summary: Statistics of the variable, indexed by the keys used below.

    Returns:
        The value returned by ``render_common(config, summary)`` with its
        ``"top"`` and ``"bottom"`` entries set.
    """
    varid = summary["varid"]
    max_rows = config.vars.cat.n_obs
    img_format = config.plot.image_format
    show_words = config.vars.cat.words
    show_characters = config.vars.cat.characters
    show_length = config.vars.cat.length

    template_variables = render_common(config, summary)

    # ---- Top: header, key figures and a compact frequency table ----
    header = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    # (label, summary key, formatter) of the rows that can raise an alert.
    alertable_rows = (
        ("Distinct", "n_distinct", fmt),
        ("Distinct (%)", "p_distinct", fmt_percent),
        ("Missing", "n_missing", fmt),
        ("Missing (%)", "p_missing", fmt_percent),
    )
    key_figures = [
        {
            "name": label,
            "value": formatter(summary[key]),
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in alertable_rows
    ]
    key_figures.append({
        "name": "Memory size",
        "value": fmt_bytesize(summary["memory_size"]),
        "alert": False,
    })
    table = Table(key_figures)

    compact_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["count"],
        max_number_to_print=max_rows,
    )
    compact_frequencies = FrequencyTableSmall(
        compact_rows,
        redact=config.vars.cat.redact,
    )

    template_variables["top"] = Container(
        [header, table, compact_frequencies],
        sequence_type="grid",
    )

    # ---- Bottom: overview / categories / words / characters tabs ----
    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=config.vars.cat.redact,
    )

    # The second returned element is not used in this function.
    unique_stats, _value_counts = render_categorical_frequency(
        config, summary, varid)

    overview = []

    if show_length:
        length_table, length_histo = render_categorical_length(
            config, summary, varid)
        overview.append(length_table)

    if show_characters:
        char_overview, unitab = render_categorical_unicode(
            config, summary, varid)
        overview.append(char_overview)

    overview.append(unique_stats)

    if not config.vars.cat.redact:
        # zip() stops at the shorter input, so at most five rows are shown.
        row_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = []
        for label, cell in zip(row_labels, summary["first_rows"]):
            sample_rows.append({
                "name": label,
                "value": fmt(cell),
                "alert": False,
            })
        overview.append(Table(sample_rows, name="Sample"))

    categories: List[Renderable] = [common_values]
    if show_length:
        categories.append(length_histo)

    # A pie chart is added only when enabled (limit above zero) and the
    # number of distinct values does not exceed that limit.
    pie_limit = config.plot.pie.max_unique
    if pie_limit > 0 and summary["n_distinct"] <= pie_limit:
        pie = pie_plot(
            config,
            summary["value_counts_without_nan"],
            legend_kws={"loc": "upper right"},
        )
        categories.append(
            Image(
                pie,
                image_format=img_format,
                alt="Pie chart",
                name="Pie chart",
                anchor_id=f"{varid}pie_chart",
            ))

    overview_tab = Container(
        overview,
        name="Overview",
        anchor_id=f"{varid}overview",
        sequence_type="batch_grid",
        batch_size=len(overview),
        titles=False,
    )
    categories_tab = Container(
        categories,
        name="Categories",
        anchor_id=f"{varid}string",
        sequence_type="batch_grid",
        batch_size=len(categories),
    )
    tabs = [overview_tab, categories_tab]

    if show_words:
        word_rows = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        common_words = FrequencyTable(
            word_rows,
            name="Common words",
            anchor_id=f"{varid}cwo",
            redact=config.vars.cat.redact,
        )
        tabs.append(
            Container(
                [common_words],
                name="Words",
                anchor_id=f"{varid}word",
                sequence_type="grid",
            ))

    if show_characters:
        tabs.append(
            Container(
                [unitab],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="grid",
            ))

    template_variables["bottom"] = Container(
        tabs,
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
# Example #4
# 0
def render_date(summary):
    """Build the "top" and "bottom" report sections for a date variable.

    Args:
        summary: Mapping of statistics for one variable, indexed by the keys
            used below (including ``histogram``, which is unpacked into the
            plotting helpers).

    Returns:
        A new dict holding the ``"top"`` and ``"bottom"`` containers.
    """
    varid = summary["varid"]
    # TODO: render common?
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "Date",
        summary["warnings"],
        summary["description"],
    )

    # (label, summary key, formatter name); no row here is ever highlighted.
    count_rows = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    counts = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }
        for label, key, formatter in count_rows
    ])

    bounds = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": "fmt",
            "alert": False,
        }
        for label, key in (("Minimum", "min"), ("Maximum", "max"))
    ])

    hist_data = summary["histogram"]

    thumbnail = Image(
        mini_histogram(*hist_data, date=True),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [header, counts, bounds, thumbnail],
        sequence_type="grid",
    )

    # Bottom
    # The caption's bin count is the length of the second element of
    # summary["histogram"] minus one.
    full_histogram = Image(
        histogram(*hist_data, date=True),
        image_format=image_format,
        alt="Histogram",
        caption=
        f"<strong>Histogram with fixed size bins</strong> (bins={len(hist_data[1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )

    template_variables["bottom"] = Container(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return template_variables
# Example #5
# 0
def render_real(summary):
    """Build the "top" and "bottom" report sections for a real-valued variable.

    This variant plots from ``summary["histogram_data"]`` with a fixed bin
    count and adds a second, variable-bin histogram when
    ``"histogram_bins_bayesian_blocks"`` is present in ``summary``.

    Args:
        summary: Mapping of precomputed statistics for a single variable. It
            is passed to ``render_common`` and to the plotting helpers, and
            indexed by the keys used below.

    Returns:
        The object returned by ``render_common(summary)`` with its ``"top"``
        and ``"bottom"`` entries set to the containers built here.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def flagged(label, key, formatter):
        # Row whose alert flag mirrors membership of `key` in warn_fields.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }

    def unflagged(label, key, formatter):
        # Row that is never highlighted.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    def numeric(label, key):
        # Statistics-tab row; these rows carry no "alert" key.
        return {"name": label, "value": summary[key], "fmt": "fmt_numeric"}

    # The type label advertises non-negativity when the minimum allows it.
    if summary["min"] >= 0:
        type_name = "Real number (&Ropf;<sub>&ge;0</sub>)"
    else:
        type_name = "Real number (&Ropf;)"

    # Top
    info = VariableInfo(summary["varid"], summary["varname"], type_name,
                        summary["warnings"])

    table1 = Table([
        flagged("Distinct count", "n_unique", "fmt"),
        flagged("Unique (%)", "p_unique", "fmt_percent"),
        flagged("Missing", "n_missing", "fmt"),
        flagged("Missing (%)", "p_missing", "fmt_percent"),
        flagged("Infinite", "n_infinite", "fmt"),
        flagged("Infinite (%)", "p_infinite", "fmt_percent"),
    ])

    table2 = Table([
        unflagged("Mean", "mean", "fmt"),
        unflagged("Minimum", "min", "fmt"),
        unflagged("Maximum", "max", "fmt"),
        flagged("Zeros", "n_zeros", "fmt"),
        flagged("Zeros (%)", "p_zeros", "fmt_percent"),
        unflagged("Memory size", "memory_size", "fmt_bytesize"),
    ])

    # Bin count shared by the mini histogram and the fixed-size histogram.
    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo],
        sequence_type="grid",
    )

    # Bottom: statistics tab
    quantile_statistics = Table(
        [
            numeric("Minimum", "min"),
            numeric("5-th percentile", "5%"),
            numeric("Q1", "25%"),
            numeric("median", "50%"),
            numeric("Q3", "75%"),
            numeric("95-th percentile", "95%"),
            numeric("Maximum", "max"),
            numeric("Range", "range"),
            numeric("Interquartile range (IQR)", "iqr"),
        ],
        name="Quantile statistics",
    )

    skewness_alert = "skewness" in summary["warn_fields"]

    descriptive_statistics = Table(
        [
            numeric("Standard deviation", "std"),
            numeric("Coefficient of variation (CV)", "cv"),
            numeric("Kurtosis", "kurtosis"),
            numeric("Mean", "mean"),
            numeric("Median Absolute Deviation (MAD)", "mad"),
            {
                "name": "Skewness",
                "value": summary["skewness"],
                "fmt": "fmt_numeric",
                # "class" is kept for existing consumers; "alert" is added so
                # this row uses the same flag as the alertable rows above.
                "class": "alert" if skewness_alert else "",
                "alert": skewness_alert,
            },
            numeric("Sum", "sum"),
            numeric("Variance", "variance"),
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tab(s); the fixed-bin histogram is always present.
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, histogram_bins),
            image_format=image_format,
            alt="Histogram",
            caption=
            f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    # Bottom: frequency tabs, filled from what render_common produced.
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{varid}firstn",
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{varid}lastn",
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # Optional second histogram using the precomputed variable-size bins.
    if "histogram_bins_bayesian_blocks" in summary:
        dynamic_plot = histogram(
            summary["histogram_data"],
            summary,
            summary["histogram_bins_bayesian_blocks"],
        )
        bins_label = fmt_array(summary["histogram_bins_bayesian_blocks"],
                               threshold=5)
        seqs.append(
            Image(
                dynamic_plot,
                image_format=image_format,
                alt="Histogram",
                caption=
                '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
                .format(bins_label),
                name="Dynamic Histogram",
                anchor_id=f"{varid}dynamic_histogram",
            ))

    histograms = Container(
        seqs,
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id=f"{varid}histograms",
    )

    template_variables["bottom"] = Container(
        [statistics, histograms, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_categorical(summary):
    """Build the report components for a categorical variable.

    Args:
        summary: mapping of statistics for the variable. Always read:
            ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``value_counts``, ``count``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing`` and ``memory_size``. When the
            ``vars.cat.check_composition`` setting is true, ``composition``,
            ``max_length``, ``mean_length``, ``min_length`` and ``length``
            are read as well.

    Returns:
        The mapping returned by ``render_common`` with its ``"top"`` and
        ``"bottom"`` entries set to the composed components.
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    varid = summary["varid"]
    warn_fields = summary["warn_fields"]

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top: overview, key counts and the compact frequency table.
    info = Overview(varid, summary["varname"], "Categorical",
                    summary["warnings"])

    table = Table([
        {
            "name": "Distinct count",
            "value": summary["n_unique"],
            "fmt": "fmt",
            "class": "alert" if "n_unique" in warn_fields else "",
        },
        {
            "name": "Unique (%)",
            "value": summary["p_unique"],
            "fmt": "fmt_percent",
            "class": "alert" if "p_unique" in warn_fields else "",
        },
        {
            "name": "Missing",
            "value": summary["n_missing"],
            "fmt": "fmt",
            "class": "alert" if "n_missing" in warn_fields else "",
        },
        {
            "name": "Missing (%)",
            "value": summary["p_missing"],
            "fmt": "fmt_percent",
            "class": "alert" if "p_missing" in warn_fields else "",
        },
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
        },
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    # Bottom: one tab per item, starting with the full frequency table.
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        composition_stats = summary["composition"]
        composition = Table(
            [
                {
                    "name": "Contains chars",
                    "value": composition_stats["chars"],
                    "fmt": "fmt",
                },
                {
                    "name": "Contains digits",
                    "value": composition_stats["digits"],
                    "fmt": "fmt",
                },
                {
                    "name": "Contains whitespace",
                    "value": composition_stats["spaces"],
                    "fmt": "fmt",
                },
                {
                    "name": "Contains non-words",
                    "value": composition_stats["non-words"],
                    "fmt": "fmt",
                },
            ],
            name="Composition",
            anchor_id=f"{varid}composition",
        )

        length_stats = Table(
            [
                {
                    "name": "Max length",
                    "value": summary["max_length"],
                    "fmt": "fmt_numeric",
                },
                {
                    "name": "Mean length",
                    "value": summary["mean_length"],
                    "fmt": "fmt_numeric",
                },
                {
                    "name": "Min length",
                    "value": summary["min_length"],
                    "fmt": "fmt_numeric",
                },
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        items.append(
            Sequence(
                [composition, length_stats],
                anchor_id=f"{varid}tbl",
                name="Composition",
                sequence_type="grid",
            ))

        histogram_bins = 10

        # The plot is a histogram of value lengths, so the alternative text
        # says so (it used to read "Scatter").
        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )
        items.append(length_histogram)

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
示例#7
0
def render_boolean(config: Settings, summary: dict) -> dict:
    """Compose the report section for a boolean variable.

    Args:
        config: report settings; ``vars.bool.n_obs``, ``plot.image_format``
            and ``plot.cat_freq`` (``show``, ``max_unique``) are read here.
        summary: statistics of the variable.

    Returns:
        The result of ``render_common`` with ``"top"`` and ``"bottom"``
        components added.
    """
    varid = summary["varid"]
    n_obs_bool = config.vars.bool.n_obs
    image_format = config.plot.image_format

    template_variables = render_common(config, summary)

    # Header: name, type, alerts and description.
    info = VariableInfo(
        anchor_id=summary["varid"],
        alerts=summary["alerts"],
        var_type="Boolean",
        var_name=summary["varname"],
        description=summary["description"],
    )

    # (label, formatter, summary key) for every row that can raise an alert.
    alertable_rows = (
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
    )
    rows = [
        {
            "name": label,
            "value": formatter(summary[key]),
            "alert": key in summary["alert_fields"],
        }
        for label, formatter, key in alertable_rows
    ]
    rows.append(
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        }
    )
    table = Table(rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["n"],
        max_number_to_print=n_obs_bool,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=False)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # Bottom tabs: the full frequency table, optionally followed by a plot.
    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}frequency_table",
        redact=False,
    )
    items: List[Renderable] = [common_values]

    show = config.plot.cat_freq.show
    max_unique = config.plot.cat_freq.max_unique

    if show and max_unique > 0:
        frequency_plot = Image(
            cat_frequency_plot(
                config,
                summary["value_counts_without_nan"],
            ),
            image_format=image_format,
            alt="Category Frequency Plot",
            name="Category Frequency Plot",
            anchor_id=f"{varid}cat_frequency_plot",
        )
        items.append(frequency_plot)

    template_variables["bottom"] = Container(
        items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible correlations section of the report.

    Args:
        summary: mapping whose ``"correlations"`` entry maps a correlation
            key (``pearson``, ``spearman``, ``kendall``, ``phi_k`` or
            ``cramers``) to the data passed to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` holding one tab per correlation together with a
        button that toggles the descriptions, or ``None`` when
        ``summary["correlations"]`` is empty.

    Raises:
        KeyError: if a correlation key is not one of the keys listed above.
    """
    # The sentence about the value range used to appear twice in a row in
    # this text; it is now stated once.
    pearson_description = (
        "皮尔逊相关系数 ( <em>r</em> ) 是衡量两个变量之间线性相关关系的指标, "
        "它的值在-1和+1之间,-1表示完全负线性相关,0表示没有线性"
        "相关,1表示完全正线性相关。 "
        "此外,在两个变量的位置和比例分别变化的情况下<em>r</em>"
        "是不变的,这意味着对于一个线性函数来说,与x轴的角度不会影响 <em>r</em>.<br /><br />要计算两个变量X和Y的 <em>r</em>"
        " 就要用X和Y的协方差除以它们的标准差的乘积。")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    phi_k_description = """Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case
    of a bivariate normal input distribution. There is extensive documentation available <a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."""

    cramers_description = """Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association.
    The empirical estimators used for Cramér's V have been proved to be biased, even for large samples.
    We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""

    # key -> (vmin passed to the plot, display name, HTML description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    items: List[Renderable] = []
    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if not description:
            # Nothing to explain: the tab is just the diagram.
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )

        # Diagram and description sit side by side in a grid.
        items.append(
            Container([diagram, desc],
                      anchor_id=key,
                      name=name,
                      sequence_type="grid"))

    if not items:
        # No correlations: skip building components that would be discarded.
        return None

    corr = Container(
        items,
        sequence_type="tabs",
        name="相关性列表",
        anchor_id="correlations_tab",
    )

    btn = ToggleButton(
        "切换相关性描述",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(name="相关性",
                    anchor_id="correlations",
                    button=btn,
                    item=corr)
示例#9
0
def render_complex(config: Settings, summary: dict) -> dict:
    """Compose the report section for a complex-valued variable.

    Args:
        config: report settings; ``plot.image_format`` and
            ``report.precision`` are read here.
        summary: statistics of the variable, including ``scatter_data``
            for the plot.

    Returns:
        A dict with the ``"top"`` (grid) and ``"bottom"`` (tabs) components.
    """
    varid = summary["varid"]
    template_variables = {}
    image_format = config.plot.image_format

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Complex number (&Copf;)",
        summary["alerts"],
        summary["description"],
    )

    # (label, formatter, summary key) for the general counts.
    count_rows = (
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
        ("Memory size", fmt_bytesize, "memory_size"),
    )
    table1 = Table([
        {"name": label, "value": formatter(summary[key])}
        for label, formatter, key in count_rows
    ])

    # Rows formatted with the configured numeric precision.
    numeric_rows = (
        ("Mean", "mean"),
        ("Minimum", "min"),
        ("Maximum", "max"),
        ("Zeros", "n_zeros"),
    )
    value_rows = [
        {
            "name": label,
            "value": fmt_numeric(summary[key],
                                 precision=config.report.precision),
        }
        for label, key in numeric_rows
    ]
    value_rows.append({
        "name": "Zeros (%)",
        "value": fmt_percent(summary["p_zeros"]),
    })
    table2 = Table(value_rows)

    # Empty cell that fills the fourth grid slot.
    placeholder = HTML("")

    template_variables["top"] = Container([info, table1, table2, placeholder],
                                          sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(config, summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )

    template_variables["bottom"] = Container([scatter],
                                             sequence_type="tabs",
                                             anchor_id=summary["varid"])

    return template_variables
示例#10
0
def render_date(summary):
    """Compose the report section for a date variable.

    Args:
        summary: statistics of the variable, including ``histogram_data``
            and ``histogram_bins`` for the plots.

    Returns:
        A dict with the ``"top"`` (grid) and ``"bottom"`` (tabs) components.
    """
    # TODO: render common?
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    def plain_row(label, key, formatter):
        # No row of a date table raises an alert.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    # Top
    info = VariableInfo(summary["varid"], summary["varname"], "Date",
                        summary["warnings"])

    table1 = Table([
        plain_row("Distinct count", "n_unique", "fmt"),
        plain_row("Unique (%)", "p_unique", "fmt_percent"),
        plain_row("Missing", "n_missing", "fmt"),
        plain_row("Missing (%)", "p_missing", "fmt_percent"),
        plain_row("Memory size", "memory_size", "fmt_bytesize"),
    ])

    table2 = Table([
        plain_row("Minimum", "min", "fmt"),
        plain_row("Maximum", "max", "fmt"),
    ])

    mini_plot = mini_histogram(summary["histogram_data"], summary,
                               summary["histogram_bins"])
    mini_histo = Image(mini_plot,
                       image_format=image_format,
                       alt="Mini histogram")

    template_variables["top"] = Sequence([info, table1, table2, mini_histo],
                                         sequence_type="grid")

    # Bottom: a single tab with the full-size histogram.
    full_plot = histogram(summary["histogram_data"], summary,
                          summary["histogram_bins"])
    histogram_tab = Image(
        full_plot,
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{summary['varid']}histogram",
    )

    template_variables["bottom"] = Sequence(
        [histogram_tab],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
示例#11
0
def render_path(summary):
    """Compose the report section for a variable holding file paths.

    Starts from the categorical rendering, relabels the variable type as
    "Path" and appends frequency tables for the full path and its parts to
    the bottom tabs. A file-size histogram tab is added when the summary
    contains ``file_sizes``.

    Args:
        summary: statistics of the variable, including ``name_counts``,
            ``parent_counts``, ``suffix_counts``, ``stem_counts`` and ``n``.

    Returns:
        The mapping returned by ``render_categorical``, extended in place.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_categorical(summary)

    # One frequency table per path component.
    for path_part in ("name", "parent", "suffix", "stem"):
        template_variables[f"freqtable_{path_part}"] = freq_table(
            freqtable=summary[f"{path_part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    # Top: reuse the categorical overview, only the type label changes.
    # TODO: colspan=2 row showing summary['common_prefix'] (ellipsised).
    template_variables["top"].content["items"][0].content["var_type"] = "Path"

    # Bottom: (tab name, key of the rows in template_variables, anchor part).
    tab_specs = (
        ("Full", "freq_table_rows", "full"),
        ("Stem", "freqtable_stem", "stem"),
        ("Name", "freqtable_name", "name"),
        ("Suffix", "freqtable_suffix", "suffix"),
        ("Parent", "freqtable_parent", "parent"),
    )
    tabs = [
        FrequencyTable(
            template_variables[rows_key],
            name=label,
            anchor_id=f"{varid}{anchor_part}_frequency",
        )
        for label, rows_key, anchor_part in tab_specs
    ]
    for tab in tabs:
        template_variables["bottom"].content["items"].append(tab)

    if "file_sizes" in summary:
        size_plot = histogram(summary["file_sizes"], summary,
                              summary["histogram_bins"])
        file_size_histogram = Image(
            size_plot,
            image_format=image_format,
            alt="File size",
            caption=
            f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={summary['histogram_bins']})",
            name="File size",
            anchor_id=f"{varid}file_size_histogram",
        )

        # TODO: in SequeencyItem
        template_variables["bottom"].content["items"].append(
            file_size_histogram)

    return template_variables
def render_complex(summary):
    """Compose the report section for a complex-valued variable.

    Args:
        summary: statistics of the variable, including ``scatter_data``
            for the plot.

    Returns:
        A dict with the ``"top"`` (grid) and ``"bottom"`` (tabs) components.
    """
    varid = summary["varid"]
    template_variables = {}
    image_format = config["plot"]["image_format"].get(str)

    def rows(specs):
        # Expand (label, summary key, formatter name) triples into row dicts.
        return [
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in specs
        ]

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Complex number (&Copf;)",
        summary["warnings"],
    )

    table1 = Table(
        rows(
            (
                ("Distinct count", "n_unique", "fmt"),
                ("Unique (%)", "p_unique", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            )
        )
    )

    table2 = Table(
        rows(
            (
                ("Mean", "mean", "fmt"),
                ("Minimum", "min", "fmt"),
                ("Maximum", "max", "fmt"),
                ("Zeros", "n_zeros", "fmt"),
                ("Zeros (%)", "p_zeros", "fmt_percent"),
            )
        )
    )

    # Empty cell that fills the fourth grid slot.
    placeholder = HTML("")

    template_variables["top"] = Sequence(
        [info, table1, table2, placeholder], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )

    template_variables["bottom"] = Sequence(
        [scatter], sequence_type="tabs", anchor_id=summary["varid"]
    )

    return template_variables
示例#13
0
def render_complex(summary):
    """Compose the report section for a complex-valued variable.

    The table labels are emitted in Chinese.

    Args:
        summary: statistics of the variable, including ``description`` and
            ``scatter_data``.

    Returns:
        A dict with the ``"top"`` (grid) and ``"bottom"`` (tabs) components.
    """
    varid = summary["varid"]
    template_variables = {}
    image_format = config["plot"]["image_format"].get(str)

    def build_table(specs):
        # Each spec is (label, summary key, formatter name).
        return Table([
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in specs
        ])

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Complex number (&Copf;)",
        summary["warnings"],
        summary["description"],
    )

    table1 = build_table((
        ("唯一值计数", "n_unique", "fmt"),
        ("唯一值比例 (%)", "p_unique", "fmt_percent"),
        ("缺失值", "n_missing", "fmt"),
        ("缺失值比例(%)", "p_missing", "fmt_percent"),
        ("内存占用", "memory_size", "fmt_bytesize"),
    ))

    table2 = build_table((
        ("均数", "mean", "fmt_numeric"),
        ("最小值", "min", "fmt_numeric"),
        ("最大值", "max", "fmt_numeric"),
        ("零值", "n_zeros", "fmt_numeric"),
        ("零值比例 (%)", "p_zeros", "fmt_percent"),
    ))

    # Empty cell that fills the fourth grid slot.
    placeholder = HTML("")

    template_variables["top"] = Container([info, table1, table2, placeholder],
                                          sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )

    template_variables["bottom"] = Container([scatter],
                                             sequence_type="tabs",
                                             anchor_id=summary["varid"])

    return template_variables
def render_image(summary):
    """Compose the report section for a variable holding image files.

    Starts from the file rendering, relabels the variable type as "Image"
    and appends an "Image" tab to the bottom component. That tab contains
    the dimension statistics and, when the summary has
    ``exif_keys_counts``, the Exif frequency tables.

    Args:
        summary: statistics of the variable, including the
            ``min_/median_/max_`` values for ``width``, ``height`` and
            ``area``, ``image_dimensions`` and ``n``.

    Returns:
        The mapping returned by ``render_file``, extended in place.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_file(summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Image"

    def dimension_table(dimension):
        # Min / median / max rows for one of "width", "height" or "area".
        return Table([
            {
                "name": f"{label} {dimension}",
                "value": summary[f"{prefix}_{dimension}"],
                "fmt": "fmt_numeric",
                "alert": False,
            }
            for label, prefix in (("Min", "min"), ("Median", "median"),
                                  ("Max", "max"))
        ])

    # Bottom
    image_items = []

    # Dimensions overview: three tables side by side (width, height, area).
    # The note this replaces stated that all dimension properties are in
    # pixels.
    overview = Container(
        [
            dimension_table("width"),
            dimension_table("height"),
            dimension_table("area"),
        ],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )
    size_scatter = Image(
        scatter_series(summary["image_dimensions"]),
        image_format=config["plot"]["image_format"].get(str),
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{varid}image_dimensions_scatter",
    )
    size_frequencies = FrequencyTable(
        freq_table(
            freqtable=summary["image_dimensions"].value_counts(),
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        ),
        name="Common values",
        anchor_id=f"{varid}image_dimensions_frequency",
        redact=False,
    )

    image_shape = Container(
        [overview, size_scatter, size_frequencies],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{varid}image_dimensions",
    )

    if "exif_keys_counts" in summary:
        exif_tables = [
            FrequencyTable(
                freq_table(
                    freqtable=pd.Series(summary["exif_keys_counts"]),
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name="Exif keys",
                anchor_id=f"{varid}exif_keys",
                redact=redact,
            )
        ]
        # One table per Exif field; "exif_keys" itself is skipped.
        exif_tables.extend(
            FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name=key,
                anchor_id=f"{varid}_exif_{key}",
                redact=redact,
            )
            for key, counts in summary["exif_data"].items()
            if key != "exif_keys"
        )

        image_items.append(
            Container(
                exif_tables,
                anchor_id=f"{varid}exif_data",
                name="Exif data",
                sequence_type="named_list",
            ))

    # The Exif tab, when present, comes before the dimensions tab.
    image_items.append(image_shape)

    image_tab = Container(
        image_items,
        name="Image",
        sequence_type="tabs",
        anchor_id=f"{varid}image",
    )

    template_variables["bottom"].content["items"].append(image_tab)

    return template_variables
示例#15
0
def render_count(summary):
    """Compose the report section for a count variable.

    The table labels are emitted in Chinese. Anchor ids of the nested bottom
    components are prefixed with the variable id so that they stay unique
    when several count variables appear in the same report.

    Args:
        summary: statistics of the variable. ``histogram`` is unpacked into
            the plotting helpers, and its second element is used to derive
            the bin count shown in the caption.

    Returns:
        The mapping returned by ``render_common`` with its ``"top"`` and
        ``"bottom"`` entries set to the composed components.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            {
                "name": "唯一值计数",
                "value": summary["n_unique"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "唯一值 (%)",
                "value": summary["p_unique"],
                "fmt": "fmt_percent",
                "alert": False,
            },
            {
                "name": "缺失值",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "缺失值比例 (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": False,
            },
        ]
    )

    table2 = Table(
        [
            {
                "name": "均数",
                "value": summary["mean"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "最小值",
                "value": summary["min"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "最大值",
                "value": summary["max"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "零值",
                "value": summary["n_zeros"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "零值 (%)",
                "value": summary["p_zeros"],
                "fmt": "fmt_percent",
                "alert": False,
            },
            {
                "name": "内存占用",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # The two statistics sections below are assembled but not rendered yet
    # (see the TODO that follows them).
    quantile_statistics = {
        "name": "定性分析",
        "items": [
            {
                "name": "最小值",
                "value": summary["min"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "5-th 百分位",
                "value": summary["quantile_5"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "Q1",
                "value": summary["quantile_25"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "中位数",
                "value": summary["quantile_50"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "Q3",
                "value": summary["quantile_75"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "95-th 百分位",
                "value": summary["quantile_95"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "最大值",
                "value": summary["max"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "区间",
                "value": summary["range"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "四分位距",
                "value": summary["iqr"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
        ],
    }

    # NOTE(review): the label "积" reads as "product" while the value is
    # summary["sum"]; confirm the wording before this section is rendered.
    descriptive_statistics = {
        "name": "描述性统计",
        "items": [
            {
                "name": "标准差",
                "value": summary["std"],
                "fmt": "fmt_numeric",
            },
            {
                "name": "变异系数",
                "value": summary["cv"],
                "fmt": "fmt_numeric",
            },
            {"name": "峰度", "value": summary["kurt"], "fmt": "fmt_numeric"},
            {"name": "均数", "value": summary["mean"], "fmt": "fmt_numeric"},
            {"name": "MAD", "value": summary["mad"], "fmt": "fmt_numeric"},
            {"name": "偏度", "value": summary["skew"], "fmt": "fmt_numeric"},
            {"name": "积", "value": summary["sum"], "fmt": "fmt_numeric"},
            {"name": "方差", "value": summary["var"], "fmt": "fmt_numeric"},
        ],
    }

    # TODO: Make sections data structure
    # statistics = ItemRenderer(
    #     'statistics',
    #     'Statistics',
    #     'table',
    #     [
    #         quantile_statistics,
    #         descriptive_statistics
    #     ]
    # )

    seqs = [
        Image(
            histogram(*summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="极值",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            # statistics,
            Container(
                seqs,
                sequence_type="tabs",
                name="直方图",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
示例#16
0
def render_count(summary):
    """Assemble the ``top`` and ``bottom`` report sections for one variable.

    Starts from the dictionary returned by ``render_common(summary)`` and
    adds two entries to it:

    * ``"top"``: a grid container holding the variable info, two overview
      tables and a mini histogram.
    * ``"bottom"``: a tabbed container holding the histogram(s), the
      common-values frequency table and the extreme-values tables.

    Args:
        summary: Mapping with the statistics of the variable. The keys read
            here include ``varid``, ``varname``, ``warnings``, the counts and
            percentages shown in the tables, ``histogram_data``,
            ``histogram_bins`` and, optionally,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The template-variables dictionary with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def build_rows(spec, with_alert=True):
        """Turn ``(label, summary key, formatter name)`` triples into rows."""
        rows = []
        for label, key, formatter in spec:
            row = {"name": label, "value": summary[key], "fmt": formatter}
            if with_alert:
                # None of the rows rendered here is ever highlighted.
                row["alert"] = False
            rows.append(row)
        return rows

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
    )

    table1 = Table(build_rows((
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )))

    table2 = Table(build_rows((
        ("Mean", "mean", "fmt"),
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
        ("Zeros", "n_zeros", "fmt"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )))

    # TODO: replace with SmallImage...
    mini_plot = mini_histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    mini_histo = Image(mini_plot, image_format=image_format,
                       alt="Mini histogram")

    top_items = [info, table1, table2, mini_histo]
    template_variables["top"] = Container(top_items, sequence_type="grid")

    # The two statistics groups below are not attached to the output yet;
    # they are kept for the planned "statistics" section (see TODO below).
    quantile_statistics = {
        "name": "Quantile statistics",
        "items": build_rows((
            ("Minimum", "min", "fmt_numeric"),
            ("5-th percentile", "quantile_5", "fmt_numeric"),
            ("Q1", "quantile_25", "fmt_numeric"),
            ("median", "quantile_50", "fmt_numeric"),
            ("Q3", "quantile_75", "fmt_numeric"),
            ("95-th percentile", "quantile_95", "fmt_numeric"),
            ("Maximum", "max", "fmt_numeric"),
            ("Range", "range", "fmt_numeric"),
            ("Interquartile range", "iqr", "fmt_numeric"),
        )),
    }

    descriptive_statistics = {
        "name": "Descriptive statistics",
        "items": build_rows(
            (
                ("Standard deviation", "std", "fmt_numeric"),
                ("Coefficient of variation", "cv", "fmt_numeric"),
                ("Kurtosis", "kurt", "fmt_numeric"),
                ("Mean", "mean", "fmt_numeric"),
                ("MAD", "mad", "fmt_numeric"),
                ("Skewness", "skew", "fmt_numeric"),
                ("Sum", "sum", "fmt_numeric"),
                ("Variance", "var", "fmt_numeric"),
            ),
            with_alert=False,
        ),
    }

    # TODO: Make sections data structure, i.e. wrap quantile_statistics and
    # descriptive_statistics in an ItemRenderer('statistics', 'Statistics',
    # 'table', [...]) and add it as the first tab of the bottom container.

    fixed_plot = histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    fixed_caption = f"<strong>Histogram with fixed size bins</strong> (bins={summary['histogram_bins']})"
    seqs = [
        Image(
            fixed_plot,
            image_format=image_format,
            alt="Histogram",
            caption=fixed_caption,
            name="Histogram",
            anchor_id="histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id="common_values",
    )

    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id="firstn",
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id="lastn",
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id="extreme_values",
    )

    # A second, variable-bin-width histogram is shown only when the summary
    # carries the corresponding bin specification.
    if "histogram_bins_bayesian_blocks" in summary:
        dynamic_plot = histogram(
            summary["histogram_data"],
            summary,
            summary["histogram_bins_bayesian_blocks"],
        )
        dynamic_caption = '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'.format(
            fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
        )
        seqs.append(
            Image(
                dynamic_plot,
                image_format=image_format,
                alt="Histogram",
                caption=dynamic_caption,
                name="Dynamic Histogram",
                anchor_id="dynamic_histogram",
            )
        )

    histogram_tabs = Container(
        seqs,
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id="histograms",
    )
    template_variables["bottom"] = Container(
        [histogram_tabs, fq, evs],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return template_variables
示例#17
0
def render_categorical(summary):
    """Assemble the ``top`` and ``bottom`` report sections of a categorical variable.

    Starts from the dictionary returned by ``render_common(summary)`` and
    adds two entries to it:

    * ``"top"``: a grid container with the variable info, the overview table
      and a small frequency table.
    * ``"bottom"``: a tabbed container that always holds the common-values
      frequency table, plus a "Length" tab when the ``vars.cat.length``
      setting is enabled and a "Unicode" tab when ``vars.cat.unicode`` is
      enabled.

    Args:
        summary: Mapping with the statistics of the variable. Besides the
            general keys (``varid``, ``varname``, ``warnings``,
            ``description``, ``warn_fields``, counts and percentages) the
            length and unicode sections read their own keys, but only when
            the matching setting is enabled.

    Returns:
        The template-variables dictionary with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    # A row is highlighted when its statistic is listed in the warn fields.
    overview_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    overview_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(overview_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Container([info, table, fqm],
                                          sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_length = config["vars"]["cat"]["length"].get(bool)
    if check_length:
        length_table = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Max length", "max_length"),
                    ("Median length", "median_length"),
                    ("Mean length", "mean_length"),
                    ("Min length", "min_length"),
                )
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        histogram_bins = 10

        length = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # The image is a histogram of value lengths; the alt text used to
            # read "Scatter", which mislabelled it for assistive technology.
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )

        items.append(
            Container(
                [length, length_table],
                anchor_id=f"{varid}tbl",
                name="Length",
                sequence_type="grid",
            ))

    check_unicode = config["vars"]["cat"]["unicode"].get(bool)
    if check_unicode:
        n_freq_table_max = config["n_freq_table_max"].get(int)

        def counts_table(counts, name, anchor_id):
            """Wrap a counts object in a FrequencyTable normalised by its own sum."""
            return FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=counts.sum(),
                    max_number_to_print=n_freq_table_max,
                ),
                name=name,
                anchor_id=anchor_id,
            )

        category_items = [
            counts_table(
                summary["category_alias_counts"],
                "Most occurring categories",
                f"{varid}category_long_values",
            )
        ]
        for raw_alias, alias_counts in summary[
                "category_alias_char_counts"].items():
            # Underscores are replaced by spaces for display; the same
            # spaced name is used in the anchor id.
            alias = raw_alias.replace("_", " ")
            category_items.append(
                counts_table(
                    alias_counts,
                    f"Most frequent {alias} characters",
                    f"{varid}category_alias_values_{alias}",
                ))

        script_items = [
            counts_table(
                summary["script_counts"],
                "Most occurring scripts",
                f"{varid}script_values",
            )
        ]
        script_items.extend(
            counts_table(
                script_counts,
                f"Most frequent {script_name} characters",
                f"{varid}script_values_{script_name}",
            )
            for script_name, script_counts in
            summary["script_char_counts"].items())

        block_items = [
            counts_table(
                summary["block_alias_counts"],
                "Most occurring blocks",
                f"{varid}block_alias_values",
            )
        ]
        block_items.extend(
            counts_table(
                block_counts,
                f"Most frequent {block_name} characters",
                f"{varid}block_alias_values_{block_name}",
            )
            for block_name, block_counts in
            summary["block_alias_char_counts"].items())

        overview_table = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Unique unicode characters", "n_characters"),
                    (
                        'Unique unicode categories (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
                        "n_category",
                    ),
                    (
                        'Unique unicode scripts (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
                        "n_scripts",
                    ),
                    (
                        'Unique unicode blocks (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
                        "n_block_alias",
                    ),
                )
            ],
            name="Overview of Unicode Properties",
            caption=
            "The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
        )

        citems = [
            Container(
                [overview_table],
                anchor_id=f"{varid}character_overview",
                name="Overview",
                sequence_type="list",
            ),
            Container(
                [
                    counts_table(
                        summary["character_counts"],
                        "Most occurring characters",
                        f"{varid}character_frequency",
                    ),
                ],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="named_list",
            ),
            Container(
                category_items,
                name="Categories",
                anchor_id=f"{varid}categories",
                sequence_type="named_list",
            ),
            Container(
                script_items,
                name="Scripts",
                anchor_id=f"{varid}scripts",
                sequence_type="named_list",
            ),
            Container(
                block_items,
                name="Blocks",
                anchor_id=f"{varid}blocks",
                sequence_type="named_list",
            ),
        ]

        items.append(
            Container(
                citems,
                name="Unicode",
                sequence_type="tabs",
                anchor_id=f"{varid}unicode",
            ))

    template_variables["bottom"] = Container(items,
                                             sequence_type="tabs",
                                             anchor_id=f"{varid}bottom")

    return template_variables
def render_categorical(summary):
    """Assemble the ``top`` and ``bottom`` report sections of a categorical variable.

    Starts from the dictionary returned by ``render_common(summary)`` and
    adds two entries to it:

    * ``"top"``: a grid sequence with the variable info, the overview table
      and a small frequency table.
    * ``"bottom"``: a tabbed sequence that always holds the common-values
      frequency table; when the ``vars.cat.check_composition`` setting is
      enabled it also holds a "Length" tab and a "Characters" tab.

    Args:
        summary: Mapping with the statistics of the variable. The length and
            character keys (``max_length``, ``mean_length``, ``min_length``,
            ``length``, ``category_alias_values``, ``script_values``,
            ``block_alias_values``) are only read when composition checking
            is enabled.

    Returns:
        The template-variables dictionary with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    info = VariableInfo(summary["varid"], summary["varname"], "Categorical",
                        summary["warnings"])

    # A row is highlighted when its statistic is listed in the warn fields.
    overview_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    overview_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(overview_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        length_table = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Max length", "max_length"),
                    ("Mean length", "mean_length"),
                    ("Min length", "min_length"),
                )
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        histogram_bins = 10

        length = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # The image is a histogram of value lengths; the alt text used to
            # read "Scatter", which mislabelled it for assistive technology.
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )

        items.append(
            Sequence(
                [length, length_table],
                anchor_id=f"{varid}tbl",
                name="Length",
                sequence_type="grid",
            ))

        n_freq_table_max = config["n_freq_table_max"].get(int)

        # One frequency table per character property: (summary key holding
        # the per-character property values, tab title, anchor suffix).
        citems = []
        for values_key, title, anchor_suffix in (
            ("category_alias_values", "Categories", "category_long_values"),
            ("script_values", "Scripts", "script_values"),
            ("block_alias_values", "Blocks", "block_alias_values"),
        ):
            vc = pd.Series(summary[values_key]).value_counts()
            citems.append(
                FrequencyTable(
                    freq_table(freqtable=vc,
                               n=vc.sum(),
                               max_number_to_print=n_freq_table_max),
                    name=title,
                    anchor_id=f"{varid}{anchor_suffix}",
                ))

        items.append(
            Sequence(
                citems,
                name="Characters",
                sequence_type="tabs",
                anchor_id=f"{varid}characters",
            ))

    template_variables["bottom"] = Sequence(items,
                                            sequence_type="tabs",
                                            anchor_id=f"{varid}bottom")

    return template_variables
示例#19
0
def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
    """Assemble the ``top`` and ``bottom`` report sections of a date variable.

    Args:
        config: Report settings; ``config.plot.image_format`` is read here
            and the object is forwarded to the histogram plotting helpers.
        summary: Mapping with the statistics of the variable. Keys read:
            ``varid``, ``varname``, ``alerts``, ``description``,
            ``n_distinct``, ``p_distinct``, ``n_missing``, ``p_missing``,
            ``memory_size``, ``min``, ``max`` and ``histogram`` (indexed at
            positions 0 and 1).

    Returns:
        A new dictionary with the keys ``"top"`` (grid container) and
        ``"bottom"`` (tabbed container with the histogram).
    """
    varid = summary["varid"]
    image_format = config.plot.image_format

    def plain_rows(spec):
        """Build never-highlighted rows from ``(label, formatter, key)``."""
        return [
            {"name": label, "value": formatter(summary[key]), "alert": False}
            for label, formatter, key in spec
        ]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Date",
        summary["alerts"],
        summary["description"],
    )

    table1 = Table(plain_rows((
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
        ("Memory size", fmt_bytesize, "memory_size"),
    )))

    table2 = Table(plain_rows((
        ("Minimum", fmt, "min"),
        ("Maximum", fmt, "max"),
    )))

    mini_plot = mini_histogram(
        config,
        summary["histogram"][0],
        summary["histogram"][1],
        date=True,
    )
    mini_histo = Image(mini_plot, image_format=image_format,
                       alt="Mini histogram")

    top = Container([info, table1, table2, mini_histo], sequence_type="grid")

    # Bottom
    full_plot = histogram(
        config,
        summary["histogram"][0],
        summary["histogram"][1],
        date=True,
    )
    # The caption reports one less than the length of the second histogram
    # element as the number of bins.
    full_caption = f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})"
    histogram_image = Image(
        full_plot,
        image_format=image_format,
        alt="Histogram",
        caption=full_caption,
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Container(
        [histogram_image],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return {"top": top, "bottom": bottom}
示例#20
0
def render_categorical(summary):
    """Assemble the ``top`` and ``bottom`` report sections of a categorical variable.

    Starts from the dictionary returned by ``render_common(summary)`` and
    adds two entries to it:

    * ``"top"``: a grid container with the variable info, the overview table
      and a small frequency table.
    * ``"bottom"``: a tabbed container with a "Frequencies" tab group and,
      depending on the ``vars.cat.length`` and ``vars.cat.unicode`` settings,
      the sections produced by ``render_categorical_length`` and
      ``render_categorical_unicode``.

    Args:
        summary: Mapping with the statistics of the variable.

    Returns:
        The template-variables dictionary with ``"top"`` and ``"bottom"`` set.
    """
    varid = summary["varid"]
    cat_settings = config["vars"]["cat"]
    n_obs_cat = cat_settings["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_common(summary)

    info = VariableInfo(
        varid,
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    # A row is highlighted when its statistic is listed in the warn fields.
    overview_rows = []
    for label, key, formatter in (
        ("Distinct", "n_distinct", "fmt"),
        ("Distinct (%)", "p_distinct", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    ):
        overview_rows.append({
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        })
    overview_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(overview_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=redact)

    template_variables["top"] = Container([info, table, fqm],
                                          sequence_type="grid")

    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=redact,
    )
    citems = [
        common_values,
        render_categorical_frequency(summary, varid, image_format),
    ]

    # The pie chart is only drawn when enabled (positive limit) and the
    # variable has no more distinct values than that limit.
    max_unique = config["plot"]["pie"]["max_unique"].get(int)
    if max_unique > 0 and summary["n_distinct"] <= max_unique:
        pie = pie_plot(summary["value_counts"],
                       legend_kws={"loc": "upper right"})
        citems.append(
            Image(
                pie,
                image_format=image_format,
                alt="Chart",
                name="Chart",
                anchor_id=f"{varid}pie_chart",
            ))

    # Bottom
    frequencies = Container(
        citems,
        name="Frequencies",
        anchor_id=f"{varid}frequencies",
        sequence_type="tabs",
    )
    items = [frequencies]

    if config["vars"]["cat"]["length"].get(bool):
        items.append(render_categorical_length(summary, varid, image_format))

    if config["vars"]["cat"]["unicode"].get(bool):
        items.append(render_categorical_unicode(summary, varid, redact))

    template_variables["bottom"] = Container(items,
                                             sequence_type="tabs",
                                             anchor_id=f"{varid}bottom")

    return template_variables
示例#21
0
def render_count(config: Settings, summary: dict) -> dict:
    """Build the template variables used to render a count variable.

    Starts from the dictionary returned by ``render_common`` and fills in
    its ``"top"`` entry (variable info, two statistics tables and a mini
    histogram laid out as a grid) and its ``"bottom"`` entry (histogram,
    common values and extreme values laid out as tabs).

    Args:
        config: Report settings. This function reads
            ``plot.image_format``, ``report.precision`` and
            ``n_extreme_obs`` and forwards ``config`` to the plotting
            helpers.
        summary: Variable description. The keys read here are ``varid``,
            ``varname``, ``alerts``, ``description``, ``n_distinct``,
            ``p_distinct``, ``n_missing``, ``p_missing``, ``mean``,
            ``min``, ``max``, ``n_zeros``, ``p_zeros``, ``memory_size``
            and ``histogram``.

    Returns:
        The template variables dictionary with ``"top"`` and ``"bottom"``
        populated.
    """
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format
    precision = config.report.precision
    varid = summary["varid"]
    # summary["histogram"] is unpacked into the plotting helpers; its
    # second element is used below to derive the bin count
    # (presumably the bin edges -- confirm against the summarizer).
    histogram_args = summary["histogram"]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["alerts"],
        summary["description"],
    )

    table1 = Table([
        {
            "name": "Distinct",
            "value": fmt(summary["n_distinct"]),
            "alert": False,
        },
        {
            "name": "Distinct (%)",
            "value": fmt_percent(summary["p_distinct"]),
            "alert": False,
        },
        {
            "name": "Missing",
            "value": fmt(summary["n_missing"]),
            "alert": False,
        },
        {
            "name": "Missing (%)",
            "value": fmt_percent(summary["p_missing"]),
            "alert": False,
        },
    ])

    table2 = Table([
        {
            "name": "Mean",
            "value": fmt_numeric(summary["mean"], precision=precision),
            "alert": False,
        },
        {
            "name": "Minimum",
            "value": fmt_numeric(summary["min"], precision=precision),
            "alert": False,
        },
        {
            "name": "Maximum",
            "value": fmt_numeric(summary["max"], precision=precision),
            "alert": False,
        },
        {
            "name": "Zeros",
            "value": fmt(summary["n_zeros"]),
            "alert": False,
        },
        {
            "name": "Zeros (%)",
            "value": fmt_percent(summary["p_zeros"]),
            "alert": False,
        },
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        },
    ])

    mini_histo = Image(
        mini_histogram(config, *histogram_args),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo],
        sequence_type="grid",
    )

    # Bottom
    # Anchor ids below are prefixed with the variable id so that they stay
    # distinct when several count variables appear in the same report.
    n_bins = len(histogram_args[1]) - 1
    seqs = [
        Image(
            histogram(config, *histogram_args),
            image_format=image_format,
            alt="Histogram",
            caption=(
                "<strong>Histogram with fixed size bins</strong> "
                f"(bins={n_bins})"
            ),
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name=f"Minimum {config.n_extreme_obs} values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name=f"Maximum {config.n_extreme_obs} values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        # The outer container keeps the bare variable id as its anchor,
        # unchanged from the previous behaviour.
        anchor_id=varid,
    )

    return template_variables