示例#1
0
def render_categorical_length(config: Settings, summary: dict,
                              varid: str) -> Tuple[Renderable, Renderable]:
    """Build the length statistics table and length histogram of a variable.

    Args:
        config: Report settings; ``report.precision`` and
            ``plot.image_format`` are read here.
        summary: Variable summary providing ``max_length``,
            ``median_length``, ``mean_length``, ``min_length`` and
            ``histogram_length``.
        varid: Prefix used to build the anchor ids of both renderables.

    Returns:
        The statistics table followed by the histogram image.
    """

    def plain_row(label, rendered):
        # Every length statistic is emitted with its alert flag switched off.
        return {"name": label, "value": rendered, "alert": False}

    rows = [
        plain_row("Max length", fmt_number(summary["max_length"])),
        plain_row("Median length", fmt_number(summary["median_length"])),
        plain_row(
            "Mean length",
            fmt_numeric(summary["mean_length"],
                        precision=config.report.precision),
        ),
        plain_row("Min length", fmt_number(summary["min_length"])),
    ]
    stats_table = Table(rows, name="Length", anchor_id=f"{varid}lengthstats")

    # ``histogram_length`` is unpacked into the positional arguments of
    # ``histogram``.
    plot = histogram(config, *summary["histogram_length"])
    histogram_image = Image(
        plot,
        image_format=config.plot.image_format,
        alt="length histogram",
        name="Length",
        caption="Histogram of lengths of the category",
        anchor_id=f"{varid}length",
    )

    return stats_table, histogram_image
示例#2
0
def render_image(config: Settings, summary: dict) -> dict:
    """Extend the file rendering of a variable with an "Image" tab.

    The tab always contains a "Dimensions" section and, when
    ``exif_keys_counts`` is present in ``summary``, an "Exif data" section
    placed before it.

    Args:
        config: Report settings; ``n_freq_table_max``, ``vars.cat.redact``,
            ``report.precision`` and ``plot.image_format`` are read here.
        summary: Variable summary with the image statistics
            (``min_width`` ... ``max_area``, ``image_dimensions``, ``n``,
            ``varid`` and optionally ``exif_keys_counts`` / ``exif_data``).

    Returns:
        The template variables returned by ``render_file``, with the top
        variable type set to "Image" and the image tab appended to the
        bottom items.
    """
    varid = summary["varid"]
    max_rows = config.n_freq_table_max
    redact = config.vars.cat.redact

    template_variables = render_file(config, summary)

    def stat_row(label, key):
        # One non-alerting table row holding a formatted summary value.
        return {
            "name": label,
            "value": fmt_numeric(summary[key],
                                 precision=config.report.precision),
            "alert": False,
        }

    def capped_freq_table(counts):
        # Frequency table rows limited to the configured maximum.
        return freq_table(
            freqtable=counts,
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top
    header = template_variables["top"].content["items"][0]
    header.content["var_type"] = "Image"

    # Bottom
    #
    # Overview grid layout:
    #   Min width       Min height       Min area
    #   Median width    Median height    Median area
    #   Max width       Max height       Max area
    #
    # NOTE(review): the original note states that all dimension properties
    # are in pixels - not verifiable from this function.
    overview_grid = Container(
        [
            Table([
                stat_row("Min width", "min_width"),
                stat_row("Median width", "median_width"),
                stat_row("Max width", "max_width"),
            ]),
            Table([
                stat_row("Min height", "min_height"),
                stat_row("Median height", "median_height"),
                stat_row("Max height", "max_height"),
            ]),
            Table([
                stat_row("Min area", "min_area"),
                stat_row("Median area", "median_area"),
                stat_row("Max area", "max_area"),
            ]),
        ],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )
    size_scatter = Image(
        scatter_series(config, summary["image_dimensions"]),
        image_format=config.plot.image_format,
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{varid}image_dimensions_scatter",
    )
    common_sizes = FrequencyTable(
        capped_freq_table(summary["image_dimensions"].value_counts()),
        name="Common values",
        anchor_id=f"{varid}image_dimensions_frequency",
        redact=False,
    )
    dimensions_section = Container(
        [overview_grid, size_scatter, common_sizes],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{varid}image_dimensions",
    )

    sections = []
    if "exif_keys_counts" in summary:
        exif_tables = [
            FrequencyTable(
                capped_freq_table(pd.Series(summary["exif_keys_counts"])),
                name="Exif keys",
                anchor_id=f"{varid}exif_keys",
                redact=redact,
            )
        ]
        # One table per exif entry; the "exif_keys" entry is skipped.
        exif_tables.extend(
            FrequencyTable(
                capped_freq_table(counts),
                name=key,
                anchor_id=f"{varid}_exif_{key}",
                redact=redact,
            )
            for key, counts in summary["exif_data"].items()
            if key != "exif_keys"
        )
        sections.append(
            Container(
                exif_tables,
                anchor_id=f"{varid}exif_data",
                name="Exif data",
                sequence_type="named_list",
            ))

    sections.append(dimensions_section)

    template_variables["bottom"].content["items"].append(
        Container(
            sections,
            name="Image",
            sequence_type="tabs",
            anchor_id=f"{varid}image",
        ))

    return template_variables
def render_complex(config: Settings, summary: dict) -> dict:
    """Render the report section of a complex-number variable.

    Args:
        config: Report settings; ``plot.image_format`` and
            ``report.precision`` are read here.
        summary: Variable summary with the identification fields
            (``varid``, ``varname``, ``warnings``, ``description``), the
            counts/statistics shown in the tables and ``scatter_data``.

    Returns:
        A dict with a "top" grid (variable info, two tables and an empty
        HTML placeholder) and a "bottom" tab container holding the scatter
        plot.
    """
    varid = summary["varid"]
    image_format = config.plot.image_format

    def entry(label, rendered):
        return {"name": label, "value": rendered}

    def numeric(key):
        return fmt_numeric(summary[key], precision=config.report.precision)

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
        summary["description"],
    )

    counts_table = Table([
        entry("Distinct", fmt(summary["n_distinct"])),
        entry("Distinct (%)", fmt_percent(summary["p_distinct"])),
        entry("Missing", fmt(summary["n_missing"])),
        entry("Missing (%)", fmt_percent(summary["p_missing"])),
        entry("Memory size", fmt_bytesize(summary["memory_size"])),
    ])

    stats_table = Table([
        entry("Mean", numeric("mean")),
        entry("Minimum", numeric("min")),
        entry("Maximum", numeric("max")),
        entry("Zeros", numeric("n_zeros")),
        entry("Zeros (%)", fmt_percent(summary["p_zeros"])),
    ])

    # Empty HTML element occupying the fourth cell of the top grid.
    filler = HTML("")

    top = Container([info, counts_table, stats_table, filler],
                    sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(config, summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def test_fmt_numeric(value, precision, expected):
    """Check that ``fmt_numeric(value, precision)`` equals ``expected``.

    NOTE(review): the three arguments are presumably supplied by a
    parametrize decorator that is outside this view - confirm.
    """
    rendered = fmt_numeric(value, precision)
    assert rendered == expected
def render_path(config: Settings, summary: dict) -> dict:
    """Extend the categorical rendering of a variable with a "Path" tab.

    Args:
        config: Report settings; ``n_freq_table_max``, ``vars.cat.redact``
            and ``report.precision`` are read here.
        summary: Variable summary with ``varid``, ``n``, ``common_prefix``,
            the ``*_counts`` entries and the ``n_*_unique`` entries for
            name, parent, suffix, stem and anchor.

    Returns:
        The template variables returned by ``render_categorical``,
        extended with one ``freqtable_<part>`` entry per path part, the top
        variable type set to "Path" and the path tab appended to the
        bottom items.
    """
    varid = summary["varid"]
    max_rows = config.n_freq_table_max
    redact = config.vars.cat.redact

    template_variables = render_categorical(config, summary)

    # Frequency table rows for every path component.
    for part in ("name", "parent", "suffix", "stem", "anchor"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top
    header = template_variables["top"].content["items"][0]
    header.content["var_type"] = "Path"

    def plain_row(label, rendered):
        return {"name": label, "value": rendered, "alert": False}

    def unique_row(label, key):
        return plain_row(
            label,
            fmt_numeric(summary[key], precision=config.report.precision),
        )

    # Bottom
    overview_rows = [
        plain_row("Common prefix", fmt(summary["common_prefix"])),
        unique_row("Unique stems", "n_stem_unique"),
        unique_row("Unique names", "n_name_unique"),
        unique_row("Unique extensions", "n_suffix_unique"),
        unique_row("Unique directories", "n_parent_unique"),
        unique_row("Unique anchors", "n_anchor_unique"),
    ]
    overview = Container(
        [Table(overview_rows)],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="list",
    )

    # (template variable key, tab name, anchor id suffix) for each tab.
    frequency_tabs = (
        ("freq_table_rows", "Full", "full_frequency"),
        ("freqtable_stem", "Stem", "stem_frequency"),
        ("freqtable_name", "Name", "name_frequency"),
        ("freqtable_suffix", "Extension", "suffix_frequency"),
        ("freqtable_parent", "Parent", "parent_frequency"),
        ("freqtable_anchor", "Anchor", "anchor_frequency"),
    )
    tabs = [overview]
    tabs.extend(
        FrequencyTable(
            template_variables[rows_key],
            name=tab_name,
            anchor_id=f"{varid}{anchor_suffix}",
            redact=redact,
        )
        for rows_key, tab_name, anchor_suffix in frequency_tabs
    )

    template_variables["bottom"].content["items"].append(
        Container(
            tabs,
            name="Path",
            sequence_type="tabs",
            anchor_id=f"{varid}path",
        ))

    return template_variables
def render_count(config: Settings, summary: dict) -> dict:
    """Render the report section of a count variable.

    Starts from the template variables produced by ``render_common`` and
    adds a "top" grid (variable info, two statistics tables, mini
    histogram) and a "bottom" tab container (histogram, common values,
    extreme values).

    Args:
        config: Report settings; ``plot.image_format``,
            ``report.precision`` and ``n_extreme_obs`` are read here.
        summary: Variable summary with the identification fields
            (``varid``, ``varname``, ``warnings``, ``description``), the
            statistics shown in the tables and ``histogram``.

    Returns:
        The template variables with "top" and "bottom" set.
    """
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format
    # Anchor ids are prefixed with the variable id, as render_real does, so
    # that several count variables in one report do not share the same ids.
    varid = summary["varid"]

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table([
        {
            "name": "Distinct",
            "value": fmt(summary["n_distinct"]),
            "alert": False,
        },
        {
            "name": "Distinct (%)",
            "value": fmt_percent(summary["p_distinct"]),
            "alert": False,
        },
        {
            "name": "Missing",
            "value": fmt(summary["n_missing"]),
            "alert": False,
        },
        {
            "name": "Missing (%)",
            "value": fmt_percent(summary["p_missing"]),
            "alert": False,
        },
    ])

    table2 = Table([
        {
            "name": "Mean",
            "value": fmt_numeric(summary["mean"],
                                 precision=config.report.precision),
            "alert": False,
        },
        {
            "name": "Minimum",
            "value": fmt_numeric(summary["min"],
                                 precision=config.report.precision),
            "alert": False,
        },
        {
            "name": "Maximum",
            "value": fmt_numeric(summary["max"],
                                 precision=config.report.precision),
            "alert": False,
        },
        {
            "name": "Zeros",
            "value": fmt(summary["n_zeros"]),
            "alert": False,
        },
        {
            "name": "Zeros (%)",
            "value": fmt_percent(summary["p_zeros"]),
            "alert": False,
        },
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        },
    ])

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container([info, table1, table2, mini_histo],
                                          sequence_type="grid")

    # Bottom
    seqs = [
        Image(
            histogram(config, *summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            # The second histogram element is treated as the bin edges,
            # hence one fewer bin than entries.
            caption=
            f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    # The tab labels follow config.n_extreme_obs (the setting render_real
    # uses for the same tables) instead of a hard-coded "5".
    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name=f"Minimum {config.n_extreme_obs} values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name=f"Maximum {config.n_extreme_obs} values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(seqs,
                      sequence_type="tabs",
                      name="Histogram(s)",
                      anchor_id=f"{varid}histograms"),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
示例#7
0
def get_dataset_overview(config: Settings, summary: dict) -> Renderable:
    """Build the dataset "Overview" grid from the table-level summary.

    Args:
        config: Report settings; ``report.precision`` is read here.
        summary: Report summary whose ``"table"`` entry holds the dataset
            statistics (``n_var``, ``n``, missing-cell figures, memory
            sizes, ``types`` and optionally the duplicate-row figures).

    Returns:
        A grid container with the "Dataset statistics" table and the
        "Variable types" table.
    """

    def metric(label, rendered):
        return {"name": label, "value": rendered}

    table = summary["table"]

    rows = [
        metric("Number of variables", fmt_number(table["n_var"])),
        metric("Number of observations", fmt_number(table["n"])),
        metric("Missing cells", fmt_number(table["n_cells_missing"])),
        metric("Missing cells (%)", fmt_percent(table["p_cells_missing"])),
    ]

    # Duplicate figures are only shown when the summary provides them.
    if "n_duplicates" in table:
        rows.append(metric("Duplicate rows", fmt_number(table["n_duplicates"])))
        rows.append(
            metric("Duplicate rows (%)", fmt_percent(table["p_duplicates"])))

    rows.append(
        metric("Total size in memory", fmt_bytesize(table["memory_size"])))
    rows.append(
        metric("Average record size in memory",
               fmt_bytesize(table["record_size"])))

    dataset_info = Table(rows, name="Dataset statistics")

    # One row per variable type with the number of variables of that type.
    type_rows = []
    for type_name, count in table["types"].items():
        type_rows.append(
            metric(str(type_name),
                   fmt_numeric(count, precision=config.report.precision)))
    dataset_types = Table(type_rows, name="Variable types")

    return Container(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
示例#8
0
def render_real(config: Settings, summary: dict) -> dict:
    """Render the report section of a real-valued variable.

    Starts from the template variables produced by ``render_common`` and
    adds a "top" grid (variable info, two tables, mini histogram) and a
    "bottom" tab container (statistics, histogram, common values, extreme
    values).

    Args:
        config: Report settings; ``plot.image_format``,
            ``report.precision`` and ``n_extreme_obs`` are read here.
        summary: Variable summary with the identification fields
            (``varid``, ``varname``, ``alerts``, ``description``),
            ``alert_fields``, the statistics shown in the tables and
            ``histogram``.

    Returns:
        The template variables with "top" and "bottom" set.
    """
    varid = summary["varid"]
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format

    def numeric(key):
        # Summary value formatted with the configured report precision.
        return fmt_numeric(summary[key], precision=config.report.precision)

    def flagged(field):
        # Whether the field is listed among the summary's alert fields.
        return field in summary["alert_fields"]

    def top_row(label, rendered, alert):
        return {"name": label, "value": rendered, "alert": alert}

    def stat_row(label, rendered):
        return {"name": label, "value": rendered}

    # The type label depends on whether the minimum is non-negative.
    type_label = ("Real number (&Ropf;<sub>&ge;0</sub>)"
                  if summary["min"] >= 0 else "Real number (&Ropf;)")

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        type_label,
        summary["alerts"],
        summary["description"],
    )

    table1 = Table([
        top_row("Distinct", fmt(summary["n_distinct"]), flagged("n_distinct")),
        top_row("Distinct (%)", fmt_percent(summary["p_distinct"]),
                flagged("p_distinct")),
        top_row("Missing", fmt(summary["n_missing"]), flagged("n_missing")),
        top_row("Missing (%)", fmt_percent(summary["p_missing"]),
                flagged("p_missing")),
        top_row("Infinite", fmt(summary["n_infinite"]), flagged("n_infinite")),
        top_row("Infinite (%)", fmt_percent(summary["p_infinite"]),
                flagged("p_infinite")),
        top_row("Mean", numeric("mean"), False),
    ])

    table2 = Table([
        top_row("Minimum", numeric("min"), False),
        top_row("Maximum", numeric("max"), False),
        top_row("Zeros", fmt(summary["n_zeros"]), flagged("n_zeros")),
        top_row("Zeros (%)", fmt_percent(summary["p_zeros"]),
                flagged("p_zeros")),
        top_row("Negative", fmt(summary["n_negative"]), False),
        top_row("Negative (%)", fmt_percent(summary["p_negative"]), False),
        top_row("Memory size", fmt_bytesize(summary["memory_size"]), False),
    ])

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container([info, table1, table2, mini_histo],
                                          sequence_type="grid")

    # Bottom: statistics tab
    quantile_fields = (
        ("Minimum", "min"),
        ("5-th percentile", "5%"),
        ("Q1", "25%"),
        ("median", "50%"),
        ("Q3", "75%"),
        ("95-th percentile", "95%"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range (IQR)", "iqr"),
    )
    quantile_statistics = Table(
        [stat_row(label, numeric(key)) for label, key in quantile_fields],
        name="Quantile statistics",
    )

    descriptive_rows = [
        stat_row(label, numeric(key)) for label, key in (
            ("Standard deviation", "std"),
            ("Coefficient of variation (CV)", "cv"),
            ("Kurtosis", "kurtosis"),
            ("Mean", "mean"),
            ("Median Absolute Deviation (MAD)", "mad"),
        )
    ]
    # NOTE(review): this row carries a "class" entry while the top tables
    # flag rows through "alert" - confirm the table renderer honours it.
    descriptive_rows.append({
        **stat_row("Skewness", numeric("skewness")),
        "class": "alert" if flagged("skewness") else "",
    })
    descriptive_rows.extend(
        stat_row(label, numeric(key)) for label, key in (
            ("Sum", "sum"),
            ("Variance", "variance"),
        ))
    descriptive_rows.append(
        stat_row("Monotonicity", fmt_monotonic(summary["monotonic"])))
    descriptive_statistics = Table(descriptive_rows,
                                   name="Descriptive statistics")

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tab
    hist = Image(
        histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Histogram",
        caption=
        f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )

    # Bottom: common values tab
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    # Bottom: extreme values tab (smallest and largest observations)
    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name=f"Minimum {config.n_extreme_obs} values",
        anchor_id=f"{varid}firstn",
        redact=False,
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name=f"Maximum {config.n_extreme_obs} values",
        anchor_id=f"{varid}lastn",
        redact=False,
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [statistics, hist, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables