예제 #1
0
def render_common(summary):
    """Build the template variables shared by the variable renderers.

    Args:
        summary: Mapping that must provide ``"value_counts"`` and ``"n"``.

    Returns:
        A dict with three entries:

        * ``"freq_table_rows"`` - result of ``freq_table`` limited to the
          configured ``n_freq_table_max``.
        * ``"firstn_expanded"`` - result of ``extreme_obs_table`` with
          ``ascending=True``.
        * ``"lastn_expanded"`` - result of ``extreme_obs_table`` with
          ``ascending=False``.
    """
    extreme_count = config["n_extreme_obs"].get(int)
    max_rows = config["n_freq_table_max"].get(int)

    value_counts = summary["value_counts"]
    total = summary["n"]

    template_variables = {
        "freq_table_rows": freq_table(
            freqtable=value_counts,
            n=total,
            max_number_to_print=max_rows,
        ),
    }

    # Same call for both ends of the value range; only the direction differs.
    for key, ascending in (("firstn_expanded", True), ("lastn_expanded", False)):
        template_variables[key] = extreme_obs_table(
            freqtable=value_counts,
            number_to_print=extreme_count,
            n=total,
            ascending=ascending,
        )

    return template_variables
예제 #2
0
def render_path_image(summary):
    """Extend the path rendering with image-specific elements.

    Starts from ``render_path(summary)``, relabels the first item of the
    ``"top"`` section as "Image Path", stores frequency-table rows for the
    image shape and Exif keys, and appends two elements to the ``"bottom"``
    section: an "Exif keys" frequency table and an "Image shape" tab group
    (frequency table plus scatter plot).

    Args:
        summary: Mapping that must provide ``"varid"``, ``"n"``,
            ``"image_shape_counts"``, ``"exif_keys_counts"`` and
            ``"scatter_data"``, in addition to whatever ``render_path``
            reads.

    Returns:
        The template variables dict produced by ``render_path``, updated in
        place.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_path(summary)

    # Top: overwrite the variable type label set by the parent renderer.
    info_item = template_variables["top"].content["items"][0]
    info_item.content["var_type"] = "Image Path"

    # Bottom: frequency rows are stored under "freqtable_<key>".
    for key in ("image_shape", "exif_keys"):
        template_variables[f"freqtable_{key}"] = freq_table(
            freqtable=summary[f"{key}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    exif_table = FrequencyTable(
        template_variables["freqtable_exif_keys"],
        name="Exif keys",
        anchor_id=f"{varid}exif_frequency",
    )
    template_variables["bottom"].content["items"].append(exif_table)

    shape_tabs = [
        FrequencyTable(
            template_variables["freqtable_image_shape"],
            name="Frequency",
            anchor_id=f"{varid}image_shape_frequency",
        ),
        Image(
            scatter_series(summary["scatter_data"]),
            image_format=image_format,
            alt="Scatterplot of image sizes",
            caption="Scatterplot of image sizes",
            name="Scatter",
            anchor_id=f"{varid}scatter",
        ),
    ]
    shape_section = Sequence(
        shape_tabs,
        sequence_type="tabs",
        name="Image shape",
        anchor_id=f"{varid}image_shape",
    )
    template_variables["bottom"].content["items"].append(shape_section)

    return template_variables
예제 #3
0
def render_boolean(summary):
    """Assemble the template variables for a boolean variable.

    Args:
        summary: Mapping that must provide ``"varid"``, ``"varname"``,
            ``"warnings"``, ``"warn_fields"``, ``"value_counts"``, ``"n"``,
            ``"n_unique"``, ``"p_unique"``, ``"n_missing"``, ``"p_missing"``
            and ``"memory_size"``.

    Returns:
        The dict from ``render_common`` with two extra entries: ``"top"``
        (grid of variable info, statistics table and small frequency table)
        and ``"bottom"`` (tabs holding the full frequency table).
    """
    varid = summary["varid"]
    mini_limit = config["vars"]["bool"]["n_obs"].get(int)

    # Prepare variables
    template_variables = render_common(summary)
    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=mini_limit,
    )

    # Element composition
    info = VariableInfo(
        anchor_id=varid,
        warnings=summary["warnings"],
        var_type="Boolean",
        var_name=summary["varname"],
    )

    # (label, summary key, formatter); a row is flagged when its key is
    # listed in summary["warn_fields"].
    flagged_stats = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    rows = [
        {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": field in summary["warn_fields"],
        }
        for label, field, formatter in flagged_stats
    ]
    # Memory size is never flagged.
    rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(rows)

    mini_table = FrequencyTableSmall(mini_rows)

    template_variables["top"] = Sequence(
        [info, table, mini_table],
        sequence_type="grid",
    )

    full_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Frequency Table",
        anchor_id=f"{varid}frequency_table",
    )

    template_variables["bottom"] = Sequence(
        [full_table],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
예제 #4
0
def render_categorical(summary):
    """Assemble the template variables for a categorical variable.

    Args:
        summary: Mapping that must provide ``"varid"``, ``"varname"``,
            ``"warnings"``, ``"warn_fields"``, ``"value_counts"``,
            ``"count"``, ``"n_unique"``, ``"p_unique"``, ``"n_missing"``,
            ``"p_missing"`` and ``"memory_size"`` (plus whatever
            ``render_common`` reads). When the ``check_composition`` option
            is enabled it must also provide ``"category_alias_values"``,
            ``"script_values"`` and ``"block_alias_values"``.

    Returns:
        The dict from ``render_common`` with two extra entries: ``"top"``
        (grid of variable info, statistics table and small frequency table)
        and ``"bottom"`` (tabs with the "Common Values" table and, when
        composition checking is enabled, a "Characters" tab group).
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    template_variables = render_common(summary)

    # NOTE(review): this uses summary["count"] whereas the boolean and URL
    # renderers pass summary["n"] - confirm the difference is intentional.
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    # NOTE(review): arguments are positional here while render_boolean uses
    # keywords (anchor_id, warnings, var_type, var_name) - confirm this order
    # matches the VariableInfo signature.
    info = VariableInfo(summary["varid"], summary["varname"], "Categorical",
                        summary["warnings"])

    # (label, summary key, formatter); a row is flagged when its key is
    # listed in summary["warn_fields"].
    flagged_stats = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    rows = [
        {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": field in summary["warn_fields"],
        }
        for label, field, formatter in flagged_stats
    ]
    # Memory size is never flagged.
    rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    table = Table(rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        n_freq_table_max = config["n_freq_table_max"].get(int)

        # (summary key, tab name, anchor suffix) for each character-level
        # breakdown. The anchor suffix for "Categories" deliberately keeps
        # the original "category_long_values" spelling.
        composition_tabs = (
            ("category_alias_values", "Categories", "category_long_values"),
            ("script_values", "Scripts", "script_values"),
            ("block_alias_values", "Blocks", "block_alias_values"),
        )

        citems = []
        for summary_key, tab_name, anchor_suffix in composition_tabs:
            vc = pd.Series(summary[summary_key]).value_counts()
            citems.append(
                FrequencyTable(
                    freq_table(freqtable=vc,
                               n=vc.sum(),
                               max_number_to_print=n_freq_table_max),
                    name=tab_name,
                    anchor_id=f"{varid}{anchor_suffix}",
                ))

        items.append(
            Sequence(
                citems,
                name="Characters",
                sequence_type="tabs",
                anchor_id=f"{varid}characters",
            ))

    template_variables["bottom"] = Sequence(items,
                                            sequence_type="tabs",
                                            anchor_id=f"{varid}bottom")

    return template_variables
예제 #5
0
def render_url(summary):
    """Assemble the template variables for a URL variable.

    Args:
        summary: Mapping that must provide ``"varid"``, ``"varname"``,
            ``"warnings"``, ``"value_counts"``, ``"n"``, ``"n_unique"``,
            ``"p_unique"``, ``"n_missing"``, ``"p_missing"``,
            ``"memory_size"`` and a ``"<part>_counts"`` entry for each of
            scheme, netloc, path, query and fragment.

    Returns:
        The dict from ``render_common`` extended with one
        ``"freqtable_<part>"`` entry per URL part, a ``"bottom"`` tab group
        named "url stats" and a ``"top"`` grid.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    mini_limit = config["vars"]["cat"]["n_obs"].get(int)

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=mini_limit,
    )
    template_variables = render_common(summary)

    # (summary key prefix, tab label) for every URL component.
    url_parts = (
        ("scheme", "Scheme"),
        ("netloc", "Netloc"),
        ("path", "Path"),
        ("query", "Query"),
        ("fragment", "Fragment"),
    )
    for part, _label in url_parts:
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # The "Full" tab comes first, followed by one tab per URL component.
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Full",
            anchor_id=f"{varid}full_frequency",
        )
    ]
    for part, label in url_parts:
        tabs.append(
            FrequencyTable(
                template_variables[f"freqtable_{part}"],
                name=label,
                anchor_id=f"{varid}{part}_frequency",
            ))

    template_variables["bottom"] = Sequence(
        tabs,
        sequence_type="tabs",
        name="url stats",
        anchor_id=f"{varid}urlstats",
    )

    # Element composition
    # NOTE(review): arguments are positional here while render_boolean uses
    # keywords - confirm this order matches the VariableInfo signature.
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "URL",
        summary["warnings"],
    )

    # (label, summary key, formatter); no row is flagged for URL variables.
    stats = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    table = Table([
        {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": False,
        }
        for label, field, formatter in stats
    ])

    mini_table = FrequencyTableSmall(mini_rows)

    template_variables["top"] = Sequence(
        [info, table, mini_table],
        sequence_type="grid",
    )

    return template_variables
예제 #6
0
def render_path(summary):
    """Extend the categorical rendering with path-specific elements.

    Starts from ``render_categorical(summary)``, relabels the first item of
    the ``"top"`` section as "Path", stores frequency-table rows for each
    path component, and appends "Full", "Stem", "Name", "Suffix" and
    "Parent" frequency tables to the ``"bottom"`` section. When
    ``"file_sizes"`` is present in ``summary`` a file-size histogram image
    is appended as well.

    Args:
        summary: Mapping that must provide ``"varid"``, ``"n"`` and a
            ``"<part>_counts"`` entry for name, parent, suffix and stem, in
            addition to whatever ``render_categorical`` reads. Optionally
            ``"file_sizes"`` together with ``"histogram_bins"``.

    Returns:
        The template variables dict produced by ``render_categorical``,
        updated in place.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_categorical(summary)

    for part in ("name", "parent", "suffix", "stem"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top: overwrite the variable type label set by the parent renderer.
    info_item = template_variables["top"].content["items"][0]
    info_item.content["var_type"] = "Path"

    # Bottom: (rows key, tab label, anchor prefix) in display order.
    tab_specs = (
        ("freq_table_rows", "Full", "full"),
        ("freqtable_stem", "Stem", "stem"),
        ("freqtable_name", "Name", "name"),
        ("freqtable_suffix", "Suffix", "suffix"),
        ("freqtable_parent", "Parent", "parent"),
    )
    path_tabs = [
        FrequencyTable(
            template_variables[rows_key],
            name=label,
            anchor_id=f"{varid}{prefix}_frequency",
        )
        for rows_key, label, prefix in tab_specs
    ]
    for tab in path_tabs:
        template_variables["bottom"].content["items"].append(tab)

    if "file_sizes" in summary:
        plot = histogram(
            summary["file_sizes"],
            summary,
            summary["histogram_bins"],
        )
        caption = f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={summary['histogram_bins']})"
        size_histogram = Image(
            plot,
            image_format=image_format,
            alt="File size",
            caption=caption,
            name="File size",
            anchor_id=f"{varid}file_size_histogram",
        )
        template_variables["bottom"].content["items"].append(size_histogram)

    return template_variables