예제 #1
0
def render_categorical_length(config: Settings, summary: dict,
                              varid: str) -> Tuple[Renderable, Renderable]:
    """Render the length statistics of a categorical variable.

    Args:
        config: Report settings. ``config.report.precision`` is used to
            format the mean length and ``config.plot.image_format`` is
            forwarded to the histogram image.
        summary: Variable summary. The keys ``max_length``,
            ``median_length``, ``mean_length``, ``min_length`` and
            ``histogram_length`` are read; ``histogram_length`` is unpacked
            as positional arguments to ``histogram``.
        varid: Prefix used to build the anchor ids of both renderables.

    Returns:
        A ``(table, image)`` pair: the ``Table`` listing the length
        statistics and the ``Image`` wrapping the length histogram.
    """

    def stat_row(label, formatted_value):
        # Every row in this table is emitted with its alert flag off.
        return {"name": label, "value": formatted_value, "alert": False}

    stat_rows = [
        stat_row("Max length", fmt_number(summary["max_length"])),
        stat_row("Median length", fmt_number(summary["median_length"])),
        stat_row(
            "Mean length",
            fmt_numeric(summary["mean_length"],
                        precision=config.report.precision),
        ),
        stat_row("Min length", fmt_number(summary["min_length"])),
    ]
    stats_table = Table(
        stat_rows,
        name="Length",
        anchor_id=f"{varid}lengthstats",
    )

    length_plot = histogram(config, *summary["histogram_length"])
    histogram_image = Image(
        length_plot,
        image_format=config.plot.image_format,
        alt="length histogram",
        name="Length",
        caption="Histogram of lengths of the category",
        anchor_id=f"{varid}length",
    )

    return stats_table, histogram_image
예제 #2
0
def render_categorical_unicode(config: Settings, summary: dict,
                               varid: str) -> Tuple[Renderable, Renderable]:
    """Render the Unicode overview table and the tabbed Unicode details.

    Args:
        config: Report settings. ``config.n_freq_table_max`` limits every
            frequency table and ``config.vars.cat.redact`` is passed as the
            ``redact`` flag of the per-character tables.
        summary: Variable summary. The keys read here are
            ``category_alias_counts``, ``category_alias_char_counts``,
            ``script_counts``, ``script_char_counts``,
            ``block_alias_counts``, ``block_alias_char_counts``,
            ``character_counts``, ``n_characters``,
            ``n_characters_distinct``, ``n_category``, ``n_scripts`` and
            ``n_block_alias``.
        varid: Prefix used to build the anchor ids of all renderables.

    Returns:
        A ``(overview_table, tabs)`` pair: the ``Table`` with the character
        and Unicode counts, and a ``Container`` with ``sequence_type="tabs"``
        holding the "Characters", "Categories", "Scripts" and "Blocks"
        sections.
    """
    n_freq_table_max = config.n_freq_table_max

    # --- Categories --------------------------------------------------------
    category_counts = summary["category_alias_counts"]
    category_overview = _unicode_frequency_table(
        category_counts,
        category_counts.sum(),
        n_freq_table_max,
        name="Most occurring categories",
        anchor_id=f"{varid}category_long_values",
        redact=False,
    )

    # Largest per-category character tables come first.
    cats = []
    for category_alias_name, category_alias_counts in sorted(
            summary["category_alias_char_counts"].items(),
            key=lambda x: -len(x[1])):
        # Underscores in the alias are shown as spaces.
        category_alias_name = category_alias_name.replace("_", " ")
        # NOTE(review): the anchor id below is built from the display name
        # (after the underscore -> space replacement), so it can contain
        # spaces; confirm whether the raw key should be used instead.
        cats.append(
            _unicode_frequency_table(
                category_alias_counts,
                category_alias_counts.sum(),
                n_freq_table_max,
                name=f"{category_alias_name}",
                anchor_id=f"{varid}category_alias_values_{category_alias_name}",
                redact=config.vars.cat.redact,
            ))

    # NOTE(review): each inner grid below reuses the anchor id of its outer
    # tab section ("categories", "scripts", "blocks"); kept as in the
    # original, confirm whether the ids are expected to be unique.
    category_items = [
        category_overview,
        _unicode_batch_grid(
            cats,
            name="Most frequent character per category",
            anchor_id=f"{varid}categories",
        ),
    ]

    # --- Scripts -----------------------------------------------------------
    script_totals = summary["script_counts"]
    script_overview = _unicode_frequency_table(
        script_totals,
        script_totals.sum(),
        n_freq_table_max,
        name="Most occurring scripts",
        anchor_id=f"{varid}script_values",
        redact=False,
    )

    # Largest per-script character tables come first.
    scripts = [
        _unicode_frequency_table(
            script_counts,
            script_counts.sum(),
            n_freq_table_max,
            name=f"{script_name}",
            anchor_id=f"{varid}script_values_{script_name}",
            redact=config.vars.cat.redact,
        ) for script_name, script_counts in sorted(
            summary["script_char_counts"].items(), key=lambda x: -len(x[1]))
    ]

    script_items = [
        script_overview,
        _unicode_batch_grid(
            scripts,
            name="Most frequent character per script",
            anchor_id=f"{varid}scripts",
        ),
    ]

    # --- Blocks ------------------------------------------------------------
    block_totals = summary["block_alias_counts"]
    block_overview = _unicode_frequency_table(
        block_totals,
        block_totals.sum(),
        n_freq_table_max,
        name="Most occurring blocks",
        anchor_id=f"{varid}block_alias_values",
        redact=False,
    )

    # Unlike categories and scripts, blocks are not sorted: they keep the
    # iteration order of the mapping.
    blocks = [
        _unicode_frequency_table(
            block_counts,
            block_counts.sum(),
            n_freq_table_max,
            name=f"{block_name}",
            anchor_id=f"{varid}block_alias_values_{block_name}",
            redact=config.vars.cat.redact,
        ) for block_name, block_counts in
        summary["block_alias_char_counts"].items()
    ]

    block_items = [
        block_overview,
        _unicode_batch_grid(
            blocks,
            name="Most frequent character per block",
            anchor_id=f"{varid}blocks",
        ),
    ]

    # --- Overview table ----------------------------------------------------
    overview_table = Table(
        [
            {
                "name": "Total characters",
                "value": fmt_number(summary["n_characters"]),
                "alert": False,
            },
            {
                "name": "Distinct characters",
                "value": fmt_number(summary["n_characters_distinct"]),
                "alert": False,
            },
            {
                "name": "Distinct categories",
                "value":
                f"{fmt_number(summary['n_category'])} {help(title='Unicode categories (click for more information)', url='https://en.wikipedia.org/wiki/Unicode_character_property#General_Category')}",
                "alert": False,
            },
            {
                "name": "Distinct scripts",
                "value":
                f"{fmt_number(summary['n_scripts'])} {help(title='Unicode scripts (click for more information)', url='https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode')}",
                "alert": False,
            },
            {
                "name": "Distinct blocks",
                "value":
                f"{fmt_number(summary['n_block_alias'])} {help(title='Unicode blocks (click for more information)', url='https://en.wikipedia.org/wiki/Unicode_block')}",
                "alert": False,
            },
        ],
        name="Characters and Unicode",
        caption=
        "The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
    )

    # --- Tabs --------------------------------------------------------------
    # The character table uses the precomputed total from the summary
    # rather than summing the counts.
    character_table = _unicode_frequency_table(
        summary["character_counts"],
        summary["n_characters"],
        n_freq_table_max,
        name="Most occurring characters",
        anchor_id=f"{varid}character_frequency",
        redact=config.vars.cat.redact,
    )

    citems = [
        _unicode_named_list(
            [character_table],
            name="Characters",
            anchor_id=f"{varid}characters",
        ),
        _unicode_named_list(
            category_items,
            name="Categories",
            anchor_id=f"{varid}categories",
        ),
        _unicode_named_list(
            script_items,
            name="Scripts",
            anchor_id=f"{varid}scripts",
        ),
        _unicode_named_list(
            block_items,
            name="Blocks",
            anchor_id=f"{varid}blocks",
        ),
    ]

    return overview_table, Container(
        citems,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )


def _unicode_frequency_table(counts, total, max_rows, *, name: str,
                             anchor_id: str, redact):
    """Wrap the ``freq_table`` rows computed from ``counts`` in a ``FrequencyTable``.

    ``total`` is forwarded to ``freq_table`` as ``n`` and ``max_rows`` as
    ``max_number_to_print``.
    """
    return FrequencyTable(
        freq_table(
            freqtable=counts,
            n=total,
            max_number_to_print=max_rows,
        ),
        name=name,
        anchor_id=anchor_id,
        redact=redact,
    )


def _unicode_batch_grid(tables, *, name: str, anchor_id: str):
    """Group ``tables`` in a two-per-batch ``batch_grid`` container with subtitles."""
    return Container(
        tables,
        name=name,
        sequence_type="batch_grid",
        anchor_id=anchor_id,
        batch_size=2,
        subtitles=True,
    )


def _unicode_named_list(items, *, name: str, anchor_id: str):
    """Group ``items`` in a ``named_list`` container (one tab section)."""
    return Container(
        items,
        name=name,
        anchor_id=anchor_id,
        sequence_type="named_list",
    )
예제 #3
0
def get_dataset_overview(config: Settings, summary: dict) -> Renderable:
    """Render the dataset-level overview section.

    Args:
        config: Report settings; ``config.report.precision`` is used to
            format the per-type variable counts.
        summary: Dataset summary. Everything is read from
            ``summary["table"]``: ``n_var``, ``n``, ``n_cells_missing``,
            ``p_cells_missing``, ``memory_size``, ``record_size``, ``types``
            and, only when ``n_duplicates`` is present, ``n_duplicates`` and
            ``p_duplicates``.

    Returns:
        A ``Container`` with ``sequence_type="grid"`` holding the
        "Dataset statistics" table followed by the "Variable types" table.
    """
    stats = summary["table"]

    def build_rows(specs):
        # Each spec is (label, formatter, key into the table statistics).
        return [
            {"name": label, "value": formatter(stats[key])}
            for label, formatter, key in specs
        ]

    metric_rows = build_rows([
        ("Number of variables", fmt_number, "n_var"),
        ("Number of observations", fmt_number, "n"),
        ("Missing cells", fmt_number, "n_cells_missing"),
        ("Missing cells (%)", fmt_percent, "p_cells_missing"),
    ])

    # Duplicate statistics are optional and sit between the missing-cell
    # rows and the memory rows.
    if "n_duplicates" in stats:
        metric_rows += build_rows([
            ("Duplicate rows", fmt_number, "n_duplicates"),
            ("Duplicate rows (%)", fmt_percent, "p_duplicates"),
        ])

    metric_rows += build_rows([
        ("Total size in memory", fmt_bytesize, "memory_size"),
        ("Average record size in memory", fmt_bytesize, "record_size"),
    ])

    statistics_table = Table(metric_rows, name="Dataset statistics")

    # One row per variable type, in the mapping's iteration order.
    type_rows = []
    for type_name, count in stats["types"].items():
        type_rows.append({
            "name": str(type_name),
            "value": fmt_numeric(count, precision=config.report.precision),
        })
    types_table = Table(type_rows, name="Variable types")

    return Container(
        [statistics_table, types_table],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )