예제 #1
0
def text_summary(series: pd.Series) -> dict:
    """Summarize value lengths and per-character Unicode properties.

    Args:
        series: series to summarize

    Returns:
        A dict with these keys, in this order:

        * ``"length"``: maps each observed ``len(str(value))`` to the number
          of entries in ``series`` having that length.
        * ``"category_short_values"``: character -> ``category(character)``.
        * ``"category_alias_values"``: character -> ``category_long`` applied
          to that character's short category.
        * ``"script_values"``: character -> ``script(character)``.
        * ``"block_values"``: character -> ``block(character)``.
        * ``"block_alias_values"``: character -> ``block_abbr`` applied to
          that character's block.

        The characters are the keys of ``get_character_counts(series)``.
    """
    # Distribution of the string length of every value.
    value_lengths = series.map(lambda value: len(str(value)))
    length_distribution = value_lengths.value_counts().to_dict()

    # Only the distinct characters matter here; their counts are not used.
    char_counts = get_character_counts(series)

    short_category_of = {char: category(char) for char in char_counts.keys()}
    long_category_of = {
        char: category_long(short_name)
        for char, short_name in short_category_of.items()
    }
    script_of = {char: script(char) for char in char_counts.keys()}
    block_of = {char: block(char) for char in char_counts.keys()}
    block_alias_of = {
        char: block_abbr(block_name)
        for char, block_name in block_of.items()
    }

    return {
        "length": length_distribution,
        "category_short_values": short_category_of,
        "category_alias_values": long_category_of,
        "script_values": script_of,
        "block_values": block_of,
        "block_alias_values": block_alias_of,
    }
예제 #2
0
def _tally_chars_by_label(character_counts, char_to_label: dict) -> tuple:
    """Aggregate per-character counts by the label assigned to each character.

    Args:
        character_counts: object whose ``items()`` yields
            ``(character, count)`` pairs.
        char_to_label: maps every character in ``character_counts`` to a
            label (e.g. its block, script or category name).

    Returns:
        ``(totals, per_label)`` where ``totals`` is a Counter of the summed
        counts per label and ``per_label`` maps each label to a Counter of
        the characters carrying that label. Labels appear in order of first
        occurrence.
    """
    totals: Counter = Counter()
    per_label: dict = {}
    for char, n_char in character_counts.items():
        label = char_to_label[char]
        totals[label] += n_char
        # Accumulate (rather than overwrite) so the per-character breakdown
        # always adds up to the label total.
        per_label.setdefault(label, Counter())[char] += n_char
    return totals, per_label


def unicode_summary_vc(vc: pd.Series) -> dict:
    """Build Unicode block / script / category statistics for a series.

    Args:
        vc: series handed to ``get_character_counts_vc`` (presumably the
            value counts of a text column -- confirm against the caller).

    Returns:
        A dict with the keys ``n_characters_distinct``, ``n_characters``,
        ``character_counts``, ``category_alias_values``,
        ``block_alias_values``, ``block_alias_counts``, ``n_block_alias``,
        ``block_alias_char_counts``, ``script_counts``, ``n_scripts``,
        ``script_char_counts``, ``category_alias_counts``, ``n_category`` and
        ``category_alias_char_counts``.

        The index of ``category_alias_counts`` has underscores replaced by
        spaces; the keys of ``category_alias_char_counts`` are left as
        returned by ``category_long``.
    """
    from tangled_up_in_unicode import (
        block,
        block_abbr,
        category,
        category_long,
        script,
    )

    # Unicode Character Summaries (category and script name)
    character_counts = get_character_counts_vc(vc)

    summary = {
        "n_characters_distinct": len(character_counts),
        "n_characters": np.sum(character_counts.values),
        "character_counts": character_counts,
    }

    # NOTE: iterate ``.keys()`` explicitly; iterating a pandas Series
    # directly would yield its values instead of the characters.
    char_to_block = {key: block(key) for key in character_counts.keys()}
    char_to_category_short = {
        key: category(key) for key in character_counts.keys()
    }
    char_to_script = {key: script(key) for key in character_counts.keys()}

    summary.update({
        "category_alias_values": {
            key: category_long(value)
            for key, value in char_to_category_short.items()
        },
        "block_alias_values": {
            key: block_abbr(value) for key, value in char_to_block.items()
        },
    })

    # Retrieve original distribution, grouped by block alias.
    block_alias_counts, per_block_char_counts = _tally_chars_by_label(
        character_counts, summary["block_alias_values"]
    )
    summary["block_alias_counts"] = counter_to_series(block_alias_counts)
    summary["n_block_alias"] = len(summary["block_alias_counts"])
    summary["block_alias_char_counts"] = {
        k: counter_to_series(v) for k, v in per_block_char_counts.items()
    }

    # Grouped by script.
    script_counts, per_script_char_counts = _tally_chars_by_label(
        character_counts, char_to_script
    )
    summary["script_counts"] = counter_to_series(script_counts)
    summary["n_scripts"] = len(summary["script_counts"])
    summary["script_char_counts"] = {
        k: counter_to_series(v) for k, v in per_script_char_counts.items()
    }

    # Grouped by (long) category name.
    category_alias_counts, per_category_alias_char_counts = (
        _tally_chars_by_label(
            character_counts, summary["category_alias_values"]
        )
    )
    category_alias_series = counter_to_series(category_alias_counts)
    # Normalise the labels exactly once. The ``.str`` accessor is only used
    # when there is at least one label, because an empty index may not be
    # string-typed.
    if len(category_alias_series) > 0:
        category_alias_series.index = category_alias_series.index.str.replace(
            "_", " "
        )
    summary["category_alias_counts"] = category_alias_series
    summary["n_category"] = len(summary["category_alias_counts"])
    summary["category_alias_char_counts"] = {
        k: counter_to_series(v)
        for k, v in per_category_alias_char_counts.items()
    }

    return summary
예제 #3
0
def unicode_summary(series: pd.Series) -> dict:
    """Build Unicode block / script / category statistics for a series.

    Args:
        series: series handed to ``get_character_counts``.

    Returns:
        A dict with the keys ``n_characters`` (the length of the
        character-count series, i.e. one entry per distinct character),
        ``character_counts``, ``category_alias_values``,
        ``block_alias_values``, ``block_alias_counts``,
        ``block_alias_char_counts``, ``script_counts``,
        ``script_char_counts``, ``category_alias_counts``,
        ``category_alias_char_counts``, ``n_category``, ``n_scripts`` and
        ``n_block_alias``.
    """
    # Unicode Character Summaries (category and script name)
    char_counts = get_character_counts(series)
    char_counts_series = counter_to_series(char_counts)

    block_of = {char: block(char) for char in char_counts.keys()}
    short_category_of = {char: category(char) for char in char_counts.keys()}
    script_of = {char: script(char) for char in char_counts.keys()}

    long_category_of = {
        char: category_long(short_name)
        for char, short_name in short_category_of.items()
    }
    block_alias_of = {
        char: block_abbr(block_name) for char, block_name in block_of.items()
    }

    summary = {
        "n_characters": len(char_counts_series),
        "character_counts": char_counts_series,
        "category_alias_values": long_category_of,
        "block_alias_values": block_alias_of,
    }

    def grouped_series(label_of: dict) -> tuple:
        """Return (totals series, {label: per-character series}) for a labelling."""
        totals: Counter = Counter()
        members: dict = {}
        for char, count in char_counts.items():
            label = label_of[char]
            totals[label] += count
            # Each character occurs once in ``char_counts``, so a plain
            # assignment records its full count.
            members.setdefault(label, Counter())[char] = count
        totals_series = counter_to_series(totals)
        member_series = {
            label: counter_to_series(chars) for label, chars in members.items()
        }
        return totals_series, member_series

    # Retrieve original distribution, grouped three different ways.
    (
        summary["block_alias_counts"],
        summary["block_alias_char_counts"],
    ) = grouped_series(block_alias_of)
    (
        summary["script_counts"],
        summary["script_char_counts"],
    ) = grouped_series(script_of)
    (
        summary["category_alias_counts"],
        summary["category_alias_char_counts"],
    ) = grouped_series(long_category_of)

    # Unique counts
    for count_key, source_key in (
        ("n_category", "category_alias_counts"),
        ("n_scripts", "script_counts"),
        ("n_block_alias", "block_alias_counts"),
    ):
        summary[count_key] = len(summary[source_key])

    return summary
            "property": "East Asian Width",
            "standard": unicodedata.east_asian_width,
            "new": unicode_data.east_asian_width,
        },
        {
            "property": "Decomposition",
            "standard": unicodedata.decomposition,
            "new": unicode_data.decomposition,
        },
    ]

    extended = [
        {
            "property": "Category (long)",
            "new":
            lambda x: unicode_data.category_long(unicode_data.category(x)),
        },
        {
            "property": "Script (long)",
            "new": unicode_data.script
        },
        {
            "property": "Script (short)",
            "new": lambda x: unicode_data.script_abbr(unicode_data.script(x)),
        },
        {
            "property": "Block (long)",
            "new": unicode_data.block
        },
        {
            "property": "Block (short)",