def text_summary(series: pd.Series) -> dict:
    """Summarize string lengths and per-character Unicode properties.

    Args:
        series: series to summarize

    Returns:
        dict with the keys ``"length"`` (length of ``str(value)`` mapped to
        the number of values having that length) and, keyed by character,
        ``"category_short_values"``, ``"category_alias_values"``,
        ``"script_values"``, ``"block_values"`` and ``"block_alias_values"``.
    """
    # Distribution of length (every value is stringified before measuring).
    value_lengths = series.map(lambda x: len(str(x)))
    summary = {"length": value_lengths.value_counts().to_dict()}

    # Only the distinct characters are needed here, not their frequencies.
    characters = list(get_character_counts(series).keys())

    # Unicode character summaries; the alias lookups are derived from the
    # raw category / block lookups rather than from the characters directly.
    short_categories = {char: category(char) for char in characters}
    long_categories = {
        char: category_long(short) for char, short in short_categories.items()
    }
    scripts = {char: script(char) for char in characters}
    blocks = {char: block(char) for char in characters}
    block_aliases = {char: block_abbr(name) for char, name in blocks.items()}

    summary.update(
        {
            "category_short_values": short_categories,
            "category_alias_values": long_categories,
            "script_values": scripts,
            "block_values": blocks,
            "block_alias_values": block_aliases,
        }
    )
    return summary
def _count_characters_by_group(character_counts, char_to_group: dict) -> tuple:
    """Aggregate per-character counts by a Unicode property.

    Args:
        character_counts: object whose ``items()`` yields
            ``(character, count)`` pairs.
        char_to_group: maps every character in ``character_counts`` to the
            name of the group (block, script, category, ...) it belongs to.

    Returns:
        A ``(totals, per_group)`` tuple: ``totals`` is the summed count per
        group and ``per_group`` maps each group name to the counts of the
        characters inside it, each converted with ``counter_to_series``.
    """
    totals: Counter = Counter()
    per_group: dict = {group: Counter() for group in char_to_group.values()}
    for char, n_char in character_counts.items():
        group = char_to_group[char]
        totals[group] += n_char
        # Accumulate (rather than assign) so the per-character counts always
        # add up to the group total computed on the line above.
        per_group[group][char] += n_char
    return (
        counter_to_series(totals),
        {group: counter_to_series(counts) for group, counts in per_group.items()},
    )


def unicode_summary_vc(vc: pd.Series) -> dict:
    """Summarize the Unicode characters described by ``vc``.

    Character counts are obtained from ``get_character_counts_vc`` and then
    grouped by block alias, script and category alias.

    Args:
        vc: series passed to ``get_character_counts_vc`` (presumably a
            value-counts series -- confirm against callers).

    Returns:
        dict with the keys ``"n_characters_distinct"``, ``"n_characters"``,
        ``"character_counts"``, ``"category_alias_values"``,
        ``"block_alias_values"``, and for each of the block-alias, script and
        category-alias groupings: the summed counts per group, the number of
        groups, and the per-group character counts.
    """
    from tangled_up_in_unicode import (
        block,
        block_abbr,
        category,
        category_long,
        script,
    )

    # Unicode Character Summaries (category and script name)
    character_counts = get_character_counts_vc(vc)

    summary = {
        "n_characters_distinct": len(character_counts),
        "n_characters": np.sum(character_counts.values),
        "character_counts": character_counts,
    }

    # NOTE: keep the explicit ``.keys()`` calls. ``character_counts`` exposes
    # ``.values`` like a pandas Series, and iterating a Series directly walks
    # its values, not its labels.
    char_to_block = {key: block(key) for key in character_counts.keys()}
    char_to_category_short = {
        key: category(key) for key in character_counts.keys()
    }
    char_to_script = {key: script(key) for key in character_counts.keys()}

    summary.update(
        {
            "category_alias_values": {
                key: category_long(value)
                for key, value in char_to_category_short.items()
            },
            "block_alias_values": {
                key: block_abbr(value) for key, value in char_to_block.items()
            },
        }
    )

    # Retrieve original distribution, grouped by block alias.
    block_alias_counts, block_alias_char_counts = _count_characters_by_group(
        character_counts, summary["block_alias_values"]
    )
    summary["block_alias_counts"] = block_alias_counts
    summary["n_block_alias"] = len(block_alias_counts)
    summary["block_alias_char_counts"] = block_alias_char_counts

    # Grouped by script.
    script_counts, script_char_counts = _count_characters_by_group(
        character_counts, char_to_script
    )
    summary["script_counts"] = script_counts
    summary["n_scripts"] = len(script_counts)
    summary["script_char_counts"] = script_char_counts

    # Grouped by category alias.
    category_alias_counts, category_alias_char_counts = _count_characters_by_group(
        character_counts, summary["category_alias_values"]
    )
    # Replace underscores with spaces in the category labels. This is done
    # once and only when there are labels, so no exception has to be
    # suppressed for the empty case.
    if len(category_alias_counts) > 0:
        category_alias_counts.index = category_alias_counts.index.str.replace(
            "_", " "
        )
    summary["category_alias_counts"] = category_alias_counts
    summary["n_category"] = len(category_alias_counts)
    # NOTE: the keys of this dict keep their original underscores; only the
    # index of ``category_alias_counts`` is rewritten above.
    summary["category_alias_char_counts"] = category_alias_char_counts

    return summary
def unicode_summary(series: pd.Series) -> dict:
    """Summarize the Unicode characters occurring in a series.

    Characters are counted with ``get_character_counts`` and then grouped by
    block alias, script and category alias.

    Args:
        series: series to summarize

    Returns:
        dict with the keys ``"n_characters"`` (number of distinct
        characters), ``"character_counts"``, ``"category_alias_values"``,
        ``"block_alias_values"``, the ``"*_counts"`` / ``"*_char_counts"``
        pairs for block alias, script and category alias, and the group
        counts ``"n_category"``, ``"n_scripts"`` and ``"n_block_alias"``.
    """
    # Unicode Character Summaries (category and script name)
    char_totals = get_character_counts(series)
    char_totals_series = counter_to_series(char_totals)

    # Per-character property lookups.
    block_of = {ch: block(ch) for ch in char_totals.keys()}
    short_category_of = {ch: category(ch) for ch in char_totals.keys()}
    script_of = {ch: script(ch) for ch in char_totals.keys()}

    category_alias_of = {
        ch: category_long(short) for ch, short in short_category_of.items()
    }
    block_alias_of = {ch: block_abbr(name) for ch, name in block_of.items()}

    summary = {
        "n_characters": len(char_totals_series),
        "character_counts": char_totals_series,
        "category_alias_values": category_alias_of,
        "block_alias_values": block_alias_of,
    }

    # One accumulator pair per grouping: total count per group, and the
    # individual character counts inside each group.
    block_totals: Counter = Counter()
    chars_by_block: dict = {alias: Counter() for alias in block_alias_of.values()}
    script_totals: Counter = Counter()
    chars_by_script: dict = {name: Counter() for name in script_of.values()}
    category_totals: Counter = Counter()
    chars_by_category: dict = {
        alias: Counter() for alias in category_alias_of.values()
    }

    # Retrieve original distribution: a single pass fills all three groupings.
    for ch, count in char_totals.items():
        block_alias = block_alias_of[ch]
        block_totals[block_alias] += count
        chars_by_block[block_alias][ch] = count

        script_name = script_of[ch]
        script_totals[script_name] += count
        chars_by_script[script_name][ch] = count

        category_alias = category_alias_of[ch]
        category_totals[category_alias] += count
        chars_by_category[category_alias][ch] += count

    summary["block_alias_counts"] = counter_to_series(block_totals)
    summary["block_alias_char_counts"] = {
        alias: counter_to_series(counts) for alias, counts in chars_by_block.items()
    }

    summary["script_counts"] = counter_to_series(script_totals)
    summary["script_char_counts"] = {
        name: counter_to_series(counts) for name, counts in chars_by_script.items()
    }

    summary["category_alias_counts"] = counter_to_series(category_totals)
    summary["category_alias_char_counts"] = {
        alias: counter_to_series(counts)
        for alias, counts in chars_by_category.items()
    }

    # Unique counts
    summary["n_category"] = len(summary["category_alias_counts"])
    summary["n_scripts"] = len(summary["script_counts"])
    summary["n_block_alias"] = len(summary["block_alias_counts"])

    return summary
}, { "property": "Script (long)", "new": unicode_data.script }, { "property": "Script (short)", "new": lambda x: unicode_data.script_abbr(unicode_data.script(x)), }, { "property": "Block (long)", "new": unicode_data.block }, { "property": "Block (short)", "new": lambda x: unicode_data.block_abbr(unicode_data.block(x)), }, { "property": "East Asian Width (long)", "new": lambda x: unicode_data.east_asian_width_long( unicode_data.east_asian_width(x)), }, { "property": "Bidirectional (long)", "new": lambda x: unicode_data.bidirectional_long( unicode_data.bidirectional(x)), },