def render_real(summary):
    """Assemble the report sections for a real-valued variable.

    Starts from the variables produced by ``render_common(summary)`` and adds
    two entries:

    * ``"top"``: a grid holding the variable info, two summary tables and a
      mini histogram;
    * ``"bottom"``: tabs holding the statistics, the histogram, the common
      values and the extreme values.

    Args:
        summary: mapping with the statistics of the variable; every key read
            below has to be present.

    Returns:
        The template variables, extended with ``"top"`` and ``"bottom"``.
    """
    var_id = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Variables without negative values get the restricted-domain label.
    type_label = (
        "Real number (ℝ<sub>≥0</sub>)" if summary["min"] >= 0 else "Real number (ℝ)"
    )

    def row(label, key, formatter, **extra):
        """Table row reading its value from ``summary[key]``."""
        return {"name": label, "value": summary[key], "fmt": formatter, **extra}

    def alert_row(label, key, formatter):
        """Table row whose alert flag mirrors membership in ``warn_fields``."""
        return row(label, key, formatter, alert=key in summary["warn_fields"])

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        type_label,
        summary["warnings"],
        summary["description"],
    )

    counts_table = Table(
        [
            alert_row("Distinct count", "n_unique", "fmt"),
            alert_row("Unique (%)", "p_unique", "fmt_percent"),
            alert_row("Missing", "n_missing", "fmt"),
            alert_row("Missing (%)", "p_missing", "fmt_percent"),
            alert_row("Infinite", "n_infinite", "fmt"),
            alert_row("Infinite (%)", "p_infinite", "fmt_percent"),
        ]
    )

    values_table = Table(
        [
            row("Mean", "mean", "fmt_numeric", alert=False),
            row("Minimum", "min", "fmt_numeric", alert=False),
            row("Maximum", "max", "fmt_numeric", alert=False),
            alert_row("Zeros", "n_zeros", "fmt"),
            alert_row("Zeros (%)", "p_zeros", "fmt_percent"),
            row("Memory size", "memory_size", "fmt_bytesize", alert=False),
        ]
    )

    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, counts_table, values_table, mini_histo], sequence_type="grid"
    )

    # Bottom: statistics
    quantile_rows = [
        row(label, key, "fmt_numeric")
        for label, key in (
            ("Minimum", "min"),
            ("5-th percentile", "5%"),
            ("Q1", "25%"),
            ("median", "50%"),
            ("Q3", "75%"),
            ("95-th percentile", "95%"),
            ("Maximum", "max"),
            ("Range", "range"),
            ("Interquartile range (IQR)", "iqr"),
        )
    ]
    quantile_statistics = Table(quantile_rows, name="Quantile statistics")

    # The first flag that is set wins; strict variants are checked first.
    monotonicity_flags = (
        ("monotonic_increase_strict", "Strictly increasing"),
        ("monotonic_decrease_strict", "Strictly decreasing"),
        ("monotonic_increase", "Increasing"),
        ("monotonic_decrease", "Decreasing"),
    )
    monotonicity_label = next(
        (label for flag, label in monotonicity_flags if summary[flag]),
        "Not monotonic",
    )

    descriptive_statistics = Table(
        [
            row("Standard deviation", "std", "fmt_numeric"),
            row("Coefficient of variation (CV)", "cv", "fmt_numeric"),
            row("Kurtosis", "kurtosis", "fmt_numeric"),
            row("Mean", "mean", "fmt_numeric"),
            row("Median Absolute Deviation (MAD)", "mad", "fmt_numeric"),
            {
                "name": "Skewness",
                "value": summary["skewness"],
                "fmt": "fmt_numeric",
                # This row carries a "class" key instead of an "alert" flag.
                "class": "alert" if "skewness" in summary["warn_fields"] else "",
            },
            row("Sum", "sum", "fmt_numeric"),
            row("Variance", "variance", "fmt_numeric"),
            {"name": "Monotocity", "value": monotonicity_label, "fmt": "fmt"},
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{var_id}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram (the bin count is one less than the number of edges
    # stored at index 1 of summary["histogram"]).
    hist = Image(
        histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{var_id}histogram",
    )

    # Bottom: frequency tables
    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{var_id}common_values",
        redact=False,
    )

    smallest_values = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{var_id}firstn",
        redact=False,
    )
    largest_values = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{var_id}lastn",
        redact=False,
    )
    extreme_values = Container(
        [smallest_values, largest_values],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{var_id}extreme_values",
    )

    template_variables["bottom"] = Container(
        [statistics, hist, common_values, extreme_values],
        sequence_type="tabs",
        anchor_id=f"{var_id}bottom",
    )

    return template_variables
def get_correlation_items(summary) -> Optional[Renderable]:
    """Build the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix
    diagram is created and, for every key except ``"cramers"``, a
    predictivity diagram as well. Diagrams are paired with an HTML
    description when one is available.

    Args:
        summary: mapping whose ``"correlations"`` entry maps a correlation
            key (one of the keys of ``key_to_data`` below) to the data that
            is handed to the plotting functions.

    Returns:
        A ``Collapse`` wrapping the correlation-matrix and predictivity tabs,
        or ``None`` when ``summary["correlations"]`` is empty.

    Raises:
        KeyError: if a correlation key is not listed in ``key_to_data``.
    """
    correlations = summary["correlations"]
    if not correlations:
        # Nothing to show: skip building any plot or container.
        return None

    correlation_matrix_items: List[Renderable] = []
    predictivity_items: List[Renderable] = []

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. "
    )
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """
    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation. <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the discordant pairs divided by the total number of pairs."""
    phi_k_description = """Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available <a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."""
    cramers_description = """Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association. The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""

    # correlation key -> (vmin passed to the matrix plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in correlations.items():
        vmin, name, description = key_to_data[key]

        # NOTE(review): this anchor id is identical for every correlation key;
        # if anchor ids end up as HTML ids they will collide - confirm against
        # the templates before changing it.
        correlation_matrix_diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt="Correlation matrix",
            anchor_id="correlation_matrix_diagram",
            name="Correlation matrix",
            classes="correlation-diagram",
        )

        # No predictivity diagram is produced for Cramér's V.
        predictivity_diagram = None
        if key != "cramers":
            predictivity_diagram = Image(
                plot.predictivity(item),
                image_format=image_format,
                alt="Predictivity",
                anchor_id="predictivity_diagram",
                name="Predictivity",
                classes="correlation-diagram",
            )

        if description:
            desc = HTML(
                f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
                anchor_id=f"{key}_html",
                classes="correlation-description",
            )

            # The same description object is shown next to both diagrams.
            # NOTE(review): both grids below share anchor_id=key - confirm
            # that this is intended.
            correlation_matrix_items.append(
                Container(
                    [correlation_matrix_diagram, desc],
                    anchor_id=key,
                    name=name,
                    sequence_type="grid",
                )
            )
            if predictivity_diagram is not None:
                predictivity_items.append(
                    Container(
                        [predictivity_diagram, desc],
                        anchor_id=key,
                        name=name,
                        sequence_type="grid",
                    )
                )
        else:
            correlation_matrix_items.append(correlation_matrix_diagram)
            if predictivity_diagram is not None:
                predictivity_items.append(predictivity_diagram)

    correlation_matrix_container = Container(
        correlation_matrix_items,
        sequence_type="tabs",
        name="Correlation matrix",
        anchor_id="correlation_matrix_tab",
    )

    predictivity_container = Container(
        predictivity_items,
        sequence_type="tabs",
        name="Predictivity",
        anchor_id="predictivity_tab",
    )

    corr = Container(
        [correlation_matrix_container, predictivity_container],
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(name="Correlations", anchor_id="correlations", button=btn, item=corr)
def render_categorical(config: Settings, summary: dict) -> dict:
    """Assemble the report sections for a categorical variable.

    Starts from ``render_common(config, summary)`` and adds two entries:

    * ``"top"``: a grid with the variable info, a summary table and a small
      frequency table;
    * ``"bottom"``: tabs with an overview, the categories and, depending on
      the ``config.vars.cat`` switches, words and characters.

    Args:
        config: report settings; ``config.vars.cat`` and ``config.plot`` are
            read.
        summary: mapping with the statistics of the variable.

    Returns:
        The template variables, extended with ``"top"`` and ``"bottom"``.
    """
    var_id = summary["varid"]
    n_obs_cat = config.vars.cat.n_obs
    image_format = config.plot.image_format
    show_words = config.vars.cat.words
    show_characters = config.vars.cat.characters
    show_length = config.vars.cat.length

    template_variables = render_common(config, summary)

    # --- Top ---------------------------------------------------------------
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    top_rows = [
        {
            "name": label,
            "value": formatter(summary[key]),
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct", "n_distinct", fmt),
            ("Distinct (%)", "p_distinct", fmt_percent),
            ("Missing", "n_missing", fmt),
            ("Missing (%)", "p_missing", fmt_percent),
        )
    ]
    top_rows.append(
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        }
    )
    table = Table(top_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=config.vars.cat.redact)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # --- Bottom ------------------------------------------------------------
    frequency_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{var_id}common_values",
        redact=config.vars.cat.redact,
    )

    # Only the first element of the returned pair is used here.
    unique_stats, _ = render_categorical_frequency(config, summary, var_id)

    overview_items = []
    string_items: List[Renderable] = [frequency_table]

    if show_length:
        length_table, length_histo = render_categorical_length(config, summary, var_id)
        overview_items.append(length_table)
        string_items.append(length_histo)

    if show_characters:
        overview_table_char, unitab = render_categorical_unicode(
            config, summary, var_id
        )
        overview_items.append(overview_table_char)

    overview_items.append(unique_stats)

    if not config.vars.cat.redact:
        # zip() stops at the shorter input, so at most five sample rows show.
        row_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = [
            {"name": label, "value": fmt(value), "alert": False}
            for label, value in zip(row_labels, summary["first_rows"])
        ]
        overview_items.append(Table(sample_rows, name="Sample"))

    # The pie chart is shown only when enabled (max_unique > 0) and the
    # number of distinct values does not exceed the limit.
    max_unique = config.plot.pie.max_unique
    if max_unique > 0 and summary["n_distinct"] <= max_unique:
        pie_chart = Image(
            pie_plot(
                config,
                summary["value_counts_without_nan"],
                legend_kws={"loc": "upper right"},
            ),
            image_format=image_format,
            alt="Pie chart",
            name="Pie chart",
            anchor_id=f"{var_id}pie_chart",
        )
        string_items.append(pie_chart)

    overview_tab = Container(
        overview_items,
        name="Overview",
        anchor_id=f"{var_id}overview",
        sequence_type="batch_grid",
        batch_size=len(overview_items),
        titles=False,
    )
    categories_tab = Container(
        string_items,
        name="Categories",
        anchor_id=f"{var_id}string",
        sequence_type="batch_grid",
        batch_size=len(string_items),
    )
    bottom_items = [overview_tab, categories_tab]

    if show_words:
        word_rows = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        common_words = FrequencyTable(
            word_rows,
            name="Common words",
            anchor_id=f"{var_id}cwo",
            redact=config.vars.cat.redact,
        )
        bottom_items.append(
            Container(
                [common_words],
                name="Words",
                anchor_id=f"{var_id}word",
                sequence_type="grid",
            )
        )

    if show_characters:
        bottom_items.append(
            Container(
                [unitab],
                name="Characters",
                anchor_id=f"{var_id}characters",
                sequence_type="grid",
            )
        )

    template_variables["bottom"] = Container(
        bottom_items, sequence_type="tabs", anchor_id=f"{var_id}bottom"
    )

    return template_variables
def render_date(summary):
    """Assemble the report sections for a date variable.

    Unlike the other renderers this one starts from an empty dict instead of
    the output of ``render_common``.

    Args:
        summary: mapping with the statistics of the variable.

    Returns:
        A dict with a ``"top"`` grid (info, two tables, mini histogram) and a
        ``"bottom"`` tab container holding the full histogram.
    """
    var_id = summary["varid"]
    # TODO: render common?
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    def quiet_row(label, key, formatter):
        """Table row that never raises an alert."""
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Date",
        summary["warnings"],
        summary["description"],
    )

    counts_table = Table(
        [
            quiet_row("Distinct count", "n_unique", "fmt"),
            quiet_row("Unique (%)", "p_unique", "fmt_percent"),
            quiet_row("Missing", "n_missing", "fmt"),
            quiet_row("Missing (%)", "p_missing", "fmt_percent"),
            quiet_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    range_table = Table(
        [
            quiet_row("Minimum", "min", "fmt"),
            quiet_row("Maximum", "max", "fmt"),
        ]
    )

    mini_histo = Image(
        mini_histogram(*summary["histogram"], date=True),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, counts_table, range_table, mini_histo], sequence_type="grid"
    )

    # Bottom: the bin count is one less than the number of edges stored at
    # index 1 of summary["histogram"].
    full_histogram = Image(
        histogram(*summary["histogram"], date=True),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{var_id}histogram",
    )

    template_variables["bottom"] = Container(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
def render_real(summary):
    """Assemble the report sections for a real-valued variable.

    Starts from the variables produced by ``render_common(summary)`` and adds
    two entries:

    * ``"top"``: a grid holding the variable info, two summary tables and a
      mini histogram;
    * ``"bottom"``: tabs holding the statistics, the histogram(s), the common
      values and the extreme values. A second, variable-bin histogram is
      added when ``"histogram_bins_bayesian_blocks"`` is present in
      ``summary``.

    Args:
        summary: mapping with the statistics of the variable.

    Returns:
        The template variables, extended with ``"top"`` and ``"bottom"``.
    """
    var_id = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Variables without negative values get the restricted-domain label.
    type_label = (
        "Real number (ℝ<sub>≥0</sub>)" if summary["min"] >= 0 else "Real number (ℝ)"
    )

    def row(label, key, formatter, **extra):
        """Table row reading its value from ``summary[key]``."""
        return {"name": label, "value": summary[key], "fmt": formatter, **extra}

    def alert_row(label, key, formatter):
        """Table row whose alert flag mirrors membership in ``warn_fields``."""
        return row(label, key, formatter, alert=key in summary["warn_fields"])

    # Top
    info = VariableInfo(
        summary["varid"], summary["varname"], type_label, summary["warnings"]
    )

    counts_table = Table(
        [
            alert_row("Distinct count", "n_unique", "fmt"),
            alert_row("Unique (%)", "p_unique", "fmt_percent"),
            alert_row("Missing", "n_missing", "fmt"),
            alert_row("Missing (%)", "p_missing", "fmt_percent"),
            alert_row("Infinite", "n_infinite", "fmt"),
            alert_row("Infinite (%)", "p_infinite", "fmt_percent"),
        ]
    )

    values_table = Table(
        [
            row("Mean", "mean", "fmt", alert=False),
            row("Minimum", "min", "fmt", alert=False),
            row("Maximum", "max", "fmt", alert=False),
            alert_row("Zeros", "n_zeros", "fmt"),
            alert_row("Zeros (%)", "p_zeros", "fmt_percent"),
            row("Memory size", "memory_size", "fmt_bytesize", alert=False),
        ]
    )

    # Number of bins used by both fixed-size histograms below.
    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, counts_table, values_table, mini_histo], sequence_type="grid"
    )

    # Bottom: statistics
    quantile_rows = [
        row(label, key, "fmt_numeric")
        for label, key in (
            ("Minimum", "min"),
            ("5-th percentile", "5%"),
            ("Q1", "25%"),
            ("median", "50%"),
            ("Q3", "75%"),
            ("95-th percentile", "95%"),
            ("Maximum", "max"),
            ("Range", "range"),
            ("Interquartile range (IQR)", "iqr"),
        )
    ]
    quantile_statistics = Table(quantile_rows, name="Quantile statistics")

    descriptive_statistics = Table(
        [
            row("Standard deviation", "std", "fmt_numeric"),
            row("Coefficient of variation (CV)", "cv", "fmt_numeric"),
            row("Kurtosis", "kurtosis", "fmt_numeric"),
            row("Mean", "mean", "fmt_numeric"),
            row("Median Absolute Deviation (MAD)", "mad", "fmt_numeric"),
            {
                "name": "Skewness",
                "value": summary["skewness"],
                "fmt": "fmt_numeric",
                # This row carries a "class" key instead of an "alert" flag.
                "class": "alert" if "skewness" in summary["warn_fields"] else "",
            },
            row("Sum", "sum", "fmt_numeric"),
            row("Variance", "variance", "fmt_numeric"),
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{var_id}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tabs (the fixed-size one is always present)
    histogram_tabs = [
        Image(
            histogram(summary["histogram_data"], summary, histogram_bins),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
            name="Histogram",
            anchor_id=f"{var_id}histogram",
        )
    ]

    # Bottom: frequency tables
    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{var_id}common_values",
    )

    smallest_values = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{var_id}firstn",
    )
    largest_values = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{var_id}lastn",
    )
    extreme_values = Container(
        [smallest_values, largest_values],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{var_id}extreme_values",
    )

    if "histogram_bins_bayesian_blocks" in summary:
        bayesian_bins = summary["histogram_bins_bayesian_blocks"]
        caption_template = '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
        histogram_tabs.append(
            Image(
                histogram(summary["histogram_data"], summary, bayesian_bins),
                image_format=image_format,
                alt="Histogram",
                caption=caption_template.format(fmt_array(bayesian_bins, threshold=5)),
                name="Dynamic Histogram",
                anchor_id=f"{var_id}dynamic_histogram",
            )
        )

    histograms = Container(
        histogram_tabs,
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id=f"{var_id}histograms",
    )

    template_variables["bottom"] = Container(
        [statistics, histograms, common_values, extreme_values],
        sequence_type="tabs",
        anchor_id=f"{var_id}bottom",
    )

    return template_variables
def render_categorical(summary):
    """Assemble the report sections for a categorical variable.

    Starts from the variables produced by ``render_common(summary)`` and adds
    two entries:

    * ``"top"``: a grid with the overview, a summary table and a small
      frequency table;
    * ``"bottom"``: tabs with the common values and, when the
      ``check_composition`` setting is enabled, composition/length tables
      and a length histogram.

    Args:
        summary: mapping with the statistics of the variable.

    Returns:
        The template variables, extended with ``"top"`` and ``"bottom"``.
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    def anchor(suffix):
        """Anchor id made of the variable id followed by ``suffix``."""
        return f"{summary['varid']}{suffix}"

    def plain_row(label, value, formatter):
        """Table row without any alert marker."""
        return {"name": label, "value": value, "fmt": formatter}

    # Top
    # Element composition
    info = Overview(
        summary["varid"], summary["varname"], "Categorical", summary["warnings"]
    )

    top_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "class": "alert" if key in summary["warn_fields"] else "",
        }
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    # The memory-size row carries no "class" key at all.
    top_rows.append(plain_row("Memory size", summary["memory_size"], "fmt_bytesize"))
    table = Table(top_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=anchor("common_values"),
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    # NOTE(review): the extent of this branch was reconstructed from a
    # whitespace-mangled source; the length histogram is assumed to belong
    # to it - confirm against the original file.
    if check_compositions:
        composition_table = Table(
            [
                plain_row("Contains chars", summary["composition"]["chars"], "fmt"),
                plain_row("Contains digits", summary["composition"]["digits"], "fmt"),
                plain_row(
                    "Contains whitespace", summary["composition"]["spaces"], "fmt"
                ),
                plain_row(
                    "Contains non-words", summary["composition"]["non-words"], "fmt"
                ),
            ],
            name="Composition",
            anchor_id=anchor("composition"),
        )

        length_table = Table(
            [
                plain_row("Max length", summary["max_length"], "fmt_numeric"),
                plain_row("Mean length", summary["mean_length"], "fmt_numeric"),
                plain_row("Min length", summary["min_length"], "fmt_numeric"),
            ],
            name="Length",
            anchor_id=anchor("lengthstats"),
        )

        items.append(
            Sequence(
                [composition_table, length_table],
                anchor_id=anchor("tbl"),
                name="Composition",
                sequence_type="grid",
            )
        )

        histogram_bins = 10

        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            alt="Scatter",
            name="Length",
            anchor_id=anchor("length"),
        )
        items.append(length_histogram)

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id=anchor("bottom"),
    )

    return template_variables
def render_boolean(config: Settings, summary: dict) -> dict:
    """Assemble the report sections for a boolean variable.

    Starts from ``render_common(config, summary)`` and adds two entries:

    * ``"top"``: a grid with the variable info, a summary table and a small
      frequency table;
    * ``"bottom"``: tabs with the common values and, when enabled in
      ``config.plot.cat_freq``, a category frequency plot.

    Args:
        config: report settings; ``config.vars.bool`` and ``config.plot`` are
            read.
        summary: mapping with the statistics of the variable.

    Returns:
        The template variables, extended with ``"top"`` and ``"bottom"``.
    """
    var_id = summary["varid"]
    n_obs_bool = config.vars.bool.n_obs
    image_format = config.plot.image_format

    # Prepare variables
    template_variables = render_common(config, summary)

    # Element composition
    info = VariableInfo(
        anchor_id=summary["varid"],
        alerts=summary["alerts"],
        var_type="Boolean",
        var_name=summary["varname"],
        description=summary["description"],
    )

    summary_rows = [
        {
            "name": label,
            "value": formatter(summary[key]),
            "alert": key in summary["alert_fields"],
        }
        for label, key, formatter in (
            ("Distinct", "n_distinct", fmt),
            ("Distinct (%)", "p_distinct", fmt_percent),
            ("Missing", "n_missing", fmt),
            ("Missing (%)", "p_missing", fmt_percent),
        )
    ]
    summary_rows.append(
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        }
    )
    table = Table(summary_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["n"],
        max_number_to_print=n_obs_bool,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=False)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{var_id}frequency_table",
        redact=False,
    )
    items: List[Renderable] = [common_values]

    # The plot is added only when it is switched on and max_unique is positive.
    show = config.plot.cat_freq.show
    max_unique = config.plot.cat_freq.max_unique

    if show and max_unique > 0:
        frequency_plot = Image(
            cat_frequency_plot(
                config,
                summary["value_counts_without_nan"],
            ),
            image_format=image_format,
            alt="Category Frequency Plot",
            name="Category Frequency Plot",
            anchor_id=f"{var_id}cat_frequency_plot",
        )
        items.append(frequency_plot)

    template_variables["bottom"] = Container(
        items, sequence_type="tabs", anchor_id=f"{var_id}bottom"
    )

    return template_variables
def get_correlation_items(summary) -> Optional[Renderable]:
    """Build the collapsible correlations section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix
    diagram is created; it is paired with an HTML description when one is
    available. The section title, tab name and button label are Chinese
    user-facing strings and are emitted as-is.

    Args:
        summary: mapping whose ``"correlations"`` entry maps a correlation
            key (one of the keys of ``key_to_data`` below) to the data that
            is handed to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the correlation tabs, or ``None`` when
        ``summary["correlations"]`` is empty.

    Raises:
        KeyError: if a correlation key is not listed in ``key_to_data``.
    """
    correlations = summary["correlations"]
    if not correlations:
        # Nothing to show: skip building any plot or container.
        return None

    items: List[Renderable] = []

    # The sentence describing the value range used to appear twice in this
    # description (the second copy with a stray space inside a word); it is
    # kept only once.
    pearson_description = (
        "皮尔逊相关系数 ( <em>r</em> ) 是衡量两个变量之间线性相关关系的指标, "
        "它的值在-1和+1之间,-1表示完全负线性相关,0表示没有线性"
        "相关,1表示完全正线性相关。 "
        "此外,在两个变量的位置和比例分别变化的情况下<em>r</em>"
        "是不变的,这意味着对于一个线性函数来说,与x轴的角度不会影响 <em>r</em>.<br /><br />要计算两个变量X和Y的 <em>r</em>"
        " 就要用X和Y的协方差除以它们的标准差的乘积。"
    )
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """
    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation. <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the discordant pairs divided by the total number of pairs."""
    phi_k_description = """Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available <a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."""
    cramers_description = """Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association. The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""

    # correlation key -> (vmin passed to the matrix plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in correlations.items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if not description:
            # Without a description the diagram is shown on its own.
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Container([diagram, desc], anchor_id=key, name=name, sequence_type="grid")
        )

    corr = Container(
        items,
        sequence_type="tabs",
        name="相关性列表",
        anchor_id="correlations_tab",
    )

    btn = ToggleButton(
        "切换相关性描述",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(name="相关性", anchor_id="correlations", button=btn, item=corr)
def render_complex(config: Settings, summary: dict) -> dict:
    """Assemble the report sections for a complex-valued variable.

    Args:
        config: report settings; ``config.plot.image_format`` and
            ``config.report.precision`` are read.
        summary: mapping with the statistics of the variable.

    Returns:
        A dict with a ``"top"`` grid (info, two tables, an empty HTML
        placeholder) and a ``"bottom"`` tab container holding the scatter
        plot.
    """
    var_id = summary["varid"]
    template_variables = {}
    image_format = config.plot.image_format

    def numeric(key):
        """Format ``summary[key]`` with the report-wide precision."""
        return fmt_numeric(summary[key], precision=config.report.precision)

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Complex number (ℂ)",
        summary["alerts"],
        summary["description"],
    )

    counts_table = Table(
        [
            {"name": label, "value": formatter(summary[key])}
            for label, key, formatter in (
                ("Distinct", "n_distinct", fmt),
                ("Distinct (%)", "p_distinct", fmt_percent),
                ("Missing", "n_missing", fmt),
                ("Missing (%)", "p_missing", fmt_percent),
                ("Memory size", "memory_size", fmt_bytesize),
            )
        ]
    )

    values_table = Table(
        [
            {"name": "Mean", "value": numeric("mean")},
            {"name": "Minimum", "value": numeric("min")},
            {"name": "Maximum", "value": numeric("max")},
            {"name": "Zeros", "value": numeric("n_zeros")},
            {"name": "Zeros (%)", "value": fmt_percent(summary["p_zeros"])},
        ]
    )

    # Empty element occupying the fourth slot of the grid.
    placeholder = HTML("")

    template_variables["top"] = Container(
        [info, counts_table, values_table, placeholder], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(config, summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{var_id}scatter",
    )

    template_variables["bottom"] = Container(
        [scatter], sequence_type="tabs", anchor_id=summary["varid"]
    )

    return template_variables
def render_date(summary):
    """Assemble the report sections for a date variable.

    Unlike the other renderers this one starts from an empty dict instead of
    the output of ``render_common``.

    Args:
        summary: mapping with the statistics of the variable.

    Returns:
        A dict with a ``"top"`` grid (info, two tables, mini histogram) and a
        ``"bottom"`` tab sequence holding the full histogram.
    """
    # TODO: render common?
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    def quiet_row(label, key, formatter):
        """Table row that never raises an alert."""
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    # Top
    info = VariableInfo(
        summary["varid"], summary["varname"], "Date", summary["warnings"]
    )

    counts_table = Table(
        [
            quiet_row("Distinct count", "n_unique", "fmt"),
            quiet_row("Unique (%)", "p_unique", "fmt_percent"),
            quiet_row("Missing", "n_missing", "fmt"),
            quiet_row("Missing (%)", "p_missing", "fmt_percent"),
            quiet_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    range_table = Table(
        [
            quiet_row("Minimum", "min", "fmt"),
            quiet_row("Maximum", "max", "fmt"),
        ]
    )

    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, counts_table, range_table, mini_histo], sequence_type="grid"
    )

    # Bottom
    full_histogram = Image(
        histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{summary['varid']}histogram",
    )

    template_variables["bottom"] = Sequence(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
def render_path(summary):
    """Extend the categorical rendering with path-specific frequency tabs.

    Starts from ``render_categorical(summary)``, relabels the variable type as
    "Path", and appends frequency tables for the full value and for each path
    part. A file-size histogram is appended when ``"file_sizes"`` is present
    in ``summary``.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        The template-variable dict from ``render_categorical``, mutated in
        place.
    """
    var_id = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    img_format = config["plot"]["image_format"].get(str)

    template_variables = render_categorical(summary)

    # One frequency table per path component, stored next to the common rows.
    for part in ("name", "parent", "suffix", "stem"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Path"

    # TODO: show summary['common_prefix'] in the top table. This needs
    # colspan=2 and a nowrap cell with ellipsis overflow (max-width: 600px).

    # Bottom
    frequency_tabs = [
        FrequencyTable(
            template_variables[rows_key],
            name=title,
            anchor_id=f"{var_id}{title.lower()}_frequency",
        )
        for title, rows_key in (
            ("Full", "freq_table_rows"),
            ("Stem", "freqtable_stem"),
            ("Name", "freqtable_name"),
            ("Suffix", "freqtable_suffix"),
            ("Parent", "freqtable_parent"),
        )
    ]
    template_variables["bottom"].content["items"].extend(frequency_tabs)

    if "file_sizes" in summary:
        size_histogram = Image(
            histogram(summary["file_sizes"], summary, summary["histogram_bins"]),
            image_format=img_format,
            alt="File size",
            caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={summary['histogram_bins']})",
            name="File size",
            anchor_id=f"{var_id}file_size_histogram",
        )
        # TODO: move this into the sequence item
        template_variables["bottom"].content["items"].append(size_histogram)

    return template_variables
def render_complex(summary):
    """Assemble the report section for a complex-valued variable.

    The image format is read from the module-level ``config``. Table rows
    carry the name of their formatter in the ``"fmt"`` entry.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        A dict with a ``"top"`` grid sequence and a ``"bottom"`` tabs sequence
        that holds the scatter plot.
    """
    var_id = summary["varid"]
    img_format = config["plot"]["image_format"].get(str)

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
    )
    counts_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
            ("Memory size", "memory_size", "fmt_bytesize"),
        )
    ])
    stats_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in (
            ("Mean", "mean", "fmt"),
            ("Minimum", "min", "fmt"),
            ("Maximum", "max", "fmt"),
            ("Zeros", "n_zeros", "fmt"),
            ("Zeros (%)", "p_zeros", "fmt_percent"),
        )
    ])
    # Empty HTML element filling the fourth grid cell.
    filler = HTML("")
    top = Sequence(
        [header, counts_table, stats_table, filler], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=img_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{var_id}scatter",
    )
    bottom = Sequence([scatter], sequence_type="tabs", anchor_id=summary["varid"])

    return {"top": top, "bottom": bottom}
def render_complex(summary):
    """Assemble the report section for a complex-valued variable.

    Row labels are Chinese user-facing strings. The image format is read from
    the module-level ``config``.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        A dict with a ``"top"`` grid container and a ``"bottom"`` tabs
        container that holds the scatter plot.
    """
    var_id = summary["varid"]
    img_format = config["plot"]["image_format"].get(str)

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
        summary["description"],
    )
    counts_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in (
            ("唯一值计数", "n_unique", "fmt"),
            ("唯一值比例 (%)", "p_unique", "fmt_percent"),
            ("缺失值", "n_missing", "fmt"),
            ("缺失值比例(%)", "p_missing", "fmt_percent"),
            ("内存占用", "memory_size", "fmt_bytesize"),
        )
    ])
    stats_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in (
            ("均数", "mean", "fmt_numeric"),
            ("最小值", "min", "fmt_numeric"),
            ("最大值", "max", "fmt_numeric"),
            ("零值", "n_zeros", "fmt_numeric"),
            ("零值比例 (%)", "p_zeros", "fmt_percent"),
        )
    ])
    # Empty HTML element filling the fourth grid cell.
    filler = HTML("")
    top = Container(
        [header, counts_table, stats_table, filler], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=img_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{var_id}scatter",
    )
    bottom = Container(
        [scatter], sequence_type="tabs", anchor_id=summary["varid"]
    )

    return {"top": top, "bottom": bottom}
def render_image(summary):
    """Extend the file rendering with an "Image" tab.

    Starts from ``render_file(summary)``, relabels the variable type as
    "Image", and appends a tab holding the dimension statistics. When
    ``"exif_keys_counts"`` is present in ``summary``, an Exif section is
    placed before the dimensions.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        The template-variable dict from ``render_file``, mutated in place.
    """
    var_id = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_file(summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Image"

    # Bottom
    # Overview grid: one table each for width, height and area, with
    # min / median / max rows. Presumably all values are in pixels (this came
    # from the note that used to sit here as a bare string); TODO confirm.
    dimension_tables = [
        Table([
            {
                "name": f"{stat} {dimension}",
                "value": summary[f"{stat.lower()}_{dimension}"],
                "fmt": "fmt_numeric",
                "alert": False,
            }
            for stat in ("Min", "Median", "Max")
        ])
        for dimension in ("width", "height", "area")
    ]
    overview = Container(
        dimension_tables,
        anchor_id=f"{var_id}tbl",
        name="Overview",
        sequence_type="grid",
    )
    size_scatter = Image(
        scatter_series(summary["image_dimensions"]),
        image_format=config["plot"]["image_format"].get(str),
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{var_id}image_dimensions_scatter",
    )
    common_sizes = FrequencyTable(
        freq_table(
            freqtable=summary["image_dimensions"].value_counts(),
            n=summary["n"],
            max_number_to_print=max_rows,
        ),
        name="Common values",
        anchor_id=f"{var_id}image_dimensions_frequency",
        redact=False,
    )
    dimensions_section = Container(
        [overview, size_scatter, common_sizes],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{var_id}image_dimensions",
    )

    tab_sections = []
    if "exif_keys_counts" in summary:
        exif_tables = [
            FrequencyTable(
                freq_table(
                    freqtable=pd.Series(summary["exif_keys_counts"]),
                    n=summary["n"],
                    max_number_to_print=max_rows,
                ),
                name="Exif keys",
                anchor_id=f"{var_id}exif_keys",
                redact=redact,
            )
        ]
        for exif_key, exif_counts in summary["exif_data"].items():
            # The key overview is already rendered as the first table.
            if exif_key != "exif_keys":
                exif_tables.append(
                    FrequencyTable(
                        freq_table(
                            freqtable=exif_counts,
                            n=summary["n"],
                            max_number_to_print=max_rows,
                        ),
                        name=exif_key,
                        anchor_id=f"{var_id}_exif_{exif_key}",
                        redact=redact,
                    )
                )
        tab_sections.append(
            Container(
                exif_tables,
                anchor_id=f"{var_id}exif_data",
                name="Exif data",
                sequence_type="named_list",
            )
        )
    tab_sections.append(dimensions_section)

    image_tab = Container(
        tab_sections,
        name="Image",
        sequence_type="tabs",
        anchor_id=f"{var_id}image",
    )
    template_variables["bottom"].content["items"].append(image_tab)

    return template_variables
def render_count(summary):
    """Assemble the report section for a count variable.

    Starts from ``render_common(summary)`` and adds a ``"top"`` grid (info,
    two tables, mini histogram) and a ``"bottom"`` tabs container (histogram,
    common values, extreme values). Row and tab labels are Chinese
    user-facing strings. The image format is read from the module-level
    ``config``.

    Only summary keys that end up in the rendered output are read. No
    quantile or descriptive-statistics table is built, because nothing
    renders one.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.
            ``summary["histogram"]`` is unpacked into the histogram helpers,
            and the length of its second element, minus one, is shown as the
            bin count.

    Returns:
        The template-variable dict from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
        summary["description"],
    )
    table1 = Table([
        {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
        for label, key, formatter in (
            ("唯一值计数", "n_unique", "fmt"),
            ("唯一值 (%)", "p_unique", "fmt_percent"),
            ("缺失值", "n_missing", "fmt"),
            ("缺失值比例 (%)", "p_missing", "fmt_percent"),
        )
    ])
    table2 = Table([
        {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
        for label, key, formatter in (
            ("均数", "mean", "fmt_numeric"),
            ("最小值", "min", "fmt_numeric"),
            ("最大值", "max", "fmt_numeric"),
            ("零值", "n_zeros", "fmt"),
            ("零值 (%)", "p_zeros", "fmt_percent"),
            ("内存占用", "memory_size", "fmt_bytesize"),
        )
    ])
    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )
    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    # NOTE(review): these anchor ids are not prefixed with the variable id,
    # unlike the other renderers in this file. Confirm they cannot collide
    # when several count variables appear in one report.
    seqs = [
        Image(
            histogram(*summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id="histogram",
        )
    ]
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id="common_values",
        redact=False,
    )
    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id="firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id="lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="极值",
        anchor_id="extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(
                seqs, sequence_type="tabs", name="直方图", anchor_id="histograms"
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
def render_count(summary):
    """Assemble the report section for a count variable.

    Starts from ``render_common(summary)`` and adds a ``"top"`` grid (info,
    two tables, mini histogram) and a ``"bottom"`` tabs container (histogram
    or histograms, common values, extreme values). The image format is read
    from the module-level ``config``.

    Only summary keys that end up in the rendered output are read. No
    quantile or descriptive-statistics table is built, because nothing
    renders one.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.
            When ``"histogram_bins_bayesian_blocks"`` is present, a second
            histogram with variable-size bins is added.

    Returns:
        The template-variable dict from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
    )
    table1 = Table([
        {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ])
    table2 = Table([
        {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
        for label, key, formatter in (
            ("Mean", "mean", "fmt"),
            ("Minimum", "min", "fmt"),
            ("Maximum", "max", "fmt"),
            ("Zeros", "n_zeros", "fmt"),
            ("Zeros (%)", "p_zeros", "fmt_percent"),
            ("Memory size", "memory_size", "fmt_bytesize"),
        )
    ])

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Mini histogram",
    )
    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    # NOTE(review): these anchor ids are not prefixed with the variable id,
    # unlike the other renderers in this file. Confirm they cannot collide
    # when several count variables appear in one report.
    seqs = [
        Image(
            histogram(
                summary["histogram_data"], summary, summary["histogram_bins"]
            ),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={summary['histogram_bins']})",
            name="Histogram",
            anchor_id="histogram",
        )
    ]
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id="common_values",
    )
    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id="firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id="lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id="extreme_values",
    )

    if "histogram_bins_bayesian_blocks" in summary:
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption='<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'.format(
                fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
            ),
            name="Dynamic Histogram",
            anchor_id="dynamic_histogram",
        )
        seqs.append(histo_dyn)

    template_variables["bottom"] = Container(
        [
            Container(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id="histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
def render_categorical(summary):
    """Assemble the report section for a categorical variable.

    Starts from ``render_common(summary)`` and adds a ``"top"`` grid (info,
    overview table, small frequency table) and a ``"bottom"`` tabs container.
    The bottom always holds the common-values table. A "Length" tab and a
    "Unicode" tab are added when the ``vars.cat.length`` and
    ``vars.cat.unicode`` settings of the module-level ``config`` are truthy.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        The template-variable dict from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """

    def counts_table(counts, title, anchor, limit):
        # Frequency table whose total is the sum of the given counts.
        return FrequencyTable(
            freq_table(
                freqtable=counts,
                n=counts.sum(),
                max_number_to_print=limit,
            ),
            name=title,
            anchor_id=anchor,
        )

    var_id = summary["varid"]
    mini_rows_limit = config["vars"]["cat"]["n_obs"].get(int)
    img_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=mini_rows_limit,
    )

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )
    overview_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    # Memory size never raises an alert.
    overview_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    overview = Table(overview_rows)
    mini_table = FrequencyTableSmall(mini_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Container(
        [header, overview, mini_table], sequence_type="grid"
    )

    # Bottom
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{var_id}common_values",
        )
    ]

    if config["vars"]["cat"]["length"].get(bool):
        length_stats = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Max length", "max_length"),
                    ("Median length", "median_length"),
                    ("Mean length", "mean_length"),
                    ("Min length", "min_length"),
                )
            ],
            name="Length",
            anchor_id=f"{var_id}lengthstats",
        )
        length_plot = Image(
            histogram(summary["length"], summary, 10),
            image_format=img_format,
            alt="Scatter",
            name="Length",
            anchor_id=f"{var_id}length",
        )
        tabs.append(
            Container(
                [length_plot, length_stats],
                anchor_id=f"{var_id}tbl",
                name="Length",
                sequence_type="grid",
            )
        )

    if config["vars"]["cat"]["unicode"].get(bool):
        max_rows = config["n_freq_table_max"].get(int)

        category_tabs = [
            counts_table(
                summary["category_alias_counts"],
                "Most occurring categories",
                f"{var_id}category_long_values",
                max_rows,
            )
        ]
        for alias, alias_counts in summary["category_alias_char_counts"].items():
            # Underscores become spaces in both the title and the anchor id.
            spaced_alias = alias.replace("_", " ")
            category_tabs.append(
                counts_table(
                    alias_counts,
                    f"Most frequent {spaced_alias} characters",
                    f"{var_id}category_alias_values_{spaced_alias}",
                    max_rows,
                )
            )

        script_tabs = [
            counts_table(
                summary["script_counts"],
                "Most occurring scripts",
                f"{var_id}script_values",
                max_rows,
            )
        ]
        for script, script_counts in summary["script_char_counts"].items():
            script_tabs.append(
                counts_table(
                    script_counts,
                    f"Most frequent {script} characters",
                    f"{var_id}script_values_{script}",
                    max_rows,
                )
            )

        block_tabs = [
            counts_table(
                summary["block_alias_counts"],
                "Most occurring blocks",
                f"{var_id}block_alias_values",
                max_rows,
            )
        ]
        for block, block_counts in summary["block_alias_char_counts"].items():
            block_tabs.append(
                counts_table(
                    block_counts,
                    f"Most frequent {block} characters",
                    f"{var_id}block_alias_values_{block}",
                    max_rows,
                )
            )

        unicode_overview = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Unique unicode characters", "n_characters"),
                    (
                        'Unique unicode categories (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
                        "n_category",
                    ),
                    (
                        'Unique unicode scripts (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
                        "n_scripts",
                    ),
                    (
                        'Unique unicode blocks (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
                        "n_block_alias",
                    ),
                )
            ],
            name="Overview of Unicode Properties",
            caption="The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
        )
        unicode_sections = [
            Container(
                [unicode_overview],
                anchor_id=f"{var_id}character_overview",
                name="Overview",
                sequence_type="list",
            ),
            Container(
                [
                    counts_table(
                        summary["character_counts"],
                        "Most occurring characters",
                        f"{var_id}character_frequency",
                        max_rows,
                    )
                ],
                name="Characters",
                anchor_id=f"{var_id}characters",
                sequence_type="named_list",
            ),
            Container(
                category_tabs,
                name="Categories",
                anchor_id=f"{var_id}categories",
                sequence_type="named_list",
            ),
            Container(
                script_tabs,
                name="Scripts",
                anchor_id=f"{var_id}scripts",
                sequence_type="named_list",
            ),
            Container(
                block_tabs,
                name="Blocks",
                anchor_id=f"{var_id}blocks",
                sequence_type="named_list",
            ),
        ]
        tabs.append(
            Container(
                unicode_sections,
                name="Unicode",
                sequence_type="tabs",
                anchor_id=f"{var_id}unicode",
            )
        )

    template_variables["bottom"] = Container(
        tabs, sequence_type="tabs", anchor_id=f"{var_id}bottom"
    )

    return template_variables
def render_categorical(summary):
    """Assemble the report section for a categorical variable.

    Starts from ``render_common(summary)`` and adds a ``"top"`` grid (info,
    overview table, small frequency table) and a ``"bottom"`` tabs sequence.
    The bottom always holds the common-values table. When the
    ``vars.cat.check_composition`` setting of the module-level ``config`` is
    truthy, a "Length" grid and a "Characters" tab group are added.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        The template-variable dict from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """
    var_id = summary["varid"]
    mini_rows_limit = config["vars"]["cat"]["n_obs"].get(int)
    img_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=mini_rows_limit,
    )

    # Top
    header = VariableInfo(
        var_id, summary["varname"], "Categorical", summary["warnings"]
    )
    overview_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    # Memory size never raises an alert.
    overview_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    overview = Table(overview_rows)
    mini_table = FrequencyTableSmall(mini_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence(
        [header, overview, mini_table], sequence_type="grid"
    )

    # Bottom
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{var_id}common_values",
        )
    ]

    if config["vars"]["cat"]["check_composition"].get(bool):
        length_stats = Table(
            [
                {
                    "name": label,
                    "value": summary[key],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for label, key in (
                    ("Max length", "max_length"),
                    ("Mean length", "mean_length"),
                    ("Min length", "min_length"),
                )
            ],
            name="Length",
            anchor_id=f"{var_id}lengthstats",
        )
        length_plot = Image(
            histogram(summary["length"], summary, 10),
            image_format=img_format,
            alt="Scatter",
            name="Length",
            anchor_id=f"{var_id}length",
        )
        tabs.append(
            Sequence(
                [length_plot, length_stats],
                anchor_id=f"{var_id}tbl",
                name="Length",
                sequence_type="grid",
            )
        )

        max_rows = config["n_freq_table_max"].get(int)

        # One tab per character property. Each tab counts how often every
        # distinct value of that property occurs.
        character_tabs = []
        for title, values_key, anchor_suffix in (
            ("Categories", "category_alias_values", "category_long_values"),
            ("Scripts", "script_values", "script_values"),
            ("Blocks", "block_alias_values", "block_alias_values"),
        ):
            occurrences = pd.Series(summary[values_key]).value_counts()
            character_tabs.append(
                FrequencyTable(
                    freq_table(
                        freqtable=occurrences,
                        n=occurrences.sum(),
                        max_number_to_print=max_rows,
                    ),
                    name=title,
                    anchor_id=f"{var_id}{anchor_suffix}",
                )
            )

        tabs.append(
            Sequence(
                character_tabs,
                name="Characters",
                sequence_type="tabs",
                anchor_id=f"{var_id}characters",
            )
        )

    template_variables["bottom"] = Sequence(
        tabs, sequence_type="tabs", anchor_id=f"{var_id}bottom"
    )

    return template_variables
def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
    """Assemble the report section for a date variable.

    Args:
        config: Report settings; ``plot.image_format`` is read and the object
            is passed on to the histogram helpers.
        summary: Per-variable summary; keys are read with ``summary[...]``.
            ``summary["histogram"]`` is indexed at 0 and 1 for the histogram
            helpers, and the length of element 1, minus one, is shown as the
            bin count.

    Returns:
        A dict with a ``"top"`` grid container and a ``"bottom"`` tabs
        container that holds the histogram.
    """
    var_id = summary["varid"]
    img_format = config.plot.image_format

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        "Date",
        summary["alerts"],
        summary["description"],
    )
    counts_table = Table([
        {"name": label, "value": formatter(summary[key]), "alert": False}
        for label, formatter, key in (
            ("Distinct", fmt, "n_distinct"),
            ("Distinct (%)", fmt_percent, "p_distinct"),
            ("Missing", fmt, "n_missing"),
            ("Missing (%)", fmt_percent, "p_missing"),
            ("Memory size", fmt_bytesize, "memory_size"),
        )
    ])
    range_table = Table([
        {"name": label, "value": fmt(summary[key]), "alert": False}
        for label, key in (("Minimum", "min"), ("Maximum", "max"))
    ])
    thumbnail = Image(
        mini_histogram(
            config, summary["histogram"][0], summary["histogram"][1], date=True
        ),
        image_format=img_format,
        alt="Mini histogram",
    )
    top = Container(
        [header, counts_table, range_table, thumbnail], sequence_type="grid"
    )

    # Bottom
    full_histogram = Image(
        histogram(
            config, summary["histogram"][0], summary["histogram"][1], date=True
        ),
        image_format=img_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{var_id}histogram",
    )
    bottom = Container(
        [full_histogram], sequence_type="tabs", anchor_id=summary["varid"]
    )

    return {"top": top, "bottom": bottom}
def render_categorical(summary):
    """Assemble the report section for a categorical variable.

    Starts from ``render_common(summary)`` and adds a ``"top"`` grid (info,
    overview table, small frequency table) and a ``"bottom"`` tabs container.
    The bottom always has a "Frequencies" tab group. Length and unicode
    sections are delegated to ``render_categorical_length`` and
    ``render_categorical_unicode`` when the matching ``vars.cat`` settings of
    the module-level ``config`` are truthy.

    Args:
        summary: Per-variable summary; keys are read with ``summary[...]``.

    Returns:
        The template-variable dict from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """
    var_id = summary["varid"]
    mini_rows_limit = config["vars"]["cat"]["n_obs"].get(int)
    img_format = config["plot"]["image_format"].get(str)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_common(summary)

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )
    overview_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct", "n_distinct", "fmt"),
            ("Distinct (%)", "p_distinct", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    # Memory size never raises an alert.
    overview_rows.append({
        "name": "Memory size",
        "value": summary["memory_size"],
        "fmt": "fmt_bytesize",
        "alert": False,
    })
    overview = Table(overview_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=mini_rows_limit,
    )
    mini_table = FrequencyTableSmall(mini_rows, redact=redact)

    template_variables["top"] = Container(
        [header, overview, mini_table], sequence_type="grid"
    )

    # Bottom
    frequency_tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{var_id}common_values",
            redact=redact,
        ),
        render_categorical_frequency(summary, var_id, img_format),
    ]

    # The pie chart is added only when enabled (limit above zero) and the
    # number of distinct values does not exceed that limit.
    pie_limit = config["plot"]["pie"]["max_unique"].get(int)
    if pie_limit > 0 and summary["n_distinct"] <= pie_limit:
        pie = Image(
            pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"}),
            image_format=img_format,
            alt="Chart",
            name="Chart",
            anchor_id=f"{var_id}pie_chart",
        )
        frequency_tabs.append(pie)

    tabs = [
        Container(
            frequency_tabs,
            name="Frequencies",
            anchor_id=f"{var_id}frequencies",
            sequence_type="tabs",
        )
    ]

    if config["vars"]["cat"]["length"].get(bool):
        tabs.append(render_categorical_length(summary, var_id, img_format))

    if config["vars"]["cat"]["unicode"].get(bool):
        tabs.append(render_categorical_unicode(summary, var_id, redact))

    template_variables["bottom"] = Container(
        tabs, sequence_type="tabs", anchor_id=f"{var_id}bottom"
    )

    return template_variables
def render_count(config: Settings, summary: dict) -> dict:
    """Assemble the report section for a count variable.

    Starts from ``render_common(config, summary)`` and adds a ``"top"`` grid
    (info, two tables, mini histogram) and a ``"bottom"`` tabs container
    (histogram, common values, extreme values).

    Args:
        config: Report settings; ``plot.image_format``, ``report.precision``
            and ``n_extreme_obs`` are read.
        summary: Per-variable summary; keys are read with ``summary[...]``.
            ``summary["histogram"]`` is unpacked into the histogram helpers.

    Returns:
        The template-variable dict from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """
    template_variables = render_common(config, summary)
    img_format = config.plot.image_format

    def numeric(value):
        # The precision is looked up on every call, i.e. once per row.
        return fmt_numeric(value, precision=config.report.precision)

    # Top
    header = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["alerts"],
        summary["description"],
    )
    counts_table = Table([
        {"name": label, "value": formatter(summary[key]), "alert": False}
        for label, formatter, key in (
            ("Distinct", fmt, "n_distinct"),
            ("Distinct (%)", fmt_percent, "p_distinct"),
            ("Missing", fmt, "n_missing"),
            ("Missing (%)", fmt_percent, "p_missing"),
        )
    ])
    stats_table = Table([
        {"name": label, "value": formatter(summary[key]), "alert": False}
        for label, formatter, key in (
            ("Mean", numeric, "mean"),
            ("Minimum", numeric, "min"),
            ("Maximum", numeric, "max"),
            ("Zeros", fmt, "n_zeros"),
            ("Zeros (%)", fmt_percent, "p_zeros"),
            ("Memory size", fmt_bytesize, "memory_size"),
        )
    ])
    thumbnail = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=img_format,
        alt="Mini histogram",
    )
    template_variables["top"] = Container(
        [header, counts_table, stats_table, thumbnail], sequence_type="grid"
    )

    # Bottom
    # NOTE(review): these anchor ids carry no variable-id prefix. Confirm
    # they cannot collide when several count variables share one report.
    histogram_tabs = Container(
        [
            Image(
                histogram(config, *summary["histogram"]),
                image_format=img_format,
                alt="Histogram",
                caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
                name="Histogram",
                anchor_id="histogram",
            )
        ],
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id="histograms",
    )
    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id="common_values",
        redact=False,
    )
    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name=f"Minimum {config.n_extreme_obs} values",
        anchor_id="firstn",
        redact=False,
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name=f"Maximum {config.n_extreme_obs} values",
        anchor_id="lastn",
        redact=False,
    )
    extreme_values = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id="extreme_values",
    )

    template_variables["bottom"] = Container(
        [histogram_tabs, common_values, extreme_values],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables