def get_report_structure(
    date_start: datetime, date_end: datetime, sample: dict, summary: dict
) -> Renderable:
    """Assemble the top-level section tree of the profile report.

    Args:
        date_start: Passed through to the overview ``Dataset`` item.
        date_end: Passed through to the overview ``Dataset`` item.
        sample: A dict containing the samples to print.
        summary: Statistics to use for the overview, variables, correlations
            and missing values. The keys "package", "table", "messages" and
            "variables" are read directly here.

    Returns:
        A "sections" ``Sequence`` named "Report" that holds the overview,
        variables, correlations, missing-values and sample sections, in that
        order.
    """
    overview = Dataset(
        package=summary["package"],
        date_start=date_start,
        date_end=date_end,
        values=summary["table"],
        messages=summary["messages"],
        variables=summary["variables"],
        name="Overview",
        anchor_id="overview",
    )
    variables = Sequence(
        render_variables_section(summary),
        sequence_type="accordion",
        name="Variables",
        anchor_id="variables",
    )
    correlations = Sequence(
        get_correlation_items(summary),
        sequence_type="tabs",
        name="Correlations",
        anchor_id="correlations",
    )
    missing = Sequence(
        get_missing_items(summary),
        sequence_type="tabs",
        name="Missing values",
        anchor_id="missing",
    )
    samples = Sequence(
        get_sample_items(sample),
        sequence_type="list",
        name="Sample",
        anchor_id="sample",
    )

    return Sequence(
        [overview, variables, correlations, missing, samples],
        name="Report",
        sequence_type="sections",
    )
def get_scatter_matrix(scatter_matrix):
    """Build one tab group of interaction plots per x-column.

    Args:
        scatter_matrix: Mapping of x-column name to a mapping of y-column
            name to plot content (both levels are iterated with ``.items()``).

    Returns:
        A list with one "tabs" ``Sequence`` per x-column, each holding one
        ``Image`` per y-column.
    """
    image_format = config["plot"]["image_format"].get(str)

    # The inner comprehension is evaluated before the enclosing Sequence is
    # constructed, so all images of one x-column are created first.
    return [
        Sequence(
            [
                Image(
                    splot,
                    image_format=image_format,
                    alt=f"{x_col} x {y_col}",
                    anchor_id=f"interactions_{x_col}_{y_col}",
                    name=y_col,
                )
                for y_col, splot in y_cols.items()
            ],
            sequence_type="tabs",
            name=x_col,
            anchor_id=f"interactions_{x_col}",
        )
        for x_col, y_cols in scatter_matrix.items()
    ]
def render_generic(summary):
    """Render the layout for a variable reported as "Unsupported".

    Args:
        summary: Variable summary dict. Reads "varid", "varname", "warnings",
            "warn_fields", "n_missing", "p_missing" and "memory_size".

    Returns:
        A dict with three entries: "top" is a grid ``Sequence`` of the
        overview, the missing/memory table and an empty ``HTML`` placeholder;
        "bottom" is always ``None``; "ignore" is the string "ignore".
    """
    info = Overview(
        anchor_id=summary["varid"],
        warnings=summary["warnings"],
        var_type="Unsupported",
        var_name=summary["varname"],
    )

    warn_fields = summary["warn_fields"]
    table = Table(
        [
            {
                "name": "Missing",
                "value": summary["n_missing"],
                "fmt": "fmt",
                # Rows whose field carries a warning get the "alert" class.
                "class": "alert" if "n_missing" in warn_fields else "",
            },
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "class": "alert" if "p_missing" in warn_fields else "",
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
            },
        ]
    )

    return {
        "top": Sequence([info, table, HTML("")], sequence_type="grid"),
        "bottom": None,
        "ignore": "ignore",
    }
def render_path_image(summary):
    """Extend the path rendering with image-specific frequency tables and plots.

    Starts from ``render_path(summary)``, relabels the first top item as
    "Image Path" and appends two entries to the bottom items: an "Exif keys"
    frequency table and an "Image shape" tab group (frequency table plus
    scatter plot).

    Args:
        summary: Variable summary dict. Reads "varid", "n", "scatter_data",
            "image_shape_counts" and "exif_keys_counts".

    Returns:
        The template-variable dict from ``render_path`` with the additions
        above and the extra keys "freqtable_image_shape" and
        "freqtable_exif_keys".
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_path(summary)

    # Top: only the type label of the first item is overridden.
    top_items = template_variables["top"].content["items"]
    top_items[0].content["var_type"] = "Image Path"

    # Bottom: one frequency table per image attribute.
    keys = {"Image shape": "image_shape", "Exif keys": "exif_keys"}
    for key in keys.values():
        template_variables["freqtable_{}".format(key)] = freq_table(
            freqtable=summary["{}_counts".format(key)],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    varid = summary["varid"]
    bottom_items = template_variables["bottom"].content["items"]

    # TODO: add dropdown to switch to specific values
    bottom_items.append(
        FrequencyTable(
            template_variables["freqtable_{}".format("exif_keys")],
            name="Exif keys",
            anchor_id="{varid}exif_frequency".format(varid=varid),
        )
    )

    shape_frequency = FrequencyTable(
        template_variables["freqtable_{}".format("image_shape")],
        name="Frequency",
        anchor_id="{varid}image_shape_frequency".format(varid=varid),
    )
    shape_scatter = Image(
        scatter_series(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot of image sizes",
        caption="Scatterplot of image sizes",
        name="Scatter",
        anchor_id="{varid}scatter".format(varid=varid),
    )
    bottom_items.append(
        Sequence(
            [shape_frequency, shape_scatter],
            sequence_type="tabs",
            name="Image shape",
            anchor_id="{varid}image_shape".format(varid=varid),
        )
    )

    return template_variables
def get_report_structure(
    date_start: datetime, date_end: datetime, sample: dict, summary: dict
) -> Renderable:
    """Assemble the top-level section tree of the profile report.

    Sections are appended to the container returned by
    ``get_section_items()`` in this order: Overview, Variables, Interactions,
    Correlations (only when ``get_correlation_items`` returns something other
    than ``None``), Missing values, Sample.

    Args:
        date_start: Forwarded to ``get_dataset_items``.
        date_end: Forwarded to ``get_dataset_items``.
        sample: A dict containing the samples to print.
        summary: Statistics to use for the overview, variables, correlations
            and missing values. The keys "messages" and "scatter" are read
            directly here.

    Returns:
        A "sections" ``Sequence`` named "Report" wrapping all sections.
    """
    warnings = summary["messages"]
    section_items = get_section_items()

    def add_section(content, sequence_type, name, anchor_id):
        # Wrap already-built content in a named Sequence and register it.
        section_items.append(
            Sequence(
                content,
                sequence_type=sequence_type,
                name=name,
                anchor_id=anchor_id,
            )
        )

    add_section(
        get_dataset_items(summary, date_start, date_end, warnings),
        "tabs",
        "Overview",
        "overview",
    )
    add_section(
        render_variables_section(summary), "accordion", "Variables", "variables"
    )
    add_section(
        get_scatter_matrix(summary["scatter"]),
        "tabs",
        "Interactions",
        "interactions",
    )

    # The correlation section arrives pre-wrapped and may be absent.
    corr = get_correlation_items(summary)
    if corr is not None:
        section_items.append(corr)

    add_section(get_missing_items(summary), "tabs", "Missing values", "missing")
    add_section(get_sample_items(sample), "list", "Sample", "sample")

    return Sequence(section_items, name="Report", sequence_type="sections")
def get_dataset_overview(summary):
    """Build the dataset overview grid from ``summary["table"]``.

    Args:
        summary: Report summary dict. Reads "n", "n_var", "n_cells_missing",
            "p_cells_missing", "n_duplicates", "p_duplicates" and "types"
            (iterated with ``.items()``) from ``summary["table"]``.

    Returns:
        A grid ``Sequence`` named "Overview" holding the "Table statistics"
        table and the "Variable types" table.
    """
    table_stats = summary["table"]

    # (row label, key in summary["table"], formatter name)
    info_rows = (
        ("Total Number of Records", "n", "fmt_numeric"),
        ("Total Number of Columns", "n_var", "fmt_numeric"),
        ("Missing row cells", "n_cells_missing", "fmt_numeric"),
        ("Missing row cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
    )
    dataset_info = Table(
        [
            {"name": label, "value": table_stats[key], "fmt": fmt}
            for label, key, fmt in info_rows
        ],
        name="Table statistics",
    )

    type_rows = []
    for type_name, count in table_stats["types"].items():
        type_rows.append({"name": type_name, "value": count, "fmt": "fmt_numeric"})
    dataset_types = Table(type_rows, name="Variable types")

    return Sequence(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
def render_url(summary):
    """Render the layout for a URL variable.

    The bottom part is a tab group with the full frequency table followed by
    one frequency table per URL component (scheme, netloc, path, query,
    fragment). The top part is a grid of overview, statistics table and a
    small frequency table.

    Args:
        summary: Variable summary dict. Reads "varid", "varname", "warnings",
            "n", "value_counts", "n_unique", "p_unique", "n_missing",
            "p_missing", "memory_size" and one "<part>_counts" entry per URL
            component.

    Returns:
        The dict returned by ``render_common(summary)`` extended with "top",
        "bottom" and one "freqtable_<part>" entry per URL component.
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    # TODO: merge with boolean/categorical
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_cat,
    )
    template_variables = render_common(summary)

    url_parts = ("scheme", "netloc", "path", "query", "fragment")
    for url_part in url_parts:
        template_variables[f"freqtable_{url_part}"] = freq_table(
            freqtable=summary[f"{url_part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    varid = summary["varid"]

    # "Full" comes from render_common's rows; the remaining tabs are derived
    # from the component name ("scheme" -> tab "Scheme", anchor
    # "<varid>scheme_frequency").
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Full",
            anchor_id=f"{varid}full_frequency",
        )
    ]
    items.extend(
        FrequencyTable(
            template_variables[f"freqtable_{url_part}"],
            name=url_part.capitalize(),
            anchor_id=f"{varid}{url_part}_frequency",
        )
        for url_part in url_parts
    )
    template_variables["bottom"] = Sequence(items, sequence_type="tabs")

    # Element composition
    info = Overview(varid, summary["varname"], "URL", summary["warnings"])

    table = Table(
        [
            {"name": "Distinct count", "value": summary["n_unique"], "fmt": "fmt"},
            {
                "name": "Unique (%)",
                "value": summary["p_unique"],
                "fmt": "fmt_percent",
            },
            {"name": "Missing", "value": summary["n_missing"], "fmt": "fmt"},
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
            },
        ]
    )

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    return template_variables
def render_categorical(summary):
    """Render the layout for a categorical variable.

    The top grid holds the overview, a statistics table and a small frequency
    table. The bottom tab group always holds the "Common Values" table; when
    the ``vars.cat.check_composition`` setting is enabled it also holds a
    "Composition" grid (composition and length tables) and a "Length"
    histogram.

    Args:
        summary: Variable summary dict. Always reads "varid", "varname",
            "warnings", "warn_fields", "count", "value_counts", "n_unique",
            "p_unique", "n_missing", "p_missing" and "memory_size". With
            composition checking enabled it also reads "composition",
            "max_length", "mean_length", "min_length" and "length".

    Returns:
        The dict returned by ``render_common(summary)`` with "top" and
        "bottom" set.
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    def flagged_row(label, key, fmt):
        # Row that is highlighted when its field carries a warning.
        return {
            "name": label,
            "value": summary[key],
            "fmt": fmt,
            "class": "alert" if key in summary["warn_fields"] else "",
        }

    # Top
    # Element composition
    info = Overview(
        summary["varid"], summary["varname"], "Categorical", summary["warnings"]
    )

    table = Table(
        [
            flagged_row("Distinct count", "n_unique", "fmt"),
            flagged_row("Unique (%)", "p_unique", "fmt_percent"),
            flagged_row("Missing", "n_missing", "fmt"),
            flagged_row("Missing (%)", "p_missing", "fmt_percent"),
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
            },
        ]
    )

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id="{varid}common_values".format(varid=summary["varid"]),
        )
    ]

    if config["vars"]["cat"]["check_composition"].get(bool):
        composition_rows = [
            {"name": label, "value": summary["composition"][key], "fmt": "fmt"}
            for label, key in (
                ("Contains chars", "chars"),
                ("Contains digits", "digits"),
                ("Contains whitespace", "spaces"),
                ("Contains non-words", "non-words"),
            )
        ]
        composition = Table(
            composition_rows,
            name="Composition",
            anchor_id="{varid}composition".format(varid=summary["varid"]),
        )

        length_rows = [
            {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
            for label, key in (
                ("Max length", "max_length"),
                ("Mean length", "mean_length"),
                ("Min length", "min_length"),
            )
        ]
        length_table = Table(
            length_rows,
            name="Length",
            anchor_id="{varid}lengthstats".format(varid=summary["varid"]),
        )

        items.append(
            Sequence(
                [composition, length_table],
                anchor_id="{varid}tbl".format(varid=summary["varid"]),
                name="Composition",
                sequence_type="grid",
            )
        )

        histogram_bins = 10
        items.append(
            Image(
                histogram(summary["length"], summary, histogram_bins),
                alt="Scatter",
                name="Length",
                anchor_id="{varid}length".format(varid=summary["varid"]),
            )
        )

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id="{varid}bottom".format(varid=summary["varid"]),
    )

    return template_variables
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible correlations section.

    For every entry of ``summary["correlations"]`` a correlation-matrix image
    is built. When the correlation has a description, the image is paired
    with that description in a grid; otherwise the image is used alone.

    Args:
        summary: Report summary dict. ``summary["correlations"]`` is iterated
            with ``.items()``; every key must be one of "pearson",
            "spearman", "kendall", "phi_k" or "cramers" (any other key raises
            ``KeyError``).

    Returns:
        A ``Collapse`` named "Correlations" wrapping the tab group and its
        toggle button, or ``None`` when no correlation item was produced.
    """
    items = get_items()

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. "
    )
    spearman_description = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic "
        "correlation between two variables, and is therefore better in catching nonlinear "
        "monotonic correlations than Pearson's <em>r</em>. It's value lies between -1 and +1, "
        "-1 indicating total negative monotonic correlation, 0 indicating no monotonic "
        "correlation and 1 indicating total positive monotonic correlation.<br /><br />To "
        "calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the "
        "covariance of the rank variables of <em>X</em> and <em>Y</em> by the product of "
        "their standard deviations. "
    )
    kendall_description = (
        "Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation "
        "coefficient (<em>τ</em>) measures ordinal association between two variables. It's "
        "value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating "
        "no correlation and 1 indicating total positive correlation. <br /><br />To calculate "
        "<em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of "
        "concordant and discordant pairs of observations. <em>τ</em> is given by the number "
        "of concordant pairs minus the discordant pairs divided by the total number of pairs."
    )
    phi_k_description = (
        "Phik (φk) is a new and practical correlation coefficient that works consistently "
        "between categorical, ordinal and interval variables, captures non-linear dependency "
        "and reverts to the Pearson correlation coefficient in case of a bivariate normal "
        "input distribution. There is extensive documentation available "
        "<a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."
    )
    cramers_description = (
        "Cramér's V is an association measure for nominal random variables. The coefficient "
        "ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect "
        "association. The empirical estimators used for Cramér's V have been proved to be "
        "biased, even for large samples. We use a bias-corrected measure that has been "
        "proposed by Bergsma in 2013 that can be found "
        "<a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."
    )

    # correlation key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, matrix in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(matrix, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if not description:
            # Nothing to explain: show the diagram on its own.
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Sequence([diagram, desc], anchor_id=key, name=name, sequence_type="grid")
        )

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations", anchor_id="correlations", button=btn, item=corr)
def render_categorical(summary):
    """Render the layout for a categorical variable.

    The top grid holds the overview, a statistics table and a small frequency
    table. The bottom tab group always holds the "Common Values" table; when
    the ``vars.cat.check_composition`` setting is enabled it also holds a
    "Length" grid (histogram plus length table) and a "Characters" tab group
    with the "Categories", "Scripts" and "Blocks" frequency tables.

    Args:
        summary: Variable summary dict. Always reads "varid", "varname",
            "warnings", "warn_fields", "count", "value_counts", "n_unique",
            "p_unique", "n_missing", "p_missing" and "memory_size". With
            composition checking enabled it also reads "max_length",
            "mean_length", "min_length", "length", "category_alias_values",
            "script_values" and "block_alias_values".

    Returns:
        The dict returned by ``render_common(summary)`` with "top" and
        "bottom" set.
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    def flagged_row(label, key, fmt):
        # Row that is highlighted when its field carries a warning.
        return {
            "name": label,
            "value": summary[key],
            "fmt": fmt,
            "class": "alert" if key in summary["warn_fields"] else "",
        }

    # Top
    # Element composition
    info = Overview(
        summary["varid"], summary["varname"], "Categorical", summary["warnings"]
    )

    table = Table(
        [
            flagged_row("Distinct count", "n_unique", "fmt"),
            flagged_row("Unique (%)", "p_unique", "fmt_percent"),
            flagged_row("Missing", "n_missing", "fmt"),
            flagged_row("Missing (%)", "p_missing", "fmt_percent"),
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
            },
        ]
    )

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id="{varid}common_values".format(varid=summary["varid"]),
        )
    ]

    if config["vars"]["cat"]["check_composition"].get(bool):
        length_rows = [
            {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
            for label, key in (
                ("Max length", "max_length"),
                ("Mean length", "mean_length"),
                ("Min length", "min_length"),
            )
        ]
        length_table = Table(
            length_rows,
            name="Length",
            anchor_id="{varid}lengthstats".format(varid=summary["varid"]),
        )

        histogram_bins = 10
        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            alt="Scatter",
            name="Length",
            anchor_id="{varid}length".format(varid=summary["varid"]),
        )

        items.append(
            Sequence(
                [length_histogram, length_table],
                anchor_id="{varid}tbl".format(varid=summary["varid"]),
                name="Length",
                sequence_type="grid",
            )
        )

        n_freq_table_max = config["n_freq_table_max"].get(int)

        # (summary key, tab name, anchor template) for each character tab.
        character_tabs = (
            ("category_alias_values", "Categories", "{varid}category_long_values"),
            ("script_values", "Scripts", "{varid}script_values"),
            ("block_alias_values", "Blocks", "{varid}block_alias_values"),
        )
        citems = []
        for values_key, tab_name, anchor_template in character_tabs:
            counts = pd.Series(summary[values_key]).value_counts()
            citems.append(
                FrequencyTable(
                    freq_table(
                        freqtable=counts,
                        n=counts.sum(),
                        max_number_to_print=n_freq_table_max,
                    ),
                    name=tab_name,
                    anchor_id=anchor_template.format(varid=summary["varid"]),
                )
            )

        items.append(
            Sequence(
                citems,
                name="Characters",
                sequence_type="tabs",
                anchor_id="{varid}characters".format(varid=summary["varid"]),
            )
        )

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id="{varid}bottom".format(varid=summary["varid"]),
    )

    return template_variables
def render_count(summary):
    """Render the layout for a count variable.

    The top grid holds the variable info, two statistics tables and a mini
    histogram. The bottom tab group holds the histogram(s), the common-values
    table and the extreme-values tabs. A second histogram is added when
    ``summary`` contains "histogram_bins_bayesian_blocks".

    Every nested anchor id is prefixed with the variable id, as the other
    variable renderers in this file do, so that several count variables can
    appear in one report without sharing ids.

    Args:
        summary: Variable summary dict. Reads "varid", "varname", "warnings",
            "n_unique", "p_unique", "n_missing", "p_missing", "mean", "min",
            "max", "n_zeros", "p_zeros", "memory_size", "histogram_data" and
            "histogram_bins"; "histogram_bins_bayesian_blocks" is optional.

    Returns:
        The dict returned by ``render_common(summary)`` with "top" and
        "bottom" set. The keys "freq_table_rows", "firstn_expanded" and
        "lastn_expanded" are read from that dict.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def plain_row(label, key, fmt="fmt"):
        # Statistic row that is never highlighted.
        return {"name": label, "value": summary[key], "fmt": fmt, "alert": False}

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
    )

    table1 = Table(
        [
            plain_row("Distinct count", "n_unique"),
            plain_row("Unique (%)", "p_unique", "fmt_percent"),
            plain_row("Missing", "n_missing"),
            plain_row("Missing (%)", "p_missing", "fmt_percent"),
        ]
    )

    table2 = Table(
        [
            plain_row("Mean", "mean"),
            plain_row("Minimum", "min"),
            plain_row("Maximum", "max"),
            plain_row("Zeros", "n_zeros"),
            plain_row("Zeros (%)", "p_zeros", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # TODO: add a "Statistics" section (quantile and descriptive statistics)
    # once a sections data structure exists; nothing is rendered for it yet.

    # Bottom
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
            image_format=image_format,
            alt="Histogram",
            caption="<strong>Histogram with fixed size bins</strong> (bins={})".format(
                summary["histogram_bins"]
            ),
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Sequence(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    if "histogram_bins_bayesian_blocks" in summary:
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption='<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'.format(
                fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
            ),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )
        seqs.append(histo_dyn)

    template_variables["bottom"] = Sequence(
        [
            Sequence(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return template_variables
def get_dataset_overview(summary):
    """Build the dataset overview grid from ``summary["table"]``.

    Args:
        summary: Report summary dict. Reads "n_var", "n", "n_cells_missing",
            "p_cells_missing", "n_duplicates", "p_duplicates", "memory_size",
            "record_size" and "types" (iterated with ``.items()``) from
            ``summary["table"]``.

    Returns:
        A grid ``Sequence`` named "Overview" holding the "Dataset statistics"
        table and the "Variable types" table.
    """
    table_stats = summary["table"]

    # (row label, key in summary["table"], formatter name)
    info_rows = (
        ("Number of variables", "n_var", "fmt_numeric"),
        ("Number of observations", "n", "fmt_numeric"),
        ("Missing cells", "n_cells_missing", "fmt_numeric"),
        ("Missing cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
        ("Total size in memory", "memory_size", "fmt_bytesize"),
        ("Average record size in memory", "record_size", "fmt_bytesize"),
    )
    dataset_info = Table(
        [
            {"name": label, "value": table_stats[key], "fmt": fmt}
            for label, key, fmt in info_rows
        ],
        name="Dataset statistics",
    )

    type_rows = []
    for type_name, count in table_stats["types"].items():
        type_rows.append({"name": type_name, "value": count, "fmt": "fmt_numeric"})
    dataset_types = Table(type_rows, name="Variable types")

    return Sequence(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible correlations section.

    For every entry of ``summary["correlations"]`` a correlation-matrix image
    is built. Correlations with a non-empty description (Pearson, Spearman,
    Kendall) get the image paired with that description in a grid; the others
    contribute the image alone.

    Args:
        summary: Report summary dict. ``summary["correlations"]`` is iterated
            with ``.items()``; every key must be one of "pearson",
            "spearman", "kendall", "phi_k", "cramers" or "recoded" (any other
            key raises ``KeyError``).

    Returns:
        A ``Collapse`` named "Correlations" wrapping the tab group and its
        toggle button, or ``None`` when no correlation item was produced.
    """
    items = get_items()

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. "
    )
    spearman_description = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic "
        "correlation between two variables, and is therefore better in catching nonlinear "
        "monotonic correlations than Pearson's <em>r</em>. It's value lies between -1 and +1, "
        "-1 indicating total negative monotonic correlation, 0 indicating no monotonic "
        "correlation and 1 indicating total positive monotonic correlation.<br /><br />To "
        "calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the "
        "covariance of the rank variables of <em>X</em> and <em>Y</em> by the product of "
        "their standard deviations. "
    )
    kendall_description = (
        "Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation "
        "coefficient (<em>τ</em>) measures ordinal association between two variables. It's "
        "value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating "
        "no correlation and 1 indicating total positive correlation. <br /><br />To calculate "
        "<em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of "
        "concordant and discordant pairs of observations. <em>τ</em> is given by the number "
        "of concordant pairs minus the discordant pairs divided by the total number of pairs."
    )

    # correlation key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", ""),
        "cramers": (0, "Cramér's V (φc)", ""),
        "recoded": (0, "Recoded", ""),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, matrix in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(matrix, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id="{key}_diagram".format(key=key),
            name=name,
            classes="correlation-diagram",
        )

        if not description:
            # No explanatory text for this correlation: diagram only.
            items.append(diagram)
            continue

        desc = HTML(
            '<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>'.format(
                description=description, name=name
            ),
            anchor_id="{key}_html".format(key=key),
            classes="correlation-description",
        )
        items.append(
            Sequence([diagram, desc], anchor_id=key, name=name, sequence_type="grid")
        )

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations", anchor_id="correlations", button=btn, item=corr)
def render_complex(summary):
    """Render the layout for a complex-number variable.

    Args:
        summary: Variable summary dict. Reads "varid", "varname", "warnings",
            "n_unique", "p_unique", "n_missing", "p_missing", "memory_size",
            "mean", "min", "max", "n_zeros", "p_zeros" and "scatter_data".

    Returns:
        A dict with "top" (grid of overview, two statistics tables and an
        empty ``HTML`` placeholder) and "bottom" (tab group with the scatter
        plot in the complex plane).
    """
    image_format = config["plot"]["image_format"].get(str)

    def row(label, key, fmt="fmt"):
        # One statistics-table row taken straight from the summary.
        return {"name": label, "value": summary[key], "fmt": fmt}

    # Top
    info = Overview(
        summary["varid"],
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
    )

    table1 = Table(
        [
            row("Distinct count", "n_unique"),
            row("Unique (%)", "p_unique", "fmt_percent"),
            row("Missing", "n_missing"),
            row("Missing (%)", "p_missing", "fmt_percent"),
            row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    table2 = Table(
        [
            row("Mean", "mean"),
            row("Minimum", "min"),
            row("Maximum", "max"),
            row("Zeros", "n_zeros"),
            row("Zeros (%)", "p_zeros", "fmt_percent"),
        ]
    )

    # Empty fourth cell of the top grid.
    placeholder = HTML("")
    top = Sequence([info, table1, table2, placeholder], sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id="{varid}scatter".format(varid=summary["varid"]),
    )
    bottom = Sequence([scatter], sequence_type="tabs", anchor_id=summary["varid"])

    return {"top": top, "bottom": bottom}
def render_date(summary):
    """Render the layout for a date variable.

    Args:
        summary: Variable summary dict. Reads "varid", "varname", "n_unique",
            "p_unique", "n_missing", "p_missing", "memory_size", "min",
            "max", "histogram_data" and "histogram_bins".

    Returns:
        A dict with "top" (grid of overview, two statistics tables and a mini
        histogram) and "bottom" (tab group with the full histogram).
    """
    # TODO: render common?

    def row(label, key, fmt="fmt"):
        # One statistics-table row taken straight from the summary.
        return {"name": label, "value": summary[key], "fmt": fmt}

    # Top
    # The overview is created with an empty list as its fourth argument.
    info = Overview(summary["varid"], summary["varname"], "Date", [])

    table1 = Table(
        [
            row("Distinct count", "n_unique"),
            row("Unique (%)", "p_unique", "fmt_percent"),
            row("Missing", "n_missing"),
            row("Missing (%)", "p_missing", "fmt_percent"),
            row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    table2 = Table([row("Minimum", "min"), row("Maximum", "max")])

    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        "Mini histogram",
    )

    top = Sequence([info, table1, table2, mini_histo], sequence_type="grid")

    # Bottom
    full_histogram = Image(
        histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id="{varid}histogram".format(varid=summary["varid"]),
    )
    bottom = Sequence(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return {"top": top, "bottom": bottom}
def render_date(summary):
    """Render the layout for a date variable.

    Args:
        summary: Variable summary dict. Reads "varid", "varname", "warnings",
            "n_unique", "p_unique", "n_missing", "p_missing", "memory_size",
            "min", "max", "histogram_data" and "histogram_bins".

    Returns:
        A dict with "top" (grid of variable info, two statistics tables and a
        mini histogram) and "bottom" (tab group with the full histogram).
    """
    varid = summary["varid"]
    # TODO: render common?
    image_format = config["plot"]["image_format"].get(str)

    def plain_row(label, key, fmt="fmt"):
        # Statistic row that is never highlighted.
        return {"name": label, "value": summary[key], "fmt": fmt, "alert": False}

    # Top
    info = VariableInfo(varid, summary["varname"], "Date", summary["warnings"])

    table1 = Table(
        [
            plain_row("Distinct count", "n_unique"),
            plain_row("Unique (%)", "p_unique", "fmt_percent"),
            plain_row("Missing", "n_missing"),
            plain_row("Missing (%)", "p_missing", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    table2 = Table([plain_row("Minimum", "min"), plain_row("Maximum", "max")])

    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    top = Sequence([info, table1, table2, mini_histo], sequence_type="grid")

    # Bottom
    full_histogram = Image(
        histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Sequence([full_histogram], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def render_real(summary):
    """Build the report elements for a real-valued variable.

    Args:
        summary: Mapping with the variable's statistics. Besides the
            statistic keys shown in the tables below, this function reads
            ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``histogram_data`` and, when present,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The dict returned by ``render_common(summary)`` with two entries
        added: ``"top"`` (grid ``Sequence`` of variable info, two tables and
        a mini histogram) and ``"bottom"`` (tabbed ``Sequence`` of
        statistics, histogram(s), common values and extreme values).
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)
    warn_fields = summary["warn_fields"]

    # Non-negative variables get a more specific type label.
    if summary["min"] >= 0:
        name = "Real number (ℝ<sub>≥0</sub>)"
    else:
        name = "Real number (ℝ)"

    # Top
    info = VariableInfo(varid, summary["varname"], name, summary["warnings"])

    table1 = Table(
        [
            {
                "name": "Distinct count",
                "value": summary["n_unique"],
                "fmt": "fmt",
                "alert": "n_unique" in warn_fields,
            },
            {
                "name": "Unique (%)",
                "value": summary["p_unique"],
                "fmt": "fmt_percent",
                "alert": "p_unique" in warn_fields,
            },
            {
                "name": "Missing",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": "n_missing" in warn_fields,
            },
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": "p_missing" in warn_fields,
            },
            {
                "name": "Infinite",
                "value": summary["n_infinite"],
                "fmt": "fmt",
                "alert": "n_infinite" in warn_fields,
            },
            {
                "name": "Infinite (%)",
                "value": summary["p_infinite"],
                "fmt": "fmt_percent",
                "alert": "p_infinite" in warn_fields,
            },
        ]
    )

    table2 = Table(
        [
            {"name": "Mean", "value": summary["mean"], "fmt": "fmt", "alert": False},
            {"name": "Minimum", "value": summary["min"], "fmt": "fmt", "alert": False},
            {"name": "Maximum", "value": summary["max"], "fmt": "fmt", "alert": False},
            {
                "name": "Zeros",
                "value": summary["n_zeros"],
                "fmt": "fmt",
                "alert": "n_zeros" in warn_fields,
            },
            {
                "name": "Zeros (%)",
                "value": summary["p_zeros"],
                "fmt": "fmt_percent",
                "alert": "p_zeros" in warn_fields,
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    quantile_statistics = Table(
        [
            {"name": "Minimum", "value": summary["min"], "fmt": "fmt_numeric"},
            {"name": "5-th percentile", "value": summary["5%"], "fmt": "fmt_numeric"},
            {"name": "Q1", "value": summary["25%"], "fmt": "fmt_numeric"},
            {"name": "median", "value": summary["50%"], "fmt": "fmt_numeric"},
            {"name": "Q3", "value": summary["75%"], "fmt": "fmt_numeric"},
            {"name": "95-th percentile", "value": summary["95%"], "fmt": "fmt_numeric"},
            {"name": "Maximum", "value": summary["max"], "fmt": "fmt_numeric"},
            {"name": "Range", "value": summary["range"], "fmt": "fmt_numeric"},
            {
                "name": "Interquartile range (IQR)",
                "value": summary["iqr"],
                "fmt": "fmt_numeric",
            },
        ],
        name="Quantile statistics",
    )

    skewness_flagged = "skewness" in warn_fields
    descriptive_statistics = Table(
        [
            {
                "name": "Standard deviation",
                "value": summary["std"],
                "fmt": "fmt_numeric",
            },
            {
                "name": "Coefficient of variation (CV)",
                "value": summary["cv"],
                "fmt": "fmt_numeric",
            },
            {"name": "Kurtosis", "value": summary["kurtosis"], "fmt": "fmt_numeric"},
            {"name": "Mean", "value": summary["mean"], "fmt": "fmt_numeric"},
            {
                "name": "Median Absolute Deviation (MAD)",
                "value": summary["mad"],
                "fmt": "fmt_numeric",
            },
            {
                "name": "Skewness",
                "value": summary["skewness"],
                "fmt": "fmt_numeric",
                # Use the same boolean "alert" flag as every other flagged
                # row in this function so the skewness warning is signalled
                # consistently.
                "alert": skewness_flagged,
                # Legacy key kept for backward compatibility.
                # NOTE(review): drop once nothing reads "class" any more.
                "class": "alert" if skewness_flagged else "",
            },
            {"name": "Sum", "value": summary["sum"], "fmt": "fmt_numeric"},
            {"name": "Variance", "value": summary["variance"], "fmt": "fmt_numeric"},
        ],
        name="Descriptive statistics",
    )

    statistics = Sequence(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, histogram_bins),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Sequence(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-width histogram is only added when the summary carries
    # the corresponding bin edges.
    if "histogram_bins_bayesian_blocks" in summary:
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption=(
                "<strong>Histogram with variable size bins</strong> "
                f"(bins={fmt_array(summary['histogram_bins_bayesian_blocks'], threshold=5)}, "
                '<a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
            ),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )
        seqs.append(histo_dyn)

    template_variables["bottom"] = Sequence(
        [
            statistics,
            Sequence(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_boolean(summary):
    """Build the report elements for a boolean variable.

    Args:
        summary: Mapping with the variable's statistics. The keys read here
            are ``value_counts``, ``n``, ``varid``, ``varname``,
            ``warnings``, ``warn_fields``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing`` and ``memory_size``.

    Returns:
        The dict returned by ``render_common(summary)`` with two entries
        added: ``"top"`` (grid ``Sequence`` of variable info, a statistics
        table and a small frequency table) and ``"bottom"`` (tabbed
        ``Sequence`` holding the full frequency table).
    """
    max_rows = config["vars"]["bool"]["n_obs"].get(int)

    # Prepare variables
    template_variables = render_common(summary)
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=max_rows,
    )

    # Element composition
    varid = summary["varid"]
    info = VariableInfo(
        anchor_id=varid,
        warnings=summary["warnings"],
        var_type="Boolean",
        var_name=summary["varname"],
    )

    # (label, summary key, formatter name) for rows that can carry a warning.
    flaggable_fields = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in flaggable_fields
    ]
    # Memory size is never flagged.
    rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    table = Table(rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    freqtable = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Frequency Table",
        anchor_id=f"{summary['varid']}frequency_table",
    )

    template_variables["bottom"] = Sequence(
        [freqtable],
        sequence_type="tabs",
        anchor_id=f"{summary['varid']}bottom",
    )

    return template_variables