def render_path_image(summary):
    """Build the template variables for an image-path variable.

    Starts from the output of ``render_path``, relabels the variable type as
    "Image Path" and appends two entries to the bottom section: an
    "Exif keys" frequency table and an "Image shape" tab group holding a
    frequency table and a scatter plot of ``summary["scatter_data"]``.

    Args:
        summary: dict describing the variable. Read here: ``varid``, ``n``,
            ``image_shape_counts``, ``exif_keys_counts`` and ``scatter_data``.

    Returns:
        The template variables produced by ``render_path``, extended with the
        ``freqtable_image_shape`` / ``freqtable_exif_keys`` entries and the
        additional bottom items.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_path(summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Image Path"

    # Bottom
    for key in ("image_shape", "exif_keys"):
        template_variables[f"freqtable_{key}"] = freq_table(
            freqtable=summary[f"{key}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    bottom_items = template_variables["bottom"].content["items"]

    bottom_items.append(
        FrequencyTable(
            template_variables["freqtable_exif_keys"],
            name="Exif keys",
            anchor_id=f"{varid}exif_frequency",
        )
    )

    shape_tabs = [
        FrequencyTable(
            template_variables["freqtable_image_shape"],
            name="Frequency",
            anchor_id=f"{varid}image_shape_frequency",
        ),
        Image(
            scatter_series(summary["scatter_data"]),
            image_format=image_format,
            alt="Scatterplot of image sizes",
            caption="Scatterplot of image sizes",
            name="Scatter",
            anchor_id=f"{varid}scatter",
        ),
    ]
    bottom_items.append(
        Sequence(
            shape_tabs,
            sequence_type="tabs",
            name="Image shape",
            anchor_id=f"{varid}image_shape",
        )
    )

    return template_variables
def get_missing_items(summary) -> list:
    """Wrap each entry of ``summary["missing"]`` in an ``Image``.

    Args:
        summary: dict with a ``"missing"`` mapping; every value must provide
            a ``"matrix"`` (the image payload) and a ``"name"``.

    Returns:
        One ``Image`` per entry, in mapping order. The entry's name is used
        for both ``alt`` and ``name``; the mapping key becomes the anchor id.
    """
    image_format = config["plot"]["image_format"].get(str)
    return [
        Image(
            entry["matrix"],
            image_format=image_format,
            alt=entry["name"],
            name=entry["name"],
            anchor_id=anchor,
        )
        for anchor, entry in summary["missing"].items()
    ]
def get_pie_chart(pie_charts) -> list:
    """Wrap each pie chart in ``pie_charts`` in an ``Image``.

    Args:
        pie_charts: mapping of chart key to the chart payload handed to
            ``Image``.

    Returns:
        One ``Image`` per entry, in mapping order. The key is used for
        ``alt`` and ``name``.
    """
    image_format = config["plot"]["image_format"].get(str)
    return [
        Image(
            chart,
            image_format=image_format,
            alt=key,
            name=key,
            # Derive the anchor from the key: a single shared "pie-chart" id
            # would be duplicated as soon as more than one chart is rendered.
            anchor_id=f"pie-chart-{key}",
        )
        for key, chart in pie_charts.items()
    ]
def get_scatter_matrix(scatter_matrix):
    """Turn a nested mapping of scatter plots into a list of tab groups.

    Args:
        scatter_matrix: mapping ``x_col -> {y_col -> plot}``.

    Returns:
        A list with one tabbed ``Sequence`` per ``x_col``; each sequence
        holds one ``Image`` per ``y_col`` of that column.
    """
    image_format = config["plot"]["image_format"].get(str)

    tab_groups = []
    for x_col, y_cols in scatter_matrix.items():
        plots = [
            Image(
                splot,
                image_format=image_format,
                alt=f"{x_col} x {y_col}",
                anchor_id=f"scatter_{x_col}_{y_col}",
                name=y_col,
            )
            for y_col, splot in y_cols.items()
        ]
        tab_groups.append(
            Sequence(
                plots,
                sequence_type="tabs",
                name=x_col,
                anchor_id=f"scatter_{x_col}",
            )
        )
    return tab_groups
def render_count(summary):
    """Build the template variables for a count variable.

    ``"top"`` becomes a grid with the variable info, two overview tables and
    a mini histogram. ``"bottom"`` becomes a tab group with the histogram(s),
    the common-values table and the extreme-values tables.

    Args:
        summary: dict describing the variable. Read here: ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``mean``, ``min``, ``max``,
            ``n_zeros``, ``p_zeros``, ``memory_size``, ``histogram_data``,
            ``histogram_bins`` and, when present,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The result of ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` set. ``render_common`` must supply ``freq_table_rows``,
        ``firstn_expanded`` and ``lastn_expanded``.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def table_rows(spec):
        # Expand (label, summary key, formatter) triples into Table rows.
        return [
            {"name": label, "value": summary[key], "fmt": fmt, "alert": False}
            for label, key, fmt in spec
        ]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
    )

    table1 = Table(
        table_rows(
            [
                ("Distinct count", "n_unique", "fmt"),
                ("Unique (%)", "p_unique", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
            ]
        )
    )

    table2 = Table(
        table_rows(
            [
                ("Mean", "mean", "fmt"),
                ("Minimum", "min", "fmt"),
                ("Maximum", "max", "fmt"),
                ("Zeros", "n_zeros", "fmt"),
                ("Zeros (%)", "p_zeros", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            ]
        )
    )

    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    # Anchor ids are prefixed with the variable id so that they stay unique
    # when several variables are rendered into the same report.
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={summary['histogram_bins']})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Sequence(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-width histogram is optional: only shown when the bin
    # edges were computed for this variable.
    if "histogram_bins_bayesian_blocks" in summary:
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption='<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'.format(
                fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
            ),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )
        seqs.append(histo_dyn)

    # NOTE: no statistics section (quantile / descriptive tables) is
    # rendered for this variable type.
    template_variables["bottom"] = Sequence(
        [
            Sequence(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return template_variables
def render_complex(summary):
    """Build the template variables for a complex-number variable.

    Args:
        summary: dict describing the variable. Read here: ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``memory_size``, ``mean``, ``min``,
            ``max``, ``n_zeros``, ``p_zeros`` and ``scatter_data``.

    Returns:
        A dict with two entries: ``"top"`` (grid of variable info, two
        overview tables and an empty HTML placeholder) and ``"bottom"``
        (tab group containing the complex-plane scatter plot).
    """
    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    def table_rows(spec):
        # Expand (label, summary key, formatter) triples into Table rows.
        return [
            {"name": label, "value": summary[key], "fmt": fmt}
            for label, key, fmt in spec
        ]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
    )
    overview = Table(
        table_rows(
            [
                ("Distinct count", "n_unique", "fmt"),
                ("Unique (%)", "p_unique", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            ]
        )
    )
    stats = Table(
        table_rows(
            [
                ("Mean", "mean", "fmt"),
                ("Minimum", "min", "fmt"),
                ("Maximum", "max", "fmt"),
                ("Zeros", "n_zeros", "fmt"),
                ("Zeros (%)", "p_zeros", "fmt_percent"),
            ]
        )
    )
    # Empty cell passed as the fourth grid item.
    filler = HTML("")
    top = Sequence([info, overview, stats, filler], sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Sequence([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def render_date(summary):
    """Build the template variables for a date variable.

    Args:
        summary: dict describing the variable. Read here: ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``memory_size``, ``min``, ``max``,
            ``histogram_data`` and ``histogram_bins``.

    Returns:
        A dict with two entries: ``"top"`` (grid of variable info, two
        overview tables and a mini histogram) and ``"bottom"`` (tab group
        containing the full histogram).
    """
    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    def table_rows(spec):
        # Expand (label, summary key, formatter) triples into Table rows.
        return [
            {"name": label, "value": summary[key], "fmt": fmt, "alert": False}
            for label, key, fmt in spec
        ]

    # Top
    info = VariableInfo(varid, summary["varname"], "Date", summary["warnings"])
    overview = Table(
        table_rows(
            [
                ("Distinct count", "n_unique", "fmt"),
                ("Unique (%)", "p_unique", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            ]
        )
    )
    extremes = Table(
        table_rows(
            [
                ("Minimum", "min", "fmt"),
                ("Maximum", "max", "fmt"),
            ]
        )
    )
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )
    top = Sequence([info, overview, extremes, mini_histo], sequence_type="grid")

    # Bottom
    full_histo = Image(
        histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Sequence([full_histo], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix image
    is created. When a description exists for that correlation type, the
    image and the description are placed side by side in a grid.

    NOTE(review): this file contains a second, later definition of
    ``get_correlation_items``; if both live in the same module the later one
    replaces this one - confirm which is intended.

    Args:
        summary: dict whose ``"correlations"`` mapping goes from correlation
            key (``"pearson"``, ``"spearman"`` or ``"cramers"``) to the data
            passed to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlations and a toggle button
        for the descriptions, or ``None`` when there is nothing to show.

    Raises:
        KeyError: if ``summary["correlations"]`` contains a key other than
            the three listed above.
    """
    items = get_items()

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear "
        "correlation between two variables. It's value lies between -1 and +1, where "
        "-1 indicates a total negative linear correlation, 0 indicates no linear "
        "correlation and +1 indicates a total positive linear correlation. <br />"
        "<br /> Pearson's <em>r</em> assumes the following: <br /> "
        " - Variables are continuous (Spearman's correlation should be used for ordinal) <br /> "
        " - Measurements are related (e.g. every row has a height and weight measurement) <br /> "
        " - Minimal to no outliers <br /> "
        " - Variables are normally distributed <br /> "
        " - Variables are linearly related <br /> "
        " - Homoscedasticity (equal variance of data around regression line)<br /> "
        "<br /> To calculate <em>r</em> for two variables <em>X</em> and <em>Y</em>, one divides the "
        "covariance of <em>X</em> and <em>Y</em> by the product of their standard deviations. "
    )
    spearman_description = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of "
        "monotonic correlation between two variables, and is therefore better in "
        "catching nonlinear correlations than Pearson's <em>r</em>. It's value lies "
        "between -1 and +1, where -1 indicates a total negative correlation, 0 indicates "
        "no correlation, and 1 indicates total positive correlation.<br /> "
        "<br />Spearman's rank correlation assumes two things:<br /> "
        " - Data is monotonically related<br /> "
        " - At least one variable in the correlation is ordinal<br /> "
        "<br />A monotonic relationship states one of the following:<br /> "
        " - As the value of one variable INCREASES, so does the value of another<br /> "
        " - As the value of one variable DECREASES, the value of another INCREASES<br /> "
        "<br />To calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one "
        # Adjacent literals are joined without a separator, so the trailing
        # space after "by the" is required.
        "divides the covariance of the rank variables of <em>X</em> and <em>Y</em> by the "
        "product of their standard deviations. "
    )
    cramer_description = (
        "Cramer's V is a measure between two nominal (categorical) variables, where the "
        "score is between 0 and 1. Unlike Pearson's and Spearman correlations, Cramer's V "
        "does not indicate a direction of the relationship (positive or negative), but instead "
        "indicates the strength of the relationship. <br /> "
        "<br />The following guidelines can be used to determine the strength of the correlation: <br /> "
        " - Very strong relationship: 0.25 or higher <br /> "
        " - Strong relationship: 0.15 to 0.25 <br /> "
        " - Moderate relationship: 0.11 to 0.15 <br /> "
        " - weak relationship: 0.06 to 0.10 <br />"
        " - No or negligible relationship: 0.01 to 0.05 <br /> "
        "<br />Cramer's V correlation assumes that your data has more than 2 columns and 2 rows (2x2)."
    )

    # correlation key -> (vmin passed to the matrix plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "cramers": (0, "Cramér's V (φc)", cramer_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if len(description) > 0:
            desc = HTML(
                f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
                anchor_id=f"{key}_html",
                classes="correlation-description",
            )
            items.append(
                Sequence(
                    [diagram, desc],
                    anchor_id=key,
                    name=name,
                    sequence_type="grid",
                )
            )
        else:
            items.append(diagram)

    if len(items) == 0:
        return None

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )
    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations", anchor_id="correlations", button=btn, item=corr)
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible "Correlations" section of the report.

    Every entry of ``summary["correlations"]`` becomes a correlation-matrix
    image; correlation types with a non-empty description get that text
    rendered next to the image in a grid.

    Args:
        summary: dict whose ``"correlations"`` mapping goes from correlation
            key (``"pearson"``, ``"spearman"``, ``"kendall"``, ``"phi_k"``,
            ``"cramers"`` or ``"recoded"``) to the data passed to
            ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlations and a toggle button
        for the descriptions, or ``None`` when there are no items.
    """
    items = get_items()

    pearson_text = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. "
    )
    spearman_text = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of "
        "monotonic correlation between two variables, and is therefore better in "
        "catching nonlinear monotonic correlations than Pearson's <em>r</em>. "
        "It's value lies between -1 and +1, -1 indicating total negative monotonic "
        "correlation, 0 indicating no monotonic correlation and 1 indicating total "
        "positive monotonic correlation.<br /><br />To calculate <em>ρ</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank "
        "variables of <em>X</em> and <em>Y</em> by the product of their standard "
        "deviations. "
    )
    kendall_text = (
        "Similarly to Spearman's rank correlation coefficient, the Kendall rank "
        "correlation coefficient (<em>τ</em>) measures ordinal association between "
        "two variables. It's value lies between -1 and +1, -1 indicating total "
        "negative correlation, 0 indicating no correlation and 1 indicating total "
        # The line break inside the original multi-line literal is kept.
        "positive correlation. \n"
        "<br /><br />To calculate <em>τ</em> for two variables <em>X</em> and "
        "<em>Y</em>, one determines the number of concordant and discordant pairs "
        "of observations. <em>τ</em> is given by the number of concordant pairs "
        "minus the discordant pairs divided by the total number of pairs."
    )

    # correlation key -> (vmin passed to the matrix plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_text),
        "spearman": (-1, "Spearman's ρ", spearman_text),
        "kendall": (-1, "Kendall's τ", kendall_text),
        "phi_k": (0, "Phik (φk)", ""),
        "cramers": (0, "Cramér's V (φc)", ""),
        "recoded": (0, "Recoded", ""),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, matrix in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(matrix, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        # Types without a description are shown as the bare diagram.
        if len(description) == 0:
            items.append(diagram)
            continue

        explanation = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Sequence(
                [diagram, explanation],
                anchor_id=key,
                name=name,
                sequence_type="grid",
            )
        )

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    toggle = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations", anchor_id="correlations", button=toggle, item=corr)
def render_path(summary):
    """Build the template variables for a path variable.

    Starts from the output of ``render_categorical``, relabels the variable
    type as "Path" and appends frequency tables for the full path and its
    stem, name, suffix and parent to the bottom section. When the summary
    contains ``file_sizes``, a file-size histogram is appended as well.

    Args:
        summary: dict describing the variable. Read here: ``varid``, ``n``,
            ``name_counts``, ``parent_counts``, ``suffix_counts``,
            ``stem_counts`` and, when ``file_sizes`` is present,
            ``file_sizes`` and ``histogram_bins``.

    Returns:
        The template variables produced by ``render_categorical``, extended
        with the ``freqtable_*`` entries and the additional bottom items.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_categorical(summary)

    for path_part in ("name", "parent", "suffix", "stem"):
        template_variables[f"freqtable_{path_part}"] = freq_table(
            freqtable=summary[f"{path_part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Path"

    # Bottom
    # (template variable holding the rows, tab label, anchor suffix), listed
    # in the order the tabs are appended.
    tab_specs = (
        ("freq_table_rows", "Full", "full_frequency"),
        ("freqtable_stem", "Stem", "stem_frequency"),
        ("freqtable_name", "Name", "name_frequency"),
        ("freqtable_suffix", "Suffix", "suffix_frequency"),
        ("freqtable_parent", "Parent", "parent_frequency"),
    )
    frequency_tabs = [
        FrequencyTable(
            template_variables[rows_key],
            name=label,
            anchor_id=f"{varid}{anchor_suffix}",
        )
        for rows_key, label, anchor_suffix in tab_specs
    ]

    bottom_items = template_variables["bottom"].content["items"]
    bottom_items.extend(frequency_tabs)

    if "file_sizes" in summary:
        bottom_items.append(
            Image(
                histogram(summary["file_sizes"], summary, summary["histogram_bins"]),
                image_format=image_format,
                alt="File size",
                caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={summary['histogram_bins']})",
                name="File size",
                anchor_id=f"{varid}file_size_histogram",
            )
        )

    return template_variables