def to_html(self): """Generate and return complete template as lengthy string for using with frameworks Returns ------- str The HTML output. """ return templates.template('wrapper').render(content=self.html)
def to_html(self): """Generate and return complete template as lengthy string for using with frameworks. Returns ------- str The HTML output. """ return templates.template('wrapper').render(content=self.html)
def to_file(self, outputfile=DEFAULT_OUTPUTFILE): """Write the report to a file. By default a name is generated. Parameters: ---------- outputfile : str The name or the path of the file to generale including the extension (.html). """ if outputfile != NO_OUTPUTFILE: if outputfile == DEFAULT_OUTPUTFILE: outputfile = 'profile_' + str(hash(self)) + ".html" # TODO: should be done in the template with codecs.open(outputfile, 'w+b', encoding='utf8') as self.file: self.file.write(templates.template('wrapper').render(content=self.html))
def to_file(self, outputfile=DEFAULT_OUTPUTFILE): """Write the report to a file. By default a name is generated. Parameters: ---------- outputfile : str The name or the path of the file to generale including the extension (.html). """ if outputfile != NO_OUTPUTFILE: if outputfile == DEFAULT_OUTPUTFILE: outputfile = 'profile_' + str(hash(self)) + ".html" # TODO: should be done in the template with codecs.open(outputfile, 'w+b', encoding='utf8') as self.file: self.file.write(templates.template('wrapper').render(content=self.html))
def to_html(sample, stats_object): """Generate a HTML report from summary statistics and a given sample. Parameters ---------- sample: DataFrame containing the sample you want to print stats_object: Dictionary containing summary statistics. Should be generated with an appropriate describe() function Returns ------- str, containing profile report in HTML format """ n_obs = stats_object['table']['n'] value_formatters = formatters.value_formatters row_formatters = formatters.row_formatters if not isinstance(sample, pd.DataFrame): raise TypeError("sample must be of type pandas.DataFrame") if not isinstance(stats_object, dict): raise TypeError( "stats_object must be of type dict. Did you generate this using the pandas_profiling.describe() function?" ) if set(stats_object.keys()) != {'table', 'variables', 'freq'}: raise TypeError( "stats_object badly formatted. Did you generate this using the pandas_profiling-eda.describe() function?" ) def fmt(value, name): if pd.isnull(value): return "" if name in value_formatters: return value_formatters[name](value) elif isinstance(value, float): return value_formatters[formatters.DEFAULT_FLOAT_FORMATTER](value) else: if sys.version_info.major == 3: return str(value) else: return unicode(value) def _format_row(freq, label, max_freq, row_template, n, extra_class=''): width = int(freq / max_freq * 99) + 1 if width > 20: label_in_bar = freq label_after_bar = "" else: label_in_bar = " " label_after_bar = freq return row_template.render(label=label, width=width, count=freq, percentage='{:2.1f}'.format(freq / n * 100), extra_class=extra_class, label_in_bar=label_in_bar, label_after_bar=label_after_bar) def freq_table(freqtable, n, table_template, row_template, max_number_to_print): freq_rows_html = u'' if max_number_to_print > n: max_number_to_print = n if max_number_to_print < len(freqtable): freq_other = sum(freqtable.iloc[max_number_to_print:]) min_freq = freqtable.values[max_number_to_print] else: freq_other = 0 min_freq = 0 freq_missing = n - sum(freqtable) max_freq = max(freqtable.values[0], freq_other, freq_missing) # TODO: Correctly sort missing and other for label, freq in six.iteritems( freqtable.iloc[0:max_number_to_print]): freq_rows_html += _format_row(freq, label, max_freq, row_template, n) if freq_other > min_freq: freq_rows_html += _format_row( freq_other, "Other values (%s)" % (freqtable.count() - max_number_to_print), max_freq, row_template, n, extra_class='other') if freq_missing > min_freq: freq_rows_html += _format_row(freq_missing, "(Missing)", max_freq, row_template, n, extra_class='missing') return table_template.render(rows=freq_rows_html, varid=hash(idx)) def extreme_obs_table(freqtable, table_template, row_template, number_to_print, n, ascending=True): if ascending: obs_to_print = freqtable.sort_index().iloc[:number_to_print] else: obs_to_print = freqtable.sort_index().iloc[-number_to_print:] freq_rows_html = '' max_freq = max(obs_to_print.values) for label, freq in six.iteritems(obs_to_print): freq_rows_html += _format_row(freq, label, max_freq, row_template, n) return table_template.render(rows=freq_rows_html) # Variables rows_html = u"" messages = [] for idx, row in stats_object['variables'].iterrows(): formatted_values = {'varname': idx, 'varid': hash(idx)} row_classes = {} for col, value in six.iteritems(row): formatted_values[col] = fmt(value, col) for col in set(row.index) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](row[col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format( formatted_values, varname=formatters.fmt_varname(idx))) if row['type'] == 'CAT': formatted_values['minifreqtable'] = freq_table( stats_object['freq'][idx], n_obs, templates.template('mini_freq_table'), templates.template('mini_freq_table_row'), 3) if row['distinct_count'] > 50: messages.append(templates.messages['HIGH_CARDINALITY'].format( formatted_values, varname=formatters.fmt_varname(idx))) row_classes['distinct_count'] = "alert" else: row_classes['distinct_count'] = "" if row['type'] == 'UNIQUE': obs = stats_object['freq'][idx].index formatted_values['firstn'] = pd.DataFrame( obs[0:3], columns=["First 3 values"]).to_html(classes="example_values", index=False) formatted_values['lastn'] = pd.DataFrame( obs[-3:], columns=["Last 3 values"]).to_html(classes="example_values", index=False) if row['type'] in {'CORR', 'CONST'}: formatted_values['varname'] = formatters.fmt_varname(idx) messages.append( templates.messages[row['type']].format(formatted_values)) else: formatted_values['freqtable'] = freq_table( stats_object['freq'][idx], n_obs, templates.template('freq_table'), templates.template('freq_table_row'), 10) formatted_values['firstn_expanded'] = extreme_obs_table( stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending=True) formatted_values['lastn_expanded'] = extreme_obs_table( stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending=False) rows_html += templates.row_templates_dict[row['type']].render( values=formatted_values, row_classes=row_classes) # Overview formatted_values = { k: fmt(v, k) for k, v in six.iteritems(stats_object['table']) } row_classes = {} for col in six.viewkeys( stats_object['table']) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](stats_object['table'][col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format( formatted_values, varname=formatters.fmt_varname(idx))) messages_html = u'' for msg in messages: messages_html += templates.message_row.format(message=msg) overview_html = templates.template('overview').render( values=formatted_values, row_classes=row_classes, messages=messages_html) # Sample sample_html = templates.template('sample').render( sample_table_html=sample.to_html(classes="sample")) # TODO: should be done in the template return templates.template('base').render({ 'overview_html': overview_html, 'rows_html': rows_html, 'sample_html': sample_html })
def to_html(sample, stats_object): """Generate a HTML report from summary statistics and a given sample. Parameters ---------- sample : DataFrame the sample you want to print stats_object : dict Summary statistics. Should be generated with an appropriate describe() function Returns ------- str containing profile report in HTML format Notes ----- * This function as to be refactored since it's huge and it contains inner functions """ n_obs = stats_object['table']['n'] value_formatters = formatters.value_formatters row_formatters = formatters.row_formatters if not isinstance(sample, pd.DataFrame): raise TypeError("sample must be of type pandas.DataFrame") if not isinstance(stats_object, dict): raise TypeError( "stats_object must be of type dict. Did you generate this using the pandas_profiling.describe() function?" ) if not set({'table', 'variables', 'freq'}).issubset( set(stats_object.keys())): raise TypeError( "stats_object badly formatted. Did you generate this using the pandas_profiling.describe() function?" ) def fmt(value, name): if pd.isnull(value): return "" if name in value_formatters: return value_formatters[name](value) elif isinstance(value, float): return value_formatters[formatters.DEFAULT_FLOAT_FORMATTER](value) else: try: return unicode(value) # Python 2 except NameError: return str(value) # Python 3 def _format_row(freq, label, max_freq, row_template, n, extra_class='', full_width=99, bar_color=None): if max_freq != 0: width = int(freq / max_freq * full_width) + 1 else: width = 1 if width > (full_width / 1.5): label_in_bar = f"{freq:.2f}" label_after_bar = "" else: label_in_bar = " " label_after_bar = f"{freq:.2f}" context = dict(label=label, width=width, count=freq, percentage='{:2.1f}'.format(freq / n * 100), extra_class=extra_class, label_in_bar=label_in_bar, label_after_bar=label_after_bar) if bar_color is not None: context['bar_color'] = bar_color return row_template.render(**context) def freq_table(freqtable, n, table_template, row_template, max_number_to_print, nb_col=6, eps=1e-8, bar_color=None, varid=None): freq_rows_html = u'' freqtable = freqtable.sort_values(ascending=False) if max_number_to_print > n: max_number_to_print = n if max_number_to_print < len(freqtable): freq_other = sum(freqtable.iloc[max_number_to_print:]) min_freq = freqtable.values[max_number_to_print] else: freq_other = 0 min_freq = 0 freq_missing = n - sum(freqtable) if freq_missing < eps: freq_missing = 0 max_freq = max(freqtable.values[0], freq_other, freq_missing) # TODO: Correctly sort missing and other for label, freq in six.iteritems( freqtable.iloc[0:max_number_to_print]): freq_rows_html += _format_row(freq, label, max_freq, row_template, n, bar_color=bar_color) if freq_other > min_freq: freq_rows_html += _format_row( freq_other, "Other values (%s)" % (freqtable.count() - max_number_to_print), max_freq, row_template, n, extra_class='other', bar_color=bar_color) if freq_missing > min_freq: freq_rows_html += _format_row(freq_missing, "(Missing)", max_freq, row_template, n, extra_class='missing', bar_color=bar_color) context = dict(rows=freq_rows_html, varid=varid, nb_col=nb_col) return table_template.render(**context) def extreme_obs_table(freqtable, table_template, row_template, number_to_print, n, ascending=True, bar_color=None): # If it's mixed between base types (str, int) convert to str. Pure "mixed" types are filtered during type discovery if "mixed" in freqtable.index.inferred_type: freqtable.index = freqtable.index.astype(str) sorted_freqTable = freqtable.sort_index() if ascending: obs_to_print = sorted_freqTable.iloc[:number_to_print] else: obs_to_print = sorted_freqTable.iloc[-number_to_print:] freq_rows_html = '' max_freq = max(obs_to_print.values) for label, freq in six.iteritems(obs_to_print): freq_rows_html += _format_row(freq, label, max_freq, row_template, n, bar_color=bar_color) return table_template.render(rows=freq_rows_html) # Variables rows_html = u"" messages = [] for idx, row in stats_object['variables'].iterrows(): formatted_values = { 'varname': idx, 'varid': hash(str(idx) + datetime.now().isoformat()) } row_classes = {} for col, value in six.iteritems(row): formatted_values[col] = fmt(value, col) for col in set(row.index) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](row[col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format( formatted_values, varname=idx)) if row['type'] in {'CAT', 'BOOL'}: formatted_values['minifreqtable'] = freq_table( stats_object['freq'][idx], n_obs, templates.template('mini_freq_table'), templates.template('mini_freq_table_row'), 5, templates.mini_freq_table_nb_col[row['type']], bar_color=stats_object.get('bar_color', None), varid=formatted_values['varid']) if row['distinct_count'] > 50: messages.append(templates.messages['HIGH_CARDINALITY'].format( formatted_values, varname=idx)) row_classes['distinct_count'] = "alert" else: row_classes['distinct_count'] = "" if row['type'] == 'UNIQUE': obs = stats_object['freq'][idx].index formatted_values['firstn'] = pd.DataFrame( obs[0:3], columns=["First 3 values"]).to_html(classes="example_values", index=False) formatted_values['lastn'] = pd.DataFrame( obs[-3:], columns=["Last 3 values"]).to_html(classes="example_values", index=False) if row['type'] == 'UNSUPPORTED': formatted_values['varname'] = idx messages.append( templates.messages[row['type']].format(formatted_values)) elif row['type'] in {'CORR', 'CONST', 'RECODED'}: formatted_values['varname'] = idx messages.append( templates.messages[row['type']].format(formatted_values)) else: formatted_values['freqtable'] = freq_table( stats_object['freq'][idx], n_obs, templates.template('freq_table'), templates.template('freq_table_row'), 10, bar_color=stats_object.get('bar_color', None), varid=formatted_values['varid']) formatted_values['firstn_expanded'] = extreme_obs_table( stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending=True, bar_color=stats_object.get('bar_color', None)) formatted_values['lastn_expanded'] = extreme_obs_table( stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending=False, bar_color=stats_object.get('bar_color', None)) rows_html += templates.row_templates_dict[row['type']].render( values=formatted_values, row_classes=row_classes) # Overview formatted_values = { k: fmt(v, k) for k, v in six.iteritems(stats_object['table']) } row_classes = {} for col in six.viewkeys( stats_object['table']) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](stats_object['table'][col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format(formatted_values, varname=idx)) messages_html = u'' # for msg in messages: # messages_html += templates.message_row.format(message=msg) overview_html = templates.template('overview').render( values=formatted_values, row_classes=row_classes, messages=messages_html, weighted=hasattr(sample, 'weights'), ) # Add plot of matrix correlation # pearson_matrix = plot.correlation_matrix(stats_object['correlations']['pearson'], 'Pearson') # spearman_matrix = plot.correlation_matrix(stats_object['correlations']['spearman'], 'Spearman') # correlations_html = templates.template('correlations').render( # values={'pearson_matrix': pearson_matrix, 'spearman_matrix': spearman_matrix}) # Add sample sample_html = templates.template('sample').render( sample_table_html=sample.to_html(classes="sample")) # TODO: should be done in the template return templates.template('base').render({ 'overview_html': overview_html, 'rows_html': rows_html, 'sample_html': sample_html, # 'correlation_html': correlations_html })
def to_html(sample, stats_object, missing_count): """Generate a HTML report from summary statistics and a given sample. Parameters ---------- sample : DataFrame the sample you want to print stats_object : dict Summary statistics. Should be generated with an appropriate describe() function Returns ------- str containing profile report in HTML format Notes ----- * This function as to be refactored since it's huge and it contains inner functions """ n_obs = stats_object['table']['n'] value_formatters = formatters.value_formatters row_formatters = formatters.row_formatters if not isinstance(sample, pd.DataFrame): raise TypeError("sample must be of type pandas.DataFrame") if not isinstance(stats_object, dict): raise TypeError( "stats_object must be of type dict. Did you generate this using the pandas_profiling.describe() function?" ) if not set({'table', 'variables', 'freq', 'correlations'}).issubset( set(stats_object.keys())): raise TypeError( "stats_object badly formatted. Did you generate this using the pandas_profiling.describe() function?" ) def fmt(value, name): if pd.isnull(value): return "" if name in value_formatters: return value_formatters[name](value) elif isinstance(value, float): return value_formatters[formatters.DEFAULT_FLOAT_FORMATTER](value) else: try: return unicode(value) # Python 2 except NameError: return str(value) # Python 3 def _format_row(freq, label, max_freq, row_template, n, extra_class=''): if max_freq != 0: width = int(freq / max_freq * 99) + 1 else: width = 1 if width > 20: label_in_bar = freq label_after_bar = "" else: label_in_bar = " " label_after_bar = freq return row_template.render(label=label, width=width, count=freq, percentage='{:2.1f}'.format(freq / n * 100), extra_class=extra_class, label_in_bar=label_in_bar, label_after_bar=label_after_bar) def freq_table(freqtable, n, table_template, row_template, max_number_to_print, nb_col=6): freq_rows_html = u'' if max_number_to_print > n: max_number_to_print = n if max_number_to_print < len(freqtable): freq_other = sum(freqtable.iloc[max_number_to_print:]) min_freq = freqtable.values[max_number_to_print] else: freq_other = 0 min_freq = 0 freq_missing = n - sum(freqtable) max_freq = max(freqtable.values[0], freq_other, freq_missing) # TODO: Correctly sort missing and other for label, freq in six.iteritems( freqtable.iloc[0:max_number_to_print]): freq_rows_html += _format_row(freq, label, max_freq, row_template, n) if freq_other > min_freq: freq_rows_html += _format_row( freq_other, "Other values (%s)" % (freqtable.count() - max_number_to_print), max_freq, row_template, n, extra_class='other') if freq_missing > min_freq: freq_rows_html += _format_row(freq_missing, "(Missing)", max_freq, row_template, n, extra_class='missing') return table_template.render(rows=freq_rows_html, varid=hash(idx), nb_col=nb_col) def extreme_obs_table(freqtable, table_template, row_template, number_to_print, n, ascending=True): # If it's mixed between base types (str, int) convert to str. Pure "mixed" types are filtered during type discovery if "mixed" in freqtable.index.inferred_type: freqtable.index = freqtable.index.astype(str) sorted_freqTable = freqtable.sort_index() if ascending: obs_to_print = sorted_freqTable.iloc[:number_to_print] else: obs_to_print = sorted_freqTable.iloc[-number_to_print:] freq_rows_html = '' max_freq = max(obs_to_print.values) for label, freq in six.iteritems(obs_to_print): freq_rows_html += _format_row(freq, label, max_freq, row_template, n) return table_template.render(rows=freq_rows_html) # Variables rows_html = u"" messages = [] variable = 0 for idx, row in stats_object['variables'].iterrows(): df = stats_object['dataframe'] formatted_values = { 'varname': idx, 'varid': hash(idx), 'column_values': df[idx].dropna().tolist() } row_classes = {} for col, value in six.iteritems(row): formatted_values[col] = fmt(value, col) for col in set(row.index) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](row[col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format( formatted_values, varname=idx)) if row['type'] in {'CAT', 'BOOL'}: formatted_values['minifreqtable'] = freq_table( stats_object['freq'][idx], n_obs, templates.template('mini_freq_table'), templates.template('mini_freq_table_row'), 3, templates.mini_freq_table_nb_col[row['type']]) if row['distinct_count'] > 50: messages.append(templates.messages['HIGH_CARDINALITY'].format( formatted_values, varname=idx)) row_classes['distinct_count'] = "alert" else: row_classes['distinct_count'] = "" if row['type'] == 'UNIQUE': obs = stats_object['freq'][idx].index formatted_values['firstn'] = pd.DataFrame( obs[0:3], columns=["First 3 values"]).to_html(classes="example_values", index=False) formatted_values['lastn'] = pd.DataFrame( obs[-3:], columns=["Last 3 values"]).to_html(classes="example_values", index=False) aux = [] if row['type'] == 'DATE': for date in formatted_values['column_values']: aux.append((date - dt.datetime(1970, 1, 1)).total_seconds()) formatted_values['column_values'] = aux if row['type'] == 'UNSUPPORTED': formatted_values['varname'] = idx messages.append( templates.messages[row['type']].format(formatted_values)) elif row['type'] in {'CORR', 'CONST', 'RECODED'}: formatted_values['varname'] = idx messages.append( templates.messages[row['type']].format(formatted_values)) else: formatted_values['freqtable'] = freq_table( stats_object['freq'][idx], n_obs, templates.template('freq_table'), templates.template('freq_table_row'), 10) formatted_values['firstn_expanded'] = extreme_obs_table( stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending=True) formatted_values['lastn_expanded'] = extreme_obs_table( stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending=False) if variable == 3: rows_html += templates.template('more_variables_header').render() rows_html += templates.row_templates_dict[row['type']].render( values=formatted_values, row_classes=row_classes) variable += 1 if variable > 3: rows_html += templates.template('more_variables_footer').render() # Overview formatted_values = { k: fmt(v, k) for k, v in six.iteritems(stats_object['table']) } formatted_values['missing_count'] = missing_count row_classes = {} for col in six.viewkeys( stats_object['table']) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](stats_object['table'][col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format(formatted_values, varname=idx)) messages_html = u'' for msg in messages: messages_html += templates.message_row.format(message=msg) overview_html = templates.template('overview').render( values=formatted_values, row_classes=row_classes, messages=messages_html) # Add plot of matrix correlation correlations = len(stats_object['correlations']['pearson']) > 1 if (correlations): pearson_matrix = plot.correlation_matrix( stats_object['correlations']['pearson'], 'Pearson') spearman_matrix = plot.correlation_matrix( stats_object['correlations']['spearman'], 'Spearman') correlations_html = templates.template('correlations').render( values={ 'pearson_matrix': pearson_matrix, 'pearson_numeric': df.corr().to_html(), 'spearman_matrix': spearman_matrix }) # Add sample # sample_html = templates.template('sample').render(sample_table_html=sample.to_html(classes="sample")) #add viz bid_html = templates.template('bid').render( keys=stats_object['bid'][0], data_array=stats_object['bid'][1]) table_html = templates.template('table').render( metadata_table_viz=stats_object['table_viz'][0], dataarray_table_viz=stats_object['table_viz'][1]) # table_html = templates.template('table').render() # TODO: should be done in the template missing = df.isnull().values.any() if (missing): values = {} values['matrix'] = plot.missing_matrix(df) values['bar'] = plot.missing_bar(df) values['heat'] = plot.missing_heat(df) values['dendrogram'] = plot.missing_dendrogram(df) missing_html = templates.template('missing').render(values=values) sections = { ##add viz to sections 'overview_html': overview_html, 'rows_html': rows_html, # 'sample_html': sample_html, 'bid_html': bid_html, 'table_html': table_html } if (missing): sections['missing_html'] = missing_html if (correlations): sections['correlation_html'] = correlations_html return templates.template('base').render(sections, values={ 'missing': missing, 'correlations': correlations })
def to_html(sample, stats_object): """Generate a HTML report from summary statistics and a given sample. Parameters ---------- sample : DataFrame the sample you want to print stats_object : dict Summary statistics. Should be generated with an appropriate describe() function Returns ------- str containing profile report in HTML format Notes ----- * This function as to be refactored since it's huge and it contains inner functions """ n_obs = stats_object['table']['n'] value_formatters = formatters.value_formatters row_formatters = formatters.row_formatters if not isinstance(sample, pd.DataFrame): raise TypeError("sample must be of type pandas.DataFrame") if not isinstance(stats_object, dict): raise TypeError("stats_object must be of type dict. Did you generate this using the pandas_profiling.describe() function?") if set(stats_object.keys()) != {'table', 'variables', 'freq'}: raise TypeError("stats_object badly formatted. Did you generate this using the pandas_profiling-eda.describe() function?") def fmt(value, name): if pd.isnull(value): return "" if name in value_formatters: return value_formatters[name](value) elif isinstance(value, float): return value_formatters[formatters.DEFAULT_FLOAT_FORMATTER](value) else: if sys.version_info.major == 3: return str(value) else: return unicode(value) def _format_row(freq, label, max_freq, row_template, n, extra_class=''): width = int(freq / max_freq * 99) + 1 if width > 20: label_in_bar = freq label_after_bar = "" else: label_in_bar = " " label_after_bar = freq return row_template.render(label=label, width=width, count=freq, percentage='{:2.1f}'.format(freq / n * 100), extra_class=extra_class, label_in_bar=label_in_bar, label_after_bar=label_after_bar) def freq_table(freqtable, n, table_template, row_template, max_number_to_print, nb_col=6): freq_rows_html = u'' if max_number_to_print > n: max_number_to_print=n if max_number_to_print < len(freqtable): freq_other = sum(freqtable.iloc[max_number_to_print:]) min_freq = freqtable.values[max_number_to_print] else: freq_other = 0 min_freq = 0 freq_missing = n - sum(freqtable) max_freq = max(freqtable.values[0], freq_other, freq_missing) # TODO: Correctly sort missing and other for label, freq in six.iteritems(freqtable.iloc[0:max_number_to_print]): freq_rows_html += _format_row(freq, label, max_freq, row_template, n) if freq_other > min_freq: freq_rows_html += _format_row(freq_other, "Other values (%s)" % (freqtable.count() - max_number_to_print), max_freq, row_template, n, extra_class='other') if freq_missing > min_freq: freq_rows_html += _format_row(freq_missing, "(Missing)", max_freq, row_template, n, extra_class='missing') return table_template.render(rows=freq_rows_html, varid=hash(idx), nb_col=nb_col) def extreme_obs_table(freqtable, table_template, row_template, number_to_print, n, ascending = True): if ascending: obs_to_print = freqtable.sort_index().iloc[:number_to_print] else: obs_to_print = freqtable.sort_index().iloc[-number_to_print:] freq_rows_html = '' max_freq = max(obs_to_print.values) for label, freq in six.iteritems(obs_to_print): freq_rows_html += _format_row(freq, label, max_freq, row_template, n) return table_template.render(rows=freq_rows_html) # Variables rows_html = u"" messages = [] for idx, row in stats_object['variables'].iterrows(): formatted_values = {'varname': idx, 'varid': hash(idx)} row_classes = {} for col, value in six.iteritems(row): formatted_values[col] = fmt(value, col) for col in set(row.index) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](row[col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format(formatted_values, varname = formatters.fmt_varname(idx))) if row['type'] in {'CAT', 'BOOL'}: formatted_values['minifreqtable'] = freq_table(stats_object['freq'][idx], n_obs, templates.template('mini_freq_table'), templates.template('mini_freq_table_row'), 3, templates.mini_freq_table_nb_col[row['type']]) if row['distinct_count'] > 50: messages.append(templates.messages['HIGH_CARDINALITY'].format(formatted_values, varname = formatters.fmt_varname(idx))) row_classes['distinct_count'] = "alert" else: row_classes['distinct_count'] = "" if row['type'] == 'UNIQUE': obs = stats_object['freq'][idx].index formatted_values['firstn'] = pd.DataFrame(obs[0:3], columns=["First 3 values"]).to_html(classes="example_values", index=False) formatted_values['lastn'] = pd.DataFrame(obs[-3:], columns=["Last 3 values"]).to_html(classes="example_values", index=False) if row['type'] in {'CORR', 'CONST', 'RECODED'}: formatted_values['varname'] = formatters.fmt_varname(idx) messages.append(templates.messages[row['type']].format(formatted_values)) else: formatted_values['freqtable'] = freq_table(stats_object['freq'][idx], n_obs, templates.template('freq_table'), templates.template('freq_table_row'), 10) formatted_values['firstn_expanded'] = extreme_obs_table(stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending = True) formatted_values['lastn_expanded'] = extreme_obs_table(stats_object['freq'][idx], templates.template('freq_table'), templates.template('freq_table_row'), 5, n_obs, ascending = False) rows_html += templates.row_templates_dict[row['type']].render(values=formatted_values, row_classes=row_classes) # Overview formatted_values = {k: fmt(v, k) for k, v in six.iteritems(stats_object['table'])} row_classes={} for col in six.viewkeys(stats_object['table']) & six.viewkeys(row_formatters): row_classes[col] = row_formatters[col](stats_object['table'][col]) if row_classes[col] == "alert" and col in templates.messages: messages.append(templates.messages[col].format(formatted_values, varname = formatters.fmt_varname(idx))) messages_html = u'' for msg in messages: messages_html += templates.message_row.format(message=msg) overview_html = templates.template('overview').render(values=formatted_values, row_classes = row_classes, messages=messages_html) # Sample sample_html = templates.template('sample').render(sample_table_html=sample.to_html(classes="sample")) # TODO: should be done in the template return templates.template('base').render({'overview_html': overview_html, 'rows_html': rows_html, 'sample_html': sample_html})
def __init__(self): self.html = templates.template('importer').render()