Exemplo n.º 1
0
    def to_file(self, output=DEFAULT_OUTPUTFILE):
        """
        Render the report through the 'wrapper' template and save it to a file.

        Nothing is written when ``output`` equals NO_OUTPUTFILE. When ``output``
        equals DEFAULT_OUTPUTFILE, a name of the form ``profile_<hash>.html`` is
        built from ``hash(self)``.

        :param output: The name or the path of the file to generate, including the extension (.html)
        :type: string
        """
        if output == NO_OUTPUTFILE:
            return

        destination = output
        if destination == DEFAULT_OUTPUTFILE:
            destination = 'profile_' + str(hash(self)) + '.html'

        # TODO: should be done in the template
        # The handle is bound to self.file; the with-block closes it on exit.
        with codecs.open(destination, 'w+b', encoding='utf8') as self.file:
            page = template('wrapper').render(content=self.html)
            self.file.write(page)
Exemplo n.º 2
0
 def render_standalone(self, mode='databricks', utils=None):
     """
     Copy the bundled CSS/JS assets to /FileStore and return the static report.

     The bootstrap and jquery files that live next to this module (under
     templates/css/ and templates/js/) are copied through ``utils.fs`` into
     /FileStore/spark_df_profiling/, then ``self.html`` is rendered with the
     'wrapper_static' template.

     :param mode: Rendering mode; only 'databricks' is accepted
     :param utils: Object exposing ``fs.mkdirs`` and ``fs.cp``
         (presumably Databricks ``dbutils`` - confirm with callers)
     :return: The output of the 'wrapper_static' template
     :raises NotImplementedError: If ``mode`` is not 'databricks'
     """
     if mode != 'databricks':
         raise NotImplementedError(
             'Only databricks mode is supported for now')

     package_dir = os.path.abspath(os.path.dirname(__file__))
     local_dirs = {
         'css': os.path.join(package_dir, 'templates/css/'),
         'js': os.path.join(package_dir, 'templates/js/'),
     }
     target_root = '/FileStore/spark_df_profiling/'

     # Create both target directories before copying anything into them.
     for kind in ('css', 'js'):
         utils.fs.mkdirs(target_root + kind)

     assets = (
         ('css', 'bootstrap-theme.min.css'),
         ('css', 'bootstrap.min.css'),
         ('js', 'bootstrap.min.js'),
         ('js', 'jquery.min.js'),
     )
     for kind, filename in assets:
         utils.fs.cp('file:' + local_dirs[kind] + filename,
                     target_root + kind + '/' + filename)

     return template('wrapper_static').render(content=self.html)
Exemplo n.º 3
0
def to_html(sample, stats_object):
    """
    Generate a HTML report from summary statistics and a given sample

    Parameters
    ----------
    sample: DataFrame containing the sample you want to print
    stats_object: Dictionary containing summary statistics. Should be generated with an appropriate describe() function.
        Must have exactly the keys 'table', 'variables' and 'freq'.

    Returns
    -------
    str, containing profile report in HTML format

    Raises
    ------
    TypeError, when sample is not a pandas DataFrame or stats_object is not a dict with exactly the expected keys
    """

    # Validate the arguments before reading anything out of them, so malformed
    # input is reported with the messages below instead of an unrelated
    # KeyError/TypeError raised by the lookups.
    if not isinstance(sample, pd.DataFrame):
        raise TypeError("sample must be of type pandas.DataFrame")

    if not isinstance(stats_object, dict):
        raise TypeError("stats_object must be of type dict. Did you generate this using the spark_df_profiling.describe() function?")

    if set(stats_object.keys()) != {'table', 'variables', 'freq'}:
        raise TypeError("stats_object badly formatted. Did you generate this using the spark_df_profiling-eda.describe() function?")

    n_obs = stats_object['table']['n']

    value_formatters = formatters.value_formatters
    row_formatters = formatters.row_formatters

    def fmt(value, name):
        # Empty lists render as "[]" and null scalars as an empty string.
        # Lists are never passed to pd.isnull.
        if isinstance(value, list):
            if not value:
                return "[]"
        elif pd.isnull(value):
            return ""

        # A formatter registered under the statistic's name wins; floats fall
        # back to the default float formatter, everything else to plain text.
        if name in value_formatters:
            return value_formatters[name](value)
        if isinstance(value, float):
            return value_formatters[formatters.DEFAULT_FLOAT_FORMATTER](value)
        return six.text_type(value)

    def freq_table(varname, freqtable, n, var_table, table_template, row_template, max_number_of_items_in_table):
        # Render the frequency table of variable `varname`: the first
        # `max_number_of_items_in_table` entries of `freqtable`, optionally
        # followed by an "Other values" row and a "(Missing)" row.

        # The two aggregate entries are read first and then removed, so only
        # per-value counts remain in `freqtable`.
        freq_other_prefiltered = freqtable["***Other Values***"]
        freq_other_prefiltered_num = freqtable["***Other Values Distinct Count***"]
        freqtable = freqtable.drop(["***Other Values***", "***Other Values Distinct Count***"])

        freq_other = sum(freqtable[max_number_of_items_in_table:]) + freq_other_prefiltered
        freq_missing = var_table["n_missing"]
        max_freq = max(freqtable.values[0], freq_other, freq_missing)
        try:
            min_freq = freqtable.values[max_number_of_items_in_table]
        except IndexError:
            # Fewer entries than the table can show: nothing was cut off.
            min_freq = 0

        # TODO: Correctly sort missing and other

        def format_row(freq, label, extra_class=''):
            # Width is 1 plus the frequency scaled to 0..99 relative to max_freq.
            width = int(freq / float(max_freq) * 99) + 1
            # The count is printed inside the bar only when the bar is wider than 20.
            if width > 20:
                label_in_bar = freq
                label_after_bar = ""
            else:
                label_in_bar = " "
                label_after_bar = freq

            return row_template.render(label=label,
                                       width=width,
                                       count=freq,
                                       percentage='{:2.1f}'.format(freq / float(n) * 100),
                                       extra_class=extra_class,
                                       label_in_bar=label_in_bar,
                                       label_after_bar=label_after_bar)

        freq_rows = [format_row(freq, label)
                     for label, freq in six.iteritems(freqtable[0:max_number_of_items_in_table])]

        if freq_other > min_freq:
            freq_rows.append(format_row(freq_other,
                                        "Other values (%s)" % (freqtable.count()
                                                               + freq_other_prefiltered_num
                                                               - max_number_of_items_in_table),
                                        extra_class='other'))

        if freq_missing > min_freq:
            freq_rows.append(format_row(freq_missing, "(Missing)", extra_class='missing'))

        return table_template.render(rows=u''.join(freq_rows), varid=hash(varname))

    # Variables
    row_fragments = []
    messages = []

    for idx, row in stats_object['variables'].iterrows():

        formatted_values = {'varname': idx, 'varid': hash(idx)}
        row_classes = {}

        for col, value in six.iteritems(row):
            formatted_values[col] = fmt(value, col)

        for col in set(row.index) & six.viewkeys(row_formatters):
            row_classes[col] = row_formatters[col](row[col])
            if row_classes[col] == "alert" and col in templates.messages:
                messages.append(templates.messages[col].format(formatted_values, varname = formatters.fmt_varname(idx)))

        if row['type'] == 'CAT':
            var_stats = stats_object['variables'].loc[idx]
            formatted_values['minifreqtable'] = freq_table(idx, stats_object['freq'][idx], n_obs, var_stats,
                                                           templates.template('mini_freq_table'), templates.template('mini_freq_table_row'), 3)
            formatted_values['freqtable'] = freq_table(idx, stats_object['freq'][idx], n_obs, var_stats,
                                                       templates.template('freq_table'), templates.template('freq_table_row'), 20)
            if row['distinct_count'] > 50:
                messages.append(templates.messages['HIGH_CARDINALITY'].format(formatted_values, varname = formatters.fmt_varname(idx)))
                row_classes['distinct_count'] = "alert"
            else:
                row_classes['distinct_count'] = ""

        if row['type'] == 'UNIQUE':
            obs = stats_object['freq'][idx].index

            formatted_values['firstn'] = pd.DataFrame(obs[0:3], columns=["First 3 values"]).to_html(classes="example_values", index=False)
            formatted_values['lastn'] = pd.DataFrame(obs[-3:], columns=["Last 3 values"]).to_html(classes="example_values", index=False)

            if n_obs > 40:
                formatted_values['firstn_expanded'] = pd.DataFrame(obs[0:20], index=range(1, 21)).to_html(classes="sample table table-hover", header=False)
                formatted_values['lastn_expanded'] = pd.DataFrame(obs[-20:], index=range(n_obs - 20 + 1, n_obs+1)).to_html(classes="sample table table-hover", header=False)
            else:
                formatted_values['firstn_expanded'] = pd.DataFrame(obs, index=range(1, n_obs+1)).to_html(classes="sample table table-hover", header=False)
                formatted_values['lastn_expanded'] = ''

        row_fragments.append(templates.row_templates_dict[row['type']].render(values=formatted_values, row_classes=row_classes))

        # The row above is rendered with the raw variable name; only the
        # message gets the formatted one.
        if row['type'] in {'CORR', 'CONST'}:
            formatted_values['varname'] = formatters.fmt_varname(idx)
            messages.append(templates.messages[row['type']].format(formatted_values))

    rows_html = u''.join(row_fragments)

    # Overview
    formatted_values = {k: fmt(v, k) for k, v in six.iteritems(stats_object['table'])}

    row_classes = {}
    for col in six.viewkeys(stats_object['table']) & six.viewkeys(row_formatters):
        row_classes[col] = row_formatters[col](stats_object['table'][col])
        if row_classes[col] == "alert" and col in templates.messages:
            # NOTE(review): `idx` is whatever the variables loop left behind (and is
            # unbound when there are no variables) - confirm which name is intended here.
            messages.append(templates.messages[col].format(formatted_values, varname = formatters.fmt_varname(idx)))

    messages_html = u''.join(templates.message_row.format(message=msg) for msg in messages)

    overview_html = templates.template('overview').render(values=formatted_values, row_classes = row_classes, messages=messages_html)

    # Sample

    sample_html = templates.template('sample').render(sample_table_html=sample.to_html(classes="sample"))
    # TODO: should be done in the template
    return templates.template('base').render({'overview_html': overview_html, 'rows_html': rows_html, 'sample_html': sample_html})
Exemplo n.º 4
0
def to_html(sample, stats_object):
    """
    Generate a HTML report from summary statistics and a given sample

    Parameters
    ----------
    sample: DataFrame containing the sample you want to print
    stats_object: Dictionary containing summary statistics. Should be generated with an appropriate describe() function.
        Must have exactly the keys 'table', 'variables' and 'freq'.

    Returns
    -------
    str, containing profile report in HTML format

    Raises
    ------
    TypeError, when sample is not a pandas DataFrame or stats_object is not a dict with exactly the expected keys
    """

    # Validate the arguments before reading anything out of them, so malformed
    # input is reported with the messages below instead of an unrelated
    # KeyError/TypeError raised by the lookups.
    if not isinstance(sample, pd.DataFrame):
        raise TypeError("sample must be of type pandas.DataFrame")

    if not isinstance(stats_object, dict):
        raise TypeError("stats_object must be of type dict. Did you generate this using the spark_df_profiling.describe() function?")

    if set(stats_object.keys()) != {'table', 'variables', 'freq'}:
        raise TypeError("stats_object badly formatted. Did you generate this using the spark_df_profiling-eda.describe() function?")

    n_obs = stats_object['table']['n']

    value_formatters = formatters.value_formatters
    row_formatters = formatters.row_formatters

    def fmt(value, name):
        # Empty lists render as "[]" and null scalars as an empty string.
        # Lists are never passed to pd.isnull.
        if isinstance(value, list):
            if not value:
                return "[]"
        elif pd.isnull(value):
            return ""

        # A formatter registered under the statistic's name wins; floats fall
        # back to the default float formatter, everything else to plain text.
        if name in value_formatters:
            return value_formatters[name](value)
        if isinstance(value, float):
            return value_formatters[formatters.DEFAULT_FLOAT_FORMATTER](value)
        return six.text_type(value)

    def freq_table(varname, freqtable, n, var_table, table_template, row_template, max_number_of_items_in_table):
        # Render the frequency table of variable `varname`: the first
        # `max_number_of_items_in_table` entries of `freqtable`, optionally
        # followed by an "Other values" row and a "(Missing)" row.

        # The two aggregate entries are read first and then removed, so only
        # per-value counts remain in `freqtable`.
        freq_other_prefiltered = freqtable["***Other Values***"]
        freq_other_prefiltered_num = freqtable["***Other Values Distinct Count***"]
        freqtable = freqtable.drop(["***Other Values***", "***Other Values Distinct Count***"])

        freq_other = sum(freqtable[max_number_of_items_in_table:]) + freq_other_prefiltered
        freq_missing = var_table["n_missing"]
        max_freq = max(freqtable.values[0], freq_other, freq_missing)
        try:
            min_freq = freqtable.values[max_number_of_items_in_table]
        except IndexError:
            # Fewer entries than the table can show: nothing was cut off.
            min_freq = 0

        # TODO: Correctly sort missing and other

        def format_row(freq, label, extra_class=''):
            # Width is 1 plus the frequency scaled to 0..99 relative to max_freq.
            width = int(freq / float(max_freq) * 99) + 1
            # The count is printed inside the bar only when the bar is wider than 20.
            if width > 20:
                label_in_bar = freq
                label_after_bar = ""
            else:
                label_in_bar = " "
                label_after_bar = freq

            return row_template.render(label=label,
                                       width=width,
                                       count=freq,
                                       percentage='{:2.1f}'.format(freq / float(n) * 100),
                                       extra_class=extra_class,
                                       label_in_bar=label_in_bar,
                                       label_after_bar=label_after_bar)

        freq_rows = [format_row(freq, label)
                     for label, freq in six.iteritems(freqtable[0:max_number_of_items_in_table])]

        if freq_other > min_freq:
            freq_rows.append(format_row(freq_other,
                                        "Other values (%s)" % (freqtable.count()
                                                               + freq_other_prefiltered_num
                                                               - max_number_of_items_in_table),
                                        extra_class='other'))

        if freq_missing > min_freq:
            freq_rows.append(format_row(freq_missing, "(Missing)", extra_class='missing'))

        return table_template.render(rows=u''.join(freq_rows), varid=hash(varname))

    # Variables
    row_fragments = []
    messages = []

    for idx, row in stats_object['variables'].iterrows():

        formatted_values = {'varname': idx, 'varid': hash(idx)}
        row_classes = {}

        for col, value in six.iteritems(row):
            formatted_values[col] = fmt(value, col)

        for col in set(row.index) & six.viewkeys(row_formatters):
            row_classes[col] = row_formatters[col](row[col])
            if row_classes[col] == "alert" and col in templates.messages:
                messages.append(templates.messages[col].format(formatted_values, varname = formatters.fmt_varname(idx)))

        if row['type'] == 'CAT':
            # Label-based lookup; DataFrame.ix was removed in pandas 1.0.
            var_stats = stats_object['variables'].loc[idx]
            formatted_values['minifreqtable'] = freq_table(idx, stats_object['freq'][idx], n_obs, var_stats,
                                                           templates.template('mini_freq_table'), templates.template('mini_freq_table_row'), 3)
            formatted_values['freqtable'] = freq_table(idx, stats_object['freq'][idx], n_obs, var_stats,
                                                       templates.template('freq_table'), templates.template('freq_table_row'), 20)
            if row['distinct_count'] > 50:
                messages.append(templates.messages['HIGH_CARDINALITY'].format(formatted_values, varname = formatters.fmt_varname(idx)))
                row_classes['distinct_count'] = "alert"
            else:
                row_classes['distinct_count'] = ""

        if row['type'] == 'UNIQUE':
            obs = stats_object['freq'][idx].index

            formatted_values['firstn'] = pd.DataFrame(obs[0:3], columns=["First 3 values"]).to_html(classes="example_values", index=False)
            formatted_values['lastn'] = pd.DataFrame(obs[-3:], columns=["Last 3 values"]).to_html(classes="example_values", index=False)

            if n_obs > 40:
                formatted_values['firstn_expanded'] = pd.DataFrame(obs[0:20], index=range(1, 21)).to_html(classes="sample table table-hover", header=False)
                formatted_values['lastn_expanded'] = pd.DataFrame(obs[-20:], index=range(n_obs - 20 + 1, n_obs+1)).to_html(classes="sample table table-hover", header=False)
            else:
                formatted_values['firstn_expanded'] = pd.DataFrame(obs, index=range(1, n_obs+1)).to_html(classes="sample table table-hover", header=False)
                formatted_values['lastn_expanded'] = ''

        row_fragments.append(templates.row_templates_dict[row['type']].render(values=formatted_values, row_classes=row_classes))

        # The row above is rendered with the raw variable name; only the
        # message gets the formatted one.
        if row['type'] in {'CORR', 'CONST'}:
            formatted_values['varname'] = formatters.fmt_varname(idx)
            messages.append(templates.messages[row['type']].format(formatted_values))

    rows_html = u''.join(row_fragments)

    # Overview
    formatted_values = {k: fmt(v, k) for k, v in six.iteritems(stats_object['table'])}

    row_classes = {}
    for col in six.viewkeys(stats_object['table']) & six.viewkeys(row_formatters):
        row_classes[col] = row_formatters[col](stats_object['table'][col])
        if row_classes[col] == "alert" and col in templates.messages:
            # NOTE(review): `idx` is whatever the variables loop left behind (and is
            # unbound when there are no variables) - confirm which name is intended here.
            messages.append(templates.messages[col].format(formatted_values, varname = formatters.fmt_varname(idx)))

    messages_html = u''.join(templates.message_row.format(message=msg) for msg in messages)

    overview_html = templates.template('overview').render(values=formatted_values, row_classes = row_classes, messages=messages_html)

    # Sample

    sample_html = templates.template('sample').render(sample_table_html=sample.to_html(classes="sample"))
    # TODO: should be done in the template
    return templates.template('base').render({'overview_html': overview_html, 'rows_html': rows_html, 'sample_html': sample_html})
Exemplo n.º 5
0
def to_html(sample, stats_object):
    """
    Generate a HTML report from summary statistics and a given sample
    :param sample: DataFrame containing the sample you want to print
    :param stats_object: Dictionary containing summary statistics, with exactly the keys 'table', 'variables'
        and 'freq'. Should be generated with an appropriate describe() function
    :return: profile report in HTML format
    :type: string
    :raises TypeError: when sample is not a pandas DataFrame or stats_object is not a dict with exactly the
        expected keys
    """
    # Validate the arguments before reading anything out of them, so malformed
    # input is reported with the messages below instead of an unrelated
    # KeyError/TypeError raised by the lookups.
    if not isinstance(sample, pd.DataFrame):
        raise TypeError('sample must be of type pandas.DataFrame')

    if not isinstance(stats_object, dict):
        raise TypeError(
            'stats_object must be of type dict. Did you generate this using the spark_df_profiling.describe() function?'
        )

    if set(stats_object.keys()) != {'table', 'variables', 'freq'}:
        raise TypeError(
            'stats_object badly formatted. Did you generate this using the spark_df_profiling-eda.describe() function?'
        )

    n_obs = stats_object['table']['n']
    row_formatters = formatters.row_formatters

    # Variables
    row_fragments = []
    messages = []

    for idx, row in stats_object['variables'].iterrows():
        formatted_values = {'varname': idx, 'varid': hash(idx)}
        for col, value in six.iteritems(row):
            formatted_values[col] = value_format(value, col)

        row_classes = {}
        for col in set(row.index) & six.viewkeys(row_formatters):
            row_classes[col] = row_formatters[col](row[col])
            if row_classes[col] == 'alert' and col in templates.messages:
                messages.append(templates.messages[col].format(
                    formatted_values, varname=formatters.fmt_varname(idx)))

        if row['type'] == 'CAT':
            # Label-based lookup; DataFrame.ix was removed in pandas 1.0.
            var_stats = stats_object['variables'].loc[idx]
            formatted_values['minifreqtable'] = format_freq_table(
                idx, stats_object['freq'][idx], n_obs,
                var_stats,
                templates.template('mini_freq_table'),
                templates.template('mini_freq_table_row'), 3)
            formatted_values['freqtable'] = format_freq_table(
                idx, stats_object['freq'][idx], n_obs,
                var_stats,
                templates.template('freq_table'),
                templates.template('freq_table_row'), 20)
            if row['distinct_count'] > 50:
                messages.append(templates.messages['HIGH_CARDINALITY'].format(
                    formatted_values, varname=formatters.fmt_varname(idx)))
                row_classes['distinct_count'] = 'alert'
            else:
                row_classes['distinct_count'] = ''

        if row['type'] == 'UNIQUE':
            obs = stats_object['freq'][idx].index
            formatted_values['firstn'] = pd.DataFrame(
                obs[0:3],
                columns=['First 3 values']).to_html(classes='example_values',
                                                    index=False)
            formatted_values['lastn'] = pd.DataFrame(
                obs[-3:],
                columns=['Last 3 values']).to_html(classes='example_values',
                                                   index=False)

            if n_obs > 40:
                formatted_values['firstn_expanded'] = pd.DataFrame(
                    obs[0:20], index=range(1, 21)).to_html(
                        classes='sample table table-hover', header=False)
                formatted_values['lastn_expanded'] = pd.DataFrame(
                    obs[-20:], index=range(n_obs - 20 + 1, n_obs + 1)).to_html(
                        classes='sample table table-hover', header=False)
            else:
                formatted_values['firstn_expanded'] = pd.DataFrame(
                    obs, index=range(1, n_obs + 1)).to_html(
                        classes='sample table table-hover', header=False)
                formatted_values['lastn_expanded'] = ''

        row_fragments.append(templates.row_templates_dict[row['type']].render(
            values=formatted_values, row_classes=row_classes))

        # The row above is rendered with the raw variable name; only the
        # message gets the formatted one.
        if row['type'] in ['CORR', 'CONST']:
            formatted_values['varname'] = formatters.fmt_varname(idx)
            messages.append(
                templates.messages[row['type']].format(formatted_values))

    rows_html = ''.join(row_fragments)

    # Overview
    formatted_values = {
        k: value_format(v, k)
        for k, v in six.iteritems(stats_object['table'])
    }

    row_classes = {}
    for col in six.viewkeys(
            stats_object['table']) & six.viewkeys(row_formatters):
        row_classes[col] = row_formatters[col](stats_object['table'][col])
        if row_classes[col] == 'alert' and col in templates.messages:
            # NOTE(review): `idx` is whatever the variables loop left behind (and is
            # unbound when there are no variables) - confirm which name is intended here.
            messages.append(templates.messages[col].format(
                formatted_values, varname=formatters.fmt_varname(idx)))

    messages_html = ''.join(
        templates.message_row.format(message=msg) for msg in messages)

    overview_html = templates.template('overview').render(
        values=formatted_values,
        row_classes=row_classes,
        messages=messages_html)

    # Add Sample
    sample_html = templates.template('sample').render(
        sample_table_html=sample.to_html(classes='sample', index=False))
    # TODO: should be done in the template
    return templates.template('base').render({
        'overview_html': overview_html,
        'rows_html': rows_html,
        'sample_html': sample_html
    })