Exemplo n.º 1
0
def render_path_image(summary):
    """Extend the path rendering of a variable with image-specific widgets.

    Starts from the structure returned by ``render_path``, relabels the
    variable type as "Image Path" and appends an "Exif keys" frequency table
    and an "Image shape" tab group (frequency table plus scatter plot) to the
    items of the ``"bottom"`` entry.

    Args:
        summary: Mapping describing the variable. Read here: ``varid``,
            ``n``, ``image_shape_counts``, ``exif_keys_counts`` and
            ``scatter_data``.

    Returns:
        The template variables produced by ``render_path``, extended with
        the image-specific entries.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_path(summary)

    # Top: relabel the variable type on the first item.
    template_variables["top"].content["items"][0].content["var_type"] = "Image Path"

    # Frequency data for both image-specific summaries.
    for part in ("image_shape", "exif_keys"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    # Bottom: Exif keys table.
    exif_table = FrequencyTable(
        template_variables["freqtable_exif_keys"],
        name="Exif keys",
        anchor_id=f"{varid}exif_frequency",
    )
    template_variables["bottom"].content["items"].append(exif_table)

    # Bottom: image shape tabs (frequency table + scatter plot).
    shape_table = FrequencyTable(
        template_variables["freqtable_image_shape"],
        name="Frequency",
        anchor_id=f"{varid}image_shape_frequency",
    )
    shape_scatter = Image(
        scatter_series(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot of image sizes",
        caption="Scatterplot of image sizes",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    shape_tabs = Sequence(
        [shape_table, shape_scatter],
        sequence_type="tabs",
        name="Image shape",
        anchor_id=f"{varid}image_shape",
    )
    template_variables["bottom"].content["items"].append(shape_tabs)

    return template_variables
Exemplo n.º 2
0
def get_missing_items(summary) -> list:
    """Build one image widget per entry of ``summary["missing"]``.

    Args:
        summary: Mapping whose ``"missing"`` entry maps an identifier to a
            dict with a ``"matrix"`` value (passed to ``Image`` as its
            content) and a ``"name"`` value (used as alt text and name).

    Returns:
        A list of ``Image`` objects in the iteration order of
        ``summary["missing"]``; each uses its key as anchor id.
    """
    image_format = config["plot"]["image_format"].get(str)

    return [
        Image(
            diagram["matrix"],
            image_format=image_format,
            alt=diagram["name"],
            name=diagram["name"],
            anchor_id=anchor,
        )
        for anchor, diagram in summary["missing"].items()
    ]
Exemplo n.º 3
0
def get_pie_chart(pie_charts) -> list:
    """Wrap every pie chart in an image widget.

    Args:
        pie_charts: Mapping from a chart label to the chart content handed
            to ``Image``. The label is used both as alt text and as name.

    Returns:
        A list of ``Image`` objects in the iteration order of ``pie_charts``.
    """
    image_format = config["plot"]["image_format"].get(str)

    # NOTE(review): every image receives the same anchor id, so the ids
    # collide as soon as more than one chart is passed in -- confirm that
    # this is intended by whatever consumes the anchors.
    return [
        Image(
            chart,
            image_format=image_format,
            alt=label,
            name=label,
            anchor_id='pie-chart',
        )
        for label, chart in pie_charts.items()
    ]
Exemplo n.º 4
0
def get_scatter_matrix(scatter_matrix):
    """Turn a nested mapping of scatter plots into one tab group per x column.

    Args:
        scatter_matrix: Mapping ``x_col -> {y_col -> plot}``.

    Returns:
        A list with one tabbed ``Sequence`` per x column (named after it);
        each sequence holds one ``Image`` per y column (named after it).
    """
    image_format = config["plot"]["image_format"].get(str)

    tab_groups = []
    for x_col, y_cols in scatter_matrix.items():
        plots = [
            Image(
                splot,
                image_format=image_format,
                alt=f"{x_col} x {y_col}",
                anchor_id=f"scatter_{x_col}_{y_col}",
                name=y_col,
            )
            for y_col, splot in y_cols.items()
        ]
        tab_groups.append(
            Sequence(
                plots,
                sequence_type="tabs",
                name=x_col,
                anchor_id=f"scatter_{x_col}",
            )
        )

    return tab_groups
Exemplo n.º 5
0
def render_count(summary):
    """Render the report widgets for a count-valued variable.

    Builds on ``render_common`` and sets the ``"top"`` grid (variable info,
    two summary tables and a mini histogram) and the ``"bottom"`` tab group
    (histogram(s), common values and extreme values).

    Every anchor id below the bottom tab group is prefixed with the variable
    id, so the ids differ between variables instead of being the same
    constant string for each of them.

    Args:
        summary: Mapping describing the variable. Read here: ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``mean``, ``min``, ``max``,
            ``n_zeros``, ``p_zeros``, ``memory_size``, ``histogram_data``,
            ``histogram_bins`` and, optionally,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The template variables from ``render_common`` with ``"top"`` and
        ``"bottom"`` set.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def _row(name, key, fmt):
        # All table rows share one shape; none of them is flagged as alert.
        return {"name": name, "value": summary[key], "fmt": fmt, "alert": False}

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
    )

    table1 = Table(
        [
            _row("Distinct count", "n_unique", "fmt"),
            _row("Unique (%)", "p_unique", "fmt_percent"),
            _row("Missing", "n_missing", "fmt"),
            _row("Missing (%)", "p_missing", "fmt_percent"),
        ]
    )

    table2 = Table(
        [
            _row("Mean", "mean", "fmt"),
            _row("Minimum", "min", "fmt"),
            _row("Maximum", "max", "fmt"),
            _row("Zeros", "n_zeros", "fmt"),
            _row("Zeros (%)", "p_zeros", "fmt_percent"),
            _row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    # Only keys that are actually rendered are read from ``summary``; the
    # quantile/descriptive statistics are not part of the output.
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={summary['histogram_bins']})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Sequence(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-bin histogram is only shown when its bins were computed.
    if "histogram_bins_bayesian_blocks" in summary:
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption='<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'.format(
                fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
            ),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )

        seqs.append(histo_dyn)

    template_variables["bottom"] = Sequence(
        [
            Sequence(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return template_variables
Exemplo n.º 6
0
def render_complex(summary):
    """Render the report widgets for a complex-valued variable.

    Args:
        summary: Mapping describing the variable. Read here: ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``memory_size``, ``mean``,
            ``min``, ``max``, ``n_zeros``, ``p_zeros`` and ``scatter_data``.

    Returns:
        A dict with a ``"top"`` grid (variable info, two tables and an empty
        HTML placeholder) and a ``"bottom"`` tab group holding one scatter
        plot.
    """
    varid = summary["varid"]
    template_variables = {}
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (&Copf;)",
        summary["warnings"],
    )

    # (label, summary key, formatter name) for each table row.
    overview_spec = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    values_spec = (
        ("Mean", "mean", "fmt"),
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
        ("Zeros", "n_zeros", "fmt"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
    )

    overview_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": fmt}
            for label, key, fmt in overview_spec
        ]
    )
    values_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": fmt}
            for label, key, fmt in values_spec
        ]
    )

    # Empty cell that takes the fourth slot of the grid.
    filler = HTML("")

    template_variables["top"] = Sequence(
        [info, overview_table, values_table, filler], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    template_variables["bottom"] = Sequence(
        [scatter], sequence_type="tabs", anchor_id=varid
    )

    return template_variables
Exemplo n.º 7
0
def render_date(summary):
    """Render the report widgets for a date variable.

    Args:
        summary: Mapping describing the variable. Read here: ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``memory_size``, ``min``, ``max``,
            ``histogram_data`` and ``histogram_bins``.

    Returns:
        A dict with a ``"top"`` grid (variable info, two tables and a mini
        histogram) and a ``"bottom"`` tab group holding the full histogram.
    """
    varid = summary["varid"]
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(varid, summary["varname"], "Date", summary["warnings"])

    # (label, summary key, formatter name) for each table row.
    overview_spec = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    range_spec = (
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
    )

    overview_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": fmt, "alert": False}
            for label, key, fmt in overview_spec
        ]
    )
    range_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": fmt, "alert": False}
            for label, key, fmt in range_spec
        ]
    )

    mini_plot = mini_histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    mini_histo = Image(mini_plot, image_format=image_format, alt="Mini histogram")

    template_variables["top"] = Sequence(
        [info, overview_table, range_table, mini_histo], sequence_type="grid"
    )

    # Bottom
    full_plot = histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    full_histo = Image(
        full_plot,
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    template_variables["bottom"] = Sequence(
        [full_histo], sequence_type="tabs", anchor_id=varid
    )

    return template_variables
Exemplo n.º 8
0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix
    image is created; when a description is available the image is paired
    with an HTML description in a grid. All entries are shown as tabs.

    Args:
        summary: Mapping whose ``"correlations"`` entry maps a correlation
            key (``"pearson"``, ``"spearman"`` or ``"cramers"``) to the data
            handed to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlation items together with a
        toggle button for the descriptions, or ``None`` when there are no
        items to show.

    Raises:
        KeyError: If ``summary["correlations"]`` contains a key for which no
            display data is defined.
    """
    items = get_items()

    # NOTE: the descriptions are HTML fragments shown to the user. Adjacent
    # literals are concatenated as-is, so every piece must carry its own
    # trailing space.
    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear "
        "correlation between two variables. Its value lies between -1 and +1, where "
        "-1 indicates a total negative linear correlation, 0 indicates no linear "
        "correlation and +1 indicates a total positive linear correlation. <br />"
        "<br /> Pearson's <em>r</em> assumes the following: <br /> "
        "  - Variables are continuous (Spearman's correlation should be used for ordinal) <br /> "
        "  - Measurements are related (e.g. every row has a height and weight measurement) <br /> "
        "  - Minimal to no outliers <br /> "
        "  - Variables are  normally distributed <br /> "
        "  - Variables are linearly related <br /> "
        "  - Homoscedasticity (equal variance of data around regression line)<br /> "
        "<br /> To calculate <em>r</em> for two variables <em>X</em> and <em>Y</em>, one divides the "
        "covariance of <em>X</em> and <em>Y</em> by the product of their standard deviations. "
    )

    spearman_description = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of "
        "monotonic correlation between two variables, and is therefore better in "
        "catching nonlinear correlations than Pearson's <em>r</em>. Its value lies "
        "between -1 and +1, where -1 indicates a total negative correlation, 0 indicates "
        "no correlation, and 1 indicates total positive correlation.<br /> "
        "<br />Spearman's rank correlation assumes two things:<br /> "
        "  - Data is monotonically related<br /> "
        "  - At least one variable in the correlation is ordinal<br /> "
        "<br />A monotonic relationship states one of the following:<br /> "
        "  - As the value of one variable INCREASES, so does the value of another<br /> "
        "  - As the value of one variable DECREASES, the value of another INCREASES<br /> "
        "<br />To calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one "
        "divides the covariance of the rank variables of <em>X</em> and <em>Y</em> by the "
        "product of their standard deviations. "
    )

    cramer_description = (
        "Cramer's V is a measure between two nominal (categorical) variables, where the "
        "score is between 0 and 1. Unlike Pearson's and Spearman correlations, Cramer's V "
        "does not indicate a direction of the relationship (positive or negative), but instead "
        "indicates the strength of the relationship. <br /> "
        "<br />The following guidelines can be used to determine the strength of the correlation: <br /> "
        "  - Very strong relationship: 0.25 or higher <br /> "
        "  - Strong relationship: 0.15 to 0.25 <br /> "
        "  - Moderate relationship: 0.11 to 0.15 <br /> "
        "  - weak relationship: 0.06 to 0.10 <br />"
        "  - No or negligible relationship: 0.01 to 0.05 <br /> "
        "<br />Cramer's V correlation assumes that your data has more than 2 columns and 2 rows (2x2)."
    )

    # correlation key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "cramers": (0, "Cramér's V (φc)", cramer_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if not description:
            # Nothing to explain: show the bare diagram.
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )

        items.append(
            Sequence(
                [diagram, desc],
                anchor_id=key,
                name=name,
                sequence_type="grid",
            )
        )

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(
        name="Correlations",
        anchor_id="correlations",
        button=btn,
        item=corr,
    )
Exemplo n.º 9
0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix
    image is created; when a description is available the image is paired
    with an HTML description in a grid. All entries are shown as tabs.

    Args:
        summary: Mapping whose ``"correlations"`` entry maps a correlation
            key (``"pearson"``, ``"spearman"``, ``"kendall"``, ``"phi_k"``,
            ``"cramers"`` or ``"recoded"``) to the data handed to
            ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlation items together with a
        toggle button for the descriptions, or ``None`` when there are no
        items to show.

    Raises:
        KeyError: If ``summary["correlations"]`` contains a key for which no
            display data is defined.
    """
    items = get_items()

    # The descriptions are HTML fragments shown to the user.
    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. Its value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. ")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. Its value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. Its value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    # correlation key -> (vmin passed to the plot, display name, description);
    # an empty description means only the diagram is shown.
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", ""),
        "cramers": (0, "Cramér's V (φc)", ""),
        "recoded": (0, "Recoded", ""),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if not description:
            # Nothing to explain: show the bare diagram.
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )

        items.append(
            Sequence(
                [diagram, desc],
                anchor_id=key,
                name=name,
                sequence_type="grid",
            )
        )

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(
        name="Correlations",
        anchor_id="correlations",
        button=btn,
        item=corr,
    )
Exemplo n.º 10
0
def render_path(summary):
    """Extend the categorical rendering of a variable with path-specific tables.

    Starts from the structure returned by ``render_categorical``, relabels
    the variable type as "Path" and appends frequency tables for the full
    value and for the stem, name, suffix and parent parts to the items of
    the ``"bottom"`` entry. A file-size histogram is appended as well when
    ``summary`` contains ``"file_sizes"``.

    Args:
        summary: Mapping describing the variable. Read here: ``varid``,
            ``n``, ``name_counts``, ``parent_counts``, ``suffix_counts``,
            ``stem_counts`` and, when ``file_sizes`` is present,
            ``file_sizes`` and ``histogram_bins``.

    Returns:
        The template variables produced by ``render_categorical``, extended
        with the path-specific entries.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_categorical(summary)

    # Frequency data for every path part.
    for part in ("name", "parent", "suffix", "stem"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    # Top: relabel the variable type on the first item.
    template_variables["top"].content["items"][0].content["var_type"] = "Path"

    # Bottom: (template key, tab name, anchor suffix) in display order.
    table_specs = (
        ("freq_table_rows", "Full", "full_frequency"),
        ("freqtable_stem", "Stem", "stem_frequency"),
        ("freqtable_name", "Name", "name_frequency"),
        ("freqtable_suffix", "Suffix", "suffix_frequency"),
        ("freqtable_parent", "Parent", "parent_frequency"),
    )
    tables = [
        FrequencyTable(
            template_variables[rows_key],
            name=label,
            anchor_id=f"{varid}{anchor}",
        )
        for rows_key, label, anchor in table_specs
    ]
    for table in tables:
        template_variables["bottom"].content["items"].append(table)

    if "file_sizes" in summary:
        size_plot = histogram(
            summary["file_sizes"], summary, summary["histogram_bins"]
        )
        file_size_histogram = Image(
            size_plot,
            image_format=image_format,
            alt="File size",
            caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={summary['histogram_bins']})",
            name="File size",
            anchor_id=f"{varid}file_size_histogram",
        )
        template_variables["bottom"].content["items"].append(file_size_histogram)

    return template_variables