示例#1
0
def _seaborn_viz_decomposition(df, result):
    """Plot the components of a time series decomposition.

    The observed, trend, seasonal and residual series are drawn as four
    stacked line plots that share one x-axis.

    Args:
        df: The dataframe; its index supplies the x values of every panel
        result: The statsmodels.tsa.seasonal.DecomposeResult object.

    Returns:
        The matplotlib figure holding the four panels
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    fig, panels = plt.subplots(nrows=4,
                               ncols=1,
                               sharex=True,
                               figsize=figure_size)

    # Top to bottom: observed, trend, seasonal, residual.
    components = (
        result.observed,
        result.trend,
        result.seasonal,
        result.resid,
    )
    for panel, series in zip(panels, components):
        sns.lineplot(y=series, x=df.index, ax=panel)
    fig.suptitle("Time Series Decomposition", fontsize=18)

    # Close the current pyplot figure; the Figure object is still returned.
    plt.close()
    return fig
示例#2
0
def _scatter_plot(data, xname, yname, **kwargs):
    """Draw one joint plot: a scatter plot with marginal histograms.

    Args:
        data: A Pandas data frame
        xname: The x-axis column name
        yname: The y-axis column name
        kwargs: May contain ``joint_kwargs``, ``scatter_kwargs`` and
            ``dist_kwargs``. Each is merged over the defaults below and
            forwarded to ``sns.JointGrid``, ``sns.scatterplot`` and
            ``sns.histplot`` respectively.

    Returns:
        The Seaborn joint grid returned by the plotting calls
    """
    # The grid height is the larger of the two configured figure dimensions.
    joint_options = {
        "height":
        max(
            get_option("display.matplotlib.fig_width"),
            get_option("display.matplotlib.fig_height"),
        )
    }
    joint_options.update(kwargs.get("joint_kwargs", {}))

    # The scatter layer has no defaults of its own.
    scatter_options = dict(kwargs.get("scatter_kwargs", {}))

    # Marginals are plain histograms unless the caller asks for a KDE.
    marginal_options = {"kde": False}
    marginal_options.update(kwargs.get("dist_kwargs", {}))

    grid = sns.JointGrid(x=data[xname], y=data[yname], **joint_options)
    grid = grid.plot_joint(sns.scatterplot, **scatter_options)
    return grid.plot_marginals(sns.histplot, **marginal_options)
示例#3
0
def figure_layout(title="Time Series", xlabel="Date", ylabel="Variable"):
    """Build the plotly layout used for time series figures.

    Figure width, height and title font size are read from the
    ``display.plotly.*`` options.

    Args:
        title: Title of the plot. Defaults to "Time Series".
        xlabel: x-axis label. Defaults to "Date".
        ylabel: y-axis label. Defaults to "Variable".

    Returns:
        The plotly layout
    """
    title_spec = {
        "text": title,
        "font": {
            "size": get_option("display.plotly.title_size")
        },
    }
    fig_width = get_option("display.plotly.fig_width")
    fig_height = get_option("display.plotly.fig_height")

    # Both axes show grid lines and hide tick marks.
    x_axis = go.layout.XAxis(ticks="", title=xlabel, showgrid=True)
    y_axis = go.layout.YAxis(ticks="",
                             title=ylabel,
                             automargin=True,
                             showgrid=True)

    return go.Layout(
        title=title_spec,
        width=fig_width,
        height=fig_height,
        xaxis=x_axis,
        yaxis=y_axis,
    )
示例#4
0
def _seaborn_viz_plot_time_series(df, col, result=None, decompose=False):
    """Create a timeseries visualization.

    A list of columns is checked first and takes precedence over
    ``decompose``; a single column name is plotted only when ``decompose``
    is False.

    Args:
        df: The dataframe; its index supplies the x values
        col (str or [str]): Column of interest. Column datatype must be numerical.
        result: The statsmodels.tsa.seasonal.DecomposeResult object. Defaults to None.
        decompose: Set as True to decompose the timeseries with moving average. result must not be None. Defaults to False.

    Returns:
        The visualization: the value returned by the last ``sns.lineplot``
        call for column plots, the figure from
        ``_seaborn_viz_decomposition`` when decomposing, and otherwise the
        empty figure created at the start of the function.
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    fig, ax = plt.subplots(figsize=figure_size)

    if isinstance(col, list):
        # Overlay every requested column on the same axes.
        for column_name in col:
            fig = sns.lineplot(x=df.index,
                               y=df[column_name],
                               legend="full",
                               ax=ax)
        ax.legend(labels=col)
        return fig

    if isinstance(col, str) and not decompose:
        return sns.lineplot(x=df.index, y=df[col], legend="full", ax=ax)

    if decompose:
        fig = _seaborn_viz_decomposition(df, result)
        # Close the current pyplot figure after the decomposition is drawn.
        plt.close()
    return fig
示例#5
0
def _seaborn_viz_cluster_search_plot(
    cluster_range: Tuple[int, int],
    scores: List[Union[int, float]],
    metric: str,
    **kwargs,
):
    """Visualize the cluster search plot for K-means clusters.

    Args:
        cluster_range (Tuple[int, int]): The range of n_clusters (k)
            searched. It is expanded with ``range(*cluster_range)``, so the
            upper bound is exclusive.
        scores (List[Union[int, float]]): The scores from the evaluation
            metric used to determine the "optimal" n_clusters, one per value
            of n_clusters.
        metric (str): The evaluation metric used. Underscores are replaced
            by spaces to build the y-axis label.
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        Seaborn plot
    """
    n_clusters = list(range(*cluster_range))
    plt.figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    # x and y must be keywords: seaborn >= 0.12 made every parameter after
    # ``data`` keyword-only, so positional vectors raise a TypeError.
    ax = sns.lineplot(x=n_clusters, y=scores)
    ax.set_title("Optimal Number of Clusters")
    # The number of clusters is always a whole number.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel(metric.replace("_", " "))
    return ax
示例#6
0
def _seaborn_viz_elbow_plot(min_topics: int, max_topics: int,
                            coherence_values: List[float]):
    """Creates an elbow plot displaying coherence values vs number of topics.

    Args:
        min_topics: Starting number of topics that were optimized for
        max_topics: Maximum number of topics that were optimized for
            (inclusive)
        coherence_values: A list of coherence values mapped from min_topics to
            max_topics

    Returns:
        Elbow plot showing coherence values vs number of topics
    """
    # The figure has to exist before plotting: creating it afterwards leaves
    # the plot at the default size and adds an extra empty figure.
    plt.figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    ax = sns.lineplot(
        x=list(range(min_topics, max_topics + 1)),
        y=coherence_values,
    )
    ax.set_title("Coherence Values Across Topic Numbers")
    ax.set_xlabel("Number of Topics")
    ax.set_ylabel("Coherence Values")
    return ax
示例#7
0
def _plotly_viz_correlation_matrix(association_matrix):
    """Plot the heatmap for the association matrix.

    Only the lower-left triangle is drawn; the upper triangle, diagonal
    included, is blanked out. The input matrix is left unmodified.

    Args:
        association_matrix (DataFrame): The association matrix

    Returns:
        The result of ``po.iplot`` when ``_in_notebook()`` is true,
        otherwise the plotly figure
    """
    # Work on a float copy: to_numpy() may return a view of the frame's own
    # data, and blanking cells must not alter the caller's matrix.
    corr = association_matrix.to_numpy(dtype=float, copy=True)

    # Plot lower left triangle: blank the upper one, diagonal included
    corr[np.triu_indices(corr.shape[0])] = np.nan

    # Set up the color scale
    cscale = mpl_to_plotly_cmap(get_p_RdBl_cmap())

    # Rows and row labels are both reversed so that the first variable ends
    # up at the top of the heatmap.
    fig = go.Figure(
        data=[
            go.Heatmap(
                z=np.flip(corr, axis=0),
                x=association_matrix.columns.values,
                y=association_matrix.columns.values[::-1],
                connectgaps=False,
                xgap=2,
                ygap=2,
                zmid=0,
                colorscale=cscale,
                colorbar={"title": "Strength"},
            )
        ],
        layout=go.Layout(
            autosize=False,
            width=get_option("display.plotly.fig_width"),
            height=get_option("display.plotly.fig_height"),
            title={
                "text": "Correlation Matrix",
                "font": {
                    "size": get_option("display.plotly.title_size")
                },
            },
            xaxis=go.layout.XAxis(automargin=True,
                                  tickangle=270,
                                  ticks="",
                                  showgrid=False),
            yaxis=go.layout.YAxis(automargin=True, ticks="", showgrid=False),
            plot_bgcolor="rgb(0,0,0,0)",
            paper_bgcolor="rgb(0,0,0,0)",
        ),
    )

    if _in_notebook():
        po.init_notebook_mode(connected=True)
        return po.iplot(fig, config={"displayModeBar": False})
    else:
        return fig
示例#8
0
def _plotly_viz_data_heatmap(data,
                             colnames: List[str],
                             missing: bool = False,
                             **kwargs):
    """Plots the data heatmap.

    Args:
        data: The dataframe
        colnames (List[str]): The column names, used for tick labels
        missing (bool): If True, plots missing values instead: the color
            range becomes 0..1 on a grey scale instead of -3..3 on viridis
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        The result of ``iplot`` when ``_in_notebook()`` is true, otherwise
        the data heatmap as a Plotly figure.
    """
    # Everything that differs between the two modes is chosen in one place.
    if missing:
        z_low, z_high = 0, 1
        color_scale = "greys"
        colorbar_title = "Missing"
    else:
        z_low, z_high = -3, 3
        color_scale = "viridis"
        colorbar_title = "z-score (bounded)"

    # Rows and their labels are both reversed before plotting.
    # NOTE(review): x is sized by data.shape[0] while y is sized by colnames;
    # confirm the expected orientation of `data` against the caller.
    heatmap = go.Heatmap(
        z=np.flip(data.values, axis=0),
        x=list(range(data.shape[0])),
        y=list(colnames[::-1]),
        ygap=1,
        zmin=z_low,
        zmax=z_high,
        colorscale=color_scale,
        colorbar={"title": colorbar_title},
    )

    title_spec = {
        "text": "Data Heatmap",
        "font": {
            "size": get_option("display.plotly.title_size")
        },
    }
    layout = go.Layout(
        autosize=False,
        title=title_spec,
        width=get_option("display.plotly.fig_width"),
        height=get_option("display.plotly.fig_height"),
        xaxis=go.layout.XAxis(ticks="", title="Record #", showgrid=False),
        yaxis=go.layout.YAxis(ticks="",
                              title="Variable",
                              automargin=True,
                              showgrid=False),
        plot_bgcolor="rgb(0,0,0,0)",
        paper_bgcolor="rgb(0,0,0,0)",
    )
    figure = go.Figure(data=[heatmap], layout=layout)

    if not _in_notebook():
        return figure
    init_notebook_mode(connected=True)
    return iplot(figure, config={"displayModeBar": False})
示例#9
0
def _get_viz_backend(backend: str = None) -> _Backend:
    """Get the visualization backend by name.

    Args:
        backend: The name of the backend, usually the package name. When
            empty or None, the ``backends.viz`` option is used instead.

    Returns:
        _Backend
    """
    backend_name = backend if backend else get_option("backends.viz")

    # Reuse the registered modules when the check passes, otherwise load them.
    if _check_backend(backend_name, _viz_backends):
        modules = _viz_backends[backend_name]
    else:
        modules = _load_viz_backend(backend_name)

    return _Backend([module for _, module in modules.items()])
示例#10
0
def _seaborn_viz_data_heatmap(data,
                              colnames: List[str],
                              missing: bool = False,
                              **kwargs):
    """Plots the data heatmap.

    Args:
        data: The dataframe
        colnames: The column names, used for tick labels
        missing: If True, plots missing values instead: the color range
            becomes 0..1 on a grey scale and the colorbar is labelled
            "Missing"
        kwargs: Keyword arguments passed to seaborn.heatmap. They override
            the defaults assembled in this function.

    Returns:
        The seaborn figure
    """
    plot_options = {
        "cmap": "viridis" if not missing else "Greys",
        "robust": True,
        "center": 0 if not missing else 0.5,
        "xticklabels": False,
        "yticklabels": colnames,
        "cbar_kws": {
            "shrink": 0.5,
            # The label must describe what is drawn: z-scores in the default
            # mode, the missing-value indicator otherwise.
            "label": "z-score (bounded)" if not missing else "Missing",
        },
        "vmin": -3 if not missing else 0,
        "vmax": 3 if not missing else 1,
    }

    plot_options.update(kwargs)

    fig = Figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    ax = fig.add_subplot(111)
    ax = sns.heatmap(data, ax=ax, **plot_options)
    ax.set_title("Data Heatmap")
    ax.set_xlabel("Record #")
    ax.set_ylabel("Variable")
    # Keep the variable names horizontal.
    ax.set_yticklabels(colnames, rotation=0)

    return fig
示例#11
0
def _plotly_viz_decomposition(result,
                              dates,
                              title="Time Series Decomposition"):
    """Create timeseries decomposition visualization.

    The observed, trend, seasonal and residual series are placed in four
    stacked subplots that share the x-axis.

    Args:
        result: The statsmodels.tsa.seasonal.DecomposeResult object.
        dates: The datetime index, used as the x values of every trace
        title: Title of the plot. Defaults to "Time Series Decomposition".

    Returns:
        The visualization
    """
    fig = make_subplots(rows=4, cols=1, x_title="Time", shared_xaxes=True)

    # (legend name, series) pairs in top-to-bottom row order.
    components = (
        ("observed", result.observed),
        ("trend", result.trend),
        ("seasonal", result.seasonal),
        ("residual", result.resid),
    )
    for row_number, (trace_name, series) in enumerate(components, start=1):
        fig.add_trace(
            go.Scatter(x=dates, y=series, name=trace_name),
            row=row_number,
            col=1,
        )

    fig.update_layout(
        height=get_option("display.plotly.fig_height"),
        width=get_option("display.plotly.fig_width"),
        title_text=title,
        legend_title_text="Decomposition",
    )
    return fig
示例#12
0
def _seaborn_viz_correlation_matrix(association_matrix,
                                    annot=False,
                                    xticks_rotation=90,
                                    yticks_rotation=0):
    """Plot the heatmap for the association matrix.

    Only the lower-left triangle is drawn; the upper triangle, diagonal
    included, is masked. The color range spans the smallest and largest
    non-NaN values of the matrix.

    Args:
        association_matrix (DataFrame): The association matrix
        annot (bool): If True, add correlation values to plot. Defaults to False.
        xticks_rotation (int): Degrees of rotation for the xticks. Defaults to 90.
        yticks_rotation (int): Degrees of rotation for the yticks. Defaults to 0.

    Raises:
        ValueError: The matrix has no non-NaN values.

    Returns:
        The Seaborn figure
    """
    # Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(association_matrix, dtype=bool))
    corr = association_matrix.to_numpy()

    # NaN cells are ignored when choosing the color limits.
    observed = corr[~np.isnan(corr)]
    vmin = observed.min()
    vmax = observed.max()

    plt.figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))

    ax = sns.heatmap(
        association_matrix,
        vmin=vmin,
        vmax=vmax,
        cmap=get_p_RdBl_cmap(),
        annot=annot,
        mask=mask,
        center=0,
        linewidths=2,
        square=True,
    )

    plt.title("Correlation Matrix")
    plt.xticks(rotation=xticks_rotation)
    plt.yticks(rotation=yticks_rotation)
    return ax
示例#13
0
def _seaborn_viz_importance(importance_values, idx, cols):
    """Plot feature importances as a horizontal bar chart.

    Args:
        importance_values: The importances
        idx: The sorted indices, applied to both ``cols`` and
            ``importance_values``
        cols: The columns

    Returns:
        fig: The object returned by ``set_title`` on the bar plot
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    plt.figure(figsize=figure_size)
    plt.xlabel("Permutation Importance Value")
    plt.ylabel("Features")

    # Features go on the y-axis, their importance values on the x-axis.
    bars = sns.barplot(y=cols[idx],
                       x=importance_values[idx],
                       palette="Blues_d")
    return bars.set_title("Feature Importance")
示例#14
0
def _seaborn_viz_cluster(
    data,
    method: str,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    **kwargs,
):
    """Visualize clusters using Seaborn.

    Args:
        data (DataFrame): The data; the columns "x", "y" and "clusters" are
            read
        method (str): The clustering method, to be used as the plot title
        xlabel (str, optional): The x-axis label. Defaults to "Reduced Dimension 1".
        ylabel (str, optional): The y-axis label. Defaults to "Reduced Dimension 2".
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        Seaborn plot
    """
    if not xlabel:
        xlabel = "Reduced Dimension 1"
    if not ylabel:
        ylabel = "Reduced Dimension 2"

    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    plt.figure(figsize=figure_size)

    # One palette color per distinct cluster label.
    n_labels = len(np.unique(data["clusters"]))
    palette = sns.color_palette(n_colors=n_labels)
    ax = sns.scatterplot(
        data=data,
        x="x",
        y="y",
        hue="clusters",
        palette=palette,
        legend="full",
        alpha=0.7,
    )
    sns.set_context("talk")
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.legend(bbox_to_anchor=(1.25, 1), loc="upper right", ncol=1)
    plt.title(method + " Cluster")
    return ax
示例#15
0
def _seaborn_viz_plot_autocorrelation(timeseries,
                                      plot_type="acf",
                                      n_lags=40,
                                      fft=False,
                                      **kwargs):
    """Create timeseries autocorrelation visualization.

    Args:
        timeseries: Series object containing datetime index
        plot_type: Choose between 'acf' or 'pacf'. Defaults to "acf".
        n_lags: Number of lags to return autocorrelation for. Defaults to 40.
        fft (bool): If True, computes ACF via FFT. Only used when
            ``plot_type`` is "acf".
        **kwargs: Keyword arguments for plot_acf or plot_pacf.

    Raises:
        ValueError: Invalid `plot_type`.

    Returns:
        The visualization
    """
    # Validate before creating the figure so that a bad plot_type does not
    # leave an unclosed figure behind.
    if plot_type not in ("acf", "pacf"):
        raise ValueError(
            f"Unsupported plot_type {plot_type!r}; expected 'acf' or 'pacf'")

    fig, ax = plt.subplots(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    tsa_plots = _compat["statsmodels.api"].graphics.tsa  # type: ignore
    if plot_type == "acf":
        fig = tsa_plots.plot_acf(timeseries,
                                 ax=ax,
                                 lags=n_lags,
                                 fft=fft,
                                 **kwargs)
    else:
        fig = tsa_plots.plot_pacf(timeseries, ax=ax, lags=n_lags, **kwargs)
    ax.set_xlabel("Lags")
    ax.set_ylabel(plot_type)
    # Close the current pyplot figure; the figure object is still returned.
    plt.close()
    return fig
示例#16
0
def _seaborn_viz_categorical(data,
                             x: str,
                             contrast: Optional[str] = None,
                             **kwargs):
    """Plots a bar count plot for a categorical feature.

    The drawing itself is delegated to ``_seaborn_viz_bar``; this function
    creates the figure and titles the axes with the column name.

    Args:
        data (DataFrame): The data
        x (str): The name of the column to plot, also used as the title
        contrast (str, optional): The name of the categorical column to use for multiple contrasts.
        **kwargs: Keyword args forwarded to ``_seaborn_viz_bar``.

    Returns:
        Matplotlib figure
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    fig = Figure(figsize=figure_size)
    axes = fig.add_subplot()
    _seaborn_viz_bar(data, x, contrast, ax=axes, **kwargs)
    axes.set_title(x)
    return fig
示例#17
0
def _get_compute_backend(backend: str = None, df=None) -> _Backend:
    """Get the compute backend by name.

    In addition to searching through entrypoints, the input data (DataFrame)
    type will be used to infer an appropriate compute backend.

    Args:
        backend: The name of the backend, usually the package name. When
            given, it is the only backend used.
        df: The input dataframe which may be used to infer the backend. Its
            type string is looked up in ``_DATAFRAME_BACKENDS``; the
            ``backends.compute`` option is appended as the last candidate.

    Returns:
        _Backend
    """
    if backend:
        backend_types = [backend]
    else:
        data_type = str(type(df))
        candidates = [
            *_DATAFRAME_BACKENDS.get(data_type, []),
            get_option("backends.compute"),
        ]
        # Remove duplicates, maintain order. dict.fromkeys keeps the first
        # occurrence of each name; popping from the list while enumerating
        # it would skip the element that follows each removal.
        backend_types = list(dict.fromkeys(candidates))

    backend_collection = []
    for backend_name in backend_types:
        # Reuse the registered modules when the check passes, else load them.
        if _check_backend(backend_name, _compute_backends):
            modules = _compute_backends[backend_name]
        else:
            modules = _load_compute_backend(backend_name)
        backend_collection.append(modules)
    backend_list = [
        module for d in backend_collection for _, module in d.items()
    ]
    return _Backend(backend_list)
示例#18
0
def _seaborn_viz_numeric(
    data,
    x: str,
    contrast: Optional[str] = None,
    mode: str = "combo",
    hist_kwargs: Optional[dict] = None,
    violin_kwargs: Optional[dict] = None,
    **kwargs,
):
    """Plots a histogram/violin plot.

    In "combo" mode the histogram takes the top row of a five-row grid and
    the violin plot the remaining four rows.

    Args:
        data (DataFrame): The data
        x (str): The name of the column to plot, also used as the title
        contrast (str, optional): The name of the categorical column to use for multiple contrasts.
        mode (str): {'combo', 'violin', 'hist'} The type of plot to display.
            Defaults to a combined histogram/violin plot.
        hist_kwargs (dict, optional): Keyword args for the histogram.
        violin_kwargs (dict, optional): Keyword args for the violin plot.
        **kwargs: Keyword args forwarded to the plotting function in "hist"
            and "violin" mode. They are not forwarded in "combo" mode.

    Raises:
        ValueError: Unknown plot mode.

    Returns:
        Matplotlib figure
    """
    if not hist_kwargs:
        hist_kwargs = {}
    if not violin_kwargs:
        violin_kwargs = {}

    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    fig = Figure(figsize=figure_size)

    if mode == "combo":
        grid = GridSpec(nrows=5, ncols=1)
        hist_ax = fig.add_subplot(grid.new_subplotspec((0, 0), 1, 1))
        violin_ax = fig.add_subplot(grid.new_subplotspec((1, 0), 4, 1))

        # Drop the top and right borders of the histogram panel.
        for side in ("right", "top"):
            hist_ax.spines[side].set_visible(False)
        _seaborn_viz_histogram(data, x, contrast, ax=hist_ax, **hist_kwargs)
        _seaborn_viz_violin(data, x, contrast, ax=violin_ax, **violin_kwargs)
        hist_ax.set_title(x)
        return fig

    # Single-plot modes differ only in the plotting function and its options.
    if mode == "hist":
        draw, specific_kwargs = _seaborn_viz_histogram, hist_kwargs
    elif mode == "violin":
        draw, specific_kwargs = _seaborn_viz_violin, violin_kwargs
    else:
        raise ValueError("Unknown value for 'mode' plot type")

    single_ax = fig.add_subplot()
    draw(data, x, contrast=contrast, ax=single_ax, **specific_kwargs, **kwargs)
    single_ax.set_title(x)
    return fig
示例#19
0
def _plotly_viz_cluster(
    data,
    method: str,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    **kwargs,
):
    """Visualize clusters using Plotly.

    One scatter trace is added per distinct value of ``data["clusters"]``.
    Labels whose integer value is negative are drawn in grey and named
    "Noise".

    Args:
        data (DataFrame): The data; the columns "x", "y" and "clusters" are
            read
        method (str): The clustering method, to be used as the plot title
        xlabel (str, optional): The x-axis label. Defaults to "Reduced Dimension 1".
        ylabel (str, optional): The y-axis label. Defaults to "Reduced Dimension 2".
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        The result of ``po.iplot`` when ``_in_notebook()`` is true,
        otherwise the Plotly figure
    """
    if not xlabel:
        xlabel = "Reduced Dimension 1"
    if not ylabel:
        ylabel = "Reduced Dimension 2"

    traces = []
    for label in data["clusters"].unique():
        if int(label) < 0:
            trace_name = "Noise"
            marker = {
                "size": 10,
                "color": "grey",
                "line": None,
                "opacity": 0.7,
            }
        else:
            trace_name = f"Cluster #{label}"
            marker = {
                "size": 10,
                "colorscale": "earth",
                "line": None,
                "opacity": 0.7,
            }
        members = data["clusters"] == label
        traces.append(
            go.Scatter(
                x=data.loc[members, "x"],
                y=data.loc[members, "y"],
                name=trace_name,
                mode="markers",
                marker=marker,
            ))

    # NOTE(review): axis titles are supplied twice, as column names inside
    # the axis dicts and as yaxis_title/xaxis_title; confirm which one
    # plotly applies.
    layout = {
        "yaxis": {
            "zeroline": False,
            "title": data.columns[0]
        },
        "xaxis": {
            "zeroline": False,
            "title": data.columns[1]
        },
        "yaxis_title": ylabel,
        "xaxis_title": xlabel,
        "autosize": False,
        "width": int(get_option("display.plotly.fig_width")),
        "height": int(get_option("display.plotly.fig_height")),
        "title": {
            "text": "{} Cluster".format(method),
            "font": {
                "size": get_option("display.plotly.title_size")
            },
        },
    }

    fig = go.Figure({"data": traces, "layout": layout})

    if not _in_notebook():
        return fig
    po.init_notebook_mode(connected=True)
    return po.iplot(fig)
示例#20
0
import hashlib
import logging
from functools import reduce
from typing import Optional, Union
import warnings

from data_describe.misc.logging import OutputLogger
from data_describe.backends import _get_compute_backend
from data_describe.compat import _is_dataframe, _compat, _requires
from data_describe.config._config import get_option
from data_describe._widget import BaseWidget

# Detection defaults, read from the package configuration once at import
# time; they are used as default argument values of sensitive_data below.
_DEFAULT_SCORE_THRESHOLD = get_option("sensitive_data.score_threshold")
_SAMPLE_SIZE = get_option("sensitive_data.sample_size")

# Limit the "presidio" logger to warnings and above.
logger = logging.getLogger("presidio")
logger.setLevel(logging.WARNING)


@_requires("presidio_analyzer")
def sensitive_data(
    df,
    mode: str = "redact",
    detect_infotypes: bool = True,
    columns: Optional[list] = None,
    score_threshold: float = _DEFAULT_SCORE_THRESHOLD,
    sample_size: int = _SAMPLE_SIZE,
    engine_backend=None,
    compute_backend: Optional[str] = None,
):
    """Identifies, redacts, and/or encrypts PII data.