def _seaborn_viz_decomposition(df, result):
    """Draw the four panels of a time series decomposition with Seaborn.

    Args:
        df: The dataframe; its index supplies the shared x-axis
        result: The statsmodels.tsa.seasonal.DecomposeResult object. Its
            observed, trend, seasonal and resid attributes are plotted from
            top to bottom.

    Returns:
        The matplotlib figure holding the four stacked line plots
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    figure, axes = plt.subplots(nrows=4, ncols=1, sharex=True, figsize=figure_size)

    # One row per decomposition component, in display order.
    for axis, component in zip(axes, ("observed", "trend", "seasonal", "resid")):
        sns.lineplot(y=getattr(result, component), x=df.index, ax=axis)

    figure.suptitle("Time Series Decomposition", fontsize=18)
    # Closes pyplot's current figure; the Figure object itself is still returned.
    plt.close()
    return figure
def _scatter_plot(data, xname, yname, **kwargs):
    """Build a single joint plot: a scatter plot with marginal histograms.

    Args:
        data: A Pandas data frame
        xname: The x-axis column name
        yname: The y-axis column name
        kwargs: May contain "joint_kwargs", "scatter_kwargs" and
            "dist_kwargs"; each is merged over the defaults and forwarded to
            sns.JointGrid, sns.scatterplot and sns.histplot respectively.

    Returns:
        The Seaborn JointGrid
    """
    width = get_option("display.matplotlib.fig_width")
    height = get_option("display.matplotlib.fig_height")

    # Defaults first, then caller overrides on top.
    joint_options = {"height": max(width, height)}
    joint_options.update(kwargs.get("joint_kwargs", {}))

    scatter_options = {}
    scatter_options.update(kwargs.get("scatter_kwargs", {}))

    marginal_options = {"kde": False}
    marginal_options.update(kwargs.get("dist_kwargs", {}))

    grid = sns.JointGrid(x=data[xname], y=data[yname], **joint_options)
    grid = grid.plot_joint(sns.scatterplot, **scatter_options)
    return grid.plot_marginals(sns.histplot, **marginal_options)
def figure_layout(title="Time Series", xlabel="Date", ylabel="Variable"):
    """Build the Plotly layout shared by the time series figures.

    Title size and figure dimensions are read from the "display.plotly.*"
    options.

    Args:
        title: Title of the plot. Defaults to "Time Series".
        xlabel: x-axis label. Defaults to "Date".
        ylabel: y-axis label. Defaults to "Variable".

    Returns:
        The plotly layout
    """
    title_spec = {
        "text": title,
        "font": {"size": get_option("display.plotly.title_size")},
    }
    fig_width = get_option("display.plotly.fig_width")
    fig_height = get_option("display.plotly.fig_height")
    x_axis = go.layout.XAxis(ticks="", title=xlabel, showgrid=True)
    y_axis = go.layout.YAxis(ticks="", title=ylabel, automargin=True, showgrid=True)

    return go.Layout(
        title=title_spec,
        width=fig_width,
        height=fig_height,
        xaxis=x_axis,
        yaxis=y_axis,
    )
def _seaborn_viz_plot_time_series(df, col, result=None, decompose=False):
    """Plot one or more time series columns, or a decomposition, with Seaborn.

    Args:
        df: The dataframe; its index is used as the x-axis
        col (str or [str]): Column of interest. Column datatype must be numerical.
        result: The statsmodels.tsa.seasonal.DecomposeResult object. Defaults to None.
        decompose: Set as True to decompose the timeseries with moving average.
            result must not be None. Ignored when col is a list.
            Defaults to False.

    Returns:
        The visualization: the object returned by sns.lineplot for column
        plots, the figure from _seaborn_viz_decomposition when decomposing,
        or the empty figure created here when neither branch applies.
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    plot, axis = plt.subplots(figsize=figure_size)

    if isinstance(col, list):
        # Overlay every requested column on the same axes.
        for name in col:
            plot = sns.lineplot(x=df.index, y=df[name], legend="full", ax=axis)
        axis.legend(labels=col)
    elif decompose:
        plot = _seaborn_viz_decomposition(df, result)
    elif isinstance(col, str):
        plot = sns.lineplot(x=df.index, y=df[col], legend="full", ax=axis)

    # Closes pyplot's current figure; the plot object is still returned.
    plt.close()
    return plot
def _seaborn_viz_cluster_search_plot(
    cluster_range: Tuple[int, int],
    scores: List[Union[int, float]],
    metric: str,
    **kwargs,
):
    """Visualize the cluster search plot for K-means clusters.

    Args:
        cluster_range (Tuple[int, int]): The range of n_clusters (k) searched
            as (min_cluster, max_cluster). The x values are built with
            range(*cluster_range), so the upper bound is exclusive.
        scores (List[Union[int, float]]): The scores from the evaluation metric
            used to determine the "optimal" n_clusters.
        metric (str): The evaluation metric used. Underscores are replaced by
            spaces for the y-axis label.
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        Seaborn plot
    """
    n_clusters = list(range(*cluster_range))
    plt.figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the
    # data vectors positionally and raises a TypeError.
    ax = sns.lineplot(x=n_clusters, y=scores)
    ax.set_title("Optimal Number of Clusters")
    # Cluster counts are integers, so keep the ticks on whole numbers.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel(metric.replace("_", " "))
    return ax
def _seaborn_viz_elbow_plot(min_topics: int, max_topics: int, coherence_values: List[float]):
    """Creates an elbow plot displaying coherence values vs number of topics.

    Args:
        min_topics: Starting number of topics that were optimized for
        max_topics: Maximum number of topics that were optimized for (inclusive)
        coherence_values: A list of coherence values mapped from min_topics to max_topics

    Returns:
        Elbow plot showing coherence values vs number of topics
    """
    # The figure has to exist before plotting; creating it afterwards would
    # leave the plot on whatever figure was current and open an empty one.
    plt.figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    ax = sns.lineplot(
        x=list(range(min_topics, max_topics + 1)),
        y=coherence_values,
    )
    ax.set_title("Coherence Values Across Topic Numbers")
    ax.set_xlabel("Number of Topics")
    ax.set_ylabel("Coherence Values")
    return ax
def _plotly_viz_correlation_matrix(association_matrix):
    """Plot the heatmap for the association matrix.

    Only the lower-left triangle is drawn; the upper triangle, including the
    diagonal, is blanked out.

    Args:
        association_matrix (DataFrame): The association matrix. It is not
            modified.

    Returns:
        The plotly figure, or the result of po.iplot when running in a notebook
    """
    # Work on a private copy: to_numpy() may return a view of the frame's
    # data, and masking a view in place would blank the caller's matrix.
    corr = association_matrix.to_numpy(copy=True)

    # Plot lower left triangle
    corr[np.triu_indices(association_matrix.shape[0])] = None

    # Set up the color scale
    cscale = mpl_to_plotly_cmap(get_p_RdBl_cmap())

    labels = association_matrix.columns.values
    fig = go.Figure(
        data=[
            go.Heatmap(
                # Rows are flipped (and the y labels reversed to match) so the
                # first variable appears at the top of the plot.
                z=np.flip(corr, axis=0),
                x=labels,
                y=labels[::-1],
                connectgaps=False,
                xgap=2,
                ygap=2,
                zmid=0,
                colorscale=cscale,
                colorbar={"title": "Strength"},
            )
        ],
        layout=go.Layout(
            autosize=False,
            width=get_option("display.plotly.fig_width"),
            height=get_option("display.plotly.fig_height"),
            title={
                "text": "Correlation Matrix",
                "font": {"size": get_option("display.plotly.title_size")},
            },
            xaxis=go.layout.XAxis(automargin=True, tickangle=270, ticks="", showgrid=False),
            yaxis=go.layout.YAxis(automargin=True, ticks="", showgrid=False),
            plot_bgcolor="rgb(0,0,0,0)",
            paper_bgcolor="rgb(0,0,0,0)",
        ),
    )

    if _in_notebook():
        po.init_notebook_mode(connected=True)
        return po.iplot(fig, config={"displayModeBar": False})
    return fig
def _plotly_viz_data_heatmap(data, colnames: List[str], missing: bool = False, **kwargs):
    """Plots the data heatmap.

    Args:
        data: The dataframe. Each row is drawn as one horizontal band labelled
            with the matching entry of colnames; each column is one record.
        colnames (List[str]): The column names, used for tick labels
        missing (bool): If True, plots missing values instead
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        The data heatmap as a Plotly figure, or the result of iplot when
        running in a notebook.
    """
    data_fig = go.Heatmap(
        # Rows are flipped (and the labels reversed to match) so the first
        # variable appears at the top of the plot.
        z=np.flip(data.values, axis=0),
        # Columns of z map to x, so the record axis spans data.shape[1].
        x=list(range(data.shape[1])),
        y=list(colnames[::-1]),
        ygap=1,
        zmin=-3 if not missing else 0,
        zmax=3 if not missing else 1,
        colorscale="viridis" if not missing else "greys",
        colorbar={"title": "z-score (bounded)" if not missing else "Missing"},
    )

    figure = go.Figure(
        data=[data_fig],
        layout=go.Layout(
            autosize=False,
            title={
                "text": "Data Heatmap",
                "font": {"size": get_option("display.plotly.title_size")},
            },
            width=get_option("display.plotly.fig_width"),
            height=get_option("display.plotly.fig_height"),
            xaxis=go.layout.XAxis(ticks="", title="Record #", showgrid=False),
            yaxis=go.layout.YAxis(ticks="", title="Variable", automargin=True, showgrid=False),
            plot_bgcolor="rgb(0,0,0,0)",
            paper_bgcolor="rgb(0,0,0,0)",
        ),
    )

    if _in_notebook():
        init_notebook_mode(connected=True)
        return iplot(figure, config={"displayModeBar": False})
    return figure
def _get_viz_backend(backend: str = None) -> _Backend:
    """Resolve a visualization backend into a _Backend of its modules.

    Args:
        backend: The name of the backend, usually the package name. When
            empty or None, the "backends.viz" option is used instead.

    Returns:
        _Backend
    """
    name = backend or get_option("backends.viz")

    # Use the backend if _check_backend reports it as available in
    # _viz_backends, otherwise load it.
    if _check_backend(name, _viz_backends):
        modules = _viz_backends[name]
    else:
        modules = _load_viz_backend(name)

    return _Backend(list(modules.values()))
def _seaborn_viz_data_heatmap(data, colnames: List[str], missing: bool = False, **kwargs):
    """Plots the data heatmap.

    Args:
        data: The dataframe
        colnames: The column names, used for tick labels
        missing: If True, plots missing values instead
        kwargs: Keyword arguments passed to seaborn.heatmap. They override
            the defaults set here.

    Returns:
        The seaborn figure
    """
    plot_options = {
        "cmap": "viridis" if not missing else "Greys",
        "robust": True,
        "center": 0 if not missing else 0.5,
        "xticklabels": False,
        "yticklabels": colnames,
        "cbar_kws": {
            "shrink": 0.5,
            # Label the colorbar for what is actually shown, matching the
            # plotly heatmap ("Missing" for the missing-value view).
            "label": "Missing" if missing else "z-score (bounded)",
        },
        "vmin": -3 if not missing else 0,
        "vmax": 3 if not missing else 1,
    }
    plot_options.update(kwargs)

    fig = Figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    ax = fig.add_subplot(111)
    ax = sns.heatmap(data, ax=ax, **plot_options)
    ax.set_title("Data Heatmap")
    ax.set_xlabel("Record #")
    ax.set_ylabel("Variable")
    # Keep the variable names horizontal.
    ax.set_yticklabels(colnames, rotation=0)
    return fig
def _plotly_viz_decomposition(result, dates, title="Time Series Decomposition"):
    """Draw the four panels of a time series decomposition with Plotly.

    Args:
        result: The statsmodels.tsa.seasonal.DecomposeResult object. Its
            observed, trend, seasonal and resid attributes are plotted in
            rows 1 to 4.
        dates: The datetime index, used as the x values of every trace
        title: Title of the plot. Defaults to "Time Series Decomposition".

    Returns:
        The visualization
    """
    fig = make_subplots(rows=4, cols=1, x_title="Time", shared_xaxes=True)

    # (attribute on result, legend name), in row order.
    panels = (
        ("observed", "observed"),
        ("trend", "trend"),
        ("seasonal", "seasonal"),
        ("resid", "residual"),
    )
    for row, (attribute, label) in enumerate(panels, start=1):
        fig.add_trace(
            go.Scatter(x=dates, y=getattr(result, attribute), name=label),
            row=row,
            col=1,
        )

    fig.update_layout(
        height=get_option("display.plotly.fig_height"),
        width=get_option("display.plotly.fig_width"),
        title_text=title,
        legend_title_text="Decomposition",
    )
    return fig
def _seaborn_viz_correlation_matrix(association_matrix, annot=False, xticks_rotation=90, yticks_rotation=0):
    """Plot the heatmap for the association matrix.

    Only the lower-left triangle is drawn; the upper triangle, including the
    diagonal, is masked.

    Args:
        association_matrix (DataFrame): The association matrix
        annot (bool): If True, add correlation values to plot. Defaults to False.
        xticks_rotation (int): Degrees of rotation for the xticks. Defaults to 90.
        yticks_rotation (int): Degrees of rotation for the yticks. Defaults to 0.

    Raises:
        ValueError: The association matrix contains no non-NaN values.

    Returns:
        The Seaborn figure
    """
    # The builtin bool is used because the np.bool alias was removed in
    # NumPy 1.24.
    mask = np.triu(np.ones_like(association_matrix, dtype=bool))

    # Color limits come from the non-NaN entries only.
    corr = association_matrix.to_numpy()
    valid = corr[~np.isnan(corr)]
    vmin = valid.min()
    vmax = valid.max()

    plt.figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))
    ax = sns.heatmap(
        association_matrix,
        vmin=vmin,
        vmax=vmax,
        cmap=get_p_RdBl_cmap(),
        annot=annot,
        mask=mask,
        center=0,
        linewidths=2,
        square=True,
    )
    plt.title("Correlation Matrix")
    plt.xticks(rotation=xticks_rotation)
    plt.yticks(rotation=yticks_rotation)
    return ax
def _seaborn_viz_importance(importance_values, idx, cols):
    """Draw a horizontal bar chart of feature importances.

    Args:
        importance_values: The importances
        idx: The sorted indices, used to index both cols and importance_values
        cols: The columns

    Returns:
        The value returned by set_title on the bar plot's axes.
        NOTE(review): this is the title artist rather than a figure or
        axes — confirm what callers expect before changing it.
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    plt.figure(figsize=figure_size)
    plt.xlabel("Permutation Importance Value")
    plt.ylabel("Features")

    bars = sns.barplot(y=cols[idx], x=importance_values[idx], palette="Blues_d")
    title = bars.set_title("Feature Importance")
    return title
def _seaborn_viz_cluster(
    data,
    method: str,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    **kwargs,
):
    """Scatter the clustered points with Seaborn, colored by cluster.

    Args:
        data (DataFrame): The data; the "x", "y" and "clusters" columns are plotted
        method (str): The clustering method, to be used as the plot title
        xlabel (str, optional): The x-axis label. Defaults to "Reduced Dimension 1".
        ylabel (str, optional): The y-axis label. Defaults to "Reduced Dimension 2".
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        Seaborn plot
    """
    if not xlabel:
        xlabel = "Reduced Dimension 1"
    if not ylabel:
        ylabel = "Reduced Dimension 2"

    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    plt.figure(figsize=figure_size)

    # One palette color per distinct cluster label.
    n_labels = len(np.unique(data["clusters"]))
    palette = sns.color_palette(n_colors=n_labels)
    axes = sns.scatterplot(
        data=data,
        x="x",
        y="y",
        hue="clusters",
        palette=palette,
        legend="full",
        alpha=0.7,
    )
    # NOTE(review): the context is switched after the scatter plot has been
    # drawn — confirm whether it was meant to apply to this plot.
    sns.set_context("talk")
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    # Anchor the legend to the right of the axes.
    plt.legend(bbox_to_anchor=(1.25, 1), loc="upper right", ncol=1)
    plt.title(method + " Cluster")
    return axes
def _seaborn_viz_plot_autocorrelation(timeseries, plot_type="acf", n_lags=40, fft=False, **kwargs):
    """Create timeseries autocorrelation visualization.

    Args:
        timeseries: Series object containing datetime index
        plot_type: Choose between 'acf' or 'pacf'. Defaults to "acf".
        n_lags: Number of lags to return autocorrelation for. Defaults to 40.
        fft (bool): If True, computes ACF via FFT. Only used when plot_type
            is "acf".
        **kwargs: Keyword arguments for plot_acf or plot_pacf.

    Raises:
        ValueError: Invalid `plot_type`.

    Returns:
        The visualization
    """
    # Validate before creating the figure so a bad plot_type does not leave an
    # unused pyplot figure open.
    if plot_type not in ("acf", "pacf"):
        raise ValueError(
            f"Unsupported plot_type {plot_type!r}; expected 'acf' or 'pacf'"
        )

    fig, ax = plt.subplots(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))

    tsa_plots = _compat["statsmodels.api"].graphics.tsa  # type: ignore
    if plot_type == "acf":
        plot = tsa_plots.plot_acf(timeseries, ax=ax, lags=n_lags, fft=fft, **kwargs)
    else:
        plot = tsa_plots.plot_pacf(timeseries, ax=ax, lags=n_lags, **kwargs)

    ax.set_xlabel("Lags")
    ax.set_ylabel(plot_type)
    # Close the figure created above; the plot object is still returned.
    plt.close(fig)
    return plot
def _seaborn_viz_categorical(data, x: str, contrast: Optional[str] = None, **kwargs):
    """Draw a bar count plot for one categorical feature.

    Args:
        data (DataFrame): The data
        x (str): The name of the column to plot; also used as the plot title
        contrast (str, optional): The name of the categorical column to use for multiple contrasts.
        **kwargs: Keyword args for seaborn.countplot, forwarded through
            _seaborn_viz_bar.

    Returns:
        Matplotlib figure
    """
    figure_size = (
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    )
    figure = Figure(figsize=figure_size)
    axes = figure.add_subplot()
    _seaborn_viz_bar(data, x, contrast, ax=axes, **kwargs)
    axes.set_title(x)
    return figure
def _get_compute_backend(backend: str = None, df=None) -> _Backend:
    """Get the compute backend by name.

    In addition to searching through entrypoints, the input data (DataFrame)
    type will be used to infer an appropriate compute backend.

    Args:
        backend: The name of the backend, usually the package name. When
            given, it is the only backend used.
        df: The input dataframe which may be used to infer the backend. Its
            type is looked up in _DATAFRAME_BACKENDS, and the
            "backends.compute" option is appended as a fallback.

    Returns:
        _Backend
    """
    if backend:
        backend_types = [backend]
    else:
        data_type = str(type(df))
        candidates = [
            *_DATAFRAME_BACKENDS.get(data_type, []),
            get_option("backends.compute"),
        ]
        # Remove duplicates, maintain order. dict.fromkeys keeps the first
        # occurrence of each name; popping from the list while enumerating it
        # skips elements and can leave duplicates behind.
        backend_types = list(dict.fromkeys(candidates))

    backend_collection = []
    for backend_name in backend_types:
        # Use the backend if _check_backend reports it as available in
        # _compute_backends, otherwise load it.
        if _check_backend(backend_name, _compute_backends):
            modules = _compute_backends[backend_name]
        else:
            modules = _load_compute_backend(backend_name)
        backend_collection.append(modules)

    # Flatten to a single list of modules, earlier backends first.
    backend_list = [
        module for modules in backend_collection for module in modules.values()
    ]
    return _Backend(backend_list)
def _seaborn_viz_numeric(
    data,
    x: str,
    contrast: Optional[str] = None,
    mode: str = "combo",
    hist_kwargs: Optional[dict] = None,
    violin_kwargs: Optional[dict] = None,
    **kwargs,
):
    """Plots a histogram/violin plot.

    Args:
        data (DataFrame): The data
        x (str): The name of the column to plot; also used as the plot title
        contrast (str, optional): The name of the categorical column to use for multiple contrasts.
        mode (str): {'combo', 'violin', 'hist'} The type of plot to display.
            Defaults to a combined histogram/violin plot.
        hist_kwargs (dict, optional): Keyword args for seaborn.histplot.
        violin_kwargs (dict, optional): Keyword args for seaborn.violinplot.
        **kwargs: Keyword args to be passed to all underlying plotting
            functions. A key also present in hist_kwargs or violin_kwargs is
            overridden by the plot-specific value.

    Raises:
        ValueError: Unknown plot mode.

    Returns:
        Matplotlib figure
    """
    # Validate first so an unknown mode fails before any figure is built.
    if mode not in ("combo", "hist", "violin"):
        raise ValueError("Unknown value for 'mode' plot type")

    # Shared kwargs go first so plot-specific options win on overlap instead
    # of raising a duplicate-keyword TypeError.
    hist_options = {**kwargs, **(hist_kwargs or {})}
    violin_options = {**kwargs, **(violin_kwargs or {})}

    fig = Figure(figsize=(
        get_option("display.matplotlib.fig_width"),
        get_option("display.matplotlib.fig_height"),
    ))

    if mode == "combo":
        # Histogram in the top fifth of the figure, violin plot in the rest.
        gs = GridSpec(nrows=5, ncols=1)
        ax1 = fig.add_subplot(gs.new_subplotspec((0, 0), 1, 1))
        ax2 = fig.add_subplot(gs.new_subplotspec((1, 0), 4, 1))

        ax1.spines["right"].set_visible(False)
        ax1.spines["top"].set_visible(False)

        _seaborn_viz_histogram(data, x, contrast, ax=ax1, **hist_options)
        _seaborn_viz_violin(data, x, contrast, ax=ax2, **violin_options)
        ax1.set_title(x)
    elif mode == "hist":
        ax = fig.add_subplot()
        _seaborn_viz_histogram(data, x, contrast=contrast, ax=ax, **hist_options)
        ax.set_title(x)
    else:  # mode == "violin"
        ax = fig.add_subplot()
        _seaborn_viz_violin(data, x, contrast=contrast, ax=ax, **violin_options)
        ax.set_title(x)
    return fig
def _plotly_viz_cluster(
    data,
    method: str,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    **kwargs,
):
    """Scatter the clustered points with Plotly, one trace per cluster.

    Labels whose integer value is negative are drawn in grey and named "Noise".

    Args:
        data (DataFrame): The data; the "x", "y" and "clusters" columns are plotted
        method (str): The clustering method, to be used as the plot title
        xlabel (str, optional): The x-axis label. Defaults to "Reduced Dimension 1".
        ylabel (str, optional): The y-axis label. Defaults to "Reduced Dimension 2".
        **kwargs: Keyword arguments. Currently unused.

    Returns:
        Plotly plot, or the result of po.iplot when running in a notebook
    """
    if not xlabel:
        xlabel = "Reduced Dimension 1"
    if not ylabel:
        ylabel = "Reduced Dimension 2"

    traces = []
    for label in data["clusters"].unique():
        members = data["clusters"] == label
        if int(label) < 0:
            trace_name = "Noise"
            marker = dict(size=10, color="grey", line=None, opacity=0.7)
        else:
            trace_name = f"Cluster #{label}"
            marker = dict(size=10, colorscale="earth", line=None, opacity=0.7)
        traces.append(
            go.Scatter(
                x=data.loc[members, "x"],
                y=data.loc[members, "y"],
                name=trace_name,
                mode="markers",
                marker=marker,
            )
        )

    # NOTE(review): each axis gets both a `title` taken from data.columns and
    # a `*_title` entry from xlabel/ylabel; which one plotly keeps is not
    # visible here — confirm.
    layout = dict(
        yaxis=dict(zeroline=False, title=data.columns[0]),
        xaxis=dict(zeroline=False, title=data.columns[1]),
        yaxis_title=ylabel,
        xaxis_title=xlabel,
        autosize=False,
        width=int(get_option("display.plotly.fig_width")),
        height=int(get_option("display.plotly.fig_height")),
        title={
            "text": "{} Cluster".format(method),
            "font": {"size": get_option("display.plotly.title_size")},
        },
    )
    figure = go.Figure(dict(data=traces, layout=layout))

    if _in_notebook():
        po.init_notebook_mode(connected=True)
        return po.iplot(figure)
    return figure
import hashlib import logging from functools import reduce from typing import Optional, Union import warnings from data_describe.misc.logging import OutputLogger from data_describe.backends import _get_compute_backend from data_describe.compat import _is_dataframe, _compat, _requires from data_describe.config._config import get_option from data_describe._widget import BaseWidget _DEFAULT_SCORE_THRESHOLD = get_option("sensitive_data.score_threshold") _SAMPLE_SIZE = get_option("sensitive_data.sample_size") logger = logging.getLogger("presidio") logger.setLevel(logging.WARNING) @_requires("presidio_analyzer") def sensitive_data( df, mode: str = "redact", detect_infotypes: bool = True, columns: Optional[list] = None, score_threshold: float = _DEFAULT_SCORE_THRESHOLD, sample_size: int = _SAMPLE_SIZE, engine_backend=None, compute_backend: Optional[str] = None, ): """Identifies, redacts, and/or encrypts PII data.