Example #1
def test_kmeans_default(kmeans_default):
    cl = kmeans_default
    assert isinstance(
        cl.show(),
        matplotlib.axes.Axes), "Default show() didn't return a mpl Axes object"
    assert isinstance(cl, ClusterWidget)
    assert isinstance(cl.estimator,
                      KMeans), "Saved cluster estimator was not KMeans"
    assert hasattr(cl, "input_data"), "Widget does not have input data"
    assert _is_dataframe(cl.input_data), "Input data is not a data frame"
    assert hasattr(cl, "scaled_data"), "Widget does not have standardized data"
    assert _is_dataframe(cl.scaled_data), "Scaled data is not a data frame"
    assert hasattr(
        cl, "viz_data"), "Widget does not have visualization (reduced) data"
    assert _is_dataframe(cl.viz_data), "Viz data is not a data frame"
    assert cl.clusters is not None, "Widget is missing cluster labels"
    assert cl.n_clusters == 19, "Expected number of clusters found to be 19"
    assert isinstance(cl.cluster_range,
                      tuple), "Widget is missing cluster range tuple"
    assert len(
        cl.cluster_range) == 2, "Cluster range tuple had the wrong shape"
    assert isinstance(cl.cluster_range[0], int) and isinstance(
        cl.cluster_range[1],
        int), "Cluster range tuple does not contain integers"
    assert (
        cl.cluster_range[0] < cl.cluster_range[1]
    ), "Cluster range had an invalid default; the maximum is <= the minimum"
    assert (cl.metric == "silhouette_score"
            ), "Default search metric was not silhouette_score"
    assert cl.scores is not None, "Widget is missing cluster search scores"
    assert cl.search, "Widget is missing boolean attribute 'search' for cluster search"
    assert isinstance(cl.show(), mpl_plot)
    assert hasattr(cl, "cluster_search_plot")
    assert isinstance(cl.cluster_search_plot(), mpl_plot)
    assert hasattr(cl, "reductor")
Example #2
def test_series_attributes(load_series_summary, compute_backend_df):
    assert isinstance(load_series_summary, SummaryWidget)
    assert str(load_series_summary) == "data-describe Summary Widget"
    assert _is_dataframe(load_series_summary.input_data)
    assert _is_dataframe(load_series_summary.info_data)
    assert _is_dataframe(load_series_summary.summary_data)
    assert hasattr(load_series_summary, "as_percentage")
    assert hasattr(load_series_summary, "auto_float")
Example #3
def test_figure_categorical_cluster(data):
    cr = dd.correlation_matrix(data, cluster=True, categorical=True)
    assert isinstance(cr.show(viz_backend="plotly"), plotly.graph_objs.Figure)
    assert isinstance(cr.show(), mpl_plot)
    assert _is_dataframe(cr.association_matrix)
    assert _is_dataframe(cr.viz_data)
    assert isinstance(cr, CorrelationWidget)
    assert_frame_equal(cr.viz_data, cr.cluster_matrix)
Example #4
def test_pandas_compute_stationarity_test(compute_time_data):
    test_df = _pandas_compute_stationarity_test(compute_time_data["var"],
                                                test="dickey-fuller")
    assert _is_dataframe(test_df)
    assert test_df.shape == (7, 1)
    test_df = _pandas_compute_stationarity_test(compute_time_data["var"],
                                                test="kpss")
    assert _is_dataframe(test_df)
    assert test_df.shape == (7, 1)
Example #5
    def show(self, **kwargs):
        """Show the transformed data or infotypes."""
        if _is_dataframe(self.encrypt):
            viz_data = self.encrypt
        elif _is_dataframe(self.redact):
            viz_data = self.redact
        elif self.infotypes:
            viz_data = self.infotypes
        else:
            raise ValueError("No encrypted, redacted, or infotype data to show")
        return viz_data
Example #6
def test_cluster_no_categorical_figure(data):
    cr = dd.correlation_matrix(data, cluster=True)
    assert isinstance(cr.show(viz_backend="plotly"), plotly.graph_objs.Figure)
    assert isinstance(cr.show(), mpl_plot)
    assert _is_dataframe(cr.association_matrix)
    assert _is_dataframe(cr.viz_data)
    assert isinstance(cr, CorrelationWidget)
    assert_frame_equal(cr.viz_data, cr.cluster_matrix)
    assert data.select_dtypes(["number"]).shape[1] == cr.association_matrix.shape[1]
    assert data.select_dtypes(["number"]).shape[1] == cr.association_matrix.shape[0]
    assert data.select_dtypes(["number"]).shape[1] == cr.viz_data.shape[1]
    assert data.select_dtypes(["number"]).shape[1] == cr.viz_data.shape[0]
Example #7
def test_categorical_and_numerical_data(data):
    cr = dd.correlation_matrix(data, categorical=True)
    assert isinstance(cr.show(viz_backend="plotly"), plotly.graph_objs.Figure)
    assert isinstance(cr.show(), mpl_plot)
    assert _is_dataframe(cr.association_matrix)
    assert _is_dataframe(cr.viz_data)
    assert isinstance(cr, CorrelationWidget)
    assert_frame_equal(cr.viz_data, cr.association_matrix)
    assert data.shape[1] == cr.association_matrix.shape[1]
    assert data.shape[1] == cr.association_matrix.shape[0]
    assert data.shape[1] == cr.viz_data.shape[1]
    assert data.shape[1] == cr.viz_data.shape[0]
    assert isinstance(cr.cluster_matrix, type(None))
Example #8
def stationarity_test(df,
                      col,
                      test="dickey-fuller",
                      regression="c",
                      compute_backend=None,
                      **kwargs):
    """Perform stationarity tests to see if mean and variance are changing over time.

    The backend uses statsmodels.tsa.stattools.adfuller or statsmodels.tsa.stattools.kpss.

    Args:
        df: The dataframe. Must contain a datetime index
        col: The feature of interest
        test: Choice of stationarity test. "kpss" or "dickey-fuller". Defaults to "dickey-fuller".
        regression: Constant and trend order to include in regression. Choose between 'c','ct','ctt', and 'nc'. Defaults to 'c'
        compute_backend: Select computing backend. Defaults to None (pandas).
        **kwargs: Keyword arguments

    Raises:
        ValueError: Invalid input data type.
        ValueError: `col` not found in dataframe.

    Returns:
        Pandas dataframe containing the statistics
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if col not in df.columns:
        raise ValueError(f"{col} not found in dataframe")

    data = _get_compute_backend(compute_backend, df).compute_stationarity_test(
        df[col], test, regression, **kwargs)
    return data
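A minimal usage sketch for stationarity_test, assuming it is in scope and that the statsmodels backend is installed; the data frame and column below are made up for illustration:

import numpy as np
import pandas as pd

# Hypothetical daily series with a mild trend (illustrative data only).
idx = pd.date_range("2020-01-01", periods=200, freq="D")
df = pd.DataFrame({"sales": np.arange(200) + np.random.normal(0, 5, 200)}, index=idx)

# Augmented Dickey-Fuller test on the "sales" column; returns a small
# dataframe of test statistics (statistic, p-value, lags, ...).
result = stationarity_test(df, col="sales", test="dickey-fuller", regression="c")
print(result)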
Example #9
def test_ipca(compute_numeric_backend_df):
    x = dim_reduc(data=compute_numeric_backend_df,
                  n_components=2,
                  dim_method="ipca")
    assert isinstance(x, tuple)
    assert _is_dataframe(x[0])
    assert isinstance(x[1], sklearn.decomposition.IncrementalPCA)
Example #10
def test_tsvd(compute_numeric_backend_df):
    x = dim_reduc(data=compute_numeric_backend_df,
                  n_components=2,
                  dim_method="tsvd")
    assert isinstance(x, tuple)
    assert _is_dataframe(x[0])
    assert isinstance(x[1], sklearn.decomposition.TruncatedSVD)
Example #11
def distribution(data,
                 diagnostic=True,
                 compute_backend=None,
                 viz_backend=None,
                 **kwargs) -> DistributionWidget:
    """Distribution Plots.

    Visualizes univariate distributions. This feature can be used for generating
    various types of plots for univariate distributions, including: histograms, violin
    plots, bar (count) plots.

    Args:
        data: Data Frame
        diagnostic: If True, will run diagnostics to select "interesting" plots.
        compute_backend: The compute backend.
        viz_backend: The visualization backend.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        DistributionWidget
    """
    if not _is_dataframe(data):
        raise ValueError("DataFrame required.")

    widget = _get_compute_backend(compute_backend, data).compute_distribution(
        data, diagnostic=diagnostic, **kwargs)
    return widget
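A short usage sketch for distribution, assuming a plain pandas DataFrame and that the returned DistributionWidget exposes a show() method like the other widgets in these examples:

import numpy as np
import pandas as pd

# Illustrative frame with one numeric and one categorical column.
df = pd.DataFrame({
    "age": np.random.randint(18, 90, size=500),
    "group": np.random.choice(["a", "b", "c"], size=500),
})

widget = distribution(df, diagnostic=True)  # DistributionWidget
widget.show()  # render with the default visualization backend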
Example #12
def test_categorical_data_only(data):
    cat_data = data[
        [c for c in data.columns if c not in data.select_dtypes(["number"]).columns]
    ]
    cr = dd.correlation_matrix(cat_data, categorical=True)
    assert isinstance(cr.show(viz_backend="plotly"), plotly.graph_objs.Figure)
    assert isinstance(cr.show(), mpl_plot)
    assert _is_dataframe(cr.association_matrix)
    assert _is_dataframe(cr.viz_data)
    assert isinstance(cr, CorrelationWidget)
    assert_frame_equal(cr.viz_data, cr.association_matrix)
    assert cat_data.shape[1] == cr.association_matrix.shape[1]
    assert cat_data.shape[1] == cr.association_matrix.shape[0]
    assert cat_data.shape[1] == cr.viz_data.shape[1]
    assert cat_data.shape[1] == cr.viz_data.shape[0]
    assert isinstance(cr.cluster_matrix, type(None))
Example #13
def test_only_encrypt_data(compute_backend_pii_df):
    sensitivewidget = sensitive_data(compute_backend_pii_df,
                                     mode="encrypt",
                                     detect_infotypes=False)
    assert isinstance(sensitivewidget, SensitiveDataWidget)
    assert _is_dataframe(sensitivewidget.encrypt)
    assert isinstance(sensitivewidget.encrypt["name"][1], str)
    assert isinstance(sensitivewidget.encrypt["domain"][1], str)
    assert isinstance(sensitivewidget.redact, type(None))
    assert isinstance(sensitivewidget.infotypes, type(None))
Example #14
def test_sensitive_data_cols(compute_backend_pii_df):
    sensitivewidget = sensitive_data(compute_backend_pii_df,
                                     mode="redact",
                                     columns=["name"],
                                     detect_infotypes=False)
    assert isinstance(sensitivewidget, SensitiveDataWidget)
    assert _is_dataframe(sensitivewidget.redact)
    assert sensitivewidget.redact.shape == (1, 1)
    assert isinstance(sensitivewidget.infotypes, type(None))
    assert isinstance(sensitivewidget.encrypt, type(None))
Example #15
def plot_autocorrelation(
    df,
    col,
    plot_type="acf",
    n_lags=40,
    fft=False,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Correlation estimate using partial autocorrelation or autocorrelation.

    Statistics are computed using the statsmodels API.

    Args:
        df: The dataframe with datetime index
        col: The feature of interest
        plot_type: Choose between 'acf' or 'pacf'. Defaults to "acf".
        n_lags: Number of lags to return autocorrelation for. Defaults to 40.
        fft: If True, computes ACF via fast Fourier transform (FFT). Defaults to False.
        compute_backend: Select computing backend. Defaults to None (pandas).
        viz_backend: Select visualization backend. Defaults to None (seaborn).
        **kwargs: Keyword arguments

    Raises:
        ValueError: Invalid input data type.
        ValueError: `col` not found in dataframe.

    Returns:
        The visualization
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if isinstance(col, str):
        if col not in df.columns:
            raise ValueError(f"{col} not found in dataframe")
    if viz_backend == "plotly":
        data, white_noise = _get_compute_backend(
            compute_backend, df).compute_autocorrelation(df[col],
                                                         plot_type=plot_type,
                                                         n_lags=n_lags,
                                                         fft=fft,
                                                         **kwargs)
        fig = _get_viz_backend(viz_backend).viz_plot_autocorrelation(
            data,
            plot_type=plot_type,
            white_noise=white_noise,
            n_lags=n_lags,
            **kwargs)
    else:
        fig = _get_viz_backend(viz_backend).viz_plot_autocorrelation(
            df[col], plot_type=plot_type, n_lags=n_lags, fft=fft, **kwargs)
    return fig
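A usage sketch for plot_autocorrelation, assuming a datetime-indexed pandas DataFrame; the plotly branch first computes the autocorrelation values on the compute backend and then renders them:

import numpy as np
import pandas as pd

# Illustrative random-walk series indexed by date.
idx = pd.date_range("2021-01-01", periods=365, freq="D")
df = pd.DataFrame({"temperature": np.cumsum(np.random.normal(size=365))}, index=idx)

# Default backend (matplotlib/seaborn), ACF over 40 lags.
fig = plot_autocorrelation(df, col="temperature", plot_type="acf", n_lags=40)

# Interactive plotly backend.
fig_plotly = plot_autocorrelation(df, col="temperature", viz_backend="plotly")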
Example #16
def test_encrypt_data_and_infotypes(compute_backend_pii_df):
    sensitivewidget = sensitive_data(compute_backend_pii_df,
                                     mode="encrypt",
                                     detect_infotypes=True,
                                     sample_size=1)
    assert isinstance(sensitivewidget.encrypt["name"][1], str)
    assert isinstance(sensitivewidget.encrypt["domain"][1], str)
    assert isinstance(sensitivewidget.infotypes, dict)
    assert sensitivewidget.infotypes["domain"][0] == "DOMAIN_NAME"
    assert sensitivewidget.infotypes["name"][0] == "PERSON"
    assert len(sensitivewidget.infotypes) == 2
    assert isinstance(sensitivewidget.redact, type(None))
    assert _is_dataframe(sensitivewidget.show())
Example #17
def test_only_redact_data(compute_backend_pii_df):
    sensitivewidget = sensitive_data(compute_backend_pii_df,
                                     mode="redact",
                                     detect_infotypes=False)
    assert isinstance(sensitivewidget, SensitiveDataWidget)
    assert sensitivewidget.redact.shape == (1, 2)
    assert _is_dataframe(sensitivewidget.redact)
    assert sensitivewidget.redact["name"][1] == "<PERSON>"
    assert sensitivewidget.redact["domain"][1] == "<DOMAIN_NAME>"
    assert isinstance(sensitivewidget.redact["name"][1], str)
    assert isinstance(sensitivewidget.redact["domain"][1], str)
    assert isinstance(sensitivewidget.encrypt, type(None))
    assert isinstance(sensitivewidget.infotypes, type(None))
Example #18
def test_tsne(compute_numeric_backend_df):
    x = dim_reduc(data=compute_numeric_backend_df,
                  n_components=2,
                  dim_method="tsne")
    y = dim_reduc(
        data=compute_numeric_backend_df,
        n_components=2,
        apply_tsvd=False,
        dim_method="tsne",
    )
    assert isinstance(x, tuple)
    assert _is_dataframe(x[0])
    assert isinstance(x[1], sklearn.manifold.TSNE)
    assert not x[0].equals(y[0])
Example #19
def _modin_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe

    Raises:
        ValueError: Invalid input data type.

    Returns:
        The Modin dataframe with metrics in rows
    """
    if _is_series(data):
        data = _compat["modin.pandas"].DataFrame(data)

    if not _is_dataframe(data):
        raise ValueError("Data must be a Modin DataFrame")

    # Save column order
    columns = data.columns
    dtypes = data.agg([lambda x: x.dtype])
    moments = data.agg(["mean", "std", "median"])
    minmax = data.select_dtypes("number").agg(["min",
                                               "max"]).reindex(columns=columns)
    zeros = data.select_dtypes("number").agg([_count_zeros
                                              ]).reindex(columns=columns)
    null_summary = data.agg([_count_nulls])
    freq_summary = data.agg([_most_frequent])

    summary = (dtypes.append(moments, ignore_index=True).append(
        minmax, ignore_index=True).append(zeros, ignore_index=True).append(
            null_summary, ignore_index=True).append(freq_summary,
                                                    ignore_index=True))
    summary = summary[columns]
    summary.index = [
        "Data Type",
        "Mean",
        "Standard Deviation",
        "Median",
        "Min",
        "Max",
        "# Zeros",
        "# Nulls",
        "% Most Frequent Value",
    ]

    # Removing NaNs
    summary.fillna("", inplace=True)

    return SummaryWidget(data, summary)
Example #20
def correlation_matrix(
    data,
    cluster=False,
    categorical=False,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
) -> CorrelationWidget:
    """Computes correlations (associations) and visualizes as a heatmap.

    This feature combines measures of association for pairs of variables:
        * Numeric-numeric pairs: Pearson correlation
        * Categorical-numeric pairs: Correlation ratio
        * Categorical-categorical pairs
            * More than 2 levels: Cramer's V
            * Only 2 levels for both variables: Point-biserial coefficient

    Args:
        data (DataFrame): A data frame
        cluster (bool): If True, use clustering to reorder similar columns together
        categorical (bool): If True, include categorical associations using Cramer's
            V, Correlation Ratio, and Point-biserial coefficient (a.k.a. Matthews
            correlation coefficient). All associations (including Pearson correlation)
            are scaled to be in the range [0, 1].
        compute_backend: The compute backend.
        viz_backend: The visualization backend.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Invalid data input type.

    Returns:
        CorrelationWidget
    """
    if not _is_dataframe(data):
        raise ValueError("Data frame required")

    corrwidget = _get_compute_backend(compute_backend,
                                      data).compute_correlation_matrix(
                                          data,
                                          cluster=cluster,
                                          categorical=categorical,
                                          **kwargs)

    corrwidget.viz_backend = viz_backend

    return corrwidget
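A usage sketch for correlation_matrix in the style of the tests above; the package import path is an assumption, while the dd alias matches the tests:

import numpy as np
import pandas as pd
import data_describe as dd  # assumed import; the tests above use the dd alias

df = pd.DataFrame({
    "x": np.random.normal(size=100),
    "y": np.random.normal(size=100),
    "label": np.random.choice(["red", "blue"], size=100),
})

# Numeric-only Pearson correlations, with similar columns clustered together.
cr = dd.correlation_matrix(df, cluster=True)

# Include categorical associations and render with the plotly backend.
cr_all = dd.correlation_matrix(df, categorical=True)
cr_all.show(viz_backend="plotly")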
Example #21
def scatter_plots(
    data,
    mode="matrix",
    sample=None,
    threshold=None,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Scatter plots of numeric data.

    Args:
        data: A Pandas data frame
        mode (str): {``diagnostic``, ``matrix``, ``all``} The visualization mode.

            * ``diagnostic``: Plots selected by scagnostics (scatter plot diagnostics)
            * ``matrix``: Generate the full scatter plot matrix
            * ``all``: Generate all individual scatter plots
        sample: The sampling method to use. Currently not used.
        threshold: The scatter plot diagnostic threshold value [0,1] for returning a
            plot. Only used with "diagnostic" mode. For example, ``{"Outlying": 0.9}``
            returns plots with outlier metrics above 0.9. See
            ``pyscagnostics.measure_names`` for a list of metrics.

            * If a number: Returns all plots where at least one metric is above this threshold
            * If a dictionary: Returns plots where the metric is above its threshold.
        compute_backend: The compute backend
        viz_backend: The vizualization backend
        **kwargs: Passed to the visualization framework

    Raises:
        ValueError: Invalid input data type.

    Returns:
        Scatter plot.
    """
    if not _is_dataframe(data):
        raise ValueError("Unsupported input data type")

    swidget = _get_compute_backend(compute_backend, data).compute_scatter_plot(
        data, mode, sample, threshold, **kwargs)

    swidget.compute_backend = compute_backend
    swidget.viz_backend = viz_backend
    return swidget
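A usage sketch for scatter_plots, assuming a numeric pandas DataFrame; diagnostic mode additionally requires pyscagnostics to be installed:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(200, 4)), columns=list("abcd"))

# Full scatter plot matrix.
widget = scatter_plots(df, mode="matrix")

# Only pairs whose "Outlying" scagnostic exceeds 0.9.
outlying = scatter_plots(df, mode="diagnostic", threshold={"Outlying": 0.9})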
Example #22
def plot_time_series(
    df,
    col,
    decompose=False,
    model="additive",
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Plots time series given a dataframe with datetime index. Statistics are computed using the statsmodels API.

    Args:
        df: The dataframe with datetime index
        col (str or [str]): Column of interest. Column datatype must be numerical
        decompose: Set as True to decompose the timeseries with moving average. Defaults to False.
        model: Specify seasonal component when decompose is True. Defaults to "additive".
        compute_backend: Select computing backend. Defaults to None (pandas).
        viz_backend: Select visualization backend. Defaults to None (seaborn).
        **kwargs: Keyword arguments

    Raises:
        ValueError: Invalid input data type.
        ValueError: `col` not a list or string.

    Returns:
        The visualization
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if not isinstance(col, (list, str)):
        raise ValueError(f"{col} must be list type or string type")
    if decompose:
        result = _get_compute_backend(
            compute_backend, df).compute_decompose_timeseries(df,
                                                              col=col,
                                                              model=model,
                                                              **kwargs)
        fig = _get_viz_backend(viz_backend).viz_plot_time_series(
            df, col=col, result=result, decompose=decompose, **kwargs)
    else:
        fig = _get_viz_backend(viz_backend).viz_plot_time_series(
            df, col, **kwargs)
    return fig
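A usage sketch for plot_time_series, assuming a datetime-indexed pandas DataFrame and that statsmodels is available for the decomposition path:

import numpy as np
import pandas as pd

idx = pd.date_range("2019-01-01", periods=730, freq="D")
df = pd.DataFrame(
    {"demand": 10 + np.sin(np.arange(730) / 30) + np.random.normal(0, 0.2, 730)},
    index=idx)

# Simple line plot of one column.
fig = plot_time_series(df, col="demand")

# Trend/seasonal/residual decomposition with an additive model.
fig_decomposed = plot_time_series(df, col="demand", decompose=True, model="additive")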
Example #23
def _pandas_compute_data_heatmap(data,
                                 missing: bool = False,
                                 **kwargs) -> HeatmapWidget:
    """Pre-processes data for the data heatmap.

    Values are standardized (removing the mean and scaling to unit variance).
    If `missing` is set to True, the dataframe flags missing records using 1/0.

    Args:
        data: The dataframe
        missing (bool): If True, uses missing values instead
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        HeatmapWidget
    """
    if not _is_dataframe(data):
        raise ValueError("Unsupported input data type")

    if missing:
        missing_data = data.isna().astype(int)
        colnames = data.columns.values
        return HeatmapWidget(
            input_data=data,
            colnames=colnames,
            missing=True,
            missing_data=missing_data.transpose(),
        )
    else:
        data = data.select_dtypes(["number"])
        colnames = data.columns.values
        scaler = StandardScaler()
        std_data = pd.DataFrame(scaler.fit_transform(data),
                                columns=data.columns)
        return HeatmapWidget(input_data=data,
                             colnames=colnames,
                             std_data=std_data.transpose())
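A sketch of calling this backend helper directly, assuming it is in scope together with pandas and scikit-learn; note the standardized branch only keeps numeric columns:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": np.random.normal(size=50),
    "b": np.random.normal(size=50),
})

# Standardized numeric values, transposed so rows are features.
widget = _pandas_compute_data_heatmap(df)

# Missing-value view: cells are flagged 1 where null, 0 otherwise.
df.loc[::7, "b"] = np.nan
missing_widget = _pandas_compute_data_heatmap(df, missing=True)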
Example #24
def dim_reduc(
    data,
    n_components: int,
    dim_method: str,
    apply_tsvd: bool = True,
    compute_backend=None,
):
    """Reduces the number of dimensions of the input data.

    Args:
        data: The dataframe
        n_components: Desired dimensionality for the data set prior to modeling
        dim_method: {'pca', 'ipca', 'tsne', 'tsvd'}
        - pca: Principal Component Analysis
        - ipca: Incremental Principal Component Analysis. Highly suggested for very large datasets
        - tsne: T-distributed Stochastic Neighbor Embedding
        - tsvd: Truncated Singular Value Decomposition
        apply_tsvd: If True, TSVD will be run before t-SNE. This is highly recommended when running t-SNE.
        compute_backend: Select computing backend. Defaults to None (pandas).

    Returns:
        The dimensionally-reduced dataframe and reduction object
    """
    if not _is_dataframe(data):
        raise ValueError("Data must be a Pandas (or Modin) DataFrame")

    if dim_method == "pca":
        reduc_df, reductor = run_pca(data, n_components, compute_backend)
    elif dim_method == "ipca":
        reduc_df, reductor = run_ipca(data, n_components, compute_backend)
    elif dim_method == "tsne":
        reduc_df, reductor = run_tsne(data, n_components, apply_tsvd,
                                      compute_backend)
    elif dim_method == "tsvd":
        reduc_df, reductor = run_tsvd(data, n_components, compute_backend)
    else:
        raise NotImplementedError("{} is not supported".format(dim_method))
    return reduc_df, reductor
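A usage sketch for dim_reduc matching the tests above (test_ipca, test_tsvd, test_tsne); the input frame here is illustrative:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(100, 10)),
                  columns=[f"f{i}" for i in range(10)])

# Project onto 2 principal components; also returns the fitted reducer.
reduced, reducer = dim_reduc(df, n_components=2, dim_method="pca")

# t-SNE with the recommended TSVD pre-reduction step.
embedded, tsne = dim_reduc(df, n_components=2, dim_method="tsne", apply_tsvd=True)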
Example #25
def test_series(load_series_summary):
    assert isinstance(load_series_summary, SummaryWidget)
    assert _is_dataframe(load_series_summary.summary)
Example #26
def _modin_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe

    Raises:
        ValueError: Invalid input data type.

    Returns:
        The Modin dataframe with metrics in rows
    """
    if _is_series(data):
        data = _compat["modin.pandas"].DataFrame(data)

    if not _is_dataframe(data):
        raise ValueError("Data must be a Modin DataFrame")

    info_data = pd.DataFrame(
        {
            "Info": [
                data.shape[0],
                data.shape[1],
                _sizeof_fmt(data.memory_usage().sum(), ""),
            ]
        },
        index=["Rows", "Columns", "Size in Memory"],
    )

    # Save column order
    columns = data.columns

    dtypes = data.dtypes.to_numpy()
    s_mean = data.mean(numeric_only=True).reindex(columns).to_numpy()
    s_sd = data.std(numeric_only=True).reindex(columns).to_numpy()
    s_med = data.median(numeric_only=True).reindex(columns).to_numpy()
    s_min = data.min(numeric_only=True).reindex(columns).to_numpy()
    s_max = data.max(numeric_only=True).reindex(columns).to_numpy()
    s_zero = data[data == 0].fillna(0).sum().astype(int).to_numpy()
    s_null = data.isnull().sum().astype(int).to_numpy()
    s_unique = data.nunique().to_numpy()
    s_freq = (
        data.apply(lambda x: mode1(x.astype("str")))
        .iloc[
            0,
        ]
        .to_numpy()
    )

    summary_data = pd.DataFrame(
        np.vstack(
            [
                dtypes,
                s_null,
                s_zero,
                s_min,
                s_med,
                s_max,
                s_mean,
                s_sd,
                s_unique,
                s_freq,
            ]
        ).transpose(),
        columns=[
            "Data Type",
            "Nulls",
            "Zeros",
            "Min",
            "Median",
            "Max",
            "Mean",
            "Standard Deviation",
            "Unique",
            "Top Frequency",
        ],
        index=columns,
    )

    return SummaryWidget(data, info_data, summary_data)
Example #27
def _pandas_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe

    Raises:
        ValueError: Invalid input data type.

    Returns:
        The Pandas dataframe with metrics in rows
    """
    if _is_series(data):
        data = pd.DataFrame(data, columns=[data.name])

    if not _is_dataframe(data):
        raise ValueError("Data must be a Pandas DataFrame")

    info_data = pd.DataFrame(
        {
            "Info": [
                data.shape[0],
                data.shape[1],
                _sizeof_fmt(data.memory_usage().sum()),
            ]
        },
        index=["Rows", "Columns", "Size in Memory"],
    )

    columns = data.columns
    val = data.values
    num_columns = data.select_dtypes("number").columns
    num_ind = np.nonzero([c in num_columns for c in columns])[0]
    date_columns = data.select_dtypes(["datetime", "datetimetz"]).columns
    date_ind = np.nonzero([c in date_columns for c in columns])[0]
    other_columns = data.select_dtypes(
        exclude=["number", "datetime", "datetimetz"]
    ).columns
    other_ind = np.nonzero([c in other_columns for c in columns])[0]
    order = np.concatenate([num_ind, date_ind, other_ind], axis=0)

    dtypes = data.dtypes[order]
    s_mean = np.pad(
        np.mean(val[:, num_ind], axis=0),
        (0, len(data.columns) - num_ind.size),
        constant_values=np.nan,
    )
    s_sd = np.pad(
        np.std(val[:, num_ind].astype(float), axis=0),
        (0, len(data.columns) - num_ind.size),
        constant_values=np.nan,
    )
    s_med = np.pad(
        np.median(val[:, num_ind], axis=0),
        (0, len(data.columns) - num_ind.size),
        constant_values=np.nan,
    )
    s_min = np.pad(
        np.min(val[:, np.concatenate([num_ind, date_ind])], axis=0),
        (0, len(data.columns) - num_ind.size - date_ind.size),
        constant_values=np.nan,
    )
    s_max = np.pad(
        np.max(val[:, np.concatenate([num_ind, date_ind])], axis=0),
        (0, len(data.columns) - num_ind.size - date_ind.size),
        constant_values=np.nan,
    )
    s_zero = data[data == 0].fillna(0).sum().astype(int)[order]
    s_null = data.isnull().sum().astype(int)[order]
    s_unique = data.nunique()[order]
    s_freq = np.apply_along_axis(mode1, 0, val.astype("str"))[order]

    summary_data = pd.DataFrame(
        np.vstack(
            [
                dtypes,
                s_null,
                s_zero,
                s_min,
                s_med,
                s_max,
                s_mean,
                s_sd,
                s_unique,
                s_freq,
            ]
        ).transpose()[np.argsort(order), :],
        columns=[
            "Data Type",
            "Nulls",
            "Zeros",
            "Min",
            "Median",
            "Max",
            "Mean",
            "Standard Deviation",
            "Unique",
            "Top Frequency",
        ],
        index=data.columns,
    )

    return SummaryWidget(data, info_data, summary_data)
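A sketch of calling this pandas backend helper directly; it assumes the module-level helpers it relies on (mode1, _sizeof_fmt, SummaryWidget) are in scope, and that the widget exposes the third constructor argument as summary_data, as the tests above suggest:

import numpy as np
import pandas as pd

# Small illustrative frame with one numeric and one categorical column.
df = pd.DataFrame({
    "age": [23, 35, 0, 41, 52],
    "city": ["NY", "SF", "NY", "SF", "LA"],
})

widget = _pandas_compute_data_summary(df)
print(widget.summary_data)  # one row per input column, one column per metric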
Example #28
def sensitive_data(
    df,
    mode: str = "redact",
    detect_infotypes: bool = True,
    columns: Optional[list] = None,
    score_threshold: float = _DEFAULT_SCORE_THRESHOLD,
    sample_size: int = _SAMPLE_SIZE,
    engine_backend=None,
    compute_backend: Optional[str] = None,
):
    """Identifies, redacts, and/or encrypts PII data.

    Note:
        `sensitive_data` uses Microsoft's Presidio in the backend. Presidio can be used
        to help identify sensitive data. However, because Presidio uses trained ML models,
        there is no guarantee that Presidio will find all sensitive information.

    Args:
        df (DataFrame): The dataframe
        mode (str): {'redact', 'encrypt'}
            redact: Redact the sensitive data
            encrypt: Anonymize the sensitive data
        detect_infotypes (bool): If True, identifies infotypes for each column
        columns ([str]): Limit processing to these columns. Defaults to None (all columns).
        score_threshold (float): Minimum confidence value for detected entities to be returned. Default is 0.2.
        sample_size (int): Number of sampled rows used for identifying column infotypes. Default is 100.
        engine_backend: The backend analyzer engine. Default is presidio_analyzer.
        compute_backend (str): Select compute backend

    Raises:
        ValueError: Invalid input data type.
        TypeError: `columns` not a list of strings.

    Returns:
        SensitiveDataWidget
    """
    if not engine_backend:
        engine_backend = presidio_engine()

    if not _is_dataframe(df):
        raise ValueError("Pandas data frame or modin data frame required")

    if _compat.check_install("modin.pandas"):
        if _is_dataframe(df, "modin"):
            warnings.warn(
                "Sensitive data does not currently support Modin DataFrames. Converting to Pandas."
            )
            df = df._to_pandas()

    if columns:
        if not isinstance(columns, list):
            raise TypeError("columns must be a list")

    if mode not in ["encrypt", "redact", None]:
        raise ValueError("mode must be set to 'encrypt', 'redact', or None")

    sensitivewidget = _get_compute_backend(
        compute_backend, df).compute_sensitive_data(
            df=df,
            mode=mode,
            detect_infotypes=detect_infotypes,
            columns=columns,
            score_threshold=score_threshold,
            sample_size=sample_size,
            engine_backend=engine_backend,
        )

    sensitivewidget.columns = columns
    sensitivewidget.score_threshold = score_threshold
    sensitivewidget.sample_size = sample_size if detect_infotypes else None
    sensitivewidget.engine = engine_backend

    return sensitivewidget
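A usage sketch for sensitive_data, assuming presidio_analyzer and its language model are installed; the example values are made up:

import pandas as pd

df = pd.DataFrame({
    "name": ["John Doe", "Jane Roe"],
    "domain": ["example.com", "mysite.org"],
})

# Replace detected entities with placeholders such as "<PERSON>".
widget = sensitive_data(df, mode="redact", detect_infotypes=False)
print(widget.redact)

# Encrypt instead, and sample rows to infer each column's infotype.
widget = sensitive_data(df, mode="encrypt", detect_infotypes=True, sample_size=2)
print(widget.infotypes)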
Example #29
def cluster(
    data,
    method="kmeans",
    dim_method="pca",
    compute_backend=None,
    viz_backend=None,
    **kwargs,
) -> ClusterWidget:
    """Unsupervised determination of clusters.

    This feature computes clusters using various algorithms (KMeans, HDBSCAN) and then
    projects the data onto a two-dimensional plot for visualization.

    Args:
        data (DataFrame): The data.
        method (str, optional): {'kmeans', 'hdbscan'} The clustering method.
        dim_method (str, optional): The method to use for dimensionality reduction.
        compute_backend (str, optional): The compute backend.
        viz_backend (str, optional): The visualization backend.
        n_clusters (Optional[int], optional): (KMeans) The number of clusters.
        cluster_range (Tuple[int, int], optional): (KMeans) A tuple of the minimum and
            maximum cluster search range. Defaults to (2, 20).
        metric (str): (KMeans) The metric to optimize (from sklearn.metrics).
        target: (KMeans) The labels for supervised clustering, as a 1-D array.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Data frame required
        ValueError: Clustering method not implemented

    Returns:
        ClusterWidget
    """
    if not _is_dataframe(data):
        raise ValueError("Data frame required")

    if method not in ["kmeans", "hdbscan"]:
        raise ValueError(f"{method} not implemented")

    data = data.select_dtypes("number")

    clusterwidget = _get_compute_backend(compute_backend,
                                         data).compute_cluster(data=data,
                                                               method=method,
                                                               **kwargs)

    viz_data, reductor = dim_reduc(clusterwidget.scaled_data,
                                   2,
                                   dim_method=dim_method)
    viz_data.columns = ["x", "y"]
    viz_data["clusters"] = clusterwidget.clusters

    clusterwidget.viz_data = viz_data
    clusterwidget.reductor = reductor

    if dim_method == "pca":
        var_explained = np.round(reductor.explained_variance_ratio_[:2],
                                 2) * 100
        clusterwidget.xlabel = f"Component 1 ({var_explained[0]}% variance explained)"
        clusterwidget.ylabel = f"Component 2 ({var_explained[1]}% variance explained)"
    else:
        clusterwidget.xlabel = "Dimension 1"
        clusterwidget.ylabel = "Dimension 2"

    clusterwidget.viz_backend = viz_backend

    return clusterwidget
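A usage sketch for cluster, assuming a numeric pandas DataFrame and scikit-learn installed:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(300, 5)),
                  columns=[f"x{i}" for i in range(5)])

# KMeans with an automatic search over the default cluster range (2, 20),
# projected to two dimensions with PCA for the visualization.
widget = cluster(df, method="kmeans", dim_method="pca")
widget.show()

# Fix the number of clusters instead of searching.
widget_fixed = cluster(df, method="kmeans", n_clusters=4)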