示例#1
0
def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Drop empty columns and rows and, optionally, sparsely populated ones.

    By default only completely empty columns and rows are removed. Lowering
    the thresholds additionally removes columns / rows whose fraction of
    NA-values exceeds the respective threshold.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with an NA-ratio above the specified threshold.
        Completely empty columns are always dropped, by default 1
    drop_threshold_rows : float, optional
        Drop rows with an NA-ratio above the specified threshold. Completely
        empty rows are always dropped, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. Excluded columns
        are never removed by the column step and are appended after the
        remaining columns in the result, by default None

    Returns
    -------
    pd.DataFrame
        Copy of the data without the dropped columns and rows

    Notes
    -----
    Columns are dropped first. The row step then runs on the frame that
    already contains the excluded columns again.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    # Coerce before any column indexing so that inputs which are not yet a
    # DataFrame (lists, arrays, dicts) are handled as documented.
    data = pd.DataFrame(data).copy()

    col_exclude = [] if col_exclude is None else list(col_exclude)
    data_exclude = data[col_exclude]

    # Column step: operate only on the non-excluded columns.
    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    sparse_cols = data_dropped.loc[
        :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
    ].columns
    data_dropped = data_dropped.drop(columns=sparse_cols).dropna(axis=1, how="all")

    # Re-attach the excluded columns before evaluating the rows.
    data = pd.concat([data_dropped, data_exclude], axis=1)

    # Row step
    sparse_rows = data.loc[
        _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
    ].index
    data_cleaned = data.drop(index=sparse_rows).dropna(axis=0, how="all")
    return data_cleaned
示例#2
0
文件: clean.py 项目: aniruhil/klib
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """Drop columns with many missing values if they carry little information.

    Columns with a high ratio of missing values are converted into binary
    "value present" indicators and then evaluated in three steps:

    - 1) Identify features with a high ratio of missing values (above
      'mv_threshold').
    - 2) Identify high correlations of these features among themselves and
      with other features in the dataset (above 'corr_thresh_features').
    - 3) Features with a high ratio of missing values and a high correlation
      with other features are dropped unless they correlate reasonably well
      with the target variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and
    drops the columns identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only
        the correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio
        larger than mv_threshold are candidates for dropping and undergo
        further analysis, by default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously
        identified feature (with a high mv-ratio) is allowed to have with
        another feature. If this threshold is overstepped, the feature
        undergoes further analysis, by default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a
        remaining feature (i.e. feature with a high mv-ratio and high
        correlation to another existing feature) with the target. If this
        threshold is not met the feature is ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional (only if return_details is True, returned as a tuple
    ``(data, cols_mv, drop_cols)``):
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # Replace candidate columns by indicators: 1 = value present, 0 = missing.
    if cols_mv:
        data_local[cols_mv] = data_local[cols_mv].notna().astype(int)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        # The largest absolute entry is presumably the column's correlation
        # with itself, so the second largest is the strongest correlation with
        # any other feature. Positional access is required here because the
        # result is indexed by column labels.
        strongest = corrmat[col].abs().nlargest(2)
        if len(strongest) > 1 and strongest.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            # Remove the column so later candidates are not compared to it.
            data_temp = data_temp.drop(columns=[col])

    if target is None:
        # Without a target every highly correlated candidate is dropped.
        drop_cols = list(high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target,
                         colored=False).loc[high_corr_features]
        weak_target_corr = corrs.iloc[:, 0].abs() < corr_thresh_target
        drop_cols = corrs.loc[weak_target_corr].index.tolist()

    data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
示例#3
0
def missingval_plot(
    data: pd.DataFrame,
    cmap: str = "PuBuGn",
    figsize: Tuple = (20, 20),
    sort: bool = False,
    spine_color: str = "#EEEEEE",
):
    """Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar chart with the share of missing
    values per column, a heatmap marking every missing cell, a text summary
    and a scatter plot with the number of missing values per row.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas
        DataFrame is provided, the index/column information is used to label
        the plots
    cmap : str, optional
        Any valid colormap can be used. E.g. "Greys", "RdPu". More information
        can be found in the matplotlib documentation, by default "PuBuGn"
    figsize : Tuple, optional
        Use to control the figure size, by default (20, 20)
    sort : bool, optional
        Sort columns based on missing values in descending order and drop
        columns without any missing values, by default False
    spine_color : str, optional
        Set to "None" to hide the spines on all plots or use any valid
        matplotlib color argument, by default "#EEEEEE"

    Returns
    -------
    GridSpec
        gs: Figure with array of Axes objects. If the dataset contains no
        missing values, a message is printed and None is returned.
    """

    # Validate Inputs
    _validate_input_bool(sort, "sort")

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        complete_cols = mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()
        final_cols = mv_cols_sorted.drop(complete_cols).keys().tolist()
        data = data[final_cols]
        print("Displaying only columns with missing values.")

    # Identify missing values
    mv_total, mv_rows, mv_cols, _, mv_cols_ratio = _missing_vals(data).values()
    total_datapoints = data.shape[0] * data.shape[1]

    if mv_total == 0:
        print("No missing values found in the dataset.")
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=6, left=0.1, wspace=0.05)
    ax1 = fig.add_subplot(gs[:1, :5])
    ax2 = fig.add_subplot(gs[1:, :5])
    ax3 = fig.add_subplot(gs[:1, 5:])
    ax4 = fig.add_subplot(gs[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio) * 100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-0.5, len(mv_cols) - 0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio) * 100)
    ax1.grid(linestyle=":", linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis="y", colors="#111111", length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(
            0.1 + rect.get_x() + rect.get_width() / 2,
            height + 0.5,
            label,
            ha="center",
            va="bottom",
            # Must be a number: matplotlib only accepts numeric angles or the
            # keywords "vertical" / "horizontal", not numeric strings.
            rotation=90,
            alpha=0.5,
            fontsize=11,
        )

    ax1.set_frame_on(True)
    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines["top"].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap="binary", ax=ax2)
    # keep every fifth tick, rounded to the nearest ten
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment="center",
        fontweight="light",
        fontsize=12,
    )
    ax2.tick_params(length=1, colors="#111111")
    for spine in ax2.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {"color": "#111111", "weight": "normal", "size": 14}
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(
        0.025,
        0.875,
        f"Total: {np.round(total_datapoints/1000,1)}K",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.675,
        f"Missing: {np.round(mv_total/1000,1)}K",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.475,
        f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.275,
        f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.075,
        f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for spine in ax4.spines.values():
        spine.set_color(spine_color)
    ax4.tick_params(axis="x", colors="#111111", length=1)

    ax4.scatter(
        mv_rows,
        range(len(mv_rows)),
        s=mv_rows,
        c=mv_rows,
        cmap=cmap,
        marker=".",
        vmin=1,
    )
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows) + 0.5)
    ax4.grid(linestyle=":", linewidth=1)

    gs.figure.suptitle(
        "Missing value plot", x=0.45, y=0.94, fontsize=18, color="#111111"
    )

    return gs