def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """
    Convert columns to the best possible dtypes using dtypes supporting pd.NA.

    Integer conversion is switched off in the call to ``convert_dtypes`` because of
    an issue in pandas, expected to be fixed in pandas 1.1. See
    https://github.com/pandas-dev/pandas/issues/33803. After the per-column
    conversion the frame is passed through ``optimize_ints`` and ``optimize_floats``.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. The input is copied and
        never modified.
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values (NA counted as a value) below which categories are
        inferred and column dtype is changed to categorical, by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted Datatypes
    """
    # Validate Inputs
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    # list() instead of .copy() so tuples and sets are accepted as well
    cat_exclude = [] if cat_exclude is None else list(cat_exclude)

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]

    for col in data.columns:
        # The cheap checks come first so that the comparatively expensive nunique()
        # only runs for columns that can actually become categorical. The n_rows
        # guard avoids a ZeroDivisionError on frames without rows.
        if (
            category
            and n_rows > 0
            and col not in cat_exclude
            and data[col].dtype == "object"
            and data[col].nunique(dropna=False) / n_rows < cat_threshold
        ):
            data[col] = data[col].astype("category")

        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data
def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """
    Clean the column names of the provided Pandas Dataframe and optionally provide
    hints on duplicate and long column names.

    The names are processed in this order: an underscore is inserted at every
    lowercase/uppercase boundary (CamelCase), the name is lowercased, special
    characters are replaced (see the table in the code), runs of whitespace and
    underscores are collapsed to a single underscore and leading/trailing
    underscores are stripped. Names that are duplicated after cleaning get the
    column position appended ("name_<index>"), keeping the first occurrence as is.

    Note that the columns of the passed DataFrame are replaced in place; the same
    object is returned.

    Parameters
    ----------
    data : pd.DataFrame
        Original Dataframe with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and colum name length, by default
        True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """
    _validate_input_bool(hints, "hints")

    camel_boundary = re.compile(r"([a-z])([A-Z])")

    # Single-character substitutions applied in one pass. None of the replacement
    # texts contains a character that is itself a key, so the result does not
    # depend on any ordering.
    char_map = str.maketrans({
        "\n": "_",
        "(": "_",
        ")": "_",
        "'": "_",
        '"': "_",
        ".": "_",
        "-": "_",
        "!": "_",
        "?": "_",
        ":": "_",
        ";": "_",
        "/": "_",
        "+": "_plus_",
        "*": "_times_",
        "<": "_smaller_",
        ">": "_larger_",
        "=": "_equal_",
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "ß": "ss",
        "%": "_percent_",
        "$": "_dollar_",
        "€": "_euro_",
        "@": "_at_",
        "#": "_hash_",
        "&": "_and_",
    })

    cleaned = []
    for col in data.columns:
        # str() keeps non-string labels (e.g. integer column names) from raising.
        # Lowercasing happens before the character mapping so that uppercase
        # umlauts are transliterated as well.
        name = camel_boundary.sub(r"\1_\2", str(col)).lower()
        name = name.translate(char_map)
        name = re.sub(r"\s+", "_", name)
        name = re.sub(r"_+", "_", name).strip("_")
        cleaned.append(name)

    # Resolve duplicates: every repeated name gets its column position appended.
    seen = set()
    unique_names = []
    dupl_idx = []
    for i, name in enumerate(cleaned):
        if name in seen:
            dupl_idx.append(i)
            unique_names.append(name + "_" + str(i))
        else:
            unique_names.append(name)
        seen.add(name)

    data.columns = pd.Index(unique_names, name=data.columns.name)

    if dupl_idx and hints:
        dupl_before = [cleaned[i] for i in dupl_idx]
        dupl_after = [unique_names[i] for i in dupl_idx]
        print(
            f"Duplicate column names detected! Columns with index {dupl_idx} and "
            f"names {dupl_before} have been renamed to "
            f"{dupl_after}.")

    long_col_names = [x for x in unique_names if len(x) > 25]
    if long_col_names and hints:
        print(
            "Long column names detected (>25 characters). Consider renaming the "
            f"following columns {long_col_names}.")

    return data
def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    clean_col_names: bool = True,
    show: Optional[str] = "changes",
) -> pd.DataFrame:
    """
    Perform initial data cleaning tasks on a dataset, such as dropping single
    valued and empty columns, empty rows as well as optimizing the datatypes.

    The steps run in this order: drop missing values (``drop_missing``), clean the
    column names, drop columns holding a single value, drop duplicate rows, convert
    the datatypes and finally report the differences (``_diff_report``). The input
    is copied first and is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by
        default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by
        default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurence. This step comes after the
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using convert_datatypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is
        changed to categorical, by default 0.03
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion. Names may be given
        as they appear in the input or in their cleaned form, by default None
    clean_col_names : bool, optional
        Cleans the column names and provides hints on duplicate and long names, by
        default True
    show : Optional[str], optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:

            * "all": Print information about the data before and after cleaning as
            well as information about changes and memory usage (deep). Please be
            aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the
    same categories.
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    if clean_col_names:
        names_before = list(data_cleaned.columns)
        data_cleaned = clean_column_names(data_cleaned)

        # clean_column_names only renames, so old and new names pair up by
        # position. Without this translation, cat_exclude entries given as original
        # names would no longer match any column in convert_datatypes.
        if cat_exclude is not None:
            renamed = dict(zip(names_before, data_cleaned.columns))
            cat_exclude = list(cat_exclude) + [
                renamed[name] for name in cat_exclude if name in renamed
            ]

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None
    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)

    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned
def missingval_plot(
    data: pd.DataFrame,
    cmap: str = "PuBuGn",
    figsize: Tuple = (20, 20),
    sort: bool = False,
    spine_color: str = "#EEEEEE",
):
    """
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar plot with the share of missing values
    per column (top left), a heatmap of the missing values (bottom left), a text
    summary (top right) and a scatter plot of the missing values per row (bottom
    right).

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame
        is provided, the index/column information is used to label the plots
    cmap : str, optional
        Any valid colormap can be used. E.g. "Greys", "RdPu". More information can be
        found in the matplotlib documentation, by default "PuBuGn"
    figsize : Tuple, optional
        Use to control the figure size, by default (20, 20)
    sort : bool, optional
        Sort columns based on missing values in descending order and drop columns
        without any missing values, by default False
    spine_color : str, optional
        Set to "None" to hide the spines on all plots or use any valid matplotlib
        color argument, by default "#EEEEEE"

    Returns
    -------
    GridSpec
        gs: Figure with array of Axes objects. None is returned (and a message is
        printed) if the dataset contains no missing values.
    """
    # Validate Inputs
    _validate_input_bool(sort, "sort")

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print("Displaying only columns with missing values.")

    # Identify missing values
    # NOTE(review): relies on the value order of the mapping returned by
    # _missing_vals (total, per row, per column, <unused>, per-column ratio) —
    # confirm against that helper.
    mv_total, mv_rows, mv_cols, _, mv_cols_ratio = _missing_vals(data).values()
    total_datapoints = data.shape[0] * data.shape[1]

    if mv_total == 0:
        # Nothing to draw; return explicitly so no figure objects are referenced.
        print("No missing values found in the dataset.")
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=6, left=0.1, wspace=0.05)
    ax1 = fig.add_subplot(gs[:1, :5])
    ax2 = fig.add_subplot(gs[1:, :5])
    ax3 = fig.add_subplot(gs[:1, 5:])
    ax4 = fig.add_subplot(gs[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio) * 100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-0.5, len(mv_cols) - 0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio) * 100)
    ax1.grid(linestyle=":", linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis="y", colors="#111111", length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(
            0.1 + rect.get_x() + rect.get_width() / 2,
            height + 0.5,
            label,
            ha="center",
            va="bottom",
            rotation=90,  # numeric: current matplotlib rejects the string "90"
            alpha=0.5,
            fontsize=11,
        )

    ax1.set_frame_on(True)
    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines["top"].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap="binary", ax=ax2)
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment="center",
        fontweight="light",
        fontsize=12,
    )
    ax2.tick_params(length=1, colors="#111111")
    for spine in ax2.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {"color": "#111111", "weight": "normal", "size": 14}
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(
        0.025,
        0.875,
        f"Total: {np.round(total_datapoints/1000,1)}K",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.675,
        f"Missing: {np.round(mv_total/1000,1)}K",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.475,
        f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.275,
        f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.075,
        f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for spine in ax4.spines.values():
        spine.set_color(spine_color)
    ax4.tick_params(axis="x", colors="#111111", length=1)

    ax4.scatter(
        mv_rows,
        range(len(mv_rows)),
        s=mv_rows,
        c=mv_rows,
        cmap=cmap,
        marker=".",
        vmin=1,
    )
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows) + 0.5)
    ax4.grid(linestyle=":", linewidth=1)

    gs.figure.suptitle(
        "Missing value plot", x=0.45, y=0.94, fontsize=18, color="#111111"
    )

    return gs
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    figsize: Tuple = (16, 2),
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Optional[Dict[str, Any]] = None,
    rug_kws: Optional[Dict[str, Any]] = None,
    fill_kws: Optional[Dict[str, Any]] = None,
    font_kws: Optional[Dict[str, Any]] = None,
):
    """
    Two-dimensional visualization of the distribution of non binary numerical
    features.

    One figure is created per numeric column with more than two distinct values.
    Each figure shows the kernel density estimate with a rug plot, a shaded quantile
    range, markers for mean, median and mean +/- one standard deviation, and a text
    block with summary statistics.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame
        is provided, the index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default
        "orange"
    figsize : Tuple, optional
        Controls the figure size, by default (16, 2)
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about
        two std. deviations above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75,
        "linewidth": 1.5, "bw": 0.4}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333",
        "alpha": 0.05, "linewidth": 4, "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff",
        "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color": "#111111",
        "weight": "normal", "size": 11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object of the last plotted column for further tweaking, or
        None if no numeric column qualifies.
    """
    # Validate Inputs
    _validate_input_range(fill_range[0], "fill_range_lower", 0, 1)
    _validate_input_range(fill_range[1], "fill_range_upper", 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], "fill_range")
    _validate_input_bool(showall, "showall")

    # Handle dictionary defaults (copies keep the caller's dicts untouched)
    kde_kws = (
        {"alpha": 0.75, "linewidth": 1.5, "bw": 0.4}
        if kde_kws is None
        else kde_kws.copy()
    )
    rug_kws = (
        {"color": "#ff3333", "alpha": 0.05, "linewidth": 4, "height": 0.075}
        if rug_kws is None
        else rug_kws.copy()
    )
    fill_kws = (
        {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
    )
    font_kws = (
        {"color": "#111111", "weight": "normal", "size": 11}
        if font_kws is None
        else font_kws.copy()
    )

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    data = data.loc[:, data.nunique() > 2]
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return None

    # Only truncate (and say so) when there really are more than 20 features.
    if len(cols) > 20 and showall is False:
        print(
            f"Note: The number of non binary numerical features is very large ({len(cols)}), please consider"
            " splitting the data. Showing plots for the first 20 numerical features. Override this by setting"
            " showall=True.")
        cols = cols[:20]

    ax = None
    for col in cols:
        num_dropped_vals = data[col].isna().sum()
        if num_dropped_vals > 0:
            col_data = data[col].dropna(axis=0)
            print(
                f"Dropped {num_dropped_vals} missing values from column {col}."
            )
        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        ax = sns.distplot(
            col_data,
            hist=False,
            rug=True,
            kde_kws=kde_kws,
            rug_kws=rug_kws,
        )

        # Vertical lines and fill
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(
            x,
            y,
            where=((x >= np.quantile(col_data, fill_range[0]))
                   & (x <= np.quantile(col_data, fill_range[1]))),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_data)
        median = np.median(col_data)
        std = scipy.stats.tstd(col_data)
        ax.vlines(
            x=mean,
            ymin=0,
            ymax=np.interp(mean, x, y),
            ls="dotted",
            color=mean_color,
            lw=2,
            label="mean",
        )
        ax.vlines(
            x=median,
            ymin=0,
            ymax=np.interp(median, x, y),
            ls=":",
            color=".3",
            label="median",
        )
        ax.vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        ax.set_ylim(0)
        # Widen the x-range relative to its span. Scaling the limits themselves by
        # a factor moves them the wrong way whenever they do not straddle zero and
        # cuts off part of the curve. For limits symmetric around zero this
        # reproduces the former 15% widening.
        x_low, x_high = ax.get_xlim()
        x_pad = 0.075 * (x_high - x_low)
        ax.set_xlim(x_low - x_pad, x_high + x_pad)

        # Annotations and legend
        ax.text(0.01, 0.85, f"Mean: {mean:.2f}", fontdict=font_kws,
                transform=ax.transAxes)
        ax.text(0.01, 0.7, f"Std. dev: {std:.2f}", fontdict=font_kws,
                transform=ax.transAxes)
        ax.text(
            0.01,
            0.55,
            f"Skew: {scipy.stats.skew(col_data):.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.01,
            0.4,
            f"Kurtosis: {scipy.stats.kurtosis(col_data):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(0.01, 0.25, f"Count: {len(col_data)}", fontdict=font_kws,
                transform=ax.transAxes)
        ax.legend(loc="upper right")

    return ax
def corr_plot(
    data: pd.DataFrame,
    split: Optional[str] = None,
    threshold: float = 0,
    target: Optional[Union[pd.Series, str]] = None,
    method: str = "pearson",
    cmap: str = "BrBG",
    figsize: Tuple = (12, 10),
    annot: bool = True,
    dev: bool = False,
    **kwargs,
):
    """
    Two-dimensional visualization of the correlation between feature-columns,
    excluding NA values.

    The correlations are computed with corr_mat() and drawn as a seaborn heatmap.
    Without a target the upper triangle (including the diagonal) is masked.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame
        is provided, the index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed {None, "pos", "neg", "high", "low"}, by default
        None
            * None: visualize all correlations between the feature-columns
            * pos: visualize all positive correlations between the feature-columns
            above the threshold
            * neg: visualize all negative correlations between the feature-columns
            below the threshold
            * high: visualize all correlations between the feature-columns for which
            abs(corr) > threshold is True
            * low: visualize all correlations between the feature-columns for which
            abs(corr) < threshold is True
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless
        split = "high" or split = "low", in which case default is 0.3
    target : Optional[Union[pd.Series, str]], optional
        Specify target for correlation. E.g. label column to generate only the
        correlations between each feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
            * pearson: measures linear relationships and requires normally
            distributed and homoscedastic data.
            * spearman: ranked/ordinal correlation, measures monotonic relationships.
            * kendall: ranked/ordinal correlation, measures monotonic relationships.
            Computationally more expensive but more robust in smaller dataets than
            "spearman".
    cmap : str, optional
        The mapping from data values to color space, matplotlib colormap name or
        object, or list of colors, by default "BrBG"
    figsize : Tuple, optional
        Use to control the figure size, by default (12, 10)
    annot : bool, optional
        Use to show or hide annotations, by default True
    dev : bool, optional
        Display figure settings in the plot by setting dev = True. If False, the
        settings are not displayed, by default False

    Keyword Arguments : optional
        Additional elements to control the visualization of the plot, e.g.:

            * mask: bool, default True
                If set to False the entire correlation matrix, including the upper
                triangle is shown. Set dev = False in this case to avoid overlap.
            * vmax: float, default is calculated from the given correlation
                coefficients.
                Value between -1 or vmin <= vmax <= 1, limits the range of the
                colorbar.
            * vmin: float, default is calculated from the given correlation
                coefficients.
                Value between -1 <= vmin <= 1 or vmax, limits the range of the
                colorbar.
            * linewidths: float, default 0.5
                Controls the line-width inbetween the squares.
            * annot_kws: dict, default {"size" : 10}
                Controls the font size of the annotations. Only available when
                annot = True.
            * cbar_kws: dict, default {"shrink": .95, "aspect": 30}
                Controls the size of the colorbar.
            * center: float, default 0, and fmt: str, default ".2f"
            * Many more kwargs are available, i.e. "alpha" to control blending, or
            options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """
    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(annot, "annot")
    _validate_input_bool(dev, "dev")

    data = pd.DataFrame(data)

    corr = corr_mat(
        data,
        split=split,
        threshold=threshold,
        target=target,
        method=method,
        colored=False,
    )

    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    mask = np.zeros_like(corr, dtype=bool)

    if target is None:
        mask = np.triu(np.ones_like(corr, dtype=bool))

    vmax = np.round(np.nanmax(corr.where(~mask)) - 0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask)) + 0.05, 2)

    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap. User supplied kwargs come last so they
    # override every default, including "center", "fmt" and "ax".
    kwargs = {
        "mask": mask,
        "cmap": cmap,
        "annot": annot,
        "vmax": vmax,
        "vmin": vmin,
        "linewidths": 0.5,
        "annot_kws": {"size": 10},
        "cbar_kws": {"shrink": 0.95, "aspect": 30},
        "center": 0,
        "fmt": ".2f",
        "ax": ax,
        **kwargs,
    }

    # Draw heatmap with mask and default settings
    sns.heatmap(corr, **kwargs)

    ax.set_title(f"Feature-correlation ({method})", fontdict={"fontsize": 18})

    # Settings
    if dev:
        # Report the values actually handed to the heatmap, not the computed
        # defaults that a caller may have overridden.
        settings = (
            "Settings (dev-mode): \n"
            f"- split-mode: {split} \n"
            f"- threshold: {threshold} \n"
            f"- method: {method} \n"
            f"- annotations: {annot} \n"
            "- cbar: \n"
            f"    - vmax: {kwargs['vmax']} \n"
            f"    - vmin: {kwargs['vmin']} \n"
            f"- linewidths: {kwargs['linewidths']} \n"
            f"- annot_kws: {kwargs['annot_kws']} \n"
            f"- cbar_kws: {kwargs['cbar_kws']}"
        )
        fig.suptitle(
            settings,
            fontsize=12,
            color="gray",
            x=0.35,
            y=0.85,
            ha="left",
        )

    return ax
def corr_mat(
    data: pd.DataFrame,
    split: Optional[str] = None,  # Optional[Literal['pos', 'neg', 'high', 'low']] = None,
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method: str = "pearson",  # Literal['pearson', 'spearman', 'kendall'] = "pearson",
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """
    Returns a color-encoded correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame
        is provided, the index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed, by default None
        {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless
        split = "high" or split = "low", in which case default is 0.3
    target : Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]], optional
        Specify target for correlation. E.g. label column to generate only the
        correlations between each feature and the label, by default None.
        A str selects (and removes) that column of data; a list, Series or ndarray
        is used as the target values directly. Lists and arrays with as many
        entries as data has rows are aligned to the index of data. Any other type
        is ignored and the full correlation matrix is computed.
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
            * pearson: measures linear relationships and requires normally
            distributed and homoscedastic data.
            * spearman: ranked/ordinal correlation, measures monotonic relationships.
            * kendall: ranked/ordinal correlation, measures monotonic relationships.
            Computationally more expensive but more robust in smaller dataets than
            "spearman"
    colored : bool, optional
        If True the negative values in the correlation matrix are colored in red, by
        default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """
    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        # An empty string means "no extra style" for the cell.
        return "color: #FF3344" if val < 0 else ""

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)
        else:
            if not isinstance(target, pd.Series) and len(target) == len(data):
                # corrwith aligns on the index; positional targets therefore have
                # to carry the index of data or rows are matched up incorrectly.
                target_data = pd.Series(target, index=data.index)
            else:
                target_data = pd.Series(target)
            target = target_data.name

        # Older pandas versions dropped non-numeric columns silently; selecting
        # them explicitly keeps that behavior on versions that raise instead.
        features = data.select_dtypes(include=["number", "bool"])
        corr = pd.DataFrame(features.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]
    else:
        features = data.select_dtypes(include=["number", "bool"])
        corr = features.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        styler = corr.style
        # Styler.applymap is deprecated in favor of Styler.map in newer pandas.
        apply_elementwise = getattr(styler, "map", None) or styler.applymap
        return apply_elementwise(color_negative_red).format("{:.2f}", na_rep="-")

    return corr
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    size: float = 2.5,
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Optional[Dict[str, Any]] = None,
    rug_kws: Optional[Dict[str, Any]] = None,
    fill_kws: Optional[Dict[str, Any]] = None,
    font_kws: Optional[Dict[str, Any]] = None,
):
    """
    Two-dimensional visualization of the distribution of non binary numerical
    features.

    One figure is created per numeric column with more than two distinct values.
    For frames with more than 10000 rows the curves are drawn from a fixed random
    sample of 10000 rows, while the summary statistics, quantiles and markers are
    computed on all rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame
        is provided, the index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default
        "orange"
    size : float, optional
        Controls the plot size (passed as height to displot), by default 2.5
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about
        two std. deviations above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75,
        "linewidth": 1.5, "bw_adjust": 0.8}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333",
        "alpha": 0.15, "lw": 3, "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff",
        "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color": "#111111",
        "weight": "normal", "size": 11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object of the last plotted column for further tweaking, or
        None if no numeric column qualifies.
    """
    # Validate Inputs
    _validate_input_range(fill_range[0], "fill_range_lower", 0, 1)
    _validate_input_range(fill_range[1], "fill_range_upper", 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], "fill_range")
    _validate_input_bool(showall, "showall")

    # Handle dictionary defaults (copies keep the caller's dicts untouched)
    kde_kws = (
        {"alpha": 0.75, "linewidth": 1.5, "bw_adjust": 0.8}
        if kde_kws is None
        else kde_kws.copy()
    )
    rug_kws = (
        {"color": "#ff3333", "alpha": 0.15, "lw": 3, "height": 0.075}
        if rug_kws is None
        else rug_kws.copy()
    )
    fill_kws = (
        {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
    )
    font_kws = (
        {"color": "#111111", "weight": "normal", "size": 11}
        if font_kws is None
        else font_kws.copy()
    )

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    # Full data for the statistics. No second copy is needed: everything below
    # rebinds "data" to new frames and never modifies this one.
    df = data
    data = data.loc[:, data.nunique() > 2]
    if data.shape[0] > 10000:
        data = data.sample(n=10000, random_state=408)
        print(
            "Large dataset detected, using 10000 random samples for the plots. Summary"
            " statistics are still based on the entire dataset.")
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return None

    # Only truncate (and say so) when there really are more than 20 features.
    if len(cols) > 20 and showall is False:
        print(
            "Note: The number of non binary numerical features is very large "
            f"({len(cols)}), please consider splitting the data. Showing plots for "
            "the first 20 numerical features. Override this by setting showall=True."
        )
        cols = cols[:20]

    ax = None
    for col in cols:
        col_data = data[col].dropna(axis=0)
        col_df = df[col].dropna(axis=0)

        g = sns.displot(
            col_data,
            kind="kde",
            rug=True,
            height=size,
            aspect=5,
            legend=False,
            rug_kws=rug_kws,
            **kde_kws,
        )
        ax = g.axes[0, 0]

        # Vertical lines and fill
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(
            x,
            y,
            where=((x >= np.quantile(col_df, fill_range[0]))
                   & (x <= np.quantile(col_df, fill_range[1]))),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_df)
        median = np.median(col_df)
        std = scipy.stats.tstd(col_df)
        ax.vlines(
            x=mean,
            ymin=0,
            ymax=np.interp(mean, x, y),
            ls="dotted",
            color=mean_color,
            lw=2,
            label="mean",
        )
        ax.vlines(
            x=median,
            ymin=0,
            ymax=np.interp(median, x, y),
            ls=":",
            color=".3",
            label="median",
        )
        ax.vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        ax.set_ylim(0)
        # Pad the x-range relative to its span (more room on the left for the text
        # block). Deriving the padding from the limit values themselves moves the
        # limits inwards for data that lies entirely below zero. For limits
        # symmetric around zero the padding matches the previous result.
        x_low, x_high = ax.get_xlim()
        x_span = x_high - x_low
        ax.set_xlim(x_low - 0.025 * x_span, x_high + 0.015 * x_span)

        # Annotations and legend
        ax.text(
            0.005,
            0.9,
            f"Mean: {mean:.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.005,
            0.7,
            f"Std. dev: {std:.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.005,
            0.5,
            f"Skew: {scipy.stats.skew(col_df):.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.005,
            0.3,
            f"Kurtosis: {scipy.stats.kurtosis(col_df):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.005,
            0.1,
            f"Count: {len(col_df)}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.legend(loc="upper right")

    return ax
def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """
    Clean the column names of the provided Pandas Dataframe and optionally provide
    hints on duplicate and long column names.

    The names are processed in this order: an underscore is inserted at every
    lowercase/uppercase boundary (CamelCase), the name is lowercased, special
    characters are replaced (see the table in the code), every run of whitespace
    and/or underscores is collapsed to a single underscore and leading/trailing
    underscores are stripped. Names that are duplicated after cleaning get the
    column position appended ("name_<index>"), keeping the first occurrence as is.

    Note that the columns of the passed DataFrame are replaced in place; the same
    object is returned.

    Parameters
    ----------
    data : pd.DataFrame
        Original Dataframe with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and colum name length, by default
        True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """
    _validate_input_bool(hints, "hints")

    camel_boundary = re.compile(r"([a-z])([A-Z])")
    separators = re.compile(r"[\s_]+")

    # Single-character substitutions applied in one pass. Replacements padded with
    # spaces become underscore-separated words once separators are collapsed.
    char_map = str.maketrans({
        "(": " ",
        ")": " ",
        "'": " ",
        '"': " ",
        ".": "_",
        "!": "_",
        "?": "_",
        ":": "_",
        ";": "_",
        "-": "_",
        "/": " ",
        "+": " plus ",
        "*": " times ",
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "ß": "ss",
        "%": " percent ",
        "$": " dollar ",
        "€": " euro ",
        "@": " at ",
        "#": " number ",
        "&": " and ",
    })

    cleaned = []
    for col in data.columns:
        # str() keeps non-string labels (e.g. integer column names) from raising.
        # Lowercasing happens before the character mapping so that uppercase
        # umlauts are transliterated as well.
        name = camel_boundary.sub(r"\1_\2", str(col)).lower()
        name = name.translate(char_map)
        # One regex pass handles runs of any length; successive fixed-width
        # replacements leave double underscores behind for longer runs.
        name = separators.sub("_", name).strip("_")
        cleaned.append(name)

    # Resolve duplicates: every repeated name gets its column position appended.
    seen = set()
    unique_names = []
    dupl_idx = []
    for i, name in enumerate(cleaned):
        if name in seen:
            dupl_idx.append(i)
            unique_names.append(name + "_" + str(i))
        else:
            unique_names.append(name)
        seen.add(name)

    data.columns = pd.Index(unique_names, name=data.columns.name)

    if dupl_idx and hints:
        dupl_before = [cleaned[i] for i in dupl_idx]
        dupl_after = [unique_names[i] for i in dupl_idx]
        print(
            f"- Duplicate column names detected! Columns with index {dupl_idx} and names {dupl_before} "
            f"have been renamed to {dupl_after}.")

    long_col_names = [x for x in unique_names if len(x) > 25]
    if long_col_names and hints:
        print(
            f"- Long column names detected (>25 characters)! Consider renaming the following columns "
            f"{long_col_names}.")

    return data