def __init__(
    self,
    df=None,
    minimal=False,
    explorative=False,
    config_file: Union[Path, str, None] = None,
    lazy: bool = True,
    **kwargs,
):
    """Generate a ProfileReport based on a pandas DataFrame

    Args:
        df: the pandas DataFrame
        minimal: minimal mode is a default configuration with minimal computation
        explorative: load the bundled explorative configuration; only
            consulted when neither `config_file` nor `minimal` is given
        config_file: a config file (.yml), mutually exclusive with `minimal`
        lazy: compute when needed
        **kwargs: other arguments, for valid arguments, check the default
            configuration file.

    Raises:
        ValueError: if both `config_file` and `minimal` are given, or if
            `df` is None while `lazy` is False.
    """
    if config_file is not None and minimal:
        raise ValueError(
            "Arguments `config_file` and `minimal` are mutually exclusive."
        )

    if df is None and not lazy:
        # An eager report has nothing to compute without data.
        raise ValueError(
            "Can't init a not-lazy ProfileReport with no DataFrame"
        )

    # Pick at most one configuration source; precedence is
    # config_file > minimal > explorative.
    if config_file:
        config.set_file(config_file)
    elif minimal:
        config.set_file(get_resource("configs/config_minimal.yaml"))
    elif explorative:
        config.set_file(get_resource("configs/config_explorative.yaml"))
    elif not config.is_default:
        # TODO: log that the active configuration is not the default and
        # that 'pandas_profiling.clear_config()' restores it.
        pass

    # Keyword overrides are applied on top of whichever file was selected.
    config.set_kwargs(kwargs)

    # Start with empty state; everything below is filled in later.
    self.df = None
    self._df_hash = -1
    self._description_set = None
    self._title = None
    self._report = None
    self._html = None
    self._widgets = None
    self._json = None

    if df is not None:
        # preprocess df
        self.df = self.preprocess(df)

    if not lazy:
        # Trigger building the report structure
        _ = self.report
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Draw a scatter (or hexbin) plot from a series of length-2 sequences.

    A hexbin plot is used when the series has more entries than the
    configured `plot.scatter_threshold`; otherwise a plain scatter plot.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series; each element is unpacked into plot coordinates
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        color = config["html"]["style"]["primary_color"].get(str)
        scatter_threshold = config["plot"]["scatter_threshold"].get(int)

        if not len(series) > scatter_threshold:
            # Few enough points: draw them individually.
            plt.scatter(*zip(*series.tolist()), color=color)
        else:
            # Many points: bin them, shading with a palette built from the
            # primary colour.
            cmap = sns.light_palette(color, as_cmap=True)
            plt.hexbin(*zip(*series.tolist()), cmap=cmap)
        return plot_360_n0sc0pe(plt)
def scatter_dataset(data: pd.DataFrame,
                    labels=None,
                    visualisation=None,
                    n_components=2,
                    figsize=(6.5, 6.5)) -> str:
    """Generate scatter plot of the whole dataset

    Args:
        data: Pandas DataFrame to generate scatter plot from.
        labels: passed through unchanged to `_plot_dataset`.
        visualisation: visualisation technique. Its `n_components`
            attribute is overwritten with `n_components`. When None, a new
            `PCA(random_state=0)` is created for this call, so no estimator
            state is shared between calls.
        n_components: number of components. 3 selects a 3D projection with
            a z-axis label; any other value uses a regular 2D axes.
        figsize: (width, height) of the figure; one unit is added to both
            dimensions when `n_components` is not 2.

    Returns:
        The resulting scatter plot encoded as a string.
    """
    if visualisation is None:
        # Built per call: a default instance in the signature would be
        # created once and mutated by every call.
        visualisation = PCA(random_state=0)

    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        three_dimensional = n_components == 3

        fig = plt.figure(
            figsize=figsize if n_components == 2
            else (figsize[0] + 1, figsize[1] + 1))

        # Create exactly one axes; adding a 2D axes first and a 3D axes on
        # top would leave both occupying the same slot.
        if three_dimensional:
            plot = fig.add_subplot(111, projection="3d")
        else:
            plot = fig.add_subplot(111)

        plot.set_xlabel("x")
        plot.set_ylabel("y")
        if three_dimensional:
            plot.set_zlabel("z")

        visualisation.n_components = n_components
        _plot_dataset(plot, data, labels, visualisation)
        plt.subplots_adjust(bottom=0.2)
        return plot_360_n0sc0pe(plt)
def missing_bar(data: pd.DataFrame) -> str:
    """Render the missing-values bar plot for a DataFrame.

    Args:
        data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
        The resulting missing values bar plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        labels = config["plot"]["missing"]["force_labels"].get(bool)
        bar_color = hex_to_rgb(
            config["html"]["style"]["primary_color"].get(str))

        ax = missingno.bar(
            data,
            figsize=(10, 5),
            color=bar_color,
            fontsize=get_font_size(data),
            labels=labels,
        )

        # Show every spine of the axes returned by missingno.
        for spine in ax.spines.values():
            spine.set_visible(True)

        # Turn the grid off on all axes of the current figure.
        for figure_axes in plt.gcf().get_axes():
            figure_axes.grid(False)

        plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.3)
        return plot_360_n0sc0pe(plt)
def scatter_complex(series: pd.Series) -> str:
    """Draw a scatter (or hexbin) plot of a series of complex values.

    The real parts go on the x-axis and the imaginary parts on the y-axis.
    A hexbin plot is used when the series has more entries than the
    configured `plot.scatter_threshold`.

    Examples:
        >>> complex_series = pd.Series([complex(1, 3), complex(3, 1)])
        >>> scatter_complex(complex_series)

    Args:
        series: the Series

    Returns:
        A string containing (a reference to) the image
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        plt.ylabel("Imaginary")
        plt.xlabel("Real")

        color = config["html"]["style"]["primary_color"].get(str)
        scatter_threshold = config["plot"]["scatter_threshold"].get(int)

        if not len(series) > scatter_threshold:
            plt.scatter(series.real, series.imag, color=color)
        else:
            # Too many points for a scatter plot: bin them instead.
            cmap = sns.light_palette(color, as_cmap=True)
            plt.hexbin(series.real, series.imag, cmap=cmap)

        return plot_360_n0sc0pe(plt)
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Plot image of a matrix correlation.

    Args:
        data: The matrix correlation to plot. Its column names are used as
            tick labels on both axes.
        vmin: Minimum value of value range. When 0, the colormap is passed
            through `get_cmap_half` first.

    Returns:
        The resulting correlation matrix encoded as a string.
    """
    import copy

    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        _, axes_cor = plt.subplots()
        cmap_name = config["plot"]["correlation"]["cmap"].get(str)
        cmap_bad = config["plot"]["correlation"]["bad"].get(str)

        # Work on a copy: `plt.get_cmap` may return the globally registered
        # colormap, and `set_bad` below would otherwise change it for every
        # other user of that colormap.
        cmap = copy.copy(plt.get_cmap(cmap_name))
        if vmin == 0:
            cmap = get_cmap_half(cmap)
        cmap.set_bad(cmap_bad)

        labels = data.columns
        matrix_image = axes_cor.imshow(data,
                                       vmin=vmin,
                                       vmax=1,
                                       interpolation="nearest",
                                       cmap=cmap)
        cbar = plt.colorbar(matrix_image)
        cbar.outline.set_visible(False)

        if data.isnull().values.any():
            # Explain the "bad" colour when some coefficients are missing.
            legend_elements = [
                Patch(facecolor=cmap(np.nan), label="invalid\ncoefficient")
            ]
            plt.legend(
                handles=legend_elements,
                loc="upper right",
                handleheight=2.5,
            )

        # imshow puts columns on the x-axis and rows on the y-axis, so the
        # x ticks span shape[1] and the y ticks span shape[0].
        n_rows, n_columns = data.shape[0], data.shape[1]
        axes_cor.set_xticks(
            np.arange(0, n_columns, float(n_columns) / len(labels)))
        axes_cor.set_yticks(
            np.arange(0, n_rows, float(n_rows) / len(labels)))

        font_size = get_correlation_font_size(len(labels))
        axes_cor.set_xticklabels(labels, rotation=90, fontsize=font_size)
        axes_cor.set_yticklabels(labels, fontsize=font_size)
        plt.subplots_adjust(bottom=0.2)
        return plot_360_n0sc0pe(plt)
def predictivity(data: pd.DataFrame) -> str:
    """Plot the predictivity values as a grouped bar chart.

    The columns plotted are the configured `correlations.targets`; when
    that setting is empty, all numeric columns of `data` are used. Values
    are rounded to two decimals, made absolute and rescaled to integers on
    a 0-100 axis, and every bar is annotated with its height.

    Args:
        data: The matrix to plot.

    Returns:
        The resulting predictivity plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        target_variables = config["correlations"]["targets"].get()
        if len(target_variables) == 0:
            numeric_columns = data.select_dtypes(include=np.number).columns
            target_variables = list(numeric_columns)

        # Use the default seaborn palette with entries 1 and 3 exchanged.
        palette = sns.color_palette().as_hex()
        palette[1], palette[3] = palette[3], palette[1]

        _, axes_pred = plt.subplots()
        axes_pred.set_ylim(0, 100)

        # Rescale in range [0, 100] for better visualization
        rounded = data[target_variables].round(2).abs()
        scores = (100 * rounded).astype(int)

        # Barplot predictivity
        scores.plot.bar(
            figsize=(10, 6),
            width=0.8,
            legend=True,
            fontsize=get_predictivity_font_size(scores),
            rot=45,
            ax=axes_pred,
            color=palette)

        # Put each bar's value above the plot area, centred on the bar.
        for bar in axes_pred.patches:
            bar_centre = bar.get_x() + bar.get_width() / 2.
            axes_pred.annotate(bar.get_height(),
                               (bar_centre, 100),
                               ha="center",
                               va="center",
                               xytext=(0, 15),
                               textcoords="offset points",
                               rotation=45)

        plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.2)
        return plot_360_n0sc0pe(plt)
def missing_dendrogram(data: pd.DataFrame) -> str:
    """Render the missing-values dendrogram for a DataFrame.

    Args:
        data: Pandas DataFrame to generate missing values dendrogram plot from.

    Returns:
        The resulting missing values dendrogram plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        # The dendrogram uses twice the font size computed for the data.
        dendrogram_font_size = get_font_size(data) * 2.0
        missingno.dendrogram(data, fontsize=dendrogram_font_size)
        plt.subplots_adjust(left=0.1, right=0.9, top=0.7, bottom=0.2)
        return plot_360_n0sc0pe(plt)
def test_double_config(console_data, test_output_dir):
    """Passing both `--config_file` and `--minimal` must raise ValueError."""
    report = test_output_dir / "test_double_config.html"
    expected_message = (
        "Arguments `config_file` and `minimal` are mutually exclusive."
    )

    with pytest.raises(ValueError) as exc_info:
        cli_args = [
            "-s",
            "--config_file",
            str(get_resource("configs/config_default.yaml")),
            "--minimal",
            str(console_data),
            str(report),
        ]
        console.main(cli_args)

    assert str(exc_info.value) == expected_message
def clustermap(data: pd.DataFrame) -> str:
    """Plot a clustermap of the data.

    Args:
        data: The data to plot.

    Returns:
        The resulting clustermap encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        # The helper's return value is not needed; the image is produced
        # from pyplot below.
        _plot_clustermap(data)
        return plot_360_n0sc0pe(plt)
def boxplot(series: np.ndarray, series_description: dict) -> str:
    """Plot a boxplot of the data.

    Args:
        series: The data to plot.
        series_description: passed through unchanged to `_plot_boxplot`.

    Returns:
        The resulting boxplot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        axes = _plot_boxplot(series, series_description)
        axes.figure.tight_layout()
        return plot_360_n0sc0pe(plt)
def histogram(series: np.ndarray, series_description: dict,
              bins: Union[int, np.ndarray]) -> str:
    """Plot a histogram of the data with x tick labels rotated 45 degrees.

    Args:
        series: The data to plot.
        series_description: passed through unchanged to `_plot_histogram`.
        bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
        The resulting histogram encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        axes = _plot_histogram(series, series_description, bins)
        axes.xaxis.set_tick_params(rotation=45)
        axes.figure.tight_layout()
        return plot_360_n0sc0pe(plt)
def missing_heatmap(data: pd.DataFrame) -> str:
    """Render the missing-values heatmap for a DataFrame.

    The figure height starts at 4, grows by one for every five columns
    beyond ten, and is capped at 10. With more than 40 columns the font
    size is divided by 1.4 and different subplot margins are used.

    Args:
        data: Pandas DataFrame to generate missing values heatmap plot from.

    Returns:
        The resulting missing values heatmap plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        n_columns = len(data.columns)
        many_columns = n_columns > 40

        if n_columns > 10:
            height = min(4 + int((n_columns - 10) / 5), 10)
        else:
            height = 4

        font_size = get_font_size(data)
        if many_columns:
            font_size = font_size / 1.4

        labels = config["plot"]["missing"]["force_labels"].get(bool)
        ax = missingno.heatmap(
            data,
            figsize=(10, height),
            fontsize=font_size,
            cmap=config["plot"]["missing"]["cmap"].get(str),
            labels=labels,
        )

        # Show every spine of the axes returned by missingno.
        for spine in ax.spines.values():
            spine.set_visible(True)

        if many_columns:
            margins = dict(left=0.1, right=0.9, top=0.9, bottom=0.3)
        else:
            margins = dict(left=0.2, right=0.9, top=0.8, bottom=0.3)
        plt.subplots_adjust(**margins)

        return plot_360_n0sc0pe(plt)
def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    """Draw a scatter (or hexbin) plot of one series against another.

    A hexbin plot with a grid size of 15 is used when `series1` has more
    entries than the configured `plot.scatter_threshold`.

    Examples:
        >>> widths = pd.Series([800, 1024])
        >>> heights = pd.Series([600, 768])
        >>> scatter_pairwise(widths, heights, "Width", "Height")

    Args:
        series1: the series corresponding to the x-axis
        series2: the series corresponding to the y-axis
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        color = config["html"]["style"]["primary_color"].get(str)
        scatter_threshold = config["plot"]["scatter_threshold"].get(int)

        if not len(series1) > scatter_threshold:
            plt.scatter(series1.tolist(), series2.tolist(), color=color)
        else:
            # Too many points for a scatter plot: bin them instead.
            cmap = sns.light_palette(color, as_cmap=True)
            plt.hexbin(series1.tolist(),
                       series2.tolist(),
                       gridsize=15,
                       cmap=cmap)
        return plot_360_n0sc0pe(plt)
def __init__(self):
    """The config constructor should be called only once.

    If a configuration object already exists, the default configuration
    file is loaded into it; otherwise `clear()` is called.
    """
    if self.config is not None:
        default_file = get_resource("configs/config_default.yaml")
        self.set_file(str(default_file))
    else:
        self.clear()
def clear(self):
    """Replace the configuration with a new one loaded from the defaults.

    A new `confuse.Configuration` is created with `read=False` and then the
    bundled default configuration file is loaded into it via `set_file`.
    """
    fresh_config = confuse.Configuration(
        "PandasProfiling", __name__, read=False)
    self.config = fresh_config

    default_file = get_resource("configs/config_default.yaml")
    self.set_file(str(default_file))
from matplotlib.colors import LinearSegmentedColormap from matplotlib.patches import Patch import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D import numpy as np import pandas as pd from pandas.plotting import register_matplotlib_converters import seaborn as sns from sklearn.decomposition import PCA from pandas_profiling.config import config from pandas_profiling.utils.resources import get_resource from pandas_profiling.visualisation.utils import hex_to_rgb, plot_360_n0sc0pe register_matplotlib_converters() matplotlib.style.use(str(get_resource("styles/pandas_profiling.mplstyle"))) sns.set_style(style="white") def _plot_boxplot( series: np.ndarray, series_description: dict, figsize: tuple = (6, 4), ): """Plot a boxplot from the data and return the AxesSubplot object. Args: series: The data to plot figsize: The size of the figure (width, height) in inches, default (6,4) Returns: