def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    """Plot two series against each other as a scatter (or hexbin) plot.

    A hexbin plot replaces the plain scatter plot when ``series1`` holds more
    elements than the ``plot.scatter_threshold`` value read from ``config``.

    Examples:
        >>> widths = pd.Series([800, 1024])
        >>> heights = pd.Series([600, 768])
        >>> scatter_pairwise(widths, heights, "Width", "Height")

    Args:
        series1: the series corresponding to the x-axis
        series2: the series corresponding to the y-axis
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary = config["html"]["style"]["primary_color"].get(str)
    threshold = config["plot"]["scatter_threshold"].get(int)

    if len(series1) <= threshold:
        plt.scatter(series1.tolist(), series2.tolist(), color=primary)
    else:
        # Large inputs are binned so that overlapping points stay readable.
        palette = sns.light_palette(primary, as_cmap=True)
        plt.hexbin(series1.tolist(), series2.tolist(), gridsize=15, cmap=palette)
    return plot_360_n0sc0pe(plt)
def scatter_series(
    config: Settings,
    series: pd.Series,
    x_label: str = "Width",
    y_label: str = "Height",
) -> str:
    """Plot a series of length-2 sequences as a scatter (or hexbin) plot.

    Each element of ``series`` is unpacked into one coordinate pair. A hexbin
    plot is drawn when the series is longer than
    ``config.plot.scatter_threshold``; otherwise a scatter plot is drawn.

    Examples:
        >>> scatter_series(config, file_sizes, "Width", "Height")

    Args:
        config: report Settings object
        series: the Series
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary = config.html.style.primary_color
    # Transpose the list of pairs into per-axis sequences.
    coordinates = zip(*series.tolist())

    if len(series) <= config.plot.scatter_threshold:
        plt.scatter(*coordinates, color=primary)
    else:
        plt.hexbin(*coordinates, cmap=sns.light_palette(primary, as_cmap=True))
    return plot_360_n0sc0pe(config)
def scatter_pairwise(
    config: Settings,
    series1: pd.Series,
    series2: pd.Series,
    x_label: str,
    y_label: str,
) -> str:
    """Plot two series against each other as a scatter (or hexbin) plot.

    Only positions where both series are non-missing are drawn. The choice
    between hexbin and scatter is based on the unfiltered length of
    ``series1`` compared with ``config.plot.scatter_threshold``.

    Examples:
        >>> widths = pd.Series([800, 1024])
        >>> heights = pd.Series([600, 768])
        >>> scatter_pairwise(config, widths, heights, "Width", "Height")

    Args:
        config: Settings
        series1: the series corresponding to the x-axis
        series2: the series corresponding to the y-axis
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary = config.html.style.primary_color

    # Keep only the rows where both coordinates are present.
    both_present = series1.notna() & series2.notna()

    if len(series1) <= config.plot.scatter_threshold:
        plt.scatter(series1[both_present], series2[both_present], color=primary)
    else:
        palette = sns.light_palette(primary, as_cmap=True)
        plt.hexbin(
            series1[both_present],
            series2[both_present],
            gridsize=15,
            cmap=palette,
        )
    return plot_360_n0sc0pe(config)
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Plot a series of length-2 sequences as a scatter (or hexbin) plot.

    A hexbin plot is used when the series is longer than the
    ``plot.scatter_threshold`` value read from ``config``.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary = config["html"]["style"]["primary_color"].get(str)
    threshold = config["plot"]["scatter_threshold"].get(int)

    if len(series) <= threshold:
        # Transpose the list of pairs into per-axis sequences.
        plt.scatter(*zip(*series.tolist()), color=primary)
    else:
        palette = sns.light_palette(primary, as_cmap=True)
        plt.hexbin(*zip(*series.tolist()), cmap=palette)
    return plot_360_n0sc0pe(plt)
def scatter_complex(series: pd.Series) -> str:
    """Plot a series of complex values on the real/imaginary plane.

    ``series.real`` is drawn on the x-axis and ``series.imag`` on the y-axis.
    A hexbin plot is used when the series is longer than the
    ``plot.scatter_threshold`` value read from ``config``.

    Examples:
        >>> complex_series = pd.Series([complex(1, 3), complex(3, 1)])
        >>> scatter_complex(complex_series)

    Args:
        series: the Series

    Returns:
        A string containing (a reference to) the image
    """
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    primary = config["html"]["style"]["primary_color"].get(str)
    threshold = config["plot"]["scatter_threshold"].get(int)

    if len(series) <= threshold:
        plt.scatter(series.real, series.imag, color=primary)
    else:
        palette = sns.light_palette(primary, as_cmap=True)
        plt.hexbin(series.real, series.imag, cmap=palette)
    return plot_360_n0sc0pe(plt)
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Plot a series of length-2 sequences as a scatter (or hexbin) plot.

    Series with more than 1000 elements are drawn as a hexbin plot, shorter
    ones as a scatter plot. Both use the primary colour read from ``config``.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series whose elements unpack into coordinate pairs
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        The value returned by ``plot_360_n0sc0pe`` (annotated as ``str``).
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary = config["html"]["style"]["primary_color"].get(str)

    if len(series) <= 1000:
        plt.scatter(*zip(*series.tolist()), color=primary)
    else:
        palette = sns.light_palette(primary, as_cmap=True)
        plt.hexbin(*zip(*series.tolist()), cmap=palette)
    return plot_360_n0sc0pe(plt)
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Render a correlation matrix as a colour-mapped image.

    The colour map name comes from ``config["plot"]["correlation"]["cmap"]``.
    When ``vmin`` is 0 it is passed through ``get_cmap_half`` first.

    Args:
        data: The matrix correlation to plot.
        vmin: Minimum value of value range.

    Returns:
        The resulting correlation matrix encoded as a string.
    """
    _, axes_cor = plt.subplots()

    colour_map = config["plot"]["correlation"]["cmap"].get(str)
    if vmin == 0:
        colour_map = get_cmap_half(colour_map)

    labels = data.columns
    image = axes_cor.imshow(
        data, vmin=vmin, vmax=1, interpolation="nearest", cmap=colour_map
    )
    plt.colorbar(image)

    # Space the ticks so that there is one per column label along each axis.
    n_rows, n_cols = data.shape[0], data.shape[1]
    n_labels = len(labels)
    axes_cor.set_xticks(np.arange(0, n_rows, float(n_rows) / n_labels))
    axes_cor.set_yticks(np.arange(0, n_cols, float(n_cols) / n_labels))

    label_size = get_correlation_font_size(n_labels)
    axes_cor.set_xticklabels(labels, rotation=90, fontsize=label_size)
    axes_cor.set_yticklabels(labels, fontsize=label_size)
    plt.subplots_adjust(bottom=0.2)

    return plot_360_n0sc0pe(plt)
def mini_histogram(
    config: Settings,
    series: np.ndarray,
    bins: Union[int, np.ndarray],
    date: bool = False,
) -> str:
    """Plot a small (mini) histogram of the data.

    Args:
        config: Settings
        series: The data to plot.
        bins: number of bins (int for equal size, ndarray for variable size)
        date: forwarded to ``_plot_histogram``; when true the x tick labels
            use a smaller font and are rotated by 90 instead of 45 degrees.

    Returns:
        The resulting mini histogram encoded as a string.
    """
    # Date ticks get a smaller font and a steeper rotation.
    tick_font_size, tick_rotation = (6, 90) if date else (8, 45)

    plot = _plot_histogram(config, series, bins, figsize=(3, 2.25), date=date)
    plot.axes.get_yaxis().set_visible(False)
    plot.set_facecolor("w")

    for major_tick in plot.xaxis.get_major_ticks():
        major_tick.label1.set_fontsize(tick_font_size)
    plot.xaxis.set_tick_params(rotation=tick_rotation)
    plot.figure.tight_layout()

    return plot_360_n0sc0pe(config)
def scatter_complex(series) -> str:
    """Plot complex values on the real/imaginary plane.

    ``series.real`` goes on the x-axis and ``series.imag`` on the y-axis.
    Inputs with more than 1000 elements are drawn as a hexbin plot, shorter
    ones as a scatter plot.

    Args:
        series: object exposing ``real`` and ``imag`` and supporting ``len``.

    Returns:
        The value returned by ``plot_360_n0sc0pe`` (annotated as ``str``).
    """
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    draw = plt.hexbin if len(series) > 1000 else plt.scatter
    draw(series.real, series.imag)
    return plot_360_n0sc0pe(plt)
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Plot image of a matrix correlation.

    The colour map name and the colour used for invalid (NaN) cells are read
    from ``config["plot"]["correlation"]``. When ``data`` contains missing
    values, a legend entry explaining the "bad" colour is added.

    Args:
        data: The matrix correlation to plot.
        vmin: Minimum value of value range. When 0, the colour map is passed
            through ``get_cmap_half``.

    Returns:
        The resulting correlation matrix encoded as a string.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]):
        fig_cor, axes_cor = plt.subplots()
        cmap_name = config["plot"]["correlation"]["cmap"].get(str)
        cmap_bad = config["plot"]["correlation"]["bad"].get(str)

        cmap = plt.get_cmap(cmap_name)
        if vmin == 0:
            cmap = get_cmap_half(cmap)

        # Work on a private copy: set_bad() mutates the colormap in place and
        # plt.get_cmap() may hand back a shared, registered instance.
        cmap = copy.copy(cmap)
        cmap.set_bad(cmap_bad)

        labels = data.columns
        matrix_image = axes_cor.imshow(
            data, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap
        )
        cbar = plt.colorbar(matrix_image)
        cbar.outline.set_visible(False)

        if data.isnull().values.any():
            # Explain the colour used for cells without a valid coefficient.
            legend_elements = [
                Patch(facecolor=cmap(np.nan), label="invalid\ncoefficient")
            ]
            plt.legend(
                handles=legend_elements,
                loc="upper right",
                handleheight=2.5,
            )

        axes_cor.set_xticks(
            np.arange(0, data.shape[0], float(data.shape[0]) / len(labels)))
        axes_cor.set_yticks(
            np.arange(0, data.shape[1], float(data.shape[1]) / len(labels)))

        font_size = get_correlation_font_size(len(labels))
        axes_cor.set_xticklabels(labels, rotation=90, fontsize=font_size)
        axes_cor.set_yticklabels(labels, fontsize=font_size)
        plt.subplots_adjust(bottom=0.2)

        # NOTE(review): the flattened original does not show whether this
        # return sat inside the style context; kept inside - confirm.
        return plot_360_n0sc0pe(plt)
def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    """Plot two series against each other as a scatter (or hexbin) plot.

    When ``series1`` has more than 1000 elements a hexbin plot is drawn using
    a light palette derived from the primary colour in ``config``. Otherwise
    a scatter plot with matplotlib's default colour is drawn.

    Args:
        series1: the series corresponding to the x-axis
        series2: the series corresponding to the y-axis
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        The value returned by ``plot_360_n0sc0pe`` (annotated as ``str``).
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    if len(series1) <= 1000:
        plt.scatter(series1.tolist(), series2.tolist())
        return plot_360_n0sc0pe(plt)

    primary = config["html"]["style"]["primary_color"].get(str)
    palette = sns.light_palette(primary, as_cmap=True)
    plt.hexbin(series1.tolist(), series2.tolist(), gridsize=15, cmap=palette)
    return plot_360_n0sc0pe(plt)
def correlation_matrix(config: Settings, data: pd.DataFrame, vmin: int = -1) -> str:
    """Render a correlation matrix as a colour-mapped image.

    The colour map is ``config.plot.correlation.cmap``, passed through
    ``get_cmap_half`` when ``vmin`` is 0. NaN cells are drawn with
    ``config.plot.correlation.bad``, and a legend entry for that colour is
    added when ``data`` contains missing values.

    Args:
        config: Settings
        data: The matrix correlation to plot.
        vmin: Minimum value of value range.

    Returns:
        The resulting correlation matrix encoded as a string.
    """
    _, axes_cor = plt.subplots()

    colour_map = plt.get_cmap(config.plot.correlation.cmap)
    if vmin == 0:
        colour_map = get_cmap_half(colour_map)
    # set_bad() mutates the colormap, so operate on a copy.
    colour_map = copy.copy(colour_map)
    colour_map.set_bad(config.plot.correlation.bad)

    labels = data.columns
    image = axes_cor.imshow(
        data, vmin=vmin, vmax=1, interpolation="nearest", cmap=colour_map
    )
    plt.colorbar(image)

    if data.isnull().values.any():
        invalid_patch = Patch(
            facecolor=colour_map(np.nan), label="invalid\ncoefficient"
        )
        plt.legend(handles=[invalid_patch], loc="upper right", handleheight=2.5)

    # Space the ticks so that there is one per column label along each axis.
    n_rows, n_cols = data.shape[0], data.shape[1]
    n_labels = len(labels)
    axes_cor.set_xticks(np.arange(0, n_rows, float(n_rows) / n_labels))
    axes_cor.set_yticks(np.arange(0, n_cols, float(n_cols) / n_labels))

    label_size = get_correlation_font_size(n_labels)
    axes_cor.set_xticklabels(labels, rotation=90, fontsize=label_size)
    axes_cor.set_yticklabels(labels, fontsize=label_size)
    plt.subplots_adjust(bottom=0.2)

    return plot_360_n0sc0pe(config)
def cat_frequency_plot(
    config: Settings,
    data: pd.Series,
) -> str:
    """Generate category frequency plot to show category frequency.

    Works for boolean and categorical features.

    Modify colors by setting 'config.plot.cat_freq.colors' to a list of valid
    matplotlib colors:
    https://matplotlib.org/stable/tutorials/colors/colors.html

    Args:
        config (Settings): a profile report config
        data (pd.Series): category frequencies with category names as index

    Returns:
        str: encoded category frequency plot

    Raises:
        ValueError: if ``config.plot.cat_freq.type`` is neither ``"bar"`` nor
            ``"pie"``.
    """
    # Get colors; if not defined (or defined as an empty list, which would
    # otherwise cause a division by zero below), use the matplotlib defaults.
    colors = config.plot.cat_freq.colors
    if colors is None or len(colors) == 0:
        colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]

    # If there are more categories than colors, loop through the colors again
    if len(colors) < len(data):
        multiplier = len(data) // len(colors) + 1
        colors = multiplier * colors  # repeat colors as required
        colors = colors[0:len(data)]  # select the exact number required

    # Create the plot
    plot_type = config.plot.cat_freq.type
    if plot_type == "bar":
        plot, legend = _plot_stacked_barh(data, colors)
    elif plot_type == "pie":
        plot, legend = _plot_pie_chart(data, colors)
    else:
        msg = (f"'{plot_type}' is not a valid plot type! "
               "Expected values are ['bar', 'pie']")
        raise ValueError(msg)

    # The legend is passed as an extra artist so that a tight bounding box
    # does not crop it.
    return plot_360_n0sc0pe(
        config,
        bbox_extra_artists=[
            legend,
        ],
        bbox_inches="tight",
    )
def scatter_complex(series) -> str:
    """Plot complex values on the real/imaginary plane.

    ``series.real`` goes on the x-axis and ``series.imag`` on the y-axis,
    coloured with the primary colour read from ``config``. Inputs with more
    than 1000 elements are drawn as a hexbin plot, shorter ones as a scatter
    plot.

    Args:
        series: object exposing ``real`` and ``imag`` and supporting ``len``.

    Returns:
        The value returned by ``plot_360_n0sc0pe`` (annotated as ``str``).
    """
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    primary = config["html"]["style"]["primary_color"].get(str)

    if len(series) <= 1000:
        plt.scatter(series.real, series.imag, color=primary)
    else:
        palette = sns.light_palette(primary, as_cmap=True)
        plt.hexbin(series.real, series.imag, cmap=palette)
    return plot_360_n0sc0pe(plt)
def missing_dendrogram(data: pd.DataFrame) -> str:
    """Generate a dendrogram plot for missing values.

    Delegates to ``missingno.dendrogram`` with a font size of twice
    ``get_font_size(data)``, then adjusts the subplot margins.

    Args:
        data: Pandas DataFrame to generate missing values dendrogram plot from.

    Returns:
        The resulting missing values dendrogram plot encoded as a string.
    """
    label_size = get_font_size(data) * 2.0
    missingno.dendrogram(data, fontsize=label_size)

    margins = {"left": 0.1, "right": 0.9, "top": 0.7, "bottom": 0.2}
    plt.subplots_adjust(**margins)
    return plot_360_n0sc0pe(plt)
def pie_plot(data, legend_kws=None):
    """Draw a pie chart whose wedges show both percentage and absolute count.

    Args:
        data: values to plot; ``data.index.values`` supplies the legend
            labels.
        legend_kws: optional keyword arguments forwarded to ``plt.legend``.

    Returns:
        The value returned by ``plot_360_n0sc0pe``.
    """
    if legend_kws is None:
        legend_kws = {}

    def func(pct, allvals):
        # Round rather than truncate: the percentage handed to autopct is a
        # floating-point value, so converting it back to a count can land just
        # below the true integer (e.g. 6.9999999 instead of 7).
        absolute = int(round(pct / 100.0 * np.sum(allvals)))
        return f"{pct:.1f}%\n({absolute:d})"

    wedges, _, _ = plt.pie(
        data, autopct=lambda pct: func(pct, data), textprops={"color": "w"}
    )
    plt.legend(wedges, data.index.values, **legend_kws)

    return plot_360_n0sc0pe(plt)
def pie_plot(data, legend_kws=None):
    """Draw a pie chart whose wedges show both percentage and absolute count.

    Args:
        data: values to plot; ``data.index.values`` supplies the legend
            labels.
        legend_kws: optional keyword arguments forwarded to ``plt.legend``.

    Returns:
        The value returned by ``plot_360_n0sc0pe``.
    """
    if legend_kws is None:
        legend_kws = {}

    def func(pct, allvals):
        # Round rather than truncate: the percentage handed to autopct is a
        # floating-point value, so converting it back to a count can land just
        # below the true integer (e.g. 6.9999999 instead of 7).
        absolute = int(round(pct / 100.0 * np.sum(allvals)))
        return f"{pct:.1f}%\n({absolute:d})"

    wedges, _, _ = plt.pie(
        data,
        autopct=lambda pct: func(pct, data),
        textprops={"color": "w"},
    )
    plt.legend(wedges, data.index.values, **legend_kws)

    return plot_360_n0sc0pe(plt)
def histogram(series: np.ndarray, bins: Union[int, np.ndarray], date=False) -> str:
    """Plot an histogram of the data.

    Args:
        series: The data to plot.
        bins: number of bins (int for equal size, ndarray for variable size)
        date: forwarded to ``_plot_histogram``; when true the x tick labels
            are rotated by 90 instead of 45 degrees.

    Returns:
        The resulting histogram encoded as a string.
    """
    if date:
        tick_rotation = 90
    else:
        tick_rotation = 45

    axes = _plot_histogram(series, bins, date=date)
    axes.xaxis.set_tick_params(rotation=tick_rotation)
    axes.figure.tight_layout()
    return plot_360_n0sc0pe(plt)
def predictivity(data: pd.DataFrame) -> str:
    """Plot predictivity values as a grouped bar chart on a 0-100 scale.

    The columns to plot are taken from ``config["correlations"]["targets"]``.
    When that list is empty, all numeric columns of ``data`` are used. Values
    are rounded to two decimals, made absolute and scaled to integer
    percentages before plotting. Each bar is annotated with its height.

    Args:
        data: The matrix of predictivity values to plot.

    Returns:
        The resulting predictivity plot encoded as a string.
    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        target_variables = config["correlations"]["targets"].get()
        if len(target_variables) == 0:
            target_variables = list(
                data.select_dtypes(include=np.number).columns)

        # Default seaborn palette with its second and fourth colours swapped.
        palette = sns.color_palette().as_hex()
        palette[1], palette[3] = palette[3], palette[1]

        _, axes_pred = plt.subplots()
        axes_pred.set_ylim(0, 100)

        # Rescale in range [0, 100] for better visualization
        scores = (100 * data[target_variables].round(2).abs()).astype(int)

        # Barplot predictivity
        scores.plot.bar(
            figsize=(10, 6),
            width=0.8,
            legend=True,
            fontsize=get_predictivity_font_size(scores),
            rot=45,
            ax=axes_pred,
            color=palette,
        )

        # Write each bar's value above the top of the axis (y = 100).
        for bar in axes_pred.patches:
            anchor = (bar.get_x() + bar.get_width() / 2., 100)
            axes_pred.annotate(
                bar.get_height(),
                anchor,
                ha="center",
                va="center",
                xytext=(0, 15),
                textcoords="offset points",
                rotation=45,
            )

        plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.2)

        # NOTE(review): the flattened original does not show whether this
        # return sat inside the style context; kept inside - confirm.
        return plot_360_n0sc0pe(plt)
def histogram(series: np.ndarray, series_description: dict,
              bins: Union[int, np.ndarray]) -> str:
    """Plot an histogram of the data.

    Args:
        series: The data to plot.
        series_description: forwarded unchanged to ``_plot_histogram``.
        bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
        The resulting histogram encoded as a string.
    """
    axes = _plot_histogram(series, series_description, bins)
    # Tilt the x tick labels.
    axes.xaxis.set_tick_params(rotation=45)
    axes.figure.tight_layout()
    return plot_360_n0sc0pe(plt)
def missing_dendrogram(data: pd.DataFrame) -> str:
    """Generate a dendrogram plot for missing values.

    The plot is produced by ``missingno.dendrogram`` under a temporary
    matplotlib style context, with a font size of twice
    ``get_font_size(data)``.

    Args:
        data: Pandas DataFrame to generate missing values dendrogram plot from.

    Returns:
        The resulting missing values dendrogram plot encoded as a string.
    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        label_size = get_font_size(data) * 2.0
        missingno.dendrogram(data, fontsize=label_size)
        plt.subplots_adjust(left=0.1, right=0.9, top=0.7, bottom=0.2)

        # NOTE(review): the flattened original does not show whether this
        # return sat inside the style context; kept inside - confirm.
        return plot_360_n0sc0pe(plt)
def clustermap(data: pd.DataFrame) -> str:
    """Plot a clustermap of the data.

    The figure is drawn by ``_plot_clustermap`` under a temporary matplotlib
    style context.

    Args:
        data: The data to plot.

    Returns:
        The resulting clustermap encoded as a string.
    """
    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]):
        # Called for its drawing side effect; its return value is not needed.
        _plot_clustermap(data)

        # NOTE(review): the flattened original does not show whether this
        # return sat inside the style context; kept inside - confirm.
        return plot_360_n0sc0pe(plt)
def boxplot(series: np.ndarray, series_description: dict) -> str:
    """Plot a boxplot of the data.

    The figure is drawn by ``_plot_boxplot`` under a temporary matplotlib
    style context.

    Args:
        series: The data to plot.
        series_description: forwarded unchanged to ``_plot_boxplot``.

    Returns:
        The resulting boxplot encoded as a string.
    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        axes = _plot_boxplot(series, series_description)
        axes.figure.tight_layout()

        # NOTE(review): the flattened original does not show whether this
        # return sat inside the style context; kept inside - confirm.
        return plot_360_n0sc0pe(plt)
def missing_matrix(data: pd.DataFrame) -> str:
    """Generate missing values matrix plot

    Delegates to ``missingno.matrix`` with the primary colour and the
    ``force_labels`` flag read from ``config``.

    Args:
        data: Pandas DataFrame to generate missing values matrix from.

    Returns:
        The resulting missing values matrix encoded as a string.
    """
    force_labels = config["plot"]["missing"]["force_labels"].get(bool)
    primary_rgb = hex_to_rgb(config["html"]["style"]["primary_color"].get(str))
    label_size = get_font_size(data) / 20 * 16

    missingno.matrix(
        data,
        figsize=(10, 4),
        color=primary_rgb,
        fontsize=label_size,
        sparkline=False,
        labels=force_labels,
    )
    plt.subplots_adjust(left=0.1, right=0.9, top=0.7, bottom=0.2)
    return plot_360_n0sc0pe(plt)
def pie_plot(config: Settings,
             data: pd.Series,
             legend_kws: Optional[dict] = None) -> str:
    """Draw a pie chart whose wedges show both percentage and absolute count.

    Args:
        config: Settings, forwarded to ``plot_360_n0sc0pe``.
        data: values to plot; ``data.index.values`` supplies the legend
            labels.
        legend_kws: optional keyword arguments forwarded to ``plt.legend``.

    Returns:
        The value returned by ``plot_360_n0sc0pe`` (annotated as ``str``).
    """
    legend_options = {} if legend_kws is None else legend_kws

    def wedge_label(pct: float) -> str:
        # Turn the wedge percentage back into the (rounded) absolute count.
        total = np.sum(data)
        val = int(round(pct * total / 100.0))
        return f"{pct:.1f}% ({val:d})"

    wedges, _, _ = plt.pie(data, autopct=wedge_label, textprops={"color": "w"})
    plt.legend(wedges, data.index.values, **legend_options)

    return plot_360_n0sc0pe(config)
def histogram(series: np.ndarray, series_description: dict,
              bins: Union[int, np.ndarray]) -> str:
    """Plot an histogram of the data.

    The figure is drawn by ``_plot_histogram`` under a temporary matplotlib
    style context.

    Args:
        series: The data to plot.
        series_description: forwarded unchanged to ``_plot_histogram``.
        bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
        The resulting histogram encoded as a string.
    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        axes = _plot_histogram(series, series_description, bins)
        # Tilt the x tick labels.
        axes.xaxis.set_tick_params(rotation=45)
        axes.figure.tight_layout()

        # NOTE(review): the flattened original does not show whether this
        # return sat inside the style context; kept inside - confirm.
        return plot_360_n0sc0pe(plt)
def missing_matrix(config: Settings, data: pd.DataFrame) -> str:
    """Generate missing values matrix plot

    Delegates to ``missingno.matrix`` using the primary colour and the
    ``force_labels`` flag from ``config``.

    Args:
        config: Settings
        data: Pandas DataFrame to generate missing values matrix from.

    Returns:
        The resulting missing values matrix encoded as a string.
    """
    label_size = get_font_size(data) / 20 * 16
    primary_rgb = hex_to_rgb(config.html.style.primary_color)
    force_labels = config.plot.missing.force_labels

    missingno.matrix(
        data,
        figsize=(10, 4),
        fontsize=label_size,
        sparkline=False,
        color=primary_rgb,
        labels=force_labels,
    )
    plt.subplots_adjust(left=0.1, right=0.9, top=0.7, bottom=0.2)
    return plot_360_n0sc0pe(config)
def missing_bar(data: pd.DataFrame) -> str:
    """Generate missing values bar plot.

    Delegates to ``missingno.bar`` with the primary colour and the
    ``force_labels`` flag read from ``config``, then switches off the grid on
    every axes of the current figure.

    Args:
        data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
        The resulting missing values bar plot encoded as a string.
    """
    force_labels = config["plot"]["missing"]["force_labels"].get(bool)
    primary_rgb = hex_to_rgb(config["html"]["style"]["primary_color"].get(str))
    label_size = get_font_size(data)

    missingno.bar(
        data,
        figsize=(10, 5),
        color=primary_rgb,
        fontsize=label_size,
        labels=force_labels,
    )

    for axes in plt.gcf().get_axes():
        axes.grid(False)

    plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.3)
    return plot_360_n0sc0pe(plt)
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Plot a series of length-2 sequences as a scatter (or hexbin) plot.

    Series with more than 1000 elements are drawn as a hexbin plot, shorter
    ones as a scatter plot, both with matplotlib's default colours.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series whose elements unpack into coordinate pairs
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        The value returned by ``plot_360_n0sc0pe`` (annotated as ``str``).
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    draw = plt.hexbin if len(series) > 1000 else plt.scatter
    # Transpose the list of pairs into per-axis sequences.
    draw(*zip(*series.tolist()))
    return plot_360_n0sc0pe(plt)
def missing_bar(config: Settings, data: pd.DataFrame) -> str:
    """Generate missing values bar plot.

    Delegates to ``missingno.bar`` using the primary colour and the
    ``force_labels`` flag from ``config``, then switches off the grid on every
    axes of the current figure.

    Args:
        config: Settings
        data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
        The resulting missing values bar plot encoded as a string.
    """
    label_size = get_font_size(data)
    primary_rgb = hex_to_rgb(config.html.style.primary_color)
    force_labels = config.plot.missing.force_labels

    missingno.bar(
        data,
        figsize=(10, 5),
        fontsize=label_size,
        color=primary_rgb,
        labels=force_labels,
    )

    for axes in plt.gcf().get_axes():
        axes.grid(False)

    plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.3)
    return plot_360_n0sc0pe(config)