Exemplo n.º 1
0
def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    """Plot one series against another as a scatter plot or a hexbin plot.

    A hexbin plot is drawn when ``series1`` has more entries than the
    configured ``plot.scatter_threshold``; otherwise a scatter plot is drawn.

    Examples:
        >>> widths = pd.Series([800, 1024])
        >>> heights = pd.Series([600, 768])
        >>> scatter_pairwise(widths, heights, "Width", "Height")

    Args:
        series1: values plotted along the x-axis
        series2: values plotted along the y-axis
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary_color = config["html"]["style"]["primary_color"].get(str)
    max_scatter_points = config["plot"]["scatter_threshold"].get(int)

    x_values = series1.tolist()
    y_values = series2.tolist()
    if len(series1) <= max_scatter_points:
        plt.scatter(x_values, y_values, color=primary_color)
    else:
        # Above the threshold the points are binned into hexagons instead.
        palette = sns.light_palette(primary_color, as_cmap=True)
        plt.hexbin(x_values, y_values, gridsize=15, cmap=palette)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 2
0
def scatter_series(config: Settings,
                   series: pd.Series,
                   x_label: str = "Width",
                   y_label: str = "Height") -> str:
    """Plot a series of length-2 sequences as a scatter plot or a hexbin plot.

    A hexbin plot is drawn when the series has more entries than
    ``config.plot.scatter_threshold``; otherwise a scatter plot is drawn.

    Examples:
        >>> scatter_series(config, file_sizes, "Width", "Height")

    Args:
        config: report Settings object
        series: the Series; each entry is a sequence of length 2
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary_color = config.html.style.primary_color

    # Transpose the pairs into one sequence per axis.
    coordinates = zip(*series.tolist())
    if len(series) <= config.plot.scatter_threshold:
        plt.scatter(*coordinates, color=primary_color)
    else:
        palette = sns.light_palette(primary_color, as_cmap=True)
        plt.hexbin(*coordinates, cmap=palette)
    return plot_360_n0sc0pe(config)
Exemplo n.º 3
0
def scatter_pairwise(config: Settings, series1: pd.Series, series2: pd.Series,
                     x_label: str, y_label: str) -> str:
    """Plot one series against another as a scatter plot or a hexbin plot.

    Positions where either series is missing are left out of the plot.
    The choice between scatter and hexbin is made on the full length of
    ``series1`` (missing values included) compared with
    ``config.plot.scatter_threshold``.

    Examples:
        >>> widths = pd.Series([800, 1024])
        >>> heights = pd.Series([600, 768])
        >>> scatter_pairwise(config, widths, heights, "Width", "Height")

    Args:
        config: Settings
        series1: values plotted along the x-axis
        series2: values plotted along the y-axis
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary_color = config.html.style.primary_color

    # Boolean mask: True where both series hold a value.
    both_present = series1.notna() & series2.notna()
    x_values = series1[both_present]
    y_values = series2[both_present]

    if len(series1) <= config.plot.scatter_threshold:
        plt.scatter(x_values, y_values, color=primary_color)
    else:
        palette = sns.light_palette(primary_color, as_cmap=True)
        plt.hexbin(x_values, y_values, gridsize=15, cmap=palette)
    return plot_360_n0sc0pe(config)
Exemplo n.º 4
0
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Plot a series of length-2 sequences as a scatter plot or a hexbin plot.

    A hexbin plot is drawn when the series has more entries than the
    configured ``plot.scatter_threshold``; otherwise a scatter plot is drawn.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series; each entry is a sequence of length 2
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary_color = config["html"]["style"]["primary_color"].get(str)
    max_scatter_points = config["plot"]["scatter_threshold"].get(int)

    if len(series) <= max_scatter_points:
        # Transpose the pairs into one sequence per axis.
        coordinates = zip(*series.tolist())
        plt.scatter(*coordinates, color=primary_color)
    else:
        palette = sns.light_palette(primary_color, as_cmap=True)
        coordinates = zip(*series.tolist())
        plt.hexbin(*coordinates, cmap=palette)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 5
0
def scatter_complex(series: pd.Series) -> str:
    """Plot complex values in the plane as a scatter plot or a hexbin plot.

    The real part goes on the x-axis and the imaginary part on the y-axis.
    A hexbin plot is drawn when the series has more entries than the
    configured ``plot.scatter_threshold``; otherwise a scatter plot is drawn.

    Examples:
        >>> complex_series = pd.Series([complex(1, 3), complex(3, 1)])
        >>> scatter_complex(complex_series)

    Args:
        series: the Series of complex values

    Returns:
        A string containing (a reference to) the image
    """
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    primary_color = config["html"]["style"]["primary_color"].get(str)
    max_scatter_points = config["plot"]["scatter_threshold"].get(int)

    if len(series) <= max_scatter_points:
        plt.scatter(series.real, series.imag, color=primary_color)
    else:
        palette = sns.light_palette(primary_color, as_cmap=True)
        plt.hexbin(series.real, series.imag, cmap=palette)

    return plot_360_n0sc0pe(plt)
Exemplo n.º 6
0
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Plot a series of pairs as a scatter plot or a hexbin plot.

    Each entry of ``series`` is unpacked into an x and a y value. A hexbin
    plot is drawn when the series has more than 1000 entries; otherwise a
    scatter plot is drawn. Both use the configured primary colour.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series of pairs
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    primary_color = config["html"]["style"]["primary_color"].get(str)

    if len(series) <= 1000:
        # Transpose the pairs into one sequence per axis.
        coordinates = zip(*series.tolist())
        plt.scatter(*coordinates, color=primary_color)
    else:
        palette = sns.light_palette(primary_color, as_cmap=True)
        coordinates = zip(*series.tolist())
        plt.hexbin(*coordinates, cmap=palette)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 7
0
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Render a correlation matrix as a colour-coded image.

    Args:
      data: The correlation matrix to plot; its columns supply the tick labels.
      vmin: Lower bound of the colour range. When it is 0 the configured
        colormap is passed through ``get_cmap_half``.

    Returns:
      The resulting correlation matrix encoded as a string.
    """
    _, axes = plt.subplots()

    colormap = config["plot"]["correlation"]["cmap"].get(str)
    if vmin == 0:
        colormap = get_cmap_half(colormap)

    column_names = data.columns
    image = axes.imshow(
        data, cmap=colormap, interpolation="nearest", vmin=vmin, vmax=1
    )
    plt.colorbar(image)

    # One tick per label along each axis.
    x_step = float(data.shape[0]) / len(column_names)
    axes.set_xticks(np.arange(0, data.shape[0], x_step))
    y_step = float(data.shape[1]) / len(column_names)
    axes.set_yticks(np.arange(0, data.shape[1], y_step))

    tick_font_size = get_correlation_font_size(len(column_names))
    axes.set_xticklabels(column_names, rotation=90, fontsize=tick_font_size)
    axes.set_yticklabels(column_names, fontsize=tick_font_size)
    plt.subplots_adjust(bottom=0.2)

    return plot_360_n0sc0pe(plt)
Exemplo n.º 8
0
def mini_histogram(
    config: Settings,
    series: np.ndarray,
    bins: Union[int, np.ndarray],
    date: bool = False,
) -> str:
    """Plot a small (mini) histogram of the data.

    The y-axis is hidden and the background is white. X tick labels use
    font size 6 and a 90 degree rotation when ``date`` is true, and font
    size 8 and a 45 degree rotation otherwise.

    Args:
      config: Settings
      series: The data to plot.
      bins: number of bins (int for equal size, ndarray for variable size)
      date: forwarded to ``_plot_histogram``; also selects the tick label
        size and rotation described above.

    Returns:
      The resulting mini histogram encoded as a string.
    """
    axis = _plot_histogram(config, series, bins, figsize=(3, 2.25), date=date)
    axis.axes.get_yaxis().set_visible(False)
    axis.set_facecolor("w")

    if date:
        label_size, label_rotation = 6, 90
    else:
        label_size, label_rotation = 8, 45

    for major_tick in axis.xaxis.get_major_ticks():
        major_tick.label1.set_fontsize(label_size)
    axis.xaxis.set_tick_params(rotation=label_rotation)
    axis.figure.tight_layout()

    return plot_360_n0sc0pe(config)
Exemplo n.º 9
0
def scatter_complex(series) -> str:
    """Plot complex values in the plane as a scatter plot or a hexbin plot.

    The real part goes on the x-axis and the imaginary part on the y-axis.
    A hexbin plot is drawn when the series has more than 1000 entries;
    otherwise a scatter plot is drawn.

    Args:
        series: the series of complex values (must expose ``real``/``imag``)

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    use_scatter = len(series) <= 1000
    if use_scatter:
        plt.scatter(series.real, series.imag)
    else:
        plt.hexbin(series.real, series.imag)

    return plot_360_n0sc0pe(plt)
Exemplo n.º 10
0
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Plot image of a matrix correlation.

    The plot is drawn inside a matplotlib style context made of
    "seaborn-ticks" and the packaged ``pandas_profiling_frame.mplstyle``.
    Cells without a valid value are painted in the configured "bad" colour
    and, when any are present, a legend entry explains them.

    Args:
      data: The matrix correlation to plot; its columns supply the tick labels.
      vmin: Minimum value of value range. When it is 0 the colormap is
        passed through ``get_cmap_half``.

    Returns:
      The resulting correlation matrix encoded as a string.
    """
    # Local import keeps this block self-contained.
    import copy

    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        fig_cor, axes_cor = plt.subplots()
        cmap_name = config["plot"]["correlation"]["cmap"].get(str)
        cmap_bad = config["plot"]["correlation"]["bad"].get(str)

        cmap = plt.get_cmap(cmap_name)
        if vmin == 0:
            cmap = get_cmap_half(cmap)
        # plt.get_cmap may hand back the shared, registered colormap object;
        # work on a copy so set_bad does not leak into every other plot that
        # uses the same colormap.
        cmap = copy.copy(cmap)
        cmap.set_bad(cmap_bad)

        labels = data.columns
        matrix_image = axes_cor.imshow(data,
                                       vmin=vmin,
                                       vmax=1,
                                       interpolation="nearest",
                                       cmap=cmap)
        cbar = plt.colorbar(matrix_image)
        cbar.outline.set_visible(False)

        if data.isnull().values.any():
            # cmap(np.nan) resolves to the "bad" colour set above.
            legend_elements = [
                Patch(facecolor=cmap(np.nan), label="invalid\ncoefficient")
            ]

            plt.legend(
                handles=legend_elements,
                loc="upper right",
                handleheight=2.5,
            )

        # One tick per label along each axis.
        axes_cor.set_xticks(
            np.arange(0, data.shape[0],
                      float(data.shape[0]) / len(labels)))
        axes_cor.set_yticks(
            np.arange(0, data.shape[1],
                      float(data.shape[1]) / len(labels)))

        font_size = get_correlation_font_size(len(labels))
        axes_cor.set_xticklabels(labels, rotation=90, fontsize=font_size)
        axes_cor.set_yticklabels(labels, fontsize=font_size)
        plt.subplots_adjust(bottom=0.2)
        return plot_360_n0sc0pe(plt)
Exemplo n.º 11
0
def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    """Plot one series against another as a scatter plot or a hexbin plot.

    A hexbin plot, tinted with the configured primary colour, is drawn when
    ``series1`` has more than 1000 entries; otherwise a scatter plot in
    matplotlib's default colour is drawn.

    Args:
        series1: values plotted along the x-axis
        series2: values plotted along the y-axis
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    if len(series1) <= 1000:
        plt.scatter(series1.tolist(), series2.tolist())
    else:
        # The colour is only looked up on the hexbin path.
        primary_color = config["html"]["style"]["primary_color"].get(str)
        palette = sns.light_palette(primary_color, as_cmap=True)
        plt.hexbin(series1.tolist(), series2.tolist(), gridsize=15, cmap=palette)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 12
0
def correlation_matrix(config: Settings,
                       data: pd.DataFrame,
                       vmin: int = -1) -> str:
    """Render a correlation matrix as a colour-coded image.

    Cells without a valid value are painted in ``config.plot.correlation.bad``
    and, when any are present, a legend entry explains them.

    Args:
      config: Settings
      data: The correlation matrix to plot; its columns supply the tick labels.
      vmin: Lower bound of the colour range. When it is 0 the colormap is
        passed through ``get_cmap_half``.

    Returns:
      The resulting correlation matrix encoded as a string.
    """
    _, axes = plt.subplots()

    colormap = plt.get_cmap(config.plot.correlation.cmap)
    if vmin == 0:
        colormap = get_cmap_half(colormap)
    # set_bad is applied to a copy, not to the object returned above.
    colormap = copy.copy(colormap)
    colormap.set_bad(config.plot.correlation.bad)

    column_names = data.columns
    image = axes.imshow(
        data, cmap=colormap, interpolation="nearest", vmin=vmin, vmax=1
    )
    plt.colorbar(image)

    has_missing = data.isnull().values.any()
    if has_missing:
        invalid_patch = Patch(facecolor=colormap(np.nan),
                              label="invalid\ncoefficient")
        plt.legend(handles=[invalid_patch],
                   loc="upper right",
                   handleheight=2.5)

    # One tick per label along each axis.
    x_step = float(data.shape[0]) / len(column_names)
    axes.set_xticks(np.arange(0, data.shape[0], x_step))
    y_step = float(data.shape[1]) / len(column_names)
    axes.set_yticks(np.arange(0, data.shape[1], y_step))

    tick_font_size = get_correlation_font_size(len(column_names))
    axes.set_xticklabels(column_names, rotation=90, fontsize=tick_font_size)
    axes.set_yticklabels(column_names, fontsize=tick_font_size)
    plt.subplots_adjust(bottom=0.2)

    return plot_360_n0sc0pe(config)
Exemplo n.º 13
0
def cat_frequency_plot(
    config: Settings,
    data: pd.Series,
) -> str:
    """Generate category frequency plot to show category frequency.
    Works for boolean and categorical features.

    Modify colors by setting 'config.plot.cat_freq.colors' to a
    list of valid matplotlib colors:
    https://matplotlib.org/stable/tutorials/colors/colors.html

    Args:
        config (Settings): a profile report config
        data (pd.Series): category frequencies with category names as index

    Returns:
        str: encoded category frequency plot

    Raises:
        ValueError: if 'config.plot.cat_freq.type' is neither 'bar' nor 'pie'
    """
    # Get colors, if not defined, use matplotlib defaults
    colors = config.plot.cat_freq.colors
    if colors is None:
        # Get matplotlib defaults
        colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]

    # If there are more categories than colors, loop through the colors again
    if len(colors) < len(data):
        multiplier = len(data) // len(colors) + 1
        # repeat colors as required, then keep exactly one per category
        colors = (multiplier * colors)[:len(data)]

    # Create the plot
    plot_type = config.plot.cat_freq.type
    if plot_type == "bar":
        plot, legend = _plot_stacked_barh(data, colors)

    elif plot_type == "pie":
        plot, legend = _plot_pie_chart(data, colors)

    else:
        msg = (f"'{plot_type}' is not a valid plot type! "
               "Expected values are ['bar', 'pie']")
        raise ValueError(msg)

    # The legend is passed as an extra artist so the tight bounding box
    # takes it into account.
    return plot_360_n0sc0pe(
        config,
        bbox_extra_artists=[
            legend,
        ],
        bbox_inches="tight",
    )
Exemplo n.º 14
0
def scatter_complex(series) -> str:
    """Plot complex values in the plane as a scatter plot or a hexbin plot.

    The real part goes on the x-axis and the imaginary part on the y-axis.
    A hexbin plot is drawn when the series has more than 1000 entries;
    otherwise a scatter plot is drawn. Both use the configured primary colour.

    Args:
        series: the series of complex values (must expose ``real``/``imag``)

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    primary_color = config["html"]["style"]["primary_color"].get(str)

    if len(series) <= 1000:
        plt.scatter(series.real, series.imag, color=primary_color)
    else:
        palette = sns.light_palette(primary_color, as_cmap=True)
        plt.hexbin(series.real, series.imag, cmap=palette)

    return plot_360_n0sc0pe(plt)
def missing_dendrogram(data: pd.DataFrame) -> str:
    """Draw the missingno dendrogram of missing values for a DataFrame.

    Args:
      data: Pandas DataFrame to generate missing values dendrogram plot from.

    Returns:
      The resulting missing values dendrogram plot encoded as a string.

    """
    # The label font is twice the size returned by get_font_size.
    label_size = get_font_size(data) * 2.0
    missingno.dendrogram(data, fontsize=label_size)
    plt.subplots_adjust(bottom=0.2, top=0.7, left=0.1, right=0.9)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 16
0
def pie_plot(data, legend_kws=None):
    """Plot a pie chart with each wedge labelled by percentage and count.

    Args:
        data: values to plot; ``data.index.values`` supplies the legend labels.
        legend_kws: optional keyword arguments forwarded to ``plt.legend``.

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    if legend_kws is None:
        legend_kws = {}

    def func(pct, allvals):
        # pct is a floating-point percentage, so pct / 100 * total can land
        # just below the true integer count; round instead of truncating so
        # the label does not show a count that is one too low.
        absolute = int(round(pct / 100.0 * np.sum(allvals)))
        return f"{pct:.1f}%\n({absolute:d})"

    wedges, _, _ = plt.pie(
        data, autopct=lambda pct: func(pct, data), textprops={"color": "w"}
    )
    plt.legend(wedges, data.index.values, **legend_kws)

    return plot_360_n0sc0pe(plt)
Exemplo n.º 17
0
def pie_plot(data, legend_kws=None):
    """Plot a pie chart with each wedge labelled by percentage and count.

    Args:
        data: values to plot; ``data.index.values`` supplies the legend labels.
        legend_kws: optional keyword arguments forwarded to ``plt.legend``.

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    if legend_kws is None:
        legend_kws = {}

    def func(pct, allvals):
        # pct is a floating-point percentage, so pct / 100 * total can land
        # just below the true integer count; round instead of truncating so
        # the label does not show a count that is one too low.
        absolute = int(round(pct / 100.0 * np.sum(allvals)))
        return "{:.1f}%\n({:d})".format(pct, absolute)

    wedges, _, _ = plt.pie(data,
                           autopct=lambda pct: func(pct, data),
                           textprops=dict(color="w"))
    plt.legend(wedges, data.index.values, **legend_kws)

    return plot_360_n0sc0pe(plt)
Exemplo n.º 18
0
def histogram(series: np.ndarray, bins: Union[int, np.ndarray], date=False) -> str:
    """Plot a histogram of the data.

    Args:
      series: The data to plot.
      bins: number of bins (int for equal size, ndarray for variable size)
      date: forwarded to ``_plot_histogram``; x tick labels are rotated by
        90 degrees when true and by 45 degrees otherwise.

    Returns:
      The resulting histogram encoded as a string.

    """
    axis = _plot_histogram(series, bins, date=date)

    if date:
        tick_rotation = 90
    else:
        tick_rotation = 45
    axis.xaxis.set_tick_params(rotation=tick_rotation)

    axis.figure.tight_layout()
    return plot_360_n0sc0pe(plt)
Exemplo n.º 19
0
def predictivity(data: pd.DataFrame) -> str:
    """Plot predictivity scores as a grouped bar chart.

    The target columns are read from ``config["correlations"]["targets"]``;
    when that list is empty, every numeric column of ``data`` is used.
    Scores are shown as absolute values rescaled to integer percentages,
    and each bar is annotated with its height.

    Args:
      data: The matrix of predictivity scores to plot.

    Returns:
      The resulting predictivity plot encoded as a string.
    """
    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        target_variables = config["correlations"]["targets"].get()
        if len(target_variables) == 0:
            target_variables = list(
                data.select_dtypes(include=np.number).columns)

        # Default seaborn palette with its second and fourth colours swapped.
        palette = sns.color_palette().as_hex()
        palette[1], palette[3] = palette[3], palette[1]

        fig_pred, axes_pred = plt.subplots()
        axes_pred.set_ylim(0, 100)

        # Rescale in range [0, 100] for better visualization.
        # Round before casting: astype(int) truncates, and products such as
        # 100 * 0.29 evaluate to 28.999..., which would be displayed as 28.
        scores = (100 *
                  data[target_variables].round(2).abs()).round().astype(int)

        # Barplot predictivity
        scores.plot.bar(
            figsize=(10, 6),
            width=0.8,
            legend=True,
            fontsize=get_predictivity_font_size(scores),
            rot=45,
            ax=axes_pred,
            color=palette)

        # Write each bar's value above the plot area (y = 100 plus offset).
        for patch in axes_pred.patches:
            axes_pred.annotate(patch.get_height(),
                               (patch.get_x() + patch.get_width() / 2., 100),
                               ha="center",
                               va="center",
                               xytext=(0, 15),
                               textcoords="offset points",
                               rotation=45)
        plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.2)
        return plot_360_n0sc0pe(plt)
Exemplo n.º 20
0
def histogram(series: np.ndarray, series_description: dict,
              bins: Union[int, np.ndarray]) -> str:
    """Plot a histogram of the data with x tick labels rotated by 45 degrees.

    Args:
      series: The data to plot.
      series_description: forwarded unchanged to ``_plot_histogram``.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting histogram encoded as a string.

    """
    axis = _plot_histogram(series, series_description, bins)
    x_axis = axis.xaxis
    x_axis.set_tick_params(rotation=45)
    axis.figure.tight_layout()
    return plot_360_n0sc0pe(plt)
Exemplo n.º 21
0
def missing_dendrogram(data: pd.DataFrame) -> str:
    """Draw the missingno dendrogram of missing values for a DataFrame.

    The plot is drawn inside a matplotlib style context made of
    "seaborn-ticks" and the packaged ``pandas_profiling_frame.mplstyle``.

    Args:
      data: Pandas DataFrame to generate missing values dendrogram plot from.

    Returns:
      The resulting missing values dendrogram plot encoded as a string.

    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        # The label font is twice the size returned by get_font_size.
        label_size = get_font_size(data) * 2.0
        missingno.dendrogram(data, fontsize=label_size)
        plt.subplots_adjust(bottom=0.2, top=0.7, left=0.1, right=0.9)
        return plot_360_n0sc0pe(plt)
Exemplo n.º 22
0
def clustermap(data: pd.DataFrame) -> str:
    """Plot a clustermap of the data.

    The plot is drawn inside a matplotlib style context made of
    "seaborn-ticks" and the packaged ``pandas_profiling_frame.mplstyle``.

    Args:
      data: The data to plot.

    Returns:
      The resulting clustermap encoded as a string.

    """
    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        # Called for its drawing side effect; the return value is not needed.
        _plot_clustermap(data)
        return plot_360_n0sc0pe(plt)
Exemplo n.º 23
0
def boxplot(series: np.ndarray, series_description: dict) -> str:
    """Plot a boxplot of the data.

    The plot is drawn inside a matplotlib style context made of
    "seaborn-ticks" and the packaged ``pandas_profiling_frame.mplstyle``.

    Args:
      series: The data to plot.
      series_description: forwarded unchanged to ``_plot_boxplot``.

    Returns:
      The resulting boxplot encoded as a string.

    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        axis = _plot_boxplot(series, series_description)
        axis.figure.tight_layout()
        return plot_360_n0sc0pe(plt)
def missing_matrix(data: pd.DataFrame) -> str:
    """Draw the missingno matrix of missing values for a DataFrame.

    The colour comes from the configured primary colour and the ``labels``
    argument from the ``plot.missing.force_labels`` setting.

    Args:
      data: Pandas DataFrame to generate missing values matrix from.

    Returns:
      The resulting missing values matrix encoded as a string.
    """
    force_labels = config["plot"]["missing"]["force_labels"].get(bool)
    primary_rgb = hex_to_rgb(config["html"]["style"]["primary_color"].get(str))
    label_size = get_font_size(data) / 20 * 16

    missingno.matrix(
        data,
        figsize=(10, 4),
        color=primary_rgb,
        fontsize=label_size,
        sparkline=False,
        labels=force_labels,
    )
    plt.subplots_adjust(bottom=0.2, top=0.7, left=0.1, right=0.9)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 25
0
def pie_plot(config: Settings,
             data: pd.Series,
             legend_kws: Optional[dict] = None) -> str:
    """Plot a pie chart with each wedge labelled by percentage and count.

    Args:
        config: Settings, passed on to ``plot_360_n0sc0pe``.
        data: values to plot; the index supplies the legend labels.
        legend_kws: optional keyword arguments forwarded to ``plt.legend``.

    Returns:
        The value returned by ``plot_360_n0sc0pe``.
    """
    legend_options = {} if legend_kws is None else legend_kws

    def wedge_label(pct: float) -> str:
        # Convert the percentage back to a count, rounded to the nearest int.
        count = int(round(pct * np.sum(data) / 100.0))
        return f"{pct:.1f}%  ({count:d})"

    pie_parts = plt.pie(data, autopct=wedge_label, textprops={"color": "w"})
    wedges, _, _ = pie_parts
    plt.legend(wedges, data.index.values, **legend_options)

    return plot_360_n0sc0pe(config)
Exemplo n.º 26
0
def histogram(series: np.ndarray, series_description: dict,
              bins: Union[int, np.ndarray]) -> str:
    """Plot a histogram of the data with x tick labels rotated by 45 degrees.

    The plot is drawn inside a matplotlib style context made of
    "seaborn-ticks" and the packaged ``pandas_profiling_frame.mplstyle``.

    Args:
      series: The data to plot.
      series_description: forwarded unchanged to ``_plot_histogram``.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting histogram encoded as a string.

    """
    style_sheets = [
        "seaborn-ticks",
        str(get_resource("styles/pandas_profiling_frame.mplstyle")),
    ]
    with matplotlib.style.context(style_sheets):
        axis = _plot_histogram(series, series_description, bins)
        x_axis = axis.xaxis
        x_axis.set_tick_params(rotation=45)
        axis.figure.tight_layout()
        return plot_360_n0sc0pe(plt)
Exemplo n.º 27
0
def missing_matrix(config: Settings, data: pd.DataFrame) -> str:
    """Draw the missingno matrix of missing values for a DataFrame.

    The colour comes from ``config.html.style.primary_color`` and the
    ``labels`` argument from ``config.plot.missing.force_labels``.

    Args:
      config: Settings
      data: Pandas DataFrame to generate missing values matrix from.

    Returns:
      The resulting missing values matrix encoded as a string.
    """
    label_size = get_font_size(data) / 20 * 16
    primary_rgb = hex_to_rgb(config.html.style.primary_color)
    force_labels = config.plot.missing.force_labels

    missingno.matrix(
        data,
        figsize=(10, 4),
        fontsize=label_size,
        sparkline=False,
        color=primary_rgb,
        labels=force_labels,
    )
    plt.subplots_adjust(bottom=0.2, top=0.7, left=0.1, right=0.9)
    return plot_360_n0sc0pe(config)
Exemplo n.º 28
0
def missing_bar(data: pd.DataFrame) -> str:
    """Draw the missingno bar chart of missing values for a DataFrame.

    The colour comes from the configured primary colour and the ``labels``
    argument from the ``plot.missing.force_labels`` setting. Grid lines are
    switched off on every axes of the resulting figure.

    Args:
      data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
      The resulting missing values bar plot encoded as a string.
    """
    force_labels = config["plot"]["missing"]["force_labels"].get(bool)
    primary_rgb = hex_to_rgb(config["html"]["style"]["primary_color"].get(str))
    label_size = get_font_size(data)

    missingno.bar(
        data,
        figsize=(10, 5),
        color=primary_rgb,
        fontsize=label_size,
        labels=force_labels,
    )

    current_figure = plt.gcf()
    for axes in current_figure.get_axes():
        axes.grid(False)
    plt.subplots_adjust(bottom=0.3, top=0.8, left=0.1, right=0.9)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 29
0
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Plot a series of pairs as a scatter plot or a hexbin plot.

    Each entry of ``series`` is unpacked into an x and a y value. A hexbin
    plot is drawn when the series has more than 1000 entries; otherwise a
    scatter plot is drawn.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series of pairs
        x_label: caption of the x-axis
        y_label: caption of the y-axis

    Returns:
        The value returned by ``plot_360_n0sc0pe`` for the current plot.
    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    # Transpose the pairs into one sequence per axis.
    coordinates = zip(*series.tolist())
    if len(series) <= 1000:
        plt.scatter(*coordinates)
    else:
        plt.hexbin(*coordinates)
    return plot_360_n0sc0pe(plt)
Exemplo n.º 30
0
def missing_bar(config: Settings, data: pd.DataFrame) -> str:
    """Draw the missingno bar chart of missing values for a DataFrame.

    The colour comes from ``config.html.style.primary_color`` and the
    ``labels`` argument from ``config.plot.missing.force_labels``. Grid lines
    are switched off on every axes of the resulting figure.

    Args:
      config: Settings
      data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
      The resulting missing values bar plot encoded as a string.
    """
    label_size = get_font_size(data)
    primary_rgb = hex_to_rgb(config.html.style.primary_color)
    force_labels = config.plot.missing.force_labels

    missingno.bar(
        data,
        figsize=(10, 5),
        fontsize=label_size,
        color=primary_rgb,
        labels=force_labels,
    )

    current_figure = plt.gcf()
    for axes in current_figure.get_axes():
        axes.grid(False)
    plt.subplots_adjust(bottom=0.3, top=0.8, left=0.1, right=0.9)

    return plot_360_n0sc0pe(config)