示例#1
0
def plot_radar(df, ax=None, colors="default", legend=False):
    """
    Draws one filled radar (spider) outline per row of a dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        One row per radar outline and one column per spoke. Spokes are laid
        out evenly around the circle in the order of df.columns, the first
        one sitting at angle 0. Missing values are plotted as 0.
    ax : plt.Axes or None
        A polar axes to draw on. If None (default), one is created with
        plt.subplot(polar=True).
    colors : {'default'} or list of str or dict
        Colour specification, resolved by `_build_color_dict(colors, df)`.
        The resulting mapping is looked up with each row's index label.
        For more info about color options,
        see `here <https://matplotlib.org/2.0.2/api/colors_api.html>`_.
    legend : bool
        If True, a legend with one patch per entry of the colour mapping is
        added. Default is False.

    Returns
    -------
    matplotlib.axes._subplots.PolarAxesSubplot
        The axes holding one outline per row of the dataframe.
    """
    if ax is None:
        ax = plt.subplot(polar=True)

    n_cols = len(df.columns)
    spoke_angles = [n / float(n_cols) * 2 * np.pi for n in range(n_cols)]
    # Repeat the first angle at the end so each outline closes on itself.
    closed_angles = spoke_angles + spoke_angles[:1]

    color_dict = _build_color_dict(colors, df)
    for name, row in df.fillna(0).iterrows():
        row_values = list(row)
        closed_values = row_values + row_values[:1]
        row_color = color_dict[name]
        ax.plot(
            closed_angles,
            closed_values,
            linewidth=3,
            linestyle="solid",
            color=row_color,
        )
        ax.fill(closed_angles, closed_values, row_color, alpha=0.1)

    # One tick per spoke (the duplicated closing angle is not ticked).
    ax.set_xticks(spoke_angles)
    ax.set_xticklabels(list(df.columns))
    ax.set_rlabel_position(0)
    # Only label the radial ticks up to the third-from-last one.
    radial_ticks = ax.get_yticks()[:-2]
    ax.set_yticklabels([str(int(tick)) for tick in radial_ticks])

    ax.tick_params(axis="both", colors="grey")
    ax.tick_params(axis="y", labelrotation=45)
    ax.tick_params(axis="x", pad=20)

    if legend:
        handles = [
            mpatches.Patch(color=patch_color, label=patch_label)
            for patch_label, patch_color in color_dict.items()
        ]
        ax.legend(handles=handles)
    return ax
示例#2
0
def plot_reply_times(df, ax=None, colors="default", show_ylabels=False):
    """
    Creates a horizontal bar chart showing the average reply time in hours
    for each person in the chat.

    Reply time for person A is calculated as the time at which a message was
    sent minus the time of the last message sent by someone other than
    person A. The figures themselves come from `_create_reply_time_df`; when
    that helper returns None the axes are switched off and returned empty.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of messages. Must have the columns ['date', 'name'].
    ax : plt.Axes or None
        The axes to plot onto. If None (default), will create a new axes.
    colors : {'default'} or list of str or dict
        The colors to be used for each person in the chat. Should be one of:

        - 'default', in which case the default color scheme is used,
        - a list of colors the same length as the number of names in
          df['name'],
        - a dict which maps each name to a color.

        For more info about the possible color strings,
        see `the matplotlib documentation <https://matplotlib.org/2.0.2/api/colors_api.html>`_.
    show_ylabels : bool
        If True, the y-axis tick labels are left as drawn. If False
        (default), they are removed.

    Returns
    -------
    plt.Axes
        The horizontal bar chart axes plot.

    Examples
    --------

    .. plot:: ../examples/reply_times_example.py
       :width: 800px
    """
    if ax is None:
        ax = plt.subplot(111)
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    reply_data = _create_reply_time_df(df)
    if reply_data is None:
        # Nothing to draw: hide the axes entirely.
        ax.axis("off")
        return ax

    color_dict = _build_color_dict(colors, df)
    # Select the entries in the reverse of the colour mapping's key order.
    reversed_names = list(color_dict.keys())
    reversed_names.reverse()
    ordered_replies = reply_data[reversed_names]
    bar_colors = _map_colors(color_dict, ordered_replies)
    ordered_replies.plot(kind="barh", color=bar_colors, ax=ax)

    ax.set_ylabel("")
    if not show_ylabels:
        ax.set_yticklabels([])
    ax.set_xlabel("Hours")
    return ax
示例#3
0
def plot_one_donut(df, title, ax, colors, show_ylabels=False):
    """
    Draws a single donut chart of the 'text' column of `df` onto `ax`.

    Each wedge is annotated with its absolute value (not a percentage), and
    the axes title shows the overall total.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a numeric 'text' column holding the value for each wedge.
        Rows are selected and ordered by the keys of the colour mapping,
        reversed.
    title : str
        What is being counted; the axes title reads "Number of {title}".
    ax : plt.Axes
        The axes to draw the donut on.
    colors : {'default'} or list of str or dict
        Colour specification, resolved by `_build_color_dict(colors, df)`.
    show_ylabels : bool
        If True, wedges are labelled with the index values of `df`. If False
        (default), no wedge labels are drawn.

    Returns
    -------
    plt.Axes
        The axes that was passed in, with the donut drawn on it.
    """
    color_dict = _build_color_dict(colors, df)
    df = df.loc[list(color_dict.keys())[::-1]]

    # Use the plotted column for the total so the wedge labels, the title and
    # the pie itself always describe the same numbers.
    total = df["text"].sum()

    def absolute_label(pct):
        # autopct supplies a floating point percentage; converting back to a
        # count must round, because truncating turns e.g. 2.9999 into 2.
        return "{:d}".format(int(round(pct / 100.0 * total)))

    ax.pie(
        df["text"],
        wedgeprops=dict(width=0.3),
        labels=df.index if show_ylabels else None,
        colors=_map_colors(color_dict, df),
        startangle=90,
        autopct=absolute_label,
        pctdistance=0.45,
    )
    ax.set_title(f"Number of {title}\nTotal: {total}")
    return ax
示例#4
0
文件: main.py 项目: lrjball/chatviz
def visualize_chat(
    df,
    title,
    colors="default",
    timeline_freq="MS",
    timeline_tick_format="%b '%y",
    timeline_tick_step=6,
    timeline_color="default",
    timeline_stacked=False,
    top_n_words=10,
    stopwords=None,
):
    """
    Creates a series of plots given a dataframe of messages.

    The plots created are:

    1. 3 donut plots showing the number of messages, words and characters used
       by each person in the chat.
    2. A bar chart showing the average time to reply in hours for each person.
    3. A timeline showing the number of messages over time, can optionally be
       split down to per person.
    4. Bar charts showing the most used words by each person.
    5. 2 radar plots showing the number of messages sent at each hour and day
       respectively for each person.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of messages. Must have the columns
        ['date', 'name', 'text'].
    title : str
        The title for the plot.
    colors : {'default'} or list of str or dict
        The colors to be used for each person in the chat. Should be either
        'default' in which case the default color scheme is used, a list of
        colors the same length as the number of names in df['name'], or a dict
        which maps each name to a color. For more info about color options,
        see `here <https://matplotlib.org/2.0.2/api/colors_api.html>`_.
    timeline_freq: str
        The offset string for the resample frequency in the timeline plot. See `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`_
        for more. The default is 'MS', which will generate a monthly plot.
    timeline_tick_format : str
        The format string for the x tick labels in the timeline plot, which are dates.
        The default is '%b \'%y' which gives for example `Jan '19`.
    timeline_tick_step : int
        The number of steps between the ticks on the x-axis for the timeline
        plot. By default this is 6, so every 6 bars will have an x-tick label.
    timeline_color : str
        Only applicable if timeline_stacked is False. In this case, the timeline
        plot will only be one color, which may want to be different from `colors`
        as these are per person. Default is 'default' which will just use
        matplotlib's default color.
    timeline_stacked : bool
        If True, then a stacked bar chart will be created for the timeline plot,
        with one color per person involved in the chat. If False (default),
        will just plot one colored bar with the overall count.
    top_n_words : int
        The number of top words to include in the words bar chart. Default is 10.
    stopwords : None or iterable
        If given, then these words will be removed from the words bar charts.
        If None, all words will be kept (Note: this will lead to poor results
        as 'the', 'and', 'a', 'is' etc. will be the top words. A stopword list
        is recommended).

    Returns
    -------
    plt.figure
        A matplotlib figure with all of the message plots on it.

    Examples
    --------

    This example visualizes the script from Monty Python Flying Circus
    Series 1. In just a few lines we can use chatviz to create a sophisticated
    infographic. Note: The dates have been randomly created for this example.

    .. plot:: ../examples/complete_example.py
    """
    fig = plt.figure()
    # 4x4 layout: donuts + legend / timeline / top words / reply times + radars.
    gs = fig.add_gridspec(4,
                          4,
                          height_ratios=[0.2, 0.5, 0.2, 0.2],
                          hspace=0.6,
                          wspace=0.5)

    # Resolve the colour mapping once and hand the same mapping to every
    # panel so that all of them agree on who gets which colour.
    color_dict = _build_color_dict(colors, df)

    # Row 0, columns 0-2: the three donuts.
    gsdonuts = gs[0, :3].subgridspec(1, 3)
    ax_donuts = [
        fig.add_subplot(gsdonuts[0]),
        fig.add_subplot(gsdonuts[1]),
        fig.add_subplot(gsdonuts[2]),
    ]
    plot_donuts(df, ax=ax_donuts, colors=color_dict)

    # Row 0, column 3: the shared legend.
    ax_legend = fig.add_subplot(gs[0, 3])
    plot_legend(color_dict, ax=ax_legend)

    # Row 1: the timeline, spanning the full width.
    ax_timeline = fig.add_subplot(gs[1, :])
    plot_timeline(
        df,
        ax=ax_timeline,
        # The unstacked timeline is a single colour, independent of the
        # per-person colours.
        colors=color_dict if timeline_stacked else [timeline_color],
        freq=timeline_freq,
        tick_format=timeline_tick_format,
        tick_step=timeline_tick_step,
        stacked=timeline_stacked,
    )

    # Row 2: one word chart per distinct name, under a shared title that is
    # carried by an axes spanning the whole row.
    n_names = len(set(df["name"]))
    gswords = gs[2, :].subgridspec(1, n_names, wspace=1.3)
    ax_words_title = fig.add_subplot(gswords[:])
    ax_words_title.set_title("Most Used Words", y=1.1)
    if len(color_dict) > 1:
        ax_words_title.axis(False)
    ax_words = [fig.add_subplot(gswords[i]) for i in range(n_names)]
    plot_words(df,
               ax=ax_words,
               colors=color_dict,
               top_n=top_n_words,
               stopwords=stopwords)

    # Row 3, columns 2-3: hour and day radars under a shared title.
    gsradar = gs[3, 2:].subgridspec(1, 2)
    ax_radar_title = fig.add_subplot(gsradar[:])
    ax_radar_title.set_title("Distribution of Message Times", y=1.2)
    ax_radar_title.axis(False)
    hour_radar_ax = fig.add_subplot(gsradar[0], polar=True)
    # Pass the resolved mapping here too, like every other panel, rather
    # than the raw `colors` argument.
    plot_hours_radar(df, ax=hour_radar_ax, colors=color_dict)
    day_radar_ax = fig.add_subplot(gsradar[1], polar=True)
    plot_days_radar(df, ax=day_radar_ax, colors=color_dict)

    # Row 3, columns 0-1: reply times, again with a title-only axes on top.
    gsreply = gs[3, :2].subgridspec(1, 2)
    ax_reply_title = fig.add_subplot(gsreply[:])
    ax_reply_title.axis(False)
    ax_reply_title.set_title("Average Time to Reply", y=1.2)
    ax_reply = fig.add_subplot(gs[3, :2])
    plot_reply_times(df, ax=ax_reply, colors=color_dict)

    fig.suptitle(title, y=0.95)
    return fig
示例#5
0
def plot_words(df,
               ax=None,
               top_n=10,
               stopwords=None,
               colors="default",
               show_titles=False):
    """
    Plots a bar chart per person with their top words used.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of messages. Must have the columns ['name', 'text'].
    ax : iterable of plt.Axes or None
        This should be an indexable collection of M axes, where M is the
        number of unique names in df['name']. If None (default), a single row
        of new axes is created with plt.subplots, one per person.
    top_n : int
        The number of top words to include. Default is 10.
    stopwords : None or iterable
        If given, then these words will be removed from the plots. If None,
        all words will be kept (Note: this will lead to poor results as 'the',
        'and', 'a', 'is' etc. will be the top words. A stopword list is
        recommended).
    colors : {'default'} or list of str or dict
        The colors to be used for each person in the chat. Should be either
        'default' in which case the default color scheme is used, a list of
        colors the same length as the number of names in df['name'], or a dict
        which maps each name to a color. For more info about color options,
        see `here <https://matplotlib.org/2.0.2/api/colors_api.html>`_.
    show_titles : bool
        If True, will show names above each plot. If False (default), then they
        will be hidden.

    Returns
    -------
    array of plt.Axes
        The horizontal bar chart axes plots, one for each person in the chat.
        When `ax` is given, that same object is returned.

    Examples
    --------

    .. plot:: ../examples/words_example.py
       :width: 800px


    The example below shows the results without stopwords being removed. In
    most cases the results will be better by removing stopwords, either using
    the `chatviz.utils.STOPWORDS` or a custom list of stopwords.

    .. plot:: ../examples/words_example2.py
       :width: 800px
    """
    counts = _word_counts(df, stopwords)
    color_dict = _build_color_dict(colors, df)
    if ax is None:
        # The loop below draws one chart per colour-mapping entry, including
        # names that have no counted words, so make sure there is an axes for
        # each of them (and never fewer than one per counted name).
        n_axes = max(len(counts), len(color_dict))
        # squeeze=False always yields a 2-D array, so a single person still
        # gives an indexable row instead of a bare Axes object.
        _, axes_grid = plt.subplots(1, n_axes, squeeze=False)
        ax = axes_grid[0]
    for ind, (name, color) in enumerate(color_dict.items()):
        # A name with no counted words gets an empty chart.
        name_counts = counts.get(name, Counter())
        top_words = name_counts.most_common(top_n)
        # Reverse so the most used word ends up at the top of the chart.
        ax[ind].barh(
            [word for word, _ in top_words][::-1],
            [count for _, count in top_words][::-1],
            color=color,
        )
        ax[ind].spines["right"].set_visible(False)
        ax[ind].spines["top"].set_visible(False)
        if show_titles:
            ax[ind].set_title(name)
    return ax
示例#6
0
def plot_timeline(
    df,
    ax=None,
    freq="MS",
    colors="default",
    tick_format="%b '%y",
    tick_step=6,
    stacked=False,
    legend=False,
):
    """
    Creates a bar chart of number of messages over time.

    The size of the bins in the bar chart can be adjusted via `freq`, and there
    are other options to control whether or not to create a stacked bar chart.
    See the examples below for more information.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of messages. Must have the columns ['date', 'text'], as
        well as a 'name' column if stacked=True.
    ax : plt.Axes or None
        The axes to plot onto. If None (default), will create a new axes.
    freq: str
        The offset string for the resample frequency. See `the pandas documentation
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`_
        for all options. The default is 'MS', which will generate a monthly plot.
    colors : {'default'} or str or list of str or dict
        The colors to be used for each person in the chat. Should be one of:

        - 'default', in which case the default color scheme is used,
        - a list of colors the same length as the number of names in
          df['name'] when `stacked=True`,
        - a single color string when `stacked=False`,
        - a dict which maps each name to a color.

        If `stacked=False` and multiple colors are passed, then the first one will
        be used.
        For more info about the possible color strings,
        see `the matplotlib documentation <https://matplotlib.org/2.0.2/api/colors_api.html>`_.
    tick_format : str
        The format string for the x tick labels, which are dates. The default
        is '%b \'%y' which gives for example `Jan '19`. For more information
        about datetime format strings, `see the datetime documentation
        <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes>`_.
    tick_step : int
        The number of steps between the ticks on the x-axis. By default this
        is 6, so every 6 bars will have an x-tick label.
    stacked : bool
        If True, then a stacked bar chart will be created, with one color per
        person involved in the chat. If False (default), will just plot one
        colored bar with the overall count.
    legend : bool
        If True, will add a legend to the plot when stacked=True.
        Default is False.

    Returns
    -------
    plt.Axes
        The bar chart axes plot.

    Examples
    --------

    This first example shows how to plot a stacked timeline with each bar being
    2 days, and every third bar having a label on it, which is formatted as
    day/month/year.

    .. plot:: ../examples/timeline_example.py
       :width: 800px

    The second example shows how the frequency can be changed to plot weekly
    bars, with the labels now being on every bar and showing the week number
    and year.

    .. plot:: ../examples/timeline_example2.py
       :width: 800px

    The third example shows the plot with `stacked=False`, so the bars are
    only in one color.

    .. plot:: ../examples/timeline_example3.py
       :width: 800px


    .. note:: You may need to alter the font sizes and other parameters using
              `plt.rcParams` to get a suitable plot.
    """
    if ax is None:
        ax = plt.subplot(111)
    if stacked:
        # Count rows per (time bin, name), spread names into columns, then
        # resample again so that bins without any messages appear as zeros.
        binned = (df.groupby(
            [pd.Grouper(key="date", freq=freq),
             "name"]).count().unstack("name").fillna(0).resample(freq).sum())
        color_dict = _build_color_dict(colors, df)
        # Order the name columns by the reversed colour-mapping key order.
        cats = list(color_dict.keys())[::-1]
        binned = binned.reindex(cats, axis=1, level=1)
        binned["text"].plot(
            kind="bar",
            stacked=True,
            width=0.75,
            # Map through the resolved colour mapping (not the raw `colors`
            # argument), as every other caller of _map_colors does.
            color=_map_colors(color_dict, binned["text"].transpose()),
            ax=ax,
            rot=45,
        )
    else:
        binned = df.resample(freq, on="date").count()
        binned["text"].plot(
            kind="bar",
            width=0.75,
            # Only one colour is needed here: take the first resolved one.
            color=list(_build_color_dict(colors, df).values())[0],
            ax=ax,
            rot=45,
        )
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    # Bars sit at integer positions 0..n-1; label every `tick_step`-th bar
    # with its formatted bin date.
    ax.set_xticks(list(range(len(binned)))[::tick_step])
    ax.set_xticklabels(
        [item.strftime(tick_format) for item in binned.index[::tick_step]])
    ax.set_xlabel("Date")
    ax.set_ylabel("Count")
    ax.set_title("Message Timeline")
    ax.xaxis.labelpad = 20
    ax.yaxis.labelpad = 20
    if stacked:
        if legend:
            patches = [
                mpatches.Patch(facecolor=c, label=n)
                for n, c in color_dict.items()
            ]
            ax.legend(handles=patches, loc="upper right")
        else:
            # Remove any legend the plot call attached; get_legend() returns
            # None when there is none, so guard before removing.
            auto_legend = ax.get_legend()
            if auto_legend is not None:
                auto_legend.remove()
    return ax