示例#1
0
def plot_regression_predictions(y_test, model_name, prediction):
    """Plot observed and predicted negative-word percentages against date.

    Args:
        y_test: pandas object whose index has a 'Date' level and which,
            after ``reset_index()``, exposes a 'POI' column and a
            ``Percentage_Bad_Words_count`` column. It is copied, not
            modified.
        model_name: Text prepended to the figure title.
        prediction: Values plotted against the dates; presumably one value
            per row of ``y_test`` -- confirm against callers.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    observed = y_test.copy().reset_index()

    # Round-trip through a "%Y-%m" string so every date collapses to a
    # year-month value before being parsed back into a datetime.
    observed['Date'] = dt.strftime(y_test.index.get_level_values('Date'),
                                   "%Y-%m")
    observed['Date'] = pd.to_datetime(observed['Date'])
    observed = observed.drop('POI', axis=1)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(observed.Date,
            observed.Percentage_Bad_Words_count,
            label='% of Total Email Messages')
    ax.plot(observed.Date,
            prediction,
            color='red',
            label="% of Financial Negative Words Predicted")
    # Marker-only overlay of the predictions. Axes.plot_date is deprecated;
    # plot() with the 'o' format draws the same markers on datetime data.
    ax.plot(observed.Date, prediction, 'o', color='black')

    ax.set_title(model_name +
                 ': % of Financial Negative Words Predicted by Date')
    ax.set_xlabel("Date", rotation=0)
    # NOTE(review): autofmt_xdate() below applies its own tick-label
    # rotation, so this 90-degree setting likely has no visible effect --
    # confirm which of the two is intended.
    plt.xticks(rotation=90)
    ax.set_ylabel("% Of Negative Word")
    ax.legend(loc="upper right")
    fig.autofmt_xdate()
    plt.show()
def get_all_dates_to_download(
        from_date: DatetimeIndex, to_date: DatetimeIndex,
        time_resolution: TimeResolution) -> DatetimeIndex:
    """
    Returns a DatetimeIndex with dates from_date to to_date in time_resolution.

    Both endpoints are formatted with a strftime pattern that keeps only the
    components relevant for the requested resolution (e.g. "%Y-%m" for MONTH),
    and the resulting strings are handed to ``pd.date_range``.

    NOTE(review): from_date and to_date are annotated as DatetimeIndex, but
    they are only used via ``.strftime(...)`` whose result is passed as a
    ``pd.date_range`` endpoint; presumably callers pass a single
    datetime-like value -- confirm.

    Args:
        from_date (DatetimeIndex): Start of the time interval.
        to_date (DatetimeIndex): End of the time interval.
        time_resolution (TimeResolution): Specifying the time resolution used in blob storage.

    Returns:
        Returns a pandas.DatetimeIndex with all dates from_date to to_date (both included) in time_resolution.
        For TimeResolution.NONE a fixed one-element index is returned and both dates are ignored.

    Raises:
        ValueError: If time_resolution matches none of the handled TimeResolution members.
    """
    if time_resolution == TimeResolution.NONE:
        # We handle this case by making a DatetimeIndex containing one element. We will ignore the
        # date anyway.
        return pd.date_range(start='1/1/2021', periods=1, freq='D')

    # (resolution, strftime pattern used to truncate the endpoints, pandas frequency alias).
    # 'YS', 'h' and 'min' replace the aliases 'AS', 'H' and 'T', which are deprecated in
    # recent pandas releases; the replacements are also accepted by older versions.
    resolution_table = (
        (TimeResolution.YEAR, "%Y", 'YS'),
        (TimeResolution.MONTH, "%Y-%m", 'MS'),
        (TimeResolution.DAY, "%Y-%m-%d", 'D'),
        (TimeResolution.HOUR, "%Y-%m-%dT%H", 'h'),
        (TimeResolution.MINUTE, "%Y-%m-%dT%H:%M", 'min'),
    )
    for resolution, date_format, frequency in resolution_table:
        if time_resolution == resolution:
            return pd.date_range(from_date.strftime(date_format),
                                 to_date.strftime(date_format),
                                 freq=frequency)

    raise ValueError('(ValueError) Unknown time resolution given.')
def _order_year_month_chronologically(counts):
    """Return ``counts`` with its 'Year/Month' string index in calendar order.

    ``groupby`` sorts these labels as plain strings, which places e.g.
    '2001/10' before '2001/2'; parsing each label restores chronological
    order while keeping every row attached to its own label.
    """
    ordered_labels = sorted(
        counts.index,
        key=lambda label: datetime.datetime.strptime(label, '%Y/%m'))
    return counts.reindex(ordered_labels)


def _plot_monthly_counts(monthly_counts, title, line_color, line_label,
                         line_width, y_label):
    """Show a bar chart of ``monthly_counts`` with the same data overlaid as a line."""
    _, ax = plt.subplots(figsize=(15, 10))
    monthly_counts.plot(kind='bar', ax=ax, title=title, fontsize=10)
    monthly_counts.plot(kind='line',
                        ax=ax,
                        color=line_color,
                        label=line_label,
                        linewidth=line_width)
    ax.legend(loc='upper center', shadow=True)
    plt.xticks(rotation=90)
    # Label the ticks from the plotted series itself so the number and order
    # of labels always match the bars.
    ax.set_xticklabels(list(monthly_counts.index))
    ax.set_ylabel(y_label, fontsize=8)
    plt.grid()
    plt.show()


def plot_exploration(email_data_frame_class):
    """Show three exploratory bar charts of message counts.

    The charts are: messages per year, messages per year/month, and messages
    per year/month restricted to rows with ``Bad_Words_count > 0``. Counts
    are taken from the ``file`` column of the grouped data.

    Args:
        email_data_frame_class: DataFrame whose index has 'POI' and 'Date'
            levels and which has ``file`` and ``Bad_Words_count`` columns.
            It is copied, not modified.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    plot_Data = email_data_frame_class.copy()
    plot_Data['POI1'] = email_data_frame_class.index.get_level_values('POI')
    plot_Data['Date1'] = dt.strftime(
        email_data_frame_class.index.get_level_values('Date'), "%Y-%m-%d")

    parsed_dates = DatetimeIndex(plot_Data['Date1'])
    plot_Data['Year'] = parsed_dates.year
    plot_Data['Month'] = parsed_dates.month
    plot_Data['Day'] = parsed_dates.day
    # Labels stay aligned with their rows. Sorting this column on its own and
    # assigning it back would detach each label from the row it describes.
    plot_Data['Year_Month'] = plot_Data["Year"].astype(
        str) + '/' + plot_Data["Month"].astype(str)

    per_year = plot_Data.groupby(['Year']).count()
    per_month = _order_year_month_chronologically(
        plot_Data.groupby(['Year_Month']).count())
    negative_rows = plot_Data[plot_Data['Bad_Words_count'] > 0]
    negative_per_month = _order_year_month_chronologically(
        negative_rows.groupby(['Year_Month']).count())

    ax = per_year.file.plot(kind='bar',
                            title="Message By Year",
                            figsize=(5, 5),
                            fontsize=8)
    ax.set_xlabel("Year", fontsize=8)
    ax.set_ylabel("Number Of Message", fontsize=8)
    plt.xticks(rotation=90)
    plt.grid()
    plt.show()

    _plot_monthly_counts(per_month.file,
                         title="Count Of Messages By Year Month",
                         line_color='green',
                         line_label='Total Messages',
                         line_width=3,
                         y_label="Number Of Message")

    _plot_monthly_counts(
        negative_per_month.file,
        title="Count Of Messages With Negative Word By Year Month",
        line_color='red',
        line_label='Messages With Negative Words',
        line_width=5,
        y_label="Number Of Message With  Negative Words")