def plot_regression_predictions(y_test, model_name, prediction):
    """Plot observed and predicted negative-word percentages against date.

    Args:
        y_test: Frame whose index has a ``Date`` level and which provides a
            ``Percentage_Bad_Words_count`` column. A ``POI`` column must exist
            once the index has been reset, because it is dropped before
            plotting.
        model_name: Text placed in front of the figure title.
        prediction: Predicted values, drawn positionally against the dates
            taken from ``y_test``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Reduce every date to its month: format as "YYYY-MM", then parse back.
    month_text = dt.strftime(y_test.index.get_level_values('Date'), "%Y-%m")

    observed = y_test.copy().reset_index()
    observed['Date'] = pd.to_datetime(month_text)
    observed = observed.drop(columns='POI')
    dates = observed['Date']

    figure, axes = plt.subplots(figsize=(8, 5))

    # Observed series first, then the prediction as a red line with black markers.
    axes.plot(
        dates,
        observed['Percentage_Bad_Words_count'],
        label='% of Total Email Messages',
    )
    axes.plot(
        dates,
        prediction,
        color='red',
        label="% of Financial Negative Words Predicted",
    )
    axes.plot_date(dates, prediction, color='black')

    axes.set_title(model_name + ': % of Financial Negative Words Predicted by Date')
    axes.set_xlabel("Date", rotation=0)
    plt.xticks(rotation=90)
    axes.set_ylabel("% Of Negative Word")
    axes.legend(loc="upper right")

    figure.autofmt_xdate()
    plt.show()
def get_all_dates_to_download(
        from_date: DatetimeIndex,
        to_date: DatetimeIndex,
        time_resolution: TimeResolution) -> DatetimeIndex:
    """
    Returns a DatetimeIndex with dates from_date to to_date in time_resolution.

    Both bounds are formatted with ``strftime`` down to the requested
    resolution before the range is built, so any finer-grained part of them
    (for example the day of month when the resolution is MONTH) is ignored.

    Args:
        from_date (DatetimeIndex): Start of the time interval.
        to_date (DatetimeIndex): End of the time interval.
        time_resolution (TimeResolution): Specifying the time resolution used in blob storage.

    NOTE(review): the result of ``strftime`` on each bound is passed directly
    as a ``pd.date_range`` endpoint, which suggests a single timestamp-like
    value is expected rather than a whole DatetimeIndex - confirm against
    callers. The annotations are left unchanged.

    Returns:
        A pandas.DatetimeIndex with all dates from_date to to_date (both included)
        in time_resolution. For TimeResolution.NONE a fixed one-element index is
        returned and both bounds are ignored.

    Raises:
        ValueError: If time_resolution is none of the handled values.
    """
    if time_resolution == TimeResolution.NONE:
        # We handle this case by making a DatetimeIndex containing one element. We will ignore the
        # date anyway.
        return pd.date_range(start='1/1/2021', periods=1, freq='D')

    # (resolution, strftime format used to truncate the bounds, pandas frequency alias).
    # 'YS', 'h' and 'min' are the current spellings of the aliases formerly
    # written 'AS', 'H' and 'T', which pandas 2.2 deprecated.
    range_specs = (
        (TimeResolution.YEAR, "%Y", 'YS'),
        (TimeResolution.MONTH, "%Y-%m", 'MS'),
        (TimeResolution.DAY, "%Y-%m-%d", 'D'),
        (TimeResolution.HOUR, "%Y-%m-%dT%H", 'h'),
        (TimeResolution.MINUTE, "%Y-%m-%dT%H:%M", 'min'),
    )

    # Compared with == (not a dict lookup) so the matching rule stays exactly
    # the one the resolution type defines.
    for resolution, bound_format, frequency in range_specs:
        if time_resolution == resolution:
            return pd.date_range(
                from_date.strftime(bound_format),
                to_date.strftime(bound_format),
                freq=frequency)

    raise ValueError('(ValueError) Unknown time resolution given.')
def _chronological_year_months(labels):
    """Return 'YYYY/M' labels ordered by calendar month instead of as text."""
    return sorted(labels, key=lambda x: datetime.datetime.strptime(x, '%Y/%m'))


def _plot_monthly_counts(counts, title, line_color, line_label, line_width, y_label):
    """Show one bar-plus-line chart of per-month counts, labelled from ``counts``."""
    _, ax = plt.subplots(figsize=(15, 10))
    counts.plot(kind='bar', ax=ax, title=title, fontsize=10)
    counts.plot(kind='line', ax=ax, color=line_color, label=line_label,
                linewidth=line_width)
    ax.legend(loc='upper center', shadow=True)
    # One tick per bar, labelled from the plotted series itself so that the
    # label under each bar is always the month that bar counts.
    ax.set_xticks(range(len(counts)))
    ax.set_xticklabels(list(counts.index), rotation=90)
    ax.set_ylabel(y_label, fontsize=8)
    plt.grid()
    plt.show()


def plot_exploration(email_data_frame_class):
    """Show message-count charts per year and per year/month.

    Three figures are displayed one after another:

    1. number of messages per year,
    2. number of messages per year/month,
    3. number of messages with ``Bad_Words_count > 0`` per year/month.

    Counts are the non-null counts of the ``file`` column within each group.
    Monthly charts are ordered chronologically.

    Args:
        email_data_frame_class: Frame whose index has a ``Date`` level and
            which provides ``file`` and ``Bad_Words_count`` columns. It is
            copied, not modified.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    plot_data = email_data_frame_class.copy()

    date_text = dt.strftime(
        email_data_frame_class.index.get_level_values('Date'), "%Y-%m-%d")
    dates = DatetimeIndex(date_text)
    plot_data['Year'] = dates.year
    plot_data['Month'] = dates.month
    # The label must stay on the row it was computed from; sorting the column
    # on its own would attach months to the wrong messages.
    plot_data['Year_Month'] = (
        plot_data["Year"].astype(str) + '/' + plot_data["Month"].astype(str))

    per_year = plot_data.groupby('Year')['file'].count()

    # groupby orders the 'YYYY/M' keys as text (2000/1, 2000/10, 2000/2, ...),
    # so each result is put into calendar order before plotting.
    per_month = plot_data.groupby('Year_Month')['file'].count()
    per_month = per_month.reindex(_chronological_year_months(per_month.index))

    negative_rows = plot_data[plot_data['Bad_Words_count'] > 0]
    negative_per_month = negative_rows.groupby('Year_Month')['file'].count()
    negative_per_month = negative_per_month.reindex(
        _chronological_year_months(negative_per_month.index))

    ax = per_year.plot(kind='bar', title="Message By Year", figsize=(5, 5), fontsize=8)
    ax.set_xlabel("Year", fontsize=8)
    ax.set_ylabel("Number Of Message", fontsize=8)
    plt.xticks(rotation=90)
    plt.grid()
    plt.show()

    _plot_monthly_counts(
        per_month,
        title="Count Of Messages By Year Month",
        line_color='green',
        line_label='Total Messages',
        line_width=3,
        y_label="Number Of Message")

    _plot_monthly_counts(
        negative_per_month,
        title="Count Of Messages With Negative Word By Year Month",
        line_color='red',
        line_label='Messages With Negative Words',
        line_width=5,
        y_label="Number Of Message With Negative Words")