import math


def calculate_scores(data):
    if len(data) == 0:
        return math.nan, math.nan
    else:
        # relative stock performance, in percent from the first observation
        data_pct = (data - data.iloc[0]) / data.iloc[0] * 100
        # yearly stock performance: percent change from the first to the
        # last observation within each calendar year
        data_year = data.groupby([data.index.year]).apply(
            lambda x: (x.iloc[-1] - x.iloc[0]) / x.iloc[0] * 100)
        year_median = data_year.median()
        # compound annual growth rate over the number of years covered
        cagr = ((data_pct.iloc[-1] / 100 + 1) ** (1 / data_year.index.size) - 1) * 100
        return cagr, year_median
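# Minimal usage sketch (not part of the original): assumes `data` is a
# pandas Series of prices indexed by a DatetimeIndex; the dates, seed, and
# random-walk prices below are illustrative only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
dates = pd.date_range('2020-01-01', '2022-12-31', freq='B')
prices = pd.Series(100 * np.cumprod(1 + rng.normal(0.0003, 0.01, len(dates))),
                   index=dates, name='close')

cagr, year_median = calculate_scores(prices)
print(f'CAGR: {cagr:.2f}%, median yearly return: {year_median:.2f}%')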
#%%
# smoothing the daily totals with a 30-day rolling window
daily = data.resample('D').sum()
daily.rolling(30, center=True).sum().plot(style=[':', '--', '-'])
plt.ylabel('mean hourly count')

#%%
# using a Gaussian window
daily.rolling(50, center=True, win_type='gaussian').sum(std=10).plot()

#%%
# Digging in
# average traffic as a function of the time of day
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks=hourly_ticks)

#%%
# How things change based on the day of the week
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot()

#%%
# let's do a compound groupby and look at the hourly trend on
# weekdays versus weekends
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()
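#%%
# Sketch (not in the original script): plot the weekday vs. weekend hourly
# trends from the compound groupby above, side by side. Assumes `by_time`,
# `hourly_ticks`, and `plt` from the earlier cells; the figure size and
# line styles are illustrative choices.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
by_time.loc['Weekday'].plot(ax=ax[0], title='Weekdays',
                            xticks=hourly_ticks, style=[':', '--', '-'])
by_time.loc['Weekend'].plot(ax=ax[1], title='Weekends',
                            xticks=hourly_ticks, style=[':', '--', '-'])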
data.to_csv("/Users/xiangliu/Desktop/CSC560 Data/pollution_AQI.csv", index=True, sep=',') # In[6]: data = pd.read_csv('/Users/xiangliu/Desktop/CSC560 Data/pollution_AQI.csv') data.shape # In[7]: data.head(3) # In[8]: data.groupby(['State']).count() #Found there are 5 states no there(Montana, Nebraska, Mississippi, West virgina, Vermont) # In[9]: le = data['CO AQI'] le[le.isnull()] # In[10]: le = data['SO2 AQI'] le[le.isnull()] # In[11]: le = data['NO2 AQI']