def calc_holidays(start_date=datetime.datetime(2012, 10, 1), end_date=None):
    """Return day offsets of US federal holidays inside a date window.

    Args:
        start_date: First day of the window (inclusive).
        end_date: Last day of the window (inclusive). ``None`` means "now",
            resolved when the function is called.

    Returns:
        numpy.ndarray: for every holiday inside the window, its zero-based day
        position counted from ``start_date``, minus one.
    """
    if end_date is None:
        # A ``datetime.now()`` default would be evaluated once at import time
        # and silently go stale in long-running processes.
        end_date = datetime.datetime.now()

    holiday_dates = USFederalHolidayCalendar().holidays()
    flags = pd.Series(np.ones(holiday_dates.shape[0]), index=holiday_dates)
    # Summing per calendar day gives 1.0 on holidays and 0.0 on every other
    # day, without relying on ``Resampler.fillna(<scalar>)``.
    daily = flags.resample("D").sum()
    window = daily.loc[start_date:end_date].values
    # NOTE(review): the "- 1" shift is kept from the original implementation;
    # confirm that callers really want the position of the day *before* each
    # holiday (a holiday on start_date itself yields -1).
    return np.where(window)[0] - 1
def test_calendar(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays(self.start_date, self.end_date) holidays_1 = calendar.holidays(self.start_date.strftime("%Y-%m-%d"), self.end_date.strftime("%Y-%m-%d")) holidays_2 = calendar.holidays(Timestamp(self.start_date), Timestamp(self.end_date)) self.assertEqual(list(holidays.to_pydatetime()), self.holiday_list) self.assertEqual(list(holidays_1.to_pydatetime()), self.holiday_list) self.assertEqual(list(holidays_2.to_pydatetime()), self.holiday_list)
def test_calendar(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays(self.start_date, self.end_date) holidays_1 = calendar.holidays(self.start_date.strftime('%Y-%m-%d'), self.end_date.strftime('%Y-%m-%d')) holidays_2 = calendar.holidays(Timestamp(self.start_date), Timestamp(self.end_date)) assert list(holidays.to_pydatetime()) == self.holiday_list assert list(holidays_1.to_pydatetime()) == self.holiday_list assert list(holidays_2.to_pydatetime()) == self.holiday_list
def test_calendar(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays(self.start_date, self.end_date) holidays_1 = calendar.holidays( self.start_date.strftime('%Y-%m-%d'), self.end_date.strftime('%Y-%m-%d')) holidays_2 = calendar.holidays( Timestamp(self.start_date), Timestamp(self.end_date)) assert list(holidays.to_pydatetime()) == self.holiday_list assert list(holidays_1.to_pydatetime()) == self.holiday_list assert list(holidays_2.to_pydatetime()) == self.holiday_list
def holiday_column(airline_df, start='2014-01-01', end='2018-12-31'):
    '''Add a binary ``Holiday`` column marking flights on US federal holidays.

    Args:
        airline_df: Spark DataFrame with an ``fl_date`` column that is compared
            for equality against the holiday timestamps.
        start: First date of the holiday window handed to the calendar.
        end: Last date of the holiday window handed to the calendar.

    Returns:
        ``airline_df`` left-joined with the holiday dates, carrying an integer
        ``Holiday`` column (1 when a holiday matched, else 0); the helper
        ``holiday_date`` column is dropped again.
    '''
    # Federal holiday dates as a single-column pandas frame of strings.
    holiday_dates = USFederalHolidayCalendar().holidays(start, end).to_pydatetime()
    date_strings = pd.DataFrame(holiday_dates)[0].astype('string')
    holidays_pd = pd.DataFrame(date_strings)

    # Hand the strings to Spark and parse them into a timestamp column.
    holiday_schema = StructType([StructField('Holiday_Date', StringType())])
    holidays_spark = spark.createDataFrame(holidays_pd, holiday_schema)
    parsed_date = to_timestamp(holidays_spark.Holiday_Date, 'yyyy-MM-dd')
    holidays_spark = holidays_spark.select(parsed_date.alias('holiday_date'))

    # Left join keeps every flight; unmatched rows get a null holiday_date.
    same_day = airline_df.fl_date == holidays_spark.holiday_date
    joined = airline_df.join(holidays_spark, same_day, 'left')

    # Null / not-null becomes the 0 / 1 flag, then the helper column goes away.
    flagged = joined.withColumn(
        "Holiday", (f.col('holiday_date').isNotNull()).cast("integer"))
    return flagged.drop(flagged.holiday_date)
def pandas_holidays():
    """Print ``True`` if 2014-01-01 is listed among the 2014 US federal holidays."""
    from pandas.tseries.holiday import USFederalHolidayCalendar

    new_years_day = datetime.datetime(2014, 1, 1)
    holidays_2014 = USFederalHolidayCalendar().holidays(
        start='2014-01-01', end='2014-12-31').to_pydatetime()
    listed = new_years_day in holidays_2014
    if listed:
        print(True)
def __init__(
    self,
    path: str = None,
    data: pd.DataFrame = None,
    holdings: pd.DataFrame = None,
):
    """Initialize the portfolio with holdings and market data.

    Args:
        path: The name of a directory containing holdings and market data.
            If this is None then ``~/.portfolio/data`` is used, unless the
            portfolio is fully described by the data and holdings arguments.
        data: A DataFrame containing market close data for a set of
            financial instruments over a period of time.
        holdings: A DataFrame containing the number of shares held of the
            set of symbols in data over a time period corresponding to that
            of data.
    """
    # Fresh frames per call instead of DataFrame instances shared as defaults.
    data = pd.DataFrame() if data is None else data
    holdings = pd.DataFrame() if holdings is None else holdings

    if len(data) > 0 and len(holdings) > 0:
        logger.debug("Data and holdings arguments are set.")
        self.data = data
        self.holdings = holdings
    else:
        logger.debug("Data and holdings are not set.")
        if path is None:
            path = Path().home() / ".portfolio" / "data"
        self.path = Path(path)
        if not self.path.is_dir():
            logger.info("%s is not a directory, creating it...", self.path)
            # parents=True: the default location is two levels below the
            # home directory, and the intermediate folder may not exist yet.
            self.path.mkdir(parents=True, exist_ok=True)
        self.data_file = self.path / "prices.feather"
        self.holdings_file = self.path / "holdings.feather"

        try:
            # The feather format does not support date indices,
            # so set the date column to be the index.
            self.holdings = pd.read_feather(
                self.holdings_file).set_index("date")
        except FileNotFoundError:
            logger.info("No stored holdings found.")
            self.holdings = holdings
        symbols = self.holdings.columns

        try:
            # The feather format does not support date indices,
            # so set the date column to be the index.
            self.data = pd.read_feather(self.data_file).set_index("date")
        except FileNotFoundError:
            logger.info("Market data is not stored .")
            # Fall back to the argument so self.data always exists; before,
            # the attribute was left unset and the code below raised.
            self.data = data

        calendar = USFederalHolidayCalendar()
        today = pd.Timestamp.today().normalize()
        last_business_day = today - BDay(1)
        if len(self.data) > 0:
            # Resume right after the newest stored row when that is earlier.
            last_business_day = min(last_business_day,
                                    self.data.index.max() + BDay(1))
        holidays = calendar.holidays(last_business_day, today)
        # NOTE(review): the nesting of this refresh step was reconstructed
        # from a flattened source; it is placed in the "load from disk" branch
        # because only that branch defines ``symbols`` - please confirm.
        if (today not in self.data.index
                and last_business_day not in self.data.index
                and holidays.empty):
            self.data = self.get_market_data(
                symbols,
                start=last_business_day,
                end=today,
            )

    # Align holdings with the price dates, carrying the last known position
    # forward and dropping dates that precede the first holding.
    self.holdings = self.holdings.reindex(self.data.index,
                                          method="ffill").dropna()
def get_us_holidays(df: pd.DataFrame = None) -> pd.DatetimeIndex:
    '''Return US federal holidays, optionally limited to the span of ``df``.

    Args:
        df: Optional frame whose index minimum and maximum bound the result
            (both ends inclusive). ``None`` returns every holiday in the
            calendar's default range.

    Returns:
        A ``DatetimeIndex`` of holiday dates.

    Example:
        holidays = get_us_holidays(df)
    '''
    all_holidays = pd.to_datetime(USFederalHolidayCalendar().holidays())
    if df is None:
        return all_holidays

    t_min = df.index.min()
    t_max = df.index.max()
    # Vectorised bounds check instead of a Python loop over every holiday.
    inside = (all_holidays >= t_min) & (all_holidays <= t_max)
    return all_holidays[inside]
def isHoliday(d):
    """Return 1 if ``d`` is a US federal holiday of its own calendar year, else 0.

    ``d`` is compared by equality with the holiday datetimes, which sit at
    midnight, so a value carrying a time of day does not match.
    """
    year = d.strftime("%Y")
    year_holidays = USFederalHolidayCalendar().holidays(
        start=year + '-01-01', end=year + '-12-31').to_pydatetime()
    return 1 if d in year_holidays else 0
def get_holiday_list(start_yr, start_mon, start_day, end_yr, end_mon, end_day):
    """Return the US federal holidays between two calendar dates.

    The window previously always ran from November 1 of ``start_yr`` to May 1
    of ``end_yr`` and ignored the month/day arguments; they are honoured now.
    Passing ``(yr, 11, 1, yr + 1, 5, 1)`` reproduces the former result.

    Args:
        start_yr, start_mon, start_day: First day of the window (inclusive).
        end_yr, end_mon, end_day: Last day of the window (inclusive).

    Returns:
        list: holiday timestamps in chronological order.
    """
    window_start = datetime.datetime(start_yr, start_mon, start_day)
    window_end = datetime.datetime(end_yr, end_mon, end_day)
    calendar = USFederalHolidayCalendar()
    return list(calendar.holidays(window_start, window_end))
def holiday_lists():
    """Return the 2021 US federal holidays as ``YYYY/MM/DD`` strings."""
    holidays_2021 = USFederalHolidayCalendar().holidays(
        start='2021-01-01', end='2021-12-31')
    return [day.strftime("%Y/%m/%d") for day in holidays_2021.to_pydatetime()]
def near_holiday(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays() df1 = self.df_raw[['id', 'date_account_created']] df1.date_account_created = pd.to_datetime(df1.date_account_created) df1.date_account_created = pd.to_datetime(df1.date_account_created.dt.date) df1['near_holiday'] = (df1.date_account_created.isin(holidays + timedelta(days=1)) | df1.date_account_created.isin(holidays - timedelta(days=1))) return df1[['id', 'near_holiday']]
def __init__(self,epsPath,pricePath,daysNeed = 3):
    """Load the EPS and price CSV files and cache the holidays they span.

    Args:
        epsPath: Path of the EPS CSV file, stored raw in ``self.raw_eps``.
        pricePath: Path of the price CSV file, stored in ``self.price``.
        daysNeed: Stored unchanged in ``self.daysNeed``.
    """
    self.raw_eps = pd.read_csv(epsPath)
    self.price = pd.read_csv(pricePath)
    self.daysNeed = daysNeed
    # presumably populates self.epsBefore / self.epsAfter - verify in __getEps
    self.__getEps()

    before_date = self.epsBefore['DATE']
    after_date = self.epsAfter['DATE']
    earliest = str(min(before_date, after_date))
    latest = str(max(before_date, after_date))

    calendar = USFederalHolidayCalendar()
    holiday_index = calendar.holidays(start=earliest, end=latest)
    self.holidays = holiday_index.to_pydatetime()
def get_path(self, initial_values, t_start, t_end, freq, interest_rate, seed=None):
    """Simulate correlated stock price paths on US market trading times.

    Args:
        initial_values: Starting values handed to the path generator.
        t_start: Start of the requested time range.
        t_end: End of the requested time range.
        freq: Sampling frequency accepted by ``pandas.date_range``.
        interest_rate: Rate wrapped in ``param.SimpleParam`` for the model.
        seed: Seed for NumPy's global RNG; ``None`` leaves it unseeded.

    Returns:
        DataFrame indexed by the retained trading timestamps with one column
        per entry of ``self.stock_names``.

    Raises:
        RuntimeError: If no trading timestamps remain after filtering.
    """
    # set up parameters of the pathway model
    covar = stats.corr_to_cov(self.correlation_matrix, self.sigmas)
    covar_param = param.SimpleArrayParam(covar)
    chol = linalg.cholesky(covar, lower=True)
    # NOTE(review): this also zeroes legitimately negative Cholesky entries
    # (negative correlations); ``np.abs(chol) < 1.0e-9`` may be what was
    # intended - left unchanged pending confirmation.
    chol[chol < 1.0e-9] = 0.0
    cholesky_param = param.SimpleArrayParam(chol)
    r_param = param.SimpleParam(interest_rate)

    # create the index of times that are in market hours between
    # requested times
    timeindex = pd.date_range(start=t_start, end=t_end, freq=freq)
    # get the frequency (in seconds) now, before removing non-market times
    freq_in_secs = pd.to_timedelta(timeindex.freq, unit='s').total_seconds()
    # only trading hours
    timeindex = timeindex[timeindex.indexer_between_time('09:30', '16:00')]
    # only weekdays
    timeindex = timeindex[~(timeindex.dayofweek > 4)]
    # remove fed holidays; compare calendar days, because intraday timestamps
    # never equal the midnight holiday stamps and were therefore kept before
    if len(timeindex) > 0:
        cal = USFederalHolidayCalendar()
        hols = cal.holidays(start=timeindex.min().normalize(),
                            end=timeindex.max())
        timeindex = timeindex[~timeindex.normalize().isin(hols)]

    # get array of time in yearly units
    times = np.arange(
        0, len(timeindex)) * freq_in_secs / MARKET_SECONDS_PER_YEAR
    if len(times) == 0:
        # Was ``RunTimeError``, an undefined name that raised NameError.
        raise RuntimeError('Trying to generate stocks on empty time list')

    # seed and create the pathway generator object
    np.random.seed(seed=seed)
    try:
        path_maker = path.GeometricDiffusionManyAsset(self.generator, r_param,
                                                      covar_param,
                                                      cholesky_param)
        s_paths = path_maker.get_single_timed_path(initial_values, times)
    finally:
        # Always re-randomise the global RNG, even if generation fails.
        np.random.seed(seed=None)

    # put all data into a pandas DataFrame
    stocks_df = pd.DataFrame(index=timeindex,
                             data=s_paths,
                             columns=self.stock_names)
    # NOTE(review): debug output kept for parity; consider using logging.
    print("internal : ", stocks_df.groupby(stocks_df.index.dayofweek).sum())
    return stocks_df
def next_business_day(date):
    """Return the first weekday after ``date`` that is not a US federal holiday.

    Args:
        date: Reference date; the search starts on the following business day.

    Returns:
        datetime.datetime: the next open business day.
    """
    holidays = USFederalHolidayCalendar().holidays()
    one_business_day = BDay(1)

    candidate = (date + one_business_day).to_pydatetime()
    while candidate in holidays:
        # Landed on a holiday: keep stepping one business day at a time.
        candidate = (candidate + one_business_day).to_pydatetime()
    return candidate
def judgeOpenDaysInRange(from_date, to_date):
    """List the business days in a range that are not US federal holidays.

    Args:
        from_date: Start of the range (inclusive).
        to_date: End of the range (inclusive).

    Returns:
        DataFrame with a ``date`` column and a ``holiday`` column (always
        False in the result); row labels are positions within the full
        business-day range.
    """
    business_days = pd.bdate_range(from_date, to_date)
    federal_holidays = USFederalHolidayCalendar().holidays(from_date, to_date)

    table = pd.DataFrame()
    table['date'] = business_days
    table['holiday'] = business_days.isin(federal_holidays)
    return table[~table['holiday']]
def create_ts_calendar(df, ts_settings, additional_events=None):
    """Build a calendar of events for a time series project.

    The calendar holds the US federal holidays with Christmas and July 4th
    pinned to their actual dates (no weekend observance shift), plus Black
    Friday and Cyber Monday. It spans the dates in ``df`` and 365 days
    beyond the last one.

    Args:
        df: pandas DataFrame containing the date column.
        ts_settings: dict of time series project parameters; only
            ``ts_settings['date_col']`` is read here.
        additional_events: optional two-column DataFrame (date, event name)
            of extra events. It is not modified.

    Returns:
        DataFrame with columns ``Date`` (``YYYY-MM-DD`` strings) and
        ``Event``, de-duplicated and sorted by date.

    Raises:
        AssertionError: If ``additional_events`` does not have two columns.
    """
    date_col = ts_settings['date_col']

    black_friday = Holiday(
        "Black Friday",
        month=11,
        day=1,
        offset=[pd.DateOffset(weekday=TH(4)), pd.DateOffset(1)])
    cyber_monday = Holiday(
        "Cyber Monday",
        month=11,
        day=1,
        offset=[pd.DateOffset(weekday=TH(4)), pd.DateOffset(4)])

    # Build a private rule list. ``USFederalHolidayCalendar.rules`` is a
    # class attribute shared by every instance, so appending to it or
    # overwriting entries changes the calendar for the whole process and
    # piles up duplicates on each call. Rules are matched by month/day
    # instead of list position, because positions differ between pandas
    # versions (e.g. after Juneteenth was added).
    rules = []
    for rule in USFederalHolidayCalendar.rules:
        if (rule.month, rule.day) == (12, 25):
            rule = Holiday('Christmas', month=12, day=25)
        elif (rule.month, rule.day) == (7, 4):
            rule = Holiday('July 4th', month=7, day=4)
        rules.append(rule)
    rules.extend([black_friday, cyber_monday])
    cal = AbstractHolidayCalendar(name='ts_calendar', rules=rules)

    events = pd.DataFrame(
        cal.holidays(
            start=pd.to_datetime(df[date_col].min()),
            end=pd.to_datetime(df[date_col].max()) + dt.timedelta(days=365),
            return_name=True,
        ))
    events = events.reset_index()
    events.columns = ['Date', 'Event']

    if additional_events is not None:
        assert additional_events.shape[
            1] == 2, 'additional events must be a df with 2 columns'
        # Work on a copy so the caller's frame keeps its column names/dtypes.
        extra = additional_events.copy()
        extra.columns = ['Date', 'Event']
        extra['Date'] = pd.to_datetime(extra['Date'])
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        events = pd.concat([events, extra])

    # A plain list avoids index alignment: labels may repeat after concat.
    events['Date'] = pd.to_datetime(
        events['Date']).dt.strftime('%Y-%m-%d').tolist()

    return events.drop_duplicates().sort_values(by='Date').reset_index(
        drop=True)
class USHoliday(Feature):
    """Feature marking index entries whose calendar date is a US federal holiday."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``Feature`` and create the holiday calendar."""
        super().__init__(**kwargs)
        self.n_periods = 2
        self.cal = USFederalHolidayCalendar()

    def indexer(self, index, column=None):
        """Return a boolean array that is True where ``index`` is a holiday.

        Args:
            index: Datetime index to test (must expose ``min``, ``max``, ``date``).
            column: Unused in this implementation.

        Returns:
            numpy.ndarray of bool, one entry per element of ``index``.
        """
        # Look two days beyond each end of the index when fetching holidays.
        margin = pd.Timedelta('2d')
        window_start = index.min() - margin
        window_end = index.max() + margin
        holiday_dates = self.cal.holidays(start=window_start,
                                          end=window_end).date
        # Compare calendar dates so the time of day in ``index`` is ignored.
        return np.isin(index.date, holiday_dates)
def create_date_feature_is_public_holiday(df, date, start, end, country = 'US'):
    """Add an ``is_holiday`` 0/1 column for US federal holidays.

    Args:
        df: DataFrame to extend; it is modified in place and also returned.
        date: Name of the column in ``df`` holding the dates to test.
        start: First date of the holiday window.
        end: Last date of the holiday window.
        country: Accepted but not used; only US federal holidays are supported.

    Returns:
        ``df`` with the new ``is_holiday`` column.
    """
    holidays = USFederalHolidayCalendar().holidays(
        start=start, end=end).to_pydatetime()

    def holiday_flag(day):
        # Equality against midnight holiday datetimes.
        return 1 if day in holidays else 0

    df['is_holiday'] = df[date].map(holiday_flag)
    return df
def holidays_check(forecast_date):
    '''
    Move a forecast date back to the most recent working day.

    If ``forecast_date`` falls on a US federal holiday or a weekend, a notice
    is printed and the date is stepped back one day at a time until it is a
    working day. Otherwise it is returned unchanged.
    '''
    calendar = USFederalHolidayCalendar()

    def is_closed(day):
        # Closed means: federal holiday, or Saturday / Sunday.
        on_holiday = not calendar.holidays(start=day, end=day).empty
        return on_holiday or day.weekday() >= 5

    if is_closed(forecast_date):
        print(
            '***Notice***\nThe date that you want to predict is a holiday or weekend'
        )
        print('Our programme will choose the last working date')
        while is_closed(forecast_date):
            forecast_date = forecast_date - timedelta(1)  # move 1 day backward
        print('The forecast date is changed to {}'.format(
            forecast_date.strftime('%m/%d/%Y')))
    return forecast_date
def find_holidays(base_year):
    '''
    Return the ``MM-DD`` strings of all US federal holidays in ``base_year``
    (from the pandas calendar), followed by California's Cesar Chavez Day
    (March 31).
    '''
    year = str(base_year)
    federal = USFederalHolidayCalendar().holidays(start=year + '-01-01',
                                                  end=year + '-12-31')
    month_days = [day.strftime('%m-%d') for day in federal.to_pydatetime()]
    month_days.append('03-31')  # Cesar Chavez Day is not a federal holiday
    return month_days
def getHolidays(dates):
    """Return a 0/1 list marking which ``dates`` are US federal holidays.

    Only holidays from 2015-01-01 through 2017-12-31 are recognised; dates
    outside that span always map to 0.
    """
    holidays = USFederalHolidayCalendar().holidays(
        start='2015-01-01', end='2017-12-31').to_pydatetime()
    return [1 if date in holidays else 0 for date in dates]
def __init__(self, epsPath, pricePath, daysNeed=3):
    """Read the EPS and price tables and pre-compute the relevant holidays.

    Args:
        epsPath: CSV file with the raw EPS data (kept in ``self.raw_eps``).
        pricePath: CSV file with the price data (kept in ``self.price``).
        daysNeed: Kept as ``self.daysNeed``.
    """
    self.raw_eps = pd.read_csv(epsPath)
    self.price = pd.read_csv(pricePath)
    self.daysNeed = daysNeed
    # NOTE(review): epsBefore / epsAfter are presumably set by __getEps - confirm.
    self.__getEps()

    # The holiday window is bounded by the earlier and the later of the two
    # EPS dates.
    eps_dates = (self.epsBefore['DATE'], self.epsAfter['DATE'])
    minDate = str(min(*eps_dates))
    maxDate = str(max(*eps_dates))

    self.holidays = USFederalHolidayCalendar().holidays(
        start=minDate, end=maxDate).to_pydatetime()
def __init__(self, data_df, date):
    """Prepare the regression data set from hourly load and weather data.

    Adds log and 48-row lag features, attaches the US federal holiday name
    (or ``"Not Holiday"``), casts the calendar columns to categoricals and
    indexes the result by the full timestamp (``Date`` plus ``Hour``).

    Args:
        data_df: DataFrame with ``Date``, ``Hour``, ``Weekday``, ``Month``,
            ``Load`` and the temperature / humidity columns selected below.
            It is not modified. Row labels are assumed to be 0..n-1 -
            TODO confirm.
        date: Stored unchanged in ``self.date``.
    """
    self.date = date
    wanted_columns = [
        'Date', 'Hour', 'Weekday', 'Month', 'Load', 'Mean_Temp', 'Mean_Humi',
        'RIV_Temp', 'RIV_Humi', 'LAX_Temp', 'LAX_Humi', 'USC_Temp', 'USC_Humi',
        'WJF_Temp', 'WJF_Humi', 'TRM_Temp', 'TRM_Humi'
    ]
    # Explicit copy: columns are added below, and writing into a column
    # slice triggers pandas' chained-assignment warning.
    test = data_df[wanted_columns].copy()
    test['RIV_Temp_Log'] = np.log(test['RIV_Temp'])
    test['Load_Log'] = np.log(test['Load'])
    test['Load_Lag_48'] = test['Load_Log'].shift(48)
    test['Humi_Lag_48'] = test['Mean_Humi'].shift(48)
    test['RIV_Temp_Log_Lag_48'] = test['RIV_Temp_Log'].shift(48)
    test['Date'] = pd.to_datetime(test['Date'])

    # Holiday names keyed by date, from 2014 up to now.
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2014-01-01',
                            end=str(datetime.datetime.now()),
                            return_name=True)
    holidays = holidays.rename('Holiday').rename_axis('Date').reset_index()
    holidays['Date'] = pd.to_datetime(holidays['Date'])

    # NOTE(review): the lag is 48 rows, so label 48 already has complete
    # features; starting at 49 drops one usable row - kept as in the original.
    lm_data = test.loc[49:len(test)].merge(holidays, how='left', on='Date')
    lm_data['Holiday'] = lm_data['Holiday'].fillna("Not Holiday")

    # Build the timestamps while Hour is still numeric; vectorised instead
    # of a Python call per row.
    timestamps = lm_data['Date'] + pd.to_timedelta(lm_data['Hour'], unit='h')

    categorical = ["Hour", "Weekday", "Month", "Holiday"]
    lm_data[categorical] = lm_data[categorical].astype('category')
    lm_data.index = pd.DatetimeIndex(timestamps, name='DateTime')
    self.lm_data = lm_data
def _iso_week(dates):
    """Return the ISO week number of a datetime Series as a plain numeric Series."""
    week = dates.dt.isocalendar().week
    # Nullable UInt32 -> ordinary dtype; float only when missing dates exist.
    return week.astype(float) if week.isna().any() else week.astype(int)


def build_features(data, features):
    """Derive date, holiday-distance and one-hot features.

    Args:
        data: DataFrame with ``timestamp_first_active``,
            ``date_account_created`` and the categorical columns listed in
            ``other_features`` below. Columns are added to and dropped from
            this frame in place before the one-hot step.
        features: List of feature names; extended in place with every
            feature column created here.

    Returns:
        DataFrame containing the engineered features; the raw date columns
        and the original categorical columns are removed.
    """
    # Timestamp First Active: the first 8 characters are YYYYMMDD.
    data['timestamp_first_active'] = data['timestamp_first_active'].astype(str)
    data['timestamp_first_active_date'] = pd.to_datetime(
        data['timestamp_first_active'].str[:8], format='%Y%m%d')
    tfa = data['timestamp_first_active_date']
    data['tfa_month'] = tfa.dt.month
    data['tfa_year'] = tfa.dt.year
    data['tfa_day'] = tfa.dt.day
    data['tfa_dayofyear'] = tfa.dt.dayofyear
    data['tfa_dayofweek'] = tfa.dt.dayofweek
    # ``Series.dt.week`` was removed in pandas 2.0; isocalendar() replaces it.
    data['tfa_week'] = _iso_week(tfa)
    data['tfa_quarter'] = tfa.dt.quarter
    features.extend(['tfa_day', 'tfa_month', 'tfa_year', 'tfa_dayofyear',
                     'tfa_dayofweek', 'tfa_week', 'tfa_quarter'])

    # Date Account Created
    data['date_account_created'] = pd.to_datetime(data['date_account_created'])
    dac = data['date_account_created']
    data['dac_month'] = dac.dt.month
    data['dac_year'] = dac.dt.year
    data['dac_day'] = dac.dt.day
    data['dac_dayofyear'] = dac.dt.dayofyear
    data['dac_dayofweek'] = dac.dt.dayofweek
    data['dac_week'] = _iso_week(dac)
    data['dac_quarter'] = dac.dt.quarter
    features.extend(['dac_year', 'dac_month', 'dac_day', 'dac_dayofyear',
                     'dac_dayofweek', 'dac_week', 'dac_quarter'])

    # DAC holidays: one column per federal holiday in the data's date span,
    # holding the transformed day distance to that holiday.
    calendar = USFederalHolidayCalendar()
    dac_holidays = calendar.holidays(start=dac.min(), end=dac.max())
    for i, holiday in enumerate(dac_holidays):
        column = 'dac_holiday_diff_' + str(i)
        day_distance = (data['date_account_created'] - holiday).dt.days
        data[column] = day_distance.map(holiday_transform)
        features.append(column)

    # Days difference between TFA and DAC
    data['days_diff'] = (data['date_account_created']
                         - data['timestamp_first_active_date']).dt.days
    features.extend(['days_diff'])

    data.drop(['date_account_created', 'timestamp_first_active',
               'timestamp_first_active_date'], axis=1, inplace=True)

    other_features = ['gender', 'signup_method', 'signup_flow', 'language',
                      'affiliate_channel', 'affiliate_provider',
                      'first_affiliate_tracked', 'signup_app',
                      'first_device_type', 'first_browser']
    for f in other_features:
        # Replace each categorical column by its one-hot expansion.
        data_dummy = pd.get_dummies(data[f], prefix=f)
        features.extend(data_dummy.columns.values)
        data.drop([f], axis=1, inplace=True)
        data = pd.concat((data, data_dummy), axis=1)
    return data
def predict(dict):
    """Load the persisted model, build test features and print predictions.

    Args:
        dict: Mapping used to build a one-row DataFrame. The code reads the
            keys ``date`` (datetime-like), ``temperature_C`` and
            ``humidity`` and drops ``feeling_temperature_C``. The parameter
            name shadows the builtin and is kept for caller compatibility.

    Returns:
        None. The integer predictions are printed.
    """
    folder_selected = folderSelect()
    filename = os.path.join(folder_selected, "model.joblib")
    if not os.path.isfile(filename):
        # ``os.path.join`` never returns None, so the former ``is None`` test
        # could not trigger training.
        # NOTE(review): assumes train_and_persist() writes model.joblib into
        # the selected folder - confirm.
        train_and_persist()
    with open(filename, "rb") as f:
        model = load(f)

    testDf = pd.DataFrame(dict, index=[0])
    testDf["hr"] = testDf["date"].dt.hour
    testDf["yr"] = testDf["date"].dt.year - testDf["date"].dt.year.min()
    testDf["mnth"] = testDf["date"].dt.month
    testDf["season"] = testDf["date"].map(get_season)
    testDf["weekday"] = testDf["date"].dt.weekday
    testDf["dteday"] = testDf["date"].dt.day
    testDf["dteday"] = pd.to_datetime(testDf["dteday"])

    cal = USFederalHolidayCalendar()
    holidays = pd.to_datetime(cal.holidays(start="2011-01-01", end="2011-06-30"))
    # Element-wise membership on calendar days. ``Series in index`` does not
    # test membership of the values; it fails because a Series is unhashable.
    is_holiday = pd.to_datetime(testDf["date"]).dt.normalize().isin(holidays)
    testDf["holiday"] = is_holiday.astype(int)
    # NOTE(review): "workingday" ignores weekends here, exactly as the
    # original expression did - confirm this is intended.
    testDf["workingday"] = (~is_holiday).astype(int)

    t_max, t_min = 50, -8
    # This is for temperature normalization
    testDf["temp"] = (testDf["temperature_C"] - t_min)/(t_max-t_min)
    # We divide humidity by 100 to scale it between 0 and 1
    testDf["hum"] = testDf["humidity"]/100
    testDf = testDf.drop(columns=["temperature_C", "humidity"])

    # Convert the data type to either category or to float
    testDf = generating_new_features(testDf)
    testDf = pd.get_dummies(testDf)

    # Finally start with Machine Learning
    test = testDf.drop(columns=["date", "dteday", "feeling_temperature_C"])
    # Only the training column names are needed to align the test frame.
    train_X, train_y, test_X, test_y = prepare_train_data()
    train_features = train_X.columns.values
    test = fill_missing_features(test, train_features)

    pred = model.predict(test)
    pred = pred.astype(int)
    print(" Rounded predictions:\n", pred)
def DR_Temp_data_cleaning(dataframe):
    '''
    Build the regression data set from hourly load and weather data.

    The ``Date`` column of ``dataframe`` is converted to datetime in place;
    all other work happens on a copy, which is returned.

    Adds log temperatures (per station and mean), log load and their 48-row
    lags, attaches the US federal holiday name (or "Not Holiday"), casts the
    calendar columns to categoricals and indexes the result by ``Date`` plus
    ``Hour``. Row labels are assumed to be 0..n-1 - TODO confirm.
    '''
    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
    base_columns = [
        'Date', 'Hour', 'Weekday', 'Month', 'Load', 'Mean_Temp', 'Mean_Humi',
        'RIV_Temp', 'RIV_Humi', 'LAX_Temp', 'LAX_Humi', 'USC_Temp', 'USC_Humi',
        'WJF_Temp', 'WJF_Humi', 'TRM_Temp', 'TRM_Humi'
    ]
    # Explicit copy: new columns are written below.
    test = dataframe[base_columns].copy()

    temp_sources = ['RIV', 'LAX', 'USC', 'WJF', 'TRM', 'Mean']
    for source in temp_sources:
        # Each log column comes from its own temperature column; the mean
        # was previously computed from TRM_Temp by mistake.
        test[source + '_Temp_Log'] = np.log(dataframe[source + '_Temp'])
    test['Load_Log'] = np.log(dataframe['Load'])

    test['Load_Lag_48'] = test['Load_Log'].shift(48)
    test['Humi_Lag_48'] = test['Mean_Humi'].shift(48)
    for source in temp_sources:
        test[source + '_Temp_Log_Lag_48'] = test[source + '_Temp_Log'].shift(48)

    # Holiday names keyed by date, from 2014 up to now.
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2014-01-01',
                            end=str(datetime.datetime.now()),
                            return_name=True)
    holidays = holidays.rename('Holiday').rename_axis('Date').reset_index()
    holidays['Date'] = pd.to_datetime(holidays['Date'])
    test['Date'] = pd.to_datetime(test['Date'])

    # NOTE(review): the lag is 48 rows, so label 48 already has complete
    # features; starting at 49 drops one usable row - kept as in the original.
    lm_data = test.loc[49:len(test)].merge(holidays, how='left', on='Date')
    lm_data['Holiday'] = lm_data['Holiday'].fillna("Not Holiday")

    # Build the timestamps while Hour is still numeric (vectorised).
    timestamps = lm_data['Date'] + pd.to_timedelta(lm_data['Hour'], unit='h')

    categorical = ["Hour", "Weekday", "Month", "Holiday"]
    lm_data[categorical] = lm_data[categorical].astype('category')
    lm_data.index = pd.DatetimeIndex(timestamps, name='DateTime')
    return lm_data
def getStartDate():
    """Return the start date of a window of roughly 200 trading days.

    Steps back 202 business days from yesterday (slack in case this runs on
    a weekend), then one further business day for every US federal holiday
    that falls inside that span.

    Returns:
        datetime.date: the adjusted start date.
    """
    yesterday = date.today() - timedelta(days=1)
    provisional_start = yesterday - BDay(202)

    # Business-day offsets skip weekends only, so holidays are compensated
    # for separately.
    holidays_in_span = USFederalHolidayCalendar().holidays(provisional_start,
                                                           yesterday)
    adjusted_start = provisional_start - BDay(len(holidays_in_span))
    return adjusted_start.date()
def format_subway_data(path_to_csv_file: str):
    """Summarise peak-day ridership per route and ISO week.

    A peak day is a weekday that is not a US federal holiday.

    Args:
        path_to_csv_file: CSV with the columns ``servicedate``,
            ``route_or_line`` and ``sum``.

    Returns:
        dict: route -> list of ``{"date": <Monday of the week>, "riders":
        <rounded mean of "sum" over that week's peak days>}`` records. The
        date is missing (NaN) for weeks without a Monday in the data.
    """
    # read data, convert to datetime
    df = pd.read_csv(path_to_csv_file)
    df["servicedate"] = pd.to_datetime(df["servicedate"])

    # holidays within the data's date span
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=df["servicedate"].min(),
                            end=df["servicedate"].max())

    # mark holiday and weekday; compare at day resolution as datetimes
    # (the unit-less ``astype("datetime64")`` cast is rejected by pandas 2)
    df["holiday"] = df["servicedate"].dt.normalize().isin(holidays)
    df["weekday"] = df["servicedate"].dt.dayofweek

    # define peak, add ISO week/year, convert service date back to text
    is_peak = ~df["holiday"] & (df["weekday"] < 5)
    df["peak"] = np.where(is_peak, "peak", "offpeak")
    iso = df["servicedate"].dt.isocalendar()
    df["week"] = iso.week
    df["year"] = iso.year
    df["servicedate"] = df["servicedate"].dt.date.astype(str)

    # the Monday of each week labels that week in the output
    mondays = df[df["weekday"] == 0]
    mondays = mondays[["servicedate", "week", "year"]].drop_duplicates()

    # limit data to just peak days, average per route and week
    peak_rows = df[df["peak"] == "peak"]
    final = (peak_rows.groupby(["year", "week", "route_or_line"])["sum"]
             .mean().round().reset_index())
    final = final.merge(mondays, on=["week", "year"], how="left")

    # one list of records per route
    output = {}
    for route in final["route_or_line"].unique():
        route_rows = final[final["route_or_line"] == route]
        records = route_rows[["servicedate", "sum"]].rename(
            columns={"servicedate": "date", "sum": "riders"})
        output[route] = records.to_dict(orient="records")
    return output
def validate(date):
    """Move ``date`` off a US federal holiday.

    Args:
        date: Date to check against the federal holidays from ``date`` onward.

    Returns:
        tuple: ``(date, adjusted)``. When ``date`` is a holiday it is advanced
        by one holiday-aware business day and ``adjusted`` is True; otherwise
        it is returned unchanged with False.
    """
    cal = USFederalHolidayCalendar()
    upcoming_holidays = cal.holidays(date)  # check holidays from `date` on
    if date not in upcoming_holidays:
        return date, False
    # CustomBusinessDay replaces BDay so the step also skips holidays.
    return date + CustomBusinessDay(calendar=cal), True
def add_time_windows(df):
    """Adds time windows as target variable, based on delivery date and time.

    Calculates the number of business days (excluding weekends, the shipment
    date and federal holidays) for delivery, adds the delivery time of day to
    get time-in-transit, and assigns each shipment the time window its
    time-in-transit falls into.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain
            shipment_date, delivery_date, delivery_time. The columns
            days_in_transit and days_taken_float are added to it in place.

    Returns:
        pandas dataframe obj: Filtered copy of ``df`` with the columns
        days_in_transit, days_taken_float (time-in-transit) and Y (target
        variable i.e. time window).
    """
    print("Adding time windows i.e. target variable...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()

    # Calculate days in transit (exclude shipment date, holidays, weekends).
    # The holiday window has to reach the last *delivery*: holidays between
    # the final shipment and its delivery were previously not excluded.
    start_date = df['shipment_date'].min()
    end_date = max(df['shipment_date'].max(), df['delivery_date'].max())
    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(start_date, end_date).date.tolist()
    shipment_dates = [d.date() for d in df['shipment_date']]
    delivery_dates = [d.date() for d in df['delivery_date']]
    # -1 because we will add transit time
    df['days_in_transit'] = np.busday_count(
        shipment_dates, delivery_dates, holidays=holidays) - 1

    # Convert days in transit/delivery time to days taken (with decimals)
    # e.g. if parcel reaches at 12.00pm on 2nd business day, days taken is 1.5
    seconds_per_day = timedelta(days=1).total_seconds()
    delivery_percentage_of_day = [
        timedelta.total_seconds(d) / seconds_per_day
        for d in df['delivery_time']
    ]
    df['days_taken_float'] = df['days_in_transit'] + delivery_percentage_of_day

    # Keep the common transit durations. The rest are rare occurrences.
    # NOTE(review): np.arange(-1, 5) keeps -1..4 although the original
    # comment said "-1 to 5 days" - confirm which bound is intended.
    max_days_to_keep = 5
    keep = df['days_in_transit'].isin(np.arange(-1, max_days_to_keep))
    # Copy so that adding "Y" below does not write into a slice of df.
    df = df[keep].copy()

    # Assign time windows
    time_window_thresholds = create_time_window_thresholds()
    tqdm.pandas(desc="Assign time window")
    df['Y'] = df.progress_apply(lambda x: assign_time_window(
        x['days_taken_float'], time_window_thresholds), axis=1)

    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
def assign_workdays(self, data, start, end, speed=False):
    """Run ``self._workday_process`` on ``data`` with the federal holidays.

    Args:
        data: DataFrame to process.
        start: First date of the holiday window.
        end: Last date of the holiday window.
        speed: When True, split ``data`` into row chunks and process them
            in a multiprocessing pool with one worker per CPU.

    Returns:
        The result of ``_workday_process``; in parallel mode, the per-chunk
        results concatenated with ``pandas.concat``.
    """
    holidays = USFederalHolidayCalendar().holidays(start=start, end=end)

    n_rows = data.shape[0]
    if not speed or n_rows == 0:
        # Nothing to split for an empty frame; concat of no chunks would fail.
        return self._workday_process(data, holidays)

    num_process = mp.cpu_count()
    # Ceiling division, never below 1: with fewer rows than CPUs the former
    # floor division gave a chunk size of 0 and ``range`` raised ValueError.
    chunk_size = max(1, -(-n_rows // num_process))
    # Positional slicing stays correct when index labels repeat.
    chunks = [data.iloc[i:i + chunk_size]
              for i in range(0, n_rows, chunk_size)]
    # The context manager releases the worker processes when done.
    with mp.Pool(num_process) as pool:
        res = pool.map(
            partial(self._workday_process, holidays=holidays), chunks)
    return pd.concat(res)
def validate(date):
    """Return ``date`` shifted past a US federal holiday, plus a changed flag.

    Args:
        date: Date to test against the federal holidays from ``date`` onward.

    Returns:
        tuple: ``(date, adjusted)`` where ``adjusted`` is True only if
        ``date`` was a holiday and has been moved forward by one
        holiday-aware business day.
    """
    cal = USFederalHolidayCalendar()
    adjusted = date in cal.holidays(date)
    if adjusted:
        # Holiday-aware replacement for BDay: also skips federal holidays.
        holiday_aware_day = CustomBusinessDay(calendar=cal)
        date = date + holiday_aware_day
    return date, adjusted
def __init__(self,hour_to_wait_for,avoid_holidays=None,weekdays_to_avoid=None,logfile_path=None,logging_level=None):
    """Store the wake-up settings and pre-compute this year's holidays.

    Args:
        hour_to_wait_for: Stored unchanged in ``self.hour_to_wait_for``.
        avoid_holidays: Text flag; ``'true'`` (any case) enables holiday
            avoidance. ``None`` defaults to True.
        weekdays_to_avoid: Comma-separated weekday numbers; ``None`` means
            ``'5,6'``.
        logfile_path: Log file path; ``None`` means ``'logfile.log'``.
        logging_level: Log level name; ``None`` means ``'INFO'``.
    """
    log_target = logfile_path if logfile_path is not None else 'logfile.log'
    log_level = logging_level if logging_level is not None else 'INFO'
    self.logger = init_root_logger(log_target, log_level)

    self.hour_to_wait_for = hour_to_wait_for

    weekday_spec = weekdays_to_avoid if weekdays_to_avoid is not None else '5,6'
    self.days_to_go_back_to_sleep = [int(token) for token in weekday_spec.split(',')]

    if avoid_holidays is None:
        self.avoid_holidays = True
    else:
        self.avoid_holidays = avoid_holidays.lower()=='true'

    # all federal holidays from January 1 of this year through January 1 of
    # the next year
    this_year = dt.datetime.now().year
    next_year = this_year + 1
    calendar = USFederalHolidayCalendar()
    self.holidays = calendar.holidays(
        start=str(this_year)+'-01-01',
        end=str(next_year)+'-01-01').to_pydatetime()
def _fix_new_dates(df):
    """Fill in the rows whose ``dteday`` is missing.

    Year, month, hour, weekday: come from the (hourly) index
    holiday: from pandas' ``USFederalHolidayCalendar``, matched on the
        calendar day of the index entry
    working day: 1 if it is not a holiday and falls on Monday to Friday
    season, weathersit, temp, atemp, hum, windspeed: copied from the closest
        earlier row, in one-hour steps, that has a value for ``instant``
    casual, registered: rounded mean of the rows with the same hour and
        weekday
    cnt: sum of casual and registered

    Args:
        df: DataFrame indexed by hourly timestamps; modified in place.

    Returns:
        The same DataFrame with the missing rows filled.
    """
    # Holidays in the period covered by the data.
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2011-01-01', end='2012-12-31')
    weather_columns = ['weathersit', 'season', 'temp', 'atemp', 'hum',
                       'windspeed']
    one_hour = pd.Timedelta(1, unit='h')

    nan_index = df[df.dteday.isna()].index
    for index in nan_index:
        # Compare the calendar day: holiday stamps sit at midnight, so
        # testing the hourly timestamp itself only matched hour 0.
        is_holiday = index.normalize() in holidays

        df.loc[index, ['yr', 'mnth', 'hr']] = (0 if int(index.year) == 2011
                                               else 1, index.month, index.hour)
        df.loc[index, 'weekday'] = index.dayofweek
        df.loc[index, 'workingday'] = int(index.dayofweek < 5
                                          and not is_holiday)
        df.loc[index, 'holiday'] = int(is_holiday)

        # Walk back hour by hour to the nearest populated row; stop at the
        # edge of the index instead of raising KeyError.
        sub_index = index
        while sub_index in df.index and np.isnan(df.loc[sub_index, 'instant']):
            sub_index = sub_index - one_hour
        if sub_index in df.index:
            df.loc[index, weather_columns] = df.loc[sub_index, weather_columns]

        same_slot = ((df['hr'] == df.loc[index, 'hr'])
                     & (df['weekday'] == df.loc[index, 'weekday']))
        df.loc[index, 'casual'] = round(df.loc[same_slot, 'casual'].mean())
        df.loc[index, 'registered'] = round(
            df.loc[same_slot, 'registered'].mean())
        df.loc[index, 'cnt'] = (df.loc[index, 'casual']
                                + df.loc[index, 'registered'])
    return df
def solar_calcs(hourly_ac_list, tariff):
    """Total the hourly AC output and value it under a time-of-use tariff.

    Entry ``i`` of ``hourly_ac_list`` is taken as hour ``i`` of ``YEAR``,
    counted from January 1, 00:00. Weekends and US federal holidays are
    always billed off-peak; other days use the seasonal off-peak / part-peak
    / peak hours of the tariff.

    Args:
        hourly_ac_list: Hourly AC output values.
        tariff: Object with dict attributes ``seasons`` (keys
            ``'Summer Start'`` and ``'Winter Start'``, compared with midnight
            datetimes), ``hours`` and ``rates``.

    Returns:
        tuple: ``(total_ac_output / 1000, annual_kwh_solar_revenue / 1000)``.
    """
    cal = USFederalHolidayCalendar()
    # A set gives constant-time lookup inside the hourly loop.
    holidays = set(cal.holidays(start=str(YEAR) + '-01-01',
                                end=str(YEAR) + '-12-31').to_pydatetime())

    year_start = datetime.datetime(YEAR, 1, 1)
    one_hour = datetime.timedelta(hours=1)
    season = "winter"
    total_ac_output = 0
    annual_kwh_solar_revenue = 0

    for i, ac_output in enumerate(hourly_ac_list):
        total_ac_output += ac_output

        # Real calendar arithmetic replaces the hand-kept hour/day/month
        # counters, which assumed a 28-day February and ran past month 12.
        moment = year_start + i * one_hour
        hour = moment.hour
        date = moment.replace(hour=0)

        # The season flips on the configured start days and then persists.
        if date == tariff.seasons['Summer Start']:
            season = "summer"
        if date == tariff.seasons['Winter Start']:
            season = "winter"

        suffix = ' (' + season + ')'
        off_day = date.weekday() >= 5 or date in holidays
        if off_day:
            rate = tariff.rates['kWh Off Peak' + suffix]
        elif (hour < int(tariff.hours['Off Peak Morning Start' + suffix])
              or hour >= int(tariff.hours['Off Peak Evening End' + suffix])):
            rate = tariff.rates['kWh Off Peak' + suffix]
        elif (hour >= int(tariff.hours['Peak Start' + suffix])
              and hour < int(tariff.hours['Peak End' + suffix])):
            rate = tariff.rates['kWh Peak' + suffix]
        else:
            rate = tariff.rates['kWh Part Peak' + suffix]

        annual_kwh_solar_revenue += float(rate) * ac_output

    return total_ac_output/1000, annual_kwh_solar_revenue/1000
def create_date_features(df, start_date, end_date):
    """Expand the taxi pickup/dropoff timestamps of ``df`` into feature columns.

    For each of ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime`` (strings
    in ``%Y-%m-%dT%H:%M:%S.%f`` format) the columns ``<name>_years``,
    ``_months``, ``_days``, ``_hours``, ``_minutes``, ``_seconds``,
    ``_day_of_week`` and ``_is_holiday`` are added. ``_is_holiday`` is 1 when
    the calendar day is a US federal holiday between ``start_date`` and
    ``end_date``. A ``duration`` column holds the rounded trip length in
    minutes (dropoff minus pickup).

    ``df`` is modified in place and also returned.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'

    # US federal holidays in the requested window, as midnight datetimes.
    us_holidays = USFederalHolidayCalendar().holidays(
        start=start_date, end=end_date).to_pydatetime()

    parsed = {}
    for column in ("tpep_dropoff_datetime", "tpep_pickup_datetime"):
        stamps = [datetime.strptime(raw, timestamp_format) for raw in df[column]]
        parsed[column] = stamps

        df[column + "_years"] = [ts.year for ts in stamps]
        df[column + "_months"] = [ts.month for ts in stamps]
        df[column + "_days"] = [ts.day for ts in stamps]
        df[column + "_hours"] = [ts.hour for ts in stamps]
        df[column + "_minutes"] = [ts.minute for ts in stamps]
        df[column + "_seconds"] = [ts.second for ts in stamps]
        df[column + "_day_of_week"] = [ts.weekday() for ts in stamps]
        # Holidays are stored at midnight, so compare on the date part only.
        df[column + "_is_holiday"] = [
            int(datetime(ts.year, ts.month, ts.day) in us_holidays)
            for ts in stamps
        ]

    trips = zip(parsed["tpep_dropoff_datetime"], parsed["tpep_pickup_datetime"])
    df["duration"] = [
        round((dropoff - pickup).total_seconds() / 60.0)
        for dropoff, pickup in trips
    ]
    return df
def test_calendar(transform): start_date = datetime(2012, 1, 1) end_date = datetime(2012, 12, 31) calendar = USFederalHolidayCalendar() holidays = calendar.holidays(transform(start_date), transform(end_date)) expected = [ datetime(2012, 1, 2), datetime(2012, 1, 16), datetime(2012, 2, 20), datetime(2012, 5, 28), datetime(2012, 7, 4), datetime(2012, 9, 3), datetime(2012, 10, 8), datetime(2012, 11, 12), datetime(2012, 11, 22), datetime(2012, 12, 25) ] assert list(holidays.to_pydatetime()) == expected
def get_first_bday_of_month(mnth=None, yr=None):
    '''
    Return the first business day of the current month if no variables provided
    Return the first business day of the month and year provided if variables provided

    Business days are Monday-Friday excluding US federal holidays. If only
    one of ``mnth`` / ``yr`` is given it is ignored and the current month is
    used, as before.

    Tests:
    In [188]: config.get_first_bday_of_month(12,2015)
    Out[188]: datetime.date(2015, 12, 1)

    In [189]: config.get_first_bday_of_month(11,2015)
    Out[189]: datetime.date(2015, 11, 2)

    In [190]: config.get_first_bday_of_month(10,2015)
    Out[190]: datetime.date(2015, 10, 1)

    In [191]: config.get_first_bday_of_month(1,2016)
    Out[191]: datetime.date(2016, 1, 4)

    In [192]: config.get_first_bday_of_month(8,2015)
    Out[192]: datetime.date(2015, 8, 3)

    :param mnth: month number (1-12), or None for the current month
    :param yr: four-digit year, or None for the current year
    :return: datetime.date of the first business day of that month
    '''
    from calendar import monthrange
    from datetime import date
    from pandas.tseries.holiday import USFederalHolidayCalendar
    from pandas.tseries.offsets import CustomBusinessDay

    if yr is None or mnth is None:
        # Read the clock once so year and month cannot disagree across a
        # month boundary (and avoid pd.datetime, removed in pandas 2.0).
        today = date.today()
        yr, mnth = today.year, today.month

    # Step back to the month preceding the target month.
    if mnth == 1:
        prev_yr, prev_mnth = yr - 1, 12
    else:
        prev_yr, prev_mnth = yr, mnth - 1

    # Last calendar day of the preceding month; one custom business day
    # after it is the first business day of the target month.
    end_last = pd.Timestamp(year=prev_yr, month=prev_mnth,
                            day=monthrange(prev_yr, prev_mnth)[1])

    window = pd.Timedelta(days=60)
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=end_last - window,
                            end=end_last + window).to_pydatetime()
    bday_cus = CustomBusinessDay(holidays=holidays)
    return (end_last + bday_cus).date()
# NOTE(review): the statements down to the two ``append`` calls look like the
# tail of a per-datekey loop whose header is not part of this excerpt; their
# original nesting cannot be recovered from here -- confirm against the full
# script before relying on the layout below.
days_since_school_year_start = (date_of_interest - beginning_range).days
#for 2015/2016 School year
beginning_range = datetime.date(2015, 8, 31)
end_range = datetime.date(2016, 8, 29)
if beginning_range <= date_of_interest <= end_range:
    days_since_school_year_start = (date_of_interest - beginning_range).days
datekey_list.append(weather_datekey)
days_since_school_year_start_list.append(days_since_school_year_start)

# One-column frame of the school-year offsets, indexed by datekey.
days_since_school_year_start_db = {
    'days_since_school_year_start': pd.Series(days_since_school_year_start_list,
                                              index=datekey_list)}
days_since_school_year_start_df = pd.DataFrame(days_since_school_year_start_db)

#Account for major US federal holidays
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2013-01-01', end='2016-12-31').to_pydatetime()
# ``holidays`` holds datetime.datetime values, and a plain datetime.date never
# compares equal to a datetime.datetime. Testing a date against that array
# therefore always failed and flagged nothing. Compare against dates instead.
holiday_dates = {holiday.date() for holiday in holidays}

datekey_list = []
holidays_list = []
#for weather_datekey in weather_df.index.values:
for weather_datekey in weather_df.datekey:
    # datekey is read as YYYYMMDD
    year = int(str(weather_datekey)[:4])
    month = int(str(weather_datekey)[4:6])
    day = int(str(weather_datekey)[6:])
    date_of_interest = datetime.date(year, month, day)
    holidays_list.append(1 if date_of_interest in holiday_dates else 0)
    datekey_list.append(weather_datekey)

# One-column frame of 0/1 holiday flags, indexed by datekey.
holidays_db = {'holidays': pd.Series(holidays_list, index=datekey_list)}
holidays_df = pd.DataFrame(holidays_db)
# Weekend indicator: 1 when day_of_week is 5 or 6, else 0. The raw weekday
# column is not needed afterwards.
merged_master['weekend_day'] = merged_master['day_of_week'].isin([5, 6]).astype(int)
merged_master.drop(columns=['day_of_week'], inplace=True)

# One dummy column per calendar month found in the index (month_<n>).
merged_master['month'] = merged_master.index.month
dummy_df = pd.get_dummies(merged_master['month'], prefix='month')
merged_master = pd.concat([merged_master, dummy_df], axis=1)
merged_master.drop(columns=['month'], inplace=True)

# Holiday indicator: 1 when the row's calendar day is a US federal holiday.
# A temporary midnight-valued 'date' column is used for the lookup.
merged_master['date'] = pd.to_datetime(merged_master.index.date)
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=merged_master.date.min(), end=merged_master.date.max())
merged_master['holiday'] = merged_master['date'].isin(holidays).astype(int)
merged_master.drop(columns=['date'], inplace=True)

# Whole days elapsed since the earliest date in the dataset.
row_dates = merged_master.index.date
merged_master['days_since_first_date'] = (
    (row_dates - row_dates.min()).astype('timedelta64[D]').astype(int))

# Discard incomplete rows.
merged_master.dropna(axis=0, inplace=True)

# Move the station identifier to the last column under its final name.
merged_master['station_id'] = merged_master.pop('from_station_id')
def make_regression_features_oct(df):
    '''Create features for regression given a dataframe

    ``df`` must provide the columns ``dayssincestart``, ``dayofweek``
    (5 and 6 are treated as the weekend), ``dayofmonth`` and ``datetime``.

    Returns a DataFrame on the same index as ``df`` holding, in order:
    ``dayssincestart``; day-of-week and day-of-month dummies; the 0/1 flags
    ``week1``..``week4``, ``isweekend``, ``istueswed``, ``isfrisat``,
    ``x_isholiday`` (US federal holiday) and ``x_isholidaywknd`` (member of a
    run of three or more consecutive weekend/holiday rows); and twelve
    unnamed week x {tues/wed, fri/sat, weekend} interaction columns, which
    ``pd.concat`` labels with integers.
    '''
    from pandas.tseries.holiday import USFederalHolidayCalendar

    def as_flag(mask, name):
        # 0/1 integer Series that keeps df's index so concat aligns row-wise.
        return mask.astype(int).rename(name)

    dayofweek = df.dayofweek
    dayofmonth = df.dayofmonth

    x_dayssincestart = df.dayssincestart
    x_dayofweek = pd.get_dummies(dayofweek, prefix='dayofweek')
    x_dayofmonth = pd.get_dummies(dayofmonth, prefix='dayofmonth')

    x_week1 = as_flag(dayofmonth < 8, 'week1')
    x_week2 = as_flag((dayofmonth >= 8) & (dayofmonth < 16), 'week2')
    x_week3 = as_flag((dayofmonth >= 16) & (dayofmonth < 23), 'week3')
    x_week4 = as_flag(dayofmonth >= 23, 'week4')

    x_isweekend = as_flag(dayofweek.isin([5, 6]), 'isweekend')
    x_istueswed = as_flag(dayofweek.isin([1, 2]), 'istueswed')
    x_isfrisat = as_flag(dayofweek.isin([4, 5]), 'isfrisat')

    #get holidays
    # Holidays are midnight timestamps, so compare on the calendar day and
    # look them up in one vectorised pass.
    calendar = USFederalHolidayCalendar()
    days = pd.to_datetime(df.datetime).dt.normalize()
    holidays = calendar.holidays(start=days.min(), end=days.max())
    x_isholiday = as_flag(days.isin(holidays), 'x_isholiday')

    #find 3-day weekends
    # Work positionally so the scan does not depend on df's index labels,
    # and start at row 0 so a run that opens the data is not missed.
    off_day = (x_isweekend + x_isholiday).values > 0
    holiday_weekend = np.zeros(len(off_day))
    for i in range(len(off_day) - 2):
        if off_day[i] and off_day[i + 1] and off_day[i + 2]:
            holiday_weekend[i:i + 3] = 1
    x_isholidaywknd = pd.Series(holiday_weekend, index=df.index,
                                name='x_isholidaywknd')

    # Week-of-month interactions, grouped as tues/wed, fri/sat, weekend.
    # The operands have different names, so the products are unnamed.
    weeks = (x_week1, x_week2, x_week3, x_week4)
    interactions = [week & day_flag
                    for day_flag in (x_istueswed, x_isfrisat, x_isweekend)
                    for week in weeks]

    X_vars = pd.concat([x_dayssincestart, x_dayofweek, x_dayofmonth,
                        x_week1, x_week2, x_week3, x_week4,
                        x_isweekend, x_istueswed, x_isfrisat,
                        x_isholiday, x_isholidaywknd] + interactions,
                       axis=1)
    return X_vars