def add_time_cols(df: pd.DataFrame):
    """Add ``time_of_day`` and ``weekend_or_holiday`` columns from the index.

    ``df`` must be indexed by a ``DatetimeIndex``. The frame is modified in
    place and also returned.

    * ``time_of_day`` is ``hour * 100 + minute`` (e.g. 10:30 -> 1030).
    * ``weekend_or_holiday`` is True on Saturdays, Sundays and US federal
      holidays.
    """
    idx = df.index
    # Holidays are midnight timestamps, so compare calendar days rather than
    # full timestamps (or ``datetime.date`` objects, which a DatetimeIndex
    # membership test does not match).
    days = idx.normalize()
    if days.tz is not None:
        days = days.tz_localize(None)
    holidays = calendar().holidays(start=days.min(), end=days.max())

    df['time_of_day'] = (idx.hour.values * 100) + idx.minute.values
    df['weekend_or_holiday'] = (idx.weekday.values >= 5) | days.isin(holidays)
    return df
def create_features(df):
    """
    Derive calendar features from the ``date`` column of ``df``.

    Works on a copy of the input and returns it with these columns added:
    ``Date`` (midnight of each timestamp), ``year``, ``month``, ``day``
    (day of the year), ``hour``, ``weekday`` (English day name), ``season``
    (``season_calc`` applied to the month number) and ``holiday`` (1 on US
    federal holidays, else 0).
    """
    out = df.copy()

    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                 'Saturday', 'Sunday')
    weekday_labels = dict(enumerate(day_names))
    flag_to_int = {True: 1, False: 0}

    out['Date'] = pd.to_datetime(out.date.dt.date)
    out['year'] = out.date.dt.year
    out['month'] = out.date.dt.month
    # Despite the column name this is the day of the *year*.
    out['day'] = out.date.dt.dayofyear
    out['hour'] = out.date.dt.hour
    out['weekday'] = out.date.dt.weekday.map(weekday_labels)
    out['season'] = out.date.dt.month.apply(season_calc)

    first_day = out['Date'].min()
    last_day = out['Date'].max()
    federal = calendar().holidays(start=first_day, end=last_day)
    out['holiday'] = out['Date'].isin(federal)
    out.holiday = out.holiday.map(flag_to_int)
    return out
def set_holiday(data):
    """Add weekend / US-federal-holiday indicator columns to ``data`` in place.

    Reads the datetime column ``pickup_datetime`` and writes three 0/1
    integer columns:

    * ``isWeekend``   - Saturday or Sunday
    * ``isUSHoliday`` - the pickup day is a US federal holiday
    * ``isHoliday``   - either of the above

    Returns ``None``; the frame is mutated.
    """
    pickup = data['pickup_datetime']
    # Holidays are midnight timestamps: compare calendar days, and start the
    # window at midnight so a holiday on the first day is not dropped.
    pickup_days = pickup.dt.normalize()
    holidays = calendar().holidays(start=pickup_days.min(),
                                   end=pickup_days.max())

    # dayofweek: Monday=0 ... Saturday=5, Sunday=6.
    data['isWeekend'] = (pickup.dt.dayofweek >= 5).astype(int)
    data['isUSHoliday'] = pickup_days.isin(holidays).astype(int)
    data['isHoliday'] = data['isWeekend'] | data['isUSHoliday']
def getHolidays(df):
    """Return a boolean Series marking rows picked up on a US federal holiday.

    The result is aligned with ``df['tpep_pickup_datetime']``. Timestamps are
    reduced to their calendar day before the comparison because the holiday
    calendar only contains midnight timestamps.
    """
    pickup_days = df['tpep_pickup_datetime'].dt.normalize()
    holidays = calendar().holidays(start=pickup_days.min(),
                                   end=pickup_days.max())
    return pickup_days.isin(holidays)
def trips_data(filepath):
    """
    Read trips data and create: date, year, month, day, hour, week,
    holiday, weekend, duration_m.

    The CSV's second and third columns are parsed as datetimes. Column names
    are lower-cased with spaces replaced by underscores, and the station
    number columns are renamed to ``start_station_id`` / ``end_station_id``.

    ``day`` is the day of week (Monday=0 ... Sunday=6), ``week`` the ISO week
    number, ``holiday`` flags US federal holidays, ``weekend`` flags
    Saturday/Sunday and ``duration_m`` is ``duration`` divided by 60.
    """
    trips = pd.read_csv(filepath, parse_dates=[1, 2])

    # standardize column names
    trips.columns = [col.replace(' ', '_').lower() for col in trips.columns]

    # feature engineer date variables
    start = trips['start_date']
    # Holidays are midnight timestamps, so work with calendar days and start
    # the holiday window at midnight of the first day.
    start_days = start.dt.normalize()
    holidays = calendar().holidays(start_days.min(), start_days.max())

    trips['date'] = start.dt.date
    trips['year'] = start.dt.year.astype(int)
    trips['month'] = start.dt.month.astype(int)
    trips['day'] = start.dt.dayofweek.astype(int)
    trips['hour'] = start.dt.hour.astype(int)
    # ``.dt.week`` no longer exists in pandas 2; isocalendar() is its
    # replacement.
    trips['week'] = start.dt.isocalendar().week.astype(int)
    trips['holiday'] = start_days.isin(holidays)
    # dayofweek runs 0-6, so the weekend is 5 (Saturday) and 6 (Sunday).
    trips['weekend'] = trips['day'].isin([5, 6])
    trips['duration_m'] = trips['duration'] / 60

    trips.rename(
        columns={
            "start_station_number": "start_station_id",
            "end_station_number": "end_station_id",
        },
        inplace=True,
    )
    return trips
def localize_df(self, df, device):
    """
    Data from the VOLTTRON historian will be in UTC timezone.
    Regressions typically are meaningful for localtime as TCC
    agents utilize local time for predictions and control.

    The index is reset into columns, the ``Date`` column is converted to
    ``self.local_tz`` and, when ``self.exclude_weekends_holidays`` is set,
    rows that do not fall on a business day (weekends and US federal
    holidays) are dropped. With ``self.debug`` set, the converted frame is
    also written to a CSV file under ``WORKING_DIR``.

    :param df: frame whose index levels include ``Date``
    :param device: device name, used only in the debug file name
    :return: the localized (and possibly filtered) frame
    """
    df = df.reset_index()
    try:
        # Convert UTC time to local time in configuration file.
        df['Date'] = df['Date'].dt.tz_convert(self.local_tz)
    except Exception as e:
        # Deliberately best-effort: keep the data as-is and log the failure.
        _log.error('Failed to convert Date column to localtime - {}'.format(e))

    if self.debug:
        filename = '{}/{}-{} - {}.csv'.format(WORKING_DIR, self.start, self.end, device)
        try:
            with open(filename, 'w+') as outfile:
                df.to_csv(outfile, mode='a', index=True)
            _log.debug('*** Finished outputting data ***')
        except Exception as e:
            _log.error('File output failed, check whether the dataframe is empty - {}'.format(e))

    # Weekends and holidays will only be present if
    # one_shot is true.  For scheduled regression those
    # days are excluded from query to historian.
    if self.exclude_weekends_holidays:
        business_day = CustomBusinessDay(calendar=calendar())
        # ``onOffset`` is the deprecated spelling of ``is_on_offset``; prefer
        # the new name and fall back only when it is unavailable.
        is_business_day = getattr(business_day, 'is_on_offset', None)
        if is_business_day is None:
            is_business_day = business_day.onOffset
        match = df["Date"].map(is_business_day)
        df = df[match]
    return df
def expand_date(timeseries):
    """
    Expand a pandas datetime series into a dataframe of calendar features.

    Returned columns:
    - hour       : 0 - 23
    - year
    - month      : 1 - 12
    - day        : day of the month
    - weekday    : 0 Monday - 6 Sunday
    - holiday    : 1 on US federal holidays, else 0
    - workingday : 1 on a non-holiday Monday-Friday, else 0
    """
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

    assert type(
        timeseries) == pd.core.series.Series, 'input must be pandas series'
    assert timeseries.dtypes == 'datetime64[ns]', 'input must be pandas datetime'

    # Calendar day of every timestamp, once as a Series and once as an index
    # exposing the year/month/day/weekday fields.
    calendar_days = timeseries.dt.date
    day_index = pd.DatetimeIndex(calendar_days)

    federal = calendar().holidays(start=calendar_days.min(),
                                  end=calendar_days.max())
    on_holiday = calendar_days.astype('datetime64[ns]').isin(federal)

    df = pd.DataFrame()
    df['hour'] = timeseries.dt.hour
    for field in ('year', 'month', 'day', 'weekday'):
        df[field] = getattr(day_index, field)
    df['holiday'] = on_holiday.values.astype(int)

    is_working = (df['weekday'] < 5) & (df['holiday'] == 0)
    df['workingday'] = is_working.astype(int)
    return df
def build_features(time, temp, conditions):
    '''
    Build the 60-row feature frame for the hour following ``time``.

    need ['TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday', 'day_of_year',
          'total_days', 'Holiday']

    ``time`` is a ``pd.Timestamp``, ``temp`` a number and ``conditions`` a
    weather description string. One row is produced per minute, with
    ``hour_minute`` counting minutes since midnight; every other column holds
    the same value on all rows:

    * ``TMAX``        - ``temp * 10``
    * ``weekday``     - 1 for Monday-Friday, 0 for the weekend
    * ``day_of_year`` - day of the year of ``time``
    * ``total_days``  - whole days elapsed since July 1, 2013
    * ``ppt``         - 1 for heavy precipitation
    * ``some_ppt``    - 1 for light precipitation
    * ``Holiday``     - True when ``time`` falls on a US federal holiday
    '''
    columns = [
        'TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday', 'day_of_year',
        'total_days', 'Holiday'
    ]

    # Search anywhere in the description: "light rain" must hit both the
    # precipitation word and its "light" qualifier.
    # NOTE(review): matching is case-sensitive; presumably callers pass
    # lower-case text - confirm against the weather source.
    if re.search(r'thunderstorm|blizzard', conditions):
        some_ppt, ppt = 0, 1
    elif re.search(r'drizzle|showers', conditions):
        some_ppt, ppt = 1, 0
    elif re.search(r'rain|snow|hail', conditions):
        if re.search(r'light|chance', conditions):
            some_ppt, ppt = 1, 0
        else:
            some_ppt, ppt = 0, 1
    else:
        some_ppt, ppt = 0, 0

    # Holidays are midnight timestamps, so test the calendar day of ``time``.
    day = time.normalize()
    holidays = calendar().holidays(start=day, end=day)

    first_minute = time.hour * 60 + time.minute + 1
    return pd.DataFrame(
        {
            'TMAX': temp * 10,
            'some_ppt': some_ppt,
            'ppt': ppt,
            'hour_minute': list(range(first_minute, first_minute + 60)),
            # ``weekday`` is a method on Timestamp and has to be called.
            'weekday': int(time.weekday() < 5),
            'day_of_year': time.dayofyear,
            'total_days': (time - pd.to_datetime('July 1, 2013')).days,
            'Holiday': day in holidays,
        },
        columns=columns,
    )
def create_time_feature(df):
    """Add ``created_at_*`` calendar columns derived from ``created_at_datetime``.

    Columns written, in order: ``created_at_year``, ``created_at_month``,
    ``created_at_day``, ``created_at_date``, ``created_at_dayOfWeek``,
    ``created_at_time``, ``created_at_hour``, ``created_at_minute``,
    ``created_at_second``, ``created_at_isWeekend`` (1 on Saturday/Sunday,
    0 on Monday-Friday) and ``created_at_isHoliday`` (1 on US federal
    holidays, else 0). The frame is modified in place and returned.
    """
    stamps = df['created_at_datetime']

    # (column suffix, ``.dt`` accessor attribute), in output column order.
    parts = (
        ('year', 'year'),
        ('month', 'month'),
        ('day', 'day'),
        ('date', 'date'),
        ('dayOfWeek', 'dayofweek'),
        ('time', 'time'),
        ('hour', 'hour'),
        ('minute', 'minute'),
        ('second', 'second'),
    )
    # Evaluate every component before writing any column back.
    derived = [('created_at_' + suffix, getattr(stamps.dt, attr))
               for suffix, attr in parts]
    for column, values in derived:
        df[column] = values

    on_weekend = df['created_at_dayOfWeek'].isin([5, 6])
    on_weekday = df['created_at_dayOfWeek'].isin([0, 1, 2, 3, 4])
    df.loc[on_weekend, 'created_at_isWeekend'] = 1
    df.loc[on_weekday, 'created_at_isWeekend'] = 0

    federal = calendar().holidays(start=df['created_at_date'].min(),
                                  end=df['created_at_date'].max())
    on_holiday = df.created_at_datetime.dt.normalize().isin(federal)
    df['created_at_isHoliday'] = np.where(on_holiday, 1, 0)
    return df
def prophetModelandPrediction(train, demandForcastingData_train,
                              demandForcastingData_test, holiday_df):
    """Fit a Prophet model on the training frame and attach its predictions.

    The model is always fitted on ``demandForcastingData_train`` (index reset,
    ``ArrivalDate`` renamed to ``ds`` and ``Count`` to ``y``) using
    ``holiday_df`` as Prophet's holiday table; ``holiday_df['ds']`` is
    converted to datetime in place first.

    :param train: when truthy, predict on (and return) the training frame;
        otherwise predict on (and return) the test frame.
    :return: the chosen input frame with a ``Count_Prediction4`` column added
        in place holding the forecast ``yhat`` values.
    """
    holiday_df['ds'] = pd.to_datetime(holiday_df['ds'])

    model = Prophet(holidays=holiday_df)
    model.fit(demandForcastingData_train.reset_index().rename(columns={
        'ArrivalDate': 'ds',
        'Count': 'y'
    }))

    # Both modes do the same thing to a different frame.
    target = demandForcastingData_train if train else demandForcastingData_test
    forecast = model.predict(
        df=target.reset_index().rename(columns={'ArrivalDate': 'ds'}))
    target['Count_Prediction4'] = forecast.yhat.values
    return target
def generate_date_features(date_index):
    """Build a frame of 0/1 calendar indicator columns for ``date_index``.

    The result is indexed by ``date_index`` and has, in order:

    * ``mon`` ... ``sun``  - day of week
    * ``jan`` ... ``dec``  - month
    * ``Q1`` ... ``Q4``    - calendar quarter
    * ``y14`` ... ``y18``  - year 2014 ... 2018
    * ``w1`` ... ``w53``   - ISO week number
    * ``holiday``          - US federal holiday
    """
    dr = pd.DatetimeIndex(pd.to_datetime(date_index))
    columns = {}

    days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    for i, name in enumerate(days):
        columns[name] = (dr.weekday == i).astype(int)

    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
        'nov', 'dec'
    ]
    for i, name in enumerate(months):
        columns[name] = (dr.month == i + 1).astype(int)

    # Quarters are numbered from 1, like months.
    quarter = ['Q1', 'Q2', 'Q3', 'Q4']
    for i, name in enumerate(quarter):
        columns[name] = (dr.quarter == i + 1).astype(int)

    years = ['y14', 'y15', 'y16', 'y17', 'y18']
    for i, name in enumerate(years):
        columns[name] = (dr.year == i + 2014).astype(int)

    iso_week = dr.isocalendar().week.astype(int).to_numpy()
    for week in range(1, 54):
        columns['w{}'.format(week)] = (iso_week == week).astype(int)

    # TODO: add an 'xmas' indicator for the Dec 25 - Jan 5 window; the earlier
    # draft helper for it was never wired into the output.

    out_df = pd.DataFrame(columns, index=date_index)

    # Holidays are midnight timestamps, so compare calendar days.
    day_stamps = dr.normalize()
    holidays = calendar().holidays(start=day_stamps.min(), end=day_stamps.max())
    out_df["holiday"] = day_stamps.isin(holidays).astype(int)
    return out_df
def get_prediction_dataframe(series):
    """Turn a datetime-indexed series into a flat feature/target frame.

    The returned frame has a fresh integer index and the columns
    ``year_idx``, ``month_of_year``, ``day_of_week``, ``hour_of_day`` (all
    categorical), ``holiday`` (True when the timestamp's day is a US federal
    holiday) and ``target`` (the series values).
    """
    index = series.index
    # Holidays are midnight timestamps; comparing raw timestamps would only
    # ever flag the 00:00 observation of a holiday.
    days = index.normalize()
    holidays = calendar().holidays(start=days.min(), end=days.max())

    df = pd.DataFrame({
        "year_idx": index.year.astype("int64").to_numpy(),
        "month_of_year": index.month.astype("int64").to_numpy(),
        "day_of_week": index.dayofweek.astype("int64").to_numpy(),
        "hour_of_day": index.hour.astype("int64").to_numpy(),
        "holiday": days.isin(holidays),
        "target": series.values,
    })

    convert_type = {name: "category" for name in df.columns.values[:4]}
    return df.astype(convert_type)
def build_features(time, temp, conditions):
    '''
    Build the 60-row feature frame for the hour following ``time``.

    need ['TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday', 'day_of_year',
          'total_days', 'Holiday']

    ``time`` is a ``pd.Timestamp``, ``temp`` a number and ``conditions`` a
    weather description string. One row is produced per minute, with
    ``hour_minute`` counting minutes since midnight; every other column holds
    the same value on all rows:

    * ``TMAX``        - ``temp * 10``
    * ``weekday``     - 1 for Monday-Friday, 0 for the weekend
    * ``day_of_year`` - day of the year of ``time``
    * ``total_days``  - whole days elapsed since July 1, 2013
    * ``ppt``         - 1 for heavy precipitation
    * ``some_ppt``    - 1 for light precipitation
    * ``Holiday``     - True when ``time`` falls on a US federal holiday
    '''
    columns = ['TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday',
               'day_of_year', 'total_days', 'Holiday']

    # Search anywhere in the description: "light rain" must hit both the
    # precipitation word and its "light" qualifier.
    # NOTE(review): matching is case-sensitive; presumably callers pass
    # lower-case text - confirm against the weather source.
    if re.search(r'thunderstorm|blizzard', conditions):
        some_ppt, ppt = 0, 1
    elif re.search(r'drizzle|showers', conditions):
        some_ppt, ppt = 1, 0
    elif re.search(r'rain|snow|hail', conditions):
        if re.search(r'light|chance', conditions):
            some_ppt, ppt = 1, 0
        else:
            some_ppt, ppt = 0, 1
    else:
        some_ppt, ppt = 0, 0

    # Holidays are midnight timestamps, so test the calendar day of ``time``.
    day = time.normalize()
    holidays = calendar().holidays(start=day, end=day)

    first_minute = time.hour * 60 + time.minute + 1
    return pd.DataFrame(
        {
            'TMAX': temp * 10,
            'some_ppt': some_ppt,
            'ppt': ppt,
            'hour_minute': list(range(first_minute, first_minute + 60)),
            # ``weekday`` is a method on Timestamp and has to be called.
            'weekday': int(time.weekday() < 5),
            'day_of_year': time.dayofyear,
            'total_days': (time - pd.to_datetime('July 1, 2013')).days,
            'Holiday': day in holidays,
        },
        columns=columns,
    )
def temporal_features(self, df):
    """Add calendar features for the ``start_date`` / ``end_date`` columns.

    Writes ``{start,end}_{year,month,weekday,hour}``, then for the start
    timestamp ``is_start_holiday`` (US federal holiday), ``is_start_weekend``
    (Saturday/Sunday), ``is_start_working_day`` (neither of those) and
    ``start_year_part`` (``self.year_part(start_year, start_month)`` per row).
    All indicators are 0/1 integers. ``df`` is modified in place and returned.
    """
    for prefix in ('start', 'end'):
        stamps = df[prefix + '_date'].dt
        df[prefix + '_year'] = stamps.year
        df[prefix + '_month'] = stamps.month
        df[prefix + '_weekday'] = stamps.weekday
        df[prefix + '_hour'] = stamps.hour

    start = df['start_date']
    # Holidays are midnight timestamps, so compare calendar days and start the
    # window at midnight of the first day.
    start_days = start.dt.normalize()
    holidays = calendar().holidays(start=start_days.min(), end=start_days.max())
    df['is_start_holiday'] = np.where(start_days.isin(holidays), 1, 0)

    is_weekend = (start.dt.weekday == 5) | (start.dt.weekday == 6)
    df['is_start_working_day'] = np.where(
        ~is_weekend & (df['is_start_holiday'] == 0), 1, 0)
    df['is_start_weekend'] = np.where(is_weekend, 1, 0)

    df['start_year_part'] = df.apply(
        lambda x: self.year_part(x['start_year'], x['start_month']), axis=1)
    return df
def holiday_indicator(df):
    """Appends holiday (US Federal) indicator column.

    Reads the datetime column ``ts`` and writes a 0/1 integer column
    ``holiday`` to ``df`` in place. Returns ``None``.
    """
    # Holidays are midnight timestamps: compare calendar days, and start the
    # window at midnight so a holiday on the first day is included.
    days = df.ts.dt.normalize()
    holidays = calendar().holidays(start=days.min(), end=days.max())
    df['holiday'] = days.isin(holidays).astype(int)
def cross_holidays(joint_df):
    """Parse ``FlightDate`` and add a boolean ``is_holiday`` column.

    ``FlightDate`` is converted to datetime in place; ``is_holiday`` is True
    when the flight's calendar day is a US federal holiday. The frame is
    modified in place and returned.
    """
    # ``infer_datetime_format`` is deprecated and has no effect on the parsed
    # values, so it is no longer passed.
    joint_df['FlightDate'] = pd.to_datetime(joint_df['FlightDate'])

    # Holidays are midnight timestamps, so compare calendar days.
    flight_days = joint_df['FlightDate'].dt.normalize()
    holidays = calendar().holidays(start=flight_days.min(), end=flight_days.max())
    joint_df["is_holiday"] = flight_days.isin(holidays)
    return joint_df
def process_date(X):
    """Add ``weekday`` / ``IsHoliday`` to ``X`` and return a feature matrix.

    ``X`` needs ``Year``, ``Month``, ``Day`` and ``Hour`` columns. It is
    modified in place: ``weekday`` (Monday=0) and ``IsHoliday`` (1 on US
    federal holidays) are added. The return value is a 2-D array with the
    columns Year, Month, Day, Hour, weekday, IsHoliday.
    """
    stamps = pd.to_datetime(X[['Year', 'Month', 'Day']])
    X['weekday'] = stamps.dt.weekday

    federal = calendar().holidays(start=stamps.min(), end=stamps.max())
    X['IsHoliday'] = stamps.isin(federal).astype(int)

    output_fields = ('Year', 'Month', 'Day', 'Hour', 'weekday', 'IsHoliday')
    return np.column_stack([X[field] for field in output_fields])
def prepare_data(data_path):
    """Load the count data at ``data_path`` and return the model-ready frame.

    Rows with missing values are dropped. The result keeps ``date``, the
    log1p-transformed ``count``, a 0/1 ``holiday`` flag (US federal holidays)
    and one dummy column per observed month (``month<N>``) and day of week
    (``day<N>``, Monday=0), on a fresh integer index.
    """
    frame = pd.read_csv(data_path).dropna()
    frame['date'] = pd.to_datetime(frame.date)

    # Intermediate fields; all three are dropped again before returning.
    frame['age'] = frame.index.astype('int')
    frame['day'] = frame['date'].dt.dayofweek
    frame['month'] = frame['date'].dt.month

    federal = calendar().holidays(start=frame['date'].min(),
                                  end=frame['date'].max())
    frame['holiday'] = frame['date'].isin(federal).apply(int)

    frame = frame[['date', 'count', 'age', 'month', 'day', 'holiday']]

    # One-hot encode month, then day of week, naming columns "<field><value>".
    for field in ('month', 'day'):
        dummies = pd.get_dummies(frame[field], prefix=field, prefix_sep='')
        frame = pd.concat([frame, dummies], axis=1)

    frame = frame.reset_index(drop=True)
    frame['count'] = np.log1p(frame['count'])
    return frame.drop(['month', 'day', 'age'], axis=1).dropna()
def is_holiday(self):
    """Return 1 if ``self.usage_date`` is a US federal holiday, else 0.

    Only holidays between 2015-01-01 and 2020-12-31 are considered; dates
    outside that window always yield 0.
    """
    window_start = dt.date(2015, 1, 1)
    window_end = dt.date(2020, 12, 31)
    federal = calendar().holidays(start=window_start, end=window_end)
    return 1 if np.datetime64(self.usage_date) in federal else 0
def is_holiday(date):
    """Return True if ``date`` falls on a US federal holiday.

    ``date`` may be a ``datetime.date``, ``datetime.datetime`` or
    ``pd.Timestamp``; any time-of-day component is ignored.
    """
    # Holidays are midnight timestamps, so reduce the input to its calendar
    # day before testing membership.
    day = pd.Timestamp(date).normalize()
    month_start = day.replace(day=1)
    # 31 days from the first of the month always covers the whole month.
    hols = calendar().holidays(start=month_start,
                               end=month_start + pd.Timedelta(days=31))
    return day in hols
def add_holidays(train_df, test_df):
    """Add a boolean ``Holiday`` column to both frames.

    The holiday window spans the ``pickup_datetime`` values of the train and
    test frames together, and a row is flagged when its pickup day is a US
    federal holiday. Both frames are modified in place and returned as a
    ``(train_df, test_df)`` tuple.
    """
    # Holidays are midnight timestamps, so compare calendar days.
    train_days = train_df['pickup_datetime'].dt.normalize()
    test_days = test_df['pickup_datetime'].dt.normalize()

    # The concatenation is a plain Series, so it is reduced directly rather
    # than indexed by column name.
    all_days = pd.concat([train_days, test_days])
    holidays = calendar().holidays(start=all_days.min(), end=all_days.max())

    train_df['Holiday'] = train_days.isin(holidays)
    test_df['Holiday'] = test_days.isin(holidays)
    return train_df, test_df
def clean_data(df_data, features=None):
    '''
    Clean weather data and create features.

    The input needs a ``time`` column of epoch seconds. Data is averaged to
    one row per day, then calendar fields, a US federal holiday flag, 7-day
    rolling means and lead features (3 and 7 days ahead) are added. Missing
    values are filled with 0.

    INPUT: dataframe, list
        ``features`` - optional list of columns to keep; when None, every
        column except ``precipintensity`` / ``precipintensitymax`` is kept.
    OUTPUT: dataframe
    '''
    df = df_data.copy()
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df.set_index('time', inplace=True)
    df = df.resample('1D').mean()

    # feature creation
    df['dayofweek'] = df.index.weekday
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar().week.astype(int)

    # mark holidays
    cal = calendar()
    holidays = cal.holidays(start=df.index.min(), end=df.index.max())
    df['holiday'] = 0
    df.loc[df.index.isin(holidays), 'holiday'] = 1

    # rolling means
    c = [
        'apparenttemperaturemax', 'apparenttemperaturemin', 'temperaturemax',
        'temperaturemin'
    ]
    d = ['7']
    for col in c:
        for day in d:
            df[col + day] = df[col].rolling(int(day)).mean()

    # create lag features (negative shifts pull values from later days)
    c = ['apparenttemperaturemax', 'apparenttemperaturemin', 'windspeed']
    d = ['-3', '-7']
    for col in c:
        for day in d:
            df[col + day] = df[col].shift(int(day))

    # impute null values
    df.fillna(0, inplace=True)

    if features is None:
        # drop unneeded columns
        df.drop(['precipintensity', 'precipintensitymax'],
                axis=1,
                inplace=True)
    else:
        # use only specified features
        df = df[features]
    return df
def dt_range(self):
    """Return the business days among the last ``self.trail_days`` days.

    The window ends today (inclusive). Weekends and US federal holidays are
    removed, and the result is a list of midnight ``datetime`` objects in
    ascending order.
    """
    one_day = datetime.timedelta(days=1)
    current = datetime.datetime.now().date()
    midnight = datetime.datetime(current.year, current.month, current.day)

    window_start = midnight - one_day * self.trail_days
    closed = [stamp.to_pydatetime()
              for stamp in calendar().holidays(window_start, midnight)]

    candidates = sorted(midnight - one_day * back
                        for back in range(self.trail_days))
    return [day for day in candidates
            if day.weekday() < 5 and day not in closed]
def holiday_checker(release_date):
    """Return 1 if a US federal holiday falls near ``release_date``, else 0.

    ``release_date`` is a ``'%m/%d/%Y'`` string. The window checked runs from
    7 days before to 5 days after the release date, inclusive. The holidays
    found are printed.
    """
    released = datetime.strptime(release_date, '%m/%d/%Y')
    startDate = released - timedelta(days=7)
    endDate = released + timedelta(days=5)

    holidays = calendar().holidays(start=startDate, end=endDate).to_pydatetime()
    print(holidays)

    # ``holidays`` is a numpy array: test its length, because truth-testing an
    # array holding more than one element raises ValueError.
    return 1 if len(holidays) > 0 else 0
def is_holiday(cls):
    """
    Class-level holiday test for ``usage_date``.

    Builds the US federal holidays from 2015-01-01 to 2020-12-31 and returns
    whatever ``cls.usage_date.in_(holidays)`` produces (presumably a
    SQLAlchemy ``IN`` clause - confirm against the model definition).

    Reference - https://stackoverflow.com/questions/64276059
    """
    federal = calendar().holidays(start=dt.date(2015, 1, 1),
                                  end=dt.date(2020, 12, 31))
    return cls.usage_date.in_(federal)
def get_features_dataframe(
    series: pd.Series,
    time_features: List[TimeFeature],
    lag_indices: List[int],
    past_data: Optional[pd.Series] = None,
) -> pd.DataFrame:
    """Constructs a DataFrame of features for a given Series.

    Features include some date-time features (like hour-of-day, day-of-week,
    ...) and lagged values from the series itself. Lag indices are specified
    by `lag_indices`, while previous data can be specified by `past_data`:
    the latter allows to get lags also for the initial values of the series.

    Parameters
    ----------
    series
        Series on which features should be computed.
    time_features
        List of time features to be included in the data frame. Each one is
        called with ``series.index`` and stored under its class name.
    lag_indices
        List of indices of lagged observations to be included as features;
        lag ``k`` is stored in column ``lag_k``.
    past_data
        Prior data, to be used to compute lagged observations. Must have the
        same index frequency as `series` and end before `series` starts.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the features plus a ``target`` column with the
        series values. This has the same index as `series`.
    """
    # TODO check if anything can be optimized here
    assert past_data is None or series.index.freq == past_data.index.freq
    assert past_data is None or series.index[0] > past_data.index[-1]

    time_feature_columns = {
        feature.__class__.__name__: feature(series.index)
        for feature in time_features
    }

    # ``Series.append`` no longer exists in pandas 2; ``pd.concat`` is the
    # supported way to join the two series.
    if past_data is None:
        all_data = series
    else:
        all_data = pd.concat([past_data, series]).asfreq(series.index.freq)

    lag_columns = {
        f"lag_{idx}": all_data.shift(idx)[series.index].values
        for idx in lag_indices
    }

    columns = {**time_feature_columns, **lag_columns, "target": series.values}
    return pd.DataFrame(columns, index=series.index)
class custom_calendar(AbstractHolidayCalendar):
    """US federal holiday calendar extended with a few extra observances."""

    # Extra dates layered on top of the federal rules.
    new_rules = [
        Holiday('Halloween', month=10, day=31),
        Holiday('Christmas Eve', month=12, day=24),
        Holiday('New Years Eve', month=12, day=31),
        # Second Sunday on or after March 1.
        Holiday(
            'DST time change',
            month=3,
            day=1,
            offset=pd.DateOffset(weekday=SU(2)),
        ),
    ]

    # Federal rules first, then the additions.
    rules = [*calendar().rules, *new_rules]
def get_holidays(x): cal = calendar() holidays = cal.holidays(start=x.min(), end=x.max(), return_name=True) holidays = cal.holidays(start='2015-01-01', end='2016-12-31', return_name=True) holidays = holidays[~holidays.isin(['Presidents Day', 'Columbus Day', 'Veterans Day'])] mlk_days = holidays[holidays == 'Dr. Martin Luther King Jr.'] return holidays, mlk_days
def clean_data(df_data, features=None):
    '''
    Clean weather data and create features.

    The input needs a ``time`` column of epoch seconds. Data is averaged to
    one row per day, then calendar fields, a US federal holiday flag, 7-day
    rolling means and lead features (3 and 7 days ahead) are added. Missing
    values are filled with 0.

    INPUT: dataframe, list
        ``features`` - optional list of columns to keep; when None, every
        column except ``precipintensity`` / ``precipintensitymax`` is kept.
    OUTPUT: dataframe
    '''
    df = df_data.copy()
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df.set_index('time', inplace=True)
    df = df.resample('1D').mean()

    # feature creation
    df['dayofweek'] = df.index.weekday
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar().week.astype(int)

    # mark holidays
    cal = calendar()
    holidays = cal.holidays(start=df.index.min(), end=df.index.max())
    df['holiday'] = 0
    df.loc[df.index.isin(holidays), 'holiday'] = 1

    # rolling means
    c = ['apparenttemperaturemax', 'apparenttemperaturemin',
         'temperaturemax', 'temperaturemin']
    d = ['7']
    for col in c:
        for day in d:
            df[col + day] = df[col].rolling(int(day)).mean()

    # create lag features (negative shifts pull values from later days)
    c = ['apparenttemperaturemax', 'apparenttemperaturemin', 'windspeed']
    d = ['-3', '-7']
    for col in c:
        for day in d:
            df[col + day] = df[col].shift(int(day))

    # impute null values
    df.fillna(0, inplace=True)

    if features is None:
        # drop unneeded columns
        df.drop(['precipintensity', 'precipintensitymax'],
                axis=1, inplace=True)
    else:
        # use only specified features
        df = df[features]
    return df
def transform(self, X, y=None):
    """Flag which rows of ``X`` fall on a US federal holiday.

    ``X`` must be indexed by a ``DatetimeIndex``; ``y`` is ignored. Returns a
    single-column boolean DataFrame named ``date`` with the same index as
    ``X``. Holidays from 2000-01-01 to 2050-01-01 are recognised.
    """
    cal = calendar()
    holidays = cal.holidays(start='2000-01-01', end='2050-01-01')

    # Reduce each timestamp to its calendar day, since holidays are midnight
    # timestamps. ``pd.to_datetime`` is used because a unit-less
    # ``astype('datetime64')`` is rejected by pandas 2.
    row_days = pd.to_datetime(pd.Series(X.index.date, index=X.index, name='date'))
    return row_days.isin(holidays).to_frame()
def download_pulse_range(start, end, pulsedir):
    """Download pulse data for every weekday between ``start`` and ``end``.

    ``pulsedir`` is created if missing. Weekend days are skipped with a
    ``"<day> (Weekend)"`` message. Weekdays are always downloaded; on a US
    federal holiday the output is prefixed with ``"(Holiday)..."``.
    """
    # The holiday table does not change during the run, so build it once.
    holidays = calendar().holidays()
    os.makedirs(pulsedir, exist_ok=True)

    for day in _daterange(start, end):
        if day.weekday() >= 5:  # Saturday, Sunday == 5, 6
            print('%s (Weekend)' % str(day))
            continue
        # Coerce to a midnight Timestamp so plain ``date`` values and
        # datetimes with a time component both match the holiday index.
        if pd.Timestamp(day).normalize() in holidays:
            # No newline: the prefix shares a line with whatever the download
            # step prints next.
            print('(Holiday)...', end=' ')
        _download_pulse(day, pulsedir)
def _remove_WE_holidays_NaN(data): no_WE = ~((data.index.weekday == 5) | (data.index.weekday == 6)) # remove if WE cal = calendar() start = datetime.datetime.strftime(data.index.min(),"%Y-%m-%d") end =datetime.datetime.strftime(data.index.max(),"%Y-%m-%d") hol_cal = cal.holidays(start=start, end=end) no_hol = ~data.index.isin(hol_cal) # remove if it is a national holiday no_NaN = ~data.isna().all(axis=1) # remove if has any NaN for any hour return data[no_WE & no_hol & no_NaN]
def encode_dataset(train, test, meta, target_model='xgb'):
    """Encode train and test together according to ``meta``.

    ``meta['target']`` names the target column, which is split off ``train``.
    ``meta['cols']`` maps each column name to a handling code:

    * ``'CAT'``  - fill missing with ``'missing'`` and label-encode (cast to
      int unless ``target_model`` is ``'xgb'``)
    * ``'NUM'``  - fill missing with -1
    * ``'DATE'`` - replace with the day of week (Monday=0)
    * ``'REM'``  - drop the column
    * ``'LEN'``  - replace with ``<col>_len`` computed by ``count_desc_len``

    Returns ``(all_data, y_train)`` where ``all_data`` is train stacked on
    top of test. Raises ``Exception`` for an unknown code and
    ``AssertionError`` when train and test columns differ.
    """
    y_train = train[meta['target']]
    train = train.drop([meta['target']], axis=1)

    # Train and test must line up column for column before stacking.
    assert train.shape[1] == test.shape[1]
    for train_col, test_col in zip(train.columns, test.columns):
        assert train_col == test_col
    train_obs = len(train)

    all_data = pd.concat([train, test], axis=0)
    for i, (f, kind) in enumerate(meta['cols'].items()):
        print(i, f, kind)
        if kind == 'CAT':
            all_data[f] = all_data[f].fillna('missing')
            encoder = LabelEncoder()
            encoder.fit(all_data[f])
            if target_model == 'xgb':
                all_data[f] = encoder.transform(all_data[f])
            else:
                all_data[f] = encoder.transform(all_data[f]).astype(int)
        elif kind == 'NUM':
            all_data[f] = all_data[f].fillna(-1)
        elif kind == 'DATE':
            all_data[f] = pd.to_datetime(all_data[f]).dt.weekday
        elif kind == 'REM':
            all_data = all_data.drop(f, axis=1)
        elif kind == 'LEN':
            all_data[f + '_len'] = all_data[f].apply(count_desc_len)
            all_data = all_data.drop(f, axis=1)
        else:
            raise Exception(str(kind) + ":unknown mapping")

    assert train_obs == len(y_train)
    return all_data, y_train
# Presumably, crime rates will be different on working days on the
# one hand and weekends and holidays on the other hand.
# So we'll introduce a column WorkingDay

# Data on holidays and which businesses actually observe them are sketchy
# at best, so we'll only count the most important ones as holidays:
# - New Year
# - Memorial Day
# - Independence Day
# - Labor Day
# - Thanksgiving
# - Black Friday
# - Christmas
cal = calendar()

# The rule list is shared by every instance of the federal calendar class, so
# build a filtered copy instead of popping from it. Rules are selected by name
# because their positions (and exact names) differ between pandas versions.
droppedKeywords = ('martin luther king', 'president', 'washington',
                   'columbus', 'veterans', 'juneteenth')
keptRules = [
    rule for rule in cal.rules
    if not any(keyword in rule.name.lower() for keyword in droppedKeywords)
]

# create new rule for Black Friday: the day after Thanksgiving (4th Thursday
# of November), i.e. the 4th Friday on or after November 2. Counting from
# November 1 would be a week early in years where November starts on a Friday.
USBlackFriday = Holiday('BlackFriday', month=11, day=2,
                        offset=DateOffset(weekday=FR(4)))

# create own holiday calendar based on the above rules
ownCal = HolidayCalendarFactory('OwnCalendar', keptRules, USBlackFriday)
cal = ownCal()

# pass return_name=True to also get the name of each holiday
holidays = cal.holidays(start='2003-01-01', end='2015-05-13')