Пример #1
0
def add_time_cols(df: pd.DataFrame):
    """Add ``time_of_day`` and ``weekend_or_holiday`` columns to ``df`` in place.

    ``df`` must be indexed by a ``DatetimeIndex``.

    * ``time_of_day`` is the clock time encoded as ``hour * 100 + minute``
      (09:15 becomes 915).
    * ``weekend_or_holiday`` is True on Saturdays, Sundays and on days that
      the US federal holiday calendar lists.

    Returns the same (mutated) frame.
    """
    # Holidays are naive midnight stamps, so compare on the calendar day only:
    # strip any timezone (keeping local wall time) and the time of day.
    days = df.index.tz_localize(None).normalize()
    holidays = calendar().holidays(start=days.min(), end=days.max())

    df['time_of_day'] = (df.index.hour.values * 100) + df.index.minute.values
    # Plain arrays are assigned positionally, which also works when the index
    # contains duplicate timestamps.
    df['weekend_or_holiday'] = (df.index.weekday >= 5) | days.isin(holidays)
    return df
def create_features(df):
    """
    Derive calendar features from the ``date`` column of ``df``.

    Works on a copy and returns it with these extra columns: ``Date`` (the
    timestamp truncated to its calendar day), ``year``, ``month``, ``day``
    (day of the year), ``hour``, ``weekday`` (English day name), ``season``
    (``season_calc`` applied to the month number) and ``holiday`` (1 when
    ``Date`` is a US federal holiday, otherwise 0).
    """
    weekday_names = dict(enumerate([
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    ]))

    out = df.copy()
    stamps = out['date']

    out['Date'] = pd.to_datetime(stamps.dt.date)
    out['year'] = stamps.dt.year
    out['month'] = stamps.dt.month
    out['day'] = stamps.dt.dayofyear
    out['hour'] = stamps.dt.hour
    out['weekday'] = stamps.dt.weekday.map(weekday_names)
    out['season'] = stamps.dt.month.apply(season_calc)

    federal = calendar().holidays(start=out['Date'].min(),
                                  end=out['Date'].max())
    # Encode the boolean membership test as 1 / 0.
    out['holiday'] = out['Date'].isin(federal).map({True: 1, False: 0})

    return out
Пример #3
0
def set_holiday(data):
    """Add weekend / US-federal-holiday indicator columns to ``data`` in place.

    Reads the datetime column ``pickup_datetime`` and writes three 0/1 integer
    columns:

    * ``isWeekend``   - pickup falls on a Saturday or Sunday,
    * ``isUSHoliday`` - pickup day is a US federal holiday,
    * ``isHoliday``   - either of the above.

    Nothing is returned; the frame is modified in place.
    """
    pickups = data['pickup_datetime']
    # The calendar yields midnight stamps, so match on the day, not the instant.
    pickup_days = pickups.dt.normalize()
    holidays = calendar().holidays(start=pickup_days.min(),
                                   end=pickup_days.max())

    # dayofweek: Monday=0 ... Saturday=5, Sunday=6.
    data['isWeekend'] = (pickups.dt.dayofweek >= 5).astype(int)
    data['isUSHoliday'] = pickup_days.isin(holidays).astype(int)
    data['isHoliday'] = data['isWeekend'] | data['isUSHoliday']
Пример #4
0
def getHolidays(df):
    """Return a boolean Series flagging rows picked up on a US federal holiday.

    The result is aligned with ``df`` and is True where the calendar day of
    ``tpep_pickup_datetime`` is a US federal holiday.
    """
    # Compare calendar days: holiday stamps are midnights, pickups are not.
    pickup_days = df['tpep_pickup_datetime'].dt.normalize()
    holidays = calendar().holidays(start=pickup_days.min(),
                                   end=pickup_days.max())
    return pickup_days.isin(holidays)
Пример #5
0
def trips_data(filepath):
    """
    Read a trips CSV and add date features.

    The CSV columns at positions 1 and 2 are parsed as datetimes. Column names
    are lower-cased with spaces replaced by underscores, so the file is
    expected to provide ``start_date`` and ``duration`` after renaming.

    Added columns: ``date``, ``year``, ``month``, ``day`` (day of week,
    Monday=0), ``hour``, ``week`` (ISO week number), ``holiday`` (US federal
    holiday), ``weekend`` (Saturday or Sunday) and ``duration_m``
    (``duration`` divided by 60). ``start_station_number`` and
    ``end_station_number`` are renamed to ``*_station_id`` when present.

    Returns the resulting DataFrame.
    """
    trips = pd.read_csv(filepath, parse_dates=[1, 2])

    # standardize column names
    trips.columns = [col.replace(' ', '_').lower() for col in trips.columns]

    # feature engineer date variables
    start = trips['start_date']
    # Holidays are midnight stamps, so both the lookup window and the
    # membership test have to work on whole days.
    start_day = start.dt.normalize()
    holidays = calendar().holidays(start_day.min(), start_day.max())

    trips['date'] = start.dt.date
    trips['year'] = start.dt.year.astype(int)
    trips['month'] = start.dt.month.astype(int)
    trips['day'] = start.dt.dayofweek.astype(int)
    trips['hour'] = start.dt.hour.astype(int)
    # ``Series.dt.week`` no longer exists in current pandas.
    trips['week'] = start.dt.isocalendar().week.astype(int)
    trips['holiday'] = start_day.isin(holidays)
    # dayofweek runs 0 (Monday) .. 6 (Sunday): the weekend is 5 and 6.
    trips['weekend'] = trips['day'].isin([5, 6])
    trips['duration_m'] = trips['duration'] / 60
    trips.rename({
        "start_station_number": "start_station_id",
        "end_station_number": "end_station_id"
        }, axis=1, inplace=True)

    return trips
Пример #6
0
    def localize_df(self, df, device):
        """
        Convert the ``Date`` index of ``df`` to local time and optionally
        drop weekends and US federal holidays.

        Data from the VOLTTRON historian will be in UTC timezone.
        Regressions typically are meaningful for localtime as TCC
        agents utilize local time for predictions and control.

        :param df: DataFrame whose index becomes a ``Date`` column after
            ``reset_index``.
        :param device: device name, only used to build the debug CSV filename.
        :return: DataFrame with ``Date`` as a column, converted to
            ``self.local_tz`` when the conversion succeeds, and restricted to
            business days when ``self.exclude_weekends_holidays`` is set.
        """
        df = df.reset_index()
        try:
            # Convert UTC time to local time in configuration file.
            df['Date'] = df['Date'].dt.tz_convert(self.local_tz)
        except Exception as e:
            # Best effort: keep going with the unconverted timestamps.
            _log.error('Failed to convert Date column to localtime - {}'.format(e))
        if self.debug:
            filename = '{}/{}-{} - {}.csv'.format(WORKING_DIR, self.start, self.end, device)
            try:
                # Let pandas open (and truncate) the file itself.
                df.to_csv(filename, index=True)
                _log.debug('*** Finished outputting data ***')
            except Exception as e:
                _log.error('File output failed, check whether the dataframe is empty - {}'.format(e))

        # Weekends and holidays will only be present if
        # one_shot is true.  For scheduled regression those
        # days are excluded from query to historian.
        if self.exclude_weekends_holidays:
            # A custom business day built on the federal calendar is "on
            # offset" exactly for days that are neither weekend nor holiday.
            # ``is_on_offset`` replaces the removed ``onOffset`` spelling.
            business_day = CustomBusinessDay(calendar=calendar())
            match = df["Date"].map(business_day.is_on_offset)
            df = df[match.astype(bool)]
        return df
Пример #7
0
def expand_date(timeseries):
    """
    Break a pandas datetime Series into calendar feature columns.

    The returned DataFrame shares the index of ``timeseries`` and contains:
    - hour : 0 - 23
    - year
    - month : 1 - 12
    - day : day of the month
    - weekday : 0 Monday - 6 Sunday
    - holiday : 1 on a US federal holiday, otherwise 0
    - workingday : 1 on a Monday-Friday that is not a holiday, otherwise 0

    An AssertionError is raised unless the input is exactly a ``pd.Series``
    of dtype ``datetime64[ns]``.
    """
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

    assert type(timeseries) is pd.Series, 'input must be pandas series'
    assert timeseries.dtypes == 'datetime64[ns]', 'input must be pandas datetime'

    calendar_days = timeseries.dt.date
    day_index = pd.DatetimeIndex(calendar_days)

    df = pd.DataFrame({'hour': timeseries.dt.hour})
    # Index-valued attributes are assigned positionally, row for row.
    for part in ('year', 'month', 'day', 'weekday'):
        df[part] = getattr(day_index, part)

    federal = calendar().holidays(start=calendar_days.min(),
                                  end=calendar_days.max())
    on_holiday = calendar_days.astype('datetime64[ns]').isin(federal)
    df['holiday'] = on_holiday.values.astype(int)

    is_workday = (df['weekday'] < 5) & (df['holiday'] == 0)
    df['workingday'] = is_workday.astype(int)

    return df
Пример #8
0
def build_features(time, temp, conditions):
    '''
    Build a 60-row feature frame for the hour that starts at ``time``.

    Columns: ['TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday',
    'day_of_year', 'total_days', 'Holiday'].

    time       -- pandas Timestamp (``dayofyear`` and ``normalize`` are used).
    temp       -- temperature; stored multiplied by 10 in ``TMAX``.
    conditions -- free-text weather description, matched case-sensitively.

    ``hour_minute`` holds the next 60 minute-of-day values; every other
    column carries one value repeated on all rows. ``weekday`` is 1 for
    Monday-Friday and 0 otherwise. ``total_days`` counts whole days since
    July 1, 2013. ``Holiday`` is True on a US federal holiday.
    '''
    first_minute = time.hour * 60 + time.minute + 1

    feature_df = pd.DataFrame(columns=[
        'TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday', 'day_of_year',
        'total_days', 'Holiday'
    ])
    feature_df['hour_minute'] = range(first_minute, first_minute + 60)
    feature_df['TMAX'] = temp * 10
    # ``weekday`` is a method on Timestamp / datetime and has to be called.
    feature_df['weekday'] = int(time.weekday() < 5)
    feature_df['day_of_year'] = time.dayofyear
    feature_df['total_days'] = (time - pd.to_datetime('July 1, 2013')).days

    # ``search`` rather than ``match``: the keywords may appear anywhere in
    # the description (e.g. "light rain"), not only at its very start.
    if re.search(r'thunderstorm|blizzard', conditions):
        some_ppt, ppt = 0, 1
    elif re.search(r'drizzle|showers', conditions):
        some_ppt, ppt = 1, 0
    elif re.search(r'rain|snow|hail', conditions):
        if re.search(r'light|chance', conditions):
            some_ppt, ppt = 1, 0
        else:
            some_ppt, ppt = 0, 1
    else:
        some_ppt, ppt = 0, 0
    feature_df['some_ppt'] = some_ppt
    feature_df['ppt'] = ppt

    # Holidays are midnight stamps; look the calendar day up, not the instant.
    day = time.normalize()
    holidays = calendar().holidays(start=day, end=day)
    feature_df['Holiday'] = day in holidays
    return feature_df
Пример #9
0
def create_time_feature(df):
    """Add ``created_at_*`` calendar columns derived from ``created_at_datetime``.

    Columns written, in this order: year, month, day, date, dayOfWeek, time,
    hour, minute, second, then ``created_at_isWeekend`` (1 for Saturday or
    Sunday, 0 for Monday-Friday) and ``created_at_isHoliday`` (1 on a US
    federal holiday, otherwise 0). The frame is modified in place and
    returned.
    """
    stamps = df['created_at_datetime']
    parts = (
        ('created_at_year', 'year'),
        ('created_at_month', 'month'),
        ('created_at_day', 'day'),
        ('created_at_date', 'date'),
        ('created_at_dayOfWeek', 'dayofweek'),
        ('created_at_time', 'time'),
        ('created_at_hour', 'hour'),
        ('created_at_minute', 'minute'),
        ('created_at_second', 'second'),
    )
    # Compute every component first, then attach them in the listed order.
    derived = [(column, getattr(stamps.dt, attribute))
               for column, attribute in parts]
    for column, values in derived:
        df[column] = values

    day_number = df['created_at_dayOfWeek']
    df.loc[day_number.isin([5, 6]), 'created_at_isWeekend'] = 1
    df.loc[day_number.isin([0, 1, 2, 3, 4]), 'created_at_isWeekend'] = 0

    first_day = df['created_at_date'].min()
    last_day = df['created_at_date'].max()
    federal = calendar().holidays(start=first_day, end=last_day)
    midnights = df.created_at_datetime.dt.normalize()
    df['created_at_isHoliday'] = np.where(midnights.isin(federal), 1, 0)
    return df
Пример #10
0
def prophetModelandPrediction(train, demandForcastingData_train,
                              demandForcastingData_test, holiday_df):
    """Fit a Prophet model on the training frame and attach its predictions.

    The model is built with ``holiday_df`` as its holidays table (its ``ds``
    column is converted to datetimes in place) and fitted on
    ``demandForcastingData_train`` with the index reset and the columns
    ``ArrivalDate`` / ``Count`` renamed to ``ds`` / ``y``.

    When ``train`` is truthy the training frame is scored, otherwise the test
    frame. The scored frame receives a ``Count_Prediction4`` column holding
    the forecast ``yhat`` values and is returned.
    """
    holiday_df['ds'] = pd.to_datetime(holiday_df['ds'])

    model = Prophet(holidays=holiday_df)
    fit_frame = demandForcastingData_train.reset_index().rename(
        columns={'ArrivalDate': 'ds', 'Count': 'y'})
    model.fit(fit_frame)

    if train:
        scored = demandForcastingData_train
    else:
        scored = demandForcastingData_test

    forecast = model.predict(
        df=scored.reset_index().rename(columns={'ArrivalDate': 'ds'}))
    scored['Count_Prediction4'] = forecast.yhat.values
    return scored
Пример #11
0
def generate_date_features(date_index):
    out_df = pd.DataFrame(index=date_index)
    days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    for i in range(len(days)):
        kwargs = {days[i]: date_index.map(lambda row: int(row.weekday() == i))}
        out_df = out_df.assign(**kwargs)

    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
        'nov', 'dec'
    ]
    for i in range(len(months)):
        kwargs = {
            months[i]: date_index.map(lambda row: int(row.month == i + 1))
        }
        out_df = out_df.assign(**kwargs)

    quarter = ['Q1', 'Q2', 'Q3', 'Q4']
    for i in range(len(quarter)):
        kwargs = {
            quarter[i]: date_index.map(lambda row: int(row.quarter == i))
        }
        out_df = out_df.assign(**kwargs)

    years = ['y14', 'y15', 'y16', 'y17', 'y18']
    for i in range(len(years)):
        kwargs = {
            years[i]: date_index.map(lambda row: int(row.year == i + 2014))
        }
        out_df = out_df.assign(**kwargs)

    weeks = ['w{}'.format(i) for i in range(1, 54)]
    for i in range(len(weeks)):
        kwargs = {
            weeks[i]:
            date_index.map(lambda row: int(row.isocalendar()[1] == i + 1))
        }
        out_df = out_df.assign(**kwargs)

    # TODO: fix this
    def is_xmas_new_year(row):
        ret = ((dt.datetime(row.year, 12, 25) < row
                and row < dt.datetime(row.year + 1, 1, 5))
               or (dt.datetime(row.year - 1, 12, 25) < row
                   and row < dt.datetime(row.year, 1, 5)))
        return int(ret)

    # kwargs = {'xmas': date_index.map(is_xmas_new_year)}
    # out_df = out_df.assign(**kwargs)
    # print(out_df.head())

    dr = pd.to_datetime(pd.to_datetime(date_index))
    cal = calendar()
    holidays = cal.holidays(start=dr.min(), end=dr.max())

    out_df["holiday"] = dr.isin(holidays)
    out_df["holiday"] = out_df.holiday.astype(int)
    # out_df = out_df.assign(**kwargs)

    return out_df
Пример #12
0
def get_prediction_dataframe(series):
    """Turn a datetime-indexed Series into a feature frame for prediction.

    The result has a fresh default integer index and the columns
    ``year_idx``, ``month_of_year``, ``day_of_week``, ``hour_of_day`` (all
    four cast to ``category``), ``holiday`` (True when the timestamp's
    calendar day is a US federal holiday) and ``target`` (the Series values).
    """
    stamps = series.index
    # Holidays are midnight stamps; intraday rows must be matched by day.
    calendar_days = stamps.normalize()
    holidays = calendar().holidays(start=calendar_days.min(),
                                   end=calendar_days.max())
    df = pd.DataFrame(
        zip(
            stamps.year,
            stamps.month,
            stamps.dayofweek,
            stamps.hour,
            calendar_days.isin(holidays),
            series.values,
        ),
        columns=[
            "year_idx",
            "month_of_year",
            "day_of_week",
            "hour_of_day",
            "holiday",
            "target",
        ],
    )
    convert_type = {x: "category" for x in df.columns.values[:4]}
    df = df.astype(convert_type)
    return df
def build_features(time, temp, conditions):
  '''
  Build a 60-row feature frame for the hour that starts at ``time``.

  Columns: ['TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday',
  'day_of_year', 'total_days', 'Holiday'].

  time       -- pandas Timestamp (``dayofyear`` and ``normalize`` are used).
  temp       -- temperature; stored multiplied by 10 in ``TMAX``.
  conditions -- free-text weather description, matched case-sensitively.

  ``hour_minute`` holds the next 60 minute-of-day values; every other
  column carries one value repeated on all rows. ``weekday`` is 1 for
  Monday-Friday and 0 otherwise. ``total_days`` counts whole days since
  July 1, 2013. ``Holiday`` is True on a US federal holiday.
  '''
  first_minute = time.hour*60 + time.minute + 1

  feature_df = pd.DataFrame(columns=['TMAX', 'some_ppt', 'ppt',
                                     'hour_minute', 'weekday', 'day_of_year',
                                     'total_days', 'Holiday'])
  feature_df['hour_minute'] = range(first_minute, first_minute + 60)
  feature_df['TMAX'] = temp*10
  # ``weekday`` is a method on Timestamp / datetime and has to be called.
  feature_df['weekday'] = int(time.weekday() < 5)
  feature_df['day_of_year'] = time.dayofyear
  feature_df['total_days'] = (time-pd.to_datetime('July 1, 2013')).days

  # ``search`` rather than ``match``: the keywords may appear anywhere in
  # the description (e.g. "light rain"), not only at its very start.
  if re.search(r'thunderstorm|blizzard', conditions):
    some_ppt, ppt = 0, 1
  elif re.search(r'drizzle|showers', conditions):
    some_ppt, ppt = 1, 0
  elif re.search(r'rain|snow|hail', conditions):
    if re.search(r'light|chance', conditions):
      some_ppt, ppt = 1, 0
    else:
      some_ppt, ppt = 0, 1
  else:
    some_ppt, ppt = 0, 0
  feature_df['some_ppt'] = some_ppt
  feature_df['ppt'] = ppt

  # Holidays are midnight stamps; look the calendar day up, not the instant.
  day = time.normalize()
  holidays = calendar().holidays(start=day, end=day)
  feature_df['Holiday'] = day in holidays
  return feature_df
Пример #14
0
 def temporal_features(self, df):
     """Add year/month/weekday/hour and holiday features to ``df`` in place.

     For both ``start_date`` and ``end_date`` the columns ``*_year``,
     ``*_month``, ``*_weekday`` and ``*_hour`` are added. For the start date
     the 0/1 columns ``is_start_holiday`` (US federal holiday),
     ``is_start_weekend`` (Saturday or Sunday) and ``is_start_working_day``
     (neither) are added, plus ``start_year_part`` computed by
     ``self.year_part(start_year, start_month)`` for each row.

     Returns the same frame.
     """
     for prefix in ('start', 'end'):
         stamps = df[prefix + '_date']
         df[prefix + '_year'] = stamps.dt.year
         df[prefix + '_month'] = stamps.dt.month
         df[prefix + '_weekday'] = stamps.dt.weekday
         df[prefix + '_hour'] = stamps.dt.hour

     start = df['start_date']
     # Holidays are midnight stamps; timestamps carrying an hour must be
     # compared by calendar day.
     start_days = start.dt.normalize()
     holidays = calendar().holidays(start=start_days.min(),
                                    end=start_days.max())
     df['is_start_holiday'] = np.where(start_days.isin(holidays), 1, 0)

     on_weekend = (start.dt.weekday == 5) | (start.dt.weekday == 6)
     df['is_start_working_day'] = np.where(
         ~on_weekend & (df['is_start_holiday'] == 0), 1, 0)
     df['is_start_weekend'] = np.where(on_weekend, 1, 0)
     df['start_year_part'] = df.apply(
         lambda x: self.year_part(x['start_year'], x['start_month']),
         axis=1)
     return df
Пример #15
0
def holiday_indicator(df):
    """Append a 0/1 ``holiday`` column (US federal holidays) to ``df`` in place.

    The datetime column ``ts`` is compared by calendar day. Nothing is
    returned.
    """
    # Normalizing keeps a holiday on the first day inside the lookup window
    # and avoids the unit-less ``astype('datetime64')`` cast, which current
    # pandas rejects.
    days = df.ts.dt.normalize()
    holidays = calendar().holidays(start=days.min(), end=days.max())
    df['holiday'] = days.isin(holidays).astype(int)
Пример #16
0
def cross_holidays(joint_df):
    """Parse ``FlightDate`` and add a boolean ``is_holiday`` column.

    ``FlightDate`` is converted to datetimes in place; ``is_holiday`` is True
    where the flight's calendar day is a US federal holiday. Returns the same
    frame.
    """
    # ``infer_datetime_format`` is deprecated in current pandas and is no
    # longer needed.
    joint_df['FlightDate'] = pd.to_datetime(joint_df['FlightDate'])
    # Match by calendar day so values carrying a time of day still count.
    flight_days = joint_df['FlightDate'].dt.normalize()
    holidays = calendar().holidays(start=flight_days.min(),
                                   end=flight_days.max())
    joint_df["is_holiday"] = flight_days.isin(holidays)
    return joint_df
 def process_date(X):
     """Add ``weekday`` / ``IsHoliday`` to ``X`` and return a feature matrix.

     Dates are assembled from the ``Year``, ``Month`` and ``Day`` columns.
     ``X`` gains ``weekday`` (Monday=0) and ``IsHoliday`` (1 on a US federal
     holiday, otherwise 0). The returned array has one row per record with
     the columns Year, Month, Day, Hour, weekday, IsHoliday.
     """
     stamps = pd.to_datetime(X[['Year', 'Month', 'Day']])
     X['weekday'] = stamps.dt.weekday

     federal = calendar().holidays(start=stamps.min(), end=stamps.max())
     X['IsHoliday'] = stamps.isin(federal).astype(int)

     feature_order = ['Year', 'Month', 'Day', 'Hour', 'weekday', 'IsHoliday']
     return np.c_[tuple(X[name] for name in feature_order)]
Пример #18
0
def prepare_data(data_path):
    """Load the CSV at ``data_path`` and return a model-ready feature frame.

    Rows with missing values are dropped. The result keeps ``date`` (parsed),
    ``count`` (transformed with ``log1p``) and ``holiday`` (1 on a US federal
    holiday), followed by one dummy column per month present (``month<N>``)
    and one per day of week present (``day<N>``). The index is reset.
    """
    frame = pd.read_csv(data_path).dropna()

    frame['date'] = pd.to_datetime(frame.date)
    # Row position in the original file; dropped again before returning.
    frame['age'] = frame.index.astype('int')
    frame['day'] = frame.date.dt.dayofweek
    frame['month'] = frame.date.dt.month

    federal = calendar().holidays(start=frame.date.min(),
                                  end=frame.date.max())
    frame['holiday'] = frame['date'].isin(federal).astype(int)

    frame = frame[['date', 'count', 'age', 'month', 'day', 'holiday']]

    # One-hot encode month first, then day of week, naming columns "<field><value>".
    for field in ('month', 'day'):
        dummies = pd.get_dummies(frame[field])
        dummies.columns = [field + str(value) for value in dummies.columns]
        frame = pd.concat([frame, dummies], axis=1)

    frame = frame.reset_index(drop=True)
    frame['count'] = np.log1p(frame['count'])

    # The raw month / day / age helpers are no longer needed.
    return frame.drop(['month', 'day', 'age'], axis=1).dropna()
Пример #19
0
    def is_holiday(self):
        """Return 1 if ``self.usage_date`` falls on a US federal holiday, else 0.

        Only the calendar day is considered, so a value carrying a time of
        day is handled too. A missing date (NaT) yields 0.
        """
        day = np.datetime64(self.usage_date).astype('datetime64[D]')
        if np.isnat(day):
            return 0
        # Ask the calendar about this one day instead of materialising a
        # fixed multi-year window that silently excludes later dates.
        holidays = calendar().holidays(start=day, end=day)
        return int(len(holidays) > 0)
Пример #20
0
def is_holiday(date):
    """Return True when ``date`` falls on a US federal holiday.

    ``date`` may be a ``datetime.date`` or a ``datetime.datetime``; only its
    year, month and day are used.
    """
    # Rebuild a midnight datetime so that plain dates and datetimes carrying
    # a time of day are both matched against the calendar's midnight stamps.
    day = datetime.datetime(date.year, date.month, date.day)
    hols = calendar().holidays(start=day, end=day)
    return len(hols) > 0
Пример #21
0
def add_holidays(train_df, test_df):
    """Add a boolean ``Holiday`` column to both frames in place.

    ``Holiday`` is True where the calendar day of ``pickup_datetime`` is a US
    federal holiday. The holiday list covers the combined date range of both
    frames. Returns ``(train_df, test_df)``.
    """
    # Compare on calendar days; the calendar only yields midnight stamps.
    train_days = train_df['pickup_datetime'].dt.normalize()
    test_days = test_df['pickup_datetime'].dt.normalize()

    # The concatenation is already a Series, so take min / max on it directly.
    all_days = pd.concat([train_days, test_days])
    holidays = calendar().holidays(start=all_days.min(), end=all_days.max())

    train_df['Holiday'] = train_days.isin(holidays)
    test_df['Holiday'] = test_days.isin(holidays)

    return train_df, test_df
Пример #22
0
def clean_data(df_data, features=None):
    '''
    Clean weather data and create features.

    The input is copied, its ``time`` column (epoch seconds) becomes the
    index, and the data is averaged per calendar day. Added columns:
    ``dayofweek``, ``dayofyear``, ``weekofyear`` (ISO week), ``holiday``
    (1 on a US federal holiday), 7-day rolling means of the four temperature
    columns (suffix ``7``) and shifted copies of three columns (suffixes
    ``-3`` and ``-7``, i.e. values taken 3 / 7 days ahead). Missing values
    are filled with 0.

    INPUT: dataframe, list
        ``features`` - columns to keep; when None everything is kept except
        ``precipintensity`` and ``precipintensitymax``.
    OUTPUT: dataframe
    '''
    df = df_data.copy()
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df.set_index('time', inplace=True)
    # ``resample(..., how=...)`` is gone; aggregate on the resampler instead.
    df = df.resample('1D').mean()

    # feature creation
    df['dayofweek'] = df.index.weekday
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar().week.to_numpy(dtype='int64')

    # mark holidays
    holidays = calendar().holidays(start=df.index.min(), end=df.index.max())
    df['holiday'] = df.index.isin(holidays).astype(int)

    # rolling means
    rolling_cols = [
        'apparenttemperaturemax', 'apparenttemperaturemin', 'temperaturemax',
        'temperaturemin'
    ]
    for col in rolling_cols:
        for day in ['7']:
            # ``pd.rolling_mean`` was replaced by the ``.rolling()`` accessor.
            df[col + day] = df[col].rolling(int(day)).mean()

    # create lag features
    shift_cols = ['apparenttemperaturemax', 'apparenttemperaturemin', 'windspeed']
    for col in shift_cols:
        for day in ['-3', '-7']:
            df[col + day] = df[col].shift(int(day))

    # impute null values
    df.fillna(0, inplace=True)

    if features is None:
        # drop unneeded columns
        df.drop(['precipintensity', 'precipintensitymax'],
                axis=1,
                inplace=True)
    else:
        # use only specified features
        df = df[features]

    return df
Пример #23
0
 def dt_range(self):
     """List the recent business days, oldest first.

     Looks at the ``self.trail_days`` days ending today (today included) and
     returns those, as midnight ``datetime`` objects in ascending order, that
     are Monday-Friday and not a US federal holiday.
     """
     now = datetime.datetime.now()
     midnight = datetime.datetime(now.year, now.month, now.day)
     one_day = datetime.timedelta(days=1)

     window_start = midnight - one_day * self.trail_days
     federal = [stamp.to_pydatetime()
                for stamp in calendar().holidays(window_start, midnight)]

     candidates = [midnight - one_day * back
                   for back in range(self.trail_days)]
     candidates.sort()

     return [day for day in candidates
             if day.weekday() < 5 and day not in federal]
Пример #24
0
def holiday_checker(release_date):
    """Return 1 if a US federal holiday falls near ``release_date``, else 0.

    ``release_date`` is a string in ``%m/%d/%Y`` format. The window checked
    runs from 7 days before to 5 days after that date, both ends included.
    The holidays found are printed.
    """
    released = datetime.strptime(release_date, '%m/%d/%Y')
    startDate = released - timedelta(days=7)
    endDate = released + timedelta(days=5)

    holidays = calendar().holidays(start=startDate, end=endDate).to_pydatetime()
    print(holidays)
    # ``holidays`` is a numpy array: its truth value is ambiguous as soon as
    # it holds more than one element, so test its length instead.
    return 1 if len(holidays) > 0 else 0
Пример #25
0
    def is_holiday(cls):
        """
            Build a membership expression for US federal holidays.

            Returns the result of ``cls.usage_date.in_(...)`` called with the
            US federal holidays between 2015-01-01 and 2020-12-31.

            Reference - https://stackoverflow.com/questions/64276059
        """
        window_start = dt.date(2015, 1, 1)
        window_end = dt.date(2020, 12, 31)
        federal_holidays = calendar().holidays(start=window_start,
                                               end=window_end)
        return cls.usage_date.in_(federal_holidays)
Пример #26
0
def get_features_dataframe(
    series: pd.Series,
    time_features: List[TimeFeature],
    lag_indices: List[int],
    past_data: Optional[pd.Series] = None,
) -> pd.DataFrame:
    """Constructs a DataFrame of features for a given Series.

    Features include some date-time features (like hour-of-day, day-of-week, ...) and
    lagged values from the series itself. Lag indices are specified by `lag_indices`,
    while previous data can be specified by `past_data`: the latter allows to get lags
    also for the initial values of the series.

    Parameters
    ----------
    series
        Series on which features should be computed.
    time_features
        List of time features to be included in the data frame. Each one is
        called with `series.index`; its class name becomes the column name.
    lag_indices
        List of indices of lagged observations to be included as features.
    past_data
        Prior data, to be used to compute lagged observations. Must have the
        same index frequency as `series` and end before `series` starts.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the features, the `lag_<idx>` columns and a
        `target` column holding the series values. This has the same index
        as `series`.
    """
    assert past_data is None or series.index.freq == past_data.index.freq
    assert past_data is None or series.index[0] > past_data.index[-1]

    time_feature_columns = {
        feature.__class__.__name__: feature(series.index)
        for feature in time_features
    }

    if past_data is None:
        all_data = series
    else:
        # ``Series.append`` no longer exists in current pandas.
        all_data = pd.concat([past_data, series]).asfreq(series.index.freq)

    # Shift the combined history, then keep only the rows of ``series``.
    lag_columns = {
        f"lag_{idx}": all_data.shift(idx)[series.index].values
        for idx in lag_indices
    }

    columns = {**time_feature_columns, **lag_columns, "target": series.values}

    return pd.DataFrame(columns, index=series.index)
Пример #27
0
 class custom_calendar(AbstractHolidayCalendar):
     """US federal holiday calendar extended with a few extra dates.

     ``new_rules`` adds Halloween, Christmas Eve, New Years Eve and a
     'DST time change' rule anchored on the second Sunday of March;
     ``rules`` is the federal rule list followed by ``new_rules``.
     """
     # Second Sunday on or after the rule's base date (March 1st).
     _second_sunday = pd.DateOffset(weekday=SU(2))

     new_rules = [
         Holiday('Halloween', month=10, day=31),
         Holiday('Christmas Eve', month=12, day=24),
         Holiday('New Years Eve', month=12, day=31),
         Holiday('DST time change', month=3, day=1, offset=_second_sunday),
     ]
     rules = [*calendar().rules, *new_rules]
def get_holidays(x):
    """Return named US federal holidays for 2015-2016 and the MLK subset.

    Returns ``(holidays, mlk_days)``: ``holidays`` is a Series of holiday
    names indexed by date for 2015-01-01 .. 2016-12-31 with the entries named
    'Presidents Day', 'Columbus Day' and 'Veterans Day' removed; ``mlk_days``
    holds the entries named 'Dr. Martin Luther King Jr.'.

    NOTE(review): ``x`` is not used. The date window is fixed; a holiday list
    computed from ``x.min()`` / ``x.max()`` used to be built here but was
    immediately overwritten, so it has been dropped. Confirm whether the
    window was meant to follow ``x``.
    """
    cal = calendar()
    holidays = cal.holidays(start='2015-01-01',
                            end='2016-12-31',
                            return_name=True)
    holidays = holidays[~holidays.isin(['Presidents Day',
                                        'Columbus Day',
                                        'Veterans Day'])]
    mlk_days = holidays[holidays == 'Dr. Martin Luther King Jr.']
    return holidays, mlk_days
Пример #29
0
def clean_data(df_data, features=None):
    '''
    Clean weather data and create features.

    The input is copied, its ``time`` column (epoch seconds) becomes the
    index, and the data is averaged per calendar day. Added columns:
    ``dayofweek``, ``dayofyear``, ``weekofyear`` (ISO week), ``holiday``
    (1 on a US federal holiday), 7-day rolling means of the four temperature
    columns (suffix ``7``) and shifted copies of three columns (suffixes
    ``-3`` and ``-7``, i.e. values taken 3 / 7 days ahead). Missing values
    are filled with 0.

    INPUT: dataframe, list
        ``features`` - columns to keep; when None everything is kept except
        ``precipintensity`` and ``precipintensitymax``.
    OUTPUT: dataframe
    '''
    df = df_data.copy()
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df.set_index('time', inplace=True)
    # ``resample(..., how=...)`` is gone; aggregate on the resampler instead.
    df = df.resample('1D').mean()

    # feature creation
    df['dayofweek'] = df.index.weekday
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar().week.to_numpy(dtype='int64')

    # mark holidays
    holidays = calendar().holidays(start=df.index.min(), end=df.index.max())
    df['holiday'] = df.index.isin(holidays).astype(int)

    # rolling means
    rolling_cols = ['apparenttemperaturemax','apparenttemperaturemin',
                    'temperaturemax', 'temperaturemin']
    for col in rolling_cols:
        for day in ['7']:
            # ``pd.rolling_mean`` was replaced by the ``.rolling()`` accessor.
            df[col+day] = df[col].rolling(int(day)).mean()

    # create lag features
    shift_cols = ['apparenttemperaturemax','apparenttemperaturemin', 'windspeed']
    for col in shift_cols:
        for day in ['-3', '-7']:
            df[col+day] = df[col].shift(int(day))

    # impute null values
    df.fillna(0, inplace=True)

    if features is None:
        # drop unneeded columns
        df.drop(['precipintensity', 'precipintensitymax'],
                axis = 1, inplace=True)
    else:
        # use only specified features
        df = df[features]

    return df
Пример #30
0
 def transform(self, X, y=None):
     """Return a US-federal-holiday indicator for every row of ``X``.

     ``X`` must be indexed by a ``DatetimeIndex``. The result is a
     single-column DataFrame (column ``date``) sharing that index, True
     where the row's calendar day is a US federal holiday. ``y`` is ignored.
     """
     # Match on naive calendar days: holiday stamps are naive midnights.
     days = X.index.tz_localize(None).normalize()
     if len(days):
         # Derive the lookup window from the data instead of a fixed range.
         holidays = calendar().holidays(start=days.min(), end=days.max())
     else:
         holidays = []
     return pd.DataFrame({'date': days.isin(holidays)}, index=X.index)
def download_pulse_range(start, end, pulsedir):
    """Download pulse data for every Monday-Friday between ``start`` and ``end``.

    ``pulsedir`` is created when missing. Days come from ``_daterange``; each
    weekday is passed to ``_download_pulse`` (holidays are announced on the
    console first but still downloaded), weekend days are only reported.
    """
    # Build the holiday list once; it does not change between iterations.
    # NOTE(review): membership is tested with whatever type ``_daterange``
    # yields - confirm it is a datetime/Timestamp rather than a plain date.
    holidays = calendar().holidays()

    if not os.path.exists(pulsedir):
        os.makedirs(pulsedir)

    for day in _daterange(start, end):
        if day.weekday() < 5:  # Monday...Friday == 0..4
            if day in holidays:
                # Stay on the same line so the download output follows it.
                print(u'(Holiday)...', end=' ')
            _download_pulse(day, pulsedir)
        else:
            print(u'%s (Weekend)' % str(day))
def _remove_WE_holidays_NaN(data):

    no_WE = ~((data.index.weekday == 5) | (data.index.weekday == 6)) # remove if WE

    cal = calendar()
    start = datetime.datetime.strftime(data.index.min(),"%Y-%m-%d")
    end =datetime.datetime.strftime(data.index.max(),"%Y-%m-%d")
    hol_cal = cal.holidays(start=start, end=end)
    no_hol = ~data.index.isin(hol_cal) # remove if it is a national holiday

    no_NaN = ~data.isna().all(axis=1) # remove if has any NaN for any hour

    return data[no_WE & no_hol & no_NaN]
Пример #33
0
def encode_dataset(train, test, meta, target_model='xgb'):
    """Encode the train and test frames together according to ``meta``.

    ``meta['target']`` names the target column, which is split off ``train``.
    The remaining train columns must match the test columns one by one. Both
    frames are stacked (train first) and each column listed in
    ``meta['cols']`` is processed by its kind:

    * 'CAT'  - missing values become 'missing', then label-encoded (cast to
               ``int`` unless ``target_model`` is 'xgb'),
    * 'NUM'  - missing values become -1,
    * 'DATE' - replaced by the day of week (Monday=0),
    * 'REM'  - dropped,
    * 'LEN'  - replaced by ``<name>_len`` computed with ``count_desc_len``.

    Any other kind raises ``Exception``. Each column's position, name and
    kind is printed. Returns ``(all_data, y_train)``.
    """
    target_name = meta['target']
    y_train = train[target_name]
    train = train.drop([target_name], axis=1)

    assert train.shape[1] == test.shape[1]
    for train_col, test_col in zip(train.columns, test.columns):
        assert train_col == test_col
    train_obs = len(train)

    all_data = pd.concat([train, test], axis=0)
    for position, (column, kind) in enumerate(meta['cols'].items()):
        print(position, column, kind)
        if kind == 'CAT':
            all_data[column] = all_data[column].fillna('missing')
            encoder = LabelEncoder()
            encoder.fit(all_data[column])
            codes = encoder.transform(all_data[column])
            if target_model != 'xgb':
                codes = codes.astype(int)
            all_data[column] = codes
        elif kind == 'NUM':
            all_data[column] = all_data[column].fillna(-1)
        elif kind == 'DATE':
            # Only the day of week is kept; no holiday flag is produced.
            all_data[column] = pd.to_datetime(all_data[column]).dt.weekday
        elif kind == 'REM':
            all_data = all_data.drop(column, axis=1)
        elif kind == 'LEN':
            all_data[column + '_len'] = all_data[column].apply(count_desc_len)
            all_data = all_data.drop(column, axis=1)
        else:
            raise Exception(str(kind) + ":unknown mapping")

    assert train_obs == len(y_train)
    return all_data, y_train
# Presumably, crime rates will be different on working days on the
# one hand and weekends and holidays on the other hand.
# So we'll introduce a column WorkingDay

# Data on holidays and which businesses actually observe them are sketchy
# at best, so we'll only count the most important ones as holidays:
#  - New Year
#  - Memorial Day
#  - Independence Day
#  - Labor Day
#  - Thanksgiving
#  - Black Friday
#  - Christmas

# Select the federal rules to keep by name instead of popping positions from
# ``rules``: that list is shared by every federal calendar instance (popping
# from it changes them all), and its positions differ between pandas
# releases. The keywords cover the names different releases use for the
# holidays left out of the list above.
_excluded_keywords = ('Martin Luther King', 'President', 'Washington',
                      'Columbus', 'Veterans', 'Juneteenth')
_kept_rules = [rule for rule in calendar().rules
               if not any(word in rule.name for word in _excluded_keywords)]

# create new rule for Black Friday: the day after Thanksgiving (the fourth
# Thursday of November), i.e. the first Friday on or after November 23rd.
# The fourth Friday of the month is one week too early whenever November
# starts on a Friday.
USBlackFriday = Holiday('BlackFriday', month=11, day=23,
                            offset=DateOffset(weekday=FR(1)))

# create own holiday calendar based on the above rules
ownCal = HolidayCalendarFactory('OwnCalendar', _kept_rules, USBlackFriday)
cal = ownCal()
# pass return_name=True to also get the name of each holiday
holidays = cal.holidays(start='2003-01-01', end='2015-05-13')