def holiday_column(airline_df, start='2014-01-01', end='2018-12-31'):
    '''Takes airline df and returns df with column indicating if
  date of flight is a US Federal Holiday'''

    #Pull Holiday Dates and convert to Spark DF with timestamp column
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start, end).to_pydatetime()
    holidays_df = pd.DataFrame(pd.DataFrame(holidays)[0].astype('string'))
    schema = StructType([StructField('Holiday_Date', StringType())])
    holidays_sc = spark.createDataFrame(holidays_df, schema)
    holidays_sc = holidays_sc.select(
        to_timestamp(holidays_sc.Holiday_Date,
                     'yyyy-MM-dd').alias('holiday_date'))

    #Join holidays to airlines
    holiday_joined_df = airline_df.join(
        holidays_sc, (airline_df.fl_date == holidays_sc.holiday_date), 'left')

    #Change date column to binary
    holiday_joined_df = holiday_joined_df.withColumn(
        "Holiday", (f.col('holiday_date').isNotNull()).cast("integer"))

    #Drop redundant holiday_date column
    holiday_joined_df = holiday_joined_df.drop(holiday_joined_df.holiday_date)

    return holiday_joined_df
示例#2
0
    def __init__(
            self,
            path: str = None,
            data: pd.DataFrame = pd.DataFrame(),
            holdings: pd.DataFrame = pd.DataFrame(),
    ):
        """Initialize the portfolio with holdings and market data.

        Args:
            path: The name of an a directory containing holdings and market data. If this
                is None then the portfolio must be described by the data and holdings arguments.
            data: A DataFrame  containing market close data for a set of
                financial instruments over a period of time.
            holdings:   A DataFrame containing the number of shares held of the set of
                symbols in data over a time period corresponding to that of data.
        """
        if len(data) > 0 and len(holdings) > 0:
            logger.debug("Data and holdings arguments are set.")
            self.data = data
            self.holdings = holdings
        else:
            logger.debug("Data and holdings are not set.")
        if path is None:
            path = Path().home() / ".portfolio" / "data"
        self.path = Path(path)
        if not self.path.is_dir():
            logger.info("%s is not a directory, creating it...", self.path)
            self.path.mkdir()
        self.data_file = self.path / "prices.feather"
        self.holdings_file = self.path / "holdings.feather"
        try:
            # The feather format does not support date indices,
            # so set the Date colemn to be the index.
            self.holdings = pd.read_feather(
                self.holdings_file).set_index("date")
        except FileNotFoundError:
            logger.info("No stored holdings found.")
            self.holdings = holdings
        symbols = self.holdings.columns
        try:
            # The feather format does not support date indices,
            # so set the Date colemn to be the index.
            self.data = pd.read_feather(self.data_file).set_index("date")
        except FileNotFoundError:
            logger.info("Market data is not stored .")
        calendar = USFederalHolidayCalendar()
        today = pd.Timestamp.today().normalize()
        last_business_day = min(today - BDay(1),
                                self.data.index.max() + BDay(1))
        holidays = calendar.holidays(last_business_day, today)
        if (today not in self.data.index
                and last_business_day not in self.data.index
                and holidays.empty):
            self.data = self.get_market_data(
                symbols,
                start=last_business_day,
                end=today,
            )
            self.holdings = self.holdings.reindex(self.data.index,
                                                  method="ffill").dropna()
示例#3
0
def pandas_holidays():
    from pandas.tseries.holiday import USFederalHolidayCalendar
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2014-01-01',
                            end='2014-12-31').to_pydatetime()
    if datetime.datetime(2014, 1, 1) in holidays:
        print(True)
示例#4
0
文件: kernels.py 项目: afcarl/gplib
 def calc_holidays(start_date=datetime.datetime(2012, 10, 1),
                   end_date=datetime.datetime.now()):
     cal = USFederalHolidayCalendar()
     holidays = np.ones(cal.holidays().shape[0])
     holidays = pd.Series(holidays, index=cal.holidays()).resample("D").fillna(0.0)
     h = holidays[start_date:end_date].values
     h = np.where(h)[0] - 1
     return h
示例#5
0
 def near_holiday(self):
     calendar = USFederalHolidayCalendar()
     holidays = calendar.holidays()
     df1 = self.df_raw[['id', 'date_account_created']]
     df1.date_account_created = pd.to_datetime(df1.date_account_created)
     df1.date_account_created = pd.to_datetime(df1.date_account_created.dt.date)
     df1['near_holiday'] = (df1.date_account_created.isin(holidays + timedelta(days=1)) | df1.date_account_created.isin(holidays - timedelta(days=1)))
     return df1[['id', 'near_holiday']]
示例#6
0
def get_holiday_list(start_yr, start_mon, start_day, end_yr, end_mon, end_day):
    calendar = USFederalHolidayCalendar()
    holiday_list = calendar.holidays(datetime.datetime(start_yr, 11, 1),
                                     datetime.datetime(end_yr, 5, 1))
    holidays = []
    for holiday in holiday_list:
        holidays.append(holiday)
    return holidays
示例#7
0
def isHoliday(d):
    cal = USFederalHolidayCalendar()
    y_str = d.strftime("%Y")
    holidays = cal.holidays(start=y_str+'-01-01', end=y_str+'-12-31').to_pydatetime()
    if d in holidays:
        return 1
    else:
        return 0
示例#8
0
def holiday_lists():
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2021-01-01',
                            end='2021-12-31').to_pydatetime()
    hs = []
    for h in holidays:
        hs.append(h.strftime("%Y/%m/%d"))
    return hs
示例#9
0
def next_business_day(date):
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays()
    date = date + BDay(1)
    date = date.to_pydatetime()
    while date in holidays:
        date = date + BDay(1)
        date = date.to_pydatetime()
    return date
示例#10
0
 def __init__(self,epsPath,pricePath,daysNeed = 3):
     self.raw_eps = pd.read_csv(epsPath)
     self.price = pd.read_csv(pricePath)
     self.daysNeed = daysNeed
     self.__getEps()
     minDate = str(min(self.epsBefore['DATE'],self.epsAfter['DATE']))
     maxDate = str(max(self.epsBefore['DATE'],self.epsAfter['DATE']))
     cal = USFederalHolidayCalendar()
     self.holidays = cal.holidays(start=minDate, end=maxDate).to_pydatetime()
def judgeOpenDaysInRange(from_date, to_date):
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(from_date, to_date)
    duedays = pd.bdate_range(from_date, to_date)
    df = pd.DataFrame()
    df['date'] = duedays
    df['holiday'] = duedays.isin(holidays)
    opendays = df[df['holiday'] == False]
    return opendays
示例#12
0
def create_ts_calendar(df, ts_settings, additional_events=None):
    """
    df: pandas df
    ts_settings: dict
        Parameters for time series project
    additional_events: pandas df(optional)
        df of additional events to add to calendar

    Returns:
    --------
    Calendar of events

    """
    date_col = ts_settings['date_col']

    cal = USFederalHolidayCalendar(AbstractHolidayCalendar)

    black_friday = Holiday(
        "Black Friday",
        month=11,
        day=1,
        offset=[pd.DateOffset(weekday=TH(4)),
                pd.DateOffset(1)])
    cyber_monday = Holiday(
        "Cyber Monday",
        month=11,
        day=1,
        offset=[pd.DateOffset(weekday=TH(4)),
                pd.DateOffset(4)])
    cal.rules.append(black_friday)
    cal.rules.append(cyber_monday)

    cal.rules[9] = Holiday('Christmas', month=12, day=25)
    cal.rules[4] = Holiday('July 4th', month=7, day=4)

    events = pd.DataFrame(
        cal.holidays(
            start=pd.to_datetime(df[date_col].min()),
            end=pd.to_datetime(df[date_col].max()) + dt.timedelta(days=365),
            return_name=True,
        ))
    events = events.reset_index()
    events.columns = ['Date', 'Event']

    if additional_events is not None:
        assert additional_events.shape[
            1] == 2, 'additional events must be a df with 2 columns'
        additional_events.columns = ['Date', 'Event']
        additional_events['Date'] = pd.to_datetime(additional_events['Date'])
        events = events.append(additional_events)

    events['Date'] = [
        dt.datetime.strftime(pd.to_datetime(date), '%Y-%m-%d')
        for date in events['Date']
    ]
    return events.drop_duplicates().sort_values(by='Date').reset_index(
        drop=True)
示例#13
0
    def get_path(self,
                 initial_values,
                 t_start,
                 t_end,
                 freq,
                 interest_rate,
                 seed=None):
        # check initial value os the right size and dimension
        #print("getting path " , freq.)
        # set up parameters of the pathway model
        covar = stats.corr_to_cov(self.correlation_matrix, self.sigmas)
        covar_param = param.SimpleArrayParam(covar)

        chol = linalg.cholesky(covar, lower=True)
        chol[chol < 1.0e-9] = 0.0
        cholesky_param = param.SimpleArrayParam(chol)

        r_param = param.SimpleParam(interest_rate)

        # create the index of times that are in market hours between
        # requested times
        timeindex = pd.date_range(start=t_start, end=t_end, freq=freq)
        # get the frequency (in seconds) now before removing non-market times
        freq_in_secs = pd.to_timedelta(timeindex.freq,
                                       unit='s').total_seconds()
        # only trading hours
        timeindex = timeindex[timeindex.indexer_between_time('09:30', '16:00')]
        # only weekdays
        timeindex = timeindex[~(timeindex.dayofweek > 4)]
        # remove fed holidays
        cal = USFederalHolidayCalendar()
        hols = cal.holidays(start=timeindex.min(), end=timeindex.max())
        timeindex = timeindex[~timeindex.isin(hols)]

        # get array of time in yearly units and get stock pathways
        times = np.arange(
            0, len(timeindex)) * freq_in_secs / MARKET_SECONDS_PER_YEAR

        # seed and create the pathway generator object
        np.random.seed(seed=seed)
        path_maker = path.GeometricDiffusionManyAsset(self.generator, r_param,
                                                      covar_param,
                                                      cholesky_param)

        if len(times) > 0:
            s_paths = path_maker.get_single_timed_path(initial_values, times)
        else:
            raise RunTimeError('Trying to generate stocks on empty time list')

        # put all data into a pandas Dataframe
        stocks_df = pd.DataFrame(index=timeindex,
                                 data=s_paths,
                                 columns=self.stock_names)
        print("internal : ",
              stocks_df.groupby(stocks_df.index.dayofweek).sum())
        np.random.seed(seed=None)
        return stocks_df
示例#14
0
 def __init__(self, epsPath, pricePath, daysNeed=3):
     self.raw_eps = pd.read_csv(epsPath)
     self.price = pd.read_csv(pricePath)
     self.daysNeed = daysNeed
     self.__getEps()
     minDate = str(min(self.epsBefore['DATE'], self.epsAfter['DATE']))
     maxDate = str(max(self.epsBefore['DATE'], self.epsAfter['DATE']))
     cal = USFederalHolidayCalendar()
     self.holidays = cal.holidays(start=minDate,
                                  end=maxDate).to_pydatetime()
示例#15
0
def create_date_feature_is_public_holiday(df, date, start, end, country = 'US'):
    """
    create date feature: is holiday
    only support USFederal Holiday
    """
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=start, end=end).to_pydatetime()
    foo = lambda x: 1 if x in holidays else 0
    df['is_holiday'] = df[date].map(foo)
    return df
示例#16
0
def find_holidays(base_year):
    ''' Using Pandas calendar, find all 10 US Federal Holidays,
        plus California's Cesar Chavez Day (March 31).
    '''
    yr = str(base_year)
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=yr + '-01-01',
                            end=yr + '-12-31').to_pydatetime()

    return [d.strftime('%m-%d') for d in holidays] + ['03-31']
示例#17
0
def getHolidays(dates):
    holidaysVectorReturn  = []
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2015-01-01', end='2017-12-31').to_pydatetime()
    for idx, date in enumerate(dates):
         if date in holidays:
             holidaysVectorReturn.append(1)
         else:
             holidaysVectorReturn.append(0)
    return  holidaysVectorReturn
示例#18
0
    def __init__(self, config_name):
        self.today = datetime.datetime.now().date()
        self._holidays = USFederalHolidayCalendar()
        self.config_name = config_name

        self._set_config()
        self.data = self._pull_data()
        self._clean_data()
        if len(self.data) != 0:
            self._amend_data()
        self._store_data()
示例#19
0
    def test_calendar(self):

        calendar = USFederalHolidayCalendar()
        holidays = calendar.holidays(self.start_date, self.end_date)

        holidays_1 = calendar.holidays(self.start_date.strftime("%Y-%m-%d"), self.end_date.strftime("%Y-%m-%d"))
        holidays_2 = calendar.holidays(Timestamp(self.start_date), Timestamp(self.end_date))

        self.assertEqual(list(holidays.to_pydatetime()), self.holiday_list)
        self.assertEqual(list(holidays_1.to_pydatetime()), self.holiday_list)
        self.assertEqual(list(holidays_2.to_pydatetime()), self.holiday_list)
示例#20
0
def build_features(data, features):
    #Timestamp First Active
    data['timestamp_first_active'] = data['timestamp_first_active'].astype(str)
    data['timestamp_first_active_date'] = data['timestamp_first_active'].str[:8]
    data['timestamp_first_active_date'] = pd.to_datetime(data['timestamp_first_active_date'], format='%Y%m%d')
    data['tfa_month'] = data['timestamp_first_active_date'].map(lambda x : x.month)
    data['tfa_year'] = data['timestamp_first_active_date'].map(lambda x : x.year)
    data['tfa_day'] = data['timestamp_first_active_date'].map(lambda x : x.day)
    data['tfa_dayofyear'] = data.timestamp_first_active_date.dt.dayofyear
    data['tfa_dayofweek'] = data.timestamp_first_active_date.dt.dayofweek
    data['tfa_week'] = data.timestamp_first_active_date.dt.week
    data['tfa_quarter'] = data.timestamp_first_active_date.dt.quarter
    features.extend(['tfa_day','tfa_month','tfa_year','tfa_dayofyear','tfa_dayofweek','tfa_week','tfa_quarter'])
    #TFA Holidays
    #calendar = USFederalHolidayCalendar()
    #tfa_holidays = calendar.holidays(start=data['timestamp_first_active_date'].min(),end=data['timestamp_first_active_date'].max())
    #for i in range(len(tfa_holidays)):
        #data['tfa_holiday_diff_'+str(i)] = data['timestamp_first_active_date'].map(lambda x : (x-tfa_holidays[i]).days)
        #data['tfa_holiday_diff_'+str(i)] = data['tfa_holiday_diff_'+str(i)].map(holiday_transform)
        #data_dummy = pd.get_dummies(data['tfa_holiday_diff_'+str(i)],prefix='tfa_holiday_diff_'+str(i))
        #features.extend(data_dummy.columns.values)
        #data.drop(['tfa_holiday_diff_'+str(i)],axis=1,inplace=True)
        #data = pd.concat((data,data_dummy),axis=1)
        #features.extend('tfa_holiday_diff_'+str(i))
    #Date Account Created
    data['date_account_created'] = pd.to_datetime(data['date_account_created'])
    data['dac_month'] = data['date_account_created'].map(lambda x : x.month)
    data['dac_year'] = data['date_account_created'].map(lambda x : x.year)
    data['dac_day'] = data['date_account_created'].map(lambda x : x.day)
    data['dac_dayofyear'] = data.date_account_created.dt.dayofyear
    data['dac_dayofweek'] = data.date_account_created.dt.dayofweek
    data['dac_week'] = data.date_account_created.dt.week
    data['dac_quarter'] = data.date_account_created.dt.quarter
    features.extend(['dac_year','dac_month','dac_day','dac_dayofyear','dac_dayofweek','dac_week','dac_quarter'])
    #DAC Holidays
    calendar = USFederalHolidayCalendar()
    dac_holidays = calendar.holidays(start=data['date_account_created'].min(),end=data['date_account_created'].max())
    for i in range(len(dac_holidays)):
        data['dac_holiday_diff_'+str(i)] = data['date_account_created'].map(lambda x : (x-dac_holidays[i]).days)
        data['dac_holiday_diff_'+str(i)] = data['dac_holiday_diff_'+str(i)].map(holiday_transform)
        features.extend(['dac_holiday_diff_'+str(i)])
    #Days Difference Between TFA and DAC
    data['days_diff'] = (data['date_account_created'] - data['timestamp_first_active_date']).dt.days
    #data['days_diff'] = data['days_diff'].map(holiday_transform)
    features.extend(['days_diff'])
    data.drop(['date_account_created','timestamp_first_active','timestamp_first_active_date'],axis=1,inplace=True)
    other_features = ['gender', 'signup_method','signup_flow','language','affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
    for f in other_features:
        data_dummy = pd.get_dummies(data[f],prefix=f)
        features.extend(data_dummy.columns.values)
        data.drop([f],axis=1,inplace=True)
        data = pd.concat((data,data_dummy),axis=1)
    return data
示例#21
0
    def __init__(self, data_df, date):
        # df = pd.read_csv(path, date_parser='Date')
        self.date = date
        df = data_df.copy()
        # df = pd.read_csv(path, date_parser='Date')
        # df = data_df.copy()
        test = df[[
            'Date', 'Hour', 'Weekday', 'Month', 'Load', 'Mean_Temp',
            'Mean_Humi', 'RIV_Temp', 'RIV_Humi', 'LAX_Temp', 'LAX_Humi',
            'USC_Temp', 'USC_Humi', 'WJF_Temp', 'WJF_Humi', 'TRM_Temp',
            'TRM_Humi'
        ]]

        test.loc[:, 'RIV_Temp_Log'] = np.log(df['RIV_Temp'])

        test.loc[:, 'Load_Log'] = np.log(df['Load'])
        test['Load_Lag_48'] = test['Load_Log'].shift(48, axis=0)
        # test['Temp_Lag_48'] = test['Mean_Temp'].shift(48, axis=0)
        test['Humi_Lag_48'] = test['Mean_Humi'].shift(48, axis=0)
        test['RIV_Temp_Log_Lag_48'] = test['RIV_Temp_Log'].shift(48, axis=0)

        # test['RIV_Temp_Lag_48']= test['RIV_Temp'].shift(48, axis=0)

        cal = USFederalHolidayCalendar()

        holidays = cal.holidays(start='2014-01-01',
                                end=str(datetime.datetime.now()),
                                return_name=True)

        holidays = pd.DataFrame(holidays)

        holidays = holidays.reset_index()
        holidays = holidays.rename(columns={'index': "Date", 0: 'Holiday'})
        holidays['Date'] = pd.to_datetime(holidays['Date'])

        test['Date'] = pd.to_datetime(test['Date'])
        lm_data = test.loc[49:len(test), ].merge(holidays,
                                                 how='left',
                                                 on='Date')
        lm_data['Holiday'] = lm_data['Holiday'].fillna("Not Holiday")

        lm_data[["Hour", "Weekday", "Month",
                 "Holiday"]] = lm_data[["Hour", "Weekday", "Month",
                                        "Holiday"]].astype('category')

        DateTime = pd.DataFrame(
            lm_data.apply(lambda line: pd.to_datetime(line['Date']) + datetime.
                          timedelta(hours=line['Hour']),
                          axis=1))
        DateTime.columns = ['DateTime']

        self.lm_data = pd.concat([DateTime, lm_data], axis=1)
        self.lm_data.set_index('DateTime', inplace=True)
示例#22
0
def DR_Temp_data_cleaning(dataframe):
    '''
        inplace change of the dataframe, for the structure purpose, return this dataframe
    '''
    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
    test = dataframe[[
        'Date', 'Hour', 'Weekday', 'Month', 'Load', 'Mean_Temp', 'Mean_Humi',
        'RIV_Temp', 'RIV_Humi', 'LAX_Temp', 'LAX_Humi', 'USC_Temp', 'USC_Humi',
        'WJF_Temp', 'WJF_Humi', 'TRM_Temp', 'TRM_Humi'
    ]]

    test.loc[:, 'RIV_Temp_Log'] = np.log(dataframe['RIV_Temp'])
    test.loc[:, 'LAX_Temp_Log'] = np.log(dataframe['LAX_Temp'])
    test.loc[:, 'USC_Temp_Log'] = np.log(dataframe['USC_Temp'])
    test.loc[:, 'WJF_Temp_Log'] = np.log(dataframe['WJF_Temp'])
    test.loc[:, 'TRM_Temp_Log'] = np.log(dataframe['TRM_Temp'])
    test.loc[:, 'Mean_Temp_Log'] = np.log(dataframe['TRM_Temp'])

    test.loc[:, 'Load_Log'] = np.log(dataframe['Load'])
    test['Load_Lag_48'] = test['Load_Log'].shift(48, axis=0)
    test['Humi_Lag_48'] = test['Mean_Humi'].shift(48, axis=0)

    test['RIV_Temp_Log_Lag_48'] = test['RIV_Temp_Log'].shift(48, axis=0)
    test['LAX_Temp_Log_Lag_48'] = test['LAX_Temp_Log'].shift(48, axis=0)
    test['USC_Temp_Log_Lag_48'] = test['USC_Temp_Log'].shift(48, axis=0)
    test['WJF_Temp_Log_Lag_48'] = test['WJF_Temp_Log'].shift(48, axis=0)
    test['TRM_Temp_Log_Lag_48'] = test['TRM_Temp_Log'].shift(48, axis=0)
    test['Mean_Temp_Log_Lag_48'] = test['Mean_Temp_Log'].shift(48, axis=0)

    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2014-01-01',
                            end=str(datetime.datetime.now()),
                            return_name=True)
    holidays = pd.DataFrame(holidays)
    holidays = holidays.reset_index()
    holidays = holidays.rename(columns={'index': "Date", 0: 'Holiday'})
    holidays['Date'] = pd.to_datetime(holidays['Date'])
    test['Date'] = pd.to_datetime(test['Date'])
    lm_data = test.loc[49:len(test), ].merge(holidays, how='left', on='Date')
    lm_data['Holiday'] = lm_data['Holiday'].fillna("Not Holiday")
    lm_data[["Hour", "Weekday", "Month",
             "Holiday"]] = lm_data[["Hour", "Weekday", "Month",
                                    "Holiday"]].astype('category')
    DateTime = pd.DataFrame(
        lm_data.apply(lambda line: pd.to_datetime(line['Date']) + datetime.
                      timedelta(hours=line['Hour']),
                      axis=1))
    DateTime.columns = ['DateTime']
    lm_data = pd.concat([DateTime, lm_data], axis=1)
    lm_data.set_index('DateTime', inplace=True)

    return lm_data
示例#23
0
文件: model.py 项目: adelina-ie/MLOPS
def predict(dict):
    """
    This function pickles the
    model and generates the test
    features by passing the dict
    parameter.
    """

    folder_selected = folderSelect()
    filename = os.path.join(folder_selected, "model.joblib")
    if filename is None:
        train_and_persist()
    else:
        with open(filename, "rb") as f:
            model = load(f)
    testDf = pd.DataFrame(dict, index=[0])
    testDf["hr"] = testDf["date"].dt.hour
    testDf["yr"] = testDf["date"].dt.year - testDf["date"].dt.year.min()
    testDf["mnth"] = testDf["date"].dt.month
    testDf["season"] = testDf["date"].map(get_season)
    testDf["weekday"] = testDf["date"].dt.weekday
    testDf["dteday"] = testDf["date"].dt.day
    testDf["dteday"] = pd.to_datetime(testDf["dteday"])
    cal = USFederalHolidayCalendar()
    holidays = pd.to_datetime(cal.holidays(start="2011-01-01", end="2011-06-30"))
    testDf["holiday"] = pd.to_datetime(testDf["date"]).dt.date in holidays
    testDf["workingday"] = pd.to_datetime(testDf["date"]).dt.date not in holidays
    testDf["holiday"] = testDf["holiday"].map(lambda x: 1 if x else 0)
    testDf["workingday"] = testDf["workingday"].map(lambda x: 1 if x else 0)
    t_max , t_min = 50, -8
    # This is for tempretaure normalization
    testDf["temp"] = (testDf["temperature_C"] - t_min)/(t_max-t_min)
    # We divide humidity by 100 to scale it between 0 and 1
    testDf["hum"] = testDf["humidity"]/100
    testDf = testDf.drop(columns=["temperature_C", "humidity"])
    # Convert the data type to eithwe category or to float
    testDf = generating_new_features(testDf)
    testDf = pd.get_dummies(testDf)
    # Finally start with Machine Learning
    test = testDf.drop(columns=["date", "dteday", "feeling_temperature_C"])
    # savedir = Path.home()
    # filename = os.path.join(savedir, "model.joblib")
    # with open(filename, "rb") as f:
    #     model = load(f)
    # print("done!")
    # f.close()
    train_X, train_y, test_X, test_y = prepare_train_data()
    train_features = train_X.columns.values
    test = fill_missing_features(test, train_features)
    pred = model.predict(test)
    pred = pred.astype(int)
    print(" Rounded predictions:\n", pred)
示例#24
0
def getStartDate():
    # get 200 business days since yesterday but leave room in case this is run on weekend
    yesterday = date.today() - timedelta(days=1)
    start_date = yesterday - BDay(202)

    # find the number of holidays between yesterday and 200 days ago and subtract that to get real start date
    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(start_date, yesterday)

    start_date = start_date - BDay(len(holidays.tolist()))

    s = pd.Series(start_date)
    return s.map(pd.Timestamp.date)[0]
示例#25
0
    def test_calendar(self):

        calendar = USFederalHolidayCalendar()
        holidays = calendar.holidays(self.start_date, self.end_date)

        holidays_1 = calendar.holidays(self.start_date.strftime('%Y-%m-%d'),
                                       self.end_date.strftime('%Y-%m-%d'))
        holidays_2 = calendar.holidays(Timestamp(self.start_date),
                                       Timestamp(self.end_date))

        assert list(holidays.to_pydatetime()) == self.holiday_list
        assert list(holidays_1.to_pydatetime()) == self.holiday_list
        assert list(holidays_2.to_pydatetime()) == self.holiday_list
示例#26
0
def validate(date):

    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(date)  # check recent holidays

    adjusted = False

    if date in holidays:
        #create offset, where offset replace BDay as bussiness day offset to account for holiday
        offset = CustomBusinessDay(calendar=cal)
        date += offset
        adjusted = True

    return date, adjusted
示例#27
0
 def assign_workdays(self, data, start, end, speed=False):
     cal = USFederalHolidayCalendar()
     holidays = cal.holidays(start=start, end=end)
     if speed:
         num_process = mp.cpu_count()
         chunk_size = int(data.shape[0] / num_process)
         chunks = [data.loc[data.index[i: i + chunk_size]]
                   for i in range(0, data.shape[0], chunk_size)]
         pools = mp.Pool(num_process)
         res = pools.map(
             partial(self._workday_process, holidays=holidays), chunks)
         return pd.concat(res)
     else:
         return self._workday_process(data, holidays)
示例#28
0
def add_time_windows(df):
    """Adds time windows as target variable, based on delivery date and time.

    Calculates number of business days (excl weekends, shipment date, federal holidays) for delivery.
    Adds delivery time to number of business days to get time-in-transit.
    Creates window thresholds based on discussion with Jose.
    Lastly assign time windows to shipments based on time-in-transit.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain shipment_date, delivery_date, delivery_time.

    Returns:
        pandas dataframe obj: Pandas dataframe with new columns days_in_transit, days_taken_float (time-in-transit),
            Y (target variable i.e. time window)

    """
    print("Adding time windows i.e. target variable...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()
    # Calculate days in transit (exclude shipment date, holidays, weekends)
    start_date = df['shipment_date'].min()
    end_date = df['shipment_date'].max()
    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(start_date, end_date).date.tolist()
    shipment_dates = [d.date() for d in df['shipment_date']]
    delivery_dates = [d.date() for d in df['delivery_date']]
    # -1 because we will add transit time
    df['days_in_transit'] = np.busday_count(
        shipment_dates, delivery_dates, holidays=holidays) - 1
    # Convert days in transit/delivery time to days taken (with decimals))
    # e.g. if parcel reaches at 12.00pm on 2nd business day, days taken is 1.5
    delivery_percentage_of_day = [
        (timedelta.total_seconds(d) / timedelta(days=1).total_seconds())
        for d in df['delivery_time']
    ]
    df['days_taken_float'] = df['days_in_transit'] + delivery_percentage_of_day
    # Keep rows from -1 to 5 days in transit. The rest are rare occurrences.
    max_days_to_keep = 5
    df = df[df['days_in_transit'].isin(np.arange(-1, max_days_to_keep))]

    # Assign time windows
    time_window_thresholds = create_time_window_thresholds()
    tqdm.pandas(desc="Assign time window")
    df['Y'] = df.progress_apply(lambda x: assign_time_window(
        x['days_taken_float'], time_window_thresholds),
                                axis=1)

    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
def format_subway_data(path_to_csv_file: str):
    # read data, convert to datetime
    df = pd.read_csv(path_to_csv_file)
    df["servicedate"] = pd.to_datetime(df["servicedate"])

    # add holidays
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=df["servicedate"].min(),
                            end=df["servicedate"].max())

    # mark as holiday and weekday
    df["holiday"] = df["servicedate"].dt.date.astype("datetime64").isin(
        holidays.date)
    df["weekday"] = df["servicedate"].dt.dayofweek

    # define peak, mark weekdays, convert service date back
    conditions = [(df["holiday"] == False) & (df["weekday"] < 5)]
    choices = ["peak"]
    df["peak"] = np.select(conditions, choices, default="offpeak")
    df["week"] = df["servicedate"].dt.isocalendar().week
    df["year"] = df["servicedate"].dt.isocalendar().year
    df["servicedate"] = df["servicedate"].dt.date.astype(str)

    # select date of the week
    dates = df[df["weekday"] == 0]
    dates = dates[["servicedate", "week", "year"]].drop_duplicates()

    # limit data to just peak, merge back dates
    final = df[df["peak"] == "peak"]
    final = final.groupby(["year", "week", "route_or_line"
                           ])["sum"].mean().round().reset_index()

    final = final.merge(dates, on=["week", "year"], how="left")

    # get list of bus routes
    routelist = list(set(final["route_or_line"].tolist()))

    # create dict
    output = {}

    # write out each set of routes to dict
    for route in routelist:
        dftemp = final[final["route_or_line"] == route]
        dictdata = (dftemp[["servicedate", "sum"]].rename(columns={
            "servicedate": "date",
            "sum": "riders"
        }).to_dict(orient="records"))
        output[route] = dictdata

    return output
示例#30
0
def validate ( date ):
  
    cal = USFederalHolidayCalendar() 
    holidays = cal.holidays(date) # check recent holidays

    adjusted = False

    if date in holidays:
        #create offset, where offset replace BDay as bussiness day offset to account for holiday
        offset = CustomBusinessDay(calendar=cal)
        date+=offset
        adjusted = True

    return date, adjusted
示例#31
0
    def test_calendar(self):

        calendar = USFederalHolidayCalendar()
        holidays = calendar.holidays(self.start_date, self.end_date)

        holidays_1 = calendar.holidays(
            self.start_date.strftime('%Y-%m-%d'),
            self.end_date.strftime('%Y-%m-%d'))
        holidays_2 = calendar.holidays(
            Timestamp(self.start_date),
            Timestamp(self.end_date))

        assert list(holidays.to_pydatetime()) == self.holiday_list
        assert list(holidays_1.to_pydatetime()) == self.holiday_list
        assert list(holidays_2.to_pydatetime()) == self.holiday_list
示例#32
0
def generate_calendar(year, drop_index=False):
    '''
    Simple function to generate a calendar containing
    US holidays, weekdays and  holiday weeks.
    '''
    from pandas.tseries.offsets import YearEnd
    from pandas.tseries.holiday import USFederalHolidayCalendar

    start_date = pd.to_datetime('1/1/'+str(year))
    end_date = start_date + YearEnd()
    DAT = pd.date_range(str(start_date), str(end_date), freq='D')
    MO = [d.strftime('%B') for d in DAT]
    holidays = USFederalHolidayCalendar().holidays(start=start_date, end=end_date)

    cal_df = pd.DataFrame({'date':DAT, 'month':MO})
    cal_df['year'] = [format(d, '%Y') for d in DAT]
    cal_df['weekday'] = [format(d, '%A') for d in DAT]
    cal_df['is_weekday'] = cal_df.weekday.isin(['Monday','Tuesday','Wednesday','Thursday','Friday'])
    cal_df['is_weekday'] = cal_df['is_weekday'].astype(int)
    cal_df['is_holiday'] = cal_df['date'].isin(holidays)
    cal_df['is_holiday'] = cal_df['is_holiday'].astype(int)
    cal_df['is_holiday_week'] = cal_df.is_holiday.rolling(window=7,center=True,min_periods=1).sum()
    cal_df['is_holiday_week'] = cal_df['is_holiday_week'].astype(int)

    if not drop_index: cal_df.set_index('date', inplace=True)

    return cal_df
def addExogenousVariables(df):
    holidays = USFederalHolidayCalendar().holidays(start=df.index.min(),
                                                   end=df.index.max())
    df['holiday'] = df.index.isin(holidays).astype(int)
    df['week_number'] = df.index.week
    df['day_of_week'] = df.index.dayofweek
    return df
示例#34
0
def get_calendar():
    """Summary
    
    Returns:
        TYPE: Description
    """
    return CustomBusinessDay(calendar=USFederalHolidayCalendar())
def predict_future_price(df, lookback_days, forcast_days, model, sc):
    """
    Prediction of future stock price
    """

    # calculate forcast dates and indexs, used for plot
    B_DAY = CustomBusinessDay(calendar=USFederalHolidayCalendar()) # business day
    forcast_dates = [df['Date'].iloc[-1]+i*B_DAY for i in range(1,forcast_days+1)]
    forcast_indexs = [df['Date'].index[-1]+i for i in range(1,forcast_days+1)]

    # input X for forcast
    X_scaled = df[['Scaled_Close']][-lookback_days:].values
    X_scaled = X_scaled.reshape((1, lookback_days, 1))

    # prediction the scaled price with model
    forcast_price_scaled = model.predict(X_scaled)

    # transform back to the normal price
    forcast_prices = sc.inverse_transform(forcast_price_scaled)

    # create the forcast_dataframe
    forcast_dataframe = pd.DataFrame({
        'index' : forcast_indexs,
        'Date' : forcast_dates,
        'Adj Close' : forcast_prices[0],
        'Scaled_Close' : forcast_price_scaled[0]
    })

    forcast_dataframe = forcast_dataframe.set_index('index', drop=True)
    return forcast_dataframe
        def describe_date(amd_data):
            """
            Add columns for weekday, month, and holiday. Based on existing
            timestamp column in amd_data dataframe.
            """

            holidays = USFederalHolidayCalendar().holidays()

            if type(amd_data) == dd.DataFrame:

                amd_data = amd_data.assign(weekday=amd_data.timestamp.map(
                    lambda x: x.weekday() > 4, meta=('weekday', 'bool')))

                amd_data = amd_data.assign(month=amd_data.timestamp.map(
                    lambda x: x.month, meta=('month', 'int')))

                amd_data = amd_data.assign(holiday=amd_data.timestamp.map(
                    lambda x: x.date() in holidays, meta=('holiday', 'bool')))

            if type(amd_data) == pd.DataFrame:

                amd_data['weekday'] = amd_data.timestamp.map(
                    lambda x: x.weekday() > 4)

                amd_data['month'] = amd_data.timestamp.map(lambda x: x.month)

                amd_data['holiday'] = amd_data.timestamp.map(
                    lambda x: x in holidays)

            return amd_data
示例#37
0
def Converter_Monthly_Daily(Name_O, Indicator, start_date):
    usb = CustomBusinessDay(calendar=USFederalHolidayCalendar())
    rng = pd.date_range(start=start_date, end="1/1/2018", freq=usb)
    Name_Daily = pd.DataFrame({'DATE': rng})
    Name_Daily['DATE'] = pd.to_datetime(Name_Daily.DATE)
    Name_Daily['Year'] = Name_Daily.DATE.dt.year
    Name_Daily['Month'] = Name_Daily.DATE.dt.month
    Name_Daily['Day'] = Name_Daily.DATE.dt.day
    #Name_Daily = Name_Daily.drop(columns=['DATE'])

    year = Name_Daily['Year'].unique()
    year = np.append(year, [2018])
    arr = []
    index = 0
    for y in year:
        newY = Name_Daily.loc[Name_Daily.Year == y]
        for m in range(1, 13):
            numberofDAYS = len(newY.loc[(newY.Month == m)])
            for day in range(0, numberofDAYS):
                arr.append(Name_O[Indicator][index] +
                           (Name_O[Indicator][index + 1] -
                            Name_O[Indicator][index]) * day / numberofDAYS)
            index = index + 1

    #Name_Daily = Name_Daily.drop(columns=['Year','Month','Day'])
    Name_Daily[Indicator] = arr
    #out_filename = "Economy_Indicators_Data"+ "\\" + Indicator + '_Daily_Insoo.csv'
    #Name_Daily.to_csv(out_filename,sep=',', encoding='utf-8')
    return Name_Daily
示例#38
0
def test_calendar(transform):
    start_date = datetime(2012, 1, 1)
    end_date = datetime(2012, 12, 31)

    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(transform(start_date), transform(end_date))

    expected = [
        datetime(2012, 1, 2),
        datetime(2012, 1, 16),
        datetime(2012, 2, 20),
        datetime(2012, 5, 28),
        datetime(2012, 7, 4),
        datetime(2012, 9, 3),
        datetime(2012, 10, 8),
        datetime(2012, 11, 12),
        datetime(2012, 11, 22),
        datetime(2012, 12, 25)
    ]

    assert list(holidays.to_pydatetime()) == expected
示例#39
0
def get_first_bday_of_month(mnth=None, yr=None):
    '''
    Return the first business day of the current month if no variables provided
    Return the first business day of the month and year provided if variables provided

    Tests:
    In [188]: config.get_first_bday_of_month(12,2015)
    Out[188]: datetime.date(2015, 12, 1)

    In [189]: config.get_first_bday_of_month(11,2015)
    Out[189]: datetime.date(2015, 11, 2)

    In [190]: config.get_first_bday_of_month(10,2015)
    Out[190]: datetime.date(2015, 10, 1)

    In [191]: config.get_first_bday_of_month(1,2016)
    Out[191]: datetime.date(2016, 1, 4)

    In [192]: config.get_first_bday_of_month(8,2015)
    Out[192]: datetime.date(2015, 8, 3)
    :param mnth:
    :param yr:
    :return:
    '''
    from calendar import monthrange
    from pandas.tseries.holiday import USFederalHolidayCalendar
    from pandas.tseries.offsets import CustomBusinessDay
    if yr is None or mnth is None:
        yr = pd.datetime.now().year if pd.datetime.now().month != 1 else pd.datetime.now().year - 1
        mnth = pd.datetime.now().month - 1 if pd.datetime.now().month != 1 else 12
    else:
        yr = yr if mnth != 1 else yr - 1
        mnth = mnth - 1 if mnth != 1 else 12
    end_last = monthrange(yr, mnth)
    end_last = pd.Timestamp('%s/%s/%s' % (mnth, end_last[1], yr)).date()
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=end_last - pd.tseries.offsets.Day(60),
                            end=end_last + pd.tseries.offsets.Day(60)).to_pydatetime()
    bday_cus = CustomBusinessDay(holidays=holidays)
    return (end_last + bday_cus).date()
示例#40
0
    if beginning_range <= date_of_interest <= end_range:
        days_since_school_year_start = (date_of_interest - beginning_range).days
    #for 2015/2016 School year
    beginning_range = datetime.date(2015, 8, 31)
    end_range = datetime.date(2016, 8, 29)
    if beginning_range <= date_of_interest <= end_range:
        days_since_school_year_start = (date_of_interest - beginning_range).days
    datekey_list.append(weather_datekey)
    days_since_school_year_start_list.append(days_since_school_year_start)
days_since_school_year_start_db = {'days_since_school_year_start' : pd.Series(days_since_school_year_start_list, index=datekey_list)}
days_since_school_year_start_df = pd.DataFrame(days_since_school_year_start_db)



#Account for major US federal holidays
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2013-01-01', end='2016-12-31').to_pydatetime()
datekey_list = []
holidays_list = []
#for weather_datekey in weather_df.index.values:
for weather_datekey in weather_df.datekey:
    year = int(str(weather_datekey)[:4])
    month = int(str(weather_datekey)[4:6])
    day = int(str(weather_datekey)[6:])
    date_of_interest = datetime.date(year, month, day)
    if date_of_interest in holidays:
        holidays_list.append(1)
    else:
        holidays_list.append(0)
    datekey_list.append(weather_datekey)
holidays_db = {'holidays' : pd.Series(holidays_list, index=datekey_list)}
def make_regression_features_oct(df):
    '''Create features for regression given a dataframe
    '''

    from pandas.tseries.holiday import USFederalHolidayCalendar

    x_dayssincestart = df.dayssincestart

    x_dayofweek = df.dayofweek
    x_dayofweek = pd.get_dummies(x_dayofweek,prefix='dayofweek')

    x_dayofmonth = df.dayofmonth
    x_dayofmonth = pd.get_dummies(x_dayofmonth, prefix='dayofmonth')

    x_week1 = pd.Series([1 if day < 8 else 0 for day in df.dayofmonth],name='week1')
    x_week2 = pd.Series([1 if (day >= 8 and day < 16) else 0 for day in df.dayofmonth],name='week2')
    x_week3 = pd.Series([1 if (day >= 16 and day < 23) else 0 for day in df.dayofmonth],name='week3')
    x_week4 = pd.Series([1 if day >= 23 else 0 for day in df.dayofmonth],name='week4')

    x_isweekend = pd.Series([1 if (day == 5 or day == 6) else 0 for day in df.dayofweek],name='isweekend')

    x_istueswed = pd.Series([1 if (day == 1 or day == 2) else 0 for day in df.dayofweek],name='istueswed')

    x_isfrisat = pd.Series([1 if (day == 4 or day == 5) else 0 for day in df.dayofweek],name='isfrisat')

    #get holidays
    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(start=df.datetime.min(), end=df.datetime.max())
    x_isholiday = pd.Series([1 if day in holidays.tolist() else 0 for day in df.datetime],name='x_isholiday')

    #find 3-day weekends
    x_isholidaywknd = pd.Series(np.zeros(len(x_isholiday)),name='x_isholidaywknd')
    x_weekendandholiday = x_isweekend + x_isholiday
    for i in range(1, len(x_weekendandholiday)-2):
        if x_weekendandholiday[i] == 1:
            if x_weekendandholiday[i+1] == 1 and x_weekendandholiday[i+2] ==1:
                x_isholidaywknd[i] = 1
                x_isholidaywknd[i+1] = 1
                x_isholidaywknd[i+2] = 1

    x_week1tueswed = x_week1 & x_istueswed
    x_week2tueswed = x_week2 & x_istueswed
    x_week3tueswed = x_week3 & x_istueswed
    x_week4tueswed = x_week4 & x_istueswed

    x_week1frisat = x_week1 & x_isfrisat
    x_week2frisat = x_week2 & x_isfrisat
    x_week3frisat = x_week3 & x_isfrisat
    x_week4frisat = x_week4 & x_isfrisat

    x_week1wknd = x_week1 & x_isweekend
    x_week2wknd = x_week2 & x_isweekend
    x_week3wknd = x_week3 & x_isweekend
    x_week4wknd = x_week4 & x_isweekend

    X_vars = pd.concat([x_dayssincestart,x_dayofweek,x_dayofmonth,
                       x_week1,x_week2,x_week3,x_week4,
                       x_isweekend,x_istueswed,x_isfrisat,
                       x_isholiday,x_isholidaywknd,
                       x_week1tueswed,x_week2tueswed,x_week3tueswed,x_week4tueswed,
                       x_week1frisat,x_week2frisat,x_week3frisat,x_week4frisat,
                       x_week1wknd,x_week2wknd,x_week3wknd,x_week4wknd],
                       axis=1)

    return X_vars
#add a dummy variable for weekend day
merged_master['weekend_day'] = np.where((merged_master['day_of_week']==5)|(merged_master['day_of_week']==6),1,0)

#drop day_of_week field
merged_master.drop('day_of_week', axis=1, inplace=True)

#add a column for month and create month dummies
merged_master['month'] = merged_master.index.month
dummy_df = pd.get_dummies(merged_master['month'], prefix='month')
merged_master = pd.concat([merged_master,dummy_df], axis=1)
merged_master.drop('month', axis=1, inplace=True)

#add a column for holiday
merged_master['date'] = merged_master.index.date
merged_master['date'] = pd.to_datetime(merged_master['date'])
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=merged_master.date.min(), end=merged_master.date.max())
merged_master['holiday'] = merged_master['date'].isin(holidays).astype(int)
merged_master.drop('date', axis=1, inplace=True)

#add a column for days since first date in dataset
merged_master['days_since_first_date'] = (merged_master.index.date - (merged_master.index.date.min())).astype('timedelta64[D]').astype(int)

#Drop any rows with null values
merged_master.dropna(axis=0, inplace=True)

#rearrange column order
merged_master['station_id'] = merged_master['from_station_id']
merged_master.drop(['from_station_id'], axis=1, inplace=True)