def bus_day_of_month_seasonality(self, data_frame,
                                 month_list=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                                 cum=True, cal="FX", partition_by_month=True):

    tsc = TimeSeriesCalcs()
    tsf = TimeSeriesFilter()

    data_frame.index = pandas.to_datetime(data_frame.index)
    data_frame = tsf.filter_time_series_by_holidays(data_frame, cal)

    monthly_seasonality = tsc.average_by_month_day_by_bus_day(data_frame, cal)
    monthly_seasonality = monthly_seasonality.loc[month_list]

    if partition_by_month:
        monthly_seasonality = monthly_seasonality.unstack(level=0)

    if cum is True:
        monthly_seasonality.loc[0] = numpy.zeros(len(monthly_seasonality.columns))

        if partition_by_month:
            monthly_seasonality.index = monthly_seasonality.index + 1   # shifting index

        monthly_seasonality = monthly_seasonality.sort_index()          # sorting by index
        monthly_seasonality = tsc.create_mult_index(monthly_seasonality)

    return monthly_seasonality
def get_time_series_cached(self, time_series_request):
    """
    get_time_series_cached - Loads time series from cache (if it exists)

    Parameters
    ----------
    time_series_request : TimeSeriesRequest
        contains various properties describing the time series to be fetched,
        including ticker, start & finish date etc.

    Returns
    -------
    pandas.DataFrame
    """

    if (time_series_request.freq == "intraday"):
        ticker = time_series_request.tickers
    else:
        ticker = None

    fname = self.create_time_series_hash_key(time_series_request, ticker)

    if (fname in self._time_series_cache):
        data_frame = self._time_series_cache[fname]

        tsf = TimeSeriesFilter()

        return tsf.filter_time_series(time_series_request, data_frame)

    return None
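# A hedged usage sketch for the cache lookup above. The request construction mirrors
# the TimeSeriesRequest/LightTimeSeriesFactory usage later in this file, but the
# owning object of get_time_series_cached and the field/ticker values are assumptions
# for illustration, not the library's documented entry point.
time_series_request = TimeSeriesRequest(
    start_date='01 Jan 2014', finish_date='28 Feb 2014',    # sample date range
    freq='daily', data_source='bloomberg',
    tickers=['EURUSD'], fields=['close'],
    vendor_tickers=['EURUSD BGN Curncy'], vendor_fields=['PX_LAST'],
    cache_algo='internet_load_return')

ltsf = LightTimeSeriesFactory()
df = ltsf.harvest_time_series(time_series_request)      # first call hits the data provider

# an identical request hashes to the same key, so it can now be served from memory
# (cache_holder = whichever object owns get_time_series_cached - an assumption here)
df_cached = cache_holder.get_time_series_cached(time_series_request)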
def get_economic_event_ret_over_custom_event_day(self, data_frame_in, event_dates, name, event, start, end,
                                                 lagged=False, NYC_cutoff=10):

    time_series_filter = TimeSeriesFilter()
    event_dates = time_series_filter.filter_time_series_by_date(start, end, event_dates)

    data_frame = data_frame_in.copy(deep=True)  # because we change the dates!

    time_series_tz = TimeSeriesTimezone()
    calendar = Calendar()

    bday = CustomBusinessDay(weekmask='Mon Tue Wed Thu Fri')

    event_dates_nyc = time_series_tz.convert_index_from_UTC_to_new_york_time(event_dates)
    average_hour_nyc = numpy.average(event_dates_nyc.index.hour)

    event_dates = calendar.floor_date(event_dates)

    # realised is traditionally on the later day eg. 3rd Jan realised ON is 2nd-3rd Jan realised
    # so if the Fed meeting is later on 2nd Jan, then we need realised labelled on 3rd (so minus a day)
    # implied expires on the next day eg. 3rd Jan implied ON is 3rd-4th Jan implied

    # TODO smarter way of adjusting dates, as sometimes events can be before/after 10am NY cut
    if (lagged and average_hour_nyc >= NYC_cutoff):
        data_frame.index = data_frame.index - bday
    elif (not lagged and average_hour_nyc < NYC_cutoff):  # ie. implied
        data_frame.index = data_frame.index + bday

    # set as New York time and select only those ON vols at the 10am NY cut just before the event
    data_frame_events = data_frame.ix[event_dates.index]
    data_frame_events.columns = data_frame.columns.values + '-' + name + ' ' + event

    return data_frame_events
def harvest_time_series(self, time_series_request, kill_session=True):
    """
    harvest_time_series - Loads time series from the specified data provider

    Parameters
    ----------
    time_series_request : TimeSeriesRequest
        contains various properties describing the time series to be fetched,
        including ticker, start & finish date etc.

    Returns
    -------
    pandas.DataFrame
    """

    tickers = time_series_request.tickers
    loader = self.get_loader(time_series_request.data_source)

    # check if tickers have been specified (if not load all of them for a category)
    # also handle single tickers/list tickers
    create_tickers = False

    if tickers is None:
        create_tickers = True
    elif isinstance(tickers, str):
        if tickers == '': create_tickers = True
    elif isinstance(tickers, list):
        if tickers == []: create_tickers = True

    if create_tickers:
        time_series_request.tickers = self.config.get_tickers_list_for_category(
            time_series_request.category, time_series_request.source,
            time_series_request.freq, time_series_request.cut)

    # intraday or tick: only one ticker per cache file
    if (time_series_request.freq in ['intraday', 'tick']):
        data_frame_agg = self.download_intraday_tick(time_series_request, loader)

    # daily: multiple tickers per cache file - assume we make one API call to the vendor library
    else:
        data_frame_agg = self.download_daily(time_series_request, loader)

    if ('internet_load' in time_series_request.cache_algo):
        self.logger.debug("Internet loading..")

        # signal to the loader template to exit the session
        if loader is not None and kill_session == True: loader.kill_session()

    if (time_series_request.cache_algo == 'cache_algo'):
        self.logger.debug("Only caching data in memory, do not return any time series.")
        return

    tsf = TimeSeriesFilter()

    # only return the time series if specified in the algo
    if 'return' in time_series_request.cache_algo:
        # special case for events/events-dt, which is not indexed like other tables
        if hasattr(time_series_request, 'category'):
            if 'events' in time_series_request.category:
                return data_frame_agg

        try:
            return tsf.filter_time_series(time_series_request, data_frame_agg)
        except:
            import traceback

            self.logger.error(traceback.format_exc())

            return None
def bus_day_of_month_seasonality(self, data_frame,
                                 month_list=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                                 cum=True, cal="FX", partition_by_month=True,
                                 add_average=False, price_index=False):

    tsc = TimeSeriesCalcs()
    tsf = TimeSeriesFilter()

    if price_index:
        data_frame = data_frame.resample('B')           # resample into business days
        data_frame = tsc.calculate_returns(data_frame)

    data_frame.index = pandas.to_datetime(data_frame.index)
    data_frame = tsf.filter_time_series_by_holidays(data_frame, cal)

    monthly_seasonality = tsc.average_by_month_day_by_bus_day(data_frame, cal)
    monthly_seasonality = monthly_seasonality.loc[month_list]

    if partition_by_month:
        monthly_seasonality = monthly_seasonality.unstack(level=0)

        if add_average:
            monthly_seasonality['Avg'] = monthly_seasonality.mean(axis=1)

    if cum is True:
        if partition_by_month:
            monthly_seasonality.loc[0] = numpy.zeros(len(monthly_seasonality.columns))
        # monthly_seasonality.index = monthly_seasonality.index + 1   # shifting index
        monthly_seasonality = monthly_seasonality.sort_index()

        monthly_seasonality = tsc.create_mult_index(monthly_seasonality)

    return monthly_seasonality
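# Usage sketch for the seasonality calculation above, run on synthetic business-day
# prices. The owning class instance (seas_calcs here) and the random-walk data are
# illustrative assumptions.
import numpy, pandas

dates = pandas.bdate_range('2010-01-01', '2014-12-31')
prices = pandas.DataFrame(
    100.0 * numpy.exp(numpy.cumsum(numpy.random.normal(0.0, 0.01, len(dates)))),
    index=dates, columns=['EURUSD'])

seas = seas_calcs.bus_day_of_month_seasonality(
    prices, month_list=[1, 2, 3, 4, 5, 6],      # Jan-Jun only
    cum=True, cal="FX", partition_by_month=True,
    price_index=True)                           # input is a price index, so take returns first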
def compare_strategy_vs_benchmark(self, br, strategy_df, benchmark_df):
    """
    compare_strategy_vs_benchmark - Compares the trading strategy we are backtesting against a benchmark

    Parameters
    ----------
    br : BacktestRequest
        Parameters for the backtest, such as start and finish dates

    strategy_df : pandas.DataFrame
        Strategy time series

    benchmark_df : pandas.DataFrame
        Benchmark time series
    """

    include_benchmark = False
    calc_stats = False

    if hasattr(br, 'include_benchmark'): include_benchmark = br.include_benchmark
    if hasattr(br, 'calc_stats'): calc_stats = br.calc_stats

    if include_benchmark:
        tsd = TimeSeriesDesc()
        cash_backtest = CashBacktest()
        ts_filter = TimeSeriesFilter()
        ts_calcs = TimeSeriesCalcs()

        # align the strategy time series with that of the benchmark
        strategy_df, benchmark_df = strategy_df.align(benchmark_df, join='left', axis=0)

        # if necessary apply vol target to the benchmark (to make it comparable with the strategy)
        if hasattr(br, 'portfolio_vol_adjust'):
            if br.portfolio_vol_adjust is True:
                benchmark_df = cash_backtest.calculate_vol_adjusted_index_from_prices(benchmark_df, br=br)

        # only calculate return statistics if this has been specified (note that mixing
        # different frequencies of data might underrepresent vol)
        if calc_stats:
            benchmark_df = benchmark_df.fillna(method='ffill')
            tsd.calculate_ret_stats_from_prices(benchmark_df, br.ann_factor)
            benchmark_df.columns = tsd.summary()

        # realign strategy & benchmark
        strategy_benchmark_df = strategy_df.join(benchmark_df, how='inner')
        strategy_benchmark_df = strategy_benchmark_df.fillna(method='ffill')

        strategy_benchmark_df = ts_filter.filter_time_series_by_date(
            br.plot_start, br.finish_date, strategy_benchmark_df)
        strategy_benchmark_df = ts_calcs.create_mult_index_from_prices(strategy_benchmark_df)

        self._benchmark_pnl = benchmark_df
        self._benchmark_tsd = tsd

        return strategy_benchmark_df

    return strategy_df
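# A sketch of how this comparison is driven. The attribute names on br come straight
# from the method body above, but the no-argument BacktestRequest construction, the
# backtester instance and the input frames are assumptions for illustration.
br = BacktestRequest()
br.include_benchmark = True         # join the benchmark onto the strategy
br.calc_stats = True                # label benchmark columns with return stats
br.portfolio_vol_adjust = True      # vol target the benchmark like the strategy
br.ann_factor = 252
br.plot_start = '01 Jan 2010'
br.finish_date = '01 Jan 2015'

# strategy_df / benchmark_df: price indices on a DatetimeIndex (assumed)
strategy_benchmark_df = backtester.compare_strategy_vs_benchmark(br, strategy_df, benchmark_df)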
def __init__(self):
    # self.config = ConfigManager()
    self.logger = LoggerManager().getLogger(__name__)
    self.time_series_filter = TimeSeriesFilter()
    self.time_series_io = TimeSeriesIO()

    self._bbg_default_api = Constants().bbg_default_api
    self._intraday_code = -1

    return
def dump_indicators(self):
    tsf = TimeSeriesFilter()

    self.logger.info("About to write all indicators to CSV")
    self.indicator.to_csv(self._csv_indicator_dump, date_format='%d/%m/%Y')

    if (self._csv_econ_indicator_dump is not None):
        self.logger.info("About to write economy based indicators to CSV")
        self.indicator_econ.to_csv(self._csv_econ_indicator_dump, date_format='%d/%m/%Y')

    self.logger.info("About to write final indicators to CSV")

    # remove weekends and remove the start of the series
    if (self._csv_final_indicator_dump is not None):
        indicator_final_copy = tsf.filter_time_series_by_holidays(self.indicator_final, cal='WEEKDAY')
        indicator_final_copy = tsf.filter_time_series_by_date(
            start_date="01 Jan 2000", finish_date=None, data_frame=indicator_final_copy)

        indicator_final_copy.to_csv(self._csv_final_indicator_dump, date_format='%d/%m/%Y')
def get_intraday_moves_over_custom_event(self, data_frame_rets, ef_time_frame, vol=False,
                                         minute_start=5, mins=3 * 60, min_offset=0,
                                         create_index=False, resample=False, freq='minutes'):
    tsf = TimeSeriesFilter()

    ef_time_frame = tsf.filter_time_series_by_date(
        data_frame_rets.index[0], data_frame_rets.index[-1], ef_time_frame)
    ef_time = ef_time_frame.index

    if freq == 'minutes':
        ef_time_start = ef_time - timedelta(minutes=minute_start)
        ef_time_end = ef_time + timedelta(minutes=mins)
        ann_factor = 252 * 1440
    elif freq == 'days':
        ef_time = ef_time_frame.index.normalize()
        ef_time_start = ef_time - timedelta(days=minute_start)
        ef_time_end = ef_time + timedelta(days=mins)
        ann_factor = 252

    ords = range(-minute_start + min_offset, mins + min_offset)

    # all data needs to be equally spaced
    if resample:
        tsf = TimeSeriesFilter()

        # make sure the time series is properly sampled at 1 min intervals
        data_frame_rets = data_frame_rets.resample('1min')
        data_frame_rets = data_frame_rets.fillna(value=0)
        data_frame_rets = tsf.remove_out_FX_out_of_hours(data_frame_rets)

    data_frame_rets['Ind'] = numpy.nan

    start_index = data_frame_rets.index.searchsorted(ef_time_start)
    finish_index = data_frame_rets.index.searchsorted(ef_time_end)

    # not all observation windows will be the same length (eg. the last one?)
    # fill the indices which represent minutes
    # TODO vectorise this!
    for i in range(0, len(ef_time_frame.index)):
        try:
            data_frame_rets.ix[start_index[i]:finish_index[i], 'Ind'] = ords
        except:
            data_frame_rets.ix[start_index[i]:finish_index[i], 'Ind'] = \
                ords[0:(finish_index[i] - start_index[i])]

    # set the release dates
    data_frame_rets.ix[start_index, 'Rel'] = ef_time                               # set entry points
    data_frame_rets.ix[finish_index + 1, 'Rel'] = numpy.zeros(len(start_index))    # set exit points
    data_frame_rets['Rel'] = data_frame_rets['Rel'].fillna(method='pad')           # fill down signals

    data_frame_rets = data_frame_rets[pandas.notnull(data_frame_rets['Ind'])]      # get rid of other rows

    data_frame = data_frame_rets.pivot(index='Ind', columns='Rel',
                                       values=data_frame_rets.columns[0])

    data_frame.index.names = [None]

    if create_index:
        tsc = TimeSeriesCalcs()
        data_frame.ix[-minute_start + min_offset, :] = numpy.nan
        data_frame = tsc.create_mult_index(data_frame)
    else:
        if vol is True:
            # annualise (if vol)
            data_frame = pandas.rolling_std(data_frame, window=5) * math.sqrt(ann_factor)
        else:
            data_frame = data_frame.cumsum()

    return data_frame
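# A minimal sketch of calling the windowing routine above directly with a custom
# event table: synthetic 1-minute returns and two made-up event timestamps.
# event_study = an instance of the class owning this method (an assumption).
import numpy, pandas

minutes = pandas.date_range('2014-01-02', periods=5 * 1440, freq='1min')
rets = pandas.DataFrame(numpy.random.normal(0.0, 1e-4, len(minutes)),
                        index=minutes, columns=['EURUSD'])

events = pandas.DataFrame(index=pandas.DatetimeIndex(
    ['2014-01-03 13:30', '2014-01-06 13:30']))

# cumulative moves from 5 minutes before to 3 hours after each event,
# returned as one column per event timestamp
moves = event_study.get_intraday_moves_over_custom_event(rets, events)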
def calculate_leverage_factor(self, returns_df, vol_target, vol_max_leverage, vol_periods=60,
                              vol_obs_in_year=252, vol_rebalance_freq='BM', data_resample_freq=None,
                              data_resample_type='mean', returns=True, period_shift=0):
    """
    calculate_leverage_factor - Calculates the time series of leverage for a specified vol target

    Parameters
    ----------
    returns_df : DataFrame
        Asset returns

    vol_target : float
        vol target for the assets

    vol_max_leverage : float
        maximum leverage allowed

    vol_periods : int
        number of periods used to calculate volatility

    vol_obs_in_year : int
        number of observations in the year

    vol_rebalance_freq : str
        how often to rebalance

    data_resample_freq : str
        do we need to resample the underlying data first? (eg. have we got intraday data?)

    data_resample_type : str
        how to resample the underlying data (eg. 'mean')

    returns : boolean
        is this a returns time series or prices?

    period_shift : int
        should we delay the signal by a number of periods?

    Returns
    -------
    pandas.DataFrame
    """

    tsc = TimeSeriesCalcs()
    tsf = TimeSeriesFilter()

    if data_resample_freq is not None:
        return
        # TODO not implemented yet

    if not returns: returns_df = tsc.calculate_returns(returns_df)

    roll_vol_df = tsc.rolling_volatility(returns_df,
                                         periods=vol_periods,
                                         obs_in_year=vol_obs_in_year).shift(period_shift)

    # calculate the leverage as a function of the vol target (with a max leverage constraint)
    lev_df = vol_target / roll_vol_df
    lev_df[lev_df > vol_max_leverage] = vol_max_leverage

    lev_df = tsf.resample_time_series_frequency(lev_df, vol_rebalance_freq, data_resample_type)

    returns_df, lev_df = returns_df.align(lev_df, join='left', axis=0)

    lev_df = lev_df.fillna(method='ffill')
    lev_df.ix[0:vol_periods] = numpy.nan    # ignore the first elements before the vol window kicks in

    return lev_df
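# Worked sketch of the leverage rule above: leverage = vol_target / realised vol,
# capped at vol_max_leverage. With a 10% target and roughly 5% realised vol the
# factor hovers around 0.1 / 0.05 = 2x between rebalances. The risk_engine instance
# and the synthetic returns are assumptions.
import numpy, pandas

dates = pandas.bdate_range('2012-01-01', '2014-12-31')
returns_df = pandas.DataFrame(
    numpy.random.normal(0.0, 0.05 / numpy.sqrt(252.0), len(dates)),  # ~5% annualised vol
    index=dates, columns=['asset'])

lev_df = risk_engine.calculate_leverage_factor(returns_df,
                                               vol_target=0.10,
                                               vol_max_leverage=5.0)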
def average_by_month_day(self, data_frame):
    date_index = data_frame.index

    return data_frame.\
        groupby([date_index.month, date_index.day]).mean()

def group_by_year(self, data_frame):
    date_index = data_frame.index

    return data_frame.\
        groupby([date_index.year])

def average_by_day_hour_min_by_bus_day(self, data_frame):
    date_index = data_frame.index

    return data_frame.\
        groupby([Calendar().get_bus_day_of_month(date_index),
                 date_index.hour, date_index.minute]).mean()

def remove_NaN_rows(self, data_frame):
    return data_frame.dropna()

if __name__ == '__main__':
    tsc = TimeSeriesCalcs()
    tsf = TimeSeriesFilter()

    # test rolling ewma
    date_range = pandas.bdate_range('2014-01-01', '2014-02-28')

    print(tsc.get_bus_day_of_month(date_range))

    foo = pandas.DataFrame(numpy.arange(0.0, 13.0))
    print(tsc.rolling_ewma(foo, span=3))
class LightEventsFactory(EventStudy):

    _econ_data_frame = None

    # where your HDF5 file is stored with economic data MUST CHANGE!!
    _hdf5_file_econ_file = "somefilnename.h5"

    ### manual offset for certain events where Bloomberg displays the wrong date
    ### (usually because of time differences)
    _offset_events = {'AUD-Australia Labor Force Employment Change SA.release-dt': 1}

    def __init__(self):
        super(EventStudy, self).__init__()

        self.config = ConfigManager()
        self.logger = LoggerManager().getLogger(__name__)
        self.time_series_filter = TimeSeriesFilter()
        self.time_series_io = TimeSeriesIO()

        if (LightEventsFactory._econ_data_frame is None):
            self.load_economic_events()

        return

    def load_economic_events(self):
        LightEventsFactory._econ_data_frame = self.time_series_io.read_time_series_cache_from_disk(
            self._hdf5_file_econ_file)

    def harvest_category(self, category_name):
        cat = self.config.get_categories_from_tickers_selective_filter(category_name)

        for k in cat:
            time_series_request = self.time_series_factory.populate_time_series_request(k)
            data_frame = self.time_series_factory.harvest_time_series(time_series_request)

        # TODO allow merge of multiple sources
        return data_frame

    def get_economic_events(self):
        return LightEventsFactory._econ_data_frame

    def dump_economic_events_csv(self, path):
        LightEventsFactory._econ_data_frame.to_csv(path)

    def get_economic_event_date_time(self, name, event=None, csv=None):
        ticker = self.create_event_desciptor_field(name, event, "release-date-time-full")

        if csv is None:
            data_frame = LightEventsFactory._econ_data_frame[ticker]
            data_frame.index = LightEventsFactory._econ_data_frame[ticker]
        else:
            dateparse = lambda x: datetime.datetime.strptime(x, '%d/%m/%Y %H:%M')

            data_frame = pandas.read_csv(csv, index_col=0, parse_dates=True, date_parser=dateparse)

        data_frame = data_frame[pandas.notnull(data_frame.index)]

        start_date = datetime.datetime.strptime("01-Jan-1971", "%d-%b-%Y")
        data_frame = self.time_series_filter.filter_time_series_by_date(start_date, None, data_frame)

        return data_frame

    def get_economic_event_date_time_dataframe(self, name, event=None, csv=None):
        series = self.get_economic_event_date_time(name, event, csv)

        data_frame = pandas.DataFrame(series.values, index=series.index)
        data_frame.columns.name = self.create_event_desciptor_field(name, event, "release-date-time-full")

        return data_frame

    def get_economic_event_date_time_fields(self, fields, name, event=None):
        ### acceptable fields
        # actual-release
        # survey-median
        # survey-average
        # survey-high
        # survey-low
        # number-observations
        # release-dt
        # release-date-time-full
        # first-revision
        # first-revision-date

        ticker = []

        # construct tickers of the form USD-US Employees on Nonfarm Payrolls Total MoM Net Change SA.actual-release
        for i in range(0, len(fields)):
            ticker.append(self.create_event_desciptor_field(name, event, fields[i]))

        # index on the release-dt field eg. 20101230 (we shall convert this later)
        ticker_index = self.create_event_desciptor_field(name, event, "release-dt")

        ######## grab event date/times
        event_date_time = self.get_economic_event_date_time(name, event)
        date_time_fore = event_date_time.index

        # create dates for the join later
        date_time_dt = [datetime.datetime(
            date_time_fore[x].year,
            date_time_fore[x].month,
            date_time_fore[x].day) for x in range(len(date_time_fore))]

        event_date_time_frame = pandas.DataFrame(event_date_time.index, date_time_dt)
        event_date_time_frame.index = date_time_dt

        ######## grab event date/fields
        data_frame = LightEventsFactory._econ_data_frame[ticker]
        data_frame.index = LightEventsFactory._econ_data_frame[ticker_index]

        data_frame = data_frame[data_frame.index != 0]              # eliminate any 0 dates (artifact of Excel)
        data_frame = data_frame[pandas.notnull(data_frame.index)]   # eliminate any NaN dates (artifact of Excel)

        ind_dt = data_frame.index

        # convert yyyymmdd format to datetime
        data_frame.index = [datetime.datetime(
            int((ind_dt[x] - (ind_dt[x] % 10000)) / 10000),
            int(((ind_dt[x] % 10000) - (ind_dt[x] % 100)) / 100),
            int(ind_dt[x] % 100)) for x in range(len(ind_dt))]

        # HACK! certain events need an offset because BBG have invalid dates
        if ticker_index in self._offset_events:
            data_frame.index = data_frame.index + timedelta(days=self._offset_events[ticker_index])

        ######## join together event dates/date-time/fields in one data frame
        data_frame = event_date_time_frame.join(data_frame, how='inner')
        data_frame.index = pandas.to_datetime(data_frame.index)
        data_frame.index.name = ticker_index

        return data_frame

    def create_event_desciptor_field(self, name, event, field):
        if event is None:
            return name + "." + field
        else:
            return name + "-" + event + "." + field

    def get_all_economic_events_date_time(self):
        event_names = self.get_all_economic_events()
        columns = ['event-name', 'release-date-time-full']

        data_frame = pandas.DataFrame(data=numpy.zeros((0, len(columns))), columns=columns)

        for event in event_names:
            event_times = self.get_economic_event_date_time(event)

            for time in event_times:
                data_frame = data_frame.append({'event-name': event,
                                                'release-date-time-full': time}, ignore_index=True)

        return data_frame

    def get_all_economic_events(self):
        field_names = LightEventsFactory._econ_data_frame.columns.values

        event_names = [x.split('.')[0] for x in field_names if '.Date' in x]

        event_names_filtered = [x for x in event_names if len(x) > 4]

        # sort list alphabetically (and remove any duplicates)
        return sorted(set(event_names_filtered))

    def get_economic_event_date(self, name, event=None):
        return LightEventsFactory._econ_data_frame[
            self.create_event_desciptor_field(name, event, "release-dt")]

    def get_economic_event_ret_over_custom_event_day(self, data_frame_in, name, event, start, end,
                                                     lagged=False, NYC_cutoff=10):
        # get the times of the events
        event_dates = self.get_economic_event_date_time(name, event)

        return super(LightEventsFactory, self).get_economic_event_ret_over_custom_event_day(
            data_frame_in, event_dates, name, event, start, end, lagged=lagged, NYC_cutoff=NYC_cutoff)

    def get_economic_event_vol_over_event_day(self, vol_in, name, event, start, end, realised=False):
        return self.get_economic_event_ret_over_custom_event_day(
            vol_in, name, event, start, end, lagged=realised)

    def get_daily_moves_over_event(self):
        # TODO
        pass

    # return only US events etc. by dates
    def get_intraday_moves_over_event(self, data_frame_rets, cross, event_fx, event_name, start, end, vol,
                                      mins=3 * 60, min_offset=0, create_index=False, resample=False,
                                      freq='minutes'):
        tsf = TimeSeriesFilter()

        ef_time_frame = self.get_economic_event_date_time_dataframe(event_fx, event_name)
        ef_time_frame = tsf.filter_time_series_by_date(start, end, ef_time_frame)

        return self.get_intraday_moves_over_custom_event(
            data_frame_rets, ef_time_frame, vol,
            mins=mins, min_offset=min_offset,
            create_index=create_index, resample=resample, freq=freq)

    def get_surprise_against_intraday_moves_over_event(self, data_frame_cross_orig, cross, event_fx,
                                                       event_name, start, end, offset_list=[1, 5, 30, 60],
                                                       add_surprise=False, surprise_field='survey-average'):
        tsf = TimeSeriesFilter()
        fields = ['actual-release', 'survey-median', 'survey-average', 'survey-high', 'survey-low']

        ef_time_frame = self.get_economic_event_date_time_fields(fields, event_fx, event_name)
        ef_time_frame = tsf.filter_time_series_by_date(start, end, ef_time_frame)

        return self.get_surprise_against_intraday_moves_over_custom_event(
            data_frame_cross_orig, ef_time_frame, cross, event_fx, event_name, start, end,
            offset_list=offset_list, add_surprise=add_surprise, surprise_field=surprise_field)
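# A usage sketch for LightEventsFactory, assuming _hdf5_file_econ_file above has been
# pointed at a real HDF5 cache of economic events. The payrolls event name follows the
# ticker convention quoted in get_economic_event_date_time_fields; treat it as an
# illustrative assumption.
lef = LightEventsFactory()

print(lef.get_all_economic_events())    # every event name present in the cache

nfp_times = lef.get_economic_event_date_time_dataframe(
    'USD-US Employees on Nonfarm Payrolls Total MoM Net Change SA')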
def get_bus_day_of_month(self, date, cal='FX'):
    """ get_bus_day_of_month(date = list of dates, cal = calendar name)

        returns the business day of the month (ie. 3rd Jan, on a Monday,
        would be the 1st business day of the month)
    """

    tsf = TimeSeriesFilter()

    try:
        date = date.normalize()  # strip times off the dates - for business dates we just want dates!
    except:
        pass

    start = pandas.to_datetime(datetime.datetime(date.year[0], date.month[0], 1))
    end = datetime.datetime.today()  # pandas.to_datetime(datetime.datetime(date.year[-1], date.month[-1], date.day[-1]))

    holidays = tsf.get_holidays(start, end, cal)

    bday = CustomBusinessDay(holidays=holidays, weekmask='Mon Tue Wed Thu Fri')

    bus_dates = pandas.date_range(start, end, freq=bday)

    month = bus_dates.month

    # ordinal of each business day within its month
    work_day_index = numpy.zeros(len(bus_dates))
    work_day_index[0] = 1

    for i in range(1, len(bus_dates)):
        if month[i] == month[i - 1]:
            work_day_index[i] = work_day_index[i - 1] + 1
        else:
            work_day_index[i] = 1

    bus_day_of_month = work_day_index[bus_dates.searchsorted(date)]

    return bus_day_of_month
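# A quick sketch of the expected output, assuming the FX calendar treats 1 Jan 2014
# as a holiday: the 2nd, 3rd and 6th of January are then the 1st, 2nd and 3rd
# business days of the month. (cal_obj = an instance of the owning calendar class.)
import pandas

dates = pandas.to_datetime(['2014-01-02', '2014-01-03', '2014-01-06'])
print(cal_obj.get_bus_day_of_month(dates))   # -> [ 1.  2.  3.]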
# fetch NFP times from Bloomberg
time_series_request = TimeSeriesRequest(
    start_date=start_date,                              # start date
    finish_date=finish_date,                            # finish date
    category="events",
    freq='daily',                                       # daily data
    data_source='bloomberg',                            # use Bloomberg as data source
    tickers=['NFP'],
    fields=['release-date-time-full'],                  # which fields to download
    vendor_tickers=['NFP TCH Index'],                   # ticker (Bloomberg)
    vendor_fields=['ECO_FUTURE_RELEASE_DATE_LIST'],     # which Bloomberg fields to download
    cache_algo='internet_load_return')                  # how to return data

ltsf = LightTimeSeriesFactory()
ts_filter = TimeSeriesFilter()

df_event_times = ltsf.harvest_time_series(time_series_request)

utc_time = pytz.utc
df_event_times = pandas.DataFrame(index=df_event_times['NFP.release-date-time-full'])
df_event_times.index = df_event_times.index.tz_localize(utc_time)  # work in UTC time
df_event_times = ts_filter.filter_time_series_by_date(start_date, finish_date, df_event_times)

# get the last NFP time
start_date = df_event_times.index[-1] - timedelta(minutes=1)
finish_date = start_date + timedelta(minutes=4)

tickers = ['EURUSD', 'JPYUSD', 'GBPUSD']
vendor_tickers = ['EURUSD BGN Curncy', 'USDJPY BGN Curncy', 'GBPUSD BGN Curncy']
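# A plausible continuation (an assumption, not part of the original script): reuse the
# request pattern above to pull minute-level FX data around the last NFP release for
# the tickers just defined. The 'close' field names are illustrative.
time_series_request = TimeSeriesRequest(
    start_date=start_date, finish_date=finish_date,     # minute window around the last NFP
    freq='intraday', data_source='bloomberg',
    tickers=tickers, fields=['close'],
    vendor_tickers=vendor_tickers, vendor_fields=['close'],
    cache_algo='internet_load_return')

df = ltsf.harvest_time_series(time_series_request)
print(df)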