class LoaderQuandl(LoaderTemplate):
    """Loads daily time series from Quandl and translates vendor tickers/fields
    into internal ("Thalesians") names, labelling columns as "ticker.field".
    """

    def __init__(self):
        super(LoaderQuandl, self).__init__()
        self.logger = LoggerManager().getLogger(__name__)

    # implement method in abstract superclass
    def load_ticker(self, time_series_request):
        """Fetch daily data for a request.

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            describes tickers, fields and the date window to download

        Returns
        -------
        pandas.DataFrame or None
            None when the download failed or returned no rows
        """
        time_series_request_vendor = self.construct_vendor_time_series_request(time_series_request)

        self.logger.info("Request Quandl data")

        data_frame = self.download_daily(time_series_request_vendor)

        # BUG FIX: the original guard was "data_frame.index is []", which is
        # always False ('is' compares identity against a brand-new list), so
        # empty downloads slipped through; test emptiness explicitly instead
        if data_frame is None or len(data_frame.index) == 0:
            return None

        # convert from vendor to Thalesians tickers/fields
        returned_tickers = data_frame.columns

        # tidy up tickers into a format that is more easily translatable
        returned_tickers = [x.replace(' - Value', '') for x in returned_tickers]
        returned_tickers = [x.replace(' - VALUE', '') for x in returned_tickers]
        returned_tickers = [x.replace('.', '/') for x in returned_tickers]

        # Quandl daily data is treated as a single 'close' field per ticker
        fields = self.translate_from_vendor_field(['close' for x in returned_tickers], time_series_request)
        tickers = self.translate_from_vendor_ticker(returned_tickers, time_series_request)

        # label columns as "ticker.field"
        data_frame.columns = [tickers[i] + "." + fields[i] for i in range(0, len(fields))]
        data_frame.index.name = 'Date'

        self.logger.info("Completed request from Quandl.")

        return data_frame

    def download_daily(self, time_series_request):
        """Call Quandl.get with up to five retries.

        Returns the downloaded DataFrame, or None when every attempt failed
        (an error is logged in that case).
        """
        trials = 0
        data_frame = None

        while trials < 5:
            try:
                data_frame = Quandl.get(time_series_request.tickers,
                                        authtoken=Constants().quandl_api_key,
                                        trim_start=time_series_request.start_date,
                                        trim_end=time_series_request.finish_date)
                break
            except Exception:
                # narrowed from a bare "except:" so KeyboardInterrupt/SystemExit propagate
                trials = trials + 1
                self.logger.info("Attempting... " + str(trials) + " request to download from Quandl")

        if trials == 5:
            self.logger.error("Couldn't download from Quandl after several attempts!")

        return data_frame
class BBGLowLevelRef(BBGLowLevelTemplate):
    """Low-level Bloomberg reference-data request handler: builds and sends a
    ReferenceDataRequest on the //blp/refdata service and parses the response
    messages into a pandas DataFrame with (field, ticker) MultiIndex columns.
    """

    def __init__(self):
        super(BBGLowLevelRef, self).__init__()

        self.logger = LoggerManager().getLogger(__name__)
        # request options; replaced by an OptionsBBG instance in fill_options
        self._options = []

    # populate options for Bloomberg request for asset intraday request
    def fill_options(self, time_series_request):
        """Translate a TimeSeriesRequest into the OptionsBBG object used to
        build the Bloomberg request (securities, date window, fields)."""
        self._options = OptionsBBG()

        self._options.security = time_series_request.tickers
        self._options.startDateTime = time_series_request.start_date
        self._options.endDateTime = time_series_request.finish_date
        self._options.fields = time_series_request.fields

        return self._options

    def process_message(self, msg):
        """Parse one Bloomberg response message.

        Walks each securityData element, collecting array-valued fields into a
        dict keyed by (field, ticker), then converts that into a DataFrame.
        Returns None when no values came back (e.g. an obsolete ticker).
        """
        data = collections.defaultdict(dict)

        # process received events
        securityDataArray = msg.getElement('securityData')

        # running row counter shared across all securities in this message
        index = 0

        for securityData in list(securityDataArray.values()):
            ticker = securityData.getElementAsString("security")
            fieldData = securityData.getElement("fieldData")

            for field in fieldData.elements():
                if not field.isValid():
                    field_name = "%s" % field.name()

                    self.logger.error(field_name + " is NULL")
                elif field.isArray():
                    # iterate over complex data returns.
                    field_name = "%s" % field.name()

                    for i, row in enumerate(field.values()):
                        # row values arrive quoted; extract the quoted payload
                        data[(field_name, ticker)][index] = re.findall(r'"(.*?)"', "%s" % row)[0]

                        index = index + 1
                # else:
                #     vals.append(re.findall(r'"(.*?)"', "%s" % row)[0])
                    # print("%s = %s" % (field.name(), field.getValueAsString()))

            # report per-field errors Bloomberg attached to this security
            fieldExceptionArray = securityData.getElement("fieldExceptions")

            for fieldException in list(fieldExceptionArray.values()):
                errorInfo = fieldException.getElement("errorInfo")
                print(errorInfo.getElementAsString("category"), ":", \
                      fieldException.getElementAsString("fieldId"))

        data_frame = pandas.DataFrame(data)

        # if obsolete ticker could return no values
        if (not(data_frame.empty)):
            data_frame.columns = pandas.MultiIndex.from_tuples(data, names=['field', 'ticker'])
            # NOTE: 'ticker' here is whichever security was processed last
            self.logger.info("Reading: " + ticker + ' ' + str(data_frame.index[0]) + ' - ' + str(data_frame.index[-1]))
        else:
            return None

        return data_frame

    def combine_slices(self, data_frame, data_frame_slice):
        """Outer-join a new slice onto the aggregate, but only if its ticker
        (column level 1) is not already present; otherwise return unchanged."""
        if (data_frame_slice.columns.get_level_values(1).values[0] not in data_frame.columns.get_level_values(1).values):
            return data_frame.join(data_frame_slice, how="outer")

        return data_frame

    # create request for data
    def send_bar_request(self, session, eventQueue):
        """Build the ReferenceDataRequest (dates/timezone applied via
        overrides) and send it on the given Bloomberg session."""
        refDataService = session.getService("//blp/refdata")
        request = refDataService.createRequest('ReferenceDataRequest')

        self.add_override(request, 'TIME_ZONE_OVERRIDE', 23)    # force GMT time
        self.add_override(request, 'START_DT', self._options.startDateTime.strftime('%Y%m%d'))
        self.add_override(request, 'END_DT', self._options.endDateTime.strftime('%Y%m%d'))

        # only one security/eventType per request
        for field in self._options.fields:
            request.getElement("fields").appendValue(field)

        for security in self._options.security:
            request.getElement("securities").appendValue(security)

        self.logger.info("Sending Bloomberg Ref Request:" + str(request))
        session.sendRequest(request)
class LoaderQuandl(LoaderTemplate):
    """Loads daily time series from Quandl, deriving both tickers and field
    names from the returned column labels ("TICKER - Field") and translating
    them into internal names; columns come back labelled "ticker.field".
    """

    def __init__(self):
        super(LoaderQuandl, self).__init__()
        self.logger = LoggerManager().getLogger(__name__)

    # implement method in abstract superclass
    def load_ticker(self, time_series_request):
        """Fetch daily data for a request.

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            describes tickers, fields and the date window to download

        Returns
        -------
        pandas.DataFrame or None
            None when the download failed or returned no rows
        """
        time_series_request_vendor = self.construct_vendor_time_series_request(
            time_series_request)

        self.logger.info("Request Quandl data")

        data_frame = self.download_daily(time_series_request_vendor)

        # BUG FIX: the original guard was "data_frame.index is []", which is
        # always False ('is' compares identity against a brand-new list), so
        # empty downloads slipped through; test emptiness explicitly instead
        if data_frame is None or len(data_frame.index) == 0:
            return None

        # convert from vendor to Thalesians tickers/fields
        returned_tickers = data_frame.columns

        # tidy up tickers into a format that is more easily translatable
        # we can often get multiple fields returned (even if we don't ask for them!)
        # convert to lower case
        returned_fields = [(x.split(' - ')[1]).lower().replace(' ', '-')
                           for x in returned_tickers]
        returned_fields = [x.replace('value', 'close')
                           for x in returned_fields]  # special case for close

        returned_tickers = [x.replace('.', '/') for x in returned_tickers]
        returned_tickers = [x.split(' - ')[0] for x in returned_tickers]

        fields = self.translate_from_vendor_field(returned_fields, time_series_request)
        tickers = self.translate_from_vendor_ticker(returned_tickers, time_series_request)

        # label columns as "ticker.field"
        data_frame.columns = [tickers[i] + "." + fields[i] for i in range(0, len(fields))]
        data_frame.index.name = 'Date'

        self.logger.info("Completed request from Quandl.")

        return data_frame

    def download_daily(self, time_series_request):
        """Call Quandl.get with up to five retries.

        Returns the downloaded DataFrame, or None when every attempt failed
        (an error is logged in that case).
        """
        trials = 0
        data_frame = None

        while trials < 5:
            try:
                data_frame = Quandl.get(
                    time_series_request.tickers,
                    authtoken=Constants().quandl_api_key,
                    trim_start=time_series_request.start_date,
                    trim_end=time_series_request.finish_date)
                break
            except Exception:
                # narrowed from a bare "except:" so KeyboardInterrupt/SystemExit propagate
                trials = trials + 1
                self.logger.info("Attempting... " + str(trials) + " request to download from Quandl")

        if trials == 5:
            self.logger.error("Couldn't download from Quandl after several attempts!")

        return data_frame
class BBGLowLevelRef(BBGLowLevelTemplate):
    """Low-level Bloomberg reference-data request handler: builds and sends a
    ReferenceDataRequest on the //blp/refdata service and parses the response
    messages into a pandas DataFrame with (field, ticker) MultiIndex columns.
    """

    def __init__(self):
        super(BBGLowLevelRef, self).__init__()

        self.logger = LoggerManager().getLogger(__name__)
        # request options; replaced by an OptionsBBG instance in fill_options
        self._options = []

    # populate options for Bloomberg request for asset intraday request
    def fill_options(self, time_series_request):
        """Translate a TimeSeriesRequest into the OptionsBBG object used to
        build the Bloomberg request (securities, date window, fields)."""
        self._options = OptionsBBG()

        self._options.security = time_series_request.tickers
        self._options.startDateTime = time_series_request.start_date
        self._options.endDateTime = time_series_request.finish_date
        self._options.fields = time_series_request.fields

        return self._options

    def process_message(self, msg):
        """Parse one Bloomberg response message.

        Walks each securityData element, collecting array-valued fields into a
        dict keyed by (field, ticker), then converts that into a DataFrame.
        Returns None when no values came back (e.g. an obsolete ticker).
        """
        data = collections.defaultdict(dict)

        # process received events
        securityDataArray = msg.getElement('securityData')

        # running row counter shared across all securities in this message
        index = 0

        for securityData in list(securityDataArray.values()):
            ticker = securityData.getElementAsString("security")
            fieldData = securityData.getElement("fieldData")

            for field in fieldData.elements():
                if not field.isValid():
                    field_name = "%s" % field.name()

                    self.logger.error(field_name + " is NULL")
                elif field.isArray():
                    # iterate over complex data returns.
                    field_name = "%s" % field.name()

                    for i, row in enumerate(field.values()):
                        # row values arrive quoted; extract the quoted payload
                        data[(field_name, ticker)][index] = re.findall(
                            r'"(.*?)"', "%s" % row)[0]

                        index = index + 1
                # else:
                #     vals.append(re.findall(r'"(.*?)"', "%s" % row)[0])
                    # print("%s = %s" % (field.name(), field.getValueAsString()))

            # report per-field errors Bloomberg attached to this security
            fieldExceptionArray = securityData.getElement("fieldExceptions")

            for fieldException in list(fieldExceptionArray.values()):
                errorInfo = fieldException.getElement("errorInfo")
                print(errorInfo.getElementAsString("category"), ":", \
                      fieldException.getElementAsString("fieldId"))

        data_frame = pandas.DataFrame(data)

        # if obsolete ticker could return no values
        if (not (data_frame.empty)):
            data_frame.columns = pandas.MultiIndex.from_tuples(
                data, names=['field', 'ticker'])
            # NOTE: 'ticker' here is whichever security was processed last
            self.logger.info("Reading: " + ticker + ' ' + str(data_frame.index[0]) + ' - ' + str(data_frame.index[-1]))
        else:
            return None

        return data_frame

    def combine_slices(self, data_frame, data_frame_slice):
        """Outer-join a new slice onto the aggregate, but only if its ticker
        (column level 1) is not already present; otherwise return unchanged."""
        if (data_frame_slice.columns.get_level_values(1).values[0] not in data_frame.columns.get_level_values(1).values):
            return data_frame.join(data_frame_slice, how="outer")

        return data_frame

    # create request for data
    def send_bar_request(self, session, eventQueue):
        """Build the ReferenceDataRequest (dates/timezone applied via
        overrides) and send it on the given Bloomberg session."""
        refDataService = session.getService("//blp/refdata")
        request = refDataService.createRequest('ReferenceDataRequest')

        self.add_override(request, 'TIME_ZONE_OVERRIDE', 23)    # force GMT time
        self.add_override(request, 'START_DT', self._options.startDateTime.strftime('%Y%m%d'))
        self.add_override(request, 'END_DT', self._options.endDateTime.strftime('%Y%m%d'))

        # only one security/eventType per request
        for field in self._options.fields:
            request.getElement("fields").appendValue(field)

        for security in self._options.security:
            request.getElement("securities").appendValue(security)

        self.logger.info("Sending Bloomberg Ref Request:" + str(request))
        session.sendRequest(request)
class LightTimeSeriesFactory:
    """Fetches time series from external data providers (Bloomberg, Quandl,
    Dukascopy, Yahoo/Google/FRED via pandas), caching daily downloads in a
    class-level in-memory dict shared by all instances.
    """

    _time_series_cache = {}   # shared across all instances of object!

    def __init__(self):
        # NOTE(review): self.config is referenced in harvest_time_series but
        # never assigned because this line is commented out — confirm whether
        # the category-driven ticker lookup path is still supported
        # self.config = ConfigManager()
        self.logger = LoggerManager().getLogger(__name__)
        self.time_series_filter = TimeSeriesFilter()
        self.time_series_io = TimeSeriesIO()
        self._bbg_default_api = Constants().bbg_default_api
        self._intraday_code = -1

        return

    def set_bloomberg_com_api(self):
        """ set_bloomberg_com_api - Sets Bloomberg API to COM library
        """
        self._bbg_default_api = 'com-api'

    def set_bloomberg_open_api(self):
        """ set_bloomberg_open_api - Sets Bloomberg API to OpenAPI (recommended)
        """
        self._bbg_default_api = 'open-api'

    def flush_cache(self):
        """ flush_cache - Flushes internal cache of time series
        """
        self._time_series_cache = {}

    def set_intraday_code(self, code):
        # code used when keying/handling intraday requests
        self._intraday_code = code

    def get_loader(self, source):
        """ get_loader - Loads appropriate data service class

        Parameters
        ----------
        source : str
            the data service to use "bloomberg", "quandl", "yahoo", "google", "fred" etc.

        Returns
        -------
        LoaderTemplate
        """
        loader = None

        if source == 'bloomberg':
            ### allow use of COM API (older) and Open APIs (newer) for Bloomberg
            if self._bbg_default_api == 'com-api':
                from pythalesians.market.loaders.lowlevel.bbg.loaderbbg import LoaderBBGCOM
                loader = LoaderBBGCOM()
            elif self._bbg_default_api == 'open-api':
                from pythalesians.market.loaders.lowlevel.bbg.loaderbbgopen import LoaderBBGOpen
                loader = LoaderBBGOpen()

        elif source == 'quandl':
            from pythalesians.market.loaders.lowlevel.quandl.loaderquandl import LoaderQuandl
            loader = LoaderQuandl()

        elif source == 'dukascopy':
            from pythalesians.market.loaders.lowlevel.brokers.loaderdukascopy import LoaderDukasCopy
            loader = LoaderDukasCopy()

        elif source in ['yahoo', 'google', 'fred']:
            from pythalesians.market.loaders.lowlevel.pandasweb.loaderpandasweb import LoaderPandasWeb
            loader = LoaderPandasWeb()

        # TODO add support for other data sources (like Reuters)

        return loader

    def harvest_time_series(self, time_series_request, kill_session = True):
        """ harvest_time_series - Loads time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.
        kill_session : bool
            whether to close the loader's vendor session after downloading

        Returns
        -------
        pandas.DataFrame
        """
        tickers = time_series_request.tickers
        loader = self.get_loader(time_series_request.data_source)

        # check if tickers have been specified (if not load all of them for a category)
        # also handle single tickers/list tickers
        create_tickers = False

        if tickers is None :
            create_tickers = True
        elif isinstance(tickers, str):
            if tickers == '': create_tickers = True
        elif isinstance(tickers, list):
            if tickers == []: create_tickers = True

        if create_tickers:
            # NOTE(review): self.config is never assigned (see __init__), and
            # 'time_series_request.source' looks inconsistent with the
            # 'data_source' attribute used elsewhere — verify before relying on this path
            time_series_request.tickers = self.config.get_tickers_list_for_category(
                time_series_request.category, time_series_request.source, time_series_request.freq, time_series_request.cut)

        # intraday or tick: only one ticker per cache file
        if (time_series_request.freq in ['intraday', 'tick']):
            data_frame_agg = self.download_intraday_tick(time_series_request, loader)

        # daily: multiple tickers per cache file - assume we make one API call to vendor library
        else:
            data_frame_agg = self.download_daily(time_series_request, loader)

        if('internet_load' in time_series_request.cache_algo):
            self.logger.debug("Internet loading.. ")

            # signal to loader template to exit session
            if loader is not None and kill_session == True: loader.kill_session()

        if(time_series_request.cache_algo == 'cache_algo'):
            self.logger.debug("Only caching data in memory, do not return any time series."); return

        tsf = TimeSeriesFilter()

        # only return time series if specified in the algo
        if 'return' in time_series_request.cache_algo:
            # special case for events/events-dt which is not indexed like other tables
            if hasattr(time_series_request, 'category'):
                if 'events' in time_series_request.category:
                    return data_frame_agg

            try:
                return tsf.filter_time_series(time_series_request, data_frame_agg)
            except:
                import traceback

                self.logger.error(traceback.format_exc())

                return None

    def get_time_series_cached(self, time_series_request):
        """ get_time_series_cached - Loads time series from cache (if it exists)

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        # intraday entries are keyed per-ticker; daily entries are not
        if (time_series_request.freq == "intraday"):
            ticker = time_series_request.tickers
        else:
            ticker = None

        fname = self.create_time_series_hash_key(time_series_request, ticker)

        if (fname in self._time_series_cache):
            data_frame = self._time_series_cache[fname]

            tsf = TimeSeriesFilter()

            return tsf.filter_time_series(time_series_request, data_frame)

        return None

    def create_time_series_hash_key(self, time_series_request, ticker = None):
        """ create_time_series_hash_key - Creates a hash key for retrieving the time series

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        str
        """
        if(isinstance(ticker, list)):
            ticker = ticker[0]

        return self.create_cache_file_name(
            self.create_category_key(time_series_request, ticker))

    def download_intraday_tick(self, time_series_request, loader):
        """ download_intraday_tick - Loads intraday time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        data_frame_agg = None

        ticker_cycle = 0

        # handle intraday ticker calls separately one by one
        for ticker in time_series_request.tickers:
            time_series_request_single = copy.copy(time_series_request)
            time_series_request_single.tickers = ticker

            # keep vendor ticker aligned with the internal ticker being fetched
            if hasattr(time_series_request, 'vendor_tickers'):
                time_series_request_single.vendor_tickers = [time_series_request.vendor_tickers[ticker_cycle]]
                ticker_cycle = ticker_cycle + 1

            # we downscale into float32, to avoid memory problems in Python (32 bit)
            # data is stored on disk as float32 anyway
            data_frame_single = loader.load_ticker(time_series_request_single)

            # if the vendor doesn't provide any data, don't attempt to append
            if data_frame_single is not None:
                if data_frame_single.empty == False:
                    data_frame_single.index.name = 'Date'
                    data_frame_single = data_frame_single.astype('float32')

                    # if you call for returning multiple tickers, be careful with memory considerations!
                    if data_frame_agg is not None:
                        data_frame_agg = data_frame_agg.join(data_frame_single, how='outer')
                    else:
                        data_frame_agg = data_frame_single

        # key = self.create_category_key(time_series_request, ticker)
        # fname = self.create_cache_file_name(key)
        # self._time_series_cache[fname] = data_frame_agg  # cache in memory (disable for intraday)

        return data_frame_agg

    def download_daily(self, time_series_request, loader):
        """ download_daily - Loads daily time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        # daily data does not include ticker in the key, as multiple tickers in the same file
        data_frame_agg = loader.load_ticker(time_series_request)

        key = self.create_category_key(time_series_request)
        fname = self.create_cache_file_name(key)
        self._time_series_cache[fname] = data_frame_agg  # cache in memory (ok for daily data)

        return data_frame_agg

    def create_category_key(self, time_series_request, ticker=None):
        """ create_category_key - Returns a category key for the associated TimeSeriesRequest

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        str
        """
        # fall back to defaults when optional attributes are absent
        category = 'default-cat'
        cut = 'default-cut'

        if hasattr(time_series_request, 'category'): category = time_series_request.category

        source = time_series_request.data_source
        freq = time_series_request.freq

        if hasattr(time_series_request, 'cut'): cut = time_series_request.cut

        if (ticker is not None):
            key = category + '.' + source + '.' + freq + '.' + cut + '.' + ticker
        else:
            key = category + '.' + source + '.' + freq + '.' + cut

        return key

    def create_cache_file_name(self, filename):
        # prepend the configured time series data folder
        return Constants().folder_time_series_data + "/" + filename
class LightTimeSeriesFactory:
    """Fetches time series from external data providers (Bloomberg, Quandl,
    Dukascopy, Yahoo/Google/FRED via pandas), optionally fanning requests out
    over a thread/process pool, and caching daily downloads in a class-level
    in-memory dict shared by all instances.
    """

    _time_series_cache = {}   # shared across all instances of object!

    def __init__(self):
        # NOTE(review): self.config is referenced in harvest_time_series but
        # never assigned because this line is commented out — confirm whether
        # the category-driven ticker lookup path is still supported
        # self.config = ConfigManager()
        self.logger = LoggerManager().getLogger(__name__)
        self.time_series_filter = TimeSeriesFilter()
        self.time_series_io = TimeSeriesIO()
        self._bbg_default_api = Constants().bbg_default_api
        self._intraday_code = -1

    def set_bloomberg_com_api(self):
        """Sets Bloomberg API to COM library."""
        self._bbg_default_api = 'com-api'

    def set_bloomberg_open_api(self):
        """Sets Bloomberg API to OpenAPI (recommended)."""
        self._bbg_default_api = 'open-api'

    def flush_cache(self):
        """Flushes internal cache of time series."""
        self._time_series_cache = {}

    def set_intraday_code(self, code):
        # code used when keying/handling intraday requests
        self._intraday_code = code

    def get_loader(self, source):
        """ get_loader - Loads appropriate data service class

        Parameters
        ----------
        source : str
            the data service to use "bloomberg", "quandl", "yahoo", "google", "fred" etc.

        Returns
        -------
        LoaderTemplate
        """
        loader = None

        if source == 'bloomberg':
            ### allow use of COM API (older) and Open APIs (newer) for Bloomberg
            if self._bbg_default_api == 'com-api':
                from pythalesians.market.loaders.lowlevel.bbg.loaderbbg import LoaderBBGCOM
                loader = LoaderBBGCOM()
            elif self._bbg_default_api == 'open-api':
                from pythalesians.market.loaders.lowlevel.bbg.loaderbbgopen import LoaderBBGOpen
                loader = LoaderBBGOpen()

        elif source == 'quandl':
            from pythalesians.market.loaders.lowlevel.quandl.loaderquandl import LoaderQuandl
            loader = LoaderQuandl()

        elif source == 'dukascopy':
            from pythalesians.market.loaders.lowlevel.brokers.loaderdukascopy import LoaderDukasCopy
            loader = LoaderDukasCopy()

        elif source in ['yahoo', 'google', 'fred']:
            from pythalesians.market.loaders.lowlevel.pandasweb.loaderpandasweb import LoaderPandasWeb
            loader = LoaderPandasWeb()

        # TODO add support for other data sources (like Reuters)

        return loader

    def harvest_time_series(self, time_series_request, kill_session=True):
        """ harvest_time_series - Loads time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.
        kill_session : bool
            whether to close the loader's vendor session after downloading
            (currently ignored — see note below)

        Returns
        -------
        pandas.DataFrame
        """
        tickers = time_series_request.tickers
        loader = self.get_loader(time_series_request.data_source)

        # check if tickers have been specified (if not load all of them for a category)
        # also handle single tickers/list tickers
        create_tickers = False

        if tickers is None:
            create_tickers = True
        elif isinstance(tickers, str):
            if tickers == '': create_tickers = True
        elif isinstance(tickers, list):
            if tickers == []: create_tickers = True

        if create_tickers:
            # NOTE(review): self.config is never assigned (see __init__), and
            # 'time_series_request.source' looks inconsistent with the
            # 'data_source' attribute used elsewhere — verify before relying on this path
            time_series_request.tickers = self.config.get_tickers_list_for_category(
                time_series_request.category, time_series_request.source, time_series_request.freq, time_series_request.cut)

        # intraday or tick: only one ticker per cache file
        if (time_series_request.freq in ['intraday', 'tick']):
            data_frame_agg = self.download_intraday_tick(time_series_request, loader)

        # daily: multiple tickers per cache file - assume we make one API call to vendor library
        else:
            data_frame_agg = self.download_daily(time_series_request, loader)

        if ('internet_load' in time_series_request.cache_algo):
            self.logger.debug("Internet loading.. ")

            # signal to loader template to exit session
            # NOTE(review): disabled here (the kill_session parameter is a no-op),
            # although the single-threaded variant of this class does kill the
            # session — confirm whether this is deliberate for threaded downloads
            # if loader is not None and kill_session == True: loader.kill_session()

        if (time_series_request.cache_algo == 'cache_algo'):
            self.logger.debug("Only caching data in memory, do not return any time series.")
            return

        tsf = TimeSeriesFilter()

        # only return time series if specified in the algo
        if 'return' in time_series_request.cache_algo:
            # special case for events/events-dt which is not indexed like other tables
            if hasattr(time_series_request, 'category'):
                if 'events' in time_series_request.category:
                    return data_frame_agg

            try:
                return tsf.filter_time_series(time_series_request, data_frame_agg)
            except:
                import traceback

                self.logger.error(traceback.format_exc())

                return None

    def get_time_series_cached(self, time_series_request):
        """ get_time_series_cached - Loads time series from cache (if it exists)

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        # intraday entries are keyed per-ticker; daily entries are not
        if (time_series_request.freq == "intraday"):
            ticker = time_series_request.tickers
        else:
            ticker = None

        fname = self.create_time_series_hash_key(time_series_request, ticker)

        if (fname in self._time_series_cache):
            data_frame = self._time_series_cache[fname]

            tsf = TimeSeriesFilter()

            return tsf.filter_time_series(time_series_request, data_frame)

        return None

    def create_time_series_hash_key(self, time_series_request, ticker=None):
        """ create_time_series_hash_key - Creates a hash key for retrieving the time series

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        str
        """
        if (isinstance(ticker, list)):
            ticker = ticker[0]

        return self.create_cache_file_name(self.create_category_key(time_series_request, ticker))

    def download_intraday_tick(self, time_series_request, loader):
        """ download_intraday_tick - Loads intraday time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        data_frame_agg = None

        ticker_cycle = 0

        # single threaded version
        # handle intraday ticker calls separately one by one
        if len(time_series_request.tickers) == 1 or Constants().time_series_factory_thread_no['other'] == 1:
            for ticker in time_series_request.tickers:
                time_series_request_single = copy.copy(time_series_request)
                time_series_request_single.tickers = ticker

                # keep vendor ticker aligned with the internal ticker being fetched
                if hasattr(time_series_request, 'vendor_tickers'):
                    time_series_request_single.vendor_tickers = [time_series_request.vendor_tickers[ticker_cycle]]
                    ticker_cycle = ticker_cycle + 1

                # we downscale into float32, to avoid memory problems in Python (32 bit)
                # data is stored on disk as float32 anyway
                data_frame_single = loader.load_ticker(time_series_request_single)

                # if the vendor doesn't provide any data, don't attempt to append
                if data_frame_single is not None:
                    if data_frame_single.empty == False:
                        data_frame_single.index.name = 'Date'
                        data_frame_single = data_frame_single.astype('float32')

                        # if you call for returning multiple tickers, be careful with memory considerations!
                        if data_frame_agg is not None:
                            data_frame_agg = data_frame_agg.join(data_frame_single, how='outer')
                        else:
                            data_frame_agg = data_frame_single

            return data_frame_agg
        else:
            time_series_request_list = []

            # create a list of single-ticker TimeSeriesRequests for the pool
            for ticker in time_series_request.tickers:
                time_series_request_single = copy.copy(time_series_request)
                time_series_request_single.tickers = ticker

                if hasattr(time_series_request, 'vendor_tickers'):
                    time_series_request_single.vendor_tickers = [time_series_request.vendor_tickers[ticker_cycle]]
                    ticker_cycle = ticker_cycle + 1

                time_series_request_list.append(time_series_request_single)

            return self.fetch_group_time_series(time_series_request_list)

    def fetch_single_time_series(self, time_series_request):
        """Download one request via its loader; normalizes the index name and
        downcasts to float32 (as stored on disk). Used as the pool worker."""
        data_frame_single = self.get_loader(time_series_request.data_source).load_ticker(time_series_request)

        if data_frame_single is not None:
            if data_frame_single.empty == False:
                data_frame_single.index.name = 'Date'
                data_frame_single = data_frame_single.astype('float32')

        return data_frame_single

    def fetch_group_time_series(self, time_series_request_list):
        """Download a list of requests in parallel and outer-join the results
        into a single DataFrame (None when nothing came back)."""
        data_frame_agg = None

        # depends on the nature of operation as to whether we should use threading or multiprocessing library
        # BUG FIX: was 'is "thread"' — identity comparison against a string
        # literal is implementation-dependent and can silently pick the wrong
        # branch; compare by value instead
        if Constants().time_series_factory_thread_technique == "thread":
            from multiprocessing.dummy import Pool
        else:
            # most of the time is spend waiting for Bloomberg to return, so can use threads rather than multiprocessing
            # must use the multiprocessing_on_dill library otherwise can't pickle objects correctly
            # note: currently not very stable
            from multiprocessing_on_dill import Pool

        thread_no = Constants().time_series_factory_thread_no['other']

        # allow a per-data-source override of the pool size
        if time_series_request_list[0].data_source in Constants().time_series_factory_thread_no:
            thread_no = Constants().time_series_factory_thread_no[
                time_series_request_list[0].data_source]

        pool = Pool(thread_no)

        # open the market data downloads in their own threads and return the results
        result = pool.map_async(self.fetch_single_time_series, time_series_request_list)
        data_frame_group = result.get()

        pool.close()
        pool.join()

        # collect together all the time series
        if data_frame_group is not None:
            for data_frame_single in data_frame_group:
                # if you call for returning multiple tickers, be careful with memory considerations!
                if data_frame_single is not None:
                    if data_frame_agg is not None:
                        data_frame_agg = data_frame_agg.join(data_frame_single, how='outer')
                    else:
                        data_frame_agg = data_frame_single

        return data_frame_agg

    def download_daily(self, time_series_request, loader):
        """ download_daily - Loads daily time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        # daily data does not include ticker in the key, as multiple tickers in the same file
        if Constants().time_series_factory_thread_no['other'] == 1:
            data_frame_agg = loader.load_ticker(time_series_request)
        else:
            time_series_request_list = []

            # NOTE(review): int(len/N - 1) may have been intended as
            # int(len/N) - 1 or int(len/(N - 1)) — confirm the grouping policy
            group_size = int(len(time_series_request.tickers) / Constants().time_series_factory_thread_no['other'] - 1)

            if group_size == 0: group_size = 1

            # split up tickers into groups related to number of threads to call
            for i in range(0, len(time_series_request.tickers), group_size):
                time_series_request_single = copy.copy(time_series_request)
                time_series_request_single.tickers = time_series_request.tickers[i:i + group_size]

                if hasattr(time_series_request, 'vendor_tickers'):
                    time_series_request_single.vendor_tickers = \
                        time_series_request.vendor_tickers[i:i + group_size]

                time_series_request_list.append(time_series_request_single)

            data_frame_agg = self.fetch_group_time_series(time_series_request_list)

        key = self.create_category_key(time_series_request)
        fname = self.create_cache_file_name(key)
        self._time_series_cache[fname] = data_frame_agg  # cache in memory (ok for daily data)

        return data_frame_agg

    def create_category_key(self, time_series_request, ticker=None):
        """ create_category_key - Returns a category key for the associated TimeSeriesRequest

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        str
        """
        # fall back to defaults when optional attributes are absent
        category = 'default-cat'
        cut = 'default-cut'

        if hasattr(time_series_request, 'category'): category = time_series_request.category

        environment = time_series_request.environment
        source = time_series_request.data_source
        freq = time_series_request.freq

        if hasattr(time_series_request, 'cut'): cut = time_series_request.cut

        if (ticker is not None):
            key = environment + "." + category + '.' + source + '.' + freq + '.' + cut + '.' + ticker
        else:
            key = environment + "." + category + '.' + source + '.' + freq + '.' + cut

        return key

    def create_cache_file_name(self, filename):
        # prepend the configured time series data folder
        return Constants().folder_time_series_data + "/" + filename
class HistEconDataFactory: def __init__(self): self.logger = LoggerManager().getLogger(__name__) self._all_econ_tickers = pandas.read_csv(Constants().all_econ_tickers) self._econ_country_codes = pandas.read_csv( Constants().econ_country_codes) self._econ_country_groups = pandas.read_csv( Constants().econ_country_groups) self.time_series_factory = LightTimeSeriesFactory() # if Constants().default_time_series_factory == 'lighttimeseriesfactory': # self.time_series_factory = LightTimeSeriesFactory() # else: # self.time_series_factory = CachedTimeSeriesFactory() # return def get_economic_data_history(self, start_date, finish_date, country_group, data_type, source='fred', cache_algo="internet_load_return"): #vendor_country_codes = self.fred_country_codes[country_group] #vendor_pretty_country = self.fred_nice_country_codes[country_group] if isinstance(country_group, list): pretty_country_names = country_group else: # get all the country names in the country_group pretty_country_names = list(self._econ_country_groups[ self._econ_country_groups["Country Group"] == country_group] ['Country']) # construct the pretty tickers pretty_tickers = [x + '-' + data_type for x in pretty_country_names] # get vendor tickers vendor_tickers = [] for pretty_ticker in pretty_tickers: vendor_ticker = list( self._all_econ_tickers[self._all_econ_tickers["Full Code"] == pretty_ticker][source].values) if vendor_ticker == []: vendor_ticker = None self.logger.error('Could not find match for ' + pretty_ticker) else: vendor_ticker = vendor_ticker[0] vendor_tickers.append(vendor_ticker) vendor_fields = ['close'] if source == 'bloomberg': vendor_fields = ['PX_LAST'] time_series_request = TimeSeriesRequest( start_date=start_date, # start date finish_date=finish_date, # finish date category='economic', freq='daily', # intraday data data_source=source, # use Bloomberg as data source cut='LOC', tickers=pretty_tickers, fields=['close'], # which fields to download vendor_tickers=vendor_tickers, 
vendor_fields=vendor_fields, # which Bloomberg fields to download cache_algo=cache_algo) # how to return data return self.time_series_factory.harvest_time_series( time_series_request) def grasp_coded_entry(self, df, index): df = df.ix[index:].stack() df = df.reset_index() df.columns = ['Date', 'Name', 'Val'] countries = df['Name'] countries = [x.split('-', 1)[0] for x in countries] df['Code'] = sum([ list(self._econ_country_codes[self._econ_country_codes["Country"] == x]['Code']) for x in countries ], []) return df
class HistEconDataFactory: def __init__(self): self.logger = LoggerManager().getLogger(__name__) self._all_econ_tickers = pandas.read_csv(Constants().all_econ_tickers) self._econ_country_codes = pandas.read_csv(Constants().econ_country_codes) self._econ_country_groups = pandas.read_csv(Constants().econ_country_groups) self.time_series_factory = LightTimeSeriesFactory() # if Constants().default_time_series_factory == 'lighttimeseriesfactory': # self.time_series_factory = LightTimeSeriesFactory() # else: # self.time_series_factory = CachedTimeSeriesFactory() # return def get_economic_data_history( self, start_date, finish_date, country_group, data_type, source="fred", cache_algo="internet_load_return" ): # vendor_country_codes = self.fred_country_codes[country_group] # vendor_pretty_country = self.fred_nice_country_codes[country_group] if isinstance(country_group, list): pretty_country_names = country_group else: # get all the country names in the country_group pretty_country_names = list( self._econ_country_groups[self._econ_country_groups["Country Group"] == country_group]["Country"] ) # construct the pretty tickers pretty_tickers = [x + "-" + data_type for x in pretty_country_names] # get vendor tickers vendor_tickers = [] for pretty_ticker in pretty_tickers: vendor_ticker = list( self._all_econ_tickers[self._all_econ_tickers["Full Code"] == pretty_ticker][source].values ) if vendor_ticker == []: vendor_ticker = None self.logger.error("Could not find match for " + pretty_ticker) else: vendor_ticker = vendor_ticker[0] vendor_tickers.append(vendor_ticker) vendor_fields = ["close"] if source == "bloomberg": vendor_fields = ["PX_LAST"] time_series_request = TimeSeriesRequest( start_date=start_date, # start date finish_date=finish_date, # finish date category="economic", freq="daily", # intraday data data_source=source, # use Bloomberg as data source cut="LOC", tickers=pretty_tickers, fields=["close"], # which fields to download vendor_tickers=vendor_tickers, 
vendor_fields=vendor_fields, # which Bloomberg fields to download cache_algo=cache_algo, ) # how to return data return self.time_series_factory.harvest_time_series(time_series_request) def grasp_coded_entry(self, df, index): df = df.ix[index:].stack() df = df.reset_index() df.columns = ["Date", "Name", "Val"] countries = df["Name"] countries = [x.split("-", 1)[0] for x in countries] df["Code"] = sum( [list(self._econ_country_codes[self._econ_country_codes["Country"] == x]["Code"]) for x in countries], [] ) return df
class LightTimeSeriesFactory:
    """Fetches time series from external data providers (Bloomberg, Quandl,
    Dukascopy, pandas-web sources such as Yahoo/Google/FRED), optionally
    splitting a request across threads, and caches daily results in memory.
    """

    # in-memory cache of downloaded time series, keyed by cache file name
    _time_series_cache = {}  # shared across all instances of object!

    def __init__(self):
        # NOTE(review): self.config is never assigned (ConfigManager line is
        # commented out) but is referenced by harvest_time_series when tickers
        # are missing -- confirm before relying on that branch
        # self.config = ConfigManager()
        self.logger = LoggerManager().getLogger(__name__)
        self.time_series_filter = TimeSeriesFilter()
        self.time_series_io = TimeSeriesIO()

        self._bbg_default_api = Constants().bbg_default_api
        self._intraday_code = -1

    def set_bloomberg_com_api(self):
        """set_bloomberg_com_api - Sets Bloomberg API to COM library"""
        self._bbg_default_api = 'com-api'

    def set_bloomberg_open_api(self):
        """set_bloomberg_open_api - Sets Bloomberg API to OpenAPI (recommended)"""
        self._bbg_default_api = 'open-api'

    def flush_cache(self):
        """flush_cache - Flushes internal cache of time series

        FIX: previously this assigned ``self._time_series_cache = {}``, which
        only created an instance attribute shadowing the class-level shared
        cache; the shared dict is now cleared in place.
        """
        self._time_series_cache.clear()

    def set_intraday_code(self, code):
        # code used by loaders when requesting intraday data
        self._intraday_code = code

    def get_loader(self, source):
        """get_loader - Loads appropriate data service class

        Parameters
        ----------
        source : str
            the data service to use "bloomberg", "quandl", "yahoo", "google", "fred" etc.

        Returns
        -------
        LoaderTemplate
        """
        loader = None

        if source == 'bloomberg':
            # allow use of COM API (older) and Open APIs (newer) for Bloomberg
            if self._bbg_default_api == 'com-api':
                from pythalesians.market.loaders.lowlevel.bbg.loaderbbg import LoaderBBGCOM
                loader = LoaderBBGCOM()
            elif self._bbg_default_api == 'open-api':
                from pythalesians.market.loaders.lowlevel.bbg.loaderbbgopen import LoaderBBGOpen
                loader = LoaderBBGOpen()

        elif source == 'quandl':
            from pythalesians.market.loaders.lowlevel.quandl.loaderquandl import LoaderQuandl
            loader = LoaderQuandl()

        elif source == 'dukascopy':
            from pythalesians.market.loaders.lowlevel.brokers.loaderdukascopy import LoaderDukasCopy
            loader = LoaderDukasCopy()

        elif source in ['yahoo', 'google', 'fred']:
            from pythalesians.market.loaders.lowlevel.pandasweb.loaderpandasweb import LoaderPandasWeb
            loader = LoaderPandasWeb()

        # TODO add support for other data sources (like Reuters)

        return loader

    def harvest_time_series(self, time_series_request, kill_session=True):
        """harvest_time_series - Loads time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.
        kill_session : bool
            currently unused (the session kill call below is commented out)

        Returns
        -------
        pandas.DataFrame
        """
        tickers = time_series_request.tickers
        loader = self.get_loader(time_series_request.data_source)

        # check if tickers have been specified (if not load all of them for a category)
        # also handle single tickers/list tickers
        create_tickers = False

        if tickers is None:
            create_tickers = True
        elif isinstance(tickers, str):
            if tickers == '':
                create_tickers = True
        elif isinstance(tickers, list):
            if tickers == []:
                create_tickers = True

        if create_tickers:
            # NOTE(review): uses .source here while other code uses
            # .data_source, and self.config is never set in __init__ -- verify
            time_series_request.tickers = self.config.get_tickers_list_for_category(
                time_series_request.category, time_series_request.source,
                time_series_request.freq, time_series_request.cut)

        # intraday or tick: only one ticker per cache file
        if (time_series_request.freq in ['intraday', 'tick', 'second', 'hour', 'minute']):
            data_frame_agg = self.download_intraday_tick(
                time_series_request, loader)

        # daily: multiple tickers per cache file - assume we make one API call to vendor library
        else:
            data_frame_agg = self.download_daily(time_series_request, loader)

        if ('internet_load' in time_series_request.cache_algo):
            self.logger.debug("Internet loading.. ")

            # signal to loader template to exit session
            # if loader is not None and kill_session == True: loader.kill_session()

        if (time_series_request.cache_algo == 'cache_algo'):
            self.logger.debug(
                "Only caching data in memory, do not return any time series.")
            return

        tsf = TimeSeriesFilter()

        # only return time series if specified in the algo
        if 'return' in time_series_request.cache_algo:
            # special case for events/events-dt which is not indexed like other tables
            if hasattr(time_series_request, 'category'):
                if 'events' in time_series_request.category:
                    return data_frame_agg

            try:
                return tsf.filter_time_series(time_series_request, data_frame_agg)
            except Exception:
                # FIX: bare 'except:' also swallowed SystemExit/KeyboardInterrupt
                import traceback
                self.logger.error(traceback.format_exc())

                return None

    def get_time_series_cached(self, time_series_request):
        """get_time_series_cached - Loads time series from cache (if it exists)

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        # intraday cache entries are keyed per ticker; daily entries are not
        if (time_series_request.freq == "intraday"):
            ticker = time_series_request.tickers
        else:
            ticker = None

        fname = self.create_time_series_hash_key(time_series_request, ticker)

        if (fname in self._time_series_cache):
            data_frame = self._time_series_cache[fname]

            tsf = TimeSeriesFilter()

            return tsf.filter_time_series(time_series_request, data_frame)

        return None

    def create_time_series_hash_key(self, time_series_request, ticker=None):
        """create_time_series_hash_key - Creates a hash key for retrieving the time series

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        str
        """
        if (isinstance(ticker, list)):
            ticker = ticker[0]

        return self.create_cache_file_name(
            self.create_category_key(time_series_request, ticker))

    def download_intraday_tick(self, time_series_request, loader):
        """download_intraday_tick - Loads intraday time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        data_frame_agg = None
        ticker_cycle = 0

        # single threaded version
        # handle intraday ticker calls separately one by one
        if len(time_series_request.tickers) == 1 or Constants(
        ).time_series_factory_thread_no['other'] == 1:
            for ticker in time_series_request.tickers:
                time_series_request_single = copy.copy(time_series_request)
                time_series_request_single.tickers = ticker

                if hasattr(time_series_request, 'vendor_tickers'):
                    time_series_request_single.vendor_tickers = [
                        time_series_request.vendor_tickers[ticker_cycle]
                    ]
                    ticker_cycle = ticker_cycle + 1

                # we downscale into float32, to avoid memory problems in Python (32 bit)
                # data is stored on disk as float32 anyway
                data_frame_single = loader.load_ticker(
                    time_series_request_single)

                # if the vendor doesn't provide any data, don't attempt to append
                if data_frame_single is not None:
                    if not data_frame_single.empty:
                        data_frame_single.index.name = 'Date'
                        data_frame_single = data_frame_single.astype('float32')

                        # if you call for returning multiple tickers, be careful with memory considerations!
                        if data_frame_agg is not None:
                            data_frame_agg = data_frame_agg.join(
                                data_frame_single, how='outer')
                        else:
                            data_frame_agg = data_frame_single

            # key = self.create_category_key(time_series_request, ticker)
            # fname = self.create_cache_file_name(key)
            # self._time_series_cache[fname] = data_frame_agg  # cache in memory (disable for intraday)

            return data_frame_agg

        else:
            time_series_request_list = []

            # create a list of TimeSeriesRequests, one per ticker
            for ticker in time_series_request.tickers:
                time_series_request_single = copy.copy(time_series_request)
                time_series_request_single.tickers = ticker

                if hasattr(time_series_request, 'vendor_tickers'):
                    time_series_request_single.vendor_tickers = [
                        time_series_request.vendor_tickers[ticker_cycle]
                    ]
                    ticker_cycle = ticker_cycle + 1

                time_series_request_list.append(time_series_request_single)

            return self.fetch_group_time_series(time_series_request_list)

    def fetch_single_time_series(self, time_series_request):
        """fetch_single_time_series - Downloads one request via its loader;
        normalises index name and downcasts to float32 where possible.
        """
        data_frame_single = self.get_loader(
            time_series_request.data_source).load_ticker(time_series_request)

        if data_frame_single is not None:
            if not data_frame_single.empty:
                data_frame_single.index.name = 'Date'

                # will fail for dataframes which includes dates
                try:
                    data_frame_single = data_frame_single.astype('float32')
                except Exception:
                    pass

                if time_series_request.freq == "second":
                    # NOTE(review): in modern pandas resample() returns a
                    # Resampler object and needs an aggregation (eg. .mean());
                    # kept as-is to preserve original behaviour -- confirm
                    data_frame_single = data_frame_single.resample("1s")

        return data_frame_single

    def fetch_group_time_series(self, time_series_request_list):
        """fetch_group_time_series - Downloads a list of requests in a worker
        pool and joins the results into a single DataFrame (outer join).
        """
        data_frame_agg = None

        # depends on the nature of operation as to whether we should use threading or multiprocessing library
        # FIX: previously compared with 'is', which tests string identity,
        # not equality; use '==' instead
        if Constants().time_series_factory_thread_technique == "thread":
            from multiprocessing.dummy import Pool
        else:
            # most of the time is spend waiting for Bloomberg to return, so can use threads rather than multiprocessing
            # must use the multiprocessing_on_dill library otherwise can't pickle objects correctly
            # note: currently not very stable
            from multiprocessing_on_dill import Pool

        thread_no = Constants().time_series_factory_thread_no['other']

        # per-data-source override of the thread count, if configured
        if time_series_request_list[0].data_source in Constants(
        ).time_series_factory_thread_no:
            thread_no = Constants().time_series_factory_thread_no[
                time_series_request_list[0].data_source]

        pool = Pool(thread_no)

        # open the market data downloads in their own threads and return the results
        result = pool.map_async(self.fetch_single_time_series,
                                time_series_request_list)
        data_frame_group = result.get()

        pool.close()
        pool.join()

        # collect together all the time series
        if data_frame_group is not None:
            for data_frame_single in data_frame_group:
                # if you call for returning multiple tickers, be careful with memory considerations!
                if data_frame_single is not None:
                    if data_frame_agg is not None:
                        data_frame_agg = data_frame_agg.join(
                            data_frame_single, how='outer')
                    else:
                        data_frame_agg = data_frame_single

        return data_frame_agg

    def download_daily(self, time_series_request, loader):
        """download_daily - Loads daily time series from specified data provider

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        pandas.DataFrame
        """
        # daily data does not include ticker in the key, as multiple tickers in the same file
        if Constants().time_series_factory_thread_no['other'] == 1:
            data_frame_agg = loader.load_ticker(time_series_request)
        else:
            time_series_request_list = []

            group_size = int(
                len(time_series_request.tickers) /
                Constants().time_series_factory_thread_no['other'] - 1)

            if group_size == 0:
                group_size = 1

            # split up tickers into groups related to number of threads to call
            for i in range(0, len(time_series_request.tickers), group_size):
                time_series_request_single = copy.copy(time_series_request)
                time_series_request_single.tickers = time_series_request.tickers[
                    i:i + group_size]

                if hasattr(time_series_request, 'vendor_tickers'):
                    time_series_request_single.vendor_tickers = \
                        time_series_request.vendor_tickers[i:i + group_size]

                time_series_request_list.append(time_series_request_single)

            data_frame_agg = self.fetch_group_time_series(
                time_series_request_list)

        key = self.create_category_key(time_series_request)
        fname = self.create_cache_file_name(key)
        self._time_series_cache[
            fname] = data_frame_agg  # cache in memory (ok for daily data)

        return data_frame_agg

    def create_category_key(self, time_series_request, ticker=None):
        """create_category_key - Returns a category key for the associated TimeSeriesRequest

        Parameters
        ----------
        time_series_request : TimeSeriesRequest
            contains various properties describing time series to fetched, including ticker, start & finish date etc.

        Returns
        -------
        str
        """
        category = 'default-cat'
        cut = 'default-cut'

        if hasattr(time_series_request, 'category'):
            category = time_series_request.category

        environment = time_series_request.environment
        source = time_series_request.data_source
        freq = time_series_request.freq

        if hasattr(time_series_request, 'cut'):
            cut = time_series_request.cut

        # key format: environment.category.source.freq.cut[.ticker]
        if (ticker is not None):
            key = environment + "." + category + '.' + source + '.' + freq + '.' + cut + '.' + ticker
        else:
            key = environment + "." + category + '.' + source + '.' + freq + '.' + cut

        return key

    def create_cache_file_name(self, filename):
        # cache file lives under the configured time series data folder
        return Constants().folder_time_series_data + "/" + filename