def __init__(self, db_cache_server=None, db_cache_port=None, engine='redis'):
    from findatapy.util import DataConstants

    # Fall back to DataConstants defaults when no server/port is supplied,
    # otherwise keep the explicitly passed values
    if db_cache_server is None:
        db_cache_server = DataConstants().db_cache_server

    if db_cache_port is None:
        db_cache_port = DataConstants().db_cache_port

    self.db_cache_server = db_cache_server
    self.db_cache_port = db_cache_port

    self.engine = engine
    self.io_engine = IOEngine()
def force_type_conversion(self, data_frame):
    constants = DataConstants()

    logger = LoggerManager().getLogger(__name__)

    if data_frame is not None:
        if not data_frame.empty:
            # Need to convert numerical and datetime columns separately
            # post pandas 0.23
            for c in data_frame.columns:
                is_date = False

                # Special case for ECO_RELEASE_DT / FIRST_REVISION_DATE
                if 'ECO_RELEASE_DT' in c or 'FIRST_REVISION_DATE' in c:
                    try:
                        temp_col = []  # data_frame[c].values

                        for i in range(0, len(data_frame[c].values)):
                            try:
                                temp_col.append(pd.to_datetime(
                                    str(int(data_frame[c].values[i])),
                                    format='%Y%m%d'))
                            except:
                                temp_col.append(np.datetime64('NaT'))

                        data_frame[c] = temp_col
                    except Exception as e:
                        logger.warning(
                            "Couldn't convert " + str(c)
                            + " to date.. was this column empty? " + str(e))
                else:
                    # Only convert those Bloomberg reference fields to
                    # dates which have been listed explicitly
                    for d in constants.always_date_columns:
                        if d in c:
                            try:
                                data_frame[c] = pd.to_datetime(
                                    data_frame[c], errors='coerce')

                                is_date = True
                                break
                            except:
                                pass

                    # Otherwise this is not a date field so attempt to
                    # convert into numbers
                    if not is_date:
                        try:
                            data_frame[c] = pd.to_numeric(
                                data_frame[c], errors='ignore')
                        except:
                            pass

    logger.debug("Returning converted dataframe...")

    return data_frame
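# --- Illustration (added): the conversions force_type_conversion applies ---
# A minimal standalone sketch, with made-up values, of the two cases handled
# above: YYYYMMDD integers parsed as dates, everything else coerced to numeric
# where possible. This is not part of the original file.
import pandas as _pd_example

_toy = _pd_example.DataFrame({"ECO_RELEASE_DT": [20230103, 20230205],
                              "PX_LAST": ["1.05", "1.07"]})

# Same idea as the ECO_RELEASE_DT branch: int -> str -> datetime (YYYYMMDD)
_toy["ECO_RELEASE_DT"] = _pd_example.to_datetime(
    _toy["ECO_RELEASE_DT"].astype(int).astype(str), format='%Y%m%d')

# Same idea as the fallback numeric branch
_toy["PX_LAST"] = _pd_example.to_numeric(_toy["PX_LAST"], errors='coerce')

print(_toy.dtypes)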
def get_reference_data(self, md_request_vendor, md_request):
    logger = LoggerManager().getLogger(__name__)

    constants = DataConstants()

    end = datetime.utcnow()

    from datetime import timedelta

    # Because very often we may wish to download data about future
    # calendar events
    end = end + timedelta(days=365)
    # end.replace(year = end.year + 1)

    md_request_vendor.finish_date = end

    logger.debug("Requesting ref for " + md_request_vendor.tickers[0]
                 + " etc.")

    data_frame = self.download_ref(md_request_vendor)

    logger.debug("Waiting for ref...")

    # Convert from vendor to findatapy tickers/fields
    if data_frame is not None:
        if data_frame.empty:
            return None

        returned_fields = data_frame.columns.get_level_values(0)
        returned_tickers = data_frame.columns.get_level_values(1)

    if data_frame is not None:
        # TODO if empty try downloading again a year later
        fields = self.translate_from_vendor_field(returned_fields,
                                                  md_request)
        tickers = self.translate_from_vendor_ticker(returned_tickers,
                                                    md_request)

        ticker_combined = []

        for i in range(0, len(fields)):
            ticker_combined.append(tickers[i] + "." + fields[i])

        data_frame.columns = ticker_combined

        # Need to convert numerical and datetime columns separately post
        # pandas 0.23
        data_frame = self.force_type_conversion(data_frame)

        # data_frame = data_frame.apply(pd.to_datetime, errors='ignore')
        # data_frame = data_frame.apply(pd.to_numeric, errors='ignore')

        # TODO coerce will be deprecated from pandas 0.23.0 onwards so remove!
        # data_frame = data_frame.convert_objects(convert_dates='coerce',
        #                                         convert_numeric='coerce')

    return data_frame
def to_parquet(self, df, path, aws_region=constants.aws_region,
               parquet_compression=constants.parquet_compression):
    constants = DataConstants()

    # is_date = False
    #
    # # Force any date columns to default time units (Parquet with pyarrow
    # # has problems with ns dates)
    # for c in df.columns:
    #
    #     # If it's a date column don't attempt to convert to a float
    #     for d in constants.always_date_columns:
    #         if d in c or 'release-dt' in c:
    #             is_date = True
    #             break
    #
    #     if is_date:
    #         try:
    #             df[c] = pd.to_datetime(df[c], errors='coerce',
    #                                    unit=constants.default_time_units)
    #         except:
    #             pass

    try:
        df.index = pd.to_datetime(df.index,
                                  unit=constants.default_time_units)
    except:
        pass

    if 's3://' in path:
        s3 = pyarrow.fs.S3FileSystem(region=aws_region)

        table = pa.Table.from_pandas(df)

        path_in_s3 = path.replace("s3://", "")

        with s3.open_output_stream(path_in_s3) as f:
            pq.write_table(table, f,
                           compression=parquet_compression,
                           coerce_timestamps=constants.default_time_units,
                           allow_truncated_timestamps=True)
    else:
        # pandas.to_parquet doesn't let us pass in parameters to allow
        # coercion of timestamps, ie. ns -> us
        table = pa.Table.from_pandas(df)

        pq.write_table(table, path,
                       compression=parquet_compression,
                       coerce_timestamps=constants.default_time_units,
                       allow_truncated_timestamps=True)
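# --- Illustration (added): writing Parquet via pyarrow with coerced timestamps ---
# A self-contained sketch of the non-S3 branch above, assuming 'us' time units;
# the DataFrame and temporary path are made up for the example.
import os as _os_example
import tempfile as _tempfile_example
import pandas as _pd_pq_example
import pyarrow as _pa_example
import pyarrow.parquet as _pq_example

_df_example = _pd_pq_example.DataFrame(
    {"close": [1.1, 1.2]},
    index=_pd_pq_example.to_datetime(["2023-01-03", "2023-01-04"]))

_out_path = _os_example.path.join(_tempfile_example.gettempdir(),
                                  "example.parquet")

_table = _pa_example.Table.from_pandas(_df_example)
_pq_example.write_table(_table, _out_path, compression="gzip",
                        coerce_timestamps="us",
                        allow_truncated_timestamps=True)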
def getLogger(name=None):
    if not name:
        try:
            logging.config.fileConfig(DataConstants().logging_conf)
        except:
            pass

        log = logging.getLogger()
    elif name not in LoggerManager._loggers.keys():
        try:
            logging.config.fileConfig(DataConstants().logging_conf)
        except:
            pass

        LoggerManager._loggers[name] = logging.getLogger(str(name))

        log = LoggerManager._loggers[name]
    else:
        # Reuse the logger which has already been created for this name
        log = LoggerManager._loggers[name]

    # Recalling fileConfig appears to disable the other loggers,
    # hence apply this hack!
    for name in LoggerManager._loggers.keys():
        LoggerManager._loggers[name].disabled = False

    return log
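# --- Usage sketch (added): obtaining a logger ---
# This mirrors how the rest of the codebase calls getLogger; the import path
# is the one used elsewhere in these files, and __name__ resolves to the
# calling module.
from findatapy.util.loggermanager import LoggerManager

logger = LoggerManager().getLogger(__name__)
logger.info("Logger configured (via DataConstants().logging_conf if available)")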
def start_bloomberg_session(self):
    tries = 0

    session = None

    logger = LoggerManager().getLogger(__name__)

    # Try up to 5 times to start a session
    while tries < 5:
        try:
            # Fill SessionOptions
            sessionOptions = blpapi.SessionOptions()
            sessionOptions.setServerHost(DataConstants().bbg_server)
            sessionOptions.setServerPort(DataConstants().bbg_server_port)

            logger.info("Starting Bloomberg session...")

            # Create a Session
            session = blpapi.Session(sessionOptions)

            # Start a Session
            if not session.start():
                logger.error("Failed to start session.")
                return

            logger.info("Returning session...")

            tries = 5
        except:
            tries = tries + 1

    # BBGLowLevelTemplate._session = session

    if session is None:
        logger.error("Failed to start session.")
        return

    return session
def get_instance(cls, data_constants=None):
    if not ConfigManager.__instance:
        with ConfigManager.__lock:
            if not ConfigManager.__instance:
                ConfigManager.__instance = super(
                    ConfigManager, cls).__new__(ConfigManager)

                if data_constants is None:
                    data_constants = DataConstants()

                ConfigManager.__instance.populate_time_series_dictionaries(
                    data_constants=data_constants)

    return ConfigManager.__instance
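# --- Usage sketch (added): fetching the singleton ---
# Mirrors the test block further below: the first call to get_instance()
# builds the ticker/field dictionaries once (under the lock); subsequent
# calls return the same shared instance.
cm = ConfigManager().get_instance()
print(cm.get_categories_from_tickers())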
def process_message(self, msg):
    constants = DataConstants()

    # Process received events

    # SLOW loop (careful, not all the fields will be returned every time
    # hence need to include the field name in the tuple)
    # perhaps try to run in parallel?
    logger = LoggerManager().getLogger(__name__)

    ticker = msg.getElement('securityData').getElement(
        'security').getValue()
    fieldData = msg.getElement('securityData').getElement('fieldData')

    data = defaultdict(dict)

    # FASTER avoid calling getValue/getElement methods in blpapi,
    # very slow, better to cache variables
    for i in range(fieldData.numValues()):
        mini_field_data = fieldData.getValue(i)
        date = mini_field_data.getElement(0).getValue()

        for j in range(1, mini_field_data.numElements()):
            field_value = mini_field_data.getElement(j)

            data[(str(field_value.name()), ticker)][date] = \
                field_value.getValue()

    # ORIGINAL repeated calling of getValue/getElement is much slower
    # for i in range(fieldData.numValues()):
    #     for j in range(1, fieldData.getValue(i).numElements()):
    #         data[(str(fieldData.getValue(i).getElement(j).name()), ticker)][
    #             fieldData.getValue(i).getElement(0).getValue()] \
    #             = fieldData.getValue(i).getElement(j).getValue()

    data_frame = pd.DataFrame(data)

    # An obsolete ticker could return no values
    if data_frame.empty:
        return None
    else:
        # data_frame.columns = pd.MultiIndex.from_tuples(data,
        #     names=['field', 'ticker'])
        data_frame.index = pd.to_datetime(data_frame.index)
        logger.info("Read: " + ticker + ' ' + str(data_frame.index[0])
                    + ' - ' + str(data_frame.index[-1]))

    return data_frame
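# --- Illustration (added): how the (field, ticker) keyed dict becomes a frame ---
# A standalone sketch of the data structure built above: outer keys are
# (field, ticker) tuples, inner keys are dates, so pd.DataFrame(data) yields
# one column per (field, ticker) pair indexed by date. Values are made up.
from collections import defaultdict as _defaultdict_example
import pandas as _pd_msg_example

_data = _defaultdict_example(dict)
_data[("PX_LAST", "EURUSD Curncy")]["2023-01-03"] = 1.05
_data[("PX_LAST", "EURUSD Curncy")]["2023-01-04"] = 1.06

_df_msg = _pd_msg_example.DataFrame(_data)
_df_msg.index = _pd_msg_example.to_datetime(_df_msg.index)
print(_df_msg)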
def fill_options(self, md_request):
    constants = DataConstants()

    options = OptionsBBG()

    options.security = None  # md_request.tickers
    options.startDateTime = md_request.start_date
    options.endDateTime = md_request.finish_date
    options.fields = md_request.fields
    options.overrides = md_request.overrides

    options_list = []
    override_dict = {}

    if md_request.old_tickers is not None:
        ticker_list = []

        # curr_options = OptionsBBG(options_bbg=options)

        ## Special case for GDP where the advance, final and preliminary
        ## releases have the same ticker but different overrides
        ## (can define more in DataConstants)
        bbg_keyword_dict_override = constants.bbg_keyword_dict_override

        for tick, old_tick in zip(md_request.tickers,
                                  md_request.old_tickers):
            if old_tick is not None:
                t = old_tick.lower()

                # eg. RELEASE_STAGE_OVERRIDE
                for bbg_override in bbg_keyword_dict_override.keys():
                    keyword_dict = bbg_keyword_dict_override[bbg_override]

                    for bbg_keyword in keyword_dict.keys():
                        # eg. ['gdp', 'advance']
                        keyword = keyword_dict[bbg_keyword]

                        # If this matches a case, we have an override
                        if all(k.lower() in t for k in keyword):
                            # In case we have multiple overrides for
                            # this ticker
                            if tick in override_dict:
                                override_dict[tick][bbg_override] = bbg_keyword
                            else:
                                override_dict[tick] = {
                                    bbg_override: bbg_keyword}

            ## Add other special cases

            if tick not in override_dict:
                override_dict[tick] = {}

        # if ticker_list != []:
        #     curr_options.security = ticker_list

        last_override = {}

        def add_new_options(tick_):
            curr_options = OptionsBBG(options_bbg=options)
            curr_options.security = [tick_]

            if override != {}:
                curr_options.overrides = override

            options_list.append(curr_options)

        # Combine the securities into a list of options (each with common
        # overrides)
        for tick, override in override_dict.items():
            if override == last_override:
                if len(options_list) > 0:
                    options_list[-1].security.append(tick)
                else:
                    add_new_options(tick)
            else:
                add_new_options(tick)

            last_override = override

        # print('stop')
        # options_list.append(curr_options)
    else:
        options.security = md_request.tickers

        return options

    if len(options_list) == 1:
        return options_list[0]

    return options_list
def load_ticker(self, md_request):
    """Retrieves market data from external data source (in this case
    Bloomberg)

    Parameters
    ----------
    md_request : MarketDataRequest
        contains all the various parameters detailing time series start
        and finish, tickers etc

    Returns
    -------
    DataFrame
    """
    constants = DataConstants()

    md_request = MarketDataRequest(md_request=md_request)
    md_request_vendor = self.construct_vendor_md_request(md_request)

    data_frame = None

    logger = LoggerManager().getLogger(__name__)
    logger.info("Request Bloomberg data")

    # Do we need daily or intraday data?
    if md_request.freq in ['daily', 'weekly', 'monthly', 'quarterly',
                           'yearly']:

        # Work out the fields which need to be downloaded via Bloomberg
        # ref request (BDP) and those that can be downloaded via
        # Historical request (BDH)
        ref_fields = []
        ref_vendor_fields = []

        # Get user defined list of BBG fields/vendor fields which need to
        # be downloaded by BDP
        bbg_ref_fields = list(constants.bbg_ref_fields.keys())
        bbg_ref_vendor_fields = list(constants.bbg_ref_fields.values())

        for i in range(0, len(md_request.fields)):
            if md_request.fields[i] in bbg_ref_fields \
                    or md_request_vendor.fields[i] in bbg_ref_vendor_fields:
                ref_fields.append(md_request.fields[i])
                ref_vendor_fields.append(md_request_vendor.fields[i])

        non_ref_fields = []
        non_ref_vendor_fields = []

        for i in range(0, len(md_request.fields)):
            if md_request.fields[i] not in bbg_ref_fields \
                    and md_request_vendor.fields[i] not in bbg_ref_vendor_fields:
                non_ref_fields.append(md_request.fields[i])
                non_ref_vendor_fields.append(md_request_vendor.fields[i])

        # For certain cases, need to use ReferenceDataRequest
        # eg. for event times/dates, last tradeable date fields (when
        # specified)
        if len(ref_fields) > 0:

            # Careful: make sure you copy the market data request object
            # (when threading, altering that can cause concurrency issues!)
            old_fields = copy.deepcopy(md_request.fields)
            old_vendor_fields = copy.deepcopy(md_request_vendor.fields)

            # md_request = MarketDataRequest(md_request=md_request_copy)

            md_request.fields = ref_fields
            md_request.vendor_fields = ref_vendor_fields
            md_request_vendor = self.construct_vendor_md_request(md_request)

            # Just select those reference fields to download via reference
            datetime_data_frame = self.get_reference_data(
                md_request_vendor, md_request)

            # Download all the other event or non-ref fields
            # (uses HistoricalDataRequest to Bloomberg)
            # and concatenate with datetime fields
            if len(non_ref_fields) > 0:
                md_request.fields = non_ref_fields
                md_request.vendor_fields = non_ref_vendor_fields
                md_request_vendor = self.construct_vendor_md_request(
                    md_request)

                events_data_frame = self.get_daily_data(md_request,
                                                        md_request_vendor)

                col = events_data_frame.index.name
                events_data_frame = events_data_frame.reset_index(
                    drop=False)

                data_frame = pd.concat(
                    [events_data_frame, datetime_data_frame], axis=1)
                temp = data_frame[col]
                del data_frame[col]
                data_frame.index = temp
            else:
                data_frame = datetime_data_frame

            md_request.fields = copy.deepcopy(old_fields)
            md_request_vendor.fields = copy.deepcopy(old_vendor_fields)

        # For all other daily/monthly/quarterly data, we can use
        # HistoricalDataRequest to Bloomberg
        else:
            data_frame = self.get_daily_data(md_request, md_request_vendor)

            # if data_frame is not None:
            #     # Convert fields with release-dt to dates (special case!)
            #     # and assume everything else numerical
            #     for c in data_frame.columns:
            #         try:
            #             if 'release-dt' in c:
            #                 data_frame[c] = (data_frame[c]).astype('int')\
            #                     .astype(str).apply(
            #                         lambda x: pd.to_datetime(x, format='%Y%m%d'))
            #             else:
            #                 data_frame[c] = pd.to_numeric(data_frame[c])
            #         except:
            #             pass

    # Assume one ticker only for intraday data and use IntradayDataRequest
    # to Bloomberg
    if md_request.freq in ['tick', 'intraday', 'second', 'minute',
                           'hourly']:
        md_request_vendor.tickers = md_request_vendor.tickers[0]

        if md_request.freq in ['tick', 'second']:
            data_frame = self.download_tick(md_request_vendor)
        else:
            data_frame = self.download_intraday(md_request_vendor)

        if data_frame is not None:
            if data_frame.empty:
                try:
                    logger.info("No tickers returned for: "
                                + md_request_vendor.tickers)
                except:
                    pass

                return None

            cols = data_frame.columns.values

            import pytz

            try:
                data_frame = data_frame.tz_localize(pytz.utc)
            except:
                data_frame = data_frame.tz_convert(pytz.utc)

            cols = md_request.tickers[0] + "." + cols
            data_frame.columns = cols

    logger.info("Completed request from Bloomberg.")

    return data_frame
def read_time_series_cache_from_disk(self, fname, engine='hdf5',
                                     start_date=None, finish_date=None,
                                     db_server=DataConstants().db_server,
                                     db_port=DataConstants().db_port,
                                     username=None, password=None):
    """Reads time series cache from disk in either HDF5 or bcolz

    Parameters
    ----------
    fname : str (or list)
        file to be read from
    engine : str (optional)
        'hdf5' - reads HDF5 files (default)
        'arctic' - reads from Arctic/MongoDB database
        'bcolz' - reads from bcolz file (not fully implemented)
    start_date : str/datetime (optional)
        Start date
    finish_date : str/datetime (optional)
        Finish date
    db_server : str
        IP address of MongoDB (default '127.0.0.1')

    Returns
    -------
    DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    data_frame_list = []

    if not isinstance(fname, list):
        if '*' in fname:
            fname = glob.glob(fname)
        else:
            fname = [fname]

    for fname_single in fname:
        logger.debug("Reading " + fname_single + "..")

        if engine == 'bcolz':
            try:
                name = self.get_bcolz_filename(fname_single)
                zlens = bcolz.open(rootdir=name)
                data_frame = zlens.todataframe()

                data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                data_frame.index.name = 'Date'
                del data_frame['DTS_']

                # Convert invalid characters (which Bcolz can't deal with)
                # to more readable characters for pandas
                data_frame.columns = self.find_replace_chars(
                    data_frame.columns, _replace_chars, _invalid_chars)
                data_frame.columns = [x[2:] for x in data_frame.columns]
            except:
                data_frame = None

        elif engine == 'redis':
            fname_single = os.path.basename(fname_single).replace('.', '_')

            msg = None

            try:
                # For pyarrow
                context = pa.default_serialization_context()

                r = redis.StrictRedis(host=db_server, port=db_port, db=0)

                # Is there a compressed key stored?
                k = r.keys('comp_*_' + fname_single)

                # If so, then it means that we have stored it as a
                # compressed object; if we have more than 1 element, take
                # the last (which will be the latest to be added)
                if len(k) >= 1:
                    k = k[-1].decode('utf-8')

                    comp = r.get(k)

                    siz = int(k.split('_')[1])
                    dec = pa.decompress(comp, codec='lz4',
                                        decompressed_size=siz)

                    msg = context.deserialize(dec)
                else:
                    msg = r.get(fname_single)

                    # print(fname_single)
                    if msg is not None:
                        msg = context.deserialize(msg)
                    # logger.warning("Key " + fname_single
                    #                + " not in Redis cache?")
            except Exception as e:
                logger.info("Cache not existent for " + fname_single
                            + " in Redis: " + str(e))

            if msg is None:
                data_frame = None
            else:
                logger.info('Load Redis cache: ' + fname_single)

                data_frame = msg  # pandas.read_msgpack(msg)

        elif engine == 'arctic':
            socketTimeoutMS = 2 * 1000

            import pymongo
            from arctic import Arctic

            fname_single = os.path.basename(fname_single).replace('.', '_')

            logger.info('Load Arctic/MongoDB library: ' + fname_single)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@"
                         + str(db_server) + ":" + str(db_port),
                    connect=False)  # , username=username, password=password)
            else:
                c = pymongo.MongoClient(
                    host="mongodb://" + str(db_server) + ":" + str(db_port),
                    connect=False)

            store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS)

            # Access the library
            try:
                library = store[fname_single]

                if start_date is None and finish_date is None:
                    item = library.read(fname_single)
                else:
                    from arctic.date import DateRange
                    item = library.read(
                        fname_single,
                        date_range=DateRange(
                            start_date.replace(tzinfo=None),
                            finish_date.replace(tzinfo=None)))

                c.close()

                logger.info('Read ' + fname_single)

                data_frame = item.data
            except Exception as e:
                logger.warning('Library may not exist or another error: '
                               + fname_single + ' & message is ' + str(e))
                data_frame = None

        elif os.path.isfile(self.get_h5_filename(fname_single)):
            store = pandas.HDFStore(self.get_h5_filename(fname_single))
            data_frame = store.select("data")

            if 'intraday' in fname_single:
                data_frame = data_frame.astype('float32')

            store.close()

        elif os.path.isfile(fname_single):
            data_frame = pandas.read_parquet(fname_single)

        data_frame_list.append(data_frame)

    if len(data_frame_list) == 1:
        return data_frame_list[0]

    return data_frame_list
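# --- Usage sketch (added): reading a cached frame back from disk ---
# A hedged example, assuming this method lives on IOEngine (the import path is
# the one used in the Quandl example further below); the file path is
# hypothetical. With the default engine, a Parquet file on disk is picked up
# by the final os.path.isfile() branch.
from findatapy.market.ioengine import IOEngine

df_cached = IOEngine().read_time_series_cache_from_disk(
    "temp/fx_daily_cache.parquet")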
def process_message(self, msg):
    constants = DataConstants()

    # Process received events

    # SLOW loop (careful, not all the fields will be returned every time
    # hence need to include the field name in the tuple)
    # perhaps try to run in parallel?
    logger = LoggerManager().getLogger(__name__)

    implementation = 'simple'

    if implementation == 'simple':
        ticker = msg.getElement('securityData').getElement(
            'security').getValue()
        fieldData = msg.getElement('securityData').getElement('fieldData')

        data = defaultdict(dict)

        # FASTER avoid calling getValue/getElement methods in blpapi,
        # very slow, better to cache variables
        for i in range(fieldData.numValues()):
            mini_field_data = fieldData.getValue(i)
            date = mini_field_data.getElement(0).getValue()

            for j in range(1, mini_field_data.numElements()):
                field_value = mini_field_data.getElement(j)

                data[(str(field_value.name()), ticker)][date] = \
                    field_value.getValue()

        # ORIGINAL repeated calling of getValue/getElement is much slower
        # for i in range(fieldData.numValues()):
        #     for j in range(1, fieldData.getValue(i).numElements()):
        #         data[(str(fieldData.getValue(i).getElement(j).name()), ticker)][
        #             fieldData.getValue(i).getElement(0).getValue()] \
        #             = fieldData.getValue(i).getElement(j).getValue()

    elif implementation == 'py4j':
        pass

        # TODO Py4J
        # from findatapy.market.bbgloop import bbgloop
        # from py4j.java_gateway import JavaGateway
        # gateway = JavaGateway()
        # data = gateway.entry_point.parseFieldDataArray(msg)

    elif implementation == 'cython':
        ticker = msg.getElement('securityData').getElement(
            'security').getValue()
        fieldData = msg.getElement('securityData').getElement('fieldData')

        from findatapy.market.bbgloop import bbgloop

        data = bbgloop(fieldData, ticker)

    elif implementation == 'numba':
        ticker = msg.getElement('securityData').getElement(
            'security').getValue()
        fieldData = msg.getElement('securityData').getElement('fieldData')

        from findatapy.market.bbgloop_numba import bbgloop_numba

        data = bbgloop_numba(fieldData, ticker)

    # TODO cython

    data_frame = pd.DataFrame(data)

    # An obsolete ticker could return no values
    if not data_frame.empty:
        # data_frame.columns = pd.MultiIndex.from_tuples(data,
        #     names=['field', 'ticker'])
        data_frame.index = pd.to_datetime(data_frame.index)
        logger.info("Read: " + ticker + ' ' + str(data_frame.index[0])
                    + ' - ' + str(data_frame.index[-1]))
    else:
        return None

    return data_frame
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
#

from findatapy.util.dataconstants import DataConstants
from findatapy.util.loggermanager import LoggerManager

from datetime import timedelta
import datetime

import copy

data_constants = DataConstants()


class MarketDataRequest(object):
    """Provides parameters for requesting market data.

    Includes parameters to define the ticker we'd like to fetch, the start
    and finish dates for our request, as well as the various fields we
    would like and also the frequency of the data.
    """

    # properties
    #
    # data_source eg. bbg, yahoo, quandl
    # start_date
    # finish_date
# if __name__ == '__main__':

###### Below line CRUCIAL when running on Windows, otherwise multiprocessing
# doesn't work! (not necessary on Linux)
from findatapy.util import SwimPool

SwimPool()

from findatapy.market import Market, MarketDataRequest, MarketDataGenerator

market = Market(market_data_generator=MarketDataGenerator())

from findatapy.util.dataconstants import DataConstants

eikon_api_key = DataConstants().eikon_api_key

df = None

if eikon_api_key is None:
    eikon_api_key = 'TYPE_YOUR_API'

import datetime
from datetime import timedelta

# You need to have Eikon installed and to have a valid licence for this to
# work

# For intraday pricing, you can usually access a history back a few months
# from the current date
# (if you need older history there are other Refinitiv products like
def populate_time_series_dictionaries(data_constants=None):

    if data_constants is None:
        data_constants = DataConstants()

    logger = LoggerManager().getLogger(__name__)

    # There are several CSV files which contain data on the tickers
    #
    # time_series_tickers_list - contains every ticker (findatapy tickers => vendor tickers)
    #   category, data_source, freq, tickers, cut, fields, vendor_tickers (from your data provider)
    #   eg. fx / bloomberg / daily / EURUSD / TOK / close,open,high,low / EURUSD CMPT Curncy
    #
    # time_series_fields_list - translates findatapy field names to vendor field names
    #   findatapy fields => vendor fields
    #   data_source, fields, vendor_fields
    #
    # time_series_categories_fields - for each category, its generic properties
    #   category, freq, data_source, fields, startdate
    #   eg. fx / daily / bloomberg / close,high,low,open / 01-Jan-70
    #   eg. bloomberg / close / PX_LAST

    ## Populate tickers list (allow for multiple files)
    if isinstance(data_constants.time_series_tickers_list, str):
        time_series_tickers_list_file = \
            data_constants.time_series_tickers_list.split(";")
    else:
        time_series_tickers_list_file = \
            data_constants.time_series_tickers_list

    df_tickers = []

    for tickers_list_file in time_series_tickers_list_file:

        if os.path.isfile(tickers_list_file):
            # reader = csv.DictReader(open(tickers_list_file))
            df = pd.read_csv(tickers_list_file)
            df = df.dropna(how="all")

            df_tickers.append(df)

            for index, line in df.iterrows():
                category = line["category"]
                data_source = line["data_source"]
                freq_list = line["freq"].split(",")

                if isinstance(freq_list, str):
                    freq_list = [freq_list]

                for freq in freq_list:
                    tickers = line["tickers"]
                    cut = line["cut"]
                    vendor_tickers = line["vendor_tickers"]

                    expiry = None

                    try:
                        expiry = line["expiry"]
                    except:
                        pass

                    if category != "":
                        # Conversion from library tickers to vendor tickers
                        ConfigManager._dict_time_series_tickers_list_library_to_vendor[
                            category + "." + data_source + "." + freq + "."
                            + cut + "." + tickers] = vendor_tickers

                        try:
                            if expiry != "":
                                expiry = parse(expiry)
                            else:
                                expiry = None
                        except:
                            pass

                        # Library of tickers by category
                        key = category + "." + data_source + "." + freq \
                              + "." + cut

                        # Conversion from library tickers to library
                        # expiry date
                        ConfigManager._dict_time_series_ticker_expiry_date_library_to_library[
                            data_source + "." + tickers] = expiry

                        # Conversion from vendor tickers to library tickers
                        try:
                            ConfigManager._dict_time_series_tickers_list_vendor_to_library[
                                key + "." + vendor_tickers] = tickers
                        except:
                            logger.warning(
                                "Ticker not specified correctly (is some "
                                "of this missing?) " + str(key) + "."
                                + str(vendor_tickers))

                        if key in ConfigManager._dict_time_series_category_tickers_library_to_library:
                            ConfigManager._dict_time_series_category_tickers_library_to_library[
                                key].append(tickers)
                        else:
                            ConfigManager._dict_time_series_category_tickers_library_to_library[
                                key] = [tickers]

    try:
        df_tickers = pd.concat(df_tickers).sort_values(
            by=["category", "data_source", "freq", "cut"])
    except:
        pass

    try:
        df_tickers = df_tickers.reset_index()
    except:
        pass

    try:
        df_tickers = df_tickers.drop("level_0", axis=1).reset_index()
    except:
        pass

    ConfigManager._data_frame_time_series_tickers = df_tickers

    ## Populate fields conversions
    # reader = csv.DictReader(open(data_constants.time_series_fields_list))
    df = pd.read_csv(data_constants.time_series_fields_list)
    df = df.dropna(how="all")

    for index, line in df.iterrows():
        data_source = line["data_source"]
        fields = line["fields"]
        vendor_fields = line["vendor_fields"]

        # Conversion from vendor fields to library fields
        ConfigManager._dict_time_series_fields_list_vendor_to_library[
            data_source + "." + vendor_fields] = fields

        # Conversion from library fields to vendor fields
        ConfigManager._dict_time_series_fields_list_library_to_vendor[
            data_source + "." + fields] = vendor_fields

    ## Populate categories fields list
    # reader = csv.DictReader(
    #     open(data_constants.time_series_categories_fields))
    df = pd.read_csv(data_constants.time_series_categories_fields)
    df = df.dropna(how="all")

    for index, line in df.iterrows():
        category = line["category"]
        data_source = line["data_source"]
        freq = line["freq"]
        cut = line["cut"]
        fields = line["fields"].split(",")  # Can have multiple fields
        startdate = line["startdate"]
        revision_periods = line["revision_periods"]

        if category != "":
            # Conversion from library category to library fields list
            ConfigManager._dict_time_series_category_fields_library_to_library[
                category + "." + data_source + "." + freq + "." + cut] = fields

            # Conversion from library category to library startdate
            ConfigManager._dict_time_series_category_startdate_library_to_library[
                category + "." + data_source + "." + freq + "." + cut] = \
                parse(startdate).date()

            # Conversion from library category to library revision periods
            ConfigManager._dict_time_series_category_revision_periods_library_to_library[
                category + "." + data_source + "." + freq + "." + cut] = \
                revision_periods
def read_time_series_cache_from_disk(self, fname, engine='hdf5',
                                     start_date=None, finish_date=None,
                                     db_server=DataConstants().db_server,
                                     db_port=DataConstants().db_port,
                                     username=None, password=None):
    """Reads time series cache from disk in either HDF5 or bcolz

    Parameters
    ----------
    fname : str (or list)
        file to be read from
    engine : str (optional)
        'hdf5' - reads HDF5 files (default)
        'arctic' - reads from Arctic/MongoDB database
        'bcolz' - reads from bcolz file (not fully implemented)
    start_date : str/datetime (optional)
        Start date
    finish_date : str/datetime (optional)
        Finish date
    db_server : str
        IP address of MongoDB (default '127.0.0.1')

    Returns
    -------
    DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    data_frame_list = []

    if not isinstance(fname, list):
        if '*' in fname:
            fname = glob.glob(fname)
        else:
            fname = [fname]

    for fname_single in fname:
        logger.debug("Reading " + fname_single + "..")

        if engine == 'bcolz':
            try:
                name = self.get_bcolz_filename(fname_single)
                zlens = bcolz.open(rootdir=name)
                data_frame = zlens.todataframe()

                data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                data_frame.index.name = 'Date'
                del data_frame['DTS_']

                # Convert invalid characters (which Bcolz can't deal with)
                # to more readable characters for pandas
                data_frame.columns = self.find_replace_chars(
                    data_frame.columns, _replace_chars, _invalid_chars)
                data_frame.columns = [x[2:] for x in data_frame.columns]
            except:
                data_frame = None

        elif engine == 'redis':
            import redis

            fname_single = os.path.basename(fname_single).replace('.', '_')

            msg = None

            try:
                r = redis.StrictRedis(host=db_server, port=db_port, db=0)
                msg = r.get(fname_single)
            except:
                logger.info("Cache not existent for " + fname_single
                            + " in Redis")

            if msg is None:
                data_frame = None
            else:
                logger.info('Load Redis cache: ' + fname_single)

                data_frame = pandas.read_msgpack(msg)

        elif engine == 'arctic':
            socketTimeoutMS = 2 * 1000

            import pymongo
            from arctic import Arctic

            fname_single = os.path.basename(fname_single).replace('.', '_')

            logger.info('Load Arctic/MongoDB library: ' + fname_single)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@"
                         + str(db_server) + ":" + str(db_port),
                    connect=False)  # , username=username, password=password)
            else:
                c = pymongo.MongoClient(
                    host="mongodb://" + str(db_server) + ":" + str(db_port),
                    connect=False)

            store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS)

            # Access the library
            try:
                library = store[fname_single]

                if start_date is None and finish_date is None:
                    item = library.read(fname_single)
                else:
                    from arctic.date import DateRange
                    item = library.read(fname_single,
                                        date_range=DateRange(start_date,
                                                             finish_date))

                c.close()

                logger.info('Read ' + fname_single)

                data_frame = item.data
            except Exception as e:
                logger.warning('Library does not exist: ' + fname_single
                               + ' & message is ' + str(e))
                data_frame = None

        elif os.path.isfile(self.get_h5_filename(fname_single)):
            store = pandas.HDFStore(self.get_h5_filename(fname_single))
            data_frame = store.select("data")

            if 'intraday' in fname_single:
                data_frame = data_frame.astype('float32')

            store.close()

        elif os.path.isfile(fname_single):
            data_frame = pandas.read_parquet(fname_single)

        data_frame_list.append(data_frame)

    if len(data_frame_list) == 1:
        return data_frame_list[0]

    return data_frame_list
@staticmethod
def convert_vendor_to_library_field(source, sourcefield):
    return ConfigManager._dict_time_series_fields_list_vendor_to_library[
        source + '.' + sourcefield]

@staticmethod
def convert_library_to_vendor_field(source, field):
    return ConfigManager._dict_time_series_fields_list_library_to_vendor[
        source + '.' + field]


## Test function
if __name__ == '__main__':
    logger = LoggerManager().getLogger(__name__)

    data_constants = DataConstants(
        override_fields={'use_cache_compression': False})

    print(data_constants.use_cache_compression)

    cm = ConfigManager().get_instance()

    categories = cm.get_categories_from_fields()

    logger.info("Categories from fields list")
    print(categories)

    categories = cm.get_categories_from_tickers()

    logger.info("Categories from tickers list")
    print(categories)
    # Need to specify cache_algo_return
    md_request.cache_algo = "cache_algo_return"

    df = market.fetch_market(md_request)

    print(df)

if run_example == 3:
    # In this case we are saving predefined daily tickers to disk, and then
    # reading them back
    from findatapy.util.dataconstants import DataConstants
    from findatapy.market.ioengine import IOEngine
    import os

    # Change with your own Quandl API key!
    quandl_api_key = DataConstants().quandl_api_key

    md_request = MarketDataRequest(
        category="fx",
        data_source="quandl",
        freq="daily",
        quandl_api_key=quandl_api_key
    )

    market = Market(market_data_generator=MarketDataGenerator())

    df = market.fetch_market(md_request=md_request)

    print(df)
def write_time_series_cache_to_disk(self, fname, data_frame,
                                    engine='hdf5_fixed', append_data=False,
                                    db_server=DataConstants().db_server,
                                    db_port=DataConstants().db_port,
                                    username=None, password=None,
                                    filter_out_matching=None, timeout=10):
    """Writes a Pandas data frame to disk as HDF5, bcolz or Parquet format,
    or to an Arctic/Redis database

    Parameters
    ----------
    fname : str
        path of file
    data_frame : DataFrame
        data frame to be written to disk
    engine : str
        'hdf5_fixed' - use HDF5 fixed format, very quick, but cannot
            append to this
        'hdf5_table' - use HDF5 table format, slower but can append to
        'parquet' - use Parquet
        'arctic' - use Arctic/MongoDB database
        'redis' - use Redis
    append_data : bool
        False - write a fresh copy of data on disk each time
        True - append data to disk
    db_server : str
        Database server for arctic (default: '127.0.0.1')
    timeout : int
        Number of seconds for the timeout
    """

    # Default HDF5 format
    hdf5_format = 'fixed'

    if 'hdf5' in engine:
        hdf5_format = engine.split('_')[1]
        engine = 'hdf5'

    if engine == 'bcolz':
        # Convert invalid characters (which Bcolz can't deal with) to
        # substitutes
        data_frame.columns = self.find_replace_chars(
            data_frame.columns, _invalid_chars, _replace_chars)
        data_frame.columns = ['A_' + x for x in data_frame.columns]

        data_frame['DTS_'] = pandas.to_datetime(data_frame.index, unit='ns')

        bcolzpath = self.get_bcolz_filename(fname)
        shutil.rmtree(bcolzpath, ignore_errors=True)
        zlens = bcolz.ctable.fromdataframe(data_frame, rootdir=bcolzpath)

    elif engine == 'redis':
        import redis

        fname = os.path.basename(fname).replace('.', '_')

        try:
            r = redis.StrictRedis(host=db_server, port=db_port, db=0,
                                  socket_timeout=timeout,
                                  socket_connect_timeout=timeout)

            if isinstance(data_frame, pandas.DataFrame):
                r.set(fname, data_frame.to_msgpack(compress='blosc'))

            self.logger.info("Pushed " + fname + " to Redis")
        except Exception as e:
            self.logger.warning("Couldn't push " + fname + " to Redis: "
                                + str(e))

    elif engine == 'arctic':
        from arctic import Arctic
        import pymongo

        socketTimeoutMS = 30 * 1000
        fname = os.path.basename(fname).replace('.', '_')

        self.logger.info('Load Arctic/MongoDB library: ' + fname)

        if username is not None and password is not None:
            c = pymongo.MongoClient(
                host="mongodb://" + username + ":" + password + "@"
                     + str(db_server) + ":" + str(db_port),
                connect=False)  # , username=username, password=password)
        else:
            c = pymongo.MongoClient(
                host="mongodb://" + str(db_server) + ":" + str(db_port),
                connect=False)

        store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                       serverSelectionTimeoutMS=socketTimeoutMS,
                       connectTimeoutMS=socketTimeoutMS)

        database = None

        try:
            database = store[fname]
        except:
            pass

        if database is None:
            store.initialize_library(fname, audit=False)
            self.logger.info("Created MongoDB library: " + fname)
        else:
            self.logger.info("Got MongoDB library: " + fname)

        # Access the library
        library = store[fname]

        if 'intraday' in fname:
            data_frame = data_frame.astype('float32')

        if filter_out_matching is not None:
            cols = data_frame.columns

            new_cols = []

            for col in cols:
                if filter_out_matching not in col:
                    new_cols.append(col)

            data_frame = data_frame[new_cols]

        # Can duplicate values if we have existing dates
        if append_data:
            library.append(fname, data_frame)
        else:
            library.write(fname, data_frame)

        c.close()

        self.logger.info("Written MongoDB library: " + fname)

    elif engine == 'hdf5':
        h5_filename = self.get_h5_filename(fname)

        # Appending data only works for HDF5 stored as tables (but this is
        # much slower than fixed format); removes duplicated entries at
        # the end
        if append_data:
            store = pandas.HDFStore(h5_filename, format=hdf5_format,
                                    complib="blosc", complevel=9)

            if 'intraday' in fname:
                data_frame = data_frame.astype('float32')

            # Get the last row which matches and remove everything after
            # that (because the append function doesn't check for
            # duplicated rows)
            nrows = len(store['data'].index)
            last_point = data_frame.index[-1]

            i = nrows - 1

            while i > 0:
                read_index = store.select('data', start=i,
                                          stop=nrows).index[0]

                if read_index <= last_point:
                    break

                i = i - 1

            # Remove rows at the end, which are duplicates of the incoming
            # time series
            store.remove(key='data', start=i, stop=nrows)
            store.put(key='data', value=data_frame, format=hdf5_format,
                      append=True)
            store.close()
        else:
            h5_filename_temp = self.get_h5_filename(fname + ".temp")

            # Delete the old copy
            try:
                os.remove(h5_filename_temp)
            except:
                pass

            store = pandas.HDFStore(h5_filename_temp, format=hdf5_format,
                                    complib="blosc", complevel=9)

            if 'intraday' in fname:
                data_frame = data_frame.astype('float32')

            store.put(key='data', value=data_frame, format=hdf5_format)
            store.close()

            # Delete the old copy
            try:
                os.remove(h5_filename)
            except:
                pass

            # Once written to disk, rename
            os.rename(h5_filename_temp, h5_filename)

        self.logger.info("Written HDF5: " + fname)

    elif engine == 'parquet':
        if fname[-5:] != '.gzip':
            fname = fname + '.gzip'

        data_frame.to_parquet(fname, compression='gzip')

        self.logger.info("Written Parquet: " + fname)
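# --- Usage sketch (added): a write/read round trip ---
# A hedged example using the IOEngine import path from the Quandl example
# above; the file name and DataFrame are made up. With engine='parquet' the
# writer appends '.gzip' to the name, so the read uses the renamed file.
import pandas as _pd_io_example
from findatapy.market.ioengine import IOEngine

_df_out = _pd_io_example.DataFrame(
    {"EURUSD.close": [1.05, 1.06]},
    index=_pd_io_example.to_datetime(["2023-01-03", "2023-01-04"]))

_io = IOEngine()
_io.write_time_series_cache_to_disk("temp_eurusd", _df_out, engine="parquet")
_df_back = _io.read_time_series_cache_from_disk("temp_eurusd.gzip")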
def populate_time_series_dictionaries(data_constants=None):

    if data_constants is None:
        data_constants = DataConstants()

    # There are several CSV files which contain data on the tickers
    #
    # time_series_tickers_list - contains every ticker (findatapy tickers => vendor tickers)
    #   category, source, freq, ticker, cut, fields, sourceticker (from your data provider)
    #   eg. fx / bloomberg / daily / EURUSD / TOK / close,open,high,low / EURUSD CMPT Curncy
    #
    # time_series_fields_list - translates findatapy field names to vendor field names
    #   findatapy fields => vendor fields
    #   source, field, sourcefield
    #
    # time_series_categories_fields - for each category, its generic properties
    #   category, freq, source, fields, startdate
    #   eg. fx / daily / bloomberg / close,high,low,open / 01-Jan-70
    #   eg. bloomberg / close / PX_LAST

    ## Populate tickers list (allow for multiple files)
    time_series_tickers_list_file = \
        data_constants.time_series_tickers_list.split(';')

    import os

    for tickers_list_file in time_series_tickers_list_file:

        if os.path.isfile(tickers_list_file):
            reader = csv.DictReader(open(tickers_list_file))

            for line in reader:
                category = line["category"]
                source = line["source"]
                freq_list = line["freq"].split(',')

                if isinstance(freq_list, str):
                    freq_list = [freq_list]

                for freq in freq_list:
                    ticker = line["ticker"]
                    cut = line["cut"]
                    sourceticker = line["sourceticker"]

                    expiry = None

                    try:
                        expiry = line['expiry']
                    except:
                        pass

                    if category != "":
                        # print("stop" + category + '.' +
                        #       source + '.' +
                        #       freq + '.' +
                        #       cut + '.' +
                        #       ticker)

                        # Conversion from library ticker to vendor
                        # sourceticker
                        ConfigManager._dict_time_series_tickers_list_library_to_vendor[
                            category + '.' + source + '.' + freq + '.'
                            + cut + '.' + ticker] = sourceticker

                        try:
                            if expiry != '':
                                expiry = parse(expiry)
                            else:
                                expiry = None
                        except:
                            pass

                        # Conversion from library ticker to library
                        # expiry date
                        ConfigManager._dict_time_series_ticker_expiry_date_library_to_library[
                            source + '.' + ticker] = expiry

                        # Conversion from vendor sourceticker to library
                        # ticker
                        ConfigManager._dict_time_series_tickers_list_vendor_to_library[
                            source + '.' + sourceticker] = ticker

                        # Library of tickers by category
                        key = category + '.' + source + '.' + freq + '.' + cut

                        if key in ConfigManager._dict_time_series_category_tickers_library_to_library:
                            ConfigManager._dict_time_series_category_tickers_library_to_library[
                                key].append(ticker)
                        else:
                            ConfigManager._dict_time_series_category_tickers_library_to_library[
                                key] = [ticker]

    ## Populate fields conversions
    reader = csv.DictReader(open(data_constants.time_series_fields_list))

    for line in reader:
        source = line["source"]
        field = line["field"]
        sourcefield = line["sourcefield"]

        # Conversion from vendor sourcefield to library field
        ConfigManager._dict_time_series_fields_list_vendor_to_library[
            source + '.' + sourcefield] = field

        # Conversion from library field to vendor sourcefield
        ConfigManager._dict_time_series_fields_list_library_to_vendor[
            source + '.' + field] = sourcefield

    ## Populate categories field list
    reader = csv.DictReader(
        open(data_constants.time_series_categories_fields))

    for line in reader:
        category = line["category"]
        source = line["source"]
        freq = line["freq"]
        cut = line["cut"]
        fields = line["fields"].split(',')  # Can have multiple fields
        startdate = line["startdate"]

        if category != "":
            # Conversion from library category to library fields list
            ConfigManager._dict_time_series_category_fields_library_to_library[
                category + '.' + source + '.' + freq + '.' + cut] = fields

            # Conversion from library category to library startdate
            ConfigManager._dict_time_series_category_startdate_library_to_library[
                category + '.' + source + '.' + freq + '.' + cut] = \
                parse(startdate).date()
def create_cache_file_name(self, filename):
    return DataConstants().folder_time_series_data + "/" + filename
def populate_time_series_dictionaries():

    # There are several CSV files which contain data on the tickers
    #
    # time_series_tickers_list - contains every ticker (findatapy tickers => vendor tickers)
    #   category, source, freq, ticker, cut, fields, sourceticker (from your data provider)
    #   eg. fx / bloomberg / daily / EURUSD / TOK / close,open,high,low / EURUSD CMPT Curncy
    #
    # time_series_fields_list - translates findatapy field names to vendor field names
    #   findatapy fields => vendor fields
    #   source, field, sourcefield
    #
    # time_series_categories_fields - for each category, its generic properties
    #   category, freq, source, fields, startdate
    #   eg. fx / daily / bloomberg / close,high,low,open / 01-Jan-70
    #   eg. bloomberg / close / PX_LAST

    ## Populate tickers list (allow for multiple files)
    time_series_tickers_list_file = DataConstants(
        ).time_series_tickers_list.split(';')

    for tickers_list_file in time_series_tickers_list_file:
        reader = csv.DictReader(open(tickers_list_file))

        for line in reader:
            category = line["category"]
            source = line["source"]
            freq = line["freq"]
            ticker = line["ticker"]
            cut = line["cut"]
            sourceticker = line["sourceticker"]

            if category == "":
                # print("stop")
                pass

            # Conversion from library ticker to vendor sourceticker
            ConfigManager._dict_time_series_tickers_list_library_to_vendor[
                category + '.' + source + '.' + freq + '.' + cut + '.'
                + ticker] = sourceticker

            # Conversion from vendor sourceticker to library ticker
            ConfigManager._dict_time_series_tickers_list_vendor_to_library[
                source + '.' + sourceticker] = ticker

            # Library of tickers by category
            key = category + '.' + source + '.' + freq + '.' + cut

            if key in ConfigManager._dict_time_series_category_tickers_library_to_library:
                ConfigManager._dict_time_series_category_tickers_library_to_library[
                    key].append(ticker)
            else:
                ConfigManager._dict_time_series_category_tickers_library_to_library[
                    key] = [ticker]

    ## Populate fields conversions
    reader = csv.DictReader(open(DataConstants().time_series_fields_list))

    for line in reader:
        source = line["source"]
        field = line["field"]
        sourcefield = line["sourcefield"]

        # Conversion from vendor sourcefield to library field
        ConfigManager._dict_time_series_fields_list_vendor_to_library[
            source + '.' + sourcefield] = field

        # Conversion from library field to vendor sourcefield
        ConfigManager._dict_time_series_fields_list_library_to_vendor[
            source + '.' + field] = sourcefield

    ## Populate categories field list
    reader = csv.DictReader(
        open(DataConstants().time_series_categories_fields))

    for line in reader:
        category = line["category"]
        source = line["source"]
        freq = line["freq"]
        cut = line["cut"]
        fields = line["fields"].split(',')  # Can have multiple fields
        startdate = line["startdate"]

        # Conversion from library category to library fields list
        ConfigManager._dict_time_series_category_fields_library_to_library[
            category + '.' + source + '.' + freq + '.' + cut] = fields

        # Conversion from library category to library startdate
        ConfigManager._dict_time_series_category_startdate_library_to_library[
            category + '.' + source + '.' + freq + '.' + cut] = \
            parse(startdate).date()
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
#

from chartpy import Chart, Style, ChartConstants
from findatapy.util.dataconstants import DataConstants
from findatapy.market import Market, MarketDataRequest, MarketDataGenerator
from findatapy.timeseries import Calculations

import datetime
from datetime import timedelta

dataconstants = DataConstants()


class QuickChart(object):
    """Displays charts from downloaded data with a single line of code.
    Ideal for quickly generating charts from sources including Bloomberg,
    Quandl, ALFRED/FRED etc.
    """

    def __init__(self, engine='plotly', data_source='bloomberg',
                 market_data_generator=MarketDataGenerator()):
        self._chart = Chart(engine=engine)
        self._market = Market(market_data_generator=market_data_generator)
        self._data_source = data_source
def populate_time_series_dictionaries(data_constants=None):

    if data_constants is None:
        data_constants = DataConstants()

    # There are several CSV files which contain data on the tickers
    #
    # time_series_tickers_list - contains every ticker (findatapy tickers => vendor tickers)
    #   category, data_source, freq, tickers, cut, fields, vendor_tickers (from your data provider)
    #   eg. fx / bloomberg / daily / EURUSD / TOK / close,open,high,low / EURUSD CMPT Curncy
    #
    # time_series_fields_list - translates findatapy field names to vendor field names
    #   findatapy fields => vendor fields
    #   data_source, fields, vendor_fields
    #
    # time_series_categories_fields - for each category, its generic properties
    #   category, freq, data_source, fields, startdate
    #   eg. fx / daily / bloomberg / close,high,low,open / 01-Jan-70
    #   eg. bloomberg / close / PX_LAST

    ## Populate tickers list (allow for multiple files)
    if isinstance(data_constants.time_series_tickers_list, str):
        time_series_tickers_list_file = \
            data_constants.time_series_tickers_list.split(';')
    else:
        time_series_tickers_list_file = \
            data_constants.time_series_tickers_list

    df_tickers = []

    for tickers_list_file in time_series_tickers_list_file:

        if os.path.isfile(tickers_list_file):
            reader = csv.DictReader(open(tickers_list_file))

            df_tickers.append(pd.read_csv(tickers_list_file))

            for line in reader:
                category = line["category"]
                data_source = line["data_source"]
                freq_list = line["freq"].split(',')

                if isinstance(freq_list, str):
                    freq_list = [freq_list]

                for freq in freq_list:
                    tickers = line["tickers"]
                    cut = line["cut"]
                    vendor_tickers = line["vendor_tickers"]

                    expiry = None

                    try:
                        expiry = line['expiry']
                    except:
                        pass

                    if category != "":
                        # print("stop" + category + '.' +
                        #       data_source + '.' +
                        #       freq + '.' +
                        #       cut + '.' +
                        #       tickers)

                        # Conversion from library tickers to vendor tickers
                        ConfigManager._dict_time_series_tickers_list_library_to_vendor[
                            category + '.' + data_source + '.' + freq + '.'
                            + cut + '.' + tickers] = vendor_tickers

                        try:
                            if expiry != '':
                                expiry = parse(expiry)
                            else:
                                expiry = None
                        except:
                            pass

                        # Library of tickers by category
                        key = category + '.' + data_source + '.' + freq \
                              + '.' + cut

                        # Conversion from library tickers to library
                        # expiry date
                        ConfigManager._dict_time_series_ticker_expiry_date_library_to_library[
                            data_source + '.' + tickers] = expiry

                        # Conversion from vendor tickers to library tickers
                        ConfigManager._dict_time_series_tickers_list_vendor_to_library[
                            key + '.' + vendor_tickers] = tickers

                        if key in ConfigManager._dict_time_series_category_tickers_library_to_library:
                            ConfigManager._dict_time_series_category_tickers_library_to_library[
                                key].append(tickers)
                        else:
                            ConfigManager._dict_time_series_category_tickers_library_to_library[
                                key] = [tickers]

    try:
        df_tickers = pd.concat(df_tickers).sort_values(
            by=['category', 'data_source', 'freq', 'cut'])
    except:
        pass

    try:
        df_tickers = df_tickers.reset_index()
    except:
        pass

    try:
        df_tickers = df_tickers.drop('level_0', axis=1).reset_index()
    except:
        pass

    ConfigManager._data_frame_time_series_tickers = df_tickers

    ## Populate fields conversions
    reader = csv.DictReader(open(data_constants.time_series_fields_list))

    for line in reader:
        data_source = line["data_source"]
        fields = line["fields"]
        vendor_fields = line["vendor_fields"]

        # Conversion from vendor fields to library fields
        ConfigManager._dict_time_series_fields_list_vendor_to_library[
            data_source + '.' + vendor_fields] = fields

        # Conversion from library fields to vendor fields
        ConfigManager._dict_time_series_fields_list_library_to_vendor[
            data_source + '.' + fields] = vendor_fields

    ## Populate categories fields list
    reader = csv.DictReader(
        open(data_constants.time_series_categories_fields))

    for line in reader:
        category = line["category"]
        data_source = line["data_source"]
        freq = line["freq"]
        cut = line["cut"]
        fields = line["fields"].split(',')  # Can have multiple fields
        startdate = line["startdate"]
        revision_periods = line["revision_periods"]

        if category != "":
            # Conversion from library category to library fields list
            ConfigManager._dict_time_series_category_fields_library_to_library[
                category + '.' + data_source + '.' + freq + '.' + cut] = fields

            # Conversion from library category to library startdate
            ConfigManager._dict_time_series_category_startdate_library_to_library[
                category + '.' + data_source + '.' + freq + '.' + cut] = \
                parse(startdate).date()

            # Conversion from library category to library revision periods
            ConfigManager._dict_time_series_category_revision_periods_library_to_library[
                category + '.' + data_source + '.' + freq + '.' + cut] = \
                revision_periods