class FXCrossFactory(object):
    """Generates FX spot time series and FX total return time series
    (assuming we already have total return indices available from xxxUSD
    form) from underlying series. Can also produce cross rates from the
    USD crosses.
    """

    def __init__(self, market_data_generator=None):
        self.logger = LoggerManager().getLogger(__name__)
        self.fxconv = FXConv()

        self.cache = {}

        self.calculations = Calculations()
        self.market_data_generator = market_data_generator

    def get_fx_cross_tick(self, start, end, cross, cut="NYC",
                          data_source="dukascopy",
                          cache_algo='internet_load_return', type='spot',
                          environment='backtest', fields=['bid', 'ask']):

        if isinstance(cross, str):
            cross = [cross]

        market_data_request = MarketDataRequest(
            gran_freq="tick", freq_mult=1, freq='tick', cut=cut,
            fields=['bid', 'ask', 'bidv', 'askv'], cache_algo=cache_algo,
            environment=environment, start_date=start, finish_date=end,
            data_source=data_source, category='fx')

        market_data_generator = self.market_data_generator

        data_frame_agg = None

        for cr in cross:
            if type == 'spot':
                market_data_request.tickers = cr

                cross_vals = market_data_generator.fetch_market_data(
                    market_data_request)

                # If the user only wants 'close', calculate it as the mid
                # of the bid/ask fields
                if fields == ['close']:
                    cross_vals = cross_vals[
                        [cr + '.bid', cr + '.ask']].mean(axis=1).to_frame(
                        name=cr + '.close')
                else:
                    filter = Filter()

                    filter_columns = [cr + '.' + f for f in fields]
                    cross_vals = filter.filter_time_series_by_columns(
                        filter_columns, cross_vals)

            if data_frame_agg is None:
                data_frame_agg = cross_vals
            else:
                data_frame_agg = data_frame_agg.join(cross_vals, how='outer')

        # Strip the NaN elements
        data_frame_agg = data_frame_agg.dropna()

        return data_frame_agg

    def get_fx_cross(self, start, end, cross, cut="NYC",
                     data_source="bloomberg", freq="intraday",
                     cache_algo='internet_load_return', type='spot',
                     environment='backtest', fields=['close']):

        if data_source == "gain" or data_source == 'dukascopy' \
                or freq == 'tick':
            return self.get_fx_cross_tick(
                start, end, cross, cut=cut, data_source=data_source,
                cache_algo=cache_algo, type='spot', fields=fields)

        if isinstance(cross, str):
            cross = [cross]

        market_data_request_list = []

        for cr in cross:
            market_data_request = MarketDataRequest(
                freq_mult=1, cut=cut, fields=['close'], freq=freq,
                cache_algo=cache_algo, start_date=start, finish_date=end,
                data_source=data_source, environment=environment)

            market_data_request.type = type
            market_data_request.cross = cr

            if freq == 'intraday':
                market_data_request.gran_freq = "minute"  # intraday
            elif freq == 'daily':
                market_data_request.gran_freq = "daily"  # daily

            market_data_request_list.append(market_data_request)

        data_frame_agg = []

        # The nature of the operation determines whether we should use the
        # threading or the multiprocessing library
        if DataConstants().market_thread_technique == "thread":
            from multiprocessing.dummy import Pool
        else:
            # Most of the time is spent waiting for the data vendor to
            # return, so threads usually suffice; for true multiprocessing
            # we must use the multiprocessing_on_dill library, otherwise
            # objects can't be pickled correctly
            # (note: currently not very stable)
            from multiprocessing_on_dill import Pool

        thread_no = DataConstants().market_thread_no['other']

        if market_data_request_list[0].data_source in \
                DataConstants().market_thread_no:
            thread_no = DataConstants().market_thread_no[
                market_data_request_list[0].data_source]

        # Fudge: there is an issue with multithreading and accessing
        # HDF5 files
        # if self.market_data_generator.__class__.__name__ == \
        #         'CachedMarketDataGenerator':
        #     thread_no = 0

        if thread_no > 0:
            pool = Pool(thread_no)

            # Run the market data downloads in their own threads and
            # aggregate the results
            data_frame_agg = self.calculations.iterative_outer_join(
                pool.map_async(self._get_individual_fx_cross,
                               market_data_request_list).get())

            # data_frame_agg = self.calculations.pandas_outer_join(result.get())

            try:
                pool.close()
                pool.join()
            except:
                pass
        else:
            for md_request in market_data_request_list:
                data_frame_agg.append(
                    self._get_individual_fx_cross(md_request))

            data_frame_agg = self.calculations.pandas_outer_join(
                data_frame_agg)

        # Strip the NaN elements
        data_frame_agg = data_frame_agg.dropna()

        # self.speed_cache.put_dataframe(key, data_frame_agg)

        return data_frame_agg

    def _get_individual_fx_cross(self, market_data_request):
        cr = market_data_request.cross
        type = market_data_request.type
        freq = market_data_request.freq

        base = cr[0:3]
        terms = cr[3:6]

        if type == 'spot':
            # Non-USD crosses are triangulated from their USD legs,
            # eg. EURJPY = (EUR/USD) / (JPY/USD)
            if base != 'USD' and terms != 'USD':
                base_USD = self.fxconv.correct_notation('USD' + base)
                terms_USD = self.fxconv.correct_notation('USD' + terms)

                # TODO check if the cross exists in the database

                # Download base USD cross
                market_data_request.tickers = base_USD
                market_data_request.category = 'fx'

                if base_USD + '.close' in self.cache:
                    base_vals = self.cache[base_USD + '.close']
                else:
                    base_vals = self.market_data_generator.fetch_market_data(
                        market_data_request)
                    self.cache[base_USD + '.close'] = base_vals

                # Download terms USD cross
                market_data_request.tickers = terms_USD
                market_data_request.category = 'fx'

                if terms_USD + '.close' in self.cache:
                    terms_vals = self.cache[terms_USD + '.close']
                else:
                    terms_vals = self.market_data_generator.fetch_market_data(
                        market_data_request)
                    self.cache[terms_USD + '.close'] = terms_vals

                # If quoted USD/base, flip to get base/USD (cache the
                # flipped series under the inverted pair name)
                if base_USD[0:3] == 'USD':
                    if base + 'USD.close' in self.cache:
                        base_vals = self.cache[base + 'USD.close']
                    else:
                        base_vals = 1 / base_vals
                        self.cache[base + 'USD.close'] = base_vals

                # If quoted USD/terms, flip to get terms/USD
                if terms_USD[0:3] == 'USD':
                    if terms + 'USD.close' in self.cache:
                        terms_vals = self.cache[terms + 'USD.close']
                    else:
                        terms_vals = 1 / terms_vals
                        self.cache[terms + 'USD.close'] = terms_vals

                base_vals.columns = ['temp']
                terms_vals.columns = ['temp']

                cross_vals = base_vals.div(terms_vals, axis='index')
                cross_vals.columns = [cr + '.close']

                base_vals.columns = [base_USD + '.close']
                terms_vals.columns = [terms_USD + '.close']
            else:
                # if base == 'USD': non_USD = terms
                # if terms == 'USD': non_USD = base

                correct_cr = self.fxconv.correct_notation(cr)

                market_data_request.tickers = correct_cr
                market_data_request.category = 'fx'

                if correct_cr + '.close' in self.cache:
                    cross_vals = self.cache[correct_cr + '.close']
                else:
                    cross_vals = self.market_data_generator.fetch_market_data(
                        market_data_request)
                    self.cache[correct_cr + '.close'] = cross_vals

                # Flip if the requested cross is not quoted in market
                # convention
                if correct_cr != cr:
                    if cr + '.close' in self.cache:
                        cross_vals = self.cache[cr + '.close']
                    else:
                        cross_vals = 1 / cross_vals
                        self.cache[cr + '.close'] = cross_vals

                # cross_vals = self.market_data_generator.harvest_time_series(market_data_request)
                cross_vals.columns = [cr + '.close']

        elif type[0:3] == "tot":
            if freq == 'daily':
                # Download base USD cross
                market_data_request.tickers = base + 'USD'
                market_data_request.category = 'fx-tot'

                if type == "tot":
                    base_vals = self.market_data_generator.fetch_market_data(
                        market_data_request)

                # Download terms USD cross
                market_data_request.tickers = terms + 'USD'
                market_data_request.category = 'fx-tot'

                if type == "tot":
                    terms_vals = self.market_data_generator.fetch_market_data(
                        market_data_request)

                base_rets = self.calculations.calculate_returns(base_vals)
                terms_rets = self.calculations.calculate_returns(terms_vals)

                cross_rets = base_rets.sub(terms_rets.iloc[:, 0], axis=0)

                # The first return of a time series will be NaN, given we
                # don't know the previous point
                cross_rets.iloc[0] = 0

                cross_vals = self.calculations.create_mult_index(cross_rets)
                cross_vals.columns = [cr + '-tot.close']

            elif freq == 'intraday':
                self.logger.info(
                    'Total return calculation not implemented for intraday '
                    'data yet')
                return None

        return cross_vals
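# To make the spot triangulation above concrete, below is a minimal,
# standalone sketch of what _get_individual_fx_cross does for a non-USD
# cross such as EURJPY, with hypothetical hardcoded prices standing in for
# the MarketDataGenerator downloads (the dates, values and variable names
# are illustrative only, not real market data)

import pandas

dates = pandas.date_range("2017-01-02", periods=3, freq="D")

# USD legs as fetched in market convention: EUR is quoted as EURUSD,
# JPY is quoted as USDJPY
eurusd = pandas.DataFrame({"EURUSD.close": [1.05, 1.06, 1.04]}, index=dates)
usdjpy = pandas.DataFrame({"USDJPY.close": [117.0, 116.5, 118.0]}, index=dates)

# The JPY leg is quoted as USD/JPY, so flip it to get JPY/USD
jpyusd = 1 / usdjpy

# EURJPY = (EUR/USD) / (JPY/USD); rename both columns to a common name so
# the element-wise division aligns on the date index
base_vals = eurusd.copy()
terms_vals = jpyusd.copy()
base_vals.columns = ["temp"]
terms_vals.columns = ["temp"]

cross_vals = base_vals.div(terms_vals, axis="index")
cross_vals.columns = ["EURJPY.close"]

print(cross_vals)  # first row: 1.05 * 117.0 = 122.85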
# In the config file, we can use the keywords 'open', 'high', 'low',
# 'close' and 'volume' for alphavantage data

# Download equities data from alphavantage
md_request = MarketDataRequest(
    start_date="01 Jan 2002",  # start date
    finish_date="05 Feb 2017",  # finish date
    data_source='alphavantage',  # use alphavantage as data source
    tickers=['Apple', 'Citigroup', 'Microsoft', 'Oracle', 'IBM',
             'Walmart', 'Amazon', 'UPS', 'Exxon'],  # ticker (findatapy)
    fields=['close'],  # which fields to download
    vendor_tickers=['aapl', 'c', 'msft', 'orcl', 'ibm', 'wmt',
                    'amzn', 'ups', 'xom'],  # ticker (alphavantage)
    vendor_fields=['Close'],  # which alphavantage fields to download
    cache_algo='internet_load_return')

logger.info("Load data from alphavantage directly")

df = market.fetch_market(md_request)

logger.info("Loaded data from alphavantage directly, now try reading from "
            "Redis in-memory cache")

# Change the cache algo flag, so it won't attempt to download via the web
md_request.cache_algo = 'cache_algo_return'

df = market.fetch_market(md_request)

logger.info("Read from Redis cache.. that was a lot quicker!")
import lzma
import os
import struct

from datetime import timedelta

import pandas
import requests


class DataVendorDukasCopy(DataVendor):
    tick_name = "{symbol}/{year}/{month}/{day}/{hour}h_ticks.bi5"

    def __init__(self):
        super(DataVendorDukasCopy, self).__init__()

        self.logger = LoggerManager().getLogger(__name__)

        import logging
        logging.getLogger("requests").setLevel(logging.WARNING)

        self.config = ConfigManager()

    # Implement method in abstract superclass
    def load_ticker(self, market_data_request):
        """Retrieves market data from an external data source (in this case
        Dukascopy)

        Parameters
        ----------
        market_data_request : MarketDataRequest
            contains all the various parameters detailing time series start
            and finish, tickers etc

        Returns
        -------
        DataFrame
        """
        market_data_request_vendor = self.construct_vendor_market_data_request(
            market_data_request)

        data_frame = None
        self.logger.info("Request Dukascopy data")

        # Dukascopy doesn't support non-tick data
        if market_data_request.freq in ['daily', 'weekly', 'monthly',
                                        'quarterly', 'yearly', 'intraday',
                                        'minute', 'hourly']:
            self.logger.warning("Dukascopy loader is for tick data only")
            return None

        # Assume one ticker only (MarketDataGenerator only calls one ticker
        # at a time)
        if market_data_request.freq in ['tick']:
            # market_data_request_vendor.tickers = market_data_request_vendor.tickers[0]

            data_frame = self.get_tick(market_data_request,
                                       market_data_request_vendor)

            if data_frame is not None:
                data_frame = data_frame.tz_localize('UTC')

        self.logger.info("Completed request from Dukascopy")

        return data_frame

    def kill_session(self):
        return

    def get_tick(self, market_data_request, market_data_request_vendor):

        data_frame = self.download_tick(market_data_request_vendor)

        # Convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            returned_fields = data_frame.columns
            returned_tickers = [market_data_request_vendor.tickers[0]] * \
                               len(returned_fields)

            fields = self.translate_from_vendor_field(returned_fields,
                                                      market_data_request)
            tickers = self.translate_from_vendor_ticker(returned_tickers,
                                                        market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined
            data_frame.index.name = 'Date'

        return data_frame

    def download_tick(self, market_data_request):

        symbol = market_data_request.tickers[0]

        self.logger.info("About to download from Dukascopy... for " + symbol)

        # Single threaded
        df_list = [self.fetch_file(time, symbol) for time in
                   self.hour_range(market_data_request.start_date,
                                   market_data_request.finish_date)]

        # TODO parallel download (currently has pickle issues)
        # time_list = self.hour_range(market_data_request.start_date,
        #                             market_data_request.finish_date)
        # import multiprocessing_on_dill as multiprocessing
        #
        # pool = multiprocessing.Pool(processes=4)
        # results = [pool.apply_async(self.fetch_file, args=(time, symbol))
        #            for time in time_list]
        # df_list = [p.get() for p in results]

        try:
            return pandas.concat(df_list)
        except:
            return None

    def fetch_file(self, time, symbol):
        # Log progress once per day
        if time.hour == 0:
            self.logger.info("Downloading... " + str(time))

        tick_path = self.tick_name.format(
            symbol=symbol,
            year=str(time.year).rjust(4, '0'),
            month=str(time.month).rjust(2, '0'),
            day=str(time.day).rjust(2, '0'),
            hour=str(time.hour).rjust(2, '0'))

        tick = self.fetch_tick(DataConstants().dukascopy_base_url + tick_path)

        if DataConstants().dukascopy_write_temp_tick_disk:
            out_path = DataConstants().temp_folder + "/dkticks/" + tick_path

            if not os.path.exists(out_path):
                if not os.path.exists(os.path.dirname(out_path)):
                    os.makedirs(os.path.dirname(out_path))

            self.write_tick(tick, out_path)

        try:
            return self.retrieve_df(lzma.decompress(tick), symbol, time)
        except:
            return None

    def fetch_tick(self, tick_url):
        i = 0
        tick_request = None

        # Try up to 5 times to download
        while i < 5:
            try:
                tick_request = requests.get(tick_url)
                i = 5
            except:
                i = i + 1

        if tick_request is None:
            self.logger.error("Failed to download from " + tick_url)
            return None

        return tick_request.content

    def write_tick(self, content, out_path):
        data_file = open(out_path, "wb+")
        data_file.write(content)
        data_file.close()

    def chunks(self, list, n):
        if n < 1:
            n = 1

        return [list[i:i + n] for i in range(0, len(list), n)]

    def retrieve_df(self, data, symbol, epoch):
        date, parsed_list = self.parse_tick_data(data, epoch)

        df = pandas.DataFrame(data=parsed_list,
                              columns=['temp', 'ask', 'bid', 'askv', 'bidv'],
                              index=date)
        df = df.drop('temp', axis=1)
        df.index.name = 'Date'

        divisor = 100000

        # Where JPY is the terms currency we have a different divisor
        if symbol[3:6] == 'JPY':
            divisor = 1000

        # Prices are returned without a decimal point
        df['bid'] = df['bid'] / divisor
        df['ask'] = df['ask'] / divisor

        return df

    def hour_range(self, start_date, end_date):
        delta_t = end_date - start_date
        delta_hours = (delta_t.days * 24.0) + (delta_t.seconds / 3600.0)

        for n in range(int(delta_hours)):
            yield start_date + timedelta(hours=n)

    def parse_tick_data(self, data, epoch):
        # tick = namedtuple('Tick', 'Date ask bid askv bidv')

        chunks_list = self.chunks(data, 20)
        parsed_list = []
        date = []

        # Note: Numba can speed up for loops
        for row in chunks_list:
            d = struct.unpack(">LLLff", row)
            date.append(epoch + timedelta(milliseconds=d[0]))

            # SLOW: no point using named tuples!
            # row_data = tick._asdict(tick._make(d))
            # row_data['Date'] = (epoch + timedelta(0,0,0,row_data['Date']))

            parsed_list.append(d)

        return date, parsed_list

    def get_daily_data(self):
        pass
class DataVendorQuandl(DataVendor):
    def __init__(self):
        super(DataVendorQuandl, self).__init__()
        self.logger = LoggerManager().getLogger(__name__)

    # Implement method in abstract superclass
    def load_ticker(self, market_data_request):
        market_data_request_vendor = self.construct_vendor_market_data_request(
            market_data_request)

        self.logger.info("Request Quandl data")

        data_frame = self.download_daily(market_data_request_vendor)

        if data_frame is None or len(data_frame.index) == 0:
            return None

        # Convert from vendor to findatapy tickers/fields
        if data_frame is not None:
            returned_tickers = data_frame.columns

            # Tidy up tickers into a format that is more easily
            # translatable; we can often get multiple fields returned
            # (even if we don't ask for them!)

            # Convert to lower case
            returned_fields = [(x.split(' - ')[1]).lower()
                                   .replace(' ', '-')
                                   .replace('.', '-')
                                   .replace('--', '-')
                               for x in returned_tickers]

            # Special case for close
            returned_fields = [x.replace('value', 'close')
                               for x in returned_fields]

            returned_tickers = [x.replace('.', '/') for x in returned_tickers]
            returned_tickers = [x.split(' - ')[0] for x in returned_tickers]

            fields = self.translate_from_vendor_field(returned_fields,
                                                      market_data_request)
            tickers = self.translate_from_vendor_ticker(returned_tickers,
                                                        market_data_request)

            ticker_combined = []

            for i in range(0, len(fields)):
                ticker_combined.append(tickers[i] + "." + fields[i])

            data_frame.columns = ticker_combined
            data_frame.index.name = 'Date'

        self.logger.info("Completed request from Quandl.")

        return data_frame

    def download_daily(self, market_data_request):
        trials = 0
        data_frame = None

        while trials < 5:
            try:
                data_frame = Quandl.get(
                    market_data_request.tickers,
                    authtoken=DataConstants().quandl_api_key,
                    trim_start=market_data_request.start_date,
                    trim_end=market_data_request.finish_date)

                break
            except:
                trials = trials + 1
                self.logger.info("Attempt " + str(trials) +
                                 " to download from Quandl")

        if trials == 5:
            self.logger.error(
                "Couldn't download from Quandl after several attempts!")

        return data_frame
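# For reference, a sketch of requesting Quandl data through findatapy's
# higher-level interface, in the same style as the alphavantage/yahoo
# examples elsewhere in this section; it assumes the same market/logger
# setup as those examples, the vendor ticker "FRED/GDP" and the findatapy
# ticker "US GDP" are illustrative assumptions, and a valid quandl_api_key
# must be configured in DataConstants for this to run

md_request = MarketDataRequest(
    start_date="01 Jan 2002",  # start date
    finish_date="05 Feb 2017",  # finish date
    data_source="quandl",  # use Quandl as data source
    tickers=["US GDP"],  # ticker (findatapy)
    fields=["close"],  # which fields to download
    vendor_tickers=["FRED/GDP"],  # ticker (Quandl)
    vendor_fields=["close"],  # which Quandl fields to download
    cache_algo="internet_load_return")

df = market.fetch_market(md_request)

print(df)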
# and "volume" for alphavantage data # Download equities data from yahoo md_request = MarketDataRequest( start_date="01 Jan 2002", # start date finish_date="05 Feb 2017", # finish date data_source="yahoo", # use alphavantage as data source tickers=["Apple", "Citigroup", "Microsoft", "Oracle", "IBM", "Walmart", "Amazon", "UPS", "Exxon"], # ticker (findatapy) fields=["close"], # which fields to download vendor_tickers=["aapl", "c", "msft", "orcl", "ibm", "wmt", "amzn", "ups", "xom"], # ticker (yahoo) vendor_fields=["Close"], # which yahoo fields to download cache_algo="internet_load_return") logger.info("Load data from yahoo directly") df = market.fetch_market(md_request) print(df) logger.info( "Loaded data from yahoo directly, now try reading from Redis " "in-memory cache") md_request.cache_algo = "cache_algo_return" # change flag to cache algo # so won"t attempt to download via web df = market.fetch_market(md_request) print(df) logger.info("Read from Redis cache.. that was a lot quicker!")