def _benchmark_calculation(self, trade_order_df, bid_price, ask_price, date_start, date_end, weights=None): benchmark = [] for i in range(0, len(trade_order_df.index)): # If the trade is a buy if trade_order_df['side'][i] == 1: price = ask_price # If the trade is a sell elif trade_order_df['side'][i] == -1: price = bid_price if date_start[i] == date_end[i]: benchmark.append(price[date_start[i]]) else: try: benchmark.append(self._get_price(price[date_start[i]:date_end[i]], side=trade_order_df['side'][i])) except Exception as e: err_msg = self._benchmark_name + " cannot be calculated, given market data does not fully overlap with trade data: " \ + str(e) LoggerManager.getLogger(__name__).error(err_msg) raise TradeMarketNonOverlapException(err_msg) return benchmark
def calculate_benchmark(self, trade_order_df=None, market_df=None, trade_order_name=None, bid_benchmark=None,
                        ask_benchmark=None, benchmark_date_start_field=None, benchmark_date_end_field=None):
    """Calculates a time weighted average price (TWAP) benchmark for every trade/order, using the ask
    price for buys and the bid price for sells, weighted by the elapsed time between market ticks.

    Parameters
    ----------
    trade_order_df : DataFrame
        Trades/orders to benchmark (must contain a 'side' column: 1 = buy, -1 = sell)

    market_df : DataFrame
        Market data containing the bid/ask benchmark columns, indexed by time

    trade_order_name : str
        Which trade/order table is being processed (used to check the benchmark applies)

    bid_benchmark : str (default: None - use field variable)
        Market data column used for sells

    ask_benchmark : str (default: None - use field variable)
        Market data column used for buys

    benchmark_date_start_field : str (default: None - use field variable)
        Trade field holding the start of the benchmark window

    benchmark_date_end_field : str (default: None - use field variable)
        Trade field holding the end of the benchmark window

    Returns
    -------
    DataFrame, DataFrame

    Raises
    ------
    TradeMarketNonOverlapException
        If the market data does not fully overlap with the trade data
    """
    if not (self._check_calculate_benchmark(trade_order_name=trade_order_name)):
        return trade_order_df, market_df

    # For the specified field (usually 'mid' field) calculate the time weighted average price, which is the simple
    # average
    if bid_benchmark is None: bid_benchmark = self._bid_benchmark
    if ask_benchmark is None: ask_benchmark = self._ask_benchmark
    if benchmark_date_start_field is None: benchmark_date_start_field = self._benchmark_date_start_field
    if benchmark_date_end_field is None: benchmark_date_end_field = self._benchmark_date_end_field

    # FIX: the ask column was previously checked with "in market_df" rather than "in market_df.columns"
    if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns:
        trade_order_df[self._benchmark_name] = np.nan

        date_start = trade_order_df[benchmark_date_start_field].values
        date_end = trade_order_df[benchmark_date_end_field].values

        # Convert the benchmark window timestamps into positional indices on the market data
        date_start = np.searchsorted(market_df.index, date_start)
        date_end = np.searchsorted(market_df.index, date_end)

        bid_price = market_df[bid_benchmark].values
        ask_price = market_df[ask_benchmark].values

        # Elapsed seconds between consecutive market ticks, used as the TWAP weights
        dt = market_df.index.to_series().diff().values / np.timedelta64(1, 's')
        dt[0] = 0  # first point should be weighted zero (since don't know how long it's been there)

        twap = []

        for i in range(0, len(trade_order_df.index)):
            # Buys are benchmarked against the ask, sells against the bid
            if trade_order_df['side'][i] == 1:
                price = ask_price
            elif trade_order_df['side'][i] == -1:
                price = bid_price

            try:
                if date_start[i] == date_end[i]:
                    # Point-in-time trade: no window to average over
                    twap.append(price[date_start[i]])
                else:
                    twap_val = np.average(price[date_start[i]:date_end[i]],
                                          weights=dt[date_start[i]:date_end[i]])

                    twap.append(twap_val)
            except Exception as e:
                err_msg = "TWAP cannot be calculated, given market data does not fully overlap with trade data: " \
                          + str(e)

                LoggerManager.getLogger(__name__).error(err_msg)

                raise TradeMarketNonOverlapException(err_msg)

        trade_order_df[self._benchmark_name] = twap
    else:
        # FIX: Logger.warn is deprecated in favour of Logger.warning
        LoggerManager.getLogger(__name__).warning(
            bid_benchmark + " and " + ask_benchmark + " may not be in market data.")

    return trade_order_df, market_df
def calculate_benchmark(self, trade_order_df=None, market_df=None, trade_order_name=None, bid_benchmark=None,
                        ask_benchmark=None, volume_field=None, benchmark_date_start_field=None,
                        benchmark_date_end_field=None):
    """Calculates a volume weighted average price (VWAP) benchmark for every trade/order, using the
    ask price for buys and the bid price for sells, weighted by traded volume.

    Parameters
    ----------
    trade_order_df : DataFrame
        Trades/orders to benchmark (must contain a 'side' column: 1 = buy, -1 = sell)

    market_df : DataFrame
        Market data containing the bid/ask benchmark columns and a volume column, indexed by time

    trade_order_name : str
        Which trade/order table is being processed (used to check the benchmark applies)

    bid_benchmark : str (default: None - use field variable)
        Market data column used for sells

    ask_benchmark : str (default: None - use field variable)
        Market data column used for buys

    volume_field : str (default: None - use field variable)
        Market data column holding the volume weights

    benchmark_date_start_field : str (default: None - use field variable)
        Trade field holding the start of the benchmark window

    benchmark_date_end_field : str (default: None - use field variable)
        Trade field holding the end of the benchmark window

    Returns
    -------
    DataFrame, DataFrame

    Raises
    ------
    TradeMarketNonOverlapException
        If the market data does not fully overlap with the trade data
    """
    if not (self._check_calculate_benchmark(trade_order_name=trade_order_name)):
        return trade_order_df, market_df

    # If fields have not been specified, then take them from the field variables
    if bid_benchmark is None: bid_benchmark = self._bid_benchmark
    if ask_benchmark is None: ask_benchmark = self._ask_benchmark
    if volume_field is None: volume_field = self._volume_field
    if benchmark_date_start_field is None: benchmark_date_start_field = self._benchmark_date_start_field
    if benchmark_date_end_field is None: benchmark_date_end_field = self._benchmark_date_end_field

    if bid_benchmark in market_df.columns and ask_benchmark in market_df.columns and volume_field in market_df.columns:
        trade_order_df[self._benchmark_name] = np.nan

        date_start = trade_order_df[benchmark_date_start_field].values
        date_end = trade_order_df[benchmark_date_end_field].values

        # Convert the benchmark window timestamps into positional indices on the market data
        date_start = np.searchsorted(market_df.index, date_start)
        date_end = np.searchsorted(market_df.index, date_end)

        bid_price = market_df[bid_benchmark].values
        ask_price = market_df[ask_benchmark].values
        volume = market_df[volume_field].values

        vwap = []

        for i in range(0, len(trade_order_df.index)):
            # Buys are benchmarked against the ask, sells against the bid
            if trade_order_df['side'][i] == 1:
                price = ask_price
            elif trade_order_df['side'][i] == -1:
                price = bid_price

            if date_start[i] == date_end[i]:
                # Point-in-time trade: no window to average over
                vwap.append(price[date_start[i]])
            else:
                try:
                    vwap.append(
                        np.average(price[date_start[i]:date_end[i]],
                                   weights=volume[date_start[i]:date_end[i]]))
                except Exception as e:
                    err_msg = "VWAP cannot be calculated, given market data does not fully overlap with trade data: " \
                              + str(e)

                    LoggerManager.getLogger(__name__).error(err_msg)

                    raise TradeMarketNonOverlapException(err_msg)

        trade_order_df[self._benchmark_name] = vwap
    else:
        # FIX: Logger.warn is deprecated in favour of Logger.warning
        LoggerManager.getLogger(__name__).warning(
            bid_benchmark + ", " + ask_benchmark + " " + volume_field + " may not be in market data")

    return trade_order_df, market_df
def _check_data_store(self, data_store):
    """Validates a data_store: it must either be one of the recognised data sources, or point at a
    CSV/HDF5 file.

    Parameters
    ----------
    data_store : str or object
        Name/path of the data source; non-string objects (eg. an already-loaded source) pass through

    Returns
    -------
    str or object
        The unchanged data_store when valid

    Raises
    ------
    ValidationException
        If the data_store string is not a recognised source nor a .csv/.h5 file
    """
    try:
        # FIX: the original used "or" between the three membership tests (so nearly every string
        # failed all three and triggered the error), and the bare "except: pass" then swallowed
        # the ValidationException it raised, making the whole check a no-op
        if data_store not in constants.valid_data_store and '.csv' not in data_store and '.h5' not in data_store:
            err_msg = data_store + " is not a defined data source."

            LoggerManager.getLogger(__name__).error(err_msg)

            raise ValidationException(err_msg)
    except ValidationException:
        raise
    except Exception:
        # data_store may legitimately be a non-string object, in which case the string membership
        # tests above raise TypeError - treat that as valid and pass it through
        pass

    return data_store
def _write_df_to_db_single_thread(self, ticker, remove_duplicates=True, if_exists_table='append',
                                  if_exists_ticker='replace'):
    """Reads a previously dumped binary DataFrame for a ticker from the large temp folder and appends
    it to the output market data database (single threaded).

    Parameters
    ----------
    ticker : str
        Ticker whose dumped file should be loaded

    remove_duplicates : bool (default: True)
        Was the file dumped with duplicates stripped? (changes the filename postfix)

    if_exists_table : str (default: 'append')
        Behaviour when the database table already exists

    if_exists_ticker : str (default: 'replace')
        Behaviour when the ticker already exists in the table
    """
    logger = LoggerManager.getLogger(__name__)

    # The filename postfix encodes whether duplicates were stripped when the file was dumped
    if remove_duplicates:
        postfix = '-' + self._get_postfix() + '-no-duplicates'
    else:
        postfix = '-' + self._get_postfix() + '-with-duplicates'

    filename = os.path.join(self.temp_large_data_folder, ticker + postfix) + '.' + fileformat

    logger.info("Reading " + filename)

    util_func = UtilFunc()
    time_series_ops = TimeSeriesOps()
    data_source_local = self._get_output_data_source()

    df = util_func.read_dataframe_from_binary(filename, format=binary_format)

    if df is None:
        logger.warn("Couldn't write dataframe for " + ticker + " to database, appears it is empty!")
        return

    # Database storage assumes UTC timestamps
    df = time_series_ops.localize_as_UTC(df)

    data_source_local.append_market_data(df, ticker,
                                         if_exists_table=if_exists_table,
                                         if_exists_ticker=if_exists_ticker)
def _chunk_dataframes(self, obj, chunk_size_mb=constants.volatile_cache_redis_max_cache_chunk_size_mb):
    """Splits a DataFrame into chunks small enough to each fit in a single Redis key.

    Parameters
    ----------
    obj : DataFrame
        Object to be chunked

    chunk_size_mb : int (default: from constants)
        Maximum size of a single chunk in megabytes

    Returns
    -------
    DataFrame (list)
        One or more chunks covering the original object
    """
    logger = LoggerManager.getLogger(__name__)

    # Can sometime have very large dataframes, which need to be split, otherwise won't fit in a single Redis key
    # FIX: memory_usage expects a boolean 'deep' flag (the string 'deep' only worked by being truthy)
    mem = obj.memory_usage(deep=True).sum()
    mem_float = round(float(mem) / (1024.0 * 1024.0), 3)
    mem = '----------- ' + str(mem_float) + ' MB -----------'

    chunks = int(math.ceil(mem_float / chunk_size_mb))

    if chunks > 1:
        obj_list = self._time_series_ops.split_array_chunks(obj, chunks=chunks)
    else:
        obj_list = [obj]

    if obj_list != []:
        logger.debug("Pandas dataframe of size: " + mem + " in " + str(chunks) + " chunk(s)")

    return obj_list
def create_resampled_spot_data(resample_freq='1min', data_vendor='dukascopy'):
    """Resamples tick level 'mid' data for every ticker into OHLC bars plus a tick count, writing one
    parquet file per ticker.

    Parameters
    ----------
    resample_freq : str (default: '1min')
        Pandas resampling frequency for the OHLC bars

    data_vendor : str (default: 'dukascopy')
        Which vendor's CSV dump folder to read from
    """
    logger = LoggerManager.getLogger(__name__)

    csv_input_folder = '/data/csv_dump/' + data_vendor + '/'

    for ticker in ticker_mkt:
        logger.info("Processing for " + ticker + " resample freq " + resample_freq + " data vendor " + data_vendor)

        flat_file = csv_input_folder + ticker + '_' + data_vendor + '_*.' + file_extension

        # Load all the flat files for this ticker and keep only the mid price
        df_dd = dd.read_parquet(flat_file).compute()['mid']

        logger.info("About to resample OHLC for " + ticker + " resample freq " + resample_freq + " data vendor " + data_vendor)

        # Reuse one resampler for both the OHLC bars and the tick count
        resampler = df_dd.resample(resample_freq)
        df_dd_ohlc = resampler.ohlc()

        # FIX: removed leftover debug print of df_dd_ohlc.columns

        logger.info("About to resample count for " + ticker)

        df_dd_count = resampler.count()
        df_dd_count.name = 'tickcount'

        df_dd = pd.concat([df_dd_ohlc, df_dd_count], axis=1)
        df_dd.columns = [ticker + '.' + x for x in df_dd.columns]
        df_dd = df_dd.dropna()

        df_dd.to_parquet(csv_output + ticker + '_' + resample_freq + '_' + data_vendor + '.' + file_extension)

        # Release memory before the next ticker
        df_dd = None
def get_market_trade_metrics(self, tca_request, dummy_market=False):
    """Loads the market and trade data, and computes metrics, for every ticker in the TCARequest.

    Parameters
    ----------
    tca_request : TCARequest
        Parameters for the TCA

    dummy_market : bool (default: False)
        Should dummy market data be returned (requires less memory)?

    Returns
    -------
    DataFrame (dict) , DataFrame (dict), TCARequest (list)
    """
    logger = LoggerManager.getLogger(__name__)

    logger.debug("Start loading trade/data/computation")

    # Split up TCARequest into a list of TCA requests, one per ticker
    tca_request_list = self._split_tca_request_into_list(tca_request)

    market_df_dict, trade_order_results_df_dict = \
        self._get_market_trade_metrics(tca_request_list, dummy_market)

    logger.debug("Finished loading data and calculating metrics on individual tickers")

    return market_df_dict, trade_order_results_df_dict, tca_request_list
def _get(self, key, burn_after_reading=False):
    """Fetches the raw cached list values for each key from Redis and converts them back into Python objects.

    Parameters
    ----------
    key : str (list)
        Keys to look up; matched by prefix, since stored keys may carry a size suffix

    burn_after_reading : bool (default: False)
        Should matching expiry keys be deleted once they have been read?

    Returns
    -------
    list
        One entry per key; None where the key was missing or conversion failed
    """
    logger = LoggerManager.getLogger(__name__)

    logger.debug('Attempting to get list from cache: ' + str(key))

    # Keep the original key list, so we can return placeholders if nothing matches in Redis
    old_key = key

    # Use a pipeline which is quicker for multiple database operations
    pipeline = VolatileRedis._db.pipeline()

    # Check if the key is inside Redis (may have the "size" after it, which will be needed to decompress)
    for k in key:
        pipeline.keys(k + "*")

    key = pipeline.execute()
    key = self._util_func.flatten_list_of_lists(key)

    if key != []:
        # Convert byte to string
        key = [k.decode("utf-8") for k in key]

        pipeline = VolatileRedis._db.pipeline()

        # Get list of values for each element
        for k in key:
            pipeline.lrange(k, 0, -1)

        if burn_after_reading:
            # Queue deletion of the expiry keys on the same pipeline, so it executes with the reads
            key_burn = [k for k in key if '_expiry_' in k]

            self.delete(key_burn, pipeline=pipeline)

        cache_output = pipeline.execute()
    else:
        # Nothing matched: return a None placeholder for every originally requested key
        cache_output = [None] * len(old_key)
        key = old_key

    if burn_after_reading:
        # The delete above appends one extra result (the delete count) to the pipeline output
        if len(cache_output) == len(key) + 1:
            logger.debug("Deleted " + str(cache_output[-1]) + ' keys')

            cache_output = cache_output[:-1]

    for i in range(0, len(key)):
        if cache_output[i] is not None:
            try:
                # Decompress/deserialize the raw Redis list back into a Python object
                cache_output[i] = self._convert_binary_to_python(cache_output[i], key[i])
            except Exception as e:
                logger.error("Error converting binary object to Python for key: " + key[i] + " and " + str(e))

                # print(cache_output[i])
                cache_output[i] = None

    # print(cache_output)

    return cache_output
def get_market_trade_order_holder(self, tca_request):
    """Fetches both the market data and the trade/order data associated with a TCA calculation.

    Parameters
    ----------
    tca_request : TCARequest
        Parameters for a TCA calculation

    Returns
    -------
    DataFrame, DataFrameHolder
    """
    logger = LoggerManager.getLogger(__name__)

    logger.debug("Get market and trade/order data for " + str(tca_request.ticker) + " from " +
                 str(tca_request.start_date) + " - " + str(tca_request.finish_date))

    # Get all the trade/orders which have been requested, eg. trade_df and order_df
    # do separate calls given they are assumed to be stored in different database tables
    market_data = self.get_market_data(tca_request)
    trade_order_holder = self.get_trade_order_holder(tca_request)

    return market_data, trade_order_holder
def create_resampled_spot_data():
    """Resamples tick level 'mid' data for every ticker into OHLC bars plus a tick count, writing one
    parquet file per ticker (module level csv_folder/data_vendor/resample_freq settings are used)."""
    logger = LoggerManager.getLogger(__name__)

    for ticker in ticker_mkt:
        logger.info("Processing for " + ticker)

        flat_file = csv_folder + ticker + '_' + data_vendor + '_*.' + file_extension

        # Load all the flat files for this ticker and keep only the mid price
        df_dd = dd.read_parquet(flat_file).compute()['mid']

        logger.info("About to resample OHLC for " + ticker)

        df_dd_ohlc = df_dd.resample(resample_freq).ohlc()

        # FIX: removed leftover debug print of df_dd_ohlc.columns

        logger.info("About to resample count for " + ticker)

        df_dd_count = df_dd.resample(resample_freq).count()
        df_dd_count.name = 'tickcount'

        df_dd = df_dd_ohlc.join(df_dd_count)
        df_dd.columns = [ticker + '.' + x for x in df_dd.columns]
        df_dd = df_dd.dropna()

        df_dd.to_parquet(csv_output + ticker + '_' + resample_freq + '_' + data_vendor + '.' + file_extension)
def _calculate_additional_metrics(self, market_df, trade_order_df_dict, tca_request):
    """Adds display-oriented outputs (sparse market/trade DataFrames, downsampled market data and
    candlestick figures) to the trade/order results dict, when a detailed or candlestick display
    has been requested.

    Parameters
    ----------
    market_df : DataFrame
        Market data for the ticker being processed

    trade_order_df_dict : dict
        Trade/order DataFrames keyed by name; extended in place with the plotting outputs

    tca_request : TCARequest
        Parameters for the TCA (tca_type and summary_display drive what is generated)

    Returns
    -------
    dict
        The same dict, with the additional display entries added
    """
    logger = LoggerManager.getLogger(__name__)

    # Add candlesticks/sparse DataFrames for plotting if requested
    if tca_request.tca_type == 'detailed' or tca_request.summary_display == 'candlestick':
        trade_order_list = self._util_func.dict_key_list(trade_order_df_dict.keys())

        # only add the ticker name if we have a non-detailed plot to differentiate between currency pairs
        if tca_request.tca_type == 'detailed':
            ticker_label = ''
        else:
            ticker_label = tca_request.ticker + '_'

        logger.debug("Generating downsampled market data for potentional display")

        market_downsampled_df = self._time_series_ops.downsample_time_series_usable(market_df)

        # Combine downsampled market data with trade data
        # (only these fields are kept for display purposes)
        fields = ['bid', 'ask', 'open', 'high', 'low', 'close', 'mid', 'vwap', 'twap', 'arrival',
                  'buy_trade', 'sell_trade', 'notional', 'executed_notional', 'executed_price', 'side']

        # create a sparse representation of the trades/orders which can later be displayed to users
        for trade_order in trade_order_list:
            if trade_order in trade_order_df_dict:
                trade_order_df_dict[ticker_label + 'sparse_market_' + trade_order] = \
                    self._join_market_downsampled_trade_orders(market_downsampled_df,
                                                               trade_order_df_dict[trade_order],
                                                               fields=fields)

        trade_order_df_dict[ticker_label + 'market_df_downsampled'] = market_downsampled_df

        trade_order_df_dict[ticker_label + 'candlestick_fig'] = \
            self._plot_render.generate_candlesticks(market_downsampled_df)

        # For candlestick summaries also render a full timeline figure per trade/order
        if tca_request.summary_display == 'candlestick':
            for trade_order in trade_order_list:
                if trade_order in trade_order_df_dict:
                    title = ticker_label + " " + trade_order

                    lines_to_plot = self._util_func.dict_key_list(
                        constants.detailed_timeline_plot_lines.keys())
                    lines_to_plot.append('candlestick')

                    # Store figures under a '..._fig' key mirroring the '..._df' naming
                    trade_order_df_dict[ticker_label + 'sparse_market_' + trade_order.replace('df', 'fig')] \
                        = self._plot_render.plot_market_trade_timeline(
                            title=title,
                            sparse_market_trade_df=trade_order_df_dict[
                                ticker_label + 'sparse_market_' + trade_order],
                            lines_to_plot=lines_to_plot,
                            candlestick_fig=trade_order_df_dict[ticker_label + 'candlestick_fig'])

    return trade_order_df_dict
def __init__(self, version=constants.tcapy_version):
    """Initialises the TCA engine with its helper objects and the market/trade loader for the
    requested tcapy version."""
    self._util_func = UtilFunc()
    self._tca_market_trade_loader = Mediator.get_tca_market_trade_loader(version=version)
    self._time_series_ops = TimeSeriesOps()
    self._trade_order_tag = TradeOrderFilterTag()

    LoggerManager.getLogger(__name__).info(
        "Init TCAEngine version: " + self._tca_market_trade_loader.get_tca_version()
        + " - Env: " + constants.env)
def _apply_summary_metrics(self, tca_request_list, trade_order_results_df_dict, market_df_dict):
    """Applies the user specified ResultsForm aggregations (eg. timelines, distributions) to every
    trade/order DataFrame, then joins together any tables requested via JoinTables, adding all the
    aggregated outputs to the results dict.

    Parameters
    ----------
    tca_request_list : TCARequest (list)
        TCA requests (all sharing the same ticker/results_form/join_tables settings)

    trade_order_results_df_dict : dict
        Trade/order results keyed by name; extended in place with the aggregated outputs

    market_df_dict : dict
        Market data keyed by ticker

    Returns
    -------
    dict
        The same dict, with the ResultsForm/JoinTable outputs added
    """
    trade_order_list = self._util_func.dict_key_list(trade_order_results_df_dict.keys())

    if not (isinstance(trade_order_list, list)):
        trade_order_list = [trade_order_list]

    # First get the market data
    market_df = market_df_dict[tca_request_list[0].ticker]

    logger = LoggerManager.getLogger(__name__)

    logger.debug("Constructing results form to summarize analysis...")

    # Calculate user specified aggregate result forms (eg. timelines, distribution etc.) for each trade/order
    # which has been selected
    results_form = tca_request_list[0].results_form
    join_tables = tca_request_list[0].join_tables

    # If dummy market (ie. don't return market data to the user) has been specified then market data cannot
    # be included in ResultsForm calculations
    if results_form is not None:
        # FIX: iterate over a snapshot of the keys - the dict is mutated inside the loop as aggregated
        # results are added, and previously the key list was recomputed on every iteration
        for current_key in list(trade_order_list):
            # Ignore 'fig' objects which are Plotly JSON Figures, and only process DataFrames
            if 'df' in current_key:
                for r in results_form:
                    # Filter the trades for the event type which has been requested (eg. 'trade' or 'placement')
                    trade_order_df = self._trade_order_tag.filter_trade_order(
                        trade_order_df=trade_order_results_df_dict[current_key],
                        tag_value_combinations={'event_type': tca_request_list[0].event_type})

                    # Calculate aggregate ResultForm
                    results = r.aggregate_results(
                        trade_order_df=trade_order_df, market_df=market_df,
                        trade_order_name=current_key)

                    if results[0] is not None:
                        for results_form_df, results_form_name in results:
                            trade_order_results_df_dict[results_form_name] = results_form_df

    logger.debug("Now join table results...")

    # As a final stage, join together any tables which have been specified by the user
    # for example: does the user want to combine certain metrics or trades together?
    if join_tables is not None:
        for j in join_tables:
            results = j.aggregate_tables(df_dict=trade_order_results_df_dict)

            if results != []:
                if results[0] is not None:
                    for results_form_df, results_form_name in results:
                        trade_order_results_df_dict[results_form_name] = results_form_df

    logger.debug("Finished calculating results form and join table results!")

    return trade_order_results_df_dict
def calculate_benchmark(self, market_df=None, mid=None, bid=None, ask=None, bid_mid_bp=None, ask_mid_bp=None,
                        overwrite_bid_ask=None):
    """Ensures the market data has mid, bid and ask columns plus bid-mid/ask-mid spread columns,
    either derived from real quotes or synthesized from user specified spreads in basis points.

    Parameters
    ----------
    market_df : DataFrame
        Market data to augment

    mid : str (default: None - use field variable)
        Column name for the mid price

    bid : str (default: None - use field variable)
        Column name for the bid price

    ask : str (default: None - use field variable)
        Column name for the ask price

    bid_mid_bp : float (default: None - use field variable)
        Synthetic bid-mid spread in basis points

    ask_mid_bp : float (default: None - use field variable)
        Synthetic ask-mid spread in basis points

    overwrite_bid_ask : bool (default: None - use field variable)
        Should existing bid/ask columns be replaced with a synthetic proxy?

    Returns
    -------
    DataFrame
    """
    if self._check_empty_benchmark_market_data(market_df):
        return market_df

    if mid is None: mid = self._mid
    if bid is None: bid = self._bid
    if ask is None: ask = self._ask
    if bid_mid_bp is None: bid_mid_bp = self._bid_mid_bp
    if ask_mid_bp is None: ask_mid_bp = self._ask_mid_bp
    if overwrite_bid_ask is None: overwrite_bid_ask = self._overwrite_bid_ask

    bid_mid_bp = float(bid_mid_bp)
    ask_mid_bp = float(ask_mid_bp)

    # Derive the mid from bid/ask when it does not already exist
    if mid not in market_df.columns:
        market_df[mid] = (market_df[bid].values + market_df[ask].values) / 2.0

    # Calculate the bid-mid and ask-mid spreads from market data
    if bid in market_df.columns and ask in market_df.columns and not (overwrite_bid_ask):
        # FIX: previously used pd.eval('market_df.bid ...') etc. with hard-coded 'bid'/'mid'/'ask'
        # attribute names, which broke whenever non-default column names were supplied
        market_df[bid + '_' + mid + '_spread'] = (market_df[bid].values / market_df[mid].values) - 1.0
        market_df[ask + '_' + mid + '_spread'] = (market_df[mid].values / market_df[ask].values) - 1.0

    # If we have been asked to overwrite bid/ask columns with an artificial proxy
    elif bid in market_df.columns and ask in market_df.columns and overwrite_bid_ask:
        # otherwise if we don't have sufficient bid/ask data (and only mid data), or if we want to forecibly
        # overwrite it, create a synthetic bid/ask and use the user specified spread
        market_df[bid + '_' + mid + '_spread'] = -bid_mid_bp / 10000.0
        market_df[ask + '_' + mid + '_spread'] = -ask_mid_bp / 10000.0

        market_df[bid] = (market_df[mid].values) * (1.0 - bid_mid_bp / 10000.0)
        market_df[ask] = (market_df[mid].values) / (1.0 - ask_mid_bp / 10000.0)

    # If we only have the mid column
    elif mid in market_df.columns and bid not in market_df.columns and ask not in market_df.columns:
        market_df[bid + '_' + mid + '_spread'] = -bid_mid_bp / 10000.0
        market_df[ask + '_' + mid + '_spread'] = -ask_mid_bp / 10000.0

        market_df[bid] = (market_df[mid].values) * (1.0 - bid_mid_bp / 10000.0)
        market_df[ask] = (market_df[mid].values) / (1.0 - ask_mid_bp / 10000.0)
    else:
        LoggerManager().getLogger(__name__).warning(
            "Couldn't calculate spread from mid, check market data has appropriate fields.")

    return market_df
def _fill_reporting_spot(self, ticker, trade_df, start_date, finish_date, tca_request):
    """Fetches a 'mid' market data series for the ticker and looks up the reporting spot rate at each
    trade timestamp (used later to convert notionals into the reporting currency).

    Parameters
    ----------
    ticker : str
        Ticker used for the currency conversion

    trade_df : DataFrame
        Trades needing a reporting spot

    start_date, finish_date : Timestamp
        Market data window to request

    tca_request : TCARequest
        Parameters for the TCA (market data source settings)

    Returns
    -------
    Series (or None), DataFrame
        Reporting spot per trade time (None if it could not be computed) and the trade data stripped
        to the market data window
    """
    logger = LoggerManager.getLogger(__name__)

    market_request = MarketRequest(
        start_date=start_date, finish_date=finish_date, ticker=ticker,
        data_store=tca_request.market_data_store,
        data_offset_ms=tca_request.market_data_offset_ms,
        use_multithreading=tca_request.use_multithreading,
        market_data_database_table=tca_request.market_data_database_table,
        multithreading_params=tca_request.multithreading_params)

    market_conversion_df = self.get_market_data(market_request)

    # Make sure the trades/orders are within the market data (for the purposes of the reporting spot)
    # we don't need to consider the length of the order, JUST the starting point
    trade_df = self.strip_trade_order_data_to_market(trade_df, market_conversion_df,
                                                     consider_order_length=False)

    reporting_spot = None

    # need to check whether we actually have any trade data/market data
    if trade_df is not None and market_conversion_df is not None:
        if not (trade_df.empty) and not (market_conversion_df.empty):
            try:
                reporting_spot = \
                    self._time_series_ops.vlookup_style_data_frame(trade_df.index, market_conversion_df,
                                                                   'mid')[0]
            except Exception:
                # FIX: narrowed from a bare except
                logger.error("Reporting spot is missing for this trade data sample!")

    if reporting_spot is None:
        market_start_finish = "No market data in this sample. "

        # FIX: also guard against an empty market DataFrame before indexing it
        if market_conversion_df is not None and not market_conversion_df.empty:
            market_start_finish = "Market data is between " + str(market_conversion_df.index[0]) \
                                  + " - " + str(market_conversion_df.index[-1]) + ". "

        logger.warning(market_start_finish)

        # FIX: previously this crashed with AttributeError/IndexError when trade_df was None/empty
        if trade_df is not None and not trade_df.empty:
            logger.warning("Trade data is between " + str(trade_df.index[0]) + " - "
                           + str(trade_df.index[-1]) + ".")

        logger.warning("Couldn't get spot data to convert notionals currency. Hence not returning trading data.")

    return reporting_spot, trade_df
def combine_resampled_spot_data_into_single_dataframe(resample_freq='1min', data_vendor='dukascopy', usd_base=False):
    """Loads the resampled parquet file for every ticker, optionally flips pairs so USD becomes the
    base currency, then joins everything into one wide DataFrame and writes it to a single parquet file.

    Parameters
    ----------
    resample_freq : str (default: '1min')
        Resampling frequency used when the per-ticker files were generated

    data_vendor : str (default: 'dukascopy')
        Which vendor's files to read

    usd_base : bool (default: False)
        Should ...USD pairs be inverted so USD is the base currency?
    """
    logger = LoggerManager.getLogger(__name__)

    df_list = []

    for ticker in ticker_combined_mkt:
        logger.info("Reading " + ticker + " resample freq " + resample_freq + " data vendor " + data_vendor)

        df = pd.read_parquet(csv_output + ticker + '_' + resample_freq + '_' + data_vendor + '.' + file_extension)

        base = ticker[0:3]
        terms = ticker[3:6]

        if usd_base and terms == 'USD':
            # Flip the quote, so USD becomes the base currency
            inverted = pd.DataFrame(index=df.index)
            inverted[terms + base + '.close'] = 1.0 / df[ticker + '.close']
            inverted[terms + base + '.open'] = 1.0 / df[ticker + '.open']

            # Invert high and low! (the inverse of a high is a low and vice versa)
            inverted[terms + base + '.high'] = 1.0 / df[ticker + '.low']
            inverted[terms + base + '.low'] = 1.0 / df[ticker + '.high']
            inverted[terms + base + '.tickcount'] = df[ticker + '.tickcount']

            df = inverted

        df_list.append(df)

    logger.info("Combining all tickers with resample freq " + resample_freq + " data vendor " + data_vendor)

    # Add a unit USDUSD series (indexed like the last ticker read)
    unit_df = pd.DataFrame(index=df.index)
    unit_df['USDUSD.close'] = 1.0
    df_list.append(unit_df)

    df = calculations.join(df_list, how='outer')
    df = df.dropna()

    if usd_base:
        combined_file = 'fx_' + resample_freq + '_' + data_vendor + '_usd_base.' + file_extension
    else:
        combined_file = 'fx_' + resample_freq + '_' + data_vendor + '.' + file_extension

    df.to_parquet(csv_output + combined_file)
def aggregate_tables(self, df_dict=None, tables_dict=None, round_figures_by=None, scalar=None):
    """Collects the tables named in tables_dict from the calculation output, optionally tags their
    columns, scales/rounds them, and joins them into one combined table.

    Parameters
    ----------
    df_dict : dict (default: None)
        Calculation output tables keyed by name

    tables_dict : dict (default: None - use field variable)
        Specification with 'table_name', 'table_list' and optional 'column_list'/'replace_text'

    round_figures_by : int (default: None - use field variable)
        Decimal places to round to

    scalar : float (default: None - use field variable)
        Scalar to multiply each table by

    Returns
    -------
    list
        [(joined DataFrame, table_name)] or [] when none of the tables were found
    """
    logger = LoggerManager.getLogger(__name__)

    # FIX: avoid mutable default arguments (previously df_dict={} / tables_dict={})
    if df_dict is None:
        df_dict = {}

    if tables_dict is None or tables_dict == {}:
        tables_dict = self._tables_dict

    if round_figures_by is None: round_figures_by = self._round_figures_by
    if scalar is None: scalar = self._scalar

    joined_results = []

    table_name = tables_dict['table_name']
    table_list = tables_dict['table_list']

    column_list = tables_dict.get('column_list')
    replace_text = tables_dict.get('replace_text')

    agg_results = []

    for i in range(0, len(table_list)):
        table = table_list[i]

        # If the table is in the output
        if table in df_dict.keys():
            df = df_dict[table].copy()

            # Tag the columns, so the same metric from different tables can be distinguished
            if column_list is not None and column_list != []:
                df.columns = [x + ' ' + column_list[i] for x in df.columns]

            df = self._util_func.replace_text_in_cols(df, replace_text)

            # Round/multiply elements in the table if requested
            if df is not None:
                df = self._time_series_ops.multiply_scalar_dataframe(df, scalar=scalar)
                df = self._time_series_ops.round_dataframe(df, round_figures_by)

            agg_results.append(df)
        else:
            # FIX: corrected log message typo ("are you use" -> "are you sure")
            logger.warning(table + ' not in calculation output, are you sure the dictionary entry is correct?')

    # If we've collected the tables, try doing a join on all them
    # to combine them into one large table
    if agg_results != []:
        if len(agg_results) > 1:
            df_joined = self._time_series_ops.outer_join(agg_results)
        else:
            df_joined = agg_results[0]

        joined_results.append((df_joined, table_name))

    return joined_results
def vlookup_style_data_frame(self, dt, data_frame, search_field, timedelta_amount=None, just_before_point=True):
    """Does a VLOOKUP style search in a DataFrame given a set of times for a particular field. We
    assume both our DataFrame and the dates to look up are sorted, oldest first.

    Parameters
    ----------
    dt : DateTimeIndex list
        Dates to be looked up

    data_frame : DataFrame
        The DataFrame where we wish to do our lookup

    search_field : str
        Which field do we want to output

    timedelta_amount : TimeDelta (default: None)
        How much we wish to perturb our search times

    just_before_point : bool (default: True)
        Should we fetch the point just before (in the case of not matching), which would be
        necessary for slippage calculations; for market impact we would likely want False
        (ie. points just after)

    Returns
    -------
    Series, DateTimeIndex
        The looked-up values reindexed by the requested times, plus the actual matched timestamps

    Raises
    ------
    ValidationException
        When the lookup times fall outside the DataFrame's time span
    """
    logger = LoggerManager.getLogger(__name__)

    if dt is None or len(dt) == 0:
        return None, None

    # The lookup times must sit strictly inside the search space
    if dt[0] <= data_frame.index[0] or dt[-1] >= data_frame.index[-1]:
        err_msg = "Lookup data (eg. trade) does not fully overlap with the main search space of data (eg. market)"

        logger.error(err_msg)

        raise ValidationException(err_msg)

    matched_indices = self.search_series(data_frame, dt, timedelta_amount=timedelta_amount,
                                         just_before_point=just_before_point)

    looked_up = data_frame[search_field].iloc[matched_indices]
    actual_dt = looked_up.index

    # Reindex the results by the requested times, returning the matched times separately
    looked_up.index = dt

    return looked_up, actual_dt
def check_empty_combined_dataframe_dict(self, df_dict=None):
    """Checks whether the combined DataFrame dict contains any usable data, dropping None/empty
    entries in place; returns True when at least one non-empty DataFrame remains."""
    if df_dict is None:
        df_dict = self.get_combined_dataframe_dict()

    logger = LoggerManager().getLogger(__name__)

    if df_dict is None or len(df_dict.keys()) == 0:
        return False

    # Collect keys to drop first, so we don't mutate the dict while iterating it
    empty_keys = []

    for name in df_dict.keys():
        entry = df_dict[name]

        if entry is None or entry.empty:
            logger.warn("Market/trade/order data not in " + name)
            empty_keys.append(name)

    for name in empty_keys:
        df_dict.pop(name)

    return len(df_dict.keys()) != 0
def __init__(self, temp_data_folder=constants.temp_data_folder,
             temp_large_data_folder=constants.temp_large_data_folder, tickers=None, data_store=None):
    """Initialises the object, storing the temp folder paths (warning when they don't exist),
    the helper objects, the data store and optionally the tickers to process.

    Parameters
    ----------
    temp_data_folder : str (default: from constants)
        Folder for small temporary data files

    temp_large_data_folder : str (default: from constants)
        Folder for large temporary data files

    tickers : list (default: None)
        Tickers to process

    data_store : str (default: None)
        Where the data is stored
    """
    self.temp_data_folder = temp_data_folder
    self.temp_large_data_folder = temp_large_data_folder
    self.tickers = None

    self.util_func = UtilFunc()
    self.time_series_ops = TimeSeriesOps()
    self.data_store = data_store

    logger = LoggerManager().getLogger(__name__)

    if not (os.path.isdir(self.temp_data_folder)):
        logger.warn("Temp data folder " + self.temp_data_folder + " does not exist")

    if not (os.path.isdir(self.temp_large_data_folder)):
        # FIX: previously this logged temp_data_folder instead of temp_large_data_folder
        logger.warn("Temp large data folder " + self.temp_large_data_folder + " does not exist")

    if tickers is not None:
        self.tickers = tickers
def _check_trade_order_type(self, trade_order_type):
    """Validates that trade_order_type is one of the defined trade/order names.

    Parameters
    ----------
    trade_order_type : str
        Trade/order name to check

    Returns
    -------
    str
        The unchanged trade_order_type when valid

    Raises
    ------
    ValidationException
        If the trade/order name is not defined
    """
    try:
        valid_trade_order_type = constants.trade_order_list

        if trade_order_type not in valid_trade_order_type:
            # FIX: the original used "&" instead of "+" for string concatenation, which raised a
            # TypeError that the bare "except: pass" then swallowed - so validation never fired
            err_msg = trade_order_type + " is not a defined trade or order."

            # don't make LoggerManager field variable so this can be pickled (important for Celery)
            LoggerManager().getLogger(__name__).error(err_msg)

            raise ValidationException(err_msg)
    except ValidationException:
        raise
    except Exception:
        # Tolerate non-string/unhashable inputs where the membership test itself fails
        pass

    return trade_order_type
def _join_market_downsampled_trade_orders(self, market_downsampled_df, trade_order_df, fields=None):
    """Combine market data with trade/orders, into a sparse DataFrame. Typically, used when preparing
    to display a mixture of market/trades data together.

    Parameters
    ----------
    market_downsampled_df : DataFrame
        Market data which has been downsampled

    trade_order_df : DataFrame
        Trade/order data to be combined

    fields : str (list)
        Fields to keep

    Returns
    -------
    DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    if fields is not None:
        trade_order_df = self._time_series_ops.filter_time_series_by_matching_columns(
            trade_order_df, fields)

    logger.debug('About to join')

    sparse_market_trade_df = market_downsampled_df.join(trade_order_df, how='outer')

    # FIX: removed leftover debug print('x') that fired when 'executed_price' was missing
    # NOTE(review): if 'executed_price' is genuinely absent the lookups below raise KeyError -
    # confirm whether callers guarantee the field is present

    # Add buy/sell trade prices in new columns (easier for plotting later)
    executed_price = sparse_market_trade_df['executed_price'].values
    side_to_match = sparse_market_trade_df['side'].values

    sparse_market_trade_df['buy_trade'] \
        = self._time_series_ops.nanify_array_based_on_other(side_to_match, -1,
                                                            executed_price)  # make sells NaN (NOT buys!)
    sparse_market_trade_df['sell_trade'] \
        = self._time_series_ops.nanify_array_based_on_other(side_to_match, 1,
                                                            executed_price)  # make buys NaN (NOT sells!)

    logger.debug('Finished joining')

    return sparse_market_trade_df
def _fetch_market_data(self, start, finish, ticker, write_to_disk=True, read_cached_from_disk=True,
                       web_proxies=constants.web_proxies):
    """Fetches market data for a ticker over a window, preferring the on-disk cache before calling
    the external vendor, and optionally caching fresh downloads back to disk.

    Parameters
    ----------
    start, finish : str/Timestamp
        Window to download

    ticker : str
        tcapy ticker to download (converted to a vendor ticker for the external call)

    write_to_disk : bool (default: True)
        Cache a fresh download to disk?

    read_cached_from_disk : bool (default: True)
        Try the on-disk cache first?

    web_proxies : dict (default: from constants)
        Proxy settings for the vendor call

    Returns
    -------
    DataFrame (or None), str (or None)
        The market data plus a message when nothing was returned (weekend or no data)
    """
    logger = LoggerManager.getLogger(__name__)

    key = (str(start) + str(finish) + ticker + '_' + self._get_postfix()).replace(":", '_')
    filename = os.path.join(self.temp_data_folder, key) + '.' + fileformat

    util_func = UtilFunc()

    start_time_stamp = pd.Timestamp(start)
    finish_time_stamp = pd.Timestamp(finish)

    # Skip windows that fall entirely inside the weekend, when the source has no weekend points
    if self._remove_weekend_points():
        weekend_data = "Weekend? " + key

        # NOTE(review): friday_close_nyc_hour is fed constants.friday_close_utc_hour - confirm the
        # units/timezone are intentional
        weekday_point = UtilFunc().is_weekday_point(
            start_time_stamp, finish_time_stamp,
            friday_close_nyc_hour=constants.friday_close_utc_hour,
            sunday_open_utc_hour=constants.sunday_open_utc_hour)

        if not weekday_point:
            return None, weekend_data

    df = None

    # Try the on-disk cache first
    if read_cached_from_disk and os.path.exists(filename):
        df = util_func.read_dataframe_from_binary(filename, format=binary_format)

        if df is not None:
            logger.debug("Read " + filename + " from disk")

    if df is None:
        # Cache miss: download from the vendor (converting the tcapy ticker into a vendor ticker)
        df = self._get_input_data_source().fetch_market_data(
            start, finish, ticker=self._get_tickers_vendor()[ticker], web_proxies=web_proxies)

        if df is not None and write_to_disk:
            # Write a small temporary dataframe to disk (if the process fails later, these can be
            # picked up, without having a call the external vendor again)
            util_func.write_dataframe_to_binary(df, filename, format=binary_format)

    msg = None

    if df is None:
        msg = "No data? " + key

    return df, msg
def calculate_benchmark(self, market_df=None, field=None, bid=None, ask=None):
    """Ensures the requested price field (typically 'mid') exists in the market data, deriving it as
    the average of the bid and ask columns when absent."""
    if self._check_empty_benchmark_market_data(market_df):
        return market_df

    field = self._field if field is None else field
    bid = self._bid if bid is None else bid
    ask = self._ask if ask is None else ask

    # If the 'mid' price does not already exist in the market data, calculate it from the
    # underlying bid/ask prices
    if field not in market_df.columns:
        market_df[field] = (market_df[bid].values + market_df[ask].values) / 2.0
    else:
        # NOTE(review): this warns when the field already exists, yet the message says the
        # opposite - confirm the intended branch/message
        LoggerManager().getLogger(__name__).warning(field + " not in market data")

    return market_df
def get(self, key, burn_after_reading=False):
    """Gets the object(s) associated with the key(s) or CacheHandle(s)

    Parameters
    ----------
    key : str or CacheHandle (list)
        Key(s) to be fetched

    burn_after_reading : bool (default: False)
        Should the key be erased after reading?

    Returns
    -------
    object
    """
    logger = LoggerManager.getLogger(__name__)

    # Work on a copy so the caller's key list isn't modified in place
    key = copy.copy(key)

    single = False

    if not (isinstance(key, list)):
        key = [key]
        single = True

    # CacheHandles are referenced by their underlying handle name
    for i in range(0, len(key)):
        if isinstance(key[i], CacheHandle):
            key[i] = key[i].handle_name

    obj = None

    try:
        obj = self._get(key, burn_after_reading=burn_after_reading)
    except Exception as e:
        logger.warning("Couldn't retrieve " + str(key) + " from cache: " + str(e))

    # FIX: removed leftover debug print("market_df") for 'market_df' keys

    # A single (non-list) request unwraps to the single cached object
    if single and obj is not None:
        return obj[0]

    return obj
def get_trade_order_holder(self, tca_request):
    """Loads every trade/order type listed in the request's trade_order_mapping into a single
    DataFrameHolder.

    Parameters
    ----------
    tca_request : TCARequest
        Parameters for a TCA calculation

    Returns
    -------
    DataFrameHolder
    """
    logger = LoggerManager.getLogger(__name__)

    logger.debug("Get trade order holder for " + str(tca_request.ticker) + " from " +
                 str(tca_request.start_date) + " - " + str(tca_request.finish_date))

    # Get all the trade/orders which have been requested, eg. trade_df and order_df
    # do separate calls given they are assumed to be stored in different database tables
    holder = DataFrameHolder()

    if tca_request.trade_order_mapping is not None:
        for trade_order_type in tca_request.trade_order_mapping:
            holder.add_dataframe(self.get_trade_order_data(tca_request, trade_order_type),
                                 trade_order_type)

    return holder
def _download(self, md_request, folder_prefix):
    """Downloads tick data for a single ticker via findatapy, derives mid/venue/ticker columns and
    dumps the result to an HDF5 file named after the folder prefix and ticker."""
    from findatapy.market import MarketDataRequest, MarketDataGenerator, Market

    logger = LoggerManager.getLogger(__name__)

    market = Market(market_data_generator=MarketDataGenerator())
    ticker = md_request.ticker[0]

    df = market.fetch_market(md_request=md_request)
    df.columns = ['bid', 'ask', 'bidv', 'askv']

    df['venue'] = 'dukascopy'
    df['ticker'] = ticker
    df['mid'] = (df['bid'].values + df['ask'].values) / 2.0

    output_path = folder_prefix + "_" + ticker + ".h5"

    self.dump_hdf5_file(df, output_path)

    logger.info('Dumped to ' + output_path)
def calculate_benchmark_market(self, market_df, tca_request):
    """Applies every BenchmarkMarket calculation in the request to the market data. These benchmarks
    only modify market data and need no trade specific information."""
    logger = LoggerManager.getLogger(__name__)

    benchmark_calcs = tca_request.benchmark_calcs

    # Calculations on market data only, and only when the market data is valid
    if self._check_valid_market(market_df):
        for calc in benchmark_calcs:
            # For benchmarks which only modify market data (and don't need trade specific information)
            if isinstance(calc, BenchmarkMarket):
                logger.debug("Calculating " + type(calc).__name__ + " for market data")

                market_df = calc.calculate_benchmark(market_df=market_df)

    return market_df
def _check_is_empty_trade_order(self, trade_df, tca_request, start_date, finish_date, trade_order_type):
    """Returns True (and logs a warning) when the trade DataFrame is missing or empty, otherwise
    False."""
    # The None and empty cases carry the same warning, so they collapse into one branch
    if trade_df is None or trade_df.empty:
        LoggerManager.getLogger(__name__).warning(
            "Missing trade data for " + tca_request.ticker + " between " + str(start_date)
            + " - " + str(finish_date) + " in " + trade_order_type)

        return True

    return False