def _get_market_trade_metrics(self, tca_request_list, dummy_market):
    """Fetches the market and trade data for every TCA request, then computes metrics on them

    Parameters
    ----------
    tca_request_list : TCARequest (list)
        Requests for multiple TCARequests (eg. for different tickers)

    dummy_market : bool
        Return dummy market data?

    Returns
    -------
    DataFrame (dict), DataFrame (dict)
    """
    loader = Mediator.get_tca_ticker_loader(version=self._version)

    market_df_dict = {}
    trade_order_holder = DataFrameHolder()

    for request in tca_request_list:
        # Grab the raw market data and trade/order DataFrames for this single request
        raw_market_df, raw_trade_order_dict = loader.get_market_trade_order_holder(request)

        # Calculate the benchmarks/metrics for this single ticker
        market_df, trade_order_df_list, ticker, trade_order_keys = \
            loader.calculate_metrics_single_ticker(
                (raw_market_df, raw_trade_order_dict), request, dummy_market)

        market_df_dict[ticker] = market_df

        trade_order_holder.add_dataframe_dict(
            dict(zip(trade_order_keys, trade_order_df_list)))

    # Unpack the DataFrameHolder into a dictionary (combining the lists of trades, orders etc.
    # into single DataFrames) - this may also decompress the trades
    trade_order_results_df_dict = trade_order_holder.get_combined_dataframe_dict()

    return market_df_dict, trade_order_results_df_dict
def test_data_frame_holder():
    """Tests the storing of DataFrameHolder object which is like an enhanced dict
    specifically for storing DataFrames, alongside using the VolatileCache
    """
    from tcapy.analysis.dataframeholder import DataFrameHolder
    from tcapy.data.volatilecache import VolatileRedis as VolatileCache

    volatile_cache = VolatileCache()

    # Create a very large DataFrame, which needs to be chunked in storage
    index = pd.date_range(start='01 Jan 2000', end='05 Jan 2020', freq='10s')
    df = pd.DataFrame(index=index, columns=['bid', 'mid', 'ask'])

    for col in ['bid', 'mid', 'ask']:
        df[col] = np.ones(len(index))

    # Split in two halves so the holder has to recombine them
    df_lower, df_higher = TimeSeriesOps().split_array_chunks(df, chunks=2)

    # '_comp' suffix exercises the compressed key path, '' the plain path
    for suffix in ['_comp', '']:
        key = 'EURUSD_df' + suffix

        df_holder = DataFrameHolder()

        df_holder.add_dataframe(
            volatile_cache.put_dataframe_handle(df_lower, use_cache_handles=True), key)
        df_holder.add_dataframe(
            volatile_cache.put_dataframe_handle(df_higher, use_cache_handles=True), key)

        combined = df_holder.get_combined_dataframe_dict()

        # The recombined DataFrame should exactly match the original
        assert_frame_equal(df, combined[key])
def _parallel_get_market_trade_metrics(self, tca_request_list, dummy_market):
    """Obtains market and trade/order data for every TCA request and computes metrics on them,
    farming out the work either via Celery (in parallel) or running it sequentially
    ('single' - mainly for debugging).

    Parameters
    ----------
    tca_request_list : TCARequest (list)
        Requests for multiple TCARequests (eg. for different tickers)

    dummy_market : bool
        Return dummy market data?

    Returns
    -------
    DataFrame (dict), DataFrame (dict)
    """
    logger = LoggerManager.getLogger(__name__)

    # Holders which accumulate per-ticker market data and trade/order DataFrames
    market_holder_list = DataFrameHolder()
    trade_order_holder_list = DataFrameHolder()

    # For each currency pair select collect the trades and market data, then calculate benchmarks and slippage
    result = []

    keep_looping = True

    # If we have also asked for trades/order
    # point-in-time executions only => trade_order_mapping contains solely 'trade_df'
    if tca_request_list[0].trade_order_mapping is not None:
        point_in_time_executions_only = \
            self._util_func.dict_key_list(tca_request_list[0].trade_order_mapping) == ['trade_df']
    else:
        point_in_time_executions_only = True

    # NOTE(review): assumes every request in the list shares the same parallel_library setting
    parallel_library = tca_request_list[0].multithreading_params['parallel_library']

    if parallel_library == 'single':
        # from tcapy.analysis.tcatickerloaderimpl import TCATickerLoaderImpl
        # Only need a local loader when running sequentially (Celery workers load their own)
        tca_ticker_loader = Mediator.get_tca_ticker_loader(
            version=self._version)

    start_date = tca_request_list[0].start_date
    finish_date = tca_request_list[0].finish_date

    # Parameters for the loop
    i = 0
    no_of_tries = 5

    # Error trapping for Celery, if have failed event retry it
    while i < no_of_tries and keep_looping:
        try:
            # For each TCA request kick off a thread
            for tca_request_single_ticker in tca_request_list:

                # Split up the request by date (monthly/weekly chunks)
                tca_request_date_split = self._split_tca_request_by_date(
                    tca_request_single_ticker, tca_request_single_ticker.ticker,
                    period=tca_request_single_ticker.multithreading_params['cache_period'])

                # Load-everything-then-compute path: needed when date splicing is disabled,
                # for detailed/compliance/candlestick output, or when orders can straddle
                # the date-chunk boundaries (ie. not point-in-time executions only)
                if not(constants.multithreading_params['splice_request_by_dates']) \
                        or tca_request_list[0].tca_type == 'detailed' \
                        or tca_request_list[0].tca_type == 'compliance' \
                        or tca_request_list[0].summary_display == 'candlestick'\
                        or not(point_in_time_executions_only):

                    if 'celery' in parallel_library:
                        # Load all the data for this ticker and THEN calculate the metrics on it
                        # (chord = run the data fetches in parallel, then one metrics callback)
                        result.append(
                            chord(
                                (get_market_trade_holder_via_celery.s(tca_request_data)
                                 for tca_request_data in tca_request_date_split),
                                calculate_metrics_single_ticker_via_celery.s(
                                    tca_request_single_ticker, dummy_market)).apply_async())

                    elif parallel_library == 'single':
                        # This is not actually parallel, but is mainly for debugging purposes
                        for tca_request_s in tca_request_date_split:
                            # print(tca_request_s.start_date)
                            market_df, trade_order_df_dict = tca_ticker_loader.get_market_trade_order_holder(
                                tca_request_s, return_cache_handles=False)

                            market_df, trade_order_df_list, ticker, trade_order_keys = \
                                tca_ticker_loader.calculate_metrics_single_ticker(
                                    (market_df, trade_order_df_dict), tca_request_s, dummy_market)

                            market_holder_list.add_dataframe(market_df, ticker)

                            trade_order_holder_list.add_dataframe_dict(
                                dict(zip(trade_order_keys, trade_order_df_list)))

                else:
                    # Otherwise work on parallel chunks by date
                    # doesn't currently work with orders which straddle day/week/month boundaries
                    # but should work with points in time
                    #
                    # In practice, it's not really much faster than the above code
                    if 'celery' == parallel_library:
                        # For each ticker/date combination load data and process chunk (so can do fully in parallel)
                        result.append(
                            group(
                                get_market_trade_holder_and_calculate_metrics_single_ticker_via_celery.s(
                                    tca_request_data, dummy_market)
                                for tca_request_data in tca_request_date_split).apply_async())

            # Now combine the results from the parallel operations, if using celery
            if 'celery' in parallel_library:

                # Careful, when the output is empty!
                output = [p.get(timeout=constants.celery_timeout_seconds)
                          for p in result if p is not None]

                # If pipelined/splice_request_by_dates will have two lists so flatten it into one
                output = self._util_func.flatten_list_of_lists(output)

                for market_df, trade_order_df_list, ticker, trade_order_keys in output:
                    market_holder_list.add_dataframe(market_df, ticker)
                    # market_df_dict[ticker] = market_df

                    trade_order_holder_list.add_dataframe_dict(
                        dict(zip(trade_order_keys, trade_order_df_list)))

                # Free the (potentially large) Celery results before recombining
                del result
                del output

            keep_looping = False

        # Domain exceptions are NOT retried - re-raise immediately
        # NOTE(review): the keep_looping assignments after each `raise e` are unreachable
        except DateException as e:
            raise e

            keep_looping = False

        except TradeMarketNonOverlapException as e:
            raise e

            keep_looping = False

        except DataMissingException as e:
            raise e

            keep_looping = False

        except ErrorWritingOverlapDataException as e:
            raise e

            keep_looping = False

        # Exception likely related to Celery and possibly lack of communication with Redis message broker
        # or Memcached results backend
        # except Exception as e:
        except Exception as e:
            # Give up after the final attempt, preserving the traceback text in the message
            if i == no_of_tries - 1:
                err_msg = "Failed with " + parallel_library + " after multiple attempts: " + str(e) \
                    + ", " + str(traceback.format_exc())

                raise Exception(err_msg)

            i = i + 1

            logger.warning("Failed with " + parallel_library + ", trying again for " + str(i)
                           + " time: " + str(e) + ", " + str(traceback.format_exc()))

    logger.debug("Finished parallel computation")

    # Expand out the DataFrame holders into dictionaries of DataFrames
    market_df_dict = market_holder_list.get_combined_dataframe_dict()
    trade_order_results_df_dict = trade_order_holder_list.get_combined_dataframe_dict(
        start_date=start_date, finish_date=finish_date)

    # TODO add candlestick drawing here for cases when using split threading by date

    # Strip out downsampled market data keys, which are not returned to the caller
    trade_order_results_df_dict = self._util_func.remove_keymatch_dict(
        trade_order_results_df_dict, 'market_df_downsampled')

    return market_df_dict, trade_order_results_df_dict