def get_spread(ob_state: dd) -> float:
    """Return the bid/ask spread of an order book snapshot.

    The spread is the lowest sell-side price minus the highest buy-side price.

    Args:
        ob_state: Order book frame with a 'price' column, split into sides
            via ``DataSplitter.get_side``.

    Returns:
        ``best_ask - best_bid``.
    """
    # max()/min() are independent of row order, so there is no need to sort
    # each side before reducing it.
    best_bid = DataSplitter.get_side("buy", ob_state)['price'].max()
    best_ask = DataSplitter.get_side("sell", ob_state)['price'].min()
    return best_ask - best_bid
def get_feed_stats(df: dd) -> Dict[str, Union[int, Any]]:
    """Calculate summary statistics for a data feed.

    Args:
        df: Feed frame to summarise.

    Returns:
        A dict of message counts keyed by reason/type, plus the mean and
        standard deviation of the 'price' column of the feed's trades.
    """
    # Filter the trades once; both price statistics read the same subset.
    trades = DataSplitter.get_trades(df)

    # NOTE(review): get_total is called unqualified while every other helper
    # goes through Statistics -- confirm it resolves in this scope.
    return {
        'num_total_msgs': get_total(df),
        'num_trades': Statistics.get_reason_count('filled', df),
        'num_cancel': Statistics.get_reason_count('canceled', df),
        'num_received': Statistics.get_type_count('received', df),
        'num_open': Statistics.get_type_count('open', df),
        'num_done': Statistics.get_type_count('done', df),
        'num_match': Statistics.get_type_count('match', df),
        'num_change': Statistics.get_type_count('change', df),
        'avg_trade_price': Statistics.get_mean('price', trades),
        'std_dev_trade_price': Statistics.get_std_dev('price', trades),
    }
def __init__(self, start_ob_state_df: pd.DataFrame, st: datetime.datetime, start_seq: int):
    """Build the bid and ask heaps from an initial order book snapshot.

    Args:
        start_ob_state_df: Snapshot containing at least the columns
            'price', 'order_id', 'side' and 'size'.
        st: Stored unchanged on ``self.st``.
        start_seq: Stored unchanged on ``self.start_seq``.
    """
    self.column_order = ['price', 'order_id', 'side', 'size']

    bids = DataSplitter.get_side("buy", start_ob_state_df)
    # heapq only offers a min heap, so bid prices are stored negated to make
    # the highest bid pop first. assign() builds a new frame instead of
    # writing into whatever get_side returned, which may be a view of the
    # caller's snapshot.
    bids = bids.assign(price=-bids['price'])
    self.bids_max_heap = list(map(tuple, bids[self.column_order].values.tolist()))
    self.bids_max_heap.sort()
    heapq.heapify(self.bids_max_heap)

    asks = DataSplitter.get_side("sell", start_ob_state_df)
    self.asks_min_heap = list(map(tuple, asks[self.column_order].values.tolist()))
    self.asks_min_heap.sort()
    heapq.heapify(self.asks_min_heap)

    # Keep track of order ids which should no longer be on the book
    self.invalid_order_ids = set()

    self.st = st
    self.start_seq = start_seq
def get_buy_sell_volume_ratio(df: dd):
    """Return the ratio of total buy size to total sell size.

    Each side's 'size' column is summed and the two totals are handed to
    ``Statistics.get_ratio``, whose result is returned unchanged.
    """
    buy_side = DataSplitter.get_side("buy", df)
    sell_side = DataSplitter.get_side("sell", df)
    return Statistics.get_ratio(buy_side['size'].sum(), sell_side['size'].sum())
def get_limit_market_order_ratio(df: dd):
    """Return the ratio of limit-order count to market-order count in a feed.

    The counts are the lengths of the frames produced by the two
    ``DataSplitter`` feed filters; the result is whatever
    ``Statistics.get_ratio`` returns for them.
    """
    limit_frame = DataSplitter.get_limit_orders_from_feed(df)
    market_frame = DataSplitter.get_market_orders_from_feed(df)
    return Statistics.get_ratio(len(limit_frame), len(market_frame))
def load_split_data(real_root, start_time, end_time, product):
    """Load a product's feed and split it into orders, trades and cancellations.

    Args:
        real_root: Root location passed through to ``DataLoader.load_feed``.
        start_time: Start of the period to load.
        end_time: End of the period to load.
        product: Product to load and filter the feed by.

    Returns:
        A tuple ``(orders_df, trades_df, cancels_df)``.
    """
    product_feed = DataSplitter.get_product(
        product,
        DataLoader().load_feed(real_root, start_time, end_time, product),
    )
    return (
        DataSplitter.get_orders(product_feed),
        DataSplitter.get_trades(product_feed),
        DataSplitter.get_cancellations(product_feed),
    )
def test_sim_spread_plot(self):
    """Plot buy- and sell-side cancel prices over time for the first simulation.

    The y axis is centred on the first simulated midprice, +/- 5.
    """
    plt.figure(figsize=(12, 8))

    root = "/Users/jamesprince/project-data/results/sims/LTC-USD/2018-05-17/01:00:00/"
    all_sims = DataLoader().load_sim_data(root)

    # Each simulation is laid out as:
    # orders_dd, trades_dd, cancels_dd, midprices_dd, best_bids_dd, best_asks_dd
    # Only the frames that are actually plotted are computed.
    first_sim = all_sims[0]
    cancels_df = first_sim[2].compute()
    midprice_df = first_sim[3].compute()

    # NOTE(review): the config object is never read below. It is still built
    # in case BacktestConfig performs set-up as a side effect -- confirm and
    # drop these three lines if it does not.
    conf = configparser.ConfigParser()
    conf.read("../config/backtest.ini")
    BacktestConfig(conf)

    first_cancel_time = cancels_df['time'].iloc[0]
    cancels_df['seconds'] = (cancels_df['time'] - first_cancel_time).apply(
        lambda x: x.total_seconds())

    buy_cancels = DataSplitter.get_side("buy", cancels_df)
    sell_cancels = DataSplitter.get_side("sell", cancels_df)

    plt.plot(buy_cancels['seconds'], buy_cancels['price'], 'r+', label="Buy side cancels")
    plt.plot(sell_cancels['seconds'], sell_cancels['price'], 'b+', label="Sell side cancels")

    start_price = midprice_df['price'].iloc[0]
    plt.ylim(start_price - 5, start_price + 5)

    plt.legend()
    plt.show()
def compare_order_metrics(real_orders: pd.DataFrame, multi_sim_orders: List[pd.DataFrame]):
    """Compare metrics which only make sense in orders (e.g. buy/sell split).

    For each side in turn (buy, then sell) the real orders and every
    simulation's orders are filtered to that side, a heading is printed, and
    the pair is handed to ``Evaluation.compare_metrics``.
    """
    for side, heading in (("buy", "Buy metrics:"), ("sell", "Sell metrics:")):
        real_side = DataSplitter.get_side(side, real_orders)
        sim_sides = [DataSplitter.get_side(side, sim) for sim in multi_sim_orders]
        print(heading)
        Evaluation.compare_metrics(real_side, sim_sides)
def graph_relative_price_distribution(self, trades_df: dd, other_df: dd, num_bins=100):
    """Graph the buy- and sell-side relative price distributions of ``other_df``.

    Relative prices come from ``DataTransformer.get_relative_prices``; the
    buy-side values are negated before being graphed.

    Args:
        trades_df: Trades passed to ``get_relative_prices``.
        other_df: Frame whose buy and sell sides are graphed.
        num_bins: Bin count forwarded to ``graph_distribution``.
    """
    x_label = "Price relative to most recent trade"

    buy_side = DataSplitter.get_side("buy", other_df)
    sell_side = DataSplitter.get_side("sell", other_df)

    buy_relative = DataTransformer.get_relative_prices(trades_df, buy_side)
    buy_relative = buy_relative.apply(lambda price: -price)
    sell_relative = DataTransformer.get_relative_prices(trades_df, sell_side)

    # Graphing
    self.config.plt.figure(figsize=(12, 8))
    self.graph_distribution(buy_relative, self.data_description + ", Buy Side", x_label,
                            bins=num_bins)
    self.graph_distribution(sell_relative, self.data_description + ", Sell Side", x_label,
                            bins=num_bins)
def load_feed(cls, root, start_time: datetime, end_time: datetime, product: str, fmt: str = "parquet") -> dd:
    """Loads in a feed of real data and applies formatting to timestamp, price and size columns.

    One file per hour is read from ``root + "<YYYY-MM-DD>/<HH>.<fmt>"``,
    filtered to ``product``, formatted with ``DataLoader().format_dd`` and
    trimmed to rows with ``start_time < time < end_time`` (both exclusive).

    Args:
        root: Path prefix; the dated file name is appended by plain string
            concatenation, so it is expected to end with a separator.
        start_time: Exclusive lower bound; also selects the day and first hour.
        end_time: Exclusive upper bound; only its hour is used to decide how
            many hourly files to read.
        product: Product to keep.
        fmt: "parquet" reads with ``pd.read_parquet``; anything else is read
            with ``pd.read_csv``.

    Returns:
        The concatenated frame, or an empty ``pd.DataFrame`` when no hourly
        file falls in range.
    """
    # Assume data is on the same day and just hours apart for now
    hour_delta = end_time.hour - start_time.hour
    day_dir = start_time.date().isoformat()

    # TODO: introduce wrapping over days
    # TODO: split this function up!
    # TODO: BUG: struggles to load small blobs of data
    files_to_load = []
    for i in range(0, hour_delta + 1):
        filename = day_dir + "/" + "%02i" % (start_time.hour + i) + "." + fmt
        cls.logger.debug(filename)
        files_to_load.append(filename)

    frames = []
    for filename in files_to_load:
        file_path = root + filename
        if fmt == "parquet":
            file_df = pd.read_parquet(file_path)
        else:
            file_df = pd.read_csv(file_path)
        file_df = DataSplitter.get_product(product, file_df)
        file_df = DataLoader().format_dd(file_df)
        in_window = (start_time < file_df['time']) & (file_df['time'] < end_time)
        frames.append(file_df[in_window])

    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # accumulated frame on every iteration; concatenate once instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
def plot_orderbook(ob_state, xwindow, log_y_scale=False):
    """Plot the bid and ask sides of an order book and show the figure.

    Args:
        ob_state: Order book frame, split into sides via ``DataSplitter.get_side``.
        xwindow: Forwarded to the per-side plotting helpers.
        log_y_scale: When true, the y axis uses a logarithmic scale.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 8))

    buy_side = DataSplitter.get_side("buy", ob_state)
    sell_side = DataSplitter.get_side("sell", ob_state)

    OrderBookCreator.__plot_bid_side(buy_side, xwindow, percentile=0.9)
    OrderBookCreator.__plot_ask_side(sell_side, xwindow, percentile=0.9)

    plt.title("Order Book")
    plt.xlabel("Price")
    plt.ylabel("Cumulative size")

    if log_y_scale:
        plt.yscale('log')

    plt.legend()
    plt.show()
def test_compare_order_metrics(self):
    """Print metric comparisons between the real feed and every loaded simulation.

    Limit orders, market orders and cancels are compared per side via
    ``Evaluation.compare_order_metrics``; trades are compared as a whole via
    ``Evaluation.compare_metrics``.
    """
    sim_root = (self.config.sim_root + self.sim_st.date().isoformat() + "/"
                + self.sim_st.time().isoformat() + "/")
    all_sims = DataLoader().load_sim_data(sim_root)

    # Compute each simulation's orders once; the limit and market subsets are
    # both derived from the same materialised frame.
    all_sim_orders = [sim[0].compute() for sim in all_sims]
    all_sim_limit_orders = [DataSplitter.get_limit_orders(orders) for orders in all_sim_orders]
    all_sim_market_orders = [DataSplitter.get_market_orders(orders) for orders in all_sim_orders]
    all_sim_trades = [sim[1].compute() for sim in all_sims]
    all_sim_cancels = [sim[2].compute() for sim in all_sims]

    feed_df = DataLoader().load_feed(
        self.config.real_root, self.sim_st,
        self.sim_st + timedelta(seconds=self.config.simulation_window),
        self.config.product)

    real_orders = DataSplitter.get_orders(feed_df)
    real_limit_orders = DataSplitter.get_limit_orders(real_orders)
    real_market_orders = DataSplitter.get_market_orders(real_orders)

    # Copy before adding a column so the assignment targets an independent
    # frame rather than a filtered slice of feed_df.
    real_trades = DataSplitter.get_trades(feed_df).copy()
    real_trades['size'] = pd.to_numeric(real_trades['remaining_size'])
    real_cancels = DataSplitter.get_cancellations(feed_df).copy()
    real_cancels['size'] = pd.to_numeric(real_cancels['remaining_size'])

    print("Order Buy/Sell limit metrics")
    Evaluation.compare_order_metrics(real_limit_orders, all_sim_limit_orders)
    print("Order Buy/Sell market metrics")
    Evaluation.compare_order_metrics(real_market_orders, all_sim_market_orders)
    print("Cancel metrics")
    Evaluation.compare_order_metrics(real_cancels, all_sim_cancels)
    print("Trade metrics")
    Evaluation.compare_metrics(real_trades, all_sim_trades)
def __fetch_real_prices(self):
    """Load the real trades for the simulation window as a time/price frame.

    The 'time' column is replaced with the output of
    ``DataUtils.get_times_in_seconds_after_start`` and the first row's price
    is overwritten with the first non-NaN price in the column.

    Returns:
        A frame with the columns 'time' and 'price'.
    """
    window_end = self.sim_st + timedelta(seconds=self.config.simulation_window)
    feed_df = DataLoader().load_feed(self.config.real_root, self.sim_st, window_end,
                                     self.config.product)

    # Copy so the writes below land on an independent frame rather than a
    # filtered slice of the feed.
    trades_df = DataSplitter.get_trades(feed_df).copy()
    trades_df['time'] = DataUtils().get_times_in_seconds_after_start(trades_df['time'])

    # Write through a single indexer: the chained form
    # trades_df['price'].iloc[0] = ... may assign to a temporary and leave
    # trades_df untouched.
    first_price = DataUtils().get_first_non_nan(trades_df['price'])
    trades_df.iloc[0, trades_df.columns.get_loc('price')] = first_price

    return trades_df[['time', 'price']]
def test_get_orders_per_minute(self):
    """Print the total and per-minute average number of limit orders for one day."""
    product = "LTC-USD"
    root = "/Users/jamesprince/project-data/data/consolidated-feed/"
    day_start = datetime.datetime(2018, 5, 17, 0, 0, 0)
    day_end = datetime.datetime(2018, 5, 17, 23, 59, 59)

    feed_df = DataLoader.load_feed(root + product + "/", day_start, day_end, product)
    limit_orders = DataSplitter.get_limit_orders(DataSplitter.get_orders(feed_df))

    total_msg = str(len(limit_orders)) + " total limit orders per day for " + product
    print(total_msg)

    minutes_per_day = 24 * 60
    average_msg = (str(len(limit_orders) / minutes_per_day)
                   + " limit orders per minute (on average) for " + product)
    print(average_msg)
def graph_price_time(self, df: dd, data_desc: str, mid: int, ywindow: int):
    """Plot buy ('r+') and sell ('b+') prices against time on a new figure.

    Args:
        df: Frame to plot, split into sides via ``DataSplitter.get_side``.
        data_desc: Text inserted into the plot title.
        mid: Passed with ``ywindow`` to ``get_y_bounds`` to derive the y limits.
        ywindow: See ``mid``.

    Returns:
        ``self.config.plt``, so the caller can show or save the figure.
    """
    plt = self.config.plt
    plt.figure(figsize=(12, 8))

    buy_df = DataSplitter.get_side("buy", df)
    sell_df = DataSplitter.get_side("sell", df)
    self.__graph_price_time_set(buy_df, 'r+')
    self.__graph_price_time_set(sell_df, 'b+')

    plt.xlabel('Time (s)')
    plt.ylabel('Price ($)')

    y_low, y_high = self.get_y_bounds(mid, ywindow)
    plt.ylim(y_low, y_high)
    plt.xlim(0, self.config.simulation_window)

    plt.title(self.data_description + " " + data_desc + ' price')

    return plt
def get_price_size_corr(trades_df: dd, limit_orders: dd):
    """Return the price/size correlation of limit orders for each side.

    For each side, relative prices come from
    ``DataTransformer.get_relative_prices`` and sizes are restricted to rows
    whose index appears in those relative prices. Buy-side prices are negated
    before correlating.

    Returns:
        A dict with keys "buy" and "sell", each holding element ``[0, 1]`` of
        the matrix returned by ``Correlations.get_correlation_matrix``.
    """
    correlations = {}
    for side in ("buy", "sell"):
        orders = DataSplitter.get_side(side, limit_orders)
        relative_prices = DataTransformer.get_relative_prices(trades_df, orders)

        has_relative_price = orders['size'].index.isin(relative_prices.index)
        sizes = orders[has_relative_price]['size']

        if side == "buy":
            relative_prices = relative_prices.apply(lambda price: -price)

        matrix = Correlations.get_correlation_matrix(relative_prices, sizes)
        correlations[side] = matrix[0, 1]
    return correlations
def get_all_data(st: datetime, config):
    """Gather order book, sampling and future data around the instant ``st``.

    Args:
        st: Reference instant.
        config: Supplies ``orderbook_window`` and ``sampling_window`` (in
            seconds), ``real_root`` and ``product``.

    Returns:
        ``(all_ob_data, all_sampling_data, all_future_data)``. The middle
        element is a lazy ``map`` over ``all_ob_data`` and can only be
        iterated once.
    """
    # Get all data which we will use to reconstruct the order book
    ob_window_start = st - datetime.timedelta(seconds=config.orderbook_window)
    all_ob_data = DataLoader().load_split_data(config.real_root, ob_window_start, st,
                                               config.product)

    # Assume orderbook_window > sampling_window, and therefore filter already loaded ob data
    sample_window_start = st - datetime.timedelta(seconds=config.sampling_window)

    def within_sampling_window(frame):
        return DataSplitter.get_between(frame, sample_window_start, st)

    all_sampling_data = map(within_sampling_window, all_ob_data)

    # Get future data
    # NOTE(review): the future window is sized by sampling_window -- confirm
    # this is intended.
    future_window_end = st + datetime.timedelta(seconds=config.sampling_window)
    all_future_data = DataLoader().load_split_data(config.real_root, st, future_window_end,
                                                   config.product)

    return all_ob_data, all_sampling_data, all_future_data
def get_order_stats(df: dd) -> Dict[Union[str, Any], Union[float, Any]]:
    """Calculate ratio, size and price statistics for a frame of orders.

    Args:
        df: Orders frame with 'size' and 'price' columns, split into sides
            via ``DataSplitter.get_side``.

    Returns:
        A dict holding buy/sell count and volume ratios, plus the mean and
        standard deviation of size and price overall and per side.
    """
    # Each ratio and each side filter is evaluated once and reused; the
    # original recomputed them for every dict entry that needed them.
    order_ratio = Statistics.get_buy_sell_ratio(df)
    volume_ratio = Statistics.get_buy_sell_volume_ratio(df)
    sells = DataSplitter.get_side('sell', df)
    buys = DataSplitter.get_side('buy', df)
    prices = df['price'].astype('float64')

    return {
        'buy_order_ratio': order_ratio[0],
        'sell_order_ratio': order_ratio[1],
        'buy_volume_ratio': volume_ratio[0],
        'sell_volume_ratio': volume_ratio[1],
        'avg_order_size': Statistics.get_mean('size', df),
        'std_dev_order_size': Statistics.get_std_dev('size', df),
        'avg_sell_order_size': Statistics.get_mean('size', sells),
        'std_dev_sell_order_size': Statistics.get_std_dev('size', sells),
        'avg_buy_order_size': Statistics.get_mean('size', buys),
        'std_dev_buy_order_size': Statistics.get_std_dev('size', buys),
        'avg_price': prices.mean(),
        'std_dev_price': prices.std(),
        'avg_sell_order_price': Statistics.get_mean('price', sells),
        'std_dev_sell_price': Statistics.get_std_dev('price', sells),
        'avg_buy_price': Statistics.get_mean('price', buys),
        'std_dev_buy_order_price': Statistics.get_std_dev('price', buys),
    }
def get_lyapunov_exponent_over_time(trades, st, et, step_minutes, window_minutes):
    """Compute ``nolds.lyap_r`` of trade prices over sliding windows.

    Windows start at ``st`` and advance by ``step_minutes``; each spans
    ``window_minutes``. Windows with no non-null prices, or whose exponent is
    not strictly positive, are skipped.

    Returns:
        ``(times, lyap_exps)``: parallel lists of window END times and the
        exponents kept for them.
    """
    elapsed_minutes = (et - st).total_seconds() / 60
    step_count = int(elapsed_minutes / step_minutes)

    times, exponents = [], []
    for step in range(step_count):
        window_start = st + datetime.timedelta(minutes=step_minutes * step)
        window_end = window_start + datetime.timedelta(minutes=window_minutes)

        window = DataSplitter.get_between(trades, window_start, window_end)
        prices = np.asarray(window['price'].dropna(), dtype=np.float32)
        if len(prices) == 0:
            continue

        exponent = nolds.lyap_r(prices)
        if exponent > 0:
            exponents.append(exponent)
            times.append(window_end)

    return times, exponents
def get_hurst_exponent_over_time(trades, st, et, step_minutes, window_minutes):
    """Compute ``nolds.hurst_rs`` of trade prices over sliding windows.

    Windows start at ``st`` and advance by ``step_minutes``; each spans
    ``window_minutes``. Every computed exponent is printed; only those
    strictly between 0 and 1 are kept. Windows with no non-null prices are
    skipped.

    Returns:
        ``(times, hurst_exps)``: parallel lists of window START times and the
        exponents kept for them.
    """
    elapsed_minutes = (et - st).total_seconds() / 60
    step_count = int(elapsed_minutes / step_minutes)

    times, exponents = [], []
    for step in range(step_count):
        window_start = st + datetime.timedelta(minutes=step_minutes * step)
        window_end = window_start + datetime.timedelta(minutes=window_minutes)

        window = DataSplitter.get_between(trades, window_start, window_end)
        prices = np.asarray(window['price'].dropna(), dtype=np.float32)
        if len(prices) == 0:
            continue

        exponent = nolds.hurst_rs(prices)
        # hurst_exp = nolds.dfa(prices) - 1
        print(exponent)
        if 0 < exponent < 1:
            exponents.append(exponent)
            times.append(window_start)

    return times, exponents
def test_orders_per_minute_windowed(self):
    """Print day totals and plot windowed counts of orders, trades and cancels.

    Counts are taken over 60-minute windows that advance in 5-minute steps
    across the day, then passed to the ``Statistics`` plotting helpers.
    """
    product = "LTC-USD"
    root = "/Users/jamesprince/project-data/data/consolidated-feed/"
    st = datetime.datetime(2018, 5, 17, 0, 0, 0)
    et = datetime.datetime(2018, 5, 17, 23, 59, 59)

    feed_df = DataLoader.load_feed(root + product + "/", st, et, product)
    orders = DataSplitter.get_orders(feed_df)
    limit_orders = DataSplitter.get_limit_orders(orders)
    market_orders = DataSplitter.get_market_orders(orders)
    trades = DataSplitter.get_trades(feed_df)
    cancels = DataSplitter.get_cancellations(feed_df)

    print("Total limit orders: " + str(len(limit_orders)))
    print("Total market orders: " + str(len(market_orders)))
    print("Total trades: " + str(len(trades)))
    print("Total cancels: " + str(len(cancels)))

    window_minutes = 60
    step_minutes = 5
    window_length = datetime.timedelta(seconds=window_minutes * 60)

    times = []
    num_limit_orders = []
    num_market_orders = []
    num_trades = []
    num_cancels = []

    for i in range(0, int((24 * 60) / step_minutes - 1)):
        window_st = st + datetime.timedelta(seconds=i * step_minutes * 60)
        window_et = window_st + window_length

        times.append(window_st)
        num_limit_orders.append(len(DataSplitter.get_between(limit_orders, window_st, window_et)))
        num_market_orders.append(len(DataSplitter.get_between(market_orders, window_st, window_et)))
        num_trades.append(len(DataSplitter.get_between(trades, window_st, window_et)))
        num_cancels.append(len(DataSplitter.get_between(cancels, window_st, window_et)))

    Statistics.plot_metric_daily_comparison(times, num_limit_orders, num_cancels, product, st,
                                            step_minutes, window_minutes,
                                            "Limit Orders", "Cancels")

    for counts, label in (
            (num_limit_orders, "Limit Orders"),
            (num_market_orders, "Market Orders"),
            (num_trades, "Trades"),
            (num_cancels, "Cancels"),
    ):
        Statistics.plot_metric_daily(times, counts, product, st, step_minutes, window_minutes,
                                     label)

    # TODO: plot "Traded Volume" once a per-window volume is computed. The
    # previous code passed a list that was never filled, so its length could
    # not match `times`.
def get_buy_sell_ratio(df: dd) -> (float, float):
    """Return the ratio between the number of buy rows and sell rows in ``df``.

    The two row counts are passed to ``Statistics.get_ratio`` and its result
    is returned unchanged.
    """
    buy_count, sell_count = (
        len(DataSplitter.get_side(side, df)) for side in ("buy", "sell")
    )
    return Statistics.get_ratio(buy_count, sell_count)
def check_ob_valid(ob: dd) -> bool:
    """Return whether the highest buy price is strictly below the lowest sell price."""
    buy_prices = DataSplitter.get_side("buy", ob)['price']
    sell_prices = DataSplitter.get_side("sell", ob)['price']
    best_bid = buy_prices.max()
    best_ask = sell_prices.min()
    return best_ask > best_bid
def backtest_mode(st: datetime.datetime = None):
    """Run ``config.num_predictions`` pipelined backtest iterations from ``st``.

    All data for the whole run is loaded once. Each iteration slices out its
    sampling and future windows, prepares a ``Backtest``, waits for the
    previous simulation and validation, starts validation of the previous
    iteration, and then launches its own simulation. After the loop, the
    outstanding work is awaited and the last backtest is evaluated.

    Args:
        st: Start time of the first simulation.
            NOTE(review): the ``None`` default is passed straight into
            ``take_secs``/``add_secs`` -- confirm callers always supply it.
    """
    all_data_st = take_secs(st, max(config.orderbook_window, config.sampling_window))
    all_data_et = add_secs(st, config.num_predictions * config.interval)
    all_data = DataLoader.load_split_data(config.real_root, all_data_st, all_data_et,
                                          config.product)

    validate_future = None
    previous_backtest = None
    current_backtest = None
    sim_future = None
    sim_success = False
    sim_st = None

    for i in range(0, config.num_predictions):
        logger.info("Iteration " + str(i))
        sim_st = add_secs(st, config.interval * i)
        sim_et = add_secs(sim_st, config.simulation_window)

        sam_st = take_secs(sim_st, config.sampling_window)
        sam_et = sim_st

        try:
            logger.info("Gathering data for simulation at: " + sim_st.isoformat())

            # map() is lazy: bind the window bounds as lambda defaults so each
            # iterator keeps this iteration's bounds even if it is consumed
            # after the loop variables have moved on.
            all_sampling_data = map(
                lambda x, lo=sam_st, hi=sam_et: DataSplitter.get_between(x, lo, hi), all_data)
            all_future_data = map(
                lambda x, lo=sim_st, hi=sim_et: DataSplitter.get_between(x, lo, hi), all_data)

            previous_backtest = current_backtest
            current_backtest = Backtest(config, sim_st, all_sampling_data, all_future_data)
        except Exception as e:
            logger.error("Error occurred when gathering data: " + str(e))
            current_backtest = None

        # Initiate simulation prep synchronously. When data gathering failed
        # there is no backtest to prepare; the original dereferenced None here.
        prep_success = False
        if current_backtest is not None:
            prep_success = current_backtest.prepare_simulation()

        # Wait for previous simulation to finish
        sim_future, sim_success = wait_on_simulation(sim_future, sim_st, sim_success)

        # Wait for previous validation to finish
        wait_on_validation(validate_future)

        # Set off validation for previous iteration
        validate_future = run_validation_async(previous_backtest, sim_success)

        # Run this current iteration's simulation async
        if current_backtest is not None and prep_success:
            sim_future = current_backtest.run_simulation()

    # Wait for previous validation to finish
    wait_on_validation(validate_future)

    sim_future, sim_success = wait_on_simulation(sim_future, sim_st, sim_success)

    if sim_success:
        if current_backtest is None:
            # The final iteration failed to gather data, so there is nothing
            # to evaluate.
            logger.error("Skipping final validation: last iteration has no backtest")
        else:
            logger.info("Starting final validation")
            current_backtest.evaluate_simulation(prog_start)
def test_filter_when_cutoff_after_end(self):
    """With a cutoff of 15 * 10**9 the filtered fixture keeps two rows."""
    cutoff = 15 * 10**9
    filtered = DataSplitter.get_first_n_nanos(self.df, cutoff)
    assert len(filtered['time']) == 2
def graph_order_cancel_relative_price_distribution(self, feed_df):
    """Graph the relative price distribution of the cancellations in a feed.

    Trades and cancellations are split out of ``feed_df`` and passed to
    ``graph_relative_price_distribution``.
    """
    trades = DataSplitter.get_trades(feed_df)
    cancellations = DataSplitter.get_cancellations(feed_df)
    self.graph_relative_price_distribution(trades, cancellations)
def graph_sides(self, df: dd) -> None:
    """Graph the price distribution of the buy side and of the sell side of ``df``."""

    def side_prices(side):
        # Prices are cast to float64 and rebuilt into a fresh Series from a
        # plain list.
        prices = DataSplitter.get_side(side, df)['price']
        return pd.Series(prices.astype('float64').tolist())

    buy_prices = side_prices('buy')
    sell_prices = side_prices('sell')

    self.graph_distribution(buy_prices, self.data_description + ' buy side', 'Price ($)', bins=50)
    self.graph_distribution(sell_prices, self.data_description + ' sell side', 'Price ($)', bins=50)
def generate_sim_params(cls, orders_df, trades_df, cancels_df, feed_df, ob_state, ob_state_seq_num, ob_state_time, graph=False):
    """Derive simulation parameters from observed orders, trades and cancels.

    Args:
        orders_df: Orders used for price, size, interval and ratio parameters.
        trades_df: Trades used for the price/size correlations.
        cancels_df: Cancellations used for cancel prices and the cancel ratio.
        feed_df: Feed passed to ``get_prices_relative_to_midprice``.
        ob_state: Order book state passed to ``get_prices_relative_to_midprice``.
        ob_state_seq_num: Sequence number accompanying ``ob_state``.
        ob_state_time: Time accompanying ``ob_state``.
        graph: Not read by this method.
            NOTE(review): CDFs are plotted regardless of this flag -- confirm
            whether plotting was meant to be conditional.

    Returns:
        A dict with the keys 'ratios', 'correlations', 'distributions'
        (always empty here) and 'discreteDistributions' (empirical CDFs, each
        stored as ``{'x': [...], 'cy': [...]}``).

    Raises:
        Exception: Any failure is logged on ``cls.logger`` and re-raised.
    """
    cls.check_has_elements([orders_df, trades_df, cancels_df])

    discrete_distributions = {}

    def add_cdf(key, samples, title):
        # Store the empirical CDF of `samples` under `key`, then plot it.
        x, cy = Sample.get_cdf_data(samples)
        discrete_distributions[key] = {'x': x.tolist(), 'cy': cy.tolist()}
        Sample.plot_cdf(x, cy, title)

    def relative_prices(side, frame, flip=False):
        # Prices of one side of `frame` relative to the midprice; buy-side
        # callers flip the sign so both sides can be compared.
        side_frame = DataSplitter.get_side(side, frame)
        relative = DataTransformer.get_prices_relative_to_midprice(
            ob_state, ob_state_seq_num, ob_state_time, feed_df, side_frame)
        return relative.apply(lambda x: -x) if flip else relative

    def absolute_sizes(side, frame):
        # Non-null absolute sizes of one side of `frame`.
        return DataSplitter.get_side(side, frame)['size'].dropna().apply(abs)

    try:
        params = {}
        distributions = {}
        ratios = {}
        correlations = {}

        # The original opened a pebble.ProcessPool here, but every
        # pool.schedule call was commented out, so it was never used.
        # TODO: parallelise inverse CDF generation
        limit_orders = DataSplitter.get_limit_orders_from_feed(orders_df)

        price_size_corrs = Correlations.get_price_size_corr(trades_df, limit_orders)
        correlations['buy_price_size'] = price_size_corrs['buy']
        correlations['sell_price_size'] = price_size_corrs['sell']

        # Order and cancel prices relative to the midprice
        add_cdf("sell_price_relative",
                relative_prices("sell", orders_df),
                "Sell order prices (relative)")
        add_cdf("buy_price_relative",
                relative_prices("buy", orders_df, flip=True),
                "Buy prices (relative) (flipped for comparison)")
        add_cdf("buy_cancels_relative",
                relative_prices("buy", cancels_df, flip=True),
                "Buy cancel prices (relative) (flipped for comparison)")
        add_cdf("sell_cancels_relative",
                relative_prices("sell", cancels_df),
                "Sell cancel prices (relative)")

        # Market order sizes
        market_orders = DataSplitter.get_market_orders_from_feed(orders_df)
        add_cdf("buy_market_size", absolute_sizes("buy", market_orders),
                "Buy market order sizes")
        add_cdf("sell_market_size", absolute_sizes("sell", market_orders),
                "Sell market order sizes")

        # Limit order sizes
        add_cdf("buy_limit_size", absolute_sizes("buy", limit_orders),
                "Buy limit order sizes")
        add_cdf("sell_limit_size", absolute_sizes("sell", limit_orders),
                "Sell limit order sizes")

        # Time between orders
        add_cdf("intervals", DataTransformer.get_time_intervals(orders_df),
                "Order intervals")

        ratios["buy_sell_order_ratio"] = Statistics.get_buy_sell_ratio(orders_df)
        ratios["buy_sell_cancel_ratio"] = Statistics.get_buy_sell_ratio(cancels_df)
        ratios["buy_sell_volume_ratio"] = Statistics.get_buy_sell_volume_ratio(orders_df)
        ratios['limit_market_order_ratio'] = Statistics.get_limit_market_order_ratio(orders_df)

        params['ratios'] = ratios
        params['correlations'] = correlations
        params['distributions'] = distributions
        params['discreteDistributions'] = discrete_distributions

        return params
    except Exception as e:
        cls.logger.error("Failed to generate parameters, exception was " + str(e))
        raise
def test_real_spread_plot(self):
    """Plot real limit orders with the evolved best bid/ask for a 5-minute window.

    Also prints the evolved order book frame, the cancels merged with their
    originating limit orders, and the median lifetime of cancelled orders.
    """
    plt.figure(figsize=(12, 8))

    product = "LTC-USD"
    root = "/Users/jamesprince/project-data/data/consolidated-feed/"
    st = datetime.datetime(2018, 5, 17, 1, 0, 0)
    et = datetime.datetime(2018, 5, 17, 1, 5, 0)

    def seconds_since_first(times):
        # Seconds elapsed since the first entry of `times`.
        return (times - times.iloc[0]).apply(lambda x: x.total_seconds())

    feed_df = DataLoader.load_feed(root + product + "/", st, et, product)

    conf = configparser.ConfigParser()
    conf.read("../config/backtest.ini")
    config = BacktestConfig(conf)

    ob_seq, ob_state = reconstruct_orderbook(config, st, logging.getLogger("test"))

    orderbook_evo = OrderBookEvolutor(ob_state, st, ob_seq)
    res_df = orderbook_evo.evolve_orderbook(feed_df)
    res_df['seconds'] = seconds_since_first(res_df['time'])
    print(res_df)

    # Copy so the new column lands on an independent frame rather than a
    # filtered slice of the feed.
    limit_orders = DataSplitter.get_limit_orders_from_feed(feed_df).copy()
    limit_orders['seconds'] = seconds_since_first(limit_orders['time'])

    buy_limit_orders = DataSplitter.get_side("buy", limit_orders)
    sell_limit_orders = DataSplitter.get_side("sell", limit_orders)

    cancels = DataSplitter.get_cancellations(feed_df)
    cancels_merged = cancels.merge(limit_orders, on='order_id', how='left')
    cancels_merged['price'] = cancels_merged['price_x']
    cancels_merged['side'] = cancels_merged['side_x']
    cancels_merged['seconds'] = seconds_since_first(cancels_merged['time_x'])
    cancels_merged['lifetime'] = abs(cancels_merged['time_x'] - cancels_merged['time_y'])
    print(cancels_merged)

    # Cancels with no matching limit order have no lifetime. Drop them before
    # taking the median: the original indexed len/2 into the full column, so
    # the missing entries skewed the result.
    print(cancels_merged['lifetime'].dropna().median())

    plt.plot(buy_limit_orders['seconds'], buy_limit_orders['price'], 'r+',
             label="Buy limit orders")
    plt.plot(sell_limit_orders['seconds'], sell_limit_orders['price'], 'b+',
             label="Sell limit orders")

    plt.plot(res_df['seconds'], res_df['best_bid'], label='Best bid price')
    plt.plot(res_df['seconds'], res_df['best_ask'], label='Best ask price')

    start_price = res_df['midprice'].iloc[0]
    plt.ylim(start_price - 5, start_price + 5)

    plt.legend()
    plt.show()
def test_filter_when_cutoff_before_end(self):
    """With a cutoff of 5 the filtered fixture keeps a single row."""
    cutoff = 5
    filtered = DataSplitter.get_first_n_nanos(self.df, cutoff)
    assert len(filtered['time']) == 1